Linux内核构成
本文讲解的内核版本是2.6,arm平台的启动过程。因为arm在嵌入式中确实具有统治地位的。闲话少说开始吧,因为真的很多内容,但是童鞋们一定要耐心看完。
1 arch/arm/boot/compressed/Makefile arch/arm/boot/compressed
2. arch/arm/kernel
Linux内核启动流程
arch/arm/boot/compressed
Start:
.type start,#function
.rept 8
mov r0, r0
.endr
b 1f
.word 0x016f2818 @ Magic numbers to help the loader
.word start @ absolute load/run zImage address
.word _edata @ zImage end address
1: mov r7, r1 @ save architecture ID
mov r8, r2 @ save atags pointer
这也标志着u-boot将系统完全的交给了OS,bootloader生命终止。之后代码在133行会读取cpsr并判断是否处理器处于supervisor模式——从u-boot进入kernel,系统已经处于SVC32模式;而利用angel进入则处于user模式,还需要额外两条指令。之后是再次确认中断关闭,并完成cpsr写入
mrs r2, cpsr @ get current mode
tst r2, #3 @ not user?
bne not_angel
mov r0, #0x17 @ angel_SWIreason_EnterSVC
swi 0x123456 @ angel_SWI_ARM
not_angel:
mrs r2, cpsr @ turn off interrupts to
orr r2, r2, #0xc0 @ prevent angel from running
msr cpsr_c, r2
然后在LC0地址处将分段信息导入r0-r6、ip、sp等寄存器,并检查代码是否运行在与链接时相同的目标地址,以决定是否进行处理。由于现在很少有人不使用loader和tags,将zImage烧写到rom直接从0x0位置执行,所以这个处理是必须的(但是zImage的头现在也保留了不用loader也可启动的能力)。arm架构下自解压头一般是链接在0x0地址而被加载到0x30008000运行,所以要修正这个变化。涉及到
- r5寄存器存放的zImage基地址
- r6和r12(即ip寄存器)存放的got(global offset table)
- r2和r3存放的bss段起止地址
- sp栈指针地址
很简单,这些寄存器统统被加上一个你也能猜到的偏移地址 0x30008000。该地址是s3c2410相关的,其他的ARM处理器可以参考下表
PXA2xx是0xa0008000
IXP2x00和IXP4xx是0x00008000
Freescale i.MX31/37是0x80008000
TI davinci DM64xx是0x80008000
TI omap系列是0x80008000
AT91RM/SAM92xx系列是0x20008000
Cirrus EP93xx是0x00008000
这些操作发生在代码172行开始的地方,下面只粘贴一部分
add r5, r5, r0
add r6, r6, r0
add ip, ip, r0
后面在211行进行bss段的清零工作
not_relocated: mov r0, #0
1: str r0, [r2], #4 @ clear bss
str r0, [r2], #4
str r0, [r2], #4
str r0, [r2], #4
cmp r2, r3
blo 1b
然后224行,打开cache,并为后面解压缩设置64KB的临时malloc空间
bl cache_on
mov r1, sp @ malloc space above stack
add r2, sp, #0x10000 @ 64k max 接下来238行进行检查,确定内核解压缩后的Image目标地址是否会覆盖到zImage头,如果是则准备将zImage头转移到解压出来的内核后面
cmp r4, r2
bhs wont_overwrite
sub r3, sp, r5 @ > compressed kernel size
add r0, r4, r3, lsl #2 @ allow for 4x expansion
cmp r0, r5
bls wont_overwrite
mov r5, r2 @ decompress after malloc space
mov r0, r5
mov r3, r7
bl decompress_kernel
真实情况——在大多数的应用中,内核编译都会把压缩的zImage和非压缩的Image链接到同样的地址。这样做的好处是,人们不用关心内核是Image还是zImage,放到这个位置执行就OK,所以在解压缩后zImage头必须为真正的内核让路。
在250行解压完毕,内核长度返回值存放在r0寄存器里。在内核末尾空出128字节的栈空间用,并且使其长度128字节对齐。
add r0, r0, #127 + 128 @ alignment + stack
bic r0, r0, #127 @ align the kernel length
算出搬移代码的参数:计算内核末尾地址并存放于r1寄存器,需要搬移代码原来地址放在r2,需要搬移的长度放在r3。然后执行搬移,并设置好sp指针指向新的栈(原来的栈也会被内核覆盖掉)
add r1, r5, r0 @ end of decompressed kernel
adr r2, reloc_start
ldr r3, LC1
add r3, r2, r3
1: ldmia r2!, {r9 - r14} @ copy relocation code
stmia r1!, {r9 - r14}
ldmia r2!, {r9 - r14}
stmia r1!, {r9 - r14}
cmp r2, r3
blo 1b
add sp, r1, #128 @ relocate the stack
搬移完成后刷新cache,因为代码地址变化了不能让cache再命中被内核覆盖的老地址。然后跳转到新的地址继续执行
bl cache_clean_flush
add pc, r5, r0 @ call relocation code
注意——zImage在解压后的搬移和跳转会给gdb调试内核带来麻烦。因为用来调试的符号表是在编译是生成的,并不知道以后会被搬移到何处去,只有在内核解压缩完成之后,根据计算出来的参数“告诉”调试器这个变化。以撰写本文时使用的zImage为例,内核自解压头重定向后,reloc_start地址由0x30008360变为0x30533e60。故我们要把vmlinux的符号表也相应的从0x30008000后移到0x30533b00开始,这样gdb就可以正确的对应源代码和机器指令。
随着头部代码移动到新的位置,不会再和内核的目标地址冲突,可以开始内核自身的搬移了。此时r0寄存器存放的是内核长度(严格的说是长度外加128Byte的栈),r4存放的是内核的目的地址0x30008000,r5是目前内核存放地址,r6是CPU ID,r7是machine ID,r8是atags地址。代码从501行开始
reloc_start: add r9, r5, r0
sub r9, r9, #128 @ do not copy the stack
debug_reloc_start
mov r1, r4
1:
.rept 4
ldmia r5!, {r0, r2, r3, r10 - r14} @ relocate kernel
stmia r1!, {r0, r2, r3, r10 - r14}
.endr
cmp r5, r9
blo 1b
add sp, r1, #128 @ relocate the stack
接下来在516行清除并关闭cache,清零r0,将machine ID存入r1,atags指针存入r2,再跳入0x30008000执行真正的内核Image
call_kernel: bl cache_clean_flush
bl cache_off
mov r0, #0 @ must be zero
mov r1, r7 @ restore architecture number
mov r2, r8 @ restore atags pointer
mov pc, r4 @ call kernel
内核代码入口在arch/arm/kernel文件的83行。首先进入SVC32模式,并查询CPU ID,检查合法性
msr cpsr_c, #PSR_F_BIT | PSR_I_BIT | SVC_MODE @ ensure svc mode
@ and irqs disabled
mrc p15, 0, r9, c0, c0 @ get processor id
bl __lookup_processor_type @ r5=procinfo r9=cpuid
movs r10, r5 @ invalid processor (r5=0)?
beq __error_p @ yes, error 'p'
接着在87行进一步查询machine ID并检查合法性
bl __lookup_machine_type @ r5=machinfo
movs r8, r5 @ invalid machine (r5=0)?
beq __error_a @ yes, error 'a'
其中__lookup_processor_type在linux-2.6.24-moko-linuxbj/arch/arm/kernel文件的149行,该函数首将标号3的实际地址加载到r3,然后将编译时生成的__proc_info_begin虚拟地址载入到r5,__proc_info_end虚拟地址载入到r6,标号3的虚拟地址载入到r7。由于adr伪指令和标号3的使用,以及__proc_info_begin等符号在linux-2.6.24-moko-linuxbj/arch/arm/kernel而不是代码中被定义,此处代码不是非常直观,想弄清楚代码缘由的读者请耐心阅读这两个文件和adr伪指令的说明。
r3和r7分别存储的是同一位置标号3的物理地址(由于没有启用mmu,所以当前肯定是物理地址)和虚拟地址,所以儿者相减即得到虚拟地址和物理地址之间的offset。利用此offset,将r5和r6中保存的虚拟地址转变为物理地址
__lookup_processor_type:
adr r3, 3f
ldmda r3, {r5 - r7}
sub r3, r3, r7 @ get offset between virt&phys
add r5, r5, r3 @ convert virt addresses to
add r6, r6, r3 @ physical address space
然后从proc_info中读出内核编译时写入的processor ID和之前从cpsr中读到的processor ID对比,查看代码和CPU硬件是否匹配(想在arm920t上运行为cortex-a8编译的内核?不让!)。如果编译了多种处理器支持,如versatile板,则会循环每种type依次检验,如果硬件读出的ID在内核中找不到匹配,则r5置0返回
1: ldmia r5, {r3, r4} @ value, mask
and r4, r4, r9 @ mask wanted bits
teq r3, r4
beq 2f
add r5, r5, #PROC_INFO_SZ @ sizeof(proc_info_list)
cmp r5, r6
blo 1b
mov r5, #0 @ unknown processor
2: mov pc, lr
__lookup_machine_type在linux-2.6.24-moko-linuxbj/arch/arm/kernel文件的197行,编码方法与检查processor ID完全一样,请参考前段
__lookup_machine_type:
adr r3, 3b
ldmia r3, {r4, r5, r6}
sub r3, r3, r4 @ get offset between virt&phys
add r5, r5, r3 @ convert virt addresses to
add r6, r6, r3 @ physical address space
1: ldr r3, [r5, #MACHINFO_TYPE] @ get machine type
teq r3, r1 @ matches loader number?
beq 2f @ found
add r5, r5, #SIZEOF_MACHINE_DESC @ next machine_desc
cmp r5, r6
blo 1b
mov r5, #0 @ unknown machine
2: mov pc, lr
代码回到第92行,检查atags合法性,然后创建初始页表
bl __vet_atags
bl __create_page_tables
创建页表的代码在218行,首先将内核起始地址-0x4000到内核起始地址之间的16K存储器清0
__create_page_tables:
pgtbl r4 @ page table address
/*
* Clear the 16K level 1 swapper page table
*/
mov r0, r4
mov r3, #0
add r6, r0, #0x4000
1: str r3, [r0], #4
str r3, [r0], #4
str r3, [r0], #4
str r3, [r0], #4
teq r0, r6
bne 1b
然后在234行将proc_info中的mmu_flags加载到r7
ldr r7, [r10, #PROCINFO_MM_MMUFLAGS] @ mm_mmuflags在242行将PC指针右移20位,得到内核第一个1MB空间的段地址存入r6,在s3c2410平台该值是0x300。接着根据此值存入映射标识
mov r6, pc, lsr #20 @ start of kernel section
orr r3, r7, r6, lsl #20 @ flags + kernel base
str r3, [r4, r6, lsl #2] @ identity mapping
完成页表设置后回到102行,为打开虚拟地址映射作准备。设置sp指针,函数返回地址lr指向__enable_mmu,并跳转到linux-2.6.24-moko-linuxbj/arch/arm/mm的386行,清除I-cache、D-cache、write buffer和TLB
__arm920_setup:
mov r0, #0
mcr p15, 0, r0, c7, c7 @ invalidate I,D caches on v4
mcr p15, 0, r0, c7, c10, 4 @ drain write buffer on v4
#ifdef CONFIG_MMU
mcr p15, 0, r0, c8, c7 @ invalidate I,D TLBs on v4
#endif然后返回的158行,加载domain和页表,跳转到__turn_mmu_on
__enable_mmu:
#ifdef CONFIG_ALIGNMENT_TRAP
orr r0, r0, #CR_A
#else
bic r0, r0, #CR_A
#endif
#ifdef CONFIG_CPU_DCACHE_DISABLE
bic r0, r0, #CR_C
#endif
#ifdef CONFIG_CPU_BPREDICT_DISABLE
bic r0, r0, #CR_Z
#endif
#ifdef CONFIG_CPU_ICACHE_DISABLE
bic r0, r0, #CR_I
#endif
mov r5, #(domain_val(DOMAIN_USER, DOMAIN_MANAGER) | \
domain_val(DOMAIN_KERNEL, DOMAIN_MANAGER) | \
domain_val(DOMAIN_TABLE, DOMAIN_MANAGER) | \
domain_val(DOMAIN_IO, DOMAIN_CLIENT))
mcr p15, 0, r5, c3, c0, 0 @ load domain access register
mcr p15, 0, r4, c2, c0, 0 @ load page table pointer
b __turn_mmu_on在194行把mmu使能位写入mmu,激活虚拟地址。然后将原来保存在sp中的地址载入pc,跳转到的__mmap_switched,至此代码进入虚拟地址的世界
mov r0, r0
mcr p15, 0, r0, c1, c0, 0 @ write control reg
mrc p15, 0, r3, c0, c0, 0 @ read id reg
mov r3, r3
mov r3, r3
mov pc, r13
在的37行开始清除内核bss段,processor ID保存在r9,machine ID报存在r1,atags地址保存在r2,并将控制寄存器保存到r7定义的内存地址。接下来跳入linux-2.6.24-moko-linuxbj/ini的507行,start_kernel函数。这里只粘贴部分代码
__mmap_switched:
adr r3, __switch_data + 4
ldmia r3!, {r4, r5, r6, r7}
cmp r4, r5 @ Copy data segment if needed
1: cmpne r5, r6
ldrne fp, [r4], #4
strne fp, [r5], #4
bne 1b
asmlinkage void __init start_kernel(void)
{
char * command_line;
extern struct kernel_param __start___param[], __stop___param[];
smp_setup_processor_id();
/*
* Need to run as early as possible, to initialize the
* lockdep hash:
*/
lockdep_init();
debug_objects_early_init();
cgroup_init_early();
local_irq_disable();
early_boot_irqs_off();
early_init_irq_lock_class();
/*
* Interrupts are still disabled. Do necessary setups, then
* enable them
*/
lock_kernel();
tick_init();
boot_cpu_init();
page_address_init();
printk(KERN_NOTICE);
printk(linux_banner);
setup_arch(&command_line);
mm_init_owner(&init_mm, &init_task);
setup_command_line(command_line);
setup_per_cpu_areas();
setup_nr_cpu_ids();
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
/*
* Set up the scheduler prior starting any interrupts (such as the
* timer interrupt). Full topology setup happens at smp_init()
* time - but meanwhile we still have a functioning scheduler.
*/
sched_init();
/*
* Disable preemption - early bootup scheduling is extremely
* fragile until we cpu_idle() for the first time.
*/
preempt_disable();
build_all_zonelists();
page_alloc_init();
printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
parse_early_param();
parse_args("Booting kernel", static_command_line, __start___param,
__stop___param - __start___param,
&unknown_bootoption);
if (!irqs_disabled()) {
printk(KERN_WARNING "start_kernel(): bug: interrupts were "
"enabled *very* early, fixing it\n");
local_irq_disable();
}
sort_main_extable();
trap_init();
rcu_init();
/* init some links before init_ISA_irqs() */
early_irq_init();
init_IRQ();
pidhash_init();
init_timers();
hrtimers_init();
softirq_init();
timekeeping_init();
time_init();
sched_clock_init();
profile_init();
if (!irqs_disabled())
printk(KERN_CRIT "start_kernel(): bug: interrupts were "
"enabled early\n");
early_boot_irqs_on();
local_irq_enable();
/*
* HACK ALERT! This is early. We're enabling the console before
* we've done PCI setups etc, and console_init() must be aware of
* this. But we do want output early, in case something goes wrong.
*/
console_init();
if (panic_later)
panic(panic_later, panic_param);
lockdep_info();
/*
* Need to run this when irqs are enabled, because it wants
* to self-test [hard/soft]-irqs on/off lock inversion bugs
* too:
*/
locking_selftest();
#ifdef CONFIG_BLK_DEV_INITRD
if (initrd_start && !initrd_below_start_ok &&
page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) {
printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - "
"disabling it.\n",
page_to_pfn(virt_to_page((void *)initrd_start)),
min_low_pfn);
initrd_start = 0;
}
#endif
vmalloc_init();
vfs_caches_init_early();
cpuset_init_early();
page_cgroup_init();
mem_init();
enable_debug_pagealloc();
cpu_hotplug_init();
kmem_cache_init();
debug_objects_mem_init();
idr_init_cache();
setup_per_cpu_pageset();
numa_policy_init();
if (late_time_init)
late_time_init();
calibrate_delay();
pidmap_init();
pgtable_cache_init();
prio_tree_init();
anon_vma_init();
#ifdef CONFIG_X86
if (efi_enabled)
efi_enter_virtual_mode();
#endif
thread_info_cache_init();
cred_init();
fork_init(num_physpages);
proc_caches_init();
buffer_init();
key_init();
security_init();
vfs_caches_init(num_physpages);
radix_tree_init();
signals_init();
/* rootfs populating might need page-writeback */
page_writeback_init();
#ifdef CONFIG_PROC_FS
proc_root_init();
#endif
cgroup_init();
cpuset_init();
taskstats_init_early();
delayacct_init();
check_bugs();
acpi_early_init(); /* before LAPIC and SMP init */
ftrace_init();
/* Do the rest non-__init'ed, we're now alive */
rest_init();
}
tatic noinline void __init_refok rest_init(void)
__releases(kernel_lock)
{
int pid;
kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND);
numa_default_policy();
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
unlock_kernel();
/*
* The boot idle thread must execute schedule()
* at least once to get things moving:
*/
init_idle_bootup_task(current);
rcu_scheduler_starting();
preempt_enable_no_resched();
schedule();
preempt_disable();
/* Call into cpu_idle with preempt disabled */
cpu_idle();
}
static noinline int init_post(void)
{
/* need to finish all async __init code before freeing the memory */
async_synchronize_full();
free_initmem();
unlock_kernel();
mark_rodata_ro();
system_state = SYSTEM_RUNNING;
numa_default_policy();
if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)
printk(KERN_WARNING "Warning: unable to open an initial console.\n");
(void) sys_dup(0);
(void) sys_dup(0);
current->signal->flags |= SIGNAL_UNKILLABLE;
if (ramdisk_execute_command) {
run_init_process(ramdisk_execute_command);
printk(KERN_WARNING "Failed to execute %s\n",
ramdisk_execute_command);
}
/*
* We try each of these until one succeeds.
*
* The Bourne shell can be used instead of init if we are
* trying to recover a really broken machine.
*/
if (execute_command) {
run_init_process(execute_command);
printk(KERN_WARNING "Failed to execute %s. Attempting "
"defaults...\n", execute_command);
}
run_init_process("/sbin/init");
run_init_process("/etc/init");
run_init_process("/bin/init");
run_init_process("/bin/sh");
panic("No init found. Try passing init= option to kernel.");
}
好了,整个流程结束了。看不懂也没关系。因为在头条上代码排版真的难看懂。
喜欢我的文章的话,就关注我吧!在本头条号的置顶文章中有【文章分类】包含:
[C++进阶篇系列]
[高级网络编程篇系列]
[Linux系统篇系列]
[C++基础知识篇]
[协议篇系列]
[数据结构和算法系列]
[设计模式系列]
不要只收藏和转发哦,写文章其实很累的。