学userfaultfd→涉及mmap机制→为了mmap机制开始看linux内存管理系列源码解析→开始好奇类似于page结构体之类的东西是咋初始化的→继续研究系统启动
总结:万物的源头是系统启动doge(又在支线任务上越走越远了)
ps:还是主要关注内存相关的部分
内核版本4.15.8,x86_64
startup_64(书接上回)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
| # arch\x86\kernel\head_64.S 1: UNWIND_HINT_EMPTY
/* 检查CPU是否支持NX位(获得处理器信息) */ movl $0x80000001, %eax cpuid movl %edx,%edi
/* 把MSR_EFER(0xc0000080)放入ecx,然后执行rdmsr指令来读取CPU中的Model Specific Register(MSR) */ movl $MSR_EFER, %ecx rdmsr btsl $_EFER_SCE, %eax /* 启用syscall和sysret */ btl $20,%edi /* 不支持NX? */ jnc 1f btsl $_EFER_NX, %eax btsq $_PAGE_BIT_NX,early_pmd_flags(%rip) 1: wrmsr /* 启用以上变化 */
/* 设置cr0 */ movl $CR0_STATE, %eax movq %rax, %cr0
/* 建立启动阶段栈 */ movq initial_stack(%rip), %rsp
/* 清空eflags */ pushq $0 popfq
/* 更新全局描述符表 */ lgdt early_gdt_descr(%rip)
/* 重新加载各个段 */ xorl %eax,%eax movl %eax,%ds movl %eax,%ss movl %eax,%es movl %eax,%fs movl %eax,%gs
/* 设置一下gs寄存器,令它指向一个特殊的栈irqstack,用于处理中断 */ movl $MSR_GS_BASE,%ecx movl initial_gs(%rip),%eax movl initial_gs+4(%rip),%edx wrmsr
movq %rsi, %rdi
.Ljump_to_C_code: pushq $.Lafter_lret # put return address on stack for unwinder xorq %rbp, %rbp # clear frame pointer movq initial_code(%rip), %rax # initial_code指向x86_64_start_kernel pushq $__KERNEL_CS # set correct cs pushq %rax # target address in negative space lretq # 跳转至x86_64_start_kernel
|
initial_stack
1 2 3
| # arch\x86\kernel\head_64.S GLOBAL(initial_stack) .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS
|
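在x86_64上(4.15内核启用了CONFIG_THREAD_INFO_IN_TASK),thread_info已经放进task_struct,init_thread_union整块内存都作为栈使用;只有在未启用该选项的架构上,thread_info才位于init_thread_union的低地址端(栈向下增长的尽头,而不是栈顶)。栈从高地址向低地址增长,initial_stack指向高地址端,并在其上方预留了SIZEOF_PTREGS的空间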
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
| union thread_union { #ifndef CONFIG_THREAD_INFO_IN_TASK struct thread_info thread_info; #endif unsigned long stack[THREAD_SIZE/sizeof(long)]; };
#define __init_task_data __attribute__((__section__(".data..init_task")))
#define INIT_THREAD_INFO(tsk) \ { \ .task = &tsk, \ .flags = 0, \ .cpu = 0, \ .addr_limit = KERNEL_DS, \ .preempt_count = INIT_PREEMPT_COUNT, \ }
struct task_struct init_task = INIT_TASK(init_task); EXPORT_SYMBOL(init_task);
union thread_union init_thread_union __init_task_data = { #ifndef CONFIG_THREAD_INFO_IN_TASK INIT_THREAD_INFO(init_task) #endif };
|
early_gdt_descr
1 2 3 4 5 6 7 8
| # arch\x86\kernel\head_64.S .data .align 16 .globl early_gdt_descr early_gdt_descr: .word GDT_ENTRIES*8-1 early_gdt_descr_base: .quad INIT_PER_CPU_VAR(gdt_page)
|
虽然目前内核工作在用户空间的低地址中,但很快内核将会在它自己的内存地址空间中运行,所以要重新加载全局描述符表
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
| #define GDT_ENTRIES 16
struct gdt_page { struct desc_struct gdt[GDT_ENTRIES]; } __attribute__((aligned(PAGE_SIZE)));
struct desc_struct { u16 limit0; u16 base0; u16 base1: 8, type: 4, s: 1, dpl: 2, p: 1; u16 limit1: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8; } __attribute__((packed));
|
x86_64_start_kernel
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
| asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) { BUILD_BUG_ON(MODULES_VADDR < __START_KERNEL_map); BUILD_BUG_ON(MODULES_VADDR - __START_KERNEL_map < KERNEL_IMAGE_SIZE); BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE); BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0); BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0); BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == (__START_KERNEL & PGDIR_MASK))); BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); cr4_init_shadow();
reset_early_page_tables(); clear_bss(); clear_page(init_top_pgt);
sme_early_init();
kasan_early_init();
idt_setup_early_handler();
copy_bootdata(__va(real_mode_data));
load_ucode_bsp();
init_top_pgt[511] = early_top_pgt[511];
x86_64_start_reservations(real_mode_data); }
|
1 2 3 4
| # arch\x86\kernel\head_64.S NEXT_PGD_PAGE(init_top_pgt) .fill 512,8,0 .fill PTI_USER_PGD_FILL,8,0
|
reset_early_page_tables
清空early_top_pgt的前511项(最后一项,即内核高地址映射所在的第511项保持不变),把next_early_pgt重置为0,再将early_top_pgt的物理地址写入cr3
1 2 3 4 5 6 7
| static void __init reset_early_page_tables(void) { memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); next_early_pgt = 0; write_cr3(__sme_pa_nodebug(early_top_pgt)); }
|
其实就是把之前为低地址建立的恒等映射(identity mapping,虚拟地址等于物理地址)清空了,此后early_top_pgt的前511项都不再使用,相关内容见系统启动(一)
idt_setup_early_handler
idt_setup_early_handler函数主要过程
- 对前NUM_EXCEPTION_VECTORS(32)个异常向量,逐个调用set_intr_gate设置对应的中断门,处理入口为early_idt_handler_array[i]
- load_idt将idt_descr加载到IDTR寄存器,使新的IDT生效
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
| void __init idt_setup_early_handler(void) { int i;
for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) set_intr_gate(i, early_idt_handler_array[i]); load_idt(&idt_descr); }
struct desc_ptr idt_descr __ro_after_init = { .size = (IDT_ENTRIES * 2 * sizeof(unsigned long)) - 1, .address = (unsigned long) idt_table, };
|
early_idt_handler_array是一个32x9的数组,每一项(stub)最多占9字节:2个字节的可选指令pushq $0,用于在异常本身不提供错误码时向栈中压入一个占位错误码;2个字节的指令用于向栈中压入向量号;剩余最多5个字节用于跳转到公共处理程序early_idt_handler_common,不足9字节的部分用0xcc填充
1 2 3 4
| #define NUM_EXCEPTION_VECTORS 32 #define EARLY_IDT_HANDLER_SIZE 9 extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE];
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
| # arch\x86\kernel\head_64.S ENTRY(early_idt_handler_array) i = 0 .rept NUM_EXCEPTION_VECTORS .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 # 如果当前中断向量不需要错误码 UNWIND_HINT_IRET_REGS pushq $0 # 将一个值为0的64位数据压入栈中,这是为了在栈帧中保持统一的结构,即使当前中断向量不需要错误码 .else UNWIND_HINT_IRET_REGS offset=8 .endif pushq $i # 入栈中断向量号 jmp early_idt_handler_common # 跳转至同一异常处理程序 UNWIND_HINT_IRET_REGS i = i + 1 .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc # 剩下的空间填充0xcc .endr UNWIND_HINT_IRET_REGS offset=16 END(early_idt_handler_array)
|
最后长这样
set_intr_gate函数用一个idt_data结构体打包数据,然后传给idt_setup_from_table函数,同时把idt_table也传过去
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
| struct idt_bits { u16 ist : 3, zero : 5, type : 5, dpl : 2, p : 1; } __attribute__((packed));
struct idt_data { unsigned int vector; unsigned int segment; struct idt_bits bits; const void *addr; };
static void set_intr_gate(unsigned int n, const void *addr) { struct idt_data data;
BUG_ON(n > 0xFF);
memset(&data, 0, sizeof(data)); data.vector = n; data.addr = addr; data.segment = __KERNEL_CS; data.bits.type = GATE_INTERRUPT; data.bits.p = 1;
idt_setup_from_table(idt_table, &data, 1, false); }
|
idt_setup_from_table主要过程
- idt_init_desc初始化gate_desc,基本上就是把idt_data结构体的内容填进gate_desc结构体
- write_idt_entry将gate_desc写入idt_table对应表项
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
| #define IDT_ENTRIES 256
typedef struct gate_struct gate_desc;
struct gate_struct { u16 offset_low; u16 segment; struct idt_bits bits; u16 offset_middle; #ifdef CONFIG_X86_64 u32 offset_high; u32 reserved; #endif } __attribute__((packed));
gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;
static void idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys) { gate_desc desc;
for (; size > 0; t++, size--) { idt_init_desc(&desc, t); write_idt_entry(idt, t->vector, &desc); if (sys) set_bit(t->vector, system_vectors); } }
static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d) { unsigned long addr = (unsigned long) d->addr;
gate->offset_low = (u16) addr; gate->segment = (u16) d->segment; gate->bits = d->bits; gate->offset_middle = (u16) (addr >> 16); #ifdef CONFIG_X86_64 gate->offset_high = (u32) (addr >> 32); gate->reserved = 0; #endif }
|
early_idt_handler_common
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
| # arch\x86\kernel\head_64.S early_idt_handler_common: cld # early_recursion_flag避免在early_idt_handler_common程序中递归地产生中断(有点类似于锁) incl early_recursion_flag(%rip)
/* 压栈通用寄存器 */ pushq %rsi /* pt_regs->si */ movq 8(%rsp), %rsi /* RSI = vector number */ movq %rdi, 8(%rsp) /* pt_regs->di = RDI */ pushq %rdx /* pt_regs->dx */ pushq %rcx /* pt_regs->cx */ pushq %rax /* pt_regs->ax */ pushq %r8 /* pt_regs->r8 */ pushq %r9 /* pt_regs->r9 */ pushq %r10 /* pt_regs->r10 */ pushq %r11 /* pt_regs->r11 */ pushq %rbx /* pt_regs->bx */ pushq %rbp /* pt_regs->bp */ pushq %r12 /* pt_regs->r12 */ pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ UNWIND_HINT_REGS
cmpq $14,%rsi /* 如果是缺页中断则把cr2寄存器中的值赋值给rdi,然后调用early_make_pgtable */ jnz 10f GET_CR2_INTO(%rdi) call early_make_pgtable andl %eax,%eax jz 20f /* All good */
10: movq %rsp,%rdi /* RDI = pt_regs; RSI is already trapnr */ call early_fixup_exception /* 不是缺页中断则调用early_fixup_exception进行下面的步骤 */
20: decl early_recursion_flag(%rip) /* 解锁 */ jmp restore_regs_and_return_to_kernel /* kernel人最熟悉的函数(,恢复寄存器并返回 */ END(early_idt_handler_common)
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
| void __init early_fixup_exception(struct pt_regs *regs, int trapnr) { if (trapnr == X86_TRAP_NMI) return; if (early_recursion_flag > 2) goto halt_loop; if (!xen_pv_domain() && regs->cs != __KERNEL_CS) goto fail;
if (fixup_exception(regs, trapnr)) return;
if (fixup_bug(regs, trapnr)) return;
fail: early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n", (unsigned)trapnr, (unsigned long)regs->cs, regs->ip, regs->orig_ax, read_cr2());
show_regs(regs);
halt_loop: while (true) halt(); }
|
early_make_pgtable
传给early_make_pgtable的参数address是cr2的值:发生缺页异常(#PF,向量号14)时,CPU会把触发异常的线性地址保存在cr2中
1 2 3 4 5 6 7 8 9
| int __init early_make_pgtable(unsigned long address) { unsigned long physaddr = address - __PAGE_OFFSET; pmdval_t pmd;
pmd = (physaddr & PMD_MASK) + early_pmd_flags;
return __early_make_pgtable(address, pmd); }
|
先看一些定义
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
|
typedef unsigned long pteval_t; typedef unsigned long pmdval_t; typedef unsigned long pudval_t; typedef unsigned long p4dval_t; typedef unsigned long pgdval_t; typedef unsigned long pgprotval_t;
typedef struct { pteval_t pte; } pte_t;
#define EARLY_DYNAMIC_PAGE_TABLES 64
static unsigned int __initdata next_early_pgt;
|
1 2 3
| # arch\x86\kernel\head_64.S NEXT_PAGE(early_dynamic_pgts) # 一个固定大小缓冲区 .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
| int __init __early_make_pgtable(unsigned long address, pmdval_t pmd) { unsigned long physaddr = address - __PAGE_OFFSET; pgdval_t pgd, *pgd_p; p4dval_t p4d, *p4d_p; pudval_t pud, *pud_p; pmdval_t *pmd_p;
if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt)) return -1;
again: pgd_p = &early_top_pgt[pgd_index(address)].pgd; pgd = *pgd_p;
if (!IS_ENABLED(CONFIG_X86_5LEVEL)) p4d_p = pgd_p; else if (pgd) p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); else { if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { reset_early_page_tables(); goto again; }
p4d_p = (p4dval_t *)early_dynamic_pgts[next_early_pgt++]; memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D); *pgd_p = (pgdval_t)p4d_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } p4d_p += p4d_index(address); p4d = *p4d_p;
if (p4d) pud_p = (pudval_t *)((p4d & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); else { if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { reset_early_page_tables(); goto again; }
pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); *p4d_p = (p4dval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } pud_p += pud_index(address); pud = *pud_p;
if (pud) pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); else { if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { reset_early_page_tables(); goto again; }
pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++]; memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } pmd_p[pmd_index(address)] = pmd;
return 0; }
|
copy_bootdata
这里的重点不是copy_bootdata函数本身,而是它在访问real_mode_data时会触发缺页异常,进而使页表发生变化
调用copy_bootdata时使用了一个宏,对real_mode_data的值进行了修正
1 2 3 4
| copy_bootdata(__va(real_mode_data));
#define __va(x) ((void *)__phys_to_virt((phys_addr_t)(x)))
|
__va(x)展开后相当于x + PAGE_OFFSET,即把物理地址转换为内核直接映射区(线性映射)中的虚拟地址,调试结果也印证了这一点
显然这时候的页表没有这个地址的映射,所以会触发缺页中断