Kernel源码分析-系统启动(二)

学userfaultfd→涉及mmap机制→为了mmap机制开始看linux内存管理系列源码解析→开始好奇类似于page结构体之类的东西是咋初始化的→继续研究系统启动

总结:万物的源头是系统启动doge(又在支线任务上越走越远了)

ps:还是主要关注内存相关的部分

内核版本4.15.8,x86_64

startup_64(书接上回)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# arch\x86\kernel\head_64.S
1:
UNWIND_HINT_EMPTY

/* Query CPUID leaf 0x80000001 and keep EDX in EDI so that the NX bit can be tested below */
movl $0x80000001, %eax
cpuid
movl %edx,%edi

/* Put MSR_EFER (0xc0000080) into ECX and execute rdmsr to read that Model Specific Register (MSR) */
movl $MSR_EFER, %ecx
rdmsr
btsl $_EFER_SCE, %eax /* enable syscall and sysret */
btl $20,%edi /* NX not supported? */
jnc 1f
btsl $_EFER_NX, %eax
btsq $_PAGE_BIT_NX,early_pmd_flags(%rip)
1: wrmsr /* write EFER back so the changes above take effect */

/* Set up cr0 */
movl $CR0_STATE, %eax
movq %rax, %cr0

/* Switch to the boot-time stack */
movq initial_stack(%rip), %rsp

/* Clear eflags */
pushq $0
popfq

/* Load the new global descriptor table */
lgdt early_gdt_descr(%rip)

/* Reload the segment registers with zero */
xorl %eax,%eax
movl %eax,%ds
movl %eax,%ss
movl %eax,%es
movl %eax,%fs
movl %eax,%gs

/*
 * Write the 64-bit value stored at initial_gs into MSR_GS_BASE
 * (low half in EAX, high half in EDX).
 * NOTE(review): the original note says this makes GS point at a special
 * stack (irqstack) used for interrupt handling - confirm against the
 * definition of initial_gs.
 */
movl $MSR_GS_BASE,%ecx
movl initial_gs(%rip),%eax
movl initial_gs+4(%rip),%edx
wrmsr

/* Copy RSI into RDI, the first C argument register - presumably the real_mode_data pointer taken by x86_64_start_kernel; confirm */
movq %rsi, %rdi

.Ljump_to_C_code:
pushq $.Lafter_lret # put return address on stack for unwinder
xorq %rbp, %rbp # clear frame pointer
movq initial_code(%rip), %rax # initial_code points to x86_64_start_kernel
pushq $__KERNEL_CS # set correct cs
pushq %rax # target address in negative space
lretq # far return: jumps to x86_64_start_kernel

initial_stack

1
2
3
# arch\x86\kernel\head_64.S
/* Initial boot stack pointer: the end of init_thread_union (base + THREAD_SIZE) minus SIZEOF_PTREGS bytes */
GLOBAL(initial_stack)
.quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS

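init_thread_union是一块THREAD_SIZE大小的区域,整体作为栈使用:栈从高地址向低地址增长,所以initial_stack指向区域末尾(init_thread_union + THREAD_SIZE),并在其下预留了SIZEOF_PTREGS的空间;只有在未定义CONFIG_THREAD_INFO_IN_TASK时,thread_info才会放在该区域的最低地址处(即栈的尽头,而不是栈顶)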

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
// include\linux\sched.h
/*
 * Kernel stack storage of THREAD_SIZE bytes. Unless
 * CONFIG_THREAD_INFO_IN_TASK is defined, a struct thread_info overlays
 * the start of the same storage.
 */
union thread_union {
#ifndef CONFIG_THREAD_INFO_IN_TASK
struct thread_info thread_info;
#endif
unsigned long stack[THREAD_SIZE/sizeof(long)];
};

// include\linux\init_task.h
#define __init_task_data __attribute__((__section__(".data..init_task")))

// arch\ia64\include\asm\thread_info.h
// Static initializer for a struct thread_info that belongs to task `tsk`.
// NOTE(review): this is the ia64 definition, while the article analyses
// x86_64; the x86 version lives in arch/x86/include/asm/thread_info.h and
// differs from this one - confirm against the 4.15.8 tree.
#define INIT_THREAD_INFO(tsk) \
{ \
.task = &tsk, \
.flags = 0, \
.cpu = 0, \
.addr_limit = KERNEL_DS, \
.preempt_count = INIT_PREEMPT_COUNT, \
}

// init\init_task.c
/* The statically initialised task_struct named init_task, exported for use elsewhere */
struct task_struct init_task = INIT_TASK(init_task);
EXPORT_SYMBOL(init_task);

/*
 * Stack union for init_task, placed in the ".data..init_task" section via
 * __init_task_data. The embedded thread_info is initialised only when
 * CONFIG_THREAD_INFO_IN_TASK is not defined.
 */
union thread_union init_thread_union __init_task_data = {
#ifndef CONFIG_THREAD_INFO_IN_TASK
INIT_THREAD_INFO(init_task)
#endif
};

early_gdt_descr

1
2
3
4
5
6
7
8
# arch\x86\kernel\head_64.S
.data
.align 16
.globl early_gdt_descr
/* Operand loaded by `lgdt early_gdt_descr(%rip)`: a 16-bit limit followed by a 64-bit base */
early_gdt_descr:
.word GDT_ENTRIES*8-1 /* limit: GDT_ENTRIES descriptors of 8 bytes each, minus one */
early_gdt_descr_base:
.quad INIT_PER_CPU_VAR(gdt_page) /* base: the gdt_page per-CPU variable */

此前使用的GDT是更早的启动阶段加载的,位于低地址的启动代码中;内核很快就要完全在它自己的(高地址)内存地址空间中运行,所以这里要用lgdt重新加载全局描述符表,使其指向内核自己的gdt_page

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
// arch\x86\include\asm\segment.h
#define GDT_ENTRIES 16

// arch\x86\include\asm\desc.h
/* A GDT of GDT_ENTRIES descriptors, aligned to PAGE_SIZE */
struct gdt_page {
struct desc_struct gdt[GDT_ENTRIES];
} __attribute__((aligned(PAGE_SIZE)));

// arch\x86\include\asm\desc_defs.h
/*
 * 8 byte segment descriptor.
 * Four 16-bit words: limit0, base0, then two words of bit-fields. The
 * limit is split across limit0/limit1 and the base across
 * base0/base1/base2; the remaining bit-fields (type, s, dpl, p, avl, l,
 * d, g) are attribute bits.
 */
struct desc_struct {
u16 limit0;
u16 base0;
u16 base1: 8, type: 4, s: 1, dpl: 2, p: 1;
u16 limit1: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8;
} __attribute__((packed));

x86_64_start_kernel

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
// arch\x86\kernel\head64.c
/*
 * First C function reached from the assembly boot path (via initial_code).
 * Performs early page-table, BSS and IDT setup, copies the boot data and
 * then hands over to x86_64_start_reservations().
 *
 * real_mode_data: address of the boot data; it is converted with __va()
 * before being passed to copy_bootdata().
 */
asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
{
/* Compile-time sanity checks on the kernel/module virtual address layout */
BUILD_BUG_ON(MODULES_VADDR < __START_KERNEL_map);
BUILD_BUG_ON(MODULES_VADDR - __START_KERNEL_map < KERNEL_IMAGE_SIZE);
BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE);
BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0);
BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);
BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
(__START_KERNEL & PGDIR_MASK)));
BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);

/* Initialise the shadow copy of CR4 (per the original note, one is kept for each CPU) */
cr4_init_shadow();

/* Zero every early_top_pgt entry except the last one and reload CR3 with the address of early_top_pgt */
reset_early_page_tables();

/* Clear the BSS section */
clear_bss();

/* Clear the init_top_pgt page */
clear_page(init_top_pgt);

/* Early SME and KASAN initialisation */
sme_early_init();

kasan_early_init();

/* Install the early interrupt/exception handlers */
idt_setup_early_handler();

/* Process boot_params; real_mode_data is first converted with __va() */
copy_bootdata(__va(real_mode_data));

/* Load the processor microcode */
load_ucode_bsp();

/* Populate init_top_pgt: copy entry 511 from early_top_pgt */
init_top_pgt[511] = early_top_pgt[511];

x86_64_start_reservations(real_mode_data); // this function ends by calling start_kernel
}
1
2
3
4
# arch\x86\kernel\head_64.S
/* init_top_pgt: 512 zero-filled 8-byte entries, followed by PTI_USER_PGD_FILL more zeroed 8-byte entries */
NEXT_PGD_PAGE(init_top_pgt)
.fill 512,8,0
.fill PTI_USER_PGD_FILL,8,0

reset_early_page_tables

清空early_top_pgt的前511项(最后一项保留)并把next_early_pgt重置为0,然后将early_top_pgt的物理地址写入cr3

1
2
3
4
5
6
7
// arch\x86\kernel\head64.c
/*
 * Wipe the early page tables: zero the first PTRS_PER_PGD-1 entries of
 * early_top_pgt (the last entry is left intact), mark the pool of early
 * page tables as empty again, and reload CR3 with early_top_pgt.
 */
static void __init reset_early_page_tables(void)
{
memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
next_early_pgt = 0; /* the next allocation starts again from the first pool slot */
write_cr3(__sme_pa_nodebug(early_top_pgt));
}

其实就是把之前建立的物理地址映射清空了:early_top_pgt的前511项被清零,之后由缺页处理按需重新填充;相关内容见系统启动(一)

idt_setup_early_handler

idt_setup_early_handler函数主要过程

  • set_intr_gate函数为前NUM_EXCEPTION_VECTORS(32)个异常向量逐个设置中断门
  • load_idt加载中断描述符表(IDT)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
// arch\x86\kernel\idt.c
/*
 * Install the early exception stubs: point each of the first
 * NUM_EXCEPTION_VECTORS IDT entries at its stub in
 * early_idt_handler_array, then load the IDT described by idt_descr.
 */
void __init idt_setup_early_handler(void)
{
int i;

for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
set_intr_gate(i, early_idt_handler_array[i]);
load_idt(&idt_descr);
}

// arch\x86\kernel\idt.c
/*
 * Descriptor passed to load_idt(): the table size in bytes minus one
 * (IDT_ENTRIES entries, each two unsigned longs wide) and the address of
 * idt_table.
 */
struct desc_ptr idt_descr __ro_after_init = {
.size = (IDT_ENTRIES * 2 * sizeof(unsigned long)) - 1,
.address = (unsigned long) idt_table,
};

early_idt_handler_array是一个32x9的数组,即32个异常向量各对应一段9字节的入口代码:其中2个字节是可选的指令,用于向栈中压入默认错误码0(仅当异常本身不提供错误码时才有),2个字节的指令用于向栈中压入向量号,剩余5个字节用于跳转到公共的异常处理程序early_idt_handler_common,用不满的部分以0xcc填充

1
2
3
4
// arch\x86\include\asm\segment.h
/* Number of exception vectors that get an early stub */
#define NUM_EXCEPTION_VECTORS 32
/* Size in bytes of one stub slot in early_idt_handler_array */
#define EARLY_IDT_HANDLER_SIZE 9
/* One EARLY_IDT_HANDLER_SIZE-byte stub per exception vector */
extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE];
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# arch\x86\kernel\head_64.S
/* Emits NUM_EXCEPTION_VECTORS stubs, each padded to EARLY_IDT_HANDLER_SIZE bytes */
ENTRY(early_idt_handler_array)
i = 0
.rept NUM_EXCEPTION_VECTORS
.if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 # vector i has no error code of its own
UNWIND_HINT_IRET_REGS
pushq $0 # push a 64-bit zero as a dummy error code so that every vector ends up with the same stack frame layout
.else
UNWIND_HINT_IRET_REGS offset=8
.endif
pushq $i # push the vector number
jmp early_idt_handler_common # every stub jumps to the same common handler
UNWIND_HINT_IRET_REGS
i = i + 1
.fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc # pad the remainder of this stub's slot with 0xcc
.endr
UNWIND_HINT_IRET_REGS offset=16
END(early_idt_handler_array)

最后长这样

set_intr_gate函数用一个idt_data结构体打包数据,然后传给idt_setup_from_table函数,同时把idt_table也传过去

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
// arch\x86\include\asm\desc_defs.h
/* 16-bit attribute word of a gate descriptor, packed as bit-fields: ist (3), zero (5), type (5), dpl (2), p (1) */
struct idt_bits {
u16 ist : 3,
zero : 5,
type : 5,
dpl : 2,
p : 1;
} __attribute__((packed));

// arch\x86\kernel\idt.c
/* Description of one IDT entry, consumed by idt_setup_from_table() */
struct idt_data {
unsigned int vector; /* index of the entry in the IDT */
unsigned int segment; /* code segment selector stored in the gate */
struct idt_bits bits; /* attribute bits copied into the gate */
const void *addr; /* handler address */
};

/*
 * Install an interrupt gate for vector n that points at addr.
 * Packs the request into a zeroed struct idt_data (segment __KERNEL_CS,
 * type GATE_INTERRUPT, present bit set) and writes it into idt_table via
 * idt_setup_from_table(). Vectors above 0xFF trigger BUG_ON().
 */
static void set_intr_gate(unsigned int n, const void *addr)
{
struct idt_data data;

BUG_ON(n > 0xFF);

memset(&data, 0, sizeof(data));
data.vector = n;
data.addr = addr;
data.segment = __KERNEL_CS;
data.bits.type = GATE_INTERRUPT;
data.bits.p = 1;

/* A one-element table; `false` means the vector is not recorded in system_vectors */
idt_setup_from_table(idt_table, &data, 1, false);
}

idt_setup_from_table主要过程

  • idt_init_desc初始化gate_desc,基本上就是把idt_data结构体的内容填进gate_desc结构体
  • write_idt_entry将gate_desc写入idt_table对应表项
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
// arch\x86\include\asm\segment.h
#define IDT_ENTRIES 256

// arch\x86\kernel\idt.c
typedef struct gate_struct gate_desc;

/*
 * In-memory layout of one IDT gate. The handler address is split into
 * offset_low (bits 0-15), offset_middle (bits 16-31) and, on
 * CONFIG_X86_64, offset_high (bits 32-63), matching how idt_init_desc()
 * fills it in.
 */
struct gate_struct {
u16 offset_low;
u16 segment;
struct idt_bits bits;
u16 offset_middle;
#ifdef CONFIG_X86_64
u32 offset_high;
u32 reserved;
#endif
} __attribute__((packed));

gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;

/*
 * Write `size` entries described by the array `t` into the IDT `idt`.
 * Each idt_data is converted into a gate_desc and stored at index
 * t->vector; when `sys` is true the vector is also marked in
 * system_vectors.
 */
static void
idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys)
{
gate_desc desc;

for (; size > 0; t++, size--) {
idt_init_desc(&desc, t);
write_idt_entry(idt, t->vector, &desc);
if (sys)
set_bit(t->vector, system_vectors);
}
}

/*
 * Fill the gate descriptor `gate` from `d`: copy the segment selector and
 * attribute bits, and split the handler address across the offset fields.
 */
static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d)
{
unsigned long addr = (unsigned long) d->addr;

gate->offset_low = (u16) addr; /* address bits 0-15 */
gate->segment = (u16) d->segment;
gate->bits = d->bits;
gate->offset_middle = (u16) (addr >> 16); /* address bits 16-31 */
#ifdef CONFIG_X86_64
gate->offset_high = (u32) (addr >> 32); /* address bits 32-63 */
gate->reserved = 0;
#endif
}

early_idt_handler_common

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# arch\x86\kernel\head_64.S
early_idt_handler_common:
cld

# early_recursion_flag guards against exceptions being raised recursively inside early_idt_handler_common (somewhat like a lock)
incl early_recursion_flag(%rip)

/* Push the general-purpose registers */
pushq %rsi /* pt_regs->si */
movq 8(%rsp), %rsi /* RSI = vector number */
movq %rdi, 8(%rsp) /* pt_regs->di = RDI */
pushq %rdx /* pt_regs->dx */
pushq %rcx /* pt_regs->cx */
pushq %rax /* pt_regs->ax */
pushq %r8 /* pt_regs->r8 */
pushq %r9 /* pt_regs->r9 */
pushq %r10 /* pt_regs->r10 */
pushq %r11 /* pt_regs->r11 */
pushq %rbx /* pt_regs->bx */
pushq %rbp /* pt_regs->bp */
pushq %r12 /* pt_regs->r12 */
pushq %r13 /* pt_regs->r13 */
pushq %r14 /* pt_regs->r14 */
pushq %r15 /* pt_regs->r15 */
UNWIND_HINT_REGS

cmpq $14,%rsi /* vector 14 is a page fault: load CR2 into RDI and call early_make_pgtable */
jnz 10f
GET_CR2_INTO(%rdi)
call early_make_pgtable
andl %eax,%eax
jz 20f /* All good */

10:
movq %rsp,%rdi /* RDI = pt_regs; RSI is already trapnr */
call early_fixup_exception /* reached for every non-page-fault vector, and also when early_make_pgtable returned non-zero */

20:
decl early_recursion_flag(%rip) /* undo the increment above ("unlock") */
jmp restore_regs_and_return_to_kernel /* restore the saved registers and return */
END(early_idt_handler_common)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
// arch\x86\mm\extable.c
/*
 * C half of the early exception path, called from
 * early_idt_handler_common with the saved registers and the vector
 * number. Returns if the exception is ignored or handled; otherwise
 * prints a panic message, dumps the registers and halts forever.
 */
void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
{
/* Ignore NMIs */
if (trapnr == X86_TRAP_NMI)
return;

/* early_recursion_flag > 2 means early exceptions have nested too deeply: skip the report and just halt */
if (early_recursion_flag > 2)
goto halt_loop;

/* Unless running as a Xen PV domain, the saved CS must be __KERNEL_CS; otherwise report the failure */
if (!xen_pv_domain() && regs->cs != __KERNEL_CS)
goto fail;

/*
 * Try fixup_exception(); a non-zero result means the exception was
 * handled (not expanded here).
 * NOTE(review): upstream this consults the exception fixup table rather
 * than running a per-vector handler - confirm.
 */
if (fixup_exception(regs, trapnr))
return;

/* Try fixup_bug(); a non-zero result also counts as handled. NOTE(review): presumably covers BUG()/WARN() traps - confirm */
if (fixup_bug(regs, trapnr))
return;

fail:
early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n",
(unsigned)trapnr, (unsigned long)regs->cs, regs->ip,
regs->orig_ax, read_cr2());

show_regs(regs);

halt_loop:
while (true)
halt();
}

early_make_pgtable

传给early_make_pgtable的参数address是cr2的值:发生缺页中断时,cr2中保存的就是触发这次缺页的线性地址

1
2
3
4
5
6
7
8
9
/*
 * Early page-fault helper: build a PMD value for the faulting address
 * and hand it to __early_make_pgtable() to be installed.
 *
 * address: the faulting virtual address (the CR2 value).
 * Returns whatever __early_make_pgtable() returns (0 on success, -1 on
 * failure).
 */
int __init early_make_pgtable(unsigned long address)
{
/*
 * Subtracting __PAGE_OFFSET yields the physical address.
 * NOTE(review): the original note gave __PAGE_OFFSET as
 * 0xffffffff80000000, but that value is __START_KERNEL_map. The
 * documented base of the direct mapping with 4-level paging is
 * 0xffff880000000000 (it may be randomised when memory KASLR is
 * enabled) - confirm for the configuration being debugged.
 */
unsigned long physaddr = address - __PAGE_OFFSET; // physical address behind the faulting virtual address
pmdval_t pmd;

/* PMD value = PMD-aligned physical address plus the early_pmd_flags bits */
pmd = (physaddr & PMD_MASK) + early_pmd_flags;

return __early_make_pgtable(address, pmd);
}

先看一些定义

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
// arch\x86\include\asm\pgtable_64_types.h

/* Raw value types for each page-table level and for protection bits; all are unsigned long */
typedef unsigned long pteval_t;
typedef unsigned long pmdval_t;
typedef unsigned long pudval_t;
typedef unsigned long p4dval_t;
typedef unsigned long pgdval_t;
typedef unsigned long pgprotval_t;

/* Wrapper struct around a raw PTE value */
typedef struct { pteval_t pte; } pte_t;

/* Number of page-table pages available in the early_dynamic_pgts pool */
#define EARLY_DYNAMIC_PAGE_TABLES 64

// arch\x86\kernel\head64.c
/* Index of the next unused page-table page in early_dynamic_pgts */
static unsigned int __initdata next_early_pgt;
1
2
3
# arch\x86\kernel\head_64.S
NEXT_PAGE(early_dynamic_pgts) # fixed-size pool of page-table pages
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 # EARLY_DYNAMIC_PAGE_TABLES tables of 512 zeroed 8-byte entries
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
// arch\x86\kernel\head64.c
/*
 * Walk the early page tables for `address`, allocating missing
 * intermediate tables from the early_dynamic_pgts pool, and store `pmd`
 * in the PMD slot that covers the address.
 *
 * When the pool is exhausted the early page tables are reset (which also
 * empties the pool) and the walk restarts from the top.
 *
 * Returns 0 on success, or -1 if the physical address is >= MAXMEM or
 * CR3 does not point at early_top_pgt.
 */
int __init __early_make_pgtable(unsigned long address, pmdval_t pmd)
{
unsigned long physaddr = address - __PAGE_OFFSET; // physical address behind the virtual address
pgdval_t pgd, *pgd_p; // page global directory (pgd); the original note's "17-25" counts bits from the MSB, i.e. address bits 47:39
p4dval_t p4d, *p4d_p; // a separate level only with 5-level paging
pudval_t pud, *pud_p; // page upper directory (pud); "26-34" from the MSB, i.e. address bits 38:30
pmdval_t *pmd_p; // page middle directory (pmd); "35-43" from the MSB, i.e. address bits 29:21

/* Invalid address, or CR3 is not pointing at early_top_pgt */
if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
return -1;

again:
pgd_p = &early_top_pgt[pgd_index(address)].pgd;
pgd = *pgd_p;

if (!IS_ENABLED(CONFIG_X86_5LEVEL)) // without 5-level paging the pgd entry doubles as the p4d entry
p4d_p = pgd_p;
else if (pgd) // 5-level paging and the pgd entry is populated: follow it to the p4d table
p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
else {
if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { // the pool is exhausted: reset the early page tables and start over
reset_early_page_tables();
goto again;
}

p4d_p = (p4dval_t *)early_dynamic_pgts[next_early_pgt++]; // take the next free page from early_dynamic_pgts as the p4d table
memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
*pgd_p = (pgdval_t)p4d_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; // point the pgd entry at the new table (physical address plus flags)
}
p4d_p += p4d_index(address);
p4d = *p4d_p; // read the p4d entry (which is the pgd entry without 5-level paging)

// The pud and pmd levels below follow the same pattern

if (p4d)
pud_p = (pudval_t *)((p4d & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
else {
if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
reset_early_page_tables();
goto again;
}

pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
*p4d_p = (p4dval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
}
pud_p += pud_index(address);
pud = *pud_p;

if (pud)
pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
else {
if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
reset_early_page_tables();
goto again;
}

pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
*pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
}
pmd_p[pmd_index(address)] = pmd; // install the entry: the caller-built physical address plus flags

return 0;
}

copy_bootdata

这里的重点不是copy_bootdata函数本身,而是调用它时会触发缺页中断,从而使页表发生变化

调用copy_bootdata时使用了一个宏,对real_mode_data的值进行了修正

1
2
3
4
copy_bootdata(__va(real_mode_data));

// arch\arm\include\asm\memory.h
// Converts a physical address into a kernel virtual address.
// NOTE(review): this is the ARM definition, while the article analyses
// x86_64; there the macro lives in arch/x86/include/asm/page.h and is
// believed to be ((void *)((unsigned long)(x)+PAGE_OFFSET)) - confirm
// against the 4.15.8 tree.
#define __va(x) ((void *)__phys_to_virt((phys_addr_t)(x)))

根据调试结果,这个宏是把物理地址转换成了直接映射(线性映射)区中对应的虚拟地址(在x86_64上即物理地址加上PAGE_OFFSET)

显然这时候的页表没有这个地址的映射,所以会触发缺页中断


Kernel源码分析-系统启动(二)
http://akaieurus.github.io/2023/10/07/Kernel源码分析-系统启动(二)/
作者
Eurus
发布于
2023年10月7日
许可协议