Kernel源码分析-系统启动(一)

前几天沉迷调试SeaBIOS无法自拔,调了两天终于跟踪完了BIOS和boot loader的过程进入内核启动阶段了,之后再整理一下可能会写一下SeaBIOS和boot loader的源码分析和调试(立一个flag)

之所以调试SeaBIOS是因为想从头开始调试内核但找不到入口,然后支线任务做着做着就偏离主线了(ˉ▽ˉ;)…,现在终于回归主线了

ps:主要关注内存相关的部分,页表和段之类的

内核版本4.15.8,x86_64

Setup

kernel加载到了物理地址0x10000,源码可以看出来

1
2
3
4
#arch\x86\boot\header.S

BOOTSEG = 0x07C0 /* original address of boot-sector */
SYSSEG = 0x1000 /* historical load address >> 4 */

或者gdb调试也可以看出来,此时cs寄存器的值为0x1020

进入内核的第一条指令是一个跳转至start_of_setup,地址为0x10268

1
2
3
4
5
6
7
8
9
10
11
#arch\x86\boot\header.S

# offset 512, entry point

.globl _start
_start:
# Explicitly enter this as bytes, or the assembler
# tries to generate a 3-byte jump here, which causes
# everything else to push off to the wrong offset.
.byte 0xeb # short (2-byte) jump
.byte start_of_setup-1f

start_of_setup的主要作用有

  • 保证所有段寄存器值相等
  • 建立栈
  • 建立bss段
  • 跳转至main函数
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#arch\x86\boot\header.S

.section ".entrytext", "ax"
start_of_setup:
# Real-mode setup entry: normalise the segment registers, make sure a usable
# stack exists, verify the setup signature, zero the bss and call main.

# Make %es equal to %ds
movw %ds, %ax
movw %ax, %es
cld

# Stack check: if %ss already equals %ds, the existing %sp is assumed usable

movw %ss, %dx
cmpw %ax, %dx # %ds == %ss?
movw %sp, %dx
je 2f # -> assume %sp is reasonably set

# %ss differs from %ds: the stack is not valid, so build a new one.
# Start from heap_end_ptr when CAN_USE_HEAP is set in loadflags, otherwise
# from _end, then add STACK_SIZE.
movw $_end, %dx
testb $CAN_USE_HEAP, loadflags
jz 1f
movw heap_end_ptr, %dx
1: addw $STACK_SIZE, %dx
jnc 2f
xorw %dx, %dx # the 16-bit add wrapped: use 0, replaced by 0xfffc below

2: # %dx now holds the prospective stack top
andw $~3, %dx # round down to a 4-byte boundary
jnz 3f
movw $0xfffc, %dx # make sure %dx is not zero
3: movw %ax, %ss # stack segment = data segment
movzwl %dx, %esp
sti # the stack is now set up

# Bring %cs in line with the other segment registers (in the debug session
# described in the article %cs was 0x1020 and the others 0x1000)
pushw %ds
pushw $6f
lretw # far return: pops the offset of 6: into %ip and the saved %ds value into %cs
6:

# Check the setup signature
cmpl $0x5a5aaa55, setup_sig
jne setup_bad

# Zero the bss, 4 bytes at a time, from __bss_start up to _end (rounded up)
movw $__bss_start, %di
movw $_end+3, %cx
xorl %eax, %eax
subw %di, %cx
shrw $2, %cx
rep; stosl

# Call the C entry point
calll main

# Setup corrupt somehow...
setup_bad:
movl $setup_corrupt, %eax
calll puts
# Fall through...

.globl die
.type die, @function
die:
hlt
jmp die

.size die, .-die

.section ".initdata", "a"
setup_corrupt:
.byte 7
.string "No setup signature found...\n"

这段代码结束后

  • sp:0xfff0
  • __bss_start:0x13c00
  • _end:0x14f10

main

main函数的主要功能为

  • 初始化console、heap
  • 检测内存、CPU验证、键盘初始化
  • 进入保护模式
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
//arch\x86\boot\main.c

void main(void)
{
/* 将boot header复制到zeropage中 */
copy_boot_params();

/* 初始化早期启动状态下的控制台 */
console_init();
if (cmdline_find_option_bool("debug"))
puts("early console in setup code\n");

/* 初始化堆 */
init_heap();

/* 检测 CPU 相关信息 */
if (validate_cpu()) {
puts("Unable to boot - please use a kernel appropriate "
"for your CPU.\n");
die();
}

/* 通过向 BIOS 查询的方式,收集硬件相关信息,并将结果存放在zeropage中 */
set_bios_mode();

/* 从BIOS处收集内存信息,包括内存段起始、内存段大小、内存段类型等 */
detect_memory();

/* 初始化键盘 */
keyboard_init();

/* 询问IST */
query_ist();

/* 询问APM */
#if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE)
query_apm_bios();
#endif

/* 询问EDD */
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
query_edd();
#endif

/* 设置显示模式 */
set_video();

/* 切换保护模式 */
go_to_protected_mode();
}

copy_boot_params

copy_boot_params主要完成:

  • 将hdr的内容复制给boot_params:0x13ef0
  • 如果使用的是旧式command line协议(cmd_line_ptr为0且OLD_CL_ADDRESS处的magic匹配),则根据cl_offset判断命令行所在的段:当前ds或0x9000(即物理地址0x90000);这里并不复制命令行本身
  • 更新boot_params.cmd_line_ptr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/*
 * Copy the setup header (hdr) into boot_params.hdr. If the loader used the
 * old command-line protocol (cmd_line_ptr is zero and the magic value at
 * OLD_CL_ADDRESS matches), derive cmd_line_ptr from the old segment:offset
 * pair. The command line itself is not copied; only the pointer is computed.
 */
static void copy_boot_params(void)
{
struct old_cmdline {
u16 cl_magic;
u16 cl_offset;
};
const struct old_cmdline * const oldcmd =
(const struct old_cmdline *)OLD_CL_ADDRESS;

/* boot_params must be exactly one 4096-byte page */
BUILD_BUG_ON(sizeof boot_params != 4096);
memcpy(&boot_params.hdr, &hdr, sizeof hdr);

if (!boot_params.hdr.cmd_line_ptr &&
oldcmd->cl_magic == OLD_CL_MAGIC) {
/* Old-style command line protocol. */
u16 cmdline_seg;

/* Figure out if the command line falls in the region
of memory that an old kernel would have copied up
to 0x90000... */
if (oldcmd->cl_offset < boot_params.hdr.setup_move_size)
cmdline_seg = ds();
else
cmdline_seg = 0x9000;

/* Linear address = segment * 16 + offset */
boot_params.hdr.cmd_line_ptr =
(cmdline_seg << 4) + oldcmd->cl_offset;
}
}

ps:

  • hdr的地址为0x1f1,hdr和boot_params的地址都可以通过调试得到(定位内存复制的命令)

  • hdr结构定义在header.S文件中

    1
    2
    3
    4
    5
    6
    7
    8
    9
    	.globl	hdr
    hdr:
    setup_sects: .byte 0 /* Filled in by build.c */
    root_flags: .word ROOT_RDONLY
    syssize: .long 0 /* Filled in by build.c */
    ram_size: .word 0 /* Obsolete */
    vid_mode: .word SVGA_MODE
    root_dev: .word 0 /* Filled in by build.c */
    boot_flag: .word 0xAA55

    md这破玩意我纠结了一下午

init_heap

调试看boot_params结构体可以看出heap_end:0xfe00

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/*
 * Set heap_end when the boot loader set CAN_USE_HEAP in loadflags; otherwise
 * only print a warning and leave heap_end untouched.
 */
static void init_heap(void)
{
char *stack_end;

if (boot_params.hdr.loadflags & CAN_USE_HEAP) {
/* stack_end = %esp - STACK_SIZE */
asm("leal %P1(%%esp),%0"
: "=r" (stack_end) : "i" (-STACK_SIZE));

/* Heap end as given by the loader (heap_end_ptr + 0x200) ... */
heap_end = (char *)
((size_t)boot_params.hdr.heap_end_ptr + 0x200);
/* ... clamped so that it does not reach above stack_end */
if (heap_end > stack_end)
heap_end = stack_end;
} else {
/* Boot protocol 2.00 only, no heap available */
puts("WARNING: Ancient bootloader, some functionality "
"may be limited!\n");
}
}

go_to_protected_mode

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/*
 * Final real-mode steps: run the real-mode switch hook, enable A20, reset the
 * coprocessor, mask interrupts, install the IDT and GDT, then jump to the
 * 32-bit entry point via protected_mode_jump().
 */
void go_to_protected_mode(void)
{
/* Hook before leaving real mode (the article notes that interrupts are
   disabled here; the hook body is not shown) */
realmode_switch_hook();

/* Enable the A20 gate; a non-zero return means failure */
if (enable_a20()) {
puts("A20 gate not responding, unable to boot...\n");
die();
}

reset_coprocessor();

/* Mask all interrupts */
mask_all_interrupts();

/* Set up the IDT and GDT, then make the transition to protected mode.
   The second argument is the linear address of boot_params
   (offset + ds * 16). */
setup_idt();
setup_gdt();
protected_mode_jump(boot_params.hdr.code32_start,
(u32)&boot_params + (ds() << 4));
}

boot_gdt中CS和DS两个段的基址都为0(TSS描述符的基址为4096)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/*
 * Build the descriptor for the static boot GDT (limit and linear base) and
 * load it with lgdtl.
 */
static void setup_gdt(void)
{
/* There are machines which are known to not boot with the GDT
being 8-byte unaligned. Intel recommends 16 byte alignment. */
static const u64 boot_gdt[] __attribute__((aligned(16))) = {
/* CS: code, read/execute, 4 GB, base 0 */
[GDT_ENTRY_BOOT_CS] = GDT_ENTRY(0xc09b, 0, 0xfffff),
/* DS: data, read/write, 4 GB, base 0 */
[GDT_ENTRY_BOOT_DS] = GDT_ENTRY(0xc093, 0, 0xfffff),
/* TSS: 32-bit tss, 104 bytes, base 4096 */
/* We only have a TSS here to keep Intel VT happy;
we don't actually use it for anything. */
[GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(0x0089, 4096, 103),
};
/* Xen HVM incorrectly stores a pointer to the gdt_ptr, instead
of the gdt_ptr contents. Thus, make it static so it will
stay in memory, at least long enough that we switch to the
proper kernel GDT. */
static struct gdt_ptr gdt;

/* Limit is the table size minus one */
gdt.len = sizeof(boot_gdt)-1;
/* Linear address of the table: offset + ds * 16 */
gdt.ptr = (u32)&boot_gdt + (ds() << 4);

asm volatile("lgdtl %0" : : "m" (gdt));
}

进入保护模式:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
GLOBAL(protected_mode_jump)
movl %edx, %esi # %esi = pointer to boot_params

# %ebx = %cs << 4 (linear base of the current code segment); add it to the
# 32-bit offset stored at 2: so that the far jump below targets in_pm32
xorl %ebx, %ebx
movw %cs, %bx
shll $4, %ebx
addl %ebx, 2f
jmp 1f # Short jump to serialize on 386/486
1:

# Selectors consumed by in_pm32 after the mode switch
movw $__BOOT_DS, %cx
movw $__BOOT_TSS, %di

movl %cr0, %edx
orb $X86_CR0_PE, %dl # set CR0.PE: switch to protected mode
movl %edx, %cr0

# Far jump into 32-bit protected mode
.byte 0x66, 0xea # ljmpl opcode
2: .long in_pm32 # offset
.word __BOOT_CS # segment
ENDPROC(protected_mode_jump)

.code32
.section ".text32","ax"
GLOBAL(in_pm32)
# Load the 32-bit data segment selector (left in %ecx by
# protected_mode_jump) into every data segment register
movl %ecx, %ds
movl %ecx, %es
movl %ecx, %fs
movl %ecx, %gs
movl %ecx, %ss
# Rebase the stack pointer for flat addressing: %ebx holds the old
# real-mode %cs << 4
addl %ebx, %esp

# Set up TR to keep Intel VT happy
ltr %di

# Clear the registers
xorl %ecx, %ecx
xorl %edx, %edx
xorl %ebx, %ebx
xorl %ebp, %ebp
xorl %edi, %edi

# Set up LDTR (null selector) to keep Intel VT happy
lldt %cx

jmpl *%eax # Jump to the 32-bit entrypoint
ENDPROC(in_pm32)

之后函数流程跳转到0x100000:startup_32

startup_32

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
ENTRY(startup_32)

cld
testb $KEEP_SEGMENTS, BP_loadflags(%esi)
jnz 1f

cli
movl $(__BOOT_DS), %eax
movl %eax, %ds
movl %eax, %es
movl %eax, %ss
1:

/*
* Long mode has RIP-relative addressing but protected mode does not, so the
* delta between the link-time address (0) and the address we are actually
* running at has to be computed by hand.
* %esi holds the address of boot_params, which was filled in earlier.
* boot_params (not boot_params.hdr) has a member named scratch, addressed
* through BP_scratch (offset 0x1e4 according to the article).
*/
leal (BP_scratch+4)(%esi), %esp # use scratch as a one-slot stack for the call (+4 because the stack grows down)
call 1f # pushes the return address, i.e. the run-time address of 1:, into scratch
1: popl %ebp # %ebp = run-time address of 1:
subl $1b, %ebp # %ebp = run-time address minus link-time address

/* Set up a stack and make sure the CPU supports long mode */
movl $boot_stack_end, %eax
addl %ebp, %eax
movl %eax, %esp

call verify_cpu
testl %eax, %eax
jnz no_longmode

/*
* Compute the address the kernel will be decompressed at
*/

#ifdef CONFIG_RELOCATABLE
movl %ebp, %ebx # %ebx = load delta computed above
movl BP_kernel_alignment(%esi), %eax # %eax = boot_params.hdr.kernel_alignment
decl %eax # eax--
addl %eax, %ebx # ebx += eax
notl %eax # eax = ~eax
andl %eax, %ebx # ebx &= eax
cmpl $LOAD_PHYSICAL_ADDR, %ebx # the steps above round %ebx up to the alignment
jge 1f
#endif
movl $LOAD_PHYSICAL_ADDR, %ebx
1:

/* Target address to relocate to before decompressing */
movl BP_init_size(%esi), %eax # eax = boot_params.hdr.init_size
subl $_end, %eax # eax -= _end
addl %eax, %ebx # ebx += eax (physical address to relocate to)

/*
* Prepare for entering 64-bit mode
*/

/* Fix up the address stored in the GDT descriptor with the load delta, then load it */
addl %ebp, gdt+2(%ebp)
lgdt gdt(%ebp)

/* Enable PAE */
movl %cr4, %eax
orl $X86_CR4_PAE, %eax
movl %eax, %cr4

/*
* Build page tables that map the first 4 GB
*/
/*
* If get_sev_encryption_bit returns a non-zero bit number, build the
* matching mask for the upper 32 bits of each entry in %edx; otherwise
* %edx stays 0.
*/
call get_sev_encryption_bit
xorl %edx, %edx
testl %eax, %eax
jz 1f
subl $32, %eax /* Encryption bit is always above bit 31 */
bts %eax, %edx /* Set encryption mask for page tables */
1:

/* Zero the page-table area (0x2bec000-0x2bf2000 in the article's debug session) */
leal pgtable(%ebx), %edi
xorl %eax, %eax
movl $(BOOT_INIT_PGT_SIZE/4), %ecx
rep stosl

/* Build Level 4 */
leal pgtable + 0(%ebx), %edi
leal 0x1007 (%edi), %eax
movl %eax, 0(%edi)
addl %edx, 4(%edi)

/* Build Level 3 */
leal pgtable + 0x1000(%ebx), %edi
leal 0x1007(%edi), %eax
movl $4, %ecx
1: movl %eax, 0x00(%edi)
addl %edx, 0x04(%edi)
addl $0x00001000, %eax
addl $8, %edi
decl %ecx
jnz 1b

/* Build Level 2 */
leal pgtable + 0x2000(%ebx), %edi
movl $0x00000183, %eax
movl $2048, %ecx
1: movl %eax, 0(%edi)
addl %edx, 4(%edi)
addl $0x00200000, %eax
addl $8, %edi
decl %ecx
jnz 1b

/* Load the address of the top-level table (PML4) into CR3 */
leal pgtable(%ebx), %eax
movl %eax, %cr3

/* Set EFER.LME to enable long mode */
movl $MSR_EFER, %ecx
rdmsr
btsl $_EFER_LME, %eax
wrmsr

/* After gdt is loaded */
xorl %eax, %eax
lldt %ax
movl $__BOOT_TSS, %eax
ltr %ax

/*
* Build the far-return frame (selector, then offset) used to enter
* 64-bit mode
*/
pushl $__KERNEL_CS
leal startup_64(%ebp), %eax
#ifdef CONFIG_EFI_MIXED
movl efi32_config(%ebp), %ebx
cmp $0, %ebx
jz 1f
leal handover_entry(%ebp), %eax
1:
#endif
pushl %eax

/* Enable paging */
movl $(X86_CR0_PG | X86_CR0_PE), %eax /* Enable Paging and Protected mode */
movl %eax, %cr0

/* Jump from 32bit compatibility mode into 64bit mode. */
lret
ENDPROC(startup_32)

一些变量和宏定义

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
#define BLANK() asm volatile("\n.ascii \"->\"" : : )

#define OFFSET(sym, str, mem) \
DEFINE(sym, offsetof(struct str, mem))
#define DEFINE(sym, val) \
asm volatile("\n.ascii \"->" #sym " %0 " #val "\"" : : "i" (val))

BLANK();
OFFSET(BP_scratch, boot_params, scratch);
OFFSET(BP_secure_boot, boot_params, secure_boot);
OFFSET(BP_loadflags, boot_params, hdr.loadflags);
OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
OFFSET(BP_version, boot_params, hdr.version);
OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
OFFSET(BP_init_size, boot_params, hdr.init_size);
OFFSET(BP_pref_address, boot_params, hdr.pref_address);
OFFSET(BP_code32_start, boot_params, hdr.code32_start);

#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
+ (CONFIG_PHYSICAL_ALIGN - 1)) \
& ~(CONFIG_PHYSICAL_ALIGN - 1))

建立栈

建立栈的代码如下:

1
2
3
movl	$boot_stack_end, %eax
addl %ebp, %eax
movl %eax, %esp

boot_stack_end的定义如下:

1
2
3
4
5
6
7
	.bss
.balign 4
boot_heap:
.fill BOOT_HEAP_SIZE, 1, 0
boot_stack:
.fill BOOT_STACK_SIZE, 1, 0
boot_stack_end:
  • 将boot_stack_end的链接地址放入eax
  • 加ebp得到boot_stack_end的实际地址
  • 把栈移过去

更新gdt

gdt的定义如下:

  • gdt大小:.word
  • gdt物理地址:.long
1
2
3
4
5
6
7
8
9
10
11
	.data
gdt:
.word gdt_end - gdt
.long gdt
.word 0
.quad 0x00cf9a000000ffff /* __KERNEL32_CS */
.quad 0x00af9a000000ffff /* __KERNEL_CS */
.quad 0x00cf92000000ffff /* __KERNEL_DS */
.quad 0x0080890000000000 /* TS descriptor */
.quad 0x0000000000000000 /* TS continued */
gdt_end:

更新gdt的过程如下:

1
2
3
/* 更新gdt */
addl %ebp, gdt+2(%ebp)
lgdt gdt(%ebp)
  • gdt+2的内容+=程序基址,更新gdt物理地址
  • 用lgdt指令加载GDTR寄存器(GDTR的内容为gdt大小:gdt物理地址)

初期页表初始化

建立映射4G内存的页表

Linux内核使用4级页表,建立6个页表

  • 1个PML4或称为4级页映射表,包含1个项
  • 1个PDP或称为页目录指针表,包含4个项
  • 4个页目录表,一共包含2048个项,一页2MB

先清理一块内存,每个表都是4096字节,所以需要24KB内存

1
2
3
4
leal	pgtable(%ebx), %edi # ebx是加载的物理地址
xorl %eax, %eax
movl $(BOOT_INIT_PGT_SIZE/4), %ecx
rep stosl

pgtable的定义如下:

1
2
3
4
	.section ".pgtable","a",@nobits
.balign 4096
pgtable:
.fill BOOT_PGT_SIZE, 1, 0

分配空间后先建立PML4:

1
2
3
4
5
/* Build Level 4 */
leal pgtable + 0(%ebx), %edi # edi = pgtable_addr
leal 0x1007 (%edi), %eax # eax = edi + 0x1007
movl %eax, 0(%edi) # [edi] = eax
addl %edx, 4(%edi) # [edi+4] += edx (edx is 0, or the SEV encryption mask when one was set earlier in startup_32)
  • &PML4+0x1000是PDP的地址
  • 7是页表的标记,表示PRESENT+RW+USER但UNACCESSED(开启分页后访问一次就会变成ACCESSED,0x27)

页表标记定义如下(简化过的):

1
2
3
4
5
6
7
8
9
10
11
12
13
#define _PAGE_PRESENT   0x001
#define _PAGE_RW 0x002
#define _PAGE_USER 0x004
#define _PAGE_PWT 0x008
#define _PAGE_PCD 0x010
#define _PAGE_ACCESSED 0x020
#define _PAGE_DIRTY 0x040
#define _PAGE_PSE 0x080
#define _PAGE_GLOBAL 0x100
#define _PAGE_SOFTW1 0x200
#define _PAGE_SOFTW2 0x400
#define _PAGE_PAT 0x800
#define _PAGE_PAT_LARGE 0x1000

初始化PDP的4个项(分别指向4个页目录表):

1
2
3
4
5
6
7
8
9
10
	/* Build Level 3 */
leal pgtable + 0x1000(%ebx), %edi # edi = pgtable_addr + 0x1000 (address of the PDP)
leal 0x1007(%edi), %eax # eax = edi + 0x1007
movl $4, %ecx # ecx = 4 entries to fill
1: movl %eax, 0x00(%edi) # [edi] = eax
addl %edx, 0x04(%edi) # [edi+4] += edx
addl $0x00001000, %eax # eax += 0x1000 (next table, one page further)
addl $8, %edi # edi += 8 (next entry)
decl %ecx # ecx--
jnz 1b

初始化4个页目录表(PD)的共2048个项:

1
2
3
4
5
6
7
8
9
10
	/* Build Level 2 */
leal pgtable + 0x2000(%ebx), %edi # edi = pgtable_addr + 0x2000 (address of the first page directory)
movl $0x00000183, %eax # eax = 0x183
movl $2048, %ecx # ecx = 2048 entries to fill
1: movl %eax, 0(%edi) # [edi] = eax
addl %edx, 4(%edi) # [edi+4] += edx
addl $0x00200000, %eax # eax += 0x200000 (next 2 MB page)
addl $8, %edi # edi += 8 (next entry)
decl %ecx # ecx--
jnz 1b
  • 0x183标记表示_PAGE_PRESENT+_PAGE_RW+_PAGE_PSE+_PAGE_GLOBAL
  • PSE表示2M或4M的页

startup_64

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
	.code64
.org 0x200
ENTRY(startup_64)

/* Clear the data segment registers */
xorl %eax, %eax
movl %eax, %ds
movl %eax, %es
movl %eax, %ss
movl %eax, %fs
movl %eax, %gs

/* Compute the aligned address to decompress at (same steps as in startup_32) */
#ifdef CONFIG_RELOCATABLE
leaq startup_32(%rip), %rbp # rbp = run-time address of startup_32
movl BP_kernel_alignment(%rsi), %eax # eax = boot_params.hdr.kernel_alignment
decl %eax # eax--
addq %rax, %rbp # rbp += rax
notq %rax # rax = ~rax
andq %rax, %rbp # rbp &= rax
cmpq $LOAD_PHYSICAL_ADDR, %rbp
jge 1f
#endif
movq $LOAD_PHYSICAL_ADDR, %rbp
1:

movl BP_init_size(%rsi), %ebx # ebx = boot_params.hdr.init_size
subl $_end, %ebx # ebx -= _end
addq %rbp, %rbx # rbx += rbp (physical address to relocate to)

/* Set up the stack pointer in the relocated image (flags are reset further below) */
leaq boot_stack_end(%rbx), %rsp

#ifdef CONFIG_X86_5LEVEL
/*
* Check whether 5-level paging is required
*/
pushq %rsi
call l5_paging_required
popq %rsi

/* If l5_paging_required() returned zero, we're done here. */
cmpq $0, %rax
je lvl5

……

lvl5:
#endif

/* Zero the flags register */
pushq $0
popfq

/*
* Copy the compressed kernel, together with the relocated code, to the
* target address
*/
pushq %rsi # save rsi: it holds the pointer to boot_params
leaq (_bss-8)(%rip), %rsi # source: current run-time address of _bss-8
leaq (_bss-8)(%rbx), %rdi # destination: _bss-8 in the relocated image
movq $_bss, %rcx # rcx = link-time value of _bss, used as the byte count
shrq $3, %rcx # rcx >>= 3 (count in 8-byte words)
std # copy from high addresses towards low addresses
rep movsq # copy from rsi to rdi
cld
popq %rsi

/*
* Jump to the relocated copy of the relocated label
*/
leaq relocated(%rbx), %rax
jmp *%rax

relocated

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
	.text
relocated:

/*
* Zero the bss (_bss up to _ebss), 8 bytes at a time
*/
xorl %eax, %eax # eax = 0
leaq _bss(%rip), %rdi # rdi = address of _bss
leaq _ebss(%rip), %rcx # rcx = address of _ebss
subq %rdi, %rcx # rcx -= rdi (size in bytes)
shrq $3, %rcx # rcx >>= 3 (count in 8-byte words)
rep stosq

/*
* Adjust the GOT: add the relocation base in rbx to every 8-byte entry
* between _got and _egot
*/
leaq _got(%rip), %rdx # rdx = address of _got
leaq _egot(%rip), %rcx # rcx = address of _egot
1:
cmpq %rcx, %rdx
jae 2f
addq %rbx, (%rdx)
addq $8, %rdx
jmp 1b
2:

/*
* Extract the kernel; extract_kernel receives its six arguments in
* rdi, rsi, rdx, ecx, r8 and r9
*/
pushq %rsi /* save the boot_params pointer */
movq %rsi, %rdi /* rdi = boot_params pointer */
leaq boot_heap(%rip), %rsi /* rsi = address of boot_heap */
leaq input_data(%rip), %rdx /* rdx = address of input_data */
movl $z_input_len, %ecx /* ecx = z_input_len */
movq %rbp, %r8 /* r8 = rbp (decompression address computed in startup_64) */
movq $z_output_len, %r9 /* r9 = z_output_len */
call extract_kernel /* rax = address of the extracted kernel */
popq %rsi

/*
* Jump to the extracted kernel
*/
jmp *%rax

startup_64(长模式)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
	.text
__HEAD
.code64
.globl startup_64
startup_64:
UNWIND_HINT_EMPTY

/* Set up a stack for verify_cpu */
leaq (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp

call verify_cpu

leaq _text(%rip), %rdi # rdi = run-time address of _text
pushq %rsi # save rsi (rsi = &boot_params)
call __startup_64 # call __startup_64
# arguments: rdi = &_text, rsi = &boot_params
popq %rsi # restore rsi

/* Form the CR3 value being sure to include the CR3 modifier */
addq $(early_top_pgt - __START_KERNEL_map), %rax
jmp 1f

……

1:

/* Enable PAE and PGE in CR4 */
movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
#ifdef CONFIG_X86_5LEVEL
orl $X86_CR4_LA57, %ecx
#endif
movq %rcx, %cr4

/* Add phys_base and load CR3 (rax was formed from early_top_pgt above) */
addq phys_base(%rip), %rax
movq %rax, %cr3

/* Make sure execution continues at virtual addresses */
movq $1f, %rax
jmp *%rax # jump to the next local label 1:, which lies beyond this excerpt
# NOTE(review): the article placed that label in __startup_secondary_64; upstream names the enclosing symbol secondary_startup_64 - confirm

开始使用虚拟地址了,结束*★,°*:.☆( ̄▽ ̄)/$:*.°★* 。

__startup_64

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
/* Definitions used by __startup_64 */
#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
/*
 * Evaluates to 0x1000000 in the configuration examined by the article.
 * The line-continuation backslash must be the last character on its line:
 * a trailing comment after it stops the splice and breaks the macro.
 */
#define __PHYSICAL_START ALIGN(CONFIG_PHYSICAL_START, \
CONFIG_PHYSICAL_ALIGN)
/*
 * Index into the top-level table: address bits 47..39, i.e. bits 17-25 when
 * counting from the most significant bit as the article does.
 */
#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
#define PGDIR_SHIFT 39
#define PTRS_PER_PGD 512

#define EARLY_DYNAMIC_PAGE_TABLES 64
extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
static unsigned int __initdata next_early_pgt;
pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);

//arch\x86\kernel\head64.c
/*
 * Fix up the statically built early page tables for the physical address the
 * kernel was actually loaded at, and add a temporary identity mapping of the
 * kernel image so that execution can continue until virtual addresses are in
 * use.
 *
 * physaddr: run-time physical address of _text (startup_64 passes it in %rdi).
 * bp:       boot_params pointer, handed on to sme_enable() and
 *           sme_encrypt_kernel().
 *
 * Returns sme_get_me_mask(), which the caller folds into the CR3 value.
 *
 * This code still runs from physical addresses, so every global it touches
 * must be reached through fixup_pointer().
 */
unsigned long __head __startup_64(unsigned long physaddr,
struct boot_params *bp)
{
unsigned long load_delta, *p;
unsigned long pgtable_flags;
pgdval_t *pgd;
p4dval_t *p4d;
pudval_t *pud;
pmdval_t *pmd, pmd_entry;
int i;
unsigned int *next_pgt_ptr;

/* Is the physical address too large? If so, hang. */
if (physaddr >> MAX_PHYSMEM_BITS)
for (;;);

/*
* Compute the delta between the address the kernel runs at and the
* address it was compiled to run at.
*
* _text - __START_KERNEL_map is the physical address _text was linked
* for (per the article, _text defaults to virtual 0xffffffff81000000 and
* __START_KERNEL_map is 0xffffffff80000000), so load_delta is how far
* the image was moved from that address.
*/
load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);

/* Is the delta not 2M aligned? If so, hang. */
if (load_delta & ~PMD_PAGE_MASK)
for (;;);

/* Turn on SME if it is supported */
sme_enable(bp);

/* Include the SME encryption mask in the fixup value */
load_delta += sme_get_me_mask();

/* Fix up the top-level table: add the delta to the entry used by the kernel mapping */

pgd = fixup_pointer(&early_top_pgt, physaddr);
pgd[pgd_index(__START_KERNEL_map)] += load_delta;

/* With 5-level paging there is one more level to fix up */
if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
p4d = fixup_pointer(&level4_kernel_pgt, physaddr);
p4d[511] += load_delta;
}

/* Fix up the upper directory: add the delta to the two entries in use */
pud = fixup_pointer(&level3_kernel_pgt, physaddr);
pud[510] += load_delta;
pud[511] += load_delta;

/* Fix up the middle directory of the fixmap: add the delta to the entry in use */
pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
pmd[506] += load_delta;

/*
* Identity-map the physical range the kernel currently occupies so that
* it stays reachable until the switch to virtual addresses (roughly
* 0x1000000-0x2800000 in the article's debug session).
*/

next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr);
pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);

pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();

if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
/*
* Allocate through next_pgt_ptr like the pud/pmd allocations above:
* next_early_pgt must not be accessed by its link-time address while
* running from physical addresses.
*/
p4d = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);

i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;

i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D;
p4d[i + 0] = (pgdval_t)pud + pgtable_flags;
p4d[i + 1] = (pgdval_t)pud + pgtable_flags;
} else {
/* pgd still points at the fixed-up early_top_pgt */
i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
}

i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD;
pud[i + 0] = (pudval_t)pmd + pgtable_flags;
pud[i + 1] = (pudval_t)pmd + pgtable_flags;

pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
pmd_entry += sme_get_me_mask();
pmd_entry += physaddr;

/* One large-page entry per PMD_SIZE chunk of the image (_text.._end) */
for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
int idx = i + (physaddr >> PMD_SHIFT) % PTRS_PER_PMD;
pmd[idx] = pmd_entry + i * PMD_SIZE; /* physical address plus flags */
}

/* Add the delta to every present entry of level2_kernel_pgt */
pmd = fixup_pointer(level2_kernel_pgt, physaddr);
for (i = 0; i < PTRS_PER_PMD; i++) {
if (pmd[i] & _PAGE_PRESENT)
pmd[i] += load_delta;
}

/*
* Fix up phys_base (without the SME mask that was folded into load_delta)
*/
p = fixup_pointer(&phys_base, physaddr);
*p += load_delta - sme_get_me_mask();

/* Encrypt the kernel and related (if SME is active) */
sme_encrypt_kernel(bp);

/*
* Return the SME encryption mask (if SME is active) to be used as a
* modifier for the initial pgdir entry programmed into CR3.
*/
return sme_get_me_mask();
}

64位页表

x86_64使用4级页表:

  • 全局页目录
  • 上层页目录
  • 中间页目录
  • 页表项

地址的第1-16位(高到低)用于区分用户态和内核态

  • 全0为用户空间
  • 全1为内核空间

分页的具体位数如下:

  • 1-16:不使用
  • 17-25:全局页目录(pgd)
  • 26-34:上层页目录(pud)
  • 35-43:中间页目录(pmd)
  • 44-52:页表项
  • 53-64:页内偏移

以下源码中内核映像的映射只用到三级页表(pgd→pud→pmd):pmd项带有_PAGE_PSE标志,直接映射2MB(0x200000)的大页

__startup_64中使用的页表相关定义如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#define _PAGE_TABLE_NOENC	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\
_PAGE_ACCESSED | _PAGE_DIRTY)
#define _KERNPG_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | \
_PAGE_ACCESSED | _PAGE_DIRTY)
#define PMDS(START, PERM, COUNT) \
i = 0 ; \
.rept (COUNT) ; \
.quad (START) + (i << PMD_SHIFT) + (PERM) ; \
i = i + 1 ; \
.endr
#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)

L3_START_KERNEL = pud_index(__START_KERNEL_map)

#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
#define PGDIR_SHIFT 39
#define PTRS_PER_PGD 512

/*
* 3rd level page
*/
#define PUD_SHIFT 30
#define PTRS_PER_PUD 512

/*
* PMD_SHIFT determines the size of the area a middle-level
* page table can map
*/
#define PMD_SHIFT 21
#define PTRS_PER_PMD 512
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
/*
* Each table level is indexed by 9 address bits, so every table has
* 1 << 9 = 512 entries.
*/

/*
* The first 511 entries are zero.
* Entry 511 holds the link-time physical address of level3_kernel_pgt plus
* flags.
* The kernel mapping starts at __START_KERNEL_map (0xffffffff80000000), whose
* top-level index bits are all ones, i.e. index 511.
*/
NEXT_PGD_PAGE(early_top_pgt)
.fill 511,8,0
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
.fill PTI_USER_PGD_FILL,8,0

NEXT_PAGE(early_dynamic_pgts)
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0

/*
* The first L3_START_KERNEL (pud_index(__START_KERNEL_map) = 510) entries
* are zero.
* Entry 510 (index bits 0b111111110, addresses from 0xffffffff80000000)
* points at level2_kernel_pgt.
* Entry 511 (index bits 0b111111111, addresses from 0xffffffffc0000000)
* points at level2_fixmap_pgt.
*/
NEXT_PAGE(level3_kernel_pgt)
.fill L3_START_KERNEL,8,0
.quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC

/*
* PMDS emits KERNEL_IMAGE_SIZE/PMD_SIZE entries starting at physical address
* 0 and advancing by 1 << PMD_SHIFT (2 MB) each: 0x0, 0x200000, 0x400000,
* 0x600000, ...
* With KERNEL_IMAGE_SIZE = 512 MB that is 256 entries, assuming PMD_SIZE is
* 1 << PMD_SHIFT (its definition is not shown here); they cover virtual
* 0xffffffff80000000-0xffffffff9fffffff.
* These are 2 MB large pages (_PAGE_PSE), so the low 21 address bits are the
* offset within the page.
*/
NEXT_PAGE(level2_kernel_pgt)
PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
KERNEL_IMAGE_SIZE/PMD_SIZE)

/*
* The first 506 entries are zero.
* Entry 506 (the 507th entry) points at level1_fixmap_pgt; the remaining 5
* entries are zero, reserved as described by the comment below.
*/
NEXT_PAGE(level2_fixmap_pgt)
.fill 506,8,0
.quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
/* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
.fill 5,8,0

/*
* All 512 entries start out as zero: nothing is mapped yet.
* This is a last-level table, so each entry would map one 4 KB page.
*/
NEXT_PAGE(level1_fixmap_pgt)
.fill 512,8,0

/*
* Physical base address, initially 0
*/
ENTRY(phys_base)
/* This must match the first entry in level2_kernel_pgt */
.quad 0x0000000000000000
EXPORT_SYMBOL(phys_base)

Kernel源码分析-系统启动(一)
http://akaieurus.github.io/2023/08/08/Kernel源码分析-系统启动(一)/
作者
Eurus
发布于
2023年8月8日
许可协议