记一次奇妙的内核冒险

一次历经多次反转的debug之旅,谨以此博客记录

起因

事情的起因是2045交的一个patch,他发现device结构体的driver_override字段由于缺乏锁保护可能会出现race导致的uaf

这个字段主要用于驱动和设备的匹配,驱动会实现一个match函数来处理这个匹配过程,其中会读取driver_override

1
2
3
4
5
6
7
8
9
static const struct pci_device_id *pci_match_device(struct pci_driver *drv,
struct pci_dev *dev)
{
/* When driver_override is set, only bind to the matching driver */
if (dev->driver_override && strcmp(dev->driver_override, drv->name))
return NULL;

/* ... */
}

match函数在driver_match_device中被调用

1
2
3
4
5
static inline int driver_match_device(const struct device_driver *drv,
struct device *dev)
{
return drv->bus->match ? drv->bus->match(dev, drv) : 1;
}

driver_override_store函数可以对driver_override字段进行更改,由于没有加锁所以两个进程读写race会导致UAF

1
2
3
4
5
6
7
8
9
10
11
12
13
static ssize_t driver_override_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
struct pci_dev *pdev = to_pci_dev(dev);
int ret;

ret = driver_set_override(dev, &pdev->driver_override, buf, count);
if (ret)
return ret;

return count;
}

调用driver_match_device的路径有三条:__device_attach_driver、__driver_attach、bind_store,其中__device_attach_driver是有锁的,其他两条是无锁的,2045的patch给这两条路径加了锁

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
diff --git a/drivers/base/base.h b/drivers/base/base.h
index 430cbefbc97f..677320881af1 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -182,9 +182,18 @@ void device_set_deferred_probe_reason(const struct device *dev, struct va_format
static inline int driver_match_device(const struct device_driver *drv,
struct device *dev)
{
+ device_lock_assert(dev);
+
return drv->bus->match ? drv->bus->match(dev, drv) : 1;
}

+static inline int driver_match_device_locked(const struct device_driver *drv,
+ struct device *dev)
+{
+ guard(device)(dev);
+ return driver_match_device(drv, dev);
+}
+
static inline void dev_sync_state(struct device *dev)
{
if (dev->bus->sync_state)
diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index 9eb7771706f0..331d750465e2 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -263,7 +263,7 @@ static ssize_t bind_store(struct device_driver *drv, const char *buf,
int err = -ENODEV;

dev = bus_find_device_by_name(bus, NULL, buf);
- if (dev && driver_match_device(drv, dev)) {
+ if (dev && driver_match_device_locked(drv, dev)) {
err = device_driver_attach(drv, dev);
if (!err) {
/* success */
diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 349f31bedfa1..98feb4c77160 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -1178,7 +1178,7 @@ static int __driver_attach(struct device *dev, void *data)
* is an error.
*/

- ret = driver_match_device(drv, dev);
+ ret = driver_match_device_locked(drv, dev);
if (ret == 0) {
/* no match */
return 0;

需要注意的是driver_override加的锁是整个device的锁mutex

Round 1: Juno

Mark发现Arm Juno开发板出现boot hang,二分定位到2045的patch,一些高通的平台也出现了类似的问题

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
I'm seeing boot hangs on Arm Juno in next/pending-fixes which bisect to
this commit. The boot grinds to a halt near the end of boot:

[ 2.570549] ledtrig-cpu: registered to indicate activity on CPUs
[ 2.618301] Serial: 8250/16550 driver, 4 ports, IRQ sharing enabled
[ 2.623547] msm_serial: driver initialized
[ 2.624058] SuperH (H)SCI(F) driver initialized
[ 2.624312] STM32 USART driver initialized

with no further output, full log:

https://lava.sirena.org.uk/scheduler/job/2387335#L862

We are also seeing similar looking boot hangs on some Qualcomm platforms
in Arm's test lab which aren't verified to be the same thing but are
hanging at a similar point in boot.

想de这个bug,显然我们不可能凭空变出一个开发板,所以尝试qemu模拟

Eurus vs Linux

现在插播一些基础知识

涉及硬件相关代码那就有一个问题,内核如何确定硬件拓扑?

  • 对于嵌入式系统SoC这种不支持热插拔的系统,由dtb静态提供
  • 对于支持热插拔的系统,比如PC,由UEFI/BIOS动态探测提供(ACPI)

本节内容基于SoC,因此只讨论dtb情形

dtb(设备树二进制文件)向内核描述了硬件拓扑,unflatten_device_tree(调用链如下)将dtb文件转化为树状数据结构,结点数据类型device_node,根结点全局变量of_root

1
2
3
start_kernel
-> setup_arch
-> unflatten_device_tree
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
/dts-v1/;

/ {
compatible = "nvidia,p3509-0000+p3668-0000\0nvidia,tegra194";
interrupt-parent = <0x01>;
#address-cells = <0x02>;
#size-cells = <0x02>;
model = "NVIDIA Jetson Xavier NX Developer Kit (SD-card)";

bus@0 {
compatible = "simple-bus";
#address-cells = <0x02>;
#size-cells = <0x02>;
ranges = <0x00 0x00 0x00 0x00 0x100 0x00>;

misc@100000 {
compatible = "nvidia,tegra194-misc";
reg = <0x00 0x100000 0x00 0xf000 0x00 0x10f000 0x00 0x1000>;
phandle = <0x04>;
};

cbb-noc@2300000 {
compatible = "nvidia,tegra194-cbb-noc";
reg = <0x00 0x2300000 0x00 0x1000>;
interrupts = <0x00 0xe6 0x04 0x00 0xe7 0x04>;
nvidia,axi2apb = <0x03>;
nvidia,apbmisc = <0x04>;
status = "okay";
};

pinmux@2430000 {
compatible = "nvidia,tegra194-pinmux";
reg = <0x00 0x2430000 0x00 0x17000>;
status = "okay";
phandle = <0x02>;

pinmux-pex-clkreq-c5-bi-dir {
phandle = <0x110>;

clkreq {
nvidia,pins = "pex_l5_clkreq_n_pgg0";
nvidia,schmitt = <0x00>;
nvidia,enable-input = <0x01>;
nvidia,io-hv = <0x01>;
nvidia,tristate = <0x00>;
nvidia,pull = <0x00>;
};
};

pinmux-pex-rst-c5-out {
phandle = <0x10f>;

pex_rst {
nvidia,pins = "pex_l5_rst_n_pgg1";
nvidia,schmitt = <0x00>;
nvidia,enable-input = <0x00>;
nvidia,io-hv = <0x01>;
nvidia,tristate = <0x00>;
nvidia,pull = <0x00>;
};
};
};
# ...

在下面的调用链中,of_platform_default_populate_init会根据device_node tree创建platform_device tree,并为每个platform_device填充resource(如IO内存范围、IRQ号等)。这一步不会探测设备是否真实存在,只是对设备树进行静态的解释

1
2
3
4
5
6
7
8
start_kernel
-> rest_init
-> kernel_init
-> kernel_init_freeable
-> do_basic_setup
-> do_initcalls
-> arch_initcall_sync of_platform_default_populate_init
-> device_initcall arm_smmu_driver_init
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
/sys/devices/platform # tree .
.
├── 9050000.smmuv3
│ ├── driver -> ../../../bus/platform/drivers/arm-smmu-v3
│ ├── driver_override
│ ├── iommu
│ │ └── smmu3.0x0000000009050000
│ │ ├── device -> ../../../9050000.smmuv3
│ │ ├── devices
│ │ │ ├── 0000:00:00.0 -> ../../../../4010000000.pcie/pci0000:00/0000:00:00.0
│ │ │ └── 0000:00:01.0 -> ../../../../4010000000.pcie/pci0000:00/0000:00:01.0
│ │ ├── power
│ │ │ ├── autosuspend_delay_ms
│ │ │ ├── control
│ │ │ ├── runtime_active_time
│ │ │ ├── runtime_status
│ │ │ └── runtime_suspended_time
│ │ ├── subsystem -> ../../../../../class/iommu
│ │ └── uevent
│ ├── modalias
│ ├── of_node -> ../../../firmware/devicetree/base/smmuv3@9050000
│ ├── power
│ │ ├── autosuspend_delay_ms
│ │ ├── control
│ │ ├── runtime_active_time
│ │ ├── runtime_status
│ │ └── runtime_suspended_time
│ ├── subsystem -> ../../../bus/platform
│ └── uevent
├── a000000.virtio_mmio
│ ├── driver_override
│ ├── modalias
│ ├── of_node -> ../../../firmware/devicetree/base/virtio_mmio@a000000
│ ├── power
│ │ ├── autosuspend_delay_ms
│ │ ├── control
│ │ ├── runtime_active_time
│ │ ├── runtime_status
│ │ └── runtime_suspended_time
│ ├── subsystem -> ../../../bus/platform
│ ├── uevent
│ └── waiting_for_supplier
# ...

device_initcall阶段的各种driver_init(本节特指arm_smmu_driver_init)会通过下面的调用栈用driver去匹配platform_device,找到匹配的platform_device后就会调用probe函数探测设备,probe中可能会映射寄存器,然后从中读取一些值

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
static const struct of_device_id arm_smmu_of_match[] = {
{ .compatible = "arm,smmu-v1", .data = &smmu_generic_v1 },
{ .compatible = "arm,smmu-v2", .data = &smmu_generic_v2 },
{ .compatible = "arm,mmu-400", .data = &smmu_generic_v1 },
{ .compatible = "arm,mmu-401", .data = &arm_mmu401 },
{ .compatible = "arm,mmu-500", .data = &arm_mmu500 },
{ .compatible = "cavium,smmu-v2", .data = &cavium_smmuv2 },
{ .compatible = "nvidia,smmu-500", .data = &arm_mmu500 },
{ .compatible = "qcom,smmu-v2", .data = &qcom_smmuv2 },
{ },
};
MODULE_DEVICE_TABLE(of, arm_smmu_of_match);

static struct platform_driver arm_smmu_driver = {
.driver = {
.name = "arm-smmu",
.of_match_table = arm_smmu_of_match,
.pm = &arm_smmu_pm_ops,
.suppress_bind_attrs = true,
},
.probe = arm_smmu_device_probe,
.remove = arm_smmu_device_remove,
.shutdown = arm_smmu_device_shutdown,
};
1
2
3
4
5
6
7
8
9
10
11
12
arm_smmu_driver_init
-> __platform_driver_register
-> driver_register
-> bus_add_driver
-> driver_attach
-> bus_for_each_dev
-> __driver_attach
-> driver_probe_device
-> really_probe
-> call_driver_probe
-> platform_probe
-> arm_smmu_device_probe

注:核心是设备→驱动→设备探测,但本人只看了几个driver,不保证所有driver都遵循这个流程

Eurus vs Qemu

qemu提供两个参数与上述过程相关:

  • -M:指定机器类型,决定qemu模拟什么样的硬件
  • -dtb:指定dtb文件,传递给内核,决定内核视角有什么样的硬件

想要调试内核特定驱动至少在内核视角是要有这么一个硬件的,所以选择直接qemu指定dtb传递给内核。模拟Juno直接指定-dtb linux/arch/arm64/boot/dts/arm/juno.dtb

但这样最大的问题是实际上某些硬件是不存在的,当内核访问这些硬件时会panic,这是强行模拟主要需要解决的问题

上面扯了这么多其实都是事后补充的知识,Eurus在当前时间线选择了一种最简单粗暴的方法:大胆开试,出问题就拷打gpt

  1. 先写了一个朴素的qemu启动脚本

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    #!/bin/bash
    qemu-system-aarch64 \
    -M virt \
    -cpu cortex-a57 \
    -smp 4 \
    -m 2G \
    -kernel ./Image \
    -dtb ./juno.dtb \
    -drive file=./trixie.img,format=raw \
    -nographic \
    -s \
    -no-reboot \
    -append "loglevel=8 root=/dev/sda panic=1"

    没有输出,gdb进去好像已经进内核了

    -append参数指定earlycon和console,earlycon需要指定地址,推测是因为virt默认的serial地址和juno的serial地址不一样,但如果是这样的话理论上直接改dtb的地址就行,实际上行不通,这个过程还有很多不清楚的地方,以后再来探索吧

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    #!/bin/bash
    qemu-system-aarch64 \
    -M virt \
    -cpu cortex-a57 \
    -smp 4 \
    -m 2G \
    -kernel ./Image \
    -dtb ./juno.dtb \
    -drive file=./trixie.img,format=raw \
    -nographic \
    -s \
    -no-reboot \
    -append "earlycon=pl011,0x9000000 console=ttyAMA0,115200n8 \
    loglevel=8 root=/dev/sda panic=1"

    注:ttyAMA0和pl011语法等价

  2. 然后panic了

    1
    2
    3
    4
    5
    6
    7
    8
    9
    [    0.000000] Call trace:
    [ 0.000000] __arm_smccc_smc+0x4/0x30 (P)
    [ 0.000000] psci_probe+0x30/0x33c
    [ 0.000000] psci_0_2_init+0x18/0x24
    [ 0.000000] psci_dt_init+0x58/0xa4
    [ 0.000000] setup_arch+0x568/0x5ec
    [ 0.000000] start_kernel+0x6c/0x83c
    [ 0.000000] __primary_switched+0x88/0x90
    [ 0.000000] Code: d53cd045 d53cd042 d53cd043 d503245f (d4000003)

    本机的qemu版本比较低(6.2.0),对PSCI的支持不好,高版本会好一些,所以编译了一个10.2.0的,也方便之后魔改qemu

  3. 然后hang在这了

    1
    2
    3
    4
    5
    6
    7
    8
    [    0.009681] pid_max: default: 32768 minimum: 301
    [ 0.014076] Mount-cache hash table entries: 4096 (order: 3, 32768 bytes, linear)
    [ 0.014359] Mountpoint-cache hash table entries: 4096 (order: 3, 32768 bytes, linear)
    [ 0.053853] rcu: Hierarchical SRCU implementation.
    [ 0.054044] rcu: Max phase no-delay instances is 1000.
    [ 0.055157] Timer migration: 1 hierarchy levels; 8 children per group; 1 crossnode level
    [ 0.060032] EFI services will not be available.
    [ 0.061388] smp: Bringing up secondary CPUs ...

    看起来是smp的问题,直接-append加一个nosmp把smp ban了(

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    #!/bin/bash
    ./qemu-system-aarch64 \
    -M virt \
    -cpu cortex-a57 \
    -m 2G \
    -kernel ./Image \
    -dtb ./juno.dtb \
    -drive file=./trixie.img,format=raw \
    -nographic \
    -s \
    -no-reboot \
    -append "earlycon=pl011,0x9000000 console=ttyAMA0,115200n8 \
    loglevel=8 root=/dev/sda panic=1 nosmp"
  4. 从log可以看出hang出现在STM32 USART driver initialized之后,那问题可能出在smmu上,但这里smmu probe失败了

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    [    0.490446] SuperH (H)SCI(F) driver initialized
    [ 0.490748] STM32 USART driver initialized
    [ 0.494168] arm-smmu 7fb00000.iommu: error -EBUSY: can't request region for resource [mem 0x7fb00000-0x7fb0ffff]
    [ 0.494432] arm-smmu 7fb00000.iommu: probe with driver arm-smmu failed with error -16
    [ 0.494735] arm-smmu 7fb10000.iommu: error -EBUSY: can't request region for resource [mem 0x7fb10000-0x7fb1ffff]
    [ 0.494982] arm-smmu 7fb10000.iommu: probe with driver arm-smmu failed with error -16
    [ 0.495241] arm-smmu 7fb20000.iommu: error -EBUSY: can't request region for resource [mem 0x7fb20000-0x7fb2ffff]
    [ 0.495484] arm-smmu 7fb20000.iommu: probe with driver arm-smmu failed with error -16
    [ 0.495737] arm-smmu 7fb30000.iommu: error -EBUSY: can't request region for resource [mem 0x7fb30000-0x7fb3ffff]
    [ 0.495957] arm-smmu 7fb30000.iommu: probe with driver arm-smmu failed with error -16

    dtb指定的smmu地址和qemu实际模拟的地址不一样,用dtc命令把dtb转化为dts,把smmu地址改成qemu模拟smmu的地址就行,juno有多个smmu,改一个就行

    1
    2
    3
    4
    5
    6
    7
    8
    9
    iommu@9050000 {
    compatible = "arm,mmu-401\0arm,smmu-v1";
    reg = <0x00 0x9050000 0x00 0x10000>;
    interrupts = <0x00 0x5f 0x04 0x00 0x5f 0x04>;
    #iommu-cells = <0x01>;
    #global-interrupts = <0x01>;
    dma-coherent;
    phandle = <0x3d>;
    };

    注:reg的地址是mmio的地址,这个小知识在后面会用到

然后就…结束了,最终确定hang发生在smmu的probe里

1
2
3
4
5
6
7
8
9
10
11
12
13
[    0.426055] leds-syscon 1c010008.0.led: registered LED (null)
[ 0.427326] leds-syscon 1c010008.1.led: registered LED (null)
[ 0.428027] leds-syscon 1c010008.2.led: registered LED (null)
[ 0.428615] leds-syscon 1c010008.3.led: registered LED (null)
[ 0.429211] leds-syscon 1c010008.4.led: registered LED (null)
[ 0.429816] leds-syscon 1c010008.5.led: registered LED (null)
[ 0.430246] leds-syscon 1c010008.6.led: registered LED (null)
[ 0.430606] leds-syscon 1c010008.7.led: registered LED (null)
[ 0.432012] ledtrig-cpu: registered to indicate activity on CPUs
[ 0.477846] Serial: 8250/16550 driver, 4 ports, IRQ sharing enabled
[ 0.486792] msm_serial: driver initialized
[ 0.487597] SuperH (H)SCI(F) driver initialized
[ 0.487913] STM32 USART driver initialized

probe嵌套导致的deadlock

backtrace可以看出死锁了

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#0  0xffff800080114ae0 in mutex_spin_on_owner (lock=0xffff0000036bfc90, owner=0xffff000003510000, ww_ctx=0x0, waiter=0x0) at kernel/locking/mutex.c:377
#1 0xffff80008118cecc in mutex_optimistic_spin (waiter=<optimized out>, ww_ctx=<optimized out>, lock=<optimized out>) at kernel/locking/mutex.c:480
#2 __mutex_lock_common (use_ww_ctx=<optimized out>, ww_ctx=<optimized out>, ip=<optimized out>, nest_lock=<optimized out>, subclass=<optimized out>, state=<optimized out>, lock=<optimized out>) at kernel/locking/mutex.c:618
#3 __mutex_lock (lock=0xffff0000036bfc90, state=0x2, ip=<optimized out>, nest_lock=<optimized out>, subclass=<optimized out>) at kernel/locking/mutex.c:776
#4 0xffff80008118d1dc in __mutex_lock_slowpath (lock=0xffff0000036bfc90) at kernel/locking/mutex.c:1065
#5 0xffff80008118d230 in mutex_lock (lock=0xffff0000036bfc90) at kernel/locking/mutex.c:290
#6 0xffff8000809cdd1c in device_lock (dev=<optimized out>) at ./include/linux/device.h:895
#7 class_device_constructor (_T=<optimized out>) at ./include/linux/device.h:913
#8 driver_match_device_locked (dev=<optimized out>, drv=<optimized out>) at drivers/base/base.h:193
#9 __driver_attach (dev=0xffff0000036bfc10, data=0xffff800082e64440 <qcom_smmu_tbu_driver+40>) at drivers/base/dd.c:1183
#10 0xffff8000809cb17c in bus_for_each_dev (bus=0xffff0000036bfc90, start=0x0, data=0xffff800082e64440 <qcom_smmu_tbu_driver+40>, fn=0xffff8000809cdcec <__driver_attach>) at drivers/base/bus.c:383
#11 0xffff8000809cd03c in driver_attach (drv=0x0) at drivers/base/dd.c:1245
#12 0xffff8000809cc748 in bus_add_driver (drv=0xffff800082e64440 <qcom_smmu_tbu_driver+40>) at drivers/base/bus.c:715
#13 0xffff8000809ced28 in driver_register (drv=0xffff800082e64440 <qcom_smmu_tbu_driver+40>) at drivers/base/driver.c:249
#14 0xffff8000809d0254 in __platform_driver_register (drv=0x0, owner=0xffff000003510000) at drivers/base/platform.c:908
#15 0xffff8000809a6208 in qcom_smmu_impl_init (smmu=0xffff0000037c0080) at drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c:780
#16 0xffff8000809a48a0 in arm_smmu_impl_init (smmu=0xffff0000037c0080) at drivers/iommu/arm/arm-smmu/arm-smmu-impl.c:224
#17 0xffff8000809a2ae0 in arm_smmu_device_probe (pdev=0xffff0000036bfc00) at drivers/iommu/arm/arm-smmu/arm-smmu.c:2155
#18 0xffff8000809d060c in platform_probe (_dev=0xffff0000036bfc10) at drivers/base/platform.c:1446
#19 0xffff8000809cd6a4 in call_driver_probe (drv=<optimized out>, dev=<optimized out>) at drivers/base/dd.c:583
#20 really_probe (dev=0xffff0000036bfc10, drv=0xffff800082e641c0 <arm_smmu_driver+40>) at drivers/base/dd.c:661
#21 0xffff8000809cd8f8 in __driver_probe_device (drv=0xffff800082e641c0 <arm_smmu_driver+40>, dev=0xffff0000036bfc10) at drivers/base/dd.c:803
#22 0xffff8000809cdb34 in driver_probe_device (drv=0xffff0000036bfc90, dev=0xffff0000036bfc10) at drivers/base/dd.c:833
#23 0xffff8000809cddb8 in __driver_attach (data=<optimized out>, dev=<optimized out>) at drivers/base/dd.c:1227
#24 __driver_attach (dev=0xffff0000036bfc10, data=0xffff800082e641c0 <arm_smmu_driver+40>) at drivers/base/dd.c:1167
#25 0xffff8000809cb17c in bus_for_each_dev (bus=0xffff0000036bfc90, start=0x0, data=0xffff800082e641c0 <arm_smmu_driver+40>, fn=0xffff8000809cdcec <__driver_attach>) at drivers/base/bus.c:383
#26 0xffff8000809cd03c in driver_attach (drv=0x0) at drivers/base/dd.c:1245
#27 0xffff8000809cc748 in bus_add_driver (drv=0xffff800082e641c0 <arm_smmu_driver+40>) at drivers/base/bus.c:715
#28 0xffff8000809ced28 in driver_register (drv=0xffff800082e641c0 <arm_smmu_driver+40>) at drivers/base/driver.c:249
#29 0xffff8000809d0254 in __platform_driver_register (drv=0x0, owner=0xffff000003510000) at drivers/base/platform.c:908
#30 0xffff800081f3d12c in arm_smmu_driver_init () at drivers/iommu/arm/arm-smmu/arm-smmu.c:2368
#31 0xffff800080015218 in do_one_initcall (fn=0xffff800081f3d10c <arm_smmu_driver_init>) at init/main.c:1378
#32 0xffff800081ed13e4 in do_initcall_level (command_line=<optimized out>, level=<optimized out>) at init/main.c:1440
#33 do_initcalls () at init/main.c:1456
#34 do_basic_setup () at init/main.c:1475
#35 kernel_init_freeable () at init/main.c:1688
#36 0xffff800081187b50 in kernel_init (unused=0xffff0000036bfc90) at init/main.c:1578
#37 0xffff800080015f58 in ret_from_fork () at arch/arm64/kernel/entry.S:860

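arm-smmu驱动在probe的时候(此时外层__driver_attach已经持有该设备的device锁)又注册了另一个驱动,内层platform_driver_register的driver_attach遍历总线时再次遍历到了正在probe的同一个设备(backtrace中两层__driver_attach的dev都是0xffff0000036bfc10),driver_match_device_locked试图再次获取同一把device锁,于是同一个线程对同一把锁重复加锁,死锁了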

最后判定2045的patch没有问题,这种在probe里注册驱动的行为不合理,新开了一个patch修复,成功混了一个Tested-by和Reviewed-by😎

这样邮件里Juno和高通的问题就解决了

这个问题会影响所有开启了ARM_SMMU_QCOM和ARCH_QCOM的设备,因为arm_smmu_impl_init只有在开启ARM_SMMU_QCOM时才会进qcom_smmu_impl_init,而ARM_SMMU_QCOM depend on ARCH_QCOM

另外ARM_SMMU_QCOM是默认开启的

1
2
3
4
5
6
7
8
9
10
11
12
struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu)
{
/* ... */
if (IS_ENABLED(CONFIG_ARM_SMMU_QCOM))
smmu = qcom_smmu_impl_init(smmu);

if (of_device_is_compatible(np, "marvell,ap806-smmu-500"))
smmu->impl = &mrvl_mmu500_impl;

return smmu;
}

Round 2: Tegra

这样就结束了吗?并没有

Jon发现类似的问题同样出现在tegra194上,并且和Juno不是一个问题

1
2
3
4
5
6
7
8
9
10
11
12
I am seeing a similar issue on one of our Tegra boards and bisect also 
points to this commit.

It is odd because it only appears to impact the Tegra194 Jetson Xavier
NX board (tegra194-p3509-0000+p3668-0000.dts).

It appears to boot enough so the test can SSH into the device, but the
kernel log does not show the us getting to the console prompt. It also
appears that a lot of drivers are not bound as expected. I would need to
check if those are all modules or not.

Jon

顺带一提我的记忆在这里出现了一点偏差,我把高通的log记成了tegra的,但其实Jon并没有贴log,所以我对着虚空log折腾了好几天:(

一段原地踏步

模拟tegra比Juno问题要多得多,首先tegra的硬件要比Juno多,其次tegra的大部分硬件qemu都不支持

首先遇到的就是gic的问题,这个倒好解决,qemu是支持gic的,设置一下gic-version再改一下dts里gic的地址到qemu默认地址就行

1
2
3
4
5
6
7
8
9
10
[    0.000000] Call trace:
[ 0.000000] gic_init_bases+0x2c/0x1cc (P)
[ 0.000000] __gic_init_bases+0xb8/0x148
[ 0.000000] gic_of_init+0x88/0x41c
[ 0.000000] of_irq_init+0x318/0x420
[ 0.000000] irqchip_init+0x18/0x40
[ 0.000000] init_IRQ+0xa0/0xb8
[ 0.000000] start_kernel+0x48c/0x83c
[ 0.000000] __primary_switched+0x88/0x90
[ 0.000000] Code: 35000a41 a9410262 a9000262 91001042 (b9400042)
1
2
3
4
5
6
7
8
9
10
11
12
13
#!/bin/bash
./qemu-system-aarch64 \
-M virt,gic-version=2 \
-cpu cortex-a57 \
-m 2G \
-kernel ./Image \
-drive file=./trixie.img,format=raw \
-dtb ./tegra194-p3509-0000+p3668-000.dtb \
-nographic \
-s \
-no-reboot \
-append "nokaslr console=pl011 earlycon=pl011,0x9000000 \
loglevel=8 panic=1 root=/dev/sda nosmp"

之后进度就在这里停滞了…

碰到的下一个panic是fuse

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
[    0.055755] Tegra APB MISC not yet available
[ 0.056269] WARNING: drivers/soc/tegra/fuse/tegra-apbmisc.c:37 at tegra_read_chipid+0x3c/0x54, CPU#0: swapper/0/1
[ 0.056654] Modules linked in:
[ 0.057071] CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.19.0-rc6-00325-g2a8ff25d2be0 #8 PREEMPT
[ 0.057522] Hardware name: NVIDIA Jetson Xavier NX Developer Kit (SD-card) (DT)
[ 0.057932] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[ 0.058187] pc : tegra_read_chipid+0x3c/0x54
[ 0.058355] lr : tegra_read_chipid+0x3c/0x54
[ 0.058522] sp : ffff80008305bc90
[ 0.058640] x29: ffff80008305bc90 x28: 0000000000000000 x27: 0000000000000000
[ 0.058940] x26: 0000000000000000 x25: ffff800082f57000 x24: 0000000000000000
[ 0.059200] x23: 0000000000000000 x22: 0000000000000000 x21: ffff000003560000
[ 0.059463] x20: ffff800082fbd000 x19: ffff800082fbd190 x18: 00000000ffffffff
[ 0.059724] x17: 00000000f78a42b2 x16: ffff800082e60930 x15: ffff80010305b8f7
[ 0.059965] x14: 0000000000000000 x13: ffff800082aa6b58 x12: 00000000000000ea
[ 0.060214] x11: 000000000000004e x10: ffff800082afeb58 x9 : ffff800082aa6b58
[ 0.060493] x8 : 00000000ffffefff x7 : ffff800082afeb58 x6 : 0000000000000000
[ 0.060757] x5 : 000000000000bff4 x4 : 0000000000000000 x3 : 0000000000000000
[ 0.061009] x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff000003560000
[ 0.061356] Call trace:
[ 0.061573] tegra_read_chipid+0x3c/0x54 (P)
[ 0.061848] tegra_init_revision+0x14/0xcc
[ 0.062014] tegra30_fuse_init+0x40/0x130
[ 0.062160] tegra_init_fuse+0x13c/0x21c
[ 0.062301] do_one_initcall+0x6c/0x1b0
[ 0.062441] kernel_init_freeable+0x108/0x2e0
[ 0.062606] kernel_init+0x20/0x1d8
[ 0.062733] ret_from_fork+0x10/0x20
[ 0.062996] ---[ end trace 0000000000000000 ]---

这个就完全没办法用qemu支持的硬件平替了

由于此时我还以为高通的log是tegra的log,我就想既然卡在了STM32 USART driver initialized之后,那么在这条输出出现之前造成panic的硬件可以直接从dtb上删掉,但这样不能解决根本问题,STM32 USART driver initialized之后导致panic的硬件还是无法模拟

然后就是一段原地踏步…

最开始在github上找到一个模拟tegra的qemu,但这玩意跑不起来,而且是在进内核之前就寄了的跑不起来

后来想直接改qemu的virt,但折腾了半天没折腾出来

然后又反转了

Jon贴了一个log,QSPI设备在probe中出现了空指针解引用,2045分析出由于tegra_qspi_probe在持有锁时panic,mutex可能仍然被标记为持有/孤立状态,从而阻塞同一总线上后续的驱动程序绑定,相当于他的patch没有导致问题只是放大了另一个bug导致的崩溃的影响

The End

Eurus vs Qemu

结束了吗?并不,我还没玩够:)

我在某天坐车的时候突然想起来其实不用改virt这么大动干戈,可以只写一个新的qemu设备,然后-device挂上就行了,qemu PCI设备的编写方法之前在搞qemu逃逸的时候也见过。由于我们只需要骗过probe,所以设备不需要实现复杂的功能

比如要模拟一个misc设备,只需要写这么一个device,然后在meson.build文件加一行system_ss.add(files('tegra194-misc.c'))就行

probe的大部分panic都是由于访问设备的mmio内存(reg字段的地址)但该地址不存在导致的,所以qemu device只需要实现地址映射就可以

1
2
3
4
5
misc@100000 {
compatible = "nvidia,tegra194-misc";
reg = <0x00 0x100000 0x00 0xf000 0x00 0x10f000 0x00 0x1000>;
phandle = <0x04>;
};
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#include "qemu/osdep.h"
#include "qemu/log.h"
#include "qemu/units.h"
#include "hw/pci/pci.h"
#include "hw/pci/msi.h"
#include "qemu/timer.h"
#include "qom/object.h"
#include "qemu/main-loop.h" /* iothread mutex */
#include "qemu/module.h"
#include "qapi/visitor.h"

#define TYPE_PCI_MISC_DEVICE "tegra194-misc"
typedef struct MISCState MISCState;
DECLARE_INSTANCE_CHECKER(MISCState, MISC,
TYPE_PCI_MISC_DEVICE)

struct MISCState {
PCIDevice pdev;
MemoryRegion mmio;
};

/*
 * MMIO read handler. No register state is modelled: every offset and
 * access size yields the same placeholder value.
 */
static uint64_t misc_mmio_read(void *opaque, hwaddr addr, unsigned size)
{
    const uint64_t placeholder = 0xcafe;

    return placeholder;
}

/* MMIO write handler: writes are accepted and silently discarded. */
static void misc_mmio_write(void *opaque, hwaddr addr, uint64_t val,
                            unsigned size)
{
    /* Intentionally empty: the device keeps no state. */
    return;
}

/*
 * Access callbacks for the dummy register window. Both the accesses
 * accepted from the guest (.valid) and the sizes the handlers implement
 * (.impl) are limited to 4..8 bytes.
 */
static const MemoryRegionOps misc_mmio_ops = {
    .endianness = DEVICE_NATIVE_ENDIAN,
    .read = misc_mmio_read,
    .write = misc_mmio_write,
    .impl = { .min_access_size = 4, .max_access_size = 8 },
    .valid = { .min_access_size = 4, .max_access_size = 8 },
};

/*
 * Realize callback: create the dummy MMIO window and map it at the fixed
 * guest-physical address used by the device tree node
 *
 *     misc@100000 { reg = <0x00 0x100000 0x00 0xf000
 *                          0x00 0x10f000 0x00 0x1000>; };
 *
 * The two reg ranges are contiguous, so one 0x10000-byte region starting
 * at 0x100000 covers both.
 *
 * The region is deliberately NOT registered as a PCI BAR as well. A
 * MemoryRegion can only have a single container: if the same region were
 * both mapped into system memory here and exposed as BAR 0, QEMU would
 * hit assert(!subregion->container) as soon as a guest programmed and
 * enabled that BAR. The PCI personality is only a vehicle for "-device";
 * the guest reaches the registers through the fixed mapping below.
 *
 * @pdev: the PCI device being realized
 * @errp: error out-parameter (never set; this function cannot fail)
 */
static void pci_misc_realize(PCIDevice *pdev, Error **errp)
{
    MISCState *misc = MISC(pdev);

    memory_region_init_io(&misc->mmio, OBJECT(misc), &misc_mmio_ops, misc,
                          "misc-mmio", 0x10000);
    memory_region_add_subregion(get_system_memory(), 0x100000, &misc->mmio);
}

/*
 * Exit callback: undo the fixed system-memory mapping set up by
 * pci_misc_realize(), so that removing the device does not leave its
 * register window mapped in the guest address space.
 *
 * @pdev: the PCI device being torn down
 */
static void pci_misc_uninit(PCIDevice *pdev)
{
    MISCState *misc = MISC(pdev);

    memory_region_del_subregion(get_system_memory(), &misc->mmio);
}

/* QOM instance_init hook; the device has nothing to set up here. */
static void misc_instance_init(Object *obj)
{
    /* Intentionally empty. */
    return;
}

/*
 * QOM class_init hook: fill in the PCI identity and lifecycle callbacks
 * of the dummy device and file it under the "misc" device category.
 */
static void misc_class_init(ObjectClass *class, const void *data)
{
    PCIDeviceClass *pci_class = PCI_DEVICE_CLASS(class);
    DeviceClass *dev_class = DEVICE_CLASS(class);

    /* Lifecycle hooks. */
    pci_class->realize = pci_misc_realize;
    pci_class->exit = pci_misc_uninit;

    /* PCI identity. */
    pci_class->vendor_id = PCI_VENDOR_ID_QEMU;
    pci_class->device_id = 0xdead;
    pci_class->revision = 0x10;
    pci_class->class_id = PCI_CLASS_OTHERS;

    set_bit(DEVICE_CATEGORY_MISC, dev_class->categories);
}

/* QOM type table: a single conventional-PCI device type. */
static const TypeInfo misc_types[] = {
    {
        .name          = TYPE_PCI_MISC_DEVICE,
        .parent        = TYPE_PCI_DEVICE,
        .instance_size = sizeof(MISCState),
        .instance_init = misc_instance_init,
        .class_init    = misc_class_init,
        .interfaces    = (const InterfaceInfo[]) {
            { INTERFACE_CONVENTIONAL_PCI_DEVICE },
            { },
        },
    },
};

/* Registers every entry of misc_types with QOM. */
DEFINE_TYPES(misc_types)

然后就是进行一个重复工作一步步把导致panic的设备地址都映射了

在这个过程中Eurus又干了很多很唐的事情,比如映射了misc但还是在tegra_init_fuse里报错,捣鼓捣鼓从ioremap_prot地址发现misc的映射没问题,tegra_init_fuse还访问了fuse设备:)。以及解决了cbb的panic但后来cbb又panic了,搞了半天发现cbb的设备不止一个,等等…

最后整好几个设备,改了几次dtb,整出了这么个挂了n个device的启动脚本(其实也可以一个解决,但这样比较符合语义)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
#!/bin/bash
./qemu-system-aarch64 \
-M virt,gic-version=2 \
-cpu cortex-a57 \
-m 2G \
-kernel ./Image \
-drive file=./trixie.img,format=raw \
-dtb ./tegra194-p3509-0000+p3668-0000-fix3.dtb \
-nographic \
-device tegra194-misc \
-device tegra194-fuse \
-device tegra194-pmc \
-device tegra194-mc \
-device tegra194-hsp \
-device tegra194-cbb \
-device tegra194-gpio \
-device tegra194-timer \
-device tegra194-serial \
-s \
-no-reboot \
-append "nokaslr console=pl011 earlycon=pl011,0x9000000 \
loglevel=8 panic=1 root=/dev/sda"

然后就可以顺利启动到执行init,到这里我才发现Jon只贴了一个log

可能是由于强行模拟,我无法成功挂载文件系统,但Jon的log中空指针解引用触发在文件系统挂载之后,最终还是没办法完美复现

总结

我这种简单粗暴的模拟方式还是有很多的问题,比如虽然使用的是同样的dtb,但我模拟出来的启动log和Jon的log区别还是很大,可能是模拟的粒度不够细

以及还有console和文件系统的问题没有解决,以后再来探索吧

也是没想到2045一个patch挖出这么多问题,看看这天梯一样的邮件doge


记一次奇妙的内核冒险
http://akaieurus.github.io/2026/01/25/2045-deadlock-debug/
作者
Eurus
发布于
2026年1月25日
许可协议