nft源码分析

非常好nft,L3HCTF见到了,学之👈

Netfilter

Netfilter 通过在网络协议栈的不同阶段注册钩子函数来实现对数据包的处理与过滤,本文分析的过滤主要位于 IP 层

  • Netfilter提供了5个HOOK点(包处理的不同阶段),区分的重要节点是 路由判决(是发往本地的包还是中转的包)

    • PRE_ROUTING:刚收到包,路由判断前
    • LOCAL_IN:路由判决结束,是发往本地的包
    • FORWARD:路由判决结束,是中转的包
    • LOCAL_OUT:路由判决前,本地要发出的包
    • POST_ROUTING:路由判决后,发出的包
  • 实际上不同的协议hook略有区别

    源码中有相关的定义

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    /*
     * Hook points used by the inet (IPv4/IPv6) families. The five named
     * stages take the implicit values 0..4, so NF_INET_NUMHOOKS is 5 and
     * NF_INET_INGRESS is explicitly given that same value.
     */
    enum nf_inet_hooks {
    NF_INET_PRE_ROUTING,
    NF_INET_LOCAL_IN,
    NF_INET_FORWARD,
    NF_INET_LOCAL_OUT,
    NF_INET_POST_ROUTING,
    NF_INET_NUMHOOKS,
    NF_INET_INGRESS = NF_INET_NUMHOOKS,
    };

    /* ARP Hooks */
    #define NF_ARP_IN 0
    #define NF_ARP_OUT 1
    #define NF_ARP_FORWARD 2

hook挂载:NF_INET_XXX

先康康在哪些函数里调用了hook函数

附一张IPv4的函数调用关系图

  • NF_INET_PRE_ROUTING

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    /*
    * IP receive entry point
    *
    * Resolves the net namespace from the receiving device, runs
    * ip_rcv_core() on the skb and then traverses the IPv4
    * NF_INET_PRE_ROUTING hook with ip_rcv_finish as the continuation.
    * Returns NET_RX_DROP when ip_rcv_core() yields NULL.
    */
    int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
    struct net_device *orig_dev)
    {
    /* Validate the IP datagram and set up required fields (author's note on ip_rcv_core below) */
    struct net *net = dev_net(dev);

    skb = ip_rcv_core(skb, net);
    if (skb == NULL)
    return NET_RX_DROP;

    /* Pass through the NF_INET_PRE_ROUTING hook point */
    return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
    net, NULL, skb, dev, NULL,
    ip_rcv_finish);
    }
  • NF_INET_LOCAL_IN

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    /*
     * Deliver a packet to the local host: fragments are handed to
     * ip_defrag() first, then the skb traverses the IPv4 NF_INET_LOCAL_IN
     * hook with ip_local_deliver_finish as the continuation.
     */
    int ip_local_deliver(struct sk_buff *skb)
    {
    /*
    * Fragment reassembly
    */
    struct net *net = dev_net(skb->dev);

    if (ip_is_fragment(ip_hdr(skb))) {
    /* A non-zero result from ip_defrag() ends processing here with 0 */
    if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER))
    return 0;
    }

    /* Pass through the NF_INET_LOCAL_IN hook point */
    return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
    net, NULL, skb, skb->dev, NULL,
    ip_local_deliver_finish);
    }
  • NF_INET_FORWARD

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    /*
     * Abridged excerpt of ip_forward(): the declarations of `net` and `rt`
     * and most of the body are elided; only the hook traversal is kept.
     */
    int ip_forward(struct sk_buff *skb)
    {

    /*
    * Sanity checks (elided)
    */


    /*
    * Decrement the TTL and decide whether an ICMP message is needed (elided)
    */

    /* Pass through the NF_INET_FORWARD hook point; rt->dst.dev is the output device */
    return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
    net, NULL, skb, skb->dev, rt->dst.dev,
    ip_forward_finish);

    /*
    * Error handling (elided)
    */
    }
  • NF_INET_LOCAL_OUT

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    /*
     * Finalise the IPv4 header of a locally generated packet and traverse
     * the NF_INET_LOCAL_OUT hook. Note that this calls nf_hook() directly
     * and returns its result; dst_output is only passed along as okfn and
     * is not invoked in this function.
     */
    int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
    {
    struct iphdr *iph = ip_hdr(skb);

    /* Fill in the total length */
    iph->tot_len = htons(skb->len);
    /* Compute the header checksum */
    ip_send_check(iph);

    /* if egress device is enslaved to an L3 master device pass the
    * skb to its handler for processing
    */
    skb = l3mdev_ip_out(sk, skb);
    if (unlikely(!skb))
    return 0;

    /* Mark the skb as carrying IP */
    skb->protocol = htons(ETH_P_IP);

    /* Pass through the NF_INET_LOCAL_OUT hook point */
    return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
    net, sk, skb, NULL, skb_dst(skb)->dev,
    dst_output);
    }
  • NF_INET_POST_ROUTING

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    /*
     * Output path for IPv4: records the original skb->dev as the input
     * device, switches skb->dev to the route's output device and traverses
     * the NF_INET_POST_ROUTING hook with ip_finish_output as continuation.
     */
    int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
    {
    struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;

    IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);

    /* Set the output device and protocol */
    skb->dev = dev;
    skb->protocol = htons(ETH_P_IP);

    /* Pass through the NF_INET_POST_ROUTING hook point.
     * NOTE(review): the last argument is the condition handed to
     * NF_HOOK_COND; presumably the hooks are skipped for packets flagged
     * IPSKB_REROUTED - confirm against the NF_HOOK_COND definition.
     */
    return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
    net, sk, skb, indev, dev,
    ip_finish_output,
    !(IPCB(skb)->flags & IPSKB_REROUTED));
    }
    EXPORT_SYMBOL(ip_output);

再放两张bridge的hook点的图

hook执行:NF_HOOK

看看NF_HOOK函数是怎么执行注册的过滤函数的

1
2
3
4
5
6
7
8
9
10
/*
 * Run the hooks registered for (pf, hook) through nf_hook(). When
 * nf_hook() returns 1 the continuation okfn(net, sk, skb) is called and
 * its result returned; any other value from nf_hook() is returned as is.
 */
static inline int
NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb,
struct net_device *in, struct net_device *out,
int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn);
if (ret == 1)
ret = okfn(net, sk, skb);
return ret;
}

调用nf_hook执行hook,成功则执行接下来要执行的函数okfn,如

  • ip_rcv 👉 ip_rcv_finish
  • ip_local_deliver 👉 ip_local_deliver_finish
  • ip_forward 👉 ip_forward_finish
  • ip_output 👉 ip_finish_output

详情可见👆函数调用图和hook点

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
/*
 * Look up the hook chain for (pf, hook) in the given net namespace and,
 * if one exists, run it through nf_hook_slow(). Returns 1 when there is
 * no chain to run (ret keeps its initial value), otherwise whatever
 * nf_hook_slow() returns. The lookup and the call happen under
 * rcu_read_lock().
 */
static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
struct sock *sk, struct sk_buff *skb,
struct net_device *indev, struct net_device *outdev,
int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
struct nf_hook_entries *hook_head = NULL;
int ret = 1;

/*
* 1. Fast path for almost-always-true/false branches: when pf and hook
*    are compile-time constants and the static key
*    nf_hooks_needed[pf][hook] is off, return 1 right away.
*/
#ifdef CONFIG_JUMP_LABEL
if (__builtin_constant_p(pf) &&
__builtin_constant_p(hook) &&
!static_key_false(&nf_hooks_needed[pf][hook]))
return 1;
#endif

/*
* 2. Fetch the hook chain for this hook point from the namespace
*/
rcu_read_lock();
switch (pf) {
case NFPROTO_IPV4:
hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]);
break;
case NFPROTO_IPV6:
hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]);
break;
case NFPROTO_ARP:
#ifdef CONFIG_NETFILTER_FAMILY_ARP
/* Only the ARP case bounds-checks the hook index before use */
if (WARN_ON_ONCE(hook >= ARRAY_SIZE(net->nf.hooks_arp)))
break;
hook_head = rcu_dereference(net->nf.hooks_arp[hook]);
#endif
break;
case NFPROTO_BRIDGE:
#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
hook_head = rcu_dereference(net->nf.hooks_bridge[hook]);
#endif
break;
#if IS_ENABLED(CONFIG_DECNET)
case NFPROTO_DECNET:
hook_head = rcu_dereference(net->nf.hooks_decnet[hook]);
break;
#endif
default:
/* Unknown family: warn once, hook_head stays NULL */
WARN_ON_ONCE(1);
break;
}

/*
* 3. Run the hook chain via nf_hook_slow(), starting at index 0
*/
if (hook_head) {
struct nf_hook_state state;

nf_hook_state_init(&state, hook, pf, indev, outdev,
sk, net, okfn);

ret = nf_hook_slow(skb, &state, hook_head, 0);
}
rcu_read_unlock();

return ret;
}
  • CONFIG_JUMP_LABEL的定义,简单说就是优化一些几乎为真 / 假的分支,加快执行速度

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    config JUMP_LABEL
    bool "Optimize very unlikely/likely branches"
    depends on HAVE_ARCH_JUMP_LABEL
    depends on CC_HAS_ASM_GOTO
    help
    This option enables a transparent branch optimization that
    makes certain almost-always-true or almost-always-false branch
    conditions even cheaper to execute within the kernel.

    Certain performance-sensitive kernel code, such as trace points,
    scheduler functionality, networking code and KVM have such
    branches and include support for this optimization technique.

    If it is detected that the compiler has support for "asm goto",
    the kernel will compile such branches with just a nop
    instruction. When the condition flag is toggled to true, the
    nop will be converted to a jump instruction to execute the
    conditional block of instructions.

    This technique lowers overhead and stress on the branch prediction
    of the processor and generally makes the kernel faster. The update
    of the condition is slower, but those are always very rare.

    ( On 32-bit x86, the necessary options added to the compiler
    flags may increase the size of the kernel slightly. )
  • 高版本的nf_hook不是全局储存的,而是注册在network namespace中,net结构体表示network namespace

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    struct net {


    #ifdef CONFIG_NETFILTER
    struct netns_nf nf;
    #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
    struct netns_ct ct;
    #endif
    #if defined(CONFIG_NF_TABLES) || defined(CONFIG_NF_TABLES_MODULE)
    struct netns_nftables nft;
    #endif


    } __randomize_layout;

    netns_nf储存了不同协议的hook链

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    /*
     * Per-namespace netfilter state (excerpt; other members elided).
     * Each array holds one RCU-protected nf_hook_entries pointer per hook
     * number of the corresponding protocol family; the ARP, bridge and
     * DECnet arrays exist only when the matching config option is set.
     */
    struct netns_nf {


    struct nf_hook_entries __rcu *hooks_ipv4[NF_INET_NUMHOOKS];
    struct nf_hook_entries __rcu *hooks_ipv6[NF_INET_NUMHOOKS];
    #ifdef CONFIG_NETFILTER_FAMILY_ARP
    struct nf_hook_entries __rcu *hooks_arp[NF_ARP_NUMHOOKS];
    #endif
    #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
    struct nf_hook_entries __rcu *hooks_bridge[NF_INET_NUMHOOKS];
    #endif
    #if IS_ENABLED(CONFIG_DECNET)
    struct nf_hook_entries __rcu *hooks_decnet[NF_DN_NUMHOOKS];
    #endif


    };
    1
    2
    3
    4
    5
    /*
     * Variable-length array of hook entries; num_hook_entries is the
     * number of elements in the trailing hooks[] flexible array.
     */
    struct nf_hook_entries {
    u16 num_hook_entries;
    /* padding */
    struct nf_hook_entry hooks[];
    };
    1
    2
    3
    4
    /* One hook: the callback and the private pointer passed back to it. */
    struct nf_hook_entry {
    nf_hookfn *hook;
    void *priv;
    };
    1
    2
    3
    /*
     * Signature of a hook callback: receives its private pointer, the skb
     * and the read-only hook state, and returns an unsigned int verdict.
     */
    typedef unsigned int nf_hookfn(void *priv,
    struct sk_buff *skb,
    const struct nf_hook_state *state);

    结构目前看来大概是这样👇

    • nf_hook_entries表示某一个hook点(NF_INET_PRE_ROUTING等5种之一)上的整条hook链
    • nf_hook_entry表示一个hook点上的一个hook函数
  • nf_hook_state_init初始化了中间结构体nf_hook_state

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    /*
     * Fill every field of *p from the arguments. Pure field-by-field
     * assignment; nothing else in *p is touched.
     */
    static inline void nf_hook_state_init(struct nf_hook_state *p,
    unsigned int hook,
    u_int8_t pf,
    struct net_device *indev,
    struct net_device *outdev,
    struct sock *sk,
    struct net *net,
    int (*okfn)(struct net *, struct sock *, struct sk_buff *))
    {
    p->hook = hook;
    p->pf = pf;
    p->in = indev;
    p->out = outdev;
    p->sk = sk;
    p->net = net;
    p->okfn = okfn;
    }
    1
    2
    3
    4
    5
    6
    7
    8
    9
    /*
     * Context handed to every hook callback: hook number, protocol family,
     * input/output devices, socket, net namespace and the continuation.
     * Note hook and pf are stored as u8 here.
     */
    struct nf_hook_state {
    u8 hook;
    u8 pf;
    struct net_device *in;
    struct net_device *out;
    struct sock *sk;
    struct net *net;
    int (*okfn)(struct net *, struct sock *, struct sk_buff *);
    };

    nf_hook_slow真正执行了所有的hook,并根据hook的返回值决定下一步动作,丢弃报文 / 接受报文 / 加入队列由对应模块处理

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    /* Returns 1 if okfn() needs to be executed by the caller,
    * -EPERM for NF_DROP, 0 otherwise. Caller must hold rcu_read_lock.
    *
    * Walks e->hooks[] starting at index s, calling each hook in turn and
    * acting on the masked verdict it returns.
    */
    int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
    const struct nf_hook_entries *e, unsigned int s)
    {
    unsigned int verdict;
    int ret;

    for (; s < e->num_hook_entries; s++) {
    verdict = nf_hook_entry_hookfn(&e->hooks[s], skb, state);
    switch (verdict & NF_VERDICT_MASK) {
    case NF_ACCEPT:
    /* Move on to the next hook */
    break;
    case NF_DROP:
    /* Free the skb; use the error carried in the verdict, or -EPERM if none */
    kfree_skb(skb);
    ret = NF_DROP_GETERR(verdict);
    if (ret == 0)
    ret = -EPERM;
    return ret;
    case NF_QUEUE:
    /* nf_queue() returning 1 means: keep going with the next hook */
    ret = nf_queue(skb, state, s, verdict);
    if (ret == 1)
    continue;
    return ret;
    default:
    /* Implicit handling for NF_STOLEN, as well as any other
    * non conventional verdicts.
    */
    return 0;
    }
    }

    /* Every hook accepted the packet */
    return 1;
    }
    EXPORT_SYMBOL(nf_hook_slow);
    1
    2
    3
    4
    5
    6
    /* Invoke one hook entry's callback with its own priv pointer. */
    static inline int
    nf_hook_entry_hookfn(const struct nf_hook_entry *entry, struct sk_buff *skb,
    struct nf_hook_state *state)
    {
    return entry->hook(entry->priv, skb, state);
    }

hook注册:nf_register_net_hook

根据协议注册不同的hook

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/*
 * Register one hook in a net namespace, dispatching on reg->pf.
 * For NFPROTO_INET the ingress hook is registered once under
 * NFPROTO_INET; any other inet hook is registered for both IPv4 and
 * IPv6. Returns 0 on success or the negative error from
 * __nf_register_net_hook().
 */
int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
{
int err;

if (reg->pf == NFPROTO_INET) {
if (reg->hooknum == NF_INET_INGRESS) {
err = __nf_register_net_hook(net, NFPROTO_INET, reg);
if (err < 0)
return err;
} else {
err = __nf_register_net_hook(net, NFPROTO_IPV4, reg);
if (err < 0)
return err;

err = __nf_register_net_hook(net, NFPROTO_IPV6, reg);
if (err < 0) {
/* Roll back the IPv4 registration so no half-registered state remains */
__nf_unregister_net_hook(net, NFPROTO_IPV4, reg);
return err;
}
}
} else {
err = __nf_register_net_hook(net, reg->pf, reg);
if (err < 0)
return err;
}

return 0;
}
EXPORT_SYMBOL(nf_register_net_hook);

nf_hook_ops结构体

1
2
3
4
5
6
7
8
9
10
11
/*
 * Registration record for one hook: the callback, an optional device,
 * the private pointer handed back to the callback, the protocol family,
 * the kind of hook owner, the hook number and the priority used to
 * order hooks on the same hook point.
 */
struct nf_hook_ops {
/* User fills in from here down. */
nf_hookfn *hook;
struct net_device *dev;
void *priv;
u8 pf;
enum nf_hook_ops_type hook_ops_type:8;
unsigned int hooknum;
/* Hooks are ordered in ascending priority. */
int priority;
};
  • hook:hook函数

  • dev:net设备

  • priv:指针

  • pf:协议类型,PF_INET之类的

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    /* Supported address families. */
    #define AF_UNSPEC 0
    #define AF_UNIX 1 /* Unix domain sockets */
    #define AF_LOCAL 1 /* POSIX name for AF_UNIX */
    #define AF_INET 2 /* Internet IP Protocol */


    /* Protocol families, same as address families. */
    #define PF_UNSPEC AF_UNSPEC
    #define PF_UNIX AF_UNIX
    #define PF_LOCAL AF_LOCAL
    #define PF_INET AF_INET


  • hook_ops_type:注册的hook类型,是不是nf_tables

    1
    2
    3
    4
    /* Tag stored in nf_hook_ops.hook_ops_type: undefined (0) or nf_tables. */
    enum nf_hook_ops_type {
    NF_HOOK_OP_UNDEFINED,
    NF_HOOK_OP_NF_TABLES,
    };
  • hooknum:hook类型(是哪个hook点),也是nf_hook_entries的index

  • priority:优先级

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    /*
     * Named priority values for IP hooks, listed from INT_MIN to INT_MAX.
     * NF_IP_PRI_CONNTRACK_CONFIRM and NF_IP_PRI_LAST share INT_MAX.
     */
    enum nf_ip_hook_priorities {
    NF_IP_PRI_FIRST = INT_MIN,
    NF_IP_PRI_RAW_BEFORE_DEFRAG = -450,
    NF_IP_PRI_CONNTRACK_DEFRAG = -400,
    NF_IP_PRI_RAW = -300,
    NF_IP_PRI_SELINUX_FIRST = -225,
    NF_IP_PRI_CONNTRACK = -200,
    NF_IP_PRI_MANGLE = -150,
    NF_IP_PRI_NAT_DST = -100,
    NF_IP_PRI_FILTER = 0,
    NF_IP_PRI_SECURITY = 50,
    NF_IP_PRI_NAT_SRC = 100,
    NF_IP_PRI_SELINUX_LAST = 225,
    NF_IP_PRI_CONNTRACK_HELPER = 300,
    NF_IP_PRI_CONNTRACK_CONFIRM = INT_MAX,
    NF_IP_PRI_LAST = INT_MAX,
    };

__nf_register_net_hook将新的hook插入对应的nf_hook_entries表项

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
/*
 * Register `reg` for protocol family `pf` in `net`: validate the request,
 * find the hook-chain slot, swap in a grown copy of the chain under
 * nf_hook_mutex, then update counters and free the old chain.
 * Returns 0 on success or a negative errno.
 */
static int __nf_register_net_hook(struct net *net, int pf,
const struct nf_hook_ops *reg)
{
struct nf_hook_entries *p, *new_hooks;
struct nf_hook_entries __rcu **pp;
int err;

/*
* 1. Validate the request for this family
*/
switch (pf) {
case NFPROTO_NETDEV:
#ifndef CONFIG_NETFILTER_INGRESS
if (reg->hooknum == NF_NETDEV_INGRESS)
return -EOPNOTSUPP;
#endif
#ifndef CONFIG_NETFILTER_EGRESS
if (reg->hooknum == NF_NETDEV_EGRESS)
return -EOPNOTSUPP;
#endif
/* Only ingress/egress hooks on a device belonging to this netns are valid */
if ((reg->hooknum != NF_NETDEV_INGRESS &&
reg->hooknum != NF_NETDEV_EGRESS) ||
!reg->dev || dev_net(reg->dev) != net)
return -EINVAL;
break;
case NFPROTO_INET:
if (reg->hooknum != NF_INET_INGRESS)
break;

err = nf_ingress_check(net, reg, NF_INET_INGRESS);
if (err < 0)
return err;
break;
}

/*
* 2. Locate the nf_hook_entries slot for this family's hook number
*/
pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev);
if (!pp)
return -EINVAL;

mutex_lock(&nf_hook_mutex);

/*
* 3. Build an updated nf_hook_entries containing the new hook and
*    publish it on success
*/
p = nf_entry_dereference(*pp);
new_hooks = nf_hook_entries_grow(p, reg);

if (!IS_ERR(new_hooks)) {
hooks_validate(new_hooks);
rcu_assign_pointer(*pp, new_hooks);
}

/*
* 4. Unlock, bail out on error, then do the bookkeeping
*/
mutex_unlock(&nf_hook_mutex);
if (IS_ERR(new_hooks))
return PTR_ERR(new_hooks);

#ifdef CONFIG_NETFILTER_INGRESS
if (nf_ingress_hook(reg, pf))
net_inc_ingress_queue();
#endif
#ifdef CONFIG_NETFILTER_EGRESS
if (nf_egress_hook(reg, pf))
net_inc_egress_queue();
#endif
nf_static_key_inc(reg, pf);

/* The grown array must be a fresh object; the old one is released */
BUG_ON(p == new_hooks);
nf_hook_entries_free(p);
return 0;
}

看看nf_hook_entries更新的过程,就是nf_hook_entries_grow函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
/*
 * Build a new nf_hook_entries that contains every live entry of `old`
 * (entries whose ops pointer is &dummy_ops are dropped) plus `reg`,
 * kept in ascending priority order. `old` may be NULL. Returns the new
 * array, ERR_PTR(-E2BIG) if it would exceed MAX_HOOK_COUNT, or
 * ERR_PTR(-ENOMEM) on allocation failure. `old` itself is not modified.
 */
static struct nf_hook_entries *
nf_hook_entries_grow(const struct nf_hook_entries *old,
const struct nf_hook_ops *reg)
{
unsigned int i, alloc_entries, nhooks, old_entries;
struct nf_hook_ops **orig_ops = NULL;
struct nf_hook_ops **new_ops;
struct nf_hook_entries *new;
bool inserted = false;

/*
* 1. Count the live nf_hook_ops in the old array; the count starts at
*    1 to make room for the hook being added
*/
alloc_entries = 1;
old_entries = old ? old->num_hook_entries : 0;

if (old) {
orig_ops = nf_hook_entries_get_hook_ops(old);

for (i = 0; i < old_entries; i++) {
if (orig_ops[i] != &dummy_ops)
alloc_entries++;
}
}

/*
* 2. Allocate the new nf_hook_entries
*/
if (alloc_entries > MAX_HOOK_COUNT)
return ERR_PTR(-E2BIG);

new = allocate_hook_entries_size(alloc_entries);
if (!new)
return ERR_PTR(-ENOMEM);

/*
* 3. Copy the live nf_hook_ops / nf_hook_entry pairs from the old
*    array and insert the new hook according to its priority
*/
new_ops = nf_hook_entries_get_hook_ops(new);

i = 0;
nhooks = 0;
while (i < old_entries) {
if (orig_ops[i] == &dummy_ops) {
++i;
continue;
}

/* Old entries with strictly lower priority go first; the new hook is
 * placed before the first old entry whose priority is >= its own.
 */
if (inserted || reg->priority > orig_ops[i]->priority) {
new_ops[nhooks] = (void *)orig_ops[i];
new->hooks[nhooks] = old->hooks[i];
i++;
} else {
new_ops[nhooks] = (void *)reg;
new->hooks[nhooks].hook = reg->hook;
new->hooks[nhooks].priv = reg->priv;
inserted = true;
}
nhooks++;
}

/* Not inserted yet: the new hook has the highest priority value, append it */
if (!inserted) {
new_ops[nhooks] = (void *)reg;
new->hooks[nhooks].hook = reg->hook;
new->hooks[nhooks].priv = reg->priv;
}

return new;
}
  • nf_hook_entries_get_hook_ops函数用于从nf_hook_entries中定位nf_hook_ops指针数组的位置(存在nf_hook_entry数组之后)

    1
    2
    3
    4
    5
    6
    7
    8
    9
    /*
     * Return the address just past the last element of e->hooks[], typed
     * as an array of nf_hook_ops pointers.
     */
    static inline struct nf_hook_ops **nf_hook_entries_get_hook_ops(const struct nf_hook_entries *e)
    {
    unsigned int n = e->num_hook_entries;
    const void *hook_end;

    hook_end = &e->hooks[n]; /* this is *past* ->hooks[]! */

    return (struct nf_hook_ops **)hook_end;
    }
  • allocate_hook_entries_size申请空间,nf_hook_entries实际上包含

    • num_hook_entries项
    • 一个nf_hook_entry数组
    • 一个nf_hook_ops指针数组

    nf_hook_entry数组和nf_hook_ops指针数组中的项一一对应

    还有一个rcu的结构,先忽略(~o ̄3 ̄)~

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    /*
     * Allocate a zeroed nf_hook_entries sized for `num` hooks: the header,
     * `num` nf_hook_entry elements, `num` nf_hook_ops pointers and one
     * nf_hook_entries_rcu_head. Returns NULL when num is 0 or when the
     * allocation fails; on success num_hook_entries is set to num.
     */
    static struct nf_hook_entries *allocate_hook_entries_size(u16 num)
    {
    struct nf_hook_entries *e;
    size_t alloc = sizeof(*e) +
    sizeof(struct nf_hook_entry) * num +
    sizeof(struct nf_hook_ops *) * num +
    sizeof(struct nf_hook_entries_rcu_head);

    if (num == 0)
    return NULL;

    e = kvzalloc(alloc, GFP_KERNEL);
    if (e)
    e->num_hook_entries = num;
    return e;
    }

更新后的结构应该是这样的👇

iptables

由于某个强迫症非要把这条链全都捋一遍,工作量++

iptables使用

四表五链

  • 四表(还有其他表比如security,忽略)
    • filter:过滤,内核模块 iptable_filter
    • nat:网络地址转换(目标ip或者源ip),内核模块 iptable_nat
    • mangle:拆解报文,修改,重新封装,内核模块 iptable_mangle
    • raw:关闭nat表上启用的连接追踪机制,内核模块 iptable_raw
  • 五链(详情见上hook挂载👆)
    • PREROUTING
    • INPUT
    • OUTPUT
    • FORWARD
    • POSTROUTING

表链关系,优先级向下递减

表 / 链 PREROUTING INPUT OUTPUT FORWARD POSTROUTING
raw
mangle
nat
filter

规则

一条规则由 匹配条件(match)和 处理动作(target)构成,匹配条件又分为 基本匹配条件 和 扩展匹配条件

  • 匹配条件
    • 基本匹配条件
      • 源IP
      • 目的IP
    • 扩展匹配条件(通常以模块形式存在,模块可以按需安装)
      • 源端口
      • 目标端口
  • 处理动作
    • ACCEPT:允许数据包通过
    • DROP:直接丢弃数据包,不给任何回应信息
    • REJECT:拒绝数据包通过,必要时会给数据发送端一个响应的信息,客户端刚请求就会收到拒绝的信息
    • SNAT:源地址转换,解决内网用户用同一个公网地址上网的问题
    • MASQUERADE:是SNAT的一种特殊形式,适用于动态的、临时会变的ip上
    • DNAT:目标地址转换
    • REDIRECT:在本机做端口映射
    • LOG:在/var/log/messages文件中记录日志信息,然后将数据包传递给下一条规则,也就是说除了记录以外不对数据包做任何其他操作,仍然让下一条规则去匹配

命令格式

1
2
3
4
5
iptables [-t 表]
命令选项
[链]
[匹配选项]
[操作选项]
  • 命令选项

    选项名 功能及特点
    -A --append 在指定链的末尾添加一条新的规则
    -D --delete 删除指定链中的某一条规则,按规则序号或内容确定要删除的规则
    -I --insert 在指定链中插入一条新的规则,默认在链的开头插入
    -R --replace 修改、替换指定链中的一条规则,按规则序号或内容确定
    -F --flush 清空指定链中的所有规则,默认清空表中所有链的内容
    -N --new-chain 新建一条用户自己定义的规则链
    -X --delete-chain 删除指定表中用户自定义的规则链
    -P --policy 设置指定链的默认策略
    -F, --flush 清空指定链上面的所有规则,如果没有指定链,清空表上所有链的所有规则
    -Z, --zero 把指定链或表中的所有链上的所有计数器清零
    -L --list 列出指定链中的所有的规则进行查看,默认列出表中所有链的内容
    -S --list-rules 以原始格式列出链中所有规则
    -v --verbose 查看规则列表时显示详细的信息
    -n --numeric 用数字形式显示输出结果,如显示主机的 IP 地址而不是主机名
    --line-numbers 查看规则列表时,同时显示规则在链中的顺序号
  • 匹配选项

    选项名 功能及特点
    -i --in-interface 匹配输入接口,如 eth0,eth1
    -o --out-interface 匹配输出接口
    -p --protocol 匹配协议类型,如 TCP、UDP 和 ICMP等
    -s --source 匹配的源地址
    --sport 匹配的源端口号
    -d --destination 匹配的目的地址
    --dport 匹配的目的端口号
    -m --match 匹配规则所使用的过滤模块
  • 操作选项

    一般为 -j 处理动作 的形式,处理动作包括ACCEPT,DROP,RETURN,REJECT,DNAT,SNAT等

iptables相关源码

iptables数据结构

回顾一下iptables命令的格式

1
2
3
4
5
iptables [-t 表]
命令选项
[链]
[匹配选项]
[操作选项]

再放一张图

接下来的结构体都对应着命令格式和图看

  • xt_table 表示“四表五链”中的表

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    /*
     * Descriptor of one table. It is linked into a list through `list`,
     * points at its xt_table_info via `private` and at its nf_hook_ops via
     * `ops`, and carries its address family, priority and a fixed-size
     * constant name. valid_hooks is used as a bitmask of hook numbers.
     */
    struct xt_table {
    struct list_head list;
    unsigned int valid_hooks;
    struct xt_table_info *private;
    struct nf_hook_ops *ops;
    struct module *me;
    u_int8_t af;
    int priority;
    const char name[XT_TABLE_MAXNAMELEN];
    };
    • list:xt_table以链表储存

    • valid_hooks:用掩码的方式表示有哪些hook,如

      1
      2
      3
      4
      .valid_hooks	= (1 << NF_INET_PRE_ROUTING) |
      (1 << NF_INET_POST_ROUTING) |
      (1 << NF_INET_LOCAL_OUT) |
      (1 << NF_INET_LOCAL_IN),
    • private:指向命令的描述结构体 xt_table_info

    • ops:指向之前分析过的nf_hook_ops结构体

    • af:协议簇

    • priority:优先级

    • name:表名,如nat,filter,mangle

  • xt_table_info 表示表中的命令

    注:所有的idx表示entries[]的index

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    /*
     * Header placed in front of a table's rule blob. hook_entry[] and
     * underflow[] have one slot per inet hook (NF_INET_NUMHOOKS); the rule
     * data itself lives in the trailing, 8-byte-aligned entries[] array.
     * NOTE(review): per-field semantics are not derivable from this
     * excerpt alone - confirm against the kernel's x_tables.h comments.
     */
    struct xt_table_info {
    unsigned int size;
    unsigned int number;
    unsigned int initial_entries;

    unsigned int hook_entry[NF_INET_NUMHOOKS];
    unsigned int underflow[NF_INET_NUMHOOKS];

    unsigned int stacksize;
    void ***jumpstack;

    unsigned char entries[] __aligned(8);
    };
    • size:table的大小
    • number:包含的entry个数,命令的条数
    • initial_entries:表初始化时的entry个数(内核注释:用于模块引用计数),并不是起始idx
    • hook_entry:每一条链的起始idx
    • underflow:每一条链的结束idx
    • stacksize:栈大小
    • jumpstack:等同于 jumpstack[cpuid][entry*]
      • 当命令中发生了jump(A→B)时:push A->idx
      • 当命令的target为RETURN时:pop A->idx,继续执行
    • entries:ipt_entry 数组,一个entry表示一条命令
  • ipt_entry 一个entry表示一条命令

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    /*
     * One IPv4 rule: the basic IP match, two 16-bit offsets
     * (target_offset, next_offset), a back-reference field, packet/byte
     * counters, and a zero-length trailing byte array `elems` that marks
     * where the variable-length part of the rule begins.
     * NOTE(review): presumably matches followed by the target are stored
     * in elems - confirm against uapi ip_tables.h.
     */
    struct ipt_entry {
    struct ipt_ip ip;

    unsigned int nfcache;

    __u16 target_offset;
    __u16 next_offset;

    unsigned int comefrom;

    struct xt_counters counters;

    unsigned char elems[0];
    };
    • ip:基本匹配IP
    • target_offset:target相对于本ipt_entry起始处的偏移(不是绝对地址),即 sizeof(ipt_entry) + 所有match的总大小
    • next_offset:下一个ipt_entry相对于本ipt_entry起始处的偏移,即 sizeof(ipt_entry) + 所有match + target 的总大小
    • comefrom:从哪个entry来的
    • counters:packet和byte的计数器
    • elems:保存了 xt_match 和 xt_target
      • elems的地址即xt_match的地址
      • xt_target的地址由target_offset指定

xt_table和net的连接

我们从 xt_register_table 中可以窥见 xt_table 是怎么储存在 网络命名空间 net 结构体 中的

1
2
3
4
5
6
7
8
9
10
11
/*
 * Abridged excerpt of xt_register_table(): the code that sets up `table`
 * and the return statement are elided. What remains shows that the
 * per-namespace xt_pernet is fetched with net_generic(net, xt_pernet_id)
 * and the table is linked into xt_net->tables[] at index table->af.
 */
struct xt_table *xt_register_table(struct net *net,
const struct xt_table *input_table,
struct xt_table_info *bootstrap,
struct xt_table_info *newinfo)
{
struct xt_pernet *xt_net = net_generic(net, xt_pernet_id);

struct xt_table *t, *table;

list_add(&table->list, &xt_net->tables[table->af]);
}
  • xt_table在net中的存储依赖成员 gen

    1
    2
    3
    4
    5
    struct net {

    struct net_generic __rcu *gen;

    };
  • net_generic结构体和xt_pernet结构体

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    /*
     * Per-namespace generic pointer table. The anonymous union overlays a
     * small header (length plus rcu_head) with the zero-length ptr[]
     * array, which net_generic() indexes by id.
     */
    struct net_generic {
    union {
    struct {
    unsigned int len;
    struct rcu_head rcu;
    } s;

    void *ptr[0];
    };
    };

    /* Per-namespace x_tables state: one list head per NFPROTO_* family. */
    struct xt_pernet {
    struct list_head tables[NFPROTO_NUMPROTO];
    };
  • net_generic函数

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    /*
     * Return the pointer stored at slot `id` of net->gen, read under
     * rcu_read_lock(). No bounds check on `id` is performed here.
     */
    static inline void *net_generic(const struct net *net, unsigned int id)
    {
    struct net_generic *ng;
    void *ptr;

    rcu_read_lock();
    ng = rcu_dereference(net->gen);
    ptr = ng->ptr[id];
    rcu_read_unlock();

    return ptr;
    }
  • 不考虑rcu的话其实就是 net->gen->ptr[id]

xt_table和hook的注册

四表都有各自的内核模块,以filter为例

1
2
module_init(iptable_filter_init);
module_exit(iptable_filter_fini);

iptable_filter_init 的函数调用关系图

  • 上文已经提到了,xt_table的注册在 xt_register_table 中完成

  • hooks的注册在 nf_register_net_hooks 中完成

    在Netfilter中已经介绍过了 nf_register_net_hook,nf_register_net_hooks其实就是一次注册多个

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    /*
     * Register the first `n` entries of the `reg` array one by one with
     * nf_register_net_hook(). On the first failure, the `i` hooks already
     * registered are unregistered again and the error is returned.
     * Returns 0 when all registrations succeed (or when n is 0).
     */
    int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg,
    unsigned int n)
    {
    unsigned int i;
    int err = 0;

    for (i = 0; i < n; i++) {
    err = nf_register_net_hook(net, &reg[i]);
    if (err)
    goto err;
    }
    return err;

    err:
    /* Undo only what was registered before the failing index */
    if (i > 0)
    nf_unregister_net_hooks(net, reg, i);
    return err;
    }
    EXPORT_SYMBOL(nf_register_net_hooks);

用户态和内核态的消息传递

我们可以使用iptables命令行自定义命令,那我们自定义的命令是怎么传递给内核的呢(・∀・(・∀・(・∀・*)

用户态的iptables源码可以看出来

TC_COMMIT 函数利用 setsockopt 向内核提交数据

1
2
3
4
5
6
7
8
/*
 * Abridged excerpt of the userspace TC_COMMIT(): declarations and the
 * rest of the body are elided. The remaining call hands the replacement
 * blob `repl` to the kernel with setsockopt(SO_SET_REPLACE); the option
 * length is the fixed header plus repl->size bytes.
 */
int
TC_COMMIT(struct xtc_handle *handle)
{

ret = setsockopt(handle->sockfd, TC_IPPROTO, SO_SET_REPLACE, repl,
sizeof(*repl) + repl->size);

}

repl类型是 ipt_replace,可以看出来和xt_table相似度很高

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/*
 * Blob passed from userspace to replace a table: table name, hook
 * bitmask, entry count, size, per-hook entry/underflow arrays, a
 * userspace pointer for counters, and the rules as a trailing
 * zero-length array of ipt_entry.
 * NOTE(review): `size` is presumably the byte size of entries[], since
 * TC_COMMIT sends sizeof(*repl) + repl->size bytes - confirm.
 */
struct ipt_replace {
char name[XT_TABLE_MAXNAMELEN];

unsigned int valid_hooks;

unsigned int num_entries;

unsigned int size;

unsigned int hook_entry[NF_INET_NUMHOOKS];

unsigned int underflow[NF_INET_NUMHOOKS];

unsigned int num_counters;
struct xt_counters __user *counters;

struct ipt_entry entries[0];
};

nftables

nftables使用

创建表

1
$ nft add table inet my_table

一个表一个地址簇

nftables簇 iptables命令行工具
ip iptables
ip6 ip6tables
inet iptables和ip6tables
arp arptables
bridge ebtables

列出所有规则

1
2
3
$ nft list ruleset
table inet my_table {
}

创建链

两种链

  • 常规链:不需要指定钩子类型和优先级,可以用来做跳转,从逻辑上对规则进行分类

    1
    $ nft add chain inet my_table my_utility_chain
  • 基本链:数据包的入口点,需要指定钩子类型和优先级

    1
    $ nft add chain inet my_table my_filter_chain { type filter hook input priority 0 \; }
1
2
3
4
5
6
7
8
9
10
11
12
$ nft list chain inet my_table my_utility_chain
table inet my_table {
chain my_utility_chain {
}
}

$ nft list chain inet my_table my_filter_chain
table inet my_table {
chain my_filter_chain {
type filter hook input priority 0; policy accept;
}
}

创建规则

  • add:规则添加到链末尾

  • insert:规则添加到链头

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    $ nft add rule inet my_table my_filter_chain tcp dport ssh accept
    $ nft insert rule inet my_table my_filter_chain tcp dport http accept
    $ nft list ruleset
    table inet my_table {
    chain my_filter_chain {
    type filter hook input priority 0; policy accept;
    tcp dport http accept
    tcp dport ssh accept
    }
    }
  • index:指定规则index

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    $ nft insert rule inet my_table my_filter_chain index 1 tcp dport nfs accept
    $ nft list ruleset
    table inet my_table {
    chain my_filter_chain {
    type filter hook input priority 0; policy accept;
    tcp dport http accept
    tcp dport nfs accept
    tcp dport ssh accept
    }
    }

    $ nft add rule inet my_table my_filter_chain index 0 tcp dport 1234 accept
    $ nft list ruleset
    table inet my_table {
    chain my_filter_chain {
    type filter hook input priority 0; policy accept;
    tcp dport http accept
    tcp dport 1234 accept
    tcp dport nfs accept
    tcp dport ssh accept
    }
    }
  • handle:指定规则句柄

    • add 表示新规则添加在索引位置的规则后面
    • insert 表示新规则添加在索引位置的规则前面
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    $ nft --handle list ruleset
    table inet my_table { # handle 10
    chain my_filter_chain { # handle 2
    type filter hook input priority 0; policy accept;
    tcp dport http accept # handle 4
    tcp dport 1234 accept # handle 6
    tcp dport nfs accept # handle 5
    tcp dport ssh accept # handle 3
    }
    }

    $ nft add rule inet my_table my_filter_chain handle 4 tcp dport 2345 accept
    $ nft insert rule inet my_table my_filter_chain handle 5 tcp dport 3456 accept
    $ nft --handle list ruleset
    table inet my_table { # handle 10
    chain my_filter_chain { # handle 2
    type filter hook input priority 0; policy accept;
    tcp dport http accept # handle 4
    tcp dport 2345 accept # handle 8
    tcp dport 1234 accept # handle 6
    tcp dport 3456 accept # handle 9
    tcp dport nfs accept # handle 5
    tcp dport ssh accept # handle 3
    }
    }

删除规则

通过handle删除

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
$ nft --handle list ruleset
table inet my_table { # handle 10
chain my_filter_chain { # handle 2
type filter hook input priority 0; policy accept;
tcp dport http accept # handle 4
tcp dport 2345 accept # handle 8
tcp dport 1234 accept # handle 6
tcp dport 3456 accept # handle 9
tcp dport nfs accept # handle 5
tcp dport ssh accept # handle 3
udp dport 3333 accept # handle 10
}
}
$ nft delete rule inet my_table my_filter_chain handle 8
$ nft --handle list ruleset
table inet my_table { # handle 10
chain my_filter_chain { # handle 2
type filter hook input priority 0; policy accept;
tcp dport http accept # handle 4
tcp dport 1234 accept # handle 6
tcp dport 3456 accept # handle 9
tcp dport nfs accept # handle 5
tcp dport ssh accept # handle 3
udp dport 3333 accept # handle 10
}
}

列出规则

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
$ nft list table inet my_table # 列出表中规则
table inet my_table {
chain my_filter_chain {
type filter hook input priority 0; policy accept;
tcp dport http accept
tcp dport 1234 accept
tcp dport 3456 accept
tcp dport nfs accept
tcp dport ssh accept
udp dport 3333 accept
}
}
$ nft list chain inet my_table my_other_chain # 列出链中规则
table inet my_table {
chain my_other_chain {
udp dport 12345 log prefix "UDP-12345"
}
}

集合

  • 匿名集合

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    $ nft add rule inet my_table my_filter_chain ip saddr { 10.10.10.123, 10.10.10.231 } accept
    $ nft list ruleset
    table inet my_table {
    chain my_filter_chain {
    type filter hook input priority 0; policy accept;
    tcp dport http accept
    tcp dport nfs accept
    tcp dport ssh accept
    ip saddr { 10.10.10.123, 10.10.10.231 } accept
    }
    }
  • 命名集合

    支持的数据类型

    • ipv4_addr:IPv4 地址
    • ipv6_addr:IPv6 地址
    • ether_addr:以太网(Ethernet)地址
    • inet_proto:网络协议
    • inet_service:网络服务
    • mark:标记类型

    创建空的命名集合

    1
    2
    3
    4
    5
    6
    7
    $ nft add set inet my_table my_set { type ipv4_addr \; }
    $ nft list sets
    table inet my_table {
    set my_set {
    type ipv4_addr
    }
    }

    向集合中添加元素

    1
    2
    3
    4
    5
    6
    7
    8
    $ nft add element inet my_table my_set { 10.10.10.22, 10.10.10.33 }
    $ nft list set inet my_table my_set
    table inet my_table {
    set my_set {
    type ipv4_addr
    elements = { 10.10.10.22, 10.10.10.33 }
    }
    }

    引用集合

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    $ nft insert rule inet my_table my_filter_chain ip saddr @my_set drop
    $ nft list chain inet my_table my_filter_chain
    table inet my_table {
    chain my_filter_chain {
    type filter hook input priority 0; policy accept;
    ip saddr @my_set drop
    tcp dport http accept
    tcp dport nfs accept
    tcp dport ssh accept
    ip saddr { 10.10.10.123, 10.10.10.231 } accept
    }
    }
  • 支持区间

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    $ nft add set inet my_table my_range_set { type ipv4_addr \; flags interval \; }
    $ nft add element inet my_table my_range_set { 10.20.20.0/24 }
    $ nft list set inet my_table my_range_set
    table inet my_table {
    set my_range_set {
    type ipv4_addr
    flags interval
    elements = { 10.20.20.0/24 }
    }
    }
  • 级联不同类型

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    $ nft add set inet my_table my_concat_set  { type ipv4_addr . inet_proto . inet_service \; }

    $ nft list set inet my_table my_concat_set
    table inet my_table {
    set my_concat_set {
    type ipv4_addr . inet_proto . inet_service
    }
    }

    $ nft add element inet my_table my_concat_set { 10.30.30.30 . tcp . telnet }
    $ nft add rule inet my_table my_filter_chain ip saddr . meta l4proto . tcp dport @my_concat_set accept
    $ nft add rule inet my_table my_filter_chain ip saddr . meta l4proto . udp dport { 10.30.30.30 . udp . bootps } accept

字典

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
$ nft add chain inet my_table my_tcp_chain
$ nft add chain inet my_table my_udp_chain
$ nft add rule inet my_table my_filter_chain meta l4proto vmap { tcp : jump my_tcp_chain, udp : jump my_udp_chain }
$ nft list chain inet my_table my_filter_chain
table inet my_table {
chain my_filter_chain {
...
meta nfproto ipv4 ip saddr . meta l4proto . udp dport { 10.30.30.30 . udp . bootps } accept
meta l4proto vmap { tcp : jump my_tcp_chain, udp : jump my_udp_chain }
}
}

$ nft add map inet my_table my_vmap { type ipv4_addr : verdict \; }
$ nft add element inet my_table my_vmap { 192.168.0.10 : drop, 192.168.0.11 : accept }
$ nft add rule inet my_table my_filter_chain ip saddr vmap @my_vmap

nftables相关源码

nftables也有对应模块

1
2
module_init(nf_tables_module_init);
module_exit(nf_tables_module_exit);

nf_tables_module_init函数调用关系图

相关变量

用户态和内核态的消息传递

内核态和用户态通过 Netlink 交互

Netlink是一个内核接口,也是一种协议,便于用户与内核进行网络信息交互,最初开发是为了克服ioctl的限制

使用sendto向内核传递消息时的函数调用栈

nfnetlink_rcv_batch 利用 subsys_id 获取nftables的 nfnetlink_subsystem

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
u16 subsys_id, u32 genid)
{
const struct nfnetlink_subsystem *ss;
const struct nfnl_callback *nc;


ss = nfnl_dereference_protected(subsys_id);


nc = nfnetlink_find_client(type, ss);


err = nc->call(skb, &info, (const struct nlattr **)cda);


}

static inline const struct nfnl_callback *
nfnetlink_find_client(u16 type, const struct nfnetlink_subsystem *ss)
{
u8 cb_id = NFNL_MSG_TYPE(type);

if (cb_id >= ss->cb_count)
return NULL;

return &ss->cb[cb_id];
}

nftables的subsys_id是10,对应的nfnetlink_subsystem

1
2
3
4
5
6
7
8
9
10
11
static const struct nfnetlink_subsystem nf_tables_subsys = {
.name = "nf_tables",
.subsys_id = NFNL_SUBSYS_NFTABLES,
.cb_count = NFT_MSG_MAX,
.cb = nf_tables_cb,
.commit = nf_tables_commit,
.abort = nf_tables_abort,
.cleanup = nf_tables_cleanup,
.valid_genid = nf_tables_valid_genid,
.owner = THIS_MODULE,
};

nfnl_callback数组,包含了表和链各种数据结构的增删改查函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
[NFT_MSG_NEWTABLE] = {
.call = nf_tables_newtable,
.type = NFNL_CB_BATCH,
.attr_count = NFTA_TABLE_MAX,
.policy = nft_table_policy,
},
[NFT_MSG_GETTABLE] = {
.call = nf_tables_gettable,
.type = NFNL_CB_RCU,
.attr_count = NFTA_TABLE_MAX,
.policy = nft_table_policy,
},
[NFT_MSG_DELTABLE] = {
.call = nf_tables_deltable,
.type = NFNL_CB_BATCH,
.attr_count = NFTA_TABLE_MAX,
.policy = nft_table_policy,
},
[NFT_MSG_NEWCHAIN] = {
.call = nf_tables_newchain,
.type = NFNL_CB_BATCH,
.attr_count = NFTA_CHAIN_MAX,
.policy = nft_chain_policy,
},
[NFT_MSG_GETCHAIN] = {
.call = nf_tables_getchain,
.type = NFNL_CB_RCU,
.attr_count = NFTA_CHAIN_MAX,
.policy = nft_chain_policy,
},
[NFT_MSG_DELCHAIN] = {
.call = nf_tables_delchain,
.type = NFNL_CB_BATCH,
.attr_count = NFTA_CHAIN_MAX,
.policy = nft_chain_policy,
},
[NFT_MSG_NEWRULE] = {
.call = nf_tables_newrule,
.type = NFNL_CB_BATCH,
.attr_count = NFTA_RULE_MAX,
.policy = nft_rule_policy,
},
[NFT_MSG_GETRULE] = {
.call = nf_tables_getrule,
.type = NFNL_CB_RCU,
.attr_count = NFTA_RULE_MAX,
.policy = nft_rule_policy,
},
[NFT_MSG_DELRULE] = {
.call = nf_tables_delrule,
.type = NFNL_CB_BATCH,
.attr_count = NFTA_RULE_MAX,
.policy = nft_rule_policy,
},
[NFT_MSG_NEWSET] = {
.call = nf_tables_newset,
.type = NFNL_CB_BATCH,
.attr_count = NFTA_SET_MAX,
.policy = nft_set_policy,
},
[NFT_MSG_GETSET] = {
.call = nf_tables_getset,
.type = NFNL_CB_RCU,
.attr_count = NFTA_SET_MAX,
.policy = nft_set_policy,
},
[NFT_MSG_DELSET] = {
.call = nf_tables_delset,
.type = NFNL_CB_BATCH,
.attr_count = NFTA_SET_MAX,
.policy = nft_set_policy,
},
[NFT_MSG_NEWSETELEM] = {
.call = nf_tables_newsetelem,
.type = NFNL_CB_BATCH,
.attr_count = NFTA_SET_ELEM_LIST_MAX,
.policy = nft_set_elem_list_policy,
},
[NFT_MSG_GETSETELEM] = {
.call = nf_tables_getsetelem,
.type = NFNL_CB_RCU,
.attr_count = NFTA_SET_ELEM_LIST_MAX,
.policy = nft_set_elem_list_policy,
},
[NFT_MSG_DELSETELEM] = {
.call = nf_tables_delsetelem,
.type = NFNL_CB_BATCH,
.attr_count = NFTA_SET_ELEM_LIST_MAX,
.policy = nft_set_elem_list_policy,
},
[NFT_MSG_GETGEN] = {
.call = nf_tables_getgen,
.type = NFNL_CB_RCU,
},
[NFT_MSG_NEWOBJ] = {
.call = nf_tables_newobj,
.type = NFNL_CB_BATCH,
.attr_count = NFTA_OBJ_MAX,
.policy = nft_obj_policy,
},
[NFT_MSG_GETOBJ] = {
.call = nf_tables_getobj,
.type = NFNL_CB_RCU,
.attr_count = NFTA_OBJ_MAX,
.policy = nft_obj_policy,
},
[NFT_MSG_DELOBJ] = {
.call = nf_tables_delobj,
.type = NFNL_CB_BATCH,
.attr_count = NFTA_OBJ_MAX,
.policy = nft_obj_policy,
},
[NFT_MSG_GETOBJ_RESET] = {
.call = nf_tables_getobj,
.type = NFNL_CB_RCU,
.attr_count = NFTA_OBJ_MAX,
.policy = nft_obj_policy,
},
[NFT_MSG_NEWFLOWTABLE] = {
.call = nf_tables_newflowtable,
.type = NFNL_CB_BATCH,
.attr_count = NFTA_FLOWTABLE_MAX,
.policy = nft_flowtable_policy,
},
[NFT_MSG_GETFLOWTABLE] = {
.call = nf_tables_getflowtable,
.type = NFNL_CB_RCU,
.attr_count = NFTA_FLOWTABLE_MAX,
.policy = nft_flowtable_policy,
},
[NFT_MSG_DELFLOWTABLE] = {
.call = nf_tables_delflowtable,
.type = NFNL_CB_BATCH,
.attr_count = NFTA_FLOWTABLE_MAX,
.policy = nft_flowtable_policy,
},
};

nftables数据结构

1
2
3
网络命名空间	→ 连接			 → 表		  	→ 链			   → 规则	     → 表达式

net → nftables_pernet → nft_table → nft_chain → nft_rule → nft_expr

数据结构不复杂,上下级结构和同级结构之间都是双链表连接

注意一下nft_expr有 nft_expr_ops 表示表达式的处理函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
struct nft_expr {
const struct nft_expr_ops *ops;
unsigned char data[]
__attribute__((aligned(__alignof__(u64))));
};

struct nft_expr_ops {
void (*eval)(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt);


int (*init)(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[]);


const struct nft_expr_type *type;
void *data;
};
  • 实际运行调用 eval
  • 初始化调用 init

nftables数据结构创建函数

终极目的是注册hook,所以主要关注在哪里、以何种方式插入了什么hook

nf_tables_newtable

没什么需要注意的

  • 查看table是否已经存在,已存在则调用 nf_tables_updtable 更新table
  • 不存在则新建table,进行各种初始化,加入net的tables链表

nf_tables_newchain

开始和创建table流程一致

  • 查找table是否存在
  • 查找chain是否存在,存在则调用 nf_tables_updchain 更新chain
  • 不存在则调用 nf_tables_addchain 新建chain

nf_tables_addchain 流程

  • 如果chain是basechain,则初始化basechain
    • 调用 nft_chain_parse_hook 初始化 nft_chain_hook
    • 调用 nft_basechain_init 初始化basechain
  • 否则只是分配空间
  • 分配handle
  • 复制name
  • 分配rules空间
  • 调用 nf_tables_register_hook 注册hook
  • 将chain链入table

这个过程中涉及注册hook的过程有两个

  • basechain的初始化
  • 调用nf_tables_register_hook注册hook

看一下具体函数

  • 首先是 nft_chain_parse_hook,用于初始化 nft_chain_hook

    nft_chain_hook结构体要注册的hook由它的 type 决定

    • type先由 __nft_chain_type_get 获取default的type

      1
      2
      3
      4
      5
      6
      7
      8
      9
      10
      11
      type = __nft_chain_type_get(family, NFT_CHAIN_T_DEFAULT);

      static const struct nft_chain_type *
      __nft_chain_type_get(u8 family, enum nft_chain_types type)
      {
      if (family >= NFPROTO_NUMPROTO ||
      type >= NFT_CHAIN_T_MAX)
      return NULL;

      return chain_type[family][type];
      }

      chain_type 的初始化在 nf_tables_module_init 中进行👆

      懒的画二维数组了QAQ

      nf_tables_module_init初始化的type都是NFT_CHAIN_T_DEFAULT类型,举个🌰

      1
      2
      3
      4
      5
      6
      7
      8
      9
      10
      11
      static const struct nft_chain_type nft_chain_filter_netdev = {
      .name = "filter",
      .type = NFT_CHAIN_T_DEFAULT,
      .family = NFPROTO_NETDEV,
      .hook_mask = (1 << NF_NETDEV_INGRESS) |
      (1 << NF_NETDEV_EGRESS),
      .hooks = {
      [NF_NETDEV_INGRESS] = nft_do_chain_netdev,
      [NF_NETDEV_EGRESS] = nft_do_chain_netdev,
      },
      };
    • 如果用户定义了具体的type,则由 nf_tables_chain_type_lookup 获取type

      具体执行过程是遍历chain_type数组对比name是否一致

      1
      2
      3
      4
      5
      6
      7
      8
      9
      10
      11
      12
      13
      14
      15
      static const struct nft_chain_type *
      __nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family)
      {
      const struct nft_chain_type *type;
      int i;

      for (i = 0; i < NFT_CHAIN_T_MAX; i++) {
      type = __nft_chain_type_get(family, i);
      if (!type)
      continue;
      if (!nla_strcmp(nla, type->name))
      return type;
      }
      return NULL;
      }
  • nft_basechain_init 中进行了basechain的type和ops的初始化(红线数据赋值方向,黑线指针指向)

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    static void nft_basechain_hook_init(struct nf_hook_ops *ops, u8 family,
    const struct nft_chain_hook *hook,
    struct nft_chain *chain)
    {
    ops->pf = family;
    ops->hooknum = hook->num;
    ops->priority = hook->priority;
    ops->priv = chain;
    ops->hook = hook->type->hooks[ops->hooknum];
    ops->hook_ops_type = NF_HOOK_OP_NF_TABLES;
    }

    static int nft_basechain_init(struct nft_base_chain *basechain, u8 family,
    struct nft_chain_hook *hook, u32 flags)
    {
    struct nft_chain *chain;
    struct nft_hook *h;

    basechain->type = hook->type;
    INIT_LIST_HEAD(&basechain->hook_list);
    chain = &basechain->chain;

    if (nft_base_chain_netdev(family, hook->num)) {
    list_splice_init(&hook->list, &basechain->hook_list);
    list_for_each_entry(h, &basechain->hook_list, list)
    nft_basechain_hook_init(&h->ops, family, hook, chain);

    basechain->ops.hooknum = hook->num;
    basechain->ops.priority = hook->priority;
    } else {
    nft_basechain_hook_init(&basechain->ops, family, hook, chain);
    }


    return 0;
    }
  • nf_tables_register_hook 可以看出注册的是basechain的ops或者hook_list中的nft_hook

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    static int nf_tables_register_hook(struct net *net,
    const struct nft_table *table,
    struct nft_chain *chain)
    {
    struct nft_base_chain *basechain;
    const struct nf_hook_ops *ops;

    if (table->flags & NFT_TABLE_F_DORMANT ||
    !nft_is_base_chain(chain))
    return 0;

    basechain = nft_base_chain(chain);
    ops = &basechain->ops;

    if (basechain->type->ops_register)
    return basechain->type->ops_register(net, ops);

    if (nft_base_chain_netdev(table->family, basechain->ops.hooknum))
    return nft_netdev_register_hooks(net, &basechain->hook_list);

    return nf_register_net_hook(net, &basechain->ops);
    }

    static int nft_netdev_register_hooks(struct net *net,
    struct list_head *hook_list)
    {
    struct nft_hook *hook;
    int err, j;

    j = 0;
    list_for_each_entry(hook, hook_list, list) {
    err = nf_register_net_hook(net, &hook->ops);
    if (err < 0)
    goto err_register;

    j++;
    }
    return 0;

    err_register:
    list_for_each_entry(hook, hook_list, list) {
    if (j-- <= 0)
    break;

    nf_unregister_net_hook(net, &hook->ops);
    }
    return err;
    }

nf_tables_newrule

  • 查找table
  • 查找chain
  • 如果有NFTA_RULE_EXPRESSIONS,遍历所有expressions,调用 nf_tables_expr_parse 初始化 expr_info
  • 调用 nf_tables_newexpr 创建每一个expression

需要关注 nf_tables_expr_parse 和 nf_tables_newexpr 两个函数

  • nf_tables_expr_parse 函数初始化 nft_expr_info 结构体

    • 首先需要根据NFTA_EXPR_NAME获取 nft_expr_type

      最终调用的是 __nft_expr_type_get 函数,从 nf_tables_expressions 中获取(对比name)

      1
      2
      3
      4
      5
      6
      7
      8
      9
      10
      11
      12
      13
      14
      15
      static const struct nft_expr_type *__nft_expr_type_get(u8 family,
      struct nlattr *nla)
      {
      const struct nft_expr_type *type, *candidate = NULL;

      list_for_each_entry(type, &nf_tables_expressions, list) {
      if (!nla_strcmp(nla, type->name)) {
      if (!type->family && !candidate)
      candidate = type;
      else if (type->family == family)
      candidate = type;
      }
      }
      return candidate;
      }

      nf_tables_expressions的初始化也在nf_tables_module_init中,就是把 nft_basic_types 数组的所有元素链进链表

      1
      2
      3
      4
      5
      6
      7
      8
      9
      10
      11
      12
      13
      14
      15
      static struct nft_expr_type *nft_basic_types[] = {
      &nft_imm_type,
      &nft_cmp_type,
      &nft_lookup_type,
      &nft_bitwise_type,
      &nft_byteorder_type,
      &nft_payload_type,
      &nft_dynset_type,
      &nft_range_type,
      &nft_meta_type,
      &nft_rt_type,
      &nft_exthdr_type,
      &nft_last_type,
      &nft_counter_type,
      };

      举个🌰

      1
      2
      3
      4
      5
      6
      7
      struct nft_expr_type nft_imm_type __read_mostly = {
      .name = "immediate",
      .ops = &nft_imm_ops,
      .policy = nft_immediate_policy,
      .maxattr = NFTA_IMMEDIATE_MAX,
      .owner = THIS_MODULE,
      };
    • info的ops即type的ops

      1
      2
      3
      4
      5
      6
      7
      8
      9
      10
      11
      12
      13
      14
      15
      16
      17
      18
      static int nf_tables_expr_parse(const struct nft_ctx *ctx,
      const struct nlattr *nla,
      struct nft_expr_info *info)
      {
      const struct nft_expr_type *type;
      const struct nft_expr_ops *ops;


      type = nft_expr_type_get(ctx->net, ctx->family, tb[NFTA_EXPR_NAME]);


      ops = type->ops;


      info->ops = ops;


      }
    • nf_tables_newexpr 将info的ops赋值给expr,然后调用init

      1
      2
      3
      4
      5
      6
      7
      8
      9
      10
      11
      12
      13
      14
      15
      16
      17
      18
      19
      static int nf_tables_newexpr(const struct nft_ctx *ctx,
      const struct nft_expr_info *expr_info,
      struct nft_expr *expr)
      {
      const struct nft_expr_ops *ops = expr_info->ops;
      int err;

      expr->ops = ops;
      if (ops->init) {
      err = ops->init(ctx, expr, (const struct nlattr **)expr_info->tb);
      if (err < 0)
      goto err1;
      }

      return 0;
      err1:
      expr->ops = NULL;
      return err;
      }

nft_do_chain执行命令

nft_rule_dp的准备

nft_do_chain中的rule使用的都是nft_rule_dp而不是nft_rule,但nf_tables_newrule只创建了nft_rule

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
unsigned int
nft_do_chain(struct nft_pktinfo *pkt, void *priv)
{
const struct nft_chain *chain = priv, *basechain = chain;
const struct nft_rule_dp *rule, *last_rule;


const struct nft_expr *expr, *last;


struct nft_rule_blob *blob;


do_chain:
if (genbit)
blob = rcu_dereference(chain->blob_gen_1);
else
blob = rcu_dereference(chain->blob_gen_0);

rule = (struct nft_rule_dp *)blob->data;
last_rule = (void *)blob->data + blob->size;
next_rule:
regs.verdict.code = NFT_CONTINUE;
for (; rule < last_rule; rule = nft_rule_next(rule)) {
nft_rule_dp_for_each_expr(expr, last, rule) {


}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
struct nft_chain {
struct nft_rule_blob __rcu *blob_gen_0;
struct nft_rule_blob __rcu *blob_gen_1;


struct nft_rule_blob *blob_next;
};

struct nft_rule_blob {
unsigned long size;
unsigned char data[]
__attribute__((aligned(__alignof__(struct nft_rule_dp))));
};

struct nft_rule_dp {
u64 is_last:1,
dlen:12,
handle:42; /* for tracing */
unsigned char data[]
__attribute__((aligned(__alignof__(struct nft_expr))));
};

nft_rule_blob结构体的赋值在 nf_tables_commit_chain_prepare

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain)
{
const struct nft_expr *expr, *last;
struct nft_regs_track track = {};
unsigned int size, data_size;
void *data, *data_boundary;
struct nft_rule_dp *prule;
struct nft_rule *rule;

/*
* 1. 判断是否已经赋值
*/
if (chain->blob_next || !nft_is_active_next(net, chain))
return 0;

/*
* 2. 计算所需大小
*/
rule = list_entry(&chain->rules, struct nft_rule, list);

data_size = 0;
list_for_each_entry_continue(rule, &chain->rules, list) {
if (nft_is_active_next(net, rule)) {
data_size += sizeof(*prule) + rule->dlen;
if (data_size > INT_MAX)
return -ENOMEM;
}
}
data_size += offsetof(struct nft_rule_dp, data); /* last rule */

/*
* 3. 创建nft_rule_blob
*/
chain->blob_next = nf_tables_chain_alloc_rules(data_size);
if (!chain->blob_next)
return -ENOMEM;

data = (void *)chain->blob_next->data;
data_boundary = data + data_size;
size = 0;

/*
* 4. 将所有expr的内容复制到nft_rule_blob中
*/
list_for_each_entry_continue(rule, &chain->rules, list) {
if (!nft_is_active_next(net, rule))
continue;

prule = (struct nft_rule_dp *)data;
data += offsetof(struct nft_rule_dp, data);
if (WARN_ON_ONCE(data > data_boundary))
return -ENOMEM;

size = 0;
track.last = nft_expr_last(rule);
nft_rule_for_each_expr(expr, last, rule) {
track.cur = expr;

if (nft_expr_reduce(&track, expr)) {
expr = track.cur;
continue;
}

if (WARN_ON_ONCE(data + expr->ops->size > data_boundary))
return -ENOMEM;

memcpy(data + size, expr, expr->ops->size);
size += expr->ops->size;
}
if (WARN_ON_ONCE(size >= 1 << 12))
return -ENOMEM;

prule->handle = rule->handle;
prule->dlen = size;
prule->is_last = 0;

data += size;
size = 0;
chain->blob_next->size += (unsigned long)(data - (void *)prule);
}

/*
* 5. 标记islast
*/
prule = (struct nft_rule_dp *)data;
data += offsetof(struct nft_rule_dp, data);
if (WARN_ON_ONCE(data > data_boundary))
return -ENOMEM;

nft_last_rule(chain->blob_next, prule);

return 0;
}
  • nf_tables_commit_chain_prepare在nf_tables_commit中被调用

  • nf_tables_commit是nf_tables_subsys的commit成员

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    static const struct nfnetlink_subsystem nf_tables_subsys = {
    .name = "nf_tables",
    .subsys_id = NFNL_SUBSYS_NFTABLES,
    .cb_count = NFT_MSG_MAX,
    .cb = nf_tables_cb,
    .commit = nf_tables_commit,
    .abort = nf_tables_abort,
    .cleanup = nf_tables_cleanup,
    .valid_genid = nf_tables_valid_genid,
    .owner = THIS_MODULE,
    };
  • nfnetlink_rcv_batch中调用了commit,表示提交更改

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    } else if (status == NFNL_BATCH_DONE) {
    err = ss->commit(net, oskb);
    if (err == -EAGAIN) {
    status |= NFNL_BATCH_REPLAY;
    goto done;
    } else if (err) {
    ss->abort(net, oskb, NFNL_ABORT_NONE);
    netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL);
    }
    }

nft_do_chain

之前提到的注册的hook

1
2
3
4
5
6
7
.hooks		= {
[NF_INET_LOCAL_IN] = nft_do_chain_ipv4,
[NF_INET_LOCAL_OUT] = nft_do_chain_ipv4,
[NF_INET_FORWARD] = nft_do_chain_ipv4,
[NF_INET_PRE_ROUTING] = nft_do_chain_ipv4,
[NF_INET_POST_ROUTING] = nft_do_chain_ipv4,
},

最终都是调用nft_do_chain

1
2
3
4
5
6
7
8
9
10
11
static unsigned int nft_do_chain_ipv4(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct nft_pktinfo pkt;

nft_set_pktinfo(&pkt, skb, state);
nft_set_pktinfo_ipv4(&pkt);

return nft_do_chain(&pkt, priv);
}

regs和jumpstack相关数据结构

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
struct nft_jumpstack {
const struct nft_chain *chain;
const struct nft_rule_dp *rule;
const struct nft_rule_dp *last_rule;
};

struct nft_regs {
union {
u32 data[NFT_REG32_NUM];
struct nft_verdict verdict;
};
};

struct nft_verdict {
u32 code;
struct nft_chain *chain;
};

enum nft_registers {
NFT_REG_VERDICT,
NFT_REG_1,
NFT_REG_2,
NFT_REG_3,
NFT_REG_4,
__NFT_REG_MAX,

NFT_REG32_00 = 8,
NFT_REG32_01,
NFT_REG32_02,
NFT_REG32_03,
NFT_REG32_04,
NFT_REG32_05,
NFT_REG32_06,
NFT_REG32_07,
NFT_REG32_08,
NFT_REG32_09,
NFT_REG32_10,
NFT_REG32_11,
NFT_REG32_12,
NFT_REG32_13,
NFT_REG32_14,
NFT_REG32_15,
};
#define NFT_REG_MAX (__NFT_REG_MAX - 1)

#define NFT_REG_SIZE 16
#define NFT_REG32_SIZE 4
#define NFT_REG32_COUNT (NFT_REG32_15 - NFT_REG32_00 + 1)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
unsigned int
nft_do_chain(struct nft_pktinfo *pkt, void *priv)
{
const struct nft_chain *chain = priv, *basechain = chain;
const struct nft_rule_dp *rule, *last_rule;
const struct net *net = nft_net(pkt);
const struct nft_expr *expr, *last;
struct nft_regs regs;
unsigned int stackptr = 0;
struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE];
bool genbit = READ_ONCE(net->nft.gencursor);
struct nft_rule_blob *blob;
struct nft_traceinfo info;

/*
* 1. 获取rule
*/
info.trace = false;
if (static_branch_unlikely(&nft_trace_enabled))
nft_trace_init(&info, pkt, &regs.verdict, basechain);
do_chain:
if (genbit)
blob = rcu_dereference(chain->blob_gen_1);
else
blob = rcu_dereference(chain->blob_gen_0);

/*
* 2. 遍历rule中的所有expr并执行
*/
rule = (struct nft_rule_dp *)blob->data;
last_rule = (void *)blob->data + blob->size;
next_rule:
regs.verdict.code = NFT_CONTINUE;
for (; rule < last_rule; rule = nft_rule_next(rule)) {
nft_rule_dp_for_each_expr(expr, last, rule) {
if (expr->ops == &nft_cmp_fast_ops)
nft_cmp_fast_eval(expr, &regs);
else if (expr->ops == &nft_bitwise_fast_ops)
nft_bitwise_fast_eval(expr, &regs);
else if (expr->ops != &nft_payload_fast_ops ||
!nft_payload_fast_eval(expr, &regs, pkt))
expr_call_ops_eval(expr, &regs, pkt);

/*
* 3. 如果没有设置verdict则继续执行
*/
if (regs.verdict.code != NFT_CONTINUE)
break;
}

/*
* 4. 一条rule执行完成或者中途发生跳转,进行判决
*/
switch (regs.verdict.code) {
case NFT_BREAK: // rule中途设置了break,继续执行下一条rule
regs.verdict.code = NFT_CONTINUE;
continue;
case NFT_CONTINUE: // 一条rule执行完,继续执行下一条rule
nft_trace_packet(&info, chain, rule,
NFT_TRACETYPE_RULE);
continue;
}
break; // 否则执行完成,进行判决
}

nft_trace_verdict(&info, chain, rule, &regs);

/*
* 5. 没有跳转,返回链最终判决结果
*/
switch (regs.verdict.code & NF_VERDICT_MASK) {
case NF_ACCEPT:
case NF_DROP:
case NF_QUEUE:
case NF_STOLEN:
return regs.verdict.code;
}

/*
* 6. 存在跳转,处理跳转
*/
switch (regs.verdict.code) {
case NFT_JUMP: // jump:入栈返回地址
if (WARN_ON_ONCE(stackptr >= NFT_JUMP_STACK_SIZE))
return NF_DROP;
jumpstack[stackptr].chain = chain;
jumpstack[stackptr].rule = nft_rule_next(rule);
jumpstack[stackptr].last_rule = last_rule;
stackptr++;
fallthrough;
case NFT_GOTO: // goto:直接跳转
chain = regs.verdict.chain;
goto do_chain;
case NFT_CONTINUE: // 其他特殊处理
case NFT_RETURN:
break;
default:
WARN_ON_ONCE(1);
}

if (stackptr > 0) { // return:出栈返回地址
stackptr--;
chain = jumpstack[stackptr].chain;
rule = jumpstack[stackptr].rule;
last_rule = jumpstack[stackptr].last_rule;
goto next_rule;
}

nft_trace_packet(&info, basechain, NULL, NFT_TRACETYPE_POLICY);

if (static_branch_unlikely(&nft_counters_enabled))
nft_update_chain_stats(basechain, pkt);

/*
* 7. 如果没有到达明确的verdict,返回chain的policy(默认为accept或drop)
*/
return nft_base_chain(basechain)->policy;
}
EXPORT_SYMBOL_GPL(nft_do_chain);

nftables expression

通用函数

每个expression都会有一个init函数和一个eval函数

  • init函数负责初始化对应expr结构体
  • eval函数负责执行对应的动作,比如向某个寄存器写入数据
nft_regs内存布局
  • nftables最开始使用 16bytes verdict + 4 x 16bytes data reg
  • 后来使用 16bytes verdict + 16 x 4bytes data reg

使用 nft_parse_register 获取寄存器下标

1
2
3
4
5
6
7
8
9
10
11
12
static unsigned int nft_parse_register(const struct nlattr *attr)
{
unsigned int reg;

reg = ntohl(nla_get_be32(attr));
switch (reg) {
case NFT_REG_VERDICT...NFT_REG_4:
return reg * NFT_REG_SIZE / NFT_REG32_SIZE;
default:
return reg + NFT_REG_SIZE / NFT_REG32_SIZE - NFT_REG32_00;
}
}
  • NFT_REG 使用16bytes data reg

    • 枚举从0开始

    • 4个u32表示一个reg

      1
      2
      3
      4
      5
      6
      struct nft_regs {
      union {
      u32 data[NFT_REG32_NUM];
      struct nft_verdict verdict;
      };
      };

    所以乘四就行

  • NFT_REG32 使用4bytes data reg

    • 枚举从8开始
    • 一个u32表示一个reg
    • 第一个verdict占16bytes,4个u32

    所以减枚举基数再加verdict占的空间

nft_parse_register_xxx
  • nft_parse_register_load:解析源寄存器

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len)
    {
    u32 reg;
    int err;

    reg = nft_parse_register(attr);
    err = nft_validate_register_load(reg, len);
    if (err < 0)
    return err;

    *sreg = reg;
    return 0;
    }
    EXPORT_SYMBOL_GPL(nft_parse_register_load);
    • nft_parse_register 获取index

    • nft_validate_register_load 检验index合法性

      1
      2
      3
      4
      5
      6
      7
      8
      9
      10
      11
      static int nft_validate_register_load(enum nft_registers reg, unsigned int len)
      {
      if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE)
      return -EINVAL;
      if (len == 0)
      return -EINVAL;
      if (reg * NFT_REG32_SIZE + len > sizeof_field(struct nft_regs, data))
      return -ERANGE;

      return 0;
      }
      • 源寄存器不能是verdict
      • 读取长度不能为0
      • 读取范围不能超出nft_regs结构体
    • 将index赋值给expr

  • nft_parse_register_store:解析目标寄存器

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    int nft_parse_register_store(const struct nft_ctx *ctx,
    const struct nlattr *attr, u8 *dreg,
    const struct nft_data *data,
    enum nft_data_types type, unsigned int len)
    {
    int err;
    u32 reg;

    reg = nft_parse_register(attr);
    err = nft_validate_register_store(ctx, reg, data, type, len);
    if (err < 0)
    return err;

    *dreg = reg;
    return 0;
    }
    EXPORT_SYMBOL_GPL(nft_parse_register_store);

    同上👆

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    static int nft_validate_register_store(const struct nft_ctx *ctx,
    enum nft_registers reg,
    const struct nft_data *data,
    enum nft_data_types type,
    unsigned int len)
    {
    int err;

    switch (reg) {
    case NFT_REG_VERDICT:
    if (type != NFT_DATA_VERDICT)
    return -EINVAL;

    if (data != NULL &&
    (data->verdict.code == NFT_GOTO ||
    data->verdict.code == NFT_JUMP)) {
    err = nf_tables_check_loops(ctx, data->verdict.chain);
    if (err < 0)
    return err;
    }

    return 0;
    default:
    if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE)
    return -EINVAL;
    if (len == 0)
    return -EINVAL;
    if (reg * NFT_REG32_SIZE + len >
    sizeof_field(struct nft_regs, data))
    return -ERANGE;

    if (data != NULL && type != NFT_DATA_VALUE)
    return -EINVAL;
    return 0;
    }
    }
    • 目的寄存器为verdict
      • type要匹配
      • 写入code为两个跳转指令时需要调用 nf_tables_check_loops 判断是否产生了循环
    • index不能落在verdict的范围里
nft_data_init

这个函数用于解析data,初始化nft_data结构体,用于表示常量或者verdict

  • ctx:expression上下文
  • data:待初始化的data结构体
  • size:data的最大长度限制
  • desc:data的描述结构体
  • nla:待解析的data
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
int nft_data_init(const struct nft_ctx *ctx,
struct nft_data *data, unsigned int size,
struct nft_data_desc *desc, const struct nlattr *nla)
{
struct nlattr *tb[NFTA_DATA_MAX + 1];
int err;

/*
* 1. 解析nla中的data,并将各个部分指针放进指针数组tb
*/
err = nla_parse_nested_deprecated(tb, NFTA_DATA_MAX, nla,
nft_data_policy, NULL);
if (err < 0)
return err;

/*
* 2. 数据类型是普通的data,初始化nft_data结构体
*/
if (tb[NFTA_DATA_VALUE])
return nft_value_init(ctx, data, size, desc,
tb[NFTA_DATA_VALUE]);

/*
* 3. 数据类型是verdict,初始化verdict结构体
*/
if (tb[NFTA_DATA_VERDICT] && ctx != NULL)
return nft_verdict_init(ctx, data, desc, tb[NFTA_DATA_VERDICT]);
return -EINVAL;
}
EXPORT_SYMBOL_GPL(nft_data_init);
  • nft_value_init 数据复制,长度不能超过size的限制

    1
    2
    3
    4
    5
    6
    struct nft_data {
    union {
    u32 data[4];
    struct nft_verdict verdict;
    };
    } __attribute__((aligned(__alignof__(u64))));
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    static int nft_value_init(const struct nft_ctx *ctx,
    struct nft_data *data, unsigned int size,
    struct nft_data_desc *desc, const struct nlattr *nla)
    {
    unsigned int len;

    len = nla_len(nla);
    if (len == 0)
    return -EINVAL;
    if (len > size)
    return -EOVERFLOW;

    nla_memcpy(data->data, nla, len);
    desc->type = NFT_DATA_VALUE;
    desc->len = len;
    return 0;
    }
  • nft_verdict_init

    1
    2
    3
    4
    struct nft_verdict {
    u32 code;
    struct nft_chain *chain;
    };
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
    struct nft_data_desc *desc, const struct nlattr *nla)
    {
    u8 genmask = nft_genmask_next(ctx->net);
    struct nlattr *tb[NFTA_VERDICT_MAX + 1];
    struct nft_chain *chain;
    int err;

    /*
    * 1. 解析nla中的data,并将各个部分指针放进指针数组tb
    */
    err = nla_parse_nested_deprecated(tb, NFTA_VERDICT_MAX, nla,
    nft_verdict_policy, NULL);
    if (err < 0)
    return err;

    /*
    * 2. verdict.code的赋值
    */
    if (!tb[NFTA_VERDICT_CODE])
    return -EINVAL;
    data->verdict.code = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE]));

    /*
    * 3. 根据链名或id查找chain并赋值给verdict.chain
    */
    switch (data->verdict.code) {
    default:
    switch (data->verdict.code & NF_VERDICT_MASK) {
    case NF_ACCEPT:
    case NF_DROP:
    case NF_QUEUE:
    break;
    default:
    return -EINVAL;
    }
    fallthrough;
    case NFT_CONTINUE:
    case NFT_BREAK:
    case NFT_RETURN:
    break;
    case NFT_JUMP:
    case NFT_GOTO:
    if (tb[NFTA_VERDICT_CHAIN]) {
    chain = nft_chain_lookup(ctx->net, ctx->table,
    tb[NFTA_VERDICT_CHAIN],
    genmask);
    } else if (tb[NFTA_VERDICT_CHAIN_ID]) {
    chain = nft_chain_lookup_byid(ctx->net,
    tb[NFTA_VERDICT_CHAIN_ID]);
    if (IS_ERR(chain))
    return PTR_ERR(chain);
    } else {
    return -EINVAL;
    }

    if (IS_ERR(chain))
    return PTR_ERR(chain);
    if (nft_is_base_chain(chain))
    return -EOPNOTSUPP;

    chain->use++;
    data->verdict.chain = chain;
    break;
    }

    desc->len = sizeof(data->verdict);
    desc->type = NFT_DATA_VERDICT;
    return 0;
    }

nft_immediate_expr

  • 结构

    1
    2
    3
    4
    5
    struct nft_immediate_expr {
    struct nft_data data;
    u8 dreg;
    u8 dlen;
    };
    • data:要写入的常量
    • dreg:目标寄存器index
    • dlen:要写入的常量的长度,<=16(不能超过nft_data结构体的大小)
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    static int nft_immediate_init(const struct nft_ctx *ctx,
    const struct nft_expr *expr,
    const struct nlattr * const tb[])
    {
    struct nft_immediate_expr *priv = nft_expr_priv(expr);
    struct nft_data_desc desc;
    int err;

    if (tb[NFTA_IMMEDIATE_DREG] == NULL ||
    tb[NFTA_IMMEDIATE_DATA] == NULL)
    return -EINVAL;

    /*
    * 1. 要写入的data初始化,size限制len在nft_data结构体范围内
    */
    err = nft_data_init(ctx, &priv->data, sizeof(priv->data), &desc,
    tb[NFTA_IMMEDIATE_DATA]);
    if (err < 0)
    return err;

    priv->dlen = desc.len;

    /*
    * 2. 目的寄存器index解析
    */
    err = nft_parse_register_store(ctx, tb[NFTA_IMMEDIATE_DREG],
    &priv->dreg, &priv->data, desc.type,
    desc.len);
    if (err < 0)
    goto err1;

    /*
    * 3. 如果目的寄存器是verdict,且为跳转则需要判定目标链
    */
    if (priv->dreg == NFT_REG_VERDICT) {
    struct nft_chain *chain = priv->data.verdict.chain;

    switch (priv->data.verdict.code) {
    case NFT_JUMP:
    case NFT_GOTO:
    if (nft_chain_is_bound(chain)) {
    err = -EBUSY;
    goto err1;
    }
    chain->bound = true;
    break;
    default:
    break;
    }
    }

    return 0;

    err1:
    nft_data_release(&priv->data, desc.type);
    return err;
    }
  • 用途 往寄存器中写入最多16bytes的常量

    就是利用 nft_data_copy 函数直接进行数据复制

    1
    2
    3
    4
    5
    6
    7
    8
    void nft_immediate_eval(const struct nft_expr *expr,
    struct nft_regs *regs,
    const struct nft_pktinfo *pkt)
    {
    const struct nft_immediate_expr *priv = nft_expr_priv(expr);

    nft_data_copy(&regs->data[priv->dreg], &priv->data, priv->dlen);
    }

nft_cmp_expr

  • 结构

    这个结构有select函数

    • 数据长度不超过32位(4字节)&& 比较操作为相等 / 不相等时使用 nft_cmp_fast_ops
    • 否则使用 nft_cmp_ops
    1
    2
    3
    4
    5
    6
    struct nft_cmp_expr {
    struct nft_data data;
    u8 sreg;
    u8 len;
    enum nft_cmp_ops op:8;
    };
    • data:常量,调用nft_data_init初始化
    • sreg:源寄存器index,调用nft_parse_register_load获取
    • len:常量长度,限制在nft_data结构体范围内
    • op:哪种判断,大于?小于?…
    1
    2
    3
    4
    5
    6
    7
    struct nft_cmp_fast_expr {
    u32 data;
    u32 mask;
    u8 sreg;
    u8 len;
    bool inv;
    };
    • data:常量,调用nft_data_init初始化
    • mask:掩码
    • sreg:源寄存器index,调用nft_parse_register_load获取
    • len:常量长度,限制在nft_data结构体范围内
    • inv:相等 or 不相等
  • 用途 将寄存器值和常量进行比较

    计算时

    • nft_cmp_eval 就是使用memcmp并对结果进行判断,匹配则继续执行后面的expr,不匹配则将verdict设置为NFT_BREAK(跳过当前rule)
    • nft_cmp_fast_eval 直接使用等号判断,再判断和inv是否match

nft_bitwise_expr

  • 结构

    有select函数

    • 数据长度为32位(4字节)&& 位操作为 bool(掩码 + 异或)则使用 nft_bitwise_fast_ops
    • 否则使用 nft_bitwise_ops
    1
    2
    3
    4
    5
    6
    7
    8
    9
    struct nft_bitwise {
    u8 sreg;
    u8 dreg;
    enum nft_bitwise_ops op:8;
    u8 len;
    struct nft_data mask;
    struct nft_data xor;
    struct nft_data data;
    };
    • sreg:源寄存器index,nft_parse_register_load获取
    • dreg:目的寄存器index,nft_parse_register_store获取
    • op:bool(掩码 + 异或),左移或右移
    • len:操作数据长度,<0xff
    • mask:掩码
      • mask.len == len
      • type == NFT_DATA_VALUE
      • nft_data_init获取,nft_data结构体范围内
    • xor:同上👆
    • data:位移位数
      • type为NFT_DATA_VALUE
      • 32位
      • <32
    1
    2
    3
    4
    5
    6
    struct nft_bitwise_fast_expr {
    u32 mask;
    u32 xor;
    u8 sreg;
    u8 dreg;
    };
    • mask:掩码
    • xor:异或数
    • sreg:源寄存器index
    • dreg:目的寄存器index
  • 用途 位操作,异或,左移或右移

    计算时

    • nft_bitwise_eval
      • bool:以四字节为单位计算 (src & mask) ^ xor
      • lshift:data[0]表示位移位数
      • rshift:同上👆
    • nft_bitwise_fast_eval 直接异或

nft_payload_expr

1
2
3
4
5
6
/* Header bases a payload expression can address. */
enum nft_payload_bases {
NFT_PAYLOAD_LL_HEADER, // link-layer header, e.g. ethernet
NFT_PAYLOAD_NETWORK_HEADER, // network-layer header, e.g. IPv4 or IPv6
NFT_PAYLOAD_TRANSPORT_HEADER, // transport-layer header, e.g. UDP or TCP
NFT_PAYLOAD_INNER_HEADER, // inner header or payload
};
  • 结构

    有select函数

    • 写packet使用 nft_payload_set_ops

    • 读packet

      • 读网络层或者传输层
      • len <= 4
      • len为2的幂
      • offset基于len对齐

      使用 nft_payload_fast_ops

      否则使用 nft_payload_ops

    读写按是否有源 / 目的寄存器判断

    • nft_payload_set

      1
      2
      3
      4
      5
      6
      7
      8
      9
      /* Private data of the payload-write expression (register -> packet). */
      struct nft_payload_set {
      enum nft_payload_bases base:8; /* which header the offset is relative to */
      u8 offset; /* offset of the bytes to overwrite, relative to base */
      u8 len; /* number of bytes to overwrite */
      u8 sreg; /* source register holding the new data */
      u8 csum_type; /* checksum type, see enum nft_payload_csum_types */
      u8 csum_offset; /* offset of the checksum field, relative to base */
      u8 csum_flags; /* checksum flags, see enum nft_payload_csum_flags */
      };
      • base:要改的header类型,见 nft_payload_bases 👆

      • offset:要改的offset

      • len:要改的长度

      • sreg:data源寄存器

      • csum_type:checksum类型

        1
        2
        3
        4
        5
        /* Checksum kinds a payload write can be asked to fix up. */
        enum nft_payload_csum_types {
        NFT_PAYLOAD_CSUM_NONE, // no checksum
        NFT_PAYLOAD_CSUM_INET, // internet (IP-style) checksum
        NFT_PAYLOAD_CSUM_SCTP, // CRC-32c, as used by SCTP
        };
      • csum_offset:checksum在header中的偏移

      • csum_flags:checksum的flags,只有一个

        1
        2
        3
        /* Flag bits for csum_flags; bit 0 is the only one defined here. */
        enum nft_payload_csum_flags {
        NFT_PAYLOAD_L4CSUM_PSEUDOHDR = (1 << 0), /* L4 checksum pseudo-header flag */
        };

        有其他值报错

        1
        2
        /* Reject any flag bit other than NFT_PAYLOAD_L4CSUM_PSEUDOHDR. */
        if (flags & ~NFT_PAYLOAD_L4CSUM_PSEUDOHDR)
        return -EINVAL;

        表示L4(传输层,如TCP / UDP)checksum的计算包含伪首部,因此修改相关字段后需要同步更新L4 checksum

    • nft_payload

      1
      2
      3
      4
      5
      6
      /* Private data of the payload-read expression (packet -> register). */
      struct nft_payload {
      enum nft_payload_bases base:8; /* which header the offset is relative to */
      u8 offset; /* offset of the bytes to read, relative to base */
      u8 len; /* number of bytes to read */
      u8 dreg; /* destination register */
      };
      • base:要读的header类型
      • offset:要读的偏移
      • len:要读的长度
      • dreg:目的寄存器
  • 用途 读写packet的header

    • nft_payload_set_eval

      1
      2
      3
      4
      5
      6
      7
      8
      9
      10
      11
      12
      13
      14
      15
      16
      17
      18
      19
      20
      21
      22
      23
      24
      25
      26
      27
      28
      29
      30
      31
      32
      33
      34
      35
      36
      37
      38
      39
      40
      41
      42
      43
      44
      45
      46
      47
      48
      49
      50
      51
      52
      53
      54
      55
      56
      57
      58
      59
      60
      61
      62
      63
      64
      65
      66
      67
      68
      69
      70
      71
      72
      73
      74
      75
      76
      77
      78
      79
      80
      81
      82
      83
      84
      85
      86
      87
      88
      89
      90
      /*
      * nft_payload_set_eval - overwrite priv->len bytes of the packet with the
      * contents of register priv->sreg.
      *
      * Resolves the selected header base to a byte offset, fixes up the
      * requested checksums, then stores the new bytes. On any failure the
      * verdict code is set to NFT_BREAK.
      */
      static void nft_payload_set_eval(const struct nft_expr *expr,
      struct nft_regs *regs,
      const struct nft_pktinfo *pkt)
      {
      const struct nft_payload_set *priv = nft_expr_priv(expr);
      struct sk_buff *skb = pkt->skb;
      const u32 *src = &regs->data[priv->sreg];
      int offset, csum_offset;
      __wsum fsum, tsum;

      /*
      * 1. Resolve the offset of the selected header base
      */
      switch (priv->base) {
      case NFT_PAYLOAD_LL_HEADER:
      if (!skb_mac_header_was_set(skb))
      goto err;
      offset = skb_mac_header(skb) - skb->data;
      break;
      case NFT_PAYLOAD_NETWORK_HEADER:
      offset = skb_network_offset(skb);
      break;
      case NFT_PAYLOAD_TRANSPORT_HEADER:
      if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff)
      goto err;
      offset = nft_thoff(pkt);
      break;
      case NFT_PAYLOAD_INNER_HEADER:
      offset = nft_payload_inner_offset(pkt);
      if (offset < 0)
      goto err;
      break;
      default:
      WARN_ON_ONCE(1);
      goto err;
      }

      /*
      * 2. Derive the checksum field offset and the offset of the bytes to change
      * (both relative to the header base resolved above)
      */
      csum_offset = offset + priv->csum_offset;
      offset += priv->offset;

      /*
      * 3. Update checksums. This runs only when both of these hold:
      * * an inet checksum (NFT_PAYLOAD_CSUM_INET) or any csum_flags bit is requested
      * * the base is neither the transport nor the inner header, or
      * skb->ip_summed is not CHECKSUM_PARTIAL
      */
      if ((priv->csum_type == NFT_PAYLOAD_CSUM_INET || priv->csum_flags) &&
      ((priv->base != NFT_PAYLOAD_TRANSPORT_HEADER &&
      priv->base != NFT_PAYLOAD_INNER_HEADER) ||
      skb->ip_summed != CHECKSUM_PARTIAL)) {

      fsum = skb_checksum(skb, offset, priv->len, 0); // checksum of the bytes about to be overwritten
      tsum = csum_partial(src, priv->len, 0); // checksum of the replacement bytes

      // inet checksum requested: update the checksum located at csum_offset
      if (priv->csum_type == NFT_PAYLOAD_CSUM_INET &&
      nft_payload_csum_inet(skb, src, fsum, tsum, csum_offset))
      goto err;

      // csum_flags set: update the L4 checksum as well
      if (priv->csum_flags &&
      nft_payload_l4csum_update(pkt, skb, fsum, tsum) < 0)
      goto err;
      }

      /*
      * 4. Make the range writable and store the new bytes into the packet
      */
      if (skb_ensure_writable(skb, max(offset + priv->len, 0)) ||
      skb_store_bits(skb, offset, src, priv->len) < 0)
      goto err;

      /*
      * 5. Recompute the SCTP checksum (done after the store, unlike step 3)
      */
      if (priv->csum_type == NFT_PAYLOAD_CSUM_SCTP &&
      pkt->tprot == IPPROTO_SCTP &&
      skb->ip_summed != CHECKSUM_PARTIAL) { // skipped for CHECKSUM_PARTIAL (presumably the checksum is filled in later - confirm)
      if (pkt->fragoff == 0 &&
      nft_payload_csum_sctp(skb, nft_thoff(pkt)))
      goto err;
      }

      return;
      err:
      regs->verdict.code = NFT_BREAK;
      }
    • nft_payload_eval

      • 获取绝对offset
      • 读进目标寄存器
      • 如果出错break
    • nft_payload_fast_eval

      • 获取读取的指针ptr
      • 判断ptr有没有越界
      • 直接解引用指针读取,因为fast路径的len都是2的幂(且offset按len对齐)

TODO

  • rcu是什么

nft源码分析
http://akaieurus.github.io/2024/02/08/nft源码分析/
作者
Eurus
发布于
2024年2月8日
许可协议