net: Fix skb consume leak in sch_handle_egress

jira LE-1907
Rebuild_History Non-Buildable kernel-5.14.0-427.18.1.el9_4
commit-author Daniel Borkmann <daniel@iogearbox.net>
commit 28d18b673ffa2d13112ddb6e4c32c60d9b0cda50
Empty-Commit: Cherry-Pick Conflicts during history rebuild.
Will be included in final tarball splat. Ref for failed cherry-pick at:
ciq/ciq_backports/kernel-5.14.0-427.18.1.el9_4/28d18b67.failed

Fix a memory leak for the tc egress path with TC_ACT_{STOLEN,QUEUED,TRAP}:

  [...]
  unreferenced object 0xffff88818bcb4f00 (size 232):
    comm "softirq", pid 0, jiffies 4299085078 (age 134.028s)
    hex dump (first 32 bytes):
      00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
      00 80 70 61 81 88 ff ff 00 41 31 14 81 88 ff ff  ..pa.....A1.....
    backtrace:
      [<ffffffff9991b938>] kmem_cache_alloc_node+0x268/0x400
      [<ffffffff9b3d9231>] __alloc_skb+0x211/0x2c0
      [<ffffffff9b3f0c7e>] alloc_skb_with_frags+0xbe/0x6b0
      [<ffffffff9b3bf9a9>] sock_alloc_send_pskb+0x6a9/0x870
      [<ffffffff9b6b3f00>] __ip_append_data+0x14d0/0x3bf0
      [<ffffffff9b6ba24e>] ip_append_data+0xee/0x190
      [<ffffffff9b7e1496>] icmp_push_reply+0xa6/0x470
      [<ffffffff9b7e4030>] icmp_reply+0x900/0xa00
      [<ffffffff9b7e42e3>] icmp_echo.part.0+0x1a3/0x230
      [<ffffffff9b7e444d>] icmp_echo+0xcd/0x190
      [<ffffffff9b7e9566>] icmp_rcv+0x806/0xe10
      [<ffffffff9b699bd1>] ip_protocol_deliver_rcu+0x351/0x3d0
      [<ffffffff9b699f14>] ip_local_deliver_finish+0x2b4/0x450
      [<ffffffff9b69a234>] ip_local_deliver+0x174/0x1f0
      [<ffffffff9b69a4b2>] ip_sublist_rcv_finish+0x1f2/0x420
      [<ffffffff9b69ab56>] ip_sublist_rcv+0x466/0x920
  [...]

I was able to reproduce this via:

  ip link add dev dummy0 type dummy
  ip link set dev dummy0 up
  tc qdisc add dev eth0 clsact
  tc filter add dev eth0 egress protocol ip prio 1 u32 match ip protocol 1 0xff action mirred egress redirect dev dummy0
  ping 1.1.1.1
  <stolen>

After the fix, there are no kmemleak reports with the reproducer. This is
in line with what is also done on the ingress side, and from debugging the
skb_unref(skb) on dummy xmit and sch_handle_egress() side, it is visible
that these are two different skbs with both skb_unref(skb) as true. The two
seen skbs are due to mirred doing a skb_clone() internally as use_reinsert
is false in tcf_mirred_act() for egress. This was initially reported by Gal.

Fixes: e420bed02507 ("bpf: Add fd-based tcx multi-prog infra with link support")
	Reported-by: Gal Pressman <gal@nvidia.com>
	Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/bdfc2640-8f65-5b56-4472-db8e2b161aab@nvidia.com
	Reviewed-by: Simon Horman <horms@kernel.org>
	Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit 28d18b673ffa2d13112ddb6e4c32c60d9b0cda50)
	Signed-off-by: Jonathan Maple <jmaple@ciq.com>

# Conflicts:
#	net/core/dev.c
diff --cc net/core/dev.c
index 6439f9b64980,9f6ed6d97f89..000000000000
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@@ -3897,6 -3909,180 +3897,183 @@@ void netdev_xmit_skip_txqueue(bool skip
  EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
  #endif /* CONFIG_NET_EGRESS */
  
++<<<<<<< HEAD
++=======
+ #ifdef CONFIG_NET_XGRESS
+ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
+ {
+ 	int ret = TC_ACT_UNSPEC;
+ #ifdef CONFIG_NET_CLS_ACT
+ 	struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
+ 	struct tcf_result res;
+ 
+ 	if (!miniq)
+ 		return ret;
+ 
+ 	tc_skb_cb(skb)->mru = 0;
+ 	tc_skb_cb(skb)->post_ct = false;
+ 
+ 	mini_qdisc_bstats_cpu_update(miniq, skb);
+ 	ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
+ 	/* Only tcf related quirks below. */
+ 	switch (ret) {
+ 	case TC_ACT_SHOT:
+ 		mini_qdisc_qstats_cpu_drop(miniq);
+ 		break;
+ 	case TC_ACT_OK:
+ 	case TC_ACT_RECLASSIFY:
+ 		skb->tc_index = TC_H_MIN(res.classid);
+ 		break;
+ 	}
+ #endif /* CONFIG_NET_CLS_ACT */
+ 	return ret;
+ }
+ 
+ static DEFINE_STATIC_KEY_FALSE(tcx_needed_key);
+ 
+ void tcx_inc(void)
+ {
+ 	static_branch_inc(&tcx_needed_key);
+ }
+ 
+ void tcx_dec(void)
+ {
+ 	static_branch_dec(&tcx_needed_key);
+ }
+ 
+ static __always_inline enum tcx_action_base
+ tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
+ 	const bool needs_mac)
+ {
+ 	const struct bpf_mprog_fp *fp;
+ 	const struct bpf_prog *prog;
+ 	int ret = TCX_NEXT;
+ 
+ 	if (needs_mac)
+ 		__skb_push(skb, skb->mac_len);
+ 	bpf_mprog_foreach_prog(entry, fp, prog) {
+ 		bpf_compute_data_pointers(skb);
+ 		ret = bpf_prog_run(prog, skb);
+ 		if (ret != TCX_NEXT)
+ 			break;
+ 	}
+ 	if (needs_mac)
+ 		__skb_pull(skb, skb->mac_len);
+ 	return tcx_action_code(skb, ret);
+ }
+ 
+ static __always_inline struct sk_buff *
+ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
+ 		   struct net_device *orig_dev, bool *another)
+ {
+ 	struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
+ 	int sch_ret;
+ 
+ 	if (!entry)
+ 		return skb;
+ 	if (*pt_prev) {
+ 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
+ 		*pt_prev = NULL;
+ 	}
+ 
+ 	qdisc_skb_cb(skb)->pkt_len = skb->len;
+ 	tcx_set_ingress(skb, true);
+ 
+ 	if (static_branch_unlikely(&tcx_needed_key)) {
+ 		sch_ret = tcx_run(entry, skb, true);
+ 		if (sch_ret != TC_ACT_UNSPEC)
+ 			goto ingress_verdict;
+ 	}
+ 	sch_ret = tc_run(tcx_entry(entry), skb);
+ ingress_verdict:
+ 	switch (sch_ret) {
+ 	case TC_ACT_REDIRECT:
+ 		/* skb_mac_header check was done by BPF, so we can safely
+ 		 * push the L2 header back before redirecting to another
+ 		 * netdev.
+ 		 */
+ 		__skb_push(skb, skb->mac_len);
+ 		if (skb_do_redirect(skb) == -EAGAIN) {
+ 			__skb_pull(skb, skb->mac_len);
+ 			*another = true;
+ 			break;
+ 		}
+ 		*ret = NET_RX_SUCCESS;
+ 		return NULL;
+ 	case TC_ACT_SHOT:
+ 		kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
+ 		*ret = NET_RX_DROP;
+ 		return NULL;
+ 	/* used by tc_run */
+ 	case TC_ACT_STOLEN:
+ 	case TC_ACT_QUEUED:
+ 	case TC_ACT_TRAP:
+ 		consume_skb(skb);
+ 		fallthrough;
+ 	case TC_ACT_CONSUMED:
+ 		*ret = NET_RX_SUCCESS;
+ 		return NULL;
+ 	}
+ 
+ 	return skb;
+ }
+ 
+ static __always_inline struct sk_buff *
+ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
+ {
+ 	struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
+ 	int sch_ret;
+ 
+ 	if (!entry)
+ 		return skb;
+ 
+ 	/* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was
+ 	 * already set by the caller.
+ 	 */
+ 	if (static_branch_unlikely(&tcx_needed_key)) {
+ 		sch_ret = tcx_run(entry, skb, false);
+ 		if (sch_ret != TC_ACT_UNSPEC)
+ 			goto egress_verdict;
+ 	}
+ 	sch_ret = tc_run(tcx_entry(entry), skb);
+ egress_verdict:
+ 	switch (sch_ret) {
+ 	case TC_ACT_REDIRECT:
+ 		/* No need to push/pop skb's mac_header here on egress! */
+ 		skb_do_redirect(skb);
+ 		*ret = NET_XMIT_SUCCESS;
+ 		return NULL;
+ 	case TC_ACT_SHOT:
+ 		kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
+ 		*ret = NET_XMIT_DROP;
+ 		return NULL;
+ 	/* used by tc_run */
+ 	case TC_ACT_STOLEN:
+ 	case TC_ACT_QUEUED:
+ 	case TC_ACT_TRAP:
+ 		consume_skb(skb);
+ 		*ret = NET_XMIT_SUCCESS;
+ 		return NULL;
+ 	}
+ 
+ 	return skb;
+ }
+ #else
+ static __always_inline struct sk_buff *
+ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
+ 		   struct net_device *orig_dev, bool *another)
+ {
+ 	return skb;
+ }
+ 
+ static __always_inline struct sk_buff *
+ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
+ {
+ 	return skb;
+ }
+ #endif /* CONFIG_NET_XGRESS */
+ 
++>>>>>>> 28d18b673ffa (net: Fix skb consume leak in sch_handle_egress)
  #ifdef CONFIG_XPS
  static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
  			       struct xps_dev_maps *dev_maps, unsigned int tci)
* Unmerged path net/core/dev.c
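
For reference: the conflicted hunk above pulls in the whole tcx-based block from the
upstream tree, but the upstream fix itself is effectively one line, adding a
consume_skb() to the TC_ACT_STOLEN/QUEUED/TRAP verdicts of sch_handle_egress() so the
egress path frees the skb the same way sch_handle_ingress() already does. The HEAD side
of the conflict is empty, so this tree does not appear to carry the tcx-based
sch_handle_egress() being patched, consistent with the Empty-Commit note above. A
minimal sketch of the relevant verdict handling after the fix (paraphrased from the
incoming side above, not a literal hunk against this tree):

	switch (sch_ret) {
	/* used by tc_run */
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
	case TC_ACT_TRAP:
		consume_skb(skb);	/* the added line: free the skb left with
					 * us on egress (mirred redirects a clone,
					 * see use_reinsert in tcf_mirred_act())
					 * instead of leaking it
					 */
		*ret = NET_XMIT_SUCCESS;
		return NULL;
	}

consume_skb() is used rather than kfree_skb() here because the packet was consumed by
the action rather than dropped, so no drop reason/tracepoint should be emitted.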