diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 11b9525dff27..69ef9cce5858 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1072,12 +1072,14 @@ static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev, #if IS_ENABLED(CONFIG_IPV6) /* send to link-local or multicast address via interface enslaved to * VRF device. Force lookup to VRF table without changing flow struct + * Note: Caller to this function must hold rcu_read_lock() and no refcnt + * is taken on the dst by this function. */ static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, struct flowi6 *fl6) { struct net *net = dev_net(dev); - int flags = RT6_LOOKUP_F_IFACE; + int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_DST_NOREF; struct dst_entry *dst = NULL; struct rt6_info *rt; @@ -1087,7 +1089,6 @@ static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, */ if (fl6->flowi6_oif == dev->ifindex) { dst = &net->ipv6.ip6_null_entry->dst; - dst_hold(dst); return dst; } diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 7375a165fd98..89ad7917b98d 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -36,6 +36,7 @@ struct route_info { #define RT6_LOOKUP_F_SRCPREF_PUBLIC 0x00000010 #define RT6_LOOKUP_F_SRCPREF_COA 0x00000020 #define RT6_LOOKUP_F_IGNORE_LINKSTATE 0x00000040 +#define RT6_LOOKUP_F_DST_NOREF 0x00000080 /* We do not (yet ?) support IPv6 jumbograms (RFC 2675) * Unlike IPv4, hdr->seg_len doesn't include the IPv6 header @@ -83,6 +84,10 @@ struct dst_entry *ip6_route_input_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags); +struct dst_entry *ip6_route_output_flags_noref(struct net *net, + const struct sock *sk, + struct flowi6 *fl6, int flags); + struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags); @@ -93,6 +98,16 @@ static inline struct dst_entry *ip6_route_output(struct net *net, return ip6_route_output_flags(net, sk, fl6, 0); } +/* Only conditionally release dst if flags indicates + * !RT6_LOOKUP_F_DST_NOREF or dst is in uncached_list. + */ +static inline void ip6_rt_put_flags(struct rt6_info *rt, int flags) +{ + if (!(flags & RT6_LOOKUP_F_DST_NOREF) || + !list_empty(&rt->rt6i_uncached)) + ip6_rt_put(rt); +} + struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags); struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index bcfae13409b5..d22b6c140f23 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -113,14 +113,15 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, skb, flags); if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN) return &rt->dst; - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error != -EAGAIN) return &rt->dst; - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); } - dst_hold(&net->ipv6.ip6_null_entry->dst); + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) + dst_hold(&net->ipv6.ip6_null_entry->dst); return &net->ipv6.ip6_null_entry->dst; } @@ -237,13 +238,14 @@ static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp, goto out; } again: - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); err = -EAGAIN; rt = NULL; goto out; discard_pkt: - dst_hold(&rt->dst); + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) + dst_hold(&rt->dst); out: res->rt6 = rt; return err; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 1d16a01eccf5..5b1c9b5b9247 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -316,9 +316,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error == -EAGAIN) { - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); rt = net->ipv6.ip6_null_entry; - dst_hold(&rt->dst); + if (!(flags | RT6_LOOKUP_F_DST_NOREF)) + dst_hold(&rt->dst); } return &rt->dst; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 4c5142a30808..3975ae8e2440 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1391,9 +1391,6 @@ static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res) pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu); - if (pcpu_rt) - ip6_hold_safe(NULL, &pcpu_rt); - return pcpu_rt; } @@ -1403,12 +1400,9 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net, struct rt6_info *pcpu_rt, *prev, **p; pcpu_rt = ip6_rt_pcpu_alloc(res); - if (!pcpu_rt) { - dst_hold(&net->ipv6.ip6_null_entry->dst); - return net->ipv6.ip6_null_entry; - } + if (!pcpu_rt) + return NULL; - dst_hold(&pcpu_rt->dst); p = this_cpu_ptr(res->nh->rt6i_pcpu); prev = cmpxchg(p, NULL, pcpu_rt); BUG_ON(prev); @@ -2189,9 +2183,12 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, const struct sk_buff *skb, int flags) { struct fib6_result res = {}; - struct rt6_info *rt; + struct rt6_info *rt = NULL; int strict = 0; + WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) && + !rcu_read_lock_held()); + strict |= flags & RT6_LOOKUP_F_IFACE; strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; if (net->ipv6.devconf_all->forwarding == 0) @@ -2200,23 +2197,15 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, rcu_read_lock(); fib6_table_lookup(net, table, oif, fl6, &res, strict); - if (res.f6i == net->ipv6.fib6_null_entry) { - rt = net->ipv6.ip6_null_entry; - rcu_read_unlock(); - dst_hold(&rt->dst); - return rt; - } + if (res.f6i == net->ipv6.fib6_null_entry) + goto out; fib6_select_path(net, &res, fl6, oif, false, skb, strict); /*Search through exception table */ rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); if (rt) { - if (ip6_hold_safe(net, &rt)) - dst_use_noref(&rt->dst, jiffies); - - rcu_read_unlock(); - return rt; + goto out; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && !res.nh->fib_nh_gw_family)) { /* Create a RTF_CACHE clone which will not be @@ -2224,40 +2213,38 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, * the daddr in the skb during the neighbor look-up is different * from the fl6->daddr used to look-up route here. */ - struct rt6_info *uncached_rt; + rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); - uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); - - rcu_read_unlock(); - - if (uncached_rt) { - /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc() - * No need for another dst_hold() + if (rt) { + /* 1 refcnt is taken during ip6_rt_cache_alloc(). + * As rt6_uncached_list_add() does not consume refcnt, + * this refcnt is always returned to the caller even + * if caller sets RT6_LOOKUP_F_DST_NOREF flag. */ - rt6_uncached_list_add(uncached_rt); + rt6_uncached_list_add(rt); atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); - } else { - uncached_rt = net->ipv6.ip6_null_entry; - dst_hold(&uncached_rt->dst); - } + rcu_read_unlock(); - return uncached_rt; + return rt; + } } else { /* Get a percpu copy */ - - struct rt6_info *pcpu_rt; - local_bh_disable(); - pcpu_rt = rt6_get_pcpu_route(&res); + rt = rt6_get_pcpu_route(&res); - if (!pcpu_rt) - pcpu_rt = rt6_make_pcpu_route(net, &res); + if (!rt) + rt = rt6_make_pcpu_route(net, &res); local_bh_enable(); - rcu_read_unlock(); - - return pcpu_rt; } +out: + if (!rt) + rt = net->ipv6.ip6_null_entry; + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) + ip6_hold_safe(net, &rt); + rcu_read_unlock(); + + return rt; } EXPORT_SYMBOL_GPL(ip6_pol_route); @@ -2388,11 +2375,12 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, return mhash >> 1; } +/* Called with rcu held */ void ip6_route_input(struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct net *net = dev_net(skb->dev); - int flags = RT6_LOOKUP_F_HAS_SADDR; + int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF; struct ip_tunnel_info *tun_info; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, @@ -2414,8 +2402,8 @@ void ip6_route_input(struct sk_buff *skb) if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); skb_dst_drop(skb); - skb_dst_set(skb, - ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); + skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev, + &fl6, skb, flags)); } static struct rt6_info *ip6_pol_route_output(struct net *net, @@ -2427,8 +2415,9 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); } -struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, - struct flowi6 *fl6, int flags) +struct dst_entry *ip6_route_output_flags_noref(struct net *net, + const struct sock *sk, + struct flowi6 *fl6, int flags) { bool any_src; @@ -2436,6 +2425,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) { struct dst_entry *dst; + /* This function does not take refcnt on the dst */ dst = l3mdev_link_scope_lookup(net, fl6); if (dst) return dst; @@ -2443,6 +2433,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, fl6->flowi6_iif = LOOPBACK_IFINDEX; + flags |= RT6_LOOKUP_F_DST_NOREF; any_src = ipv6_addr_any(&fl6->saddr); if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || (fl6->flowi6_oif && any_src)) @@ -2455,6 +2446,28 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); } +EXPORT_SYMBOL_GPL(ip6_route_output_flags_noref); + +struct dst_entry *ip6_route_output_flags(struct net *net, + const struct sock *sk, + struct flowi6 *fl6, + int flags) +{ + struct dst_entry *dst; + struct rt6_info *rt6; + + rcu_read_lock(); + dst = ip6_route_output_flags_noref(net, sk, fl6, flags); + rt6 = (struct rt6_info *)dst; + /* For dst cached in uncached_list, refcnt is already taken. */ + if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) { + dst = &net->ipv6.ip6_null_entry->dst; + dst_hold(dst); + } + rcu_read_unlock(); + + return dst; +} EXPORT_SYMBOL_GPL(ip6_route_output_flags); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) @@ -6029,6 +6042,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_null_entry->dst, ip6_template_metrics, true); + INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->rt6i_uncached); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_has_custom_rules = false; @@ -6040,6 +6054,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, ip6_template_metrics, true); + INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->rt6i_uncached); net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, sizeof(*net->ipv6.ip6_blk_hole_entry), @@ -6049,6 +6064,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, ip6_template_metrics, true); + INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached); #endif net->ipv6.sysctl.flush_delay = 0; diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index cfc9fcb97465..f35899d45a9a 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -118,6 +118,8 @@ EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index); * local and multicast addresses * @net: network namespace for device index lookup * @fl6: IPv6 flow struct for lookup + * This function does not hold refcnt on the returned dst. + * Caller must hold rcu_read_lock(). */ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, @@ -126,9 +128,8 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct dst_entry *dst = NULL; struct net_device *dev; + WARN_ON_ONCE(!rcu_read_lock_held()); if (fl6->flowi6_oif) { - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); if (dev && netif_is_l3_slave(dev)) dev = netdev_master_upper_dev_get_rcu(dev); @@ -136,8 +137,6 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, if (dev && netif_is_l3_master(dev) && dev->l3mdev_ops->l3mdev_link_scope_lookup) dst = dev->l3mdev_ops->l3mdev_link_scope_lookup(dev, fl6); - - rcu_read_unlock(); } return dst;