Merge tag 'ASB-2024-01-05_11-5.4' of https://android.googlesource.com/kernel/common into android13-5.4-lahaina

https://source.android.com/docs/security/bulletin/2024-01-01

* tag 'ASB-2024-01-05_11-5.4' of https://android.googlesource.com/kernel/common:
  UPSTREAM: ipv4: igmp: fix refcnt uaf issue when receiving igmp query packet
  ANDROID: Snapshot Mainline's version of checkpatch.pl
  UPSTREAM: nvmet-tcp: Fix a possible UAF in queue intialization setup
  UPSTREAM: nvmet-tcp: move send/recv error handling in the send/recv methods instead of call-sites
  UPSTREAM: netfilter: nf_tables: remove busy mark and gc batch API
  UPSTREAM: netfilter: nft_set_hash: mark set element as dead when deleting from packet path
  UPSTREAM: netfilter: nf_tables: adapt set backend to use GC transaction API
  UPSTREAM: netfilter: nf_tables: GC transaction API to avoid race with control plane
  UPSTREAM: netfilter: nft_set_rbtree: fix overlap expiration walk
  UPSTREAM: netfilter: nft_set_rbtree: fix null deref on element insertion
  UPSTREAM: netfilter: nft_set_rbtree: Switch to node list walk for overlap detection
  UPSTREAM: netfilter: nf_tables: drop map element references from preparation phase
  UPSTREAM: netfilter: nftables: rename set element data activation/deactivation functions

 Conflicts:
	scripts/checkpatch.pl

Change-Id: I2d2a3adcb627da605dd3a6a40c724bd7fb8115c5
Bruno Martins 2024-01-09 14:50:44 +00:00
commit b976c2c438
8 changed files with 1866 additions and 559 deletions

File: drivers/nvme/target/tcp.c

@ -321,6 +321,15 @@ static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue)
kernel_sock_shutdown(queue->sock, SHUT_RDWR);
}
static void nvmet_tcp_socket_error(struct nvmet_tcp_queue *queue, int status)
{
queue->rcv_state = NVMET_TCP_RECV_ERR;
if (status == -EPIPE || status == -ECONNRESET)
kernel_sock_shutdown(queue->sock, SHUT_RDWR);
else
nvmet_tcp_fatal_error(queue);
}
static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
{
struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl;
@ -714,11 +723,15 @@ static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue,
for (i = 0; i < budget; i++) {
ret = nvmet_tcp_try_send_one(queue, i == budget - 1);
if (ret <= 0)
if (unlikely(ret < 0)) {
nvmet_tcp_socket_error(queue, ret);
goto done;
} else if (ret == 0) {
break;
}
(*sends)++;
}
done:
return ret;
}
@ -816,15 +829,11 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
iov.iov_len = sizeof(*icresp);
ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
if (ret < 0)
goto free_crypto;
return ret; /* queue removal will cleanup */
queue->state = NVMET_TCP_Q_LIVE;
nvmet_prepare_receive_pdu(queue);
return 0;
free_crypto:
if (queue->hdr_digest || queue->data_digest)
nvmet_tcp_free_crypto(queue);
return ret;
}
static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue,
@ -1167,11 +1176,15 @@ static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue,
for (i = 0; i < budget; i++) {
ret = nvmet_tcp_try_recv_one(queue);
if (ret <= 0)
if (unlikely(ret < 0)) {
nvmet_tcp_socket_error(queue, ret);
goto done;
} else if (ret == 0) {
break;
}
(*recvs)++;
}
done:
return ret;
}
@ -1196,27 +1209,16 @@ static void nvmet_tcp_io_work(struct work_struct *w)
pending = false;
ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops);
if (ret > 0) {
if (ret > 0)
pending = true;
} else if (ret < 0) {
if (ret == -EPIPE || ret == -ECONNRESET)
kernel_sock_shutdown(queue->sock, SHUT_RDWR);
else
nvmet_tcp_fatal_error(queue);
else if (ret < 0)
return;
}
ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops);
if (ret > 0) {
/* transmitted message/data */
if (ret > 0)
pending = true;
} else if (ret < 0) {
if (ret == -EPIPE || ret == -ECONNRESET)
kernel_sock_shutdown(queue->sock, SHUT_RDWR);
else
nvmet_tcp_fatal_error(queue);
else if (ret < 0)
return;
}
} while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);

File: include/net/netfilter/nf_tables.h

@ -371,7 +371,8 @@ struct nft_set_ops {
int (*init)(const struct nft_set *set,
const struct nft_set_desc *desc,
const struct nlattr * const nla[]);
void (*destroy)(const struct nft_set *set);
void (*destroy)(const struct nft_ctx *ctx,
const struct nft_set *set);
void (*gc_init)(const struct nft_set *set);
unsigned int elemsize;
@ -401,6 +402,7 @@ void nft_unregister_set(struct nft_set_type *type);
*
* @list: table set list node
* @bindings: list of set bindings
* @refs: internal refcounting for async set destruction
* @table: table this set belongs to
* @net: netnamespace this set belongs to
* @name: name of the set
@ -427,6 +429,7 @@ void nft_unregister_set(struct nft_set_type *type);
struct nft_set {
struct list_head list;
struct list_head bindings;
refcount_t refs;
struct nft_table *table;
possible_net_t net;
char *name;
@ -445,7 +448,8 @@ struct nft_set {
unsigned char *udata;
/* runtime data below here */
const struct nft_set_ops *ops ____cacheline_aligned;
u16 flags:14,
u16 flags:13,
dead:1,
genmask:2;
u8 klen;
u8 dlen;
@ -665,62 +669,8 @@ void *nft_set_elem_init(const struct nft_set *set,
u64 timeout, u64 expiration, gfp_t gfp);
void nft_set_elem_destroy(const struct nft_set *set, void *elem,
bool destroy_expr);
/**
* struct nft_set_gc_batch_head - nf_tables set garbage collection batch
*
* @rcu: rcu head
* @set: set the elements belong to
* @cnt: count of elements
*/
struct nft_set_gc_batch_head {
struct rcu_head rcu;
const struct nft_set *set;
unsigned int cnt;
};
#define NFT_SET_GC_BATCH_SIZE ((PAGE_SIZE - \
sizeof(struct nft_set_gc_batch_head)) / \
sizeof(void *))
/**
* struct nft_set_gc_batch - nf_tables set garbage collection batch
*
* @head: GC batch head
* @elems: garbage collection elements
*/
struct nft_set_gc_batch {
struct nft_set_gc_batch_head head;
void *elems[NFT_SET_GC_BATCH_SIZE];
};
struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
gfp_t gfp);
void nft_set_gc_batch_release(struct rcu_head *rcu);
static inline void nft_set_gc_batch_complete(struct nft_set_gc_batch *gcb)
{
if (gcb != NULL)
call_rcu(&gcb->head.rcu, nft_set_gc_batch_release);
}
static inline struct nft_set_gc_batch *
nft_set_gc_batch_check(const struct nft_set *set, struct nft_set_gc_batch *gcb,
gfp_t gfp)
{
if (gcb != NULL) {
if (gcb->head.cnt + 1 < ARRAY_SIZE(gcb->elems))
return gcb;
nft_set_gc_batch_complete(gcb);
}
return nft_set_gc_batch_alloc(set, gfp);
}
static inline void nft_set_gc_batch_add(struct nft_set_gc_batch *gcb,
void *elem)
{
gcb->elems[gcb->head.cnt++] = elem;
}
void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
const struct nft_set *set, void *elem);
struct nft_expr_ops;
/**
@ -1348,39 +1298,30 @@ static inline void nft_set_elem_change_active(const struct net *net,
#endif /* IS_ENABLED(CONFIG_NF_TABLES) */
/*
* We use a free bit in the genmask field to indicate the element
* is busy, meaning it is currently being processed either by
* the netlink API or GC.
*
* Even though the genmask is only a single byte wide, this works
* because the extension structure if fully constant once initialized,
* so there are no non-atomic write accesses unless it is already
* marked busy.
*/
#define NFT_SET_ELEM_BUSY_MASK (1 << 2)
#define NFT_SET_ELEM_DEAD_MASK (1 << 2)
#if defined(__LITTLE_ENDIAN_BITFIELD)
#define NFT_SET_ELEM_BUSY_BIT 2
#define NFT_SET_ELEM_DEAD_BIT 2
#elif defined(__BIG_ENDIAN_BITFIELD)
#define NFT_SET_ELEM_BUSY_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2)
#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2)
#else
#error
#endif
static inline int nft_set_elem_mark_busy(struct nft_set_ext *ext)
static inline void nft_set_elem_dead(struct nft_set_ext *ext)
{
unsigned long *word = (unsigned long *)ext;
BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0);
return test_and_set_bit(NFT_SET_ELEM_BUSY_BIT, word);
set_bit(NFT_SET_ELEM_DEAD_BIT, word);
}
static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext)
static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext)
{
unsigned long *word = (unsigned long *)ext;
clear_bit(NFT_SET_ELEM_BUSY_BIT, word);
BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0);
return test_bit(NFT_SET_ELEM_DEAD_BIT, word);
}
/**
@ -1487,6 +1428,35 @@ struct nft_trans_flowtable {
#define nft_trans_flowtable(trans) \
(((struct nft_trans_flowtable *)trans->data)->flowtable)
#define NFT_TRANS_GC_BATCHCOUNT 256
struct nft_trans_gc {
struct list_head list;
struct net *net;
struct nft_set *set;
u32 seq;
u8 count;
void *priv[NFT_TRANS_GC_BATCHCOUNT];
struct rcu_head rcu;
};
struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set,
unsigned int gc_seq, gfp_t gfp);
void nft_trans_gc_destroy(struct nft_trans_gc *trans);
struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc,
unsigned int gc_seq, gfp_t gfp);
void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc);
struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp);
void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans);
void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv);
void nft_setelem_data_deactivate(const struct net *net,
const struct nft_set *set,
struct nft_set_elem *elem);
int __init nft_chain_filter_init(void);
void nft_chain_filter_fini(void);
@ -1507,6 +1477,7 @@ struct nftables_pernet {
struct mutex commit_mutex;
unsigned int base_seq;
u8 validate_state;
unsigned int gc_seq;
};
#endif /* _NET_NF_TABLES_H */
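
The declarations above replace the removed nft_set_gc_batch helpers with a GC transaction API that the set backends further down switch to. As a rough sketch of the asynchronous usage pattern, assuming only the declarations above (example_set_gc and example_next_expired_elem are placeholder names, not kernel symbols):

/* Sketch of the async GC batching loop a backend's gc worker runs;
 * gc_seq is sampled from struct nftables_pernet before the walk and
 * re-checked by the core before the batch is applied.
 */
static void example_set_gc(struct nft_set *set, unsigned int gc_seq)
{
	struct nft_trans_gc *gc;
	void *elem_priv;

	gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL);
	if (!gc)
		return;

	while ((elem_priv = example_next_expired_elem(set)) != NULL) {
		/* Flushes a full batch to the gc worker and starts a new
		 * one, otherwise returns the current batch unchanged.
		 */
		gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
		if (!gc)
			return;

		nft_trans_gc_elem_add(gc, elem_priv);
	}

	/* Queue the final batch, or free it if nothing was collected. */
	nft_trans_gc_queue_async_done(gc);
}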

File: net/ipv4/igmp.c

@ -218,8 +218,10 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
int tv = prandom_u32() % max_delay;
im->tm_running = 1;
if (!mod_timer(&im->timer, jiffies+tv+2))
refcount_inc(&im->refcnt);
if (refcount_inc_not_zero(&im->refcnt)) {
if (mod_timer(&im->timer, jiffies + tv + 2))
ip_ma_put(im);
}
}
static void igmp_gq_start_timer(struct in_device *in_dev)
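
The hunk above reorders the reference grab and the timer arming: a reference is now taken with refcount_inc_not_zero() before mod_timer(), and dropped again if the timer was already pending, so a received query can no longer arm a timer on an ip_mc_list whose refcount has already reached zero. A minimal sketch of that pattern, using an illustrative struct obj and obj_put() rather than the kernel's ip_mc_list helpers:

#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/timer.h>

/* Illustrative object, not a kernel type. */
struct obj {
	refcount_t refcnt;
	struct timer_list timer;
};

static void obj_put(struct obj *o)
{
	if (refcount_dec_and_test(&o->refcnt))
		kfree(o);
}

static void obj_start_timer(struct obj *o, unsigned long expires)
{
	/* Take a reference only if the object is still alive; this closes
	 * the window in which a timer could be armed on an object whose
	 * refcount already hit zero.
	 */
	if (!refcount_inc_not_zero(&o->refcnt))
		return;

	/* mod_timer() returns nonzero if the timer was already pending;
	 * that pending timer already owns a reference, so drop ours.
	 */
	if (mod_timer(&o->timer, expires))
		obj_put(o);
}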

File: net/netfilter/nf_tables_api.c

@ -26,12 +26,15 @@
#define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-"))
unsigned int nf_tables_net_id __read_mostly;
EXPORT_SYMBOL_GPL(nf_tables_net_id);
static LIST_HEAD(nf_tables_expressions);
static LIST_HEAD(nf_tables_objects);
static LIST_HEAD(nf_tables_flowtables);
static LIST_HEAD(nf_tables_destroy_list);
static LIST_HEAD(nf_tables_gc_list);
static DEFINE_SPINLOCK(nf_tables_destroy_list_lock);
static DEFINE_SPINLOCK(nf_tables_gc_list_lock);
static u64 table_handle;
enum {
@ -88,6 +91,9 @@ static void nft_validate_state_update(struct net *net, u8 new_validate_state)
static void nf_tables_trans_destroy_work(struct work_struct *w);
static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work);
static void nft_trans_gc_work(struct work_struct *work);
static DECLARE_WORK(trans_gc_work, nft_trans_gc_work);
static void nft_ctx_init(struct nft_ctx *ctx,
struct net *net,
const struct sk_buff *skb,
@ -403,6 +409,27 @@ static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type,
return 0;
}
static int nft_mapelem_deactivate(const struct nft_ctx *ctx,
struct nft_set *set,
const struct nft_set_iter *iter,
struct nft_set_elem *elem)
{
nft_setelem_data_deactivate(ctx->net, set, elem);
return 0;
}
static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set)
{
struct nft_set_iter iter = {
.genmask = nft_genmask_next(ctx->net),
.fn = nft_mapelem_deactivate,
};
set->ops->walk(ctx, set, &iter);
WARN_ON_ONCE(iter.err);
}
static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set)
{
int err;
@ -411,6 +438,9 @@ static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set)
if (err < 0)
return err;
if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
nft_map_deactivate(ctx, set);
nft_deactivate_next(ctx->net, set);
nft_use_dec(&ctx->table->use);
@ -3810,6 +3840,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
}
INIT_LIST_HEAD(&set->bindings);
refcount_set(&set->refs, 1);
set->table = table;
write_pnet(&set->net, net);
set->ops = ops;
@ -3840,7 +3871,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
return 0;
err4:
ops->destroy(set);
ops->destroy(&ctx, set);
err3:
kfree(set->name);
err2:
@ -3852,15 +3883,22 @@ err1:
return err;
}
static void nft_set_put(struct nft_set *set)
{
if (refcount_dec_and_test(&set->refs)) {
kfree(set->name);
kvfree(set);
}
}
static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
{
if (WARN_ON(set->use > 0))
return;
set->ops->destroy(set);
set->ops->destroy(ctx, set);
module_put(to_set_type(set->ops)->owner);
kfree(set->name);
kvfree(set);
nft_set_put(set);
}
static int nf_tables_delset(struct net *net, struct sock *nlsk,
@ -3981,10 +4019,39 @@ static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
}
}
static void nft_setelem_data_activate(const struct net *net,
const struct nft_set *set,
struct nft_set_elem *elem);
static int nft_mapelem_activate(const struct nft_ctx *ctx,
struct nft_set *set,
const struct nft_set_iter *iter,
struct nft_set_elem *elem)
{
nft_setelem_data_activate(ctx->net, set, elem);
return 0;
}
static void nft_map_activate(const struct nft_ctx *ctx, struct nft_set *set)
{
struct nft_set_iter iter = {
.genmask = nft_genmask_next(ctx->net),
.fn = nft_mapelem_activate,
};
set->ops->walk(ctx, set, &iter);
WARN_ON_ONCE(iter.err);
}
void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set)
{
if (nft_set_is_anonymous(set))
if (nft_set_is_anonymous(set)) {
if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
nft_map_activate(ctx, set);
nft_clear(ctx->net, set);
}
nft_use_inc_restore(&set->use);
}
@ -4005,13 +4072,20 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set,
nft_use_dec(&set->use);
break;
case NFT_TRANS_PREPARE:
if (nft_set_is_anonymous(set))
nft_deactivate_next(ctx->net, set);
if (nft_set_is_anonymous(set)) {
if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
nft_map_deactivate(ctx, set);
nft_deactivate_next(ctx->net, set);
}
nft_use_dec(&set->use);
return;
case NFT_TRANS_ABORT:
case NFT_TRANS_RELEASE:
if (nft_set_is_anonymous(set) &&
set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
nft_map_deactivate(ctx, set);
nft_use_dec(&set->use);
/* fall through */
default:
@ -4578,6 +4652,7 @@ void *nft_set_elem_init(const struct nft_set *set,
return elem;
}
/* Drop references and destroy. Called from gc, dynset and abort path. */
void nft_set_elem_destroy(const struct nft_set *set, void *elem,
bool destroy_expr)
{
@ -4606,11 +4681,11 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem,
}
EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
/* Only called from commit path, nft_set_elem_deactivate() already deals with
* the refcounting from the preparation phase.
/* Destroy element. References have been already dropped in the preparation
* path via nft_setelem_data_deactivate().
*/
static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
const struct nft_set *set, void *elem)
void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
const struct nft_set *set, void *elem)
{
struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
@ -4618,6 +4693,7 @@ static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
nf_tables_expr_destroy(ctx, nft_set_ext_expr(ext));
kfree(elem);
}
EXPORT_SYMBOL_GPL(nf_tables_set_elem_destroy);
static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr, u32 nlmsg_flags)
@ -4803,7 +4879,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
if (trans == NULL)
goto err4;
ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK;
ext->genmask = nft_genmask_cur(ctx->net);
err = set->ops->insert(ctx->net, set, &elem, &ext2);
if (err) {
if (err == -EEXIST) {
@ -4923,9 +5000,9 @@ void nft_data_hold(const struct nft_data *data, enum nft_data_types type)
}
}
static void nft_set_elem_activate(const struct net *net,
const struct nft_set *set,
struct nft_set_elem *elem)
static void nft_setelem_data_activate(const struct net *net,
const struct nft_set *set,
struct nft_set_elem *elem)
{
const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
@ -4935,9 +5012,9 @@ static void nft_set_elem_activate(const struct net *net,
nft_use_inc_restore(&(*nft_set_ext_obj(ext))->use);
}
static void nft_set_elem_deactivate(const struct net *net,
const struct nft_set *set,
struct nft_set_elem *elem)
void nft_setelem_data_deactivate(const struct net *net,
const struct nft_set *set,
struct nft_set_elem *elem)
{
const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
@ -4946,6 +5023,7 @@ static void nft_set_elem_deactivate(const struct net *net,
if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
nft_use_dec(&(*nft_set_ext_obj(ext))->use);
}
EXPORT_SYMBOL_GPL(nft_setelem_data_deactivate);
static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr)
@ -5004,7 +5082,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
kfree(elem.priv);
elem.priv = priv;
nft_set_elem_deactivate(ctx->net, set, &elem);
nft_setelem_data_deactivate(ctx->net, set, &elem);
nft_trans_elem(trans) = elem;
nft_trans_commit_list_add_tail(ctx->net, trans);
@ -5038,7 +5116,7 @@ static int nft_flush_set(const struct nft_ctx *ctx,
}
set->ndeact++;
nft_set_elem_deactivate(ctx->net, set, elem);
nft_setelem_data_deactivate(ctx->net, set, elem);
nft_trans_elem_set(trans) = set;
nft_trans_elem(trans) = *elem;
nft_trans_commit_list_add_tail(ctx->net, trans);
@ -5095,31 +5173,6 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
return err;
}
void nft_set_gc_batch_release(struct rcu_head *rcu)
{
struct nft_set_gc_batch *gcb;
unsigned int i;
gcb = container_of(rcu, struct nft_set_gc_batch, head.rcu);
for (i = 0; i < gcb->head.cnt; i++)
nft_set_elem_destroy(gcb->head.set, gcb->elems[i], true);
kfree(gcb);
}
EXPORT_SYMBOL_GPL(nft_set_gc_batch_release);
struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
gfp_t gfp)
{
struct nft_set_gc_batch *gcb;
gcb = kzalloc(sizeof(*gcb), gfp);
if (gcb == NULL)
return gcb;
gcb->head.set = set;
return gcb;
}
EXPORT_SYMBOL_GPL(nft_set_gc_batch_alloc);
/*
* Stateful objects
*/
@ -6896,6 +6949,186 @@ static void nft_chain_del(struct nft_chain *chain)
list_del_rcu(&chain->list);
}
static void nft_trans_gc_setelem_remove(struct nft_ctx *ctx,
struct nft_trans_gc *trans)
{
void **priv = trans->priv;
unsigned int i;
for (i = 0; i < trans->count; i++) {
struct nft_set_elem elem = {
.priv = priv[i],
};
nft_setelem_data_deactivate(ctx->net, trans->set, &elem);
trans->set->ops->remove(trans->net, trans->set, &elem);
}
}
void nft_trans_gc_destroy(struct nft_trans_gc *trans)
{
nft_set_put(trans->set);
put_net(trans->net);
kfree(trans);
}
EXPORT_SYMBOL_GPL(nft_trans_gc_destroy);
static void nft_trans_gc_trans_free(struct rcu_head *rcu)
{
struct nft_set_elem elem = {};
struct nft_trans_gc *trans;
struct nft_ctx ctx = {};
unsigned int i;
trans = container_of(rcu, struct nft_trans_gc, rcu);
ctx.net = read_pnet(&trans->set->net);
for (i = 0; i < trans->count; i++) {
elem.priv = trans->priv[i];
atomic_dec(&trans->set->nelems);
nf_tables_set_elem_destroy(&ctx, trans->set, elem.priv);
}
nft_trans_gc_destroy(trans);
}
static bool nft_trans_gc_work_done(struct nft_trans_gc *trans)
{
struct nftables_pernet *nft_net;
struct nft_ctx ctx = {};
nft_net = net_generic(trans->net, nf_tables_net_id);
mutex_lock(&nft_net->commit_mutex);
/* Check for race with transaction, otherwise this batch refers to
* stale objects that might not be there anymore. Skip transaction if
* set has been destroyed from control plane transaction in case gc
* worker loses race.
*/
if (READ_ONCE(nft_net->gc_seq) != trans->seq || trans->set->dead) {
mutex_unlock(&nft_net->commit_mutex);
return false;
}
ctx.net = trans->net;
ctx.table = trans->set->table;
nft_trans_gc_setelem_remove(&ctx, trans);
mutex_unlock(&nft_net->commit_mutex);
return true;
}
static void nft_trans_gc_work(struct work_struct *work)
{
struct nft_trans_gc *trans, *next;
LIST_HEAD(trans_gc_list);
spin_lock(&nf_tables_destroy_list_lock);
list_splice_init(&nf_tables_gc_list, &trans_gc_list);
spin_unlock(&nf_tables_destroy_list_lock);
list_for_each_entry_safe(trans, next, &trans_gc_list, list) {
list_del(&trans->list);
if (!nft_trans_gc_work_done(trans)) {
nft_trans_gc_destroy(trans);
continue;
}
call_rcu(&trans->rcu, nft_trans_gc_trans_free);
}
}
struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set,
unsigned int gc_seq, gfp_t gfp)
{
struct net *net = read_pnet(&set->net);
struct nft_trans_gc *trans;
trans = kzalloc(sizeof(*trans), gfp);
if (!trans)
return NULL;
refcount_inc(&set->refs);
trans->set = set;
trans->net = get_net(net);
trans->seq = gc_seq;
return trans;
}
EXPORT_SYMBOL_GPL(nft_trans_gc_alloc);
void nft_trans_gc_elem_add(struct nft_trans_gc *trans, void *priv)
{
trans->priv[trans->count++] = priv;
}
EXPORT_SYMBOL_GPL(nft_trans_gc_elem_add);
static void nft_trans_gc_queue_work(struct nft_trans_gc *trans)
{
spin_lock(&nf_tables_gc_list_lock);
list_add_tail(&trans->list, &nf_tables_gc_list);
spin_unlock(&nf_tables_gc_list_lock);
schedule_work(&trans_gc_work);
}
static int nft_trans_gc_space(struct nft_trans_gc *trans)
{
return NFT_TRANS_GC_BATCHCOUNT - trans->count;
}
struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc,
unsigned int gc_seq, gfp_t gfp)
{
if (nft_trans_gc_space(gc))
return gc;
nft_trans_gc_queue_work(gc);
return nft_trans_gc_alloc(gc->set, gc_seq, gfp);
}
EXPORT_SYMBOL_GPL(nft_trans_gc_queue_async);
void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans)
{
if (trans->count == 0) {
nft_trans_gc_destroy(trans);
return;
}
nft_trans_gc_queue_work(trans);
}
EXPORT_SYMBOL_GPL(nft_trans_gc_queue_async_done);
struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp)
{
if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net)))
return NULL;
if (nft_trans_gc_space(gc))
return gc;
call_rcu(&gc->rcu, nft_trans_gc_trans_free);
return nft_trans_gc_alloc(gc->set, 0, gfp);
}
EXPORT_SYMBOL_GPL(nft_trans_gc_queue_sync);
void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans)
{
WARN_ON_ONCE(!lockdep_commit_lock_is_held(trans->net));
if (trans->count == 0) {
nft_trans_gc_destroy(trans);
return;
}
call_rcu(&trans->rcu, nft_trans_gc_trans_free);
}
EXPORT_SYMBOL_GPL(nft_trans_gc_queue_sync_done);
static void nf_tables_module_autoload_cleanup(struct net *net)
{
struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id);
@ -6950,6 +7183,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
struct nft_trans_elem *te;
struct nft_chain *chain;
struct nft_table *table;
unsigned int gc_seq;
int err;
if (list_empty(&nft_net->commit_list)) {
@ -7006,6 +7240,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
while (++nft_net->base_seq == 0)
;
/* Bump gc counter, it becomes odd, this is the busy mark. */
gc_seq = READ_ONCE(nft_net->gc_seq);
WRITE_ONCE(nft_net->gc_seq, ++gc_seq);
/* step 3. Start new generation, rules_gen_X now in use. */
net->nft.gencursor = nft_gencursor_next(net);
@ -7083,6 +7321,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
nft_trans_destroy(trans);
break;
case NFT_MSG_DELSET:
nft_trans_set(trans)->dead = 1;
list_del_rcu(&nft_trans_set(trans)->list);
nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
NFT_MSG_DELSET, GFP_KERNEL);
@ -7144,6 +7383,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
}
nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
WRITE_ONCE(nft_net->gc_seq, ++gc_seq);
nf_tables_commit_release(net);
return 0;
@ -7265,6 +7506,8 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
case NFT_MSG_DELSET:
nft_use_inc_restore(&trans->ctx.table->use);
nft_clear(trans->ctx.net, nft_trans_set(trans));
if (nft_trans_set(trans)->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
nft_map_activate(&trans->ctx, nft_trans_set(trans));
nft_trans_destroy(trans);
break;
case NFT_MSG_NEWSETELEM:
@ -7279,7 +7522,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
case NFT_MSG_DELSETELEM:
te = (struct nft_trans_elem *)trans->data;
nft_set_elem_activate(net, te->set, &te->elem);
nft_setelem_data_activate(net, te->set, &te->elem);
te->set->ops->activate(net, te->set, &te->elem);
te->set->ndeact--;
@ -7959,6 +8202,9 @@ static void __nft_release_table(struct net *net, struct nft_table *table)
list_for_each_entry_safe(set, ns, &table->sets, list) {
list_del(&set->list);
nft_use_dec(&table->use);
if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
nft_map_deactivate(&ctx, set);
nft_set_destroy(&ctx, set);
}
list_for_each_entry_safe(obj, ne, &table->objects, list) {
@ -7997,6 +8243,7 @@ static int __net_init nf_tables_init_net(struct net *net)
mutex_init(&nft_net->commit_mutex);
nft_net->base_seq = 1;
nft_net->validate_state = NFT_VALIDATE_SKIP;
nft_net->gc_seq = 0;
return 0;
}
@ -8023,10 +8270,16 @@ static void __net_exit nf_tables_exit_net(struct net *net)
WARN_ON_ONCE(!list_empty(&nft_net->module_list));
}
static void nf_tables_exit_batch(struct list_head *net_exit_list)
{
flush_work(&trans_gc_work);
}
static struct pernet_operations nf_tables_net_ops = {
.init = nf_tables_init_net,
.pre_exit = nf_tables_pre_exit_net,
.exit = nf_tables_exit_net,
.exit_batch = nf_tables_exit_batch,
.id = &nf_tables_net_id,
.size = sizeof(struct nftables_pernet),
};
@ -8091,6 +8344,7 @@ static void __exit nf_tables_module_exit(void)
nft_chain_filter_fini();
nft_chain_route_fini();
unregister_pernet_subsys(&nf_tables_net_ops);
cancel_work_sync(&trans_gc_work);
cancel_work_sync(&trans_destroy_work);
rcu_barrier();
rhltable_destroy(&nft_objname_ht);
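
Taken together, the nf_tables_api.c changes above key asynchronous garbage collection to the per-netns gc_seq counter: the commit path makes the counter odd while a transaction is in flight and even again once the new generation is live, and each GC batch records the sequence it was built against so that stale batches are discarded rather than applied to objects that may already be gone. A condensed sketch of that handshake (example_commit and example_gc_batch_still_valid are illustrative names; the real logic lives in nf_tables_commit() and nft_trans_gc_work_done() above):

/* Commit side: gc_seq is odd while a transaction is being committed
 * and even again once the new generation is live.
 */
static void example_commit(struct nftables_pernet *nft_net)
{
	unsigned int gc_seq = READ_ONCE(nft_net->gc_seq);

	WRITE_ONCE(nft_net->gc_seq, ++gc_seq);	/* odd: commit in flight */

	/* ... apply the transaction, switch generations ... */

	WRITE_ONCE(nft_net->gc_seq, ++gc_seq);	/* even: commit finished */
}

/* GC side: a batch built against a stale sequence number, or against a
 * set that the control plane has marked dead, is discarded.  The real
 * check runs under commit_mutex.
 */
static bool example_gc_batch_still_valid(struct nftables_pernet *nft_net,
					 const struct nft_trans_gc *trans)
{
	return READ_ONCE(nft_net->gc_seq) == trans->seq &&
	       !trans->set->dead;
}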

File: net/netfilter/nft_set_bitmap.c

@ -270,13 +270,14 @@ static int nft_bitmap_init(const struct nft_set *set,
return 0;
}
static void nft_bitmap_destroy(const struct nft_set *set)
static void nft_bitmap_destroy(const struct nft_ctx *ctx,
const struct nft_set *set)
{
struct nft_bitmap *priv = nft_set_priv(set);
struct nft_bitmap_elem *be, *n;
list_for_each_entry_safe(be, n, &priv->list, head)
nft_set_elem_destroy(set, be, true);
nf_tables_set_elem_destroy(ctx, set, be);
}
static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,

File: net/netfilter/nft_set_hash.c

@ -17,6 +17,9 @@
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netns/generic.h>
extern unsigned int nf_tables_net_id;
/* We target a hash table size of 4, element hint is 75% of final size */
#define NFT_RHASH_ELEMENT_HINT 3
@ -59,6 +62,8 @@ static inline int nft_rhash_cmp(struct rhashtable_compare_arg *arg,
if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen))
return 1;
if (nft_set_elem_is_dead(&he->ext))
return 1;
if (nft_set_elem_expired(&he->ext))
return 1;
if (!nft_set_elem_active(&he->ext, x->genmask))
@ -187,7 +192,6 @@ static void nft_rhash_activate(const struct net *net, const struct nft_set *set,
struct nft_rhash_elem *he = elem->priv;
nft_set_elem_change_active(net, set, &he->ext);
nft_set_elem_clear_busy(&he->ext);
}
static bool nft_rhash_flush(const struct net *net,
@ -195,12 +199,9 @@ static bool nft_rhash_flush(const struct net *net,
{
struct nft_rhash_elem *he = priv;
if (!nft_set_elem_mark_busy(&he->ext) ||
!nft_is_active(net, &he->ext)) {
nft_set_elem_change_active(net, set, &he->ext);
return true;
}
return false;
nft_set_elem_change_active(net, set, &he->ext);
return true;
}
static void *nft_rhash_deactivate(const struct net *net,
@ -217,9 +218,8 @@ static void *nft_rhash_deactivate(const struct net *net,
rcu_read_lock();
he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
if (he != NULL &&
!nft_rhash_flush(net, set, he))
he = NULL;
if (he)
nft_set_elem_change_active(net, set, &he->ext);
rcu_read_unlock();
@ -251,7 +251,9 @@ static bool nft_rhash_delete(const struct nft_set *set,
if (he == NULL)
return false;
return rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params) == 0;
nft_set_elem_dead(&he->ext);
return true;
}
static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
@ -295,49 +297,77 @@ cont:
static void nft_rhash_gc(struct work_struct *work)
{
struct nftables_pernet *nft_net;
struct nft_set *set;
struct nft_rhash_elem *he;
struct nft_rhash *priv;
struct nft_set_gc_batch *gcb = NULL;
struct rhashtable_iter hti;
struct nft_trans_gc *gc;
struct net *net;
u32 gc_seq;
priv = container_of(work, struct nft_rhash, gc_work.work);
set = nft_set_container_of(priv);
net = read_pnet(&set->net);
nft_net = net_generic(net, nf_tables_net_id);
gc_seq = READ_ONCE(nft_net->gc_seq);
gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL);
if (!gc)
goto done;
rhashtable_walk_enter(&priv->ht, &hti);
rhashtable_walk_start(&hti);
while ((he = rhashtable_walk_next(&hti))) {
if (IS_ERR(he)) {
if (PTR_ERR(he) != -EAGAIN)
break;
if (PTR_ERR(he) != -EAGAIN) {
nft_trans_gc_destroy(gc);
gc = NULL;
goto try_later;
}
continue;
}
/* Ruleset has been updated, try later. */
if (READ_ONCE(nft_net->gc_seq) != gc_seq) {
nft_trans_gc_destroy(gc);
gc = NULL;
goto try_later;
}
if (nft_set_elem_is_dead(&he->ext))
goto dead_elem;
if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPR)) {
struct nft_expr *expr = nft_set_ext_expr(&he->ext);
if (expr->ops->gc &&
expr->ops->gc(read_pnet(&set->net), expr))
goto gc;
goto needs_gc_run;
}
if (!nft_set_elem_expired(&he->ext))
continue;
gc:
if (nft_set_elem_mark_busy(&he->ext))
continue;
gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
if (gcb == NULL)
break;
rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params);
atomic_dec(&set->nelems);
nft_set_gc_batch_add(gcb, he);
needs_gc_run:
nft_set_elem_dead(&he->ext);
dead_elem:
gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
if (!gc)
goto try_later;
nft_trans_gc_elem_add(gc, he);
}
try_later:
/* catchall list iteration requires rcu read side lock. */
rhashtable_walk_stop(&hti);
rhashtable_walk_exit(&hti);
nft_set_gc_batch_complete(gcb);
if (gc)
nft_trans_gc_queue_async_done(gc);
done:
queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
nft_set_gc_interval(set));
}
@ -372,25 +402,36 @@ static int nft_rhash_init(const struct nft_set *set,
return err;
INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rhash_gc);
if (set->flags & NFT_SET_TIMEOUT)
if (set->flags & (NFT_SET_TIMEOUT | NFT_SET_EVAL))
nft_rhash_gc_init(set);
return 0;
}
struct nft_rhash_ctx {
const struct nft_ctx ctx;
const struct nft_set *set;
};
static void nft_rhash_elem_destroy(void *ptr, void *arg)
{
nft_set_elem_destroy(arg, ptr, true);
struct nft_rhash_ctx *rhash_ctx = arg;
nf_tables_set_elem_destroy(&rhash_ctx->ctx, rhash_ctx->set, ptr);
}
static void nft_rhash_destroy(const struct nft_set *set)
static void nft_rhash_destroy(const struct nft_ctx *ctx,
const struct nft_set *set)
{
struct nft_rhash *priv = nft_set_priv(set);
struct nft_rhash_ctx rhash_ctx = {
.ctx = *ctx,
.set = set,
};
cancel_delayed_work_sync(&priv->gc_work);
rcu_barrier();
rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy,
(void *)set);
(void *)&rhash_ctx);
}
/* Number of buckets is stored in u32, so cap our result to 1U<<31 */
@ -619,7 +660,8 @@ static int nft_hash_init(const struct nft_set *set,
return 0;
}
static void nft_hash_destroy(const struct nft_set *set)
static void nft_hash_destroy(const struct nft_ctx *ctx,
const struct nft_set *set)
{
struct nft_hash *priv = nft_set_priv(set);
struct nft_hash_elem *he;
@ -629,7 +671,7 @@ static void nft_hash_destroy(const struct nft_set *set)
for (i = 0; i < priv->buckets; i++) {
hlist_for_each_entry_safe(he, next, &priv->table[i], node) {
hlist_del_rcu(&he->node);
nft_set_elem_destroy(set, he, true);
nf_tables_set_elem_destroy(ctx, set, he);
}
}
}

File: net/netfilter/nft_set_rbtree.c

@ -14,6 +14,9 @@
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netns/generic.h>
extern unsigned int nf_tables_net_id;
struct nft_rbtree {
struct rb_root root;
@ -38,10 +41,18 @@ static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
return !nft_rbtree_interval_end(rbe);
}
static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
const struct nft_rbtree_elem *interval)
static int nft_rbtree_cmp(const struct nft_set *set,
const struct nft_rbtree_elem *e1,
const struct nft_rbtree_elem *e2)
{
return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
return memcmp(nft_set_ext_key(&e1->ext), nft_set_ext_key(&e2->ext),
set->klen);
}
static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe)
{
return nft_set_elem_expired(&rbe->ext) ||
nft_set_elem_is_dead(&rbe->ext);
}
static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
@ -52,7 +63,6 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
const struct nft_rbtree_elem *rbe, *interval = NULL;
u8 genmask = nft_genmask_cur(net);
const struct rb_node *parent;
const void *this;
int d;
parent = rcu_dereference_raw(priv->root.rb_node);
@ -62,12 +72,11 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
rbe = rb_entry(parent, struct nft_rbtree_elem, node);
this = nft_set_ext_key(&rbe->ext);
d = memcmp(this, key, set->klen);
d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen);
if (d < 0) {
parent = rcu_dereference_raw(parent->rb_left);
if (interval &&
nft_rbtree_equal(set, this, interval) &&
!nft_rbtree_cmp(set, rbe, interval) &&
nft_rbtree_interval_end(rbe) &&
nft_rbtree_interval_start(interval))
continue;
@ -80,7 +89,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
continue;
}
if (nft_set_elem_expired(&rbe->ext))
if (nft_rbtree_elem_expired(rbe))
return false;
if (nft_rbtree_interval_end(rbe)) {
@ -98,7 +107,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
nft_set_elem_active(&interval->ext, genmask) &&
!nft_set_elem_expired(&interval->ext) &&
!nft_rbtree_elem_expired(interval) &&
nft_rbtree_interval_start(interval)) {
*ext = &interval->ext;
return true;
@ -214,43 +223,254 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
return rbe;
}
static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
struct nft_rbtree *priv,
struct nft_rbtree_elem *rbe)
{
struct nft_set_elem elem = {
.priv = rbe,
};
nft_setelem_data_deactivate(net, set, &elem);
rb_erase(&rbe->node, &priv->root);
}
static int nft_rbtree_gc_elem(const struct nft_set *__set,
struct nft_rbtree *priv,
struct nft_rbtree_elem *rbe,
u8 genmask)
{
struct nft_set *set = (struct nft_set *)__set;
struct rb_node *prev = rb_prev(&rbe->node);
struct net *net = read_pnet(&set->net);
struct nft_rbtree_elem *rbe_prev;
struct nft_trans_gc *gc;
gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC);
if (!gc)
return -ENOMEM;
/* search for end interval coming before this element.
* end intervals don't carry a timeout extension, they
* are coupled with the interval start element.
*/
while (prev) {
rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
if (nft_rbtree_interval_end(rbe_prev) &&
nft_set_elem_active(&rbe_prev->ext, genmask))
break;
prev = rb_prev(prev);
}
if (prev) {
rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
nft_rbtree_gc_remove(net, set, priv, rbe_prev);
/* There is always room in this trans gc for this element,
* memory allocation never actually happens, hence, the warning
* splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT,
* this is synchronous gc which never fails.
*/
gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
if (WARN_ON_ONCE(!gc))
return -ENOMEM;
nft_trans_gc_elem_add(gc, rbe_prev);
}
nft_rbtree_gc_remove(net, set, priv, rbe);
gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
if (WARN_ON_ONCE(!gc))
return -ENOMEM;
nft_trans_gc_elem_add(gc, rbe);
nft_trans_gc_queue_sync_done(gc);
return 0;
}
static bool nft_rbtree_update_first(const struct nft_set *set,
struct nft_rbtree_elem *rbe,
struct rb_node *first)
{
struct nft_rbtree_elem *first_elem;
first_elem = rb_entry(first, struct nft_rbtree_elem, node);
/* this element is closest to where the new element is to be inserted:
* update the first element for the node list path.
*/
if (nft_rbtree_cmp(set, rbe, first_elem) < 0)
return true;
return false;
}
static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
struct nft_rbtree_elem *new,
struct nft_set_ext **ext)
{
struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL;
struct rb_node *node, *next, *parent, **p, *first = NULL;
struct nft_rbtree *priv = nft_set_priv(set);
u8 genmask = nft_genmask_next(net);
struct nft_rbtree_elem *rbe;
struct rb_node *parent, **p;
int d;
int d, err;
/* Descend the tree to search for an existing element greater than the
* key value to insert that is greater than the new element. This is the
* first element to walk the ordered elements to find possible overlap.
*/
parent = NULL;
p = &priv->root.rb_node;
while (*p != NULL) {
parent = *p;
rbe = rb_entry(parent, struct nft_rbtree_elem, node);
d = memcmp(nft_set_ext_key(&rbe->ext),
nft_set_ext_key(&new->ext),
set->klen);
d = nft_rbtree_cmp(set, rbe, new);
if (d < 0) {
p = &parent->rb_left;
} else if (d > 0) {
if (!first ||
nft_rbtree_update_first(set, rbe, first))
first = &rbe->node;
p = &parent->rb_right;
} else {
if (nft_rbtree_interval_end(rbe))
p = &parent->rb_left;
else
p = &parent->rb_right;
}
}
if (!first)
first = rb_first(&priv->root);
/* Detect overlap by going through the list of valid tree nodes.
* Values stored in the tree are in reversed order, starting from
* highest to lowest value.
*/
for (node = first; node != NULL; node = next) {
next = rb_next(node);
rbe = rb_entry(node, struct nft_rbtree_elem, node);
if (!nft_set_elem_active(&rbe->ext, genmask))
continue;
/* perform garbage collection to avoid bogus overlap reports. */
if (nft_set_elem_expired(&rbe->ext)) {
err = nft_rbtree_gc_elem(set, priv, rbe, genmask);
if (err < 0)
return err;
continue;
}
d = nft_rbtree_cmp(set, rbe, new);
if (d == 0) {
/* Matching end element: no need to look for an
* overlapping greater or equal element.
*/
if (nft_rbtree_interval_end(rbe)) {
rbe_le = rbe;
break;
}
/* first element that is greater or equal to key value. */
if (!rbe_ge) {
rbe_ge = rbe;
continue;
}
/* this is a closer more or equal element, update it. */
if (nft_rbtree_cmp(set, rbe_ge, new) != 0) {
rbe_ge = rbe;
continue;
}
/* element is equal to key value, make sure flags are
* the same, an existing more or equal start element
* must not be replaced by more or equal end element.
*/
if ((nft_rbtree_interval_start(new) &&
nft_rbtree_interval_start(rbe_ge)) ||
(nft_rbtree_interval_end(new) &&
nft_rbtree_interval_end(rbe_ge))) {
rbe_ge = rbe;
continue;
}
} else if (d > 0) {
/* annotate element greater than the new element. */
rbe_ge = rbe;
continue;
} else if (d < 0) {
/* annotate element less than the new element. */
rbe_le = rbe;
break;
}
}
/* - new start element matching existing start element: full overlap
* reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
*/
if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) &&
nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) {
*ext = &rbe_ge->ext;
return -EEXIST;
}
/* - new end element matching existing end element: full overlap
* reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
*/
if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) &&
nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) {
*ext = &rbe_le->ext;
return -EEXIST;
}
/* - new start element with existing closest, less or equal key value
* being a start element: partial overlap, reported as -ENOTEMPTY.
* Anonymous sets allow for two consecutive start element since they
* are constant, skip them to avoid bogus overlap reports.
*/
if (!nft_set_is_anonymous(set) && rbe_le &&
nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new))
return -ENOTEMPTY;
/* - new end element with existing closest, less or equal key value
* being a end element: partial overlap, reported as -ENOTEMPTY.
*/
if (rbe_le &&
nft_rbtree_interval_end(rbe_le) && nft_rbtree_interval_end(new))
return -ENOTEMPTY;
/* - new end element with existing closest, greater or equal key value
* being an end element: partial overlap, reported as -ENOTEMPTY
*/
if (rbe_ge &&
nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new))
return -ENOTEMPTY;
/* Accepted element: pick insertion point depending on key value */
parent = NULL;
p = &priv->root.rb_node;
while (*p != NULL) {
parent = *p;
rbe = rb_entry(parent, struct nft_rbtree_elem, node);
d = nft_rbtree_cmp(set, rbe, new);
if (d < 0)
p = &parent->rb_left;
else if (d > 0)
p = &parent->rb_right;
else {
if (nft_rbtree_interval_end(rbe) &&
nft_rbtree_interval_start(new)) {
p = &parent->rb_left;
} else if (nft_rbtree_interval_start(rbe) &&
nft_rbtree_interval_end(new)) {
p = &parent->rb_right;
} else if (nft_set_elem_active(&rbe->ext, genmask)) {
*ext = &rbe->ext;
return -EEXIST;
} else {
p = &parent->rb_left;
}
}
else if (nft_rbtree_interval_end(rbe))
p = &parent->rb_left;
else
p = &parent->rb_right;
}
rb_link_node_rcu(&new->node, parent, p);
rb_insert_color(&new->node, &priv->root);
return 0;
@ -294,7 +514,6 @@ static void nft_rbtree_activate(const struct net *net,
struct nft_rbtree_elem *rbe = elem->priv;
nft_set_elem_change_active(net, set, &rbe->ext);
nft_set_elem_clear_busy(&rbe->ext);
}
static bool nft_rbtree_flush(const struct net *net,
@ -302,12 +521,9 @@ static bool nft_rbtree_flush(const struct net *net,
{
struct nft_rbtree_elem *rbe = priv;
if (!nft_set_elem_mark_busy(&rbe->ext) ||
!nft_is_active(net, &rbe->ext)) {
nft_set_elem_change_active(net, set, &rbe->ext);
return true;
}
return false;
nft_set_elem_change_active(net, set, &rbe->ext);
return true;
}
static void *nft_rbtree_deactivate(const struct net *net,
@ -384,26 +600,40 @@ cont:
static void nft_rbtree_gc(struct work_struct *work)
{
struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL;
struct nft_set_gc_batch *gcb = NULL;
struct nft_rbtree_elem *rbe, *rbe_end = NULL;
struct nftables_pernet *nft_net;
struct nft_rbtree *priv;
struct nft_trans_gc *gc;
struct rb_node *node;
struct nft_set *set;
unsigned int gc_seq;
struct net *net;
u8 genmask;
priv = container_of(work, struct nft_rbtree, gc_work.work);
set = nft_set_container_of(priv);
net = read_pnet(&set->net);
genmask = nft_genmask_cur(net);
nft_net = net_generic(net, nf_tables_net_id);
gc_seq = READ_ONCE(nft_net->gc_seq);
gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL);
if (!gc)
goto done;
write_lock_bh(&priv->lock);
write_seqcount_begin(&priv->count);
for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
/* Ruleset has been updated, try later. */
if (READ_ONCE(nft_net->gc_seq) != gc_seq) {
nft_trans_gc_destroy(gc);
gc = NULL;
goto try_later;
}
rbe = rb_entry(node, struct nft_rbtree_elem, node);
if (!nft_set_elem_active(&rbe->ext, genmask))
continue;
if (nft_set_elem_is_dead(&rbe->ext))
goto dead_elem;
/* elements are reversed in the rbtree for historical reasons,
* from highest to lowest value, that is why end element is
@ -413,43 +643,38 @@ static void nft_rbtree_gc(struct work_struct *work)
rbe_end = rbe;
continue;
}
if (!nft_set_elem_expired(&rbe->ext))
continue;
if (nft_set_elem_mark_busy(&rbe->ext)) {
rbe_end = NULL;
nft_set_elem_dead(&rbe->ext);
if (!rbe_end)
continue;
}
if (rbe_prev) {
rb_erase(&rbe_prev->node, &priv->root);
rbe_prev = NULL;
}
gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
if (!gcb)
break;
nft_set_elem_dead(&rbe_end->ext);
atomic_dec(&set->nelems);
nft_set_gc_batch_add(gcb, rbe);
rbe_prev = rbe;
gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
if (!gc)
goto try_later;
if (rbe_end) {
atomic_dec(&set->nelems);
nft_set_gc_batch_add(gcb, rbe_end);
rb_erase(&rbe_end->node, &priv->root);
rbe_end = NULL;
}
node = rb_next(node);
if (!node)
break;
nft_trans_gc_elem_add(gc, rbe_end);
rbe_end = NULL;
dead_elem:
gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
if (!gc)
goto try_later;
nft_trans_gc_elem_add(gc, rbe);
}
if (rbe_prev)
rb_erase(&rbe_prev->node, &priv->root);
try_later:
write_seqcount_end(&priv->count);
write_unlock_bh(&priv->lock);
nft_set_gc_batch_complete(gcb);
if (gc)
nft_trans_gc_queue_async_done(gc);
done:
queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
nft_set_gc_interval(set));
}
@ -478,7 +703,8 @@ static int nft_rbtree_init(const struct nft_set *set,
return 0;
}
static void nft_rbtree_destroy(const struct nft_set *set)
static void nft_rbtree_destroy(const struct nft_ctx *ctx,
const struct nft_set *set)
{
struct nft_rbtree *priv = nft_set_priv(set);
struct nft_rbtree_elem *rbe;
@ -489,7 +715,7 @@ static void nft_rbtree_destroy(const struct nft_set *set)
while ((node = priv->root.rb_node) != NULL) {
rb_erase(node, &priv->root);
rbe = rb_entry(node, struct nft_rbtree_elem, node);
nft_set_elem_destroy(set, rbe, true);
nf_tables_set_elem_destroy(ctx, set, rbe);
}
}

File: scripts/checkpatch.pl (diff suppressed because it is too large)