diff --git a/mm/slub.c b/mm/slub.c
index cd8d3712b195..872340cc5f92 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -248,6 +248,14 @@ struct partial_context {
 	void *object;
 };
 
+/* Structure holding parameters for get_partial_node_bulk() */
+struct partial_bulk_context {
+	gfp_t flags;
+	unsigned int min_objects;
+	unsigned int max_objects;
+	struct list_head slabs;
+};
+
 static inline bool kmem_cache_debug(struct kmem_cache *s)
 {
 	return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
@@ -779,7 +787,8 @@ __update_freelist_slow(struct slab *slab, struct freelist_counters *old,
 	if (slab->freelist == old->freelist &&
 	    slab->counters == old->counters) {
 		slab->freelist = new->freelist;
-		slab->counters = new->counters;
+		/* prevent tearing for the read in get_partial_node_bulk() */
+		WRITE_ONCE(slab->counters, new->counters);
 		ret = true;
 	}
 	slab_unlock(slab);
@@ -2638,9 +2647,9 @@ static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf)
 	stat(s, SHEAF_FREE);
 }
 
-static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
-				   size_t size, void **p);
-
+static unsigned int
+__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+		 unsigned int max);
 
 static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
 			gfp_t gfp)
@@ -2651,8 +2660,8 @@ static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
 	if (!to_fill)
 		return 0;
 
-	filled = __kmem_cache_alloc_bulk(s, gfp, to_fill,
-					 &sheaf->objects[sheaf->size]);
+	filled = __refill_objects(s, &sheaf->objects[sheaf->size], gfp,
+				  to_fill, to_fill);
 
 	sheaf->size += filled;
 
@@ -3518,6 +3527,57 @@ static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab,
 #endif
 
 static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
+static bool get_partial_node_bulk(struct kmem_cache *s,
+				  struct kmem_cache_node *n,
+				  struct partial_bulk_context *pc)
+{
+	struct slab *slab, *slab2;
+	unsigned int total_free = 0;
+	unsigned long flags;
+
+	/* Racy check to avoid taking the lock unnecessarily. */
+	if (!n || data_race(!n->nr_partial))
+		return false;
+
+	INIT_LIST_HEAD(&pc->slabs);
+
+	spin_lock_irqsave(&n->list_lock, flags);
+
+	list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
+		struct freelist_counters flc;
+		unsigned int slab_free;
+
+		if (!pfmemalloc_match(slab, pc->flags))
+			continue;
+
+		/*
+		 * determine the number of free objects in the slab racily
+		 *
+		 * slab_free is a lower bound due to possible subsequent
+		 * concurrent freeing, so the caller may get more objects than
+		 * requested and must handle that
+		 */
+		flc.counters = data_race(READ_ONCE(slab->counters));
+		slab_free = flc.objects - flc.inuse;
+
+		/* we already have the min and this would get us over the max */
+		if (total_free >= pc->min_objects
+		    && total_free + slab_free > pc->max_objects)
+			break;
+
+		remove_partial(n, slab);
+
+		list_add(&slab->slab_list, &pc->slabs);
+
+		total_free += slab_free;
+		if (total_free >= pc->max_objects)
+			break;
+	}
+
+	spin_unlock_irqrestore(&n->list_lock, flags);
+	return total_free > 0;
+}
+
 /*
  * Try to allocate a partial slab from a specific node.
  */
@@ -4444,6 +4504,33 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
 	return old.freelist;
 }
 
+/*
+ * Get the slab's freelist and do not freeze it.
+ *
+ * Assumes the slab is isolated from the node partial list and not frozen.
+ *
+ * Assumes this is performed only for caches without debugging so we
+ * don't need to worry about adding the slab to the full list.
+ */
+static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *slab)
+{
+	struct freelist_counters old, new;
+
+	do {
+		old.freelist = slab->freelist;
+		old.counters = slab->counters;
+
+		new.freelist = NULL;
+		new.counters = old.counters;
+		VM_WARN_ON_ONCE(new.frozen);
+
+		new.inuse = old.objects;
+
+	} while (!slab_update_freelist(s, slab, &old, &new, "get_freelist_nofreeze"));
+
+	return old.freelist;
+}
+
 /*
  * Freeze the partial slab and return the pointer to the freelist.
  */
@@ -4467,6 +4554,72 @@ static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab)
 	return old.freelist;
 }
 
+/*
+ * If the object has been wiped upon free, make sure it's fully initialized by
+ * zeroing out freelist pointer.
+ *
+ * Note that we also wipe custom freelist pointers.
+ */
+static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
+						   void *obj)
+{
+	if (unlikely(slab_want_init_on_free(s)) && obj &&
+	    !freeptr_outside_object(s))
+		memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
+		       0, sizeof(void *));
+}
+
+static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab,
+		void **p, unsigned int count, bool allow_spin)
+{
+	unsigned int allocated = 0;
+	struct kmem_cache_node *n;
+	bool needs_add_partial;
+	unsigned long flags;
+	void *object;
+
+	/*
+	 * Are we going to put the slab on the partial list?
+	 * Note slab->inuse is 0 on a new slab.
+	 */
+	needs_add_partial = (slab->objects > count);
+
+	if (!allow_spin && needs_add_partial) {
+
+		n = get_node(s, slab_nid(slab));
+
+		if (!spin_trylock_irqsave(&n->list_lock, flags)) {
+			/* Unlucky, discard newly allocated slab */
+			defer_deactivate_slab(slab, NULL);
+			return 0;
+		}
+	}
+
+	object = slab->freelist;
+	while (object && allocated < count) {
+		p[allocated] = object;
+		object = get_freepointer(s, object);
+		maybe_wipe_obj_freeptr(s, p[allocated]);
+
+		slab->inuse++;
+		allocated++;
+	}
+	slab->freelist = object;
+
+	if (needs_add_partial) {
+
+		if (allow_spin) {
+			n = get_node(s, slab_nid(slab));
+			spin_lock_irqsave(&n->list_lock, flags);
+		}
+		add_partial(n, slab, DEACTIVATE_TO_HEAD);
+		spin_unlock_irqrestore(&n->list_lock, flags);
+	}
+
+	inc_slabs_node(s, slab_nid(slab), slab->objects);
+	return allocated;
+}
+
 /*
  * Slow path. The lockless freelist is empty or we need to perform
  * debugging duties.
@@ -4909,21 +5062,6 @@ redo:
 	return object;
 }
 
-/*
- * If the object has been wiped upon free, make sure it's fully initialized by
- * zeroing out freelist pointer.
- *
- * Note that we also wipe custom freelist pointers.
- */
-static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
-						   void *obj)
-{
-	if (unlikely(slab_want_init_on_free(s)) && obj &&
-	    !freeptr_outside_object(s))
-		memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
-		       0, sizeof(void *));
-}
-
 static __fastpath_inline
 struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
 {
@@ -5384,6 +5522,9 @@ static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s,
 	return ret;
 }
 
+static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
+				   size_t size, void **p);
+
 /*
  * returns a sheaf that has at least the requested size
  * when prefilling is needed, do so with given gfp flags
@@ -7497,6 +7638,116 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
 }
 EXPORT_SYMBOL(kmem_cache_free_bulk);
 
+static unsigned int
+__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+		 unsigned int max)
+{
+	struct partial_bulk_context pc;
+	struct slab *slab, *slab2;
+	unsigned int refilled = 0;
+	unsigned long flags;
+	void *object;
+	int node;
+
+	pc.flags = gfp;
+	pc.min_objects = min;
+	pc.max_objects = max;
+
+	node = numa_mem_id();
+
+	if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp)))
+		return 0;
+
+	/* TODO: consider also other nodes? */
+	if (!get_partial_node_bulk(s, get_node(s, node), &pc))
+		goto new_slab;
+
+	list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
+
+		list_del(&slab->slab_list);
+
+		object = get_freelist_nofreeze(s, slab);
+
+		while (object && refilled < max) {
+			p[refilled] = object;
+			object = get_freepointer(s, object);
+			maybe_wipe_obj_freeptr(s, p[refilled]);
+
+			refilled++;
+		}
+
+		/*
+		 * The freelist had more objects than we can accommodate, so we
+		 * need to free them back. We can treat it like a detached
+		 * freelist, just need to find the tail object.
+		 */
+		if (unlikely(object)) {
+			void *head = object;
+			void *tail;
+			int cnt = 0;
+
+			do {
+				tail = object;
+				cnt++;
+				object = get_freepointer(s, object);
+			} while (object);
+			do_slab_free(s, slab, head, tail, cnt, _RET_IP_);
+		}
+
+		if (refilled >= max)
+			break;
+	}
+
+	if (unlikely(!list_empty(&pc.slabs))) {
+		struct kmem_cache_node *n = get_node(s, node);
+
+		spin_lock_irqsave(&n->list_lock, flags);
+
+		list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
+
+			if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial))
+				continue;
+
+			list_del(&slab->slab_list);
+			add_partial(n, slab, DEACTIVATE_TO_HEAD);
+		}
+
+		spin_unlock_irqrestore(&n->list_lock, flags);
+
+		/* any slabs left are completely free and can be discarded */
+		list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
+
+			list_del(&slab->slab_list);
+			discard_slab(s, slab);
+		}
+	}
+
+
+	if (likely(refilled >= min))
+		goto out;
+
+new_slab:
+
+	slab = new_slab(s, pc.flags, node);
+	if (!slab)
+		goto out;
+
+	stat(s, ALLOC_SLAB);
+
+	/*
+	 * TODO: possible optimization - if we know we will consume the whole
+	 * slab we might skip creating the freelist?
+	 */
+	refilled += alloc_from_new_slab(s, slab, p + refilled, max - refilled,
+					/* allow_spin = */ true);
+
+	if (refilled < min)
+		goto new_slab;
+out:
+
+	return refilled;
+}
+
 static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
 					  size_t size, void **p)
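
For illustration (not part of the patch above): the min/max contract of __refill_objects() is that the return value may be anything from 0 up to max, and it can fall short of min when neither the local node's partial slabs nor new slab allocations can satisfy the request, so callers must check the result; refill_sheaf() simply passes to_fill for both bounds. The sketch below shows a hypothetical additional in-file caller; the function name example_refill_batch, its want parameter, and the half-batch minimum are invented for the example and are assumptions, not anything introduced by the patch.

/*
 * Illustrative sketch only, not part of this patch: a hypothetical caller
 * inside mm/slub.c exercising the __refill_objects() min/max contract.
 */
static unsigned int example_refill_batch(struct kmem_cache *s, void **objects,
					 unsigned int want)
{
	unsigned int refilled;

	/*
	 * Accept anything between half a batch and a full batch; the gfp
	 * mask must allow spinning on the node list_lock.
	 */
	refilled = __refill_objects(s, objects, GFP_KERNEL, want / 2, want);

	/* The result may still fall short of the minimum; top up one by one. */
	while (refilled < want) {
		void *obj = kmem_cache_alloc(s, GFP_KERNEL);

		if (!obj)
			break;
		objects[refilled++] = obj;
	}

	return refilled;
}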