8eb8284b41
This patch prepares the slab allocator to handle caches having annotations (useroffset and usersize) defining usercopy regions. This patch is modified from Brad Spengler/PaX Team's PAX_USERCOPY whitelisting code in the last public patch of grsecurity/PaX based on my understanding of the code. Changes or omissions from the original code are mine and don't reflect the original grsecurity/PaX code. Currently, hardened usercopy performs dynamic bounds checking on slab cache objects. This is good, but still leaves a lot of kernel memory available to be copied to/from userspace in the face of bugs. To further restrict what memory is available for copying, this creates a way to whitelist specific areas of a given slab cache object for copying to/from userspace, allowing much finer granularity of access control. Slab caches that are never exposed to userspace can declare no whitelist for their objects, thereby keeping them unavailable to userspace via dynamic copy operations. (Note, an implicit form of whitelisting is the use of constant sizes in usercopy operations and get_user()/put_user(); these bypass hardened usercopy checks since these sizes cannot change at runtime.) To support this whitelist annotation, usercopy region offset and size members are added to struct kmem_cache. The slab allocator receives a new function, kmem_cache_create_usercopy(), that creates a new cache with a usercopy region defined, suitable for declaring spans of fields within the objects that get copied to/from userspace. In this patch, the default kmem_cache_create() marks the entire allocation as whitelisted, leaving it semantically unchanged. Once all fine-grained whitelists have been added (in subsequent patches), this will be changed to a usersize of 0, making caches created with kmem_cache_create() not copyable to/from userspace. 
After the entire usercopy whitelist series is applied, less than 15% of the slab cache memory remains exposed to potential usercopy bugs after a fresh boot: Total Slab Memory: 48074720 Usercopyable Memory: 6367532 13.2% task_struct 0.2% 4480/1630720 RAW 0.3% 300/96000 RAWv6 2.1% 1408/64768 ext4_inode_cache 3.0% 269760/8740224 dentry 11.1% 585984/5273856 mm_struct 29.1% 54912/188448 kmalloc-8 100.0% 24576/24576 kmalloc-16 100.0% 28672/28672 kmalloc-32 100.0% 81920/81920 kmalloc-192 100.0% 96768/96768 kmalloc-128 100.0% 143360/143360 names_cache 100.0% 163840/163840 kmalloc-64 100.0% 167936/167936 kmalloc-256 100.0% 339968/339968 kmalloc-512 100.0% 350720/350720 kmalloc-96 100.0% 455616/455616 kmalloc-8192 100.0% 655360/655360 kmalloc-1024 100.0% 812032/812032 kmalloc-4096 100.0% 819200/819200 kmalloc-2048 100.0% 1310720/1310720 After some kernel build workloads, the percentage (mainly driven by dentry and inode caches expanding) drops under 10%: Total Slab Memory: 95516184 Usercopyable Memory: 8497452 8.8% task_struct 0.2% 4000/1456000 RAW 0.3% 300/96000 RAWv6 2.1% 1408/64768 ext4_inode_cache 3.0% 1217280/39439872 dentry 11.1% 1623200/14608800 mm_struct 29.1% 73216/251264 kmalloc-8 100.0% 24576/24576 kmalloc-16 100.0% 28672/28672 kmalloc-32 100.0% 94208/94208 kmalloc-192 100.0% 96768/96768 kmalloc-128 100.0% 143360/143360 names_cache 100.0% 163840/163840 kmalloc-64 100.0% 245760/245760 kmalloc-256 100.0% 339968/339968 kmalloc-512 100.0% 350720/350720 kmalloc-96 100.0% 563520/563520 kmalloc-8192 100.0% 655360/655360 kmalloc-1024 100.0% 794624/794624 kmalloc-4096 100.0% 819200/819200 kmalloc-2048 100.0% 1257472/1257472 Signed-off-by: David Windsor <dave@nullcore.net> [kees: adjust commit log, split out a few extra kmalloc hunks] [kees: add field names to function declarations] [kees: convert BUGs to WARNs and fail closed] [kees: add attack surface reduction analysis to commit log] Cc: Pekka Enberg <penberg@kernel.org> Cc: David Rientjes <rientjes@google.com> Cc: 
Joonsoo Kim <iamjoonsoo.kim@lge.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: linux-mm@kvack.org Cc: linux-xfs@vger.kernel.org Signed-off-by: Kees Cook <keescook@chromium.org> Acked-by: Christoph Lameter <cl@linux.com>
180 lines
5.4 KiB
C
180 lines
5.4 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _LINUX_SLUB_DEF_H
|
|
#define _LINUX_SLUB_DEF_H
|
|
|
|
/*
 * SLUB : A Slab allocator without object queues.
 *
 * (C) 2007 SGI, Christoph Lameter
 */
|
|
#include <linux/kobject.h>
|
|
|
|
/*
 * SLUB event counters. Each enumerator indexes one slot of
 * kmem_cache_cpu.stat[] (present only with CONFIG_SLUB_STATS).
 * NR_SLUB_STAT_ITEMS must remain the last enumerator: it is the number of
 * counters and sizes that array.
 */
enum stat_item {
	ALLOC_FASTPATH,		/* Allocation from cpu slab */
	ALLOC_SLOWPATH,		/* Allocation by getting a new cpu slab */
	FREE_FASTPATH,		/* Free to cpu slab */
	FREE_SLOWPATH,		/* Freeing not to cpu slab */
	FREE_FROZEN,		/* Freeing to frozen slab */
	FREE_ADD_PARTIAL,	/* Freeing moves slab to partial list */
	FREE_REMOVE_PARTIAL,	/* Freeing removes last object */
	ALLOC_FROM_PARTIAL,	/* Cpu slab acquired from node partial list */
	ALLOC_SLAB,		/* Cpu slab acquired from page allocator */
	ALLOC_REFILL,		/* Refill cpu slab from slab freelist */
	ALLOC_NODE_MISMATCH,	/* Switching cpu slab */
	FREE_SLAB,		/* Slab freed to the page allocator */
	CPUSLAB_FLUSH,		/* Abandoning of the cpu slab */
	DEACTIVATE_FULL,	/* Cpu slab was full when deactivated */
	DEACTIVATE_EMPTY,	/* Cpu slab was empty when deactivated */
	DEACTIVATE_TO_HEAD,	/* Cpu slab was moved to the head of partials */
	DEACTIVATE_TO_TAIL,	/* Cpu slab was moved to the tail of partials */
	DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */
	DEACTIVATE_BYPASS,	/* Implicit deactivation */
	ORDER_FALLBACK,		/* Number of times fallback was necessary */
	CMPXCHG_DOUBLE_CPU_FAIL,/* Failure of this_cpu_cmpxchg_double */
	CMPXCHG_DOUBLE_FAIL,	/* Number of times that cmpxchg double did not match */
	CPU_PARTIAL_ALLOC,	/* Used cpu partial on alloc */
	CPU_PARTIAL_FREE,	/* Refill cpu partial on free */
	CPU_PARTIAL_NODE,	/* Refill cpu partial from node partial */
	CPU_PARTIAL_DRAIN,	/* Drain cpu partial to node partial */
	NR_SLUB_STAT_ITEMS };
|
|
|
|
/*
 * Per-cpu allocation state of a slab cache; struct kmem_cache points at it
 * through its __percpu cpu_slab member.
 *
 * The partial pointer exists only with CONFIG_SLUB_CPU_PARTIAL and the
 * stat[] counters (indexed by enum stat_item) only with CONFIG_SLUB_STATS.
 */
struct kmem_cache_cpu {
	void **freelist;	/* Pointer to next available object */
	unsigned long tid;	/* Globally unique transaction id */
	struct page *page;	/* The slab from which we are allocating */
#ifdef CONFIG_SLUB_CPU_PARTIAL
	struct page *partial;	/* Partially allocated frozen slabs */
#endif
#ifdef CONFIG_SLUB_STATS
	unsigned int stat[NR_SLUB_STAT_ITEMS];
#endif
};
|
|
|
|
/*
 * Accessors for kmem_cache_cpu.partial. That member only exists with
 * CONFIG_SLUB_CPU_PARTIAL; without it the reads evaluate to NULL and the
 * setter expands to nothing, so callers need no #ifdef of their own.
 */
#ifdef CONFIG_SLUB_CPU_PARTIAL
#define slub_percpu_partial(c)		((c)->partial)

/* Store (p)->next, i.e. whatever follows @p, as the new partial pointer. */
#define slub_set_percpu_partial(c, p)		\
({						\
	slub_percpu_partial(c) = (p)->next;	\
})

#define slub_percpu_partial_read_once(c)	READ_ONCE(slub_percpu_partial(c))
#else
#define slub_percpu_partial(c)		NULL

#define slub_set_percpu_partial(c, p)

#define slub_percpu_partial_read_once(c)	NULL
#endif /* CONFIG_SLUB_CPU_PARTIAL */
|
|
|
|
/*
 * Word size structure that can be atomically updated or read and that
 * contains both the order and the number of objects that a slab of the
 * given order would contain.
 */
struct kmem_cache_order_objects {
	unsigned long x;	/* order and object count packed into one word */
};
|
|
|
|
/*
|
|
* Slab cache management.
|
|
*/
|
|
struct kmem_cache {
|
|
struct kmem_cache_cpu __percpu *cpu_slab;
|
|
/* Used for retriving partial slabs etc */
|
|
slab_flags_t flags;
|
|
unsigned long min_partial;
|
|
int size; /* The size of an object including meta data */
|
|
int object_size; /* The size of an object without meta data */
|
|
int offset; /* Free pointer offset. */
|
|
#ifdef CONFIG_SLUB_CPU_PARTIAL
|
|
int cpu_partial; /* Number of per cpu partial objects to keep around */
|
|
#endif
|
|
struct kmem_cache_order_objects oo;
|
|
|
|
/* Allocation and freeing of slabs */
|
|
struct kmem_cache_order_objects max;
|
|
struct kmem_cache_order_objects min;
|
|
gfp_t allocflags; /* gfp flags to use on each alloc */
|
|
int refcount; /* Refcount for slab cache destroy */
|
|
void (*ctor)(void *);
|
|
int inuse; /* Offset to metadata */
|
|
int align; /* Alignment */
|
|
int reserved; /* Reserved bytes at the end of slabs */
|
|
int red_left_pad; /* Left redzone padding size */
|
|
const char *name; /* Name (only for display!) */
|
|
struct list_head list; /* List of slab caches */
|
|
#ifdef CONFIG_SYSFS
|
|
struct kobject kobj; /* For sysfs */
|
|
struct work_struct kobj_remove_work;
|
|
#endif
|
|
#ifdef CONFIG_MEMCG
|
|
struct memcg_cache_params memcg_params;
|
|
int max_attr_size; /* for propagation, maximum size of a stored attr */
|
|
#ifdef CONFIG_SYSFS
|
|
struct kset *memcg_kset;
|
|
#endif
|
|
#endif
|
|
|
|
#ifdef CONFIG_SLAB_FREELIST_HARDENED
|
|
unsigned long random;
|
|
#endif
|
|
|
|
#ifdef CONFIG_NUMA
|
|
/*
|
|
* Defragmentation by allocating from a remote node.
|
|
*/
|
|
int remote_node_defrag_ratio;
|
|
#endif
|
|
|
|
#ifdef CONFIG_SLAB_FREELIST_RANDOM
|
|
unsigned int *random_seq;
|
|
#endif
|
|
|
|
#ifdef CONFIG_KASAN
|
|
struct kasan_cache kasan_info;
|
|
#endif
|
|
|
|
size_t useroffset; /* Usercopy region offset */
|
|
size_t usersize; /* Usercopy region size */
|
|
|
|
struct kmem_cache_node *node[MAX_NUMNODES];
|
|
};
|
|
|
|
/*
 * Accessors for kmem_cache.cpu_partial. That member only exists with
 * CONFIG_SLUB_CPU_PARTIAL; without it the getter evaluates to 0 and the
 * setter expands to nothing.
 */
#ifdef CONFIG_SLUB_CPU_PARTIAL
#define slub_cpu_partial(s)		((s)->cpu_partial)
#define slub_set_cpu_partial(s, n)		\
({						\
	slub_cpu_partial(s) = (n);		\
})
#else
#define slub_cpu_partial(s)		(0)
#define slub_set_cpu_partial(s, n)
#endif /* CONFIG_SLUB_CPU_PARTIAL */
|
|
|
|
/*
 * With CONFIG_SYSFS, SLAB_SUPPORTS_SYSFS is defined and
 * sysfs_slab_release() is only declared here. Without it, an empty inline
 * stub is supplied so callers do not need their own #ifdef.
 */
#ifdef CONFIG_SYSFS
#define SLAB_SUPPORTS_SYSFS
void sysfs_slab_release(struct kmem_cache *);
#else
static inline void sysfs_slab_release(struct kmem_cache *s)
{
}
#endif
|
|
|
|
void object_err(struct kmem_cache *s, struct page *page,
|
|
u8 *object, char *reason);
|
|
|
|
/*
 * Only declared in this header. nearest_obj() below passes its candidate
 * object pointer through this function and returns the result.
 */
void *fixup_red_left(struct kmem_cache *s, void *p);
|
|
|
|
static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
|
|
void *x) {
|
|
void *object = x - (x - page_address(page)) % cache->size;
|
|
void *last_object = page_address(page) +
|
|
(page->objects - 1) * cache->size;
|
|
void *result = (unlikely(object > last_object)) ? last_object : object;
|
|
|
|
result = fixup_red_left(cache, result);
|
|
return result;
|
|
}
|
|
|
|
#endif /* _LINUX_SLUB_DEF_H */
|