From aa4bf44dc851c6bdd4f7b61b5f2c56c84dfe2ff0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 25 Oct 2017 00:04:40 +0200 Subject: [PATCH 1/7] userns: use union in {g,u}idmap struct - Add a struct containing two pointer to extents and wrap both the static extent array and the struct into a union. This is done in preparation for bumping the {g,u}idmap limits for user namespaces. - Add brackets around anonymous union when using designated initializers to initialize members in order to please gcc <= 4.4. Signed-off-by: Christian Brauner Signed-off-by: Eric W. Biederman --- include/linux/user_namespace.h | 18 +++++++++++++----- kernel/user.c | 30 ++++++++++++++++++------------ 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index c18e01252346..7c83d7f6289b 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -12,13 +12,21 @@ #define UID_GID_MAP_MAX_EXTENTS 5 +struct uid_gid_extent { + u32 first; + u32 lower_first; + u32 count; +}; + struct uid_gid_map { /* 64 bytes -- 1 cache line */ u32 nr_extents; - struct uid_gid_extent { - u32 first; - u32 lower_first; - u32 count; - } extent[UID_GID_MAP_MAX_EXTENTS]; + union { + struct uid_gid_extent extent[UID_GID_MAP_MAX_EXTENTS]; + struct { + struct uid_gid_extent *forward; + struct uid_gid_extent *reverse; + }; + }; }; #define USERNS_SETGROUPS_ALLOWED 1UL diff --git a/kernel/user.c b/kernel/user.c index 00281add65b2..9a20acce460d 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -26,26 +26,32 @@ struct user_namespace init_user_ns = { .uid_map = { .nr_extents = 1, - .extent[0] = { - .first = 0, - .lower_first = 0, - .count = 4294967295U, + { + .extent[0] = { + .first = 0, + .lower_first = 0, + .count = 4294967295U, + }, }, }, .gid_map = { .nr_extents = 1, - .extent[0] = { - .first = 0, - .lower_first = 0, - .count = 4294967295U, + { + .extent[0] = { + .first = 0, + .lower_first = 0, + .count = 4294967295U, + }, }, }, .projid_map = { .nr_extents = 1, - .extent[0] = { - .first = 0, - .lower_first = 0, - .count = 4294967295U, + { + .extent[0] = { + .first = 0, + .lower_first = 0, + .count = 4294967295U, + }, }, }, .count = ATOMIC_INIT(3), From 6397fac4915ab3002dc15aae751455da1a852f25 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 25 Oct 2017 00:04:41 +0200 Subject: [PATCH 2/7] userns: bump idmap limits to 340 There are quite some use cases where users run into the current limit for {g,u}id mappings. Consider a user requesting us to map everything but 999, and 1001 for a given range of 1000000000 with a sub{g,u}id layout of: some-user:100000:1000000000 some-user:999:1 some-user:1000:1 some-user:1001:1 some-user:1002:1 This translates to: MAPPING-TYPE | CONTAINER | HOST | RANGE | -------------|-----------|---------|-----------| uid | 999 | 999 | 1 | uid | 1001 | 1001 | 1 | uid | 0 | 1000000 | 999 | uid | 1000 | 1001000 | 1 | uid | 1002 | 1001002 | 999998998 | ------------------------------------------------ gid | 999 | 999 | 1 | gid | 1001 | 1001 | 1 | gid | 0 | 1000000 | 999 | gid | 1000 | 1001000 | 1 | gid | 1002 | 1001002 | 999998998 | which is already the current limit. As discussed at LPC simply bumping the number of limits is not going to work since this would mean that struct uid_gid_map won't fit into a single cache-line anymore thereby regressing performance for the base-cases. The same problem seems to arise when using a single pointer. So the idea is to use struct uid_gid_extent { u32 first; u32 lower_first; u32 count; }; struct uid_gid_map { /* 64 bytes -- 1 cache line */ u32 nr_extents; union { struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS]; struct { struct uid_gid_extent *forward; struct uid_gid_extent *reverse; }; }; }; For the base cases we will only use the struct uid_gid_extent extent member. If we go over UID_GID_MAP_MAX_BASE_EXTENTS mappings we perform a single 4k kmalloc() which means we can have a maximum of 340 mappings (340 * size(struct uid_gid_extent) = 4080). For the latter case we use two pointers "forward" and "reverse". The forward pointer points to an array sorted by "first" and the reverse pointer points to an array sorted by "lower_first". We can then perform binary search on those arrays. Performance Testing: When Eric introduced the extent-based struct uid_gid_map approach he measured the performanc impact of his idmap changes: > My benchmark consisted of going to single user mode where nothing else was > running. On an ext4 filesystem opening 1,000,000 files and looping through all > of the files 1000 times and calling fstat on the individuals files. This was > to ensure I was benchmarking stat times where the inodes were in the kernels > cache, but the inode values were not in the processors cache. My results: > v3.4-rc1: ~= 156ns (unmodified v3.4-rc1 with user namespace support disabled) > v3.4-rc1-userns-: ~= 155ns (v3.4-rc1 with my user namespace patches and user namespace support disabled) > v3.4-rc1-userns+: ~= 164ns (v3.4-rc1 with my user namespace patches and user namespace support enabled) I used an identical approach on my laptop. Here's a thorough description of what I did. I built a 4.14.0-rc4 mainline kernel with my new idmap patches applied. I booted into single user mode and used an ext4 filesystem to open/create 1,000,000 files. Then I looped through all of the files calling fstat() on each of them 1000 times and calculated the mean fstat() time for a single file. (The test program can be found below.) Here are the results. For fun, I compared the first version of my patch which scaled linearly with the new version of the patch: | # MAPPINGS | PATCH-V1 | PATCH-NEW | |--------------|------------|-----------| | 0 mappings | 158 ns | 158 ns | | 1 mappings | 164 ns | 157 ns | | 2 mappings | 170 ns | 158 ns | | 3 mappings | 175 ns | 161 ns | | 5 mappings | 187 ns | 165 ns | | 10 mappings | 218 ns | 199 ns | | 50 mappings | 528 ns | 218 ns | | 100 mappings | 980 ns | 229 ns | | 200 mappings | 1880 ns | 239 ns | | 300 mappings | 2760 ns | 240 ns | | 340 mappings | not tested | 248 ns | Here's the test program I used. I asked Eric what he did and this is a more "advanced" implementation of the idea. It's pretty straight-forward: #define __GNU_SOURCE #define __STDC_FORMAT_MACROS #include #include #include #include #include #include #include #include #include #include #include int main(int argc, char *argv[]) { int ret; size_t i, k; int fd[1000000]; int times[1000]; char pathname[4096]; struct stat st; struct timeval t1, t2; uint64_t time_in_mcs; uint64_t sum = 0; if (argc != 2) { fprintf(stderr, "Please specify a directory where to create " "the test files\n"); exit(EXIT_FAILURE); } for (i = 0; i < sizeof(fd) / sizeof(fd[0]); i++) { sprintf(pathname, "%s/idmap_test_%zu", argv[1], i); fd[i]= open(pathname, O_RDWR | O_CREAT, S_IXUSR | S_IXGRP | S_IXOTH); if (fd[i] < 0) { ssize_t j; for (j = i; j >= 0; j--) close(fd[j]); exit(EXIT_FAILURE); } } for (k = 0; k < 1000; k++) { ret = gettimeofday(&t1, NULL); if (ret < 0) goto close_all; for (i = 0; i < sizeof(fd) / sizeof(fd[0]); i++) { ret = fstat(fd[i], &st); if (ret < 0) goto close_all; } ret = gettimeofday(&t2, NULL); if (ret < 0) goto close_all; time_in_mcs = (1000000 * t2.tv_sec + t2.tv_usec) - (1000000 * t1.tv_sec + t1.tv_usec); printf("Total time in micro seconds: %" PRIu64 "\n", time_in_mcs); printf("Total time in nanoseconds: %" PRIu64 "\n", time_in_mcs * 1000); printf("Time per file in nanoseconds: %" PRIu64 "\n", (time_in_mcs * 1000) / 1000000); times[k] = (time_in_mcs * 1000) / 1000000; } close_all: for (i = 0; i < sizeof(fd) / sizeof(fd[0]); i++) close(fd[i]); if (ret < 0) exit(EXIT_FAILURE); for (k = 0; k < 1000; k++) { sum += times[k]; } printf("Mean time per file in nanoseconds: %" PRIu64 "\n", sum / 1000); exit(EXIT_SUCCESS);; } Signed-off-by: Christian Brauner CC: Serge Hallyn CC: Eric Biederman Signed-off-by: Eric W. Biederman --- include/linux/user_namespace.h | 7 +- kernel/user_namespace.c | 350 ++++++++++++++++++++++++++++++--- 2 files changed, 324 insertions(+), 33 deletions(-) diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 7c83d7f6289b..1c1046a60fb4 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -10,7 +10,8 @@ #include #include -#define UID_GID_MAP_MAX_EXTENTS 5 +#define UID_GID_MAP_MAX_BASE_EXTENTS 5 +#define UID_GID_MAP_MAX_EXTENTS 340 struct uid_gid_extent { u32 first; @@ -18,10 +19,10 @@ struct uid_gid_extent { u32 count; }; -struct uid_gid_map { /* 64 bytes -- 1 cache line */ +struct uid_gid_map { /* 64 bytes -- 1 cache line */ u32 nr_extents; union { - struct uid_gid_extent extent[UID_GID_MAP_MAX_EXTENTS]; + struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS]; struct { struct uid_gid_extent *forward; struct uid_gid_extent *reverse; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index c490f1e4313b..5fd2d53dbc75 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -23,6 +23,8 @@ #include #include #include +#include +#include static struct kmem_cache *user_ns_cachep __read_mostly; static DEFINE_MUTEX(userns_state_mutex); @@ -181,6 +183,18 @@ static void free_user_ns(struct work_struct *work) do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; + if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(ns->gid_map.forward); + kfree(ns->gid_map.reverse); + } + if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(ns->uid_map.forward); + kfree(ns->uid_map.reverse); + } + if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(ns->projid_map.forward); + kfree(ns->projid_map.reverse); + } retire_userns_sysctls(ns); #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); @@ -198,7 +212,84 @@ void __put_user_ns(struct user_namespace *ns) } EXPORT_SYMBOL(__put_user_ns); -static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) +/** + * idmap_key struct holds the information necessary to find an idmapping in a + * sorted idmap array. It is passed to cmp_map_id() as first argument. + */ +struct idmap_key { + bool map_up; /* true -> id from kid; false -> kid from id */ + u32 id; /* id to find */ + u32 count; /* == 0 unless used with map_id_range_down() */ +}; + +/** + * cmp_map_id - Function to be passed to bsearch() to find the requested + * idmapping. Expects struct idmap_key to be passed via @k. + */ +static int cmp_map_id(const void *k, const void *e) +{ + u32 first, last, id2; + const struct idmap_key *key = k; + const struct uid_gid_extent *el = e; + + /* handle map_id_range_down() */ + if (key->count) + id2 = key->id + key->count - 1; + else + id2 = key->id; + + /* handle map_id_{down,up}() */ + if (key->map_up) + first = el->lower_first; + else + first = el->first; + + last = first + el->count - 1; + + if (key->id >= first && key->id <= last && + (id2 >= first && id2 <= last)) + return 0; + + if (key->id < first || id2 < first) + return -1; + + return 1; +} + +/** + * map_id_range_down_max - Find idmap via binary search in ordered idmap array. + * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static u32 map_id_range_down_max(struct uid_gid_map *map, u32 id, u32 count) +{ + u32 extents; + struct uid_gid_extent *extent; + struct idmap_key key; + + key.map_up = false; + key.count = count; + key.id = id; + + extents = map->nr_extents; + smp_rmb(); + + extent = bsearch(&key, map->forward, extents, + sizeof(struct uid_gid_extent), cmp_map_id); + /* Map the id or note failure */ + if (extent) + id = (id - extent->first) + extent->lower_first; + else + id = (u32) -1; + + return id; +} + +/** + * map_id_range_down_base - Find idmap via binary search in static extent array. + * Can only be called if number of mappings is equal or less than + * UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static u32 map_id_range_down_base(struct uid_gid_map *map, u32 id, u32 count) { unsigned idx, extents; u32 first, last, id2; @@ -224,7 +315,23 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) return id; } -static u32 map_id_down(struct uid_gid_map *map, u32 id) +static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) +{ + u32 extents = map->nr_extents; + smp_rmb(); + + if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return map_id_range_down_base(map, id, count); + + return map_id_range_down_max(map, id, count); +} + +/** + * map_id_down_base - Find idmap via binary search in static extent array. + * Can only be called if number of mappings is equal or less than + * UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static u32 map_id_down_base(struct uid_gid_map *map, u32 id) { unsigned idx, extents; u32 first, last; @@ -247,7 +354,23 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id) return id; } -static u32 map_id_up(struct uid_gid_map *map, u32 id) +static u32 map_id_down(struct uid_gid_map *map, u32 id) +{ + u32 extents = map->nr_extents; + smp_rmb(); + + if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return map_id_down_base(map, id); + + return map_id_range_down_max(map, id, 0); +} + +/** + * map_id_up_base - Find idmap via binary search in static extent array. + * Can only be called if number of mappings is equal or less than + * UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static u32 map_id_up_base(struct uid_gid_map *map, u32 id) { unsigned idx, extents; u32 first, last; @@ -270,6 +393,45 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id) return id; } +/** + * map_id_up_max - Find idmap via binary search in ordered idmap array. + * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static u32 map_id_up_max(struct uid_gid_map *map, u32 id) +{ + u32 extents; + struct uid_gid_extent *extent; + struct idmap_key key; + + key.map_up = true; + key.count = 0; + key.id = id; + + extents = map->nr_extents; + smp_rmb(); + + extent = bsearch(&key, map->reverse, extents, + sizeof(struct uid_gid_extent), cmp_map_id); + /* Map the id or note failure */ + if (extent) + id = (id - extent->lower_first) + extent->first; + else + id = (u32) -1; + + return id; +} + +static u32 map_id_up(struct uid_gid_map *map, u32 id) +{ + u32 extents = map->nr_extents; + smp_rmb(); + + if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return map_id_up_base(map, id); + + return map_id_up_max(map, id); +} + /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in @@ -540,13 +702,15 @@ static int projid_m_show(struct seq_file *seq, void *v) static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { - struct uid_gid_extent *extent = NULL; loff_t pos = *ppos; - if (pos < map->nr_extents) - extent = &map->extent[pos]; + if (pos >= map->nr_extents) + return NULL; - return extent; + if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return &map->extent[pos]; + + return &map->forward[pos]; } static void *uid_m_start(struct seq_file *seq, loff_t *ppos) @@ -618,7 +782,10 @@ static bool mappings_overlap(struct uid_gid_map *new_map, u32 prev_upper_last, prev_lower_last; struct uid_gid_extent *prev; - prev = &new_map->extent[idx]; + if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + prev = &new_map->extent[idx]; + else + prev = &new_map->forward[idx]; prev_upper_first = prev->first; prev_lower_first = prev->lower_first; @@ -638,6 +805,102 @@ static bool mappings_overlap(struct uid_gid_map *new_map, return false; } +/** + * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. + * Takes care to allocate a 4K block of memory if the number of mappings exceeds + * UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) +{ + if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) { + map->extent[map->nr_extents].first = extent->first; + map->extent[map->nr_extents].lower_first = extent->lower_first; + map->extent[map->nr_extents].count = extent->count; + return 0; + } + + if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) { + struct uid_gid_extent *forward; + + /* Allocate memory for 340 mappings. */ + forward = kmalloc(sizeof(struct uid_gid_extent) * + UID_GID_MAP_MAX_EXTENTS, GFP_KERNEL); + if (!forward) + return -ENOMEM; + + /* Copy over memory. Only set up memory for the forward pointer. + * Defer the memory setup for the reverse pointer. + */ + memcpy(forward, map->extent, + map->nr_extents * sizeof(map->extent[0])); + + map->forward = forward; + map->reverse = NULL; + } + + map->forward[map->nr_extents].first = extent->first; + map->forward[map->nr_extents].lower_first = extent->lower_first; + map->forward[map->nr_extents].count = extent->count; + return 0; +} + +/* cmp function to sort() forward mappings */ +static int cmp_extents_forward(const void *a, const void *b) +{ + const struct uid_gid_extent *e1 = a; + const struct uid_gid_extent *e2 = b; + + if (e1->first < e2->first) + return -1; + + if (e1->first > e2->first) + return 1; + + return 0; +} + +/* cmp function to sort() reverse mappings */ +static int cmp_extents_reverse(const void *a, const void *b) +{ + const struct uid_gid_extent *e1 = a; + const struct uid_gid_extent *e2 = b; + + if (e1->lower_first < e2->lower_first) + return -1; + + if (e1->lower_first > e2->lower_first) + return 1; + + return 0; +} + +/** + * sort_idmaps - Sorts an array of idmap entries. + * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static int sort_idmaps(struct uid_gid_map *map) +{ + if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return 0; + + /* Sort forward array. */ + sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), + cmp_extents_forward, NULL); + + /* Only copy the memory from forward we actually need. */ + map->reverse = kmemdup(map->forward, + map->nr_extents * sizeof(struct uid_gid_extent), + GFP_KERNEL); + if (!map->reverse) + return -ENOMEM; + + /* Sort reverse array. */ + sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent), + cmp_extents_reverse, NULL); + + return 0; +} + static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, @@ -648,7 +911,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, struct user_namespace *ns = seq->private; struct uid_gid_map new_map; unsigned idx; - struct uid_gid_extent *extent = NULL; + struct uid_gid_extent extent; char *kbuf = NULL, *pos, *next_line; ssize_t ret = -EINVAL; @@ -673,6 +936,8 @@ static ssize_t map_write(struct file *file, const char __user *buf, */ mutex_lock(&userns_state_mutex); + memset(&new_map, 0, sizeof(struct uid_gid_map)); + ret = -EPERM; /* Only allow one successful write to the map */ if (map->nr_extents != 0) @@ -700,9 +965,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, /* Parse the user data */ ret = -EINVAL; pos = kbuf; - new_map.nr_extents = 0; for (; pos; pos = next_line) { - extent = &new_map.extent[new_map.nr_extents]; /* Find the end of line and ensure I don't look past it */ next_line = strchr(pos, '\n'); @@ -714,17 +977,17 @@ static ssize_t map_write(struct file *file, const char __user *buf, } pos = skip_spaces(pos); - extent->first = simple_strtoul(pos, &pos, 10); + extent.first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); - extent->lower_first = simple_strtoul(pos, &pos, 10); + extent.lower_first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); - extent->count = simple_strtoul(pos, &pos, 10); + extent.count = simple_strtoul(pos, &pos, 10); if (*pos && !isspace(*pos)) goto out; @@ -734,29 +997,33 @@ static ssize_t map_write(struct file *file, const char __user *buf, goto out; /* Verify we have been given valid starting values */ - if ((extent->first == (u32) -1) || - (extent->lower_first == (u32) -1)) + if ((extent.first == (u32) -1) || + (extent.lower_first == (u32) -1)) goto out; /* Verify count is not zero and does not cause the * extent to wrap */ - if ((extent->first + extent->count) <= extent->first) + if ((extent.first + extent.count) <= extent.first) goto out; - if ((extent->lower_first + extent->count) <= - extent->lower_first) + if ((extent.lower_first + extent.count) <= + extent.lower_first) goto out; /* Do the ranges in extent overlap any previous extents? */ - if (mappings_overlap(&new_map, extent)) + if (mappings_overlap(&new_map, &extent)) goto out; - new_map.nr_extents++; - - /* Fail if the file contains too many extents */ - if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) && + if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS && (next_line != NULL)) goto out; + + ret = insert_extent(&new_map, &extent); + if (ret < 0) + goto out; + ret = -EINVAL; + + new_map.nr_extents++; } /* Be very certaint the new map actually exists */ if (new_map.nr_extents == 0) @@ -767,16 +1034,26 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (!new_idmap_permitted(file, ns, cap_setid, &new_map)) goto out; + ret = sort_idmaps(&new_map); + if (ret < 0) + goto out; + + ret = -EPERM; /* Map the lower ids from the parent user namespace to the * kernel global id space. */ for (idx = 0; idx < new_map.nr_extents; idx++) { + struct uid_gid_extent *e; u32 lower_first; - extent = &new_map.extent[idx]; + + if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + e = &new_map.extent[idx]; + else + e = &new_map.forward[idx]; lower_first = map_id_range_down(parent_map, - extent->lower_first, - extent->count); + e->lower_first, + e->count); /* Fail if we can not map the specified extent to * the kernel global id space. @@ -784,18 +1061,31 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (lower_first == (u32) -1) goto out; - extent->lower_first = lower_first; + e->lower_first = lower_first; } /* Install the map */ - memcpy(map->extent, new_map.extent, - new_map.nr_extents*sizeof(new_map.extent[0])); + if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { + memcpy(map->extent, new_map.extent, + new_map.nr_extents * sizeof(new_map.extent[0])); + } else { + map->forward = new_map.forward; + map->reverse = new_map.reverse; + } smp_wmb(); map->nr_extents = new_map.nr_extents; *ppos = count; ret = count; out: + if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(new_map.forward); + kfree(new_map.reverse); + map->forward = NULL; + map->reverse = NULL; + map->nr_extents = 0; + } + mutex_unlock(&userns_state_mutex); kfree(kbuf); return ret; From 11a8b9270e16e36d5fb607ba4b60db2958b7c625 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 31 Oct 2017 15:54:32 -0500 Subject: [PATCH 3/7] userns: Don't special case a count of 0 We can always use a count of 1 so there is no reason to have a special case of a count of 0. Signed-off-by: Eric W. Biederman --- kernel/user_namespace.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 5fd2d53dbc75..c9904ee084c4 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -232,11 +232,7 @@ static int cmp_map_id(const void *k, const void *e) const struct idmap_key *key = k; const struct uid_gid_extent *el = e; - /* handle map_id_range_down() */ - if (key->count) - id2 = key->id + key->count - 1; - else - id2 = key->id; + id2 = key->id + key->count - 1; /* handle map_id_{down,up}() */ if (key->map_up) @@ -362,7 +358,7 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id) if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return map_id_down_base(map, id); - return map_id_range_down_max(map, id, 0); + return map_id_range_down_max(map, id, 1); } /** @@ -404,7 +400,7 @@ static u32 map_id_up_max(struct uid_gid_map *map, u32 id) struct idmap_key key; key.map_up = true; - key.count = 0; + key.count = 1; key.id = id; extents = map->nr_extents; From 3edf652fa16562fb57a5a4b996ba72e2d7cdc38b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 31 Oct 2017 16:27:29 -0500 Subject: [PATCH 4/7] userns: Simplify the user and group mapping functions Consolidate reading the number of extents and computing the return value in the map_id_down, map_id_range_down and map_id_range. This removal of one read of extents makes one smp_rmb unnecessary and makes the code safe it is executed during the map write. Reading the number of extents twice and depending on the result being the same is not safe, as it could be 0 the first time and > 5 the second time, which would lead to misinterpreting the union fields. The consolidation of the return value just removes a duplicate caluculation which should make it easier to understand and maintain the code. Signed-off-by: "Eric W. Biederman" --- kernel/user_namespace.c | 168 ++++++++++++++++++---------------------- 1 file changed, 76 insertions(+), 92 deletions(-) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index c9904ee084c4..563a2981d7c7 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -256,21 +256,54 @@ static int cmp_map_id(const void *k, const void *e) * map_id_range_down_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ -static u32 map_id_range_down_max(struct uid_gid_map *map, u32 id, u32 count) +static struct uid_gid_extent * +map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { - u32 extents; - struct uid_gid_extent *extent; struct idmap_key key; key.map_up = false; key.count = count; key.id = id; - extents = map->nr_extents; + return bsearch(&key, map->forward, extents, + sizeof(struct uid_gid_extent), cmp_map_id); +} + +/** + * map_id_range_down_base - Find idmap via binary search in static extent array. + * Can only be called if number of mappings is equal or less than + * UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static struct uid_gid_extent * +map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) +{ + unsigned idx; + u32 first, last, id2; + + id2 = id + count - 1; + + /* Find the matching extent */ + for (idx = 0; idx < extents; idx++) { + first = map->extent[idx].first; + last = first + map->extent[idx].count - 1; + if (id >= first && id <= last && + (id2 >= first && id2 <= last)) + return &map->extent[idx]; + } + return NULL; +} + +static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) +{ + struct uid_gid_extent *extent; + unsigned extents = map->nr_extents; smp_rmb(); - extent = bsearch(&key, map->forward, extents, - sizeof(struct uid_gid_extent), cmp_map_id); + if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + extent = map_id_range_down_base(extents, map, id, count); + else + extent = map_id_range_down_max(extents, map, id, count); + /* Map the id or note failure */ if (extent) id = (id - extent->first) + extent->lower_first; @@ -280,85 +313,45 @@ static u32 map_id_range_down_max(struct uid_gid_map *map, u32 id, u32 count) return id; } -/** - * map_id_range_down_base - Find idmap via binary search in static extent array. - * Can only be called if number of mappings is equal or less than - * UID_GID_MAP_MAX_BASE_EXTENTS. - */ -static u32 map_id_range_down_base(struct uid_gid_map *map, u32 id, u32 count) -{ - unsigned idx, extents; - u32 first, last, id2; - - id2 = id + count - 1; - - /* Find the matching extent */ - extents = map->nr_extents; - smp_rmb(); - for (idx = 0; idx < extents; idx++) { - first = map->extent[idx].first; - last = first + map->extent[idx].count - 1; - if (id >= first && id <= last && - (id2 >= first && id2 <= last)) - break; - } - /* Map the id or note failure */ - if (idx < extents) - id = (id - first) + map->extent[idx].lower_first; - else - id = (u32) -1; - - return id; -} - -static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) -{ - u32 extents = map->nr_extents; - smp_rmb(); - - if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) - return map_id_range_down_base(map, id, count); - - return map_id_range_down_max(map, id, count); -} - /** * map_id_down_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ -static u32 map_id_down_base(struct uid_gid_map *map, u32 id) +static struct uid_gid_extent * +map_id_down_base(unsigned extents, struct uid_gid_map *map, u32 id) { - unsigned idx, extents; + unsigned idx; u32 first, last; /* Find the matching extent */ - extents = map->nr_extents; - smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last) - break; + return &map->extent[idx]; } - /* Map the id or note failure */ - if (idx < extents) - id = (id - first) + map->extent[idx].lower_first; - else - id = (u32) -1; - - return id; + return NULL; } static u32 map_id_down(struct uid_gid_map *map, u32 id) { - u32 extents = map->nr_extents; + struct uid_gid_extent *extent; + unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) - return map_id_down_base(map, id); + extent = map_id_down_base(extents, map, id); + else + extent = map_id_range_down_max(extents, map, id, 1); - return map_id_range_down_max(map, id, 1); + /* Map the id or note failure */ + if (extent) + id = (id - extent->first) + extent->lower_first; + else + id = (u32) -1; + + return id; } /** @@ -366,48 +359,50 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id) * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ -static u32 map_id_up_base(struct uid_gid_map *map, u32 id) +static struct uid_gid_extent * +map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) { - unsigned idx, extents; + unsigned idx; u32 first, last; /* Find the matching extent */ - extents = map->nr_extents; - smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last) - break; + return &map->extent[idx]; } - /* Map the id or note failure */ - if (idx < extents) - id = (id - first) + map->extent[idx].first; - else - id = (u32) -1; - - return id; + return NULL; } /** * map_id_up_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ -static u32 map_id_up_max(struct uid_gid_map *map, u32 id) +static struct uid_gid_extent * +map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id) { - u32 extents; - struct uid_gid_extent *extent; struct idmap_key key; key.map_up = true; key.count = 1; key.id = id; - extents = map->nr_extents; + return bsearch(&key, map->reverse, extents, + sizeof(struct uid_gid_extent), cmp_map_id); +} + +static u32 map_id_up(struct uid_gid_map *map, u32 id) +{ + struct uid_gid_extent *extent; + unsigned extents = map->nr_extents; smp_rmb(); - extent = bsearch(&key, map->reverse, extents, - sizeof(struct uid_gid_extent), cmp_map_id); + if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + extent = map_id_up_base(extents, map, id); + else + extent = map_id_up_max(extents, map, id); + /* Map the id or note failure */ if (extent) id = (id - extent->lower_first) + extent->first; @@ -417,17 +412,6 @@ static u32 map_id_up_max(struct uid_gid_map *map, u32 id) return id; } -static u32 map_id_up(struct uid_gid_map *map, u32 id) -{ - u32 extents = map->nr_extents; - smp_rmb(); - - if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) - return map_id_up_base(map, id); - - return map_id_up_max(map, id); -} - /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in From d5e7b3c5f51fc6d34e12b6d87bfd30ab277c4625 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 31 Oct 2017 17:09:34 -0500 Subject: [PATCH 5/7] userns: Don't read extents twice in m_start This is important so reading /proc//{uid_map,gid_map,projid_map} while the map is being written does not do strange things. Signed-off-by: "Eric W. Biederman" --- kernel/user_namespace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 563a2981d7c7..4f7e357ac1e2 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -683,11 +683,13 @@ static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { loff_t pos = *ppos; + unsigned extents = map->nr_extents; + smp_rmb(); - if (pos >= map->nr_extents) + if (pos >= extents) return NULL; - if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return &map->extent[pos]; return &map->forward[pos]; From ece66133979b211324cc6aff9285889b425243d2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 31 Oct 2017 16:53:09 -0500 Subject: [PATCH 6/7] userns: Make map_id_down a wrapper for map_id_range_down There is no good reason for this code duplication, the number of cache line accesses not the number of instructions are the bottleneck in this code. Therefore simplify maintenance by removing unnecessary code. Signed-off-by: "Eric W. Biederman" --- kernel/user_namespace.c | 38 +------------------------------------- 1 file changed, 1 insertion(+), 37 deletions(-) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 4f7e357ac1e2..1d0298870ee3 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -313,45 +313,9 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) return id; } -/** - * map_id_down_base - Find idmap via binary search in static extent array. - * Can only be called if number of mappings is equal or less than - * UID_GID_MAP_MAX_BASE_EXTENTS. - */ -static struct uid_gid_extent * -map_id_down_base(unsigned extents, struct uid_gid_map *map, u32 id) -{ - unsigned idx; - u32 first, last; - - /* Find the matching extent */ - for (idx = 0; idx < extents; idx++) { - first = map->extent[idx].first; - last = first + map->extent[idx].count - 1; - if (id >= first && id <= last) - return &map->extent[idx]; - } - return NULL; -} - static u32 map_id_down(struct uid_gid_map *map, u32 id) { - struct uid_gid_extent *extent; - unsigned extents = map->nr_extents; - smp_rmb(); - - if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) - extent = map_id_down_base(extents, map, id); - else - extent = map_id_range_down_max(extents, map, id, 1); - - /* Map the id or note failure */ - if (extent) - id = (id - extent->first) + extent->lower_first; - else - id = (u32) -1; - - return id; + return map_id_range_down(map, id, 1); } /** From 3fda0e737e906ce73220b20c27e7f792d0aac6a8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 31 Oct 2017 17:15:30 -0500 Subject: [PATCH 7/7] userns: Simplify insert_extent Consolidate the code to write to the new mapping at the end of the function to remove the duplication. Move the increase in the number of mappings into insert_extent, keeping the logic together. Just a small increase in readability and maintainability. Signed-off-by: "Eric W. Biederman" --- kernel/user_namespace.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 1d0298870ee3..899c31060ff3 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -758,12 +758,7 @@ static bool mappings_overlap(struct uid_gid_map *new_map, */ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) { - if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) { - map->extent[map->nr_extents].first = extent->first; - map->extent[map->nr_extents].lower_first = extent->lower_first; - map->extent[map->nr_extents].count = extent->count; - return 0; - } + struct uid_gid_extent *dest; if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) { struct uid_gid_extent *forward; @@ -784,9 +779,13 @@ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) map->reverse = NULL; } - map->forward[map->nr_extents].first = extent->first; - map->forward[map->nr_extents].lower_first = extent->lower_first; - map->forward[map->nr_extents].count = extent->count; + if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) + dest = &map->extent[map->nr_extents]; + else + dest = &map->forward[map->nr_extents]; + + *dest = *extent; + map->nr_extents++; return 0; } @@ -968,8 +967,6 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (ret < 0) goto out; ret = -EINVAL; - - new_map.nr_extents++; } /* Be very certaint the new map actually exists */ if (new_map.nr_extents == 0)