diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt index a09a8eb80665..e2d9afc30d2d 100644 --- a/Documentation/cpusets.txt +++ b/Documentation/cpusets.txt @@ -192,6 +192,7 @@ containing the following files describing that cpuset: - cpus: list of CPUs in that cpuset - mems: list of Memory Nodes in that cpuset + - memory_migrate flag: if set, move pages to cpuset's nodes - cpu_exclusive flag: is cpu placement exclusive? - mem_exclusive flag: is memory placement exclusive? - tasks: list of tasks (by pid) attached to that cpuset @@ -277,6 +278,30 @@ rewritten to the 'tasks' file of its cpuset. This is done to avoid impacting the scheduler code in the kernel with a check for changes in a tasks processor placement. +Normally, once a page is allocated (given a physical page +of main memory) then that page stays on whatever node it +was allocated on, so long as it remains allocated, even if the +cpuset's memory placement policy 'mems' subsequently changes. +If the cpuset flag file 'memory_migrate' is set true, then when +tasks are attached to that cpuset, any pages that task had +allocated to it on nodes in its previous cpuset are migrated +to the task's new cpuset. Depending on the implementation, +this migration may either be done by swapping the page out, +so that the next time the page is referenced, it will be paged +into the task's new cpuset, usually on the node where it was +referenced, or this migration may be done by directly copying +the pages from the task's previous cpuset to the new cpuset, +where possible to the same node, relative to the new cpuset, +as the node that held the page, relative to the old cpuset. +Also if 'memory_migrate' is set true, then if that cpuset's +'mems' file is modified, pages allocated to tasks in that +cpuset, that were on nodes in the previous setting of 'mems', +will be moved to nodes in the new setting of 'mems'. Again, +depending on the implementation, this might be done by swapping, +or by direct copying. 
In either case, pages that were not in +the task's prior cpuset, or in the cpuset's prior 'mems' setting, +will not be moved. + There is an exception to the above. If hotplug functionality is used to remove all the CPUs that are currently assigned to a cpuset, then the kernel will automatically update the cpus_allowed of all diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 3e61e829681d..66247eff24a0 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -235,6 +235,13 @@ static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, return NODE_DATA(0)->node_zonelists + gfp_zone(GFP_HIGHUSER); } +static inline int do_migrate_pages(struct mm_struct *mm, + const nodemask_t *from_nodes, + const nodemask_t *to_nodes, int flags) +{ + return 0; +} + static inline void check_highest_zone(int k) { } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7430640f9816..f63383e01ec7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -87,6 +87,7 @@ struct cpuset { typedef enum { CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, + CS_MEMORY_MIGRATE, CS_REMOVED, CS_NOTIFY_ON_RELEASE } cpuset_flagbits_t; @@ -112,6 +113,11 @@ static inline int notify_on_release(const struct cpuset *cs) return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); } +static inline int is_memory_migrate(const struct cpuset *cs) +{ + return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags); +} + /* * Increment this atomic integer everytime any cpuset changes its * mems_allowed value. 
Users of cpusets can track this generation @@ -602,16 +608,24 @@ static void refresh_mems(void) if (current->cpuset_mems_generation != my_cpusets_mem_gen) { struct cpuset *cs; nodemask_t oldmem = current->mems_allowed; + int migrate; down(&callback_sem); task_lock(current); cs = current->cpuset; + migrate = is_memory_migrate(cs); guarantee_online_mems(cs, &current->mems_allowed); current->cpuset_mems_generation = cs->mems_generation; task_unlock(current); up(&callback_sem); - if (!nodes_equal(oldmem, current->mems_allowed)) + if (!nodes_equal(oldmem, current->mems_allowed)) { numa_policy_rebind(&oldmem, &current->mems_allowed); + if (migrate) { + do_migrate_pages(current->mm, &oldmem, + &current->mems_allowed, + MPOL_MF_MOVE_ALL); + } + } } } @@ -795,7 +809,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) /* * update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, - * CS_NOTIFY_ON_RELEASE) + * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE) * cs: the cpuset to update * buf: the buffer where we read the 0 or 1 * @@ -848,6 +862,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) struct task_struct *tsk; struct cpuset *oldcs; cpumask_t cpus; + nodemask_t from, to; if (sscanf(pidbuf, "%d", &pid) != 1) return -EIO; @@ -893,7 +908,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) guarantee_online_cpus(cs, &cpus); set_cpus_allowed(tsk, cpus); + from = oldcs->mems_allowed; + to = cs->mems_allowed; + up(&callback_sem); + if (is_memory_migrate(cs)) + do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL); put_task_struct(tsk); if (atomic_dec_and_test(&oldcs->count)) check_for_release(oldcs, ppathbuf); @@ -905,6 +925,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) typedef enum { FILE_ROOT, FILE_DIR, + FILE_MEMORY_MIGRATE, FILE_CPULIST, FILE_MEMLIST, FILE_CPU_EXCLUSIVE, @@ -960,6 +981,9 @@ static ssize_t 
cpuset_common_file_write(struct file *file, const char __user *us case FILE_NOTIFY_ON_RELEASE: retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); break; + case FILE_MEMORY_MIGRATE: + retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); + break; case FILE_TASKLIST: retval = attach_task(cs, buffer, &pathbuf); break; @@ -1060,6 +1084,9 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, case FILE_NOTIFY_ON_RELEASE: *s++ = notify_on_release(cs) ? '1' : '0'; break; + case FILE_MEMORY_MIGRATE: + *s++ = is_memory_migrate(cs) ? '1' : '0'; + break; default: retval = -EINVAL; goto out; @@ -1408,6 +1435,11 @@ static struct cftype cft_notify_on_release = { .private = FILE_NOTIFY_ON_RELEASE, }; +static struct cftype cft_memory_migrate = { + .name = "memory_migrate", + .private = FILE_MEMORY_MIGRATE, +}; + static int cpuset_populate_dir(struct dentry *cs_dentry) { int err; @@ -1422,6 +1454,8 @@ static int cpuset_populate_dir(struct dentry *cs_dentry) return err; if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0) return err; + if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0) + return err; if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) return err; return 0;