sysctl: add the kernel.ns_last_pid control
The sysctl works on the current task's pid namespace, getting and setting its last_pid field.

Writing is allowed for CAP_SYS_ADMIN-capable tasks, thus making it possible to create a task with a desired pid value. This ability is badly needed for checkpoint/restore in userspace.

This approach suits all the parties for now.

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
parent
f5138e4221
commit
b8f566b04d
3 changed files with 42 additions and 1 deletions
|
@ -415,6 +415,14 @@ PIDs of value pid_max or larger are not allocated.
|
||||||
|
|
||||||
==============================================================
|
==============================================================
|
||||||
|
|
||||||
|
ns_last_pid:
|
||||||
|
|
||||||
|
The last pid allocated in the current pid namespace (the one in which the
|
||||||
|
task using this sysctl lives). When selecting a pid for the next task on
|
||||||
|
fork, the kernel tries to allocate a number starting from this one.
|
||||||
|
|
||||||
|
==============================================================
|
||||||
|
|
||||||
powersave-nap: (PPC only)
|
powersave-nap: (PPC only)
|
||||||
|
|
||||||
If set, Linux-PPC will use the 'nap' mode of powersaving,
|
If set, Linux-PPC will use the 'nap' mode of powersaving,
|
||||||
|
|
|
@ -137,7 +137,9 @@ static int pid_before(int base, int a, int b)
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We might be racing with someone else trying to set pid_ns->last_pid.
|
* We might be racing with someone else trying to set pid_ns->last_pid
|
||||||
|
* at the pid allocation time (there's also a sysctl for this, but racing
|
||||||
|
* with this one is OK, see comment in kernel/pid_namespace.c about it).
|
||||||
* We want the winner to have the "later" value, because if the
|
* We want the winner to have the "later" value, because if the
|
||||||
* "earlier" value prevails, then a pid may get reused immediately.
|
* "earlier" value prevails, then a pid may get reused immediately.
|
||||||
*
|
*
|
||||||
|
|
|
@ -191,9 +191,40 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int pid_ns_ctl_handler(struct ctl_table *table, int write,
|
||||||
|
void __user *buffer, size_t *lenp, loff_t *ppos)
|
||||||
|
{
|
||||||
|
struct ctl_table tmp = *table;
|
||||||
|
|
||||||
|
if (write && !capable(CAP_SYS_ADMIN))
|
||||||
|
return -EPERM;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Writing directly to ns' last_pid field is OK, since this field
|
||||||
|
* is volatile in a living namespace anyway and a code writing to
|
||||||
|
* it should synchronize its usage with external means.
|
||||||
|
*/
|
||||||
|
|
||||||
|
tmp.data = &current->nsproxy->pid_ns->last_pid;
|
||||||
|
return proc_dointvec(&tmp, write, buffer, lenp, ppos);
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct ctl_table pid_ns_ctl_table[] = {
|
||||||
|
{
|
||||||
|
.procname = "ns_last_pid",
|
||||||
|
.maxlen = sizeof(int),
|
||||||
|
.mode = 0666, /* permissions are checked in the handler */
|
||||||
|
.proc_handler = pid_ns_ctl_handler,
|
||||||
|
},
|
||||||
|
{ }
|
||||||
|
};
|
||||||
|
|
||||||
|
static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
|
||||||
|
|
||||||
static __init int pid_namespaces_init(void)
|
static __init int pid_namespaces_init(void)
|
||||||
{
|
{
|
||||||
pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
|
pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
|
||||||
|
register_sysctl_paths(kern_path, pid_ns_ctl_table);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue