2006-12-08 03:37:58 -07:00
|
|
|
#ifndef _LINUX_PID_NS_H
|
|
|
|
#define _LINUX_PID_NS_H
|
2006-10-02 03:17:23 -06:00
|
|
|
|
|
|
|
#include <linux/sched.h>
|
2011-11-23 18:12:59 -07:00
|
|
|
#include <linux/bug.h>
|
2006-10-02 03:17:23 -06:00
|
|
|
#include <linux/mm.h>
|
2013-05-07 17:19:08 -06:00
|
|
|
#include <linux/workqueue.h>
|
2006-10-02 03:17:23 -06:00
|
|
|
#include <linux/threads.h>
|
2006-12-08 03:37:59 -07:00
|
|
|
#include <linux/nsproxy.h>
|
|
|
|
#include <linux/kref.h>
|
2006-10-02 03:17:23 -06:00
|
|
|
|
|
|
|
struct pidmap {
|
|
|
|
atomic_t nr_free;
|
|
|
|
void *page;
|
|
|
|
};
|
|
|
|
|
2013-04-30 16:28:27 -06:00
|
|
|
/*
 * PID map sizing.  BITS_PER_PAGE is the number of bits in one page, and
 * PIDMAP_ENTRIES is how many such pages are needed to provide
 * PID_MAX_LIMIT bits.
 */
#define BITS_PER_PAGE		(PAGE_SIZE * 8)
#define BITS_PER_PAGE_MASK	(BITS_PER_PAGE-1)
/* Rounded up, so a partially used last page still gets an entry. */
#define PIDMAP_ENTRIES		((PID_MAX_LIMIT+BITS_PER_PAGE-1)/BITS_PER_PAGE)
|
2006-10-02 03:17:23 -06:00
|
|
|
|
2008-07-25 02:48:43 -06:00
|
|
|
/* Forward declaration: this header only stores a pointer to the type. */
struct bsd_acct_struct;
|
|
|
|
|
2006-12-08 03:37:58 -07:00
|
|
|
/*
 * Per-namespace PID state.
 *
 * NOTE(review): unless stated otherwise, the field notes below are
 * inferred from names and types only -- confirm against the
 * implementation before relying on them.
 */
struct pid_namespace {
|
2006-12-08 03:37:59 -07:00
|
|
|
/* Reference count; get_pid_ns() takes it for every ns except init_pid_ns. */
struct kref kref;
|
|
|
|
/* PIDMAP_ENTRIES entries, i.e. enough BITS_PER_PAGE chunks for PID_MAX_LIMIT. */
struct pidmap pidmap[PIDMAP_ENTRIES];
|
2013-10-03 11:28:06 -06:00
|
|
|
/* Presumably used for RCU-deferred freeing -- TODO confirm. */
struct rcu_head rcu;
|
2006-12-08 03:37:59 -07:00
|
|
|
/* Presumably the most recently allocated pid -- TODO confirm. */
int last_pid;
|
2012-12-21 21:27:12 -07:00
|
|
|
/* NOTE(review): PIDNS_HASH_ADDING (bit 31) looks like a flag kept here -- confirm. */
unsigned int nr_hashed;
|
2006-12-08 03:38:01 -07:00
|
|
|
/* Presumably the task that reaps orphans of this namespace -- TODO confirm. */
struct task_struct *child_reaper;
|
2007-10-19 00:39:48 -06:00
|
|
|
/* Slab cache; presumably for pid objects of this namespace -- TODO confirm. */
struct kmem_cache *pid_cachep;
|
2008-04-30 01:54:31 -06:00
|
|
|
/* Presumably the nesting depth of this namespace -- TODO confirm. */
unsigned int level;
|
2007-10-19 00:40:04 -06:00
|
|
|
/* Presumably the enclosing namespace -- TODO confirm. */
struct pid_namespace *parent;
|
2007-10-19 00:40:08 -06:00
|
|
|
/* procfs state; these members exist only when CONFIG_PROC_FS is set. */
#ifdef CONFIG_PROC_FS
|
|
|
|
struct vfsmount *proc_mnt;
|
2013-03-29 17:27:05 -06:00
|
|
|
struct dentry *proc_self;
|
2007-10-19 00:40:08 -06:00
|
|
|
#endif
|
2008-07-25 02:48:43 -06:00
|
|
|
/* Accounting state; this member exists only with CONFIG_BSD_PROCESS_ACCT. */
#ifdef CONFIG_BSD_PROCESS_ACCT
|
|
|
|
struct bsd_acct_struct *bacct;
|
|
|
|
#endif
|
2012-08-02 05:25:10 -06:00
|
|
|
/* Presumably the owning user namespace (copy_pid_ns() is handed one) -- TODO confirm. */
struct user_namespace *user_ns;
|
2012-08-01 11:33:47 -06:00
|
|
|
/* NOTE(review): deferred work item, judging by the name procfs related -- confirm. */
struct work_struct proc_work;
|
2012-02-09 09:48:21 -07:00
|
|
|
/* NOTE(review): likely backs the procfs gid= mount option described below -- confirm. */
kgid_t pid_gid;
|
procfs: add hidepid= and gid= mount options
Add support for mount options to restrict access to /proc/PID/
directories. The default backward-compatible "relaxed" behaviour is left
untouched.
The first mount option is called "hidepid" and its value defines how much
info about processes we want to be available for non-owners:
hidepid=0 (default) means the old behavior - anybody may read all
world-readable /proc/PID/* files.
hidepid=1 means users may not access any /proc/<pid>/ directories, but
their own. Sensitive files like cmdline, sched*, status are now protected
against other users. As permission checking done in proc_pid_permission()
and files' permissions are left untouched, programs expecting specific
files' modes are not confused.
hidepid=2 means hidepid=1 plus all /proc/PID/ will be invisible to other
users. It doesn't mean that it hides whether a process exists (it can be
learned by other means, e.g. by kill -0 $PID), but it hides process' euid
and egid. It compicates intruder's task of gathering info about running
processes, whether some daemon runs with elevated privileges, whether
another user runs some sensitive program, whether other users run any
program at all, etc.
gid=XXX defines a group that will be able to gather all processes' info
(as in hidepid=0 mode). This group should be used instead of putting
nonroot user in sudoers file or something. However, untrusted users (like
daemons, etc.) which are not supposed to monitor the tasks in the whole
system should not be added to the group.
hidepid=1 or higher is designed to restrict access to procfs files, which
might reveal some sensitive private information like precise keystrokes
timings:
http://www.openwall.com/lists/oss-security/2011/11/05/3
hidepid=1/2 doesn't break monitoring userspace tools. ps, top, pgrep, and
conky gracefully handle EPERM/ENOENT and behave as if the current user is
the only user running processes. pstree shows the process subtree which
contains "pstree" process.
Note: the patch doesn't deal with setuid/setgid issues of keeping
preopened descriptors of procfs files (like
https://lkml.org/lkml/2011/2/7/368). We rely on that the leaked
information like the scheduling counters of setuid apps doesn't threaten
anybody's privacy - only the user started the setuid program may read the
counters.
Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Randy Dunlap <rdunlap@xenotime.net>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Greg KH <greg@kroah.com>
Cc: Theodore Tso <tytso@MIT.EDU>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: James Morris <jmorris@namei.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-01-10 16:11:31 -07:00
|
|
|
int hide_pid;
|
pidns: add reboot_pid_ns() to handle the reboot syscall
In the case of a child pid namespace, rebooting the system does not really
makes sense. When the pid namespace is used in conjunction with the other
namespaces in order to create a linux container, the reboot syscall leads
to some problems.
A container can reboot the host. That can be fixed by dropping the
sys_reboot capability but we are unable to correctly to poweroff/
halt/reboot a container and the container stays stuck at the shutdown time
with the container's init process waiting indefinitively.
After several attempts, no solution from userspace was found to reliabily
handle the shutdown from a container.
This patch propose to make the init process of the child pid namespace to
exit with a signal status set to : SIGINT if the child pid namespace
called "halt/poweroff" and SIGHUP if the child pid namespace called
"reboot". When the reboot syscall is called and we are not in the initial
pid namespace, we kill the pid namespace for "HALT", "POWEROFF",
"RESTART", and "RESTART2". Otherwise we return EINVAL.
Returning EINVAL is also an easy way to check if this feature is supported
by the kernel when invoking another 'reboot' option like CAD.
By this way the parent process of the child pid namespace knows if it
rebooted or not and can take the right decision.
Test case:
==========
#include <alloca.h>
#include <stdio.h>
#include <sched.h>
#include <unistd.h>
#include <signal.h>
#include <sys/reboot.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <linux/reboot.h>
static int do_reboot(void *arg)
{
int *cmd = arg;
if (reboot(*cmd))
printf("failed to reboot(%d): %m\n", *cmd);
}
int test_reboot(int cmd, int sig)
{
long stack_size = 4096;
void *stack = alloca(stack_size) + stack_size;
int status;
pid_t ret;
ret = clone(do_reboot, stack, CLONE_NEWPID | SIGCHLD, &cmd);
if (ret < 0) {
printf("failed to clone: %m\n");
return -1;
}
if (wait(&status) < 0) {
printf("unexpected wait error: %m\n");
return -1;
}
if (!WIFSIGNALED(status)) {
printf("child process exited but was not signaled\n");
return -1;
}
if (WTERMSIG(status) != sig) {
printf("signal termination is not the one expected\n");
return -1;
}
return 0;
}
int main(int argc, char *argv[])
{
int status;
status = test_reboot(LINUX_REBOOT_CMD_RESTART, SIGHUP);
if (status < 0)
return 1;
printf("reboot(LINUX_REBOOT_CMD_RESTART) succeed\n");
status = test_reboot(LINUX_REBOOT_CMD_RESTART2, SIGHUP);
if (status < 0)
return 1;
printf("reboot(LINUX_REBOOT_CMD_RESTART2) succeed\n");
status = test_reboot(LINUX_REBOOT_CMD_HALT, SIGINT);
if (status < 0)
return 1;
printf("reboot(LINUX_REBOOT_CMD_HALT) succeed\n");
status = test_reboot(LINUX_REBOOT_CMD_POWER_OFF, SIGINT);
if (status < 0)
return 1;
printf("reboot(LINUX_REBOOT_CMD_POWERR_OFF) succeed\n");
status = test_reboot(LINUX_REBOOT_CMD_CAD_ON, -1);
if (status >= 0) {
printf("reboot(LINUX_REBOOT_CMD_CAD_ON) should have failed\n");
return 1;
}
printf("reboot(LINUX_REBOOT_CMD_CAD_ON) has failed as expected\n");
return 0;
}
[akpm@linux-foundation.org: tweak and add comments]
[akpm@linux-foundation.org: checkpatch fixes]
Signed-off-by: Daniel Lezcano <daniel.lezcano@free.fr>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Tested-by: Serge Hallyn <serge.hallyn@canonical.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-28 15:42:51 -06:00
|
|
|
int reboot; /* group exit code if this pidns was rebooted */
|
2011-06-15 11:21:48 -06:00
|
|
|
/* Presumably the inode number representing this namespace in procfs -- TODO confirm. */
unsigned int proc_inum;
|
2006-10-02 03:17:24 -06:00
|
|
|
};
|
|
|
|
|
2006-12-08 03:37:58 -07:00
|
|
|
/* Defined elsewhere; get_pid_ns() special-cases it and takes no kref on it. */
extern struct pid_namespace init_pid_ns;
|
2006-10-02 03:17:24 -06:00
|
|
|
|
2012-12-21 21:27:12 -07:00
|
|
|
/* Bit 31.  NOTE(review): presumably a flag kept in pid_namespace::nr_hashed -- confirm. */
#define PIDNS_HASH_ADDING (1U << 31)
|
|
|
|
|
2007-11-14 18:00:13 -07:00
|
|
|
#ifdef CONFIG_PID_NS
|
2007-10-19 00:39:47 -06:00
|
|
|
static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
|
2006-12-08 03:37:59 -07:00
|
|
|
{
|
2007-10-19 00:40:09 -06:00
|
|
|
if (ns != &init_pid_ns)
|
|
|
|
kref_get(&ns->kref);
|
2007-10-19 00:39:47 -06:00
|
|
|
return ns;
|
2006-12-08 03:37:59 -07:00
|
|
|
}
|
|
|
|
|
2012-08-02 05:25:10 -06:00
|
|
|
/* Out-of-line implementations, declared here for the CONFIG_PID_NS build. */
struct pid_namespace *copy_pid_ns(unsigned long flags,
				  struct user_namespace *user_ns,
				  struct pid_namespace *ns);
void zap_pid_ns_processes(struct pid_namespace *pid_ns);
|
pidns: add reboot_pid_ns() to handle the reboot syscall
In the case of a child pid namespace, rebooting the system does not really
make sense. When the pid namespace is used in conjunction with the other
namespaces in order to create a linux container, the reboot syscall leads
to some problems.
A container can reboot the host. That can be fixed by dropping the
sys_reboot capability, but then we are unable to correctly poweroff/
halt/reboot a container and the container stays stuck at shutdown time
with the container's init process waiting indefinitely.
After several attempts, no solution from userspace was found to reliably
handle the shutdown from a container.
This patch proposes to make the init process of the child pid namespace
exit with a signal status set to: SIGINT if the child pid namespace
called "halt/poweroff" and SIGHUP if the child pid namespace called
"reboot". When the reboot syscall is called and we are not in the initial
pid namespace, we kill the pid namespace for "HALT", "POWEROFF",
"RESTART", and "RESTART2". Otherwise we return EINVAL.
Returning EINVAL is also an easy way to check if this feature is supported
by the kernel when invoking another 'reboot' option like CAD.
This way the parent process of the child pid namespace knows whether it
rebooted or not and can make the right decision.
Test case:
==========
#include <alloca.h>
#include <stdio.h>
#include <sched.h>
#include <unistd.h>
#include <signal.h>
#include <sys/reboot.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <linux/reboot.h>
static int do_reboot(void *arg)
{
int *cmd = arg;
if (reboot(*cmd))
printf("failed to reboot(%d): %m\n", *cmd);
}
int test_reboot(int cmd, int sig)
{
long stack_size = 4096;
void *stack = alloca(stack_size) + stack_size;
int status;
pid_t ret;
ret = clone(do_reboot, stack, CLONE_NEWPID | SIGCHLD, &cmd);
if (ret < 0) {
printf("failed to clone: %m\n");
return -1;
}
if (wait(&status) < 0) {
printf("unexpected wait error: %m\n");
return -1;
}
if (!WIFSIGNALED(status)) {
printf("child process exited but was not signaled\n");
return -1;
}
if (WTERMSIG(status) != sig) {
printf("signal termination is not the one expected\n");
return -1;
}
return 0;
}
int main(int argc, char *argv[])
{
int status;
status = test_reboot(LINUX_REBOOT_CMD_RESTART, SIGHUP);
if (status < 0)
return 1;
printf("reboot(LINUX_REBOOT_CMD_RESTART) succeed\n");
status = test_reboot(LINUX_REBOOT_CMD_RESTART2, SIGHUP);
if (status < 0)
return 1;
printf("reboot(LINUX_REBOOT_CMD_RESTART2) succeed\n");
status = test_reboot(LINUX_REBOOT_CMD_HALT, SIGINT);
if (status < 0)
return 1;
printf("reboot(LINUX_REBOOT_CMD_HALT) succeed\n");
status = test_reboot(LINUX_REBOOT_CMD_POWER_OFF, SIGINT);
if (status < 0)
return 1;
printf("reboot(LINUX_REBOOT_CMD_POWERR_OFF) succeed\n");
status = test_reboot(LINUX_REBOOT_CMD_CAD_ON, -1);
if (status >= 0) {
printf("reboot(LINUX_REBOOT_CMD_CAD_ON) should have failed\n");
return 1;
}
printf("reboot(LINUX_REBOOT_CMD_CAD_ON) has failed as expected\n");
return 0;
}
[akpm@linux-foundation.org: tweak and add comments]
[akpm@linux-foundation.org: checkpatch fixes]
Signed-off-by: Daniel Lezcano <daniel.lezcano@free.fr>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Tested-by: Serge Hallyn <serge.hallyn@canonical.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-28 15:42:51 -06:00
|
|
|
/* Out-of-line implementations, declared here for the CONFIG_PID_NS build. */
int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd);
void put_pid_ns(struct pid_namespace *ns);
|
2006-12-08 03:37:59 -07:00
|
|
|
|
2007-11-14 18:00:13 -07:00
|
|
|
#else /* !CONFIG_PID_NS */
|
|
|
|
#include <linux/err.h>
|
|
|
|
|
|
|
|
/* !CONFIG_PID_NS stub: no reference is taken, @ns is handed back as is. */
static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
{
	return ns;
}
|
|
|
|
|
2012-08-02 05:25:10 -06:00
|
|
|
static inline struct pid_namespace *copy_pid_ns(unsigned long flags,
|
|
|
|
struct user_namespace *user_ns, struct pid_namespace *ns)
|
2007-11-14 18:00:13 -07:00
|
|
|
{
|
|
|
|
if (flags & CLONE_NEWPID)
|
|
|
|
ns = ERR_PTR(-EINVAL);
|
|
|
|
return ns;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* !CONFIG_PID_NS stub: nothing to release, so this is a no-op. */
static inline void put_pid_ns(struct pid_namespace *ns)
{
}
|
|
|
|
|
2008-02-08 05:18:24 -07:00
|
|
|
/* !CONFIG_PID_NS stub: any call is flagged with BUG(). */
static inline void zap_pid_ns_processes(struct pid_namespace *ns)
{
	BUG();
}
|
pidns: add reboot_pid_ns() to handle the reboot syscall
In the case of a child pid namespace, rebooting the system does not really
make sense. When the pid namespace is used in conjunction with the other
namespaces in order to create a linux container, the reboot syscall leads
to some problems.
A container can reboot the host. That can be fixed by dropping the
sys_reboot capability, but then we are unable to correctly poweroff/
halt/reboot a container and the container stays stuck at shutdown time
with the container's init process waiting indefinitely.
After several attempts, no solution from userspace was found to reliably
handle the shutdown from a container.
This patch proposes to make the init process of the child pid namespace
exit with a signal status set to: SIGINT if the child pid namespace
called "halt/poweroff" and SIGHUP if the child pid namespace called
"reboot". When the reboot syscall is called and we are not in the initial
pid namespace, we kill the pid namespace for "HALT", "POWEROFF",
"RESTART", and "RESTART2". Otherwise we return EINVAL.
Returning EINVAL is also an easy way to check if this feature is supported
by the kernel when invoking another 'reboot' option like CAD.
This way the parent process of the child pid namespace knows whether it
rebooted or not and can make the right decision.
Test case:
==========
#include <alloca.h>
#include <stdio.h>
#include <sched.h>
#include <unistd.h>
#include <signal.h>
#include <sys/reboot.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <linux/reboot.h>
static int do_reboot(void *arg)
{
int *cmd = arg;
if (reboot(*cmd))
printf("failed to reboot(%d): %m\n", *cmd);
}
int test_reboot(int cmd, int sig)
{
long stack_size = 4096;
void *stack = alloca(stack_size) + stack_size;
int status;
pid_t ret;
ret = clone(do_reboot, stack, CLONE_NEWPID | SIGCHLD, &cmd);
if (ret < 0) {
printf("failed to clone: %m\n");
return -1;
}
if (wait(&status) < 0) {
printf("unexpected wait error: %m\n");
return -1;
}
if (!WIFSIGNALED(status)) {
printf("child process exited but was not signaled\n");
return -1;
}
if (WTERMSIG(status) != sig) {
printf("signal termination is not the one expected\n");
return -1;
}
return 0;
}
int main(int argc, char *argv[])
{
int status;
status = test_reboot(LINUX_REBOOT_CMD_RESTART, SIGHUP);
if (status < 0)
return 1;
printf("reboot(LINUX_REBOOT_CMD_RESTART) succeed\n");
status = test_reboot(LINUX_REBOOT_CMD_RESTART2, SIGHUP);
if (status < 0)
return 1;
printf("reboot(LINUX_REBOOT_CMD_RESTART2) succeed\n");
status = test_reboot(LINUX_REBOOT_CMD_HALT, SIGINT);
if (status < 0)
return 1;
printf("reboot(LINUX_REBOOT_CMD_HALT) succeed\n");
status = test_reboot(LINUX_REBOOT_CMD_POWER_OFF, SIGINT);
if (status < 0)
return 1;
printf("reboot(LINUX_REBOOT_CMD_POWERR_OFF) succeed\n");
status = test_reboot(LINUX_REBOOT_CMD_CAD_ON, -1);
if (status >= 0) {
printf("reboot(LINUX_REBOOT_CMD_CAD_ON) should have failed\n");
return 1;
}
printf("reboot(LINUX_REBOOT_CMD_CAD_ON) has failed as expected\n");
return 0;
}
[akpm@linux-foundation.org: tweak and add comments]
[akpm@linux-foundation.org: checkpatch fixes]
Signed-off-by: Daniel Lezcano <daniel.lezcano@free.fr>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Tested-by: Serge Hallyn <serge.hallyn@canonical.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-28 15:42:51 -06:00
|
|
|
|
|
|
|
/* !CONFIG_PID_NS stub: ignores both arguments and always returns 0. */
static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
{
	return 0;
}
|
2007-11-14 18:00:13 -07:00
|
|
|
#endif /* CONFIG_PID_NS */
|
|
|
|
|
2009-01-07 19:08:49 -07:00
|
|
|
/* Declared for both the CONFIG_PID_NS and !CONFIG_PID_NS builds. */
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);

/* NOTE(review): presumably one-time setup hooks for the pid hash and pid map -- confirm. */
void pidhash_init(void);
void pidmap_init(void);
|
|
|
|
|
2006-12-08 03:37:58 -07:00
|
|
|
#endif /* _LINUX_PID_NS_H */
|