// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic pidhash and scalable, time-bounded PID allocator
 *
 * (C) 2002-2003 Nadia Yvette Chambers, IBM
 * (C) 2004 Nadia Yvette Chambers, Oracle
 * (C) 2002-2004 Ingo Molnar, Red Hat
 *
 * pid-structures are backing objects for tasks sharing a given ID to chain
 * against. There is very little to them aside from hashing them and
 * parking tasks using given ID's on a list.
 *
 * The hash is always changed with the tasklist_lock write-acquired,
 * and the hash is only accessed with the tasklist_lock at least
 * read-acquired, so there's no additional SMP locking needed here.
 *
 * We have a list of bitmap pages, which bitmaps represent the PID space.
 * Allocating and freeing PIDs is completely lockless. The worst-case
 * allocation scenario when all but one out of 1 million PIDs possible are
 * allocated already: the scanning of 32 list entries and at most PAGE_SIZE
 * bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
 *
 * Pid namespaces:
 *    (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
 *    (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
 *     Many thanks to Oleg Nesterov for comments and help
 *
 */
291da177e4SLinus Torvalds
301da177e4SLinus Torvalds #include <linux/mm.h>
319984de1aSPaul Gortmaker #include <linux/export.h>
321da177e4SLinus Torvalds #include <linux/slab.h>
331da177e4SLinus Torvalds #include <linux/init.h>
3482524746SFranck Bui-Huu #include <linux/rculist.h>
3557c8a661SMike Rapoport #include <linux/memblock.h>
3661a58c6cSSukadev Bhattiprolu #include <linux/pid_namespace.h>
37820e45dbSSukadev Bhattiprolu #include <linux/init_task.h>
383eb07c8cSSukadev Bhattiprolu #include <linux/syscalls.h>
390bb80f24SDavid Howells #include <linux/proc_ns.h>
40f57e515aSJoel Fernandes (Google) #include <linux/refcount.h>
4132fcb426SChristian Brauner #include <linux/anon_inodes.h>
4232fcb426SChristian Brauner #include <linux/sched/signal.h>
4329930025SIngo Molnar #include <linux/sched/task.h>
4495846ecfSGargi Sharma #include <linux/idr.h>
45cb12fd8eSChristian Brauner #include <linux/pidfs.h>
4616ecd47cSChristian Brauner #include <linux/seqlock.h>
474969f8a0SKees Cook #include <net/sock.h>
486da73d15SChristian Brauner #include <uapi/linux/pidfd.h>
491da177e4SLinus Torvalds
50e1e871afSDavid Howells struct pid init_struct_pid = {
51f57e515aSJoel Fernandes (Google) .count = REFCOUNT_INIT(1),
52e1e871afSDavid Howells .tasks = {
53e1e871afSDavid Howells { .first = NULL },
54e1e871afSDavid Howells { .first = NULL },
55e1e871afSDavid Howells { .first = NULL },
56e1e871afSDavid Howells },
57e1e871afSDavid Howells .level = 0,
58e1e871afSDavid Howells .numbers = { {
59e1e871afSDavid Howells .nr = 0,
60e1e871afSDavid Howells .ns = &init_pid_ns,
61e1e871afSDavid Howells }, }
62e1e871afSDavid Howells };
631da177e4SLinus Torvalds
647863dcc7SChristian Brauner static int pid_max_min = RESERVED_PIDS + 1;
657863dcc7SChristian Brauner static int pid_max_max = PID_MAX_LIMIT;
661da177e4SLinus Torvalds
671da177e4SLinus Torvalds /*
681da177e4SLinus Torvalds * PID-map pages start out as NULL, they get allocated upon
691da177e4SLinus Torvalds * first use and are never deallocated. This way a low pid_max
701da177e4SLinus Torvalds * value does not cause lots of bitmaps to be allocated, but
711da177e4SLinus Torvalds * the scheme scales to up to 4 million PIDs, runtime.
721da177e4SLinus Torvalds */
7361a58c6cSSukadev Bhattiprolu struct pid_namespace init_pid_ns = {
748eb71d95SKirill Tkhai .ns.count = REFCOUNT_INIT(2),
75f6bb2a2cSMatthew Wilcox .idr = IDR_INIT(init_pid_ns.idr),
76e8cfbc24SGargi Sharma .pid_allocated = PIDNS_ADDING,
77faacbfd3SPavel Emelyanov .level = 0,
78faacbfd3SPavel Emelyanov .child_reaper = &init_task,
7949f4d8b9SEric W. Biederman .user_ns = &init_user_ns,
80435d5f4bSAl Viro .ns.inum = PROC_PID_INIT_INO,
8133c42940SAl Viro #ifdef CONFIG_PID_NS
8233c42940SAl Viro .ns.ops = &pidns_operations,
8333c42940SAl Viro #endif
847863dcc7SChristian Brauner .pid_max = PID_MAX_DEFAULT,
859876cfe8SAleksa Sarai #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
869876cfe8SAleksa Sarai .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
879876cfe8SAleksa Sarai #endif
883fbc9648SSukadev Bhattiprolu };
89198fe21bSPavel Emelyanov EXPORT_SYMBOL_GPL(init_pid_ns);
901da177e4SLinus Torvalds
911da177e4SLinus Torvalds static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
9216ecd47cSChristian Brauner seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
931da177e4SLinus Torvalds
put_pid(struct pid * pid)947ad5b3a5SHarvey Harrison void put_pid(struct pid *pid)
9592476d7fSEric W. Biederman {
96baf8f0f8SPavel Emelianov struct pid_namespace *ns;
97baf8f0f8SPavel Emelianov
9892476d7fSEric W. Biederman if (!pid)
9992476d7fSEric W. Biederman return;
100baf8f0f8SPavel Emelianov
1018ef047aaSPavel Emelyanov ns = pid->numbers[pid->level].ns;
102f57e515aSJoel Fernandes (Google) if (refcount_dec_and_test(&pid->count)) {
103baf8f0f8SPavel Emelianov kmem_cache_free(ns->pid_cachep, pid);
1048ef047aaSPavel Emelyanov put_pid_ns(ns);
1058ef047aaSPavel Emelyanov }
10692476d7fSEric W. Biederman }
107bbf73147SEric W. Biederman EXPORT_SYMBOL_GPL(put_pid);
10892476d7fSEric W. Biederman
delayed_put_pid(struct rcu_head * rhp)10992476d7fSEric W. Biederman static void delayed_put_pid(struct rcu_head *rhp)
11092476d7fSEric W. Biederman {
11192476d7fSEric W. Biederman struct pid *pid = container_of(rhp, struct pid, rcu);
11292476d7fSEric W. Biederman put_pid(pid);
11392476d7fSEric W. Biederman }
11492476d7fSEric W. Biederman
free_pid(struct pid * pid)1157ad5b3a5SHarvey Harrison void free_pid(struct pid *pid)
11692476d7fSEric W. Biederman {
1178ef047aaSPavel Emelyanov int i;
11892476d7fSEric W. Biederman
1197903f907SMateusz Guzik lockdep_assert_not_held(&tasklist_lock);
1207903f907SMateusz Guzik
121*627454c0SMateusz Guzik spin_lock(&pidmap_lock);
1220a01f2ccSEric W. Biederman for (i = 0; i <= pid->level; i++) {
1230a01f2ccSEric W. Biederman struct upid *upid = pid->numbers + i;
124af4b8a83SEric W. Biederman struct pid_namespace *ns = upid->ns;
125e8cfbc24SGargi Sharma switch (--ns->pid_allocated) {
126a6064885SEric W. Biederman case 2:
127af4b8a83SEric W. Biederman case 1:
128af4b8a83SEric W. Biederman /* When all that is left in the pid namespace
129af4b8a83SEric W. Biederman * is the reaper wake up the reaper. The reaper
130af4b8a83SEric W. Biederman * may be sleeping in zap_pid_ns_processes().
131af4b8a83SEric W. Biederman */
132af4b8a83SEric W. Biederman wake_up_process(ns->child_reaper);
133af4b8a83SEric W. Biederman break;
134e8cfbc24SGargi Sharma case PIDNS_ADDING:
135314a8ad0SOleg Nesterov /* Handle a fork failure of the first process */
136314a8ad0SOleg Nesterov WARN_ON(ns->child_reaper);
137e8cfbc24SGargi Sharma ns->pid_allocated = 0;
138af4b8a83SEric W. Biederman break;
1390a01f2ccSEric W. Biederman }
14095846ecfSGargi Sharma
14195846ecfSGargi Sharma idr_remove(&ns->idr, upid->nr);
1425e1182deSEric W. Biederman }
1439698d5a4SChristian Brauner pidfs_remove_pid(pid);
144*627454c0SMateusz Guzik spin_unlock(&pidmap_lock);
14592476d7fSEric W. Biederman
14692476d7fSEric W. Biederman call_rcu(&pid->rcu, delayed_put_pid);
14792476d7fSEric W. Biederman }
14892476d7fSEric W. Biederman
free_pids(struct pid ** pids)1497903f907SMateusz Guzik void free_pids(struct pid **pids)
1507903f907SMateusz Guzik {
1517903f907SMateusz Guzik int tmp;
1527903f907SMateusz Guzik
1537903f907SMateusz Guzik /*
1547903f907SMateusz Guzik * This can batch pidmap_lock.
1557903f907SMateusz Guzik */
1567903f907SMateusz Guzik for (tmp = PIDTYPE_MAX; --tmp >= 0; )
1577903f907SMateusz Guzik if (pids[tmp])
1587903f907SMateusz Guzik free_pid(pids[tmp]);
1597903f907SMateusz Guzik }
1607903f907SMateusz Guzik
/*
 * alloc_pid() - allocate a struct pid with a number in @ns and all ancestors
 * @ns:           innermost pid namespace the new pid belongs to
 * @set_tid:      optional array of requested numbers, innermost level first
 * @set_tid_size: number of valid entries in @set_tid
 *
 * A number is reserved in every namespace from @ns up to level 0 with a NULL
 * IDR entry, so lookups cannot observe a half-initialised pid. Only after the
 * pid is fully set up are the entries replaced with the pid itself.
 *
 * Return: the new pid, or an ERR_PTR(): -EINVAL for bad @set_tid input,
 * -EPERM when the caller may not choose a number, -EEXIST when a requested
 * number is taken, -EAGAIN when a namespace has no free number, -ENOMEM on
 * allocation failure or when @ns no longer accepts new pids.
 */
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
		      size_t set_tid_size)
{
	struct pid_namespace *cur_ns = ns;
	enum pid_type type;
	struct upid *upid;
	struct pid *pid;
	int retval = -ENOMEM;
	int i, nr;

	/*
	 * set_tid_size contains the size of the set_tid array. Starting at
	 * the most nested currently active PID namespace it tells alloc_pid()
	 * which PID to set for a process in that most nested PID namespace
	 * up to set_tid_size PID namespaces. It does not have to set the PID
	 * for a process in all nested PID namespaces but set_tid_size must
	 * never be greater than the current ns->level + 1.
	 */
	if (set_tid_size > ns->level + 1)
		return ERR_PTR(-EINVAL);

	pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
	if (!pid)
		return ERR_PTR(retval);

	pid->level = ns->level;

	/* Reserve a number per level, innermost namespace first. */
	for (i = ns->level; i >= 0; i--) {
		int pid_max = READ_ONCE(cur_ns->pid_max);
		int tid = 0;

		if (set_tid_size) {
			tid = set_tid[ns->level - i];

			retval = -EINVAL;
			if (tid < 1 || tid >= pid_max)
				goto out_free;
			/*
			 * Also fail if a PID != 1 is requested and
			 * no PID 1 exists.
			 */
			if (tid != 1 && !cur_ns->child_reaper)
				goto out_free;

			retval = -EPERM;
			if (!checkpoint_restore_ns_capable(cur_ns->user_ns))
				goto out_free;

			set_tid_size--;
		}

		idr_preload(GFP_KERNEL);
		spin_lock(&pidmap_lock);

		if (tid) {
			/* Ask for exactly @tid: the range is [tid, tid + 1). */
			nr = idr_alloc(&cur_ns->idr, NULL, tid,
				       tid + 1, GFP_ATOMIC);
			/*
			 * If ENOSPC is returned it means that the PID is
			 * already in use. Return EEXIST in that case.
			 */
			if (nr == -ENOSPC)
				nr = -EEXIST;
		} else {
			int pid_min = 1;

			/*
			 * init really needs pid 1, but after reaching the
			 * maximum wrap back to RESERVED_PIDS
			 */
			if (idr_get_cursor(&cur_ns->idr) > RESERVED_PIDS)
				pid_min = RESERVED_PIDS;

			/*
			 * Store a null pointer so find_pid_ns does not find
			 * a partially initialized PID (see below).
			 */
			nr = idr_alloc_cyclic(&cur_ns->idr, NULL, pid_min,
					      pid_max, GFP_ATOMIC);
		}

		spin_unlock(&pidmap_lock);
		idr_preload_end();

		if (nr < 0) {
			if (nr == -ENOSPC)
				retval = -EAGAIN;
			else
				retval = nr;
			goto out_free;
		}

		pid->numbers[i].nr = nr;
		pid->numbers[i].ns = cur_ns;
		cur_ns = cur_ns->parent;
	}

	/*
	 * ENOMEM is not the most obvious choice especially for the case
	 * where the child subreaper has already exited and the pid
	 * namespace denies the creation of any new processes. But ENOMEM
	 * is what we have exposed to userspace for a long time and it is
	 * documented behavior for pid namespaces. So we can't easily
	 * change it even if there were an error code better suited.
	 */
	retval = -ENOMEM;

	get_pid_ns(ns);
	refcount_set(&pid->count, 1);
	spin_lock_init(&pid->lock);
	for (type = 0; type < PIDTYPE_MAX; ++type)
		INIT_HLIST_HEAD(&pid->tasks[type]);

	init_waitqueue_head(&pid->wait_pidfd);
	INIT_HLIST_HEAD(&pid->inodes);

	idr_preload(GFP_KERNEL);
	spin_lock(&pidmap_lock);
	if (!(ns->pid_allocated & PIDNS_ADDING))
		goto out_unlock;
	pidfs_add_pid(pid);
	for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) {
		/* Make the PID visible to find_pid_ns. */
		idr_replace(&upid->ns->idr, pid, upid->nr);
		upid->ns->pid_allocated++;
	}
	spin_unlock(&pidmap_lock);
	idr_preload_end();

	return pid;

out_unlock:
	spin_unlock(&pidmap_lock);
	idr_preload_end();
	put_pid_ns(ns);

out_free:
	/*
	 * Levels above @i already hold a reserved number; release them.
	 * When arriving from out_unlock, @i is -1 and every level is undone.
	 */
	spin_lock(&pidmap_lock);
	while (++i <= ns->level) {
		upid = pid->numbers + i;
		idr_remove(&upid->ns->idr, upid->nr);
	}

	/* On failure to allocate the first pid, reset the state */
	if (ns->pid_allocated == PIDNS_ADDING)
		idr_set_cursor(&ns->idr, 0);

	spin_unlock(&pidmap_lock);

	kmem_cache_free(ns->pid_cachep, pid);
	return ERR_PTR(retval);
}
30892476d7fSEric W. Biederman
disable_pid_allocation(struct pid_namespace * ns)309c876ad76SEric W. Biederman void disable_pid_allocation(struct pid_namespace *ns)
310c876ad76SEric W. Biederman {
311*627454c0SMateusz Guzik spin_lock(&pidmap_lock);
312e8cfbc24SGargi Sharma ns->pid_allocated &= ~PIDNS_ADDING;
313*627454c0SMateusz Guzik spin_unlock(&pidmap_lock);
314c876ad76SEric W. Biederman }
315c876ad76SEric W. Biederman
find_pid_ns(int nr,struct pid_namespace * ns)3167ad5b3a5SHarvey Harrison struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
3171da177e4SLinus Torvalds {
318e8cfbc24SGargi Sharma return idr_find(&ns->idr, nr);
3191da177e4SLinus Torvalds }
320198fe21bSPavel Emelyanov EXPORT_SYMBOL_GPL(find_pid_ns);
3211da177e4SLinus Torvalds
find_vpid(int nr)3228990571eSPavel Emelyanov struct pid *find_vpid(int nr)
3238990571eSPavel Emelyanov {
32417cf22c3SEric W. Biederman return find_pid_ns(nr, task_active_pid_ns(current));
3258990571eSPavel Emelyanov }
3268990571eSPavel Emelyanov EXPORT_SYMBOL_GPL(find_vpid);
3278990571eSPavel Emelyanov
task_pid_ptr(struct task_struct * task,enum pid_type type)3282c470475SEric W. Biederman static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
3292c470475SEric W. Biederman {
3302c470475SEric W. Biederman return (type == PIDTYPE_PID) ?
3312c470475SEric W. Biederman &task->thread_pid :
3322c470475SEric W. Biederman &task->signal->pids[type];
3332c470475SEric W. Biederman }
3342c470475SEric W. Biederman
335e713d0daSSukadev Bhattiprolu /*
336e713d0daSSukadev Bhattiprolu * attach_pid() must be called with the tasklist_lock write-held.
337e713d0daSSukadev Bhattiprolu */
attach_pid(struct task_struct * task,enum pid_type type)33881907739SOleg Nesterov void attach_pid(struct task_struct *task, enum pid_type type)
3391da177e4SLinus Torvalds {
34074198dc2SMateusz Guzik struct pid *pid;
34174198dc2SMateusz Guzik
34274198dc2SMateusz Guzik lockdep_assert_held_write(&tasklist_lock);
34374198dc2SMateusz Guzik
34474198dc2SMateusz Guzik pid = *task_pid_ptr(task, type);
3452c470475SEric W. Biederman hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
3461da177e4SLinus Torvalds }
3471da177e4SLinus Torvalds
__change_pid(struct pid ** pids,struct task_struct * task,enum pid_type type,struct pid * new)3487903f907SMateusz Guzik static void __change_pid(struct pid **pids, struct task_struct *task,
3497903f907SMateusz Guzik enum pid_type type, struct pid *new)
3501da177e4SLinus Torvalds {
35174198dc2SMateusz Guzik struct pid **pid_ptr, *pid;
35292476d7fSEric W. Biederman int tmp;
3531da177e4SLinus Torvalds
35474198dc2SMateusz Guzik lockdep_assert_held_write(&tasklist_lock);
35574198dc2SMateusz Guzik
35674198dc2SMateusz Guzik pid_ptr = task_pid_ptr(task, type);
3572c470475SEric W. Biederman pid = *pid_ptr;
35892476d7fSEric W. Biederman
3592c470475SEric W. Biederman hlist_del_rcu(&task->pid_links[type]);
3602c470475SEric W. Biederman *pid_ptr = new;
3611da177e4SLinus Torvalds
36243f0df54SOleg Nesterov if (type == PIDTYPE_PID) {
36343f0df54SOleg Nesterov WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID));
36443f0df54SOleg Nesterov wake_up_all(&pid->wait_pidfd);
36543f0df54SOleg Nesterov }
36643f0df54SOleg Nesterov
3671da177e4SLinus Torvalds for (tmp = PIDTYPE_MAX; --tmp >= 0; )
3681d416a11SChristian Brauner if (pid_has_task(pid, tmp))
3691da177e4SLinus Torvalds return;
3701da177e4SLinus Torvalds
3717903f907SMateusz Guzik WARN_ON(pids[type]);
3727903f907SMateusz Guzik pids[type] = pid;
3731da177e4SLinus Torvalds }
3741da177e4SLinus Torvalds
detach_pid(struct pid ** pids,struct task_struct * task,enum pid_type type)3757903f907SMateusz Guzik void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type type)
37624336eaeSOleg Nesterov {
3777903f907SMateusz Guzik __change_pid(pids, task, type, NULL);
37824336eaeSOleg Nesterov }
37924336eaeSOleg Nesterov
/*
 * Replace @task's @type pid with @pid and link the task onto the new pid's
 * list. The old pid is returned through @pids[@type] if it became unused.
 */
void change_pid(struct pid **pids, struct task_struct *task, enum pid_type type,
		struct pid *pid)
{
	/* First swap the slot and unlink from the old pid ... */
	__change_pid(pids, task, type, pid);
	/* ... then hook the task up to the pid now stored in the slot. */
	attach_pid(task, type);
}
38624336eaeSOleg Nesterov
exchange_tids(struct task_struct * left,struct task_struct * right)3876b03d130SEric W. Biederman void exchange_tids(struct task_struct *left, struct task_struct *right)
3886b03d130SEric W. Biederman {
3896b03d130SEric W. Biederman struct pid *pid1 = left->thread_pid;
3906b03d130SEric W. Biederman struct pid *pid2 = right->thread_pid;
3916b03d130SEric W. Biederman struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID];
3926b03d130SEric W. Biederman struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID];
3936b03d130SEric W. Biederman
39474198dc2SMateusz Guzik lockdep_assert_held_write(&tasklist_lock);
39574198dc2SMateusz Guzik
3966b03d130SEric W. Biederman /* Swap the single entry tid lists */
3976b03d130SEric W. Biederman hlists_swap_heads_rcu(head1, head2);
3986b03d130SEric W. Biederman
3996b03d130SEric W. Biederman /* Swap the per task_struct pid */
4006b03d130SEric W. Biederman rcu_assign_pointer(left->thread_pid, pid2);
4016b03d130SEric W. Biederman rcu_assign_pointer(right->thread_pid, pid1);
4026b03d130SEric W. Biederman
4036b03d130SEric W. Biederman /* Swap the cached value */
4046b03d130SEric W. Biederman WRITE_ONCE(left->pid, pid_nr(pid2));
4056b03d130SEric W. Biederman WRITE_ONCE(right->pid, pid_nr(pid1));
4066b03d130SEric W. Biederman }
4076b03d130SEric W. Biederman
408c18258c6SEric W. Biederman /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
transfer_pid(struct task_struct * old,struct task_struct * new,enum pid_type type)4097ad5b3a5SHarvey Harrison void transfer_pid(struct task_struct *old, struct task_struct *new,
410c18258c6SEric W. Biederman enum pid_type type)
411c18258c6SEric W. Biederman {
412a1c6d543SOleg Nesterov WARN_ON_ONCE(type == PIDTYPE_PID);
41374198dc2SMateusz Guzik lockdep_assert_held_write(&tasklist_lock);
4142c470475SEric W. Biederman hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
415c18258c6SEric W. Biederman }
416c18258c6SEric W. Biederman
pid_task(struct pid * pid,enum pid_type type)4177ad5b3a5SHarvey Harrison struct task_struct *pid_task(struct pid *pid, enum pid_type type)
41892476d7fSEric W. Biederman {
41992476d7fSEric W. Biederman struct task_struct *result = NULL;
42092476d7fSEric W. Biederman if (pid) {
42192476d7fSEric W. Biederman struct hlist_node *first;
42267bdbffdSArnd Bergmann first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
423db1466b3SPaul E. McKenney lockdep_tasklist_lock_is_held());
42492476d7fSEric W. Biederman if (first)
4252c470475SEric W. Biederman result = hlist_entry(first, struct task_struct, pid_links[(type)]);
42692476d7fSEric W. Biederman }
42792476d7fSEric W. Biederman return result;
42892476d7fSEric W. Biederman }
429eccba068SPavel Emelyanov EXPORT_SYMBOL(pid_task);
43092476d7fSEric W. Biederman
43192476d7fSEric W. Biederman /*
4329728e5d6STetsuo Handa * Must be called under rcu_read_lock().
43392476d7fSEric W. Biederman */
find_task_by_pid_ns(pid_t nr,struct pid_namespace * ns)43417f98dcfSChristoph Hellwig struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
4351da177e4SLinus Torvalds {
436f78f5b90SPaul E. McKenney RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
437f78f5b90SPaul E. McKenney "find_task_by_pid_ns() needs rcu_read_lock() protection");
43817f98dcfSChristoph Hellwig return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
4391da177e4SLinus Torvalds }
4401da177e4SLinus Torvalds
/*
 * Like find_task_by_pid_ns(), with the namespace taken from the current
 * task's active pid namespace.
 */
struct task_struct *find_task_by_vpid(pid_t vnr)
{
	struct pid_namespace *ns = task_active_pid_ns(current);

	return find_task_by_pid_ns(vnr, ns);
}
445228ebcbeSPavel Emelyanov
/*
 * find_get_task_by_vpid() - look up a task by number and take a reference
 *
 * The lookup runs inside an RCU read-side section.
 *
 * Return: the task with a reference held, or NULL if none was found.
 */
struct task_struct *find_get_task_by_vpid(pid_t nr)
{
	struct task_struct *found;

	rcu_read_lock();
	found = find_task_by_vpid(nr);
	if (found != NULL)
		get_task_struct(found);
	rcu_read_unlock();

	return found;
}
4582ee08260SMike Rapoport
/*
 * get_task_pid() - fetch @task's pid of @type through get_pid()
 *
 * The slot is read inside an RCU read-side section and the result of
 * get_pid() on it is returned.
 */
struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
	struct pid **slot;
	struct pid *pid;

	rcu_read_lock();
	slot = task_pid_ptr(task, type);
	pid = get_pid(rcu_dereference(*slot));
	rcu_read_unlock();

	return pid;
}
EXPORT_SYMBOL_GPL(get_task_pid);
4681a657f78SOleg Nesterov
get_pid_task(struct pid * pid,enum pid_type type)4697ad5b3a5SHarvey Harrison struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
47092476d7fSEric W. Biederman {
47192476d7fSEric W. Biederman struct task_struct *result;
47292476d7fSEric W. Biederman rcu_read_lock();
47392476d7fSEric W. Biederman result = pid_task(pid, type);
47492476d7fSEric W. Biederman if (result)
47592476d7fSEric W. Biederman get_task_struct(result);
47692476d7fSEric W. Biederman rcu_read_unlock();
47792476d7fSEric W. Biederman return result;
47892476d7fSEric W. Biederman }
47977c100c8SRik van Riel EXPORT_SYMBOL_GPL(get_pid_task);
48092476d7fSEric W. Biederman
/*
 * find_get_pid() - look up @nr in the caller's active pid namespace
 *
 * The lookup runs inside an RCU read-side section; the result is passed
 * through get_pid() before being returned.
 */
struct pid *find_get_pid(pid_t nr)
{
	struct pid *found;

	rcu_read_lock();
	found = find_vpid(nr);
	found = get_pid(found);
	rcu_read_unlock();

	return found;
}
EXPORT_SYMBOL_GPL(find_get_pid);
49292476d7fSEric W. Biederman
pid_nr_ns(struct pid * pid,struct pid_namespace * ns)4937af57294SPavel Emelyanov pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
4947af57294SPavel Emelyanov {
4957af57294SPavel Emelyanov struct upid *upid;
4967af57294SPavel Emelyanov pid_t nr = 0;
4977af57294SPavel Emelyanov
4987af57294SPavel Emelyanov if (pid && ns->level <= pid->level) {
4997af57294SPavel Emelyanov upid = &pid->numbers[ns->level];
5007af57294SPavel Emelyanov if (upid->ns == ns)
5017af57294SPavel Emelyanov nr = upid->nr;
5027af57294SPavel Emelyanov }
5037af57294SPavel Emelyanov return nr;
5047af57294SPavel Emelyanov }
5054f82f457SEric W. Biederman EXPORT_SYMBOL_GPL(pid_nr_ns);
5067af57294SPavel Emelyanov
pid_vnr(struct pid * pid)50744c4e1b2SEric W. Biederman pid_t pid_vnr(struct pid *pid)
50844c4e1b2SEric W. Biederman {
50917cf22c3SEric W. Biederman return pid_nr_ns(pid, task_active_pid_ns(current));
51044c4e1b2SEric W. Biederman }
51144c4e1b2SEric W. Biederman EXPORT_SYMBOL_GPL(pid_vnr);
51244c4e1b2SEric W. Biederman
__task_pid_nr_ns(struct task_struct * task,enum pid_type type,struct pid_namespace * ns)51352ee2dfdSOleg Nesterov pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
51452ee2dfdSOleg Nesterov struct pid_namespace *ns)
5152f2a3a46SPavel Emelyanov {
51652ee2dfdSOleg Nesterov pid_t nr = 0;
51752ee2dfdSOleg Nesterov
51852ee2dfdSOleg Nesterov rcu_read_lock();
51952ee2dfdSOleg Nesterov if (!ns)
52017cf22c3SEric W. Biederman ns = task_active_pid_ns(current);
5212c470475SEric W. Biederman nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
52252ee2dfdSOleg Nesterov rcu_read_unlock();
52352ee2dfdSOleg Nesterov
52452ee2dfdSOleg Nesterov return nr;
52552ee2dfdSOleg Nesterov }
52652ee2dfdSOleg Nesterov EXPORT_SYMBOL(__task_pid_nr_ns);
5272f2a3a46SPavel Emelyanov
/* Pid namespace of @tsk, obtained via ns_of_pid() from the task's pid. */
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
	struct pid *pid = task_pid(tsk);

	return ns_of_pid(pid);
}
EXPORT_SYMBOL_GPL(task_active_pid_ns);
53361bce0f1SEric W. Biederman
5341da177e4SLinus Torvalds /*
535025dfdafSFrederik Schwarzer * Used by proc to find the first pid that is greater than or equal to nr.
5360804ef4bSEric W. Biederman *
537e49859e7SPavel Emelyanov * If there is a pid at nr this function is exactly the same as find_pid_ns.
5380804ef4bSEric W. Biederman */
find_ge_pid(int nr,struct pid_namespace * ns)539198fe21bSPavel Emelyanov struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
5400804ef4bSEric W. Biederman {
54195846ecfSGargi Sharma return idr_get_next(&ns->idr, &nr);
5420804ef4bSEric W. Biederman }
5434480c27cSAndreas Gruenbacher EXPORT_SYMBOL_GPL(find_ge_pid);
5440804ef4bSEric W. Biederman
pidfd_get_pid(unsigned int fd,unsigned int * flags)5451aa92cd3SMinchan Kim struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
5461aa92cd3SMinchan Kim {
5476348be02SAl Viro CLASS(fd, f)(fd);
5481aa92cd3SMinchan Kim struct pid *pid;
5491aa92cd3SMinchan Kim
5506348be02SAl Viro if (fd_empty(f))
5511aa92cd3SMinchan Kim return ERR_PTR(-EBADF);
5521aa92cd3SMinchan Kim
5531da91ea8SAl Viro pid = pidfd_pid(fd_file(f));
5541aa92cd3SMinchan Kim if (!IS_ERR(pid)) {
5551aa92cd3SMinchan Kim get_pid(pid);
5561da91ea8SAl Viro *flags = fd_file(f)->f_flags;
5571aa92cd3SMinchan Kim }
5581aa92cd3SMinchan Kim return pid;
5591aa92cd3SMinchan Kim }
5601aa92cd3SMinchan Kim
56132fcb426SChristian Brauner /**
562e9bdcdbfSChristian Brauner * pidfd_get_task() - Get the task associated with a pidfd
563e9bdcdbfSChristian Brauner *
564e9bdcdbfSChristian Brauner * @pidfd: pidfd for which to get the task
565e9bdcdbfSChristian Brauner * @flags: flags associated with this pidfd
566e9bdcdbfSChristian Brauner *
567e9bdcdbfSChristian Brauner * Return the task associated with @pidfd. The function takes a reference on
568e9bdcdbfSChristian Brauner * the returned task. The caller is responsible for releasing that reference.
569e9bdcdbfSChristian Brauner *
570e9bdcdbfSChristian Brauner * Return: On success, the task_struct associated with the pidfd.
571e9bdcdbfSChristian Brauner * On error, a negative errno number will be returned.
572e9bdcdbfSChristian Brauner */
pidfd_get_task(int pidfd,unsigned int * flags)573e9bdcdbfSChristian Brauner struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
574e9bdcdbfSChristian Brauner {
575e9bdcdbfSChristian Brauner unsigned int f_flags = 0;
576e9bdcdbfSChristian Brauner struct pid *pid;
577e9bdcdbfSChristian Brauner struct task_struct *task;
578e9bdcdbfSChristian Brauner enum pid_type type;
579e9bdcdbfSChristian Brauner
580e9bdcdbfSChristian Brauner switch (pidfd) {
581e9bdcdbfSChristian Brauner case PIDFD_SELF_THREAD:
582e9bdcdbfSChristian Brauner type = PIDTYPE_PID;
583e9bdcdbfSChristian Brauner pid = get_task_pid(current, type);
584e9bdcdbfSChristian Brauner break;
585e9bdcdbfSChristian Brauner case PIDFD_SELF_THREAD_GROUP:
586e9bdcdbfSChristian Brauner type = PIDTYPE_TGID;
587e9bdcdbfSChristian Brauner pid = get_task_pid(current, type);
588e9bdcdbfSChristian Brauner break;
589e9bdcdbfSChristian Brauner default:
590e9bdcdbfSChristian Brauner pid = pidfd_get_pid(pidfd, &f_flags);
591e9bdcdbfSChristian Brauner if (IS_ERR(pid))
592e9bdcdbfSChristian Brauner return ERR_CAST(pid);
59332fcb426SChristian Brauner type = PIDTYPE_TGID;
59432fcb426SChristian Brauner break;
59532fcb426SChristian Brauner }
5966da73d15SChristian Brauner
59732fcb426SChristian Brauner task = get_pid_task(pid, type);
59832fcb426SChristian Brauner put_pid(pid);
59932fcb426SChristian Brauner if (!task)
60032fcb426SChristian Brauner return ERR_PTR(-ESRCH);
60132fcb426SChristian Brauner
60232fcb426SChristian Brauner *flags = f_flags;
603c576e0fcSMatthew Bobrowski return task;
604c576e0fcSMatthew Bobrowski }
60532fcb426SChristian Brauner
/**
 * pidfd_create() - Create a new pid file descriptor.
 *
 * @pid:   struct pid that the pidfd will reference
 * @flags: flags to pass
 *
 * Creates a new pid file descriptor with the O_CLOEXEC flag set and
 * installs it into the caller's fd table.
 *
 * Note that this function can only be called after the fd table has
 * been unshared, to avoid leaking the pidfd to the new process.
 *
 * This symbol should not be explicitly exported to loadable modules.
 *
 * Return: On success, a cloexec pidfd is returned.
 *         On error, a negative errno number will be returned.
 */
static int pidfd_create(struct pid *pid, unsigned int flags)
{
	struct file *file;
	int fd = pidfd_prepare(pid, flags, &file);

	/* Only a successfully prepared descriptor gets installed. */
	if (fd >= 0)
		fd_install(fd, file);

	return fd;
}
63432fcb426SChristian Brauner
63532fcb426SChristian Brauner /**
6361e1d0f0bSChristian Brauner * sys_pidfd_open() - Open new pid file descriptor.
63732fcb426SChristian Brauner *
63832fcb426SChristian Brauner * @pid: pid for which to retrieve a pidfd
63964bef697SOleg Nesterov * @flags: flags to pass
64032fcb426SChristian Brauner *
64132fcb426SChristian Brauner * This creates a new pid file descriptor with the O_CLOEXEC flag set for
64232fcb426SChristian Brauner * the task identified by @pid. Without PIDFD_THREAD flag the target task
64332fcb426SChristian Brauner * must be a thread-group leader.
64432fcb426SChristian Brauner *
64532fcb426SChristian Brauner * Return: On success, a cloexec pidfd is returned.
64632fcb426SChristian Brauner * On error, a negative errno number will be returned.
64732fcb426SChristian Brauner */
SYSCALL_DEFINE2(pidfd_open,pid_t,pid,unsigned int,flags)64832fcb426SChristian Brauner SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
6496da73d15SChristian Brauner {
65032fcb426SChristian Brauner int fd;
65132fcb426SChristian Brauner struct pid *p;
65232fcb426SChristian Brauner
65332fcb426SChristian Brauner if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD))
65432fcb426SChristian Brauner return -EINVAL;
6557863dcc7SChristian Brauner
6567863dcc7SChristian Brauner if (pid <= 0)
6577863dcc7SChristian Brauner return -EINVAL;
6587863dcc7SChristian Brauner
6597863dcc7SChristian Brauner p = find_get_pid(pid);
6607863dcc7SChristian Brauner if (!p)
6617863dcc7SChristian Brauner return -ESRCH;
6627863dcc7SChristian Brauner
6637863dcc7SChristian Brauner fd = pidfd_create(p, flags);
6647863dcc7SChristian Brauner
6657863dcc7SChristian Brauner put_pid(p);
6667863dcc7SChristian Brauner return fd;
6677863dcc7SChristian Brauner }
6687863dcc7SChristian Brauner
6697863dcc7SChristian Brauner #ifdef CONFIG_SYSCTL
pid_table_root_lookup(struct ctl_table_root * root)6707863dcc7SChristian Brauner static struct ctl_table_set *pid_table_root_lookup(struct ctl_table_root *root)
6717863dcc7SChristian Brauner {
6727863dcc7SChristian Brauner return &task_active_pid_ns(current)->set;
6737863dcc7SChristian Brauner }
6747863dcc7SChristian Brauner
set_is_seen(struct ctl_table_set * set)6757863dcc7SChristian Brauner static int set_is_seen(struct ctl_table_set *set)
6767863dcc7SChristian Brauner {
6777863dcc7SChristian Brauner return &task_active_pid_ns(current)->set == set;
6787863dcc7SChristian Brauner }
6797863dcc7SChristian Brauner
pid_table_root_permissions(struct ctl_table_header * head,const struct ctl_table * table)6807863dcc7SChristian Brauner static int pid_table_root_permissions(struct ctl_table_header *head,
6817863dcc7SChristian Brauner const struct ctl_table *table)
6827863dcc7SChristian Brauner {
6837863dcc7SChristian Brauner struct pid_namespace *pidns =
6847863dcc7SChristian Brauner container_of(head->set, struct pid_namespace, set);
6857863dcc7SChristian Brauner int mode = table->mode;
6867863dcc7SChristian Brauner
6877863dcc7SChristian Brauner if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) ||
6887863dcc7SChristian Brauner uid_eq(current_euid(), make_kuid(pidns->user_ns, 0)))
6897863dcc7SChristian Brauner mode = (mode & S_IRWXU) >> 6;
6907863dcc7SChristian Brauner else if (in_egroup_p(make_kgid(pidns->user_ns, 0)))
6917863dcc7SChristian Brauner mode = (mode & S_IRWXG) >> 3;
6927863dcc7SChristian Brauner else
6937863dcc7SChristian Brauner mode = mode & S_IROTH;
6947863dcc7SChristian Brauner return (mode << 6) | (mode << 3) | mode;
6957863dcc7SChristian Brauner }
6967863dcc7SChristian Brauner
/*
 * Report uid 0 / gid 0 of the pid namespace's user namespace as the
 * owner. An id that has no valid mapping leaves the corresponding
 * output untouched.
 */
static void pid_table_root_set_ownership(struct ctl_table_header *head,
					 kuid_t *uid, kgid_t *gid)
{
	struct pid_namespace *pidns =
		container_of(head->set, struct pid_namespace, set);
	kuid_t root_uid = make_kuid(pidns->user_ns, 0);
	kgid_t root_gid = make_kgid(pidns->user_ns, 0);

	if (uid_valid(root_uid))
		*uid = root_uid;

	if (gid_valid(root_gid))
		*gid = root_gid;
}
7137863dcc7SChristian Brauner
7147863dcc7SChristian Brauner static struct ctl_table_root pid_table_root = {
7157863dcc7SChristian Brauner .lookup = pid_table_root_lookup,
7167863dcc7SChristian Brauner .permissions = pid_table_root_permissions,
7177863dcc7SChristian Brauner .set_ownership = pid_table_root_set_ownership,
7187863dcc7SChristian Brauner };
7197863dcc7SChristian Brauner
7207863dcc7SChristian Brauner static const struct ctl_table pid_table[] = {
7217863dcc7SChristian Brauner {
7227863dcc7SChristian Brauner .procname = "pid_max",
7237863dcc7SChristian Brauner .data = &init_pid_ns.pid_max,
7247863dcc7SChristian Brauner .maxlen = sizeof(int),
7257863dcc7SChristian Brauner .mode = 0644,
7267863dcc7SChristian Brauner .proc_handler = proc_dointvec_minmax,
7277863dcc7SChristian Brauner .extra1 = &pid_max_min,
7287863dcc7SChristian Brauner .extra2 = &pid_max_max,
7297863dcc7SChristian Brauner },
7307863dcc7SChristian Brauner };
7317863dcc7SChristian Brauner #endif
7327863dcc7SChristian Brauner
/*
 * Register the per-namespace sysctl table for @pidns under "kernel".
 *
 * Returns 0 on success (and always when CONFIG_SYSCTL is disabled), or
 * -ENOMEM if the table could not be duplicated or registered.
 */
int register_pidns_sysctls(struct pid_namespace *pidns)
{
#ifdef CONFIG_SYSCTL
	struct ctl_table *table;
	int scaled_max;

	setup_sysctl_set(&pidns->set, &pid_table_root, set_is_seen);

	/* Each namespace gets a private, writable copy of the template. */
	table = kmemdup(pid_table, sizeof(pid_table), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	/* Point the copied entry at this namespace's own limit. */
	table->data = &pidns->pid_max;

	/* Raise pid_max with the CPU count, but never above pid_max_max. */
	scaled_max = max_t(int, pidns->pid_max,
			   PIDS_PER_CPU_DEFAULT * num_possible_cpus());
	pidns->pid_max = min(pid_max_max, scaled_max);

	pidns->sysctls = __register_sysctl_table(&pidns->set, "kernel", table,
						 ARRAY_SIZE(pid_table));
	if (!pidns->sysctls) {
		/* Undo the copy and the set setup done above. */
		kfree(table);
		retire_sysctl_set(&pidns->set);
		return -ENOMEM;
	}
#endif
	return 0;
}
7571da177e4SLinus Torvalds
/* Tear down what register_pidns_sysctls() set up for @pidns. */
void unregister_pidns_sysctls(struct pid_namespace *pidns)
{
#ifdef CONFIG_SYSCTL
	/* Grab the table pointer before the header is unregistered. */
	const struct ctl_table *table = pidns->sysctls->ctl_table_arg;

	unregister_sysctl_table(pidns->sysctls);
	retire_sysctl_set(&pidns->set);

	/* The table was allocated with kmemdup() at registration time. */
	kfree(table);
#endif
}
76992476d7fSEric W. Biederman
pid_idr_init(void)770b69f0aebSKees Cook void __init pid_idr_init(void)
771dd546618SChristian Brauner {
772b69f0aebSKees Cook /* Verify no one has done anything silly: */
773b69f0aebSKees Cook BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
774b69f0aebSKees Cook
7751da177e4SLinus Torvalds /* bump default and minimum pid_max based on number of cpus */
7768649c322SSargun Dhillon init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max,
7777863dcc7SChristian Brauner PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
7787863dcc7SChristian Brauner pid_max_min = max_t(int, pid_max_min,
7797863dcc7SChristian Brauner PIDS_PER_CPU_MIN * num_possible_cpus());
7807863dcc7SChristian Brauner pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min);
7817863dcc7SChristian Brauner
7827863dcc7SChristian Brauner idr_init(&init_pid_ns.idr);
7837863dcc7SChristian Brauner
7847863dcc7SChristian Brauner init_pid_ns.pid_cachep = kmem_cache_create("pid",
7857863dcc7SChristian Brauner struct_size_t(struct pid, numbers, 1),
7867863dcc7SChristian Brauner __alignof__(struct pid),
7878649c322SSargun Dhillon SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
7888649c322SSargun Dhillon NULL);
7898649c322SSargun Dhillon }
7908649c322SSargun Dhillon
pid_namespace_sysctl_init(void)7918649c322SSargun Dhillon static __init int pid_namespace_sysctl_init(void)
792f7cfd871SEric W. Biederman {
7938649c322SSargun Dhillon #ifdef CONFIG_SYSCTL
7948649c322SSargun Dhillon /* "kernel" directory will have already been initialized. */
7958649c322SSargun Dhillon BUG_ON(register_pidns_sysctls(&init_pid_ns));
7968649c322SSargun Dhillon #endif
7978649c322SSargun Dhillon return 0;
7988649c322SSargun Dhillon }
7998649c322SSargun Dhillon subsys_initcall(pid_namespace_sysctl_init);
8008649c322SSargun Dhillon
__pidfd_fget(struct task_struct * task,int fd)801f7cfd871SEric W. Biederman static struct file *__pidfd_fget(struct task_struct *task, int fd)
8028649c322SSargun Dhillon {
8030c9bd6bcSTycho Andersen struct file *file;
8040c9bd6bcSTycho Andersen int ret;
8050c9bd6bcSTycho Andersen
8060c9bd6bcSTycho Andersen ret = down_read_killable(&task->signal->exec_update_lock);
8070c9bd6bcSTycho Andersen if (ret)
8080c9bd6bcSTycho Andersen return ERR_PTR(ret);
8090c9bd6bcSTycho Andersen
8100c9bd6bcSTycho Andersen if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS))
8110c9bd6bcSTycho Andersen file = fget_task(task, fd);
8120c9bd6bcSTycho Andersen else
8130c9bd6bcSTycho Andersen file = ERR_PTR(-EPERM);
8140c9bd6bcSTycho Andersen
8150c9bd6bcSTycho Andersen up_read(&task->signal->exec_update_lock);
8160c9bd6bcSTycho Andersen
8170c9bd6bcSTycho Andersen if (!file) {
8180c9bd6bcSTycho Andersen /*
8190c9bd6bcSTycho Andersen * It is possible that the target thread is exiting; it can be
8200c9bd6bcSTycho Andersen * either:
8210c9bd6bcSTycho Andersen * 1. before exit_signals(), which gives a real fd
8220c9bd6bcSTycho Andersen * 2. before exit_files() takes the task_lock() gives a real fd
8238649c322SSargun Dhillon * 3. after exit_files() releases task_lock(), ->files is NULL;
8248649c322SSargun Dhillon * this has PF_EXITING, since it was set in exit_signals(),
8258649c322SSargun Dhillon * __pidfd_fget() returns EBADF.
8268649c322SSargun Dhillon * In case 3 we get EBADF, but that really means ESRCH, since
8278649c322SSargun Dhillon * the task is currently exiting and has freed its files
8288649c322SSargun Dhillon * struct, so we fix it up.
8298649c322SSargun Dhillon */
8308649c322SSargun Dhillon if (task->flags & PF_EXITING)
8318649c322SSargun Dhillon file = ERR_PTR(-ESRCH);
8328649c322SSargun Dhillon else
8338649c322SSargun Dhillon file = ERR_PTR(-EBADF);
8348649c322SSargun Dhillon }
8358649c322SSargun Dhillon
8368649c322SSargun Dhillon return file;
8378649c322SSargun Dhillon }
8388649c322SSargun Dhillon
pidfd_getfd(struct pid * pid,int fd)8398649c322SSargun Dhillon static int pidfd_getfd(struct pid *pid, int fd)
8404e94ddfeSChristian Brauner {
8418649c322SSargun Dhillon struct task_struct *task;
8428649c322SSargun Dhillon struct file *file;
8438649c322SSargun Dhillon int ret;
8448649c322SSargun Dhillon
8458649c322SSargun Dhillon task = get_pid_task(pid, PIDTYPE_PID);
8468649c322SSargun Dhillon if (!task)
8478649c322SSargun Dhillon return -ESRCH;
8488649c322SSargun Dhillon
8498649c322SSargun Dhillon file = __pidfd_fget(task, fd);
8508649c322SSargun Dhillon put_task_struct(task);
8518649c322SSargun Dhillon if (IS_ERR(file))
8528649c322SSargun Dhillon return PTR_ERR(file);
8538649c322SSargun Dhillon
8548649c322SSargun Dhillon ret = receive_fd(file, NULL, O_CLOEXEC);
8558649c322SSargun Dhillon fput(file);
8568649c322SSargun Dhillon
8578649c322SSargun Dhillon return ret;
8588649c322SSargun Dhillon }
8598649c322SSargun Dhillon
8608649c322SSargun Dhillon /**
8618649c322SSargun Dhillon * sys_pidfd_getfd() - Get a file descriptor from another process
8628649c322SSargun Dhillon *
8638649c322SSargun Dhillon * @pidfd: the pidfd file descriptor of the process
8648649c322SSargun Dhillon * @fd: the file descriptor number to get
8658649c322SSargun Dhillon * @flags: flags on how to get the fd (reserved)
8668649c322SSargun Dhillon *
8678649c322SSargun Dhillon * This syscall gets a copy of a file descriptor from another process
8688649c322SSargun Dhillon * based on the pidfd, and file descriptor number. It requires that
8698649c322SSargun Dhillon * the calling process has the ability to ptrace the process represented
8708649c322SSargun Dhillon * by the pidfd. The process which is having its file descriptor copied
8718152f820SAl Viro * is otherwise unaffected.
8728152f820SAl Viro *
8738649c322SSargun Dhillon * Return: On success, a cloexec file descriptor is returned.
8748649c322SSargun Dhillon * On error, a negative errno number will be returned.
8751da91ea8SAl Viro */
SYSCALL_DEFINE3(pidfd_getfd,int,pidfd,int,fd,unsigned int,flags)8768649c322SSargun Dhillon SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
8778152f820SAl Viro unsigned int, flags)
8788649c322SSargun Dhillon {
8798152f820SAl Viro struct pid *pid;
8808649c322SSargun Dhillon
881 /* flags is currently unused - make sure it's unset */
882 if (flags)
883 return -EINVAL;
884
885 CLASS(fd, f)(pidfd);
886 if (fd_empty(f))
887 return -EBADF;
888
889 pid = pidfd_pid(fd_file(f));
890 if (IS_ERR(pid))
891 return PTR_ERR(pid);
892
893 return pidfd_getfd(pid, fd);
894 }
895