xref: /linux-6.15/kernel/pid.c (revision 627454c0)
1457c8996SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only
21da177e4SLinus Torvalds /*
31da177e4SLinus Torvalds  * Generic pidhash and scalable, time-bounded PID allocator
41da177e4SLinus Torvalds  *
56d49e352SNadia Yvette Chambers  * (C) 2002-2003 Nadia Yvette Chambers, IBM
66d49e352SNadia Yvette Chambers  * (C) 2004 Nadia Yvette Chambers, Oracle
71da177e4SLinus Torvalds  * (C) 2002-2004 Ingo Molnar, Red Hat
81da177e4SLinus Torvalds  *
91da177e4SLinus Torvalds  * pid-structures are backing objects for tasks sharing a given ID to chain
101da177e4SLinus Torvalds  * against. There is very little to them aside from hashing them and
111da177e4SLinus Torvalds  * parking tasks using given ID's on a list.
121da177e4SLinus Torvalds  *
131da177e4SLinus Torvalds  * The hash is always changed with the tasklist_lock write-acquired,
141da177e4SLinus Torvalds  * and the hash is only accessed with the tasklist_lock at least
151da177e4SLinus Torvalds  * read-acquired, so there's no additional SMP locking needed here.
161da177e4SLinus Torvalds  *
171da177e4SLinus Torvalds  * We have a list of bitmap pages, which bitmaps represent the PID space.
181da177e4SLinus Torvalds  * Allocating and freeing PIDs is completely lockless. The worst-case
191da177e4SLinus Torvalds  * allocation scenario when all but one out of 1 million PIDs possible are
201da177e4SLinus Torvalds  * allocated already: the scanning of 32 list entries and at most PAGE_SIZE
211da177e4SLinus Torvalds  * bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
2230e49c26SPavel Emelyanov  *
2330e49c26SPavel Emelyanov  * Pid namespaces:
2430e49c26SPavel Emelyanov  *    (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
2530e49c26SPavel Emelyanov  *    (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
2630e49c26SPavel Emelyanov  *     Many thanks to Oleg Nesterov for comments and help
2730e49c26SPavel Emelyanov  *
281da177e4SLinus Torvalds  */
291da177e4SLinus Torvalds 
301da177e4SLinus Torvalds #include <linux/mm.h>
319984de1aSPaul Gortmaker #include <linux/export.h>
321da177e4SLinus Torvalds #include <linux/slab.h>
331da177e4SLinus Torvalds #include <linux/init.h>
3482524746SFranck Bui-Huu #include <linux/rculist.h>
3557c8a661SMike Rapoport #include <linux/memblock.h>
3661a58c6cSSukadev Bhattiprolu #include <linux/pid_namespace.h>
37820e45dbSSukadev Bhattiprolu #include <linux/init_task.h>
383eb07c8cSSukadev Bhattiprolu #include <linux/syscalls.h>
390bb80f24SDavid Howells #include <linux/proc_ns.h>
40f57e515aSJoel Fernandes (Google) #include <linux/refcount.h>
4132fcb426SChristian Brauner #include <linux/anon_inodes.h>
4232fcb426SChristian Brauner #include <linux/sched/signal.h>
4329930025SIngo Molnar #include <linux/sched/task.h>
4495846ecfSGargi Sharma #include <linux/idr.h>
45cb12fd8eSChristian Brauner #include <linux/pidfs.h>
4616ecd47cSChristian Brauner #include <linux/seqlock.h>
474969f8a0SKees Cook #include <net/sock.h>
486da73d15SChristian Brauner #include <uapi/linux/pidfd.h>
491da177e4SLinus Torvalds 
50e1e871afSDavid Howells struct pid init_struct_pid = {
51f57e515aSJoel Fernandes (Google) 	.count		= REFCOUNT_INIT(1),
52e1e871afSDavid Howells 	.tasks		= {
53e1e871afSDavid Howells 		{ .first = NULL },
54e1e871afSDavid Howells 		{ .first = NULL },
55e1e871afSDavid Howells 		{ .first = NULL },
56e1e871afSDavid Howells 	},
57e1e871afSDavid Howells 	.level		= 0,
58e1e871afSDavid Howells 	.numbers	= { {
59e1e871afSDavid Howells 		.nr		= 0,
60e1e871afSDavid Howells 		.ns		= &init_pid_ns,
61e1e871afSDavid Howells 	}, }
62e1e871afSDavid Howells };
631da177e4SLinus Torvalds 
647863dcc7SChristian Brauner static int pid_max_min = RESERVED_PIDS + 1;
657863dcc7SChristian Brauner static int pid_max_max = PID_MAX_LIMIT;
661da177e4SLinus Torvalds 
671da177e4SLinus Torvalds /*
681da177e4SLinus Torvalds  * PID-map pages start out as NULL, they get allocated upon
691da177e4SLinus Torvalds  * first use and are never deallocated. This way a low pid_max
701da177e4SLinus Torvalds  * value does not cause lots of bitmaps to be allocated, but
711da177e4SLinus Torvalds  * the scheme scales to up to 4 million PIDs, runtime.
721da177e4SLinus Torvalds  */
7361a58c6cSSukadev Bhattiprolu struct pid_namespace init_pid_ns = {
748eb71d95SKirill Tkhai 	.ns.count = REFCOUNT_INIT(2),
75f6bb2a2cSMatthew Wilcox 	.idr = IDR_INIT(init_pid_ns.idr),
76e8cfbc24SGargi Sharma 	.pid_allocated = PIDNS_ADDING,
77faacbfd3SPavel Emelyanov 	.level = 0,
78faacbfd3SPavel Emelyanov 	.child_reaper = &init_task,
7949f4d8b9SEric W. Biederman 	.user_ns = &init_user_ns,
80435d5f4bSAl Viro 	.ns.inum = PROC_PID_INIT_INO,
8133c42940SAl Viro #ifdef CONFIG_PID_NS
8233c42940SAl Viro 	.ns.ops = &pidns_operations,
8333c42940SAl Viro #endif
847863dcc7SChristian Brauner 	.pid_max = PID_MAX_DEFAULT,
859876cfe8SAleksa Sarai #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
869876cfe8SAleksa Sarai 	.memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
879876cfe8SAleksa Sarai #endif
883fbc9648SSukadev Bhattiprolu };
89198fe21bSPavel Emelyanov EXPORT_SYMBOL_GPL(init_pid_ns);
901da177e4SLinus Torvalds 
911da177e4SLinus Torvalds static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
9216ecd47cSChristian Brauner seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
931da177e4SLinus Torvalds 
put_pid(struct pid * pid)947ad5b3a5SHarvey Harrison void put_pid(struct pid *pid)
9592476d7fSEric W. Biederman {
96baf8f0f8SPavel Emelianov 	struct pid_namespace *ns;
97baf8f0f8SPavel Emelianov 
9892476d7fSEric W. Biederman 	if (!pid)
9992476d7fSEric W. Biederman 		return;
100baf8f0f8SPavel Emelianov 
1018ef047aaSPavel Emelyanov 	ns = pid->numbers[pid->level].ns;
102f57e515aSJoel Fernandes (Google) 	if (refcount_dec_and_test(&pid->count)) {
103baf8f0f8SPavel Emelianov 		kmem_cache_free(ns->pid_cachep, pid);
1048ef047aaSPavel Emelyanov 		put_pid_ns(ns);
1058ef047aaSPavel Emelyanov 	}
10692476d7fSEric W. Biederman }
107bbf73147SEric W. Biederman EXPORT_SYMBOL_GPL(put_pid);
10892476d7fSEric W. Biederman 
delayed_put_pid(struct rcu_head * rhp)10992476d7fSEric W. Biederman static void delayed_put_pid(struct rcu_head *rhp)
11092476d7fSEric W. Biederman {
11192476d7fSEric W. Biederman 	struct pid *pid = container_of(rhp, struct pid, rcu);
11292476d7fSEric W. Biederman 	put_pid(pid);
11392476d7fSEric W. Biederman }
11492476d7fSEric W. Biederman 
free_pid(struct pid * pid)1157ad5b3a5SHarvey Harrison void free_pid(struct pid *pid)
11692476d7fSEric W. Biederman {
1178ef047aaSPavel Emelyanov 	int i;
11892476d7fSEric W. Biederman 
1197903f907SMateusz Guzik 	lockdep_assert_not_held(&tasklist_lock);
1207903f907SMateusz Guzik 
121*627454c0SMateusz Guzik 	spin_lock(&pidmap_lock);
1220a01f2ccSEric W. Biederman 	for (i = 0; i <= pid->level; i++) {
1230a01f2ccSEric W. Biederman 		struct upid *upid = pid->numbers + i;
124af4b8a83SEric W. Biederman 		struct pid_namespace *ns = upid->ns;
125e8cfbc24SGargi Sharma 		switch (--ns->pid_allocated) {
126a6064885SEric W. Biederman 		case 2:
127af4b8a83SEric W. Biederman 		case 1:
128af4b8a83SEric W. Biederman 			/* When all that is left in the pid namespace
129af4b8a83SEric W. Biederman 			 * is the reaper wake up the reaper.  The reaper
130af4b8a83SEric W. Biederman 			 * may be sleeping in zap_pid_ns_processes().
131af4b8a83SEric W. Biederman 			 */
132af4b8a83SEric W. Biederman 			wake_up_process(ns->child_reaper);
133af4b8a83SEric W. Biederman 			break;
134e8cfbc24SGargi Sharma 		case PIDNS_ADDING:
135314a8ad0SOleg Nesterov 			/* Handle a fork failure of the first process */
136314a8ad0SOleg Nesterov 			WARN_ON(ns->child_reaper);
137e8cfbc24SGargi Sharma 			ns->pid_allocated = 0;
138af4b8a83SEric W. Biederman 			break;
1390a01f2ccSEric W. Biederman 		}
14095846ecfSGargi Sharma 
14195846ecfSGargi Sharma 		idr_remove(&ns->idr, upid->nr);
1425e1182deSEric W. Biederman 	}
1439698d5a4SChristian Brauner 	pidfs_remove_pid(pid);
144*627454c0SMateusz Guzik 	spin_unlock(&pidmap_lock);
14592476d7fSEric W. Biederman 
14692476d7fSEric W. Biederman 	call_rcu(&pid->rcu, delayed_put_pid);
14792476d7fSEric W. Biederman }
14892476d7fSEric W. Biederman 
free_pids(struct pid ** pids)1497903f907SMateusz Guzik void free_pids(struct pid **pids)
1507903f907SMateusz Guzik {
1517903f907SMateusz Guzik 	int tmp;
1527903f907SMateusz Guzik 
1537903f907SMateusz Guzik 	/*
1547903f907SMateusz Guzik 	 * This can batch pidmap_lock.
1557903f907SMateusz Guzik 	 */
1567903f907SMateusz Guzik 	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
1577903f907SMateusz Guzik 		if (pids[tmp])
1587903f907SMateusz Guzik 			free_pid(pids[tmp]);
1597903f907SMateusz Guzik }
1607903f907SMateusz Guzik 
/*
 * alloc_pid - allocate a struct pid numbered in @ns and all its ancestors
 * @ns:           most nested pid namespace the new pid lives in
 * @set_tid:      optional array of requested PID numbers, starting with the
 *                one for @ns and continuing towards the ancestors
 * @set_tid_size: number of valid entries in @set_tid
 *
 * Returns the new pid with one reference held, or an ERR_PTR():
 * -EINVAL for a bad @set_tid_size or out-of-range/unusable requested PID,
 * -EPERM when the caller lacks checkpoint/restore capability in a namespace
 * for which a PID was requested, -EEXIST when a requested PID is taken,
 * -EAGAIN when a namespace has no free number, and -ENOMEM for allocation
 * failure or when @ns no longer accepts new pids.
 */
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
		      size_t set_tid_size)
{
	struct pid_namespace *cur_ns = ns;
	enum pid_type type;
	struct upid *upid;
	struct pid *pid;
	int retval = -ENOMEM;
	int i, nr;

	/*
	 * set_tid_size contains the size of the set_tid array. Starting at
	 * the most nested currently active PID namespace it tells alloc_pid()
	 * which PID to set for a process in that most nested PID namespace
	 * up to set_tid_size PID namespaces. It does not have to set the PID
	 * for a process in all nested PID namespaces but set_tid_size must
	 * never be greater than the current ns->level + 1.
	 */
	if (set_tid_size > ns->level + 1)
		return ERR_PTR(-EINVAL);

	pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
	if (!pid)
		return ERR_PTR(retval);

	pid->level = ns->level;

	/* Reserve a number at every level, walking from @ns up to the root. */
	for (i = ns->level; i >= 0; i--, cur_ns = cur_ns->parent) {
		int pid_max = READ_ONCE(cur_ns->pid_max);
		int tid = 0;

		if (set_tid_size) {
			tid = set_tid[ns->level - i];

			/*
			 * Reject out-of-range requests, and also fail if a
			 * PID != 1 is requested and no PID 1 exists.
			 */
			if (tid < 1 || tid >= pid_max ||
			    (tid != 1 && !cur_ns->child_reaper)) {
				retval = -EINVAL;
				goto out_free;
			}
			if (!checkpoint_restore_ns_capable(cur_ns->user_ns)) {
				retval = -EPERM;
				goto out_free;
			}
			set_tid_size--;
		}

		idr_preload(GFP_KERNEL);
		spin_lock(&pidmap_lock);

		if (tid) {
			nr = idr_alloc(&cur_ns->idr, NULL, tid,
				       tid + 1, GFP_ATOMIC);
			/*
			 * If ENOSPC is returned it means that the PID is
			 * already in use. Return EEXIST in that case.
			 */
			if (nr == -ENOSPC)
				nr = -EEXIST;
		} else {
			/*
			 * init really needs pid 1, but after reaching the
			 * maximum wrap back to RESERVED_PIDS
			 */
			int pid_min =
				idr_get_cursor(&cur_ns->idr) > RESERVED_PIDS ?
				RESERVED_PIDS : 1;

			/*
			 * Store a null pointer so find_pid_ns does not find
			 * a partially initialized PID (see below).
			 */
			nr = idr_alloc_cyclic(&cur_ns->idr, NULL, pid_min,
					      pid_max, GFP_ATOMIC);
		}
		spin_unlock(&pidmap_lock);
		idr_preload_end();

		if (nr < 0) {
			if (nr == -ENOSPC)
				retval = -EAGAIN;
			else
				retval = nr;
			goto out_free;
		}

		pid->numbers[i].nr = nr;
		pid->numbers[i].ns = cur_ns;
	}

	/*
	 * ENOMEM is not the most obvious choice especially for the case
	 * where the child subreaper has already exited and the pid
	 * namespace denies the creation of any new processes. But ENOMEM
	 * is what we have exposed to userspace for a long time and it is
	 * documented behavior for pid namespaces. So we can't easily
	 * change it even if there were an error code better suited.
	 */
	retval = -ENOMEM;

	get_pid_ns(ns);
	refcount_set(&pid->count, 1);
	spin_lock_init(&pid->lock);
	for (type = 0; type < PIDTYPE_MAX; ++type)
		INIT_HLIST_HEAD(&pid->tasks[type]);

	init_waitqueue_head(&pid->wait_pidfd);
	INIT_HLIST_HEAD(&pid->inodes);

	upid = &pid->numbers[ns->level];
	idr_preload(GFP_KERNEL);
	spin_lock(&pidmap_lock);
	if (!(ns->pid_allocated & PIDNS_ADDING))
		goto out_unlock;
	pidfs_add_pid(pid);
	while (upid >= pid->numbers) {
		/* Make the PID visible to find_pid_ns. */
		idr_replace(&upid->ns->idr, pid, upid->nr);
		upid->ns->pid_allocated++;
		upid--;
	}
	spin_unlock(&pidmap_lock);
	idr_preload_end();

	return pid;

out_unlock:
	spin_unlock(&pidmap_lock);
	idr_preload_end();
	put_pid_ns(ns);

out_free:
	/*
	 * Levels above the one that failed (i + 1 .. ns->level) already hold
	 * a reserved number; release them.
	 */
	spin_lock(&pidmap_lock);
	for (i++; i <= ns->level; i++) {
		upid = &pid->numbers[i];
		idr_remove(&upid->ns->idr, upid->nr);
	}

	/* On failure to allocate the first pid, reset the state */
	if (ns->pid_allocated == PIDNS_ADDING)
		idr_set_cursor(&ns->idr, 0);

	spin_unlock(&pidmap_lock);

	kmem_cache_free(ns->pid_cachep, pid);
	return ERR_PTR(retval);
}
30892476d7fSEric W. Biederman 
disable_pid_allocation(struct pid_namespace * ns)309c876ad76SEric W. Biederman void disable_pid_allocation(struct pid_namespace *ns)
310c876ad76SEric W. Biederman {
311*627454c0SMateusz Guzik 	spin_lock(&pidmap_lock);
312e8cfbc24SGargi Sharma 	ns->pid_allocated &= ~PIDNS_ADDING;
313*627454c0SMateusz Guzik 	spin_unlock(&pidmap_lock);
314c876ad76SEric W. Biederman }
315c876ad76SEric W. Biederman 
find_pid_ns(int nr,struct pid_namespace * ns)3167ad5b3a5SHarvey Harrison struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
3171da177e4SLinus Torvalds {
318e8cfbc24SGargi Sharma 	return idr_find(&ns->idr, nr);
3191da177e4SLinus Torvalds }
320198fe21bSPavel Emelyanov EXPORT_SYMBOL_GPL(find_pid_ns);
3211da177e4SLinus Torvalds 
find_vpid(int nr)3228990571eSPavel Emelyanov struct pid *find_vpid(int nr)
3238990571eSPavel Emelyanov {
32417cf22c3SEric W. Biederman 	return find_pid_ns(nr, task_active_pid_ns(current));
3258990571eSPavel Emelyanov }
3268990571eSPavel Emelyanov EXPORT_SYMBOL_GPL(find_vpid);
3278990571eSPavel Emelyanov 
task_pid_ptr(struct task_struct * task,enum pid_type type)3282c470475SEric W. Biederman static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
3292c470475SEric W. Biederman {
3302c470475SEric W. Biederman 	return (type == PIDTYPE_PID) ?
3312c470475SEric W. Biederman 		&task->thread_pid :
3322c470475SEric W. Biederman 		&task->signal->pids[type];
3332c470475SEric W. Biederman }
3342c470475SEric W. Biederman 
335e713d0daSSukadev Bhattiprolu /*
336e713d0daSSukadev Bhattiprolu  * attach_pid() must be called with the tasklist_lock write-held.
337e713d0daSSukadev Bhattiprolu  */
attach_pid(struct task_struct * task,enum pid_type type)33881907739SOleg Nesterov void attach_pid(struct task_struct *task, enum pid_type type)
3391da177e4SLinus Torvalds {
34074198dc2SMateusz Guzik 	struct pid *pid;
34174198dc2SMateusz Guzik 
34274198dc2SMateusz Guzik 	lockdep_assert_held_write(&tasklist_lock);
34374198dc2SMateusz Guzik 
34474198dc2SMateusz Guzik 	pid = *task_pid_ptr(task, type);
3452c470475SEric W. Biederman 	hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
3461da177e4SLinus Torvalds }
3471da177e4SLinus Torvalds 
__change_pid(struct pid ** pids,struct task_struct * task,enum pid_type type,struct pid * new)3487903f907SMateusz Guzik static void __change_pid(struct pid **pids, struct task_struct *task,
3497903f907SMateusz Guzik 			 enum pid_type type, struct pid *new)
3501da177e4SLinus Torvalds {
35174198dc2SMateusz Guzik 	struct pid **pid_ptr, *pid;
35292476d7fSEric W. Biederman 	int tmp;
3531da177e4SLinus Torvalds 
35474198dc2SMateusz Guzik 	lockdep_assert_held_write(&tasklist_lock);
35574198dc2SMateusz Guzik 
35674198dc2SMateusz Guzik 	pid_ptr = task_pid_ptr(task, type);
3572c470475SEric W. Biederman 	pid = *pid_ptr;
35892476d7fSEric W. Biederman 
3592c470475SEric W. Biederman 	hlist_del_rcu(&task->pid_links[type]);
3602c470475SEric W. Biederman 	*pid_ptr = new;
3611da177e4SLinus Torvalds 
36243f0df54SOleg Nesterov 	if (type == PIDTYPE_PID) {
36343f0df54SOleg Nesterov 		WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID));
36443f0df54SOleg Nesterov 		wake_up_all(&pid->wait_pidfd);
36543f0df54SOleg Nesterov 	}
36643f0df54SOleg Nesterov 
3671da177e4SLinus Torvalds 	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
3681d416a11SChristian Brauner 		if (pid_has_task(pid, tmp))
3691da177e4SLinus Torvalds 			return;
3701da177e4SLinus Torvalds 
3717903f907SMateusz Guzik 	WARN_ON(pids[type]);
3727903f907SMateusz Guzik 	pids[type] = pid;
3731da177e4SLinus Torvalds }
3741da177e4SLinus Torvalds 
detach_pid(struct pid ** pids,struct task_struct * task,enum pid_type type)3757903f907SMateusz Guzik void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type type)
37624336eaeSOleg Nesterov {
3777903f907SMateusz Guzik 	__change_pid(pids, task, type, NULL);
37824336eaeSOleg Nesterov }
37924336eaeSOleg Nesterov 
/*
 * change_pid - move @task from its current pid of @type to @pid
 *
 * The task is first unlinked from the old pid (which may be recorded in
 * @pids[@type], see __change_pid()) and then attached to @pid.
 */
void change_pid(struct pid **pids, struct task_struct *task, enum pid_type type,
		struct pid *pid)
{
	/* Step 1: unlink from the old pid and install the new one. */
	__change_pid(pids, task, type, pid);

	/* Step 2: link the task onto the new pid's list. */
	attach_pid(task, type);
}
38624336eaeSOleg Nesterov 
exchange_tids(struct task_struct * left,struct task_struct * right)3876b03d130SEric W. Biederman void exchange_tids(struct task_struct *left, struct task_struct *right)
3886b03d130SEric W. Biederman {
3896b03d130SEric W. Biederman 	struct pid *pid1 = left->thread_pid;
3906b03d130SEric W. Biederman 	struct pid *pid2 = right->thread_pid;
3916b03d130SEric W. Biederman 	struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID];
3926b03d130SEric W. Biederman 	struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID];
3936b03d130SEric W. Biederman 
39474198dc2SMateusz Guzik 	lockdep_assert_held_write(&tasklist_lock);
39574198dc2SMateusz Guzik 
3966b03d130SEric W. Biederman 	/* Swap the single entry tid lists */
3976b03d130SEric W. Biederman 	hlists_swap_heads_rcu(head1, head2);
3986b03d130SEric W. Biederman 
3996b03d130SEric W. Biederman 	/* Swap the per task_struct pid */
4006b03d130SEric W. Biederman 	rcu_assign_pointer(left->thread_pid, pid2);
4016b03d130SEric W. Biederman 	rcu_assign_pointer(right->thread_pid, pid1);
4026b03d130SEric W. Biederman 
4036b03d130SEric W. Biederman 	/* Swap the cached value */
4046b03d130SEric W. Biederman 	WRITE_ONCE(left->pid, pid_nr(pid2));
4056b03d130SEric W. Biederman 	WRITE_ONCE(right->pid, pid_nr(pid1));
4066b03d130SEric W. Biederman }
4076b03d130SEric W. Biederman 
408c18258c6SEric W. Biederman /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
transfer_pid(struct task_struct * old,struct task_struct * new,enum pid_type type)4097ad5b3a5SHarvey Harrison void transfer_pid(struct task_struct *old, struct task_struct *new,
410c18258c6SEric W. Biederman 			   enum pid_type type)
411c18258c6SEric W. Biederman {
412a1c6d543SOleg Nesterov 	WARN_ON_ONCE(type == PIDTYPE_PID);
41374198dc2SMateusz Guzik 	lockdep_assert_held_write(&tasklist_lock);
4142c470475SEric W. Biederman 	hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
415c18258c6SEric W. Biederman }
416c18258c6SEric W. Biederman 
pid_task(struct pid * pid,enum pid_type type)4177ad5b3a5SHarvey Harrison struct task_struct *pid_task(struct pid *pid, enum pid_type type)
41892476d7fSEric W. Biederman {
41992476d7fSEric W. Biederman 	struct task_struct *result = NULL;
42092476d7fSEric W. Biederman 	if (pid) {
42192476d7fSEric W. Biederman 		struct hlist_node *first;
42267bdbffdSArnd Bergmann 		first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
423db1466b3SPaul E. McKenney 					      lockdep_tasklist_lock_is_held());
42492476d7fSEric W. Biederman 		if (first)
4252c470475SEric W. Biederman 			result = hlist_entry(first, struct task_struct, pid_links[(type)]);
42692476d7fSEric W. Biederman 	}
42792476d7fSEric W. Biederman 	return result;
42892476d7fSEric W. Biederman }
429eccba068SPavel Emelyanov EXPORT_SYMBOL(pid_task);
43092476d7fSEric W. Biederman 
43192476d7fSEric W. Biederman /*
4329728e5d6STetsuo Handa  * Must be called under rcu_read_lock().
43392476d7fSEric W. Biederman  */
find_task_by_pid_ns(pid_t nr,struct pid_namespace * ns)43417f98dcfSChristoph Hellwig struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
4351da177e4SLinus Torvalds {
436f78f5b90SPaul E. McKenney 	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
437f78f5b90SPaul E. McKenney 			 "find_task_by_pid_ns() needs rcu_read_lock() protection");
43817f98dcfSChristoph Hellwig 	return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
4391da177e4SLinus Torvalds }
4401da177e4SLinus Torvalds 
/* find_task_by_pid_ns() in the active pid namespace of current. */
struct task_struct *find_task_by_vpid(pid_t vnr)
{
	struct pid_namespace *ns = task_active_pid_ns(current);

	return find_task_by_pid_ns(vnr, ns);
}
445228ebcbeSPavel Emelyanov 
/*
 * find_get_task_by_vpid - look up a task by @nr and take a reference on it
 *
 * The lookup and get_task_struct() happen inside one RCU read-side
 * section.  Returns NULL if no task is found.
 */
struct task_struct *find_get_task_by_vpid(pid_t nr)
{
	struct task_struct *found;

	rcu_read_lock();
	found = find_task_by_vpid(nr);
	if (found != NULL)
		get_task_struct(found);
	rcu_read_unlock();

	return found;
}
4582ee08260SMike Rapoport 
/*
 * get_task_pid - return @task's pid of @type after passing it to get_pid()
 *
 * The pid pointer is read and handed to get_pid() under rcu_read_lock().
 */
struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
	struct pid **slot = task_pid_ptr(task, type);
	struct pid *pid;

	rcu_read_lock();
	pid = get_pid(rcu_dereference(*slot));
	rcu_read_unlock();

	return pid;
}
EXPORT_SYMBOL_GPL(get_task_pid);
4681a657f78SOleg Nesterov 
get_pid_task(struct pid * pid,enum pid_type type)4697ad5b3a5SHarvey Harrison struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
47092476d7fSEric W. Biederman {
47192476d7fSEric W. Biederman 	struct task_struct *result;
47292476d7fSEric W. Biederman 	rcu_read_lock();
47392476d7fSEric W. Biederman 	result = pid_task(pid, type);
47492476d7fSEric W. Biederman 	if (result)
47592476d7fSEric W. Biederman 		get_task_struct(result);
47692476d7fSEric W. Biederman 	rcu_read_unlock();
47792476d7fSEric W. Biederman 	return result;
47892476d7fSEric W. Biederman }
47977c100c8SRik van Riel EXPORT_SYMBOL_GPL(get_pid_task);
48092476d7fSEric W. Biederman 
/*
 * find_get_pid - find_vpid() and pass the result to get_pid(), all under
 * rcu_read_lock().
 */
struct pid *find_get_pid(pid_t nr)
{
	struct pid *found;

	rcu_read_lock();
	found = find_vpid(nr);
	found = get_pid(found);
	rcu_read_unlock();

	return found;
}
EXPORT_SYMBOL_GPL(find_get_pid);
49292476d7fSEric W. Biederman 
pid_nr_ns(struct pid * pid,struct pid_namespace * ns)4937af57294SPavel Emelyanov pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
4947af57294SPavel Emelyanov {
4957af57294SPavel Emelyanov 	struct upid *upid;
4967af57294SPavel Emelyanov 	pid_t nr = 0;
4977af57294SPavel Emelyanov 
4987af57294SPavel Emelyanov 	if (pid && ns->level <= pid->level) {
4997af57294SPavel Emelyanov 		upid = &pid->numbers[ns->level];
5007af57294SPavel Emelyanov 		if (upid->ns == ns)
5017af57294SPavel Emelyanov 			nr = upid->nr;
5027af57294SPavel Emelyanov 	}
5037af57294SPavel Emelyanov 	return nr;
5047af57294SPavel Emelyanov }
5054f82f457SEric W. Biederman EXPORT_SYMBOL_GPL(pid_nr_ns);
5067af57294SPavel Emelyanov 
pid_vnr(struct pid * pid)50744c4e1b2SEric W. Biederman pid_t pid_vnr(struct pid *pid)
50844c4e1b2SEric W. Biederman {
50917cf22c3SEric W. Biederman 	return pid_nr_ns(pid, task_active_pid_ns(current));
51044c4e1b2SEric W. Biederman }
51144c4e1b2SEric W. Biederman EXPORT_SYMBOL_GPL(pid_vnr);
51244c4e1b2SEric W. Biederman 
__task_pid_nr_ns(struct task_struct * task,enum pid_type type,struct pid_namespace * ns)51352ee2dfdSOleg Nesterov pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
51452ee2dfdSOleg Nesterov 			struct pid_namespace *ns)
5152f2a3a46SPavel Emelyanov {
51652ee2dfdSOleg Nesterov 	pid_t nr = 0;
51752ee2dfdSOleg Nesterov 
51852ee2dfdSOleg Nesterov 	rcu_read_lock();
51952ee2dfdSOleg Nesterov 	if (!ns)
52017cf22c3SEric W. Biederman 		ns = task_active_pid_ns(current);
5212c470475SEric W. Biederman 	nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
52252ee2dfdSOleg Nesterov 	rcu_read_unlock();
52352ee2dfdSOleg Nesterov 
52452ee2dfdSOleg Nesterov 	return nr;
52552ee2dfdSOleg Nesterov }
52652ee2dfdSOleg Nesterov EXPORT_SYMBOL(__task_pid_nr_ns);
5272f2a3a46SPavel Emelyanov 
/*
 * task_active_pid_ns - namespace of @tsk's pid, i.e. ns_of_pid() applied
 * to task_pid(@tsk).
 */
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
	struct pid *pid = task_pid(tsk);

	return ns_of_pid(pid);
}
EXPORT_SYMBOL_GPL(task_active_pid_ns);
53361bce0f1SEric W. Biederman 
5341da177e4SLinus Torvalds /*
535025dfdafSFrederik Schwarzer  * Used by proc to find the first pid that is greater than or equal to nr.
5360804ef4bSEric W. Biederman  *
537e49859e7SPavel Emelyanov  * If there is a pid at nr this function is exactly the same as find_pid_ns.
5380804ef4bSEric W. Biederman  */
find_ge_pid(int nr,struct pid_namespace * ns)539198fe21bSPavel Emelyanov struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
5400804ef4bSEric W. Biederman {
54195846ecfSGargi Sharma 	return idr_get_next(&ns->idr, &nr);
5420804ef4bSEric W. Biederman }
5434480c27cSAndreas Gruenbacher EXPORT_SYMBOL_GPL(find_ge_pid);
5440804ef4bSEric W. Biederman 
pidfd_get_pid(unsigned int fd,unsigned int * flags)5451aa92cd3SMinchan Kim struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
5461aa92cd3SMinchan Kim {
5476348be02SAl Viro 	CLASS(fd, f)(fd);
5481aa92cd3SMinchan Kim 	struct pid *pid;
5491aa92cd3SMinchan Kim 
5506348be02SAl Viro 	if (fd_empty(f))
5511aa92cd3SMinchan Kim 		return ERR_PTR(-EBADF);
5521aa92cd3SMinchan Kim 
5531da91ea8SAl Viro 	pid = pidfd_pid(fd_file(f));
5541aa92cd3SMinchan Kim 	if (!IS_ERR(pid)) {
5551aa92cd3SMinchan Kim 		get_pid(pid);
5561da91ea8SAl Viro 		*flags = fd_file(f)->f_flags;
5571aa92cd3SMinchan Kim 	}
5581aa92cd3SMinchan Kim 	return pid;
5591aa92cd3SMinchan Kim }
5601aa92cd3SMinchan Kim 
56132fcb426SChristian Brauner /**
562e9bdcdbfSChristian Brauner  * pidfd_get_task() - Get the task associated with a pidfd
563e9bdcdbfSChristian Brauner  *
564e9bdcdbfSChristian Brauner  * @pidfd: pidfd for which to get the task
565e9bdcdbfSChristian Brauner  * @flags: flags associated with this pidfd
566e9bdcdbfSChristian Brauner  *
567e9bdcdbfSChristian Brauner  * Return the task associated with @pidfd. The function takes a reference on
568e9bdcdbfSChristian Brauner  * the returned task. The caller is responsible for releasing that reference.
569e9bdcdbfSChristian Brauner  *
570e9bdcdbfSChristian Brauner  * Return: On success, the task_struct associated with the pidfd.
571e9bdcdbfSChristian Brauner  *	   On error, a negative errno number will be returned.
572e9bdcdbfSChristian Brauner  */
pidfd_get_task(int pidfd,unsigned int * flags)573e9bdcdbfSChristian Brauner struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
574e9bdcdbfSChristian Brauner {
575e9bdcdbfSChristian Brauner 	unsigned int f_flags = 0;
576e9bdcdbfSChristian Brauner 	struct pid *pid;
577e9bdcdbfSChristian Brauner 	struct task_struct *task;
578e9bdcdbfSChristian Brauner 	enum pid_type type;
579e9bdcdbfSChristian Brauner 
580e9bdcdbfSChristian Brauner 	switch (pidfd) {
581e9bdcdbfSChristian Brauner 	case  PIDFD_SELF_THREAD:
582e9bdcdbfSChristian Brauner 		type = PIDTYPE_PID;
583e9bdcdbfSChristian Brauner 		pid = get_task_pid(current, type);
584e9bdcdbfSChristian Brauner 		break;
585e9bdcdbfSChristian Brauner 	case  PIDFD_SELF_THREAD_GROUP:
586e9bdcdbfSChristian Brauner 		type = PIDTYPE_TGID;
587e9bdcdbfSChristian Brauner 		pid = get_task_pid(current, type);
588e9bdcdbfSChristian Brauner 		break;
589e9bdcdbfSChristian Brauner 	default:
590e9bdcdbfSChristian Brauner 		pid = pidfd_get_pid(pidfd, &f_flags);
591e9bdcdbfSChristian Brauner 		if (IS_ERR(pid))
592e9bdcdbfSChristian Brauner 			return ERR_CAST(pid);
59332fcb426SChristian Brauner 		type = PIDTYPE_TGID;
59432fcb426SChristian Brauner 		break;
59532fcb426SChristian Brauner 	}
5966da73d15SChristian Brauner 
59732fcb426SChristian Brauner 	task = get_pid_task(pid, type);
59832fcb426SChristian Brauner 	put_pid(pid);
59932fcb426SChristian Brauner 	if (!task)
60032fcb426SChristian Brauner 		return ERR_PTR(-ESRCH);
60132fcb426SChristian Brauner 
60232fcb426SChristian Brauner 	*flags = f_flags;
603c576e0fcSMatthew Bobrowski 	return task;
604c576e0fcSMatthew Bobrowski }
60532fcb426SChristian Brauner 
/**
 * pidfd_create() - Create a new pid file descriptor.
 *
 * @pid:   struct pid that the new pidfd will reference
 * @flags: flags handed through to pidfd_prepare()
 *
 * Creates a new pid file descriptor with the O_CLOEXEC flag set and
 * installs it for the caller.
 *
 * Note that this function may only be called after the fd table has been
 * unshared, otherwise the pidfd could leak into the new process.
 *
 * This symbol should not be explicitly exported to loadable modules.
 *
 * Return: On success, a cloexec pidfd is returned.
 *         On error, a negative errno number will be returned.
 */
static int pidfd_create(struct pid *pid, unsigned int flags)
{
	struct file *file;
	int fd;

	fd = pidfd_prepare(pid, flags, &file);
	/* Only install the file when a descriptor was actually handed out. */
	if (fd >= 0)
		fd_install(fd, file);

	return fd;
}
63432fcb426SChristian Brauner 
63532fcb426SChristian Brauner /**
6361e1d0f0bSChristian Brauner  * sys_pidfd_open() - Open new pid file descriptor.
63732fcb426SChristian Brauner  *
63832fcb426SChristian Brauner  * @pid:   pid for which to retrieve a pidfd
63964bef697SOleg Nesterov  * @flags: flags to pass
64032fcb426SChristian Brauner  *
64132fcb426SChristian Brauner  * This creates a new pid file descriptor with the O_CLOEXEC flag set for
64232fcb426SChristian Brauner  * the task identified by @pid. Without PIDFD_THREAD flag the target task
64332fcb426SChristian Brauner  * must be a thread-group leader.
64432fcb426SChristian Brauner  *
64532fcb426SChristian Brauner  * Return: On success, a cloexec pidfd is returned.
64632fcb426SChristian Brauner  *         On error, a negative errno number will be returned.
64732fcb426SChristian Brauner  */
SYSCALL_DEFINE2(pidfd_open,pid_t,pid,unsigned int,flags)64832fcb426SChristian Brauner SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
6496da73d15SChristian Brauner {
65032fcb426SChristian Brauner 	int fd;
65132fcb426SChristian Brauner 	struct pid *p;
65232fcb426SChristian Brauner 
65332fcb426SChristian Brauner 	if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD))
65432fcb426SChristian Brauner 		return -EINVAL;
6557863dcc7SChristian Brauner 
6567863dcc7SChristian Brauner 	if (pid <= 0)
6577863dcc7SChristian Brauner 		return -EINVAL;
6587863dcc7SChristian Brauner 
6597863dcc7SChristian Brauner 	p = find_get_pid(pid);
6607863dcc7SChristian Brauner 	if (!p)
6617863dcc7SChristian Brauner 		return -ESRCH;
6627863dcc7SChristian Brauner 
6637863dcc7SChristian Brauner 	fd = pidfd_create(p, flags);
6647863dcc7SChristian Brauner 
6657863dcc7SChristian Brauner 	put_pid(p);
6667863dcc7SChristian Brauner 	return fd;
6677863dcc7SChristian Brauner }
6687863dcc7SChristian Brauner 
6697863dcc7SChristian Brauner #ifdef CONFIG_SYSCTL
pid_table_root_lookup(struct ctl_table_root * root)6707863dcc7SChristian Brauner static struct ctl_table_set *pid_table_root_lookup(struct ctl_table_root *root)
6717863dcc7SChristian Brauner {
6727863dcc7SChristian Brauner 	return &task_active_pid_ns(current)->set;
6737863dcc7SChristian Brauner }
6747863dcc7SChristian Brauner 
set_is_seen(struct ctl_table_set * set)6757863dcc7SChristian Brauner static int set_is_seen(struct ctl_table_set *set)
6767863dcc7SChristian Brauner {
6777863dcc7SChristian Brauner 	return &task_active_pid_ns(current)->set == set;
6787863dcc7SChristian Brauner }
6797863dcc7SChristian Brauner 
pid_table_root_permissions(struct ctl_table_header * head,const struct ctl_table * table)6807863dcc7SChristian Brauner static int pid_table_root_permissions(struct ctl_table_header *head,
6817863dcc7SChristian Brauner 				      const struct ctl_table *table)
6827863dcc7SChristian Brauner {
6837863dcc7SChristian Brauner 	struct pid_namespace *pidns =
6847863dcc7SChristian Brauner 		container_of(head->set, struct pid_namespace, set);
6857863dcc7SChristian Brauner 	int mode = table->mode;
6867863dcc7SChristian Brauner 
6877863dcc7SChristian Brauner 	if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) ||
6887863dcc7SChristian Brauner 	    uid_eq(current_euid(), make_kuid(pidns->user_ns, 0)))
6897863dcc7SChristian Brauner 		mode = (mode & S_IRWXU) >> 6;
6907863dcc7SChristian Brauner 	else if (in_egroup_p(make_kgid(pidns->user_ns, 0)))
6917863dcc7SChristian Brauner 		mode = (mode & S_IRWXG) >> 3;
6927863dcc7SChristian Brauner 	else
6937863dcc7SChristian Brauner 		mode = mode & S_IROTH;
6947863dcc7SChristian Brauner 	return (mode << 6) | (mode << 3) | mode;
6957863dcc7SChristian Brauner }
6967863dcc7SChristian Brauner 
/*
 * ->set_ownership() hook of pid_table_root.
 *
 * Map uid 0 and gid 0 of the user namespace owning the pid namespace behind
 * @head. Each of *@uid / *@gid is overwritten only when the corresponding
 * mapping is valid; otherwise the caller's value is left untouched.
 */
static void pid_table_root_set_ownership(struct ctl_table_header *head,
					 kuid_t *uid, kgid_t *gid)
{
	struct pid_namespace *ns =
		container_of(head->set, struct pid_namespace, set);
	kuid_t root_uid = make_kuid(ns->user_ns, 0);
	kgid_t root_gid = make_kgid(ns->user_ns, 0);

	if (uid_valid(root_uid))
		*uid = root_uid;

	if (gid_valid(root_gid))
		*gid = root_gid;
}
7137863dcc7SChristian Brauner 
7147863dcc7SChristian Brauner static struct ctl_table_root pid_table_root = {
7157863dcc7SChristian Brauner 	.lookup		= pid_table_root_lookup,
7167863dcc7SChristian Brauner 	.permissions	= pid_table_root_permissions,
7177863dcc7SChristian Brauner 	.set_ownership	= pid_table_root_set_ownership,
7187863dcc7SChristian Brauner };
7197863dcc7SChristian Brauner 
7207863dcc7SChristian Brauner static const struct ctl_table pid_table[] = {
7217863dcc7SChristian Brauner 	{
7227863dcc7SChristian Brauner 		.procname	= "pid_max",
7237863dcc7SChristian Brauner 		.data		= &init_pid_ns.pid_max,
7247863dcc7SChristian Brauner 		.maxlen		= sizeof(int),
7257863dcc7SChristian Brauner 		.mode		= 0644,
7267863dcc7SChristian Brauner 		.proc_handler	= proc_dointvec_minmax,
7277863dcc7SChristian Brauner 		.extra1		= &pid_max_min,
7287863dcc7SChristian Brauner 		.extra2		= &pid_max_max,
7297863dcc7SChristian Brauner 	},
7307863dcc7SChristian Brauner };
7317863dcc7SChristian Brauner #endif
7327863dcc7SChristian Brauner 
/*
 * Register the pid sysctl table for @pidns under "kernel".
 *
 * A private copy of pid_table is made so that its ->data can point at
 * @pidns->pid_max. Before registration, @pidns->pid_max is raised to at
 * least PIDS_PER_CPU_DEFAULT * num_possible_cpus() and capped at
 * pid_max_max.
 *
 * Returns 0 on success (always, when CONFIG_SYSCTL is disabled) or -ENOMEM
 * if the table could not be duplicated or registered.
 */
int register_pidns_sysctls(struct pid_namespace *pidns)
{
#ifdef CONFIG_SYSCTL
	struct ctl_table *copy;

	setup_sysctl_set(&pidns->set, &pid_table_root, set_is_seen);

	copy = kmemdup(pid_table, sizeof(pid_table), GFP_KERNEL);
	if (!copy)
		return -ENOMEM;

	pidns->pid_max = min(pid_max_max, max_t(int, pidns->pid_max,
			     PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
	copy->data = &pidns->pid_max;

	pidns->sysctls = __register_sysctl_table(&pidns->set, "kernel", copy,
						 ARRAY_SIZE(pid_table));
	if (pidns->sysctls)
		return 0;

	/* Registration failed: undo the copy and the set setup. */
	kfree(copy);
	retire_sysctl_set(&pidns->set);
	return -ENOMEM;
#else
	return 0;
#endif
}
7571da177e4SLinus Torvalds 
/*
 * Tear down what register_pidns_sysctls() set up for @pidns: unregister the
 * table, retire the sysctl set and free the duplicated table. A no-op when
 * CONFIG_SYSCTL is disabled.
 */
void unregister_pidns_sysctls(struct pid_namespace *pidns)
{
#ifdef CONFIG_SYSCTL
	/*
	 * Fetch the table pointer before unregistering.
	 * NOTE(review): presumably the header must not be dereferenced once
	 * unregister_sysctl_table() has run - confirm.
	 */
	const struct ctl_table *table = pidns->sysctls->ctl_table_arg;

	unregister_sysctl_table(pidns->sysctls);
	retire_sysctl_set(&pidns->set);
	kfree(table);
#endif
}
76992476d7fSEric W. Biederman 
pid_idr_init(void)770b69f0aebSKees Cook void __init pid_idr_init(void)
771dd546618SChristian Brauner {
772b69f0aebSKees Cook 	/* Verify no one has done anything silly: */
773b69f0aebSKees Cook 	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
774b69f0aebSKees Cook 
7751da177e4SLinus Torvalds 	/* bump default and minimum pid_max based on number of cpus */
7768649c322SSargun Dhillon 	init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max,
7777863dcc7SChristian Brauner 				  PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
7787863dcc7SChristian Brauner 	pid_max_min = max_t(int, pid_max_min,
7797863dcc7SChristian Brauner 				PIDS_PER_CPU_MIN * num_possible_cpus());
7807863dcc7SChristian Brauner 	pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min);
7817863dcc7SChristian Brauner 
7827863dcc7SChristian Brauner 	idr_init(&init_pid_ns.idr);
7837863dcc7SChristian Brauner 
7847863dcc7SChristian Brauner 	init_pid_ns.pid_cachep = kmem_cache_create("pid",
7857863dcc7SChristian Brauner 			struct_size_t(struct pid, numbers, 1),
7867863dcc7SChristian Brauner 			__alignof__(struct pid),
7878649c322SSargun Dhillon 			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
7888649c322SSargun Dhillon 			NULL);
7898649c322SSargun Dhillon }
7908649c322SSargun Dhillon 
pid_namespace_sysctl_init(void)7918649c322SSargun Dhillon static __init int pid_namespace_sysctl_init(void)
792f7cfd871SEric W. Biederman {
7938649c322SSargun Dhillon #ifdef CONFIG_SYSCTL
7948649c322SSargun Dhillon 	/* "kernel" directory will have already been initialized. */
7958649c322SSargun Dhillon 	BUG_ON(register_pidns_sysctls(&init_pid_ns));
7968649c322SSargun Dhillon #endif
7978649c322SSargun Dhillon 	return 0;
7988649c322SSargun Dhillon }
7998649c322SSargun Dhillon subsys_initcall(pid_namespace_sysctl_init);
8008649c322SSargun Dhillon 
__pidfd_fget(struct task_struct * task,int fd)801f7cfd871SEric W. Biederman static struct file *__pidfd_fget(struct task_struct *task, int fd)
8028649c322SSargun Dhillon {
8030c9bd6bcSTycho Andersen 	struct file *file;
8040c9bd6bcSTycho Andersen 	int ret;
8050c9bd6bcSTycho Andersen 
8060c9bd6bcSTycho Andersen 	ret = down_read_killable(&task->signal->exec_update_lock);
8070c9bd6bcSTycho Andersen 	if (ret)
8080c9bd6bcSTycho Andersen 		return ERR_PTR(ret);
8090c9bd6bcSTycho Andersen 
8100c9bd6bcSTycho Andersen 	if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS))
8110c9bd6bcSTycho Andersen 		file = fget_task(task, fd);
8120c9bd6bcSTycho Andersen 	else
8130c9bd6bcSTycho Andersen 		file = ERR_PTR(-EPERM);
8140c9bd6bcSTycho Andersen 
8150c9bd6bcSTycho Andersen 	up_read(&task->signal->exec_update_lock);
8160c9bd6bcSTycho Andersen 
8170c9bd6bcSTycho Andersen 	if (!file) {
8180c9bd6bcSTycho Andersen 		/*
8190c9bd6bcSTycho Andersen 		 * It is possible that the target thread is exiting; it can be
8200c9bd6bcSTycho Andersen 		 * either:
8210c9bd6bcSTycho Andersen 		 * 1. before exit_signals(), which gives a real fd
8220c9bd6bcSTycho Andersen 		 * 2. before exit_files() takes the task_lock() gives a real fd
8238649c322SSargun Dhillon 		 * 3. after exit_files() releases task_lock(), ->files is NULL;
8248649c322SSargun Dhillon 		 *    this has PF_EXITING, since it was set in exit_signals(),
8258649c322SSargun Dhillon 		 *    __pidfd_fget() returns EBADF.
8268649c322SSargun Dhillon 		 * In case 3 we get EBADF, but that really means ESRCH, since
8278649c322SSargun Dhillon 		 * the task is currently exiting and has freed its files
8288649c322SSargun Dhillon 		 * struct, so we fix it up.
8298649c322SSargun Dhillon 		 */
8308649c322SSargun Dhillon 		if (task->flags & PF_EXITING)
8318649c322SSargun Dhillon 			file = ERR_PTR(-ESRCH);
8328649c322SSargun Dhillon 		else
8338649c322SSargun Dhillon 			file = ERR_PTR(-EBADF);
8348649c322SSargun Dhillon 	}
8358649c322SSargun Dhillon 
8368649c322SSargun Dhillon 	return file;
8378649c322SSargun Dhillon }
8388649c322SSargun Dhillon 
pidfd_getfd(struct pid * pid,int fd)8398649c322SSargun Dhillon static int pidfd_getfd(struct pid *pid, int fd)
8404e94ddfeSChristian Brauner {
8418649c322SSargun Dhillon 	struct task_struct *task;
8428649c322SSargun Dhillon 	struct file *file;
8438649c322SSargun Dhillon 	int ret;
8448649c322SSargun Dhillon 
8458649c322SSargun Dhillon 	task = get_pid_task(pid, PIDTYPE_PID);
8468649c322SSargun Dhillon 	if (!task)
8478649c322SSargun Dhillon 		return -ESRCH;
8488649c322SSargun Dhillon 
8498649c322SSargun Dhillon 	file = __pidfd_fget(task, fd);
8508649c322SSargun Dhillon 	put_task_struct(task);
8518649c322SSargun Dhillon 	if (IS_ERR(file))
8528649c322SSargun Dhillon 		return PTR_ERR(file);
8538649c322SSargun Dhillon 
8548649c322SSargun Dhillon 	ret = receive_fd(file, NULL, O_CLOEXEC);
8558649c322SSargun Dhillon 	fput(file);
8568649c322SSargun Dhillon 
8578649c322SSargun Dhillon 	return ret;
8588649c322SSargun Dhillon }
8598649c322SSargun Dhillon 
8608649c322SSargun Dhillon /**
8618649c322SSargun Dhillon  * sys_pidfd_getfd() - Get a file descriptor from another process
8628649c322SSargun Dhillon  *
8638649c322SSargun Dhillon  * @pidfd:	the pidfd file descriptor of the process
8648649c322SSargun Dhillon  * @fd:		the file descriptor number to get
8658649c322SSargun Dhillon  * @flags:	flags on how to get the fd (reserved)
8668649c322SSargun Dhillon  *
8678649c322SSargun Dhillon  * This syscall gets a copy of a file descriptor from another process
8688649c322SSargun Dhillon  * based on the pidfd, and file descriptor number. It requires that
8698649c322SSargun Dhillon  * the calling process has the ability to ptrace the process represented
8708649c322SSargun Dhillon  * by the pidfd. The process which is having its file descriptor copied
8718152f820SAl Viro  * is otherwise unaffected.
8728152f820SAl Viro  *
8738649c322SSargun Dhillon  * Return: On success, a cloexec file descriptor is returned.
8748649c322SSargun Dhillon  *         On error, a negative errno number will be returned.
8751da91ea8SAl Viro  */
SYSCALL_DEFINE3(pidfd_getfd,int,pidfd,int,fd,unsigned int,flags)8768649c322SSargun Dhillon SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
8778152f820SAl Viro 		unsigned int, flags)
8788649c322SSargun Dhillon {
8798152f820SAl Viro 	struct pid *pid;
8808649c322SSargun Dhillon 
881 	/* flags is currently unused - make sure it's unset */
882 	if (flags)
883 		return -EINVAL;
884 
885 	CLASS(fd, f)(pidfd);
886 	if (fd_empty(f))
887 		return -EBADF;
888 
889 	pid = pidfd_pid(fd_file(f));
890 	if (IS_ERR(pid))
891 		return PTR_ERR(pid);
892 
893 	return pidfd_getfd(pid, fd);
894 }
895