xref: /linux-6.15/ipc/mqueue.c (revision 3f07c014)
1 /*
2  * POSIX message queues filesystem for Linux.
3  *
4  * Copyright (C) 2003,2004  Krzysztof Benedyczak    ([email protected])
5  *                          Michal Wronski          ([email protected])
6  *
7  * Spinlocks:               Mohamed Abbas           ([email protected])
8  * Lockless receive & send, fd based notify:
9  *			    Manfred Spraul	    ([email protected])
10  *
11  * Audit:                   George Wilson           ([email protected])
12  *
13  * This file is released under the GPL.
14  */
15 
16 #include <linux/capability.h>
17 #include <linux/init.h>
18 #include <linux/pagemap.h>
19 #include <linux/file.h>
20 #include <linux/mount.h>
21 #include <linux/namei.h>
22 #include <linux/sysctl.h>
23 #include <linux/poll.h>
24 #include <linux/mqueue.h>
25 #include <linux/msg.h>
26 #include <linux/skbuff.h>
27 #include <linux/vmalloc.h>
28 #include <linux/netlink.h>
29 #include <linux/syscalls.h>
30 #include <linux/audit.h>
31 #include <linux/signal.h>
32 #include <linux/mutex.h>
33 #include <linux/nsproxy.h>
34 #include <linux/pid.h>
35 #include <linux/ipc_namespace.h>
36 #include <linux/user_namespace.h>
37 #include <linux/slab.h>
38 #include <linux/sched/wake_q.h>
39 #include <linux/sched/signal.h>
40 
41 #include <net/sock.h>
42 #include "util.h"
43 
/* Value stored in sb->s_magic by mqueue_fill_super(). */
44 #define MQUEUE_MAGIC	0x19800202
/* Amount added to / removed from the directory i_size per queue entry. */
45 #define DIRENT_SIZE	20
/* i_size of a queue file and size of the text buffer built on read(). */
46 #define FILENT_SIZE	80
47 
/* Indices into mqueue_inode_info.e_wait_q[]: blocked senders / receivers. */
48 #define SEND		0
49 #define RECV		1
50 
/* Values of ext_wait_queue.state; READY is set by the pipelined_*() helpers. */
51 #define STATE_NONE	0
52 #define STATE_READY	1
53 
/*
 * One node of mqueue_inode_info.msg_tree.  msg_insert() keys the tree on
 * the message priority and appends each message to the tail of msg_list,
 * so a node groups all queued messages of one priority in arrival order.
 */
54 struct posix_msg_tree_node {
55 	struct rb_node		rb_node;
56 	struct list_head	msg_list;
57 	int			priority;
58 };
59 
/*
 * Record describing one task blocked on a queue.  It is linked into
 * mqueue_inode_info.e_wait_q[SEND] or e_wait_q[RECV] by wq_add(); the
 * pipelined_*() helpers unlink it and flip state to STATE_READY.
 */
60 struct ext_wait_queue {		/* queue of sleeping tasks */
61 	struct task_struct *task;
62 	struct list_head list;
63 	struct msg_msg *msg;	/* ptr of loaded message */
64 	int state;		/* one of STATE_* values */
65 };
66 
/*
 * Per-queue state.  The VFS inode is embedded, so MQUEUE_I() can recover
 * this structure from an inode pointer with container_of().
 */
67 struct mqueue_inode_info {
	/* taken around accesses to the queue contents and notify state */
68 	spinlock_t lock;
69 	struct inode vfs_inode;
	/* wait queue handed to poll_wait() and woken when the queue changes */
70 	wait_queue_head_t wait_q;
71 
	/* messages, grouped per priority in posix_msg_tree_node entries */
72 	struct rb_root msg_tree;
	/* at most one spare tree node, reused by msg_insert() */
73 	struct posix_msg_tree_node *node_cache;
74 	struct mq_attr attr;
75 
	/* notification registration; notify_owner == NULL means none */
76 	struct sigevent notify;
77 	struct pid *notify_owner;
78 	struct user_namespace *notify_user_ns;
79 	struct user_struct *user;	/* user who created, for accounting */
	/* socket and cookie skb passed to netlink_sendskb() for SIGEV_THREAD */
80 	struct sock *notify_sock;
81 	struct sk_buff *notify_cookie;
82 
83 	/* for tasks waiting for free space and messages, respectively */
84 	struct ext_wait_queue e_wait_q[2];
85 
86 	unsigned long qsize; /* size of queue in memory (sum of all msgs) */
87 };
88 
/*
 * Forward declarations of objects referenced before their definitions
 * (the definitions are expected further down in this file).
 */
89 static const struct inode_operations mqueue_dir_inode_operations;
90 static const struct file_operations mqueue_file_operations;
91 static const struct super_operations mqueue_super_ops;
92 static void remove_notification(struct mqueue_inode_info *info);
93 
/* Slab cache from which mqueue_alloc_inode() allocates mqueue_inode_info. */
94 static struct kmem_cache *mqueue_inode_cachep;
95 
/* NOTE(review): presumably the registered sysctl table handle - not used in the code visible here. */
96 static struct ctl_table_header *mq_sysctl_table;
97 
98 static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
99 {
100 	return container_of(inode, struct mqueue_inode_info, vfs_inode);
101 }
102 
103 /*
104  * This routine should be called with the mq_lock held.
105  */
106 static inline struct ipc_namespace *__get_ns_from_inode(struct inode *inode)
107 {
108 	return get_ipc_ns(inode->i_sb->s_fs_info);
109 }
110 
111 static struct ipc_namespace *get_ns_from_inode(struct inode *inode)
112 {
113 	struct ipc_namespace *ns;
114 
115 	spin_lock(&mq_lock);
116 	ns = __get_ns_from_inode(inode);
117 	spin_unlock(&mq_lock);
118 	return ns;
119 }
120 
121 /* Auxiliary functions to manipulate messages' list */
122 static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info)
123 {
124 	struct rb_node **p, *parent = NULL;
125 	struct posix_msg_tree_node *leaf;
126 
127 	p = &info->msg_tree.rb_node;
128 	while (*p) {
129 		parent = *p;
130 		leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node);
131 
132 		if (likely(leaf->priority == msg->m_type))
133 			goto insert_msg;
134 		else if (msg->m_type < leaf->priority)
135 			p = &(*p)->rb_left;
136 		else
137 			p = &(*p)->rb_right;
138 	}
139 	if (info->node_cache) {
140 		leaf = info->node_cache;
141 		info->node_cache = NULL;
142 	} else {
143 		leaf = kmalloc(sizeof(*leaf), GFP_ATOMIC);
144 		if (!leaf)
145 			return -ENOMEM;
146 		INIT_LIST_HEAD(&leaf->msg_list);
147 	}
148 	leaf->priority = msg->m_type;
149 	rb_link_node(&leaf->rb_node, parent, p);
150 	rb_insert_color(&leaf->rb_node, &info->msg_tree);
151 insert_msg:
152 	info->attr.mq_curmsgs++;
153 	info->qsize += msg->m_ts;
154 	list_add_tail(&msg->m_list, &leaf->msg_list);
155 	return 0;
156 }
157 
158 static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
159 {
160 	struct rb_node **p, *parent = NULL;
161 	struct posix_msg_tree_node *leaf;
162 	struct msg_msg *msg;
163 
164 try_again:
165 	p = &info->msg_tree.rb_node;
166 	while (*p) {
167 		parent = *p;
168 		/*
169 		 * During insert, low priorities go to the left and high to the
170 		 * right.  On receive, we want the highest priorities first, so
171 		 * walk all the way to the right.
172 		 */
173 		p = &(*p)->rb_right;
174 	}
175 	if (!parent) {
176 		if (info->attr.mq_curmsgs) {
177 			pr_warn_once("Inconsistency in POSIX message queue, "
178 				     "no tree element, but supposedly messages "
179 				     "should exist!\n");
180 			info->attr.mq_curmsgs = 0;
181 		}
182 		return NULL;
183 	}
184 	leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node);
185 	if (unlikely(list_empty(&leaf->msg_list))) {
186 		pr_warn_once("Inconsistency in POSIX message queue, "
187 			     "empty leaf node but we haven't implemented "
188 			     "lazy leaf delete!\n");
189 		rb_erase(&leaf->rb_node, &info->msg_tree);
190 		if (info->node_cache) {
191 			kfree(leaf);
192 		} else {
193 			info->node_cache = leaf;
194 		}
195 		goto try_again;
196 	} else {
197 		msg = list_first_entry(&leaf->msg_list,
198 				       struct msg_msg, m_list);
199 		list_del(&msg->m_list);
200 		if (list_empty(&leaf->msg_list)) {
201 			rb_erase(&leaf->rb_node, &info->msg_tree);
202 			if (info->node_cache) {
203 				kfree(leaf);
204 			} else {
205 				info->node_cache = leaf;
206 			}
207 		}
208 	}
209 	info->attr.mq_curmsgs--;
210 	info->qsize -= msg->m_ts;
211 	return msg;
212 }
213 
214 static struct inode *mqueue_get_inode(struct super_block *sb,
215 		struct ipc_namespace *ipc_ns, umode_t mode,
216 		struct mq_attr *attr)
217 {
218 	struct user_struct *u = current_user();
219 	struct inode *inode;
220 	int ret = -ENOMEM;
221 
222 	inode = new_inode(sb);
223 	if (!inode)
224 		goto err;
225 
226 	inode->i_ino = get_next_ino();
227 	inode->i_mode = mode;
228 	inode->i_uid = current_fsuid();
229 	inode->i_gid = current_fsgid();
230 	inode->i_mtime = inode->i_ctime = inode->i_atime = current_time(inode);
231 
232 	if (S_ISREG(mode)) {
233 		struct mqueue_inode_info *info;
234 		unsigned long mq_bytes, mq_treesize;
235 
236 		inode->i_fop = &mqueue_file_operations;
237 		inode->i_size = FILENT_SIZE;
238 		/* mqueue specific info */
239 		info = MQUEUE_I(inode);
240 		spin_lock_init(&info->lock);
241 		init_waitqueue_head(&info->wait_q);
242 		INIT_LIST_HEAD(&info->e_wait_q[0].list);
243 		INIT_LIST_HEAD(&info->e_wait_q[1].list);
244 		info->notify_owner = NULL;
245 		info->notify_user_ns = NULL;
246 		info->qsize = 0;
247 		info->user = NULL;	/* set when all is ok */
248 		info->msg_tree = RB_ROOT;
249 		info->node_cache = NULL;
250 		memset(&info->attr, 0, sizeof(info->attr));
251 		info->attr.mq_maxmsg = min(ipc_ns->mq_msg_max,
252 					   ipc_ns->mq_msg_default);
253 		info->attr.mq_msgsize = min(ipc_ns->mq_msgsize_max,
254 					    ipc_ns->mq_msgsize_default);
255 		if (attr) {
256 			info->attr.mq_maxmsg = attr->mq_maxmsg;
257 			info->attr.mq_msgsize = attr->mq_msgsize;
258 		}
259 		/*
260 		 * We used to allocate a static array of pointers and account
261 		 * the size of that array as well as one msg_msg struct per
262 		 * possible message into the queue size. That's no longer
263 		 * accurate as the queue is now an rbtree and will grow and
264 		 * shrink depending on usage patterns.  We can, however, still
265 		 * account one msg_msg struct per message, but the nodes are
266 		 * allocated depending on priority usage, and most programs
267 		 * only use one, or a handful, of priorities.  However, since
268 		 * this is pinned memory, we need to assume worst case, so
269 		 * that means the min(mq_maxmsg, max_priorities) * struct
270 		 * posix_msg_tree_node.
271 		 */
272 		mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) +
273 			min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) *
274 			sizeof(struct posix_msg_tree_node);
275 
276 		mq_bytes = mq_treesize + (info->attr.mq_maxmsg *
277 					  info->attr.mq_msgsize);
278 
279 		spin_lock(&mq_lock);
280 		if (u->mq_bytes + mq_bytes < u->mq_bytes ||
281 		    u->mq_bytes + mq_bytes > rlimit(RLIMIT_MSGQUEUE)) {
282 			spin_unlock(&mq_lock);
283 			/* mqueue_evict_inode() releases info->messages */
284 			ret = -EMFILE;
285 			goto out_inode;
286 		}
287 		u->mq_bytes += mq_bytes;
288 		spin_unlock(&mq_lock);
289 
290 		/* all is ok */
291 		info->user = get_uid(u);
292 	} else if (S_ISDIR(mode)) {
293 		inc_nlink(inode);
294 		/* Some things misbehave if size == 0 on a directory */
295 		inode->i_size = 2 * DIRENT_SIZE;
296 		inode->i_op = &mqueue_dir_inode_operations;
297 		inode->i_fop = &simple_dir_operations;
298 	}
299 
300 	return inode;
301 out_inode:
302 	iput(inode);
303 err:
304 	return ERR_PTR(ret);
305 }
306 
307 static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
308 {
309 	struct inode *inode;
310 	struct ipc_namespace *ns = sb->s_fs_info;
311 
312 	sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
313 	sb->s_blocksize = PAGE_SIZE;
314 	sb->s_blocksize_bits = PAGE_SHIFT;
315 	sb->s_magic = MQUEUE_MAGIC;
316 	sb->s_op = &mqueue_super_ops;
317 
318 	inode = mqueue_get_inode(sb, ns, S_IFDIR | S_ISVTX | S_IRWXUGO, NULL);
319 	if (IS_ERR(inode))
320 		return PTR_ERR(inode);
321 
322 	sb->s_root = d_make_root(inode);
323 	if (!sb->s_root)
324 		return -ENOMEM;
325 	return 0;
326 }
327 
328 static struct dentry *mqueue_mount(struct file_system_type *fs_type,
329 			 int flags, const char *dev_name,
330 			 void *data)
331 {
332 	struct ipc_namespace *ns;
333 	if (flags & MS_KERNMOUNT) {
334 		ns = data;
335 		data = NULL;
336 	} else {
337 		ns = current->nsproxy->ipc_ns;
338 	}
339 	return mount_ns(fs_type, flags, data, ns, ns->user_ns, mqueue_fill_super);
340 }
341 
342 static void init_once(void *foo)
343 {
344 	struct mqueue_inode_info *p = (struct mqueue_inode_info *) foo;
345 
346 	inode_init_once(&p->vfs_inode);
347 }
348 
349 static struct inode *mqueue_alloc_inode(struct super_block *sb)
350 {
351 	struct mqueue_inode_info *ei;
352 
353 	ei = kmem_cache_alloc(mqueue_inode_cachep, GFP_KERNEL);
354 	if (!ei)
355 		return NULL;
356 	return &ei->vfs_inode;
357 }
358 
359 static void mqueue_i_callback(struct rcu_head *head)
360 {
361 	struct inode *inode = container_of(head, struct inode, i_rcu);
362 	kmem_cache_free(mqueue_inode_cachep, MQUEUE_I(inode));
363 }
364 
365 static void mqueue_destroy_inode(struct inode *inode)
366 {
367 	call_rcu(&inode->i_rcu, mqueue_i_callback);
368 }
369 
370 static void mqueue_evict_inode(struct inode *inode)
371 {
372 	struct mqueue_inode_info *info;
373 	struct user_struct *user;
374 	unsigned long mq_bytes, mq_treesize;
375 	struct ipc_namespace *ipc_ns;
376 	struct msg_msg *msg;
377 
378 	clear_inode(inode);
379 
380 	if (S_ISDIR(inode->i_mode))
381 		return;
382 
383 	ipc_ns = get_ns_from_inode(inode);
384 	info = MQUEUE_I(inode);
385 	spin_lock(&info->lock);
386 	while ((msg = msg_get(info)) != NULL)
387 		free_msg(msg);
388 	kfree(info->node_cache);
389 	spin_unlock(&info->lock);
390 
391 	/* Total amount of bytes accounted for the mqueue */
392 	mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) +
393 		min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) *
394 		sizeof(struct posix_msg_tree_node);
395 
396 	mq_bytes = mq_treesize + (info->attr.mq_maxmsg *
397 				  info->attr.mq_msgsize);
398 
399 	user = info->user;
400 	if (user) {
401 		spin_lock(&mq_lock);
402 		user->mq_bytes -= mq_bytes;
403 		/*
404 		 * get_ns_from_inode() ensures that the
405 		 * (ipc_ns = sb->s_fs_info) is either a valid ipc_ns
406 		 * to which we now hold a reference, or it is NULL.
407 		 * We can't put it here under mq_lock, though.
408 		 */
409 		if (ipc_ns)
410 			ipc_ns->mq_queues_count--;
411 		spin_unlock(&mq_lock);
412 		free_uid(user);
413 	}
414 	if (ipc_ns)
415 		put_ipc_ns(ipc_ns);
416 }
417 
418 static int mqueue_create(struct inode *dir, struct dentry *dentry,
419 				umode_t mode, bool excl)
420 {
421 	struct inode *inode;
422 	struct mq_attr *attr = dentry->d_fsdata;
423 	int error;
424 	struct ipc_namespace *ipc_ns;
425 
426 	spin_lock(&mq_lock);
427 	ipc_ns = __get_ns_from_inode(dir);
428 	if (!ipc_ns) {
429 		error = -EACCES;
430 		goto out_unlock;
431 	}
432 
433 	if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max &&
434 	    !capable(CAP_SYS_RESOURCE)) {
435 		error = -ENOSPC;
436 		goto out_unlock;
437 	}
438 	ipc_ns->mq_queues_count++;
439 	spin_unlock(&mq_lock);
440 
441 	inode = mqueue_get_inode(dir->i_sb, ipc_ns, mode, attr);
442 	if (IS_ERR(inode)) {
443 		error = PTR_ERR(inode);
444 		spin_lock(&mq_lock);
445 		ipc_ns->mq_queues_count--;
446 		goto out_unlock;
447 	}
448 
449 	put_ipc_ns(ipc_ns);
450 	dir->i_size += DIRENT_SIZE;
451 	dir->i_ctime = dir->i_mtime = dir->i_atime = current_time(dir);
452 
453 	d_instantiate(dentry, inode);
454 	dget(dentry);
455 	return 0;
456 out_unlock:
457 	spin_unlock(&mq_lock);
458 	if (ipc_ns)
459 		put_ipc_ns(ipc_ns);
460 	return error;
461 }
462 
463 static int mqueue_unlink(struct inode *dir, struct dentry *dentry)
464 {
465 	struct inode *inode = d_inode(dentry);
466 
467 	dir->i_ctime = dir->i_mtime = dir->i_atime = current_time(dir);
468 	dir->i_size -= DIRENT_SIZE;
469 	drop_nlink(inode);
470 	dput(dentry);
471 	return 0;
472 }
473 
474 /*
475 *	This is routine for system read from queue file.
476 *	To avoid mess with doing here some sort of mq_receive we allow
477 *	to read only queue size & notification info (the only values
478 *	that are interesting from user point of view and aren't accessible
479 *	through std routines)
480 */
481 static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
482 				size_t count, loff_t *off)
483 {
484 	struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
485 	char buffer[FILENT_SIZE];
486 	ssize_t ret;
487 
488 	spin_lock(&info->lock);
489 	snprintf(buffer, sizeof(buffer),
490 			"QSIZE:%-10lu NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n",
491 			info->qsize,
492 			info->notify_owner ? info->notify.sigev_notify : 0,
493 			(info->notify_owner &&
494 			 info->notify.sigev_notify == SIGEV_SIGNAL) ?
495 				info->notify.sigev_signo : 0,
496 			pid_vnr(info->notify_owner));
497 	spin_unlock(&info->lock);
498 	buffer[sizeof(buffer)-1] = '\0';
499 
500 	ret = simple_read_from_buffer(u_data, count, off, buffer,
501 				strlen(buffer));
502 	if (ret <= 0)
503 		return ret;
504 
505 	file_inode(filp)->i_atime = file_inode(filp)->i_ctime = current_time(file_inode(filp));
506 	return ret;
507 }
508 
509 static int mqueue_flush_file(struct file *filp, fl_owner_t id)
510 {
511 	struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
512 
513 	spin_lock(&info->lock);
514 	if (task_tgid(current) == info->notify_owner)
515 		remove_notification(info);
516 
517 	spin_unlock(&info->lock);
518 	return 0;
519 }
520 
521 static unsigned int mqueue_poll_file(struct file *filp, struct poll_table_struct *poll_tab)
522 {
523 	struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
524 	int retval = 0;
525 
526 	poll_wait(filp, &info->wait_q, poll_tab);
527 
528 	spin_lock(&info->lock);
529 	if (info->attr.mq_curmsgs)
530 		retval = POLLIN | POLLRDNORM;
531 
532 	if (info->attr.mq_curmsgs < info->attr.mq_maxmsg)
533 		retval |= POLLOUT | POLLWRNORM;
534 	spin_unlock(&info->lock);
535 
536 	return retval;
537 }
538 
539 /* Adds current to info->e_wait_q[sr] before element with smaller prio */
540 static void wq_add(struct mqueue_inode_info *info, int sr,
541 			struct ext_wait_queue *ewp)
542 {
543 	struct ext_wait_queue *walk;
544 
545 	ewp->task = current;
546 
547 	list_for_each_entry(walk, &info->e_wait_q[sr].list, list) {
548 		if (walk->task->static_prio <= current->static_prio) {
549 			list_add_tail(&ewp->list, &walk->list);
550 			return;
551 		}
552 	}
553 	list_add_tail(&ewp->list, &info->e_wait_q[sr].list);
554 }
555 
556 /*
557  * Puts current task to sleep. Caller must hold queue lock. After return
558  * lock isn't held.
559  * sr: SEND or RECV
560  */
561 static int wq_sleep(struct mqueue_inode_info *info, int sr,
562 		    ktime_t *timeout, struct ext_wait_queue *ewp)
563 	__releases(&info->lock)
564 {
565 	int retval;
566 	signed long time;
567 
568 	wq_add(info, sr, ewp);
569 
570 	for (;;) {
571 		__set_current_state(TASK_INTERRUPTIBLE);
572 
573 		spin_unlock(&info->lock);
574 		time = schedule_hrtimeout_range_clock(timeout, 0,
575 			HRTIMER_MODE_ABS, CLOCK_REALTIME);
576 
577 		if (ewp->state == STATE_READY) {
578 			retval = 0;
579 			goto out;
580 		}
581 		spin_lock(&info->lock);
582 		if (ewp->state == STATE_READY) {
583 			retval = 0;
584 			goto out_unlock;
585 		}
586 		if (signal_pending(current)) {
587 			retval = -ERESTARTSYS;
588 			break;
589 		}
590 		if (time == 0) {
591 			retval = -ETIMEDOUT;
592 			break;
593 		}
594 	}
595 	list_del(&ewp->list);
596 out_unlock:
597 	spin_unlock(&info->lock);
598 out:
599 	return retval;
600 }
601 
602 /*
603  * Returns waiting task that should be serviced first or NULL if none exists
604  */
605 static struct ext_wait_queue *wq_get_first_waiter(
606 		struct mqueue_inode_info *info, int sr)
607 {
608 	struct list_head *ptr;
609 
610 	ptr = info->e_wait_q[sr].list.prev;
611 	if (ptr == &info->e_wait_q[sr].list)
612 		return NULL;
613 	return list_entry(ptr, struct ext_wait_queue, list);
614 }
615 
616 
617 static inline void set_cookie(struct sk_buff *skb, char code)
618 {
619 	((char *)skb->data)[NOTIFY_COOKIE_LEN-1] = code;
620 }
621 
622 /*
623  * The next function is only to split too long sys_mq_timedsend
624  */
625 static void __do_notify(struct mqueue_inode_info *info)
626 {
627 	/* notification
628 	 * invoked when there is registered process and there isn't process
629 	 * waiting synchronously for message AND state of queue changed from
630 	 * empty to not empty. Here we are sure that no one is waiting
631 	 * synchronously. */
632 	if (info->notify_owner &&
633 	    info->attr.mq_curmsgs == 1) {
634 		struct siginfo sig_i;
635 		switch (info->notify.sigev_notify) {
636 		case SIGEV_NONE:
637 			break;
638 		case SIGEV_SIGNAL:
639 			/* sends signal */
640 
641 			sig_i.si_signo = info->notify.sigev_signo;
642 			sig_i.si_errno = 0;
643 			sig_i.si_code = SI_MESGQ;
644 			sig_i.si_value = info->notify.sigev_value;
645 			/* map current pid/uid into info->owner's namespaces */
646 			rcu_read_lock();
647 			sig_i.si_pid = task_tgid_nr_ns(current,
648 						ns_of_pid(info->notify_owner));
649 			sig_i.si_uid = from_kuid_munged(info->notify_user_ns, current_uid());
650 			rcu_read_unlock();
651 
652 			kill_pid_info(info->notify.sigev_signo,
653 				      &sig_i, info->notify_owner);
654 			break;
655 		case SIGEV_THREAD:
656 			set_cookie(info->notify_cookie, NOTIFY_WOKENUP);
657 			netlink_sendskb(info->notify_sock, info->notify_cookie);
658 			break;
659 		}
660 		/* after notification unregisters process */
661 		put_pid(info->notify_owner);
662 		put_user_ns(info->notify_user_ns);
663 		info->notify_owner = NULL;
664 		info->notify_user_ns = NULL;
665 	}
666 	wake_up(&info->wait_q);
667 }
668 
669 static int prepare_timeout(const struct timespec __user *u_abs_timeout,
670 			   ktime_t *expires, struct timespec *ts)
671 {
672 	if (copy_from_user(ts, u_abs_timeout, sizeof(struct timespec)))
673 		return -EFAULT;
674 	if (!timespec_valid(ts))
675 		return -EINVAL;
676 
677 	*expires = timespec_to_ktime(*ts);
678 	return 0;
679 }
680 
681 static void remove_notification(struct mqueue_inode_info *info)
682 {
683 	if (info->notify_owner != NULL &&
684 	    info->notify.sigev_notify == SIGEV_THREAD) {
685 		set_cookie(info->notify_cookie, NOTIFY_REMOVED);
686 		netlink_sendskb(info->notify_sock, info->notify_cookie);
687 	}
688 	put_pid(info->notify_owner);
689 	put_user_ns(info->notify_user_ns);
690 	info->notify_owner = NULL;
691 	info->notify_user_ns = NULL;
692 }
693 
694 static int mq_attr_ok(struct ipc_namespace *ipc_ns, struct mq_attr *attr)
695 {
696 	int mq_treesize;
697 	unsigned long total_size;
698 
699 	if (attr->mq_maxmsg <= 0 || attr->mq_msgsize <= 0)
700 		return -EINVAL;
701 	if (capable(CAP_SYS_RESOURCE)) {
702 		if (attr->mq_maxmsg > HARD_MSGMAX ||
703 		    attr->mq_msgsize > HARD_MSGSIZEMAX)
704 			return -EINVAL;
705 	} else {
706 		if (attr->mq_maxmsg > ipc_ns->mq_msg_max ||
707 				attr->mq_msgsize > ipc_ns->mq_msgsize_max)
708 			return -EINVAL;
709 	}
710 	/* check for overflow */
711 	if (attr->mq_msgsize > ULONG_MAX/attr->mq_maxmsg)
712 		return -EOVERFLOW;
713 	mq_treesize = attr->mq_maxmsg * sizeof(struct msg_msg) +
714 		min_t(unsigned int, attr->mq_maxmsg, MQ_PRIO_MAX) *
715 		sizeof(struct posix_msg_tree_node);
716 	total_size = attr->mq_maxmsg * attr->mq_msgsize;
717 	if (total_size + mq_treesize < total_size)
718 		return -EOVERFLOW;
719 	return 0;
720 }
721 
722 /*
723  * Invoked when creating a new queue via sys_mq_open
724  */
725 static struct file *do_create(struct ipc_namespace *ipc_ns, struct inode *dir,
726 			struct path *path, int oflag, umode_t mode,
727 			struct mq_attr *attr)
728 {
729 	const struct cred *cred = current_cred();
730 	int ret;
731 
732 	if (attr) {
733 		ret = mq_attr_ok(ipc_ns, attr);
734 		if (ret)
735 			return ERR_PTR(ret);
736 		/* store for use during create */
737 		path->dentry->d_fsdata = attr;
738 	} else {
739 		struct mq_attr def_attr;
740 
741 		def_attr.mq_maxmsg = min(ipc_ns->mq_msg_max,
742 					 ipc_ns->mq_msg_default);
743 		def_attr.mq_msgsize = min(ipc_ns->mq_msgsize_max,
744 					  ipc_ns->mq_msgsize_default);
745 		ret = mq_attr_ok(ipc_ns, &def_attr);
746 		if (ret)
747 			return ERR_PTR(ret);
748 	}
749 
750 	mode &= ~current_umask();
751 	ret = vfs_create(dir, path->dentry, mode, true);
752 	path->dentry->d_fsdata = NULL;
753 	if (ret)
754 		return ERR_PTR(ret);
755 	return dentry_open(path, oflag, cred);
756 }
757 
758 /* Opens existing queue */
759 static struct file *do_open(struct path *path, int oflag)
760 {
761 	static const int oflag2acc[O_ACCMODE] = { MAY_READ, MAY_WRITE,
762 						  MAY_READ | MAY_WRITE };
763 	int acc;
764 	if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY))
765 		return ERR_PTR(-EINVAL);
766 	acc = oflag2acc[oflag & O_ACCMODE];
767 	if (inode_permission(d_inode(path->dentry), acc))
768 		return ERR_PTR(-EACCES);
769 	return dentry_open(path, oflag, current_cred());
770 }
771 
772 SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode,
773 		struct mq_attr __user *, u_attr)
774 {
775 	struct path path;
776 	struct file *filp;
777 	struct filename *name;
778 	struct mq_attr attr;
779 	int fd, error;
780 	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
781 	struct vfsmount *mnt = ipc_ns->mq_mnt;
782 	struct dentry *root = mnt->mnt_root;
783 	int ro;
784 
785 	if (u_attr && copy_from_user(&attr, u_attr, sizeof(struct mq_attr)))
786 		return -EFAULT;
787 
788 	audit_mq_open(oflag, mode, u_attr ? &attr : NULL);
789 
790 	if (IS_ERR(name = getname(u_name)))
791 		return PTR_ERR(name);
792 
793 	fd = get_unused_fd_flags(O_CLOEXEC);
794 	if (fd < 0)
795 		goto out_putname;
796 
797 	ro = mnt_want_write(mnt);	/* we'll drop it in any case */
798 	error = 0;
799 	inode_lock(d_inode(root));
800 	path.dentry = lookup_one_len(name->name, root, strlen(name->name));
801 	if (IS_ERR(path.dentry)) {
802 		error = PTR_ERR(path.dentry);
803 		goto out_putfd;
804 	}
805 	path.mnt = mntget(mnt);
806 
807 	if (oflag & O_CREAT) {
808 		if (d_really_is_positive(path.dentry)) {	/* entry already exists */
809 			audit_inode(name, path.dentry, 0);
810 			if (oflag & O_EXCL) {
811 				error = -EEXIST;
812 				goto out;
813 			}
814 			filp = do_open(&path, oflag);
815 		} else {
816 			if (ro) {
817 				error = ro;
818 				goto out;
819 			}
820 			audit_inode_parent_hidden(name, root);
821 			filp = do_create(ipc_ns, d_inode(root),
822 						&path, oflag, mode,
823 						u_attr ? &attr : NULL);
824 		}
825 	} else {
826 		if (d_really_is_negative(path.dentry)) {
827 			error = -ENOENT;
828 			goto out;
829 		}
830 		audit_inode(name, path.dentry, 0);
831 		filp = do_open(&path, oflag);
832 	}
833 
834 	if (!IS_ERR(filp))
835 		fd_install(fd, filp);
836 	else
837 		error = PTR_ERR(filp);
838 out:
839 	path_put(&path);
840 out_putfd:
841 	if (error) {
842 		put_unused_fd(fd);
843 		fd = error;
844 	}
845 	inode_unlock(d_inode(root));
846 	if (!ro)
847 		mnt_drop_write(mnt);
848 out_putname:
849 	putname(name);
850 	return fd;
851 }
852 
853 SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
854 {
855 	int err;
856 	struct filename *name;
857 	struct dentry *dentry;
858 	struct inode *inode = NULL;
859 	struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
860 	struct vfsmount *mnt = ipc_ns->mq_mnt;
861 
862 	name = getname(u_name);
863 	if (IS_ERR(name))
864 		return PTR_ERR(name);
865 
866 	audit_inode_parent_hidden(name, mnt->mnt_root);
867 	err = mnt_want_write(mnt);
868 	if (err)
869 		goto out_name;
870 	inode_lock_nested(d_inode(mnt->mnt_root), I_MUTEX_PARENT);
871 	dentry = lookup_one_len(name->name, mnt->mnt_root,
872 				strlen(name->name));
873 	if (IS_ERR(dentry)) {
874 		err = PTR_ERR(dentry);
875 		goto out_unlock;
876 	}
877 
878 	inode = d_inode(dentry);
879 	if (!inode) {
880 		err = -ENOENT;
881 	} else {
882 		ihold(inode);
883 		err = vfs_unlink(d_inode(dentry->d_parent), dentry, NULL);
884 	}
885 	dput(dentry);
886 
887 out_unlock:
888 	inode_unlock(d_inode(mnt->mnt_root));
889 	if (inode)
890 		iput(inode);
891 	mnt_drop_write(mnt);
892 out_name:
893 	putname(name);
894 
895 	return err;
896 }
897 
898 /* Pipelined send and receive functions.
899  *
900  * If a receiver finds no waiting message, then it registers itself in the
901  * list of waiting receivers. A sender checks that list before adding the new
902  * message into the message array. If there is a waiting receiver, then it
903  * bypasses the message array and directly hands the message over to the
904  * receiver. The receiver accepts the message and returns without grabbing the
905  * queue spinlock:
906  *
907  * - Set pointer to message.
908  * - Queue the receiver task for later wakeup (without the info->lock).
909  * - Update its state to STATE_READY. Now the receiver can continue.
910  * - Wake up the process after the lock is dropped. Should the process wake up
911  *   before this wakeup (due to a timeout or a signal) it will either see
912  *   STATE_READY and continue or acquire the lock to check the state again.
913  *
914  * The same algorithm is used for senders.
915  */
916 
917 /* pipelined_send() - send a message directly to the task waiting in
918  * sys_mq_timedreceive() (without inserting message into a queue).
919  */
920 static inline void pipelined_send(struct wake_q_head *wake_q,
921 				  struct mqueue_inode_info *info,
922 				  struct msg_msg *message,
923 				  struct ext_wait_queue *receiver)
924 {
925 	receiver->msg = message;
926 	list_del(&receiver->list);
927 	wake_q_add(wake_q, receiver->task);
928 	/*
929 	 * Rely on the implicit cmpxchg barrier from wake_q_add such
930 	 * that we can ensure that updating receiver->state is the last
931 	 * write operation: As once set, the receiver can continue,
932 	 * and if we don't have the reference count from the wake_q,
933 	 * yet, at that point we can later have a use-after-free
934 	 * condition and bogus wakeup.
935 	 */
936 	receiver->state = STATE_READY;
937 }
938 
939 /* pipelined_receive() - if there is task waiting in sys_mq_timedsend()
940  * gets its message and put to the queue (we have one free place for sure). */
941 static inline void pipelined_receive(struct wake_q_head *wake_q,
942 				     struct mqueue_inode_info *info)
943 {
944 	struct ext_wait_queue *sender = wq_get_first_waiter(info, SEND);
945 
946 	if (!sender) {
947 		/* for poll */
948 		wake_up_interruptible(&info->wait_q);
949 		return;
950 	}
951 	if (msg_insert(sender->msg, info))
952 		return;
953 
954 	list_del(&sender->list);
955 	wake_q_add(wake_q, sender->task);
956 	sender->state = STATE_READY;
957 }
958 
959 SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
960 		size_t, msg_len, unsigned int, msg_prio,
961 		const struct timespec __user *, u_abs_timeout)
962 {
963 	struct fd f;
964 	struct inode *inode;
965 	struct ext_wait_queue wait;
966 	struct ext_wait_queue *receiver;
967 	struct msg_msg *msg_ptr;
968 	struct mqueue_inode_info *info;
969 	ktime_t expires, *timeout = NULL;
970 	struct timespec ts;
971 	struct posix_msg_tree_node *new_leaf = NULL;
972 	int ret = 0;
973 	DEFINE_WAKE_Q(wake_q);
974 
975 	if (u_abs_timeout) {
976 		int res = prepare_timeout(u_abs_timeout, &expires, &ts);
977 		if (res)
978 			return res;
979 		timeout = &expires;
980 	}
981 
982 	if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX))
983 		return -EINVAL;
984 
985 	audit_mq_sendrecv(mqdes, msg_len, msg_prio, timeout ? &ts : NULL);
986 
987 	f = fdget(mqdes);
988 	if (unlikely(!f.file)) {
989 		ret = -EBADF;
990 		goto out;
991 	}
992 
993 	inode = file_inode(f.file);
994 	if (unlikely(f.file->f_op != &mqueue_file_operations)) {
995 		ret = -EBADF;
996 		goto out_fput;
997 	}
998 	info = MQUEUE_I(inode);
999 	audit_file(f.file);
1000 
1001 	if (unlikely(!(f.file->f_mode & FMODE_WRITE))) {
1002 		ret = -EBADF;
1003 		goto out_fput;
1004 	}
1005 
1006 	if (unlikely(msg_len > info->attr.mq_msgsize)) {
1007 		ret = -EMSGSIZE;
1008 		goto out_fput;
1009 	}
1010 
1011 	/* First try to allocate memory, before doing anything with
1012 	 * existing queues. */
1013 	msg_ptr = load_msg(u_msg_ptr, msg_len);
1014 	if (IS_ERR(msg_ptr)) {
1015 		ret = PTR_ERR(msg_ptr);
1016 		goto out_fput;
1017 	}
1018 	msg_ptr->m_ts = msg_len;
1019 	msg_ptr->m_type = msg_prio;
1020 
1021 	/*
1022 	 * msg_insert really wants us to have a valid, spare node struct so
1023 	 * it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will
1024 	 * fall back to that if necessary.
1025 	 */
1026 	if (!info->node_cache)
1027 		new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);
1028 
1029 	spin_lock(&info->lock);
1030 
1031 	if (!info->node_cache && new_leaf) {
1032 		/* Save our speculative allocation into the cache */
1033 		INIT_LIST_HEAD(&new_leaf->msg_list);
1034 		info->node_cache = new_leaf;
1035 		new_leaf = NULL;
1036 	} else {
1037 		kfree(new_leaf);
1038 	}
1039 
1040 	if (info->attr.mq_curmsgs == info->attr.mq_maxmsg) {
1041 		if (f.file->f_flags & O_NONBLOCK) {
1042 			ret = -EAGAIN;
1043 		} else {
1044 			wait.task = current;
1045 			wait.msg = (void *) msg_ptr;
1046 			wait.state = STATE_NONE;
1047 			ret = wq_sleep(info, SEND, timeout, &wait);
1048 			/*
1049 			 * wq_sleep must be called with info->lock held, and
1050 			 * returns with the lock released
1051 			 */
1052 			goto out_free;
1053 		}
1054 	} else {
1055 		receiver = wq_get_first_waiter(info, RECV);
1056 		if (receiver) {
1057 			pipelined_send(&wake_q, info, msg_ptr, receiver);
1058 		} else {
1059 			/* adds message to the queue */
1060 			ret = msg_insert(msg_ptr, info);
1061 			if (ret)
1062 				goto out_unlock;
1063 			__do_notify(info);
1064 		}
1065 		inode->i_atime = inode->i_mtime = inode->i_ctime =
1066 				current_time(inode);
1067 	}
1068 out_unlock:
1069 	spin_unlock(&info->lock);
1070 	wake_up_q(&wake_q);
1071 out_free:
1072 	if (ret)
1073 		free_msg(msg_ptr);
1074 out_fput:
1075 	fdput(f);
1076 out:
1077 	return ret;
1078 }
1079 
1080 SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
1081 		size_t, msg_len, unsigned int __user *, u_msg_prio,
1082 		const struct timespec __user *, u_abs_timeout)
1083 {
1084 	ssize_t ret;
1085 	struct msg_msg *msg_ptr;
1086 	struct fd f;
1087 	struct inode *inode;
1088 	struct mqueue_inode_info *info;
1089 	struct ext_wait_queue wait;
1090 	ktime_t expires, *timeout = NULL;
1091 	struct timespec ts;
1092 	struct posix_msg_tree_node *new_leaf = NULL;
1093 
1094 	if (u_abs_timeout) {
1095 		int res = prepare_timeout(u_abs_timeout, &expires, &ts);
1096 		if (res)
1097 			return res;
1098 		timeout = &expires;
1099 	}
1100 
1101 	audit_mq_sendrecv(mqdes, msg_len, 0, timeout ? &ts : NULL);
1102 
1103 	f = fdget(mqdes);
1104 	if (unlikely(!f.file)) {
1105 		ret = -EBADF;
1106 		goto out;
1107 	}
1108 
1109 	inode = file_inode(f.file);
1110 	if (unlikely(f.file->f_op != &mqueue_file_operations)) {
1111 		ret = -EBADF;
1112 		goto out_fput;
1113 	}
1114 	info = MQUEUE_I(inode);
1115 	audit_file(f.file);
1116 
1117 	if (unlikely(!(f.file->f_mode & FMODE_READ))) {
1118 		ret = -EBADF;
1119 		goto out_fput;
1120 	}
1121 
1122 	/* checks if buffer is big enough */
1123 	if (unlikely(msg_len < info->attr.mq_msgsize)) {
1124 		ret = -EMSGSIZE;
1125 		goto out_fput;
1126 	}
1127 
1128 	/*
1129 	 * msg_insert really wants us to have a valid, spare node struct so
1130 	 * it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will
1131 	 * fall back to that if necessary.
1132 	 */
1133 	if (!info->node_cache)
1134 		new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL);
1135 
1136 	spin_lock(&info->lock);
1137 
1138 	if (!info->node_cache && new_leaf) {
1139 		/* Save our speculative allocation into the cache */
1140 		INIT_LIST_HEAD(&new_leaf->msg_list);
1141 		info->node_cache = new_leaf;
1142 	} else {
1143 		kfree(new_leaf);
1144 	}
1145 
1146 	if (info->attr.mq_curmsgs == 0) {
1147 		if (f.file->f_flags & O_NONBLOCK) {
1148 			spin_unlock(&info->lock);
1149 			ret = -EAGAIN;
1150 		} else {
1151 			wait.task = current;
1152 			wait.state = STATE_NONE;
1153 			ret = wq_sleep(info, RECV, timeout, &wait);
1154 			msg_ptr = wait.msg;
1155 		}
1156 	} else {
1157 		DEFINE_WAKE_Q(wake_q);
1158 
1159 		msg_ptr = msg_get(info);
1160 
1161 		inode->i_atime = inode->i_mtime = inode->i_ctime =
1162 				current_time(inode);
1163 
1164 		/* There is now free space in queue. */
1165 		pipelined_receive(&wake_q, info);
1166 		spin_unlock(&info->lock);
1167 		wake_up_q(&wake_q);
1168 		ret = 0;
1169 	}
1170 	if (ret == 0) {
1171 		ret = msg_ptr->m_ts;
1172 
1173 		if ((u_msg_prio && put_user(msg_ptr->m_type, u_msg_prio)) ||
1174 			store_msg(u_msg_ptr, msg_ptr, msg_ptr->m_ts)) {
1175 			ret = -EFAULT;
1176 		}
1177 		free_msg(msg_ptr);
1178 	}
1179 out_fput:
1180 	fdput(f);
1181 out:
1182 	return ret;
1183 }
1184 
1185 /*
1186  * Notes: the case when user wants us to deregister (with NULL as pointer)
1187  * and he isn't currently owner of notification, will be silently discarded.
1188  * It isn't explicitly defined in the POSIX.
1189  */
1190 SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
1191 		const struct sigevent __user *, u_notification)
1192 {
1193 	int ret;
1194 	struct fd f;
1195 	struct sock *sock;
1196 	struct inode *inode;
1197 	struct sigevent notification;
1198 	struct mqueue_inode_info *info;
1199 	struct sk_buff *nc;
1200 
1201 	if (u_notification) {
1202 		if (copy_from_user(&notification, u_notification,
1203 					sizeof(struct sigevent)))
1204 			return -EFAULT;
1205 	}
1206 
1207 	audit_mq_notify(mqdes, u_notification ? &notification : NULL);
1208 
1209 	nc = NULL;
1210 	sock = NULL;
1211 	if (u_notification != NULL) {
1212 		if (unlikely(notification.sigev_notify != SIGEV_NONE &&
1213 			     notification.sigev_notify != SIGEV_SIGNAL &&
1214 			     notification.sigev_notify != SIGEV_THREAD))
1215 			return -EINVAL;
1216 		if (notification.sigev_notify == SIGEV_SIGNAL &&
1217 			!valid_signal(notification.sigev_signo)) {
1218 			return -EINVAL;
1219 		}
1220 		if (notification.sigev_notify == SIGEV_THREAD) {
1221 			long timeo;
1222 
1223 			/* create the notify skb */
1224 			nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL);
1225 			if (!nc) {
1226 				ret = -ENOMEM;
1227 				goto out;
1228 			}
1229 			if (copy_from_user(nc->data,
1230 					notification.sigev_value.sival_ptr,
1231 					NOTIFY_COOKIE_LEN)) {
1232 				ret = -EFAULT;
1233 				goto out;
1234 			}
1235 
1236 			/* TODO: add a header? */
1237 			skb_put(nc, NOTIFY_COOKIE_LEN);
1238 			/* and attach it to the socket */
1239 retry:
1240 			f = fdget(notification.sigev_signo);
1241 			if (!f.file) {
1242 				ret = -EBADF;
1243 				goto out;
1244 			}
1245 			sock = netlink_getsockbyfilp(f.file);
1246 			fdput(f);
1247 			if (IS_ERR(sock)) {
1248 				ret = PTR_ERR(sock);
1249 				sock = NULL;
1250 				goto out;
1251 			}
1252 
1253 			timeo = MAX_SCHEDULE_TIMEOUT;
1254 			ret = netlink_attachskb(sock, nc, &timeo, NULL);
1255 			if (ret == 1)
1256 				goto retry;
1257 			if (ret) {
1258 				sock = NULL;
1259 				nc = NULL;
1260 				goto out;
1261 			}
1262 		}
1263 	}
1264 
1265 	f = fdget(mqdes);
1266 	if (!f.file) {
1267 		ret = -EBADF;
1268 		goto out;
1269 	}
1270 
1271 	inode = file_inode(f.file);
1272 	if (unlikely(f.file->f_op != &mqueue_file_operations)) {
1273 		ret = -EBADF;
1274 		goto out_fput;
1275 	}
1276 	info = MQUEUE_I(inode);
1277 
1278 	ret = 0;
1279 	spin_lock(&info->lock);
1280 	if (u_notification == NULL) {
1281 		if (info->notify_owner == task_tgid(current)) {
1282 			remove_notification(info);
1283 			inode->i_atime = inode->i_ctime = current_time(inode);
1284 		}
1285 	} else if (info->notify_owner != NULL) {
1286 		ret = -EBUSY;
1287 	} else {
1288 		switch (notification.sigev_notify) {
1289 		case SIGEV_NONE:
1290 			info->notify.sigev_notify = SIGEV_NONE;
1291 			break;
1292 		case SIGEV_THREAD:
1293 			info->notify_sock = sock;
1294 			info->notify_cookie = nc;
1295 			sock = NULL;
1296 			nc = NULL;
1297 			info->notify.sigev_notify = SIGEV_THREAD;
1298 			break;
1299 		case SIGEV_SIGNAL:
1300 			info->notify.sigev_signo = notification.sigev_signo;
1301 			info->notify.sigev_value = notification.sigev_value;
1302 			info->notify.sigev_notify = SIGEV_SIGNAL;
1303 			break;
1304 		}
1305 
1306 		info->notify_owner = get_pid(task_tgid(current));
1307 		info->notify_user_ns = get_user_ns(current_user_ns());
1308 		inode->i_atime = inode->i_ctime = current_time(inode);
1309 	}
1310 	spin_unlock(&info->lock);
1311 out_fput:
1312 	fdput(f);
1313 out:
1314 	if (sock)
1315 		netlink_detachskb(sock, nc);
1316 	else if (nc)
1317 		dev_kfree_skb(nc);
1318 
1319 	return ret;
1320 }
1321 
1322 SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,
1323 		const struct mq_attr __user *, u_mqstat,
1324 		struct mq_attr __user *, u_omqstat)
1325 {
1326 	int ret;
1327 	struct mq_attr mqstat, omqstat;
1328 	struct fd f;
1329 	struct inode *inode;
1330 	struct mqueue_inode_info *info;
1331 
1332 	if (u_mqstat != NULL) {
1333 		if (copy_from_user(&mqstat, u_mqstat, sizeof(struct mq_attr)))
1334 			return -EFAULT;
1335 		if (mqstat.mq_flags & (~O_NONBLOCK))
1336 			return -EINVAL;
1337 	}
1338 
1339 	f = fdget(mqdes);
1340 	if (!f.file) {
1341 		ret = -EBADF;
1342 		goto out;
1343 	}
1344 
1345 	inode = file_inode(f.file);
1346 	if (unlikely(f.file->f_op != &mqueue_file_operations)) {
1347 		ret = -EBADF;
1348 		goto out_fput;
1349 	}
1350 	info = MQUEUE_I(inode);
1351 
1352 	spin_lock(&info->lock);
1353 
1354 	omqstat = info->attr;
1355 	omqstat.mq_flags = f.file->f_flags & O_NONBLOCK;
1356 	if (u_mqstat) {
1357 		audit_mq_getsetattr(mqdes, &mqstat);
1358 		spin_lock(&f.file->f_lock);
1359 		if (mqstat.mq_flags & O_NONBLOCK)
1360 			f.file->f_flags |= O_NONBLOCK;
1361 		else
1362 			f.file->f_flags &= ~O_NONBLOCK;
1363 		spin_unlock(&f.file->f_lock);
1364 
1365 		inode->i_atime = inode->i_ctime = current_time(inode);
1366 	}
1367 
1368 	spin_unlock(&info->lock);
1369 
1370 	ret = 0;
1371 	if (u_omqstat != NULL && copy_to_user(u_omqstat, &omqstat,
1372 						sizeof(struct mq_attr)))
1373 		ret = -EFAULT;
1374 
1375 out_fput:
1376 	fdput(f);
1377 out:
1378 	return ret;
1379 }
1380 
1381 static const struct inode_operations mqueue_dir_inode_operations = {
1382 	.lookup = simple_lookup,
1383 	.create = mqueue_create,
1384 	.unlink = mqueue_unlink,
1385 };
1386 
1387 static const struct file_operations mqueue_file_operations = {
1388 	.flush = mqueue_flush_file,
1389 	.poll = mqueue_poll_file,
1390 	.read = mqueue_read_file,
1391 	.llseek = default_llseek,
1392 };
1393 
1394 static const struct super_operations mqueue_super_ops = {
1395 	.alloc_inode = mqueue_alloc_inode,
1396 	.destroy_inode = mqueue_destroy_inode,
1397 	.evict_inode = mqueue_evict_inode,
1398 	.statfs = simple_statfs,
1399 };
1400 
1401 static struct file_system_type mqueue_fs_type = {
1402 	.name = "mqueue",
1403 	.mount = mqueue_mount,
1404 	.kill_sb = kill_litter_super,
1405 	.fs_flags = FS_USERNS_MOUNT,
1406 };
1407 
1408 int mq_init_ns(struct ipc_namespace *ns)
1409 {
1410 	ns->mq_queues_count  = 0;
1411 	ns->mq_queues_max    = DFLT_QUEUESMAX;
1412 	ns->mq_msg_max       = DFLT_MSGMAX;
1413 	ns->mq_msgsize_max   = DFLT_MSGSIZEMAX;
1414 	ns->mq_msg_default   = DFLT_MSG;
1415 	ns->mq_msgsize_default  = DFLT_MSGSIZE;
1416 
1417 	ns->mq_mnt = kern_mount_data(&mqueue_fs_type, ns);
1418 	if (IS_ERR(ns->mq_mnt)) {
1419 		int err = PTR_ERR(ns->mq_mnt);
1420 		ns->mq_mnt = NULL;
1421 		return err;
1422 	}
1423 	return 0;
1424 }
1425 
1426 void mq_clear_sbinfo(struct ipc_namespace *ns)
1427 {
1428 	ns->mq_mnt->mnt_sb->s_fs_info = NULL;
1429 }
1430 
1431 void mq_put_mnt(struct ipc_namespace *ns)
1432 {
1433 	kern_unmount(ns->mq_mnt);
1434 }
1435 
1436 static int __init init_mqueue_fs(void)
1437 {
1438 	int error;
1439 
1440 	mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache",
1441 				sizeof(struct mqueue_inode_info), 0,
1442 				SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, init_once);
1443 	if (mqueue_inode_cachep == NULL)
1444 		return -ENOMEM;
1445 
1446 	/* ignore failures - they are not fatal */
1447 	mq_sysctl_table = mq_register_sysctl_table();
1448 
1449 	error = register_filesystem(&mqueue_fs_type);
1450 	if (error)
1451 		goto out_sysctl;
1452 
1453 	spin_lock_init(&mq_lock);
1454 
1455 	error = mq_init_ns(&init_ipc_ns);
1456 	if (error)
1457 		goto out_filesystem;
1458 
1459 	return 0;
1460 
1461 out_filesystem:
1462 	unregister_filesystem(&mqueue_fs_type);
1463 out_sysctl:
1464 	if (mq_sysctl_table)
1465 		unregister_sysctl_table(mq_sysctl_table);
1466 	kmem_cache_destroy(mqueue_inode_cachep);
1467 	return error;
1468 }
1469 
1470 device_initcall(init_mqueue_fs);
1471