1 #ifndef _LINUX_CGROUP_H 2 #define _LINUX_CGROUP_H 3 /* 4 * cgroup interface 5 * 6 * Copyright (C) 2003 BULL SA 7 * Copyright (C) 2004-2006 Silicon Graphics, Inc. 8 * 9 */ 10 11 #include <linux/sched.h> 12 #include <linux/cpumask.h> 13 #include <linux/nodemask.h> 14 #include <linux/rcupdate.h> 15 #include <linux/cgroupstats.h> 16 #include <linux/prio_heap.h> 17 #include <linux/rwsem.h> 18 #include <linux/idr.h> 19 #include <linux/workqueue.h> 20 #include <linux/xattr.h> 21 22 #ifdef CONFIG_CGROUPS 23 24 struct cgroupfs_root; 25 struct cgroup_subsys; 26 struct inode; 27 struct cgroup; 28 struct css_id; 29 30 extern int cgroup_init_early(void); 31 extern int cgroup_init(void); 32 extern void cgroup_lock(void); 33 extern int cgroup_lock_is_held(void); 34 extern bool cgroup_lock_live_group(struct cgroup *cgrp); 35 extern void cgroup_unlock(void); 36 extern void cgroup_fork(struct task_struct *p); 37 extern void cgroup_fork_callbacks(struct task_struct *p); 38 extern void cgroup_post_fork(struct task_struct *p); 39 extern void cgroup_exit(struct task_struct *p, int run_callbacks); 40 extern int cgroupstats_build(struct cgroupstats *stats, 41 struct dentry *dentry); 42 extern int cgroup_load_subsys(struct cgroup_subsys *ss); 43 extern void cgroup_unload_subsys(struct cgroup_subsys *ss); 44 45 extern const struct file_operations proc_cgroup_operations; 46 47 /* Define the enumeration of all builtin cgroup subsystems */ 48 #define SUBSYS(_x) _x ## _subsys_id, 49 #define IS_SUBSYS_ENABLED(option) IS_ENABLED(option) 50 enum cgroup_subsys_id { 51 #include <linux/cgroup_subsys.h> 52 CGROUP_SUBSYS_COUNT, 53 }; 54 #undef IS_SUBSYS_ENABLED 55 #undef SUBSYS 56 57 /* Per-subsystem/per-cgroup state maintained by the system. */ 58 struct cgroup_subsys_state { 59 /* 60 * The cgroup that this subsystem is attached to. Useful 61 * for subsystems that want to know about the cgroup 62 * hierarchy structure 63 */ 64 struct cgroup *cgroup; 65 66 /* 67 * State maintained by the cgroup system to allow subsystems 68 * to be "busy". Should be accessed via css_get(), 69 * css_tryget() and and css_put(). 70 */ 71 72 atomic_t refcnt; 73 74 unsigned long flags; 75 /* ID for this css, if possible */ 76 struct css_id __rcu *id; 77 78 /* Used to put @cgroup->dentry on the last css_put() */ 79 struct work_struct dput_work; 80 }; 81 82 /* bits in struct cgroup_subsys_state flags field */ 83 enum { 84 CSS_ROOT, /* This CSS is the root of the subsystem */ 85 CSS_REMOVED, /* This CSS is dead */ 86 CSS_CLEAR_CSS_REFS, /* @ss->__DEPRECATED_clear_css_refs */ 87 }; 88 89 /* Caller must verify that the css is not for root cgroup */ 90 static inline void __css_get(struct cgroup_subsys_state *css, int count) 91 { 92 atomic_add(count, &css->refcnt); 93 } 94 95 /* 96 * Call css_get() to hold a reference on the css; it can be used 97 * for a reference obtained via: 98 * - an existing ref-counted reference to the css 99 * - task->cgroups for a locked task 100 */ 101 102 static inline void css_get(struct cgroup_subsys_state *css) 103 { 104 /* We don't need to reference count the root state */ 105 if (!test_bit(CSS_ROOT, &css->flags)) 106 __css_get(css, 1); 107 } 108 109 static inline bool css_is_removed(struct cgroup_subsys_state *css) 110 { 111 return test_bit(CSS_REMOVED, &css->flags); 112 } 113 114 /* 115 * Call css_tryget() to take a reference on a css if your existing 116 * (known-valid) reference isn't already ref-counted. Returns false if 117 * the css has been destroyed. 118 */ 119 120 extern bool __css_tryget(struct cgroup_subsys_state *css); 121 static inline bool css_tryget(struct cgroup_subsys_state *css) 122 { 123 if (test_bit(CSS_ROOT, &css->flags)) 124 return true; 125 return __css_tryget(css); 126 } 127 128 /* 129 * css_put() should be called to release a reference taken by 130 * css_get() or css_tryget() 131 */ 132 133 extern void __css_put(struct cgroup_subsys_state *css); 134 static inline void css_put(struct cgroup_subsys_state *css) 135 { 136 if (!test_bit(CSS_ROOT, &css->flags)) 137 __css_put(css); 138 } 139 140 /* bits in struct cgroup flags field */ 141 enum { 142 /* Control Group is dead */ 143 CGRP_REMOVED, 144 /* 145 * Control Group has previously had a child cgroup or a task, 146 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) 147 */ 148 CGRP_RELEASABLE, 149 /* Control Group requires release notifications to userspace */ 150 CGRP_NOTIFY_ON_RELEASE, 151 /* 152 * A thread in rmdir() is wating for this cgroup. 153 */ 154 CGRP_WAIT_ON_RMDIR, 155 /* 156 * Clone cgroup values when creating a new child cgroup 157 */ 158 CGRP_CLONE_CHILDREN, 159 }; 160 161 struct cgroup { 162 unsigned long flags; /* "unsigned long" so bitops work */ 163 164 /* 165 * count users of this cgroup. >0 means busy, but doesn't 166 * necessarily indicate the number of tasks in the cgroup 167 */ 168 atomic_t count; 169 170 /* 171 * We link our 'sibling' struct into our parent's 'children'. 172 * Our children link their 'sibling' into our 'children'. 173 */ 174 struct list_head sibling; /* my parent's children */ 175 struct list_head children; /* my children */ 176 struct list_head files; /* my files */ 177 178 struct cgroup *parent; /* my parent */ 179 struct dentry __rcu *dentry; /* cgroup fs entry, RCU protected */ 180 181 /* Private pointers for each registered subsystem */ 182 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; 183 184 struct cgroupfs_root *root; 185 struct cgroup *top_cgroup; 186 187 /* 188 * List of cg_cgroup_links pointing at css_sets with 189 * tasks in this cgroup. Protected by css_set_lock 190 */ 191 struct list_head css_sets; 192 193 struct list_head allcg_node; /* cgroupfs_root->allcg_list */ 194 struct list_head cft_q_node; /* used during cftype add/rm */ 195 196 /* 197 * Linked list running through all cgroups that can 198 * potentially be reaped by the release agent. Protected by 199 * release_list_lock 200 */ 201 struct list_head release_list; 202 203 /* 204 * list of pidlists, up to two for each namespace (one for procs, one 205 * for tasks); created on demand. 206 */ 207 struct list_head pidlists; 208 struct mutex pidlist_mutex; 209 210 /* For RCU-protected deletion */ 211 struct rcu_head rcu_head; 212 213 /* List of events which userspace want to receive */ 214 struct list_head event_list; 215 spinlock_t event_list_lock; 216 217 /* directory xattrs */ 218 struct simple_xattrs xattrs; 219 }; 220 221 /* 222 * A css_set is a structure holding pointers to a set of 223 * cgroup_subsys_state objects. This saves space in the task struct 224 * object and speeds up fork()/exit(), since a single inc/dec and a 225 * list_add()/del() can bump the reference count on the entire cgroup 226 * set for a task. 227 */ 228 229 struct css_set { 230 231 /* Reference count */ 232 atomic_t refcount; 233 234 /* 235 * List running through all cgroup groups in the same hash 236 * slot. Protected by css_set_lock 237 */ 238 struct hlist_node hlist; 239 240 /* 241 * List running through all tasks using this cgroup 242 * group. Protected by css_set_lock 243 */ 244 struct list_head tasks; 245 246 /* 247 * List of cg_cgroup_link objects on link chains from 248 * cgroups referenced from this css_set. Protected by 249 * css_set_lock 250 */ 251 struct list_head cg_links; 252 253 /* 254 * Set of subsystem states, one for each subsystem. This array 255 * is immutable after creation apart from the init_css_set 256 * during subsystem registration (at boot time) and modular subsystem 257 * loading/unloading. 258 */ 259 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; 260 261 /* For RCU-protected deletion */ 262 struct rcu_head rcu_head; 263 }; 264 265 /* 266 * cgroup_map_cb is an abstract callback API for reporting map-valued 267 * control files 268 */ 269 270 struct cgroup_map_cb { 271 int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value); 272 void *state; 273 }; 274 275 /* 276 * struct cftype: handler definitions for cgroup control files 277 * 278 * When reading/writing to a file: 279 * - the cgroup to use is file->f_dentry->d_parent->d_fsdata 280 * - the 'cftype' of the file is file->f_dentry->d_fsdata 281 */ 282 283 /* cftype->flags */ 284 #define CFTYPE_ONLY_ON_ROOT (1U << 0) /* only create on root cg */ 285 #define CFTYPE_NOT_ON_ROOT (1U << 1) /* don't create onp root cg */ 286 287 #define MAX_CFTYPE_NAME 64 288 289 struct cftype { 290 /* 291 * By convention, the name should begin with the name of the 292 * subsystem, followed by a period. Zero length string indicates 293 * end of cftype array. 294 */ 295 char name[MAX_CFTYPE_NAME]; 296 int private; 297 /* 298 * If not 0, file mode is set to this value, otherwise it will 299 * be figured out automatically 300 */ 301 umode_t mode; 302 303 /* 304 * If non-zero, defines the maximum length of string that can 305 * be passed to write_string; defaults to 64 306 */ 307 size_t max_write_len; 308 309 /* CFTYPE_* flags */ 310 unsigned int flags; 311 312 /* file xattrs */ 313 struct simple_xattrs xattrs; 314 315 int (*open)(struct inode *inode, struct file *file); 316 ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft, 317 struct file *file, 318 char __user *buf, size_t nbytes, loff_t *ppos); 319 /* 320 * read_u64() is a shortcut for the common case of returning a 321 * single integer. Use it in place of read() 322 */ 323 u64 (*read_u64)(struct cgroup *cgrp, struct cftype *cft); 324 /* 325 * read_s64() is a signed version of read_u64() 326 */ 327 s64 (*read_s64)(struct cgroup *cgrp, struct cftype *cft); 328 /* 329 * read_map() is used for defining a map of key/value 330 * pairs. It should call cb->fill(cb, key, value) for each 331 * entry. The key/value pairs (and their ordering) should not 332 * change between reboots. 333 */ 334 int (*read_map)(struct cgroup *cont, struct cftype *cft, 335 struct cgroup_map_cb *cb); 336 /* 337 * read_seq_string() is used for outputting a simple sequence 338 * using seqfile. 339 */ 340 int (*read_seq_string)(struct cgroup *cont, struct cftype *cft, 341 struct seq_file *m); 342 343 ssize_t (*write)(struct cgroup *cgrp, struct cftype *cft, 344 struct file *file, 345 const char __user *buf, size_t nbytes, loff_t *ppos); 346 347 /* 348 * write_u64() is a shortcut for the common case of accepting 349 * a single integer (as parsed by simple_strtoull) from 350 * userspace. Use in place of write(); return 0 or error. 351 */ 352 int (*write_u64)(struct cgroup *cgrp, struct cftype *cft, u64 val); 353 /* 354 * write_s64() is a signed version of write_u64() 355 */ 356 int (*write_s64)(struct cgroup *cgrp, struct cftype *cft, s64 val); 357 358 /* 359 * write_string() is passed a nul-terminated kernelspace 360 * buffer of maximum length determined by max_write_len. 361 * Returns 0 or -ve error code. 362 */ 363 int (*write_string)(struct cgroup *cgrp, struct cftype *cft, 364 const char *buffer); 365 /* 366 * trigger() callback can be used to get some kick from the 367 * userspace, when the actual string written is not important 368 * at all. The private field can be used to determine the 369 * kick type for multiplexing. 370 */ 371 int (*trigger)(struct cgroup *cgrp, unsigned int event); 372 373 int (*release)(struct inode *inode, struct file *file); 374 375 /* 376 * register_event() callback will be used to add new userspace 377 * waiter for changes related to the cftype. Implement it if 378 * you want to provide this functionality. Use eventfd_signal() 379 * on eventfd to send notification to userspace. 380 */ 381 int (*register_event)(struct cgroup *cgrp, struct cftype *cft, 382 struct eventfd_ctx *eventfd, const char *args); 383 /* 384 * unregister_event() callback will be called when userspace 385 * closes the eventfd or on cgroup removing. 386 * This callback must be implemented, if you want provide 387 * notification functionality. 388 */ 389 void (*unregister_event)(struct cgroup *cgrp, struct cftype *cft, 390 struct eventfd_ctx *eventfd); 391 }; 392 393 /* 394 * cftype_sets describe cftypes belonging to a subsystem and are chained at 395 * cgroup_subsys->cftsets. Each cftset points to an array of cftypes 396 * terminated by zero length name. 397 */ 398 struct cftype_set { 399 struct list_head node; /* chained at subsys->cftsets */ 400 struct cftype *cfts; 401 }; 402 403 struct cgroup_scanner { 404 struct cgroup *cg; 405 int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan); 406 void (*process_task)(struct task_struct *p, 407 struct cgroup_scanner *scan); 408 struct ptr_heap *heap; 409 void *data; 410 }; 411 412 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); 413 int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); 414 415 int cgroup_is_removed(const struct cgroup *cgrp); 416 417 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); 418 419 int cgroup_task_count(const struct cgroup *cgrp); 420 421 /* Return true if cgrp is a descendant of the task's cgroup */ 422 int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); 423 424 /* 425 * When the subsys has to access css and may add permanent refcnt to css, 426 * it should take care of racy conditions with rmdir(). Following set of 427 * functions, is for stop/restart rmdir if necessary. 428 * Because these will call css_get/put, "css" should be alive css. 429 * 430 * cgroup_exclude_rmdir(); 431 * ...do some jobs which may access arbitrary empty cgroup 432 * cgroup_release_and_wakeup_rmdir(); 433 * 434 * When someone removes a cgroup while cgroup_exclude_rmdir() holds it, 435 * it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up. 436 */ 437 438 void cgroup_exclude_rmdir(struct cgroup_subsys_state *css); 439 void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css); 440 441 /* 442 * Control Group taskset, used to pass around set of tasks to cgroup_subsys 443 * methods. 444 */ 445 struct cgroup_taskset; 446 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); 447 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); 448 struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset); 449 int cgroup_taskset_size(struct cgroup_taskset *tset); 450 451 /** 452 * cgroup_taskset_for_each - iterate cgroup_taskset 453 * @task: the loop cursor 454 * @skip_cgrp: skip if task's cgroup matches this, %NULL to iterate through all 455 * @tset: taskset to iterate 456 */ 457 #define cgroup_taskset_for_each(task, skip_cgrp, tset) \ 458 for ((task) = cgroup_taskset_first((tset)); (task); \ 459 (task) = cgroup_taskset_next((tset))) \ 460 if (!(skip_cgrp) || \ 461 cgroup_taskset_cur_cgroup((tset)) != (skip_cgrp)) 462 463 /* 464 * Control Group subsystem type. 465 * See Documentation/cgroups/cgroups.txt for details 466 */ 467 468 struct cgroup_subsys { 469 struct cgroup_subsys_state *(*create)(struct cgroup *cgrp); 470 int (*pre_destroy)(struct cgroup *cgrp); 471 void (*destroy)(struct cgroup *cgrp); 472 int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); 473 void (*cancel_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); 474 void (*attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); 475 void (*fork)(struct task_struct *task); 476 void (*exit)(struct cgroup *cgrp, struct cgroup *old_cgrp, 477 struct task_struct *task); 478 void (*post_clone)(struct cgroup *cgrp); 479 void (*bind)(struct cgroup *root); 480 481 int subsys_id; 482 int active; 483 int disabled; 484 int early_init; 485 /* 486 * True if this subsys uses ID. ID is not available before cgroup_init() 487 * (not available in early_init time.) 488 */ 489 bool use_id; 490 491 /* 492 * If %true, cgroup removal will try to clear css refs by retrying 493 * ss->pre_destroy() until there's no css ref left. This behavior 494 * is strictly for backward compatibility and will be removed as 495 * soon as the current user (memcg) is updated. 496 * 497 * If %false, ss->pre_destroy() can't fail and cgroup removal won't 498 * wait for css refs to drop to zero before proceeding. 499 */ 500 bool __DEPRECATED_clear_css_refs; 501 502 /* 503 * If %false, this subsystem is properly hierarchical - 504 * configuration, resource accounting and restriction on a parent 505 * cgroup cover those of its children. If %true, hierarchy support 506 * is broken in some ways - some subsystems ignore hierarchy 507 * completely while others are only implemented half-way. 508 * 509 * It's now disallowed to create nested cgroups if the subsystem is 510 * broken and cgroup core will emit a warning message on such 511 * cases. Eventually, all subsystems will be made properly 512 * hierarchical and this will go away. 513 */ 514 bool broken_hierarchy; 515 bool warned_broken_hierarchy; 516 517 #define MAX_CGROUP_TYPE_NAMELEN 32 518 const char *name; 519 520 /* 521 * Link to parent, and list entry in parent's children. 522 * Protected by cgroup_lock() 523 */ 524 struct cgroupfs_root *root; 525 struct list_head sibling; 526 /* used when use_id == true */ 527 struct idr idr; 528 spinlock_t id_lock; 529 530 /* list of cftype_sets */ 531 struct list_head cftsets; 532 533 /* base cftypes, automatically [de]registered with subsys itself */ 534 struct cftype *base_cftypes; 535 struct cftype_set base_cftset; 536 537 /* should be defined only by modular subsystems */ 538 struct module *module; 539 }; 540 541 #define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys; 542 #define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) 543 #include <linux/cgroup_subsys.h> 544 #undef IS_SUBSYS_ENABLED 545 #undef SUBSYS 546 547 static inline struct cgroup_subsys_state *cgroup_subsys_state( 548 struct cgroup *cgrp, int subsys_id) 549 { 550 return cgrp->subsys[subsys_id]; 551 } 552 553 /* 554 * function to get the cgroup_subsys_state which allows for extra 555 * rcu_dereference_check() conditions, such as locks used during the 556 * cgroup_subsys::attach() methods. 557 */ 558 #define task_subsys_state_check(task, subsys_id, __c) \ 559 rcu_dereference_check(task->cgroups->subsys[subsys_id], \ 560 lockdep_is_held(&task->alloc_lock) || \ 561 cgroup_lock_is_held() || (__c)) 562 563 static inline struct cgroup_subsys_state * 564 task_subsys_state(struct task_struct *task, int subsys_id) 565 { 566 return task_subsys_state_check(task, subsys_id, false); 567 } 568 569 static inline struct cgroup* task_cgroup(struct task_struct *task, 570 int subsys_id) 571 { 572 return task_subsys_state(task, subsys_id)->cgroup; 573 } 574 575 /* A cgroup_iter should be treated as an opaque object */ 576 struct cgroup_iter { 577 struct list_head *cg_link; 578 struct list_head *task; 579 }; 580 581 /* 582 * To iterate across the tasks in a cgroup: 583 * 584 * 1) call cgroup_iter_start to initialize an iterator 585 * 586 * 2) call cgroup_iter_next() to retrieve member tasks until it 587 * returns NULL or until you want to end the iteration 588 * 589 * 3) call cgroup_iter_end() to destroy the iterator. 590 * 591 * Or, call cgroup_scan_tasks() to iterate through every task in a 592 * cgroup - cgroup_scan_tasks() holds the css_set_lock when calling 593 * the test_task() callback, but not while calling the process_task() 594 * callback. 595 */ 596 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it); 597 struct task_struct *cgroup_iter_next(struct cgroup *cgrp, 598 struct cgroup_iter *it); 599 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); 600 int cgroup_scan_tasks(struct cgroup_scanner *scan); 601 int cgroup_attach_task(struct cgroup *, struct task_struct *); 602 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); 603 604 /* 605 * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works 606 * if cgroup_subsys.use_id == true. It can be used for looking up and scanning. 607 * CSS ID is assigned at cgroup allocation (create) automatically 608 * and removed when subsys calls free_css_id() function. This is because 609 * the lifetime of cgroup_subsys_state is subsys's matter. 610 * 611 * Looking up and scanning function should be called under rcu_read_lock(). 612 * Taking cgroup_mutex is not necessary for following calls. 613 * But the css returned by this routine can be "not populated yet" or "being 614 * destroyed". The caller should check css and cgroup's status. 615 */ 616 617 /* 618 * Typically Called at ->destroy(), or somewhere the subsys frees 619 * cgroup_subsys_state. 620 */ 621 void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); 622 623 /* Find a cgroup_subsys_state which has given ID */ 624 625 struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id); 626 627 /* 628 * Get a cgroup whose id is greater than or equal to id under tree of root. 629 * Returning a cgroup_subsys_state or NULL. 630 */ 631 struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id, 632 struct cgroup_subsys_state *root, int *foundid); 633 634 /* Returns true if root is ancestor of cg */ 635 bool css_is_ancestor(struct cgroup_subsys_state *cg, 636 const struct cgroup_subsys_state *root); 637 638 /* Get id and depth of css */ 639 unsigned short css_id(struct cgroup_subsys_state *css); 640 unsigned short css_depth(struct cgroup_subsys_state *css); 641 struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id); 642 643 #else /* !CONFIG_CGROUPS */ 644 645 static inline int cgroup_init_early(void) { return 0; } 646 static inline int cgroup_init(void) { return 0; } 647 static inline void cgroup_fork(struct task_struct *p) {} 648 static inline void cgroup_fork_callbacks(struct task_struct *p) {} 649 static inline void cgroup_post_fork(struct task_struct *p) {} 650 static inline void cgroup_exit(struct task_struct *p, int callbacks) {} 651 652 static inline void cgroup_lock(void) {} 653 static inline void cgroup_unlock(void) {} 654 static inline int cgroupstats_build(struct cgroupstats *stats, 655 struct dentry *dentry) 656 { 657 return -EINVAL; 658 } 659 660 /* No cgroups - nothing to do */ 661 static inline int cgroup_attach_task_all(struct task_struct *from, 662 struct task_struct *t) 663 { 664 return 0; 665 } 666 667 #endif /* !CONFIG_CGROUPS */ 668 669 #endif /* _LINUX_CGROUP_H */ 670