1 /* 2 * taskstats.c - Export per-task statistics to userland 3 * 4 * Copyright (C) Shailabh Nagar, IBM Corp. 2006 5 * (C) Balbir Singh, IBM Corp. 2006 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation; either version 2 of the License, or 10 * (at your option) any later version. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU General Public License for more details. 16 * 17 */ 18 19 #include <linux/kernel.h> 20 #include <linux/taskstats_kern.h> 21 #include <linux/tsacct_kern.h> 22 #include <linux/delayacct.h> 23 #include <linux/cpumask.h> 24 #include <linux/percpu.h> 25 #include <linux/cgroupstats.h> 26 #include <linux/cgroup.h> 27 #include <linux/fs.h> 28 #include <linux/file.h> 29 #include <net/genetlink.h> 30 #include <asm/atomic.h> 31 32 /* 33 * Maximum length of a cpumask that can be specified in 34 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute 35 */ 36 #define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) 37 38 static DEFINE_PER_CPU(__u32, taskstats_seqnum); 39 static int family_registered; 40 struct kmem_cache *taskstats_cache; 41 42 static struct genl_family family = { 43 .id = GENL_ID_GENERATE, 44 .name = TASKSTATS_GENL_NAME, 45 .version = TASKSTATS_GENL_VERSION, 46 .maxattr = TASKSTATS_CMD_ATTR_MAX, 47 }; 48 49 static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { 50 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, 51 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, 52 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, 53 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; 54 55 static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = { 56 [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, 57 }; 58 59 struct listener { 60 struct list_head list; 61 pid_t pid; 62 char valid; 63 }; 64 65 struct listener_list { 66 struct rw_semaphore sem; 67 struct list_head list; 68 }; 69 static DEFINE_PER_CPU(struct listener_list, listener_array); 70 71 enum actions { 72 REGISTER, 73 DEREGISTER, 74 CPU_DONT_CARE 75 }; 76 77 static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, 78 size_t size) 79 { 80 struct sk_buff *skb; 81 void *reply; 82 83 /* 84 * If new attributes are added, please revisit this allocation 85 */ 86 skb = genlmsg_new(size, GFP_KERNEL); 87 if (!skb) 88 return -ENOMEM; 89 90 if (!info) { 91 int seq = get_cpu_var(taskstats_seqnum)++; 92 put_cpu_var(taskstats_seqnum); 93 94 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); 95 } else 96 reply = genlmsg_put_reply(skb, info, &family, 0, cmd); 97 if (reply == NULL) { 98 nlmsg_free(skb); 99 return -EINVAL; 100 } 101 102 *skbp = skb; 103 return 0; 104 } 105 106 /* 107 * Send taskstats data in @skb to listener with nl_pid @pid 108 */ 109 static int send_reply(struct sk_buff *skb, struct genl_info *info) 110 { 111 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 112 void *reply = genlmsg_data(genlhdr); 113 int rc; 114 115 rc = genlmsg_end(skb, reply); 116 if (rc < 0) { 117 nlmsg_free(skb); 118 return rc; 119 } 120 121 return genlmsg_reply(skb, info); 122 } 123 124 /* 125 * Send taskstats data in @skb to listeners registered for @cpu's exit data 126 */ 127 static void send_cpu_listeners(struct sk_buff *skb, 128 struct listener_list *listeners) 129 { 130 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 131 struct listener *s, *tmp; 132 struct sk_buff *skb_next, *skb_cur = skb; 133 void *reply = genlmsg_data(genlhdr); 134 int rc, delcount = 0; 135 136 rc = genlmsg_end(skb, reply); 137 if (rc < 0) { 138 nlmsg_free(skb); 139 return; 140 } 141 142 rc = 0; 143 down_read(&listeners->sem); 144 list_for_each_entry(s, &listeners->list, list) { 145 skb_next = NULL; 146 if (!list_is_last(&s->list, &listeners->list)) { 147 skb_next = skb_clone(skb_cur, GFP_KERNEL); 148 if (!skb_next) 149 break; 150 } 151 rc = genlmsg_unicast(&init_net, skb_cur, s->pid); 152 if (rc == -ECONNREFUSED) { 153 s->valid = 0; 154 delcount++; 155 } 156 skb_cur = skb_next; 157 } 158 up_read(&listeners->sem); 159 160 if (skb_cur) 161 nlmsg_free(skb_cur); 162 163 if (!delcount) 164 return; 165 166 /* Delete invalidated entries */ 167 down_write(&listeners->sem); 168 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 169 if (!s->valid) { 170 list_del(&s->list); 171 kfree(s); 172 } 173 } 174 up_write(&listeners->sem); 175 } 176 177 static int fill_pid(pid_t pid, struct task_struct *tsk, 178 struct taskstats *stats) 179 { 180 int rc = 0; 181 182 if (!tsk) { 183 rcu_read_lock(); 184 tsk = find_task_by_vpid(pid); 185 if (tsk) 186 get_task_struct(tsk); 187 rcu_read_unlock(); 188 if (!tsk) 189 return -ESRCH; 190 } else 191 get_task_struct(tsk); 192 193 memset(stats, 0, sizeof(*stats)); 194 /* 195 * Each accounting subsystem adds calls to its functions to 196 * fill in relevant parts of struct taskstsats as follows 197 * 198 * per-task-foo(stats, tsk); 199 */ 200 201 delayacct_add_tsk(stats, tsk); 202 203 /* fill in basic acct fields */ 204 stats->version = TASKSTATS_VERSION; 205 stats->nvcsw = tsk->nvcsw; 206 stats->nivcsw = tsk->nivcsw; 207 bacct_add_tsk(stats, tsk); 208 209 /* fill in extended acct fields */ 210 xacct_add_tsk(stats, tsk); 211 212 /* Define err: label here if needed */ 213 put_task_struct(tsk); 214 return rc; 215 216 } 217 218 static int fill_tgid(pid_t tgid, struct task_struct *first, 219 struct taskstats *stats) 220 { 221 struct task_struct *tsk; 222 unsigned long flags; 223 int rc = -ESRCH; 224 225 /* 226 * Add additional stats from live tasks except zombie thread group 227 * leaders who are already counted with the dead tasks 228 */ 229 rcu_read_lock(); 230 if (!first) 231 first = find_task_by_vpid(tgid); 232 233 if (!first || !lock_task_sighand(first, &flags)) 234 goto out; 235 236 if (first->signal->stats) 237 memcpy(stats, first->signal->stats, sizeof(*stats)); 238 else 239 memset(stats, 0, sizeof(*stats)); 240 241 tsk = first; 242 do { 243 if (tsk->exit_state) 244 continue; 245 /* 246 * Accounting subsystem can call its functions here to 247 * fill in relevant parts of struct taskstsats as follows 248 * 249 * per-task-foo(stats, tsk); 250 */ 251 delayacct_add_tsk(stats, tsk); 252 253 stats->nvcsw += tsk->nvcsw; 254 stats->nivcsw += tsk->nivcsw; 255 } while_each_thread(first, tsk); 256 257 unlock_task_sighand(first, &flags); 258 rc = 0; 259 out: 260 rcu_read_unlock(); 261 262 stats->version = TASKSTATS_VERSION; 263 /* 264 * Accounting subsystems can also add calls here to modify 265 * fields of taskstats. 266 */ 267 return rc; 268 } 269 270 271 static void fill_tgid_exit(struct task_struct *tsk) 272 { 273 unsigned long flags; 274 275 spin_lock_irqsave(&tsk->sighand->siglock, flags); 276 if (!tsk->signal->stats) 277 goto ret; 278 279 /* 280 * Each accounting subsystem calls its functions here to 281 * accumalate its per-task stats for tsk, into the per-tgid structure 282 * 283 * per-task-foo(tsk->signal->stats, tsk); 284 */ 285 delayacct_add_tsk(tsk->signal->stats, tsk); 286 ret: 287 spin_unlock_irqrestore(&tsk->sighand->siglock, flags); 288 return; 289 } 290 291 static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) 292 { 293 struct listener_list *listeners; 294 struct listener *s, *tmp; 295 unsigned int cpu; 296 297 if (!cpumask_subset(mask, cpu_possible_mask)) 298 return -EINVAL; 299 300 if (isadd == REGISTER) { 301 for_each_cpu(cpu, mask) { 302 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, 303 cpu_to_node(cpu)); 304 if (!s) 305 goto cleanup; 306 s->pid = pid; 307 INIT_LIST_HEAD(&s->list); 308 s->valid = 1; 309 310 listeners = &per_cpu(listener_array, cpu); 311 down_write(&listeners->sem); 312 list_add(&s->list, &listeners->list); 313 up_write(&listeners->sem); 314 } 315 return 0; 316 } 317 318 /* Deregister or cleanup */ 319 cleanup: 320 for_each_cpu(cpu, mask) { 321 listeners = &per_cpu(listener_array, cpu); 322 down_write(&listeners->sem); 323 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 324 if (s->pid == pid) { 325 list_del(&s->list); 326 kfree(s); 327 break; 328 } 329 } 330 up_write(&listeners->sem); 331 } 332 return 0; 333 } 334 335 static int parse(struct nlattr *na, struct cpumask *mask) 336 { 337 char *data; 338 int len; 339 int ret; 340 341 if (na == NULL) 342 return 1; 343 len = nla_len(na); 344 if (len > TASKSTATS_CPUMASK_MAXLEN) 345 return -E2BIG; 346 if (len < 1) 347 return -EINVAL; 348 data = kmalloc(len, GFP_KERNEL); 349 if (!data) 350 return -ENOMEM; 351 nla_strlcpy(data, na, len); 352 ret = cpulist_parse(data, mask); 353 kfree(data); 354 return ret; 355 } 356 357 static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) 358 { 359 struct nlattr *na, *ret; 360 int aggr; 361 362 aggr = (type == TASKSTATS_TYPE_PID) 363 ? TASKSTATS_TYPE_AGGR_PID 364 : TASKSTATS_TYPE_AGGR_TGID; 365 366 na = nla_nest_start(skb, aggr); 367 if (!na) 368 goto err; 369 if (nla_put(skb, type, sizeof(pid), &pid) < 0) 370 goto err; 371 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); 372 if (!ret) 373 goto err; 374 nla_nest_end(skb, na); 375 376 return nla_data(ret); 377 err: 378 return NULL; 379 } 380 381 static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 382 { 383 int rc = 0; 384 struct sk_buff *rep_skb; 385 struct cgroupstats *stats; 386 struct nlattr *na; 387 size_t size; 388 u32 fd; 389 struct file *file; 390 int fput_needed; 391 392 na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; 393 if (!na) 394 return -EINVAL; 395 396 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); 397 file = fget_light(fd, &fput_needed); 398 if (!file) 399 return 0; 400 401 size = nla_total_size(sizeof(struct cgroupstats)); 402 403 rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb, 404 size); 405 if (rc < 0) 406 goto err; 407 408 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, 409 sizeof(struct cgroupstats)); 410 stats = nla_data(na); 411 memset(stats, 0, sizeof(*stats)); 412 413 rc = cgroupstats_build(stats, file->f_dentry); 414 if (rc < 0) { 415 nlmsg_free(rep_skb); 416 goto err; 417 } 418 419 rc = send_reply(rep_skb, info); 420 421 err: 422 fput_light(file, fput_needed); 423 return rc; 424 } 425 426 static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 427 { 428 int rc; 429 struct sk_buff *rep_skb; 430 struct taskstats *stats; 431 size_t size; 432 cpumask_var_t mask; 433 434 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 435 return -ENOMEM; 436 437 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); 438 if (rc < 0) 439 goto free_return_rc; 440 if (rc == 0) { 441 rc = add_del_listener(info->snd_pid, mask, REGISTER); 442 goto free_return_rc; 443 } 444 445 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); 446 if (rc < 0) 447 goto free_return_rc; 448 if (rc == 0) { 449 rc = add_del_listener(info->snd_pid, mask, DEREGISTER); 450 free_return_rc: 451 free_cpumask_var(mask); 452 return rc; 453 } 454 free_cpumask_var(mask); 455 456 /* 457 * Size includes space for nested attributes 458 */ 459 size = nla_total_size(sizeof(u32)) + 460 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 461 462 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); 463 if (rc < 0) 464 return rc; 465 466 rc = -EINVAL; 467 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { 468 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 469 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); 470 if (!stats) 471 goto err; 472 473 rc = fill_pid(pid, NULL, stats); 474 if (rc < 0) 475 goto err; 476 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { 477 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 478 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); 479 if (!stats) 480 goto err; 481 482 rc = fill_tgid(tgid, NULL, stats); 483 if (rc < 0) 484 goto err; 485 } else 486 goto err; 487 488 return send_reply(rep_skb, info); 489 err: 490 nlmsg_free(rep_skb); 491 return rc; 492 } 493 494 static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) 495 { 496 struct signal_struct *sig = tsk->signal; 497 struct taskstats *stats; 498 499 if (sig->stats || thread_group_empty(tsk)) 500 goto ret; 501 502 /* No problem if kmem_cache_zalloc() fails */ 503 stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); 504 505 spin_lock_irq(&tsk->sighand->siglock); 506 if (!sig->stats) { 507 sig->stats = stats; 508 stats = NULL; 509 } 510 spin_unlock_irq(&tsk->sighand->siglock); 511 512 if (stats) 513 kmem_cache_free(taskstats_cache, stats); 514 ret: 515 return sig->stats; 516 } 517 518 /* Send pid data out on exit */ 519 void taskstats_exit(struct task_struct *tsk, int group_dead) 520 { 521 int rc; 522 struct listener_list *listeners; 523 struct taskstats *stats; 524 struct sk_buff *rep_skb; 525 size_t size; 526 int is_thread_group; 527 528 if (!family_registered) 529 return; 530 531 /* 532 * Size includes space for nested attributes 533 */ 534 size = nla_total_size(sizeof(u32)) + 535 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 536 537 is_thread_group = !!taskstats_tgid_alloc(tsk); 538 if (is_thread_group) { 539 /* PID + STATS + TGID + STATS */ 540 size = 2 * size; 541 /* fill the tsk->signal->stats structure */ 542 fill_tgid_exit(tsk); 543 } 544 545 listeners = &__raw_get_cpu_var(listener_array); 546 if (list_empty(&listeners->list)) 547 return; 548 549 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size); 550 if (rc < 0) 551 return; 552 553 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid); 554 if (!stats) 555 goto err; 556 557 rc = fill_pid(-1, tsk, stats); 558 if (rc < 0) 559 goto err; 560 561 /* 562 * Doesn't matter if tsk is the leader or the last group member leaving 563 */ 564 if (!is_thread_group || !group_dead) 565 goto send; 566 567 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid); 568 if (!stats) 569 goto err; 570 571 memcpy(stats, tsk->signal->stats, sizeof(*stats)); 572 573 send: 574 send_cpu_listeners(rep_skb, listeners); 575 return; 576 err: 577 nlmsg_free(rep_skb); 578 } 579 580 static struct genl_ops taskstats_ops = { 581 .cmd = TASKSTATS_CMD_GET, 582 .doit = taskstats_user_cmd, 583 .policy = taskstats_cmd_get_policy, 584 }; 585 586 static struct genl_ops cgroupstats_ops = { 587 .cmd = CGROUPSTATS_CMD_GET, 588 .doit = cgroupstats_user_cmd, 589 .policy = cgroupstats_cmd_get_policy, 590 }; 591 592 /* Needed early in initialization */ 593 void __init taskstats_init_early(void) 594 { 595 unsigned int i; 596 597 taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC); 598 for_each_possible_cpu(i) { 599 INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); 600 init_rwsem(&(per_cpu(listener_array, i).sem)); 601 } 602 } 603 604 static int __init taskstats_init(void) 605 { 606 int rc; 607 608 rc = genl_register_family(&family); 609 if (rc) 610 return rc; 611 612 rc = genl_register_ops(&family, &taskstats_ops); 613 if (rc < 0) 614 goto err; 615 616 rc = genl_register_ops(&family, &cgroupstats_ops); 617 if (rc < 0) 618 goto err_cgroup_ops; 619 620 family_registered = 1; 621 printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); 622 return 0; 623 err_cgroup_ops: 624 genl_unregister_ops(&family, &taskstats_ops); 625 err: 626 genl_unregister_family(&family); 627 return rc; 628 } 629 630 /* 631 * late initcall ensures initialization of statistics collection 632 * mechanisms precedes initialization of the taskstats interface 633 */ 634 late_initcall(taskstats_init); 635