/*
 * thread-stack.c: Synthesize a thread's stack using call / return events
 * Copyright (c) 2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 */

#include <linux/rbtree.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <errno.h>
#include "thread.h"
#include "event.h"
#include "machine.h"
#include "env.h"
#include "util.h"
#include "debug.h"
#include "symbol.h"
#include "comm.h"
#include "call-path.h"
#include "thread-stack.h"

#define STACK_GROWTH 2048

/*
 * State of retpoline detection.
 *
 * RETPOLINE_NONE: no retpoline detection
 * X86_RETPOLINE_POSSIBLE: x86 retpoline possible
 * X86_RETPOLINE_DETECTED: x86 retpoline detected
 */
enum retpoline_state_t {
	RETPOLINE_NONE,
	X86_RETPOLINE_POSSIBLE,
	X86_RETPOLINE_DETECTED,
};

/**
 * struct thread_stack_entry - thread stack entry.
 * @ret_addr: return address
 * @timestamp: timestamp (if known)
 * @ref: external reference (e.g. db_id of sample)
 * @branch_count: the branch count when the entry was created
 * @insn_count: the instruction count when the entry was created
 * @cyc_count: the cycle count when the entry was created
 * @db_id: id used for db-export
 * @cp: call path
 * @no_call: a 'call' was not seen
 * @trace_end: a 'call' but trace ended
 * @non_call: a branch but not a 'call' to the start of a different symbol
 */
struct thread_stack_entry {
	u64 ret_addr;
	u64 timestamp;
	u64 ref;
	u64 branch_count;
	u64 insn_count;
	u64 cyc_count;
	u64 db_id;
	struct call_path *cp;
	bool no_call;
	bool trace_end;
	bool non_call;
};

/**
 * struct thread_stack - thread stack constructed from 'call' and 'return'
 *                       branch samples.
 * @stack: array that holds the stack
 * @cnt: number of entries in the stack
 * @sz: current maximum stack size
 * @trace_nr: current trace number
 * @branch_count: running branch count
 * @insn_count: running instruction count
 * @cyc_count: running cycle count
 * @kernel_start: kernel start address
 * @last_time: last timestamp
 * @crp: call/return processor
 * @comm: current comm
 * @arr_sz: size of array if this is the first element of an array
 * @rstate: used to detect retpolines
 */
struct thread_stack {
	struct thread_stack_entry *stack;
	size_t cnt;
	size_t sz;
	u64 trace_nr;
	u64 branch_count;
	u64 insn_count;
	u64 cyc_count;
	u64 kernel_start;
	u64 last_time;
	struct call_return_processor *crp;
	struct comm *comm;
	unsigned int arr_sz;
	enum retpoline_state_t rstate;
};

/*
 * Assume pid == tid == 0 identifies the idle task as defined by
 * perf_session__register_idle_thread(). The idle task is really 1 task per
 * cpu, and therefore requires a stack for each cpu.
 */
static inline bool thread_stack__per_cpu(struct thread *thread)
{
	return !(thread->tid || thread->pid_);
}
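
/*
 * Illustrative sketch (hypothetical variable names): for the idle task,
 * thread->ts points at an array of arr_sz stacks, and thread__stack() below
 * selects the per-cpu element, e.g.
 *
 *	struct thread_stack *ts = thread__stack(idle_thread, 3);
 *
 * yields &idle_thread->ts[3] once thread_stack__new() has grown the array to
 * cover cpu 3, whereas for ordinary threads the cpu argument is ignored and
 * thread->ts itself is used.
 */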

static int thread_stack__grow(struct thread_stack *ts)
{
	struct thread_stack_entry *new_stack;
	size_t sz, new_sz;

	new_sz = ts->sz + STACK_GROWTH;
	sz = new_sz * sizeof(struct thread_stack_entry);

	new_stack = realloc(ts->stack, sz);
	if (!new_stack)
		return -ENOMEM;

	ts->stack = new_stack;
	ts->sz = new_sz;

	return 0;
}

static int thread_stack__init(struct thread_stack *ts, struct thread *thread,
			      struct call_return_processor *crp)
{
	int err;

	err = thread_stack__grow(ts);
	if (err)
		return err;

	if (thread->mg && thread->mg->machine) {
		struct machine *machine = thread->mg->machine;
		const char *arch = perf_env__arch(machine->env);

		ts->kernel_start = machine__kernel_start(machine);
		if (!strcmp(arch, "x86"))
			ts->rstate = X86_RETPOLINE_POSSIBLE;
	} else {
		ts->kernel_start = 1ULL << 63;
	}
	ts->crp = crp;

	return 0;
}

static struct thread_stack *thread_stack__new(struct thread *thread, int cpu,
					      struct call_return_processor *crp)
{
	struct thread_stack *ts = thread->ts, *new_ts;
	unsigned int old_sz = ts ? ts->arr_sz : 0;
	unsigned int new_sz = 1;

	if (thread_stack__per_cpu(thread) && cpu > 0)
		new_sz = roundup_pow_of_two(cpu + 1);

	if (!ts || new_sz > old_sz) {
		new_ts = calloc(new_sz, sizeof(*ts));
		if (!new_ts)
			return NULL;
		if (ts)
			memcpy(new_ts, ts, old_sz * sizeof(*ts));
		new_ts->arr_sz = new_sz;
		zfree(&thread->ts);
		thread->ts = new_ts;
		ts = new_ts;
	}

	if (thread_stack__per_cpu(thread) && cpu > 0 &&
	    (unsigned int)cpu < ts->arr_sz)
		ts += cpu;

	if (!ts->stack &&
	    thread_stack__init(ts, thread, crp))
		return NULL;

	return ts;
}

static struct thread_stack *thread__cpu_stack(struct thread *thread, int cpu)
{
	struct thread_stack *ts = thread->ts;

	if (cpu < 0)
		cpu = 0;

	if (!ts || (unsigned int)cpu >= ts->arr_sz)
		return NULL;

	ts += cpu;

	if (!ts->stack)
		return NULL;

	return ts;
}

static inline struct thread_stack *thread__stack(struct thread *thread,
						 int cpu)
{
	if (!thread)
		return NULL;

	if (thread_stack__per_cpu(thread))
		return thread__cpu_stack(thread, cpu);

	return thread->ts;
}

static int thread_stack__push(struct thread_stack *ts, u64 ret_addr,
			      bool trace_end)
{
	int err = 0;

	if (ts->cnt == ts->sz) {
		err = thread_stack__grow(ts);
		if (err) {
			pr_warning("Out of memory: discarding thread stack\n");
			ts->cnt = 0;
		}
	}

	ts->stack[ts->cnt].trace_end = trace_end;
	ts->stack[ts->cnt++].ret_addr = ret_addr;

	return err;
}

static void thread_stack__pop(struct thread_stack *ts, u64 ret_addr)
{
	size_t i;

	/*
	 * In some cases there may be functions which are not seen to return.
	 * For example when setjmp / longjmp has been used. Or the perf context
	 * switch in the kernel which doesn't stop and start tracing in exactly
	 * the same code path. When that happens the return address will be
	 * further down the stack. If the return address is not found at all,
	 * we assume the opposite (i.e. this is a return for a call that wasn't
	 * seen for some reason) and leave the stack alone.
	 */
	for (i = ts->cnt; i; ) {
		if (ts->stack[--i].ret_addr == ret_addr) {
			ts->cnt = i;
			return;
		}
	}
}
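
/*
 * Illustrative example of the pop semantics above (hypothetical addresses):
 * with return addresses { 0x100, 0x200, 0x300 } on the stack (cnt == 3),
 * thread_stack__pop(ts, 0x200) pops down to cnt == 1, dropping both the
 * 0x300 and 0x200 entries, whereas thread_stack__pop(ts, 0x999) finds no
 * match and leaves the stack untouched.
 */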

static void thread_stack__pop_trace_end(struct thread_stack *ts)
{
	size_t i;

	for (i = ts->cnt; i; ) {
		if (ts->stack[--i].trace_end)
			ts->cnt = i;
		else
			return;
	}
}

static bool thread_stack__in_kernel(struct thread_stack *ts)
{
	if (!ts->cnt)
		return false;

	return ts->stack[ts->cnt - 1].cp->in_kernel;
}

static int thread_stack__call_return(struct thread *thread,
				     struct thread_stack *ts, size_t idx,
				     u64 timestamp, u64 ref, bool no_return)
{
	struct call_return_processor *crp = ts->crp;
	struct thread_stack_entry *tse;
	struct call_return cr = {
		.thread = thread,
		.comm = ts->comm,
		.db_id = 0,
	};
	u64 *parent_db_id;

	tse = &ts->stack[idx];
	cr.cp = tse->cp;
	cr.call_time = tse->timestamp;
	cr.return_time = timestamp;
	cr.branch_count = ts->branch_count - tse->branch_count;
	cr.insn_count = ts->insn_count - tse->insn_count;
	cr.cyc_count = ts->cyc_count - tse->cyc_count;
	cr.db_id = tse->db_id;
	cr.call_ref = tse->ref;
	cr.return_ref = ref;
	if (tse->no_call)
		cr.flags |= CALL_RETURN_NO_CALL;
	if (no_return)
		cr.flags |= CALL_RETURN_NO_RETURN;
	if (tse->non_call)
		cr.flags |= CALL_RETURN_NON_CALL;

	/*
	 * The parent db_id must be assigned before exporting the child. Note
	 * it is not possible to export the parent first because its
	 * information is not complete until its 'return' has been processed.
	 */
	parent_db_id = idx ? &(tse - 1)->db_id : NULL;

	return crp->process(&cr, parent_db_id, crp->data);
}

static int __thread_stack__flush(struct thread *thread, struct thread_stack *ts)
{
	struct call_return_processor *crp = ts->crp;
	int err;

	if (!crp) {
		ts->cnt = 0;
		return 0;
	}

	while (ts->cnt) {
		err = thread_stack__call_return(thread, ts, --ts->cnt,
						ts->last_time, 0, true);
		if (err) {
			pr_err("Error flushing thread stack!\n");
			ts->cnt = 0;
			return err;
		}
	}

	return 0;
}

int thread_stack__flush(struct thread *thread)
{
	struct thread_stack *ts = thread->ts;
	unsigned int pos;
	int err = 0;

	if (ts) {
		for (pos = 0; pos < ts->arr_sz; pos++) {
			int ret = __thread_stack__flush(thread, ts + pos);

			if (ret)
				err = ret;
		}
	}

	return err;
}

int thread_stack__event(struct thread *thread, int cpu, u32 flags, u64 from_ip,
			u64 to_ip, u16 insn_len, u64 trace_nr)
{
	struct thread_stack *ts = thread__stack(thread, cpu);

	if (!thread)
		return -EINVAL;

	if (!ts) {
		ts = thread_stack__new(thread, cpu, NULL);
		if (!ts) {
			pr_warning("Out of memory: no thread stack\n");
			return -ENOMEM;
		}
		ts->trace_nr = trace_nr;
	}

	/*
	 * When the trace is discontinuous, the trace_nr changes. In that case
	 * the stack might be completely invalid. Better to report nothing than
	 * to report something misleading, so flush the stack.
	 */
	if (trace_nr != ts->trace_nr) {
		if (ts->trace_nr)
			__thread_stack__flush(thread, ts);
		ts->trace_nr = trace_nr;
	}

	/* Stop here if thread_stack__process() is in use */
	if (ts->crp)
		return 0;

	if (flags & PERF_IP_FLAG_CALL) {
		u64 ret_addr;

		if (!to_ip)
			return 0;
		ret_addr = from_ip + insn_len;
		if (ret_addr == to_ip)
			return 0; /* Zero-length calls are excluded */
		return thread_stack__push(ts, ret_addr,
					  flags & PERF_IP_FLAG_TRACE_END);
	} else if (flags & PERF_IP_FLAG_TRACE_BEGIN) {
		/*
		 * If the caller did not change the trace number (which would
		 * have flushed the stack) then try to make sense of the stack.
		 * Possibly, tracing began after returning to the current
		 * address, so try to pop that. Also, do not expect a call made
		 * when the trace ended to return, so pop that.
		 */
		thread_stack__pop(ts, to_ip);
		thread_stack__pop_trace_end(ts);
	} else if ((flags & PERF_IP_FLAG_RETURN) && from_ip) {
		thread_stack__pop(ts, to_ip);
	}

	return 0;
}
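
/*
 * Usage sketch (illustrative; the exact call site depends on the caller,
 * typically an instruction-trace decoder): a caller walking branch records
 * would invoke thread_stack__event() once per branch, e.g.
 *
 *	err = thread_stack__event(thread, sample->cpu, sample->flags,
 *				  sample->ip, sample->addr,
 *				  sample->insn_len, trace_nr);
 *
 * where trace_nr is incremented whenever the trace becomes discontinuous, so
 * that the code above can flush a stack that may no longer be valid.
 */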

void thread_stack__set_trace_nr(struct thread *thread, int cpu, u64 trace_nr)
{
	struct thread_stack *ts = thread__stack(thread, cpu);

	if (!ts)
		return;

	if (trace_nr != ts->trace_nr) {
		if (ts->trace_nr)
			__thread_stack__flush(thread, ts);
		ts->trace_nr = trace_nr;
	}
}

static void __thread_stack__free(struct thread *thread, struct thread_stack *ts)
{
	__thread_stack__flush(thread, ts);
	zfree(&ts->stack);
}

static void thread_stack__reset(struct thread *thread, struct thread_stack *ts)
{
	unsigned int arr_sz = ts->arr_sz;

	__thread_stack__free(thread, ts);
	memset(ts, 0, sizeof(*ts));
	ts->arr_sz = arr_sz;
}

void thread_stack__free(struct thread *thread)
{
	struct thread_stack *ts = thread->ts;
	unsigned int pos;

	if (ts) {
		for (pos = 0; pos < ts->arr_sz; pos++)
			__thread_stack__free(thread, ts + pos);
		zfree(&thread->ts);
	}
}

static inline u64 callchain_context(u64 ip, u64 kernel_start)
{
	return ip < kernel_start ? PERF_CONTEXT_USER : PERF_CONTEXT_KERNEL;
}

void thread_stack__sample(struct thread *thread, int cpu,
			  struct ip_callchain *chain,
			  size_t sz, u64 ip, u64 kernel_start)
{
	struct thread_stack *ts = thread__stack(thread, cpu);
	u64 context = callchain_context(ip, kernel_start);
	u64 last_context;
	size_t i, j;

	if (sz < 2) {
		chain->nr = 0;
		return;
	}

	chain->ips[0] = context;
	chain->ips[1] = ip;

	if (!ts) {
		chain->nr = 2;
		return;
	}

	last_context = context;

	for (i = 2, j = 1; i < sz && j <= ts->cnt; i++, j++) {
		ip = ts->stack[ts->cnt - j].ret_addr;
		context = callchain_context(ip, kernel_start);
		if (context != last_context) {
			if (i >= sz - 1)
				break;
			chain->ips[i++] = context;
			last_context = context;
		}
		chain->ips[i] = ip;
	}

	chain->nr = i;
}
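
/*
 * Illustrative example of the synthesized callchain layout (hypothetical
 * addresses, and assuming sz is large enough): sampling at a kernel ip with
 * stacked return addresses { user_ret, kern_ret } (bottom to top) produces:
 *
 *	ips[0] = PERF_CONTEXT_KERNEL
 *	ips[1] = ip
 *	ips[2] = kern_ret
 *	ips[3] = PERF_CONTEXT_USER
 *	ips[4] = user_ret
 *
 * i.e. entries are emitted from the top of the stack downwards, with a new
 * context marker inserted whenever the user/kernel boundary is crossed.
 */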

struct call_return_processor *
call_return_processor__new(int (*process)(struct call_return *cr, u64 *parent_db_id, void *data),
			   void *data)
{
	struct call_return_processor *crp;

	crp = zalloc(sizeof(struct call_return_processor));
	if (!crp)
		return NULL;
	crp->cpr = call_path_root__new();
	if (!crp->cpr)
		goto out_free;
	crp->process = process;
	crp->data = data;
	return crp;

out_free:
	free(crp);
	return NULL;
}

void call_return_processor__free(struct call_return_processor *crp)
{
	if (crp) {
		call_path_root__free(crp->cpr);
		free(crp);
	}
}
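
/*
 * Typical lifecycle sketch (illustrative; process_cb and cb_data are
 * hypothetical caller-side names): a consumer that wants call/return events
 * rather than a raw stack creates a processor, passes it to
 * thread_stack__process() for each branch sample, and frees it when done:
 *
 *	crp = call_return_processor__new(process_cb, cb_data);
 *	...
 *	err = thread_stack__process(thread, comm, sample, from_al, to_al,
 *				    ref, crp);
 *	...
 *	call_return_processor__free(crp);
 */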

static int thread_stack__push_cp(struct thread_stack *ts, u64 ret_addr,
				 u64 timestamp, u64 ref, struct call_path *cp,
				 bool no_call, bool trace_end)
{
	struct thread_stack_entry *tse;
	int err;

	if (!cp)
		return -ENOMEM;

	if (ts->cnt == ts->sz) {
		err = thread_stack__grow(ts);
		if (err)
			return err;
	}

	tse = &ts->stack[ts->cnt++];
	tse->ret_addr = ret_addr;
	tse->timestamp = timestamp;
	tse->ref = ref;
	tse->branch_count = ts->branch_count;
	tse->insn_count = ts->insn_count;
	tse->cyc_count = ts->cyc_count;
	tse->cp = cp;
	tse->no_call = no_call;
	tse->trace_end = trace_end;
	tse->non_call = false;
	tse->db_id = 0;

	return 0;
}

static int thread_stack__pop_cp(struct thread *thread, struct thread_stack *ts,
				u64 ret_addr, u64 timestamp, u64 ref,
				struct symbol *sym)
{
	int err;

	if (!ts->cnt)
		return 1;

	if (ts->cnt == 1) {
		struct thread_stack_entry *tse = &ts->stack[0];

		if (tse->cp->sym == sym)
			return thread_stack__call_return(thread, ts, --ts->cnt,
							 timestamp, ref, false);
	}

	if (ts->stack[ts->cnt - 1].ret_addr == ret_addr &&
	    !ts->stack[ts->cnt - 1].non_call) {
		return thread_stack__call_return(thread, ts, --ts->cnt,
						 timestamp, ref, false);
	} else {
		size_t i = ts->cnt - 1;

		while (i--) {
			if (ts->stack[i].ret_addr != ret_addr ||
			    ts->stack[i].non_call)
				continue;
			i += 1;
			while (ts->cnt > i) {
				err = thread_stack__call_return(thread, ts,
								--ts->cnt,
								timestamp, ref,
								true);
				if (err)
					return err;
			}
			return thread_stack__call_return(thread, ts, --ts->cnt,
							 timestamp, ref, false);
		}
	}

	return 1;
}

static int thread_stack__bottom(struct thread_stack *ts,
				struct perf_sample *sample,
				struct addr_location *from_al,
				struct addr_location *to_al, u64 ref)
{
	struct call_path_root *cpr = ts->crp->cpr;
	struct call_path *cp;
	struct symbol *sym;
	u64 ip;

	if (sample->ip) {
		ip = sample->ip;
		sym = from_al->sym;
	} else if (sample->addr) {
		ip = sample->addr;
		sym = to_al->sym;
	} else {
		return 0;
	}

	cp = call_path__findnew(cpr, &cpr->call_path, sym, ip,
				ts->kernel_start);

	return thread_stack__push_cp(ts, ip, sample->time, ref, cp,
				     true, false);
}

static int thread_stack__no_call_return(struct thread *thread,
					struct thread_stack *ts,
					struct perf_sample *sample,
					struct addr_location *from_al,
					struct addr_location *to_al, u64 ref)
{
	struct call_path_root *cpr = ts->crp->cpr;
	struct call_path *root = &cpr->call_path;
	struct symbol *fsym = from_al->sym;
	struct symbol *tsym = to_al->sym;
	struct call_path *cp, *parent;
	u64 ks = ts->kernel_start;
	u64 addr = sample->addr;
	u64 tm = sample->time;
	u64 ip = sample->ip;
	int err;

	if (ip >= ks && addr < ks) {
		/* Return to userspace, so pop all kernel addresses */
		while (thread_stack__in_kernel(ts)) {
			err = thread_stack__call_return(thread, ts, --ts->cnt,
							tm, ref, true);
			if (err)
				return err;
		}

		/* If the stack is empty, push the userspace address */
		if (!ts->cnt) {
			cp = call_path__findnew(cpr, root, tsym, addr, ks);
			return thread_stack__push_cp(ts, 0, tm, ref, cp, true,
						     false);
		}
	} else if (thread_stack__in_kernel(ts) && ip < ks) {
		/* Return to userspace, so pop all kernel addresses */
		while (thread_stack__in_kernel(ts)) {
			err = thread_stack__call_return(thread, ts, --ts->cnt,
							tm, ref, true);
			if (err)
				return err;
		}
	}

	if (ts->cnt)
		parent = ts->stack[ts->cnt - 1].cp;
	else
		parent = root;

	if (parent->sym == from_al->sym) {
		/*
		 * At the bottom of the stack, assume the missing 'call' was
		 * before the trace started. So, pop the current symbol and push
		 * the 'to' symbol.
		 */
		if (ts->cnt == 1) {
			err = thread_stack__call_return(thread, ts, --ts->cnt,
							tm, ref, false);
			if (err)
				return err;
		}

		if (!ts->cnt) {
			cp = call_path__findnew(cpr, root, tsym, addr, ks);

			return thread_stack__push_cp(ts, addr, tm, ref, cp,
						     true, false);
		}

		/*
		 * Otherwise assume the 'return' is being used as a jump (e.g.
		 * retpoline) and just push the 'to' symbol.
		 */
		cp = call_path__findnew(cpr, parent, tsym, addr, ks);

		err = thread_stack__push_cp(ts, 0, tm, ref, cp, true, false);
		if (!err)
			ts->stack[ts->cnt - 1].non_call = true;

		return err;
	}

	/*
	 * Assume 'parent' has not yet returned, so push 'to', and then push and
	 * pop 'from'.
	 */

	cp = call_path__findnew(cpr, parent, tsym, addr, ks);

	err = thread_stack__push_cp(ts, addr, tm, ref, cp, true, false);
	if (err)
		return err;

	cp = call_path__findnew(cpr, cp, fsym, ip, ks);

	err = thread_stack__push_cp(ts, ip, tm, ref, cp, true, false);
	if (err)
		return err;

	return thread_stack__call_return(thread, ts, --ts->cnt, tm, ref, false);
}
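
/*
 * Illustrative example of the final case above (hypothetical symbols): if the
 * stack top is P and a 'return' from F to T is seen with no matching frame,
 * the call paths P -> T and P -> T -> F are created, and F is immediately
 * 'returned' so that only the T frame remains on the stack.
 */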

static int thread_stack__trace_begin(struct thread *thread,
				     struct thread_stack *ts, u64 timestamp,
				     u64 ref)
{
	struct thread_stack_entry *tse;
	int err;

	if (!ts->cnt)
		return 0;

	/* Pop trace end */
	tse = &ts->stack[ts->cnt - 1];
	if (tse->trace_end) {
		err = thread_stack__call_return(thread, ts, --ts->cnt,
						timestamp, ref, false);
		if (err)
			return err;
	}

	return 0;
}

static int thread_stack__trace_end(struct thread_stack *ts,
				   struct perf_sample *sample, u64 ref)
{
	struct call_path_root *cpr = ts->crp->cpr;
	struct call_path *cp;
	u64 ret_addr;

	/* No point having 'trace end' on the bottom of the stack */
	if (!ts->cnt || (ts->cnt == 1 && ts->stack[0].ref == ref))
		return 0;

	cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp, NULL, 0,
				ts->kernel_start);

	ret_addr = sample->ip + sample->insn_len;

	return thread_stack__push_cp(ts, ret_addr, sample->time, ref, cp,
				     false, true);
}

static bool is_x86_retpoline(const char *name)
{
	const char *p = strstr(name, "__x86_indirect_thunk_");

	return p == name || !strcmp(name, "__indirect_thunk_start");
}

/*
 * x86 retpoline functions pollute the call graph. This function removes them.
 * This does not handle function return thunks, nor is there any improvement
 * for the handling of inline thunks or extern thunks.
 */
static int thread_stack__x86_retpoline(struct thread_stack *ts,
				       struct perf_sample *sample,
				       struct addr_location *to_al)
{
	struct thread_stack_entry *tse = &ts->stack[ts->cnt - 1];
	struct call_path_root *cpr = ts->crp->cpr;
	struct symbol *sym = tse->cp->sym;
	struct symbol *tsym = to_al->sym;
	struct call_path *cp;

	if (sym && is_x86_retpoline(sym->name)) {
		/*
		 * This is an x86 retpoline function. It pollutes the call
		 * graph by showing up everywhere there is an indirect branch,
		 * but does not itself mean anything. Here the top-of-stack is
		 * removed by decrementing the stack count, and then further
		 * down, the resulting top-of-stack is replaced with the actual
		 * target. The result is that the retpoline functions will no
		 * longer appear in the call graph. Note this only affects the
		 * call graph, since all the original branches are left
		 * unchanged.
		 */
		ts->cnt -= 1;
		sym = ts->stack[ts->cnt - 2].cp->sym;
		if (sym && sym == tsym && to_al->addr != tsym->start) {
			/*
			 * Target is back to the middle of the symbol we came
			 * from so assume it is an indirect jmp and forget it
			 * altogether.
			 */
			ts->cnt -= 1;
			return 0;
		}
	} else if (sym && sym == tsym) {
		/*
		 * Target is back to the symbol we came from so assume it is an
		 * indirect jmp and forget it altogether.
		 */
		ts->cnt -= 1;
		return 0;
	}

	cp = call_path__findnew(cpr, ts->stack[ts->cnt - 2].cp, tsym,
				sample->addr, ts->kernel_start);
	if (!cp)
		return -ENOMEM;

	/* Replace the top-of-stack with the actual target */
	ts->stack[ts->cnt - 1].cp = cp;

	return 0;
}
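
/*
 * Illustrative example (hypothetical symbols): an indirect call in main()
 * that goes through __x86_indirect_thunk_rax and finally lands in foo()
 * ends up recorded as the call path main -> foo; the thunk's frames are
 * dropped or rewritten above, so the thunk never appears in the exported
 * call graph even though the original branches are left unchanged.
 */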

int thread_stack__process(struct thread *thread, struct comm *comm,
			  struct perf_sample *sample,
			  struct addr_location *from_al,
			  struct addr_location *to_al, u64 ref,
			  struct call_return_processor *crp)
{
	struct thread_stack *ts = thread__stack(thread, sample->cpu);
	enum retpoline_state_t rstate;
	int err = 0;

	if (ts && !ts->crp) {
		/* Supersede thread_stack__event() */
		thread_stack__reset(thread, ts);
		ts = NULL;
	}

	if (!ts) {
		ts = thread_stack__new(thread, sample->cpu, crp);
		if (!ts)
			return -ENOMEM;
		ts->comm = comm;
	}

	rstate = ts->rstate;
	if (rstate == X86_RETPOLINE_DETECTED)
		ts->rstate = X86_RETPOLINE_POSSIBLE;

	/* Flush stack on exec */
	if (ts->comm != comm && thread->pid_ == thread->tid) {
		err = __thread_stack__flush(thread, ts);
		if (err)
			return err;
		ts->comm = comm;
	}

	/* If the stack is empty, put the current symbol on the stack */
	if (!ts->cnt) {
		err = thread_stack__bottom(ts, sample, from_al, to_al, ref);
		if (err)
			return err;
	}

	ts->branch_count += 1;
	ts->insn_count += sample->insn_cnt;
	ts->cyc_count += sample->cyc_cnt;
	ts->last_time = sample->time;

	if (sample->flags & PERF_IP_FLAG_CALL) {
		bool trace_end = sample->flags & PERF_IP_FLAG_TRACE_END;
		struct call_path_root *cpr = ts->crp->cpr;
		struct call_path *cp;
		u64 ret_addr;

		if (!sample->ip || !sample->addr)
			return 0;

		ret_addr = sample->ip + sample->insn_len;
		if (ret_addr == sample->addr)
			return 0; /* Zero-length calls are excluded */

		cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp,
					to_al->sym, sample->addr,
					ts->kernel_start);
		err = thread_stack__push_cp(ts, ret_addr, sample->time, ref,
					    cp, false, trace_end);

		/*
		 * A call to the same symbol, but not to the start of the
		 * symbol, may be the start of an x86 retpoline.
		 */
		if (!err && rstate == X86_RETPOLINE_POSSIBLE && to_al->sym &&
		    from_al->sym == to_al->sym &&
		    to_al->addr != to_al->sym->start)
			ts->rstate = X86_RETPOLINE_DETECTED;

	} else if (sample->flags & PERF_IP_FLAG_RETURN) {
		if (!sample->ip || !sample->addr)
			return 0;

		/* x86 retpoline 'return' doesn't match the stack */
		if (rstate == X86_RETPOLINE_DETECTED && ts->cnt > 2 &&
		    ts->stack[ts->cnt - 1].ret_addr != sample->addr)
			return thread_stack__x86_retpoline(ts, sample, to_al);

		err = thread_stack__pop_cp(thread, ts, sample->addr,
					   sample->time, ref, from_al->sym);
		if (err) {
			if (err < 0)
				return err;
			err = thread_stack__no_call_return(thread, ts, sample,
							   from_al, to_al, ref);
		}
	} else if (sample->flags & PERF_IP_FLAG_TRACE_BEGIN) {
		err = thread_stack__trace_begin(thread, ts, sample->time, ref);
	} else if (sample->flags & PERF_IP_FLAG_TRACE_END) {
		err = thread_stack__trace_end(ts, sample, ref);
	} else if (sample->flags & PERF_IP_FLAG_BRANCH &&
		   from_al->sym != to_al->sym && to_al->sym &&
		   to_al->addr == to_al->sym->start) {
		struct call_path_root *cpr = ts->crp->cpr;
		struct call_path *cp;

		/*
		 * The compiler might optimize a call/ret combination by making
		 * it a jmp. Make that visible by recording on the stack a
		 * branch to the start of a different symbol. Note that this
		 * means when a ret pops the stack, all jmps must be popped off
		 * first.
		 */
		cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp,
					to_al->sym, sample->addr,
					ts->kernel_start);
		err = thread_stack__push_cp(ts, 0, sample->time, ref, cp, false,
					    false);
		if (!err)
			ts->stack[ts->cnt - 1].non_call = true;
	}

	return err;
}
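
/*
 * Worked example of the call/return pairing above (hypothetical addresses):
 * a call at ip 0x1000 with insn_len 5 to foo() at 0x2000 pushes a frame with
 * ret_addr 0x1005; a later 'return' branch whose target address
 * (sample->addr) is 0x1005 pops that frame via thread_stack__pop_cp() and
 * emits one call_return spanning the two samples. If no frame matches the
 * return, thread_stack__no_call_return() synthesizes a plausible one instead.
 */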

size_t thread_stack__depth(struct thread *thread, int cpu)
{
	struct thread_stack *ts = thread__stack(thread, cpu);

	if (!ts)
		return 0;
	return ts->cnt;
}