// SPDX-License-Identifier: GPL-2.0-only
/*
 * thread-stack.c: Synthesize a thread's stack using call / return events
 * Copyright (c) 2014, Intel Corporation.
 */

#include <linux/rbtree.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <errno.h>
#include "thread.h"
#include "event.h"
#include "machine.h"
#include "env.h"
#include "util.h"
#include "debug.h"
#include "symbol.h"
#include "comm.h"
#include "call-path.h"
#include "thread-stack.h"

#define STACK_GROWTH 2048

/*
 * State of retpoline detection.
 *
 * RETPOLINE_NONE: no retpoline detection
 * X86_RETPOLINE_POSSIBLE: x86 retpoline possible
 * X86_RETPOLINE_DETECTED: x86 retpoline detected
 */
enum retpoline_state_t {
	RETPOLINE_NONE,
	X86_RETPOLINE_POSSIBLE,
	X86_RETPOLINE_DETECTED,
};

/**
 * struct thread_stack_entry - thread stack entry.
 * @ret_addr: return address
 * @timestamp: timestamp (if known)
 * @ref: external reference (e.g. db_id of sample)
 * @branch_count: the branch count when the entry was created
 * @insn_count: the instruction count when the entry was created
 * @cyc_count: the cycle count when the entry was created
 * @db_id: id used for db-export
 * @cp: call path
 * @no_call: a 'call' was not seen
 * @trace_end: a 'call' but trace ended
 * @non_call: a branch but not a 'call' to the start of a different symbol
 */
struct thread_stack_entry {
	u64 ret_addr;
	u64 timestamp;
	u64 ref;
	u64 branch_count;
	u64 insn_count;
	u64 cyc_count;
	u64 db_id;
	struct call_path *cp;
	bool no_call;
	bool trace_end;
	bool non_call;
};

/**
 * struct thread_stack - thread stack constructed from 'call' and 'return'
 *                       branch samples.
 * @stack: array that holds the stack
 * @cnt: number of entries in the stack
 * @sz: current maximum stack size
 * @trace_nr: current trace number
 * @branch_count: running branch count
 * @insn_count: running instruction count
 * @cyc_count: running cycle count
 * @kernel_start: kernel start address
 * @last_time: last timestamp
 * @crp: call/return processor
 * @comm: current comm
 * @arr_sz: size of array if this is the first element of an array
 * @rstate: used to detect retpolines
 */
struct thread_stack {
	struct thread_stack_entry *stack;
	size_t cnt;
	size_t sz;
	u64 trace_nr;
	u64 branch_count;
	u64 insn_count;
	u64 cyc_count;
	u64 kernel_start;
	u64 last_time;
	struct call_return_processor *crp;
	struct comm *comm;
	unsigned int arr_sz;
	enum retpoline_state_t rstate;
};

/*
 * Assume pid == tid == 0 identifies the idle task as defined by
 * perf_session__register_idle_thread(). The idle task is really 1 task per cpu,
 * and therefore requires a stack for each cpu.
 */
static inline bool thread_stack__per_cpu(struct thread *thread)
{
	return !(thread->tid || thread->pid_);
}

static int thread_stack__grow(struct thread_stack *ts)
{
	struct thread_stack_entry *new_stack;
	size_t sz, new_sz;

	new_sz = ts->sz + STACK_GROWTH;
	sz = new_sz * sizeof(struct thread_stack_entry);

	new_stack = realloc(ts->stack, sz);
	if (!new_stack)
		return -ENOMEM;

	ts->stack = new_stack;
	ts->sz = new_sz;

	return 0;
}

static int thread_stack__init(struct thread_stack *ts, struct thread *thread,
			      struct call_return_processor *crp)
{
	int err;

	err = thread_stack__grow(ts);
	if (err)
		return err;

	if (thread->mg && thread->mg->machine) {
		struct machine *machine = thread->mg->machine;
		const char *arch = perf_env__arch(machine->env);

		ts->kernel_start = machine__kernel_start(machine);
		if (!strcmp(arch, "x86"))
			ts->rstate = X86_RETPOLINE_POSSIBLE;
	} else {
		ts->kernel_start = 1ULL << 63;
	}
	ts->crp = crp;

	return 0;
}

static struct thread_stack *thread_stack__new(struct thread *thread, int cpu,
					      struct call_return_processor *crp)
{
	struct thread_stack *ts = thread->ts, *new_ts;
	unsigned int old_sz = ts ? ts->arr_sz : 0;
	unsigned int new_sz = 1;

	if (thread_stack__per_cpu(thread) && cpu > 0)
		new_sz = roundup_pow_of_two(cpu + 1);

	if (!ts || new_sz > old_sz) {
		new_ts = calloc(new_sz, sizeof(*ts));
		if (!new_ts)
			return NULL;
		if (ts)
			memcpy(new_ts, ts, old_sz * sizeof(*ts));
		new_ts->arr_sz = new_sz;
		zfree(&thread->ts);
		thread->ts = new_ts;
		ts = new_ts;
	}

	if (thread_stack__per_cpu(thread) && cpu > 0 &&
	    (unsigned int)cpu < ts->arr_sz)
		ts += cpu;

	if (!ts->stack &&
	    thread_stack__init(ts, thread, crp))
		return NULL;

	return ts;
}

static struct thread_stack *thread__cpu_stack(struct thread *thread, int cpu)
{
	struct thread_stack *ts = thread->ts;

	if (cpu < 0)
		cpu = 0;

	if (!ts || (unsigned int)cpu >= ts->arr_sz)
		return NULL;

	ts += cpu;

	if (!ts->stack)
		return NULL;

	return ts;
}

static inline struct thread_stack *thread__stack(struct thread *thread,
						 int cpu)
{
	if (!thread)
		return NULL;

	if (thread_stack__per_cpu(thread))
		return thread__cpu_stack(thread, cpu);

	return thread->ts;
}

static int thread_stack__push(struct thread_stack *ts, u64 ret_addr,
			      bool trace_end)
{
	int err = 0;

	if (ts->cnt == ts->sz) {
		err = thread_stack__grow(ts);
		if (err) {
			pr_warning("Out of memory: discarding thread stack\n");
			ts->cnt = 0;
		}
	}

	ts->stack[ts->cnt].trace_end = trace_end;
	ts->stack[ts->cnt++].ret_addr = ret_addr;

	return err;
}

static void thread_stack__pop(struct thread_stack *ts, u64 ret_addr)
{
	size_t i;

	/*
	 * In some cases there may be functions which are not seen to return.
	 * For example when setjmp / longjmp has been used. Or the perf context
	 * switch in the kernel which doesn't stop and start tracing in exactly
	 * the same code path. When that happens the return address will be
	 * further down the stack. If the return address is not found at all,
	 * we assume the opposite (i.e. this is a return for a call that wasn't
	 * seen for some reason) and leave the stack alone.
	 */
	for (i = ts->cnt; i; ) {
		if (ts->stack[--i].ret_addr == ret_addr) {
			ts->cnt = i;
			return;
		}
	}
}

static void thread_stack__pop_trace_end(struct thread_stack *ts)
{
	size_t i;

	for (i = ts->cnt; i; ) {
		if (ts->stack[--i].trace_end)
			ts->cnt = i;
		else
			return;
	}
}

static bool thread_stack__in_kernel(struct thread_stack *ts)
{
	if (!ts->cnt)
		return false;

	return ts->stack[ts->cnt - 1].cp->in_kernel;
}

static int thread_stack__call_return(struct thread *thread,
				     struct thread_stack *ts, size_t idx,
				     u64 timestamp, u64 ref, bool no_return)
{
	struct call_return_processor *crp = ts->crp;
	struct thread_stack_entry *tse;
	struct call_return cr = {
		.thread = thread,
		.comm = ts->comm,
		.db_id = 0,
	};
	u64 *parent_db_id;

	tse = &ts->stack[idx];
	cr.cp = tse->cp;
	cr.call_time = tse->timestamp;
	cr.return_time = timestamp;
	cr.branch_count = ts->branch_count - tse->branch_count;
	cr.insn_count = ts->insn_count - tse->insn_count;
	cr.cyc_count = ts->cyc_count - tse->cyc_count;
	cr.db_id = tse->db_id;
	cr.call_ref = tse->ref;
	cr.return_ref = ref;
	if (tse->no_call)
		cr.flags |= CALL_RETURN_NO_CALL;
	if (no_return)
		cr.flags |= CALL_RETURN_NO_RETURN;
	if (tse->non_call)
		cr.flags |= CALL_RETURN_NON_CALL;

	/*
	 * The parent db_id must be assigned before exporting the child. Note
	 * it is not possible to export the parent first because its
	 * information is not complete until its 'return' has been processed.
	 */
	parent_db_id = idx ? &(tse - 1)->db_id : NULL;

	return crp->process(&cr, parent_db_id, crp->data);
}

static int __thread_stack__flush(struct thread *thread, struct thread_stack *ts)
{
	struct call_return_processor *crp = ts->crp;
	int err;

	if (!crp) {
		ts->cnt = 0;
		return 0;
	}

	while (ts->cnt) {
		err = thread_stack__call_return(thread, ts, --ts->cnt,
						ts->last_time, 0, true);
		if (err) {
			pr_err("Error flushing thread stack!\n");
			ts->cnt = 0;
			return err;
		}
	}

	return 0;
}

int thread_stack__flush(struct thread *thread)
{
	struct thread_stack *ts = thread->ts;
	unsigned int pos;
	int err = 0;

	if (ts) {
		for (pos = 0; pos < ts->arr_sz; pos++) {
			int ret = __thread_stack__flush(thread, ts + pos);

			if (ret)
				err = ret;
		}
	}

	return err;
}

int thread_stack__event(struct thread *thread, int cpu, u32 flags, u64 from_ip,
			u64 to_ip, u16 insn_len, u64 trace_nr)
{
	struct thread_stack *ts = thread__stack(thread, cpu);

	if (!thread)
		return -EINVAL;

	if (!ts) {
		ts = thread_stack__new(thread, cpu, NULL);
		if (!ts) {
			pr_warning("Out of memory: no thread stack\n");
			return -ENOMEM;
		}
		ts->trace_nr = trace_nr;
	}

	/*
	 * When the trace is discontinuous, the trace_nr changes. In that case
	 * the stack might be completely invalid. Better to report nothing than
	 * to report something misleading, so flush the stack.
	 */
	if (trace_nr != ts->trace_nr) {
		if (ts->trace_nr)
			__thread_stack__flush(thread, ts);
		ts->trace_nr = trace_nr;
	}

	/* Stop here if thread_stack__process() is in use */
	if (ts->crp)
		return 0;

	if (flags & PERF_IP_FLAG_CALL) {
		u64 ret_addr;

		if (!to_ip)
			return 0;
		ret_addr = from_ip + insn_len;
		if (ret_addr == to_ip)
			return 0; /* Zero-length calls are excluded */
		return thread_stack__push(ts, ret_addr,
					  flags & PERF_IP_FLAG_TRACE_END);
	} else if (flags & PERF_IP_FLAG_TRACE_BEGIN) {
		/*
		 * If the caller did not change the trace number (which would
		 * have flushed the stack) then try to make sense of the stack.
		 * Possibly, tracing began after returning to the current
		 * address, so try to pop that. Also, do not expect a call made
		 * when the trace ended, to return, so pop that.
		 */
		thread_stack__pop(ts, to_ip);
		thread_stack__pop_trace_end(ts);
	} else if ((flags & PERF_IP_FLAG_RETURN) && from_ip) {
		thread_stack__pop(ts, to_ip);
	}

	return 0;
}

void thread_stack__set_trace_nr(struct thread *thread, int cpu, u64 trace_nr)
{
	struct thread_stack *ts = thread__stack(thread, cpu);

	if (!ts)
		return;

	if (trace_nr != ts->trace_nr) {
		if (ts->trace_nr)
			__thread_stack__flush(thread, ts);
		ts->trace_nr = trace_nr;
	}
}

static void __thread_stack__free(struct thread *thread, struct thread_stack *ts)
{
	__thread_stack__flush(thread, ts);
	zfree(&ts->stack);
}

static void thread_stack__reset(struct thread *thread, struct thread_stack *ts)
{
	unsigned int arr_sz = ts->arr_sz;

	__thread_stack__free(thread, ts);
	memset(ts, 0, sizeof(*ts));
	ts->arr_sz = arr_sz;
}

void thread_stack__free(struct thread *thread)
{
	struct thread_stack *ts = thread->ts;
	unsigned int pos;

	if (ts) {
		for (pos = 0; pos < ts->arr_sz; pos++)
			__thread_stack__free(thread, ts + pos);
		zfree(&thread->ts);
	}
}

static inline u64 callchain_context(u64 ip, u64 kernel_start)
{
	return ip < kernel_start ? PERF_CONTEXT_USER : PERF_CONTEXT_KERNEL;
}

void thread_stack__sample(struct thread *thread, int cpu,
			  struct ip_callchain *chain,
			  size_t sz, u64 ip, u64 kernel_start)
{
	struct thread_stack *ts = thread__stack(thread, cpu);
	u64 context = callchain_context(ip, kernel_start);
	u64 last_context;
	size_t i, j;

	if (sz < 2) {
		chain->nr = 0;
		return;
	}

	chain->ips[0] = context;
	chain->ips[1] = ip;

	if (!ts) {
		chain->nr = 2;
		return;
	}

	last_context = context;

	for (i = 2, j = 1; i < sz && j <= ts->cnt; i++, j++) {
		ip = ts->stack[ts->cnt - j].ret_addr;
		context = callchain_context(ip, kernel_start);
		if (context != last_context) {
			if (i >= sz - 1)
				break;
			chain->ips[i++] = context;
			last_context = context;
		}
		chain->ips[i] = ip;
	}

	chain->nr = i;
}

struct call_return_processor *
call_return_processor__new(int (*process)(struct call_return *cr, u64 *parent_db_id, void *data),
			   void *data)
{
	struct call_return_processor *crp;

	crp = zalloc(sizeof(struct call_return_processor));
	if (!crp)
		return NULL;
	crp->cpr = call_path_root__new();
	if (!crp->cpr)
		goto out_free;
	crp->process = process;
	crp->data = data;
	return crp;

out_free:
	free(crp);
	return NULL;
}

void call_return_processor__free(struct call_return_processor *crp)
{
	if (crp) {
		call_path_root__free(crp->cpr);
		free(crp);
	}
}

static int thread_stack__push_cp(struct thread_stack *ts, u64 ret_addr,
				 u64 timestamp, u64 ref, struct call_path *cp,
				 bool no_call, bool trace_end)
{
	struct thread_stack_entry *tse;
	int err;

	if (!cp)
		return -ENOMEM;

	if (ts->cnt == ts->sz) {
		err = thread_stack__grow(ts);
		if (err)
			return err;
	}

	tse = &ts->stack[ts->cnt++];
	tse->ret_addr = ret_addr;
	tse->timestamp = timestamp;
	tse->ref = ref;
	tse->branch_count = ts->branch_count;
	tse->insn_count = ts->insn_count;
	tse->cyc_count = ts->cyc_count;
	tse->cp = cp;
	tse->no_call = no_call;
	tse->trace_end = trace_end;
	tse->non_call = false;
	tse->db_id = 0;

	return 0;
}

static int thread_stack__pop_cp(struct thread *thread, struct thread_stack *ts,
				u64 ret_addr, u64 timestamp, u64 ref,
				struct symbol *sym)
{
	int err;

	if (!ts->cnt)
		return 1;

	if (ts->cnt == 1) {
		struct thread_stack_entry *tse = &ts->stack[0];

		if (tse->cp->sym == sym)
			return thread_stack__call_return(thread, ts, --ts->cnt,
							 timestamp, ref, false);
	}

	if (ts->stack[ts->cnt - 1].ret_addr == ret_addr &&
	    !ts->stack[ts->cnt - 1].non_call) {
		return thread_stack__call_return(thread, ts, --ts->cnt,
						 timestamp, ref, false);
	} else {
		size_t i = ts->cnt - 1;

		while (i--) {
			if (ts->stack[i].ret_addr != ret_addr ||
			    ts->stack[i].non_call)
				continue;
			i += 1;
			while (ts->cnt > i) {
				err = thread_stack__call_return(thread, ts,
								--ts->cnt,
								timestamp, ref,
								true);
				if (err)
					return err;
			}
			return thread_stack__call_return(thread, ts, --ts->cnt,
							 timestamp, ref, false);
		}
	}

	return 1;
}

static int thread_stack__bottom(struct thread_stack *ts,
				struct perf_sample *sample,
				struct addr_location *from_al,
				struct addr_location *to_al, u64 ref)
{
	struct call_path_root *cpr = ts->crp->cpr;
	struct call_path *cp;
	struct symbol *sym;
	u64 ip;

	if (sample->ip) {
		ip = sample->ip;
		sym = from_al->sym;
	} else if (sample->addr) {
		ip = sample->addr;
		sym = to_al->sym;
	} else {
		return 0;
	}

	cp = call_path__findnew(cpr, &cpr->call_path, sym, ip,
				ts->kernel_start);

	return thread_stack__push_cp(ts, ip, sample->time, ref, cp,
				     true, false);
}

static int thread_stack__pop_ks(struct thread *thread, struct thread_stack *ts,
				struct perf_sample *sample, u64 ref)
{
	u64 tm = sample->time;
	int err;

	/* Return to userspace, so pop all kernel addresses */
	while (thread_stack__in_kernel(ts)) {
		err = thread_stack__call_return(thread, ts, --ts->cnt,
						tm, ref, true);
		if (err)
			return err;
	}

	return 0;
}

static int thread_stack__no_call_return(struct thread *thread,
					struct thread_stack *ts,
					struct perf_sample *sample,
					struct addr_location *from_al,
					struct addr_location *to_al, u64 ref)
{
	struct call_path_root *cpr = ts->crp->cpr;
	struct call_path *root = &cpr->call_path;
	struct symbol *fsym = from_al->sym;
	struct symbol *tsym = to_al->sym;
	struct call_path *cp, *parent;
	u64 ks = ts->kernel_start;
	u64 addr = sample->addr;
	u64 tm = sample->time;
	u64 ip = sample->ip;
	int err;

	if (ip >= ks && addr < ks) {
		/* Return to userspace, so pop all kernel addresses */
		err = thread_stack__pop_ks(thread, ts, sample, ref);
		if (err)
			return err;

		/* If the stack is empty, push the userspace address */
		if (!ts->cnt) {
			cp = call_path__findnew(cpr, root, tsym, addr, ks);
			return thread_stack__push_cp(ts, 0, tm, ref, cp, true,
						     false);
		}
	} else if (thread_stack__in_kernel(ts) && ip < ks) {
		/* Return to userspace, so pop all kernel addresses */
		err = thread_stack__pop_ks(thread, ts, sample, ref);
		if (err)
			return err;
	}

	if (ts->cnt)
		parent = ts->stack[ts->cnt - 1].cp;
	else
		parent = root;

	if (parent->sym == from_al->sym) {
		/*
		 * At the bottom of the stack, assume the missing 'call' was
		 * before the trace started. So, pop the current symbol and push
		 * the 'to' symbol.
		 */
		if (ts->cnt == 1) {
			err = thread_stack__call_return(thread, ts, --ts->cnt,
							tm, ref, false);
			if (err)
				return err;
		}

		if (!ts->cnt) {
			cp = call_path__findnew(cpr, root, tsym, addr, ks);

			return thread_stack__push_cp(ts, addr, tm, ref, cp,
						     true, false);
		}

		/*
		 * Otherwise assume the 'return' is being used as a jump (e.g.
		 * retpoline) and just push the 'to' symbol.
		 */
		cp = call_path__findnew(cpr, parent, tsym, addr, ks);

		err = thread_stack__push_cp(ts, 0, tm, ref, cp, true, false);
		if (!err)
			ts->stack[ts->cnt - 1].non_call = true;

		return err;
	}

	/*
	 * Assume 'parent' has not yet returned, so push 'to', and then push and
	 * pop 'from'.
	 */

	cp = call_path__findnew(cpr, parent, tsym, addr, ks);

	err = thread_stack__push_cp(ts, addr, tm, ref, cp, true, false);
	if (err)
		return err;

	cp = call_path__findnew(cpr, cp, fsym, ip, ks);

	err = thread_stack__push_cp(ts, ip, tm, ref, cp, true, false);
	if (err)
		return err;

	return thread_stack__call_return(thread, ts, --ts->cnt, tm, ref, false);
}

static int thread_stack__trace_begin(struct thread *thread,
				     struct thread_stack *ts, u64 timestamp,
				     u64 ref)
{
	struct thread_stack_entry *tse;
	int err;

	if (!ts->cnt)
		return 0;

	/* Pop trace end */
	tse = &ts->stack[ts->cnt - 1];
	if (tse->trace_end) {
		err = thread_stack__call_return(thread, ts, --ts->cnt,
						timestamp, ref, false);
		if (err)
			return err;
	}

	return 0;
}

static int thread_stack__trace_end(struct thread_stack *ts,
				   struct perf_sample *sample, u64 ref)
{
	struct call_path_root *cpr = ts->crp->cpr;
	struct call_path *cp;
	u64 ret_addr;

	/* No point having 'trace end' on the bottom of the stack */
	if (!ts->cnt || (ts->cnt == 1 && ts->stack[0].ref == ref))
		return 0;

	cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp, NULL, 0,
				ts->kernel_start);

	ret_addr = sample->ip + sample->insn_len;

	return thread_stack__push_cp(ts, ret_addr, sample->time, ref, cp,
				     false, true);
}

static bool is_x86_retpoline(const char *name)
{
	const char *p = strstr(name, "__x86_indirect_thunk_");

	return p == name || !strcmp(name, "__indirect_thunk_start");
}

/*
 * x86 retpoline functions pollute the call graph. This function removes them.
 * This does not handle function return thunks, nor is there any improvement
 * for the handling of inline thunks or extern thunks.
 */
static int thread_stack__x86_retpoline(struct thread_stack *ts,
				       struct perf_sample *sample,
				       struct addr_location *to_al)
{
	struct thread_stack_entry *tse = &ts->stack[ts->cnt - 1];
	struct call_path_root *cpr = ts->crp->cpr;
	struct symbol *sym = tse->cp->sym;
	struct symbol *tsym = to_al->sym;
	struct call_path *cp;

	if (sym && is_x86_retpoline(sym->name)) {
		/*
		 * This is an x86 retpoline fn. It pollutes the call graph by
		 * showing up everywhere there is an indirect branch, but does
		 * not itself mean anything. Here the top-of-stack is removed,
		 * by decrementing the stack count, and then further down, the
		 * resulting top-of-stack is replaced with the actual target.
		 * The result is that the retpoline functions will no longer
		 * appear in the call graph. Note this only affects the call
		 * graph, since all the original branches are left unchanged.
		 */
		ts->cnt -= 1;
		sym = ts->stack[ts->cnt - 2].cp->sym;
		if (sym && sym == tsym && to_al->addr != tsym->start) {
			/*
			 * Target is back to the middle of the symbol we came
			 * from so assume it is an indirect jmp and forget it
			 * altogether.
			 */
			ts->cnt -= 1;
			return 0;
		}
	} else if (sym && sym == tsym) {
		/*
		 * Target is back to the symbol we came from so assume it is an
		 * indirect jmp and forget it altogether.
		 */
		ts->cnt -= 1;
		return 0;
	}

	cp = call_path__findnew(cpr, ts->stack[ts->cnt - 2].cp, tsym,
				sample->addr, ts->kernel_start);
	if (!cp)
		return -ENOMEM;

	/* Replace the top-of-stack with the actual target */
	ts->stack[ts->cnt - 1].cp = cp;

	return 0;
}

int thread_stack__process(struct thread *thread, struct comm *comm,
			  struct perf_sample *sample,
			  struct addr_location *from_al,
			  struct addr_location *to_al, u64 ref,
			  struct call_return_processor *crp)
{
	struct thread_stack *ts = thread__stack(thread, sample->cpu);
	enum retpoline_state_t rstate;
	int err = 0;

	if (ts && !ts->crp) {
		/* Supersede thread_stack__event() */
		thread_stack__reset(thread, ts);
		ts = NULL;
	}

	if (!ts) {
		ts = thread_stack__new(thread, sample->cpu, crp);
		if (!ts)
			return -ENOMEM;
		ts->comm = comm;
	}

	rstate = ts->rstate;
	if (rstate == X86_RETPOLINE_DETECTED)
		ts->rstate = X86_RETPOLINE_POSSIBLE;

	/* Flush stack on exec */
	if (ts->comm != comm && thread->pid_ == thread->tid) {
		err = __thread_stack__flush(thread, ts);
		if (err)
			return err;
		ts->comm = comm;
	}

	/* If the stack is empty, put the current symbol on the stack */
	if (!ts->cnt) {
		err = thread_stack__bottom(ts, sample, from_al, to_al, ref);
		if (err)
			return err;
	}

	ts->branch_count += 1;
	ts->insn_count += sample->insn_cnt;
	ts->cyc_count += sample->cyc_cnt;
	ts->last_time = sample->time;

	if (sample->flags & PERF_IP_FLAG_CALL) {
		bool trace_end = sample->flags & PERF_IP_FLAG_TRACE_END;
		struct call_path_root *cpr = ts->crp->cpr;
		struct call_path *cp;
		u64 ret_addr;

		if (!sample->ip || !sample->addr)
			return 0;

		ret_addr = sample->ip + sample->insn_len;
		if (ret_addr == sample->addr)
			return 0; /* Zero-length calls are excluded */

		cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp,
					to_al->sym, sample->addr,
					ts->kernel_start);
		err = thread_stack__push_cp(ts, ret_addr, sample->time, ref,
					    cp, false, trace_end);

		/*
		 * A call to the same symbol, but not to the start of the
		 * symbol, may be the start of an x86 retpoline.
		 */
		if (!err && rstate == X86_RETPOLINE_POSSIBLE && to_al->sym &&
		    from_al->sym == to_al->sym &&
		    to_al->addr != to_al->sym->start)
			ts->rstate = X86_RETPOLINE_DETECTED;

	} else if (sample->flags & PERF_IP_FLAG_RETURN) {
		if (!sample->addr) {
			u32 return_from_kernel = PERF_IP_FLAG_SYSCALLRET |
						 PERF_IP_FLAG_INTERRUPT;

			if (!(sample->flags & return_from_kernel))
				return 0;

			/* Pop kernel stack */
			return thread_stack__pop_ks(thread, ts, sample, ref);
		}

		if (!sample->ip)
			return 0;

		/* x86 retpoline 'return' doesn't match the stack */
		if (rstate == X86_RETPOLINE_DETECTED && ts->cnt > 2 &&
		    ts->stack[ts->cnt - 1].ret_addr != sample->addr)
			return thread_stack__x86_retpoline(ts, sample, to_al);

		err = thread_stack__pop_cp(thread, ts, sample->addr,
					   sample->time, ref, from_al->sym);
		if (err) {
			if (err < 0)
				return err;
			err = thread_stack__no_call_return(thread, ts, sample,
							   from_al, to_al, ref);
		}
	} else if (sample->flags & PERF_IP_FLAG_TRACE_BEGIN) {
		err = thread_stack__trace_begin(thread, ts, sample->time, ref);
	} else if (sample->flags & PERF_IP_FLAG_TRACE_END) {
		err = thread_stack__trace_end(ts, sample, ref);
	} else if (sample->flags & PERF_IP_FLAG_BRANCH &&
		   from_al->sym != to_al->sym && to_al->sym &&
		   to_al->addr == to_al->sym->start) {
		struct call_path_root *cpr = ts->crp->cpr;
		struct call_path *cp;

		/*
		 * The compiler might optimize a call/ret combination by making
		 * it a jmp. Make that visible by recording on the stack a
		 * branch to the start of a different symbol. Note, that means
		 * when a ret pops the stack, all jmps must be popped off first.
		 */
		cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp,
					to_al->sym, sample->addr,
					ts->kernel_start);
		err = thread_stack__push_cp(ts, 0, sample->time, ref, cp, false,
					    false);
		if (!err)
			ts->stack[ts->cnt - 1].non_call = true;
	}

	return err;
}

size_t thread_stack__depth(struct thread *thread, int cpu)
{
	struct thread_stack *ts = thread__stack(thread, cpu);

	if (!ts)
		return 0;
	return ts->cnt;
}
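
/*
 * Usage sketch (illustrative only; nothing in this file calls it): how a
 * caller might drive the two modes above. The variables thread, comm, cpu,
 * flags, from_ip, to_ip, insn_len, trace_nr, chain, chain_sz, sample_ip,
 * kernel_start, sample, from_al, to_al, ref, process_cb and data are assumed
 * to be supplied by the caller's trace decoder.
 *
 *	// Simple mode (no call/return processor): feed each decoded branch,
 *	// then synthesize a callchain from the maintained stack.
 *	thread_stack__event(thread, cpu, flags, from_ip, to_ip,
 *			    insn_len, trace_nr);
 *	thread_stack__sample(thread, cpu, chain, chain_sz, sample_ip,
 *			     kernel_start);
 *
 *	// db-export mode: create a processor and pass each branch sample to
 *	// thread_stack__process(), which supersedes thread_stack__event().
 *	crp = call_return_processor__new(process_cb, data);
 *	thread_stack__process(thread, comm, sample, from_al, to_al, ref, crp);
 *
 *	// On thread exit, flush and free the (possibly per-cpu) stacks.
 *	thread_stack__free(thread);
 */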