// SPDX-License-Identifier: GPL-2.0-only
/*
 * thread-stack.c: Synthesize a thread's stack using call / return events
 * Copyright (c) 2014, Intel Corporation.
 */

#include <linux/rbtree.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <errno.h>
#include "thread.h"
#include "event.h"
#include "machine.h"
#include "env.h"
#include "util.h"
#include "debug.h"
#include "symbol.h"
#include "comm.h"
#include "call-path.h"
#include "thread-stack.h"

#define STACK_GROWTH 2048

/*
 * State of retpoline detection.
 *
 * RETPOLINE_NONE: no retpoline detection
 * X86_RETPOLINE_POSSIBLE: x86 retpoline possible
 * X86_RETPOLINE_DETECTED: x86 retpoline detected
 */
enum retpoline_state_t {
	RETPOLINE_NONE,
	X86_RETPOLINE_POSSIBLE,
	X86_RETPOLINE_DETECTED,
};

/**
 * struct thread_stack_entry - thread stack entry.
 * @ret_addr: return address
 * @timestamp: timestamp (if known)
 * @ref: external reference (e.g. db_id of sample)
 * @branch_count: the branch count when the entry was created
 * @insn_count: the instruction count when the entry was created
 * @cyc_count: the cycle count when the entry was created
 * @db_id: id used for db-export
 * @cp: call path
 * @no_call: a 'call' was not seen
 * @trace_end: a 'call' but trace ended
 * @non_call: a branch but not a 'call' to the start of a different symbol
 */
struct thread_stack_entry {
	u64 ret_addr;
	u64 timestamp;
	u64 ref;
	u64 branch_count;
	u64 insn_count;
	u64 cyc_count;
	u64 db_id;
	struct call_path *cp;
	bool no_call;
	bool trace_end;
	bool non_call;
};

/**
 * struct thread_stack - thread stack constructed from 'call' and 'return'
 *                       branch samples.
 * @stack: array that holds the stack
 * @cnt: number of entries in the stack
 * @sz: current maximum stack size
 * @trace_nr: current trace number
 * @branch_count: running branch count
 * @insn_count: running instruction count
 * @cyc_count: running cycle count
 * @kernel_start: kernel start address
 * @last_time: last timestamp
 * @crp: call/return processor
 * @comm: current comm
 * @arr_sz: size of array if this is the first element of an array
 * @rstate: used to detect retpolines
 */
struct thread_stack {
	struct thread_stack_entry *stack;
	size_t cnt;
	size_t sz;
	u64 trace_nr;
	u64 branch_count;
	u64 insn_count;
	u64 cyc_count;
	u64 kernel_start;
	u64 last_time;
	struct call_return_processor *crp;
	struct comm *comm;
	unsigned int arr_sz;
	enum retpoline_state_t rstate;
};
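
/*
 * Example of how the counts combine: a 'call' snapshots the running counts
 * into the new entry (tse->branch_count = ts->branch_count, etc.).  When the
 * matching 'return' is seen, the number of branches executed inside the
 * function is simply ts->branch_count - tse->branch_count, and likewise for
 * insn_count and cyc_count (see thread_stack__call_return()).
 */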

/*
 * Assume pid == tid == 0 identifies the idle task as defined by
 * perf_session__register_idle_thread(). The idle task is really 1 task per cpu,
 * and therefore requires a stack for each cpu.
 */
static inline bool thread_stack__per_cpu(struct thread *thread)
{
	return !(thread->tid || thread->pid_);
}

static int thread_stack__grow(struct thread_stack *ts)
{
	struct thread_stack_entry *new_stack;
	size_t sz, new_sz;

	new_sz = ts->sz + STACK_GROWTH;
	sz = new_sz * sizeof(struct thread_stack_entry);

	new_stack = realloc(ts->stack, sz);
	if (!new_stack)
		return -ENOMEM;

	ts->stack = new_stack;
	ts->sz = new_sz;

	return 0;
}

static int thread_stack__init(struct thread_stack *ts, struct thread *thread,
			      struct call_return_processor *crp)
{
	int err;

	err = thread_stack__grow(ts);
	if (err)
		return err;

	if (thread->mg && thread->mg->machine) {
		struct machine *machine = thread->mg->machine;
		const char *arch = perf_env__arch(machine->env);

		ts->kernel_start = machine__kernel_start(machine);
		if (!strcmp(arch, "x86"))
			ts->rstate = X86_RETPOLINE_POSSIBLE;
	} else {
		ts->kernel_start = 1ULL << 63;
	}
	ts->crp = crp;

	return 0;
}

static struct thread_stack *thread_stack__new(struct thread *thread, int cpu,
					      struct call_return_processor *crp)
{
	struct thread_stack *ts = thread->ts, *new_ts;
	unsigned int old_sz = ts ? ts->arr_sz : 0;
	unsigned int new_sz = 1;

	if (thread_stack__per_cpu(thread) && cpu > 0)
		new_sz = roundup_pow_of_two(cpu + 1);

	if (!ts || new_sz > old_sz) {
		new_ts = calloc(new_sz, sizeof(*ts));
		if (!new_ts)
			return NULL;
		if (ts)
			memcpy(new_ts, ts, old_sz * sizeof(*ts));
		new_ts->arr_sz = new_sz;
		zfree(&thread->ts);
		thread->ts = new_ts;
		ts = new_ts;
	}

	if (thread_stack__per_cpu(thread) && cpu > 0 &&
	    (unsigned int)cpu < ts->arr_sz)
		ts += cpu;

	if (!ts->stack &&
	    thread_stack__init(ts, thread, crp))
		return NULL;

	return ts;
}
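
/*
 * For the per-cpu (idle task) case, thread->ts is a flat array indexed by
 * cpu, sized to the next power of two.  For example, the first event seen on
 * cpu 5 allocates roundup_pow_of_two(6) == 8 stacks, and thread__stack()
 * below then selects &thread->ts[5].
 */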

static struct thread_stack *thread__cpu_stack(struct thread *thread, int cpu)
{
	struct thread_stack *ts = thread->ts;

	if (cpu < 0)
		cpu = 0;

	if (!ts || (unsigned int)cpu >= ts->arr_sz)
		return NULL;

	ts += cpu;

	if (!ts->stack)
		return NULL;

	return ts;
}

static inline struct thread_stack *thread__stack(struct thread *thread,
						 int cpu)
{
	if (!thread)
		return NULL;

	if (thread_stack__per_cpu(thread))
		return thread__cpu_stack(thread, cpu);

	return thread->ts;
}

static int thread_stack__push(struct thread_stack *ts, u64 ret_addr,
			      bool trace_end)
{
	int err = 0;

	if (ts->cnt == ts->sz) {
		err = thread_stack__grow(ts);
		if (err) {
			pr_warning("Out of memory: discarding thread stack\n");
			ts->cnt = 0;
		}
	}

	ts->stack[ts->cnt].trace_end = trace_end;
	ts->stack[ts->cnt++].ret_addr = ret_addr;

	return err;
}
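
/*
 * Example (illustrative addresses): a CALL instruction at 0x401000 with
 * insn_len 5 pushes ret_addr 0x401005 (see thread_stack__event()).  The
 * matching RET's to_ip is expected to be 0x401005, which
 * thread_stack__pop() below searches for from the top of the stack
 * downwards.
 */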

static void thread_stack__pop(struct thread_stack *ts, u64 ret_addr)
{
	size_t i;

	/*
	 * In some cases there may be functions which are not seen to return.
	 * For example, when setjmp / longjmp has been used, or when the perf
	 * context switch in the kernel does not stop and start tracing in
	 * exactly the same code path. When that happens the return address
	 * will be further down the stack. If the return address is not found
	 * at all, we assume the opposite (i.e. this is a return for a call
	 * that wasn't seen for some reason) and leave the stack alone.
	 */
	for (i = ts->cnt; i; ) {
		if (ts->stack[--i].ret_addr == ret_addr) {
			ts->cnt = i;
			return;
		}
	}
}

static void thread_stack__pop_trace_end(struct thread_stack *ts)
{
	size_t i;

	for (i = ts->cnt; i; ) {
		if (ts->stack[--i].trace_end)
			ts->cnt = i;
		else
			return;
	}
}

static bool thread_stack__in_kernel(struct thread_stack *ts)
{
	if (!ts->cnt)
		return false;

	return ts->stack[ts->cnt - 1].cp->in_kernel;
}

static int thread_stack__call_return(struct thread *thread,
				     struct thread_stack *ts, size_t idx,
				     u64 timestamp, u64 ref, bool no_return)
{
	struct call_return_processor *crp = ts->crp;
	struct thread_stack_entry *tse;
	struct call_return cr = {
		.thread = thread,
		.comm = ts->comm,
		.db_id = 0,
	};
	u64 *parent_db_id;

	tse = &ts->stack[idx];
	cr.cp = tse->cp;
	cr.call_time = tse->timestamp;
	cr.return_time = timestamp;
	cr.branch_count = ts->branch_count - tse->branch_count;
	cr.insn_count = ts->insn_count - tse->insn_count;
	cr.cyc_count = ts->cyc_count - tse->cyc_count;
	cr.db_id = tse->db_id;
	cr.call_ref = tse->ref;
	cr.return_ref = ref;
	if (tse->no_call)
		cr.flags |= CALL_RETURN_NO_CALL;
	if (no_return)
		cr.flags |= CALL_RETURN_NO_RETURN;
	if (tse->non_call)
		cr.flags |= CALL_RETURN_NON_CALL;

	/*
	 * The parent db_id must be assigned before exporting the child. Note
	 * it is not possible to export the parent first because its
	 * information is not yet complete: its 'return' has not yet been
	 * processed.
	 */
	parent_db_id = idx ? &(tse - 1)->db_id : NULL;

	return crp->process(&cr, parent_db_id, crp->data);
}

static int __thread_stack__flush(struct thread *thread, struct thread_stack *ts)
{
	struct call_return_processor *crp = ts->crp;
	int err;

	if (!crp) {
		ts->cnt = 0;
		return 0;
	}

	while (ts->cnt) {
		err = thread_stack__call_return(thread, ts, --ts->cnt,
						ts->last_time, 0, true);
		if (err) {
			pr_err("Error flushing thread stack!\n");
			ts->cnt = 0;
			return err;
		}
	}

	return 0;
}

int thread_stack__flush(struct thread *thread)
{
	struct thread_stack *ts = thread->ts;
	unsigned int pos;
	int err = 0;

	if (ts) {
		for (pos = 0; pos < ts->arr_sz; pos++) {
			int ret = __thread_stack__flush(thread, ts + pos);

			if (ret)
				err = ret;
		}
	}

	return err;
}
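
/*
 * Typical usage from a trace decoder (a sketch; the exact call site is
 * decoder-specific): for each decoded branch, pass its flags and addresses,
 * e.g.
 *
 *	thread_stack__event(thread, cpu,
 *			    PERF_IP_FLAG_BRANCH | PERF_IP_FLAG_CALL,
 *			    from_ip, to_ip, insn_len, trace_nr);
 *
 * and read the resulting stack back with thread_stack__sample().
 */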

int thread_stack__event(struct thread *thread, int cpu, u32 flags, u64 from_ip,
			u64 to_ip, u16 insn_len, u64 trace_nr)
{
	struct thread_stack *ts = thread__stack(thread, cpu);

	if (!thread)
		return -EINVAL;

	if (!ts) {
		ts = thread_stack__new(thread, cpu, NULL);
		if (!ts) {
			pr_warning("Out of memory: no thread stack\n");
			return -ENOMEM;
		}
		ts->trace_nr = trace_nr;
	}

	/*
	 * When the trace is discontinuous, the trace_nr changes. In that case
	 * the stack might be completely invalid. Better to report nothing than
	 * to report something misleading, so flush the stack.
	 */
	if (trace_nr != ts->trace_nr) {
		if (ts->trace_nr)
			__thread_stack__flush(thread, ts);
		ts->trace_nr = trace_nr;
	}

	/* Stop here if thread_stack__process() is in use */
	if (ts->crp)
		return 0;

	if (flags & PERF_IP_FLAG_CALL) {
		u64 ret_addr;

		if (!to_ip)
			return 0;
		ret_addr = from_ip + insn_len;
		if (ret_addr == to_ip)
			return 0; /* Zero-length calls are excluded */
		return thread_stack__push(ts, ret_addr,
					  flags & PERF_IP_FLAG_TRACE_END);
	} else if (flags & PERF_IP_FLAG_TRACE_BEGIN) {
		/*
		 * If the caller did not change the trace number (which would
		 * have flushed the stack) then try to make sense of the stack.
		 * Possibly, tracing began after returning to the current
		 * address, so try to pop that. Also, do not expect a call made
		 * when the trace ended, to return, so pop that.
		 */
		thread_stack__pop(ts, to_ip);
		thread_stack__pop_trace_end(ts);
	} else if ((flags & PERF_IP_FLAG_RETURN) && from_ip) {
		thread_stack__pop(ts, to_ip);
	}

	return 0;
}

void thread_stack__set_trace_nr(struct thread *thread, int cpu, u64 trace_nr)
{
	struct thread_stack *ts = thread__stack(thread, cpu);

	if (!ts)
		return;

	if (trace_nr != ts->trace_nr) {
		if (ts->trace_nr)
			__thread_stack__flush(thread, ts);
		ts->trace_nr = trace_nr;
	}
}

static void __thread_stack__free(struct thread *thread, struct thread_stack *ts)
{
	__thread_stack__flush(thread, ts);
	zfree(&ts->stack);
}

static void thread_stack__reset(struct thread *thread, struct thread_stack *ts)
{
	unsigned int arr_sz = ts->arr_sz;

	__thread_stack__free(thread, ts);
	memset(ts, 0, sizeof(*ts));
	ts->arr_sz = arr_sz;
}

void thread_stack__free(struct thread *thread)
{
	struct thread_stack *ts = thread->ts;
	unsigned int pos;

	if (ts) {
		for (pos = 0; pos < ts->arr_sz; pos++)
			__thread_stack__free(thread, ts + pos);
		zfree(&thread->ts);
	}
}

static inline u64 callchain_context(u64 ip, u64 kernel_start)
{
	return ip < kernel_start ? PERF_CONTEXT_USER : PERF_CONTEXT_KERNEL;
}

void thread_stack__sample(struct thread *thread, int cpu,
			  struct ip_callchain *chain,
			  size_t sz, u64 ip, u64 kernel_start)
{
	struct thread_stack *ts = thread__stack(thread, cpu);
	u64 context = callchain_context(ip, kernel_start);
	u64 last_context;
	size_t i, j;

	if (sz < 2) {
		chain->nr = 0;
		return;
	}

	chain->ips[0] = context;
	chain->ips[1] = ip;

	if (!ts) {
		chain->nr = 2;
		return;
	}

	last_context = context;

	for (i = 2, j = 1; i < sz && j <= ts->cnt; i++, j++) {
		ip = ts->stack[ts->cnt - j].ret_addr;
		context = callchain_context(ip, kernel_start);
		if (context != last_context) {
			if (i >= sz - 1)
				break;
			chain->ips[i++] = context;
			last_context = context;
		}
		chain->ips[i] = ip;
	}

	chain->nr = i;
}
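
/*
 * Example of the layout produced above (illustrative addresses): for a
 * sample in the kernel whose innermost caller is also in the kernel and
 * whose outer callers are in user space, the chain is
 *
 *	{ PERF_CONTEXT_KERNEL, ip, kret1, PERF_CONTEXT_USER, uret1, uret2 }
 *
 * i.e. a context marker is inserted only when the user/kernel context
 * changes, and return addresses appear innermost first.
 */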

struct call_return_processor *
call_return_processor__new(int (*process)(struct call_return *cr, u64 *parent_db_id, void *data),
			   void *data)
{
	struct call_return_processor *crp;

	crp = zalloc(sizeof(struct call_return_processor));
	if (!crp)
		return NULL;
	crp->cpr = call_path_root__new();
	if (!crp->cpr)
		goto out_free;
	crp->process = process;
	crp->data = data;
	return crp;

out_free:
	free(crp);
	return NULL;
}

void call_return_processor__free(struct call_return_processor *crp)
{
	if (crp) {
		call_path_root__free(crp->cpr);
		free(crp);
	}
}

static int thread_stack__push_cp(struct thread_stack *ts, u64 ret_addr,
				 u64 timestamp, u64 ref, struct call_path *cp,
				 bool no_call, bool trace_end)
{
	struct thread_stack_entry *tse;
	int err;

	if (!cp)
		return -ENOMEM;

	if (ts->cnt == ts->sz) {
		err = thread_stack__grow(ts);
		if (err)
			return err;
	}

	tse = &ts->stack[ts->cnt++];
	tse->ret_addr = ret_addr;
	tse->timestamp = timestamp;
	tse->ref = ref;
	tse->branch_count = ts->branch_count;
	tse->insn_count = ts->insn_count;
	tse->cyc_count = ts->cyc_count;
	tse->cp = cp;
	tse->no_call = no_call;
	tse->trace_end = trace_end;
	tse->non_call = false;
	tse->db_id = 0;

	return 0;
}

static int thread_stack__pop_cp(struct thread *thread, struct thread_stack *ts,
				u64 ret_addr, u64 timestamp, u64 ref,
				struct symbol *sym)
{
	int err;

	if (!ts->cnt)
		return 1;

	if (ts->cnt == 1) {
		struct thread_stack_entry *tse = &ts->stack[0];

		if (tse->cp->sym == sym)
			return thread_stack__call_return(thread, ts, --ts->cnt,
							 timestamp, ref, false);
	}

	if (ts->stack[ts->cnt - 1].ret_addr == ret_addr &&
	    !ts->stack[ts->cnt - 1].non_call) {
		return thread_stack__call_return(thread, ts, --ts->cnt,
						 timestamp, ref, false);
	} else {
		size_t i = ts->cnt - 1;

		while (i--) {
			if (ts->stack[i].ret_addr != ret_addr ||
			    ts->stack[i].non_call)
				continue;
			i += 1;
			while (ts->cnt > i) {
				err = thread_stack__call_return(thread, ts,
								--ts->cnt,
								timestamp, ref,
								true);
				if (err)
					return err;
			}
			return thread_stack__call_return(thread, ts, --ts->cnt,
							 timestamp, ref, false);
		}
	}

	return 1;
}
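
/*
 * Note the return-value convention of thread_stack__pop_cp(): negative means
 * error, 1 means no matching entry was found, and otherwise it is the result
 * of processing the popped entry (0 on success).  When the match is deeper in
 * the stack, the intervening entries are flushed with CALL_RETURN_NO_RETURN.
 * On 1, thread_stack__process() falls back to thread_stack__no_call_return().
 */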

static int thread_stack__bottom(struct thread_stack *ts,
				struct perf_sample *sample,
				struct addr_location *from_al,
				struct addr_location *to_al, u64 ref)
{
	struct call_path_root *cpr = ts->crp->cpr;
	struct call_path *cp;
	struct symbol *sym;
	u64 ip;

	if (sample->ip) {
		ip = sample->ip;
		sym = from_al->sym;
	} else if (sample->addr) {
		ip = sample->addr;
		sym = to_al->sym;
	} else {
		return 0;
	}

	cp = call_path__findnew(cpr, &cpr->call_path, sym, ip,
				ts->kernel_start);

	return thread_stack__push_cp(ts, ip, sample->time, ref, cp,
				     true, false);
}

static int thread_stack__no_call_return(struct thread *thread,
					struct thread_stack *ts,
					struct perf_sample *sample,
					struct addr_location *from_al,
					struct addr_location *to_al, u64 ref)
{
	struct call_path_root *cpr = ts->crp->cpr;
	struct call_path *root = &cpr->call_path;
	struct symbol *fsym = from_al->sym;
	struct symbol *tsym = to_al->sym;
	struct call_path *cp, *parent;
	u64 ks = ts->kernel_start;
	u64 addr = sample->addr;
	u64 tm = sample->time;
	u64 ip = sample->ip;
	int err;

	if (ip >= ks && addr < ks) {
		/* Return to userspace, so pop all kernel addresses */
		while (thread_stack__in_kernel(ts)) {
			err = thread_stack__call_return(thread, ts, --ts->cnt,
							tm, ref, true);
			if (err)
				return err;
		}

		/* If the stack is empty, push the userspace address */
		if (!ts->cnt) {
			cp = call_path__findnew(cpr, root, tsym, addr, ks);
			return thread_stack__push_cp(ts, 0, tm, ref, cp, true,
						     false);
		}
	} else if (thread_stack__in_kernel(ts) && ip < ks) {
		/* Return to userspace, so pop all kernel addresses */
		while (thread_stack__in_kernel(ts)) {
			err = thread_stack__call_return(thread, ts, --ts->cnt,
							tm, ref, true);
			if (err)
				return err;
		}
	}

	if (ts->cnt)
		parent = ts->stack[ts->cnt - 1].cp;
	else
		parent = root;

	if (parent->sym == from_al->sym) {
		/*
		 * At the bottom of the stack, assume the missing 'call' was
		 * before the trace started. So, pop the current symbol and push
		 * the 'to' symbol.
		 */
		if (ts->cnt == 1) {
			err = thread_stack__call_return(thread, ts, --ts->cnt,
							tm, ref, false);
			if (err)
				return err;
		}

		if (!ts->cnt) {
			cp = call_path__findnew(cpr, root, tsym, addr, ks);

			return thread_stack__push_cp(ts, addr, tm, ref, cp,
						     true, false);
		}

		/*
		 * Otherwise assume the 'return' is being used as a jump (e.g.
		 * retpoline) and just push the 'to' symbol.
		 */
		cp = call_path__findnew(cpr, parent, tsym, addr, ks);

		err = thread_stack__push_cp(ts, 0, tm, ref, cp, true, false);
		if (!err)
			ts->stack[ts->cnt - 1].non_call = true;

		return err;
	}

	/*
	 * Assume 'parent' has not yet returned, so push 'to', and then push and
	 * pop 'from'.
	 */

	cp = call_path__findnew(cpr, parent, tsym, addr, ks);

	err = thread_stack__push_cp(ts, addr, tm, ref, cp, true, false);
	if (err)
		return err;

	cp = call_path__findnew(cpr, cp, fsym, ip, ks);

	err = thread_stack__push_cp(ts, ip, tm, ref, cp, true, false);
	if (err)
		return err;

	return thread_stack__call_return(thread, ts, --ts->cnt, tm, ref, false);
}

static int thread_stack__trace_begin(struct thread *thread,
				     struct thread_stack *ts, u64 timestamp,
				     u64 ref)
{
	struct thread_stack_entry *tse;
	int err;

	if (!ts->cnt)
		return 0;

	/* Pop trace end */
	tse = &ts->stack[ts->cnt - 1];
	if (tse->trace_end) {
		err = thread_stack__call_return(thread, ts, --ts->cnt,
						timestamp, ref, false);
		if (err)
			return err;
	}

	return 0;
}

static int thread_stack__trace_end(struct thread_stack *ts,
				   struct perf_sample *sample, u64 ref)
{
	struct call_path_root *cpr = ts->crp->cpr;
	struct call_path *cp;
	u64 ret_addr;

	/* No point having 'trace end' on the bottom of the stack */
	if (!ts->cnt || (ts->cnt == 1 && ts->stack[0].ref == ref))
		return 0;

	cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp, NULL, 0,
				ts->kernel_start);

	ret_addr = sample->ip + sample->insn_len;

	return thread_stack__push_cp(ts, ret_addr, sample->time, ref, cp,
				     false, true);
}

static bool is_x86_retpoline(const char *name)
{
	const char *p = strstr(name, "__x86_indirect_thunk_");

	return p == name || !strcmp(name, "__indirect_thunk_start");
}
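
/*
 * For example, this matches the kernel's indirect branch thunks such as
 * "__x86_indirect_thunk_rax", which begin with the prefix tested above.
 */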

/*
 * x86 retpoline functions pollute the call graph. This function removes them.
 * This does not handle function return thunks, nor is there any improvement
 * for the handling of inline thunks or extern thunks.
 */
static int thread_stack__x86_retpoline(struct thread_stack *ts,
				       struct perf_sample *sample,
				       struct addr_location *to_al)
{
	struct thread_stack_entry *tse = &ts->stack[ts->cnt - 1];
	struct call_path_root *cpr = ts->crp->cpr;
	struct symbol *sym = tse->cp->sym;
	struct symbol *tsym = to_al->sym;
	struct call_path *cp;

	if (sym && is_x86_retpoline(sym->name)) {
		/*
		 * This is a x86 retpoline fn. It pollutes the call graph by
		 * showing up everywhere there is an indirect branch, but does
		 * not itself mean anything. Here the top-of-stack is removed,
		 * by decrementing the stack count, and then further down, the
		 * resulting top-of-stack is replaced with the actual target.
		 * The result is that the retpoline functions will no longer
		 * appear in the call graph. Note this only affects the call
		 * graph, since all the original branches are left unchanged.
		 */
		ts->cnt -= 1;
		sym = ts->stack[ts->cnt - 2].cp->sym;
		if (sym && sym == tsym && to_al->addr != tsym->start) {
			/*
			 * Target is back to the middle of the symbol we came
			 * from so assume it is an indirect jmp and forget it
			 * altogether.
			 */
			ts->cnt -= 1;
			return 0;
		}
	} else if (sym && sym == tsym) {
		/*
		 * Target is back to the symbol we came from so assume it is an
		 * indirect jmp and forget it altogether.
		 */
		ts->cnt -= 1;
		return 0;
	}

	cp = call_path__findnew(cpr, ts->stack[ts->cnt - 2].cp, tsym,
				sample->addr, ts->kernel_start);
	if (!cp)
		return -ENOMEM;

	/* Replace the top-of-stack with the actual target */
	ts->stack[ts->cnt - 1].cp = cp;

	return 0;
}

int thread_stack__process(struct thread *thread, struct comm *comm,
			  struct perf_sample *sample,
			  struct addr_location *from_al,
			  struct addr_location *to_al, u64 ref,
			  struct call_return_processor *crp)
{
	struct thread_stack *ts = thread__stack(thread, sample->cpu);
	enum retpoline_state_t rstate;
	int err = 0;

	if (ts && !ts->crp) {
		/* Supersede thread_stack__event() */
		thread_stack__reset(thread, ts);
		ts = NULL;
	}

	if (!ts) {
		ts = thread_stack__new(thread, sample->cpu, crp);
		if (!ts)
			return -ENOMEM;
		ts->comm = comm;
	}

	rstate = ts->rstate;
	if (rstate == X86_RETPOLINE_DETECTED)
		ts->rstate = X86_RETPOLINE_POSSIBLE;

	/* Flush stack on exec */
	if (ts->comm != comm && thread->pid_ == thread->tid) {
		err = __thread_stack__flush(thread, ts);
		if (err)
			return err;
		ts->comm = comm;
	}

	/* If the stack is empty, put the current symbol on the stack */
	if (!ts->cnt) {
		err = thread_stack__bottom(ts, sample, from_al, to_al, ref);
		if (err)
			return err;
	}

	ts->branch_count += 1;
	ts->insn_count += sample->insn_cnt;
	ts->cyc_count += sample->cyc_cnt;
	ts->last_time = sample->time;

	if (sample->flags & PERF_IP_FLAG_CALL) {
		bool trace_end = sample->flags & PERF_IP_FLAG_TRACE_END;
		struct call_path_root *cpr = ts->crp->cpr;
		struct call_path *cp;
		u64 ret_addr;

		if (!sample->ip || !sample->addr)
			return 0;

		ret_addr = sample->ip + sample->insn_len;
		if (ret_addr == sample->addr)
			return 0; /* Zero-length calls are excluded */

		cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp,
					to_al->sym, sample->addr,
					ts->kernel_start);
		err = thread_stack__push_cp(ts, ret_addr, sample->time, ref,
					    cp, false, trace_end);

		/*
		 * A call to the same symbol, but not to the start of the
		 * symbol, may be the start of a x86 retpoline.
		 */
		if (!err && rstate == X86_RETPOLINE_POSSIBLE && to_al->sym &&
		    from_al->sym == to_al->sym &&
		    to_al->addr != to_al->sym->start)
			ts->rstate = X86_RETPOLINE_DETECTED;

	} else if (sample->flags & PERF_IP_FLAG_RETURN) {
		if (!sample->ip || !sample->addr)
			return 0;

		/* x86 retpoline 'return' doesn't match the stack */
		if (rstate == X86_RETPOLINE_DETECTED && ts->cnt > 2 &&
		    ts->stack[ts->cnt - 1].ret_addr != sample->addr)
			return thread_stack__x86_retpoline(ts, sample, to_al);

		err = thread_stack__pop_cp(thread, ts, sample->addr,
					   sample->time, ref, from_al->sym);
		if (err) {
			if (err < 0)
				return err;
			err = thread_stack__no_call_return(thread, ts, sample,
							   from_al, to_al, ref);
		}
	} else if (sample->flags & PERF_IP_FLAG_TRACE_BEGIN) {
		err = thread_stack__trace_begin(thread, ts, sample->time, ref);
	} else if (sample->flags & PERF_IP_FLAG_TRACE_END) {
		err = thread_stack__trace_end(ts, sample, ref);
	} else if (sample->flags & PERF_IP_FLAG_BRANCH &&
		   from_al->sym != to_al->sym && to_al->sym &&
		   to_al->addr == to_al->sym->start) {
		struct call_path_root *cpr = ts->crp->cpr;
		struct call_path *cp;

		/*
		 * The compiler might optimize a call/ret combination by making
		 * it a jmp. Make that visible by recording on the stack a
		 * branch to the start of a different symbol. Note, that means
		 * when a ret pops the stack, all jmps must be popped off first.
		 */
		cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp,
					to_al->sym, sample->addr,
					ts->kernel_start);
		err = thread_stack__push_cp(ts, 0, sample->time, ref, cp, false,
					    false);
		if (!err)
			ts->stack[ts->cnt - 1].non_call = true;
	}

	return err;
}
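
/*
 * A sketch of how a 'return' sample flows through thread_stack__process()
 * (the addresses and symbols are illustrative): given a RET with flags
 * including PERF_IP_FLAG_RETURN, ip 0x401020 in foo() and addr 0x400530 in
 * main(), thread_stack__pop_cp() looks for ret_addr 0x400530 on the stack;
 * if it is not found, thread_stack__no_call_return() reconciles the stack
 * instead.
 */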

size_t thread_stack__depth(struct thread *thread, int cpu)
{
	struct thread_stack *ts = thread__stack(thread, cpu);

	if (!ts)
		return 0;
	return ts->cnt;
}