/*
 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
 *
 * @Apple_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <sys/kdebug_common.h>
#include <vm/vm_kern_xnu.h>

LCK_GRP_DECLARE(kdebug_lck_grp, "kdebug");
int kdbg_debug = 0;

extern struct kd_control kd_control_trace, kd_control_triage;

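/*
 * Take the per-control-structure storage spin lock with interrupts disabled,
 * returning the previous interrupt state so the matching unlock can restore
 * it.
 */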
int
kdebug_storage_lock(struct kd_control *kd_ctrl_page)
{
	int intrs_en = ml_set_interrupts_enabled(false);
	lck_spin_lock_grp(&kd_ctrl_page->kdc_storage_lock, &kdebug_lck_grp);
	return intrs_en;
}

void
kdebug_storage_unlock(struct kd_control *kd_ctrl_page, int intrs_en)
{
	lck_spin_unlock(&kd_ctrl_page->kdc_storage_lock);
	ml_set_interrupts_enabled(intrs_en);
}

// Turn on boot tracing and set the number of events.
static TUNABLE(unsigned int, new_nkdbufs, "trace", 0);
// Enable wrapping during boot tracing.
TUNABLE(unsigned int, trace_wrap, "trace_wrap", 0);
// The filter description to apply to boot tracing.
static TUNABLE_STR(trace_typefilter, 256, "trace_typefilter", "");

// Turn on wake tracing and set the number of events.
TUNABLE(unsigned int, wake_nkdbufs, "trace_wake", 0);
// Write trace events to a file in the event of a panic.
TUNABLE(unsigned int, write_trace_on_panic, "trace_panic", 0);

// Obsolete leak logging system.
TUNABLE(int, log_leaks, "-l", 0);
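
/*
 * These tunables are read from boot-args.  As an illustrative sketch (the
 * values are hypothetical; only the boot-arg names come from the TUNABLE
 * declarations above), something like the following would enable boot
 * tracing with wrapping plus wake tracing:
 *
 *   nvram boot-args="trace=1000000 trace_wrap=1 trace_wake=150000"
 */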

void
kdebug_startup(void)
{
	lck_spin_init(&kd_control_trace.kdc_storage_lock, &kdebug_lck_grp, LCK_ATTR_NULL);
	lck_spin_init(&kd_control_triage.kdc_storage_lock, &kdebug_lck_grp, LCK_ATTR_NULL);
	kdebug_init(new_nkdbufs, trace_typefilter,
	    (trace_wrap ? KDOPT_WRAPPING : 0) | KDOPT_ATBOOT);
	create_buffers_triage();
}

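/*
 * Number of CPUs to allocate per-CPU buffer info for.  On x86_64 the early
 * maximum CPU number (plus one) is used; elsewhere the live CPU count is.
 */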
uint32_t
kdbg_cpu_count(void)
{
#if defined(__x86_64__)
	return ml_early_cpu_max_number() + 1;
#else // defined(__x86_64__)
	return ml_get_cpu_count();
#endif // !defined(__x86_64__)
}

/*
 * Both kdebug_timestamp() and kdebug_using_continuous_time() are known to
 * kexts, and going forward we always want to use mach_continuous_time().
 * Keep these two routines as-is so that TRACE mode use outside the kernel
 * stays intact.  TRIAGE mode explicitly uses only mach_continuous_time()
 * for its timestamps.
 */
bool
kdebug_using_continuous_time(void)
{
	return kd_control_trace.kdc_flags & KDBG_CONTINUOUS_TIME;
}

uint64_t
kdebug_timestamp(void)
{
	if (kdebug_using_continuous_time()) {
		return mach_continuous_time();
	} else {
		return mach_absolute_time();
	}
}

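/*
 * Allocate the buffers for one control structure: a kd_bufinfo per CPU, a
 * copy buffer for merging events on read, and the storage-unit regions,
 * threading every storage unit onto the free list.  Returns 0 on success or
 * ENOSPC, in which case any partial allocation is torn down.
 */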
int
create_buffers(
	struct kd_control *kd_ctrl_page,
	struct kd_buffer *kd_data_page,
	vm_tag_t tag)
{
	unsigned int i;
	unsigned int p_buffer_size;
	unsigned int f_buffer_size;
	unsigned int f_buffers;
	int error = 0;
	int ncpus, count_storage_units = 0;

	struct kd_bufinfo *kdbip = NULL;
	struct kd_region *kd_bufs = NULL;
	int kdb_storage_count = kd_data_page->kdb_storage_count;

	ncpus = kd_ctrl_page->alloc_cpus;

	kdbip = kalloc_type_tag(struct kd_bufinfo, ncpus, Z_WAITOK | Z_ZERO, tag);
	if (kdbip == NULL) {
		error = ENOSPC;
		goto out;
	}
	kd_data_page->kdb_info = kdbip;

	f_buffers = kdb_storage_count / N_STORAGE_UNITS_PER_BUFFER;
	kd_data_page->kdb_region_count = f_buffers;

	f_buffer_size = N_STORAGE_UNITS_PER_BUFFER * sizeof(struct kd_storage);
	p_buffer_size = (kdb_storage_count % N_STORAGE_UNITS_PER_BUFFER) * sizeof(struct kd_storage);

	if (p_buffer_size) {
		kd_data_page->kdb_region_count++;
	}

	if (kd_data_page->kdcopybuf == 0) {
		if (kmem_alloc(kernel_map, (vm_offset_t *)&kd_data_page->kdcopybuf,
		    (vm_size_t) kd_ctrl_page->kdebug_kdcopybuf_size,
		    KMA_DATA | KMA_ZERO, tag) != KERN_SUCCESS) {
			error = ENOSPC;
			goto out;
		}
	}

	kd_bufs = kalloc_type_tag(struct kd_region, kd_data_page->kdb_region_count,
	    Z_WAITOK | Z_ZERO, tag);
	if (kd_bufs == NULL) {
		error = ENOSPC;
		goto out;
	}
	kd_data_page->kd_bufs = kd_bufs;

	for (i = 0; i < f_buffers; i++) {
		if (kmem_alloc(kernel_map, (vm_offset_t *)&kd_bufs[i].kdr_addr,
		    (vm_size_t)f_buffer_size, KMA_DATA | KMA_ZERO, tag) != KERN_SUCCESS) {
			error = ENOSPC;
			goto out;
		}

		kd_bufs[i].kdr_size = f_buffer_size;
	}
	if (p_buffer_size) {
		if (kmem_alloc(kernel_map, (vm_offset_t *)&kd_bufs[i].kdr_addr,
		    (vm_size_t)p_buffer_size, KMA_DATA | KMA_ZERO, tag) != KERN_SUCCESS) {
			error = ENOSPC;
			goto out;
		}

		kd_bufs[i].kdr_size = p_buffer_size;
	}

	count_storage_units = 0;
	for (i = 0; i < kd_data_page->kdb_region_count; i++) {
		struct kd_storage *kds;
		uint16_t n_elements;
		static_assert(N_STORAGE_UNITS_PER_BUFFER <= UINT16_MAX);
		assert(kd_bufs[i].kdr_size <= N_STORAGE_UNITS_PER_BUFFER *
		    sizeof(struct kd_storage));

		n_elements = kd_bufs[i].kdr_size / sizeof(struct kd_storage);
		kds = kd_bufs[i].kdr_addr;

		for (uint16_t n = 0; n < n_elements; n++) {
			kds[n].kds_next.buffer_index = kd_ctrl_page->kds_free_list.buffer_index;
			kds[n].kds_next.offset = kd_ctrl_page->kds_free_list.offset;

			kd_ctrl_page->kds_free_list.buffer_index = i;
			kd_ctrl_page->kds_free_list.offset = n;
		}
		count_storage_units += n_elements;
	}

	kd_data_page->kdb_storage_count = count_storage_units;

	for (i = 0; i < ncpus; i++) {
		kdbip[i].kd_list_head.raw = KDS_PTR_NULL;
		kdbip[i].kd_list_tail.raw = KDS_PTR_NULL;
		kdbip[i].kd_lostevents = false;
		kdbip[i].num_bufs = 0;
	}

	kd_ctrl_page->kdc_flags |= KDBG_BUFINIT;

	kd_ctrl_page->kdc_storage_used = 0;
out:
	if (error) {
		delete_buffers(kd_ctrl_page, kd_data_page);
	}

	return error;
}

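/*
 * Free everything create_buffers() allocated (regions, copy buffer, and
 * per-CPU buffer info) and reset the control structure's free list, CPU
 * counts, and KDBG_BUFINIT flag.
 */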
void
delete_buffers(struct kd_control *kd_ctrl_page,
    struct kd_buffer *kd_data_page)
{
	unsigned int i;
	int kdb_region_count = kd_data_page->kdb_region_count;

	struct kd_bufinfo *kdbip = kd_data_page->kdb_info;
	struct kd_region *kd_bufs = kd_data_page->kd_bufs;

	if (kd_bufs) {
		for (i = 0; i < kdb_region_count; i++) {
			if (kd_bufs[i].kdr_addr) {
				kmem_free(kernel_map, (vm_offset_t)kd_bufs[i].kdr_addr, (vm_size_t)kd_bufs[i].kdr_size);
			}
		}
		kfree_type(struct kd_region, kdb_region_count, kd_bufs);

		kd_data_page->kd_bufs = NULL;
		kd_data_page->kdb_region_count = 0;
	}
	if (kd_data_page->kdcopybuf) {
		kmem_free(kernel_map, (vm_offset_t)kd_data_page->kdcopybuf, kd_ctrl_page->kdebug_kdcopybuf_size);

		kd_data_page->kdcopybuf = NULL;
	}
	kd_ctrl_page->kds_free_list.raw = KDS_PTR_NULL;

	if (kdbip) {
		kfree_type(struct kd_bufinfo, kd_ctrl_page->alloc_cpus, kdbip);
		kd_data_page->kdb_info = NULL;
	}
	kd_ctrl_page->kdc_coprocs = NULL;
	kd_ctrl_page->kdebug_cpus = 0;
	kd_ctrl_page->alloc_cpus = 0;
	kd_ctrl_page->kdc_flags &= ~KDBG_BUFINIT;
}

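/*
 * Make sure the given CPU has a storage unit to record into: reuse the tail
 * if it still has room, otherwise take a unit from the free list or, when
 * wrapping is allowed, steal the unit whose last event is oldest.  Returns
 * false only if tracing had to be disabled instead.
 */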
static bool
allocate_storage_unit(struct kd_control *kd_ctrl_page,
    struct kd_buffer *kd_data_page, int cpu)
{
	union kds_ptr kdsp;
	struct kd_storage *kdsp_actual, *kdsp_next_actual;
	struct kd_bufinfo *kdbip, *kdbp, *kdbp_vict, *kdbp_try;
	uint64_t oldest_ts, ts;
	bool retval = true;
	struct kd_region *kd_bufs;

	int intrs_en = kdebug_storage_lock(kd_ctrl_page);

	kdbp = &kd_data_page->kdb_info[cpu];
	kd_bufs = kd_data_page->kd_bufs;
	kdbip = kd_data_page->kdb_info;

	/* If someone beat us to the allocate, return success */
	if (kdbp->kd_list_tail.raw != KDS_PTR_NULL) {
		kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdbp->kd_list_tail);

		if (kdsp_actual->kds_bufindx < kd_ctrl_page->kdebug_events_per_storage_unit) {
			goto out;
		}
	}

	if ((kdsp = kd_ctrl_page->kds_free_list).raw != KDS_PTR_NULL) {
		/*
		 * If there's a free storage unit, grab it from the free list.
		 */
		kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdsp);
		kd_ctrl_page->kds_free_list = kdsp_actual->kds_next;

		kd_ctrl_page->kdc_storage_used++;
	} else {
		/*
		 * Otherwise, we're going to lose events and repurpose the oldest
		 * storage unit we can find.
		 */
		if (kd_ctrl_page->kdc_live_flags & KDBG_NOWRAP) {
			kd_ctrl_page->kdc_emit = KDEMIT_DISABLE;
			kd_ctrl_page->kdc_live_flags |= KDBG_WRAPPED;
			kdebug_enable = 0;
			kd_ctrl_page->enabled = 0;
			commpage_update_kdebug_state();
			kdbp->kd_lostevents = true;
			retval = false;
			goto out;
		}
		kdbp_vict = NULL;
		oldest_ts = UINT64_MAX;

		for (kdbp_try = &kdbip[0]; kdbp_try < &kdbip[kd_ctrl_page->kdebug_cpus]; kdbp_try++) {
			if (kdbp_try->kd_list_head.raw == KDS_PTR_NULL) {
				/*
				 * no storage unit to steal
				 */
				continue;
			}

			kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdbp_try->kd_list_head);

			if (kdsp_actual->kds_bufcnt < kd_ctrl_page->kdebug_events_per_storage_unit) {
				/*
				 * make sure we don't steal the storage unit
				 * being actively recorded to...  need to
				 * move on because we don't want an out-of-order
				 * set of events showing up later
				 */
				continue;
			}

			/*
			 * When wrapping, steal the storage unit with the
			 * earliest timestamp on its last event, instead of the
			 * earliest timestamp on the first event.  This allows a
			 * storage unit with more recent events to be preserved,
			 * even if the storage unit contains events that are
			 * older than those found in other CPUs.
			 */
			ts = kdbg_get_timestamp(&kdsp_actual->kds_records[kd_ctrl_page->kdebug_events_per_storage_unit - 1]);
			if (ts < oldest_ts) {
				oldest_ts = ts;
				kdbp_vict = kdbp_try;
			}
		}
		if (kdbp_vict == NULL && kd_ctrl_page->mode == KDEBUG_MODE_TRACE) {
			kd_ctrl_page->kdc_emit = KDEMIT_DISABLE;
			kdebug_enable = 0;
			kd_ctrl_page->enabled = 0;
			commpage_update_kdebug_state();
			retval = false;
			goto out;
		}
		kdsp = kdbp_vict->kd_list_head;
		kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdsp);
		kdbp_vict->kd_list_head = kdsp_actual->kds_next;

		if (kdbp_vict->kd_list_head.raw != KDS_PTR_NULL) {
			kdsp_next_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdbp_vict->kd_list_head);
			kdsp_next_actual->kds_lostevents = true;
		} else {
			kdbp_vict->kd_lostevents = true;
		}

		if (kd_ctrl_page->kdc_oldest_time < oldest_ts) {
			kd_ctrl_page->kdc_oldest_time = oldest_ts;
		}
		kd_ctrl_page->kdc_live_flags |= KDBG_WRAPPED;
	}

	if (kd_ctrl_page->mode == KDEBUG_MODE_TRACE) {
		kdsp_actual->kds_timestamp = kdebug_timestamp();
	} else {
		kdsp_actual->kds_timestamp = mach_continuous_time();
	}

	kdsp_actual->kds_next.raw = KDS_PTR_NULL;
	kdsp_actual->kds_bufcnt   = 0;
	kdsp_actual->kds_readlast = 0;

	kdsp_actual->kds_lostevents = kdbp->kd_lostevents;
	kdbp->kd_lostevents = false;
	kdsp_actual->kds_bufindx = 0;

	if (kdbp->kd_list_head.raw == KDS_PTR_NULL) {
		kdbp->kd_list_head = kdsp;
	} else {
		POINTER_FROM_KDS_PTR(kd_bufs, kdbp->kd_list_tail)->kds_next = kdsp;
	}
	kdbp->kd_list_tail = kdsp;
out:
	kdebug_storage_unlock(kd_ctrl_page, intrs_en);

	return retval;
}

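/*
 * Return the storage unit at the head of this CPU's list to the free list,
 * but only if it is still the head -- it may already have been stolen by
 * allocate_storage_unit() on behalf of another CPU.
 */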
static void
release_storage_unit(struct kd_control *kd_ctrl_page, struct kd_buffer *kd_data_page, int cpu, uint32_t kdsp_raw)
{
	struct kd_storage *kdsp_actual;
	struct kd_bufinfo *kdbp;
	union kds_ptr kdsp;

	kdbp = &kd_data_page->kdb_info[cpu];

	kdsp.raw = kdsp_raw;

	int intrs_en = kdebug_storage_lock(kd_ctrl_page);

	if (kdsp.raw == kdbp->kd_list_head.raw) {
		/*
		 * It's possible for the storage unit pointed to by kdsp to have
		 * already been stolen, so check that it's still the head of the
		 * list now that we're behind the lock that protects adding to and
		 * removing from the queue.  Since we only ever release and steal
		 * units from that position, if it's no longer the head we have
		 * nothing to do in this context.
		 */
		kdsp_actual = POINTER_FROM_KDS_PTR(kd_data_page->kd_bufs, kdsp);
		kdbp->kd_list_head = kdsp_actual->kds_next;

		kdsp_actual->kds_next = kd_ctrl_page->kds_free_list;
		kd_ctrl_page->kds_free_list = kdsp;

		kd_ctrl_page->kdc_storage_used--;
	}

	kdebug_storage_unlock(kd_ctrl_page, intrs_en);
}

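/*
 * Prevent storage units from being stolen while a reader merges events.
 * Saves the current emit filter and live flags for the caller and reports
 * whether the buffers had already wrapped.
 */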
bool
kdebug_disable_wrap(struct kd_control *ctl,
    kdebug_emit_filter_t *old_emit, kdebug_live_flags_t *old_live)
{
	int intrs_en = kdebug_storage_lock(ctl);

	*old_emit = ctl->kdc_emit;
	*old_live = ctl->kdc_live_flags;

	bool wrapped = ctl->kdc_live_flags & KDBG_WRAPPED;
	ctl->kdc_live_flags &= ~KDBG_WRAPPED;
	ctl->kdc_live_flags |= KDBG_NOWRAP;

	kdebug_storage_unlock(ctl, intrs_en);

	return wrapped;
}

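/*
 * Allow wrapping again after a read and restore the saved emit filter.
 */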
static void
_enable_wrap(struct kd_control *kd_ctrl_page, kdebug_emit_filter_t emit)
{
	int intrs_en = kdebug_storage_lock(kd_ctrl_page);
	kd_ctrl_page->kdc_live_flags &= ~KDBG_NOWRAP;
	if (emit) {
		kd_ctrl_page->kdc_emit = emit;
	}
	kdebug_storage_unlock(kd_ctrl_page, intrs_en);
}

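/*
 * Record a single event into the per-CPU storage for this control structure.
 * Runs with preemption disabled; timestamps supplied by the caller are
 * converted to the control structure's timebase, and events older than the
 * oldest merged time are dropped.
 */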
__attribute__((always_inline))
void
kernel_debug_write(struct kd_control *kd_ctrl_page,
    struct kd_buffer *kd_data_page,
    struct kd_record      kd_rec)
{
	uint64_t now = 0;
	uint32_t bindx;
	kd_buf *kd;
	int cpu;
	struct kd_bufinfo *kdbp;
	struct kd_storage *kdsp_actual;
	union kds_ptr kds_raw;

	disable_preemption();

	if (kd_ctrl_page->enabled == 0) {
		goto out;
	}

	if (kd_rec.cpu == -1) {
		cpu = cpu_number();
	} else {
		cpu = kd_rec.cpu;
	}

	kdbp = &kd_data_page->kdb_info[cpu];

	bool timestamp_is_continuous = kdbp->continuous_timestamps;

	if (kd_rec.timestamp != -1) {
		if (kdebug_using_continuous_time()) {
			if (!timestamp_is_continuous) {
				kd_rec.timestamp = absolutetime_to_continuoustime(kd_rec.timestamp);
			}
		} else {
			if (timestamp_is_continuous) {
				kd_rec.timestamp = continuoustime_to_absolutetime(kd_rec.timestamp);
			}
		}
		kd_rec.timestamp &= KDBG_TIMESTAMP_MASK;
		if (kd_rec.timestamp < kd_ctrl_page->kdc_oldest_time) {
			if (kdbp->latest_past_event_timestamp < kd_rec.timestamp) {
				kdbp->latest_past_event_timestamp = kd_rec.timestamp;
			}
			goto out;
		}
	}

retry_q:
	kds_raw = kdbp->kd_list_tail;

	if (kds_raw.raw != KDS_PTR_NULL) {
		kdsp_actual = POINTER_FROM_KDS_PTR(kd_data_page->kd_bufs, kds_raw);
		bindx = kdsp_actual->kds_bufindx;
	} else {
		kdsp_actual = NULL;
		bindx = kd_ctrl_page->kdebug_events_per_storage_unit;
	}

	if (kdsp_actual == NULL || bindx >= kd_ctrl_page->kdebug_events_per_storage_unit) {
		if (allocate_storage_unit(kd_ctrl_page, kd_data_page, cpu) == false) {
			/*
			 * this can only happen if wrapping
			 * has been disabled
			 */
			goto out;
		}
		goto retry_q;
	}

	if (kd_rec.timestamp != -1) {
		/*
		 * IOP entries can be allocated before xnu allocates and inits the
		 * buffer.  Intel also uses a special 0 value as an early tracing
		 * timestamp sentinel to mark the start of the trace time of
		 * interest.
		 */
		if (kd_rec.timestamp < kdsp_actual->kds_timestamp) {
			kdsp_actual->kds_timestamp = kd_rec.timestamp;
		}
		now = kd_rec.timestamp;
	} else {
		if (kd_ctrl_page->mode == KDEBUG_MODE_TRACE) {
			now = kdebug_timestamp() & KDBG_TIMESTAMP_MASK;
		} else {
			now = mach_continuous_time() & KDBG_TIMESTAMP_MASK;
		}
	}

	if (!OSCompareAndSwap(bindx, bindx + 1, &kdsp_actual->kds_bufindx)) {
		goto retry_q;
	}

	kd = &kdsp_actual->kds_records[bindx];

	if (kd_ctrl_page->kdc_flags & KDBG_DEBUGID_64) {
		/* The debugid has been passed in arg 4. */
		kd->debugid = 0;
	} else {
		kd->debugid = kd_rec.debugid;
	}

	kd->arg1 = kd_rec.arg1;
	kd->arg2 = kd_rec.arg2;
	kd->arg3 = kd_rec.arg3;
	kd->arg4 = kd_rec.arg4;
	kd->arg5 = kd_rec.arg5;

	kdbg_set_timestamp_and_cpu(kd, now, cpu);

	OSAddAtomic(1, &kdsp_actual->kds_bufcnt);

out:
	enable_preemption();
}

// Read events from kdebug storage units into a user space buffer or file.
//
// This code runs while events are emitted -- storage unit allocation and
// deallocation will synchronize with the emitters.  Only one reader per
// control structure is allowed.
int
kernel_debug_read(struct kd_control *kd_ctrl_page,
    struct kd_buffer *kd_data_page, user_addr_t buffer, size_t *number,
    vnode_t vp, vfs_context_t ctx, uint32_t file_version)
{
	size_t count;
	unsigned int cpu, min_cpu;
	uint64_t barrier_min = 0, barrier_max = 0, t, earliest_time;
	int error = 0;
	kd_buf *tempbuf;
	uint32_t rcursor;
	kd_buf lostevent;
	union kds_ptr kdsp;
	bool traced_retrograde = false;
	struct kd_storage *kdsp_actual;
	struct kd_bufinfo *kdbp;
	struct kd_bufinfo *min_kdbp;
	size_t tempbuf_count;
	uint32_t tempbuf_number;
	kdebug_emit_filter_t old_emit;
	uint32_t old_live_flags;
	bool out_of_events = false;
	bool wrapped = false;
	bool set_preempt = true;
	bool should_disable = false;

	struct kd_bufinfo *kdbip = kd_data_page->kdb_info;
	struct kd_region *kd_bufs = kd_data_page->kd_bufs;

	assert(number != NULL);
	count = *number / sizeof(kd_buf);
	*number = 0;

	if (count == 0 || !(kd_ctrl_page->kdc_flags & KDBG_BUFINIT) || kd_data_page->kdcopybuf == 0) {
		return EINVAL;
	}

	if (kd_ctrl_page->mode == KDEBUG_MODE_TRIAGE) {
		/*
		 * A corpse can be created due to 'TASK_HAS_TOO_MANY_THREADS',
		 * and that can be handled by a callout thread that already
		 * has eager preemption set.
		 * So check to see if we are dealing with one such thread.
		 */
		set_preempt = !(thread_is_eager_preempt(current_thread()));
	}

	if (set_preempt) {
		thread_set_eager_preempt(current_thread());
	}

	memset(&lostevent, 0, sizeof(lostevent));
	lostevent.debugid = TRACE_LOST_EVENTS;

	/*
	 * Capture the current time.  Only sort events that have occurred
	 * before now.  Since the IOPs are being flushed here, it is possible
	 * that events occur on the AP while running live tracing.
	 */
	if (kd_ctrl_page->mode == KDEBUG_MODE_TRACE) {
		barrier_max = kdebug_timestamp() & KDBG_TIMESTAMP_MASK;
	} else {
		barrier_max = mach_continuous_time() & KDBG_TIMESTAMP_MASK;
	}

	/*
	 * Disable wrap so storage units cannot be stolen out from underneath us
	 * while merging events.
	 *
	 * Because we hold ktrace_lock, no other control threads can be playing
	 * with kdc_flags.  The code that emits new events could be running,
	 * but it grabs kdc_storage_lock if it needs to acquire a new storage
	 * chunk, which is where it examines kdc_flags.  If it is adding to
	 * the same chunk we're reading from, check for that below.
	 */
	wrapped = kdebug_disable_wrap(kd_ctrl_page, &old_emit, &old_live_flags);

	if (count > kd_data_page->kdb_event_count) {
		count = kd_data_page->kdb_event_count;
	}

	if ((tempbuf_count = count) > kd_ctrl_page->kdebug_kdcopybuf_count) {
		tempbuf_count = kd_ctrl_page->kdebug_kdcopybuf_count;
	}

	/*
	 * If the buffers have wrapped, do not emit additional lost events for the
	 * oldest storage units.
	 */
	if (wrapped) {
		kd_ctrl_page->kdc_live_flags &= ~KDBG_WRAPPED;

		for (cpu = 0, kdbp = &kdbip[0]; cpu < kd_ctrl_page->kdebug_cpus; cpu++, kdbp++) {
			if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) {
				continue;
			}
			kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdsp);
			kdsp_actual->kds_lostevents = false;
		}
	}

	if (kd_ctrl_page->mode == KDEBUG_MODE_TRIAGE) {
		/*
		 * In TRIAGE mode we want to extract all the current
		 * records regardless of where we stopped reading last
		 * time so that we have the best shot at getting older
		 * records for threads before the buffers are wrapped.
		 * So set:
		 * a) kd_prev_timebase to 0 so we (re-)consider older records
		 * b) readlast to 0 to initiate the search from the
		 *    1st record.
		 */
		for (cpu = 0, kdbp = &kdbip[0]; cpu < kd_ctrl_page->kdebug_cpus; cpu++, kdbp++) {
			kdbp->kd_prev_timebase = 0;
			if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) {
				continue;
			}
			kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdsp);
			kdsp_actual->kds_readlast = 0;
		}
	}

	/*
	 * Capture the earliest time where there are events for all CPUs and don't
	 * emit events with timestamps prior.
	 */
	barrier_min = kd_ctrl_page->kdc_oldest_time;

	while (count) {
		tempbuf = kd_data_page->kdcopybuf;
		tempbuf_number = 0;

		if (wrapped) {
			/*
			 * Emit a lost events tracepoint to indicate that previous events
			 * were lost -- the thread map cannot be trusted.  A new one must
			 * be taken so tools can analyze the trace in a backwards-facing
			 * fashion.
			 */
			kdbg_set_timestamp_and_cpu(&lostevent, barrier_min, 0);
			*tempbuf = lostevent;
			wrapped = false;
			goto nextevent;
		}

		/* While space left in merged events scratch buffer. */
		while (tempbuf_count) {
			bool lostevents = false;
			int lostcpu = 0;
			earliest_time = UINT64_MAX;
			min_kdbp = NULL;
			min_cpu = 0;

			/* Check each CPU's buffers for the earliest event. */
			for (cpu = 0, kdbp = &kdbip[0]; cpu < kd_ctrl_page->kdebug_cpus; cpu++, kdbp++) {
				/* Skip CPUs without data in their oldest storage unit. */
				if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) {
next_cpu:
					continue;
				}
				/* From CPU data to buffer header to buffer. */
				kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdsp);

next_event:
				/* The next event to be read from this buffer. */
				rcursor = kdsp_actual->kds_readlast;

				/* Skip this buffer if there are no events left. */
				if (rcursor == kdsp_actual->kds_bufindx) {
					continue;
				}

				if (kd_ctrl_page->mode == KDEBUG_MODE_TRIAGE) {
					/*
					 * TRIAGE mode record keeping doesn't (currently)
					 * use lostevent markers.  It also doesn't want to
					 * call release_storage_unit() in this read call.
					 * It expects the buffers to wrap and records to be
					 * reclaimed solely in that way.
					 */
					t = kdbg_get_timestamp(&kdsp_actual->kds_records[rcursor]);
					goto skip_record_checks;
				}

				/*
				 * Check that this storage unit wasn't stolen and events were
				 * lost.  This must have happened while wrapping was disabled
				 * in this function.
				 */
				if (kdsp_actual->kds_lostevents) {
					lostevents = true;
					kdsp_actual->kds_lostevents = false;

					/*
					 * The earliest event we can trust is the first one in this
					 * stolen storage unit.
					 */
					uint64_t lost_time =
					    kdbg_get_timestamp(&kdsp_actual->kds_records[0]);
					if (kd_ctrl_page->kdc_oldest_time < lost_time) {
						/*
						 * If this is the first time we've seen lost events for
						 * this gap, record its timestamp as the oldest
						 * timestamp we're willing to merge for the lost events
						 * tracepoint.
						 */
						kd_ctrl_page->kdc_oldest_time = barrier_min = lost_time;
						lostcpu = cpu;
					}
				}

				t = kdbg_get_timestamp(&kdsp_actual->kds_records[rcursor]);

				if (t > barrier_max) {
					goto next_cpu;
				}
				if (t < kdsp_actual->kds_timestamp) {
					/*
					 * This indicates the event emitter hasn't completed
					 * filling in the event (because we're looking at the
					 * buffer that the record head is using).  The max barrier
					 * timestamp should have saved us from seeing these kinds
					 * of things, but other CPUs might be slow on the up-take.
					 *
					 * Bail out so we don't get out-of-order events by
					 * continuing to read events from other CPUs' events.
					 */
					out_of_events = true;
					break;
				}

				/*
				 * Ignore events that have aged out due to wrapping or storage
				 * unit exhaustion while merging events.
				 */
				if (t < barrier_min) {
					kdsp_actual->kds_readlast++;
					if (kdsp_actual->kds_readlast >= kd_ctrl_page->kdebug_events_per_storage_unit) {
						release_storage_unit(kd_ctrl_page, kd_data_page, cpu, kdsp.raw);

						if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) {
							goto next_cpu;
						}
						kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdsp);
					}
					goto next_event;
				}

				/*
				 * Don't worry about merging any events -- just walk through
				 * the CPUs and find the latest timestamp of lost events.
				 */
				if (lostevents) {
					continue;
				}
skip_record_checks:
				if (t < earliest_time) {
					earliest_time = t;
					min_kdbp = kdbp;
					min_cpu = cpu;
				}
			}
			if (lostevents) {
				/*
				 * If any lost events were hit in the buffers, emit an event
				 * with the latest timestamp.
				 */
				kdbg_set_timestamp_and_cpu(&lostevent, barrier_min, lostcpu);
				*tempbuf = lostevent;
				tempbuf->arg1 = 1;
				goto nextevent;
			}
			if (min_kdbp == NULL) {
				/* All buffers ran empty. */
				out_of_events = true;
			}
			if (out_of_events) {
				break;
			}

			kdsp = min_kdbp->kd_list_head;
			kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdsp);

			if (min_kdbp->latest_past_event_timestamp != 0) {
				if (kdbg_debug) {
					printf("kdebug: PAST EVENT: debugid %#8x: "
					    "time %lld from CPU %u "
					    "(barrier at time %lld)\n",
					    kdsp_actual->kds_records[rcursor].debugid,
					    t, cpu, barrier_min);
				}

				kdbg_set_timestamp_and_cpu(tempbuf, earliest_time, min_cpu);
				tempbuf->arg1 = (kd_buf_argtype)min_kdbp->latest_past_event_timestamp;
				tempbuf->arg2 = 0;
				tempbuf->arg3 = 0;
				tempbuf->arg4 = 0;
				tempbuf->debugid = TRACE_PAST_EVENTS;
				min_kdbp->latest_past_event_timestamp = 0;
				goto nextevent;
			}

			/* Copy earliest event into merged events scratch buffer. */
			*tempbuf = kdsp_actual->kds_records[kdsp_actual->kds_readlast++];
			kd_buf *earliest_event = tempbuf;
			if (kd_control_trace.kdc_flags & KDBG_MATCH_DISABLE) {
				kd_event_matcher *match = &kd_control_trace.disable_event_match;
				kd_event_matcher *mask = &kd_control_trace.disable_event_mask;
				if ((earliest_event->debugid & mask->kem_debugid) == match->kem_debugid &&
				    (earliest_event->arg1 & mask->kem_args[0]) == match->kem_args[0] &&
				    (earliest_event->arg2 & mask->kem_args[1]) == match->kem_args[1] &&
				    (earliest_event->arg3 & mask->kem_args[2]) == match->kem_args[2] &&
				    (earliest_event->arg4 & mask->kem_args[3]) == match->kem_args[3]) {
					should_disable = true;
				}
			}

			if (kd_ctrl_page->mode == KDEBUG_MODE_TRACE) {
				if (kdsp_actual->kds_readlast == kd_ctrl_page->kdebug_events_per_storage_unit) {
					release_storage_unit(kd_ctrl_page, kd_data_page, min_cpu, kdsp.raw);
				}
			}

			/*
			 * Watch for out of order timestamps (from IOPs).
			 */
			if (earliest_time < min_kdbp->kd_prev_timebase) {
				/*
				 * If we haven't already, emit a retrograde events event.
				 * Otherwise, ignore this event.
				 */
				if (traced_retrograde) {
					continue;
				}
				if (kdbg_debug) {
					printf("kdebug: RETRO EVENT: debugid %#8x: "
					    "time %lld from CPU %u "
					    "(barrier at time %lld)\n",
					    kdsp_actual->kds_records[rcursor].debugid,
					    t, cpu, barrier_min);
				}

				kdbg_set_timestamp_and_cpu(tempbuf, min_kdbp->kd_prev_timebase,
				    kdbg_get_cpu(tempbuf));
				tempbuf->arg1 = tempbuf->debugid;
				tempbuf->arg2 = (kd_buf_argtype)earliest_time;
				tempbuf->arg3 = 0;
				tempbuf->arg4 = 0;
				tempbuf->debugid = TRACE_RETROGRADE_EVENTS;
				traced_retrograde = true;
			} else {
				min_kdbp->kd_prev_timebase = earliest_time;
			}
nextevent:
			tempbuf_count--;
			tempbuf_number++;
			tempbuf++;

			if (kd_ctrl_page->mode == KDEBUG_MODE_TRACE &&
			    (RAW_file_written += sizeof(kd_buf)) >= RAW_FLUSH_SIZE) {
				break;
			}
		}

		if (tempbuf_number) {
			/*
			 * Remember the latest timestamp of events that we've merged so we
			 * don't think we've lost events later.
			 */
			uint64_t latest_time = kdbg_get_timestamp(tempbuf - 1);
			if (kd_ctrl_page->kdc_oldest_time < latest_time) {
				kd_ctrl_page->kdc_oldest_time = latest_time;
			}

			if (kd_ctrl_page->mode == KDEBUG_MODE_TRACE) {
				extern int kernel_debug_trace_write_to_file(user_addr_t *buffer,
				    size_t *number, size_t *count, size_t tempbuf_number,
				    vnode_t vp, vfs_context_t ctx, uint32_t file_version);
				error = kernel_debug_trace_write_to_file(&buffer, number,
				    &count, tempbuf_number, vp, ctx, file_version);
			} else if (kd_ctrl_page->mode == KDEBUG_MODE_TRIAGE) {
				memcpy((void*)buffer, kd_data_page->kdcopybuf,
				    tempbuf_number * sizeof(kd_buf));
				buffer += tempbuf_number * sizeof(kd_buf);
			} else {
				panic("kdebug: invalid kdebug mode %d", kd_ctrl_page->mode);
			}
			if (error) {
				*number = 0;
				error = EINVAL;
				break;
			}
			count   -= tempbuf_number;
			*number += tempbuf_number;
		}
		if (out_of_events) {
			break;
		}

		if ((tempbuf_count = count) > kd_ctrl_page->kdebug_kdcopybuf_count) {
			tempbuf_count = kd_ctrl_page->kdebug_kdcopybuf_count;
		}
	}
	if ((old_live_flags & KDBG_NOWRAP) == 0) {
		_enable_wrap(kd_ctrl_page, old_emit);
	}

	if (set_preempt) {
		thread_clear_eager_preempt(current_thread());
	}

	if (should_disable) {
		kernel_debug_disable();
	}

	return error;
}