1 /*
2  * Copyright (c) 2000-2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1992-1990 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 
57 #include <mach/exception_types.h>
58 #include <mach/i386/thread_status.h>
59 #include <mach/i386/fp_reg.h>
60 
61 #include <kern/mach_param.h>
62 #include <kern/processor.h>
63 #include <kern/thread.h>
64 #include <kern/zalloc.h>
65 #include <kern/misc_protos.h>
66 #include <kern/spl.h>
67 #include <kern/assert.h>
68 
69 #include <libkern/OSAtomic.h>
70 
71 #include <architecture/i386/pio.h>
72 #include <i386/cpuid.h>
73 #include <i386/fpu.h>
74 #include <i386/proc_reg.h>
75 #include <i386/misc_protos.h>
76 #include <i386/thread.h>
77 #include <i386/trap_internal.h>
78 
79 xstate_t        fpu_capability = UNDEFINED;     /* extended state capability */
80 xstate_t        fpu_default = UNDEFINED;        /* default extended state */
81 
82 #define ALIGNED(addr, size)      (((uintptr_t)(addr)&((size)-1))==0)
83 #define VERIFY_SAVEAREA_ALIGNED(p, a) \
84 	assertf(!(((uintptr_t)(p)) & ((a) - 1)), \
85 	    "FP save area component @ 0x%lx not 8-byte aligned", ((uintptr_t)(p)))
86 
87 /* Forward */
88 
89 extern void             fpinit(void);
90 extern void             fp_save(
91 	thread_t        thr_act);
92 extern void             fp_load(
93 	thread_t        thr_act);
94 
95 static void configure_mxcsr_capability_mask(x86_ext_thread_state_t *fps);
96 static xstate_t thread_xstate(thread_t);
97 
98 static x86_ext_thread_state_t  initial_fp_state __attribute((aligned(64)));
99 static x86_ext_thread_state_t  default_avx512_state __attribute((aligned(64)));
100 static x86_ext_thread_state_t  default_avx_state __attribute((aligned(64)));
101 static x86_ext_thread_state_t  default_fx_state __attribute((aligned(64)));
102 
103 /* Global MXCSR capability bitmask */
104 static unsigned int mxcsr_capability_mask;
105 
106 #define fninit() \
107 	__asm__ volatile("fninit")
108 
109 #define fnstcw(control) \
110 	__asm__("fnstcw %0" : "=m" (*(unsigned short *)(control)))
111 
112 #define fldcw(control) \
113 	__asm__ volatile("fldcw %0" : : "m" (*(unsigned short *) &(control)) )
114 
115 #define fnclex() \
116 	__asm__ volatile("fnclex")
117 
118 #define fnsave(state)  \
119 	__asm__ volatile("fnsave %0" : "=m" (*state))
120 
121 #define frstor(state) \
122 	__asm__ volatile("frstor %0" : : "m" (state))
123 
124 #define fwait() \
125 	__asm__("fwait");
126 
127 static inline void
128 fxrstor(struct x86_fx_thread_state *a)
129 {
130 	__asm__ __volatile__ ("fxrstor %0" ::  "m" (*a));
131 }
132 
133 static inline void
134 fxsave(struct x86_fx_thread_state *a)
135 {
136 	__asm__ __volatile__ ("fxsave %0" : "=m" (*a));
137 }
138 
139 static inline void
140 fxrstor64(struct x86_fx_thread_state *a)
141 {
142 	__asm__ __volatile__ ("fxrstor64 %0" ::  "m" (*a));
143 }
144 
145 static inline void
146 fxsave64(struct x86_fx_thread_state *a)
147 {
148 	__asm__ __volatile__ ("fxsave64 %0" : "=m" (*a));
149 }
150 
151 #define IS_VALID_XSTATE(x)      ((x) == FP || (x) == AVX || (x) == AVX512)
152 
153 SECURITY_READ_ONLY_LATE(zone_t) ifps_zone[] = {
154 	[FP]     = NULL,
155 	[AVX]    = NULL,
156 	[AVX512] = NULL
157 };
158 
159 #define AVX512_OFFSET   offsetof(struct x86_avx512_thread_state, x_Opmask)
160 
161 SECURITY_READ_ONLY_LATE(uint32_t) fp_avx512_offset = AVX512_OFFSET;
162 
163 static uint32_t
164 fp_state_size(xstate_t state)
165 {
166 	switch (state) {
167 	case FP:
168 		return sizeof(struct x86_fx_thread_state);
169 	case AVX:
170 		return sizeof(struct x86_avx_thread_state);
171 	case AVX512:
172 		switch (fp_avx512_offset) {
173 		case AVX512_OFFSET:
174 			return sizeof(struct x86_avx512_thread_state);
175 		default:
176 			panic("avx512 offset %d", fp_avx512_offset);
177 		}
178 	default:
179 		panic("bad state %d", state);
180 	}
181 }
182 
183 struct fp_avx512 {
184 	uint64_t *x_Opmask;
185 	reg256_t *x_ZMM_Hi256;
186 	reg512_t *x_Hi16_ZMM;
187 };
188 
189 static struct fp_avx512
190 fp_avx512_get(struct x86_avx512_thread_state *iavx)
191 {
192 	switch (fp_avx512_offset) {
193 	case AVX512_OFFSET:
194 		return (struct fp_avx512) {
195 			       .x_Opmask = iavx->x_Opmask,
196 			       .x_ZMM_Hi256 = iavx->x_ZMM_Hi256,
197 			       .x_Hi16_ZMM = iavx->x_Hi16_ZMM,
198 		};
199 	default:
200 		panic("bad offset");
201 	}
202 }
203 
204 static const char *const xstate_name[] = {
205 	[UNDEFINED] = "UNDEFINED",
206 	[FP] = "FP",
207 	[AVX] = "AVX",
208 	[AVX512] = "AVX512"
209 };
210 
211 #define fpu_ZMM_capable (fpu_capability == AVX512)
212 #define fpu_YMM_capable (fpu_capability == AVX || fpu_capability == AVX512)
213 
214 /*
215  * On-demand AVX512 support
216  * ------------------------
217  * On machines with AVX512 support, by default, threads are created with
218  * AVX512 masked off in XCR0 and an AVX-sized savearea is used. However, AVX512
219  * capabilities are advertised in the commpage and via sysctl. If a thread
220  * opts to use AVX512 instructions, the first will result in a #UD exception.
221  * Faulting AVX512 instructions are recognizable by their unique prefix.
222  * This exception results in the thread being promoted to use an AVX512-sized
223  * savearea and for the AVX512 bit masks being set in its XCR0. The faulting
224  * instruction is re-driven and the thread can proceed to perform AVX512
225  * operations.
226  *
227  * In addition to AVX512 instructions causing promotion, the thread_set_state()
228  * primitive with an AVX512 state flavor results in promotion.
229  *
230  * AVX512 promotion of the first thread in a task causes the default xstate
231  * of the task to be promoted so that any subsequently created or subsequently
232  * DNA-faulted thread will have AVX512 xstate and it will not need to fault-in
233  * a promoted xstate.
234  *
235  * Two savearea zones are used: the default pool of AVX-sized (832 byte) areas
236  * and a second pool of larger AVX512-sized (2688 byte) areas.
237  *
238  * Note that the initial state value is an AVX512 object, but the AVX initial
239  * value is a subset of it.
240  */
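
/*
 * Illustrative promotion path (a sketch of the flow described above, derived
 * from the handlers later in this file): a user thread executes its first
 * EVEX-prefixed instruction and takes #UD; user_trap() calls fpUDflt(rip),
 * which decodes the prefix and calls fpu_thread_promote_avx512().  That in
 * turn switches the savearea and XCR0 via fpu_savearea_promote_avx512() and
 * returns so the faulting instruction is re-driven with AVX512 enabled.
 */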
241 static uint32_t cpuid_reevaluated = 0;
242 
243 static void fpu_store_registers(void *, boolean_t);
244 static void fpu_load_registers(void *);
245 
246 static const uint32_t xstate_xmask[] = {
247 	[FP] =          FP_XMASK,
248 	[AVX] =         AVX_XMASK,
249 	[AVX512] =      AVX512_XMASK
250 };
251 
252 static inline void
253 xsave(struct x86_fx_thread_state *a, uint32_t rfbm)
254 {
255 	__asm__ __volatile__ ("xsave %0" :"=m" (*a) : "a"(rfbm), "d"(0));
256 }
257 
258 static inline void
259 xsave64(struct x86_fx_thread_state *a, uint32_t rfbm)
260 {
261 	__asm__ __volatile__ ("xsave64 %0" :"=m" (*a) : "a"(rfbm), "d"(0));
262 }
263 
264 static inline void
265 xrstor(struct x86_fx_thread_state *a, uint32_t rfbm)
266 {
267 	__asm__ __volatile__ ("xrstor %0" ::  "m" (*a), "a"(rfbm), "d"(0));
268 }
269 
270 static inline void
271 xrstor64(struct x86_fx_thread_state *a, uint32_t rfbm)
272 {
273 	__asm__ __volatile__ ("xrstor64 %0" ::  "m" (*a), "a"(rfbm), "d"(0));
274 }
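
/*
 * Note on the wrappers above: XSAVE/XRSTOR take their requested-feature
 * bitmap (RFBM) in EDX:EAX.  These helpers pass the mask in EAX and zero in
 * EDX, so only the low 32 feature bits are ever requested -- enough for the
 * x87/SSE/AVX/AVX512 components managed here.  A typical (illustrative) use:
 *
 *	xsave64(ifps, xstate_xmask[AVX]);	// save x87/SSE/YMM-high state
 *	xrstor64(ifps, xstate_xmask[AVX]);	// restore the same components
 */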
275 
276 __unused static inline void
277 vzeroupper(void)
278 {
279 	__asm__ __volatile__ ("vzeroupper" ::);
280 }
281 
282 static boolean_t fpu_thread_promote_avx512(thread_t);   /* Forward */
283 
284 
285 /*
286  * Furthermore, make compile-time asserts that no padding creeps into structures
287  * for which we're doing this.
288  */
289 #define ASSERT_PACKED(t, m1, m2, n, mt)                 \
290 extern char assert_packed_ ## t ## _ ## m1 ## _ ## m2   \
291 	[(offsetof(t,m2) - offsetof(t,m1) == (n - 1)*sizeof(mt)) ? 1 : -1]
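
/*
 * ASSERT_PACKED declares an extern array whose size is -1 -- a compile-time
 * error -- unless members m1..m2 of struct t are exactly n contiguous
 * elements of type mt, i.e. offsetof(t, m2) - offsetof(t, m1) ==
 * (n - 1) * sizeof(mt).  The bulk register copies later in this file rely on
 * that contiguity.
 */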
292 
293 ASSERT_PACKED(x86_avx_state32_t, fpu_ymmh0, fpu_ymmh7, 8, _STRUCT_XMM_REG);
294 
295 ASSERT_PACKED(x86_avx_state64_t, fpu_ymmh0, fpu_ymmh15, 16, _STRUCT_XMM_REG);
296 
297 ASSERT_PACKED(x86_avx512_state32_t, fpu_k0, fpu_k7, 8, _STRUCT_OPMASK_REG);
298 ASSERT_PACKED(x86_avx512_state32_t, fpu_ymmh0, fpu_ymmh7, 8, _STRUCT_XMM_REG);
299 ASSERT_PACKED(x86_avx512_state32_t, fpu_zmmh0, fpu_zmmh7, 8, _STRUCT_YMM_REG);
300 
301 ASSERT_PACKED(x86_avx512_state64_t, fpu_k0, fpu_k7, 8, _STRUCT_OPMASK_REG);
302 ASSERT_PACKED(x86_avx512_state64_t, fpu_ymmh0, fpu_ymmh15, 16, _STRUCT_XMM_REG);
303 ASSERT_PACKED(x86_avx512_state64_t, fpu_zmmh0, fpu_zmmh15, 16, _STRUCT_YMM_REG);
304 ASSERT_PACKED(x86_avx512_state64_t, fpu_zmm16, fpu_zmm31, 16, _STRUCT_ZMM_REG);
305 
306 #if defined(DEBUG_AVX512)
307 
308 #define DBG(x...)       kprintf("DBG: " x)
309 
310 typedef struct { uint8_t byte[8]; }  opmask_t;
311 typedef struct { uint8_t byte[16]; } xmm_t;
312 typedef struct { uint8_t byte[32]; } ymm_t;
313 typedef struct { uint8_t byte[64]; } zmm_t;
314 
315 static void
316 DBG_AVX512_STATE(struct x86_avx512_thread_state *sp)
317 {
318 	const xmm_t *xmm  = (const xmm_t *) &sp->fp.fx_XMM_reg;
319 	const xmm_t *ymmh = (const xmm_t *) &sp->x_YMM_Hi128;
320 
321 	const struct fp_avx512 p = fp_avx512_get(sp);
322 	const ymm_t *zmmh = (const ymm_t *) &p.x_ZMM_Hi256;
323 	const zmm_t *zmm  = (const zmm_t *) &p.x_Hi16_ZMM;
324 	const opmask_t *k = (const opmask_t *) &p.x_Opmask;
325 
326 	switch (fp_avx512_offset) {
327 	case AVX512_OFFSET:
328 		kprintf("x_YMM_Hi128: %lu\n",
329 		    offsetof(struct x86_avx512_thread_state, x_YMM_Hi128));
330 		kprintf("x_Opmask:    %lu\n",
331 		    offsetof(struct x86_avx512_thread_state, x_Opmask));
332 		kprintf("x_ZMM_Hi256: %lu\n",
333 		    offsetof(struct x86_avx512_thread_state, x_ZMM_Hi256));
334 		kprintf("x_Hi16_ZMM:  %lu\n",
335 		    offsetof(struct x86_avx512_thread_state, x_Hi16_ZMM));
336 		break;
337 	default:
338 		break;
339 	}
340 
341 	kprintf("XCR0:   0x%016llx\n", xgetbv(XCR0));
342 	kprintf("XINUSE: 0x%016llx\n", xgetbv(1));
343 
344 	/* Print all ZMM registers */
345 	for (int i = 0; i < 16; i++) {
346 		kprintf("zmm%d:\t0x", i);
347 		for (int j = 0; j < 16; j++) {
348 			kprintf("%02x", xmm[i].byte[j]);
349 		}
350 		for (int j = 0; j < 16; j++) {
351 			kprintf("%02x", ymmh[i].byte[j]);
352 		}
353 		for (int j = 0; j < 32; j++) {
354 			kprintf("%02x", zmmh[i].byte[j]);
355 		}
356 		kprintf("\n");
357 	}
358 	for (int i = 0; i < 16; i++) {
359 		kprintf("zmm%d:\t0x", 16 + i);
360 		for (int j = 0; j < 64; j++) {
361 			kprintf("%02x", zmm[i].byte[j]);
362 		}
363 		kprintf("\n");
364 	}
365 	for (int i = 0; i < 8; i++) {
366 		kprintf("k%d:\t0x", i);
367 		for (int j = 0; j < 8; j++) {
368 			kprintf("%02x", k[i].byte[j]);
369 		}
370 		kprintf("\n");
371 	}
372 
373 	kprintf("xstate_bv: 0x%016llx\n", sp->_xh.xstate_bv);
374 	kprintf("xcomp_bv:  0x%016llx\n", sp->_xh.xcomp_bv);
375 }
376 #else
377 #define DBG(x...)
378 static void
379 DBG_AVX512_STATE(__unused struct x86_avx512_thread_state *sp)
380 {
381 }
382 #endif /* DEBUG_AVX512 */
383 
384 #if     DEBUG
385 static inline unsigned short
386 fnstsw(void)
387 {
388 	unsigned short status;
389 	__asm__ volatile ("fnstsw %0" : "=ma" (status));
390 	return status;
391 }
392 #endif
393 
394 /*
395  * Configure the initial FPU state presented to new threads.
396  * Determine the MXCSR capability mask, which allows us to mask off any
397  * potentially unsafe "reserved" bits before restoring the FPU context.
398  * *Not* per-cpu, assumes symmetry.
399  */
400 
401 static void
402 configure_mxcsr_capability_mask(x86_ext_thread_state_t *fps)
403 {
404 	/* XSAVE requires a 64 byte aligned store */
405 	assert(ALIGNED(fps, 64));
406 	/* Clear, to prepare for the diagnostic FXSAVE */
407 	bzero(fps, sizeof(*fps));
408 
409 	fpinit();
410 	fpu_store_registers(fps, FALSE);
411 
412 	mxcsr_capability_mask = fps->fx.fx_MXCSR_MASK;
413 
414 	/* Set default mask value if necessary */
415 	if (mxcsr_capability_mask == 0) {
416 		mxcsr_capability_mask = 0xffbf;
417 	}
418 
419 	/* Clear vector register store */
420 	bzero(&fps->fx.fx_XMM_reg[0][0], sizeof(fps->fx.fx_XMM_reg));
421 	bzero(fps->avx.x_YMM_Hi128, sizeof(fps->avx.x_YMM_Hi128));
422 	if (fpu_ZMM_capable) {
423 		const struct fp_avx512 p = fp_avx512_get(&fps->avx512);
424 		bzero(p.x_ZMM_Hi256, sizeof(fps->avx512.x_ZMM_Hi256));
425 		bzero(p.x_Hi16_ZMM, sizeof(fps->avx512.x_Hi16_ZMM));
426 		bzero(p.x_Opmask, sizeof(fps->avx512.x_Opmask));
427 	}
428 
429 	fps->fx.fp_valid = TRUE;
430 	fps->fx.fp_save_layout = fpu_YMM_capable ? XSAVE32: FXSAVE32;
431 	fpu_load_registers(fps);
432 
433 	if (fpu_ZMM_capable) {
434 		xsave64((struct x86_fx_thread_state *)&default_avx512_state, xstate_xmask[AVX512]);
435 	}
436 	if (fpu_YMM_capable) {
437 		xsave64((struct x86_fx_thread_state *)&default_avx_state, xstate_xmask[AVX]);
438 	} else {
439 		fxsave64((struct x86_fx_thread_state *)&default_fx_state);
440 	}
441 
442 	/* Poison values to trap unsafe usage */
443 	fps->fx.fp_valid = 0xFFFFFFFF;
444 	fps->fx.fp_save_layout = FP_UNUSED;
445 
446 	/* Re-enable FPU/SSE DNA exceptions */
447 	set_ts();
448 }
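
/*
 * The mask captured above is applied whenever user-supplied state is accepted
 * (e.g. fpu_set_fxstate() does `state->fpu_mxcsr &= mxcsr_capability_mask`),
 * so reserved MXCSR bits can never reach FXRSTOR/XRSTOR and trigger a #GP.
 */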
449 
450 #if DEBUG || DEVELOPMENT
451 int fpsimd_fault_popc = 1;
452 #endif
453 
454 /*
455  * Look for FPU and initialize it.
456  * Called on each CPU.
457  */
458 void
459 init_fpu(void)
460 {
461 #if     DEBUG
462 	unsigned short  status;
463 	unsigned short  control;
464 #endif
465 	/*
466 	 * Check for FPU by initializing it,
467 	 * then trying to read the correct bit patterns from
468 	 * the control and status registers.
469 	 */
470 	set_cr0((get_cr0() & ~(CR0_EM | CR0_TS)) | CR0_NE);       /* allow use of FPU */
471 	fninit();
472 #if     DEBUG
473 	status = fnstsw();
474 	fnstcw(&control);
475 
476 	assert(((status & 0xff) == 0) && ((control & 0x103f) == 0x3f));
477 #endif
478 	/* Advertise SSE support */
479 	if (cpuid_features() & CPUID_FEATURE_FXSR) {
480 		set_cr4(get_cr4() | CR4_OSFXS);
481 		/* And allow SIMD exceptions if present */
482 		if (cpuid_features() & CPUID_FEATURE_SSE) {
483 			set_cr4(get_cr4() | CR4_OSXMM);
484 		}
485 	} else {
486 		panic("fpu is not FP_FXSR");
487 	}
488 
489 	fpu_capability = fpu_default = FP;
490 
491 	static boolean_t is_avx512_enabled = TRUE;
492 	if (cpu_number() == master_cpu) {
493 		if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_AVX512F) {
494 			PE_parse_boot_argn("avx512", &is_avx512_enabled, sizeof(boolean_t));
495 			kprintf("AVX512 supported %s\n",
496 			    is_avx512_enabled ? "and enabled" : "but disabled");
497 		}
498 	}
499 
500 	/* Configure the XSAVE context mechanism if the processor supports
501 	 * AVX/YMM registers
502 	 */
503 	if (cpuid_features() & CPUID_FEATURE_XSAVE) {
504 		cpuid_xsave_leaf_t *xs0p = &cpuid_info()->cpuid_xsave_leaf[0];
505 		if (is_avx512_enabled &&
506 		    (xs0p->extended_state[eax] & XFEM_ZMM_OPMASK) == XFEM_ZMM_OPMASK) {
507 			assert(xs0p->extended_state[eax] & XFEM_SSE);
508 			assert(xs0p->extended_state[eax] & XFEM_YMM);
509 			fpu_capability = AVX512;
510 			/* XSAVE container size for all features */
511 			set_cr4(get_cr4() | CR4_OSXSAVE);
512 			xsetbv(0, AVX512_XMASK);
513 			/* Re-evaluate CPUID, once, to reflect OSXSAVE */
514 			if (OSCompareAndSwap(0, 1, &cpuid_reevaluated)) {
515 				cpuid_set_info();
516 			}
517 			/* Verify that now selected state can be accommodated */
518 			assert(xs0p->extended_state[ebx] == fp_state_size(AVX512));
519 			/*
520 			 * AVX set until AVX512 is used.
521 			 * See comment above about on-demand AVX512 support.
522 			 */
523 			xsetbv(0, AVX_XMASK);
524 			fpu_default = AVX;
525 		} else if (xs0p->extended_state[eax] & XFEM_YMM) {
526 			assert(xs0p->extended_state[eax] & XFEM_SSE);
527 			fpu_capability = AVX;
528 			fpu_default = AVX;
529 			/* XSAVE container size for all features */
530 			set_cr4(get_cr4() | CR4_OSXSAVE);
531 			xsetbv(0, AVX_XMASK);
532 			/* Re-evaluate CPUID, once, to reflect OSXSAVE */
533 			if (OSCompareAndSwap(0, 1, &cpuid_reevaluated)) {
534 				cpuid_set_info();
535 			}
536 			/* Verify that now selected state can be accommodated */
537 			assert(xs0p->extended_state[ebx] == fp_state_size(AVX));
538 		}
539 	}
540 
541 	if (cpu_number() == master_cpu) {
542 		kprintf("fpu_state: %s, state_size: %d\n",
543 		    xstate_name[fpu_capability],
544 		    fp_state_size(fpu_capability));
545 	}
546 
547 	fpinit();
548 	current_cpu_datap()->cpu_xstate = fpu_default;
549 
550 	/*
551 	 * Trap wait instructions.  Turn off FPU for now.
552 	 */
553 	set_cr0(get_cr0() | CR0_TS | CR0_MP);
554 }
555 
556 /*
557  * Allocate and initialize FP state for specified xstate.
558  * Don't load state.
559  */
560 static void *
561 fp_state_alloc(xstate_t xs)
562 {
563 	assert(ifps_zone[xs] != NULL);
564 	return zalloc_flags(ifps_zone[xs], Z_WAITOK | Z_ZERO);
565 }
566 
567 static inline void
568 fp_state_free(void *ifps, xstate_t xs)
569 {
570 	assert(ifps_zone[xs] != NULL);
571 	zfree(ifps_zone[xs], ifps);
572 }
573 
574 void
575 clear_fpu(void)
576 {
577 	set_ts();
578 }
579 
580 static boolean_t
581 fpu_allzeroes(uint64_t * __attribute((aligned(8)))ptr, uint32_t size)
582 {
583 	VERIFY_SAVEAREA_ALIGNED(ptr, sizeof(uint64_t));
584 	assertf((size & (sizeof(uint64_t) - 1)) == 0, "FP save area component not a multiple of 8 bytes");
585 
586 	for (uint32_t count = 0; count < (size / sizeof(uint64_t)); count++) {
587 		if (ptr[count] != 0) {
588 			return FALSE;
589 		}
590 	}
591 	return TRUE;
592 }
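
/*
 * fpu_allzeroes() backs the xstate_bv-clearing optimization described in the
 * block comment preceding fpu_set_fxstate(): when an incoming component
 * (YMM-high, opmask, ZMM) is all zeroes, its xstate_bv bit is cleared so that
 * XRSTOR places the component in its initial configuration rather than
 * tracking it as live, avoiding false dependencies on the upper YMM/ZMM bits.
 */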
593 
594 static void
595 fpu_load_registers(void *fstate)
596 {
597 	struct x86_fx_thread_state *ifps = fstate;
598 	fp_save_layout_t layout = ifps->fp_save_layout;
599 
600 	assert(startup_phase < STARTUP_SUB_EARLY_BOOT || \
601 	    (thread_is_64bit_addr(current_thread()) ?                        \
602 	    (layout == FXSAVE64 || layout == XSAVE64) :     \
603 	    (layout == FXSAVE32 || layout == XSAVE32)));
604 	assert(ALIGNED(ifps, 64));
605 	assert(ml_get_interrupts_enabled() == FALSE);
606 
607 #if     DEBUG
608 	if (layout == XSAVE32 || layout == XSAVE64) {
609 		struct x86_avx_thread_state *iavx = fstate;
610 		unsigned i;
611 		/* Verify reserved bits in the XSAVE header*/
612 		if (iavx->_xh.xstate_bv & ~xstate_xmask[current_xstate()]) {
613 			panic("iavx->_xh.xstate_bv: 0x%llx", iavx->_xh.xstate_bv);
614 		}
615 		for (i = 0; i < sizeof(iavx->_xh.xhrsvd); i++) {
616 			if (iavx->_xh.xhrsvd[i]) {
617 				panic("Reserved bit set");
618 			}
619 		}
620 	}
621 	if (fpu_YMM_capable) {
622 		if (layout != XSAVE32 && layout != XSAVE64) {
623 			panic("Inappropriate layout: %u", layout);
624 		}
625 	}
626 #endif  /* DEBUG */
627 
628 	switch (layout) {
629 	case FXSAVE64:
630 		fxrstor64(ifps);
631 		break;
632 	case FXSAVE32:
633 		fxrstor(ifps);
634 		break;
635 	case XSAVE64:
636 		xrstor64(ifps, xstate_xmask[current_xstate()]);
637 		break;
638 	case XSAVE32:
639 		xrstor(ifps, xstate_xmask[current_xstate()]);
640 		break;
641 	default:
642 		panic("fpu_load_registers() bad layout: %d", layout);
643 	}
644 }
645 
646 static void
647 fpu_store_registers(void *fstate, boolean_t is64)
648 {
649 	struct x86_fx_thread_state *ifps = fstate;
650 	assert(ALIGNED(ifps, 64));
651 	xstate_t xs = current_xstate();
652 	switch (xs) {
653 	case FP:
654 		if (is64) {
655 			fxsave64(fstate);
656 			ifps->fp_save_layout = FXSAVE64;
657 		} else {
658 			fxsave(fstate);
659 			ifps->fp_save_layout = FXSAVE32;
660 		}
661 		break;
662 	case AVX:
663 	case AVX512:
664 		if (is64) {
665 			xsave64(ifps, xstate_xmask[xs]);
666 			ifps->fp_save_layout = XSAVE64;
667 		} else {
668 			xsave(ifps, xstate_xmask[xs]);
669 			ifps->fp_save_layout = XSAVE32;
670 		}
671 		break;
672 	default:
673 		panic("fpu_store_registers() bad xstate: %d", xs);
674 	}
675 }
676 
677 /*
678  * Initialize FP handling.
679  */
680 
681 void
682 fpu_module_init(void)
683 {
684 	if (!IS_VALID_XSTATE(fpu_default)) {
685 		panic("fpu_module_init: invalid extended state %u",
686 		    fpu_default);
687 	}
688 
689 	/* To maintain the required alignment, disable
690 	 * zone debugging for this zone as that appends
691 	 * 16 bytes to each element.
692 	 */
693 	ifps_zone[fpu_default] = zone_create("x86 fpsave state",
694 	    fp_state_size(fpu_default), ZC_ALIGNMENT_REQUIRED | ZC_ZFREE_CLEARMEM);
695 
696 	/*
697 	 * If AVX512 is supported, create a separate savearea zone.
698 	 */
699 	if (fpu_ZMM_capable) {
700 		ifps_zone[AVX512] = zone_create("x86 avx512 save state",
701 		    fp_state_size(AVX512), ZC_ALIGNMENT_REQUIRED | ZC_ZFREE_CLEARMEM);
702 	}
703 
704 	/* Determine MXCSR reserved bits and configure initial FPU state*/
705 	configure_mxcsr_capability_mask(&initial_fp_state);
706 
707 #if DEBUG || DEVELOPMENT
708 	if (kern_feature_override(KF_DISABLE_FP_POPC_ON_PGFLT)) {
709 		fpsimd_fault_popc = 0;
710 	}
711 
712 	/* Allow the explicit boot-arg to override the validation disables */
713 	PE_parse_boot_argn("fpsimd_fault_popc", &fpsimd_fault_popc, sizeof(fpsimd_fault_popc));
714 #endif
715 }
716 
717 /*
718  * Context switch fpu state.
719  * Always save old thread`s FPU context but don't load new .. allow that to fault-in.
720  * Switch to the new task's xstate.
721  */
722 
723 void
724 fpu_switch_context(thread_t old, thread_t new)
725 {
726 	struct x86_fx_thread_state      *ifps;
727 	cpu_data_t *cdp = current_cpu_datap();
728 	xstate_t new_xstate = new ? thread_xstate(new) : fpu_default;
729 
730 	assert(ml_get_interrupts_enabled() == FALSE);
731 	ifps = (old)->machine.ifps;
732 #if     DEBUG
733 	if (ifps && ((ifps->fp_valid != FALSE) && (ifps->fp_valid != TRUE))) {
734 		panic("ifps->fp_valid: %u", ifps->fp_valid);
735 	}
736 #endif
737 	if (ifps != 0 && (ifps->fp_valid == FALSE)) {
738 		/* Clear CR0.TS in preparation for the FP context save. In
739 		 * theory, this shouldn't be necessary since a live FPU should
740 		 * indicate that TS is clear. However, various routines
741 		 * (such as sendsig & sigreturn) manipulate TS directly.
742 		 */
743 		clear_ts();
744 		/* registers are in FPU - save to memory */
745 		boolean_t is64 = (thread_is_64bit_addr(old) &&
746 		    is_saved_state64(old->machine.iss));
747 
748 		fpu_store_registers(ifps, is64);
749 		ifps->fp_valid = TRUE;
750 
751 		if (fpu_ZMM_capable && (cdp->cpu_xstate == AVX512)) {
752 			xrstor64((struct x86_fx_thread_state *)&default_avx512_state, xstate_xmask[AVX512]);
753 		} else if (fpu_YMM_capable) {
754 			xrstor64((struct x86_fx_thread_state *) &default_avx_state, xstate_xmask[AVX]);
755 		} else {
756 			fxrstor64((struct x86_fx_thread_state *)&default_fx_state);
757 		}
758 	}
759 
760 	assertf(fpu_YMM_capable ? (xgetbv(XCR0) == xstate_xmask[cdp->cpu_xstate]) : TRUE, "XCR0 mismatch: 0x%llx 0x%x 0x%x", xgetbv(XCR0), cdp->cpu_xstate, xstate_xmask[cdp->cpu_xstate]);
761 	if (new_xstate != (xstate_t) cdp->cpu_xstate) {
762 		DBG("fpu_switch_context(%p,%p) new xstate: %s\n",
763 		    old, new, xstate_name[new_xstate]);
764 		xsetbv(0, xstate_xmask[new_xstate]);
765 		cdp->cpu_xstate = new_xstate;
766 	}
767 	set_ts();
768 }
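
/*
 * Note that restore is deliberately lazy: the set_ts() above arranges for the
 * incoming thread's first FP/SIMD instruction to take a DNA (#NM) fault,
 * which lands in fpnoextflt() and loads -- or allocates -- that thread's
 * save area on demand.
 */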
769 
770 
771 /*
772  * Free a FPU save area.
773  * Called only when thread terminating - no locking necessary.
774  */
775 void
776 fpu_free(thread_t thread, void *fps)
777 {
778 	pcb_t   pcb = THREAD_TO_PCB(thread);
779 
780 	fp_state_free(fps, pcb->xstate);
781 	pcb->xstate = UNDEFINED;
782 }
783 
784 /*
785  * Set the floating-point state for a thread based on the FXSave formatted data.
786  * This is basically the same as fpu_set_state except it uses the expanded data
787  * structure.
788  * If the thread is not the current thread, it is not running (held).  Locking
789  * needed against concurrent fpu_set_state or fpu_get_state.
790  *
791  * While translating between XNU FP state structures and the CPU-native XSAVE area,
792  * if we detect state components that are all zeroes, we clear the corresponding
793  * xstate_bv bit in the XSAVE area, because that allows the corresponding state to
794  * be initialized to a "clean" state.  That's most important when clearing the YMM
795  * bit, since an initialized "upper clean" state results in a massive performance
796  * improvement due to elimination of false dependencies between the XMMs and the
797  * upper bits of the YMMs.
798  */
799 kern_return_t
800 fpu_set_fxstate(
801 	thread_t        thr_act,
802 	thread_state_t  tstate,
803 	thread_flavor_t f)
804 {
805 	struct x86_fx_thread_state      *ifps;
806 	struct x86_fx_thread_state      *new_ifps;
807 	x86_float_state64_t             *state;
808 	pcb_t                           pcb;
809 	boolean_t                       old_valid, fresh_state = FALSE;
810 	xstate_t                        thr_xstate;
811 
812 	if (fpu_capability == UNDEFINED) {
813 		return KERN_FAILURE;
814 	}
815 
816 	if ((f == x86_AVX_STATE32 || f == x86_AVX_STATE64) &&
817 	    fpu_capability < AVX) {
818 		return KERN_FAILURE;
819 	}
820 
821 	assert(thr_act != THREAD_NULL);
822 
823 	thr_xstate = thread_xstate(thr_act);
824 
825 	if ((f == x86_AVX512_STATE32 || f == x86_AVX512_STATE64) &&
826 	    thr_xstate == AVX) {
827 		if (!fpu_thread_promote_avx512(thr_act)) {
828 			return KERN_FAILURE;
829 		} else {
830 			/* Reload thr_xstate after successful promotion */
831 			thr_xstate = thread_xstate(thr_act);
832 		}
833 	}
834 
835 	state = (x86_float_state64_t *)tstate;
836 
837 	pcb = THREAD_TO_PCB(thr_act);
838 
839 	if (state == NULL) {
840 		/*
841 		 * new FPU state is 'invalid'.
842 		 * Deallocate the fp state if it exists.
843 		 */
844 		simple_lock(&pcb->lock, LCK_GRP_NULL);
845 
846 		ifps = pcb->ifps;
847 		pcb->ifps = 0;
848 
849 		simple_unlock(&pcb->lock);
850 
851 		if (ifps != 0) {
852 			fp_state_free(ifps, thr_xstate);
853 		}
854 	} else {
855 		/*
856 		 * Valid incoming state. Allocate the fp state if there is none.
857 		 */
858 		new_ifps = 0;
859 Retry:
860 		simple_lock(&pcb->lock, LCK_GRP_NULL);
861 
862 		ifps = pcb->ifps;
863 		if (ifps == 0) {
864 			if (new_ifps == 0) {
865 				simple_unlock(&pcb->lock);
866 				new_ifps = fp_state_alloc(thr_xstate);
867 				goto Retry;
868 			}
869 			ifps = new_ifps;
870 			new_ifps = 0;
871 			pcb->ifps = ifps;
872 			pcb->xstate = thr_xstate;
873 			fresh_state = TRUE;
874 		}
875 
876 		/*
877 		 * now copy over the new data.
878 		 */
879 
880 		old_valid = ifps->fp_valid;
881 
882 #if     DEBUG || DEVELOPMENT
883 		if ((fresh_state == FALSE) && (old_valid == FALSE) && (thr_act != current_thread())) {
884 			panic("fpu_set_fxstate inconsistency, thread: %p not stopped", thr_act);
885 		}
886 #endif
887 		/*
888 		 * Clear any reserved bits in the MXCSR to prevent a GPF
889 		 * when issuing an FXRSTOR.
890 		 */
891 
892 		state->fpu_mxcsr &= mxcsr_capability_mask;
893 
894 		__nochk_bcopy((char *)&state->fpu_fcw, (char *)ifps, fp_state_size(FP));
895 
896 		switch (thr_xstate) {
897 		case UNDEFINED_FULL:
898 		case FP_FULL:
899 		case AVX_FULL:
900 		case AVX512_FULL:
901 			panic("fpu_set_fxstate() INVALID xstate: 0x%x", thr_xstate);
902 			break;
903 
904 		case UNDEFINED:
905 			panic("fpu_set_fxstate() UNDEFINED xstate");
906 			break;
907 		case FP:
908 			ifps->fp_save_layout = thread_is_64bit_addr(thr_act) ? FXSAVE64 : FXSAVE32;
909 			break;
910 		case AVX: {
911 			struct x86_avx_thread_state *iavx = (void *) ifps;
912 			x86_avx_state64_t *xs = (x86_avx_state64_t *) state;
913 
914 			iavx->fp.fp_save_layout = thread_is_64bit_addr(thr_act) ? XSAVE64 : XSAVE32;
915 
916 			/* Sanitize XSAVE header */
917 			bzero(&iavx->_xh.xhrsvd[0], sizeof(iavx->_xh.xhrsvd));
918 			iavx->_xh.xstate_bv = AVX_XMASK;
919 			iavx->_xh.xcomp_bv  = 0;
920 
921 			/*
922 			 * See the block comment at the top of the function for a description of why we're clearing
923 			 * xstate_bv bits.
924 			 */
925 			if (f == x86_AVX_STATE32) {
926 				__nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
927 				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
928 					iavx->_xh.xstate_bv &= ~XFEM_YMM;
929 				}
930 			} else if (f == x86_AVX_STATE64) {
931 				__nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
932 				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
933 					iavx->_xh.xstate_bv &= ~XFEM_YMM;
934 				}
935 			} else {
936 				iavx->_xh.xstate_bv = (XFEM_SSE | XFEM_X87);
937 			}
938 			break;
939 		}
940 		case AVX512: {
941 			struct x86_avx512_thread_state *iavx = (void *) ifps;
942 			union {
943 				thread_state_t       ts;
944 				x86_avx512_state32_t *s32;
945 				x86_avx512_state64_t *s64;
946 			} xs = { .ts = tstate };
947 
948 			iavx->fp.fp_save_layout = thread_is_64bit_addr(thr_act) ? XSAVE64 : XSAVE32;
949 
950 			/* Sanitize XSAVE header */
951 			bzero(&iavx->_xh.xhrsvd[0], sizeof(iavx->_xh.xhrsvd));
952 			iavx->_xh.xstate_bv = AVX512_XMASK;
953 			iavx->_xh.xcomp_bv  = 0;
954 
955 			const struct fp_avx512 p = fp_avx512_get(iavx);
956 
957 			/*
958 			 * See the block comment at the top of the function for a description of why we're clearing
959 			 * xstate_bv bits.
960 			 */
961 			switch (f) {
962 			case x86_AVX512_STATE32:
963 				__nochk_bcopy(&xs.s32->fpu_k0, p.x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
964 				__nochk_bcopy(&xs.s32->fpu_zmmh0, p.x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG));
965 
966 				if (fpu_allzeroes((uint64_t *)(void *)p.x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG)) == TRUE) {
967 					iavx->_xh.xstate_bv &= ~XFEM_OPMASK;
968 				}
969 
970 				if (fpu_allzeroes((uint64_t *)(void *)p.x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG)) == TRUE) {
971 					iavx->_xh.xstate_bv &= ~(XFEM_ZMM_HI256 | XFEM_HI16_ZMM);
972 				}
973 				__nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
974 				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
975 					iavx->_xh.xstate_bv &= ~XFEM_YMM;
976 				}
977 
978 				DBG_AVX512_STATE(iavx);
979 				break;
980 			case x86_AVX_STATE32:
981 				__nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
982 				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
983 					iavx->_xh.xstate_bv &= ~XFEM_YMM;
984 				}
985 				break;
986 			case x86_AVX512_STATE64:
987 				__nochk_bcopy(&xs.s64->fpu_k0, p.x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
988 				__nochk_bcopy(&xs.s64->fpu_zmm16, p.x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG));
989 				__nochk_bcopy(&xs.s64->fpu_zmmh0, p.x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG));
990 				/*
991 				 * Note that it is valid to have XFEM_ZMM_OPMASK set but XFEM_YMM cleared.  In that case,
992 				 * the upper bits of the YMMs would be cleared and would result in a clean-upper
993 				 * state, allowing SSE instruction to avoid false dependencies.
994 				 */
995 				if (fpu_allzeroes((uint64_t *)(void *)p.x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG)) == TRUE) {
996 					iavx->_xh.xstate_bv &= ~XFEM_OPMASK;
997 				}
998 
999 				if (fpu_allzeroes((uint64_t *)(void *)p.x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG)) == TRUE &&
1000 				    fpu_allzeroes((uint64_t *)(void *)p.x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG)) == TRUE) {
1001 					iavx->_xh.xstate_bv &= ~(XFEM_ZMM_HI256 | XFEM_HI16_ZMM);
1002 				}
1003 
1004 				__nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
1005 				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
1006 					iavx->_xh.xstate_bv &= ~XFEM_YMM;
1007 				}
1008 				DBG_AVX512_STATE(iavx);
1009 				break;
1010 			case x86_AVX_STATE64:
1011 				__nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
1012 				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
1013 					iavx->_xh.xstate_bv &= ~XFEM_YMM;
1014 				}
1015 				break;
1016 			}
1017 			break;
1018 		}
1019 		}
1020 
1021 		ifps->fp_valid = old_valid;
1022 
1023 		if (old_valid == FALSE) {
1024 			boolean_t istate = ml_set_interrupts_enabled(FALSE);
1025 			ifps->fp_valid = TRUE;
1026 			/* If altering the current thread's state, disable FPU */
1027 			if (thr_act == current_thread()) {
1028 				set_ts();
1029 			}
1030 
1031 			ml_set_interrupts_enabled(istate);
1032 		}
1033 
1034 		simple_unlock(&pcb->lock);
1035 
1036 		if (new_ifps != 0) {
1037 			fp_state_free(new_ifps, thr_xstate);
1038 		}
1039 	}
1040 	return KERN_SUCCESS;
1041 }
1042 
1043 /*
1044  * Get the floating-point state for a thread.
1045  * If the thread is not the current thread, it is
1046  * not running (held).  Locking needed against
1047  * concurrent fpu_set_state or fpu_get_state.
1048  */
1049 kern_return_t
1050 fpu_get_fxstate(
1051 	thread_t        thr_act,
1052 	thread_state_t  tstate,
1053 	thread_flavor_t f)
1054 {
1055 	struct x86_fx_thread_state      *ifps;
1056 	x86_float_state64_t             *state;
1057 	kern_return_t                   ret = KERN_FAILURE;
1058 	pcb_t                           pcb;
1059 	xstate_t                        thr_xstate = thread_xstate(thr_act);
1060 
1061 	if (fpu_capability == UNDEFINED) {
1062 		return KERN_FAILURE;
1063 	}
1064 
1065 	if ((f == x86_AVX_STATE32 || f == x86_AVX_STATE64) &&
1066 	    fpu_capability < AVX) {
1067 		return KERN_FAILURE;
1068 	}
1069 
1070 	if ((f == x86_AVX512_STATE32 || f == x86_AVX512_STATE64) &&
1071 	    thr_xstate != AVX512) {
1072 		return KERN_FAILURE;
1073 	}
1074 
1075 	state = (x86_float_state64_t *)tstate;
1076 
1077 	assert(thr_act != THREAD_NULL);
1078 	pcb = THREAD_TO_PCB(thr_act);
1079 
1080 	simple_lock(&pcb->lock, LCK_GRP_NULL);
1081 
1082 	ifps = pcb->ifps;
1083 	if (ifps == 0) {
1084 		/*
1085 		 * No valid floating-point state.
1086 		 */
1087 
1088 		__nochk_bcopy((char *)&initial_fp_state, (char *)&state->fpu_fcw,
1089 		    fp_state_size(FP));
1090 
1091 		simple_unlock(&pcb->lock);
1092 
1093 		return KERN_SUCCESS;
1094 	}
1095 	/*
1096 	 * Make sure we`ve got the latest fp state info
1097 	 * If the live fpu state belongs to our target
1098 	 */
1099 	if (thr_act == current_thread()) {
1100 		boolean_t       intr;
1101 
1102 		intr = ml_set_interrupts_enabled(FALSE);
1103 
1104 		clear_ts();
1105 		fp_save(thr_act);
1106 		clear_fpu();
1107 
1108 		(void)ml_set_interrupts_enabled(intr);
1109 	}
1110 	if (ifps->fp_valid) {
1111 		__nochk_bcopy((char *)ifps, (char *)&state->fpu_fcw, fp_state_size(FP));
1112 		switch (thr_xstate) {
1113 		case UNDEFINED_FULL:
1114 		case FP_FULL:
1115 		case AVX_FULL:
1116 		case AVX512_FULL:
1117 			panic("fpu_get_fxstate() INVALID xstate: 0x%x", thr_xstate);
1118 			break;
1119 
1120 		case UNDEFINED:
1121 			panic("fpu_get_fxstate() UNDEFINED xstate");
1122 			break;
1123 		case FP:
1124 			break;                  /* already done */
1125 		case AVX: {
1126 			struct x86_avx_thread_state *iavx = (void *) ifps;
1127 			x86_avx_state64_t *xs = (x86_avx_state64_t *) state;
1128 			if (f == x86_AVX_STATE32) {
1129 				__nochk_bcopy(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
1130 			} else if (f == x86_AVX_STATE64) {
1131 				__nochk_bcopy(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
1132 			}
1133 			break;
1134 		}
1135 		case AVX512: {
1136 			struct x86_avx512_thread_state *iavx = (void *) ifps;
1137 			union {
1138 				thread_state_t       ts;
1139 				x86_avx512_state32_t *s32;
1140 				x86_avx512_state64_t *s64;
1141 			} xs = { .ts = tstate };
1142 
1143 			const struct fp_avx512 p = fp_avx512_get(iavx);
1144 
1145 			switch (f) {
1146 			case x86_AVX512_STATE32:
1147 				__nochk_bcopy(p.x_Opmask, &xs.s32->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG));
1148 				__nochk_bcopy(p.x_ZMM_Hi256, &xs.s32->fpu_zmmh0, 8 * sizeof(_STRUCT_YMM_REG));
1149 				__nochk_bcopy(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
1150 				DBG_AVX512_STATE(iavx);
1151 				break;
1152 			case x86_AVX_STATE32:
1153 				__nochk_bcopy(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
1154 				break;
1155 			case x86_AVX512_STATE64:
1156 				__nochk_bcopy(p.x_Opmask, &xs.s64->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG));
1157 				__nochk_bcopy(p.x_Hi16_ZMM, &xs.s64->fpu_zmm16, 16 * sizeof(_STRUCT_ZMM_REG));
1158 				__nochk_bcopy(p.x_ZMM_Hi256, &xs.s64->fpu_zmmh0, 16 * sizeof(_STRUCT_YMM_REG));
1159 				__nochk_bcopy(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
1160 				DBG_AVX512_STATE(iavx);
1161 				break;
1162 			case x86_AVX_STATE64:
1163 				__nochk_bcopy(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
1164 				break;
1165 			}
1166 			break;
1167 		}
1168 		}
1169 
1170 		ret = KERN_SUCCESS;
1171 	}
1172 	simple_unlock(&pcb->lock);
1173 
1174 	return ret;
1175 }
1176 
1177 
1178 
1179 /*
1180  * the child thread is 'stopped' with the thread
1181  * mutex held and is currently not known by anyone
1182  * so no way for fpu state to get manipulated by an
1183  * outside agency -> no need for pcb lock
1184  */
1185 
1186 void
1187 fpu_dup_fxstate(
1188 	thread_t        parent,
1189 	thread_t        child)
1190 {
1191 	struct x86_fx_thread_state *new_ifps = NULL;
1192 	boolean_t       intr;
1193 	pcb_t           ppcb;
1194 	xstate_t        xstate = thread_xstate(parent);
1195 
1196 	ppcb = THREAD_TO_PCB(parent);
1197 
1198 	if (ppcb->ifps == NULL) {
1199 		return;
1200 	}
1201 
1202 	if (child->machine.ifps) {
1203 		panic("fpu_dup_fxstate: child's ifps non-null");
1204 	}
1205 
1206 	new_ifps = fp_state_alloc(xstate);
1207 
1208 	simple_lock(&ppcb->lock, LCK_GRP_NULL);
1209 
1210 	if (ppcb->ifps != NULL) {
1211 		struct x86_fx_thread_state *ifps = ppcb->ifps;
1212 		/*
1213 		 * Make sure we`ve got the latest fp state info
1214 		 */
1215 		if (current_thread() == parent) {
1216 			intr = ml_set_interrupts_enabled(FALSE);
1217 			assert(current_thread() == parent);
1218 			clear_ts();
1219 			fp_save(parent);
1220 			clear_fpu();
1221 
1222 			(void)ml_set_interrupts_enabled(intr);
1223 		}
1224 
1225 		if (ifps->fp_valid) {
1226 			child->machine.ifps = new_ifps;
1227 			child->machine.xstate = xstate;
1228 			__nochk_bcopy((char *)(ppcb->ifps),
1229 			    (char *)(child->machine.ifps),
1230 			    fp_state_size(xstate));
1231 
1232 			/* Mark the new fp saved state as non-live. */
1233 			/* Temporarily disabled: radar 4647827
1234 			 * new_ifps->fp_valid = TRUE;
1235 			 */
1236 
1237 			/*
1238 			 * Clear any reserved bits in the MXCSR to prevent a GPF
1239 			 * when issuing an FXRSTOR.
1240 			 */
1241 			new_ifps->fx_MXCSR &= mxcsr_capability_mask;
1242 			new_ifps = NULL;
1243 		}
1244 	}
1245 	simple_unlock(&ppcb->lock);
1246 
1247 	if (new_ifps != NULL) {
1248 		fp_state_free(new_ifps, xstate);
1249 	}
1250 }
1251 
1252 /*
1253  * Initialize FPU.
1254  * FNINIT programs the x87 control word to 0x37f, which matches
1255  * the desired default for macOS.
1256  */
1257 
1258 void
1259 fpinit(void)
1260 {
1261 	boolean_t istate = ml_set_interrupts_enabled(FALSE);
1262 	clear_ts();
1263 	fninit();
1264 #if DEBUG
1265 	/* We skip this power-on-default verification sequence on
1266 	 * non-DEBUG, as dirtying the x87 control word may slow down
1267 	 * xsave/xrstor and affect energy use.
1268 	 */
1269 	unsigned short  control, control2;
1270 	fnstcw(&control);
1271 	control2 = control;
1272 	control &= ~(FPC_PC | FPC_RC); /* Clear precision & rounding control */
1273 	control |= (FPC_PC_64 |         /* Set precision */
1274 	    FPC_RC_RN |                 /* round-to-nearest */
1275 	    FPC_ZE |                    /* Suppress zero-divide */
1276 	    FPC_OE |                    /*  and overflow */
1277 	    FPC_UE |                    /*  underflow */
1278 	    FPC_IE |                    /* Allow NaNQs and +-INF */
1279 	    FPC_DE |                    /* Allow denorms as operands  */
1280 	    FPC_PE);                    /* No trap for precision loss */
1281 	assert(control == control2);
1282 	fldcw(control);
1283 #endif
1284 	/* Initialize SSE/SSE2 */
1285 	__builtin_ia32_ldmxcsr(0x1f80);
1286 	if (fpu_YMM_capable) {
1287 		vzeroall();
1288 	} else {
1289 		xmmzeroall();
1290 	}
1291 	ml_set_interrupts_enabled(istate);
1292 }
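
/*
 * For reference: 0x37f is the FNINIT power-on control word (all x87
 * exceptions masked, 64-bit precision, round-to-nearest), and 0x1f80 is the
 * corresponding MXCSR default (all SIMD exceptions masked, round-to-nearest,
 * FZ/DAZ clear).
 */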
1293 
1294 /*
1295  * Coprocessor not present.
1296  */
1297 
1298 uint64_t x86_isr_fp_simd_use;
1299 
1300 void
1301 fpnoextflt(void)
1302 {
1303 	boolean_t       intr;
1304 	thread_t        thr_act;
1305 	pcb_t           pcb;
1306 	struct x86_fx_thread_state *ifps = 0;
1307 	xstate_t        xstate = current_xstate();
1308 
1309 	thr_act = current_thread();
1310 	pcb = THREAD_TO_PCB(thr_act);
1311 
1312 	if (pcb->ifps == 0 && !get_interrupt_level()) {
1313 		ifps = fp_state_alloc(xstate);
1314 		__nochk_bcopy((char *)&initial_fp_state, (char *)ifps,
1315 		    fp_state_size(xstate));
1316 		if (!thread_is_64bit_addr(thr_act)) {
1317 			ifps->fp_save_layout = fpu_YMM_capable ? XSAVE32 : FXSAVE32;
1318 		} else {
1319 			ifps->fp_save_layout = fpu_YMM_capable ? XSAVE64 : FXSAVE64;
1320 		}
1321 		ifps->fp_valid = TRUE;
1322 	}
1323 	intr = ml_set_interrupts_enabled(FALSE);
1324 
1325 	clear_ts();                     /*  Enable FPU use */
1326 
1327 	if (__improbable(get_interrupt_level())) {
1328 		/* Track number of #DNA traps at interrupt context,
1329 		 * which is likely suboptimal. Racy, but good enough.
1330 		 */
1331 		x86_isr_fp_simd_use++;
1332 		/*
1333 		 * Save current FP/SIMD context if valid
1334 		 * Initialize live FP/SIMD registers
1335 		 */
1336 		if (pcb->ifps) {
1337 			fp_save(thr_act);
1338 		}
1339 		fpinit();
1340 	} else {
1341 		if (pcb->ifps == 0) {
1342 			pcb->ifps = ifps;
1343 			pcb->xstate = xstate;
1344 			ifps = 0;
1345 		}
1346 		/*
1347 		 * Load this thread`s state into coprocessor live context.
1348 		 */
1349 		fp_load(thr_act);
1350 	}
1351 	(void)ml_set_interrupts_enabled(intr);
1352 
1353 	if (ifps) {
1354 		fp_state_free(ifps, xstate);
1355 	}
1356 }
1357 
1358 /*
1359  * FPU overran end of segment.
1360  * Re-initialize FPU.  Floating point state is not valid.
1361  */
1362 
1363 void
1364 fpextovrflt(void)
1365 {
1366 	thread_t        thr_act = current_thread();
1367 	pcb_t           pcb;
1368 	struct x86_fx_thread_state *ifps;
1369 	boolean_t       intr;
1370 	xstate_t        xstate = current_xstate();
1371 
1372 	intr = ml_set_interrupts_enabled(FALSE);
1373 
1374 	if (get_interrupt_level()) {
1375 		panic("FPU segment overrun exception at interrupt context");
1376 	}
1377 	if (current_task() == kernel_task) {
1378 		panic("FPU segment overrun exception in kernel thread context");
1379 	}
1380 
1381 	/*
1382 	 * This is a non-recoverable error.
1383 	 * Invalidate the thread`s FPU state.
1384 	 */
1385 	pcb = THREAD_TO_PCB(thr_act);
1386 	simple_lock(&pcb->lock, LCK_GRP_NULL);
1387 	ifps = pcb->ifps;
1388 	pcb->ifps = 0;
1389 	simple_unlock(&pcb->lock);
1390 
1391 	/*
1392 	 * Re-initialize the FPU.
1393 	 */
1394 	clear_ts();
1395 	fninit();
1396 
1397 	/*
1398 	 * And disable access.
1399 	 */
1400 	clear_fpu();
1401 
1402 	(void)ml_set_interrupts_enabled(intr);
1403 
1404 	if (ifps) {
1405 		fp_state_free(ifps, xstate);
1406 	}
1407 }
1408 
1409 /*
1410  * FPU error. Called by AST.
1411  */
1412 
1413 void
1414 fpexterrflt(void)
1415 {
1416 	thread_t        thr_act = current_thread();
1417 	boolean_t       intr;
1418 
1419 	intr = ml_set_interrupts_enabled(FALSE);
1420 
1421 	if (get_interrupt_level()) {
1422 		panic("FPU error exception at interrupt context");
1423 	}
1424 	if (current_task() == kernel_task) {
1425 		panic("FPU error exception in kernel thread context");
1426 	}
1427 
1428 	/*
1429 	 * Save the FPU state and turn off the FPU.
1430 	 */
1431 	fp_save(thr_act);
1432 	/* Set TS to ensure we catch attempts to use the FPU before returning from trap handling */
1433 	set_ts();
1434 
1435 	(void)ml_set_interrupts_enabled(intr);
1436 }
1437 
1438 /*
1439  * Save FPU state.
1440  *
1441  * Locking not needed:
1442  * .	if called from fpu_get_state, pcb already locked.
1443  * .	if called from fpnoextflt or fp_intr, we are single-cpu
1444  * .	otherwise, thread is running.
1445  * N.B.: Must be called with interrupts disabled
1446  */
1447 
1448 void
1449 fp_save(
1450 	thread_t        thr_act)
1451 {
1452 	pcb_t pcb = THREAD_TO_PCB(thr_act);
1453 	struct x86_fx_thread_state *ifps = pcb->ifps;
1454 
1455 	assert(ifps != 0);
1456 	if (ifps != 0 && !ifps->fp_valid) {
1457 		assert((get_cr0() & CR0_TS) == 0);
1458 		/* registers are in FPU */
1459 		ifps->fp_valid = TRUE;
1460 		fpu_store_registers(ifps, thread_is_64bit_addr(thr_act));
1461 	}
1462 }
1463 
1464 /*
1465  * Restore FPU state from PCB.
1466  *
1467  * Locking not needed; always called on the current thread.
1468  */
1469 
1470 void
1471 fp_load(
1472 	thread_t        thr_act)
1473 {
1474 	pcb_t pcb = THREAD_TO_PCB(thr_act);
1475 	struct x86_fx_thread_state *ifps = pcb->ifps;
1476 
1477 	assert(ifps);
1478 #if     DEBUG
1479 	if (ifps->fp_valid != FALSE && ifps->fp_valid != TRUE) {
1480 		panic("fp_load() invalid fp_valid: %u, fp_save_layout: %u",
1481 		    ifps->fp_valid, ifps->fp_save_layout);
1482 	}
1483 #endif
1484 
1485 	if (ifps->fp_valid == FALSE) {
1486 		fpinit();
1487 	} else {
1488 		fpu_load_registers(ifps);
1489 	}
1490 	ifps->fp_valid = FALSE;         /* in FPU */
1491 }
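
/*
 * Taken together, fp_save()/fp_load() treat fp_valid as an ownership flag for
 * the save area: FALSE means the thread's state is live in the FPU registers
 * (the save area is stale), TRUE means the save area holds the authoritative
 * copy and may safely be copied or modified.
 */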
1492 
1493 /*
1494  * SSE arithmetic exception handling code.
1495  * Basically the same as the x87 exception handler with a different subtype
1496  */
1497 
1498 void
1499 fpSSEexterrflt(void)
1500 {
1501 	thread_t        thr_act = current_thread();
1502 	boolean_t       intr;
1503 
1504 	intr = ml_set_interrupts_enabled(FALSE);
1505 
1506 	if (get_interrupt_level()) {
1507 		panic("SSE exception at interrupt context");
1508 	}
1509 	if (current_task() == kernel_task) {
1510 		panic("SSE exception in kernel thread context");
1511 	}
1512 
1513 	/*
1514 	 * Save the FPU state and turn off the FPU.
1515 	 */
1516 	fp_save(thr_act);
1517 	/* Set TS to ensure we catch attempts to use the FPU before returning from trap handling */
1518 	set_ts();
1519 
1520 	(void)ml_set_interrupts_enabled(intr);
1521 }
1522 
1523 
1524 /*
1525  * If a thread is using an AVX-sized savearea:
1526  * - allocate a new AVX512-sized  area,
1527  * - copy the 256-bit state into the 512-bit area,
1528  * - deallocate the smaller area
1529  * ASSUMES: thread is the current thread.
1530  */
1531 static void
1532 fpu_savearea_promote_avx512(thread_t thread)
1533 {
1534 	struct x86_avx_thread_state     *ifps = NULL;
1535 	struct x86_avx512_thread_state  *ifps512 = NULL;
1536 	pcb_t                           pcb = THREAD_TO_PCB(thread);
1537 	boolean_t                       do_avx512_alloc = FALSE;
1538 	boolean_t                       intr;
1539 
1540 	assert(thread == current_thread());
1541 
1542 	DBG("fpu_savearea_promote_avx512(%p)\n", thread);
1543 
1544 	simple_lock(&pcb->lock, LCK_GRP_NULL);
1545 
1546 	ifps = pcb->ifps;
1547 	if (ifps == NULL) {
1548 		pcb->xstate = AVX512;
1549 		simple_unlock(&pcb->lock);
1550 		/*
1551 		 * Now that the PCB xstate has been promoted, set XCR0 so
1552 		 * that we don't re-trip #UD on the next AVX-512 instruction.
1553 		 *
1554 		 * Since this branch is taken when the first FP instruction
1555 		 * attempted by this thread is an AVX-512 instruction, we
1556 		 * call fpnoextflt() to allocate an appropriately-sized
1557 		 * AVX-512 save-area, thereby avoiding the overhead of another
1558 		 * fault that would be triggered immediately on return.
1559 		 */
1560 		intr = ml_set_interrupts_enabled(FALSE);
1561 		xsetbv(0, AVX512_XMASK);
1562 		current_cpu_datap()->cpu_xstate = AVX512;
1563 		(void)ml_set_interrupts_enabled(intr);
1564 
1565 		fpnoextflt();
1566 		return;
1567 	}
1568 
1569 	if (pcb->xstate != AVX512) {
1570 		do_avx512_alloc = TRUE;
1571 	}
1572 
1573 	simple_unlock(&pcb->lock);
1574 
1575 	if (do_avx512_alloc == TRUE) {
1576 		ifps512 = fp_state_alloc(AVX512);
1577 	}
1578 
1579 	simple_lock(&pcb->lock, LCK_GRP_NULL);
1580 
1581 	intr = ml_set_interrupts_enabled(FALSE);
1582 
1583 	clear_ts();
1584 	fp_save(thread);
1585 	clear_fpu();
1586 
1587 	xsetbv(0, AVX512_XMASK);
1588 	current_cpu_datap()->cpu_xstate = AVX512;
1589 	(void)ml_set_interrupts_enabled(intr);
1590 
1591 	assert(ifps->fp.fp_valid);
1592 
1593 	/* Allocate an AVX512 savearea and copy AVX state into it */
1594 	if (pcb->xstate != AVX512) {
1595 		__nochk_bcopy(ifps, ifps512, fp_state_size(AVX));
1596 		pcb->ifps = ifps512;
1597 		pcb->xstate = AVX512;
1598 		ifps512 = NULL;
1599 	} else {
1600 		ifps = NULL;
1601 	}
1602 	/* The PCB lock is redundant in some scenarios given the higher level
1603 	 * thread mutex, but its pre-emption disablement is relied upon here
1604 	 */
1605 	simple_unlock(&pcb->lock);
1606 
1607 	if (ifps) {
1608 		fp_state_free(ifps, AVX);
1609 	}
1610 	if (ifps512) {
1611 		fp_state_free(ifps512, AVX512);
1612 	}
1613 }
1614 
1615 /*
1616  * Upgrade the calling thread to AVX512.
1617  */
1618 boolean_t
1619 fpu_thread_promote_avx512(thread_t thread)
1620 {
1621 	task_t          task = current_task();
1622 
1623 	if (thread != current_thread()) {
1624 		return FALSE;
1625 	}
1626 	if (!ml_fpu_avx512_enabled()) {
1627 		return FALSE;
1628 	}
1629 
1630 	fpu_savearea_promote_avx512(thread);
1631 
1632 	/* Racy but the task's xstate is only a hint */
1633 	task->xstate = AVX512;
1634 
1635 	return TRUE;
1636 }
1637 
1638 
1639 /*
1640  * Called from user_trap() when an invalid opcode fault is taken.
1641  * If the user is attempting an AVX512 instruction on a machine
1642  * that supports this, we switch the calling thread to use
1643  * a larger savearea, set its XCR0 bit mask to enable AVX512 and
1644  * return to user_trap() with a 0 return value.
1645  * Otherwise, simply return a nonzero value.
1646  */
1647 
1648 #define MAX_X86_INSN_LENGTH (15)
1649 int
1650 fpUDflt(user_addr_t rip)
1651 {
1652 	uint8_t         instruction_prefix;
1653 	boolean_t       is_AVX512_instruction = FALSE;
1654 	user_addr_t     original_rip = rip;
1655 
1656 	/*
1657 	 * If this thread's xstate is already AVX512, then this #UD is
1658 	 * a true #UD.
1659 	 */
1660 	if (thread_xstate(current_thread()) == AVX512) {
1661 		return 1;
1662 	}
1663 
1664 	do {
1665 		/* TODO: as an optimisation, copy up to the lesser of the
1666 		 * next page boundary or maximal prefix length in one pass
1667 		 * rather than issue multiple copyins
1668 		 */
1669 		if (copyin(rip, (char *) &instruction_prefix, 1)) {
1670 			return 1;
1671 		}
1672 		DBG("fpUDflt(0x%016llx) prefix: 0x%x\n",
1673 		    rip, instruction_prefix);
1674 		/* TODO: determine more specifically which prefixes
1675 		 * are sane possibilities for AVX512 insns
1676 		 */
1677 		switch (instruction_prefix) {
1678 		case 0x2E:      /* CS segment override */
1679 		case 0x36:      /* SS segment override */
1680 		case 0x3E:      /* DS segment override */
1681 		case 0x26:      /* ES segment override */
1682 		case 0x64:      /* FS segment override */
1683 		case 0x65:      /* GS segment override */
1684 		case 0x66:      /* Operand-size override */
1685 		case 0x67:      /* address-size override */
1686 			/* Skip optional prefixes */
1687 			rip++;
1688 			if ((rip - original_rip) > MAX_X86_INSN_LENGTH) {
1689 				return 1;
1690 			}
1691 			break;
1692 		case 0x62:      /* EVEX */
1693 		case 0xC5:      /* VEX 2-byte */
1694 		case 0xC4:      /* VEX 3-byte */
1695 			is_AVX512_instruction = TRUE;
1696 			break;
1697 		default:
1698 			return 1;
1699 		}
1700 	} while (!is_AVX512_instruction);
1701 
1702 	/* Here if we detect attempted execution of an AVX512 instruction */
1703 
1704 	/*
1705 	 * Fail if this machine doesn't support AVX512
1706 	 */
1707 	if (!fpu_ZMM_capable) {
1708 		return 1;
1709 	}
1710 
1711 	assert(xgetbv(XCR0) == AVX_XMASK);
1712 
1713 	DBG("fpUDflt() switching xstate to AVX512\n");
1714 	(void) fpu_thread_promote_avx512(current_thread());
1715 
1716 	return 0;
1717 }
1718 
1719 void
1720 fp_setvalid(boolean_t value)
1721 {
1722 	thread_t        thr_act = current_thread();
1723 	struct x86_fx_thread_state *ifps = thr_act->machine.ifps;
1724 
1725 	if (ifps) {
1726 		ifps->fp_valid = value;
1727 
1728 		if (value == TRUE) {
1729 			boolean_t istate = ml_set_interrupts_enabled(FALSE);
1730 			clear_fpu();
1731 			ml_set_interrupts_enabled(istate);
1732 		}
1733 	}
1734 }
1735 
1736 boolean_t
1737 ml_fpu_avx_enabled(void)
1738 {
1739 	return fpu_capability >= AVX;
1740 }
1741 
1742 boolean_t
1743 ml_fpu_avx512_enabled(void)
1744 {
1745 	return fpu_ZMM_capable;
1746 }
1747 
1748 static xstate_t
1749 thread_xstate(thread_t thread)
1750 {
1751 	xstate_t xs = THREAD_TO_PCB(thread)->xstate;
1752 	if (xs != UNDEFINED) {
1753 		return xs;
1754 	} else if (startup_phase < STARTUP_SUB_EARLY_BOOT) {
1755 		return fpu_default;
1756 	} else {
1757 		return get_threadtask(thread)->xstate;
1758 	}
1759 }
1760 
1761 xstate_t
1762 current_xstate(void)
1763 {
1764 	return thread_xstate(current_thread());
1765 }
1766 
1767 /*
1768  * Called when exec'ing between bitnesses.
1769  * If valid FPU state exists, adjust the layout.
1770  */
1771 void
1772 fpu_switch_addrmode(thread_t thread, boolean_t is_64bit)
1773 {
1774 	struct x86_fx_thread_state *ifps = thread->machine.ifps;
1775 	mp_disable_preemption();
1776 
1777 	if (ifps && ifps->fp_valid) {
1778 		if (thread_xstate(thread) == FP) {
1779 			ifps->fp_save_layout = is_64bit ? FXSAVE64 : FXSAVE32;
1780 		} else {
1781 			ifps->fp_save_layout = is_64bit ? XSAVE64 : XSAVE32;
1782 		}
1783 	}
1784 	mp_enable_preemption();
1785 }
1786 
1787 #if DEBUG || DEVELOPMENT
1788 static inline uint32_t
1789 fpsimd_pop(uintptr_t ins, int sz)
1790 {
1791 	uint32_t rv = 0;
1792 
1793 
1794 	while (sz >= 16) {
1795 		uint32_t rv1, rv2;
1796 		uint64_t *ins64 = (uint64_t *) ins;
1797 		uint64_t *ins642 = (uint64_t *) (ins + 8);
1798 		rv1 = __builtin_popcountll(*ins64);
1799 		rv2 = __builtin_popcountll(*ins642);
1800 		rv += rv1 + rv2;
1801 		sz -= 16;
1802 		ins += 16;
1803 	}
1804 
1805 	while (sz >= 4) {
1806 		uint32_t *ins32 = (uint32_t *) ins;
1807 		rv += __builtin_popcount(*ins32);
1808 		sz -= 4;
1809 		ins += 4;
1810 	}
1811 
1812 	while (sz > 0) {
1813 		char *ins8 = (char *)ins;
1814 		rv += __builtin_popcount(*ins8);
1815 		sz--;
1816 		ins++;
1817 	}
1818 	return rv;
1819 }
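
/*
 * fpsimd_pop() is a cheap content hash (a population count) over a register
 * save area; thread_fpsimd_hash() below applies it to the thread's XMM state.
 * It is gated by fpsimd_fault_popc, which is configured in fpu_module_init()
 * via kern_feature_override() and the "fpsimd_fault_popc" boot-arg.
 */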
1820 
1821 bool
1822 thread_fpsimd_hash_enabled(void)
1823 {
1824 	return fpsimd_fault_popc ? true : false;
1825 }
1826 
1827 uint32_t __attribute__((noinline))
1828 thread_fpsimd_hash(thread_t ft)
1829 {
1830 	uint32_t prv = 0;
1831 	boolean_t istate = ml_set_interrupts_enabled(FALSE);
1832 	struct x86_fx_thread_state *pifps = THREAD_TO_PCB(ft)->ifps;
1833 
1834 	if (pifps) {
1835 		if (pifps->fp_valid) {
1836 			prv = fpsimd_pop((uintptr_t) &pifps->fx_XMM_reg[0][0],
1837 			    sizeof(pifps->fx_XMM_reg));
1838 		} else {
1839 			uintptr_t cr0 = get_cr0();
1840 			/*
1841 			 * The unusual case where the fp save area is not valid, yet TS is set,
1842 			 * is used to perform a lazy-init of FP state, so for this specific case,
1843 			 * assume that the popcount of the FP regs is 0.
1844 			 */
1845 			if (!(cr0 & CR0_TS)) {
1846 				fp_save(ft);
1847 				prv = fpsimd_pop((uintptr_t) &pifps->fx_XMM_reg[0][0],
1848 				    sizeof(pifps->fx_XMM_reg));
1849 				pifps->fp_valid = FALSE;
1850 			}
1851 		}
1852 	}
1853 	ml_set_interrupts_enabled(istate);
1854 	return prv;
1855 }
1856 #endif /* DEBUG || DEVELOPMENT */
1857