1 /*
2 * Copyright (c) 2000-2024 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1992-1990 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56
57 #include <mach/exception_types.h>
58 #include <mach/i386/thread_status.h>
59 #include <mach/i386/fp_reg.h>
60
61 #include <kern/mach_param.h>
62 #include <kern/processor.h>
63 #include <kern/thread.h>
64 #include <kern/zalloc.h>
65 #include <kern/misc_protos.h>
66 #include <kern/spl.h>
67 #include <kern/assert.h>
68
69 #include <libkern/OSAtomic.h>
70
71 #include <architecture/i386/pio.h>
72 #include <i386/cpuid.h>
73 #include <i386/fpu.h>
74 #include <i386/proc_reg.h>
75 #include <i386/misc_protos.h>
76 #include <i386/thread.h>
77 #include <i386/trap_internal.h>
78
79 xstate_t fpu_capability = UNDEFINED; /* extended state capability */
80 xstate_t fpu_default = UNDEFINED; /* default extended state */
81
82 #define ALIGNED(addr, size) (((uintptr_t)(addr)&((size)-1))==0)
83 #define VERIFY_SAVEAREA_ALIGNED(p, a) \
84 assertf(!(((uintptr_t)(p)) & ((a) - 1)), \
85 "FP save area component @ 0x%lx not 8-byte aligned", ((uintptr_t)(p)))
86
87 /* Forward */
88
89 extern void fpinit(void);
90 extern void fp_save(
91 thread_t thr_act);
92 extern void fp_load(
93 thread_t thr_act);
94
95 static void configure_mxcsr_capability_mask(x86_ext_thread_state_t *fps);
96 static xstate_t thread_xstate(thread_t);
97
98 static x86_ext_thread_state_t initial_fp_state __attribute((aligned(64)));
99 static x86_ext_thread_state_t default_avx512_state __attribute((aligned(64)));
100 static x86_ext_thread_state_t default_avx_state __attribute((aligned(64)));
101 static x86_ext_thread_state_t default_fx_state __attribute((aligned(64)));
102
103 /* Global MXCSR capability bitmask */
104 static unsigned int mxcsr_capability_mask;
105
106 #define fninit() \
107 __asm__ volatile("fninit")
108
109 #define fnstcw(control) \
110 __asm__("fnstcw %0" : "=m" (*(unsigned short *)(control)))
111
112 #define fldcw(control) \
113 __asm__ volatile("fldcw %0" : : "m" (*(unsigned short *) &(control)) )
114
115 #define fnclex() \
116 __asm__ volatile("fnclex")
117
118 #define fnsave(state) \
119 __asm__ volatile("fnsave %0" : "=m" (*state))
120
121 #define frstor(state) \
122 __asm__ volatile("frstor %0" : : "m" (state))
123
124 #define fwait() \
125 __asm__("fwait");
126
127 static inline void
128 fxrstor(struct x86_fx_thread_state *a)
129 {
130 __asm__ __volatile__ ("fxrstor %0" :: "m" (*a));
131 }
132
133 static inline void
134 fxsave(struct x86_fx_thread_state *a)
135 {
136 __asm__ __volatile__ ("fxsave %0" : "=m" (*a));
137 }
138
139 static inline void
140 fxrstor64(struct x86_fx_thread_state *a)
141 {
142 __asm__ __volatile__ ("fxrstor64 %0" :: "m" (*a));
143 }
144
145 static inline void
146 fxsave64(struct x86_fx_thread_state *a)
147 {
148 __asm__ __volatile__ ("fxsave64 %0" : "=m" (*a));
149 }
150
151 #define IS_VALID_XSTATE(x) ((x) == FP || (x) == AVX || (x) == AVX512)
152
153 SECURITY_READ_ONLY_LATE(zone_t) ifps_zone[] = {
154 [FP] = NULL,
155 [AVX] = NULL,
156 [AVX512] = NULL
157 };
158
159 #define AVX512_OFFSET offsetof(struct x86_avx512_thread_state, x_Opmask)
160
161 SECURITY_READ_ONLY_LATE(uint32_t) fp_avx512_offset = AVX512_OFFSET;
162
163 static uint32_t
164 fp_state_size(xstate_t state)
165 {
166 switch (state) {
167 case FP:
168 return sizeof(struct x86_fx_thread_state);
169 case AVX:
170 return sizeof(struct x86_avx_thread_state);
171 case AVX512:
172 switch (fp_avx512_offset) {
173 case AVX512_OFFSET:
174 return sizeof(struct x86_avx512_thread_state);
175 default:
176 panic("avx512 offset %d", fp_avx512_offset);
177 }
178 default:
179 panic("bad state %d", state);
180 }
181 }
182
183 struct fp_avx512 {
184 uint64_t *x_Opmask;
185 reg256_t *x_ZMM_Hi256;
186 reg512_t *x_Hi16_ZMM;
187 };
188
189 static struct fp_avx512
190 fp_avx512_get(struct x86_avx512_thread_state *iavx)
191 {
192 switch (fp_avx512_offset) {
193 case AVX512_OFFSET:
194 return (struct fp_avx512) {
195 .x_Opmask = iavx->x_Opmask,
196 .x_ZMM_Hi256 = iavx->x_ZMM_Hi256,
197 .x_Hi16_ZMM = iavx->x_Hi16_ZMM,
198 };
199 default:
200 panic("bad offset");
201 }
202 }
203
204 static const char *const xstate_name[] = {
205 [UNDEFINED] = "UNDEFINED",
206 [FP] = "FP",
207 [AVX] = "AVX",
208 [AVX512] = "AVX512"
209 };
210
211 #define fpu_ZMM_capable (fpu_capability == AVX512)
212 #define fpu_YMM_capable (fpu_capability == AVX || fpu_capability == AVX512)
213
214 /*
215 * On-demand AVX512 support
216 * ------------------------
217 * On machines with AVX512 support, by default, threads are created with
218 * AVX512 masked off in XCR0 and an AVX-sized savearea is used. However, AVX512
219 * capabilities are advertised in the commpage and via sysctl. If a thread
220 * opts to use AVX512 instructions, the first will result in a #UD exception.
221 * Faulting AVX512 instructions are recognizable by their unique prefix.
222 * This exception results in the thread being promoted to use an AVX512-sized
223 * savearea and for the AVX512 bit masks being set in its XCR0. The faulting
224 * instruction is re-driven and the thread can proceed to perform AVX512
225 * operations.
226 *
227 * In addition to AVX512 instructions causing promotion, a call to the
228 * thread_set_state() primitive with an AVX512 state flavor also results in promotion.
229 *
230 * AVX512 promotion of the first thread in a task causes the default xstate
231 * of the task to be promoted so that any subsequently created or subsequently
232 * DNA-faulted thread will have AVX512 xstate and it will not need to fault-in
233 * a promoted xstate.
234 *
235 * Two savearea zones are used: the default pool of AVX-sized (832 byte) areas
236 * and a second pool of larger AVX512-sized (2688 byte) areas.
237 *
238 * Note that the initial state value is an AVX512 object; the AVX initial
239 * value is a subset of it.
240 */
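/*
 * Illustrative user-space sketch (not part of this file; the compiler flag
 * and program are only assumptions about a typical toolchain): the promotion
 * described above is invisible to the application.  Its first AVX512
 * instruction takes a #UD, fpUDflt() promotes the thread, and the faulting
 * instruction is re-driven.  Built with, e.g., clang -mavx512f, something
 * like the following simply runs:
 *
 *	#include <immintrin.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		__m512d a = _mm512_set1_pd(1.0);  // first AVX512 use: #UD, promote, restart
 *		__m512d b = _mm512_set1_pd(2.0);
 *		__m512d c = _mm512_add_pd(a, b);  // thread now runs with AVX512 xstate
 *		double out[8];
 *		_mm512_storeu_pd(out, c);
 *		printf("%f\n", out[0]);
 *		return 0;
 *	}
 */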
241 static uint32_t cpuid_reevaluated = 0;
242
243 static void fpu_store_registers(void *, boolean_t);
244 static void fpu_load_registers(void *);
245
246 static const uint32_t xstate_xmask[] = {
247 [FP] = FP_XMASK,
248 [AVX] = AVX_XMASK,
249 [AVX512] = AVX512_XMASK
250 };
251
252 static inline void
253 xsave(struct x86_fx_thread_state *a, uint32_t rfbm)
254 {
255 __asm__ __volatile__ ("xsave %0" :"=m" (*a) : "a"(rfbm), "d"(0));
256 }
257
258 static inline void
259 xsave64(struct x86_fx_thread_state *a, uint32_t rfbm)
260 {
261 __asm__ __volatile__ ("xsave64 %0" :"=m" (*a) : "a"(rfbm), "d"(0));
262 }
263
264 static inline void
265 xrstor(struct x86_fx_thread_state *a, uint32_t rfbm)
266 {
267 __asm__ __volatile__ ("xrstor %0" :: "m" (*a), "a"(rfbm), "d"(0));
268 }
269
270 static inline void
271 xrstor64(struct x86_fx_thread_state *a, uint32_t rfbm)
272 {
273 __asm__ __volatile__ ("xrstor64 %0" :: "m" (*a), "a"(rfbm), "d"(0));
274 }
275
276 __unused static inline void
277 vzeroupper(void)
278 {
279 __asm__ __volatile__ ("vzeroupper" ::);
280 }
281
282 static boolean_t fpu_thread_promote_avx512(thread_t); /* Forward */
283
284
285 /*
286 * Furthermore, make compile-time asserts that no padding creeps into structures
287 * Make compile-time asserts that no padding creeps into the structures whose
288 * repeated register members are copied to/from user thread state as contiguous arrays.
289 #define ASSERT_PACKED(t, m1, m2, n, mt) \
290 extern char assert_packed_ ## t ## _ ## m1 ## _ ## m2 \
291 [(offsetof(t,m2) - offsetof(t,m1) == (n - 1)*sizeof(mt)) ? 1 : -1]
292
293 ASSERT_PACKED(x86_avx_state32_t, fpu_ymmh0, fpu_ymmh7, 8, _STRUCT_XMM_REG);
294
295 ASSERT_PACKED(x86_avx_state64_t, fpu_ymmh0, fpu_ymmh15, 16, _STRUCT_XMM_REG);
296
297 ASSERT_PACKED(x86_avx512_state32_t, fpu_k0, fpu_k7, 8, _STRUCT_OPMASK_REG);
298 ASSERT_PACKED(x86_avx512_state32_t, fpu_ymmh0, fpu_ymmh7, 8, _STRUCT_XMM_REG);
299 ASSERT_PACKED(x86_avx512_state32_t, fpu_zmmh0, fpu_zmmh7, 8, _STRUCT_YMM_REG);
300
301 ASSERT_PACKED(x86_avx512_state64_t, fpu_k0, fpu_k7, 8, _STRUCT_OPMASK_REG);
302 ASSERT_PACKED(x86_avx512_state64_t, fpu_ymmh0, fpu_ymmh15, 16, _STRUCT_XMM_REG);
303 ASSERT_PACKED(x86_avx512_state64_t, fpu_zmmh0, fpu_zmmh15, 16, _STRUCT_YMM_REG);
304 ASSERT_PACKED(x86_avx512_state64_t, fpu_zmm16, fpu_zmm31, 16, _STRUCT_ZMM_REG);
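/*
 * A minimal sketch of the check in isolation (hypothetical struct, not from
 * this file): ASSERT_PACKED(t, m1, m2, n, mt) compiles only when the n
 * members of type mt from m1 through m2 are contiguous, i.e.
 * offsetof(t, m2) - offsetof(t, m1) == (n - 1) * sizeof(mt).  That is what
 * permits treating a run of members such as fpu_ymmh0..fpu_ymmh7 as a single
 * array for the bulk copies in fpu_set_fxstate()/fpu_get_fxstate() below.
 *
 *	typedef struct { uint64_t r0, r1, r2, r3; } regs_t;
 *	ASSERT_PACKED(regs_t, r0, r3, 4, uint64_t);     // OK: 3 * 8 == 24
 *
 * Inserting a member (or padding) between r0 and r3 would make the assert
 * array size negative and the declaration fail to compile.
 */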
305
306 #if defined(DEBUG_AVX512)
307
308 #define DBG(x...) kprintf("DBG: " x)
309
310 typedef struct { uint8_t byte[8]; } opmask_t;
311 typedef struct { uint8_t byte[16]; } xmm_t;
312 typedef struct { uint8_t byte[32]; } ymm_t;
313 typedef struct { uint8_t byte[64]; } zmm_t;
314
315 static void
316 DBG_AVX512_STATE(struct x86_avx512_thread_state *sp)
317 {
318 const xmm_t *xmm = (const xmm_t *) &sp->fp.fx_XMM_reg;
319 const xmm_t *ymmh = (const xmm_t *) &sp->x_YMM_Hi128;
320
321 const struct fp_avx512 p = fp_avx512_get(sp);
322 const ymm_t *zmmh = (const ymm_t *) p.x_ZMM_Hi256;
323 const zmm_t *zmm = (const zmm_t *) p.x_Hi16_ZMM;
324 const opmask_t *k = (const opmask_t *) p.x_Opmask;
325
326 switch (fp_avx512_offset) {
327 case AVX512_OFFSET:
328 kprintf("x_YMM_Hi128: %lu\n",
329 offsetof(struct x86_avx512_thread_state, x_YMM_Hi128));
330 kprintf("x_Opmask: %lu\n",
331 offsetof(struct x86_avx512_thread_state, x_Opmask));
332 kprintf("x_ZMM_Hi256: %lu\n",
333 offsetof(struct x86_avx512_thread_state, x_ZMM_Hi256));
334 kprintf("x_Hi16_ZMM: %lu\n",
335 offsetof(struct x86_avx512_thread_state, x_Hi16_ZMM));
336 break;
337 default:
338 break;
339 }
340
341 kprintf("XCR0: 0x%016llx\n", xgetbv(XCR0));
342 kprintf("XINUSE: 0x%016llx\n", xgetbv(1));
343
344 /* Print all ZMM registers */
345 for (int i = 0; i < 16; i++) {
346 kprintf("zmm%d:\t0x", i);
347 for (int j = 0; j < 16; j++) {
348 kprintf("%02x", xmm[i].byte[j]);
349 }
350 for (int j = 0; j < 16; j++) {
351 kprintf("%02x", ymmh[i].byte[j]);
352 }
353 for (int j = 0; j < 32; j++) {
354 kprintf("%02x", zmmh[i].byte[j]);
355 }
356 kprintf("\n");
357 }
358 for (int i = 0; i < 16; i++) {
359 kprintf("zmm%d:\t0x", 16 + i);
360 for (int j = 0; j < 64; j++) {
361 kprintf("%02x", zmm[i].byte[j]);
362 }
363 kprintf("\n");
364 }
365 for (int i = 0; i < 8; i++) {
366 kprintf("k%d:\t0x", i);
367 for (int j = 0; j < 8; j++) {
368 kprintf("%02x", k[i].byte[j]);
369 }
370 kprintf("\n");
371 }
372
373 kprintf("xstate_bv: 0x%016llx\n", sp->_xh.xstate_bv);
374 kprintf("xcomp_bv: 0x%016llx\n", sp->_xh.xcomp_bv);
375 }
376 #else
377 #define DBG(x...)
378 static void
379 DBG_AVX512_STATE(__unused struct x86_avx512_thread_state *sp)
380 {
381 }
382 #endif /* DEBUG_AVX512 */
383
384 #if DEBUG
385 static inline unsigned short
386 fnstsw(void)
387 {
388 unsigned short status;
389 __asm__ volatile ("fnstsw %0" : "=ma" (status));
390 return status;
391 }
392 #endif
393
394 /*
395 * Configure the initial FPU state presented to new threads.
396 * Determine the MXCSR capability mask, which allows us to mask off any
397 * potentially unsafe "reserved" bits before restoring the FPU context.
398 * *Not* per-cpu, assumes symmetry.
399 */
400
401 static void
402 configure_mxcsr_capability_mask(x86_ext_thread_state_t *fps)
403 {
404 /* XSAVE requires a 64 byte aligned store */
405 assert(ALIGNED(fps, 64));
406 /* Clear, to prepare for the diagnostic FXSAVE */
407 bzero(fps, sizeof(*fps));
408
409 fpinit();
410 fpu_store_registers(fps, FALSE);
411
412 mxcsr_capability_mask = fps->fx.fx_MXCSR_MASK;
413
414 /* Set default mask value if necessary */
415 if (mxcsr_capability_mask == 0) {
416 mxcsr_capability_mask = 0xffbf;
417 }
418
419 /* Clear vector register store */
420 bzero(&fps->fx.fx_XMM_reg[0][0], sizeof(fps->fx.fx_XMM_reg));
421 bzero(fps->avx.x_YMM_Hi128, sizeof(fps->avx.x_YMM_Hi128));
422 if (fpu_ZMM_capable) {
423 const struct fp_avx512 p = fp_avx512_get(&fps->avx512);
424 bzero(p.x_ZMM_Hi256, sizeof(fps->avx512.x_ZMM_Hi256));
425 bzero(p.x_Hi16_ZMM, sizeof(fps->avx512.x_Hi16_ZMM));
426 bzero(p.x_Opmask, sizeof(fps->avx512.x_Opmask));
427 }
428
429 fps->fx.fp_valid = TRUE;
430 fps->fx.fp_save_layout = fpu_YMM_capable ? XSAVE32: FXSAVE32;
431 fpu_load_registers(fps);
432
433 if (fpu_ZMM_capable) {
434 xsave64((struct x86_fx_thread_state *)&default_avx512_state, xstate_xmask[AVX512]);
435 }
436 if (fpu_YMM_capable) {
437 xsave64((struct x86_fx_thread_state *)&default_avx_state, xstate_xmask[AVX]);
438 } else {
439 fxsave64((struct x86_fx_thread_state *)&default_fx_state);
440 }
441
442 /* Poison values to trap unsafe usage */
443 fps->fx.fp_valid = 0xFFFFFFFF;
444 fps->fx.fp_save_layout = FP_UNUSED;
445
446 /* Re-enable FPU/SSE DNA exceptions */
447 set_ts();
448 }
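/*
 * A small hedged sketch (hypothetical variable names) of how the captured
 * mask is meant to be applied to any user-supplied MXCSR before the value
 * can reach FXRSTOR/XRSTOR; the real uses are in fpu_set_fxstate() and
 * fpu_dup_fxstate() below.  Restoring a reserved MXCSR bit would raise #GP:
 *
 *	uint32_t user_mxcsr = incoming->fpu_mxcsr;   // untrusted user value
 *	user_mxcsr &= mxcsr_capability_mask;         // drop reserved bits
 *	ifps->fx_MXCSR = user_mxcsr;                 // now safe to restore
 */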
449
450 #if DEBUG || DEVELOPMENT
451 int fpsimd_fault_popc = 1;
452 #endif
453
454 /*
455 * Look for FPU and initialize it.
456 * Called on each CPU.
457 */
458 void
459 init_fpu(void)
460 {
461 #if DEBUG
462 unsigned short status;
463 unsigned short control;
464 #endif
465 /*
466 * Check for FPU by initializing it,
467 * then trying to read the correct bit patterns from
468 * the control and status registers.
469 */
470 set_cr0((get_cr0() & ~(CR0_EM | CR0_TS)) | CR0_NE); /* allow use of FPU */
471 fninit();
472 #if DEBUG
473 status = fnstsw();
474 fnstcw(&control);
475
476 assert(((status & 0xff) == 0) && ((control & 0x103f) == 0x3f));
477 #endif
478 /* Advertise SSE support */
479 if (cpuid_features() & CPUID_FEATURE_FXSR) {
480 set_cr4(get_cr4() | CR4_OSFXS);
481 /* And allow SIMD exceptions if present */
482 if (cpuid_features() & CPUID_FEATURE_SSE) {
483 set_cr4(get_cr4() | CR4_OSXMM);
484 }
485 } else {
486 panic("fpu is not FP_FXSR");
487 }
488
489 fpu_capability = fpu_default = FP;
490
491 static boolean_t is_avx512_enabled = TRUE;
492 if (cpu_number() == master_cpu) {
493 if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_AVX512F) {
494 PE_parse_boot_argn("avx512", &is_avx512_enabled, sizeof(boolean_t));
495 kprintf("AVX512 supported %s\n",
496 is_avx512_enabled ? "and enabled" : "but disabled");
497 }
498 }
499
500 /* Configure the XSAVE context mechanism if the processor supports
501 * AVX/YMM registers
502 */
503 if (cpuid_features() & CPUID_FEATURE_XSAVE) {
504 cpuid_xsave_leaf_t *xs0p = &cpuid_info()->cpuid_xsave_leaf[0];
505 if (is_avx512_enabled &&
506 (xs0p->extended_state[eax] & XFEM_ZMM_OPMASK) == XFEM_ZMM_OPMASK) {
507 assert(xs0p->extended_state[eax] & XFEM_SSE);
508 assert(xs0p->extended_state[eax] & XFEM_YMM);
509 fpu_capability = AVX512;
510 /* XSAVE container size for all features */
511 set_cr4(get_cr4() | CR4_OSXSAVE);
512 xsetbv(0, AVX512_XMASK);
513 /* Re-evaluate CPUID, once, to reflect OSXSAVE */
514 if (OSCompareAndSwap(0, 1, &cpuid_reevaluated)) {
515 cpuid_set_info();
516 }
517 /* Verify that now selected state can be accommodated */
518 assert(xs0p->extended_state[ebx] == fp_state_size(AVX512));
519 /*
520 * AVX set until AVX512 is used.
521 * See comment above about on-demand AVX512 support.
522 */
523 xsetbv(0, AVX_XMASK);
524 fpu_default = AVX;
525 } else if (xs0p->extended_state[eax] & XFEM_YMM) {
526 assert(xs0p->extended_state[eax] & XFEM_SSE);
527 fpu_capability = AVX;
528 fpu_default = AVX;
529 /* XSAVE container size for all features */
530 set_cr4(get_cr4() | CR4_OSXSAVE);
531 xsetbv(0, AVX_XMASK);
532 /* Re-evaluate CPUID, once, to reflect OSXSAVE */
533 if (OSCompareAndSwap(0, 1, &cpuid_reevaluated)) {
534 cpuid_set_info();
535 }
536 /* Verify that now selected state can be accommodated */
537 assert(xs0p->extended_state[ebx] == fp_state_size(AVX));
538 }
539 }
540
541 if (cpu_number() == master_cpu) {
542 kprintf("fpu_state: %s, state_size: %d\n",
543 xstate_name[fpu_capability],
544 fp_state_size(fpu_capability));
545 }
546
547 fpinit();
548 current_cpu_datap()->cpu_xstate = fpu_default;
549
550 /*
551 * Trap wait instructions. Turn off FPU for now.
552 */
553 set_cr0(get_cr0() | CR0_TS | CR0_MP);
554 }
555
556 /*
557 * Allocate and initialize FP state for specified xstate.
558 * Don't load state.
559 */
560 static void *
561 fp_state_alloc(xstate_t xs)
562 {
563 assert(ifps_zone[xs] != NULL);
564 return zalloc_flags(ifps_zone[xs], Z_WAITOK | Z_ZERO);
565 }
566
567 static inline void
568 fp_state_free(void *ifps, xstate_t xs)
569 {
570 assert(ifps_zone[xs] != NULL);
571 zfree(ifps_zone[xs], ifps);
572 }
573
574 void
575 clear_fpu(void)
576 {
577 set_ts();
578 }
579
580 static boolean_t
581 fpu_allzeroes(uint64_t * __attribute((aligned(8)))ptr, uint32_t size)
582 {
583 VERIFY_SAVEAREA_ALIGNED(ptr, sizeof(uint64_t));
584 assertf((size & (sizeof(uint64_t) - 1)) == 0, "FP save area component not a multiple of 8 bytes");
585
586 for (uint32_t count = 0; count < (size / sizeof(uint64_t)); count++) {
587 if (ptr[count] != 0) {
588 return FALSE;
589 }
590 }
591 return TRUE;
592 }
593
594 static void
595 fpu_load_registers(void *fstate)
596 {
597 struct x86_fx_thread_state *ifps = fstate;
598 fp_save_layout_t layout = ifps->fp_save_layout;
599
600 assert(startup_phase < STARTUP_SUB_EARLY_BOOT || \
601 (thread_is_64bit_addr(current_thread()) ? \
602 (layout == FXSAVE64 || layout == XSAVE64) : \
603 (layout == FXSAVE32 || layout == XSAVE32)));
604 assert(ALIGNED(ifps, 64));
605 assert(ml_get_interrupts_enabled() == FALSE);
606
607 #if DEBUG
608 if (layout == XSAVE32 || layout == XSAVE64) {
609 struct x86_avx_thread_state *iavx = fstate;
610 unsigned i;
611 /* Verify reserved bits in the XSAVE header*/
612 if (iavx->_xh.xstate_bv & ~xstate_xmask[current_xstate()]) {
613 panic("iavx->_xh.xstate_bv: 0x%llx", iavx->_xh.xstate_bv);
614 }
615 for (i = 0; i < sizeof(iavx->_xh.xhrsvd); i++) {
616 if (iavx->_xh.xhrsvd[i]) {
617 panic("Reserved bit set");
618 }
619 }
620 }
621 if (fpu_YMM_capable) {
622 if (layout != XSAVE32 && layout != XSAVE64) {
623 panic("Inappropriate layout: %u", layout);
624 }
625 }
626 #endif /* DEBUG */
627
628 switch (layout) {
629 case FXSAVE64:
630 fxrstor64(ifps);
631 break;
632 case FXSAVE32:
633 fxrstor(ifps);
634 break;
635 case XSAVE64:
636 xrstor64(ifps, xstate_xmask[current_xstate()]);
637 break;
638 case XSAVE32:
639 xrstor(ifps, xstate_xmask[current_xstate()]);
640 break;
641 default:
642 panic("fpu_load_registers() bad layout: %d", layout);
643 }
644 }
645
646 static void
647 fpu_store_registers(void *fstate, boolean_t is64)
648 {
649 struct x86_fx_thread_state *ifps = fstate;
650 assert(ALIGNED(ifps, 64));
651 xstate_t xs = current_xstate();
652 switch (xs) {
653 case FP:
654 if (is64) {
655 fxsave64(fstate);
656 ifps->fp_save_layout = FXSAVE64;
657 } else {
658 fxsave(fstate);
659 ifps->fp_save_layout = FXSAVE32;
660 }
661 break;
662 case AVX:
663 case AVX512:
664 if (is64) {
665 xsave64(ifps, xstate_xmask[xs]);
666 ifps->fp_save_layout = XSAVE64;
667 } else {
668 xsave(ifps, xstate_xmask[xs]);
669 ifps->fp_save_layout = XSAVE32;
670 }
671 break;
672 default:
673 panic("fpu_store_registers() bad xstate: %d", xs);
674 }
675 }
676
677 /*
678 * Initialize FP handling.
679 */
680
681 void
682 fpu_module_init(void)
683 {
684 if (!IS_VALID_XSTATE(fpu_default)) {
685 panic("fpu_module_init: invalid extended state %u",
686 fpu_default);
687 }
688
689 /* To maintain the required alignment, disable
690 * zone debugging for this zone as that appends
691 * 16 bytes to each element.
692 */
693 ifps_zone[fpu_default] = zone_create("x86 fpsave state",
694 fp_state_size(fpu_default), ZC_ALIGNMENT_REQUIRED | ZC_ZFREE_CLEARMEM);
695
696 /*
697 * If AVX512 is supported, create a separate savearea zone.
698 */
699 if (fpu_ZMM_capable) {
700 ifps_zone[AVX512] = zone_create("x86 avx512 save state",
701 fp_state_size(AVX512), ZC_ALIGNMENT_REQUIRED | ZC_ZFREE_CLEARMEM);
702 }
703
704 /* Determine MXCSR reserved bits and configure initial FPU state*/
705 configure_mxcsr_capability_mask(&initial_fp_state);
706
707 #if DEBUG || DEVELOPMENT
708 if (kern_feature_override(KF_DISABLE_FP_POPC_ON_PGFLT)) {
709 fpsimd_fault_popc = 0;
710 }
711
712 /* Allow the explicit boot-arg to override the validation disables */
713 PE_parse_boot_argn("fpsimd_fault_popc", &fpsimd_fault_popc, sizeof(fpsimd_fault_popc));
714 #endif
715 }
716
717 /*
718 * Context switch fpu state.
719 * Always save old thread`s FPU context but don't load new .. allow that to fault-in.
720 * Switch to the new task's xstate.
721 */
722
723 void
724 fpu_switch_context(thread_t old, thread_t new)
725 {
726 struct x86_fx_thread_state *ifps;
727 cpu_data_t *cdp = current_cpu_datap();
728 xstate_t new_xstate = new ? thread_xstate(new) : fpu_default;
729
730 assert(ml_get_interrupts_enabled() == FALSE);
731 ifps = (old)->machine.ifps;
732 #if DEBUG
733 if (ifps && ((ifps->fp_valid != FALSE) && (ifps->fp_valid != TRUE))) {
734 panic("ifps->fp_valid: %u", ifps->fp_valid);
735 }
736 #endif
737 if (ifps != 0 && (ifps->fp_valid == FALSE)) {
738 /* Clear CR0.TS in preparation for the FP context save. In
739 * theory, this shouldn't be necessary since a live FPU should
740 * indicate that TS is clear. However, various routines
741 * (such as sendsig & sigreturn) manipulate TS directly.
742 */
743 clear_ts();
744 /* registers are in FPU - save to memory */
745 boolean_t is64 = (thread_is_64bit_addr(old) &&
746 is_saved_state64(old->machine.iss));
747
748 fpu_store_registers(ifps, is64);
749 ifps->fp_valid = TRUE;
750
751 if (fpu_ZMM_capable && (cdp->cpu_xstate == AVX512)) {
752 xrstor64((struct x86_fx_thread_state *)&default_avx512_state, xstate_xmask[AVX512]);
753 } else if (fpu_YMM_capable) {
754 xrstor64((struct x86_fx_thread_state *) &default_avx_state, xstate_xmask[AVX]);
755 } else {
756 fxrstor64((struct x86_fx_thread_state *)&default_fx_state);
757 }
758 }
759
760 assertf(fpu_YMM_capable ? (xgetbv(XCR0) == xstate_xmask[cdp->cpu_xstate]) : TRUE, "XCR0 mismatch: 0x%llx 0x%x 0x%x", xgetbv(XCR0), cdp->cpu_xstate, xstate_xmask[cdp->cpu_xstate]);
761 if (new_xstate != (xstate_t) cdp->cpu_xstate) {
762 DBG("fpu_switch_context(%p,%p) new xstate: %s\n",
763 old, new, xstate_name[new_xstate]);
764 xsetbv(0, xstate_xmask[new_xstate]);
765 cdp->cpu_xstate = new_xstate;
766 }
767 set_ts();
768 }
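/*
 * Rough sketch of the lazy-switch protocol this function participates in
 * (a summary of the code in this file, not additional behavior):
 *
 *	fpu_switch_context(old, new)
 *		if old's registers are live (ifps->fp_valid == FALSE):
 *			clear_ts(); fpu_store_registers(ifps, is64); fp_valid = TRUE
 *			load the default initial state (default_avx512/avx/fx_state)
 *			so old's register contents do not leak
 *		if new's xstate differs from the CPU's: xsetbv(); cpu_xstate = new
 *		set_ts()                     -- arm the DNA trap for the new thread
 *
 *	first FP/SIMD use by the new thread -> #NM -> fpnoextflt()
 *		clear_ts(); allocate/initialize ifps if needed; fp_load(thread)
 */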
769
770
771 /*
772 * Free a FPU save area.
773 * Called only when thread terminating - no locking necessary.
774 */
775 void
776 fpu_free(thread_t thread, void *fps)
777 {
778 pcb_t pcb = THREAD_TO_PCB(thread);
779
780 fp_state_free(fps, pcb->xstate);
781 pcb->xstate = UNDEFINED;
782 }
783
784 /*
785 * Set the floating-point state for a thread based on the FXSave formatted data.
786 * This is basically the same as fpu_set_state except it uses the expanded data
787 * structure.
788 * If the thread is not the current thread, it is not running (held). Locking
789 * needed against concurrent fpu_set_state or fpu_get_state.
790 *
791 * While translating between XNU FP state structures and the CPU-native XSAVE area,
792 * if we detect state components that are all zeroes, we clear the corresponding
793 * xstate_bv bit in the XSAVE area, because that allows the corresponding state to
794 * be initialized to a "clean" state. That's most important when clearing the YMM
795 * bit, since an initialized "upper clean" state results in a massive performance
796 * improvement due to elimination of false dependencies between the XMMs and the
797 * upper bits of the YMMs.
798 */
799 kern_return_t
800 fpu_set_fxstate(
801 thread_t thr_act,
802 thread_state_t tstate,
803 thread_flavor_t f)
804 {
805 struct x86_fx_thread_state *ifps;
806 struct x86_fx_thread_state *new_ifps;
807 x86_float_state64_t *state;
808 pcb_t pcb;
809 boolean_t old_valid, fresh_state = FALSE;
810 xstate_t thr_xstate;
811
812 if (fpu_capability == UNDEFINED) {
813 return KERN_FAILURE;
814 }
815
816 if ((f == x86_AVX_STATE32 || f == x86_AVX_STATE64) &&
817 fpu_capability < AVX) {
818 return KERN_FAILURE;
819 }
820
821 assert(thr_act != THREAD_NULL);
822
823 thr_xstate = thread_xstate(thr_act);
824
825 if ((f == x86_AVX512_STATE32 || f == x86_AVX512_STATE64) &&
826 thr_xstate == AVX) {
827 if (!fpu_thread_promote_avx512(thr_act)) {
828 return KERN_FAILURE;
829 } else {
830 /* Reload thr_xstate after successful promotion */
831 thr_xstate = thread_xstate(thr_act);
832 }
833 }
834
835 state = (x86_float_state64_t *)tstate;
836
837 pcb = THREAD_TO_PCB(thr_act);
838
839 if (state == NULL) {
840 /*
841 * new FPU state is 'invalid'.
842 * Deallocate the fp state if it exists.
843 */
844 simple_lock(&pcb->lock, LCK_GRP_NULL);
845
846 ifps = pcb->ifps;
847 pcb->ifps = 0;
848
849 simple_unlock(&pcb->lock);
850
851 if (ifps != 0) {
852 fp_state_free(ifps, thr_xstate);
853 }
854 } else {
855 /*
856 * Valid incoming state. Allocate the fp state if there is none.
857 */
858 new_ifps = 0;
859 Retry:
860 simple_lock(&pcb->lock, LCK_GRP_NULL);
861
862 ifps = pcb->ifps;
863 if (ifps == 0) {
864 if (new_ifps == 0) {
865 simple_unlock(&pcb->lock);
866 new_ifps = fp_state_alloc(thr_xstate);
867 goto Retry;
868 }
869 ifps = new_ifps;
870 new_ifps = 0;
871 pcb->ifps = ifps;
872 pcb->xstate = thr_xstate;
873 fresh_state = TRUE;
874 }
875
876 /*
877 * now copy over the new data.
878 */
879
880 old_valid = ifps->fp_valid;
881
882 #if DEBUG || DEVELOPMENT
883 if ((fresh_state == FALSE) && (old_valid == FALSE) && (thr_act != current_thread())) {
884 panic("fpu_set_fxstate inconsistency, thread: %p not stopped", thr_act);
885 }
886 #endif
887 /*
888 * Clear any reserved bits in the MXCSR to prevent a GPF
889 * when issuing an FXRSTOR.
890 */
891
892 state->fpu_mxcsr &= mxcsr_capability_mask;
893
894 __nochk_bcopy((char *)&state->fpu_fcw, (char *)ifps, fp_state_size(FP));
895
896 switch (thr_xstate) {
897 case UNDEFINED_FULL:
898 case FP_FULL:
899 case AVX_FULL:
900 case AVX512_FULL:
901 panic("fpu_set_fxstate() INVALID xstate: 0x%x", thr_xstate);
902 break;
903
904 case UNDEFINED:
905 panic("fpu_set_fxstate() UNDEFINED xstate");
906 break;
907 case FP:
908 ifps->fp_save_layout = thread_is_64bit_addr(thr_act) ? FXSAVE64 : FXSAVE32;
909 break;
910 case AVX: {
911 struct x86_avx_thread_state *iavx = (void *) ifps;
912 x86_avx_state64_t *xs = (x86_avx_state64_t *) state;
913
914 iavx->fp.fp_save_layout = thread_is_64bit_addr(thr_act) ? XSAVE64 : XSAVE32;
915
916 /* Sanitize XSAVE header */
917 bzero(&iavx->_xh.xhrsvd[0], sizeof(iavx->_xh.xhrsvd));
918 iavx->_xh.xstate_bv = AVX_XMASK;
919 iavx->_xh.xcomp_bv = 0;
920
921 /*
922 * See the block comment at the top of the function for a description of why we're clearing
923 * xstate_bv bits.
924 */
925 if (f == x86_AVX_STATE32) {
926 __nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
927 if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
928 iavx->_xh.xstate_bv &= ~XFEM_YMM;
929 }
930 } else if (f == x86_AVX_STATE64) {
931 __nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
932 if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
933 iavx->_xh.xstate_bv &= ~XFEM_YMM;
934 }
935 } else {
936 iavx->_xh.xstate_bv = (XFEM_SSE | XFEM_X87);
937 }
938 break;
939 }
940 case AVX512: {
941 struct x86_avx512_thread_state *iavx = (void *) ifps;
942 union {
943 thread_state_t ts;
944 x86_avx512_state32_t *s32;
945 x86_avx512_state64_t *s64;
946 } xs = { .ts = tstate };
947
948 iavx->fp.fp_save_layout = thread_is_64bit_addr(thr_act) ? XSAVE64 : XSAVE32;
949
950 /* Sanitize XSAVE header */
951 bzero(&iavx->_xh.xhrsvd[0], sizeof(iavx->_xh.xhrsvd));
952 iavx->_xh.xstate_bv = AVX512_XMASK;
953 iavx->_xh.xcomp_bv = 0;
954
955 const struct fp_avx512 p = fp_avx512_get(iavx);
956
957 /*
958 * See the block comment at the top of the function for a description of why we're clearing
959 * xstate_bv bits.
960 */
961 switch (f) {
962 case x86_AVX512_STATE32:
963 __nochk_bcopy(&xs.s32->fpu_k0, p.x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
964 __nochk_bcopy(&xs.s32->fpu_zmmh0, p.x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG));
965
966 if (fpu_allzeroes((uint64_t *)(void *)p.x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG)) == TRUE) {
967 iavx->_xh.xstate_bv &= ~XFEM_OPMASK;
968 }
969
970 if (fpu_allzeroes((uint64_t *)(void *)p.x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG)) == TRUE) {
971 iavx->_xh.xstate_bv &= ~(XFEM_ZMM_HI256 | XFEM_HI16_ZMM);
972 }
973 __nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
974 if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
975 iavx->_xh.xstate_bv &= ~XFEM_YMM;
976 }
977
978 DBG_AVX512_STATE(iavx);
979 break;
980 case x86_AVX_STATE32:
981 __nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
982 if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
983 iavx->_xh.xstate_bv &= ~XFEM_YMM;
984 }
985 break;
986 case x86_AVX512_STATE64:
987 __nochk_bcopy(&xs.s64->fpu_k0, p.x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
988 __nochk_bcopy(&xs.s64->fpu_zmm16, p.x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG));
989 __nochk_bcopy(&xs.s64->fpu_zmmh0, p.x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG));
990 /*
991 * Note that it is valid to have XFEM_ZMM_OPMASK set but XFEM_YMM cleared. In that case,
992 * the upper bits of the YMMs would be cleared and would result in a clean-upper
993 * state, allowing SSE instruction to avoid false dependencies.
994 */
995 if (fpu_allzeroes((uint64_t *)(void *)p.x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG)) == TRUE) {
996 iavx->_xh.xstate_bv &= ~XFEM_OPMASK;
997 }
998
999 if (fpu_allzeroes((uint64_t *)(void *)p.x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG)) == TRUE &&
1000 fpu_allzeroes((uint64_t *)(void *)p.x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG)) == TRUE) {
1001 iavx->_xh.xstate_bv &= ~(XFEM_ZMM_HI256 | XFEM_HI16_ZMM);
1002 }
1003
1004 __nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
1005 if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
1006 iavx->_xh.xstate_bv &= ~XFEM_YMM;
1007 }
1008 DBG_AVX512_STATE(iavx);
1009 break;
1010 case x86_AVX_STATE64:
1011 __nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
1012 if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
1013 iavx->_xh.xstate_bv &= ~XFEM_YMM;
1014 }
1015 break;
1016 }
1017 break;
1018 }
1019 }
1020
1021 ifps->fp_valid = old_valid;
1022
1023 if (old_valid == FALSE) {
1024 boolean_t istate = ml_set_interrupts_enabled(FALSE);
1025 ifps->fp_valid = TRUE;
1026 /* If altering the current thread's state, disable FPU */
1027 if (thr_act == current_thread()) {
1028 set_ts();
1029 }
1030
1031 ml_set_interrupts_enabled(istate);
1032 }
1033
1034 simple_unlock(&pcb->lock);
1035
1036 if (new_ifps != 0) {
1037 fp_state_free(new_ifps, thr_xstate);
1038 }
1039 }
1040 return KERN_SUCCESS;
1041 }
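/*
 * Worked example of the xstate_bv trimming described above, assuming the
 * conventional XSAVE feature bit numbering used by XCR0 (x87 = bit 0,
 * SSE = bit 1, YMM-high = bit 2): an incoming x86_AVX_STATE64 whose
 * ymmh0..ymmh15 are all zero is written out with
 *
 *	xstate_bv = AVX_XMASK & ~XFEM_YMM = 0x7 & ~0x4 = 0x3
 *
 * so the subsequent XRSTOR places the YMM-high component in its init (zero)
 * state instead of loading it from memory, which is exactly the "upper
 * clean" condition that lets later SSE code avoid false dependencies.
 */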
1042
1043 /*
1044 * Get the floating-point state for a thread.
1045 * If the thread is not the current thread, it is
1046 * not running (held). Locking needed against
1047 * concurrent fpu_set_state or fpu_get_state.
1048 */
1049 kern_return_t
1050 fpu_get_fxstate(
1051 thread_t thr_act,
1052 thread_state_t tstate,
1053 thread_flavor_t f)
1054 {
1055 struct x86_fx_thread_state *ifps;
1056 x86_float_state64_t *state;
1057 kern_return_t ret = KERN_FAILURE;
1058 pcb_t pcb;
1059 xstate_t thr_xstate = thread_xstate(thr_act);
1060
1061 if (fpu_capability == UNDEFINED) {
1062 return KERN_FAILURE;
1063 }
1064
1065 if ((f == x86_AVX_STATE32 || f == x86_AVX_STATE64) &&
1066 fpu_capability < AVX) {
1067 return KERN_FAILURE;
1068 }
1069
1070 if ((f == x86_AVX512_STATE32 || f == x86_AVX512_STATE64) &&
1071 thr_xstate != AVX512) {
1072 return KERN_FAILURE;
1073 }
1074
1075 state = (x86_float_state64_t *)tstate;
1076
1077 assert(thr_act != THREAD_NULL);
1078 pcb = THREAD_TO_PCB(thr_act);
1079
1080 simple_lock(&pcb->lock, LCK_GRP_NULL);
1081
1082 ifps = pcb->ifps;
1083 if (ifps == 0) {
1084 /*
1085 * No valid floating-point state.
1086 */
1087
1088 __nochk_bcopy((char *)&initial_fp_state, (char *)&state->fpu_fcw,
1089 fp_state_size(FP));
1090
1091 simple_unlock(&pcb->lock);
1092
1093 return KERN_SUCCESS;
1094 }
1095 /*
1096 * Make sure we`ve got the latest fp state info
1097 * If the live fpu state belongs to our target
1098 */
1099 if (thr_act == current_thread()) {
1100 boolean_t intr;
1101
1102 intr = ml_set_interrupts_enabled(FALSE);
1103
1104 clear_ts();
1105 fp_save(thr_act);
1106 clear_fpu();
1107
1108 (void)ml_set_interrupts_enabled(intr);
1109 }
1110 if (ifps->fp_valid) {
1111 __nochk_bcopy((char *)ifps, (char *)&state->fpu_fcw, fp_state_size(FP));
1112 switch (thr_xstate) {
1113 case UNDEFINED_FULL:
1114 case FP_FULL:
1115 case AVX_FULL:
1116 case AVX512_FULL:
1117 panic("fpu_get_fxstate() INVALID xstate: 0x%x", thr_xstate);
1118 break;
1119
1120 case UNDEFINED:
1121 panic("fpu_get_fxstate() UNDEFINED xstate");
1122 break;
1123 case FP:
1124 break; /* already done */
1125 case AVX: {
1126 struct x86_avx_thread_state *iavx = (void *) ifps;
1127 x86_avx_state64_t *xs = (x86_avx_state64_t *) state;
1128 if (f == x86_AVX_STATE32) {
1129 __nochk_bcopy(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
1130 } else if (f == x86_AVX_STATE64) {
1131 __nochk_bcopy(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
1132 }
1133 break;
1134 }
1135 case AVX512: {
1136 struct x86_avx512_thread_state *iavx = (void *) ifps;
1137 union {
1138 thread_state_t ts;
1139 x86_avx512_state32_t *s32;
1140 x86_avx512_state64_t *s64;
1141 } xs = { .ts = tstate };
1142
1143 const struct fp_avx512 p = fp_avx512_get(iavx);
1144
1145 switch (f) {
1146 case x86_AVX512_STATE32:
1147 __nochk_bcopy(p.x_Opmask, &xs.s32->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG));
1148 __nochk_bcopy(p.x_ZMM_Hi256, &xs.s32->fpu_zmmh0, 8 * sizeof(_STRUCT_YMM_REG));
1149 __nochk_bcopy(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
1150 DBG_AVX512_STATE(iavx);
1151 break;
1152 case x86_AVX_STATE32:
1153 __nochk_bcopy(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
1154 break;
1155 case x86_AVX512_STATE64:
1156 __nochk_bcopy(p.x_Opmask, &xs.s64->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG));
1157 __nochk_bcopy(p.x_Hi16_ZMM, &xs.s64->fpu_zmm16, 16 * sizeof(_STRUCT_ZMM_REG));
1158 __nochk_bcopy(p.x_ZMM_Hi256, &xs.s64->fpu_zmmh0, 16 * sizeof(_STRUCT_YMM_REG));
1159 __nochk_bcopy(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
1160 DBG_AVX512_STATE(iavx);
1161 break;
1162 case x86_AVX_STATE64:
1163 __nochk_bcopy(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
1164 break;
1165 }
1166 break;
1167 }
1168 }
1169
1170 ret = KERN_SUCCESS;
1171 }
1172 simple_unlock(&pcb->lock);
1173
1174 return ret;
1175 }
1176
1177
1178
1179 /*
1180 * the child thread is 'stopped' with the thread
1181 * mutex held and is currently not known by anyone
1182 * so no way for fpu state to get manipulated by an
1183 * outside agency -> no need for pcb lock
1184 */
1185
1186 void
1187 fpu_dup_fxstate(
1188 thread_t parent,
1189 thread_t child)
1190 {
1191 struct x86_fx_thread_state *new_ifps = NULL;
1192 boolean_t intr;
1193 pcb_t ppcb;
1194 xstate_t xstate = thread_xstate(parent);
1195
1196 ppcb = THREAD_TO_PCB(parent);
1197
1198 if (ppcb->ifps == NULL) {
1199 return;
1200 }
1201
1202 if (child->machine.ifps) {
1203 panic("fpu_dup_fxstate: child's ifps non-null");
1204 }
1205
1206 new_ifps = fp_state_alloc(xstate);
1207
1208 simple_lock(&ppcb->lock, LCK_GRP_NULL);
1209
1210 if (ppcb->ifps != NULL) {
1211 struct x86_fx_thread_state *ifps = ppcb->ifps;
1212 /*
1213 * Make sure we`ve got the latest fp state info
1214 */
1215 if (current_thread() == parent) {
1216 intr = ml_set_interrupts_enabled(FALSE);
1217 assert(current_thread() == parent);
1218 clear_ts();
1219 fp_save(parent);
1220 clear_fpu();
1221
1222 (void)ml_set_interrupts_enabled(intr);
1223 }
1224
1225 if (ifps->fp_valid) {
1226 child->machine.ifps = new_ifps;
1227 child->machine.xstate = xstate;
1228 __nochk_bcopy((char *)(ppcb->ifps),
1229 (char *)(child->machine.ifps),
1230 fp_state_size(xstate));
1231
1232 /* Mark the new fp saved state as non-live. */
1233 /* Temporarily disabled: radar 4647827
1234 * new_ifps->fp_valid = TRUE;
1235 */
1236
1237 /*
1238 * Clear any reserved bits in the MXCSR to prevent a GPF
1239 * when issuing an FXRSTOR.
1240 */
1241 new_ifps->fx_MXCSR &= mxcsr_capability_mask;
1242 new_ifps = NULL;
1243 }
1244 }
1245 simple_unlock(&ppcb->lock);
1246
1247 if (new_ifps != NULL) {
1248 fp_state_free(new_ifps, xstate);
1249 }
1250 }
1251
1252 /*
1253 * Initialize FPU.
1254 * FNINIT programs the x87 control word to 0x37f, which matches
1255 * the desired default for macOS.
1256 */
1257
1258 void
1259 fpinit(void)
1260 {
1261 boolean_t istate = ml_set_interrupts_enabled(FALSE);
1262 clear_ts();
1263 fninit();
1264 #if DEBUG
1265 /* We skip this power-on-default verification sequence on
1266 * non-DEBUG, as dirtying the x87 control word may slow down
1267 * xsave/xrstor and affect energy use.
1268 */
1269 unsigned short control, control2;
1270 fnstcw(&control);
1271 control2 = control;
1272 control &= ~(FPC_PC | FPC_RC); /* Clear precision & rounding control */
1273 control |= (FPC_PC_64 | /* Set precision */
1274 FPC_RC_RN | /* round-to-nearest */
1275 FPC_ZE | /* Suppress zero-divide */
1276 FPC_OE | /* and overflow */
1277 FPC_UE | /* underflow */
1278 FPC_IE | /* Allow NaNQs and +-INF */
1279 FPC_DE | /* Allow denorms as operands */
1280 FPC_PE); /* No trap for precision loss */
1281 assert(control == control2);
1282 fldcw(control);
1283 #endif
1284 /* Initialize SSE/SSE2 */
1285 __builtin_ia32_ldmxcsr(0x1f80);
1286 if (fpu_YMM_capable) {
1287 vzeroall();
1288 } else {
1289 xmmzeroall();
1290 }
1291 ml_set_interrupts_enabled(istate);
1292 }
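/*
 * For reference, the two defaults established above decode as follows
 * (architectural bit layouts, not anything specific to this file):
 *
 *	x87 FCW 0x037f = 0x003f  all six exception masks (IM DM ZM OM UM PM)
 *	               | 0x0040  reserved bit, set by FNINIT
 *	               | 0x0300  PC = 64-bit (double extended) precision
 *	               | 0x0000  RC = round to nearest
 *
 *	MXCSR   0x1f80 = bits 7-12 set: all SIMD exception masks, with the
 *	                 status flags clear, RC = round to nearest, FZ/DAZ = 0
 */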
1293
1294 /*
1295 * Coprocessor not present.
1296 */
1297
1298 uint64_t x86_isr_fp_simd_use;
1299
1300 void
1301 fpnoextflt(void)
1302 {
1303 boolean_t intr;
1304 thread_t thr_act;
1305 pcb_t pcb;
1306 struct x86_fx_thread_state *ifps = 0;
1307 xstate_t xstate = current_xstate();
1308
1309 thr_act = current_thread();
1310 pcb = THREAD_TO_PCB(thr_act);
1311
1312 if (pcb->ifps == 0 && !get_interrupt_level()) {
1313 ifps = fp_state_alloc(xstate);
1314 __nochk_bcopy((char *)&initial_fp_state, (char *)ifps,
1315 fp_state_size(xstate));
1316 if (!thread_is_64bit_addr(thr_act)) {
1317 ifps->fp_save_layout = fpu_YMM_capable ? XSAVE32 : FXSAVE32;
1318 } else {
1319 ifps->fp_save_layout = fpu_YMM_capable ? XSAVE64 : FXSAVE64;
1320 }
1321 ifps->fp_valid = TRUE;
1322 }
1323 intr = ml_set_interrupts_enabled(FALSE);
1324
1325 clear_ts(); /* Enable FPU use */
1326
1327 if (__improbable(get_interrupt_level())) {
1328 /* Track number of #DNA traps at interrupt context,
1329 * which is likely suboptimal. Racy, but good enough.
1330 */
1331 x86_isr_fp_simd_use++;
1332 /*
1333 * Save current FP/SIMD context if valid
1334 * Initialize live FP/SIMD registers
1335 */
1336 if (pcb->ifps) {
1337 fp_save(thr_act);
1338 }
1339 fpinit();
1340 } else {
1341 if (pcb->ifps == 0) {
1342 pcb->ifps = ifps;
1343 pcb->xstate = xstate;
1344 ifps = 0;
1345 }
1346 /*
1347 * Load this thread`s state into coprocessor live context.
1348 */
1349 fp_load(thr_act);
1350 }
1351 (void)ml_set_interrupts_enabled(intr);
1352
1353 if (ifps) {
1354 fp_state_free(ifps, xstate);
1355 }
1356 }
1357
1358 /*
1359 * FPU overran end of segment.
1360 * Re-initialize FPU. Floating point state is not valid.
1361 */
1362
1363 void
1364 fpextovrflt(void)
1365 {
1366 thread_t thr_act = current_thread();
1367 pcb_t pcb;
1368 struct x86_fx_thread_state *ifps;
1369 boolean_t intr;
1370 xstate_t xstate = current_xstate();
1371
1372 intr = ml_set_interrupts_enabled(FALSE);
1373
1374 if (get_interrupt_level()) {
1375 panic("FPU segment overrun exception at interrupt context");
1376 }
1377 if (current_task() == kernel_task) {
1378 panic("FPU segment overrun exception in kernel thread context");
1379 }
1380
1381 /*
1382 * This is a non-recoverable error.
1383 * Invalidate the thread`s FPU state.
1384 */
1385 pcb = THREAD_TO_PCB(thr_act);
1386 simple_lock(&pcb->lock, LCK_GRP_NULL);
1387 ifps = pcb->ifps;
1388 pcb->ifps = 0;
1389 simple_unlock(&pcb->lock);
1390
1391 /*
1392 * Re-initialize the FPU.
1393 */
1394 clear_ts();
1395 fninit();
1396
1397 /*
1398 * And disable access.
1399 */
1400 clear_fpu();
1401
1402 (void)ml_set_interrupts_enabled(intr);
1403
1404 if (ifps) {
1405 fp_state_free(ifps, xstate);
1406 }
1407 }
1408
1409 /*
1410 * FPU error. Called by AST.
1411 */
1412
1413 void
1414 fpexterrflt(void)
1415 {
1416 thread_t thr_act = current_thread();
1417 boolean_t intr;
1418
1419 intr = ml_set_interrupts_enabled(FALSE);
1420
1421 if (get_interrupt_level()) {
1422 panic("FPU error exception at interrupt context");
1423 }
1424 if (current_task() == kernel_task) {
1425 panic("FPU error exception in kernel thread context");
1426 }
1427
1428 /*
1429 * Save the FPU state and turn off the FPU.
1430 */
1431 fp_save(thr_act);
1432 /* Set TS to ensure we catch attempts to use the FPU before returning from trap handling */
1433 set_ts();
1434
1435 (void)ml_set_interrupts_enabled(intr);
1436 }
1437
1438 /*
1439 * Save FPU state.
1440 *
1441 * Locking not needed:
1442 * . if called from fpu_get_state, pcb already locked.
1443 * . if called from fpnoextflt or fp_intr, we are single-cpu
1444 * . otherwise, thread is running.
1445 * N.B.: Must be called with interrupts disabled
1446 */
1447
1448 void
1449 fp_save(
1450 thread_t thr_act)
1451 {
1452 pcb_t pcb = THREAD_TO_PCB(thr_act);
1453 struct x86_fx_thread_state *ifps = pcb->ifps;
1454
1455 assert(ifps != 0);
1456 if (ifps != 0 && !ifps->fp_valid) {
1457 assert((get_cr0() & CR0_TS) == 0);
1458 /* registers are in FPU */
1459 ifps->fp_valid = TRUE;
1460 fpu_store_registers(ifps, thread_is_64bit_addr(thr_act));
1461 }
1462 }
1463
1464 /*
1465 * Restore FPU state from PCB.
1466 *
1467 * Locking not needed; always called on the current thread.
1468 */
1469
1470 void
1471 fp_load(
1472 thread_t thr_act)
1473 {
1474 pcb_t pcb = THREAD_TO_PCB(thr_act);
1475 struct x86_fx_thread_state *ifps = pcb->ifps;
1476
1477 assert(ifps);
1478 #if DEBUG
1479 if (ifps->fp_valid != FALSE && ifps->fp_valid != TRUE) {
1480 panic("fp_load() invalid fp_valid: %u, fp_save_layout: %u",
1481 ifps->fp_valid, ifps->fp_save_layout);
1482 }
1483 #endif
1484
1485 if (ifps->fp_valid == FALSE) {
1486 fpinit();
1487 } else {
1488 fpu_load_registers(ifps);
1489 }
1490 ifps->fp_valid = FALSE; /* in FPU */
1491 }
1492
1493 /*
1494 * SSE arithmetic exception handling code.
1495 * Basically the same as the x87 exception handler with a different subtype
1496 */
1497
1498 void
1499 fpSSEexterrflt(void)
1500 {
1501 thread_t thr_act = current_thread();
1502 boolean_t intr;
1503
1504 intr = ml_set_interrupts_enabled(FALSE);
1505
1506 if (get_interrupt_level()) {
1507 panic("SSE exception at interrupt context");
1508 }
1509 if (current_task() == kernel_task) {
1510 panic("SSE exception in kernel thread context");
1511 }
1512
1513 /*
1514 * Save the FPU state and turn off the FPU.
1515 */
1516 fp_save(thr_act);
1517 /* Set TS to ensure we catch attempts to use the FPU before returning from trap handling */
1518 set_ts();
1519
1520 (void)ml_set_interrupts_enabled(intr);
1521 }
1522
1523
1524 /*
1525 * If a thread is using an AVX-sized savearea:
1526 * - allocate a new AVX512-sized area,
1527 * - copy the 256-bit state into the 512-bit area,
1528 * - deallocate the smaller area
1529 * ASSUMES: thread is the current thread.
1530 */
1531 static void
1532 fpu_savearea_promote_avx512(thread_t thread)
1533 {
1534 struct x86_avx_thread_state *ifps = NULL;
1535 struct x86_avx512_thread_state *ifps512 = NULL;
1536 pcb_t pcb = THREAD_TO_PCB(thread);
1537 boolean_t do_avx512_alloc = FALSE;
1538 boolean_t intr;
1539
1540 assert(thread == current_thread());
1541
1542 DBG("fpu_savearea_promote_avx512(%p)\n", thread);
1543
1544 simple_lock(&pcb->lock, LCK_GRP_NULL);
1545
1546 ifps = pcb->ifps;
1547 if (ifps == NULL) {
1548 pcb->xstate = AVX512;
1549 simple_unlock(&pcb->lock);
1550 /*
1551 * Now that the PCB xstate has been promoted, set XCR0 so
1552 * that we don't re-trip #UD on the next AVX-512 instruction.
1553 *
1554 * Since this branch is taken when the first FP instruction
1555 * attempted by this thread is an AVX-512 instruction, we
1556 * call fpnoextflt() to allocate an appropriately-sized
1557 * AVX-512 save-area, thereby avoiding the overhead of another
1558 * fault that would be triggered immediately on return.
1559 */
1560 intr = ml_set_interrupts_enabled(FALSE);
1561 xsetbv(0, AVX512_XMASK);
1562 current_cpu_datap()->cpu_xstate = AVX512;
1563 (void)ml_set_interrupts_enabled(intr);
1564
1565 fpnoextflt();
1566 return;
1567 }
1568
1569 if (pcb->xstate != AVX512) {
1570 do_avx512_alloc = TRUE;
1571 }
1572
1573 simple_unlock(&pcb->lock);
1574
1575 if (do_avx512_alloc == TRUE) {
1576 ifps512 = fp_state_alloc(AVX512);
1577 }
1578
1579 simple_lock(&pcb->lock, LCK_GRP_NULL);
1580
1581 intr = ml_set_interrupts_enabled(FALSE);
1582
1583 clear_ts();
1584 fp_save(thread);
1585 clear_fpu();
1586
1587 xsetbv(0, AVX512_XMASK);
1588 current_cpu_datap()->cpu_xstate = AVX512;
1589 (void)ml_set_interrupts_enabled(intr);
1590
1591 assert(ifps->fp.fp_valid);
1592
1593 /* Allocate an AVX512 savearea and copy AVX state into it */
1594 if (pcb->xstate != AVX512) {
1595 __nochk_bcopy(ifps, ifps512, fp_state_size(AVX));
1596 pcb->ifps = ifps512;
1597 pcb->xstate = AVX512;
1598 ifps512 = NULL;
1599 } else {
1600 ifps = NULL;
1601 }
1602 /* The PCB lock is redundant in some scenarios given the higher level
1603 * thread mutex, but its pre-emption disablement is relied upon here
1604 */
1605 simple_unlock(&pcb->lock);
1606
1607 if (ifps) {
1608 fp_state_free(ifps, AVX);
1609 }
1610 if (ifps512) {
1611 fp_state_free(ifps512, AVX512);
1612 }
1613 }
1614
1615 /*
1616 * Upgrade the calling thread to AVX512.
1617 */
1618 boolean_t
1619 fpu_thread_promote_avx512(thread_t thread)
1620 {
1621 task_t task = current_task();
1622
1623 if (thread != current_thread()) {
1624 return FALSE;
1625 }
1626 if (!ml_fpu_avx512_enabled()) {
1627 return FALSE;
1628 }
1629
1630 fpu_savearea_promote_avx512(thread);
1631
1632 /* Racy but the task's xstate is only a hint */
1633 task->xstate = AVX512;
1634
1635 return TRUE;
1636 }
1637
1638
1639 /*
1640 * Called from user_trap() when an invalid opcode fault is taken.
1641 * If the user is attempting an AVX512 instruction on a machine
1642 * that supports this, we switch the calling thread to use
1643 * a larger savearea, set its XCR0 bit mask to enable AVX512 and
1644 * return to user_trap() with a 0 return value.
1645 * Otherwise, simply return a nonzero value.
1646 */
1647
1648 #define MAX_X86_INSN_LENGTH (15)
1649 int
1650 fpUDflt(user_addr_t rip)
1651 {
1652 uint8_t instruction_prefix;
1653 boolean_t is_AVX512_instruction = FALSE;
1654 user_addr_t original_rip = rip;
1655
1656 /*
1657 * If this thread's xstate is already AVX512, then this #UD is
1658 * a true #UD.
1659 */
1660 if (thread_xstate(current_thread()) == AVX512) {
1661 return 1;
1662 }
1663
1664 do {
1665 /* TODO: as an optimisation, copy up to the lesser of the
1666 * next page boundary or maximal prefix length in one pass
1667 * rather than issue multiple copyins
1668 */
1669 if (copyin(rip, (char *) &instruction_prefix, 1)) {
1670 return 1;
1671 }
1672 DBG("fpUDflt(0x%016llx) prefix: 0x%x\n",
1673 rip, instruction_prefix);
1674 /* TODO: determine more specifically which prefixes
1675 * are sane possibilities for AVX512 insns
1676 */
1677 switch (instruction_prefix) {
1678 case 0x2E: /* CS segment override */
1679 case 0x36: /* SS segment override */
1680 case 0x3E: /* DS segment override */
1681 case 0x26: /* ES segment override */
1682 case 0x64: /* FS segment override */
1683 case 0x65: /* GS segment override */
1684 case 0x66: /* Operand-size override */
1685 case 0x67: /* address-size override */
1686 /* Skip optional prefixes */
1687 rip++;
1688 if ((rip - original_rip) > MAX_X86_INSN_LENGTH) {
1689 return 1;
1690 }
1691 break;
1692 case 0x62: /* EVEX */
1693 case 0xC5: /* VEX 2-byte */
1694 case 0xC4: /* VEX 3-byte */
1695 is_AVX512_instruction = TRUE;
1696 break;
1697 default:
1698 return 1;
1699 }
1700 } while (!is_AVX512_instruction);
1701
1702 /* Here if we detect attempted execution of an AVX512 instruction */
1703
1704 /*
1705 * Fail if this machine doesn't support AVX512
1706 */
1707 if (!fpu_ZMM_capable) {
1708 return 1;
1709 }
1710
1711 assert(xgetbv(XCR0) == AVX_XMASK);
1712
1713 DBG("fpUDflt() switching xstate to AVX512\n");
1714 (void) fpu_thread_promote_avx512(current_thread());
1715
1716 return 0;
1717 }
1718
1719 void
1720 fp_setvalid(boolean_t value)
1721 {
1722 thread_t thr_act = current_thread();
1723 struct x86_fx_thread_state *ifps = thr_act->machine.ifps;
1724
1725 if (ifps) {
1726 ifps->fp_valid = value;
1727
1728 if (value == TRUE) {
1729 boolean_t istate = ml_set_interrupts_enabled(FALSE);
1730 clear_fpu();
1731 ml_set_interrupts_enabled(istate);
1732 }
1733 }
1734 }
1735
1736 boolean_t
1737 ml_fpu_avx_enabled(void)
1738 {
1739 return fpu_capability >= AVX;
1740 }
1741
1742 boolean_t
1743 ml_fpu_avx512_enabled(void)
1744 {
1745 return fpu_ZMM_capable;
1746 }
1747
1748 static xstate_t
1749 thread_xstate(thread_t thread)
1750 {
1751 xstate_t xs = THREAD_TO_PCB(thread)->xstate;
1752 if (xs != UNDEFINED) {
1753 return xs;
1754 } else if (startup_phase < STARTUP_SUB_EARLY_BOOT) {
1755 return fpu_default;
1756 } else {
1757 return get_threadtask(thread)->xstate;
1758 }
1759 }
1760
1761 xstate_t
1762 current_xstate(void)
1763 {
1764 return thread_xstate(current_thread());
1765 }
1766
1767 /*
1768 * Called when exec'ing between bitnesses.
1769 * If valid FPU state exists, adjust the layout.
1770 */
1771 void
1772 fpu_switch_addrmode(thread_t thread, boolean_t is_64bit)
1773 {
1774 struct x86_fx_thread_state *ifps = thread->machine.ifps;
1775 mp_disable_preemption();
1776
1777 if (ifps && ifps->fp_valid) {
1778 if (thread_xstate(thread) == FP) {
1779 ifps->fp_save_layout = is_64bit ? FXSAVE64 : FXSAVE32;
1780 } else {
1781 ifps->fp_save_layout = is_64bit ? XSAVE64 : XSAVE32;
1782 }
1783 }
1784 mp_enable_preemption();
1785 }
1786
1787 #if DEBUG || DEVELOPMENT
1788 static inline uint32_t
1789 fpsimd_pop(uintptr_t ins, int sz)
1790 {
1791 uint32_t rv = 0;
1792
1793
1794 while (sz >= 16) {
1795 uint32_t rv1, rv2;
1796 uint64_t *ins64 = (uint64_t *) ins;
1797 uint64_t *ins642 = (uint64_t *) (ins + 8);
1798 rv1 = __builtin_popcountll(*ins64);
1799 rv2 = __builtin_popcountll(*ins642);
1800 rv += rv1 + rv2;
1801 sz -= 16;
1802 ins += 16;
1803 }
1804
1805 while (sz >= 4) {
1806 uint32_t *ins32 = (uint32_t *) ins;
1807 rv += __builtin_popcount(*ins32);
1808 sz -= 4;
1809 ins += 4;
1810 }
1811
1812 while (sz > 0) {
1813 char *ins8 = (char *)ins;
1814 rv += __builtin_popcount(*ins8);
1815 sz--;
1816 ins++;
1817 }
1818 return rv;
1819 }
1820
1821 bool
1822 thread_fpsimd_hash_enabled(void)
1823 {
1824 return fpsimd_fault_popc ? true : false;
1825 }
1826
1827 uint32_t __attribute__((noinline))
1828 thread_fpsimd_hash(thread_t ft)
1829 {
1830 uint32_t prv = 0;
1831 boolean_t istate = ml_set_interrupts_enabled(FALSE);
1832 struct x86_fx_thread_state *pifps = THREAD_TO_PCB(ft)->ifps;
1833
1834 if (pifps) {
1835 if (pifps->fp_valid) {
1836 prv = fpsimd_pop((uintptr_t) &pifps->fx_XMM_reg[0][0],
1837 sizeof(pifps->fx_XMM_reg));
1838 } else {
1839 uintptr_t cr0 = get_cr0();
1840 /*
1841 * The unusual case where the fp save area is not valid, yet TS is set,
1842 * is used to perform a lazy-init of FP state, so for this specific case,
1843 * assume that the popcount of the FP regs is 0.
1844 */
1845 if (!(cr0 & CR0_TS)) {
1846 fp_save(ft);
1847 prv = fpsimd_pop((uintptr_t) &pifps->fx_XMM_reg[0][0],
1848 sizeof(pifps->fx_XMM_reg));
1849 pifps->fp_valid = FALSE;
1850 }
1851 }
1852 }
1853 ml_set_interrupts_enabled(istate);
1854 return prv;
1855 }
1856 #endif /* DEBUG || DEVELOPMENT */
1857