1 use super::{
2     RegAlloc,
3     abi::X64ABI,
4     address::Address,
5     asm::{Assembler, PatchableAddToReg, VcmpKind, VcvtKind, VroundMode},
6     regs::{self, rbp, rsp, scratch_fpr_bitset, scratch_gpr_bitset},
7 };
8 use crate::masm::{
9     DivKind, Extend, ExtendKind, ExtractLaneKind, FloatCmpKind, FloatScratch, Imm, Imm as I,
10     IntCmpKind, IntScratch, LaneSelector, LoadKind, MacroAssembler as Masm, MulWideKind,
11     OperandSize, RegImm, RemKind, ReplaceLaneKind, RmwOp, RoundingMode, Scratch, ScratchType,
12     ShiftKind, SplatKind, StoreKind, TRUSTED_FLAGS, TrapCode, TruncKind, UNTRUSTED_FLAGS,
13     V128AbsKind, V128AddKind, V128ConvertKind, V128ExtAddKind, V128ExtMulKind, V128ExtendKind,
14     V128MaxKind, V128MinKind, V128MulKind, V128NarrowKind, V128NegKind, V128SubKind, V128TruncKind,
15     VectorCompareKind, VectorEqualityKind, Zero,
16 };
17 use crate::{
18     Result,
19     abi::{self, LocalSlot, align_to, calculate_frame_adjustment},
20     bail,
21     codegen::{CodeGenContext, CodeGenError, Emission, FuncEnv, ptr_type_from_ptr_size},
22     format_err,
23     stack::{TypedReg, Val},
24 };
25 use crate::{
26     abi::{ABI, vmctx},
27     masm::{SPOffset, StackSlot},
28 };
29 use crate::{
30     isa::{
31         CallingConvention,
32         reg::{Reg, RegClass, WritableReg, writable},
33     },
34     masm::CalleeKind,
35 };
36 use cranelift_codegen::{
37     Final, MachBufferFinalized, MachLabel,
38     binemit::CodeOffset,
39     ir::{MemFlags, RelSourceLoc, SourceLoc},
40     isa::{
41         unwind::UnwindInst,
42         x64::{AtomicRmwSeqOp, args::CC, settings as x64_settings},
43     },
44     settings,
45 };
46 use wasmtime_cranelift::TRAP_UNREACHABLE;
47 use wasmtime_environ::{PtrSize, WasmValType};
48 
49 // Taken from `cranelift/codegen/src/isa/x64/lower/isle.rs`
50 // Since x64 doesn't have 8x16 shifts and we must use a 16x8 shift instead, we
51 // need to fix up the bits that migrate from one half of the lane to the
52 // other. Each 16-byte mask is indexed by the shift amount: e.g. if we shift
53 // right by 0 (no movement), we want to retain all the bits so we mask with
54 // `0xff`; if we shift right by 1, we want to retain all bits except the MSB so
55 // we mask with `0x7f`; etc.
56 
/// Byte masks used when emulating an `i8x16` left shift with a wider-lane
/// shift.
///
/// The table holds eight rows of sixteen identical bytes. Row `n` contains
/// `0xff << n` truncated to a byte (`0xff`, `0xfe`, `0xfc`, ... `0x80`), i.e.
/// a mask that clears the low `n` bits of every byte, which are the bits that
/// migrate in from the neighbouring byte of the wider lane.
///
/// NOTE(review): presumably a row is selected by the shift amount (row index
/// times 16 bytes); confirm at the use site.
#[rustfmt::skip] // Preserve 16 bytes (i.e. one mask) per row.
const I8X16_ISHL_MASKS: [u8; 128] = [
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
    0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
    0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
    0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
    0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
    0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
];
68 
/// Byte masks used when emulating an `i8x16` unsigned right shift with a
/// wider-lane shift.
///
/// The table holds eight rows of sixteen identical bytes. Row `n` contains
/// `0xff >> n` (`0xff`, `0x7f`, `0x3f`, ... `0x01`), i.e. a mask that clears
/// the high `n` bits of every byte, which are the bits that migrate in from
/// the neighbouring byte of the wider lane.
///
/// NOTE(review): presumably a row is selected by the shift amount (row index
/// times 16 bytes); confirm at the use site.
#[rustfmt::skip] // Preserve 16 bytes (i.e. one mask) per row.
const I8X16_USHR_MASKS: [u8; 128] = [
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
    0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
    0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
    0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
    0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
    0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
];
80 
/// x64 MacroAssembler.
///
/// Wraps the low level [`Assembler`] and tracks the stack pointer
/// bookkeeping needed while emitting code for a single function.
pub(crate) struct MacroAssembler {
    /// Stack pointer offset.
    sp_offset: u32,
    /// This value represents the maximum stack size seen while compiling the function. While the
    /// function is still being compiled its value will not be valid (the stack will grow and
    /// shrink as space is reserved and freed during compilation), but once all instructions have
    /// been seen this value will be the maximum stack usage seen.
    sp_max: u32,
    /// Add instructions that are used to add the constant stack max to a register.
    stack_max_use_add: Option<PatchableAddToReg>,
    /// Low level assembler.
    asm: Assembler,
    /// ISA flags.
    flags: x64_settings::Flags,
    /// Shared flags.
    shared_flags: settings::Flags,
    /// The target pointer size.
    ptr_size: OperandSize,
    /// Scratch register scope.
    scratch_scope: RegAlloc,
}
103 
104 impl Masm for MacroAssembler {
105     type Address = Address;
106     type Ptr = u8;
107     type ABI = X64ABI;
108 
frame_setup(&mut self) -> Result<()>109     fn frame_setup(&mut self) -> Result<()> {
110         let frame_pointer = rbp();
111         let stack_pointer = rsp();
112 
113         self.asm.push_r(frame_pointer);
114 
115         if self.shared_flags.unwind_info() {
116             self.asm.unwind_inst(UnwindInst::PushFrameRegs {
117                 offset_upward_to_caller_sp: Self::ABI::arg_base_offset().into(),
118             })
119         }
120 
121         self.asm
122             .mov_rr(stack_pointer, writable!(frame_pointer), OperandSize::S64);
123 
124         Ok(())
125     }
126 
check_stack(&mut self, vmctx: Reg) -> Result<()>127     fn check_stack(&mut self, vmctx: Reg) -> Result<()> {
128         let ptr_size: u8 = self.ptr_size.bytes().try_into().unwrap();
129 
130         self.with_scratch::<IntScratch, _>(|masm, scratch| {
131             masm.load_ptr(
132                 masm.address_at_reg(vmctx, ptr_size.vmcontext_store_context().into())?,
133                 scratch.writable(),
134             )?;
135 
136             masm.load_ptr(
137                 Address::offset(
138                     scratch.inner(),
139                     ptr_size.vmstore_context_stack_limit().into(),
140                 ),
141                 scratch.writable(),
142             )?;
143 
144             masm.add_stack_max(scratch.inner());
145 
146             masm.asm.cmp_rr(scratch.inner(), regs::rsp(), masm.ptr_size);
147             masm.asm.trapif(IntCmpKind::GtU, TrapCode::STACK_OVERFLOW);
148             wasmtime_environ::error::Ok(())
149         })?;
150 
151         // Emit unwind info.
152         if self.shared_flags.unwind_info() {
153             self.asm.unwind_inst(UnwindInst::DefineNewFrame {
154                 offset_upward_to_caller_sp: Self::ABI::arg_base_offset().into(),
155 
156                 // The Winch calling convention has no callee-save registers, so nothing will be
157                 // clobbered.
158                 offset_downward_to_clobbers: 0,
159             })
160         }
161         Ok(())
162     }
163 
push(&mut self, reg: Reg, size: OperandSize) -> Result<StackSlot>164     fn push(&mut self, reg: Reg, size: OperandSize) -> Result<StackSlot> {
165         let bytes = match (reg.class(), size) {
166             (RegClass::Int, OperandSize::S64) => {
167                 let word_bytes = <Self::ABI as ABI>::word_bytes() as u32;
168                 self.asm.push_r(reg);
169                 self.increment_sp(word_bytes);
170                 word_bytes
171             }
172             (RegClass::Int, OperandSize::S32) => {
173                 let bytes = size.bytes();
174                 self.reserve_stack(bytes)?;
175                 let sp_offset = SPOffset::from_u32(self.sp_offset);
176                 self.asm
177                     .mov_rm(reg, &self.address_from_sp(sp_offset)?, size, TRUSTED_FLAGS);
178                 bytes
179             }
180             (RegClass::Float, _) => {
181                 let bytes = size.bytes();
182                 self.reserve_stack(bytes)?;
183                 let sp_offset = SPOffset::from_u32(self.sp_offset);
184                 self.asm
185                     .xmm_mov_rm(reg, &self.address_from_sp(sp_offset)?, size, TRUSTED_FLAGS);
186                 bytes
187             }
188             _ => unreachable!(),
189         };
190 
191         Ok(StackSlot {
192             offset: SPOffset::from_u32(self.sp_offset),
193             size: bytes,
194         })
195     }
196 
reserve_stack(&mut self, bytes: u32) -> Result<()>197     fn reserve_stack(&mut self, bytes: u32) -> Result<()> {
198         if bytes == 0 {
199             return Ok(());
200         }
201 
202         self.asm
203             .sub_ir(bytes as i32, writable!(rsp()), OperandSize::S64);
204         self.increment_sp(bytes);
205 
206         Ok(())
207     }
208 
free_stack(&mut self, bytes: u32) -> Result<()>209     fn free_stack(&mut self, bytes: u32) -> Result<()> {
210         if bytes == 0 {
211             return Ok(());
212         }
213         self.asm
214             .add_ir(bytes as i32, writable!(rsp()), OperandSize::S64);
215         self.decrement_sp(bytes);
216 
217         Ok(())
218     }
219 
reset_stack_pointer(&mut self, offset: SPOffset) -> Result<()>220     fn reset_stack_pointer(&mut self, offset: SPOffset) -> Result<()> {
221         self.sp_offset = offset.as_u32();
222 
223         Ok(())
224     }
225 
local_address(&mut self, local: &LocalSlot) -> Result<Address>226     fn local_address(&mut self, local: &LocalSlot) -> Result<Address> {
227         let (reg, offset) = if local.addressed_from_sp() {
228             let offset = self
229                 .sp_offset
230                 .checked_sub(local.offset)
231                 .ok_or_else(|| CodeGenError::invalid_local_offset())?;
232             (rsp(), offset)
233         } else {
234             (rbp(), local.offset)
235         };
236 
237         Ok(Address::offset(reg, offset))
238     }
239 
address_from_sp(&self, offset: SPOffset) -> Result<Self::Address>240     fn address_from_sp(&self, offset: SPOffset) -> Result<Self::Address> {
241         Ok(Address::offset(
242             regs::rsp(),
243             self.sp_offset - offset.as_u32(),
244         ))
245     }
246 
address_at_sp(&self, offset: SPOffset) -> Result<Self::Address>247     fn address_at_sp(&self, offset: SPOffset) -> Result<Self::Address> {
248         Ok(Address::offset(regs::rsp(), offset.as_u32()))
249     }
250 
address_at_vmctx(&self, offset: u32) -> Result<Self::Address>251     fn address_at_vmctx(&self, offset: u32) -> Result<Self::Address> {
252         Ok(Address::offset(vmctx!(Self), offset))
253     }
254 
store_ptr(&mut self, src: Reg, dst: Self::Address) -> Result<()>255     fn store_ptr(&mut self, src: Reg, dst: Self::Address) -> Result<()> {
256         self.store(src.into(), dst, self.ptr_size)
257     }
258 
store(&mut self, src: RegImm, dst: Address, size: OperandSize) -> Result<()>259     fn store(&mut self, src: RegImm, dst: Address, size: OperandSize) -> Result<()> {
260         self.store_impl(src, dst, size, TRUSTED_FLAGS)
261     }
262 
wasm_store(&mut self, src: Reg, dst: Self::Address, kind: StoreKind) -> Result<()>263     fn wasm_store(&mut self, src: Reg, dst: Self::Address, kind: StoreKind) -> Result<()> {
264         match kind {
265             StoreKind::Operand(size) => {
266                 self.store_impl(src.into(), dst, size, UNTRUSTED_FLAGS)?;
267             }
268             StoreKind::Atomic(size) => {
269                 if size == OperandSize::S128 {
270                     // TODO: we don't support 128-bit atomic store yet.
271                     bail!(CodeGenError::unexpected_operand_size());
272                 }
273                 // To stay consistent with cranelift, we emit a normal store followed by a mfence,
274                 // although, we could probably just emit a xchg.
275                 self.store_impl(src.into(), dst, size, UNTRUSTED_FLAGS)?;
276                 self.asm.mfence();
277             }
278             StoreKind::VectorLane(LaneSelector { lane, size }) => {
279                 self.ensure_has_avx()?;
280                 self.asm
281                     .xmm_vpextr_rm(&dst, src, lane, size, UNTRUSTED_FLAGS);
282             }
283         }
284 
285         Ok(())
286     }
287 
pop(&mut self, dst: WritableReg, size: OperandSize) -> Result<()>288     fn pop(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> {
289         let current_sp = SPOffset::from_u32(self.sp_offset);
290         let _ = match (dst.to_reg().class(), size) {
291             (RegClass::Int, OperandSize::S32) => {
292                 let addr = self.address_from_sp(current_sp)?;
293                 self.asm.movzx_mr(
294                     &addr,
295                     dst,
296                     size.extend_to::<Zero>(OperandSize::S64),
297                     TRUSTED_FLAGS,
298                 );
299                 self.free_stack(size.bytes())?;
300             }
301             (RegClass::Int, OperandSize::S64) => {
302                 self.asm.pop_r(dst);
303                 self.decrement_sp(<Self::ABI as ABI>::word_bytes() as u32);
304             }
305             (RegClass::Float, _) | (RegClass::Vector, _) => {
306                 let addr = self.address_from_sp(current_sp)?;
307                 self.asm.xmm_mov_mr(&addr, dst, size, TRUSTED_FLAGS);
308                 self.free_stack(size.bytes())?;
309             }
310             _ => bail!(CodeGenError::invalid_operand_combination()),
311         };
312         Ok(())
313     }
314 
with_scratch<T: ScratchType, R>(&mut self, f: impl FnOnce(&mut Self, Scratch) -> R) -> R315     fn with_scratch<T: ScratchType, R>(&mut self, f: impl FnOnce(&mut Self, Scratch) -> R) -> R {
316         let r = self
317             .scratch_scope
318             .reg_for_class(T::reg_class(), &mut |_| Ok(()))
319             .expect("Scratch register to be available");
320 
321         let ret = f(self, Scratch::new(r));
322         self.scratch_scope.free(r);
323         ret
324     }
325 
call( &mut self, stack_args_size: u32, mut load_callee: impl FnMut(&mut Self) -> Result<(CalleeKind, CallingConvention)>, ) -> Result<u32>326     fn call(
327         &mut self,
328         stack_args_size: u32,
329         mut load_callee: impl FnMut(&mut Self) -> Result<(CalleeKind, CallingConvention)>,
330     ) -> Result<u32> {
331         let alignment: u32 = <Self::ABI as abi::ABI>::call_stack_align().into();
332         let addend: u32 = <Self::ABI as abi::ABI>::initial_frame_size().into();
333         let delta = calculate_frame_adjustment(self.sp_offset()?.as_u32(), addend, alignment);
334         let aligned_args_size = align_to(stack_args_size, alignment);
335         let total_stack = delta + aligned_args_size;
336         self.reserve_stack(total_stack)?;
337         let (callee, cc) = load_callee(self)?;
338         match callee {
339             CalleeKind::Indirect(reg) => self.asm.call_with_reg(cc, reg),
340             CalleeKind::Direct(idx) => self.asm.call_with_name(cc, idx),
341         };
342         Ok(total_stack)
343     }
344 
load_ptr(&mut self, src: Self::Address, dst: WritableReg) -> Result<()>345     fn load_ptr(&mut self, src: Self::Address, dst: WritableReg) -> Result<()> {
346         self.load(src, dst, self.ptr_size)
347     }
348 
compute_addr( &mut self, src: Self::Address, dst: WritableReg, size: OperandSize, ) -> Result<()>349     fn compute_addr(
350         &mut self,
351         src: Self::Address,
352         dst: WritableReg,
353         size: OperandSize,
354     ) -> Result<()> {
355         self.asm.lea(&src, dst, size);
356         Ok(())
357     }
358 
load(&mut self, src: Address, dst: WritableReg, size: OperandSize) -> Result<()>359     fn load(&mut self, src: Address, dst: WritableReg, size: OperandSize) -> Result<()> {
360         self.load_impl(src, dst, size, TRUSTED_FLAGS)
361     }
362 
wasm_load(&mut self, src: Self::Address, dst: WritableReg, kind: LoadKind) -> Result<()>363     fn wasm_load(&mut self, src: Self::Address, dst: WritableReg, kind: LoadKind) -> Result<()> {
364         let size = kind.derive_operand_size();
365 
366         match kind {
367             LoadKind::ScalarExtend(ext) => match ext {
368                 ExtendKind::Signed(ext) => {
369                     self.asm.movsx_mr(&src, dst, ext, UNTRUSTED_FLAGS);
370                 }
371                 ExtendKind::Unsigned(_) => self.load_impl(src, dst, size, UNTRUSTED_FLAGS)?,
372             },
373             LoadKind::Operand(_) | LoadKind::Atomic(_, _) => {
374                 // The guarantees of the x86-64 memory model ensure that `SeqCst`
375                 // loads are equivalent to normal loads.
376                 if kind.is_atomic() && size == OperandSize::S128 {
377                     bail!(CodeGenError::unexpected_operand_size());
378                 }
379 
380                 self.load_impl(src, dst, size, UNTRUSTED_FLAGS)?;
381             }
382             LoadKind::VectorExtend(ext) => {
383                 self.ensure_has_avx()?;
384                 self.asm
385                     .xmm_vpmov_mr(&src, dst, ext.into(), UNTRUSTED_FLAGS)
386             }
387             LoadKind::Splat(_) => {
388                 self.ensure_has_avx()?;
389 
390                 if size == OperandSize::S64 {
391                     self.asm
392                         .xmm_mov_mr(&src, dst, OperandSize::S64, UNTRUSTED_FLAGS);
393                     self.asm.xmm_vpshuf_rr(
394                         dst.to_reg(),
395                         dst,
396                         Self::vpshuf_mask_for_64_bit_splats(),
397                         OperandSize::S32,
398                     );
399                 } else {
400                     self.asm
401                         .xmm_vpbroadcast_mr(&src, dst, size, UNTRUSTED_FLAGS);
402                 }
403             }
404             LoadKind::VectorLane(LaneSelector { lane, size }) => {
405                 self.ensure_has_avx()?;
406                 self.with_scratch::<IntScratch, _>(|masm, byte_tmp| {
407                     masm.load_impl(src, byte_tmp.writable(), size, UNTRUSTED_FLAGS)?;
408                     masm.asm
409                         .xmm_vpinsr_rrr(dst, dst.to_reg(), byte_tmp.inner(), lane, size);
410                     wasmtime_environ::error::Ok(())
411                 })?;
412             }
413             LoadKind::VectorZero(size) => {
414                 self.ensure_has_avx()?;
415                 self.with_scratch::<IntScratch, _>(|masm, scratch| {
416                     masm.load_impl(src, scratch.writable(), size, UNTRUSTED_FLAGS)?;
417                     masm.asm.avx_gpr_to_xmm(scratch.inner(), dst, size);
418                     wasmtime_environ::error::Ok(())
419                 })?;
420             }
421         }
422 
423         Ok(())
424     }
425 
sp_offset(&self) -> Result<SPOffset>426     fn sp_offset(&self) -> Result<SPOffset> {
427         Ok(SPOffset::from_u32(self.sp_offset))
428     }
429 
zero(&mut self, reg: WritableReg) -> Result<()>430     fn zero(&mut self, reg: WritableReg) -> Result<()> {
431         self.asm.xor_rr(
432             reg.to_reg(),
433             reg,
434             OperandSize::from_bytes(<Self::ABI>::word_bytes()),
435         );
436         Ok(())
437     }
438 
mov(&mut self, dst: WritableReg, src: RegImm, size: OperandSize) -> Result<()>439     fn mov(&mut self, dst: WritableReg, src: RegImm, size: OperandSize) -> Result<()> {
440         match (src, dst.to_reg()) {
441             (RegImm::Reg(src), dst_reg) => match (src.class(), dst_reg.class()) {
442                 (RegClass::Int, RegClass::Int) => Ok(self.asm.mov_rr(src, dst, size)),
443                 (RegClass::Float, RegClass::Float) => Ok(self.asm.xmm_mov_rr(src, dst, size)),
444                 _ => bail!(CodeGenError::invalid_operand_combination()),
445             },
446             (RegImm::Imm(imm), _) => self.load_constant(&imm, dst, size),
447         }
448     }
449 
cmov( &mut self, dst: WritableReg, src: Reg, cc: IntCmpKind, size: OperandSize, ) -> Result<()>450     fn cmov(
451         &mut self,
452         dst: WritableReg,
453         src: Reg,
454         cc: IntCmpKind,
455         size: OperandSize,
456     ) -> Result<()> {
457         match (src.class(), dst.to_reg().class()) {
458             (RegClass::Int, RegClass::Int) => Ok(self.asm.cmov(src, dst, cc, size)),
459             (RegClass::Float, RegClass::Float) => Ok(self.asm.xmm_cmov(src, dst, cc, size)),
460             _ => Err(format_err!(CodeGenError::invalid_operand_combination())),
461         }
462     }
463 
add(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()>464     fn add(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
465         Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
466         match (rhs, dst) {
467             (RegImm::Imm(imm), _) => {
468                 if let Some(v) = imm.to_i32() {
469                     self.asm.add_ir(v, dst, size);
470                 } else {
471                     self.with_scratch::<IntScratch, _>(|masm, scratch| {
472                         masm.load_constant(&imm, scratch.writable(), size)?;
473                         masm.asm.add_rr(scratch.inner(), dst, size);
474                         wasmtime_environ::error::Ok(())
475                     })?;
476                 }
477             }
478 
479             (RegImm::Reg(src), dst) => {
480                 self.asm.add_rr(src, dst, size);
481             }
482         }
483 
484         Ok(())
485     }
486 
add_uextend( &mut self, dst: WritableReg, lhs: Reg, rhs: Reg, from_size: OperandSize, size: OperandSize, ) -> Result<()>487     fn add_uextend(
488         &mut self,
489         dst: WritableReg,
490         lhs: Reg,
491         rhs: Reg,
492         from_size: OperandSize,
493         size: OperandSize,
494     ) -> Result<()> {
495         assert!(size == OperandSize::S64);
496         assert!(from_size == OperandSize::S32 || from_size == OperandSize::S64);
497 
498         Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
499         if from_size == OperandSize::S32 && size == OperandSize::S64 {
500             self.extend(
501                 writable!(rhs),
502                 rhs,
503                 ExtendKind::Unsigned(Extend::I64Extend32),
504             )?;
505         }
506 
507         self.asm.add_rr(rhs, dst, size);
508 
509         Ok(())
510     }
511 
checked_uadd( &mut self, dst: WritableReg, lhs: Reg, rhs: Imm, size: OperandSize, trap: TrapCode, ) -> Result<()>512     fn checked_uadd(
513         &mut self,
514         dst: WritableReg,
515         lhs: Reg,
516         rhs: Imm,
517         size: OperandSize,
518         trap: TrapCode,
519     ) -> Result<()> {
520         self.add(dst, lhs, RegImm::Imm(rhs), size)?;
521         self.asm.trapif(CC::B, trap);
522         Ok(())
523     }
524 
sub(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()>525     fn sub(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
526         Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
527         match (rhs, dst) {
528             (RegImm::Imm(imm), reg) => {
529                 if let Some(v) = imm.to_i32() {
530                     self.asm.sub_ir(v, reg, size);
531                 } else {
532                     self.with_scratch::<IntScratch, _>(|masm, scratch| {
533                         masm.load_constant(&imm, scratch.writable(), size)?;
534                         masm.asm.sub_rr(scratch.inner(), reg, size);
535                         wasmtime_environ::error::Ok(())
536                     })?;
537                 }
538             }
539 
540             (RegImm::Reg(src), dst) => {
541                 self.asm.sub_rr(src, dst, size);
542             }
543         }
544 
545         Ok(())
546     }
547 
mul(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()>548     fn mul(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
549         Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
550         match (rhs, dst) {
551             (RegImm::Imm(imm), _) => {
552                 if let Some(v) = imm.to_i32() {
553                     self.asm.mul_ir(v, dst, size);
554                 } else {
555                     self.with_scratch::<IntScratch, _>(|masm, scratch| {
556                         masm.load_constant(&imm, scratch.writable(), size)?;
557                         masm.asm.mul_rr(scratch.inner(), dst, size);
558                         wasmtime_environ::error::Ok(())
559                     })?;
560                 }
561             }
562 
563             (RegImm::Reg(src), dst) => {
564                 self.asm.mul_rr(src, dst, size);
565             }
566         }
567 
568         Ok(())
569     }
570 
float_add(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()>571     fn float_add(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
572         Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
573         self.asm.xmm_add_rr(rhs, dst, size);
574         Ok(())
575     }
576 
float_sub(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()>577     fn float_sub(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
578         Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
579         self.asm.xmm_sub_rr(rhs, dst, size);
580         Ok(())
581     }
582 
float_mul(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()>583     fn float_mul(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
584         Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
585         self.asm.xmm_mul_rr(rhs, dst, size);
586         Ok(())
587     }
588 
float_div(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()>589     fn float_div(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
590         Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
591         self.asm.xmm_div_rr(rhs, dst, size);
592         Ok(())
593     }
594 
float_min(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()>595     fn float_min(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
596         Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
597         self.asm.xmm_min_seq(rhs, dst, size);
598         Ok(())
599     }
600 
float_max(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()>601     fn float_max(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
602         Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
603         self.asm.xmm_max_seq(rhs, dst, size);
604         Ok(())
605     }
606 
float_copysign( &mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize, ) -> Result<()>607     fn float_copysign(
608         &mut self,
609         dst: WritableReg,
610         lhs: Reg,
611         rhs: Reg,
612         size: OperandSize,
613     ) -> Result<()> {
614         Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
615         let sign_mask = match size {
616             OperandSize::S32 => I::I32(0x80000000),
617             OperandSize::S64 => I::I64(0x8000000000000000),
618             OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => {
619                 bail!(CodeGenError::unexpected_operand_size())
620             }
621         };
622 
623         self.with_scratch::<IntScratch, _>(|masm, scratch_gpr| {
624             masm.with_scratch::<FloatScratch, _>(|masm, scratch_xmm| {
625                 masm.load_constant(&sign_mask, scratch_gpr.writable(), size)?;
626                 masm.asm
627                     .gpr_to_xmm(scratch_gpr.inner(), scratch_xmm.writable(), size);
628 
629                 // Clear everything except sign bit in src.
630                 masm.asm
631                     .xmm_and_rr(scratch_xmm.inner(), writable!(rhs), size);
632 
633                 // Clear sign bit in dst using scratch to store result. Then copy the
634                 // result back to dst.
635                 masm.asm
636                     .xmm_andn_rr(dst.to_reg(), scratch_xmm.writable(), size);
637                 masm.asm.xmm_mov_rr(scratch_xmm.inner(), dst, size);
638 
639                 // Copy sign bit from src to dst.
640                 masm.asm.xmm_or_rr(rhs, dst, size);
641                 Ok(())
642             })
643         })
644     }
645 
float_neg(&mut self, dst: WritableReg, size: OperandSize) -> Result<()>646     fn float_neg(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> {
647         debug_assert_eq!(dst.to_reg().class(), RegClass::Float);
648         let mask = match size {
649             OperandSize::S32 => I::I32(0x80000000),
650             OperandSize::S64 => I::I64(0x8000000000000000),
651             OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => {
652                 bail!(CodeGenError::unexpected_operand_size())
653             }
654         };
655         self.with_scratch::<IntScratch, _>(|masm, scratch_gpr| {
656             masm.with_scratch::<FloatScratch, _>(|masm, scratch_xmm| {
657                 masm.load_constant(&mask, scratch_gpr.writable(), size)?;
658                 masm.asm
659                     .gpr_to_xmm(scratch_gpr.inner(), scratch_xmm.writable(), size);
660                 masm.asm.xmm_xor_rr(scratch_xmm.inner(), dst, size);
661                 Ok(())
662             })
663         })
664     }
665 
float_abs(&mut self, dst: WritableReg, size: OperandSize) -> Result<()>666     fn float_abs(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> {
667         debug_assert_eq!(dst.to_reg().class(), RegClass::Float);
668         let mask = match size {
669             OperandSize::S32 => I::I32(0x7fffffff),
670             OperandSize::S64 => I::I64(0x7fffffffffffffff),
671             OperandSize::S128 | OperandSize::S16 | OperandSize::S8 => {
672                 bail!(CodeGenError::unexpected_operand_size())
673             }
674         };
675 
676         self.with_scratch::<IntScratch, _>(|masm, scratch_gpr| {
677             masm.with_scratch::<FloatScratch, _>(|masm, scratch_xmm| {
678                 masm.load_constant(&mask, scratch_gpr.writable(), size)?;
679 
680                 masm.asm
681                     .gpr_to_xmm(scratch_gpr.inner(), scratch_xmm.writable(), size);
682                 masm.asm.xmm_and_rr(scratch_xmm.inner(), dst, size);
683                 Ok(())
684             })
685         })
686     }
687 
float_round< F: FnMut(&mut FuncEnv<Self::Ptr>, &mut CodeGenContext<Emission>, &mut Self) -> Result<()>, >( &mut self, mode: RoundingMode, env: &mut FuncEnv<Self::Ptr>, context: &mut CodeGenContext<Emission>, size: OperandSize, mut fallback: F, ) -> Result<()>688     fn float_round<
689         F: FnMut(&mut FuncEnv<Self::Ptr>, &mut CodeGenContext<Emission>, &mut Self) -> Result<()>,
690     >(
691         &mut self,
692         mode: RoundingMode,
693         env: &mut FuncEnv<Self::Ptr>,
694         context: &mut CodeGenContext<Emission>,
695         size: OperandSize,
696         mut fallback: F,
697     ) -> Result<()> {
698         if self.flags.has_sse41() {
699             let src = context.pop_to_reg(self, None)?;
700             self.asm
701                 .xmm_rounds_rr(src.into(), writable!(src.into()), mode, size);
702             context.stack.push(src.into());
703             Ok(())
704         } else {
705             fallback(env, context, self)
706         }
707     }
708 
float_sqrt(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()>709     fn float_sqrt(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {
710         self.asm.sqrt(src, dst, size);
711         Ok(())
712     }
713 
and(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()>714     fn and(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
715         Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
716         match (rhs, dst) {
717             (RegImm::Imm(imm), _) => {
718                 if let Some(v) = imm.to_i32() {
719                     self.asm.and_ir(v, dst, size);
720                 } else {
721                     self.with_scratch::<IntScratch, _>(|masm, scratch| {
722                         masm.load_constant(&imm, scratch.writable(), size)?;
723                         masm.asm.and_rr(scratch.inner(), dst, size);
724                         wasmtime_environ::error::Ok(())
725                     })?;
726                 }
727             }
728 
729             (RegImm::Reg(src), dst) => {
730                 self.asm.and_rr(src, dst, size);
731             }
732         }
733 
734         Ok(())
735     }
736 
or(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()>737     fn or(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
738         Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
739         match (rhs, dst) {
740             (RegImm::Imm(imm), _) => {
741                 if let Some(v) = imm.to_i32() {
742                     self.asm.or_ir(v, dst, size);
743                 } else {
744                     self.with_scratch::<IntScratch, _>(|masm, scratch| {
745                         masm.load_constant(&imm, scratch.writable(), size)?;
746                         masm.asm.or_rr(scratch.inner(), dst, size);
747                         wasmtime_environ::error::Ok(())
748                     })?;
749                 }
750             }
751 
752             (RegImm::Reg(src), dst) => {
753                 self.asm.or_rr(src, dst, size);
754             }
755         }
756 
757         Ok(())
758     }
759 
xor(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()>760     fn xor(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
761         Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
762         match (rhs, dst) {
763             (RegImm::Imm(imm), _) => {
764                 if let Some(v) = imm.to_i32() {
765                     self.asm.xor_ir(v, dst, size);
766                 } else {
767                     self.with_scratch::<IntScratch, _>(|masm, scratch| {
768                         masm.load_constant(&imm, scratch.writable(), size)?;
769                         masm.asm.xor_rr(scratch.inner(), dst, size);
770                         wasmtime_environ::error::Ok(())
771                     })?;
772                 }
773             }
774 
775             (RegImm::Reg(src), _) => {
776                 self.asm.xor_rr(src, dst, size);
777             }
778         }
779 
780         Ok(())
781     }
782 
shift_ir( &mut self, dst: WritableReg, imm: I, lhs: Reg, kind: ShiftKind, size: OperandSize, ) -> Result<()>783     fn shift_ir(
784         &mut self,
785         dst: WritableReg,
786         imm: I,
787         lhs: Reg,
788         kind: ShiftKind,
789         size: OperandSize,
790     ) -> Result<()> {
791         Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
792         self.asm
793             .shift_ir(imm.unwrap_as_u64() as u8, dst, kind, size);
794         Ok(())
795     }
796 
shift( &mut self, context: &mut CodeGenContext<Emission>, kind: ShiftKind, size: OperandSize, ) -> Result<()>797     fn shift(
798         &mut self,
799         context: &mut CodeGenContext<Emission>,
800         kind: ShiftKind,
801         size: OperandSize,
802     ) -> Result<()> {
803         // Number of bits to shift must be in the CL register.
804         let src = context.pop_to_reg(self, Some(regs::rcx()))?;
805         let dst = context.pop_to_reg(self, None)?;
806 
807         self.asm
808             .shift_rr(src.into(), writable!(dst.into()), kind, size);
809 
810         context.free_reg(src);
811         context.stack.push(dst.into());
812 
813         Ok(())
814     }
815 
div( &mut self, context: &mut CodeGenContext<Emission>, kind: DivKind, size: OperandSize, ) -> Result<()>816     fn div(
817         &mut self,
818         context: &mut CodeGenContext<Emission>,
819         kind: DivKind,
820         size: OperandSize,
821     ) -> Result<()> {
822         // Allocate rdx:rax.
823         let rdx = context.reg(regs::rdx(), self)?;
824         let rax = context.reg(regs::rax(), self)?;
825 
826         // Allocate the divisor, which can be any gpr.
827         let divisor = context.pop_to_reg(self, None)?;
828 
829         // Mark rax as allocatable.
830         context.free_reg(rax);
831         // Move the top value to rax.
832         let rax = context.pop_to_reg(self, Some(rax))?;
833         self.asm.div(divisor.into(), (rax.into(), rdx), kind, size);
834 
835         // Free the divisor and rdx.
836         context.free_reg(divisor);
837         context.free_reg(rdx);
838 
839         // Push the quotient.
840         context.stack.push(rax.into());
841         Ok(())
842     }
843 
rem( &mut self, context: &mut CodeGenContext<Emission>, kind: RemKind, size: OperandSize, ) -> Result<()>844     fn rem(
845         &mut self,
846         context: &mut CodeGenContext<Emission>,
847         kind: RemKind,
848         size: OperandSize,
849     ) -> Result<()> {
850         // Allocate rdx:rax.
851         let rdx = context.reg(regs::rdx(), self)?;
852         let rax = context.reg(regs::rax(), self)?;
853 
854         // Allocate the divisor, which can be any gpr.
855         let divisor = context.pop_to_reg(self, None)?;
856 
857         // Mark rax as allocatable.
858         context.free_reg(rax);
859         // Move the top value to rax.
860         let rax = context.pop_to_reg(self, Some(rax))?;
861         self.asm.rem(divisor.reg, (rax.into(), rdx), kind, size);
862 
863         // Free the divisor and rax.
864         context.free_reg(divisor);
865         context.free_reg(rax);
866 
867         // Push the remainder.
868         context.stack.push(Val::reg(rdx, divisor.ty));
869 
870         Ok(())
871     }
872 
frame_restore(&mut self) -> Result<()>873     fn frame_restore(&mut self) -> Result<()> {
874         debug_assert_eq!(self.sp_offset, 0);
875         self.asm.pop_r(writable!(rbp()));
876         self.asm.ret();
877         Ok(())
878     }
879 
finalize(mut self, base: Option<SourceLoc>) -> Result<MachBufferFinalized<Final>>880     fn finalize(mut self, base: Option<SourceLoc>) -> Result<MachBufferFinalized<Final>> {
881         if let Some(patch) = self.stack_max_use_add {
882             patch.finalize(i32::try_from(self.sp_max).unwrap(), self.asm.buffer_mut());
883         }
884 
885         Ok(self.asm.finalize(base))
886     }
887 
address_at_reg(&self, reg: Reg, offset: u32) -> Result<Self::Address>888     fn address_at_reg(&self, reg: Reg, offset: u32) -> Result<Self::Address> {
889         Ok(Address::offset(reg, offset))
890     }
891 
cmp(&mut self, src1: Reg, src2: RegImm, size: OperandSize) -> Result<()>892     fn cmp(&mut self, src1: Reg, src2: RegImm, size: OperandSize) -> Result<()> {
893         match src2 {
894             RegImm::Imm(imm) => {
895                 if let Some(v) = imm.to_i32() {
896                     self.asm.cmp_ir(src1, v, size);
897                 } else {
898                     self.with_scratch::<IntScratch, _>(|masm, scratch| {
899                         masm.load_constant(&imm, scratch.writable(), size)?;
900                         masm.asm.cmp_rr(src1, scratch.inner(), size);
901                         wasmtime_environ::error::Ok(())
902                     })?;
903                 }
904             }
905             RegImm::Reg(src2) => {
906                 self.asm.cmp_rr(src1, src2, size);
907             }
908         }
909 
910         Ok(())
911     }
912 
cmp_with_set( &mut self, dst: WritableReg, src: RegImm, kind: IntCmpKind, size: OperandSize, ) -> Result<()>913     fn cmp_with_set(
914         &mut self,
915         dst: WritableReg,
916         src: RegImm,
917         kind: IntCmpKind,
918         size: OperandSize,
919     ) -> Result<()> {
920         self.cmp(dst.to_reg(), src, size)?;
921         self.asm.setcc(kind, dst);
922         Ok(())
923     }
924 
float_cmp_with_set( &mut self, dst: WritableReg, src1: Reg, src2: Reg, kind: FloatCmpKind, size: OperandSize, ) -> Result<()>925     fn float_cmp_with_set(
926         &mut self,
927         dst: WritableReg,
928         src1: Reg,
929         src2: Reg,
930         kind: FloatCmpKind,
931         size: OperandSize,
932     ) -> Result<()> {
933         // Float comparisons needs to be ordered (that is, comparing with a NaN
934         // should return 0) except for not equal which needs to be unordered.
935         // We use ucomis{s, d} because comis{s, d} has an undefined result if
936         // either operand is NaN. Since ucomis{s, d} is unordered, we need to
937         // compensate to make the comparison ordered.  Ucomis{s, d} sets the
938         // ZF, PF, and CF flags if there is an unordered result.
939         let (src1, src2, set_kind) = match kind {
940             FloatCmpKind::Eq => (src1, src2, IntCmpKind::Eq),
941             FloatCmpKind::Ne => (src1, src2, IntCmpKind::Ne),
942             FloatCmpKind::Gt => (src1, src2, IntCmpKind::GtU),
943             FloatCmpKind::Ge => (src1, src2, IntCmpKind::GeU),
944             // Reversing the operands and using the complementary comparison
945             // avoids needing to perform an additional SETNP and AND
946             // instruction.
947             // SETNB and SETNBE check if the carry flag is unset (i.e., not
948             // less than and not unordered) so we get the intended result
949             // without having to look at the parity flag.
950             FloatCmpKind::Lt => (src2, src1, IntCmpKind::GtU),
951             FloatCmpKind::Le => (src2, src1, IntCmpKind::GeU),
952         };
953         self.asm.ucomis(src1, src2, size);
954         self.asm.setcc(set_kind, dst);
955         let _ = match kind {
956             FloatCmpKind::Eq | FloatCmpKind::Gt | FloatCmpKind::Ge => {
957                 // Return false if either operand is NaN by ensuring PF is
958                 // unset.
959                 self.with_scratch::<IntScratch, _>(|masm, scratch| {
960                     masm.asm.setnp(scratch.writable());
961                     masm.asm.and_rr(scratch.inner(), dst, size);
962                 });
963             }
964             FloatCmpKind::Ne => {
965                 // Return true if either operand is NaN by checking if PF is
966                 // set.
967                 self.with_scratch::<IntScratch, _>(|masm, scratch| {
968                     masm.asm.setp(scratch.writable());
969                     masm.asm.or_rr(scratch.inner(), dst, size);
970                 });
971             }
972             FloatCmpKind::Lt | FloatCmpKind::Le => (),
973         };
974         Ok(())
975     }
976 
clz(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()>977     fn clz(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {
978         if self.flags.has_lzcnt() {
979             self.asm.lzcnt(src, dst, size);
980         } else {
981             self.with_scratch::<IntScratch, _>(|masm, scratch| {
982                 // Use the following approach:
983                 // dst = size.num_bits() - bsr(src) - is_not_zero
984                 //     = size.num.bits() + -bsr(src) - is_not_zero.
985                 masm.asm.bsr(src, dst, size);
986                 masm.asm.setcc(IntCmpKind::Ne, scratch.writable());
987                 masm.asm.neg(dst.to_reg(), dst, size);
988                 masm.asm.add_ir(size.num_bits() as i32, dst, size);
989                 masm.asm.sub_rr(scratch.inner(), dst, size);
990             });
991         }
992 
993         Ok(())
994     }
995 
ctz(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()>996     fn ctz(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {
997         if self.flags.has_bmi1() {
998             self.asm.tzcnt(src, dst, size);
999         } else {
1000             self.with_scratch::<IntScratch, _>(|masm, scratch| {
1001                 // Use the following approach:
1002                 // dst = bsf(src) + (is_zero * size.num_bits())
1003                 //     = bsf(src) + (is_zero << size.log2()).
1004                 // BSF outputs the correct value for every value except 0.
1005                 // When the value is 0, BSF outputs 0, correct output for ctz is
1006                 // the number of bits.
1007                 masm.asm.bsf(src, dst, size);
1008                 masm.asm.setcc(IntCmpKind::Eq, scratch.writable());
1009                 masm.asm
1010                     .shift_ir(size.log2(), scratch.writable(), ShiftKind::Shl, size);
1011                 masm.asm.add_rr(scratch.inner(), dst, size);
1012             });
1013         }
1014 
1015         Ok(())
1016     }
1017 
get_label(&mut self) -> Result<MachLabel>1018     fn get_label(&mut self) -> Result<MachLabel> {
1019         let buffer = self.asm.buffer_mut();
1020         Ok(buffer.get_label())
1021     }
1022 
bind(&mut self, label: MachLabel) -> Result<()>1023     fn bind(&mut self, label: MachLabel) -> Result<()> {
1024         let buffer = self.asm.buffer_mut();
1025         buffer.bind_label(label, &mut Default::default());
1026         Ok(())
1027     }
1028 
branch( &mut self, kind: IntCmpKind, lhs: Reg, rhs: RegImm, taken: MachLabel, size: OperandSize, ) -> Result<()>1029     fn branch(
1030         &mut self,
1031         kind: IntCmpKind,
1032         lhs: Reg,
1033         rhs: RegImm,
1034         taken: MachLabel,
1035         size: OperandSize,
1036     ) -> Result<()> {
1037         use IntCmpKind::*;
1038 
1039         match &(lhs, rhs) {
1040             (rlhs, RegImm::Reg(rrhs)) => {
1041                 // If the comparison kind is zero or not zero and both operands
1042                 // are the same register, emit a test instruction. Else we emit
1043                 // a normal comparison.
1044                 if (kind == Eq || kind == Ne) && (rlhs == rrhs) {
1045                     self.asm.test_rr(*rlhs, *rrhs, size);
1046                 } else {
1047                     self.cmp(lhs, rhs, size)?;
1048                 }
1049             }
1050             _ => self.cmp(lhs, rhs, size)?,
1051         }
1052         self.asm.jmp_if(kind, taken);
1053         Ok(())
1054     }
1055 
jmp(&mut self, target: MachLabel) -> Result<()>1056     fn jmp(&mut self, target: MachLabel) -> Result<()> {
1057         self.asm.jmp(target);
1058         Ok(())
1059     }
1060 
    /// Replaces the value at the top of the stack with its population count.
    ///
    /// When both the POPCNT and SSE4.2 flags are enabled, a single `popcnt`
    /// is emitted in place on the popped register. Otherwise a mask-and-shift
    /// fallback is emitted, which needs an extra gpr (released before
    /// returning) and the integer scratch register. The fallback only
    /// supports 32 and 64-bit operands; any other size bails with an
    /// unexpected operand size error.
    fn popcnt(&mut self, context: &mut CodeGenContext<Emission>, size: OperandSize) -> Result<()> {
        let src = context.pop_to_reg(self, None)?;
        if self.flags.has_popcnt() && self.flags.has_sse42() {
            self.asm.popcnt(src.into(), writable!(src.into()), size);
            context.stack.push(src.into());
            Ok(())
        } else {
            // The fallback functionality here is based on `MacroAssembler::popcnt64` in:
            // https://searchfox.org/mozilla-central/source/js/src/jit/x64/MacroAssembler-x64-inl.h#495

            // `dst` aliases the popped register; `tmp` holds intermediate
            // values and is released before returning.
            let tmp = writable!(context.any_gpr(self)?);
            let dst = writable!(src.into());
            let (masks, shift_amt) = match size {
                OperandSize::S64 => (
                    [
                        0x5555555555555555, // m1
                        0x3333333333333333, // m2
                        0x0f0f0f0f0f0f0f0f, // m4
                        0x0101010101010101, // h01
                    ],
                    56u8,
                ),
                // 32-bit popcount is the same, except the masks are half as
                // wide and we shift by 24 at the end rather than 56
                OperandSize::S32 => (
                    [0x55555555i64, 0x33333333i64, 0x0f0f0f0fi64, 0x01010101i64],
                    24u8,
                ),
                _ => bail!(CodeGenError::unexpected_operand_size()),
            };
            self.asm.mov_rr(src.into(), tmp, size);

            // x -= (x >> 1) & m1;
            self.asm.shift_ir(1u8, dst, ShiftKind::ShrU, size);
            let lhs = dst.to_reg();
            self.and(writable!(lhs), lhs, RegImm::i64(masks[0]), size)?;
            self.asm.sub_rr(dst.to_reg(), tmp, size);

            // x = (x & m2) + ((x >> 2) & m2);
            self.asm.mov_rr(tmp.to_reg(), dst, size);
            // Load `0x3333...` into the scratch reg once, allowing us to use
            // `and_rr` and avoid inadvertently loading it twice as with `and`

            self.with_scratch::<IntScratch, _>(|masm, scratch| {
                masm.load_constant(&I::i64(masks[1]), scratch.writable(), size)?;
                masm.asm.and_rr(scratch.inner(), dst, size);
                masm.asm.shift_ir(2u8, tmp, ShiftKind::ShrU, size);
                masm.asm.and_rr(scratch.inner(), tmp, size);
                wasmtime_environ::error::Ok(())
            })?;
            self.asm.add_rr(dst.to_reg(), tmp, size);

            // x = (x + (x >> 4)) & m4;
            self.asm.mov_rr(tmp.to_reg(), dst, size);
            self.asm.shift_ir(4u8, dst, ShiftKind::ShrU, size);
            self.asm.add_rr(tmp.to_reg(), dst, size);
            let lhs = dst.to_reg();
            self.and(writable!(lhs), lhs, RegImm::i64(masks[2]), size)?;

            // (x * h01) >> shift_amt
            let lhs = dst.to_reg();
            self.mul(writable!(lhs), lhs, RegImm::i64(masks[3]), size)?;
            self.asm.shift_ir(shift_amt, dst, ShiftKind::ShrU, size);

            // The result lives in the register the operand was popped into.
            context.stack.push(src.into());
            context.free_reg(tmp.to_reg());

            Ok(())
        }
    }
1131 
wrap(&mut self, dst: WritableReg, src: Reg) -> Result<()>1132     fn wrap(&mut self, dst: WritableReg, src: Reg) -> Result<()> {
1133         self.asm.mov_rr(src, dst, OperandSize::S32);
1134         Ok(())
1135     }
1136 
extend(&mut self, dst: WritableReg, src: Reg, kind: ExtendKind) -> Result<()>1137     fn extend(&mut self, dst: WritableReg, src: Reg, kind: ExtendKind) -> Result<()> {
1138         match kind {
1139             ExtendKind::Signed(ext) => {
1140                 self.asm.movsx_rr(src, dst, ext);
1141             }
1142             ExtendKind::Unsigned(ext) => {
1143                 self.asm.movzx_rr(src, dst, ext);
1144             }
1145         }
1146 
1147         Ok(())
1148     }
1149 
signed_truncate( &mut self, dst: WritableReg, src: Reg, src_size: OperandSize, dst_size: OperandSize, kind: TruncKind, ) -> Result<()>1150     fn signed_truncate(
1151         &mut self,
1152         dst: WritableReg,
1153         src: Reg,
1154         src_size: OperandSize,
1155         dst_size: OperandSize,
1156         kind: TruncKind,
1157     ) -> Result<()> {
1158         self.with_scratch::<IntScratch, _>(|masm, gpr_scratch| {
1159             masm.with_scratch::<FloatScratch, _>(|masm, xmm_scratch| {
1160                 masm.asm.cvt_float_to_sint_seq(
1161                     src,
1162                     dst,
1163                     gpr_scratch.inner(),
1164                     xmm_scratch.inner(),
1165                     src_size,
1166                     dst_size,
1167                     kind.is_checked(),
1168                 );
1169                 Ok(())
1170             })
1171         })
1172     }
1173 
unsigned_truncate( &mut self, ctx: &mut CodeGenContext<Emission>, src_size: OperandSize, dst_size: OperandSize, kind: TruncKind, ) -> Result<()>1174     fn unsigned_truncate(
1175         &mut self,
1176         ctx: &mut CodeGenContext<Emission>,
1177         src_size: OperandSize,
1178         dst_size: OperandSize,
1179         kind: TruncKind,
1180     ) -> Result<()> {
1181         let dst_ty = match dst_size {
1182             OperandSize::S32 => WasmValType::I32,
1183             OperandSize::S64 => WasmValType::I64,
1184             _ => bail!(CodeGenError::unexpected_operand_size()),
1185         };
1186 
1187         ctx.convert_op_with_tmp_reg(
1188             self,
1189             dst_ty,
1190             RegClass::Float,
1191             |masm, dst, src, tmp_fpr, dst_size| {
1192                 masm.with_scratch::<IntScratch, _>(|masm, gpr_scratch| {
1193                     masm.with_scratch::<FloatScratch, _>(|masm, xmm_scratch| {
1194                         masm.asm.cvt_float_to_uint_seq(
1195                             src,
1196                             writable!(dst),
1197                             gpr_scratch.inner(),
1198                             xmm_scratch.inner(),
1199                             tmp_fpr,
1200                             src_size,
1201                             dst_size,
1202                             kind.is_checked(),
1203                         );
1204                         Ok(())
1205                     })
1206                 })
1207             },
1208         )
1209     }
1210 
signed_convert( &mut self, dst: WritableReg, src: Reg, src_size: OperandSize, dst_size: OperandSize, ) -> Result<()>1211     fn signed_convert(
1212         &mut self,
1213         dst: WritableReg,
1214         src: Reg,
1215         src_size: OperandSize,
1216         dst_size: OperandSize,
1217     ) -> Result<()> {
1218         self.asm.cvt_sint_to_float(src, dst, src_size, dst_size);
1219         Ok(())
1220     }
1221 
unsigned_convert( &mut self, dst: WritableReg, src: Reg, tmp_gpr: Reg, src_size: OperandSize, dst_size: OperandSize, ) -> Result<()>1222     fn unsigned_convert(
1223         &mut self,
1224         dst: WritableReg,
1225         src: Reg,
1226         tmp_gpr: Reg,
1227         src_size: OperandSize,
1228         dst_size: OperandSize,
1229     ) -> Result<()> {
1230         // Need to convert unsigned uint32 to uint64 for conversion instruction sequence.
1231         if let OperandSize::S32 = src_size {
1232             self.extend(
1233                 writable!(src),
1234                 src,
1235                 ExtendKind::Unsigned(Extend::I64Extend32),
1236             )?;
1237         }
1238 
1239         self.with_scratch::<IntScratch, _>(|masm, scratch| {
1240             masm.asm
1241                 .cvt_uint64_to_float_seq(src, dst, scratch.inner(), tmp_gpr, dst_size);
1242             Ok(())
1243         })
1244     }
1245 
reinterpret_float_as_int( &mut self, dst: WritableReg, src: Reg, size: OperandSize, ) -> Result<()>1246     fn reinterpret_float_as_int(
1247         &mut self,
1248         dst: WritableReg,
1249         src: Reg,
1250         size: OperandSize,
1251     ) -> Result<()> {
1252         self.asm.xmm_to_gpr(src, dst, size);
1253         Ok(())
1254     }
1255 
reinterpret_int_as_float( &mut self, dst: WritableReg, src: Reg, size: OperandSize, ) -> Result<()>1256     fn reinterpret_int_as_float(
1257         &mut self,
1258         dst: WritableReg,
1259         src: Reg,
1260         size: OperandSize,
1261     ) -> Result<()> {
1262         self.asm.gpr_to_xmm(src, dst, size);
1263         Ok(())
1264     }
1265 
demote(&mut self, dst: WritableReg, src: Reg) -> Result<()>1266     fn demote(&mut self, dst: WritableReg, src: Reg) -> Result<()> {
1267         self.asm
1268             .cvt_float_to_float(src, dst, OperandSize::S64, OperandSize::S32);
1269         Ok(())
1270     }
1271 
promote(&mut self, dst: WritableReg, src: Reg) -> Result<()>1272     fn promote(&mut self, dst: WritableReg, src: Reg) -> Result<()> {
1273         self.asm
1274             .cvt_float_to_float(src, dst, OperandSize::S32, OperandSize::S64);
1275         Ok(())
1276     }
1277 
unreachable(&mut self) -> Result<()>1278     fn unreachable(&mut self) -> Result<()> {
1279         self.asm.trap(TRAP_UNREACHABLE);
1280         Ok(())
1281     }
1282 
trap(&mut self, code: TrapCode) -> Result<()>1283     fn trap(&mut self, code: TrapCode) -> Result<()> {
1284         self.asm.trap(code);
1285         Ok(())
1286     }
1287 
trapif(&mut self, cc: IntCmpKind, code: TrapCode) -> Result<()>1288     fn trapif(&mut self, cc: IntCmpKind, code: TrapCode) -> Result<()> {
1289         self.asm.trapif(cc, code);
1290         Ok(())
1291     }
1292 
trapz(&mut self, src: Reg, code: TrapCode) -> Result<()>1293     fn trapz(&mut self, src: Reg, code: TrapCode) -> Result<()> {
1294         self.asm.test_rr(src, src, self.ptr_size);
1295         self.asm.trapif(IntCmpKind::Eq, code);
1296         Ok(())
1297     }
1298 
jmp_table(&mut self, targets: &[MachLabel], index: Reg, tmp: Reg) -> Result<()>1299     fn jmp_table(&mut self, targets: &[MachLabel], index: Reg, tmp: Reg) -> Result<()> {
1300         // At least one default target.
1301         debug_assert!(targets.len() >= 1);
1302         let default_index = targets.len() - 1;
1303         // Emit bounds check, by conditionally moving the max cases
1304         // into the given index reg if the contents of the index reg
1305         // are greater.
1306         let max = default_index;
1307         let size = OperandSize::S32;
1308         self.asm.mov_ir(max as u64, writable!(tmp), size);
1309         self.asm.cmp_rr(tmp, index, size);
1310         self.asm.cmov(tmp, writable!(index), IntCmpKind::LtU, size);
1311 
1312         let default = targets[default_index];
1313         let rest = &targets[0..default_index];
1314 
1315         self.with_scratch::<IntScratch, _>(|masm, tmp1| {
1316             masm.asm
1317                 .jmp_table(rest.into(), default, index, tmp1.inner(), tmp);
1318             Ok(())
1319         })
1320     }
1321 
start_source_loc(&mut self, loc: RelSourceLoc) -> Result<(CodeOffset, RelSourceLoc)>1322     fn start_source_loc(&mut self, loc: RelSourceLoc) -> Result<(CodeOffset, RelSourceLoc)> {
1323         Ok(self.asm.buffer_mut().start_srcloc(loc))
1324     }
1325 
end_source_loc(&mut self) -> Result<()>1326     fn end_source_loc(&mut self) -> Result<()> {
1327         self.asm.buffer_mut().end_srcloc();
1328         Ok(())
1329     }
1330 
current_code_offset(&self) -> Result<CodeOffset>1331     fn current_code_offset(&self) -> Result<CodeOffset> {
1332         Ok(self.asm.buffer().cur_offset())
1333     }
1334 
add128( &mut self, dst_lo: WritableReg, dst_hi: WritableReg, lhs_lo: Reg, lhs_hi: Reg, rhs_lo: Reg, rhs_hi: Reg, ) -> Result<()>1335     fn add128(
1336         &mut self,
1337         dst_lo: WritableReg,
1338         dst_hi: WritableReg,
1339         lhs_lo: Reg,
1340         lhs_hi: Reg,
1341         rhs_lo: Reg,
1342         rhs_hi: Reg,
1343     ) -> Result<()> {
1344         Self::ensure_two_argument_form(&dst_lo.to_reg(), &lhs_lo)?;
1345         Self::ensure_two_argument_form(&dst_hi.to_reg(), &lhs_hi)?;
1346         self.asm.add_rr(rhs_lo, dst_lo, OperandSize::S64);
1347         self.asm.adc_rr(rhs_hi, dst_hi, OperandSize::S64);
1348         Ok(())
1349     }
1350 
sub128( &mut self, dst_lo: WritableReg, dst_hi: WritableReg, lhs_lo: Reg, lhs_hi: Reg, rhs_lo: Reg, rhs_hi: Reg, ) -> Result<()>1351     fn sub128(
1352         &mut self,
1353         dst_lo: WritableReg,
1354         dst_hi: WritableReg,
1355         lhs_lo: Reg,
1356         lhs_hi: Reg,
1357         rhs_lo: Reg,
1358         rhs_hi: Reg,
1359     ) -> Result<()> {
1360         Self::ensure_two_argument_form(&dst_lo.to_reg(), &lhs_lo)?;
1361         Self::ensure_two_argument_form(&dst_hi.to_reg(), &lhs_hi)?;
1362         self.asm.sub_rr(rhs_lo, dst_lo, OperandSize::S64);
1363         self.asm.sbb_rr(rhs_hi, dst_hi, OperandSize::S64);
1364         Ok(())
1365     }
1366 
mul_wide( &mut self, context: &mut CodeGenContext<Emission>, kind: MulWideKind, ) -> Result<()>1367     fn mul_wide(
1368         &mut self,
1369         context: &mut CodeGenContext<Emission>,
1370         kind: MulWideKind,
1371     ) -> Result<()> {
1372         // Reserve rax/rdx since they're required by the `mul_wide` instruction
1373         // being used here.
1374         let rax = context.reg(regs::rax(), self)?;
1375         let rdx = context.reg(regs::rdx(), self)?;
1376 
1377         // The rhs of this binop can be in any register
1378         let rhs = context.pop_to_reg(self, None)?;
1379         // Mark rax as allocatable. and then force the lhs operand to be placed
1380         // in `rax`.
1381         context.free_reg(rax);
1382         let lhs = context.pop_to_reg(self, Some(rax))?;
1383 
1384         self.asm.mul_wide(
1385             writable!(rax),
1386             writable!(rdx),
1387             lhs.reg,
1388             rhs.reg,
1389             kind,
1390             OperandSize::S64,
1391         );
1392 
1393         // No longer using the rhs register after the multiplication has been
1394         // executed.
1395         context.free_reg(rhs);
1396 
1397         // The low bits of the result are in rax, where `lhs` was allocated to
1398         context.stack.push(lhs.into());
1399         // The high bits of the result are in rdx, which we previously reserved.
1400         context.stack.push(Val::Reg(TypedReg::i64(rdx)));
1401 
1402         Ok(())
1403     }
1404 
    /// Pops a scalar operand and pushes a `V128` value produced by
    /// splatting it, according to the lane configuration in `size`.
    ///
    /// Float kinds reuse the popped register as both source and destination.
    /// Integer kinds allocate a float register as the destination; the
    /// source is either a constant taken directly from the stack, or the
    /// popped gpr moved into that destination register.
    ///
    /// 64-bit lane kinds go through `ensure_has_avx` and are emitted with
    /// `vpshuf`. All other kinds go through `ensure_has_avx2` and are
    /// emitted with `vpbroadcast`. Constant sources are registered through
    /// `add_constant` and read with the memory form of the instruction.
    fn splat(&mut self, context: &mut CodeGenContext<Emission>, size: SplatKind) -> Result<()> {
        // Get the source and destination operands set up first.
        let (src, dst) = match size {
            // Floats can use the same register for `src` and `dst`.
            SplatKind::F32x4 | SplatKind::F64x2 => {
                let reg = context.pop_to_reg(self, None)?.reg;
                (RegImm::reg(reg), writable!(reg))
            }
            // For ints, we need to load the operand into a vector register if
            // it's not a constant.
            SplatKind::I8x16 | SplatKind::I16x8 | SplatKind::I32x4 | SplatKind::I64x2 => {
                let dst = writable!(context.any_fpr(self)?);
                let src = if size == SplatKind::I64x2 {
                    context.pop_i64_const().map(RegImm::i64)
                } else {
                    context.pop_i32_const().map(RegImm::i32)
                }
                .map_or_else(
                    // Not a constant: move the popped gpr into `dst` and use
                    // `dst` itself as the source of the splat.
                    || -> Result<RegImm> {
                        let reg = context.pop_to_reg(self, None)?.reg;
                        self.reinterpret_int_as_float(
                            dst,
                            reg,
                            match size {
                                SplatKind::I8x16 | SplatKind::I16x8 | SplatKind::I32x4 => {
                                    OperandSize::S32
                                }
                                SplatKind::I64x2 => OperandSize::S64,
                                SplatKind::F32x4 | SplatKind::F64x2 => unreachable!(),
                            },
                        )?;
                        context.free_reg(reg);
                        Ok(RegImm::Reg(dst.to_reg()))
                    },
                    Ok,
                )?;
                (src, dst)
            }
        };

        // Perform the splat on the operands.
        if size == SplatKind::I64x2 || size == SplatKind::F64x2 {
            self.ensure_has_avx()?;
            let mask = Self::vpshuf_mask_for_64_bit_splats();
            match src {
                RegImm::Reg(src) => self.asm.xmm_vpshuf_rr(src, dst, mask, OperandSize::S32),
                RegImm::Imm(imm) => {
                    let src = self.asm.add_constant(&imm.to_bytes());
                    self.asm
                        .xmm_vpshuf_mr(&src, dst, mask, OperandSize::S32, MemFlags::trusted());
                }
            }
        } else {
            self.ensure_has_avx2()?;

            match src {
                RegImm::Reg(src) => self.asm.xmm_vpbroadcast_rr(src, dst, size.lane_size()),
                RegImm::Imm(imm) => {
                    let src = self.asm.add_constant(&imm.to_bytes());
                    self.asm
                        .xmm_vpbroadcast_mr(&src, dst, size.lane_size(), MemFlags::trusted());
                }
            }
        }

        context
            .stack
            .push(Val::reg(dst.to_reg(), WasmValType::V128));
        Ok(())
    }
1475 
shuffle(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, lanes: [u8; 16]) -> Result<()>1476     fn shuffle(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, lanes: [u8; 16]) -> Result<()> {
1477         self.ensure_has_avx()?;
1478 
1479         // Use `vpshufb` with `lanes` to set the lanes in `lhs` and `rhs`
1480         // separately to either the selected index or 0.
1481         // Then use `vpor` to combine `lhs` and `rhs` into `dst`.
1482         // Setting the most significant bit in the mask's lane to 1 will
1483         // result in corresponding lane in the destination register being
1484         // set to 0. 0x80 sets the most significant bit to 1.
1485         let mut mask_lhs: [u8; 16] = [0x80; 16];
1486         let mut mask_rhs: [u8; 16] = [0x80; 16];
1487         for i in 0..lanes.len() {
1488             if lanes[i] < 16 {
1489                 mask_lhs[i] = lanes[i];
1490             } else {
1491                 mask_rhs[i] = lanes[i] - 16;
1492             }
1493         }
1494         let mask_lhs = self.asm.add_constant(&mask_lhs);
1495         let mask_rhs = self.asm.add_constant(&mask_rhs);
1496 
1497         self.asm.xmm_vpshufb_rrm(dst, lhs, &mask_lhs);
1498         self.with_scratch::<FloatScratch, _>(|masm, scratch| {
1499             masm.asm.xmm_vpshufb_rrm(scratch.writable(), rhs, &mask_rhs);
1500             masm.asm.xmm_vpor_rrr(dst, dst.to_reg(), scratch.inner());
1501             Ok(())
1502         })
1503     }
1504 
    /// Emits a byte swizzle: `rhs` holds one byte index per lane and `dst`
    /// receives the bytes of `lhs` selected by those indices.
    ///
    /// Note that `rhs` is overwritten with the adjusted indices.
    fn swizzle(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg) -> Result<()> {
        self.ensure_has_avx()?;

        // Indices outside of [0, 15 (i.e., 0xF)] must select 0.
        // `vpshufb` writes 0 to a lane whenever the most significant bit of
        // its index byte is set. Adding 0x70 with unsigned saturation
        // (`vpaddus` on bytes) maps every index above 0xF to a value in
        // [0x80, 0xFF], i.e. with the most significant bit set, while
        // in-range indices end up in [0x70, 0x7F] with that bit clear.
        let clamp = self.asm.add_constant(&[0x70; 16]);
        self.asm
            .xmm_vpaddus_rrm(writable!(rhs), rhs, &clamp, OperandSize::S8);

        // Don't need to subtract 0x70 since `vpshufb` uses the least
        // significant 4 bits which are the same after adding 0x70.
        self.asm.xmm_vpshufb_rrr(dst, lhs, rhs);
        Ok(())
    }
1522 
atomic_rmw( &mut self, context: &mut CodeGenContext<Emission>, addr: Self::Address, size: OperandSize, op: RmwOp, flags: MemFlags, extend: Option<Extend<Zero>>, ) -> Result<()>1523     fn atomic_rmw(
1524         &mut self,
1525         context: &mut CodeGenContext<Emission>,
1526         addr: Self::Address,
1527         size: OperandSize,
1528         op: RmwOp,
1529         flags: MemFlags,
1530         extend: Option<Extend<Zero>>,
1531     ) -> Result<()> {
1532         let res = match op {
1533             RmwOp::Add => {
1534                 let operand = context.pop_to_reg(self, None)?;
1535                 self.asm
1536                     .lock_xadd(addr, writable!(operand.reg), size, flags);
1537                 operand.reg
1538             }
1539             RmwOp::Sub => {
1540                 let operand = context.pop_to_reg(self, None)?;
1541                 self.asm.neg(operand.reg, writable!(operand.reg), size);
1542                 self.asm
1543                     .lock_xadd(addr, writable!(operand.reg), size, flags);
1544                 operand.reg
1545             }
1546             RmwOp::Xchg => {
1547                 let operand = context.pop_to_reg(self, None)?;
1548                 self.asm.xchg(addr, writable!(operand.reg), size, flags);
1549                 operand.reg
1550             }
1551             RmwOp::And | RmwOp::Or | RmwOp::Xor => {
1552                 let op = match op {
1553                     RmwOp::And => AtomicRmwSeqOp::And,
1554                     RmwOp::Or => AtomicRmwSeqOp::Or,
1555                     RmwOp::Xor => AtomicRmwSeqOp::Xor,
1556                     _ => unreachable!(
1557                         "invalid op for atomic_rmw_seq, should be one of `or`, `and` or `xor`"
1558                     ),
1559                 };
1560                 let dst = context.reg(regs::rax(), self)?;
1561                 let operand = context.pop_to_reg(self, None)?;
1562 
1563                 self.with_scratch::<IntScratch, _>(|masm, scratch| {
1564                     masm.asm.atomic_rmw_seq(
1565                         addr,
1566                         operand.reg,
1567                         writable!(dst),
1568                         scratch.writable(),
1569                         size,
1570                         flags,
1571                         op,
1572                     );
1573                 });
1574 
1575                 context.free_reg(operand.reg);
1576                 dst
1577             }
1578         };
1579 
1580         let dst_ty = match extend {
1581             Some(ext) => {
1582                 // We don't need to zero-extend from 32 to 64bits.
1583                 if !(ext.from_bits() == 32 && ext.to_bits() == 64) {
1584                     self.asm.movzx_rr(res, writable!(res), ext);
1585                 }
1586 
1587                 WasmValType::int_from_bits(ext.to_bits())
1588             }
1589             None => WasmValType::int_from_bits(size.num_bits()),
1590         };
1591 
1592         context.stack.push(TypedReg::new(dst_ty, res).into());
1593 
1594         Ok(())
1595     }
1596 
extract_lane( &mut self, src: Reg, dst: WritableReg, lane: u8, kind: ExtractLaneKind, ) -> Result<()>1597     fn extract_lane(
1598         &mut self,
1599         src: Reg,
1600         dst: WritableReg,
1601         lane: u8,
1602         kind: ExtractLaneKind,
1603     ) -> Result<()> {
1604         self.ensure_has_avx()?;
1605 
1606         match kind {
1607             ExtractLaneKind::I8x16S
1608             | ExtractLaneKind::I8x16U
1609             | ExtractLaneKind::I16x8S
1610             | ExtractLaneKind::I16x8U
1611             | ExtractLaneKind::I32x4
1612             | ExtractLaneKind::I64x2 => self.asm.xmm_vpextr_rr(dst, src, lane, kind.lane_size()),
1613             ExtractLaneKind::F32x4 | ExtractLaneKind::F64x2 if lane == 0 => {
1614                 // If the `src` and `dst` registers are the same, then the
1615                 // appropriate value is already in the correct position in
1616                 // the register.
1617                 assert!(src == dst.to_reg());
1618             }
1619             ExtractLaneKind::F32x4 => self.asm.xmm_vpshuf_rr(src, dst, lane, kind.lane_size()),
1620             ExtractLaneKind::F64x2 => {
1621                 // `0b11_10` selects the high and low 32-bits of the second
1622                 // 64-bit, so `0b11_10_11_10` splats the 64-bit value across
1623                 // both lanes. Since we put an `f64` on the stack, we use
1624                 // the splatted value.
1625                 // Double-check `lane == 0` was handled in another branch.
1626                 assert!(lane == 1);
1627                 self.asm
1628                     .xmm_vpshuf_rr(src, dst, 0b11_10_11_10, OperandSize::S32)
1629             }
1630         }
1631 
1632         // Sign-extend to 32-bits for sign extended kinds.
1633         match kind {
1634             ExtractLaneKind::I8x16S | ExtractLaneKind::I16x8S => {
1635                 self.asm.movsx_rr(dst.to_reg(), dst, kind.into())
1636             }
1637             _ => (),
1638         }
1639 
1640         Ok(())
1641     }
1642 
replace_lane( &mut self, src: RegImm, dst: WritableReg, lane: u8, kind: ReplaceLaneKind, ) -> Result<()>1643     fn replace_lane(
1644         &mut self,
1645         src: RegImm,
1646         dst: WritableReg,
1647         lane: u8,
1648         kind: ReplaceLaneKind,
1649     ) -> Result<()> {
1650         self.ensure_has_avx()?;
1651 
1652         match kind {
1653             ReplaceLaneKind::I8x16
1654             | ReplaceLaneKind::I16x8
1655             | ReplaceLaneKind::I32x4
1656             | ReplaceLaneKind::I64x2 => match src {
1657                 RegImm::Reg(reg) => {
1658                     self.asm
1659                         .xmm_vpinsr_rrr(dst, dst.to_reg(), reg, lane, kind.lane_size());
1660                 }
1661                 RegImm::Imm(imm) => {
1662                     let address = self.asm.add_constant(&imm.to_bytes());
1663                     self.asm
1664                         .xmm_vpinsr_rrm(dst, dst.to_reg(), &address, lane, kind.lane_size());
1665                 }
1666             },
1667             ReplaceLaneKind::F32x4 => {
1668                 // Immediate for `vinsertps` uses first 3 bits to determine
1669                 // which elements of the destination to set to 0. The next 2
1670                 // bits specify which element of the destination will be
1671                 // overwritten.
1672                 let imm = lane << 4;
1673                 match src {
1674                     RegImm::Reg(reg) => self.asm.xmm_vinsertps_rrr(dst, dst.to_reg(), reg, imm),
1675                     RegImm::Imm(val) => {
1676                         let address = self.asm.add_constant(&val.to_bytes());
1677                         self.asm.xmm_vinsertps_rrm(dst, dst.to_reg(), &address, imm);
1678                     }
1679                 }
1680             }
1681             ReplaceLaneKind::F64x2 => match src {
1682                 RegImm::Reg(reg) => match lane {
1683                     0 => self.asm.xmm_vmovsd_rrr(dst, dst.to_reg(), reg),
1684                     1 => self.asm.xmm_vmovlhps_rrr(dst, dst.to_reg(), reg),
1685                     _ => unreachable!(),
1686                 },
1687                 RegImm::Imm(imm) => {
1688                     let address = self.asm.add_constant(&imm.to_bytes());
1689                     match lane {
1690                         0 => {
1691                             // Memory load variant of `vmovsd` zeroes the upper
1692                             // 64 bits of the register so need to load the
1693                             // immediate to a register to use the register
1694                             // variant of `vmovsd` to perform the merge.
1695 
1696                             self.with_scratch::<FloatScratch, _>(|masm, scratch| {
1697                                 masm.asm.xmm_vmovsd_rm(scratch.writable(), &address);
1698                                 masm.asm.xmm_vmovsd_rrr(dst, dst.to_reg(), scratch.inner());
1699                             });
1700                         }
1701                         1 => self.asm.xmm_vmovlhps_rrm(dst, dst.to_reg(), &address),
1702                         _ => unreachable!(),
1703                     }
1704                 }
1705             },
1706         }
1707         Ok(())
1708     }
1709 
atomic_cas( &mut self, context: &mut CodeGenContext<Emission>, addr: Self::Address, size: OperandSize, flags: MemFlags, extend: Option<Extend<Zero>>, ) -> Result<()>1710     fn atomic_cas(
1711         &mut self,
1712         context: &mut CodeGenContext<Emission>,
1713         addr: Self::Address,
1714         size: OperandSize,
1715         flags: MemFlags,
1716         extend: Option<Extend<Zero>>,
1717     ) -> Result<()> {
1718         // `cmpxchg` expects `expected` to be in the `*a*` register.
1719         // reserve rax for the expected argument.
1720         let rax = context.reg(regs::rax(), self)?;
1721 
1722         let replacement = context.pop_to_reg(self, None)?;
1723 
1724         // mark `rax` as allocatable again.
1725         context.free_reg(rax);
1726         let expected = context.pop_to_reg(self, Some(regs::rax()))?;
1727 
1728         self.asm
1729             .cmpxchg(addr, replacement.reg, writable!(expected.reg), size, flags);
1730 
1731         if let Some(extend) = extend {
1732             // We don't need to zero-extend from 32 to 64bits.
1733             if !(extend.from_bits() == 32 && extend.to_bits() == 64) {
1734                 self.asm
1735                     .movzx_rr(expected.reg, writable!(expected.reg), extend);
1736             }
1737         }
1738 
1739         context.stack.push(expected.into());
1740         context.free_reg(replacement);
1741 
1742         Ok(())
1743     }
1744 
v128_eq( &mut self, dst: WritableReg, lhs: Reg, rhs: Reg, kind: VectorEqualityKind, ) -> Result<()>1745     fn v128_eq(
1746         &mut self,
1747         dst: WritableReg,
1748         lhs: Reg,
1749         rhs: Reg,
1750         kind: VectorEqualityKind,
1751     ) -> Result<()> {
1752         self.ensure_has_avx()?;
1753 
1754         match kind {
1755             VectorEqualityKind::I8x16
1756             | VectorEqualityKind::I16x8
1757             | VectorEqualityKind::I32x4
1758             | VectorEqualityKind::I64x2 => {
1759                 self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size())
1760             }
1761             VectorEqualityKind::F32x4 | VectorEqualityKind::F64x2 => {
1762                 self.asm
1763                     .xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Eq)
1764             }
1765         }
1766         Ok(())
1767     }
1768 
v128_ne( &mut self, dst: WritableReg, lhs: Reg, rhs: Reg, kind: VectorEqualityKind, ) -> Result<()>1769     fn v128_ne(
1770         &mut self,
1771         dst: WritableReg,
1772         lhs: Reg,
1773         rhs: Reg,
1774         kind: VectorEqualityKind,
1775     ) -> Result<()> {
1776         self.ensure_has_avx()?;
1777 
1778         match kind {
1779             VectorEqualityKind::I8x16
1780             | VectorEqualityKind::I16x8
1781             | VectorEqualityKind::I32x4
1782             | VectorEqualityKind::I64x2 => {
1783                 // Check for equality and invert the results.
1784                 self.asm
1785                     .xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1786                 self.asm
1787                     .xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
1788                 self.asm.xmm_vpxor_rrr(lhs, rhs, dst);
1789             }
1790             VectorEqualityKind::F32x4 | VectorEqualityKind::F64x2 => {
1791                 self.asm
1792                     .xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Ne)
1793             }
1794         }
1795         Ok(())
1796     }
1797 
v128_lt( &mut self, dst: WritableReg, lhs: Reg, rhs: Reg, kind: VectorCompareKind, ) -> Result<()>1798     fn v128_lt(
1799         &mut self,
1800         dst: WritableReg,
1801         lhs: Reg,
1802         rhs: Reg,
1803         kind: VectorCompareKind,
1804     ) -> Result<()> {
1805         self.ensure_has_avx()?;
1806 
1807         match kind {
1808             VectorCompareKind::I8x16S
1809             | VectorCompareKind::I16x8S
1810             | VectorCompareKind::I32x4S
1811             | VectorCompareKind::I64x2S => {
1812                 // Perform a greater than check with reversed parameters.
1813                 self.asm.xmm_vpcmpgt_rrr(dst, rhs, lhs, kind.lane_size())
1814             }
1815             VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
1816                 // Set `lhs` to min values, check for equality, then invert the
1817                 // result.
1818                 // If `lhs` is smaller, then equality check will fail and result
1819                 // will be inverted to true. Otherwise the equality check will
1820                 // pass and be inverted to false.
1821                 self.asm
1822                     .xmm_vpminu_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1823                 self.asm
1824                     .xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1825                 self.asm
1826                     .xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
1827                 self.asm.xmm_vpxor_rrr(lhs, rhs, dst);
1828             }
1829             VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
1830                 self.asm
1831                     .xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Lt)
1832             }
1833         }
1834         Ok(())
1835     }
1836 
v128_le( &mut self, dst: WritableReg, lhs: Reg, rhs: Reg, kind: VectorCompareKind, ) -> Result<()>1837     fn v128_le(
1838         &mut self,
1839         dst: WritableReg,
1840         lhs: Reg,
1841         rhs: Reg,
1842         kind: VectorCompareKind,
1843     ) -> Result<()> {
1844         self.ensure_has_avx()?;
1845 
1846         match kind {
1847             VectorCompareKind::I8x16S | VectorCompareKind::I16x8S | VectorCompareKind::I32x4S => {
1848                 // Set the `rhs` vector to the signed minimum values and then
1849                 // compare them with `lhs` for equality.
1850                 self.asm
1851                     .xmm_vpmins_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
1852                 self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
1853             }
1854             VectorCompareKind::I64x2S => {
1855                 // Do a greater than check and invert the results.
1856                 self.asm
1857                     .xmm_vpcmpgt_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1858                 self.asm
1859                     .xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
1860                 self.asm.xmm_vpxor_rrr(lhs, rhs, dst);
1861             }
1862             VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
1863                 // Set the `rhs` vector to the signed minimum values and then
1864                 // compare them with `lhs` for equality.
1865                 self.asm
1866                     .xmm_vpminu_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
1867                 self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
1868             }
1869             VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
1870                 self.asm
1871                     .xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Le)
1872             }
1873         }
1874         Ok(())
1875     }
1876 
v128_gt( &mut self, dst: WritableReg, lhs: Reg, rhs: Reg, kind: VectorCompareKind, ) -> Result<()>1877     fn v128_gt(
1878         &mut self,
1879         dst: WritableReg,
1880         lhs: Reg,
1881         rhs: Reg,
1882         kind: VectorCompareKind,
1883     ) -> Result<()> {
1884         self.ensure_has_avx()?;
1885 
1886         match kind {
1887             VectorCompareKind::I8x16S
1888             | VectorCompareKind::I16x8S
1889             | VectorCompareKind::I32x4S
1890             | VectorCompareKind::I64x2S => {
1891                 self.asm.xmm_vpcmpgt_rrr(dst, lhs, rhs, kind.lane_size())
1892             }
1893             VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
1894                 // Set `lhs` to max values, check for equality, then invert the
1895                 // result.
1896                 // If `lhs` is larger, then equality check will fail and result
1897                 // will be inverted to true. Otherwise the equality check will
1898                 // pass and be inverted to false.
1899                 self.asm
1900                     .xmm_vpmaxu_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1901                 self.asm
1902                     .xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1903                 self.asm
1904                     .xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
1905                 self.asm.xmm_vpxor_rrr(lhs, rhs, dst);
1906             }
1907             VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
1908                 // Do a less than comparison with the operands swapped.
1909                 self.asm
1910                     .xmm_vcmpp_rrr(dst, rhs, lhs, kind.lane_size(), VcmpKind::Lt)
1911             }
1912         }
1913         Ok(())
1914     }
1915 
v128_ge( &mut self, dst: WritableReg, lhs: Reg, rhs: Reg, kind: VectorCompareKind, ) -> Result<()>1916     fn v128_ge(
1917         &mut self,
1918         dst: WritableReg,
1919         lhs: Reg,
1920         rhs: Reg,
1921         kind: VectorCompareKind,
1922     ) -> Result<()> {
1923         self.ensure_has_avx()?;
1924 
1925         match kind {
1926             VectorCompareKind::I8x16S | VectorCompareKind::I16x8S | VectorCompareKind::I32x4S => {
1927                 // Set each lane to maximum value and then compare for equality.
1928                 self.asm
1929                     .xmm_vpmaxs_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
1930                 self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
1931             }
1932             VectorCompareKind::I64x2S => {
1933                 // Perform a greater than comparison with operands swapped,
1934                 // then invert the results.
1935                 self.asm
1936                     .xmm_vpcmpgt_rrr(writable!(rhs), rhs, lhs, kind.lane_size());
1937                 self.asm.xmm_vpcmpeq_rrr(dst, lhs, lhs, kind.lane_size());
1938                 self.asm.xmm_vpxor_rrr(dst.to_reg(), rhs, dst);
1939             }
1940             VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
1941                 // Set lanes to maximum values and compare them for equality.
1942                 self.asm
1943                     .xmm_vpmaxu_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
1944                 self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
1945             }
1946             VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
1947                 // Perform a less than or equal comparison on swapped operands.
1948                 self.asm
1949                     .xmm_vcmpp_rrr(dst, rhs, lhs, kind.lane_size(), VcmpKind::Le)
1950             }
1951         }
1952 
1953         Ok(())
1954     }
1955 
    /// Emits a memory fence using the `mfence` instruction.
    fn fence(&mut self) -> Result<()> {
        self.asm.mfence();
        Ok(())
    }
1960 
    /// Emits a bitwise NOT of the vector in `dst`, in place.
    ///
    /// There is no dedicated instruction, so `dst` is xor-ed with an all-ones
    /// vector built in a float scratch register.
    fn v128_not(&mut self, dst: WritableReg) -> Result<()> {
        self.ensure_has_avx()?;

        self.with_scratch::<FloatScratch, _>(|masm, tmp| {
            // First, we initialize `tmp` with all ones by comparing it with
            // itself.
            masm.asm
                .xmm_vpcmpeq_rrr(tmp.writable(), tmp.inner(), tmp.inner(), OperandSize::S32);
            // Then we `xor` tmp and `dst` together, yielding `!dst`.
            masm.asm.xmm_vpxor_rrr(tmp.inner(), dst.to_reg(), dst);
            Ok(())
        })
    }
1974 
    /// Emits a `vpand` of `src1` and `src2`, writing the result to `dst`.
    fn v128_and(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {
        self.ensure_has_avx()?;
        self.asm.xmm_vpand_rrr(src1, src2, dst);
        Ok(())
    }
1980 
    /// Emits a `vpandn` of `src1` and `src2`, writing the result to `dst`.
    ///
    /// NOTE(review): with the x64 `pandn` semantics this would be
    /// `!src1 & src2`; confirm against the operand order of
    /// `xmm_vpandn_rrr`.
    fn v128_and_not(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {
        self.ensure_has_avx()?;
        self.asm.xmm_vpandn_rrr(src1, src2, dst);
        Ok(())
    }
1986 
    /// Emits a `vpor` of `src1` and `src2`, writing the result to `dst`.
    fn v128_or(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {
        self.ensure_has_avx()?;
        // Unlike the other bitwise helpers, `xmm_vpor_rrr` takes the
        // destination as its first argument.
        self.asm.xmm_vpor_rrr(dst, src1, src2);
        Ok(())
    }
1992 
    /// Emits a `vpxor` of `src1` and `src2`, writing the result to `dst`.
    fn v128_xor(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {
        self.ensure_has_avx()?;
        self.asm.xmm_vpxor_rrr(src1, src2, dst);
        Ok(())
    }
1998 
    /// Emits a bitwise select between `src1` and `src2` controlled by `mask`,
    /// writing the result to `dst`.
    ///
    /// The result is assembled from three steps using a float scratch
    /// register: `src1 & mask`, the and-not of `mask` and `src2`, and the
    /// `or` of both partial results.
    fn v128_bitselect(&mut self, src1: Reg, src2: Reg, mask: Reg, dst: WritableReg) -> Result<()> {
        self.ensure_has_avx()?;

        self.with_scratch::<FloatScratch, _>(|masm, tmp| {
            // tmp = src1 & mask: the bits taken from `src1`.
            masm.v128_and(src1, mask, tmp.writable())?;
            // dst = and-not of `mask` and `src2`: presumably `!mask & src2`,
            // i.e. the bits taken from `src2`.
            masm.v128_and_not(mask, src2, dst)?;
            // dst = dst | tmp: combine both halves.
            masm.v128_or(dst.to_reg(), tmp.inner(), dst)?;
            Ok(())
        })
    }
2009 
    /// Emits code that sets `dst` according to whether any bit of the vector
    /// in `src` is set.
    fn v128_any_true(&mut self, src: Reg, dst: WritableReg) -> Result<()> {
        self.ensure_has_avx()?;
        // Testing `src` against itself leaves the zero flag set only when
        // `src` is all zeros, so the `Ne` condition below holds when at
        // least one bit is set.
        self.asm.xmm_vptest(src, src);
        self.asm.setcc(IntCmpKind::Ne, dst);
        Ok(())
    }
2016 
    /// Emits a lane-wise integer to floating point conversion of `src` into
    /// `dst`.
    ///
    /// The signed kinds map directly onto a `vcvt` instruction. The unsigned
    /// kinds have no direct equivalent here and are emitted as short
    /// instruction sequences, described inline below.
    fn v128_convert(&mut self, src: Reg, dst: WritableReg, kind: V128ConvertKind) -> Result<()> {
        self.ensure_has_avx()?;
        match kind {
            V128ConvertKind::I32x4S => self.asm.xmm_vcvt_rr(src, dst, VcvtKind::I32ToF32),
            V128ConvertKind::I32x4LowS => self.asm.xmm_vcvt_rr(src, dst, VcvtKind::I32ToF64),
            V128ConvertKind::I32x4U => {
                self.with_scratch::<FloatScratch, _>(|masm, scratch| {
                    // Split each 32-bit integer into 16-bit parts.
                    // `scratch` will contain the low bits and `dst` will contain
                    // the high bits.
                    // Shifting left and then right by 16 clears the upper
                    // half of every lane, leaving only the low 16 bits.
                    masm.asm
                        .xmm_vpsll_rri(src, scratch.writable(), 0x10, kind.src_lane_size());
                    masm.asm.xmm_vpsrl_rri(
                        scratch.inner(),
                        scratch.writable(),
                        0x10,
                        kind.src_lane_size(),
                    );
                    // Subtracting the low part from the original value
                    // leaves the high part (still in its original bit
                    // position) in `dst`.
                    masm.asm
                        .xmm_vpsub_rrr(src, scratch.inner(), dst, kind.src_lane_size());

                    // Convert the low bits in `scratch` to floating point numbers.
                    masm.asm
                        .xmm_vcvt_rr(scratch.inner(), scratch.writable(), VcvtKind::I32ToF32);

                    // Prevent overflow by right shifting high bits.
                    masm.asm
                        .xmm_vpsrl_rri(dst.to_reg(), dst, 1, kind.src_lane_size());
                    // Convert high bits in `dst` to floating point numbers.
                    masm.asm.xmm_vcvt_rr(dst.to_reg(), dst, VcvtKind::I32ToF32);
                    // Double high bits in `dst` to reverse right shift.
                    masm.asm
                        .xmm_vaddp_rrr(dst.to_reg(), dst.to_reg(), dst, kind.src_lane_size());
                    // Add high bits in `dst` to low bits in `scratch`.
                    masm.asm.xmm_vaddp_rrr(
                        dst.to_reg(),
                        scratch.inner(),
                        dst,
                        kind.src_lane_size(),
                    );
                });
            }
            V128ConvertKind::I32x4LowU => {
                // See
                // https://github.com/bytecodealliance/wasmtime/blob/bb886ffc3c81a476d8ba06311ff2dede15a6f7e1/cranelift/codegen/src/isa/x64/lower.isle#L3668
                // for details on the Cranelift AVX implementation.
                // Use `vunpcklp` to create doubles from the integers.
                // Interleaving 0x1.0p52 (i.e., 0x43300000) with the integers
                // creates a byte array for a double that sets the mantissa
                // bits to the original integer value.
                // The constant below is 0x43300000 twice, in little-endian
                // byte order.
                let conversion_constant = self
                    .asm
                    .add_constant(&[0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43]);
                self.asm
                    .xmm_vunpcklp_rrm(src, &conversion_constant, dst, kind.src_lane_size());
                // Subtract the 0x1.0p52 added above.
                // The constant below is the 64-bit pattern
                // 0x4330000000000000 twice, in little-endian byte order.
                let conversion_constant = self.asm.add_constant(&[
                    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00,
                    0x00, 0x30, 0x43,
                ]);
                self.asm.xmm_vsub_rrm(
                    dst.to_reg(),
                    &conversion_constant,
                    dst,
                    kind.dst_lane_size(),
                );
            }
        }
        Ok(())
    }
2087 
v128_narrow( &mut self, src1: Reg, src2: Reg, dst: WritableReg, kind: V128NarrowKind, ) -> Result<()>2088     fn v128_narrow(
2089         &mut self,
2090         src1: Reg,
2091         src2: Reg,
2092         dst: WritableReg,
2093         kind: V128NarrowKind,
2094     ) -> Result<()> {
2095         self.ensure_has_avx()?;
2096         match kind {
2097             V128NarrowKind::I16x8S | V128NarrowKind::I32x4S => {
2098                 self.asm
2099                     .xmm_vpackss_rrr(src1, src2, dst, kind.dst_lane_size())
2100             }
2101             V128NarrowKind::I16x8U | V128NarrowKind::I32x4U => {
2102                 self.asm
2103                     .xmm_vpackus_rrr(src1, src2, dst, kind.dst_lane_size())
2104             }
2105         }
2106         Ok(())
2107     }
2108 
    /// Emits a vector conversion of `src` into `dst` using the `F64ToF32`
    /// kind of `vcvt`.
    fn v128_demote(&mut self, src: Reg, dst: WritableReg) -> Result<()> {
        self.ensure_has_avx()?;
        self.asm.xmm_vcvt_rr(src, dst, VcvtKind::F64ToF32);
        Ok(())
    }
2114 
    /// Emits a vector conversion of `src` into `dst` using the `F32ToF64`
    /// kind of `vcvt`.
    fn v128_promote(&mut self, src: Reg, dst: WritableReg) -> Result<()> {
        self.ensure_has_avx()?;
        self.asm.xmm_vcvt_rr(src, dst, VcvtKind::F32ToF64);
        Ok(())
    }
2120 
v128_extend(&mut self, src: Reg, dst: WritableReg, kind: V128ExtendKind) -> Result<()>2121     fn v128_extend(&mut self, src: Reg, dst: WritableReg, kind: V128ExtendKind) -> Result<()> {
2122         self.ensure_has_avx()?;
2123         match kind {
2124             V128ExtendKind::LowI8x16S
2125             | V128ExtendKind::LowI8x16U
2126             | V128ExtendKind::LowI16x8S
2127             | V128ExtendKind::LowI16x8U
2128             | V128ExtendKind::LowI32x4S
2129             | V128ExtendKind::LowI32x4U => self.asm.xmm_vpmov_rr(src, dst, kind.into()),
2130             V128ExtendKind::HighI8x16S | V128ExtendKind::HighI16x8S => {
2131                 self.asm.xmm_vpalignr_rrr(src, src, dst, 0x8);
2132                 self.asm.xmm_vpmov_rr(dst.to_reg(), dst, kind.into());
2133             }
2134             V128ExtendKind::HighI8x16U | V128ExtendKind::HighI16x8U => {
2135                 self.with_scratch::<FloatScratch, _>(|masm, scratch| {
2136                     masm.asm
2137                         .xmm_vpxor_rrr(scratch.inner(), scratch.inner(), scratch.writable());
2138                     masm.asm
2139                         .xmm_vpunpckh_rrr(src, scratch.inner(), dst, kind.src_lane_size());
2140                 });
2141             }
2142             V128ExtendKind::HighI32x4S => {
2143                 // Move the 3rd element (i.e., 0b10) to the 1st (rightmost)
2144                 // position and the 4th element (i.e., 0b11) to the 2nd (second
2145                 // from the right) position and then perform the extend.
2146                 self.asm
2147                     .xmm_vpshuf_rr(src, dst, 0b11_10_11_10, kind.src_lane_size());
2148                 self.asm.xmm_vpmov_rr(dst.to_reg(), dst, kind.into());
2149             }
2150             V128ExtendKind::HighI32x4U => {
2151                 self.with_scratch::<FloatScratch, _>(|masm, scratch| {
2152                     // Set `scratch` to a vector 0s.
2153                     masm.asm.xmm_vxorp_rrr(
2154                         scratch.inner(),
2155                         scratch.inner(),
2156                         scratch.writable(),
2157                         kind.src_lane_size(),
2158                     );
2159                     // Interleave the 0 bits into the two 32-bit integers to zero extend them.
2160                     masm.asm
2161                         .xmm_vunpckhp_rrr(src, scratch.inner(), dst, kind.src_lane_size());
2162                 });
2163             }
2164         }
2165         Ok(())
2166     }
2167 
v128_add(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, kind: V128AddKind) -> Result<()>2168     fn v128_add(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, kind: V128AddKind) -> Result<()> {
2169         self.ensure_has_avx()?;
2170         match kind {
2171             V128AddKind::F32x4 => self.asm.xmm_vaddp_rrr(lhs, rhs, dst, OperandSize::S32),
2172             V128AddKind::F64x2 => self.asm.xmm_vaddp_rrr(lhs, rhs, dst, OperandSize::S64),
2173             V128AddKind::I8x16 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S8),
2174             V128AddKind::I8x16SatS => self.asm.xmm_vpadds_rrr(dst, lhs, rhs, OperandSize::S8),
2175             V128AddKind::I8x16SatU => self.asm.xmm_vpaddus_rrr(dst, lhs, rhs, OperandSize::S8),
2176             V128AddKind::I16x8 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S16),
2177             V128AddKind::I16x8SatS => self.asm.xmm_vpadds_rrr(dst, lhs, rhs, OperandSize::S16),
2178             V128AddKind::I16x8SatU => self.asm.xmm_vpaddus_rrr(dst, lhs, rhs, OperandSize::S16),
2179             V128AddKind::I32x4 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S32),
2180             V128AddKind::I64x2 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S64),
2181         };
2182         Ok(())
2183     }
2184 
v128_sub(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, kind: V128SubKind) -> Result<()>2185     fn v128_sub(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, kind: V128SubKind) -> Result<()> {
2186         self.ensure_has_avx()?;
2187         match kind {
2188             V128SubKind::F32x4 => self.asm.xmm_vsubp_rrr(lhs, rhs, dst, OperandSize::S32),
2189             V128SubKind::F64x2 => self.asm.xmm_vsubp_rrr(lhs, rhs, dst, OperandSize::S64),
2190             V128SubKind::I8x16 => self.asm.xmm_vpsub_rrr(lhs, rhs, dst, OperandSize::S8),
2191             V128SubKind::I8x16SatS => self.asm.xmm_vpsubs_rrr(dst, lhs, rhs, OperandSize::S8),
2192             V128SubKind::I8x16SatU => self.asm.xmm_vpsubus_rrr(dst, lhs, rhs, OperandSize::S8),
2193             V128SubKind::I16x8 => self.asm.xmm_vpsub_rrr(lhs, rhs, dst, OperandSize::S16),
2194             V128SubKind::I16x8SatS => self.asm.xmm_vpsubs_rrr(dst, lhs, rhs, OperandSize::S16),
2195             V128SubKind::I16x8SatU => self.asm.xmm_vpsubus_rrr(dst, lhs, rhs, OperandSize::S16),
2196             V128SubKind::I32x4 => self.asm.xmm_vpsub_rrr(lhs, rhs, dst, OperandSize::S32),
2197             V128SubKind::I64x2 => self.asm.xmm_vpsub_rrr(lhs, rhs, dst, OperandSize::S64),
2198         };
2199         Ok(())
2200     }
2201 
v128_mul( &mut self, context: &mut CodeGenContext<Emission>, kind: V128MulKind, ) -> Result<()>2202     fn v128_mul(
2203         &mut self,
2204         context: &mut CodeGenContext<Emission>,
2205         kind: V128MulKind,
2206     ) -> Result<()> {
2207         self.ensure_has_avx()?;
2208 
2209         let rhs = context.pop_to_reg(self, None)?;
2210         let lhs = context.pop_to_reg(self, None)?;
2211 
2212         let mul_i64x2_avx512 = |this: &mut Self| {
2213             this.asm.vpmullq(lhs.reg, rhs.reg, writable!(lhs.reg));
2214         };
2215 
2216         let mul_i64x2_fallback = |this: &mut Self,
2217                                   context: &mut CodeGenContext<Emission>|
2218          -> Result<()> {
2219             // Standard AVX doesn't have an instruction for i64x2 multiplication, instead, we have to fallback
2220             // to an instruction sequence using 32bits multiplication (taken from cranelift
2221             // implementation, in `isa/x64/lower.isle`):
2222             //
2223             // > Otherwise, for i64x2 multiplication we describe a lane A as being composed of
2224             // > a 32-bit upper half "Ah" and a 32-bit lower half "Al". The 32-bit long hand
2225             // > multiplication can then be written as:
2226             //
2227             // >    Ah Al
2228             // > *  Bh Bl
2229             // >    -----
2230             // >    Al * Bl
2231             // > + (Ah * Bl) << 32
2232             // > + (Al * Bh) << 32
2233             //
2234             // > So for each lane we will compute:
2235             //
2236             // >   A * B  = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32
2237             //
2238             // > Note, the algorithm will use `pmuludq` which operates directly on the lower
2239             // > 32-bit (`Al` or `Bl`) of a lane and writes the result to the full 64-bits of
2240             // > the lane of the destination. For this reason we don't need shifts to isolate
2241             // > the lower 32-bits, however, we will need to use shifts to isolate the high
2242             // > 32-bits when doing calculations, i.e., `Ah == A >> 32`.
2243 
2244             let tmp2 = context.any_fpr(this)?;
2245             this.with_scratch::<FloatScratch, _>(|this, tmp1| {
2246                 // tmp1 = lhs_hi = (lhs >> 32)
2247                 this.asm
2248                     .xmm_vpsrl_rri(lhs.reg, tmp1.writable(), 32, OperandSize::S64);
2249 
2250                 // tmp2 = lhs_hi * rhs_low = tmp1 * rhs
2251                 this.asm
2252                     .xmm_vpmuldq_rrr(tmp1.inner(), rhs.reg, writable!(tmp2));
2253 
2254                 // tmp1 = rhs_hi = rhs >> 32
2255                 this.asm
2256                     .xmm_vpsrl_rri(rhs.reg, tmp1.writable(), 32, OperandSize::S64);
2257 
2258                 // tmp1 = lhs_low * rhs_high = tmp1 * lhs
2259                 this.asm
2260                     .xmm_vpmuludq_rrr(tmp1.inner(), lhs.reg, tmp1.writable());
2261 
2262                 // tmp1 = ((lhs_hi * rhs_low) + (lhs_lo * rhs_hi)) = tmp1 + tmp2
2263                 this.asm
2264                     .xmm_vpadd_rrr(tmp1.inner(), tmp2, tmp1.writable(), OperandSize::S64);
2265 
2266                 //tmp1 = tmp1 << 32
2267                 this.asm
2268                     .xmm_vpsll_rri(tmp1.inner(), tmp1.writable(), 32, OperandSize::S64);
2269 
2270                 // tmp2 = lhs_lo + rhs_lo
2271                 this.asm.xmm_vpmuludq_rrr(lhs.reg, rhs.reg, writable!(tmp2));
2272 
2273                 // finally, with `lhs` as destination:
2274                 // lhs = (lhs_low * rhs_low) + ((lhs_hi * rhs_low) + (lhs_lo * rhs_hi)) = tmp1 + tmp2
2275                 this.asm
2276                     .xmm_vpadd_rrr(tmp1.inner(), tmp2, writable!(lhs.reg), OperandSize::S64);
2277             });
2278 
2279             context.free_reg(tmp2);
2280 
2281             Ok(())
2282         };
2283 
2284         match kind {
2285             V128MulKind::F32x4 => {
2286                 self.asm
2287                     .xmm_vmulp_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S32)
2288             }
2289             V128MulKind::F64x2 => {
2290                 self.asm
2291                     .xmm_vmulp_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S64)
2292             }
2293             V128MulKind::I16x8 => {
2294                 self.asm
2295                     .xmm_vpmull_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S16)
2296             }
2297             V128MulKind::I32x4 => {
2298                 self.asm
2299                     .xmm_vpmull_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S32)
2300             }
2301             // This is the fast path when AVX512 is available.
2302             V128MulKind::I64x2
2303                 if self.ensure_has_avx512vl().is_ok() && self.ensure_has_avx512dq().is_ok() =>
2304             {
2305                 mul_i64x2_avx512(self)
2306             }
2307             // Otherwise, we emit AVX fallback sequence.
2308             V128MulKind::I64x2 => mul_i64x2_fallback(self, context)?,
2309         }
2310 
2311         context.stack.push(lhs.into());
2312         context.free_reg(rhs);
2313 
2314         Ok(())
2315     }
2316 
v128_abs(&mut self, src: Reg, dst: WritableReg, kind: V128AbsKind) -> Result<()>2317     fn v128_abs(&mut self, src: Reg, dst: WritableReg, kind: V128AbsKind) -> Result<()> {
2318         self.ensure_has_avx()?;
2319 
2320         match kind {
2321             V128AbsKind::I8x16 | V128AbsKind::I16x8 | V128AbsKind::I32x4 => {
2322                 self.asm.xmm_vpabs_rr(src, dst, kind.lane_size())
2323             }
2324             V128AbsKind::I64x2 => {
2325                 self.with_scratch::<FloatScratch, _>(|masm, scratch| {
2326                     // Perform an arithmetic right shift of 31 bits. If the number
2327                     // is positive, this will result in all zeroes in the upper
2328                     // 32-bits. If the number is negative, this will result in all
2329                     // ones in the upper 32-bits.
2330                     masm.asm
2331                         .xmm_vpsra_rri(src, scratch.writable(), 0x1f, OperandSize::S32);
2332                     // Copy the ones and zeroes in the high bits of each 64-bit
2333                     // lane to the low bits of each 64-bit lane.
2334                     masm.asm.xmm_vpshuf_rr(
2335                         scratch.inner(),
2336                         scratch.writable(),
2337                         0b11_11_01_01,
2338                         OperandSize::S32,
2339                     );
2340                     // Flip the bits in lanes that were negative in `src` and leave
2341                     // the positive lanes as they are. Positive lanes will have a
2342                     // zero mask in `scratch` so xor doesn't affect them.
2343                     masm.asm.xmm_vpxor_rrr(src, scratch.inner(), dst);
2344                     // Subtract the mask from the results of xor which will
2345                     // complete the two's complement for lanes which were negative.
2346                     masm.asm
2347                         .xmm_vpsub_rrr(dst.to_reg(), scratch.inner(), dst, kind.lane_size());
2348                 });
2349             }
2350             V128AbsKind::F32x4 | V128AbsKind::F64x2 => {
2351                 self.with_scratch::<FloatScratch, _>(|masm, scratch| {
2352                     // Create a mask of all ones.
2353                     masm.asm.xmm_vpcmpeq_rrr(
2354                         scratch.writable(),
2355                         scratch.inner(),
2356                         scratch.inner(),
2357                         kind.lane_size(),
2358                     );
2359                     // Right shift the mask so each lane is a single zero followed
2360                     // by all ones.
2361                     masm.asm.xmm_vpsrl_rri(
2362                         scratch.inner(),
2363                         scratch.writable(),
2364                         0x1,
2365                         kind.lane_size(),
2366                     );
2367                     // Use the mask to zero the sign bit in each lane which will
2368                     // make the float value positive.
2369                     masm.asm
2370                         .xmm_vandp_rrr(src, scratch.inner(), dst, kind.lane_size());
2371                 });
2372             }
2373         }
2374         Ok(())
2375     }
2376 
v128_neg(&mut self, op: WritableReg, kind: V128NegKind) -> Result<()>2377     fn v128_neg(&mut self, op: WritableReg, kind: V128NegKind) -> Result<()> {
2378         self.ensure_has_avx()?;
2379 
2380         match kind {
2381             V128NegKind::I8x16 | V128NegKind::I16x8 | V128NegKind::I32x4 | V128NegKind::I64x2 => {
2382                 self.with_scratch::<FloatScratch, _>(|masm, tmp| {
2383                     masm.v128_xor(tmp.inner(), tmp.inner(), tmp.writable())?;
2384                     masm.v128_sub(tmp.inner(), op.to_reg(), op, kind.into())?;
2385                     wasmtime_environ::error::Ok(())
2386                 })?;
2387             }
2388             V128NegKind::F32x4 | V128NegKind::F64x2 => {
2389                 self.with_scratch::<FloatScratch, _>(|masm, tmp| {
2390                     // Create a mask of all 1s.
2391                     masm.asm.xmm_vpcmpeq_rrr(
2392                         tmp.writable(),
2393                         tmp.inner(),
2394                         tmp.inner(),
2395                         kind.lane_size(),
2396                     );
2397                     // Left shift the lanes in the mask so only the sign bit in the
2398                     // mask is set to 1.
2399                     masm.asm.xmm_vpsll_rri(
2400                         tmp.inner(),
2401                         tmp.writable(),
2402                         (kind.lane_size().num_bits() - 1) as u32,
2403                         kind.lane_size(),
2404                     );
2405                     // Use the mask to flip the sign bit.
2406                     masm.asm
2407                         .xmm_vxorp_rrr(op.to_reg(), tmp.inner(), op, kind.lane_size());
2408                 });
2409             }
2410         }
2411         Ok(())
2412     }
2413 
v128_shift( &mut self, context: &mut CodeGenContext<Emission>, lane_width: OperandSize, kind: ShiftKind, ) -> Result<()>2414     fn v128_shift(
2415         &mut self,
2416         context: &mut CodeGenContext<Emission>,
2417         lane_width: OperandSize,
2418         kind: ShiftKind,
2419     ) -> Result<()> {
2420         self.ensure_has_avx()?;
2421         let shift_amount = context.pop_to_reg(self, None)?.reg;
2422         let operand = context.pop_to_reg(self, None)?.reg;
2423         let amount_mask = lane_width.num_bits() - 1;
2424 
2425         self.and(
2426             writable!(shift_amount),
2427             shift_amount,
2428             RegImm::i32(amount_mask as i32),
2429             OperandSize::S32,
2430         )?;
2431 
2432         let move_to_tmp_xmm = |this: &mut Self, tmp_xmm: Scratch| {
2433             this.asm
2434                 .avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);
2435         };
2436 
2437         // A helper for deciding between `vpsllw` and `vpsrlw` in
2438         // `shift_i8x16`.
2439         enum Direction {
2440             Left,
2441             Right,
2442         }
2443 
2444         let shift_i8x16 = |this: &mut Self,
2445                            masks: &'static [u8],
2446                            direction: Direction|
2447          -> Result<()> {
2448             // The case for i8x16 is a little bit trickier because x64 doesn't provide a 8bit
2449             // shift instruction. Instead, we shift as 16bits, and then mask the bits in the
2450             // 8bits lane, for example (with 2 8bits lanes):
2451             // - Before shifting:
2452             // 01001101 11101110
2453             // - shifting by 2 left:
2454             // 00110111 10111000
2455             //       ^^_ these bits come from the previous byte, and need to be masked.
2456             // - The mask:
2457             // 11111100 11111111
2458             // - After masking:
2459             // 00110100 10111000
2460             //
2461             // The mask is loaded from a well known memory, depending on the shift amount.
2462 
2463             this.with_scratch::<FloatScratch, _>(|this, tmp_xmm| {
2464                 this.asm
2465                     .avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);
2466 
2467                 // Perform the 16-bit shift.
2468                 match direction {
2469                     Direction::Left => this.asm.xmm_vpsll_rrr(
2470                         operand,
2471                         tmp_xmm.inner(),
2472                         writable!(operand),
2473                         OperandSize::S16,
2474                     ),
2475                     Direction::Right => this.asm.xmm_vpsrl_rrr(
2476                         operand,
2477                         tmp_xmm.inner(),
2478                         writable!(operand),
2479                         OperandSize::S16,
2480                     ),
2481                 }
2482 
2483                 // Get a handle to the masks array constant.
2484                 let masks_addr = this.asm.add_constant(masks);
2485 
2486                 this.with_scratch::<IntScratch, _>(|this, tmp| {
2487                     // Load the masks array effective address into the tmp register.
2488                     this.asm.lea(&masks_addr, tmp.writable(), OperandSize::S64);
2489 
2490                     // Compute the offset of the mask that we need to use. This is shift_amount * 16 ==
2491                     // shift_amount << 4.
2492                     this.asm
2493                         .shift_ir(4, writable!(shift_amount), ShiftKind::Shl, OperandSize::S32);
2494 
2495                     // Load the mask to tmp_xmm.
2496                     this.asm.xmm_vmovdqu_mr(
2497                         &Address::ImmRegRegShift {
2498                             simm32: 0,
2499                             base: tmp.inner(),
2500                             index: shift_amount,
2501                             shift: 0,
2502                         },
2503                         tmp_xmm.writable(),
2504                         MemFlags::trusted(),
2505                     );
2506                 });
2507 
2508                 // Mask unwanted bits from operand.
2509                 this.asm
2510                     .xmm_vpand_rrr(tmp_xmm.inner(), operand, writable!(operand));
2511                 Ok(())
2512             })
2513         };
2514 
2515         let i64x2_shr_s = |this: &mut Self, context: &mut CodeGenContext<Emission>| -> Result<()> {
2516             const SIGN_MASK: u128 = 0x8000000000000000_8000000000000000;
2517 
2518             // AVX doesn't have an instruction for i64x2 signed right shift. Instead we use the
2519             // following formula (from hacker's delight 2-7), where x is the value and n the shift
2520             // amount, for each lane:
2521             // t = (1 << 63) >> n; ((x >> n) ^ t) - t
2522 
2523             // We need an extra scratch register:
2524             let tmp_xmm2 = context.any_fpr(this)?;
2525 
2526             this.with_scratch::<FloatScratch, _>(|this, tmp_xmm| {
2527                 this.asm
2528                     .avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);
2529 
2530                 let cst = this.asm.add_constant(&SIGN_MASK.to_le_bytes());
2531 
2532                 this.asm
2533                     .xmm_vmovdqu_mr(&cst, writable!(tmp_xmm2), MemFlags::trusted());
2534                 this.asm.xmm_vpsrl_rrr(
2535                     tmp_xmm2,
2536                     tmp_xmm.inner(),
2537                     writable!(tmp_xmm2),
2538                     OperandSize::S64,
2539                 );
2540                 this.asm.xmm_vpsrl_rrr(
2541                     operand,
2542                     tmp_xmm.inner(),
2543                     writable!(operand),
2544                     OperandSize::S64,
2545                 );
2546             });
2547             this.asm
2548                 .xmm_vpxor_rrr(operand, tmp_xmm2, writable!(operand));
2549             this.asm
2550                 .xmm_vpsub_rrr(operand, tmp_xmm2, writable!(operand), OperandSize::S64);
2551 
2552             context.free_reg(tmp_xmm2);
2553 
2554             Ok(())
2555         };
2556 
2557         let i8x16_shr_s = |this: &mut Self, context: &mut CodeGenContext<Emission>| -> Result<()> {
2558             // Since the x86 instruction set does not have an 8x16 shift instruction and the
2559             // approach used for `ishl` and `ushr` cannot be easily used (the masks do not
2560             // preserve the sign), we use a different approach here: separate the low and
2561             // high lanes, shift them separately, and merge them into the final result.
2562             //
2563             // Visually, this looks like the following, where `src.i8x16 = [s0, s1, ...,
2564             // s15]:
2565             //
2566             //   lo.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]
2567             //   shifted_lo.i16x8 = shift each lane of `low`
2568             //   hi.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
2569             //   shifted_hi.i16x8 = shift each lane of `high`
2570             //   result = [s0'', s1'', ..., s15'']
2571 
2572             // In order for `packsswb` later to only use the high byte of each
2573             // 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to
2574             // fill in the upper bits appropriately.
2575             let tmp_lo = context.any_fpr(this)?;
2576             let tmp_hi = context.any_fpr(this)?;
2577 
2578             this.with_scratch::<FloatScratch, _>(|this, tmp_xmm| {
2579                 this.asm
2580                     .add_ir(8, writable!(shift_amount), OperandSize::S32);
2581                 this.asm
2582                     .avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);
2583 
2584                 // Extract lower and upper bytes.
2585                 this.asm
2586                     .xmm_vpunpckl_rrr(operand, operand, writable!(tmp_lo), OperandSize::S8);
2587                 this.asm
2588                     .xmm_vpunpckh_rrr(operand, operand, writable!(tmp_hi), OperandSize::S8);
2589 
2590                 // Perform 16bit right shift of upper and lower bytes.
2591                 this.asm.xmm_vpsra_rrr(
2592                     tmp_lo,
2593                     tmp_xmm.inner(),
2594                     writable!(tmp_lo),
2595                     OperandSize::S16,
2596                 );
2597                 this.asm.xmm_vpsra_rrr(
2598                     tmp_hi,
2599                     tmp_xmm.inner(),
2600                     writable!(tmp_hi),
2601                     OperandSize::S16,
2602                 );
2603             });
2604 
2605             // Merge lower and upper bytes back.
2606             this.asm
2607                 .xmm_vpackss_rrr(tmp_lo, tmp_hi, writable!(operand), OperandSize::S8);
2608 
2609             context.free_reg(tmp_lo);
2610             context.free_reg(tmp_hi);
2611 
2612             Ok(())
2613         };
2614 
2615         match (lane_width, kind) {
2616             // shl
2617             (OperandSize::S8, ShiftKind::Shl) => {
2618                 shift_i8x16(self, &I8X16_ISHL_MASKS, Direction::Left)?
2619             }
2620             (OperandSize::S16, ShiftKind::Shl) => {
2621                 self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {
2622                     move_to_tmp_xmm(masm, tmp_xmm);
2623                     masm.asm.xmm_vpsll_rrr(
2624                         operand,
2625                         tmp_xmm.inner(),
2626                         writable!(operand),
2627                         OperandSize::S16,
2628                     );
2629                 })
2630             }
2631             (OperandSize::S32, ShiftKind::Shl) => {
2632                 self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {
2633                     move_to_tmp_xmm(masm, tmp_xmm);
2634                     masm.asm.xmm_vpsll_rrr(
2635                         operand,
2636                         tmp_xmm.inner(),
2637                         writable!(operand),
2638                         OperandSize::S32,
2639                     );
2640                 })
2641             }
2642             (OperandSize::S64, ShiftKind::Shl) => {
2643                 self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {
2644                     move_to_tmp_xmm(masm, tmp_xmm);
2645                     masm.asm.xmm_vpsll_rrr(
2646                         operand,
2647                         tmp_xmm.inner(),
2648                         writable!(operand),
2649                         OperandSize::S64,
2650                     );
2651                 })
2652             }
2653             // shr_u
2654             (OperandSize::S8, ShiftKind::ShrU) => {
2655                 shift_i8x16(self, &I8X16_USHR_MASKS, Direction::Right)?
2656             }
2657             (OperandSize::S16, ShiftKind::ShrU) => {
2658                 self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {
2659                     move_to_tmp_xmm(masm, tmp_xmm);
2660                     masm.asm.xmm_vpsrl_rrr(
2661                         operand,
2662                         tmp_xmm.inner(),
2663                         writable!(operand),
2664                         OperandSize::S16,
2665                     );
2666                 })
2667             }
2668             (OperandSize::S32, ShiftKind::ShrU) => {
2669                 self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {
2670                     move_to_tmp_xmm(masm, tmp_xmm);
2671                     masm.asm.xmm_vpsrl_rrr(
2672                         operand,
2673                         tmp_xmm.inner(),
2674                         writable!(operand),
2675                         OperandSize::S32,
2676                     );
2677                 })
2678             }
2679             (OperandSize::S64, ShiftKind::ShrU) => {
2680                 self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {
2681                     move_to_tmp_xmm(masm, tmp_xmm);
2682                     masm.asm.xmm_vpsrl_rrr(
2683                         operand,
2684                         tmp_xmm.inner(),
2685                         writable!(operand),
2686                         OperandSize::S64,
2687                     );
2688                 })
2689             }
2690             // shr_s
2691             (OperandSize::S8, ShiftKind::ShrS) => i8x16_shr_s(self, context)?,
2692             (OperandSize::S16, ShiftKind::ShrS) => {
2693                 self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {
2694                     move_to_tmp_xmm(masm, tmp_xmm);
2695                     masm.asm.xmm_vpsra_rrr(
2696                         operand,
2697                         tmp_xmm.inner(),
2698                         writable!(operand),
2699                         OperandSize::S16,
2700                     );
2701                 })
2702             }
2703             (OperandSize::S32, ShiftKind::ShrS) => {
2704                 self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {
2705                     move_to_tmp_xmm(masm, tmp_xmm);
2706                     masm.asm.xmm_vpsra_rrr(
2707                         operand,
2708                         tmp_xmm.inner(),
2709                         writable!(operand),
2710                         OperandSize::S32,
2711                     );
2712                 })
2713             }
2714             (OperandSize::S64, ShiftKind::ShrS) => i64x2_shr_s(self, context)?,
2715 
2716             _ => bail!(CodeGenError::invalid_operand_combination()),
2717         }
2718 
2719         context.free_reg(shift_amount);
2720         context
2721             .stack
2722             .push(TypedReg::new(WasmValType::V128, operand).into());
2723         Ok(())
2724     }
2725 
v128_q15mulr_sat_s( &mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize, ) -> Result<()>2726     fn v128_q15mulr_sat_s(
2727         &mut self,
2728         lhs: Reg,
2729         rhs: Reg,
2730         dst: WritableReg,
2731         size: OperandSize,
2732     ) -> Result<()> {
2733         self.ensure_has_avx()?;
2734 
2735         self.asm.xmm_vpmulhrs_rrr(lhs, rhs, dst, size);
2736 
2737         // Need to handle edge case of multiplying -1 by -1 (0x8000 in Q15
2738         // format) because of how `vpmulhrs` handles rounding. `vpmulhrs`
2739         // produces 0x8000 in that case when the correct result is 0x7FFF (that
2740         // is, +1) so need to check if the result is 0x8000 and flip the bits
2741         // of the result if it is.
2742         let address = self.asm.add_constant(&[
2743             0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
2744             0x00, 0x80,
2745         ]);
2746         self.asm
2747             .xmm_vpcmpeq_rrm(writable!(rhs), dst.to_reg(), &address, size);
2748         self.asm.xmm_vpxor_rrr(dst.to_reg(), rhs, dst);
2749         Ok(())
2750     }
2751 
v128_all_true(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()>2752     fn v128_all_true(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
2753         self.ensure_has_avx()?;
2754 
2755         self.with_scratch::<FloatScratch, _>(|masm, scratch| {
2756             // Create a mask of all 0s.
2757             masm.asm
2758                 .xmm_vpxor_rrr(scratch.inner(), scratch.inner(), scratch.writable());
2759             // Sets lane in `dst` to not zero if `src` lane was zero, and lane in
2760             // `dst` to zero if `src` lane was not zero.
2761             masm.asm
2762                 .xmm_vpcmpeq_rrr(writable!(src), src, scratch.inner(), size);
2763             // Sets ZF if all values are zero (i.e., if all original values were not zero).
2764             masm.asm.xmm_vptest(src, src);
2765             // Set byte if ZF=1.
2766         });
2767         self.asm.setcc(IntCmpKind::Eq, dst);
2768         Ok(())
2769     }
2770 
v128_bitmask(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()>2771     fn v128_bitmask(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
2772         self.ensure_has_avx()?;
2773 
2774         match size {
2775             OperandSize::S8 => self.asm.xmm_vpmovmsk_rr(src, dst, size, OperandSize::S32),
2776             OperandSize::S16 => {
2777                 // Signed conversion of 16-bit integers to 8-bit integers.
2778                 self.asm
2779                     .xmm_vpackss_rrr(src, src, writable!(src), OperandSize::S8);
2780                 // Creates a mask from each byte in `src`.
2781                 self.asm
2782                     .xmm_vpmovmsk_rr(src, dst, OperandSize::S8, OperandSize::S32);
2783                 // Removes 8 bits added as a result of the `vpackss` step.
2784                 self.asm
2785                     .shift_ir(0x8, dst, ShiftKind::ShrU, OperandSize::S32);
2786             }
2787             OperandSize::S32 | OperandSize::S64 => {
2788                 self.asm.xmm_vmovskp_rr(src, dst, size, OperandSize::S32)
2789             }
2790             _ => unimplemented!(),
2791         }
2792 
2793         Ok(())
2794     }
2795 
v128_trunc( &mut self, context: &mut CodeGenContext<Emission>, kind: V128TruncKind, ) -> Result<()>2796     fn v128_trunc(
2797         &mut self,
2798         context: &mut CodeGenContext<Emission>,
2799         kind: V128TruncKind,
2800     ) -> Result<()> {
2801         self.ensure_has_avx()?;
2802 
2803         let reg = writable!(context.pop_to_reg(self, None)?.reg);
2804         match kind {
2805             V128TruncKind::F32x4 | V128TruncKind::F64x2 => self.asm.xmm_vroundp_rri(
2806                 reg.to_reg(),
2807                 reg,
2808                 VroundMode::TowardZero,
2809                 kind.dst_lane_size(),
2810             ),
2811             V128TruncKind::I32x4FromF32x4S => {
2812                 self.v128_trunc_sat_f32x4_s(reg, kind.src_lane_size(), kind.dst_lane_size())?;
2813             }
2814             V128TruncKind::I32x4FromF32x4U => {
2815                 let temp_reg = writable!(context.any_fpr(self)?);
2816                 self.v128_trunc_sat_f32x4_u(
2817                     reg,
2818                     temp_reg,
2819                     kind.src_lane_size(),
2820                     kind.dst_lane_size(),
2821                 )?;
2822                 context.free_reg(temp_reg.to_reg());
2823             }
2824             V128TruncKind::I32x4FromF64x2SZero => {
2825                 self.v128_trunc_sat_f64x2_s_zero(reg, kind.src_lane_size())?;
2826             }
2827             V128TruncKind::I32x4FromF64x2UZero => {
2828                 self.v128_trunc_sat_f64x2_u_zero(reg, kind.src_lane_size(), kind.dst_lane_size())?;
2829             }
2830         }
2831 
2832         context.stack.push(TypedReg::v128(reg.to_reg()).into());
2833         Ok(())
2834     }
2835 
v128_min( &mut self, src1: Reg, src2: Reg, dst: WritableReg, kind: V128MinKind, ) -> Result<()>2836     fn v128_min(
2837         &mut self,
2838         src1: Reg,
2839         src2: Reg,
2840         dst: WritableReg,
2841         kind: V128MinKind,
2842     ) -> Result<()> {
2843         self.ensure_has_avx()?;
2844 
2845         match kind {
2846             V128MinKind::I8x16S
2847             | V128MinKind::I8x16U
2848             | V128MinKind::I16x8S
2849             | V128MinKind::I16x8U
2850             | V128MinKind::I32x4S
2851             | V128MinKind::I32x4U => {
2852                 match kind {
2853                     V128MinKind::I8x16S => {
2854                         self.asm.xmm_vpmins_rrr(dst, src1, src2, OperandSize::S8)
2855                     }
2856                     V128MinKind::I8x16U => {
2857                         self.asm.xmm_vpminu_rrr(dst, src1, src2, OperandSize::S8)
2858                     }
2859                     V128MinKind::I16x8S => {
2860                         self.asm.xmm_vpmins_rrr(dst, src1, src2, OperandSize::S16)
2861                     }
2862                     V128MinKind::I16x8U => {
2863                         self.asm.xmm_vpminu_rrr(dst, src1, src2, OperandSize::S16)
2864                     }
2865                     V128MinKind::I32x4S => {
2866                         self.asm.xmm_vpmins_rrr(dst, src1, src2, OperandSize::S32)
2867                     }
2868                     V128MinKind::I32x4U => {
2869                         self.asm.xmm_vpminu_rrr(dst, src1, src2, OperandSize::S32)
2870                     }
2871                     _ => unreachable!(),
2872                 };
2873             }
2874             V128MinKind::F32x4 | V128MinKind::F64x2 => {
2875                 self.with_scratch::<FloatScratch, _>(|masm, scratch| {
2876                     // Handling +0 and -0 as well as NaN values are not commutative
2877                     // when using `vminp` so we have to compensate.
2878                     // Perform two comparison operations with the operands swapped
2879                     // and OR the result to propagate 0 (positive and negative) and
2880                     // NaN.
2881                     masm.asm
2882                         .xmm_vminp_rrr(src1, src2, scratch.writable(), kind.lane_size());
2883                     masm.asm.xmm_vminp_rrr(src2, src1, dst, kind.lane_size());
2884                     // Use a single OR instruction to set the sign bit if either
2885                     // result has the sign bit set to correctly propagate -0.
2886                     masm.asm
2887                         .xmm_vorp_rrr(dst.to_reg(), scratch.inner(), dst, kind.lane_size());
2888                 });
2889                 // Set lanes with NaN to all 1s.
2890                 self.asm.xmm_vcmpp_rrr(
2891                     writable!(src2),
2892                     src2,
2893                     dst.to_reg(),
2894                     kind.lane_size(),
2895                     VcmpKind::Unord,
2896                 );
2897                 // Doesn't change non-NaN values. For NaN values, sets all bits.
2898                 self.asm
2899                     .xmm_vorp_rrr(src2, dst.to_reg(), dst, kind.lane_size());
2900                 self.canonicalize_nans(writable!(src2), dst, kind.lane_size());
2901             }
2902         }
2903 
2904         Ok(())
2905     }
2906 
v128_max( &mut self, src1: Reg, src2: Reg, dst: WritableReg, kind: V128MaxKind, ) -> Result<()>2907     fn v128_max(
2908         &mut self,
2909         src1: Reg,
2910         src2: Reg,
2911         dst: WritableReg,
2912         kind: V128MaxKind,
2913     ) -> Result<()> {
2914         self.ensure_has_avx()?;
2915 
2916         match kind {
2917             V128MaxKind::I8x16S
2918             | V128MaxKind::I8x16U
2919             | V128MaxKind::I16x8S
2920             | V128MaxKind::I16x8U
2921             | V128MaxKind::I32x4S
2922             | V128MaxKind::I32x4U => {
2923                 match kind {
2924                     V128MaxKind::I8x16S => {
2925                         self.asm.xmm_vpmaxs_rrr(dst, src1, src2, OperandSize::S8)
2926                     }
2927                     V128MaxKind::I8x16U => {
2928                         self.asm.xmm_vpmaxu_rrr(dst, src1, src2, OperandSize::S8)
2929                     }
2930                     V128MaxKind::I16x8S => {
2931                         self.asm.xmm_vpmaxs_rrr(dst, src1, src2, OperandSize::S16)
2932                     }
2933                     V128MaxKind::I16x8U => {
2934                         self.asm.xmm_vpmaxu_rrr(dst, src1, src2, OperandSize::S16)
2935                     }
2936                     V128MaxKind::I32x4S => {
2937                         self.asm.xmm_vpmaxs_rrr(dst, src1, src2, OperandSize::S32)
2938                     }
2939                     V128MaxKind::I32x4U => {
2940                         self.asm.xmm_vpmaxu_rrr(dst, src1, src2, OperandSize::S32)
2941                     }
2942                     _ => unreachable!(),
2943                 };
2944             }
2945             V128MaxKind::F32x4 | V128MaxKind::F64x2 => {
2946                 self.with_scratch::<FloatScratch, _>(|masm, scratch| {
2947                     // Handling +0 and -0 as well as NaN values are not commutative
2948                     // when using `vmaxp` so we have to compensate.
2949                     // Perform two comparison operations with the operands swapped
2950                     // so we can propagate 0 (positive and negative) and NaNs
2951                     // correctly.
2952 
2953                     masm.asm
2954                         .xmm_vmaxp_rrr(src1, src2, scratch.writable(), kind.lane_size());
2955                     masm.asm.xmm_vmaxp_rrr(src2, src1, dst, kind.lane_size());
2956                     // This combination of XOR, OR, and SUB will set the sign bit
2957                     // on a 0 result to the correct value for a max operation.
2958                     masm.asm
2959                         .xmm_vxorp_rrr(dst.to_reg(), scratch.inner(), dst, kind.lane_size());
2960                     masm.asm.xmm_vorp_rrr(
2961                         dst.to_reg(),
2962                         scratch.inner(),
2963                         writable!(src2),
2964                         kind.lane_size(),
2965                     );
2966                 });
2967                 self.asm
2968                     .xmm_vsub_rrr(src2, dst.to_reg(), dst, kind.lane_size());
2969                 // Set lanes of NaN values to 1.
2970                 self.asm.xmm_vcmpp_rrr(
2971                     writable!(src2),
2972                     src2,
2973                     src2,
2974                     kind.lane_size(),
2975                     VcmpKind::Unord,
2976                 );
2977                 self.canonicalize_nans(writable!(src2), dst, kind.lane_size());
2978             }
2979         }
2980         Ok(())
2981     }
2982 
v128_extmul( &mut self, context: &mut CodeGenContext<Emission>, kind: V128ExtMulKind, ) -> Result<()>2983     fn v128_extmul(
2984         &mut self,
2985         context: &mut CodeGenContext<Emission>,
2986         kind: V128ExtMulKind,
2987     ) -> Result<()> {
2988         self.ensure_has_avx()?;
2989 
2990         // The implementation for extmul is not optimized; for simplicity's sake, we simply perform
2991         // an extension followed by a multiplication using already implemented primitives.
2992 
2993         let src1 = context.pop_to_reg(self, None)?;
2994         let src2 = context.pop_to_reg(self, None)?;
2995 
2996         let ext_kind = kind.into();
2997         self.v128_extend(src1.reg, writable!(src1.reg), ext_kind)?;
2998         self.v128_extend(src2.reg, writable!(src2.reg), ext_kind)?;
2999 
3000         context.stack.push(src2.into());
3001         context.stack.push(src1.into());
3002 
3003         self.v128_mul(context, kind.into())
3004     }
3005 
v128_extadd_pairwise( &mut self, src: Reg, dst: WritableReg, kind: V128ExtAddKind, ) -> Result<()>3006     fn v128_extadd_pairwise(
3007         &mut self,
3008         src: Reg,
3009         dst: WritableReg,
3010         kind: V128ExtAddKind,
3011     ) -> Result<()> {
3012         self.ensure_has_avx()?;
3013 
3014         match kind {
3015             V128ExtAddKind::I8x16S => {
3016                 self.with_scratch::<FloatScratch, _>(|masm, scratch| {
3017                     // Use `vpmaddubsw` with a vector of 16 8-bit 1's which will
3018                     // sign extend `src` to 16 bits and add adjacent words.
3019                     // Need to supply constant as first operand since first operand
3020                     // is treated as unsigned and the second operand is signed.
3021                     let mask = masm.asm.add_constant(&[1; 16]);
3022                     masm.asm.xmm_mov_mr(
3023                         &mask,
3024                         scratch.writable(),
3025                         OperandSize::S128,
3026                         MemFlags::trusted(),
3027                     );
3028                     masm.asm.xmm_vpmaddubsw_rrr(scratch.inner(), src, dst);
3029                 });
3030             }
3031             V128ExtAddKind::I8x16U => {
3032                 // Same approach as the signed variant but treat `src` as
3033                 // unsigned instead of signed by passing it as the first
3034                 // operand.
3035                 let mask = self.asm.add_constant(&[1; 16]);
3036                 self.asm.xmm_vpmaddubsw_rmr(src, &mask, dst);
3037             }
3038             V128ExtAddKind::I16x8S => {
3039                 // Similar approach to the two variants above. The vector is 8
3040                 // lanes of 16-bit 1's and `vpmaddwd` treats both operands as
3041                 // signed.
3042                 let mask = self
3043                     .asm
3044                     .add_constant(&[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]);
3045                 self.asm.xmm_vpmaddwd_rmr(src, &mask, dst);
3046             }
3047             V128ExtAddKind::I16x8U => {
3048                 // Similar approach as the signed variant.
3049                 // `vpmaddwd` operates on signed integers and the operand is
3050                 // unsigned so the operand needs to be converted to a signed
3051                 // format and than that process needs to be reversed after
3052                 // `vpmaddwd`.
3053                 // Flip the sign bit for 8 16-bit lanes.
3054                 let xor_mask = self.asm.add_constant(&[
3055                     0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
3056                     0x80, 0x00, 0x80,
3057                 ]);
3058                 self.asm.xmm_vpxor_rmr(src, &xor_mask, dst);
3059 
3060                 let madd_mask = self
3061                     .asm
3062                     .add_constant(&[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]);
3063                 self.asm.xmm_vpmaddwd_rmr(dst.to_reg(), &madd_mask, dst);
3064 
3065                 // Reverse the XOR. The XOR effectively subtracts 32,768 from
3066                 // both pairs that are added together so 65,536 (0x10000)
3067                 // needs to be added to 4 lanes of 32-bit values.
3068                 let add_mask = self
3069                     .asm
3070                     .add_constant(&[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0]);
3071                 self.asm
3072                     .xmm_vpadd_rmr(dst.to_reg(), &add_mask, dst, OperandSize::S32);
3073             }
3074         }
3075         Ok(())
3076     }
3077 
v128_dot(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg) -> Result<()>3078     fn v128_dot(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg) -> Result<()> {
3079         self.ensure_has_avx()?;
3080         self.asm.xmm_vpmaddwd_rrr(lhs, rhs, dst);
3081         Ok(())
3082     }
3083 
v128_popcnt(&mut self, context: &mut CodeGenContext<Emission>) -> Result<()>3084     fn v128_popcnt(&mut self, context: &mut CodeGenContext<Emission>) -> Result<()> {
3085         self.ensure_has_avx()?;
3086 
3087         let reg = writable!(context.pop_to_reg(self, None)?.reg);
3088         let reg2 = writable!(context.any_fpr(self)?);
3089 
3090         // This works by using a lookup table to determine the count of bits
3091         // set in the upper 4 bits and lower 4 bits separately and then adding
3092         // the counts.
3093 
3094         // A mask to zero out the upper 4 bits in each lane.
3095         let address = self.asm.add_constant(&[
3096             0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
3097             0x0F, 0x0F,
3098         ]);
3099 
3100         self.with_scratch::<FloatScratch, _>(|masm, scratch| {
3101             // Zero out the upper 4 bits of each lane.
3102             masm.asm
3103                 .xmm_vpand_rrm(reg.to_reg(), &address, scratch.writable());
3104             // Right shift bytes in input by 4 bits to put the upper 4 bits in the
3105             // lower 4 bits.
3106             masm.asm
3107                 .xmm_vpsrl_rri(reg.to_reg(), reg, 0x4, OperandSize::S16);
3108             // Zero out the upper 4 bits of each shifted lane.
3109             masm.asm.xmm_vpand_rrm(reg.to_reg(), &address, reg);
3110 
3111             // Write a lookup table of 4 bit values to number of bits set to a
3112             // register so we only perform the memory read once.
3113             // Index (hex) | Value (binary) | Population Count
3114             // 0x0         | 0000          | 0
3115             // 0x1         | 0001          | 1
3116             // 0x2         | 0010          | 1
3117             // 0x3         | 0011          | 2
3118             // 0x4         | 0100          | 1
3119             // 0x5         | 0101          | 2
3120             // 0x6         | 0110          | 2
3121             // 0x7         | 0111          | 3
3122             // 0x8         | 1000          | 1
3123             // 0x9         | 1001          | 2
3124             // 0xA         | 1010          | 2
3125             // 0xB         | 1011          | 3
3126             // 0xC         | 1100          | 2
3127             // 0xD         | 1101          | 3
3128             // 0xE         | 1110          | 3
3129             // 0xF         | 1111          | 4
3130             let address = masm.asm.add_constant(&[
3131                 0x0, 0x1, 0x1, 0x2, 0x1, 0x2, 0x2, 0x3, 0x1, 0x2, 0x2, 0x3, 0x2, 0x3, 0x3, 0x4,
3132             ]);
3133             masm.asm
3134                 .xmm_mov_mr(&address, reg2, OperandSize::S128, MemFlags::trusted());
3135             // Use the upper 4 bits as an index into the lookup table.
3136             masm.asm.xmm_vpshufb_rrr(reg, reg2.to_reg(), reg.to_reg());
3137             // Use the lower 4 bits as an index into the lookup table.
3138             masm.asm
3139                 .xmm_vpshufb_rrr(scratch.writable(), reg2.to_reg(), scratch.inner());
3140             context.free_reg(reg2.to_reg());
3141 
3142             // Add the counts of the upper 4 bits and the lower 4 bits to get the
3143             // total number of bits set.
3144             masm.asm
3145                 .xmm_vpadd_rrr(reg.to_reg(), scratch.inner(), reg, OperandSize::S8);
3146             wasmtime_environ::error::Ok(())
3147         })?;
3148 
3149         context.stack.push(TypedReg::v128(reg.to_reg()).into());
3150         Ok(())
3151     }
3152 
v128_avgr(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()>3153     fn v128_avgr(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
3154         self.ensure_has_avx()?;
3155         self.asm.xmm_vpavg_rrr(lhs, rhs, dst, size);
3156         Ok(())
3157     }
3158 
v128_div(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()>3159     fn v128_div(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
3160         self.ensure_has_avx()?;
3161         self.asm.xmm_vdivp_rrr(lhs, rhs, dst, size);
3162         Ok(())
3163     }
3164 
v128_sqrt(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()>3165     fn v128_sqrt(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
3166         self.ensure_has_avx()?;
3167         self.asm.xmm_vsqrtp_rr(src, dst, size);
3168         Ok(())
3169     }
3170 
v128_ceil(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()>3171     fn v128_ceil(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
3172         self.ensure_has_avx()?;
3173         self.asm
3174             .xmm_vroundp_rri(src, dst, VroundMode::TowardPositiveInfinity, size);
3175         Ok(())
3176     }
3177 
v128_floor(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()>3178     fn v128_floor(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
3179         self.ensure_has_avx()?;
3180         self.asm
3181             .xmm_vroundp_rri(src, dst, VroundMode::TowardNegativeInfinity, size);
3182         Ok(())
3183     }
3184 
v128_nearest(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()>3185     fn v128_nearest(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
3186         self.ensure_has_avx()?;
3187         self.asm
3188             .xmm_vroundp_rri(src, dst, VroundMode::TowardNearest, size);
3189         Ok(())
3190     }
3191 
v128_pmin(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()>3192     fn v128_pmin(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
3193         self.ensure_has_avx()?;
3194         // Reverse operands since Wasm specifies returning the first operand if
3195         // either operand is NaN while x86 returns the second operand.
3196         self.asm.xmm_vminp_rrr(rhs, lhs, dst, size);
3197         Ok(())
3198     }
3199 
v128_pmax(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()>3200     fn v128_pmax(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
3201         self.ensure_has_avx()?;
3202         // Reverse operands since Wasm specifies returning the first operand if
3203         // either operand is NaN while x86 returns the second operand.
3204         self.asm.xmm_vmaxp_rrr(rhs, lhs, dst, size);
3205         Ok(())
3206     }
3207 }
3208 
3209 impl MacroAssembler {
3210     /// Create an x64 MacroAssembler.
new( ptr_size: impl PtrSize, shared_flags: settings::Flags, isa_flags: x64_settings::Flags, ) -> Result<Self>3211     pub fn new(
3212         ptr_size: impl PtrSize,
3213         shared_flags: settings::Flags,
3214         isa_flags: x64_settings::Flags,
3215     ) -> Result<Self> {
3216         let ptr_type: WasmValType = ptr_type_from_ptr_size(ptr_size.size());
3217 
3218         Ok(Self {
3219             sp_offset: 0,
3220             sp_max: 0,
3221             stack_max_use_add: None,
3222             asm: Assembler::new(shared_flags.clone(), isa_flags.clone()),
3223             flags: isa_flags,
3224             shared_flags,
3225             ptr_size: ptr_type.try_into()?,
3226             scratch_scope: RegAlloc::from(scratch_gpr_bitset(), scratch_fpr_bitset()),
3227         })
3228     }
3229 
3230     /// Add the maximum stack used to a register, recording an obligation to update the
3231     /// add-with-immediate instruction emitted to use the real stack max when the masm is being
3232     /// finalized.
add_stack_max(&mut self, reg: Reg)3233     fn add_stack_max(&mut self, reg: Reg) {
3234         assert!(self.stack_max_use_add.is_none());
3235         let patch = PatchableAddToReg::new(reg, OperandSize::S64, &mut self.asm);
3236         self.stack_max_use_add.replace(patch);
3237     }
3238 
ensure_has_avx(&self) -> Result<()>3239     fn ensure_has_avx(&self) -> Result<()> {
3240         crate::ensure!(self.flags.has_avx(), CodeGenError::UnimplementedForNoAvx);
3241         Ok(())
3242     }
3243 
ensure_has_avx2(&self) -> Result<()>3244     fn ensure_has_avx2(&self) -> Result<()> {
3245         crate::ensure!(self.flags.has_avx2(), CodeGenError::UnimplementedForNoAvx2);
3246         Ok(())
3247     }
3248 
ensure_has_avx512vl(&self) -> Result<()>3249     fn ensure_has_avx512vl(&self) -> Result<()> {
3250         crate::ensure!(
3251             self.flags.has_avx512vl(),
3252             CodeGenError::UnimplementedForNoAvx512VL
3253         );
3254         Ok(())
3255     }
3256 
ensure_has_avx512dq(&self) -> Result<()>3257     fn ensure_has_avx512dq(&self) -> Result<()> {
3258         crate::ensure!(
3259             self.flags.has_avx512dq(),
3260             CodeGenError::UnimplementedForNoAvx512DQ
3261         );
3262         Ok(())
3263     }
3264 
increment_sp(&mut self, bytes: u32)3265     fn increment_sp(&mut self, bytes: u32) {
3266         self.sp_offset += bytes;
3267 
3268         // NOTE: we use `max` here to track the largest stack allocation in `sp_max`. Once we have
3269         // seen the entire function, this value will represent the maximum size for the stack
3270         // frame.
3271         self.sp_max = self.sp_max.max(self.sp_offset);
3272     }
3273 
decrement_sp(&mut self, bytes: u32)3274     fn decrement_sp(&mut self, bytes: u32) {
3275         assert!(
3276             self.sp_offset >= bytes,
3277             "sp offset = {}; bytes = {}",
3278             self.sp_offset,
3279             bytes
3280         );
3281         self.sp_offset -= bytes;
3282     }
3283 
load_constant(&mut self, constant: &I, dst: WritableReg, size: OperandSize) -> Result<()>3284     fn load_constant(&mut self, constant: &I, dst: WritableReg, size: OperandSize) -> Result<()> {
3285         match constant {
3286             I::I32(v) => Ok(self.asm.mov_ir(*v as u64, dst, size)),
3287             I::I64(v) => Ok(self.asm.mov_ir(*v, dst, size)),
3288             I::F32(_) => Ok(self.asm.load_fp_const(dst, &constant.to_bytes(), size)),
3289             I::F64(_) => Ok(self.asm.load_fp_const(dst, &constant.to_bytes(), size)),
3290             I::V128(_) => Ok(self.asm.load_fp_const(dst, &constant.to_bytes(), size)),
3291         }
3292     }
3293 
3294     /// A common implementation for zero-extend stack loads.
load_impl( &mut self, src: Address, dst: WritableReg, size: OperandSize, flags: MemFlags, ) -> Result<()>3295     fn load_impl(
3296         &mut self,
3297         src: Address,
3298         dst: WritableReg,
3299         size: OperandSize,
3300         flags: MemFlags,
3301     ) -> Result<()> {
3302         if dst.to_reg().is_int() {
3303             let ext = size.extend_to::<Zero>(OperandSize::S64);
3304             self.asm.movzx_mr(&src, dst, ext, flags);
3305         } else {
3306             self.asm.xmm_mov_mr(&src, dst, size, flags);
3307         }
3308 
3309         Ok(())
3310     }
3311 
3312     /// A common implementation for stack stores.
store_impl( &mut self, src: RegImm, dst: Address, size: OperandSize, flags: MemFlags, ) -> Result<()>3313     fn store_impl(
3314         &mut self,
3315         src: RegImm,
3316         dst: Address,
3317         size: OperandSize,
3318         flags: MemFlags,
3319     ) -> Result<()> {
3320         let _ = match src {
3321             RegImm::Imm(imm) => match imm {
3322                 I::I32(v) => self.asm.mov_im(v as i32, &dst, size, flags),
3323                 I::I64(v) => match v.try_into() {
3324                     Ok(v) => self.asm.mov_im(v, &dst, size, flags),
3325                     Err(_) => {
3326                         // If the immediate doesn't sign extend, use a scratch
3327                         // register.
3328                         self.with_scratch::<IntScratch, _>(|masm, scratch| {
3329                             masm.asm.mov_ir(v, scratch.writable(), size);
3330                             masm.asm.mov_rm(scratch.inner(), &dst, size, flags);
3331                         });
3332                     }
3333                 },
3334                 I::F32(v) => {
3335                     let addr = self.asm.add_constant(v.to_le_bytes().as_slice());
3336                     self.with_scratch::<FloatScratch, _>(|masm, float_scratch| {
3337                         // Always trusted, since we are loading the constant from
3338                         // the constant pool.
3339                         masm.asm.xmm_mov_mr(
3340                             &addr,
3341                             float_scratch.writable(),
3342                             size,
3343                             MemFlags::trusted(),
3344                         );
3345                         masm.asm
3346                             .xmm_mov_rm(float_scratch.inner(), &dst, size, flags);
3347                     });
3348                 }
3349                 I::F64(v) => {
3350                     let addr = self.asm.add_constant(v.to_le_bytes().as_slice());
3351 
3352                     self.with_scratch::<FloatScratch, _>(|masm, float_scratch| {
3353                         // Similar to above, always trusted since we are loading the
3354                         // constant from the constant pool.
3355                         masm.asm.xmm_mov_mr(
3356                             &addr,
3357                             float_scratch.writable(),
3358                             size,
3359                             MemFlags::trusted(),
3360                         );
3361                         masm.asm
3362                             .xmm_mov_rm(float_scratch.inner(), &dst, size, flags);
3363                     });
3364                 }
3365                 I::V128(v) => {
3366                     let addr = self.asm.add_constant(v.to_le_bytes().as_slice());
3367                     self.with_scratch::<FloatScratch, _>(|masm, vector_scratch| {
3368                         // Always trusted, since we are loading the constant from
3369                         // the constant pool.
3370                         masm.asm.xmm_mov_mr(
3371                             &addr,
3372                             vector_scratch.writable(),
3373                             size,
3374                             MemFlags::trusted(),
3375                         );
3376                         masm.asm
3377                             .xmm_mov_rm(vector_scratch.inner(), &dst, size, flags);
3378                     });
3379                 }
3380             },
3381             RegImm::Reg(reg) => {
3382                 if reg.is_int() {
3383                     self.asm.mov_rm(reg, &dst, size, flags);
3384                 } else {
3385                     self.asm.xmm_mov_rm(reg, &dst, size, flags);
3386                 }
3387             }
3388         };
3389         Ok(())
3390     }
3391 
ensure_two_argument_form(dst: &Reg, lhs: &Reg) -> Result<()>3392     fn ensure_two_argument_form(dst: &Reg, lhs: &Reg) -> Result<()> {
3393         if dst != lhs {
3394             Err(format_err!(CodeGenError::invalid_two_arg_form()))
3395         } else {
3396             Ok(())
3397         }
3398     }
3399 
3400     /// The mask to use when performing a `vpshuf` operation for a 64-bit splat.
vpshuf_mask_for_64_bit_splats() -> u83401     fn vpshuf_mask_for_64_bit_splats() -> u8 {
3402         // Results in the first 4 bytes and second 4 bytes being
3403         // swapped and then the swapped bytes being copied.
3404         // [d0, d1, d2, d3, d4, d5, d6, d7, ...] yields
3405         // [d4, d5, d6, d7, d0, d1, d2, d3, d4, d5, d6, d7, d0, d1, d2, d3].
3406         0b01_00_01_00
3407     }
3408 
v128_trunc_sat_f32x4_s( &mut self, reg: WritableReg, src_lane_size: OperandSize, dst_lane_size: OperandSize, ) -> Result<()>3409     fn v128_trunc_sat_f32x4_s(
3410         &mut self,
3411         reg: WritableReg,
3412         src_lane_size: OperandSize,
3413         dst_lane_size: OperandSize,
3414     ) -> Result<()> {
3415         self.with_scratch::<FloatScratch, _>(|masm, scratch| {
3416             // Create a mask to handle NaN values (1 for not NaN, 0 for
3417             // NaN).
3418             masm.asm.xmm_vcmpp_rrr(
3419                 scratch.writable(),
3420                 reg.to_reg(),
3421                 reg.to_reg(),
3422                 src_lane_size,
3423                 VcmpKind::Eq,
3424             );
3425             // Zero out any NaN values.
3426             masm.asm
3427                 .xmm_vandp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size);
3428             // Create a mask for the sign bits.
3429             masm.asm
3430                 .xmm_vpxor_rrr(scratch.inner(), reg.to_reg(), scratch.writable());
3431             // Convert floats to integers.
3432             masm.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F32ToI32);
3433             // Apply sign mask to the converted integers.
3434             masm.asm
3435                 .xmm_vpand_rrr(reg.to_reg(), scratch.inner(), scratch.writable());
3436             // Create a saturation mask of all 1s for negative numbers,
3437             // all 0s for positive numbers. The arithmetic shift will cop
3438             // the sign bit.
3439             masm.asm
3440                 .xmm_vpsra_rri(scratch.inner(), scratch.writable(), 0x1F, dst_lane_size);
3441             // Combine converted integers with saturation mask.
3442             masm.asm.xmm_vpxor_rrr(reg.to_reg(), scratch.inner(), reg);
3443             Ok(())
3444         })
3445     }
3446 
    /// Emits the instruction sequence for a saturating truncation of the
    /// float lanes in `reg` to unsigned 32-bit integers, leaving the result in
    /// `reg`.
    ///
    /// `temp_reg` is overwritten: it holds the intermediate signed conversion
    /// of the clamped input until the final add. A float scratch register is
    /// borrowed for every step except that final add.
    ///
    /// `src_lane_size` and `dst_lane_size` are forwarded unchanged as the lane
    /// size of the individual instructions.
    fn v128_trunc_sat_f32x4_u(
        &mut self,
        reg: WritableReg,
        temp_reg: WritableReg,
        src_lane_size: OperandSize,
        dst_lane_size: OperandSize,
    ) -> Result<()> {
        self.with_scratch::<FloatScratch, _>(|masm, scratch| {
            // Set scratch to all zeros by XORing `reg` with itself; `reg`
            // is only read here.
            masm.asm.xmm_vxorp_rrr(
                reg.to_reg(),
                reg.to_reg(),
                scratch.writable(),
                src_lane_size,
            );
            // Clamp negative numbers to 0.
            masm.asm
                .xmm_vmaxp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size);
            // Create a vector of all 1s by comparing scratch with itself
            // for equality.
            masm.asm.xmm_vpcmpeq_rrr(
                scratch.writable(),
                scratch.inner(),
                scratch.inner(),
                src_lane_size,
            );
            // Set scratch to 0x7FFFFFFF (max signed 32-bit integer) by
            // performing a logical shift right by one bit.
            masm.asm
                .xmm_vpsrl_rri(scratch.inner(), scratch.writable(), 0x1, src_lane_size);
            // Convert max signed int to float as a reference point for
            // saturation.
            masm.asm
                .xmm_vcvt_rr(scratch.inner(), scratch.writable(), VcvtKind::I32ToF32);
            // Convert the floats to integers and put the results in
            // `temp_reg`. This conversion is signed and not unsigned so we
            // need to handle the value for the high bit in each lane.
            masm.asm
                .xmm_vcvt_rr(reg.to_reg(), temp_reg, VcvtKind::F32ToI32);
            // Set `reg` lanes to the amount that the value in the lane
            // exceeds the maximum signed 32-bit integer.
            masm.asm
                .xmm_vsub_rrr(reg.to_reg(), scratch.inner(), reg, dst_lane_size);
            // Create mask in `scratch` for numbers that are larger than
            // the maximum signed 32-bit integer. Lanes that don't fit
            // in a signed 32-bit int will be all 1s.
            masm.asm.xmm_vcmpp_rrr(
                scratch.writable(),
                scratch.inner(),
                reg.to_reg(),
                dst_lane_size,
                VcmpKind::Le,
            );
            // Convert the excess over signed 32-bits from floats to integers.
            masm.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F32ToI32);
            // Apply large number mask to excess values which will flip the
            // bits in any lanes that exceed signed 32-bits. Adding this
            // flipped value to the signed value will set the high bit and
            // the carry behavior will update the other bits correctly.
            // The result is written to `scratch`.
            masm.asm
                .xmm_vpxor_rrr(reg.to_reg(), scratch.inner(), scratch.writable());
            // Set `reg` to all 0s.
            masm.asm.xmm_vpxor_rrr(reg.to_reg(), reg.to_reg(), reg);
            // Ensure excess values are not negative by taking max b/w
            // excess values (in `scratch`) and zero (in `reg`); the result
            // is written to `reg`.
            masm.asm
                .xmm_vpmaxs_rrr(reg, scratch.inner(), reg.to_reg(), dst_lane_size);
        });
        // The scratch register is no longer needed past this point.
        // Perform the addition between the signed conversion value (in
        // `temp_reg`) and the flipped excess value (in `reg`) to get the
        // unsigned value.
        self.asm
            .xmm_vpadd_rrr(reg.to_reg(), temp_reg.to_reg(), reg, dst_lane_size);
        Ok(())
    }
3520 
v128_trunc_sat_f64x2_s_zero( &mut self, reg: WritableReg, src_lane_size: OperandSize, ) -> Result<()>3521     fn v128_trunc_sat_f64x2_s_zero(
3522         &mut self,
3523         reg: WritableReg,
3524         src_lane_size: OperandSize,
3525     ) -> Result<()> {
3526         self.with_scratch::<FloatScratch, _>(|masm, scratch| {
3527             // Create a NaN mask (1s for non-NaN, 0s for NaN).
3528             masm.asm.xmm_vcmpp_rrr(
3529                 scratch.writable(),
3530                 reg.to_reg(),
3531                 reg.to_reg(),
3532                 src_lane_size,
3533                 VcmpKind::Eq,
3534             );
3535             // Clamp NaN values to maximum 64-bit float that can be
3536             // converted to an i32.
3537             let address = masm.asm.add_constant(&[
3538                 0x00, 0x00, 0xC0, 0xFF, 0xFF, 0xFF, 0xDF, 0x41, 0x00, 0x00, 0xC0, 0xFF, 0xFF, 0xFF,
3539                 0xDF, 0x41,
3540             ]);
3541             masm.asm
3542                 .xmm_vandp_rrm(scratch.inner(), &address, scratch.writable(), src_lane_size);
3543             // Handle the saturation for values too large to fit in an i32.
3544             masm.asm
3545                 .xmm_vminp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size);
3546             // Convert the floats to integers.
3547             masm.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F64ToI32);
3548 
3549             Ok(())
3550         })
3551     }
3552 
v128_trunc_sat_f64x2_u_zero( &mut self, reg: WritableReg, src_lane_size: OperandSize, dst_lane_size: OperandSize, ) -> Result<()>3553     fn v128_trunc_sat_f64x2_u_zero(
3554         &mut self,
3555         reg: WritableReg,
3556         src_lane_size: OperandSize,
3557         dst_lane_size: OperandSize,
3558     ) -> Result<()> {
3559         self.with_scratch::<FloatScratch, _>(|masm, scratch| {
3560             // Zero out the scratch register.
3561             masm.asm.xmm_vxorp_rrr(
3562                 scratch.inner(),
3563                 scratch.inner(),
3564                 scratch.writable(),
3565                 src_lane_size,
3566             );
3567             // Clamp negative values to zero.
3568             masm.asm
3569                 .xmm_vmaxp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size);
3570             // Clamp value to maximum unsigned 32-bit integer value
3571             // (0x41F0000000000000).
3572             let address = masm.asm.add_constant(&[
3573                 0x00, 0x00, 0xE0, 0xFF, 0xFF, 0xFF, 0xEF, 0x41, 0x00, 0x00, 0xE0, 0xFF, 0xFF, 0xFF,
3574                 0xEF, 0x41,
3575             ]);
3576             masm.asm
3577                 .xmm_vminp_rrm(reg.to_reg(), &address, reg, src_lane_size);
3578             // Truncate floating point values.
3579             masm.asm
3580                 .xmm_vroundp_rri(reg.to_reg(), reg, VroundMode::TowardZero, src_lane_size);
3581             // Add 2^52 (doubles store 52 bits in their mantissa) to each
3582             // lane causing values in the lower bits to be shifted into
3583             // position for integer conversion.
3584             let address = masm.asm.add_constant(&[
3585                 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
3586                 0x30, 0x43,
3587             ]);
3588             masm.asm
3589                 .xmm_vaddp_rrm(reg.to_reg(), &address, reg, src_lane_size);
3590             // Takes lanes 0 and 2 from `reg` (converted values) and lanes
3591             // 0 and 2 from `scratch` (zeroes) to put the converted ints in
3592             // the lower lanes and zeroes in the upper lanes.
3593             masm.asm.xmm_vshufp_rrri(
3594                 reg.to_reg(),
3595                 scratch.inner(),
3596                 reg,
3597                 0b10_00_10_00,
3598                 dst_lane_size,
3599             );
3600             Ok(())
3601         })
3602     }
3603 
3604     /// Given a vector of floats where lanes with NaN values are set to all 1s
3605     /// in `reg` and a vector register `dst` with a mix of non-NaN values and
3606     /// possibly non-canonical NaN values, this canonicalize any NaNs in `dst`.
canonicalize_nans(&mut self, mask: WritableReg, dst: WritableReg, size: OperandSize)3607     fn canonicalize_nans(&mut self, mask: WritableReg, dst: WritableReg, size: OperandSize) {
3608         // Canonical NaNs do not preserve the sign bit, have the exponent bits
3609         // all set, and have only the high bit of the mantissa set so shift by
3610         // that number.
3611         // The mask we're producing in this step will be inverted in the next
3612         // step.
3613         let amount_to_shift = 1 + size.mantissa_bits() + 1;
3614         self.asm
3615             .xmm_vpsrl_rri(mask.to_reg(), mask, amount_to_shift as u32, size);
3616         // The mask will be inverted by the ANDN so non-NaN values will be all
3617         // 1s and NaN values will set the sign bit, exponent bits, and zero out
3618         // almost all of the mantissa.
3619         self.asm
3620             .xmm_vandnp_rrr(mask.to_reg(), dst.to_reg(), dst, size);
3621     }
3622 }
3623