1 use super::{ 2 RegAlloc, 3 abi::X64ABI, 4 address::Address, 5 asm::{Assembler, PatchableAddToReg, VcmpKind, VcvtKind, VroundMode}, 6 regs::{self, rbp, rsp, scratch_fpr_bitset, scratch_gpr_bitset}, 7 }; 8 use crate::masm::{ 9 DivKind, Extend, ExtendKind, ExtractLaneKind, FloatCmpKind, FloatScratch, Imm, Imm as I, 10 IntCmpKind, IntScratch, LaneSelector, LoadKind, MacroAssembler as Masm, MulWideKind, 11 OperandSize, RegImm, RemKind, ReplaceLaneKind, RmwOp, RoundingMode, Scratch, ScratchType, 12 ShiftKind, SplatKind, StoreKind, TRUSTED_FLAGS, TrapCode, TruncKind, UNTRUSTED_FLAGS, 13 V128AbsKind, V128AddKind, V128ConvertKind, V128ExtAddKind, V128ExtMulKind, V128ExtendKind, 14 V128MaxKind, V128MinKind, V128MulKind, V128NarrowKind, V128NegKind, V128SubKind, V128TruncKind, 15 VectorCompareKind, VectorEqualityKind, Zero, 16 }; 17 use crate::{ 18 Result, 19 abi::{self, LocalSlot, align_to, calculate_frame_adjustment}, 20 bail, 21 codegen::{CodeGenContext, CodeGenError, Emission, FuncEnv, ptr_type_from_ptr_size}, 22 format_err, 23 stack::{TypedReg, Val}, 24 }; 25 use crate::{ 26 abi::{ABI, vmctx}, 27 masm::{SPOffset, StackSlot}, 28 }; 29 use crate::{ 30 isa::{ 31 CallingConvention, 32 reg::{Reg, RegClass, WritableReg, writable}, 33 }, 34 masm::CalleeKind, 35 }; 36 use cranelift_codegen::{ 37 Final, MachBufferFinalized, MachLabel, 38 binemit::CodeOffset, 39 ir::{MemFlags, RelSourceLoc, SourceLoc}, 40 isa::{ 41 unwind::UnwindInst, 42 x64::{AtomicRmwSeqOp, args::CC, settings as x64_settings}, 43 }, 44 settings, 45 }; 46 use wasmtime_cranelift::TRAP_UNREACHABLE; 47 use wasmtime_environ::{PtrSize, WasmValType}; 48 49 // Taken from `cranelift/codegen/src/isa/x64/lower/isle.rs` 50 // Since x64 doesn't have 8x16 shifts and we must use a 16x8 shift instead, we 51 // need to fix up the bits that migrate from one half of the lane to the 52 // other. Each 16-byte mask is indexed by the shift amount: e.g. 
if we shift 53 // right by 0 (no movement), we want to retain all the bits so we mask with 54 // `0xff`; if we shift right by 1, we want to retain all bits except the MSB so 55 // we mask with `0x7f`; etc. 56 57 #[rustfmt::skip] // Preserve 16 bytes (i.e. one mask) per row. 58 const I8X16_ISHL_MASKS: [u8; 128] = [ 59 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 60 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 61 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 62 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 63 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 64 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 65 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 66 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 67 ]; 68 69 #[rustfmt::skip] // Preserve 16 bytes (i.e. one mask) per row. 
70 const I8X16_USHR_MASKS: [u8; 128] = [ 71 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 72 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 73 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 74 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 75 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 76 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 77 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 78 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 79 ]; 80 81 /// x64 MacroAssembler. 82 pub(crate) struct MacroAssembler { 83 /// Stack pointer offset. 84 sp_offset: u32, 85 /// This value represents the maximum stack size seen while compiling the function. While the 86 /// function is still being compiled its value will not be valid (the stack will grow and 87 /// shrink as space is reserved and freed during compilation), but once all instructions have 88 /// been seen this value will be the maximum stack usage seen. 89 sp_max: u32, 90 /// Add instructions that are used to add the constant stack max to a register. 91 stack_max_use_add: Option<PatchableAddToReg>, 92 /// Low level assembler. 93 asm: Assembler, 94 /// ISA flags. 95 flags: x64_settings::Flags, 96 /// Shared flags.vmcontext_store_context 97 shared_flags: settings::Flags, 98 /// The target pointer size. 99 ptr_size: OperandSize, 100 /// Scratch register scope. 
101 scratch_scope: RegAlloc, 102 } 103 104 impl Masm for MacroAssembler { 105 type Address = Address; 106 type Ptr = u8; 107 type ABI = X64ABI; 108 frame_setup(&mut self) -> Result<()>109 fn frame_setup(&mut self) -> Result<()> { 110 let frame_pointer = rbp(); 111 let stack_pointer = rsp(); 112 113 self.asm.push_r(frame_pointer); 114 115 if self.shared_flags.unwind_info() { 116 self.asm.unwind_inst(UnwindInst::PushFrameRegs { 117 offset_upward_to_caller_sp: Self::ABI::arg_base_offset().into(), 118 }) 119 } 120 121 self.asm 122 .mov_rr(stack_pointer, writable!(frame_pointer), OperandSize::S64); 123 124 Ok(()) 125 } 126 check_stack(&mut self, vmctx: Reg) -> Result<()>127 fn check_stack(&mut self, vmctx: Reg) -> Result<()> { 128 let ptr_size: u8 = self.ptr_size.bytes().try_into().unwrap(); 129 130 self.with_scratch::<IntScratch, _>(|masm, scratch| { 131 masm.load_ptr( 132 masm.address_at_reg(vmctx, ptr_size.vmcontext_store_context().into())?, 133 scratch.writable(), 134 )?; 135 136 masm.load_ptr( 137 Address::offset( 138 scratch.inner(), 139 ptr_size.vmstore_context_stack_limit().into(), 140 ), 141 scratch.writable(), 142 )?; 143 144 masm.add_stack_max(scratch.inner()); 145 146 masm.asm.cmp_rr(scratch.inner(), regs::rsp(), masm.ptr_size); 147 masm.asm.trapif(IntCmpKind::GtU, TrapCode::STACK_OVERFLOW); 148 wasmtime_environ::error::Ok(()) 149 })?; 150 151 // Emit unwind info. 152 if self.shared_flags.unwind_info() { 153 self.asm.unwind_inst(UnwindInst::DefineNewFrame { 154 offset_upward_to_caller_sp: Self::ABI::arg_base_offset().into(), 155 156 // The Winch calling convention has no callee-save registers, so nothing will be 157 // clobbered. 
158 offset_downward_to_clobbers: 0, 159 }) 160 } 161 Ok(()) 162 } 163 push(&mut self, reg: Reg, size: OperandSize) -> Result<StackSlot>164 fn push(&mut self, reg: Reg, size: OperandSize) -> Result<StackSlot> { 165 let bytes = match (reg.class(), size) { 166 (RegClass::Int, OperandSize::S64) => { 167 let word_bytes = <Self::ABI as ABI>::word_bytes() as u32; 168 self.asm.push_r(reg); 169 self.increment_sp(word_bytes); 170 word_bytes 171 } 172 (RegClass::Int, OperandSize::S32) => { 173 let bytes = size.bytes(); 174 self.reserve_stack(bytes)?; 175 let sp_offset = SPOffset::from_u32(self.sp_offset); 176 self.asm 177 .mov_rm(reg, &self.address_from_sp(sp_offset)?, size, TRUSTED_FLAGS); 178 bytes 179 } 180 (RegClass::Float, _) => { 181 let bytes = size.bytes(); 182 self.reserve_stack(bytes)?; 183 let sp_offset = SPOffset::from_u32(self.sp_offset); 184 self.asm 185 .xmm_mov_rm(reg, &self.address_from_sp(sp_offset)?, size, TRUSTED_FLAGS); 186 bytes 187 } 188 _ => unreachable!(), 189 }; 190 191 Ok(StackSlot { 192 offset: SPOffset::from_u32(self.sp_offset), 193 size: bytes, 194 }) 195 } 196 reserve_stack(&mut self, bytes: u32) -> Result<()>197 fn reserve_stack(&mut self, bytes: u32) -> Result<()> { 198 if bytes == 0 { 199 return Ok(()); 200 } 201 202 self.asm 203 .sub_ir(bytes as i32, writable!(rsp()), OperandSize::S64); 204 self.increment_sp(bytes); 205 206 Ok(()) 207 } 208 free_stack(&mut self, bytes: u32) -> Result<()>209 fn free_stack(&mut self, bytes: u32) -> Result<()> { 210 if bytes == 0 { 211 return Ok(()); 212 } 213 self.asm 214 .add_ir(bytes as i32, writable!(rsp()), OperandSize::S64); 215 self.decrement_sp(bytes); 216 217 Ok(()) 218 } 219 reset_stack_pointer(&mut self, offset: SPOffset) -> Result<()>220 fn reset_stack_pointer(&mut self, offset: SPOffset) -> Result<()> { 221 self.sp_offset = offset.as_u32(); 222 223 Ok(()) 224 } 225 local_address(&mut self, local: &LocalSlot) -> Result<Address>226 fn local_address(&mut self, local: &LocalSlot) -> Result<Address> { 
227 let (reg, offset) = if local.addressed_from_sp() { 228 let offset = self 229 .sp_offset 230 .checked_sub(local.offset) 231 .ok_or_else(|| CodeGenError::invalid_local_offset())?; 232 (rsp(), offset) 233 } else { 234 (rbp(), local.offset) 235 }; 236 237 Ok(Address::offset(reg, offset)) 238 } 239 address_from_sp(&self, offset: SPOffset) -> Result<Self::Address>240 fn address_from_sp(&self, offset: SPOffset) -> Result<Self::Address> { 241 Ok(Address::offset( 242 regs::rsp(), 243 self.sp_offset - offset.as_u32(), 244 )) 245 } 246 address_at_sp(&self, offset: SPOffset) -> Result<Self::Address>247 fn address_at_sp(&self, offset: SPOffset) -> Result<Self::Address> { 248 Ok(Address::offset(regs::rsp(), offset.as_u32())) 249 } 250 address_at_vmctx(&self, offset: u32) -> Result<Self::Address>251 fn address_at_vmctx(&self, offset: u32) -> Result<Self::Address> { 252 Ok(Address::offset(vmctx!(Self), offset)) 253 } 254 store_ptr(&mut self, src: Reg, dst: Self::Address) -> Result<()>255 fn store_ptr(&mut self, src: Reg, dst: Self::Address) -> Result<()> { 256 self.store(src.into(), dst, self.ptr_size) 257 } 258 store(&mut self, src: RegImm, dst: Address, size: OperandSize) -> Result<()>259 fn store(&mut self, src: RegImm, dst: Address, size: OperandSize) -> Result<()> { 260 self.store_impl(src, dst, size, TRUSTED_FLAGS) 261 } 262 wasm_store(&mut self, src: Reg, dst: Self::Address, kind: StoreKind) -> Result<()>263 fn wasm_store(&mut self, src: Reg, dst: Self::Address, kind: StoreKind) -> Result<()> { 264 match kind { 265 StoreKind::Operand(size) => { 266 self.store_impl(src.into(), dst, size, UNTRUSTED_FLAGS)?; 267 } 268 StoreKind::Atomic(size) => { 269 if size == OperandSize::S128 { 270 // TODO: we don't support 128-bit atomic store yet. 271 bail!(CodeGenError::unexpected_operand_size()); 272 } 273 // To stay consistent with cranelift, we emit a normal store followed by a mfence, 274 // although, we could probably just emit a xchg. 
275 self.store_impl(src.into(), dst, size, UNTRUSTED_FLAGS)?; 276 self.asm.mfence(); 277 } 278 StoreKind::VectorLane(LaneSelector { lane, size }) => { 279 self.ensure_has_avx()?; 280 self.asm 281 .xmm_vpextr_rm(&dst, src, lane, size, UNTRUSTED_FLAGS); 282 } 283 } 284 285 Ok(()) 286 } 287 pop(&mut self, dst: WritableReg, size: OperandSize) -> Result<()>288 fn pop(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> { 289 let current_sp = SPOffset::from_u32(self.sp_offset); 290 let _ = match (dst.to_reg().class(), size) { 291 (RegClass::Int, OperandSize::S32) => { 292 let addr = self.address_from_sp(current_sp)?; 293 self.asm.movzx_mr( 294 &addr, 295 dst, 296 size.extend_to::<Zero>(OperandSize::S64), 297 TRUSTED_FLAGS, 298 ); 299 self.free_stack(size.bytes())?; 300 } 301 (RegClass::Int, OperandSize::S64) => { 302 self.asm.pop_r(dst); 303 self.decrement_sp(<Self::ABI as ABI>::word_bytes() as u32); 304 } 305 (RegClass::Float, _) | (RegClass::Vector, _) => { 306 let addr = self.address_from_sp(current_sp)?; 307 self.asm.xmm_mov_mr(&addr, dst, size, TRUSTED_FLAGS); 308 self.free_stack(size.bytes())?; 309 } 310 _ => bail!(CodeGenError::invalid_operand_combination()), 311 }; 312 Ok(()) 313 } 314 with_scratch<T: ScratchType, R>(&mut self, f: impl FnOnce(&mut Self, Scratch) -> R) -> R315 fn with_scratch<T: ScratchType, R>(&mut self, f: impl FnOnce(&mut Self, Scratch) -> R) -> R { 316 let r = self 317 .scratch_scope 318 .reg_for_class(T::reg_class(), &mut |_| Ok(())) 319 .expect("Scratch register to be available"); 320 321 let ret = f(self, Scratch::new(r)); 322 self.scratch_scope.free(r); 323 ret 324 } 325 call( &mut self, stack_args_size: u32, mut load_callee: impl FnMut(&mut Self) -> Result<(CalleeKind, CallingConvention)>, ) -> Result<u32>326 fn call( 327 &mut self, 328 stack_args_size: u32, 329 mut load_callee: impl FnMut(&mut Self) -> Result<(CalleeKind, CallingConvention)>, 330 ) -> Result<u32> { 331 let alignment: u32 = <Self::ABI as 
abi::ABI>::call_stack_align().into(); 332 let addend: u32 = <Self::ABI as abi::ABI>::initial_frame_size().into(); 333 let delta = calculate_frame_adjustment(self.sp_offset()?.as_u32(), addend, alignment); 334 let aligned_args_size = align_to(stack_args_size, alignment); 335 let total_stack = delta + aligned_args_size; 336 self.reserve_stack(total_stack)?; 337 let (callee, cc) = load_callee(self)?; 338 match callee { 339 CalleeKind::Indirect(reg) => self.asm.call_with_reg(cc, reg), 340 CalleeKind::Direct(idx) => self.asm.call_with_name(cc, idx), 341 }; 342 Ok(total_stack) 343 } 344 load_ptr(&mut self, src: Self::Address, dst: WritableReg) -> Result<()>345 fn load_ptr(&mut self, src: Self::Address, dst: WritableReg) -> Result<()> { 346 self.load(src, dst, self.ptr_size) 347 } 348 compute_addr( &mut self, src: Self::Address, dst: WritableReg, size: OperandSize, ) -> Result<()>349 fn compute_addr( 350 &mut self, 351 src: Self::Address, 352 dst: WritableReg, 353 size: OperandSize, 354 ) -> Result<()> { 355 self.asm.lea(&src, dst, size); 356 Ok(()) 357 } 358 load(&mut self, src: Address, dst: WritableReg, size: OperandSize) -> Result<()>359 fn load(&mut self, src: Address, dst: WritableReg, size: OperandSize) -> Result<()> { 360 self.load_impl(src, dst, size, TRUSTED_FLAGS) 361 } 362 wasm_load(&mut self, src: Self::Address, dst: WritableReg, kind: LoadKind) -> Result<()>363 fn wasm_load(&mut self, src: Self::Address, dst: WritableReg, kind: LoadKind) -> Result<()> { 364 let size = kind.derive_operand_size(); 365 366 match kind { 367 LoadKind::ScalarExtend(ext) => match ext { 368 ExtendKind::Signed(ext) => { 369 self.asm.movsx_mr(&src, dst, ext, UNTRUSTED_FLAGS); 370 } 371 ExtendKind::Unsigned(_) => self.load_impl(src, dst, size, UNTRUSTED_FLAGS)?, 372 }, 373 LoadKind::Operand(_) | LoadKind::Atomic(_, _) => { 374 // The guarantees of the x86-64 memory model ensure that `SeqCst` 375 // loads are equivalent to normal loads. 
376 if kind.is_atomic() && size == OperandSize::S128 { 377 bail!(CodeGenError::unexpected_operand_size()); 378 } 379 380 self.load_impl(src, dst, size, UNTRUSTED_FLAGS)?; 381 } 382 LoadKind::VectorExtend(ext) => { 383 self.ensure_has_avx()?; 384 self.asm 385 .xmm_vpmov_mr(&src, dst, ext.into(), UNTRUSTED_FLAGS) 386 } 387 LoadKind::Splat(_) => { 388 self.ensure_has_avx()?; 389 390 if size == OperandSize::S64 { 391 self.asm 392 .xmm_mov_mr(&src, dst, OperandSize::S64, UNTRUSTED_FLAGS); 393 self.asm.xmm_vpshuf_rr( 394 dst.to_reg(), 395 dst, 396 Self::vpshuf_mask_for_64_bit_splats(), 397 OperandSize::S32, 398 ); 399 } else { 400 self.asm 401 .xmm_vpbroadcast_mr(&src, dst, size, UNTRUSTED_FLAGS); 402 } 403 } 404 LoadKind::VectorLane(LaneSelector { lane, size }) => { 405 self.ensure_has_avx()?; 406 self.with_scratch::<IntScratch, _>(|masm, byte_tmp| { 407 masm.load_impl(src, byte_tmp.writable(), size, UNTRUSTED_FLAGS)?; 408 masm.asm 409 .xmm_vpinsr_rrr(dst, dst.to_reg(), byte_tmp.inner(), lane, size); 410 wasmtime_environ::error::Ok(()) 411 })?; 412 } 413 LoadKind::VectorZero(size) => { 414 self.ensure_has_avx()?; 415 self.with_scratch::<IntScratch, _>(|masm, scratch| { 416 masm.load_impl(src, scratch.writable(), size, UNTRUSTED_FLAGS)?; 417 masm.asm.avx_gpr_to_xmm(scratch.inner(), dst, size); 418 wasmtime_environ::error::Ok(()) 419 })?; 420 } 421 } 422 423 Ok(()) 424 } 425 sp_offset(&self) -> Result<SPOffset>426 fn sp_offset(&self) -> Result<SPOffset> { 427 Ok(SPOffset::from_u32(self.sp_offset)) 428 } 429 zero(&mut self, reg: WritableReg) -> Result<()>430 fn zero(&mut self, reg: WritableReg) -> Result<()> { 431 self.asm.xor_rr( 432 reg.to_reg(), 433 reg, 434 OperandSize::from_bytes(<Self::ABI>::word_bytes()), 435 ); 436 Ok(()) 437 } 438 mov(&mut self, dst: WritableReg, src: RegImm, size: OperandSize) -> Result<()>439 fn mov(&mut self, dst: WritableReg, src: RegImm, size: OperandSize) -> Result<()> { 440 match (src, dst.to_reg()) { 441 (RegImm::Reg(src), dst_reg) => 
match (src.class(), dst_reg.class()) { 442 (RegClass::Int, RegClass::Int) => Ok(self.asm.mov_rr(src, dst, size)), 443 (RegClass::Float, RegClass::Float) => Ok(self.asm.xmm_mov_rr(src, dst, size)), 444 _ => bail!(CodeGenError::invalid_operand_combination()), 445 }, 446 (RegImm::Imm(imm), _) => self.load_constant(&imm, dst, size), 447 } 448 } 449 cmov( &mut self, dst: WritableReg, src: Reg, cc: IntCmpKind, size: OperandSize, ) -> Result<()>450 fn cmov( 451 &mut self, 452 dst: WritableReg, 453 src: Reg, 454 cc: IntCmpKind, 455 size: OperandSize, 456 ) -> Result<()> { 457 match (src.class(), dst.to_reg().class()) { 458 (RegClass::Int, RegClass::Int) => Ok(self.asm.cmov(src, dst, cc, size)), 459 (RegClass::Float, RegClass::Float) => Ok(self.asm.xmm_cmov(src, dst, cc, size)), 460 _ => Err(format_err!(CodeGenError::invalid_operand_combination())), 461 } 462 } 463 add(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()>464 fn add(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> { 465 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?; 466 match (rhs, dst) { 467 (RegImm::Imm(imm), _) => { 468 if let Some(v) = imm.to_i32() { 469 self.asm.add_ir(v, dst, size); 470 } else { 471 self.with_scratch::<IntScratch, _>(|masm, scratch| { 472 masm.load_constant(&imm, scratch.writable(), size)?; 473 masm.asm.add_rr(scratch.inner(), dst, size); 474 wasmtime_environ::error::Ok(()) 475 })?; 476 } 477 } 478 479 (RegImm::Reg(src), dst) => { 480 self.asm.add_rr(src, dst, size); 481 } 482 } 483 484 Ok(()) 485 } 486 add_uextend( &mut self, dst: WritableReg, lhs: Reg, rhs: Reg, from_size: OperandSize, size: OperandSize, ) -> Result<()>487 fn add_uextend( 488 &mut self, 489 dst: WritableReg, 490 lhs: Reg, 491 rhs: Reg, 492 from_size: OperandSize, 493 size: OperandSize, 494 ) -> Result<()> { 495 assert!(size == OperandSize::S64); 496 assert!(from_size == OperandSize::S32 || from_size == OperandSize::S64); 497 498 
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?; 499 if from_size == OperandSize::S32 && size == OperandSize::S64 { 500 self.extend( 501 writable!(rhs), 502 rhs, 503 ExtendKind::Unsigned(Extend::I64Extend32), 504 )?; 505 } 506 507 self.asm.add_rr(rhs, dst, size); 508 509 Ok(()) 510 } 511 checked_uadd( &mut self, dst: WritableReg, lhs: Reg, rhs: Imm, size: OperandSize, trap: TrapCode, ) -> Result<()>512 fn checked_uadd( 513 &mut self, 514 dst: WritableReg, 515 lhs: Reg, 516 rhs: Imm, 517 size: OperandSize, 518 trap: TrapCode, 519 ) -> Result<()> { 520 self.add(dst, lhs, RegImm::Imm(rhs), size)?; 521 self.asm.trapif(CC::B, trap); 522 Ok(()) 523 } 524 sub(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()>525 fn sub(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> { 526 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?; 527 match (rhs, dst) { 528 (RegImm::Imm(imm), reg) => { 529 if let Some(v) = imm.to_i32() { 530 self.asm.sub_ir(v, reg, size); 531 } else { 532 self.with_scratch::<IntScratch, _>(|masm, scratch| { 533 masm.load_constant(&imm, scratch.writable(), size)?; 534 masm.asm.sub_rr(scratch.inner(), reg, size); 535 wasmtime_environ::error::Ok(()) 536 })?; 537 } 538 } 539 540 (RegImm::Reg(src), dst) => { 541 self.asm.sub_rr(src, dst, size); 542 } 543 } 544 545 Ok(()) 546 } 547 mul(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()>548 fn mul(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> { 549 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?; 550 match (rhs, dst) { 551 (RegImm::Imm(imm), _) => { 552 if let Some(v) = imm.to_i32() { 553 self.asm.mul_ir(v, dst, size); 554 } else { 555 self.with_scratch::<IntScratch, _>(|masm, scratch| { 556 masm.load_constant(&imm, scratch.writable(), size)?; 557 masm.asm.mul_rr(scratch.inner(), dst, size); 558 wasmtime_environ::error::Ok(()) 559 })?; 560 } 561 } 562 563 
(RegImm::Reg(src), dst) => { 564 self.asm.mul_rr(src, dst, size); 565 } 566 } 567 568 Ok(()) 569 } 570 float_add(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()>571 fn float_add(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> { 572 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?; 573 self.asm.xmm_add_rr(rhs, dst, size); 574 Ok(()) 575 } 576 float_sub(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()>577 fn float_sub(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> { 578 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?; 579 self.asm.xmm_sub_rr(rhs, dst, size); 580 Ok(()) 581 } 582 float_mul(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()>583 fn float_mul(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> { 584 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?; 585 self.asm.xmm_mul_rr(rhs, dst, size); 586 Ok(()) 587 } 588 float_div(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()>589 fn float_div(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> { 590 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?; 591 self.asm.xmm_div_rr(rhs, dst, size); 592 Ok(()) 593 } 594 float_min(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()>595 fn float_min(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> { 596 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?; 597 self.asm.xmm_min_seq(rhs, dst, size); 598 Ok(()) 599 } 600 float_max(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()>601 fn float_max(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> { 602 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?; 603 self.asm.xmm_max_seq(rhs, dst, size); 604 Ok(()) 605 } 606 float_copysign( &mut 
self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize, ) -> Result<()>607 fn float_copysign( 608 &mut self, 609 dst: WritableReg, 610 lhs: Reg, 611 rhs: Reg, 612 size: OperandSize, 613 ) -> Result<()> { 614 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?; 615 let sign_mask = match size { 616 OperandSize::S32 => I::I32(0x80000000), 617 OperandSize::S64 => I::I64(0x8000000000000000), 618 OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => { 619 bail!(CodeGenError::unexpected_operand_size()) 620 } 621 }; 622 623 self.with_scratch::<IntScratch, _>(|masm, scratch_gpr| { 624 masm.with_scratch::<FloatScratch, _>(|masm, scratch_xmm| { 625 masm.load_constant(&sign_mask, scratch_gpr.writable(), size)?; 626 masm.asm 627 .gpr_to_xmm(scratch_gpr.inner(), scratch_xmm.writable(), size); 628 629 // Clear everything except sign bit in src. 630 masm.asm 631 .xmm_and_rr(scratch_xmm.inner(), writable!(rhs), size); 632 633 // Clear sign bit in dst using scratch to store result. Then copy the 634 // result back to dst. 635 masm.asm 636 .xmm_andn_rr(dst.to_reg(), scratch_xmm.writable(), size); 637 masm.asm.xmm_mov_rr(scratch_xmm.inner(), dst, size); 638 639 // Copy sign bit from src to dst. 
640 masm.asm.xmm_or_rr(rhs, dst, size); 641 Ok(()) 642 }) 643 }) 644 } 645 float_neg(&mut self, dst: WritableReg, size: OperandSize) -> Result<()>646 fn float_neg(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> { 647 debug_assert_eq!(dst.to_reg().class(), RegClass::Float); 648 let mask = match size { 649 OperandSize::S32 => I::I32(0x80000000), 650 OperandSize::S64 => I::I64(0x8000000000000000), 651 OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => { 652 bail!(CodeGenError::unexpected_operand_size()) 653 } 654 }; 655 self.with_scratch::<IntScratch, _>(|masm, scratch_gpr| { 656 masm.with_scratch::<FloatScratch, _>(|masm, scratch_xmm| { 657 masm.load_constant(&mask, scratch_gpr.writable(), size)?; 658 masm.asm 659 .gpr_to_xmm(scratch_gpr.inner(), scratch_xmm.writable(), size); 660 masm.asm.xmm_xor_rr(scratch_xmm.inner(), dst, size); 661 Ok(()) 662 }) 663 }) 664 } 665 float_abs(&mut self, dst: WritableReg, size: OperandSize) -> Result<()>666 fn float_abs(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> { 667 debug_assert_eq!(dst.to_reg().class(), RegClass::Float); 668 let mask = match size { 669 OperandSize::S32 => I::I32(0x7fffffff), 670 OperandSize::S64 => I::I64(0x7fffffffffffffff), 671 OperandSize::S128 | OperandSize::S16 | OperandSize::S8 => { 672 bail!(CodeGenError::unexpected_operand_size()) 673 } 674 }; 675 676 self.with_scratch::<IntScratch, _>(|masm, scratch_gpr| { 677 masm.with_scratch::<FloatScratch, _>(|masm, scratch_xmm| { 678 masm.load_constant(&mask, scratch_gpr.writable(), size)?; 679 680 masm.asm 681 .gpr_to_xmm(scratch_gpr.inner(), scratch_xmm.writable(), size); 682 masm.asm.xmm_and_rr(scratch_xmm.inner(), dst, size); 683 Ok(()) 684 }) 685 }) 686 } 687 float_round< F: FnMut(&mut FuncEnv<Self::Ptr>, &mut CodeGenContext<Emission>, &mut Self) -> Result<()>, >( &mut self, mode: RoundingMode, env: &mut FuncEnv<Self::Ptr>, context: &mut CodeGenContext<Emission>, size: OperandSize, mut fallback: F, ) -> Result<()>688 fn 
float_round< 689 F: FnMut(&mut FuncEnv<Self::Ptr>, &mut CodeGenContext<Emission>, &mut Self) -> Result<()>, 690 >( 691 &mut self, 692 mode: RoundingMode, 693 env: &mut FuncEnv<Self::Ptr>, 694 context: &mut CodeGenContext<Emission>, 695 size: OperandSize, 696 mut fallback: F, 697 ) -> Result<()> { 698 if self.flags.has_sse41() { 699 let src = context.pop_to_reg(self, None)?; 700 self.asm 701 .xmm_rounds_rr(src.into(), writable!(src.into()), mode, size); 702 context.stack.push(src.into()); 703 Ok(()) 704 } else { 705 fallback(env, context, self) 706 } 707 } 708 float_sqrt(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()>709 fn float_sqrt(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> { 710 self.asm.sqrt(src, dst, size); 711 Ok(()) 712 } 713 and(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()>714 fn and(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> { 715 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?; 716 match (rhs, dst) { 717 (RegImm::Imm(imm), _) => { 718 if let Some(v) = imm.to_i32() { 719 self.asm.and_ir(v, dst, size); 720 } else { 721 self.with_scratch::<IntScratch, _>(|masm, scratch| { 722 masm.load_constant(&imm, scratch.writable(), size)?; 723 masm.asm.and_rr(scratch.inner(), dst, size); 724 wasmtime_environ::error::Ok(()) 725 })?; 726 } 727 } 728 729 (RegImm::Reg(src), dst) => { 730 self.asm.and_rr(src, dst, size); 731 } 732 } 733 734 Ok(()) 735 } 736 or(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()>737 fn or(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> { 738 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?; 739 match (rhs, dst) { 740 (RegImm::Imm(imm), _) => { 741 if let Some(v) = imm.to_i32() { 742 self.asm.or_ir(v, dst, size); 743 } else { 744 self.with_scratch::<IntScratch, _>(|masm, scratch| { 745 masm.load_constant(&imm, scratch.writable(), 
size)?; 746 masm.asm.or_rr(scratch.inner(), dst, size); 747 wasmtime_environ::error::Ok(()) 748 })?; 749 } 750 } 751 752 (RegImm::Reg(src), dst) => { 753 self.asm.or_rr(src, dst, size); 754 } 755 } 756 757 Ok(()) 758 } 759 xor(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()>760 fn xor(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> { 761 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?; 762 match (rhs, dst) { 763 (RegImm::Imm(imm), _) => { 764 if let Some(v) = imm.to_i32() { 765 self.asm.xor_ir(v, dst, size); 766 } else { 767 self.with_scratch::<IntScratch, _>(|masm, scratch| { 768 masm.load_constant(&imm, scratch.writable(), size)?; 769 masm.asm.xor_rr(scratch.inner(), dst, size); 770 wasmtime_environ::error::Ok(()) 771 })?; 772 } 773 } 774 775 (RegImm::Reg(src), _) => { 776 self.asm.xor_rr(src, dst, size); 777 } 778 } 779 780 Ok(()) 781 } 782 shift_ir( &mut self, dst: WritableReg, imm: I, lhs: Reg, kind: ShiftKind, size: OperandSize, ) -> Result<()>783 fn shift_ir( 784 &mut self, 785 dst: WritableReg, 786 imm: I, 787 lhs: Reg, 788 kind: ShiftKind, 789 size: OperandSize, 790 ) -> Result<()> { 791 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?; 792 self.asm 793 .shift_ir(imm.unwrap_as_u64() as u8, dst, kind, size); 794 Ok(()) 795 } 796 shift( &mut self, context: &mut CodeGenContext<Emission>, kind: ShiftKind, size: OperandSize, ) -> Result<()>797 fn shift( 798 &mut self, 799 context: &mut CodeGenContext<Emission>, 800 kind: ShiftKind, 801 size: OperandSize, 802 ) -> Result<()> { 803 // Number of bits to shift must be in the CL register. 
let src = context.pop_to_reg(self, Some(regs::rcx()))?;
        let dst = context.pop_to_reg(self, None)?;

        self.asm
            .shift_rr(src.into(), writable!(dst.into()), kind, size);

        context.free_reg(src);
        context.stack.push(dst.into());

        Ok(())
    }

    /// Emits an integer division whose operands are popped from the value
    /// stack in `context`, and pushes the quotient back onto that stack.
    ///
    /// `rdx` and `rax` are reserved up front; the dividend is popped into
    /// `rax` and the divisor into any other GPR. `rdx` and the divisor are
    /// released before returning.
    fn div(
        &mut self,
        context: &mut CodeGenContext<Emission>,
        kind: DivKind,
        size: OperandSize,
    ) -> Result<()> {
        // Allocate rdx:rax.
        let rdx = context.reg(regs::rdx(), self)?;
        let rax = context.reg(regs::rax(), self)?;

        // Allocate the divisor, which can be any gpr.
        let divisor = context.pop_to_reg(self, None)?;

        // Mark rax as allocatable.
        context.free_reg(rax);
        // Move the top value to rax.
        let rax = context.pop_to_reg(self, Some(rax))?;
        self.asm.div(divisor.into(), (rax.into(), rdx), kind, size);

        // Free the divisor and rdx.
        context.free_reg(divisor);
        context.free_reg(rdx);

        // Push the quotient.
        context.stack.push(rax.into());
        Ok(())
    }

    /// Emits an integer remainder whose operands are popped from the value
    /// stack in `context`, and pushes the remainder back onto that stack.
    ///
    /// Register handling mirrors `div`, except that the value pushed is
    /// `rdx`, typed like the divisor, while `rax` and the divisor are
    /// released.
    fn rem(
        &mut self,
        context: &mut CodeGenContext<Emission>,
        kind: RemKind,
        size: OperandSize,
    ) -> Result<()> {
        // Allocate rdx:rax.
        let rdx = context.reg(regs::rdx(), self)?;
        let rax = context.reg(regs::rax(), self)?;

        // Allocate the divisor, which can be any gpr.
        let divisor = context.pop_to_reg(self, None)?;

        // Mark rax as allocatable.
        context.free_reg(rax);
        // Move the top value to rax.
        let rax = context.pop_to_reg(self, Some(rax))?;
        self.asm.rem(divisor.reg, (rax.into(), rdx), kind, size);

        // Free the divisor and rax.
        context.free_reg(divisor);
        context.free_reg(rax);

        // Push the remainder.
        context.stack.push(Val::reg(rdx, divisor.ty));

        Ok(())
    }

    /// Pops into `rbp` and emits `ret`.
    ///
    /// Debug builds assert that `sp_offset` is zero on entry.
    fn frame_restore(&mut self) -> Result<()> {
        debug_assert_eq!(self.sp_offset, 0);
        self.asm.pop_r(writable!(rbp()));
        self.asm.ret();
        Ok(())
    }

    /// Consumes the macro assembler and returns the finalized machine buffer.
    ///
    /// If a `stack_max_use_add` patch is pending, it is finalized with
    /// `sp_max` first.
    ///
    /// # Panics
    ///
    /// Panics if a patch is pending and `sp_max` does not fit in an `i32`.
    fn finalize(mut self, base: Option<SourceLoc>) -> Result<MachBufferFinalized<Final>> {
        if let Some(patch) = self.stack_max_use_add {
            patch.finalize(i32::try_from(self.sp_max).unwrap(), self.asm.buffer_mut());
        }

        Ok(self.asm.finalize(base))
    }

    /// Builds an `Address::offset` from `reg` and `offset`.
    fn address_at_reg(&self, reg: Reg, offset: u32) -> Result<Self::Address> {
        Ok(Address::offset(reg, offset))
    }

    /// Emits a comparison between `src1` and `src2`.
    ///
    /// An immediate for which `to_i32` succeeds is compared directly;
    /// any other immediate is first loaded into an integer scratch register
    /// and compared register-to-register.
    fn cmp(&mut self, src1: Reg, src2: RegImm, size: OperandSize) -> Result<()> {
        match src2 {
            RegImm::Imm(imm) => {
                if let Some(v) = imm.to_i32() {
                    self.asm.cmp_ir(src1, v, size);
                } else {
                    self.with_scratch::<IntScratch, _>(|masm, scratch| {
                        masm.load_constant(&imm, scratch.writable(), size)?;
                        masm.asm.cmp_rr(src1, scratch.inner(), size);
                        wasmtime_environ::error::Ok(())
                    })?;
                }
            }
            RegImm::Reg(src2) => {
                self.asm.cmp_rr(src1, src2, size);
            }
        }

        Ok(())
    }

    /// Compares `dst` with `src` (via `cmp`) and then emits a `setcc` for
    /// `kind` that writes `dst`.
    fn cmp_with_set(
        &mut self,
        dst: WritableReg,
        src: RegImm,
        kind: IntCmpKind,
        size: OperandSize,
    ) -> Result<()> {
        self.cmp(dst.to_reg(), src, size)?;
        self.asm.setcc(kind, dst);
        Ok(())
    }

    /// Emits a float comparison of `src1` and `src2` followed by a `setcc`
    /// into `dst`, with extra parity-flag handling for `Eq`, `Gt`, `Ge` and
    /// `Ne` so that NaN operands produce the results described below.
    fn float_cmp_with_set(
        &mut self,
        dst: WritableReg,
        src1: Reg,
        src2: Reg,
        kind: FloatCmpKind,
        size: OperandSize,
    ) -> Result<()> {
        // Float comparisons need to be ordered (that is, comparing with a NaN
        // should return 0) except for not equal which needs to be unordered.
        // We use ucomis{s, d} because comis{s, d} has an undefined result if
        // either operand is NaN. Since ucomis{s, d} is unordered, we need to
        // compensate to make the comparison ordered. Ucomis{s, d} sets the
        // ZF, PF, and CF flags if there is an unordered result.
        let (src1, src2, set_kind) = match kind {
            FloatCmpKind::Eq => (src1, src2, IntCmpKind::Eq),
            FloatCmpKind::Ne => (src1, src2, IntCmpKind::Ne),
            FloatCmpKind::Gt => (src1, src2, IntCmpKind::GtU),
            FloatCmpKind::Ge => (src1, src2, IntCmpKind::GeU),
            // Reversing the operands and using the complementary comparison
            // avoids needing to perform an additional SETNP and AND
            // instruction.
            // SETNB and SETNBE check if the carry flag is unset (i.e., not
            // less than and not unordered) so we get the intended result
            // without having to look at the parity flag.
            FloatCmpKind::Lt => (src2, src1, IntCmpKind::GtU),
            FloatCmpKind::Le => (src2, src1, IntCmpKind::GeU),
        };
        self.asm.ucomis(src1, src2, size);
        self.asm.setcc(set_kind, dst);
        let _ = match kind {
            FloatCmpKind::Eq | FloatCmpKind::Gt | FloatCmpKind::Ge => {
                // Return false if either operand is NaN by ensuring PF is
                // unset.
                self.with_scratch::<IntScratch, _>(|masm, scratch| {
                    masm.asm.setnp(scratch.writable());
                    masm.asm.and_rr(scratch.inner(), dst, size);
                });
            }
            FloatCmpKind::Ne => {
                // Return true if either operand is NaN by checking if PF is
                // set.
                self.with_scratch::<IntScratch, _>(|masm, scratch| {
                    masm.asm.setp(scratch.writable());
                    masm.asm.or_rr(scratch.inner(), dst, size);
                });
            }
            FloatCmpKind::Lt | FloatCmpKind::Le => (),
        };
        Ok(())
    }

    /// Emits a count-leading-zeros of `src` into `dst`.
    ///
    /// Uses `lzcnt` when the `has_lzcnt` flag is set; otherwise falls back
    /// to a `bsr`-based sequence that needs an integer scratch register.
    fn clz(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {
        if self.flags.has_lzcnt() {
            self.asm.lzcnt(src, dst, size);
        } else {
            self.with_scratch::<IntScratch, _>(|masm, scratch| {
                // Use the following approach:
                // dst = size.num_bits() - bsr(src) - is_not_zero
                //     = size.num_bits() + -bsr(src) - is_not_zero.
                masm.asm.bsr(src, dst, size);
                masm.asm.setcc(IntCmpKind::Ne, scratch.writable());
                masm.asm.neg(dst.to_reg(), dst, size);
                masm.asm.add_ir(size.num_bits() as i32, dst, size);
                masm.asm.sub_rr(scratch.inner(), dst, size);
            });
        }

        Ok(())
    }

    /// Emits a count-trailing-zeros of `src` into `dst`.
    ///
    /// Uses `tzcnt` when the `has_bmi1` flag is set; otherwise falls back to
    /// a `bsf`-based sequence that needs an integer scratch register.
    fn ctz(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {
        if self.flags.has_bmi1() {
            self.asm.tzcnt(src, dst, size);
        } else {
            self.with_scratch::<IntScratch, _>(|masm, scratch| {
                // Use the following approach:
                // dst = bsf(src) + (is_zero * size.num_bits())
                //     = bsf(src) + (is_zero << size.log2()).
                // BSF outputs the correct value for every value except 0.
                // When the value is 0, BSF outputs 0, correct output for ctz is
                // the number of bits.
                masm.asm.bsf(src, dst, size);
                masm.asm.setcc(IntCmpKind::Eq, scratch.writable());
                masm.asm
                    .shift_ir(size.log2(), scratch.writable(), ShiftKind::Shl, size);
                masm.asm.add_rr(scratch.inner(), dst, size);
            });
        }

        Ok(())
    }

    /// Returns a fresh label obtained from the assembler's buffer.
    fn get_label(&mut self) -> Result<MachLabel> {
        let buffer = self.asm.buffer_mut();
        Ok(buffer.get_label())
    }

    /// Binds `label` in the assembler's buffer.
    fn bind(&mut self, label: MachLabel) -> Result<()> {
        let buffer = self.asm.buffer_mut();
        buffer.bind_label(label, &mut Default::default());
        Ok(())
    }

    /// Emits a comparison of `lhs` with `rhs` followed by a conditional jump
    /// to `taken` for condition `kind`.
    ///
    /// When both operands are the same register and `kind` is `Eq` or `Ne`,
    /// a `test` is emitted instead of a compare.
    fn branch(
        &mut self,
        kind: IntCmpKind,
        lhs: Reg,
        rhs: RegImm,
        taken: MachLabel,
        size: OperandSize,
    ) -> Result<()> {
        use IntCmpKind::*;

        match &(lhs, rhs) {
            (rlhs, RegImm::Reg(rrhs)) => {
                // If the comparison kind is zero or not zero and both operands
                // are the same register, emit a test instruction. Else we emit
                // a normal comparison.
                if (kind == Eq || kind == Ne) && (rlhs == rrhs) {
                    self.asm.test_rr(*rlhs, *rrhs, size);
                } else {
                    self.cmp(lhs, rhs, size)?;
                }
            }
            _ => self.cmp(lhs, rhs, size)?,
        }
        self.asm.jmp_if(kind, taken);
        Ok(())
    }

    /// Emits a jump to `target`.
    fn jmp(&mut self, target: MachLabel) -> Result<()> {
        self.asm.jmp(target);
        Ok(())
    }

    /// Emits a population count of the value popped from the stack in
    /// `context` and pushes the result (held in the popped register) back.
    ///
    /// Uses `popcnt` when both the `has_popcnt` and `has_sse42` flags are
    /// set; otherwise emits a mask-and-shift fallback that borrows one extra
    /// GPR and supports only `S32` and `S64` (any other size is an error).
    fn popcnt(&mut self, context: &mut CodeGenContext<Emission>, size: OperandSize) -> Result<()> {
        let src = context.pop_to_reg(self, None)?;
        if self.flags.has_popcnt() && self.flags.has_sse42() {
            self.asm.popcnt(src.into(), writable!(src.into()), size);
            context.stack.push(src.into());
            Ok(())
        } else {
            // The fallback functionality here is based on `MacroAssembler::popcnt64` in:
            // https://searchfox.org/mozilla-central/source/js/src/jit/x64/MacroAssembler-x64-inl.h#495

            let tmp = writable!(context.any_gpr(self)?);
            let dst = writable!(src.into());
            let (masks, shift_amt) = match size {
                OperandSize::S64 => (
                    [
                        0x5555555555555555, // m1
                        0x3333333333333333, // m2
                        0x0f0f0f0f0f0f0f0f, // m4
                        0x0101010101010101, // h01
                    ],
                    56u8,
                ),
                // 32-bit popcount is the same, except the masks are half as
                // wide and we shift by 24 at the end rather than 56
                OperandSize::S32 => (
                    [0x55555555i64, 0x33333333i64, 0x0f0f0f0fi64, 0x01010101i64],
                    24u8,
                ),
                _ => bail!(CodeGenError::unexpected_operand_size()),
            };
            self.asm.mov_rr(src.into(), tmp, size);

            // x -= (x >> 1) & m1;
            self.asm.shift_ir(1u8, dst, ShiftKind::ShrU, size);
            let lhs = dst.to_reg();
            self.and(writable!(lhs), lhs, RegImm::i64(masks[0]), size)?;
            self.asm.sub_rr(dst.to_reg(), tmp, size);

            // x = (x & m2) + ((x >> 2) & m2);
            self.asm.mov_rr(tmp.to_reg(), dst, size);
            // Load `0x3333...` into the scratch reg once, allowing us to use
            // `and_rr` and avoid inadvertently loading it twice as with `and`

            self.with_scratch::<IntScratch, _>(|masm, scratch| {
                masm.load_constant(&I::i64(masks[1]), scratch.writable(), size)?;
                masm.asm.and_rr(scratch.inner(), dst, size);
                masm.asm.shift_ir(2u8, tmp, ShiftKind::ShrU, size);
                masm.asm.and_rr(scratch.inner(), tmp, size);
                wasmtime_environ::error::Ok(())
            })?;
            self.asm.add_rr(dst.to_reg(), tmp, size);

            // x = (x + (x >> 4)) & m4;
            self.asm.mov_rr(tmp.to_reg(), dst, size);
            self.asm.shift_ir(4u8, dst, ShiftKind::ShrU, size);
            self.asm.add_rr(tmp.to_reg(), dst, size);
            let lhs = dst.to_reg();
            self.and(writable!(lhs), lhs, RegImm::i64(masks[2]), size)?;

            // (x * h01) >> shift_amt
            let lhs = dst.to_reg();
            self.mul(writable!(lhs), lhs, RegImm::i64(masks[3]), size)?;
            self.asm.shift_ir(shift_amt, dst, ShiftKind::ShrU, size);

            context.stack.push(src.into());
            context.free_reg(tmp.to_reg());

            Ok(())
        }
    }

    /// Emits a 32-bit register move from `src` to `dst`.
    fn wrap(&mut self, dst: WritableReg, src: Reg) -> Result<()> {
        self.asm.mov_rr(src, dst, OperandSize::S32);
        Ok(())
    }

    /// Emits a sign-extending (`movsx`) or zero-extending (`movzx`) move
    /// from `src` to `dst`, chosen by `kind`.
    fn extend(&mut self, dst: WritableReg, src: Reg, kind: ExtendKind) -> Result<()> {
        match kind {
            ExtendKind::Signed(ext) => {
                self.asm.movsx_rr(src, dst, ext);
            }
            ExtendKind::Unsigned(ext) => {
                self.asm.movzx_rr(src, dst, ext);
            }
        }

        Ok(())
    }

    /// Emits the float-to-signed-integer conversion sequence from `src` to
    /// `dst`, using one integer and one float scratch register.
    ///
    /// `kind.is_checked()` is forwarded to the assembler sequence.
    fn signed_truncate(
        &mut self,
        dst: WritableReg,
        src: Reg,
        src_size: OperandSize,
        dst_size: OperandSize,
        kind: TruncKind,
    ) -> Result<()> {
        self.with_scratch::<IntScratch, _>(|masm, gpr_scratch| {
            masm.with_scratch::<FloatScratch, _>(|masm, xmm_scratch| {
                masm.asm.cvt_float_to_sint_seq(
                    src,
                    dst,
                    gpr_scratch.inner(),
                    xmm_scratch.inner(),
                    src_size,
                    dst_size,
                    kind.is_checked(),
                );
                Ok(())
            })
        })
    }

    /// Emits the float-to-unsigned-integer conversion sequence through
    /// `ctx.convert_op_with_tmp_reg`, which supplies the operands and a
    /// temporary float register.
    ///
    /// Only `S32` and `S64` destinations are accepted; any other `dst_size`
    /// is an error.
    fn unsigned_truncate(
        &mut self,
        ctx: &mut CodeGenContext<Emission>,
        src_size: OperandSize,
        dst_size: OperandSize,
        kind: TruncKind,
    ) -> Result<()> {
        let dst_ty = match dst_size {
            OperandSize::S32 => WasmValType::I32,
            OperandSize::S64 => WasmValType::I64,
            _ => bail!(CodeGenError::unexpected_operand_size()),
        };

        ctx.convert_op_with_tmp_reg(
            self,
            dst_ty,
            RegClass::Float,
            |masm, dst, src, tmp_fpr, dst_size| {
                masm.with_scratch::<IntScratch, _>(|masm, gpr_scratch| {
                    masm.with_scratch::<FloatScratch, _>(|masm, xmm_scratch| {
                        masm.asm.cvt_float_to_uint_seq(
                            src,
                            writable!(dst),
                            gpr_scratch.inner(),
                            xmm_scratch.inner(),
                            tmp_fpr,
                            src_size,
                            dst_size,
                            kind.is_checked(),
                        );
                        Ok(())
                    })
                })
            },
        )
    }

    /// Emits a signed-integer-to-float conversion from `src` to `dst`.
    fn signed_convert(
        &mut self,
        dst: WritableReg,
        src: Reg,
        src_size: OperandSize,
        dst_size: OperandSize,
    ) -> Result<()> {
        self.asm.cvt_sint_to_float(src, dst, src_size, dst_size);
        Ok(())
    }

    /// Emits the unsigned-integer-to-float conversion sequence from `src` to
    /// `dst`, using `tmp_gpr` and an integer scratch register.
    ///
    /// Note that when `src_size` is `S32`, `src` itself is overwritten with
    /// its zero-extended value before the sequence is emitted.
    fn unsigned_convert(
        &mut self,
        dst: WritableReg,
        src: Reg,
        tmp_gpr: Reg,
        src_size: OperandSize,
        dst_size: OperandSize,
    ) -> Result<()> {
        // Need to convert unsigned uint32 to uint64 for conversion instruction sequence.
        if let OperandSize::S32 = src_size {
            self.extend(
                writable!(src),
                src,
                ExtendKind::Unsigned(Extend::I64Extend32),
            )?;
        }

        self.with_scratch::<IntScratch, _>(|masm, scratch| {
            masm.asm
                .cvt_uint64_to_float_seq(src, dst, scratch.inner(), tmp_gpr, dst_size);
            Ok(())
        })
    }

    /// Emits an XMM-to-GPR move from `src` to `dst`.
    fn reinterpret_float_as_int(
        &mut self,
        dst: WritableReg,
        src: Reg,
        size: OperandSize,
    ) -> Result<()> {
        self.asm.xmm_to_gpr(src, dst, size);
        Ok(())
    }

    /// Emits a GPR-to-XMM move from `src` to `dst`.
    fn reinterpret_int_as_float(
        &mut self,
        dst: WritableReg,
        src: Reg,
        size: OperandSize,
    ) -> Result<()> {
        self.asm.gpr_to_xmm(src, dst, size);
        Ok(())
    }

    /// Emits a float-to-float conversion from 64-bit `src` to 32-bit `dst`.
    fn demote(&mut self, dst: WritableReg, src: Reg) -> Result<()> {
        self.asm
            .cvt_float_to_float(src, dst, OperandSize::S64, OperandSize::S32);
        Ok(())
    }

    /// Emits a float-to-float conversion from 32-bit `src` to 64-bit `dst`.
    fn promote(&mut self, dst: WritableReg, src: Reg) -> Result<()> {
        self.asm
            .cvt_float_to_float(src, dst, OperandSize::S32, OperandSize::S64);
        Ok(())
    }

    /// Emits an unconditional trap with `TRAP_UNREACHABLE`.
    fn unreachable(&mut self) -> Result<()> {
        self.asm.trap(TRAP_UNREACHABLE);
        Ok(())
    }

    /// Emits an unconditional trap with `code`.
    fn trap(&mut self, code: TrapCode) -> Result<()> {
        self.asm.trap(code);
        Ok(())
    }

    /// Emits a trap with `code` that is taken when condition `cc` holds.
    fn trapif(&mut self, cc: IntCmpKind, code: TrapCode) -> Result<()> {
        self.asm.trapif(cc, code);
        Ok(())
    }

    /// Emits a pointer-sized `test` of `src` against itself followed by a
    /// trap with `code` on the `Eq` condition.
    fn trapz(&mut self, src: Reg, code: TrapCode) -> Result<()> {
        self.asm.test_rr(src, src, self.ptr_size);
        self.asm.trapif(IntCmpKind::Eq, code);
        Ok(())
    }

    /// Emits a jump table over `targets`, whose last entry is the default
    /// target.
    ///
    /// `index` is clamped in place (it is written by the `cmov`) and `tmp`
    /// is overwritten; an integer scratch register is also used.
    fn jmp_table(&mut self, targets: &[MachLabel], index: Reg, tmp: Reg) -> Result<()> {
        // At least one default target.
        debug_assert!(targets.len() >= 1);
        let default_index = targets.len() - 1;
        // Emit bounds check, by conditionally moving the max cases
        // into the given index reg if the contents of the index reg
        // are greater.
        let max = default_index;
        let size = OperandSize::S32;
        self.asm.mov_ir(max as u64, writable!(tmp), size);
        self.asm.cmp_rr(tmp, index, size);
        self.asm.cmov(tmp, writable!(index), IntCmpKind::LtU, size);

        let default = targets[default_index];
        let rest = &targets[0..default_index];

        self.with_scratch::<IntScratch, _>(|masm, tmp1| {
            masm.asm
                .jmp_table(rest.into(), default, index, tmp1.inner(), tmp);
            Ok(())
        })
    }

    /// Starts a source-location range for `loc` in the assembler's buffer
    /// and returns the value produced by the buffer's `start_srcloc`.
    fn start_source_loc(&mut self, loc: RelSourceLoc) -> Result<(CodeOffset, RelSourceLoc)> {
        Ok(self.asm.buffer_mut().start_srcloc(loc))
    }

    /// Ends the current source-location range in the assembler's buffer.
    fn end_source_loc(&mut self) -> Result<()> {
        self.asm.buffer_mut().end_srcloc();
        Ok(())
    }

    /// Returns the current offset of the assembler's buffer.
    fn current_code_offset(&self) -> Result<CodeOffset> {
        Ok(self.asm.buffer().cur_offset())
    }

    /// Emits a 128-bit addition as a 64-bit `add` on the low halves followed
    /// by an `adc` on the high halves.
    ///
    /// `dst_lo`/`dst_hi` are checked against `lhs_lo`/`lhs_hi` with
    /// `ensure_two_argument_form`, whose error is propagated.
    fn add128(
        &mut self,
        dst_lo: WritableReg,
        dst_hi: WritableReg,
        lhs_lo: Reg,
        lhs_hi: Reg,
        rhs_lo: Reg,
        rhs_hi: Reg,
    ) -> Result<()> {
        Self::ensure_two_argument_form(&dst_lo.to_reg(), &lhs_lo)?;
        Self::ensure_two_argument_form(&dst_hi.to_reg(), &lhs_hi)?;
        self.asm.add_rr(rhs_lo, dst_lo, OperandSize::S64);
        self.asm.adc_rr(rhs_hi, dst_hi, OperandSize::S64);
        Ok(())
    }

    /// Emits a 128-bit subtraction as a 64-bit `sub` on the low halves
    /// followed by an `sbb` on the high halves.
    ///
    /// `dst_lo`/`dst_hi` are checked against `lhs_lo`/`lhs_hi` with
    /// `ensure_two_argument_form`, whose error is propagated.
    fn sub128(
        &mut self,
        dst_lo: WritableReg,
        dst_hi: WritableReg,
        lhs_lo: Reg,
        lhs_hi: Reg,
        rhs_lo: Reg,
        rhs_hi: Reg,
    ) -> Result<()> {
        Self::ensure_two_argument_form(&dst_lo.to_reg(), &lhs_lo)?;
        Self::ensure_two_argument_form(&dst_hi.to_reg(), &lhs_hi)?;
        self.asm.sub_rr(rhs_lo, dst_lo, OperandSize::S64);
        self.asm.sbb_rr(rhs_hi, dst_hi, OperandSize::S64);
        Ok(())
    }

    /// Emits a widening 64-bit multiplication of the two values popped from
    /// the stack in `context`, then pushes the low half (in `rax`) followed
    /// by the high half (in `rdx`, typed as `i64`).
    fn mul_wide(
        &mut self,
        context: &mut CodeGenContext<Emission>,
        kind: MulWideKind,
    ) -> Result<()> {
        // Reserve rax/rdx since they're required by the `mul_wide` instruction
        // being used here.
        let rax = context.reg(regs::rax(), self)?;
        let rdx = context.reg(regs::rdx(), self)?;

        // The rhs of this binop can be in any register
        let rhs = context.pop_to_reg(self, None)?;
        // Mark rax as allocatable and then force the lhs operand to be placed
        // in `rax`.
        context.free_reg(rax);
        let lhs = context.pop_to_reg(self, Some(rax))?;

        self.asm.mul_wide(
            writable!(rax),
            writable!(rdx),
            lhs.reg,
            rhs.reg,
            kind,
            OperandSize::S64,
        );

        // No longer using the rhs register after the multiplication has been
        // executed.
        context.free_reg(rhs);

        // The low bits of the result are in rax, where `lhs` was allocated to
        context.stack.push(lhs.into());
        // The high bits of the result are in rdx, which we previously reserved.
        context.stack.push(Val::Reg(TypedReg::i64(rdx)));

        Ok(())
    }

    /// Emits a vector splat of the operand on top of the stack in `context`
    /// and pushes the resulting `V128` register.
    ///
    /// Integer operands may be constants (emitted through `add_constant`) or
    /// registers (first moved into a float register). 64-bit lane kinds go
    /// through `ensure_has_avx`, all other kinds through `ensure_has_avx2`;
    /// their errors are propagated.
    fn splat(&mut self, context: &mut CodeGenContext<Emission>, size: SplatKind) -> Result<()> {
        // Get the source and destination operands set up first.
        let (src, dst) = match size {
            // Floats can use the same register for `src` and `dst`.
            SplatKind::F32x4 | SplatKind::F64x2 => {
                let reg = context.pop_to_reg(self, None)?.reg;
                (RegImm::reg(reg), writable!(reg))
            }
            // For ints, we need to load the operand into a vector register if
            // it's not a constant.
            SplatKind::I8x16 | SplatKind::I16x8 | SplatKind::I32x4 | SplatKind::I64x2 => {
                let dst = writable!(context.any_fpr(self)?);
                let src = if size == SplatKind::I64x2 {
                    context.pop_i64_const().map(RegImm::i64)
                } else {
                    context.pop_i32_const().map(RegImm::i32)
                }
                .map_or_else(
                    || -> Result<RegImm> {
                        let reg = context.pop_to_reg(self, None)?.reg;
                        self.reinterpret_int_as_float(
                            dst,
                            reg,
                            match size {
                                SplatKind::I8x16 | SplatKind::I16x8 | SplatKind::I32x4 => {
                                    OperandSize::S32
                                }
                                SplatKind::I64x2 => OperandSize::S64,
                                SplatKind::F32x4 | SplatKind::F64x2 => unreachable!(),
                            },
                        )?;
                        context.free_reg(reg);
                        Ok(RegImm::Reg(dst.to_reg()))
                    },
                    Ok,
                )?;
                (src, dst)
            }
        };

        // Perform the splat on the operands.
        if size == SplatKind::I64x2 || size == SplatKind::F64x2 {
            self.ensure_has_avx()?;
            let mask = Self::vpshuf_mask_for_64_bit_splats();
            match src {
                RegImm::Reg(src) => self.asm.xmm_vpshuf_rr(src, dst, mask, OperandSize::S32),
                RegImm::Imm(imm) => {
                    let src = self.asm.add_constant(&imm.to_bytes());
                    self.asm
                        .xmm_vpshuf_mr(&src, dst, mask, OperandSize::S32, MemFlags::trusted());
                }
            }
        } else {
            self.ensure_has_avx2()?;

            match src {
                RegImm::Reg(src) => self.asm.xmm_vpbroadcast_rr(src, dst, size.lane_size()),
                RegImm::Imm(imm) => {
                    let src = self.asm.add_constant(&imm.to_bytes());
                    self.asm
                        .xmm_vpbroadcast_mr(&src, dst, size.lane_size(), MemFlags::trusted());
                }
            }
        }

        context
            .stack
            .push(Val::reg(dst.to_reg(), WasmValType::V128));
        Ok(())
    }

    /// Emits a byte shuffle of `lhs` and `rhs` into `dst`, where each entry
    /// of `lanes` below 16 selects from `lhs` and any other entry selects
    /// lane `entry - 16` from `rhs`.
    ///
    /// Two mask constants are added to the assembler and a float scratch
    /// register is used. Propagates the error of `ensure_has_avx`.
    fn shuffle(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, lanes: [u8; 16]) -> Result<()> {
        self.ensure_has_avx()?;

        // Use `vpshufb` with `lanes` to set the lanes in `lhs` and `rhs`
        // separately to either the selected index or 0.
        // Then use `vpor` to combine `lhs` and `rhs` into `dst`.
        // Setting the most significant bit in the mask's lane to 1 will
        // result in corresponding lane in the destination register being
        // set to 0. 0x80 sets the most significant bit to 1.
        let mut mask_lhs: [u8; 16] = [0x80; 16];
        let mut mask_rhs: [u8; 16] = [0x80; 16];
        for i in 0..lanes.len() {
            if lanes[i] < 16 {
                mask_lhs[i] = lanes[i];
            } else {
                mask_rhs[i] = lanes[i] - 16;
            }
        }
        let mask_lhs = self.asm.add_constant(&mask_lhs);
        let mask_rhs = self.asm.add_constant(&mask_rhs);

        self.asm.xmm_vpshufb_rrm(dst, lhs, &mask_lhs);
        self.with_scratch::<FloatScratch, _>(|masm, scratch| {
            masm.asm.xmm_vpshufb_rrm(scratch.writable(), rhs, &mask_rhs);
            masm.asm.xmm_vpor_rrr(dst, dst.to_reg(), scratch.inner());
            Ok(())
        })
    }

    /// Emits a byte swizzle of `lhs` by the indices in `rhs` into `dst`.
    ///
    /// Note that `rhs` is overwritten by the clamping step. Propagates the
    /// error of `ensure_has_avx`.
    fn swizzle(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg) -> Result<()> {
        self.ensure_has_avx()?;

        // Clamp rhs to [0, 15 (i.e., 0xF)] and substitute 0 for anything
        // outside that range.
        // Each lane is a signed byte so the maximum value is 0x7F. Adding
        // 0x70 to any value higher than 0xF will saturate resulting in a value
        // of 0xFF (i.e., 0).
        let clamp = self.asm.add_constant(&[0x70; 16]);
        self.asm
            .xmm_vpaddus_rrm(writable!(rhs), rhs, &clamp, OperandSize::S8);

        // Don't need to subtract 0x70 since `vpshufb` uses the least
        // significant 4 bits which are the same after adding 0x70.
        self.asm.xmm_vpshufb_rrr(dst, lhs, rhs);
        Ok(())
    }

    /// Emits an atomic read-modify-write of `op` at `addr`, taking the
    /// operand from the stack in `context` and pushing the result register.
    ///
    /// `Add`, `Sub` and `Xchg` reuse the operand register as the result
    /// (`Sub` negates the operand and then uses `lock_xadd`); `And`, `Or`
    /// and `Xor` use the `atomic_rmw_seq` sequence with `rax` as the result
    /// and an integer scratch register. When `extend` is given, the result
    /// is zero-extended unless the extension is 32-to-64 bits.
    fn atomic_rmw(
        &mut self,
        context: &mut CodeGenContext<Emission>,
        addr: Self::Address,
        size: OperandSize,
        op: RmwOp,
        flags: MemFlags,
        extend: Option<Extend<Zero>>,
    ) -> Result<()> {
        let res = match op {
            RmwOp::Add => {
                let operand = context.pop_to_reg(self, None)?;
                self.asm
                    .lock_xadd(addr, writable!(operand.reg), size, flags);
                operand.reg
            }
            RmwOp::Sub => {
                let operand = context.pop_to_reg(self, None)?;
                self.asm.neg(operand.reg, writable!(operand.reg), size);
                self.asm
                    .lock_xadd(addr, writable!(operand.reg), size, flags);
                operand.reg
            }
            RmwOp::Xchg => {
                let operand = context.pop_to_reg(self, None)?;
                self.asm.xchg(addr, writable!(operand.reg), size, flags);
                operand.reg
            }
            RmwOp::And | RmwOp::Or | RmwOp::Xor => {
                let op = match op {
                    RmwOp::And => AtomicRmwSeqOp::And,
                    RmwOp::Or => AtomicRmwSeqOp::Or,
                    RmwOp::Xor => AtomicRmwSeqOp::Xor,
                    _ => unreachable!(
                        "invalid op for atomic_rmw_seq, should be one of `or`, `and` or `xor`"
                    ),
                };
                let dst = context.reg(regs::rax(), self)?;
                let operand = context.pop_to_reg(self, None)?;

                self.with_scratch::<IntScratch, _>(|masm, scratch| {
                    masm.asm.atomic_rmw_seq(
                        addr,
                        operand.reg,
                        writable!(dst),
                        scratch.writable(),
                        size,
                        flags,
                        op,
                    );
                });

                context.free_reg(operand.reg);
                dst
            }
        };

        let dst_ty = match extend {
            Some(ext) => {
                // We don't need to zero-extend from 32 to 64bits.
                if !(ext.from_bits() == 32 && ext.to_bits() == 64) {
                    self.asm.movzx_rr(res, writable!(res), ext);
                }

                WasmValType::int_from_bits(ext.to_bits())
            }
            None => WasmValType::int_from_bits(size.num_bits()),
        };

        context.stack.push(TypedReg::new(dst_ty, res).into());

        Ok(())
    }

    /// Emits the extraction of lane `lane` of vector `src` into `dst`.
    ///
    /// Propagates the error of `ensure_has_avx`.
    ///
    /// # Panics
    ///
    /// For float kinds, panics if `lane == 0` and `src` differs from `dst`,
    /// or if an `F64x2` lane is neither 0 nor 1.
    fn extract_lane(
        &mut self,
        src: Reg,
        dst: WritableReg,
        lane: u8,
        kind: ExtractLaneKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        match kind {
            ExtractLaneKind::I8x16S
            | ExtractLaneKind::I8x16U
            | ExtractLaneKind::I16x8S
            | ExtractLaneKind::I16x8U
            | ExtractLaneKind::I32x4
            | ExtractLaneKind::I64x2 => self.asm.xmm_vpextr_rr(dst, src, lane, kind.lane_size()),
            ExtractLaneKind::F32x4 | ExtractLaneKind::F64x2 if lane == 0 => {
                // If the `src` and `dst` registers are the same, then the
                // appropriate value is already in the correct position in
                // the register.
                assert!(src == dst.to_reg());
            }
            ExtractLaneKind::F32x4 => self.asm.xmm_vpshuf_rr(src, dst, lane, kind.lane_size()),
            ExtractLaneKind::F64x2 => {
                // `0b11_10` selects the high and low 32-bits of the second
                // 64-bit, so `0b11_10_11_10` splats the 64-bit value across
                // both lanes. Since we put an `f64` on the stack, we use
                // the splatted value.
                // Double-check `lane == 0` was handled in another branch.
                assert!(lane == 1);
                self.asm
                    .xmm_vpshuf_rr(src, dst, 0b11_10_11_10, OperandSize::S32)
            }
        }

        // Sign-extend to 32-bits for sign extended kinds.
        match kind {
            ExtractLaneKind::I8x16S | ExtractLaneKind::I16x8S => {
                self.asm.movsx_rr(dst.to_reg(), dst, kind.into())
            }
            _ => (),
        }

        Ok(())
    }

    /// Emits the replacement of lane `lane` of vector `dst` with `src`,
    /// which is either a register or an immediate (emitted through
    /// `add_constant`).
    ///
    /// Propagates the error of `ensure_has_avx`.
    ///
    /// # Panics
    ///
    /// For `F64x2`, panics if `lane` is neither 0 nor 1.
    fn replace_lane(
        &mut self,
        src: RegImm,
        dst: WritableReg,
        lane: u8,
        kind: ReplaceLaneKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        match kind {
            ReplaceLaneKind::I8x16
            | ReplaceLaneKind::I16x8
            | ReplaceLaneKind::I32x4
            | ReplaceLaneKind::I64x2 => match src {
                RegImm::Reg(reg) => {
                    self.asm
                        .xmm_vpinsr_rrr(dst, dst.to_reg(), reg, lane, kind.lane_size());
                }
                RegImm::Imm(imm) => {
                    let address = self.asm.add_constant(&imm.to_bytes());
                    self.asm
                        .xmm_vpinsr_rrm(dst, dst.to_reg(), &address, lane, kind.lane_size());
                }
            },
            ReplaceLaneKind::F32x4 => {
                // Immediate for `vinsertps` uses the low 4 bits as a mask of
                // which elements of the destination to set to 0. The next 2
                // bits (bits 4-5) specify which element of the destination
                // will be overwritten, hence the shift by 4.
                let imm = lane << 4;
                match src {
                    RegImm::Reg(reg) => self.asm.xmm_vinsertps_rrr(dst, dst.to_reg(), reg, imm),
                    RegImm::Imm(val) => {
                        let address = self.asm.add_constant(&val.to_bytes());
                        self.asm.xmm_vinsertps_rrm(dst, dst.to_reg(), &address, imm);
                    }
                }
            }
            ReplaceLaneKind::F64x2 => match src {
                RegImm::Reg(reg) => match lane {
                    0 => self.asm.xmm_vmovsd_rrr(dst, dst.to_reg(), reg),
                    1 => self.asm.xmm_vmovlhps_rrr(dst, dst.to_reg(), reg),
                    _ => unreachable!(),
                },
                RegImm::Imm(imm) => {
                    let address = self.asm.add_constant(&imm.to_bytes());
                    match lane {
                        0 => {
                            // Memory load variant of `vmovsd` zeroes the upper
                            // 64 bits of the register so need to load the
                            // immediate to a register to use the register
                            // variant of `vmovsd` to perform the merge.

                            self.with_scratch::<FloatScratch, _>(|masm, scratch| {
                                masm.asm.xmm_vmovsd_rm(scratch.writable(), &address);
                                masm.asm.xmm_vmovsd_rrr(dst, dst.to_reg(), scratch.inner());
                            });
                        }
                        1 => self.asm.xmm_vmovlhps_rrm(dst, dst.to_reg(), &address),
                        _ => unreachable!(),
                    }
                }
            },
        }
        Ok(())
    }

    /// Emits an atomic compare-and-exchange at `addr`, popping the
    /// replacement and then the expected value from the stack in `context`
    /// and pushing the register that held the expected value (`rax`).
    ///
    /// When `extend` is given, that register is zero-extended unless the
    /// extension is 32-to-64 bits.
    fn atomic_cas(
        &mut self,
        context: &mut CodeGenContext<Emission>,
        addr: Self::Address,
        size: OperandSize,
        flags: MemFlags,
        extend: Option<Extend<Zero>>,
    ) -> Result<()> {
        // `cmpxchg` expects `expected` to be in the `*a*` register.
        // reserve rax for the expected argument.
        let rax = context.reg(regs::rax(), self)?;

        let replacement = context.pop_to_reg(self, None)?;

        // mark `rax` as allocatable again.
        context.free_reg(rax);
        let expected = context.pop_to_reg(self, Some(regs::rax()))?;

        self.asm
            .cmpxchg(addr, replacement.reg, writable!(expected.reg), size, flags);

        if let Some(extend) = extend {
            // We don't need to zero-extend from 32 to 64bits.
            if !(extend.from_bits() == 32 && extend.to_bits() == 64) {
                self.asm
                    .movzx_rr(expected.reg, writable!(expected.reg), extend);
            }
        }

        context.stack.push(expected.into());
        context.free_reg(replacement);

        Ok(())
    }

    /// Emits a lane-wise equality comparison of `lhs` and `rhs` into `dst`.
    ///
    /// Propagates the error of `ensure_has_avx`.
    fn v128_eq(
        &mut self,
        dst: WritableReg,
        lhs: Reg,
        rhs: Reg,
        kind: VectorEqualityKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        match kind {
            VectorEqualityKind::I8x16
            | VectorEqualityKind::I16x8
            | VectorEqualityKind::I32x4
            | VectorEqualityKind::I64x2 => {
                self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size())
            }
            VectorEqualityKind::F32x4 | VectorEqualityKind::F64x2 => {
                self.asm
                    .xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Eq)
            }
        }
        Ok(())
    }

    /// Emits a lane-wise inequality comparison of `lhs` and `rhs` into `dst`.
    ///
    /// Note that the integer kinds overwrite both `lhs` and `rhs` as
    /// intermediates. Propagates the error of `ensure_has_avx`.
    fn v128_ne(
        &mut self,
        dst: WritableReg,
        lhs: Reg,
        rhs: Reg,
        kind: VectorEqualityKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        match kind {
            VectorEqualityKind::I8x16
            | VectorEqualityKind::I16x8
            | VectorEqualityKind::I32x4
            | VectorEqualityKind::I64x2 => {
                // Check for equality and invert the results.
                self.asm
                    .xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
                self.asm
                    .xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
                self.asm.xmm_vpxor_rrr(lhs, rhs, dst);
            }
            VectorEqualityKind::F32x4 | VectorEqualityKind::F64x2 => {
                self.asm
                    .xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Ne)
            }
        }
        Ok(())
    }

    /// Emits a lane-wise less-than comparison of `lhs` and `rhs` into `dst`.
    ///
    /// Note that the unsigned integer kinds overwrite both `lhs` and `rhs`
    /// as intermediates. Propagates the error of `ensure_has_avx`.
    fn v128_lt(
        &mut self,
        dst: WritableReg,
        lhs: Reg,
        rhs: Reg,
        kind: VectorCompareKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        match kind {
            VectorCompareKind::I8x16S
            | VectorCompareKind::I16x8S
            | VectorCompareKind::I32x4S
            | VectorCompareKind::I64x2S => {
                // Perform a greater than check with reversed parameters.
                self.asm.xmm_vpcmpgt_rrr(dst, rhs, lhs, kind.lane_size())
            }
            VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
                // Set `lhs` to min values, check for equality, then invert the
                // result.
                // If `lhs` is smaller, then equality check will fail and result
                // will be inverted to true. Otherwise the equality check will
                // pass and be inverted to false.
                self.asm
                    .xmm_vpminu_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
                self.asm
                    .xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
                self.asm
                    .xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
                self.asm.xmm_vpxor_rrr(lhs, rhs, dst);
            }
            VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
                self.asm
                    .xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Lt)
            }
        }
        Ok(())
    }

    /// Emits a lane-wise less-than-or-equal comparison of `lhs` and `rhs`
    /// into `dst`.
    ///
    /// Note that every integer kind overwrites `rhs` as an intermediate, and
    /// `I64x2S` additionally overwrites `lhs`. Propagates the error of
    /// `ensure_has_avx`.
    fn v128_le(
        &mut self,
        dst: WritableReg,
        lhs: Reg,
        rhs: Reg,
        kind: VectorCompareKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        match kind {
            VectorCompareKind::I8x16S | VectorCompareKind::I16x8S | VectorCompareKind::I32x4S => {
                // Set the `rhs` vector to the signed minimum values and then
                // compare them with `lhs` for equality.
                self.asm
                    .xmm_vpmins_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
                self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
            }
            VectorCompareKind::I64x2S => {
                // Do a greater than check and invert the results.
                self.asm
                    .xmm_vpcmpgt_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
                self.asm
                    .xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
                self.asm.xmm_vpxor_rrr(lhs, rhs, dst);
            }
            VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
                // Set the `rhs` vector to the unsigned minimum values and then
                // compare them with `lhs` for equality.
                self.asm
                    .xmm_vpminu_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
                self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
            }
            VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
                self.asm
                    .xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Le)
            }
        }
        Ok(())
    }

    /// Emits a lane-wise greater-than comparison of `lhs` and `rhs` into
    /// `dst`.
    ///
    /// Note that the unsigned integer kinds overwrite both `lhs` and `rhs`
    /// as intermediates. Propagates the error of `ensure_has_avx`.
    fn v128_gt(
        &mut self,
        dst: WritableReg,
        lhs: Reg,
        rhs: Reg,
        kind: VectorCompareKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        match kind {
            VectorCompareKind::I8x16S
            | VectorCompareKind::I16x8S
            | VectorCompareKind::I32x4S
            | VectorCompareKind::I64x2S => {
                self.asm.xmm_vpcmpgt_rrr(dst, lhs, rhs, kind.lane_size())
            }
            VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
                // Set `lhs` to max values, check for equality, then invert the
                // result.
                // If `lhs` is larger, then equality check will fail and result
                // will be inverted to true. Otherwise the equality check will
                // pass and be inverted to false.
                self.asm
                    .xmm_vpmaxu_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
                self.asm
                    .xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
                self.asm
                    .xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
                self.asm.xmm_vpxor_rrr(lhs, rhs, dst);
            }
            VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
                // Do a less than comparison with the operands swapped.
                self.asm
                    .xmm_vcmpp_rrr(dst, rhs, lhs, kind.lane_size(), VcmpKind::Lt)
            }
        }
        Ok(())
    }

    fn v128_ge(
        &mut self,
        dst: WritableReg,
        lhs: Reg,
        rhs: Reg,
        kind: VectorCompareKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        match kind {
            VectorCompareKind::I8x16S | VectorCompareKind::I16x8S | VectorCompareKind::I32x4S => {
                // Set each lane to maximum value and then compare for equality.
                self.asm
                    .xmm_vpmaxs_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
                self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
            }
            VectorCompareKind::I64x2S => {
                // Perform a greater than comparison with operands swapped,
                // then invert the results.
                self.asm
                    .xmm_vpcmpgt_rrr(writable!(rhs), rhs, lhs, kind.lane_size());
                self.asm.xmm_vpcmpeq_rrr(dst, lhs, lhs, kind.lane_size());
                self.asm.xmm_vpxor_rrr(dst.to_reg(), rhs, dst);
            }
            VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
                // Set lanes to maximum values and compare them for equality.
                self.asm
                    .xmm_vpmaxu_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
                self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
            }
            VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
                // Perform a less than or equal comparison on swapped operands.
1948 self.asm 1949 .xmm_vcmpp_rrr(dst, rhs, lhs, kind.lane_size(), VcmpKind::Le) 1950 } 1951 } 1952 1953 Ok(()) 1954 } 1955 fence(&mut self) -> Result<()>1956 fn fence(&mut self) -> Result<()> { 1957 self.asm.mfence(); 1958 Ok(()) 1959 } 1960 v128_not(&mut self, dst: WritableReg) -> Result<()>1961 fn v128_not(&mut self, dst: WritableReg) -> Result<()> { 1962 self.ensure_has_avx()?; 1963 1964 self.with_scratch::<FloatScratch, _>(|masm, tmp| { 1965 // First, we initialize `tmp` with all ones by comparing it with 1966 // itself. 1967 masm.asm 1968 .xmm_vpcmpeq_rrr(tmp.writable(), tmp.inner(), tmp.inner(), OperandSize::S32); 1969 // Then we `xor` tmp and `dst` together, yielding `!dst`. 1970 masm.asm.xmm_vpxor_rrr(tmp.inner(), dst.to_reg(), dst); 1971 Ok(()) 1972 }) 1973 } 1974 v128_and(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()>1975 fn v128_and(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> { 1976 self.ensure_has_avx()?; 1977 self.asm.xmm_vpand_rrr(src1, src2, dst); 1978 Ok(()) 1979 } 1980 v128_and_not(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()>1981 fn v128_and_not(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> { 1982 self.ensure_has_avx()?; 1983 self.asm.xmm_vpandn_rrr(src1, src2, dst); 1984 Ok(()) 1985 } 1986 v128_or(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()>1987 fn v128_or(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> { 1988 self.ensure_has_avx()?; 1989 self.asm.xmm_vpor_rrr(dst, src1, src2); 1990 Ok(()) 1991 } 1992 v128_xor(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()>1993 fn v128_xor(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> { 1994 self.ensure_has_avx()?; 1995 self.asm.xmm_vpxor_rrr(src1, src2, dst); 1996 Ok(()) 1997 } 1998 v128_bitselect(&mut self, src1: Reg, src2: Reg, mask: Reg, dst: WritableReg) -> Result<()>1999 fn v128_bitselect(&mut self, src1: Reg, src2: Reg, mask: Reg, dst: WritableReg) -> 
    /// Emits an integer-to-float vector conversion from `src` into `dst`,
    /// selected by `kind`.
    ///
    /// The signed kinds map to a single `xmm_vcvt_rr` call. The unsigned
    /// kinds are emitted as multi-instruction sequences; `I32x4U` also uses a
    /// scratch float register. Propagates the error from `ensure_has_avx`.
    fn v128_convert(&mut self, src: Reg, dst: WritableReg, kind: V128ConvertKind) -> Result<()> {
        self.ensure_has_avx()?;
        match kind {
            V128ConvertKind::I32x4S => self.asm.xmm_vcvt_rr(src, dst, VcvtKind::I32ToF32),
            V128ConvertKind::I32x4LowS => self.asm.xmm_vcvt_rr(src, dst, VcvtKind::I32ToF64),
            V128ConvertKind::I32x4U => {
                self.with_scratch::<FloatScratch, _>(|masm, scratch| {
                    // Split each 32-bit integer into 16-bit parts.
                    // Shifting left and then right by 16 leaves only the low
                    // 16 bits in `scratch`; subtracting those from `src`
                    // leaves the high 16 bits (still in their original
                    // position) in `dst`.
                    masm.asm
                        .xmm_vpsll_rri(src, scratch.writable(), 0x10, kind.src_lane_size());
                    masm.asm.xmm_vpsrl_rri(
                        scratch.inner(),
                        scratch.writable(),
                        0x10,
                        kind.src_lane_size(),
                    );
                    masm.asm
                        .xmm_vpsub_rrr(src, scratch.inner(), dst, kind.src_lane_size());

                    // Convert the low bits in `scratch` to floating point numbers.
                    masm.asm
                        .xmm_vcvt_rr(scratch.inner(), scratch.writable(), VcvtKind::I32ToF32);

                    // Prevent overflow by right shifting the high bits by
                    // one, so the value handed to the signed conversion
                    // below never has its top bit set.
                    masm.asm
                        .xmm_vpsrl_rri(dst.to_reg(), dst, 1, kind.src_lane_size());
                    // Convert high bits in `dst` to floating point numbers.
                    masm.asm.xmm_vcvt_rr(dst.to_reg(), dst, VcvtKind::I32ToF32);
                    // Double high bits in `dst` (add it to itself) to
                    // reverse the right shift.
                    masm.asm
                        .xmm_vaddp_rrr(dst.to_reg(), dst.to_reg(), dst, kind.src_lane_size());
                    // Add high bits in `dst` to low bits in `scratch`.
                    masm.asm.xmm_vaddp_rrr(
                        dst.to_reg(),
                        scratch.inner(),
                        dst,
                        kind.src_lane_size(),
                    );
                });
            }
            V128ConvertKind::I32x4LowU => {
                // See
                // https://github.com/bytecodealliance/wasmtime/blob/bb886ffc3c81a476d8ba06311ff2dede15a6f7e1/cranelift/codegen/src/isa/x64/lower.isle#L3668
                // for details on the Cranelift AVX implementation.
                // Use `vunpcklp` to create doubles from the integers.
                // Interleaving the upper half of 0x1.0p52 (i.e., 0x43300000)
                // with the integers produces the bit pattern of a double
                // whose mantissa bits hold the original integer value.
                // The constant below is 0x43300000 twice, in little-endian
                // byte order.
                let conversion_constant = self
                    .asm
                    .add_constant(&[0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43]);
                self.asm
                    .xmm_vunpcklp_rrm(src, &conversion_constant, dst, kind.src_lane_size());
                // Subtract the 0x1.0p52 added above. The constant is the
                // 64-bit pattern 0x4330000000000000 twice, in little-endian
                // byte order.
                let conversion_constant = self.asm.add_constant(&[
                    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00,
                    0x00, 0x30, 0x43,
                ]);
                self.asm.xmm_vsub_rrm(
                    dst.to_reg(),
                    &conversion_constant,
                    dst,
                    kind.dst_lane_size(),
                );
            }
        }
        Ok(())
    }
self.asm.xmm_vpalignr_rrr(src, src, dst, 0x8); 2132 self.asm.xmm_vpmov_rr(dst.to_reg(), dst, kind.into()); 2133 } 2134 V128ExtendKind::HighI8x16U | V128ExtendKind::HighI16x8U => { 2135 self.with_scratch::<FloatScratch, _>(|masm, scratch| { 2136 masm.asm 2137 .xmm_vpxor_rrr(scratch.inner(), scratch.inner(), scratch.writable()); 2138 masm.asm 2139 .xmm_vpunpckh_rrr(src, scratch.inner(), dst, kind.src_lane_size()); 2140 }); 2141 } 2142 V128ExtendKind::HighI32x4S => { 2143 // Move the 3rd element (i.e., 0b10) to the 1st (rightmost) 2144 // position and the 4th element (i.e., 0b11) to the 2nd (second 2145 // from the right) position and then perform the extend. 2146 self.asm 2147 .xmm_vpshuf_rr(src, dst, 0b11_10_11_10, kind.src_lane_size()); 2148 self.asm.xmm_vpmov_rr(dst.to_reg(), dst, kind.into()); 2149 } 2150 V128ExtendKind::HighI32x4U => { 2151 self.with_scratch::<FloatScratch, _>(|masm, scratch| { 2152 // Set `scratch` to a vector 0s. 2153 masm.asm.xmm_vxorp_rrr( 2154 scratch.inner(), 2155 scratch.inner(), 2156 scratch.writable(), 2157 kind.src_lane_size(), 2158 ); 2159 // Interleave the 0 bits into the two 32-bit integers to zero extend them. 
2160 masm.asm 2161 .xmm_vunpckhp_rrr(src, scratch.inner(), dst, kind.src_lane_size()); 2162 }); 2163 } 2164 } 2165 Ok(()) 2166 } 2167 v128_add(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, kind: V128AddKind) -> Result<()>2168 fn v128_add(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, kind: V128AddKind) -> Result<()> { 2169 self.ensure_has_avx()?; 2170 match kind { 2171 V128AddKind::F32x4 => self.asm.xmm_vaddp_rrr(lhs, rhs, dst, OperandSize::S32), 2172 V128AddKind::F64x2 => self.asm.xmm_vaddp_rrr(lhs, rhs, dst, OperandSize::S64), 2173 V128AddKind::I8x16 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S8), 2174 V128AddKind::I8x16SatS => self.asm.xmm_vpadds_rrr(dst, lhs, rhs, OperandSize::S8), 2175 V128AddKind::I8x16SatU => self.asm.xmm_vpaddus_rrr(dst, lhs, rhs, OperandSize::S8), 2176 V128AddKind::I16x8 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S16), 2177 V128AddKind::I16x8SatS => self.asm.xmm_vpadds_rrr(dst, lhs, rhs, OperandSize::S16), 2178 V128AddKind::I16x8SatU => self.asm.xmm_vpaddus_rrr(dst, lhs, rhs, OperandSize::S16), 2179 V128AddKind::I32x4 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S32), 2180 V128AddKind::I64x2 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S64), 2181 }; 2182 Ok(()) 2183 } 2184 v128_sub(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, kind: V128SubKind) -> Result<()>2185 fn v128_sub(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, kind: V128SubKind) -> Result<()> { 2186 self.ensure_has_avx()?; 2187 match kind { 2188 V128SubKind::F32x4 => self.asm.xmm_vsubp_rrr(lhs, rhs, dst, OperandSize::S32), 2189 V128SubKind::F64x2 => self.asm.xmm_vsubp_rrr(lhs, rhs, dst, OperandSize::S64), 2190 V128SubKind::I8x16 => self.asm.xmm_vpsub_rrr(lhs, rhs, dst, OperandSize::S8), 2191 V128SubKind::I8x16SatS => self.asm.xmm_vpsubs_rrr(dst, lhs, rhs, OperandSize::S8), 2192 V128SubKind::I8x16SatU => self.asm.xmm_vpsubus_rrr(dst, lhs, rhs, OperandSize::S8), 2193 V128SubKind::I16x8 => self.asm.xmm_vpsub_rrr(lhs, rhs, dst, 
    /// Emits a lane-wise vector multiplication.
    ///
    /// Both operands are popped from the value stack held by `context`
    /// (`rhs` first, then `lhs`). The product is written into the `lhs`
    /// register, which is pushed back onto the stack; the `rhs` register is
    /// freed. For `I64x2` a single `vpmullq` is used when both the AVX512VL
    /// and AVX512DQ checks succeed, otherwise a sequence built from 32-bit
    /// multiplications is emitted. Propagates errors from `ensure_has_avx`
    /// and from register allocation.
    fn v128_mul(
        &mut self,
        context: &mut CodeGenContext<Emission>,
        kind: V128MulKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        let rhs = context.pop_to_reg(self, None)?;
        let lhs = context.pop_to_reg(self, None)?;

        // Fast path: a single instruction, with `lhs` as the destination.
        let mul_i64x2_avx512 = |this: &mut Self| {
            this.asm.vpmullq(lhs.reg, rhs.reg, writable!(lhs.reg));
        };

        let mul_i64x2_fallback = |this: &mut Self,
                                  context: &mut CodeGenContext<Emission>|
         -> Result<()> {
            // Standard AVX doesn't have an instruction for i64x2 multiplication; instead, we
            // have to fall back to an instruction sequence using 32-bit multiplications (taken
            // from the Cranelift implementation, in `isa/x64/lower.isle`):
            //
            // > Otherwise, for i64x2 multiplication we describe a lane A as being composed of
            // > a 32-bit upper half "Ah" and a 32-bit lower half "Al". The 32-bit long hand
            // > multiplication can then be written as:
            //
            // >    Ah Al
            // > *  Bh Bl
            // >    -----
            // >    Al * Bl
            // > + (Ah * Bl) << 32
            // > + (Al * Bh) << 32
            //
            // > So for each lane we will compute:
            //
            // >   A * B  = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32
            //
            // > Note, the algorithm will use `pmuludq` which operates directly on the lower
            // > 32-bit (`Al` or `Bl`) of a lane and writes the result to the full 64-bits of
            // > the lane of the destination. For this reason we don't need shifts to isolate
            // > the lower 32-bits, however, we will need to use shifts to isolate the high
            // > 32-bits when doing calculations, i.e., `Ah == A >> 32`.

            // A second temporary is needed in addition to the scratch register.
            let tmp2 = context.any_fpr(this)?;
            this.with_scratch::<FloatScratch, _>(|this, tmp1| {
                // tmp1 = lhs_hi = (lhs >> 32)
                this.asm
                    .xmm_vpsrl_rri(lhs.reg, tmp1.writable(), 32, OperandSize::S64);

                // tmp2 = lhs_hi * rhs_low = tmp1 * rhs
                // NOTE(review): this product goes through the `vpmuldq`
                // helper while the other two use `vpmuludq`. Only the low 32
                // bits of this product survive the `<< 32` below, so the
                // difference is presumably harmless — confirm.
                this.asm
                    .xmm_vpmuldq_rrr(tmp1.inner(), rhs.reg, writable!(tmp2));

                // tmp1 = rhs_hi = rhs >> 32
                this.asm
                    .xmm_vpsrl_rri(rhs.reg, tmp1.writable(), 32, OperandSize::S64);

                // tmp1 = lhs_low * rhs_high = tmp1 * lhs
                this.asm
                    .xmm_vpmuludq_rrr(tmp1.inner(), lhs.reg, tmp1.writable());

                // tmp1 = ((lhs_hi * rhs_low) + (lhs_lo * rhs_hi)) = tmp1 + tmp2
                this.asm
                    .xmm_vpadd_rrr(tmp1.inner(), tmp2, tmp1.writable(), OperandSize::S64);

                // tmp1 = tmp1 << 32
                this.asm
                    .xmm_vpsll_rri(tmp1.inner(), tmp1.writable(), 32, OperandSize::S64);

                // tmp2 = lhs_lo * rhs_lo
                this.asm.xmm_vpmuludq_rrr(lhs.reg, rhs.reg, writable!(tmp2));

                // Finally, with `lhs` as destination:
                // lhs = (lhs_low * rhs_low)
                //     + (((lhs_hi * rhs_low) + (lhs_lo * rhs_hi)) << 32)
                //     = tmp2 + tmp1
                this.asm
                    .xmm_vpadd_rrr(tmp1.inner(), tmp2, writable!(lhs.reg), OperandSize::S64);
            });

            context.free_reg(tmp2);

            Ok(())
        };

        match kind {
            V128MulKind::F32x4 => {
                self.asm
                    .xmm_vmulp_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S32)
            }
            V128MulKind::F64x2 => {
                self.asm
                    .xmm_vmulp_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S64)
            }
            V128MulKind::I16x8 => {
                self.asm
                    .xmm_vpmull_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S16)
            }
            V128MulKind::I32x4 => {
                self.asm
                    .xmm_vpmull_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S32)
            }
            // This is the fast path when AVX512 is available.
            V128MulKind::I64x2
                if self.ensure_has_avx512vl().is_ok() && self.ensure_has_avx512dq().is_ok() =>
            {
                mul_i64x2_avx512(self)
            }
            // Otherwise, we emit the AVX fallback sequence.
            V128MulKind::I64x2 => mul_i64x2_fallback(self, context)?,
        }

        // The result lives in `lhs`; `rhs` is no longer needed.
        context.stack.push(lhs.into());
        context.free_reg(rhs);

        Ok(())
    }
2346 masm.asm 2347 .xmm_vpsub_rrr(dst.to_reg(), scratch.inner(), dst, kind.lane_size()); 2348 }); 2349 } 2350 V128AbsKind::F32x4 | V128AbsKind::F64x2 => { 2351 self.with_scratch::<FloatScratch, _>(|masm, scratch| { 2352 // Create a mask of all ones. 2353 masm.asm.xmm_vpcmpeq_rrr( 2354 scratch.writable(), 2355 scratch.inner(), 2356 scratch.inner(), 2357 kind.lane_size(), 2358 ); 2359 // Right shift the mask so each lane is a single zero followed 2360 // by all ones. 2361 masm.asm.xmm_vpsrl_rri( 2362 scratch.inner(), 2363 scratch.writable(), 2364 0x1, 2365 kind.lane_size(), 2366 ); 2367 // Use the mask to zero the sign bit in each lane which will 2368 // make the float value positive. 2369 masm.asm 2370 .xmm_vandp_rrr(src, scratch.inner(), dst, kind.lane_size()); 2371 }); 2372 } 2373 } 2374 Ok(()) 2375 } 2376 v128_neg(&mut self, op: WritableReg, kind: V128NegKind) -> Result<()>2377 fn v128_neg(&mut self, op: WritableReg, kind: V128NegKind) -> Result<()> { 2378 self.ensure_has_avx()?; 2379 2380 match kind { 2381 V128NegKind::I8x16 | V128NegKind::I16x8 | V128NegKind::I32x4 | V128NegKind::I64x2 => { 2382 self.with_scratch::<FloatScratch, _>(|masm, tmp| { 2383 masm.v128_xor(tmp.inner(), tmp.inner(), tmp.writable())?; 2384 masm.v128_sub(tmp.inner(), op.to_reg(), op, kind.into())?; 2385 wasmtime_environ::error::Ok(()) 2386 })?; 2387 } 2388 V128NegKind::F32x4 | V128NegKind::F64x2 => { 2389 self.with_scratch::<FloatScratch, _>(|masm, tmp| { 2390 // Create a mask of all 1s. 2391 masm.asm.xmm_vpcmpeq_rrr( 2392 tmp.writable(), 2393 tmp.inner(), 2394 tmp.inner(), 2395 kind.lane_size(), 2396 ); 2397 // Left shift the lanes in the mask so only the sign bit in the 2398 // mask is set to 1. 2399 masm.asm.xmm_vpsll_rri( 2400 tmp.inner(), 2401 tmp.writable(), 2402 (kind.lane_size().num_bits() - 1) as u32, 2403 kind.lane_size(), 2404 ); 2405 // Use the mask to flip the sign bit. 
    /// Emits a lane-wise vector shift.
    ///
    /// The shift amount (a general purpose register) and then the vector
    /// operand are popped from the value stack held by `context`. The amount
    /// is first masked with `lane_width.num_bits() - 1`. The operand is
    /// shifted in place and pushed back as a `V128`; the shift amount
    /// register is freed.
    ///
    /// 16- and 32-bit lanes (all kinds) and 64-bit lanes (`Shl`, `ShrU`) map
    /// to a single shift instruction. 8-bit lanes and the signed 64-bit right
    /// shift are emitted as longer sequences by the helper closures below.
    /// Any other `(lane_width, kind)` pair bails with
    /// `CodeGenError::invalid_operand_combination`.
    fn v128_shift(
        &mut self,
        context: &mut CodeGenContext<Emission>,
        lane_width: OperandSize,
        kind: ShiftKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;
        let shift_amount = context.pop_to_reg(self, None)?.reg;
        let operand = context.pop_to_reg(self, None)?.reg;
        let amount_mask = lane_width.num_bits() - 1;

        // Reduce the shift amount modulo the lane width.
        self.and(
            writable!(shift_amount),
            shift_amount,
            RegImm::i32(amount_mask as i32),
            OperandSize::S32,
        )?;

        // Copies the (masked) shift amount into the given scratch xmm
        // register, where the register-operand shift instructions read it.
        let move_to_tmp_xmm = |this: &mut Self, tmp_xmm: Scratch| {
            this.asm
                .avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);
        };

        // A helper for deciding between `vpsllw` and `vpsrlw` in
        // `shift_i8x16`.
        enum Direction {
            Left,
            Right,
        }

        let shift_i8x16 = |this: &mut Self,
                           masks: &'static [u8],
                           direction: Direction|
         -> Result<()> {
            // The case for i8x16 is a little bit trickier because x64 doesn't provide an 8-bit
            // shift instruction. Instead, we shift as 16 bits, and then mask the bits in the
            // 8-bit lanes, for example (with two 8-bit lanes):
            // - Before shifting:
            // 01001101 11101110
            // - shifting by 2 left:
            // 00110111 10111000
            //       ^^_ these bits come from the previous byte, and need to be masked.
            // - The mask:
            // 11111100 11111111
            // - After masking:
            // 00110100 10111000
            //
            // The mask is loaded from a constant table, indexed by the shift amount.

            this.with_scratch::<FloatScratch, _>(|this, tmp_xmm| {
                this.asm
                    .avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);

                // Perform the 16-bit shift.
                match direction {
                    Direction::Left => this.asm.xmm_vpsll_rrr(
                        operand,
                        tmp_xmm.inner(),
                        writable!(operand),
                        OperandSize::S16,
                    ),
                    Direction::Right => this.asm.xmm_vpsrl_rrr(
                        operand,
                        tmp_xmm.inner(),
                        writable!(operand),
                        OperandSize::S16,
                    ),
                }

                // Get a handle to the masks array constant.
                let masks_addr = this.asm.add_constant(masks);

                this.with_scratch::<IntScratch, _>(|this, tmp| {
                    // Load the masks array effective address into the tmp register.
                    this.asm.lea(&masks_addr, tmp.writable(), OperandSize::S64);

                    // Compute the offset of the mask that we need to use. This is
                    // shift_amount * 16 == shift_amount << 4. Note that this overwrites
                    // `shift_amount`, which has already been copied to `tmp_xmm` above.
                    this.asm
                        .shift_ir(4, writable!(shift_amount), ShiftKind::Shl, OperandSize::S32);

                    // Load the mask to tmp_xmm, replacing the shift amount
                    // that is no longer needed there.
                    this.asm.xmm_vmovdqu_mr(
                        &Address::ImmRegRegShift {
                            simm32: 0,
                            base: tmp.inner(),
                            index: shift_amount,
                            shift: 0,
                        },
                        tmp_xmm.writable(),
                        MemFlags::trusted(),
                    );
                });

                // Mask unwanted bits from operand.
                this.asm
                    .xmm_vpand_rrr(tmp_xmm.inner(), operand, writable!(operand));
                Ok(())
            })
        };

        let i64x2_shr_s = |this: &mut Self, context: &mut CodeGenContext<Emission>| -> Result<()> {
            // The sign bit of each 64-bit lane.
            const SIGN_MASK: u128 = 0x8000000000000000_8000000000000000;

            // AVX doesn't have an instruction for i64x2 signed right shift. Instead we use the
            // following formula (from Hacker's Delight 2-7), where x is the value and n the shift
            // amount, for each lane:
            // t = (1 << 63) >> n; ((x >> n) ^ t) - t

            // We need an extra scratch register:
            let tmp_xmm2 = context.any_fpr(this)?;

            this.with_scratch::<FloatScratch, _>(|this, tmp_xmm| {
                this.asm
                    .avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);

                let cst = this.asm.add_constant(&SIGN_MASK.to_le_bytes());

                // tmp_xmm2 = t = SIGN_MASK >> n (logical)
                this.asm
                    .xmm_vmovdqu_mr(&cst, writable!(tmp_xmm2), MemFlags::trusted());
                this.asm.xmm_vpsrl_rrr(
                    tmp_xmm2,
                    tmp_xmm.inner(),
                    writable!(tmp_xmm2),
                    OperandSize::S64,
                );
                // operand = x >> n (logical)
                this.asm.xmm_vpsrl_rrr(
                    operand,
                    tmp_xmm.inner(),
                    writable!(operand),
                    OperandSize::S64,
                );
            });
            // operand = (operand ^ t) - t
            this.asm
                .xmm_vpxor_rrr(operand, tmp_xmm2, writable!(operand));
            this.asm
                .xmm_vpsub_rrr(operand, tmp_xmm2, writable!(operand), OperandSize::S64);

            context.free_reg(tmp_xmm2);

            Ok(())
        };

        let i8x16_shr_s = |this: &mut Self, context: &mut CodeGenContext<Emission>| -> Result<()> {
            // Since the x86 instruction set does not have an 8x16 shift instruction and the
            // approach used for `ishl` and `ushr` cannot be easily used (the masks do not
            // preserve the sign), we use a different approach here: separate the low and
            // high lanes, shift them separately, and merge them into the final result.
            //
            // Visually, this looks like the following, where `src.i8x16 = [s0, s1, ...,
            // s15]`:
            //
            //   lo.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]
            //   shifted_lo.i16x8 = shift each lane of `low`
            //   hi.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
            //   shifted_hi.i16x8 = shift each lane of `high`
            //   result = [s0'', s1'', ..., s15'']

            // In order for `packsswb` later to only use the high byte of each
            // 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to
            // fill in the upper bits appropriately.
            let tmp_lo = context.any_fpr(this)?;
            let tmp_hi = context.any_fpr(this)?;

            this.with_scratch::<FloatScratch, _>(|this, tmp_xmm| {
                // shift_amount += 8 (the extra 8 bits described above).
                this.asm
                    .add_ir(8, writable!(shift_amount), OperandSize::S32);
                this.asm
                    .avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);

                // Extract lower and upper bytes.
                this.asm
                    .xmm_vpunpckl_rrr(operand, operand, writable!(tmp_lo), OperandSize::S8);
                this.asm
                    .xmm_vpunpckh_rrr(operand, operand, writable!(tmp_hi), OperandSize::S8);

                // Perform a 16-bit arithmetic right shift of the lower and upper halves.
                this.asm.xmm_vpsra_rrr(
                    tmp_lo,
                    tmp_xmm.inner(),
                    writable!(tmp_lo),
                    OperandSize::S16,
                );
                this.asm.xmm_vpsra_rrr(
                    tmp_hi,
                    tmp_xmm.inner(),
                    writable!(tmp_hi),
                    OperandSize::S16,
                );
            });

            // Merge lower and upper bytes back.
            this.asm
                .xmm_vpackss_rrr(tmp_lo, tmp_hi, writable!(operand), OperandSize::S8);

            context.free_reg(tmp_lo);
            context.free_reg(tmp_hi);

            Ok(())
        };

        match (lane_width, kind) {
            // shl
            (OperandSize::S8, ShiftKind::Shl) => {
                shift_i8x16(self, &I8X16_ISHL_MASKS, Direction::Left)?
            }
            (OperandSize::S16, ShiftKind::Shl) => {
                self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {
                    move_to_tmp_xmm(masm, tmp_xmm);
                    masm.asm.xmm_vpsll_rrr(
                        operand,
                        tmp_xmm.inner(),
                        writable!(operand),
                        OperandSize::S16,
                    );
                })
            }
            (OperandSize::S32, ShiftKind::Shl) => {
                self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {
                    move_to_tmp_xmm(masm, tmp_xmm);
                    masm.asm.xmm_vpsll_rrr(
                        operand,
                        tmp_xmm.inner(),
                        writable!(operand),
                        OperandSize::S32,
                    );
                })
            }
            (OperandSize::S64, ShiftKind::Shl) => {
                self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {
                    move_to_tmp_xmm(masm, tmp_xmm);
                    masm.asm.xmm_vpsll_rrr(
                        operand,
                        tmp_xmm.inner(),
                        writable!(operand),
                        OperandSize::S64,
                    );
                })
            }
            // shr_u
            (OperandSize::S8, ShiftKind::ShrU) => {
                shift_i8x16(self, &I8X16_USHR_MASKS, Direction::Right)?
            }
            (OperandSize::S16, ShiftKind::ShrU) => {
                self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {
                    move_to_tmp_xmm(masm, tmp_xmm);
                    masm.asm.xmm_vpsrl_rrr(
                        operand,
                        tmp_xmm.inner(),
                        writable!(operand),
                        OperandSize::S16,
                    );
                })
            }
            (OperandSize::S32, ShiftKind::ShrU) => {
                self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {
                    move_to_tmp_xmm(masm, tmp_xmm);
                    masm.asm.xmm_vpsrl_rrr(
                        operand,
                        tmp_xmm.inner(),
                        writable!(operand),
                        OperandSize::S32,
                    );
                })
            }
            (OperandSize::S64, ShiftKind::ShrU) => {
                self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {
                    move_to_tmp_xmm(masm, tmp_xmm);
                    masm.asm.xmm_vpsrl_rrr(
                        operand,
                        tmp_xmm.inner(),
                        writable!(operand),
                        OperandSize::S64,
                    );
                })
            }
            // shr_s
            (OperandSize::S8, ShiftKind::ShrS) => i8x16_shr_s(self, context)?,
            (OperandSize::S16, ShiftKind::ShrS) => {
                self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {
                    move_to_tmp_xmm(masm, tmp_xmm);
                    masm.asm.xmm_vpsra_rrr(
                        operand,
                        tmp_xmm.inner(),
                        writable!(operand),
                        OperandSize::S16,
                    );
                })
            }
            (OperandSize::S32, ShiftKind::ShrS) => {
                self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {
                    move_to_tmp_xmm(masm, tmp_xmm);
                    masm.asm.xmm_vpsra_rrr(
                        operand,
                        tmp_xmm.inner(),
                        writable!(operand),
                        OperandSize::S32,
                    );
                })
            }
            (OperandSize::S64, ShiftKind::ShrS) => i64x2_shr_s(self, context)?,

            // Every remaining combination is rejected.
            _ => bail!(CodeGenError::invalid_operand_combination()),
        }

        // The shifted vector stays in `operand`; the amount register is done.
        context.free_reg(shift_amount);
        context
            .stack
            .push(TypedReg::new(WasmValType::V128, operand).into());
        Ok(())
    }
2742 let address = self.asm.add_constant(&[ 2743 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 2744 0x00, 0x80, 2745 ]); 2746 self.asm 2747 .xmm_vpcmpeq_rrm(writable!(rhs), dst.to_reg(), &address, size); 2748 self.asm.xmm_vpxor_rrr(dst.to_reg(), rhs, dst); 2749 Ok(()) 2750 } 2751 v128_all_true(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()>2752 fn v128_all_true(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> { 2753 self.ensure_has_avx()?; 2754 2755 self.with_scratch::<FloatScratch, _>(|masm, scratch| { 2756 // Create a mask of all 0s. 2757 masm.asm 2758 .xmm_vpxor_rrr(scratch.inner(), scratch.inner(), scratch.writable()); 2759 // Sets lane in `dst` to not zero if `src` lane was zero, and lane in 2760 // `dst` to zero if `src` lane was not zero. 2761 masm.asm 2762 .xmm_vpcmpeq_rrr(writable!(src), src, scratch.inner(), size); 2763 // Sets ZF if all values are zero (i.e., if all original values were not zero). 2764 masm.asm.xmm_vptest(src, src); 2765 // Set byte if ZF=1. 2766 }); 2767 self.asm.setcc(IntCmpKind::Eq, dst); 2768 Ok(()) 2769 } 2770 v128_bitmask(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()>2771 fn v128_bitmask(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> { 2772 self.ensure_has_avx()?; 2773 2774 match size { 2775 OperandSize::S8 => self.asm.xmm_vpmovmsk_rr(src, dst, size, OperandSize::S32), 2776 OperandSize::S16 => { 2777 // Signed conversion of 16-bit integers to 8-bit integers. 2778 self.asm 2779 .xmm_vpackss_rrr(src, src, writable!(src), OperandSize::S8); 2780 // Creates a mask from each byte in `src`. 2781 self.asm 2782 .xmm_vpmovmsk_rr(src, dst, OperandSize::S8, OperandSize::S32); 2783 // Removes 8 bits added as a result of the `vpackss` step. 
2784 self.asm 2785 .shift_ir(0x8, dst, ShiftKind::ShrU, OperandSize::S32); 2786 } 2787 OperandSize::S32 | OperandSize::S64 => { 2788 self.asm.xmm_vmovskp_rr(src, dst, size, OperandSize::S32) 2789 } 2790 _ => unimplemented!(), 2791 } 2792 2793 Ok(()) 2794 } 2795 v128_trunc( &mut self, context: &mut CodeGenContext<Emission>, kind: V128TruncKind, ) -> Result<()>2796 fn v128_trunc( 2797 &mut self, 2798 context: &mut CodeGenContext<Emission>, 2799 kind: V128TruncKind, 2800 ) -> Result<()> { 2801 self.ensure_has_avx()?; 2802 2803 let reg = writable!(context.pop_to_reg(self, None)?.reg); 2804 match kind { 2805 V128TruncKind::F32x4 | V128TruncKind::F64x2 => self.asm.xmm_vroundp_rri( 2806 reg.to_reg(), 2807 reg, 2808 VroundMode::TowardZero, 2809 kind.dst_lane_size(), 2810 ), 2811 V128TruncKind::I32x4FromF32x4S => { 2812 self.v128_trunc_sat_f32x4_s(reg, kind.src_lane_size(), kind.dst_lane_size())?; 2813 } 2814 V128TruncKind::I32x4FromF32x4U => { 2815 let temp_reg = writable!(context.any_fpr(self)?); 2816 self.v128_trunc_sat_f32x4_u( 2817 reg, 2818 temp_reg, 2819 kind.src_lane_size(), 2820 kind.dst_lane_size(), 2821 )?; 2822 context.free_reg(temp_reg.to_reg()); 2823 } 2824 V128TruncKind::I32x4FromF64x2SZero => { 2825 self.v128_trunc_sat_f64x2_s_zero(reg, kind.src_lane_size())?; 2826 } 2827 V128TruncKind::I32x4FromF64x2UZero => { 2828 self.v128_trunc_sat_f64x2_u_zero(reg, kind.src_lane_size(), kind.dst_lane_size())?; 2829 } 2830 } 2831 2832 context.stack.push(TypedReg::v128(reg.to_reg()).into()); 2833 Ok(()) 2834 } 2835 v128_min( &mut self, src1: Reg, src2: Reg, dst: WritableReg, kind: V128MinKind, ) -> Result<()>2836 fn v128_min( 2837 &mut self, 2838 src1: Reg, 2839 src2: Reg, 2840 dst: WritableReg, 2841 kind: V128MinKind, 2842 ) -> Result<()> { 2843 self.ensure_has_avx()?; 2844 2845 match kind { 2846 V128MinKind::I8x16S 2847 | V128MinKind::I8x16U 2848 | V128MinKind::I16x8S 2849 | V128MinKind::I16x8U 2850 | V128MinKind::I32x4S 2851 | V128MinKind::I32x4U => { 2852 match kind { 
2853 V128MinKind::I8x16S => { 2854 self.asm.xmm_vpmins_rrr(dst, src1, src2, OperandSize::S8) 2855 } 2856 V128MinKind::I8x16U => { 2857 self.asm.xmm_vpminu_rrr(dst, src1, src2, OperandSize::S8) 2858 } 2859 V128MinKind::I16x8S => { 2860 self.asm.xmm_vpmins_rrr(dst, src1, src2, OperandSize::S16) 2861 } 2862 V128MinKind::I16x8U => { 2863 self.asm.xmm_vpminu_rrr(dst, src1, src2, OperandSize::S16) 2864 } 2865 V128MinKind::I32x4S => { 2866 self.asm.xmm_vpmins_rrr(dst, src1, src2, OperandSize::S32) 2867 } 2868 V128MinKind::I32x4U => { 2869 self.asm.xmm_vpminu_rrr(dst, src1, src2, OperandSize::S32) 2870 } 2871 _ => unreachable!(), 2872 }; 2873 } 2874 V128MinKind::F32x4 | V128MinKind::F64x2 => { 2875 self.with_scratch::<FloatScratch, _>(|masm, scratch| { 2876 // Handling +0 and -0 as well as NaN values are not commutative 2877 // when using `vminp` so we have to compensate. 2878 // Perform two comparison operations with the operands swapped 2879 // and OR the result to propagate 0 (positive and negative) and 2880 // NaN. 2881 masm.asm 2882 .xmm_vminp_rrr(src1, src2, scratch.writable(), kind.lane_size()); 2883 masm.asm.xmm_vminp_rrr(src2, src1, dst, kind.lane_size()); 2884 // Use a single OR instruction to set the sign bit if either 2885 // result has the sign bit set to correctly propagate -0. 2886 masm.asm 2887 .xmm_vorp_rrr(dst.to_reg(), scratch.inner(), dst, kind.lane_size()); 2888 }); 2889 // Set lanes with NaN to all 1s. 2890 self.asm.xmm_vcmpp_rrr( 2891 writable!(src2), 2892 src2, 2893 dst.to_reg(), 2894 kind.lane_size(), 2895 VcmpKind::Unord, 2896 ); 2897 // Doesn't change non-NaN values. For NaN values, sets all bits. 
2898 self.asm 2899 .xmm_vorp_rrr(src2, dst.to_reg(), dst, kind.lane_size()); 2900 self.canonicalize_nans(writable!(src2), dst, kind.lane_size()); 2901 } 2902 } 2903 2904 Ok(()) 2905 } 2906 v128_max( &mut self, src1: Reg, src2: Reg, dst: WritableReg, kind: V128MaxKind, ) -> Result<()>2907 fn v128_max( 2908 &mut self, 2909 src1: Reg, 2910 src2: Reg, 2911 dst: WritableReg, 2912 kind: V128MaxKind, 2913 ) -> Result<()> { 2914 self.ensure_has_avx()?; 2915 2916 match kind { 2917 V128MaxKind::I8x16S 2918 | V128MaxKind::I8x16U 2919 | V128MaxKind::I16x8S 2920 | V128MaxKind::I16x8U 2921 | V128MaxKind::I32x4S 2922 | V128MaxKind::I32x4U => { 2923 match kind { 2924 V128MaxKind::I8x16S => { 2925 self.asm.xmm_vpmaxs_rrr(dst, src1, src2, OperandSize::S8) 2926 } 2927 V128MaxKind::I8x16U => { 2928 self.asm.xmm_vpmaxu_rrr(dst, src1, src2, OperandSize::S8) 2929 } 2930 V128MaxKind::I16x8S => { 2931 self.asm.xmm_vpmaxs_rrr(dst, src1, src2, OperandSize::S16) 2932 } 2933 V128MaxKind::I16x8U => { 2934 self.asm.xmm_vpmaxu_rrr(dst, src1, src2, OperandSize::S16) 2935 } 2936 V128MaxKind::I32x4S => { 2937 self.asm.xmm_vpmaxs_rrr(dst, src1, src2, OperandSize::S32) 2938 } 2939 V128MaxKind::I32x4U => { 2940 self.asm.xmm_vpmaxu_rrr(dst, src1, src2, OperandSize::S32) 2941 } 2942 _ => unreachable!(), 2943 }; 2944 } 2945 V128MaxKind::F32x4 | V128MaxKind::F64x2 => { 2946 self.with_scratch::<FloatScratch, _>(|masm, scratch| { 2947 // Handling +0 and -0 as well as NaN values are not commutative 2948 // when using `vmaxp` so we have to compensate. 2949 // Perform two comparison operations with the operands swapped 2950 // so we can propagate 0 (positive and negative) and NaNs 2951 // correctly. 2952 2953 masm.asm 2954 .xmm_vmaxp_rrr(src1, src2, scratch.writable(), kind.lane_size()); 2955 masm.asm.xmm_vmaxp_rrr(src2, src1, dst, kind.lane_size()); 2956 // This combination of XOR, OR, and SUB will set the sign bit 2957 // on a 0 result to the correct value for a max operation. 
2958 masm.asm 2959 .xmm_vxorp_rrr(dst.to_reg(), scratch.inner(), dst, kind.lane_size()); 2960 masm.asm.xmm_vorp_rrr( 2961 dst.to_reg(), 2962 scratch.inner(), 2963 writable!(src2), 2964 kind.lane_size(), 2965 ); 2966 }); 2967 self.asm 2968 .xmm_vsub_rrr(src2, dst.to_reg(), dst, kind.lane_size()); 2969 // Set lanes of NaN values to 1. 2970 self.asm.xmm_vcmpp_rrr( 2971 writable!(src2), 2972 src2, 2973 src2, 2974 kind.lane_size(), 2975 VcmpKind::Unord, 2976 ); 2977 self.canonicalize_nans(writable!(src2), dst, kind.lane_size()); 2978 } 2979 } 2980 Ok(()) 2981 } 2982 v128_extmul( &mut self, context: &mut CodeGenContext<Emission>, kind: V128ExtMulKind, ) -> Result<()>2983 fn v128_extmul( 2984 &mut self, 2985 context: &mut CodeGenContext<Emission>, 2986 kind: V128ExtMulKind, 2987 ) -> Result<()> { 2988 self.ensure_has_avx()?; 2989 2990 // The implementation for extmul is not optimized; for simplicity's sake, we simply perform 2991 // an extension followed by a multiplication using already implemented primitives. 2992 2993 let src1 = context.pop_to_reg(self, None)?; 2994 let src2 = context.pop_to_reg(self, None)?; 2995 2996 let ext_kind = kind.into(); 2997 self.v128_extend(src1.reg, writable!(src1.reg), ext_kind)?; 2998 self.v128_extend(src2.reg, writable!(src2.reg), ext_kind)?; 2999 3000 context.stack.push(src2.into()); 3001 context.stack.push(src1.into()); 3002 3003 self.v128_mul(context, kind.into()) 3004 } 3005 v128_extadd_pairwise( &mut self, src: Reg, dst: WritableReg, kind: V128ExtAddKind, ) -> Result<()>3006 fn v128_extadd_pairwise( 3007 &mut self, 3008 src: Reg, 3009 dst: WritableReg, 3010 kind: V128ExtAddKind, 3011 ) -> Result<()> { 3012 self.ensure_has_avx()?; 3013 3014 match kind { 3015 V128ExtAddKind::I8x16S => { 3016 self.with_scratch::<FloatScratch, _>(|masm, scratch| { 3017 // Use `vpmaddubsw` with a vector of 16 8-bit 1's which will 3018 // sign extend `src` to 16 bits and add adjacent words. 
3019 // Need to supply constant as first operand since first operand 3020 // is treated as unsigned and the second operand is signed. 3021 let mask = masm.asm.add_constant(&[1; 16]); 3022 masm.asm.xmm_mov_mr( 3023 &mask, 3024 scratch.writable(), 3025 OperandSize::S128, 3026 MemFlags::trusted(), 3027 ); 3028 masm.asm.xmm_vpmaddubsw_rrr(scratch.inner(), src, dst); 3029 }); 3030 } 3031 V128ExtAddKind::I8x16U => { 3032 // Same approach as the signed variant but treat `src` as 3033 // unsigned instead of signed by passing it as the first 3034 // operand. 3035 let mask = self.asm.add_constant(&[1; 16]); 3036 self.asm.xmm_vpmaddubsw_rmr(src, &mask, dst); 3037 } 3038 V128ExtAddKind::I16x8S => { 3039 // Similar approach to the two variants above. The vector is 8 3040 // lanes of 16-bit 1's and `vpmaddwd` treats both operands as 3041 // signed. 3042 let mask = self 3043 .asm 3044 .add_constant(&[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]); 3045 self.asm.xmm_vpmaddwd_rmr(src, &mask, dst); 3046 } 3047 V128ExtAddKind::I16x8U => { 3048 // Similar approach as the signed variant. 3049 // `vpmaddwd` operates on signed integers and the operand is 3050 // unsigned so the operand needs to be converted to a signed 3051 // format and than that process needs to be reversed after 3052 // `vpmaddwd`. 3053 // Flip the sign bit for 8 16-bit lanes. 3054 let xor_mask = self.asm.add_constant(&[ 3055 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 3056 0x80, 0x00, 0x80, 3057 ]); 3058 self.asm.xmm_vpxor_rmr(src, &xor_mask, dst); 3059 3060 let madd_mask = self 3061 .asm 3062 .add_constant(&[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]); 3063 self.asm.xmm_vpmaddwd_rmr(dst.to_reg(), &madd_mask, dst); 3064 3065 // Reverse the XOR. The XOR effectively subtracts 32,768 from 3066 // both pairs that are added together so 65,536 (0x10000) 3067 // needs to be added to 4 lanes of 32-bit values. 
3068 let add_mask = self 3069 .asm 3070 .add_constant(&[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0]); 3071 self.asm 3072 .xmm_vpadd_rmr(dst.to_reg(), &add_mask, dst, OperandSize::S32); 3073 } 3074 } 3075 Ok(()) 3076 } 3077 v128_dot(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg) -> Result<()>3078 fn v128_dot(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg) -> Result<()> { 3079 self.ensure_has_avx()?; 3080 self.asm.xmm_vpmaddwd_rrr(lhs, rhs, dst); 3081 Ok(()) 3082 } 3083 v128_popcnt(&mut self, context: &mut CodeGenContext<Emission>) -> Result<()>3084 fn v128_popcnt(&mut self, context: &mut CodeGenContext<Emission>) -> Result<()> { 3085 self.ensure_has_avx()?; 3086 3087 let reg = writable!(context.pop_to_reg(self, None)?.reg); 3088 let reg2 = writable!(context.any_fpr(self)?); 3089 3090 // This works by using a lookup table to determine the count of bits 3091 // set in the upper 4 bits and lower 4 bits separately and then adding 3092 // the counts. 3093 3094 // A mask to zero out the upper 4 bits in each lane. 3095 let address = self.asm.add_constant(&[ 3096 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 3097 0x0F, 0x0F, 3098 ]); 3099 3100 self.with_scratch::<FloatScratch, _>(|masm, scratch| { 3101 // Zero out the upper 4 bits of each lane. 3102 masm.asm 3103 .xmm_vpand_rrm(reg.to_reg(), &address, scratch.writable()); 3104 // Right shift bytes in input by 4 bits to put the upper 4 bits in the 3105 // lower 4 bits. 3106 masm.asm 3107 .xmm_vpsrl_rri(reg.to_reg(), reg, 0x4, OperandSize::S16); 3108 // Zero out the upper 4 bits of each shifted lane. 3109 masm.asm.xmm_vpand_rrm(reg.to_reg(), &address, reg); 3110 3111 // Write a lookup table of 4 bit values to number of bits set to a 3112 // register so we only perform the memory read once. 
3113 // Index (hex) | Value (binary) | Population Count 3114 // 0x0 | 0000 | 0 3115 // 0x1 | 0001 | 1 3116 // 0x2 | 0010 | 1 3117 // 0x3 | 0011 | 2 3118 // 0x4 | 0100 | 1 3119 // 0x5 | 0101 | 2 3120 // 0x6 | 0110 | 2 3121 // 0x7 | 0111 | 3 3122 // 0x8 | 1000 | 1 3123 // 0x9 | 1001 | 2 3124 // 0xA | 1010 | 2 3125 // 0xB | 1011 | 3 3126 // 0xC | 1100 | 2 3127 // 0xD | 1101 | 3 3128 // 0xE | 1110 | 3 3129 // 0xF | 1111 | 4 3130 let address = masm.asm.add_constant(&[ 3131 0x0, 0x1, 0x1, 0x2, 0x1, 0x2, 0x2, 0x3, 0x1, 0x2, 0x2, 0x3, 0x2, 0x3, 0x3, 0x4, 3132 ]); 3133 masm.asm 3134 .xmm_mov_mr(&address, reg2, OperandSize::S128, MemFlags::trusted()); 3135 // Use the upper 4 bits as an index into the lookup table. 3136 masm.asm.xmm_vpshufb_rrr(reg, reg2.to_reg(), reg.to_reg()); 3137 // Use the lower 4 bits as an index into the lookup table. 3138 masm.asm 3139 .xmm_vpshufb_rrr(scratch.writable(), reg2.to_reg(), scratch.inner()); 3140 context.free_reg(reg2.to_reg()); 3141 3142 // Add the counts of the upper 4 bits and the lower 4 bits to get the 3143 // total number of bits set. 
3144 masm.asm 3145 .xmm_vpadd_rrr(reg.to_reg(), scratch.inner(), reg, OperandSize::S8); 3146 wasmtime_environ::error::Ok(()) 3147 })?; 3148 3149 context.stack.push(TypedReg::v128(reg.to_reg()).into()); 3150 Ok(()) 3151 } 3152 v128_avgr(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()>3153 fn v128_avgr(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> { 3154 self.ensure_has_avx()?; 3155 self.asm.xmm_vpavg_rrr(lhs, rhs, dst, size); 3156 Ok(()) 3157 } 3158 v128_div(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()>3159 fn v128_div(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> { 3160 self.ensure_has_avx()?; 3161 self.asm.xmm_vdivp_rrr(lhs, rhs, dst, size); 3162 Ok(()) 3163 } 3164 v128_sqrt(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()>3165 fn v128_sqrt(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> { 3166 self.ensure_has_avx()?; 3167 self.asm.xmm_vsqrtp_rr(src, dst, size); 3168 Ok(()) 3169 } 3170 v128_ceil(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()>3171 fn v128_ceil(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> { 3172 self.ensure_has_avx()?; 3173 self.asm 3174 .xmm_vroundp_rri(src, dst, VroundMode::TowardPositiveInfinity, size); 3175 Ok(()) 3176 } 3177 v128_floor(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()>3178 fn v128_floor(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> { 3179 self.ensure_has_avx()?; 3180 self.asm 3181 .xmm_vroundp_rri(src, dst, VroundMode::TowardNegativeInfinity, size); 3182 Ok(()) 3183 } 3184 v128_nearest(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()>3185 fn v128_nearest(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> { 3186 self.ensure_has_avx()?; 3187 self.asm 3188 .xmm_vroundp_rri(src, dst, VroundMode::TowardNearest, 
size); 3189 Ok(()) 3190 } 3191 v128_pmin(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()>3192 fn v128_pmin(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> { 3193 self.ensure_has_avx()?; 3194 // Reverse operands since Wasm specifies returning the first operand if 3195 // either operand is NaN while x86 returns the second operand. 3196 self.asm.xmm_vminp_rrr(rhs, lhs, dst, size); 3197 Ok(()) 3198 } 3199 v128_pmax(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()>3200 fn v128_pmax(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> { 3201 self.ensure_has_avx()?; 3202 // Reverse operands since Wasm specifies returning the first operand if 3203 // either operand is NaN while x86 returns the second operand. 3204 self.asm.xmm_vmaxp_rrr(rhs, lhs, dst, size); 3205 Ok(()) 3206 } 3207 } 3208 3209 impl MacroAssembler { 3210 /// Create an x64 MacroAssembler. new( ptr_size: impl PtrSize, shared_flags: settings::Flags, isa_flags: x64_settings::Flags, ) -> Result<Self>3211 pub fn new( 3212 ptr_size: impl PtrSize, 3213 shared_flags: settings::Flags, 3214 isa_flags: x64_settings::Flags, 3215 ) -> Result<Self> { 3216 let ptr_type: WasmValType = ptr_type_from_ptr_size(ptr_size.size()); 3217 3218 Ok(Self { 3219 sp_offset: 0, 3220 sp_max: 0, 3221 stack_max_use_add: None, 3222 asm: Assembler::new(shared_flags.clone(), isa_flags.clone()), 3223 flags: isa_flags, 3224 shared_flags, 3225 ptr_size: ptr_type.try_into()?, 3226 scratch_scope: RegAlloc::from(scratch_gpr_bitset(), scratch_fpr_bitset()), 3227 }) 3228 } 3229 3230 /// Add the maximum stack used to a register, recording an obligation to update the 3231 /// add-with-immediate instruction emitted to use the real stack max when the masm is being 3232 /// finalized. 
add_stack_max(&mut self, reg: Reg)3233 fn add_stack_max(&mut self, reg: Reg) { 3234 assert!(self.stack_max_use_add.is_none()); 3235 let patch = PatchableAddToReg::new(reg, OperandSize::S64, &mut self.asm); 3236 self.stack_max_use_add.replace(patch); 3237 } 3238 ensure_has_avx(&self) -> Result<()>3239 fn ensure_has_avx(&self) -> Result<()> { 3240 crate::ensure!(self.flags.has_avx(), CodeGenError::UnimplementedForNoAvx); 3241 Ok(()) 3242 } 3243 ensure_has_avx2(&self) -> Result<()>3244 fn ensure_has_avx2(&self) -> Result<()> { 3245 crate::ensure!(self.flags.has_avx2(), CodeGenError::UnimplementedForNoAvx2); 3246 Ok(()) 3247 } 3248 ensure_has_avx512vl(&self) -> Result<()>3249 fn ensure_has_avx512vl(&self) -> Result<()> { 3250 crate::ensure!( 3251 self.flags.has_avx512vl(), 3252 CodeGenError::UnimplementedForNoAvx512VL 3253 ); 3254 Ok(()) 3255 } 3256 ensure_has_avx512dq(&self) -> Result<()>3257 fn ensure_has_avx512dq(&self) -> Result<()> { 3258 crate::ensure!( 3259 self.flags.has_avx512dq(), 3260 CodeGenError::UnimplementedForNoAvx512DQ 3261 ); 3262 Ok(()) 3263 } 3264 increment_sp(&mut self, bytes: u32)3265 fn increment_sp(&mut self, bytes: u32) { 3266 self.sp_offset += bytes; 3267 3268 // NOTE: we use `max` here to track the largest stack allocation in `sp_max`. Once we have 3269 // seen the entire function, this value will represent the maximum size for the stack 3270 // frame. 
3271 self.sp_max = self.sp_max.max(self.sp_offset); 3272 } 3273 decrement_sp(&mut self, bytes: u32)3274 fn decrement_sp(&mut self, bytes: u32) { 3275 assert!( 3276 self.sp_offset >= bytes, 3277 "sp offset = {}; bytes = {}", 3278 self.sp_offset, 3279 bytes 3280 ); 3281 self.sp_offset -= bytes; 3282 } 3283 load_constant(&mut self, constant: &I, dst: WritableReg, size: OperandSize) -> Result<()>3284 fn load_constant(&mut self, constant: &I, dst: WritableReg, size: OperandSize) -> Result<()> { 3285 match constant { 3286 I::I32(v) => Ok(self.asm.mov_ir(*v as u64, dst, size)), 3287 I::I64(v) => Ok(self.asm.mov_ir(*v, dst, size)), 3288 I::F32(_) => Ok(self.asm.load_fp_const(dst, &constant.to_bytes(), size)), 3289 I::F64(_) => Ok(self.asm.load_fp_const(dst, &constant.to_bytes(), size)), 3290 I::V128(_) => Ok(self.asm.load_fp_const(dst, &constant.to_bytes(), size)), 3291 } 3292 } 3293 3294 /// A common implementation for zero-extend stack loads. load_impl( &mut self, src: Address, dst: WritableReg, size: OperandSize, flags: MemFlags, ) -> Result<()>3295 fn load_impl( 3296 &mut self, 3297 src: Address, 3298 dst: WritableReg, 3299 size: OperandSize, 3300 flags: MemFlags, 3301 ) -> Result<()> { 3302 if dst.to_reg().is_int() { 3303 let ext = size.extend_to::<Zero>(OperandSize::S64); 3304 self.asm.movzx_mr(&src, dst, ext, flags); 3305 } else { 3306 self.asm.xmm_mov_mr(&src, dst, size, flags); 3307 } 3308 3309 Ok(()) 3310 } 3311 3312 /// A common implementation for stack stores. 
store_impl( &mut self, src: RegImm, dst: Address, size: OperandSize, flags: MemFlags, ) -> Result<()>3313 fn store_impl( 3314 &mut self, 3315 src: RegImm, 3316 dst: Address, 3317 size: OperandSize, 3318 flags: MemFlags, 3319 ) -> Result<()> { 3320 let _ = match src { 3321 RegImm::Imm(imm) => match imm { 3322 I::I32(v) => self.asm.mov_im(v as i32, &dst, size, flags), 3323 I::I64(v) => match v.try_into() { 3324 Ok(v) => self.asm.mov_im(v, &dst, size, flags), 3325 Err(_) => { 3326 // If the immediate doesn't sign extend, use a scratch 3327 // register. 3328 self.with_scratch::<IntScratch, _>(|masm, scratch| { 3329 masm.asm.mov_ir(v, scratch.writable(), size); 3330 masm.asm.mov_rm(scratch.inner(), &dst, size, flags); 3331 }); 3332 } 3333 }, 3334 I::F32(v) => { 3335 let addr = self.asm.add_constant(v.to_le_bytes().as_slice()); 3336 self.with_scratch::<FloatScratch, _>(|masm, float_scratch| { 3337 // Always trusted, since we are loading the constant from 3338 // the constant pool. 3339 masm.asm.xmm_mov_mr( 3340 &addr, 3341 float_scratch.writable(), 3342 size, 3343 MemFlags::trusted(), 3344 ); 3345 masm.asm 3346 .xmm_mov_rm(float_scratch.inner(), &dst, size, flags); 3347 }); 3348 } 3349 I::F64(v) => { 3350 let addr = self.asm.add_constant(v.to_le_bytes().as_slice()); 3351 3352 self.with_scratch::<FloatScratch, _>(|masm, float_scratch| { 3353 // Similar to above, always trusted since we are loading the 3354 // constant from the constant pool. 3355 masm.asm.xmm_mov_mr( 3356 &addr, 3357 float_scratch.writable(), 3358 size, 3359 MemFlags::trusted(), 3360 ); 3361 masm.asm 3362 .xmm_mov_rm(float_scratch.inner(), &dst, size, flags); 3363 }); 3364 } 3365 I::V128(v) => { 3366 let addr = self.asm.add_constant(v.to_le_bytes().as_slice()); 3367 self.with_scratch::<FloatScratch, _>(|masm, vector_scratch| { 3368 // Always trusted, since we are loading the constant from 3369 // the constant pool. 
3370 masm.asm.xmm_mov_mr( 3371 &addr, 3372 vector_scratch.writable(), 3373 size, 3374 MemFlags::trusted(), 3375 ); 3376 masm.asm 3377 .xmm_mov_rm(vector_scratch.inner(), &dst, size, flags); 3378 }); 3379 } 3380 }, 3381 RegImm::Reg(reg) => { 3382 if reg.is_int() { 3383 self.asm.mov_rm(reg, &dst, size, flags); 3384 } else { 3385 self.asm.xmm_mov_rm(reg, &dst, size, flags); 3386 } 3387 } 3388 }; 3389 Ok(()) 3390 } 3391 ensure_two_argument_form(dst: &Reg, lhs: &Reg) -> Result<()>3392 fn ensure_two_argument_form(dst: &Reg, lhs: &Reg) -> Result<()> { 3393 if dst != lhs { 3394 Err(format_err!(CodeGenError::invalid_two_arg_form())) 3395 } else { 3396 Ok(()) 3397 } 3398 } 3399 3400 /// The mask to use when performing a `vpshuf` operation for a 64-bit splat. vpshuf_mask_for_64_bit_splats() -> u83401 fn vpshuf_mask_for_64_bit_splats() -> u8 { 3402 // Results in the first 4 bytes and second 4 bytes being 3403 // swapped and then the swapped bytes being copied. 3404 // [d0, d1, d2, d3, d4, d5, d6, d7, ...] yields 3405 // [d4, d5, d6, d7, d0, d1, d2, d3, d4, d5, d6, d7, d0, d1, d2, d3]. 3406 0b01_00_01_00 3407 } 3408 v128_trunc_sat_f32x4_s( &mut self, reg: WritableReg, src_lane_size: OperandSize, dst_lane_size: OperandSize, ) -> Result<()>3409 fn v128_trunc_sat_f32x4_s( 3410 &mut self, 3411 reg: WritableReg, 3412 src_lane_size: OperandSize, 3413 dst_lane_size: OperandSize, 3414 ) -> Result<()> { 3415 self.with_scratch::<FloatScratch, _>(|masm, scratch| { 3416 // Create a mask to handle NaN values (1 for not NaN, 0 for 3417 // NaN). 3418 masm.asm.xmm_vcmpp_rrr( 3419 scratch.writable(), 3420 reg.to_reg(), 3421 reg.to_reg(), 3422 src_lane_size, 3423 VcmpKind::Eq, 3424 ); 3425 // Zero out any NaN values. 3426 masm.asm 3427 .xmm_vandp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size); 3428 // Create a mask for the sign bits. 3429 masm.asm 3430 .xmm_vpxor_rrr(scratch.inner(), reg.to_reg(), scratch.writable()); 3431 // Convert floats to integers. 
3432 masm.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F32ToI32); 3433 // Apply sign mask to the converted integers. 3434 masm.asm 3435 .xmm_vpand_rrr(reg.to_reg(), scratch.inner(), scratch.writable()); 3436 // Create a saturation mask of all 1s for negative numbers, 3437 // all 0s for positive numbers. The arithmetic shift will cop 3438 // the sign bit. 3439 masm.asm 3440 .xmm_vpsra_rri(scratch.inner(), scratch.writable(), 0x1F, dst_lane_size); 3441 // Combine converted integers with saturation mask. 3442 masm.asm.xmm_vpxor_rrr(reg.to_reg(), scratch.inner(), reg); 3443 Ok(()) 3444 }) 3445 } 3446 v128_trunc_sat_f32x4_u( &mut self, reg: WritableReg, temp_reg: WritableReg, src_lane_size: OperandSize, dst_lane_size: OperandSize, ) -> Result<()>3447 fn v128_trunc_sat_f32x4_u( 3448 &mut self, 3449 reg: WritableReg, 3450 temp_reg: WritableReg, 3451 src_lane_size: OperandSize, 3452 dst_lane_size: OperandSize, 3453 ) -> Result<()> { 3454 self.with_scratch::<FloatScratch, _>(|masm, scratch| { 3455 // Set scratch to all zeros. 3456 masm.asm.xmm_vxorp_rrr( 3457 reg.to_reg(), 3458 reg.to_reg(), 3459 scratch.writable(), 3460 src_lane_size, 3461 ); 3462 // Clamp negative numbers to 0. 3463 masm.asm 3464 .xmm_vmaxp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size); 3465 // Create a vector of all 1s. 3466 masm.asm.xmm_vpcmpeq_rrr( 3467 scratch.writable(), 3468 scratch.inner(), 3469 scratch.inner(), 3470 src_lane_size, 3471 ); 3472 // Set scratch to 0x7FFFFFFF (max signed 32-bit integer) by 3473 // performing a logical shift right. 3474 masm.asm 3475 .xmm_vpsrl_rri(scratch.inner(), scratch.writable(), 0x1, src_lane_size); 3476 // Convert max signed int to float as a reference point for saturation. 3477 masm.asm 3478 .xmm_vcvt_rr(scratch.inner(), scratch.writable(), VcvtKind::I32ToF32); 3479 // Convert the floats to integers and put the results in `reg2`. 3480 // This is signed and not unsigned so we need to handle the 3481 // value for the high bit in each lane. 
3482 masm.asm 3483 .xmm_vcvt_rr(reg.to_reg(), temp_reg, VcvtKind::F32ToI32); 3484 // Set `reg` lanes to the amount that the value in the lane 3485 // exceeds the maximum signed 32-bit integer. 3486 masm.asm 3487 .xmm_vsub_rrr(reg.to_reg(), scratch.inner(), reg, dst_lane_size); 3488 // Create mask in `scratch` for numbers that are larger than 3489 // the maximum signed 32-bit integer. Lanes that don't fit 3490 // in 32-bits ints will be 1. 3491 masm.asm.xmm_vcmpp_rrr( 3492 scratch.writable(), 3493 scratch.inner(), 3494 reg.to_reg(), 3495 dst_lane_size, 3496 VcmpKind::Le, 3497 ); 3498 // Convert the excess over signed 32-bits from floats to integers. 3499 masm.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F32ToI32); 3500 // Apply large number mask to excess values which will flip the 3501 // bits in any lanes that exceed signed 32-bits. Adding this 3502 // flipped value to the signed value will set the high bit and 3503 // the carry behavior will update the other bits correctly. 3504 masm.asm 3505 .xmm_vpxor_rrr(reg.to_reg(), scratch.inner(), scratch.writable()); 3506 // Set `reg` to all 0s. 3507 masm.asm.xmm_vpxor_rrr(reg.to_reg(), reg.to_reg(), reg); 3508 // Ensure excess values are not negative by taking max b/w 3509 // excess values and zero. 3510 masm.asm 3511 .xmm_vpmaxs_rrr(reg, scratch.inner(), reg.to_reg(), dst_lane_size); 3512 }); 3513 // Perform the addition between the signed conversion value (in 3514 // `reg2`) and the flipped excess value (in `reg`) to get the 3515 // unsigned value. 3516 self.asm 3517 .xmm_vpadd_rrr(reg.to_reg(), temp_reg.to_reg(), reg, dst_lane_size); 3518 Ok(()) 3519 } 3520 v128_trunc_sat_f64x2_s_zero( &mut self, reg: WritableReg, src_lane_size: OperandSize, ) -> Result<()>3521 fn v128_trunc_sat_f64x2_s_zero( 3522 &mut self, 3523 reg: WritableReg, 3524 src_lane_size: OperandSize, 3525 ) -> Result<()> { 3526 self.with_scratch::<FloatScratch, _>(|masm, scratch| { 3527 // Create a NaN mask (1s for non-NaN, 0s for NaN). 
3528 masm.asm.xmm_vcmpp_rrr( 3529 scratch.writable(), 3530 reg.to_reg(), 3531 reg.to_reg(), 3532 src_lane_size, 3533 VcmpKind::Eq, 3534 ); 3535 // Clamp NaN values to maximum 64-bit float that can be 3536 // converted to an i32. 3537 let address = masm.asm.add_constant(&[ 3538 0x00, 0x00, 0xC0, 0xFF, 0xFF, 0xFF, 0xDF, 0x41, 0x00, 0x00, 0xC0, 0xFF, 0xFF, 0xFF, 3539 0xDF, 0x41, 3540 ]); 3541 masm.asm 3542 .xmm_vandp_rrm(scratch.inner(), &address, scratch.writable(), src_lane_size); 3543 // Handle the saturation for values too large to fit in an i32. 3544 masm.asm 3545 .xmm_vminp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size); 3546 // Convert the floats to integers. 3547 masm.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F64ToI32); 3548 3549 Ok(()) 3550 }) 3551 } 3552 v128_trunc_sat_f64x2_u_zero( &mut self, reg: WritableReg, src_lane_size: OperandSize, dst_lane_size: OperandSize, ) -> Result<()>3553 fn v128_trunc_sat_f64x2_u_zero( 3554 &mut self, 3555 reg: WritableReg, 3556 src_lane_size: OperandSize, 3557 dst_lane_size: OperandSize, 3558 ) -> Result<()> { 3559 self.with_scratch::<FloatScratch, _>(|masm, scratch| { 3560 // Zero out the scratch register. 3561 masm.asm.xmm_vxorp_rrr( 3562 scratch.inner(), 3563 scratch.inner(), 3564 scratch.writable(), 3565 src_lane_size, 3566 ); 3567 // Clamp negative values to zero. 3568 masm.asm 3569 .xmm_vmaxp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size); 3570 // Clamp value to maximum unsigned 32-bit integer value 3571 // (0x41F0000000000000). 3572 let address = masm.asm.add_constant(&[ 3573 0x00, 0x00, 0xE0, 0xFF, 0xFF, 0xFF, 0xEF, 0x41, 0x00, 0x00, 0xE0, 0xFF, 0xFF, 0xFF, 3574 0xEF, 0x41, 3575 ]); 3576 masm.asm 3577 .xmm_vminp_rrm(reg.to_reg(), &address, reg, src_lane_size); 3578 // Truncate floating point values. 
3579 masm.asm 3580 .xmm_vroundp_rri(reg.to_reg(), reg, VroundMode::TowardZero, src_lane_size); 3581 // Add 2^52 (doubles store 52 bits in their mantissa) to each 3582 // lane causing values in the lower bits to be shifted into 3583 // position for integer conversion. 3584 let address = masm.asm.add_constant(&[ 3585 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 3586 0x30, 0x43, 3587 ]); 3588 masm.asm 3589 .xmm_vaddp_rrm(reg.to_reg(), &address, reg, src_lane_size); 3590 // Takes lanes 0 and 2 from `reg` (converted values) and lanes 3591 // 0 and 2 from `scratch` (zeroes) to put the converted ints in 3592 // the lower lanes and zeroes in the upper lanes. 3593 masm.asm.xmm_vshufp_rrri( 3594 reg.to_reg(), 3595 scratch.inner(), 3596 reg, 3597 0b10_00_10_00, 3598 dst_lane_size, 3599 ); 3600 Ok(()) 3601 }) 3602 } 3603 3604 /// Given a vector of floats where lanes with NaN values are set to all 1s 3605 /// in `reg` and a vector register `dst` with a mix of non-NaN values and 3606 /// possibly non-canonical NaN values, this canonicalize any NaNs in `dst`. canonicalize_nans(&mut self, mask: WritableReg, dst: WritableReg, size: OperandSize)3607 fn canonicalize_nans(&mut self, mask: WritableReg, dst: WritableReg, size: OperandSize) { 3608 // Canonical NaNs do not preserve the sign bit, have the exponent bits 3609 // all set, and have only the high bit of the mantissa set so shift by 3610 // that number. 3611 // The mask we're producing in this step will be inverted in the next 3612 // step. 3613 let amount_to_shift = 1 + size.mantissa_bits() + 1; 3614 self.asm 3615 .xmm_vpsrl_rri(mask.to_reg(), mask, amount_to_shift as u32, size); 3616 // The mask will be inverted by the ANDN so non-NaN values will be all 3617 // 1s and NaN values will set the sign bit, exponent bits, and zero out 3618 // almost all of the mantissa. 3619 self.asm 3620 .xmm_vandnp_rrr(mask.to_reg(), dst.to_reg(), dst, size); 3621 } 3622 } 3623