1;; x86-64 instruction selection and CLIF-to-MachInst lowering. 2 3;; The main lowering constructor term: takes a clif `Inst` and returns the 4;; register(s) within which the lowered instruction's result values live. 5(spec (lower arg) 6 (provide (= result arg))) 7(decl partial lower (Inst) InstOutput) 8 9;; A variant of the main lowering constructor term, used for branches. 10;; The only difference is that it gets an extra argument holding a vector 11;; of branch targets to be used. 12(decl partial lower_branch (Inst MachLabelSlice) Unit) 13 14;;;; Rules for `iconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 15 16;; `i64` and smaller. 17(rule (lower (has_type (fits_in_64 ty) 18 (iconst _ (u64_from_imm64 x)))) 19 (imm ty x)) 20 21;; `i128` 22(rule 1 (lower (has_type $I128 23 (iconst _ (u64_from_imm64 x)))) 24 (value_regs (imm $I64 x) 25 (imm $I64 0))) 26 27;;;; Rules for `f16const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 28 29(rule (lower (f16const _ (u16_from_ieee16 x))) 30 (imm $F16 x)) 31 32;;;; Rules for `f32const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 33 34(rule (lower (f32const _ (u32_from_ieee32 x))) 35 (imm $F32 x)) 36 37;;;; Rules for `f64const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 38 39(rule (lower (f64const _ (u64_from_ieee64 x))) 40 (imm $F64 x)) 41 42;;;; Rules for `f128const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 43(rule (lower (f128const _ const)) 44 ;; TODO use Inst::gen_constant() instead. 45 (x64_xmm_load_const $F128 (const_to_vconst const))) 46 47(rule 1 (lower (f128const _ (u128_from_constant 0))) 48 (xmm_zero $F128)) 49 50;;;; Rules for `iadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 51 52;; `i64` and smaller. 53 54;; Base case for 8 and 16-bit types 55(rule -6 (lower (has_type (fits_in_16 ty) 56 (iadd _ x y))) 57 (x64_add ty x y)) 58 59;; Base case for 32 and 64-bit types which might end up using the `lea` 60;; instruction to fold multiple operations into one. 61;; 62;; Note that at this time this always generates a `lea` pseudo-instruction, 63;; but the actual instruction emitted might be an `add` if it's equivalent. 64;; For more details on this see the `emit.rs` logic to emit 65;; `LoadEffectiveAddress`. 66(rule iadd_base_case_32_or_64_lea -5 (lower (has_type (ty_32_or_64 ty) (iadd _ x y))) 67 (x64_lea ty (to_amode_add (mem_flags_trusted) x y (zero_offset)))) 68 69;; Higher-priority cases than the previous two where a load can be sunk into 70;; the add instruction itself. Note that both operands are tested for 71;; sink-ability since addition is commutative 72(rule -4 (lower (has_type (fits_in_64 ty) 73 (iadd _ x (sinkable_load y)))) 74 (x64_add ty x y)) 75(rule -3 (lower (has_type (fits_in_64 ty) 76 (iadd _ (sinkable_load x) y))) 77 (x64_add ty y x)) 78 79;; SSE. 80 81(rule (lower (has_type (multi_lane 8 16) 82 (iadd _ x y))) 83 (x64_paddb x y)) 84 85(rule (lower (has_type (multi_lane 16 8) 86 (iadd _ x y))) 87 (x64_paddw x y)) 88 89(rule (lower (has_type (multi_lane 32 4) 90 (iadd _ x y))) 91 (x64_paddd x y)) 92 93(rule (lower (has_type (multi_lane 64 2) 94 (iadd _ x y))) 95 (x64_paddq x y)) 96 97;; `i128` 98(rule 1 (lower (has_type $I128 (iadd _ x y))) 99 ;; Get the high/low registers for `x`. 100 (let ((x_regs ValueRegs x) 101 (y_regs ValueRegs y)) 102 (iadd128 103 (value_regs_get_gpr x_regs 0) 104 (value_regs_get_gpr x_regs 1) 105 (value_regs_get_gpr y_regs 0) 106 (value_regs_get_gpr y_regs 1)))) 107(rule 2 (lower (has_type $I128 (iadd _ x (iconcat _ y_lo y_hi)))) 108 (let ((x_regs ValueRegs x)) 109 (iadd128 (value_regs_get_gpr x 0) (value_regs_get_gpr x 1) y_lo y_hi))) 110(rule 3 (lower (has_type $I128 (iadd _ x (uextend _ y @ (value_type $I64))))) 111 (let ((x_regs ValueRegs x)) 112 (iadd128 (value_regs_get_gpr x 0) (value_regs_get_gpr x 1) 113 y (RegMemImm.Imm 0)))) 114 115;; Specialized lowering rule for `iadd` of two 64-bit unsigned integers, meaning 116;; that we can skip the `adc` and instead use `setb`. This is in some sense a 117;; way of modeling `uadd_overflow`. 118(rule 4 (lower (has_type $I128 (iadd _ (uextend _ x) (uextend _ y @ (value_type $I64))))) 119 (let ( 120 (x Gpr (extend_to_gpr x $I64 (ExtendKind.Zero))) 121 (ret ValueRegs (with_flags (x64_add_with_flags_paired $I64 x y) 122 (x64_setcc_paired (CC.B)))) 123 ) 124 ;; FIXME: this `movzx` ideally would happen before the `add` itself to 125 ;; zero out the destination register with `xor %dst,%dst` and then 126 ;; the `setb` would just write to the lower bytes. That would probably 127 ;; require modeling this as a pseudo-inst which isn't quite worth it 128 ;; at this time. 129 (value_regs (value_regs_get ret 0) 130 (x64_movzx (ExtMode.BQ) (value_regs_get ret 1))))) 131 132;; Helper for lowering 128-bit addition with the 64-bit halves of the lhs/rhs 133;; already split. The first two arguments are lo/hi for the lhs and the second 134;; two are lo/hi for the rhs. 135(decl iadd128 (Gpr Gpr GprMemImm GprMemImm) ValueRegs) 136(rule (iadd128 x_lo x_hi y_lo y_hi) 137 (with_flags (x64_add_with_flags_paired $I64 x_lo y_lo) 138 (x64_adc_paired $I64 x_hi y_hi))) 139 140;;;; Helpers for `*_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 141 142(decl construct_overflow_op (CC ProducesFlags) InstOutput) 143(rule (construct_overflow_op cc inst) 144 (let ((results ValueRegs (with_flags inst 145 (x64_setcc_paired cc)))) 146 (output_pair (value_regs_get results 0) 147 (value_regs_get results 1)))) 148 149(decl construct_overflow_op_alu (Type CC ProduceFlagsOp Gpr GprMemImm) InstOutput) 150(rule (construct_overflow_op_alu ty cc op src1 src2) 151 (construct_overflow_op cc (x64_produce_flags op ty src1 src2))) 152 153;; This essentially creates 154;; alu_<op1> x_lo, y_lo 155;; alu_<op2> x_hi, y_hi 156;; set<cc> r8 157(decl construct_overflow_op_alu_128 (CC ProduceFlagsOp ChainFlagsOp Value Value) InstOutput) 158(rule (construct_overflow_op_alu_128 cc op1 op2 x y) 159 ;; Get the high/low registers for `x`. 160 (let ((x_regs ValueRegs x) 161 (x_lo Gpr (value_regs_get_gpr x_regs 0)) 162 (x_hi Gpr (value_regs_get_gpr x_regs 1))) 163 ;; Get the high/low registers for `y`. 164 (let ((y_regs ValueRegs y) 165 (y_lo Gpr (value_regs_get_gpr y_regs 0)) 166 (y_hi Gpr (value_regs_get_gpr y_regs 1))) 167 (let ((lo_inst ProducesFlags (x64_produce_flags op1 $I64 x_lo y_lo)) 168 (hi_inst ConsumesAndProducesFlags (x64_chain_flags op2 $I64 x_hi y_hi)) 169 (of_inst ConsumesFlags (x64_setcc_paired cc)) 170 171 (result MultiReg (with_flags_chained lo_inst hi_inst of_inst))) 172 (multi_reg_to_pair_and_single result))))) 173 174;;;; Rules for `uadd_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 175 176(rule 1 (lower (uadd_overflow _ x y @ (value_type (fits_in_64 ty)))) 177 (construct_overflow_op_alu ty (CC.B) (ProduceFlagsOp.Add) x y)) 178 179;; i128 gets lowered into adc and add 180(rule 0 (lower (uadd_overflow _ x y @ (value_type $I128))) 181 (construct_overflow_op_alu_128 (CC.B) (ProduceFlagsOp.Add) (ChainFlagsOp.Adc) x y)) 182 183;;;; Rules for `sadd_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 184 185(rule 1 (lower (sadd_overflow _ x y @ (value_type (fits_in_64 ty)))) 186 (construct_overflow_op_alu ty (CC.O) (ProduceFlagsOp.Add) x y)) 187 188(rule 0 (lower (sadd_overflow _ x y @ (value_type $I128))) 189 (construct_overflow_op_alu_128 (CC.O) (ProduceFlagsOp.Add) (ChainFlagsOp.Adc) x y)) 190 191;;;; Rules for `usub_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 192 193(rule 1 (lower (usub_overflow _ x y @ (value_type (fits_in_64 ty)))) 194 (construct_overflow_op_alu ty (CC.B) (ProduceFlagsOp.Sub) x y)) 195 196(rule 0 (lower (usub_overflow _ x y @ (value_type $I128))) 197 (construct_overflow_op_alu_128 (CC.B) (ProduceFlagsOp.Sub) (ChainFlagsOp.Sbb) x y)) 198 199;;;; Rules for `ssub_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 200 201(rule 1 (lower (ssub_overflow _ x y @ (value_type (fits_in_64 ty)))) 202 (construct_overflow_op_alu ty (CC.O) (ProduceFlagsOp.Sub) x y)) 203 204(rule 0 (lower (ssub_overflow _ x y @ (value_type $I128))) 205 (construct_overflow_op_alu_128 (CC.O) (ProduceFlagsOp.Sub) (ChainFlagsOp.Sbb) x y)) 206 207;;;; Rules for `umul_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 208 209(rule 2 (lower (umul_overflow _ x y @ (value_type $I8))) 210 (construct_overflow_op (CC.O) (x64_mul8_with_flags_paired false x y))) 211 212(rule 3 (lower (umul_overflow _ x y @ (value_type (ty_int_ref_16_to_64 ty)))) 213 (construct_overflow_op (CC.O) (x64_mul_lo_with_flags_paired ty false x y))) 214 215;;;; Rules for `smul_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 216 217(rule 2 (lower (smul_overflow _ x y @ (value_type $I8))) 218 (construct_overflow_op (CC.O) (x64_mul8_with_flags_paired true x y))) 219 220(rule 3 (lower (smul_overflow _ x y @ (value_type (ty_int_ref_16_to_64 ty)))) 221 (construct_overflow_op (CC.O) (x64_mul_lo_with_flags_paired ty true x y))) 222 223;;;; Rules for `sadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 224 225(rule (lower (has_type (multi_lane 8 16) 226 (sadd_sat _ x y))) 227 (x64_paddsb x y)) 228 229(rule (lower (has_type (multi_lane 16 8) 230 (sadd_sat _ x y))) 231 (x64_paddsw x y)) 232 233;;;; Rules for `uadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 234 235(rule (lower (has_type (multi_lane 8 16) 236 (uadd_sat _ x y))) 237 (x64_paddusb x y)) 238 239(rule (lower (has_type (multi_lane 16 8) 240 (uadd_sat _ x y))) 241 (x64_paddusw x y)) 242 243;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 244 245;; `i64` and smaller. 246 247;; Sub two registers. 248(rule -3 (lower (has_type (fits_in_64 ty) 249 (isub _ x y))) 250 (x64_sub ty x y)) 251 252;; SSE. 253 254(rule (lower (has_type (multi_lane 8 16) 255 (isub _ x y))) 256 (x64_psubb x y)) 257 258(rule (lower (has_type (multi_lane 16 8) 259 (isub _ x y))) 260 (x64_psubw x y)) 261 262(rule (lower (has_type (multi_lane 32 4) 263 (isub _ x y))) 264 (x64_psubd x y)) 265 266(rule (lower (has_type (multi_lane 64 2) 267 (isub _ x y))) 268 (x64_psubq x y)) 269 270;; `i128` 271(rule 1 (lower (has_type $I128 (isub _ x y))) 272 ;; Get the high/low registers for `x`. 273 (let ((x_regs ValueRegs x) 274 (y_regs ValueRegs y)) 275 (isub128 276 (value_regs_get_gpr x_regs 0) 277 (value_regs_get_gpr x_regs 1) 278 (value_regs_get_gpr y_regs 0) 279 (value_regs_get_gpr y_regs 1)))) 280(rule 2 (lower (has_type $I128 (isub _ x (iconcat _ y_lo y_hi)))) 281 (let ((x_regs ValueRegs x)) 282 (isub128 (value_regs_get_gpr x 0) (value_regs_get_gpr x 1) y_lo y_hi))) 283(rule 3 (lower (has_type $I128 (isub _ x (uextend _ y @ (value_type $I64))))) 284 (let ((x_regs ValueRegs x)) 285 (isub128 (value_regs_get_gpr x 0) (value_regs_get_gpr x 1) 286 y (RegMemImm.Imm 0)))) 287 288;; Helper for lowering 128-bit subtraction with the 64-bit halves of the lhs/rhs 289;; already split. The first two arguments are lo/hi for the lhs and the second 290;; two are lo/hi for the rhs. 291(decl isub128 (Gpr Gpr GprMemImm GprMemImm) ValueRegs) 292(rule (isub128 x_lo x_hi y_lo y_hi) 293 (with_flags (x64_sub_with_flags_paired $I64 x_lo y_lo) 294 (x64_sbb_paired $I64 x_hi y_hi))) 295 296;;;; Rules for `ssub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 297 298(rule (lower (has_type (multi_lane 8 16) 299 (ssub_sat _ x y))) 300 (x64_psubsb x y)) 301 302(rule (lower (has_type (multi_lane 16 8) 303 (ssub_sat _ x y))) 304 (x64_psubsw x y)) 305 306;;;; Rules for `usub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 307 308(rule (lower (has_type (multi_lane 8 16) 309 (usub_sat _ x y))) 310 (x64_psubusb x y)) 311 312(rule (lower (has_type (multi_lane 16 8) 313 (usub_sat _ x y))) 314 (x64_psubusw x y)) 315 316;;;; Rules for `band` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 317 318;; `{i,b}64` and smaller. 319 320;; And two registers. 321(rule 0 (lower (has_type ty (band _ x y))) 322 (if (ty_int_ref_scalar_64 ty)) 323 (x64_and ty x y)) 324 325;; The above case automatically handles when the rhs is an immediate or a 326;; sinkable load, but additionally handle the lhs here. 327 328(rule 1 (lower (has_type ty (band _ (sinkable_load x) y))) 329 (if (ty_int_ref_scalar_64 ty)) 330 (x64_and ty y x)) 331 332(rule 2 (lower (has_type ty (band _ (simm32_from_value x) y))) 333 (if (ty_int_ref_scalar_64 ty)) 334 (x64_and ty y x)) 335 336;; f32 and f64 337 338(rule 5 (lower (has_type (ty_scalar_float ty) (band _ x y))) 339 (sse_and ty x y)) 340 341;; SSE. 342 343(decl sse_and (Type Xmm XmmMem) Xmm) 344(rule (sse_and $F32X4 x y) (x64_andps x y)) 345(rule (sse_and $F64X2 x y) (x64_andpd x y)) 346(rule (sse_and $F32 x y) (x64_andps x y)) 347(rule (sse_and $F64 x y) (x64_andpd x y)) 348(rule -1 (sse_and (multi_lane _bits _lanes) x y) (x64_pand x y)) 349 350(rule 6 (lower (has_type ty @ (multi_lane _bits _lanes) 351 (band _ x y))) 352 (sse_and ty x y)) 353 354;; `i128`. 355 356(decl and_i128 (ValueRegs ValueRegs) ValueRegs) 357(rule (and_i128 x y) 358 (let ((x_regs ValueRegs x) 359 (x_lo Gpr (value_regs_get_gpr x_regs 0)) 360 (x_hi Gpr (value_regs_get_gpr x_regs 1)) 361 (y_regs ValueRegs y) 362 (y_lo Gpr (value_regs_get_gpr y_regs 0)) 363 (y_hi Gpr (value_regs_get_gpr y_regs 1))) 364 (value_gprs (x64_and $I64 x_lo y_lo) 365 (x64_and $I64 x_hi y_hi)))) 366 367(rule 7 (lower (has_type $I128 (band _ x y))) 368 (and_i128 x y)) 369 370;; Specialized lowerings for `(band x (bnot y))` which is additionally produced 371;; by Cranelift's `band_not` instruction that is legalized into the simpler 372;; forms early on. 373 374(decl sse_and_not (Type Xmm XmmMem) Xmm) 375(rule (sse_and_not $F32X4 x y) (x64_andnps x y)) 376(rule (sse_and_not $F64X2 x y) (x64_andnpd x y)) 377(rule -1 (sse_and_not (multi_lane _bits _lanes) x y) (x64_pandn x y)) 378 379;; Note the flipping of operands below as we're match 380;; 381;; (band x (bnot y)) 382;; 383;; while x86 does 384;; 385;; pandn(x, y) = and(not(x), y) 386(rule 8 (lower (has_type ty @ (multi_lane _bits _lane) (band _ x (bnot _ y)))) 387 (sse_and_not ty y x)) 388(rule 9 (lower (has_type ty @ (multi_lane _bits _lane) (band _ (bnot _ y) x))) 389 (sse_and_not ty y x)) 390 391(rule 10 (lower (has_type ty (band _ x (bnot _ y)))) 392 (if (ty_int_ref_scalar_64 ty)) 393 (if-let true (has_bmi1)) 394 ;; the first argument is the one that gets inverted with andn 395 (x64_andn ty y x)) 396(rule 11 (lower (has_type ty (band _ (bnot _ y) x))) 397 (if (ty_int_ref_scalar_64 ty)) 398 (if-let true (has_bmi1)) 399 (x64_andn ty y x)) 400 401;; Specialization of `blsr` for BMI1 402 403(decl pure partial val_minus_one (Value) Value) 404(rule 0 (val_minus_one (isub _ x (u64_from_iconst 1))) x) 405(rule 0 (val_minus_one (iadd _ x (i64_from_iconst -1))) x) 406(rule 1 (val_minus_one (iadd _ (i64_from_iconst -1) x)) x) 407 408(rule 12 (lower (has_type (ty_32_or_64 ty) (band _ x y))) 409 (if-let true (has_bmi1)) 410 (if-let x (val_minus_one y)) 411 (x64_blsr ty x)) 412(rule 13 (lower (has_type (ty_32_or_64 ty) (band _ y x))) 413 (if-let true (has_bmi1)) 414 (if-let x (val_minus_one y)) 415 (x64_blsr ty x)) 416 417;; Specialization of `blsi` for BMI1 418 419(rule 14 (lower (has_type (ty_32_or_64 ty) (band _ (ineg _ x) x))) 420 (if-let true (has_bmi1)) 421 (x64_blsi ty x)) 422(rule 15 (lower (has_type (ty_32_or_64 ty) (band _ x (ineg _ x)))) 423 (if-let true (has_bmi1)) 424 (x64_blsi ty x)) 425 426;; Specialization of `bzhi` for BMI2 427;; 428;; The `bzhi` instruction clears all bits indexed by the second operand of the 429;; first operand. This is pattern-matched here with a `band` against a mask 430;; which is generated to be N bits large. Note that if the index is larger than 431;; the bit-width of the type then `bzhi` doesn't have the same semantics as 432;; `ishl`, so an `and` instruction is required to mask the index to match the 433;; semantics of Cranelift's `ishl`. 434 435(rule 16 (lower (has_type (ty_32_or_64 ty) (band _ x y))) 436 (if-let true (has_bmi2)) 437 (if-let (ishl _ (u64_from_iconst 1) index) (val_minus_one y)) 438 (x64_bzhi ty x (x64_and ty index (RegMemImm.Imm (u32_wrapping_sub (ty_bits ty) 1))))) 439 440;;;; Rules for `bor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 441 442;; `{i,b}64` and smaller. 443 444;; Or two registers. 445(rule 0 (lower (has_type ty (bor _ x y))) 446 (if (ty_int_ref_scalar_64 ty)) 447 (x64_or ty x y)) 448 449;; Handle immediates/sinkable loads on the lhs in addition to the automatic 450;; handling of the rhs above 451 452(rule 1 (lower (has_type ty (bor _ (sinkable_load x) y))) 453 (if (ty_int_ref_scalar_64 ty)) 454 (x64_or ty y x)) 455 456(rule 2 (lower (has_type ty (bor _ (simm32_from_value x) y))) 457 (if (ty_int_ref_scalar_64 ty)) 458 (x64_or ty y x)) 459 460;; f32 and f64 461 462(rule 5 (lower (has_type (ty_scalar_float ty) (bor _ x y))) 463 (sse_or ty x y)) 464 465;; SSE. 466 467(decl sse_or (Type Xmm XmmMem) Xmm) 468(rule (sse_or $F32X4 x y) (x64_orps x y)) 469(rule (sse_or $F64X2 x y) (x64_orpd x y)) 470(rule (sse_or $F32 x y) (x64_orps x y)) 471(rule (sse_or $F64 x y) (x64_orpd x y)) 472(rule -1 (sse_or (multi_lane _bits _lanes) x y) (x64_por x y)) 473 474(rule 6 (lower (has_type ty @ (multi_lane _bits _lanes) 475 (bor _ x y))) 476 (sse_or ty x y)) 477 478;; `{i,b}128`. 479 480(decl or_i128 (ValueRegs ValueRegs) ValueRegs) 481(rule (or_i128 x y) 482 (let ((x_lo Gpr (value_regs_get_gpr x 0)) 483 (x_hi Gpr (value_regs_get_gpr x 1)) 484 (y_lo Gpr (value_regs_get_gpr y 0)) 485 (y_hi Gpr (value_regs_get_gpr y 1))) 486 (value_gprs (x64_or $I64 x_lo y_lo) 487 (x64_or $I64 x_hi y_hi)))) 488 489(rule 7 (lower (has_type $I128 (bor _ x y))) 490 (or_i128 x y)) 491 492;; Specialized lowerings to generate the `shld` instruction. 493;; 494;; The `shld` instruction will shift a value left and shift-in bits from a 495;; different register. Pattern-match doing this with bit-ops and shifts to 496;; generate a `shld` instruction. 497(rule 8 (lower (has_type (ty_int_ref_16_to_64 ty) 498 (bor _ (ishl _ x (u8_from_iconst xs)) (ushr _ y (u8_from_iconst ys))))) 499 (if-let true (u64_eq (ty_bits ty) (u64_wrapping_add xs ys))) 500 (if-let true (u64_gt xs 0)) 501 (if-let true (u64_gt ys 0)) 502 (x64_shld ty x y xs)) 503(rule 8 (lower (has_type (ty_int_ref_16_to_64 ty) 504 (bor _ (ushr _ y (u8_from_iconst ys)) (ishl _ x (u8_from_iconst xs))))) 505 (if-let true (u64_eq (ty_bits ty) (u64_wrapping_add xs ys))) 506 (if-let true (u64_gt xs 0)) 507 (if-let true (u64_gt ys 0)) 508 (x64_shld ty x y xs)) 509 510 511;;;; Rules for `bxor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 512 513;; `{i,b}64` and smaller. 514 515;; Xor two registers. 516(rule 0 (lower (has_type ty (bxor _ x y))) 517 (if (ty_int_ref_scalar_64 ty)) 518 (x64_xor ty x y)) 519 520;; Handle xor with lhs immediates/sinkable loads in addition to the automatic 521;; handling of the rhs above. 522 523(rule 1 (lower (has_type ty (bxor _ (sinkable_load x) y))) 524 (if (ty_int_ref_scalar_64 ty)) 525 (x64_xor ty y x)) 526 527(rule 4 (lower (has_type ty (bxor _ (simm32_from_value x) y))) 528 (if (ty_int_ref_scalar_64 ty)) 529 (x64_xor ty y x)) 530 531;; f32 and f64 532 533(rule 5 (lower (has_type (ty_scalar_float ty) (bxor _ x y))) 534 (x64_xor_vector ty x y)) 535 536;; SSE. 537 538(rule 6 (lower (has_type ty @ (multi_lane _bits _lanes) (bxor _ x y))) 539 (x64_xor_vector ty x y)) 540 541;; `{i,b}128`. 542 543(rule 7 (lower (has_type $I128 (bxor _ x y))) 544 (let ((x_regs ValueRegs x) 545 (x_lo Gpr (value_regs_get_gpr x_regs 0)) 546 (x_hi Gpr (value_regs_get_gpr x_regs 1)) 547 (y_regs ValueRegs y) 548 (y_lo Gpr (value_regs_get_gpr y_regs 0)) 549 (y_hi Gpr (value_regs_get_gpr y_regs 1))) 550 (value_gprs (x64_xor $I64 x_lo y_lo) 551 (x64_xor $I64 x_hi y_hi)))) 552 553;; Specialization of `blsmsk` for BMI1 554 555(rule 8 (lower (has_type (ty_32_or_64 ty) (bxor _ x y))) 556 (if-let true (has_bmi1)) 557 (if-let x (val_minus_one y)) 558 (x64_blsmsk ty x)) 559(rule 9 (lower (has_type (ty_32_or_64 ty) (bxor _ y x))) 560 (if-let true (has_bmi1)) 561 (if-let x (val_minus_one y)) 562 (x64_blsmsk ty x)) 563 564;;;; Rules for `ishl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 565 566;; `i64` and smaller. 567 568(rule -1 (lower (has_type (fits_in_64 ty) (ishl _ src amt))) 569 (x64_shl ty src (put_masked_in_imm8_gpr amt ty))) 570 571;; `i128`. 572 573(decl shl_i128 (ValueRegs Gpr) ValueRegs) 574(rule (shl_i128 src amt) 575 ;; Unpack the registers that make up the 128-bit value being shifted. 576 (let ((src_lo Gpr (value_regs_get_gpr src 0)) 577 (src_hi Gpr (value_regs_get_gpr src 1)) 578 ;; Do two 64-bit shifts. 579 (lo_shifted Gpr (x64_shl $I64 src_lo amt)) 580 (hi_shifted Gpr (x64_shl $I64 src_hi amt)) 581 ;; `src_lo >> (64 - amt)` are the bits to carry over from the lo 582 ;; into the hi. 583 (carry Gpr (x64_shr $I64 584 src_lo 585 (x64_sub $I64 586 (imm $I64 64) 587 amt))) 588 (zero Gpr (imm $I64 0)) 589 ;; Nullify the carry if we are shifting in by a multiple of 128. 590 (carry_ Gpr (with_flags_reg (x64_testq_mi amt 127) 591 (cmove $I64 592 (CC.Z) 593 zero 594 carry))) 595 ;; Add the carry into the high half. 596 (hi_shifted_ Gpr (x64_or $I64 carry_ hi_shifted))) 597 ;; Combine the two shifted halves. However, if we are shifting by >= 64 598 ;; (modulo 128), then the low bits are zero and the high bits are our 599 ;; low bits. 600 (with_flags (x64_testq_mi amt 64) 601 (consumes_flags_concat 602 (cmove $I64 (CC.Z) lo_shifted zero) 603 (cmove $I64 (CC.Z) hi_shifted_ lo_shifted))))) 604 605(rule (lower (has_type $I128 (ishl _ src amt))) 606 ;; NB: Only the low bits of `amt` matter since we logically mask the shift 607 ;; amount to the value's bit width. 608 (let ((amt_ Gpr (lo_gpr amt))) 609 (shl_i128 src amt_))) 610 611;; SSE. 612 613;; Since the x86 instruction set does not have any 8x16 shift instructions (even 614;; in higher feature sets like AVX), we lower the `ishl.i8x16` to a sequence of 615;; instructions. The basic idea, whether the amount to shift by is an immediate 616;; or not, is to use a 16x8 shift and then mask off the incorrect bits to 0s. 617(rule (lower (has_type ty @ $I8X16 (ishl _ src amt))) 618 (let ( 619 ;; Mask the amount to ensure wrapping behaviour 620 (masked_amt RegMemImm (mask_xmm_shift ty amt)) 621 ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be 622 ;; correct for half of the lanes; the others must be fixed up with 623 ;; the mask below. 624 (unmasked Xmm (x64_psllw src (mov_rmi_to_xmm masked_amt))) 625 (mask_addr SyntheticAmode (ishl_i8x16_mask masked_amt)) 626 (mask Reg (x64_movdqu_load mask_addr))) 627 (sse_and $I8X16 unmasked (RegMem.Reg mask)))) 628 629;; Get the address of the mask to use when fixing up the lanes that weren't 630;; correctly generated by the 16x8 shift. 631;; 632;; Recursion: at most once to convert memory case into register case. 633(decl rec ishl_i8x16_mask (RegMemImm) SyntheticAmode) 634 635;; When the shift amount is known, we can statically (i.e. at compile time) 636;; determine the mask to use and only emit that. 637(decl ishl_i8x16_mask_for_const (u32) SyntheticAmode) 638(extern constructor ishl_i8x16_mask_for_const ishl_i8x16_mask_for_const) 639(rule (ishl_i8x16_mask (RegMemImm.Imm amt)) 640 (ishl_i8x16_mask_for_const amt)) 641 642;; Otherwise, we must emit the entire mask table and dynamically (i.e. at run 643;; time) find the correct mask offset in the table. We use `lea` to find the 644;; base address of the mask table and then complex addressing to offset to the 645;; right mask: `base_address + amt << 4` 646(decl ishl_i8x16_mask_table () SyntheticAmode) 647(extern constructor ishl_i8x16_mask_table ishl_i8x16_mask_table) 648(rule (ishl_i8x16_mask (RegMemImm.Reg amt)) 649 (let ((mask_table SyntheticAmode (ishl_i8x16_mask_table)) 650 (base_mask_addr Gpr (x64_leaq_rm mask_table)) 651 (mask_offset Gpr (x64_shlq_mi amt 4))) 652 (Amode.ImmRegRegShift 0 653 base_mask_addr 654 mask_offset 655 0 656 (mem_flags_trusted)))) 657 658(rule (ishl_i8x16_mask (RegMemImm.Mem amt)) 659 (ishl_i8x16_mask (RegMemImm.Reg (x64_movq_rm amt)))) 660 661;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the shift amount is masked. 662 663(rule (lower (has_type ty @ $I16X8 (ishl _ src amt))) 664 (x64_psllw src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) 665 666(rule (lower (has_type ty @ $I32X4 (ishl _ src amt))) 667 (x64_pslld src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) 668 669(rule (lower (has_type ty @ $I64X2 (ishl _ src amt))) 670 (x64_psllq src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) 671 672;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 673 674;; `i64` and smaller. 675 676(rule -1 (lower (has_type (fits_in_64 ty) (ushr _ src amt))) 677 (let ((src_ Gpr (extend_to_gpr src ty (ExtendKind.Zero)))) 678 (x64_shr ty src_ (put_masked_in_imm8_gpr amt ty)))) 679 680;; `i128`. 681 682(decl shr_i128 (ValueRegs Gpr) ValueRegs) 683(rule (shr_i128 src amt) 684 ;; Unpack the lo/hi halves of `src`. 685 (let ((src_lo Gpr (value_regs_get_gpr src 0)) 686 (src_hi Gpr (value_regs_get_gpr src 1)) 687 ;; Do a shift on each half. 688 (lo_shifted Gpr (x64_shr $I64 src_lo amt)) 689 (hi_shifted Gpr (x64_shr $I64 src_hi amt)) 690 ;; `src_hi << (64 - amt)` are the bits to carry over from the hi 691 ;; into the lo. 692 (carry Gpr (x64_shl $I64 693 src_hi 694 (x64_sub $I64 695 (imm $I64 64) 696 amt))) 697 ;; Share the zero value to reduce register pressure 698 (zero Gpr (imm $I64 0)) 699 700 ;; Nullify the carry if we are shifting by a multiple of 128. 701 (carry_ Gpr (with_flags_reg (x64_testq_mi amt 127) 702 (cmove $I64 (CC.Z) zero carry))) 703 ;; Add the carry bits into the lo. 704 (lo_shifted_ Gpr (x64_or $I64 carry_ lo_shifted))) 705 ;; Combine the two shifted halves. However, if we are shifting by >= 64 706 ;; (modulo 128), then the hi bits are zero and the lo bits are what 707 ;; would otherwise be our hi bits. 708 (with_flags (x64_testq_mi amt 64) 709 (consumes_flags_concat 710 (cmove $I64 (CC.Z) lo_shifted_ hi_shifted) 711 (cmove $I64 (CC.Z) hi_shifted zero))))) 712 713(rule (lower (has_type $I128 (ushr _ src amt))) 714 ;; NB: Only the low bits of `amt` matter since we logically mask the shift 715 ;; amount to the value's bit width. 716 (let ((amt_ Gpr (lo_gpr amt))) 717 (shr_i128 src amt_))) 718 719;; SSE. 720 721;; There are no 8x16 shifts in x64. Do the same 16x8-shift-and-mask thing we do 722;; with 8x16 `ishl`. 723(rule (lower (has_type ty @ $I8X16 (ushr _ src amt))) 724 (let ( 725 ;; Mask the amount to ensure wrapping behaviour 726 (masked_amt RegMemImm (mask_xmm_shift ty amt)) 727 ;; Shift `src` using 16x8. Unfortunately, a 16x8 shift will only be 728 ;; correct for half of the lanes; the others must be fixed up with 729 ;; the mask below. 730 (unmasked Xmm (x64_psrlw src (mov_rmi_to_xmm masked_amt)))) 731 (sse_and $I8X16 732 unmasked 733 (ushr_i8x16_mask masked_amt)))) 734 735;; Get the address of the mask to use when fixing up the lanes that weren't 736;; correctly generated by the 16x8 shift. 737;; 738;; Recursion: at most once to convert memory case into register case. 739(decl rec ushr_i8x16_mask (RegMemImm) SyntheticAmode) 740 741;; When the shift amount is known, we can statically (i.e. at compile time) 742;; determine the mask to use and only emit that. 743(decl ushr_i8x16_mask_for_const (u32) SyntheticAmode) 744(extern constructor ushr_i8x16_mask_for_const ushr_i8x16_mask_for_const) 745(rule (ushr_i8x16_mask (RegMemImm.Imm amt)) 746 (ushr_i8x16_mask_for_const amt)) 747 748;; Otherwise, we must emit the entire mask table and dynamically (i.e. at run 749;; time) find the correct mask offset in the table. We use `lea` to find the 750;; base address of the mask table and then complex addressing to offset to the 751;; right mask: `base_address + amt << 4` 752(decl ushr_i8x16_mask_table () SyntheticAmode) 753(extern constructor ushr_i8x16_mask_table ushr_i8x16_mask_table) 754(rule (ushr_i8x16_mask (RegMemImm.Reg amt)) 755 (let ((mask_table SyntheticAmode (ushr_i8x16_mask_table)) 756 (base_mask_addr Gpr (x64_leaq_rm mask_table)) 757 (mask_offset Gpr (x64_shlq_mi amt 4))) 758 (Amode.ImmRegRegShift 0 759 base_mask_addr 760 mask_offset 761 0 762 (mem_flags_trusted)))) 763 764(rule (ushr_i8x16_mask (RegMemImm.Mem amt)) 765 (ushr_i8x16_mask (RegMemImm.Reg (x64_movq_rm amt)))) 766 767;; 16x8, 32x4, and 64x2 shifts can each use a single instruction, once the shift amount is masked. 768 769(rule (lower (has_type ty @ $I16X8 (ushr _ src amt))) 770 (x64_psrlw src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) 771 772(rule (lower (has_type ty @ $I32X4 (ushr _ src amt))) 773 (x64_psrld src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) 774 775(rule (lower (has_type ty @ $I64X2 (ushr _ src amt))) 776 (x64_psrlq src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) 777 778(decl mask_xmm_shift (Type Value) RegMemImm) 779(rule (mask_xmm_shift ty amt) 780 (gpr_to_reg (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) 781(rule 1 (mask_xmm_shift ty (iconst _ n)) 782 (RegMemImm.Imm (shift_amount_masked ty n))) 783 784;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 785 786;; `i64` and smaller. 787 788(rule -1 (lower (has_type (fits_in_64 ty) (sshr _ src amt))) 789 (let ((src_ Gpr (extend_to_gpr src ty (ExtendKind.Sign)))) 790 (x64_sar ty src_ (put_masked_in_imm8_gpr amt ty)))) 791 792;; `i128`. 793 794(decl sar_i128 (ValueRegs Gpr) ValueRegs) 795(rule (sar_i128 src amt) 796 ;; Unpack the low/high halves of `src`. 797 (let ((src_lo Gpr (value_regs_get_gpr src 0)) 798 (src_hi Gpr (value_regs_get_gpr src 1)) 799 ;; Do a shift of each half. NB: the low half uses an unsigned shift 800 ;; because its MSB is not a sign bit. 801 (lo_shifted Gpr (x64_shr $I64 src_lo amt)) 802 (hi_shifted Gpr (x64_sar $I64 src_hi amt)) 803 ;; `src_hi << (64 - amt)` are the bits to carry over from the low 804 ;; half to the high half. 805 (carry Gpr (x64_shl $I64 806 src_hi 807 (x64_sub $I64 808 (imm $I64 64) 809 amt))) 810 ;; Nullify the carry if we are shifting by a multiple of 128. 811 (carry_ Gpr (with_flags_reg (x64_testq_mi amt 127) 812 (cmove $I64 (CC.Z) (imm $I64 0) carry))) 813 ;; Add the carry into the low half. 814 (lo_shifted_ Gpr (x64_or $I64 lo_shifted carry_)) 815 ;; Get all sign bits. 816 (sign_bits Gpr (x64_sarq_mi src_hi 63))) 817 ;; Combine the two shifted halves. However, if we are shifting by >= 64 818 ;; (modulo 128), then the hi bits are all sign bits and the lo bits are 819 ;; what would otherwise be our hi bits. 820 (with_flags (x64_testq_mi amt 64) 821 (consumes_flags_concat 822 (cmove $I64 (CC.Z) lo_shifted_ hi_shifted) 823 (cmove $I64 (CC.Z) hi_shifted sign_bits))))) 824 825(rule (lower (has_type $I128 (sshr _ src amt))) 826 ;; NB: Only the low bits of `amt` matter since we logically mask the shift 827 ;; amount to the value's bit width. 828 (let ((amt_ Gpr (lo_gpr amt))) 829 (sar_i128 src amt_))) 830 831;; SSE. 832 833;; Since the x86 instruction set does not have an 8x16 shift instruction and the 834;; approach used for `ishl` and `ushr` cannot be easily used (the masks do not 835;; preserve the sign), we use a different approach here: separate the low and 836;; high lanes, shift them separately, and merge them into the final result. 837;; 838;; Visually, this looks like the following, where `src.i8x16 = [s0, s1, ..., 839;; s15]: 840;; 841;; lo.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)] 842;; shifted_lo.i16x8 = shift each lane of `low` 843;; hi.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)] 844;; shifted_hi.i16x8 = shift each lane of `high` 845;; result = [s0'', s1'', ..., s15''] 846(rule (lower (has_type ty @ $I8X16 (sshr _ src amt @ (value_type amt_ty)))) 847 (let ((src_ Xmm (put_in_xmm src)) 848 ;; Mask the amount to ensure wrapping behaviour 849 (masked_amt RegMemImm (mask_xmm_shift ty amt)) 850 ;; In order for `packsswb` later to only use the high byte of each 851 ;; 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to 852 ;; fill in the upper bits appropriately. 853 (lo Xmm (x64_punpcklbw src_ src_)) 854 (hi Xmm (x64_punpckhbw src_ src_)) 855 (amt_ XmmMemImm (sshr_i8x16_bigger_shift amt_ty masked_amt)) 856 (shifted_lo Xmm (x64_psraw lo amt_)) 857 (shifted_hi Xmm (x64_psraw hi amt_))) 858 (x64_packsswb shifted_lo shifted_hi))) 859 860(decl sshr_i8x16_bigger_shift (Type RegMemImm) XmmMemImm) 861(rule (sshr_i8x16_bigger_shift _ty (RegMemImm.Imm i)) 862 (xmm_mem_imm_new (RegMemImm.Imm (u32_wrapping_add i 8)))) 863(rule (sshr_i8x16_bigger_shift ty (RegMemImm.Reg r)) 864 (mov_rmi_to_xmm (RegMemImm.Reg (x64_add ty 865 r 866 (RegMemImm.Imm 8))))) 867(rule (sshr_i8x16_bigger_shift ty rmi @ (RegMemImm.Mem _m)) 868 (mov_rmi_to_xmm (RegMemImm.Reg (x64_add ty 869 (imm ty 8) 870 rmi)))) 871 872;; `sshr.{i16x8,i32x4}` can be a simple `psra{w,d}`, we just have to make sure 873;; that if the shift amount is in a register, it is in an XMM register. 874 875(rule (lower (has_type ty @ $I16X8 (sshr _ src amt))) 876 (x64_psraw src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) 877 878(rule (lower (has_type ty @ $I32X4 (sshr _ src amt))) 879 (x64_psrad src (mov_rmi_to_xmm (mask_xmm_shift ty amt)))) 880 881;; The `sshr.i64x2` CLIF instruction has no single x86 instruction in the older 882;; feature sets. To remedy this, a small dance is done with an unsigned right 883;; shift plus some extra ops. 884(rule 3 (lower (has_type ty @ $I64X2 (sshr _ src (iconst _ n)))) 885 (if-let true (has_avx512vl)) 886 (if-let true (has_avx512f)) 887 (x64_vpsraq_imm src (shift_amount_masked ty n))) 888 889(rule 2 (lower (has_type ty @ $I64X2 (sshr _ src amt))) 890 (if-let true (has_avx512vl)) 891 (if-let true (has_avx512f)) 892 (let ((masked Gpr (x64_and $I64 amt (RegMemImm.Imm (shift_mask ty))))) 893 (x64_vpsraq src (x64_movd_to_xmm masked)))) 894 895(rule 1 (lower (has_type $I64X2 (sshr _ src (u32_from_iconst amt)))) 896 (lower_i64x2_sshr_imm src (u32_and amt 63))) 897 898(rule (lower (has_type $I64X2 (sshr _ src amt))) 899 (lower_i64x2_sshr_gpr src (x64_and $I64 amt (RegMemImm.Imm 63)))) 900 901(decl lower_i64x2_sshr_imm (Xmm u32) Xmm) 902 903;; If the shift amount is less than 32 then do an sshr with 32-bit lanes to 904;; produce the upper halves of each result, followed by a ushr of 64-bit lanes 905;; to produce the lower halves of each result. Interleave results at the end. 906(rule 2 (lower_i64x2_sshr_imm vec imm) 907 (if-let true (u64_lt imm 32)) 908 (let ( 909 (high32 Xmm (x64_psrad vec (xmi_imm imm))) 910 (high32 Xmm (x64_pshufd high32 0b11_10_11_01)) 911 (low32 Xmm (x64_psrlq vec (xmi_imm imm))) 912 (low32 Xmm (x64_pshufd low32 0b11_10_10_00)) 913 ) 914 (x64_punpckldq low32 high32))) 915 916;; If the shift amount is 32 then the `psrlq` from the above rule can be avoided 917(rule 1 (lower_i64x2_sshr_imm vec 32) 918 (let ( 919 (low32 Xmm (x64_pshufd vec 0b11_10_11_01)) 920 (high32 Xmm (x64_psrad vec (xmi_imm 31))) 921 (high32 Xmm (x64_pshufd high32 0b11_10_11_01)) 922 ) 923 (x64_punpckldq low32 high32))) 924 925;; Shifts >= 32 use one `psrad` to generate the upper bits and second `psrad` to 926;; generate the lower bits. Everything is then woven back together with 927;; shuffles. 928(rule (lower_i64x2_sshr_imm vec imm) 929 (if-let true (u64_lt 32 imm)) 930 (let ( 931 (high32 Xmm (x64_psrad vec (xmi_imm 31))) 932 (high32 Xmm (x64_pshufd high32 0b11_10_11_01)) 933 (low32 Xmm (x64_psrad vec (xmi_imm (u32_wrapping_sub imm 32)))) 934 (low32 Xmm (x64_pshufd low32 0b11_10_11_01)) 935 ) 936 (x64_punpckldq low32 high32))) 937 938;; A variable shift amount is slightly more complicated than the immediate 939;; shift amounts from above. The `Gpr` argument is guaranteed to be <= 63 by 940;; earlier masking. A `ushr` operation is used with some xor/sub math to 941;; generate the sign bits. 942(decl lower_i64x2_sshr_gpr (Xmm Gpr) Xmm) 943(rule (lower_i64x2_sshr_gpr vec val) 944 (let ( 945 (val Xmm (x64_movq_to_xmm val)) 946 (mask Xmm (flip_high_bit_mask $I64X2)) 947 (sign_bit_loc Xmm (x64_psrlq mask val)) 948 (ushr Xmm (x64_psrlq vec val)) 949 (ushr_sign_bit_flip Xmm (x64_pxor sign_bit_loc ushr)) 950 ) 951 (x64_psubq ushr_sign_bit_flip sign_bit_loc))) 952 953;;;; Rules for `rotl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 954 955;; `i64` and smaller: we can rely on x86's rotate-amount masking since 956;; we operate on the whole register. For const's we mask the constant. 957 958(rule -1 (lower (has_type (fits_in_64 ty) (rotl _ src amt))) 959 (x64_rotl ty src (put_masked_in_imm8_gpr amt ty))) 960 961 962;; `i128`. 963 964(rule (lower (has_type $I128 (rotl _ src amt))) 965 (let ((src_ ValueRegs src) 966 ;; NB: Only the low bits of `amt` matter since we logically mask the 967 ;; rotation amount to the value's bit width. 968 (amt_ Gpr (lo_gpr amt))) 969 (or_i128 (shl_i128 src_ amt_) 970 (shr_i128 src_ (x64_sub $I64 971 (imm $I64 128) 972 amt_))))) 973 974;;;; Rules for `rotr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 975 976;; `i64` and smaller: we can rely on x86's rotate-amount masking since 977;; we operate on the whole register. For const's we mask the constant. 978 979(rule -1 (lower (has_type (fits_in_64 ty) (rotr _ src amt))) 980 (x64_rotr ty src (put_masked_in_imm8_gpr amt ty))) 981 982 983;; `i128`. 984 985(rule (lower (has_type $I128 (rotr _ src amt))) 986 (let ((src_ ValueRegs src) 987 ;; NB: Only the low bits of `amt` matter since we logically mask the 988 ;; rotation amount to the value's bit width. 989 (amt_ Gpr (lo_gpr amt))) 990 (or_i128 (shr_i128 src_ amt_) 991 (shl_i128 src_ (x64_sub $I64 992 (imm $I64 128) 993 amt_))))) 994 995;;;; Rules for `ineg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 996 997;; `i64` and smaller. 998 999(rule -1 (lower (has_type (fits_in_64 ty) (ineg _ x))) 1000 (x64_neg ty x)) 1001 1002(rule -2 (lower (has_type $I128 (ineg _ x))) 1003 ;; Get the high/low registers for `x`. 1004 (let ((regs ValueRegs x) 1005 (lo Gpr (value_regs_get_gpr regs 0)) 1006 (hi Gpr (value_regs_get_gpr regs 1))) 1007 ;; Do a neg followed by an sub-with-borrow. 1008 (with_flags (x64_neg_paired $I64 lo) 1009 (x64_sbb_paired $I64 (imm $I64 0) hi)))) 1010 1011;; SSE. 1012 1013(rule (lower (has_type $I8X16 (ineg _ x))) 1014 (x64_psubb (imm $I8X16 0) x)) 1015 1016(rule (lower (has_type $I16X8 (ineg _ x))) 1017 (x64_psubw (imm $I16X8 0) x)) 1018 1019(rule (lower (has_type $I32X4 (ineg _ x))) 1020 (x64_psubd (imm $I32X4 0) x)) 1021 1022(rule (lower (has_type $I64X2 (ineg _ x))) 1023 (x64_psubq (imm $I64X2 0) x)) 1024 1025;;;; Rules for `avg_round` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1026 1027(rule (lower (has_type (multi_lane 8 16) 1028 (avg_round _ x y))) 1029 (x64_pavgb x y)) 1030 1031(rule (lower (has_type (multi_lane 16 8) 1032 (avg_round _ x y))) 1033 (x64_pavgw x y)) 1034 1035;;;; Rules for `imul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1036 1037;; `i64` and smaller. 1038 1039;; 8-bit base case, needs a special instruction encoding and additionally 1040;; move sinkable loads to the right. 1041(rule -8 (lower (has_type $I8 (imul _ x y))) (x64_mul8 false x y)) 1042(rule -7 (lower (has_type $I8 (imul _ (sinkable_load x) y))) (x64_mul8 false y x)) 1043 1044;; 16-to-64-bit base cases, same as above by moving sinkable loads to the right. 1045(rule -6 (lower (has_type (ty_int_ref_16_to_64 ty) (imul _ x y))) 1046 (x64_imul ty x y)) 1047(rule -5 (lower (has_type (ty_int_ref_16_to_64 ty) (imul _ (sinkable_load x) y))) 1048 (x64_imul ty y x)) 1049 1050;; Lift out constants to use 3-operand form. 1051(rule -4 (lower (has_type (ty_int_ref_16_to_64 ty) (imul _ x (i32_from_iconst y)))) 1052 (x64_imul_imm ty x y)) 1053(rule -3 (lower (has_type (ty_int_ref_16_to_64 ty) (imul _ (i32_from_iconst x) y))) 1054 (x64_imul_imm ty y x)) 1055 1056;; Special case widening multiplication from 8-to-16-bits with a single 1057;; instruction since the 8-bit-multiply places both the high and low halves in 1058;; the same register 1059(rule -2 (lower (has_type $I16 (imul _ (sextend _ x) (sextend _ y)))) 1060 (x64_mul8 true x y)) 1061(rule -2 (lower (has_type $I16 (imul _ (uextend _ x) (uextend _ y)))) 1062 (x64_mul8 false x y)) 1063 1064;; `i128`. 1065 1066(rule 2 (lower (has_type $I128 (imul _ x y))) 1067 (let ((x_regs ValueRegs x) 1068 (y_regs ValueRegs y)) 1069 (imul128 1070 (value_regs_get_gpr x_regs 0) 1071 (value_regs_get_gpr x_regs 1) 1072 (value_regs_get_gpr y_regs 0) 1073 (value_regs_get_gpr y_regs 1)))) 1074 1075(rule 4 (lower (has_type $I128 (imul _ (iconcat _ x_lo x_hi) (iconcat _ y_lo y_hi)))) 1076 (imul128 x_lo x_hi y_lo y_hi)) 1077 1078;; Helper for lowering 128-bit multiplication with the 64-bit halves of the 1079;; lhs/rhs already split. The first two arguments are lo/hi for the lhs and the 1080;; second two are lo/hi for the rhs. 1081;; 1082;; mul: 1083;; dst_lo = lhs_lo * rhs_lo 1084;; dst_hi = umulhi(lhs_lo, rhs_lo) + 1085;; lhs_lo * rhs_hi + 1086;; lhs_hi * rhs_lo 1087;; 1088;; so we emit: 1089;; lo_hi = mul x_lo, y_hi 1090;; hi_lo = mul x_hi, y_lo 1091;; hilo_hilo = add lo_hi, hi_lo 1092;; dst_lo:hi_lolo = mulhi_u x_lo, y_lo 1093;; dst_hi = add hilo_hilo, hi_lolo 1094;; return (dst_lo, dst_hi) 1095(decl imul128 (Gpr Gpr GprMem GprMem) ValueRegs) 1096(rule (imul128 x_lo x_hi y_lo y_hi) 1097 ;; Put `x` into registers and unpack its hi/lo halves. 1098 (let ( 1099 ;; lo_hi = mul x_lo, y_hi 1100 (lo_hi Gpr (x64_imul $I64 x_lo y_hi)) 1101 ;; hi_lo = mul x_hi, y_lo 1102 (hi_lo Gpr (x64_imul $I64 x_hi y_lo)) 1103 ;; hilo_hilo = add lo_hi, hi_lo 1104 (hilo_hilo Gpr (x64_add $I64 lo_hi hi_lo)) 1105 ;; dst_lo:hi_lolo = x64_mul x_lo, y_lo 1106 (mul_regs ValueRegs (x64_mul $I64 false x_lo y_lo)) 1107 (dst_lo Gpr (value_regs_get_gpr mul_regs 0)) 1108 (hi_lolo Gpr (value_regs_get_gpr mul_regs 1)) 1109 ;; dst_hi = add hilo_hilo, hi_lolo 1110 (dst_hi Gpr (x64_add $I64 hilo_hilo hi_lolo))) 1111 (value_gprs dst_lo dst_hi))) 1112 1113;; The `mul` and `imul` instructions on x64 are defined as taking 64-bit 1114;; operands and producing a 128-bit result, which exactly matches the semantics 1115;; of widening 64-bit inputs to 128-bit and then multiplying them. That means 1116;; that these cases can get some some simpler codegen. 1117(rule 5 (lower (has_type $I128 (imul _ (uextend _ x @ (value_type $I64)) 1118 (uextend _ y @ (value_type $I64))))) 1119 (x64_mul $I64 false x y)) 1120(rule 5 (lower (has_type $I128 (imul _ (sextend _ x @ (value_type $I64)) 1121 (sextend _ y @ (value_type $I64))))) 1122 (x64_mul $I64 true x y)) 1123 1124;; SSE. 1125 1126;; (No i8x16 multiply.) 1127 1128(rule (lower (has_type (multi_lane 16 8) (imul _ x y))) 1129 (x64_pmullw x y)) 1130 1131(rule (lower (has_type (multi_lane 32 4) (imul _ x y))) 1132 (if-let true (has_sse41)) 1133 (x64_pmulld x y)) 1134 1135;; Without `pmulld` the `pmuludq` instruction is used instead which performs 1136;; 32-bit multiplication storing the 64-bit result. The 64-bit result is 1137;; truncated to 32-bits and everything else is woven into place. 1138(rule -1 (lower (has_type (multi_lane 32 4) (imul _ x y))) 1139 (let ( 1140 (x Xmm x) 1141 (y Xmm y) 1142 (x_hi Xmm (x64_pshufd x 0b00_11_00_01)) 1143 (y_hi Xmm (x64_pshufd y 0b00_11_00_01)) 1144 (mul_lo Xmm (x64_pshufd (x64_pmuludq x y) 0b00_00_10_00)) 1145 (mul_hi Xmm (x64_pshufd (x64_pmuludq x_hi y_hi) 0b00_00_10_00)) 1146 ) 1147 (x64_punpckldq mul_lo mul_hi))) 1148 1149;; With AVX-512 we can implement `i64x2` multiplication with a single 1150;; instruction. 1151(rule 3 (lower (has_type (multi_lane 64 2) (imul _ x y))) 1152 (if-let true (has_avx512vl)) 1153 (if-let true (has_avx512dq)) 1154 (x64_vpmullq x y)) 1155 1156;; Otherwise, for i64x2 multiplication we describe a lane A as being composed of 1157;; a 32-bit upper half "Ah" and a 32-bit lower half "Al". The 32-bit long hand 1158;; multiplication can then be written as: 1159;; 1160;; Ah Al 1161;; * Bh Bl 1162;; ----- 1163;; Al * Bl 1164;; + (Ah * Bl) << 32 1165;; + (Al * Bh) << 32 1166;; 1167;; So for each lane we will compute: 1168;; 1169;; A * B = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32 1170;; 1171;; Note, the algorithm will use `pmuludq` which operates directly on the lower 1172;; 32-bit (`Al` or `Bl`) of a lane and writes the result to the full 64-bits of 1173;; the lane of the destination. For this reason we don't need shifts to isolate 1174;; the lower 32-bits, however, we will need to use shifts to isolate the high 1175;; 32-bits when doing calculations, i.e., `Ah == A >> 32`. 1176(rule (lower (has_type (multi_lane 64 2) 1177 (imul _ a b))) 1178 (let ((a0 Xmm a) 1179 (b0 Xmm b) 1180 ;; a_hi = A >> 32 1181 (a_hi Xmm (x64_psrlq a0 (xmi_imm 32))) 1182 ;; ah_bl = Ah * Bl 1183 (ah_bl Xmm (x64_pmuludq a_hi b0)) 1184 ;; b_hi = B >> 32 1185 (b_hi Xmm (x64_psrlq b0 (xmi_imm 32))) 1186 ;; al_bh = Al * Bh 1187 (al_bh Xmm (x64_pmuludq a0 b_hi)) 1188 ;; aa_bb = ah_bl + al_bh 1189 (aa_bb Xmm (x64_paddq ah_bl al_bh)) 1190 ;; aa_bb_shifted = aa_bb << 32 1191 (aa_bb_shifted Xmm (x64_psllq aa_bb (xmi_imm 32))) 1192 ;; al_bl = Al * Bl 1193 (al_bl Xmm (x64_pmuludq a0 b0))) 1194 ;; al_bl + aa_bb_shifted 1195 (x64_paddq al_bl aa_bb_shifted))) 1196 1197;; Special case for `i32x4.extmul_high_i16x8_s`. 1198(rule 1 (lower (has_type (multi_lane 32 4) 1199 (imul _ (swiden_high _ (and (value_type (multi_lane 16 8)) 1200 x)) 1201 (swiden_high _ (and (value_type (multi_lane 16 8)) 1202 y))))) 1203 (let ((x2 Xmm x) 1204 (y2 Xmm y) 1205 (lo Xmm (x64_pmullw x2 y2)) 1206 (hi Xmm (x64_pmulhw x2 y2))) 1207 (x64_punpckhwd lo hi))) 1208 1209;; Special case for `i64x2.extmul_high_i32x4_s`. 1210(rule 1 (lower (has_type (multi_lane 64 2) 1211 (imul _ (swiden_high _ (and (value_type (multi_lane 32 4)) 1212 x)) 1213 (swiden_high _ (and (value_type (multi_lane 32 4)) 1214 y))))) 1215 (if-let true (has_sse41)) 1216 (let ((x2 Xmm (x64_pshufd x 0xFA)) 1217 (y2 Xmm (x64_pshufd y 0xFA))) 1218 (x64_pmuldq x2 y2))) 1219 1220;; Special case for `i32x4.extmul_low_i16x8_s`. 1221(rule 1 (lower (has_type (multi_lane 32 4) 1222 (imul _ (swiden_low _ (and (value_type (multi_lane 16 8)) 1223 x)) 1224 (swiden_low _ (and (value_type (multi_lane 16 8)) 1225 y))))) 1226 (let ((x2 Xmm x) 1227 (y2 Xmm y) 1228 (lo Xmm (x64_pmullw x2 y2)) 1229 (hi Xmm (x64_pmulhw x2 y2))) 1230 (x64_punpcklwd lo hi))) 1231 1232;; Special case for `i64x2.extmul_low_i32x4_s`. 1233(rule 1 (lower (has_type (multi_lane 64 2) 1234 (imul _ (swiden_low _ (and (value_type (multi_lane 32 4)) 1235 x)) 1236 (swiden_low _ (and (value_type (multi_lane 32 4)) 1237 y))))) 1238 (if-let true (has_sse41)) 1239 (let ((x2 Xmm (x64_pshufd x 0x50)) 1240 (y2 Xmm (x64_pshufd y 0x50))) 1241 (x64_pmuldq x2 y2))) 1242 1243;; Special case for `i32x4.extmul_high_i16x8_u`. 1244(rule 1 (lower (has_type (multi_lane 32 4) 1245 (imul _ (uwiden_high _ (and (value_type (multi_lane 16 8)) 1246 x)) 1247 (uwiden_high _ (and (value_type (multi_lane 16 8)) 1248 y))))) 1249 (let ((x2 Xmm x) 1250 (y2 Xmm y) 1251 (lo Xmm (x64_pmullw x2 y2)) 1252 (hi Xmm (x64_pmulhuw x2 y2))) 1253 (x64_punpckhwd lo hi))) 1254 1255;; Special case for `i64x2.extmul_high_i32x4_u`. 1256(rule 1 (lower (has_type (multi_lane 64 2) 1257 (imul _ (uwiden_high _ (and (value_type (multi_lane 32 4)) 1258 x)) 1259 (uwiden_high _ (and (value_type (multi_lane 32 4)) 1260 y))))) 1261 (let ((x2 Xmm (x64_pshufd x 0xFA)) 1262 (y2 Xmm (x64_pshufd y 0xFA))) 1263 (x64_pmuludq x2 y2))) 1264 1265;; Special case for `i32x4.extmul_low_i16x8_u`. 1266(rule 1 (lower (has_type (multi_lane 32 4) 1267 (imul _ (uwiden_low _ (and (value_type (multi_lane 16 8)) 1268 x)) 1269 (uwiden_low _ (and (value_type (multi_lane 16 8)) 1270 y))))) 1271 (let ((x2 Xmm x) 1272 (y2 Xmm y) 1273 (lo Xmm (x64_pmullw x2 y2)) 1274 (hi Xmm (x64_pmulhuw x2 y2))) 1275 (x64_punpcklwd lo hi))) 1276 1277;; Special case for `i64x2.extmul_low_i32x4_u`. 1278(rule 1 (lower (has_type (multi_lane 64 2) 1279 (imul _ (uwiden_low _ (and (value_type (multi_lane 32 4)) 1280 x)) 1281 (uwiden_low _ (and (value_type (multi_lane 32 4)) 1282 y))))) 1283 (let ((x2 Xmm (x64_pshufd x 0x50)) 1284 (y2 Xmm (x64_pshufd y 0x50))) 1285 (x64_pmuludq x2 y2))) 1286 1287;;;; Rules for `iabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1288 1289(rule 1 (lower (has_type $I8X16 (iabs _ x))) 1290 (if-let true (has_ssse3)) 1291 (x64_pabsb_a_or_avx x)) 1292 1293;; Note the use of `pminub` with signed inputs will produce the positive signed 1294;; result which is what is desired here. The `pmaxub` isn't available until 1295;; SSE4.1 in which case the single-instruction above lowering would apply. 1296(rule (lower (has_type $I8X16 (iabs _ x))) 1297 (let ( 1298 (x Xmm x) 1299 (negated Xmm (x64_psubb (xmm_zero $I8X16) x)) 1300 ) 1301 (x64_pminub_a x negated))) 1302 1303(rule 1 (lower (has_type $I16X8 (iabs _ x))) 1304 (if-let true (has_ssse3)) 1305 (x64_pabsw_a_or_avx x)) 1306 1307(rule (lower (has_type $I16X8 (iabs _ x))) 1308 (let ( 1309 (x Xmm x) 1310 (negated Xmm (x64_psubw (xmm_zero $I16X8) x)) 1311 ) 1312 (x64_pmaxsw_a x negated))) 1313 1314(rule 1 (lower (has_type $I32X4 (iabs _ x))) 1315 (if-let true (has_ssse3)) 1316 (x64_pabsd_a_or_avx x)) 1317 1318;; Generate a `negative_mask` which is either numerically -1 or 0 depending on 1319;; if the lane is negative. If the lane is positive then the xor operation 1320;; won't change the lane but otherwise it'll bit-flip everything. By then 1321;; subtracting the mask this subtracts 0 for positive lanes (does nothing) or 1322;; ends up adding one for negative lanes. This means that for a negative lane 1323;; `x` the result is `!x + 1` which is the result of negating it. 1324(rule (lower (has_type $I32X4 (iabs _ x))) 1325 (let ( 1326 (x Xmm x) 1327 (negative_mask Xmm (x64_psrad x (xmi_imm 31))) 1328 (flipped_if_negative Xmm (x64_pxor x negative_mask)) 1329 ) 1330 (x64_psubd flipped_if_negative negative_mask))) 1331 1332;; When AVX512 is available, we can use a single `vpabsq` instruction. 1333(rule 2 (lower (has_type $I64X2 (iabs _ x))) 1334 (if-let true (has_avx512vl)) 1335 (if-let true (has_avx512f)) 1336 (x64_vpabsq x)) 1337 1338;; Otherwise, we use a separate register, `neg`, to contain the results of `0 - 1339;; x` and then blend in those results with `blendvpd` if the MSB of `neg` was 1340;; set to 1 (i.e. if `neg` was negative or, conversely, if `x` was originally 1341;; positive). 1342(rule 1 (lower (has_type $I64X2 (iabs _ x))) 1343 (if-let true (has_sse41)) 1344 (let ((rx Xmm x) 1345 (neg Xmm (x64_psubq (imm $I64X2 0) rx))) 1346 (x64_blendvpd neg rx neg))) 1347 1348;; and if `blendvpd` isn't available then perform a shift/shuffle to generate a 1349;; mask of which lanes are negative, followed by flipping bits/sub to make both 1350;; positive. 1351(rule (lower (has_type $I64X2 (iabs _ x))) 1352 (let ((x Xmm x) 1353 (signs Xmm (x64_psrad x (RegMemImm.Imm 31))) 1354 (signs Xmm (x64_pshufd signs 0b11_11_01_01)) 1355 (xor_if_negative Xmm (x64_pxor x signs))) 1356 (x64_psubq xor_if_negative signs))) 1357 1358;; `i64` and smaller. 1359 1360(rule -1 (lower (has_type (fits_in_64 ty) (iabs _ x))) 1361 (let ((src Gpr x) 1362 (neg ProducesFlags (x64_neg_paired ty src)) 1363 ;; Manually extract the result from the neg, then ignore 1364 ;; it below, since we need to pass it into the cmove 1365 ;; before we pass the cmove to with_flags_reg. 1366 (neg_result Gpr (produces_flags_get_reg neg)) 1367 ;; When the neg instruction sets the sign flag, 1368 ;; takes the original (non-negative) value. 1369 (cmove ConsumesFlags (cmove ty (CC.S) src neg_result))) 1370 (with_flags_reg (produces_flags_ignore neg) cmove))) 1371 1372;; `i128`. Negate the low bits, `adc` to the higher bits, then negate high bits. 1373(rule (lower (has_type $I128 (iabs _ x))) 1374 ;; Get the high/low registers for `x`. 1375 (let ((x_regs ValueRegs x) 1376 (x_lo Gpr (value_regs_get_gpr x_regs 0)) 1377 (x_hi Gpr (value_regs_get_gpr x_regs 1)) 1378 ; negate low bits, then add 0 with carry to high bits. 1379 (neg_lo ProducesFlags (x64_neg_paired $I64 x_lo)) 1380 (adc_hi ConsumesFlags (x64_adc_paired $I64 x_hi (imm $I64 0))) 1381 (neg_adc_vals ValueRegs (with_flags neg_lo adc_hi)) 1382 ; negate high bits. 1383 (neg_hi ProducesFlags (x64_neg_paired $I64 (value_regs_get neg_adc_vals 1))) 1384 (neg_hi_flag_only ProducesFlags (produces_flags_ignore neg_hi)) 1385 ; cmove based on sign flag from hi negation. 1386 (cmove_lo ConsumesFlags (cmove $I64 (CC.S) x_lo 1387 (value_regs_get neg_adc_vals 0))) 1388 (cmove_hi ConsumesFlags (cmove $I64 (CC.S) x_hi 1389 (produces_flags_get_reg neg_hi))) 1390 (cmoves ConsumesFlags (consumes_flags_concat cmove_lo cmove_hi))) 1391 (with_flags neg_hi_flag_only cmoves))) 1392 1393;;;; Rules for `fabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1394 1395(rule (lower (has_type $F32 (fabs _ x))) 1396 (x64_andps x (imm $F32 0x7fffffff))) 1397 1398(rule (lower (has_type $F64 (fabs _ x))) 1399 (x64_andpd x (imm $F64 0x7fffffffffffffff))) 1400 1401;; Special case for `f32x4.abs`. 1402(rule (lower (has_type $F32X4 (fabs _ x))) 1403 (x64_andps x 1404 (x64_psrld (vector_all_ones) (xmi_imm 1)))) 1405 1406;; Special case for `f64x2.abs`. 1407(rule (lower (has_type $F64X2 (fabs _ x))) 1408 (x64_andpd x 1409 (x64_psrlq (vector_all_ones) (xmi_imm 1)))) 1410 1411;;;; Rules for `fneg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1412 1413(rule (lower (has_type $F32 (fneg _ x))) 1414 (x64_xorps x (imm $F32 0x80000000))) 1415 1416(rule (lower (has_type $F64 (fneg _ x))) 1417 (x64_xorpd x (imm $F64 0x8000000000000000))) 1418 1419(rule (lower (has_type $F32X4 (fneg _ x))) 1420 (x64_xorps x 1421 (x64_pslld (vector_all_ones) (xmi_imm 31)))) 1422 1423(rule (lower (has_type $F64X2 (fneg _ x))) 1424 (x64_xorpd x 1425 (x64_psllq (vector_all_ones) (xmi_imm 63)))) 1426 1427;;;; Rules for `bmask` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1428 1429; Recursion: reduces 128-bit cases to 64-bit. 1430(decl rec lower_bmask (Type Type ValueRegs) ValueRegs) 1431 1432;; Values that fit in a register 1433;; 1434;; Use the neg instruction on the input which sets the CF (carry) flag 1435;; to 0 if the input is 0 or 1 otherwise. 1436;; We then subtract the output register with itself, which always gives a 0, 1437;; however use the carry flag from the previous negate to generate a -1 if it 1438;; was nonzero. 1439;; 1440;; neg in_reg 1441;; sbb out_reg, out_reg 1442(rule 0 1443 (lower_bmask (fits_in_64 out_ty) (fits_in_64 in_ty) val) 1444 (let ((reg Gpr (value_regs_get_gpr val 0)) 1445 (out ValueRegs (with_flags 1446 (x64_neg_paired in_ty reg) 1447 (x64_sbb_paired out_ty reg reg)))) 1448 ;; Extract only the output of the sbb instruction 1449 (value_reg (value_regs_get out 1)))) 1450 1451 1452;; If the input type is I128 we can `or` the registers, and recurse to the general case. 1453(rule 1 1454 (lower_bmask (fits_in_64 out_ty) $I128 val) 1455 (let ((lo Gpr (value_regs_get_gpr val 0)) 1456 (hi Gpr (value_regs_get_gpr val 1)) 1457 (mixed Gpr (x64_or $I64 lo hi))) 1458 (lower_bmask out_ty $I64 (value_reg mixed)))) 1459 1460;; If the output type is I128 we just duplicate the result of the I64 lowering 1461(rule 2 1462 (lower_bmask $I128 in_ty val) 1463 (let ((res ValueRegs (lower_bmask $I64 in_ty val)) 1464 (res Gpr (value_regs_get_gpr res 0))) 1465 (value_regs res res))) 1466 1467 1468;; Call the lower_bmask rule that does all the procssing 1469(rule (lower (has_type out_ty (bmask _ x @ (value_type in_ty)))) 1470 (lower_bmask out_ty in_ty x)) 1471 1472;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1473 1474;; `i64` and smaller. 1475 1476(rule -2 (lower (has_type ty (bnot _ x))) 1477 (if (ty_int_ref_scalar_64 ty)) 1478 (x64_not ty x)) 1479 1480 1481;; `i128`. 1482 1483(decl not_i128 (Value) ValueRegs) 1484(rule (not_i128 x) 1485 (let ((x_regs ValueRegs x) 1486 (x_lo Gpr (value_regs_get_gpr x_regs 0)) 1487 (x_hi Gpr (value_regs_get_gpr x_regs 1))) 1488 (value_gprs (x64_not $I64 x_lo) 1489 (x64_not $I64 x_hi)))) 1490 1491(rule (lower (has_type $I128 (bnot _ x))) 1492 (not_i128 x)) 1493 1494;; f32 and f64 1495 1496(rule -3 (lower (has_type (ty_scalar_float ty) (bnot _ x))) 1497 (x64_xor_vector ty x (vector_all_ones))) 1498 1499;; Special case for vector-types where bit-negation is an xor against an 1500;; all-one value 1501(rule -1 (lower (has_type ty @ (multi_lane _bits _lanes) (bnot _ x))) 1502 (x64_xor_vector ty x (vector_all_ones))) 1503 1504;;;; Rules for `bitselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1505 1506(rule (lower (has_type ty @ (multi_lane _bits _lanes) 1507 (bitselect _ condition 1508 if_true 1509 if_false))) 1510 ;; a = and if_true, condition 1511 ;; b = and_not condition, if_false 1512 ;; or b, a 1513 (let ((cond_xmm Xmm condition) 1514 (a Xmm (sse_and ty if_true cond_xmm)) 1515 (b Xmm (sse_and_not ty cond_xmm if_false))) 1516 (sse_or ty b a))) 1517 1518;; If every byte of the condition is guaranteed to be all ones or all zeroes, 1519;; we can use x64_blend. 1520(rule 1 (lower (has_type ty @ (multi_lane _bits _lanes) 1521 (bitselect _ condition 1522 if_true 1523 if_false))) 1524 (if-let true (has_sse41)) 1525 (if (all_ones_or_all_zeros condition)) 1526 (x64_pblendvb if_false if_true condition)) 1527 1528(decl pure partial all_ones_or_all_zeros (Value) bool) 1529(rule (all_ones_or_all_zeros (and (icmp _ _ _ _) (value_type (multi_lane _ _)))) true) 1530(rule (all_ones_or_all_zeros (and (fcmp _ _ _ _) (value_type (multi_lane _ _)))) true) 1531(rule (all_ones_or_all_zeros (and (bitcast _ _ (fcmp _ _ _ _)) (value_type (multi_lane _ _)))) true) 1532(rule (all_ones_or_all_zeros (vconst _ (vconst_all_ones_or_all_zeros))) true) 1533 1534(decl pure vconst_all_ones_or_all_zeros () Constant) 1535(extern extractor vconst_all_ones_or_all_zeros vconst_all_ones_or_all_zeros) 1536 1537;; Specializations for floating-pointer compares to generate a `minp*` or a 1538;; `maxp*` instruction. These are equivalent to the wasm `f32x4.{pmin,pmax}` 1539;; instructions and how they're lowered into CLIF. Note the careful ordering 1540;; of all the operands here to ensure that the input CLIF matched is implemented 1541;; by the corresponding x64 instruction. 1542(rule 2 (lower (has_type $F32X4 (bitselect _ (bitcast _ _ (fcmp _ (FloatCC.LessThan) x y)) x y))) 1543 (x64_minps x y)) 1544(rule 2 (lower (has_type $F64X2 (bitselect _ (bitcast _ _ (fcmp _ (FloatCC.LessThan) x y)) x y))) 1545 (x64_minpd x y)) 1546 1547(rule 3 (lower (has_type $F32X4 (bitselect _ (bitcast _ _ (fcmp _ (FloatCC.LessThan) y x)) x y))) 1548 (x64_maxps x y)) 1549(rule 3 (lower (has_type $F64X2 (bitselect _ (bitcast _ _ (fcmp _ (FloatCC.LessThan) y x)) x y))) 1550 (x64_maxpd x y)) 1551 1552;; Scalar rules 1553 1554(rule 3 (lower (has_type $I128 (bitselect _ c t f))) 1555 (let ((a ValueRegs (and_i128 c t)) 1556 (b ValueRegs (and_i128 (not_i128 c) f))) 1557 (or_i128 a b))) 1558 1559(rule 4 (lower (has_type (ty_int_ref_scalar_64 ty) (bitselect _ c t f))) 1560 (let ((a Gpr (x64_and ty c t)) 1561 (b Gpr (x64_and ty (x64_not ty c) f))) 1562 (x64_or ty a b))) 1563 1564(rule 5 (lower (has_type (ty_scalar_float ty) (bitselect _ c t f))) 1565 (let ((a Xmm (sse_and ty c t)) 1566 (c_neg Xmm (x64_xor_vector ty c (vector_all_ones))) 1567 (b Xmm (sse_and ty c_neg f))) 1568 (sse_or ty a b))) 1569 1570;;;; Rules for `blendv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1571 1572(rule (lower (has_type $I8X16 1573 (blendv _ condition if_true if_false))) 1574 (if-let true (has_sse41)) 1575 (x64_pblendvb if_false if_true condition)) 1576 1577(rule (lower (has_type $I32X4 1578 (blendv _ condition if_true if_false))) 1579 (if-let true (has_sse41)) 1580 (x64_blendvps if_false if_true condition)) 1581 1582(rule (lower (has_type $I64X2 1583 (blendv _ condition if_true if_false))) 1584 (if-let true (has_sse41)) 1585 (x64_blendvpd if_false if_true condition)) 1586 1587;;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1588 1589(rule 1 (lower (insertlane _ vec @ (value_type $I8X16) val (u8_from_uimm8 idx))) 1590 (if-let true (has_sse41)) 1591 (x64_pinsrb vec val idx)) 1592(rule 2 (lower (insertlane _ vec @ (value_type $I8X16) (sinkable_load_exact val) (u8_from_uimm8 idx))) 1593 (if-let true (has_sse41)) 1594 (x64_pinsrb vec val idx)) 1595 1596;; This lowering is particularly unoptimized and is mostly just here to work 1597;; rather than here to be fast. Requiring SSE 4.1 for the above lowering isn't 1598;; the end of the world hopefully as that's a pretty old instruction set, so 1599;; this is the "simplest" version that works on SSE2 for now. 1600;; 1601;; This lowering masks the original vector with a constant with all 1s except 1602;; for the "hole" where this value will get placed into, meaning the desired 1603;; lane is guaranteed as all 0s. Next the `val` is shuffled into this hole with 1604;; a few operations: 1605;; 1606;; 1. The `val` is zero-extended to 32-bits to guarantee the lower 32-bits 1607;; are all defined. 1608;; 2. An arithmetic shift-left is used with the low two bits of `n`, the 1609;; desired lane, to move the value into the right position within the 32-bit 1610;; register value. 1611;; 3. The 32-bit register is moved with `movd` into an XMM register 1612;; 4. The XMM register, where all lanes are 0 except for the first lane which 1613;; has the shifted value, is then shuffled with `pshufd` to move the 1614;; shifted value to the correct and final lane. This uses the upper two 1615;; bits of `n` to index the i32x4 lane that we're targeting. 1616;; 1617;; This all, laboriously, gets the `val` into the desired lane so it's then 1618;; `por`'d with the original vec-with-a-hole to produce the final result of the 1619;; insertion. 1620(rule (lower (insertlane _ vec @ (value_type $I8X16) val (u8_from_uimm8 n))) 1621 (let ((vec_with_hole Xmm (x64_pand vec (insert_i8x16_lane_hole n))) 1622 (val Gpr (x64_movzx (ExtMode.BL) val)) 1623 (val Gpr (x64_shll_mi val (u8_wrapping_shl (u8_and n 3) 3))) 1624 (val Xmm (x64_movd_to_xmm val)) 1625 (val_at_hole Xmm (x64_pshufd val (insert_i8x16_lane_pshufd_imm (u8_wrapping_shr n 2))))) 1626 (x64_por vec_with_hole val_at_hole))) 1627 1628(decl insert_i8x16_lane_hole (u8) VCodeConstant) 1629(extern constructor insert_i8x16_lane_hole insert_i8x16_lane_hole) 1630(decl insert_i8x16_lane_pshufd_imm (u8) u8) 1631(rule (insert_i8x16_lane_pshufd_imm 0) 0b01_01_01_00) 1632(rule (insert_i8x16_lane_pshufd_imm 1) 0b01_01_00_01) 1633(rule (insert_i8x16_lane_pshufd_imm 2) 0b01_00_01_01) 1634(rule (insert_i8x16_lane_pshufd_imm 3) 0b00_01_01_01) 1635 1636 1637;; i16x8.replace_lane 1638(rule (lower (insertlane _ vec @ (value_type $I16X8) val (u8_from_uimm8 idx))) 1639 (x64_pinsrw vec val idx)) 1640(rule 1 (lower (insertlane _ vec @ (value_type $I16X8) (sinkable_load_exact val) (u8_from_uimm8 idx))) 1641 (x64_pinsrw vec val idx)) 1642 1643;; i32x4.replace_lane 1644(rule 1 (lower (insertlane _ vec @ (value_type $I32X4) val (u8_from_uimm8 idx))) 1645 (if-let true (has_sse41)) 1646 (x64_pinsrd vec val idx)) 1647 1648(rule (lower (insertlane _ vec @ (value_type $I32X4) val (u8_from_uimm8 0))) 1649 (x64_movss_regmove vec (x64_movd_to_xmm val))) 1650 1651;; tmp = [ vec[1] vec[0] val[1] val[0] ] 1652;; result = [ vec[3] vec[2] tmp[0] tmp[2] ] 1653(rule (lower (insertlane _ vec @ (value_type $I32X4) val (u8_from_uimm8 1))) 1654 (let ((val Xmm (x64_movd_to_xmm val)) 1655 (vec Xmm vec)) 1656 (x64_shufps (x64_punpcklqdq val vec) vec 0b11_10_00_10))) 1657 1658;; tmp = [ vec[0] vec[3] val[0] val[0] ] 1659;; result = [ tmp[2] tmp[0] vec[1] vec[0] ] 1660(rule (lower (insertlane _ vec @ (value_type $I32X4) val (u8_from_uimm8 2))) 1661 (let ((val Xmm (x64_movd_to_xmm val)) 1662 (vec Xmm vec)) 1663 (x64_shufps vec (x64_shufps val vec 0b00_11_00_00) 0b10_00_01_00))) 1664 1665;; tmp = [ vec[3] vec[2] val[1] val[0] ] 1666;; result = [ tmp[0] tmp[2] vec[1] vec[0] ] 1667(rule (lower (insertlane _ vec @ (value_type $I32X4) val (u8_from_uimm8 3))) 1668 (let ((val Xmm (x64_movd_to_xmm val)) 1669 (vec Xmm vec)) 1670 (x64_shufps vec (x64_shufps val vec 0b11_10_01_00) 0b00_10_01_00))) 1671 1672;; i64x2.replace_lane 1673(rule 1 (lower (insertlane _ vec @ (value_type $I64X2) val (u8_from_uimm8 idx))) 1674 (if-let true (has_sse41)) 1675 (x64_pinsrq vec val idx)) 1676(rule (lower (insertlane _ vec @ (value_type $I64X2) val (u8_from_uimm8 0))) 1677 (x64_movsd_regmove vec (x64_movq_to_xmm val))) 1678(rule (lower (insertlane _ vec @ (value_type $I64X2) val (u8_from_uimm8 1))) 1679 (x64_punpcklqdq vec (x64_movq_to_xmm val))) 1680 1681;; (i64x2.replace_lane 1) with a splat as source for lane 0 -- we can elide 1682;; the splat and just do a move. This turns out to be a common pattern when 1683;; constructing an i64x2 out of two i64s. 1684(rule 3 (lower (insertlane _ (has_type $I64X2 (splat _ lane0)) 1685 lane1 1686 (u8_from_uimm8 1))) 1687 (if-let true (has_sse41)) 1688 (x64_pinsrq (bitcast_gpr_to_xmm 64 lane0) lane1 1)) 1689 1690(rule 1 (lower (insertlane _ vec @ (value_type $F32X4) (sinkable_load val) (u8_from_uimm8 idx))) 1691 (if-let true (has_sse41)) 1692 (x64_insertps vec val (sse_insertps_lane_imm idx))) 1693(rule (lower (insertlane _ vec @ (value_type $F32X4) val (u8_from_uimm8 idx))) 1694 (f32x4_insertlane vec val idx)) 1695 1696;; Helper function used below for `insertlane` but also here for other 1697(decl f32x4_insertlane (Xmm Xmm u8) Xmm) 1698 1699;; f32x4.replace_lane 1700(rule 1 (f32x4_insertlane vec val idx) 1701 (if-let true (has_sse41)) 1702 (x64_insertps vec val (sse_insertps_lane_imm idx))) 1703 1704;; External rust code used to calculate the immediate value to `insertps`. 1705(decl sse_insertps_lane_imm (u8) u8) 1706(extern constructor sse_insertps_lane_imm sse_insertps_lane_imm) 1707 1708;; f32x4.replace_lane 0 1709(rule (f32x4_insertlane vec val 0) 1710 (x64_movss_regmove vec val)) 1711 1712;; f32x4.replace_lane 1 1713;; tmp = [ vec[1] vec[0] val[1] val[0] ] 1714;; result = [ vec[3] vec[2] tmp[0] tmp[2] ] 1715(rule (f32x4_insertlane vec val 1) 1716 (let ((tmp Xmm (x64_movlhps val vec))) 1717 (x64_shufps tmp vec 0b11_10_00_10))) 1718 1719;; f32x4.replace_lane 2 1720;; tmp = [ vec[0] vec[3] val[0] val[0] ] 1721;; result = [ tmp[2] tmp[0] vec[1] vec[0] ] 1722(rule (f32x4_insertlane vec val 2) 1723 (let ((tmp Xmm (x64_shufps val vec 0b00_11_00_00))) 1724 (x64_shufps vec tmp 0b10_00_01_00))) 1725 1726;; f32x4.replace_lane 3 1727;; tmp = [ vec[3] vec[2] val[1] val[0] ] 1728;; result = [ tmp[0] tmp[2] vec[1] vec[0] ] 1729(rule (f32x4_insertlane vec val 3) 1730 (let ((tmp Xmm (x64_shufps val vec 0b11_10_01_00))) 1731 (x64_shufps vec tmp 0b00_10_01_00))) 1732 1733;; f64x2.replace_lane 0 1734;; 1735;; Here the `movsd` instruction is used specifically to specialize moving 1736;; into the fist lane where unlike above cases we're not using the lane 1737;; immediate as an immediate to the instruction itself. 1738(rule (lower (insertlane _ vec @ (value_type $F64X2) val (u8_from_uimm8 0))) 1739 (x64_movsd_regmove vec val)) 1740 1741;; f64x2.replace_lane 1 1742;; 1743;; Here the `movlhps` instruction is used specifically to specialize moving 1744;; into the second lane where unlike above cases we're not using the lane 1745;; immediate as an immediate to the instruction itself. 1746(rule (lower (insertlane _ vec @ (value_type $F64X2) val (u8_from_uimm8 1))) 1747 (x64_movlhps vec val)) 1748 1749;;;; Rules for `smin`, `smax`, `umin`, `umax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1750 1751;; `i64` and smaller. 1752 1753(decl cmp_and_choose (Type CC Value Value) ValueRegs) 1754(rule (cmp_and_choose (fits_in_64 ty) cc x y) 1755 (let( ;; We need to put x and y in registers explicitly because 1756 ;; we use the values more than once. Hence, even if these 1757 ;; are "unique uses" at the CLIF level and would otherwise 1758 ;; allow for load-op merging, here we cannot do that. 1759 (x_reg Reg x) 1760 (y_reg Reg y)) 1761 (with_flags_reg (x64_cmp ty y_reg x_reg) 1762 (cmove ty cc y_reg x_reg)))) 1763 1764(rule -1 (lower (has_type (fits_in_64 ty) (umin _ x y))) 1765 (cmp_and_choose ty (CC.B) x y)) 1766 1767(rule -1 (lower (has_type (fits_in_64 ty) (umax _ x y))) 1768 (cmp_and_choose ty (CC.NB) x y)) 1769 1770(rule -1 (lower (has_type (fits_in_64 ty) (smin _ x y))) 1771 (cmp_and_choose ty (CC.L) x y)) 1772 1773(rule -1 (lower (has_type (fits_in_64 ty) (smax _ x y))) 1774 (cmp_and_choose ty (CC.NL) x y)) 1775 1776;; SSE helpers for determining if single-instruction lowerings are available. 1777 1778(decl pure has_pmins (Type) bool) 1779(rule 1 (has_pmins $I16X8) true) 1780(rule 1 (has_pmins $I64X2) false) 1781(rule (has_pmins _) (has_sse41)) 1782 1783(decl pure has_pmaxs (Type) bool) 1784(rule 1 (has_pmaxs $I16X8) true) 1785(rule 1 (has_pmaxs $I64X2) false) 1786(rule (has_pmaxs _) (has_sse41)) 1787 1788(decl pure has_pmaxu (Type) bool) 1789(rule 1 (has_pmaxu $I8X16) true) 1790(rule 1 (has_pmaxu $I64X2) false) 1791(rule (has_pmaxu _) (has_sse41)) 1792 1793(decl pure has_pminu (Type) bool) 1794(rule 1 (has_pminu $I8X16) true) 1795(rule 1 (has_pminu $I64X2) false) 1796(rule (has_pminu _) (has_sse41)) 1797 1798;; SSE `smax`. 1799 1800(rule (lower (has_type (ty_vec128 ty) (smax _ x y))) 1801 (lower_vec_smax ty x y)) 1802 1803(decl lower_vec_smax (Type Xmm Xmm) Xmm) 1804(rule 1 (lower_vec_smax ty x y) 1805 (if-let true (has_pmaxs ty)) 1806 (x64_pmaxs ty x y)) 1807 1808(rule (lower_vec_smax ty x y) 1809 (let ( 1810 (x Xmm x) 1811 (y Xmm y) 1812 (cmp Xmm (x64_pcmpgt ty x y)) 1813 (x_is_max Xmm (x64_pand cmp x)) 1814 (y_is_max Xmm (x64_pandn cmp y)) 1815 ) 1816 (x64_por x_is_max y_is_max))) 1817 1818;; SSE `smin`. 1819 1820(rule 1 (lower (has_type (ty_vec128 ty) (smin _ x y))) 1821 (if-let true (has_pmins ty)) 1822 (x64_pmins ty x y)) 1823 1824(rule (lower (has_type (ty_vec128 ty) (smin _ x y))) 1825 (let ( 1826 (x Xmm x) 1827 (y Xmm y) 1828 (cmp Xmm (x64_pcmpgt ty y x)) 1829 (x_is_min Xmm (x64_pand cmp x)) 1830 (y_is_min Xmm (x64_pandn cmp y)) 1831 ) 1832 (x64_por x_is_min y_is_min))) 1833 1834;; SSE `umax`. 1835 1836(rule 2 (lower (has_type (ty_vec128 ty) (umax _ x y))) 1837 (if-let true (has_pmaxu ty)) 1838 (x64_pmaxu ty x y)) 1839 1840;; If y < x then the saturating subtraction will be zero, otherwise when added 1841;; back to x it'll return y. 1842(rule 1 (lower (has_type $I16X8 (umax _ x y))) 1843 (let ((x Xmm x)) 1844 (x64_paddw x (x64_psubusw y x)))) 1845 1846;; Flip the upper bits of each lane so the signed comparison has the same 1847;; result as a signed comparison, and then select the results with the output 1848;; mask. See `pcmpgt` lowering for info on flipping the upper bit. 1849(rule (lower (has_type (ty_vec128 ty) (umax _ x y))) 1850 (let ( 1851 (x Xmm x) 1852 (y Xmm y) 1853 (mask Xmm (flip_high_bit_mask ty)) 1854 (x_masked Xmm (x64_pxor x mask)) 1855 (y_masked Xmm (x64_pxor y mask)) 1856 (cmp Xmm (x64_pcmpgt ty x_masked y_masked)) 1857 (x_is_max Xmm (x64_pand cmp x)) 1858 (y_is_max Xmm (x64_pandn cmp y)) 1859 ) 1860 (x64_por x_is_max y_is_max))) 1861 1862(decl flip_high_bit_mask (Type) Xmm) 1863(rule (flip_high_bit_mask $I16X8) 1864 (x64_movdqu_load (emit_u128_le_const 0x8000_8000_8000_8000_8000_8000_8000_8000))) 1865(rule (flip_high_bit_mask $I32X4) 1866 (x64_movdqu_load (emit_u128_le_const 0x80000000_80000000_80000000_80000000))) 1867(rule (flip_high_bit_mask $I64X2) 1868 (x64_movdqu_load (emit_u128_le_const 0x8000000000000000_8000000000000000))) 1869 1870;; SSE `umin`. 1871 1872(rule 2 (lower (has_type (ty_vec128 ty) (umin _ x y))) 1873 (if-let true (has_pminu ty)) 1874 (x64_pminu ty x y)) 1875 1876;; If x < y then the saturating subtraction will be 0. Otherwise if x > y then 1877;; the saturated result, when subtracted again, will go back to `y`. 1878(rule 1 (lower (has_type $I16X8 (umin _ x y))) 1879 (let ((x Xmm x)) 1880 (x64_psubw x (x64_psubusw x y)))) 1881 1882;; Same as `umax`, and see `pcmpgt` for docs on flipping the upper bit. 1883(rule (lower (has_type (ty_vec128 ty) (umin _ x y))) 1884 (let ( 1885 (x Xmm x) 1886 (y Xmm y) 1887 (mask Xmm (flip_high_bit_mask ty)) 1888 (x_masked Xmm (x64_pxor x mask)) 1889 (y_masked Xmm (x64_pxor y mask)) 1890 (cmp Xmm (x64_pcmpgt ty y_masked x_masked)) 1891 (x_is_max Xmm (x64_pand cmp x)) 1892 (y_is_max Xmm (x64_pandn cmp y)) 1893 ) 1894 (x64_por x_is_max y_is_max))) 1895 1896;;;; Rules for `trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1897 1898(rule (lower (trap code)) 1899 (side_effect (x64_ud2_zo code))) 1900 1901;;;; Rules for `trapz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1902 1903(rule (lower (trapz val code)) 1904 (side_effect (trap_if_cond (cond_invert (is_nonzero_cmp val)) code))) 1905 1906(decl trap_if_cond (CondResult TrapCode) SideEffectNoResult) 1907(rule (trap_if_cond (CondResult.CC producer cc) tc) 1908 (with_flags_side_effect producer (trap_if cc tc))) 1909(rule (trap_if_cond (CondResult.And producer cc1 cc2) tc) 1910 (with_flags_side_effect producer (trap_if_and cc1 cc2 tc))) 1911(rule (trap_if_cond (CondResult.Or producer cc1 cc2) tc) 1912 (with_flags_side_effect producer (trap_if_or cc1 cc2 tc))) 1913 1914;;;; Rules for `trapnz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1915 1916(rule (lower (trapnz val code)) 1917 (side_effect (trap_if_cond (is_nonzero_cmp val) code))) 1918 1919;;;; Rules for `uadd_overflow_trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1920 1921(rule (lower (has_type (fits_in_64 ty) (uadd_overflow_trap _ a b tc))) 1922 (with_flags 1923 (x64_add_with_flags_paired ty a b) 1924 (trap_if (CC.B) tc))) 1925 1926;; Handle lhs immediates/sinkable loads in addition to the automatic rhs 1927;; handling of above. 1928 1929(rule 1 (lower (has_type (fits_in_64 ty) 1930 (uadd_overflow_trap _ (simm32_from_value a) b tc))) 1931 (with_flags 1932 (x64_add_with_flags_paired ty b a) 1933 (trap_if (CC.B) tc))) 1934 1935(rule 2 (lower (has_type (fits_in_64 ty) 1936 (uadd_overflow_trap _ (sinkable_load a) b tc))) 1937 (with_flags 1938 (x64_add_with_flags_paired ty b a) 1939 (trap_if (CC.B) tc))) 1940 1941;;;; Rules for `return` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1942 1943;; N.B.: the Ret itself is generated by the ABI. 1944(rule (lower (return args)) 1945 (lower_return args)) 1946 1947;;;; Rules for `icmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1948 1949(rule -2 (lower (icmp _ cc a @ (value_type (fits_in_64 ty)) b)) 1950 (lower_cond_bool (emit_cmp cc a b))) 1951 1952(rule -1 (lower (icmp _ cc a @ (value_type $I128) b)) 1953 (lower_cond_bool (emit_cmp cc a b))) 1954 1955;; Peephole optimization for `x < 0`, when x is a signed 64 bit value 1956(rule 2 (lower (has_type $I8 (icmp _ (IntCC.SignedLessThan) x @ (value_type $I64) (u64_from_iconst 0)))) 1957 (x64_shrq_mi x 63)) 1958 1959;; Peephole optimization for `0 > x`, when x is a signed 64 bit value 1960(rule 2 (lower (has_type $I8 (icmp _ (IntCC.SignedGreaterThan) (u64_from_iconst 0) x @ (value_type $I64)))) 1961 (x64_shrq_mi x 63)) 1962 1963;; Peephole optimization for `0 <= x`, when x is a signed 64 bit value 1964(rule 2 (lower (has_type $I8 (icmp _ (IntCC.SignedLessThanOrEqual) (u64_from_iconst 0) x @ (value_type $I64)))) 1965 (x64_shrq_mi (x64_not $I64 x) 63)) 1966 1967;; Peephole optimization for `x >= 0`, when x is a signed 64 bit value 1968(rule 2 (lower (has_type $I8 (icmp _ (IntCC.SignedGreaterThanOrEqual) x @ (value_type $I64) (u64_from_iconst 0)))) 1969 (x64_shrq_mi (x64_not $I64 x) 63)) 1970 1971;; Peephole optimization for `x < 0`, when x is a signed 32 bit value 1972(rule 2 (lower (has_type $I8 (icmp _ (IntCC.SignedLessThan) x @ (value_type $I32) (u64_from_iconst 0)))) 1973 (x64_shrl_mi x 31)) 1974 1975;; Peephole optimization for `0 > x`, when x is a signed 32 bit value 1976(rule 2 (lower (has_type $I8 (icmp _ (IntCC.SignedGreaterThan) (u64_from_iconst 0) x @ (value_type $I32)))) 1977 (x64_shrl_mi x 31)) 1978 1979;; Peephole optimization for `0 <= x`, when x is a signed 32 bit value 1980(rule 2 (lower (has_type $I8 (icmp _ (IntCC.SignedLessThanOrEqual) (u64_from_iconst 0) x @ (value_type $I32)))) 1981 (x64_shrl_mi (x64_not $I32 x) 31)) 1982 1983;; Peephole optimization for `x >= 0`, when x is a signed 32 bit value 1984(rule 2 (lower (has_type $I8 (icmp _ (IntCC.SignedGreaterThanOrEqual) x @ (value_type $I32) (u64_from_iconst 0)))) 1985 (x64_shrl_mi (x64_not $I32 x) 31)) 1986 1987;; For XMM-held values, we lower to `PCMP*` instructions, sometimes more than 1988;; one. To note: what is different here about the output values is that each 1989;; lane will be filled with all 1s or all 0s according to the comparison, 1990;; whereas for GPR-held values, the result will be simply 0 or 1 (upper bits 1991;; unset). 1992(rule (lower (icmp _ (IntCC.Equal) a @ (value_type (ty_vec128 ty)) b)) 1993 (x64_pcmpeq ty a b)) 1994 1995;; To lower a not-equals comparison, we perform an equality comparison 1996;; (PCMPEQ*) and then invert the bits (PXOR with all 1s). 1997(rule (lower (icmp _ (IntCC.NotEqual) a @ (value_type (ty_vec128 ty)) b)) 1998 (let ((checked Xmm (x64_pcmpeq ty a b)) 1999 (all_ones Xmm (vector_all_ones))) 2000 (x64_pxor checked all_ones))) 2001 2002;; SSE `sgt` 2003 2004(rule (lower (icmp _ (IntCC.SignedGreaterThan) a @ (value_type (ty_vec128 ty)) b)) 2005 (x64_pcmpgt ty a b)) 2006 2007;; SSE `slt` 2008 2009(rule (lower (icmp _ (IntCC.SignedLessThan) a @ (value_type (ty_vec128 ty)) b)) 2010 (x64_pcmpgt ty b a)) 2011 2012;; SSE `ugt` 2013 2014;; N.B.: we must manually prevent load coalescing operands; the 2015;; register allocator gets confused otherwise. 2016(rule 1 (lower (icmp _ (IntCC.UnsignedGreaterThan) a @ (value_type (ty_vec128 ty)) b)) 2017 (if-let true (has_pmaxu ty)) 2018 (let ((a Xmm a) 2019 (b Xmm b) 2020 (max Xmm (x64_pmaxu ty a b)) 2021 (eq Xmm (x64_pcmpeq ty max b))) 2022 (x64_pxor eq (vector_all_ones)))) 2023 2024;; Flip the upper bit of each lane so the result of a signed comparison is the 2025;; same as the result of an unsigned comparison (see docs on `pcmpgt` for more) 2026(rule (lower (icmp _ (IntCC.UnsignedGreaterThan) a @ (value_type (ty_vec128 ty)) b)) 2027 (let ((mask Xmm (flip_high_bit_mask ty)) 2028 (a_masked Xmm (x64_pxor a mask)) 2029 (b_masked Xmm (x64_pxor b mask))) 2030 (x64_pcmpgt ty a_masked b_masked))) 2031 2032;; SSE `ult` 2033 2034(rule 1 (lower (icmp _ (IntCC.UnsignedLessThan) a @ (value_type (ty_vec128 ty)) b)) 2035 (if-let true (has_pminu ty)) 2036 ;; N.B.: see note above. 2037 (let ((a Xmm a) 2038 (b Xmm b) 2039 (min Xmm (x64_pminu ty a b)) 2040 (eq Xmm (x64_pcmpeq ty min b))) 2041 (x64_pxor eq (vector_all_ones)))) 2042 2043;; Flip the upper bit of `a` and `b` so the signed comparison result will 2044;; be the same as the unsigned comparison result (see docs on `pcmpgt` for more). 2045(rule (lower (icmp _ (IntCC.UnsignedLessThan) a @ (value_type (ty_vec128 ty)) b)) 2046 (let ((mask Xmm (flip_high_bit_mask ty)) 2047 (a_masked Xmm (x64_pxor a mask)) 2048 (b_masked Xmm (x64_pxor b mask))) 2049 (x64_pcmpgt ty b_masked a_masked))) 2050 2051;; SSE `sge` 2052 2053;; Use `pmaxs*` and compare the result to `a` to see if it's `>= b`. 2054(rule 1 (lower (icmp _ (IntCC.SignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) 2055 (if-let true (has_pmaxs ty)) 2056 (x64_pcmpeq ty a (x64_pmaxs ty a b))) 2057 2058;; Without `pmaxs*` use a `pcmpgt*` with reversed operands and invert the 2059;; result. 2060(rule (lower (icmp _ (IntCC.SignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) 2061 (x64_pxor (x64_pcmpgt ty b a) (vector_all_ones))) 2062 2063;; SSE `sle` 2064 2065;; With `pmins*` use that and compare the result to `a`. 2066(rule 1 (lower (icmp _ (IntCC.SignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) 2067 (if-let true (has_pmins ty)) 2068 (x64_pcmpeq ty a (x64_pmins ty a b))) 2069 2070;; Without `pmins*` perform a greater-than test and invert the result. 2071(rule (lower (icmp _ (IntCC.SignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) 2072 (x64_pxor (x64_pcmpgt ty a b) (vector_all_ones))) 2073 2074;; SSE `uge` 2075 2076(rule 2 (lower (icmp _ (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) 2077 (if-let true (has_pmaxu ty)) 2078 (x64_pcmpeq ty a (x64_pmaxu ty a b))) 2079 2080;; Perform a saturating subtract of `a` from `b` and if the result is zero then 2081;; `a` is greater or equal. 2082(rule 1 (lower (icmp _ (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type $I16X8) b)) 2083 (x64_pcmpeqw (x64_psubusw b a) (xmm_zero $I16X8))) 2084 2085;; Flip the upper bit of each lane so the signed comparison is the same as 2086;; an unsigned one and then invert the result. See docs on `pcmpgt` for why 2087;; flipping the upper bit works. 2088(rule (lower (icmp _ (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) 2089 (let ( 2090 (mask Xmm (flip_high_bit_mask ty)) 2091 (a_masked Xmm (x64_pxor a mask)) 2092 (b_masked Xmm (x64_pxor b mask)) 2093 (cmp Xmm (x64_pcmpgt ty b_masked a_masked)) 2094 ) 2095 (x64_pxor cmp (vector_all_ones)))) 2096 2097;; SSE `ule` 2098 2099(rule 2 (lower (icmp _ (IntCC.UnsignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) 2100 (if-let true (has_pminu ty)) 2101 (x64_pcmpeq ty a (x64_pminu ty a b))) 2102 2103;; A saturating subtraction will produce zeros if `a` is less than `b`, so 2104;; compare that result to an all-zeros result to figure out lanes of `a` that 2105;; are <= to the lanes in `b` 2106(rule 1 (lower (icmp _ (IntCC.UnsignedLessThanOrEqual) a @ (value_type $I16X8) b)) 2107 (let ((zeros_if_a_is_min Xmm (x64_psubusw a b))) 2108 (x64_pcmpeqw zeros_if_a_is_min (xmm_zero $I8X16)))) 2109 2110;; Flip the upper bit of each lane in `a` and `b` so a signed comparison 2111;; produces the same result as an unsigned comparison. Then test test for `gt` 2112;; and invert the result to get the `le` that is desired here. See docs on 2113;; `pcmpgt` for why flipping the upper bit works. 2114(rule (lower (icmp _ (IntCC.UnsignedLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) 2115 (let ( 2116 (mask Xmm (flip_high_bit_mask ty)) 2117 (a_masked Xmm (x64_pxor a mask)) 2118 (b_masked Xmm (x64_pxor b mask)) 2119 (cmp Xmm (x64_pcmpgt ty a_masked b_masked)) 2120 ) 2121 (x64_pxor cmp (vector_all_ones)))) 2122 2123;;;; Rules for `fcmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2124 2125;; CLIF's `fcmp` instruction always operates on XMM registers--both scalar and 2126;; vector. For the scalar versions, we use the flag-setting behavior of the 2127;; `UCOMIS*` instruction to `SETcc` a 0 or 1 in a GPR register. Note that CLIF's 2128;; `select` uses the same kind of flag-setting behavior but chooses values other 2129;; than 0 or 1. 2130;; 2131;; Checking the result of `UCOMIS*` is unfortunately difficult in some cases 2132;; because we do not have `SETcc` instructions that explicitly check 2133;; simultaneously for the condition (i.e., `eq`, `le`, `gt`, etc.) *and* 2134;; orderedness. Instead, we must check the flags multiple times. The UCOMIS* 2135;; documentation (see Intel's Software Developer's Manual, volume 2, chapter 4) 2136;; is helpful: 2137;; - unordered assigns Z = 1, P = 1, C = 1 2138;; - greater than assigns Z = 0, P = 0, C = 0 2139;; - less than assigns Z = 0, P = 0, C = 1 2140;; - equal assigns Z = 1, P = 0, C = 0 2141 2142(rule -1 (lower (fcmp _ cc a @ (value_type (ty_scalar_float ty)) b)) 2143 (lower_cond_bool (emit_fcmp cc a b))) 2144 2145;; For vector lowerings, we use `CMPP*` instructions with a 3-bit operand that 2146;; determines the comparison to make. Note that comparisons that succeed will 2147;; fill the lane with 1s; comparisons that do not will fill the lane with 0s. 2148 2149(rule (lower (fcmp _ (FloatCC.Equal) a @ (value_type (ty_vec128 ty)) b)) 2150 (x64_cmpp ty a b (FcmpImm.Equal))) 2151(rule (lower (fcmp _ (FloatCC.NotEqual) a @ (value_type (ty_vec128 ty)) b)) 2152 (x64_cmpp ty a b (FcmpImm.NotEqual))) 2153(rule (lower (fcmp _ (FloatCC.LessThan) a @ (value_type (ty_vec128 ty)) b)) 2154 (x64_cmpp ty a b (FcmpImm.LessThan))) 2155(rule (lower (fcmp _ (FloatCC.LessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) 2156 (x64_cmpp ty a b (FcmpImm.LessThanOrEqual))) 2157(rule (lower (fcmp _ (FloatCC.Ordered) a @ (value_type (ty_vec128 ty)) b)) 2158 (x64_cmpp ty a b (FcmpImm.Ordered))) 2159(rule (lower (fcmp _ (FloatCC.Unordered) a @ (value_type (ty_vec128 ty)) b)) 2160 (x64_cmpp ty a b (FcmpImm.Unordered))) 2161(rule (lower (fcmp _ (FloatCC.UnorderedOrGreaterThan) a @ (value_type (ty_vec128 ty)) b)) 2162 (x64_cmpp ty a b (FcmpImm.UnorderedOrGreaterThan))) 2163(rule (lower (fcmp _ (FloatCC.UnorderedOrGreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) 2164 (x64_cmpp ty a b (FcmpImm.UnorderedOrGreaterThanOrEqual))) 2165 2166;; Some vector lowerings rely on flipping the operands and using a reversed 2167;; comparison code. 2168 2169(rule (lower (fcmp _ (FloatCC.GreaterThan) a @ (value_type (ty_vec128 ty)) b)) 2170 (x64_cmpp ty b a (FcmpImm.LessThan))) 2171(rule (lower (fcmp _ (FloatCC.GreaterThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) 2172 (x64_cmpp ty b a (FcmpImm.LessThanOrEqual))) 2173(rule (lower (fcmp _ (FloatCC.UnorderedOrLessThan) a @ (value_type (ty_vec128 ty)) b)) 2174 (x64_cmpp ty b a (FcmpImm.UnorderedOrGreaterThan))) 2175(rule (lower (fcmp _ (FloatCC.UnorderedOrLessThanOrEqual) a @ (value_type (ty_vec128 ty)) b)) 2176 (x64_cmpp ty b a (FcmpImm.UnorderedOrGreaterThanOrEqual))) 2177 2178;; Some vector lowerings are simply not supported for certain codes: 2179;; - FloatCC::OrderedNotEqual 2180;; - FloatCC::UnorderedOrEqual 2181 2182;;;; Rules for `select` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2183 2184(rule (lower (select _ cond x y)) (lower_select (is_nonzero_cmp cond) x y)) 2185 2186; Recursion: at most once to swap the And case for an Or. 2187(decl rec lower_select (CondResult Value Value) InstOutput) 2188(rule 0 (lower_select cond a @ (value_type (ty_int (fits_in_64 ty))) b) 2189 (lower_select_gpr ty cond a b)) 2190(rule 1 (lower_select cond a @ (value_type (is_xmm_type ty)) b) 2191 (lower_select_xmm ty cond a b)) 2192(rule 2 (lower_select cond a @ (value_type $I128) b) 2193 (lower_select128 cond a b)) 2194;; Note that for all of the rules below if the condition evaluates to 2195;; `CondResult.And` that's swapped to `CondResult.Or` by negating the conditions 2196;; and swapping the two values to make codegen a bit easier (only have to do the 2197;; "or" case). 2198(rule 3 (lower_select cond @ (CondResult.And _ _ _) a b) 2199 (lower_select (cond_invert cond) b a)) 2200 2201(decl lower_select_gpr (Type CondResult GprMem Gpr) Gpr) 2202(rule (lower_select_gpr ty (CondResult.CC flags cc) a b) 2203 (value_regs_get_gpr (with_flags flags (cmove ty cc a b)) 0)) 2204(rule (lower_select_gpr ty (CondResult.Or flags cc1 cc2) a b) 2205 (let ((c1 ConsumesFlags (cmove ty cc1 a b)) 2206 (tmp Gpr (consumes_flags_get_reg c1)) 2207 (c2 ConsumesFlags (cmove ty cc2 a tmp))) 2208 (value_regs_get (with_flags flags (consumes_flags_return_last c1 c2)) 0))) 2209 2210(decl lower_select_xmm (Type CondResult Xmm Xmm) Xmm) 2211(rule (lower_select_xmm ty (CondResult.CC flags cc) a b) 2212 (value_regs_get (with_flags flags (cmove_xmm ty cc a b)) 0)) 2213(rule (lower_select_xmm ty (CondResult.Or flags cc1 cc2) a b) 2214 (let ((c1 ConsumesFlags (cmove_xmm ty cc1 a b)) 2215 (tmp Xmm (consumes_flags_get_reg c1)) 2216 (c2 ConsumesFlags (cmove_xmm ty cc2 a tmp))) 2217 (value_regs_get (with_flags flags (consumes_flags_return_last c1 c2)) 0))) 2218 2219(decl lower_select128 (CondResult ValueRegs ValueRegs) ValueRegs) 2220(rule (lower_select128 (CondResult.CC flags cc) a b) 2221 (with_flags flags (cmove128 cc a b))) 2222(rule (lower_select128 (CondResult.Or flags cc1 cc2) a b) 2223 (let ((c1 ConsumesFlags (cmove128 cc1 a b)) 2224 (tmp ValueRegs (consumes_flags_get_regs c1)) 2225 (c2 ConsumesFlags (cmove128 cc2 a tmp))) 2226 (with_flags flags (consumes_flags_return_last c1 c2)))) 2227 2228;; Helper to `lower_select128` above to create two `cmove` instructions based 2229;; on the CC provided for the upper/lower halves. 2230(decl cmove128 (CC ValueRegs ValueRegs) ConsumesFlags) 2231(rule (cmove128 cc cons alt) 2232 (consumes_flags_concat 2233 (cmove $I64 cc (value_regs_get_gpr cons 0) (value_regs_get_gpr alt 0)) 2234 (cmove $I64 cc (value_regs_get_gpr cons 1) (value_regs_get_gpr alt 1)))) 2235 2236;; Helper to the "Or" conditions above to create a `ConsumesFlags` that is a 2237;; sequence of two other `ConsumesFlags` which returns the result of the 2238;; second `ConsumesFlags`, the result of the `lower_select*` operation. 2239(decl consumes_flags_return_last (ConsumesFlags ConsumesFlags) ConsumesFlags) 2240(rule (consumes_flags_return_last 2241 (ConsumesFlags.ConsumesFlagsReturnsReg inst1 _) 2242 (ConsumesFlags.ConsumesFlagsReturnsReg inst2 dst)) 2243 (ConsumesFlags.ConsumesFlagsTwiceReturnsValueRegs inst1 inst2 dst)) 2244(rule (consumes_flags_return_last 2245 (ConsumesFlags.ConsumesFlagsTwiceReturnsValueRegs i1 i2 _) 2246 (ConsumesFlags.ConsumesFlagsTwiceReturnsValueRegs i3 i4 dst)) 2247 (ConsumesFlags.ConsumesFlagsFourTimesReturnsValueRegs i1 i2 i3 i4 dst)) 2248 2249;; Specializations for floating-point compares to generate a `mins*` or a 2250;; `maxs*` instruction. These are equivalent to the "pseudo-m{in,ax}" 2251;; specializations for vectors. 2252(rule 3 (lower (has_type $F32 (select _ (maybe_uextend (fcmp _ (FloatCC.LessThan) x y)) x y))) 2253 (x64_minss x y)) 2254(rule 3 (lower (has_type $F64 (select _ (maybe_uextend (fcmp _ (FloatCC.LessThan) x y)) x y))) 2255 (x64_minsd x y)) 2256(rule 4 (lower (has_type $F32 (select _ (maybe_uextend (fcmp _ (FloatCC.LessThan) y x)) x y))) 2257 (x64_maxss x y)) 2258(rule 4 (lower (has_type $F64 (select _ (maybe_uextend (fcmp _ (FloatCC.LessThan) y x)) x y))) 2259 (x64_maxsd x y)) 2260 2261;; Rules for `clz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2262 2263(rule 2 (lower (has_type (ty_32_or_64 ty) (clz _ src))) 2264 (do_clz ty ty src)) 2265 2266(rule 1 (lower (has_type (ty_8_or_16 ty) (clz _ src))) 2267 (let ((extended Gpr (extend_to_gpr src $I64 (ExtendKind.Zero))) 2268 (clz Gpr (do_clz $I64 $I64 extended))) 2269 (x64_sub $I64 clz (RegMemImm.Imm (u32_wrapping_sub 64 (ty_bits ty)))))) 2270 2271 2272(rule 0 (lower 2273 (has_type $I128 2274 (clz _ src))) 2275 (let ((upper Gpr (do_clz $I64 $I64 (value_regs_get_gpr src 1))) 2276 (lower Gpr (x64_add $I64 2277 (do_clz $I64 $I64 (value_regs_get_gpr src 0)) 2278 (RegMemImm.Imm 64))) 2279 (result_lo Gpr 2280 (with_flags_reg 2281 (x64_cmpq_mi_sxb upper 64) 2282 (cmove $I64 (CC.NZ) upper lower)))) 2283 (value_regs result_lo (imm $I64 0)))) 2284 2285;; Implementation helper for clz; operates on 32 or 64-bit units. 2286(decl do_clz (Type Type Gpr) Gpr) 2287 2288;; If available, we can use a plain lzcnt instruction here. Note no 2289;; special handling is required for zero inputs, because the machine 2290;; instruction does what the CLIF expects for zero, i.e. it returns 2291;; zero. 2292(rule 1 (do_clz ty orig_ty src) 2293 (if-let true (has_lzcnt)) 2294 (x64_lzcnt ty src)) 2295 2296(rule 0 (do_clz ty orig_ty src) 2297 (let ((highest_bit_index Reg (bsr_or_else ty src (imm_i64 $I64 -1))) 2298 (bits_minus_1 Reg (imm ty (u64_wrapping_sub (ty_bits_u64 orig_ty) 1)))) 2299 (x64_sub ty bits_minus_1 highest_bit_index))) 2300 2301;; Rules for `ctz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2302 2303(rule 2 (lower (has_type (ty_32_or_64 ty) (ctz _ src))) 2304 (do_ctz ty ty src)) 2305 2306(rule 1 (lower (has_type (ty_8_or_16 ty) (ctz _ src))) 2307 (let ((extended Gpr (extend_to_gpr src $I32 (ExtendKind.Zero))) 2308 (stopbit Gpr (x64_or $I32 extended (RegMemImm.Imm (u32_wrapping_shl 1 (ty_bits ty)))))) 2309 (do_ctz $I32 ty stopbit))) 2310 2311(rule 0 (lower 2312 (has_type $I128 2313 (ctz _ src))) 2314 (let ((lower Gpr (do_ctz $I64 $I64 (value_regs_get_gpr src 0))) 2315 (upper Gpr (x64_add $I64 2316 (do_ctz $I64 $I64 (value_regs_get_gpr src 1)) 2317 (RegMemImm.Imm 64))) 2318 (result_lo Gpr 2319 (with_flags_reg 2320 (x64_cmpq_mi_sxb lower 64) 2321 (cmove $I64 (CC.Z) upper lower)))) 2322 (value_regs result_lo (imm $I64 0)))) 2323 2324(decl do_ctz (Type Type Gpr) Gpr) 2325 2326;; Analogous to `clz` cases above, but using mirror instructions 2327;; (tzcnt vs lzcnt, bsf vs bsr). 2328(rule 1 (do_ctz ty orig_ty src) 2329 (if-let true (has_bmi1)) 2330 (x64_tzcnt ty src)) 2331 2332(rule 0 (do_ctz ty orig_ty src) 2333 (bsf_or_else ty src (imm $I64 (ty_bits_u64 orig_ty)))) 2334 2335;; Rules for `cls` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2336 2337(rule 2 (lower (has_type (ty_32_or_64 ty) (cls _ src))) 2338 (do_cls ty src)) 2339 2340(rule 1 (lower (has_type (ty_8_or_16 ty) (cls _ src))) 2341 (let ((extended Gpr (extend_to_gpr src $I32 (ExtendKind.Sign))) 2342 (cls Gpr (do_cls $I32 extended))) 2343 (x64_sub $I32 cls (RegMemImm.Imm (u32_wrapping_sub 32 (ty_bits ty)))))) 2344 2345(rule 0 (lower 2346 (has_type $I128 2347 (cls _ src))) 2348 (let ((upper Gpr (do_cls $I64 (value_regs_get_gpr src 1))) 2349 (sign_fill Gpr (x64_sarq_mi (value_regs_get_gpr src 1) 63)) 2350 (xored Gpr (x64_xor $I64 (value_regs_get_gpr src 0) sign_fill)) 2351 (lower Gpr (x64_add $I64 2352 (do_clz $I64 $I64 xored) 2353 (RegMemImm.Imm 63))) 2354 (result_lo Gpr 2355 (with_flags_reg 2356 (x64_cmpq_mi_sxb upper 63) 2357 (cmove $I64 (CC.NZ) upper lower)))) 2358 (value_regs result_lo (imm $I64 0)))) 2359 2360;; Implementation helper for cls; operates on 32 or 64-bit units. 2361(decl do_cls (Type Gpr) Gpr) 2362 2363;; cls is implemented via clz using the identity: cls(x) = clz(x ^ (x >> 1)) - 1 2364(rule (do_cls ty src) 2365 (let ((shifted Gpr (x64_sar ty src (Imm8Gpr.Imm8 1))) 2366 (xored Gpr (x64_xor ty src (RegMemImm.Reg shifted))) 2367 (clz Gpr (do_clz ty ty xored))) 2368 (x64_sub ty clz (RegMemImm.Imm 1)))) 2369 2370;; Rules for `popcnt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2371 2372(rule 4 (lower (has_type (ty_32_or_64 ty) (popcnt _ src))) 2373 (if-let true (use_popcnt)) 2374 (x64_popcnt ty src)) 2375 2376(rule 3 (lower (has_type (ty_8_or_16 ty) (popcnt _ src))) 2377 (if-let true (use_popcnt)) 2378 (x64_popcnt $I32 (extend_to_gpr src $I32 (ExtendKind.Zero)))) 2379 2380(rule 1 (lower (has_type $I128 (popcnt _ src))) 2381 (if-let true (use_popcnt)) 2382 (let ((lo_count Gpr (x64_popcnt $I64 (value_regs_get_gpr src 0))) 2383 (hi_count Gpr (x64_popcnt $I64 (value_regs_get_gpr src 1)))) 2384 (value_regs (x64_add $I64 lo_count hi_count) (imm $I64 0)))) 2385 2386(rule -1 (lower 2387 (has_type (ty_32_or_64 ty) 2388 (popcnt _ src))) 2389 (do_popcnt ty src)) 2390 2391(rule -2 (lower 2392 (has_type (ty_8_or_16 ty) 2393 (popcnt _ src))) 2394 (do_popcnt $I32 (extend_to_gpr src $I32 (ExtendKind.Zero)))) 2395 2396(rule (lower 2397 (has_type $I128 2398 (popcnt _ src))) 2399 (let ((lo_count Gpr (do_popcnt $I64 (value_regs_get_gpr src 0))) 2400 (hi_count Gpr (do_popcnt $I64 (value_regs_get_gpr src 1)))) 2401 (value_regs (x64_add $I64 lo_count hi_count) (imm $I64 0)))) 2402 2403;; Implementation of popcount when we don't nave a native popcount 2404;; instruction. 2405(decl do_popcnt (Type Gpr) Gpr) 2406(rule (do_popcnt $I64 src) 2407 (let ((shifted1 Gpr (x64_shrq_mi src 1)) 2408 (sevens Gpr (imm $I64 0x7777777777777777)) 2409 (masked1 Gpr (x64_and $I64 shifted1 sevens)) 2410 ;; diff1 := src - ((src >> 1) & 0b0111_0111_0111...) 2411 (diff1 Gpr (x64_sub $I64 src masked1)) 2412 (shifted2 Gpr (x64_shrq_mi masked1 1)) 2413 (masked2 Gpr (x64_and $I64 shifted2 sevens)) 2414 ;; diff2 := diff1 - ((diff1 >> 1) & 0b0111_0111_0111...) 2415 (diff2 Gpr (x64_sub $I64 diff1 masked2)) 2416 (shifted3 Gpr (x64_shrq_mi masked2 1)) 2417 (masked3 Gpr (x64_and $I64 shifted3 sevens)) 2418 ;; diff3 := diff2 - ((diff2 >> 1) & 0b0111_0111_0111...) 2419 ;; 2420 ;; At this point, each nibble of diff3 is the popcount of 2421 ;; that nibble. This works because at each step above, we 2422 ;; are basically subtracting floor(value / 2) from the 2423 ;; running value; the leftover remainder is 1 if the LSB 2424 ;; was 1. After three steps, we have (nibble / 8) -- 0 or 2425 ;; 1 for the MSB of the nibble -- plus three possible 2426 ;; additions for the three other bits. 2427 (diff3 Gpr (x64_sub $I64 diff2 masked3)) 2428 ;; Add the two nibbles of each byte together. 2429 (sum1 Gpr (x64_add $I64 (x64_shrq_mi diff3 4) diff3)) 2430 ;; Mask the above sum to have the popcount for each byte 2431 ;; in the lower nibble of that byte. 2432 (ofof Gpr (imm $I64 0x0f0f0f0f0f0f0f0f)) 2433 (masked4 Gpr (x64_and $I64 sum1 ofof)) 2434 (ones Gpr (imm $I64 0x0101010101010101)) 2435 ;; Use a multiply to sum all of the bytes' popcounts into 2436 ;; the top byte. Consider the binomial expansion for the 2437 ;; top byte: it is the sum of the bytes (masked4 >> 56) * 2438 ;; 0x01 + (masked4 >> 48) * 0x01 + (masked4 >> 40) * 0x01 2439 ;; + ... + (masked4 >> 0). 2440 (mul Gpr (x64_imul $I64 masked4 ones)) 2441 ;; Now take that top byte and return it as the popcount. 2442 (final Gpr (x64_shrq_mi mul 56))) 2443 final)) 2444 2445;; This is the 32-bit version of the above; the steps for each nibble 2446;; are the same, we just use constants half as wide. 2447(rule (do_popcnt $I32 src) 2448 (let ((shifted1 Gpr (x64_shrl_mi src 1)) 2449 (sevens Gpr (imm $I32 0x77777777)) 2450 (masked1 Gpr (x64_and $I32 shifted1 sevens)) 2451 (diff1 Gpr (x64_sub $I32 src masked1)) 2452 (shifted2 Gpr (x64_shrl_mi masked1 1)) 2453 (masked2 Gpr (x64_and $I32 shifted2 sevens)) 2454 (diff2 Gpr (x64_sub $I32 diff1 masked2)) 2455 (shifted3 Gpr (x64_shrl_mi masked2 1)) 2456 (masked3 Gpr (x64_and $I32 shifted3 sevens)) 2457 (diff3 Gpr (x64_sub $I32 diff2 masked3)) 2458 (sum1 Gpr (x64_add $I32 (x64_shrl_mi diff3 4) diff3)) 2459 (masked4 Gpr (x64_and $I32 sum1 (RegMemImm.Imm 0x0f0f0f0f))) 2460 (mul Gpr (x64_imul_imm $I32 masked4 0x01010101)) 2461 (final Gpr (x64_shrl_mi mul 24))) 2462 final)) 2463 2464 2465(rule 2 (lower (has_type $I8X16 (popcnt _ src))) 2466 (if-let true (has_avx512vl)) 2467 (if-let true (has_avx512bitalg)) 2468 (x64_vpopcntb src)) 2469 2470 2471;; For SSE 4.2 we use Mula's algorithm (https://arxiv.org/pdf/1611.07612.pdf): 2472;; 2473;; __m128i count_bytes ( __m128i v) { 2474;; __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); 2475;; __m128i low_mask = _mm_set1_epi8 (0x0f); 2476;; __m128i lo = _mm_and_si128 (v, low_mask); 2477;; __m128i hi = _mm_and_si128 (_mm_srli_epi16 (v, 4), low_mask); 2478;; __m128i cnt1 = _mm_shuffle_epi8 (lookup, lo); 2479;; __m128i cnt2 = _mm_shuffle_epi8 (lookup, hi); 2480;; return _mm_add_epi8 (cnt1, cnt2); 2481;; } 2482;; 2483;; Details of the above algorithm can be found in the reference noted above, but the basics 2484;; are to create a lookup table that pre populates the popcnt values for each number [0,15]. 2485;; The algorithm uses shifts to isolate 4 bit sections of the vector, pshufb as part of the 2486;; lookup process, and adds together the results. 2487;; 2488;; __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); 2489 2490 2491(rule 1 (lower (has_type $I8X16 (popcnt _ src))) 2492 (if-let true (has_ssse3)) 2493 (let ((low_mask XmmMem (emit_u128_le_const 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f)) 2494 (low_nibbles Xmm (sse_and $I8X16 src low_mask)) 2495 ;; Note that this is a 16x8 shift, but that's OK; we mask 2496 ;; off anything that traverses from one byte to the next 2497 ;; with the low_mask below. 2498 (shifted_src Xmm (x64_psrlw src (xmi_imm 4))) 2499 (high_nibbles Xmm (sse_and $I8X16 shifted_src low_mask)) 2500 (lookup Xmm (x64_xmm_load_const $I8X16 2501 (emit_u128_le_const 0x04030302_03020201_03020201_02010100))) 2502 (bit_counts_low Xmm (x64_pshufb lookup low_nibbles)) 2503 (bit_counts_high Xmm (x64_pshufb lookup high_nibbles))) 2504 (x64_paddb bit_counts_low bit_counts_high))) 2505 2506;; A modified version of the popcnt method from Hacker's Delight. 2507(rule (lower (has_type $I8X16 (popcnt _ src))) 2508 (let ((mask1 XmmMem (emit_u128_le_const 0x77777777777777777777777777777777)) 2509 (src Xmm src) 2510 (shifted Xmm (x64_pand (x64_psrlq src (xmi_imm 1)) mask1)) 2511 (src Xmm (x64_psubb src shifted)) 2512 (shifted Xmm (x64_pand (x64_psrlq shifted (xmi_imm 1)) mask1)) 2513 (src Xmm (x64_psubb src shifted)) 2514 (shifted Xmm (x64_pand (x64_psrlq shifted (xmi_imm 1)) mask1)) 2515 (src Xmm (x64_psubb src shifted)) 2516 (src Xmm (x64_paddb src (x64_psrlw src (xmi_imm 4))))) 2517 (x64_pand src (emit_u128_le_const 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f)))) 2518 2519;; Rules for `bitrev` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2520 2521(rule (lower (has_type $I8 (bitrev _ src))) 2522 (do_bitrev8 $I32 src)) 2523 2524(rule (lower (has_type $I16 (bitrev _ src))) 2525 (do_bitrev16 $I32 src)) 2526 2527(rule (lower (has_type $I32 (bitrev _ src))) 2528 (do_bitrev32 $I32 src)) 2529 2530(rule (lower (has_type $I64 (bitrev _ src))) 2531 (do_bitrev64 $I64 src)) 2532 2533(rule (lower (has_type $I128 (bitrev _ src))) 2534 (value_regs 2535 (do_bitrev64 $I64 (value_regs_get_gpr src 1)) 2536 (do_bitrev64 $I64 (value_regs_get_gpr src 0)))) 2537 2538(decl do_bitrev8 (Type Gpr) Gpr) 2539(rule (do_bitrev8 ty src) 2540 (let ((tymask u64 (ty_mask ty)) 2541 (mask1 Gpr (imm ty (u64_and tymask 0x5555555555555555))) 2542 (lo1 Gpr (x64_and ty src mask1)) 2543 (hi1 Gpr (x64_and ty (x64_shr ty src (Imm8Gpr.Imm8 1)) mask1)) 2544 (swap1 Gpr (x64_or ty 2545 (x64_shl ty lo1 (Imm8Gpr.Imm8 1)) 2546 hi1)) 2547 (mask2 Gpr (imm ty (u64_and tymask 0x3333333333333333))) 2548 (lo2 Gpr (x64_and ty swap1 mask2)) 2549 (hi2 Gpr (x64_and ty (x64_shr ty swap1 (Imm8Gpr.Imm8 2)) mask2)) 2550 (swap2 Gpr (x64_or ty 2551 (x64_shl ty lo2 (Imm8Gpr.Imm8 2)) 2552 hi2)) 2553 (mask4 Gpr (imm ty (u64_and tymask 0x0f0f0f0f0f0f0f0f))) 2554 (lo4 Gpr (x64_and ty swap2 mask4)) 2555 (hi4 Gpr (x64_and ty (x64_shr ty swap2 (Imm8Gpr.Imm8 4)) mask4)) 2556 (swap4 Gpr (x64_or ty 2557 (x64_shl ty lo4 (Imm8Gpr.Imm8 4)) 2558 hi4))) 2559 swap4)) 2560 2561(decl do_bitrev16 (Type Gpr) Gpr) 2562(rule (do_bitrev16 ty src) 2563 (let ((src_ Gpr (do_bitrev8 ty src)) 2564 (tymask u64 (ty_mask ty)) 2565 (mask8 Gpr (imm ty (u64_and tymask 0x00ff00ff00ff00ff))) 2566 (lo8 Gpr (x64_and ty src_ mask8)) 2567 (hi8 Gpr (x64_and ty (x64_shr ty src_ (Imm8Gpr.Imm8 8)) mask8)) 2568 (swap8 Gpr (x64_or ty 2569 (x64_shl ty lo8 (Imm8Gpr.Imm8 8)) 2570 hi8))) 2571 swap8)) 2572 2573(decl do_bitrev32 (Type Gpr) Gpr) 2574(rule (do_bitrev32 ty src) 2575 (let ((src_ Gpr (do_bitrev16 ty src)) 2576 (tymask u64 (ty_mask ty)) 2577 (mask16 Gpr (imm ty (u64_and tymask 0x0000ffff0000ffff))) 2578 (lo16 Gpr (x64_and ty src_ mask16)) 2579 (hi16 Gpr (x64_and ty (x64_shr ty src_ (Imm8Gpr.Imm8 16)) mask16)) 2580 (swap16 Gpr (x64_or ty 2581 (x64_shl ty lo16 (Imm8Gpr.Imm8 16)) 2582 hi16))) 2583 swap16)) 2584 2585(decl do_bitrev64 (Type Gpr) Gpr) 2586(rule (do_bitrev64 ty @ $I64 src) 2587 (let ((src_ Gpr (do_bitrev32 ty src)) 2588 (mask32 Gpr (imm ty 0xffffffff)) 2589 (lo32 Gpr (x64_and ty src_ mask32)) 2590 (hi32 Gpr (x64_shrq_mi src_ 32)) 2591 (swap32 Gpr (x64_or ty 2592 (x64_shlq_mi lo32 32) 2593 hi32))) 2594 swap32)) 2595 2596;; Rules for `bswap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2597 2598;; x64 bswap instruction is only for 32- or 64-bit swaps 2599;; implement the 16-bit swap as a rotl by 8 2600(rule (lower (has_type $I16 (bswap _ src))) 2601 (x64_rolw_mi src 8)) 2602 2603(rule (lower (has_type $I32 (bswap _ src))) 2604 (x64_bswap $I32 src)) 2605 2606(rule (lower (has_type $I64 (bswap _ src))) 2607 (x64_bswap $I64 src)) 2608 2609(rule (lower (has_type $I128 (bswap _ src))) 2610 (value_regs 2611 (x64_bswap $I64 (value_regs_get_gpr src 1)) 2612 (x64_bswap $I64 (value_regs_get_gpr src 0)))) 2613 2614;; Rules for `uextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2615 2616;; I{8,16,32,64} -> I128. 2617(rule (lower (has_type $I128 (uextend _ src))) 2618 (value_regs (extend_to_gpr src $I64 (ExtendKind.Zero)) (imm $I64 0))) 2619 2620;; I{8,16,32} -> I64. 2621(rule (lower (has_type $I64 (uextend _ src))) 2622 (extend_to_gpr src $I64 (ExtendKind.Zero))) 2623 2624;; I{8,16} -> I32 2625;; I8 -> I16 2626(rule -1 (lower (has_type (fits_in_32 _) (uextend _ src))) 2627 (extend_to_gpr src $I32 (ExtendKind.Zero))) 2628 2629;; Rules for `sextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2630 2631;; I{8,16,32} -> I128. 2632;; 2633;; Produce upper 64 bits sign-extended from lower 64: shift right by 2634;; 63 bits to spread the sign bit across the result. 2635(rule (lower (has_type $I128 (sextend _ src))) 2636 (let ((lo Gpr (extend_to_gpr src $I64 (ExtendKind.Sign))) 2637 (hi Gpr (x64_sarq_mi lo 63))) 2638 (value_regs lo hi))) 2639 2640;; I{8,16,32} -> I64. 2641(rule (lower (has_type $I64 (sextend _ src))) 2642 (extend_to_gpr src $I64 (ExtendKind.Sign))) 2643 2644;; I{8,16} -> I32 2645;; I8 -> I16 2646(rule -1 (lower (has_type (fits_in_32 _) (sextend _ src))) 2647 (extend_to_gpr src $I32 (ExtendKind.Sign))) 2648 2649;; Rules for `ireduce` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2650 2651;; T -> T is always a no-op, even I128 -> I128. 2652(rule (lower (has_type ty (ireduce _ src @ (value_type ty)))) 2653 src) 2654 2655;; T -> I{64,32,16,8}: We can simply pass through the value: values 2656;; are always stored with high bits undefined, so we can just leave 2657;; them be. 2658(rule 1 (lower (has_type (fits_in_64 ty) (ireduce _ src))) 2659 (value_regs_get_gpr src 0)) 2660 2661;; Rules for `debugtrap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2662 2663(rule (lower (debugtrap)) 2664 (side_effect (x64_int3_zo))) 2665 2666;; Rules for `x86_pmaddubsw` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2667 2668(rule (lower (has_type $I16X8 (x86_pmaddubsw _ x y))) 2669 (if-let true (has_ssse3)) 2670 (x64_pmaddubsw y x)) 2671 2672;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2673 2674(rule (lower (has_type $F32 (fadd _ x y))) 2675 (x64_addss x y)) 2676(rule (lower (has_type $F64 (fadd _ x y))) 2677 (x64_addsd x y)) 2678(rule (lower (has_type $F32X4 (fadd _ x y))) 2679 (x64_addps x y)) 2680(rule (lower (has_type $F64X2 (fadd _ x y))) 2681 (x64_addpd x y)) 2682 2683;; The above rules automatically sink loads for rhs operands, so additionally 2684;; add rules for sinking loads with lhs operands. 2685(rule 1 (lower (has_type $F32 (fadd _ (sinkable_load x) y))) 2686 (x64_addss y x)) 2687(rule 1 (lower (has_type $F64 (fadd _ (sinkable_load x) y))) 2688 (x64_addsd y x)) 2689(rule 1 (lower (has_type $F32X4 (fadd _ (sinkable_load x) y))) 2690 (x64_addps y x)) 2691(rule 1 (lower (has_type $F64X2 (fadd _ (sinkable_load x) y))) 2692 (x64_addpd y x)) 2693 2694;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2695 2696(rule (lower (has_type $F32 (fsub _ x y))) 2697 (x64_subss x y)) 2698(rule (lower (has_type $F64 (fsub _ x y))) 2699 (x64_subsd x y)) 2700(rule (lower (has_type $F32X4 (fsub _ x y))) 2701 (x64_subps x y)) 2702(rule (lower (has_type $F64X2 (fsub _ x y))) 2703 (x64_subpd x y)) 2704 2705;; Rules for `fmul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2706 2707(rule (lower (has_type $F32 (fmul _ x y))) 2708 (x64_mulss x y)) 2709(rule (lower (has_type $F64 (fmul _ x y))) 2710 (x64_mulsd x y)) 2711(rule (lower (has_type $F32X4 (fmul _ x y))) 2712 (x64_mulps x y)) 2713(rule (lower (has_type $F64X2 (fmul _ x y))) 2714 (x64_mulpd x y)) 2715 2716;; The above rules automatically sink loads for rhs operands, so additionally 2717;; add rules for sinking loads with lhs operands. 2718(rule 1 (lower (has_type $F32 (fmul _ (sinkable_load x) y))) 2719 (x64_mulss y x)) 2720(rule 1 (lower (has_type $F64 (fmul _ (sinkable_load x) y))) 2721 (x64_mulsd y x)) 2722(rule 1 (lower (has_type $F32X4 (fmul _ (sinkable_load x) y))) 2723 (x64_mulps y x)) 2724(rule 1 (lower (has_type $F64X2 (fmul _ (sinkable_load x) y))) 2725 (x64_mulpd y x)) 2726 2727;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2728 2729(rule (lower (has_type $F32 (fdiv _ x y))) 2730 (x64_divss x y)) 2731(rule (lower (has_type $F64 (fdiv _ x y))) 2732 (x64_divsd x y)) 2733(rule (lower (has_type $F32X4 (fdiv _ x y))) 2734 (x64_divps x y)) 2735(rule (lower (has_type $F64X2 (fdiv _ x y))) 2736 (x64_divpd x y)) 2737 2738;; Rules for `sqrt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2739(rule (lower (has_type $F32 (sqrt _ x))) 2740 (x64_sqrtss (xmm_zero $F32X4) x)) 2741(rule (lower (has_type $F64 (sqrt _ x))) 2742 (x64_sqrtsd (xmm_zero $F64X2) x)) 2743(rule (lower (has_type $F32X4 (sqrt _ x))) 2744 (x64_sqrtps x)) 2745(rule (lower (has_type $F64X2 (sqrt _ x))) 2746 (x64_sqrtpd x)) 2747 2748;; Rules for `fpromote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2749(rule (lower (has_type $F64 (fpromote _ x))) 2750 (x64_cvtss2sd (xmm_zero $F64X2) x)) 2751 2752;; Rules for `fvpromote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2753(rule (lower (has_type $F64X2 (fvpromote_low _ x))) 2754 (x64_cvtps2pd (put_in_xmm x))) 2755 2756;; Rules for `fdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2757(rule (lower (has_type $F32 (fdemote _ x))) 2758 (x64_cvtsd2ss (xmm_zero $F32X4) x)) 2759 2760;; Rules for `fvdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2761(rule (lower (has_type $F32X4 (fvdemote _ x))) 2762 (x64_cvtpd2ps x)) 2763 2764;; Rules for `fmin` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2765 2766(rule (lower (has_type $F32 (fmin _ x y))) 2767 (xmm_min_max_seq $F32 true x y)) 2768(rule (lower (has_type $F64 (fmin _ x y))) 2769 (xmm_min_max_seq $F64 true x y)) 2770 2771;; Vector-typed version. We don't use single pseudoinstructions as 2772;; above, because we don't need to generate a mini-CFG. Instead, we 2773;; perform a branchless series of operations. 2774;; 2775;; We cannot simply use native min instructions (minps, minpd) because 2776;; NaN handling is different per CLIF semantics than on 2777;; x86. Specifically, if an argument is NaN, or the arguments are both 2778;; zero but of opposite signs, then the x86 instruction always 2779;; produces the second argument. However, per CLIF semantics, we 2780;; require that fmin(NaN, _) = fmin(_, NaN) = NaN, and fmin(+0, -0) = 2781;; fmin(-0, +0) = -0. 2782 2783(rule (lower (has_type $F32X4 (fmin _ x y))) 2784 ;; Compute min(x, y) and min(y, x) with native 2785 ;; instructions. These will differ in one of the edge cases 2786 ;; above that we have to handle properly. (Conversely, if they 2787 ;; don't differ, then the native instruction's answer is the 2788 ;; right one per CLIF semantics.) 2789 (let ((x Xmm x) ;; force x/y into registers and disallow load sinking 2790 (y Xmm y) 2791 (min1 Xmm (x64_minps x y)) 2792 (min2 Xmm (x64_minps y x)) 2793 ;; Compute the OR of the two. Note that NaNs have an 2794 ;; exponent field of all-ones (0xFF for F32), so if either 2795 ;; result is a NaN, this OR will be. And if either is a 2796 ;; zero (which has an exponent of 0 and mantissa of 0), 2797 ;; this captures a sign-bit of 1 (negative) if either 2798 ;; input is negative. 2799 ;; 2800 ;; In the case where we don't have a +/-0 mismatch or 2801 ;; NaNs, then `min1` and `min2` are equal and `min_or` is 2802 ;; the correct minimum. 2803 (min_or Xmm (x64_orps min1 min2)) 2804 ;; "compare unordered" produces a true mask (all ones) in 2805 ;; a given lane if the min is a NaN. We use this to 2806 ;; generate a mask to ensure quiet NaNs. 2807 (is_nan_mask Xmm (x64_cmpps min_or min2 (FcmpImm.Unordered))) 2808 ;; OR in the NaN mask. 2809 (min_or_2 Xmm (x64_orps min_or is_nan_mask)) 2810 ;; Shift the NaN mask down so that it covers just the 2811 ;; fraction below the NaN signalling bit; we'll use this 2812 ;; to mask off non-canonical NaN payloads. 2813 ;; 2814 ;; All-ones for NaN, shifted down to leave 10 top bits (1 2815 ;; sign, 8 exponent, 1 QNaN bit that must remain set) 2816 ;; cleared. 2817 (nan_fraction_mask Xmm (x64_psrld is_nan_mask (xmi_imm 10))) 2818 ;; Do a NAND, so that we retain every bit not set in 2819 ;; `nan_fraction_mask`. This mask will be all zeroes (so 2820 ;; we retain every bit) in non-NaN cases, and will have 2821 ;; ones (so we clear those bits) in NaN-payload bits 2822 ;; otherwise. 2823 (final Xmm (x64_andnps nan_fraction_mask min_or_2))) 2824 final)) 2825 2826;; Likewise for F64 lanes, except that the right-shift is by 13 bits 2827;; (1 sign, 11 exponent, 1 QNaN bit). 2828(rule (lower (has_type $F64X2 (fmin _ x y))) 2829 (let ((x Xmm x) ;; force x/y into registers and disallow load sinking 2830 (y Xmm y) 2831 (min1 Xmm (x64_minpd x y)) 2832 (min2 Xmm (x64_minpd y x)) 2833 (min_or Xmm (x64_orpd min1 min2)) 2834 (is_nan_mask Xmm (x64_cmppd min1 min2 (FcmpImm.Unordered))) 2835 (min_or_2 Xmm (x64_orpd min_or is_nan_mask)) 2836 (nan_fraction_mask Xmm (x64_psrlq is_nan_mask (xmi_imm 13))) 2837 (final Xmm (x64_andnpd nan_fraction_mask min_or_2))) 2838 final)) 2839 2840;; Rules for `fmax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2841 2842(rule (lower (has_type $F32 (fmax _ x y))) 2843 (xmm_min_max_seq $F32 false x y)) 2844(rule (lower (has_type $F64 (fmax _ x y))) 2845 (xmm_min_max_seq $F64 false x y)) 2846 2847;; The vector version of fmax here is a dual to the fmin sequence 2848;; above, almost, with a few differences. 2849 2850(rule (lower (has_type $F32X4 (fmax _ x y))) 2851 ;; Compute max(x, y) and max(y, x) with native 2852 ;; instructions. These will differ in one of the edge cases 2853 ;; above that we have to handle properly. (Conversely, if they 2854 ;; don't differ, then the native instruction's answer is the 2855 ;; right one per CLIF semantics.) 2856 (let ((x Xmm x) ;; force x/y into registers and disallow load sinking 2857 (y Xmm y) 2858 (max1 Xmm (x64_maxps x y)) 2859 (max2 Xmm (x64_maxps y x)) 2860 ;; Compute the XOR of the two maxima. In the case 2861 ;; where we don't have a +/-0 mismatch or NaNs, then 2862 ;; `min1` and `min2` are equal and this XOR is zero. 2863 (max_xor Xmm (x64_xorps max1 max2)) 2864 ;; OR the XOR into one of the original maxima. If they are 2865 ;; equal, this does nothing. If max2 was NaN, its exponent 2866 ;; bits were all-ones, so the xor's exponent bits were the 2867 ;; complement of max1, and the OR of max1 and max_xor has 2868 ;; an all-ones exponent (is a NaN). If max1 was NaN, then 2869 ;; its exponent bits were already all-ones, so the OR will 2870 ;; be a NaN as well. 2871 (max_blended_nan Xmm (x64_orps max1 max_xor)) 2872 ;; Subtract the XOR. This ensures that if we had +0 and 2873 ;; -0, we end up with +0. 2874 (max_blended_nan_positive Xmm (x64_subps max_blended_nan max_xor)) 2875 ;; "compare unordered" produces a true mask (all ones) in 2876 ;; a given lane if the min is a NaN. We use this to 2877 ;; generate a mask to ensure quiet NaNs. 2878 (is_nan_mask Xmm (x64_cmpps max_blended_nan max_blended_nan (FcmpImm.Unordered))) 2879 ;; Shift the NaN mask down so that it covers just the 2880 ;; fraction below the NaN signalling bit; we'll use this 2881 ;; to mask off non-canonical NaN payloads. 2882 ;; 2883 ;; All-ones for NaN, shifted down to leave 10 top bits (1 2884 ;; sign, 8 exponent, 1 QNaN bit that must remain set) 2885 ;; cleared. 2886 (nan_fraction_mask Xmm (x64_psrld is_nan_mask (xmi_imm 10))) 2887 ;; Do a NAND, so that we retain every bit not set in 2888 ;; `nan_fraction_mask`. This mask will be all zeroes (so 2889 ;; we retain every bit) in non-NaN cases, and will have 2890 ;; ones (so we clear those bits) in NaN-payload bits 2891 ;; otherwise. 2892 (final Xmm (x64_andnps nan_fraction_mask max_blended_nan_positive))) 2893 final)) 2894 2895(rule (lower (has_type $F64X2 (fmax _ x y))) 2896 ;; Compute max(x, y) and max(y, x) with native 2897 ;; instructions. These will differ in one of the edge cases 2898 ;; above that we have to handle properly. (Conversely, if they 2899 ;; don't differ, then the native instruction's answer is the 2900 ;; right one per CLIF semantics.) 2901 (let ((x Xmm x) ;; force x/y into registers and disallow load sinking 2902 (y Xmm y) 2903 (max1 Xmm (x64_maxpd x y)) 2904 (max2 Xmm (x64_maxpd y x)) 2905 ;; Compute the XOR of the two maxima. In the case 2906 ;; where we don't have a +/-0 mismatch or NaNs, then 2907 ;; `min1` and `min2` are equal and this XOR is zero. 2908 (max_xor Xmm (x64_xorpd max1 max2)) 2909 ;; OR the XOR into one of the original maxima. If they are 2910 ;; equal, this does nothing. If max2 was NaN, its exponent 2911 ;; bits were all-ones, so the xor's exponent bits were the 2912 ;; complement of max1, and the OR of max1 and max_xor has 2913 ;; an all-ones exponent (is a NaN). If max1 was NaN, then 2914 ;; its exponent bits were already all-ones, so the OR will 2915 ;; be a NaN as well. 2916 (max_blended_nan Xmm (x64_orpd max1 max_xor)) 2917 ;; Subtract the XOR. This ensures that if we had +0 and 2918 ;; -0, we end up with +0. 2919 (max_blended_nan_positive Xmm (x64_subpd max_blended_nan max_xor)) 2920 ;; `cmpps` with predicate index `3` is `cmpunordps`, or 2921 ;; "compare unordered": it produces a true mask (all ones) 2922 ;; in a given lane if the min is a NaN. We use this to 2923 ;; generate a mask to ensure quiet NaNs. 2924 (is_nan_mask Xmm (x64_cmppd max_blended_nan max_blended_nan (FcmpImm.Unordered))) 2925 ;; Shift the NaN mask down so that it covers just the 2926 ;; fraction below the NaN signalling bit; we'll use this 2927 ;; to mask off non-canonical NaN payloads. 2928 ;; 2929 ;; All-ones for NaN, shifted down to leave 13 top bits (1 2930 ;; sign, 11 exponent, 1 QNaN bit that must remain set) 2931 ;; cleared. 2932 (nan_fraction_mask Xmm (x64_psrlq is_nan_mask (xmi_imm 13))) 2933 ;; Do a NAND, so that we retain every bit not set in 2934 ;; `nan_fraction_mask`. This mask will be all zeroes (so 2935 ;; we retain every bit) in non-NaN cases, and will have 2936 ;; ones (so we clear those bits) in NaN-payload bits 2937 ;; otherwise. 2938 (final Xmm (x64_andnpd nan_fraction_mask max_blended_nan_positive))) 2939 final)) 2940 2941;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2942 2943;; Base case for fma is to call out to one of two libcalls. For vectors they 2944;; need to be decomposed, handle each element individually, and then recomposed. 2945 2946(rule (lower (has_type $F32 (fma _ x y z))) 2947 (libcall_3 (LibCall.FmaF32) x y z)) 2948(rule (lower (has_type $F64 (fma _ x y z))) 2949 (libcall_3 (LibCall.FmaF64) x y z)) 2950 2951(rule (lower (has_type $F32X4 (fma _ x y z))) 2952 (let ( 2953 (x Xmm (put_in_xmm x)) 2954 (y Xmm (put_in_xmm y)) 2955 (z Xmm (put_in_xmm z)) 2956 (x0 Xmm (libcall_3 (LibCall.FmaF32) x y z)) 2957 (x1 Xmm (libcall_3 (LibCall.FmaF32) 2958 (x64_pshufd x 1) 2959 (x64_pshufd y 1) 2960 (x64_pshufd z 1))) 2961 (x2 Xmm (libcall_3 (LibCall.FmaF32) 2962 (x64_pshufd x 2) 2963 (x64_pshufd y 2) 2964 (x64_pshufd z 2))) 2965 (x3 Xmm (libcall_3 (LibCall.FmaF32) 2966 (x64_pshufd x 3) 2967 (x64_pshufd y 3) 2968 (x64_pshufd z 3))) 2969 2970 (tmp Xmm (f32x4_insertlane x0 x1 1)) 2971 (tmp Xmm (f32x4_insertlane tmp x2 2)) 2972 (tmp Xmm (f32x4_insertlane tmp x3 3)) 2973 ) 2974 tmp)) 2975(rule (lower (has_type $F64X2 (fma _ x y z))) 2976 (let ( 2977 (x Xmm (put_in_xmm x)) 2978 (y Xmm (put_in_xmm y)) 2979 (z Xmm (put_in_xmm z)) 2980 (x0 Xmm (libcall_3 (LibCall.FmaF64) x y z)) 2981 (x1 Xmm (libcall_3 (LibCall.FmaF64) 2982 (x64_pshufd x 0xee) 2983 (x64_pshufd y 0xee) 2984 (x64_pshufd z 0xee))) 2985 ) 2986 (x64_movlhps x0 x1))) 2987 2988 2989;; Special case for when the `fma` feature is active and a native instruction 2990;; can be used. 2991(rule 1 (lower (has_type ty (fma _ x y z))) 2992 (if-let true (use_fma)) 2993 (fmadd ty x y z)) 2994 2995(decl fmadd (Type Value Value Value) Xmm) 2996(decl fnmadd (Type Value Value Value) Xmm) 2997 2998;; Base case. Note that this will automatically sink a load with `z`, the value 2999;; to add. 3000(rule (fmadd ty x y z) (x64_vfmadd213 ty x y z)) 3001 3002;; Allow sinking loads with one of the two values being multiplied in addition 3003;; to the value being added. Note that both x and y can be sunk here due to 3004;; multiplication being commutative. 3005(rule 1 (fmadd ty (sinkable_load x) y z) (x64_vfmadd132 ty y z x)) 3006(rule 2 (fmadd ty x (sinkable_load y) z) (x64_vfmadd132 ty x z y)) 3007 3008;; If one of the values being multiplied is negated then use a `vfnmadd*` 3009;; instruction instead 3010(rule 3 (fmadd ty (fneg _ x) y z) (fnmadd ty x y z)) 3011(rule 4 (fmadd ty x (fneg _ y) z) (fnmadd ty x y z)) 3012 3013(rule (fnmadd ty x y z) (x64_vfnmadd213 ty x y z)) 3014(rule 1 (fnmadd ty (sinkable_load x) y z) (x64_vfnmadd132 ty y z x)) 3015(rule 2 (fnmadd ty x (sinkable_load y) z) (x64_vfnmadd132 ty x z y)) 3016 3017 3018(rule 2 (lower (has_type ty (fma _ x y (fneg _ z)))) 3019 (if-let true (use_fma)) 3020 (fmsub ty x y z)) 3021 3022;; fmsub and fnmsub 3023(decl fmsub (Type Value Value Value) Xmm) 3024(decl fnmsub (Type Value Value Value) Xmm) 3025 3026;; Base case, will sink a load of `z` automatically. 3027(rule (fmsub ty x y z) (x64_vfmsub213 ty x y z)) 3028 3029;; Allow sinking loads with one of the two values being multiplied in addition 3030;; to the value being subtracted. Note that both x and y can be sunk here due to 3031;; multiplication being commutative. 3032(rule 1 (fmsub ty (sinkable_load x) y z) (x64_vfmsub132 ty y z x)) 3033(rule 2 (fmsub ty x (sinkable_load y) z) (x64_vfmsub132 ty x z y)) 3034 3035;; If one of the values being multiplied is negated then use a `vfnmsub*` 3036;; instruction instead 3037(rule 3 (fmsub ty (fneg _ x) y z) (fnmsub ty x y z)) 3038(rule 4 (fmsub ty x (fneg _ y) z) (fnmsub ty x y z)) 3039 3040(rule (fnmsub ty x y z) (x64_vfnmsub213 ty x y z)) 3041(rule 1 (fnmsub ty (sinkable_load x) y z) (x64_vfnmsub132 ty y z x)) 3042(rule 2 (fnmsub ty x (sinkable_load y) z) (x64_vfnmsub132 ty x z y)) 3043 3044 3045;; Rules for `load*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3046 3047;; In order to load a value from memory to a GPR register, we may need to extend 3048;; the loaded value from 8-, 16-, or 32-bits to this backend's expected GPR 3049;; width: 64 bits. Note that `ext_mode` will load 1-bit types (booleans) as 3050;; 8-bit loads. 3051;; 3052;; By default, we zero-extend all sub-64-bit loads to a GPR. 3053(rule load_sub64_x64_movzx -4 (lower (has_type (and (fits_in_32 ty) (is_gpr_type _)) 3054 (load _ (little_or_native_endian flags) address offset))) 3055 (x64_movzx (ext_mode (ty_bits_u16 ty) 64) (to_amode flags address offset))) 3056;; But if we know that both the `from` and `to` are 64 bits, we simply load with 3057;; no extension. 3058(rule load_64_x64_movzx -1 (lower (has_type (ty_int_ref_64 ty) (load _ (little_or_native_endian flags) address offset))) 3059 (x64_mov (to_amode flags address offset))) 3060;; Also, certain scalar loads have a specific `from` width and extension kind 3061;; (signed -> `sx`, zeroed -> `zx`). We overwrite the high bits of the 64-bit 3062;; GPR even if the `to` type is smaller (e.g., 16-bits). 3063(rule (lower (has_type (is_gpr_type ty) (uload8 _ (little_or_native_endian flags) address offset))) 3064 (x64_movzx (ExtMode.BQ) (to_amode flags address offset))) 3065(rule (lower (has_type (is_gpr_type ty) (sload8 _ (little_or_native_endian flags) address offset))) 3066 (x64_movsx (ExtMode.BQ) (to_amode flags address offset))) 3067(rule (lower (has_type (is_gpr_type ty) (uload16 _ (little_or_native_endian flags) address offset))) 3068 (x64_movzx (ExtMode.WQ) (to_amode flags address offset))) 3069(rule (lower (has_type (is_gpr_type ty) (sload16 _ (little_or_native_endian flags) address offset))) 3070 (x64_movsx (ExtMode.WQ) (to_amode flags address offset))) 3071(rule (lower (has_type (is_gpr_type ty) (uload32 _ (little_or_native_endian flags) address offset))) 3072 (x64_movzx (ExtMode.LQ) (to_amode flags address offset))) 3073(rule (lower (has_type (is_gpr_type ty) (sload32 _ (little_or_native_endian flags) address offset))) 3074 (x64_movsx (ExtMode.LQ) (to_amode flags address offset))) 3075 3076;; To load to XMM registers, we use the x64-specific instructions for each type. 3077;; For `$F32` and `$F64` this is important--we only want to load 32 or 64 bits. 3078;; But for the 128-bit types, this is not strictly necessary for performance but 3079;; might help with clarity during disassembly. 3080(rule 4 (lower (has_type (is_xmm_type (ty_16 _)) (load _ (little_or_native_endian flags) address offset))) 3081 (x64_pinsrw (xmm_uninit_value) (to_amode flags address offset) 0)) 3082(rule 3 (lower (has_type (is_xmm_type (ty_32 _)) (load _ (little_or_native_endian flags) address offset))) 3083 (x64_movss_load (to_amode flags address offset))) 3084(rule 2 (lower (has_type (is_xmm_type (ty_64 _)) (load _ (little_or_native_endian flags) address offset))) 3085 (x64_movsd_load (to_amode flags address offset))) 3086(rule 1 (lower (has_type $F32X4 (load _ (little_or_native_endian flags) address offset))) 3087 (x64_movups_load (to_amode flags address offset))) 3088(rule 1 (lower (has_type $F64X2 (load _ (little_or_native_endian flags) address offset))) 3089 (x64_movupd_load (to_amode flags address offset))) 3090(rule 0 (lower (has_type (is_xmm_type (ty_128 _)) (load _ (little_or_native_endian flags) address offset))) 3091 (x64_movdqu_load (to_amode flags address offset))) 3092 3093;; We can load an I128 by doing two 64-bit loads. 3094(rule -3 (lower (has_type $I128 3095 (load _ (little_or_native_endian flags) address offset))) 3096 (let ((addr_lo SyntheticAmode (to_amode flags address offset)) 3097 (addr_hi SyntheticAmode (amode_offset addr_lo flags 8)) 3098 (value_lo Reg (x64_mov addr_lo)) 3099 (value_hi Reg (x64_mov addr_hi))) 3100 (value_regs value_lo value_hi))) 3101 3102;; We also include widening vector loads; these sign- or zero-extend each lane 3103;; to the next wider width (e.g., 16x4 -> 32x4). 3104(rule 1 (lower (has_type $I16X8 (sload8x8 _ (little_or_native_endian flags) address offset))) 3105 (if-let true (has_sse41)) 3106 (x64_pmovsxbw (to_amode flags address offset))) 3107(rule 1 (lower (has_type $I16X8 (uload8x8 _ (little_or_native_endian flags) address offset))) 3108 (if-let true (has_sse41)) 3109 (x64_pmovzxbw (to_amode flags address offset))) 3110(rule 1 (lower (has_type $I32X4 (sload16x4 _ (little_or_native_endian flags) address offset))) 3111 (if-let true (has_sse41)) 3112 (x64_pmovsxwd (to_amode flags address offset))) 3113(rule 1 (lower (has_type $I32X4 (uload16x4 _ (little_or_native_endian flags) address offset))) 3114 (if-let true (has_sse41)) 3115 (x64_pmovzxwd (to_amode flags address offset))) 3116(rule 1 (lower (has_type $I64X2 (sload32x2 _ (little_or_native_endian flags) address offset))) 3117 (if-let true (has_sse41)) 3118 (x64_pmovsxdq (to_amode flags address offset))) 3119(rule 1 (lower (has_type $I64X2 (uload32x2 _ (little_or_native_endian flags) address offset))) 3120 (if-let true (has_sse41)) 3121 (x64_pmovzxdq (to_amode flags address offset))) 3122 3123(rule (lower (has_type $I16X8 (sload8x8 _ (little_or_native_endian flags) address offset))) 3124 (lower_swiden_low $I16X8 (x64_movq_to_xmm (to_amode flags address offset)))) 3125(rule (lower (has_type $I16X8 (uload8x8 _ (little_or_native_endian flags) address offset))) 3126 (lower_uwiden_low $I16X8 (x64_movq_to_xmm (to_amode flags address offset)))) 3127(rule (lower (has_type $I32X4 (sload16x4 _ (little_or_native_endian flags) address offset))) 3128 (lower_swiden_low $I32X4 (x64_movq_to_xmm (to_amode flags address offset)))) 3129(rule (lower (has_type $I32X4 (uload16x4 _ (little_or_native_endian flags) address offset))) 3130 (lower_uwiden_low $I32X4 (x64_movq_to_xmm (to_amode flags address offset)))) 3131(rule (lower (has_type $I64X2 (sload32x2 _ (little_or_native_endian flags) address offset))) 3132 (lower_swiden_low $I64X2 (x64_movq_to_xmm (to_amode flags address offset)))) 3133(rule (lower (has_type $I64X2 (uload32x2 _ (little_or_native_endian flags) address offset))) 3134 (lower_uwiden_low $I64X2 (x64_movq_to_xmm (to_amode flags address offset)))) 3135 3136;; Rules for `store*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3137 3138;; 8-, 16-, 32- and 64-bit GPR stores. 3139(rule store_x64_movrm -2 (lower (store (little_or_native_endian flags) 3140 value @ (value_type (is_gpr_type ty)) 3141 address 3142 offset)) 3143 (side_effect 3144 (x64_movrm ty (to_amode flags address offset) value))) 3145 3146;; Explicit 8/16/32-bit opcodes. 3147(rule (lower (istore8 (little_or_native_endian flags) value address offset)) 3148 (side_effect 3149 (x64_movrm $I8 (to_amode flags address offset) value))) 3150(rule (lower (istore16 (little_or_native_endian flags) value address offset)) 3151 (side_effect 3152 (x64_movrm $I16 (to_amode flags address offset) value))) 3153(rule (lower (istore32 (little_or_native_endian flags) value address offset)) 3154 (side_effect 3155 (x64_movrm $I32 (to_amode flags address offset) value))) 3156 3157;; IMM stores 3158(rule 4 (lower (store (little_or_native_endian flags) value @ (value_type (fits_in_64 ty)) address offset)) 3159 (if-let (i32_from_iconst imm) value) 3160 (side_effect 3161 (x64_movimm_m ty (to_amode flags address offset) imm))) 3162 3163;; F16 stores of values in XMM registers. 3164(rule -2 (lower (store (little_or_native_endian flags) 3165 value @ (value_type (is_xmm_type (ty_16 _))) 3166 address 3167 offset)) 3168 (side_effect 3169 (x64_movrm $I16 (to_amode flags address offset) (bitcast_xmm_to_gpr 16 value)))) 3170 3171(rule -1 (lower (store (little_or_native_endian flags) 3172 value @ (value_type (is_xmm_type (ty_16 _))) 3173 address 3174 offset)) 3175 (if-let true (has_sse41)) 3176 (side_effect 3177 (x64_pextrw_store (to_amode flags address offset) value 0))) 3178 3179;; F32 stores of values in XMM registers. 3180(rule -3 (lower (store (little_or_native_endian flags) 3181 value @ (value_type (is_xmm_type (ty_32 _))) 3182 address 3183 offset)) 3184 (side_effect 3185 (x64_movss_store (to_amode flags address offset) value))) 3186 3187;; F64 stores of values in XMM registers. 3188(rule -4 (lower (store (little_or_native_endian flags) 3189 value @ (value_type (is_xmm_type (ty_64 _))) 3190 address 3191 offset)) 3192 (side_effect 3193 (x64_movsd_store (to_amode flags address offset) value))) 3194 3195;; Stores of F32X4 vectors. 3196(rule 1 (lower (store (little_or_native_endian flags) 3197 value @ (value_type $F32X4) 3198 address 3199 offset)) 3200 (side_effect 3201 (x64_movups_store (to_amode flags address offset) value))) 3202 3203;; Stores of F64X2 vectors. 3204(rule 1 (lower (store (little_or_native_endian flags) 3205 value @ (value_type $F64X2) 3206 address 3207 offset)) 3208 (side_effect 3209 (x64_movupd_store (to_amode flags address offset) value))) 3210 3211;; Stores of all other 128-bit vector types with integer lanes. 3212(rule -5 (lower (store (little_or_native_endian flags) 3213 value @ (value_type (is_xmm_type (ty_128 _))) 3214 address 3215 offset)) 3216 (side_effect 3217 (x64_movdqu_store (to_amode flags address offset) value))) 3218 3219;; Stores of I128 values: store the two 64-bit halves separately. 3220(rule 0 (lower (store (little_or_native_endian flags) 3221 value @ (value_type $I128) 3222 address 3223 offset)) 3224 (let ((value_reg ValueRegs value) 3225 (value_lo Gpr (value_regs_get_gpr value_reg 0)) 3226 (value_hi Gpr (value_regs_get_gpr value_reg 1)) 3227 (addr_lo SyntheticAmode (to_amode flags address offset)) 3228 (addr_hi SyntheticAmode (amode_offset addr_lo flags 8))) 3229 (side_effect 3230 (side_effect_concat 3231 (x64_movrm $I64 addr_lo value_lo) 3232 (x64_movrm $I64 addr_hi value_hi))))) 3233 3234;; Slightly optimize the extraction of the first lane from a vector which is 3235;; stored in memory. In the case the first lane specifically is selected the 3236;; standard `movss` and `movsd` instructions can be used as-if we're storing a 3237;; f32 or f64 despite the source perhaps being an integer vector since the 3238;; result of the instruction is the same. 3239(rule 2 (lower (store (little_or_native_endian flags) 3240 (has_type $F32 (extractlane _ value (u8_from_uimm8 0))) 3241 address 3242 offset)) 3243 (side_effect 3244 (x64_movss_store (to_amode flags address offset) value))) 3245(rule 2 (lower (store (little_or_native_endian flags) 3246 (has_type $F64 (extractlane _ value (u8_from_uimm8 0))) 3247 address 3248 offset)) 3249 (side_effect 3250 (x64_movsd_store (to_amode flags address offset) value))) 3251(rule 2 (lower (store (little_or_native_endian flags) 3252 (has_type $I8 (extractlane _ value (u8_from_uimm8 n))) 3253 address 3254 offset)) 3255 (if-let true (has_sse41)) 3256 (side_effect 3257 (x64_pextrb_store (to_amode flags address offset) value n))) 3258(rule 2 (lower (store (little_or_native_endian flags) 3259 (has_type $I16 (extractlane _ value (u8_from_uimm8 n))) 3260 address 3261 offset)) 3262 (if-let true (has_sse41)) 3263 (side_effect 3264 (x64_pextrw_store (to_amode flags address offset) value n))) 3265(rule 2 (lower (store (little_or_native_endian flags) 3266 (has_type $I32 (extractlane _ value (u8_from_uimm8 n))) 3267 address 3268 offset)) 3269 (if-let true (has_sse41)) 3270 (side_effect 3271 (x64_pextrd_store (to_amode flags address offset) value n))) 3272(rule 2 (lower (store (little_or_native_endian flags) 3273 (has_type $I64 (extractlane _ value (u8_from_uimm8 n))) 3274 address 3275 offset)) 3276 (if-let true (has_sse41)) 3277 (side_effect 3278 (x64_pextrq_store (to_amode flags address offset) value n))) 3279 3280;; Rules for `load*` + ALU op + `store*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3281 3282;; `add mem, {reg,imm}` 3283(rule store_x64_add_mem 3 (lower 3284 (store (little_or_native_endian flags) 3285 (has_type (ty_32_or_64 ty) 3286 (iadd _ (and 3287 (sinkable_load sink) 3288 (load _ flags addr offset)) 3289 src2)) 3290 addr 3291 offset)) 3292 (let ((_ RegMemImm sink)) 3293 (side_effect 3294 (x64_add_mem ty (to_amode flags addr offset) src2)))) 3295 3296;; `add mem, {reg,imm}` with args swapped 3297(rule 2 (lower 3298 (store (little_or_native_endian flags) 3299 (has_type (ty_32_or_64 ty) 3300 (iadd _ src2 3301 (and 3302 (sinkable_load sink) 3303 (load _ flags addr offset)))) 3304 addr 3305 offset)) 3306 (let ((_ RegMemImm sink)) 3307 (side_effect 3308 (x64_add_mem ty (to_amode flags addr offset) src2)))) 3309 3310;; `sub mem, {reg,imm}` 3311(rule 2 (lower 3312 (store (little_or_native_endian flags) 3313 (has_type (ty_32_or_64 ty) 3314 (isub _ (and 3315 (sinkable_load sink) 3316 (load _ flags addr offset)) 3317 src2)) 3318 addr 3319 offset)) 3320 (let ((_ RegMemImm sink)) 3321 (side_effect 3322 (x64_sub_mem ty (to_amode flags addr offset) src2)))) 3323 3324;; `and mem, {reg,imm}` 3325(rule 3 (lower 3326 (store (little_or_native_endian flags) 3327 (has_type (ty_32_or_64 ty) 3328 (band _ (and 3329 (sinkable_load sink) 3330 (load _ flags addr offset)) 3331 src2)) 3332 addr 3333 offset)) 3334 (let ((_ RegMemImm sink)) 3335 (side_effect 3336 (x64_and_mem ty (to_amode flags addr offset) src2)))) 3337 3338;; `and mem, {reg,imm}` with args swapped 3339(rule 2 (lower 3340 (store (little_or_native_endian flags) 3341 (has_type (ty_32_or_64 ty) 3342 (band _ src2 3343 (and 3344 (sinkable_load sink) 3345 (load _ flags addr offset)))) 3346 addr 3347 offset)) 3348 (let ((_ RegMemImm sink)) 3349 (side_effect 3350 (x64_and_mem ty (to_amode flags addr offset) src2)))) 3351 3352;; `or mem, {reg,imm}` 3353(rule 3 (lower 3354 (store (little_or_native_endian flags) 3355 (has_type (ty_32_or_64 ty) 3356 (bor _ (and 3357 (sinkable_load sink) 3358 (load _ flags addr offset)) 3359 src2)) 3360 addr 3361 offset)) 3362 (let ((_ RegMemImm sink)) 3363 (side_effect 3364 (x64_or_mem ty (to_amode flags addr offset) src2)))) 3365 3366;; `or mem, {reg,imm}` with args swapped 3367(rule 2 (lower 3368 (store (little_or_native_endian flags) 3369 (has_type (ty_32_or_64 ty) 3370 (bor _ src2 3371 (and 3372 (sinkable_load sink) 3373 (load _ flags addr offset)))) 3374 addr 3375 offset)) 3376 (let ((_ RegMemImm sink)) 3377 (side_effect 3378 (x64_or_mem ty (to_amode flags addr offset) src2)))) 3379 3380;; Xor mem, reg 3381(rule 3 (lower 3382 (store (little_or_native_endian flags) 3383 (has_type (ty_32_or_64 ty) 3384 (bxor _ (and 3385 (sinkable_load sink) 3386 (load _ flags addr offset)) 3387 src2)) 3388 addr 3389 offset)) 3390 (let ((_ RegMemImm sink)) 3391 (side_effect 3392 (x64_xor_mem ty (to_amode flags addr offset) src2)))) 3393 3394;; Xor mem, reg with args swapped 3395(rule 2 (lower 3396 (store (little_or_native_endian flags) 3397 (has_type (ty_32_or_64 ty) 3398 (bxor _ src2 3399 (and 3400 (sinkable_load sink) 3401 (load _ flags addr offset)))) 3402 addr 3403 offset)) 3404 (let ((_ RegMemImm sink)) 3405 (side_effect 3406 (x64_xor_mem ty (to_amode flags addr offset) src2)))) 3407 3408;; Rules for `fence` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3409 3410(rule (lower (fence)) 3411 (side_effect (x64_mfence_zo))) 3412 3413;; Rules for `func_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3414 3415(rule (lower (func_addr _ (func_ref_data _ extname dist _))) 3416 (load_ext_name extname 0 dist)) 3417 3418;; Rules for `symbol_value` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3419 3420(rule (lower (symbol_value _ (symbol_value_data extname dist offset))) 3421 (load_ext_name extname offset dist)) 3422 3423;; Rules for `atomic_load` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3424 3425;; This is a normal load. The x86-TSO memory model provides sufficient 3426;; sequencing to satisfy the CLIF synchronisation requirements for `AtomicLoad` 3427;; without the need for any fence instructions. 3428;; 3429;; This lowering is only valid for I8, I16, I32, and I64. The sub-64-bit types 3430;; are zero extended, as with a normal load. 3431(rule 1 (lower (has_type $I64 (atomic_load _ (little_or_native_endian flags) address))) 3432 (x64_mov (to_amode flags address (zero_offset)))) 3433(rule (lower (has_type (and (fits_in_32 ty) (ty_int _)) (atomic_load _ (little_or_native_endian flags) address))) 3434 (x64_movzx (ext_mode (ty_bits_u16 ty) 64) (to_amode flags address (zero_offset)))) 3435;; Lower 128-bit `atomic_load` using `cmpxchg16b`. 3436(rule 1 (lower (has_type $I128 (atomic_load _ (little_or_native_endian flags) address))) 3437 (if-let true (has_cmpxchg16b)) 3438 (x64_cmpxchg16b (value_regs (imm $I64 0) (imm $I64 0)) (value_regs (imm $I64 0) (imm $I64 0)) (to_amode flags address (zero_offset)))) 3439 3440;; Rules for `atomic_store` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3441 3442;; This is a normal store followed by an `mfence` instruction. This lowering is 3443;; only valid for I8, I16, I32, and I64. 3444(rule (lower (atomic_store (little_or_native_endian flags) 3445 value @ (value_type (and (fits_in_64 ty) (ty_int _))) 3446 address)) 3447 (side_effect (side_effect_concat 3448 (x64_movrm ty (to_amode flags address (zero_offset)) value) 3449 (x64_mfence_zo)))) 3450;; Lower 128-bit `atomic_store` using `cmpxchg16b`. 3451(rule 1 (lower (atomic_store (little_or_native_endian flags) value @ (value_type $I128) address)) 3452 (if-let true (has_cmpxchg16b)) 3453 (side_effect (x64_atomic_128_store_seq (to_amode flags address (zero_offset)) flags value))) 3454 3455;; Rules for `atomic_cas` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3456 3457(rule (lower (has_type (and (fits_in_64 ty) (ty_int _)) 3458 (atomic_cas _ (little_or_native_endian flags) address expected replacement))) 3459 (x64_cmpxchg ty expected replacement (to_amode flags address (zero_offset)))) 3460(rule 1 (lower (has_type $I128 (atomic_cas _ flags address expected replacement))) 3461 (if-let true (has_cmpxchg16b)) 3462 (x64_cmpxchg16b expected replacement (to_amode flags address (zero_offset)))) 3463 3464;; Rules for `atomic_rmw` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3465 3466;; This is a simple, general-case atomic update, based on a loop involving 3467;; `cmpxchg`. 3468(rule (lower (has_type (and (fits_in_64 ty) (ty_int _)) 3469 (atomic_rmw _ (little_or_native_endian flags) op address input))) 3470 (x64_atomic_rmw_seq ty (atomic_rmw_seq_op op) (to_amode flags address (zero_offset)) input)) 3471 3472;; `Add` and `Sub` can use `lock xadd` 3473(rule 1 (lower (has_type (and (fits_in_64 ty) (ty_int _)) 3474 (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.Add) address input))) 3475 (x64_xadd ty (to_amode flags address (zero_offset)) input)) 3476(rule 1 (lower (has_type (and (fits_in_64 ty) (ty_int _)) 3477 (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.Sub) address input))) 3478 (x64_xadd ty (to_amode flags address (zero_offset)) (x64_neg ty input))) 3479;; `Xchg` can use `xchg` 3480(rule 1 (lower (has_type (and (fits_in_64 ty) (ty_int _)) 3481 (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.Xchg) address input))) 3482 (x64_xchg ty (to_amode flags address (zero_offset)) input)) 3483 3484;; `Add`, `Sub`, `And`, `Or` and `Xor` can use `lock`-prefixed instructions if 3485;; the old value is not required. 3486(rule 2 (lower i @ (has_type (fits_in_64 (ty_int ty)) 3487 (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.Add) address input))) 3488 (if-let (first_result res) i) 3489 (if-let true (value_is_unused res)) 3490 (side_effect_as_invalid (x64_lock_add (raw_operand_size_of_type ty) (to_amode flags address (zero_offset)) input))) 3491(rule 2 (lower i @ (has_type (fits_in_64 (ty_int ty)) 3492 (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.Sub) address input))) 3493 (if-let (first_result res) i) 3494 (if-let true (value_is_unused res)) 3495 (side_effect_as_invalid (x64_lock_sub (raw_operand_size_of_type ty) (to_amode flags address (zero_offset)) input))) 3496(rule 2 (lower i @ (has_type (fits_in_64 (ty_int ty)) 3497 (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.And) address input))) 3498 (if-let (first_result res) i) 3499 (if-let true (value_is_unused res)) 3500 (side_effect_as_invalid (x64_lock_and (raw_operand_size_of_type ty) (to_amode flags address (zero_offset)) input))) 3501(rule 2 (lower i @ (has_type (fits_in_64 (ty_int ty)) 3502 (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.Or) address input))) 3503 (if-let (first_result res) i) 3504 (if-let true (value_is_unused res)) 3505 (side_effect_as_invalid (x64_lock_or (raw_operand_size_of_type ty) (to_amode flags address (zero_offset)) input))) 3506(rule 2 (lower i @ (has_type (fits_in_64 (ty_int ty)) 3507 (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.Xor) address input))) 3508 (if-let (first_result res) i) 3509 (if-let true (value_is_unused res)) 3510 (side_effect_as_invalid (x64_lock_xor (raw_operand_size_of_type ty) (to_amode flags address (zero_offset)) input))) 3511 3512;; 128-bit integers always use a `lock cmpxchg16b` loop. 3513(rule 3 (lower (has_type $I128 (atomic_rmw _ (little_or_native_endian flags) op address input))) 3514 (if-let true (has_cmpxchg16b)) 3515 (x64_atomic_128_rmw_seq op (to_amode flags address (zero_offset)) flags input)) 3516 3517;; Rules for `call` and `call_indirect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3518 3519;; Direct call to an in-range function. 3520(rule 1 (lower (call (func_ref_data sig_ref name (RelocDistance.Near) patchable) args)) 3521 (let ((output ValueRegsVec (gen_call_output sig_ref)) 3522 (abi Sig (abi_sig sig_ref)) 3523 (uses CallArgList (gen_call_args abi args)) 3524 (defs CallRetList (gen_call_rets abi output)) 3525 (info BoxCallInfo (gen_call_info abi name uses defs (try_call_none) patchable)) 3526 (_ Unit (emit_side_effect (call_known info)))) 3527 output)) 3528 3529;; Direct call to an out-of-range function (implicitly via pointer). 3530(rule (lower (call (func_ref_data sig_ref name dist false) args)) 3531 (let ((output ValueRegsVec (gen_call_output sig_ref)) 3532 (abi Sig (abi_sig sig_ref)) 3533 (uses CallArgList (gen_call_args abi args)) 3534 (defs CallRetList (gen_call_rets abi output)) 3535 (target RegMem (RegMem.Reg (load_ext_name name 0 dist))) 3536 (info BoxCallIndInfo (gen_call_ind_info abi target uses defs (try_call_none))) 3537 (_ Unit (emit_side_effect (call_unknown info)))) 3538 output)) 3539 3540;; Indirect call. 3541(rule (lower (call_indirect sig_ref ptr args)) 3542 (let ((output ValueRegsVec (gen_call_output sig_ref)) 3543 (abi Sig (abi_sig sig_ref)) 3544 (target RegMem (RegMem.Reg (put_in_reg ptr))) 3545 (uses CallArgList (gen_call_args abi args)) 3546 (defs CallRetList (gen_call_rets abi output)) 3547 (info BoxCallIndInfo (gen_call_ind_info abi target uses defs (try_call_none))) 3548 (_ Unit (emit_side_effect (call_unknown info)))) 3549 output)) 3550 3551;;;; Rules for `return_call` and `return_call_indirect` ;;;;;;;;;;;;;;;;;;;;;;;; 3552 3553;; Direct call to an in-range function. 3554(rule 1 (lower (return_call (func_ref_data sig_ref name (RelocDistance.Near) false) args)) 3555 (let ((abi Sig (abi_sig sig_ref)) 3556 (uses CallArgList (gen_return_call_args abi args)) 3557 (info BoxReturnCallInfo (gen_return_call_info abi name uses))) 3558 (side_effect (return_call_known info)))) 3559 3560;; Direct call to an out-of-range function (implicitly via pointer). 3561(rule (lower (return_call (func_ref_data sig_ref name dist false) args)) 3562 (let ((abi Sig (abi_sig sig_ref)) 3563 (uses CallArgList (gen_return_call_args abi args)) 3564 (target Reg (load_ext_name name 0 dist)) 3565 (info BoxReturnCallIndInfo (gen_return_call_ind_info abi target uses))) 3566 (side_effect (return_call_unknown info)))) 3567 3568;; Indirect call. 3569(rule (lower (return_call_indirect sig_ref ptr args)) 3570 (let ((abi Sig (abi_sig sig_ref)) 3571 (target Reg (put_in_reg ptr)) 3572 (uses CallArgList (gen_return_call_args abi args)) 3573 (info BoxReturnCallIndInfo (gen_return_call_ind_info abi target uses))) 3574 (side_effect (return_call_unknown info)))) 3575 3576;;;; Rules for `try_call` and `try_call_indirect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3577 3578;; Direct call to an in-range function. 3579(rule 1 (lower_branch (try_call (func_ref_data sig_ref name (RelocDistance.Near) patchable) args et) targets) 3580 (let ((abi Sig (abi_sig sig_ref)) 3581 (trycall OptionTryCallInfo (try_call_info et targets)) 3582 (uses CallArgList (gen_call_args abi args)) 3583 (defs CallRetList (gen_try_call_rets abi)) 3584 (info BoxCallInfo (gen_call_info abi name uses defs trycall patchable))) 3585 (emit_side_effect (call_known info)))) 3586 3587;; Direct call to an out-of-range function (implicitly via pointer). 3588(rule (lower_branch (try_call (func_ref_data sig_ref name dist false) args et) targets) 3589 (let ((abi Sig (abi_sig sig_ref)) 3590 (trycall OptionTryCallInfo (try_call_info et targets)) 3591 (uses CallArgList (gen_call_args abi args)) 3592 (defs CallRetList (gen_try_call_rets abi)) 3593 (target RegMem (RegMem.Reg (load_ext_name name 0 dist))) 3594 (info BoxCallIndInfo (gen_call_ind_info abi target uses defs trycall))) 3595 (emit_side_effect (call_unknown info)))) 3596 3597;; Indirect call. 3598(rule (lower_branch (try_call_indirect ptr args et) targets) 3599 (if-let (exception_sig sig_ref) et) 3600 (let ((abi Sig (abi_sig sig_ref)) 3601 (trycall OptionTryCallInfo (try_call_info et targets)) 3602 (target RegMem (RegMem.Reg (put_in_reg ptr))) 3603 (uses CallArgList (gen_call_args abi args)) 3604 (defs CallRetList (gen_try_call_rets abi)) 3605 (info BoxCallIndInfo (gen_call_ind_info abi target uses defs trycall))) 3606 (emit_side_effect (call_unknown info)))) 3607 3608;; Rules for `stack_switch` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3609 3610;; currently, only the Basic model is supported 3611(rule (lower (stack_switch _ store_context_ptr load_context_ptr in_payload0)) 3612 (if-let (StackSwitchModel.Basic) (stack_switch_model)) 3613 (let ((store_context_ptr Gpr (put_in_gpr store_context_ptr)) 3614 (load_context_ptr Gpr (put_in_gpr load_context_ptr)) 3615 (in_payload0 Gpr (put_in_gpr in_payload0))) 3616 (x64_stack_switch_basic store_context_ptr load_context_ptr in_payload0))) 3617 3618;;;; Rules for `get_{frame,stack}_pointer` and `get_return_address` ;;;;;;;;;;;; 3619 3620(rule (lower (get_frame_pointer _)) 3621 (x64_rbp)) 3622 3623(rule (lower (get_stack_pointer _)) 3624 (x64_rsp)) 3625 3626(rule (lower (get_return_address _)) 3627 (x64_movq_rm (Amode.ImmReg 8 (x64_rbp) (mem_flags_trusted)))) 3628 3629;; Rules for `jump` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3630 3631(rule (lower_branch (jump _) (single_target target)) 3632 (emit_side_effect (jmp_known target))) 3633 3634;; Rules for `brif` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3635 3636(rule (lower_branch (brif val _ _) (two_targets then else)) 3637 (emit_side_effect (jmp_cond_result (is_nonzero_cmp val) then else))) 3638 3639;; Rules for `br_table` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3640 3641(rule (lower_branch (br_table idx @ (value_type ty) _) (jump_table_targets default_target jt_targets)) 3642 (let ((jt_size u32 (jump_table_size jt_targets)) 3643 (size_reg Reg (imm ty jt_size)) 3644 (idx_reg Gpr (extend_to_gpr idx $I64 (ExtendKind.Zero))) 3645 (clamped_idx Reg (with_flags_reg 3646 (x64_cmp ty idx_reg size_reg) 3647 (cmove ty (CC.B) idx_reg size_reg)))) 3648 (emit_side_effect (jmp_table_seq ty clamped_idx default_target jt_targets)))) 3649 3650;; Rules for `select_spectre_guard` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3651 3652(rule (lower (select_spectre_guard _ cond x y)) 3653 (lower_select (is_nonzero_cmp cond) x y)) 3654 3655;; Note that for GPR-based spectre guards everything is forced into a register 3656;; not `GprMem`. The `lower_select_spectre_gpr` helper below handles "and" 3657;; conditions which the `lower_select_gpr` helper does not. 3658(rule 1 (lower (has_type (is_single_register_gpr_type ty) (select_spectre_guard _ cond x y))) 3659 (lower_select_spectre_gpr ty (is_nonzero_cmp cond) (put_in_gpr x) y)) 3660 3661(decl lower_select_spectre_gpr (Type CondResult Gpr Gpr) Gpr) 3662(rule 0 (lower_select_spectre_gpr ty cond a b) (lower_select_gpr ty cond a b)) 3663(rule 1 (lower_select_spectre_gpr ty cond @ (CondResult.And _ _ _) a b) 3664 (lower_select_gpr ty (cond_invert cond) b a)) 3665 3666;; Rules for `fcvt_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3667 3668;; Note that the `cvtsi2s{s,d}` instruction is not just an int-to-float 3669;; conversion instruction in isolation, it also takes the upper 64-bits of an 3670;; xmm register and places it into the destination. We don't actually want that 3671;; to happen as it could accidentally create a false dependency with a 3672;; previous instruction defining the register's upper 64-bits. See #7085 for 3673;; an instance of this. 3674;; 3675;; This means that the first operand to all of the int-to-float conversions here 3676;; are `(xmm_zero)` operands which is a guaranteed zero register that has no 3677;; dependencies on other instructions. 3678;; 3679;; Ideally this would be lifted out to a higher level to get deduplicated 3680;; between consecutive int-to-float operations but that's not easy 3681;; to do at this time. One possibility would be a mid-end rule which rewrites 3682;; `fcvt_from_sint` to an x86-specific opcode using a zero constant which would 3683;; be subject to normal LICM, but that's not feasible today. 3684 3685(rule 2 (lower (has_type $F32 (fcvt_from_sint _ a @ (value_type $I8)))) 3686 (x64_cvtsi2ss $I32 (xmm_zero $F32X4) (extend_to_gpr a $I32 (ExtendKind.Sign)))) 3687 3688(rule 2 (lower (has_type $F32 (fcvt_from_sint _ a @ (value_type $I16)))) 3689 (x64_cvtsi2ss $I32 (xmm_zero $F32X4) (extend_to_gpr a $I32 (ExtendKind.Sign)))) 3690 3691(rule 1 (lower (has_type $F32 (fcvt_from_sint _ a @ (value_type (ty_int (fits_in_64 ty)))))) 3692 (x64_cvtsi2ss ty (xmm_zero $F32X4) a)) 3693 3694(rule 2 (lower (has_type $F64 (fcvt_from_sint _ a @ (value_type $I8)))) 3695 (x64_cvtsi2sd $I32 (xmm_zero $F64X2) (extend_to_gpr a $I32 (ExtendKind.Sign)))) 3696 3697(rule 2 (lower (has_type $F64 (fcvt_from_sint _ a @ (value_type $I16)))) 3698 (x64_cvtsi2sd $I32 (xmm_zero $F64X2) (extend_to_gpr a $I32 (ExtendKind.Sign)))) 3699 3700(rule 1 (lower (has_type $F64 (fcvt_from_sint _ a @ (value_type (ty_int (fits_in_64 ty)))))) 3701 (x64_cvtsi2sd ty (xmm_zero $F64X2) a)) 3702 3703(rule 0 (lower (fcvt_from_sint _ a @ (value_type $I32X4))) 3704 (x64_cvtdq2ps a)) 3705 3706;; Base case: decompose the i64x2 input into two scalar registers and convert 3707;; each of those into a float. Afterwards re-pack the two results into the final 3708;; destination. 3709(rule 0 (lower (fcvt_from_sint _ a @ (value_type $I64X2))) 3710 (let ( 3711 (a Xmm a) 3712 (zero Xmm (xmm_zero $F64X2)) 3713 (f0 Xmm (x64_cvtsi2sd $I64 zero (x64_movq_to_gpr a))) 3714 (f1 Xmm (x64_cvtsi2sd $I64 zero (x64_movq_to_gpr (x64_pshufd a 0b11_10_11_10)))) 3715 ) 3716 (x64_unpcklpd f0 f1))) 3717 3718(rule 1 (lower (has_type $F64X2 (fcvt_from_sint _ (swiden_low _ a @ (value_type $I32X4))))) 3719 (x64_cvtdq2pd a)) 3720 3721;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3722 3723(rule 1 (lower (has_type $F32 (fcvt_from_uint _ val @ (value_type (fits_in_32 (ty_int ty)))))) 3724 (x64_cvtsi2ss $I64 (xmm_zero $F32X4) (extend_to_gpr val $I64 (ExtendKind.Zero)))) 3725 3726(rule 1 (lower (has_type $F64 (fcvt_from_uint _ val @ (value_type (fits_in_32 (ty_int ty)))))) 3727 (x64_cvtsi2sd $I64 (xmm_zero $F64X2) (extend_to_gpr val $I64 (ExtendKind.Zero)))) 3728 3729(rule (lower (has_type ty (fcvt_from_uint _ val @ (value_type $I64)))) 3730 (cvt_u64_to_float_seq ty val)) 3731 3732;; Base case of u64x2 being converted to f64x2. No native instruction for this 3733;; is available so it's emulated through a series of instructions that exploit 3734;; the binary representation of 64-bit floats. This sequence of instructions is 3735;; copied from LLVM and my understanding of the general idea is to roughly: 3736;; 3737;; * For each bullet below operate in parallel on the left and right lanes. 3738;; * Move the low 32 bits of the input into one register and the upper 3739;; 32-bits into a different register, where both have all 0s for the upper 3740;; 32-bits. (e.g. split the 64-bit input into two locations) 3741;; * For the low bits, create `1.<twenty-zeros><low32>p52` via bit tricks. 3742;; * For the high bits, create `1.<twenty-zeros><high32>p84` via bit tricks. 3743;; * Create the constant `1.0p84 + 1.0p52` 3744;; * Add the two high halves and subtract the constant. 3745;; 3746;; Apply some math and this should produce the same result as the native 3747;; conversion. 3748;; 3749;; As for the bit tricks a float is represented where the low 53 bits are the 3750;; decimal of the float, basically: 3751;; 3752;; f = 1.<fraction> ^ (<exponent> - 1023) 3753;; 3754;; where `<fraction>` is the low 53 bits. By placing the 32-bit halves from 3755;; the original integer into the low 53 bits and setting the exponent right it 3756;; means that each 32-bit half can become part of a 64-bit floating point 3757;; number. The final step in combining via float arithmetic will chop off the 3758;; leading `1.` at the start of the float that we constructed, one for the low 3759;; half and one for the upper half. 3760(rule -1 (lower (has_type $F64X2 (fcvt_from_uint _ val @ (value_type $I64X2)))) 3761 (let ((low32_mask XmmMem (emit_u128_le_const 0x00000000ffffffff_00000000ffffffff)) 3762 (float_1p52 XmmMem (emit_u128_le_const 0x4330000000000000_4330000000000000)) 3763 (float_1p84 XmmMem (emit_u128_le_const 0x4530000000000000_4530000000000000)) 3764 (float_1p84_plus_1p52 XmmMem (emit_u128_le_const 0x4530000000100000_4530000000100000)) 3765 (low32 Xmm (x64_pand val low32_mask)) 3766 (low32_as_float Xmm (x64_por low32 float_1p52)) 3767 (high32 Xmm (x64_psrlq val (xmi_imm 32))) 3768 (high32_as_float Xmm (x64_por high32 float_1p84))) 3769 (x64_addpd low32_as_float (x64_subpd high32_as_float float_1p84_plus_1p52)))) 3770 3771;; Algorithm uses unpcklps to help create a float that is equivalent 3772;; 0x1.0p52 + double(src). 0x1.0p52 is unique because at this exponent 3773;; every value of the mantissa represents a corresponding uint32 number. 3774;; When we subtract 0x1.0p52 we are left with double(src). 3775(rule 1 (lower (has_type $F64X2 (fcvt_from_uint _ (uwiden_low _ val @ (value_type $I32X4))))) 3776 (let ((uint_mask XmmMem (emit_u128_le_const 0x43300000_43300000)) 3777 (res Xmm (x64_unpcklps val uint_mask)) 3778 (uint_mask_high XmmMem (emit_u128_le_const 0x4330000000000000_4330000000000000))) 3779 (x64_subpd res uint_mask_high))) 3780 3781;; When AVX512VL and AVX512F are available, 3782;; `fcvt_from_uint` can be lowered to a single instruction. 3783(rule 2 (lower (has_type $F32X4 (fcvt_from_uint _ src))) 3784 (if-let true (has_avx512vl)) 3785 (if-let true (has_avx512f)) 3786 (x64_vcvtudq2ps src)) 3787 3788;; Converting packed unsigned integers to packed floats 3789;; requires a few steps. There is no single instruction 3790;; lowering for converting unsigned floats but there is for 3791;; converting packed signed integers to float (cvtdq2ps). In 3792;; the steps below we isolate the upper half (16 bits) and 3793;; lower half (16 bits) of each lane and then we convert 3794;; each half separately using cvtdq2ps meant for signed 3795;; integers. In order for this to work for the upper half 3796;; bits we must shift right by 1 (divide by 2) these bits in 3797;; order to ensure the most significant bit is 0 not signed, 3798;; and then after the conversion we double the value. 3799;; Finally we add the converted values where addition will 3800;; correctly round. 3801;; 3802;; Sequence: 3803;; -> A = 0xffffffff 3804;; -> Ah = 0xffff0000 3805;; -> Al = 0x0000ffff 3806;; -> Convert(Al) // Convert int to float 3807;; -> Ah = Ah >> 1 // Shift right 1 to assure Ah conversion isn't treated as signed 3808;; -> Convert(Ah) // Convert .. with no loss of significant digits from previous shift 3809;; -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion. 3810;; -> dst = Ah + Al // Add the two floats together 3811(rule 1 (lower (has_type $F32X4 (fcvt_from_uint _ val))) 3812 (let ((a Xmm val) 3813 3814 ;; get the low 16 bits 3815 (a_lo Xmm (x64_pslld a (xmi_imm 16))) 3816 (a_lo Xmm (x64_psrld a_lo (xmi_imm 16))) 3817 3818 ;; get the high 16 bits 3819 (a_hi Xmm (x64_psubd a a_lo)) 3820 3821 ;; convert the low 16 bits 3822 (a_lo Xmm (x64_cvtdq2ps a_lo)) 3823 3824 ;; shift the high bits by 1, convert, and double to get the correct 3825 ;; value 3826 (a_hi Xmm (x64_psrld a_hi (xmi_imm 1))) 3827 (a_hi Xmm (x64_cvtdq2ps a_hi)) 3828 (a_hi Xmm (x64_addps a_hi a_hi))) 3829 3830 ;; add together the two converted values 3831 (x64_addps a_hi a_lo))) 3832 3833;; Rules for `fcvt_to_uint` and `fcvt_to_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3834 3835(rule (lower (has_type out_ty (fcvt_to_uint _ val @ (value_type (ty_scalar_float _))))) 3836 (cvt_float_to_uint_seq out_ty val false)) 3837 3838(rule (lower (has_type out_ty (fcvt_to_uint_sat _ val @ (value_type (ty_scalar_float _))))) 3839 (cvt_float_to_uint_seq out_ty val true)) 3840 3841(rule (lower (has_type out_ty (fcvt_to_sint _ val @ (value_type (ty_scalar_float _))))) 3842 (cvt_float_to_sint_seq out_ty val false)) 3843 3844(rule (lower (has_type out_ty (fcvt_to_sint_sat _ val @ (value_type (ty_scalar_float _))))) 3845 (cvt_float_to_sint_seq out_ty val true)) 3846 3847;; The x64 backend currently only supports these two type combinations. 3848(rule 1 (lower (has_type $I32X4 (fcvt_to_sint_sat _ val @ (value_type $F32X4)))) 3849 (let ((src Xmm val) 3850 3851 ;; Sets tmp to zero if float is NaN 3852 (tmp Xmm (x64_cmpps src src (FcmpImm.Equal))) 3853 (dst Xmm (x64_andps src tmp)) 3854 3855 ;; Sets top bit of tmp if float is positive 3856 ;; Setting up to set top bit on negative float values 3857 (tmp Xmm (x64_pxor tmp dst)) 3858 3859 ;; Convert the packed float to packed doubleword. 3860 (dst Xmm (x64_cvttps2dq dst)) 3861 3862 ;; Set top bit only if < 0 3863 (tmp Xmm (x64_pand dst tmp)) 3864 (tmp Xmm (x64_psrad tmp (xmi_imm 31)))) 3865 3866 ;; On overflow 0x80000000 is returned to a lane. 3867 ;; Below sets positive overflow lanes to 0x7FFFFFFF 3868 ;; Keeps negative overflow lanes as is. 3869 (x64_pxor tmp dst))) 3870 3871;; The algorithm for converting floats to unsigned ints is a little tricky. The 3872;; complication arises because we are converting from a signed 64-bit int with a positive 3873;; integer range from 1..INT_MAX (0x1..0x7FFFFFFF) to an unsigned integer with an extended 3874;; range from (INT_MAX+1)..UINT_MAX. It's this range from (INT_MAX+1)..UINT_MAX 3875;; (0x80000000..0xFFFFFFFF) that needs to be accounted for as a special case since our 3876;; conversion instruction (cvttps2dq) only converts as high as INT_MAX (0x7FFFFFFF), but 3877;; which conveniently setting underflows and overflows (smaller than MIN_INT or larger than 3878;; MAX_INT) to be INT_MAX+1 (0x80000000). Nothing that the range (INT_MAX+1)..UINT_MAX includes 3879;; precisely INT_MAX values we can correctly account for and convert every value in this range 3880;; if we simply subtract INT_MAX+1 before doing the cvttps2dq conversion. After the subtraction 3881;; every value originally (INT_MAX+1)..UINT_MAX is now the range (0..INT_MAX). 3882;; After the conversion we add INT_MAX+1 back to this converted value, noting again that 3883;; values we are trying to account for were already set to INT_MAX+1 during the original conversion. 3884;; We simply have to create a mask and make sure we are adding together only the lanes that need 3885;; to be accounted for. Digesting it all the steps then are: 3886;; 3887;; Step 1 - Account for NaN and negative floats by setting these src values to zero. 3888;; Step 2 - Make a copy (tmp1) of the src value since we need to convert twice for 3889;; reasons described above. 3890;; Step 3 - Convert the original src values. This will convert properly all floats up to INT_MAX 3891;; Step 4 - Subtract INT_MAX from the copy set (tmp1). Note, all zero and negative values are those 3892;; values that were originally in the range (0..INT_MAX). This will come in handy during 3893;; step 7 when we zero negative lanes. 3894;; Step 5 - Create a bit mask for tmp1 that will correspond to all lanes originally less than 3895;; UINT_MAX that are now less than INT_MAX thanks to the subtraction. 3896;; Step 6 - Convert the second set of values (tmp1) 3897;; Step 7 - Prep the converted second set by zeroing out negative lanes (these have already been 3898;; converted correctly with the first set) and by setting overflow lanes to 0x7FFFFFFF 3899;; as this will allow us to properly saturate overflow lanes when adding to 0x80000000 3900;; Step 8 - Add the original converted src and the converted tmp1 where float values originally less 3901;; than and equal to INT_MAX will be unchanged, float values originally between INT_MAX+1 and 3902;; UINT_MAX will add together (INT_MAX) + (SRC - INT_MAX), and float values originally 3903;; greater than UINT_MAX will be saturated to UINT_MAX (0xFFFFFFFF) after adding (0x8000000 + 0x7FFFFFFF). 3904;; 3905;; 3906;; The table below illustrates the result after each step where it matters for the converted set. 3907;; Note the original value range (original src set) is the final dst in Step 8: 3908;; 3909;; Original src set: 3910;; | Original Value Range | Step 1 | Step 3 | Step 8 | 3911;; | -FLT_MIN..FLT_MAX | 0.0..FLT_MAX | 0..INT_MAX(w/overflow) | 0..UINT_MAX(w/saturation) | 3912;; 3913;; Copied src set (tmp1): 3914;; | Step 2 | Step 4 | 3915;; | 0.0..FLT_MAX | (0.0-(INT_MAX+1))..(FLT_MAX-(INT_MAX+1)) | 3916;; 3917;; | Step 6 | Step 7 | 3918;; | (0-(INT_MAX+1))..(UINT_MAX-(INT_MAX+1))(w/overflow) | ((INT_MAX+1)-(INT_MAX+1))..(INT_MAX+1) | 3919(rule 1 (lower (has_type $I32X4 (fcvt_to_uint_sat _ val @ (value_type $F32X4)))) 3920 (let ((src Xmm val) 3921 3922 ;; Converting to unsigned int so if float src is negative or NaN 3923 ;; will first set to zero. 3924 (tmp2 Xmm (xmm_zero $F32X4)) 3925 (dst Xmm (x64_maxps src tmp2)) 3926 3927 ;; Set tmp2 to INT_MAX+1. It is important to note here that after it looks 3928 ;; like we are only converting INT_MAX (0x7FFFFFFF) but in fact because 3929 ;; single precision IEEE-754 floats can only accurately represent contiguous 3930 ;; integers up to 2^23 and outside of this range it rounds to the closest 3931 ;; integer that it can represent. In the case of INT_MAX, this value gets 3932 ;; represented as 0x4f000000 which is the integer value (INT_MAX+1). 3933 (tmp2 Xmm (x64_pcmpeqd tmp2 tmp2)) 3934 (tmp2 Xmm (x64_psrld tmp2 (xmi_imm 1))) 3935 (tmp2 Xmm (x64_cvtdq2ps tmp2)) 3936 3937 ;; Make a copy of these lanes and then do the first conversion. 3938 ;; Overflow lanes greater than the maximum allowed signed value will 3939 ;; set to 0x80000000. Negative and NaN lanes will be 0x0 3940 (tmp1 Xmm dst) 3941 (dst Xmm (x64_cvttps2dq dst)) 3942 3943 ;; Set lanes to src - max_signed_int 3944 (tmp1 Xmm (x64_subps tmp1 tmp2)) 3945 3946 ;; Create mask for all positive lanes to saturate (i.e. greater than 3947 ;; or equal to the maximum allowable unsigned int). 3948 (tmp2 Xmm (x64_cmpps tmp2 tmp1 (FcmpImm.LessThanOrEqual))) 3949 3950 ;; Convert those set of lanes that have the max_signed_int factored out. 3951 (tmp1 Xmm (x64_cvttps2dq tmp1)) 3952 3953 ;; Prepare converted lanes by zeroing negative lanes and prepping lanes 3954 ;; that have positive overflow (based on the mask) by setting these lanes 3955 ;; to 0x7FFFFFFF 3956 (tmp1 Xmm (x64_pxor tmp1 tmp2)) 3957 (tmp2 Xmm (xmm_zero $I32X4)) 3958 (tmp1 Xmm (lower_vec_smax $I32X4 tmp1 tmp2))) 3959 3960 ;; Add this second set of converted lanes to the original to properly handle 3961 ;; values greater than max signed int. 3962 (x64_paddd tmp1 dst))) 3963 3964;; Rules for `x86_cvtt2dq` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3965 3966(rule (lower (has_type $I32X4 (x86_cvtt2dq _ val @ (value_type $F32X4)))) 3967 (x64_cvttps2dq val)) 3968 3969;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3970 3971(rule (lower (has_type $I8X16 (iadd_pairwise _ x y))) 3972 (let ( 3973 ;; Shuffle all the even lanes of `x` and `y` into one register 3974 (even_lane_mask Xmm (x64_movdqu_load (emit_u128_le_const 0x00ff_00ff_00ff_00ff_00ff_00ff_00ff_00ff))) 3975 (x_evens Xmm (x64_pand x even_lane_mask)) 3976 (y_evens Xmm (x64_pand y even_lane_mask)) 3977 (evens Xmm (x64_packuswb x_evens y_evens)) 3978 3979 ;; Shuffle all the odd lanes of `x` and `y` into one register 3980 (x_odds Xmm (x64_psrlw x (xmi_imm 8))) 3981 (y_odds Xmm (x64_psrlw y (xmi_imm 8))) 3982 (odds Xmm (x64_packuswb x_odds y_odds)) 3983 ) 3984 (x64_paddb evens odds))) 3985 3986 3987(rule 1 (lower (has_type $I16X8 (iadd_pairwise _ x y))) 3988 (if-let true (has_ssse3)) 3989 (x64_phaddw x y)) 3990 3991(rule (lower (has_type $I16X8 (iadd_pairwise _ x y))) 3992 (let ( 3993 (x Xmm x) 3994 (y Xmm y) 3995 3996 ;; Shuffle the even-numbered 16-bit lanes into low four lanes of each 3997 ;; vector by shuffling 16-bit lanes then shuffling 32-bit lanes. 3998 ;; With these in place generate a new vector from the two low 64-bits 3999 ;; of each vector (the low four 16-bit lanes). 4000 ;; 4001 ;; 0xe8 == 0b11_10_10_00 4002 (x_evens Xmm (x64_pshufd (x64_pshufhw (x64_pshuflw x 0xe8) 0xe8) 0xe8)) 4003 (y_evens Xmm (x64_pshufd (x64_pshufhw (x64_pshuflw y 0xe8) 0xe8) 0xe8)) 4004 (evens Xmm (x64_punpcklqdq x_evens y_evens)) 4005 4006 ;; Shuffle the odd-numbered 16-bit lanes into the low 8 lanes by 4007 ;; performing `sshr` operation on 32-bit lanes, effectively moving the 4008 ;; odd lanes into even lanes while leaving their sign bits in the 4009 ;; odd lanes. The `packssdw` instruction then conveniently will 4010 ;; put everything into one vector for us. 4011 (x_shifted Xmm (x64_psrad x (xmi_imm 16))) 4012 (y_shifted Xmm (x64_psrad y (xmi_imm 16))) 4013 (odds Xmm (x64_packssdw x_shifted y_shifted)) 4014 ) 4015 (x64_paddw evens odds))) 4016 4017(rule 1 (lower (has_type $I32X4 (iadd_pairwise _ x y))) 4018 (if-let true (has_ssse3)) 4019 (x64_phaddd x y)) 4020 4021(rule (lower (has_type $I32X4 (iadd_pairwise _ x y))) 4022 (let ( 4023 (x Xmm x) 4024 (y Xmm y) 4025 ;; evens = [ x[0] x[2] y[0] y[2] ] 4026 (evens Xmm (x64_shufps x y 0b10_00_10_00)) 4027 ;; odds = [ x[1] x[3] y[1] y[3] ] 4028 (odds Xmm (x64_shufps x y 0b11_01_11_01)) 4029 ) 4030 (x64_paddd evens odds))) 4031 4032;; special case for the `i16x8.extadd_pairwise_i8x16_s` wasm instruction 4033(rule 2 (lower 4034 (has_type $I16X8 (iadd_pairwise _ 4035 (swiden_low _ val @ (value_type $I8X16)) 4036 (swiden_high _ val)))) 4037 (if-let true (has_ssse3)) 4038 (let ((mul_const Xmm (x64_xmm_load_const $I8X16 4039 (emit_u128_le_const 0x01010101010101010101010101010101)))) 4040 (x64_pmaddubsw mul_const val))) 4041 4042;; special case for the `i32x4.extadd_pairwise_i16x8_s` wasm instruction 4043(rule 2 (lower 4044 (has_type $I32X4 (iadd_pairwise _ 4045 (swiden_low _ val @ (value_type $I16X8)) 4046 (swiden_high _ val)))) 4047 (let ((mul_const XmmMem (emit_u128_le_const 0x0001_0001_0001_0001_0001_0001_0001_0001))) 4048 (x64_pmaddwd val mul_const))) 4049 4050;; special case for the `i16x8.extadd_pairwise_i8x16_u` wasm instruction 4051(rule 2 (lower 4052 (has_type $I16X8 (iadd_pairwise _ 4053 (uwiden_low _ val @ (value_type $I8X16)) 4054 (uwiden_high _ val)))) 4055 (if-let true (has_ssse3)) 4056 (let ((mul_const XmmMem (emit_u128_le_const 0x01010101010101010101010101010101))) 4057 (x64_pmaddubsw val mul_const))) 4058 4059;; special case for the `i32x4.extadd_pairwise_i16x8_u` wasm instruction 4060(rule 2 (lower 4061 (has_type $I32X4 (iadd_pairwise _ 4062 (uwiden_low _ val @ (value_type $I16X8)) 4063 (uwiden_high _ val)))) 4064 (let ((xor_const XmmMem (emit_u128_le_const 0x8000_8000_8000_8000_8000_8000_8000_8000)) 4065 (dst Xmm (x64_pxor val xor_const)) 4066 4067 (madd_const XmmMem (emit_u128_le_const 0x0001_0001_0001_0001_0001_0001_0001_0001)) 4068 (dst Xmm (x64_pmaddwd dst madd_const)) 4069 4070 (addd_const XmmMem (emit_u128_le_const 0x00010000_00010000_00010000_00010000))) 4071 (x64_paddd dst addd_const))) 4072 4073;; special case for the `i32x4.dot_i16x8_s` wasm instruction 4074(rule 2 (lower 4075 (has_type $I32X4 (iadd_pairwise _ 4076 (imul _ (swiden_low _ x) (swiden_low _ y)) 4077 (imul _ (swiden_high _ x) (swiden_high _ y))))) 4078 (x64_pmaddwd x y)) 4079 4080;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4081 4082;; With SSE4.1 use the `pmovsx*` instructions for this 4083(rule 1 (lower (has_type $I16X8 (swiden_low _ val @ (value_type $I8X16)))) 4084 (if-let true (has_sse41)) 4085 (x64_pmovsxbw val)) 4086(rule 1 (lower (has_type $I32X4 (swiden_low _ val @ (value_type $I16X8)))) 4087 (if-let true (has_sse41)) 4088 (x64_pmovsxwd val)) 4089(rule 1 (lower (has_type $I64X2 (swiden_low _ val @ (value_type $I32X4)))) 4090 (if-let true (has_sse41)) 4091 (x64_pmovsxdq val)) 4092 4093(rule (lower (has_type ty (swiden_low _ val))) (lower_swiden_low ty val)) 4094 4095(decl lower_swiden_low (Type Xmm) Xmm) 4096 4097;; Duplicate the low lanes next to each other, then perform a wider shift-right 4098;; by the low lane width to move the upper of each pair back into the lower lane 4099;; of each pair, achieving the widening of the lower lanes. 4100(rule (lower_swiden_low $I16X8 val) 4101 (x64_psraw (x64_punpcklbw val val) (xmi_imm 8))) 4102(rule (lower_swiden_low $I32X4 val) 4103 (x64_psrad (x64_punpcklwd val val) (xmi_imm 16))) 4104 4105;; Generate the sign-extended halves with a `val < 0` comparison (expressed 4106;; reversed here), then interleave the low 32-bit halves to create the full 4107;; 64-bit results. 4108(rule (lower_swiden_low $I64X2 val) 4109 (let ((tmp Xmm (x64_pcmpgtd_a (xmm_zero $I32X4) val))) 4110 (x64_punpckldq val tmp))) 4111 4112;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4113 4114;; Similar to `swiden_low` with SSE4.1 except that the upper lanes are moved 4115;; to the lower lanes first. 4116(rule 1 (lower (has_type $I16X8 (swiden_high _ val @ (value_type $I8X16)))) 4117 (if-let true (has_sse41)) 4118 (if-let true (has_ssse3)) 4119 (let ((x Xmm val)) 4120 (x64_pmovsxbw (x64_palignr x x 8)))) 4121(rule 1 (lower (has_type $I32X4 (swiden_high _ val @ (value_type $I16X8)))) 4122 (if-let true (has_sse41)) 4123 (if-let true (has_ssse3)) 4124 (let ((x Xmm val)) 4125 (x64_pmovsxwd (x64_palignr x x 8)))) 4126(rule 1 (lower (has_type $I64X2 (swiden_high _ val @ (value_type $I32X4)))) 4127 (if-let true (has_sse41)) 4128 (x64_pmovsxdq (x64_pshufd val 0b11_10_11_10))) 4129 4130;; Similar to `swiden_low` versions but using `punpckh*` instructions to 4131;; pair the high lanes next to each other. 4132(rule (lower (has_type $I16X8 (swiden_high _ val @ (value_type $I8X16)))) 4133 (let ((val Xmm val)) 4134 (x64_psraw (x64_punpckhbw val val) (xmi_imm 8)))) 4135(rule (lower (has_type $I32X4 (swiden_high _ val @ (value_type $I16X8)))) 4136 (let ((val Xmm val)) 4137 (x64_psrad (x64_punpckhwd val val) (xmi_imm 16)))) 4138 4139;; Same as `swiden_low`, but `val` has its high lanes moved down. 4140(rule (lower (has_type $I64X2 (swiden_high _ val @ (value_type $I32X4)))) 4141 (let ((val Xmm (x64_pshufd val 0b00_00_11_10)) 4142 (tmp Xmm (x64_pcmpgtd_a (xmm_zero $I32X4) val))) 4143 (x64_punpckldq val tmp))) 4144 4145;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4146 4147;; With SSE4.1 use the `pmovzx*` instructions for this 4148(rule 1 (lower (has_type $I16X8 (uwiden_low _ val @ (value_type $I8X16)))) 4149 (if-let true (has_sse41)) 4150 (x64_pmovzxbw val)) 4151(rule 1 (lower (has_type $I32X4 (uwiden_low _ val @ (value_type $I16X8)))) 4152 (if-let true (has_sse41)) 4153 (x64_pmovzxwd val)) 4154(rule 1 (lower (has_type $I64X2 (uwiden_low _ val @ (value_type $I32X4)))) 4155 (if-let true (has_sse41)) 4156 (x64_pmovzxdq val)) 4157 4158(rule (lower (has_type ty (uwiden_low _ val))) (lower_uwiden_low ty val)) 4159 4160;; Interleave an all-zero register with the low lanes to produce zero-extended 4161;; results. 4162(decl lower_uwiden_low (Type Xmm) Xmm) 4163(rule (lower_uwiden_low $I16X8 val) (x64_punpcklbw val (xmm_zero $I8X16))) 4164(rule (lower_uwiden_low $I32X4 val) (x64_punpcklwd val (xmm_zero $I8X16))) 4165(rule (lower_uwiden_low $I64X2 val) (x64_unpcklps val (xmm_zero $F32X4))) 4166 4167;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4168 4169;; Same as `uwiden_high`, but interleaving high lanes instead. 4170;; 4171;; Note that according to `llvm-mca` at least these instructions are faster 4172;; than using `pmovzx*` in terms of cycles, even if SSE4.1 is available. 4173(rule (lower (has_type $I16X8 (uwiden_high _ val @ (value_type $I8X16)))) 4174 (x64_punpckhbw val (xmm_zero $I8X16))) 4175(rule (lower (has_type $I32X4 (uwiden_high _ val @ (value_type $I16X8)))) 4176 (x64_punpckhwd val (xmm_zero $I8X16))) 4177(rule (lower (has_type $I64X2 (uwiden_high _ val @ (value_type $I32X4)))) 4178 (x64_unpckhps val (xmm_zero $F32X4))) 4179 4180;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4181 4182(rule (lower (has_type $I8X16 (snarrow _ a @ (value_type $I16X8) b))) 4183 (x64_packsswb a b)) 4184 4185(rule (lower (has_type $I16X8 (snarrow _ a @ (value_type $I32X4) b))) 4186 (x64_packssdw a b)) 4187 4188;; We're missing a `snarrow` case for $I64X2 4189;; https://github.com/bytecodealliance/wasmtime/issues/4734 4190 4191;; This rule is a special case for handling the translation of the wasm op 4192;; `i32x4.trunc_sat_f64x2_s_zero`. It can be removed once we have an 4193;; implementation of `snarrow` for `I64X2`. 4194(rule (lower (has_type $I32X4 (snarrow _ (has_type $I64X2 (fcvt_to_sint_sat _ val)) 4195 (vconst _ (u128_from_constant 0))))) 4196 (let ((a Xmm val) 4197 4198 ;; y = i32x4.trunc_sat_f64x2_s_zero(x) is lowered to: 4199 ;; MOVE xmm_tmp, xmm_x 4200 ;; CMPEQPD xmm_tmp, xmm_x 4201 ;; MOVE xmm_y, xmm_x 4202 ;; ANDPS xmm_tmp, [wasm_f64x2_splat(2147483647.0)] 4203 ;; MINPD xmm_y, xmm_tmp 4204 ;; CVTTPD2DQ xmm_y, xmm_y 4205 4206 (tmp1 Xmm (x64_cmppd a a (FcmpImm.Equal))) 4207 4208 ;; 2147483647.0 is equivalent to 0x41DFFFFFFFC00000 4209 (umax_mask XmmMem (emit_u128_le_const 0x41DFFFFFFFC00000_41DFFFFFFFC00000)) 4210 4211 ;; ANDPD xmm_y, [wasm_f64x2_splat(2147483647.0)] 4212 (tmp1 Xmm (x64_andps tmp1 umax_mask)) 4213 (dst Xmm (x64_minpd a tmp1))) 4214 (x64_cvttpd2dq dst))) 4215 4216;; This rule is a special case for handling the translation of the wasm op 4217;; `i32x4.relaxed_trunc_f64x2_s_zero`. 4218(rule (lower (has_type $I32X4 (snarrow _ (has_type $I64X2 (x86_cvtt2dq _ val)) 4219 (vconst _ (u128_from_constant 0))))) 4220 (x64_cvttpd2dq val)) 4221 4222;; Rules for `unarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4223 4224(rule (lower (has_type $I8X16 (unarrow _ a @ (value_type $I16X8) b))) 4225 (x64_packuswb a b)) 4226 4227(rule 1 (lower (has_type $I16X8 (unarrow _ a @ (value_type $I32X4) b))) 4228 (if-let true (has_sse41)) 4229 (x64_packusdw a b)) 4230 4231;; For each input `a` and `b` take the four 32-bit lanes and compress them to 4232;; the low 64-bits of the vector as four 16-bit lanes. Then these are woven 4233;; into one final vector with a `punpcklqdq`. 4234;; 4235;; If this is performance sensitive then it's probably best to upgrade the CPU 4236;; to get the above single-instruction lowering. 4237(rule (lower (has_type $I16X8 (unarrow _ a @ (value_type $I32X4) b))) 4238 (let ( 4239 (a Xmm (unarrow_i32x4_lanes_to_low_u16_lanes a)) 4240 (b Xmm (unarrow_i32x4_lanes_to_low_u16_lanes b)) 4241 ) 4242 (x64_punpcklqdq a b))) 4243 4244(decl unarrow_i32x4_lanes_to_low_u16_lanes (Xmm) Xmm) 4245(rule (unarrow_i32x4_lanes_to_low_u16_lanes val) 4246 (let ( 4247 ;; First convert all negative values in `val` to zero lanes. 4248 (val_gt_zero Xmm (x64_pcmpgtd_a_or_avx val (xmm_zero $I32X4))) 4249 (val Xmm (x64_pand val val_gt_zero)) 4250 4251 ;; Next clamp all larger-than-u16-max lanes to u16::MAX. 4252 (max Xmm (x64_movdqu_load (emit_u128_le_const 0x0000ffff_0000ffff_0000ffff_0000ffff))) 4253 (cmp Xmm (x64_pcmpgtd_a_or_avx max val)) 4254 (valid_lanes Xmm (x64_pand val cmp)) 4255 (clamped_lanes Xmm (x64_pandn cmp max)) 4256 (val Xmm (x64_por valid_lanes clamped_lanes)) 4257 4258 ;; Within each 64-bit half of the 32x4 vector move the first 16 bits 4259 ;; and the third 16 bits to the bottom of the half. Afterwards 4260 ;; for the 32x4 vector move the first and third lanes to the bottom 4261 ;; lanes, which finishes up the conversion here as all the lanes 4262 ;; are now converted to 16-bit values in the low 4 lanes. 4263 (val Xmm (x64_pshuflw val 0b00_00_10_00)) 4264 (val Xmm (x64_pshufhw val 0b00_00_10_00)) 4265 ) 4266 (x64_pshufd val 0b00_00_10_00))) 4267 4268 4269;; We're missing a `unarrow` case for $I64X2 4270;; https://github.com/bytecodealliance/wasmtime/issues/4734 4271 4272;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4273 4274(rule -3 (lower (has_type (is_gpr_type (fits_in_64 ty)) (bitcast _ _ src @ (value_type (is_xmm_type _))))) 4275 (bitcast_xmm_to_gpr (ty_bits ty) src)) 4276 4277(rule -2 (lower (has_type (is_xmm_type (fits_in_64 ty)) (bitcast _ _ src @ (value_type (is_gpr_type _))))) 4278 (bitcast_gpr_to_xmm (ty_bits ty) src)) 4279 4280(rule -1 (lower (has_type $I128 (bitcast _ _ src @ (value_type (is_xmm_type _))))) 4281 (bitcast_xmm_to_gprs src)) 4282 4283(rule 0 (lower (has_type (is_xmm_type _) (bitcast _ _ src @ (value_type $I128)))) 4284 (bitcast_gprs_to_xmm src)) 4285 4286;; Bitcast between types residing in GPR registers is a no-op. 4287(rule 1 (lower (has_type (is_gpr_type _) 4288 (bitcast _ _ x @ (value_type (is_gpr_type _))))) 4289 x) 4290 4291;; Bitcast between types residing in XMM registers is a no-op. 4292(rule 3 (lower (has_type (is_xmm_type _) 4293 (bitcast _ _ x @ (value_type (is_xmm_type _))))) 4294 x) 4295 4296;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4297 4298(rule (lower (has_type $F32 (fcopysign _ a @ (value_type $F32) b))) 4299 (let ((sign_bit Xmm (imm $F32 0x80000000)) 4300 (a Xmm a) ;; force into reg so we don't sink a 128-bit load. 4301 (b Xmm b)) 4302 (x64_orps 4303 (x64_andnps sign_bit a) 4304 (x64_andps sign_bit b)))) 4305 4306(rule (lower (has_type $F64 (fcopysign _ a @ (value_type $F64) b))) 4307 (let ((sign_bit Xmm (imm $F64 0x8000000000000000)) 4308 (a Xmm a) ;; force into reg so we don't sink a 128-bit load. 4309 (b Xmm b)) 4310 (x64_orpd 4311 (x64_andnpd sign_bit a) 4312 (x64_andpd sign_bit b)))) 4313 4314;; Helper for the `ceil`/`floor`/`nearest`/`trunc` instructions ;;;;;;;;;;;;;;;; 4315 4316;; Emits either a `round{ss,sd,ps,pd}` instruction, as appropriate, or generates 4317;; the appropriate libcall and sequence to call that. 4318;; 4319;; Recursion: at most once to convert memory case into register case. 4320(decl rec x64_round (Type RegMem RoundImm) Xmm) 4321(rule 1 (x64_round $F32 a imm) 4322 (if-let true (has_sse41)) 4323 (x64_roundss a imm)) 4324(rule 1 (x64_round $F64 a imm) 4325 (if-let true (has_sse41)) 4326 (x64_roundsd a imm)) 4327(rule 1 (x64_round $F32X4 a imm) 4328 (if-let true (has_sse41)) 4329 (x64_roundps a imm)) 4330(rule 1 (x64_round $F64X2 a imm) 4331 (if-let true (has_sse41)) 4332 (x64_roundpd a imm)) 4333 4334(rule (x64_round $F32 (RegMem.Reg a) imm) (libcall_1 (round_libcall $F32 imm) a)) 4335(rule (x64_round $F64 (RegMem.Reg a) imm) (libcall_1 (round_libcall $F64 imm) a)) 4336(rule (x64_round $F32X4 (RegMem.Reg a) imm) 4337 (let ( 4338 (libcall LibCall (round_libcall $F32 imm)) 4339 (result Xmm (libcall_1 libcall a)) 4340 (a1 Xmm (libcall_1 libcall (x64_pshufd a 1))) 4341 (result Xmm (f32x4_insertlane result a1 1)) 4342 (a2 Xmm (libcall_1 libcall (x64_pshufd a 2))) 4343 (result Xmm (f32x4_insertlane result a2 2)) 4344 (a3 Xmm (libcall_1 libcall (x64_pshufd a 3))) 4345 (result Xmm (f32x4_insertlane result a3 3)) 4346 ) 4347 result)) 4348(rule (x64_round $F64X2 (RegMem.Reg a) imm) 4349 (let ( 4350 (libcall LibCall (round_libcall $F64 imm)) 4351 (result Xmm (libcall_1 libcall a)) 4352 (a1 Xmm (libcall_1 libcall (x64_pshufd a 0b00_00_11_10))) 4353 ) 4354 (x64_movlhps result a1))) 4355(rule (x64_round ty (RegMem.Mem addr) imm) 4356 (x64_round ty (RegMem.Reg (x64_load_xmm ty addr)) imm)) 4357 4358(decl round_libcall (Type RoundImm) LibCall) 4359(rule (round_libcall $F32 (RoundImm.RoundUp)) (LibCall.CeilF32)) 4360(rule (round_libcall $F64 (RoundImm.RoundUp)) (LibCall.CeilF64)) 4361(rule (round_libcall $F32 (RoundImm.RoundDown)) (LibCall.FloorF32)) 4362(rule (round_libcall $F64 (RoundImm.RoundDown)) (LibCall.FloorF64)) 4363(rule (round_libcall $F32 (RoundImm.RoundNearest)) (LibCall.NearestF32)) 4364(rule (round_libcall $F64 (RoundImm.RoundNearest)) (LibCall.NearestF64)) 4365(rule (round_libcall $F32 (RoundImm.RoundZero)) (LibCall.TruncF32)) 4366(rule (round_libcall $F64 (RoundImm.RoundZero)) (LibCall.TruncF64)) 4367 4368;; Rules for `ceil` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4369 4370(rule (lower (ceil _ a @ (value_type ty))) 4371 (x64_round ty a (RoundImm.RoundUp))) 4372 4373;; Rules for `floor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4374 4375(rule (lower (floor _ a @ (value_type ty))) 4376 (x64_round ty a (RoundImm.RoundDown))) 4377 4378;; Rules for `nearest` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4379 4380(rule (lower (nearest _ a @ (value_type ty))) 4381 (x64_round ty a (RoundImm.RoundNearest))) 4382 4383;; Rules for `trunc` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4384 4385(rule (lower (trunc _ a @ (value_type ty))) 4386 (x64_round ty a (RoundImm.RoundZero))) 4387 4388;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4389 4390(rule (lower (stack_addr _ stack_slot offset)) 4391 (stack_addr_impl stack_slot offset)) 4392 4393;; Rules for `udiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4394 4395;; NB: a `RegMem` divisor, while allowed in the instruction encoding, isn't 4396;; used right now to prevent a possibly-trapping load getting folded into the 4397;; `div` instruction. Ideally non-trapping loads would get folded, however, or 4398;; alternatively Wasmtime/Cranelift would grow support for multiple traps on 4399;; a single opcode and the signal kind would differentiate at runtime. 4400 4401;; The inputs to the `div` instruction are different for 8-bit division so 4402;; it needs a special case here since the instruction being crafted has a 4403;; different shape. 4404(rule 2 (lower (udiv _ a @ (value_type $I8) b)) 4405 (x64_divb_m (extend_to_gpr a $I32 (ExtendKind.Zero)) 4406 (put_in_gpr b) 4407 (TrapCode.INTEGER_DIVISION_BY_ZERO))) 4408 4409;; 16-to-64-bit division is all done with a similar instruction and the only 4410;; tricky requirement here is that when div traps are disallowed the divisor 4411;; must not be zero. 4412(rule 1 (lower (udiv _ a @ (value_type (fits_in_64 ty)) b)) 4413 (value_regs_get 4414 (x64_div ty a (imm $I64 0) (put_in_gpr b) (TrapCode.INTEGER_DIVISION_BY_ZERO)) 4415 0)) 4416 4417;; Rules for `sdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4418 4419(rule 2 (lower (sdiv _ a @ (value_type $I8) b)) 4420 (x64_idivb_m (x64_cbtw_zo a) 4421 (nonzero_sdiv_divisor $I8 b) 4422 (TrapCode.INTEGER_OVERFLOW))) 4423 4424(rule 1 (lower (sdiv _ a @ (value_type (fits_in_64 ty)) b)) 4425 (let ((a Gpr a)) 4426 (value_regs_get 4427 (x64_idiv ty a (repeat_sign_bit ty a) (nonzero_sdiv_divisor ty b) (TrapCode.INTEGER_OVERFLOW)) 4428 0))) 4429 4430;; Repeats the sign bit in the provided gpr, which will register-allocate to 4431;; %rax, into a destination gpr which will register-allocate to %rdx. 4432;; 4433;; This is intended to be used before x64 `div` instructions where 4434;; left-hand-side (divisor? dividend? I always forget) is double-wide and 4435;; present across the rax/rdx registers (sized to the operation in question). 4436(decl repeat_sign_bit (Type Gpr) Gpr) 4437(rule (repeat_sign_bit $I16 src) (x64_cwtd_zo src)) 4438(rule (repeat_sign_bit $I32 src) (x64_cltd_zo src)) 4439(rule (repeat_sign_bit $I64 src) (x64_cqto_zo src)) 4440 4441;; Checks to make sure that the input `Value` is a non-zero value for `sdiv`. 4442;; 4443;; This is required to differentiate the divide-by-zero trap from the 4444;; integer-overflow trap, the two trapping conditions of signed division. 4445(decl nonzero_sdiv_divisor (Type Value) Reg) 4446(rule 1 (nonzero_sdiv_divisor ty (iconst _ imm)) 4447 (if-let n (safe_divisor_from_imm64 ty imm)) 4448 (imm ty n)) 4449(rule 0 (nonzero_sdiv_divisor ty val) 4450 (let ( 4451 (val Reg val) 4452 (_ InstOutput (side_effect (with_flags_side_effect 4453 (x64_test ty val val) 4454 (trap_if (CC.Z) (TrapCode.INTEGER_DIVISION_BY_ZERO))))) 4455 ) 4456 val)) 4457 4458;; Rules for `urem` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4459 4460;; The remainder is in AH, so take the result of the division and right-shift 4461;; by 8. 4462(rule 2 (lower (urem _ a @ (value_type $I8) b)) 4463 (let ( 4464 (result Gpr (x64_divb_m (extend_to_gpr a $I32 (ExtendKind.Zero)) 4465 (put_in_gpr b) ;; see `udiv` for why not `gpr_mem` 4466 (TrapCode.INTEGER_DIVISION_BY_ZERO))) 4467 ) 4468 (x64_shrq_mi result 8))) 4469 4470(rule 1 (lower (urem _ a @ (value_type (fits_in_64 ty)) b)) 4471 (value_regs_get 4472 (x64_div ty a (imm $I64 0) (put_in_gpr b) (TrapCode.INTEGER_DIVISION_BY_ZERO)) 4473 1)) 4474 4475;; Rules for `srem` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4476 4477;; Special-cases first for constant `srem` where the checks for 0 and -1 aren't 4478;; applicable. 4479;; 4480;; Note that like `urem` for i8 types the result is in AH so to get the result 4481;; it's right-shifted down. 4482(rule 3 (lower (srem _ a @ (value_type $I8) (iconst _ imm))) 4483 (if-let n (safe_divisor_from_imm64 $I8 imm)) 4484 (let ( 4485 (a Gpr (x64_cbtw_zo a)) 4486 (result Gpr (x64_idivb_m a (imm $I8 n) (TrapCode.INTEGER_DIVISION_BY_ZERO))) 4487 ) 4488 (x64_shrq_mi result 8))) 4489 4490;; Same as the above rule but for 16-to-64 bit types. 4491(rule 2 (lower (srem _ a @ (value_type ty) (iconst _ imm))) 4492 (if-let n (safe_divisor_from_imm64 ty imm)) 4493 (let ( 4494 (a Gpr a) 4495 (size OperandSize (raw_operand_size_of_type ty)) 4496 ) 4497 (value_regs_get 4498 (x64_idiv ty a (repeat_sign_bit ty a) (imm ty n) (TrapCode.INTEGER_DIVISION_BY_ZERO)) 4499 1))) 4500 4501(rule 1 (lower (srem _ a @ (value_type $I8) b)) 4502 (x64_shrq_mi (x64_checked_srem_seq8 (x64_cbtw_zo a) b) 8)) 4503 4504(rule (lower (srem _ a @ (value_type ty) b)) 4505 (let ( 4506 (a Gpr a) 4507 (size OperandSize (raw_operand_size_of_type ty)) 4508 (hi Gpr (repeat_sign_bit ty a)) 4509 (tmp ValueRegs (x64_checked_srem_seq size a hi b)) 4510 ) 4511 (value_regs_get tmp 1))) 4512 4513;; Rules for `umulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4514 4515(rule 0 (lower (umulhi _ a @ (value_type $I8) b)) 4516 (x64_shrw_mi (x64_mul8 false a b) 8)) 4517 4518(rule 1 (lower (umulhi _ a @ (value_type (ty_int_ref_16_to_64 ty)) b)) 4519 (value_regs_get_gpr (x64_mul ty false a b) 1)) 4520 4521;; The BMI2 instruction set introduced `mulx` which defines two registers but 4522;; if the two registers are the same then it only defines the upper bits. This 4523;; helps slightly reduce register pressure by ensuring only one register here is 4524;; clobbered. 4525(rule 2 (lower (umulhi _ a @ (value_type (ty_32_or_64 ty)) b)) 4526 (if-let true (has_bmi2)) 4527 (x64_mulx_hi ty a b)) 4528 4529;; Rules for `smulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4530 4531(rule 0 (lower (smulhi _ a @ (value_type $I8) b)) 4532 (x64_sarw_mi (x64_mul8 true a b) 8)) 4533 4534(rule 1 (lower (smulhi _ a @ (value_type (ty_int_ref_16_to_64 ty)) b)) 4535 (value_regs_get_gpr (x64_mul ty true a b) 1)) 4536 4537;; Rules for `get_pinned_reg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4538 4539(rule (lower (get_pinned_reg _)) 4540 (read_pinned_gpr)) 4541 4542;; Rules for `set_pinned_reg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4543 4544(rule (lower (set_pinned_reg a @ (value_type ty))) 4545 (side_effect (write_pinned_gpr a))) 4546 4547;; Rules for `vconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4548 4549(rule (lower (has_type ty (vconst _ const))) 4550 ;; TODO use Inst::gen_constant() instead. 4551 (x64_xmm_load_const ty (const_to_vconst const))) 4552 4553;; Special cases for known constant patterns to skip a 16-byte load. 4554(rule 1 (lower (has_type ty (vconst _ (u128_from_constant 0)))) (xmm_zero ty)) 4555(rule 1 (lower (has_type ty (vconst _ (u128_from_constant -1)))) (vector_all_ones)) 4556 4557;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4558 4559;; Special case for `pblendw` which takes an 8-bit immediate where each bit 4560;; indicates which lane of the two operands is chosen for the output. A bit of 4561;; 0 chooses the corresponding 16-it lane from `a` and a bit of 1 chooses the 4562;; corresponding 16-bit lane from `b`. 4563(rule 14 (lower (shuffle _ a b (pblendw_imm n))) 4564 (if-let true (has_sse41)) 4565 (x64_pblendw a b n)) 4566(decl pblendw_imm (u8) Immediate) 4567(extern extractor pblendw_imm pblendw_imm) 4568 4569;; When the shuffle looks like "concatenate `a` and `b` and shift right by n*8 4570;; bytes", that's a `palignr` instruction. Note that the order of operands are 4571;; swapped in the instruction here. The `palignr` instruction uses the second 4572;; operand as the low-order bytes and the first operand as high-order bytes, 4573;; so put `a` second. 4574(rule 13 (lower (shuffle _ a b (palignr_imm_from_immediate n))) 4575 (if-let true (has_ssse3)) 4576 (x64_palignr b a n)) 4577(decl palignr_imm_from_immediate (u8) Immediate) 4578(extern extractor palignr_imm_from_immediate palignr_imm_from_immediate) 4579 4580;; Special case the `pshuf{l,h}w` instruction which shuffles four 16-bit 4581;; integers within one value, preserving the other four 16-bit integers in that 4582;; value (either the high or low half). The complicated logic is in the 4583;; extractors here implemented in Rust and note that there's two cases for each 4584;; instruction here to match when either the first or second shuffle operand is 4585;; used. 4586(rule 12 (lower (shuffle _ x y (pshuflw_lhs_imm imm))) 4587 (x64_pshuflw x imm)) 4588(rule 11 (lower (shuffle _ x y (pshuflw_rhs_imm imm))) 4589 (x64_pshuflw y imm)) 4590(rule 10 (lower (shuffle _ x y (pshufhw_lhs_imm imm))) 4591 (x64_pshufhw x imm)) 4592(rule 9 (lower (shuffle _ x y (pshufhw_rhs_imm imm))) 4593 (x64_pshufhw y imm)) 4594 4595(decl pshuflw_lhs_imm (u8) Immediate) 4596(extern extractor pshuflw_lhs_imm pshuflw_lhs_imm) 4597(decl pshuflw_rhs_imm (u8) Immediate) 4598(extern extractor pshuflw_rhs_imm pshuflw_rhs_imm) 4599(decl pshufhw_lhs_imm (u8) Immediate) 4600(extern extractor pshufhw_lhs_imm pshufhw_lhs_imm) 4601(decl pshufhw_rhs_imm (u8) Immediate) 4602(extern extractor pshufhw_rhs_imm pshufhw_rhs_imm) 4603 4604;; Special case for the `pshufd` instruction which will permute 32-bit values 4605;; within a single register. This is only applicable if the `imm` specified 4606;; selects 32-bit values from either `x` or `y`, but not both. This means 4607;; there's one rule for selecting from `x` and another rule for selecting from 4608;; `y`. 4609(rule 8 (lower (shuffle _ x y (pshufd_lhs_imm imm))) 4610 (x64_pshufd x imm)) 4611(rule 7 (lower (shuffle _ x y (pshufd_rhs_imm imm))) 4612 (x64_pshufd y imm)) 4613 4614(decl pshufd_lhs_imm (u8) Immediate) 4615(extern extractor pshufd_lhs_imm pshufd_lhs_imm) 4616(decl pshufd_rhs_imm (u8) Immediate) 4617(extern extractor pshufd_rhs_imm pshufd_rhs_imm) 4618 4619;; Special case for i8-level interleaving of upper/low bytes. 4620(rule 6 (lower (shuffle _ a b (u128_from_immediate 0x1f0f_1e0e_1d0d_1c0c_1b0b_1a0a_1909_1808))) 4621 (x64_punpckhbw a b)) 4622(rule 6 (lower (shuffle _ a b (u128_from_immediate 0x1707_1606_1505_1404_1303_1202_1101_1000))) 4623 (x64_punpcklbw a b)) 4624 4625;; Special case for i16-level interleaving of upper/low bytes. 4626(rule 6 (lower (shuffle _ a b (u128_from_immediate 0x1f1e_0f0e_1d1c_0d0c_1b1a_0b0a_1918_0908))) 4627 (x64_punpckhwd a b)) 4628(rule 6 (lower (shuffle _ a b (u128_from_immediate 0x1716_0706_1514_0504_1312_0302_1110_0100))) 4629 (x64_punpcklwd a b)) 4630 4631;; Special case for i32-level interleaving of upper/low bytes. 4632(rule 6 (lower (shuffle _ a b (u128_from_immediate 0x1f1e1d1c_0f0e0d0c_1b1a1918_0b0a0908))) 4633 (x64_punpckhdq a b)) 4634(rule 6 (lower (shuffle _ a b (u128_from_immediate 0x17161514_07060504_13121110_03020100))) 4635 (x64_punpckldq a b)) 4636 4637;; Special case for i64-level interleaving of upper/low bytes. 4638(rule 6 (lower (shuffle _ a b (u128_from_immediate 0x1f1e1d1c1b1a1918_0f0e0d0c0b0a0908))) 4639 (x64_punpckhqdq a b)) 4640(rule 6 (lower (shuffle _ a b (u128_from_immediate 0x1716151413121110_0706050403020100))) 4641 (x64_punpcklqdq a b)) 4642 4643;; If the vector shift mask is all 0s then that means the first byte of the 4644;; first operand is broadcast to all bytes. Falling through would load an 4645;; all-zeros constant from a rip-relative location but it should be slightly 4646;; more efficient to execute the `pshufb` here-and-now with an xor'd-to-be-zero 4647;; register. 4648(rule 6 (lower (shuffle _ a _ (u128_from_immediate 0))) 4649 (if-let true (has_ssse3)) 4650 (x64_pshufb a (xmm_zero $I8X16))) 4651 4652;; Special case for the `shufps` instruction which will select two 32-bit values 4653;; from the first operand and two 32-bit values from the second operand. Note 4654;; that there is a second case here as well for when the operands can be 4655;; swapped. 4656;; 4657;; Note that the priority of this instruction is currently lower than the above 4658;; special cases since `shufps` handles many of them and for now it's 4659;; hypothesized that the dedicated instructions are better than `shufps`. 4660;; Someone with more knowledge about x86 timings should perhaps reorder the 4661;; rules here eventually though. 4662(rule 5 (lower (shuffle _ x y (shufps_imm imm))) 4663 (x64_shufps x y imm)) 4664(rule 4 (lower (shuffle _ x y (shufps_rev_imm imm))) 4665 (x64_shufps y x imm)) 4666 4667(decl shufps_imm(u8) Immediate) 4668(extern extractor shufps_imm shufps_imm) 4669(decl shufps_rev_imm(u8) Immediate) 4670(extern extractor shufps_rev_imm shufps_rev_imm) 4671 4672 4673;; If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM 4674;; register. We statically build `constructed_mask` to zero out any unknown lane 4675;; indices (may not be completely necessary: verification could fail incorrect 4676;; mask values) and fix the indexes to all point to the `dst` vector. 4677(rule 3 (lower (shuffle _ a a (vec_mask_from_immediate mask))) 4678 (if-let true (has_ssse3)) 4679 (x64_pshufb a (shuffle_0_31_mask mask))) 4680 4681;; For the case where the shuffle mask contains out-of-bounds values (values 4682;; greater than 31) we must mask off those resulting values in the result of 4683;; `vpermi2b`. 4684(rule 2 (lower (shuffle _ a b (vec_mask_from_immediate (perm_from_mask_with_zeros mask zeros)))) 4685 (if-let true (has_avx512vl)) 4686 (if-let true (has_avx512vbmi)) 4687 (x64_andps (x64_vpermi2b (x64_xmm_load_const $I8X16 mask) a b) zeros)) 4688 4689;; However, if the shuffle mask contains no out-of-bounds values, we can use 4690;; `vpermi2b` without any masking. 4691(rule 1 (lower (shuffle _ a b (vec_mask_from_immediate mask))) 4692 (if-let true (has_avx512vl)) 4693 (if-let true (has_avx512vbmi)) 4694 (x64_vpermi2b (x64_xmm_load_const $I8X16 (perm_from_mask mask)) a b)) 4695 4696;; If `lhs` and `rhs` are different, we must shuffle each separately and then OR 4697;; them together. This is necessary due to PSHUFB semantics. As in the case 4698;; above, we build the `constructed_mask` for each case statically. 4699(rule (lower (shuffle _ a b (vec_mask_from_immediate mask))) 4700 (x64_por 4701 (lower_pshufb a (shuffle_0_15_mask mask)) 4702 (lower_pshufb b (shuffle_16_31_mask mask)))) 4703 4704;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4705 4706;; SIMD swizzle; the following inefficient implementation is due to the Wasm 4707;; SIMD spec requiring mask indexes greater than 15 to have the same semantics 4708;; as a 0 index. For the spec discussion, see 4709;; https://github.com/WebAssembly/simd/issues/93. The CLIF semantics match the 4710;; Wasm SIMD semantics for this instruction. The instruction format maps to 4711;; variables like: %dst = swizzle %src, %mask 4712(rule (lower (swizzle _ src mask)) 4713 (let ((mask Xmm (x64_paddusb mask (emit_u128_le_const 0x70707070707070707070707070707070)))) 4714 (lower_pshufb src mask))) 4715 4716;; Rules for `x86_pshufb` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4717 4718(rule (lower (x86_pshufb _ src mask)) 4719 (if-let true (has_ssse3)) 4720 (x64_pshufb src mask)) 4721 4722;; A helper function to generate either the `pshufb` instruction or a libcall to 4723;; the `X86Pshufb` libcall. Note that the libcall is not exactly the most 4724;; performant thing in the world so this is primarily here for completeness 4725;; of lowerings on all x86 cpus but if rules are ideally gated on the presence 4726;; of SSSE3 to use the `pshufb` instruction itself. 4727;; 4728;; Recursion: at most once to implement the memory load case. 4729(decl rec lower_pshufb (Xmm RegMem) Xmm) 4730(rule 1 (lower_pshufb src mask) 4731 (if-let true (has_ssse3)) 4732 (x64_pshufb src mask)) 4733(rule (lower_pshufb src (RegMem.Reg mask)) 4734 (libcall_2 (LibCall.X86Pshufb) src mask)) 4735(rule (lower_pshufb src (RegMem.Mem addr)) 4736 (lower_pshufb src (x64_movdqu_load addr))) 4737 4738;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4739 4740;; Remove the extractlane instruction, leaving the float where it is. The upper 4741;; bits will remain unchanged; for correctness, this relies on Cranelift type 4742;; checking to avoid using those bits. 4743(rule 3 (lower (has_type (ty_scalar_float _) (extractlane _ val 0))) 4744 val) 4745 4746;; `f32x4.extract_lane N` where `N != 0` 4747(rule 1 (lower (extractlane _ val @ (value_type $F32X4) (u8_from_uimm8 lane))) 4748 (x64_pshufd val lane)) 4749 4750;; `f64x2.extract_lane N` where `N != 0` (aka N == 1) 4751(rule (lower (extractlane _ val @ (value_type $F64X2) 1)) 4752 (x64_pshufd val 0b11_10_11_10)) 4753 4754;; `i8x16.extract_lane N` 4755;; 4756;; Note that without SSE4.1 a 16-bit lane extraction is performed and then 4757;; the result is updated if the desired index is either odd or even. 4758(rule 2 (lower (extractlane _ val @ (value_type ty @ $I8X16) (u8_from_uimm8 lane))) 4759 (if-let true (has_sse41)) 4760 (x64_pextrb val lane)) 4761;; extracting an odd lane has an extra shift-right 4762(rule 1 (lower (extractlane _ val @ (value_type ty @ $I8X16) (u8_from_uimm8 lane))) 4763 (if-let 1 (u8_and lane 1)) 4764 (x64_shrw_mi (x64_pextrw val (u8_wrapping_shr lane 1)) 8)) 4765;; Extracting an even lane already has the desired lane in the lower bits. Note 4766;; that having arbitrary upper bits in the returned register should be ok since 4767;; all operators on the resulting `i8` type should work correctly regardless of 4768;; the bits in the rest of the register. 4769(rule (lower (extractlane _ val @ (value_type ty @ $I8X16) (u8_from_uimm8 lane))) 4770 (if-let 0 (u8_and lane 1)) 4771 (x64_pextrw val (u8_wrapping_shr lane 1))) 4772 4773;; `i16x8.extract_lane N` 4774(rule (lower (extractlane _ val @ (value_type ty @ $I16X8) (u8_from_uimm8 lane))) 4775 (x64_pextrw val lane)) 4776 4777;; `i32x4.extract_lane N` 4778(rule 2 (lower (extractlane _ val @ (value_type ty @ $I32X4) (u8_from_uimm8 lane))) 4779 (if-let true (has_sse41)) 4780 (x64_pextrd val lane)) 4781(rule 1 (lower (extractlane _ val @ (value_type $I32X4) 0)) 4782 (x64_movd_to_gpr val)) 4783(rule (lower (extractlane _ val @ (value_type $I32X4) (u8_from_uimm8 n))) 4784 (x64_movd_to_gpr (x64_pshufd val n))) 4785 4786;; `i64x2.extract_lane N` 4787(rule 1 (lower (extractlane _ val @ (value_type $I64X2) (u8_from_uimm8 lane))) 4788 (if-let true (has_sse41)) 4789 (x64_pextrq val lane)) 4790(rule (lower (extractlane _ val @ (value_type $I64X2) 0)) 4791 (x64_movq_to_gpr val)) 4792(rule (lower (extractlane _ val @ (value_type $I64X2) 1)) 4793 (x64_movq_to_gpr (x64_pshufd val 0b00_00_11_10))) 4794 4795;; Rules for `scalar_to_vector` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4796 4797;; Case 1: when moving a scalar float the `movss` and `movsd` variants with 4798;; xmm-register-to-xmm-register semantics are used to modify only the low bits 4799;; of a guaranteed-zero register. This ensures that the upper bits are cleared 4800;; as the upper bits of `src` in a register are otherwise undefined. 4801(rule 1 (lower (scalar_to_vector _ src @ (value_type $F32))) 4802 (x64_movss_regmove (xmm_zero $F32X4) src)) 4803(rule 1 (lower (scalar_to_vector _ src @ (value_type $F64))) 4804 (x64_movsd_regmove (xmm_zero $F64X2) src)) 4805 4806;; Case 2: when moving a scalar value of any other type, use MOVD to zero 4807;; the upper lanes. 4808(rule (lower (scalar_to_vector _ src @ (value_type ty))) 4809 (bitcast_gpr_to_xmm (ty_bits ty) src)) 4810 4811;; Case 3: when presented with `load + scalar_to_vector`, coalesce into a single 4812;; MOVSS/MOVSD instruction. 4813(rule 2 (lower (scalar_to_vector _ (and (sinkable_load src) (value_type (ty_32 _))))) 4814 (x64_movss_load src)) 4815(rule 3 (lower (scalar_to_vector _ (and (sinkable_load src) (value_type (ty_64 _))))) 4816 (x64_movsd_load src)) 4817 4818;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4819 4820;; For all the splat rules below one of the goals is that splatting a value 4821;; doesn't end up accidentally depending on the previous value in a register. 4822;; This means that instructions are chosen to avoid false dependencies where 4823;; new values are created fresh or otherwise overwrite previous register 4824;; contents where possible. 4825;; 4826;; Additionally splats are specialized to special-case load-and-splat which 4827;; has a number of micro-optimizations available. 4828 4829;; i8x16 splats: use `vpbroadcastb` on AVX2 and otherwise `pshufb` broadcasts 4830;; with a mask of zero which is calculated with an xor-against-itself register. 4831(rule 0 (lower (has_type $I8X16 (splat _ src))) 4832 (let ((src Xmm (x64_movd_to_xmm src))) 4833 (x64_pshufd (x64_pshuflw (x64_punpcklbw src src) 0) 0))) 4834(rule 1 (lower (has_type $I8X16 (splat _ src))) 4835 (if-let true (has_ssse3)) 4836 (x64_pshufb (bitcast_gpr_to_xmm 32 src) (xmm_zero $I8X16))) 4837(rule 2 (lower (has_type $I8X16 (splat _ src))) 4838 (if-let true (use_avx2)) 4839 (x64_vpbroadcastb (bitcast_gpr_to_xmm 32 src))) 4840(rule 3 (lower (has_type $I8X16 (splat _ (sinkable_load_exact addr)))) 4841 (if-let true (has_sse41)) 4842 (if-let true (has_ssse3)) 4843 (x64_pshufb (x64_pinsrb (xmm_uninit_value) addr 0) (xmm_zero $I8X16))) 4844(rule 4 (lower (has_type $I8X16 (splat _ (sinkable_load_exact addr)))) 4845 (if-let true (use_avx2)) 4846 (x64_vpbroadcastb addr)) 4847 4848;; i16x8 splats: use `vpbroadcastw` on AVX2 and otherwise a 16-bit value is 4849;; loaded into an xmm register, `pshuflw` broadcasts the low 16-bit lane 4850;; to the low four lanes, and `pshufd` broadcasts the low 32-bit lane (which 4851;; at that point is two of the 16-bit values we want to broadcast) to all the 4852;; lanes. 4853(rule 0 (lower (has_type $I16X8 (splat _ src))) 4854 (x64_pshufd (x64_pshuflw (bitcast_gpr_to_xmm 32 src) 0) 0)) 4855(rule 1 (lower (has_type $I16X8 (splat _ src))) 4856 (if-let true (use_avx2)) 4857 (x64_vpbroadcastw (bitcast_gpr_to_xmm 32 src))) 4858(rule 2 (lower (has_type $I16X8 (splat _ (sinkable_load_exact addr)))) 4859 (x64_pshufd (x64_pshuflw (x64_pinsrw (xmm_uninit_value) addr 0) 0) 0)) 4860(rule 3 (lower (has_type $I16X8 (splat _ (sinkable_load_exact addr)))) 4861 (if-let true (use_avx2)) 4862 (x64_vpbroadcastw addr)) 4863 4864;; i32x4.splat - use `vpbroadcastd` on AVX2 and otherwise `pshufd` can be 4865;; used to broadcast the low lane to all other lanes. 4866;; 4867;; Note that sinkable-load cases come later 4868(rule 0 (lower (has_type $I32X4 (splat _ src))) 4869 (x64_pshufd (bitcast_gpr_to_xmm 32 src) 0)) 4870(rule 1 (lower (has_type $I32X4 (splat _ src))) 4871 (if-let true (use_avx2)) 4872 (x64_vpbroadcastd (bitcast_gpr_to_xmm 32 src))) 4873 4874;; f32x4.splat - the source is already in an xmm register so `shufps` is all 4875;; that's necessary to complete the splat. This is specialized to `vbroadcastss` 4876;; on AVX2 to leverage that specific instruction for this operation. 4877(rule 0 (lower (has_type $F32X4 (splat _ src))) 4878 (let ((tmp Xmm src)) 4879 (x64_shufps src src 0))) 4880(rule 1 (lower (has_type $F32X4 (splat _ src))) 4881 (if-let true (use_avx2)) 4882 (x64_vbroadcastss src)) 4883 4884;; t32x4.splat of a load - use a `movss` to load into an xmm register and then 4885;; `shufps` broadcasts to the other lanes. Note that this is used for both i32 4886;; and f32 splats. 4887;; 4888;; With AVX the `vbroadcastss` instruction suits this purpose precisely. Note 4889;; that the memory-operand encoding of `vbroadcastss` is usable with AVX, but 4890;; the register-based encoding is only available with AVX2. With the 4891;; `sinkable_load` extractor this should be guaranteed to use the memory-based 4892;; encoding hence the `has_avx` test. 4893(rule 5 (lower (has_type (multi_lane 32 4) (splat _ (sinkable_load addr)))) 4894 (let ((tmp Xmm (x64_movss_load addr))) 4895 (x64_shufps tmp tmp 0))) 4896(rule 6 (lower (has_type (multi_lane 32 4) (splat _ (sinkable_load addr)))) 4897 (if-let true (has_avx)) 4898 (x64_vbroadcastss addr)) 4899 4900;; t64x2.splat - use `pshufd` to broadcast the lower 64-bit lane to the upper 4901;; lane. A minor specialization for sinkable loads to avoid going through a gpr 4902;; for i64 splats is used as well when `movddup` is available. 4903(rule 0 (lower (has_type $I64X2 (splat _ src))) 4904 (x64_pshufd (bitcast_gpr_to_xmm 64 src) 0b01_00_01_00)) 4905(rule 0 (lower (has_type $F64X2 (splat _ src))) 4906 (x64_pshufd (put_in_xmm src) 0b01_00_01_00)) 4907(rule 6 (lower (has_type (multi_lane 64 2) (splat _ (sinkable_load addr)))) 4908 (if-let true (has_sse3)) 4909 (x64_movddup addr)) 4910 4911;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4912 4913(rule (lower (vany_true _ val)) (lower_cond_bool (is_vany_true val))) 4914 4915;; Any nonzero byte in `val` means that any lane is true. Compare `val` with a 4916;; zeroed register and extract the high bits to a gpr mask. If the mask is 4917;; 0xffff then every byte was equal to zero, so test if the comparison is 4918;; not-equal or NZ. 4919(decl is_vany_true (Value) CondResult) 4920(rule (is_vany_true val) 4921 (let ( 4922 (any_byte_zero Xmm (x64_pcmpeqb val (xmm_zero $I8X16))) 4923 (mask Gpr (x64_pmovmskb any_byte_zero)) 4924 ) 4925 (CondResult.CC (x64_cmpl_mi mask 0xffff) (CC.NZ)))) 4926(rule 1 (is_vany_true val) 4927 (if-let true (has_sse41)) 4928 (let ((val Xmm val)) 4929 (CondResult.CC (x64_ptest val val) (CC.NZ)))) 4930 4931;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4932 4933(rule (lower (vall_true _ val)) (lower_cond_bool (is_vall_true val))) 4934 4935(decl is_vall_true (Value) CondResult) 4936(rule 1 (is_vall_true val @ (value_type ty)) 4937 (if-let true (has_sse41)) 4938 (let ((src Xmm val) 4939 (zeros Xmm (xmm_zero ty)) 4940 (cmp Xmm (x64_pcmpeq (vec_int_type ty) src zeros))) 4941 (CondResult.CC (x64_ptest cmp cmp) (CC.Z)))) 4942 4943;; Perform an appropriately-sized lane-wise comparison with zero. If the 4944;; result is all 0s then all of them are true because nothing was equal to 4945;; zero. 4946(rule (is_vall_true val @ (value_type ty)) 4947 (let ((lanes_with_zero Xmm (x64_pcmpeq (vec_int_type ty) val (xmm_zero ty))) 4948 (mask Gpr (x64_pmovmskb lanes_with_zero))) 4949 (CondResult.CC (x64_testl_mr mask mask) (CC.Z)))) 4950 4951;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4952 4953;; The Intel specification allows using both 32-bit and 64-bit GPRs as 4954;; destination for the "move mask" instructions. This is controlled by the REX.R 4955;; bit: "In 64-bit mode, the instruction can access additional registers when 4956;; used with a REX.R prefix. The default operand size is 64-bit in 64-bit mode" 4957;; (PMOVMSKB in IA Software Development Manual, vol. 2). This being the case, we 4958;; will always clear REX.W since its use is unnecessary (`OperandSize` is used 4959;; for setting/clearing REX.W) as we need at most 16 bits of output for 4960;; `vhigh_bits`. 4961 4962(rule (lower (vhigh_bits _ val @ (value_type (multi_lane 8 16)))) 4963 (x64_pmovmskb val)) 4964 4965(rule (lower (vhigh_bits _ val @ (value_type (multi_lane 32 4)))) 4966 (x64_movmskps val)) 4967 4968(rule (lower (vhigh_bits _ val @ (value_type (multi_lane 64 2)))) 4969 (x64_movmskpd val)) 4970 4971;; There is no x86 instruction for extracting the high bit of 16-bit lanes so 4972;; here we: 4973;; - duplicate the 16-bit lanes of `src` into 8-bit lanes: 4974;; PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...] 4975;; - use PMOVMSKB to gather the high bits; now we have duplicates, though 4976;; - shift away the bottom 8 high bits to remove the duplicates. 4977(rule (lower (vhigh_bits _ val @ (value_type (multi_lane 16 8)))) 4978 (let ((src Xmm val) 4979 (tmp Xmm (x64_packsswb src src)) 4980 (tmp Gpr (x64_pmovmskb tmp))) 4981 (x64_shrq_mi tmp 8))) 4982 4983;; Rules for `iconcat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4984 4985(rule (lower (iconcat _ lo @ (value_type $I64) hi)) 4986 (value_regs lo hi)) 4987 4988;; Rules for `isplit` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4989 4990(rule (lower (isplit _ val @ (value_type $I128))) 4991 (let ((regs ValueRegs val) 4992 (lo Reg (value_regs_get regs 0)) 4993 (hi Reg (value_regs_get regs 1))) 4994 (output_pair lo hi))) 4995 4996;; Rules for `tls_value` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 4997 4998(rule (lower (has_type (tls_model (TlsModel.ElfGd)) (tls_value _ (symbol_value_data name _ _)))) 4999 (elf_tls_get_addr name)) 5000 5001(rule (lower (has_type (tls_model (TlsModel.Macho)) (tls_value _ (symbol_value_data name _ _)))) 5002 (macho_tls_get_addr name)) 5003 5004(rule (lower (has_type (tls_model (TlsModel.Coff)) (tls_value _ (symbol_value_data name _ _)))) 5005 (coff_tls_get_addr name)) 5006 5007;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 5008 5009(rule 1 (lower (sqmul_round_sat _ qx @ (value_type $I16X8) qy)) 5010 (if-let true (has_ssse3)) 5011 (let ((src1 Xmm qx) 5012 (src2 Xmm qy) 5013 5014 (mask XmmMem (emit_u128_le_const 0x8000_8000_8000_8000_8000_8000_8000_8000)) 5015 (dst Xmm (x64_pmulhrsw src1 src2)) 5016 (cmp Xmm (x64_pcmpeqw dst mask))) 5017 (x64_pxor dst cmp))) 5018 5019;; This operation is defined in wasm as: 5020;; 5021;; S.SignedSaturate((x * y + 0x4000) >> 15) 5022;; 5023;; so perform all those operations here manually with a lack of the native 5024;; instruction. 5025(rule (lower (sqmul_round_sat _ qx @ (value_type $I16X8) qy)) 5026 (let ( 5027 (qx Xmm qx) 5028 (qy Xmm qy) 5029 ;; Multiply `qx` and `qy` generating 32-bit intermediate results. The 5030 ;; 32-bit results have their low-halves stored in `mul_lsb` and the 5031 ;; high halves are stored in `mul_msb`. These are then shuffled into 5032 ;; `mul_lo` and `mul_hi` which represent the low 4 multiplications 5033 ;; and the upper 4 multiplications. 5034 (mul_lsb Xmm (x64_pmullw qx qy)) 5035 (mul_msb Xmm (x64_pmulhw qx qy)) 5036 (mul_lo Xmm (x64_punpcklwd mul_lsb mul_msb)) 5037 (mul_hi Xmm (x64_punpckhwd mul_lsb mul_msb)) 5038 ;; Add the 0x4000 constant to all multiplications 5039 (val Xmm (x64_movdqu_load (emit_u128_le_const 0x00004000_00004000_00004000_00004000))) 5040 (mul_lo Xmm (x64_paddd mul_lo val)) 5041 (mul_hi Xmm (x64_paddd mul_hi val)) 5042 ;; Perform the right-shift by 15 to all multiplications 5043 (lo Xmm (x64_psrad mul_lo (xmi_imm 15))) 5044 (hi Xmm (x64_psrad mul_hi (xmi_imm 15))) 5045 ) 5046 ;; And finally perform a saturating 32-to-16-bit conversion. 5047 (x64_packssdw lo hi))) 5048 5049;; Rules for `x86_pmulhrsw` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 5050 5051(rule (lower (x86_pmulhrsw _ qx @ (value_type $I16X8) qy)) 5052 (if-let true (has_ssse3)) 5053 (x64_pmulhrsw qx qy)) 5054 5055;; Rules for `uunarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 5056 5057;; TODO: currently we only lower a special case of `uunarrow` needed to support 5058;; the translation of wasm's i32x4.trunc_sat_f64x2_u_zero operation. 5059;; https://github.com/bytecodealliance/wasmtime/issues/4791 5060;; 5061;; y = i32x4.trunc_sat_f64x2_u_zero(x) is lowered to: 5062;; MOVAPD xmm_y, xmm_x 5063;; XORPD xmm_tmp, xmm_tmp 5064;; MAXPD xmm_y, xmm_tmp 5065;; MINPD xmm_y, [wasm_f64x2_splat(4294967295.0)] 5066;; ROUNDPD xmm_y, xmm_y, 0x0B 5067;; ADDPD xmm_y, [wasm_f64x2_splat(0x1.0p+52)] 5068;; SHUFPS xmm_y, xmm_xmp, 0x88 5069(rule (lower (uunarrow _ (fcvt_to_uint_sat _ src @ (value_type $F64X2)) 5070 (vconst _ (u128_from_constant 0)))) 5071 (let ((src Xmm src) 5072 5073 ;; MOVAPD xmm_y, xmm_x 5074 ;; XORPD xmm_tmp, xmm_tmp 5075 (zeros Xmm (xmm_zero $F64X2)) 5076 (dst Xmm (x64_maxpd src zeros)) 5077 5078 ;; 4294967295.0 is equivalent to 0x41EFFFFFFFE00000 5079 (umax_mask XmmMem (emit_u128_le_const 0x41EFFFFFFFE00000_41EFFFFFFFE00000)) 5080 5081 ;; MINPD xmm_y, [wasm_f64x2_splat(4294967295.0)] 5082 (dst Xmm (x64_minpd dst umax_mask)) 5083 5084 ;; ROUNDPD xmm_y, xmm_y, 0x0B 5085 (dst Xmm (x64_round $F64X2 dst (RoundImm.RoundZero))) 5086 5087 ;; ADDPD xmm_y, [wasm_f64x2_splat(0x1.0p+52)] 5088 (uint_mask XmmMem (emit_u128_le_const 0x4330000000000000_4330000000000000)) 5089 5090 (dst Xmm (x64_addpd dst uint_mask))) 5091 5092 ;; SHUFPS xmm_y, xmm_xmp, 0x88 5093 (x64_shufps dst zeros 0x88))) 5094 5095;; Rules for `get_exception_handler_address` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 5096 5097(rule (lower (get_exception_handler_address _ (u64_from_imm64 idx) block)) 5098 (let ((succ_label MachLabel (block_exn_successor_label block idx))) 5099 (x64_label_address succ_label))) 5100 5101;; Rules for `nop` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 5102 5103(rule (lower (nop)) 5104 (invalid_reg)) 5105 5106;; Rules for `sequence_point` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 5107 5108(rule (lower (sequence_point)) 5109 (side_effect 5110 (x64_sequence_point))) 5111