1;; x86-64 instruction selection and CLIF-to-MachInst lowering.
2
;; Primary lowering entry point: maps a CLIF `Inst` to the `InstOutput`
;; describing the register(s) that hold the lowered instruction's results.
;; Declared `partial`, so it only succeeds when some `lower` rule matches.
(spec (lower arg) (provide (= result arg)))
(decl partial lower
      (Inst)
      InstOutput)

;; Branch flavour of `lower`. In addition to the instruction it receives the
;; slice of branch-target labels to use, and it produces `Unit`.
(decl partial lower_branch
      (Inst MachLabelSlice)
      Unit)
13
;;;; Rules for `iconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Types of 64 bits or fewer: materialize the immediate directly.
(rule (lower (has_type (fits_in_64 int_ty) (iconst _ (u64_from_imm64 bits))))
      (imm int_ty bits))

;; `i128`: the immediate fills the first (low) register; the second (high)
;; register is a zero constant.
(rule 1 (lower (has_type $I128 (iconst _ (u64_from_imm64 lo_bits))))
      (value_regs
        (imm $I64 lo_bits)
        (imm $I64 0)))
26
;;;; Rules for `f16const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The IEEE bit pattern is extracted as an integer and handed to `imm`.
(rule (lower (f16const _ (u16_from_ieee16 bits16)))
      (imm $F16 bits16))

;;;; Rules for `f32const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (f32const _ (u32_from_ieee32 bits32)))
      (imm $F32 bits32))

;;;; Rules for `f64const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (f64const _ (u64_from_ieee64 bits64)))
      (imm $F64 bits64))
41
;;;; Rules for `f128const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; General case: convert the constant to a vconst and load it into an XMM
;; register.
;; TODO use Inst::gen_constant() instead.
(rule (lower (f128const _ k))
      (x64_xmm_load_const $F128 (const_to_vconst k)))

;; Higher-priority special case: an all-zero constant is produced with
;; `xmm_zero` rather than a constant load.
(rule 1 (lower (f128const _ (u128_from_constant 0)))
      (xmm_zero $F128))
49
;;;; Rules for `iadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.

;; Fallback for 8- and 16-bit types: a plain `add`.
(rule -6 (lower (has_type (fits_in_16 narrow_ty) (iadd _ lhs rhs)))
      (x64_add narrow_ty lhs rhs))

;; Fallback for 32- and 64-bit types, which may fold several operations into
;; a single `lea`.
;;
;; For now this always produces a `lea` pseudo-instruction; what is actually
;; emitted may still be an `add` when the two are equivalent. The `emit.rs`
;; logic for `LoadEffectiveAddress` has the details.
(rule iadd_base_case_32_or_64_lea -5
      (lower (has_type (ty_32_or_64 wide_ty) (iadd _ lhs rhs)))
      (x64_lea wide_ty
               (to_amode_add (mem_flags_trusted) lhs rhs (zero_offset))))

;; These outrank both fallbacks: when one operand is a sinkable load it is
;; folded into the `add` itself. Addition is commutative, so either operand
;; is checked and the load always ends up as the second `x64_add` argument.
(rule -4 (lower (has_type (fits_in_64 int_ty)
                          (iadd _ lhs (sinkable_load rhs_load))))
      (x64_add int_ty lhs rhs_load))
(rule -3 (lower (has_type (fits_in_64 int_ty)
                          (iadd _ (sinkable_load lhs_load) rhs)))
      (x64_add int_ty rhs lhs_load))
78
;; SSE: one packed-add instruction per lane shape.

;; 8-bit lanes x 16.
(rule (lower (has_type (multi_lane 8 16) (iadd _ lhs rhs)))
      (x64_paddb lhs rhs))

;; 16-bit lanes x 8.
(rule (lower (has_type (multi_lane 16 8) (iadd _ lhs rhs)))
      (x64_paddw lhs rhs))

;; 32-bit lanes x 4.
(rule (lower (has_type (multi_lane 32 4) (iadd _ lhs rhs)))
      (x64_paddd lhs rhs))

;; 64-bit lanes x 2.
(rule (lower (has_type (multi_lane 64 2) (iadd _ lhs rhs)))
      (x64_paddq lhs rhs))
96
;; `i128`

;; Generic case: fetch the lo/hi registers of both operands and hand them to
;; `iadd128`.
(rule 1 (lower (has_type $I128 (iadd _ x y)))
      (let ((x_regs ValueRegs x)
            (y_regs ValueRegs y))
        (iadd128
          (value_regs_get_gpr x_regs 0)
          (value_regs_get_gpr x_regs 1)
          (value_regs_get_gpr y_regs 0)
          (value_regs_get_gpr y_regs 1))))

;; The rhs is an `iconcat`: its two halves are passed straight through as the
;; rhs lo/hi operands. The lhs is converted to registers once via `x_regs`.
(rule 2 (lower (has_type $I128 (iadd _ x (iconcat _ y_lo y_hi))))
        (let ((x_regs ValueRegs x))
          (iadd128 (value_regs_get_gpr x_regs 0)
                   (value_regs_get_gpr x_regs 1)
                   y_lo
                   y_hi)))

;; The rhs is a zero-extended `i64`: its high half is the immediate 0.
(rule 3 (lower (has_type $I128 (iadd _ x (uextend _ y @ (value_type $I64)))))
        (let ((x_regs ValueRegs x))
          (iadd128 (value_regs_get_gpr x_regs 0)
                   (value_regs_get_gpr x_regs 1)
                   y
                   (RegMemImm.Imm 0))))
114
;; Special case: a 128-bit `iadd` whose operands are both zero-extended
;; values. The high half of the result is then just the carry out of the low
;; add, so `setb` replaces the `adc`. In a sense this models `uadd_overflow`.
(rule 4 (lower (has_type $I128
                         (iadd _
                               (uextend _ lhs)
                               (uextend _ rhs @ (value_type $I64)))))
        (let ((lhs_gpr Gpr (extend_to_gpr lhs $I64 (ExtendKind.Zero)))
              (sum_and_carry ValueRegs
                (with_flags (x64_add_with_flags_paired $I64 lhs_gpr rhs)
                            (x64_setcc_paired (CC.B)))))
          ;; FIXME: ideally this `movzx` would run before the `add`, clearing
          ;; the destination with `xor %dst,%dst` so that `setb` only has to
          ;; write the low byte. Doing so would likely need a dedicated
          ;; pseudo-inst, which isn't quite worth it at this time.
          (value_regs (value_regs_get sum_and_carry 0)
                      (x64_movzx (ExtMode.BQ)
                                 (value_regs_get sum_and_carry 1)))))
131
;; 128-bit addition over operands already split into 64-bit halves.
;; Arguments, in order: lhs lo, lhs hi, rhs lo, rhs hi. The low halves are
;; combined with a flag-producing `add`, the high halves with `adc`.
(decl iadd128 (Gpr Gpr GprMemImm GprMemImm) ValueRegs)
(rule (iadd128 lhs_lo lhs_hi rhs_lo rhs_hi)
      (with_flags
        (x64_add_with_flags_paired $I64 lhs_lo rhs_lo)
        (x64_adc_paired $I64 lhs_hi rhs_hi)))
139
;;;; Helpers for `*_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Pairs a flag-producing instruction with a `setcc` on `cc` and returns the
;; two resulting registers as a two-value `InstOutput`.
(decl construct_overflow_op (CC ProducesFlags) InstOutput)
(rule (construct_overflow_op cc producer)
      (let ((regs ValueRegs (with_flags producer (x64_setcc_paired cc))))
        (output_pair (value_regs_get regs 0)
                     (value_regs_get regs 1))))

;; Convenience wrapper: builds the flag producer from an ALU op applied to
;; `src1`/`src2` at type `ty`, then defers to `construct_overflow_op`.
(decl construct_overflow_op_alu (Type CC ProduceFlagsOp Gpr GprMemImm) InstOutput)
(rule (construct_overflow_op_alu ty cc alu_op src1 src2)
      (construct_overflow_op cc
                             (x64_produce_flags alu_op ty src1 src2)))
152
;; Builds a three-instruction flag chain for a 128-bit overflow operation:
;;
;;   alu_<op1> x_lo, y_lo
;;   alu_<op2> x_hi, y_hi
;;   set<cc> r8
(decl construct_overflow_op_alu_128 (CC ProduceFlagsOp ChainFlagsOp Value Value) InstOutput)
(rule (construct_overflow_op_alu_128 cc op1 op2 x y)
      (let (;; lo/hi registers of `x`.
            (lhs ValueRegs x)
            (lhs_lo Gpr (value_regs_get_gpr lhs 0))
            (lhs_hi Gpr (value_regs_get_gpr lhs 1))
            ;; lo/hi registers of `y`.
            (rhs ValueRegs y)
            (rhs_lo Gpr (value_regs_get_gpr rhs 0))
            (rhs_hi Gpr (value_regs_get_gpr rhs 1))
            ;; Low op produces flags, high op consumes and re-produces them,
            ;; and the final `setcc` consumes them.
            (low_op ProducesFlags (x64_produce_flags op1 $I64 lhs_lo rhs_lo))
            (high_op ConsumesAndProducesFlags (x64_chain_flags op2 $I64 lhs_hi rhs_hi))
            (flag_op ConsumesFlags (x64_setcc_paired cc))
            (chained MultiReg (with_flags_chained low_op high_op flag_op)))
        (multi_reg_to_pair_and_single chained)))
173
;;;; Rules for `uadd_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 64 bits or fewer: `add`, overflow reported through condition `B`.
(rule 1 (lower (uadd_overflow _ lhs rhs @ (value_type (fits_in_64 int_ty))))
      (construct_overflow_op_alu int_ty (CC.B) (ProduceFlagsOp.Add) lhs rhs))

;; i128: `add` on the low halves chained into `adc` on the high halves.
(rule 0 (lower (uadd_overflow _ lhs rhs @ (value_type $I128)))
        (construct_overflow_op_alu_128
          (CC.B) (ProduceFlagsOp.Add) (ChainFlagsOp.Adc) lhs rhs))

;;;; Rules for `sadd_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Same shape as `uadd_overflow`, but overflow is condition `O`.
(rule 1 (lower (sadd_overflow _ lhs rhs @ (value_type (fits_in_64 int_ty))))
      (construct_overflow_op_alu int_ty (CC.O) (ProduceFlagsOp.Add) lhs rhs))

(rule 0 (lower (sadd_overflow _ lhs rhs @ (value_type $I128)))
        (construct_overflow_op_alu_128
          (CC.O) (ProduceFlagsOp.Add) (ChainFlagsOp.Adc) lhs rhs))

;;;; Rules for `usub_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 64 bits or fewer: `sub`, overflow reported through condition `B`.
(rule 1 (lower (usub_overflow _ lhs rhs @ (value_type (fits_in_64 int_ty))))
      (construct_overflow_op_alu int_ty (CC.B) (ProduceFlagsOp.Sub) lhs rhs))

;; i128: `sub` on the low halves chained into `sbb` on the high halves.
(rule 0 (lower (usub_overflow _ lhs rhs @ (value_type $I128)))
        (construct_overflow_op_alu_128
          (CC.B) (ProduceFlagsOp.Sub) (ChainFlagsOp.Sbb) lhs rhs))

;;;; Rules for `ssub_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Same shape as `usub_overflow`, but overflow is condition `O`.
(rule 1 (lower (ssub_overflow _ lhs rhs @ (value_type (fits_in_64 int_ty))))
      (construct_overflow_op_alu int_ty (CC.O) (ProduceFlagsOp.Sub) lhs rhs))

(rule 0 (lower (ssub_overflow _ lhs rhs @ (value_type $I128)))
        (construct_overflow_op_alu_128
          (CC.O) (ProduceFlagsOp.Sub) (ChainFlagsOp.Sbb) lhs rhs))
206
;;;; Rules for `umul_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i8` uses the dedicated 8-bit multiply helper; the boolean argument is
;; `false` for the unsigned rules and `true` for the signed ones below.
(rule 2 (lower (umul_overflow _ lhs rhs @ (value_type $I8)))
      (construct_overflow_op
        (CC.O)
        (x64_mul8_with_flags_paired false lhs rhs)))

;; 16- to 64-bit types.
(rule 3 (lower (umul_overflow _ lhs rhs @ (value_type (ty_int_ref_16_to_64 int_ty))))
      (construct_overflow_op
        (CC.O)
        (x64_mul_lo_with_flags_paired int_ty false lhs rhs)))

;;;; Rules for `smul_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule 2 (lower (smul_overflow _ lhs rhs @ (value_type $I8)))
      (construct_overflow_op
        (CC.O)
        (x64_mul8_with_flags_paired true lhs rhs)))

(rule 3 (lower (smul_overflow _ lhs rhs @ (value_type (ty_int_ref_16_to_64 int_ty))))
      (construct_overflow_op
        (CC.O)
        (x64_mul_lo_with_flags_paired int_ty true lhs rhs)))
222
;;;; Rules for `sadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Only the 8x16 and 16x8 lane shapes have rules here.
(rule (lower (has_type (multi_lane 8 16) (sadd_sat _ lhs rhs)))
      (x64_paddsb lhs rhs))

(rule (lower (has_type (multi_lane 16 8) (sadd_sat _ lhs rhs)))
      (x64_paddsw lhs rhs))

;;;; Rules for `uadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (multi_lane 8 16) (uadd_sat _ lhs rhs)))
      (x64_paddusb lhs rhs))

(rule (lower (has_type (multi_lane 16 8) (uadd_sat _ lhs rhs)))
      (x64_paddusw lhs rhs))
242
;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.

;; Plain `sub` of the two operands.
(rule -3 (lower (has_type (fits_in_64 int_ty) (isub _ lhs rhs)))
      (x64_sub int_ty lhs rhs))

;; SSE: one packed-subtract instruction per lane shape.

(rule (lower (has_type (multi_lane 8 16) (isub _ lhs rhs)))
      (x64_psubb lhs rhs))

(rule (lower (has_type (multi_lane 16 8) (isub _ lhs rhs)))
      (x64_psubw lhs rhs))

(rule (lower (has_type (multi_lane 32 4) (isub _ lhs rhs)))
      (x64_psubd lhs rhs))

(rule (lower (has_type (multi_lane 64 2) (isub _ lhs rhs)))
      (x64_psubq lhs rhs))
269
;; `i128`

;; Generic case: fetch the lo/hi registers of both operands and hand them to
;; `isub128`.
(rule 1 (lower (has_type $I128 (isub _ x y)))
      (let ((x_regs ValueRegs x)
            (y_regs ValueRegs y))
        (isub128
          (value_regs_get_gpr x_regs 0)
          (value_regs_get_gpr x_regs 1)
          (value_regs_get_gpr y_regs 0)
          (value_regs_get_gpr y_regs 1))))

;; The rhs is an `iconcat`: its two halves are passed straight through as the
;; rhs lo/hi operands. The lhs is converted to registers once via `x_regs`.
(rule 2 (lower (has_type $I128 (isub _ x (iconcat _ y_lo y_hi))))
        (let ((x_regs ValueRegs x))
          (isub128 (value_regs_get_gpr x_regs 0)
                   (value_regs_get_gpr x_regs 1)
                   y_lo
                   y_hi)))

;; The rhs is a zero-extended `i64`: its high half is the immediate 0.
(rule 3 (lower (has_type $I128 (isub _ x (uextend _ y @ (value_type $I64)))))
        (let ((x_regs ValueRegs x))
          (isub128 (value_regs_get_gpr x_regs 0)
                   (value_regs_get_gpr x_regs 1)
                   y
                   (RegMemImm.Imm 0))))
287
;; 128-bit subtraction over operands already split into 64-bit halves.
;; Arguments, in order: lhs lo, lhs hi, rhs lo, rhs hi. The low halves are
;; combined with a flag-producing `sub`, the high halves with `sbb`.
(decl isub128 (Gpr Gpr GprMemImm GprMemImm) ValueRegs)
(rule (isub128 lhs_lo lhs_hi rhs_lo rhs_hi)
      (with_flags
        (x64_sub_with_flags_paired $I64 lhs_lo rhs_lo)
        (x64_sbb_paired $I64 lhs_hi rhs_hi)))
295
;;;; Rules for `ssub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Only the 8x16 and 16x8 lane shapes have rules here.
(rule (lower (has_type (multi_lane 8 16) (ssub_sat _ lhs rhs)))
      (x64_psubsb lhs rhs))

(rule (lower (has_type (multi_lane 16 8) (ssub_sat _ lhs rhs)))
      (x64_psubsw lhs rhs))

;;;; Rules for `usub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (multi_lane 8 16) (usub_sat _ lhs rhs)))
      (x64_psubusb lhs rhs))

(rule (lower (has_type (multi_lane 16 8) (usub_sat _ lhs rhs)))
      (x64_psubusw lhs rhs))
315
;;;; Rules for `band` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Scalar types accepted by `ty_int_ref_scalar_64`.

;; Baseline: `and` of the two operands.
(rule 0 (lower (has_type int_ty (band _ lhs rhs)))
      (if (ty_int_ref_scalar_64 int_ty))
      (x64_and int_ty lhs rhs))

;; The baseline rule already copes with an immediate or sinkable load on the
;; rhs. The next two rules cover the same shapes on the lhs by swapping the
;; operands.

(rule 1 (lower (has_type int_ty (band _ (sinkable_load lhs_load) rhs)))
      (if (ty_int_ref_scalar_64 int_ty))
      (x64_and int_ty rhs lhs_load))

(rule 2 (lower (has_type int_ty (band _ (simm32_from_value lhs_imm) rhs)))
      (if (ty_int_ref_scalar_64 int_ty))
      (x64_and int_ty rhs lhs_imm))

;; f32 and f64

(rule 5 (lower (has_type (ty_scalar_float float_ty) (band _ lhs rhs)))
      (sse_and float_ty lhs rhs))
340
;; SSE.

;; Chooses the XMM bitwise-and instruction for a type: `andps` for
;; `F32X4`/`F32`, `andpd` for `F64X2`/`F64`, and (at lower priority) `pand`
;; for any other multi-lane type.
(decl sse_and (Type Xmm XmmMem) Xmm)
(rule (sse_and $F32X4 a b) (x64_andps a b))
(rule (sse_and $F64X2 a b) (x64_andpd a b))
(rule (sse_and $F32 a b)   (x64_andps a b))
(rule (sse_and $F64 a b)   (x64_andpd a b))
(rule -1 (sse_and (multi_lane _bits _lanes) a b)
      (x64_pand a b))

;; Vector `band` defers to `sse_and`.
(rule 6 (lower (has_type vec_ty @ (multi_lane _bits _lanes) (band _ lhs rhs)))
      (sse_and vec_ty lhs rhs))
353
;; `i128`.

;; 128-bit `and`: the lo halves and the hi halves are and-ed independently
;; with two 64-bit `and`s.
(decl and_i128 (ValueRegs ValueRegs) ValueRegs)
(rule (and_i128 x y)
      (let ((lhs ValueRegs x)
            (lhs_lo Gpr (value_regs_get_gpr lhs 0))
            (lhs_hi Gpr (value_regs_get_gpr lhs 1))
            (rhs ValueRegs y)
            (rhs_lo Gpr (value_regs_get_gpr rhs 0))
            (rhs_hi Gpr (value_regs_get_gpr rhs 1)))
        (value_gprs
          (x64_and $I64 lhs_lo rhs_lo)
          (x64_and $I64 lhs_hi rhs_hi))))

(rule 7 (lower (has_type $I128 (band _ lhs rhs)))
      (and_i128 lhs rhs))
369
;; Dedicated lowerings for `(band x (bnot y))`. This shape is also what
;; Cranelift's `band_not` instruction turns into, since it is legalized into
;; the simpler forms early on.

;; Chooses the XMM and-not instruction for a type: `andnps` for `F32X4`,
;; `andnpd` for `F64X2`, and (at lower priority) `pandn` for any other
;; multi-lane type.
(decl sse_and_not (Type Xmm XmmMem) Xmm)
(rule (sse_and_not $F32X4 a b) (x64_andnps a b))
(rule (sse_and_not $F64X2 a b) (x64_andnpd a b))
(rule -1 (sse_and_not (multi_lane _bits _lanes) a b)
      (x64_pandn a b))

;; Note the flipping of operands below, as we're matching
;;
;;   (band x (bnot y))
;;
;; while x86 does
;;
;;   pandn(x, y) = and(not(x), y)
(rule 8 (lower (has_type vec_ty @ (multi_lane _bits _lane)
                         (band _ kept (bnot _ inverted))))
      (sse_and_not vec_ty inverted kept))
(rule 9 (lower (has_type vec_ty @ (multi_lane _bits _lane)
                         (band _ (bnot _ inverted) kept)))
      (sse_and_not vec_ty inverted kept))

;; Scalar variant, available only with BMI1.
(rule 10 (lower (has_type int_ty (band _ kept (bnot _ inverted))))
      (if (ty_int_ref_scalar_64 int_ty))
      (if-let true (has_bmi1))
      ;; `andn` inverts its first argument.
      (x64_andn int_ty inverted kept))
(rule 11 (lower (has_type int_ty (band _ (bnot _ inverted) kept)))
      (if (ty_int_ref_scalar_64 int_ty))
      (if-let true (has_bmi1))
      (x64_andn int_ty inverted kept))
400
;; Specialization of `blsr` for BMI1

;; Matches a value computed as "something minus one" and yields that
;; something. Three spellings are recognized: `isub v 1`, `iadd v -1`, and
;; `iadd -1 v`.
(decl pure partial val_minus_one (Value) Value)
(rule 0 (val_minus_one (isub _ base (u64_from_iconst 1))) base)
(rule 0 (val_minus_one (iadd _ base (i64_from_iconst -1))) base)
(rule 1 (val_minus_one (iadd _ (i64_from_iconst -1) base)) base)

;; `band v (v - 1)`, with the decremented value on either side, becomes
;; `blsr v`. The second `if-let` re-uses the already-bound `v`, so it only
;; succeeds when `val_minus_one` yields that same value.
(rule 12 (lower (has_type (ty_32_or_64 int_ty) (band _ v dec)))
         (if-let true (has_bmi1))
         (if-let v (val_minus_one dec))
         (x64_blsr int_ty v))
(rule 13 (lower (has_type (ty_32_or_64 int_ty) (band _ dec v)))
         (if-let true (has_bmi1))
         (if-let v (val_minus_one dec))
         (x64_blsr int_ty v))
416
;; Specialization of `blsi` for BMI1

;; `band (ineg v) v` and `band v (ineg v)` both become `blsi v`. The variable
;; appears twice in each pattern, so both occurrences must be the same value.
(rule 14 (lower (has_type (ty_32_or_64 int_ty) (band _ (ineg _ v) v)))
         (if-let true (has_bmi1))
         (x64_blsi int_ty v))
(rule 15 (lower (has_type (ty_32_or_64 int_ty) (band _ v (ineg _ v))))
         (if-let true (has_bmi1))
         (x64_blsi int_ty v))
425
;; Specialization of `bzhi` for BMI2
;;
;; The `bzhi` instruction zeroes the bits of its first operand starting at
;; the bit index given by its second operand. It is pattern-matched here as a
;; `band` against a mask built as `(1 << index) - 1`, i.e. a mask that is
;; `index` bits wide. When the index exceeds the bit-width of the type,
;; `bzhi` and `ishl` disagree, so the index is first masked with an `and`
;; against `ty_bits - 1` to match the semantics of Cranelift's `ishl`.

(rule 16 (lower (has_type (ty_32_or_64 int_ty) (band _ val mask)))
         (if-let true (has_bmi2))
         (if-let (ishl _ (u64_from_iconst 1) index) (val_minus_one mask))
         (x64_bzhi int_ty
                   val
                   (x64_and int_ty
                            index
                            (RegMemImm.Imm (u32_wrapping_sub (ty_bits int_ty) 1)))))
439
;;;; Rules for `bor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Scalar types accepted by `ty_int_ref_scalar_64`.

;; Baseline: `or` of the two operands.
(rule 0 (lower (has_type int_ty (bor _ lhs rhs)))
      (if (ty_int_ref_scalar_64 int_ty))
      (x64_or int_ty lhs rhs))

;; The baseline rule already copes with an immediate or sinkable load on the
;; rhs. The next two rules cover the same shapes on the lhs by swapping the
;; operands.

(rule 1 (lower (has_type int_ty (bor _ (sinkable_load lhs_load) rhs)))
      (if (ty_int_ref_scalar_64 int_ty))
      (x64_or int_ty rhs lhs_load))

(rule 2 (lower (has_type int_ty (bor _ (simm32_from_value lhs_imm) rhs)))
      (if (ty_int_ref_scalar_64 int_ty))
      (x64_or int_ty rhs lhs_imm))

;; f32 and f64

(rule 5 (lower (has_type (ty_scalar_float float_ty) (bor _ lhs rhs)))
      (sse_or float_ty lhs rhs))
464
;; SSE.

;; Chooses the XMM bitwise-or instruction for a type: `orps` for
;; `F32X4`/`F32`, `orpd` for `F64X2`/`F64`, and (at lower priority) `por` for
;; any other multi-lane type.
(decl sse_or (Type Xmm XmmMem) Xmm)
(rule (sse_or $F32X4 a b) (x64_orps a b))
(rule (sse_or $F64X2 a b) (x64_orpd a b))
(rule (sse_or $F32 a b)   (x64_orps a b))
(rule (sse_or $F64 a b)   (x64_orpd a b))
(rule -1 (sse_or (multi_lane _bits _lanes) a b)
      (x64_por a b))

;; Vector `bor` defers to `sse_or`.
(rule 6 (lower (has_type vec_ty @ (multi_lane _bits _lanes) (bor _ lhs rhs)))
      (sse_or vec_ty lhs rhs))
477
;; `i128`.

;; 128-bit `or`: the lo halves and the hi halves are or-ed independently with
;; two 64-bit `or`s.
(decl or_i128 (ValueRegs ValueRegs) ValueRegs)
(rule (or_i128 x y)
      (let ((lhs_lo Gpr (value_regs_get_gpr x 0))
            (lhs_hi Gpr (value_regs_get_gpr x 1))
            (rhs_lo Gpr (value_regs_get_gpr y 0))
            (rhs_hi Gpr (value_regs_get_gpr y 1)))
        (value_gprs
          (x64_or $I64 lhs_lo rhs_lo)
          (x64_or $I64 lhs_hi rhs_hi))))

(rule 7 (lower (has_type $I128 (bor _ lhs rhs)))
      (or_i128 lhs rhs))
491
;; Dedicated lowerings that produce `shld`.
;;
;; `shld` shifts a value left while shifting in bits from a second register.
;; The pattern recognized here is the bit-op spelling of that: an `ishl` and a
;; `ushr` by constant amounts, or-ed together in either order, where the two
;; amounts are both non-zero and sum to the bit-width of the type.
(rule 8 (lower (has_type (ty_int_ref_16_to_64 int_ty)
                         (bor _
                              (ishl _ shl_val (u8_from_iconst shl_amt))
                              (ushr _ shr_val (u8_from_iconst shr_amt)))))
  (if-let true (u64_eq (ty_bits int_ty) (u64_wrapping_add shl_amt shr_amt)))
  (if-let true (u64_gt shl_amt 0))
  (if-let true (u64_gt shr_amt 0))
  (x64_shld int_ty shl_val shr_val shl_amt))
(rule 8 (lower (has_type (ty_int_ref_16_to_64 int_ty)
                         (bor _
                              (ushr _ shr_val (u8_from_iconst shr_amt))
                              (ishl _ shl_val (u8_from_iconst shl_amt)))))
  (if-let true (u64_eq (ty_bits int_ty) (u64_wrapping_add shl_amt shr_amt)))
  (if-let true (u64_gt shl_amt 0))
  (if-let true (u64_gt shr_amt 0))
  (x64_shld int_ty shl_val shr_val shl_amt))
509
510
;;;; Rules for `bxor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Scalar types accepted by `ty_int_ref_scalar_64`.

;; Baseline: `xor` of the two operands.
(rule 0 (lower (has_type int_ty (bxor _ lhs rhs)))
      (if (ty_int_ref_scalar_64 int_ty))
      (x64_xor int_ty lhs rhs))

;; The baseline rule already copes with an immediate or sinkable load on the
;; rhs. The next two rules cover the same shapes on the lhs by swapping the
;; operands.

(rule 1 (lower (has_type int_ty (bxor _ (sinkable_load lhs_load) rhs)))
      (if (ty_int_ref_scalar_64 int_ty))
      (x64_xor int_ty rhs lhs_load))

(rule 4 (lower (has_type int_ty (bxor _ (simm32_from_value lhs_imm) rhs)))
      (if (ty_int_ref_scalar_64 int_ty))
      (x64_xor int_ty rhs lhs_imm))

;; f32 and f64

(rule 5 (lower (has_type (ty_scalar_float float_ty) (bxor _ lhs rhs)))
      (x64_xor_vector float_ty lhs rhs))

;; SSE.

(rule 6 (lower (has_type vec_ty @ (multi_lane _bits _lanes) (bxor _ lhs rhs)))
      (x64_xor_vector vec_ty lhs rhs))

;; `i128`: the lo halves and the hi halves are xor-ed independently.

(rule 7 (lower (has_type $I128 (bxor _ x y)))
      (let ((lhs ValueRegs x)
            (lhs_lo Gpr (value_regs_get_gpr lhs 0))
            (lhs_hi Gpr (value_regs_get_gpr lhs 1))
            (rhs ValueRegs y)
            (rhs_lo Gpr (value_regs_get_gpr rhs 0))
            (rhs_hi Gpr (value_regs_get_gpr rhs 1)))
        (value_gprs
          (x64_xor $I64 lhs_lo rhs_lo)
          (x64_xor $I64 lhs_hi rhs_hi))))
552
;; Specialization of `blsmsk` for BMI1

;; `bxor v (v - 1)`, with the decremented value on either side, becomes
;; `blsmsk v`. The second `if-let` re-uses the already-bound `v`, so it only
;; succeeds when `val_minus_one` yields that same value.
(rule 8 (lower (has_type (ty_32_or_64 int_ty) (bxor _ v dec)))
        (if-let true (has_bmi1))
        (if-let v (val_minus_one dec))
        (x64_blsmsk int_ty v))
(rule 9 (lower (has_type (ty_32_or_64 int_ty) (bxor _ dec v)))
        (if-let true (has_bmi1))
        (if-let v (val_minus_one dec))
        (x64_blsmsk int_ty v))
563
;;;; Rules for `ishl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller: a single `shl`, with the amount supplied through
;; `put_masked_in_imm8_gpr`.

(rule -1 (lower (has_type (fits_in_64 int_ty) (ishl _ val shift)))
      (x64_shl int_ty
               val
               (put_masked_in_imm8_gpr shift int_ty)))
570
;; `i128`.

;; Shifts the 128-bit value held in `src` left by the amount in `amt`.
(decl shl_i128 (ValueRegs Gpr) ValueRegs)
(rule (shl_i128 src amt)
      (let (;; Split the 128-bit input into its two 64-bit halves.
            (lo Gpr (value_regs_get_gpr src 0))
            (hi Gpr (value_regs_get_gpr src 1))
            ;; Shift each half on its own.
            (lo_shl Gpr (x64_shl $I64 lo amt))
            (hi_shl Gpr (x64_shl $I64 hi amt))
            ;; `lo >> (64 - amt)`: the bits that cross over from the low half
            ;; into the high half.
            (spill Gpr (x64_shr $I64
                                lo
                                (x64_sub $I64 (imm $I64 64) amt)))
            (zeros Gpr (imm $I64 0))
            ;; Drop the crossing bits when the shift amount is a multiple of
            ;; 128 (`amt & 127` is zero).
            (spill_ Gpr (with_flags_reg (x64_testq_mi amt 127)
                                        (cmove $I64 (CC.Z) zeros spill)))
            ;; Fold the crossing bits into the shifted high half.
            (hi_merged Gpr (x64_or $I64 spill_ hi_shl)))
        ;; Assemble the result. If the amount is >= 64 (modulo 128) the low
        ;; half of the result is zero and the high half is the shifted low
        ;; half; otherwise the halves are used as computed above.
        (with_flags (x64_testq_mi amt 64)
                    (consumes_flags_concat
                     (cmove $I64 (CC.Z) lo_shl zeros)
                     (cmove $I64 (CC.Z) hi_merged lo_shl)))))

(rule (lower (has_type $I128 (ishl _ src amt)))
      ;; NB: only the low register of `amt` is needed, because the shift
      ;; amount is logically masked to the bit width of the value.
      (let ((amt_lo Gpr (lo_gpr amt)))
        (shl_i128 src amt_lo)))
610
;; SSE.

;; x86 has no 8x16 shift instruction (not even in later feature sets such as
;; AVX), so `ishl.i8x16` becomes a short sequence. Whether or not the amount
;; is an immediate, the plan is the same: do a 16x8 shift, then mask the
;; incorrect bits to zero.
(rule (lower (has_type $I8X16 (ishl _ src amt)))
      (let (;; Mask the amount so the shift wraps.
            (shift RegMemImm (mask_xmm_shift $I8X16 amt))
            ;; The 16x8 shift is only right for half of the lanes; the rest
            ;; are repaired by the mask below.
            (wide_shifted Xmm (x64_psllw src (mov_rmi_to_xmm shift)))
            (fixup_addr SyntheticAmode (ishl_i8x16_mask shift))
            (fixup Reg (x64_movdqu_load fixup_addr)))
        (sse_and $I8X16 wide_shifted (RegMem.Reg fixup))))
628
;; Yields the address of the mask that repairs the lanes the 16x8 shift got
;; wrong.
;;
;; Recursion: at most one step, turning the memory case into the register
;; case.
(decl rec ishl_i8x16_mask (RegMemImm) SyntheticAmode)

;; Constant shift amount: the mask can be chosen statically (at compile time)
;; and only that one mask is emitted.
(decl ishl_i8x16_mask_for_const (u32) SyntheticAmode)
(extern constructor ishl_i8x16_mask_for_const ishl_i8x16_mask_for_const)
(rule (ishl_i8x16_mask (RegMemImm.Imm shift))
      (ishl_i8x16_mask_for_const shift))

;; Dynamic shift amount: the whole mask table is emitted and the right entry
;; is located at run time. `lea` gives the table's base address, and the
;; entry is addressed as `base_address + amt << 4`.
(decl ishl_i8x16_mask_table () SyntheticAmode)
(extern constructor ishl_i8x16_mask_table ishl_i8x16_mask_table)
(rule (ishl_i8x16_mask (RegMemImm.Reg shift))
      (let ((table SyntheticAmode (ishl_i8x16_mask_table))
            (table_base Gpr (x64_leaq_rm table))
            (entry_offset Gpr (x64_shlq_mi shift 4)))
        (Amode.ImmRegRegShift 0
                              table_base
                              entry_offset
                              0
                              (mem_flags_trusted))))

;; Amount in memory: load it into a register and retry as the register case.
(rule (ishl_i8x16_mask (RegMemImm.Mem shift))
      (ishl_i8x16_mask (RegMemImm.Reg (x64_movq_rm shift))))
660
;; 16x8, 32x4, and 64x2 shifts each take one instruction once the shift
;; amount has been masked.

(rule (lower (has_type $I16X8 (ishl _ src amt)))
      (x64_psllw src
                 (mov_rmi_to_xmm (mask_xmm_shift $I16X8 amt))))

(rule (lower (has_type $I32X4 (ishl _ src amt)))
      (x64_pslld src
                 (mov_rmi_to_xmm (mask_xmm_shift $I32X4 amt))))

(rule (lower (has_type $I64X2 (ishl _ src amt)))
      (x64_psllq src
                 (mov_rmi_to_xmm (mask_xmm_shift $I64X2 amt))))
671
;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller: zero-extend the source into a GPR, then `shr` it with
;; the amount supplied through `put_masked_in_imm8_gpr`.

(rule -1 (lower (has_type (fits_in_64 int_ty) (ushr _ val shift)))
      (let ((val_zext Gpr (extend_to_gpr val int_ty (ExtendKind.Zero))))
        (x64_shr int_ty
                 val_zext
                 (put_masked_in_imm8_gpr shift int_ty))))
679
;; `i128`.

;; Logically shifts the 128-bit value held in `src` right by the amount in
;; `amt`.
(decl shr_i128 (ValueRegs Gpr) ValueRegs)
(rule (shr_i128 src amt)
      (let (;; Split the 128-bit input into its two 64-bit halves.
            (lo Gpr (value_regs_get_gpr src 0))
            (hi Gpr (value_regs_get_gpr src 1))
            ;; Shift each half on its own.
            (lo_shr Gpr (x64_shr $I64 lo amt))
            (hi_shr Gpr (x64_shr $I64 hi amt))
            ;; `hi << (64 - amt)`: the bits that cross over from the high half
            ;; into the low half.
            (spill Gpr (x64_shl $I64
                                hi
                                (x64_sub $I64 (imm $I64 64) amt)))
            ;; One shared zero register keeps register pressure down.
            (zeros Gpr (imm $I64 0))

            ;; Drop the crossing bits when the shift amount is a multiple of
            ;; 128 (`amt & 127` is zero).
            (spill_ Gpr (with_flags_reg (x64_testq_mi amt 127)
                                        (cmove $I64 (CC.Z) zeros spill)))
            ;; Fold the crossing bits into the shifted low half.
            (lo_merged Gpr (x64_or $I64 spill_ lo_shr)))
        ;; Assemble the result. If the amount is >= 64 (modulo 128) the high
        ;; half of the result is zero and the low half is what would
        ;; otherwise have been the high half.
        (with_flags (x64_testq_mi amt 64)
                    (consumes_flags_concat
                     (cmove $I64 (CC.Z) lo_merged hi_shr)
                     (cmove $I64 (CC.Z) hi_shr zeros)))))

(rule (lower (has_type $I128 (ushr _ src amt)))
      ;; NB: only the low register of `amt` is needed, because the shift
      ;; amount is logically masked to the bit width of the value.
      (let ((amt_lo Gpr (lo_gpr amt)))
        (shr_i128 src amt_lo)))
718
;; SSE.

;; x64 has no 8x16 shifts, so `ushr.i8x16` uses the same
;; 16x8-shift-then-mask approach as 8x16 `ishl`.
(rule (lower (has_type $I8X16 (ushr _ src amt)))
      (let (;; Mask the amount so the shift wraps.
            (shift RegMemImm (mask_xmm_shift $I8X16 amt))
            ;; The 16x8 shift is only right for half of the lanes; the rest
            ;; are repaired by the mask below.
            (wide_shifted Xmm (x64_psrlw src (mov_rmi_to_xmm shift))))
        (sse_and $I8X16
                 wide_shifted
                 (ushr_i8x16_mask shift))))
734
;; Yields the address of the mask that repairs the lanes the 16x8 shift got
;; wrong.
;;
;; Recursion: at most one step, turning the memory case into the register
;; case.
(decl rec ushr_i8x16_mask (RegMemImm) SyntheticAmode)

;; Constant shift amount: the mask can be chosen statically (at compile time)
;; and only that one mask is emitted.
(decl ushr_i8x16_mask_for_const (u32) SyntheticAmode)
(extern constructor ushr_i8x16_mask_for_const ushr_i8x16_mask_for_const)
(rule (ushr_i8x16_mask (RegMemImm.Imm shift))
      (ushr_i8x16_mask_for_const shift))

;; Dynamic shift amount: the whole mask table is emitted and the right entry
;; is located at run time. `lea` gives the table's base address, and the
;; entry is addressed as `base_address + amt << 4`.
(decl ushr_i8x16_mask_table () SyntheticAmode)
(extern constructor ushr_i8x16_mask_table ushr_i8x16_mask_table)
(rule (ushr_i8x16_mask (RegMemImm.Reg shift))
      (let ((table SyntheticAmode (ushr_i8x16_mask_table))
            (table_base Gpr (x64_leaq_rm table))
            (entry_offset Gpr (x64_shlq_mi shift 4)))
        (Amode.ImmRegRegShift 0
                              table_base
                              entry_offset
                              0
                              (mem_flags_trusted))))

;; Amount in memory: load it into a register and retry as the register case.
(rule (ushr_i8x16_mask (RegMemImm.Mem shift))
      (ushr_i8x16_mask (RegMemImm.Reg (x64_movq_rm shift))))
766
;; 16x8, 32x4, and 64x2 shifts each take one instruction once the shift
;; amount has been masked.

(rule (lower (has_type $I16X8 (ushr _ src amt)))
      (x64_psrlw src
                 (mov_rmi_to_xmm (mask_xmm_shift $I16X8 amt))))

(rule (lower (has_type $I32X4 (ushr _ src amt)))
      (x64_psrld src
                 (mov_rmi_to_xmm (mask_xmm_shift $I32X4 amt))))

(rule (lower (has_type $I64X2 (ushr _ src amt)))
      (x64_psrlq src
                 (mov_rmi_to_xmm (mask_xmm_shift $I64X2 amt))))
777
;; Mask the shift amount of a vector shift of type `vec_ty`, yielding either a
;; register or an immediate operand.
(decl mask_xmm_shift (Type Value) RegMemImm)

;; Dynamic amount: `and` it with `(shift_mask vec_ty)` in a GPR.
(rule (mask_xmm_shift vec_ty count)
      (gpr_to_reg (x64_and $I64
                           count
                           (RegMemImm.Imm (shift_mask vec_ty)))))

;; Constant amount (takes priority): the masked value becomes an immediate.
(rule 1 (mask_xmm_shift vec_ty (iconst _ bits))
      (RegMemImm.Imm (shift_amount_masked vec_ty bits)))
783
784;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
785
786;; `i64` and smaller.
787
(rule -1 (lower (has_type (fits_in_64 ty) (sshr _ value count)))
      ;; Sign-extend the operand via `extend_to_gpr` before the arithmetic
      ;; shift; the amount is masked by `put_masked_in_imm8_gpr`.
      (let ((extended Gpr (extend_to_gpr value ty (ExtendKind.Sign))))
        (x64_sar ty
                 extended
                 (put_masked_in_imm8_gpr count ty))))
791
792;; `i128`.
793
;; Arithmetic (sign-propagating) right shift of a 128-bit value, held as a
;; low/high pair of 64-bit registers, by the amount in the `amt` register.
;; Returns the shifted low/high pair.
(decl sar_i128 (ValueRegs Gpr) ValueRegs)
(rule (sar_i128 src amt)
      ;; Unpack the low/high halves of `src`.
      (let ((src_lo Gpr (value_regs_get_gpr src 0))
            (src_hi Gpr (value_regs_get_gpr src 1))
            ;; Do a shift of each half. NB: the low half uses an unsigned shift
            ;; because its MSB is not a sign bit.
            (lo_shifted Gpr (x64_shr $I64 src_lo amt))
            (hi_shifted Gpr (x64_sar $I64 src_hi amt))
            ;; `src_hi << (64 - amt)` are the bits shifted out of the bottom of
            ;; the high half, which carry over into the top of the low half.
            (carry Gpr (x64_shl $I64
                            src_hi
                            (x64_sub $I64
                                 (imm $I64 64)
                                 amt)))
            ;; Nullify the carry if we are shifting by a multiple of 128, i.e.
            ;; when the low seven bits of `amt` are all zero (`test amt, 127`
            ;; sets ZF), since nothing moves between the halves in that case.
            (carry_ Gpr (with_flags_reg (x64_testq_mi amt 127)
                                        (cmove $I64 (CC.Z) (imm $I64 0) carry)))
            ;; Merge the carry into the low half.
            (lo_shifted_ Gpr (x64_or $I64 lo_shifted carry_))
            ;; Get all sign bits: the high half shifted right arithmetically
            ;; by 63.
            (sign_bits Gpr (x64_sarq_mi src_hi 63)))
        ;; Combine the two shifted halves. However, if we are shifting by >= 64
        ;; (modulo 128), then the hi bits are all sign bits and the lo bits are
        ;; what would otherwise be our hi bits. Bit 6 of `amt` (`test amt, 64`)
        ;; distinguishes the two cases; one flag test feeds both `cmove`s.
        (with_flags (x64_testq_mi amt 64)
                    (consumes_flags_concat
                     (cmove $I64 (CC.Z) lo_shifted_ hi_shifted)
                     (cmove $I64 (CC.Z) hi_shifted sign_bits)))))
824
(rule (lower (has_type $I128 (sshr _ value count)))
      ;; The shift amount is logically masked to the value's bit width, so
      ;; only the low register of `count` is consulted.
      (let ((count_lo Gpr (lo_gpr count)))
        (sar_i128 value count_lo)))
830
831;; SSE.
832
833;; Since the x86 instruction set does not have an 8x16 shift instruction and the
834;; approach used for `ishl` and `ushr` cannot be easily used (the masks do not
835;; preserve the sign), we use a different approach here: separate the low and
836;; high lanes, shift them separately, and merge them into the final result.
837;;
;; Visually, this looks like the following, where `src.i8x16 = [s0, s1, ...,
;; s15]`:
;;
;;   lo.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]
;;   shifted_lo.i16x8 = shift each lane of `lo`
;;   hi.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
;;   shifted_hi.i16x8 = shift each lane of `hi`
845;;   result = [s0'', s1'', ..., s15'']
(rule (lower (has_type ty @ $I8X16 (sshr _ vec count @ (value_type count_ty))))
      (let ((bytes Xmm (put_in_xmm vec))
            ;; Wrap the shift amount so that it behaves modulo the lane width.
            (wrapped RegMemImm (mask_xmm_shift ty count))
            ;; Interleave the source with itself so that every byte occupies
            ;; both halves of a 16-bit lane (low eight bytes, then high eight).
            (lo Xmm (x64_punpcklbw bytes bytes))
            (hi Xmm (x64_punpckhbw bytes bytes))
            ;; Shift by eight extra bits: `psraw` then fills the upper bits of
            ;; each 16-bit lane appropriately, which lets the final `packsswb`
            ;; use only the high byte of each lane.
            (widened XmmMemImm (sshr_i8x16_bigger_shift count_ty wrapped))
            (shifted_lo Xmm (x64_psraw lo widened))
            (shifted_hi Xmm (x64_psraw hi widened)))
        (x64_packsswb shifted_lo shifted_hi)))
859
;; Add 8 to an (already masked) shift amount and hand it back in a form that
;; the packed-shift instructions accept. The `Type` argument is the type used
;; for the addition in the register and memory cases.
(decl sshr_i8x16_bigger_shift (Type RegMemImm) XmmMemImm)

;; Immediate: perform the addition at compile time.
(rule (sshr_i8x16_bigger_shift _ty (RegMemImm.Imm shift_imm))
      (xmm_mem_imm_new (RegMemImm.Imm (u32_wrapping_add shift_imm 8))))

;; Register: add 8 in a GPR, then move the sum into an XMM register.
(rule (sshr_i8x16_bigger_shift ty (RegMemImm.Reg shift_reg))
      (mov_rmi_to_xmm
        (RegMemImm.Reg (x64_add ty shift_reg (RegMemImm.Imm 8)))))

;; Memory: materialize 8 in a register and add the memory operand to it.
(rule (sshr_i8x16_bigger_shift ty shift_mem @ (RegMemImm.Mem _addr))
      (mov_rmi_to_xmm
        (RegMemImm.Reg (x64_add ty (imm ty 8) shift_mem))))
871
;; `sshr.i16x8` and `sshr.i32x4` map directly onto `psraw` / `psrad`. The only
;; care needed is that a shift amount living in a register must be moved to an
;; XMM register, which `mov_rmi_to_xmm` takes care of.

(rule (lower (has_type lane_ty @ $I16X8 (sshr _ vec count)))
      (x64_psraw vec
                 (mov_rmi_to_xmm (mask_xmm_shift lane_ty count))))

(rule (lower (has_type lane_ty @ $I32X4 (sshr _ vec count)))
      (x64_psrad vec
                 (mov_rmi_to_xmm (mask_xmm_shift lane_ty count))))
880
;; `sshr.i64x2` has no single x86 instruction in the older feature sets. When
;; AVX-512 (VL and F) is available `vpsraq` is used directly; otherwise the
;; helpers below perform a small dance built around an unsigned right shift
;; plus some extra ops.

;; AVX-512, constant amount: mask at compile time and use the immediate form.
(rule 3 (lower (has_type lane_ty @ $I64X2 (sshr _ vec (iconst _ bits))))
        (if-let true (has_avx512vl))
        (if-let true (has_avx512f))
        (x64_vpsraq_imm vec (shift_amount_masked lane_ty bits)))

;; AVX-512, dynamic amount: mask in a GPR, then move the amount into an XMM
;; register for `vpsraq`.
(rule 2 (lower (has_type lane_ty @ $I64X2 (sshr _ vec count)))
        (if-let true (has_avx512vl))
        (if-let true (has_avx512f))
        (let ((wrapped Gpr (x64_and $I64
                                    count
                                    (RegMemImm.Imm (shift_mask lane_ty)))))
          (x64_vpsraq vec (x64_movd_to_xmm wrapped))))

;; No AVX-512, constant amount (masked to 0..=63 at compile time).
(rule 1 (lower (has_type $I64X2 (sshr _ vec (u32_from_iconst bits))))
        (lower_i64x2_sshr_imm vec (u32_and bits 63)))

;; No AVX-512, dynamic amount (masked to 0..=63 at run time).
(rule (lower (has_type $I64X2 (sshr _ vec count)))
      (lower_i64x2_sshr_gpr vec
                            (x64_and $I64 count (RegMemImm.Imm 63))))
900
;; Lower `sshr.i64x2` by an immediate amount using only 32-bit arithmetic
;; shifts, 64-bit logical shifts and shuffles. In every rule the upper and
;; lower 32-bit halves of the two results are computed separately, gathered
;; with `pshufd`, and interleaved with `punpckldq`.
(decl lower_i64x2_sshr_imm (Xmm u32) Xmm)

;; Amount below 32: an arithmetic shift of the 32-bit lanes yields the upper
;; half of each result, and a logical shift of the 64-bit lanes yields the
;; lower half.
(rule 2 (lower_i64x2_sshr_imm lanes count)
        (if-let true (u64_lt count 32))
        (let ((upper_sra  Xmm (x64_psrad lanes (xmi_imm count)))
              (upper_pack Xmm (x64_pshufd upper_sra 0b11_10_11_01))
              (lower_srl  Xmm (x64_psrlq lanes (xmi_imm count)))
              (lower_pack Xmm (x64_pshufd lower_srl 0b11_10_10_00)))
          (x64_punpckldq lower_pack upper_pack)))

;; Amount of exactly 32: the lower halves are simply the original upper
;; halves, so the `psrlq` of the rule above is not needed.
(rule 1 (lower_i64x2_sshr_imm lanes 32)
        (let ((lower_pack Xmm (x64_pshufd lanes 0b11_10_11_01))
              (upper_sra  Xmm (x64_psrad lanes (xmi_imm 31)))
              (upper_pack Xmm (x64_pshufd upper_sra 0b11_10_11_01)))
          (x64_punpckldq lower_pack upper_pack)))

;; Amount above 32: one `psrad` by 31 produces the upper halves (pure sign
;; bits) and a second `psrad` by `count - 32` produces the lower halves from
;; the original upper halves.
(rule (lower_i64x2_sshr_imm lanes count)
      (if-let true (u64_lt 32 count))
      (let ((upper_sra  Xmm (x64_psrad lanes (xmi_imm 31)))
            (upper_pack Xmm (x64_pshufd upper_sra 0b11_10_11_01))
            (lower_sra  Xmm (x64_psrad lanes
                                       (xmi_imm (u32_wrapping_sub count 32))))
            (lower_pack Xmm (x64_pshufd lower_sra 0b11_10_11_01)))
        (x64_punpckldq lower_pack upper_pack)))
937
;; Lower `sshr.i64x2` by an amount held in a GPR. Callers mask the amount
;; beforehand, so it is guaranteed to be <= 63.
;;
;; The result is built from a logical shift: `sign_pos` is the mask from
;; `flip_high_bit_mask` shifted right by the same amount (presumably the
;; position where each lane's sign bit lands -- confirm against that helper).
;; Xor-ing it into the logically shifted value and then subtracting it again
;; generates the sign bits.
(decl lower_i64x2_sshr_gpr (Xmm Gpr) Xmm)
(rule (lower_i64x2_sshr_gpr lanes count)
      (let ((count_xmm    Xmm (x64_movq_to_xmm count))
            (top_bit      Xmm (flip_high_bit_mask $I64X2))
            (sign_pos     Xmm (x64_psrlq top_bit count_xmm))
            (logical      Xmm (x64_psrlq lanes count_xmm))
            (sign_flipped Xmm (x64_pxor sign_pos logical)))
        (x64_psubq sign_flipped sign_pos)))
952
953;;;; Rules for `rotl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
954
;; `i64` and smaller: since the whole register is rotated we can lean on x86's
;; own masking of the rotate amount. Constant amounts are masked up front.

(rule -1 (lower (has_type (fits_in_64 ty) (rotl _ value count)))
        (x64_rotl ty
                  value
                  (put_masked_in_imm8_gpr count ty)))


;; `i128`: rotate left by `n` is `(value << n) | (value >> (128 - n))`.

(rule (lower (has_type $I128 (rotl _ value count)))
      (let ((halves ValueRegs value)
            ;; The rotation amount is logically masked to the value's bit
            ;; width, so only the low register of `count` matters.
            (count_lo Gpr (lo_gpr count)))
        (or_i128 (shl_i128 halves count_lo)
                 (shr_i128 halves
                           (x64_sub $I64 (imm $I64 128) count_lo)))))
973
974;;;; Rules for `rotr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
975
;; `i64` and smaller: since the whole register is rotated we can lean on x86's
;; own masking of the rotate amount. Constant amounts are masked up front.

(rule -1 (lower (has_type (fits_in_64 ty) (rotr _ value count)))
        (x64_rotr ty
                  value
                  (put_masked_in_imm8_gpr count ty)))


;; `i128`: rotate right by `n` is `(value >> n) | (value << (128 - n))`.

(rule (lower (has_type $I128 (rotr _ value count)))
      (let ((halves ValueRegs value)
            ;; The rotation amount is logically masked to the value's bit
            ;; width, so only the low register of `count` matters.
            (count_lo Gpr (lo_gpr count)))
        (or_i128 (shr_i128 halves count_lo)
                 (shl_i128 halves
                           (x64_sub $I64 (imm $I64 128) count_lo)))))
994
995;;;; Rules for `ineg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
996
;; `i64` and smaller: a single `neg`.

(rule -1 (lower (has_type (fits_in_64 ty) (ineg _ value)))
      (x64_neg ty value))

;; `i128`: negate the low half, then compute `0 - hi - borrow` for the high
;; half using the flags produced by that negation.
(rule -2 (lower (has_type $I128 (ineg _ value)))
      (let ((halves ValueRegs value)
            (half_lo Gpr (value_regs_get_gpr halves 0))
            (half_hi Gpr (value_regs_get_gpr halves 1)))
        ;; A `neg` followed by a subtract-with-borrow from zero.
        (with_flags (x64_neg_paired $I64 half_lo)
                    (x64_sbb_paired $I64 (imm $I64 0) half_hi))))
1010
;; SSE: negation is a lane-wise subtraction from an all-zero vector.

(rule (lower (has_type $I8X16 (ineg _ lanes)))
      (x64_psubb (imm $I8X16 0) lanes))

(rule (lower (has_type $I16X8 (ineg _ lanes)))
      (x64_psubw (imm $I16X8 0) lanes))

(rule (lower (has_type $I32X4 (ineg _ lanes)))
      (x64_psubd (imm $I32X4 0) lanes))

(rule (lower (has_type $I64X2 (ineg _ lanes)))
      (x64_psubq (imm $I64X2 0) lanes))
1024
1025;;;; Rules for `avg_round` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1026
;; 8-bit lanes (x16): `pavgb`.
(rule (lower (has_type (multi_lane 8 16) (avg_round _ lhs rhs)))
      (x64_pavgb lhs rhs))

;; 16-bit lanes (x8): `pavgw`.
(rule (lower (has_type (multi_lane 16 8) (avg_round _ lhs rhs)))
      (x64_pavgw lhs rhs))
1034
1035;;;; Rules for `imul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1036
;; `i64` and smaller.

;; 8-bit base case: needs its own instruction encoding. The higher-priority
;; variant swaps the operands when the left one is a sinkable load, so that the
;; load ends up on the right.
(rule -8 (lower (has_type $I8 (imul _ lhs rhs)))
         (x64_mul8 false lhs rhs))
(rule -7 (lower (has_type $I8 (imul _ (sinkable_load lhs_load) rhs)))
         (x64_mul8 false rhs lhs_load))

;; 16-to-64-bit base cases, with the same operand swap for sinkable loads.
(rule -6 (lower (has_type (ty_int_ref_16_to_64 ty) (imul _ lhs rhs)))
         (x64_imul ty lhs rhs))
(rule -5 (lower (has_type (ty_int_ref_16_to_64 ty)
                          (imul _ (sinkable_load lhs_load) rhs)))
         (x64_imul ty rhs lhs_load))

;; A constant operand on either side that matches `i32_from_iconst` is lifted
;; out so that the 3-operand immediate form can be used.
(rule -4 (lower (has_type (ty_int_ref_16_to_64 ty)
                          (imul _ lhs (i32_from_iconst rhs_imm))))
         (x64_imul_imm ty lhs rhs_imm))
(rule -3 (lower (has_type (ty_int_ref_16_to_64 ty)
                          (imul _ (i32_from_iconst lhs_imm) rhs)))
         (x64_imul_imm ty rhs lhs_imm))

;; Widening 8-to-16-bit multiplication is a single instruction, because the
;; 8-bit multiply places both the high and low halves of its result in the same
;; register. The boolean passed to `x64_mul8` is `true` for the sign-extended
;; form and `false` for the zero-extended form.
(rule -2 (lower (has_type $I16 (imul _ (sextend _ lhs) (sextend _ rhs))))
  (x64_mul8 true lhs rhs))
(rule -2 (lower (has_type $I16 (imul _ (uextend _ lhs) (uextend _ rhs))))
  (x64_mul8 false lhs rhs))
1063
;; `i128`: split both operands into 64-bit halves and defer to `imul128`.

(rule 2 (lower (has_type $I128 (imul _ lhs rhs)))
      (let ((lhs_halves ValueRegs lhs)
            (rhs_halves ValueRegs rhs))
        (imul128 (value_regs_get_gpr lhs_halves 0)
                 (value_regs_get_gpr lhs_halves 1)
                 (value_regs_get_gpr rhs_halves 0)
                 (value_regs_get_gpr rhs_halves 1))))

;; When both operands are `iconcat`s the halves are already available as
;; separate values and can be passed along directly.
(rule 4 (lower (has_type $I128 (imul _ (iconcat _ lhs_lo lhs_hi)
                                     (iconcat _ rhs_lo rhs_hi))))
        (imul128 lhs_lo lhs_hi rhs_lo rhs_hi))
1077
;; 128-bit multiplication on operands that are already split into 64-bit
;; halves. Arguments are, in order: lhs low, lhs high, rhs low, rhs high.
;;
;; The truncated 128-bit product is:
;;   dst_lo = lhs_lo * rhs_lo
;;   dst_hi = umulhi(lhs_lo, rhs_lo) +
;;            lhs_lo * rhs_hi +
;;            lhs_hi * rhs_lo
;;
;; which is emitted as:
;;   cross_lh       = mul lhs_lo, rhs_hi
;;   cross_hl       = mul lhs_hi, rhs_lo
;;   cross_sum      = add cross_lh, cross_hl
;;   out_lo:wide_hi = widening unsigned mul lhs_lo, rhs_lo
;;   out_hi         = add cross_sum, wide_hi
;;   return (out_lo, out_hi)
(decl imul128 (Gpr Gpr GprMem GprMem) ValueRegs)
(rule (imul128 lhs_lo lhs_hi rhs_lo rhs_hi)
      (let (;; The two cross products; only their low 64 bits matter.
            (cross_lh Gpr (x64_imul $I64 lhs_lo rhs_hi))
            (cross_hl Gpr (x64_imul $I64 lhs_hi rhs_lo))
            (cross_sum Gpr (x64_add $I64 cross_lh cross_hl))
            ;; Full 64x64 -> 128 product of the two low halves.
            (wide ValueRegs (x64_mul $I64 false lhs_lo rhs_lo))
            (out_lo Gpr (value_regs_get_gpr wide 0))
            (wide_hi Gpr (value_regs_get_gpr wide 1))
            ;; Fold the cross products into the high half.
            (out_hi Gpr (x64_add $I64 cross_sum wide_hi)))
        (value_gprs out_lo out_hi)))
1112
;; The x64 `mul` and `imul` instructions take 64-bit operands and produce a
;; 128-bit result, which is exactly the semantics of widening two 64-bit inputs
;; to 128 bits and multiplying them. These cases therefore get simpler codegen:
;; zero-extended inputs use the unsigned form, sign-extended inputs the signed
;; form.
(rule 5 (lower (has_type $I128 (imul _ (uextend _ lhs @ (value_type $I64))
                                     (uextend _ rhs @ (value_type $I64)))))
        (x64_mul $I64 false lhs rhs))
(rule 5 (lower (has_type $I128 (imul _ (sextend _ lhs @ (value_type $I64))
                                     (sextend _ rhs @ (value_type $I64)))))
        (x64_mul $I64 true lhs rhs))
1123
;; SSE.

;; (No i8x16 multiply.)

;; 16x8: `pmullw`.
(rule (lower (has_type (multi_lane 16 8) (imul _ lhs rhs)))
      (x64_pmullw lhs rhs))

;; 32x4 with SSE4.1: `pmulld`.
(rule (lower (has_type (multi_lane 32 4) (imul _ lhs rhs)))
      (if-let true (has_sse41))
      (x64_pmulld lhs rhs))

;; 32x4 without `pmulld`: fall back to `pmuludq`, which performs a 32-bit
;; multiplication and stores the 64-bit result. Two of them are needed (one for
;; the even lanes, one for the odd lanes moved into position by `pshufd`); the
;; 64-bit products are truncated to 32 bits and woven back into place.
(rule -1 (lower (has_type (multi_lane 32 4) (imul _ lhs rhs)))
         (let ((lhs_even Xmm lhs)
               (rhs_even Xmm rhs)
               (lhs_odd Xmm (x64_pshufd lhs_even 0b00_11_00_01))
               (rhs_odd Xmm (x64_pshufd rhs_even 0b00_11_00_01))
               (prod_even Xmm (x64_pshufd (x64_pmuludq lhs_even rhs_even)
                                          0b00_00_10_00))
               (prod_odd Xmm (x64_pshufd (x64_pmuludq lhs_odd rhs_odd)
                                         0b00_00_10_00)))
           (x64_punpckldq prod_even prod_odd)))
1148
;; With AVX-512 (VL and DQ) `i64x2` multiplication is the single instruction
;; `vpmullq`.
(rule 3 (lower (has_type (multi_lane 64 2) (imul _ lhs rhs)))
      (if-let true (has_avx512vl))
      (if-let true (has_avx512dq))
      (x64_vpmullq lhs rhs))

;; Without it, treat each 64-bit lane A as a 32-bit upper half "Ah" and a
;; 32-bit lower half "Al" and perform the multiplication long-hand:
;;
;;    Ah Al
;; *  Bh Bl
;;    -----
;;    Al * Bl
;; + (Ah * Bl) << 32
;; + (Al * Bh) << 32
;;
;; That is, per lane:
;;
;;   A * B  = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32
;;
;; `pmuludq` reads only the lower 32 bits (`Al` or `Bl`) of each lane and
;; writes the product to the full 64 bits of the destination lane, so no shift
;; is required to isolate a lower half. Isolating an upper half does require a
;; shift: `Ah == A >> 32`.
(rule (lower (has_type (multi_lane 64 2)
                       (imul _ lhs rhs)))
      (let ((lhs_x Xmm lhs)
            (rhs_x Xmm rhs)
            ;; Ah * Bl
            (lhs_upper Xmm (x64_psrlq lhs_x (xmi_imm 32)))
            (upper_lower Xmm (x64_pmuludq lhs_upper rhs_x))
            ;; Al * Bh
            (rhs_upper Xmm (x64_psrlq rhs_x (xmi_imm 32)))
            (lower_upper Xmm (x64_pmuludq lhs_x rhs_upper))
            ;; (Ah * Bl + Al * Bh) << 32
            (cross Xmm (x64_paddq upper_lower lower_upper))
            (cross_shifted Xmm (x64_psllq cross (xmi_imm 32)))
            ;; Al * Bl
            (lower_lower Xmm (x64_pmuludq lhs_x rhs_x)))
        ;; Al * Bl + ((Ah * Bl + Al * Bh) << 32)
        (x64_paddq lower_lower cross_shifted)))
1196
;; Signed extended multiplies. For 16-bit inputs the low and high 16 bits of
;; every product come from `pmullw` and `pmulhw` and are interleaved into
;; 32-bit lanes. For 32-bit inputs the wanted lanes are moved into position
;; with `pshufd` and multiplied with `pmuldq` (SSE4.1).

;; `i32x4.extmul_high_i16x8_s`.
(rule 1 (lower (has_type (multi_lane 32 4)
                       (imul _ (swiden_high _ (and (value_type (multi_lane 16 8))
                                               lhs))
                             (swiden_high _ (and (value_type (multi_lane 16 8))
                                               rhs)))))
      (let ((lhs_x Xmm lhs)
            (rhs_x Xmm rhs)
            (prod_lo16 Xmm (x64_pmullw lhs_x rhs_x))
            (prod_hi16 Xmm (x64_pmulhw lhs_x rhs_x)))
        (x64_punpckhwd prod_lo16 prod_hi16)))

;; `i64x2.extmul_high_i32x4_s`.
(rule 1 (lower (has_type (multi_lane 64 2)
                       (imul _ (swiden_high _ (and (value_type (multi_lane 32 4))
                                               lhs))
                             (swiden_high _ (and (value_type (multi_lane 32 4))
                                               rhs)))))
      (if-let true (has_sse41))
      (let ((lhs_spread Xmm (x64_pshufd lhs 0xFA))
            (rhs_spread Xmm (x64_pshufd rhs 0xFA)))
        (x64_pmuldq lhs_spread rhs_spread)))

;; `i32x4.extmul_low_i16x8_s`.
(rule 1 (lower (has_type (multi_lane 32 4)
                       (imul _ (swiden_low _ (and (value_type (multi_lane 16 8))
                                              lhs))
                             (swiden_low _ (and (value_type (multi_lane 16 8))
                                              rhs)))))
      (let ((lhs_x Xmm lhs)
            (rhs_x Xmm rhs)
            (prod_lo16 Xmm (x64_pmullw lhs_x rhs_x))
            (prod_hi16 Xmm (x64_pmulhw lhs_x rhs_x)))
        (x64_punpcklwd prod_lo16 prod_hi16)))

;; `i64x2.extmul_low_i32x4_s`.
(rule 1 (lower (has_type (multi_lane 64 2)
                       (imul _ (swiden_low _ (and (value_type (multi_lane 32 4))
                                              lhs))
                             (swiden_low _ (and (value_type (multi_lane 32 4))
                                              rhs)))))
      (if-let true (has_sse41))
      (let ((lhs_spread Xmm (x64_pshufd lhs 0x50))
            (rhs_spread Xmm (x64_pshufd rhs 0x50)))
        (x64_pmuldq lhs_spread rhs_spread)))
1242
;; Unsigned extended multiplies. For 16-bit inputs the low and high 16 bits of
;; every product come from `pmullw` and `pmulhuw` and are interleaved into
;; 32-bit lanes. For 32-bit inputs the wanted lanes are moved into position
;; with `pshufd` and multiplied with `pmuludq`.

;; `i32x4.extmul_high_i16x8_u`.
(rule 1 (lower (has_type (multi_lane 32 4)
                       (imul _ (uwiden_high _ (and (value_type (multi_lane 16 8))
                                               lhs))
                             (uwiden_high _ (and (value_type (multi_lane 16 8))
                                               rhs)))))
      (let ((lhs_x Xmm lhs)
            (rhs_x Xmm rhs)
            (prod_lo16 Xmm (x64_pmullw lhs_x rhs_x))
            (prod_hi16 Xmm (x64_pmulhuw lhs_x rhs_x)))
        (x64_punpckhwd prod_lo16 prod_hi16)))

;; `i64x2.extmul_high_i32x4_u`.
(rule 1 (lower (has_type (multi_lane 64 2)
                       (imul _ (uwiden_high _ (and (value_type (multi_lane 32 4))
                                               lhs))
                             (uwiden_high _ (and (value_type (multi_lane 32 4))
                                               rhs)))))
      (let ((lhs_spread Xmm (x64_pshufd lhs 0xFA))
            (rhs_spread Xmm (x64_pshufd rhs 0xFA)))
        (x64_pmuludq lhs_spread rhs_spread)))

;; `i32x4.extmul_low_i16x8_u`.
(rule 1 (lower (has_type (multi_lane 32 4)
                       (imul _ (uwiden_low _ (and (value_type (multi_lane 16 8))
                                              lhs))
                             (uwiden_low _ (and (value_type (multi_lane 16 8))
                                              rhs)))))
      (let ((lhs_x Xmm lhs)
            (rhs_x Xmm rhs)
            (prod_lo16 Xmm (x64_pmullw lhs_x rhs_x))
            (prod_hi16 Xmm (x64_pmulhuw lhs_x rhs_x)))
        (x64_punpcklwd prod_lo16 prod_hi16)))

;; `i64x2.extmul_low_i32x4_u`.
(rule 1 (lower (has_type (multi_lane 64 2)
                       (imul _ (uwiden_low _ (and (value_type (multi_lane 32 4))
                                              lhs))
                             (uwiden_low _ (and (value_type (multi_lane 32 4))
                                              rhs)))))
      (let ((lhs_spread Xmm (x64_pshufd lhs 0x50))
            (rhs_spread Xmm (x64_pshufd rhs 0x50)))
        (x64_pmuludq lhs_spread rhs_spread)))
1286
1287;;;; Rules for `iabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1288
;; 8x16 with SSSE3: single-instruction `pabsb`.
(rule 1 (lower (has_type $I8X16 (iabs _ lanes)))
        (if-let true (has_ssse3))
        (x64_pabsb_a_or_avx lanes))

;; 8x16 fallback: take the unsigned minimum of each lane and its negation.
;; Applying `pminub` to signed inputs yields the positive signed result, which
;; is what is wanted here. The signed alternative, `pmaxsb`, is not available
;; until SSE4.1, by which point the single-instruction lowering above applies.
(rule (lower (has_type $I8X16 (iabs _ lanes)))
      (let ((orig Xmm lanes)
            (flipped Xmm (x64_psubb (xmm_zero $I8X16) orig)))
        (x64_pminub_a orig flipped)))

;; 16x8 with SSSE3: single-instruction `pabsw`.
(rule 1 (lower (has_type $I16X8 (iabs _ lanes)))
        (if-let true (has_ssse3))
        (x64_pabsw_a_or_avx lanes))

;; 16x8 fallback: signed maximum of each lane and its negation.
(rule (lower (has_type $I16X8 (iabs _ lanes)))
      (let ((orig Xmm lanes)
            (flipped Xmm (x64_psubw (xmm_zero $I16X8) orig)))
        (x64_pmaxsw_a orig flipped)))

;; 32x4 with SSSE3: single-instruction `pabsd`.
(rule 1 (lower (has_type $I32X4 (iabs _ lanes)))
        (if-let true (has_ssse3))
        (x64_pabsd_a_or_avx lanes))

;; 32x4 fallback: `sign_mask` is -1 for negative lanes and 0 otherwise
;; (arithmetic shift right by 31). Xor-ing with it leaves non-negative lanes
;; untouched and bit-flips negative ones; subtracting it then subtracts 0 for
;; non-negative lanes or adds one for negative lanes. A negative lane `x`
;; therefore becomes `!x + 1`, i.e. its negation.
(rule (lower (has_type $I32X4 (iabs _ lanes)))
      (let ((orig Xmm lanes)
            (sign_mask Xmm (x64_psrad orig (xmi_imm 31)))
            (maybe_flipped Xmm (x64_pxor orig sign_mask)))
        (x64_psubd maybe_flipped sign_mask)))
1331
;; With AVX-512 (VL and F) `iabs.i64x2` is the single instruction `vpabsq`.
(rule 2 (lower (has_type $I64X2 (iabs _ lanes)))
      (if-let true (has_avx512vl))
      (if-let true (has_avx512f))
      (x64_vpabsq lanes))

;; With SSE4.1, compute `0 - x` into a separate register and let `blendvpd`
;; choose per lane between it and the original, keyed on the MSB of the
;; negation: when that MSB is set (the negation is negative, i.e. `x` was
;; originally positive) the original lane is blended in.
(rule 1 (lower (has_type $I64X2 (iabs _ lanes)))
        (if-let true (has_sse41))
        (let ((orig Xmm lanes)
              (negated Xmm (x64_psubq (imm $I64X2 0) orig)))
          (x64_blendvpd negated orig negated)))
1347
;; If `blendvpd` isn't available either, build a mask of the negative lanes
;; with a shift and a shuffle: `psrad` by 31 turns every 32-bit element into a
;; copy of its sign bit, and the `pshufd` then copies the upper 32-bit element
;; of each 64-bit lane over both halves of that lane. Xor-ing with the mask and
;; subtracting it (`(x ^ m) - m`) negates exactly the negative lanes.
(rule (lower (has_type $I64X2 (iabs _ x)))
      (let ((x Xmm x)
            ;; Build the immediate with `xmi_imm`, like the other `psrad`
            ;; users in this file, rather than relying on an implicit
            ;; conversion from a raw `RegMemImm.Imm`.
            (signs Xmm (x64_psrad x (xmi_imm 31)))
            (signs Xmm (x64_pshufd signs 0b11_11_01_01))
            (xor_if_negative Xmm (x64_pxor x signs)))
        (x64_psubq xor_if_negative signs)))
1357
1358;; `i64` and smaller.
1359
;; Scalar `iabs`: negate the input and use the sign flag set by that negation
;; to pick between the original and the negated value with a `cmove`.
(rule -1 (lower (has_type (fits_in_64 ty) (iabs _ x)))
      (let ((src Gpr x)
            (neg ProducesFlags (x64_neg_paired ty src))
            ;; Manually extract the result from the neg, then ignore
            ;; it below, since we need to pass it into the cmove
            ;; before we pass the cmove to with_flags_reg.
            (neg_result Gpr (produces_flags_get_reg neg))
            ;; When the neg instruction sets the sign flag the negated value
            ;; is negative, so the cmove takes the original (non-negative)
            ;; value `src`; otherwise it takes `neg_result`.
            (cmove ConsumesFlags (cmove ty (CC.S) src neg_result)))
        ;; `produces_flags_ignore` keeps the neg only as the flag producer;
        ;; the value returned is the cmove's result.
        (with_flags_reg (produces_flags_ignore neg) cmove)))
1371
;; `i128`. Negate the low bits, `adc` to the higher bits, then negate high bits.
;; The sign flag of that last negation then selects, for both halves at once,
;; between the original value and the negated value.
(rule (lower (has_type $I128 (iabs _ x)))
      ;; Get the high/low registers for `x`.
      (let ((x_regs ValueRegs x)
            (x_lo Gpr (value_regs_get_gpr x_regs 0))
            (x_hi Gpr (value_regs_get_gpr x_regs 1))
            ;; Negate low bits, then add 0 with carry to high bits.
            ;; `neg_adc_vals` holds the negated low half at index 0 and the
            ;; carry-adjusted high half at index 1.
            (neg_lo ProducesFlags (x64_neg_paired $I64 x_lo))
            (adc_hi ConsumesFlags (x64_adc_paired $I64 x_hi (imm $I64 0)))
            (neg_adc_vals ValueRegs (with_flags neg_lo adc_hi))
            ;; Negate high bits. The flag-only variant is what gets paired
            ;; with the cmoves below; the negated value itself is read back
            ;; from `neg_hi` via `produces_flags_get_reg`.
            (neg_hi ProducesFlags (x64_neg_paired $I64 (value_regs_get neg_adc_vals 1)))
            (neg_hi_flag_only ProducesFlags (produces_flags_ignore neg_hi))
            ;; cmove based on sign flag from hi negation: if it is set, keep
            ;; the original halves, otherwise take the negated halves.
            (cmove_lo ConsumesFlags (cmove $I64 (CC.S) x_lo
                                     (value_regs_get neg_adc_vals 0)))
            (cmove_hi ConsumesFlags (cmove $I64 (CC.S) x_hi
                                     (produces_flags_get_reg neg_hi)))
            (cmoves ConsumesFlags (consumes_flags_concat cmove_lo cmove_hi)))
        (with_flags neg_hi_flag_only cmoves)))
1392
1393;;;; Rules for `fabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1394
;; Scalar `fabs`: clear the sign bit by and-ing with a constant that has every
;; bit set except the top one.
(rule (lower (has_type $F32 (fabs _ value)))
      (x64_andps value (imm $F32 0x7fffffff)))

(rule (lower (has_type $F64 (fabs _ value)))
      (x64_andpd value (imm $F64 0x7fffffffffffffff)))

;; `f32x4.abs`: the same mask for every lane is produced by shifting an
;; all-ones vector right by one bit per 32-bit lane.
(rule (lower (has_type $F32X4 (fabs _ lanes)))
      (x64_andps lanes
                 (x64_psrld (vector_all_ones) (xmi_imm 1))))

;; `f64x2.abs`: as above, with 64-bit lanes.
(rule (lower (has_type $F64X2 (fabs _ lanes)))
      (x64_andpd lanes
                 (x64_psrlq (vector_all_ones) (xmi_imm 1))))
1410
1411;;;; Rules for `fneg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1412
;; Scalar `fneg`: flip the sign bit by xor-ing with a constant that has only
;; the top bit set.
(rule (lower (has_type $F32 (fneg _ value)))
      (x64_xorps value (imm $F32 0x80000000)))

(rule (lower (has_type $F64 (fneg _ value)))
      (x64_xorpd value (imm $F64 0x8000000000000000)))

;; Vector `fneg`: the per-lane sign-bit mask is produced by shifting an
;; all-ones vector left until only the top bit of each lane remains.
(rule (lower (has_type $F32X4 (fneg _ lanes)))
      (x64_xorps lanes
                 (x64_pslld (vector_all_ones) (xmi_imm 31))))

(rule (lower (has_type $F64X2 (fneg _ lanes)))
      (x64_xorpd lanes
                 (x64_psllq (vector_all_ones) (xmi_imm 63))))
1426
1427;;;; Rules for `bmask` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1428
;; Lower `bmask`: arguments are the output type, the input type and the input
;; value's registers.
;;
;; Recursion: reduces 128-bit cases to 64-bit.
(decl rec lower_bmask (Type Type ValueRegs) ValueRegs)

;; Values that fit in a register
;;
;; Use the neg instruction on the input which sets the CF (carry) flag
;; to 0 if the input is 0 or 1 otherwise.
;; We then subtract a register from itself, which always gives a 0,
;; however use the carry flag from the previous negate to generate a -1 if it
;; was nonzero.
;;
;; neg in_reg
;; sbb out_reg, out_reg
(rule 0
      (lower_bmask (fits_in_64 out_ty) (fits_in_64 in_ty) val)
      (let ((reg Gpr (value_regs_get_gpr val 0))
            (out ValueRegs (with_flags
                  (x64_neg_paired in_ty reg)
                  (x64_sbb_paired out_ty reg reg))))
        ;; Extract only the output of the sbb instruction (index 1); index 0
        ;; is the result of the neg, which is not needed.
        (value_reg (value_regs_get out 1))))


;; If the input type is I128 we can `or` the registers, and recurse to the general case.
;; The `or` of the two halves is nonzero exactly when the 128-bit input is.
(rule 1
      (lower_bmask (fits_in_64 out_ty) $I128 val)
      (let ((lo Gpr (value_regs_get_gpr val 0))
            (hi Gpr (value_regs_get_gpr val 1))
            (mixed Gpr (x64_or $I64 lo hi)))
        (lower_bmask out_ty $I64 (value_reg mixed))))

;; If the output type is I128 we just duplicate the result of the I64 lowering
;; into both halves of the result.
(rule 2
      (lower_bmask $I128 in_ty val)
      (let ((res ValueRegs (lower_bmask $I64 in_ty val))
            (res Gpr (value_regs_get_gpr res 0)))
        (value_regs res res)))
1466
1467
;; Entry point: hand the output type, the input type and the input value to
;; `lower_bmask`, which does all the processing.
(rule (lower (has_type dst_ty (bmask _ value @ (value_type src_ty))))
      (lower_bmask dst_ty src_ty value))
1471
1472;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1473
;; `i64` and smaller (types accepted by `ty_int_ref_scalar_64`): a single
;; `not`.

(rule -2 (lower (has_type scalar_ty (bnot _ value)))
      (if (ty_int_ref_scalar_64 scalar_ty))
      (x64_not scalar_ty value))
1479
1480
1481;; `i128`.
1482
;; Bitwise complement of a 128-bit value: complement each 64-bit half
;; independently.
(decl not_i128 (Value) ValueRegs)
(rule (not_i128 value)
      (let ((halves ValueRegs value)
            (half_lo Gpr (value_regs_get_gpr halves 0))
            (half_hi Gpr (value_regs_get_gpr halves 1)))
        (value_gprs (x64_not $I64 half_lo)
                    (x64_not $I64 half_hi))))

(rule (lower (has_type $I128 (bnot _ value)))
      (not_i128 value))
1493
1494;; f32 and f64
1495
(rule -3 (lower (has_type (ty_scalar_float float_ty) (bnot _ operand)))
      (x64_xor_vector float_ty operand (vector_all_ones)))

;; Vector types: negating every bit is an xor against a value whose bits are
;; all ones.
(rule -1 (lower (has_type vec_ty @ (multi_lane _bits _lanes) (bnot _ operand)))
      (x64_xor_vector vec_ty operand (vector_all_ones)))
1503
1504;;;; Rules for `bitselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1505
(rule (lower (has_type vec_ty @ (multi_lane _bits _lanes)
                       (bitselect _ mask on_set on_clear)))
      ;; from_set   = and on_set, mask
      ;; from_clear = and_not mask, on_clear
      ;; result     = or from_clear, from_set
      (let ((mask_xmm Xmm mask)
            (from_set Xmm (sse_and vec_ty on_set mask_xmm))
            (from_clear Xmm (sse_and_not vec_ty mask_xmm on_clear)))
        (sse_or vec_ty from_clear from_set)))
1517
1518;; If every byte of the condition is guaranteed to be all ones or all zeroes,
1519;; we can use x64_blend.
(rule 1 (lower (has_type vec_ty @ (multi_lane _bits _lanes)
                         (bitselect _ mask on_set on_clear)))
      (if-let true (has_sse41))
      (if (all_ones_or_all_zeros mask))
      (x64_pblendvb on_clear on_set mask))
1527
;; Partial predicate: matches (yielding `true`) only for the value shapes
;; listed below, and fails to match for anything else.
(decl pure partial all_ones_or_all_zeros (Value) bool)

;; Result of a vector integer comparison.
(rule (all_ones_or_all_zeros
        (and (icmp _ _ _ _)
             (value_type (multi_lane _ _))))
      true)

;; Result of a vector floating-point comparison.
(rule (all_ones_or_all_zeros
        (and (fcmp _ _ _ _)
             (value_type (multi_lane _ _))))
      true)

;; A bitcast of a floating-point comparison whose own type is a vector.
(rule (all_ones_or_all_zeros
        (and (bitcast _ _ (fcmp _ _ _ _))
             (value_type (multi_lane _ _))))
      true)

;; A vector constant accepted by the external extractor below.
(rule (all_ones_or_all_zeros
        (vconst _ (vconst_all_ones_or_all_zeros)))
      true)

(decl pure vconst_all_ones_or_all_zeros () Constant)
(extern extractor vconst_all_ones_or_all_zeros vconst_all_ones_or_all_zeros)
1536
;; Specializations for floating-point compares to generate a `minp*` or a
1538;; `maxp*` instruction. These are equivalent to the wasm `f32x4.{pmin,pmax}`
1539;; instructions and how they're lowered into CLIF. Note the careful ordering
1540;; of all the operands here to ensure that the input CLIF matched is implemented
1541;; by the corresponding x64 instruction.
;; select(a < b, a, b)  ==>  min
(rule 2 (lower (has_type $F32X4
                         (bitselect _ (bitcast _ _ (fcmp _ (FloatCC.LessThan) a b)) a b)))
        (x64_minps a b))
(rule 2 (lower (has_type $F64X2
                         (bitselect _ (bitcast _ _ (fcmp _ (FloatCC.LessThan) a b)) a b)))
        (x64_minpd a b))

;; select(b < a, a, b)  ==>  max
(rule 3 (lower (has_type $F32X4
                         (bitselect _ (bitcast _ _ (fcmp _ (FloatCC.LessThan) b a)) a b)))
        (x64_maxps a b))
(rule 3 (lower (has_type $F64X2
                         (bitselect _ (bitcast _ _ (fcmp _ (FloatCC.LessThan) b a)) a b)))
        (x64_maxpd a b))
1551
1552;; Scalar rules
1553
;; 128-bit integers: (mask & on_set) | (~mask & on_clear), using the i128
;; helper terms.
(rule 3 (lower (has_type $I128 (bitselect _ mask on_set on_clear)))
      (let ((from_set ValueRegs (and_i128 mask on_set))
            (from_clear ValueRegs (and_i128 (not_i128 mask) on_clear)))
        (or_i128 from_set from_clear)))

;; Integers/references that fit in a single 64-bit register.
(rule 4 (lower (has_type (ty_int_ref_scalar_64 int_ty) (bitselect _ mask on_set on_clear)))
      (let ((from_set Gpr (x64_and int_ty mask on_set))
            (from_clear Gpr (x64_and int_ty (x64_not int_ty mask) on_clear)))
        (x64_or int_ty from_set from_clear)))

;; Scalar floats: the mask is inverted by xor-ing it with all ones.
(rule 5 (lower (has_type (ty_scalar_float float_ty) (bitselect _ mask on_set on_clear)))
      (let ((from_set Xmm (sse_and float_ty mask on_set))
            (inverted_mask Xmm (x64_xor_vector float_ty mask (vector_all_ones)))
            (from_clear Xmm (sse_and float_ty inverted_mask on_clear)))
        (sse_or float_ty from_set from_clear)))
1569
1570;;;; Rules for `blendv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1571
;; All three rules require SSE4.1 and pass the operands to the blend
;; instruction in the order (if_false, if_true, condition).

(rule (lower (has_type $I8X16 (blendv _ mask on_set on_clear)))
      (if-let true (has_sse41))
      (x64_pblendvb on_clear on_set mask))

(rule (lower (has_type $I32X4 (blendv _ mask on_set on_clear)))
      (if-let true (has_sse41))
      (x64_blendvps on_clear on_set mask))

(rule (lower (has_type $I64X2 (blendv _ mask on_set on_clear)))
      (if-let true (has_sse41))
      (x64_blendvpd on_clear on_set mask))
1586
1587;;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1588
;; i8x16.replace_lane with SSE4.1: `pinsrb`, with a second rule that matches a
;; sinkable load as the inserted value.
(rule 1 (lower (insertlane _ dst @ (value_type $I8X16) byte (u8_from_uimm8 lane)))
  (if-let true (has_sse41))
  (x64_pinsrb dst byte lane))
(rule 2 (lower (insertlane _ dst @ (value_type $I8X16) (sinkable_load_exact byte) (u8_from_uimm8 lane)))
  (if-let true (has_sse41))
  (x64_pinsrb dst byte lane))
1595
1596;; This lowering is particularly unoptimized and is mostly just here to work
1597;; rather than here to be fast. Requiring SSE 4.1 for the above lowering isn't
1598;; the end of the world hopefully as that's a pretty old instruction set, so
1599;; this is the "simplest" version that works on SSE2 for now.
1600;;
1601;; This lowering masks the original vector with a constant with all 1s except
1602;; for the "hole" where this value will get placed into, meaning the desired
1603;; lane is guaranteed as all 0s. Next the `val` is shuffled into this hole with
1604;; a few operations:
1605;;
1606;;  1. The `val` is zero-extended to 32-bits to guarantee the lower 32-bits
1607;;     are all defined.
1608;;  2. An arithmetic shift-left is used with the low two bits of `n`, the
1609;;     desired lane, to move the value into the right position within the 32-bit
1610;;     register value.
1611;;  3. The 32-bit register is moved with `movd` into an XMM register
1612;;  4. The XMM register, where all lanes are 0 except for the first lane which
1613;;     has the shifted value, is then shuffled with `pshufd` to move the
1614;;     shifted value to the correct and final lane. This uses the upper two
1615;;     bits of `n` to index the i32x4 lane that we're targeting.
1616;;
1617;; This all, laboriously, gets the `val` into the desired lane so it's then
1618;; `por`'d with the original vec-with-a-hole to produce the final result of the
1619;; insertion.
(rule (lower (insertlane _ vec @ (value_type $I8X16) val (u8_from_uimm8 n)))
      (let ((holed Xmm (x64_pand vec (insert_i8x16_lane_hole n)))
            ;; Zero-extend the byte.
            (widened Gpr (x64_movzx (ExtMode.BL) val))
            ;; Shift by 8 * (n & 3) bits to reach the byte position inside the
            ;; 32-bit value.
            (shifted Gpr (x64_shll_mi widened (u8_wrapping_shl (u8_and n 3) 3)))
            (in_lane0 Xmm (x64_movd_to_xmm shifted))
            ;; n >> 2 selects which 32-bit lane receives the shifted value.
            (placed Xmm (x64_pshufd in_lane0 (insert_i8x16_lane_pshufd_imm (u8_wrapping_shr n 2)))))
        (x64_por holed placed)))

;; Externally-defined constructor producing the `pand` mask constant for
;; lane `n`.
(decl insert_i8x16_lane_hole (u8) VCodeConstant)
(extern constructor insert_i8x16_lane_hole insert_i8x16_lane_hole)

;; `pshufd` immediate that routes source lane 0 to the requested 32-bit lane
;; and source lane 1 to every other lane.
(decl insert_i8x16_lane_pshufd_imm (u8) u8)
(rule (insert_i8x16_lane_pshufd_imm 0) 0b01_01_01_00)
(rule (insert_i8x16_lane_pshufd_imm 1) 0b01_01_00_01)
(rule (insert_i8x16_lane_pshufd_imm 2) 0b01_00_01_01)
(rule (insert_i8x16_lane_pshufd_imm 3) 0b00_01_01_01)
1635
1636
1637;; i16x8.replace_lane
(rule (lower (insertlane _ dst @ (value_type $I16X8) word (u8_from_uimm8 lane)))
  (x64_pinsrw dst word lane))
;; Same instruction, matching a sinkable load as the inserted value.
(rule 1 (lower (insertlane _ dst @ (value_type $I16X8) (sinkable_load_exact word) (u8_from_uimm8 lane)))
  (x64_pinsrw dst word lane))
1642
1643;; i32x4.replace_lane
;; With SSE4.1 any lane is a single `pinsrd`.
(rule 1 (lower (insertlane _ dst @ (value_type $I32X4) scalar (u8_from_uimm8 lane)))
        (if-let true (has_sse41))
        (x64_pinsrd dst scalar lane))

;; Lane 0: move the scalar into an XMM register and merge with `movss`.
(rule (lower (insertlane _ dst @ (value_type $I32X4) scalar (u8_from_uimm8 0)))
  (x64_movss_regmove dst (x64_movd_to_xmm scalar)))

;; Lane 1:
;; tmp    = [ vec[1] vec[0] val[1] val[0] ]
;; result = [ vec[3] vec[2] tmp[0] tmp[2] ]
(rule (lower (insertlane _ vec @ (value_type $I32X4) val (u8_from_uimm8 1)))
      (let ((scalar Xmm (x64_movd_to_xmm val))
            (dst Xmm vec)
            (tmp Xmm (x64_punpcklqdq scalar dst)))
        (x64_shufps tmp dst 0b11_10_00_10)))

;; Lane 2:
;; tmp    = [ vec[0] vec[3] val[0] val[0] ]
;; result = [ tmp[2] tmp[0] vec[1] vec[0] ]
(rule (lower (insertlane _ vec @ (value_type $I32X4) val (u8_from_uimm8 2)))
      (let ((scalar Xmm (x64_movd_to_xmm val))
            (dst Xmm vec)
            (tmp Xmm (x64_shufps scalar dst 0b00_11_00_00)))
        (x64_shufps dst tmp 0b10_00_01_00)))

;; Lane 3:
;; tmp    = [ vec[3] vec[2] val[1] val[0] ]
;; result = [ tmp[0] tmp[2] vec[1] vec[0] ]
(rule (lower (insertlane _ vec @ (value_type $I32X4) val (u8_from_uimm8 3)))
      (let ((scalar Xmm (x64_movd_to_xmm val))
            (dst Xmm vec)
            (tmp Xmm (x64_shufps scalar dst 0b11_10_01_00)))
        (x64_shufps dst tmp 0b00_10_01_00)))
1671
1672;; i64x2.replace_lane
;; With SSE4.1 either lane is a single `pinsrq`.
(rule 1 (lower (insertlane _ dst @ (value_type $I64X2) scalar (u8_from_uimm8 lane)))
        (if-let true (has_sse41))
        (x64_pinsrq dst scalar lane))
;; Otherwise move the scalar into an XMM register and merge it into lane 0
;; with `movsd`, or into lane 1 with `punpcklqdq`.
(rule (lower (insertlane _ dst @ (value_type $I64X2) scalar (u8_from_uimm8 0)))
      (x64_movsd_regmove dst (x64_movq_to_xmm scalar)))
(rule (lower (insertlane _ dst @ (value_type $I64X2) scalar (u8_from_uimm8 1)))
      (x64_punpcklqdq dst (x64_movq_to_xmm scalar)))

;; (i64x2.replace_lane 1) with a splat as source for lane 0 -- we can elide
;; the splat and just do a move. This turns out to be a common pattern when
;; constructing an i64x2 out of two i64s.
(rule 3 (lower (insertlane _ (has_type $I64X2 (splat _ low))
                           high
                           (u8_from_uimm8 1)))
        (if-let true (has_sse41))
        (x64_pinsrq (bitcast_gpr_to_xmm 64 low) high 1))
1689
;; f32x4.replace_lane: with SSE4.1 a sinkable load is handed straight to
;; `insertps`; every other case goes through `f32x4_insertlane`.
(rule 1 (lower (insertlane _ dst @ (value_type $F32X4) (sinkable_load loaded) (u8_from_uimm8 lane)))
  (if-let true (has_sse41))
  (x64_insertps dst loaded (sse_insertps_lane_imm lane)))
(rule (lower (insertlane _ dst @ (value_type $F32X4) scalar (u8_from_uimm8 lane)))
  (f32x4_insertlane dst scalar lane))
1695
;; Helper shared by the `insertlane` lowering above and by other lowerings.
(decl f32x4_insertlane (Xmm Xmm u8) Xmm)

;; f32x4.replace_lane, any lane, when SSE4.1 is available.
(rule 1 (f32x4_insertlane dst scalar lane)
        (if-let true (has_sse41))
        (x64_insertps dst scalar (sse_insertps_lane_imm lane)))

;; External rust code used to calculate the immediate value to `insertps`.
(decl sse_insertps_lane_imm (u8) u8)
(extern constructor sse_insertps_lane_imm sse_insertps_lane_imm)

;; f32x4.replace_lane 0
(rule (f32x4_insertlane dst scalar 0)
      (x64_movss_regmove dst scalar))

;; f32x4.replace_lane 1
;; tmp    = [ vec[1] vec[0] val[1] val[0] ]
;; result = [ vec[3] vec[2] tmp[0] tmp[2] ]
(rule (f32x4_insertlane dst scalar 1)
      (let ((low_pair Xmm (x64_movlhps scalar dst)))
        (x64_shufps low_pair dst 0b11_10_00_10)))

;; f32x4.replace_lane 2
;; tmp    = [ vec[0] vec[3] val[0] val[0] ]
;; result = [ tmp[2] tmp[0] vec[1] vec[0] ]
(rule (f32x4_insertlane dst scalar 2)
      (let ((high_pair Xmm (x64_shufps scalar dst 0b00_11_00_00)))
        (x64_shufps dst high_pair 0b10_00_01_00)))

;; f32x4.replace_lane 3
;; tmp    = [ vec[3] vec[2] val[1] val[0] ]
;; result = [ tmp[0] tmp[2] vec[1] vec[0] ]
(rule (f32x4_insertlane dst scalar 3)
      (let ((high_pair Xmm (x64_shufps scalar dst 0b11_10_01_00)))
        (x64_shufps dst high_pair 0b00_10_01_00)))
1732
1733;; f64x2.replace_lane 0
1734;;
1735;; Here the `movsd` instruction is used specifically to specialize moving
;; into the first lane where unlike above cases we're not using the lane
1737;; immediate as an immediate to the instruction itself.
(rule (lower (insertlane _ dst @ (value_type $F64X2) scalar (u8_from_uimm8 0)))
      (x64_movsd_regmove dst scalar))

;; f64x2.replace_lane 1
;;
;; `movlhps` is used here to specialize moving into the second lane; as with
;; lane 0, the lane index is matched by the rule rather than passed to the
;; instruction as an immediate.
(rule (lower (insertlane _ dst @ (value_type $F64X2) scalar (u8_from_uimm8 1)))
      (x64_movlhps dst scalar))
1748
1749;;;; Rules for `smin`, `smax`, `umin`, `umax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1750
1751;; `i64` and smaller.
1752
;; Compare the two operands once, then pick one of them with a conditional
;; move driven by `cc`.
(decl cmp_and_choose (Type CC Value Value) ValueRegs)
(rule (cmp_and_choose (fits_in_64 ty) cc x y)
      (let (;; Both operands are consumed twice (by the compare and by the
            ;; cmove), so they are forced into registers here. Load-op merging
            ;; is therefore not possible even when the CLIF-level use is
            ;; unique.
            (lhs Reg x)
            (rhs Reg y))
        (with_flags_reg (x64_cmp ty rhs lhs)
                        (cmove ty cc rhs lhs))))

;; Unsigned minimum.
(rule -1 (lower (has_type (fits_in_64 ty) (umin _ lhs rhs)))
      (cmp_and_choose ty (CC.B) lhs rhs))

;; Unsigned maximum.
(rule -1 (lower (has_type (fits_in_64 ty) (umax _ lhs rhs)))
      (cmp_and_choose ty (CC.NB) lhs rhs))

;; Signed minimum.
(rule -1 (lower (has_type (fits_in_64 ty) (smin _ lhs rhs)))
      (cmp_and_choose ty (CC.L) lhs rhs))

;; Signed maximum.
(rule -1 (lower (has_type (fits_in_64 ty) (smax _ lhs rhs)))
      (cmp_and_choose ty (CC.NL) lhs rhs))
1775
1776;; SSE helpers for determining if single-instruction lowerings are available.
1777
;; Each predicate below answers "is there a single-instruction lowering for
;; this vector type?". Two types are special-cased at priority 1: one is always
;; `true`, and `$I64X2` is always `false`. Every other type falls through to
;; the priority-0 rule, which returns the result of `has_sse41`.

;; Signed minimum: `$I16X8` always, `$I64X2` never, otherwise needs SSE4.1.
(decl pure has_pmins (Type) bool)
(rule 1 (has_pmins $I16X8) true)
(rule 1 (has_pmins $I64X2) false)
(rule (has_pmins _) (has_sse41))

;; Signed maximum: `$I16X8` always, `$I64X2` never, otherwise needs SSE4.1.
(decl pure has_pmaxs (Type) bool)
(rule 1 (has_pmaxs $I16X8) true)
(rule 1 (has_pmaxs $I64X2) false)
(rule (has_pmaxs _) (has_sse41))

;; Unsigned maximum: `$I8X16` always, `$I64X2` never, otherwise needs SSE4.1.
(decl pure has_pmaxu (Type) bool)
(rule 1 (has_pmaxu $I8X16) true)
(rule 1 (has_pmaxu $I64X2) false)
(rule (has_pmaxu _) (has_sse41))

;; Unsigned minimum: `$I8X16` always, `$I64X2` never, otherwise needs SSE4.1.
(decl pure has_pminu (Type) bool)
(rule 1 (has_pminu $I8X16) true)
(rule 1 (has_pminu $I64X2) false)
(rule (has_pminu _) (has_sse41))
1797
1798;; SSE `smax`.
1799
(rule (lower (has_type (ty_vec128 ty) (smax _ lhs rhs)))
      (lower_vec_smax ty lhs rhs))

;; Lane-wise signed maximum of two 128-bit vectors.
(decl lower_vec_smax (Type Xmm Xmm) Xmm)

;; Single instruction when one exists for this lane type.
(rule 1 (lower_vec_smax ty lhs rhs)
        (if-let true (has_pmaxs ty))
        (x64_pmaxs ty lhs rhs))

;; Otherwise build a per-lane `lhs > rhs` mask and blend by hand.
(rule (lower_vec_smax ty lhs rhs)
      (let ((lhs_greater Xmm (x64_pcmpgt ty lhs rhs))
            (from_lhs Xmm (x64_pand lhs_greater lhs))
            (from_rhs Xmm (x64_pandn lhs_greater rhs)))
        (x64_por from_lhs from_rhs)))
1817
1818;; SSE `smin`.
1819
;; Single instruction when one exists for this lane type.
(rule 1 (lower (has_type (ty_vec128 ty) (smin _ lhs rhs)))
        (if-let true (has_pmins ty))
        (x64_pmins ty lhs rhs))

;; Otherwise build a per-lane `rhs > lhs` mask and blend by hand.
(rule (lower (has_type (ty_vec128 ty) (smin _ lhs rhs)))
      (let ((lhs_xmm Xmm lhs)
            (rhs_xmm Xmm rhs)
            (lhs_smaller Xmm (x64_pcmpgt ty rhs_xmm lhs_xmm))
            (from_lhs Xmm (x64_pand lhs_smaller lhs_xmm))
            (from_rhs Xmm (x64_pandn lhs_smaller rhs_xmm)))
        (x64_por from_lhs from_rhs)))
1833
1834;; SSE `umax`.
1835
;; Single instruction when one exists for this lane type.
(rule 2 (lower (has_type (ty_vec128 ty) (umax _ lhs rhs)))
        (if-let true (has_pmaxu ty))
        (x64_pmaxu ty lhs rhs))

;; 16-bit lanes: `lhs + saturating_sub(rhs, lhs)`. The saturating difference
;; is zero when rhs < lhs (leaving lhs), and otherwise adding it back to lhs
;; gives rhs.
(rule 1 (lower (has_type $I16X8 (umax _ lhs rhs)))
        (let ((lhs_xmm Xmm lhs)
              (excess Xmm (x64_psubusw rhs lhs_xmm)))
          (x64_paddw lhs_xmm excess)))
1845
1846;; Flip the upper bits of each lane so the signed comparison has the same
;; result as an unsigned comparison, and then select the results with the output
1848;; mask. See `pcmpgt` lowering for info on flipping the upper bit.
(rule (lower (has_type (ty_vec128 ty) (umax _ lhs rhs)))
      (let ((lhs_xmm Xmm lhs)
            (rhs_xmm Xmm rhs)
            (sign_bits Xmm (flip_high_bit_mask ty))
            (lhs_biased Xmm (x64_pxor lhs_xmm sign_bits))
            (rhs_biased Xmm (x64_pxor rhs_xmm sign_bits))
            ;; Signed compare on the biased inputs; the mask then selects from
            ;; the original, unbiased inputs.
            (lhs_greater Xmm (x64_pcmpgt ty lhs_biased rhs_biased))
            (from_lhs Xmm (x64_pand lhs_greater lhs_xmm))
            (from_rhs Xmm (x64_pandn lhs_greater rhs_xmm)))
        (x64_por from_lhs from_rhs)))
1861
;; Loads a 128-bit constant in which only the most significant bit of every
;; lane is set, for the given lane layout. Rules exist only for 16-, 32- and
;; 64-bit lanes.
(decl flip_high_bit_mask (Type) Xmm)
;; Eight 16-bit lanes, each 0x8000.
(rule (flip_high_bit_mask $I16X8)
      (x64_movdqu_load (emit_u128_le_const 0x8000_8000_8000_8000_8000_8000_8000_8000)))
;; Four 32-bit lanes, each 0x80000000.
(rule (flip_high_bit_mask $I32X4)
      (x64_movdqu_load (emit_u128_le_const 0x80000000_80000000_80000000_80000000)))
;; Two 64-bit lanes, each 0x8000000000000000.
(rule (flip_high_bit_mask $I64X2)
      (x64_movdqu_load (emit_u128_le_const 0x8000000000000000_8000000000000000)))
1869
1870;; SSE `umin`.
1871
;; Single instruction when one exists for this lane type.
(rule 2 (lower (has_type (ty_vec128 ty) (umin _ lhs rhs)))
        (if-let true (has_pminu ty))
        (x64_pminu ty lhs rhs))

;; 16-bit lanes: `lhs - saturating_sub(lhs, rhs)`. The saturating difference
;; is zero when lhs < rhs (leaving lhs); when lhs > rhs, subtracting it from
;; lhs gives rhs.
(rule 1 (lower (has_type $I16X8 (umin _ lhs rhs)))
        (let ((lhs_xmm Xmm lhs)
              (excess Xmm (x64_psubusw lhs_xmm rhs)))
          (x64_psubw lhs_xmm excess)))

;; Same approach as the `umax` fallback: flip the upper bit of each lane, use
;; a signed compare, and select from the original inputs. See `pcmpgt` for
;; docs on flipping the upper bit.
(rule (lower (has_type (ty_vec128 ty) (umin _ lhs rhs)))
      (let ((lhs_xmm Xmm lhs)
            (rhs_xmm Xmm rhs)
            (sign_bits Xmm (flip_high_bit_mask ty))
            (lhs_biased Xmm (x64_pxor lhs_xmm sign_bits))
            (rhs_biased Xmm (x64_pxor rhs_xmm sign_bits))
            (lhs_smaller Xmm (x64_pcmpgt ty rhs_biased lhs_biased))
            (from_lhs Xmm (x64_pand lhs_smaller lhs_xmm))
            (from_rhs Xmm (x64_pandn lhs_smaller rhs_xmm)))
        (x64_por from_lhs from_rhs)))
1895
1896;;;; Rules for `trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1897
;; Unconditional trap: a `ud2` carrying the trap code.
(rule (lower (trap trap_code))
      (side_effect (x64_ud2_zo trap_code)))
1900
1901;;;; Rules for `trapz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1902
;; Trap when `val` is zero: invert the "is nonzero" condition.
(rule (lower (trapz val trap_code))
  (side_effect (trap_if_cond (cond_invert (is_nonzero_cmp val)) trap_code)))

;; Emit the flag producer held by a `CondResult`, followed by a trap that
;; fires according to that result's condition code(s).
(decl trap_if_cond (CondResult TrapCode) SideEffectNoResult)
;; One condition code.
(rule (trap_if_cond (CondResult.CC flags cc) trap_code)
      (with_flags_side_effect flags (trap_if cc trap_code)))
;; Two condition codes combined with "and".
(rule (trap_if_cond (CondResult.And flags first second) trap_code)
      (with_flags_side_effect flags (trap_if_and first second trap_code)))
;; Two condition codes combined with "or".
(rule (trap_if_cond (CondResult.Or flags first second) trap_code)
      (with_flags_side_effect flags (trap_if_or first second trap_code)))
1913
1914;;;; Rules for `trapnz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1915
;; Trap when `val` is nonzero.
(rule (lower (trapnz val trap_code))
  (side_effect (trap_if_cond (is_nonzero_cmp val) trap_code)))
1918
1919;;;; Rules for `uadd_overflow_trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1920
;; Add with flags, then trap on the `B` condition code.
(rule (lower (has_type (fits_in_64 ty) (uadd_overflow_trap _ lhs rhs trap_code)))
      (with_flags
        (x64_add_with_flags_paired ty lhs rhs)
        (trap_if (CC.B) trap_code)))

;; The rule above already handles an immediate or sinkable load on the
;; right-hand side automatically. The two rules below cover the same shapes
;; appearing on the left-hand side by swapping the operands of the add.

(rule 1 (lower (has_type (fits_in_64 ty)
                         (uadd_overflow_trap _ (simm32_from_value imm) rhs trap_code)))
      (with_flags
        (x64_add_with_flags_paired ty rhs imm)
        (trap_if (CC.B) trap_code)))

(rule 2 (lower (has_type (fits_in_64 ty)
                         (uadd_overflow_trap _ (sinkable_load loaded) rhs trap_code)))
      (with_flags
        (x64_add_with_flags_paired ty rhs loaded)
        (trap_if (CC.B) trap_code)))
1940
1941;;;; Rules for `return` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1942
1943;; N.B.: the Ret itself is generated by the ABI.
(rule (lower (return ret_vals))
      (lower_return ret_vals))
1946
1947;;;; Rules for `icmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1948
;; Operands that fit in 64 bits: emit the compare and materialize the
;; condition as a boolean.
(rule -2 (lower (icmp _ cc lhs @ (value_type (fits_in_64 ty)) rhs))
      (lower_cond_bool (emit_cmp cc lhs rhs)))

;; 128-bit operands take the same path.
(rule -1 (lower (icmp _ cc lhs @ (value_type $I128) rhs))
      (lower_cond_bool (emit_cmp cc lhs rhs)))
1954
1955;; Peephole optimization for `x < 0`, when x is a signed 64 bit value
;; 64-bit, `v < 0`: shift `v` right by 63.
(rule 2 (lower (has_type $I8
                         (icmp _ (IntCC.SignedLessThan)
                               v @ (value_type $I64)
                               (u64_from_iconst 0))))
      (x64_shrq_mi v 63))

;; 64-bit, `0 > v`: same as above with the operands swapped.
(rule 2 (lower (has_type $I8
                         (icmp _ (IntCC.SignedGreaterThan)
                               (u64_from_iconst 0)
                               v @ (value_type $I64))))
      (x64_shrq_mi v 63))

;; 64-bit, `0 <= v`: shift `not v` right by 63.
(rule 2 (lower (has_type $I8
                         (icmp _ (IntCC.SignedLessThanOrEqual)
                               (u64_from_iconst 0)
                               v @ (value_type $I64))))
      (x64_shrq_mi (x64_not $I64 v) 63))

;; 64-bit, `v >= 0`: same as above with the operands swapped.
(rule 2 (lower (has_type $I8
                         (icmp _ (IntCC.SignedGreaterThanOrEqual)
                               v @ (value_type $I64)
                               (u64_from_iconst 0))))
      (x64_shrq_mi (x64_not $I64 v) 63))

;; 32-bit, `v < 0`: shift `v` right by 31.
(rule 2 (lower (has_type $I8
                         (icmp _ (IntCC.SignedLessThan)
                               v @ (value_type $I32)
                               (u64_from_iconst 0))))
      (x64_shrl_mi v 31))

;; 32-bit, `0 > v`: same as above with the operands swapped.
(rule 2 (lower (has_type $I8
                         (icmp _ (IntCC.SignedGreaterThan)
                               (u64_from_iconst 0)
                               v @ (value_type $I32))))
      (x64_shrl_mi v 31))

;; 32-bit, `0 <= v`: shift `not v` right by 31.
(rule 2 (lower (has_type $I8
                         (icmp _ (IntCC.SignedLessThanOrEqual)
                               (u64_from_iconst 0)
                               v @ (value_type $I32))))
      (x64_shrl_mi (x64_not $I32 v) 31))

;; 32-bit, `v >= 0`: same as above with the operands swapped.
(rule 2 (lower (has_type $I8
                         (icmp _ (IntCC.SignedGreaterThanOrEqual)
                               v @ (value_type $I32)
                               (u64_from_iconst 0))))
      (x64_shrl_mi (x64_not $I32 v) 31))
1986
1987;; For XMM-held values, we lower to `PCMP*` instructions, sometimes more than
1988;; one. To note: what is different here about the output values is that each
1989;; lane will be filled with all 1s or all 0s according to the comparison,
1990;; whereas for GPR-held values, the result will be simply 0 or 1 (upper bits
1991;; unset).
;; SSE `eq`
(rule (lower (icmp _ (IntCC.Equal) lhs @ (value_type (ty_vec128 ty)) rhs))
      (x64_pcmpeq ty lhs rhs))

;; SSE `ne`: compare for equality (PCMPEQ*), then invert every bit by xor-ing
;; with all ones (PXOR).
(rule (lower (icmp _ (IntCC.NotEqual) lhs @ (value_type (ty_vec128 ty)) rhs))
      (let ((equal Xmm (x64_pcmpeq ty lhs rhs))
            (ones Xmm (vector_all_ones)))
           (x64_pxor equal ones)))

;; SSE `sgt`

(rule (lower (icmp _ (IntCC.SignedGreaterThan) lhs @ (value_type (ty_vec128 ty)) rhs))
      (x64_pcmpgt ty lhs rhs))

;; SSE `slt`: greater-than with the operands swapped.

(rule (lower (icmp _ (IntCC.SignedLessThan) lhs @ (value_type (ty_vec128 ty)) rhs))
      (x64_pcmpgt ty rhs lhs))
2011
2012;; SSE `ugt`
2013
2014;; N.B.: we must manually prevent load coalescing operands; the
2015;; register allocator gets confused otherwise.
;; With an unsigned max instruction: `lhs > rhs` exactly when
;; `max(lhs, rhs) != rhs`.
(rule 1 (lower (icmp _ (IntCC.UnsignedGreaterThan) lhs @ (value_type (ty_vec128 ty)) rhs))
        (if-let true (has_pmaxu ty))
        (let ((lhs_xmm Xmm lhs)
              (rhs_xmm Xmm rhs)
              (larger Xmm (x64_pmaxu ty lhs_xmm rhs_xmm))
              (rhs_is_max Xmm (x64_pcmpeq ty larger rhs_xmm)))
             (x64_pxor rhs_is_max (vector_all_ones))))

;; Flip the upper bit of each lane so the result of a signed comparison is the
;; same as the result of an unsigned comparison (see docs on `pcmpgt` for more)
(rule (lower (icmp _ (IntCC.UnsignedGreaterThan) lhs @ (value_type (ty_vec128 ty)) rhs))
      (let ((sign_bits Xmm (flip_high_bit_mask ty))
            (lhs_biased Xmm (x64_pxor lhs sign_bits))
            (rhs_biased Xmm (x64_pxor rhs sign_bits)))
           (x64_pcmpgt ty lhs_biased rhs_biased)))
2031
2032;; SSE `ult`
2033
;; With an unsigned min instruction: `lhs < rhs` exactly when
;; `min(lhs, rhs) != rhs`.
(rule 1 (lower (icmp _ (IntCC.UnsignedLessThan) lhs @ (value_type (ty_vec128 ty)) rhs))
        (if-let true (has_pminu ty))
        ;; N.B.: operands are forced into registers, as in the `ugt` rule.
        (let ((lhs_xmm Xmm lhs)
              (rhs_xmm Xmm rhs)
              (smaller Xmm (x64_pminu ty lhs_xmm rhs_xmm))
              (rhs_is_min Xmm (x64_pcmpeq ty smaller rhs_xmm)))
             (x64_pxor rhs_is_min (vector_all_ones))))

;; Flip the upper bit of both operands so the signed comparison result will
;; be the same as the unsigned comparison result (see docs on `pcmpgt` for more).
(rule (lower (icmp _ (IntCC.UnsignedLessThan) lhs @ (value_type (ty_vec128 ty)) rhs))
      (let ((sign_bits Xmm (flip_high_bit_mask ty))
            (lhs_biased Xmm (x64_pxor lhs sign_bits))
            (rhs_biased Xmm (x64_pxor rhs sign_bits)))
           (x64_pcmpgt ty rhs_biased lhs_biased)))
2050
2051;; SSE `sge`
2052
2053;; Use `pmaxs*` and compare the result to `a` to see if it's `>= b`.
;; `lhs >= rhs` exactly when `lhs == max(lhs, rhs)`.
(rule 1 (lower (icmp _ (IntCC.SignedGreaterThanOrEqual) lhs @ (value_type (ty_vec128 ty)) rhs))
        (if-let true (has_pmaxs ty))
        (x64_pcmpeq ty lhs (x64_pmaxs ty lhs rhs)))

;; Without `pmaxs*`: compute `rhs > lhs` with `pcmpgt*` and invert the
;; result.
(rule (lower (icmp _ (IntCC.SignedGreaterThanOrEqual) lhs @ (value_type (ty_vec128 ty)) rhs))
      (x64_pxor (x64_pcmpgt ty rhs lhs) (vector_all_ones)))
2062
2063;; SSE `sle`
2064
2065;; With `pmins*` use that and compare the result to `a`.
;; `lhs <= rhs` exactly when `lhs == min(lhs, rhs)`.
(rule 1 (lower (icmp _ (IntCC.SignedLessThanOrEqual) lhs @ (value_type (ty_vec128 ty)) rhs))
        (if-let true (has_pmins ty))
        (x64_pcmpeq ty lhs (x64_pmins ty lhs rhs)))

;; Without `pmins*`: compute `lhs > rhs` and invert the result.
(rule (lower (icmp _ (IntCC.SignedLessThanOrEqual) lhs @ (value_type (ty_vec128 ty)) rhs))
      (x64_pxor (x64_pcmpgt ty lhs rhs) (vector_all_ones)))
2073
2074;; SSE `uge`
2075
;; `lhs >= rhs` exactly when `lhs == max(lhs, rhs)`.
(rule 2 (lower (icmp _ (IntCC.UnsignedGreaterThanOrEqual) lhs @ (value_type (ty_vec128 ty)) rhs))
        (if-let true (has_pmaxu ty))
        (x64_pcmpeq ty lhs (x64_pmaxu ty lhs rhs)))

;; 16-bit lanes: saturating-subtract `lhs` from `rhs`; a zero lane means
;; `lhs` is greater or equal there.
(rule 1 (lower (icmp _ (IntCC.UnsignedGreaterThanOrEqual) lhs @ (value_type $I16X8) rhs))
         (let ((zero_if_lhs_ge Xmm (x64_psubusw rhs lhs)))
           (x64_pcmpeqw zero_if_lhs_ge (xmm_zero $I16X8))))

;; Flip the upper bit of each lane so the signed comparison is the same as
;; an unsigned one, compute `rhs > lhs`, and invert the result. See docs on
;; `pcmpgt` for why flipping the upper bit works.
(rule (lower (icmp _ (IntCC.UnsignedGreaterThanOrEqual) lhs @ (value_type (ty_vec128 ty)) rhs))
      (let ((sign_bits Xmm (flip_high_bit_mask ty))
            (lhs_biased Xmm (x64_pxor lhs sign_bits))
            (rhs_biased Xmm (x64_pxor rhs sign_bits))
            (rhs_greater Xmm (x64_pcmpgt ty rhs_biased lhs_biased)))
        (x64_pxor rhs_greater (vector_all_ones))))
2096
2097;; SSE `ule`
2098
;; `lhs <= rhs` exactly when `lhs == min(lhs, rhs)`.
(rule 2 (lower (icmp _ (IntCC.UnsignedLessThanOrEqual) lhs @ (value_type (ty_vec128 ty)) rhs))
        (if-let true (has_pminu ty))
        (x64_pcmpeq ty lhs (x64_pminu ty lhs rhs)))
2102
;; A saturating subtraction of `b` from `a` produces zero in exactly the lanes
;; where `a` is less than or equal to `b`, so compare that result to an
;; all-zeros vector to find the lanes of `a` that are <= the lanes of `b`.
(rule 1 (lower (icmp _ (IntCC.UnsignedLessThanOrEqual) a @ (value_type $I16X8) b))
        (let ((zeros_if_a_is_min Xmm (x64_psubusw a b)))
            ;; The zero vector is typed `$I16X8` to match the 16-bit lanes
            ;; being compared (and the `$I16X8` `uge` rule for this type).
            (x64_pcmpeqw zeros_if_a_is_min (xmm_zero $I16X8))))
2109
2110;; Flip the upper bit of each lane in `a` and `b` so a signed comparison
;; produces the same result as an unsigned comparison. Then test for `gt`
2112;; and invert the result to get the `le` that is desired here. See docs on
2113;; `pcmpgt` for why flipping the upper bit works.
(rule (lower (icmp _ (IntCC.UnsignedLessThanOrEqual) lhs @ (value_type (ty_vec128 ty)) rhs))
      (let ((sign_bits Xmm (flip_high_bit_mask ty))
            (lhs_biased Xmm (x64_pxor lhs sign_bits))
            (rhs_biased Xmm (x64_pxor rhs sign_bits))
            ;; `lhs > rhs`, to be inverted below.
            (lhs_greater Xmm (x64_pcmpgt ty lhs_biased rhs_biased)))
        (x64_pxor lhs_greater (vector_all_ones))))
2122
2123;;;; Rules for `fcmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2124
2125;; CLIF's `fcmp` instruction always operates on XMM registers--both scalar and
2126;; vector. For the scalar versions, we use the flag-setting behavior of the
2127;; `UCOMIS*` instruction to `SETcc` a 0 or 1 in a GPR register. Note that CLIF's
2128;; `select` uses the same kind of flag-setting behavior but chooses values other
2129;; than 0 or 1.
2130;;
2131;; Checking the result of `UCOMIS*` is unfortunately difficult in some cases
2132;; because we do not have `SETcc` instructions that explicitly check
2133;; simultaneously for the condition (i.e., `eq`, `le`, `gt`, etc.) *and*
2134;; orderedness. Instead, we must check the flags multiple times. The UCOMIS*
2135;; documentation (see Intel's Software Developer's Manual, volume 2, chapter 4)
2136;; is helpful:
2137;;  - unordered assigns    Z = 1, P = 1, C = 1
2138;;  - greater than assigns Z = 0, P = 0, C = 0
2139;;  - less than assigns    Z = 0, P = 0, C = 1
2140;;  - equal assigns        Z = 1, P = 0, C = 0
2141
;; Scalar floats: emit the compare and materialize the condition as a boolean.
(rule -1 (lower (fcmp _ cc lhs @ (value_type (ty_scalar_float ty)) rhs))
      (lower_cond_bool (emit_fcmp cc lhs rhs)))
2144
2145;; For vector lowerings, we use `CMPP*` instructions with a 3-bit operand that
2146;; determines the comparison to make. Note that comparisons that succeed will
2147;; fill the lane with 1s; comparisons that do not will fill the lane with 0s.
2148
;; Condition codes that map directly onto a `CMPP*` immediate, with the
;; operands in their original order.

(rule (lower (fcmp _ (FloatCC.Equal) lhs @ (value_type (ty_vec128 ty)) rhs))
      (x64_cmpp ty lhs rhs (FcmpImm.Equal)))
(rule (lower (fcmp _ (FloatCC.NotEqual) lhs @ (value_type (ty_vec128 ty)) rhs))
      (x64_cmpp ty lhs rhs (FcmpImm.NotEqual)))
(rule (lower (fcmp _ (FloatCC.LessThan) lhs @ (value_type (ty_vec128 ty)) rhs))
      (x64_cmpp ty lhs rhs (FcmpImm.LessThan)))
(rule (lower (fcmp _ (FloatCC.LessThanOrEqual) lhs @ (value_type (ty_vec128 ty)) rhs))
      (x64_cmpp ty lhs rhs (FcmpImm.LessThanOrEqual)))
(rule (lower (fcmp _ (FloatCC.Ordered) lhs @ (value_type (ty_vec128 ty)) rhs))
      (x64_cmpp ty lhs rhs (FcmpImm.Ordered)))
(rule (lower (fcmp _ (FloatCC.Unordered) lhs @ (value_type (ty_vec128 ty)) rhs))
      (x64_cmpp ty lhs rhs (FcmpImm.Unordered)))
(rule (lower (fcmp _ (FloatCC.UnorderedOrGreaterThan) lhs @ (value_type (ty_vec128 ty)) rhs))
      (x64_cmpp ty lhs rhs (FcmpImm.UnorderedOrGreaterThan)))
(rule (lower (fcmp _ (FloatCC.UnorderedOrGreaterThanOrEqual) lhs @ (value_type (ty_vec128 ty)) rhs))
      (x64_cmpp ty lhs rhs (FcmpImm.UnorderedOrGreaterThanOrEqual)))

;; The remaining condition codes are lowered by swapping the operands and
;; using the mirrored comparison.

(rule (lower (fcmp _ (FloatCC.GreaterThan) lhs @ (value_type (ty_vec128 ty)) rhs))
      (x64_cmpp ty rhs lhs (FcmpImm.LessThan)))
(rule (lower (fcmp _ (FloatCC.GreaterThanOrEqual) lhs @ (value_type (ty_vec128 ty)) rhs))
      (x64_cmpp ty rhs lhs (FcmpImm.LessThanOrEqual)))
(rule (lower (fcmp _ (FloatCC.UnorderedOrLessThan) lhs @ (value_type (ty_vec128 ty)) rhs))
      (x64_cmpp ty rhs lhs (FcmpImm.UnorderedOrGreaterThan)))
(rule (lower (fcmp _ (FloatCC.UnorderedOrLessThanOrEqual) lhs @ (value_type (ty_vec128 ty)) rhs))
      (x64_cmpp ty rhs lhs (FcmpImm.UnorderedOrGreaterThanOrEqual)))
2177
2178;; Some vector lowerings are simply not supported for certain codes:
2179;; - FloatCC::OrderedNotEqual
2180;; - FloatCC::UnorderedOrEqual
2181
2182;;;; Rules for `select` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2183
(rule (lower (select _ cond if_true if_false))
      (lower_select (is_nonzero_cmp cond) if_true if_false))

; Recursion: at most once to swap the And case for an Or.
(decl rec lower_select (CondResult Value Value) InstOutput)
;; Dispatch on the type of the selected values.
(rule 0 (lower_select cond if_true @ (value_type (ty_int (fits_in_64 ty))) if_false)
  (lower_select_gpr ty cond if_true if_false))
(rule 1 (lower_select cond if_true @ (value_type (is_xmm_type ty)) if_false)
  (lower_select_xmm ty cond if_true if_false))
(rule 2 (lower_select cond if_true @ (value_type $I128) if_false)
  (lower_select128 cond if_true if_false))
;; A `CondResult.And` condition is never handled directly by the helpers
;; above. Instead the condition is inverted and the two values are swapped,
;; so that codegen only ever has to deal with the "or" case.
(rule 3 (lower_select cond @ (CondResult.And _ _ _) if_true if_false)
  (lower_select (cond_invert cond) if_false if_true))
2200
;; Select between two general-purpose values of type `ty` using `cmove`.
(decl lower_select_gpr (Type CondResult GprMem Gpr) Gpr)

;; One condition code: a single `cmove` that consumes `flags`.
(rule (lower_select_gpr ty (CondResult.CC flags cc) on_true on_false)
  (value_regs_get_gpr (with_flags flags (cmove ty cc on_true on_false)) 0))

;; Either of two condition codes: chain two `cmove`s on the same flags. The
;; second one uses the first one's result as its fallback value.
(rule (lower_select_gpr ty (CondResult.Or flags cc_first cc_second) on_true on_false)
  (let ((first_move ConsumesFlags (cmove ty cc_first on_true on_false))
        (partial Gpr (consumes_flags_get_reg first_move))
        (second_move ConsumesFlags (cmove ty cc_second on_true partial)))
    (value_regs_get
      (with_flags flags (consumes_flags_return_last first_move second_move))
      0)))
2209
;; Select between two XMM values of type `ty` using `cmove_xmm`.
(decl lower_select_xmm (Type CondResult Xmm Xmm) Xmm)

;; One condition code: a single `cmove_xmm` that consumes `flags`.
(rule (lower_select_xmm ty (CondResult.CC flags cc) on_true on_false)
  (value_regs_get (with_flags flags (cmove_xmm ty cc on_true on_false)) 0))

;; Either of two condition codes: chain two `cmove_xmm`s on the same flags.
;; The second one uses the first one's result as its fallback value.
(rule (lower_select_xmm ty (CondResult.Or flags cc_first cc_second) on_true on_false)
  (let ((first_move ConsumesFlags (cmove_xmm ty cc_first on_true on_false))
        (partial Xmm (consumes_flags_get_reg first_move))
        (second_move ConsumesFlags (cmove_xmm ty cc_second on_true partial)))
    (value_regs_get
      (with_flags flags (consumes_flags_return_last first_move second_move))
      0)))
2218
;; Select between two 128-bit values held in register pairs.
(decl lower_select128 (CondResult ValueRegs ValueRegs) ValueRegs)

;; One condition code: one `cmove128` that consumes `flags`.
(rule (lower_select128 (CondResult.CC flags cc) on_true on_false)
  (with_flags flags (cmove128 cc on_true on_false)))

;; Either of two condition codes: chain two `cmove128`s on the same flags.
;; The second one uses the first one's result registers as its fallback.
(rule (lower_select128 (CondResult.Or flags cc_first cc_second) on_true on_false)
  (let ((first_move ConsumesFlags (cmove128 cc_first on_true on_false))
        (partial ValueRegs (consumes_flags_get_regs first_move))
        (second_move ConsumesFlags (cmove128 cc_second on_true partial)))
    (with_flags flags (consumes_flags_return_last first_move second_move))))
2227
;; Helper for `lower_select128`: builds a pair of 64-bit `cmove`s on the same
;; condition code, one for register 0 and one for register 1 of the operands.
(decl cmove128 (CC ValueRegs ValueRegs) ConsumesFlags)
(rule (cmove128 cc on_true on_false)
  (consumes_flags_concat
    (cmove $I64 cc
           (value_regs_get_gpr on_true 0)
           (value_regs_get_gpr on_false 0))
    (cmove $I64 cc
           (value_regs_get_gpr on_true 1)
           (value_regs_get_gpr on_false 1))))
2235
;; Helper for the `CondResult.Or` cases of the `lower_select*` terms: combines
;; two `ConsumesFlags` into one that emits the instructions of both, in order,
;; and returns only the destination of the second.
(decl consumes_flags_return_last (ConsumesFlags ConsumesFlags) ConsumesFlags)

;; Two single-instruction producers.
(rule (consumes_flags_return_last
        (ConsumesFlags.ConsumesFlagsReturnsReg first_inst _)
        (ConsumesFlags.ConsumesFlagsReturnsReg second_inst result))
  (ConsumesFlags.ConsumesFlagsTwiceReturnsValueRegs first_inst second_inst result))

;; Two two-instruction producers.
(rule (consumes_flags_return_last
        (ConsumesFlags.ConsumesFlagsTwiceReturnsValueRegs first_a first_b _)
        (ConsumesFlags.ConsumesFlagsTwiceReturnsValueRegs second_a second_b result))
  (ConsumesFlags.ConsumesFlagsFourTimesReturnsValueRegs
    first_a first_b second_a second_b result))
2248
;; Scalar float `select`s whose condition is a `LessThan` compare of the same
;; two operands become a single `mins*` or `maxs*` instruction. These mirror
;; the "pseudo-m{in,ax}" specializations for vectors.

;; select(p < q, p, q)
(rule 3 (lower (has_type $F32 (select _ (maybe_uextend (fcmp _ (FloatCC.LessThan) p q)) p q)))
  (x64_minss p q))
(rule 3 (lower (has_type $F64 (select _ (maybe_uextend (fcmp _ (FloatCC.LessThan) p q)) p q)))
  (x64_minsd p q))

;; select(q < p, p, q)
(rule 4 (lower (has_type $F32 (select _ (maybe_uextend (fcmp _ (FloatCC.LessThan) q p)) p q)))
  (x64_maxss p q))
(rule 4 (lower (has_type $F64 (select _ (maybe_uextend (fcmp _ (FloatCC.LessThan) q p)) p q)))
  (x64_maxsd p q))
2260
2261;; Rules for `clz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2262
;; 32- and 64-bit: count directly at the native width.
(rule 2 (lower (has_type (ty_32_or_64 ty) (clz _ val)))
  (do_clz ty ty val))

;; 8- and 16-bit: zero-extend to 64 bits, count there, then subtract the
;; `64 - bits(ty)` leading zeros introduced by the extension.
(rule 1 (lower (has_type (ty_8_or_16 ty) (clz _ val)))
  (let ((widened Gpr (extend_to_gpr val $I64 (ExtendKind.Zero)))
        (wide_count Gpr (do_clz $I64 $I64 widened)))
    (x64_sub $I64 wide_count (RegMemImm.Imm (u32_wrapping_sub 64 (ty_bits ty))))))

;; 128-bit: use the count of the upper half unless it is 64 (upper half is
;; all zeros), in which case the answer is 64 plus the count of the lower
;; half. The upper result register is always zero.
(rule 0 (lower (has_type $I128 (clz _ val)))
  (let ((hi_zeros Gpr (do_clz $I64 $I64 (value_regs_get_gpr val 1)))
        (lo_zeros_plus_64 Gpr
          (x64_add $I64
                   (do_clz $I64 $I64 (value_regs_get_gpr val 0))
                   (RegMemImm.Imm 64)))
        (count Gpr
          (with_flags_reg
            (x64_cmpq_mi_sxb hi_zeros 64)
            (cmove $I64 (CC.NZ) hi_zeros lo_zeros_plus_64))))
    (value_regs count (imm $I64 0))))
2284
;; Implementation helper for clz; operates on 32 or 64-bit units. The first
;; type is the width the instructions operate at; the second is used only by
;; the fallback below to compute `bits - 1`.
(decl do_clz (Type Type Gpr) Gpr)

;; If available, we can use a plain lzcnt instruction here. Note no
;; special handling is required for zero inputs: for a zero source lzcnt
;; yields the operand width in bits, which is the same value the fallback
;; below produces for zero.
(rule 1 (do_clz op_ty full_ty val)
  (if-let true (has_lzcnt))
  (x64_lzcnt op_ty val))

;; Fallback: `(bits - 1) - index_of_highest_set_bit`. The `-1` passed to
;; `bsr_or_else` is presumably the index used for a zero input, which makes
;; the result `bits` in that case.
(rule 0 (do_clz op_ty full_ty val)
  (let ((top_bit_index Reg (bsr_or_else op_ty val (imm_i64 $I64 -1)))
        (last_index Reg (imm op_ty (u64_wrapping_sub (ty_bits_u64 full_ty) 1))))
    (x64_sub op_ty last_index top_bit_index)))
2300
2301;; Rules for `ctz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2302
;; 32- and 64-bit: count directly at the native width.
(rule 2 (lower (has_type (ty_32_or_64 ty) (ctz _ val)))
  (do_ctz ty ty val))

;; 8- and 16-bit: zero-extend to 32 bits and OR in a stop bit at position
;; `bits(ty)` so the count can never exceed the original width.
(rule 1 (lower (has_type (ty_8_or_16 ty) (ctz _ val)))
  (let ((widened Gpr (extend_to_gpr val $I32 (ExtendKind.Zero)))
        (guarded Gpr
          (x64_or $I32 widened (RegMemImm.Imm (u32_wrapping_shl 1 (ty_bits ty))))))
    (do_ctz $I32 ty guarded)))

;; 128-bit: use the count of the lower half unless it is 64 (lower half is
;; all zeros), in which case the answer is 64 plus the count of the upper
;; half. The upper result register is always zero.
(rule 0 (lower (has_type $I128 (ctz _ val)))
  (let ((lo_zeros Gpr (do_ctz $I64 $I64 (value_regs_get_gpr val 0)))
        (hi_zeros_plus_64 Gpr
          (x64_add $I64
                   (do_ctz $I64 $I64 (value_regs_get_gpr val 1))
                   (RegMemImm.Imm 64)))
        (count Gpr
          (with_flags_reg
            (x64_cmpq_mi_sxb lo_zeros 64)
            (cmove $I64 (CC.Z) hi_zeros_plus_64 lo_zeros))))
    (value_regs count (imm $I64 0))))
2323
;; Implementation helper for ctz. The first type is the width the
;; instructions operate at; the second supplies the bit count handed to
;; `bsf_or_else` in the fallback.
(decl do_ctz (Type Type Gpr) Gpr)

;; Analogous to the `clz` cases above, but using mirror instructions
;; (tzcnt vs lzcnt, bsf vs bsr).
(rule 1 (do_ctz op_ty full_ty val)
  (if-let true (has_bmi1))
  (x64_tzcnt op_ty val))

(rule 0 (do_ctz op_ty full_ty val)
  (bsf_or_else op_ty val (imm $I64 (ty_bits_u64 full_ty))))
2334
2335;; Rules for `cls` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2336
;; 32- and 64-bit: count directly at the native width.
(rule 2 (lower (has_type (ty_32_or_64 ty) (cls _ val)))
  (do_cls ty val))

;; 8- and 16-bit: sign-extend to 32 bits, count there, then subtract the
;; `32 - bits(ty)` extra sign bits introduced by the extension.
(rule 1 (lower (has_type (ty_8_or_16 ty) (cls _ val)))
  (let ((widened Gpr (extend_to_gpr val $I32 (ExtendKind.Sign)))
        (wide_count Gpr (do_cls $I32 widened)))
    (x64_sub $I32 wide_count (RegMemImm.Imm (u32_wrapping_sub 32 (ty_bits ty))))))

;; 128-bit: use the count of the upper half unless it is 63 (every bit of the
;; upper half equals the sign). In that case XOR the lower half with the
;; sign fill, count its leading zeros, and add 63. The upper result register
;; is always zero.
(rule 0 (lower (has_type $I128 (cls _ val)))
  (let ((hi_count Gpr (do_cls $I64 (value_regs_get_gpr val 1)))
        (sign_mask Gpr (x64_sarq_mi (value_regs_get_gpr val 1) 63))
        (lo_differs Gpr (x64_xor $I64 (value_regs_get_gpr val 0) sign_mask))
        (lo_count_plus_63 Gpr
          (x64_add $I64
                   (do_clz $I64 $I64 lo_differs)
                   (RegMemImm.Imm 63)))
        (count Gpr
          (with_flags_reg
            (x64_cmpq_mi_sxb hi_count 63)
            (cmove $I64 (CC.NZ) hi_count lo_count_plus_63))))
    (value_regs count (imm $I64 0))))
2359
;; Implementation helper for cls; operates on 32 or 64-bit units.
(decl do_cls (Type Gpr) Gpr)

;; Uses the identity cls(x) = clz(x ^ (x >> 1)) - 1, where `>>` is an
;; arithmetic shift.
(rule (do_cls ty val)
  (let ((sar_by_one Gpr (x64_sar ty val (Imm8Gpr.Imm8 1)))
        (transitions Gpr (x64_xor ty val (RegMemImm.Reg sar_by_one)))
        (zeros Gpr (do_clz ty ty transitions)))
    (x64_sub ty zeros (RegMemImm.Imm 1))))
2369
2370;; Rules for `popcnt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2371
;; With a native `popcnt` instruction available.

(rule 4 (lower (has_type (ty_32_or_64 ty) (popcnt _ val)))
  (if-let true (use_popcnt))
  (x64_popcnt ty val))

;; Narrow types are zero-extended to 32 bits first.
(rule 3 (lower (has_type (ty_8_or_16 ty) (popcnt _ val)))
  (if-let true (use_popcnt))
  (x64_popcnt $I32 (extend_to_gpr val $I32 (ExtendKind.Zero))))

;; 128-bit: add the counts of both halves; the upper result register is zero.
(rule 1 (lower (has_type $I128 (popcnt _ val)))
  (if-let true (use_popcnt))
  (let ((low_bits Gpr (x64_popcnt $I64 (value_regs_get_gpr val 0)))
        (high_bits Gpr (x64_popcnt $I64 (value_regs_get_gpr val 1))))
    (value_regs (x64_add $I64 low_bits high_bits) (imm $I64 0))))

;; Without the `use_popcnt` guard: the same three shapes, built on the
;; `do_popcnt` instruction sequence instead.

(rule -1 (lower (has_type (ty_32_or_64 ty) (popcnt _ val)))
  (do_popcnt ty val))

(rule -2 (lower (has_type (ty_8_or_16 ty) (popcnt _ val)))
  (do_popcnt $I32 (extend_to_gpr val $I32 (ExtendKind.Zero))))

(rule (lower (has_type $I128 (popcnt _ val)))
  (let ((low_bits Gpr (do_popcnt $I64 (value_regs_get_gpr val 0)))
        (high_bits Gpr (do_popcnt $I64 (value_regs_get_gpr val 1))))
    (value_regs (x64_add $I64 low_bits high_bits) (imm $I64 0))))
2402
;; Implementation of popcount when we don't have a native popcount
;; instruction. Takes the operating type and the source value and returns
;; the bit count in a GPR.
(decl do_popcnt (Type Gpr) Gpr)
(rule (do_popcnt $I64 src)
      (let ((shifted1 Gpr (x64_shrq_mi src 1))
            (sevens Gpr (imm $I64 0x7777777777777777))
            ;; masked1: each nibble holds floor(nibble / 2).
            (masked1 Gpr (x64_and $I64 shifted1 sevens))
            ;; diff1 := src - ((src >> 1) & 0b0111_0111_0111...)
            (diff1 Gpr (x64_sub $I64 src masked1))
            (shifted2 Gpr (x64_shrq_mi masked1 1))
            ;; masked2: each nibble holds floor(nibble / 4).
            (masked2 Gpr (x64_and $I64 shifted2 sevens))
            ;; diff2 := diff1 - ((masked1 >> 1) & 0b0111_0111_0111...)
            (diff2 Gpr (x64_sub $I64 diff1 masked2))
            (shifted3 Gpr (x64_shrq_mi masked2 1))
            ;; masked3: each nibble holds floor(nibble / 8).
            (masked3 Gpr (x64_and $I64 shifted3 sevens))
            ;; diff3 := diff2 - ((masked2 >> 1) & 0b0111_0111_0111...)
            ;;
            ;; At this point, each nibble of diff3 is the popcount of
            ;; that nibble. This works because at each step above, we
            ;; are basically subtracting floor(value / 2) from the
            ;; running value; the leftover remainder is 1 if the LSB
            ;; was 1. After three steps, we have (nibble / 8) -- 0 or
            ;; 1 for the MSB of the nibble -- plus three possible
            ;; additions for the three other bits.
            (diff3 Gpr (x64_sub $I64 diff2 masked3))
            ;; Add the two nibbles of each byte together.
            (sum1 Gpr (x64_add $I64 (x64_shrq_mi diff3 4) diff3))
            ;; Mask the above sum to have the popcount for each byte
            ;; in the lower nibble of that byte.
            (ofof Gpr (imm $I64 0x0f0f0f0f0f0f0f0f))
            (masked4 Gpr (x64_and $I64 sum1 ofof))
            (ones Gpr (imm $I64 0x0101010101010101))
            ;; Use a multiply to sum all of the bytes' popcounts into
            ;; the top byte. Consider the binomial expansion for the
            ;; top byte: it is the sum of the bytes (masked4 >> 56) *
            ;; 0x01 + (masked4 >> 48) * 0x01 + (masked4 >> 40) * 0x01
            ;; + ... + (masked4 >> 0).
            (mul Gpr (x64_imul $I64 masked4 ones))
            ;; Now take that top byte and return it as the popcount.
            (final Gpr (x64_shrq_mi mul 56)))
        final))
2444
;; 32-bit variant of the 64-bit sequence above: the per-nibble steps are the
;; same, with constants half as wide and the final shift taking the top byte
;; of a 32-bit product.
(rule (do_popcnt $I32 val)
  (let ((half_raw Gpr (x64_shrl_mi val 1))
        (nibble_mask Gpr (imm $I32 0x77777777))
        (half Gpr (x64_and $I32 half_raw nibble_mask))
        (step1 Gpr (x64_sub $I32 val half))
        (quarter_raw Gpr (x64_shrl_mi half 1))
        (quarter Gpr (x64_and $I32 quarter_raw nibble_mask))
        (step2 Gpr (x64_sub $I32 step1 quarter))
        (eighth_raw Gpr (x64_shrl_mi quarter 1))
        (eighth Gpr (x64_and $I32 eighth_raw nibble_mask))
        ;; Each nibble now holds its own popcount.
        (per_nibble Gpr (x64_sub $I32 step2 eighth))
        ;; Fold nibble pairs into per-byte counts.
        (nibble_pairs Gpr (x64_add $I32 (x64_shrl_mi per_nibble 4) per_nibble))
        (per_byte Gpr (x64_and $I32 nibble_pairs (RegMemImm.Imm 0x0f0f0f0f)))
        ;; Multiply to accumulate all byte counts into the top byte.
        (product Gpr (x64_imul_imm $I32 per_byte 0x01010101))
        (total Gpr (x64_shrl_mi product 24)))
    total))
2463
2464
;; Per-byte vector popcount as a single `vpopcntb`, used only when both the
;; AVX512VL and AVX512BITALG checks pass.
(rule 2 (lower (has_type $I8X16 (popcnt _ vec)))
  (if-let true (has_avx512vl))
  (if-let true (has_avx512bitalg))
  (x64_vpopcntb vec))
2469
2470
;; When SSSE3 is available we use Mula's algorithm (https://arxiv.org/pdf/1611.07612.pdf):
2472;;
2473;; __m128i count_bytes ( __m128i v) {
2474;;     __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
2475;;     __m128i low_mask = _mm_set1_epi8 (0x0f);
2476;;     __m128i lo = _mm_and_si128 (v, low_mask);
2477;;     __m128i hi = _mm_and_si128 (_mm_srli_epi16 (v, 4), low_mask);
2478;;     __m128i cnt1 = _mm_shuffle_epi8 (lookup, lo);
2479;;     __m128i cnt2 = _mm_shuffle_epi8 (lookup, hi);
2480;;     return _mm_add_epi8 (cnt1, cnt2);
2481;; }
2482;;
2483;; Details of the above algorithm can be found in the reference noted above, but the basics
2484;; are to create a lookup table that pre populates the popcnt values for each number [0,15].
2485;; The algorithm uses shifts to isolate 4 bit sections of the vector, pshufb as part of the
2486;; lookup process, and adds together the results.
2487;;
2488;; __m128i lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
2489
2490
;; Nibble-lookup popcount: split every byte into its low and high nibble,
;; look both up in a 16-entry table with `pshufb`, and add the two results.
(rule 1 (lower (has_type $I8X16 (popcnt _ vec)))
  (if-let true (has_ssse3))
  (let ((nibble_mask XmmMem (emit_u128_le_const 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f))
        (lo_nibbles Xmm (sse_and $I8X16 vec nibble_mask))
        ;; This shifts 16-bit lanes, so bits cross from the upper byte of
        ;; each lane into the lower one; the AND with `nibble_mask` on the
        ;; next line discards them.
        (vec_shr4 Xmm (x64_psrlw vec (xmi_imm 4)))
        (hi_nibbles Xmm (sse_and $I8X16 vec_shr4 nibble_mask))
        ;; Table entry `i` is the popcount of `i`, for i in [0, 15].
        (table Xmm (x64_xmm_load_const $I8X16
          (emit_u128_le_const 0x04030302_03020201_03020201_02010100)))
        (lo_counts Xmm (x64_pshufb table lo_nibbles))
        (hi_counts Xmm (x64_pshufb table hi_nibbles)))
    (x64_paddb lo_counts hi_counts)))
2505
2506;; A modified version of the popcnt method from Hacker's Delight.
;; Unguarded fallback: three shift/mask/subtract rounds, then nibble pairs
;; are added and the low nibble of every byte is kept.
(rule (lower (has_type $I8X16 (popcnt _ vec)))
  (let ((sevens XmmMem (emit_u128_le_const 0x77777777777777777777777777777777))
        (acc0 Xmm vec)
        (part1 Xmm (x64_pand (x64_psrlq acc0 (xmi_imm 1)) sevens))
        (acc1 Xmm (x64_psubb acc0 part1))
        (part2 Xmm (x64_pand (x64_psrlq part1 (xmi_imm 1)) sevens))
        (acc2 Xmm (x64_psubb acc1 part2))
        (part3 Xmm (x64_pand (x64_psrlq part2 (xmi_imm 1)) sevens))
        (acc3 Xmm (x64_psubb acc2 part3))
        (nibble_pairs Xmm (x64_paddb acc3 (x64_psrlw acc3 (xmi_imm 4)))))
    (x64_pand nibble_pairs (emit_u128_le_const 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f))))
2518
2519;; Rules for `bitrev` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2520
;; 8-, 16- and 32-bit reversals all run their helper at 32-bit width.
(rule (lower (has_type $I8 (bitrev _ val)))
  (do_bitrev8 $I32 val))

(rule (lower (has_type $I16 (bitrev _ val)))
  (do_bitrev16 $I32 val))

(rule (lower (has_type $I32 (bitrev _ val)))
  (do_bitrev32 $I32 val))

(rule (lower (has_type $I64 (bitrev _ val)))
  (do_bitrev64 $I64 val))

;; 128-bit: reverse each half and exchange them, so the reversed upper half
;; becomes register 0 of the result.
(rule (lower (has_type $I128 (bitrev _ val)))
  (value_regs
    (do_bitrev64 $I64 (value_regs_get_gpr val 1))
    (do_bitrev64 $I64 (value_regs_get_gpr val 0))))
2537
;; Reverses the bits inside every byte of `src`, operating at width `ty`:
;; swap adjacent bits, then adjacent bit pairs, then adjacent nibbles. Each
;; mask constant is truncated to the width of `ty` via `ty_mask`.
(decl do_bitrev8 (Type Gpr) Gpr)
(rule (do_bitrev8 ty val)
  (let ((width_mask u64 (ty_mask ty))
        ;; Swap adjacent single bits.
        (bit_mask Gpr (imm ty (u64_and width_mask 0x5555555555555555)))
        (bits_lo Gpr (x64_and ty val bit_mask))
        (bits_hi Gpr (x64_and ty (x64_shr ty val (Imm8Gpr.Imm8 1)) bit_mask))
        (bits_swapped Gpr
          (x64_or ty (x64_shl ty bits_lo (Imm8Gpr.Imm8 1)) bits_hi))
        ;; Swap adjacent 2-bit groups.
        (pair_mask Gpr (imm ty (u64_and width_mask 0x3333333333333333)))
        (pairs_lo Gpr (x64_and ty bits_swapped pair_mask))
        (pairs_hi Gpr (x64_and ty (x64_shr ty bits_swapped (Imm8Gpr.Imm8 2)) pair_mask))
        (pairs_swapped Gpr
          (x64_or ty (x64_shl ty pairs_lo (Imm8Gpr.Imm8 2)) pairs_hi))
        ;; Swap adjacent nibbles.
        (nibble_mask Gpr (imm ty (u64_and width_mask 0x0f0f0f0f0f0f0f0f)))
        (nibbles_lo Gpr (x64_and ty pairs_swapped nibble_mask))
        (nibbles_hi Gpr (x64_and ty (x64_shr ty pairs_swapped (Imm8Gpr.Imm8 4)) nibble_mask))
        (nibbles_swapped Gpr
          (x64_or ty (x64_shl ty nibbles_lo (Imm8Gpr.Imm8 4)) nibbles_hi)))
    nibbles_swapped))
2560
;; Reverses the bits inside every 16-bit group of `src`: runs `do_bitrev8`,
;; then swaps adjacent bytes.
(decl do_bitrev16 (Type Gpr) Gpr)
(rule (do_bitrev16 ty val)
  (let ((bytes_done Gpr (do_bitrev8 ty val))
        (width_mask u64 (ty_mask ty))
        (byte_mask Gpr (imm ty (u64_and width_mask 0x00ff00ff00ff00ff)))
        (bytes_lo Gpr (x64_and ty bytes_done byte_mask))
        (bytes_hi Gpr (x64_and ty (x64_shr ty bytes_done (Imm8Gpr.Imm8 8)) byte_mask))
        (bytes_swapped Gpr
          (x64_or ty (x64_shl ty bytes_lo (Imm8Gpr.Imm8 8)) bytes_hi)))
    bytes_swapped))
2572
;; Reverses the bits inside every 32-bit group of `src`: runs `do_bitrev16`,
;; then swaps adjacent 16-bit halves.
(decl do_bitrev32 (Type Gpr) Gpr)
(rule (do_bitrev32 ty val)
  (let ((halves_done Gpr (do_bitrev16 ty val))
        (width_mask u64 (ty_mask ty))
        (half_mask Gpr (imm ty (u64_and width_mask 0x0000ffff0000ffff)))
        (halves_lo Gpr (x64_and ty halves_done half_mask))
        (halves_hi Gpr (x64_and ty (x64_shr ty halves_done (Imm8Gpr.Imm8 16)) half_mask))
        (halves_swapped Gpr
          (x64_or ty (x64_shl ty halves_lo (Imm8Gpr.Imm8 16)) halves_hi)))
    halves_swapped))
2584
;; Reverses all 64 bits of `src`: runs `do_bitrev32`, then swaps the two
;; 32-bit halves. Only defined for `$I64`.
(decl do_bitrev64 (Type Gpr) Gpr)
(rule (do_bitrev64 ty @ $I64 val)
  (let ((words_done Gpr (do_bitrev32 ty val))
        (low_word_mask Gpr (imm ty 0xffffffff))
        (low_word Gpr (x64_and ty words_done low_word_mask))
        ;; No mask needed here: the logical shift already clears the top.
        (high_word Gpr (x64_shrq_mi words_done 32))
        (words_swapped Gpr
          (x64_or ty (x64_shlq_mi low_word 32) high_word)))
    words_swapped))
2595
2596;; Rules for `bswap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2597
;; The x64 `bswap` instruction only handles 32- and 64-bit operands, so the
;; 16-bit swap is done as a rotate-left by 8.
(rule (lower (has_type $I16 (bswap _ val)))
  (x64_rolw_mi val 8))

(rule (lower (has_type $I32 (bswap _ val)))
  (x64_bswap $I32 val))

(rule (lower (has_type $I64 (bswap _ val)))
  (x64_bswap $I64 val))

;; 128-bit: byte-swap each half and exchange them, so the swapped upper half
;; becomes register 0 of the result.
(rule (lower (has_type $I128 (bswap _ val)))
  (value_regs
    (x64_bswap $I64 (value_regs_get_gpr val 1))
    (x64_bswap $I64 (value_regs_get_gpr val 0))))
2613
2614;; Rules for `uextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2615
;; I{8,16,32,64} -> I128: zero-extend into the low register; the high
;; register is a constant zero.
(rule (lower (has_type $I128 (uextend _ narrow)))
  (value_regs (extend_to_gpr narrow $I64 (ExtendKind.Zero)) (imm $I64 0)))

;; I{8,16,32} -> I64.
(rule (lower (has_type $I64 (uextend _ narrow)))
  (extend_to_gpr narrow $I64 (ExtendKind.Zero)))

;; I{8,16} -> I32
;; I8 -> I16
;; Both are extended to 32 bits.
(rule -1 (lower (has_type (fits_in_32 _) (uextend _ narrow)))
  (extend_to_gpr narrow $I32 (ExtendKind.Zero)))
2628
2629;; Rules for `sextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2630
;; I{8,16,32,64} -> I128.
;;
;; The low register is the source sign-extended to 64 bits. The high
;; register is that value arithmetically shifted right by 63, which spreads
;; the sign bit across all 64 bits.
(rule (lower (has_type $I128 (sextend _ narrow)))
  (let ((low_half Gpr (extend_to_gpr narrow $I64 (ExtendKind.Sign)))
        (high_half Gpr (x64_sarq_mi low_half 63)))
    (value_regs low_half high_half)))

;; I{8,16,32} -> I64.
(rule (lower (has_type $I64 (sextend _ narrow)))
  (extend_to_gpr narrow $I64 (ExtendKind.Sign)))

;; I{8,16} -> I32
;; I8 -> I16
;; Both are extended to 32 bits.
(rule -1 (lower (has_type (fits_in_32 _) (sextend _ narrow)))
  (extend_to_gpr narrow $I32 (ExtendKind.Sign)))
2648
2649;; Rules for `ireduce` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2650
;; T -> T is always a no-op, even I128 -> I128: the source is returned
;; unchanged.
(rule (lower (has_type same_ty (ireduce _ wide @ (value_type same_ty))))
  wide)

;; T -> I{64,32,16,8}: pass register 0 of the source straight through.
;; Values are always stored with their high bits undefined, so nothing needs
;; to be cleared.
(rule 1 (lower (has_type (fits_in_64 ty) (ireduce _ wide)))
  (value_regs_get_gpr wide 0))
2660
2661;; Rules for `debugtrap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2662
;; Emitted purely for its side effect; produces no value.
(rule (lower (debugtrap)) (side_effect (x64_int3_zo)))
2665
2666;; Rules for `x86_pmaddubsw` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2667
;; Only lowered when SSSE3 is available. Note that the CLIF operands are
;; handed to the machine instruction in swapped order.
(rule (lower (has_type $I16X8 (x86_pmaddubsw _ first second)))
  (if-let true (has_ssse3))
  (x64_pmaddubsw second first))
2671
2672;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2673
(rule (lower (has_type $F32 (fadd _ lhs rhs)))
  (x64_addss lhs rhs))
(rule (lower (has_type $F64 (fadd _ lhs rhs)))
  (x64_addsd lhs rhs))
(rule (lower (has_type $F32X4 (fadd _ lhs rhs)))
  (x64_addps lhs rhs))
(rule (lower (has_type $F64X2 (fadd _ lhs rhs)))
  (x64_addpd lhs rhs))

;; The rules above already sink a load found in the right-hand operand. The
;; rules below also cover a sinkable load on the left by swapping the
;; operands.
(rule 1 (lower (has_type $F32 (fadd _ (sinkable_load loaded) other)))
  (x64_addss other loaded))
(rule 1 (lower (has_type $F64 (fadd _ (sinkable_load loaded) other)))
  (x64_addsd other loaded))
(rule 1 (lower (has_type $F32X4 (fadd _ (sinkable_load loaded) other)))
  (x64_addps other loaded))
(rule 1 (lower (has_type $F64X2 (fadd _ (sinkable_load loaded) other)))
  (x64_addpd other loaded))
2693
2694;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2695
;; One rule per scalar and 128-bit vector float type; operand order is kept.
(rule (lower (has_type $F32 (fsub _ lhs rhs)))
  (x64_subss lhs rhs))
(rule (lower (has_type $F64 (fsub _ lhs rhs)))
  (x64_subsd lhs rhs))
(rule (lower (has_type $F32X4 (fsub _ lhs rhs)))
  (x64_subps lhs rhs))
(rule (lower (has_type $F64X2 (fsub _ lhs rhs)))
  (x64_subpd lhs rhs))
2704
2705;; Rules for `fmul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2706
(rule (lower (has_type $F32 (fmul _ lhs rhs)))
  (x64_mulss lhs rhs))
(rule (lower (has_type $F64 (fmul _ lhs rhs)))
  (x64_mulsd lhs rhs))
(rule (lower (has_type $F32X4 (fmul _ lhs rhs)))
  (x64_mulps lhs rhs))
(rule (lower (has_type $F64X2 (fmul _ lhs rhs)))
  (x64_mulpd lhs rhs))

;; The rules above already sink a load found in the right-hand operand. The
;; rules below also cover a sinkable load on the left by swapping the
;; operands.
(rule 1 (lower (has_type $F32 (fmul _ (sinkable_load loaded) other)))
  (x64_mulss other loaded))
(rule 1 (lower (has_type $F64 (fmul _ (sinkable_load loaded) other)))
  (x64_mulsd other loaded))
(rule 1 (lower (has_type $F32X4 (fmul _ (sinkable_load loaded) other)))
  (x64_mulps other loaded))
(rule 1 (lower (has_type $F64X2 (fmul _ (sinkable_load loaded) other)))
  (x64_mulpd other loaded))
2726
2727;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2728
;; One rule per scalar and 128-bit vector float type; operand order is kept.
(rule (lower (has_type $F32 (fdiv _ dividend divisor)))
  (x64_divss dividend divisor))
(rule (lower (has_type $F64 (fdiv _ dividend divisor)))
  (x64_divsd dividend divisor))
(rule (lower (has_type $F32X4 (fdiv _ dividend divisor)))
  (x64_divps dividend divisor))
(rule (lower (has_type $F64X2 (fdiv _ dividend divisor)))
  (x64_divpd dividend divisor))
2737
2738;; Rules for `sqrt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; The scalar forms take a zeroed vector register as their first operand;
;; the packed forms take only the source.
(rule (lower (has_type $F32 (sqrt _ operand)))
  (x64_sqrtss (xmm_zero $F32X4) operand))
(rule (lower (has_type $F64 (sqrt _ operand)))
  (x64_sqrtsd (xmm_zero $F64X2) operand))
(rule (lower (has_type $F32X4 (sqrt _ operand)))
  (x64_sqrtps operand))
(rule (lower (has_type $F64X2 (sqrt _ operand)))
  (x64_sqrtpd operand))
2747
2748;; Rules for `fpromote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Scalar conversion to F64; the first operand is a zeroed vector register.
(rule (lower (has_type $F64 (fpromote _ narrow)))
  (x64_cvtss2sd (xmm_zero $F64X2) narrow))
2751
2752;; Rules for `fvpromote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; The source is explicitly placed in an XMM register before converting.
(rule (lower (has_type $F64X2 (fvpromote_low _ narrow_vec)))
  (x64_cvtps2pd (put_in_xmm narrow_vec)))
2755
2756;; Rules for `fdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Scalar conversion to F32; the first operand is a zeroed vector register.
(rule (lower (has_type $F32 (fdemote _ wide)))
  (x64_cvtsd2ss (xmm_zero $F32X4) wide))
2759
2760;; Rules for `fvdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Packed conversion producing an F32X4 result.
(rule (lower (has_type $F32X4 (fvdemote _ wide_vec)))
  (x64_cvtpd2ps wide_vec))
2763
2764;; Rules for `fmin` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2765
;; Scalar fmin goes through `xmm_min_max_seq`, with `true` passed as its
;; second argument.
(rule (lower (has_type $F32 (fmin _ lhs rhs)))
  (xmm_min_max_seq $F32 true lhs rhs))
(rule (lower (has_type $F64 (fmin _ lhs rhs)))
  (xmm_min_max_seq $F64 true lhs rhs))
2770
2771;; Vector-typed version. We don't use single pseudoinstructions as
2772;; above, because we don't need to generate a mini-CFG. Instead, we
2773;; perform a branchless series of operations.
2774;;
2775;; We cannot simply use native min instructions (minps, minpd) because
2776;; NaN handling is different per CLIF semantics than on
2777;; x86. Specifically, if an argument is NaN, or the arguments are both
2778;; zero but of opposite signs, then the x86 instruction always
2779;; produces the second argument. However, per CLIF semantics, we
2780;; require that fmin(NaN, _) = fmin(_, NaN) = NaN, and fmin(+0, -0) =
2781;; fmin(-0, +0) = -0.
2782
(rule (lower (has_type $F32X4 (fmin _ x y)))
      ;; Branchless F32X4 minimum with CLIF NaN and signed-zero semantics.
      ;;
      ;; Compute min(x, y) and min(y, x) with native
      ;; instructions. These will differ in one of the edge cases
      ;; above that we have to handle properly. (Conversely, if they
      ;; don't differ, then the native instruction's answer is the
      ;; right one per CLIF semantics.)
      (let ((x Xmm x) ;; force x/y into registers and disallow load sinking
            (y Xmm y)
            (min1 Xmm (x64_minps x y))
            (min2 Xmm (x64_minps y x))
            ;; Compute the OR of the two. Note that NaNs have an
            ;; exponent field of all-ones (0xFF for F32), so if either
            ;; result is a NaN, this OR will be. And if either is a
            ;; zero (which has an exponent of 0 and mantissa of 0),
            ;; this captures a sign-bit of 1 (negative) if either
            ;; input is negative.
            ;;
            ;; In the case where we don't have a +/-0 mismatch or
            ;; NaNs, then `min1` and `min2` are equal and `min_or` is
            ;; the correct minimum.
            (min_or Xmm (x64_orps min1 min2))
            ;; "compare unordered" produces a true mask (all ones) in
            ;; a given lane if either compared value (`min_or` or
            ;; `min2`) is a NaN in that lane. We use this to generate a
            ;; mask to ensure quiet NaNs.
            (is_nan_mask Xmm (x64_cmpps min_or min2 (FcmpImm.Unordered)))
            ;; OR in the NaN mask, so NaN lanes become all ones.
            (min_or_2 Xmm (x64_orps min_or is_nan_mask))
            ;; Shift the NaN mask down so that it covers just the
            ;; fraction below the NaN signalling bit; we'll use this
            ;; to mask off non-canonical NaN payloads.
            ;;
            ;; All-ones for NaN, shifted down to leave 10 top bits (1
            ;; sign, 8 exponent, 1 QNaN bit that must remain set)
            ;; cleared.
            (nan_fraction_mask Xmm (x64_psrld is_nan_mask (xmi_imm 10)))
            ;; Do a NAND, so that we retain every bit not set in
            ;; `nan_fraction_mask`. This mask will be all zeroes (so
            ;; we retain every bit) in non-NaN cases, and will have
            ;; ones (so we clear those bits) in NaN-payload bits
            ;; otherwise.
            (final Xmm (x64_andnps nan_fraction_mask min_or_2)))
        final))
2825
;; The same sequence for F64 lanes. The right-shift is by 13 bits here
;; (1 sign, 11 exponent, 1 QNaN bit), and the unordered compare is taken
;; between the two native minima directly.
(rule (lower (has_type $F64X2 (fmin _ lhs rhs)))
  (let ((a Xmm lhs) ;; force both operands into registers; no load sinking
        (b Xmm rhs)
        (min_ab Xmm (x64_minpd a b))
        (min_ba Xmm (x64_minpd b a))
        (merged Xmm (x64_orpd min_ab min_ba))
        (nan_lanes Xmm (x64_cmppd min_ab min_ba (FcmpImm.Unordered)))
        (merged_with_nan Xmm (x64_orpd merged nan_lanes))
        (payload_bits Xmm (x64_psrlq nan_lanes (xmi_imm 13)))
        (result Xmm (x64_andnpd payload_bits merged_with_nan)))
    result))
2839
2840;; Rules for `fmax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2841
;; Scalar fmax goes through `xmm_min_max_seq`, with `false` passed as its
;; second argument.
(rule (lower (has_type $F32 (fmax _ lhs rhs)))
  (xmm_min_max_seq $F32 false lhs rhs))
(rule (lower (has_type $F64 (fmax _ lhs rhs)))
  (xmm_min_max_seq $F64 false lhs rhs))
2846
2847;; The vector version of fmax here is a dual to the fmin sequence
2848;; above, almost, with a few differences.
2849
(rule (lower (has_type $F32X4 (fmax _ x y)))
      ;; Branchless F32X4 maximum with CLIF NaN and signed-zero semantics.
      ;;
      ;; Compute max(x, y) and max(y, x) with native
      ;; instructions. These will differ in one of the edge cases
      ;; above that we have to handle properly. (Conversely, if they
      ;; don't differ, then the native instruction's answer is the
      ;; right one per CLIF semantics.)
      (let ((x Xmm x) ;; force x/y into registers and disallow load sinking
            (y Xmm y)
            (max1 Xmm (x64_maxps x y))
            (max2 Xmm (x64_maxps y x))
            ;; Compute the XOR of the two maxima. In the case
            ;; where we don't have a +/-0 mismatch or NaNs, then
            ;; `max1` and `max2` are equal and this XOR is zero.
            (max_xor Xmm (x64_xorps max1 max2))
            ;; OR the XOR into one of the original maxima. If they are
            ;; equal, this does nothing. If max2 was NaN, its exponent
            ;; bits were all-ones, so the xor's exponent bits were the
            ;; complement of max1, and the OR of max1 and max_xor has
            ;; an all-ones exponent (is a NaN). If max1 was NaN, then
            ;; its exponent bits were already all-ones, so the OR will
            ;; be a NaN as well.
            (max_blended_nan Xmm (x64_orps max1 max_xor))
            ;; Subtract the XOR. This ensures that if we had +0 and
            ;; -0, we end up with +0.
            (max_blended_nan_positive Xmm (x64_subps max_blended_nan max_xor))
            ;; "compare unordered" of `max_blended_nan` with itself
            ;; produces a true mask (all ones) in a given lane if the
            ;; max is a NaN. We use this to generate a mask to ensure
            ;; quiet NaNs.
            (is_nan_mask Xmm (x64_cmpps max_blended_nan max_blended_nan (FcmpImm.Unordered)))
            ;; Shift the NaN mask down so that it covers just the
            ;; fraction below the NaN signalling bit; we'll use this
            ;; to mask off non-canonical NaN payloads.
            ;;
            ;; All-ones for NaN, shifted down to leave 10 top bits (1
            ;; sign, 8 exponent, 1 QNaN bit that must remain set)
            ;; cleared.
            (nan_fraction_mask Xmm (x64_psrld is_nan_mask (xmi_imm 10)))
            ;; Do a NAND, so that we retain every bit not set in
            ;; `nan_fraction_mask`. This mask will be all zeroes (so
            ;; we retain every bit) in non-NaN cases, and will have
            ;; ones (so we clear those bits) in NaN-payload bits
            ;; otherwise.
            (final Xmm (x64_andnps nan_fraction_mask max_blended_nan_positive)))
        final))
2894
(rule (lower (has_type $F64X2 (fmax _ a b)))
      ;; Evaluate the native maximum with both operand orders, max(a, b) and
      ;; max(b, a). The two results disagree only in the edge cases that need
      ;; fixing up (NaN inputs, +0 versus -0); whenever they agree, the native
      ;; answer is already the one CLIF semantics ask for.
      (let ((a_reg Xmm a) ;; pin both operands to registers so no load is sunk
            (b_reg Xmm b)
            (max_ab Xmm (x64_maxpd a_reg b_reg))
            (max_ba Xmm (x64_maxpd b_reg a_reg))
            ;; XOR of the two maxima. Without a +/-0 mismatch or a NaN,
            ;; `max_ab` and `max_ba` are equal and this XOR is zero.
            (diff Xmm (x64_xorpd max_ab max_ba))
            ;; Fold the XOR into one of the maxima with an OR. Equal maxima
            ;; are left untouched. If `max_ba` was NaN its exponent bits were
            ;; all ones, so the XOR's exponent bits are the complement of
            ;; `max_ab`'s and the OR has an all-ones exponent (it is a NaN).
            ;; If `max_ab` was NaN its exponent was already all ones, so the
            ;; OR is a NaN in that case too.
            (nan_merged Xmm (x64_orpd max_ab diff))
            ;; Subtract the XOR so that a +0/-0 pair ends up as +0.
            (nan_merged_pos Xmm (x64_subpd nan_merged diff))
            ;; "Compare unordered" of the merged value against itself yields
            ;; an all-ones lane exactly where that lane is a NaN. The mask is
            ;; used below to force quiet, canonical NaNs.
            (nan_lanes Xmm (x64_cmppd nan_merged nan_merged (FcmpImm.Unordered)))
            ;; Shift the NaN mask right so its top 13 bits (1 sign, 11
            ;; exponent, and the 1 QNaN bit that must stay set) are cleared;
            ;; what remains covers only the fraction bits below the QNaN bit.
            (payload_mask Xmm (x64_psrlq nan_lanes (xmi_imm 13)))
            ;; AND-NOT: keep every bit that is not set in `payload_mask`.
            ;; The mask is all zeroes for non-NaN lanes (nothing is cleared)
            ;; and covers the NaN payload bits otherwise (they are cleared).
            (result Xmm (x64_andnpd payload_mask nan_merged_pos)))
        result))
2940
2941;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2942
;; The base case for `fma` calls out to one of two libcalls. Vector operands are
;; decomposed into lanes, each lane is handled by its own libcall, and the
;; per-lane results are then recomposed into a vector.
2945
;; Scalar fallback: hand all three operands to the libcall matching the
;; float width.
(rule (lower (has_type $F32 (fma _ mul1 mul2 addend)))
      (libcall_3 (LibCall.FmaF32) mul1 mul2 addend))

(rule (lower (has_type $F64 (fma _ mul1 mul2 addend)))
      (libcall_3 (LibCall.FmaF64) mul1 mul2 addend))
2950
;; `f32x4` fallback: one `FmaF32` libcall per lane. Lane 0 is computed from
;; the vectors as-is; for lanes 1..3 each operand is first run through
;; `pshufd` with the lane index as its immediate (which brings that lane down
;; to lane 0). The four scalar results are then reassembled with
;; `f32x4_insertlane`.
(rule (lower (has_type $F32X4 (fma _ x y z)))
      (let ((xs Xmm (put_in_xmm x))
            (ys Xmm (put_in_xmm y))
            (zs Xmm (put_in_xmm z))
            (r0 Xmm (libcall_3 (LibCall.FmaF32) xs ys zs))
            (r1 Xmm (libcall_3 (LibCall.FmaF32)
                               (x64_pshufd xs 1)
                               (x64_pshufd ys 1)
                               (x64_pshufd zs 1)))
            (r2 Xmm (libcall_3 (LibCall.FmaF32)
                               (x64_pshufd xs 2)
                               (x64_pshufd ys 2)
                               (x64_pshufd zs 2)))
            (r3 Xmm (libcall_3 (LibCall.FmaF32)
                               (x64_pshufd xs 3)
                               (x64_pshufd ys 3)
                               (x64_pshufd zs 3)))
            ;; Rebuild the vector one lane at a time, starting from `r0`.
            (lanes_01 Xmm (f32x4_insertlane r0 r1 1))
            (lanes_012 Xmm (f32x4_insertlane lanes_01 r2 2))
            (lanes_0123 Xmm (f32x4_insertlane lanes_012 r3 3)))
        lanes_0123))
;; `f64x2` fallback: one `FmaF64` libcall per lane. The low lane uses the
;; vectors as-is; for the high lane each operand goes through `pshufd 0xee`
;; (which copies the upper 64 bits into the lower 64 bits). `movlhps` then
;; combines the two scalar results into one vector.
(rule (lower (has_type $F64X2 (fma _ x y z)))
      (let ((xs Xmm (put_in_xmm x))
            (ys Xmm (put_in_xmm y))
            (zs Xmm (put_in_xmm z))
            (lo Xmm (libcall_3 (LibCall.FmaF64) xs ys zs))
            (hi Xmm (libcall_3 (LibCall.FmaF64)
                               (x64_pshufd xs 0xee)
                               (x64_pshufd ys 0xee)
                               (x64_pshufd zs 0xee))))
        (x64_movlhps lo hi)))
2987
2988
;; When the `fma` feature is available, take priority over the libcall rules
;; and select a native fused multiply-add through the `fmadd` helper.
(rule 1 (lower (has_type ty (fma _ mul1 mul2 addend)))
      (if-let true (use_fma))
      (fmadd ty mul1 mul2 addend))
2994
;; `fmadd` selects a `vfmadd*` form; `fnmadd` is its counterpart for a
;; negated product and selects a `vfnmadd*` form.
(decl fmadd (Type Value Value Value) Xmm)
(decl fnmadd (Type Value Value Value) Xmm)

;; Default: the `213` form. A load feeding `addend`, the value being added,
;; is sunk automatically here.
(rule (fmadd ty mul1 mul2 addend)
      (x64_vfmadd213 ty mul1 mul2 addend))

;; If one of the multiplied values is a sinkable load, switch to the `132`
;; form with that load as the last operand. Multiplication is commutative,
;; so either factor qualifies.
(rule 1 (fmadd ty (sinkable_load mul1) mul2 addend)
        (x64_vfmadd132 ty mul2 addend mul1))
(rule 2 (fmadd ty mul1 (sinkable_load mul2) addend)
        (x64_vfmadd132 ty mul1 addend mul2))

;; A negated factor is absorbed by delegating to `fnmadd` with the
;; un-negated value.
(rule 3 (fmadd ty (fneg _ mul1) mul2 addend)
        (fnmadd ty mul1 mul2 addend))
(rule 4 (fmadd ty mul1 (fneg _ mul2) addend)
        (fnmadd ty mul1 mul2 addend))

;; `fnmadd` mirrors the first three `fmadd` rules with `vfnmadd*`.
(rule (fnmadd ty mul1 mul2 addend)
      (x64_vfnmadd213 ty mul1 mul2 addend))
(rule 1 (fnmadd ty (sinkable_load mul1) mul2 addend)
        (x64_vfnmadd132 ty mul2 addend mul1))
(rule 2 (fnmadd ty mul1 (sinkable_load mul2) addend)
        (x64_vfnmadd132 ty mul1 addend mul2))
3016
3017
;; With the `fma` feature available, an `fma` whose addend is an `fneg` is
;; selected as a fused multiply-subtract of the un-negated value.
(rule 2 (lower (has_type ty (fma _ mul1 mul2 (fneg _ subtrahend))))
      (if-let true (use_fma))
      (fmsub ty mul1 mul2 subtrahend))
3021
;; `fmsub` selects a `vfmsub*` form; `fnmsub` is its counterpart for a
;; negated product and selects a `vfnmsub*` form.
(decl fmsub (Type Value Value Value) Xmm)
(decl fnmsub (Type Value Value Value) Xmm)

;; Default: the `213` form. A load feeding `subtrahend` is sunk
;; automatically here.
(rule (fmsub ty mul1 mul2 subtrahend)
      (x64_vfmsub213 ty mul1 mul2 subtrahend))

;; If one of the multiplied values is a sinkable load, switch to the `132`
;; form with that load as the last operand. Multiplication is commutative,
;; so either factor qualifies.
(rule 1 (fmsub ty (sinkable_load mul1) mul2 subtrahend)
        (x64_vfmsub132 ty mul2 subtrahend mul1))
(rule 2 (fmsub ty mul1 (sinkable_load mul2) subtrahend)
        (x64_vfmsub132 ty mul1 subtrahend mul2))

;; A negated factor is absorbed by delegating to `fnmsub` with the
;; un-negated value.
(rule 3 (fmsub ty (fneg _ mul1) mul2 subtrahend)
        (fnmsub ty mul1 mul2 subtrahend))
(rule 4 (fmsub ty mul1 (fneg _ mul2) subtrahend)
        (fnmsub ty mul1 mul2 subtrahend))

;; `fnmsub` mirrors the first three `fmsub` rules with `vfnmsub*`.
(rule (fnmsub ty mul1 mul2 subtrahend)
      (x64_vfnmsub213 ty mul1 mul2 subtrahend))
(rule 1 (fnmsub ty (sinkable_load mul1) mul2 subtrahend)
        (x64_vfnmsub132 ty mul2 subtrahend mul1))
(rule 2 (fnmsub ty mul1 (sinkable_load mul2) subtrahend)
        (x64_vfnmsub132 ty mul1 subtrahend mul2))
3043
3044
3045;; Rules for `load*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3046
3047;; In order to load a value from memory to a GPR register, we may need to extend
3048;; the loaded value from 8-, 16-, or 32-bits to this backend's expected GPR
3049;; width: 64 bits. Note that `ext_mode` will load 1-bit types (booleans) as
3050;; 8-bit loads.
3051;;
3052;; By default, we zero-extend all sub-64-bit loads to a GPR.
;; GPR types of 32 bits or fewer: zero-extending load up to 64 bits, with the
;; extension mode derived from the bit width of `ty`.
(rule load_sub64_x64_movzx -4
      (lower (has_type (and (fits_in_32 ty) (is_gpr_type _))
                       (load _ (little_or_native_endian mflags) addr off)))
      (x64_movzx (ext_mode (ty_bits_u16 ty) 64)
                 (to_amode mflags addr off)))

;; When source and destination are both 64 bits wide there is nothing to
;; extend, so a plain `mov` from memory is enough.
(rule load_64_x64_movzx -1
      (lower (has_type (ty_int_ref_64 ty)
                       (load _ (little_or_native_endian mflags) addr off)))
      (x64_mov (to_amode mflags addr off)))
;; Scalar loads that carry their own source width and extension kind
;; (`sload*` -> sign-extending `movsx`, `uload*` -> zero-extending `movzx`).
;; The full 64-bit GPR is always written, even when the result type is
;; narrower (e.g. 16 bits).

;; 8-bit source.
(rule (lower (has_type (is_gpr_type _)
                       (uload8 _ (little_or_native_endian mflags) addr off)))
      (x64_movzx (ExtMode.BQ) (to_amode mflags addr off)))
(rule (lower (has_type (is_gpr_type _)
                       (sload8 _ (little_or_native_endian mflags) addr off)))
      (x64_movsx (ExtMode.BQ) (to_amode mflags addr off)))

;; 16-bit source.
(rule (lower (has_type (is_gpr_type _)
                       (uload16 _ (little_or_native_endian mflags) addr off)))
      (x64_movzx (ExtMode.WQ) (to_amode mflags addr off)))
(rule (lower (has_type (is_gpr_type _)
                       (sload16 _ (little_or_native_endian mflags) addr off)))
      (x64_movsx (ExtMode.WQ) (to_amode mflags addr off)))

;; 32-bit source.
(rule (lower (has_type (is_gpr_type _)
                       (uload32 _ (little_or_native_endian mflags) addr off)))
      (x64_movzx (ExtMode.LQ) (to_amode mflags addr off)))
(rule (lower (has_type (is_gpr_type _)
                       (sload32 _ (little_or_native_endian mflags) addr off)))
      (x64_movsx (ExtMode.LQ) (to_amode mflags addr off)))
3075
3076;; To load to XMM registers, we use the x64-specific instructions for each type.
3077;; For `$F32` and `$F64` this is important--we only want to load 32 or 64 bits.
3078;; But for the 128-bit types, this is not strictly necessary for performance but
3079;; might help with clarity during disassembly.
;; 16-bit XMM types: insert the loaded word into lane 0 of an uninitialized
;; register with `pinsrw`.
(rule 4 (lower (has_type (is_xmm_type (ty_16 _))
                         (load _ (little_or_native_endian mflags) addr off)))
        (x64_pinsrw (xmm_uninit_value) (to_amode mflags addr off) 0))

;; 32-bit XMM types: `movss`.
(rule 3 (lower (has_type (is_xmm_type (ty_32 _))
                         (load _ (little_or_native_endian mflags) addr off)))
        (x64_movss_load (to_amode mflags addr off)))

;; 64-bit XMM types: `movsd`.
(rule 2 (lower (has_type (is_xmm_type (ty_64 _))
                         (load _ (little_or_native_endian mflags) addr off)))
        (x64_movsd_load (to_amode mflags addr off)))

;; 128-bit float vectors get the unaligned move matching their lane type.
(rule 1 (lower (has_type $F32X4
                         (load _ (little_or_native_endian mflags) addr off)))
        (x64_movups_load (to_amode mflags addr off)))
(rule 1 (lower (has_type $F64X2
                         (load _ (little_or_native_endian mflags) addr off)))
        (x64_movupd_load (to_amode mflags addr off)))

;; Every remaining 128-bit XMM type: `movdqu`.
(rule 0 (lower (has_type (is_xmm_type (ty_128 _))
                         (load _ (little_or_native_endian mflags) addr off)))
        (x64_movdqu_load (to_amode mflags addr off)))
3092
;; An `i128` load is split into two 64-bit loads: the low half from the
;; computed address and the high half from that address plus 8.
(rule -3 (lower (has_type $I128
                          (load _ (little_or_native_endian mflags) addr off)))
      (let ((lo_addr SyntheticAmode (to_amode mflags addr off))
            (hi_addr SyntheticAmode (amode_offset lo_addr mflags 8))
            (lo Reg (x64_mov lo_addr))
            (hi Reg (x64_mov hi_addr)))
        (value_regs lo hi)))
3101
3102;; We also include widening vector loads; these sign- or zero-extend each lane
3103;; to the next wider width (e.g., 16x4 -> 32x4).
;; With SSE4.1 each widening load maps onto a single `pmovsx*` / `pmovzx*`
;; taking its operand straight from memory.

;; 8x8 -> 16x8
(rule 1 (lower (has_type $I16X8
                         (sload8x8 _ (little_or_native_endian mflags) addr off)))
        (if-let true (has_sse41))
        (x64_pmovsxbw (to_amode mflags addr off)))
(rule 1 (lower (has_type $I16X8
                         (uload8x8 _ (little_or_native_endian mflags) addr off)))
        (if-let true (has_sse41))
        (x64_pmovzxbw (to_amode mflags addr off)))

;; 16x4 -> 32x4
(rule 1 (lower (has_type $I32X4
                         (sload16x4 _ (little_or_native_endian mflags) addr off)))
        (if-let true (has_sse41))
        (x64_pmovsxwd (to_amode mflags addr off)))
(rule 1 (lower (has_type $I32X4
                         (uload16x4 _ (little_or_native_endian mflags) addr off)))
        (if-let true (has_sse41))
        (x64_pmovzxwd (to_amode mflags addr off)))

;; 32x2 -> 64x2
(rule 1 (lower (has_type $I64X2
                         (sload32x2 _ (little_or_native_endian mflags) addr off)))
        (if-let true (has_sse41))
        (x64_pmovsxdq (to_amode mflags addr off)))
(rule 1 (lower (has_type $I64X2
                         (uload32x2 _ (little_or_native_endian mflags) addr off)))
        (if-let true (has_sse41))
        (x64_pmovzxdq (to_amode mflags addr off)))
3122
;; Lower-priority versions of the widening loads, with no SSE4.1 guard:
;; load 64 bits into an XMM register with `movq`, then widen the low half
;; through the `lower_swiden_low` / `lower_uwiden_low` helpers.

;; 8x8 -> 16x8
(rule (lower (has_type $I16X8
                       (sload8x8 _ (little_or_native_endian mflags) addr off)))
      (lower_swiden_low $I16X8 (x64_movq_to_xmm (to_amode mflags addr off))))
(rule (lower (has_type $I16X8
                       (uload8x8 _ (little_or_native_endian mflags) addr off)))
      (lower_uwiden_low $I16X8 (x64_movq_to_xmm (to_amode mflags addr off))))

;; 16x4 -> 32x4
(rule (lower (has_type $I32X4
                       (sload16x4 _ (little_or_native_endian mflags) addr off)))
      (lower_swiden_low $I32X4 (x64_movq_to_xmm (to_amode mflags addr off))))
(rule (lower (has_type $I32X4
                       (uload16x4 _ (little_or_native_endian mflags) addr off)))
      (lower_uwiden_low $I32X4 (x64_movq_to_xmm (to_amode mflags addr off))))

;; 32x2 -> 64x2
(rule (lower (has_type $I64X2
                       (sload32x2 _ (little_or_native_endian mflags) addr off)))
      (lower_swiden_low $I64X2 (x64_movq_to_xmm (to_amode mflags addr off))))
(rule (lower (has_type $I64X2
                       (uload32x2 _ (little_or_native_endian mflags) addr off)))
      (lower_uwiden_low $I64X2 (x64_movq_to_xmm (to_amode mflags addr off))))
3135
3136;; Rules for `store*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3137
;; Stores of GPR-held values (8, 16, 32 and 64 bits): a single register-to-
;; memory `mov` whose width comes from the stored value's type.
(rule store_x64_movrm -2
      (lower (store (little_or_native_endian mflags)
                    src @ (value_type (is_gpr_type ty))
                    addr
                    off))
      (side_effect
       (x64_movrm ty (to_amode mflags addr off) src)))
3145
;; The truncating store opcodes fix the store width themselves, regardless
;; of the type of the value being stored.

;; `istore8`: 8-bit store.
(rule (lower (istore8 (little_or_native_endian mflags) src addr off))
      (side_effect
       (x64_movrm $I8 (to_amode mflags addr off) src)))

;; `istore16`: 16-bit store.
(rule (lower (istore16 (little_or_native_endian mflags) src addr off))
      (side_effect
       (x64_movrm $I16 (to_amode mflags addr off) src)))

;; `istore32`: 32-bit store.
(rule (lower (istore32 (little_or_native_endian mflags) src addr off))
      (side_effect
       (x64_movrm $I32 (to_amode mflags addr off) src)))
3156
;; Immediate stores: when the stored value is an `iconst` accepted by
;; `i32_from_iconst`, write the immediate directly to memory instead of
;; going through a register.
(rule 4 (lower (store (little_or_native_endian mflags)
                      src @ (value_type (fits_in_64 ty))
                      addr
                      off))
      (if-let (i32_from_iconst simm) src)
      (side_effect
       (x64_movimm_m ty (to_amode mflags addr off) simm)))
3162
;; Stores of 16-bit values held in XMM registers.

;; Without a feature guard: bitcast the value to a GPR, then emit a 16-bit
;; integer store.
(rule -2 (lower (store (little_or_native_endian mflags)
                       src @ (value_type (is_xmm_type (ty_16 _)))
                       addr
                       off))
      (side_effect
       (x64_movrm $I16
                  (to_amode mflags addr off)
                  (bitcast_xmm_to_gpr 16 src))))

;; Preferred when SSE4.1 is available: `pextrw` stores lane 0 straight to
;; memory.
(rule -1 (lower (store (little_or_native_endian mflags)
                       src @ (value_type (is_xmm_type (ty_16 _)))
                       addr
                       off))
      (if-let true (has_sse41))
      (side_effect
       (x64_pextrw_store (to_amode mflags addr off) src 0)))
3178
;; Stores of 32-bit values held in XMM registers: `movss`.
(rule -3 (lower (store (little_or_native_endian mflags)
                       src @ (value_type (is_xmm_type (ty_32 _)))
                       addr
                       off))
      (side_effect
       (x64_movss_store (to_amode mflags addr off) src)))

;; Stores of 64-bit values held in XMM registers: `movsd`.
(rule -4 (lower (store (little_or_native_endian mflags)
                       src @ (value_type (is_xmm_type (ty_64 _)))
                       addr
                       off))
      (side_effect
       (x64_movsd_store (to_amode mflags addr off) src)))
3194
;; `f32x4` vectors are stored with `movups`.
(rule 1 (lower (store (little_or_native_endian mflags)
                      src @ (value_type $F32X4)
                      addr
                      off))
      (side_effect
       (x64_movups_store (to_amode mflags addr off) src)))

;; `f64x2` vectors are stored with `movupd`.
(rule 1 (lower (store (little_or_native_endian mflags)
                      src @ (value_type $F64X2)
                      addr
                      off))
      (side_effect
       (x64_movupd_store (to_amode mflags addr off) src)))

;; Every other 128-bit XMM type (the integer-lane vectors) is stored with
;; `movdqu`.
(rule -5 (lower (store (little_or_native_endian mflags)
                       src @ (value_type (is_xmm_type (ty_128 _)))
                       addr
                       off))
      (side_effect
       (x64_movdqu_store (to_amode mflags addr off) src)))
3218
;; An `i128` store is split into two 64-bit stores: register 0 of the value
;; goes to the computed address and register 1 to that address plus 8.
(rule 0 (lower (store (little_or_native_endian mflags)
                      src @ (value_type $I128)
                      addr
                      off))
      (let ((halves ValueRegs src)
            (lo Gpr (value_regs_get_gpr halves 0))
            (hi Gpr (value_regs_get_gpr halves 1))
            (lo_addr SyntheticAmode (to_amode mflags addr off))
            (hi_addr SyntheticAmode (amode_offset lo_addr mflags 8)))
        (side_effect
         (side_effect_concat
          (x64_movrm $I64 lo_addr lo)
          (x64_movrm $I64 hi_addr hi)))))
3233
3234;; Slightly optimize the extraction of the first lane from a vector which is
3235;; stored in memory. In the case the first lane specifically is selected the
3236;; standard `movss` and `movsd` instructions can be used as-if we're storing a
3237;; f32 or f64 despite the source perhaps being an integer vector since the
3238;; result of the instruction is the same.
;; Storing lane 0 extracted as `f32`: `movss` on the whole vector register
;; writes exactly that lane, so no separate extraction is needed.
(rule 2 (lower (store (little_or_native_endian mflags)
                      (has_type $F32 (extractlane _ vec (u8_from_uimm8 0)))
                      addr
                      off))
      (side_effect
       (x64_movss_store (to_amode mflags addr off) vec)))

;; Same for lane 0 extracted as `f64`, using `movsd`.
(rule 2 (lower (store (little_or_native_endian mflags)
                      (has_type $F64 (extractlane _ vec (u8_from_uimm8 0)))
                      addr
                      off))
      (side_effect
       (x64_movsd_store (to_amode mflags addr off) vec)))
;; Storing an integer lane extracted from a vector: with SSE4.1 the
;; `pextr{b,w,d,q}` instructions write the chosen lane directly to memory.

;; 8-bit lane.
(rule 2 (lower (store (little_or_native_endian mflags)
                      (has_type $I8 (extractlane _ vec (u8_from_uimm8 lane)))
                      addr
                      off))
      (if-let true (has_sse41))
      (side_effect
       (x64_pextrb_store (to_amode mflags addr off) vec lane)))

;; 16-bit lane.
(rule 2 (lower (store (little_or_native_endian mflags)
                      (has_type $I16 (extractlane _ vec (u8_from_uimm8 lane)))
                      addr
                      off))
      (if-let true (has_sse41))
      (side_effect
       (x64_pextrw_store (to_amode mflags addr off) vec lane)))

;; 32-bit lane.
(rule 2 (lower (store (little_or_native_endian mflags)
                      (has_type $I32 (extractlane _ vec (u8_from_uimm8 lane)))
                      addr
                      off))
      (if-let true (has_sse41))
      (side_effect
       (x64_pextrd_store (to_amode mflags addr off) vec lane)))

;; 64-bit lane.
(rule 2 (lower (store (little_or_native_endian mflags)
                      (has_type $I64 (extractlane _ vec (u8_from_uimm8 lane)))
                      addr
                      off))
      (if-let true (has_sse41))
      (side_effect
       (x64_pextrq_store (to_amode mflags addr off) vec lane)))
3279
3280;; Rules for `load*` + ALU op + `store*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3281
;; `add mem, {reg,imm}`: a store whose value is `iadd` of a sinkable load
;; from the same flags, address and offset (the repeated pattern variables
;; enforce that) becomes a single read-modify-write add on memory.
(rule store_x64_add_mem 3
      (lower (store (little_or_native_endian mflags)
                    (has_type (ty_32_or_64 ty)
                              (iadd _
                                    (and (sinkable_load sunk)
                                         (load _ mflags ptr off))
                                    operand))
                    ptr
                    off))
      ;; Converting the sinkable load marks it as consumed by this lowering;
      ;; the converted operand itself is not used.
      (let ((_ RegMemImm sunk))
        (side_effect
         (x64_add_mem ty (to_amode mflags ptr off) operand))))

;; Same as above with the load as the second `iadd` operand.
(rule 2
      (lower (store (little_or_native_endian mflags)
                    (has_type (ty_32_or_64 ty)
                              (iadd _
                                    operand
                                    (and (sinkable_load sunk)
                                         (load _ mflags ptr off))))
                    ptr
                    off))
      (let ((_ RegMemImm sunk))
        (side_effect
         (x64_add_mem ty (to_amode mflags ptr off) operand))))
3309
;; `sub mem, {reg,imm}`: a store of `isub` whose first operand is a sinkable
;; load from the same flags, address and offset becomes a single
;; read-modify-write subtract on memory. There is no swapped variant here.
(rule 2
      (lower (store (little_or_native_endian mflags)
                    (has_type (ty_32_or_64 ty)
                              (isub _
                                    (and (sinkable_load sunk)
                                         (load _ mflags ptr off))
                                    operand))
                    ptr
                    off))
      ;; Converting the sinkable load marks it as consumed by this lowering.
      (let ((_ RegMemImm sunk))
        (side_effect
         (x64_sub_mem ty (to_amode mflags ptr off) operand))))
3323
;; `and mem, {reg,imm}`: a store of `band` involving a sinkable load from
;; the same flags, address and offset becomes a single read-modify-write
;; AND on memory.
(rule 3
      (lower (store (little_or_native_endian mflags)
                    (has_type (ty_32_or_64 ty)
                              (band _
                                    (and (sinkable_load sunk)
                                         (load _ mflags ptr off))
                                    operand))
                    ptr
                    off))
      ;; Converting the sinkable load marks it as consumed by this lowering.
      (let ((_ RegMemImm sunk))
        (side_effect
         (x64_and_mem ty (to_amode mflags ptr off) operand))))

;; Same as above with the load as the second `band` operand.
(rule 2
      (lower (store (little_or_native_endian mflags)
                    (has_type (ty_32_or_64 ty)
                              (band _
                                    operand
                                    (and (sinkable_load sunk)
                                         (load _ mflags ptr off))))
                    ptr
                    off))
      (let ((_ RegMemImm sunk))
        (side_effect
         (x64_and_mem ty (to_amode mflags ptr off) operand))))
3351
;; `or mem, {reg,imm}`: a store of `bor` involving a sinkable load from the
;; same flags, address and offset becomes a single read-modify-write OR on
;; memory.
(rule 3
      (lower (store (little_or_native_endian mflags)
                    (has_type (ty_32_or_64 ty)
                              (bor _
                                   (and (sinkable_load sunk)
                                        (load _ mflags ptr off))
                                   operand))
                    ptr
                    off))
      ;; Converting the sinkable load marks it as consumed by this lowering.
      (let ((_ RegMemImm sunk))
        (side_effect
         (x64_or_mem ty (to_amode mflags ptr off) operand))))

;; Same as above with the load as the second `bor` operand.
(rule 2
      (lower (store (little_or_native_endian mflags)
                    (has_type (ty_32_or_64 ty)
                              (bor _
                                   operand
                                   (and (sinkable_load sunk)
                                        (load _ mflags ptr off))))
                    ptr
                    off))
      (let ((_ RegMemImm sunk))
        (side_effect
         (x64_or_mem ty (to_amode mflags ptr off) operand))))
3379
;; `xor mem, {reg,imm}`: a store of `bxor` involving a sinkable load from
;; the same flags, address and offset becomes a single read-modify-write
;; XOR on memory.
(rule 3
      (lower (store (little_or_native_endian mflags)
                    (has_type (ty_32_or_64 ty)
                              (bxor _
                                    (and (sinkable_load sunk)
                                         (load _ mflags ptr off))
                                    operand))
                    ptr
                    off))
      ;; Converting the sinkable load marks it as consumed by this lowering.
      (let ((_ RegMemImm sunk))
        (side_effect
         (x64_xor_mem ty (to_amode mflags ptr off) operand))))

;; Same as above with the load as the second `bxor` operand.
(rule 2
      (lower (store (little_or_native_endian mflags)
                    (has_type (ty_32_or_64 ty)
                              (bxor _
                                    operand
                                    (and (sinkable_load sunk)
                                         (load _ mflags ptr off))))
                    ptr
                    off))
      (let ((_ RegMemImm sunk))
        (side_effect
         (x64_xor_mem ty (to_amode mflags ptr off) operand))))
3407
3408;; Rules for `fence` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3409
;; `fence` lowers to a single `mfence`, emitted purely for its side effect.
(rule (lower (fence))
      (side_effect
       (x64_mfence_zo)))
3412
3413;; Rules for `func_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3414
;; Materialize a function's address from its external name and relocation
;; distance, with an offset of 0.
(rule (lower (func_addr _ (func_ref_data _ callee distance _)))
      (load_ext_name callee 0 distance))
3417
3418;; Rules for `symbol_value` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3419
;; Materialize a symbol's address from its external name, the offset
;; recorded in the symbol data, and its relocation distance.
(rule (lower (symbol_value _ (symbol_value_data symbol distance addend)))
      (load_ext_name symbol addend distance))
3422
3423;; Rules for `atomic_load` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3424
3425;; This is a normal load. The x86-TSO memory model provides sufficient
3426;; sequencing to satisfy the CLIF synchronisation requirements for `AtomicLoad`
3427;; without the need for any fence instructions.
3428;;
3429;; This lowering is only valid for I8, I16, I32, and I64. The sub-64-bit types
3430;; are zero extended, as with a normal load.
;; 64-bit atomic load: a plain `mov` from the address (offset zero).
(rule 1 (lower (has_type $I64
                         (atomic_load _ (little_or_native_endian mflags) addr)))
      (x64_mov (to_amode mflags addr (zero_offset))))

;; Integer atomic loads of 32 bits or fewer: zero-extending load to 64 bits.
(rule (lower (has_type (and (fits_in_32 ty) (ty_int _))
                       (atomic_load _ (little_or_native_endian mflags) addr)))
      (x64_movzx (ext_mode (ty_bits_u16 ty) 64)
                 (to_amode mflags addr (zero_offset))))

;; 128-bit atomic load, available only with `cmpxchg16b`: issue the
;; compare-exchange with both register-pair operands set to zero and use
;; its result as the loaded value.
(rule 1 (lower (has_type $I128
                         (atomic_load _ (little_or_native_endian mflags) addr)))
      (if-let true (has_cmpxchg16b))
      (x64_cmpxchg16b (value_regs (imm $I64 0) (imm $I64 0))
                      (value_regs (imm $I64 0) (imm $I64 0))
                      (to_amode mflags addr (zero_offset))))
3439
3440;; Rules for `atomic_store` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3441
3442;; This is a normal store followed by an `mfence` instruction. This lowering is
3443;; only valid for I8, I16, I32, and I64.
;; Integer atomic stores of up to 64 bits: an ordinary store to the address
;; (offset zero) immediately followed by `mfence`.
(rule (lower (atomic_store (little_or_native_endian mflags)
                           src @ (value_type (and (fits_in_64 ty) (ty_int _)))
                           addr))
      (side_effect
       (side_effect_concat
        (x64_movrm ty (to_amode mflags addr (zero_offset)) src)
        (x64_mfence_zo))))

;; 128-bit atomic store, available only with `cmpxchg16b`: delegated to the
;; dedicated 128-bit store sequence.
(rule 1 (lower (atomic_store (little_or_native_endian mflags)
                             src @ (value_type $I128)
                             addr))
      (if-let true (has_cmpxchg16b))
      (side_effect
       (x64_atomic_128_store_seq (to_amode mflags addr (zero_offset)) mflags src)))
3454
3455;; Rules for `atomic_cas` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3456
;; Integer compare-and-swap of up to 64 bits: a single `cmpxchg` on the
;; address (offset zero).
(rule (lower (has_type (and (fits_in_64 ty) (ty_int _))
                  (atomic_cas _ (little_or_native_endian flags) address expected replacement)))
      (x64_cmpxchg ty expected replacement (to_amode flags address (zero_offset))))

;; 128-bit compare-and-swap, available only with `cmpxchg16b`.
;;
;; The flags are matched through `little_or_native_endian`, like every other
;; memory-accessing rule in this file (including the 128-bit `atomic_load`,
;; `atomic_store` and `atomic_rmw` rules). Without that guard an explicitly
;; big-endian access would be accepted here and lowered as a little-endian
;; `cmpxchg16b`.
(rule 1 (lower (has_type $I128 (atomic_cas _ (little_or_native_endian flags) address expected replacement)))
        (if-let true (has_cmpxchg16b))
        (x64_cmpxchg16b expected replacement (to_amode flags address (zero_offset))))
3463
3464;; Rules for `atomic_rmw` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3465
;; General-case atomic read-modify-write for integers of up to 64 bits: a
;; loop built around `cmpxchg`, parameterized by the requested operation.
(rule (lower (has_type (and (fits_in_64 ty) (ty_int _))
                       (atomic_rmw _ (little_or_native_endian mflags) rmw_op addr operand)))
      (x64_atomic_rmw_seq ty
                          (atomic_rmw_seq_op rmw_op)
                          (to_amode mflags addr (zero_offset))
                          operand))
3471
;; `Add` maps directly onto `lock xadd`.
(rule 1 (lower (has_type (and (fits_in_64 ty) (ty_int _))
                         (atomic_rmw _ (little_or_native_endian mflags) (AtomicRmwOp.Add) addr operand)))
      (x64_xadd ty (to_amode mflags addr (zero_offset)) operand))

;; `Sub` reuses `lock xadd`, adding the negated operand.
(rule 1 (lower (has_type (and (fits_in_64 ty) (ty_int _))
                         (atomic_rmw _ (little_or_native_endian mflags) (AtomicRmwOp.Sub) addr operand)))
      (x64_xadd ty (to_amode mflags addr (zero_offset)) (x64_neg ty operand)))

;; `Xchg` maps directly onto `xchg`.
(rule 1 (lower (has_type (and (fits_in_64 ty) (ty_int _))
                         (atomic_rmw _ (little_or_native_endian mflags) (AtomicRmwOp.Xchg) addr operand)))
      (x64_xchg ty (to_amode mflags addr (zero_offset)) operand))
3483
;; When the old value produced by the `atomic_rmw` is never used, `Add`,
;; `Sub`, `And`, `Or` and `Xor` can each be a single `lock`-prefixed ALU
;; instruction on memory. Every rule below checks that the instruction's
;; first result is unused before taking this path.

;; `lock add`
(rule 2 (lower inst @ (has_type (fits_in_64 (ty_int ty))
                                (atomic_rmw _ (little_or_native_endian mflags) (AtomicRmwOp.Add) addr operand)))
      (if-let (first_result old) inst)
      (if-let true (value_is_unused old))
      (side_effect_as_invalid
       (x64_lock_add (raw_operand_size_of_type ty)
                     (to_amode mflags addr (zero_offset))
                     operand)))

;; `lock sub`
(rule 2 (lower inst @ (has_type (fits_in_64 (ty_int ty))
                                (atomic_rmw _ (little_or_native_endian mflags) (AtomicRmwOp.Sub) addr operand)))
      (if-let (first_result old) inst)
      (if-let true (value_is_unused old))
      (side_effect_as_invalid
       (x64_lock_sub (raw_operand_size_of_type ty)
                     (to_amode mflags addr (zero_offset))
                     operand)))

;; `lock and`
(rule 2 (lower inst @ (has_type (fits_in_64 (ty_int ty))
                                (atomic_rmw _ (little_or_native_endian mflags) (AtomicRmwOp.And) addr operand)))
      (if-let (first_result old) inst)
      (if-let true (value_is_unused old))
      (side_effect_as_invalid
       (x64_lock_and (raw_operand_size_of_type ty)
                     (to_amode mflags addr (zero_offset))
                     operand)))

;; `lock or`
(rule 2 (lower inst @ (has_type (fits_in_64 (ty_int ty))
                                (atomic_rmw _ (little_or_native_endian mflags) (AtomicRmwOp.Or) addr operand)))
      (if-let (first_result old) inst)
      (if-let true (value_is_unused old))
      (side_effect_as_invalid
       (x64_lock_or (raw_operand_size_of_type ty)
                    (to_amode mflags addr (zero_offset))
                    operand)))

;; `lock xor`
(rule 2 (lower inst @ (has_type (fits_in_64 (ty_int ty))
                                (atomic_rmw _ (little_or_native_endian mflags) (AtomicRmwOp.Xor) addr operand)))
      (if-let (first_result old) inst)
      (if-let true (value_is_unused old))
      (side_effect_as_invalid
       (x64_lock_xor (raw_operand_size_of_type ty)
                     (to_amode mflags addr (zero_offset))
                     operand)))
3511
3512;; 128-bit integers always use a `lock cmpxchg16b` loop.
;; Any `$I128` read-modify-write operation is delegated to
;; `x64_atomic_128_rmw_seq`; this rule only applies when `has_cmpxchg16b`
;; reports true.
(rule 3 (lower (has_type $I128
                         (atomic_rmw _
                                     (little_or_native_endian mflags)
                                     rmw_op
                                     addr
                                     operand)))
        (if-let true (has_cmpxchg16b))
        (x64_atomic_128_rmw_seq rmw_op
                                (to_amode mflags addr (zero_offset))
                                mflags
                                operand))
3516
3517;; Rules for `call` and `call_indirect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3518
3519;; Direct call to an in-range function.
;; `call` whose callee has `RelocDistance.Near`: emitted with `call_known`.
;; The registers allocated for the return values are the rule's result.
(rule 1 (lower (call (func_ref_data sigref callee (RelocDistance.Near) patchable)
                     call_args))
      (let ((results ValueRegsVec (gen_call_output sigref))
            (sig_abi Sig (abi_sig sigref))
            (arg_uses CallArgList (gen_call_args sig_abi call_args))
            (ret_defs CallRetList (gen_call_rets sig_abi results))
            (call_info BoxCallInfo
              (gen_call_info sig_abi callee arg_uses ret_defs (try_call_none) patchable))
            (_ Unit (emit_side_effect (call_known call_info))))
        results))
3528
3529;; Direct call to an out-of-range function (implicitly via pointer).
;; Non-patchable `call` at any other distance: the callee address is first
;; materialized with `load_ext_name`, then called through `call_unknown`.
(rule (lower (call (func_ref_data sigref callee distance false) call_args))
      (let ((results ValueRegsVec (gen_call_output sigref))
            (sig_abi Sig (abi_sig sigref))
            (arg_uses CallArgList (gen_call_args sig_abi call_args))
            (ret_defs CallRetList (gen_call_rets sig_abi results))
            (callee_addr RegMem (RegMem.Reg (load_ext_name callee 0 distance)))
            (call_info BoxCallIndInfo
              (gen_call_ind_info sig_abi callee_addr arg_uses ret_defs (try_call_none)))
            (_ Unit (emit_side_effect (call_unknown call_info))))
        results))
3539
3540;; Indirect call.
;; `call_indirect`: the callee pointer is placed in a register before the
;; arguments are set up, then the call is emitted with `call_unknown`.
(rule (lower (call_indirect sigref callee_ptr call_args))
      (let ((results ValueRegsVec (gen_call_output sigref))
            (sig_abi Sig (abi_sig sigref))
            (callee_addr RegMem (RegMem.Reg (put_in_reg callee_ptr)))
            (arg_uses CallArgList (gen_call_args sig_abi call_args))
            (ret_defs CallRetList (gen_call_rets sig_abi results))
            (call_info BoxCallIndInfo
              (gen_call_ind_info sig_abi callee_addr arg_uses ret_defs (try_call_none)))
            (_ Unit (emit_side_effect (call_unknown call_info))))
        results))
3550
3551;;;; Rules for `return_call` and `return_call_indirect` ;;;;;;;;;;;;;;;;;;;;;;;;
3552
3553;; Direct call to an in-range function.
;; Non-patchable `return_call` to a `RelocDistance.Near` callee, emitted with
;; `return_call_known`.
(rule 1 (lower (return_call (func_ref_data sigref callee (RelocDistance.Near) false)
                            call_args))
      (let ((sig_abi Sig (abi_sig sigref))
            (arg_uses CallArgList (gen_return_call_args sig_abi call_args))
            (call_info BoxReturnCallInfo
              (gen_return_call_info sig_abi callee arg_uses)))
        (side_effect (return_call_known call_info))))
3559
3560;; Direct call to an out-of-range function (implicitly via pointer).
;; Non-patchable `return_call` at any other distance: load the callee address
;; with `load_ext_name` and emit `return_call_unknown`.
(rule (lower (return_call (func_ref_data sigref callee distance false) call_args))
      (let ((sig_abi Sig (abi_sig sigref))
            (arg_uses CallArgList (gen_return_call_args sig_abi call_args))
            (callee_addr Reg (load_ext_name callee 0 distance))
            (call_info BoxReturnCallIndInfo
              (gen_return_call_ind_info sig_abi callee_addr arg_uses)))
        (side_effect (return_call_unknown call_info))))
3567
3568;; Indirect call.
;; `return_call_indirect`: the callee pointer goes into a register first, then
;; the arguments are prepared and `return_call_unknown` is emitted.
(rule (lower (return_call_indirect sigref callee_ptr call_args))
      (let ((sig_abi Sig (abi_sig sigref))
            (callee_addr Reg (put_in_reg callee_ptr))
            (arg_uses CallArgList (gen_return_call_args sig_abi call_args))
            (call_info BoxReturnCallIndInfo
              (gen_return_call_ind_info sig_abi callee_addr arg_uses)))
        (side_effect (return_call_unknown call_info))))
3575
3576;;;; Rules for `try_call` and `try_call_indirect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3577
3578;; Direct call to an in-range function.
;; `try_call` to a `RelocDistance.Near` callee. The exception table and branch
;; targets are folded into the call info via `try_call_info`.
(rule 1 (lower_branch (try_call (func_ref_data sigref callee (RelocDistance.Near) patchable)
                                call_args
                                exn_table)
                      labels)
      (let ((sig_abi Sig (abi_sig sigref))
            (try_info OptionTryCallInfo (try_call_info exn_table labels))
            (arg_uses CallArgList (gen_call_args sig_abi call_args))
            (ret_defs CallRetList (gen_try_call_rets sig_abi))
            (call_info BoxCallInfo
              (gen_call_info sig_abi callee arg_uses ret_defs try_info patchable)))
        (emit_side_effect (call_known call_info))))
3586
3587;; Direct call to an out-of-range function (implicitly via pointer).
;; Non-patchable `try_call` at any other distance: the callee address is
;; loaded with `load_ext_name` and the call goes through `call_unknown`.
(rule (lower_branch (try_call (func_ref_data sigref callee distance false)
                              call_args
                              exn_table)
                    labels)
      (let ((sig_abi Sig (abi_sig sigref))
            (try_info OptionTryCallInfo (try_call_info exn_table labels))
            (arg_uses CallArgList (gen_call_args sig_abi call_args))
            (ret_defs CallRetList (gen_try_call_rets sig_abi))
            (callee_addr RegMem (RegMem.Reg (load_ext_name callee 0 distance)))
            (call_info BoxCallIndInfo
              (gen_call_ind_info sig_abi callee_addr arg_uses ret_defs try_info)))
        (emit_side_effect (call_unknown call_info))))
3596
3597;; Indirect call.
;; `try_call_indirect`: the signature is recovered from the exception table
;; with `exception_sig`, since the instruction pattern itself carries none.
(rule (lower_branch (try_call_indirect callee_ptr call_args exn_table) labels)
      (if-let (exception_sig sigref) exn_table)
      (let ((sig_abi Sig (abi_sig sigref))
            (try_info OptionTryCallInfo (try_call_info exn_table labels))
            (callee_addr RegMem (RegMem.Reg (put_in_reg callee_ptr)))
            (arg_uses CallArgList (gen_call_args sig_abi call_args))
            (ret_defs CallRetList (gen_try_call_rets sig_abi))
            (call_info BoxCallIndInfo
              (gen_call_ind_info sig_abi callee_addr arg_uses ret_defs try_info)))
        (emit_side_effect (call_unknown call_info))))
3607
3608;; Rules for `stack_switch` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3609
3610;; currently, only the Basic model is supported
;; Only matches when `stack_switch_model` yields `StackSwitchModel.Basic`.
;; All three operands are forced into GPRs before the switch sequence.
(rule (lower (stack_switch _ store_ctx load_ctx payload))
      (if-let (StackSwitchModel.Basic) (stack_switch_model))
      (let ((store_gpr Gpr (put_in_gpr store_ctx))
            (load_gpr Gpr (put_in_gpr load_ctx))
            (payload_gpr Gpr (put_in_gpr payload)))
        (x64_stack_switch_basic store_gpr load_gpr payload_gpr)))
3617
3618;;;; Rules for `get_{frame,stack}_pointer` and `get_return_address` ;;;;;;;;;;;;
3619
;; The frame pointer is `rbp`.
(rule (lower (get_frame_pointer _)) (x64_rbp))

;; The stack pointer is `rsp`.
(rule (lower (get_stack_pointer _)) (x64_rsp))

;; The return address is read with a 64-bit move through an `ImmReg` amode
;; with displacement 8 off `rbp`, using trusted memory flags.
(rule (lower (get_return_address _))
      (x64_movq_rm
        (Amode.ImmReg 8 (x64_rbp) (mem_flags_trusted))))
3628
3629;; Rules for `jump` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3630
;; Unconditional jump to the sole branch target.
(rule (lower_branch (jump _) (single_target dest))
      (emit_side_effect
        (jmp_known dest)))
3633
3634;; Rules for `brif` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3635
;; Two-way branch on whether the condition value is nonzero.
(rule (lower_branch (brif cond _ _) (two_targets taken not_taken))
      (emit_side_effect
        (jmp_cond_result (is_nonzero_cmp cond) taken not_taken)))
3638
3639;; Rules for `br_table` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3640
;; The index is zero-extended to 64 bits and compared against the table size;
;; `cmove` with `CC.B` then selects between the index and the size, and the
;; selected value feeds the jump-table sequence.
(rule (lower_branch (br_table index @ (value_type ty) _)
                    (jump_table_targets default_label table_labels))
      (let ((entries u32 (jump_table_size table_labels))
            (bound Reg (imm ty entries))
            (wide_index Gpr (extend_to_gpr index $I64 (ExtendKind.Zero)))
            (clamped Reg (with_flags_reg
                           (x64_cmp ty wide_index bound)
                           (cmove ty (CC.B) wide_index bound))))
        (emit_side_effect
          (jmp_table_seq ty clamped default_label table_labels))))
3649
3650;; Rules for `select_spectre_guard` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3651
;; General case: a regular select on "condition is nonzero".
(rule (lower (select_spectre_guard _ guard if_true if_false))
      (lower_select (is_nonzero_cmp guard) if_true if_false))
3654
3655;; Note that for GPR-based spectre guards everything is forced into a register
3656;; not `GprMem`. The `lower_select_spectre_gpr` helper below handles "and"
3657;; conditions which the `lower_select_gpr` helper does not.
;; Types that fit a single GPR go through `lower_select_spectre_gpr`.
(rule 1 (lower (has_type (is_single_register_gpr_type ty)
                         (select_spectre_guard _ guard if_true if_false)))
      (lower_select_spectre_gpr ty
                                (is_nonzero_cmp guard)
                                (put_in_gpr if_true)
                                if_false))
3660
;; GPR select used for spectre guards.
(decl lower_select_spectre_gpr (Type CondResult Gpr Gpr) Gpr)

;; Default: hand straight off to `lower_select_gpr`.
(rule 0 (lower_select_spectre_gpr ty cond if_true if_false)
      (lower_select_gpr ty cond if_true if_false))

;; A `CondResult.And` condition is inverted and the two operands swapped
;; before delegating to `lower_select_gpr`.
(rule 1 (lower_select_spectre_gpr ty cond @ (CondResult.And _ _ _) if_true if_false)
      (lower_select_gpr ty (cond_invert cond) if_false if_true))
3665
3666;; Rules for `fcvt_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3667
3668;; Note that the `cvtsi2s{s,d}` instruction is not just an int-to-float
3669;; conversion instruction in isolation, it also takes the upper 64-bits of an
3670;; xmm register and places it into the destination. We don't actually want that
3671;; to happen as it could accidentally create a false dependency with a
3672;; previous instruction defining the register's upper 64-bits. See #7085 for
3673;; an instance of this.
3674;;
3675;; This means that the first operand to all of the int-to-float conversions here
3676;; are `(xmm_zero)` operands which is a guaranteed zero register that has no
3677;; dependencies on other instructions.
3678;;
3679;; Ideally this would be lifted out to a higher level to get deduplicated
3680;; between consecutive int-to-float operations but that's not easy
3681;; to do at this time. One possibility would be a mid-end rule which rewrites
3682;; `fcvt_from_sint` to an x86-specific opcode using a zero constant which would
3683;; be subject to normal LICM, but that's not feasible today.
3684
;; i8 -> f32: sign-extend to 32 bits, then convert as an `$I32`.
(rule 2 (lower (has_type $F32 (fcvt_from_sint _ src @ (value_type $I8))))
      (x64_cvtsi2ss $I32
                    (xmm_zero $F32X4)
                    (extend_to_gpr src $I32 (ExtendKind.Sign))))

;; i16 -> f32: same approach as i8.
(rule 2 (lower (has_type $F32 (fcvt_from_sint _ src @ (value_type $I16))))
      (x64_cvtsi2ss $I32
                    (xmm_zero $F32X4)
                    (extend_to_gpr src $I32 (ExtendKind.Sign))))

;; Remaining integer types of at most 64 bits convert at their own width.
(rule 1 (lower (has_type $F32
                         (fcvt_from_sint _ src @ (value_type (ty_int (fits_in_64 src_ty))))))
      (x64_cvtsi2ss src_ty (xmm_zero $F32X4) src))
3693
;; i8 -> f64: sign-extend to 32 bits, then convert as an `$I32`.
(rule 2 (lower (has_type $F64 (fcvt_from_sint _ src @ (value_type $I8))))
      (x64_cvtsi2sd $I32
                    (xmm_zero $F64X2)
                    (extend_to_gpr src $I32 (ExtendKind.Sign))))

;; i16 -> f64: same approach as i8.
(rule 2 (lower (has_type $F64 (fcvt_from_sint _ src @ (value_type $I16))))
      (x64_cvtsi2sd $I32
                    (xmm_zero $F64X2)
                    (extend_to_gpr src $I32 (ExtendKind.Sign))))

;; Remaining integer types of at most 64 bits convert at their own width.
(rule 1 (lower (has_type $F64
                         (fcvt_from_sint _ src @ (value_type (ty_int (fits_in_64 src_ty))))))
      (x64_cvtsi2sd src_ty (xmm_zero $F64X2) src))
3702
;; i32x4 input maps directly onto `cvtdq2ps`.
(rule 0 (lower (fcvt_from_sint _ src @ (value_type $I32X4)))
      (x64_cvtdq2ps src))
3705
3706;; Base case: decompose the i64x2 input into two scalar registers and convert
3707;; each of those into a float. Afterwards re-pack the two results into the final
3708;; destination.
;; Each 64-bit lane is moved to a GPR and converted on its own (the upper lane
;; is first shuffled down with `pshufd`), then the two scalar results are
;; recombined with `unpcklpd`.
(rule 0 (lower (fcvt_from_sint _ src @ (value_type $I64X2)))
      (let ((lanes Xmm src)
            (zeroed Xmm (xmm_zero $F64X2))
            (lo_f64 Xmm (x64_cvtsi2sd $I64 zeroed (x64_movq_to_gpr lanes)))
            (hi_f64 Xmm (x64_cvtsi2sd $I64
                                      zeroed
                                      (x64_movq_to_gpr (x64_pshufd lanes 0b11_10_11_10)))))
        (x64_unpcklpd lo_f64 hi_f64)))
3717
;; Converting the sign-widened low half of an i32x4 to f64x2 is a single
;; `cvtdq2pd` on the un-widened input.
(rule 1 (lower (has_type $F64X2
                         (fcvt_from_sint _ (swiden_low _ src @ (value_type $I32X4)))))
      (x64_cvtdq2pd src))
3720
3721;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3722
;; Unsigned integers of at most 32 bits -> f32: zero-extend to 64 bits and use
;; the signed 64-bit conversion.
(rule 1 (lower (has_type $F32
                         (fcvt_from_uint _ src @ (value_type (fits_in_32 (ty_int ty))))))
      (x64_cvtsi2ss $I64
                    (xmm_zero $F32X4)
                    (extend_to_gpr src $I64 (ExtendKind.Zero))))

;; Unsigned integers of at most 32 bits -> f64: same approach.
(rule 1 (lower (has_type $F64
                         (fcvt_from_uint _ src @ (value_type (fits_in_32 (ty_int ty))))))
      (x64_cvtsi2sd $I64
                    (xmm_zero $F64X2)
                    (extend_to_gpr src $I64 (ExtendKind.Zero))))

;; 64-bit unsigned input is handled by the `cvt_u64_to_float_seq` helper.
(rule (lower (has_type out_ty (fcvt_from_uint _ src @ (value_type $I64))))
      (cvt_u64_to_float_seq out_ty src))
3731
3732;; Base case of u64x2 being converted to f64x2. No native instruction for this
3733;; is available so it's emulated through a series of instructions that exploit
3734;; the binary representation of 64-bit floats. This sequence of instructions is
3735;; copied from LLVM and my understanding of the general idea is to roughly:
3736;;
3737;; * For each bullet below operate in parallel on the left and right lanes.
3738;; * Move the low 32 bits of the input into one register and the upper
3739;;   32-bits into a different register, where both have all 0s for the upper
3740;;   32-bits. (e.g. split the 64-bit input into two locations)
3741;; * For the low bits, create `1.<twenty-zeros><low32>p52` via bit tricks.
3742;; * For the high bits, create `1.<twenty-zeros><high32>p84` via bit tricks.
3743;; * Create the constant `1.0p84 + 1.0p52`
;; * Add the two halves and subtract the constant.
3745;;
3746;; Apply some math and this should produce the same result as the native
3747;; conversion.
3748;;
;; As for the bit tricks, a 64-bit float stores its fraction in its low 52
;; bits and its value is, basically:
;;
;;  f = 1.<fraction> * 2^(<exponent> - 1023)
;;
;; where `<fraction>` is the low 52 bits. By placing the 32-bit halves from
;; the original integer into the low 52 bits and setting the exponent right it
3756;; means that each 32-bit half can become part of a 64-bit floating point
3757;; number. The final step in combining via float arithmetic will chop off the
3758;; leading `1.` at the start of the float that we constructed, one for the low
3759;; half and one for the upper half.
;; Each lane is split into its low and high 32 bits. Each half is OR'd into
;; the bit pattern of a float constant (1.0p52 for the low half, 1.0p84 for the
;; high half); the combined constant is subtracted from the high part and the
;; result is added to the low part.
(rule -1 (lower (has_type $F64X2 (fcvt_from_uint _ src @ (value_type $I64X2))))
      (let ((mask_lo32 XmmMem (emit_u128_le_const 0x00000000ffffffff_00000000ffffffff))
            (bits_1p52 XmmMem (emit_u128_le_const 0x4330000000000000_4330000000000000))
            (bits_1p84 XmmMem (emit_u128_le_const 0x4530000000000000_4530000000000000))
            (bits_1p84_1p52 XmmMem (emit_u128_le_const 0x4530000000100000_4530000000100000))
            (lo_half Xmm (x64_pand src mask_lo32))
            (lo_f64 Xmm (x64_por lo_half bits_1p52))
            (hi_half Xmm (x64_psrlq src (xmi_imm 32)))
            (hi_f64 Xmm (x64_por hi_half bits_1p84)))
        (x64_addpd lo_f64
                   (x64_subpd hi_f64 bits_1p84_1p52))))
3770
;; Algorithm uses unpcklps to help create a float that is equivalent to
;; 0x1.0p52 + double(src). 0x1.0p52 is unique because at this exponent
3773;; every value of the mantissa represents a corresponding uint32 number.
3774;; When we subtract 0x1.0p52 we are left with double(src).
;; `unpcklps` interleaves the low source lanes with the constant 0x43300000,
;; then the 64-bit pattern 0x4330000000000000 is subtracted from each lane.
(rule 1 (lower (has_type $F64X2
                         (fcvt_from_uint _ (uwiden_low _ src @ (value_type $I32X4)))))
      (let ((exp_words XmmMem (emit_u128_le_const 0x43300000_43300000))
            (interleaved Xmm (x64_unpcklps src exp_words))
            (bias XmmMem (emit_u128_le_const 0x4330000000000000_4330000000000000)))
        (x64_subpd interleaved bias)))
3780
3781;; When AVX512VL and AVX512F are available,
3782;; `fcvt_from_uint` can be lowered to a single instruction.
;; Single-instruction lowering, gated on both AVX512VL and AVX512F.
(rule 2 (lower (has_type $F32X4 (fcvt_from_uint _ input)))
      (if-let true (has_avx512vl))
      (if-let true (has_avx512f))
      (x64_vcvtudq2ps input))
3787
3788;; Converting packed unsigned integers to packed floats
3789;; requires a few steps. There is no single instruction
;; lowering for converting unsigned integers but there is for
;; converting packed signed integers to float (cvtdq2ps). In
3792;; the steps below we isolate the upper half (16 bits) and
3793;; lower half (16 bits) of each lane and then we convert
3794;; each half separately using cvtdq2ps meant for signed
3795;; integers. In order for this to work for the upper half
3796;; bits we must shift right by 1 (divide by 2) these bits in
3797;; order to ensure the most significant bit is 0 not signed,
3798;; and then after the conversion we double the value.
3799;; Finally we add the converted values where addition will
3800;; correctly round.
3801;;
3802;; Sequence:
3803;; -> A = 0xffffffff
3804;; -> Ah = 0xffff0000
3805;; -> Al = 0x0000ffff
3806;; -> Convert(Al) // Convert int to float
3807;; -> Ah = Ah >> 1 // Shift right 1 to assure Ah conversion isn't treated as signed
3808;; -> Convert(Ah) // Convert .. with no loss of significant digits from previous shift
3809;; -> Ah = Ah + Ah // Double Ah to account for shift right before the conversion.
3810;; -> dst = Ah + Al // Add the two floats together
;; Fallback u32x4 -> f32x4 built on the signed `cvtdq2ps`.
(rule 1 (lower (has_type $F32X4 (fcvt_from_uint _ input)))
      (let ((src Xmm input)

            ;; Isolate the low 16 bits of each lane: shift left, then right.
            (lo_shifted Xmm (x64_pslld src (xmi_imm 16)))
            (lo_bits Xmm (x64_psrld lo_shifted (xmi_imm 16)))

            ;; Whatever remains after removing the low bits is the high part.
            (hi_bits Xmm (x64_psubd src lo_bits))

            ;; The low part converts directly.
            (lo_f32 Xmm (x64_cvtdq2ps lo_bits))

            ;; The high part is halved before the conversion and doubled
            ;; afterwards.
            (hi_halved Xmm (x64_psrld hi_bits (xmi_imm 1)))
            (hi_half_f32 Xmm (x64_cvtdq2ps hi_halved))
            (hi_f32 Xmm (x64_addps hi_half_f32 hi_half_f32)))

        ;; Sum of both converted parts.
        (x64_addps hi_f32 lo_f32)))
3832
3833;; Rules for `fcvt_to_uint` and `fcvt_to_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3834
;; Scalar float -> integer conversions. The trailing boolean passed to the
;; sequence helpers is `true` for the `_sat` variants and `false` otherwise.

(rule (lower (has_type int_ty
                       (fcvt_to_uint _ src @ (value_type (ty_scalar_float _)))))
      (cvt_float_to_uint_seq int_ty src false))

(rule (lower (has_type int_ty
                       (fcvt_to_uint_sat _ src @ (value_type (ty_scalar_float _)))))
      (cvt_float_to_uint_seq int_ty src true))

(rule (lower (has_type int_ty
                       (fcvt_to_sint _ src @ (value_type (ty_scalar_float _)))))
      (cvt_float_to_sint_seq int_ty src false))

(rule (lower (has_type int_ty
                       (fcvt_to_sint_sat _ src @ (value_type (ty_scalar_float _)))))
      (cvt_float_to_sint_seq int_ty src true))
3846
3847;; The x64 backend currently only supports these two type combinations.
;; Saturating f32x4 -> i32x4.
(rule 1 (lower (has_type $I32X4 (fcvt_to_sint_sat _ input @ (value_type $F32X4))))
      (let ((src Xmm input)

            ;; A lane compares equal to itself unless it is NaN, so the mask
            ;; is zero for NaN lanes; AND-ing clears those lanes.
            (not_nan Xmm (x64_cmpps src src (FcmpImm.Equal)))
            (cleaned Xmm (x64_andps src not_nan))

            ;; The top bit of this value is set for positive inputs; it is
            ;; combined with the conversion result below.
            (sign_probe Xmm (x64_pxor not_nan cleaned))

            ;; Truncating packed float -> doubleword conversion.
            (converted Xmm (x64_cvttps2dq cleaned))

            ;; Keep the top bit only where needed, then smear it across the
            ;; whole lane with an arithmetic shift.
            (flagged Xmm (x64_pand converted sign_probe))
            (fixup Xmm (x64_psrad flagged (xmi_imm 31))))

        ;; The conversion yields 0x80000000 on overflow. The XOR turns
        ;; positive-overflow lanes into 0x7FFFFFFF and leaves negative-overflow
        ;; lanes untouched.
        (x64_pxor fixup converted)))
3870
3871;; The algorithm for converting floats to unsigned ints is a little tricky. The
;; complication arises because we are converting from a signed 32-bit int with a positive
3873;; integer range from 1..INT_MAX (0x1..0x7FFFFFFF) to an unsigned integer with an extended
3874;; range from (INT_MAX+1)..UINT_MAX. It's this range from (INT_MAX+1)..UINT_MAX
3875;; (0x80000000..0xFFFFFFFF) that needs to be accounted for as a special case since our
3876;; conversion instruction (cvttps2dq) only converts as high as INT_MAX (0x7FFFFFFF), but
3877;; which conveniently setting underflows and overflows (smaller than MIN_INT or larger than
;; MAX_INT) to be INT_MAX+1 (0x80000000). Noting that the range (INT_MAX+1)..UINT_MAX includes
3879;; precisely INT_MAX values we can correctly account for and convert every value in this range
3880;; if we simply subtract INT_MAX+1 before doing the cvttps2dq conversion. After the subtraction
3881;; every value originally (INT_MAX+1)..UINT_MAX is now the range (0..INT_MAX).
3882;; After the conversion we add INT_MAX+1 back to this converted value, noting again that
3883;; values we are trying to account for were already set to INT_MAX+1 during the original conversion.
3884;; We simply have to create a mask and make sure we are adding together only the lanes that need
3885;; to be accounted for. Digesting it all the steps then are:
3886;;
3887;; Step 1 - Account for NaN and negative floats by setting these src values to zero.
3888;; Step 2 - Make a copy (tmp1) of the src value since we need to convert twice for
3889;;          reasons described above.
3890;; Step 3 - Convert the original src values. This will convert properly all floats up to INT_MAX
3891;; Step 4 - Subtract INT_MAX from the copy set (tmp1). Note, all zero and negative values are those
3892;;          values that were originally in the range (0..INT_MAX). This will come in handy during
3893;;          step 7 when we zero negative lanes.
3894;; Step 5 - Create a bit mask for tmp1 that will correspond to all lanes originally less than
3895;;          UINT_MAX that are now less than INT_MAX thanks to the subtraction.
3896;; Step 6 - Convert the second set of values (tmp1)
3897;; Step 7 - Prep the converted second set by zeroing out negative lanes (these have already been
3898;;          converted correctly with the first set) and by setting overflow lanes to 0x7FFFFFFF
3899;;          as this will allow us to properly saturate overflow lanes when adding to 0x80000000
3900;; Step 8 - Add the original converted src and the converted tmp1 where float values originally less
3901;;          than and equal to INT_MAX will be unchanged, float values originally between INT_MAX+1 and
3902;;          UINT_MAX will add together (INT_MAX) + (SRC - INT_MAX), and float values originally
;;          greater than UINT_MAX will be saturated to UINT_MAX (0xFFFFFFFF) after adding (0x80000000 + 0x7FFFFFFF).
3904;;
3905;;
3906;; The table below illustrates the result after each step where it matters for the converted set.
3907;; Note the original value range (original src set) is the final dst in Step 8:
3908;;
3909;; Original src set:
3910;; | Original Value Range |    Step 1    |         Step 3         |          Step 8           |
3911;; |  -FLT_MIN..FLT_MAX   | 0.0..FLT_MAX | 0..INT_MAX(w/overflow) | 0..UINT_MAX(w/saturation) |
3912;;
3913;; Copied src set (tmp1):
3914;; |    Step 2    |                  Step 4                  |
3915;; | 0.0..FLT_MAX | (0.0-(INT_MAX+1))..(FLT_MAX-(INT_MAX+1)) |
3916;;
3917;; |                       Step 6                        |                 Step 7                 |
3918;; | (0-(INT_MAX+1))..(UINT_MAX-(INT_MAX+1))(w/overflow) | ((INT_MAX+1)-(INT_MAX+1))..(INT_MAX+1) |
;; Saturating f32x4 -> u32x4, built from two signed conversions.
(rule 1 (lower (has_type $I32X4 (fcvt_to_uint_sat _ input @ (value_type $F32X4))))
      (let ((src Xmm input)

            ;; Negative and NaN lanes are replaced by zero up front, since the
            ;; destination is unsigned.
            (zero_f32 Xmm (xmm_zero $F32X4))
            (clamped Xmm (x64_maxps src zero_f32))

            ;; Materialize INT_MAX+1 as a float: all-ones shifted right by one
            ;; is INT_MAX (0x7FFFFFFF). f32 only represents contiguous integers
            ;; up to 2^24, so the int-to-float conversion rounds INT_MAX to
            ;; 0x4f000000, i.e. INT_MAX+1.
            (all_ones Xmm (x64_pcmpeqd zero_f32 zero_f32))
            (int_max Xmm (x64_psrld all_ones (xmi_imm 1)))
            (int_max_p1 Xmm (x64_cvtdq2ps int_max))

            ;; First conversion, on the clamped input. Lanes above the signed
            ;; maximum come out as 0x80000000; negative and NaN lanes are 0.
            (first_conv Xmm (x64_cvttps2dq clamped))

            ;; Rebase the clamped input: src - (INT_MAX+1).
            (rebased Xmm (x64_subps clamped int_max_p1))

            ;; Mask of lanes that must saturate, i.e. lanes still at or above
            ;; INT_MAX+1 after rebasing.
            (sat_mask Xmm (x64_cmpps int_max_p1 rebased (FcmpImm.LessThanOrEqual)))

            ;; Second conversion, on the rebased lanes.
            (second_conv Xmm (x64_cvttps2dq rebased))

            ;; Positive-overflow lanes become 0x7FFFFFFF via the mask, and
            ;; negative lanes are zeroed by the signed max against zero.
            (second_fixed Xmm (x64_pxor second_conv sat_mask))
            (zero_i32 Xmm (xmm_zero $I32X4))
            (second_nonneg Xmm (lower_vec_smax $I32X4 second_fixed zero_i32)))

        ;; Summing both conversions handles values above the signed maximum.
        (x64_paddd second_nonneg first_conv)))
3963
3964;; Rules for `x86_cvtt2dq` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3965
;; Direct mapping onto `cvttps2dq` for f32x4 -> i32x4.
(rule (lower (has_type $I32X4 (x86_cvtt2dq _ src @ (value_type $F32X4))))
      (x64_cvttps2dq src))
3968
3969;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3970
;; i8x16 pairwise add: collect even and odd byte lanes separately, then add.
(rule (lower (has_type $I8X16 (iadd_pairwise _ lhs rhs)))
      (let (;; Even lanes: mask away the upper byte of every 16-bit pair and
            ;; pack both inputs into one vector.
            (lo_byte_mask Xmm
              (x64_movdqu_load
                (emit_u128_le_const 0x00ff_00ff_00ff_00ff_00ff_00ff_00ff_00ff)))
            (lhs_even Xmm (x64_pand lhs lo_byte_mask))
            (rhs_even Xmm (x64_pand rhs lo_byte_mask))
            (even_lanes Xmm (x64_packuswb lhs_even rhs_even))

            ;; Odd lanes: shift each 16-bit pair right by 8, then pack the
            ;; same way.
            (lhs_odd Xmm (x64_psrlw lhs (xmi_imm 8)))
            (rhs_odd Xmm (x64_psrlw rhs (xmi_imm 8)))
            (odd_lanes Xmm (x64_packuswb lhs_odd rhs_odd)))
        (x64_paddb even_lanes odd_lanes)))
3985
3986
;; With SSSE3, i16x8 pairwise add is a single `phaddw`.
(rule 1 (lower (has_type $I16X8 (iadd_pairwise _ lhs rhs)))
        (if-let true (has_ssse3))
        (x64_phaddw lhs rhs))
3990
;; i16x8 pairwise add without SSSE3.
(rule (lower (has_type $I16X8 (iadd_pairwise _ lhs rhs)))
      (let ((a Xmm lhs)
            (b Xmm rhs)

            ;; Even lanes: shuffle 16-bit lanes within each half, then shuffle
            ;; 32-bit lanes, so the even lanes land in the low 64 bits of each
            ;; vector. The low 64 bits of both are then joined together.
            ;;
            ;; Shuffle immediate: 0xe8 == 0b11_10_10_00
            (a_even Xmm (x64_pshufd (x64_pshufhw (x64_pshuflw a 0xe8) 0xe8) 0xe8))
            (b_even Xmm (x64_pshufd (x64_pshufhw (x64_pshuflw b 0xe8) 0xe8) 0xe8))
            (even_lanes Xmm (x64_punpcklqdq a_even b_even))

            ;; Odd lanes: an arithmetic right shift of each 32-bit lane by 16
            ;; drops the odd lane into the even position and fills the upper
            ;; half with its sign. `packssdw` then gathers both inputs into
            ;; one vector.
            (a_odd Xmm (x64_psrad a (xmi_imm 16)))
            (b_odd Xmm (x64_psrad b (xmi_imm 16)))
            (odd_lanes Xmm (x64_packssdw a_odd b_odd)))
        (x64_paddw even_lanes odd_lanes)))
4016
;; With SSSE3, i32x4 pairwise add is a single `phaddd`.
(rule 1 (lower (has_type $I32X4 (iadd_pairwise _ lhs rhs)))
        (if-let true (has_ssse3))
        (x64_phaddd lhs rhs))
4020
;; i32x4 pairwise add without SSSE3: two `shufps` gather the even and odd
;; lanes of both inputs, which are then added.
(rule (lower (has_type $I32X4 (iadd_pairwise _ lhs rhs)))
      (let ((a Xmm lhs)
            (b Xmm rhs)
            ;; even_lanes = [ a[0] a[2] b[0] b[2] ]
            (even_lanes Xmm (x64_shufps a b 0b10_00_10_00))
            ;; odd_lanes  = [ a[1] a[3] b[1] b[3] ]
            (odd_lanes Xmm (x64_shufps a b 0b11_01_11_01)))
        (x64_paddd even_lanes odd_lanes)))
4031
4032;; special case for the `i16x8.extadd_pairwise_i8x16_s` wasm instruction
;; Pairwise add of the sign-widened low and high halves of one i8x16 value:
;; with SSSE3 this is `pmaddubsw` with an all-ones byte vector as the first
;; operand and the input as the second.
(rule 2 (lower (has_type $I16X8
                         (iadd_pairwise _
                                        (swiden_low _ src @ (value_type $I8X16))
                                        (swiden_high _ src))))
      (if-let true (has_ssse3))
      (let ((ones Xmm (x64_xmm_load_const
                        $I8X16
                        (emit_u128_le_const 0x01010101010101010101010101010101))))
        (x64_pmaddubsw ones src)))
4041
4042;; special case for the `i32x4.extadd_pairwise_i16x8_s` wasm instruction
;; Pairwise add of the sign-widened low and high halves of one i16x8 value:
;; `pmaddwd` against a vector of 16-bit ones.
(rule 2 (lower (has_type $I32X4
                         (iadd_pairwise _
                                        (swiden_low _ src @ (value_type $I16X8))
                                        (swiden_high _ src))))
      (let ((ones XmmMem
              (emit_u128_le_const 0x0001_0001_0001_0001_0001_0001_0001_0001)))
        (x64_pmaddwd src ones)))
4049
4050;; special case for the `i16x8.extadd_pairwise_i8x16_u` wasm instruction
;; Pairwise add of the zero-widened low and high halves of one i8x16 value:
;; with SSSE3 this is `pmaddubsw` with the input as the first operand and an
;; all-ones byte vector as the second.
(rule 2 (lower (has_type $I16X8
                         (iadd_pairwise _
                                        (uwiden_low _ src @ (value_type $I8X16))
                                        (uwiden_high _ src))))
      (if-let true (has_ssse3))
      (let ((ones XmmMem
              (emit_u128_le_const 0x01010101010101010101010101010101)))
        (x64_pmaddubsw src ones)))
4058
4059;; special case for the `i32x4.extadd_pairwise_i16x8_u` wasm instruction
;; Pairwise add of the zero-widened low and high halves of one i16x8 value.
;; The input is XOR'd with 0x8000 per 16-bit lane, run through `pmaddwd`
;; against 16-bit ones, and 0x00010000 is then added to every 32-bit lane.
(rule 2 (lower (has_type $I32X4
                         (iadd_pairwise _
                                        (uwiden_low _ src @ (value_type $I16X8))
                                        (uwiden_high _ src))))
      (let ((flip_bits XmmMem
              (emit_u128_le_const 0x8000_8000_8000_8000_8000_8000_8000_8000))
            (flipped Xmm (x64_pxor src flip_bits))

            (ones XmmMem
              (emit_u128_le_const 0x0001_0001_0001_0001_0001_0001_0001_0001))
            (summed Xmm (x64_pmaddwd flipped ones))

            (correction XmmMem
              (emit_u128_le_const 0x00010000_00010000_00010000_00010000)))
        (x64_paddd summed correction)))
4072
4073;; special case for the `i32x4.dot_i16x8_s` wasm instruction
;; Pairwise add of the products of the sign-widened low halves and of the
;; sign-widened high halves of two inputs collapses to one `pmaddwd`.
(rule 2 (lower (has_type $I32X4
                         (iadd_pairwise _
                                        (imul _ (swiden_low _ lhs) (swiden_low _ rhs))
                                        (imul _ (swiden_high _ lhs) (swiden_high _ rhs)))))
      (x64_pmaddwd lhs rhs))
4079
4080;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4081
4082;; With SSE4.1 use the `pmovsx*` instructions for this
;; i8x16 -> i16x8
(rule 1 (lower (has_type $I16X8 (swiden_low _ src @ (value_type $I8X16))))
        (if-let true (has_sse41))
        (x64_pmovsxbw src))

;; i16x8 -> i32x4
(rule 1 (lower (has_type $I32X4 (swiden_low _ src @ (value_type $I16X8))))
        (if-let true (has_sse41))
        (x64_pmovsxwd src))

;; i32x4 -> i64x2
(rule 1 (lower (has_type $I64X2 (swiden_low _ src @ (value_type $I32X4))))
        (if-let true (has_sse41))
        (x64_pmovsxdq src))
4092
;; Fallback: dispatch on the result type through `lower_swiden_low`.
(rule (lower (has_type ty (swiden_low _ src)))
      (lower_swiden_low ty src))

(decl lower_swiden_low (Type Xmm) Xmm)

;; For the two narrower cases, unpacking the value with itself places every
;; low lane twice, side by side; an arithmetic right shift of the doubled-width
;; lane by the original lane width then leaves the sign-extended value.
(rule (lower_swiden_low $I16X8 src)
      (x64_psraw (x64_punpcklbw src src)
                 (xmi_imm 8)))
(rule (lower_swiden_low $I32X4 src)
      (x64_psrad (x64_punpcklwd src src)
                 (xmi_imm 16)))

;; For i64x2 the upper halves come from a `0 > src` comparison, i.e.
;; `src < 0`; they are interleaved with the low 32-bit lanes of the input to
;; build the 64-bit results.
(rule (lower_swiden_low $I64X2 src)
      (let ((sign_words Xmm (x64_pcmpgtd_a (xmm_zero $I32X4) src)))
        (x64_punpckldq src sign_words)))
4111
4112;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4113
;; SSE4.1 path: first bring the upper lanes down to the bottom of the register
;; and then sign-extend with the same `pmovsx*` instructions that `swiden_low`
;; uses. The 8- and 16-bit forms use `palignr` and so also require SSSE3.
(rule 1 (lower (has_type $I16X8 (swiden_high _ src @ (value_type $I8X16))))
        (if-let true (has_sse41))
        (if-let true (has_ssse3))
        (let ((v Xmm src)
              (upper Xmm (x64_palignr v v 8)))
          (x64_pmovsxbw upper)))

(rule 1 (lower (has_type $I32X4 (swiden_high _ src @ (value_type $I16X8))))
        (if-let true (has_sse41))
        (if-let true (has_ssse3))
        (let ((v Xmm src)
              (upper Xmm (x64_palignr v v 8)))
          (x64_pmovsxwd upper)))

(rule 1 (lower (has_type $I64X2 (swiden_high _ src @ (value_type $I32X4))))
        (if-let true (has_sse41))
        (let ((upper Xmm (x64_pshufd src 0b11_10_11_10)))
          (x64_pmovsxdq upper)))
4129
;; Default-priority rules, mirroring the `swiden_low` helper but using the
;; `punpckh*` instructions so that the high lanes are the ones duplicated
;; before the arithmetic right shift.
(rule (lower (has_type $I16X8 (swiden_high _ src @ (value_type $I8X16))))
      (let ((v Xmm src)
            (doubled Xmm (x64_punpckhbw v v)))
        (x64_psraw doubled (xmi_imm 8))))

(rule (lower (has_type $I32X4 (swiden_high _ src @ (value_type $I16X8))))
      (let ((v Xmm src)
            (doubled Xmm (x64_punpckhwd v v)))
        (x64_psrad doubled (xmi_imm 16))))

;; 64-bit results: shuffle the high lanes down first, then build the sign
;; halves with a `0 > x` comparison and interleave, as for `swiden_low`.
(rule (lower (has_type $I64X2 (swiden_high _ src @ (value_type $I32X4))))
      (let ((upper Xmm (x64_pshufd src 0b00_00_11_10))
            (sign_mask Xmm (x64_pcmpgtd_a (xmm_zero $I32X4) upper)))
        (x64_punpckldq upper sign_mask)))
4144
4145;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4146
;; When SSE4.1 is available, each lane width has its own `pmovzx*` rule.
(rule 1 (lower (has_type $I16X8 (uwiden_low _ src @ (value_type $I8X16))))
        (if-let true (has_sse41))
        (x64_pmovzxbw src))

(rule 1 (lower (has_type $I32X4 (uwiden_low _ src @ (value_type $I16X8))))
        (if-let true (has_sse41))
        (x64_pmovzxwd src))

(rule 1 (lower (has_type $I64X2 (uwiden_low _ src @ (value_type $I32X4))))
        (if-let true (has_sse41))
        (x64_pmovzxdq src))

;; Default-priority fallback: pass the result type and the operand on to the
;; `lower_uwiden_low` helper.
(rule (lower (has_type out_ty (uwiden_low _ src)))
      (lower_uwiden_low out_ty src))
4159
;; Helper for `uwiden_low`, keyed on the result type. Each rule interleaves the
;; low lanes of the operand with an all-zero register, which yields the
;; zero-extended lanes.
(decl lower_uwiden_low (Type Xmm) Xmm)
(rule (lower_uwiden_low $I16X8 src)
      (x64_punpcklbw src (xmm_zero $I8X16)))
(rule (lower_uwiden_low $I32X4 src)
      (x64_punpcklwd src (xmm_zero $I8X16)))
(rule (lower_uwiden_low $I64X2 src)
      (x64_unpcklps src (xmm_zero $F32X4)))
4166
4167;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4168
;; Same as `uwiden_low`, but interleaving high lanes instead.
4170;;
4171;; Note that according to `llvm-mca` at least these instructions are faster
4172;; than using `pmovzx*` in terms of cycles, even if SSE4.1 is available.
;; One rule per lane width; each interleaves the high lanes of the operand
;; with an all-zero register.
(rule (lower (has_type $I16X8 (uwiden_high _ src @ (value_type $I8X16))))
      (x64_punpckhbw src (xmm_zero $I8X16)))

(rule (lower (has_type $I32X4 (uwiden_high _ src @ (value_type $I16X8))))
      (x64_punpckhwd src (xmm_zero $I8X16)))

(rule (lower (has_type $I64X2 (uwiden_high _ src @ (value_type $I32X4))))
      (x64_unpckhps src (xmm_zero $F32X4)))
4179
4180;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4181
;; 16-bit lanes narrow to 8-bit lanes with `packsswb`.
(rule (lower (has_type $I8X16 (snarrow _ lo @ (value_type $I16X8) hi)))
      (x64_packsswb lo hi))

;; 32-bit lanes narrow to 16-bit lanes with `packssdw`.
(rule (lower (has_type $I16X8 (snarrow _ lo @ (value_type $I32X4) hi)))
      (x64_packssdw lo hi))
4187
4188;; We're missing a `snarrow` case for $I64X2
4189;; https://github.com/bytecodealliance/wasmtime/issues/4734
4190
;; Special case for the translation of the wasm op
;; `i32x4.trunc_sat_f64x2_s_zero`: an `snarrow` whose first operand is an
;; `fcvt_to_sint_sat` producing `I64X2` and whose second operand is an
;; all-zero `vconst`. It can be removed once `snarrow` is implemented for
;; `I64X2`.
;;
;; The emitted sequence is, in order:
;;
;;   cmppd (equal)  input, input
;;   andps          <comparison result>, [f64x2 splat of 2147483647.0]
;;   minpd          input, <masked splat>
;;   cvttpd2dq      <minimum>
(rule (lower (has_type $I32X4 (snarrow _ (has_type $I64X2 (fcvt_to_sint_sat _ src))
                                       (vconst _ (u128_from_constant 0)))))
      (let (
          (input Xmm src)

          ;; Compare the input with itself for equality.
          (self_eq Xmm (x64_cmppd input input (FcmpImm.Equal)))

          ;; 2147483647.0 has the bit pattern 0x41DFFFFFFFC00000; this is that
          ;; value in both 64-bit lanes.
          (i32_max_splat XmmMem (emit_u128_le_const 0x41DFFFFFFFC00000_41DFFFFFFFC00000))

          ;; Mask the splat with the comparison result, then take the minimum
          ;; of the input and the masked splat.
          (limit Xmm (x64_andps self_eq i32_max_splat))
          (clamped Xmm (x64_minpd input limit))
        )
        (x64_cvttpd2dq clamped)))
4215
;; Special case for the translation of the wasm op
;; `i32x4.relaxed_trunc_f64x2_s_zero`: an `snarrow` of an `x86_cvtt2dq`
;; producing `I64X2` with an all-zero `vconst` is a single `cvttpd2dq`.
(rule (lower
        (has_type $I32X4
                  (snarrow _
                    (has_type $I64X2 (x86_cvtt2dq _ src))
                    (vconst _ (u128_from_constant 0)))))
      (x64_cvttpd2dq src))
4221
4222;; Rules for `unarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4223
;; 16-bit lanes narrow to 8-bit lanes with `packuswb`.
(rule (lower (has_type $I8X16 (unarrow _ lo @ (value_type $I16X8) hi)))
      (x64_packuswb lo hi))

;; 32-bit lanes narrow to 16-bit lanes with `packusdw` when SSE4.1 is
;; available.
(rule 1 (lower (has_type $I16X8 (unarrow _ lo @ (value_type $I32X4) hi)))
        (if-let true (has_sse41))
        (x64_packusdw lo hi))
4230
;; Default-priority `unarrow` from `I32X4` to `I16X8`. Each operand is passed
;; through `unarrow_i32x4_lanes_to_low_u16_lanes`, which compresses its four
;; 32-bit lanes into four 16-bit lanes in the low 64 bits of the vector. The
;; two results are then combined with `punpcklqdq`.
;;
;; If this is performance sensitive then it's probably best to upgrade the CPU
;; to get the single-instruction `packusdw` lowering instead.
(rule (lower (has_type $I16X8 (unarrow _ lo @ (value_type $I32X4) hi)))
      (let ((lo_packed Xmm (unarrow_i32x4_lanes_to_low_u16_lanes lo))
            (hi_packed Xmm (unarrow_i32x4_lanes_to_low_u16_lanes hi)))
        (x64_punpcklqdq lo_packed hi_packed)))
4243
;; Clamps every 32-bit lane of the operand to the range `0..=0xffff` and then
;; gathers the low 16 bits of each lane into the low four 16-bit lanes of the
;; result.
(decl unarrow_i32x4_lanes_to_low_u16_lanes (Xmm) Xmm)
(rule (unarrow_i32x4_lanes_to_low_u16_lanes src)
      (let (
          ;; Step 1: zero every lane that is not greater than zero.
          (positive Xmm (x64_pcmpgtd_a_or_avx src (xmm_zero $I32X4)))
          (nonneg Xmm (x64_pand src positive))

          ;; Step 2: replace every lane that is not below 0xffff with 0xffff.
          ;; `in_range` is all-ones where `0xffff > lane`; those lanes are
          ;; kept, the others take the constant.
          (u16_max Xmm (x64_movdqu_load (emit_u128_le_const 0x0000ffff_0000ffff_0000ffff_0000ffff)))
          (in_range Xmm (x64_pcmpgtd_a_or_avx u16_max nonneg))
          (kept Xmm (x64_pand nonneg in_range))
          (saturated Xmm (x64_pandn in_range u16_max))
          (clamped Xmm (x64_por kept saturated))

          ;; Step 3: in each 64-bit half, move 16-bit words 0 and 2 to the
          ;; bottom of that half (`pshuflw` for the low half, `pshufhw` for
          ;; the high half).
          (low_half_packed Xmm (x64_pshuflw clamped 0b00_00_10_00))
          (both_halves_packed Xmm (x64_pshufhw low_half_packed 0b00_00_10_00))
        )
        ;; Step 4: move 32-bit lanes 0 and 2 to the bottom of the vector,
        ;; leaving the four 16-bit results in the low four lanes.
        (x64_pshufd both_halves_packed 0b00_00_10_00)))
4267
4268
4269;; We're missing a `unarrow` case for $I64X2
4270;; https://github.com/bytecodealliance/wasmtime/issues/4734
4271
4272;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4273
;; XMM-class source to a GPR-class result of at most 64 bits.
(rule -3 (lower (has_type (is_gpr_type (fits_in_64 ty))
                          (bitcast _ _ from @ (value_type (is_xmm_type _)))))
      (bitcast_xmm_to_gpr (ty_bits ty) from))

;; GPR-class source to an XMM-class result of at most 64 bits.
(rule -2 (lower (has_type (is_xmm_type (fits_in_64 ty))
                          (bitcast _ _ from @ (value_type (is_gpr_type _)))))
      (bitcast_gpr_to_xmm (ty_bits ty) from))

;; XMM-class source to an `i128` result.
(rule -1 (lower (has_type $I128
                          (bitcast _ _ from @ (value_type (is_xmm_type _)))))
      (bitcast_xmm_to_gprs from))

;; `i128` source to an XMM-class result.
(rule 0 (lower (has_type (is_xmm_type _)
                         (bitcast _ _ from @ (value_type $I128))))
      (bitcast_gprs_to_xmm from))

;; GPR-class to GPR-class: the operand is returned unchanged.
(rule 1 (lower (has_type (is_gpr_type _)
                         (bitcast _ _ from @ (value_type (is_gpr_type _)))))
      from)

;; XMM-class to XMM-class: the operand is returned unchanged.
(rule 3 (lower (has_type (is_xmm_type _)
                         (bitcast _ _ from @ (value_type (is_xmm_type _)))))
      from)
4295
4296;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4297
;; `fcopysign` for `f32`: clear the sign bit of the first operand, isolate the
;; sign bit of the second operand, and OR the two together.
(rule (lower (has_type $F32 (fcopysign _ magnitude @ (value_type $F32) sign_src)))
      (let ((sign_mask Xmm (imm $F32 0x80000000))
            ;; Both operands are forced into registers so that a 128-bit load
            ;; is not sunk into the bitwise instructions below.
            (mag_reg Xmm magnitude)
            (sign_reg Xmm sign_src)
            (mag_bits Xmm (x64_andnps sign_mask mag_reg))
            (sign_bit Xmm (x64_andps sign_mask sign_reg)))
        (x64_orps mag_bits sign_bit)))

;; `fcopysign` for `f64`: same shape with the 64-bit sign mask and the `*pd`
;; instruction forms.
(rule (lower (has_type $F64 (fcopysign _ magnitude @ (value_type $F64) sign_src)))
      (let ((sign_mask Xmm (imm $F64 0x8000000000000000))
            ;; Forced into registers for the same reason as the `f32` rule.
            (mag_reg Xmm magnitude)
            (sign_reg Xmm sign_src)
            (mag_bits Xmm (x64_andnpd sign_mask mag_reg))
            (sign_bit Xmm (x64_andpd sign_mask sign_reg)))
        (x64_orpd mag_bits sign_bit)))
4313
4314;; Helper for the `ceil`/`floor`/`nearest`/`trunc` instructions ;;;;;;;;;;;;;;;;
4315
4316;; Emits either a `round{ss,sd,ps,pd}` instruction, as appropriate, or generates
4317;; the appropriate libcall and sequence to call that.
4318;;
4319;; Recursion: at most once to convert memory case into register case.
(decl rec x64_round (Type RegMem RoundImm) Xmm)

;; With SSE4.1 every supported type has a `round*` rule of its own.
(rule 1 (x64_round $F32 src mode)
        (if-let true (has_sse41))
        (x64_roundss src mode))
(rule 1 (x64_round $F64 src mode)
        (if-let true (has_sse41))
        (x64_roundsd src mode))
(rule 1 (x64_round $F32X4 src mode)
        (if-let true (has_sse41))
        (x64_roundps src mode))
(rule 1 (x64_round $F64X2 src mode)
        (if-let true (has_sse41))
        (x64_roundpd src mode))

;; Default-priority scalar rules: a register operand is passed straight to the
;; libcall selected by `round_libcall`.
(rule (x64_round $F32 (RegMem.Reg src) mode)
      (libcall_1 (round_libcall $F32 mode) src))
(rule (x64_round $F64 (RegMem.Reg src) mode)
      (libcall_1 (round_libcall $F64 mode) src))

;; Default-priority `f32x4` rule: invoke the scalar `f32` libcall once per
;; lane. The first call takes the register as-is; for each later lane a
;; `pshufd` of the source is passed to the libcall and the result is inserted
;; into the accumulated vector at that lane index.
(rule (x64_round $F32X4 (RegMem.Reg src) mode)
      (let (
          (scalar_round LibCall (round_libcall $F32 mode))
          (acc0 Xmm (libcall_1 scalar_round src))
          (lane1 Xmm (libcall_1 scalar_round (x64_pshufd src 1)))
          (acc1 Xmm (f32x4_insertlane acc0 lane1 1))
          (lane2 Xmm (libcall_1 scalar_round (x64_pshufd src 2)))
          (acc2 Xmm (f32x4_insertlane acc1 lane2 2))
          (lane3 Xmm (libcall_1 scalar_round (x64_pshufd src 3)))
          (acc3 Xmm (f32x4_insertlane acc2 lane3 3))
        )
        acc3))

;; Default-priority `f64x2` rule: two scalar `f64` libcalls, the second on a
;; `pshufd` of the source, with the results combined by `movlhps`.
(rule (x64_round $F64X2 (RegMem.Reg src) mode)
      (let (
          (scalar_round LibCall (round_libcall $F64 mode))
          (lane0 Xmm (libcall_1 scalar_round src))
          (lane1 Xmm (libcall_1 scalar_round (x64_pshufd src 0b00_00_11_10)))
        )
        (x64_movlhps lane0 lane1)))

;; Memory operand: load it into a register and re-enter `x64_round` with the
;; register form.
(rule (x64_round ty (RegMem.Mem addr) mode)
      (x64_round ty (RegMem.Reg (x64_load_xmm ty addr)) mode))
4357
;; Maps a scalar float type and a rounding mode to the corresponding libcall.
(decl round_libcall (Type RoundImm) LibCall)

;; `f32` variants.
(rule (round_libcall $F32 (RoundImm.RoundUp))      (LibCall.CeilF32))
(rule (round_libcall $F32 (RoundImm.RoundDown))    (LibCall.FloorF32))
(rule (round_libcall $F32 (RoundImm.RoundNearest)) (LibCall.NearestF32))
(rule (round_libcall $F32 (RoundImm.RoundZero))    (LibCall.TruncF32))

;; `f64` variants.
(rule (round_libcall $F64 (RoundImm.RoundUp))      (LibCall.CeilF64))
(rule (round_libcall $F64 (RoundImm.RoundDown))    (LibCall.FloorF64))
(rule (round_libcall $F64 (RoundImm.RoundNearest)) (LibCall.NearestF64))
(rule (round_libcall $F64 (RoundImm.RoundZero))    (LibCall.TruncF64))
4367
4368;; Rules for `ceil` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4369
;; `ceil` is `x64_round` with the round-up mode.
(rule (lower (ceil _ src @ (value_type ty)))
      (x64_round ty src (RoundImm.RoundUp)))

;; Rules for `floor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `floor` is `x64_round` with the round-down mode.
(rule (lower (floor _ src @ (value_type ty)))
      (x64_round ty src (RoundImm.RoundDown)))

;; Rules for `nearest` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `nearest` is `x64_round` with the round-to-nearest mode.
(rule (lower (nearest _ src @ (value_type ty)))
      (x64_round ty src (RoundImm.RoundNearest)))

;; Rules for `trunc` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `trunc` is `x64_round` with the round-toward-zero mode.
(rule (lower (trunc _ src @ (value_type ty)))
      (x64_round ty src (RoundImm.RoundZero)))
4387
4388;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4389
;; Forwards the stack slot and offset unchanged to `stack_addr_impl`.
(rule (lower (stack_addr _ slot off))
      (stack_addr_impl slot off))
4392
4393;; Rules for `udiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4394
4395;; NB: a `RegMem` divisor, while allowed in the instruction encoding, isn't
4396;; used right now to prevent a possibly-trapping load getting folded into the
4397;; `div` instruction. Ideally non-trapping loads would get folded, however, or
4398;; alternatively Wasmtime/Cranelift would grow support for multiple traps on
4399;; a single opcode and the signal kind would differentiate at runtime.
4400
;; 8-bit division gets a rule of its own because the instruction being
;; crafted has a different shape from the wider forms: the dividend is a
;; single zero-extended register rather than a pair.
(rule 2 (lower (udiv _ dividend @ (value_type $I8) divisor))
        (let ((wide_dividend Gpr (extend_to_gpr dividend $I32 (ExtendKind.Zero)))
              (divisor_reg Gpr (put_in_gpr divisor)))
          (x64_divb_m wide_dividend
                      divisor_reg
                      (TrapCode.INTEGER_DIVISION_BY_ZERO))))
4408
4409;; 16-to-64-bit division is all done with a similar instruction and the only
4410;; tricky requirement here is that when div traps are disallowed the divisor
4411;; must not be zero.
;; The upper half of the dividend is a zero constant; the quotient is element
;; 0 of the `x64_div` result.
(rule 1 (lower (udiv _ dividend @ (value_type (fits_in_64 ty)) divisor))
  (value_regs_get
    (x64_div ty
             dividend
             (imm $I64 0)
             (put_in_gpr divisor)
             (TrapCode.INTEGER_DIVISION_BY_ZERO))
    0))
4416
4417;; Rules for `sdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4418
;; 8-bit signed division: the dividend goes through `x64_cbtw_zo` and the
;; divisor through `nonzero_sdiv_divisor` before the `idivb`.
(rule 2 (lower (sdiv _ dividend @ (value_type $I8) divisor))
        (x64_idivb_m (x64_cbtw_zo dividend)
                     (nonzero_sdiv_divisor $I8 divisor)
                     (TrapCode.INTEGER_OVERFLOW)))

;; Wider signed division: the upper half of the dividend comes from
;; `repeat_sign_bit`, and the quotient is element 0 of the `x64_idiv` result.
(rule 1 (lower (sdiv _ dividend @ (value_type (fits_in_64 ty)) divisor))
  (let ((lo Gpr dividend))
    (value_regs_get
      (x64_idiv ty
                lo
                (repeat_sign_bit ty lo)
                (nonzero_sdiv_divisor ty divisor)
                (TrapCode.INTEGER_OVERFLOW))
      0)))
4429
4430;; Repeats the sign bit in the provided gpr, which will register-allocate to
4431;; %rax, into a destination gpr which will register-allocate to %rdx.
4432;;
;; This is intended to be used before x64 `div` instructions where the
;; left-hand-side (the dividend) is double-wide and present across the
;; rax/rdx registers (sized to the operation in question).
(decl repeat_sign_bit (Type Gpr) Gpr)

;; One rule per operand width, each deferring to the matching `x64_c*_zo`
;; helper.
(rule (repeat_sign_bit $I16 lo) (x64_cwtd_zo lo))
(rule (repeat_sign_bit $I32 lo) (x64_cltd_zo lo))
(rule (repeat_sign_bit $I64 lo) (x64_cqto_zo lo))
4440
4441;; Checks to make sure that the input `Value` is a non-zero value for `sdiv`.
4442;;
4443;; This is required to differentiate the divide-by-zero trap from the
4444;; integer-overflow trap, the two trapping conditions of signed division.
(decl nonzero_sdiv_divisor (Type Value) Reg)

;; Constant divisor accepted by `safe_divisor_from_imm64`: materialize it
;; directly, with no runtime check.
(rule 1 (nonzero_sdiv_divisor ty (iconst _ k))
        (if-let safe (safe_divisor_from_imm64 ty k))
        (imm ty safe))

;; Any other divisor: put it in a register, `test` it against itself and trap
;; with `INTEGER_DIVISION_BY_ZERO` when the zero flag is set, then hand the
;; register back to the caller.
(rule 0 (nonzero_sdiv_divisor ty divisor)
      (let (
          (divisor_reg Reg divisor)
          (_ InstOutput
             (side_effect
               (with_flags_side_effect
                 (x64_test ty divisor_reg divisor_reg)
                 (trap_if (CC.Z) (TrapCode.INTEGER_DIVISION_BY_ZERO)))))
        )
        divisor_reg))
4457
4458;; Rules for `urem` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4459
4460;; The remainder is in AH, so take the result of the division and right-shift
4461;; by 8.
;; 8-bit remainder: same division as 8-bit `udiv`, followed by a right shift
;; of the result by 8.
(rule 2 (lower (urem _ dividend @ (value_type $I8) divisor))
        (let (
            (div_result Gpr
              (x64_divb_m (extend_to_gpr dividend $I32 (ExtendKind.Zero))
                          ;; see `udiv` for why this is not a `gpr_mem`
                          (put_in_gpr divisor)
                          (TrapCode.INTEGER_DIVISION_BY_ZERO)))
          )
          (x64_shrq_mi div_result 8)))

;; Wider remainder: same division as `udiv`, taking element 1 of the result
;; instead of element 0.
(rule 1 (lower (urem _ dividend @ (value_type (fits_in_64 ty)) divisor))
  (value_regs_get
    (x64_div ty
             dividend
             (imm $I64 0)
             (put_in_gpr divisor)
             (TrapCode.INTEGER_DIVISION_BY_ZERO))
    1))
4474
4475;; Rules for `srem` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4476
4477;; Special-cases first for constant `srem` where the checks for 0 and -1 aren't
4478;; applicable.
4479;;
4480;; Note that like `urem` for i8 types the result is in AH so to get the result
4481;; it's right-shifted down.
;; 8-bit `srem` by a constant accepted by `safe_divisor_from_imm64`: the
;; divisor is materialized as an immediate and the division result is shifted
;; right by 8.
(rule 3 (lower (srem _ dividend @ (value_type $I8) (iconst _ k)))
        (if-let safe (safe_divisor_from_imm64 $I8 k))
        (let (
            (extended Gpr (x64_cbtw_zo dividend))
            (div_result Gpr
              (x64_idivb_m extended
                           (imm $I8 safe)
                           (TrapCode.INTEGER_DIVISION_BY_ZERO)))
          )
          (x64_shrq_mi div_result 8)))
4489
;; Same as the above rule but for 16-to-64 bit types: the divisor is a
;; constant accepted by `safe_divisor_from_imm64`, so it is materialized
;; directly. The upper half of the dividend comes from `repeat_sign_bit` and
;; the remainder is element 1 of the `x64_idiv` result.
(rule 2 (lower (srem _ a @ (value_type ty) (iconst _ imm)))
        (if-let n (safe_divisor_from_imm64 ty imm))
        (let ((a Gpr a))
          (value_regs_get
            (x64_idiv ty a (repeat_sign_bit ty a) (imm ty n) (TrapCode.INTEGER_DIVISION_BY_ZERO))
            1)))
4500
;; 8-bit `srem` with a non-constant divisor: run `x64_checked_srem_seq8` on
;; the `x64_cbtw_zo` of the dividend, then shift the result right by 8.
(rule 1 (lower (srem _ dividend @ (value_type $I8) divisor))
  (let ((extended Gpr (x64_cbtw_zo dividend)))
    (x64_shrq_mi (x64_checked_srem_seq8 extended divisor) 8)))
4503
;; General `srem`: build the upper half of the dividend with
;; `repeat_sign_bit`, run `x64_checked_srem_seq`, and take element 1 of its
;; result.
(rule (lower (srem _ dividend @ (value_type ty) divisor))
      (let (
          (lo Gpr dividend)
          (op_size OperandSize (raw_operand_size_of_type ty))
          (sign_half Gpr (repeat_sign_bit ty lo))
          (results ValueRegs (x64_checked_srem_seq op_size lo sign_half divisor))
        )
        (value_regs_get results 1)))
4512
4513;; Rules for `umulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4514
;; 8-bit: unsigned `x64_mul8`, then a 16-bit logical right shift by 8.
(rule 0 (lower (umulhi _ lhs @ (value_type $I8) rhs))
        (x64_shrw_mi (x64_mul8 false lhs rhs) 8))

;; 16-to-64-bit: unsigned `x64_mul`, taking element 1 of the result.
(rule 1 (lower (umulhi _ lhs @ (value_type (ty_int_ref_16_to_64 ty)) rhs))
        (value_regs_get_gpr (x64_mul ty false lhs rhs) 1))

;; The BMI2 instruction set introduced `mulx`, which defines two registers;
;; if both are the same register then only the upper bits are defined. Using
;; that form slightly reduces register pressure because only one register is
;; clobbered.
(rule 2 (lower (umulhi _ lhs @ (value_type (ty_32_or_64 ty)) rhs))
        (if-let true (has_bmi2))
        (x64_mulx_hi ty lhs rhs))
4528
4529;; Rules for `smulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4530
;; 8-bit: signed `x64_mul8`, then a 16-bit arithmetic right shift by 8.
(rule 0 (lower (smulhi _ lhs @ (value_type $I8) rhs))
        (x64_sarw_mi (x64_mul8 true lhs rhs) 8))

;; 16-to-64-bit: signed `x64_mul`, taking element 1 of the result.
(rule 1 (lower (smulhi _ lhs @ (value_type (ty_int_ref_16_to_64 ty)) rhs))
        (value_regs_get_gpr (x64_mul ty true lhs rhs) 1))
4536
4537;; Rules for `get_pinned_reg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4538
;; Defers entirely to `read_pinned_gpr`.
(rule (lower (get_pinned_reg _)) (read_pinned_gpr))
4541
4542;; Rules for `set_pinned_reg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4543
;; Emits `write_pinned_gpr` on the operand as a side effect; the instruction
;; has no result value.
(rule (lower (set_pinned_reg new_value @ (value_type ty)))
      (side_effect
        (write_pinned_gpr new_value)))
4546
4547;; Rules for `vconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4548
;; General case: load the constant with `x64_xmm_load_const`.
(rule (lower (has_type ty (vconst _ bits)))
      ;; TODO use Inst::gen_constant() instead.
      (x64_xmm_load_const ty (const_to_vconst bits)))

;; Known constant patterns that can skip the 16-byte load: all-zeros and
;; all-ones.
(rule 1 (lower (has_type ty (vconst _ (u128_from_constant 0))))
        (xmm_zero ty))
(rule 1 (lower (has_type ty (vconst _ (u128_from_constant -1))))
        (vector_all_ones))
4556
4557;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4558
4559;; Special case for `pblendw` which takes an 8-bit immediate where each bit
4560;; indicates which lane of the two operands is chosen for the output. A bit of
;; 0 chooses the corresponding 16-bit lane from `a` and a bit of 1 chooses the
4562;; corresponding 16-bit lane from `b`.
;; Extractor, implemented externally, that yields the `pblendw` immediate for
;; a shuffle immediate when it matches.
(decl pblendw_imm (u8) Immediate)
(extern extractor pblendw_imm pblendw_imm)

;; Requires SSE4.1.
(rule 14 (lower (shuffle _ lhs rhs (pblendw_imm lane_bits)))
         (if-let true (has_sse41))
         (x64_pblendw lhs rhs lane_bits))
4568
4569;; When the shuffle looks like "concatenate `a` and `b` and shift right by n*8
4570;; bytes", that's a `palignr` instruction. Note that the order of operands are
4571;; swapped in the instruction here. The `palignr` instruction uses the second
4572;; operand as the low-order bytes and the first operand as high-order bytes,
4573;; so put `a` second.
;; Extractor, implemented externally, that yields the `palignr` immediate for
;; a shuffle immediate when it matches.
(decl palignr_imm_from_immediate (u8) Immediate)
(extern extractor palignr_imm_from_immediate palignr_imm_from_immediate)

;; Requires SSSE3. The shuffle operands are deliberately passed to `palignr`
;; in swapped order: second operand first.
(rule 13 (lower (shuffle _ lhs rhs (palignr_imm_from_immediate shift)))
         (if-let true (has_ssse3))
         (x64_palignr rhs lhs shift))
4579
4580;; Special case the `pshuf{l,h}w` instruction which shuffles four 16-bit
4581;; integers within one value, preserving the other four 16-bit integers in that
4582;; value (either the high or low half). The complicated logic is in the
4583;; extractors here implemented in Rust and note that there's two cases for each
4584;; instruction here to match when either the first or second shuffle operand is
4585;; used.
;; Externally-implemented extractors, one per instruction and per shuffle
;; operand.
(decl pshuflw_lhs_imm (u8) Immediate)
(extern extractor pshuflw_lhs_imm pshuflw_lhs_imm)
(decl pshuflw_rhs_imm (u8) Immediate)
(extern extractor pshuflw_rhs_imm pshuflw_rhs_imm)
(decl pshufhw_lhs_imm (u8) Immediate)
(extern extractor pshufhw_lhs_imm pshufhw_lhs_imm)
(decl pshufhw_rhs_imm (u8) Immediate)
(extern extractor pshufhw_rhs_imm pshufhw_rhs_imm)

;; `pshuflw` applied to the first or to the second shuffle operand.
(rule 12 (lower (shuffle _ lhs rhs (pshuflw_lhs_imm sel)))
         (x64_pshuflw lhs sel))
(rule 11 (lower (shuffle _ lhs rhs (pshuflw_rhs_imm sel)))
         (x64_pshuflw rhs sel))

;; `pshufhw` applied to the first or to the second shuffle operand.
(rule 10 (lower (shuffle _ lhs rhs (pshufhw_lhs_imm sel)))
         (x64_pshufhw lhs sel))
(rule 9 (lower (shuffle _ lhs rhs (pshufhw_rhs_imm sel)))
        (x64_pshufhw rhs sel))
4603
4604;; Special case for the `pshufd` instruction which will permute 32-bit values
4605;; within a single register. This is only applicable if the `imm` specified
4606;; selects 32-bit values from either `x` or `y`, but not both. This means
4607;; there's one rule for selecting from `x` and another rule for selecting from
4608;; `y`.
;; Externally-implemented extractors, one per shuffle operand.
(decl pshufd_lhs_imm (u8) Immediate)
(extern extractor pshufd_lhs_imm pshufd_lhs_imm)
(decl pshufd_rhs_imm (u8) Immediate)
(extern extractor pshufd_rhs_imm pshufd_rhs_imm)

;; `pshufd` applied to the first or to the second shuffle operand.
(rule 8 (lower (shuffle _ lhs rhs (pshufd_lhs_imm sel)))
        (x64_pshufd lhs sel))
(rule 7 (lower (shuffle _ lhs rhs (pshufd_rhs_imm sel)))
        (x64_pshufd rhs sel))
4618
;; Shuffle immediates that interleave the high or the low 8-bit lanes of the
;; two operands map onto `punpck{h,l}bw`.
(rule 6 (lower (shuffle _ lhs rhs
                 (u128_from_immediate 0x1f0f_1e0e_1d0d_1c0c_1b0b_1a0a_1909_1808)))
        (x64_punpckhbw lhs rhs))
(rule 6 (lower (shuffle _ lhs rhs
                 (u128_from_immediate 0x1707_1606_1505_1404_1303_1202_1101_1000)))
        (x64_punpcklbw lhs rhs))

;; The same for 16-bit lanes: `punpck{h,l}wd`.
(rule 6 (lower (shuffle _ lhs rhs
                 (u128_from_immediate 0x1f1e_0f0e_1d1c_0d0c_1b1a_0b0a_1918_0908)))
        (x64_punpckhwd lhs rhs))
(rule 6 (lower (shuffle _ lhs rhs
                 (u128_from_immediate 0x1716_0706_1514_0504_1312_0302_1110_0100)))
        (x64_punpcklwd lhs rhs))

;; The same for 32-bit lanes: `punpck{h,l}dq`.
(rule 6 (lower (shuffle _ lhs rhs
                 (u128_from_immediate 0x1f1e1d1c_0f0e0d0c_1b1a1918_0b0a0908)))
        (x64_punpckhdq lhs rhs))
(rule 6 (lower (shuffle _ lhs rhs
                 (u128_from_immediate 0x17161514_07060504_13121110_03020100)))
        (x64_punpckldq lhs rhs))

;; The same for 64-bit lanes: `punpck{h,l}qdq`.
(rule 6 (lower (shuffle _ lhs rhs
                 (u128_from_immediate 0x1f1e1d1c1b1a1918_0f0e0d0c0b0a0908)))
        (x64_punpckhqdq lhs rhs))
(rule 6 (lower (shuffle _ lhs rhs
                 (u128_from_immediate 0x1716151413121110_0706050403020100)))
        (x64_punpcklqdq lhs rhs))
4642
4643;; If the vector shift mask is all 0s then that means the first byte of the
4644;; first operand is broadcast to all bytes. Falling through would load an
4645;; all-zeros constant from a rip-relative location but it should be slightly
4646;; more efficient to execute the `pshufb` here-and-now with an xor'd-to-be-zero
4647;; register.
;; All-zero shuffle immediate with SSSE3: `pshufb` the first operand with a
;; zeroed register as the mask. The second operand is ignored.
(rule 6 (lower (shuffle _ lhs _ (u128_from_immediate 0)))
        (if-let true (has_ssse3))
        (x64_pshufb lhs (xmm_zero $I8X16)))
4651
4652;; Special case for the `shufps` instruction which will select two 32-bit values
4653;; from the first operand and two 32-bit values from the second operand. Note
4654;; that there is a second case here as well for when the operands can be
4655;; swapped.
4656;;
4657;; Note that the priority of this instruction is currently lower than the above
4658;; special cases since `shufps` handles many of them and for now it's
4659;; hypothesized that the dedicated instructions are better than `shufps`.
4660;; Someone with more knowledge about x86 timings should perhaps reorder the
4661;; rules here eventually though.
;; Externally-implemented extractors: one for the operands in shuffle order
;; and one for the operands swapped.
(decl shufps_imm (u8) Immediate)
(extern extractor shufps_imm shufps_imm)
(decl shufps_rev_imm (u8) Immediate)
(extern extractor shufps_rev_imm shufps_rev_imm)

;; Operands in shuffle order.
(rule 5 (lower (shuffle _ lhs rhs (shufps_imm sel)))
        (x64_shufps lhs rhs sel))

;; Operands swapped.
(rule 4 (lower (shuffle _ lhs rhs (shufps_rev_imm sel)))
        (x64_shufps rhs lhs sel))
4671
4672
4673;; If `lhs` and `rhs` are the same we can use a single PSHUFB to shuffle the XMM
4674;; register. We statically build `constructed_mask` to zero out any unknown lane
4675;; indices (may not be completely necessary: verification could fail incorrect
4676;; mask values) and fix the indexes to all point to the `dst` vector.
;; Both shuffle operands are the same value (the pattern repeats `v`) and
;; SSSE3 is available: one `pshufb` with the mask from `shuffle_0_31_mask`.
(rule 3 (lower (shuffle _ v v (vec_mask_from_immediate lanes)))
        (if-let true (has_ssse3))
        (x64_pshufb v (shuffle_0_31_mask lanes)))
4680
4681;; For the case where the shuffle mask contains out-of-bounds values (values
4682;; greater than 31) we must mask off those resulting values in the result of
4683;; `vpermi2b`.
;; Requires AVX512VL and AVX512VBMI. `perm_from_mask_with_zeros` supplies both
;; the permutation constant and a second constant that is ANDed with the
;; `vpermi2b` result.
(rule 2 (lower (shuffle _ lhs rhs
                 (vec_mask_from_immediate (perm_from_mask_with_zeros perm zeros))))
      (if-let true (has_avx512vl))
      (if-let true (has_avx512vbmi))
      (let ((indices Xmm (x64_xmm_load_const $I8X16 perm))
            (permuted Xmm (x64_vpermi2b indices lhs rhs)))
        (x64_andps permuted zeros)))
4688
4689;; However, if the shuffle mask contains no out-of-bounds values, we can use
4690;; `vpermi2b` without any masking.
;; Requires AVX512VL and AVX512VBMI. The permutation constant comes from
;; `perm_from_mask` and the `vpermi2b` result is used without further masking.
(rule 1 (lower (shuffle _ lhs rhs (vec_mask_from_immediate lanes)))
      (if-let true (has_avx512vl))
      (if-let true (has_avx512vbmi))
      (let ((indices Xmm (x64_xmm_load_const $I8X16 (perm_from_mask lanes))))
        (x64_vpermi2b indices lhs rhs)))
4695
4696;; If `lhs` and `rhs` are different, we must shuffle each separately and then OR
4697;; them together. This is necessary due to PSHUFB semantics. As in the case
4698;; above, we build the `constructed_mask` for each case statically.
;; Default-priority rule: shuffle each operand on its own through
;; `lower_pshufb`, the first with `shuffle_0_15_mask` and the second with
;; `shuffle_16_31_mask`, then OR the two partial results.
(rule (lower (shuffle _ lhs rhs (vec_mask_from_immediate lanes)))
      (let ((from_lhs Xmm (lower_pshufb lhs (shuffle_0_15_mask lanes)))
            (from_rhs Xmm (lower_pshufb rhs (shuffle_16_31_mask lanes))))
        (x64_por from_lhs from_rhs)))
4703
4704;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4705
4706;; SIMD swizzle; the following inefficient implementation is due to the Wasm
4707;; SIMD spec requiring mask indexes greater than 15 to have the same semantics
4708;; as a 0 index. For the spec discussion, see
4709;; https://github.com/WebAssembly/simd/issues/93. The CLIF semantics match the
4710;; Wasm SIMD semantics for this instruction. The instruction format maps to
4711;; variables like: %dst = swizzle %src, %mask
;; Add 0x70 to every mask byte with unsigned saturation (`paddusb`), then
;; shuffle through `lower_pshufb` with the adjusted mask.
(rule (lower (swizzle _ src indices))
      (let ((adjusted Xmm
              (x64_paddusb indices
                           (emit_u128_le_const 0x70707070707070707070707070707070))))
        (lower_pshufb src adjusted)))
4715
4716;; Rules for `x86_pshufb` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4717
;; Maps directly onto `pshufb`; the rule only applies when SSSE3 is available.
(rule (lower (x86_pshufb _ src indices))
      (if-let true (has_ssse3))
      (x64_pshufb src indices))
4721
4722;; A helper function to generate either the `pshufb` instruction or a libcall to
4723;; the `X86Pshufb` libcall. Note that the libcall is not exactly the most
4724;; performant thing in the world so this is primarily here for completeness
;; of lowerings on all x86 cpus, but rules are ideally gated on the presence
;; of SSSE3 to use the `pshufb` instruction itself.
4727;;
4728;; Recursion: at most once to implement the memory load case.
(decl rec lower_pshufb (Xmm RegMem) Xmm)

;; SSSE3 available: use the instruction itself.
(rule 1 (lower_pshufb src indices)
        (if-let true (has_ssse3))
        (x64_pshufb src indices))

;; Otherwise, with the mask in a register: call the `X86Pshufb` libcall.
(rule (lower_pshufb src (RegMem.Reg indices))
      (libcall_2 (LibCall.X86Pshufb) src indices))

;; Otherwise, with the mask in memory: load it with `movdqu` and re-enter
;; `lower_pshufb` with the loaded value.
(rule (lower_pshufb src (RegMem.Mem addr))
      (lower_pshufb src (x64_movdqu_load addr)))
4737
4738;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4739
4740;; Remove the extractlane instruction, leaving the float where it is. The upper
4741;; bits will remain unchanged; for correctness, this relies on Cranelift type
4742;; checking to avoid using those bits.
;; Lane 0 with a scalar-float result: the vector value is returned as-is.
(rule 3 (lower (has_type (ty_scalar_float _) (extractlane _ vec 0)))
        vec)

;; `f32x4.extract_lane N` where `N != 0`: `pshufd` with the lane index as the
;; immediate.
(rule 1 (lower (extractlane _ vec @ (value_type $F32X4) (u8_from_uimm8 idx)))
        (x64_pshufd vec idx))

;; `f64x2.extract_lane N` where `N != 0` (aka N == 1): `pshufd` with a fixed
;; immediate.
(rule (lower (extractlane _ vec @ (value_type $F64X2) 1))
      (x64_pshufd vec 0b11_10_11_10))
4753
4754;; `i8x16.extract_lane N`
4755;;
;; Note that without SSE4.1 a 16-bit lane extraction is performed and then
;; the result is shifted down if the desired index is odd (even indices need
;; no fixup).
;; SSE4.1: `pextrb` with the lane index.
(rule 2 (lower (extractlane _ vec @ (value_type $I8X16) (u8_from_uimm8 idx)))
        (if-let true (has_sse41))
        (x64_pextrb vec idx))

;; Odd lane index: extract the 16-bit lane `idx >> 1` with `pextrw`, then
;; shift the result right by 8.
(rule 1 (lower (extractlane _ vec @ (value_type $I8X16) (u8_from_uimm8 idx)))
        (if-let 1 (u8_and idx 1))
        (x64_shrw_mi (x64_pextrw vec (u8_wrapping_shr idx 1)) 8))

;; Even lane index: the 16-bit lane `idx >> 1` already holds the wanted byte
;; in its low bits. Leaving arbitrary upper bits in the returned register
;; should be ok since all operators on the resulting `i8` type should work
;; correctly regardless of the bits in the rest of the register.
(rule (lower (extractlane _ vec @ (value_type $I8X16) (u8_from_uimm8 idx)))
      (if-let 0 (u8_and idx 1))
      (x64_pextrw vec (u8_wrapping_shr idx 1)))
4772
;; `i16x8.extract_lane N`: `pextrw` takes the lane index as its immediate.
(rule (lower (extractlane _ vec @ (value_type $I16X8) (u8_from_uimm8 idx)))
      (x64_pextrw vec idx))
4776
;; `i32x4.extract_lane N`
;;
;; With SSE4.1, `pextrd` handles every lane index.
(rule 2 (lower (extractlane _ vec @ (value_type $I32X4) (u8_from_uimm8 idx)))
        (if-let true (has_sse41))
        (x64_pextrd vec idx))
;; Otherwise lane 0 is a plain `movd` from the vector register to a gpr...
(rule 1 (lower (extractlane _ vec @ (value_type $I32X4) 0))
        (x64_movd_to_gpr vec))
;; ...and any other lane is first shuffled into lane 0 with `pshufd` and then
;; moved out the same way.
(rule (lower (extractlane _ vec @ (value_type $I32X4) (u8_from_uimm8 idx)))
      (x64_movd_to_gpr (x64_pshufd vec idx)))
4785
;; `i64x2.extract_lane N`
;;
;; With SSE4.1, `pextrq` handles both lanes.
(rule 1 (lower (extractlane _ vec @ (value_type $I64X2) (u8_from_uimm8 idx)))
        (if-let true (has_sse41))
        (x64_pextrq vec idx))
;; Otherwise lane 0 is a plain `movq` from the vector register to a gpr.
(rule (lower (extractlane _ vec @ (value_type $I64X2) 0))
      (x64_movq_to_gpr vec))
;; Lane 1 is shuffled down first: the immediate moves dwords 2 and 3 into
;; dwords 0 and 1, after which the low 64 bits are moved out with `movq`.
(rule (lower (extractlane _ vec @ (value_type $I64X2) 1))
      (x64_movq_to_gpr (x64_pshufd vec 0b00_00_11_10)))
4794
4795;; Rules for `scalar_to_vector` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4796
;; Case 1: scalar floats. The xmm-register-to-xmm-register forms of `movss`
;; and `movsd` modify only the low bits of their destination, so moving the
;; scalar into a guaranteed-zero register produces a vector whose upper bits
;; are cleared. Starting from zero matters because the upper bits of the
;; scalar's own register are otherwise undefined.
(rule 1 (lower (scalar_to_vector _ scalar @ (value_type $F32)))
        (let ((zeroed Xmm (xmm_zero $F32X4)))
          (x64_movss_regmove zeroed scalar)))
(rule 1 (lower (scalar_to_vector _ scalar @ (value_type $F64)))
        (let ((zeroed Xmm (xmm_zero $F64X2)))
          (x64_movsd_regmove zeroed scalar)))
4805
;; Case 2: a scalar of any other type is moved from a gpr into an xmm register
;; with MOVD, sized by the bit width of the scalar's type, which zeroes the
;; upper lanes.
(rule (lower (scalar_to_vector _ scalar @ (value_type scalar_ty)))
      (bitcast_gpr_to_xmm (ty_bits scalar_ty) scalar))
4810
;; Case 3: `load + scalar_to_vector`. When the scalar comes from a sinkable
;; load, the pair is coalesced into a single MOVSS (32-bit types) or MOVSD
;; (64-bit types) load instruction.
(rule 2 (lower (scalar_to_vector _ (and (sinkable_load load) (value_type (ty_32 _)))))
      (x64_movss_load load))
(rule 3 (lower (scalar_to_vector _ (and (sinkable_load load) (value_type (ty_64 _)))))
      (x64_movsd_load load))
4817
4818;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4819
4820;; For all the splat rules below one of the goals is that splatting a value
4821;; doesn't end up accidentally depending on the previous value in a register.
4822;; This means that instructions are chosen to avoid false dependencies where
4823;; new values are created fresh or otherwise overwrite previous register
4824;; contents where possible.
4825;;
4826;; Additionally splats are specialized to special-case load-and-splat which
4827;; has a number of micro-optimizations available.
4828
;; i8x16 splats, from lowest to highest priority:
;;
;; * baseline: `punpcklbw` duplicates the low byte into a 16-bit lane, then
;;   `pshuflw` and `pshufd` broadcast that lane to the whole register;
;; * SSSE3: `pshufb` broadcasts with a mask of zero which is calculated with
;;   an xor-against-itself register;
;; * AVX2: `vpbroadcastb`;
;; * the load-and-splat forms of the SSSE3 and AVX2 cases.
(rule 0 (lower (has_type $I8X16 (splat _ src)))
        (let ((low Xmm (x64_movd_to_xmm src))
              (paired Xmm (x64_punpcklbw low low)))
          (x64_pshufd (x64_pshuflw paired 0) 0)))
(rule 1 (lower (has_type $I8X16 (splat _ src)))
        (if-let true (has_ssse3))
        (let ((low Xmm (bitcast_gpr_to_xmm 32 src))
              (zero_mask Xmm (xmm_zero $I8X16)))
          (x64_pshufb low zero_mask)))
(rule 2 (lower (has_type $I8X16 (splat _ src)))
        (if-let true (use_avx2))
        (x64_vpbroadcastb (bitcast_gpr_to_xmm 32 src)))
;; Load-and-splat: `pinsrb` (hence the SSE4.1 requirement) inserts the loaded
;; byte into lane 0 of an uninitialized register before the `pshufb`.
(rule 3 (lower (has_type $I8X16 (splat _ (sinkable_load_exact mem))))
        (if-let true (has_sse41))
        (if-let true (has_ssse3))
        (x64_pshufb (x64_pinsrb (xmm_uninit_value) mem 0) (xmm_zero $I8X16)))
(rule 4 (lower (has_type $I8X16 (splat _ (sinkable_load_exact mem))))
        (if-let true (use_avx2))
        (x64_vpbroadcastb mem))
4847
;; i16x8 splats: `vpbroadcastw` is used on AVX2. Otherwise the 16-bit value is
;; placed in an xmm register, `pshuflw` broadcasts the low 16-bit lane to the
;; low four lanes, and `pshufd` then broadcasts the low 32-bit lane (by now two
;; copies of the 16-bit value) to all the lanes.
(rule 0 (lower (has_type $I16X8 (splat _ src)))
        (let ((low Xmm (bitcast_gpr_to_xmm 32 src)))
          (x64_pshufd (x64_pshuflw low 0) 0)))
(rule 1 (lower (has_type $I16X8 (splat _ src)))
        (if-let true (use_avx2))
        (x64_vpbroadcastw (bitcast_gpr_to_xmm 32 src)))
;; Load-and-splat: `pinsrw` inserts the loaded value into lane 0 of an
;; uninitialized register, followed by the same two shuffles.
(rule 2 (lower (has_type $I16X8 (splat _ (sinkable_load_exact mem))))
        (x64_pshufd (x64_pshuflw (x64_pinsrw (xmm_uninit_value) mem 0) 0) 0))
(rule 3 (lower (has_type $I16X8 (splat _ (sinkable_load_exact mem))))
        (if-let true (use_avx2))
        (x64_vpbroadcastw mem))
4863
;; i32x4.splat - the value is moved from a gpr into an xmm register and then
;; `pshufd` with an immediate of 0 broadcasts the low lane to all other lanes.
;; On AVX2 `vpbroadcastd` is used for the broadcast instead.
;;
;; Note that sinkable-load cases come later
(rule 0 (lower (has_type $I32X4 (splat _ src)))
        (let ((low Xmm (bitcast_gpr_to_xmm 32 src)))
          (x64_pshufd low 0)))
(rule 1 (lower (has_type $I32X4 (splat _ src)))
        (if-let true (use_avx2))
        (let ((low Xmm (bitcast_gpr_to_xmm 32 src)))
          (x64_vpbroadcastd low)))
4873
;; f32x4.splat - the source is already in an xmm register so `shufps` is all
;; that's necessary to complete the splat. This is specialized to `vbroadcastss`
;; on AVX2 to leverage that specific instruction for this operation.
;;
;; The source is bound to `tmp` once and that register is used for both
;; `shufps` operands, so the two operands are guaranteed to be the same
;; register. An immediate of 0 selects lane 0 for every result lane.
(rule 0 (lower (has_type $F32X4 (splat _ src)))
        (let ((tmp Xmm src))
          (x64_shufps tmp tmp 0)))
(rule 1 (lower (has_type $F32X4 (splat _ src)))
        (if-let true (use_avx2))
        (x64_vbroadcastss src))
4883
;; t32x4.splat of a load - `movss` loads the value into an xmm register and
;; `shufps` then broadcasts it to the other lanes. The `(multi_lane 32 4)`
;; pattern makes this apply to both i32 and f32 splats.
;;
;; With AVX the `vbroadcastss` instruction suits this purpose precisely. Note
;; that the memory-operand encoding of `vbroadcastss` is usable with AVX, but
;; the register-based encoding is only available with AVX2. With the
;; `sinkable_load` extractor this should be guaranteed to use the memory-based
;; encoding hence the `has_avx` test.
(rule 5 (lower (has_type (multi_lane 32 4) (splat _ (sinkable_load mem))))
        (let ((loaded Xmm (x64_movss_load mem)))
          (x64_shufps loaded loaded 0)))
(rule 6 (lower (has_type (multi_lane 32 4) (splat _ (sinkable_load mem))))
        (if-let true (has_avx))
        (x64_vbroadcastss mem))
4899
;; t64x2.splat - `pshufd` broadcasts the lower 64-bit lane to the upper lane:
;; the immediate selects dwords 0 and 1 for both halves of the result. The i64
;; source goes through a gpr-to-xmm move first; the f64 source is explicitly
;; placed in an xmm register with `put_in_xmm`.
;;
;; A minor specialization for sinkable loads avoids going through a gpr for i64
;; splats when `movddup` is available.
(rule 0 (lower (has_type $I64X2 (splat _ src)))
        (let ((low Xmm (bitcast_gpr_to_xmm 64 src)))
          (x64_pshufd low 0b01_00_01_00)))
(rule 0 (lower (has_type $F64X2 (splat _ src)))
        (let ((low Xmm (put_in_xmm src)))
          (x64_pshufd low 0b01_00_01_00)))
(rule 6 (lower (has_type (multi_lane 64 2) (splat _ (sinkable_load mem))))
        (if-let true (has_sse3))
        (x64_movddup mem))
4910
4911;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4912
(rule (lower (vany_true _ vec)) (lower_cond_bool (is_vany_true vec)))

;; Produces the flags-setting comparison and condition code that answer
;; "is any lane of this vector nonzero?".
(decl is_vany_true (Value) CondResult)

;; SSE4.1: `ptest x, x` sets ZF exactly when `x` is all zeros, so NZ means that
;; some bit, and therefore some lane, is nonzero.
(rule 1 (is_vany_true vec)
        (if-let true (has_sse41))
        (let ((bits Xmm vec))
          (CondResult.CC (x64_ptest bits bits) (CC.NZ))))

;; Baseline: any nonzero byte in `vec` means that some lane is true. Compare
;; `vec` bytewise with a zeroed register and extract the high bits to a gpr
;; mask. If the mask is 0xffff then every byte was equal to zero, so test if
;; the comparison with 0xffff is not-equal, i.e. NZ.
(rule (is_vany_true vec)
      (let ((zero_bytes Xmm (x64_pcmpeqb vec (xmm_zero $I8X16)))
            (zero_mask Gpr (x64_pmovmskb zero_bytes)))
        (CondResult.CC (x64_cmpl_mi zero_mask 0xffff) (CC.NZ))))
4930
4931;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4932
(rule (lower (vall_true _ vec)) (lower_cond_bool (is_vall_true vec)))

;; Produces the flags-setting comparison and condition code that answer
;; "is every lane of this vector nonzero?".
(decl is_vall_true (Value) CondResult)

;; SSE4.1: compare each lane with zero using the integer lane type of the
;; vector, then `ptest` the comparison result against itself. ZF is set only
;; when no lane compared equal to zero, i.e. when all lanes are true.
(rule 1 (is_vall_true vec @ (value_type ty))
        (if-let true (has_sse41))
        (let ((lanes Xmm vec)
              (zero Xmm (xmm_zero ty))
              (is_zero Xmm (x64_pcmpeq (vec_int_type ty) lanes zero)))
          (CondResult.CC (x64_ptest is_zero is_zero) (CC.Z))))

;; Baseline: perform an appropriately-sized lane-wise comparison with zero and
;; gather the high bits of the result into a gpr mask. If the mask is all 0s
;; then all of the lanes are true because nothing was equal to zero.
(rule (is_vall_true vec @ (value_type ty))
      (let ((is_zero Xmm (x64_pcmpeq (vec_int_type ty) vec (xmm_zero ty)))
            (zero_mask Gpr (x64_pmovmskb is_zero)))
        (CondResult.CC (x64_testl_mr zero_mask zero_mask) (CC.Z))))
4950
4951;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4952
4953;; The Intel specification allows using both 32-bit and 64-bit GPRs as
4954;; destination for the "move mask" instructions. This is controlled by the REX.R
4955;; bit: "In 64-bit mode, the instruction can access additional registers when
4956;; used with a REX.R prefix. The default operand size is 64-bit in 64-bit mode"
4957;; (PMOVMSKB in IA Software Development Manual, vol. 2). This being the case, we
4958;; will always clear REX.W since its use is unnecessary (`OperandSize` is used
4959;; for setting/clearing REX.W) as we need at most 16 bits of output for
4960;; `vhigh_bits`.
4961
;; 8-bit lanes: `pmovmskb` gathers the high bit of every byte.
(rule (lower (vhigh_bits _ vec @ (value_type (multi_lane 8 16))))
      (x64_pmovmskb vec))

;; 32-bit lanes: `movmskps` gathers the high bit of every 32-bit lane.
(rule (lower (vhigh_bits _ vec @ (value_type (multi_lane 32 4))))
      (x64_movmskps vec))

;; 64-bit lanes: `movmskpd` gathers the high bit of every 64-bit lane.
(rule (lower (vhigh_bits _ vec @ (value_type (multi_lane 64 2))))
      (x64_movmskpd vec))
4970
;; There is no x86 instruction for extracting the high bit of 16-bit lanes so
;; here we:
;; - narrow the 16-bit lanes of the input into 8-bit lanes, twice over:
;;     PACKSSWB([x1, x2, ...], [x1, x2, ...]) = [x1', x2', ..., x1', x2', ...]
;;   (the signed saturation keeps each lane's sign bit);
;; - use PMOVMSKB to gather the high bits; the 8 wanted bits now appear twice;
;; - shift right by 8 to discard the duplicate copy in the low 8 bits.
(rule (lower (vhigh_bits _ vec @ (value_type (multi_lane 16 8))))
      (let ((lanes Xmm vec)
            (packed Xmm (x64_packsswb lanes lanes))
            (doubled_bits Gpr (x64_pmovmskb packed)))
        (x64_shrq_mi doubled_bits 8)))
4982
4983;; Rules for `iconcat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4984
;; Concatenating two `i64` halves emits no instructions here: the result is
;; the register pair made of the low half followed by the high half.
(rule (lower (iconcat _ low_half @ (value_type $I64) high_half))
      (value_regs low_half high_half))
4987
4988;; Rules for `isplit` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4989
;; Splitting an `i128` returns its two registers as two separate outputs:
;; register 0 first (the low half), then register 1 (the high half).
(rule (lower (isplit _ wide @ (value_type $I128)))
      (let ((halves ValueRegs wide))
        (output_pair (value_regs_get halves 0)
                     (value_regs_get halves 1))))
4995
4996;; Rules for `tls_value` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4997
;; One rule per supported TLS model; each passes the symbol's name to the
;; address-computation helper for that model.

;; ELF general-dynamic.
(rule (lower (has_type (tls_model (TlsModel.ElfGd)) (tls_value _ (symbol_value_data symbol _ _))))
      (elf_tls_get_addr symbol))

;; Mach-O.
(rule (lower (has_type (tls_model (TlsModel.Macho)) (tls_value _ (symbol_value_data symbol _ _))))
      (macho_tls_get_addr symbol))

;; COFF.
(rule (lower (has_type (tls_model (TlsModel.Coff)) (tls_value _ (symbol_value_data symbol _ _))))
      (coff_tls_get_addr symbol))
5006
5007;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
5008
;; SSSE3: `pmulhrsw` computes the rounded product but does not saturate. Its
;; only overflowing case produces 0x8000 in a lane, so lanes equal to 0x8000
;; are detected with `pcmpeqw` (yielding all-ones there) and flipped to 0x7fff
;; by the final `pxor`; all other lanes are xor-ed with zero and unchanged.
(rule 1 (lower (sqmul_round_sat _ qx @ (value_type $I16X8) qy))
        (if-let true (has_ssse3))
        (let ((lhs Xmm qx)
              (rhs Xmm qy)
              (overflow_pattern XmmMem (emit_u128_le_const 0x8000_8000_8000_8000_8000_8000_8000_8000))
              (product Xmm (x64_pmulhrsw lhs rhs))
              (overflowed Xmm (x64_pcmpeqw product overflow_pattern)))
          (x64_pxor product overflowed)))
5018
;; This operation is defined in wasm as:
;;
;;    S.SignedSaturate((x * y + 0x4000) >> 15)
;;
;; Without the native instruction each of those steps is performed manually.
(rule (lower (sqmul_round_sat _ qx @ (value_type $I16X8) qy))
      (let (
          (lhs Xmm qx)
          (rhs Xmm qy)
          ;; Form the full 32-bit products. `pmullw` yields the low 16 bits of
          ;; each product and `pmulhw` the high 16 bits; interleaving the two
          ;; gives the 32-bit products of the low four lanes (`wide_lo`) and
          ;; of the upper four lanes (`wide_hi`).
          (prod_low16 Xmm (x64_pmullw lhs rhs))
          (prod_high16 Xmm (x64_pmulhw lhs rhs))
          (wide_lo Xmm (x64_punpcklwd prod_low16 prod_high16))
          (wide_hi Xmm (x64_punpckhwd prod_low16 prod_high16))
          ;; Add the 0x4000 rounding constant to every product.
          (rounding Xmm (x64_movdqu_load (emit_u128_le_const 0x00004000_00004000_00004000_00004000)))
          (rounded_lo Xmm (x64_paddd wide_lo rounding))
          (rounded_hi Xmm (x64_paddd wide_hi rounding))
          ;; Arithmetic right-shift by 15 of every product.
          (shifted_lo Xmm (x64_psrad rounded_lo (xmi_imm 15)))
          (shifted_hi Xmm (x64_psrad rounded_hi (xmi_imm 15)))
        )
        ;; Finish with a saturating 32-to-16-bit narrowing of both halves.
        (x64_packssdw shifted_lo shifted_hi)))
5048
5049;; Rules for `x86_pmulhrsw` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
5050
;; Maps directly onto the `pmulhrsw` instruction. The rule only applies when
;; SSSE3 is available; no other rule for this opcode appears in this section.
(rule (lower (x86_pmulhrsw _ lhs @ (value_type $I16X8) rhs))
      (if-let true (has_ssse3))
      (x64_pmulhrsw lhs rhs))
5054
5055;; Rules for `uunarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
5056
;; TODO: currently we only lower a special case of `uunarrow` needed to support
;; the translation of wasm's i32x4.trunc_sat_f64x2_u_zero operation.
;; https://github.com/bytecodealliance/wasmtime/issues/4791
;;
;; y = i32x4.trunc_sat_f64x2_u_zero(x) is lowered to:
;; MOVAPD xmm_y, xmm_x
;; XORPD xmm_tmp, xmm_tmp
;; MAXPD xmm_y, xmm_tmp
;; MINPD xmm_y, [wasm_f64x2_splat(4294967295.0)]
;; ROUNDPD xmm_y, xmm_y, 0x0B
;; ADDPD xmm_y, [wasm_f64x2_splat(0x1.0p+52)]
;; SHUFPS xmm_y, xmm_tmp, 0x88
(rule (lower (uunarrow _ (fcvt_to_uint_sat _ src @ (value_type $F64X2))
                       (vconst _ (u128_from_constant 0))))
      (let ((input Xmm src)

            ;; MOVAPD xmm_y, xmm_x
            ;; XORPD xmm_tmp, xmm_tmp
            ;; MAXPD xmm_y, xmm_tmp
            ;; Clamp from below at zero.
            (zero Xmm (xmm_zero $F64X2))
            (non_negative Xmm (x64_maxpd input zero))

            ;; 4294967295.0 is equivalent to 0x41EFFFFFFFE00000
            (u32_max XmmMem (emit_u128_le_const 0x41EFFFFFFFE00000_41EFFFFFFFE00000))

            ;; MINPD xmm_y, [wasm_f64x2_splat(4294967295.0)]
            ;; Clamp from above at the largest u32.
            (clamped Xmm (x64_minpd non_negative u32_max))

            ;; ROUNDPD xmm_y, xmm_y, 0x0B
            ;; Round toward zero.
            (truncated Xmm (x64_round $F64X2 clamped (RoundImm.RoundZero)))

            ;; ADDPD xmm_y, [wasm_f64x2_splat(0x1.0p+52)]
            ;; 0x1.0p+52 is 0x4330000000000000. After adding it, the integer
            ;; value sits in the low 32 bits of each double.
            (two_pow_52 XmmMem (emit_u128_le_const 0x4330000000000000_4330000000000000))

            (biased Xmm (x64_addpd truncated two_pow_52)))

        ;; SHUFPS xmm_y, xmm_tmp, 0x88
        ;; Take dwords 0 and 2 of `biased` (the low half of each double) for
        ;; the two low result lanes and dwords 0 and 2 of `zero` for the two
        ;; high result lanes.
        (x64_shufps biased zero 0x88)))
5094
5095;; Rules for `get_exception_handler_address` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
5096
;; Look up the label of the block's exception successor at index `idx` and
;; produce the address of that label.
(rule (lower (get_exception_handler_address _ (u64_from_imm64 idx) block))
      (x64_label_address (block_exn_successor_label block idx)))
5100
5101;; Rules for `nop` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
5102
;; The rule body calls only `invalid_reg`, which becomes the rule's output.
(rule (lower (nop)) (invalid_reg))
5105
5106;; Rules for `sequence_point` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
5107
;; Lowered purely for its side effect: the `x64_sequence_point` result is
;; wrapped in `side_effect`.
(rule (lower (sequence_point))
      (side_effect (x64_sequence_point)))
5111