1;; aarch64 instruction selection and CLIF-to-MachInst lowering.
2
3;; The main lowering constructor term: takes a clif `Inst` and returns the
4;; register(s) within which the lowered instruction's result values live.
;; Verification spec for `lower`: the result is provided to be equal to the
;; term's single argument.
(spec (lower inst) (provide (= result inst)))
;; Declared `partial`, so `lower` is allowed to match no rule for a given input.
(decl partial lower (Inst) InstOutput)
8
9;; Variant of the main lowering constructor term, which receives an
10;; additional argument (a vector of branch targets to be used) for
11;; implementing branches.
12;; For two-branch instructions, the first target is `taken` and the second
13;; `not_taken`, even if it is a Fallthrough instruction: because we reorder
14;; blocks while we lower, the fallthrough in the new order is not (necessarily)
15;; the same as the fallthrough in CLIF. So, we use the explicitly-provided
16;; target.
;; Declared `partial`; takes the instruction plus its branch-target labels and
;; produces `Unit` rather than output registers.
(decl partial lower_branch (Inst MachLabelSlice) Unit)
18
19;;;; Rules for `iconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
20
;; Materialize an integer constant of type `ty` from its 64-bit immediate
;; payload, passing `ImmExtend.Zero` as the extension mode.
(rule iconst
      (lower (has_type ty (iconst _ (u64_from_imm64 bits))))
      (imm ty (ImmExtend.Zero) bits))
23
24;;;; Rules for `f16const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
25
;; The constant's raw 16-bit payload is handed to `constant_f16`.
(rule (lower (f16const _ (u16_from_ieee16 bits)))
      (constant_f16 bits))
28
29;;;; Rules for `f32const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30
;; The constant's raw 32-bit payload is handed to `constant_f32`.
(rule (lower (f32const _ (u32_from_ieee32 bits)))
      (constant_f32 bits))
33
34;;;; Rules for `f64const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
35
;; The constant's raw 64-bit payload is handed to `constant_f64`.
(rule (lower (f64const _ (u64_from_ieee64 bits)))
      (constant_f64 bits))
38
39;;;; Rules for `f128const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
40
;; The 128-bit payload is read from the constant pool entry and handed to
;; `constant_f128`.
(rule (lower (has_type $F128 (f128const _ (u128_from_constant bits))))
      (constant_f128 bits))
43
44;;;; Rules for `nop` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
45
;; `nop` lowers to nothing; the rule just yields `invalid_reg`.
(rule (lower (nop)) (invalid_reg))
48
49;;;; Rules for `iadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
50
51;; `i64` and smaller
52
;; Fallback: both operands are used from registers.
(rule iadd_base_case -1
      (lower (has_type (ty_int_ref_scalar_64 ty) (iadd _ lhs rhs)))
      (add ty lhs rhs))

;; One operand is an immediate that fits in 12 bits; the other operand is
;; used from a register.
(rule iadd_imm12_right 4
      (lower (has_type (ty_int_ref_scalar_64 ty)
                       (iadd _ lhs (imm12_from_value rhs_imm))))
      (add_imm ty lhs rhs_imm))

(rule iadd_imm12_left 5
      (lower (has_type (ty_int_ref_scalar_64 ty)
                       (iadd _ (imm12_from_value lhs_imm) rhs)))
      (add_imm ty rhs lhs_imm))

;; Like the immediate cases above, except that it is the *negated* immediate
;; that fits in 12 bits, so the addition is emitted as a subtraction.
(rule iadd_imm12_neg_right 2
      (lower (has_type (ty_int_ref_scalar_64 ty) (iadd _ lhs rhs)))
      (if-let neg_imm (imm12_from_negated_value rhs))
      (sub_imm ty lhs neg_imm))

(rule iadd_imm12_neg_left 3
      (lower (has_type (ty_int_ref_scalar_64 ty) (iadd _ lhs rhs)))
      (if-let neg_imm (imm12_from_negated_value lhs))
      (sub_imm ty rhs neg_imm))
73
;; One operand is an extended register whose extension can be folded into
;; the add itself.
(rule iadd_extend_right 0
      (lower (has_type (ty_int_ref_scalar_64 ty)
                       (iadd _ lhs (extended_value_from_value rhs_ext))))
      (add_extend ty lhs rhs_ext))

(rule iadd_extend_left 1
      (lower (has_type (ty_int_ref_scalar_64 ty)
                       (iadd _ (extended_value_from_value lhs_ext) rhs)))
      (add_extend ty rhs lhs_ext))
81
;; One operand is another register shifted left by a constant amount; when
;; `lshl_from_imm64` accepts the amount, the shift is folded into the add.
(rule iadd_ishl_right 7
      (lower (has_type (ty_int_ref_scalar_64 ty)
                       (iadd _ addend (ishl _ shifted (iconst _ k)))))
      (if-let shift_amt (lshl_from_imm64 ty k))
      (add_shift ty addend shifted shift_amt))

(rule iadd_ishl_left 6
      (lower (has_type (ty_int_ref_scalar_64 ty)
                       (iadd _ (ishl _ shifted (iconst _ k)) addend)))
      (if-let shift_amt (lshl_from_imm64 ty k))
      (add_shift ty addend shifted shift_amt))
93
;; An `iadd` whose operand is an `imul` becomes a single `madd`: the two
;; multiplicands come first, the addend last.
(rule iadd_imul_right 7
      (lower (has_type (ty_int_ref_scalar_64 ty)
                       (iadd _ addend (imul _ m1 m2))))
      (madd ty m1 m2 addend))

(rule iadd_imul_left 6
      (lower (has_type (ty_int_ref_scalar_64 ty)
                       (iadd _ (imul _ m1 m2) addend)))
      (madd ty m1 m2 addend))
100
;; An `isub` whose subtrahend is an `imul` becomes a single `msub`: the two
;; multiplicands come first, the minuend last.
(rule isub_imul
      (lower (has_type (ty_int_ref_scalar_64 ty)
                       (isub _ minuend (imul _ m1 m2))))
      (msub ty m1 m2 minuend))
104
105;; vectors
106
;; Any multi-lane type: lane-wise vector add sized by `vector_size ty`.
(rule -2
      (lower (has_type ty @ (multi_lane _ _) (iadd _ lhs rhs)))
      (add_vec lhs rhs (vector_size ty)))
109
;; `i128`: each operand is a pair of 64-bit registers (index 0 = low half,
;; index 1 = high half).
(rule -3 (lower (has_type $I128 (iadd _ lhs rhs)))
      (let ((lhs_pair ValueRegs lhs)
            (lhs_lo Reg (value_regs_get lhs_pair 0))
            (lhs_hi Reg (value_regs_get lhs_pair 1))
            (rhs_pair ValueRegs rhs)
            (rhs_lo Reg (value_regs_get rhs_pair 0))
            (rhs_hi Reg (value_regs_get rhs_pair 1)))
        ;; `adds` on the low halves produces the carry flag that the `adc` on
        ;; the high halves consumes; together they form the low/high bits of
        ;; the result.
        (with_flags
          (add_with_flags_paired $I64 lhs_lo rhs_lo)
          (adc_paired $I64 lhs_hi rhs_hi))))
127
128;;;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
129
;; A shuffle that broadcasts one lane of a single input to every destination
;; lane can use the `dup` instruction. For now only lane selection from the
;; first input is matched (the second input is ignored by these rules); rules
;; selecting from the second input can be added in the future if necessary.
(rule 6 (lower (shuffle _ src _ (shuffle_dup8_from_imm lane)))
        (vec_dup_from_fpu src (VectorSize.Size8x16) lane))
(rule 5 (lower (shuffle _ src _ (shuffle_dup16_from_imm lane)))
        (vec_dup_from_fpu src (VectorSize.Size16x8) lane))
(rule 4 (lower (shuffle _ src _ (shuffle_dup32_from_imm lane)))
        (vec_dup_from_fpu src (VectorSize.Size32x4) lane))
(rule 3 (lower (shuffle _ src _ (shuffle_dup64_from_imm lane)))
        (vec_dup_from_fpu src (VectorSize.Size64x2) lane))
142
;; Extractors over a shuffle `Immediate`. Each one matches when the immediate
;; looks like a duplication of the `n`th lane of the first vector, for lanes
;; of the given bit-width, and yields `n` as a `u8` to be used as part of a
;; `vec_dup_from_fpu` instruction. There is one extractor per lane bit-width.
(decl shuffle_dup8_from_imm (u8) Immediate)
(decl shuffle_dup16_from_imm (u8) Immediate)
(decl shuffle_dup32_from_imm (u8) Immediate)
(decl shuffle_dup64_from_imm (u8) Immediate)

;; Each extractor is implemented externally under the same name.
(extern extractor shuffle_dup8_from_imm shuffle_dup8_from_imm)
(extern extractor shuffle_dup16_from_imm shuffle_dup16_from_imm)
(extern extractor shuffle_dup32_from_imm shuffle_dup32_from_imm)
(extern extractor shuffle_dup64_from_imm shuffle_dup64_from_imm)
156
;; When the shuffle looks like "concatenate the two inputs and shift right by
;; `n` bytes (n*8 bits)", that's an `ext` instruction.
(rule 2 (lower (shuffle _ lhs rhs (vec_extract_imm4_from_immediate offset)))
        (vec_extract lhs rhs offset))
161
;; Matches a shuffle `Immediate` whose bytes form a consecutive sequence
;; starting at `n`, and yields that `n`. The value feeds `vec_extract`, which
;; takes consecutive bytes from two vectors, offset by `n` bytes, to build the
;; final vector.
(decl vec_extract_imm4_from_immediate (u8) Immediate)
;; Implemented externally under the same name.
(extern extractor vec_extract_imm4_from_immediate vec_extract_imm4_from_immediate)
169
;; `uzp1` / `uzp2`: the mask gathers the even-numbered (`uzp1`) or the
;; odd-numbered (`uzp2`) lanes of the two inputs, one pair of rules per lane
;; width.

;; 8-bit lanes
(rule 1 (lower (shuffle _ lhs rhs (u128_from_immediate 0x1e1c_1a18_1614_1210_0e0c_0a08_0604_0200)))
        (vec_uzp1 lhs rhs (VectorSize.Size8x16)))
(rule 1 (lower (shuffle _ lhs rhs (u128_from_immediate 0x1f1d_1b19_1715_1311_0f0d_0b09_0705_0301)))
        (vec_uzp2 lhs rhs (VectorSize.Size8x16)))

;; 16-bit lanes
(rule 1 (lower (shuffle _ lhs rhs (u128_from_immediate 0x1d1c_1918_1514_1110_0d0c_0908_0504_0100)))
        (vec_uzp1 lhs rhs (VectorSize.Size16x8)))
(rule 1 (lower (shuffle _ lhs rhs (u128_from_immediate 0x1f1e_1b1a_1716_1312_0f0e_0b0a_0706_0302)))
        (vec_uzp2 lhs rhs (VectorSize.Size16x8)))

;; 32-bit lanes
(rule 1 (lower (shuffle _ lhs rhs (u128_from_immediate 0x1b1a1918_13121110_0b0a0908_03020100)))
        (vec_uzp1 lhs rhs (VectorSize.Size32x4)))
(rule 1 (lower (shuffle _ lhs rhs (u128_from_immediate 0x1f1e1d1c_17161514_0f0e0d0c_07060504)))
        (vec_uzp2 lhs rhs (VectorSize.Size32x4)))

;; 64-bit lanes
(rule 1 (lower (shuffle _ lhs rhs (u128_from_immediate 0x1716151413121110_0706050403020100)))
        (vec_uzp1 lhs rhs (VectorSize.Size64x2)))
(rule 1 (lower (shuffle _ lhs rhs (u128_from_immediate 0x1f1e1d1c1b1a1918_0f0e0d0c0b0a0908)))
        (vec_uzp2 lhs rhs (VectorSize.Size64x2)))
188
;; `zip1` / `zip2`: the mask interleaves the lanes of the low halves (`zip1`)
;; or the high halves (`zip2`) of the two inputs.

;; 8-bit lanes
(rule 1 (lower (shuffle _ lhs rhs (u128_from_immediate 0x1707_1606_1505_1404_1303_1202_1101_1000)))
        (vec_zip1 lhs rhs (VectorSize.Size8x16)))
(rule 1 (lower (shuffle _ lhs rhs (u128_from_immediate 0x1f0f_1e0e_1d0d_1c0c_1b0b_1a0a_1909_1808)))
        (vec_zip2 lhs rhs (VectorSize.Size8x16)))

;; 16-bit lanes
(rule 1 (lower (shuffle _ lhs rhs (u128_from_immediate 0x1716_0706_1514_0504_1312_0302_1110_0100)))
        (vec_zip1 lhs rhs (VectorSize.Size16x8)))
(rule 1 (lower (shuffle _ lhs rhs (u128_from_immediate 0x1f1e_0f0e_1d1c_0d0c_1b1a_0b0a_1918_0908)))
        (vec_zip2 lhs rhs (VectorSize.Size16x8)))

;; 32-bit lanes
(rule 1 (lower (shuffle _ lhs rhs (u128_from_immediate 0x17161514_07060504_13121110_03020100)))
        (vec_zip1 lhs rhs (VectorSize.Size32x4)))
(rule 1 (lower (shuffle _ lhs rhs (u128_from_immediate 0x1f1e1d1c_0f0e0d0c_1b1a1918_0b0a0908)))
        (vec_zip2 lhs rhs (VectorSize.Size32x4)))

;; There are no i64x2 rules for zip1/zip2: with 64-bit lanes zip and uzp have
;; the same semantics, so the i64x2 cases of uzp1/uzp2 already cover them.
206
;; `trn1` / `trn2`: the mask interleaves the even-numbered (`trn1`) or the
;; odd-numbered (`trn2`) lanes of the two inputs.

;; 8-bit lanes
(rule 1 (lower (shuffle _ lhs rhs (u128_from_immediate 0x1e0e_1c0c_1a0a_1808_1606_1404_1202_1000)))
        (vec_trn1 lhs rhs (VectorSize.Size8x16)))
(rule 1 (lower (shuffle _ lhs rhs (u128_from_immediate 0x1f0f_1d0d_1b0b_1909_1707_1505_1303_1101)))
        (vec_trn2 lhs rhs (VectorSize.Size8x16)))

;; 16-bit lanes
(rule 1 (lower (shuffle _ lhs rhs (u128_from_immediate 0x1d1c_0d0c_1918_0908_1514_0504_1110_0100)))
        (vec_trn1 lhs rhs (VectorSize.Size16x8)))
(rule 1 (lower (shuffle _ lhs rhs (u128_from_immediate 0x1f1e_0f0e_1b1a_0b0a_1716_0706_1312_0302)))
        (vec_trn2 lhs rhs (VectorSize.Size16x8)))

;; 32-bit lanes
(rule 1 (lower (shuffle _ lhs rhs (u128_from_immediate 0x1b1a1918_0b0a0908_13121110_03020100)))
        (vec_trn1 lhs rhs (VectorSize.Size32x4)))
(rule 1 (lower (shuffle _ lhs rhs (u128_from_immediate 0x1f1e1d1c_0f0e0d0c_17161514_07060504)))
        (vec_trn2 lhs rhs (VectorSize.Size32x4)))

;; There are no i64x2 rules for trn1/trn2: with 64-bit lanes trn and uzp have
;; the same semantics, so the i64x2 cases of uzp1/uzp2 already cover them.
224
;; `rev16` / `rev32` / `rev64`: reverse the order of the elements inside each
;; 16-, 32- or 64-bit container, where the element size is given by the
;; `VectorSize` argument. All of these patterns only match reversals of the
;; first operand (the second operand is ignored); they could be extended to
;; reversals of the second operand if that ever becomes necessary.

;; Byte reversal within 16-bit containers.
(rule 1 (lower (shuffle _ src _ (u128_from_immediate 0x0e0f_0c0d_0a0b_0809_0607_0405_0203_0001)))
        (rev16 src (VectorSize.Size8x16)))

;; Byte and 16-bit reversal within 32-bit containers.
(rule 1 (lower (shuffle _ src _ (u128_from_immediate 0x0c0d0e0f_08090a0b_04050607_00010203)))
        (rev32 src (VectorSize.Size8x16)))
(rule 1 (lower (shuffle _ src _ (u128_from_immediate 0x0d0c0f0e_09080b0a_05040706_01000302)))
        (rev32 src (VectorSize.Size16x8)))

;; Byte, 16-bit and 32-bit reversal within 64-bit containers.
(rule 1 (lower (shuffle _ src _ (u128_from_immediate 0x08090a0b0c0d0e0f_0001020304050607)))
        (rev64 src (VectorSize.Size8x16)))
(rule 1 (lower (shuffle _ src _ (u128_from_immediate 0x09080b0a0d0c0f0e_0100030205040706)))
        (rev64 src (VectorSize.Size16x8)))
(rule 1 (lower (shuffle _ src _ (u128_from_immediate 0x0b0a09080f0e0d0c_0302010007060504)))
        (rev64 src (VectorSize.Size32x4)))
241
;; Default-priority fallback for any other mask: materialize the 128-bit mask
;; as a constant in a register and do a two-register table lookup with it.
(rule (lower (has_type ty (shuffle _ lhs rhs (u128_from_immediate mask))))
      (let ((table_mask Reg (constant_f128 mask)))
        (vec_tbl2 lhs rhs table_mask ty)))
245
246;;;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
247
;; Both operands are passed straight through to a table lookup.
;; NOTE(review): `vec_i128_ty` is a plain variable binding here, not an
;; extractor, so it does not itself restrict the matched type — confirm that
;; this is intended.
(rule (lower (has_type vec_i128_ty (swizzle _ table indices)))
      (vec_tbl table indices))
250
251;;;; Rules for `isplit` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
252
;; General case: an `i128` value occupies two registers; return register 0 as
;; the first result and register 1 as the second.
(rule (lower (isplit _ wide @ (value_type $I128)))
      (let ((halves ValueRegs wide)
            (lo_half ValueRegs (value_regs_get halves 0))
            (hi_half ValueRegs (value_regs_get halves 1)))
        (output_pair lo_half hi_half)))

;; Special case: `isplit` of a 128-bit multiply whose operands are both zero-
;; or both sign-extended, and whose low result is unused. Only the high 64
;; bits are needed, which maps directly to `umulh` / `smulh`; the unused low
;; result gets `invalid_reg`.
(rule 1 (lower inst @ (isplit _ (has_type $I128 (imul _ (uextend _ a) (uextend _ b)))))
        (if-let (first_result lo_val) inst)
        (if-let true (value_is_unused lo_val))
        (output_pair (invalid_reg)
                     (umulh $I64 (put_in_reg_zext64 a) (put_in_reg_zext64 b))))

(rule 1 (lower inst @ (isplit _ (has_type $I128 (imul _ (sextend _ a) (sextend _ b)))))
        (if-let (first_result lo_val) inst)
        (if-let true (value_is_unused lo_val))
        (output_pair (invalid_reg)
                     (smulh $I64 (put_in_reg_sext64 a) (put_in_reg_sext64 b))))
274
275;;;; Rules for `iconcat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
276
;; The `i128` result is simply the register pair formed from the two inputs,
;; low half first.
(rule (lower (has_type $I128 (iconcat _ low_half high_half)))
      (output (value_regs low_half high_half)))
279
280;;;; Rules for `scalar_to_vector` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
281
;; Float vectors: `fpu_extend` with the scalar size of one lane.
(rule (lower (has_type $F32X4 (scalar_to_vector _ scalar)))
      (fpu_extend scalar (ScalarSize.Size32)))

(rule (lower (has_type $F64X2 (scalar_to_vector _ scalar)))
      (fpu_extend scalar (ScalarSize.Size64)))

;; 64-bit integer scalar: moved to the FPU as-is.
(rule -1 (lower (scalar_to_vector _ scalar @ (value_type $I64)))
      (mov_to_fpu scalar (ScalarSize.Size64)))

;; Integer scalars of at most 32 bits: zero-extended to 32 bits first, then
;; moved to the FPU.
(rule -2 (lower (scalar_to_vector _ scalar @ (value_type (int_fits_in_32 _))))
      (mov_to_fpu (put_in_reg_zext32 scalar) (ScalarSize.Size32)))
293
294;;;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
295
;; Two 64-bit lanes. Emitted sequence:
;;
;;   cmeq vtmp.2d, vm.2d, #0
;;   addp dtmp, vtmp.2d
;;   fcmp dtmp, dtmp
;;   cset xd, eq
;;
;; After the ADDP the temporary holds 0 when every input element is true
;; (i.e. non-zero), and otherwise a bit pattern that is a NaN (-1 or -2 when
;; read as an integer). NaNs are the only floating-point numbers that compare
;; unequal to themselves, so comparing the temporary with itself and testing
;; for `eq` yields the result.
(rule (lower (vall_true _ vec @ (value_type (multi_lane 64 2))))
      (let ((is_zero Reg (cmeq0 vec (VectorSize.Size64x2)))
            (pair_sum Reg (addp is_zero is_zero (VectorSize.Size64x2))))
        (with_flags (fpu_cmp (ScalarSize.Size64) pair_sum pair_sum)
                    (materialize_bool_result (Cond.Eq)))))
310
;; Two 32-bit lanes: the whole 64-bit vector is moved into one general-purpose
;; register, and both lanes are tested with a compare followed by a
;; conditional compare on that register.
;; NOTE(review): the `cmp_rr_shift` against the zero register with amount 32
;; presumably tests the upper lane, and the 32-bit `ccmp_imm` the lower one —
;; confirm against the definitions of those helpers.
(rule (lower (vall_true _ vec @ (value_type (multi_lane 32 2))))
      (let ((both_lanes Reg (mov_from_vec vec 0 (ScalarSize.Size64))))
        (with_flags
          (cmp_rr_shift (OperandSize.Size64) (zero_reg) both_lanes 32)
          (ccmp_imm (OperandSize.Size32)
                    both_lanes
                    (u8_into_uimm5 0)
                    (nzcv false true false false)
                    (Cond.Ne)))))
320
;; Remaining vectors with lanes of at most 32 bits (32x2 is excluded and
;; handled by its own rule): reduce to a scalar with `uminv`, then compare
;; that scalar against zero.
;;
;;   uminv bn, vm.16b
;;   mov xm, vn.d[0]
;;   cmp xm, #0
;;   cset xm, ne
(rule -1 (lower (vall_true _ vec @ (value_type (lane_fits_in_32 ty))))
      (if (not_vec32x2 ty))
      (let ((lane_min Reg (vec_lanes (VecLanesOp.Uminv) vec (vector_size ty)))
            (min_gpr Reg (mov_from_vec lane_min 0 (ScalarSize.Size64))))
        (with_flags (cmp_imm (OperandSize.Size64) min_gpr (u8_into_imm12 0))
                    (materialize_bool_result (Cond.Ne)))))
334
335;;;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
336
;; `vanytrue` sets the flags for the input vector of type `in_ty`; the boolean
;; result is then materialized from the `ne` condition.
(rule (lower (vany_true _ vec @ (value_type in_ty)))
      (with_flags
        (vanytrue vec in_ty)
        (materialize_bool_result (Cond.Ne))))
340
341;;;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
342
;; Widening pairwise adds: when the two operands are the low and high
;; widenings of the *same* vector, a single long pairwise add does the job.

;; Signed: the `i16x8.extadd_pairwise_i8x16_s` wasm instruction.
(rule (lower (has_type $I16X8 (iadd_pairwise _ (swiden_low _ src) (swiden_high _ src))))
      (saddlp8 src))

;; Signed: the `i32x4.extadd_pairwise_i16x8_s` wasm instruction.
(rule (lower (has_type $I32X4 (iadd_pairwise _ (swiden_low _ src) (swiden_high _ src))))
      (saddlp16 src))

;; Unsigned: the `i16x8.extadd_pairwise_i8x16_u` wasm instruction.
(rule (lower (has_type $I16X8 (iadd_pairwise _ (uwiden_low _ src) (uwiden_high _ src))))
      (uaddlp8 src))

;; Unsigned: the `i32x4.extadd_pairwise_i16x8_u` wasm instruction.
(rule (lower (has_type $I32X4 (iadd_pairwise _ (uwiden_low _ src) (uwiden_high _ src))))
      (uaddlp16 src))

;; Everything else: a plain `addp` of the two operands.
(rule -1 (lower (has_type ty (iadd_pairwise _ lhs rhs)))
      (addp lhs rhs (vector_size ty)))
361
362;;;; Rules for `iabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
363
;; 64-bit scalar.
(rule iabs_64 2
      (lower (has_type $I64 (iabs _ val)))
      (abs (OperandSize.Size64) val))

;; Scalars of at most 32 bits: sign-extend to 32 bits, then take the 32-bit
;; absolute value.
(rule iabs_8_16_32 1
      (lower (has_type (fits_in_32 ty) (iabs _ val)))
      (abs (OperandSize.Size32) (put_in_reg_sext32 val)))

;; Vectors: lane-wise absolute value.
(rule -1
      (lower (has_type ty @ (multi_lane _ _) (iabs _ val)))
      (vec_abs val (vector_size ty)))
372
;; `i128`, following the `rustc` implementation:
;; - create a bitmask that is all 1s if `x` is negative and all 0s otherwise
;;   (shift the high half right by 63);
;; - xor both halves with the bitmask, then subtract the bitmask from the
;;   xor'd value, with the borrow carried from the low half into the high
;;   half through the flags;
;; - if `x` is non-negative, the xor'd bits = x and the mask = 0, so the
;;   result is `x - 0`;
;; - if `x` is negative, the xor'd bits = ~x and the mask = -1, so the result
;;   is `~x - (-1) = ~x + 1`, which is exactly `abs(x)`.
(rule (lower (has_type $I128 (iabs _ val)))
      (let ((val_regs ValueRegs val)
            (lo Reg (value_regs_get val_regs 0))
            (hi Reg (value_regs_get val_regs 1))
            (sign_mask Reg (asr_imm $I64 hi (imm_shift_from_u8 63)))
            (flipped_hi Reg (eor $I64 hi sign_mask))
            (flipped_lo Reg (eor $I64 lo sign_mask))
            (sub_lo ProducesFlags (sub_with_flags_paired $I64 flipped_lo sign_mask))
            (sub_hi ConsumesFlags (sbc_paired $I64 flipped_hi sign_mask)))
        (with_flags sub_lo sub_hi)))
390
391;;;; Rules for `avg_round` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
392
;; 64-bit lanes are computed by hand as
;;   (x >> 1) + (y >> 1) + ((x | y) & 1)
;; where the last term is the rounding bit that the two shifts would
;; otherwise drop.
(rule (lower (has_type $I64X2 (avg_round _ x y)))
      (let ((lsb_ones Reg (splat_const 1 (VectorSize.Size64x2)))
            (either Reg (orr_vec x y (VectorSize.Size64x2)))
            (round_bit Reg (and_vec either lsb_ones (VectorSize.Size64x2)))
            (x_half Reg (ushr_vec_imm x 1 (VectorSize.Size64x2)))
            (y_half Reg (ushr_vec_imm y 1 (VectorSize.Size64x2)))
            (halves_sum Reg (add_vec x_half y_half (VectorSize.Size64x2))))
        (add_vec round_bit halves_sum (VectorSize.Size64x2))))

;; Lanes of at most 32 bits use the `Urhadd` vector op directly.
(rule -1 (lower (has_type (lane_fits_in_32 ty) (avg_round _ lhs rhs)))
      (vec_rrr (VecALUOp.Urhadd) lhs rhs (vector_size ty)))
404
405;;;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
406
;; Any multi-lane type maps onto the `Sqrdmulh` vector op.
(rule (lower (has_type ty @ (multi_lane _ _) (sqmul_round_sat _ lhs rhs)))
      (vec_rrr (VecALUOp.Sqrdmulh) lhs rhs (vector_size ty)))
409
410;;;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
411
;; Scalar floats: two-operand FPU `Add`.
(rule (lower (has_type (ty_scalar_float ty) (fadd _ lhs rhs)))
      (fpu_rrr (FPUOp2.Add) lhs rhs (scalar_size ty)))

;; Vectors (lower priority): lane-wise `Fadd`.
(rule -1 (lower (has_type ty @ (multi_lane _ _) (fadd _ lhs rhs)))
      (vec_rrr (VecALUOp.Fadd) lhs rhs (vector_size ty)))
417
418;;;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
419
;; Scalar floats: two-operand FPU `Sub`.
(rule (lower (has_type (ty_scalar_float ty) (fsub _ lhs rhs)))
      (fpu_rrr (FPUOp2.Sub) lhs rhs (scalar_size ty)))

;; Vectors (lower priority): lane-wise `Fsub`.
(rule -1 (lower (has_type ty @ (multi_lane _ _) (fsub _ lhs rhs)))
      (vec_rrr (VecALUOp.Fsub) lhs rhs (vector_size ty)))
425
426;;;; Rules for `fmul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
427
;; Scalar floats: two-operand FPU `Mul`.
(rule (lower (has_type (ty_scalar_float ty) (fmul _ lhs rhs)))
      (fpu_rrr (FPUOp2.Mul) lhs rhs (scalar_size ty)))

;; Vectors (lower priority): lane-wise `Fmul`.
(rule -1 (lower (has_type ty @ (multi_lane _ _) (fmul _ lhs rhs)))
      (vec_rrr (VecALUOp.Fmul) lhs rhs (vector_size ty)))
433
434;;;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
435
;; Scalar floats: two-operand FPU `Div`.
(rule (lower (has_type (ty_scalar_float ty) (fdiv _ lhs rhs)))
      (fpu_rrr (FPUOp2.Div) lhs rhs (scalar_size ty)))

;; Vectors (lower priority): lane-wise `Fdiv`.
(rule -1 (lower (has_type ty @ (multi_lane _ _) (fdiv _ lhs rhs)))
      (vec_rrr (VecALUOp.Fdiv) lhs rhs (vector_size ty)))
441
442;;;; Rules for `fmin` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
443
;; Scalar floats: two-operand FPU `Min`.
(rule (lower (has_type (ty_scalar_float ty) (fmin _ lhs rhs)))
      (fpu_rrr (FPUOp2.Min) lhs rhs (scalar_size ty)))

;; Vectors (lower priority): lane-wise `Fmin`.
(rule -1 (lower (has_type ty @ (multi_lane _ _) (fmin _ lhs rhs)))
      (vec_rrr (VecALUOp.Fmin) lhs rhs (vector_size ty)))
449
450;;;; Rules for `fmax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
451
;; Scalar floats: two-operand FPU `Max`.
(rule (lower (has_type (ty_scalar_float ty) (fmax _ lhs rhs)))
      (fpu_rrr (FPUOp2.Max) lhs rhs (scalar_size ty)))

;; Vectors (lower priority): lane-wise `Fmax`.
(rule -1 (lower (has_type ty @ (multi_lane _ _) (fmax _ lhs rhs)))
      (vec_rrr (VecALUOp.Fmax) lhs rhs (vector_size ty)))
457
458;;;; Rules for `sqrt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
459
;; Scalar floats: one-operand FPU `Sqrt`.
(rule (lower (has_type (ty_scalar_float ty) (sqrt _ src)))
      (fpu_rr (FPUOp1.Sqrt) src (scalar_size ty)))

;; Vectors (lower priority): lane-wise `Fsqrt`.
(rule -1 (lower (has_type ty @ (multi_lane _ _) (sqrt _ src)))
      (vec_misc (VecMisc2.Fsqrt) src (vector_size ty)))
465
466;;;; Rules for `fneg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
467
;; Scalar floats: one-operand FPU `Neg`.
(rule (lower (has_type (ty_scalar_float ty) (fneg _ src)))
      (fpu_rr (FPUOp1.Neg) src (scalar_size ty)))

;; Vectors (lower priority): lane-wise `Fneg`.
(rule -1 (lower (has_type ty @ (multi_lane _ _) (fneg _ src)))
      (vec_misc (VecMisc2.Fneg) src (vector_size ty)))
473
474;;;; Rules for `fabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
475
;; Scalar floats: one-operand FPU `Abs`.
(rule (lower (has_type (ty_scalar_float ty) (fabs _ src)))
      (fpu_rr (FPUOp1.Abs) src (scalar_size ty)))

;; Vectors (lower priority): lane-wise `Fabs`.
(rule -1 (lower (has_type ty @ (multi_lane _ _) (fabs _ src)))
      (vec_misc (VecMisc2.Fabs) src (vector_size ty)))
481
482;;;; Rules for `fpromote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
483
;; Only the `$F64` result is handled here: a 32-to-64 conversion whose size
;; argument is the 32-bit source size.
(rule (lower (has_type $F64 (fpromote _ src)))
      (fpu_rr (FPUOp1.Cvt32To64) src (ScalarSize.Size32)))
486
487;;;; Rules for `fdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
488
;; Only the `$F32` result is handled here: a 64-to-32 conversion whose size
;; argument is the 64-bit source size.
(rule (lower (has_type $F32 (fdemote _ src)))
      (fpu_rr (FPUOp1.Cvt64To32) src (ScalarSize.Size64)))
491
492;;;; Rules for `ceil` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
493
;; Scalars: FPU round with the `Plus` rounding mode of the matching width.
(rule (lower (has_type $F32 (ceil _ src)))
      (fpu_round (FpuRoundMode.Plus32) src))

(rule (lower (has_type $F64 (ceil _ src)))
      (fpu_round (FpuRoundMode.Plus64) src))

;; Vectors (lower priority): lane-wise `Frintp`.
(rule -1 (lower (has_type ty @ (multi_lane _ _) (ceil _ src)))
      (vec_misc (VecMisc2.Frintp) src (vector_size ty)))
502
503;;;; Rules for `floor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
504
;; Scalars: FPU round with the `Minus` rounding mode of the matching width.
(rule (lower (has_type $F32 (floor _ src)))
      (fpu_round (FpuRoundMode.Minus32) src))

(rule (lower (has_type $F64 (floor _ src)))
      (fpu_round (FpuRoundMode.Minus64) src))

;; Vectors (lower priority): lane-wise `Frintm`.
(rule -1 (lower (has_type ty @ (multi_lane _ _) (floor _ src)))
      (vec_misc (VecMisc2.Frintm) src (vector_size ty)))
513
514;;;; Rules for `trunc` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
515
;; Scalars: FPU round with the `Zero` rounding mode of the matching width.
(rule (lower (has_type $F32 (trunc _ src)))
      (fpu_round (FpuRoundMode.Zero32) src))

(rule (lower (has_type $F64 (trunc _ src)))
      (fpu_round (FpuRoundMode.Zero64) src))

;; Vectors (lower priority): lane-wise `Frintz`.
(rule -1 (lower (has_type ty @ (multi_lane _ _) (trunc _ src)))
      (vec_misc (VecMisc2.Frintz) src (vector_size ty)))
524
525;;;; Rules for `nearest` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
526
;; Scalars: FPU round with the `Nearest` rounding mode of the matching width.
(rule (lower (has_type $F32 (nearest _ src)))
      (fpu_round (FpuRoundMode.Nearest32) src))

(rule (lower (has_type $F64 (nearest _ src)))
      (fpu_round (FpuRoundMode.Nearest64) src))

;; Vectors (lower priority): lane-wise `Frintn`.
(rule -1 (lower (has_type ty @ (multi_lane _ _) (nearest _ src)))
      (vec_misc (VecMisc2.Frintn) src (vector_size ty)))
535
536;;;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
537
;; General case: hand all three operands to the `fmadd` helper.
(rule 1 (lower (has_type ty (fma _ a b c)))
        (fmadd ty a b c))

;; Scalar floats with a negated addend: `a * b + (-c)` is `a * b - c`, which
;; is what the `fnmsub` helper computes.
(rule 2 (lower (has_type (ty_scalar_float ty) (fma _ a b (fneg _ c))))
        (fnmsub ty a b c))
540
;; Helper constructors named after, and matching the behavior of, the scalar
;; aarch64 instructions. Since the names are easy to mix up, for operands
;; `(a, b, c)` they compute:
;;
;; * fmadd   r =   a * b  + c   /   multiply, then add
;; * fnmadd  r = -(a * b) - c   /   multiply, then negate, then subtract
;; * fmsub   r = -(a * b) + c   /   multiply, then negate, then add
;; * fnmsub  r =   a * b  - c   /   multiply, then subtract
(decl fmadd  (Type Value Value Value) Reg)
(decl fnmadd (Type Value Value Value) Reg)
(decl fmsub  (Type Value Value Value) Reg)
(decl fnmsub (Type Value Value Value) Reg)
552
;; Fold a negated multiplicand into the instruction choice instead of emitting
;; a separate negation: negating `a * b` turns `fmadd` into `fmsub` and
;; `fnmsub` into `fnmadd`.
(rule 2 (fmadd ty (fneg _ a) b c) (fmsub ty a b c))
(rule 3 (fmadd ty a (fneg _ b) c) (fmsub ty a b c))

(rule 2 (fnmsub ty (fneg _ a) b c) (fnmadd ty a b c))
(rule 3 (fnmsub ty a (fneg _ b) c) (fnmadd ty a b c))
559
;; Scalar cases: each helper maps onto the three-operand FPU op of the same
;; name.
(rule 0 (fmadd (ty_scalar_float ty) a b c)
        (fpu_rrrr (FPUOp3.MAdd) (scalar_size ty) a b c))
(rule 0 (fnmadd (ty_scalar_float ty) a b c)
        (fpu_rrrr (FPUOp3.NMAdd) (scalar_size ty) a b c))
(rule 0 (fmsub (ty_scalar_float ty) a b c)
        (fpu_rrrr (FPUOp3.MSub) (scalar_size ty) a b c))
(rule 0 (fnmsub (ty_scalar_float ty) a b c)
        (fpu_rrrr (FPUOp3.NMSub) (scalar_size ty) a b c))
565
;; Vector cases: only `fmadd` and `fmsub` have vector rules here; they go
;; through `lower_fmla` with `Fmla` and `Fmls` respectively.
(rule 1 (fmadd ty @ (multi_lane _ _) a b c)
        (lower_fmla (VecALUModOp.Fmla) a b c (vector_size ty)))
(rule 1 (fmsub ty @ (multi_lane _ _) a b c)
        (lower_fmla (VecALUModOp.Fmls) a b c (vector_size ty)))
569
;; Lowers a vector fused-multiply-add, trying the various forms of the
;; instruction to get maximal coverage of what AArch64 offers. The operands
;; are `(op a b c size)` where `a` and `b` are the multiplicands and `c` is
;; the addend.
(decl lower_fmla (VecALUModOp Value Value Value VectorSize) Reg)

;; Base case: emit the requested op as-is. Note that `vec_rrr_mod` takes the
;; addend first, followed by the two multiplicands.
(rule (lower_fmla op a b c size)
      (vec_rrr_mod op c a b size))
577
;; Special case: if one of the multiplicands is a splat, the element-based
;; fma can be used instead, with 0 as the element index. The splatted scalar
;; is passed in the element-operand position.
(rule 1 (lower_fmla op (splat _ scalar) b c size)
        (vec_fmla_elem op c b scalar size 0))
(rule 2 (lower_fmla op a (splat _ scalar) c size)
        (vec_fmla_elem op c a scalar size 0))
584
;; Special case: if one of the multiplicands is a shuffle that broadcasts a
;; single element of a vector, the element-based fma can be used just like in
;; the splat case, with the broadcast lane as the element index.
;;
;; In Cranelift a shuffle always has i8x16 inputs and outputs, so a `bitcast`
;; is matched explicitly here: that is the main way a shuffle output ends up
;; feeding this instruction. The shuffle must use the same vector for both
;; inputs and the same lane index in every position, and the index has to be
;; within the lane count of the vector size.

;; 32-bit lanes, four lanes.
(rule 3 (lower_fmla op
                    (bitcast _ _ (shuffle _ vec vec (shuffle32_from_imm lane lane lane lane)))
                    b c size @ (VectorSize.Size32x4))
        (if-let true (u64_lt lane 4))
        (vec_fmla_elem op c b vec size lane))
(rule 4 (lower_fmla op
                    a
                    (bitcast _ _ (shuffle _ vec vec (shuffle32_from_imm lane lane lane lane)))
                    c size @ (VectorSize.Size32x4))
        (if-let true (u64_lt lane 4))
        (vec_fmla_elem op c a vec size lane))

;; 64-bit lanes, two lanes.
(rule 3 (lower_fmla op
                    (bitcast _ _ (shuffle _ vec vec (shuffle64_from_imm lane lane)))
                    b c size @ (VectorSize.Size64x2))
        (if-let true (u64_lt lane 2))
        (vec_fmla_elem op c b vec size lane))
(rule 4 (lower_fmla op
                    a
                    (bitcast _ _ (shuffle _ vec vec (shuffle64_from_imm lane lane)))
                    c size @ (VectorSize.Size64x2))
        (if-let true (u64_lt lane 2))
        (vec_fmla_elem op c a vec size lane))
604
605;;;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
606
;; All types go through the `fcopy_sign` helper, which receives both operands
;; and the result type.
(rule (lower (has_type ty (fcopysign _ magnitude sign)))
      (fcopy_sign magnitude sign ty))
609
610;;;; Rules for `fcvt_to_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
611
;; One rule per (input float type, output width) combination. The boolean
;; argument of `fpu_to_int_cvt` is `false` for these unsigned conversions; the
;; helper also receives the input float type and the output integer type.

;; From `$F32`.
(rule (lower (has_type (fits_in_32 out_ty) (fcvt_to_uint _ src @ (value_type $F32))))
      (fpu_to_int_cvt (FpuToIntOp.F32ToU32) src false $F32 out_ty))

(rule 1 (lower (has_type $I64 (fcvt_to_uint _ src @ (value_type $F32))))
      (fpu_to_int_cvt (FpuToIntOp.F32ToU64) src false $F32 $I64))

;; From `$F64`.
(rule (lower (has_type (fits_in_32 out_ty) (fcvt_to_uint _ src @ (value_type $F64))))
      (fpu_to_int_cvt (FpuToIntOp.F64ToU32) src false $F64 out_ty))

(rule 1 (lower (has_type $I64 (fcvt_to_uint _ src @ (value_type $F64))))
      (fpu_to_int_cvt (FpuToIntOp.F64ToU64) src false $F64 $I64))
623
624;;;; Rules for `fcvt_to_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
625
;; One rule per (input float type, output width) combination. The boolean
;; argument of `fpu_to_int_cvt` is `true` for these signed conversions; the
;; helper also receives the input float type and the output integer type.

;; From `$F32`.
(rule (lower (has_type (fits_in_32 out_ty) (fcvt_to_sint _ src @ (value_type $F32))))
      (fpu_to_int_cvt (FpuToIntOp.F32ToI32) src true $F32 out_ty))

(rule 1 (lower (has_type $I64 (fcvt_to_sint _ src @ (value_type $F32))))
      (fpu_to_int_cvt (FpuToIntOp.F32ToI64) src true $F32 $I64))

;; From `$F64`.
(rule (lower (has_type (fits_in_32 out_ty) (fcvt_to_sint _ src @ (value_type $F64))))
      (fpu_to_int_cvt (FpuToIntOp.F64ToI32) src true $F64 out_ty))

(rule 1 (lower (has_type $I64 (fcvt_to_sint _ src @ (value_type $F64))))
      (fpu_to_int_cvt (FpuToIntOp.F64ToI64) src true $F64 $I64))
637
638;;;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
639
;; Scalar inputs of at most 32 bits are zero-extended to 32 bits and then
;; converted with the 32-bit-source ops.
(rule (lower (has_type $F32 (fcvt_from_uint _ src @ (value_type (fits_in_32 _)))))
      (int_to_fpu (IntToFpuOp.U32ToF32) (put_in_reg_zext32 src)))

(rule (lower (has_type $F64 (fcvt_from_uint _ src @ (value_type (fits_in_32 _)))))
      (int_to_fpu (IntToFpuOp.U32ToF64) (put_in_reg_zext32 src)))

;; 64-bit scalar inputs are converted directly.
(rule 1 (lower (has_type $F32 (fcvt_from_uint _ src @ (value_type $I64))))
      (int_to_fpu (IntToFpuOp.U64ToF32) src))

(rule 1 (lower (has_type $F64 (fcvt_from_uint _ src @ (value_type $I64))))
      (int_to_fpu (IntToFpuOp.U64ToF64) src))

;; Vectors (lower priority): input and output must have the same lane width,
;; 32 or 64 bits; both use the lane-wise `Ucvtf` op.
(rule -1 (lower (has_type ty @ (multi_lane 32 _) (fcvt_from_uint _ src @ (value_type (multi_lane 32 _)))))
      (vec_misc (VecMisc2.Ucvtf) src (vector_size ty)))

(rule -1 (lower (has_type ty @ (multi_lane 64 _) (fcvt_from_uint _ src @ (value_type (multi_lane 64 _)))))
      (vec_misc (VecMisc2.Ucvtf) src (vector_size ty)))
657
658;;;; Rules for `fcvt_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
659
;; Scalar inputs of at most 32 bits are sign-extended to 32 bits and then
;; converted with the 32-bit-source ops.
(rule (lower (has_type $F32 (fcvt_from_sint _ src @ (value_type (fits_in_32 _)))))
      (int_to_fpu (IntToFpuOp.I32ToF32) (put_in_reg_sext32 src)))

(rule (lower (has_type $F64 (fcvt_from_sint _ src @ (value_type (fits_in_32 _)))))
      (int_to_fpu (IntToFpuOp.I32ToF64) (put_in_reg_sext32 src)))

;; 64-bit scalar inputs are converted directly.
(rule 1 (lower (has_type $F32 (fcvt_from_sint _ src @ (value_type $I64))))
      (int_to_fpu (IntToFpuOp.I64ToF32) src))

(rule 1 (lower (has_type $F64 (fcvt_from_sint _ src @ (value_type $I64))))
      (int_to_fpu (IntToFpuOp.I64ToF64) src))

;; Vectors (lower priority): input and output must have the same lane width,
;; 32 or 64 bits; both use the lane-wise `Scvtf` op.
(rule -1 (lower (has_type ty @ (multi_lane 32 _) (fcvt_from_sint _ src @ (value_type (multi_lane 32 _)))))
      (vec_misc (VecMisc2.Scvtf) src (vector_size ty)))

(rule -1 (lower (has_type ty @ (multi_lane 64 _) (fcvt_from_sint _ src @ (value_type (multi_lane 64 _)))))
      (vec_misc (VecMisc2.Scvtf) src (vector_size ty)))
677
678;;;; Rules for `fcvt_to_uint_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
679
;; Vector sources with 32-bit lanes: a lane-wise `Fcvtzu`, sized from the
;; result type.
(rule -1 (lower (has_type vec_ty @ (multi_lane 32 _)
                          (fcvt_to_uint_sat _ src @ (value_type (multi_lane 32 _)))))
      (vec_misc (VecMisc2.Fcvtzu) src (vector_size vec_ty)))

;; Vector sources with 64-bit lanes: same operation.
(rule -1 (lower (has_type vec_ty @ (multi_lane 64 _)
                          (fcvt_to_uint_sat _ src @ (value_type (multi_lane 64 _)))))
      (vec_misc (VecMisc2.Fcvtzu) src (vector_size vec_ty)))

;; `f32` source, destination of at most 32 bits.
(rule (lower (has_type (fits_in_32 dst_ty)
                       (fcvt_to_uint_sat _ src @ (value_type $F32))))
      (fpu_to_int_cvt_sat (FpuToIntOp.F32ToU32) src false dst_ty))

;; `f32` source, `i64` destination.
(rule 1 (lower (has_type $I64
                         (fcvt_to_uint_sat _ src @ (value_type $F32))))
      (fpu_to_int_cvt_sat (FpuToIntOp.F32ToU64) src false $I64))

;; `f64` source, destination of at most 32 bits.
(rule (lower (has_type (fits_in_32 dst_ty)
                       (fcvt_to_uint_sat _ src @ (value_type $F64))))
      (fpu_to_int_cvt_sat (FpuToIntOp.F64ToU32) src false dst_ty))

;; `f64` source, `i64` destination.
(rule 1 (lower (has_type $I64
                         (fcvt_to_uint_sat _ src @ (value_type $F64))))
      (fpu_to_int_cvt_sat (FpuToIntOp.F64ToU64) src false $I64))
697
698;;;; Rules for `fcvt_to_sint_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
699
;; Vector sources with 32-bit lanes: a lane-wise `Fcvtzs`, sized from the
;; result type.
(rule -1 (lower (has_type vec_ty @ (multi_lane 32 _)
                          (fcvt_to_sint_sat _ src @ (value_type (multi_lane 32 _)))))
      (vec_misc (VecMisc2.Fcvtzs) src (vector_size vec_ty)))

;; Vector sources with 64-bit lanes: same operation.
(rule -1 (lower (has_type vec_ty @ (multi_lane 64 _)
                          (fcvt_to_sint_sat _ src @ (value_type (multi_lane 64 _)))))
      (vec_misc (VecMisc2.Fcvtzs) src (vector_size vec_ty)))

;; `f32` source, destination of at most 32 bits.
(rule (lower (has_type (fits_in_32 dst_ty)
                       (fcvt_to_sint_sat _ src @ (value_type $F32))))
      (fpu_to_int_cvt_sat (FpuToIntOp.F32ToI32) src true dst_ty))

;; `f32` source, `i64` destination.
(rule 1 (lower (has_type $I64
                         (fcvt_to_sint_sat _ src @ (value_type $F32))))
      (fpu_to_int_cvt_sat (FpuToIntOp.F32ToI64) src true $I64))

;; `f64` source, destination of at most 32 bits.
(rule (lower (has_type (fits_in_32 dst_ty)
                       (fcvt_to_sint_sat _ src @ (value_type $F64))))
      (fpu_to_int_cvt_sat (FpuToIntOp.F64ToI32) src true dst_ty))

;; `f64` source, `i64` destination.
(rule 1 (lower (has_type $I64
                         (fcvt_to_sint_sat _ src @ (value_type $F64))))
      (fpu_to_int_cvt_sat (FpuToIntOp.F64ToI64) src true $I64))
717
718;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
719
;; `i64` and smaller

;; Fallback: subtract two register operands.
(rule isub_base_case -4 (lower (has_type (ty_int_ref_scalar_64 ty)
                                         (isub _ lhs rhs)))
      (sub ty lhs rhs))

;; The right-hand operand is an immediate that fits in 12 bits.
(rule isub_imm12 0 (lower (has_type (ty_int_ref_scalar_64 ty)
                                    (isub _ lhs (imm12_from_value rhs_imm))))
      (sub_imm ty lhs rhs_imm))

;; As above, but when it is the *negated* immediate that fits in 12 bits the
;; subtraction is emitted as an addition of that negated immediate.
(rule isub_imm12_neg 2 (lower (has_type (ty_int_ref_scalar_64 ty)
                                        (isub _ lhs rhs)))
      (if-let neg_imm (imm12_from_negated_value rhs))
      (add_imm ty lhs neg_imm))

;; The right-hand operand is an extended register; the extension is folded
;; into the subtract.
(rule isub_extend 1 (lower (has_type (ty_int_ref_scalar_64 ty)
                                     (isub _ lhs (extended_value_from_value rhs_ext))))
      (sub_extend ty lhs rhs_ext))

;; The right-hand operand is another register shifted left by a constant; the
;; shift is folded into the subtract when the amount is encodable.
(rule isub_ishl -3 (lower (has_type (ty_int_ref_scalar_64 ty)
                                    (isub _ lhs (ishl _ shifted (iconst _ shamt)))))
      (if-let amt (lshl_from_imm64 ty shamt))
      (sub_shift ty lhs shifted amt))

;; vectors
(rule -2 (lower (has_type vec_ty @ (multi_lane _ _) (isub _ lhs rhs)))
      (sub_vec lhs rhs (vector_size vec_ty)))

;; `i128`
(rule -1 (lower (has_type $I128 (isub _ lhs rhs)))
      (sub_i128 lhs rhs))
755
756;;;; Rules for `uadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
757
;; 128-bit vectors only: lane-wise unsigned saturating add via `uqadd`.
(rule (lower (has_type (ty_vec128 vec_ty) (uadd_sat _ lhs rhs)))
      (uqadd lhs rhs (vector_size vec_ty)))
760
761;;;; Rules for `sadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
762
;; 128-bit vectors only: lane-wise signed saturating add via `sqadd`.
(rule (lower (has_type (ty_vec128 vec_ty) (sadd_sat _ lhs rhs)))
      (sqadd lhs rhs (vector_size vec_ty)))
765
766;;;; Rules for `usub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
767
;; 128-bit vectors only: lane-wise unsigned saturating subtract via `uqsub`.
(rule (lower (has_type (ty_vec128 vec_ty) (usub_sat _ lhs rhs)))
      (uqsub lhs rhs (vector_size vec_ty)))
770
771;;;; Rules for `ssub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
772
;; 128-bit vectors only: lane-wise signed saturating subtract via `sqsub`.
(rule (lower (has_type (ty_vec128 vec_ty) (ssub_sat _ lhs rhs)))
      (sqsub lhs rhs (vector_size vec_ty)))
775
776;;;; Rules for `ineg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
777
;; `i64` and smaller: negate by subtracting the operand from the zero
;; register.
(rule ineg_base_case 1 (lower (has_type (fits_in_64 ty) (ineg _ operand)))
      (sub ty (zero_reg) operand))

;; `i128`: the same idea, subtracting from a zeroed register pair.
(rule 2 (lower (has_type $I128 (ineg _ operand)))
      (sub_i128 (value_regs_zero) operand))

;; vectors.
(rule (lower (has_type (ty_vec128 vec_ty) (ineg _ operand)))
      (neg operand (vector_size vec_ty)))
789
790;;;; Rules for `imul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
791
;; `i64` and smaller: a plain multiply is a multiply-add whose addend is the
;; zero register.
(rule imul_base_case -3 (lower (has_type (ty_int_ref_scalar_64 ty)
                                         (imul _ lhs rhs)))
      (madd ty lhs rhs (zero_reg)))

;; `i128`.
;;
;; 128-bit multiply formula:
;;   dst_lo = lhs_lo * rhs_lo
;;   dst_hi = umulhi(lhs_lo, rhs_lo) + (lhs_lo * rhs_hi) + (lhs_hi * rhs_lo)
;;
;; which maps onto the following instruction sequence:
;;   umulh   dst_hi, lhs_lo, rhs_lo
;;   madd    dst_hi, lhs_lo, rhs_hi, dst_hi
;;   madd    dst_hi, lhs_hi, rhs_lo, dst_hi
;;   madd    dst_lo, lhs_lo, rhs_lo, zero
(rule -1 (lower (has_type $I128 (imul _ lhs rhs)))
      (let
          ;; Split the left operand into its low/high halves.
          ((lhs_regs ValueRegs lhs)
           (lhs_lo Reg (value_regs_get lhs_regs 0))
           (lhs_hi Reg (value_regs_get lhs_regs 1))

           ;; Split the right operand into its low/high halves.
           (rhs_regs ValueRegs rhs)
           (rhs_lo Reg (value_regs_get rhs_regs 0))
           (rhs_hi Reg (value_regs_get rhs_regs 1))

           ;; High half: carry-out of the low product plus both cross terms.
           (hi_carry Reg (umulh $I64 lhs_lo rhs_lo))
           (hi_partial Reg (madd $I64 lhs_lo rhs_hi hi_carry))
           (hi Reg (madd $I64 lhs_hi rhs_lo hi_partial))

           ;; Low half: the truncated product of the low halves.
           (lo Reg (madd $I64 lhs_lo rhs_lo (zero_reg))))
        (value_regs lo hi)))
823
;; When both `i128` operands are zero-extensions (or both sign-extensions) of
;; narrower values, the upper halves carry no independent information. The
;; product then needs only one 64-bit multiply for the low half and a single
;; `umulh`/`smulh` for the high half, with none of the cross-term additions
;; of the general `i128` case.

;; Both operands zero-extended.
(rule (lower (has_type $I128 (imul _ (uextend _ lhs) (uextend _ rhs))))
      (let ((lhs64 Reg (put_in_reg_zext64 lhs))
            (rhs64 Reg (put_in_reg_zext64 rhs)))
        (value_regs
          (madd $I64 lhs64 rhs64 (zero_reg))
          (umulh $I64 lhs64 rhs64))))

;; Both operands sign-extended.
(rule (lower (has_type $I128 (imul _ (sextend _ lhs) (sextend _ rhs))))
      (let ((lhs64 Reg (put_in_reg_sext64 lhs))
            (rhs64 Reg (put_in_reg_sext64 rhs)))
        (value_regs
          (madd $I64 lhs64 rhs64 (zero_reg))
          (smulh $I64 lhs64 rhs64))))
843
;; vectors (i8x8/i8x16/i16x4/i16x8/i32x2/i32x4)
(rule -2 (lower (has_type (lane_fits_in_32 vec_ty @ (multi_lane _ _))
                          (imul _ lhs rhs)))
      (mul lhs rhs (vector_size vec_ty)))

;; Special lowering for i64x2.
;;
;; The I64X2 multiplication is assembled from several 32-bit operations.
;;
;; Write the 64-bit numbers x and y as:
;;   x = a + 2^32(b)
;;   y = c + 2^32(d)
;;
;; Their 64-bit product is:
;;   x * y = ac + 2^32(ad + bc) + 2^64(bd)
;; note: `2^64(bd)` can be ignored, the value is too large to fit in
;; 64 bits.
;;
;; For an I64X2 multiply the registers `rn` and `rm` are viewed as 32-bit
;; components:
;;   rn = |d|c|b|a|
;;   rm = |h|g|f|e|
;;
;;   rn * rm = |cg + 2^32(ch + dg)|ae + 2^32(af + be)|
;;
;;  The emitted sequence is:
;;  rev64 rd.4s, rm.4s
;;  mul rd.4s, rd.4s, rn.4s
;;  xtn tmp1.2s, rn.2d
;;  addp rd.4s, rd.4s, rd.4s
;;  xtn tmp2.2s, rm.2d
;;  shll rd.2d, rd.2s, #32
;;  umlal rd.2d, tmp2.2s, tmp1.2s
(rule -1 (lower (has_type $I64X2 (imul _ lhs rhs)))
      (let ((rn Reg lhs)
            (rm Reg rhs)

            ;; Swap the two 32-bit elements inside each 64-bit word of rm.
            ;;   rd = |g|h|e|f|
            (swapped Reg (rev64 rm (VectorSize.Size32x4)))

            ;; Form the cross products that make up the high half.
            ;;   rd = |dg|ch|be|af|
            ;;
            ;; This 32-bit multiply drops the bits that would overflow,
            ;; exactly as 64-bit operations would; the Shll further down
            ;; shifts those bits out in any case.
            (cross Reg (mul swapped rn (VectorSize.Size32x4)))

            ;; Narrow rn to its low 32-bit components.
            ;;   tmp1 = |c|a|
            (rn_lo Reg (xtn rn (ScalarSize.Size32)))

            ;; Pairwise-add the cross products.
            ;;   rd = |dg+ch|be+af||dg+ch|be+af|
            (cross_sum Reg (addp cross cross (VectorSize.Size32x4)))

            ;; Narrow rm to its low 32-bit components.
            ;;   tmp2 = |g|e|
            (rm_lo Reg (xtn rm (ScalarSize.Size32)))

            ;; Move the summed cross products into the high 32 bits.
            ;;   rd = |dg+ch << 32|be+af << 32|
            (hi_part Reg (shll32 cross_sum false)))

        ;; Multiply the low components and accumulate onto the high part.
        ;;   rd = |rd[1] + cg|rd[0] + ae|
        (umlal32 hi_part rm_lo rn_lo false)))
914
;; Widening multiplies: an `imul` whose two operands are the same kind of
;; widen (`swiden`/`uwiden`, `low`/`high`) of same-typed vectors lowers to a
;; single `smull*`/`umull*`. The trailing boolean is `false` for the `low`
;; forms and `true` for the `high` forms.

;; `i16x8.extmul_low_i8x16_s`.
(rule (lower (has_type $I16X8 (imul _ (swiden_low _ lhs @ (value_type $I8X16))
                                      (swiden_low _ rhs @ (value_type $I8X16)))))
      (smull8 lhs rhs false))

;; `i16x8.extmul_high_i8x16_s`.
(rule (lower (has_type $I16X8 (imul _ (swiden_high _ lhs @ (value_type $I8X16))
                                      (swiden_high _ rhs @ (value_type $I8X16)))))
      (smull8 lhs rhs true))

;; `i16x8.extmul_low_i8x16_u`.
(rule (lower (has_type $I16X8 (imul _ (uwiden_low _ lhs @ (value_type $I8X16))
                                      (uwiden_low _ rhs @ (value_type $I8X16)))))
      (umull8 lhs rhs false))

;; `i16x8.extmul_high_i8x16_u`.
(rule (lower (has_type $I16X8 (imul _ (uwiden_high _ lhs @ (value_type $I8X16))
                                      (uwiden_high _ rhs @ (value_type $I8X16)))))
      (umull8 lhs rhs true))

;; `i32x4.extmul_low_i16x8_s`.
(rule (lower (has_type $I32X4 (imul _ (swiden_low _ lhs @ (value_type $I16X8))
                                      (swiden_low _ rhs @ (value_type $I16X8)))))
      (smull16 lhs rhs false))

;; `i32x4.extmul_high_i16x8_s`.
(rule (lower (has_type $I32X4 (imul _ (swiden_high _ lhs @ (value_type $I16X8))
                                      (swiden_high _ rhs @ (value_type $I16X8)))))
      (smull16 lhs rhs true))

;; `i32x4.extmul_low_i16x8_u`.
(rule (lower (has_type $I32X4 (imul _ (uwiden_low _ lhs @ (value_type $I16X8))
                                      (uwiden_low _ rhs @ (value_type $I16X8)))))
      (umull16 lhs rhs false))

;; `i32x4.extmul_high_i16x8_u`.
(rule (lower (has_type $I32X4 (imul _ (uwiden_high _ lhs @ (value_type $I16X8))
                                      (uwiden_high _ rhs @ (value_type $I16X8)))))
      (umull16 lhs rhs true))

;; `i64x2.extmul_low_i32x4_s`.
(rule (lower (has_type $I64X2 (imul _ (swiden_low _ lhs @ (value_type $I32X4))
                                      (swiden_low _ rhs @ (value_type $I32X4)))))
      (smull32 lhs rhs false))

;; `i64x2.extmul_high_i32x4_s`.
(rule (lower (has_type $I64X2 (imul _ (swiden_high _ lhs @ (value_type $I32X4))
                                      (swiden_high _ rhs @ (value_type $I32X4)))))
      (smull32 lhs rhs true))

;; `i64x2.extmul_low_i32x4_u`.
(rule (lower (has_type $I64X2 (imul _ (uwiden_low _ lhs @ (value_type $I32X4))
                                      (uwiden_low _ rhs @ (value_type $I32X4)))))
      (umull32 lhs rhs false))

;; `i64x2.extmul_high_i32x4_u`.
(rule (lower (has_type $I64X2 (imul _ (uwiden_high _ lhs @ (value_type $I32X4))
                                      (uwiden_high _ rhs @ (value_type $I32X4)))))
      (umull32 lhs rhs true))
986
987;;;; Rules for `smulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
988
;; `i64`: `smulh` yields the high half of the product directly.
(rule 1 (lower (has_type $I64 (smulhi _ lhs rhs)))
      (smulh $I64 lhs rhs))

;; 32 bits and narrower: sign-extend both operands to 64 bits, form the full
;; product with one 64-bit multiply, then arithmetic-shift it right by the
;; width of the type to leave the high half.
(rule (lower (has_type (fits_in_32 ty) (smulhi _ lhs rhs)))
      (let ((lhs64 Reg (put_in_reg_sext64 lhs))
            (rhs64 Reg (put_in_reg_sext64 rhs))
            (product Reg (madd $I64 lhs64 rhs64 (zero_reg))))
        (asr_imm $I64 product (imm_shift_from_u8 (ty_bits ty)))))
998
999;;;; Rules for `umulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1000
;; `i64`: `umulh` yields the high half of the product directly.
(rule 1 (lower (has_type $I64 (umulhi _ lhs rhs)))
      (umulh $I64 lhs rhs))

;; 32 bits and narrower: zero-extend both operands to 64 bits, form the full
;; product with one 64-bit multiply, then logical-shift it right by the width
;; of the type to leave the high half.
(rule (lower (has_type (fits_in_32 ty) (umulhi _ lhs rhs)))
      (let ((lhs64 Reg (put_in_reg_zext64 lhs))
            (rhs64 Reg (put_in_reg_zext64 rhs))
            (product Reg (madd $I64 lhs64 rhs64 (zero_reg))))
        (value_reg (lsr_imm $I64 product (imm_shift_from_u8 (ty_bits ty))))))
1012
1013;;;; Rules for `udiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1014
;; Enum representing the types of extensions.
;;
;; Passed to `put_nonzero_in_reg` to choose how a value is widened: for
;; operands of 32 bits or fewer `Signed` selects `put_in_reg_sext32` and
;; `Unsigned` selects `put_in_reg_zext32`; for nonzero constants they select
;; `ImmExtend.Sign` and `ImmExtend.Zero` respectively.
(type ExtType
      (enum
        (Signed)
        (Unsigned)))
1020
;; Places a `Value` into a `Reg` while guaranteeing it is nonzero.
;; The arguments are the value, the kind of extension to apply, and the type
;; being operated on.
;; TODO: restore spec
;  (spec (put_nonzero_in_reg_sext64 x)
;        (provide (= (sign_ext 64 x) result))
;        (require (not (= #x0000000000000000 result))))
(decl put_nonzero_in_reg (Value ExtType Type) Reg)

;; A constant already known to be nonzero needs no runtime check and is
;; simply materialized as an immediate.

;; Nonzero constant, zero-extended.
(rule (put_nonzero_in_reg (iconst _ (nonzero_u64_from_imm64 k)) (ExtType.Unsigned) ty)
      (imm ty (ImmExtend.Zero) k))

;; Nonzero constant, sign-extended.
(rule (put_nonzero_in_reg (iconst _ (nonzero_u64_from_imm64 k)) (ExtType.Signed) ty)
      (imm ty (ImmExtend.Sign) k))

;; Any other `i64` value: no extension, 64-bit zero-divisor trap check.
(rule -1 (put_nonzero_in_reg divisor _ $I64)
      (trap_if_zero_divisor (put_in_reg divisor) (operand_size $I64)))

;; 32 bits and narrower, signed: sign-extend to 32 bits, then check.
(rule -2 (put_nonzero_in_reg divisor (ExtType.Signed) (fits_in_32 _))
      (trap_if_zero_divisor (put_in_reg_sext32 divisor) (operand_size $I32)))

;; 32 bits and narrower, unsigned: zero-extend to 32 bits, then check.
(rule -2 (put_nonzero_in_reg divisor (ExtType.Unsigned) (fits_in_32 _))
      (trap_if_zero_divisor (put_in_reg_zext32 divisor) (operand_size $I32)))
1048
;; aarch64's `udiv` never traps, so the divide-by-zero trap that CLIF's `udiv`
;; requires is produced explicitly by routing the divisor through
;; `put_nonzero_in_reg`.

;; `i64`.
(rule udiv 1 (lower (has_type $I64 (udiv _ dividend divisor)))
      (a64_udiv $I64
                (put_in_reg dividend)
                (put_nonzero_in_reg divisor (ExtType.Unsigned) $I64)))

;; 32 bits and narrower: the dividend is zero-extended to 32 bits and the
;; division is performed at 32 bits.
(rule udiv (lower (has_type (fits_in_32 ty) (udiv _ dividend divisor)))
      (a64_udiv $I32
                (put_in_reg_zext32 dividend)
                (put_nonzero_in_reg divisor (ExtType.Unsigned) ty)))
1057
1058;;;; Rules for `sdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1059
;; TODO: Add SDiv32 to implement 32-bit directly, rather
;; than extending the input.
;;
;; For 32 or 64 bits the emitted checks should look like this:
;;
;;   cbnz rm, #8
;;   udf ; divide by zero
;;   cmn rm, 1
;;   ccmp rn, 1, #nzcv, eq
;;   b.vc #8
;;   udf ; signed overflow
;;
;; For the narrow 8 or 16 bit case an extra left-shift is needed so that the
;; minimum value can be detected with the 32-bit ccmp instruction.
;;
;; Note: the div instruction traps on neither divide by zero nor overflow, so
;; both checks have to be inserted manually.
;;
;; TODO: if `y` is -1 then a check that `x` is not INT_MIN is all that's
;; necessary, but right now `y` is checked to not be -1 as well.

;; `i64`: zero-divisor check, then overflow check, then the divide.
(rule sdiv_base_case (lower (has_type $I64 (sdiv _ dividend divisor)))
      (let ((num Reg (put_in_reg_sext64 dividend))
            (den Reg (put_nonzero_in_reg divisor (ExtType.Signed) $I64))
            (num_intmin_check Reg (intmin_check $I64 num))
            (checked_num Reg (trap_if_div_overflow $I64 num_intmin_check num den)))
        (a64_sdiv $I64 checked_num den)))

;; 32 bits and narrower: same sequence on the 32-bit sign-extended dividend.
(rule sdiv_base_case -1 (lower (has_type (fits_in_32 ty) (sdiv _ dividend divisor)))
      (let ((num Reg (put_in_reg_sext32 dividend))
            (den Reg (put_nonzero_in_reg divisor (ExtType.Signed) ty))
            (num_intmin_check Reg (intmin_check ty num))
            (checked_num Reg (trap_if_div_overflow ty num_intmin_check num den)))
        (a64_sdiv ty checked_num den)))

;; Dividing by a constant accepted by `safe_divisor_from_imm64` means both
;; checks always pass, so the bare divide is emitted.
(rule sdiv_safe_divisor 2 (lower (has_type $I64 (sdiv _ dividend (iconst _ k))))
      (if-let safe_k (safe_divisor_from_imm64 $I64 k))
      (a64_sdiv $I64
                (put_in_reg_sext64 dividend)
                (imm $I64 (ImmExtend.Sign) safe_k)))

(rule sdiv_safe_divisor 1 (lower (has_type (fits_in_32 ty) (sdiv _ dividend (iconst _ k))))
      (if-let safe_k (safe_divisor_from_imm64 ty k))
      (a64_sdiv ty
                (put_in_reg_sext32 dividend)
                (imm ty (ImmExtend.Sign) safe_k)))
1106
1107;; Helper for placing a `Value` into a `Reg` and validating that it's nonzero.
1108
1109;;;; Rules for `urem` and `srem` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1110
;; Remainder (x % y) is computed as:
;;
;;   tmp = x / y
;;   result = x - (tmp*y)
;;
;; Reusing 'result' for tmp gives:
;;
;;   cbnz y, #8         ; branch over trap
;;   udf                ; divide by zero
;;   div rd, x, y       ; rd = x / y
;;   msub rd, rd, y, x  ; rd = x - rd * y

;; Note: the zero check applies to the divisor `y`. When `y` is a nonzero
;; constant, `put_nonzero_in_reg` materializes it as an immediate with no
;; check at all.

;; Unsigned, `i64`.
(rule urem (lower (has_type $I64 (urem _ dividend divisor)))
      (let ((num Reg (put_in_reg_zext64 dividend))
            (den Reg (put_nonzero_in_reg divisor (ExtType.Unsigned) $I64))
            (quot Reg (a64_udiv $I64 num den)))
        (msub $I64 quot den num)))

;; Unsigned, 32 bits and narrower.
(rule urem -1 (lower (has_type (fits_in_32 ty) (urem _ dividend divisor)))
      (let ((num Reg (put_in_reg_zext32 dividend))
            (den Reg (put_nonzero_in_reg divisor (ExtType.Unsigned) ty))
            (quot Reg (a64_udiv ty num den)))
        (msub ty quot den num)))

;; Signed, `i64`.
(rule srem (lower (has_type $I64 (srem _ dividend divisor)))
      (let ((num Reg (put_in_reg_sext64 dividend))
            (den Reg (put_nonzero_in_reg divisor (ExtType.Signed) $I64))
            (quot Reg (a64_sdiv $I64 num den)))
        (msub $I64 quot den num)))

;; Signed, 32 bits and narrower.
(rule srem -1 (lower (has_type (fits_in_32 ty) (srem _ dividend divisor)))
      (let ((num Reg (put_in_reg_sext32 dividend))
            (den Reg (put_nonzero_in_reg divisor (ExtType.Signed) ty))
            (quot Reg (a64_sdiv ty num den)))
        (msub ty quot den num)))
1152
1153;;; Rules for integer min/max: umin, smin, umax, smax ;;;;;;;;;;;;;;;;;;;;;;;;;
1154
;; `i64` and smaller.

;; Compare the two operands, then select one of them on the given condition:
;;
;;   cmp     $x, $y
;;   csel    .., $x, $y, $cc
 (spec (cmp_and_choose ty cc signed x y)
   (provide
     (= result
        (switch cc
          (#x03 (if (bvule x y) x y))
          (#x08 (if (bvuge x y) x y))
          (#x0b (if (bvsle x y) x y))
          (#x0c (if (bvsge x y) x y)))))
   (require
     (or (= ty 8)
         (= ty 16)
         (= ty 32)
         (= ty 64))
     (or (= cc #x03)
         (= cc #x08)
         (= cc #x0b)
         (= cc #x0c))
     (if signed (or (= cc #x0b) (= cc #x0c))
                (or (= cc #x03) (= cc #x08)))))
(decl cmp_and_choose (Type Cond bool Value Value) ValueRegs)

;; General case: operands are compared as they sit in their registers.
(rule (cmp_and_choose (fits_in_64 ty) cc _ lhs rhs)
      (let ((lhs_reg Reg (put_in_reg lhs))
            (rhs_reg Reg (put_in_reg rhs)))
        (with_flags_reg (cmp (operand_size ty) lhs_reg rhs_reg)
                        (csel cc lhs_reg rhs_reg))))

;; `i16` and `i8`: the comparison operates on (at least) 32 bits, so both
;; operands are first extended to 32 bits, sign- or zero-extending according
;; to the `signed` flag.
(rule 1 (cmp_and_choose (fits_in_16 ty) cc signed lhs rhs)
      (let ((lhs_ext Reg (extend (put_in_reg lhs) signed (ty_bits ty) 32))
            (rhs_ext Reg (extend (put_in_reg rhs) signed (ty_bits ty) 32)))
        (with_flags_reg (cmp (operand_size ty) lhs_ext rhs_ext)
                        (csel cc lhs_ext rhs_ext))))
1192
;; Scalar integer min/max: each maps to `cmp_and_choose` with the condition
;; under which the first operand is selected, plus a flag saying whether the
;; comparison is signed.
(rule umin 2 (lower (has_type (and (fits_in_64 ty) (ty_int _))
                              (umin _ lhs rhs)))
      (cmp_and_choose ty (Cond.Lo) false lhs rhs))

(rule smin 2 (lower (has_type (and (fits_in_64 ty) (ty_int _))
                              (smin _ lhs rhs)))
      (cmp_and_choose ty (Cond.Lt) true lhs rhs))

(rule umax 2 (lower (has_type (and (fits_in_64 ty) (ty_int _))
                              (umax _ lhs rhs)))
      (cmp_and_choose ty (Cond.Hi) false lhs rhs))

(rule smax 2 (lower (has_type (and (fits_in_64 ty) (ty_int _))
                              (smax _ lhs rhs)))
      (cmp_and_choose ty (Cond.Gt) true lhs rhs))
1201
;; Vector types.
;;
;; Every vector type other than `i64x2` uses the matching `VecALUOp` directly.
;; `i64x2` is instead built from a lane-wise compare (`Cmgt` signed, `Cmhi`
;; unsigned) whose result is handed to `bsl` along with both operands; the
;; compare operands are swapped for the `min` forms.

;; smin
(rule (lower (has_type vec_ty @ (not_i64x2) (smin _ lhs rhs)))
      (vec_rrr (VecALUOp.Smin) lhs rhs (vector_size vec_ty)))

(rule 1 (lower (has_type $I64X2 (smin _ lhs rhs)))
      (bsl $I64X2
           (vec_rrr (VecALUOp.Cmgt) rhs lhs (VectorSize.Size64x2))
           lhs
           rhs))

;; umin
(rule (lower (has_type vec_ty @ (not_i64x2) (umin _ lhs rhs)))
      (vec_rrr (VecALUOp.Umin) lhs rhs (vector_size vec_ty)))

(rule 1 (lower (has_type $I64X2 (umin _ lhs rhs)))
      (bsl $I64X2
           (vec_rrr (VecALUOp.Cmhi) rhs lhs (VectorSize.Size64x2))
           lhs
           rhs))

;; smax
(rule (lower (has_type vec_ty @ (not_i64x2) (smax _ lhs rhs)))
      (vec_rrr (VecALUOp.Smax) lhs rhs (vector_size vec_ty)))

(rule 1 (lower (has_type $I64X2 (smax _ lhs rhs)))
      (bsl $I64X2
           (vec_rrr (VecALUOp.Cmgt) lhs rhs (VectorSize.Size64x2))
           lhs
           rhs))

;; umax
(rule (lower (has_type vec_ty @ (not_i64x2) (umax _ lhs rhs)))
      (vec_rrr (VecALUOp.Umax) lhs rhs (vector_size vec_ty)))

(rule 1 (lower (has_type $I64X2 (umax _ lhs rhs)))
      (bsl $I64X2
           (vec_rrr (VecALUOp.Cmhi) lhs rhs (VectorSize.Size64x2))
           lhs
           rhs))
1227
1228;;;; Rules for `uextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1229
;; Fallback for any result that fits in a single register: an explicit
;; zero-extension from the input width to the output width.
(rule uextend -2 (lower (has_type (fits_in_64 out_ty)
                                  (uextend _ src @ (value_type in_ty))))
      (extend src false (ty_bits in_ty) (ty_bits out_ty)))

;; Extracting a vector lane already extends as necessary, so no separate
;; extending instruction is emitted.
(rule 1 (lower (has_type (fits_in_64 out_ty)
                         (uextend _ (extractlane _ vec @ (value_type vec_ty)
                                                 (u8_from_uimm8 lane)))))
      (mov_from_vec (put_in_reg vec) lane (lane_size vec_ty)))

;; Atomic loads also zero their upper bits, so when the load can be sunk the
;; `uextend` is absorbed into it.
(rule 1 (lower (has_type (fits_in_64 out_ty)
                         (uextend _ loaded @ (and (value_type in_ty)
                                                  (atomic_load _ (little_or_native_endian flags) _)))))
      (if-let mem_op (is_sinkable_inst loaded))
      (load_acquire in_ty flags (sink_atomic_load mem_op)))

;; Extending to 128 bits: zero-extend the input into the low register and
;; materialize zero for the high register.
(rule -1 (lower (has_type $I128 (uextend _ src)))
      (value_regs (put_in_reg_zext64 src)
                  (imm $I64 (ImmExtend.Zero) 0)))

;; As with the single-register case, a lane extraction zero-extends on its
;; own, so extending it to i128 only needs a zero constant for the upper bits.
(rule (lower (has_type $I128
                       (uextend _ (extractlane _ vec @ (value_type vec_ty)
                                               (u8_from_uimm8 lane)))))
      (value_regs (mov_from_vec (put_in_reg vec) lane (lane_size vec_ty))
                  (imm $I64 (ImmExtend.Zero) 0)))

;; A zero-extension of a sinkable load is encoded in the load itself.
(rule (lower (has_type (fits_in_64 _)
                       (uextend _ loaded @ (has_type mem_ty (load _ (little_or_native_endian flags) address offset)))))
      (if-let load_inst (is_sinkable_inst loaded))
      (let ((_ Unit (sink_inst load_inst)))
        (aarch64_uload mem_ty (amode mem_ty address offset) flags)))
1266
;; Dispatches on the in-memory type (`i8`, `i16` or `i32`) to the
;; `aarch64_uload*` constructor of the matching width.
(decl aarch64_uload (Type AMode MemFlags) Reg)
(rule (aarch64_uload $I8 addr mem_flags)
      (aarch64_uload8 addr mem_flags))
(rule (aarch64_uload $I16 addr mem_flags)
      (aarch64_uload16 addr mem_flags))
(rule (aarch64_uload $I32 addr mem_flags)
      (aarch64_uload32 addr mem_flags))
1271
1272;;;; Rules for `sextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1273
;; Fallback for any result that fits in a single register: an explicit
;; sign-extension from the input width to the output width.
(rule sextend -4 (lower (has_type (fits_in_64 out_ty)
                                  (sextend _ src @ (value_type in_ty))))
      (extend src true (ty_bits in_ty) (ty_bits out_ty)))

;; A signed lane extraction extends as part of the move, so no separate
;; extending instruction is emitted.
(rule -3 (lower (has_type (fits_in_64 out_ty)
                          (sextend _ (extractlane _ vec @ (value_type vec_ty)
                                                  (u8_from_uimm8 lane)))))
      (mov_from_vec_signed (put_in_reg vec)
                           lane
                           (vector_size vec_ty)
                           (size_from_ty out_ty)))

;; Extending to 128 bits: sign-extend the input to 64 bits for the low half;
;; the high half is the low half arithmetically shifted right by 63, i.e. the
;; sign bit replicated.
(rule -2 (lower (has_type $I128 (sextend _ src)))
      (let ((low Reg (put_in_reg_sext64 src))
            (high Reg (asr_imm $I64 low (imm_shift_from_u8 63))))
        (value_regs low high)))

;; Lane extraction extended to i128: the signed lane move produces the
;; sign-extended low half, and the high half is again that value shifted
;; right arithmetically by 63.
;;
;; Note that `mov_from_vec_signed` doesn't exist for i64x2, so that type is
;; specifically excluded here.
(rule (lower (has_type $I128
                       (sextend _ (extractlane _ vec @ (value_type vec_ty @ (not_i64x2))
                                               (u8_from_uimm8 lane)))))
      (let ((low Reg (mov_from_vec_signed (put_in_reg vec)
                                          lane
                                          (vector_size vec_ty)
                                          (size_from_ty $I64)))
            (high Reg (asr_imm $I64 low (imm_shift_from_u8 63))))
        (value_regs low high)))

;; Lane extraction from i64x2 extended to i128: the lane is already 64 bits
;; wide, so a plain lane move gives the low half.
(rule -1 (lower (has_type $I128
                          (sextend _ (extractlane _ vec @ (value_type $I64X2)
                                                  (u8_from_uimm8 lane)))))
      (let ((low Reg (mov_from_vec (put_in_reg vec)
                                   lane
                                   (ScalarSize.Size64)))
            (high Reg (asr_imm $I64 low (imm_shift_from_u8 63))))
        (value_regs low high)))

;; A sign-extension of a sinkable load is encoded in the load itself.
(rule (lower (has_type (fits_in_64 _)
                       (sextend _ loaded @ (has_type mem_ty (load _ (little_or_native_endian flags) address offset)))))
      (if-let load_inst (is_sinkable_inst loaded))
      (let ((_ Unit (sink_inst load_inst)))
        (aarch64_sload mem_ty (amode mem_ty address offset) flags)))
1325
;; Dispatches on the in-memory type (`i8`, `i16` or `i32`) to the
;; `aarch64_sload*` constructor of the matching width.
(decl aarch64_sload (Type AMode MemFlags) Reg)
(rule (aarch64_sload $I8 addr mem_flags)
      (aarch64_sload8 addr mem_flags))
(rule (aarch64_sload $I16 addr mem_flags)
      (aarch64_sload16 addr mem_flags))
(rule (aarch64_sload $I32 addr mem_flags)
      (aarch64_sload32 addr mem_flags))
1330
1331;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1332
;; Integer base case: `orn` with the zero register as the first operand.
;;
;; That is, bitwise negation is expressed as
;;
;;      NOT rd, rm ==> ORR_NOT rd, zero, rm
(rule bnot_base_case -4 (lower (has_type (fits_in_64 (ty_int ty))
                                         (bnot _ operand)))
      (orr_not ty (zero_reg) operand))

;; Scalar floats use the vector `not`.
(rule -3 (lower (has_type (fits_in_64 (ty_scalar_float ty))
                          (bnot _ operand)))
      (not operand (float_vector_size_in_64 ty)))

;; 64-bit and 128-bit vector types.
(rule -2 (lower (has_type (ty_vec64 vec_ty) (bnot _ operand)))
      (not operand (vector_size vec_ty)))
(rule -1 (lower (has_type (ty_vec128 vec_ty) (bnot _ operand)))
      (not operand (vector_size vec_ty)))

;; `i128`: negate the low and high halves independently.
(rule (lower (has_type $I128 (bnot _ operand)))
      (let ((halves ValueRegs operand)
            (lo Reg (value_regs_get halves 0))
            (hi Reg (value_regs_get halves 1))
            (not_lo Reg (orr_not $I64 (zero_reg) lo))
            (not_hi Reg (orr_not $I64 (zero_reg) hi)))
        (value_regs not_lo not_hi)))

;; `bnot` of a value shifted left by a constant: fold the shift into
;; `orr_not_shift` when the amount is encodable.
(rule bnot_ishl 1 (lower (has_type (fits_in_64 (ty_int ty))
                                   (bnot _ (ishl _ shifted (iconst _ shamt)))))
      (if-let amt (lshl_from_imm64 ty shamt))
      (orr_not_shift ty (zero_reg) shifted amt))
1366
;; Special-cases for fusing a bnot with bxor into a single `EorNot`.
;;
;; The single-register form is restricted to integer types with `ty_int`,
;; matching the other integer-ALU `bnot` rules. A bare `fits_in_64` would let
;; this priority-2 rule win over the lower-priority scalar-float and 64-bit
;; vector `bnot` rules and send those values through the integer ALU path.
;; With the guard in place, non-integer `bnot (bxor ..)` falls through to
;; those rules and the `bxor` is lowered on its own.
(rule 2 (lower (has_type (fits_in_64 (ty_int ty)) (bnot _ (bxor _ x y))))
      (alu_rs_imm_logic (ALUOp.EorNot) ty x y))

;; `i128`: the same fusion applied through the two-register bit-op helper.
(rule 3 (lower (has_type $I128 (bnot _ (bxor _ x y)))) (i128_alu_bitop (ALUOp.EorNot) $I64 x y))
1371
1372;;;; Rules for `band` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1373
;; Integers that fit in one register.
(rule band_fits_in_64 -5 (lower (has_type (fits_in_64 (ty_int ty))
                                          (band _ lhs rhs)))
      (alu_rs_imm_logic_commutative (ALUOp.And) ty lhs rhs))

;; Scalar floats use the vector `and`.
(rule -4 (lower (has_type (fits_in_64 (ty_scalar_float ty))
                          (band _ lhs rhs)))
      (and_vec lhs rhs (float_vector_size_in_64 ty)))

;; 64-bit and 128-bit vector types.
(rule -2 (lower (has_type (ty_vec64 vec_ty) (band _ lhs rhs)))
      (and_vec lhs rhs (vector_size vec_ty)))
(rule -1 (lower (has_type (ty_vec128 vec_ty) (band _ lhs rhs)))
      (and_vec lhs rhs (vector_size vec_ty)))

;; `i128`.
(rule (lower (has_type $I128 (band _ lhs rhs)))
      (i128_alu_bitop (ALUOp.And) $I64 lhs rhs))

;; Specialized lowerings for `(band x (bnot y))`, a shape that also arises
;; from Cranelift's `band_not` instruction, which is legalized into the
;; simpler forms early on. The `bnot` is accepted on either side; in every
;; rule the inverted operand is passed second.
(rule band_not_right 1 (lower (has_type (fits_in_64 (ty_int ty))
                                        (band _ val (bnot _ inverted))))
      (alu_rs_imm_logic (ALUOp.AndNot) ty val inverted))
(rule band_not_left 2 (lower (has_type (fits_in_64 (ty_int ty))
                                       (band _ (bnot _ inverted) val)))
      (alu_rs_imm_logic (ALUOp.AndNot) ty val inverted))

;; `i128`.
(rule 3 (lower (has_type $I128 (band _ val (bnot _ inverted))))
      (i128_alu_bitop (ALUOp.AndNot) $I64 val inverted))
(rule 4 (lower (has_type $I128 (band _ (bnot _ inverted) val)))
      (i128_alu_bitop (ALUOp.AndNot) $I64 val inverted))

;; 64-bit vectors.
(rule 5 (lower (has_type (ty_vec64 vec_ty) (band _ val (bnot _ inverted))))
      (bic_vec val inverted (vector_size vec_ty)))
(rule 6 (lower (has_type (ty_vec64 vec_ty) (band _ (bnot _ inverted) val)))
      (bic_vec val inverted (vector_size vec_ty)))

;; 128-bit vectors.
(rule 7 (lower (has_type (ty_vec128 vec_ty) (band _ val (bnot _ inverted))))
      (bic_vec val inverted (vector_size vec_ty)))
(rule 8 (lower (has_type (ty_vec128 vec_ty) (band _ (bnot _ inverted) val)))
      (bic_vec val inverted (vector_size vec_ty)))
1408
1409;;;; Rules for `bor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1410
;; Integer types of at most 64 bits.
(rule bor_fits_in_64 -4
      (lower (has_type (fits_in_64 (ty_int ty)) (bor _ lhs rhs)))
      (alu_rs_imm_logic_commutative (ALUOp.Orr) ty lhs rhs))

;; Scalar float types of at most 64 bits use the vector `orr`.
(rule -3
      (lower (has_type (fits_in_64 (ty_scalar_float ty)) (bor _ lhs rhs)))
      (orr_vec lhs rhs (float_vector_size_in_64 ty)))

;; 64-bit and 128-bit vector types.
(rule -2
      (lower (has_type (ty_vec64 ty) (bor _ lhs rhs)))
      (orr_vec lhs rhs (vector_size ty)))
(rule -1
      (lower (has_type (ty_vec128 ty) (bor _ lhs rhs)))
      (orr_vec lhs rhs (vector_size ty)))

;; `i128`.
(rule (lower (has_type $I128 (bor _ lhs rhs)))
      (i128_alu_bitop (ALUOp.Orr) $I64 lhs rhs))
1424
;; `(bor x (bnot y))`, with the `bnot` on either side. Cranelift's `bor_not`
;; instruction is legalized into this simpler form early on, so these rules
;; cover it too.

;; Integer types of at most 64 bits.
(rule bor_not_right 1
      (lower (has_type (fits_in_64 (ty_int ty)) (bor _ val (bnot _ inverted))))
      (alu_rs_imm_logic (ALUOp.OrrNot) ty val inverted))
(rule bor_not_left 2
      (lower (has_type (fits_in_64 (ty_int ty)) (bor _ (bnot _ inverted) val)))
      (alu_rs_imm_logic (ALUOp.OrrNot) ty val inverted))

;; `i128`.
(rule 3 (lower (has_type $I128 (bor _ val (bnot _ inverted))))
      (i128_alu_bitop (ALUOp.OrrNot) $I64 val inverted))
(rule 4 (lower (has_type $I128 (bor _ (bnot _ inverted) val)))
      (i128_alu_bitop (ALUOp.OrrNot) $I64 val inverted))

;; 64-bit vectors.
(rule bor_not_right_vec64 5
      (lower (has_type (ty_vec64 ty) (bor _ val (bnot _ inverted))))
      (orn_vec val inverted (vector_size ty)))
(rule bor_not_left_vec64 6
      (lower (has_type (ty_vec64 ty) (bor _ (bnot _ inverted) val)))
      (orn_vec val inverted (vector_size ty)))

;; 128-bit vectors.
(rule bor_not_right_vec128 7
      (lower (has_type (ty_vec128 ty) (bor _ val (bnot _ inverted))))
      (orn_vec val inverted (vector_size ty)))
(rule bor_not_left_vec128 8
      (lower (has_type (ty_vec128 ty) (bor _ (bnot _ inverted) val)))
      (orn_vec val inverted (vector_size ty)))
1445
1446
;; Lowerings that produce the `extr` instruction.
;;
;; `extr` forms the concatenation `a:b` and extracts 32 or 64 bits from it,
;; starting at an immediate bit index. In CLIF this shows up as a `bor` of two
;; values, one shifted left and the other shifted right:
;;
;;     (x << xs) | (y >> ys)   if xs + ys == widthof(ty)   =>   extr(x, y, ys)
;;
;; The immediate passed to `extr` is the right-shift amount. There is one rule
;; per operand order of the `bor`.
;;
;; Both shift amounts are required to be non-zero. If one of them is zero and
;; the two still sum to the type width, the shifts do nothing CLIF-wise and the
;; expression is really a plain `bor`; that edge case is left to the mid-end
;; rather than being lowered to `extr` here.

;; Left shift as the first `bor` operand.
(rule 5 (lower (has_type (ty_32_or_64 ty)
  (bor _ (ishl _ shl_val (u8_from_iconst shl_amt))
         (ushr _ shr_val (u8_from_iconst shr_amt)))))
  (if-let true (u64_eq (ty_bits ty) (u64_wrapping_add shl_amt shr_amt)))
  (if-let true (u64_gt shl_amt 0))
  (if-let true (u64_gt shr_amt 0))
  (a64_extr ty shl_val shr_val (imm_shift_from_u8 shr_amt)))

;; Right shift as the first `bor` operand.
(rule 5 (lower (has_type (ty_32_or_64 ty)
  (bor _ (ushr _ shr_val (u8_from_iconst shr_amt))
         (ishl _ shl_val (u8_from_iconst shl_amt)))))
  (if-let true (u64_eq (ty_bits ty) (u64_wrapping_add shl_amt shr_amt)))
  (if-let true (u64_gt shl_amt 0))
  (if-let true (u64_gt shr_amt 0))
  (a64_extr ty shl_val shr_val (imm_shift_from_u8 shr_amt)))
1475
1476;;;; Rules for `bxor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1477
;; Integer types of at most 64 bits.
(rule bxor_fits_in_64 -4
      (lower (has_type (fits_in_64 (ty_int ty)) (bxor _ lhs rhs)))
      (alu_rs_imm_logic_commutative (ALUOp.Eor) ty lhs rhs))

;; Scalar float types of at most 64 bits use the vector `eor`.
(rule -3
      (lower (has_type (fits_in_64 (ty_scalar_float ty)) (bxor _ lhs rhs)))
      (eor_vec lhs rhs (float_vector_size_in_64 ty)))

;; 64-bit and 128-bit vector types.
(rule -2
      (lower (has_type (ty_vec64 ty) (bxor _ lhs rhs)))
      (eor_vec lhs rhs (vector_size ty)))
(rule -1
      (lower (has_type (ty_vec128 ty) (bxor _ lhs rhs)))
      (eor_vec lhs rhs (vector_size ty)))

;; `i128`.
(rule (lower (has_type $I128 (bxor _ lhs rhs)))
      (i128_alu_bitop (ALUOp.Eor) $I64 lhs rhs))
1491
;; `(bxor x (bnot y))`, with the `bnot` on either side. Cranelift's `bxor_not`
;; instruction is legalized into this simpler form early on, so these rules
;; cover it too.

;; Integer types of at most 64 bits.
(rule bxor_not_right 1
      (lower (has_type (fits_in_64 (ty_int ty)) (bxor _ val (bnot _ inverted))))
      (alu_rs_imm_logic (ALUOp.EorNot) ty val inverted))
(rule bxor_not_left 2
      (lower (has_type (fits_in_64 (ty_int ty)) (bxor _ (bnot _ inverted) val)))
      (alu_rs_imm_logic (ALUOp.EorNot) ty val inverted))

;; `i128`.
(rule 3 (lower (has_type $I128 (bxor _ val (bnot _ inverted))))
      (i128_alu_bitop (ALUOp.EorNot) $I64 val inverted))
(rule 4 (lower (has_type $I128 (bxor _ (bnot _ inverted) val)))
      (i128_alu_bitop (ALUOp.EorNot) $I64 val inverted))
1503
1504;;;; Rules for `ishl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1505
;; i8/i16/i32: delegate to the shared `do_shift` helper.
(rule ishl_fits_in_32 -1
      (lower (has_type (fits_in_32 ty) (ishl _ src amt)))
      (do_shift (ALUOp.Lsl) ty src amt))

;; i64: same helper, fixed to `$I64`.
(rule ishl_64
      (lower (has_type $I64 (ishl _ src amt)))
      (do_shift (ALUOp.Lsl) $I64 src amt))

;; i128: only the first register of the shift amount is used.
(rule (lower (has_type $I128 (ishl _ src amt)))
      (lower_shl128 src (value_regs_get amt 0)))
1517
;; 128-bit left shift of a register pair by the amount in a single register.
;;
;; Emitted sequence:
;;
;;     lsl     lo_shl, in_lo, shift
;;     lsl     hi_shl, in_hi, shift
;;     mvn     not_shift, shift
;;     lsr     lo_carry, in_lo, #1
;;     lsr     lo_carry, lo_carry, not_shift
;;     orr     hi_small, hi_shl, lo_carry
;;     tst     shift, #0x40
;;     csel    dst_hi, lo_shl, hi_small, ne
;;     csel    dst_lo, xzr, lo_shl, ne
(decl lower_shl128 (ValueRegs Reg) ValueRegs)
(rule (lower_shl128 input shift)
      (let ((in_lo Reg (value_regs_get input 0))
            (in_hi Reg (value_regs_get input 1))
            (lo_shl Reg (lsl $I64 in_lo shift))
            (hi_shl Reg (lsl $I64 in_hi shift))
            (not_shift Reg (orr_not $I32 (zero_reg) shift))
            ;; Bits of the low half that move into the high half.
            (lo_carry Reg (lsr $I64 (lsr_imm $I64 in_lo (imm_shift_from_u8 1))
                               not_shift))
            (hi_small Reg (orr $I64 hi_shl lo_carry)))
        ;; Bit 6 of the amount selects between the two result shapes; the
        ;; first `csel` yields the low half and the second the high half.
        (with_flags
         (tst_imm $I64 shift (u64_into_imm_logic $I64 64))
         (consumes_flags_concat
          (csel (Cond.Ne) (zero_reg) lo_shl)
          (csel (Cond.Ne) lo_shl hi_small)))))
1543
;; Vector left shifts.

;; Variable amount: mask the amount with `shift_mask`, duplicate it across
;; the lanes and shift by that vector.
(rule -3 (lower (has_type (ty_vec128 ty) (ishl _ vec amt)))
      (let ((lanes VectorSize (vector_size ty))
            (wrapped_amt Reg (and_imm $I32 amt (shift_mask ty)))
            (splat_amt Reg (vec_dup wrapped_amt lanes)))
        (sshl vec splat_amt lanes)))

;; Constant amount: use the immediate form with the masked constant.
(rule -2 (lower (has_type (ty_vec128 ty)
                          (ishl _ vec (iconst _ (u64_from_imm64 k)))))
      (ushl_vec_imm vec (shift_masked_imm ty k) (vector_size ty)))

;; External helper that takes a type and a `u64` shift constant and returns a
;; `u8` amount.
(decl pure shift_masked_imm (Type u64) u8)
(extern constructor shift_masked_imm shift_masked_imm)
1555
;; Emits a shift with the given opcode and output type: the `Reg` operand is
;; shifted by the `Value` operand.
;;
;; The CLIF semantics of masking the shift amount are applied here wherever
;; they are needed, so callers do not mask the amount themselves.
(spec (do_shift op t a b)
  (provide
    (= result
       (switch op
         ;; Logical right shift.
         ((ALUOp.Lsr)
          (conv_to 64
            (bvlshr (conv_to t a)
                    (conv_to t (zero_ext 64
                      (bvand (conv_to (widthof b) (bvsub (int2bv 64 (widthof b)) #x0000000000000001)) b))))))
         ;; Arithmetic right shift.
         ((ALUOp.Asr)
          (conv_to 64
            (bvashr (conv_to t a)
                    (conv_to t (zero_ext 64
                      (bvand (conv_to (widthof b) (bvsub (int2bv 64 (widthof b)) #x0000000000000001)) b))))))
         ;; Left shift.
         ((ALUOp.Lsl)
          (conv_to 64
            (bvshl (conv_to t a)
                   (conv_to t (zero_ext 64
                     (bvand (conv_to (widthof b) (bvsub (int2bv 64 (widthof b)) #x0000000000000001)) b)))))))))
  (require
    ;; Only the three shift opcodes are covered.
    (or (= op (ALUOp.Lsr)) (= op (ALUOp.Asr)) (= op (ALUOp.Lsl)))
    (= t (widthof b))
    (or (= t 8) (= t 16) (= t 32) (= t 64))
    ;; For 8- and 16-bit right shifts the low 32 bits of `a` must already be
    ;; the zero-extension (Lsr) or sign-extension (Asr) of the narrow value.
    (switch op
      ((ALUOp.Lsr)
       (switch t
         (8 (= (extract 31 0 a) (zero_ext 32 (extract 7 0 a))))
         (16 (= (extract 31 0 a) (zero_ext 32 (extract 15 0 a))))
         (32 true)
         (64 true)))
      ((ALUOp.Asr)
       (switch t
         (8 (= (extract 31 0 a) (sign_ext 32 (extract 7 0 a))))
         (16 (= (extract 31 0 a) (sign_ext 32 (extract 15 0 a))))
         (32 true)
         (64 true)))
      ((ALUOp.Lsl) true))))
(instantiate do_shift
    ((args (bv 8) Int (bv 64) (bv 8)) (ret (bv 64)) (canon (bv 8)))
    ((args (bv 8) Int (bv 64) (bv 16)) (ret (bv 64)) (canon (bv 16)))
    ((args (bv 8) Int (bv 64) (bv 32)) (ret (bv 64)) (canon (bv 32)))
    ((args (bv 8) Int (bv 64) (bv 64)) (ret (bv 64)) (canon (bv 64)))
)
(decl do_shift (ALUOp Type Reg Value) Reg)
1601
;; Base case for 8- and 16-bit shifts.
;;
;; CLIF shifts "wrap" when the amount is at least the type width, so an
;; `i8 << 8` behaves like an `i8 << 0`.
;;
;; For i32 and i64 this is already what the aarch64 shift instructions do. For
;; the narrower types (i16, i8) the wrap has to be made explicit, so the amount
;; is masked with an AND before the 32-bit shift is emitted.
(rule do_shift_fits_in_16 -1 (do_shift op (fits_in_16 ty) src amt_val)
      (let ((raw_amt Reg (value_regs_get amt_val 0))
            (wrapped_amt Reg (and_imm $I32 raw_amt (shift_mask ty))))
        (alu_rrr op $I32 src wrapped_amt)))

;; Mask for a shift amount on type `t`; the spec gives it as `t - 1`.
(spec (shift_mask t)
  (provide
    (= (bvsub (int2bv 64 t) #x0000000000000001) result)))
(decl shift_mask (Type) ImmLogic)
(extern constructor shift_mask shift_mask)
1620
;; Base cases for 32- and 64-bit shifts: the amount register goes to the shift
;; instruction unmasked.
(rule do_shift_32_base_case (do_shift op $I32 src amt)
      (alu_rrr op $I32 src (value_regs_get amt 0)))
(rule do_shift_64_base_case (do_shift op $I64 src amt)
      (alu_rrr op $I64 src (value_regs_get amt 0)))
1624
;; Shift by a constant that can be encoded as an `ImmShift`.
;;
;; This rule deliberately has a higher priority than the other `do_shift`
;; rules so that it is tried first; without it the type-based filters on those
;; rules seem to win over this one.
(rule do_shift_imm 1 (do_shift op ty src (iconst _ k))
      (if-let imm_amt (imm_shift_from_imm64 ty k))
      (alu_rr_imm_shift op ty src imm_amt))
1634
1635;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1636
;; i8/i16/i32: zero-extend the operand to 32 bits, then use `do_shift`.
(rule ushr_fits_in_32 -1
      (lower (has_type (fits_in_32 ty) (ushr _ src amt)))
      (do_shift (ALUOp.Lsr) ty (put_in_reg_zext32 src) amt))

;; i64.
(rule ushr_64
      (lower (has_type $I64 (ushr _ src amt)))
      (do_shift (ALUOp.Lsr) $I64 (put_in_reg_zext64 src) amt))

;; i128: only the first register of the shift amount is used.
(rule (lower (has_type $I128 (ushr _ src amt)))
      (lower_ushr128 src (value_regs_get amt 0)))
1648
;; Vector logical right shifts.
;;
;; A constant shift of width 0 cannot be emitted in immediate form. It is
;; special-cased to return the input unchanged, since a 0-shift does not
;; modify the input anyway.

;; Variable amount: mask it, negate it, duplicate it across the lanes and use
;; `ushl` with that vector.
(rule -4 (lower (has_type (ty_vec128 ty) (ushr _ vec amt)))
      (let ((lanes VectorSize (vector_size ty))
            (wrapped_amt Reg (and_imm $I32 amt (shift_mask ty)))
            (neg_splat Reg (vec_dup (sub $I64 (zero_reg) wrapped_amt) lanes)))
        (ushl vec neg_splat lanes)))

;; Constant amount.
(rule -3 (lower (has_type (ty_vec128 ty)
                          (ushr _ vec (iconst _ (u64_from_imm64 k)))))
      (ushr_vec_imm vec (shift_masked_imm ty k) (vector_size ty)))

;; Constant amount that masks to zero: pass the input through.
(rule -2 (lower (has_type (ty_vec128 ty)
                          (ushr _ vec (iconst _ (u64_from_imm64 k)))))
      (if-let 0 (shift_masked_imm ty k))
      vec)
1664
;; 128-bit logical right shift of a register pair by the amount in a single
;; register.
;;
;; Emitted sequence:
;;
;;     lsr       lo_shr, in_lo, shift
;;     lsr       hi_shr, in_hi, shift
;;     mvn       not_shift, shift
;;     lsl       hi_carry, in_hi, #1
;;     lsl       hi_carry, hi_carry, not_shift
;;     tst       shift, #0x40
;;     orr       lo_small, lo_shr, hi_carry
;;     csel      dst_hi, xzr, hi_shr, ne
;;     csel      dst_lo, hi_shr, lo_small, ne
(decl lower_ushr128 (ValueRegs Reg) ValueRegs)
(rule (lower_ushr128 input shift)
      (let ((in_lo Reg (value_regs_get input 0))
            (in_hi Reg (value_regs_get input 1))
            (lo_shr Reg (lsr $I64 in_lo shift))
            (hi_shr Reg (lsr $I64 in_hi shift))
            (not_shift Reg (orr_not $I32 (zero_reg) shift))
            ;; Bits of the high half that move into the low half.
            (hi_carry Reg (lsl $I64 (lsl_imm $I64 in_hi (imm_shift_from_u8 1))
                               not_shift))
            (lo_small Reg (orr $I64 lo_shr hi_carry)))
        ;; Bit 6 of the amount selects between the two result shapes; the
        ;; first `csel` yields the low half and the second the high half.
        (with_flags
         (tst_imm $I64 shift (u64_into_imm_logic $I64 64))
         (consumes_flags_concat
          (csel (Cond.Ne) hi_shr lo_small)
          (csel (Cond.Ne) (zero_reg) hi_shr)))))
1691
1692;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1693
;; i8/i16/i32: sign-extend the operand to 32 bits, then use `do_shift`.
(rule sshr_fits_in_32 -4
      (lower (has_type (fits_in_32 ty) (sshr _ src amt)))
      (do_shift (ALUOp.Asr) ty (put_in_reg_sext32 src) amt))

;; i64.
(rule sshr_64
      (lower (has_type $I64 (sshr _ src amt)))
      (do_shift (ALUOp.Asr) $I64 (put_in_reg_sext64 src) amt))

;; i128: only the first register of the shift amount is used.
(rule (lower (has_type $I128 (sshr _ src amt)))
      (lower_sshr128 src (value_regs_get amt 0)))
1705
;; Vector arithmetic right shifts.
;;
;; Right shifts are implemented as a left shift by a negative amount. A
;; constant shift of width 0 cannot be emitted in immediate form, so it is
;; special-cased to return the input unchanged, since a 0-shift does not
;; modify the input anyway.

;; Variable amount: mask it, negate it, duplicate it across the lanes and use
;; `sshl` with that vector.
(rule -3 (lower (has_type (ty_vec128 ty) (sshr _ vec amt)))
      (let ((lanes VectorSize (vector_size ty))
            (wrapped_amt Reg (and_imm $I32 amt (shift_mask ty)))
            (neg_splat Reg (vec_dup (sub $I64 (zero_reg) wrapped_amt) lanes)))
        (sshl vec neg_splat lanes)))

;; Constant amount.
(rule -2 (lower (has_type (ty_vec128 ty)
                          (sshr _ vec (iconst _ (u64_from_imm64 k)))))
      (sshr_vec_imm vec (shift_masked_imm ty k) (vector_size ty)))

;; Constant amount that masks to zero: pass the input through.
(rule -1 (lower (has_type (ty_vec128 ty)
                          (sshr _ vec (iconst _ (u64_from_imm64 k)))))
      (if-let 0 (shift_masked_imm ty k))
      vec)
1722
;; 128-bit arithmetic right shift of a register pair by the amount in a single
;; register.
;;
;; Emitted sequence:
;;
;;     lsr       lo_shr, in_lo, shift
;;     asr       hi_shr, in_hi, shift
;;     mvn       not_shift, shift
;;     lsl       hi_carry, in_hi, #1
;;     lsl       hi_carry, hi_carry, not_shift
;;     asr       sign_fill, in_hi, #63
;;     orr       lo_small, lo_shr, hi_carry
;;     tst       shift, #0x40
;;     csel      dst_hi, sign_fill, hi_shr, ne
;;     csel      dst_lo, hi_shr, lo_small, ne
(decl lower_sshr128 (ValueRegs Reg) ValueRegs)
(rule (lower_sshr128 input shift)
      (let ((in_lo Reg (value_regs_get input 0))
            (in_hi Reg (value_regs_get input 1))
            (lo_shr Reg (lsr $I64 in_lo shift))
            (hi_shr Reg (asr $I64 in_hi shift))
            (not_shift Reg (orr_not $I32 (zero_reg) shift))
            ;; Bits of the high half that move into the low half.
            (hi_carry Reg (lsl $I64 (lsl_imm $I64 in_hi (imm_shift_from_u8 1))
                               not_shift))
            ;; High half shifted right arithmetically by 63.
            (sign_fill Reg (asr_imm $I64 in_hi (imm_shift_from_u8 63)))
            (lo_small Reg (orr $I64 lo_shr hi_carry)))
        ;; Bit 6 of the amount selects between the two result shapes; the
        ;; first `csel` yields the low half and the second the high half.
        (with_flags
         (tst_imm $I64 shift (u64_into_imm_logic $I64 64))
         (consumes_flags_concat
          (csel (Cond.Ne) hi_shr lo_small)
          (csel (Cond.Ne) sign_fill hi_shr)))))
1751
1752;;;; Rules for `rotl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1753
;; 8/16-bit rotate-left, general case: negate the amount and rotate the
;; zero-extended value right with `small_rotr`.
(rule rotl_fits_in_16 -2 (lower (has_type (fits_in_16 ty) (rotl _ src amt_val)))
      (let ((amt_reg Reg (value_regs_get amt_val 0))
            (negated Reg (sub $I32 (zero_reg) amt_reg)))
        (small_rotr ty (put_in_reg_zext32 src) negated)))

;; 8/16-bit rotate-left by an immediate: negate the immediate with
;; `negate_imm_shift` and use `small_rotr_imm`.
(rule rotl_fits_in_16_imm -1
      (lower (has_type (fits_in_16 ty) (rotl _ src (iconst _ k))))
      (if-let imm_amt (imm_shift_from_imm64 ty k))
      (small_rotr_imm ty (put_in_reg_zext32 src) (negate_imm_shift ty imm_amt)))
1764
;; aarch64 has no rotate-left instruction. A left rotation by K places is the
;; same as a right rotation by N - K places, where N is the bit size of the
;; integer, and that is how left rotations are implemented here.
;;
;; When the amount is negated, the upper bits of the result are ignored by the
;; rotr instruction, so the value is still rotated left by the desired amount.

;; 32-bit, amount in a register.
(rule rotl_32_base_case (lower (has_type $I32 (rotl _ src amt_val)))
      (let ((amt_reg Reg (value_regs_get amt_val 0))
            (negated Reg (sub $I32 (zero_reg) amt_reg)))
        (a64_rotr $I32 src negated)))

;; 64-bit, amount in a register.
(rule rotl_64_base_case (lower (has_type $I64 (rotl _ src amt_val)))
      (let ((amt_reg Reg (value_regs_get amt_val 0))
            (negated Reg (sub $I64 (zero_reg) amt_reg)))
        (a64_rotr $I64 src negated)))

;; 32-bit, immediate amount.
(rule rotl_32_imm 1 (lower (has_type $I32 (rotl _ src (iconst _ k))))
      (if-let imm_amt (imm_shift_from_imm64 $I32 k))
      (a64_rotr_imm $I32 src (negate_imm_shift $I32 imm_amt)))

;; 64-bit, immediate amount.
(rule rotl_64_imm 1 (lower (has_type $I64 (rotl _ src (iconst _ k))))
      (if-let imm_amt (imm_shift_from_imm64 $I64 k))
      (a64_rotr_imm $I64 src (negate_imm_shift $I64 imm_amt)))
1794
;; External constructor that negates an `ImmShift` for a given type. Its Rust
;; implementation, reproduced here for reference:
;;
;;     fn negate_imm_shift(&mut self, ty: Type, mut imm: ImmShift) -> ImmShift {
;;         let size = u8::try_from(ty.bits()).unwrap();
;;         imm.imm = size.wrapping_sub(imm.value());
;;         imm.imm &= size - 1;
;;         imm
;;     }
(spec (negate_imm_shift ty x)
  (provide
    (= result
       (bvand (bvsub (int2bv 6 ty) x)
              (bvsub (int2bv 6 ty) #b000001)))))
(decl negate_imm_shift (Type ImmShift) ImmShift)
(extern constructor negate_imm_shift negate_imm_shift)
1806
;; 128-bit rotate-left: OR together a 128-bit left shift by `amt` and a
;; 128-bit logical right shift by `128 - amt`.
;;
;; TODO: much better codegen is possible with a constant amount.
(rule (lower (has_type $I128 (rotl _ src amt_val)))
      (let ((input ValueRegs src)
            (amt_reg Reg (value_regs_get amt_val 0))
            (opposite Reg (sub $I64 (imm $I64 (ImmExtend.Zero) 128) amt_reg))
            (shl_part ValueRegs (lower_shl128 input amt_reg))
            (shr_part ValueRegs (lower_ushr128 input opposite)))
        (value_regs
          (orr $I64 (value_regs_get shl_part 0) (value_regs_get shr_part 0))
          (orr $I64 (value_regs_get shl_part 1) (value_regs_get shr_part 1)))))
1819
1820;;;; Rules for `rotr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1821
;; 8/16-bit, amount in a register: rotate the zero-extended value with
;; `small_rotr`.
(rule rotr_fits_in_16 -3 (lower (has_type (fits_in_16 ty) (rotr _ src amt)))
      (small_rotr ty (put_in_reg_zext32 src) (value_regs_get amt 0)))

;; 32-bit, amount in a register.
(rule rotr_32_base_case -1 (lower (has_type $I32 (rotr _ src amt)))
      (a64_rotr $I32 src (value_regs_get amt 0)))

;; 64-bit, amount in a register.
(rule rotr_64_base_case -1 (lower (has_type $I64 (rotr _ src amt)))
      (a64_rotr $I64 src (value_regs_get amt 0)))

;; 8/16-bit, immediate amount.
(rule rotr_fits_in_16_imm -2
      (lower (has_type (fits_in_16 ty) (rotr _ src (iconst _ k))))
      (if-let imm_amt (imm_shift_from_imm64 ty k))
      (small_rotr_imm ty (put_in_reg_zext32 src) imm_amt))

;; 32-bit, immediate amount.
(rule rotr_32_imm (lower (has_type $I32 (rotr _ src (iconst _ k))))
      (if-let imm_amt (imm_shift_from_imm64 $I32 k))
      (a64_rotr_imm $I32 src imm_amt))

;; 64-bit, immediate amount.
(rule rotr_64_imm (lower (has_type $I64 (rotr _ src (iconst _ k))))
      (if-let imm_amt (imm_shift_from_imm64 $I64 k))
      (a64_rotr_imm $I64 src imm_amt))
1848
;; Rotate-right for types narrower than 32 bits, synthesized from shifts:
;;
;;    rotr rd, val, amt
;;
;;       =>
;;
;;    and wrapped_amt, amt, <bitwidth - 1>
;;    sub minus_left, wrapped_amt, <bitwidth>
;;    sub left_amt, zero, minus_left  ; neg
;;    lsr shr_part, val, wrapped_amt
;;    lsl shl_part, val, left_amt
;;    orr rd, shl_part, shr_part
(spec (small_rotr t x y)
  (provide
    (= result
       (switch t
         (8 (conv_to 64 (rotr (extract 7 0 x) (extract 7 0 y))))
         (16 (conv_to 64 (rotr (extract 15 0 x) (extract 15 0 y)))))))
  (require
    (or (= t 8) (= t 16))
    ;; Bits 31 down to the type width of `x` must be zero.
    (switch t
      (8 (= (extract 31 8 x) #x000000))
      (16 (= (extract 31 16 x) #x0000)))))
(instantiate small_rotr
    ((args Int (bv 64) (bv 64)) (ret (bv 64)) (canon (bv 64))))
(decl small_rotr (Type Reg Reg) Reg)
(rule small_rotr (small_rotr ty val amt)
      (let ((wrapped_amt Reg (and_imm $I32 amt (rotr_mask ty)))
            (minus_left Reg (sub_imm $I32 wrapped_amt (u8_into_imm12 (ty_bits ty))))
            (left_amt Reg (sub $I32 (zero_reg) minus_left))
            (shr_part Reg (lsr $I32 val wrapped_amt))
            (shl_part Reg (lsl $I32 val left_amt)))
        (orr $I32 shl_part shr_part)))
1882
;; Mask for a rotate amount on type `x`; the spec gives it as `x - 1`.
(spec (rotr_mask x)
  (provide
    (= (bvsub (int2bv 64 x) #x0000000000000001) result)))
(decl rotr_mask (Type) ImmLogic)
(extern constructor rotr_mask rotr_mask)
1886
;; With a constant amount the narrow rotate-right becomes:
;;
;;    rotr rd, val, #amt
;;
;;       =>
;;
;;    lsr shr_part, val, #<amt>
;;    lsl shl_part, val, <bitwidth - amt>
;;    orr rd, shl_part, shr_part

(spec (small_rotr_imm t x y)
  (provide
    (= result
       (switch t
         (8 (conv_to 64 (rotr (extract 7 0 x) (zero_ext 8 y))))
         (16 (conv_to 64 (rotr (extract 15 0 x) (zero_ext 16 y)))))))
  (require
    (or (= t 8) (= t 16))
    ;; Bits 31 down to the type width of `x` must be zero.
    (switch t
      (8 (= (extract 31 8 x) #x000000))
      (16 (= (extract 31 16 x) #x0000)))
    ;; The immediate must be smaller than the type width.
    (bvult y (int2bv 6 t))))
(instantiate small_rotr_imm
    ((args Int (bv 64) (bv 6)) (ret (bv 64)) (canon (bv 64))))
(decl small_rotr_imm (Type Reg ImmShift) Reg)
(rule small_rotr_imm (small_rotr_imm ty val amt)
      (let ((shr_part Reg (lsr_imm $I32 val amt))
            (shl_part Reg (lsl_imm $I32 val (rotr_opposite_amount ty amt))))
        (orr $I32 shl_part shr_part)))
1916
;; External constructor giving the left-shift amount that pairs with a
;; right-shift immediate in `small_rotr_imm`. Per the spec the result is
;; `ty - (x & (ty - 1))`.
(spec (rotr_opposite_amount ty x)
  (provide
    (= (bvsub (int2bv 6 ty)
              (bvand x (bvsub (int2bv 6 ty) #b000001)))
       result)))
(decl rotr_opposite_amount (Type ImmShift) ImmShift)
(extern constructor rotr_opposite_amount rotr_opposite_amount)
1922
;; 128-bit rotate-right: OR together a 128-bit logical right shift by `amt`
;; and a 128-bit left shift by `128 - amt`.
;;
;; TODO: much better codegen is possible with a constant amount.
(rule (lower (has_type $I128 (rotr _ src amt_val)))
      (let ((input ValueRegs src)
            (amt_reg Reg (value_regs_get amt_val 0))
            (opposite Reg (sub $I64 (imm $I64 (ImmExtend.Zero) 128) amt_reg))
            (shr_part ValueRegs (lower_ushr128 input amt_reg))
            (shl_part ValueRegs (lower_shl128 input opposite))
            (out_hi Reg (orr $I64 (value_regs_get shr_part 1) (value_regs_get shl_part 1)))
            (out_lo Reg (orr $I64 (value_regs_get shr_part 0) (value_regs_get shl_part 0))))
        (value_regs out_lo out_hi)))
1935
1936;;;; Rules for `bitrev` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1937
;; Reversing an 8-bit value with a 32-bit bitrev instruction leaves the
;; reversed result in the highest 8 bits, so it is shifted down into place.
(rule (lower (has_type $I8 (bitrev _ src)))
      (lsr_imm $I32 (rbit $I32 src) (imm_shift_from_u8 24)))

;; Reversing a 16-bit value with a 32-bit bitrev instruction leaves the
;; reversed result in the highest 16 bits, so it is shifted down into place.
(rule (lower (has_type $I16 (bitrev _ src)))
      (lsr_imm $I32 (rbit $I32 src) (imm_shift_from_u8 16)))

;; `i128`: reverse each 64-bit half and swap the halves.
(rule (lower (has_type $I128 (bitrev _ src)))
      (let ((input ValueRegs src)
            (rev_of_lo Reg (rbit $I64 (value_regs_get input 0)))
            (rev_of_hi Reg (rbit $I64 (value_regs_get input 1))))
        (value_regs rev_of_hi rev_of_lo)))

;; Every other type: a single `rbit` at the value's own type.
(rule -1 (lower (has_type ty (bitrev _ src)))
      (rbit ty src))
1958
1959
1960;;;; Rules for `clz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1961
;; i8: count on the zero-extended 32-bit value, then subtract the 24 extra
;; leading bits.
(rule clz_8 (lower (has_type $I8 (clz _ src)))
      (sub_imm $I32 (a64_clz $I32 (put_in_reg_zext32 src)) (u8_into_imm12 24)))

;; i16: same, subtracting the 16 extra leading bits.
(rule clz_16 (lower (has_type $I16 (clz _ src)))
      (sub_imm $I32 (a64_clz $I32 (put_in_reg_zext32 src)) (u8_into_imm12 16)))

;; i128.
(rule (lower (has_type $I128 (clz _ src)))
      (lower_clz128 src))

;; Every other type: a single `clz` at the value's own type.
(rule clz_32_64 -1 (lower (has_type ty (clz _ src)))
      (a64_clz ty src))
1973
;; 128-bit count-leading-zeros over a register pair.
;;
;; Emitted sequence:
;;
;;     clz  clz_hi, hi
;;     clz  clz_lo, lo
;;     lsr  hi_all_zero, clz_hi, #6
;;     madd dst_lo, clz_lo, hi_all_zero, clz_hi
;;     mov  dst_hi, 0
(decl lower_clz128 (ValueRegs) ValueRegs)
(rule (lower_clz128 input)
      (let ((clz_hi Reg (a64_clz $I64 (value_regs_get input 1)))
            (clz_lo Reg (a64_clz $I64 (value_regs_get input 0)))
            ;; `clz_hi >> 6`, used as the multiplier for the low-half count.
            (hi_all_zero Reg (lsr_imm $I64 clz_hi (imm_shift_from_u8 6))))
        (value_regs (madd $I64 clz_lo hi_all_zero clz_hi)
                    (imm $I64 (ImmExtend.Zero) 0))))
1985
1986;;;; Rules for `ctz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1987
;; Every `ctz` is implemented by reversing the bits and then counting leading
;; zeros, since the trailing zeros of a value are the leading zeros of its
;; reversal.

;; i8: after the 32-bit reversal, OR in bit 23 before counting.
(rule ctz_8 (lower (has_type $I8 (ctz _ src)))
      (a64_clz $I32 (orr_imm $I32 (rbit $I32 src) (u64_into_imm_logic $I32 0x800000))))

;; i16: after the 32-bit reversal, OR in bit 15 before counting.
(rule ctz_16 (lower (has_type $I16 (ctz _ src)))
      (a64_clz $I32 (orr_imm $I32 (rbit $I32 src) (u64_into_imm_logic $I32 0x8000))))

;; i128: reverse each half, swap the halves, and reuse `lower_clz128`.
(rule (lower (has_type $I128 (ctz _ src)))
      (let ((input ValueRegs src)
            (rev_of_lo Reg (rbit $I64 (value_regs_get input 0)))
            (rev_of_hi Reg (rbit $I64 (value_regs_get input 1))))
        (lower_clz128 (value_regs rev_of_hi rev_of_lo))))

;; Every other type.
(rule ctz_32_64 -1 (lower (has_type ty (ctz _ src)))
      (a64_clz ty (rbit ty src)))
2006
2007;;;; Rules for `cls` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2008
;; i8: count on the sign-extended 32-bit value, then subtract the 24 extra
;; sign bits.
(rule cls_8 (lower (has_type $I8 (cls _ src)))
      (sub_imm $I32 (a64_cls $I32 (put_in_reg_sext32 src)) (u8_into_imm12 24)))

;; i16: same, subtracting the 16 extra sign bits.
(rule cls_16 (lower (has_type $I16 (cls _ src)))
      (sub_imm $I32 (a64_cls $I32 (put_in_reg_sext32 src)) (u8_into_imm12 16)))

;; i128. Emitted sequence:
;;
;;     cls  cls_lo, lo
;;     cls  cls_hi, hi
;;     eon  same_sign_eon, hi, lo
;;     lsr  same_sign, same_sign_eon, #63
;;     madd lo_extra, cls_lo, same_sign, same_sign
;;     cmp  cls_hi, #63
;;     csel lo_contrib, lo_extra, xzr, eq
;;     add  out_lo, lo_contrib, cls_hi
;;     mov  out_hi, 0
(rule (lower (has_type $I128 (cls _ src)))
      (let ((input ValueRegs src)
            (in_lo Reg (value_regs_get input 0))
            (in_hi Reg (value_regs_get input 1))
            (cls_lo Reg (a64_cls $I64 in_lo))
            (cls_hi Reg (a64_cls $I64 in_hi))
            (same_sign_eon Reg (eon $I64 in_hi in_lo))
            (same_sign Reg (lsr_imm $I64 same_sign_eon (imm_shift_from_u8 63)))
            (lo_extra Reg (madd $I64 cls_lo same_sign same_sign))
            ;; The low-half term only counts when `cls_hi` equals 63.
            (lo_contrib Reg (with_flags_reg
                             (cmp64_imm cls_hi (u8_into_imm12 63))
                             (csel (Cond.Eq) lo_extra (zero_reg)))))
        (value_regs (add $I64 lo_contrib cls_hi) (imm $I64 (ImmExtend.Zero) 0))))

;; Every other type: a single `cls` at the value's own type.
(rule cls_32_64 -1 (lower (has_type ty (cls _ src)))
      (a64_cls ty src))
2040
2041;;;; Rules for `bswap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2042
;; i16.
(rule (lower (has_type $I16 (bswap _ src)))
      (a64_rev16 $I16 src))

;; i32.
(rule (lower (has_type $I32 (bswap _ src)))
      (a64_rev32 $I32 src))

;; i64.
(rule (lower (has_type $I64 (bswap _ src)))
      (a64_rev64 $I64 src))

;; i128: byte-reverse each 64-bit half and swap the halves.
(rule (lower (has_type $I128 (bswap _ src)))
      (value_regs
       (a64_rev64 $I64 (value_regs_get src 1))
       (a64_rev64 $I64 (value_regs_get src 0))))
2056
2057;;;; Rules for `bmask` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2058
;; `bmask` tests the value against zero and uses `csetm` to produce the
;; result. The work is done by `lower_bmask`, which receives both the output
;; type and the type of the input value.
(rule (lower (has_type out_ty (bmask _ src @ (value_type in_ty))))
      (lower_bmask out_ty in_ty src))
2062
2063;;;; Rules for `popcnt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2064
2065;; The implementation of `popcnt` for scalar types is done by moving the value
2066;; into a vector register, using the `cnt` instruction, and then collating the
2067;; result back into a normal register.
2068;;
2069;; The general sequence emitted here is
2070;;
2071;;     fmov tmp, in_lo
2072;;     if ty == i128:
2073;;         mov tmp.d[1], in_hi
2074;;
2075;;     cnt tmp.16b, tmp.16b / cnt tmp.8b, tmp.8b
2076;;     addv tmp, tmp.16b / addv tmp, tmp.8b / addp tmp.8b, tmp.8b, tmp.8b / (no instruction for 8-bit inputs)
2077;;
2078;;     umov out_lo, tmp.b[0]
2079;;     if ty == i128:
2080;;         mov out_hi, 0
2081
;; 8-bit inputs: no reduction step is emitted after `cnt`; byte lane 0 is read
;; back directly.
(rule popcnt_8 (lower (has_type $I8 (popcnt _ val)))
      (let ((vec Reg (mov_to_fpu val (ScalarSize.Size32)))
            (counts Reg (vec_cnt vec (VectorSize.Size8x8))))
        (mov_from_vec counts 0 (ScalarSize.Size8))))

;; 16-bit inputs: the per-byte counts are combined with a pairwise `addp`
;; rather than `addv`, since `addp` is usually the cheaper of the two.
(rule popcnt_16 (lower (has_type $I16 (popcnt _ val)))
      (let ((vec Reg (mov_to_fpu val (ScalarSize.Size32)))
            (counts Reg (vec_cnt vec (VectorSize.Size8x8)))
            (total Reg (addp counts counts (VectorSize.Size8x8))))
        (mov_from_vec total 0 (ScalarSize.Size8))))

;; 32-bit inputs: move into a vector register, count per byte, then reduce the
;; byte lanes with `addv`.
(rule popcnt_32 (lower (has_type $I32 (popcnt _ val)))
      (let ((vec Reg (mov_to_fpu val (ScalarSize.Size32)))
            (counts Reg (vec_cnt vec (VectorSize.Size8x8)))
            (total Reg (addv counts (VectorSize.Size8x8))))
        (mov_from_vec total 0 (ScalarSize.Size8))))

;; 64-bit inputs: same shape as the 32-bit rule, except that the move into the
;; vector register is 64 bits wide.
(rule popcnt_64 (lower (has_type $I64 (popcnt _ val)))
      (let ((vec Reg (mov_to_fpu val (ScalarSize.Size64)))
            (counts Reg (vec_cnt vec (VectorSize.Size8x8)))
            (total Reg (addv counts (VectorSize.Size8x8))))
        (mov_from_vec total 0 (ScalarSize.Size8))))
2105
;; 128-bit inputs: the low register is moved into the vector register first,
;; the high register is then inserted as 64-bit lane 1, and all sixteen byte
;; counts are reduced with `addv`. The upper half of the result is a zero
;; constant.
(rule (lower (has_type $I128 (popcnt _ x)))
      (let ((halves ValueRegs x)
            (lo_only Reg (mov_to_fpu (value_regs_get halves 0) (ScalarSize.Size64)))
            (full Reg (mov_to_vec lo_only (value_regs_get halves 1) 1 (VectorSize.Size64x2)))
            (counts Reg (vec_cnt full (VectorSize.Size8x16)))
            (total Reg (addv counts (VectorSize.Size8x16))))
        (value_regs (mov_from_vec total 0 (ScalarSize.Size8))
                    (imm $I64 (ImmExtend.Zero) 0))))
2113
;; `i8x16` vectors: a single `cnt` over all sixteen byte lanes is the result.
(rule (lower (has_type $I8X16 (popcnt _ vec)))
      (vec_cnt vec (VectorSize.Size8x16)))
2116
2117;;;; Rules for `bitselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2118
;; Scalar form: `(x & c) | (y & ~c)`, built from `and`, `bic` and `orr`.
(rule bitselect (lower (has_type ty (bitselect _ mask if_set if_clear)))
      (if (ty_int_ref_scalar_64 ty))
      (let ((from_set Reg (and_reg ty if_set mask))
            (from_clear Reg (bic ty if_clear mask)))
        (orr ty from_set from_clear)))

;; 128-bit vector form: a single `bsl`.
(rule 1 (lower (has_type (ty_vec128 ty) (bitselect _ mask if_set if_clear)))
        (bsl ty mask if_set if_clear))
2127
2128;;;; Rules for `ireduce` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2129
;; T -> I{64,32,16,8}: no instruction is needed. The high bits of a value held
;; in a register are always undefined, so the narrower result is simply the
;; first (low) register of the source.
(rule (lower (has_type ty (ireduce _ wide)))
    (if (ty_int_ref_scalar_64 ty))
    (value_regs_get wide 0))
2136
2137;;;; Rules for `fcmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2138
;; Vector compare against zero on the right-hand side, "not equal" flavour:
;; compare-equal-to-zero followed by a bitwise NOT.
(rule 4 (lower (has_type ty @ (multi_lane _ _) (fcmp _ (fcmp_zero_cond_not_eq _) lhs rhs)))
      (if (zero_value rhs))
      (let ((operand Reg lhs)
            (size VectorSize (vector_size ty)))
          (value_reg (not (fcmeq0 operand size) size))))

;; Vector compare against zero on the right-hand side, remaining conditions
;; accepted by `fcmp_zero_cond`.
(rule 3 (lower (has_type ty @ (multi_lane _ _) (fcmp _ (fcmp_zero_cond cc) lhs rhs)))
      (if (zero_value rhs))
      (let ((operand Reg lhs)
            (size VectorSize (vector_size ty)))
          (value_reg (float_cmp_zero cc operand size))))

;; Zero on the left-hand side, "not equal" flavour: same sequence as rule 4,
;; applied to the right-hand operand.
(rule 2 (lower (has_type ty @ (multi_lane _ _) (fcmp _ (fcmp_zero_cond_not_eq _) lhs rhs)))
      (if (zero_value lhs))
      (let ((operand Reg rhs)
            (size VectorSize (vector_size ty)))
          (value_reg (not (fcmeq0 operand size) size))))

;; Zero on the left-hand side, remaining conditions: uses the `_swap` helper
;; because the operands are reversed relative to rule 3.
(rule 1 (lower (has_type ty @ (multi_lane _ _) (fcmp _ (fcmp_zero_cond cc) lhs rhs)))
      (if (zero_value lhs))
      (let ((operand Reg rhs)
            (size VectorSize (vector_size ty)))
          (value_reg (float_cmp_zero_swap cc operand size))))
2162
;; Scalar floats: an FPU compare sets the flags, and the boolean result is
;; materialized from the matching condition code.
(rule 0 (lower (has_type _
              (fcmp _ cc lhs @ (value_type (ty_scalar_float operand_ty)) rhs)))
      (with_flags (fpu_cmp (scalar_size operand_ty) lhs rhs)
                  (materialize_bool_result (fp_cond_code cc))))

;; Float vectors with no zero operand: generic vector compare.
(rule -1 (lower (has_type _ (fcmp _ cc lhs @ (value_type operand_ty) rhs)))
      (if (ty_vector_float operand_ty))
      (vec_cmp lhs rhs operand_ty (fp_cond_code cc)))
2171
2172;;;; Rules for `icmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2173
;; Vector compare against zero on the right-hand side, "not equal" flavour:
;; compare-equal-to-zero followed by a bitwise NOT.
(rule 3 (lower (has_type ty @ (multi_lane _ _) (icmp _ (icmp_zero_cond_not_eq _) lhs rhs)))
      (if (zero_value rhs))
      (let ((operand Reg lhs)
            (size VectorSize (vector_size ty)))
          (value_reg (not (cmeq0 operand size) size))))

;; Vector compare against zero on the right-hand side, remaining conditions
;; accepted by `icmp_zero_cond`.
(rule 2 (lower (has_type ty @ (multi_lane _ _) (icmp _ (icmp_zero_cond cc) lhs rhs)))
      (if (zero_value rhs))
      (let ((operand Reg lhs)
            (size VectorSize (vector_size ty)))
          (value_reg (int_cmp_zero cc operand size))))

;; Zero on the left-hand side, "not equal" flavour: same sequence as rule 3,
;; applied to the right-hand operand.
(rule 1 (lower (has_type ty @ (multi_lane _ _) (icmp _ (icmp_zero_cond_not_eq _) lhs rhs)))
      (if (zero_value lhs))
      (let ((operand Reg rhs)
            (size VectorSize (vector_size ty)))
          (value_reg (not (cmeq0 operand size) size))))

;; Zero on the left-hand side, remaining conditions: uses the `_swap` helper
;; because the operands are reversed relative to rule 2.
(rule 0 (lower (has_type ty @ (multi_lane _ _) (icmp _ (icmp_zero_cond cc) lhs rhs)))
      (if (zero_value lhs))
      (let ((operand Reg rhs)
            (size VectorSize (vector_size ty)))
          (value_reg (int_cmp_zero_swap cc operand size))))
2197
;; Catch-all `icmp`: defer to `lower_icmp_into_reg`, passing the operand type
;; and `$I8` as the output type.
(rule icmp_8_16_32_64 -1 (lower (icmp _ cc lhs @ (value_type operand_ty) rhs))
      (lower_icmp_into_reg cc lhs rhs operand_ty $I8))
2200
2201;;;; Rules for `trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2202
;; An unconditional trap is a `udf` carrying the trap code.
(rule (lower (trap code))
      (side_effect (udf code)))
2205
2206;;;;;  Rules for `trapz`;;;;;;;;;
2207
;; Trap when the tested value is zero.
(rule (lower (trapz tested code))
  (trap_if_val (ZeroCond.Zero) tested code))
2210
2211;;;;;  Rules for `trapnz`;;;;;;;;;
2212
;; Trap when the tested value is non-zero.
(rule (lower (trapnz tested code))
  (trap_if_val (ZeroCond.NonZero) tested code))
2215
2216;;;; Rules for `select` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2217
;; `select` whose condition is an `icmp` (optionally zero-extended): lower the
;; comparison straight into flags and select on the resulting condition code,
;; without materializing the boolean.
(rule (lower (has_type ty
                       (select _
                               (maybe_uextend (icmp _ cc lhs @ (value_type cmp_ty) rhs))
                               if_true
                               if_false)))
      (let ((cmp_flags FlagsAndCC (lower_icmp_into_flags cc lhs rhs cmp_ty)))
        (lower_select (flags_and_cc_flags cmp_flags)
                      (cond_code (flags_and_cc_cc cmp_flags))
                      ty
                      if_true
                      if_false)))
2230
;; `select` whose condition is an `fcmp` (optionally zero-extended): an FPU
;; compare supplies the flags, and the select uses the translated float
;; condition code.
(rule (lower (has_type ty
                       (select _
                               (maybe_uextend (fcmp _ cc lhs @ (value_type cmp_ty) rhs))
                               if_true
                               if_false)))
      (let ((select_cond Cond (fp_cond_code cc)))
        (lower_select (fpu_cmp (scalar_size cmp_ty) lhs rhs)
                      select_cond
                      ty
                      if_true
                      if_false)))
2239
;; `i8` condition: test the low eight bits with `tst #255` and select on
;; "not equal".
(rule -1 (lower (has_type ty (select _ c @ (value_type $I8) if_true if_false)))
      (let ((c_reg Reg c))
        (lower_select
          (tst_imm $I32 c_reg (u64_into_imm_logic $I32 255))
          (Cond.Ne) ty if_true if_false)))

;; Condition of at most 32 bits: zero-extend to 32 bits and compare with the
;; zero register.
(rule -2 (lower (has_type ty (select _ c @ (value_type (fits_in_32 _)) if_true if_false)))
      (let ((c_reg Reg (put_in_reg_zext32 c)))
        (lower_select
          (cmp (OperandSize.Size32) c_reg (zero_reg))
          (Cond.Ne) ty if_true if_false)))

;; Condition of at most 64 bits: zero-extend to 64 bits and compare with the
;; zero register.
(rule -3 (lower (has_type ty (select _ c @ (value_type (fits_in_64 _)) if_true if_false)))
      (let ((c_reg Reg (put_in_reg_zext64 c)))
        (lower_select
          (cmp (OperandSize.Size64) c_reg (zero_reg))
          (Cond.Ne) ty if_true if_false)))

;; `i128` condition: OR the two halves together and compare the combined
;; value with the zero register.
(rule -4 (lower (has_type ty (select _ c @ (value_type $I128) if_true if_false)))
      (let ((halves ValueRegs (put_in_regs c))
            (lo Reg (value_regs_get halves 0))
            (hi Reg (value_regs_get halves 1))
            (combined Reg (orr $I64 lo hi)))
        (lower_select
          (cmp (OperandSize.Size64) combined (zero_reg))
          (Cond.Ne) ty if_true if_false)))
2266
2267;;;; Rules for `select_spectre_guard` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2268
;; Passes the selected value through unchanged, emitting a `csdb` side effect
;; first when the `use_csdb` predicate yields true.
(decl maybe_csdb_after_select (ValueRegs) ValueRegs)

;; `use_csdb` is true: emit the barrier, then return the value.
(rule (maybe_csdb_after_select selected)
  (if-let true (use_csdb))
  (let ((_ InstOutput (side_effect (csdb))))
    selected))

;; `use_csdb` is false: nothing is emitted.
(rule (maybe_csdb_after_select selected)
  (if-let false (use_csdb))
  selected)
2276
;; Spectre guard whose condition is an `icmp` (optionally zero-extended): the
;; comparison is lowered into flags, the select is emitted, and the selected
;; value is then routed through `maybe_csdb_after_select`.
(rule (lower (has_type ty
                       (select_spectre_guard _
                                             (maybe_uextend (icmp _ cc lhs @ (value_type cmp_ty) rhs))
                                             if_true
                                             if_false)))
      (let ((cmp_flags FlagsAndCC (lower_icmp_into_flags cc lhs rhs cmp_ty))
            (selected ValueRegs (lower_select
                                 (flags_and_cc_flags cmp_flags)
                                 (cond_code (flags_and_cc_cc cmp_flags))
                                 ty
                                 if_true
                                 if_false)))
        (maybe_csdb_after_select selected)))
2289
;; Fallback for an arbitrary condition of at most 64 bits: zero-extend it,
;; compare with the zero register and select on "not equal".
;;
;; Like the `icmp` rule for `select_spectre_guard`, the selected value is
;; passed through `maybe_csdb_after_select`, so that the `csdb` barrier follows
;; the conditional select whenever `use_csdb` is enabled. Returning the
;; `lower_select` result directly would silently omit the barrier on this path.
(rule -1 (lower (has_type ty (select_spectre_guard _ rcond @ (value_type (fits_in_64 _)) rn rm)))
      (let ((cond_reg Reg (put_in_reg_zext64 rcond))
            (dst ValueRegs (lower_select
                            (cmp (OperandSize.Size64) cond_reg (zero_reg))
                            (Cond.Ne) ty rn rm)))
       (maybe_csdb_after_select dst)))

;; Fallback for an `i128` condition: OR the two halves together, compare the
;; combined value with the zero register and select on "not equal". The
;; selected value also goes through `maybe_csdb_after_select`, for the same
;; reason as above.
(rule -2 (lower (has_type ty (select_spectre_guard _ rcond @ (value_type $I128) rn rm)))
      (let ((c ValueRegs (put_in_regs rcond))
            (c_lo Reg (value_regs_get c 0))
            (c_hi Reg (value_regs_get c 1))
            (rt Reg (orr $I64 c_lo c_hi))
            (dst ValueRegs (lower_select
                            (cmp (OperandSize.Size64) rt (zero_reg))
                            (Cond.Ne) ty rn rm)))
        (maybe_csdb_after_select dst)))
2304
2305;;;; Rules for `vconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2306
;; 128-bit vector constants go through the 128-bit constant helper.
(rule (lower (has_type (ty_vec128 _) (vconst _ (u128_from_constant bits))))
      (constant_f128 bits))

;; 64-bit vector constants go through the 64-bit constant helper.
(rule 1 (lower (has_type ty (vconst _ (u64_from_constant bits))))
      (if (ty_vec64 ty))
      (constant_f64 bits))
2313
2314;;;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2315
;; Scalar integer operand: duplicate it into every lane.
(rule -1 (lower (has_type ty (splat _ scalar @ (value_type scalar_ty))))
      (if (ty_int_ref_scalar_64 scalar_ty))
      (vec_dup scalar (vector_size ty)))

;; Scalar float operand: duplicate lane 0 of the FPU register.
(rule -2 (lower (has_type ty (splat _ scalar @ (value_type (ty_scalar_float _)))))
      (vec_dup_from_fpu scalar (vector_size ty) 0))

;; Constant operands (`f32const`, `f64const`, `iconst`) are splatted directly
;; from their bit pattern.
(rule (lower (has_type ty (splat _ (f32const _ (u32_from_ieee32 bits)))))
      (splat_const bits (vector_size ty)))

(rule (lower (has_type ty (splat _ (f64const _ (u64_from_ieee64 bits)))))
      (splat_const bits (vector_size ty)))

(rule (lower (has_type ty (splat _ (iconst _ (u64_from_imm64 bits)))))
      (splat_const bits (vector_size ty)))

;; Operand produced by a sinkable load: sink the load and use `ld1r`, which
;; loads one lane-sized element and replicates it.
(rule (lower (has_type ty (splat _ loaded @ (load _ flags _ _))))
      (if-let sunk (is_sinkable_inst loaded))
      (let ((base Reg (sink_load_into_addr (lane_type ty) sunk)))
        (ld1r base (vector_size ty) flags)))
2336
2337;;;; Rules for `AtomicLoad` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Atomic loads of a valid atomic type, with little- or native-endian flags,
;; become a load-acquire.
(rule (lower (has_type (valid_atomic_transaction ty)
                       (atomic_load _ (little_or_native_endian flags) mem)))
      (load_acquire ty flags mem))
2340
2341
2342;;;; Rules for `AtomicStore` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Atomic stores of a valid atomic type, with little- or native-endian flags,
;; become a store-release.
(rule (lower (atomic_store (little_or_native_endian flags)
                           data @ (value_type (valid_atomic_transaction ty))
                           mem))
      (side_effect (store_release ty flags data mem)))
2347
2348;;;; Rules for `AtomicRMW` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2349
;; LSE variants. These rules have priority 1, so when `use_lse` matches they
;; win over the loop-based rules for the same operation.

;; Operations that map one-to-one onto an LSE op.
(rule 1 (lower (and (use_lse)
                    (has_type (valid_atomic_transaction ty)
                              (atomic_rmw _ (little_or_native_endian flags)
                                          (AtomicRmwOp.Add) mem operand))))
        (lse_atomic_rmw (AtomicRMWOp.Add) mem operand ty flags))
(rule 1 (lower (and (use_lse)
                    (has_type (valid_atomic_transaction ty)
                              (atomic_rmw _ (little_or_native_endian flags)
                                          (AtomicRmwOp.Xor) mem operand))))
        (lse_atomic_rmw (AtomicRMWOp.Eor) mem operand ty flags))
(rule 1 (lower (and (use_lse)
                    (has_type (valid_atomic_transaction ty)
                              (atomic_rmw _ (little_or_native_endian flags)
                                          (AtomicRmwOp.Or) mem operand))))
        (lse_atomic_rmw (AtomicRMWOp.Set) mem operand ty flags))
(rule 1 (lower (and (use_lse)
                    (has_type (valid_atomic_transaction ty)
                              (atomic_rmw _ (little_or_native_endian flags)
                                          (AtomicRmwOp.Smax) mem operand))))
        (lse_atomic_rmw (AtomicRMWOp.Smax) mem operand ty flags))
(rule 1 (lower (and (use_lse)
                    (has_type (valid_atomic_transaction ty)
                              (atomic_rmw _ (little_or_native_endian flags)
                                          (AtomicRmwOp.Smin) mem operand))))
        (lse_atomic_rmw (AtomicRMWOp.Smin) mem operand ty flags))
(rule 1 (lower (and (use_lse)
                    (has_type (valid_atomic_transaction ty)
                              (atomic_rmw _ (little_or_native_endian flags)
                                          (AtomicRmwOp.Umax) mem operand))))
        (lse_atomic_rmw (AtomicRMWOp.Umax) mem operand ty flags))
(rule 1 (lower (and (use_lse)
                    (has_type (valid_atomic_transaction ty)
                              (atomic_rmw _ (little_or_native_endian flags)
                                          (AtomicRmwOp.Umin) mem operand))))
        (lse_atomic_rmw (AtomicRMWOp.Umin) mem operand ty flags))

;; Subtraction is an atomic add of the negated operand (`0 - operand`).
(rule 1 (lower (and (use_lse)
                    (has_type (valid_atomic_transaction ty)
                              (atomic_rmw _ (little_or_native_endian flags)
                                          (AtomicRmwOp.Sub) mem operand))))
        (lse_atomic_rmw (AtomicRMWOp.Add) mem (sub ty (zero_reg) operand) ty flags))

;; AND is an atomic bit-clear (`Clr`) of the operand after it has been passed
;; through `eon` with the zero register.
(rule 1 (lower (and (use_lse)
                    (has_type (valid_atomic_transaction ty)
                              (atomic_rmw _ (little_or_native_endian flags)
                                          (AtomicRmwOp.And) mem operand))))
        (lse_atomic_rmw (AtomicRMWOp.Clr) mem (eon ty operand (zero_reg)) ty flags))
2386
2387
;; Loop-based variants at default priority. Each CLIF operation maps to the
;; `AtomicRMWLoopOp` of the corresponding name (`Or` -> `Orr`, `Xor` -> `Eor`).
(rule (lower (has_type (valid_atomic_transaction ty)
                       (atomic_rmw _ (little_or_native_endian flags)
                                   (AtomicRmwOp.Add) mem operand)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Add) mem operand ty flags))
(rule (lower (has_type (valid_atomic_transaction ty)
                       (atomic_rmw _ (little_or_native_endian flags)
                                   (AtomicRmwOp.Sub) mem operand)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Sub) mem operand ty flags))
(rule (lower (has_type (valid_atomic_transaction ty)
                       (atomic_rmw _ (little_or_native_endian flags)
                                   (AtomicRmwOp.And) mem operand)))
      (atomic_rmw_loop (AtomicRMWLoopOp.And) mem operand ty flags))
(rule (lower (has_type (valid_atomic_transaction ty)
                       (atomic_rmw _ (little_or_native_endian flags)
                                   (AtomicRmwOp.Nand) mem operand)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Nand) mem operand ty flags))
(rule (lower (has_type (valid_atomic_transaction ty)
                       (atomic_rmw _ (little_or_native_endian flags)
                                   (AtomicRmwOp.Or) mem operand)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Orr) mem operand ty flags))
(rule (lower (has_type (valid_atomic_transaction ty)
                       (atomic_rmw _ (little_or_native_endian flags)
                                   (AtomicRmwOp.Xor) mem operand)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Eor) mem operand ty flags))
(rule (lower (has_type (valid_atomic_transaction ty)
                       (atomic_rmw _ (little_or_native_endian flags)
                                   (AtomicRmwOp.Smin) mem operand)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Smin) mem operand ty flags))
(rule (lower (has_type (valid_atomic_transaction ty)
                       (atomic_rmw _ (little_or_native_endian flags)
                                   (AtomicRmwOp.Smax) mem operand)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Smax) mem operand ty flags))
(rule (lower (has_type (valid_atomic_transaction ty)
                       (atomic_rmw _ (little_or_native_endian flags)
                                   (AtomicRmwOp.Umin) mem operand)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Umin) mem operand ty flags))
(rule (lower (has_type (valid_atomic_transaction ty)
                       (atomic_rmw _ (little_or_native_endian flags)
                                   (AtomicRmwOp.Umax) mem operand)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Umax) mem operand ty flags))
(rule (lower (has_type (valid_atomic_transaction ty)
                       (atomic_rmw _ (little_or_native_endian flags)
                                   (AtomicRmwOp.Xchg) mem operand)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Xchg) mem operand ty flags))
2421
2422;;;; Rules for `AtomicCAS` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; With LSE matched by `use_lse`, use the LSE compare-and-swap helper.
(rule 1 (lower (and (use_lse)
                    (has_type (valid_atomic_transaction ty)
                              (atomic_cas _ (little_or_native_endian flags)
                                          mem expected replacement))))
        (lse_atomic_cas mem expected replacement ty flags))

;; Otherwise fall back to the loop-based compare-and-swap helper.
(rule (lower (has_type (valid_atomic_transaction ty)
                       (atomic_cas _ (little_or_native_endian flags)
                                   mem expected replacement)))
      (atomic_cas_loop mem expected replacement ty flags))
2431
;;;; Rules for `fvdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; A single `fcvtn`, passed the 32-bit scalar size.
(rule (lower (fvdemote _ wide))
      (fcvtn wide (ScalarSize.Size32)))
2435
2436
2437;;;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; 128-bit result whose second operand is zero: only the first operand needs
;; narrowing.
(rule 1 (lower (has_type (ty_vec128_int ty) (snarrow _ lo hi)))
      (if (zero_value hi))
      (sqxtn lo (lane_size ty)))

;; 64-bit result: combine the two operands into one register with a 64-bit
;; lane move (destination lane 1, source lane 0), then narrow once.
(rule 2 (lower (has_type (ty_vec64_int ty) (snarrow _ lo hi)))
      (let ((joined Reg (mov_vec_elem lo hi 1 0 (VectorSize.Size64x2))))
        (sqxtn joined (lane_size ty))))

;; General 128-bit result: narrow the first operand, then feed that result
;; and the second operand to `sqxtn2`.
(rule 0 (lower (has_type (ty_vec128_int ty) (snarrow _ lo hi)))
      (let ((narrow_lo Reg (sqxtn lo (lane_size ty))))
        (sqxtn2 narrow_lo hi (lane_size ty))))
2450
2451
2452;;;; Rules for `unarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; 128-bit result whose second operand is zero: only the first operand needs
;; narrowing.
(rule 1 (lower (has_type (ty_vec128_int ty) (unarrow _ lo hi)))
      (if (zero_value hi))
      (sqxtun lo (lane_size ty)))

;; 64-bit result: combine the two operands into one register with a 64-bit
;; lane move (destination lane 1, source lane 0), then narrow once.
(rule 2 (lower (has_type (ty_vec64_int ty) (unarrow _ lo hi)))
      (let ((joined Reg (mov_vec_elem lo hi 1 0 (VectorSize.Size64x2))))
        (sqxtun joined (lane_size ty))))

;; General 128-bit result: narrow the first operand, then feed that result
;; and the second operand to `sqxtun2`.
(rule 0 (lower (has_type (ty_vec128_int ty) (unarrow _ lo hi)))
      (let ((narrow_lo Reg (sqxtun lo (lane_size ty))))
        (sqxtun2 narrow_lo hi (lane_size ty))))
2465
2466
2467;;;; Rules for `uunarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2468
;; 128-bit result whose second operand is zero: only the first operand needs
;; narrowing.
(rule 1 (lower (has_type (ty_vec128_int ty) (uunarrow _ lo hi)))
      (if (zero_value hi))
      (uqxtn lo (lane_size ty)))

;; 64-bit result: combine the two operands into one register with a 64-bit
;; lane move (destination lane 1, source lane 0), then narrow once.
(rule 2 (lower (has_type (ty_vec64_int ty) (uunarrow _ lo hi)))
      (let ((joined Reg (mov_vec_elem lo hi 1 0 (VectorSize.Size64x2))))
        (uqxtn joined (lane_size ty))))

;; General 128-bit result: narrow the first operand, then feed that result
;; and the second operand to `uqxtn2`.
(rule 0 (lower (has_type (ty_vec128_int ty) (uunarrow _ lo hi)))
      (let ((narrow_lo Reg (uqxtn lo (lane_size ty))))
        (uqxtn2 narrow_lo hi (lane_size ty))))
2481
2482;;;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2483
;; Signed widening of the low half: `sxtl` with the "high half" flag cleared.
(rule (lower (has_type ty (swiden_low _ vec)))
      (vec_extend (VecExtendOp.Sxtl) vec false (lane_size ty)))
2486
2487;;;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2488
;; 128-bit result: `sxtl` with the "high half" flag set.
(rule 1 (lower (has_type (ty_vec128 ty) (swiden_high _ vec)))
      (vec_extend (VecExtendOp.Sxtl) vec true (lane_size ty)))

;; 64-bit result: first move 32-bit lane 1 out of the input, then extend
;; that with the "high half" flag cleared.
(rule (lower (has_type ty (swiden_high _ vec)))
      (if (ty_vec64 ty))
      (let ((upper Reg (fpu_move_from_vec vec 1 (VectorSize.Size32x2))))
        (vec_extend (VecExtendOp.Sxtl) upper false (lane_size ty))))
2496
2497;;;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2498
;; Unsigned widening of the low half: `uxtl` with the "high half" flag cleared.
(rule (lower (has_type ty (uwiden_low _ vec)))
      (vec_extend (VecExtendOp.Uxtl) vec false (lane_size ty)))
2501
2502;;;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2503
;; 128-bit result: `uxtl` with the "high half" flag set.
(rule 1 (lower (has_type (ty_vec128 ty) (uwiden_high _ vec)))
      (vec_extend (VecExtendOp.Uxtl) vec true (lane_size ty)))

;; 64-bit result: first move 32-bit lane 1 out of the input, then extend
;; that with the "high half" flag cleared.
(rule (lower (has_type ty (uwiden_high _ vec)))
      (if (ty_vec64 ty))
      (let ((upper Reg (fpu_move_from_vec vec 1 (VectorSize.Size32x2))))
        (vec_extend (VecExtendOp.Uxtl) upper false (lane_size ty))))
2511
2512;;;; Rules for `Fence` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2513
;; A CLIF `fence` is emitted as the backend's fence side effect.
(rule (lower (fence))
      (side_effect
       (aarch64_fence)))
2516
2517;;;; Rules for `Debugtrap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2518
;; A CLIF `debugtrap` is emitted as a `brk` side effect.
(rule (lower (debugtrap))
      (side_effect
       (brk)))
2521
2522;;;; Rules for `func_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2523
;; The address of a function is loaded from its external name, with offset 0
;; and the distance recorded on the function reference.
(rule (lower (func_addr _ (func_ref_data _ callee distance _)))
      (load_ext_name (box_external_name callee) 0 distance))
2526
2527;;;; Rules for `symbol_value` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2528
;; The value of a symbol is loaded from its external name, using the offset
;; and distance recorded in the symbol data.
(rule (lower (symbol_value _ (symbol_value_data symbol distance addend)))
      (load_ext_name (box_external_name symbol) addend distance))
2531
2532;;; Rules for `get_{frame,stack}_pointer` and `get_return_address` ;;;;;;;;;;;;;
2533
;; Frame pointer.
(rule (lower (get_frame_pointer _))
      (aarch64_fp))

;; Stack pointer.
(rule (lower (get_stack_pointer _))
      (aarch64_sp))

;; Return address, obtained through `aarch64_link`.
(rule (lower (get_return_address _))
      (aarch64_link))
2542
2543;; Rules for `call` and `call_indirect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2544
;; Direct call to a callee whose relocation distance is `Near`: emitted as a
;; direct call. The `patchable` flag from the function reference is forwarded
;; into the call info, and no try-call information is attached.
(rule 1 (lower (call (func_ref_data sig_ref name (RelocDistance.Near) patchable) args))
      (let ((results ValueRegsVec (gen_call_output sig_ref))
            (callee_sig Sig (abi_sig sig_ref))
            (arg_list CallArgList (gen_call_args callee_sig args))
            (ret_list CallRetList (gen_call_rets callee_sig results))
            (call_info BoxCallInfo
                       (gen_call_info callee_sig name arg_list ret_list (try_call_none) patchable))
            (_ Unit (emit_side_effect (call_impl call_info))))
        results))
2554
;; Direct call to a callee at any other distance (non-patchable only): the
;; callee address is first loaded from its external name, and the call is then
;; made indirectly through that register.
(rule (lower (call (func_ref_data sig_ref name dist false) args))
      (let ((results ValueRegsVec (gen_call_output sig_ref))
            (callee_sig Sig (abi_sig sig_ref))
            (arg_list CallArgList (gen_call_args callee_sig args))
            (ret_list CallRetList (gen_call_rets callee_sig results))
            (callee Reg (load_ext_name name 0 dist))
            (call_info BoxCallIndInfo
                       (gen_call_ind_info callee_sig callee arg_list ret_list (try_call_none)))
            (_ Unit (emit_side_effect (call_ind_impl call_info))))
        results))
2565
;; Indirect call: the callee pointer is placed in a register before the
;; argument and return lists are generated.
(rule (lower (call_indirect sig_ref ptr args))
      (let ((results ValueRegsVec (gen_call_output sig_ref))
            (callee_sig Sig (abi_sig sig_ref))
            (callee Reg (put_in_reg ptr))
            (arg_list CallArgList (gen_call_args callee_sig args))
            (ret_list CallRetList (gen_call_rets callee_sig results))
            (call_info BoxCallIndInfo
                       (gen_call_ind_info callee_sig callee arg_list ret_list (try_call_none)))
            (_ Unit (emit_side_effect (call_ind_impl call_info))))
        results))
2576
2577;;;; Rules for `try_call` and `try_call_indirect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2578
;; Direct `try_call` to a callee whose relocation distance is `Near`. The
;; exception table and the branch targets are folded into the try-call info
;; that accompanies the call.
(rule 1 (lower_branch (try_call (func_ref_data sig_ref name (RelocDistance.Near) patchable) args et) targets)
      (let ((callee_sig Sig (abi_sig sig_ref))
            (try_info OptionTryCallInfo (try_call_info et targets))
            (arg_list CallArgList (gen_call_args callee_sig args))
            (ret_list CallRetList (gen_try_call_rets callee_sig))
            (call_info BoxCallInfo
                       (gen_call_info callee_sig name arg_list ret_list try_info patchable)))
        (emit_side_effect (call_impl call_info))))
2587
;; Direct `try_call` to a callee at any other distance (non-patchable only):
;; load the callee address from its external name and call through the
;; register.
(rule (lower_branch (try_call (func_ref_data sig_ref name dist false) args et) targets)
      (let ((callee_sig Sig (abi_sig sig_ref))
            (try_info OptionTryCallInfo (try_call_info et targets))
            (arg_list CallArgList (gen_call_args callee_sig args))
            (ret_list CallRetList (gen_try_call_rets callee_sig))
            (callee Reg (load_ext_name name 0 dist))
            (call_info BoxCallIndInfo
                       (gen_call_ind_info callee_sig callee arg_list ret_list try_info)))
        (emit_side_effect (call_ind_impl call_info))))
2597
;; Indirect `try_call`: the signature is recovered from the exception table
;; via `exception_sig`, and the callee pointer is placed in a register before
;; the argument and return lists are generated.
(rule (lower_branch (try_call_indirect ptr args et) targets)
      (if-let (exception_sig sig_ref) et)
      (let ((callee_sig Sig (abi_sig sig_ref))
            (try_info OptionTryCallInfo (try_call_info et targets))
            (callee Reg (put_in_reg ptr))
            (arg_list CallArgList (gen_call_args callee_sig args))
            (ret_list CallRetList (gen_try_call_rets callee_sig))
            (call_info BoxCallIndInfo
                       (gen_call_ind_info callee_sig callee arg_list ret_list try_info)))
        (emit_side_effect (call_ind_impl call_info))))
2608
2609;;;; Rules for `return` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2610
;; Note: this rule only lowers the returned values; the `Ret` instruction
;; itself is generated by the ABI code.
(rule (lower (return rets))
      (lower_return rets))
2614
2615;;;; Rules for `return_call` and `return_call_indirect` ;;;;;;;;;;;;;;;;;;;;;;;;
2616
;; Direct tail call to a non-patchable callee whose relocation distance is
;; `Near`.
(rule 1 (lower (return_call (func_ref_data sig_ref name (RelocDistance.Near) false) args))
      (let ((callee_sig Sig (abi_sig sig_ref))
            (arg_list CallArgList (gen_return_call_args callee_sig args))
            (call_info BoxReturnCallInfo (gen_return_call_info callee_sig name arg_list)))
        (side_effect (return_call_impl call_info))))
2623
;; Direct tail call to a non-patchable callee at any other distance: load the
;; callee address from its external name and tail-call through the register.
(rule (lower (return_call (func_ref_data sig_ref name dist false) args))
      (let ((callee_sig Sig (abi_sig sig_ref))
            (arg_list CallArgList (gen_return_call_args callee_sig args))
            (callee Reg (load_ext_name name 0 dist))
            (call_info BoxReturnCallIndInfo
                       (gen_return_call_ind_info callee_sig callee arg_list)))
        (side_effect (return_call_ind_impl call_info))))
2631
;; Indirect tail call: the callee pointer is placed in a register before the
;; argument list is generated.
(rule (lower (return_call_indirect sig_ref ptr args))
      (let ((callee_sig Sig (abi_sig sig_ref))
            (callee Reg (put_in_reg ptr))
            (arg_list CallArgList (gen_return_call_args callee_sig args))
            (call_info BoxReturnCallIndInfo
                       (gen_return_call_ind_info callee_sig callee arg_list)))
        (side_effect (return_call_ind_impl call_info))))
2639
2640;;;; Rules for loads ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2641
;; Integer loads of 8 to 64 bits use the unsigned load of the matching width,
;; with an addressing mode computed for that type.
(rule load_i8_aarch64_uload8
      (lower (has_type $I8 (load _ (little_or_native_endian flags) base off)))
      (aarch64_uload8 (amode $I8 base off) flags))
(rule load_i16_aarch64_uload16
      (lower (has_type $I16 (load _ (little_or_native_endian flags) base off)))
      (aarch64_uload16 (amode $I16 base off) flags))
(rule load_i32_aarch64_uload32
      (lower (has_type $I32 (load _ (little_or_native_endian flags) base off)))
      (aarch64_uload32 (amode $I32 base off) flags))
(rule load_i64_aarch64_uload64
      (lower (has_type $I64 (load _ (little_or_native_endian flags) base off)))
      (aarch64_uload64 (amode $I64 base off) flags))

;; `i128` loads use a 64-bit pair load with a pair addressing mode.
(rule (lower (has_type $I128 (load _ (little_or_native_endian flags) base off)))
      (aarch64_loadp64 (pair_amode base off) flags))
;; Float and fixed-size vector loads, selected by the total width of the type
;; (16, 32, 64 or 128 bits).
(rule -1 (lower (has_type (ty_float_or_vec (ty_16 _))
                          (load _ (little_or_native_endian flags) base off)))
      (aarch64_fpuload16 (amode $F16 base off) flags))
(rule -2 (lower (has_type (ty_float_or_vec (ty_32 _))
                          (load _ (little_or_native_endian flags) base off)))
      (aarch64_fpuload32 (amode $F32 base off) flags))
(rule -3 (lower (has_type (ty_float_or_vec (ty_64 _))
                          (load _ (little_or_native_endian flags) base off)))
      (aarch64_fpuload64 (amode $F64 base off) flags))
(rule -4 (lower (has_type (ty_float_or_vec (ty_128 _))
                          (load _ (little_or_native_endian flags) base off)))
      (aarch64_fpuload128 (amode $F128 base off) flags))

;; Dynamic vector types matched by `ty_dyn_vec64` / `ty_dyn_vec128`.
(rule -5 (lower (has_type (ty_dyn_vec64 _)
                          (load _ (little_or_native_endian flags) base off)))
      (aarch64_fpuload64 (amode $F64 base off) flags))
(rule -6 (lower (has_type (ty_dyn_vec128 _)
                          (load _ (little_or_native_endian flags) base off)))
      (aarch64_fpuload128 (amode $I8X16 base off) flags))
2677
;; Extending scalar loads: each opcode maps to the unsigned (`u`) or signed
;; (`s`) load of the same width.
(rule (lower (uload8 _ (little_or_native_endian flags) base off))
      (aarch64_uload8 (amode $I8 base off) flags))
(rule (lower (sload8 _ (little_or_native_endian flags) base off))
      (aarch64_sload8 (amode $I8 base off) flags))
(rule (lower (uload16 _ (little_or_native_endian flags) base off))
      (aarch64_uload16 (amode $I16 base off) flags))
(rule (lower (sload16 _ (little_or_native_endian flags) base off))
      (aarch64_sload16 (amode $I16 base off) flags))
(rule (lower (uload32 _ (little_or_native_endian flags) base off))
      (aarch64_uload32 (amode $I32 base off) flags))
(rule (lower (sload32 _ (little_or_native_endian flags) base off))
      (aarch64_sload32 (amode $I32 base off) flags))
2696
;; Widening vector loads: load 64 bits into an FPU register, then extend the
;; low half with `sxtl` (signed) or `uxtl` (unsigned). The scalar size passed
;; to `vec_extend` is 16, 32 or 64 bits for the 8x8, 16x4 and 32x2 forms.
(rule (lower (sload8x8 _ (little_or_native_endian flags) base off))
      (vec_extend (VecExtendOp.Sxtl)
                  (aarch64_fpuload64 (amode $F64 base off) flags)
                  false
                  (ScalarSize.Size16)))
(rule (lower (uload8x8 _ (little_or_native_endian flags) base off))
      (vec_extend (VecExtendOp.Uxtl)
                  (aarch64_fpuload64 (amode $F64 base off) flags)
                  false
                  (ScalarSize.Size16)))
(rule (lower (sload16x4 _ (little_or_native_endian flags) base off))
      (vec_extend (VecExtendOp.Sxtl)
                  (aarch64_fpuload64 (amode $F64 base off) flags)
                  false
                  (ScalarSize.Size32)))
(rule (lower (uload16x4 _ (little_or_native_endian flags) base off))
      (vec_extend (VecExtendOp.Uxtl)
                  (aarch64_fpuload64 (amode $F64 base off) flags)
                  false
                  (ScalarSize.Size32)))
(rule (lower (sload32x2 _ (little_or_native_endian flags) base off))
      (vec_extend (VecExtendOp.Sxtl)
                  (aarch64_fpuload64 (amode $F64 base off) flags)
                  false
                  (ScalarSize.Size64)))
(rule (lower (uload32x2 _ (little_or_native_endian flags) base off))
      (vec_extend (VecExtendOp.Uxtl)
                  (aarch64_fpuload64 (amode $F64 base off) flags)
                  false
                  (ScalarSize.Size64)))
2733
2734;;;; Rules for stores ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2735
;; Integer stores of 8 to 64 bits use the store of the matching width, with an
;; addressing mode computed for that type.
(rule store_i8_aarch64_store8
      (lower (store (little_or_native_endian flags) data @ (value_type $I8) base off))
      (side_effect (aarch64_store8 (amode $I8 base off) flags data)))
(rule store_i16_aarch64_store16
      (lower (store (little_or_native_endian flags) data @ (value_type $I16) base off))
      (side_effect (aarch64_store16 (amode $I16 base off) flags data)))
(rule store_i32_aarch64_store32
      (lower (store (little_or_native_endian flags) data @ (value_type $I32) base off))
      (side_effect (aarch64_store32 (amode $I32 base off) flags data)))
(rule store_i64_aarch64_store64
      (lower (store (little_or_native_endian flags) data @ (value_type $I64) base off))
      (side_effect (aarch64_store64 (amode $I64 base off) flags data)))
2752
;; `istore8` / `istore16` / `istore32` map to the store of the named width;
;; the type of the stored value is not matched here.
(rule (lower (istore8 (little_or_native_endian flags) data base off))
      (side_effect (aarch64_store8 (amode $I8 base off) flags data)))
(rule (lower (istore16 (little_or_native_endian flags) data base off))
      (side_effect (aarch64_store16 (amode $I16 base off) flags data)))
(rule (lower (istore32 (little_or_native_endian flags) data base off))
      (side_effect (aarch64_store32 (amode $I32 base off) flags data)))
2765
;; `store` of an `$I128` value: a single `aarch64_storep64` through a pair
;; address mode. Register 0 of the value is passed first, register 1 second.
(rule (lower (store (little_or_native_endian mflags)
                    wide @ (value_type $I128)
                    base
                    disp))
      (side_effect
       (aarch64_storep64 (pair_amode base disp)
                         mflags
                         (value_regs_get wide 0)
                         (value_regs_get wide 1))))
2772
;; `store` of a scalar-float or vector value, selected by the width of the
;; value's type (16, 32, 64 or 128 bits). Each width has its own
;; `aarch64_fpustore*` constructor. The priorities are negative, so these
;; rules rank below the unprioritized `store` rules.

;; 16-bit.
(rule -1 (lower (store (little_or_native_endian mflags)
                       data @ (value_type (ty_float_or_vec (ty_16 _)))
                       base
                       disp))
      (side_effect (aarch64_fpustore16 (amode $F16 base disp) mflags data)))
;; 32-bit.
(rule -2 (lower (store (little_or_native_endian mflags)
                       data @ (value_type (ty_float_or_vec (ty_32 _)))
                       base
                       disp))
      (side_effect (aarch64_fpustore32 (amode $F32 base disp) mflags data)))
;; 64-bit.
(rule -3 (lower (store (little_or_native_endian mflags)
                       data @ (value_type (ty_float_or_vec (ty_64 _)))
                       base
                       disp))
      (side_effect (aarch64_fpustore64 (amode $F64 base disp) mflags data)))
;; 128-bit.
(rule -4 (lower (store (little_or_native_endian mflags)
                       data @ (value_type (ty_float_or_vec (ty_128 _)))
                       base
                       disp))
      (side_effect (aarch64_fpustore128 (amode $F128 base disp) mflags data)))
2789
;; `store` of a dynamic vector value. Types matched by `ty_dyn_vec64` use
;; `aarch64_fpustore64`; types matched by `ty_dyn_vec128` use
;; `aarch64_fpustore128`. These carry the lowest priorities of the `store`
;; rules in this section.
(rule -5 (lower (store (little_or_native_endian mflags)
                       data @ (value_type (ty_dyn_vec64 _))
                       base
                       disp))
      (side_effect (aarch64_fpustore64 (amode $F64 base disp) mflags data)))
(rule -6 (lower (store (little_or_native_endian mflags)
                       data @ (value_type (ty_dyn_vec128 _))
                       base
                       disp))
      (side_effect (aarch64_fpustore128 (amode $I8X16 base disp) mflags data)))
2798
2799;;; Rules for `{get,set}_pinned_reg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2800
;; `get_pinned_reg`: the result is a move out of the physical register named
;; by `preg_pinned`.
(rule (lower (get_pinned_reg _))
      (mov_from_preg
       (preg_pinned)))
2803
;; `set_pinned_reg`: side-effecting write of the operand via
;; `write_pinned_reg`; the rule produces no result value.
(rule (lower (set_pinned_reg new_val))
      (side_effect
       (write_pinned_reg new_val)))
2806
2807;;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2808
;; SIMD&FP <=> SIMD&FP: the source value is returned unchanged.
(rule 7 (lower (has_type (ty_float_or_vec _)
                         (bitcast _ _ src @ (value_type (ty_float_or_vec _)))))
      src)

;; I128 => SIMD&FP: `mov_to_fpu` moves register 0 of the source (64-bit scalar
;; size); `mov_to_vec` then inserts register 1 at lane 1 of a 64x2 vector.
(rule 6 (lower (has_type (ty_float_or_vec _)
                         (bitcast _ _ src @ (value_type $I128))))
      (mov_to_vec (mov_to_fpu (value_regs_get src 0) (ScalarSize.Size64))
                  (value_regs_get src 1)
                  1
                  (VectorSize.Size64x2)))

;; SIMD&FP => I128: lanes 0 and 1 are moved out as 64-bit scalars and combined
;; into a register pair, lane 0 first.
(rule 5 (lower (has_type $I128
                         (bitcast _ _ src @ (value_type (ty_float_or_vec _)))))
      (value_regs (mov_from_vec src 0 (ScalarSize.Size64))
                  (mov_from_vec src 1 (ScalarSize.Size64))))

;; GPR => SIMD&FP: applies when the source type satisfies
;; `ty_int_ref_scalar_64`; the move uses the source type's scalar size.
(rule 4 (lower (has_type (ty_float_or_vec _)
                         (bitcast _ _ src @ (value_type src_ty))))
      (if (ty_int_ref_scalar_64 src_ty))
      (mov_to_fpu src (scalar_size src_ty)))

;; SIMD&FP => GPR: applies when the source fits in 64 bits and the result type
;; satisfies `ty_int_ref_scalar_64`; lane 0 is moved out at the result type's
;; scalar size.
(rule 3 (lower (has_type dst_ty
                         (bitcast _ _ src @ (value_type (fits_in_64 (ty_float_or_vec _))))))
      (if (ty_int_ref_scalar_64 dst_ty))
      (mov_from_vec src 0 (scalar_size dst_ty)))

;; GPR <=> GPR: both types satisfy `ty_int_ref_scalar_64`; the value is
;; returned unchanged.
(rule 1 (lower (has_type dst_ty (bitcast _ _ src @ (value_type src_ty))))
      (if (ty_int_ref_scalar_64 dst_ty))
      (if (ty_int_ref_scalar_64 src_ty))
      src)
;; I128 <=> I128: the value is returned unchanged.
(rule 0 (lower (has_type $I128 (bitcast _ _ src @ (value_type $I128))))
      src)
2837
2838;;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2839
;; A scalar-float `extractlane` of lane 0 returns the vector value itself: the
;; upper bits are undefined when a narrower type lives in a wider register,
;; so no instruction is needed.
(rule 2 (lower (has_type (ty_scalar_float _)
                         (extractlane _ vec (u8_from_uimm8 0))))
      vec)

;; Integer result: move the selected lane out of the vector at the scalar size
;; of the result type.
(rule 0 (lower (has_type (ty_int int_ty)
                         (extractlane _ vec (u8_from_uimm8 idx))))
      (mov_from_vec vec idx (scalar_size int_ty)))

;; Scalar-float result from any other lane: `fpu_move_from_vec`, sized by the
;; type of the source vector.
(rule 1 (lower (has_type (ty_scalar_float _)
                         (extractlane _ vec @ (value_type vec_ty)
                                      (u8_from_uimm8 idx))))
      (fpu_move_from_vec vec idx (vector_size vec_ty)))
2854
2855;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2856
;; Integer element: `mov_to_vec` writes `elem` into lane `idx` of `dst`, sized
;; by the type of the destination vector.
(rule 1 (lower (insertlane _ dst @ (value_type dst_ty)
                           elem @ (value_type (ty_int _))
                           (u8_from_uimm8 idx)))
      (mov_to_vec dst elem idx (vector_size dst_ty)))

;; Scalar-float element: `mov_vec_elem` is given `dst`, `elem`, the
;; destination lane `idx` and the constant 0.
;; NOTE(review): the 0 is presumably the source lane within `elem` -- confirm
;; against the `mov_vec_elem` declaration.
(rule (lower (insertlane _ dst @ (value_type dst_ty)
                         elem @ (value_type (ty_scalar_float _))
                         (u8_from_uimm8 idx)))
      (mov_vec_elem dst elem idx 0 (vector_size dst_ty)))
2866
2867;;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2868
;; `stack_addr`: delegated entirely to `compute_stack_addr` with the slot and
;; offset taken from the instruction.
(rule (lower (stack_addr _ slot off))
      (compute_stack_addr slot off))
2871
2872;;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2873
2874;; All three sequences use one integer temporary and two vector
2875;; temporaries.  The shift is done early so as to give the register
2876;; allocator the possibility of using the same reg for `tmp_v1` and
2877;; `src_v` in the case that this is the last use of `src_v`.  See
2878;; https://github.com/WebAssembly/simd/pull/201 for the background and
2879;; derivation of these sequences. Alternative sequences are discussed
2880;; in https://github.com/bytecodealliance/wasmtime/issues/2296,
2881;; although they are not used here.
2882
;; `vhigh_bits` on `$I8X16`: gather the MSB of each of the 16 byte lanes into a
;; 16-bit result.
(rule (lower (vhigh_bits _ src @ (value_type $I8X16)))
      (let (
            ;; Arithmetic right shift by 7 smears each byte lane's MSB over
            ;; the entire lane.
            (msb_fill Reg (sshr_vec_imm src 7 (VectorSize.Size8x16)))
            ;; Mask with `0x80402010_08040201_80402010_08040201` so every
            ;; lane keeps only the bit matching its position within its
            ;; group of 8 lanes.
            (weighted Reg (and_vec msb_fill
                                   (constant_f128 0x80402010_08040201_80402010_08040201)
                                   (VectorSize.Size8x16)))
            ;; Copy of `weighted` with its upper and lower 8 lanes
            ;; exchanged.
            (halves_swapped Reg (vec_extract weighted weighted 8))
            ;; Zipping the two yields the lane permutation
            ;; 15 7 14 6 13 5 12 4 11 3 10 2 9 1 8 0.
            (interleaved Reg (zip1 weighted halves_swapped (VectorSize.Size8x16)))
            ;; "Add across vector" over 16-bit lanes: the low 16 bits end up
            ;; with 15+14+...+8 in the high byte and 7+6+...+0 in the low
            ;; byte, i.e. the 16 MSBs packed together.
            ;;
            ;; N.B.: `Size16x8` here is intentional, not a typo.
            (summed Reg (addv interleaved (VectorSize.Size16x8))))
        (mov_from_vec summed 0 (ScalarSize.Size16))))
2906
;; `vhigh_bits` on `$I16X8`: gather the MSB of each of the 8 16-bit lanes.
(rule (lower (vhigh_bits _ src @ (value_type $I16X8)))
      (let (
            ;; Arithmetic right shift by 15 smears each lane's MSB over the
            ;; entire lane.
            (msb_fill Reg (sshr_vec_imm src 15 (VectorSize.Size16x8)))
            ;; Mask with `0x0080_0040_0020_0010_0008_0004_0002_0001`: lane
            ;; `i` keeps only bit `i`.
            (weighted Reg (and_vec msb_fill
                                   (constant_f128 0x0080_0040_0020_0010_0008_0004_0002_0001)
                                   (VectorSize.Size16x8)))
            ;; Summing the lanes collects the 8 MSBs in the low byte.
            (summed Reg (addv weighted (VectorSize.Size16x8))))
        (mov_from_vec summed 0 (ScalarSize.Size16))))
2919
;; `vhigh_bits` on `$I32X4`: gather the MSB of each of the 4 32-bit lanes.
(rule (lower (vhigh_bits _ src @ (value_type $I32X4)))
      (let (
            ;; Arithmetic right shift by 31 smears each lane's MSB over the
            ;; entire lane.
            (msb_fill Reg (sshr_vec_imm src 31 (VectorSize.Size32x4)))
            ;; Mask with `0x00000008_00000004_00000002_00000001`: lane `i`
            ;; keeps only bit `i`.
            (weighted Reg (and_vec msb_fill
                                   (constant_f128 0x00000008_00000004_00000002_00000001)
                                   (VectorSize.Size32x4)))
            ;; Summing the lanes collects the 4 MSBs in the low byte.
            (summed Reg (addv weighted (VectorSize.Size32x4))))
        (mov_from_vec summed 0 (ScalarSize.Size32))))
2932
;; `vhigh_bits` on `$I64X2`: done with integer operations on the two lanes.
(rule (lower (vhigh_bits _ src @ (value_type $I64X2)))
      (let (
            ;; Move both 64-bit lanes out of the vector.
            (hi_lane Reg (mov_from_vec src 1 (ScalarSize.Size64)))
            (lo_lane Reg (mov_from_vec src 0 (ScalarSize.Size64)))
            ;; Logical right shift by 63 leaves each lane's MSB in bit 0.
            (hi_bit Reg (lsr_imm $I64 hi_lane (imm_shift_from_u8 63)))
            (lo_bit Reg (lsr_imm $I64 lo_lane (imm_shift_from_u8 63))))
        ;; Add the lower MSB to the upper MSB shifted left by one, so the
        ;; upper lane's MSB lands in bit 1 of the result.
        (add_shift $I64 lo_bit hi_bit (lshl_from_u64 $I64 1))))
2944
2945;;; Rules for `uadd_overflow_trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2946
;; `uadd_overflow_trap` for types of at most 64 bits: a flag-producing add
;; (`add_with_flags_paired`) handed to `trap_if_overflow` together with the
;; instruction's trap code.
(rule (lower (has_type (fits_in_64 ty) (uadd_overflow_trap _ lhs rhs code)))
      (trap_if_overflow
       (add_with_flags_paired ty lhs rhs)
       code))
2949
2950;;;; Helpers for `*_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2951
;; Place a narrow value in a register, extended to 32 bits. The
;; `ArgumentExtension` picks the flavour: `Sext` dispatches to
;; `put_in_reg_sext32`, `Uext` to `put_in_reg_zext32`. No other variant is
;; handled here.
(decl put_in_reg_ext32 (Value ArgumentExtension) Reg)
(rule (put_in_reg_ext32 narrow (ArgumentExtension.Sext))
      (put_in_reg_sext32 narrow))
(rule (put_in_reg_ext32 narrow (ArgumentExtension.Uext))
      (put_in_reg_zext32 narrow))
2958
;; Overflow-checked op on a narrow type: perform an ordinary ALU op on
;; zero-/sign-extended operands, then report overflow when the result differs
;; from itself extended from the narrow width.
;;
;; Result: the pair (op result, overflow flag).
(decl overflow_op_small (Type Value Value ArgumentExtension ALUOp) InstOutput)
(rule (overflow_op_small ty lhs rhs ext_kind op)
      (let ((ext_op ExtendOp (lower_extend_op ty ext_kind))

            ;; Only one operand gets an explicit extend instruction; the
            ;; other is extended as part of the ALU op's extended-register
            ;; form:
            ;;
            ;; {u,s}xt{b,h}  a_ext, a
            ;; alu_op        out, a_ext, b, {u,s}xt{b,h}
            ;; cmp           out, out, {u,s}xt{b,h}
            ;; cset          out_of, ne
            (lhs_ext Reg (put_in_reg_ext32 lhs ext_kind))
            (res Reg (alu_rrr_extend op ty lhs_ext rhs ext_op))
            (overflowed Reg (with_flags_reg
                             (cmp_extend (OperandSize.Size32) res res ext_op)
                             (cset (Cond.Ne)))))
        (output_pair (value_reg res)
                     (value_reg overflowed))))
2980
;; Overflow-checked op on a register-sized type: a flag-setting op followed by
;; a `cset` on the supplied condition, with no extra masking.
;;
;; op    out, a, b
;; cset  out_of, cond
;;
;; Conditions callers are expected to pass:
;;   Hs - carry set: unsigned overflow on add
;;   Vs - signed overflow/underflow
;;   Lo - carry clear: unsigned overflow on subtract (aarch64 implements
;;        subtraction as an add of the two's complement, so the subtraction
;;        overflowed exactly when that add did not carry)
;;
;; Result: the pair (register 0, register 1) of the `with_flags` output.
(decl overflow_op_normal (Type Value Value ALUOp Cond) InstOutput)
(rule (overflow_op_normal ty lhs rhs op flag_cond)
      (let ((pair ValueRegs (with_flags (alu_rrr_with_flags_paired ty lhs rhs op)
                                        (cset_paired flag_cond))))
        (output_pair (value_regs_get pair 0)
                     (value_regs_get pair 1))))
2999
;; Overflow-checked op on 128-bit integers, e.g. adds+adcs+cset: a
;; flag-producing op on the low halves, a flag-consuming-and-producing op on
;; the high halves, then a `cset` on the requested condition.
(decl overflow_op_128 (Value Value ALUOp ALUOp Cond) InstOutput)
(rule (overflow_op_128 x y lo_op hi_op flag_cond)
      (let (
            ;; Low/high registers of `x`.
            (x_pair ValueRegs x)
            (x_low Reg (value_regs_get x_pair 0))
            (x_high Reg (value_regs_get x_pair 1))

            ;; Low/high registers of `y`.
            (y_pair ValueRegs y)
            (y_low Reg (value_regs_get y_pair 0))
            (y_high Reg (value_regs_get y_pair 1))

            ;; The plain `with_flags` helper cannot be used here; the three
            ;; instructions are combined with `with_flags_chained` instead.
            (low_half ProducesFlags (alu_rrr_with_flags_paired $I64 x_low y_low lo_op))
            (high_half ConsumesAndProducesFlags (alu_rrr_with_flags_chained $I64 x_high y_high hi_op))
            (flag_read ConsumesFlags (cset_paired flag_cond))

            (chained MultiReg (with_flags_chained low_half high_half flag_read)))
        (multi_reg_to_pair_and_single chained)))
3022
3023;;;; Rules for `uadd_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3024
;; Types of at most 16 bits: ordinary `add` on zero-extended operands, then
;; check that the result equals itself zero-extended.
(rule 1 (lower (has_type (fits_in_16 ty) (uadd_overflow _ lhs rhs)))
      (overflow_op_small ty lhs rhs (ArgumentExtension.Uext) (ALUOp.Add)))

;; 32- and 64-bit types: adds + cset on `Hs`, no extra masking.
(rule 2 (lower (has_type (ty_32_or_64 ty) (uadd_overflow _ lhs rhs)))
      (overflow_op_normal ty lhs rhs (ALUOp.AddS) (Cond.Hs)))

;; 128-bit: adds + adcs + cset on `Hs`.
(rule 0 (lower (has_type $I128 (uadd_overflow _ lhs rhs)))
      (overflow_op_128 lhs rhs (ALUOp.AddS) (ALUOp.AdcS) (Cond.Hs)))
3037
3038;;;; Rules for `sadd_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3039
;; Types of at most 16 bits:
;;   sxt{b,h} a_ext, a
;;   add      out, a_ext, b, sxt{b,h}
;;   cmp      out, out, sxt{b,h}
;;   cset     of, ne
(rule 1 (lower (has_type (fits_in_16 ty) (sadd_overflow _ lhs rhs)))
      (overflow_op_small ty lhs rhs (ArgumentExtension.Sext) (ALUOp.Add)))

;; 32- and 64-bit types:
;;   adds a, b
;;   cset of, vs
(rule 2 (lower (has_type (ty_32_or_64 ty) (sadd_overflow _ lhs rhs)))
      (overflow_op_normal ty lhs rhs (ALUOp.AddS) (Cond.Vs)))

;; 128-bit:
;;   adds x_lo, y_lo
;;   adcs x_hi, y_hi
;;   cset of, vs
(rule 0 (lower (has_type $I128 (sadd_overflow _ lhs rhs)))
      (overflow_op_128 lhs rhs (ALUOp.AddS) (ALUOp.AdcS) (Cond.Vs)))
3057
3058;;;; Rules for `usub_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3059
;; Types of at most 16 bits:
;;   uxt{b,h} a_ext, a
;;   sub      out, a_ext, b, uxt{b,h}
;;   cmp      out, out, uxt{b,h}
;;   cset     of, ne
(rule 1 (lower (has_type (fits_in_16 ty) (usub_overflow _ lhs rhs)))
      (overflow_op_small ty lhs rhs (ArgumentExtension.Uext) (ALUOp.Sub)))

;; 32- and 64-bit types:
;;   subs a, b
;;   cset of, lo
(rule 2 (lower (has_type (ty_32_or_64 ty) (usub_overflow _ lhs rhs)))
      (overflow_op_normal ty lhs rhs (ALUOp.SubS) (Cond.Lo)))

;; 128-bit:
;;   subs x_lo, y_lo
;;   sbcs x_hi, y_hi
;;   cset of, lo
(rule 0 (lower (has_type $I128 (usub_overflow _ lhs rhs)))
      (overflow_op_128 lhs rhs (ALUOp.SubS) (ALUOp.SbcS) (Cond.Lo)))
3077
3078;;;; Rules for `ssub_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3079
;; Types of at most 16 bits:
;;   sxt{b,h} a_ext, a
;;   sub      out, a_ext, b, sxt{b,h}
;;   cmp      out, out, sxt{b,h}
;;   cset     of, ne
(rule 1 (lower (has_type (fits_in_16 ty) (ssub_overflow _ lhs rhs)))
      (overflow_op_small ty lhs rhs (ArgumentExtension.Sext) (ALUOp.Sub)))

;; 32- and 64-bit types:
;;   subs a, b
;;   cset of, vs
(rule 2 (lower (has_type (ty_32_or_64 ty) (ssub_overflow _ lhs rhs)))
      (overflow_op_normal ty lhs rhs (ALUOp.SubS) (Cond.Vs)))

;; 128-bit:
;;   subs x_lo, y_lo
;;   sbcs x_hi, y_hi
;;   cset of, vs
(rule 0 (lower (has_type $I128 (ssub_overflow _ lhs rhs)))
      (overflow_op_128 lhs rhs (ALUOp.SubS) (ALUOp.SbcS) (Cond.Vs)))
3097
3098;;;; Rules for `umul_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3099
;; `umul_overflow` on types of at most 16 bits: multiply the zero-extended
;; operands, then flag overflow when the product differs from itself
;; zero-extended from the narrow width.
;;
;;   uxt{b,h} a_ext, a
;;   uxt{b,h} b_ext, b
;;   mul      out, a_ext, b_ext
;;   cmp      out, out, uxt{b,h}
;;   cset     of, ne
(rule 1 (lower (has_type (fits_in_16 ty) (umul_overflow _ lhs rhs)))
      (let ((ext_op ExtendOp (lower_extend_op ty (ArgumentExtension.Uext)))

            (lhs_z Reg (put_in_reg_zext32 lhs))
            (rhs_z Reg (put_in_reg_zext32 rhs))
            ;; Multiply expressed as `madd` with the zero register as addend.
            (product Reg (madd ty lhs_z rhs_z (zero_reg)))
            (overflowed Reg (with_flags_reg
                             (cmp_extend (OperandSize.Size32) product product ext_op)
                             (cset (Cond.Ne)))))
        (output_pair (value_reg product)
                     (value_reg overflowed))))
3117
;; `umul_overflow` on `$I32`: widening multiply, then compare the 64-bit
;; product against itself zero-extended from 32 bits.
;;
;;   umull out, a, b
;;   cmp   out, out, uxtw
;;   cset  of, ne
(rule 2 (lower (has_type $I32 (umul_overflow _ lhs rhs)))
      (let (
            ;; `umaddl` with the zero register as addend.
            (product Reg (umaddl lhs rhs (zero_reg)))
            (overflowed Reg (with_flags_reg
                             (cmp_extend (OperandSize.Size64) product product (ExtendOp.UXTW))
                             (cset (Cond.Ne)))))
        (output_pair (value_reg product)
                     (value_reg overflowed))))
3130
;; `umul_overflow` on `$I64`: overflow is reported when the high half of the
;; product (`umulh`) is non-zero.
;;
;;   mul   out, a, b
;;   umulh tmp, a, b
;;   cmp   tmp, #0
;;   cset  of, ne
(rule 2 (lower (has_type $I64 (umul_overflow _ lhs rhs)))
      (let (
            ;; Low half: `madd` with the zero register as addend.
            (product Reg (madd $I64 lhs rhs (zero_reg)))
            (high_half Reg (umulh $I64 lhs rhs))
            (overflowed Reg (with_flags_reg
                             (cmp64_imm high_half (u8_into_imm12 0))
                             (cset (Cond.Ne)))))
        (output_pair (value_reg product)
                     (value_reg overflowed))))
3145
3146;;;; Rules for `smul_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3147
;; `smul_overflow` on types of at most 16 bits: multiply the sign-extended
;; operands, then flag overflow when the product differs from itself
;; sign-extended from the narrow width.
;;
;;   sxt{b,h} a_ext, a
;;   sxt{b,h} b_ext, b
;;   mul      out, a_ext, b_ext
;;   cmp      out, out, sxt{b,h}
;;   cset     of, ne
(rule 1 (lower (has_type (fits_in_16 ty) (smul_overflow _ lhs rhs)))
      (let ((ext_op ExtendOp (lower_extend_op ty (ArgumentExtension.Sext)))

            (lhs_s Reg (put_in_reg_sext32 lhs))
            (rhs_s Reg (put_in_reg_sext32 rhs))
            ;; Multiply expressed as `madd` with the zero register as addend.
            (product Reg (madd ty lhs_s rhs_s (zero_reg)))
            (overflowed Reg (with_flags_reg
                             (cmp_extend (OperandSize.Size32) product product ext_op)
                             (cset (Cond.Ne)))))
        (output_pair (value_reg product)
                     (value_reg overflowed))))
3165
;; `smul_overflow` on `$I32`: widening multiply, then compare the 64-bit
;; product against itself sign-extended from 32 bits.
;;
;;   smull out, a, b
;;   cmp   out, out, sxtw
;;   cset  of, ne
(rule 2 (lower (has_type $I32 (smul_overflow _ lhs rhs)))
      (let (
            ;; `smaddl` with the zero register as addend.
            (product Reg (smaddl lhs rhs (zero_reg)))
            (overflowed Reg (with_flags_reg
                             (cmp_extend (OperandSize.Size64) product product (ExtendOp.SXTW))
                             (cset (Cond.Ne)))))
        (output_pair (value_reg product)
                     (value_reg overflowed))))
3178
;; `smul_overflow` on `$I64`: overflow is reported when the high half of the
;; product (`smulh`) differs from the low half arithmetically shifted right
;; by 63, i.e. from the low half's sign bit replicated.
;;
;;   mul   out, a, b
;;   smulh tmp, a, b
;;   cmp   tmp, out, ASR #63
;;   cset  of, ne
(rule 2 (lower (has_type $I64 (smul_overflow _ lhs rhs)))
      (let (
            ;; Low half: `madd` with the zero register as addend.
            (product Reg (madd $I64 lhs rhs (zero_reg)))
            (high_half Reg (smulh $I64 lhs rhs))
            (overflowed Reg (with_flags_reg
                             (cmp_rr_shift_asr (OperandSize.Size64) high_half product 63)
                             (cset (Cond.Ne)))))
        (output_pair (value_reg product)
                     (value_reg overflowed))))
3193
3194;;; Rules for `tls_value` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3195
;; `tls_value` under the `ElfGd` TLS model: `elf_tls_get_addr` on the symbol's
;; name.
(rule (lower (has_type (tls_model (TlsModel.ElfGd))
                       (tls_value _ (symbol_value_data sym _ _))))
      (elf_tls_get_addr sym))

;; `tls_value` under the `Macho` TLS model: `macho_tls_get_addr` on the
;; symbol's name.
(rule (lower (has_type (tls_model (TlsModel.Macho))
                       (tls_value _ (symbol_value_data sym _ _))))
      (macho_tls_get_addr sym))
3201
3202;;; Rules for `fvpromote_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3203
;; `fvpromote_low`: a single `vec_rr_long` using the `Fcvtl32` op, with the
;; final boolean argument `false`.
(rule (lower (fvpromote_low _ src))
      (vec_rr_long
       (VecRRLongOp.Fcvtl32)
       src
       false))
3206
3207;;; Rules for `brif` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3208
;; `brif` whose condition is an `icmp` (optionally wrapped in a `uextend`):
;; lower the comparison straight into flags and branch on the resulting
;; condition code, so no boolean value is materialized.
(rule (lower_branch (brif (maybe_uextend (icmp _ cc lhs @ (value_type ty) rhs)) _ _)
                    (two_targets then_label else_label))
      (let ((icmp_res FlagsAndCC (lower_icmp_into_flags cc lhs rhs ty))
            (br_cond Cond (cond_code (flags_and_cc_cc icmp_res))))
        (emit_side_effect
         (with_flags_side_effect
          (flags_and_cc_flags icmp_res)
          (cond_br then_label else_label (cond_br_cond br_cond))))))
3218
;; `brif` whose condition is a scalar-float `fcmp` (optionally wrapped in a
;; `uextend`): an `fpu_cmp` at the operand's scalar size sets the flags, and
;; the branch uses the condition derived from the float condition code.
(rule (lower_branch (brif (maybe_uextend (fcmp _ cc lhs @ (value_type (ty_scalar_float ty)) rhs)) _ _)
                    (two_targets then_label else_label))
      (let ((br_cond Cond (fp_cond_code cc)))
        (emit_side_effect
         (with_flags_side_effect
          (fpu_cmp (scalar_size ty) lhs rhs)
          (cond_br then_label else_label (cond_br_cond br_cond))))))
3226
;; Generic `brif` on a `$I128` condition: OR the two 64-bit halves together and
;; branch if the combined register is non-zero.
(rule -1 (lower_branch (brif cond_val @ (value_type $I128) _ _)
                       (two_targets then_label else_label))
      (let ((producer ProducesFlags (flags_to_producesflags cond_val))
            (halves ValueRegs (put_in_regs cond_val))
            (low Reg (value_regs_get halves 0))
            (high Reg (value_regs_get halves 1))
            (merged Reg (orr $I64 low high)))
        (emit_side_effect
         (with_flags_side_effect
          producer
          (cond_br then_label else_label
                   (cond_br_not_zero merged (operand_size $I64)))))))
;; Generic `brif` on a condition whose type satisfies `ty_int_ref_scalar_64`:
;; the value goes through `put_in_reg_zext64` and the branch is taken when
;; that register is non-zero.
(rule -2 (lower_branch (brif cond_val @ (value_type ty) _ _)
                       (two_targets then_label else_label))
      (if (ty_int_ref_scalar_64 ty))
      (let ((producer ProducesFlags (flags_to_producesflags cond_val))
            (widened Reg (put_in_reg_zext64 cond_val)))
        (emit_side_effect
         (with_flags_side_effect
          producer
          (cond_br then_label else_label
                   (cond_br_not_zero widened (operand_size $I64)))))))
3244
;; `tbnz` ("Test bit and Branch if Nonzero"): a `brif` on `band x, const`
;; becomes a single test-and-branch when `test_and_compare_bit_const` yields a
;; bit index for the constant; otherwise this rule does not apply.
(rule 1 (lower_branch (brif (band _ src @ (value_type ty) (u64_from_iconst mask)) _ _)
                      (two_targets then_label else_label))
      (if-let bit_idx (test_and_compare_bit_const ty mask))
      (emit_side_effect (tbnz then_label else_label src bit_idx)))
3250
;; `tbz` ("Test bit and Branch if Zero"): a `brif` on
;; `icmp eq (band x, const), 0`, with `x` of at most 64 bits, becomes a single
;; test-and-branch when `test_and_compare_bit_const` yields a bit index for
;; the constant; otherwise this rule does not apply.
(rule 1 (lower_branch (brif (icmp _ (IntCC.Equal)
                                  (band _ src @ (value_type (fits_in_64 ty))
                                        (u64_from_iconst mask))
                                  (u64_from_iconst 0))
                            _ _)
                      (two_targets then_label else_label))
      (if-let bit_idx (test_and_compare_bit_const ty mask))
      (emit_side_effect (tbz then_label else_label src bit_idx)))
3259
;; Helper used by the `tbnz`/`tbz` rules: takes a type and a 64-bit constant
;; and may yield a `u8`, which those rules bind as the bit to test. It is
;; declared `pure partial`, so it can fail to produce a value, in which case
;; the calling rule does not match. The implementation is external to this
;; file.
(decl pure partial test_and_compare_bit_const (Type u64) u8)
(extern constructor test_and_compare_bit_const test_and_compare_bit_const)
3262
3263;;; Rules for `jump` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3264
;; `jump`: an unconditional `aarch64_jump` to the single target label.
(rule (lower_branch (jump _) (single_target dest))
      (emit_side_effect
       (aarch64_jump dest)))
3267
3268;;; Rules for `br_table` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3269
;; `br_table`: `jump_table_targets` supplies the default label and the table's
;; targets. An island sized by `targets_jt_space` is emitted first; the index
;; then goes through `put_in_reg_zext32`, and `br_table_impl` receives the
;; table size, the index register, the default label and the targets.
(rule (lower_branch (br_table index _) (jump_table_targets default_label table))
      (let ((table_len u32 (jump_table_size table))
            (_ InstOutput (side_effect (emit_island (targets_jt_space table))))
            (index_reg Reg (put_in_reg_zext32 index)))
        (br_table_impl table_len index_reg default_label table)))
3278
3279;; Rules for `get_exception_handler_address` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3280
;; `get_exception_handler_address`: look up the label of the block's
;; exceptional successor at index `idx` and produce that label's address.
(rule (lower (get_exception_handler_address _ (u64_from_imm64 idx) block))
      (a64_label_address (block_exn_successor_label block idx)))
3284
3285;; Rules for `sequence_point` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3286
;; `sequence_point`: emitted purely for its side effect via
;; `a64_sequence_point`; the rule produces no result value.
(rule (lower (sequence_point))
      (side_effect (a64_sequence_point)))
3290