;; aarch64 instruction selection and CLIF-to-MachInst lowering.

;; Primary lowering constructor: given a CLIF `Inst`, yields the register(s)
;; that hold the result value(s) of the lowered instruction.
(spec (lower arg)
  (provide (= result arg)))
(decl partial lower (Inst) InstOutput)

;; Branch flavour of the lowering constructor. Besides the instruction it is
;; handed the slice of branch targets to use.
;; For two-target branches the first label is `taken` and the second is
;; `not_taken`, even when the instruction is a Fallthrough: blocks are
;; reordered while lowering, so the fallthrough in the new order is not
;; necessarily the CLIF fallthrough. The explicitly provided target is used.
(decl partial lower_branch (Inst MachLabelSlice) Unit)

;;;; Rules for `iconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule iconst
  (lower (has_type ty (iconst _ (u64_from_imm64 n))))
  (imm ty (ImmExtend.Zero) n))

;;;; Rules for `f16const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule
  (lower (f16const _ (u16_from_ieee16 n)))
  (constant_f16 n))

;;;; Rules for `f32const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule
  (lower (f32const _ (u32_from_ieee32 n)))
  (constant_f32 n))

;;;; Rules for `f64const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule
  (lower (f64const _ (u64_from_ieee64 n)))
  (constant_f64 n))

;;;; Rules for `f128const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule
  (lower (has_type $F128 (f128const _ (u128_from_constant n))))
  (constant_f128 n))

;;;; Rules for `nop` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; A `nop` lowers to `invalid_reg`.
(rule
  (lower (nop))
  (invalid_reg))

;;;; Rules for `iadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller

;; Base case, simply adding things in registers.
(rule iadd_base_case -1
  (lower (has_type (ty_int_ref_scalar_64 ty) (iadd _ x y)))
  (add ty x y))

;; One operand is an immediate that fits in 12 bits: use the immediate form.
(rule iadd_imm12_right 4
  (lower (has_type (ty_int_ref_scalar_64 ty) (iadd _ x (imm12_from_value y))))
  (add_imm ty x y))

(rule iadd_imm12_left 5
  (lower (has_type (ty_int_ref_scalar_64 ty) (iadd _ (imm12_from_value x) y)))
  (add_imm ty y x))

;; As above, but when it is the *negated* immediate that fits in 12 bits the
;; addition is turned into a subtraction of that negated immediate.
(rule iadd_imm12_neg_right 2
  (lower (has_type (ty_int_ref_scalar_64 ty) (iadd _ x y)))
  (if-let neg_imm (imm12_from_negated_value y))
  (sub_imm ty x neg_imm))

(rule iadd_imm12_neg_left 3
  (lower (has_type (ty_int_ref_scalar_64 ty) (iadd _ x y)))
  (if-let neg_imm (imm12_from_negated_value x))
  (sub_imm ty y neg_imm))

;; One operand is an extended register and the extension can be folded into
;; the add itself.
(rule iadd_extend_right 0
  (lower (has_type (ty_int_ref_scalar_64 ty) (iadd _ x (extended_value_from_value y))))
  (add_extend ty x y))

(rule iadd_extend_left 1
  (lower (has_type (ty_int_ref_scalar_64 ty) (iadd _ (extended_value_from_value x) y)))
  (add_extend ty y x))

;; One operand is another register shifted left by a constant amount and the
;; shift can be folded into the add.
(rule iadd_ishl_right 7
  (lower (has_type (ty_int_ref_scalar_64 ty)
                   (iadd _ x (ishl _ y (iconst _ k)))))
  (if-let shift (lshl_from_imm64 ty k))
  (add_shift ty x y shift))

(rule iadd_ishl_left 6
  (lower (has_type (ty_int_ref_scalar_64 ty)
                   (iadd _ (ishl _ x (iconst _ k)) y)))
  (if-let shift (lshl_from_imm64 ty k))
  (add_shift ty y x shift))

;; Fold an `iadd` and `imul` combination into a `madd` instruction.
(rule iadd_imul_right 7
  (lower (has_type (ty_int_ref_scalar_64 ty) (iadd _ x (imul _ y z))))
  (madd ty y z x))

(rule iadd_imul_left 6
  (lower (has_type (ty_int_ref_scalar_64 ty) (iadd _ (imul _ x y) z)))
  (madd ty x y z))

;; Fold an `isub` and `imul` combination into a `msub` instruction.
(rule isub_imul
  (lower (has_type (ty_int_ref_scalar_64 ty) (isub _ x (imul _ y z))))
  (msub ty y z x))

;; vectors

(rule -2
  (lower (has_type ty @ (multi_lane _ _) (iadd _ x y)))
  (add_vec x y (vector_size ty)))

;; `i128`
(rule -3
  (lower (has_type $I128 (iadd _ x y)))
  (let
    ;; Low/high registers of `x`.
    ((lhs ValueRegs x)
     (lhs_lo Reg (value_regs_get lhs 0))
     (lhs_hi Reg (value_regs_get lhs 1))

     ;; Low/high registers of `y`.
     (rhs ValueRegs y)
     (rhs_lo Reg (value_regs_get rhs 0))
     (rhs_hi Reg (value_regs_get rhs 1)))
    ;; The addition itself is an `adds` followed by an `adc`, which together
    ;; produce the low and high bits of the result.
    (with_flags
      (add_with_flags_paired $I64 lhs_lo rhs_lo)
      (adc_paired $I64 lhs_hi rhs_hi))))

;;;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; When a single element of one vector is broadcast to all the destination
;; lanes then the `dup` instruction can be used for this operation. Note that
;; for now this only matches lane selection from the first vector `a`, but
;; if necessary in the future rules can be added to select from `b` as well.
(rule 6
  (lower (shuffle _ a b (shuffle_dup8_from_imm n)))
  (vec_dup_from_fpu a (VectorSize.Size8x16) n))
(rule 5
  (lower (shuffle _ a b (shuffle_dup16_from_imm n)))
  (vec_dup_from_fpu a (VectorSize.Size16x8) n))
(rule 4
  (lower (shuffle _ a b (shuffle_dup32_from_imm n)))
  (vec_dup_from_fpu a (VectorSize.Size32x4) n))
(rule 3
  (lower (shuffle _ a b (shuffle_dup64_from_imm n)))
  (vec_dup_from_fpu a (VectorSize.Size64x2) n))

;; Each extractor below matches an `Immediate` that looks like a duplication
;; of the `n`th lane of the first vector, for lanes of K bytes, and yields
;; that `n` as a `u8` for use with `vec_dup_from_fpu`. There is one extractor
;; per lane bit-width.
(decl shuffle_dup8_from_imm (u8) Immediate)
(extern extractor shuffle_dup8_from_imm shuffle_dup8_from_imm)
(decl shuffle_dup16_from_imm (u8) Immediate)
(extern extractor shuffle_dup16_from_imm shuffle_dup16_from_imm)
(decl shuffle_dup32_from_imm (u8) Immediate)
(extern extractor shuffle_dup32_from_imm shuffle_dup32_from_imm)
(decl shuffle_dup64_from_imm (u8) Immediate)
(extern extractor shuffle_dup64_from_imm shuffle_dup64_from_imm)

;; When the shuffle looks like "concatenate `a` and `b` and shift right by n*8
;; bytes", that's an `ext` instruction.
(rule 2
  (lower (shuffle _ a b (vec_extract_imm4_from_immediate n)))
  (vec_extract a b n))

;; Attempts to extract `n` from the specified shuffle `Immediate` where each
;; byte of the `Immediate` is a consecutive sequence starting from `n`. This
;; value of `n` is used as part of the `vec_extract` instruction which extracts
;; consecutive bytes from two vectors into one final vector, offset by `n`
;; bytes.
(decl vec_extract_imm4_from_immediate (u8) Immediate)
(extern extractor vec_extract_imm4_from_immediate vec_extract_imm4_from_immediate)

;; `uzp1` / `uzp2`: gather the even-numbered or the odd-numbered lanes. One
;; rule per lane width, each keyed on the exact 128-bit shuffle mask.

;; 8-bit lanes
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x1e1c_1a18_1614_1210_0e0c_0a08_0604_0200)))
  (vec_uzp1 a b (VectorSize.Size8x16)))
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x1f1d_1b19_1715_1311_0f0d_0b09_0705_0301)))
  (vec_uzp2 a b (VectorSize.Size8x16)))

;; 16-bit lanes
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x1d1c_1918_1514_1110_0d0c_0908_0504_0100)))
  (vec_uzp1 a b (VectorSize.Size16x8)))
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x1f1e_1b1a_1716_1312_0f0e_0b0a_0706_0302)))
  (vec_uzp2 a b (VectorSize.Size16x8)))

;; 32-bit lanes
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x1b1a1918_13121110_0b0a0908_03020100)))
  (vec_uzp1 a b (VectorSize.Size32x4)))
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x1f1e1d1c_17161514_0f0e0d0c_07060504)))
  (vec_uzp2 a b (VectorSize.Size32x4)))

;; 64-bit lanes
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x1716151413121110_0706050403020100)))
  (vec_uzp1 a b (VectorSize.Size64x2)))
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x1f1e1d1c1b1a1918_0f0e0d0c0b0a0908)))
  (vec_uzp2 a b (VectorSize.Size64x2)))

;; Rules for the `zip1` and `zip2` instructions which interleave lanes in the
;; low or high halves of the two input vectors.
;; 8-bit lanes
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x1707_1606_1505_1404_1303_1202_1101_1000)))
  (vec_zip1 a b (VectorSize.Size8x16)))
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x1f0f_1e0e_1d0d_1c0c_1b0b_1a0a_1909_1808)))
  (vec_zip2 a b (VectorSize.Size8x16)))

;; 16-bit lanes
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x1716_0706_1514_0504_1312_0302_1110_0100)))
  (vec_zip1 a b (VectorSize.Size16x8)))
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x1f1e_0f0e_1d1c_0d0c_1b1a_0b0a_1918_0908)))
  (vec_zip2 a b (VectorSize.Size16x8)))

;; 32-bit lanes
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x17161514_07060504_13121110_03020100)))
  (vec_zip1 a b (VectorSize.Size32x4)))
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x1f1e1d1c_0f0e0d0c_1b1a1918_0b0a0908)))
  (vec_zip2 a b (VectorSize.Size32x4)))
;; No i64x2 rules for zip1/zip2: for 64-bit lanes zip and uzp have the same
;; semantics, so the i64x2 uzp1/uzp2 rules already cover those masks.

;; Rules for the `trn1` and `trn2` instructions which interleave odd or even
;; lanes in the two input vectors.
;; 8-bit lanes
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x1e0e_1c0c_1a0a_1808_1606_1404_1202_1000)))
  (vec_trn1 a b (VectorSize.Size8x16)))
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x1f0f_1d0d_1b0b_1909_1707_1505_1303_1101)))
  (vec_trn2 a b (VectorSize.Size8x16)))

;; 16-bit lanes
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x1d1c_0d0c_1918_0908_1514_0504_1110_0100)))
  (vec_trn1 a b (VectorSize.Size16x8)))
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x1f1e_0f0e_1b1a_0b0a_1716_0706_1312_0302)))
  (vec_trn2 a b (VectorSize.Size16x8)))

;; 32-bit lanes
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x1b1a1918_0b0a0908_13121110_03020100)))
  (vec_trn1 a b (VectorSize.Size32x4)))
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x1f1e1d1c_0f0e0d0c_17161514_07060504)))
  (vec_trn2 a b (VectorSize.Size32x4)))
;; No i64x2 rules for trn1/trn2: for 64-bit lanes trn and uzp have the same
;; semantics, so the i64x2 uzp1/uzp2 rules already cover those masks.

;; Rules for the `rev{16,32,64}` instructions where reversals happen at either
;; the byte level, the 16-bit level, or 32-bit level. Note that all of these
;; patterns only match reversals in the first operand, but they can
;; theoretically be extended if necessary to reversals in the second operand.
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x0e0f_0c0d_0a0b_0809_0607_0405_0203_0001)))
  (rev16 a (VectorSize.Size8x16)))
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x0c0d0e0f_08090a0b_04050607_00010203)))
  (rev32 a (VectorSize.Size8x16)))
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x0d0c0f0e_09080b0a_05040706_01000302)))
  (rev32 a (VectorSize.Size16x8)))
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x08090a0b0c0d0e0f_0001020304050607)))
  (rev64 a (VectorSize.Size8x16)))
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x09080b0a0d0c0f0e_0100030205040706)))
  (rev64 a (VectorSize.Size16x8)))
(rule 1
  (lower (shuffle _ a b (u128_from_immediate 0x0b0a09080f0e0d0c_0302010007060504)))
  (rev64 a (VectorSize.Size32x4)))

;; Default-priority shuffle rule: materialize the mask as a 128-bit constant
;; and pass it, with both inputs, to `vec_tbl2`.
(rule
  (lower (has_type ty (shuffle _ rn rn2 (u128_from_immediate mask))))
  (let ((table_idx Reg (constant_f128 mask)))
    (vec_tbl2 rn rn2 table_idx ty)))

;;;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule
  (lower (has_type vec_i128_ty (swizzle _ rn rm)))
  (vec_tbl rn rm))

;;;; Rules for `isplit` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Hand back the two registers of the `i128` value as a pair of outputs.
(rule
  (lower (isplit _ x @ (value_type $I128)))
  (let
    ((src ValueRegs x)
     (lo_half ValueRegs (value_regs_get src 0))
     (hi_half ValueRegs (value_regs_get src 1)))
    (output_pair lo_half hi_half)))

;; Special-case the lowering of an `isplit` of a 128-bit multiply where the
;; lower bits of the result are discarded and the operands are sign or zero
;; extended. This maps directly to `umulh` and `smulh`.
;; Zero-extended operands: `umulh`. Only applies when the first result of the
;; `isplit` is unused; that output slot is filled with `invalid_reg`.
(rule 1
  (lower i @ (isplit _ (has_type $I128 (imul _ (uextend _ x) (uextend _ y)))))
  (if-let (first_result lo) i)
  (if-let true (value_is_unused lo))
  (output_pair
    (invalid_reg)
    (umulh $I64 (put_in_reg_zext64 x) (put_in_reg_zext64 y))))

;; Sign-extended operands: `smulh`, under the same unused-first-result guard.
(rule 1
  (lower i @ (isplit _ (has_type $I128 (imul _ (sextend _ x) (sextend _ y)))))
  (if-let (first_result lo) i)
  (if-let true (value_is_unused lo))
  (output_pair
    (invalid_reg)
    (smulh $I64 (put_in_reg_sext64 x) (put_in_reg_sext64 y))))

;;;; Rules for `iconcat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule
  (lower (has_type $I128 (iconcat _ lo hi)))
  (output (value_regs lo hi)))

;;;; Rules for `scalar_to_vector` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Float scalars.
(rule
  (lower (has_type $F32X4 (scalar_to_vector _ x)))
  (fpu_extend x (ScalarSize.Size32)))

(rule
  (lower (has_type $F64X2 (scalar_to_vector _ x)))
  (fpu_extend x (ScalarSize.Size64)))

;; Integer scalars: `i64` is moved as-is, narrower integers are first
;; zero-extended to 32 bits.
(rule -1
  (lower (scalar_to_vector _ x @ (value_type $I64)))
  (mov_to_fpu x (ScalarSize.Size64)))

(rule -2
  (lower (scalar_to_vector _ x @ (value_type (int_fits_in_32 _))))
  (mov_to_fpu (put_in_reg_zext32 x) (ScalarSize.Size32)))

;;;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; cmeq vtmp.2d, vm.2d, #0
;; addp dtmp, vtmp.2d
;; fcmp dtmp, dtmp
;; cset xd, eq
;;
;; Note that after the ADDP the value of the temporary register will be either
;; 0 when all input elements are true, i.e. non-zero, or a NaN otherwise
;; (either -1 or -2 when represented as an integer); NaNs are the only
;; floating-point numbers that compare unequal to themselves.
(rule
  (lower (vall_true _ x @ (value_type (multi_lane 64 2))))
  (let
    ((is_zero Reg (cmeq0 x (VectorSize.Size64x2)))
     (folded Reg (addp is_zero is_zero (VectorSize.Size64x2))))
    (with_flags
      (fpu_cmp (ScalarSize.Size64) folded folded)
      (materialize_bool_result (Cond.Eq)))))

;; 32x2 vectors: lane 0 of `x` is moved out as a 64-bit scalar and tested with
;; a shifted compare followed by a conditional compare.
(rule
  (lower (vall_true _ x @ (value_type (multi_lane 32 2))))
  (let
    ((scalar Reg (mov_from_vec x 0 (ScalarSize.Size64))))
    (with_flags
      (cmp_rr_shift (OperandSize.Size64) (zero_reg) scalar 32)
      (ccmp_imm
        (OperandSize.Size32)
        scalar
        (u8_into_uimm5 0)
        (nzcv false true false false)
        (Cond.Ne)))))

;; This operation is implemented by using uminv to create a scalar value, which
;; is then compared against zero.
;;
;; uminv bn, vm.16b
;; mov xm, vn.d[0]
;; cmp xm, #0
;; cset xm, ne
(rule -1
  (lower (vall_true _ x @ (value_type (lane_fits_in_32 ty))))
  (if (not_vec32x2 ty))
  (let
    ((min_lane Reg (vec_lanes (VecLanesOp.Uminv) x (vector_size ty)))
     (scalar Reg (mov_from_vec min_lane 0 (ScalarSize.Size64))))
    (with_flags
      (cmp_imm (OperandSize.Size64) scalar (u8_into_imm12 0))
      (materialize_bool_result (Cond.Ne)))))

;;;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule
  (lower (vany_true _ x @ (value_type in_ty)))
  (with_flags
    (vanytrue x in_ty)
    (materialize_bool_result (Cond.Ne))))

;;;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; special case for the `i16x8.extadd_pairwise_i8x16_s` wasm instruction
(rule
  (lower (has_type $I16X8 (iadd_pairwise _ (swiden_low _ x) (swiden_high _ x))))
  (saddlp8 x))

;; special case for the `i32x4.extadd_pairwise_i16x8_s` wasm instruction
(rule
  (lower (has_type $I32X4 (iadd_pairwise _ (swiden_low _ x) (swiden_high _ x))))
  (saddlp16 x))

;; special case for the `i16x8.extadd_pairwise_i8x16_u` wasm instruction
(rule
  (lower (has_type $I16X8 (iadd_pairwise _ (uwiden_low _ x) (uwiden_high _ x))))
  (uaddlp8 x))

;; special case for the `i32x4.extadd_pairwise_i16x8_u` wasm instruction
(rule
  (lower (has_type $I32X4 (iadd_pairwise _ (uwiden_low _ x) (uwiden_high _ x))))
  (uaddlp16 x))

;; General case: plain `addp` on the two operands.
(rule -1
  (lower (has_type ty (iadd_pairwise _ x y)))
  (addp x y (vector_size ty)))

;;;; Rules for `iabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1
  (lower (has_type ty @ (multi_lane _ _) (iabs _ x)))
  (vec_abs x (vector_size ty)))

(rule iabs_64 2
  (lower (has_type $I64 (iabs _ x)))
  (abs (OperandSize.Size64) x))

;; Narrower integers are sign-extended to 32 bits before the 32-bit `abs`.
(rule iabs_8_16_32 1
  (lower (has_type (fits_in_32 ty) (iabs _ x)))
  (abs (OperandSize.Size32) (put_in_reg_sext32 x)))

; `rustc` implementation.
; - create a bitmask of all 1s if negative, or 0s if positive.
; - xor all bits by bitmask. then subtract bitmask from xor'd values.
; - if `x` is positive, the xor'd bits = x and the mask = 0, so we end up with
; `x - 0`.
; - if `x` is negative, the xor'd bits = ~x and the mask = -1, so we end up with
; `~x - (-1) = ~x + 1`, which is exactly `abs(x)`.
(rule
  (lower (has_type $I128 (iabs _ x)))
  (let
    ((src ValueRegs x)
     (src_lo Reg (value_regs_get src 0))
     (src_hi Reg (value_regs_get src 1))
     ;; Arithmetic shift of the high half by 63: the sign bitmask.
     (sign_mask Reg (asr_imm $I64 src_hi (imm_shift_from_u8 63)))
     (flipped_hi Reg (eor $I64 src_hi sign_mask))
     (flipped_lo Reg (eor $I64 src_lo sign_mask))
     ;; Subtract the mask from both halves; the flags link the two halves.
     (low_sub ProducesFlags (sub_with_flags_paired $I64 flipped_lo sign_mask))
     (high_sbc ConsumesFlags (sbc_paired $I64 flipped_hi sign_mask)))
    (with_flags low_sub high_sbc)))

;;;; Rules for `avg_round` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 64-bit lanes: computed as `((x | y) & 1) + ((x >> 1) + (y >> 1))`.
(rule
  (lower (has_type $I64X2 (avg_round _ x y)))
  (let
    ((ones Reg (splat_const 1 (VectorSize.Size64x2)))
     (either Reg (orr_vec x y (VectorSize.Size64x2)))
     (round_bit Reg (and_vec either ones (VectorSize.Size64x2)))
     (half_x Reg (ushr_vec_imm x 1 (VectorSize.Size64x2)))
     (half_y Reg (ushr_vec_imm y 1 (VectorSize.Size64x2)))
     (halves Reg (add_vec half_x half_y (VectorSize.Size64x2))))
    (add_vec round_bit halves (VectorSize.Size64x2))))

(rule -1
  (lower (has_type (lane_fits_in_32 ty) (avg_round _ x y)))
  (vec_rrr (VecALUOp.Urhadd) x y (vector_size ty)))

;;;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule
  (lower (has_type ty @ (multi_lane _ _) (sqmul_round_sat _ x y)))
  (vec_rrr (VecALUOp.Sqrdmulh) x y (vector_size ty)))

;;;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1
  (lower (has_type ty @ (multi_lane _ _) (fadd _ rn rm)))
  (vec_rrr (VecALUOp.Fadd) rn rm (vector_size ty)))

(rule
  (lower (has_type (ty_scalar_float ty) (fadd _ rn rm)))
  (fpu_rrr (FPUOp2.Add) rn rm (scalar_size ty)))

;;;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1
  (lower (has_type ty @ (multi_lane _ _) (fsub _ rn rm)))
  (vec_rrr (VecALUOp.Fsub) rn rm (vector_size ty)))

(rule
  (lower (has_type (ty_scalar_float ty) (fsub _ rn rm)))
  (fpu_rrr (FPUOp2.Sub) rn rm (scalar_size ty)))

;;;; Rules for `fmul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1
  (lower (has_type ty @ (multi_lane _ _) (fmul _ rn rm)))
  (vec_rrr (VecALUOp.Fmul) rn rm (vector_size ty)))

(rule
  (lower (has_type (ty_scalar_float ty) (fmul _ rn rm)))
  (fpu_rrr (FPUOp2.Mul) rn rm (scalar_size ty)))

;;;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1
  (lower (has_type ty @ (multi_lane _ _) (fdiv _ rn rm)))
  (vec_rrr (VecALUOp.Fdiv) rn rm (vector_size ty)))

(rule
  (lower (has_type (ty_scalar_float ty) (fdiv _ rn rm)))
  (fpu_rrr (FPUOp2.Div) rn rm (scalar_size ty)))

;;;; Rules for `fmin` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1
  (lower (has_type ty @ (multi_lane _ _) (fmin _ rn rm)))
  (vec_rrr (VecALUOp.Fmin) rn rm (vector_size ty)))

(rule
  (lower (has_type (ty_scalar_float ty) (fmin _ rn rm)))
  (fpu_rrr (FPUOp2.Min) rn rm (scalar_size ty)))

;;;; Rules for `fmax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1
  (lower (has_type ty @ (multi_lane _ _) (fmax _ rn rm)))
  (vec_rrr (VecALUOp.Fmax) rn rm (vector_size ty)))

(rule
  (lower (has_type (ty_scalar_float ty) (fmax _ rn rm)))
  (fpu_rrr (FPUOp2.Max) rn rm (scalar_size ty)))

;;;; Rules for `sqrt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1
  (lower (has_type ty @ (multi_lane _ _) (sqrt _ x)))
  (vec_misc (VecMisc2.Fsqrt) x (vector_size ty)))

(rule
  (lower (has_type (ty_scalar_float ty) (sqrt _ x)))
  (fpu_rr (FPUOp1.Sqrt) x (scalar_size ty)))

;;;; Rules for `fneg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1
  (lower (has_type ty @ (multi_lane _ _) (fneg _ x)))
  (vec_misc (VecMisc2.Fneg) x (vector_size ty)))

(rule
  (lower (has_type (ty_scalar_float ty) (fneg _ x)))
  (fpu_rr (FPUOp1.Neg) x (scalar_size ty)))

;;;; Rules for `fabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1
  (lower (has_type ty @ (multi_lane _ _) (fabs _ x)))
  (vec_misc (VecMisc2.Fabs) x (vector_size ty)))

(rule
  (lower (has_type (ty_scalar_float ty) (fabs _ x)))
  (fpu_rr (FPUOp1.Abs) x (scalar_size ty)))

;;;; Rules for `fpromote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule
  (lower (has_type $F64 (fpromote _ x)))
  (fpu_rr (FPUOp1.Cvt32To64) x (ScalarSize.Size32)))

;;;; Rules for `fdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule
  (lower (has_type $F32 (fdemote _ x)))
  (fpu_rr (FPUOp1.Cvt64To32) x (ScalarSize.Size64)))

;;;; Rules for `ceil` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1
  (lower (has_type ty @ (multi_lane _ _) (ceil _ x)))
  (vec_misc (VecMisc2.Frintp) x (vector_size ty)))

(rule
  (lower (has_type $F32 (ceil _ x)))
  (fpu_round (FpuRoundMode.Plus32) x))

(rule
  (lower (has_type $F64 (ceil _ x)))
  (fpu_round (FpuRoundMode.Plus64) x))

;;;; Rules for `floor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1
  (lower (has_type ty @ (multi_lane _ _) (floor _ x)))
  (vec_misc (VecMisc2.Frintm) x (vector_size ty)))

(rule
  (lower (has_type $F32 (floor _ x)))
  (fpu_round (FpuRoundMode.Minus32) x))

(rule
  (lower (has_type $F64 (floor _ x)))
  (fpu_round (FpuRoundMode.Minus64) x))

;;;; Rules for `trunc` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1
  (lower (has_type ty @ (multi_lane _ _) (trunc _ x)))
  (vec_misc (VecMisc2.Frintz) x (vector_size ty)))

(rule
  (lower (has_type $F32 (trunc _ x)))
  (fpu_round (FpuRoundMode.Zero32) x))

(rule
  (lower (has_type $F64 (trunc _ x)))
  (fpu_round (FpuRoundMode.Zero64) x))

;;;; Rules for `nearest` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1
  (lower (has_type ty @ (multi_lane _ _) (nearest _ x)))
  (vec_misc (VecMisc2.Frintn) x (vector_size ty)))

(rule
  (lower (has_type $F32 (nearest _ x)))
  (fpu_round (FpuRoundMode.Nearest32) x))

(rule
  (lower (has_type $F64 (nearest _ x)))
  (fpu_round (FpuRoundMode.Nearest64) x))

;;;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Every `fma` goes through `fmadd`; a scalar `fma` whose addend is an `fneg`
;; goes through `fnmsub` with the un-negated addend instead.
(rule 1
  (lower (has_type ty (fma _ x y z)))
  (fmadd ty x y z))
(rule 2
  (lower (has_type (ty_scalar_float ty) (fma _ x y (fneg _ z))))
  (fnmsub ty x y z))

;; Constructors matching the scalar behavior of aarch64. If you're confused like
;; I was reading over these, they are:
;;
;; * fmadd   r = a * b + c     / multiply, then add
;; * fnmadd  r = -(a * b) - c  / multiply, then negate, then subtract
;; * fmsub   r = -(a * b) + c  / multiply, then negate, then add
;; * fnmsub  r = a * b - c     / multiply, then subtract
(decl fmadd (Type Value Value Value) Reg)
(decl fnmadd (Type Value Value Value) Reg)
(decl fmsub (Type Value Value Value) Reg)
(decl fnmsub (Type Value Value Value) Reg)

;; Fold an `fneg` on either multiplicand into the opcode: `fmadd` becomes
;; `fmsub`, and `fnmsub` becomes `fnmadd`.
(rule 2 (fmadd ty (fneg _ x) y z) (fmsub ty x y z))
(rule 3 (fmadd ty x (fneg _ y) z) (fmsub ty x y z))
(rule 2 (fnmsub ty (fneg _ x) y z) (fnmadd ty x y z))
(rule 3 (fnmsub ty x (fneg _ y) z) (fnmadd ty x y z))

;; Scalar cases
(rule 0
  (fmadd (ty_scalar_float ty) x y z)
  (fpu_rrrr (FPUOp3.MAdd) (scalar_size ty) x y z))
(rule 0
  (fnmadd (ty_scalar_float ty) x y z)
  (fpu_rrrr (FPUOp3.NMAdd) (scalar_size ty) x y z))
(rule 0
  (fmsub (ty_scalar_float ty) x y z)
  (fpu_rrrr (FPUOp3.MSub) (scalar_size ty) x y z))
(rule 0
  (fnmsub (ty_scalar_float ty) x y z)
  (fpu_rrrr (FPUOp3.NMSub) (scalar_size ty) x y z))

;; Vector cases
(rule 1
  (fmadd ty @ (multi_lane _ _) x y z)
  (lower_fmla (VecALUModOp.Fmla) x y z (vector_size ty)))
(rule 1
  (fmsub ty @ (multi_lane _ _) x y z)
  (lower_fmla (VecALUModOp.Fmls) x y z (vector_size ty)))

;; Lowers a fused-multiply-add operation handling various forms of the
;; instruction to get maximal coverage of what's available on AArch64.
(decl lower_fmla (VecALUModOp Value Value Value VectorSize) Reg)

;; Base case, emit the op requested.
(rule
  (lower_fmla op x y z size)
  (vec_rrr_mod op z x y size))

;; Special case: if one of the multiplicands is a splat then the element-based
;; fma can be used instead, with 0 as the element index.
(rule 1
  (lower_fmla op (splat _ x) y z size)
  (vec_fmla_elem op z y x size 0))
(rule 2
  (lower_fmla op x (splat _ y) z size)
  (vec_fmla_elem op z x y size 0))

;; Special case: if one of the multiplicands is a shuffle to broadcast a
;; single element of a vector then the element-based fma can be used like splat
;; above.
;;
;; Note that in Cranelift shuffle always has i8x16 inputs and outputs so
;; a `bitcast` is matched here explicitly since that's the main way a shuffle
;; output will be fed into this instruction.
;; 32-bit lanes: all four selectors equal `n`, with `n` < 4.
(rule 3
  (lower_fmla op (bitcast _ _ (shuffle _ x x (shuffle32_from_imm n n n n))) y z size @ (VectorSize.Size32x4))
  (if-let true (u64_lt n 4))
  (vec_fmla_elem op z y x size n))
(rule 4
  (lower_fmla op x (bitcast _ _ (shuffle _ y y (shuffle32_from_imm n n n n))) z size @ (VectorSize.Size32x4))
  (if-let true (u64_lt n 4))
  (vec_fmla_elem op z x y size n))

;; 64-bit lanes: both selectors equal `n`, with `n` < 2.
(rule 3
  (lower_fmla op (bitcast _ _ (shuffle _ x x (shuffle64_from_imm n n))) y z size @ (VectorSize.Size64x2))
  (if-let true (u64_lt n 2))
  (vec_fmla_elem op z y x size n))
(rule 4
  (lower_fmla op x (bitcast _ _ (shuffle _ y y (shuffle64_from_imm n n))) z size @ (VectorSize.Size64x2))
  (if-let true (u64_lt n 2))
  (vec_fmla_elem op z x y size n))

;;;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule
  (lower (has_type ty (fcopysign _ x y)))
  (fcopy_sign x y ty))

;;;; Rules for `fcvt_to_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; From `f32`.
(rule
  (lower (has_type (fits_in_32 out_ty) (fcvt_to_uint _ x @ (value_type $F32))))
  (fpu_to_int_cvt (FpuToIntOp.F32ToU32) x false $F32 out_ty))

(rule 1
  (lower (has_type $I64 (fcvt_to_uint _ x @ (value_type $F32))))
  (fpu_to_int_cvt (FpuToIntOp.F32ToU64) x false $F32 $I64))

;; From `f64`.
(rule
  (lower (has_type (fits_in_32 out_ty) (fcvt_to_uint _ x @ (value_type $F64))))
  (fpu_to_int_cvt (FpuToIntOp.F64ToU32) x false $F64 out_ty))

(rule 1
  (lower (has_type $I64 (fcvt_to_uint _ x @ (value_type $F64))))
  (fpu_to_int_cvt (FpuToIntOp.F64ToU64) x false $F64 $I64))

;;;; Rules for `fcvt_to_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; From `f32`.
(rule
  (lower (has_type (fits_in_32 out_ty) (fcvt_to_sint _ x @ (value_type $F32))))
  (fpu_to_int_cvt (FpuToIntOp.F32ToI32) x true $F32 out_ty))

(rule 1
  (lower (has_type $I64 (fcvt_to_sint _ x @ (value_type $F32))))
  (fpu_to_int_cvt (FpuToIntOp.F32ToI64) x true $F32 $I64))

;; From `f64`.
(rule
  (lower (has_type (fits_in_32 out_ty) (fcvt_to_sint _ x @ (value_type $F64))))
  (fpu_to_int_cvt (FpuToIntOp.F64ToI32) x true $F64 out_ty))

(rule 1
  (lower (has_type $I64 (fcvt_to_sint _ x @ (value_type $F64))))
  (fpu_to_int_cvt (FpuToIntOp.F64ToI64) x true $F64 $I64))

;;;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Vectors: input and output lane widths must match (32 or 64 bits).
(rule -1
  (lower (has_type ty @ (multi_lane 32 _) (fcvt_from_uint _ x @ (value_type (multi_lane 32 _)))))
  (vec_misc (VecMisc2.Ucvtf) x (vector_size ty)))

(rule -1
  (lower (has_type ty @ (multi_lane 64 _) (fcvt_from_uint _ x @ (value_type (multi_lane 64 _)))))
  (vec_misc (VecMisc2.Ucvtf) x (vector_size ty)))

;; Scalars of 32 bits or fewer are zero-extended to 32 bits first.
(rule
  (lower (has_type $F32 (fcvt_from_uint _ x @ (value_type (fits_in_32 _)))))
  (int_to_fpu (IntToFpuOp.U32ToF32) (put_in_reg_zext32 x)))

(rule
  (lower (has_type $F64 (fcvt_from_uint _ x @ (value_type (fits_in_32 _)))))
  (int_to_fpu (IntToFpuOp.U32ToF64) (put_in_reg_zext32 x)))

;; `i64` scalars.
(rule 1
  (lower (has_type $F32 (fcvt_from_uint _ x @ (value_type $I64))))
  (int_to_fpu (IntToFpuOp.U64ToF32) x))

(rule 1
  (lower (has_type $F64 (fcvt_from_uint _ x @ (value_type $I64))))
  (int_to_fpu (IntToFpuOp.U64ToF64) x))

;;;; Rules for `fcvt_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Vectors: input and output lane widths must match (32 or 64 bits).
(rule -1
  (lower (has_type ty @ (multi_lane 32 _) (fcvt_from_sint _ x @ (value_type (multi_lane 32 _)))))
  (vec_misc (VecMisc2.Scvtf) x (vector_size ty)))

(rule -1
  (lower (has_type ty @ (multi_lane 64 _) (fcvt_from_sint _ x @ (value_type (multi_lane 64 _)))))
  (vec_misc (VecMisc2.Scvtf) x (vector_size ty)))

;; Scalars of 32 bits or fewer are sign-extended to 32 bits first.
(rule
  (lower (has_type $F32 (fcvt_from_sint _ x @ (value_type (fits_in_32 _)))))
  (int_to_fpu (IntToFpuOp.I32ToF32) (put_in_reg_sext32 x)))

(rule
  (lower (has_type $F64 (fcvt_from_sint _ x @ (value_type (fits_in_32 _)))))
  (int_to_fpu (IntToFpuOp.I32ToF64) (put_in_reg_sext32 x)))

;; `i64` scalars.
(rule 1
  (lower (has_type $F32 (fcvt_from_sint _ x @ (value_type $I64))))
  (int_to_fpu (IntToFpuOp.I64ToF32) x))

(rule 1
  (lower (has_type $F64 (fcvt_from_sint _ x @ (value_type $I64))))
  (int_to_fpu (IntToFpuOp.I64ToF64) x))

;;;; Rules for `fcvt_to_uint_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Vectors.
(rule -1
  (lower (has_type ty @ (multi_lane 32 _) (fcvt_to_uint_sat _ x @ (value_type (multi_lane 32 _)))))
  (vec_misc (VecMisc2.Fcvtzu) x (vector_size ty)))

(rule -1
  (lower (has_type ty @ (multi_lane 64 _) (fcvt_to_uint_sat _ x @ (value_type (multi_lane 64 _)))))
  (vec_misc (VecMisc2.Fcvtzu) x (vector_size ty)))

;; From `f32`.
(rule
  (lower (has_type (fits_in_32 out_ty) (fcvt_to_uint_sat _ x @ (value_type $F32))))
  (fpu_to_int_cvt_sat (FpuToIntOp.F32ToU32) x false out_ty))

(rule 1
  (lower (has_type $I64 (fcvt_to_uint_sat _ x @ (value_type $F32))))
  (fpu_to_int_cvt_sat (FpuToIntOp.F32ToU64) x false $I64))

;; From `f64`.
(rule
  (lower (has_type (fits_in_32 out_ty) (fcvt_to_uint_sat _ x @ (value_type $F64))))
  (fpu_to_int_cvt_sat (FpuToIntOp.F64ToU32) x false out_ty))

(rule 1
  (lower (has_type $I64 (fcvt_to_uint_sat _ x @ (value_type $F64))))
  (fpu_to_int_cvt_sat (FpuToIntOp.F64ToU64) x false $I64))

;;;; Rules for `fcvt_to_sint_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Vectors.
(rule -1
  (lower (has_type ty @ (multi_lane 32 _) (fcvt_to_sint_sat _ x @ (value_type (multi_lane 32 _)))))
  (vec_misc (VecMisc2.Fcvtzs) x (vector_size ty)))

(rule -1
  (lower (has_type ty @ (multi_lane 64 _) (fcvt_to_sint_sat _ x @ (value_type (multi_lane 64 _)))))
  (vec_misc (VecMisc2.Fcvtzs) x (vector_size ty)))

;; From `f32`.
(rule
  (lower (has_type (fits_in_32 out_ty) (fcvt_to_sint_sat _ x @ (value_type $F32))))
  (fpu_to_int_cvt_sat (FpuToIntOp.F32ToI32) x true out_ty))

(rule 1
  (lower (has_type $I64 (fcvt_to_sint_sat _ x @ (value_type $F32))))
  (fpu_to_int_cvt_sat (FpuToIntOp.F32ToI64) x true $I64))

;; From `f64`.
(rule
  (lower (has_type (fits_in_32 out_ty) (fcvt_to_sint_sat _ x @ (value_type $F64))))
  (fpu_to_int_cvt_sat (FpuToIntOp.F64ToI32) x true out_ty))

(rule 1
  (lower (has_type $I64 (fcvt_to_sint_sat _ x @ (value_type $F64))))
  (fpu_to_int_cvt_sat (FpuToIntOp.F64ToI64) x true $I64))

;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller

;; Base case, simply subtracting things in registers.
(rule isub_base_case -4
  (lower (has_type (ty_int_ref_scalar_64 ty) (isub _ x y)))
  (sub ty x y))

;; The right-hand operand is an immediate that fits in 12 bits.
(rule isub_imm12 0
  (lower (has_type (ty_int_ref_scalar_64 ty) (isub _ x (imm12_from_value y))))
  (sub_imm ty x y))

;; As above, but when it is the *negated* immediate that fits in 12 bits the
;; subtraction is turned into an addition of that negated immediate.
(rule isub_imm12_neg 2
  (lower (has_type (ty_int_ref_scalar_64 ty) (isub _ x y)))
  (if-let neg_imm (imm12_from_negated_value y))
  (add_imm ty x neg_imm))

;; The right-hand operand is an extended register and the extension can be
;; folded into the sub itself.
(rule isub_extend 1
  (lower (has_type (ty_int_ref_scalar_64 ty) (isub _ x (extended_value_from_value y))))
  (sub_extend ty x y))

;; Finally a special case for when we're subtracting the shift of a different
;; register by a constant amount and the shift can get folded into the sub.
(rule isub_ishl -3
      (lower (has_type (ty_int_ref_scalar_64 ty)
                       (isub _ lhs (ishl _ shifted (iconst _ k)))))
      (if-let amt (lshl_from_imm64 ty k))
      (sub_shift ty lhs shifted amt))

;; Vector `isub`.
(rule -2 (lower (has_type ty @ (multi_lane _ _) (isub _ lhs rhs)))
      (sub_vec lhs rhs (vector_size ty)))

;; `i128` `isub`.
(rule -1 (lower (has_type $I128 (isub _ lhs rhs)))
      (sub_i128 lhs rhs))

;;;; Rules for `uadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (ty_vec128 ty) (uadd_sat _ lhs rhs)))
      (uqadd lhs rhs (vector_size ty)))

;;;; Rules for `sadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (ty_vec128 ty) (sadd_sat _ lhs rhs)))
      (sqadd lhs rhs (vector_size ty)))

;;;; Rules for `usub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (ty_vec128 ty) (usub_sat _ lhs rhs)))
      (uqsub lhs rhs (vector_size ty)))

;;;; Rules for `ssub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type (ty_vec128 ty) (ssub_sat _ lhs rhs)))
      (sqsub lhs rhs (vector_size ty)))

;;;; Rules for `ineg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller: subtract the operand from the zero register.
(rule ineg_base_case 1 (lower (has_type (fits_in_64 ty) (ineg _ val)))
      (sub ty (zero_reg) val))

;; `i128`: subtract the operand from an all-zero register pair.
(rule 2 (lower (has_type $I128 (ineg _ val)))
      (sub_i128 (value_regs_zero) val))

;; 128-bit vectors.
(rule (lower (has_type (ty_vec128 ty) (ineg _ val)))
      (neg val (vector_size ty)))

;;;; Rules for `imul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller: a multiply-add whose addend is the zero register.
(rule imul_base_case -3
      (lower (has_type (ty_int_ref_scalar_64 ty) (imul _ lhs rhs)))
      (madd ty lhs rhs (zero_reg)))

;; `i128`.
;;
;; With each operand split into 64-bit halves the product is:
;;
;;   lo = a_lo * b_lo
;;   hi = umulhi(a_lo, b_lo) + (a_lo * b_hi) + (a_hi * b_lo)
;;
;; which is emitted as:
;;
;;   umulh hi, a_lo, b_lo
;;   madd  hi, a_lo, b_hi, hi
;;   madd  hi, a_hi, b_lo, hi
;;   madd  lo, a_lo, b_lo, zero
(rule -1 (lower (has_type $I128 (imul _ x y)))
      (let
          ;; Low/high halves of the first operand.
          ((a ValueRegs x)
           (a_lo Reg (value_regs_get a 0))
           (a_hi Reg (value_regs_get a 1))

           ;; Low/high halves of the second operand.
           (b ValueRegs y)
           (b_lo Reg (value_regs_get b 0))
           (b_hi Reg (value_regs_get b 1))

           ;; Accumulate the high half, then compute the low half.
           (hi_base Reg (umulh $I64 a_lo b_lo))
           (hi_partial Reg (madd $I64 a_lo b_hi hi_base))
           (hi Reg (madd $I64 a_hi b_lo hi_partial))
           (lo Reg (madd $I64 a_lo b_lo (zero_reg))))
        (value_regs lo hi)))

;; When both operands are zero-extended (or both sign-extended) values, each
;; is placed in a single 64-bit register with the matching extension. The low
;; half is then one `madd` and the high half one `umulh` / `smulh`, with no
;; cross-term additions.
(rule (lower (has_type $I128 (imul _ (uextend _ x) (uextend _ y))))
      (let ((x64 Reg (put_in_reg_zext64 x))
            (y64 Reg (put_in_reg_zext64 y)))
        (value_regs
          (madd $I64 x64 y64 (zero_reg))
          (umulh $I64 x64 y64))))
(rule (lower (has_type $I128 (imul _ (sextend _ x) (sextend _ y))))
      (let ((x64 Reg (put_in_reg_sext64 x))
            (y64 Reg (put_in_reg_sext64 y)))
        (value_regs
          (madd $I64 x64 y64 (zero_reg))
          (smulh $I64 x64 y64))))

;; Vectors whose lanes fit in 32 bits (i8x8/i8x16/i16x4/i16x8/i32x2/i32x4).
(rule -2 (lower (has_type (lane_fits_in_32 ty @ (multi_lane _ _)) (imul _ lhs rhs)))
      (mul lhs rhs (vector_size ty)))

;; Special lowering for i64x2.
;;
;; This I64X2 multiplication is performed with several 32-bit
;; operations.
;;
;; Write each 64-bit number as a pair of 32-bit halves:
;;   x = a + 2^32(b)
;;   y = c + 2^32(d)
;;
;; Their 64-bit product is then:
;;   x * y = ac + 2^32(ad + bc) + 2^64(bd)
;;   note: the `2^64(bd)` term can be ignored, it is too large to fit in
;;   64 bits.
;;
;; For the I64X2 multiply the registers `rn` and `rm` are viewed as four
;; 32-bit components each:
;;   rn = |d|c|b|a|
;;   rm = |h|g|f|e|
;;
;;   rn * rm = |cg + 2^32(ch + dg)|ae + 2^32(af + be)|
;;
;; The emitted sequence is:
;;   rev64 rd.4s, rm.4s
;;   mul   rd.4s, rd.4s, rn.4s
;;   xtn   tmp1.2s, rn.2d
;;   addp  rd.4s, rd.4s, rd.4s
;;   xtn   tmp2.2s, rm.2d
;;   shll  rd.2d, rd.2s, #32
;;   umlal rd.2d, tmp2.2s, tmp1.2s
(rule -1 (lower (has_type $I64X2 (imul _ x y)))
      (let ((rn Reg x)
            (rm Reg y)
            ;; Swap the 32-bit elements inside each 64-bit word of `rm`.
            ;;   rd = |g|h|e|f|
            (swapped Reg (rev64 rm (VectorSize.Size32x4)))

            ;; Cross products that feed the high halves.
            ;;   rd = |dg|ch|be|af|
            ;;
            ;; This 32-bit multiply drops the bits that would overflow,
            ;; same as if 64-bit operations were used; the `shll` below
            ;; would shift those bits out anyway.
            (cross Reg (mul swapped rn (VectorSize.Size32x4)))

            ;; Low 32-bit halves of `rn`.
            ;;   tmp1 = |c|a|
            (rn_lo Reg (xtn rn (ScalarSize.Size32)))

            ;; Pairwise-add the cross products.
            ;;   rd = |dg+ch|be+af||dg+ch|be+af|
            (cross_sum Reg (addp cross cross (VectorSize.Size32x4)))

            ;; Low 32-bit halves of `rm`.
            ;;   tmp2 = |g|e|
            (rm_lo Reg (xtn rm (ScalarSize.Size32)))

            ;; Move the summed cross products into the high halves.
            ;;   rd = |dg+ch << 32|be+af << 32|
            (hi_part Reg (shll32 cross_sum false))

            ;; Multiply the low halves and accumulate onto the high part.
            ;;   rd = |rd[1] + cg|rd[0] + ae|
            (result Reg (umlal32 hi_part rm_lo rn_lo false)))
        result))

;; The rules below fuse a multiply of two matching widen operations into one
;; widening-multiply helper. The trailing boolean is `false` for the
;; `*widen_low` forms and `true` for the `*widen_high` forms.

;; Special case for `i16x8.extmul_low_i8x16_s`.
(rule (lower (has_type $I16X8
                       (imul _ (swiden_low _ a @ (value_type $I8X16))
                               (swiden_low _ b @ (value_type $I8X16)))))
      (smull8 a b false))

;; Special case for `i16x8.extmul_high_i8x16_s`.
(rule (lower (has_type $I16X8
                       (imul _ (swiden_high _ a @ (value_type $I8X16))
                               (swiden_high _ b @ (value_type $I8X16)))))
      (smull8 a b true))

;; Special case for `i16x8.extmul_low_i8x16_u`.
(rule (lower (has_type $I16X8
                       (imul _ (uwiden_low _ a @ (value_type $I8X16))
                               (uwiden_low _ b @ (value_type $I8X16)))))
      (umull8 a b false))

;; Special case for `i16x8.extmul_high_i8x16_u`.
(rule (lower (has_type $I16X8
                       (imul _ (uwiden_high _ a @ (value_type $I8X16))
                               (uwiden_high _ b @ (value_type $I8X16)))))
      (umull8 a b true))

;; Special case for `i32x4.extmul_low_i16x8_s`.
(rule (lower (has_type $I32X4
                       (imul _ (swiden_low _ a @ (value_type $I16X8))
                               (swiden_low _ b @ (value_type $I16X8)))))
      (smull16 a b false))

;; Special case for `i32x4.extmul_high_i16x8_s`.
(rule (lower (has_type $I32X4
                       (imul _ (swiden_high _ a @ (value_type $I16X8))
                               (swiden_high _ b @ (value_type $I16X8)))))
      (smull16 a b true))

;; Special case for `i32x4.extmul_low_i16x8_u`.
(rule (lower (has_type $I32X4
                       (imul _ (uwiden_low _ a @ (value_type $I16X8))
                               (uwiden_low _ b @ (value_type $I16X8)))))
      (umull16 a b false))

;; Special case for `i32x4.extmul_high_i16x8_u`.
(rule (lower (has_type $I32X4
                       (imul _ (uwiden_high _ a @ (value_type $I16X8))
                               (uwiden_high _ b @ (value_type $I16X8)))))
      (umull16 a b true))

;; Special case for `i64x2.extmul_low_i32x4_s`.
(rule (lower (has_type $I64X2
                       (imul _ (swiden_low _ a @ (value_type $I32X4))
                               (swiden_low _ b @ (value_type $I32X4)))))
      (smull32 a b false))

;; Special case for `i64x2.extmul_high_i32x4_s`.
(rule (lower (has_type $I64X2
                       (imul _ (swiden_high _ a @ (value_type $I32X4))
                               (swiden_high _ b @ (value_type $I32X4)))))
      (smull32 a b true))

;; Special case for `i64x2.extmul_low_i32x4_u`.
(rule (lower (has_type $I64X2
                       (imul _ (uwiden_low _ a @ (value_type $I32X4))
                               (uwiden_low _ b @ (value_type $I32X4)))))
      (umull32 a b false))

;; Special case for `i64x2.extmul_high_i32x4_u`.
(rule (lower (has_type $I64X2
                       (imul _ (uwiden_high _ a @ (value_type $I32X4))
                               (uwiden_high _ b @ (value_type $I32X4)))))
      (umull32 a b true))

;;;; Rules for `smulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 64-bit: a single `smulh`.
(rule 1 (lower (has_type $I64 (smulhi _ x y)))
      (smulh $I64 x y))

;; 32 bits and narrower: sign-extend both operands to 64 bits, multiply, then
;; arithmetic-shift the product right by the width of `ty`.
(rule (lower (has_type (fits_in_32 ty) (smulhi _ x y)))
      (let ((lhs64 Reg (put_in_reg_sext64 x))
            (rhs64 Reg (put_in_reg_sext64 y))
            (product Reg (madd $I64 lhs64 rhs64 (zero_reg)))
            (hi Reg (asr_imm $I64 product (imm_shift_from_u8 (ty_bits ty)))))
        hi))

;;;; Rules for `umulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 64-bit: a single `umulh`.
(rule 1 (lower (has_type $I64 (umulhi _ x y)))
      (umulh $I64 x y))

;; 32 bits and narrower: zero-extend both operands to 64 bits, multiply, then
;; logical-shift the product right by the width of `ty`.
(rule (lower (has_type (fits_in_32 ty) (umulhi _ x y)))
      (let ((lhs64 Reg (put_in_reg_zext64 x))
            (rhs64 Reg (put_in_reg_zext64 y))
            (product Reg (madd $I64 lhs64 rhs64 (zero_reg)))
            (hi Reg (lsr_imm $I64 product (imm_shift_from_u8 (ty_bits ty)))))
        (value_reg hi)))

;;;; Rules for `udiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The kind of extension to apply to a value: signed or unsigned.
(type ExtType
  (enum
    (Signed)
    (Unsigned)))

;; Helper for placing a `Value` into a `Reg` and validating that it's nonzero.
;; It takes a value, an extension type and the value's type, and performs the
;; appropriate checks.
;; TODO: restore spec
; (spec (put_nonzero_in_reg_sext64 x)
;       (provide (= (sign_ext 64 x) result))
;       (require (not (= #x0000000000000000 result))))
(decl put_nonzero_in_reg (Value ExtType Type) Reg)

;; A constant that is known to be nonzero needs no runtime check; it is
;; materialized directly as an immediate.

;; Nonzero constant, zero-extended.
(rule (put_nonzero_in_reg (iconst _ (nonzero_u64_from_imm64 n)) (ExtType.Unsigned) ty)
      (imm ty (ImmExtend.Zero) n))

;; Nonzero constant, sign-extended.
(rule (put_nonzero_in_reg (iconst _ (nonzero_u64_from_imm64 n)) (ExtType.Signed) ty)
      (imm ty (ImmExtend.Sign) n))

;; Otherwise the value goes into a register and is passed through
;; `trap_if_zero_divisor`. 64-bit values are used as-is for either extension
;; type.
(rule -1 (put_nonzero_in_reg divisor _ $I64)
      (trap_if_zero_divisor (put_in_reg divisor) (operand_size $I64)))

;; Values of 32 bits or fewer are first extended to 32 bits according to the
;; requested extension type, then checked at 32-bit operand size.
(rule -2 (put_nonzero_in_reg divisor (ExtType.Signed) (fits_in_32 _))
      (trap_if_zero_divisor (put_in_reg_sext32 divisor) (operand_size $I32)))

(rule -2 (put_nonzero_in_reg divisor (ExtType.Unsigned) (fits_in_32 _))
      (trap_if_zero_divisor (put_in_reg_zext32 divisor) (operand_size $I32)))

;; Note that aarch64's `udiv` doesn't trap so to respect the semantics of
;; CLIF's `udiv` the check for zero needs to be manually performed.

(rule udiv 1 (lower (has_type $I64 (udiv _ dividend divisor)))
      (a64_udiv $I64
                (put_in_reg dividend)
                (put_nonzero_in_reg divisor (ExtType.Unsigned) $I64)))

(rule udiv (lower (has_type (fits_in_32 ty) (udiv _ dividend divisor)))
      (a64_udiv $I32
                (put_in_reg_zext32 dividend)
                (put_nonzero_in_reg divisor (ExtType.Unsigned) ty)))

;;;; Rules for `sdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; TODO: Add SDiv32 to implement 32-bit directly, rather
;; than extending the input.
;;
;; For 32 or 64 bits the sequence of checks should look like this:
;;
;;   cbnz rm, #8
;;   udf ; divide by zero
;;   cmn rm, 1
;;   ccmp rn, 1, #nzcv, eq
;;   b.vc #8
;;   udf ; signed overflow
;;
;; In the narrow 8 or 16 bit case, we need to insert an additional left-shift
;; to check for the minimum value using the 32-bit ccmp instruction.
;;
;; Note: the div instruction does not trap on divide by zero or overflow, so
;; the checks need to be manually inserted.
;;
;; TODO: if `y` is -1 then a check that `x` is not INT_MIN is all that's
;; necessary, but right now `y` is checked to not be -1 as well.

;; 64-bit signed division with zero-divisor and overflow checks.
(rule sdiv_base_case (lower (has_type $I64 (sdiv _ x y)))
      (let ((dividend Reg (put_in_reg_sext64 x))
            (divisor Reg (put_nonzero_in_reg y (ExtType.Signed) $I64))
            (min_probe Reg (intmin_check $I64 dividend))
            (checked_dividend Reg (trap_if_div_overflow $I64 min_probe dividend divisor))
            (quotient Reg (a64_sdiv $I64 checked_dividend divisor)))
        quotient))

;; 32 bits and narrower: the same steps on operands sign-extended to 32 bits.
(rule sdiv_base_case -1 (lower (has_type (fits_in_32 ty) (sdiv _ x y)))
      (let ((dividend Reg (put_in_reg_sext32 x))
            (divisor Reg (put_nonzero_in_reg y (ExtType.Signed) ty))
            (min_probe Reg (intmin_check ty dividend))
            (checked_dividend Reg (trap_if_div_overflow ty min_probe dividend divisor))
            (quotient Reg (a64_sdiv ty checked_dividend divisor)))
        quotient))

;; Special case for `sdiv` where no checks are needed due to division by a
;; constant meaning the checks are always passed.
(rule sdiv_safe_divisor 2 (lower (has_type $I64 (sdiv _ x (iconst _ k))))
      (if-let divisor (safe_divisor_from_imm64 $I64 k))
      (a64_sdiv $I64
                (put_in_reg_sext64 x)
                (imm $I64 (ImmExtend.Sign) divisor)))

(rule sdiv_safe_divisor 1 (lower (has_type (fits_in_32 ty) (sdiv _ x (iconst _ k))))
      (if-let divisor (safe_divisor_from_imm64 ty k))
      (a64_sdiv ty
                (put_in_reg_sext32 x)
                (imm ty (ImmExtend.Sign) divisor)))

;;;; Rules for `urem` and `srem` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Remainder (x % y) is implemented as:
;;
;;   tmp = x / y
;;   result = x - (tmp*y)
;;
;; use 'result' for tmp and you have:
;;
;;   cbnz y, #8         ; branch over trap
;;   udf                ; divide by zero
;;   div rd, x, y       ; rd = x / y
;;   msub rd, rd, y, x  ; rd = x - rd * y

;; TODO: we can avoid a 0 check, if the divisor is a non-0 constant.
;; NOTE(review): this originally said "dividend", but the zero check guards
;; the divisor `y`. `put_nonzero_in_reg` already has `iconst` rules for
;; nonzero constants, so this TODO may be stale - confirm.

;; 64-bit unsigned remainder.
(rule urem (lower (has_type $I64 (urem _ x y)))
      (let ((dividend Reg (put_in_reg_zext64 x))
            (divisor Reg (put_nonzero_in_reg y (ExtType.Unsigned) $I64))
            (quotient Reg (a64_udiv $I64 dividend divisor))
            (remainder Reg (msub $I64 quotient divisor dividend)))
        remainder))

;; Unsigned remainder for 32 bits and narrower; the dividend is zero-extended
;; to 32 bits.
(rule urem -1 (lower (has_type (fits_in_32 ty) (urem _ x y)))
      (let ((dividend Reg (put_in_reg_zext32 x))
            (divisor Reg (put_nonzero_in_reg y (ExtType.Unsigned) ty))
            (quotient Reg (a64_udiv ty dividend divisor))
            (remainder Reg (msub ty quotient divisor dividend)))
        remainder))

;; 64-bit signed remainder.
(rule srem (lower (has_type $I64 (srem _ x y)))
      (let ((dividend Reg (put_in_reg_sext64 x))
            (divisor Reg (put_nonzero_in_reg y (ExtType.Signed) $I64))
            (quotient Reg (a64_sdiv $I64 dividend divisor))
            (remainder Reg (msub $I64 quotient divisor dividend)))
        remainder))

;; Signed remainder for 32 bits and narrower; the dividend is sign-extended
;; to 32 bits.
(rule srem -1 (lower (has_type (fits_in_32 ty) (srem _ x y)))
      (let ((dividend Reg (put_in_reg_sext32 x))
            (divisor Reg (put_nonzero_in_reg y (ExtType.Signed) ty))
            (quotient Reg (a64_sdiv ty dividend divisor))
            (remainder Reg (msub ty quotient divisor dividend)))
        remainder))

;;; Rules for integer min/max: umin, smin, umax, smax ;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.

;; `cmp_and_choose` compares its two operands and selects one of them with
;; the given condition code:
;;
;;   cmp $x, $y
;;   csel .., $x, $y, $cc
(spec (cmp_and_choose ty cc signed x y)
      (provide
        (= result
           (switch cc
             (#x03 (if (bvule x y) x y))
             (#x08 (if (bvuge x y) x y))
             (#x0b (if (bvsle x y) x y))
             (#x0c (if (bvsge x y) x y)))))
      (require
        (or (= ty 8)
            (= ty 16)
            (= ty 32)
            (= ty 64))
        (or (= cc #x03)
            (= cc #x08)
            (= cc #x0b)
            (= cc #x0c))
        (if signed (or (= cc #x0b) (= cc #x0c))
                   (or (= cc #x03) (= cc #x08)))))
(decl cmp_and_choose (Type Cond bool Value Value) ValueRegs)

;; General case: compare the operands as they are.
(rule (cmp_and_choose (fits_in_64 ty) cc _ x y)
      (let ((lhs Reg (put_in_reg x))
            (rhs Reg (put_in_reg y)))
        (with_flags_reg (cmp (operand_size ty) lhs rhs)
                        (csel cc lhs rhs))))

;; `i16` and `i8` min/max first extend both operands to 32 bits, because the
;; comparison operates on (at least) 32 bits. The `signed` flag is passed to
;; `extend` to pick the kind of extension.
(rule 1 (cmp_and_choose (fits_in_16 ty) cc signed x y)
      (let ((lhs Reg (extend (put_in_reg x) signed (ty_bits ty) 32))
            (rhs Reg (extend (put_in_reg y) signed (ty_bits ty) 32)))
        (with_flags_reg (cmp (operand_size ty) lhs rhs)
                        (csel cc lhs rhs))))

;; Scalar integer min/max: each opcode supplies its condition code and
;; signedness flag to `cmp_and_choose`.
(rule umin 2 (lower (has_type (and (fits_in_64 ty) (ty_int _)) (umin _ x y)))
      (cmp_and_choose ty (Cond.Lo) false x y))
(rule smin 2 (lower (has_type (and (fits_in_64 ty) (ty_int _)) (smin _ x y)))
      (cmp_and_choose ty (Cond.Lt) true x y))
(rule umax 2 (lower (has_type (and (fits_in_64 ty) (ty_int _)) (umax _ x y)))
      (cmp_and_choose ty (Cond.Hi) false x y))
(rule smax 2 (lower (has_type (and (fits_in_64 ty) (ty_int _)) (smax _ x y)))
      (cmp_and_choose ty (Cond.Gt) true x y))

;; Vector types.
(rule (lower (has_type ty @ (not_i64x2) (smin _ x y)))
      (vec_rrr (VecALUOp.Smin) x y (vector_size ty)))

;; Every `$I64X2` rule in this group is lowered as a compare (`Cmgt` for
;; signed, `Cmhi` for unsigned) whose result is handed to `bsl` together with
;; `x` and `y`. The min forms compare `y` against `x`; the max forms compare
;; `x` against `y`.
(rule 1 (lower (has_type $I64X2 (smin _ x y)))
      (let ((mask Reg (vec_rrr (VecALUOp.Cmgt) y x (VectorSize.Size64x2))))
        (bsl $I64X2 mask x y)))

(rule (lower (has_type ty @ (not_i64x2) (umin _ x y)))
      (vec_rrr (VecALUOp.Umin) x y (vector_size ty)))

(rule 1 (lower (has_type $I64X2 (umin _ x y)))
      (let ((mask Reg (vec_rrr (VecALUOp.Cmhi) y x (VectorSize.Size64x2))))
        (bsl $I64X2 mask x y)))

(rule (lower (has_type ty @ (not_i64x2) (smax _ x y)))
      (vec_rrr (VecALUOp.Smax) x y (vector_size ty)))

(rule 1 (lower (has_type $I64X2 (smax _ x y)))
      (let ((mask Reg (vec_rrr (VecALUOp.Cmgt) x y (VectorSize.Size64x2))))
        (bsl $I64X2 mask x y)))

(rule (lower (has_type ty @ (not_i64x2) (umax _ x y)))
      (vec_rrr (VecALUOp.Umax) x y (vector_size ty)))

(rule 1 (lower (has_type $I64X2 (umax _ x y)))
      (let ((mask Reg (vec_rrr (VecALUOp.Cmhi) x y (VectorSize.Size64x2))))
        (bsl $I64X2 mask x y)))

;;;; Rules for `uextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; General rule: extend the input, unsigned, from its own bit width to the
;; width of an output that fits in a single register.
(rule uextend -2
      (lower (has_type (fits_in_64 out) (uextend _ src @ (value_type in))))
      (extend src false (ty_bits in) (ty_bits out)))

;; Extraction of a vector lane automatically extends as necessary, so the
;; lane move is emitted directly with no separate extending instruction.
(rule 1 (lower (has_type (fits_in_64 out)
                         (uextend _ (extractlane _ vec @ (value_type in)
                                                 (u8_from_uimm8 lane)))))
      (mov_from_vec (put_in_reg vec) lane (lane_size in)))

;; Atomic loads will also automatically zero their upper bits so the `uextend`
;; instruction can effectively get skipped here.
(rule 1 (lower (has_type (fits_in_64 out)
                         (uextend _ src @ (and (value_type in)
                                               (atomic_load _ (little_or_native_endian flags) _)))))
      (if-let mem_op (is_sinkable_inst src))
      (load_acquire in flags (sink_atomic_load mem_op)))

;; Extending to 128 bits: the low half is the input zero-extended to 64 bits
;; and the high half is a constant zero.
(rule -1 (lower (has_type $I128 (uextend _ src)))
      (let ((lo Reg (put_in_reg_zext64 src))
            (hi Reg (imm $I64 (ImmExtend.Zero) 0)))
        (value_regs lo hi)))

;; As with the single-register case, a vector lane extraction already
;; zero-extends, so extending it to i128 only needs a 0 constant for the
;; upper bits.
(rule (lower (has_type $I128
                       (uextend _ (extractlane _ vec @ (value_type in)
                                               (u8_from_uimm8 lane)))))
      (let ((lo Reg (mov_from_vec (put_in_reg vec) lane (lane_size in)))
            (hi Reg (imm $I64 (ImmExtend.Zero) 0)))
        (value_regs lo hi)))

;; A zero-extension of a sinkable load is encoded in the load itself: the
;; load is sunk, then re-emitted as an unsigned load of the loaded type.
(rule (lower (has_type (fits_in_64 _)
                       (uextend _ src @ (has_type in_ty
                                                  (load _ (little_or_native_endian flags) address offset)))))
      (if-let inst (is_sinkable_inst src))
      (let ((_ Unit (sink_inst inst)))
        (aarch64_uload in_ty (amode in_ty address offset) flags)))

;; Picks the unsigned load helper that matches the loaded type.
(decl aarch64_uload (Type AMode MemFlags) Reg)
(rule (aarch64_uload $I8 amode flags)
      (aarch64_uload8 amode flags))
(rule (aarch64_uload $I16 amode flags)
      (aarch64_uload16 amode flags))
(rule (aarch64_uload $I32 amode flags)
      (aarch64_uload32 amode flags))

;;;; Rules for `sextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; General rule: extend the input, signed, from its own bit width to the
;; width of an output that fits in a single register.
(rule sextend -4
      (lower (has_type (fits_in_64 out) (sextend _ src @ (value_type in))))
      (extend src true (ty_bits in) (ty_bits out)))

;; Extraction of a vector lane automatically extends as necessary, so we can
;; skip an explicit extending instruction.
(rule -3 (lower (has_type (fits_in_64 out)
                          (sextend _ (extractlane _ vec @ (value_type in)
                                                  (u8_from_uimm8 lane)))))
      (mov_from_vec_signed (put_in_reg vec)
                           lane
                           (vector_size in)
                           (size_from_ty out)))

;; Extending to 128 bits: sign-extend the input to 64 bits for the low half,
;; then fill the high half with copies of its sign bit (arithmetic shift right
;; by 63).
(rule -2 (lower (has_type $I128 (sextend _ src)))
      (let ((lo Reg (put_in_reg_sext64 src))
            (hi Reg (asr_imm $I64 lo (imm_shift_from_u8 63))))
        (value_regs lo hi)))

;; Like above, a signed lane extraction produces the sign-extended low half
;; directly; the high half is then the low half arithmetically shifted right
;; by 63.
;;
;; Note that `mov_from_vec_signed` doesn't exist for i64x2, so that's
;; specifically excluded here.
(rule (lower (has_type $I128
                       (sextend _ (extractlane _ vec @ (value_type in @ (not_i64x2))
                                               (u8_from_uimm8 lane)))))
      (let ((lo Reg (mov_from_vec_signed (put_in_reg vec)
                                         lane
                                         (vector_size in)
                                         (size_from_ty $I64)))
            (hi Reg (asr_imm $I64 lo (imm_shift_from_u8 63))))
        (value_regs lo hi)))

;; Extension from an extraction of i64x2 into i128.
(rule -1 (lower (has_type $I128
                          (sextend _ (extractlane _ vec @ (value_type $I64X2)
                                                  (u8_from_uimm8 lane)))))
      (let ((low_half Reg (mov_from_vec (put_in_reg vec)
                                        lane
                                        (ScalarSize.Size64)))
            (high_half Reg (asr_imm $I64 low_half (imm_shift_from_u8 63))))
        (value_regs low_half high_half)))

;; A sign-extension of a sinkable load is encoded in the load itself: the
;; load is sunk, then re-emitted as a signed load of the loaded type.
(rule (lower (has_type (fits_in_64 _)
                       (sextend _ src @ (has_type in_ty
                                                  (load _ (little_or_native_endian flags) address offset)))))
      (if-let inst (is_sinkable_inst src))
      (let ((_ Unit (sink_inst inst)))
        (aarch64_sload in_ty (amode in_ty address offset) flags)))

;; Picks the signed load helper that matches the loaded type.
(decl aarch64_sload (Type AMode MemFlags) Reg)
(rule (aarch64_sload $I8 amode flags)
      (aarch64_sload8 amode flags))
(rule (aarch64_sload $I16 amode flags)
      (aarch64_sload16 amode flags))
(rule (aarch64_sload $I32 amode flags)
      (aarch64_sload32 amode flags))

;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Integer base case, using `orn` between two registers.
;;
;; Bitwise negation is expressed here as
;;
;;   NOT rd, rm ==> ORR_NOT rd, zero, rm
(rule bnot_base_case -4 (lower (has_type (fits_in_64 (ty_int ty)) (bnot _ val)))
      (orr_not ty (zero_reg) val))

;; `bnot` for scalar floats.
(rule -3 (lower (has_type (fits_in_64 (ty_scalar_float ty)) (bnot _ val)))
      (not val (float_vector_size_in_64 ty)))

;; `bnot` for 64-bit and 128-bit vector types.
(rule -2 (lower (has_type (ty_vec64 ty) (bnot _ val)))
      (not val (vector_size ty)))
(rule -1 (lower (has_type (ty_vec128 ty) (bnot _ val)))
      (not val (vector_size ty)))

;; Implementation of `bnot` for `i128`.
(rule (lower (has_type $I128 (bnot _ x)))
      (let ((x_regs ValueRegs x)
            (x_lo Reg (value_regs_get x_regs 0))
            (x_hi Reg (value_regs_get x_regs 1))
            ;; Each 64-bit half is inverted independently.
            (new_lo Reg (orr_not $I64 (zero_reg) x_lo))
            (new_hi Reg (orr_not $I64 (zero_reg) x_hi)))
        (value_regs new_lo new_hi)))

;; Special case to use `orr_not_shift` if it's a `bnot` of a const-left-shifted
;; value.
(rule bnot_ishl 1 (lower (has_type (fits_in_64 (ty_int ty))
                                   (bnot _ (ishl _ x (iconst _ k)))))
      (if-let amt (lshl_from_imm64 ty k))
      (orr_not_shift ty (zero_reg) x amt))

;; Special-cases for fusing a bnot with bxor.
;;
;; The single-register form is restricted to integer types with `ty_int`,
;; like every other rule in this file that lowers through the integer ALU
;; helpers. `fits_in_64` on its own also admits scalar floats and 64-bit
;; vectors; since this rule outranks the dedicated float (-3) and vector (-2)
;; `bnot` rules, it would otherwise send such values to the integer `EorNot`
;; operation.
(rule 2 (lower (has_type (fits_in_64 (ty_int ty)) (bnot _ (bxor _ x y))))
      (alu_rs_imm_logic (ALUOp.EorNot) ty x y))
(rule 3 (lower (has_type $I128 (bnot _ (bxor _ x y))))
      (i128_alu_bitop (ALUOp.EorNot) $I64 x y))

;;;; Rules for `band` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Integer `band` that fits in a single register.
(rule band_fits_in_64 -5 (lower (has_type (fits_in_64 (ty_int ty)) (band _ x y)))
      (alu_rs_imm_logic_commutative (ALUOp.And) ty x y))

;; `band` for scalar floats.
(rule -4 (lower (has_type (fits_in_64 (ty_scalar_float ty)) (band _ x y)))
      (and_vec x y (float_vector_size_in_64 ty)))

;; Implementation of `band` for vector types.
(rule -2 (lower (has_type (ty_vec64 ty) (band _ x y)))
      (and_vec x y (vector_size ty)))
(rule -1 (lower (has_type (ty_vec128 ty) (band _ x y)))
      (and_vec x y (vector_size ty)))

;; `band` for `i128`.
(rule (lower (has_type $I128 (band _ x y)))
      (i128_alu_bitop (ALUOp.And) $I64 x y))

;; Specialized lowerings for `(band x (bnot y))` which is additionally produced
;; by Cranelift's `band_not` instruction that is legalized into the simpler
;; forms early on.
(rule band_not_right 1
      (lower (has_type (fits_in_64 (ty_int ty)) (band _ val (bnot _ inv))))
      (alu_rs_imm_logic (ALUOp.AndNot) ty val inv))
(rule band_not_left 2
      (lower (has_type (fits_in_64 (ty_int ty)) (band _ (bnot _ inv) val)))
      (alu_rs_imm_logic (ALUOp.AndNot) ty val inv))

;; `i128`: the same fusion, with the negated operand on either side.
(rule 3 (lower (has_type $I128 (band _ val (bnot _ inv))))
      (i128_alu_bitop (ALUOp.AndNot) $I64 val inv))
(rule 4 (lower (has_type $I128 (band _ (bnot _ inv) val)))
      (i128_alu_bitop (ALUOp.AndNot) $I64 val inv))

;; 64-bit vectors use `bic_vec`.
(rule 5 (lower (has_type (ty_vec64 ty) (band _ val (bnot _ inv))))
      (bic_vec val inv (vector_size ty)))
(rule 6 (lower (has_type (ty_vec64 ty) (band _ (bnot _ inv) val)))
      (bic_vec val inv (vector_size ty)))

;; 128-bit vectors use `bic_vec`.
(rule 7 (lower (has_type (ty_vec128 ty) (band _ val (bnot _ inv))))
      (bic_vec val inv (vector_size ty)))
(rule 8 (lower (has_type (ty_vec128 ty) (band _ (bnot _ inv) val)))
      (bic_vec val inv (vector_size ty)))

;;;; Rules for `bor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Integer `bor` that fits in a single register.
(rule bor_fits_in_64 -4 (lower (has_type (fits_in_64 (ty_int ty)) (bor _ x y)))
      (alu_rs_imm_logic_commutative (ALUOp.Orr) ty x y))

;; `bor` for scalar floats.
(rule -3 (lower (has_type (fits_in_64 (ty_scalar_float ty)) (bor _ x y)))
      (orr_vec x y (float_vector_size_in_64 ty)))

;; Implementation of `bor` for vector types.
(rule -2 (lower (has_type (ty_vec64 ty) (bor _ x y)))
      (orr_vec x y (vector_size ty)))
(rule -1 (lower (has_type (ty_vec128 ty) (bor _ x y)))
      (orr_vec x y (vector_size ty)))

;; `bor` for `i128`.
(rule (lower (has_type $I128 (bor _ x y)))
      (i128_alu_bitop (ALUOp.Orr) $I64 x y))

;; Specialized lowerings for `(bor x (bnot y))` which is additionally produced
;; by Cranelift's `bor_not` instruction that is legalized into the simpler
;; forms early on.
(rule bor_not_right 1
      (lower (has_type (fits_in_64 (ty_int ty)) (bor _ val (bnot _ inv))))
      (alu_rs_imm_logic (ALUOp.OrrNot) ty val inv))
(rule bor_not_left 2
      (lower (has_type (fits_in_64 (ty_int ty)) (bor _ (bnot _ inv) val)))
      (alu_rs_imm_logic (ALUOp.OrrNot) ty val inv))

;; `i128`: the same fusion, with the negated operand on either side.
(rule 3 (lower (has_type $I128 (bor _ val (bnot _ inv))))
      (i128_alu_bitop (ALUOp.OrrNot) $I64 val inv))
(rule 4 (lower (has_type $I128 (bor _ (bnot _ inv) val)))
      (i128_alu_bitop (ALUOp.OrrNot) $I64 val inv))

;; 64-bit vectors use `orn_vec`.
(rule bor_not_right_vec64 5
      (lower (has_type (ty_vec64 ty) (bor _ val (bnot _ inv))))
      (orn_vec val inv (vector_size ty)))
(rule bor_not_left_vec64 6
      (lower (has_type (ty_vec64 ty) (bor _ (bnot _ inv) val)))
      (orn_vec val inv (vector_size ty)))

;; 128-bit vectors use `orn_vec`.
(rule bor_not_right_vec128 7
      (lower (has_type (ty_vec128 ty) (bor _ val (bnot _ inv))))
      (orn_vec val inv (vector_size ty)))
(rule bor_not_left_vec128 8
      (lower (has_type (ty_vec128 ty) (bor _ (bnot _ inv) val)))
      (orn_vec val inv (vector_size ty)))


;; Specialized lowerings to generate the `extr` instruction.
;;
;; The `extr` instruction forms the concatenation `a:b` and extracts either 32
;; or 64 bits from it, starting at an immediate index. It is pattern-matched
;; here as a `bor` of the high/low halves of two values shifted around.
;;
;; The immediate given to `extr` is the N of the shift-right. Two patterns are
;; used so that either operand order of the `bor` is detected.
;;
;;   (x << xs) | (y >> ys) if (xs + ys == widthof(ty)) => extr(x, y, ys)
;;
;; Note that both `xs` and `ys` must be larger than 0. If either one is 0 and
;; they sum to the type width then it means that the shifts don't actually do
;; anything CLIF-wise and this should compile down to a `bor` operation. Leave
;; that edge case to the mid-end and only lower to `extr` here.
(rule 5 (lower (has_type (ty_32_or_64 ty)
                         (bor _ (ishl _ hi_src (u8_from_iconst shl_amt))
                                (ushr _ lo_src (u8_from_iconst shr_amt)))))
      (if-let true (u64_eq (ty_bits ty) (u64_wrapping_add shl_amt shr_amt)))
      (if-let true (u64_gt shl_amt 0))
      (if-let true (u64_gt shr_amt 0))
      (a64_extr ty hi_src lo_src (imm_shift_from_u8 shr_amt)))
(rule 5 (lower (has_type (ty_32_or_64 ty)
                         (bor _ (ushr _ lo_src (u8_from_iconst shr_amt))
                                (ishl _ hi_src (u8_from_iconst shl_amt)))))
      (if-let true (u64_eq (ty_bits ty) (u64_wrapping_add shl_amt shr_amt)))
      (if-let true (u64_gt shl_amt 0))
      (if-let true (u64_gt shr_amt 0))
      (a64_extr ty hi_src lo_src (imm_shift_from_u8 shr_amt)))

;;;; Rules for `bxor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Integer `bxor` that fits in a single register.
(rule bxor_fits_in_64 -4 (lower (has_type (fits_in_64 (ty_int ty)) (bxor _ x y)))
      (alu_rs_imm_logic_commutative (ALUOp.Eor) ty x y))

;; `bxor` for scalar floats.
(rule -3 (lower (has_type (fits_in_64 (ty_scalar_float ty)) (bxor _ x y)))
      (eor_vec x y (float_vector_size_in_64 ty)))

;; Implementation of `bxor` for vector types.
(rule -2 (lower (has_type (ty_vec64 ty) (bxor _ x y)))
      (eor_vec x y (vector_size ty)))
(rule -1 (lower (has_type (ty_vec128 ty) (bxor _ x y)))
      (eor_vec x y (vector_size ty)))

;; `bxor` for `i128`.
(rule (lower (has_type $I128 (bxor _ x y)))
      (i128_alu_bitop (ALUOp.Eor) $I64 x y))

;; Specialized lowerings for `(bxor x (bnot y))` which is additionally produced
;; by Cranelift's `bxor_not` instruction that is legalized into the simpler
;; forms early on.
(rule bxor_not_right 1
      (lower (has_type (fits_in_64 (ty_int ty)) (bxor _ val (bnot _ inv))))
      (alu_rs_imm_logic (ALUOp.EorNot) ty val inv))
(rule bxor_not_left 2
      (lower (has_type (fits_in_64 (ty_int ty)) (bxor _ (bnot _ inv) val)))
      (alu_rs_imm_logic (ALUOp.EorNot) ty val inv))

;; `i128`: the same fusion, with the negated operand on either side.
(rule 3 (lower (has_type $I128 (bxor _ val (bnot _ inv))))
      (i128_alu_bitop (ALUOp.EorNot) $I64 val inv))
(rule 4 (lower (has_type $I128 (bxor _ (bnot _ inv) val)))
      (i128_alu_bitop (ALUOp.EorNot) $I64 val inv))

;;;; Rules for `ishl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Shift for i8/i16/i32.
(rule ishl_fits_in_32 -1 (lower (has_type (fits_in_32 ty) (ishl _ val amount)))
      (do_shift (ALUOp.Lsl) ty val amount))

;; Shift for i64.
(rule ishl_64 (lower (has_type $I64 (ishl _ val amount)))
      (do_shift (ALUOp.Lsl) $I64 val amount))

;; Shift for i128. Only the first register of the shift amount is used.
(rule (lower (has_type $I128 (ishl _ val amount)))
      (lower_shl128 val (value_regs_get amount 0)))

;; 128-bit left shift of the register pair `src` by the amount in `amt`.
;; The emitted sequence is:
;;
;;   lsl     lo_lshift, src_lo, amt
;;   lsl     hi_lshift, src_hi, amt
;;   mvn     inv_amt, amt
;;   lsr     lo_rshift, src_lo, #1
;;   lsr     lo_rshift, lo_rshift, inv_amt
;;   orr     maybe_hi, hi_lshift, lo_rshift
;;   tst     amt, #0x40
;;   csel    dst_hi, lo_lshift, maybe_hi, ne
;;   csel    dst_lo, xzr, lo_lshift, ne
(decl lower_shl128 (ValueRegs Reg) ValueRegs)
(rule (lower_shl128 src amt)
      (let ((src_lo Reg (value_regs_get src 0))
            (src_hi Reg (value_regs_get src 1))
            ;; Both halves shifted left by the requested amount.
            (lo_lshift Reg (lsl $I64 src_lo amt))
            (hi_lshift Reg (lsl $I64 src_hi amt))
            ;; Low half shifted right by 1, then by the inverted amount; this
            ;; is OR-ed into the high half below.
            (inv_amt Reg (orr_not $I32 (zero_reg) amt))
            (lo_rshift Reg (lsr $I64
                                (lsr_imm $I64 src_lo (imm_shift_from_u8 1))
                                inv_amt))
            (maybe_hi Reg (orr $I64 hi_lshift lo_rshift)))
        ;; Test the 0x40 bit of the amount. If it is set, the low result is
        ;; zero and the high result is the shifted low half; otherwise the
        ;; results are `lo_lshift` and `maybe_hi`.
        (with_flags
         (tst_imm $I64 amt (u64_into_imm_logic $I64 64))
         (consumes_flags_concat
          (csel (Cond.Ne) (zero_reg) lo_lshift)
          (csel (Cond.Ne) lo_lshift maybe_hi)))))

;; Shift for vector types.
;; Variable shift amount: the amount is masked with `shift_mask`, duplicated
;; with `vec_dup`, and the shift itself is emitted via `sshl`.
(rule -3 (lower (has_type (ty_vec128 ty) (ishl _ x y)))
      (let ((size VectorSize (vector_size ty))
            (masked_shift_amt Reg (and_imm $I32 y (shift_mask ty)))
            (shift Reg (vec_dup masked_shift_amt size)))
        (sshl x shift size)))
;; Constant shift amount: the immediate is run through `shift_masked_imm` and
;; the shift is emitted via `ushl_vec_imm`.
(rule -2 (lower (has_type (ty_vec128 ty) (ishl _ x (iconst _ (u64_from_imm64 n)))))
      (ushl_vec_imm x (shift_masked_imm ty n) (vector_size ty)))

;; Externally implemented pure helper taking a type and a `u64` amount and
;; yielding a `u8`.
;; NOTE(review): presumably masks the amount to the lane width of the type;
;; confirm against the extern implementation.
(decl pure shift_masked_imm (Type u64) u8)
(extern constructor shift_masked_imm shift_masked_imm)

;; Helper function to emit a shift operation with the opcode specified and
;; the output type specified. The `Reg` provided is shifted by the `Value`
;; given.
;;
;; Note that this automatically handles the clif semantics of masking the
;; shift amount where necessary.
;;
;; The spec below models the result for `Lsr`/`Asr`/`Lsl` as the matching
;; bitvector shift of `a` by `b` masked with `(widthof b) - 1`. It requires
;; `t` to equal the width of `b` and be one of 8/16/32/64; for 8- and 16-bit
;; `Lsr`/`Asr` it additionally requires the low 32 bits of `a` to already be
;; the zero-/sign-extension of the narrow value.
(spec (do_shift op t a b)
    (provide
      (= result
         (switch op
          ((ALUOp.Lsr) (conv_to 64
            (bvlshr (conv_to t a)
            (conv_to t (zero_ext 64
              (bvand (conv_to (widthof b) (bvsub (int2bv 64 (widthof b)) #x0000000000000001)) b))))))
          ((ALUOp.Asr) (conv_to 64
            (bvashr (conv_to t a)
            (conv_to t (zero_ext 64
              (bvand (conv_to (widthof b) (bvsub (int2bv 64 (widthof b)) #x0000000000000001)) b))))))
          ((ALUOp.Lsl) (conv_to 64
            (bvshl (conv_to t a)
            (conv_to t (zero_ext 64
              (bvand (conv_to (widthof b) (bvsub (int2bv 64 (widthof b)) #x0000000000000001)) b)))))))))
    (require
      (or (= op (ALUOp.Lsr)) (= op (ALUOp.Asr)) (= op (ALUOp.Lsl)))
      (= t (widthof b))
      (or (= t 8) (= t 16) (= t 32) (= t 64))
      (switch op
        ((ALUOp.Lsr) (switch t
          (8 (= (extract 31 0 a) (zero_ext 32 (extract 7 0 a))))
          (16 (= (extract 31 0 a) (zero_ext 32 (extract 15 0 a))))
          (32 true)
          (64 true)))
        ((ALUOp.Asr) (switch t
          (8 (= (extract 31 0 a) (sign_ext 32 (extract 7 0 a))))
          (16 (= (extract 31 0 a) (sign_ext 32 (extract 15 0 a))))
          (32 true)
          (64 true)))
        ((ALUOp.Lsl) true))))
;; One instantiation per shift-amount width (8/16/32/64 bits).
(instantiate do_shift
    ((args (bv 8) Int (bv 64) (bv 8)) (ret (bv 64)) (canon (bv 8)))
    ((args (bv 8) Int (bv 64) (bv 16)) (ret (bv 64)) (canon (bv 16)))
    ((args (bv 8) Int (bv 64) (bv 32)) (ret (bv 64)) (canon (bv 32)))
    ((args (bv 8) Int (bv 64) (bv 64)) (ret (bv 64)) (canon (bv 64)))
)
(decl do_shift (ALUOp Type Reg Value) Reg)

;; 8/16-bit shift base case.
;;
;; When shifting for amounts larger than the size of the type, the CLIF shift
;; instructions implement a "wrapping" behaviour, such that an i8 << 8 is
;; equivalent to i8 << 0
;;
;; On i32 and i64 types this matches what the aarch64 spec does, but on smaller
;; types (i16, i8) we need to do this manually, so we wrap the shift amount
;; with an AND instruction
(rule do_shift_fits_in_16 -1 (do_shift op (fits_in_16 ty) x y)
      (let ((shift_amt Reg (value_regs_get y 0))
            (masked_shift_amt Reg (and_imm $I32 shift_amt (shift_mask ty))))
        ;; The operation itself is always emitted at 32-bit width.
        (alu_rrr op $I32 x masked_shift_amt)))

;; Externally implemented helper producing the `ImmLogic` mask for a type; the
;; spec states the result equals the type's bit width minus one.
(spec (shift_mask t)
    (provide (= (bvsub (int2bv 64 t) #x0000000000000001) result)))
(decl shift_mask (Type) ImmLogic)
(extern constructor shift_mask shift_mask)

;; 32/64-bit shift base cases: the first register of the amount is used
;; directly, with no explicit masking.
(rule do_shift_32_base_case (do_shift op $I32 x y) (alu_rrr op $I32 x (value_regs_get y 0)))
(rule do_shift_64_base_case (do_shift op $I64 x y) (alu_rrr op $I64 x (value_regs_get y 0)))

;; Special case for shifting by a constant value where the value can fit into an
;; `ImmShift`.
;;
;; Note that this rule explicitly has a higher priority than the others
;; to ensure it's attempted first, otherwise the type-based filters on the
;; previous rules seem to take priority over this rule.
(rule do_shift_imm 1 (do_shift op ty src (iconst _ raw_amt))
      (if-let imm_amt (imm_shift_from_imm64 ty raw_amt))
      (alu_rr_imm_shift op ty src imm_amt))

;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Shift for i8/i16/i32. The input is zero-extended to 32 bits first.
(rule ushr_fits_in_32 -1 (lower (has_type (fits_in_32 ty) (ushr _ val amt)))
      (do_shift (ALUOp.Lsr) ty (put_in_reg_zext32 val) amt))

;; Shift for i64.
(rule ushr_64 (lower (has_type $I64 (ushr _ val amt)))
      (do_shift (ALUOp.Lsr) $I64 (put_in_reg_zext64 val) amt))

;; Shift for i128. Only the first register of the shift amount is used.
(rule (lower (has_type $I128 (ushr _ val amt)))
      (lower_ushr128 val (value_regs_get amt 0)))

;; Vector shifts.
;;
;; The variable-amount form masks the amount, negates it, and feeds it to
;; `ushl`. Note that for constant shifts a 0-width shift can't be emitted so
;; it's special cased to pass through the input as-is since a 0-shift doesn't
;; modify the input anyway.
(rule -4 (lower (has_type (ty_vec128 ty) (ushr _ val amt)))
      (let ((lanes VectorSize (vector_size ty))
            (amt_masked Reg (and_imm $I32 amt (shift_mask ty)))
            (amt_negated Reg (sub $I64 (zero_reg) amt_masked))
            (lane_amts Reg (vec_dup amt_negated lanes)))
        (ushl val lane_amts lanes)))
(rule -3 (lower (has_type (ty_vec128 ty) (ushr _ val (iconst _ (u64_from_imm64 raw_amt)))))
      (ushr_vec_imm val (shift_masked_imm ty raw_amt) (vector_size ty)))
(rule -2 (lower (has_type (ty_vec128 ty) (ushr _ val (iconst _ (u64_from_imm64 raw_amt)))))
      (if-let 0 (shift_masked_imm ty raw_amt))
      val)

;; 128-bit logical right shift of the register pair `src` by the amount in
;; `amt`. The sequence built below is:
;;
;;     lsr       lo_shr, lo, amt
;;     lsr       hi_shr, hi, amt
;;     mvn       not_amt, amt
;;     lsl       hi_shl1, hi, #1
;;     lsl       carry, hi_shl1, not_amt
;;     orr       lo_merged, lo_shr, carry
;;     tst       amt, #0x40
;;     csel      dst_hi, xzr, hi_shr, ne
;;     csel      dst_lo, hi_shr, lo_merged, ne
(decl lower_ushr128 (ValueRegs Reg) ValueRegs)
(rule (lower_ushr128 src amt)
      (let ((lo Reg (value_regs_get src 0))
            (hi Reg (value_regs_get src 1))
            (lo_shr Reg (lsr $I64 lo amt))
            (hi_shr Reg (lsr $I64 hi amt))
            (not_amt Reg (orr_not $I32 (zero_reg) amt))
            ;; Bits of the high half that cross into the low half: shift left
            ;; by one first, then by the inverted amount.
            (hi_shl1 Reg (lsl_imm $I64 hi (imm_shift_from_u8 1)))
            (carry Reg (lsl $I64 hi_shl1 not_amt))
            (lo_merged Reg (orr $I64 lo_shr carry)))
        ;; Bit 6 of `amt` (value 64) picks between the two candidate results
        ;; for each half.
        (with_flags
         (tst_imm $I64 amt (u64_into_imm_logic $I64 64))
         (consumes_flags_concat
          (csel (Cond.Ne) hi_shr lo_merged)
          (csel (Cond.Ne) (zero_reg) hi_shr)))))

;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Shift for i8/i16/i32. The input is sign-extended to 32 bits first.
(rule sshr_fits_in_32 -4 (lower (has_type (fits_in_32 ty) (sshr _ val amt)))
      (do_shift (ALUOp.Asr) ty (put_in_reg_sext32 val) amt))

;; Shift for i64.
(rule sshr_64 (lower (has_type $I64 (sshr _ val amt)))
      (do_shift (ALUOp.Asr) $I64 (put_in_reg_sext64 val) amt))

;; Shift for i128. Only the first register of the shift amount is used.
(rule (lower (has_type $I128 (sshr _ val amt)))
      (lower_sshr128 val (value_regs_get amt 0)))

;; Vector shifts.
;;
;; Note that right shifts are implemented with a negative left shift. Also note
;; that for constant shifts a 0-width shift can't be emitted so it's special
;; cased to pass through the input as-is since a 0-shift doesn't modify the
;; input anyway.
(rule -3 (lower (has_type (ty_vec128 ty) (sshr _ val amt)))
      (let ((lanes VectorSize (vector_size ty))
            (amt_masked Reg (and_imm $I32 amt (shift_mask ty)))
            (amt_negated Reg (sub $I64 (zero_reg) amt_masked))
            (lane_amts Reg (vec_dup amt_negated lanes)))
        (sshl val lane_amts lanes)))
(rule -2 (lower (has_type (ty_vec128 ty) (sshr _ val (iconst _ (u64_from_imm64 raw_amt)))))
      (sshr_vec_imm val (shift_masked_imm ty raw_amt) (vector_size ty)))
(rule -1 (lower (has_type (ty_vec128 ty) (sshr _ val (iconst _ (u64_from_imm64 raw_amt)))))
      (if-let 0 (shift_masked_imm ty raw_amt))
      val)

;; 128-bit arithmetic right shift of the register pair `src` by the amount in
;; `amt`. The sequence built below is:
;;
;;     lsr       lo_shr, lo, amt
;;     asr       hi_shr, hi, amt
;;     mvn       not_amt, amt
;;     lsl       hi_shl1, hi, #1
;;     lsl       carry, hi_shl1, not_amt
;;     asr       sign_fill, hi, #63
;;     orr       lo_merged, lo_shr, carry
;;     tst       amt, #0x40
;;     csel      dst_hi, sign_fill, hi_shr, ne
;;     csel      dst_lo, hi_shr, lo_merged, ne
(decl lower_sshr128 (ValueRegs Reg) ValueRegs)
(rule (lower_sshr128 src amt)
      (let ((lo Reg (value_regs_get src 0))
            (hi Reg (value_regs_get src 1))
            (lo_shr Reg (lsr $I64 lo amt))
            (hi_shr Reg (asr $I64 hi amt))
            (not_amt Reg (orr_not $I32 (zero_reg) amt))
            ;; Bits of the high half that cross into the low half: shift left
            ;; by one first, then by the inverted amount.
            (hi_shl1 Reg (lsl_imm $I64 hi (imm_shift_from_u8 1)))
            (carry Reg (lsl $I64 hi_shl1 not_amt))
            ;; High half shifted right arithmetically by 63.
            (sign_fill Reg (asr_imm $I64 hi (imm_shift_from_u8 63)))
            (lo_merged Reg (orr $I64 lo_shr carry)))
        ;; Bit 6 of `amt` (value 64) picks between the two candidate results
        ;; for each half.
        (with_flags
         (tst_imm $I64 amt (u64_into_imm_logic $I64 64))
         (consumes_flags_concat
          (csel (Cond.Ne) hi_shr lo_merged)
          (csel (Cond.Ne) sign_fill hi_shr)))))

;;;; Rules for `rotl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; General 8/16-bit case.
(rule rotl_fits_in_16 -2 (lower (has_type (fits_in_16 ty) (rotl _ val count)))
      (let ((count_reg Reg (value_regs_get count 0))
            (neg_count Reg (sub $I32 (zero_reg) count_reg)))
        (small_rotr ty (put_in_reg_zext32 val) neg_count)))

;; Specialization for the 8/16-bit case when the rotation amount is an immediate.
(rule rotl_fits_in_16_imm -1 (lower (has_type (fits_in_16 ty) (rotl _ val (iconst _ raw_count))))
      (if-let imm_count (imm_shift_from_imm64 ty raw_count))
      (small_rotr_imm ty (put_in_reg_zext32 val) (negate_imm_shift ty imm_count)))

;; aarch64 doesn't have a left-rotate instruction, but a left rotation of K
;; places is effectively a right rotation of N - K places, if N is the integer's
;; bit size. We implement left rotations with this trick.
;;
;; Note that when negating the rotation amount here the upper bits are ignored
;; by the rotr instruction, meaning that we'll still rotate left by the desired
;; amount.

;; General 32-bit case.
(rule rotl_32_base_case (lower (has_type $I32 (rotl _ val count)))
      (let ((count_reg Reg (value_regs_get count 0))
            (neg_count Reg (sub $I32 (zero_reg) count_reg)))
        (a64_rotr $I32 val neg_count)))

;; General 64-bit case.
(rule rotl_64_base_case (lower (has_type $I64 (rotl _ val count)))
      (let ((count_reg Reg (value_regs_get count 0))
            (neg_count Reg (sub $I64 (zero_reg) count_reg)))
        (a64_rotr $I64 val neg_count)))

;; Specialization for the 32-bit case when the rotation amount is an immediate.
(rule rotl_32_imm 1 (lower (has_type $I32 (rotl _ val (iconst _ raw_count))))
      (if-let imm_count (imm_shift_from_imm64 $I32 raw_count))
      (a64_rotr_imm $I32 val (negate_imm_shift $I32 imm_count)))

;; Specialization for the 64-bit case when the rotation amount is an immediate.
(rule rotl_64_imm 1 (lower (has_type $I64 (rotl _ x (iconst _ k))))
      (if-let n (imm_shift_from_imm64 $I64 k))
      (a64_rotr_imm $I64 x (negate_imm_shift $I64 n)))

;; Externally implemented helper that turns a left-rotate immediate into the
;; right-rotate immediate used by the rules above. The Rust implementation is
;; reproduced here for reference:
;;
;; fn negate_imm_shift(&mut self, ty: Type, mut imm: ImmShift) -> ImmShift {
;;     let size = u8::try_from(ty.bits()).unwrap();
;;     imm.imm = size.wrapping_sub(imm.value());
;;     imm.imm &= size - 1;
;;     imm
;; }
;;
;; The spec mirrors that code on 6-bit vectors: `(ty - x) & (ty - 1)`.
(spec (negate_imm_shift ty x)
  (provide
    (= result (bvand (bvsub (int2bv 6 ty) x) (bvsub (int2bv 6 ty) #b000001)))))
(decl negate_imm_shift (Type ImmShift) ImmShift)
(extern constructor negate_imm_shift negate_imm_shift)

;; General 128-bit case.
;;
;; Built as `(x << amt) | (x >> (128 - amt))` from the two 128-bit shift
;; helpers, OR-ing the low halves together and the high halves together.
;;
;; TODO: much better codegen is possible with a constant amount.
(rule (lower (has_type $I128 (rotl _ x y)))
      (let ((val ValueRegs x)
            (amt Reg (value_regs_get y 0))
            (neg_amt Reg (sub $I64 (imm $I64 (ImmExtend.Zero) 128) amt))
            (lshift ValueRegs (lower_shl128 val amt))
            (rshift ValueRegs (lower_ushr128 val neg_amt)))
        (value_regs
         (orr $I64 (value_regs_get lshift 0) (value_regs_get rshift 0))
         (orr $I64 (value_regs_get lshift 1) (value_regs_get rshift 1)))))

;;;; Rules for `rotr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; General 8/16-bit case. The input is zero-extended to 32 bits before being
;; handed to `small_rotr`.
(rule rotr_fits_in_16 -3 (lower (has_type (fits_in_16 ty) (rotr _ x y)))
      (small_rotr ty (put_in_reg_zext32 x) (value_regs_get y 0)))

;; General 32-bit case.
(rule rotr_32_base_case -1 (lower (has_type $I32 (rotr _ x y)))
      (a64_rotr $I32 x (value_regs_get y 0)))

;; General 64-bit case.
(rule rotr_64_base_case -1 (lower (has_type $I64 (rotr _ x y)))
      (a64_rotr $I64 x (value_regs_get y 0)))

;; Specialization for the 8/16-bit case when the rotation amount is an immediate.
(rule rotr_fits_in_16_imm -2 (lower (has_type (fits_in_16 ty) (rotr _ x (iconst _ k))))
      (if-let n (imm_shift_from_imm64 ty k))
      (small_rotr_imm ty (put_in_reg_zext32 x) n))

;; Specialization for the 32-bit case when the rotation amount is an immediate.
(rule rotr_32_imm (lower (has_type $I32 (rotr _ x (iconst _ k))))
      (if-let n (imm_shift_from_imm64 $I32 k))
      (a64_rotr_imm $I32 x n))

;; Specialization for the 64-bit case when the rotation amount is an immediate.
(rule rotr_64_imm (lower (has_type $I64 (rotr _ x (iconst _ k))))
      (if-let n (imm_shift_from_imm64 $I64 k))
      (a64_rotr_imm $I64 x n))

;; For a < 32-bit rotate-right, we synthesize this as:
;;
;;    rotr rd, val, amt
;;
;;       =>
;;
;;    and masked_amt, amt, <bitwidth - 1>
;;    sub tmp_sub, masked_amt, <bitwidth>
;;    sub neg_amt, zero, tmp_sub  ; neg
;;    lsr val_rshift, val, masked_amt
;;    lsl val_lshift, val, neg_amt
;;    orr rd, val_lshift val_rshift
;;
;; The spec only admits 8- and 16-bit types and requires the bits of `x` above
;; the type width (up to bit 31) to be zero; both callers in this file pass a
;; value produced by `put_in_reg_zext32`.
(spec (small_rotr t x y)
  (provide
    (= result
      (switch t
        (8 (conv_to 64 (rotr (extract 7 0 x) (extract 7 0 y))))
        (16 (conv_to 64 (rotr (extract 15 0 x) (extract 15 0 y)))))))
  (require
    (or (= t 8) (= t 16))
    (switch t
      (8 (= (extract 31 8 x) #x000000))
      (16 (= (extract 31 16 x) #x0000)))))
(instantiate small_rotr
    ((args Int (bv 64) (bv 64)) (ret (bv 64)) (canon (bv 64))))
(decl small_rotr (Type Reg Reg) Reg)
(rule small_rotr (small_rotr ty val amt)
      (let ((masked_amt Reg (and_imm $I32 amt (rotr_mask ty)))
            ;; tmp_sub = masked_amt - bitwidth; negating it on the next line
            ;; gives bitwidth - masked_amt, the complementary left-shift amount.
            (tmp_sub Reg (sub_imm $I32 masked_amt (u8_into_imm12 (ty_bits ty))))
            (neg_amt Reg (sub $I32 (zero_reg) tmp_sub))
            (val_rshift Reg (lsr $I32 val masked_amt))
            (val_lshift Reg (lsl $I32 val neg_amt)))
        (orr $I32 val_lshift val_rshift)))

;; Externally implemented helper producing the `ImmLogic` mask for a type; the
;; spec states the result equals the type's bit width minus one.
(spec (rotr_mask x) (provide (= (bvsub (int2bv 64 x) #x0000000000000001) result)))
(decl rotr_mask (Type) ImmLogic)
(extern constructor rotr_mask rotr_mask)

;; For a constant amount, we can instead do:
;;
;;    rotr rd, val, #amt
;;
;;       =>
;;
;;    lsr val_rshift, val, #<amt>
;;    lsl val_lshift, val, <bitwidth - amt>
;;    orr rd, val_lshift, val_rshift
;;
;; The spec carries the same requirements as `small_rotr`, plus that the
;; immediate `y` is strictly less than the type width.

(spec (small_rotr_imm t x y)
  (provide
    (= result
      (switch t
        (8 (conv_to 64 (rotr (extract 7 0 x) (zero_ext 8 y))))
        (16 (conv_to 64 (rotr (extract 15 0 x) (zero_ext 16 y)))))))
  (require
    (or (= t 8) (= t 16))
    (switch t
      (8 (= (extract 31 8 x) #x000000))
      (16 (= (extract 31 16 x) #x0000)))
    (bvult y (int2bv 6 t))))
(instantiate small_rotr_imm
    ((args Int (bv 64) (bv 6)) (ret (bv 64)) (canon (bv 64))))
(decl small_rotr_imm (Type Reg ImmShift) Reg)
(rule small_rotr_imm (small_rotr_imm ty val amt)
      (let ((val_rshift Reg (lsr_imm $I32 val amt))
            (val_lshift Reg (lsl_imm $I32 val (rotr_opposite_amount ty amt))))
        (orr $I32 val_lshift val_rshift)))

;; Externally implemented helper yielding the complementary left-shift
;; immediate; per the spec the result is `ty - (x & (ty - 1))` on 6-bit
;; vectors.
(spec (rotr_opposite_amount ty x)
  (provide
    (= (bvsub (int2bv 6 ty) (bvand x (bvsub (int2bv 6 ty) #b000001))) result)))
(decl rotr_opposite_amount (Type ImmShift) ImmShift)
(extern constructor rotr_opposite_amount rotr_opposite_amount)

;; General 128-bit case.
;;
;; TODO: much better codegen is possible with a constant amount.
;; Built as `(x >> amt) | (x << (128 - amt))` from the two 128-bit shift
;; helpers; the high halves are OR-ed first, then the low halves.
(rule (lower (has_type $I128 (rotr _ val count)))
      (let ((input ValueRegs val)
            (count_reg Reg (value_regs_get count 0))
            (neg_count Reg (sub $I64 (imm $I64 (ImmExtend.Zero) 128) count_reg))
            (right ValueRegs (lower_ushr128 input count_reg))
            (left ValueRegs (lower_shl128 input neg_count))
            (out_hi Reg (orr $I64 (value_regs_get right 1) (value_regs_get left 1)))
            (out_lo Reg (orr $I64 (value_regs_get right 0) (value_regs_get left 0))))
        (value_regs out_lo out_hi)))

;;;; Rules for `bitrev` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Reversing an 8-bit value with a 32-bit bitrev instruction will place
;; the reversed result in the highest 8 bits, so we need to shift them down into
;; place.
(rule (lower (has_type $I8 (bitrev _ val)))
      (lsr_imm $I32 (rbit $I32 val) (imm_shift_from_u8 24)))

;; Reversing a 16-bit value with a 32-bit bitrev instruction will place
;; the reversed result in the highest 16 bits, so we need to shift them down into
;; place.
(rule (lower (has_type $I16 (bitrev _ val)))
      (lsr_imm $I32 (rbit $I32 val) (imm_shift_from_u8 16)))

;; `i128`: each 64-bit half is bit-reversed and the two halves swap places.
(rule (lower (has_type $I128 (bitrev _ val)))
      (let ((pair ValueRegs val)
            (rev_of_lo Reg (rbit $I64 (value_regs_get pair 0)))
            (rev_of_hi Reg (rbit $I64 (value_regs_get pair 1))))
        (value_regs rev_of_hi rev_of_lo)))

;; Fallback for the remaining types: a single `rbit` at the value's own type.
(rule -1 (lower (has_type ty (bitrev _ val)))
      (rbit ty val))


;;;; Rules for `clz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 8-bit: count on the zero-extended 32-bit value, then subtract the 24 extra
;; leading bits.
(rule clz_8 (lower (has_type $I8 (clz _ val)))
      (sub_imm $I32 (a64_clz $I32 (put_in_reg_zext32 val)) (u8_into_imm12 24)))

;; 16-bit: same scheme, subtracting 16.
(rule clz_16 (lower (has_type $I16 (clz _ val)))
      (sub_imm $I32 (a64_clz $I32 (put_in_reg_zext32 val)) (u8_into_imm12 16)))

(rule (lower (has_type $I128 (clz _ val)))
      (lower_clz128 val))

;; Fallback for the remaining types: `a64_clz` at the value's own type.
(rule clz_32_64 -1 (lower (has_type ty (clz _ val)))
      (a64_clz ty val))

;; 128-bit count-leading-zeros over a register pair. The sequence built is:
;;
;;     clz hi_zeros, hi
;;     clz lo_zeros, lo
;;     lsr lo_weight, hi_zeros, #6
;;     madd dst_lo, lo_zeros, lo_weight, hi_zeros
;;     mov  dst_hi, 0
(decl lower_clz128 (ValueRegs) ValueRegs)
(rule (lower_clz128 val)
      (let ((hi_zeros Reg (a64_clz $I64 (value_regs_get val 1)))
            (lo_zeros Reg (a64_clz $I64 (value_regs_get val 0)))
            ;; `hi_zeros >> 6`; used as the multiplier for the low half's count.
            (lo_weight Reg (lsr_imm $I64 hi_zeros (imm_shift_from_u8 6))))
        (value_regs (madd $I64 lo_zeros lo_weight hi_zeros)
                    (imm $I64 (ImmExtend.Zero) 0))))

;;;; Rules for `ctz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Note that all `ctz` instructions are implemented by reversing the bits and
;; then using a `clz` instruction since the tail zeros are the same as the
;; leading zeros of the reversed value.
;; 8-bit: after the 32-bit `rbit`, bit 23 is forced to one with `orr_imm`
;; before counting leading zeros.
(rule ctz_8 (lower (has_type $I8 (ctz _ val)))
      (a64_clz $I32 (orr_imm $I32 (rbit $I32 val) (u64_into_imm_logic $I32 0x800000))))

;; 16-bit: same scheme, forcing bit 15 to one.
(rule ctz_16 (lower (has_type $I16 (ctz _ val)))
      (a64_clz $I32 (orr_imm $I32 (rbit $I32 val) (u64_into_imm_logic $I32 0x8000))))

;; `i128`: bit-reverse both halves, swap them, and reuse `lower_clz128`.
(rule (lower (has_type $I128 (ctz _ val)))
      (let ((pair ValueRegs val)
            (rev_of_lo Reg (rbit $I64 (value_regs_get pair 0)))
            (rev_of_hi Reg (rbit $I64 (value_regs_get pair 1))))
        (lower_clz128 (value_regs rev_of_hi rev_of_lo))))

;; Fallback for the remaining types: `rbit` followed by `a64_clz`.
(rule ctz_32_64 -1 (lower (has_type ty (ctz _ val)))
      (a64_clz ty (rbit ty val)))

;;;; Rules for `cls` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 8-bit: count on the sign-extended 32-bit value, then subtract 24.
(rule cls_8 (lower (has_type $I8 (cls _ val)))
      (sub_imm $I32 (a64_cls $I32 (put_in_reg_sext32 val)) (u8_into_imm12 24)))

;; 16-bit: same scheme, subtracting 16.
(rule cls_16 (lower (has_type $I16 (cls _ val)))
      (sub_imm $I32 (a64_cls $I32 (put_in_reg_sext32 val)) (u8_into_imm12 16)))

;; `i128`. The sequence built below is:
;;
;;     cls lo_count, lo
;;     cls hi_count, hi
;;     eon same_sign_bits, hi, lo
;;     lsr same_sign, same_sign_bits, #63
;;     madd lo_extra, lo_count, same_sign, same_sign
;;     cmp hi_count, #63
;;     csel lo_part, lo_extra, xzr, eq
;;     add  out_lo, lo_part, hi_count
;;     mov  out_hi, 0
(rule (lower (has_type $I128 (cls _ val)))
      (let ((pair ValueRegs val)
            (lo Reg (value_regs_get pair 0))
            (hi Reg (value_regs_get pair 1))
            (lo_count Reg (a64_cls $I64 lo))
            (hi_count Reg (a64_cls $I64 hi))
            (same_sign_bits Reg (eon $I64 hi lo))
            ;; Top bit of the `eon` result, moved down to bit 0.
            (same_sign Reg (lsr_imm $I64 same_sign_bits (imm_shift_from_u8 63)))
            (lo_extra Reg (madd $I64 lo_count same_sign same_sign))
            ;; The low half only contributes when `hi_count` equals 63.
            (lo_part Reg (with_flags_reg
                          (cmp64_imm hi_count (u8_into_imm12 63))
                          (csel (Cond.Eq) lo_extra (zero_reg)))))
        (value_regs (add $I64 lo_part hi_count) (imm $I64 (ImmExtend.Zero) 0))))

;; Fallback for the remaining types: `a64_cls` at the value's own type.
(rule cls_32_64 -1 (lower (has_type ty (cls _ val)))
      (a64_cls ty val))

;;;; Rules for `bswap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; One byte-reversal helper per scalar width.
(rule (lower (has_type $I16 (bswap _ val)))
      (a64_rev16 $I16 val))

(rule (lower (has_type $I32 (bswap _ val)))
      (a64_rev32 $I32 val))

(rule (lower (has_type $I64 (bswap _ val)))
      (a64_rev64 $I64 val))

;; `i128`: byte-reverse each 64-bit half and swap the halves (register 1 of
;; the input becomes register 0 of the result).
(rule (lower (has_type $I128 (bswap _ val)))
      (value_regs
       (a64_rev64 $I64 (value_regs_get val 1))
       (a64_rev64 $I64 (value_regs_get val 0))))

;;;; Rules for `bmask` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Bmask tests the value against zero, and uses `csetm` to assert the result.
;; The work is delegated to `lower_bmask` with the output and input types.
(rule (lower (has_type out_ty (bmask _ val @ (value_type in_ty))))
      (lower_bmask out_ty in_ty val))

;;;; Rules for `popcnt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The implementation of `popcnt` for scalar types is done by moving the value
;; into a vector register, using the `cnt` instruction, and then collating the
;; result back into a normal register.
;;
;; The general sequence emitted here is
;;
;;     fmov tmp, in_lo
;;     if ty == i128:
;;         mov tmp.d[1], in_hi
;;
;;     cnt tmp.16b, tmp.16b / cnt tmp.8b, tmp.8b
;;     addv tmp, tmp.16b / addv tmp, tmp.8b / addp tmp.8b, tmp.8b, tmp.8b / (no instruction for 8-bit inputs)
;;
;;     umov out_lo, tmp.b[0]
;;     if ty == i128:
;;         mov out_hi, 0

;; 8-bit: no reduction step; lane 0 of the per-byte counts is the answer.
(rule popcnt_8 (lower (has_type $I8 (popcnt _ val)))
      (let ((vreg Reg (mov_to_fpu val (ScalarSize.Size32)))
            (counts Reg (vec_cnt vreg (VectorSize.Size8x8))))
        (mov_from_vec counts 0 (ScalarSize.Size8))))

;; Note that this uses `addp` instead of `addv` as it's usually cheaper.
(rule popcnt_16 (lower (has_type $I16 (popcnt _ val)))
      (let ((vreg Reg (mov_to_fpu val (ScalarSize.Size32)))
            (counts Reg (vec_cnt vreg (VectorSize.Size8x8)))
            (total Reg (addp counts counts (VectorSize.Size8x8))))
        (mov_from_vec total 0 (ScalarSize.Size8))))

;; 32-bit: per-byte counts reduced with `addv`.
(rule popcnt_32 (lower (has_type $I32 (popcnt _ val)))
      (let ((vreg Reg (mov_to_fpu val (ScalarSize.Size32)))
            (counts Reg (vec_cnt vreg (VectorSize.Size8x8)))
            (total Reg (addv counts (VectorSize.Size8x8))))
        (mov_from_vec total 0 (ScalarSize.Size8))))

;; 64-bit: as above, but the value is moved at 64-bit scalar size.
(rule popcnt_64 (lower (has_type $I64 (popcnt _ val)))
      (let ((vreg Reg (mov_to_fpu val (ScalarSize.Size64)))
            (counts Reg (vec_cnt vreg (VectorSize.Size8x8)))
            (total Reg (addv counts (VectorSize.Size8x8))))
        (mov_from_vec total 0 (ScalarSize.Size8))))

;; `i128`: the low half is moved first, the high half is inserted at lane 1,
;; and the count runs over all sixteen bytes. The result's high register is
;; the constant zero.
(rule (lower (has_type $I128 (popcnt _ val)))
      (let ((pair ValueRegs val)
            (low_only Reg (mov_to_fpu (value_regs_get pair 0) (ScalarSize.Size64)))
            (vreg Reg (mov_to_vec low_only (value_regs_get pair 1) 1 (VectorSize.Size64x2)))
            (counts Reg (vec_cnt vreg (VectorSize.Size8x16)))
            (total Reg (addv counts (VectorSize.Size8x16))))
        (value_regs (mov_from_vec total 0 (ScalarSize.Size8))
                    (imm $I64 (ImmExtend.Zero) 0))))

;; `i8x16` vectors: a single `vec_cnt` with no reduction.
(rule (lower (has_type $I8X16 (popcnt _ val)))
      (vec_cnt val (VectorSize.Size8x16)))

;;;; Rules for `bitselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Scalar case: `(if_set & mask) | (if_clear & ~mask)` built from `and_reg`,
;; `bic` and `orr`.
(rule bitselect (lower (has_type ty (bitselect _ mask if_set if_clear)))
      (if (ty_int_ref_scalar_64 ty))
      (let ((from_set Reg (and_reg ty if_set mask))
            (from_clear Reg (bic ty if_clear mask)))
        (orr ty from_set from_clear)))

;; 128-bit vector case: handled by `bsl`.
(rule 1 (lower (has_type (ty_vec128 ty) (bitselect _ mask if_set if_clear)))
      (bsl ty mask if_set if_clear))

;;;; Rules for `ireduce` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; T -> I{64,32,16,8}: We can simply pass through the value: values
;; are always stored with high bits undefined, so we can just leave
;; them be.
;; The result is simply the first register of the source value.
(rule (lower (has_type ty (ireduce _ src)))
      (if (ty_int_ref_scalar_64 ty))
      (value_regs_get src 0))

;;;; Rules for `fcmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Vector compare against zero on the right, "not equal" flavour: emitted as
;; the inverse of `fcmeq0`.
(rule 4 (lower (has_type ty @ (multi_lane _ _) (fcmp _ (fcmp_zero_cond_not_eq cond) x y)))
      (if (zero_value y))
      (let ((rn Reg x)
            (vec_size VectorSize (vector_size ty)))
        (value_reg (not (fcmeq0 rn vec_size) vec_size))))

;; Vector compare against zero on the right, other supported conditions.
(rule 3 (lower (has_type ty @ (multi_lane _ _) (fcmp _ (fcmp_zero_cond cond) x y)))
      (if (zero_value y))
      (let ((rn Reg x)
            (vec_size VectorSize (vector_size ty)))
        (value_reg (float_cmp_zero cond rn vec_size))))

;; Zero on the left, "not equal" flavour: same lowering applied to `y`.
(rule 2 (lower (has_type ty @ (multi_lane _ _) (fcmp _ (fcmp_zero_cond_not_eq cond) x y)))
      (if (zero_value x))
      (let ((rn Reg y)
            (vec_size VectorSize (vector_size ty)))
        (value_reg (not (fcmeq0 rn vec_size) vec_size))))

;; Zero on the left, other supported conditions: uses the operand-swapped
;; helper.
(rule 1 (lower (has_type ty @ (multi_lane _ _) (fcmp _ (fcmp_zero_cond cond) x y)))
      (if (zero_value x))
      (let ((rn Reg y)
            (vec_size VectorSize (vector_size ty)))
        (value_reg (float_cmp_zero_swap cond rn vec_size))))

;; Scalar floats: compare, then materialize the boolean from the flags.
(rule 0 (lower (has_type out_ty
                         (fcmp _ cond x @ (value_type (ty_scalar_float in_ty)) y)))
      (with_flags (fpu_cmp (scalar_size in_ty) x y)
                  (materialize_bool_result (fp_cond_code cond))))

;; General float-vector compare.
(rule -1 (lower (has_type out_ty (fcmp _ cond x @ (value_type in_ty) y)))
      (if (ty_vector_float in_ty))
      (vec_cmp x y in_ty (fp_cond_code cond)))

;;;; Rules for `icmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Vector compare against zero on the right, "not equal" flavour: emitted as
;; the inverse of `cmeq0`.
(rule 3 (lower (has_type ty @ (multi_lane _ _) (icmp _ (icmp_zero_cond_not_eq cond) x y)))
      (if (zero_value y))
      (let ((rn Reg x)
            (vec_size VectorSize (vector_size ty)))
        (value_reg (not (cmeq0 rn vec_size) vec_size))))

;; Vector compare against zero on the right, other supported conditions.
(rule 2 (lower (has_type ty @ (multi_lane _ _) (icmp _ (icmp_zero_cond cond) x y)))
      (if (zero_value y))
      (let ((rn Reg x)
            (vec_size VectorSize (vector_size ty)))
        (value_reg (int_cmp_zero cond rn vec_size))))

;; Zero on the left, "not equal" flavour: same lowering applied to `y`.
(rule 1 (lower (has_type ty @ (multi_lane _ _) (icmp _ (icmp_zero_cond_not_eq cond) x y)))
      (if (zero_value x))
      (let ((rn Reg y)
            (vec_size VectorSize (vector_size ty)))
        (value_reg (not (cmeq0 rn vec_size) vec_size))))

;; Zero on the left, other supported conditions: uses the operand-swapped
;; helper.
(rule 0 (lower (has_type ty @ (multi_lane _ _) (icmp _ (icmp_zero_cond cond) x y)))
      (if (zero_value x))
      (let ((rn Reg y)
            (vec_size VectorSize (vector_size ty)))
        (value_reg (int_cmp_zero_swap cond rn vec_size))))

;; Everything else is delegated to `lower_icmp_into_reg` with an `$I8` result
;; type.
(rule icmp_8_16_32_64 -1 (lower (icmp _ cond x @ (value_type in_ty) y))
      (lower_icmp_into_reg cond x y in_ty $I8))

;;;; Rules for `trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (trap trap_code))
      (side_effect (udf trap_code)))

;;;;; Rules for `trapz`;;;;;;;;;

(rule (lower (trapz val trap_code))
      (trap_if_val (ZeroCond.Zero) val trap_code))

;;;;; Rules for `trapnz`;;;;;;;;;

(rule (lower (trapnz val trap_code))
      (trap_if_val (ZeroCond.NonZero) val trap_code))

;;;; Rules for `select` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Condition produced by an `icmp` (optionally behind a `uextend`): the
;; compare is lowered straight into flags and fused with the select.
(rule (lower (has_type ty
                       (select _ (maybe_uextend (icmp _ cc
                                                      x @ (value_type in_ty)
                                                      y))
                               rn
                               rm)))
      (let ((comparison FlagsAndCC (lower_icmp_into_flags cc x y in_ty)))
        (lower_select (flags_and_cc_flags comparison)
                      (cond_code (flags_and_cc_cc comparison))
                      ty
                      rn
                      rm)))

;; Condition produced by an `fcmp` (optionally behind a `uextend`).
(rule (lower (has_type ty
                       (select _ (maybe_uextend (fcmp _ cc x @ (value_type in_ty) y))
                               rn
                               rm)))
      (let ((cond Cond (fp_cond_code cc)))
        (lower_select
         (fpu_cmp (scalar_size in_ty) x y)
         cond ty rn rm)))

;; `$I8` condition: test the low eight bits (mask 255) and select on "ne".
(rule -1 (lower (has_type ty (select _ rcond @ (value_type $I8) rn rm)))
      (let ((rcond Reg rcond))
        (lower_select
         (tst_imm $I32 rcond (u64_into_imm_logic $I32 255))
         (Cond.Ne) ty rn rm)))

;; Conditions of at most 32 bits: zero-extend to 32 bits and compare with the
;; zero register.
(rule -2 (lower (has_type ty (select _ rcond @ (value_type (fits_in_32 _)) rn rm)))
      (let ((rcond Reg (put_in_reg_zext32 rcond)))
        (lower_select
         (cmp (OperandSize.Size32) rcond (zero_reg))
         (Cond.Ne) ty rn rm)))

;; Conditions of at most 64 bits: zero-extend to 64 bits and compare with the
;; zero register.
(rule -3 (lower (has_type ty (select _ rcond @ (value_type (fits_in_64 _)) rn rm)))
      (let ((rcond Reg (put_in_reg_zext64 rcond)))
        (lower_select
         (cmp (OperandSize.Size64) rcond (zero_reg))
         (Cond.Ne) ty rn rm)))

;; `$I128` condition: OR the two halves together and compare the result with
;; the zero register.
(rule -4 (lower (has_type ty (select _ rcond @ (value_type $I128) rn rm)))
      (let ((c ValueRegs (put_in_regs rcond))
            (c_lo Reg (value_regs_get c 0))
            (c_hi Reg (value_regs_get c 1))
            (rt Reg (orr $I64 c_lo c_hi)))
        (lower_select
         (cmp (OperandSize.Size64) rt (zero_reg))
         (Cond.Ne) ty rn rm)))

;;;; Rules for `select_spectre_guard` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Returns `dst` unchanged, additionally emitting a `csdb` as a side effect
;; when `use_csdb` is true.
(decl maybe_csdb_after_select (ValueRegs) ValueRegs)
(rule (maybe_csdb_after_select dst)
      (if-let true (use_csdb))
      (let ((_ InstOutput (side_effect (csdb)))) dst))
(rule (maybe_csdb_after_select dst)
      (if-let false (use_csdb))
      dst)

;; Condition produced by an `icmp` (optionally behind a `uextend`): fused
;; compare-and-select, followed by the optional `csdb`.
(rule (lower (has_type ty
                       (select_spectre_guard _ (maybe_uextend (icmp _ cc x @ (value_type in_ty) y))
                                             if_true
                                             if_false)))
      (let ((comparison FlagsAndCC (lower_icmp_into_flags cc x y in_ty))
            (dst ValueRegs (lower_select
                            (flags_and_cc_flags comparison)
                            (cond_code (flags_and_cc_cc comparison))
                            ty
                            if_true
                            if_false)))
        (maybe_csdb_after_select dst)))

;; Fallbacks for an arbitrary condition value.
;; NOTE(review): unlike the `icmp`-fused rule above, the two fallback rules
;; below do not go through `maybe_csdb_after_select`; confirm whether that is
;; intentional.
(rule -1 (lower (has_type ty (select_spectre_guard _ rcond @ (value_type (fits_in_64 _)) rn rm)))
      (let ((rcond Reg (put_in_reg_zext64 rcond)))
        (lower_select
         (cmp (OperandSize.Size64) rcond (zero_reg))
         (Cond.Ne) ty rn rm)))

(rule -2 (lower (has_type ty (select_spectre_guard _ rcond @ (value_type $I128) rn rm)))
      (let ((c ValueRegs (put_in_regs rcond))
            (c_lo Reg (value_regs_get c 0))
            (c_hi Reg (value_regs_get c 1))
            (rt Reg (orr $I64 c_lo c_hi)))
        (lower_select
         (cmp (OperandSize.Size64) rt (zero_reg))
         (Cond.Ne) ty rn rm)))

;;;; Rules for `vconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 128-bit vector constants reuse the `f128` constant path.
(rule (lower (has_type (ty_vec128 _) (vconst _ (u128_from_constant x))))
      (constant_f128 x))

;; 64-bit vector constants reuse the `f64` constant path.
(rule 1 (lower (has_type ty (vconst _ (u64_from_constant x))))
      (if (ty_vec64 ty))
      (constant_f64 x))

;;;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Integer/reference scalar source of at most 64 bits.
(rule -1 (lower (has_type ty (splat _ x @ (value_type in_ty))))
      (if (ty_int_ref_scalar_64 in_ty))
      (vec_dup x (vector_size ty)))

;; Scalar float source: duplicated from lane 0.
(rule -2 (lower (has_type ty (splat _ x @ (value_type (ty_scalar_float _)))))
      (vec_dup_from_fpu x (vector_size ty) 0))

;; Constant sources (`f32const`, `f64const`, `iconst`) go through
;; `splat_const`.
(rule (lower (has_type ty (splat _ (f32const _ (u32_from_ieee32 n)))))
      (splat_const n (vector_size ty)))

(rule (lower (has_type ty (splat _ (f64const _ (u64_from_ieee64 n)))))
      (splat_const n (vector_size ty)))

(rule (lower (has_type ty (splat _ (iconst _ (u64_from_imm64 n)))))
      (splat_const n (vector_size ty)))

;; Source is a sinkable load: the load is sunk into an address and the splat
;; is emitted as `ld1r`.
(rule (lower (has_type ty (splat _ x @ (load _ flags _ _))))
      (if-let mem_op (is_sinkable_inst x))
      (let ((addr Reg (sink_load_into_addr (lane_type ty) mem_op)))
        (ld1r addr (vector_size ty) flags)))

;;;; Rules for `AtomicLoad` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type (valid_atomic_transaction ty) (atomic_load _ (little_or_native_endian flags) addr)))
      (load_acquire ty flags addr))


;;;; Rules for `AtomicStore` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (atomic_store (little_or_native_endian flags)
                           src @ (value_type (valid_atomic_transaction ty))
                           addr))
      (side_effect (store_release ty flags src addr)))

;;;; Rules for `AtomicRMW` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Priority-1 rules guarded by `use_lse`: each CLIF op is mapped to an
;; `AtomicRMWOp` and emitted through `lse_atomic_rmw`. Note the mappings that
;; are not one-to-one: `Xor` -> `Eor`, `Or` -> `Set`, `Sub` -> `Add` of the
;; negated operand, and `And` -> `Clr` of the operand passed through `eon`
;; with the zero register.
(rule 1 (lower (and (use_lse)
                    (has_type (valid_atomic_transaction ty)
                              (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.Add) addr src))))
      (lse_atomic_rmw (AtomicRMWOp.Add) addr src ty flags))
(rule 1 (lower (and (use_lse)
                    (has_type (valid_atomic_transaction ty)
                              (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.Xor) addr src))))
      (lse_atomic_rmw (AtomicRMWOp.Eor) addr src ty flags))
(rule 1 (lower (and (use_lse)
                    (has_type (valid_atomic_transaction ty)
                              (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.Or) addr src))))
      (lse_atomic_rmw (AtomicRMWOp.Set) addr src ty flags))
(rule 1 (lower (and (use_lse)
                    (has_type (valid_atomic_transaction ty)
                              (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.Smax) addr src))))
      (lse_atomic_rmw (AtomicRMWOp.Smax) addr src ty flags))
(rule 1 (lower (and (use_lse)
                    (has_type (valid_atomic_transaction ty)
                              (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.Smin) addr src))))
      (lse_atomic_rmw (AtomicRMWOp.Smin) addr src ty flags))
(rule 1 (lower (and (use_lse)
                    (has_type (valid_atomic_transaction ty)
                              (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.Umax) addr src))))
      (lse_atomic_rmw (AtomicRMWOp.Umax) addr src ty flags))
(rule 1 (lower (and (use_lse)
                    (has_type (valid_atomic_transaction ty)
                              (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.Umin) addr src))))
      (lse_atomic_rmw (AtomicRMWOp.Umin) addr src ty flags))
(rule 1 (lower (and (use_lse)
                    (has_type (valid_atomic_transaction ty)
                              (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.Sub) addr src))))
      (lse_atomic_rmw (AtomicRMWOp.Add) addr (sub ty (zero_reg) src) ty flags))
(rule 1 (lower (and (use_lse)
                    (has_type (valid_atomic_transaction ty)
                              (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.And) addr src))))
      (lse_atomic_rmw (AtomicRMWOp.Clr) addr (eon ty src (zero_reg)) ty flags))


;; Default-priority rules without the `use_lse` guard: every op, including
;; `Nand` and `Xchg` (which have no rule in the group above), is emitted
;; through `atomic_rmw_loop`.
(rule (lower (has_type (valid_atomic_transaction ty)
                       (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.Add) addr src)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Add) addr src ty flags))
(rule (lower (has_type (valid_atomic_transaction ty)
                       (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.Sub) addr src)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Sub) addr src ty flags))
(rule (lower (has_type (valid_atomic_transaction ty)
                       (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.And) addr src)))
      (atomic_rmw_loop (AtomicRMWLoopOp.And) addr src ty flags))
(rule (lower (has_type (valid_atomic_transaction ty)
                       (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.Nand) addr src)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Nand) addr src ty flags))
(rule (lower (has_type (valid_atomic_transaction ty)
                       (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.Or) addr src)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Orr) addr src ty flags))
(rule (lower (has_type (valid_atomic_transaction ty)
                       (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.Xor) addr src)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Eor) addr src ty flags))
(rule (lower (has_type (valid_atomic_transaction ty)
                       (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.Smin) addr src)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Smin) addr src ty flags))
(rule (lower (has_type (valid_atomic_transaction ty)
                       (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.Smax) addr src)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Smax) addr src ty flags))
(rule (lower (has_type (valid_atomic_transaction ty)
                       (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.Umin) addr src)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Umin) addr src ty flags))
(rule (lower (has_type (valid_atomic_transaction ty)
                       (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.Umax) addr src)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Umax) addr src ty flags))
(rule (lower (has_type (valid_atomic_transaction ty)
                       (atomic_rmw _ (little_or_native_endian flags) (AtomicRmwOp.Xchg) addr src)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Xchg) addr src ty flags))

;;;; Rules for `AtomicCAS` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Priority-1 rule guarded by `use_lse`; the default-priority rule below uses
;; `atomic_cas_loop` instead.
(rule 1 (lower (and (use_lse)
                    (has_type (valid_atomic_transaction ty)
                              (atomic_cas _ (little_or_native_endian flags) addr src1 src2))))
      (lse_atomic_cas addr src1 src2 ty flags))

(rule (lower (and (has_type (valid_atomic_transaction ty)
                            (atomic_cas _ (little_or_native_endian flags) addr src1 src2))))
      (atomic_cas_loop addr src1 src2 ty flags))

;;;; Rules for `fvdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (fvdemote _ x))
      (fcvtn x (ScalarSize.Size32)))


;;;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; 128-bit result whose second operand is zero: a single `sqxtn` of `x`.
(rule 1 (lower (has_type (ty_vec128_int ty) (snarrow _ x y)))
      (if (zero_value y))
      (sqxtn x (lane_size ty)))

;; 64-bit result: element 0 of `y` is first moved into element 1 of `x`
;; (as 64x2), then a single `sqxtn` narrows the combined vector.
(rule 2 (lower (has_type (ty_vec64_int ty) (snarrow _ x y)))
      (let ((dst Reg (mov_vec_elem x y 1 0 (VectorSize.Size64x2))))
        (sqxtn dst (lane_size ty))))

;; General 128-bit result: `sqxtn` on `x`, then `sqxtn2` with `y` on top of
;; that result.
(rule 0 (lower (has_type (ty_vec128_int ty) (snarrow _ x y)))
      (let ((low_half Reg (sqxtn x (lane_size ty)))
            (result Reg (sqxtn2 low_half y (lane_size ty))))
        result))


;;;; Rules for `unarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; 128-bit result whose second operand is zero: a single `sqxtun` of `x`.
(rule 1 (lower (has_type (ty_vec128_int ty) (unarrow _ x y)))
      (if (zero_value y))
      (sqxtun x (lane_size ty)))

;; 64-bit result: element 0 of `y` is first moved into element 1 of `x`
;; (as 64x2), then a single `sqxtun` narrows the combined vector.
(rule 2 (lower (has_type (ty_vec64_int ty) (unarrow _ x y)))
      (let ((dst Reg (mov_vec_elem x y 1 0 (VectorSize.Size64x2))))
        (sqxtun dst (lane_size ty))))

;; General 128-bit `unarrow`: `sqxtun` on the first operand, then `sqxtun2`
;; with the second operand on top of that partial result.
(rule 0 (lower (has_type (ty_vec128_int ty) (unarrow _ lo hi)))
      (sqxtun2 (sqxtun lo (lane_size ty)) hi (lane_size ty)))


;;;; Rules for `uunarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 128-bit result whose second operand is a zero value: a single `uqxtn`.
(rule 1 (lower (has_type (ty_vec128_int ty) (uunarrow _ lo hi)))
      (if (zero_value hi))
      (uqxtn lo (lane_size ty)))

;; 64-bit result: merge both operands with `mov_vec_elem`, then one `uqxtn`.
(rule 2 (lower (has_type (ty_vec64_int ty) (uunarrow _ lo hi)))
      (let ((packed Reg (mov_vec_elem lo hi 1 0 (VectorSize.Size64x2))))
        (uqxtn packed (lane_size ty))))

;; General 128-bit result: `uqxtn` followed by `uqxtn2`.
(rule 0 (lower (has_type (ty_vec128_int ty) (uunarrow _ lo hi)))
      (uqxtn2 (uqxtn lo (lane_size ty)) hi (lane_size ty)))

;;;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type ty (swiden_low _ src)))
      (vec_extend (VecExtendOp.Sxtl) src false (lane_size ty)))

;;;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 128-bit result: the only signed-widen rule passing `true` as the boolean
;; argument of `vec_extend`.
(rule 1 (lower (has_type (ty_vec128 ty) (swiden_high _ src)))
      (vec_extend (VecExtendOp.Sxtl) src true (lane_size ty)))

;; 64-bit result: first move lane 1 (viewed as `Size32x2`) out with
;; `fpu_move_from_vec`, then extend that temporary like the low-half case.
(rule (lower (has_type ty (swiden_high _ src)))
      (if (ty_vec64 ty))
      (let ((upper Reg (fpu_move_from_vec src 1 (VectorSize.Size32x2))))
        (vec_extend (VecExtendOp.Sxtl) upper false (lane_size ty))))

;;;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (has_type ty (uwiden_low _ src)))
      (vec_extend (VecExtendOp.Uxtl) src false (lane_size ty)))

;;;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Same shape as `swiden_high`, with `Uxtl` instead of `Sxtl`.
(rule 1 (lower (has_type (ty_vec128 ty) (uwiden_high _ src)))
      (vec_extend (VecExtendOp.Uxtl) src true (lane_size ty)))

(rule (lower (has_type ty (uwiden_high _ src)))
      (if (ty_vec64 ty))
      (let ((upper Reg (fpu_move_from_vec src 1 (VectorSize.Size32x2))))
        (vec_extend (VecExtendOp.Uxtl) upper false (lane_size ty))))

;;;; Rules for `Fence` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (fence))
      (side_effect (aarch64_fence)))

;;;; Rules for `Debugtrap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (debugtrap))
      (side_effect (brk)))

;;;; Rules for `func_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The function's external name is loaded with offset 0.
(rule (lower (func_addr _ (func_ref_data _ extname dist _)))
      (load_ext_name (box_external_name extname) 0 dist))

;;;; Rules for `symbol_value` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Same as `func_addr`, but the offset comes from the symbol data.
(rule (lower (symbol_value _ (symbol_value_data extname dist offset)))
      (load_ext_name (box_external_name extname) offset dist))

;;; Rules for `get_{frame,stack}_pointer` and `get_return_address` ;;;;;;;;;;;;;

(rule (lower (get_frame_pointer _))
      (aarch64_fp))

(rule (lower (get_stack_pointer _))
      (aarch64_sp))

(rule (lower (get_return_address _))
      (aarch64_link))

;; Rules for `call` and `call_indirect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Direct call to an in-range function.
;; Matches only callees whose distance is `RelocDistance.Near`; the callee name
;; and the `patchable` field are forwarded to `gen_call_info`.
(rule 1 (lower (call (func_ref_data sig_ref name (RelocDistance.Near) patchable) args))
      (let ((results ValueRegsVec (gen_call_output sig_ref))
            (callee_sig Sig (abi_sig sig_ref))
            (arg_list CallArgList (gen_call_args callee_sig args))
            (ret_list CallRetList (gen_call_rets callee_sig results))
            (call_info BoxCallInfo (gen_call_info callee_sig name arg_list ret_list (try_call_none) patchable))
            (_ Unit (emit_side_effect (call_impl call_info))))
        results))

;; Direct call to an out-of-range function (implicitly via pointer).
;; The callee address is first materialized in a register by `load_ext_name`
;; (offset 0) and then called through `call_ind_impl`. The trailing `false` in
;; `func_ref_data` is matched literally, in the field that the in-range rules
;; bind as `patchable`.
(rule (lower (call (func_ref_data sig_ref name dist false) args))
      (let ((results ValueRegsVec (gen_call_output sig_ref))
            (callee_sig Sig (abi_sig sig_ref))
            (arg_list CallArgList (gen_call_args callee_sig args))
            (ret_list CallRetList (gen_call_rets callee_sig results))
            (callee Reg (load_ext_name name 0 dist))
            (call_info BoxCallIndInfo (gen_call_ind_info callee_sig callee arg_list ret_list (try_call_none)))
            (_ Unit (emit_side_effect (call_ind_impl call_info))))
        results))

;; Indirect call.
;; The callee pointer is put in a register before the arguments are generated.
(rule (lower (call_indirect sig_ref ptr args))
      (let ((results ValueRegsVec (gen_call_output sig_ref))
            (callee_sig Sig (abi_sig sig_ref))
            (callee Reg (put_in_reg ptr))
            (arg_list CallArgList (gen_call_args callee_sig args))
            (ret_list CallRetList (gen_call_rets callee_sig results))
            (call_info BoxCallIndInfo (gen_call_ind_info callee_sig callee arg_list ret_list (try_call_none)))
            (_ Unit (emit_side_effect (call_ind_impl call_info))))
        results))

;;;; Rules for `try_call` and `try_call_indirect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Direct call to an in-range function.
;; Lowered through `lower_branch`: the exception table and the branch targets
;; are combined by `try_call_info`, and the result replaces the
;; `(try_call_none)` used by the plain `call` rules.
(rule 1 (lower_branch (try_call (func_ref_data sig_ref name (RelocDistance.Near) patchable) args et) targets)
      (let ((callee_sig Sig (abi_sig sig_ref))
            (try_info OptionTryCallInfo (try_call_info et targets))
            (arg_list CallArgList (gen_call_args callee_sig args))
            (ret_list CallRetList (gen_try_call_rets callee_sig))
            (call_info BoxCallInfo (gen_call_info callee_sig name arg_list ret_list try_info patchable)))
        (emit_side_effect (call_impl call_info))))

;; Direct call to an out-of-range function (implicitly via pointer).
;; The callee address is materialized by `load_ext_name` (offset 0) and the
;; call goes through `call_ind_impl`, carrying the try-call information.
(rule (lower_branch (try_call (func_ref_data sig_ref name dist false) args et) targets)
      (let ((callee_sig Sig (abi_sig sig_ref))
            (try_info OptionTryCallInfo (try_call_info et targets))
            (arg_list CallArgList (gen_call_args callee_sig args))
            (ret_list CallRetList (gen_try_call_rets callee_sig))
            (callee Reg (load_ext_name name 0 dist))
            (call_info BoxCallIndInfo (gen_call_ind_info callee_sig callee arg_list ret_list try_info)))
        (emit_side_effect (call_ind_impl call_info))))

;; Indirect call.
;; The signature is not an operand of `try_call_indirect`; it is recovered from
;; the exception table through `exception_sig`.
(rule (lower_branch (try_call_indirect ptr args et) targets)
      (if-let (exception_sig sig_ref) et)
      (let ((callee_sig Sig (abi_sig sig_ref))
            (try_info OptionTryCallInfo (try_call_info et targets))
            (callee Reg (put_in_reg ptr))
            (arg_list CallArgList (gen_call_args callee_sig args))
            (ret_list CallRetList (gen_try_call_rets callee_sig))
            (call_info BoxCallIndInfo (gen_call_ind_info callee_sig callee arg_list ret_list try_info)))
        (emit_side_effect (call_ind_impl call_info))))

;;;; Rules for `return` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; N.B.: the Ret itself is generated by the ABI.
(rule (lower (return args))
      (lower_return args))

;;;; Rules for `return_call` and `return_call_indirect` ;;;;;;;;;;;;;;;;;;;;;;;;;

;; Direct call to an in-range function.
;; Only matches when the last `func_ref_data` field is `false`.
(rule 1 (lower (return_call (func_ref_data sig_ref name (RelocDistance.Near) false) args))
      (let ((callee_sig Sig (abi_sig sig_ref))
            (arg_list CallArgList (gen_return_call_args callee_sig args))
            (call_info BoxReturnCallInfo (gen_return_call_info callee_sig name arg_list)))
        (side_effect (return_call_impl call_info))))

;; Direct call to an out-of-range function (implicitly via pointer).
;; The callee address is materialized by `load_ext_name` (offset 0) and the
;; tail call goes through `return_call_ind_impl`.
(rule (lower (return_call (func_ref_data sig_ref name dist false) args))
      (let ((callee_sig Sig (abi_sig sig_ref))
            (arg_list CallArgList (gen_return_call_args callee_sig args))
            (callee Reg (load_ext_name name 0 dist))
            (call_info BoxReturnCallIndInfo (gen_return_call_ind_info callee_sig callee arg_list)))
        (side_effect (return_call_ind_impl call_info))))

;; Indirect call.
(rule (lower (return_call_indirect sig_ref ptr args))
      (let ((callee_sig Sig (abi_sig sig_ref))
            (callee Reg (put_in_reg ptr))
            (arg_list CallArgList (gen_return_call_args callee_sig args))
            (call_info BoxReturnCallIndInfo (gen_return_call_ind_info callee_sig callee arg_list)))
        (side_effect (return_call_ind_impl call_info))))

;;;; Rules for loads ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Every load rule only matches flags accepted by `little_or_native_endian`.

;; Plain `load` of a scalar integer: one rule per width.
(rule load_i8_aarch64_uload8
      (lower (has_type $I8 (load _ (little_or_native_endian flags) addr off)))
      (aarch64_uload8 (amode $I8 addr off) flags))
(rule load_i16_aarch64_uload16
      (lower (has_type $I16 (load _ (little_or_native_endian flags) addr off)))
      (aarch64_uload16 (amode $I16 addr off) flags))
(rule load_i32_aarch64_uload32
      (lower (has_type $I32 (load _ (little_or_native_endian flags) addr off)))
      (aarch64_uload32 (amode $I32 addr off) flags))
(rule load_i64_aarch64_uload64
      (lower (has_type $I64 (load _ (little_or_native_endian flags) addr off)))
      (aarch64_uload64 (amode $I64 addr off) flags))
;; `i128` is loaded with `aarch64_loadp64` using a pair addressing mode.
(rule (lower (has_type $I128 (load _ (little_or_native_endian flags) addr off)))
      (aarch64_loadp64 (pair_amode addr off) flags))

;; Plain `load` of a float or vector, selected by total width.
(rule -1 (lower (has_type (ty_float_or_vec (ty_16 _)) (load _ (little_or_native_endian flags) addr off)))
      (aarch64_fpuload16 (amode $F16 addr off) flags))
(rule -2 (lower (has_type (ty_float_or_vec (ty_32 _)) (load _ (little_or_native_endian flags) addr off)))
      (aarch64_fpuload32 (amode $F32 addr off) flags))
(rule -3 (lower (has_type (ty_float_or_vec (ty_64 _)) (load _ (little_or_native_endian flags) addr off)))
      (aarch64_fpuload64 (amode $F64 addr off) flags))
(rule -4 (lower (has_type (ty_float_or_vec (ty_128 _)) (load _ (little_or_native_endian flags) addr off)))
      (aarch64_fpuload128 (amode $F128 addr off) flags))

;; Plain `load` of a dynamic vector type.
(rule -5 (lower (has_type (ty_dyn_vec64 _) (load _ (little_or_native_endian flags) addr off)))
      (aarch64_fpuload64 (amode $F64 addr off) flags))
(rule -6 (lower (has_type (ty_dyn_vec128 _) (load _ (little_or_native_endian flags) addr off)))
      (aarch64_fpuload128 (amode $I8X16 addr off) flags))

;; Narrow integer loads with zero (`uloadN`) or sign (`sloadN`) extension.
(rule (lower (uload8 _ (little_or_native_endian flags) addr off))
      (aarch64_uload8 (amode $I8 addr off) flags))
(rule (lower (sload8 _ (little_or_native_endian flags) addr off))
      (aarch64_sload8 (amode $I8 addr off) flags))
(rule (lower (uload16 _ (little_or_native_endian flags) addr off))
      (aarch64_uload16 (amode $I16 addr off) flags))
(rule (lower (sload16 _ (little_or_native_endian flags) addr off))
      (aarch64_sload16 (amode $I16 addr off) flags))
(rule (lower (uload32 _ (little_or_native_endian flags) addr off))
      (aarch64_uload32 (amode $I32 addr off) flags))
(rule (lower (sload32 _ (little_or_native_endian flags) addr off))
      (aarch64_sload32 (amode $I32 addr off) flags))

;; Widening vector loads: a 64-bit FPU load followed by `vec_extend` (`Sxtl`
;; for the signed forms, `Uxtl` for the unsigned ones) to the widened lane
;; size.
(rule (lower (sload8x8 _ (little_or_native_endian flags) addr off))
      (vec_extend (VecExtendOp.Sxtl)
                  (aarch64_fpuload64 (amode $F64 addr off) flags)
                  false
                  (ScalarSize.Size16)))
(rule (lower (uload8x8 _ (little_or_native_endian flags) addr off))
      (vec_extend (VecExtendOp.Uxtl)
                  (aarch64_fpuload64 (amode $F64 addr off) flags)
                  false
                  (ScalarSize.Size16)))
(rule (lower (sload16x4 _ (little_or_native_endian flags) addr off))
      (vec_extend (VecExtendOp.Sxtl)
                  (aarch64_fpuload64 (amode $F64 addr off) flags)
                  false
                  (ScalarSize.Size32)))
(rule (lower (uload16x4 _ (little_or_native_endian flags) addr off))
      (vec_extend (VecExtendOp.Uxtl)
                  (aarch64_fpuload64 (amode $F64 addr off) flags)
                  false
                  (ScalarSize.Size32)))
(rule (lower (sload32x2 _ (little_or_native_endian flags) addr off))
      (vec_extend (VecExtendOp.Sxtl)
                  (aarch64_fpuload64 (amode $F64 addr off) flags)
                  false
                  (ScalarSize.Size64)))
(rule (lower (uload32x2 _ (little_or_native_endian flags) addr off))
      (vec_extend (VecExtendOp.Uxtl)
                  (aarch64_fpuload64 (amode $F64 addr off) flags)
                  false
                  (ScalarSize.Size64)))

;;;; Rules for stores ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Every store rule only matches flags accepted by `little_or_native_endian`.

;; Plain `store` of a scalar integer: one rule per width of the stored value.
(rule store_i8_aarch64_store8
      (lower (store (little_or_native_endian flags) data @ (value_type $I8) addr off))
      (side_effect (aarch64_store8 (amode $I8 addr off) flags data)))
(rule store_i16_aarch64_store16
      (lower (store (little_or_native_endian flags) data @ (value_type $I16) addr off))
      (side_effect (aarch64_store16 (amode $I16 addr off) flags data)))
(rule store_i32_aarch64_store32
      (lower (store (little_or_native_endian flags) data @ (value_type $I32) addr off))
      (side_effect (aarch64_store32 (amode $I32 addr off) flags data)))
(rule store_i64_aarch64_store64
      (lower (store (little_or_native_endian flags) data @ (value_type $I64) addr off))
      (side_effect (aarch64_store64 (amode $I64 addr off) flags data)))

;; Truncating stores; the type of the stored value is not constrained here.
(rule (lower (istore8 (little_or_native_endian flags) data addr off))
      (side_effect (aarch64_store8 (amode $I8 addr off) flags data)))
(rule (lower (istore16 (little_or_native_endian flags) data addr off))
      (side_effect (aarch64_store16 (amode $I16 addr off) flags data)))
(rule (lower (istore32 (little_or_native_endian flags) data addr off))
      (side_effect (aarch64_store32 (amode $I32 addr off) flags data)))

;; `i128`: both registers of the value are stored with one `aarch64_storep64`.
(rule (lower (store (little_or_native_endian flags) data @ (value_type $I128) addr off))
      (side_effect
        (aarch64_storep64 (pair_amode addr off) flags
                          (value_regs_get data 0)
                          (value_regs_get data 1))))

;; Plain `store` of a float or vector, selected by total width.
(rule -1 (lower (store (little_or_native_endian flags) data @ (value_type (ty_float_or_vec (ty_16 _))) addr off))
      (side_effect (aarch64_fpustore16 (amode $F16 addr off) flags data)))
(rule -2 (lower (store (little_or_native_endian flags) data @ (value_type (ty_float_or_vec (ty_32 _))) addr off))
      (side_effect (aarch64_fpustore32 (amode $F32 addr off) flags data)))
(rule -3 (lower (store (little_or_native_endian flags) data @ (value_type (ty_float_or_vec (ty_64 _))) addr off))
      (side_effect (aarch64_fpustore64 (amode $F64 addr off) flags data)))
(rule -4 (lower (store (little_or_native_endian flags) data @ (value_type (ty_float_or_vec (ty_128 _))) addr off))
      (side_effect (aarch64_fpustore128 (amode $F128 addr off) flags data)))

;; Plain `store` of a dynamic vector type.
(rule -5 (lower (store (little_or_native_endian flags) data @ (value_type (ty_dyn_vec64 _)) addr off))
      (side_effect (aarch64_fpustore64 (amode $F64 addr off) flags data)))
(rule -6 (lower (store (little_or_native_endian flags) data @ (value_type (ty_dyn_vec128 _)) addr off))
      (side_effect (aarch64_fpustore128 (amode $I8X16 addr off) flags data)))

;;; Rules for `{get,set}_pinned_reg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (get_pinned_reg _))
      (mov_from_preg (preg_pinned)))

(rule (lower (set_pinned_reg val))
      (side_effect (write_pinned_reg val)))

;;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; SIMD&FP <=> SIMD&FP
; The value is passed through unchanged.
(rule 7 (lower (has_type (ty_float_or_vec _) (bitcast _ _ src @ (value_type (ty_float_or_vec _)))))
      src)

; I128 => SIMD&FP
; Register 0 of the source goes in through `mov_to_fpu`; register 1 is then
; inserted at lane 1 of a `Size64x2` vector.
(rule 6 (lower (has_type (ty_float_or_vec _) (bitcast _ _ src @ (value_type $I128))))
      (mov_to_vec (mov_to_fpu (value_regs_get src 0) (ScalarSize.Size64))
                  (value_regs_get src 1)
                  1
                  (VectorSize.Size64x2)))

; SIMD&FP => I128
; 64-bit lanes 0 and 1 become the two registers of the result.
(rule 5 (lower (has_type $I128 (bitcast _ _ src @ (value_type (ty_float_or_vec _)))))
      (value_regs (mov_from_vec src 0 (ScalarSize.Size64))
                  (mov_from_vec src 1 (ScalarSize.Size64))))

; GPR => SIMD&FP
(rule 4 (lower (has_type (ty_float_or_vec _) (bitcast _ _ src @ (value_type in_ty))))
      (if (ty_int_ref_scalar_64 in_ty))
      (mov_to_fpu src (scalar_size in_ty)))

; SIMD&FP => GPR
(rule 3 (lower (has_type out_ty (bitcast _ _ src @ (value_type (fits_in_64 (ty_float_or_vec _))))))
      (if (ty_int_ref_scalar_64 out_ty))
      (mov_from_vec src 0 (scalar_size out_ty)))

; GPR <=> GPR
; The value is passed through unchanged, for scalars and for `i128` alike.
(rule 1 (lower (has_type out_ty (bitcast _ _ src @ (value_type in_ty))))
      (if (ty_int_ref_scalar_64 out_ty))
      (if (ty_int_ref_scalar_64 in_ty))
      src)
(rule 0 (lower (has_type $I128 (bitcast _ _ src @ (value_type $I128)))) src)

;;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; extractlane with lane 0 can pass through the value unchanged; upper
;; bits are undefined when a narrower type is in a wider register.
;; Scalar float taken from lane 0: the vector value itself is the result.
(rule 2 (lower (has_type (ty_scalar_float _) (extractlane _ vec (u8_from_uimm8 0))))
      vec)

;; Integer lane: move it out of the vector with `mov_from_vec`.
(rule 0 (lower (has_type (ty_int ty) (extractlane _ vec (u8_from_uimm8 lane))))
      (mov_from_vec vec lane (scalar_size ty)))

;; Scalar float from any other lane: `fpu_move_from_vec`.
(rule 1 (lower (has_type (ty_scalar_float ty)
                         (extractlane _ vec @ (value_type vty) (u8_from_uimm8 lane))))
      (fpu_move_from_vec vec lane (vector_size vty)))

;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Integer element: `mov_to_vec` into the requested lane.
(rule 1 (lower (insertlane _ vec @ (value_type vty)
                           elem @ (value_type (ty_int _))
                           (u8_from_uimm8 lane)))
      (mov_to_vec vec elem lane (vector_size vty)))

;; Scalar float element: `mov_vec_elem`, taking the element from index 0 of the
;; register holding it.
(rule (lower (insertlane _ vec @ (value_type vty)
                         elem @ (value_type (ty_scalar_float _))
                         (u8_from_uimm8 lane)))
      (mov_vec_elem vec elem lane 0 (vector_size vty)))

;;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule (lower (stack_addr _ stack_slot offset))
      (compute_stack_addr stack_slot offset))

;;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The first three sequences (I8X16, I16X8, I32X4) use one integer temporary
;; and two vector temporaries. The shift is done early so as to give the
;; register allocator the possibility of reusing the source register for the
;; shifted value when this is the last use of the source. See
;; https://github.com/WebAssembly/simd/pull/201 for the background and
;; derivation of these sequences. Alternative sequences are discussed
;; in https://github.com/bytecodealliance/wasmtime/issues/2296,
;; although they are not used here.

(rule (lower (vhigh_bits _ vec @ (value_type $I8X16)))
      (let (
            ;; Replicate the MSB of each of the 16 byte lanes across
            ;; the whole lane (sshr is an arithmetic right shift).
            (sign_mask Reg (sshr_vec_imm vec 7 (VectorSize.Size8x16)))
            ;; Bitwise-and with the mask
            ;; `0x80402010_08040201_80402010_08040201` to get the bit
            ;; in the proper location for each group of 8 lanes.
            (bits Reg (and_vec sign_mask (constant_f128 0x80402010_08040201_80402010_08040201) (VectorSize.Size8x16)))
            ;; Produce a version of `bits` with upper 8 lanes and
            ;; lower 8 lanes swapped.
            (bits_swapped Reg (vec_extract bits bits 8))
            ;; Zip together the two; with the above this produces the lane permutation:
            ;;    15 7 14 6 13 5 12 4 11 3 10 2 9 1 8 0
            (interleaved Reg (zip1 bits bits_swapped (VectorSize.Size8x16)))
            ;; Add 16-bit lanes together ("add across vector"), so we
            ;; get, in the low 16 bits, 15+14+...+8 in the high byte
            ;; and 7+6+...+0 in the low byte. This effectively puts
            ;; the 16 MSBs together, giving our results.
            ;;
            ;; N.B.: `Size16x8` is not a typo!
            (sum Reg (addv interleaved (VectorSize.Size16x8))))
        (mov_from_vec sum 0 (ScalarSize.Size16))))

(rule (lower (vhigh_bits _ vec @ (value_type $I16X8)))
      (let (
            ;; Replicate the MSB of each of the 8 16-bit lanes across
            ;; the whole lane (sshr is an arithmetic right shift).
            (sign_mask Reg (sshr_vec_imm vec 15 (VectorSize.Size16x8)))
            ;; Bitwise-and with the mask
            ;; `0x0080_0040_0020_0010_0008_0004_0002_0001` to keep one
            ;; distinct bit in each of the 8 lanes.
            (bits Reg (and_vec sign_mask (constant_f128 0x0080_0040_0020_0010_0008_0004_0002_0001) (VectorSize.Size16x8)))
            ;; Add lanes together to get the 8 MSBs in the low byte.
            (sum Reg (addv bits (VectorSize.Size16x8))))
        (mov_from_vec sum 0 (ScalarSize.Size16))))

(rule (lower (vhigh_bits _ vec @ (value_type $I32X4)))
      (let (
            ;; Replicate the MSB of each of the 4 32-bit lanes across
            ;; the whole lane (sshr is an arithmetic right shift).
            (sign_mask Reg (sshr_vec_imm vec 31 (VectorSize.Size32x4)))
            ;; Bitwise-and with the mask
            ;; `0x00000008_00000004_00000002_00000001` to keep one
            ;; distinct bit in each of the 4 lanes.
            (bits Reg (and_vec sign_mask (constant_f128 0x00000008_00000004_00000002_00000001) (VectorSize.Size32x4)))
            ;; Add lanes together to get the 4 MSBs in the low byte.
            (sum Reg (addv bits (VectorSize.Size32x4))))
        (mov_from_vec sum 0 (ScalarSize.Size32))))

(rule (lower (vhigh_bits _ vec @ (value_type $I64X2)))
      (let (
            ;; Move both 64-bit lanes out of the vector, bring the MSB of
            ;; each down to bit 0 with a logical right shift by 63, then
            ;; add the lower lane's bit to the upper lane's bit shifted
            ;; left by one, so that the upper lane's MSB ends up in bit 1.
            (hi_lane Reg (mov_from_vec vec 1 (ScalarSize.Size64)))
            (lo_lane Reg (mov_from_vec vec 0 (ScalarSize.Size64)))
            (hi_bit Reg (lsr_imm $I64 hi_lane (imm_shift_from_u8 63)))
            (lo_bit Reg (lsr_imm $I64 lo_lane (imm_shift_from_u8 63))))
        (add_shift $I64 lo_bit hi_bit (lshl_from_u64 $I64 1))))

;;; Rules for `uadd_overflow_trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; A flag-setting add (`add_with_flags_paired`) handed to `trap_if_overflow`
;; together with the trap code.
(rule (lower (has_type (fits_in_64 ty) (uadd_overflow_trap _ a b tc)))
      (trap_if_overflow (add_with_flags_paired ty a b) tc))

;;;; Helpers for `*_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Put a narrow value into a register, sign- or zero-extended depending on the
;; `ArgumentExtension`. Only `Sext` and `Uext` are handled.
(decl put_in_reg_ext32 (Value ArgumentExtension) Reg)
(rule (put_in_reg_ext32 val (ArgumentExtension.Sext))
      (put_in_reg_sext32 val))
(rule (put_in_reg_ext32 val (ArgumentExtension.Uext))
      (put_in_reg_zext32 val))

;; For narrow values emit a normal op with both arguments zero/sign extended.
;; Then check if the output is the same as itself zero/sign extended from the narrower width.
(decl overflow_op_small (Type Value Value ArgumentExtension ALUOp) InstOutput)
(rule (overflow_op_small ty lhs rhs ext_kind op)
      (let ((extend ExtendOp (lower_extend_op ty ext_kind))

            ;; Only one operand is extended by a separate instruction; the
            ;; other is extended as part of the ALU op itself:
            ;;
            ;;   {u,s}xt{b,h}  lhs_ext, lhs
            ;;   op            out, lhs_ext, rhs, {u,s}xt{b,h}
            ;;   cmp           out, out, {u,s}xt{b,h}
            ;;   cset          out_of, ne
            (lhs_ext Reg (put_in_reg_ext32 lhs ext_kind))
            (out Reg (alu_rrr_extend op ty lhs_ext rhs extend))
            (out_of Reg (with_flags_reg
                          (cmp_extend (OperandSize.Size32) out out extend)
                          (cset (Cond.Ne)))))
        (output_pair (value_reg out) (value_reg out_of))))

;; For register-sized ops just emit an op+cset, without further masking.
;;
;;   op    out, a, b
;;   cset  out_of, cond
;;
;; Conditions expected:
;;   Hs: carry set, unsigned overflow; Vs: signed over-/underflow;
;;   Lo: carry clear, meaning no unsigned overflow.
;;   (Subtraction is implemented on aarch64 as an add of the two's complement
;;   value, so a subtraction overflows exactly when that add does not.)
(decl overflow_op_normal (Type Value Value ALUOp Cond) InstOutput)
(rule (overflow_op_normal ty lhs rhs op cc)
      (let ((both ValueRegs
                  (with_flags (alu_rrr_with_flags_paired ty lhs rhs op)
                              (cset_paired cc))))
        (output_pair (value_regs_get both 0)
                     (value_regs_get both 1))))

;; For 128-bit integers emit, for example, adds+adcs+cset.
(decl overflow_op_128 (Value Value ALUOp ALUOp Cond) InstOutput)
(rule (overflow_op_128 lhs rhs lo_op hi_op cc)
      (let (;; Low and high registers of the left operand.
            (lhs_regs ValueRegs lhs)
            (lhs_lo Reg (value_regs_get lhs_regs 0))
            (lhs_hi Reg (value_regs_get lhs_regs 1))

            ;; Low and high registers of the right operand.
            (rhs_regs ValueRegs rhs)
            (rhs_lo Reg (value_regs_get rhs_regs 0))
            (rhs_hi Reg (value_regs_get rhs_regs 1))

            ;; The plain `with_flags` helper cannot be used here; the three
            ;; instructions are chained explicitly instead: the low-half op
            ;; produces flags, the high-half op consumes and produces them,
            ;; and the `cset` consumes them.
            (lo_inst ProducesFlags (alu_rrr_with_flags_paired $I64 lhs_lo rhs_lo lo_op))
            (hi_inst ConsumesAndProducesFlags (alu_rrr_with_flags_chained $I64 lhs_hi rhs_hi hi_op))
            (of_inst ConsumesFlags (cset_paired cc))

            (chained MultiReg (with_flags_chained lo_inst hi_inst of_inst)))
        (multi_reg_to_pair_and_single chained)))

;;;; Rules for `uadd_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; For values smaller than a register, we do a normal `add` with both arguments
;; zero extended. We then check if the output is the same as itself zero extended.
(rule 1 (lower (has_type (fits_in_16 ty) (uadd_overflow _ a b)))
      (overflow_op_small ty a b (ArgumentExtension.Uext) (ALUOp.Add)))

;; For register sized add's we just emit a adds+cset, without further masking.
3031(rule 2 (lower (has_type (ty_32_or_64 ty) (uadd_overflow _ a b))) 3032 (overflow_op_normal ty a b (ALUOp.AddS) (Cond.Hs))) 3033 3034;; For 128bit integers we emit add+adcs+cset 3035(rule 0 (lower (has_type $I128 (uadd_overflow _ x y))) 3036 (overflow_op_128 x y (ALUOp.AddS) (ALUOp.AdcS) (Cond.Hs))) 3037 3038;;;; Rules for `sadd_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3039 3040;; sxt{b,h} a_ext, a 3041;; add out, a_ext, b, sxt{b,h} 3042;; cmp out, out, sxt{b,h} 3043;; cset of, ne 3044(rule 1 (lower (has_type (fits_in_16 ty) (sadd_overflow _ a b))) 3045 (overflow_op_small ty a b (ArgumentExtension.Sext) (ALUOp.Add))) 3046 3047;; adds a, b 3048;; cset of, vs 3049(rule 2 (lower (has_type (ty_32_or_64 ty) (sadd_overflow _ a b))) 3050 (overflow_op_normal ty a b (ALUOp.AddS) (Cond.Vs))) 3051 3052;; adds x_lo, y_lo 3053;; addcs x_hi, y_hi 3054;; cset of, vs 3055(rule 0 (lower (has_type $I128 (sadd_overflow _ x y))) 3056 (overflow_op_128 x y (ALUOp.AddS) (ALUOp.AdcS) (Cond.Vs))) 3057 3058;;;; Rules for `usub_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3059 3060;; uxt{b,h} a_ext, a 3061;; sub out, a_ext, b, ext{b,h} 3062;; cmp out, out, uxt{b,h} 3063;; cset of, ne 3064(rule 1 (lower (has_type (fits_in_16 ty) (usub_overflow _ a b))) 3065 (overflow_op_small ty a b (ArgumentExtension.Uext) (ALUOp.Sub))) 3066 3067;; subs a, b 3068;; cset of, lo 3069(rule 2 (lower (has_type (ty_32_or_64 ty) (usub_overflow _ a b))) 3070 (overflow_op_normal ty a b (ALUOp.SubS) (Cond.Lo))) 3071 3072;; subs x_lo, y_lo 3073;; sbcs x_hi, y_hi 3074;; cset of, lo 3075(rule 0 (lower (has_type $I128 (usub_overflow _ x y))) 3076 (overflow_op_128 x y (ALUOp.SubS) (ALUOp.SbcS) (Cond.Lo))) 3077 3078;;;; Rules for `ssub_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3079 3080;; sxt{b,h} a_ext, a 3081;; sub out, a_ext, b, sxt{b,h} 3082;; cmp out, out, sxt{b,h} 3083;; cset of, ne 3084(rule 1 (lower (has_type (fits_in_16 ty) 
(ssub_overflow _ a b))) 3085 (overflow_op_small ty a b (ArgumentExtension.Sext) (ALUOp.Sub))) 3086 3087;; subs a, b 3088;; cset of, vs 3089(rule 2 (lower (has_type (ty_32_or_64 ty) (ssub_overflow _ a b))) 3090 (overflow_op_normal ty a b (ALUOp.SubS) (Cond.Vs))) 3091 3092;; subs x_lo, y_lo 3093;; sbcs x_hi, y_hi 3094;; cset of, vs 3095(rule 0 (lower (has_type $I128 (ssub_overflow _ x y))) 3096 (overflow_op_128 x y (ALUOp.SubS) (ALUOp.SbcS) (Cond.Vs))) 3097 3098;;;; Rules for `umul_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3099 3100;; uxt{b,h} a_ext, a 3101;; uxt{b,h} b_ext, b 3102;; mul out, a_ext, b_ext 3103;; cmp out, out, uxt{b,h} 3104;; cset of, ne 3105(rule 1 (lower (has_type (fits_in_16 ty) (umul_overflow _ a b))) 3106 (let ((extend ExtendOp (lower_extend_op ty (ArgumentExtension.Uext))) 3107 3108 (a_uext Reg (put_in_reg_zext32 a)) 3109 (b_uext Reg (put_in_reg_zext32 b)) 3110 (out Reg (madd ty a_uext b_uext (zero_reg))) 3111 (out_of Reg (with_flags_reg 3112 (cmp_extend (OperandSize.Size32) out out extend) 3113 (cset (Cond.Ne))))) 3114 (output_pair 3115 (value_reg out) 3116 (value_reg out_of)))) 3117 3118;; umull out, a, b 3119;; cmp out, out, uxtw 3120;; cset of, ne 3121(rule 2 (lower (has_type $I32 (umul_overflow _ a b))) 3122 (let ( 3123 (out Reg (umaddl a b (zero_reg))) 3124 (out_of Reg (with_flags_reg 3125 (cmp_extend (OperandSize.Size64) out out (ExtendOp.UXTW)) 3126 (cset (Cond.Ne))))) 3127 (output_pair 3128 (value_reg out) 3129 (value_reg out_of)))) 3130 3131;; mul out, a, b 3132;; umulh tmp, a, b 3133;; cmp tmp, #0 3134;; cset of, ne 3135(rule 2 (lower (has_type $I64 (umul_overflow _ a b))) 3136 (let ( 3137 (out Reg (madd $I64 a b (zero_reg))) 3138 (tmp Reg (umulh $I64 a b)) 3139 (out_of Reg (with_flags_reg 3140 (cmp64_imm tmp (u8_into_imm12 0)) 3141 (cset (Cond.Ne))))) 3142 (output_pair 3143 (value_reg out) 3144 (value_reg out_of)))) 3145 3146;;;; Rules for `smul_overflow` 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 8/16-bit signed multiply. Both operands are sign-extended to 32 bits and
;; multiplied; the product is then compared against itself sign-extended from
;; the narrow type, and any difference is reported as overflow.
;;
;;   sxt{b,h}  a_ext, a
;;   sxt{b,h}  b_ext, b
;;   mul       out, a_ext, b_ext
;;   cmp       out, out, sxt{b,h}
;;   cset      of, ne
(rule 1 (lower (has_type (fits_in_16 ty) (smul_overflow _ lhs rhs)))
      (let ((narrow_ext ExtendOp (lower_extend_op ty (ArgumentExtension.Sext)))
            (lhs_sx Reg (put_in_reg_sext32 lhs))
            (rhs_sx Reg (put_in_reg_sext32 rhs))
            (product Reg (madd ty lhs_sx rhs_sx (zero_reg)))
            (overflowed Reg (with_flags_reg
                              (cmp_extend (OperandSize.Size32) product product narrow_ext)
                              (cset (Cond.Ne)))))
        (output_pair
          (value_reg product)
          (value_reg overflowed))))

;; 32-bit signed multiply: form the product with a widening multiply-add onto
;; the zero register, then compare it (as a 64-bit value) against its own
;; sign-extended low word.
;;
;;   smull  out, a, b
;;   cmp    out, out, sxtw
;;   cset   of, ne
(rule 2 (lower (has_type $I32 (smul_overflow _ lhs rhs)))
      (let ((product Reg (smaddl lhs rhs (zero_reg)))
            (overflowed Reg (with_flags_reg
                              (cmp_extend (OperandSize.Size64) product product (ExtendOp.SXTW))
                              (cset (Cond.Ne)))))
        (output_pair
          (value_reg product)
          (value_reg overflowed))))

;; 64-bit signed multiply: compute the low and high halves of the product
;; separately, then compare the high half with the low half arithmetically
;; shifted right by 63; a mismatch is reported as overflow.
;;
;;   mul    out, a, b
;;   smulh  tmp, a, b
;;   cmp    tmp, out, ASR #63
;;   cset   of, ne
(rule 2 (lower (has_type $I64 (smul_overflow _ lhs rhs)))
      (let ((product Reg (madd $I64 lhs rhs (zero_reg)))
            (high Reg (smulh $I64 lhs rhs))
            (overflowed Reg (with_flags_reg
                              (cmp_rr_shift_asr (OperandSize.Size64) high product 63)
                              (cset (Cond.Ne)))))
        (output_pair
          (value_reg product)
          (value_reg overflowed))))

;;; Rules for `tls_value` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The lowering is chosen by the configured TLS model; each model hands the
;; symbol name to its own address-materialization helper.

;; ELF general-dynamic model.
(rule (lower (has_type (tls_model (TlsModel.ElfGd))
                       (tls_value _ (symbol_value_data sym _ _))))
      (elf_tls_get_addr sym))

;; Mach-O model.
(rule (lower (has_type (tls_model (TlsModel.Macho))
                       (tls_value _ (symbol_value_data sym _ _))))
      (macho_tls_get_addr sym))

;;; Rules for `fvpromote_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Lowered to a single long-form vector op (`Fcvtl32`).
;; NOTE(review): the trailing `false` is presumably the "operate on the high
;; half" flag of `vec_rr_long` -- confirm against its declaration.
(rule (lower (fvpromote_low _ src))
      (vec_rr_long (VecRRLongOp.Fcvtl32) src false))

;;; Rules for `brif` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `brif` following `icmp`: lower the comparison straight into flags and
;; branch on the resulting condition code, without materializing the boolean.
(rule (lower_branch (brif (maybe_uextend (icmp _ cc lhs @ (value_type ty) rhs)) _ _)
                    (two_targets taken not_taken))
      (let ((flags_cc FlagsAndCC (lower_icmp_into_flags cc lhs rhs ty))
            (branch_cond Cond (cond_code (flags_and_cc_cc flags_cc))))
        (emit_side_effect
          (with_flags_side_effect (flags_and_cc_flags flags_cc)
            (cond_br taken not_taken (cond_br_cond branch_cond))))))

;; `brif` following `fcmp`: scalar float compare, then branch on the
;; floating-point condition code.
(rule (lower_branch (brif (maybe_uextend (fcmp _ cc lhs @ (value_type (ty_scalar_float ty)) rhs)) _ _)
                    (two_targets taken not_taken))
      (let ((branch_cond Cond (fp_cond_code cc)))
        (emit_side_effect
          (with_flags_side_effect (fpu_cmp (scalar_size ty) lhs rhs)
            (cond_br taken not_taken (cond_br_cond branch_cond))))))

;; standard `brif`

;; 128-bit condition: OR the two 64-bit halves together and branch if the
;; combined value is nonzero.
(rule -1 (lower_branch (brif c @ (value_type $I128) _ _) (two_targets taken not_taken))
      (let ((flags ProducesFlags (flags_to_producesflags c))
            (c_regs ValueRegs (put_in_regs c))
            (c_lo Reg (value_regs_get c_regs 0))
            (c_hi Reg (value_regs_get c_regs 1))
            (any_bits Reg (orr $I64 c_lo c_hi)))
        (emit_side_effect
          (with_flags_side_effect flags
            (cond_br taken not_taken
                     (cond_br_not_zero any_bits (operand_size $I64)))))))

;; Scalar condition of at most 64 bits: zero-extend to 64 bits and branch if
;; the value is nonzero.
(rule -2 (lower_branch (brif c @ (value_type ty) _ _) (two_targets taken not_taken))
      (if (ty_int_ref_scalar_64 ty))
      (let ((flags ProducesFlags (flags_to_producesflags c))
            (c_zx Reg (put_in_reg_zext64 c)))
        (emit_side_effect
          (with_flags_side_effect flags
            (cond_br taken not_taken
                     (cond_br_not_zero c_zx (operand_size $I64)))))))

;; Special lowerings for `tbnz` - "Test bit and Branch if Nonzero"
;;
;; Applies to `brif (band x, const)` when `test_and_compare_bit_const` yields
;; a bit index for the constant mask.
(rule 1 (lower_branch (brif (band _ x @ (value_type ty) (u64_from_iconst mask)) _ _)
                      (two_targets taken not_taken))
      (if-let bit (test_and_compare_bit_const ty mask))
      (emit_side_effect (tbnz taken not_taken x bit)))

;; Special lowering for `tbz` - "Test bit and Branch if Zero"
;;
;; Applies to `brif (icmp eq (band x, const), 0)` when
;; `test_and_compare_bit_const` yields a bit index for the constant mask.
(rule 1 (lower_branch (brif (icmp _ (IntCC.Equal)
                                  (band _ x @ (value_type (fits_in_64 ty))
                                        (u64_from_iconst mask))
                                  (u64_from_iconst 0)) _ _)
                      (two_targets taken not_taken))
      (if-let bit (test_and_compare_bit_const ty mask))
      (emit_side_effect (tbz taken not_taken x bit)))

;; Externally implemented, partial helper: given a type and a 64-bit constant,
;; it may produce the `u8` bit index used by the `tbz`/`tbnz` rules.
;; NOTE(review): presumably it succeeds only when the constant has exactly one
;; set bit that lies within the type's width -- confirm in the extern impl.
(decl pure partial test_and_compare_bit_const (Type u64) u8)
(extern constructor test_and_compare_bit_const test_and_compare_bit_const)

;;; Rules for `jump` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Unconditional jump to the single provided target.
(rule (lower_branch (jump _) (single_target target))
      (emit_side_effect (aarch64_jump target)))

;;; Rules for `br_table` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `targets` contains the default target with the list of branch targets
;; concatenated.
;;
;; An island sized by `targets_jt_space` is emitted first; the index is then
;; zero-extended to 32 bits and handed to `br_table_impl` together with the
;; table size.
(rule (lower_branch (br_table idx _) (jump_table_targets default targets))
      (let ((table_len u32 (jump_table_size targets))
            (_ InstOutput (side_effect
                            (emit_island (targets_jt_space targets))))
            (index_reg Reg (put_in_reg_zext32 idx)))
        (br_table_impl table_len index_reg default targets)))

;; Rules for `get_exception_handler_address` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Look up the label of the block's exception successor selected by `idx` and
;; produce that label's address.
(rule (lower (get_exception_handler_address _ (u64_from_imm64 idx) block))
      (let ((handler_label MachLabel (block_exn_successor_label block idx)))
        (a64_label_address handler_label)))

;; Rules for `sequence_point` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Emitted purely for its side effect; produces no result value.
(rule (lower (sequence_point))
      (side_effect (a64_sequence_point)))
