; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s

; rdar://12471808

define <8 x i8> @v_bsli8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: v_bsli8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d18, [r0]
; CHECK-NEXT:    vldr d16, [r2]
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vbit d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = load <8 x i8>, <8 x i8>* %C
  %tmp4 = and <8 x i8> %tmp1, %tmp2
  %tmp5 = xor <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
  %tmp6 = and <8 x i8> %tmp5, %tmp3
  %tmp7 = or <8 x i8> %tmp4, %tmp6
  ret <8 x i8> %tmp7
}

define <4 x i16> @v_bsli16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: v_bsli16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d18, [r0]
; CHECK-NEXT:    vldr d16, [r2]
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vbit d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = load <4 x i16>, <4 x i16>* %C
  %tmp4 = and <4 x i16> %tmp1, %tmp2
  %tmp5 = xor <4 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1 >
  %tmp6 = and <4 x i16> %tmp5, %tmp3
  %tmp7 = or <4 x i16> %tmp4, %tmp6
  ret <4 x i16> %tmp7
}

define <2 x i32> @v_bsli32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: v_bsli32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d18, [r0]
; CHECK-NEXT:    vldr d16, [r2]
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vbit d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %tmp3 = load <2 x i32>, <2 x i32>* %C
  %tmp4 = and <2 x i32> %tmp1, %tmp2
  %tmp5 = xor <2 x i32> %tmp1, < i32 -1, i32 -1 >
  %tmp6 = and <2 x i32> %tmp5, %tmp3
  %tmp7 = or <2 x i32> %tmp4, %tmp6
  ret <2 x i32> %tmp7
}

define <1 x i64> @v_bsli64(<1 x i64>* %A, <1 x i64>* %B, <1 x i64>* %C) nounwind {
; CHECK-LABEL: v_bsli64:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d18, [r0]
; CHECK-NEXT:    vldr d16, [r2]
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vbit d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <1 x i64>, <1 x i64>* %A
  %tmp2 = load <1 x i64>, <1 x i64>* %B
  %tmp3 = load <1 x i64>, <1 x i64>* %C
  %tmp4 = and <1 x i64> %tmp1, %tmp2
  %tmp5 = xor <1 x i64> %tmp1, < i64 -1 >
  %tmp6 = and <1 x i64> %tmp5, %tmp3
  %tmp7 = or <1 x i64> %tmp4, %tmp6
  ret <1 x i64> %tmp7
}
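
; A note on why VBIT rather than VBSL is checked above (background, not part
; of the original test): VBSL wants the select mask in the destination
; register (Vd = (Vd & Vn) | (~Vd & Vm)), while VBIT takes it in the last
; operand (Vd = (Vm & Vn) | (~Vm & Vd)). Here %C is loaded into the
; destination and the mask %A into the last operand, so VBIT implements the
; same (A & B) | (~A & C) select. Worked example of the identity with
; illustrative byte values: A = 0xF0, B = 0xAA, C = 0x55 gives
; (0xF0 & 0xAA) | (0x0F & 0x55) = 0xA0 | 0x05 = 0xA5, i.e. the high nibble
; comes from B and the low nibble from C. The Q-register variants below
; repeat the same pattern on 128-bit vectors.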

define <16 x i8> @v_bslQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
; CHECK-LABEL: v_bslQi8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vbit q8, q9, q10
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = load <16 x i8>, <16 x i8>* %C
  %tmp4 = and <16 x i8> %tmp1, %tmp2
  %tmp5 = xor <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
  %tmp6 = and <16 x i8> %tmp5, %tmp3
  %tmp7 = or <16 x i8> %tmp4, %tmp6
  ret <16 x i8> %tmp7
}

define <8 x i16> @v_bslQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
; CHECK-LABEL: v_bslQi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vbit q8, q9, q10
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = load <8 x i16>, <8 x i16>* %C
  %tmp4 = and <8 x i16> %tmp1, %tmp2
  %tmp5 = xor <8 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
  %tmp6 = and <8 x i16> %tmp5, %tmp3
  %tmp7 = or <8 x i16> %tmp4, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @v_bslQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
; CHECK-LABEL: v_bslQi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vbit q8, q9, q10
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = load <4 x i32>, <4 x i32>* %C
  %tmp4 = and <4 x i32> %tmp1, %tmp2
  %tmp5 = xor <4 x i32> %tmp1, < i32 -1, i32 -1, i32 -1, i32 -1 >
  %tmp6 = and <4 x i32> %tmp5, %tmp3
  %tmp7 = or <4 x i32> %tmp4, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @v_bslQi64(<2 x i64>* %A, <2 x i64>* %B, <2 x i64>* %C) nounwind {
; CHECK-LABEL: v_bslQi64:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vbit q8, q9, q10
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = load <2 x i64>, <2 x i64>* %B
  %tmp3 = load <2 x i64>, <2 x i64>* %C
  %tmp4 = and <2 x i64> %tmp1, %tmp2
  %tmp5 = xor <2 x i64> %tmp1, < i64 -1, i64 -1 >
  %tmp6 = and <2 x i64> %tmp5, %tmp3
  %tmp7 = or <2 x i64> %tmp4, %tmp6
  ret <2 x i64> %tmp7
}
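
; The functions below call the @llvm.arm.neon.vbsl.* intrinsics directly
; instead of relying on the pattern match exercised above. For orientation,
; this IR is roughly what one would expect clang to emit for the C NEON
; intrinsics (a hypothetical sketch, not part of this test):
;
;   #include <arm_neon.h>
;   uint8x8_t f1_c(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
;     return vbsl_u8(a, b, c); // (a & b) | (~a & c): bits of a select b or c
;   }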

define <8 x i8> @f1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: f1:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [sp]
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vbit d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind
  ret <8 x i8> %vbsl.i
}

define <4 x i16> @f2(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: f2:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [sp]
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vbit d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind
  ret <4 x i16> %vbsl3.i
}

define <2 x i32> @f3(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: f3:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [sp]
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vbit d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %vbsl3.i = tail call <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind
  ret <2 x i32> %vbsl3.i
}

define <2 x float> @f4(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: f4:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [sp]
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vbit d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %vbsl4.i = tail call <2 x float> @llvm.arm.neon.vbsl.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind
  ret <2 x float> %vbsl4.i
}

define <16 x i8> @g1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: g1:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d19, r2, r3
; CHECK-NEXT:    add r12, sp, #16
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
; CHECK-NEXT:    vbit q8, q10, q9
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind
  ret <16 x i8> %vbsl.i
}

define <8 x i16> @g2(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: g2:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d19, r2, r3
; CHECK-NEXT:    add r12, sp, #16
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
; CHECK-NEXT:    vbit q8, q10, q9
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind
  ret <8 x i16> %vbsl3.i
}

define <4 x i32> @g3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: g3:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d19, r2, r3
; CHECK-NEXT:    add r12, sp, #16
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
; CHECK-NEXT:    vbit q8, q10, q9
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %vbsl3.i = tail call <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind
  ret <4 x i32> %vbsl3.i
}

define <4 x float> @g4(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: g4:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d19, r2, r3
; CHECK-NEXT:    add r12, sp, #16
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
; CHECK-NEXT:    vbit q8, q10, q9
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %vbsl4.i = tail call <4 x float> @llvm.arm.neon.vbsl.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind
  ret <4 x float> %vbsl4.i
}
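
; The remaining tests cover the 64-bit element widths. The signed and
; unsigned C-level intrinsics (e.g. vbsl_s64 vs. vbsl_u64, and the q forms)
; lower to the same @llvm.arm.neon.vbsl.v1i64/v2i64 call, which is why each
; pair below expects identical codegen.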

define <1 x i64> @test_vbsl_s64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: test_vbsl_s64:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [sp]
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vbit d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind
  ret <1 x i64> %vbsl3.i
}

define <1 x i64> @test_vbsl_u64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: test_vbsl_u64:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [sp]
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    vbit d16, d17, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind
  ret <1 x i64> %vbsl3.i
}

define <2 x i64> @test_vbslq_s64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: test_vbslq_s64:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d19, r2, r3
; CHECK-NEXT:    add r12, sp, #16
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
; CHECK-NEXT:    vbit q8, q10, q9
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind
  ret <2 x i64> %vbsl3.i
}

define <2 x i64> @test_vbslq_u64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
; CHECK-LABEL: test_vbslq_u64:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d19, r2, r3
; CHECK-NEXT:    add r12, sp, #16
; CHECK-NEXT:    vmov d18, r0, r1
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
; CHECK-NEXT:    vld1.64 {d20, d21}, [r0]
; CHECK-NEXT:    vbit q8, q10, q9
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind
  ret <2 x i64> %vbsl3.i
}

declare <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vbsl.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vbsl.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64>, <1 x i64>, <1 x i64>) nounwind readnone