; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s

define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vuzp.8 d17, d16
; CHECK-NEXT:    vmul.i8 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %tmp5 = mul <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <16 x i8> @vuzpi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_Qres:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vuzp.8 d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <16 x i8> %tmp3
}

define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpi16:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vuzp.16 d17, d16
; CHECK-NEXT:    vmul.i16 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp5 = mul <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

define <8 x i16> @vuzpi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpi16_Qres:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vuzp.16 d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i16> %tmp3
}

; VUZP.32 is equivalent to VTRN.32 for 64-bit vectors.

define <16 x i8> @vuzpQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpQi8:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.8 q9, q8
; CHECK-NEXT:    vadd.i8 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %tmp5 = add <16 x i8> %tmp3, %tmp4
  ret <16 x i8> %tmp5
}

define <32 x i8> @vuzpQi8_QQres(<16 x i8>* %A, <16 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpQi8_QQres:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.8 q9, q8
; CHECK-NEXT:    vst1.8 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = load <16 x i8>, <16 x i8>* %B
  %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  ret <32 x i8> %tmp3
}

define <8 x i16> @vuzpQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vadd.i16 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <16 x i16> @vuzpQi16_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16_QQres:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vst1.16 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <16 x i16> %tmp3
}

define <4 x i32> @vuzpQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vuzpQi32:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vadd.i32 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <8 x i32> @vuzpQi32_QQres(<4 x i32>* %A, <4 x i32>* %B) nounwind {
; CHECK-LABEL: vuzpQi32_QQres:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vst1.32 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = load <4 x i32>, <4 x i32>* %B
  %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i32> %tmp3
}

define <4 x float> @vuzpQf(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vuzpQf:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vadd.f32 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp5 = fadd <4 x float> %tmp3, %tmp4
  ret <4 x float> %tmp5
}

define <8 x float> @vuzpQf_QQres(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: vuzpQf_QQres:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.32 q9, q8
; CHECK-NEXT:    vst1.32 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = load <4 x float>, <4 x float>* %B
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
  ret <8 x float> %tmp3
}

; Undef shuffle indices should not prevent matching to VUZP:

define <8 x i8> @vuzpi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_undef:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vuzp.8 d17, d16
; CHECK-NEXT:    vmul.i8 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
  %tmp5 = mul <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

define <16 x i8> @vuzpi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vuzpi8_undef_Qres:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vuzp.8 d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = load <8 x i8>, <8 x i8>* %B
  %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
  ret <16 x i8> %tmp3
}

define <8 x i16> @vuzpQi16_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16_undef:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vadd.i16 q8, q9, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14>
  %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 undef, i32 undef, i32 11, i32 13, i32 15>
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <16 x i16> @vuzpQi16_undef_QQres(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK-LABEL: vuzpQi16_undef_QQres:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r2]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r1]
; CHECK-NEXT:    vuzp.16 q9, q8
; CHECK-NEXT:    vst1.16 {d18, d19}, [r0:128]!
; CHECK-NEXT:    vst1.64 {d16, d17}, [r0:128]
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = load <8 x i16>, <8 x i16>* %B
  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <16 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 undef, i32 undef, i32 11, i32 13, i32 15>
  ret <16 x i16> %tmp3
}

define <8 x i16> @vuzp_lower_shufflemask_undef(<4 x i16>* %A, <4 x i16>* %B) {
; CHECK-LABEL: vuzp_lower_shufflemask_undef:
; CHECK:       @ BB#0: @ %entry
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vuzp.16 q8, q9
; CHECK-NEXT:    vmov r0, r1, d18
; CHECK-NEXT:    vmov r2, r3, d19
; CHECK-NEXT:    mov pc, lr
entry:
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = load <4 x i16>, <4 x i16>* %B
  %0 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
  ret <8 x i16> %0
}

define <4 x i32> @vuzp_lower_shufflemask_zeroed(<2 x i32>* %A, <2 x i32>* %B) {
; CHECK-LABEL: vuzp_lower_shufflemask_zeroed:
; CHECK:       @ BB#0: @ %entry
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vdup.32 q9, d16[0]
; CHECK-NEXT:    vuzp.32 q8, q9
; CHECK-NEXT:    vext.32 q8, q9, q9, #2
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
entry:
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 0, i32 0, i32 1, i32 3>
  ret <4 x i32> %0
}

define void @vuzp_rev_shufflemask_vtrn(<2 x i32>* %A, <2 x i32>* %B, <4 x i32>* %C) {
; CHECK-LABEL: vuzp_rev_shufflemask_vtrn:
; CHECK:       @ BB#0: @ %entry
; CHECK-NEXT:    vldr d17, [r1]
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vrev64.32 q9, q8
; CHECK-NEXT:    vuzp.32 q8, q9
; CHECK-NEXT:    vst1.64 {d18, d19}, [r2]
; CHECK-NEXT:    mov pc, lr
entry:
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = load <2 x i32>, <2 x i32>* %B
  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
  store <4 x i32> %0, <4 x i32>* %C
  ret void
}

define <8 x i8> @vuzp_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x i32> %cmp1) {
; In order to create the select we need to truncate the vcgt result from a vector of i32 to a vector of i8.
; This results in a build_vector with mismatched types. We will generate two vmovn.i32 instructions to
; truncate from i32 to i16 and one vuzp to perform the final truncation for i8.
; CHECK-LABEL: vuzp_trunc:
; CHECK:       @ BB#0:
; CHECK-NEXT:    .save {r4, r5, r11, lr}
; CHECK-NEXT:    push {r4, r5, r11, lr}
; CHECK-NEXT:    add r12, sp, #48
; CHECK-NEXT:    add lr, sp, #16
; CHECK-NEXT:    add r4, sp, #64
; CHECK-NEXT:    add r5, sp, #32
; CHECK-NEXT:    vld1.64 {d16, d17}, [r5]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r4]
; CHECK-NEXT:    vld1.64 {d20, d21}, [lr]
; CHECK-NEXT:    vld1.64 {d22, d23}, [r12]
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vcgt.u32 q9, q11, q10
; CHECK-NEXT:    vmovn.i32 d16, q8
; CHECK-NEXT:    vmovn.i32 d17, q9
; CHECK-NEXT:    vmov.i8 d18, #0x7
; CHECK-NEXT:    vmov d19, r0, r1
; CHECK-NEXT:    vuzp.8 d17, d16
; CHECK-NEXT:    vneg.s8 d16, d18
; CHECK-NEXT:    vshl.i8 d17, d17, #7
; CHECK-NEXT:    vmov d18, r2, r3
; CHECK-NEXT:    vshl.s8 d16, d17, d16
; CHECK-NEXT:    vbsl d16, d19, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    pop {r4, r5, r11, lr}
; CHECK-NEXT:    mov pc, lr
  %c = icmp ult <8 x i32> %cmp0, %cmp1
  %res = select <8 x i1> %c, <8 x i8> %in0, <8 x i8> %in1
  ret <8 x i8> %res
}

; Shuffle the result from the compare with a <4 x i8>.
; We need to extend the loaded <4 x i8> to <4 x i16>. Otherwise we wouldn't be able
; to perform the vuzp and get the vbsl mask.
define <8 x i8> @vuzp_trunc_and_shuffle(<8 x i8> %tr0, <8 x i8> %tr1,
; CHECK-LABEL: vuzp_trunc_and_shuffle:
; CHECK:       @ BB#0:
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    ldr r12, [sp, #40]
; CHECK-NEXT:    add lr, sp, #24
; CHECK-NEXT:    add r4, sp, #8
; CHECK-NEXT:    vld1.64 {d16, d17}, [r4]
; CHECK-NEXT:    vld1.64 {d18, d19}, [lr]
; CHECK-NEXT:    vld1.32 {d20[0]}, [r12:32]
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vmovn.i32 d16, q8
; CHECK-NEXT:    vmov.i8 d17, #0x7
; CHECK-NEXT:    vneg.s8 d17, d17
; CHECK-NEXT:    vmovl.u8 q9, d20
; CHECK-NEXT:    vuzp.8 d16, d18
; CHECK-NEXT:    vshl.i8 d16, d16, #7
; CHECK-NEXT:    vmov d18, r2, r3
; CHECK-NEXT:    vmov d19, r0, r1
; CHECK-NEXT:    vshl.s8 d16, d16, d17
; CHECK-NEXT:    vbsl d16, d19, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    pop {r4, lr}
; CHECK-NEXT:    mov pc, lr
                                        <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
  %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
  %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
  %c = shufflevector <4 x i1> %c0, <4 x i1> %cmp2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
  ret <8 x i8> %rv
}

; Use an undef value for the <4 x i8> that is being shuffled with the compare result.
; This produces a build_vector with some of the operands undefs.
define <8 x i8> @vuzp_trunc_and_shuffle_undef_right(<8 x i8> %tr0, <8 x i8> %tr1,
; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_right:
; CHECK:       @ BB#0:
; CHECK-NEXT:    .save {r11, lr}
; CHECK-NEXT:    push {r11, lr}
; CHECK-NEXT:    add r12, sp, #24
; CHECK-NEXT:    add lr, sp, #8
; CHECK-NEXT:    vld1.64 {d16, d17}, [lr]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vmov d19, r0, r1
; CHECK-NEXT:    vmovn.i32 d16, q8
; CHECK-NEXT:    vmov.i8 d17, #0x7
; CHECK-NEXT:    vuzp.8 d16, d18
; CHECK-NEXT:    vneg.s8 d17, d17
; CHECK-NEXT:    vshl.i8 d16, d16, #7
; CHECK-NEXT:    vmov d18, r2, r3
; CHECK-NEXT:    vshl.s8 d16, d16, d17
; CHECK-NEXT:    vbsl d16, d19, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    pop {r11, lr}
; CHECK-NEXT:    mov pc, lr
                                        <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
  %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
  %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
  %c = shufflevector <4 x i1> %c0, <4 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
  ret <8 x i8> %rv
}

define <8 x i8> @vuzp_trunc_and_shuffle_undef_left(<8 x i8> %tr0, <8 x i8> %tr1,
; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_left:
; CHECK:       @ BB#0:
; CHECK-NEXT:    .save {r11, lr}
; CHECK-NEXT:    push {r11, lr}
; CHECK-NEXT:    add r12, sp, #24
; CHECK-NEXT:    add lr, sp, #8
; CHECK-NEXT:    vldr d20, .LCPI22_0
; CHECK-NEXT:    vld1.64 {d16, d17}, [lr]
; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vmov d18, r2, r3
; CHECK-NEXT:    vmov d19, r0, r1
; CHECK-NEXT:    vmovn.i32 d16, q8
; CHECK-NEXT:    vmov.i8 d17, #0x7
; CHECK-NEXT:    vtbl.8 d16, {d16}, d20
; CHECK-NEXT:    vneg.s8 d17, d17
; CHECK-NEXT:    vshl.i8 d16, d16, #7
; CHECK-NEXT:    vshl.s8 d16, d16, d17
; CHECK-NEXT:    vbsl d16, d19, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    pop {r11, lr}
; CHECK-NEXT:    mov pc, lr
; CHECK-NEXT:    .p2align 3
; CHECK-NEXT:  @ BB#1:
; CHECK-NEXT:  .LCPI22_0:
; CHECK-NEXT:    .byte 255 @ 0xff
; CHECK-NEXT:    .byte 255 @ 0xff
; CHECK-NEXT:    .byte 255 @ 0xff
; CHECK-NEXT:    .byte 255 @ 0xff
; CHECK-NEXT:    .byte 0 @ 0x0
; CHECK-NEXT:    .byte 2 @ 0x2
; CHECK-NEXT:    .byte 4 @ 0x4
; CHECK-NEXT:    .byte 6 @ 0x6
                                        <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
  %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
  %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
  %c0 = icmp ult <4 x i32> %cmp0, %cmp1
  %c = shufflevector <4 x i1> undef, <4 x i1> %c0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
  ret <8 x i8> %rv
}

; We're using large data types here, and we have to fill with undef values until we
; get some vector size that we can represent.
define <10 x i8> @vuzp_wide_type(<10 x i8> %tr0, <10 x i8> %tr1,
; CHECK-LABEL: vuzp_wide_type:
; CHECK:       @ BB#0:
; CHECK-NEXT:    .save {r4, r5, r6, r7, r11, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, r11, lr}
; CHECK-NEXT:    .setfp r11, sp, #16
; CHECK-NEXT:    add r11, sp, #16
; CHECK-NEXT:    .pad #8
; CHECK-NEXT:    sub sp, sp, #8
; CHECK-NEXT:    bic sp, sp, #15
; CHECK-NEXT:    add r5, r11, #52
; CHECK-NEXT:    add r7, r11, #32
; CHECK-NEXT:    add r4, r11, #44
; CHECK-NEXT:    add r6, r11, #24
; CHECK-NEXT:    add r12, r11, #60
; CHECK-NEXT:    add lr, r11, #40
; CHECK-NEXT:    vld1.32 {d17[0]}, [r7:32]
; CHECK-NEXT:    vld1.32 {d19[0]}, [r5:32]
; CHECK-NEXT:    vld1.32 {d22[0]}, [r12:32]
; CHECK-NEXT:    ldr r12, [r11, #64]
; CHECK-NEXT:    vld1.32 {d20[0]}, [lr:32]
; CHECK-NEXT:    add r7, r11, #48
; CHECK-NEXT:    add r5, r11, #28
; CHECK-NEXT:    vld1.32 {d16[0]}, [r6:32]
; CHECK-NEXT:    vld1.32 {d18[0]}, [r4:32]
; CHECK-NEXT:    add r6, r11, #56
; CHECK-NEXT:    add r4, r11, #36
; CHECK-NEXT:    vcgt.u32 q10, q11, q10
; CHECK-NEXT:    vld1.32 {d19[1]}, [r6:32]
; CHECK-NEXT:    vld1.32 {d17[1]}, [r4:32]
; CHECK-NEXT:    add r6, r12, #4
; CHECK-NEXT:    vld1.32 {d18[1]}, [r7:32]
; CHECK-NEXT:    vld1.32 {d16[1]}, [r5:32]
; CHECK-NEXT:    ldr r7, [r12]
; CHECK-NEXT:    vcgt.u32 q8, q9, q8
; CHECK-NEXT:    vmovn.i32 d18, q10
; CHECK-NEXT:    vmov.32 d21[0], r7
; CHECK-NEXT:    vmovn.i32 d16, q8
; CHECK-NEXT:    vmov.u8 r7, d21[3]
; CHECK-NEXT:    vmov.i8 d17, #0x7
; CHECK-NEXT:    vuzp.8 d16, d18
; CHECK-NEXT:    vmov.8 d23[0], r7
; CHECK-NEXT:    vneg.s8 d17, d17
; CHECK-NEXT:    add r7, r11, #8
; CHECK-NEXT:    vldr d18, .LCPI23_0
; CHECK-NEXT:    vld1.8 {d23[1]}, [r6]
; CHECK-NEXT:    vshl.i8 d16, d16, #7
; CHECK-NEXT:    vshl.s8 d20, d16, d17
; CHECK-NEXT:    vmov.i8 q8, #0x7
; CHECK-NEXT:    vneg.s8 q8, q8
; CHECK-NEXT:    vtbl.8 d22, {d20, d21}, d18
; CHECK-NEXT:    vld1.64 {d18, d19}, [r7]
; CHECK-NEXT:    vshl.i8 q10, q11, #7
; CHECK-NEXT:    vmov d23, r2, r3
; CHECK-NEXT:    vmov d22, r0, r1
; CHECK-NEXT:    vshl.s8 q8, q10, q8
; CHECK-NEXT:    vbsl q8, q11, q9
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    sub sp, r11, #16
; CHECK-NEXT:    pop {r4, r5, r6, r7, r11, lr}
; CHECK-NEXT:    mov pc, lr
; CHECK-NEXT:    .p2align 3
; CHECK-NEXT:  @ BB#1:
; CHECK-NEXT:  .LCPI23_0:
; CHECK-NEXT:    .byte 0 @ 0x0
; CHECK-NEXT:    .byte 1 @ 0x1
; CHECK-NEXT:    .byte 2 @ 0x2
; CHECK-NEXT:    .byte 3 @ 0x3
; CHECK-NEXT:    .byte 4 @ 0x4
; CHECK-NEXT:    .byte 8 @ 0x8
; CHECK-NEXT:    .byte 9 @ 0x9
; CHECK-NEXT:    .byte 10 @ 0xa
                                 <5 x i32> %cmp0, <5 x i32> %cmp1, <5 x i8> *%cmp2_ptr) {
  %cmp2_load = load <5 x i8>, <5 x i8> * %cmp2_ptr, align 4
  %cmp2 = trunc <5 x i8> %cmp2_load to <5 x i1>
  %c0 = icmp ult <5 x i32> %cmp0, %cmp1
  %c = shufflevector <5 x i1> %c0, <5 x i1> %cmp2, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
  %rv = select <10 x i1> %c, <10 x i8> %tr0, <10 x i8> %tr1
  ret <10 x i8> %rv
}

%struct.uint8x8x2_t = type { [2 x <8 x i8>] }
define %struct.uint8x8x2_t @vuzp_extract_subvector(<16 x i8> %t) #0 {
; CHECK-LABEL: vuzp_extract_subvector:
; CHECK:       @ BB#0:
; CHECK-NEXT:    vmov d17, r2, r3
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    vorr d18, d17, d17
; CHECK-NEXT:    vuzp.8 d16, d18
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d18
; CHECK-NEXT:    mov pc, lr

  %vuzp.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %vuzp1.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0
  %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
  ret %struct.uint8x8x2_t %.fca.0.1.insert
}