; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s

; Each function below builds its result with a shufflevector whose mask takes
; a contiguous run of lanes from one source and the remaining lanes from the
; other, i.e. it places a small subvector inside a wider vector. In a
; two-operand shufflevector, mask indices 0..N-1 select lanes of the first
; operand and N..2N-1 select lanes of the second.
;
; The leading `float %tmp` parameter is never used in any body; presumably it
; exists to occupy v0 so that the vector arguments arrive in v1/v2, which is
; what the expected output shows - confirm.
; NOTE(review): the name suffixes look like <subvector length>_<1-based slot>,
; with _15 marking an offset that is not a multiple of the subvector length.
; insert_v16i8_2_6 (which targets .h[6]) does not fit that reading - confirm.

; i8

; Result bytes 0-1 = %a[0..1], bytes 2-15 = %b[2..15]; expected as a copy of
; %b plus one 16-bit lane move into .h[0].
define <16 x i8> @insert_v16i8_2_1(float %tmp, <16 x i8> %b, <16 x i8> %a) {
; CHECK-LABEL: insert_v16i8_2_1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.h[0], v2.h[0]
; CHECK-NEXT: ret
  %s2 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %s2
}

; Result bytes 2-3 = %a[0..1], all other bytes from %b; lane move into .h[1].
define <16 x i8> @insert_v16i8_2_2(float %tmp, <16 x i8> %b, <16 x i8> %a) {
; CHECK-LABEL: insert_v16i8_2_2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.h[1], v2.h[0]
; CHECK-NEXT: ret
  %s2 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 0, i32 1, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %s2
}

; Result bytes 12-13 = %a[0..1], all other bytes from %b; lane move into .h[6].
define <16 x i8> @insert_v16i8_2_6(float %tmp, <16 x i8> %b, <16 x i8> %a) {
; CHECK-LABEL: insert_v16i8_2_6:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.h[6], v2.h[0]
; CHECK-NEXT: ret
  %s2 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 0, i32 1, i32 30, i32 31>
  ret <16 x i8> %s2
}

; Result bytes 0-3 = %a[0..3], bytes 4-15 from %b; 32-bit lane move into .s[0].
define <16 x i8> @insert_v16i8_4_1(float %tmp, <16 x i8> %b, <16 x i8> %a) {
; CHECK-LABEL: insert_v16i8_4_1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.s[0], v2.s[0]
; CHECK-NEXT: ret
  %s2 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %s2
}

; Result bytes 2-5 = %a[0..3], all other bytes from %b. Byte offset 2 is not a
; multiple of the 4-byte subvector; the expected output is a tbl driven by a
; constant-pool index vector instead of a lane move.
define <16 x i8> @insert_v16i8_4_15(float %tmp, <16 x i8> %b, <16 x i8> %a) {
; CHECK-LABEL: insert_v16i8_4_15:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI4_0
; CHECK-NEXT: // kill: def $q2 killed $q2 def $q2_q3
; CHECK-NEXT: mov v3.16b, v1.16b
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI4_0]
; CHECK-NEXT: tbl v0.16b, { v2.16b, v3.16b }, v0.16b
; CHECK-NEXT: ret
  %s2 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 0, i32 1, i32 2, i32 3, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %s2
}

; Result bytes 4-7 = %a[0..3], all other bytes from %b; lane move into .s[1].
define <16 x i8> @insert_v16i8_4_2(float %tmp, <16 x i8> %b, <16 x i8> %a) {
; CHECK-LABEL: insert_v16i8_4_2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.s[1], v2.s[0]
; CHECK-NEXT: ret
  %s2 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %s2
}

; Result bytes 8-11 = %a[0..3], all other bytes from %b; lane move into .s[2].
define <16 x i8> @insert_v16i8_4_3(float %tmp, <16 x i8> %b, <16 x i8> %a) {
; CHECK-LABEL: insert_v16i8_4_3:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.s[2], v2.s[0]
; CHECK-NEXT: ret
  %s2 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %s2
}

; Result bytes 12-15 = %a[0..3], bytes 0-11 from %b; lane move into .s[3].
define <16 x i8> @insert_v16i8_4_4(float %tmp, <16 x i8> %b, <16 x i8> %a) {
; CHECK-LABEL: insert_v16i8_4_4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.s[3], v2.s[0]
; CHECK-NEXT: ret
  %s2 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i8> %s2
}

; 64-bit vectors: result = %a[0..3] followed by %b[4..7]. The expected output
; copies %a and moves %b's .s[1] into it.
define <8 x i8> @insert_v8i8_4_1(float %tmp, <8 x i8> %b, <8 x i8> %a) {
; CHECK-LABEL: insert_v8i8_4_1:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d0, d2
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: mov v0.s[1], v1.s[1]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
  %s2 = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i8> %s2
}

; 64-bit vectors: result = %b[0..3] followed by %a[0..3]. The expected output
; copies %b and moves %a's .s[0] into .s[1].
define <8 x i8> @insert_v8i8_4_2(float %tmp, <8 x i8> %b, <8 x i8> %a) {
; CHECK-LABEL: insert_v8i8_4_2:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d0, d1
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: mov v0.s[1], v2.s[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
  %s2 = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i8> %s2
}

; Result = %a[0..7] followed by %b[8..15]; expected as a copy of %a plus a
; move of %b's .d[1].
define <16 x i8> @insert_v16i8_8_1(float %tmp, <16 x i8> %b, <16 x i8> %a) {
; CHECK-LABEL: insert_v16i8_8_1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: mov v0.d[1], v1.d[1]
; CHECK-NEXT: ret
  %s2 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %s2
}

; Result = %b[0..7] followed by %a[0..7]; expected as a copy of %b plus a
; move of %a's .d[0] into .d[1].
define <16 x i8> @insert_v16i8_8_2(float %tmp, <16 x i8> %b, <16 x i8> %a) {
; CHECK-LABEL: insert_v16i8_8_2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: ret
  %s2 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i8> %s2
}

; i16

; Result elements 0-1 = %a[0..1], elements 2-7 from %b; lane move into .s[0].
define <8 x i16> @insert_v8i16_2_1(float %tmp, <8 x i16> %b, <8 x i16> %a) {
; CHECK-LABEL: insert_v8i16_2_1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.s[0], v2.s[0]
; CHECK-NEXT: ret
  %s2 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %s2
}

; Result elements 1-2 = %a[0..1], all other elements from %b. Element offset 1
; is not a multiple of the 2-element subvector; the expected output is a tbl
; driven by a constant-pool index vector.
define <8 x i16> @insert_v8i16_2_15(float %tmp, <8 x i16> %b, <8 x i16> %a) {
; CHECK-LABEL: insert_v8i16_2_15:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI13_0
; CHECK-NEXT: // kill: def $q2 killed $q2 def $q2_q3
; CHECK-NEXT: mov v3.16b, v1.16b
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI13_0]
; CHECK-NEXT: tbl v0.16b, { v2.16b, v3.16b }, v0.16b
; CHECK-NEXT: ret
  %s2 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 0, i32 1, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %s2
}

; Result elements 2-3 = %a[0..1], all other elements from %b; lane move into
; .s[1].
define <8 x i16> @insert_v8i16_2_2(float %tmp, <8 x i16> %b, <8 x i16> %a) {
; CHECK-LABEL: insert_v8i16_2_2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.s[1], v2.s[0]
; CHECK-NEXT: ret
  %s2 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 0, i32 1, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %s2
}

; Result elements 4-5 = %a[0..1], all other elements from %b; lane move into
; .s[2].
define <8 x i16> @insert_v8i16_2_3(float %tmp, <8 x i16> %b, <8 x i16> %a) {
; CHECK-LABEL: insert_v8i16_2_3:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.s[2], v2.s[0]
; CHECK-NEXT: ret
  %s2 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 14, i32 15>
  ret <8 x i16> %s2
}

; Result elements 6-7 = %a[0..1], elements 0-5 from %b; lane move into .s[3].
define <8 x i16> @insert_v8i16_2_4(float %tmp, <8 x i16> %b, <8 x i16> %a) {
; CHECK-LABEL: insert_v8i16_2_4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.s[3], v2.s[0]
; CHECK-NEXT: ret
  %s2 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 0, i32 1>
  ret <8 x i16> %s2
}

; 64-bit vectors: result = %a[0..1] followed by %b[2..3]. The expected output
; copies %a and moves %b's .s[1] into it.
define <4 x i16> @insert_v4i16_2_1(float %tmp, <4 x i16> %b, <4 x i16> %a) {
; CHECK-LABEL: insert_v4i16_2_1:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d0, d2
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: mov v0.s[1], v1.s[1]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
  %s2 = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x i16> %s2
}

; 64-bit vectors: result = %b[0..1] followed by %a[0..1]. The expected output
; copies %b and moves %a's .s[0] into .s[1].
define <4 x i16> @insert_v4i16_2_2(float %tmp, <4 x i16> %b, <4 x i16> %a) {
; CHECK-LABEL: insert_v4i16_2_2:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d0, d1
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: mov v0.s[1], v2.s[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
  %s2 = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x i16> %s2
}

; Result = %a[0..3] followed by %b[4..7]; copy of %a plus a move of %b's .d[1].
define <8 x i16> @insert_v8i16_4_1(float %tmp, <8 x i16> %b, <8 x i16> %a) {
; CHECK-LABEL: insert_v8i16_4_1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: mov v0.d[1], v1.d[1]
; CHECK-NEXT: ret
  %s2 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %s2
}

; Result = %b[0..3] followed by %a[0..3]; copy of %b plus a move of %a's .d[0]
; into .d[1].
define <8 x i16> @insert_v8i16_4_2(float %tmp, <8 x i16> %b, <8 x i16> %a) {
; CHECK-LABEL: insert_v8i16_4_2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: ret
  %s2 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i16> %s2
}

; i32

; Result = %a[0..1] followed by %b[2..3]; copy of %a plus a move of %b's .d[1].
define <4 x i32> @insert_v4i32_2_1(float %tmp, <4 x i32> %b, <4 x i32> %a) {
; CHECK-LABEL: insert_v4i32_2_1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: mov v0.d[1], v1.d[1]
; CHECK-NEXT: ret
  %s2 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x i32> %s2
}

; Result = %b[0..1] followed by %a[0..1]; copy of %b plus a move of %a's .d[0]
; into .d[1].
define <4 x i32> @insert_v4i32_2_2(float %tmp, <4 x i32> %b, <4 x i32> %a) {
; CHECK-LABEL: insert_v4i32_2_2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: ret
  %s2 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x i32> %s2
}



; The load_* functions use the same masks as above, but the inserted subvector
; is loaded through the pointer %a, widened to the full vector type with a
; first shufflevector (extra lanes undef), and then shuffled together with %b.

; i8

; Loaded 4 bytes -> result bytes 0-3; expected as a single-lane load into
; .s[0] of a copy of %b.
define <16 x i8> @load_v16i8_4_1(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
; CHECK-LABEL: load_v16i8_4_1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: ld1 { v0.s }[0], [x0]
; CHECK-NEXT: ret
  %l = load <4 x i8>, <4 x i8> *%a
  %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s2 = shufflevector <16 x i8> %s1, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %s2
}

; Loaded 4 bytes -> result bytes 2-5 (offset not a multiple of 4); expected as
; a scalar load followed by a tbl with a constant-pool index vector.
define <16 x i8> @load_v16i8_4_15(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
; CHECK-LABEL: load_v16i8_4_15:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI24_0
; CHECK-NEXT: // kill: def $q1 killed $q1 def $q0_q1
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI24_0]
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-NEXT: ret
  %l = load <4 x i8>, <4 x i8> *%a
  %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s2 = shufflevector <16 x i8> %s1, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 0, i32 1, i32 2, i32 3, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %s2
}

; Loaded 4 bytes -> result bytes 4-7; single-lane load into .s[1].
define <16 x i8> @load_v16i8_4_2(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
; CHECK-LABEL: load_v16i8_4_2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: ld1 { v0.s }[1], [x0]
; CHECK-NEXT: ret
  %l = load <4 x i8>, <4 x i8> *%a
  %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s2 = shufflevector <16 x i8> %s1, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %s2
}

; Loaded 4 bytes -> result bytes 8-11; single-lane load into .s[2].
define <16 x i8> @load_v16i8_4_3(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
; CHECK-LABEL: load_v16i8_4_3:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: ld1 { v0.s }[2], [x0]
; CHECK-NEXT: ret
  %l = load <4 x i8>, <4 x i8> *%a
  %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s2 = shufflevector <16 x i8> %s1, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %s2
}

; Loaded 4 bytes -> result bytes 12-15; single-lane load into .s[3].
define <16 x i8> @load_v16i8_4_4(float %tmp, <16 x i8> %b, <4 x i8> *%a) {
; CHECK-LABEL: load_v16i8_4_4:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: ld1 { v0.s }[3], [x0]
; CHECK-NEXT: ret
  %l = load <4 x i8>, <4 x i8> *%a
  %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s2 = shufflevector <16 x i8> %s1, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i8> %s2
}

; 64-bit result: loaded 4 bytes in the low half, %b[4..7] in the high half.
; The expected output loads into s0 and then moves %b's .s[1] into it.
define <8 x i8> @load_v8i8_4_1(float %tmp, <8 x i8> %b, <4 x i8> *%a) {
; CHECK-LABEL: load_v8i8_4_1:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: mov v0.s[1], v1.s[1]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
  %l = load <4 x i8>, <4 x i8> *%a
  %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %s2 = shufflevector <8 x i8> %s1, <8 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i8> %s2
}

; 64-bit result: %b[0..3] in the low half, loaded 4 bytes in the high half.
; The expected output uses a separate scalar load plus a lane move here, not a
; single-lane load.
define <8 x i8> @load_v8i8_4_2(float %tmp, <8 x i8> %b, <4 x i8> *%a) {
; CHECK-LABEL: load_v8i8_4_2:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d0, d1
; CHECK-NEXT: ldr s1, [x0]
; CHECK-NEXT: mov v0.s[1], v1.s[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
  %l = load <4 x i8>, <4 x i8> *%a
  %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %s2 = shufflevector <8 x i8> %s1, <8 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i8> %s2
}

; Loaded 8 bytes in the low half, %b[8..15] in the high half. The expected
; output rotates %b with ext so its high half can be moved from .d[0].
define <16 x i8> @load_v16i8_8_1(float %tmp, <16 x i8> %b, <8 x i8> *%a) {
; CHECK-LABEL: load_v16i8_8_1:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ret
  %l = load <8 x i8>, <8 x i8> *%a
  %s1 = shufflevector <8 x i8> %l, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s2 = shufflevector <16 x i8> %s1, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %s2
}

; %b[0..7] in the low half, 8 bytes loaded through %a in the high half; the
; expected output is a copy of %b, a 64-bit scalar load and a .d lane move.
define <16 x i8> @load_v16i8_8_2(float %tmp, <16 x i8> %b, <8 x i8> *%a) {
; CHECK-LABEL: load_v16i8_8_2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: ldr d1, [x0]
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ret
  %l = load <8 x i8>, <8 x i8> *%a
  %s1 = shufflevector <8 x i8> %l, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s2 = shufflevector <16 x i8> %s1, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i8> %s2
}

; i16

; In the load_v8i16_2_* functions a <2 x i16> is loaded through %a and placed
; into %b. The expected output loads the two halfwords separately (ldrh, then
; ld1 into .h[2]) and narrows with xtn before the final insert; it is not a
; single 32-bit lane load.

; Loaded pair -> result elements 0-1; final insert is a move into .s[0].
define <8 x i16> @load_v8i16_2_1(float %tmp, <8 x i16> %b, <2 x i16> *%a) {
; CHECK-LABEL: load_v8i16_2_1:
; CHECK: // %bb.0:
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: add x8, x0, #2
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: fmov s2, w9
; CHECK-NEXT: ld1 { v2.h }[2], [x8]
; CHECK-NEXT: xtn v1.4h, v2.4s
; CHECK-NEXT: mov v0.s[0], v1.s[0]
; CHECK-NEXT: ret
  %l = load <2 x i16>, <2 x i16> *%a
  %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s2 = shufflevector <8 x i16> %s1, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %s2
}

; Loaded pair -> result elements 1-2 (offset not a multiple of 2); the final
; insert is expected as a tbl with a constant-pool index vector.
define <8 x i16> @load_v8i16_2_15(float %tmp, <8 x i16> %b, <2 x i16> *%a) {
; CHECK-LABEL: load_v8i16_2_15:
; CHECK: // %bb.0:
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: add x8, x0, #2
; CHECK-NEXT: // kill: def $q1 killed $q1 def $q0_q1
; CHECK-NEXT: fmov s2, w9
; CHECK-NEXT: ld1 { v2.h }[2], [x8]
; CHECK-NEXT: adrp x8, .LCPI33_0
; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI33_0]
; CHECK-NEXT: xtn v0.4h, v2.4s
; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v3.16b
; CHECK-NEXT: ret
  %l = load <2 x i16>, <2 x i16> *%a
  %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s2 = shufflevector <8 x i16> %s1, <8 x i16> %b, <8 x i32> <i32 8, i32 0, i32 1, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %s2
}

; Loaded pair -> result elements 2-3; final insert is a move into .s[1].
define <8 x i16> @load_v8i16_2_2(float %tmp, <8 x i16> %b, <2 x i16> *%a) {
; CHECK-LABEL: load_v8i16_2_2:
; CHECK: // %bb.0:
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: add x8, x0, #2
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: fmov s2, w9
; CHECK-NEXT: ld1 { v2.h }[2], [x8]
; CHECK-NEXT: xtn v1.4h, v2.4s
; CHECK-NEXT: mov v0.s[1], v1.s[0]
; CHECK-NEXT: ret
  %l = load <2 x i16>, <2 x i16> *%a
  %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s2 = shufflevector <8 x i16> %s1, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 0, i32 1, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %s2
}

; Loaded pair -> result elements 4-5; final insert is a move into .s[2].
define <8 x i16> @load_v8i16_2_3(float %tmp, <8 x i16> %b, <2 x i16> *%a) {
; CHECK-LABEL: load_v8i16_2_3:
; CHECK: // %bb.0:
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: add x8, x0, #2
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: fmov s2, w9
; CHECK-NEXT: ld1 { v2.h }[2], [x8]
; CHECK-NEXT: xtn v1.4h, v2.4s
; CHECK-NEXT: mov v0.s[2], v1.s[0]
; CHECK-NEXT: ret
  %l = load <2 x i16>, <2 x i16> *%a
  %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s2 = shufflevector <8 x i16> %s1, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 14, i32 15>
  ret <8 x i16> %s2
}

; Loaded pair -> result elements 6-7; final insert is a move into .s[3].
define <8 x i16> @load_v8i16_2_4(float %tmp, <8 x i16> %b, <2 x i16> *%a) {
; CHECK-LABEL: load_v8i16_2_4:
; CHECK: // %bb.0:
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: add x8, x0, #2
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: fmov s2, w9
; CHECK-NEXT: ld1 { v2.h }[2], [x8]
; CHECK-NEXT: xtn v1.4h, v2.4s
; CHECK-NEXT: mov v0.s[3], v1.s[0]
; CHECK-NEXT: ret
  %l = load <2 x i16>, <2 x i16> *%a
  %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s2 = shufflevector <8 x i16> %s1, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 0, i32 1>
  ret <8 x i16> %s2
}

; 64-bit result: loaded pair in elements 0-1, %b[2..3] in elements 2-3. The
; expected output loads the halfwords into .h[0] and .h[2], compacts them with
; uzp1 and then moves %b's .s[1] in.
define <4 x i16> @load_v4i16_2_1(float %tmp, <4 x i16> %b, <2 x i16> *%a) {
; CHECK-LABEL: load_v4i16_2_1:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1 { v0.h }[0], [x0]
; CHECK-NEXT: add x8, x0, #2
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: ld1 { v0.h }[2], [x8]
; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-NEXT: mov v0.s[1], v1.s[1]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
  %l = load <2 x i16>, <2 x i16> *%a
  %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %s2 = shufflevector <4 x i16> %s1, <4 x i16> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x i16> %s2
}

; 64-bit result: %b[0..1] in elements 0-1, loaded pair in elements 2-3. Same
; halfword loads and uzp1 as above, then a move into .s[1] of a copy of %b.
define <4 x i16> @load_v4i16_2_2(float %tmp, <4 x i16> %b, <2 x i16> *%a) {
; CHECK-LABEL: load_v4i16_2_2:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1 { v0.h }[0], [x0]
; CHECK-NEXT: add x8, x0, #2
; CHECK-NEXT: ld1 { v0.h }[2], [x8]
; CHECK-NEXT: uzp1 v2.4h, v0.4h, v0.4h
; CHECK-NEXT: fmov d0, d1
; CHECK-NEXT: mov v0.s[1], v2.s[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
  %l = load <2 x i16>, <2 x i16> *%a
  %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %s2 = shufflevector <4 x i16> %s1, <4 x i16> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x i16> %s2
}

; Loaded <4 x i16> in the low half, %b[4..7] in the high half. The expected
; output rotates %b with ext so its high half can be moved from .d[0].
define <8 x i16> @load_v8i16_4_1(float %tmp, <8 x i16> %b, <4 x i16> *%a) {
; CHECK-LABEL: load_v8i16_4_1:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ret
  %l = load <4 x i16>, <4 x i16> *%a
  %s1 = shufflevector <4 x i16> %l, <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %s2 = shufflevector <8 x i16> %s1, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %s2
}

; %b[0..3] in the low half, loaded <4 x i16> in the high half; copy of %b,
; 64-bit scalar load and a .d lane move.
define <8 x i16> @load_v8i16_4_2(float %tmp, <8 x i16> %b, <4 x i16> *%a) {
; CHECK-LABEL: load_v8i16_4_2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: ldr d1, [x0]
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ret
  %l = load <4 x i16>, <4 x i16> *%a
  %s1 = shufflevector <4 x i16> %l, <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %s2 = shufflevector <8 x i16> %s1, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i16> %s2
}

; i32

; Loaded <2 x i32> in the low half, %b[2..3] in the high half; ext of %b,
; 64-bit scalar load and a .d lane move.
define <4 x i32> @load_v4i32_2_1(float %tmp, <4 x i32> %b, <2 x i32> *%a) {
; CHECK-LABEL: load_v4i32_2_1:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ret
  %l = load <2 x i32>, <2 x i32> *%a
  %s1 = shufflevector <2 x i32> %l, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %s2 = shufflevector <4 x i32> %s1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x i32> %s2
}

; %b[0..1] in the low half, loaded <2 x i32> in the high half; copy of %b,
; 64-bit scalar load and a .d lane move.
define <4 x i32> @load_v4i32_2_2(float %tmp, <4 x i32> %b, <2 x i32> *%a) {
; CHECK-LABEL: load_v4i32_2_2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: ldr d1, [x0]
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ret
  %l = load <2 x i32>, <2 x i32> *%a
  %s1 = shufflevector <2 x i32> %l, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %s2 = shufflevector <4 x i32> %s1, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x i32> %s2
}

; More than a single vector

; Concatenates the <4 x i8> loaded from %a and the one loaded from %b into an
; <8 x i8>; expected as a scalar load plus a single-lane load into .s[1].
define <8 x i8> @load2_v4i8(float %tmp, <4 x i8> *%a, <4 x i8> *%b) {
; CHECK-LABEL: load2_v4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ld1 { v0.s }[1], [x1]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
  %la = load <4 x i8>, <4 x i8> *%a
  %lb = load <4 x i8>, <4 x i8> *%b
  %s1 = shufflevector <4 x i8> %la, <4 x i8> %lb, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i8> %s1
}

; Three loads: a[0] and b[0] form the low 8 bytes; a[1] forms bytes 8-11 and
; bytes 12-15 come from the undef operand of %s2.
; NOTE(review): %d is computed but never used in this function - confirm that
; is intentional.
define <16 x i8> @load3_v4i8(float %tmp, <4 x i8> *%a, <4 x i8> *%b) {
; CHECK-LABEL: load3_v4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp s0, s1, [x0]
; CHECK-NEXT: ld1 { v0.s }[1], [x1]
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ret
  %la = load <4 x i8>, <4 x i8> *%a
  %lb = load <4 x i8>, <4 x i8> *%b
  %c = getelementptr <4 x i8>, <4 x i8> *%a, i64 1
  %d = getelementptr <4 x i8>, <4 x i8> *%b, i64 1
  %lc = load <4 x i8>, <4 x i8> *%c
  %s1 = shufflevector <4 x i8> %la, <4 x i8> %lb, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s2 = shufflevector <4 x i8> %lc, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s3 = shufflevector <8 x i8> %s1, <8 x i8> %s2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %s3
}

; Four loads: result = a[0], b[0], a[1], b[1] (4 bytes each). The expected
; output pairs the two %a loads with ldp and uses two single-lane loads from
; %b, the first with a post-increment of 4.
define <16 x i8> @load4_v4i8(float %tmp, <4 x i8> *%a, <4 x i8> *%b) {
; CHECK-LABEL: load4_v4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp s0, s1, [x0]
; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4
; CHECK-NEXT: ld1 { v1.s }[1], [x1]
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ret
  %la = load <4 x i8>, <4 x i8> *%a
  %lb = load <4 x i8>, <4 x i8> *%b
  %c = getelementptr <4 x i8>, <4 x i8> *%a, i64 1
  %d = getelementptr <4 x i8>, <4 x i8> *%b, i64 1
  %lc = load <4 x i8>, <4 x i8> *%c
  %ld = load <4 x i8>, <4 x i8> *%d
  %s1 = shufflevector <4 x i8> %la, <4 x i8> %lb, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s2 = shufflevector <4 x i8> %lc, <4 x i8> %ld, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s3 = shufflevector <8 x i8> %s1, <8 x i8> %s2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %s3
}

; The concatenation %s1 = (a[0], b[0]) is used for both halves of the result;
; the expected output builds it once and duplicates .d[0] into .d[1].
define <16 x i8> @load2multi1_v4i8(float %tmp, <4 x i8> *%a, <4 x i8> *%b) {
; CHECK-LABEL: load2multi1_v4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ld1 { v0.s }[1], [x1]
; CHECK-NEXT: mov v0.d[1], v0.d[0]
; CHECK-NEXT: ret
  %la = load <4 x i8>, <4 x i8> *%a
  %lb = load <4 x i8>, <4 x i8> *%b
  %s1 = shufflevector <4 x i8> %la, <4 x i8> %lb, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s3 = shufflevector <8 x i8> %s1, <8 x i8> %s1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %s3
}

; Each loaded vector is concatenated with itself (%la twice, %lb twice) before
; the final concatenation. The expected output contains no single-lane loads;
; it widens with ushll, duplicates .d[0] and narrows again with uzp1.
define <16 x i8> @load2multi2_v4i8(float %tmp, <4 x i8> *%a, <4 x i8> *%b) {
; CHECK-LABEL: load2multi2_v4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x1]
; CHECK-NEXT: ldr s1, [x0]
; CHECK-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-NEXT: mov v0.d[1], v0.d[0]
; CHECK-NEXT: mov v1.d[1], v1.d[0]
; CHECK-NEXT: uzp1 v0.16b, v1.16b, v0.16b
; CHECK-NEXT: ret
  %la = load <4 x i8>, <4 x i8> *%a
  %lb = load <4 x i8>, <4 x i8> *%b
  %s1 = shufflevector <4 x i8> %la, <4 x i8> %la, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s2 = shufflevector <4 x i8> %lb, <4 x i8> %lb, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s3 = shufflevector <8 x i8> %s1, <8 x i8> %s2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %s3
}

; Two <4 x i8> loads (at %i44+20 and %i44+28) are concatenated and stored at
; %i44+22, interleaved with scalar i8 stores to +20, +21 and +30 that touch
; the same bytes the vector loads read. In the IR each vector load precedes
; the overlapping scalar stores, and in the expected output every load is
; emitted before the first store.
define void @loads_before_stores(i8* %i44) {
; CHECK-LABEL: loads_before_stores:
; CHECK: // %bb.0: // %bb
; CHECK-NEXT: add x8, x0, #20
; CHECK-NEXT: ldr s0, [x0, #28]
; CHECK-NEXT: ldrh w9, [x0, #26]
; CHECK-NEXT: ldrh w10, [x0, #24]
; CHECK-NEXT: ld1 { v0.s }[1], [x8]
; CHECK-NEXT: strh w9, [x0, #20]
; CHECK-NEXT: strh w10, [x0, #30]
; CHECK-NEXT: stur d0, [x0, #22]
; CHECK-NEXT: ret
bb:
  %i45 = getelementptr inbounds i8, i8* %i44, i64 20
  %i46 = getelementptr inbounds i8, i8* %i44, i64 26
  %i48 = load i8, i8* %i46, align 1
  %i49 = getelementptr inbounds i8, i8* %i44, i64 21
  %i50 = getelementptr inbounds i8, i8* %i44, i64 27
  %i52 = load i8, i8* %i50, align 1
  %i53 = getelementptr inbounds i8, i8* %i44, i64 22
  %i54 = getelementptr inbounds i8, i8* %i44, i64 28
  %i61 = getelementptr inbounds i8, i8* %i44, i64 24
  %i62 = getelementptr inbounds i8, i8* %i44, i64 30
  %i63 = load i8, i8* %i61, align 1
  %i65 = getelementptr inbounds i8, i8* %i44, i64 25
  %i66 = getelementptr inbounds i8, i8* %i44, i64 31
  %i67 = load i8, i8* %i65, align 1
  %0 = bitcast i8* %i45 to <4 x i8>*
  %1 = load <4 x i8>, <4 x i8>* %0, align 1
  store i8 %i48, i8* %i45, align 1
  store i8 %i52, i8* %i49, align 1
  %2 = bitcast i8* %i54 to <4 x i8>*
  %3 = load <4 x i8>, <4 x i8>* %2, align 1
  store i8 %i63, i8* %i62, align 1
  %4 = shufflevector <4 x i8> %3, <4 x i8> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast i8* %i53 to <8 x i8>*
  store <8 x i8> %4, <8 x i8>* %5, align 1
  store i8 %i67, i8* %i66, align 1
  ret void
}
680