1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s 3; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s 4 5; FIXME: Broken on evergreen 6; FIXME: For some reason the 8 and 16 vectors are being stored as 7; individual elements instead of 128-bit stores. 8 9define amdgpu_kernel void @insertelement_v2f32_0(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind { 10; SI-LABEL: insertelement_v2f32_0: 11; SI: ; %bb.0: 12; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 13; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 14; SI-NEXT: s_mov_b32 s3, 0x100f000 15; SI-NEXT: s_mov_b32 s2, -1 16; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 17; SI-NEXT: s_waitcnt lgkmcnt(0) 18; SI-NEXT: v_mov_b32_e32 v1, s5 19; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 20; SI-NEXT: s_endpgm 21; 22; VI-LABEL: insertelement_v2f32_0: 23; VI: ; %bb.0: 24; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 25; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 26; VI-NEXT: s_mov_b32 s3, 0x1100f000 27; VI-NEXT: s_mov_b32 s2, -1 28; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 29; VI-NEXT: s_waitcnt lgkmcnt(0) 30; VI-NEXT: v_mov_b32_e32 v1, s5 31; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 32; VI-NEXT: s_endpgm 33 %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 0 34 store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 16 35 ret void 36} 37 38define amdgpu_kernel void @insertelement_v2f32_1(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind { 39; SI-LABEL: insertelement_v2f32_1: 40; SI: ; %bb.0: 41; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 42; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 43; SI-NEXT: s_mov_b32 s3, 0x100f000 44; SI-NEXT: s_mov_b32 s2, -1 45; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000 46; SI-NEXT: s_waitcnt lgkmcnt(0) 47; SI-NEXT: v_mov_b32_e32 v0, s4 48; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 49; SI-NEXT: s_endpgm 50; 51; VI-LABEL: insertelement_v2f32_1: 52; VI: ; %bb.0: 53; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 54; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 55; VI-NEXT: s_mov_b32 s3, 0x1100f000 56; VI-NEXT: s_mov_b32 s2, -1 57; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000 58; VI-NEXT: s_waitcnt lgkmcnt(0) 59; VI-NEXT: v_mov_b32_e32 v0, s4 60; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 61; VI-NEXT: s_endpgm 62 %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 1 63 store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 16 64 ret void 65} 66 67define amdgpu_kernel void @insertelement_v2i32_0(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { 68; SI-LABEL: insertelement_v2i32_0: 69; SI: ; %bb.0: 70; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 71; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 72; SI-NEXT: s_mov_b32 s3, 0x100f000 73; SI-NEXT: s_mov_b32 s2, -1 74; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 75; SI-NEXT: s_waitcnt lgkmcnt(0) 76; SI-NEXT: v_mov_b32_e32 v1, s5 77; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 78; SI-NEXT: s_endpgm 79; 80; VI-LABEL: insertelement_v2i32_0: 81; VI: ; %bb.0: 82; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 83; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 84; VI-NEXT: s_mov_b32 s3, 0x1100f000 85; VI-NEXT: s_mov_b32 s2, -1 86; VI-NEXT: v_mov_b32_e32 v0, 0x3e7 87; VI-NEXT: s_waitcnt lgkmcnt(0) 88; VI-NEXT: v_mov_b32_e32 v1, s5 89; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 90; VI-NEXT: s_endpgm 91 %vecins = insertelement <2 x i32> %a, i32 999, i32 0 92 store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 16 93 ret void 94} 95 96define amdgpu_kernel void @insertelement_v2i32_1(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { 97; SI-LABEL: insertelement_v2i32_1: 98; SI: ; %bb.0: 99; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 100; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 101; SI-NEXT: s_mov_b32 s3, 0x100f000 102; SI-NEXT: s_mov_b32 s2, -1 103; SI-NEXT: v_mov_b32_e32 v1, 0x3e7 104; SI-NEXT: s_waitcnt lgkmcnt(0) 105; SI-NEXT: v_mov_b32_e32 v0, s4 106; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 107; SI-NEXT: s_endpgm 108; 109; VI-LABEL: insertelement_v2i32_1: 110; VI: ; %bb.0: 111; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 112; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 113; VI-NEXT: s_mov_b32 s3, 0x1100f000 114; VI-NEXT: s_mov_b32 s2, -1 115; VI-NEXT: v_mov_b32_e32 v1, 0x3e7 116; VI-NEXT: s_waitcnt lgkmcnt(0) 117; VI-NEXT: v_mov_b32_e32 v0, s4 118; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 119; VI-NEXT: s_endpgm 120 %vecins = insertelement <2 x i32> %a, i32 999, i32 1 121 store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 16 122 ret void 123} 124 125; FIXME: Why is the constant moved into the intermediate register and 126; not just directly into the vector component? 127define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { 128; SI-LABEL: insertelement_v4f32_0: 129; SI: ; %bb.0: 130; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 131; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 132; SI-NEXT: s_waitcnt lgkmcnt(0) 133; SI-NEXT: s_mov_b32 s4, 0x40a00000 134; SI-NEXT: s_mov_b32 s3, 0x100f000 135; SI-NEXT: s_mov_b32 s2, -1 136; SI-NEXT: v_mov_b32_e32 v0, s4 137; SI-NEXT: v_mov_b32_e32 v1, s5 138; SI-NEXT: v_mov_b32_e32 v2, s6 139; SI-NEXT: v_mov_b32_e32 v3, s7 140; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 141; SI-NEXT: s_endpgm 142; 143; VI-LABEL: insertelement_v4f32_0: 144; VI: ; %bb.0: 145; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 146; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 147; VI-NEXT: s_waitcnt lgkmcnt(0) 148; VI-NEXT: s_mov_b32 s4, 0x40a00000 149; VI-NEXT: s_mov_b32 s3, 0x1100f000 150; VI-NEXT: s_mov_b32 s2, -1 151; VI-NEXT: v_mov_b32_e32 v0, s4 152; VI-NEXT: v_mov_b32_e32 v1, s5 153; VI-NEXT: v_mov_b32_e32 v2, s6 154; VI-NEXT: v_mov_b32_e32 v3, s7 155; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 156; VI-NEXT: s_endpgm 157 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0 158 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 159 ret void 160} 161 162define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { 163; SI-LABEL: insertelement_v4f32_1: 164; SI: ; %bb.0: 165; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 166; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 167; SI-NEXT: s_waitcnt lgkmcnt(0) 168; SI-NEXT: s_mov_b32 s5, 0x40a00000 169; SI-NEXT: s_mov_b32 s3, 0x100f000 170; SI-NEXT: s_mov_b32 s2, -1 171; SI-NEXT: v_mov_b32_e32 v0, s4 172; SI-NEXT: v_mov_b32_e32 v1, s5 173; SI-NEXT: v_mov_b32_e32 v2, s6 174; SI-NEXT: v_mov_b32_e32 v3, s7 175; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 176; SI-NEXT: s_endpgm 177; 178; VI-LABEL: insertelement_v4f32_1: 179; VI: ; %bb.0: 180; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 181; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 182; VI-NEXT: s_waitcnt lgkmcnt(0) 183; VI-NEXT: s_mov_b32 s5, 0x40a00000 184; VI-NEXT: s_mov_b32 s3, 0x1100f000 185; VI-NEXT: s_mov_b32 s2, -1 186; VI-NEXT: v_mov_b32_e32 v0, s4 187; VI-NEXT: v_mov_b32_e32 v1, s5 188; VI-NEXT: v_mov_b32_e32 v2, s6 189; VI-NEXT: v_mov_b32_e32 v3, s7 190; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 191; VI-NEXT: s_endpgm 192 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1 193 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 194 ret void 195} 196 197define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { 198; SI-LABEL: insertelement_v4f32_2: 199; SI: ; %bb.0: 200; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 201; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 202; SI-NEXT: s_waitcnt lgkmcnt(0) 203; SI-NEXT: s_mov_b32 s6, 0x40a00000 204; SI-NEXT: s_mov_b32 s3, 0x100f000 205; SI-NEXT: s_mov_b32 s2, -1 206; SI-NEXT: v_mov_b32_e32 v0, s4 207; SI-NEXT: v_mov_b32_e32 v1, s5 208; SI-NEXT: v_mov_b32_e32 v2, s6 209; SI-NEXT: v_mov_b32_e32 v3, s7 210; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 211; SI-NEXT: s_endpgm 212; 213; VI-LABEL: insertelement_v4f32_2: 214; VI: ; %bb.0: 215; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 216; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 217; VI-NEXT: s_waitcnt lgkmcnt(0) 218; VI-NEXT: s_mov_b32 s6, 0x40a00000 219; VI-NEXT: s_mov_b32 s3, 0x1100f000 220; VI-NEXT: s_mov_b32 s2, -1 221; VI-NEXT: v_mov_b32_e32 v0, s4 222; VI-NEXT: v_mov_b32_e32 v1, s5 223; VI-NEXT: v_mov_b32_e32 v2, s6 224; VI-NEXT: v_mov_b32_e32 v3, s7 225; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 226; VI-NEXT: s_endpgm 227 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2 228 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 229 ret void 230} 231 232define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { 233; SI-LABEL: insertelement_v4f32_3: 234; SI: ; %bb.0: 235; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 236; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 237; SI-NEXT: s_waitcnt lgkmcnt(0) 238; SI-NEXT: s_mov_b32 s7, 0x40a00000 239; SI-NEXT: s_mov_b32 s3, 0x100f000 240; SI-NEXT: s_mov_b32 s2, -1 241; SI-NEXT: v_mov_b32_e32 v0, s4 242; SI-NEXT: v_mov_b32_e32 v1, s5 243; SI-NEXT: v_mov_b32_e32 v2, s6 244; SI-NEXT: v_mov_b32_e32 v3, s7 245; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 246; SI-NEXT: s_endpgm 247; 248; VI-LABEL: insertelement_v4f32_3: 249; VI: ; %bb.0: 250; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 251; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 252; VI-NEXT: s_waitcnt lgkmcnt(0) 253; VI-NEXT: s_mov_b32 s7, 0x40a00000 254; VI-NEXT: s_mov_b32 s3, 0x1100f000 255; VI-NEXT: s_mov_b32 s2, -1 256; VI-NEXT: v_mov_b32_e32 v0, s4 257; VI-NEXT: v_mov_b32_e32 v1, s5 258; VI-NEXT: v_mov_b32_e32 v2, s6 259; VI-NEXT: v_mov_b32_e32 v3, s7 260; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 261; VI-NEXT: s_endpgm 262 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3 263 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 264 ret void 265} 266 267define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind { 268; SI-LABEL: insertelement_v4i32_0: 269; SI: ; %bb.0: 270; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 271; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 272; SI-NEXT: s_waitcnt lgkmcnt(0) 273; SI-NEXT: s_movk_i32 s4, 0x3e7 274; SI-NEXT: s_mov_b32 s3, 0x100f000 275; SI-NEXT: s_mov_b32 s2, -1 276; SI-NEXT: v_mov_b32_e32 v0, s4 277; SI-NEXT: v_mov_b32_e32 v1, s5 278; SI-NEXT: v_mov_b32_e32 v2, s6 279; SI-NEXT: v_mov_b32_e32 v3, s7 280; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 281; SI-NEXT: s_endpgm 282; 283; VI-LABEL: insertelement_v4i32_0: 284; VI: ; %bb.0: 285; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 286; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 287; VI-NEXT: s_waitcnt lgkmcnt(0) 288; VI-NEXT: s_movk_i32 s4, 0x3e7 289; VI-NEXT: s_mov_b32 s3, 0x1100f000 290; VI-NEXT: s_mov_b32 s2, -1 291; VI-NEXT: v_mov_b32_e32 v0, s4 292; VI-NEXT: v_mov_b32_e32 v1, s5 293; VI-NEXT: v_mov_b32_e32 v2, s6 294; VI-NEXT: v_mov_b32_e32 v3, s7 295; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 296; VI-NEXT: s_endpgm 297 %vecins = insertelement <4 x i32> %a, i32 999, i32 0 298 store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 299 ret void 300} 301 302define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { 303; SI-LABEL: insertelement_v3f32_1: 304; SI: ; %bb.0: 305; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 306; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 307; SI-NEXT: s_mov_b32 s3, 0x100f000 308; SI-NEXT: s_mov_b32 s2, -1 309; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000 310; SI-NEXT: s_waitcnt lgkmcnt(0) 311; SI-NEXT: v_mov_b32_e32 v0, s4 312; SI-NEXT: v_mov_b32_e32 v2, s6 313; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 314; SI-NEXT: s_endpgm 315; 316; VI-LABEL: insertelement_v3f32_1: 317; VI: ; %bb.0: 318; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 319; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 320; VI-NEXT: s_mov_b32 s3, 0x1100f000 321; VI-NEXT: s_mov_b32 s2, -1 322; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000 323; VI-NEXT: s_waitcnt lgkmcnt(0) 324; VI-NEXT: v_mov_b32_e32 v0, s4 325; VI-NEXT: v_mov_b32_e32 v2, s6 326; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 327; VI-NEXT: s_endpgm 328 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1 329 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 330 ret void 331} 332 333define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { 334; SI-LABEL: insertelement_v3f32_2: 335; SI: ; %bb.0: 336; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 337; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 338; SI-NEXT: s_mov_b32 s3, 0x100f000 339; SI-NEXT: s_mov_b32 s2, -1 340; SI-NEXT: v_mov_b32_e32 v2, 0x40a00000 341; SI-NEXT: s_waitcnt lgkmcnt(0) 342; SI-NEXT: v_mov_b32_e32 v0, s4 343; SI-NEXT: v_mov_b32_e32 v1, s5 344; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 345; SI-NEXT: s_endpgm 346; 347; VI-LABEL: insertelement_v3f32_2: 348; VI: ; %bb.0: 349; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 350; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 351; VI-NEXT: s_mov_b32 s3, 0x1100f000 352; VI-NEXT: s_mov_b32 s2, -1 353; VI-NEXT: v_mov_b32_e32 v2, 0x40a00000 354; VI-NEXT: s_waitcnt lgkmcnt(0) 355; VI-NEXT: v_mov_b32_e32 v0, s4 356; VI-NEXT: v_mov_b32_e32 v1, s5 357; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 358; VI-NEXT: s_endpgm 359 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2 360 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 361 ret void 362} 363 364define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { 365; GCN-LABEL: insertelement_v3f32_3: 366; GCN: ; %bb.0: 367; GCN-NEXT: s_endpgm 368 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3 369 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 370 ret void 371} 372 373define <4 x float> @insertelement_to_sgpr() nounwind { 374; GCN-LABEL: insertelement_to_sgpr: 375; GCN: ; %bb.0: 376; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 377; GCN-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0 378; GCN-NEXT: s_waitcnt lgkmcnt(0) 379; GCN-NEXT: s_mov_b32 s12, 0 380; GCN-NEXT: s_mov_b32 s4, s12 381; GCN-NEXT: s_mov_b32 s5, s12 382; GCN-NEXT: s_mov_b32 s6, s12 383; GCN-NEXT: s_mov_b32 s7, s12 384; GCN-NEXT: s_mov_b32 s8, s12 385; GCN-NEXT: s_mov_b32 s9, s12 386; GCN-NEXT: s_mov_b32 s10, s12 387; GCN-NEXT: s_mov_b32 s11, s12 388; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[12:15] dmask:0x1 389; GCN-NEXT: s_waitcnt vmcnt(0) 390; GCN-NEXT: s_setpc_b64 s[30:31] 391 %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef 392 %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0 393 %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0) 394 ret <4 x float> %tmp2 395} 396 397define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind { 398; SI-LABEL: dynamic_insertelement_v2f32: 399; SI: ; %bb.0: 400; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 401; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 402; SI-NEXT: s_load_dword s4, s[4:5], 0x4 403; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 404; SI-NEXT: s_mov_b32 s3, 0x100f000 405; SI-NEXT: s_mov_b32 s2, -1 406; SI-NEXT: s_waitcnt lgkmcnt(0) 407; SI-NEXT: v_mov_b32_e32 v1, s7 408; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 409; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 410; SI-NEXT: v_mov_b32_e32 v2, s6 411; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 412; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 413; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 414; SI-NEXT: s_endpgm 415; 416; VI-LABEL: dynamic_insertelement_v2f32: 417; VI: ; %bb.0: 418; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 419; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 420; VI-NEXT: s_load_dword s4, s[4:5], 0x10 421; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 422; VI-NEXT: s_mov_b32 s3, 0x1100f000 423; VI-NEXT: s_mov_b32 s2, -1 424; VI-NEXT: s_waitcnt lgkmcnt(0) 425; VI-NEXT: v_mov_b32_e32 v1, s7 426; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 427; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 428; VI-NEXT: v_mov_b32_e32 v2, s6 429; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 430; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 431; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 432; VI-NEXT: s_endpgm 433 %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b 434 store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8 435 ret void 436} 437 438define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind { 439; SI-LABEL: dynamic_insertelement_v3f32: 440; SI: ; %bb.0: 441; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 442; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 443; SI-NEXT: s_load_dword s4, s[4:5], 0x8 444; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 445; SI-NEXT: s_mov_b32 s3, 0x100f000 446; SI-NEXT: s_mov_b32 s2, -1 447; SI-NEXT: s_waitcnt lgkmcnt(0) 448; SI-NEXT: v_mov_b32_e32 v1, s10 449; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 450; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc 451; SI-NEXT: v_mov_b32_e32 v1, s9 452; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 453; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 454; SI-NEXT: v_mov_b32_e32 v3, s8 455; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 456; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 457; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 458; SI-NEXT: s_endpgm 459; 460; VI-LABEL: dynamic_insertelement_v3f32: 461; VI: ; %bb.0: 462; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 463; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 464; VI-NEXT: s_load_dword s4, s[4:5], 0x20 465; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 466; VI-NEXT: s_mov_b32 s3, 0x1100f000 467; VI-NEXT: s_mov_b32 s2, -1 468; VI-NEXT: s_waitcnt lgkmcnt(0) 469; VI-NEXT: v_mov_b32_e32 v1, s10 470; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 471; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc 472; VI-NEXT: v_mov_b32_e32 v1, s9 473; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 474; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 475; VI-NEXT: v_mov_b32_e32 v3, s8 476; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 477; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 478; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 479; VI-NEXT: s_endpgm 480 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b 481 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 482 ret void 483} 484 485define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind { 486; SI-LABEL: dynamic_insertelement_v4f32: 487; SI: ; %bb.0: 488; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 489; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 490; SI-NEXT: s_load_dword s4, s[4:5], 0x8 491; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 492; SI-NEXT: s_mov_b32 s3, 0x100f000 493; SI-NEXT: s_mov_b32 s2, -1 494; SI-NEXT: s_waitcnt lgkmcnt(0) 495; SI-NEXT: v_mov_b32_e32 v1, s11 496; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 497; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc 498; SI-NEXT: v_mov_b32_e32 v1, s10 499; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 500; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc 501; SI-NEXT: v_mov_b32_e32 v1, s9 502; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 503; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 504; SI-NEXT: v_mov_b32_e32 v4, s8 505; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 506; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 507; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 508; SI-NEXT: s_endpgm 509; 510; VI-LABEL: dynamic_insertelement_v4f32: 511; VI: ; %bb.0: 512; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 513; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 514; VI-NEXT: s_load_dword s4, s[4:5], 0x20 515; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 516; VI-NEXT: s_mov_b32 s3, 0x1100f000 517; VI-NEXT: s_mov_b32 s2, -1 518; VI-NEXT: s_waitcnt lgkmcnt(0) 519; VI-NEXT: v_mov_b32_e32 v1, s11 520; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 521; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc 522; VI-NEXT: v_mov_b32_e32 v1, s10 523; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 524; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc 525; VI-NEXT: v_mov_b32_e32 v1, s9 526; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 527; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 528; VI-NEXT: v_mov_b32_e32 v4, s8 529; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 530; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 531; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 532; VI-NEXT: s_endpgm 533 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b 534 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 535 ret void 536} 537 538define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind { 539; SI-LABEL: dynamic_insertelement_v8f32: 540; SI: ; %bb.0: 541; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 542; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 543; SI-NEXT: s_load_dword s4, s[4:5], 0x10 544; SI-NEXT: v_mov_b32_e32 v4, 0x40a00000 545; SI-NEXT: s_mov_b32 s3, 0x100f000 546; SI-NEXT: s_mov_b32 s2, -1 547; SI-NEXT: s_waitcnt lgkmcnt(0) 548; SI-NEXT: v_mov_b32_e32 v0, s11 549; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 550; SI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc 551; SI-NEXT: v_mov_b32_e32 v0, s10 552; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 553; SI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc 554; SI-NEXT: v_mov_b32_e32 v0, s9 555; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 556; SI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc 557; SI-NEXT: v_mov_b32_e32 v0, s8 558; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 559; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 560; SI-NEXT: v_mov_b32_e32 v5, s15 561; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7 562; SI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc 563; SI-NEXT: v_mov_b32_e32 v5, s14 564; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6 565; SI-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc 566; SI-NEXT: v_mov_b32_e32 v5, s13 567; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5 568; SI-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc 569; SI-NEXT: v_mov_b32_e32 v8, s12 570; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4 571; SI-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc 572; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 573; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 574; SI-NEXT: s_endpgm 575; 576; VI-LABEL: dynamic_insertelement_v8f32: 577; VI: ; %bb.0: 578; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 579; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 580; VI-NEXT: s_load_dword s4, s[4:5], 0x40 581; VI-NEXT: v_mov_b32_e32 v4, 0x40a00000 582; VI-NEXT: s_mov_b32 s3, 0x1100f000 583; VI-NEXT: s_mov_b32 s2, -1 584; VI-NEXT: s_waitcnt lgkmcnt(0) 585; VI-NEXT: v_mov_b32_e32 v0, s11 586; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 587; VI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc 588; VI-NEXT: v_mov_b32_e32 v0, s10 589; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 590; VI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc 591; VI-NEXT: v_mov_b32_e32 v0, s9 592; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 593; VI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc 594; VI-NEXT: v_mov_b32_e32 v0, s8 595; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 596; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 597; VI-NEXT: v_mov_b32_e32 v5, s15 598; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7 599; VI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc 600; VI-NEXT: v_mov_b32_e32 v5, s14 601; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6 602; VI-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc 603; VI-NEXT: v_mov_b32_e32 v5, s13 604; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5 605; VI-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc 606; VI-NEXT: v_mov_b32_e32 v8, s12 607; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4 608; VI-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc 609; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 610; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 611; VI-NEXT: s_endpgm 612 %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b 613 store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32 614 ret void 615} 616 617define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind { 618; SI-LABEL: dynamic_insertelement_v16f32: 619; SI: ; %bb.0: 620; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 621; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 622; SI-NEXT: s_load_dword s4, s[4:5], 0x20 623; SI-NEXT: v_mov_b32_e32 v16, 0x40a00000 624; SI-NEXT: s_mov_b32 s3, 0x100f000 625; SI-NEXT: s_mov_b32 s2, -1 626; SI-NEXT: s_waitcnt lgkmcnt(0) 627; SI-NEXT: v_mov_b32_e32 v0, s8 628; SI-NEXT: v_mov_b32_e32 v1, s9 629; SI-NEXT: v_mov_b32_e32 v2, s10 630; SI-NEXT: v_mov_b32_e32 v3, s11 631; SI-NEXT: v_mov_b32_e32 v4, s12 632; SI-NEXT: v_mov_b32_e32 v5, s13 633; SI-NEXT: v_mov_b32_e32 v6, s14 634; SI-NEXT: v_mov_b32_e32 v7, s15 635; SI-NEXT: v_mov_b32_e32 v8, s16 636; SI-NEXT: v_mov_b32_e32 v9, s17 637; SI-NEXT: v_mov_b32_e32 v10, s18 638; SI-NEXT: v_mov_b32_e32 v11, s19 639; SI-NEXT: v_mov_b32_e32 v12, s20 640; SI-NEXT: v_mov_b32_e32 v13, s21 641; SI-NEXT: v_mov_b32_e32 v14, s22 642; SI-NEXT: v_mov_b32_e32 v15, s23 643; SI-NEXT: s_mov_b32 m0, s4 644; SI-NEXT: v_movreld_b32_e32 v0, v16 645; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 646; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 647; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 648; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 649; SI-NEXT: s_endpgm 650; 651; VI-LABEL: dynamic_insertelement_v16f32: 652; VI: ; %bb.0: 653; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 654; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 655; VI-NEXT: s_load_dword s4, s[4:5], 0x80 656; VI-NEXT: v_mov_b32_e32 v16, 0x40a00000 657; VI-NEXT: s_mov_b32 s3, 0x1100f000 658; VI-NEXT: s_mov_b32 s2, -1 659; VI-NEXT: s_waitcnt lgkmcnt(0) 660; VI-NEXT: v_mov_b32_e32 v0, s8 661; VI-NEXT: v_mov_b32_e32 v1, s9 662; VI-NEXT: v_mov_b32_e32 v2, s10 663; VI-NEXT: v_mov_b32_e32 v3, s11 664; VI-NEXT: v_mov_b32_e32 v4, s12 665; VI-NEXT: v_mov_b32_e32 v5, s13 666; VI-NEXT: v_mov_b32_e32 v6, s14 667; VI-NEXT: v_mov_b32_e32 v7, s15 668; VI-NEXT: v_mov_b32_e32 v8, s16 669; VI-NEXT: v_mov_b32_e32 v9, s17 670; VI-NEXT: v_mov_b32_e32 v10, s18 671; VI-NEXT: v_mov_b32_e32 v11, s19 672; VI-NEXT: v_mov_b32_e32 v12, s20 673; VI-NEXT: v_mov_b32_e32 v13, s21 674; VI-NEXT: v_mov_b32_e32 v14, s22 675; VI-NEXT: v_mov_b32_e32 v15, s23 676; VI-NEXT: s_mov_b32 m0, s4 677; VI-NEXT: v_movreld_b32_e32 v0, v16 678; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 679; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 680; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 681; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 682; VI-NEXT: s_endpgm 683 %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b 684 store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64 685 ret void 686} 687 688define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind { 689; SI-LABEL: dynamic_insertelement_v2i32: 690; SI: ; %bb.0: 691; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 692; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 693; SI-NEXT: s_load_dword s4, s[4:5], 0x4 694; SI-NEXT: s_mov_b32 s3, 0x100f000 695; SI-NEXT: s_mov_b32 s2, -1 696; SI-NEXT: s_waitcnt lgkmcnt(0) 697; SI-NEXT: v_mov_b32_e32 v0, s7 698; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 699; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc 700; SI-NEXT: v_mov_b32_e32 v0, s6 701; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 702; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 703; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 704; SI-NEXT: s_endpgm 705; 706; VI-LABEL: dynamic_insertelement_v2i32: 707; VI: ; %bb.0: 708; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 709; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 710; VI-NEXT: s_load_dword s4, s[4:5], 0x10 711; VI-NEXT: s_mov_b32 s3, 0x1100f000 712; VI-NEXT: s_mov_b32 s2, -1 713; VI-NEXT: s_waitcnt lgkmcnt(0) 714; VI-NEXT: s_cmp_lg_u32 s4, 1 715; VI-NEXT: s_cselect_b32 s5, s7, 5 716; VI-NEXT: s_cmp_lg_u32 s4, 0 717; VI-NEXT: s_cselect_b32 s4, s6, 5 718; VI-NEXT: v_mov_b32_e32 v0, s4 719; VI-NEXT: v_mov_b32_e32 v1, s5 720; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 721; VI-NEXT: s_endpgm 722 %vecins = insertelement <2 x i32> %a, i32 5, i32 %b 723 store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8 724 ret void 725} 726 727define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind { 728; SI-LABEL: dynamic_insertelement_v3i32: 729; SI: ; %bb.0: 730; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 731; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 732; SI-NEXT: s_load_dword s4, s[4:5], 0x8 733; SI-NEXT: s_mov_b32 s3, 0x100f000 734; SI-NEXT: s_mov_b32 s2, -1 735; SI-NEXT: s_waitcnt lgkmcnt(0) 736; SI-NEXT: v_mov_b32_e32 v0, s10 737; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 738; SI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc 739; SI-NEXT: v_mov_b32_e32 v0, s9 740; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 741; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc 742; SI-NEXT: v_mov_b32_e32 v0, s8 743; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 744; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 745; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 746; SI-NEXT: s_endpgm 747; 748; VI-LABEL: dynamic_insertelement_v3i32: 749; VI: ; %bb.0: 750; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 751; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 752; VI-NEXT: s_load_dword s4, s[4:5], 0x20 753; VI-NEXT: s_mov_b32 s3, 0x1100f000 754; VI-NEXT: s_mov_b32 s2, -1 755; VI-NEXT: s_waitcnt lgkmcnt(0) 756; VI-NEXT: s_cmp_lg_u32 s4, 2 757; VI-NEXT: s_cselect_b32 s5, s10, 5 758; VI-NEXT: s_cmp_lg_u32 s4, 1 759; VI-NEXT: s_cselect_b32 s6, s9, 5 760; VI-NEXT: s_cmp_lg_u32 s4, 0 761; VI-NEXT: s_cselect_b32 s4, s8, 5 762; VI-NEXT: v_mov_b32_e32 v0, s4 763; VI-NEXT: v_mov_b32_e32 v1, s6 764; VI-NEXT: v_mov_b32_e32 v2, s5 765; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 766; VI-NEXT: s_endpgm 767 %vecins = insertelement <3 x i32> %a, i32 5, i32 %b 768 store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16 769 ret void 770} 771 772define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind { 773; SI-LABEL: dynamic_insertelement_v4i32: 774; SI: ; %bb.0: 775; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 776; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 777; SI-NEXT: s_load_dword s6, s[4:5], 0x8 778; SI-NEXT: s_load_dword s4, s[4:5], 0x11 779; SI-NEXT: s_mov_b32 s3, 0x100f000 780; SI-NEXT: s_mov_b32 s2, -1 781; SI-NEXT: s_waitcnt lgkmcnt(0) 782; SI-NEXT: v_mov_b32_e32 v0, s11 783; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 3 784; SI-NEXT: v_mov_b32_e32 v4, s4 785; SI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc 786; SI-NEXT: v_mov_b32_e32 v0, s10 787; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 2 788; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc 789; SI-NEXT: v_mov_b32_e32 v0, s9 790; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 1 791; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc 792; SI-NEXT: v_mov_b32_e32 v0, s8 793; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 0 794; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 795; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 796; SI-NEXT: s_endpgm 797; 798; VI-LABEL: dynamic_insertelement_v4i32: 799; VI: ; %bb.0: 800; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 801; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 802; VI-NEXT: s_load_dword s6, s[4:5], 0x20 803; VI-NEXT: s_load_dword s4, s[4:5], 0x44 804; VI-NEXT: s_mov_b32 s3, 0x1100f000 805; VI-NEXT: s_mov_b32 s2, -1 806; VI-NEXT: s_waitcnt lgkmcnt(0) 807; VI-NEXT: s_cmp_eq_u32 s6, 3 808; VI-NEXT: s_cselect_b32 s5, s4, s11 809; VI-NEXT: s_cmp_eq_u32 s6, 2 810; VI-NEXT: s_cselect_b32 s7, s4, s10 811; VI-NEXT: s_cmp_eq_u32 s6, 1 812; VI-NEXT: s_cselect_b32 s9, s4, s9 813; VI-NEXT: s_cmp_eq_u32 s6, 0 814; VI-NEXT: s_cselect_b32 s4, s4, s8 815; VI-NEXT: v_mov_b32_e32 v0, s4 816; VI-NEXT: v_mov_b32_e32 v1, s9 817; VI-NEXT: v_mov_b32_e32 v2, s7 818; VI-NEXT: v_mov_b32_e32 v3, s5 819; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 820; VI-NEXT: s_endpgm 821 %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b 822 store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 823 ret void 824} 825 826define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind { 827; SI-LABEL: dynamic_insertelement_v8i32: 828; SI: ; %bb.0: 829; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 830; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 831; SI-NEXT: s_load_dword s4, s[4:5], 0x10 832; SI-NEXT: s_mov_b32 s3, 0x100f000 833; SI-NEXT: s_mov_b32 s2, -1 834; SI-NEXT: s_waitcnt lgkmcnt(0) 835; SI-NEXT: v_mov_b32_e32 v0, s11 836; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 837; SI-NEXT: v_cndmask_b32_e32 v3, 5, v0, vcc 838; SI-NEXT: v_mov_b32_e32 v0, s10 839; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 840; SI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc 841; SI-NEXT: v_mov_b32_e32 v0, s9 842; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 843; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc 844; SI-NEXT: v_mov_b32_e32 v0, s8 845; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 846; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 847; SI-NEXT: v_mov_b32_e32 v4, s15 848; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7 849; SI-NEXT: v_cndmask_b32_e32 v7, 5, v4, vcc 850; SI-NEXT: v_mov_b32_e32 v4, s14 851; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6 852; SI-NEXT: v_cndmask_b32_e32 v6, 5, v4, vcc 853; SI-NEXT: v_mov_b32_e32 v4, s13 854; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5 855; SI-NEXT: v_cndmask_b32_e32 v5, 5, v4, vcc 856; SI-NEXT: v_mov_b32_e32 v4, s12 857; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4 858; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 859; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 860; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 861; SI-NEXT: s_endpgm 862; 863; VI-LABEL: dynamic_insertelement_v8i32: 864; VI: ; %bb.0: 865; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 866; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 867; VI-NEXT: s_load_dword s4, s[4:5], 0x40 868; VI-NEXT: s_mov_b32 s3, 0x1100f000 869; VI-NEXT: s_mov_b32 s2, -1 870; VI-NEXT: s_waitcnt lgkmcnt(0) 871; VI-NEXT: s_cmp_lg_u32 s4, 3 872; VI-NEXT: s_cselect_b32 s5, s11, 5 873; VI-NEXT: s_cmp_lg_u32 s4, 2 874; VI-NEXT: s_cselect_b32 s6, s10, 5 875; VI-NEXT: s_cmp_lg_u32 s4, 1 876; VI-NEXT: s_cselect_b32 s7, s9, 5 877; VI-NEXT: s_cmp_lg_u32 s4, 0 878; VI-NEXT: s_cselect_b32 s8, s8, 5 879; VI-NEXT: s_cmp_lg_u32 s4, 7 880; VI-NEXT: s_cselect_b32 s9, s15, 5 881; VI-NEXT: s_cmp_lg_u32 s4, 6 882; VI-NEXT: s_cselect_b32 s10, s14, 5 883; VI-NEXT: s_cmp_lg_u32 s4, 5 884; VI-NEXT: s_cselect_b32 s11, s13, 5 885; VI-NEXT: s_cmp_lg_u32 s4, 4 886; VI-NEXT: s_cselect_b32 s4, s12, 5 887; VI-NEXT: v_mov_b32_e32 v0, s4 888; VI-NEXT: v_mov_b32_e32 v1, s11 889; VI-NEXT: v_mov_b32_e32 v2, s10 890; VI-NEXT: v_mov_b32_e32 v3, s9 891; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 892; VI-NEXT: s_nop 0 893; VI-NEXT: v_mov_b32_e32 v0, s8 894; VI-NEXT: v_mov_b32_e32 v1, s7 895; VI-NEXT: v_mov_b32_e32 v2, s6 896; VI-NEXT: v_mov_b32_e32 v3, s5 897; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 898; VI-NEXT: s_endpgm 899 %vecins = insertelement <8 x i32> %a, i32 5, i32 %b 900 store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32 901 ret void 902} 903 904define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind { 905; SI-LABEL: dynamic_insertelement_v16i32: 906; SI: ; %bb.0: 907; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 908; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 909; SI-NEXT: s_load_dword s4, s[4:5], 0x20 910; SI-NEXT: s_mov_b32 s3, 0x100f000 911; SI-NEXT: s_mov_b32 s2, -1 912; SI-NEXT: s_waitcnt lgkmcnt(0) 913; SI-NEXT: v_mov_b32_e32 v0, s8 914; SI-NEXT: v_mov_b32_e32 v1, s9 915; SI-NEXT: v_mov_b32_e32 v2, s10 916; SI-NEXT: v_mov_b32_e32 v3, s11 917; SI-NEXT: v_mov_b32_e32 v4, s12 918; SI-NEXT: v_mov_b32_e32 v5, s13 919; SI-NEXT: v_mov_b32_e32 v6, s14 920; SI-NEXT: v_mov_b32_e32 v7, s15 921; SI-NEXT: v_mov_b32_e32 v8, s16 922; SI-NEXT: v_mov_b32_e32 v9, s17 923; SI-NEXT: v_mov_b32_e32 v10, s18 924; SI-NEXT: v_mov_b32_e32 v11, s19 925; SI-NEXT: v_mov_b32_e32 v12, s20 926; SI-NEXT: v_mov_b32_e32 v13, s21 927; SI-NEXT: v_mov_b32_e32 v14, s22 928; SI-NEXT: v_mov_b32_e32 v15, s23 929; SI-NEXT: s_mov_b32 m0, s4 930; SI-NEXT: v_movreld_b32_e32 v0, 5 931; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 932; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 933; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 934; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 935; SI-NEXT: s_endpgm 936; 937; VI-LABEL: dynamic_insertelement_v16i32: 938; VI: ; %bb.0: 939; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 940; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 941; VI-NEXT: s_load_dword s4, s[4:5], 0x80 942; VI-NEXT: s_mov_b32 s3, 0x1100f000 943; VI-NEXT: s_mov_b32 s2, -1 944; VI-NEXT: s_waitcnt lgkmcnt(0) 945; VI-NEXT: v_mov_b32_e32 v0, s8 946; VI-NEXT: v_mov_b32_e32 v1, s9 947; VI-NEXT: v_mov_b32_e32 v2, s10 948; VI-NEXT: v_mov_b32_e32 v3, s11 949; VI-NEXT: v_mov_b32_e32 v4, s12 950; VI-NEXT: v_mov_b32_e32 v5, s13 951; VI-NEXT: v_mov_b32_e32 v6, s14 952; VI-NEXT: v_mov_b32_e32 v7, s15 953; VI-NEXT: v_mov_b32_e32 v8, s16 954; VI-NEXT: v_mov_b32_e32 v9, s17 955; VI-NEXT: v_mov_b32_e32 v10, s18 956; VI-NEXT: v_mov_b32_e32 v11, s19 957; VI-NEXT: v_mov_b32_e32 v12, s20 958; VI-NEXT: v_mov_b32_e32 v13, s21 959; VI-NEXT: v_mov_b32_e32 v14, s22 960; VI-NEXT: v_mov_b32_e32 v15, s23 961; VI-NEXT: s_mov_b32 m0, s4 962; VI-NEXT: v_movreld_b32_e32 v0, 5 963; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 964; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 965; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 966; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 967; VI-NEXT: s_endpgm 968 %vecins = insertelement <16 x i32> %a, i32 5, i32 %b 969 store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64 970 ret void 971} 972 973define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind { 974; SI-LABEL: dynamic_insertelement_v2i16: 975; SI: ; %bb.0: 976; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 977; SI-NEXT: s_load_dword s6, s[4:5], 0x2 978; SI-NEXT: s_load_dword s4, s[4:5], 0x3 979; SI-NEXT: s_mov_b32 s3, 0x100f000 980; SI-NEXT: s_mov_b32 s2, -1 981; SI-NEXT: s_waitcnt lgkmcnt(0) 982; SI-NEXT: s_lshl_b32 s4, s4, 4 983; SI-NEXT: s_lshl_b32 s4, 0xffff, s4 984; SI-NEXT: s_andn2_b32 s5, s6, s4 985; SI-NEXT: s_and_b32 s4, s4, 0x50005 986; SI-NEXT: s_or_b32 s4, s4, s5 987; SI-NEXT: v_mov_b32_e32 v0, s4 988; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 989; SI-NEXT: s_endpgm 990; 991; VI-LABEL: dynamic_insertelement_v2i16: 992; VI: ; %bb.0: 993; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 994; VI-NEXT: s_load_dword s6, s[4:5], 0x8 995; VI-NEXT: s_load_dword s4, s[4:5], 0xc 996; VI-NEXT: s_mov_b32 s3, 0x1100f000 997; VI-NEXT: s_mov_b32 s2, -1 998; VI-NEXT: s_waitcnt lgkmcnt(0) 999; VI-NEXT: s_lshl_b32 s4, s4, 4 1000; VI-NEXT: s_lshl_b32 s4, 0xffff, s4 1001; VI-NEXT: s_andn2_b32 s5, s6, s4 1002; VI-NEXT: s_and_b32 s4, s4, 0x50005 1003; VI-NEXT: s_or_b32 s4, s4, s5 1004; VI-NEXT: v_mov_b32_e32 v0, s4 1005; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1006; VI-NEXT: s_endpgm 1007 %vecins = insertelement <2 x i16> %a, i16 5, i32 %b 1008 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8 1009 ret void 1010} 1011 1012define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind { 1013; SI-LABEL: dynamic_insertelement_v3i16: 1014; SI: ; %bb.0: 1015; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1016; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 1017; SI-NEXT: s_load_dword s4, s[4:5], 0x4 1018; SI-NEXT: s_mov_b32 s3, 0x100f000 1019; SI-NEXT: s_mov_b32 s2, -1 1020; SI-NEXT: s_waitcnt lgkmcnt(0) 1021; SI-NEXT: s_lshl_b32 s8, s4, 4 1022; SI-NEXT: s_mov_b64 s[4:5], 0xffff 1023; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 1024; SI-NEXT: s_mov_b32 s8, 0x50005 1025; SI-NEXT: s_and_b32 s9, s5, s8 1026; SI-NEXT: s_and_b32 s8, s4, s8 1027; SI-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] 1028; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] 1029; SI-NEXT: v_mov_b32_e32 v0, s5 1030; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 1031; SI-NEXT: v_mov_b32_e32 v0, s4 1032; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1033; SI-NEXT: s_endpgm 1034; 1035; VI-LABEL: dynamic_insertelement_v3i16: 1036; VI: ; %bb.0: 1037; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1038; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 1039; VI-NEXT: s_load_dword s4, s[4:5], 0x10 1040; VI-NEXT: s_mov_b32 s3, 0x1100f000 1041; VI-NEXT: s_mov_b32 s2, -1 1042; VI-NEXT: s_waitcnt lgkmcnt(0) 1043; VI-NEXT: s_lshl_b32 s8, s4, 4 1044; VI-NEXT: s_mov_b64 s[4:5], 0xffff 1045; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 1046; VI-NEXT: s_mov_b32 s8, 0x50005 1047; VI-NEXT: s_mov_b32 s9, s8 1048; VI-NEXT: s_andn2_b64 s[6:7], s[6:7], s[4:5] 1049; VI-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] 1050; VI-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] 1051; VI-NEXT: v_mov_b32_e32 v0, s5 1052; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 1053; VI-NEXT: v_mov_b32_e32 v0, s4 1054; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1055; VI-NEXT: s_endpgm 1056 %vecins = insertelement <3 x i16> %a, i16 5, i32 %b 1057 store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8 1058 ret void 1059} 1060 1061define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind { 1062; SI-LABEL: dynamic_insertelement_v2i8: 1063; SI: ; %bb.0: 1064; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1065; SI-NEXT: s_load_dword s6, s[4:5], 0xa 1066; SI-NEXT: s_load_dword s4, s[4:5], 0x13 1067; SI-NEXT: s_mov_b32 s3, 0x100f000 1068; SI-NEXT: s_mov_b32 s2, -1 1069; SI-NEXT: s_waitcnt lgkmcnt(0) 1070; SI-NEXT: s_lshl_b32 s4, s4, 3 1071; SI-NEXT: s_lshl_b32 s4, -1, s4 1072; SI-NEXT: s_andn2_b32 s5, s6, s4 1073; SI-NEXT: s_and_b32 s4, s4, 0x505 1074; SI-NEXT: s_or_b32 s4, s4, s5 1075; SI-NEXT: v_mov_b32_e32 v0, s4 1076; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1077; SI-NEXT: s_endpgm 1078; 1079; VI-LABEL: dynamic_insertelement_v2i8: 1080; VI: ; %bb.0: 1081; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1082; VI-NEXT: s_load_dword s6, s[4:5], 0x28 1083; VI-NEXT: s_load_dword s4, s[4:5], 0x4c 1084; VI-NEXT: s_mov_b32 s3, 0x1100f000 1085; VI-NEXT: s_mov_b32 s2, -1 1086; VI-NEXT: s_waitcnt lgkmcnt(0) 1087; VI-NEXT: s_lshl_b32 s4, s4, 3 1088; VI-NEXT: v_lshlrev_b16_e64 v0, s4, -1 1089; VI-NEXT: v_not_b32_e32 v1, v0 1090; VI-NEXT: v_and_b32_e32 v1, s6, v1 1091; VI-NEXT: v_and_b32_e32 v0, 0x505, v0 1092; VI-NEXT: v_or_b32_e32 v0, v0, v1 1093; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1094; VI-NEXT: s_endpgm 1095 %vecins = insertelement <2 x i8> %a, i8 5, i32 %b 1096 store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8 1097 ret void 1098} 1099 1100; FIXME: post legalize i16 and i32 shifts aren't merged because of 1101; isTypeDesirableForOp in SimplifyDemandedBits 1102define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind { 1103; SI-LABEL: dynamic_insertelement_v3i8: 1104; SI: ; %bb.0: 1105; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1106; SI-NEXT: s_load_dword s6, s[4:5], 0xa 1107; SI-NEXT: s_load_dword s4, s[4:5], 0x13 1108; SI-NEXT: s_mov_b32 s3, 0x100f000 1109; SI-NEXT: s_mov_b32 s2, -1 1110; SI-NEXT: s_waitcnt lgkmcnt(0) 1111; SI-NEXT: s_lshl_b32 s4, s4, 3 1112; SI-NEXT: s_lshl_b32 s4, 0xffff, s4 1113; SI-NEXT: s_andn2_b32 s5, s6, s4 1114; SI-NEXT: s_and_b32 s4, s4, 0x5050505 1115; SI-NEXT: s_or_b32 s4, s4, s5 1116; SI-NEXT: v_mov_b32_e32 v0, s4 1117; SI-NEXT: s_lshr_b32 s5, s4, 16 1118; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1119; SI-NEXT: v_mov_b32_e32 v0, s5 1120; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2 1121; SI-NEXT: s_endpgm 1122; 1123; VI-LABEL: dynamic_insertelement_v3i8: 1124; VI: ; %bb.0: 1125; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1126; VI-NEXT: s_load_dword s6, s[4:5], 0x28 1127; VI-NEXT: s_load_dword s4, s[4:5], 0x4c 1128; VI-NEXT: s_mov_b32 s3, 0x1100f000 1129; VI-NEXT: s_mov_b32 s2, -1 1130; VI-NEXT: s_waitcnt lgkmcnt(0) 1131; VI-NEXT: s_lshl_b32 s4, s4, 3 1132; VI-NEXT: s_lshl_b32 s4, 0xffff, s4 1133; VI-NEXT: s_andn2_b32 s5, s6, s4 1134; VI-NEXT: s_and_b32 s4, s4, 0x5050505 1135; VI-NEXT: s_or_b32 s4, s4, s5 1136; VI-NEXT: v_mov_b32_e32 v0, s4 1137; VI-NEXT: s_lshr_b32 s5, s4, 16 1138; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1139; VI-NEXT: v_mov_b32_e32 v0, s5 1140; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2 1141; VI-NEXT: s_endpgm 1142 %vecins = insertelement <3 x i8> %a, i8 5, i32 %b 1143 store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4 1144 ret void 1145} 1146 1147define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind { 1148; SI-LABEL: dynamic_insertelement_v4i8: 1149; SI: ; %bb.0: 1150; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1151; SI-NEXT: s_load_dword s6, s[4:5], 0xa 1152; SI-NEXT: s_load_dword s4, s[4:5], 0x13 1153; SI-NEXT: s_mov_b32 s3, 0x100f000 1154; SI-NEXT: s_mov_b32 s2, -1 1155; SI-NEXT: s_waitcnt lgkmcnt(0) 1156; SI-NEXT: s_lshl_b32 s4, s4, 3 1157; SI-NEXT: s_lshl_b32 s4, 0xffff, s4 1158; SI-NEXT: s_andn2_b32 s5, s6, s4 1159; SI-NEXT: s_and_b32 s4, s4, 0x5050505 1160; SI-NEXT: s_or_b32 s4, s4, s5 1161; SI-NEXT: v_mov_b32_e32 v0, s4 1162; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1163; SI-NEXT: s_endpgm 1164; 1165; VI-LABEL: dynamic_insertelement_v4i8: 1166; VI: ; %bb.0: 1167; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1168; VI-NEXT: s_load_dword s6, s[4:5], 0x28 1169; VI-NEXT: s_load_dword s4, s[4:5], 0x4c 1170; VI-NEXT: s_mov_b32 s3, 0x1100f000 1171; VI-NEXT: s_mov_b32 s2, -1 1172; VI-NEXT: s_waitcnt lgkmcnt(0) 1173; VI-NEXT: s_lshl_b32 s4, s4, 3 1174; VI-NEXT: s_lshl_b32 s4, 0xffff, s4 1175; VI-NEXT: s_andn2_b32 s5, s6, s4 1176; VI-NEXT: s_and_b32 s4, s4, 0x5050505 1177; VI-NEXT: s_or_b32 s4, s4, s5 1178; VI-NEXT: v_mov_b32_e32 v0, s4 1179; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1180; VI-NEXT: s_endpgm 1181 %vecins = insertelement <4 x i8> %a, i8 5, i32 %b 1182 store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4 1183 ret void 1184} 1185 1186define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind { 1187; SI-LABEL: s_dynamic_insertelement_v8i8: 1188; SI: ; %bb.0: 1189; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 1190; SI-NEXT: s_load_dword s6, s[4:5], 0x4 1191; SI-NEXT: s_mov_b32 s3, 0x100f000 1192; SI-NEXT: s_mov_b32 s2, -1 1193; SI-NEXT: s_waitcnt lgkmcnt(0) 1194; SI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0 1195; SI-NEXT: s_mov_b32 s0, s8 1196; SI-NEXT: s_lshl_b32 s8, s6, 3 1197; SI-NEXT: s_mov_b64 s[6:7], 0xffff 1198; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 1199; SI-NEXT: s_mov_b32 s8, 0x5050505 1200; SI-NEXT: s_mov_b32 s1, s9 1201; SI-NEXT: s_and_b32 s9, s7, s8 1202; SI-NEXT: s_and_b32 s8, s6, s8 1203; SI-NEXT: s_waitcnt lgkmcnt(0) 1204; SI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] 1205; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] 1206; SI-NEXT: v_mov_b32_e32 v0, s4 1207; SI-NEXT: v_mov_b32_e32 v1, s5 1208; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1209; SI-NEXT: s_endpgm 1210; 1211; VI-LABEL: s_dynamic_insertelement_v8i8: 1212; VI: ; %bb.0: 1213; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 1214; VI-NEXT: s_load_dword s6, s[4:5], 0x10 1215; VI-NEXT: s_mov_b32 s3, 0x1100f000 1216; VI-NEXT: s_mov_b32 s2, -1 1217; VI-NEXT: s_waitcnt lgkmcnt(0) 1218; VI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0 1219; VI-NEXT: s_mov_b32 s0, s8 1220; VI-NEXT: s_lshl_b32 s8, s6, 3 1221; VI-NEXT: s_mov_b64 s[6:7], 0xffff 1222; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 1223; VI-NEXT: s_mov_b32 s8, 0x5050505 1224; VI-NEXT: s_mov_b32 s1, s9 1225; VI-NEXT: s_and_b32 s9, s7, s8 1226; VI-NEXT: s_and_b32 s8, s6, s8 1227; VI-NEXT: s_waitcnt lgkmcnt(0) 1228; VI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] 1229; VI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] 1230; VI-NEXT: v_mov_b32_e32 v0, s4 1231; VI-NEXT: v_mov_b32_e32 v1, s5 1232; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1233; VI-NEXT: s_endpgm 1234 %a = load <8 x i8>, <8 x i8> addrspace(4)* %a.ptr, align 4 1235 %vecins = insertelement <8 x i8> %a, i8 5, i32 %b 1236 store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8 1237 ret void 1238} 1239 1240define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind { 1241; SI-LABEL: dynamic_insertelement_v16i8: 1242; SI: ; %bb.0: 1243; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1244; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 1245; SI-NEXT: s_load_dword s4, s[4:5], 0x8 1246; SI-NEXT: s_mov_b32 s3, 0x100f000 1247; SI-NEXT: s_mov_b32 s2, -1 1248; SI-NEXT: s_waitcnt lgkmcnt(0) 1249; SI-NEXT: s_lshr_b32 s5, s11, 24 1250; SI-NEXT: v_mov_b32_e32 v0, s5 1251; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 15 1252; SI-NEXT: s_lshr_b32 s5, s11, 16 1253; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1254; SI-NEXT: v_mov_b32_e32 v1, s5 1255; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 14 1256; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1257; SI-NEXT: s_movk_i32 s5, 0xff 1258; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1259; SI-NEXT: v_and_b32_e32 v1, s5, v1 1260; SI-NEXT: s_lshr_b32 s6, s11, 8 1261; SI-NEXT: v_or_b32_e32 v0, v1, v0 1262; SI-NEXT: v_mov_b32_e32 v1, s6 1263; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 13 1264; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1265; SI-NEXT: v_mov_b32_e32 v2, s11 1266; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 12 1267; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc 1268; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1269; SI-NEXT: v_and_b32_e32 v2, s5, v2 1270; SI-NEXT: v_or_b32_e32 v1, v2, v1 1271; SI-NEXT: s_mov_b32 s6, 0xffff 1272; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1273; SI-NEXT: v_and_b32_e32 v1, s6, v1 1274; SI-NEXT: s_lshr_b32 s7, s10, 24 1275; SI-NEXT: v_or_b32_e32 v3, v1, v0 1276; SI-NEXT: v_mov_b32_e32 v0, s7 1277; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 11 1278; SI-NEXT: s_lshr_b32 s7, s10, 16 1279; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1280; SI-NEXT: v_mov_b32_e32 v1, s7 1281; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 10 1282; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1283; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1284; SI-NEXT: v_and_b32_e32 v1, s5, v1 1285; SI-NEXT: s_lshr_b32 s7, s10, 8 1286; SI-NEXT: v_or_b32_e32 v0, v1, v0 1287; SI-NEXT: v_mov_b32_e32 v1, s7 1288; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 9 1289; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1290; SI-NEXT: v_mov_b32_e32 v2, s10 1291; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 8 1292; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc 1293; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1294; SI-NEXT: v_and_b32_e32 v2, s5, v2 1295; SI-NEXT: v_or_b32_e32 v1, v2, v1 1296; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1297; SI-NEXT: v_and_b32_e32 v1, s6, v1 1298; SI-NEXT: s_lshr_b32 s7, s9, 24 1299; SI-NEXT: v_or_b32_e32 v2, v1, v0 1300; SI-NEXT: v_mov_b32_e32 v0, s7 1301; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7 1302; SI-NEXT: s_lshr_b32 s7, s9, 16 1303; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1304; SI-NEXT: v_mov_b32_e32 v1, s7 1305; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6 1306; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1307; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1308; SI-NEXT: v_and_b32_e32 v1, s5, v1 1309; SI-NEXT: s_lshr_b32 s7, s9, 8 1310; SI-NEXT: v_or_b32_e32 v0, v1, v0 1311; SI-NEXT: v_mov_b32_e32 v1, s7 1312; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5 1313; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1314; SI-NEXT: v_mov_b32_e32 v4, s9 1315; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4 1316; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 1317; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1318; SI-NEXT: v_and_b32_e32 v4, s5, v4 1319; SI-NEXT: v_or_b32_e32 v1, v4, v1 1320; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1321; SI-NEXT: v_and_b32_e32 v1, s6, v1 1322; SI-NEXT: s_lshr_b32 s7, s8, 24 1323; SI-NEXT: v_or_b32_e32 v1, v1, v0 1324; SI-NEXT: v_mov_b32_e32 v0, s7 1325; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 1326; SI-NEXT: s_lshr_b32 s7, s8, 16 1327; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1328; SI-NEXT: v_mov_b32_e32 v4, s7 1329; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 1330; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 1331; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1332; SI-NEXT: v_and_b32_e32 v4, s5, v4 1333; SI-NEXT: s_lshr_b32 s7, s8, 8 1334; SI-NEXT: v_or_b32_e32 v0, v4, v0 1335; SI-NEXT: v_mov_b32_e32 v4, s7 1336; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 1337; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 1338; SI-NEXT: v_mov_b32_e32 v5, s8 1339; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 1340; SI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc 1341; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 1342; SI-NEXT: v_and_b32_e32 v5, s5, v5 1343; SI-NEXT: v_or_b32_e32 v4, v5, v4 1344; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1345; SI-NEXT: v_and_b32_e32 v4, s6, v4 1346; SI-NEXT: v_or_b32_e32 v0, v4, v0 1347; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1348; SI-NEXT: s_endpgm 1349; 1350; VI-LABEL: dynamic_insertelement_v16i8: 1351; VI: ; %bb.0: 1352; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1353; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 1354; VI-NEXT: s_load_dword s4, s[4:5], 0x20 1355; VI-NEXT: s_mov_b32 s3, 0x1100f000 1356; VI-NEXT: s_mov_b32 s2, -1 1357; VI-NEXT: s_waitcnt lgkmcnt(0) 1358; VI-NEXT: s_lshr_b32 s5, s11, 24 1359; VI-NEXT: v_mov_b32_e32 v0, s5 1360; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 15 1361; VI-NEXT: s_lshr_b32 s5, s11, 16 1362; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1363; VI-NEXT: v_mov_b32_e32 v1, s5 1364; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 14 1365; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1366; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 1367; VI-NEXT: s_lshr_b32 s5, s11, 8 1368; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1369; VI-NEXT: v_mov_b32_e32 v1, s5 1370; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 13 1371; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1372; VI-NEXT: v_mov_b32_e32 v2, s11 1373; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 12 1374; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc 1375; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 1376; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1377; VI-NEXT: s_lshr_b32 s5, s10, 24 1378; VI-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1379; VI-NEXT: v_mov_b32_e32 v0, s5 1380; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 11 1381; VI-NEXT: s_lshr_b32 s5, s10, 16 1382; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1383; VI-NEXT: v_mov_b32_e32 v1, s5 1384; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 10 1385; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1386; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 1387; VI-NEXT: s_lshr_b32 s5, s10, 8 1388; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1389; VI-NEXT: v_mov_b32_e32 v1, s5 1390; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 9 1391; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1392; VI-NEXT: v_mov_b32_e32 v2, s10 1393; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 8 1394; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc 1395; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 1396; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1397; VI-NEXT: s_lshr_b32 s5, s9, 24 1398; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1399; VI-NEXT: v_mov_b32_e32 v0, s5 1400; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7 1401; VI-NEXT: s_lshr_b32 s5, s9, 16 1402; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1403; VI-NEXT: v_mov_b32_e32 v1, s5 1404; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6 1405; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1406; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 1407; VI-NEXT: s_lshr_b32 s5, s9, 8 1408; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1409; VI-NEXT: v_mov_b32_e32 v1, s5 1410; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5 1411; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1412; VI-NEXT: v_mov_b32_e32 v4, s9 1413; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4 1414; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 1415; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 1416; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1417; VI-NEXT: s_lshr_b32 s5, s8, 24 1418; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1419; VI-NEXT: v_mov_b32_e32 v0, s5 1420; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 1421; VI-NEXT: s_lshr_b32 s5, s8, 16 1422; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1423; VI-NEXT: v_mov_b32_e32 v4, s5 1424; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 1425; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 1426; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 1427; VI-NEXT: s_lshr_b32 s5, s8, 8 1428; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1429; VI-NEXT: v_mov_b32_e32 v4, s5 1430; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 1431; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 1432; VI-NEXT: v_mov_b32_e32 v5, s8 1433; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 1434; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4 1435; VI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc 1436; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1437; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1438; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1439; VI-NEXT: s_endpgm 1440 %vecins = insertelement <16 x i8> %a, i8 5, i32 %b 1441 store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16 1442 ret void 1443} 1444 1445; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that 1446; the compiler doesn't crash. 1447define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) { 1448; SI-LABEL: insert_split_bb: 1449; SI: ; %bb.0: ; %entry 1450; SI-NEXT: s_load_dword s6, s[4:5], 0x4 1451; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1452; SI-NEXT: s_waitcnt lgkmcnt(0) 1453; SI-NEXT: s_cmp_lg_u32 s6, 0 1454; SI-NEXT: s_cbranch_scc0 BB30_2 1455; SI-NEXT: ; %bb.1: ; %else 1456; SI-NEXT: s_load_dword s7, s[2:3], 0x1 1457; SI-NEXT: s_mov_b64 s[4:5], 0 1458; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] 1459; SI-NEXT: s_waitcnt lgkmcnt(0) 1460; SI-NEXT: s_mov_b64 vcc, vcc 1461; SI-NEXT: s_cbranch_vccz BB30_3 1462; SI-NEXT: s_branch BB30_4 1463; SI-NEXT: BB30_2: 1464; SI-NEXT: BB30_3: ; %if 1465; SI-NEXT: s_load_dword s7, s[2:3], 0x0 1466; SI-NEXT: BB30_4: ; %endif 1467; SI-NEXT: s_waitcnt lgkmcnt(0) 1468; SI-NEXT: v_mov_b32_e32 v0, s6 1469; SI-NEXT: s_mov_b32 s3, 0x100f000 1470; SI-NEXT: s_mov_b32 s2, -1 1471; SI-NEXT: v_mov_b32_e32 v1, s7 1472; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1473; SI-NEXT: s_endpgm 1474; 1475; VI-LABEL: insert_split_bb: 1476; VI: ; %bb.0: ; %entry 1477; VI-NEXT: s_load_dword s6, s[4:5], 0x10 1478; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1479; VI-NEXT: s_waitcnt lgkmcnt(0) 1480; VI-NEXT: s_cmp_lg_u32 s6, 0 1481; VI-NEXT: s_cbranch_scc0 BB30_2 1482; VI-NEXT: ; %bb.1: ; %else 1483; VI-NEXT: s_load_dword s7, s[2:3], 0x4 1484; VI-NEXT: s_cbranch_execz BB30_3 1485; VI-NEXT: s_branch BB30_4 1486; VI-NEXT: BB30_2: 1487; VI-NEXT: BB30_3: ; %if 1488; VI-NEXT: s_waitcnt lgkmcnt(0) 1489; VI-NEXT: s_load_dword s7, s[2:3], 0x0 1490; VI-NEXT: BB30_4: ; %endif 1491; VI-NEXT: s_waitcnt lgkmcnt(0) 1492; VI-NEXT: v_mov_b32_e32 v0, s6 1493; VI-NEXT: s_mov_b32 s3, 0x1100f000 1494; VI-NEXT: s_mov_b32 s2, -1 1495; VI-NEXT: v_mov_b32_e32 v1, s7 1496; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1497; VI-NEXT: s_endpgm 1498entry: 1499 %0 = insertelement <2 x i32> undef, i32 %a, i32 0 1500 %1 = icmp eq i32 %a, 0 1501 br i1 %1, label %if, label %else 1502 1503if: 1504 %2 = load i32, i32 addrspace(1)* %in 1505 %3 = insertelement <2 x i32> %0, i32 %2, i32 1 1506 br label %endif 1507 1508else: 1509 %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1 1510 %5 = load i32, i32 addrspace(1)* %4 1511 %6 = insertelement <2 x i32> %0, i32 %5, i32 1 1512 br label %endif 1513 1514endif: 1515 %7 = phi <2 x i32> [%3, %if], [%6, %else] 1516 store <2 x i32> %7, <2 x i32> addrspace(1)* %out 1517 ret void 1518} 1519 1520define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind { 1521; SI-LABEL: dynamic_insertelement_v2f64: 1522; SI: ; %bb.0: 1523; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1524; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xc 1525; SI-NEXT: s_load_dword s4, s[4:5], 0x18 1526; SI-NEXT: v_mov_b32_e32 v1, 0x40200000 1527; SI-NEXT: s_mov_b32 s3, 0x100f000 1528; SI-NEXT: s_mov_b32 s2, -1 1529; SI-NEXT: s_waitcnt lgkmcnt(0) 1530; SI-NEXT: v_mov_b32_e32 v0, s11 1531; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 1532; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc 1533; SI-NEXT: v_mov_b32_e32 v0, s10 1534; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc 1535; SI-NEXT: v_mov_b32_e32 v0, s9 1536; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 1537; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 1538; SI-NEXT: v_mov_b32_e32 v0, s8 1539; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 1540; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1541; SI-NEXT: s_endpgm 1542; 1543; VI-LABEL: dynamic_insertelement_v2f64: 1544; VI: ; %bb.0: 1545; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1546; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x30 1547; VI-NEXT: s_load_dword s4, s[4:5], 0x60 1548; VI-NEXT: v_mov_b32_e32 v1, 0x40200000 1549; VI-NEXT: s_mov_b32 s3, 0x1100f000 1550; VI-NEXT: s_mov_b32 s2, -1 1551; VI-NEXT: s_waitcnt lgkmcnt(0) 1552; VI-NEXT: v_mov_b32_e32 v0, s11 1553; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 1554; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc 1555; VI-NEXT: v_mov_b32_e32 v0, s10 1556; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc 1557; VI-NEXT: v_mov_b32_e32 v0, s9 1558; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 1559; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 1560; VI-NEXT: v_mov_b32_e32 v0, s8 1561; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 1562; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1563; VI-NEXT: s_endpgm 1564 %vecins = insertelement <2 x double> %a, double 8.0, i32 %b 1565 store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16 1566 ret void 1567} 1568 1569define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind { 1570; SI-LABEL: dynamic_insertelement_v2i64: 1571; SI: ; %bb.0: 1572; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1573; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 1574; SI-NEXT: s_load_dword s6, s[4:5], 0x8 1575; SI-NEXT: s_mov_b32 s3, 0x100f000 1576; SI-NEXT: s_mov_b32 s2, -1 1577; SI-NEXT: s_waitcnt lgkmcnt(0) 1578; SI-NEXT: v_mov_b32_e32 v0, s11 1579; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1 1580; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] 1581; SI-NEXT: v_mov_b32_e32 v0, s10 1582; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5] 1583; SI-NEXT: v_mov_b32_e32 v0, s9 1584; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0 1585; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] 1586; SI-NEXT: v_mov_b32_e32 v0, s8 1587; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5] 1588; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1589; SI-NEXT: s_endpgm 1590; 1591; VI-LABEL: dynamic_insertelement_v2i64: 1592; VI: ; %bb.0: 1593; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1594; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 1595; VI-NEXT: s_load_dword s6, s[4:5], 0x20 1596; VI-NEXT: s_mov_b32 s3, 0x1100f000 1597; VI-NEXT: s_mov_b32 s2, -1 1598; VI-NEXT: s_waitcnt lgkmcnt(0) 1599; VI-NEXT: v_mov_b32_e32 v0, s11 1600; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1 1601; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] 1602; VI-NEXT: v_mov_b32_e32 v0, s10 1603; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5] 1604; VI-NEXT: v_mov_b32_e32 v0, s9 1605; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0 1606; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] 1607; VI-NEXT: v_mov_b32_e32 v0, s8 1608; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5] 1609; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1610; VI-NEXT: s_endpgm 1611 %vecins = insertelement <2 x i64> %a, i64 5, i32 %b 1612 store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8 1613 ret void 1614} 1615 1616define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind { 1617; SI-LABEL: dynamic_insertelement_v3i64: 1618; SI: ; %bb.0: 1619; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1620; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 1621; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xc 1622; SI-NEXT: s_load_dword s12, s[4:5], 0x10 1623; SI-NEXT: s_mov_b32 s3, 0x100f000 1624; SI-NEXT: s_mov_b32 s2, -1 1625; SI-NEXT: s_waitcnt lgkmcnt(0) 1626; SI-NEXT: v_mov_b32_e32 v0, s11 1627; SI-NEXT: v_mov_b32_e32 v4, s7 1628; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 1 1629; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] 1630; SI-NEXT: v_mov_b32_e32 v0, s10 1631; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5] 1632; SI-NEXT: v_mov_b32_e32 v0, s9 1633; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 0 1634; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] 1635; SI-NEXT: v_mov_b32_e32 v0, s8 1636; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5] 1637; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 2 1638; SI-NEXT: v_cndmask_b32_e64 v5, v4, 0, s[4:5] 1639; SI-NEXT: v_mov_b32_e32 v4, s6 1640; SI-NEXT: v_cndmask_b32_e64 v4, v4, 5, s[4:5] 1641; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16 1642; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1643; SI-NEXT: s_endpgm 1644; 1645; VI-LABEL: dynamic_insertelement_v3i64: 1646; VI: ; %bb.0: 1647; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1648; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20 1649; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x30 1650; VI-NEXT: s_load_dword s12, s[4:5], 0x40 1651; VI-NEXT: s_mov_b32 s3, 0x1100f000 1652; VI-NEXT: s_mov_b32 s2, -1 1653; VI-NEXT: s_waitcnt lgkmcnt(0) 1654; VI-NEXT: v_mov_b32_e32 v0, s11 1655; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 1 1656; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] 1657; VI-NEXT: v_mov_b32_e32 v0, s10 1658; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5] 1659; VI-NEXT: v_mov_b32_e32 v0, s9 1660; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 0 1661; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] 1662; VI-NEXT: v_mov_b32_e32 v0, s8 1663; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5] 1664; VI-NEXT: v_mov_b32_e32 v4, s7 1665; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 2 1666; VI-NEXT: v_cndmask_b32_e64 v5, v4, 0, s[4:5] 1667; VI-NEXT: v_mov_b32_e32 v4, s6 1668; VI-NEXT: v_cndmask_b32_e64 v4, v4, 5, s[4:5] 1669; VI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16 1670; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1671; VI-NEXT: s_endpgm 1672 %vecins = insertelement <3 x i64> %a, i64 5, i32 %b 1673 store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32 1674 ret void 1675} 1676 1677define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind { 1678; SI-LABEL: dynamic_insertelement_v4f64: 1679; SI: ; %bb.0: 1680; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1681; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 1682; SI-NEXT: s_load_dword s4, s[4:5], 0x10 1683; SI-NEXT: v_mov_b32_e32 v4, 0x40200000 1684; SI-NEXT: s_mov_b32 s3, 0x100f000 1685; SI-NEXT: s_mov_b32 s2, -1 1686; SI-NEXT: s_waitcnt lgkmcnt(0) 1687; SI-NEXT: v_mov_b32_e32 v0, s11 1688; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 1689; SI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc 1690; SI-NEXT: v_mov_b32_e32 v0, s10 1691; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc 1692; SI-NEXT: v_mov_b32_e32 v0, s9 1693; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 1694; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc 1695; SI-NEXT: v_mov_b32_e32 v0, s8 1696; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 1697; SI-NEXT: v_mov_b32_e32 v5, s15 1698; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 3 1699; SI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc 1700; SI-NEXT: v_mov_b32_e32 v5, s14 1701; SI-NEXT: v_cndmask_b32_e64 v6, v5, 0, vcc 1702; SI-NEXT: v_mov_b32_e32 v5, s13 1703; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 2 1704; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc 1705; SI-NEXT: v_mov_b32_e32 v4, s12 1706; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc 1707; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 1708; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1709; SI-NEXT: s_endpgm 1710; 1711; VI-LABEL: dynamic_insertelement_v4f64: 1712; VI: ; %bb.0: 1713; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1714; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 1715; VI-NEXT: s_load_dword s4, s[4:5], 0x40 1716; VI-NEXT: v_mov_b32_e32 v4, 0x40200000 1717; VI-NEXT: s_mov_b32 s3, 0x1100f000 1718; VI-NEXT: s_mov_b32 s2, -1 1719; VI-NEXT: s_waitcnt lgkmcnt(0) 1720; VI-NEXT: v_mov_b32_e32 v0, s11 1721; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 1722; VI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc 1723; VI-NEXT: v_mov_b32_e32 v0, s10 1724; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc 1725; VI-NEXT: v_mov_b32_e32 v0, s9 1726; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 1727; VI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc 1728; VI-NEXT: v_mov_b32_e32 v0, s8 1729; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 1730; VI-NEXT: v_mov_b32_e32 v5, s15 1731; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 3 1732; VI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc 1733; VI-NEXT: v_mov_b32_e32 v5, s14 1734; VI-NEXT: v_cndmask_b32_e64 v6, v5, 0, vcc 1735; VI-NEXT: v_mov_b32_e32 v5, s13 1736; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 2 1737; VI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc 1738; VI-NEXT: v_mov_b32_e32 v4, s12 1739; VI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc 1740; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 1741; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1742; VI-NEXT: s_endpgm 1743 %vecins = insertelement <4 x double> %a, double 8.0, i32 %b 1744 store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16 1745 ret void 1746} 1747 1748define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 { 1749; SI-LABEL: dynamic_insertelement_v8f64: 1750; SI: ; %bb.0: 1751; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1752; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 1753; SI-NEXT: s_load_dword s4, s[4:5], 0x20 1754; SI-NEXT: v_mov_b32_e32 v16, 0x40200000 1755; SI-NEXT: s_mov_b32 s3, 0x100f000 1756; SI-NEXT: s_mov_b32 s2, -1 1757; SI-NEXT: s_waitcnt lgkmcnt(0) 1758; SI-NEXT: v_mov_b32_e32 v0, s8 1759; SI-NEXT: s_lshl_b32 s4, s4, 1 1760; SI-NEXT: v_mov_b32_e32 v1, s9 1761; SI-NEXT: v_mov_b32_e32 v2, s10 1762; SI-NEXT: v_mov_b32_e32 v3, s11 1763; SI-NEXT: v_mov_b32_e32 v4, s12 1764; SI-NEXT: v_mov_b32_e32 v5, s13 1765; SI-NEXT: v_mov_b32_e32 v6, s14 1766; SI-NEXT: v_mov_b32_e32 v7, s15 1767; SI-NEXT: v_mov_b32_e32 v8, s16 1768; SI-NEXT: v_mov_b32_e32 v9, s17 1769; SI-NEXT: v_mov_b32_e32 v10, s18 1770; SI-NEXT: v_mov_b32_e32 v11, s19 1771; SI-NEXT: v_mov_b32_e32 v12, s20 1772; SI-NEXT: v_mov_b32_e32 v13, s21 1773; SI-NEXT: v_mov_b32_e32 v14, s22 1774; SI-NEXT: v_mov_b32_e32 v15, s23 1775; SI-NEXT: s_mov_b32 m0, s4 1776; SI-NEXT: v_movreld_b32_e32 v0, 0 1777; SI-NEXT: v_movreld_b32_e32 v1, v16 1778; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 1779; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 1780; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 1781; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1782; SI-NEXT: s_endpgm 1783; 1784; VI-LABEL: dynamic_insertelement_v8f64: 1785; VI: ; %bb.0: 1786; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1787; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 1788; VI-NEXT: s_load_dword s4, s[4:5], 0x80 1789; VI-NEXT: v_mov_b32_e32 v16, 0x40200000 1790; VI-NEXT: s_mov_b32 s3, 0x1100f000 1791; VI-NEXT: s_mov_b32 s2, -1 1792; VI-NEXT: s_waitcnt lgkmcnt(0) 1793; VI-NEXT: v_mov_b32_e32 v0, s8 1794; VI-NEXT: s_lshl_b32 s4, s4, 1 1795; VI-NEXT: v_mov_b32_e32 v1, s9 1796; VI-NEXT: v_mov_b32_e32 v2, s10 1797; VI-NEXT: v_mov_b32_e32 v3, s11 1798; VI-NEXT: v_mov_b32_e32 v4, s12 1799; VI-NEXT: v_mov_b32_e32 v5, s13 1800; VI-NEXT: v_mov_b32_e32 v6, s14 1801; VI-NEXT: v_mov_b32_e32 v7, s15 1802; VI-NEXT: v_mov_b32_e32 v8, s16 1803; VI-NEXT: v_mov_b32_e32 v9, s17 1804; VI-NEXT: v_mov_b32_e32 v10, s18 1805; VI-NEXT: v_mov_b32_e32 v11, s19 1806; VI-NEXT: v_mov_b32_e32 v12, s20 1807; VI-NEXT: v_mov_b32_e32 v13, s21 1808; VI-NEXT: v_mov_b32_e32 v14, s22 1809; VI-NEXT: v_mov_b32_e32 v15, s23 1810; VI-NEXT: s_mov_b32 m0, s4 1811; VI-NEXT: v_movreld_b32_e32 v0, 0 1812; VI-NEXT: v_movreld_b32_e32 v1, v16 1813; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 1814; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 1815; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 1816; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1817; VI-NEXT: s_endpgm 1818 %vecins = insertelement <8 x double> %a, double 8.0, i32 %b 1819 store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16 1820 ret void 1821} 1822 1823declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 1824 1825attributes #0 = { nounwind } 1826attributes #1 = { nounwind readnone } 1827