; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GCN-NO-TONGA %s
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-TONGA %s

; FIXME: Broken on evergreen
; FIXME: For some reason the 8 and 16 vectors are being stored as
; individual elements instead of 128-bit stores.


; FIXME: Why is the constant moved into the intermediate register and
; not just directly into the vector component?

; Constant-index insert of 5.0 (0x40a00000) into lane 0 of a <4 x float>
; kernel argument. The scalar constant lands in the lane's SGPR and the
; whole vector is written back with a single dwordx4 store.
define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0x40a00000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: insertelement_v4f32_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, 0x40a00000
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; Same as above but targeting lane 1 — only s5 is overwritten with the
; constant before the full-vector store.
define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s5, 0x40a00000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: insertelement_v4f32_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s5, 0x40a00000
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; Constant-index insert into lane 2 (s6 overwritten).
define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_2:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s6, 0x40a00000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: insertelement_v4f32_2:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s6, 0x40a00000
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; Constant-index insert into lane 3 (s7 overwritten).
define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_3:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s7, 0x40a00000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: insertelement_v4f32_3:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s7, 0x40a00000
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; Integer variant: insert 999 (0x3e7, materialized with s_movk_i32) into
; lane 0 of a <4 x i32>.
define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
; SI-LABEL: insertelement_v4i32_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_movk_i32 s4, 0x3e7
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: insertelement_v4i32_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_movk_i32 s4, 0x3e7
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <4 x i32> %a, i32 999, i32 0
  store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; <3 x float> constant-index insert into lane 1; the constant goes straight
; into v1 and the result is stored with a dwordx3.
define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
; SI-LABEL: insertelement_v3f32_1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: insertelement_v3f32_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1
  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
  ret void
}

; <3 x float> constant-index insert into lane 2 (constant in v2).
define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
; SI-LABEL: insertelement_v3f32_2:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v2, 0x40a00000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: insertelement_v3f32_2:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_mov_b32_e32 v2, 0x40a00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2
  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
  ret void
}

; Index 3 is out of range for a 3-element vector, so the insert result is
; undefined and the whole store is eliminated — codegen is just s_endpgm.
define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
; GCN-LABEL: insertelement_v3f32_3:
; GCN: ; %bb.0:
; GCN-NEXT: s_endpgm
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3
  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
  ret void
}

258define <4 x float> @insertelement_to_sgpr() nounwind { 259; GCN-LABEL: insertelement_to_sgpr: 260; GCN: ; %bb.0: 261; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 262; GCN-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0 263; GCN-NEXT: s_waitcnt lgkmcnt(0) 264; GCN-NEXT: s_mov_b32 s12, 0 265; GCN-NEXT: s_mov_b32 s4, s12 266; GCN-NEXT: s_mov_b32 s5, s12 267; GCN-NEXT: s_mov_b32 s6, s12 268; GCN-NEXT: s_mov_b32 s7, s12 269; GCN-NEXT: s_mov_b32 s8, s12 270; GCN-NEXT: s_mov_b32 s9, s12 271; GCN-NEXT: s_mov_b32 s10, s12 272; GCN-NEXT: s_mov_b32 s11, s12 273; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[12:15] dmask:0x1 274; GCN-NEXT: s_waitcnt vmcnt(0) 275; GCN-NEXT: s_setpc_b64 s[30:31] 276 %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef 277 %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0 278 %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0) 279 ret <4 x float> %tmp2 280} 281 282define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind { 283; SI-LABEL: dynamic_insertelement_v2f32: 284; SI: ; %bb.0: 285; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 286; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 287; SI-NEXT: s_load_dword s4, s[4:5], 0x4 288; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 289; SI-NEXT: s_mov_b32 s3, 0x100f000 290; SI-NEXT: s_mov_b32 s2, -1 291; SI-NEXT: s_waitcnt lgkmcnt(0) 292; SI-NEXT: v_mov_b32_e32 v1, s7 293; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 294; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 295; SI-NEXT: v_mov_b32_e32 v2, s6 296; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 297; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 298; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 299; SI-NEXT: s_endpgm 300; 301; VI-LABEL: dynamic_insertelement_v2f32: 302; VI: ; %bb.0: 303; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 304; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 305; VI-NEXT: 
s_load_dword s4, s[4:5], 0x10 306; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 307; VI-NEXT: s_mov_b32 s3, 0x1100f000 308; VI-NEXT: s_mov_b32 s2, -1 309; VI-NEXT: s_waitcnt lgkmcnt(0) 310; VI-NEXT: v_mov_b32_e32 v1, s7 311; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 312; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 313; VI-NEXT: v_mov_b32_e32 v2, s6 314; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 315; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 316; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 317; VI-NEXT: s_endpgm 318 %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b 319 store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8 320 ret void 321} 322 323define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind { 324; SI-LABEL: dynamic_insertelement_v3f32: 325; SI: ; %bb.0: 326; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 327; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 328; SI-NEXT: s_load_dword s4, s[4:5], 0x8 329; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 330; SI-NEXT: s_mov_b32 s3, 0x100f000 331; SI-NEXT: s_mov_b32 s2, -1 332; SI-NEXT: s_waitcnt lgkmcnt(0) 333; SI-NEXT: v_mov_b32_e32 v1, s10 334; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 335; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc 336; SI-NEXT: v_mov_b32_e32 v1, s9 337; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 338; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 339; SI-NEXT: v_mov_b32_e32 v3, s8 340; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 341; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 342; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 343; SI-NEXT: s_endpgm 344; 345; VI-LABEL: dynamic_insertelement_v3f32: 346; VI: ; %bb.0: 347; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 348; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 349; VI-NEXT: s_load_dword s4, s[4:5], 0x20 350; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 351; VI-NEXT: s_mov_b32 s3, 0x1100f000 352; VI-NEXT: s_mov_b32 s2, -1 353; VI-NEXT: s_waitcnt lgkmcnt(0) 354; VI-NEXT: v_mov_b32_e32 v1, 
s10 355; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 356; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc 357; VI-NEXT: v_mov_b32_e32 v1, s9 358; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 359; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 360; VI-NEXT: v_mov_b32_e32 v3, s8 361; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 362; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 363; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 364; VI-NEXT: s_endpgm 365 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b 366 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 367 ret void 368} 369 370define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind { 371; SI-LABEL: dynamic_insertelement_v4f32: 372; SI: ; %bb.0: 373; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 374; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 375; SI-NEXT: s_load_dword s4, s[4:5], 0x8 376; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 377; SI-NEXT: s_mov_b32 s3, 0x100f000 378; SI-NEXT: s_mov_b32 s2, -1 379; SI-NEXT: s_waitcnt lgkmcnt(0) 380; SI-NEXT: v_mov_b32_e32 v1, s11 381; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 382; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc 383; SI-NEXT: v_mov_b32_e32 v1, s10 384; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 385; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc 386; SI-NEXT: v_mov_b32_e32 v1, s9 387; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 388; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 389; SI-NEXT: v_mov_b32_e32 v4, s8 390; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 391; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 392; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 393; SI-NEXT: s_endpgm 394; 395; VI-LABEL: dynamic_insertelement_v4f32: 396; VI: ; %bb.0: 397; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 398; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 399; VI-NEXT: s_load_dword s4, s[4:5], 0x20 400; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 401; VI-NEXT: s_mov_b32 s3, 0x1100f000 402; VI-NEXT: s_mov_b32 s2, -1 403; VI-NEXT: s_waitcnt 
lgkmcnt(0) 404; VI-NEXT: v_mov_b32_e32 v1, s11 405; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 406; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc 407; VI-NEXT: v_mov_b32_e32 v1, s10 408; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 409; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc 410; VI-NEXT: v_mov_b32_e32 v1, s9 411; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 412; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 413; VI-NEXT: v_mov_b32_e32 v4, s8 414; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 415; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 416; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 417; VI-NEXT: s_endpgm 418 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b 419 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 420 ret void 421} 422 423define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind { 424; SI-LABEL: dynamic_insertelement_v8f32: 425; SI: ; %bb.0: 426; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 427; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 428; SI-NEXT: s_load_dword s4, s[4:5], 0x10 429; SI-NEXT: v_mov_b32_e32 v4, 0x40a00000 430; SI-NEXT: s_mov_b32 s3, 0x100f000 431; SI-NEXT: s_mov_b32 s2, -1 432; SI-NEXT: s_waitcnt lgkmcnt(0) 433; SI-NEXT: v_mov_b32_e32 v0, s11 434; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 435; SI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc 436; SI-NEXT: v_mov_b32_e32 v0, s10 437; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 438; SI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc 439; SI-NEXT: v_mov_b32_e32 v0, s9 440; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 441; SI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc 442; SI-NEXT: v_mov_b32_e32 v0, s8 443; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 444; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 445; SI-NEXT: v_mov_b32_e32 v5, s15 446; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7 447; SI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc 448; SI-NEXT: v_mov_b32_e32 v5, s14 449; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6 450; SI-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc 451; 
SI-NEXT: v_mov_b32_e32 v5, s13 452; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5 453; SI-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc 454; SI-NEXT: v_mov_b32_e32 v8, s12 455; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4 456; SI-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc 457; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 458; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 459; SI-NEXT: s_endpgm 460; 461; VI-LABEL: dynamic_insertelement_v8f32: 462; VI: ; %bb.0: 463; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 464; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 465; VI-NEXT: s_load_dword s4, s[4:5], 0x40 466; VI-NEXT: v_mov_b32_e32 v4, 0x40a00000 467; VI-NEXT: s_mov_b32 s3, 0x1100f000 468; VI-NEXT: s_mov_b32 s2, -1 469; VI-NEXT: s_waitcnt lgkmcnt(0) 470; VI-NEXT: v_mov_b32_e32 v0, s11 471; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 472; VI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc 473; VI-NEXT: v_mov_b32_e32 v0, s10 474; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 475; VI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc 476; VI-NEXT: v_mov_b32_e32 v0, s9 477; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 478; VI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc 479; VI-NEXT: v_mov_b32_e32 v0, s8 480; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 481; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 482; VI-NEXT: v_mov_b32_e32 v5, s15 483; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7 484; VI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc 485; VI-NEXT: v_mov_b32_e32 v5, s14 486; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6 487; VI-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc 488; VI-NEXT: v_mov_b32_e32 v5, s13 489; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5 490; VI-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc 491; VI-NEXT: v_mov_b32_e32 v8, s12 492; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4 493; VI-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc 494; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 495; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 496; VI-NEXT: s_endpgm 497 %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b 498 store <8 x 
float> %vecins, <8 x float> addrspace(1)* %out, align 32 499 ret void 500} 501 502define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind { 503; SI-LABEL: dynamic_insertelement_v16f32: 504; SI: ; %bb.0: 505; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 506; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 507; SI-NEXT: s_load_dword s4, s[4:5], 0x20 508; SI-NEXT: v_mov_b32_e32 v16, 0x40a00000 509; SI-NEXT: s_mov_b32 s3, 0x100f000 510; SI-NEXT: s_mov_b32 s2, -1 511; SI-NEXT: s_waitcnt lgkmcnt(0) 512; SI-NEXT: v_mov_b32_e32 v0, s8 513; SI-NEXT: v_mov_b32_e32 v1, s9 514; SI-NEXT: v_mov_b32_e32 v2, s10 515; SI-NEXT: v_mov_b32_e32 v3, s11 516; SI-NEXT: v_mov_b32_e32 v4, s12 517; SI-NEXT: v_mov_b32_e32 v5, s13 518; SI-NEXT: v_mov_b32_e32 v6, s14 519; SI-NEXT: v_mov_b32_e32 v7, s15 520; SI-NEXT: v_mov_b32_e32 v8, s16 521; SI-NEXT: v_mov_b32_e32 v9, s17 522; SI-NEXT: v_mov_b32_e32 v10, s18 523; SI-NEXT: v_mov_b32_e32 v11, s19 524; SI-NEXT: v_mov_b32_e32 v12, s20 525; SI-NEXT: v_mov_b32_e32 v13, s21 526; SI-NEXT: v_mov_b32_e32 v14, s22 527; SI-NEXT: v_mov_b32_e32 v15, s23 528; SI-NEXT: s_mov_b32 m0, s4 529; SI-NEXT: v_movreld_b32_e32 v0, v16 530; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 531; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 532; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 533; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 534; SI-NEXT: s_endpgm 535; 536; VI-LABEL: dynamic_insertelement_v16f32: 537; VI: ; %bb.0: 538; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 539; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 540; VI-NEXT: s_load_dword s4, s[4:5], 0x80 541; VI-NEXT: v_mov_b32_e32 v16, 0x40a00000 542; VI-NEXT: s_mov_b32 s3, 0x1100f000 543; VI-NEXT: s_mov_b32 s2, -1 544; VI-NEXT: s_waitcnt lgkmcnt(0) 545; VI-NEXT: v_mov_b32_e32 v0, s8 546; VI-NEXT: v_mov_b32_e32 v1, s9 547; VI-NEXT: v_mov_b32_e32 v2, s10 548; VI-NEXT: v_mov_b32_e32 
v3, s11 549; VI-NEXT: v_mov_b32_e32 v4, s12 550; VI-NEXT: v_mov_b32_e32 v5, s13 551; VI-NEXT: v_mov_b32_e32 v6, s14 552; VI-NEXT: v_mov_b32_e32 v7, s15 553; VI-NEXT: v_mov_b32_e32 v8, s16 554; VI-NEXT: v_mov_b32_e32 v9, s17 555; VI-NEXT: v_mov_b32_e32 v10, s18 556; VI-NEXT: v_mov_b32_e32 v11, s19 557; VI-NEXT: v_mov_b32_e32 v12, s20 558; VI-NEXT: v_mov_b32_e32 v13, s21 559; VI-NEXT: v_mov_b32_e32 v14, s22 560; VI-NEXT: v_mov_b32_e32 v15, s23 561; VI-NEXT: s_mov_b32 m0, s4 562; VI-NEXT: v_movreld_b32_e32 v0, v16 563; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 564; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 565; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 566; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 567; VI-NEXT: s_endpgm 568 %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b 569 store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64 570 ret void 571} 572 573define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind { 574; SI-LABEL: dynamic_insertelement_v2i32: 575; SI: ; %bb.0: 576; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 577; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 578; SI-NEXT: s_load_dword s4, s[4:5], 0x4 579; SI-NEXT: s_mov_b32 s3, 0x100f000 580; SI-NEXT: s_mov_b32 s2, -1 581; SI-NEXT: s_waitcnt lgkmcnt(0) 582; SI-NEXT: v_mov_b32_e32 v0, s7 583; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 584; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc 585; SI-NEXT: v_mov_b32_e32 v0, s6 586; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 587; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 588; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 589; SI-NEXT: s_endpgm 590; 591; VI-LABEL: dynamic_insertelement_v2i32: 592; VI: ; %bb.0: 593; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 594; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 595; VI-NEXT: s_load_dword s4, s[4:5], 0x10 596; VI-NEXT: s_mov_b32 s3, 0x1100f000 597; VI-NEXT: 
s_mov_b32 s2, -1 598; VI-NEXT: s_waitcnt lgkmcnt(0) 599; VI-NEXT: v_mov_b32_e32 v0, s7 600; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 601; VI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc 602; VI-NEXT: v_mov_b32_e32 v0, s6 603; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 604; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 605; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 606; VI-NEXT: s_endpgm 607 %vecins = insertelement <2 x i32> %a, i32 5, i32 %b 608 store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8 609 ret void 610} 611 612define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind { 613; SI-LABEL: dynamic_insertelement_v3i32: 614; SI: ; %bb.0: 615; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 616; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 617; SI-NEXT: s_load_dword s4, s[4:5], 0x8 618; SI-NEXT: s_mov_b32 s3, 0x100f000 619; SI-NEXT: s_mov_b32 s2, -1 620; SI-NEXT: s_waitcnt lgkmcnt(0) 621; SI-NEXT: v_mov_b32_e32 v0, s10 622; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 623; SI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc 624; SI-NEXT: v_mov_b32_e32 v0, s9 625; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 626; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc 627; SI-NEXT: v_mov_b32_e32 v0, s8 628; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 629; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 630; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 631; SI-NEXT: s_endpgm 632; 633; VI-LABEL: dynamic_insertelement_v3i32: 634; VI: ; %bb.0: 635; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 636; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 637; VI-NEXT: s_load_dword s4, s[4:5], 0x20 638; VI-NEXT: s_mov_b32 s3, 0x1100f000 639; VI-NEXT: s_mov_b32 s2, -1 640; VI-NEXT: s_waitcnt lgkmcnt(0) 641; VI-NEXT: v_mov_b32_e32 v0, s10 642; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 643; VI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc 644; VI-NEXT: v_mov_b32_e32 v0, s9 645; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 646; VI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc 647; VI-NEXT: 
v_mov_b32_e32 v0, s8 648; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 649; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 650; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 651; VI-NEXT: s_endpgm 652 %vecins = insertelement <3 x i32> %a, i32 5, i32 %b 653 store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16 654 ret void 655} 656 657define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind { 658; SI-LABEL: dynamic_insertelement_v4i32: 659; SI: ; %bb.0: 660; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 661; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 662; SI-NEXT: s_load_dword s6, s[4:5], 0x8 663; SI-NEXT: s_load_dword s4, s[4:5], 0x11 664; SI-NEXT: s_mov_b32 s3, 0x100f000 665; SI-NEXT: s_mov_b32 s2, -1 666; SI-NEXT: s_waitcnt lgkmcnt(0) 667; SI-NEXT: v_mov_b32_e32 v0, s11 668; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 3 669; SI-NEXT: v_mov_b32_e32 v4, s4 670; SI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc 671; SI-NEXT: v_mov_b32_e32 v0, s10 672; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 2 673; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc 674; SI-NEXT: v_mov_b32_e32 v0, s9 675; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 1 676; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc 677; SI-NEXT: v_mov_b32_e32 v0, s8 678; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 0 679; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 680; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 681; SI-NEXT: s_endpgm 682; 683; VI-LABEL: dynamic_insertelement_v4i32: 684; VI: ; %bb.0: 685; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 686; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 687; VI-NEXT: s_load_dword s6, s[4:5], 0x20 688; VI-NEXT: s_load_dword s4, s[4:5], 0x44 689; VI-NEXT: s_mov_b32 s3, 0x1100f000 690; VI-NEXT: s_mov_b32 s2, -1 691; VI-NEXT: s_waitcnt lgkmcnt(0) 692; VI-NEXT: v_mov_b32_e32 v0, s11 693; VI-NEXT: v_mov_b32_e32 v4, s4 694; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 3 695; VI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc 696; VI-NEXT: v_mov_b32_e32 
v0, s10 697; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 2 698; VI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc 699; VI-NEXT: v_mov_b32_e32 v0, s9 700; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 1 701; VI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc 702; VI-NEXT: v_mov_b32_e32 v0, s8 703; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 0 704; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 705; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 706; VI-NEXT: s_endpgm 707 %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b 708 store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 709 ret void 710} 711 712define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind { 713; SI-LABEL: dynamic_insertelement_v8i32: 714; SI: ; %bb.0: 715; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 716; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 717; SI-NEXT: s_load_dword s4, s[4:5], 0x10 718; SI-NEXT: s_mov_b32 s3, 0x100f000 719; SI-NEXT: s_mov_b32 s2, -1 720; SI-NEXT: s_waitcnt lgkmcnt(0) 721; SI-NEXT: v_mov_b32_e32 v0, s11 722; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 723; SI-NEXT: v_cndmask_b32_e32 v3, 5, v0, vcc 724; SI-NEXT: v_mov_b32_e32 v0, s10 725; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 726; SI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc 727; SI-NEXT: v_mov_b32_e32 v0, s9 728; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 729; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc 730; SI-NEXT: v_mov_b32_e32 v0, s8 731; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 732; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 733; SI-NEXT: v_mov_b32_e32 v4, s15 734; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7 735; SI-NEXT: v_cndmask_b32_e32 v7, 5, v4, vcc 736; SI-NEXT: v_mov_b32_e32 v4, s14 737; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6 738; SI-NEXT: v_cndmask_b32_e32 v6, 5, v4, vcc 739; SI-NEXT: v_mov_b32_e32 v4, s13 740; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5 741; SI-NEXT: v_cndmask_b32_e32 v5, 5, v4, vcc 742; SI-NEXT: v_mov_b32_e32 v4, s12 743; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4 744; SI-NEXT: v_cndmask_b32_e32 v4, 5, 
v4, vcc 745; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 746; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 747; SI-NEXT: s_endpgm 748; 749; VI-LABEL: dynamic_insertelement_v8i32: 750; VI: ; %bb.0: 751; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 752; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 753; VI-NEXT: s_load_dword s4, s[4:5], 0x40 754; VI-NEXT: s_mov_b32 s3, 0x1100f000 755; VI-NEXT: s_mov_b32 s2, -1 756; VI-NEXT: s_waitcnt lgkmcnt(0) 757; VI-NEXT: v_mov_b32_e32 v0, s11 758; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 759; VI-NEXT: v_cndmask_b32_e32 v3, 5, v0, vcc 760; VI-NEXT: v_mov_b32_e32 v0, s10 761; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 762; VI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc 763; VI-NEXT: v_mov_b32_e32 v0, s9 764; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 765; VI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc 766; VI-NEXT: v_mov_b32_e32 v0, s8 767; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 768; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 769; VI-NEXT: v_mov_b32_e32 v4, s15 770; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7 771; VI-NEXT: v_cndmask_b32_e32 v7, 5, v4, vcc 772; VI-NEXT: v_mov_b32_e32 v4, s14 773; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6 774; VI-NEXT: v_cndmask_b32_e32 v6, 5, v4, vcc 775; VI-NEXT: v_mov_b32_e32 v4, s13 776; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5 777; VI-NEXT: v_cndmask_b32_e32 v5, 5, v4, vcc 778; VI-NEXT: v_mov_b32_e32 v4, s12 779; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4 780; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 781; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 782; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 783; VI-NEXT: s_endpgm 784 %vecins = insertelement <8 x i32> %a, i32 5, i32 %b 785 store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32 786 ret void 787} 788 789define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind { 790; SI-LABEL: dynamic_insertelement_v16i32: 791; SI: ; %bb.0: 792; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x0 793; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 794; SI-NEXT: s_load_dword s4, s[4:5], 0x20 795; SI-NEXT: s_mov_b32 s3, 0x100f000 796; SI-NEXT: s_mov_b32 s2, -1 797; SI-NEXT: s_waitcnt lgkmcnt(0) 798; SI-NEXT: v_mov_b32_e32 v0, s8 799; SI-NEXT: v_mov_b32_e32 v1, s9 800; SI-NEXT: v_mov_b32_e32 v2, s10 801; SI-NEXT: v_mov_b32_e32 v3, s11 802; SI-NEXT: v_mov_b32_e32 v4, s12 803; SI-NEXT: v_mov_b32_e32 v5, s13 804; SI-NEXT: v_mov_b32_e32 v6, s14 805; SI-NEXT: v_mov_b32_e32 v7, s15 806; SI-NEXT: v_mov_b32_e32 v8, s16 807; SI-NEXT: v_mov_b32_e32 v9, s17 808; SI-NEXT: v_mov_b32_e32 v10, s18 809; SI-NEXT: v_mov_b32_e32 v11, s19 810; SI-NEXT: v_mov_b32_e32 v12, s20 811; SI-NEXT: v_mov_b32_e32 v13, s21 812; SI-NEXT: v_mov_b32_e32 v14, s22 813; SI-NEXT: v_mov_b32_e32 v15, s23 814; SI-NEXT: s_mov_b32 m0, s4 815; SI-NEXT: v_movreld_b32_e32 v0, 5 816; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 817; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 818; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 819; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 820; SI-NEXT: s_endpgm 821; 822; VI-LABEL: dynamic_insertelement_v16i32: 823; VI: ; %bb.0: 824; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 825; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 826; VI-NEXT: s_load_dword s4, s[4:5], 0x80 827; VI-NEXT: s_mov_b32 s3, 0x1100f000 828; VI-NEXT: s_mov_b32 s2, -1 829; VI-NEXT: s_waitcnt lgkmcnt(0) 830; VI-NEXT: v_mov_b32_e32 v0, s8 831; VI-NEXT: v_mov_b32_e32 v1, s9 832; VI-NEXT: v_mov_b32_e32 v2, s10 833; VI-NEXT: v_mov_b32_e32 v3, s11 834; VI-NEXT: v_mov_b32_e32 v4, s12 835; VI-NEXT: v_mov_b32_e32 v5, s13 836; VI-NEXT: v_mov_b32_e32 v6, s14 837; VI-NEXT: v_mov_b32_e32 v7, s15 838; VI-NEXT: v_mov_b32_e32 v8, s16 839; VI-NEXT: v_mov_b32_e32 v9, s17 840; VI-NEXT: v_mov_b32_e32 v10, s18 841; VI-NEXT: v_mov_b32_e32 v11, s19 842; VI-NEXT: v_mov_b32_e32 v12, s20 843; VI-NEXT: v_mov_b32_e32 v13, s21 844; VI-NEXT: v_mov_b32_e32 v14, 
s22 845; VI-NEXT: v_mov_b32_e32 v15, s23 846; VI-NEXT: s_mov_b32 m0, s4 847; VI-NEXT: v_movreld_b32_e32 v0, 5 848; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 849; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 850; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 851; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 852; VI-NEXT: s_endpgm 853 %vecins = insertelement <16 x i32> %a, i32 5, i32 %b 854 store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64 855 ret void 856} 857 858define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind { 859; SI-LABEL: dynamic_insertelement_v2i16: 860; SI: ; %bb.0: 861; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 862; SI-NEXT: s_load_dword s6, s[4:5], 0x2 863; SI-NEXT: s_load_dword s4, s[4:5], 0x3 864; SI-NEXT: v_mov_b32_e32 v0, 0x50005 865; SI-NEXT: s_mov_b32 s3, 0x100f000 866; SI-NEXT: s_mov_b32 s2, -1 867; SI-NEXT: s_waitcnt lgkmcnt(0) 868; SI-NEXT: v_mov_b32_e32 v1, s6 869; SI-NEXT: s_lshl_b32 s4, s4, 4 870; SI-NEXT: s_lshl_b32 s4, 0xffff, s4 871; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 872; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 873; SI-NEXT: s_endpgm 874; 875; VI-LABEL: dynamic_insertelement_v2i16: 876; VI: ; %bb.0: 877; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 878; VI-NEXT: s_load_dword s6, s[4:5], 0x8 879; VI-NEXT: s_load_dword s4, s[4:5], 0xc 880; VI-NEXT: v_mov_b32_e32 v0, 0x50005 881; VI-NEXT: s_mov_b32 s3, 0x1100f000 882; VI-NEXT: s_mov_b32 s2, -1 883; VI-NEXT: s_waitcnt lgkmcnt(0) 884; VI-NEXT: v_mov_b32_e32 v1, s6 885; VI-NEXT: s_lshl_b32 s4, s4, 4 886; VI-NEXT: s_lshl_b32 s4, 0xffff, s4 887; VI-NEXT: v_bfi_b32 v0, s4, v0, v1 888; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 889; VI-NEXT: s_endpgm 890 %vecins = insertelement <2 x i16> %a, i16 5, i32 %b 891 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8 892 ret void 893} 894 895define amdgpu_kernel void 
@dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind {
; The SI path performs the whole merge in scalar registers with a 64-bit
; shifted mask (s_lshl_b64 / s_andn2_b64 / s_or_b64); the VI path uses a
; v_bfi per 32-bit piece. The result is stored as a dword plus a short.
; SI-LABEL: dynamic_insertelement_v3i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2
; SI-NEXT: s_load_dword s4, s[4:5], 0x4
; SI-NEXT: s_mov_b32 s5, 0
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b32 s8, s4, 4
; SI-NEXT: s_mov_b32 s4, 0xffff
; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
; SI-NEXT: s_mov_b32 s8, 0x50005
; SI-NEXT: s_and_b32 s9, s5, s8
; SI-NEXT: s_and_b32 s8, s4, s8
; SI-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v3i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: s_mov_b32 s5, 0
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: s_lshl_b32 s8, s4, 4
; VI-NEXT: s_mov_b32 s4, 0xffff
; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
; VI-NEXT: s_mov_b32 s8, 0x50005
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_bfi_b32 v0, s5, v0, v1
; VI-NEXT: v_mov_b32_e32 v1, s8
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_bfi_b32 v1, s4, v1, v2
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <3 x i16> %a, i16 5, i32 %b
  store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8
  ret void
}

; Dynamic-index insert of i8 5 into a <2 x i8> (header continues on the
; following lines).
define amdgpu_kernel void 
@dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
; The two bytes live in one 16-bit value; the byte to replace is selected by
; shifting an all-ones mask by (index * 8) and merging (v_bfi on SI, an
; explicit and/xor/and/or sequence on VI).
; SI-LABEL: dynamic_insertelement_v2i8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dword s6, s[4:5], 0xa
; SI-NEXT: s_load_dword s4, s[4:5], 0x13
; SI-NEXT: v_mov_b32_e32 v0, 0x505
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s6
; SI-NEXT: s_lshl_b32 s4, s4, 3
; SI-NEXT: s_lshl_b32 s4, -1, s4
; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dword s6, s[4:5], 0x28
; VI-NEXT: s_load_dword s4, s[4:5], 0x4c
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s4, s4, 3
; VI-NEXT: v_lshlrev_b16_e64 v0, s4, -1
; VI-NEXT: v_and_b32_e32 v1, 0x505, v0
; VI-NEXT: v_xor_b32_e32 v0, -1, v0
; VI-NEXT: v_and_b32_e32 v0, s6, v0
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
  store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
  ret void
}

; FIXME: post legalize i16 and i32 shifts aren't merged because of
; isTypeDesirableForOp in SimplifyDemandedBits
; Dynamic-index insert of i8 5 into a <3 x i8>; result is written back as a
; 16-bit store plus a single byte store.
define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3i8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dword s6, s[4:5], 0xa
; SI-NEXT: s_load_dword s4, s[4:5], 0x13
; SI-NEXT: v_mov_b32_e32 v0, 0x5050505
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s6
; SI-NEXT: s_lshl_b32 s4, s4, 3
; SI-NEXT: s_lshl_b32 s4, 0xffff, s4
; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v3i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dword s6, s[4:5], 0x28
; VI-NEXT: s_load_dword s4, s[4:5], 0x4c
; VI-NEXT: v_mov_b32_e32 v0, 0x5050505
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: s_lshl_b32 s4, s4, 3
; VI-NEXT: s_lshl_b32 s4, 0xffff, s4
; VI-NEXT: v_bfi_b32 v0, s4, v0, v1
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
; VI-NEXT: s_endpgm
  %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
  store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
  ret void
}

; Dynamic-index insert of i8 5 into a <4 x i8>; same shifted-0xffff bfi merge
; as the smaller byte-vector cases, stored as a single dword.
; NOTE(review): the shifted mask here is 0xffff, not 0xff — presumably a
; known artifact of the i16 legalization mentioned in the FIXME above.
define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v4i8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dword s6, s[4:5], 0xa
; SI-NEXT: s_load_dword s4, s[4:5], 0x13
; SI-NEXT: v_mov_b32_e32 v0, 0x5050505
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s6
; SI-NEXT: s_lshl_b32 s4, s4, 3
; SI-NEXT: s_lshl_b32 s4, 0xffff, s4
; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v4i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dword s6, s[4:5], 0x28
; VI-NEXT: s_load_dword s4, s[4:5], 0x4c
; VI-NEXT: v_mov_b32_e32 v0, 0x5050505
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: s_lshl_b32 s4, s4, 3
; VI-NEXT: s_lshl_b32 s4, 0xffff, s4
; VI-NEXT: v_bfi_b32 v0, s4, v0, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
  store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
  ret void
}

; Here the <8 x i8> vector is loaded from constant address space 4 rather
; than passed by value, so the merge (64-bit shifted mask, andn2/or) happens
; entirely in scalar registers on both targets before a dwordx2 store.
define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind {
; SI-LABEL: s_dynamic_insertelement_v8i8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; SI-NEXT: s_load_dword s6, s[4:5], 0x4
; SI-NEXT: s_mov_b32 s7, 0
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0
; SI-NEXT: s_mov_b32 s0, s8
; SI-NEXT: s_lshl_b32 s8, s6, 3
; SI-NEXT: s_mov_b32 s6, 0xffff
; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
; SI-NEXT: s_mov_b32 s8, 0x5050505
; SI-NEXT: s_mov_b32 s1, s9
; SI-NEXT: s_and_b32 s9, s7, s8
; SI-NEXT: s_and_b32 s8, s6, s8
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_dynamic_insertelement_v8i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; VI-NEXT: s_load_dword s6, s[4:5], 0x10
; VI-NEXT: s_mov_b32 s7, 0
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0
; VI-NEXT: s_mov_b32 s0, s8
; VI-NEXT: s_lshl_b32 s8, s6, 3
; VI-NEXT: s_mov_b32 s6, 0xffff
; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
; VI-NEXT: s_mov_b32 s8, 0x5050505
; VI-NEXT: s_mov_b32 s1, s9
; VI-NEXT: s_and_b32 s9, s7, s8
; VI-NEXT: s_and_b32 s8, s6, s8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
; VI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %a = load <8 x i8>, <8 x i8> addrspace(4)* %a.ptr, align 4
  %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
  store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8
  ret void
}

; Dynamic-index insert of i8 5 into a <16 x i8>; expanded to a per-byte
; compare-against-index / select chain followed by byte repacking (shift,
; mask, or) into four dwords.
define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v16i8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
; SI-NEXT: s_load_dword s4, s[4:5], 0x8
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s5, s11, 24
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 15
; SI-NEXT: s_lshr_b32 s5, s11, 16
; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 14
; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT: s_movk_i32 s5, 0xff
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT: v_and_b32_e32 v1, s5, v1
; SI-NEXT: s_lshr_b32 s6, s11, 8
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_mov_b32_e32 v1, s6
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 13
; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT: v_mov_b32_e32 v2, s11
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 12
; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT: v_and_b32_e32 v2, s5, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: s_mov_b32 s6, 0xffff
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v1, s6, v1
; SI-NEXT: s_lshr_b32 s7, s10, 24
; SI-NEXT: v_or_b32_e32 v3, v1, v0
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 11
; SI-NEXT: s_lshr_b32 s7, s10, 16
; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 10
; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT: v_and_b32_e32 v1, s5, v1
; SI-NEXT: s_lshr_b32 s7, s10, 8
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 9
; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 8
; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT: v_and_b32_e32 v2, s5, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v1, s6, v1
; SI-NEXT: s_lshr_b32 s7, s9, 24
; SI-NEXT: v_or_b32_e32 v2, v1, v0
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7
; SI-NEXT: s_lshr_b32 s7, s9, 16
; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6
; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT: v_and_b32_e32 v1, s5, v1
; SI-NEXT: s_lshr_b32 s7, s9, 8
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5
; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; SI-NEXT: v_mov_b32_e32 v4, s9
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4
; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT: v_and_b32_e32 v4, s5, v4
; SI-NEXT: v_or_b32_e32 v1, v4, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v1, s6, v1
; SI-NEXT: s_lshr_b32 s7, s8, 24
; SI-NEXT: v_or_b32_e32 v1, v1, v0
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
; SI-NEXT: s_lshr_b32 s7, s8, 16
; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
; SI-NEXT: v_mov_b32_e32 v4, s7
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT: v_and_b32_e32 v4, s5, v4
; SI-NEXT: s_lshr_b32 s7, s8, 8
; SI-NEXT: v_or_b32_e32 v0, v4, v0
; SI-NEXT: v_mov_b32_e32 v4, s7
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
; SI-NEXT: v_mov_b32_e32 v5, s8
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
; SI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; SI-NEXT: v_and_b32_e32 v5, s5, v5
; SI-NEXT: v_or_b32_e32 v4, v5, v4
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v4, s6, v4
; SI-NEXT: v_or_b32_e32 v0, v4, v0
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v16i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT: s_load_dword s4, s[4:5], 0x20
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s5, s11, 24
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 15
; VI-NEXT: s_lshr_b32 s5, s11, 16
; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 14
; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_lshr_b32 s5, s11, 8
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 13
; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s11
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 12
; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_lshr_b32 s5, s10, 24
; VI-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 11
; VI-NEXT: s_lshr_b32 s5, s10, 16
; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 10
; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_lshr_b32 s5, s10, 8
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 9
; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 8
; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_lshr_b32 s5, s9, 24
; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7
; VI-NEXT: s_lshr_b32 s5, s9, 16
; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6
; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_lshr_b32 s5, s9, 8
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5
; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
; VI-NEXT: v_mov_b32_e32 v4, s9
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4
; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_lshr_b32 s5, s8, 24
; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
; VI-NEXT: s_lshr_b32 s5, s8, 16
; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
; VI-NEXT: v_mov_b32_e32 v4, s5
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: s_lshr_b32 s5, s8, 8
; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v4, s5
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
; VI-NEXT: v_mov_b32_e32 v5, s8
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
; VI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc
; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
; End of the autogenerated check block; the IR body follows.
  %vecins = 
insertelement <16 x i8> %a, i8 5, i32 %b
  store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
  ret void
}

; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that
; the compiler doesn't crash.
; Element 0 is inserted up front from %a; element 1 is filled from one of
; two different loads chosen by the branch, and the phi merges the results.
define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
; SI-LABEL: insert_split_bb:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s0, s[4:5], 0x4
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_cbranch_scc0 BB26_2
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_load_dword s1, s[6:7], 0x1
; SI-NEXT: s_mov_b64 s[2:3], 0
; SI-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
; SI-NEXT: s_cbranch_vccz BB26_3
; SI-NEXT: s_branch BB26_4
; SI-NEXT: BB26_2:
; SI-NEXT: s_mov_b64 s[2:3], -1
; SI-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; SI-NEXT: s_cbranch_vccnz BB26_4
; SI-NEXT: BB26_3: ; %if
; SI-NEXT: s_load_dword s1, s[6:7], 0x0
; SI-NEXT: BB26_4: ; %endif
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: insert_split_bb:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s0, s[4:5], 0x10
; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cbranch_scc0 BB26_2
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_load_dword s1, s[6:7], 0x4
; VI-NEXT: s_mov_b64 s[2:3], 0
; VI-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; VI-NEXT: s_cbranch_vccz BB26_3
; VI-NEXT: s_branch BB26_4
; VI-NEXT: BB26_2:
; VI-NEXT: s_mov_b64 s[2:3], -1
; VI-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; VI-NEXT: s_cbranch_vccnz BB26_4
; VI-NEXT: BB26_3: ; %if
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s1, s[6:7], 0x0
; VI-NEXT: BB26_4: ; %endif
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
entry:
  %0 = insertelement <2 x i32> undef, i32 %a, i32 0
  %1 = icmp eq i32 %a, 0
  br i1 %1, label %if, label %else

if:
  %2 = load i32, i32 addrspace(1)* %in
  %3 = insertelement <2 x i32> %0, i32 %2, i32 1
  br label %endif

else:
  %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %5 = load i32, i32 addrspace(1)* %4
  %6 = insertelement <2 x i32> %0, i32 %5, i32 1
  br label %endif

endif:
  %7 = phi <2 x i32> [%3, %if], [%6, %else]
  store <2 x i32> %7, <2 x i32> addrspace(1)* %out
  ret void
}

; Dynamic-index insert of double 8.0 into a <2 x double>, expanded to
; per-element compare-and-select (0x40200000 is the high word of 8.0; the
; low word of 8.0 is 0, selected by the cndmask against 0).
define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xc
; SI-NEXT: s_load_dword s4, s[4:5], 0x18
; SI-NEXT: v_mov_b32_e32 v1, 0x40200000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s11
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v0, s10
; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x30
; VI-NEXT: s_load_dword s4, s[4:5], 0x60
; VI-NEXT: v_mov_b32_e32 v1, 0x40200000
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
  store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
  ret void
}

; Dynamic-index insert of i64 5 into a <2 x i64>; per-element
; compare-and-select is applied to each 32-bit half (5 into the low half,
; 0 into the high half).
define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
; SI-NEXT: s_load_dword s6, s[4:5], 0x8
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s11
; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1
; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s10
; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0
; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT: s_load_dword s6, s[4:5], 0x20
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1
; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0
; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
  store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
  ret void
}

; Same compare-and-select expansion for <3 x i64>; the third element is
; written with a separate dwordx2 store at offset 16.
define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
; SI-NEXT: s_load_dword s6, s[4:5], 0x10
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s13
; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 2
; SI-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_cndmask_b32_e64 v4, v0, 5, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s11
; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1
; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s10
; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0
; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v3i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
; VI-NEXT: s_load_dword s6, s[4:5], 0x40
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s13
; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 2
; VI-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_cndmask_b32_e64 v4, v0, 5, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1
; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0
; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; VI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
  store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32
  ret void
}

; <4 x double> variant of the per-element compare-and-select expansion.
define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v4f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
; SI-NEXT: s_load_dword s4, s[4:5], 0x10
; SI-NEXT: v_mov_b32_e32 v4, 0x40200000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s11
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
; SI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc
; SI-NEXT: v_mov_b32_e32 v0, s10
; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT: v_mov_b32_e32 v5, s15
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 3
; SI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc
; SI-NEXT: v_mov_b32_e32 v5, s14
; SI-NEXT: v_cndmask_b32_e64 v6, v5, 0, vcc
; SI-NEXT: v_mov_b32_e32 v5, s13
; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 2
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
; SI-NEXT: v_mov_b32_e32 v4, s12
; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v4f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
; VI-NEXT: s_load_dword s4, s[4:5], 0x40
; VI-NEXT: v_mov_b32_e32 v4, 0x40200000
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
; VI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc
; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; VI-NEXT: v_mov_b32_e32 v5, s15
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 3
; VI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc
; VI-NEXT: v_mov_b32_e32 v5, s14
; VI-NEXT: v_cndmask_b32_e64 v6, v5, 0, vcc
; VI-NEXT: v_mov_b32_e32 v5, s13
; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 2
; VI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
; VI-NEXT: v_mov_b32_e32 v4, s12
; VI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
  store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
  ret void
}

; At <8 x double> the lowering switches strategy: the whole vector is moved
; into VGPRs and the two 32-bit halves of 8.0 are written with m0-indexed
; v_movreld (index scaled by 1 bit, i.e. times two, to address dword pairs).
define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 {
; SI-LABEL: dynamic_insertelement_v8f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10
; SI-NEXT: s_load_dword s4, s[4:5], 0x20
; SI-NEXT: v_mov_b32_e32 v16, 0x40200000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: s_lshl_b32 s4, s4, 1
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: v_mov_b32_e32 v3, s11
; SI-NEXT: v_mov_b32_e32 v4, s12
; SI-NEXT: v_mov_b32_e32 v5, s13
; SI-NEXT: v_mov_b32_e32 v6, s14
; SI-NEXT: v_mov_b32_e32 v7, s15
; SI-NEXT: v_mov_b32_e32 v8, s16
; SI-NEXT: v_mov_b32_e32 v9, s17
; SI-NEXT: v_mov_b32_e32 v10, s18
; SI-NEXT: v_mov_b32_e32 v11, s19
; SI-NEXT: v_mov_b32_e32 v12, s20
; SI-NEXT: v_mov_b32_e32 v13, s21
; SI-NEXT: v_mov_b32_e32 v14, s22
; SI-NEXT: v_mov_b32_e32 v15, s23
; SI-NEXT: s_mov_b32 m0, s4
; SI-NEXT: v_movreld_b32_e32 v0, 0
; SI-NEXT: v_movreld_b32_e32 v1, v16
; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v8f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; VI-NEXT: s_load_dword s4, s[4:5], 0x80
; VI-NEXT: v_mov_b32_e32 v16, 0x40200000
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: s_lshl_b32 s4, s4, 1
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: v_mov_b32_e32 v4, s12
; VI-NEXT: v_mov_b32_e32 v5, s13
; VI-NEXT: v_mov_b32_e32 v6, s14
; VI-NEXT: v_mov_b32_e32 v7, s15
; VI-NEXT: v_mov_b32_e32 v8, s16
; VI-NEXT: v_mov_b32_e32 v9, s17
; VI-NEXT: v_mov_b32_e32 v10, s18
; VI-NEXT: v_mov_b32_e32 v11, s19
; VI-NEXT: v_mov_b32_e32 v12, s20
; VI-NEXT: v_mov_b32_e32 v13, s21
; VI-NEXT: v_mov_b32_e32 v14, s22
; VI-NEXT: v_mov_b32_e32 v15, s23
; VI-NEXT: s_mov_b32 m0, s4
; VI-NEXT: v_movreld_b32_e32 v0, 0
; VI-NEXT: v_movreld_b32_e32 v1, v16
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
  store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
  ret void
}

; NOTE(review): this declaration appears unreferenced in this part of the
; file — confirm a test elsewhere in the file still calls it.
declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }