1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GCN-NO-TONGA %s 3; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-TONGA %s 4 5; FIXME: Broken on evergreen 6; FIXME: For some reason the 8 and 16 vectors are being stored as 7; individual elements instead of 128-bit stores. 8 9 10; FIXME: Why is the constant moved into the intermediate register and 11; not just directly into the vector component? 12define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { 13; SI-LABEL: insertelement_v4f32_0: 14; SI: ; %bb.0: 15; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 16; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 17; SI-NEXT: s_mov_b32 s8, 0x40a00000 18; SI-NEXT: s_mov_b32 s3, 0x100f000 19; SI-NEXT: s_mov_b32 s2, -1 20; SI-NEXT: s_waitcnt lgkmcnt(0) 21; SI-NEXT: v_mov_b32_e32 v0, s4 22; SI-NEXT: v_mov_b32_e32 v1, s5 23; SI-NEXT: v_mov_b32_e32 v2, s6 24; SI-NEXT: v_mov_b32_e32 v3, s7 25; SI-NEXT: v_mov_b32_e32 v0, s8 26; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 27; SI-NEXT: s_endpgm 28; 29; VI-LABEL: insertelement_v4f32_0: 30; VI: ; %bb.0: 31; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 32; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 33; VI-NEXT: s_mov_b32 s8, 0x40a00000 34; VI-NEXT: s_mov_b32 s3, 0x1100f000 35; VI-NEXT: s_mov_b32 s2, -1 36; VI-NEXT: s_waitcnt lgkmcnt(0) 37; VI-NEXT: v_mov_b32_e32 v0, s4 38; VI-NEXT: v_mov_b32_e32 v1, s5 39; VI-NEXT: v_mov_b32_e32 v2, s6 40; VI-NEXT: v_mov_b32_e32 v3, s7 41; VI-NEXT: v_mov_b32_e32 v0, s8 42; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 43; VI-NEXT: s_endpgm 44 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0 45 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 46 ret void 47} 48 49define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { 50; SI-LABEL: insertelement_v4f32_1: 51; SI: ; %bb.0: 52; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 53; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 54; SI-NEXT: s_mov_b32 s8, 0x40a00000 55; SI-NEXT: s_mov_b32 s3, 0x100f000 56; SI-NEXT: s_mov_b32 s2, -1 57; SI-NEXT: s_waitcnt lgkmcnt(0) 58; SI-NEXT: v_mov_b32_e32 v0, s4 59; SI-NEXT: v_mov_b32_e32 v1, s5 60; SI-NEXT: v_mov_b32_e32 v2, s6 61; SI-NEXT: v_mov_b32_e32 v3, s7 62; SI-NEXT: v_mov_b32_e32 v1, s8 63; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 64; SI-NEXT: s_endpgm 65; 66; VI-LABEL: insertelement_v4f32_1: 67; VI: ; %bb.0: 68; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 69; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 70; VI-NEXT: s_mov_b32 s8, 0x40a00000 71; VI-NEXT: s_mov_b32 s3, 0x1100f000 72; VI-NEXT: s_mov_b32 s2, -1 73; VI-NEXT: s_waitcnt lgkmcnt(0) 74; VI-NEXT: v_mov_b32_e32 v0, s4 75; VI-NEXT: v_mov_b32_e32 v1, s5 76; VI-NEXT: v_mov_b32_e32 v2, s6 77; VI-NEXT: v_mov_b32_e32 v3, s7 78; VI-NEXT: v_mov_b32_e32 v1, s8 79; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 80; VI-NEXT: s_endpgm 81 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1 82 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 83 ret void 84} 85 86define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { 87; SI-LABEL: insertelement_v4f32_2: 88; SI: ; %bb.0: 89; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 90; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 91; SI-NEXT: s_mov_b32 s8, 0x40a00000 92; SI-NEXT: s_mov_b32 s3, 0x100f000 93; SI-NEXT: s_mov_b32 s2, -1 94; SI-NEXT: s_waitcnt lgkmcnt(0) 95; SI-NEXT: v_mov_b32_e32 v0, s4 96; SI-NEXT: v_mov_b32_e32 v2, s6 97; SI-NEXT: v_mov_b32_e32 v1, s5 98; SI-NEXT: v_mov_b32_e32 v3, s7 99; SI-NEXT: v_mov_b32_e32 v2, s8 100; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 101; SI-NEXT: s_endpgm 102; 103; VI-LABEL: insertelement_v4f32_2: 104; VI: ; %bb.0: 105; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 106; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 107; VI-NEXT: s_mov_b32 s8, 0x40a00000 108; VI-NEXT: s_mov_b32 s3, 0x1100f000 109; VI-NEXT: s_mov_b32 s2, -1 110; VI-NEXT: s_waitcnt lgkmcnt(0) 111; VI-NEXT: v_mov_b32_e32 v0, s4 112; VI-NEXT: v_mov_b32_e32 v2, s6 113; VI-NEXT: v_mov_b32_e32 v1, s5 114; VI-NEXT: v_mov_b32_e32 v3, s7 115; VI-NEXT: v_mov_b32_e32 v2, s8 116; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 117; VI-NEXT: s_endpgm 118 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2 119 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 120 ret void 121} 122 123define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { 124; SI-LABEL: insertelement_v4f32_3: 125; SI: ; %bb.0: 126; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 127; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 128; SI-NEXT: s_mov_b32 s8, 0x40a00000 129; SI-NEXT: s_mov_b32 s3, 0x100f000 130; SI-NEXT: s_mov_b32 s2, -1 131; SI-NEXT: s_waitcnt lgkmcnt(0) 132; SI-NEXT: v_mov_b32_e32 v0, s4 133; SI-NEXT: v_mov_b32_e32 v3, s7 134; SI-NEXT: v_mov_b32_e32 v1, s5 135; SI-NEXT: v_mov_b32_e32 v2, s6 136; SI-NEXT: v_mov_b32_e32 v3, s8 137; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 138; SI-NEXT: s_endpgm 139; 140; VI-LABEL: insertelement_v4f32_3: 141; VI: ; %bb.0: 142; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 143; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 144; VI-NEXT: s_mov_b32 s8, 0x40a00000 145; VI-NEXT: s_mov_b32 s3, 0x1100f000 146; VI-NEXT: s_mov_b32 s2, -1 147; VI-NEXT: s_waitcnt lgkmcnt(0) 148; VI-NEXT: v_mov_b32_e32 v0, s4 149; VI-NEXT: v_mov_b32_e32 v3, s7 150; VI-NEXT: v_mov_b32_e32 v1, s5 151; VI-NEXT: v_mov_b32_e32 v2, s6 152; VI-NEXT: v_mov_b32_e32 v3, s8 153; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 154; VI-NEXT: s_endpgm 155 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3 156 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 157 ret void 158} 159 160define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind { 161; SI-LABEL: insertelement_v4i32_0: 162; SI: ; %bb.0: 163; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 164; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 165; SI-NEXT: s_waitcnt lgkmcnt(0) 166; SI-NEXT: s_movk_i32 s4, 0x3e7 167; SI-NEXT: v_mov_b32_e32 v0, s4 168; SI-NEXT: s_mov_b32 s3, 0x100f000 169; SI-NEXT: s_mov_b32 s2, -1 170; SI-NEXT: v_mov_b32_e32 v1, s5 171; SI-NEXT: v_mov_b32_e32 v2, s6 172; SI-NEXT: v_mov_b32_e32 v3, s7 173; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 174; SI-NEXT: s_endpgm 175; 176; VI-LABEL: insertelement_v4i32_0: 177; VI: ; %bb.0: 178; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 179; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 180; VI-NEXT: s_waitcnt lgkmcnt(0) 181; VI-NEXT: s_movk_i32 s4, 0x3e7 182; VI-NEXT: v_mov_b32_e32 v0, s4 183; VI-NEXT: s_mov_b32 s3, 0x1100f000 184; VI-NEXT: s_mov_b32 s2, -1 185; VI-NEXT: v_mov_b32_e32 v1, s5 186; VI-NEXT: v_mov_b32_e32 v2, s6 187; VI-NEXT: v_mov_b32_e32 v3, s7 188; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 189; VI-NEXT: s_endpgm 190 %vecins = insertelement <4 x i32> %a, i32 999, i32 0 191 store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 192 ret void 193} 194 195define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { 196; SI-LABEL: insertelement_v3f32_1: 197; SI: ; %bb.0: 198; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 199; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 200; SI-NEXT: s_mov_b32 s3, 0x100f000 201; SI-NEXT: s_mov_b32 s2, -1 202; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000 203; SI-NEXT: s_waitcnt lgkmcnt(0) 204; SI-NEXT: v_mov_b32_e32 v2, s6 205; SI-NEXT: v_mov_b32_e32 v0, s4 206; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 207; SI-NEXT: s_endpgm 208; 209; VI-LABEL: insertelement_v3f32_1: 210; VI: ; %bb.0: 211; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 212; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 213; VI-NEXT: s_mov_b32 s3, 0x1100f000 214; VI-NEXT: s_mov_b32 s2, -1 215; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000 216; VI-NEXT: s_waitcnt lgkmcnt(0) 217; VI-NEXT: v_mov_b32_e32 v2, s6 218; VI-NEXT: v_mov_b32_e32 v0, s4 219; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 220; VI-NEXT: s_endpgm 221 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1 222 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 223 ret void 224} 225 226define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { 227; SI-LABEL: insertelement_v3f32_2: 228; SI: ; %bb.0: 229; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 230; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 231; SI-NEXT: s_mov_b32 s3, 0x100f000 232; SI-NEXT: s_mov_b32 s2, -1 233; SI-NEXT: v_mov_b32_e32 v2, 0x40a00000 234; SI-NEXT: s_waitcnt lgkmcnt(0) 235; SI-NEXT: v_mov_b32_e32 v1, s5 236; SI-NEXT: v_mov_b32_e32 v0, s4 237; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 238; SI-NEXT: s_endpgm 239; 240; VI-LABEL: insertelement_v3f32_2: 241; VI: ; %bb.0: 242; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 243; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 244; VI-NEXT: s_mov_b32 s3, 0x1100f000 245; VI-NEXT: s_mov_b32 s2, -1 246; VI-NEXT: v_mov_b32_e32 v2, 0x40a00000 247; VI-NEXT: s_waitcnt lgkmcnt(0) 248; VI-NEXT: v_mov_b32_e32 v1, s5 249; VI-NEXT: v_mov_b32_e32 v0, s4 250; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 251; VI-NEXT: s_endpgm 252 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2 253 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 254 ret void 255} 256 257define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { 258; GCN-LABEL: insertelement_v3f32_3: 259; GCN: ; %bb.0: 260; GCN-NEXT: s_endpgm 261 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3 262 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 263 ret void 264} 265 266define <4 x float> @insertelement_to_sgpr() nounwind { 267; GCN-LABEL: insertelement_to_sgpr: 268; GCN: ; %bb.0: 269; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 270; GCN-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0 271; GCN-NEXT: s_waitcnt lgkmcnt(0) 272; GCN-NEXT: s_mov_b32 s12, 0 273; GCN-NEXT: s_mov_b32 s4, s12 274; GCN-NEXT: s_mov_b32 s5, s12 275; GCN-NEXT: s_mov_b32 s6, s12 276; GCN-NEXT: s_mov_b32 s7, s12 277; GCN-NEXT: s_mov_b32 s8, s12 278; GCN-NEXT: s_mov_b32 s9, s12 279; GCN-NEXT: s_mov_b32 s10, s12 280; GCN-NEXT: s_mov_b32 s11, s12 281; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[12:15] dmask:0x1 282; GCN-NEXT: s_waitcnt vmcnt(0) 283; GCN-NEXT: s_setpc_b64 s[30:31] 284 %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef 285 %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0 286 %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0) 287 ret <4 x float> %tmp2 288} 289 290define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind { 291; SI-LABEL: dynamic_insertelement_v2f32: 292; SI: ; %bb.0: 293; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 294; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 295; SI-NEXT: s_load_dword s4, s[4:5], 0x4 296; SI-NEXT: v_mov_b32_e32 v2, 0x40a00000 297; SI-NEXT: s_mov_b32 s3, 0x100f000 298; SI-NEXT: s_mov_b32 s2, -1 299; SI-NEXT: s_waitcnt lgkmcnt(0) 300; SI-NEXT: v_mov_b32_e32 v0, s7 301; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 302; SI-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc 303; SI-NEXT: v_mov_b32_e32 v0, s6 304; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 305; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 306; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 307; SI-NEXT: s_endpgm 308; 309; VI-LABEL: dynamic_insertelement_v2f32: 310; VI: ; %bb.0: 311; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 312; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 313; VI-NEXT: s_load_dword s4, s[4:5], 0x10 314; VI-NEXT: v_mov_b32_e32 v2, 0x40a00000 315; VI-NEXT: s_mov_b32 s3, 0x1100f000 316; VI-NEXT: s_mov_b32 s2, -1 317; VI-NEXT: s_waitcnt lgkmcnt(0) 318; VI-NEXT: v_mov_b32_e32 v0, s7 319; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 320; VI-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc 321; VI-NEXT: v_mov_b32_e32 v0, s6 322; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 323; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 324; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 325; VI-NEXT: s_endpgm 326 %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b 327 store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8 328 ret void 329} 330 331define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind { 332; SI-LABEL: dynamic_insertelement_v3f32: 333; SI: ; %bb.0: 334; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 335; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 336; SI-NEXT: s_load_dword s4, s[4:5], 0x8 337; SI-NEXT: v_mov_b32_e32 v3, 0x40a00000 338; SI-NEXT: s_mov_b32 s3, 0x100f000 339; SI-NEXT: s_mov_b32 s2, -1 340; SI-NEXT: s_waitcnt lgkmcnt(0) 341; SI-NEXT: v_mov_b32_e32 v2, s10 342; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 343; SI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc 344; SI-NEXT: v_mov_b32_e32 v1, s9 345; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 346; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 347; SI-NEXT: v_mov_b32_e32 v0, s8 348; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 349; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc 350; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 351; SI-NEXT: s_endpgm 352; 353; VI-LABEL: dynamic_insertelement_v3f32: 354; VI: ; %bb.0: 355; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 356; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 357; VI-NEXT: s_load_dword s4, s[4:5], 0x20 358; VI-NEXT: v_mov_b32_e32 v3, 0x40a00000 359; VI-NEXT: s_mov_b32 s3, 0x1100f000 360; VI-NEXT: s_mov_b32 s2, -1 361; VI-NEXT: s_waitcnt lgkmcnt(0) 362; VI-NEXT: v_mov_b32_e32 v2, s10 363; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 364; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc 365; VI-NEXT: v_mov_b32_e32 v1, s9 366; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 367; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 368; VI-NEXT: v_mov_b32_e32 v0, s8 369; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 370; VI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc 371; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 372; VI-NEXT: s_endpgm 373 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b 374 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 375 ret void 376} 377 378define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind { 379; SI-LABEL: dynamic_insertelement_v4f32: 380; SI: ; %bb.0: 381; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 382; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 383; SI-NEXT: s_load_dword s4, s[4:5], 0x8 384; SI-NEXT: v_mov_b32_e32 v4, 0x40a00000 385; SI-NEXT: s_mov_b32 s3, 0x100f000 386; SI-NEXT: s_mov_b32 s2, -1 387; SI-NEXT: s_waitcnt lgkmcnt(0) 388; SI-NEXT: v_mov_b32_e32 v0, s11 389; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 390; SI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc 391; SI-NEXT: v_mov_b32_e32 v0, s10 392; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 393; SI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc 394; SI-NEXT: v_mov_b32_e32 v0, s9 395; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 396; SI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc 397; SI-NEXT: v_mov_b32_e32 v0, s8 398; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 399; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 400; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 401; SI-NEXT: s_endpgm 402; 403; VI-LABEL: dynamic_insertelement_v4f32: 404; VI: ; %bb.0: 405; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 406; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 407; VI-NEXT: s_load_dword s4, s[4:5], 0x20 408; VI-NEXT: v_mov_b32_e32 v4, 0x40a00000 409; VI-NEXT: s_mov_b32 s3, 0x1100f000 410; VI-NEXT: s_mov_b32 s2, -1 411; VI-NEXT: s_waitcnt lgkmcnt(0) 412; VI-NEXT: v_mov_b32_e32 v0, s11 413; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 414; VI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc 415; VI-NEXT: v_mov_b32_e32 v0, s10 416; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 417; VI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc 418; VI-NEXT: v_mov_b32_e32 v0, s9 419; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 420; VI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc 421; VI-NEXT: v_mov_b32_e32 v0, s8 422; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 423; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 424; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 425; VI-NEXT: s_endpgm 426 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b 427 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 428 ret void 429} 430 431define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind { 432; SI-LABEL: dynamic_insertelement_v8f32: 433; SI: ; %bb.0: 434; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 435; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 436; SI-NEXT: s_load_dword s4, s[4:5], 0x10 437; SI-NEXT: v_mov_b32_e32 v4, 0x40a00000 438; SI-NEXT: s_mov_b32 s3, 0x100f000 439; SI-NEXT: s_mov_b32 s2, -1 440; SI-NEXT: s_waitcnt lgkmcnt(0) 441; SI-NEXT: v_mov_b32_e32 v0, s11 442; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 443; SI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc 444; SI-NEXT: v_mov_b32_e32 v0, s10 445; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 446; SI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc 447; SI-NEXT: v_mov_b32_e32 v0, s9 448; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 449; SI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc 450; SI-NEXT: v_mov_b32_e32 v0, s8 451; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 452; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 453; SI-NEXT: v_mov_b32_e32 v5, s15 454; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7 455; SI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc 456; SI-NEXT: v_mov_b32_e32 v5, s14 457; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6 458; SI-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc 459; SI-NEXT: v_mov_b32_e32 v5, s13 460; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5 461; SI-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc 462; SI-NEXT: v_mov_b32_e32 v8, s12 463; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4 464; SI-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc 465; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 466; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 467; SI-NEXT: s_endpgm 468; 469; VI-LABEL: dynamic_insertelement_v8f32: 470; VI: ; %bb.0: 471; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 472; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 473; VI-NEXT: s_load_dword s4, s[4:5], 0x40 474; VI-NEXT: v_mov_b32_e32 v4, 0x40a00000 475; VI-NEXT: s_mov_b32 s3, 0x1100f000 476; VI-NEXT: s_mov_b32 s2, -1 477; VI-NEXT: s_waitcnt lgkmcnt(0) 478; VI-NEXT: v_mov_b32_e32 v0, s11 479; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 480; VI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc 481; VI-NEXT: v_mov_b32_e32 v0, s10 482; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 483; VI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc 484; VI-NEXT: v_mov_b32_e32 v0, s9 485; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 486; VI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc 487; VI-NEXT: v_mov_b32_e32 v0, s8 488; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 489; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 490; VI-NEXT: v_mov_b32_e32 v5, s15 491; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7 492; VI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc 493; VI-NEXT: v_mov_b32_e32 v5, s14 494; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6 495; VI-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc 496; VI-NEXT: v_mov_b32_e32 v5, s13 497; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5 498; VI-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc 499; VI-NEXT: v_mov_b32_e32 v8, s12 500; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4 501; VI-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc 502; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 503; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 504; VI-NEXT: s_endpgm 505 %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b 506 store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32 507 ret void 508} 509 510define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind { 511; SI-LABEL: dynamic_insertelement_v16f32: 512; SI: ; %bb.0: 513; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 514; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 515; SI-NEXT: s_load_dword s4, s[4:5], 0x20 516; SI-NEXT: s_mov_b32 s3, 0x100f000 517; SI-NEXT: s_mov_b32 s2, -1 518; SI-NEXT: v_mov_b32_e32 v16, 0x40a00000 519; SI-NEXT: s_waitcnt lgkmcnt(0) 520; SI-NEXT: v_mov_b32_e32 v0, s8 521; SI-NEXT: v_mov_b32_e32 v1, s9 522; SI-NEXT: v_mov_b32_e32 v2, s10 523; SI-NEXT: v_mov_b32_e32 v3, s11 524; SI-NEXT: v_mov_b32_e32 v4, s12 525; SI-NEXT: v_mov_b32_e32 v5, s13 526; SI-NEXT: v_mov_b32_e32 v6, s14 527; SI-NEXT: v_mov_b32_e32 v7, s15 528; SI-NEXT: v_mov_b32_e32 v8, s16 529; SI-NEXT: v_mov_b32_e32 v9, s17 530; SI-NEXT: v_mov_b32_e32 v10, s18 531; SI-NEXT: v_mov_b32_e32 v11, s19 532; SI-NEXT: v_mov_b32_e32 v12, s20 533; SI-NEXT: v_mov_b32_e32 v13, s21 534; SI-NEXT: v_mov_b32_e32 v14, s22 535; SI-NEXT: v_mov_b32_e32 v15, s23 536; SI-NEXT: s_mov_b32 m0, s4 537; SI-NEXT: v_movreld_b32_e32 v0, v16 538; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 539; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 540; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 541; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 542; SI-NEXT: s_endpgm 543; 544; VI-LABEL: dynamic_insertelement_v16f32: 545; VI: ; %bb.0: 546; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 547; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 548; VI-NEXT: s_load_dword s4, s[4:5], 0x80 549; VI-NEXT: s_mov_b32 s3, 0x1100f000 550; VI-NEXT: s_mov_b32 s2, -1 551; VI-NEXT: v_mov_b32_e32 v16, 0x40a00000 552; VI-NEXT: s_waitcnt lgkmcnt(0) 553; VI-NEXT: v_mov_b32_e32 v0, s8 554; VI-NEXT: v_mov_b32_e32 v1, s9 555; VI-NEXT: v_mov_b32_e32 v2, s10 556; VI-NEXT: v_mov_b32_e32 v3, s11 557; VI-NEXT: v_mov_b32_e32 v4, s12 558; VI-NEXT: v_mov_b32_e32 v5, s13 559; VI-NEXT: v_mov_b32_e32 v6, s14 560; VI-NEXT: v_mov_b32_e32 v7, s15 561; VI-NEXT: v_mov_b32_e32 v8, s16 562; VI-NEXT: v_mov_b32_e32 v9, s17 563; VI-NEXT: v_mov_b32_e32 v10, s18 564; VI-NEXT: v_mov_b32_e32 v11, s19 565; VI-NEXT: v_mov_b32_e32 v12, s20 566; VI-NEXT: v_mov_b32_e32 v13, s21 567; VI-NEXT: v_mov_b32_e32 v14, s22 568; VI-NEXT: v_mov_b32_e32 v15, s23 569; VI-NEXT: s_mov_b32 m0, s4 570; VI-NEXT: v_movreld_b32_e32 v0, v16 571; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 572; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 573; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 574; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 575; VI-NEXT: s_endpgm 576 %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b 577 store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64 578 ret void 579} 580 581define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind { 582; SI-LABEL: dynamic_insertelement_v2i32: 583; SI: ; %bb.0: 584; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 585; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 586; SI-NEXT: s_load_dword s4, s[4:5], 0x4 587; SI-NEXT: s_mov_b32 s3, 0x100f000 588; SI-NEXT: s_mov_b32 s2, -1 589; SI-NEXT: s_waitcnt lgkmcnt(0) 590; SI-NEXT: v_mov_b32_e32 v0, s7 591; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 592; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc 593; SI-NEXT: v_mov_b32_e32 v0, s6 594; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 595; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 596; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 597; SI-NEXT: s_endpgm 598; 599; VI-LABEL: dynamic_insertelement_v2i32: 600; VI: ; %bb.0: 601; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 602; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 603; VI-NEXT: s_load_dword s4, s[4:5], 0x10 604; VI-NEXT: s_mov_b32 s3, 0x1100f000 605; VI-NEXT: s_mov_b32 s2, -1 606; VI-NEXT: s_waitcnt lgkmcnt(0) 607; VI-NEXT: v_mov_b32_e32 v0, s7 608; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 609; VI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc 610; VI-NEXT: v_mov_b32_e32 v0, s6 611; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 612; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 613; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 614; VI-NEXT: s_endpgm 615 %vecins = insertelement <2 x i32> %a, i32 5, i32 %b 616 store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8 617 ret void 618} 619 620define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind { 621; SI-LABEL: dynamic_insertelement_v3i32: 622; SI: ; %bb.0: 623; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 624; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 625; SI-NEXT: s_load_dword s4, s[4:5], 0x8 626; SI-NEXT: s_mov_b32 s3, 0x100f000 627; SI-NEXT: s_mov_b32 s2, -1 628; SI-NEXT: s_waitcnt lgkmcnt(0) 629; SI-NEXT: v_mov_b32_e32 v0, s10 630; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 631; SI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc 632; SI-NEXT: v_mov_b32_e32 v0, s9 633; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 634; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc 635; SI-NEXT: v_mov_b32_e32 v0, s8 636; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 637; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 638; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 639; SI-NEXT: s_endpgm 640; 641; VI-LABEL: dynamic_insertelement_v3i32: 642; VI: ; %bb.0: 643; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 644; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 645; VI-NEXT: s_load_dword s4, s[4:5], 0x20 646; VI-NEXT: s_mov_b32 s3, 0x1100f000 647; VI-NEXT: s_mov_b32 s2, -1 648; VI-NEXT: s_waitcnt lgkmcnt(0) 649; VI-NEXT: v_mov_b32_e32 v0, s10 650; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 651; VI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc 652; VI-NEXT: v_mov_b32_e32 v0, s9 653; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 654; VI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc 655; VI-NEXT: v_mov_b32_e32 v0, s8 656; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 657; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 658; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 659; VI-NEXT: s_endpgm 660 %vecins = insertelement <3 x i32> %a, i32 5, i32 %b 661 store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16 662 ret void 663} 664 665define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind { 666; SI-LABEL: dynamic_insertelement_v4i32: 667; SI: ; %bb.0: 668; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 669; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 670; SI-NEXT: s_load_dword s6, s[4:5], 0x8 671; SI-NEXT: s_load_dword s4, s[4:5], 0x11 672; SI-NEXT: s_mov_b32 s3, 0x100f000 673; SI-NEXT: s_mov_b32 s2, -1 674; SI-NEXT: s_waitcnt lgkmcnt(0) 675; SI-NEXT: v_mov_b32_e32 v0, s11 676; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 3 677; SI-NEXT: v_mov_b32_e32 v4, s4 678; SI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc 679; SI-NEXT: v_mov_b32_e32 v0, s10 680; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 2 681; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc 682; SI-NEXT: v_mov_b32_e32 v0, s9 683; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 1 684; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc 685; SI-NEXT: v_mov_b32_e32 v0, s8 686; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 0 687; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 688; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 689; SI-NEXT: s_endpgm 690; 691; VI-LABEL: dynamic_insertelement_v4i32: 692; VI: ; %bb.0: 693; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 694; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 695; VI-NEXT: s_load_dword s6, s[4:5], 0x20 696; VI-NEXT: s_load_dword s4, s[4:5], 0x44 697; VI-NEXT: s_mov_b32 s3, 0x1100f000 698; VI-NEXT: s_mov_b32 s2, -1 699; VI-NEXT: s_waitcnt lgkmcnt(0) 700; VI-NEXT: v_mov_b32_e32 v0, s11 701; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 3 702; VI-NEXT: v_mov_b32_e32 v4, s4 703; VI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc 704; VI-NEXT: v_mov_b32_e32 v0, s10 705; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 2 706; VI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc 707; VI-NEXT: v_mov_b32_e32 v0, s9 708; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 1 709; VI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc 710; VI-NEXT: v_mov_b32_e32 v0, s8 711; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 0 712; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 713; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 714; VI-NEXT: s_endpgm 715 %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b 716 store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 717 ret void 718} 719 720define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind { 721; SI-LABEL: dynamic_insertelement_v8i32: 722; SI: ; %bb.0: 723; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 724; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 725; SI-NEXT: s_load_dword s4, s[4:5], 0x10 726; SI-NEXT: s_mov_b32 s3, 0x100f000 727; SI-NEXT: s_mov_b32 s2, -1 728; SI-NEXT: s_waitcnt lgkmcnt(0) 729; SI-NEXT: v_mov_b32_e32 v0, s11 730; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 731; SI-NEXT: v_cndmask_b32_e32 v3, 5, v0, vcc 732; SI-NEXT: v_mov_b32_e32 v0, s10 733; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 734; SI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc 735; SI-NEXT: v_mov_b32_e32 v0, s9 736; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 737; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc 738; SI-NEXT: v_mov_b32_e32 v0, s8 739; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 740; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 741; SI-NEXT: v_mov_b32_e32 v4, s15 742; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7 743; SI-NEXT: v_cndmask_b32_e32 v7, 5, v4, vcc 744; SI-NEXT: v_mov_b32_e32 v4, s14 745; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6 746; SI-NEXT: v_cndmask_b32_e32 v6, 5, v4, vcc 747; SI-NEXT: v_mov_b32_e32 v4, s13 748; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5 749; SI-NEXT: v_cndmask_b32_e32 v5, 5, v4, vcc 750; SI-NEXT: v_mov_b32_e32 v4, s12 751; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4 752; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 753; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 754; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 755; SI-NEXT: s_endpgm 756; 757; VI-LABEL: dynamic_insertelement_v8i32: 758; VI: ; %bb.0: 759; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 760; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 761; VI-NEXT: s_load_dword s4, s[4:5], 0x40 762; VI-NEXT: s_mov_b32 s3, 0x1100f000 763; VI-NEXT: s_mov_b32 s2, -1 764; VI-NEXT: s_waitcnt lgkmcnt(0) 765; VI-NEXT: v_mov_b32_e32 v0, s11 766; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 767; VI-NEXT: v_cndmask_b32_e32 v3, 5, v0, vcc 768; VI-NEXT: v_mov_b32_e32 v0, s10 769; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 770; VI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc 771; VI-NEXT: v_mov_b32_e32 v0, s9 772; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 773; VI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc 774; VI-NEXT: v_mov_b32_e32 v0, s8 775; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 776; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 777; VI-NEXT: v_mov_b32_e32 v4, s15 778; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7 779; VI-NEXT: v_cndmask_b32_e32 v7, 5, v4, vcc 780; VI-NEXT: v_mov_b32_e32 v4, s14 781; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6 782; VI-NEXT: v_cndmask_b32_e32 v6, 5, v4, vcc 783; VI-NEXT: v_mov_b32_e32 v4, s13 784; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5 785; VI-NEXT: v_cndmask_b32_e32 v5, 5, v4, vcc 786; VI-NEXT: v_mov_b32_e32 v4, s12 787; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4 788; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 789; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 790; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 791; VI-NEXT: s_endpgm 792 %vecins = insertelement <8 x i32> %a, i32 5, i32 %b 793 store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32 794 ret void 795} 796 797define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind { 798; SI-LABEL: dynamic_insertelement_v16i32: 799; SI: ; %bb.0: 800; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 801; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 802; SI-NEXT: s_load_dword s4, s[4:5], 0x20 803; SI-NEXT: s_mov_b32 s3, 0x100f000 804; SI-NEXT: s_mov_b32 s2, -1 805; SI-NEXT: s_waitcnt lgkmcnt(0) 806; SI-NEXT: v_mov_b32_e32 v0, s8 807; SI-NEXT: v_mov_b32_e32 v1, s9 808; SI-NEXT: v_mov_b32_e32 v2, s10 809; SI-NEXT: v_mov_b32_e32 v3, s11 810; SI-NEXT: v_mov_b32_e32 v4, s12 811; SI-NEXT: v_mov_b32_e32 v5, s13 812; SI-NEXT: v_mov_b32_e32 v6, s14 813; SI-NEXT: v_mov_b32_e32 v7, s15 814; SI-NEXT: v_mov_b32_e32 v8, s16 815; SI-NEXT: v_mov_b32_e32 v9, s17 816; SI-NEXT: v_mov_b32_e32 v10, s18 817; SI-NEXT: v_mov_b32_e32 v11, s19 818; SI-NEXT: v_mov_b32_e32 v12, s20 819; SI-NEXT: v_mov_b32_e32 v13, s21 820; SI-NEXT: v_mov_b32_e32 v14, s22 821; SI-NEXT: v_mov_b32_e32 v15, s23 822; SI-NEXT: s_mov_b32 m0, s4 823; SI-NEXT: v_movreld_b32_e32 v0, 5 824; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 825; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 826; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 827; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 828; SI-NEXT: s_endpgm 829; 830; VI-LABEL: dynamic_insertelement_v16i32: 831; VI: ; %bb.0: 832; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 833; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 834; VI-NEXT: s_load_dword s4, s[4:5], 0x80 835; VI-NEXT: s_mov_b32 s3, 0x1100f000 836; VI-NEXT: s_mov_b32 s2, -1 837; VI-NEXT: s_waitcnt lgkmcnt(0) 838; VI-NEXT: v_mov_b32_e32 v0, s8 839; VI-NEXT: v_mov_b32_e32 v1, s9 840; VI-NEXT: v_mov_b32_e32 v2, s10 841; VI-NEXT: v_mov_b32_e32 v3, s11 842; VI-NEXT: v_mov_b32_e32 v4, s12 843; VI-NEXT: v_mov_b32_e32 v5, s13 844; VI-NEXT: v_mov_b32_e32 v6, s14 845; VI-NEXT: v_mov_b32_e32 v7, s15 846; VI-NEXT: v_mov_b32_e32 v8, s16 847; VI-NEXT: v_mov_b32_e32 v9, s17 848; VI-NEXT: v_mov_b32_e32 v10, s18 849; VI-NEXT: v_mov_b32_e32 v11, s19 850; VI-NEXT: v_mov_b32_e32 v12, s20 851; VI-NEXT: v_mov_b32_e32 v13, s21 852; VI-NEXT: v_mov_b32_e32 v14, s22 853; VI-NEXT: v_mov_b32_e32 v15, s23 854; VI-NEXT: s_mov_b32 m0, s4 855; VI-NEXT: v_movreld_b32_e32 v0, 5 856; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 857; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 858; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 859; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 860; VI-NEXT: s_endpgm 861 %vecins = insertelement <16 x i32> %a, i32 5, i32 %b 862 store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64 863 ret void 864} 865 866define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind { 867; SI-LABEL: dynamic_insertelement_v2i16: 868; SI: ; %bb.0: 869; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 870; SI-NEXT: s_load_dword s6, s[4:5], 0x2 871; SI-NEXT: s_load_dword s4, s[4:5], 0x3 872; SI-NEXT: v_mov_b32_e32 v0, 0x50005 873; SI-NEXT: s_mov_b32 s3, 0x100f000 874; SI-NEXT: s_mov_b32 s2, -1 875; SI-NEXT: s_waitcnt lgkmcnt(0) 876; SI-NEXT: v_mov_b32_e32 v1, s6 877; SI-NEXT: s_lshl_b32 s4, s4, 4 878; SI-NEXT: s_lshl_b32 s4, 0xffff, s4 879; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 880; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 881; SI-NEXT: s_endpgm 882; 883; VI-LABEL: dynamic_insertelement_v2i16: 884; VI: ; %bb.0: 885; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 886; VI-NEXT: s_load_dword s6, s[4:5], 0x8 887; VI-NEXT: s_load_dword s4, s[4:5], 0xc 888; VI-NEXT: v_mov_b32_e32 v0, 0x50005 889; VI-NEXT: s_mov_b32 s3, 0x1100f000 890; VI-NEXT: s_mov_b32 s2, -1 891; VI-NEXT: s_waitcnt lgkmcnt(0) 892; VI-NEXT: v_mov_b32_e32 v1, s6 893; VI-NEXT: s_lshl_b32 s4, s4, 4 894; VI-NEXT: s_lshl_b32 s4, 0xffff, s4 895; VI-NEXT: v_bfi_b32 v0, s4, v0, v1 896; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 897; VI-NEXT: s_endpgm 898 %vecins = insertelement <2 x i16> %a, i16 5, i32 %b 899 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8 900 ret void 901} 902 903define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind { 904; SI-LABEL: dynamic_insertelement_v3i16: 905; SI: ; %bb.0: 906; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 907; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 908; SI-NEXT: s_load_dword s4, s[4:5], 0x4 909; SI-NEXT: s_mov_b32 s5, 0 910; SI-NEXT: s_mov_b32 s3, 0x100f000 911; SI-NEXT: s_mov_b32 s2, -1 912; SI-NEXT: s_waitcnt lgkmcnt(0) 913; SI-NEXT: s_lshl_b32 s8, s4, 4 914; SI-NEXT: s_mov_b32 s4, 0xffff 915; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 916; SI-NEXT: s_mov_b32 s8, 0x50005 917; SI-NEXT: s_and_b32 s9, s5, s8 918; SI-NEXT: s_and_b32 s8, s4, s8 919; SI-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] 920; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] 921; SI-NEXT: v_mov_b32_e32 v0, s5 922; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 923; SI-NEXT: v_mov_b32_e32 v0, s4 924; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 925; SI-NEXT: s_endpgm 926; 927; VI-LABEL: dynamic_insertelement_v3i16: 928; VI: ; %bb.0: 929; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 930; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 931; VI-NEXT: s_load_dword s4, s[4:5], 0x10 932; VI-NEXT: s_mov_b32 s5, 0 933; VI-NEXT: s_mov_b32 s3, 0x1100f000 934; VI-NEXT: s_mov_b32 s2, -1 935; VI-NEXT: s_waitcnt lgkmcnt(0) 936; VI-NEXT: v_mov_b32_e32 v1, s7 937; VI-NEXT: s_lshl_b32 s8, s4, 4 938; VI-NEXT: s_mov_b32 s4, 0xffff 939; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 940; VI-NEXT: s_mov_b32 s8, 0x50005 941; VI-NEXT: v_mov_b32_e32 v0, s8 942; VI-NEXT: v_bfi_b32 v0, s5, v0, v1 943; VI-NEXT: v_mov_b32_e32 v1, s8 944; VI-NEXT: v_mov_b32_e32 v2, s6 945; VI-NEXT: v_bfi_b32 v1, s4, v1, v2 946; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 947; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 948; VI-NEXT: s_endpgm 949 %vecins = insertelement <3 x i16> %a, i16 5, i32 %b 950 store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8 951 ret void 952} 953 954define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind { 955; SI-LABEL: dynamic_insertelement_v2i8: 956; SI: ; %bb.0: 957; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 958; SI-NEXT: s_load_dword s6, s[4:5], 0xa 959; SI-NEXT: s_load_dword s4, s[4:5], 0x13 960; SI-NEXT: v_mov_b32_e32 v0, 0x505 961; SI-NEXT: s_mov_b32 s3, 0x100f000 962; SI-NEXT: s_mov_b32 s2, -1 963; SI-NEXT: s_waitcnt lgkmcnt(0) 964; SI-NEXT: v_mov_b32_e32 v1, s6 965; SI-NEXT: s_lshl_b32 s4, s4, 3 966; SI-NEXT: s_lshl_b32 s4, -1, s4 967; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 968; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 969; SI-NEXT: s_endpgm 970; 971; VI-LABEL: dynamic_insertelement_v2i8: 972; VI: ; %bb.0: 973; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 974; VI-NEXT: s_load_dword s6, s[4:5], 0x28 975; VI-NEXT: s_load_dword s4, s[4:5], 0x4c 976; VI-NEXT: s_mov_b32 s3, 0x1100f000 977; VI-NEXT: s_mov_b32 s2, -1 978; VI-NEXT: s_waitcnt lgkmcnt(0) 979; VI-NEXT: s_lshl_b32 s4, s4, 3 980; VI-NEXT: v_lshlrev_b16_e64 v0, s4, -1 981; VI-NEXT: v_and_b32_e32 v1, 0x505, v0 982; VI-NEXT: v_xor_b32_e32 v0, -1, v0 983; VI-NEXT: v_and_b32_e32 v0, s6, v0 984; VI-NEXT: v_or_b32_e32 v0, v1, v0 985; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 986; VI-NEXT: s_endpgm 987 %vecins = insertelement <2 x i8> %a, i8 5, i32 %b 988 store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8 989 ret void 990} 991 992; FIXME: post legalize i16 and i32 shifts aren't merged because of 993; isTypeDesirableForOp in SimplifyDemandedBits 994define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind { 995; SI-LABEL: dynamic_insertelement_v3i8: 996; SI: ; %bb.0: 997; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 998; SI-NEXT: s_load_dword s6, s[4:5], 0xa 999; SI-NEXT: s_load_dword s4, s[4:5], 0x13 1000; SI-NEXT: v_mov_b32_e32 v0, 0x5050505 1001; SI-NEXT: s_mov_b32 s3, 0x100f000 1002; SI-NEXT: s_mov_b32 s2, -1 1003; SI-NEXT: s_waitcnt lgkmcnt(0) 1004; SI-NEXT: v_mov_b32_e32 v1, s6 1005; SI-NEXT: s_lshl_b32 s4, s4, 3 1006; SI-NEXT: s_lshl_b32 s4, 0xffff, s4 1007; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 1008; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1009; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1010; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2 1011; SI-NEXT: s_endpgm 1012; 1013; VI-LABEL: dynamic_insertelement_v3i8: 1014; VI: ; %bb.0: 1015; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1016; VI-NEXT: s_load_dword s6, s[4:5], 0x28 1017; VI-NEXT: s_load_dword s4, s[4:5], 0x4c 1018; VI-NEXT: v_mov_b32_e32 v0, 0x5050505 1019; VI-NEXT: s_mov_b32 s3, 0x1100f000 1020; VI-NEXT: s_mov_b32 s2, -1 1021; VI-NEXT: s_waitcnt lgkmcnt(0) 1022; VI-NEXT: v_mov_b32_e32 v1, s6 1023; VI-NEXT: s_lshl_b32 s4, s4, 3 1024; VI-NEXT: s_lshl_b32 s4, 0xffff, s4 1025; VI-NEXT: v_bfi_b32 v0, s4, v0, v1 1026; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1027; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1028; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2 1029; VI-NEXT: s_endpgm 1030 %vecins = insertelement <3 x i8> %a, i8 5, i32 %b 1031 store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4 1032 ret void 1033} 1034 1035define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind { 1036; SI-LABEL: dynamic_insertelement_v4i8: 1037; SI: ; %bb.0: 1038; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1039; SI-NEXT: s_load_dword s6, s[4:5], 0xa 1040; SI-NEXT: s_load_dword s4, s[4:5], 0x13 1041; SI-NEXT: v_mov_b32_e32 v0, 0x5050505 1042; SI-NEXT: s_mov_b32 s3, 0x100f000 1043; SI-NEXT: s_mov_b32 s2, -1 1044; SI-NEXT: s_waitcnt lgkmcnt(0) 1045; SI-NEXT: v_mov_b32_e32 v1, s6 1046; SI-NEXT: s_lshl_b32 s4, s4, 3 1047; SI-NEXT: s_lshl_b32 s4, 0xffff, s4 1048; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 1049; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1050; SI-NEXT: s_endpgm 1051; 1052; VI-LABEL: dynamic_insertelement_v4i8: 1053; VI: ; %bb.0: 1054; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1055; VI-NEXT: s_load_dword s6, s[4:5], 0x28 1056; VI-NEXT: s_load_dword s4, s[4:5], 0x4c 1057; VI-NEXT: v_mov_b32_e32 v0, 0x5050505 1058; VI-NEXT: s_mov_b32 s3, 0x1100f000 1059; VI-NEXT: s_mov_b32 s2, -1 1060; VI-NEXT: s_waitcnt lgkmcnt(0) 1061; VI-NEXT: v_mov_b32_e32 v1, s6 1062; VI-NEXT: s_lshl_b32 s4, s4, 3 1063; VI-NEXT: s_lshl_b32 s4, 0xffff, s4 1064; VI-NEXT: v_bfi_b32 v0, s4, v0, v1 1065; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1066; VI-NEXT: s_endpgm 1067 %vecins = insertelement <4 x i8> %a, i8 5, i32 %b 1068 store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4 1069 ret void 1070} 1071 1072define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind { 1073; SI-LABEL: s_dynamic_insertelement_v8i8: 1074; SI: ; %bb.0: 1075; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 1076; SI-NEXT: s_load_dword s6, s[4:5], 0x4 1077; SI-NEXT: s_mov_b32 s7, 0 1078; SI-NEXT: s_mov_b32 s3, 0x100f000 1079; SI-NEXT: s_mov_b32 s2, -1 1080; SI-NEXT: s_waitcnt lgkmcnt(0) 1081; SI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0 1082; SI-NEXT: s_mov_b32 s0, s8 1083; SI-NEXT: s_lshl_b32 s8, s6, 3 1084; SI-NEXT: s_mov_b32 s6, 0xffff 1085; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 1086; SI-NEXT: s_mov_b32 s8, 0x5050505 1087; SI-NEXT: s_mov_b32 s1, s9 1088; SI-NEXT: s_and_b32 s9, s7, s8 1089; SI-NEXT: s_and_b32 s8, s6, s8 1090; SI-NEXT: s_waitcnt lgkmcnt(0) 1091; SI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] 1092; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] 1093; SI-NEXT: v_mov_b32_e32 v0, s4 1094; SI-NEXT: v_mov_b32_e32 v1, s5 1095; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1096; SI-NEXT: s_endpgm 1097; 1098; VI-LABEL: s_dynamic_insertelement_v8i8: 1099; VI: ; %bb.0: 1100; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 1101; VI-NEXT: s_load_dword s6, s[4:5], 0x10 1102; VI-NEXT: s_mov_b32 s7, 0 1103; VI-NEXT: s_mov_b32 s3, 0x1100f000 1104; VI-NEXT: s_mov_b32 s2, -1 1105; VI-NEXT: s_waitcnt lgkmcnt(0) 1106; VI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0 1107; VI-NEXT: s_mov_b32 s0, s8 1108; VI-NEXT: s_lshl_b32 s8, s6, 3 1109; VI-NEXT: s_mov_b32 s6, 0xffff 1110; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 1111; VI-NEXT: s_mov_b32 s8, 0x5050505 1112; VI-NEXT: s_mov_b32 s1, s9 1113; VI-NEXT: s_and_b32 s9, s7, s8 1114; VI-NEXT: s_and_b32 s8, s6, s8 1115; VI-NEXT: s_waitcnt lgkmcnt(0) 1116; VI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] 1117; VI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] 1118; VI-NEXT: v_mov_b32_e32 v0, s4 1119; VI-NEXT: v_mov_b32_e32 v1, s5 1120; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1121; VI-NEXT: s_endpgm 1122 %a = load <8 x i8>, <8 x i8> addrspace(4)* %a.ptr, align 4 1123 %vecins = insertelement <8 x i8> %a, i8 5, i32 %b 1124 store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8 1125 ret void 1126} 1127 1128define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind { 1129; SI-LABEL: dynamic_insertelement_v16i8: 1130; SI: ; %bb.0: 1131; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1132; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 1133; SI-NEXT: s_load_dword s4, s[4:5], 0x8 1134; SI-NEXT: s_mov_b32 s3, 0x100f000 1135; SI-NEXT: s_mov_b32 s2, -1 1136; SI-NEXT: s_waitcnt lgkmcnt(0) 1137; SI-NEXT: s_lshr_b32 s5, s11, 24 1138; SI-NEXT: v_mov_b32_e32 v0, s5 1139; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 15 1140; SI-NEXT: s_lshr_b32 s5, s11, 16 1141; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1142; SI-NEXT: v_mov_b32_e32 v1, s5 1143; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 14 1144; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1145; SI-NEXT: s_movk_i32 s5, 0xff 1146; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1147; SI-NEXT: v_and_b32_e32 v1, s5, v1 1148; SI-NEXT: s_lshr_b32 s6, s11, 8 1149; SI-NEXT: v_or_b32_e32 v0, v1, v0 1150; SI-NEXT: v_mov_b32_e32 v1, s6 1151; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 13 1152; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1153; SI-NEXT: v_mov_b32_e32 v2, s11 1154; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 12 1155; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc 1156; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1157; SI-NEXT: v_and_b32_e32 v2, s5, v2 1158; SI-NEXT: v_or_b32_e32 v1, v2, v1 1159; SI-NEXT: s_mov_b32 s6, 0xffff 1160; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1161; SI-NEXT: v_and_b32_e32 v1, s6, v1 1162; SI-NEXT: s_lshr_b32 s7, s10, 24 1163; SI-NEXT: v_or_b32_e32 v3, v1, v0 1164; SI-NEXT: v_mov_b32_e32 v0, s7 1165; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 11 1166; SI-NEXT: s_lshr_b32 s7, s10, 16 1167; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1168; SI-NEXT: v_mov_b32_e32 v1, s7 1169; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 10 1170; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1171; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1172; SI-NEXT: v_and_b32_e32 v1, s5, v1 1173; SI-NEXT: s_lshr_b32 s7, s10, 8 1174; SI-NEXT: v_or_b32_e32 v0, v1, v0 1175; SI-NEXT: v_mov_b32_e32 v1, s7 1176; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 9 1177; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1178; SI-NEXT: v_mov_b32_e32 v2, s10 1179; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 8 1180; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc 1181; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1182; SI-NEXT: v_and_b32_e32 v2, s5, v2 1183; SI-NEXT: v_or_b32_e32 v1, v2, v1 1184; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1185; SI-NEXT: v_and_b32_e32 v1, s6, v1 1186; SI-NEXT: s_lshr_b32 s7, s9, 24 1187; SI-NEXT: v_or_b32_e32 v2, v1, v0 1188; SI-NEXT: v_mov_b32_e32 v0, s7 1189; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7 1190; SI-NEXT: s_lshr_b32 s7, s9, 16 1191; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1192; SI-NEXT: v_mov_b32_e32 v1, s7 1193; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6 1194; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1195; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1196; SI-NEXT: v_and_b32_e32 v1, s5, v1 1197; SI-NEXT: s_lshr_b32 s7, s9, 8 1198; SI-NEXT: v_or_b32_e32 v0, v1, v0 1199; SI-NEXT: v_mov_b32_e32 v1, s7 1200; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5 1201; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1202; SI-NEXT: v_mov_b32_e32 v4, s9 1203; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4 1204; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 1205; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1206; SI-NEXT: v_and_b32_e32 v4, s5, v4 1207; SI-NEXT: v_or_b32_e32 v1, v4, v1 1208; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1209; SI-NEXT: v_and_b32_e32 v1, s6, v1 1210; SI-NEXT: s_lshr_b32 s7, s8, 24 1211; SI-NEXT: v_or_b32_e32 v1, v1, v0 1212; SI-NEXT: v_mov_b32_e32 v0, s7 1213; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 1214; SI-NEXT: s_lshr_b32 s7, s8, 16 1215; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1216; SI-NEXT: v_mov_b32_e32 v4, s7 1217; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 1218; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 1219; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1220; SI-NEXT: v_and_b32_e32 v4, s5, v4 1221; SI-NEXT: s_lshr_b32 s7, s8, 8 1222; SI-NEXT: v_or_b32_e32 v0, v4, v0 1223; SI-NEXT: v_mov_b32_e32 v4, s7 1224; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 1225; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 1226; SI-NEXT: v_mov_b32_e32 v5, s8 1227; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 1228; SI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc 1229; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 1230; SI-NEXT: v_and_b32_e32 v5, s5, v5 1231; SI-NEXT: v_or_b32_e32 v4, v5, v4 1232; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1233; SI-NEXT: v_and_b32_e32 v4, s6, v4 1234; SI-NEXT: v_or_b32_e32 v0, v4, v0 1235; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1236; SI-NEXT: s_endpgm 1237; 1238; VI-LABEL: dynamic_insertelement_v16i8: 1239; VI: ; %bb.0: 1240; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1241; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 1242; VI-NEXT: s_load_dword s4, s[4:5], 0x20 1243; VI-NEXT: s_mov_b32 s3, 0x1100f000 1244; VI-NEXT: s_mov_b32 s2, -1 1245; VI-NEXT: s_waitcnt lgkmcnt(0) 1246; VI-NEXT: s_lshr_b32 s5, s11, 24 1247; VI-NEXT: v_mov_b32_e32 v0, s5 1248; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 15 1249; VI-NEXT: s_lshr_b32 s5, s11, 16 1250; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1251; VI-NEXT: v_mov_b32_e32 v1, s5 1252; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 14 1253; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1254; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 1255; VI-NEXT: s_lshr_b32 s5, s11, 8 1256; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1257; VI-NEXT: v_mov_b32_e32 v1, s5 1258; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 13 1259; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1260; VI-NEXT: v_mov_b32_e32 v2, s11 1261; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 12 1262; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc 1263; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 1264; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1265; VI-NEXT: s_lshr_b32 s5, s10, 24 1266; VI-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1267; VI-NEXT: v_mov_b32_e32 v0, s5 1268; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 11 1269; VI-NEXT: s_lshr_b32 s5, s10, 16 1270; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1271; VI-NEXT: v_mov_b32_e32 v1, s5 1272; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 10 1273; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1274; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 1275; VI-NEXT: s_lshr_b32 s5, s10, 8 1276; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1277; VI-NEXT: v_mov_b32_e32 v1, s5 1278; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 9 1279; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1280; VI-NEXT: v_mov_b32_e32 v2, s10 1281; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 8 1282; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc 1283; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 1284; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1285; VI-NEXT: s_lshr_b32 s5, s9, 24 1286; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1287; VI-NEXT: v_mov_b32_e32 v0, s5 1288; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7 1289; VI-NEXT: s_lshr_b32 s5, s9, 16 1290; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1291; VI-NEXT: v_mov_b32_e32 v1, s5 1292; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6 1293; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1294; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 1295; VI-NEXT: s_lshr_b32 s5, s9, 8 1296; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1297; VI-NEXT: v_mov_b32_e32 v1, s5 1298; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5 1299; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1300; VI-NEXT: v_mov_b32_e32 v4, s9 1301; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4 1302; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 1303; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 1304; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1305; VI-NEXT: s_lshr_b32 s5, s8, 24 1306; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1307; VI-NEXT: v_mov_b32_e32 v0, s5 1308; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3 1309; VI-NEXT: s_lshr_b32 s5, s8, 16 1310; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1311; VI-NEXT: v_mov_b32_e32 v4, s5 1312; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2 1313; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 1314; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 1315; VI-NEXT: s_lshr_b32 s5, s8, 8 1316; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1317; VI-NEXT: v_mov_b32_e32 v4, s5 1318; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1 1319; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 1320; VI-NEXT: v_mov_b32_e32 v5, s8 1321; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0 1322; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4 1323; VI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc 1324; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1325; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1326; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1327; VI-NEXT: s_endpgm 1328 %vecins = insertelement <16 x i8> %a, i8 5, i32 %b 1329 store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16 1330 ret void 1331} 1332 1333; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that 1334; the compiler doesn't crash. 1335define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) { 1336; SI-LABEL: insert_split_bb: 1337; SI: ; %bb.0: ; %entry 1338; SI-NEXT: s_load_dword s0, s[4:5], 0x4 1339; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 1340; SI-NEXT: s_waitcnt lgkmcnt(0) 1341; SI-NEXT: s_cmp_lg_u32 s0, 0 1342; SI-NEXT: s_cbranch_scc0 BB26_2 1343; SI-NEXT: ; %bb.1: ; %else 1344; SI-NEXT: s_load_dword s1, s[6:7], 0x1 1345; SI-NEXT: s_branch BB26_3 1346; SI-NEXT: BB26_2: ; %if 1347; SI-NEXT: s_load_dword s1, s[6:7], 0x0 1348; SI-NEXT: BB26_3: ; %endif 1349; SI-NEXT: s_waitcnt lgkmcnt(0) 1350; SI-NEXT: v_mov_b32_e32 v0, s0 1351; SI-NEXT: s_mov_b32 s7, 0x100f000 1352; SI-NEXT: s_mov_b32 s6, -1 1353; SI-NEXT: v_mov_b32_e32 v1, s1 1354; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1355; SI-NEXT: s_endpgm 1356; 1357; VI-LABEL: insert_split_bb: 1358; VI: ; %bb.0: ; %entry 1359; VI-NEXT: s_load_dword s0, s[4:5], 0x10 1360; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 1361; VI-NEXT: s_waitcnt lgkmcnt(0) 1362; VI-NEXT: s_cmp_lg_u32 s0, 0 1363; VI-NEXT: s_cbranch_scc0 BB26_2 1364; VI-NEXT: ; %bb.1: ; %else 1365; VI-NEXT: s_load_dword s1, s[6:7], 0x4 1366; VI-NEXT: s_branch BB26_3 1367; VI-NEXT: BB26_2: ; %if 1368; VI-NEXT: s_load_dword s1, s[6:7], 0x0 1369; VI-NEXT: BB26_3: ; %endif 1370; VI-NEXT: s_waitcnt lgkmcnt(0) 1371; VI-NEXT: v_mov_b32_e32 v0, s0 1372; VI-NEXT: s_mov_b32 s7, 0x1100f000 1373; VI-NEXT: s_mov_b32 s6, -1 1374; VI-NEXT: v_mov_b32_e32 v1, s1 1375; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1376; VI-NEXT: s_endpgm 1377entry: 1378 %0 = insertelement <2 x i32> undef, i32 %a, i32 0 1379 %1 = icmp eq i32 %a, 0 1380 br i1 %1, label %if, label %else 1381 1382if: 1383 %2 = load i32, i32 addrspace(1)* %in 1384 %3 = insertelement <2 x i32> %0, i32 %2, i32 1 1385 br label %endif 1386 1387else: 1388 %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1 1389 %5 = load i32, i32 addrspace(1)* %4 1390 %6 = insertelement <2 x i32> %0, i32 %5, i32 1 1391 br label %endif 1392 1393endif: 1394 %7 = phi <2 x i32> [%3, %if], [%6, %else] 1395 store <2 x i32> %7, <2 x i32> addrspace(1)* %out 1396 ret void 1397} 1398 1399define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind { 1400; SI-LABEL: dynamic_insertelement_v2f64: 1401; SI: ; %bb.0: 1402; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1403; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xc 1404; SI-NEXT: s_load_dword s4, s[4:5], 0x18 1405; SI-NEXT: v_mov_b32_e32 v1, 0x40200000 1406; SI-NEXT: s_mov_b32 s3, 0x100f000 1407; SI-NEXT: s_mov_b32 s2, -1 1408; SI-NEXT: s_waitcnt lgkmcnt(0) 1409; SI-NEXT: v_mov_b32_e32 v0, s11 1410; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 1411; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc 1412; SI-NEXT: v_mov_b32_e32 v0, s10 1413; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc 1414; SI-NEXT: v_mov_b32_e32 v0, s9 1415; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 1416; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 1417; SI-NEXT: v_mov_b32_e32 v0, s8 1418; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 1419; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1420; SI-NEXT: s_endpgm 1421; 1422; VI-LABEL: dynamic_insertelement_v2f64: 1423; VI: ; %bb.0: 1424; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1425; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x30 1426; VI-NEXT: s_load_dword s4, s[4:5], 0x60 1427; VI-NEXT: v_mov_b32_e32 v1, 0x40200000 1428; VI-NEXT: s_mov_b32 s3, 0x1100f000 1429; VI-NEXT: s_mov_b32 s2, -1 1430; VI-NEXT: s_waitcnt lgkmcnt(0) 1431; VI-NEXT: v_mov_b32_e32 v0, s11 1432; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 1433; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc 1434; VI-NEXT: v_mov_b32_e32 v0, s10 1435; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc 1436; VI-NEXT: v_mov_b32_e32 v0, s9 1437; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 1438; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 1439; VI-NEXT: v_mov_b32_e32 v0, s8 1440; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 1441; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1442; VI-NEXT: s_endpgm 1443 %vecins = insertelement <2 x double> %a, double 8.0, i32 %b 1444 store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16 1445 ret void 1446} 1447 1448define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind { 1449; SI-LABEL: dynamic_insertelement_v2i64: 1450; SI: ; %bb.0: 1451; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1452; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 1453; SI-NEXT: s_load_dword s6, s[4:5], 0x8 1454; SI-NEXT: s_mov_b32 s3, 0x100f000 1455; SI-NEXT: s_mov_b32 s2, -1 1456; SI-NEXT: s_waitcnt lgkmcnt(0) 1457; SI-NEXT: v_mov_b32_e32 v0, s11 1458; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1 1459; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] 1460; SI-NEXT: v_mov_b32_e32 v0, s10 1461; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5] 1462; SI-NEXT: v_mov_b32_e32 v0, s9 1463; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0 1464; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] 1465; SI-NEXT: v_mov_b32_e32 v0, s8 1466; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5] 1467; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1468; SI-NEXT: s_endpgm 1469; 1470; VI-LABEL: dynamic_insertelement_v2i64: 1471; VI: ; %bb.0: 1472; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1473; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 1474; VI-NEXT: s_load_dword s6, s[4:5], 0x20 1475; VI-NEXT: s_mov_b32 s3, 0x1100f000 1476; VI-NEXT: s_mov_b32 s2, -1 1477; VI-NEXT: s_waitcnt lgkmcnt(0) 1478; VI-NEXT: v_mov_b32_e32 v0, s11 1479; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1 1480; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] 1481; VI-NEXT: v_mov_b32_e32 v0, s10 1482; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5] 1483; VI-NEXT: v_mov_b32_e32 v0, s9 1484; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0 1485; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] 1486; VI-NEXT: v_mov_b32_e32 v0, s8 1487; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5] 1488; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1489; VI-NEXT: s_endpgm 1490 %vecins = insertelement <2 x i64> %a, i64 5, i32 %b 1491 store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8 1492 ret void 1493} 1494 1495define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind { 1496; SI-LABEL: dynamic_insertelement_v3i64: 1497; SI: ; %bb.0: 1498; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1499; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 1500; SI-NEXT: s_load_dword s6, s[4:5], 0x10 1501; SI-NEXT: s_mov_b32 s3, 0x100f000 1502; SI-NEXT: s_mov_b32 s2, -1 1503; SI-NEXT: s_waitcnt lgkmcnt(0) 1504; SI-NEXT: v_mov_b32_e32 v0, s13 1505; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 2 1506; SI-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5] 1507; SI-NEXT: v_mov_b32_e32 v0, s12 1508; SI-NEXT: v_cndmask_b32_e64 v4, v0, 5, s[4:5] 1509; SI-NEXT: v_mov_b32_e32 v0, s11 1510; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1 1511; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] 1512; SI-NEXT: v_mov_b32_e32 v0, s10 1513; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5] 1514; SI-NEXT: v_mov_b32_e32 v0, s9 1515; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0 1516; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] 1517; SI-NEXT: v_mov_b32_e32 v0, s8 1518; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5] 1519; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16 1520; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1521; SI-NEXT: s_endpgm 1522; 1523; VI-LABEL: dynamic_insertelement_v3i64: 1524; VI: ; %bb.0: 1525; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1526; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 1527; VI-NEXT: s_load_dword s6, s[4:5], 0x40 1528; VI-NEXT: s_mov_b32 s3, 0x1100f000 1529; VI-NEXT: s_mov_b32 s2, -1 1530; VI-NEXT: s_waitcnt lgkmcnt(0) 1531; VI-NEXT: v_mov_b32_e32 v0, s13 1532; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 2 1533; VI-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5] 1534; VI-NEXT: v_mov_b32_e32 v0, s12 1535; VI-NEXT: v_cndmask_b32_e64 v4, v0, 5, s[4:5] 1536; VI-NEXT: v_mov_b32_e32 v0, s11 1537; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1 1538; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] 1539; VI-NEXT: v_mov_b32_e32 v0, s10 1540; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5] 1541; VI-NEXT: v_mov_b32_e32 v0, s9 1542; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0 1543; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5] 1544; VI-NEXT: v_mov_b32_e32 v0, s8 1545; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5] 1546; VI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16 1547; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1548; VI-NEXT: s_endpgm 1549 %vecins = insertelement <3 x i64> %a, i64 5, i32 %b 1550 store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32 1551 ret void 1552} 1553 1554define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind { 1555; SI-LABEL: dynamic_insertelement_v4f64: 1556; SI: ; %bb.0: 1557; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1558; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 1559; SI-NEXT: s_load_dword s4, s[4:5], 0x10 1560; SI-NEXT: v_mov_b32_e32 v4, 0x40200000 1561; SI-NEXT: s_mov_b32 s3, 0x100f000 1562; SI-NEXT: s_mov_b32 s2, -1 1563; SI-NEXT: s_waitcnt lgkmcnt(0) 1564; SI-NEXT: v_mov_b32_e32 v0, s11 1565; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 1566; SI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc 1567; SI-NEXT: v_mov_b32_e32 v0, s10 1568; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc 1569; SI-NEXT: v_mov_b32_e32 v0, s9 1570; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 1571; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc 1572; SI-NEXT: v_mov_b32_e32 v0, s8 1573; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 1574; SI-NEXT: v_mov_b32_e32 v5, s15 1575; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 3 1576; SI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc 1577; SI-NEXT: v_mov_b32_e32 v5, s14 1578; SI-NEXT: v_cndmask_b32_e64 v6, v5, 0, vcc 1579; SI-NEXT: v_mov_b32_e32 v5, s13 1580; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 2 1581; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc 1582; SI-NEXT: v_mov_b32_e32 v4, s12 1583; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc 1584; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 1585; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1586; SI-NEXT: s_endpgm 1587; 1588; VI-LABEL: dynamic_insertelement_v4f64: 1589; VI: ; %bb.0: 1590; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1591; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 1592; VI-NEXT: s_load_dword s4, s[4:5], 0x40 1593; VI-NEXT: v_mov_b32_e32 v4, 0x40200000 1594; VI-NEXT: s_mov_b32 s3, 0x1100f000 1595; VI-NEXT: s_mov_b32 s2, -1 1596; VI-NEXT: s_waitcnt lgkmcnt(0) 1597; VI-NEXT: v_mov_b32_e32 v0, s11 1598; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 1599; VI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc 1600; VI-NEXT: v_mov_b32_e32 v0, s10 1601; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc 1602; VI-NEXT: v_mov_b32_e32 v0, s9 1603; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 1604; VI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc 1605; VI-NEXT: v_mov_b32_e32 v0, s8 1606; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 1607; VI-NEXT: v_mov_b32_e32 v5, s15 1608; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 3 1609; VI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc 1610; VI-NEXT: v_mov_b32_e32 v5, s14 1611; VI-NEXT: v_cndmask_b32_e64 v6, v5, 0, vcc 1612; VI-NEXT: v_mov_b32_e32 v5, s13 1613; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 2 1614; VI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc 1615; VI-NEXT: v_mov_b32_e32 v4, s12 1616; VI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc 1617; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 1618; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1619; VI-NEXT: s_endpgm 1620 %vecins = insertelement <4 x double> %a, double 8.0, i32 %b 1621 store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16 1622 ret void 1623} 1624 1625define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 { 1626; SI-LABEL: dynamic_insertelement_v8f64: 1627; SI: ; %bb.0: 1628; SI-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x0 1629; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 1630; SI-NEXT: s_load_dword s4, s[4:5], 0x20 1631; SI-NEXT: v_mov_b32_e32 v16, 64 1632; SI-NEXT: s_mov_b32 s27, 0x100f000 1633; SI-NEXT: s_mov_b32 s26, -1 1634; SI-NEXT: s_waitcnt lgkmcnt(0) 1635; SI-NEXT: v_mov_b32_e32 v0, s8 1636; SI-NEXT: s_and_b32 s4, s4, 7 1637; SI-NEXT: s_lshl_b32 s4, s4, 3 1638; SI-NEXT: v_mov_b32_e32 v1, s9 1639; SI-NEXT: v_mov_b32_e32 v2, s10 1640; SI-NEXT: v_mov_b32_e32 v3, s11 1641; SI-NEXT: v_mov_b32_e32 v4, s12 1642; SI-NEXT: v_mov_b32_e32 v5, s13 1643; SI-NEXT: v_mov_b32_e32 v6, s14 1644; SI-NEXT: v_mov_b32_e32 v7, s15 1645; SI-NEXT: v_mov_b32_e32 v8, s16 1646; SI-NEXT: v_mov_b32_e32 v9, s17 1647; SI-NEXT: v_mov_b32_e32 v10, s18 1648; SI-NEXT: v_mov_b32_e32 v11, s19 1649; SI-NEXT: v_mov_b32_e32 v12, s20 1650; SI-NEXT: v_mov_b32_e32 v13, s21 1651; SI-NEXT: v_mov_b32_e32 v14, s22 1652; SI-NEXT: v_mov_b32_e32 v15, s23 1653; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112 1654; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96 1655; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80 1656; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64 1657; SI-NEXT: v_or_b32_e32 v16, s4, v16 1658; SI-NEXT: v_mov_b32_e32 v0, 0 1659; SI-NEXT: v_mov_b32_e32 v1, 0x40200000 1660; SI-NEXT: buffer_store_dwordx2 v[0:1], v16, s[0:3], s7 offen 1661; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], s7 offset:64 1662; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], s7 offset:80 1663; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], s7 offset:96 1664; SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], s7 offset:112 1665; SI-NEXT: s_waitcnt vmcnt(0) 1666; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[24:27], 0 offset:48 1667; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[24:27], 0 offset:32 1668; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[24:27], 0 offset:16 1669; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[24:27], 0 1670; SI-NEXT: s_endpgm 1671; 1672; VI-LABEL: dynamic_insertelement_v8f64: 1673; VI: ; %bb.0: 1674; VI-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x0 1675; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 1676; VI-NEXT: s_load_dword s4, s[4:5], 0x80 1677; VI-NEXT: v_mov_b32_e32 v16, 64 1678; VI-NEXT: s_mov_b32 s27, 0x1100f000 1679; VI-NEXT: s_mov_b32 s26, -1 1680; VI-NEXT: s_waitcnt lgkmcnt(0) 1681; VI-NEXT: v_mov_b32_e32 v0, s8 1682; VI-NEXT: s_and_b32 s4, s4, 7 1683; VI-NEXT: s_lshl_b32 s4, s4, 3 1684; VI-NEXT: v_mov_b32_e32 v1, s9 1685; VI-NEXT: v_mov_b32_e32 v2, s10 1686; VI-NEXT: v_mov_b32_e32 v3, s11 1687; VI-NEXT: v_mov_b32_e32 v4, s12 1688; VI-NEXT: v_mov_b32_e32 v5, s13 1689; VI-NEXT: v_mov_b32_e32 v6, s14 1690; VI-NEXT: v_mov_b32_e32 v7, s15 1691; VI-NEXT: v_mov_b32_e32 v8, s16 1692; VI-NEXT: v_mov_b32_e32 v9, s17 1693; VI-NEXT: v_mov_b32_e32 v10, s18 1694; VI-NEXT: v_mov_b32_e32 v11, s19 1695; VI-NEXT: v_mov_b32_e32 v12, s20 1696; VI-NEXT: v_mov_b32_e32 v13, s21 1697; VI-NEXT: v_mov_b32_e32 v14, s22 1698; VI-NEXT: v_mov_b32_e32 v15, s23 1699; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112 1700; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96 1701; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80 1702; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64 1703; VI-NEXT: v_or_b32_e32 v16, s4, v16 1704; VI-NEXT: v_mov_b32_e32 v0, 0 1705; VI-NEXT: v_mov_b32_e32 v1, 0x40200000 1706; VI-NEXT: buffer_store_dwordx2 v[0:1], v16, s[0:3], s7 offen 1707; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], s7 offset:64 1708; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], s7 offset:80 1709; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], s7 offset:96 1710; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], s7 offset:112 1711; VI-NEXT: s_waitcnt vmcnt(0) 1712; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[24:27], 0 offset:48 1713; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[24:27], 0 offset:32 1714; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[24:27], 0 offset:16 1715; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[24:27], 0 1716; VI-NEXT: s_endpgm 1717 %vecins = insertelement <8 x double> %a, double 8.0, i32 %b 1718 store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16 1719 ret void 1720} 1721 1722declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 1723 1724attributes #0 = { nounwind } 1725attributes #1 = { nounwind readnone } 1726