; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=CIVI,VI %s
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CIVI,CI %s
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s

; Codegen tests for insertelement on <2 x i16> and <2 x half> vectors.
; The s_* kernels load the vector through an addrspace(4) pointer argument;
; the v_* kernels index addrspace(1) pointers by the workitem id.
; Do not hand-edit the check lines; regenerate them with the script named above.

; Inserts the constant 999 into element 0 of the loaded <2 x i16>.
define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_lh_b32_b16 s2, 0x3e7, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; CIVI-LABEL: s_insertelement_v2i16_0:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_and_b32 s0, s2, 0xffff0000
; CIVI-NEXT: s_or_b32 s0, s0, 0x3e7
; CIVI-NEXT: v_mov_b32_e32 v2, s0
; CIVI-NEXT: flat_store_dword v[0:1], v2
; CIVI-NEXT: s_endpgm
;
; GFX11-LABEL: s_insertelement_v2i16_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0x3e7, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ret void
}


; Inserts the i16 kernel argument %elt into element 0.
define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0_reg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_lh_b32_b16 s2, s6, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_0_reg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x30
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_and_b32 s0, s4, 0xffff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s1, s2, 0xffff0000
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_0_reg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_and_b32 s1, s4, 0xffff
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s0, s2, 0xffff0000
; CI-NEXT: s_or_b32 s0, s1, s0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: s_insertelement_v2i16_0_reg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_pack_lh_b32_b16 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ret void
}

; Inserts %elt into element 0 while element 1 of the loaded vector is also
; extracted, zero-extended and passed to an inline-asm use.
define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s2, s2, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s6, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s2
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x30
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_and_b32 s0, s4, 0xffff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s2, 16
; VI-NEXT: s_and_b32 s2, s2, 0xffff0000
; VI-NEXT: s_or_b32 s0, s0, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: ;;#ASMSTART
; VI-NEXT: ; use s1
; VI-NEXT: ;;#ASMEND
; VI-NEXT: s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_and_b32 s0, s4, 0xffff
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s1, s2, 0xffff0000
; CI-NEXT: s_or_b32 s0, s0, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: s_lshr_b32 s2, s2, 16
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: ;;#ASMSTART
; CI-NEXT: ; use s2
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshr_b32 s1, s1, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s1
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %elt1 = extractelement <2 x i16> %vec, i32 1
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  %use1 = zext i16 %elt1 to i32
  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
  ret void
}

; Inserts the high 16 bits of the i32 argument (lshr by 16, then trunc) into
; element 0.
define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i32 %elt.arg) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0_reghi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_hh_b32_b16 s2, s6, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_0_reghi:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x30
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s0, s2, 16
; VI-NEXT: v_alignbit_b32 v2, s0, v2, 16
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_0_reghi:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshr_b32 s1, s4, 16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s0, s2, 0xffff0000
; CI-NEXT: s_or_b32 s0, s1, s0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: s_insertelement_v2i16_0_reghi:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_pack_hh_b32_b16 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %elt.hi = lshr i32 %elt.arg, 16
  %elt = trunc i32 %elt.hi to i16
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ret void
}

; Inserts the high 16 bits of %elt.arg into element 0; the truncated value is
; also zero-extended and passed to an inline-asm use.
define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_lshr_b32 s3, s6, 16
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_lh_b32_b16 s2, s3, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s3
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_lshr_b32 s0, s4, 16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s2, 16
; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: ;;#ASMSTART
; VI-NEXT: ; use s0
; VI-NEXT: ;;#ASMEND
; VI-NEXT: s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshr_b32 s0, s4, 16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s1, s2, 0xffff0000
; CI-NEXT: s_or_b32 s1, s0, s1
; CI-NEXT: v_mov_b32_e32 v2, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: ;;#ASMSTART
; CI-NEXT: ; use s0
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
; GFX11-NEXT: s_lshr_b32 s0, s0, 16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_pack_lh_b32_b16 s1, s0, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s0
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %elt.hi = lshr i32 %elt.arg, 16
  %elt = trunc i32 %elt.hi to i16
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  %use1 = zext i16 %elt to i32
  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
  ret void
}

; Inserts the high 16 bits of %elt.arg into element 0; both the inserted value
; and element 1 of the loaded vector get an additional inline-asm use.
define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_lshr_b32 s3, s6, 16
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s2, s2, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s3, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s3
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s2
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_lshr_b32 s0, s4, 16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s2, 16
; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: ;;#ASMSTART
; VI-NEXT: ; use s0
; VI-NEXT: ;;#ASMEND
; VI-NEXT: ;;#ASMSTART
; VI-NEXT: ; use s1
; VI-NEXT: ;;#ASMEND
; VI-NEXT: s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_lshr_b32 s0, s4, 16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s1, s2, 16
; CI-NEXT: v_alignbit_b32 v2, s1, v2, 16
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: ;;#ASMSTART
; CI-NEXT: ; use s0
; CI-NEXT: ;;#ASMEND
; CI-NEXT: ;;#ASMSTART
; CI-NEXT: ; use s1
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
; GFX11-NEXT: s_lshr_b32 s0, s0, 16
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshr_b32 s1, s1, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s1
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s0
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s1
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %elt.hi = lshr i32 %elt.arg, 16
  %elt = trunc i32 %elt.hi to i16
  %vec.hi = extractelement <2 x i16> %vec, i32 1
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  %use1 = zext i16 %elt to i32
  %vec.hi.use1 = zext i16 %vec.hi to i32

  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
  call void asm sideeffect "; use $0", "s"(i32 %vec.hi.use1) #0
  ret void
}

; Inserts the constant 999 into element 1 of the loaded <2 x i16>.
define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
; GFX9-LABEL: s_insertelement_v2i16_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, 0x3e7
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; CIVI-LABEL: s_insertelement_v2i16_1:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_and_b32 s0, s2, 0xffff
; CIVI-NEXT: s_or_b32 s0, s0, 0x3e70000
; CIVI-NEXT: v_mov_b32_e32 v2, s0
; CIVI-NEXT: flat_store_dword v[0:1], v2
; CIVI-NEXT: s_endpgm
;
; GFX11-LABEL: s_insertelement_v2i16_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, 0x3e7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ret void
}

; Inserts the i16 kernel argument %elt into element 1.
define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
; GFX9-LABEL: s_insertelement_v2i16_1_reg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_1_reg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x30
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshl_b32 s0, s4, 16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s1, s2, 0xffff
; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_1_reg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshl_b32 s1, s4, 16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s0, s2, 0xffff
; CI-NEXT: s_or_b32 s0, s0, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: s_insertelement_v2i16_1_reg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_pack_ll_b32_b16 s0, s1, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ret void
}

; Inserts the half constant 5.0 into element 0 of the loaded <2 x half>.
define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
; GFX9-LABEL: s_insertelement_v2f16_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s2, s2, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s2, 0x4500, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; CIVI-LABEL: s_insertelement_v2f16_0:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_and_b32 s0, s2, 0xffff0000
; CIVI-NEXT: s_or_b32 s0, s0, 0x4500
; CIVI-NEXT: v_mov_b32_e32 v2, s0
; CIVI-NEXT: flat_store_dword v[0:1], v2
; CIVI-NEXT: s_endpgm
;
; GFX11-LABEL: s_insertelement_v2f16_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshr_b32 s2, s2, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_pack_ll_b32_b16 s2, 0x4500, s2
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
  store <2 x half> %vecins, <2 x half> addrspace(1)* %out
  ret void
}

; Inserts the half constant 5.0 into element 1 of the loaded <2 x half>.
define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
; GFX9-LABEL: s_insertelement_v2f16_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, 0x4500
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; CIVI-LABEL: s_insertelement_v2f16_1:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_and_b32 s0, s2, 0xffff
; CIVI-NEXT: s_or_b32 s0, s0, 0x45000000
; CIVI-NEXT: v_mov_b32_e32 v2, s0
; CIVI-NEXT: flat_store_dword v[0:1], v2
; CIVI-NEXT: s_endpgm
;
; GFX11-LABEL: s_insertelement_v2f16_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, 0x4500
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
  store <2 x half> %vecins, <2 x half> addrspace(1)* %out
  ret void
}

; Per-workitem variant: %in and %out are indexed by workitem.id.x, and the
; constant 999 is inserted into element 0.
define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_insertelement_v2i16_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_movk_i32 s2, 0x3e7
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v1, v2, s2, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2i16_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; VI-NEXT: v_or_b32_e32 v2, 0x3e7, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2i16_0:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v3, v[0:1]
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; CI-NEXT: v_or_b32_e32 v2, 0x3e7, v2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v2i16_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_movk_i32 s2, 0x3e7
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s2, v1
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; Per-workitem variant: the high 16 bits of %elt.arg (lshr by 16, then trunc)
; are inserted into element 0.
define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %elt.arg) #0 {
; GFX9-LABEL: v_insertelement_v2i16_0_reghi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: v_lshrrev_b32_e64 v2, 16, s6
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2i16_0_reghi:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; VI-NEXT: v_alignbit_b32 v2, v2, s4, 16
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2i16_0_reghi:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v3, v[0:1]
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; CI-NEXT: s_lshr_b32 s0, s4, 16
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; CI-NEXT: v_or_b32_e32 v2, s0, v2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v2i16_0_reghi:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: v_lshrrev_b32_e64 v2, 16, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v1, v2
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %elt.hi = lshr i32 %elt.arg, 16
  %elt = trunc i32 %elt.hi to i16
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; Per-workitem variant: the constant 53 is inserted into element 0.
define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_insertelement_v2i16_0_inlineimm:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v1, v2, 53, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2i16_0_inlineimm:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; VI-NEXT: v_or_b32_e32 v2, 53, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2i16_0_inlineimm:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v3, v[0:1]
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; CI-NEXT: v_or_b32_e32 v2, 53, v2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v2i16_0_inlineimm:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 53, v1
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %vecins = insertelement <2 x i16> %vec, i16 53, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; FIXME: fold lshl_or c0, c1, v0 -> or (c0 << c1), v0
define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_insertelement_v2i16_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_movk_i32 s2, 0x3e7
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v1, s2, 16, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2i16_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v2, 0x3e70000
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
;
VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 912; VI-NEXT: flat_store_dword v[0:1], v2 913; VI-NEXT: s_endpgm 914; 915; CI-LABEL: v_insertelement_v2i16_1: 916; CI: ; %bb.0: 917; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 918; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 919; CI-NEXT: s_waitcnt lgkmcnt(0) 920; CI-NEXT: v_mov_b32_e32 v1, s3 921; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 922; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 923; CI-NEXT: flat_load_dword v3, v[0:1] 924; CI-NEXT: v_mov_b32_e32 v1, s1 925; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 926; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 927; CI-NEXT: s_waitcnt vmcnt(0) 928; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 929; CI-NEXT: v_or_b32_e32 v2, 0x3e70000, v2 930; CI-NEXT: flat_store_dword v[0:1], v2 931; CI-NEXT: s_endpgm 932; 933; GFX11-LABEL: v_insertelement_v2i16_1: 934; GFX11: ; %bb.0: 935; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 936; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 937; GFX11-NEXT: s_waitcnt lgkmcnt(0) 938; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 939; GFX11-NEXT: s_waitcnt vmcnt(0) 940; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 941; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 942; GFX11-NEXT: v_lshl_or_b32 v1, 0x3e7, 16, v1 943; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 944; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 945; GFX11-NEXT: s_endpgm 946 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 947 %tid.ext = sext i32 %tid to i64 948 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext 949 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext 950 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep 951 %vecins = insertelement <2 x i16> %vec, i16 999, i32 1 952 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep 953 ret void 954} 955 956define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { 
957; GFX9-LABEL: v_insertelement_v2i16_1_inlineimm: 958; GFX9: ; %bb.0: 959; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 960; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 961; GFX9-NEXT: s_waitcnt lgkmcnt(0) 962; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 963; GFX9-NEXT: s_waitcnt vmcnt(0) 964; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 965; GFX9-NEXT: v_lshl_or_b32 v1, -15, 16, v1 966; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 967; GFX9-NEXT: s_endpgm 968; 969; VI-LABEL: v_insertelement_v2i16_1_inlineimm: 970; VI: ; %bb.0: 971; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 972; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 973; VI-NEXT: s_waitcnt lgkmcnt(0) 974; VI-NEXT: v_mov_b32_e32 v1, s3 975; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 976; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 977; VI-NEXT: flat_load_dword v3, v[0:1] 978; VI-NEXT: v_mov_b32_e32 v1, s1 979; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 980; VI-NEXT: v_mov_b32_e32 v2, 0xfff10000 981; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 982; VI-NEXT: s_waitcnt vmcnt(0) 983; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 984; VI-NEXT: flat_store_dword v[0:1], v2 985; VI-NEXT: s_endpgm 986; 987; CI-LABEL: v_insertelement_v2i16_1_inlineimm: 988; CI: ; %bb.0: 989; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 990; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 991; CI-NEXT: s_waitcnt lgkmcnt(0) 992; CI-NEXT: v_mov_b32_e32 v1, s3 993; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 994; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 995; CI-NEXT: flat_load_dword v3, v[0:1] 996; CI-NEXT: v_mov_b32_e32 v1, s1 997; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 998; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 999; CI-NEXT: s_waitcnt vmcnt(0) 1000; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 1001; CI-NEXT: v_or_b32_e32 v2, 0xfff10000, v2 1002; CI-NEXT: flat_store_dword v[0:1], v2 1003; CI-NEXT: s_endpgm 1004; 1005; GFX11-LABEL: v_insertelement_v2i16_1_inlineimm: 1006; GFX11: ; %bb.0: 1007; GFX11-NEXT: s_load_b128 
s[0:3], s[0:1], 0x0 1008; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1009; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1010; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1011; GFX11-NEXT: s_waitcnt vmcnt(0) 1012; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 1013; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1014; GFX11-NEXT: v_lshl_or_b32 v1, -15, 16, v1 1015; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1016; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1017; GFX11-NEXT: s_endpgm 1018 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1019 %tid.ext = sext i32 %tid to i64 1020 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext 1021 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext 1022 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep 1023 %vecins = insertelement <2 x i16> %vec, i16 -15, i32 1 1024 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep 1025 ret void 1026} 1027 1028define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { 1029; GFX9-LABEL: v_insertelement_v2f16_0: 1030; GFX9: ; %bb.0: 1031; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1032; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1033; GFX9-NEXT: v_mov_b32_e32 v2, 0x4500 1034; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1035; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1036; GFX9-NEXT: s_waitcnt vmcnt(0) 1037; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1038; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 1039; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1040; GFX9-NEXT: s_endpgm 1041; 1042; VI-LABEL: v_insertelement_v2f16_0: 1043; VI: ; %bb.0: 1044; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1045; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1046; VI-NEXT: s_waitcnt lgkmcnt(0) 1047; VI-NEXT: v_mov_b32_e32 v1, s3 1048; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1049; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1050; VI-NEXT: flat_load_dword v3, v[0:1] 1051; VI-NEXT: v_mov_b32_e32 v1, s1 1052; VI-NEXT: v_add_u32_e32 
v0, vcc, s0, v2 1053; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1054; VI-NEXT: s_waitcnt vmcnt(0) 1055; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 1056; VI-NEXT: v_or_b32_e32 v2, 0x4500, v2 1057; VI-NEXT: flat_store_dword v[0:1], v2 1058; VI-NEXT: s_endpgm 1059; 1060; CI-LABEL: v_insertelement_v2f16_0: 1061; CI: ; %bb.0: 1062; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1063; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1064; CI-NEXT: s_waitcnt lgkmcnt(0) 1065; CI-NEXT: v_mov_b32_e32 v1, s3 1066; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1067; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1068; CI-NEXT: flat_load_dword v3, v[0:1] 1069; CI-NEXT: v_mov_b32_e32 v1, s1 1070; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 1071; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1072; CI-NEXT: s_waitcnt vmcnt(0) 1073; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 1074; CI-NEXT: v_or_b32_e32 v2, 0x4500, v2 1075; CI-NEXT: flat_store_dword v[0:1], v2 1076; CI-NEXT: s_endpgm 1077; 1078; GFX11-LABEL: v_insertelement_v2f16_0: 1079; GFX11: ; %bb.0: 1080; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 1081; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1082; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1083; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1084; GFX11-NEXT: s_waitcnt vmcnt(0) 1085; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1086; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1087; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, 0x4500 1088; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1089; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1090; GFX11-NEXT: s_endpgm 1091 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1092 %tid.ext = sext i32 %tid to i64 1093 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext 1094 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext 1095 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep 1096 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0 1097 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep 
1098 ret void 1099} 1100 1101define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { 1102; GFX9-LABEL: v_insertelement_v2f16_0_inlineimm: 1103; GFX9: ; %bb.0: 1104; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1105; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1106; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1107; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1108; GFX9-NEXT: s_waitcnt vmcnt(0) 1109; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1110; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, 53 1111; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1112; GFX9-NEXT: s_endpgm 1113; 1114; VI-LABEL: v_insertelement_v2f16_0_inlineimm: 1115; VI: ; %bb.0: 1116; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1117; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1118; VI-NEXT: s_waitcnt lgkmcnt(0) 1119; VI-NEXT: v_mov_b32_e32 v1, s3 1120; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1121; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1122; VI-NEXT: flat_load_dword v3, v[0:1] 1123; VI-NEXT: v_mov_b32_e32 v1, s1 1124; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1125; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1126; VI-NEXT: s_waitcnt vmcnt(0) 1127; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 1128; VI-NEXT: v_or_b32_e32 v2, 53, v2 1129; VI-NEXT: flat_store_dword v[0:1], v2 1130; VI-NEXT: s_endpgm 1131; 1132; CI-LABEL: v_insertelement_v2f16_0_inlineimm: 1133; CI: ; %bb.0: 1134; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1135; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1136; CI-NEXT: s_waitcnt lgkmcnt(0) 1137; CI-NEXT: v_mov_b32_e32 v1, s3 1138; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1139; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1140; CI-NEXT: flat_load_dword v3, v[0:1] 1141; CI-NEXT: v_mov_b32_e32 v1, s1 1142; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 1143; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1144; CI-NEXT: s_waitcnt vmcnt(0) 1145; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 1146; CI-NEXT: v_or_b32_e32 v2, 53, v2 1147; CI-NEXT: flat_store_dword v[0:1], v2 1148; 
CI-NEXT: s_endpgm 1149; 1150; GFX11-LABEL: v_insertelement_v2f16_0_inlineimm: 1151; GFX11: ; %bb.0: 1152; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 1153; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1154; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1155; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1156; GFX11-NEXT: s_waitcnt vmcnt(0) 1157; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1158; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1159; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, 53 1160; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1161; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1162; GFX11-NEXT: s_endpgm 1163 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1164 %tid.ext = sext i32 %tid to i64 1165 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext 1166 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext 1167 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep 1168 %vecins = insertelement <2 x half> %vec, half 0xH0035, i32 0 1169 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep 1170 ret void 1171} 1172 1173define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { 1174; GFX9-LABEL: v_insertelement_v2f16_1: 1175; GFX9: ; %bb.0: 1176; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1177; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1178; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1179; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1180; GFX9-NEXT: s_movk_i32 s2, 0x4500 1181; GFX9-NEXT: s_waitcnt vmcnt(0) 1182; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 1183; GFX9-NEXT: v_lshl_or_b32 v1, s2, 16, v1 1184; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1185; GFX9-NEXT: s_endpgm 1186; 1187; VI-LABEL: v_insertelement_v2f16_1: 1188; VI: ; %bb.0: 1189; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1190; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1191; VI-NEXT: s_waitcnt lgkmcnt(0) 1192; VI-NEXT: v_mov_b32_e32 v1, s3 1193; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1194; VI-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc 1195; VI-NEXT: flat_load_dword v3, v[0:1] 1196; VI-NEXT: v_mov_b32_e32 v1, s1 1197; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1198; VI-NEXT: v_mov_b32_e32 v2, 0x45000000 1199; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1200; VI-NEXT: s_waitcnt vmcnt(0) 1201; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1202; VI-NEXT: flat_store_dword v[0:1], v2 1203; VI-NEXT: s_endpgm 1204; 1205; CI-LABEL: v_insertelement_v2f16_1: 1206; CI: ; %bb.0: 1207; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1208; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1209; CI-NEXT: s_waitcnt lgkmcnt(0) 1210; CI-NEXT: v_mov_b32_e32 v1, s3 1211; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1212; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1213; CI-NEXT: flat_load_dword v3, v[0:1] 1214; CI-NEXT: v_mov_b32_e32 v1, s1 1215; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 1216; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1217; CI-NEXT: s_waitcnt vmcnt(0) 1218; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 1219; CI-NEXT: v_or_b32_e32 v2, 0x45000000, v2 1220; CI-NEXT: flat_store_dword v[0:1], v2 1221; CI-NEXT: s_endpgm 1222; 1223; GFX11-LABEL: v_insertelement_v2f16_1: 1224; GFX11: ; %bb.0: 1225; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 1226; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1227; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1228; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1229; GFX11-NEXT: s_waitcnt vmcnt(0) 1230; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 1231; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1232; GFX11-NEXT: v_lshl_or_b32 v1, 0x4500, 16, v1 1233; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1234; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1235; GFX11-NEXT: s_endpgm 1236 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1237 %tid.ext = sext i32 %tid to i64 1238 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext 1239 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 
%tid.ext 1240 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep 1241 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1 1242 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep 1243 ret void 1244} 1245 1246define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { 1247; GFX9-LABEL: v_insertelement_v2f16_1_inlineimm: 1248; GFX9: ; %bb.0: 1249; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1250; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1251; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1252; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1253; GFX9-NEXT: s_waitcnt vmcnt(0) 1254; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 1255; GFX9-NEXT: v_lshl_or_b32 v1, 35, 16, v1 1256; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1257; GFX9-NEXT: s_endpgm 1258; 1259; VI-LABEL: v_insertelement_v2f16_1_inlineimm: 1260; VI: ; %bb.0: 1261; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1262; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1263; VI-NEXT: s_waitcnt lgkmcnt(0) 1264; VI-NEXT: v_mov_b32_e32 v1, s3 1265; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1266; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1267; VI-NEXT: flat_load_dword v3, v[0:1] 1268; VI-NEXT: v_mov_b32_e32 v1, s1 1269; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1270; VI-NEXT: v_mov_b32_e32 v2, 0x230000 1271; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1272; VI-NEXT: s_waitcnt vmcnt(0) 1273; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1274; VI-NEXT: flat_store_dword v[0:1], v2 1275; VI-NEXT: s_endpgm 1276; 1277; CI-LABEL: v_insertelement_v2f16_1_inlineimm: 1278; CI: ; %bb.0: 1279; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1280; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1281; CI-NEXT: s_waitcnt lgkmcnt(0) 1282; CI-NEXT: v_mov_b32_e32 v1, s3 1283; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1284; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1285; CI-NEXT: flat_load_dword v3, v[0:1] 1286; CI-NEXT: v_mov_b32_e32 v1, s1 
1287; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 1288; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1289; CI-NEXT: s_waitcnt vmcnt(0) 1290; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 1291; CI-NEXT: v_or_b32_e32 v2, 0x230000, v2 1292; CI-NEXT: flat_store_dword v[0:1], v2 1293; CI-NEXT: s_endpgm 1294; 1295; GFX11-LABEL: v_insertelement_v2f16_1_inlineimm: 1296; GFX11: ; %bb.0: 1297; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 1298; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1299; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1300; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1301; GFX11-NEXT: s_waitcnt vmcnt(0) 1302; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 1303; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1304; GFX11-NEXT: v_lshl_or_b32 v1, 35, 16, v1 1305; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1306; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1307; GFX11-NEXT: s_endpgm 1308 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1309 %tid.ext = sext i32 %tid to i64 1310 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext 1311 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext 1312 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep 1313 %vecins = insertelement <2 x half> %vec, half 0xH0023, i32 1 1314 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep 1315 ret void 1316} 1317 1318; FIXME: Enable for others when argument load not split 1319define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(4)* %idx.ptr) #0 { 1320; GFX9-LABEL: s_insertelement_v2i16_dynamic: 1321; GFX9: ; %bb.0: 1322; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 1323; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1324; GFX9-NEXT: v_mov_b32_e32 v0, 0 1325; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1326; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 1327; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 1328; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1329; GFX9-NEXT: s_lshl_b32 s2, s4, 4 
1330; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2 1331; GFX9-NEXT: s_andn2_b32 s3, s5, s2 1332; GFX9-NEXT: s_and_b32 s2, s2, 0x3e703e7 1333; GFX9-NEXT: s_or_b32 s2, s2, s3 1334; GFX9-NEXT: v_mov_b32_e32 v1, s2 1335; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1336; GFX9-NEXT: s_endpgm 1337; 1338; VI-LABEL: s_insertelement_v2i16_dynamic: 1339; VI: ; %bb.0: 1340; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 1341; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1342; VI-NEXT: s_waitcnt lgkmcnt(0) 1343; VI-NEXT: s_load_dword s4, s[6:7], 0x0 1344; VI-NEXT: s_load_dword s2, s[2:3], 0x0 1345; VI-NEXT: v_mov_b32_e32 v0, s0 1346; VI-NEXT: v_mov_b32_e32 v1, s1 1347; VI-NEXT: s_waitcnt lgkmcnt(0) 1348; VI-NEXT: s_lshl_b32 s0, s4, 4 1349; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 1350; VI-NEXT: s_andn2_b32 s1, s2, s0 1351; VI-NEXT: s_and_b32 s0, s0, 0x3e703e7 1352; VI-NEXT: s_or_b32 s0, s0, s1 1353; VI-NEXT: v_mov_b32_e32 v2, s0 1354; VI-NEXT: flat_store_dword v[0:1], v2 1355; VI-NEXT: s_endpgm 1356; 1357; CI-LABEL: s_insertelement_v2i16_dynamic: 1358; CI: ; %bb.0: 1359; CI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4 1360; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1361; CI-NEXT: s_waitcnt lgkmcnt(0) 1362; CI-NEXT: s_load_dword s4, s[6:7], 0x0 1363; CI-NEXT: s_load_dword s2, s[2:3], 0x0 1364; CI-NEXT: v_mov_b32_e32 v0, s0 1365; CI-NEXT: v_mov_b32_e32 v1, s1 1366; CI-NEXT: s_waitcnt lgkmcnt(0) 1367; CI-NEXT: s_lshl_b32 s0, s4, 4 1368; CI-NEXT: s_lshl_b32 s0, 0xffff, s0 1369; CI-NEXT: s_andn2_b32 s1, s2, s0 1370; CI-NEXT: s_and_b32 s0, s0, 0x3e703e7 1371; CI-NEXT: s_or_b32 s0, s0, s1 1372; CI-NEXT: v_mov_b32_e32 v2, s0 1373; CI-NEXT: flat_store_dword v[0:1], v2 1374; CI-NEXT: s_endpgm 1375; 1376; GFX11-LABEL: s_insertelement_v2i16_dynamic: 1377; GFX11: ; %bb.0: 1378; GFX11-NEXT: s_clause 0x1 1379; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 1380; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 1381; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1382; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 1383; GFX11-NEXT: 
s_load_b32 s2, s[2:3], 0x0 1384; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1385; GFX11-NEXT: s_lshl_b32 s3, s4, 4 1386; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 1387; GFX11-NEXT: s_lshl_b32 s3, 0xffff, s3 1388; GFX11-NEXT: s_and_not1_b32 s2, s2, s3 1389; GFX11-NEXT: s_and_b32 s3, s3, 0x3e703e7 1390; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 1391; GFX11-NEXT: s_or_b32 s2, s3, s2 1392; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 1393; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1394; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1395; GFX11-NEXT: s_endpgm 1396 %idx = load volatile i32, i32 addrspace(4)* %idx.ptr 1397 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr 1398 %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx 1399 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out 1400 ret void 1401} 1402 1403define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %idx) #0 { 1404; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr: 1405; GFX9: ; %bb.0: 1406; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1407; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 1408; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1409; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7 1410; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1411; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1412; GFX9-NEXT: s_lshl_b32 s2, s6, 4 1413; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2 1414; GFX9-NEXT: s_waitcnt vmcnt(0) 1415; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1 1416; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1417; GFX9-NEXT: s_endpgm 1418; 1419; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr: 1420; VI: ; %bb.0: 1421; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1422; VI-NEXT: s_load_dword s4, s[4:5], 0x10 1423; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1424; VI-NEXT: s_waitcnt lgkmcnt(0) 1425; VI-NEXT: v_mov_b32_e32 v1, s3 1426; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1427; VI-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc 1428; VI-NEXT: flat_load_dword v3, v[0:1] 1429; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1430; VI-NEXT: s_lshl_b32 s0, s4, 4 1431; VI-NEXT: v_mov_b32_e32 v1, s1 1432; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 1433; VI-NEXT: v_mov_b32_e32 v2, 0x3e703e7 1434; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1435; VI-NEXT: s_waitcnt vmcnt(0) 1436; VI-NEXT: v_bfi_b32 v2, s0, v2, v3 1437; VI-NEXT: flat_store_dword v[0:1], v2 1438; VI-NEXT: s_endpgm 1439; 1440; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr: 1441; CI: ; %bb.0: 1442; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1443; CI-NEXT: s_load_dword s4, s[4:5], 0x4 1444; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1445; CI-NEXT: s_waitcnt lgkmcnt(0) 1446; CI-NEXT: v_mov_b32_e32 v1, s3 1447; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1448; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1449; CI-NEXT: flat_load_dword v3, v[0:1] 1450; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 1451; CI-NEXT: s_lshl_b32 s0, s4, 4 1452; CI-NEXT: v_mov_b32_e32 v1, s1 1453; CI-NEXT: s_lshl_b32 s0, 0xffff, s0 1454; CI-NEXT: v_mov_b32_e32 v2, 0x3e703e7 1455; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1456; CI-NEXT: s_waitcnt vmcnt(0) 1457; CI-NEXT: v_bfi_b32 v2, s0, v2, v3 1458; CI-NEXT: flat_store_dword v[0:1], v2 1459; CI-NEXT: s_endpgm 1460; 1461; GFX11-LABEL: v_insertelement_v2i16_dynamic_sgpr: 1462; GFX11: ; %bb.0: 1463; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 1464; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1465; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 1466; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1467; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] 1468; GFX11-NEXT: s_lshl_b32 s0, s0, 4 1469; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1470; GFX11-NEXT: s_lshl_b32 s0, 0xffff, s0 1471; GFX11-NEXT: s_waitcnt vmcnt(0) 1472; GFX11-NEXT: v_bfi_b32 v1, s0, 0x3e703e7, v1 1473; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] 1474; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1475; GFX11-NEXT: s_endpgm 1476 %tid = call i32 
@llvm.amdgcn.workitem.id.x() #1 1477 %tid.ext = sext i32 %tid to i64 1478 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext 1479 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext 1480 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep 1481 %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx 1482 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep 1483 ret void 1484} 1485 1486define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { 1487; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr: 1488; GFX9: ; %bb.0: 1489; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 1490; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1491; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1492; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1493; GFX9-NEXT: global_load_dword v1, v0, s[6:7] 1494; GFX9-NEXT: global_load_dword v2, v0, s[2:3] 1495; GFX9-NEXT: s_mov_b32 s2, 0xffff 1496; GFX9-NEXT: s_waitcnt vmcnt(1) 1497; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 1498; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2 1499; GFX9-NEXT: s_mov_b32 s2, 0x12341234 1500; GFX9-NEXT: s_waitcnt vmcnt(0) 1501; GFX9-NEXT: v_bfi_b32 v1, v1, s2, v2 1502; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1503; GFX9-NEXT: s_endpgm 1504; 1505; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr: 1506; VI: ; %bb.0: 1507; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1508; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 1509; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1510; VI-NEXT: s_waitcnt lgkmcnt(0) 1511; VI-NEXT: v_mov_b32_e32 v3, s3 1512; VI-NEXT: v_mov_b32_e32 v1, s5 1513; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 1514; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1515; VI-NEXT: flat_load_dword v4, v[0:1] 1516; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1517; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc 1518; VI-NEXT: flat_load_dword v3, v[0:1] 1519; VI-NEXT: s_mov_b32 s2, 0xffff 1520; 
VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1521; VI-NEXT: v_mov_b32_e32 v1, s1 1522; VI-NEXT: s_mov_b32 s0, 0x12341234 1523; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1524; VI-NEXT: s_waitcnt vmcnt(1) 1525; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v4 1526; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s2 1527; VI-NEXT: s_waitcnt vmcnt(0) 1528; VI-NEXT: v_bfi_b32 v2, v2, s0, v3 1529; VI-NEXT: flat_store_dword v[0:1], v2 1530; VI-NEXT: s_endpgm 1531; 1532; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr: 1533; CI: ; %bb.0: 1534; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1535; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 1536; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1537; CI-NEXT: s_waitcnt lgkmcnt(0) 1538; CI-NEXT: v_mov_b32_e32 v3, s3 1539; CI-NEXT: v_mov_b32_e32 v1, s5 1540; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2 1541; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1542; CI-NEXT: flat_load_dword v4, v[0:1] 1543; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1544; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc 1545; CI-NEXT: flat_load_dword v3, v[0:1] 1546; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 1547; CI-NEXT: v_mov_b32_e32 v1, s1 1548; CI-NEXT: s_mov_b32 s0, 0x12341234 1549; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1550; CI-NEXT: s_waitcnt vmcnt(1) 1551; CI-NEXT: v_lshlrev_b32_e32 v2, 4, v4 1552; CI-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 1553; CI-NEXT: s_waitcnt vmcnt(0) 1554; CI-NEXT: v_bfi_b32 v2, v2, s0, v3 1555; CI-NEXT: flat_store_dword v[0:1], v2 1556; CI-NEXT: s_endpgm 1557; 1558; GFX11-LABEL: v_insertelement_v2f16_dynamic_vgpr: 1559; GFX11: ; %bb.0: 1560; GFX11-NEXT: s_clause 0x1 1561; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 1562; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 1563; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1564; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1565; GFX11-NEXT: s_clause 0x1 1566; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] 1567; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] 1568; GFX11-NEXT: s_waitcnt vmcnt(1) 1569; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v1 1570; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 1571; GFX11-NEXT: v_lshlrev_b32_e64 v1, v1, 0xffff 1572; GFX11-NEXT: s_waitcnt vmcnt(0) 1573; GFX11-NEXT: v_bfi_b32 v1, v1, 0x12341234, v2 1574; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1575; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1576; GFX11-NEXT: s_endpgm 1577 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1578 %tid.ext = sext i32 %tid to i64 1579 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext 1580 %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext 1581 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext 1582 %idx = load i32, i32 addrspace(1)* %idx.gep 1583 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep 1584 %vecins = insertelement <2 x half> %vec, half 0xH1234, i32 %idx 1585 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep 1586 ret void 1587} 1588 1589define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 { 1590; GFX9-LABEL: v_insertelement_v4f16_0: 1591; GFX9: ; %bb.0: 1592; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1593; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 1594; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1595; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff 1596; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1597; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 1598; GFX9-NEXT: s_waitcnt vmcnt(0) 1599; GFX9-NEXT: v_bfi_b32 v0, v3, s6, v0 1600; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1601; GFX9-NEXT: s_endpgm 1602; 1603; VI-LABEL: v_insertelement_v4f16_0: 1604; VI: ; %bb.0: 1605; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1606; VI-NEXT: s_load_dword s4, s[4:5], 0x30 1607; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1608; VI-NEXT: s_waitcnt lgkmcnt(0) 1609; VI-NEXT: v_mov_b32_e32 v1, s3 1610; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1611; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 
1612; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1613; VI-NEXT: v_mov_b32_e32 v3, s1 1614; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1615; VI-NEXT: s_mov_b32 s0, 0xffff 1616; VI-NEXT: v_mov_b32_e32 v4, s4 1617; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1618; VI-NEXT: s_waitcnt vmcnt(0) 1619; VI-NEXT: v_bfi_b32 v0, s0, v4, v0 1620; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1621; VI-NEXT: s_endpgm 1622; 1623; CI-LABEL: v_insertelement_v4f16_0: 1624; CI: ; %bb.0: 1625; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1626; CI-NEXT: s_load_dword s4, s[4:5], 0xc 1627; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1628; CI-NEXT: s_waitcnt lgkmcnt(0) 1629; CI-NEXT: v_mov_b32_e32 v1, s3 1630; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1631; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1632; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1633; CI-NEXT: v_mov_b32_e32 v3, s1 1634; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 1635; CI-NEXT: s_mov_b32 s0, 0xffff 1636; CI-NEXT: v_mov_b32_e32 v4, s4 1637; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1638; CI-NEXT: s_waitcnt vmcnt(0) 1639; CI-NEXT: v_bfi_b32 v0, s0, v4, v0 1640; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1641; CI-NEXT: s_endpgm 1642; 1643; GFX11-LABEL: v_insertelement_v4f16_0: 1644; GFX11: ; %bb.0: 1645; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 1646; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1647; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 1648; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1649; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] 1650; GFX11-NEXT: s_waitcnt vmcnt(0) 1651; GFX11-NEXT: v_bfi_b32 v0, 0xffff, s0, v0 1652; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] 1653; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1654; GFX11-NEXT: s_endpgm 1655 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1656 %tid.ext = sext i32 %tid to i64 1657 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext 1658 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext 1659 %vec = load <4 x half>, 
<4 x half> addrspace(1)* %in.gep 1660 %val.trunc = trunc i32 %val to i16 1661 %val.cvt = bitcast i16 %val.trunc to half 1662 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 0 1663 store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep 1664 ret void 1665} 1666 1667define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 { 1668; GFX9-LABEL: v_insertelement_v4f16_1: 1669; GFX9: ; %bb.0: 1670; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1671; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 1672; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1673; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1674; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 1675; GFX9-NEXT: s_waitcnt vmcnt(0) 1676; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 1677; GFX9-NEXT: v_lshl_or_b32 v0, s6, 16, v0 1678; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1679; GFX9-NEXT: s_endpgm 1680; 1681; VI-LABEL: v_insertelement_v4f16_1: 1682; VI: ; %bb.0: 1683; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1684; VI-NEXT: s_load_dword s4, s[4:5], 0x10 1685; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1686; VI-NEXT: s_waitcnt lgkmcnt(0) 1687; VI-NEXT: v_mov_b32_e32 v1, s3 1688; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1689; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1690; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1691; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1692; VI-NEXT: s_lshl_b32 s0, s4, 16 1693; VI-NEXT: v_mov_b32_e32 v3, s1 1694; VI-NEXT: v_mov_b32_e32 v4, s0 1695; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1696; VI-NEXT: s_waitcnt vmcnt(0) 1697; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1698; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1699; VI-NEXT: s_endpgm 1700; 1701; CI-LABEL: v_insertelement_v4f16_1: 1702; CI: ; %bb.0: 1703; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1704; CI-NEXT: s_load_dword s4, s[4:5], 0x4 1705; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1706; CI-NEXT: s_waitcnt lgkmcnt(0) 
1707; CI-NEXT: v_mov_b32_e32 v1, s3 1708; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1709; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1710; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1711; CI-NEXT: v_mov_b32_e32 v3, s1 1712; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 1713; CI-NEXT: s_lshl_b32 s0, s4, 16 1714; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1715; CI-NEXT: s_waitcnt vmcnt(0) 1716; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 1717; CI-NEXT: v_or_b32_e32 v0, s0, v0 1718; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1719; CI-NEXT: s_endpgm 1720; 1721; GFX11-LABEL: v_insertelement_v4f16_1: 1722; GFX11: ; %bb.0: 1723; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 1724; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1725; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 1726; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1727; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] 1728; GFX11-NEXT: s_waitcnt vmcnt(0) 1729; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 1730; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1731; GFX11-NEXT: v_lshl_or_b32 v0, s0, 16, v0 1732; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] 1733; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1734; GFX11-NEXT: s_endpgm 1735 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1736 %tid.ext = sext i32 %tid to i64 1737 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext 1738 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext 1739 %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep 1740 %val.trunc = trunc i32 %val to i16 1741 %val.cvt = bitcast i16 %val.trunc to half 1742 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 1 1743 store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep 1744 ret void 1745} 1746 1747define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 { 1748; GFX9-LABEL: v_insertelement_v4f16_2: 1749; GFX9: ; %bb.0: 1750; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x0 1751; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 1752; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1753; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff 1754; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1755; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 1756; GFX9-NEXT: s_waitcnt vmcnt(0) 1757; GFX9-NEXT: v_bfi_b32 v1, v3, s6, v1 1758; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1759; GFX9-NEXT: s_endpgm 1760; 1761; VI-LABEL: v_insertelement_v4f16_2: 1762; VI: ; %bb.0: 1763; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1764; VI-NEXT: s_load_dword s4, s[4:5], 0x30 1765; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1766; VI-NEXT: s_waitcnt lgkmcnt(0) 1767; VI-NEXT: v_mov_b32_e32 v1, s3 1768; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1769; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1770; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1771; VI-NEXT: v_mov_b32_e32 v3, s1 1772; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1773; VI-NEXT: s_mov_b32 s0, 0xffff 1774; VI-NEXT: v_mov_b32_e32 v4, s4 1775; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1776; VI-NEXT: s_waitcnt vmcnt(0) 1777; VI-NEXT: v_bfi_b32 v1, s0, v4, v1 1778; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1779; VI-NEXT: s_endpgm 1780; 1781; CI-LABEL: v_insertelement_v4f16_2: 1782; CI: ; %bb.0: 1783; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1784; CI-NEXT: s_load_dword s4, s[4:5], 0xc 1785; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1786; CI-NEXT: s_waitcnt lgkmcnt(0) 1787; CI-NEXT: v_mov_b32_e32 v1, s3 1788; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1789; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1790; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1791; CI-NEXT: v_mov_b32_e32 v3, s1 1792; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 1793; CI-NEXT: s_mov_b32 s0, 0xffff 1794; CI-NEXT: v_mov_b32_e32 v4, s4 1795; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1796; CI-NEXT: s_waitcnt vmcnt(0) 1797; CI-NEXT: v_bfi_b32 v1, s0, v4, v1 1798; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1799; CI-NEXT: s_endpgm 1800; 1801; GFX11-LABEL: v_insertelement_v4f16_2: 1802; GFX11: ; 
%bb.0: 1803; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 1804; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1805; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 1806; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1807; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] 1808; GFX11-NEXT: s_waitcnt vmcnt(0) 1809; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s0, v1 1810; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] 1811; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1812; GFX11-NEXT: s_endpgm 1813 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1814 %tid.ext = sext i32 %tid to i64 1815 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext 1816 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext 1817 %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep 1818 %val.trunc = trunc i32 %val to i16 1819 %val.cvt = bitcast i16 %val.trunc to half 1820 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 2 1821 store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep 1822 ret void 1823} 1824 1825define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 { 1826; GFX9-LABEL: v_insertelement_v4f16_3: 1827; GFX9: ; %bb.0: 1828; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1829; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 1830; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1831; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1832; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 1833; GFX9-NEXT: s_waitcnt vmcnt(0) 1834; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 1835; GFX9-NEXT: v_lshl_or_b32 v1, s6, 16, v1 1836; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1837; GFX9-NEXT: s_endpgm 1838; 1839; VI-LABEL: v_insertelement_v4f16_3: 1840; VI: ; %bb.0: 1841; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1842; VI-NEXT: s_load_dword s4, s[4:5], 0x10 1843; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1844; VI-NEXT: s_waitcnt lgkmcnt(0) 1845; VI-NEXT: v_mov_b32_e32 v1, s3 1846; VI-NEXT: v_add_u32_e32 v0, 
vcc, s2, v2 1847; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1848; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1849; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1850; VI-NEXT: s_lshl_b32 s0, s4, 16 1851; VI-NEXT: v_mov_b32_e32 v3, s1 1852; VI-NEXT: v_mov_b32_e32 v4, s0 1853; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1854; VI-NEXT: s_waitcnt vmcnt(0) 1855; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1856; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1857; VI-NEXT: s_endpgm 1858; 1859; CI-LABEL: v_insertelement_v4f16_3: 1860; CI: ; %bb.0: 1861; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1862; CI-NEXT: s_load_dword s4, s[4:5], 0x4 1863; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1864; CI-NEXT: s_waitcnt lgkmcnt(0) 1865; CI-NEXT: v_mov_b32_e32 v1, s3 1866; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1867; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1868; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1869; CI-NEXT: v_mov_b32_e32 v3, s1 1870; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 1871; CI-NEXT: s_lshl_b32 s0, s4, 16 1872; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1873; CI-NEXT: s_waitcnt vmcnt(0) 1874; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 1875; CI-NEXT: v_or_b32_e32 v1, s0, v1 1876; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1877; CI-NEXT: s_endpgm 1878; 1879; GFX11-LABEL: v_insertelement_v4f16_3: 1880; GFX11: ; %bb.0: 1881; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 1882; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1883; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 1884; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1885; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] 1886; GFX11-NEXT: s_waitcnt vmcnt(0) 1887; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 1888; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1889; GFX11-NEXT: v_lshl_or_b32 v1, s0, 16, v1 1890; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] 1891; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1892; GFX11-NEXT: s_endpgm 1893 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1894 %tid.ext = 
sext i32 %tid to i64 1895 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext 1896 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext 1897 %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep 1898 %val.trunc = trunc i32 %val to i16 1899 %val.cvt = bitcast i16 %val.trunc to half 1900 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 3 1901 store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep 1902 ret void 1903} 1904 1905define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 { 1906; GFX9-LABEL: v_insertelement_v4i16_2: 1907; GFX9: ; %bb.0: 1908; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1909; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 1910; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1911; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff 1912; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1913; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 1914; GFX9-NEXT: s_waitcnt vmcnt(0) 1915; GFX9-NEXT: v_bfi_b32 v1, v3, s6, v1 1916; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1917; GFX9-NEXT: s_endpgm 1918; 1919; VI-LABEL: v_insertelement_v4i16_2: 1920; VI: ; %bb.0: 1921; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1922; VI-NEXT: s_load_dword s4, s[4:5], 0x10 1923; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1924; VI-NEXT: s_waitcnt lgkmcnt(0) 1925; VI-NEXT: v_mov_b32_e32 v1, s3 1926; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1927; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1928; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1929; VI-NEXT: v_mov_b32_e32 v3, s1 1930; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1931; VI-NEXT: s_mov_b32 s0, 0xffff 1932; VI-NEXT: v_mov_b32_e32 v4, s4 1933; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1934; VI-NEXT: s_waitcnt vmcnt(0) 1935; VI-NEXT: v_bfi_b32 v1, s0, v4, v1 1936; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1937; VI-NEXT: s_endpgm 1938; 1939; CI-LABEL: v_insertelement_v4i16_2: 1940; CI: ; %bb.0: 1941; CI-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 1942; CI-NEXT: s_load_dword s4, s[4:5], 0x4 1943; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1944; CI-NEXT: s_waitcnt lgkmcnt(0) 1945; CI-NEXT: v_mov_b32_e32 v1, s3 1946; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1947; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1948; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1949; CI-NEXT: v_mov_b32_e32 v3, s1 1950; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 1951; CI-NEXT: s_mov_b32 s0, 0xffff 1952; CI-NEXT: v_mov_b32_e32 v4, s4 1953; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1954; CI-NEXT: s_waitcnt vmcnt(0) 1955; CI-NEXT: v_bfi_b32 v1, s0, v4, v1 1956; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1957; CI-NEXT: s_endpgm 1958; 1959; GFX11-LABEL: v_insertelement_v4i16_2: 1960; GFX11: ; %bb.0: 1961; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 1962; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1963; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 1964; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1965; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] 1966; GFX11-NEXT: s_waitcnt vmcnt(0) 1967; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s0, v1 1968; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] 1969; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1970; GFX11-NEXT: s_endpgm 1971 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1972 %tid.ext = sext i32 %tid to i64 1973 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext 1974 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext 1975 %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep 1976 %val.trunc = trunc i32 %val to i16 1977 %val.cvt = bitcast i16 %val.trunc to i16 1978 %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 2 1979 store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep 1980 ret void 1981} 1982 1983; FIXME: Better code on CI? 
; Test: insert a 16-bit value into a <4 x i16> at a runtime index held in a VGPR.
; The index is produced by a volatile load through an undef addrspace(1) pointer
; (see the glc load into v2 / v4 in the checks), the vector is loaded from %in and
; stored to %out at the workitem-id offset, and %val is truncated to i16 first.
; Lowering expected by the checks on all four targets - build a 64-bit mask by
; shifting 0xffff left by (index * 16), replicate the low half of %val into both
; halves of one SGPR (s_pack_ll_b32_b16 in the GFX9 and GFX11 checks, s_lshl/s_and/
; s_or in the VI and CI checks), then merge with one v_bfi_b32 per vector dword.
; The check lines are autogenerated by utils/update_llc_test_checks.py; regenerate
; them with that script instead of editing them by hand.
1984define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 { 1985; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr: 1986; GFX9: ; %bb.0: 1987; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1988; GFX9-NEXT: global_load_dword v2, v[0:1], off glc 1989; GFX9-NEXT: s_waitcnt vmcnt(0) 1990; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1991; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 1992; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1993; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] 1994; GFX9-NEXT: s_mov_b64 s[2:3], 0xffff 1995; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 1996; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] 1997; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s4 1998; GFX9-NEXT: s_waitcnt vmcnt(0) 1999; GFX9-NEXT: v_bfi_b32 v1, v3, s2, v1 2000; GFX9-NEXT: v_bfi_b32 v0, v2, s2, v0 2001; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 2002; GFX9-NEXT: s_endpgm 2003; 2004; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr: 2005; VI: ; %bb.0: 2006; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2007; VI-NEXT: flat_load_dword v4, v[0:1] glc 2008; VI-NEXT: s_waitcnt vmcnt(0) 2009; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 2010; VI-NEXT: s_load_dword s4, s[4:5], 0x10 2011; VI-NEXT: s_waitcnt lgkmcnt(0) 2012; VI-NEXT: v_mov_b32_e32 v1, s3 2013; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2014; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2015; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 2016; VI-NEXT: s_mov_b64 s[2:3], 0xffff 2017; VI-NEXT: v_mov_b32_e32 v3, s1 2018; VI-NEXT: s_lshl_b32 s1, s4, 16 2019; VI-NEXT: s_and_b32 s4, s4, 0xffff 2020; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 2021; VI-NEXT: s_or_b32 s0, s4, s1 2022; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 2023; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 2024; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[2:3] 2025; VI-NEXT: s_waitcnt vmcnt(0) 2026; VI-NEXT: v_bfi_b32 v1, v5, s0, v1 2027; VI-NEXT: v_bfi_b32 v0, v4, s0, v0 2028; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 2029; VI-NEXT: 
s_endpgm 2030; 2031; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr: 2032; CI: ; %bb.0: 2033; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2034; CI-NEXT: flat_load_dword v4, v[0:1] glc 2035; CI-NEXT: s_waitcnt vmcnt(0) 2036; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 2037; CI-NEXT: s_load_dword s4, s[4:5], 0x4 2038; CI-NEXT: s_waitcnt lgkmcnt(0) 2039; CI-NEXT: v_mov_b32_e32 v1, s3 2040; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 2041; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2042; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 2043; CI-NEXT: s_mov_b64 s[2:3], 0xffff 2044; CI-NEXT: v_mov_b32_e32 v3, s1 2045; CI-NEXT: s_lshl_b32 s1, s4, 16 2046; CI-NEXT: s_and_b32 s4, s4, 0xffff 2047; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 2048; CI-NEXT: s_or_b32 s0, s4, s1 2049; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 2050; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 2051; CI-NEXT: v_lshl_b64 v[4:5], s[2:3], v4 2052; CI-NEXT: s_waitcnt vmcnt(0) 2053; CI-NEXT: v_bfi_b32 v1, v5, s0, v1 2054; CI-NEXT: v_bfi_b32 v0, v4, s0, v0 2055; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 2056; CI-NEXT: s_endpgm 2057; 2058; GFX11-LABEL: v_insertelement_v4i16_dynamic_vgpr: 2059; GFX11: ; %bb.0: 2060; GFX11-NEXT: s_clause 0x1 2061; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 2062; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x10 2063; GFX11-NEXT: global_load_b32 v2, v[0:1], off glc dlc 2064; GFX11-NEXT: s_waitcnt vmcnt(0) 2065; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 2066; GFX11-NEXT: s_mov_b64 s[0:1], 0xffff 2067; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2068; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] 2069; GFX11-NEXT: v_lshlrev_b32_e32 v2, 4, v2 2070; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 2071; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, s[0:1] 2072; GFX11-NEXT: s_pack_ll_b32_b16 s0, s2, s2 2073; GFX11-NEXT: s_waitcnt vmcnt(0) 2074; GFX11-NEXT: v_bfi_b32 v1, v3, s0, v1 2075; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 2076; GFX11-NEXT: v_bfi_b32 v0, v2, s0, v0 2077; GFX11-NEXT: 
global_store_b64 v4, v[0:1], s[4:5] 2078; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2079; GFX11-NEXT: s_endpgm 2080 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 2081 %tid.ext = sext i32 %tid to i64 2082 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext 2083 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext 2084 %idx.val = load volatile i32, i32 addrspace(1)* undef 2085 %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep 2086 %val.trunc = trunc i32 %val to i16 2087 %val.cvt = bitcast i16 %val.trunc to i16 2088 %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 %idx.val 2089 store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep 2090 ret void 2091} 2092 2093define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val, i32 %idxval) #0 { 2094; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr: 2095; GFX9: ; %bb.0: 2096; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2097; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 2098; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 2099; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2100; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 2101; GFX9-NEXT: s_mov_b64 s[2:3], 0xffff 2102; GFX9-NEXT: s_lshl_b32 s4, s7, 4 2103; GFX9-NEXT: s_pack_ll_b32_b16 s5, s6, s6 2104; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 2105; GFX9-NEXT: v_mov_b32_e32 v3, s5 2106; GFX9-NEXT: v_mov_b32_e32 v4, s5 2107; GFX9-NEXT: s_waitcnt vmcnt(0) 2108; GFX9-NEXT: v_bfi_b32 v1, s3, v3, v1 2109; GFX9-NEXT: v_bfi_b32 v0, s2, v4, v0 2110; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 2111; GFX9-NEXT: s_endpgm 2112; 2113; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr: 2114; VI: ; %bb.0: 2115; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2116; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 2117; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 2118; VI-NEXT: s_waitcnt lgkmcnt(0) 2119; VI-NEXT: v_mov_b32_e32 v1, s3 2120; VI-NEXT: 
v_add_u32_e32 v0, vcc, s2, v2 2121; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2122; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 2123; VI-NEXT: s_mov_b64 s[2:3], 0xffff 2124; VI-NEXT: v_mov_b32_e32 v3, s1 2125; VI-NEXT: s_lshl_b32 s1, s5, 4 2126; VI-NEXT: s_lshl_b32 s5, s4, 16 2127; VI-NEXT: s_and_b32 s4, s4, 0xffff 2128; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 2129; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], s1 2130; VI-NEXT: s_or_b32 s2, s4, s5 2131; VI-NEXT: v_mov_b32_e32 v4, s2 2132; VI-NEXT: v_mov_b32_e32 v5, s2 2133; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 2134; VI-NEXT: s_waitcnt vmcnt(0) 2135; VI-NEXT: v_bfi_b32 v1, s1, v4, v1 2136; VI-NEXT: v_bfi_b32 v0, s0, v5, v0 2137; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 2138; VI-NEXT: s_endpgm 2139; 2140; CI-LABEL: v_insertelement_v4f16_dynamic_sgpr: 2141; CI: ; %bb.0: 2142; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2143; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 2144; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 2145; CI-NEXT: s_waitcnt lgkmcnt(0) 2146; CI-NEXT: v_mov_b32_e32 v1, s3 2147; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 2148; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2149; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 2150; CI-NEXT: s_mov_b64 s[2:3], 0xffff 2151; CI-NEXT: v_mov_b32_e32 v3, s1 2152; CI-NEXT: s_and_b32 s6, s4, 0xffff 2153; CI-NEXT: s_lshl_b32 s1, s5, 4 2154; CI-NEXT: s_lshl_b32 s4, s4, 16 2155; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 2156; CI-NEXT: s_lshl_b64 s[0:1], s[2:3], s1 2157; CI-NEXT: s_or_b32 s2, s6, s4 2158; CI-NEXT: v_mov_b32_e32 v4, s2 2159; CI-NEXT: v_mov_b32_e32 v5, s2 2160; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 2161; CI-NEXT: s_waitcnt vmcnt(0) 2162; CI-NEXT: v_bfi_b32 v1, s1, v4, v1 2163; CI-NEXT: v_bfi_b32 v0, s0, v5, v0 2164; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 2165; CI-NEXT: s_endpgm 2166; 2167; GFX11-LABEL: v_insertelement_v4f16_dynamic_sgpr: 2168; GFX11: ; %bb.0: 2169; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 2170; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 2171; 
GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 2172; GFX11-NEXT: s_mov_b64 s[2:3], 0xffff 2173; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2174; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] 2175; GFX11-NEXT: s_lshl_b32 s1, s1, 4 2176; GFX11-NEXT: s_pack_ll_b32_b16 s6, s0, s0 2177; GFX11-NEXT: s_lshl_b64 s[0:1], s[2:3], s1 2178; GFX11-NEXT: s_waitcnt vmcnt(0) 2179; GFX11-NEXT: v_bfi_b32 v1, s1, s6, v1 2180; GFX11-NEXT: v_bfi_b32 v0, s0, s6, v0 2181; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] 2182; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2183; GFX11-NEXT: s_endpgm 2184 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 2185 %tid.ext = sext i32 %tid to i64 2186 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext 2187 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext 2188 %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep 2189 %val.trunc = trunc i32 %val to i16 2190 %val.cvt = bitcast i16 %val.trunc to half 2191 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 %idxval 2192 store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep 2193 ret void 2194} 2195 2196define amdgpu_kernel void @v_insertelement_v8f16_3(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in, i32 %val) { 2197; GFX9-LABEL: v_insertelement_v8f16_3: 2198; GFX9: ; %bb.0: 2199; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2200; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 2201; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 2202; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2203; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] 2204; GFX9-NEXT: s_waitcnt vmcnt(0) 2205; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 2206; GFX9-NEXT: v_lshl_or_b32 v1, s6, 16, v1 2207; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2208; GFX9-NEXT: s_endpgm 2209; 2210; VI-LABEL: v_insertelement_v8f16_3: 2211; VI: ; %bb.0: 2212; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2213; VI-NEXT: s_load_dword s4, s[4:5], 0x10 2214; VI-NEXT: v_lshlrev_b32_e32 
v4, 4, v0 2215; VI-NEXT: s_waitcnt lgkmcnt(0) 2216; VI-NEXT: v_mov_b32_e32 v1, s3 2217; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 2218; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2219; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2220; VI-NEXT: v_mov_b32_e32 v5, s1 2221; VI-NEXT: s_lshl_b32 s1, s4, 16 2222; VI-NEXT: s_mov_b32 s2, 0xffff 2223; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 2224; VI-NEXT: v_mov_b32_e32 v6, s1 2225; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 2226; VI-NEXT: s_waitcnt vmcnt(0) 2227; VI-NEXT: v_bfi_b32 v3, s2, v3, v3 2228; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2229; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2230; VI-NEXT: s_endpgm 2231; 2232; CI-LABEL: v_insertelement_v8f16_3: 2233; CI: ; %bb.0: 2234; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2235; CI-NEXT: s_load_dword s4, s[4:5], 0x4 2236; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 2237; CI-NEXT: s_waitcnt lgkmcnt(0) 2238; CI-NEXT: v_mov_b32_e32 v1, s3 2239; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 2240; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2241; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2242; CI-NEXT: v_mov_b32_e32 v5, s1 2243; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4 2244; CI-NEXT: s_lshl_b32 s0, s4, 16 2245; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 2246; CI-NEXT: s_waitcnt vmcnt(0) 2247; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 2248; CI-NEXT: v_or_b32_e32 v1, s0, v1 2249; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2250; CI-NEXT: s_endpgm 2251; 2252; GFX11-LABEL: v_insertelement_v8f16_3: 2253; GFX11: ; %bb.0: 2254; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 2255; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 2256; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 2257; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2258; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] 2259; GFX11-NEXT: s_waitcnt vmcnt(0) 2260; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 2261; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2262; GFX11-NEXT: v_lshl_or_b32 v1, s0, 16, v1 2263; 
GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] 2264; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2265; GFX11-NEXT: s_endpgm 2266 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 2267 %tid.ext = sext i32 %tid to i64 2268 %in.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %in, i64 %tid.ext 2269 %out.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %out, i64 %tid.ext 2270 %vec = load <8 x half>, <8 x half> addrspace(1)* %in.gep 2271 %val.trunc = trunc i32 %val to i16 2272 %val.cvt = bitcast i16 %val.trunc to half 2273 %vecins = insertelement <8 x half> %vec, half %val.cvt, i32 3 2274 store <8 x half> %vecins, <8 x half> addrspace(1)* %out.gep 2275 ret void 2276} 2277 2278define amdgpu_kernel void @v_insertelement_v8i16_6(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in, i32 %val) { 2279; GFX9-LABEL: v_insertelement_v8i16_6: 2280; GFX9: ; %bb.0: 2281; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2282; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 2283; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0 2284; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff 2285; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2286; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] 2287; GFX9-NEXT: s_waitcnt vmcnt(0) 2288; GFX9-NEXT: v_bfi_b32 v3, v5, s6, v3 2289; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2290; GFX9-NEXT: s_endpgm 2291; 2292; VI-LABEL: v_insertelement_v8i16_6: 2293; VI: ; %bb.0: 2294; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2295; VI-NEXT: s_load_dword s4, s[4:5], 0x10 2296; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 2297; VI-NEXT: s_waitcnt lgkmcnt(0) 2298; VI-NEXT: v_mov_b32_e32 v1, s3 2299; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 2300; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2301; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2302; VI-NEXT: s_mov_b32 s2, 0xffff 2303; VI-NEXT: v_mov_b32_e32 v5, s1 2304; VI-NEXT: v_mov_b32_e32 v6, s4 2305; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 2306; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 2307; VI-NEXT: s_waitcnt 
vmcnt(0) 2308; VI-NEXT: v_bfi_b32 v3, s2, v6, v3 2309; VI-NEXT: v_bfi_b32 v1, s2, v1, v1 2310; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2311; VI-NEXT: s_endpgm 2312; 2313; CI-LABEL: v_insertelement_v8i16_6: 2314; CI: ; %bb.0: 2315; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 2316; CI-NEXT: s_load_dword s4, s[4:5], 0x4 2317; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 2318; CI-NEXT: s_waitcnt lgkmcnt(0) 2319; CI-NEXT: v_mov_b32_e32 v1, s3 2320; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 2321; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2322; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2323; CI-NEXT: v_mov_b32_e32 v5, s1 2324; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4 2325; CI-NEXT: s_mov_b32 s0, 0xffff 2326; CI-NEXT: v_mov_b32_e32 v6, s4 2327; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 2328; CI-NEXT: s_waitcnt vmcnt(0) 2329; CI-NEXT: v_bfi_b32 v3, s0, v6, v3 2330; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2331; CI-NEXT: s_endpgm 2332; 2333; GFX11-LABEL: v_insertelement_v8i16_6: 2334; GFX11: ; %bb.0: 2335; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 2336; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 2337; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 2338; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2339; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] 2340; GFX11-NEXT: s_waitcnt vmcnt(0) 2341; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s0, v3 2342; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] 2343; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2344; GFX11-NEXT: s_endpgm 2345 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 2346 %tid.ext = sext i32 %tid to i64 2347 %in.gep = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %in, i64 %tid.ext 2348 %out.gep = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %out, i64 %tid.ext 2349 %vec = load <8 x i16>, <8 x i16> addrspace(1)* %in.gep 2350 %val.trunc = trunc i32 %val to i16 2351 %val.cvt = bitcast i16 %val.trunc to i16 2352 %vecins = insertelement <8 x i16> %vec, i16 %val.cvt, i32 6 2353 store <8 x i16> %vecins, <8 x i16> addrspace(1)* 
%out.gep
  ret void
}

; Runtime-index insert into an <8 x half> vector: each workitem loads its
; vector from %in, overwrites element %n with the low 16 bits of %val
; (reinterpreted as half), and stores the result to %out.
define amdgpu_kernel void @v_insertelement_v8f16_dynamic(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in, i32 %val, i32 %n) {
; GFX9-LABEL: v_insertelement_v8f16_dynamic:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
; GFX9-NEXT:    s_cmp_eq_u32 s7, 7
; GFX9-NEXT:    v_mov_b32_e32 v5, s6
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s7, 6
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v5, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s7, 5
; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s7, 4
; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v5, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s7, 3
; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v3
; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s7, 2
; GFX9-NEXT:    v_lshl_or_b32 v3, v6, 16, v3
; GFX9-NEXT:    v_cndmask_b32_e32 v6, v8, v5, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s7, 1
; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s7, 0
; GFX9-NEXT:    v_lshl_or_b32 v2, v7, 16, v2
; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v5, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v1
; GFX9-NEXT:    v_lshl_or_b32 v0, v7, 16, v0
; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_insertelement_v8f16_dynamic:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x10
; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT:    s_cmp_eq_u32 s5, 6
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    v_mov_b32_e32 v6, s4
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s5, 7
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v7, v3, v6, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s5, 4
; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s5, 5
; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
; VI-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s5, 2
; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT:    v_cndmask_b32_e32 v8, v8, v6, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s5, 3
; VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
; VI-NEXT:    v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s5, 0
; VI-NEXT:    v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_cndmask_b32_e32 v7, v9, v6, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s5, 1
; VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
; VI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
; VI-NEXT:    v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: v_insertelement_v8f16_dynamic:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x4
; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT:    v_mov_b32_e32 v5, s1
; CI-NEXT:    v_add_i32_e32 v4, vcc, s0, v4
; CI-NEXT:    v_cvt_f32_f16_e32 v6, s4
; CI-NEXT:    s_cmp_eq_u32 s5, 7
; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 6
; CI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 5
; CI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 4
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
; CI-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; CI-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
; CI-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
; CI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 3
; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; CI-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 2
; CI-NEXT:    v_cndmask_b32_e32 v9, v9, v6, vcc
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 1
; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 0
; CI-NEXT:    v_cndmask_b32_e64 v8, v8, v6, s[2:3]
; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
; CI-NEXT:    v_cndmask_b32_e32 v10, v10, v6, vcc
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
; CI-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
; CI-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
; CI-NEXT:    v_or_b32_e32 v3, v3, v6
; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v10
; CI-NEXT:    v_or_b32_e32 v2, v2, v7
; CI-NEXT:    v_or_b32_e32 v1, v1, v8
; CI-NEXT:    v_or_b32_e32 v0, v0, v6
; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: v_insertelement_v8f16_dynamic:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x10
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[6:7]
; GFX11-NEXT:    s_cmp_eq_u32 s1, 7
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s1, 6
; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s1, 5
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s0, s3
; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s1, 4
; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, s0, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s1, 3
; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, s0, s3
; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s1, 2
; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s0, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s1, 1
; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
; GFX11-NEXT:    s_cselect_b32 s6, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s1, 0
; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
; GFX11-NEXT:    s_cselect_b32 s1, -1, 0
; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s0, s2
; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s0, s1
; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, s0, s3
; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX11-NEXT:    v_cndmask_b32_e64 v8, v8, s0, s6
; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT:    v_lshl_or_b32 v3, v5, 16, v3
; GFX11-NEXT:    v_lshl_or_b32 v2, v6, 16, v2
; GFX11-NEXT:    v_lshl_or_b32 v1, v7, 16, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT:    v_lshl_or_b32 v0, v8, 16, v0
; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %out, i64 %tid.ext
  %vec = load <8 x half>, <8 x half> addrspace(1)* %in.gep
  %val.trunc = trunc i32 %val to i16
  %val.cvt = bitcast i16 %val.trunc to half
  %vecins = insertelement <8 x half> %vec, half %val.cvt, i32 %n
  store <8 x half> %vecins, <8 x half> addrspace(1)* %out.gep
  ret void
}

; Constant-index insert into a <16 x half> vector: each workitem loads its
; vector from %in, overwrites element 3 with the low 16 bits of %val
; (reinterpreted as half), and stores the result to %out.
define amdgpu_kernel void @v_insertelement_v16f16_3(<16 x half> addrspace(1)* %out, <16 x half> addrspace(1)* %in, i32 %val) {
; GFX9-LABEL: v_insertelement_v16f16_3:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT:    v_lshl_or_b32 v1, s6, 16, v1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_insertelement_v16f16_3:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v8
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT:    v_mov_b32_e32 v9, s1
; VI-NEXT:    v_add_u32_e32 v8, vcc, s0, v8
; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
; VI-NEXT:    s_lshl_b32 s1, s4, 16
; VI-NEXT:    v_add_u32_e32 v10, vcc, 16, v8
; VI-NEXT:    v_mov_b32_e32 v12, s1
; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_or_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: v_insertelement_v16f16_3:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
; CI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s3
; CI-NEXT:    v_add_i32_e32 v4, vcc, s2, v8
; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v0, vcc
; CI-NEXT:    flat_load_dwordx4 v[0:3], v[4:5]
; CI-NEXT:    v_add_i32_e32 v4, vcc, 16, v4
; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; CI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; CI-NEXT:    v_mov_b32_e32 v9, s1
; CI-NEXT:    v_add_i32_e32 v8, vcc, s0, v8
; CI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
; CI-NEXT:    v_add_i32_e32 v10, vcc, 16, v8
; CI-NEXT:    s_lshl_b32 s1, s4, 16
; CI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; CI-NEXT:    v_or_b32_e32 v1, s1, v1
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
; CI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: v_insertelement_v16f16_3:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x10
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b128 v[0:3], v8, s[6:7]
; GFX11-NEXT:    global_load_b128 v[4:7], v8, s[6:7] offset:16
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[4:5] offset:16
; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[4:5]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <16 x half>, <16 x half> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <16 x half>, <16 x half> addrspace(1)* %out, i64 %tid.ext
  %vec = load <16 x half>, <16 x half> addrspace(1)* %in.gep
  %val.trunc = trunc i32 %val to i16
  %val.cvt = bitcast i16 %val.trunc to half
  %vecins = insertelement <16 x half> %vec, half %val.cvt, i32 3
  store <16 x half> %vecins, <16 x half> addrspace(1)* %out.gep
  ret void
}

; Constant-index insert into a <16 x i16> vector: each workitem loads its
; vector from %in, overwrites element 6 with the low 16 bits of %val, and
; stores the result to %out. The element type is already i16, so the truncated
; value is inserted directly with no intermediate cast.
define amdgpu_kernel void @v_insertelement_v16i16_6(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %in, i32 %val) {
; GFX9-LABEL: v_insertelement_v16i16_6:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
; GFX9-NEXT:    v_mov_b32_e32 v9, 0xffff
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_bfi_b32 v3, v9, s6, v3
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_insertelement_v16i16_6:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v8
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT:    v_mov_b32_e32 v9, s1
; VI-NEXT:    v_add_u32_e32 v8, vcc, s0, v8
; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
; VI-NEXT:    v_add_u32_e32 v10, vcc, 16, v8
; VI-NEXT:    s_mov_b32 s2, 0xffff
; VI-NEXT:    v_mov_b32_e32 v12, s4
; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_bfi_b32 v3, s2, v12, v3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: v_insertelement_v16i16_6:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
; CI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v8
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    v_add_i32_e32 v4, vcc, 16, v0
; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; CI-NEXT:    v_mov_b32_e32 v9, s1
; CI-NEXT:    v_add_i32_e32 v8, vcc, s0, v8
; CI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
; CI-NEXT:    v_add_i32_e32 v10, vcc, 16, v8
; CI-NEXT:    s_mov_b32 s2, 0xffff
; CI-NEXT:    v_mov_b32_e32 v12, s4
; CI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    v_bfi_b32 v3, s2, v12, v3
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
; CI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: v_insertelement_v16i16_6:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x10
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b128 v[0:3], v8, s[6:7]
; GFX11-NEXT:    global_load_b128 v[4:7], v8, s[6:7] offset:16
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_bfi_b32 v3, 0xffff, s0, v3
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[4:5] offset:16
; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[4:5]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <16 x i16>, <16 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <16 x i16>, <16 x i16> addrspace(1)* %out, i64 %tid.ext
  %vec = load <16 x i16>, <16 x i16> addrspace(1)* %in.gep
  %val.trunc = trunc i32 %val to i16
  %vecins = insertelement <16 x i16> %vec, i16 %val.trunc, i32 6
  store <16 x i16> %vecins, <16 x i16> addrspace(1)* %out.gep
  ret void
}

; Runtime-index insert into a <16 x half> vector: each workitem loads its
; vector from %in, overwrites element %n with the low 16 bits of %val
; (reinterpreted as half), and stores the result to %out.
define amdgpu_kernel void @v_insertelement_v16f16_dynamic(<16 x half> addrspace(1)* %out, <16 x half> addrspace(1)* %in, i32 %val, i32 %n) {
; GFX9-LABEL: v_insertelement_v16f16_dynamic:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
; GFX9-NEXT:    s_cmp_eq_u32 s7, 7
; GFX9-NEXT:    v_mov_b32_e32 v9, s6
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s7, 6
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s7, 5
; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s7, 4
; GFX9-NEXT:    v_cndmask_b32_e32 v11, v11, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v9, vcc
; GFX9-NEXT:    s_cmp_eq_u32 s7, 3
; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v1
; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s7, 2
; GFX9-NEXT:    v_lshl_or_b32 v2, v11, 16, v2
; GFX9-NEXT:    v_cndmask_b32_e32 v11, v12, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s7, 1
; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s7, 0
; GFX9-NEXT:    v_cndmask_b32_e32 v12, v13, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s7, 15
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s7, 14
; GFX9-NEXT:    v_cndmask_b32_e32 v13, v14, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s7, 13
; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v6
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s7, 12
; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v3
; GFX9-NEXT:    v_lshl_or_b32 v0, v12, 16, v0
; GFX9-NEXT:    v_cndmask_b32_e32 v12, v15, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s7, 11
; GFX9-NEXT:    v_lshl_or_b32 v3, v10, 16, v3
; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s7, 10
; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s7, 9
; GFX9-NEXT:    v_lshl_or_b32 v1, v11, 16, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v4
; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    s_cmp_eq_u32 s7, 8
; GFX9-NEXT:    v_cndmask_b32_e32 v11, v11, v9, vcc
; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff, v7
; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff, v6
; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff, v5
; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff, v4
; GFX9-NEXT:    v_lshl_or_b32 v7, v13, 16, v7
; GFX9-NEXT:    v_lshl_or_b32 v6, v12, 16, v6
; GFX9-NEXT:    v_lshl_or_b32 v5, v10, 16, v5
; GFX9-NEXT:    v_lshl_or_b32 v4, v11, 16, v4
; GFX9-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_insertelement_v16f16_dynamic:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_add_u32_e32 v4, vcc, s2, v8
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v0, vcc
; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT:    v_mov_b32_e32 v9, s1
; VI-NEXT:    v_add_u32_e32 v8, vcc, s0, v8
; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
; VI-NEXT:    v_add_u32_e32 v10, vcc, 16, v8
; VI-NEXT:    s_cmp_eq_u32 s7, 14
; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
; VI-NEXT:    v_mov_b32_e32 v12, s6
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 15
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cndmask_b32_e32 v13, v3, v12, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 12
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 13
; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
; VI-NEXT:    v_cndmask_b32_e64 v2, v2, v12, s[0:1]
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 10
; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 11
; VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v1
; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v12, s[2:3]
; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 8
; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v15, v15, v12, s[2:3]
; VI-NEXT:    s_cmp_eq_u32 s7, 9
; VI-NEXT:    v_lshrrev_b32_e32 v16, 16, v0
; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v12, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 6
; VI-NEXT:    v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_cndmask_b32_e32 v15, v16, v12, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 7
; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s[4:5]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT:    v_cndmask_b32_e64 v14, v14, v12, s[0:1]
; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
; VI-NEXT:    v_cndmask_b32_e32 v7, v7, v12, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 4
; VI-NEXT:    v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
; VI-NEXT:    v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_cndmask_b32_e32 v15, v17, v12, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 5
; VI-NEXT:    v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v6
; VI-NEXT:    v_cndmask_b32_e32 v6, v6, v12, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 2
; VI-NEXT:    v_cndmask_b32_e32 v13, v13, v12, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 3
; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v12, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 0
; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
; VI-NEXT:    v_cndmask_b32_e32 v14, v14, v12, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_cmp_eq_u32 s7, 1
; VI-NEXT:    v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v4
; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    v_cndmask_b32_e32 v12, v13, v12, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
; VI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
; VI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
; VI-NEXT:    v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v5, v5, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; VI-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: v_insertelement_v16f16_dynamic:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x4
; CI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s3
; CI-NEXT:    v_add_i32_e32 v4, vcc, s2, v8
; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v0, vcc
; CI-NEXT:    v_add_i32_e32 v0, vcc, 16, v4
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; CI-NEXT:    v_mov_b32_e32 v9, s1
; CI-NEXT:    v_add_i32_e32 v8, vcc, s0, v8
; CI-NEXT:    v_cvt_f32_f16_e32 v10, s4
; CI-NEXT:    s_cmp_eq_u32 s5, 15
; CI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 14
; CI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 13
; CI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 12
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; CI-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
; CI-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
; CI-NEXT:    v_cvt_f32_f16_e32 v12, v12
; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; CI-NEXT:    v_cvt_f32_f16_e32 v13, v13
; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; CI-NEXT:    v_cndmask_b32_e64 v3, v3, v10, s[0:1]
; CI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 11
; CI-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc
; CI-NEXT:    v_cndmask_b32_e64 v12, v12, v10, s[2:3]
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 10
; CI-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[0:1]
; CI-NEXT:    v_cndmask_b32_e32 v13, v13, v10, vcc
; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
; CI-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
; CI-NEXT:    v_or_b32_e32 v2, v2, v12
; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
; CI-NEXT:    v_or_b32_e32 v1, v1, v12
; CI-NEXT:    v_cvt_f32_f16_e32 v12, v14
; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
; CI-NEXT:    v_cvt_f32_f16_e32 v13, v15
; CI-NEXT:    s_cmp_eq_u32 s5, 9
; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
; CI-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 8
; CI-NEXT:    v_cvt_f32_f16_e32 v14, v16
; CI-NEXT:    v_cndmask_b32_e32 v12, v12, v10, vcc
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 7
; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 6
; CI-NEXT:    v_cndmask_b32_e32 v13, v13, v10, vcc
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 5
; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
; CI-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 4
; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
; CI-NEXT:    v_cndmask_b32_e32 v14, v14, v10, vcc
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
; CI-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
; CI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
; CI-NEXT:    v_or_b32_e32 v3, v3, v11
; CI-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
; CI-NEXT:    v_or_b32_e32 v0, v0, v12
; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
; CI-NEXT:    v_or_b32_e32 v7, v7, v12
; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; CI-NEXT:    v_or_b32_e32 v6, v6, v12
; CI-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
; CI-NEXT:    s_cmp_eq_u32 s5, 3
; CI-NEXT:    v_cvt_f32_f16_e32 v12, v12
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 2
; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; CI-NEXT:    v_cndmask_b32_e32 v11, v11, v10, vcc
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 1
; CI-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    s_cmp_eq_u32 s5, 0
; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
; CI-NEXT:    v_cndmask_b32_e32 v12, v12, v10, vcc
; CI-NEXT:    s_cselect_b64 vcc, -1, 0
; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
; CI-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
; CI-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
; CI-NEXT:    v_or_b32_e32 v5, v5, v10
; CI-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
; CI-NEXT:    v_or_b32_e32 v4, v4, v10
; CI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; CI-NEXT:    s_nop 0
; CI-NEXT:    v_add_i32_e32 v4, vcc, 16, v8
; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v9, vcc
; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT:    s_endpgm
;
; GFX11-LABEL: v_insertelement_v16f16_dynamic:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x10
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b128 v[0:3], v8, s[6:7]
; GFX11-NEXT:    global_load_b128 v[4:7], v8, s[6:7] offset:16
; GFX11-NEXT:    s_cmp_eq_u32 s1, 7
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s1, 6
; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s1, 5
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s0, s3
; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s1, 4
; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
; GFX11-NEXT:    v_cndmask_b32_e64 v9, v9, s0, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s1, 3
; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s0, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s1, 2
; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
; GFX11-NEXT:    v_cndmask_b32_e64 v11, v11, s0, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s1, 1
; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s0, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s1, 0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v7
; GFX11-NEXT:    v_cndmask_b32_e64 v10, v10, s0, s3
; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX11-NEXT:    v_lshl_or_b32 v3, v9, 16, v3
; GFX11-NEXT:    v_cndmask_b32_e64 v9, v12, s0, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s1, 15
; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s0, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s1, 14
; GFX11-NEXT:    v_lshl_or_b32 v2, v10, 16, v2
; GFX11-NEXT:    v_cndmask_b32_e64 v10, v13, s0, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s1, 13
; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, s0, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s1, 12
; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s1, 11
; GFX11-NEXT:    v_cndmask_b32_e64 v12, v14, s0, s2
; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s1, 10
; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, s0, s3
; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s1, 9
; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
; GFX11-NEXT:    s_cselect_b32 s6, -1, 0
; GFX11-NEXT:    s_cmp_eq_u32 s1, 8
; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, s0, s3
; GFX11-NEXT:    s_cselect_b32 s1, -1, 0
; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, s0, s1
; GFX11-NEXT:    v_cndmask_b32_e64 v13, v15, s0, s2
; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
; GFX11-NEXT:    v_cndmask_b32_e64 v14, v16, s0, s6
; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT:    v_lshl_or_b32 v7, v10, 16, v7
; GFX11-NEXT:    v_lshl_or_b32 v6, v12, 16, v6
; GFX11-NEXT:    v_lshl_or_b32 v5, v13, 16, v5
; GFX11-NEXT:    v_lshl_or_b32 v4, v14, 16, v4
; GFX11-NEXT:    v_lshl_or_b32 v1, v11, 16, v1
; GFX11-NEXT:    v_lshl_or_b32 v0, v9, 16, v0
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[4:5] offset:16
; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[4:5]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <16 x half>, <16 x half> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <16 x half>, <16 x half> addrspace(1)* %out, i64 %tid.ext
  %vec = load <16 x half>, <16 x half> addrspace(1)* %in.gep
  %val.trunc = trunc i32 %val to i16
  %val.cvt = bitcast i16 %val.trunc to half
  %vecins = insertelement <16 x half> %vec, half %val.cvt, i32 %n
  store <16 x half> %vecins, <16 x half> addrspace(1)* %out.gep
  ret void
}


declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }