; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,CIVI,VI %s
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s

; Insert the constant 999 (0x3e7) into element 0 of a <2 x i16> that is loaded
; through an addrspace(4) pointer and stored through an addrspace(1) pointer.
define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_lh_b32_b16 s0, 0x3e7, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; CIVI-LABEL: s_insertelement_v2i16_0:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: s_load_dword s0, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_and_b32 s0, s0, 0xffff0000
; CIVI-NEXT: s_or_b32 s0, s0, 0x3e7
; CIVI-NEXT: v_mov_b32_e32 v2, s0
; CIVI-NEXT: flat_store_dword v[0:1], v2
; CIVI-NEXT: s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ret void
}


; Insert an i16 kernel argument into element 0. An unnamed [8 x i32] argument
; separates the two pointers from %elt in the argument list.
define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0_reg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_lh_b32_b16 s0, s4, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_0_reg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x30
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_and_b32 s1, s4, 0xffff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, 0xffff0000
; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_0_reg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_load_dword s0, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_and_b32 s1, s4, 0xffff
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s0, s0, 0xffff0000
; CI-NEXT: s_or_b32 s0, s1, s0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ret void
}

; Insert an i16 argument into element 0 while element 1 of the loaded vector
; is also extracted, zero-extended and consumed by an inline asm statement.
define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s0, s0, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s0
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x30
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_and_b32 s1, s4, 0xffff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s2, s0, 16
; VI-NEXT: s_and_b32 s0, s0, 0xffff0000
; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: ;;#ASMSTART
; VI-NEXT: ; use s2
; VI-NEXT: ;;#ASMEND
; VI-NEXT: s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_load_dword s0, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_and_b32 s1, s4, 0xffff
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s0, s0, 16
; CI-NEXT: s_lshl_b32 s2, s0, 16
; CI-NEXT: s_or_b32 s1, s1, s2
; CI-NEXT: v_mov_b32_e32 v2, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: ;;#ASMSTART
; CI-NEXT: ; use s0
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %elt1 = extractelement <2 x i16> %vec, i32 1
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  %use1 = zext i16 %elt1 to i32
  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
  ret void
}

; The value inserted into element 0 is the high 16 bits of an i32 argument
; (lshr by 16 followed by trunc to i16).
define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i32 %elt.arg) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0_reghi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_hh_b32_b16 s0, s4, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_0_reghi:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x30
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshr_b32 s1, s4, 16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, 0xffff0000
; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_0_reghi:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_load_dword s0, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshr_b32 s1, s4, 16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s0, s0, 0xffff0000
; CI-NEXT: s_or_b32 s0, s1, s0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %elt.hi = lshr i32 %elt.arg, 16
  %elt = trunc i32 %elt.hi to i16
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ret void
}

; The high 16 bits of an i32 argument are inserted into element 0, and the
; same truncated value is zero-extended and consumed by an inline asm statement.
define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_lshr_b32 s1, s4, 16
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_lh_b32_b16 s0, s1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s1
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshr_b32 s1, s4, 16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, 0xffff0000
; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: ;;#ASMSTART
; VI-NEXT: ; use s1
; VI-NEXT: ;;#ASMEND
; VI-NEXT: s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_load_dword s0, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshr_b32 s1, s4, 16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s0, s0, 0xffff0000
; CI-NEXT: s_or_b32 s0, s1, s0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: ;;#ASMSTART
; CI-NEXT: ; use s1
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %elt.hi = lshr i32 %elt.arg, 16
  %elt = trunc i32 %elt.hi to i16
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  %use1 = zext i16 %elt to i32
  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
  ret void
}

; The high 16 bits of an i32 argument are inserted into element 0; both that
; inserted value and the original element 1 of the loaded vector are
; zero-extended and consumed by separate inline asm statements.
define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_lshr_b32 s1, s4, 16
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s0, s0, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s1
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s0
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshr_b32 s1, s4, 16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s2, s0, 16
; VI-NEXT: s_and_b32 s0, s0, 0xffff0000
; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: ;;#ASMSTART
; VI-NEXT: ; use s1
; VI-NEXT: ;;#ASMEND
; VI-NEXT: ;;#ASMSTART
; VI-NEXT: ; use s2
; VI-NEXT: ;;#ASMEND
; VI-NEXT: s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_load_dword s0, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshr_b32 s1, s4, 16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s0, s0, 16
; CI-NEXT: v_alignbit_b32 v2, s0, v2, 16
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: ;;#ASMSTART
; CI-NEXT: ; use s1
; CI-NEXT: ;;#ASMEND
; CI-NEXT: ;;#ASMSTART
; CI-NEXT: ; use s0
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %elt.hi = lshr i32 %elt.arg, 16
  %elt = trunc i32 %elt.hi to i16
  %vec.hi = extractelement <2 x i16> %vec, i32 1
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  %use1 = zext i16 %elt to i32
  %vec.hi.use1 = zext i16 %vec.hi to i32

  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
  call void asm sideeffect "; use $0", "s"(i32 %vec.hi.use1) #0
  ret void
}

; Insert the constant 999 (0x3e7) into element 1 of a <2 x i16> loaded through
; an addrspace(4) pointer.
define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
; GFX9-LABEL: s_insertelement_v2i16_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, 0x3e7
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; CIVI-LABEL: s_insertelement_v2i16_1:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: s_load_dword s0, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_and_b32 s0, s0, 0xffff
; CIVI-NEXT: s_or_b32 s0, s0, 0x3e70000
; CIVI-NEXT: v_mov_b32_e32 v2, s0
; CIVI-NEXT: flat_store_dword v[0:1], v2
; CIVI-NEXT: s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ret void
}

; Insert an i16 kernel argument into element 1. An unnamed [8 x i32] argument
; separates the two pointers from %elt in the argument list.
define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
; GFX9-LABEL: s_insertelement_v2i16_1_reg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_1_reg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x30
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshl_b32 s1, s4, 16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_1_reg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_load_dword s0, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshl_b32 s1, s4, 16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s0, s0, 0xffff
; CI-NEXT: s_or_b32 s0, s0, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ret void
}

; Insert the half constant 5.0 (0x4500 in the checks) into element 0 of a
; <2 x half> loaded through an addrspace(4) pointer.
define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
; GFX9-LABEL: s_insertelement_v2f16_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s0, s0, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s0, 0x4500, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; CIVI-LABEL: s_insertelement_v2f16_0:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: s_load_dword s0, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_and_b32 s0, s0, 0xffff0000
; CIVI-NEXT: s_or_b32 s0, s0, 0x4500
; CIVI-NEXT: v_mov_b32_e32 v2, s0
; CIVI-NEXT: flat_store_dword v[0:1], v2
; CIVI-NEXT: s_endpgm
  %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
  store <2 x half> %vecins, <2 x half> addrspace(1)* %out
  ret void
}

; Insert the half constant 5.0 into element 1 of a <2 x half> loaded through
; an addrspace(4) pointer.
define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
; GFX9-LABEL: s_insertelement_v2f16_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, 0x4500
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; CIVI-LABEL: s_insertelement_v2f16_1:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: s_load_dword s0, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_and_b32 s0, s0, 0xffff
; CIVI-NEXT: s_or_b32 s0, s0, 0x45000000
; CIVI-NEXT: v_mov_b32_e32 v2, s0
; CIVI-NEXT: flat_store_dword v[0:1], v2
; CIVI-NEXT: s_endpgm
  %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
  store <2 x half> %vecins, <2 x half> addrspace(1)* %out
  ret void
}

; Per-workitem variant: the <2 x i16> is loaded from %in and stored to %out
; (both addrspace(1)) at an index taken from llvm.amdgcn.workitem.id.x, and
; the constant 999 is inserted into element 0.
define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_insertelement_v2i16_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: s_movk_i32 s0, 0x3e7
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v0, v1, s0, v0
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2i16_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; VI-NEXT: v_or_b32_e32 v0, 0x3e7, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2i16_0:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v0, v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; CI-NEXT: v_or_b32_e32 v0, 0x3e7, v0
; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; Per-workitem variant where the value inserted into element 0 is the high
; 16 bits of an i32 kernel argument (lshr by 16 followed by trunc to i16).
define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %elt.arg) #0 {
; GFX9-LABEL: v_insertelement_v2i16_0_reghi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff0000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_lshrrev_b32_e64 v1, 16, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v1
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2i16_0_reghi:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_lshr_b32 s0, s4, 16
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; VI-NEXT: v_or_b32_e32 v0, s0, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2i16_0_reghi:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v0, v[0:1]
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: s_lshr_b32 s0, s4, 16
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; CI-NEXT: v_or_b32_e32 v0, s0, v0
; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %elt.hi = lshr i32 %elt.arg, 16
  %elt = trunc i32 %elt.hi to i16
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; Per-workitem variant inserting the constant 53 into element 0; the checks
; below show 53 used directly as an instruction operand.
define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_insertelement_v2i16_0_inlineimm:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v0, v1, 53, v0
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2i16_0_inlineimm:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; VI-NEXT: v_or_b32_e32 v0, 53, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2i16_0_inlineimm:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v0, v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; CI-NEXT: v_or_b32_e32 v0, 53, v0
; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %vecins = insertelement <2 x i16> %vec, i16 53, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; Per-workitem variant inserting the constant 999 into element 1.
; FIXME: fold lshl_or c0, c1, v0 -> or (c0 << c1), v0
define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_insertelement_v2i16_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: s_movk_i32 s0, 0x3e7
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v0, s0, 16, v0
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2i16_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, 0x3e70000
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2i16_1:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v0, v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; CI-NEXT: v_or_b32_e32 v0, 0x3e70000, v0
; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; Per-workitem variant inserting the constant -15 into element 1.
define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_insertelement_v2i16_1_inlineimm:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v0, -15, 16, v0
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2i16_1_inlineimm:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, 0xfff10000
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2i16_1_inlineimm:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v0, v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; CI-NEXT: v_or_b32_e32 v0, 0xfff10000, v0
; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %vecins = insertelement <2 x i16> %vec, i16 -15, i32 1
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; Per-workitem variant inserting the half constant 5.0 (0x4500 in the checks)
; into element 0 of a <2 x half>.
define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_insertelement_v2f16_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, 0x4500
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2f16_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; VI-NEXT: v_or_b32_e32 v0, 0x4500, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2f16_0:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v0, v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; CI-NEXT: v_or_b32_e32 v0, 0x4500, v0
; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
  ret void
}

define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_insertelement_v2f16_0_inlineimm:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, 53
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2f16_0_inlineimm:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
;
VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 941; VI-NEXT: v_or_b32_e32 v0, 53, v0 942; VI-NEXT: flat_store_dword v[2:3], v0 943; VI-NEXT: s_endpgm 944; 945; CI-LABEL: v_insertelement_v2f16_0_inlineimm: 946; CI: ; %bb.0: 947; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 948; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 949; CI-NEXT: s_waitcnt lgkmcnt(0) 950; CI-NEXT: v_mov_b32_e32 v1, s3 951; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 952; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 953; CI-NEXT: flat_load_dword v0, v[0:1] 954; CI-NEXT: v_mov_b32_e32 v3, s1 955; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 956; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 957; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 958; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 959; CI-NEXT: v_or_b32_e32 v0, 53, v0 960; CI-NEXT: flat_store_dword v[2:3], v0 961; CI-NEXT: s_endpgm 962 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 963 %tid.ext = sext i32 %tid to i64 964 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext 965 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext 966 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep 967 %vecins = insertelement <2 x half> %vec, half 0xH0035, i32 0 968 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep 969 ret void 970} 971 972define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { 973; GFX9-LABEL: v_insertelement_v2f16_1: 974; GFX9: ; %bb.0: 975; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 976; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 977; GFX9-NEXT: s_waitcnt lgkmcnt(0) 978; GFX9-NEXT: v_mov_b32_e32 v1, s3 979; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 980; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 981; GFX9-NEXT: global_load_dword v0, v[0:1], off 982; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 983; GFX9-NEXT: v_mov_b32_e32 v3, s1 984; GFX9-NEXT: s_movk_i32 s0, 0x4500 985; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, 
vcc 986; GFX9-NEXT: s_waitcnt vmcnt(0) 987; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 988; GFX9-NEXT: v_lshl_or_b32 v0, s0, 16, v0 989; GFX9-NEXT: global_store_dword v[2:3], v0, off 990; GFX9-NEXT: s_endpgm 991; 992; VI-LABEL: v_insertelement_v2f16_1: 993; VI: ; %bb.0: 994; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 995; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 996; VI-NEXT: s_waitcnt lgkmcnt(0) 997; VI-NEXT: v_mov_b32_e32 v1, s3 998; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 999; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1000; VI-NEXT: flat_load_dword v0, v[0:1] 1001; VI-NEXT: v_mov_b32_e32 v1, 0x45000000 1002; VI-NEXT: v_mov_b32_e32 v3, s1 1003; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1004; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1005; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1006; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1007; VI-NEXT: flat_store_dword v[2:3], v0 1008; VI-NEXT: s_endpgm 1009; 1010; CI-LABEL: v_insertelement_v2f16_1: 1011; CI: ; %bb.0: 1012; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1013; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1014; CI-NEXT: s_waitcnt lgkmcnt(0) 1015; CI-NEXT: v_mov_b32_e32 v1, s3 1016; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1017; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1018; CI-NEXT: flat_load_dword v0, v[0:1] 1019; CI-NEXT: v_mov_b32_e32 v3, s1 1020; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 1021; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1022; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1023; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 1024; CI-NEXT: v_or_b32_e32 v0, 0x45000000, v0 1025; CI-NEXT: flat_store_dword v[2:3], v0 1026; CI-NEXT: s_endpgm 1027 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1028 %tid.ext = sext i32 %tid to i64 1029 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext 1030 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext 1031 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep 
1032 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1 1033 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep 1034 ret void 1035} 1036 1037define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { 1038; GFX9-LABEL: v_insertelement_v2f16_1_inlineimm: 1039; GFX9: ; %bb.0: 1040; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1041; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1042; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1043; GFX9-NEXT: v_mov_b32_e32 v1, s3 1044; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 1045; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1046; GFX9-NEXT: global_load_dword v0, v[0:1], off 1047; GFX9-NEXT: v_mov_b32_e32 v3, s1 1048; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 1049; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 1050; GFX9-NEXT: s_waitcnt vmcnt(0) 1051; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 1052; GFX9-NEXT: v_lshl_or_b32 v0, 35, 16, v0 1053; GFX9-NEXT: global_store_dword v[2:3], v0, off 1054; GFX9-NEXT: s_endpgm 1055; 1056; VI-LABEL: v_insertelement_v2f16_1_inlineimm: 1057; VI: ; %bb.0: 1058; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1059; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1060; VI-NEXT: s_waitcnt lgkmcnt(0) 1061; VI-NEXT: v_mov_b32_e32 v1, s3 1062; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1063; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1064; VI-NEXT: flat_load_dword v0, v[0:1] 1065; VI-NEXT: v_mov_b32_e32 v1, 0x230000 1066; VI-NEXT: v_mov_b32_e32 v3, s1 1067; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1068; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1069; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1070; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1071; VI-NEXT: flat_store_dword v[2:3], v0 1072; VI-NEXT: s_endpgm 1073; 1074; CI-LABEL: v_insertelement_v2f16_1_inlineimm: 1075; CI: ; %bb.0: 1076; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1077; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1078; CI-NEXT: 
s_waitcnt lgkmcnt(0) 1079; CI-NEXT: v_mov_b32_e32 v1, s3 1080; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1081; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1082; CI-NEXT: flat_load_dword v0, v[0:1] 1083; CI-NEXT: v_mov_b32_e32 v3, s1 1084; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 1085; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1086; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1087; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 1088; CI-NEXT: v_or_b32_e32 v0, 0x230000, v0 1089; CI-NEXT: flat_store_dword v[2:3], v0 1090; CI-NEXT: s_endpgm 1091 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1092 %tid.ext = sext i32 %tid to i64 1093 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext 1094 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext 1095 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep 1096 %vecins = insertelement <2 x half> %vec, half 0xH0023, i32 1 1097 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep 1098 ret void 1099} 1100 1101; FIXME: Enable for others when argument load not split 1102define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(4)* %idx.ptr) #0 { 1103; GFX9-LABEL: s_insertelement_v2i16_dynamic: 1104; GFX9: ; %bb.0: 1105; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1106; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 1107; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7 1108; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1109; GFX9-NEXT: v_mov_b32_e32 v0, s0 1110; GFX9-NEXT: v_mov_b32_e32 v1, s1 1111; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 1112; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 1113; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1114; GFX9-NEXT: s_lshl_b32 s0, s0, 4 1115; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 1116; GFX9-NEXT: v_mov_b32_e32 v3, s1 1117; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3 1118; GFX9-NEXT: global_store_dword v[0:1], v2, off 1119; GFX9-NEXT: s_endpgm 1120; 1121; VI-LABEL: s_insertelement_v2i16_dynamic: 
1122; VI: ; %bb.0: 1123; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1124; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 1125; VI-NEXT: v_mov_b32_e32 v2, 0x3e703e7 1126; VI-NEXT: s_waitcnt lgkmcnt(0) 1127; VI-NEXT: v_mov_b32_e32 v0, s0 1128; VI-NEXT: v_mov_b32_e32 v1, s1 1129; VI-NEXT: s_load_dword s0, s[4:5], 0x0 1130; VI-NEXT: s_load_dword s1, s[2:3], 0x0 1131; VI-NEXT: s_waitcnt lgkmcnt(0) 1132; VI-NEXT: s_lshl_b32 s0, s0, 4 1133; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 1134; VI-NEXT: v_mov_b32_e32 v3, s1 1135; VI-NEXT: v_bfi_b32 v2, s0, v2, v3 1136; VI-NEXT: flat_store_dword v[0:1], v2 1137; VI-NEXT: s_endpgm 1138; 1139; CI-LABEL: s_insertelement_v2i16_dynamic: 1140; CI: ; %bb.0: 1141; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1142; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 1143; CI-NEXT: v_mov_b32_e32 v2, 0x3e703e7 1144; CI-NEXT: s_waitcnt lgkmcnt(0) 1145; CI-NEXT: v_mov_b32_e32 v0, s0 1146; CI-NEXT: v_mov_b32_e32 v1, s1 1147; CI-NEXT: s_load_dword s0, s[4:5], 0x0 1148; CI-NEXT: s_load_dword s1, s[2:3], 0x0 1149; CI-NEXT: s_waitcnt lgkmcnt(0) 1150; CI-NEXT: s_lshl_b32 s0, s0, 4 1151; CI-NEXT: s_lshl_b32 s0, 0xffff, s0 1152; CI-NEXT: v_mov_b32_e32 v3, s1 1153; CI-NEXT: v_bfi_b32 v2, s0, v2, v3 1154; CI-NEXT: flat_store_dword v[0:1], v2 1155; CI-NEXT: s_endpgm 1156 %idx = load volatile i32, i32 addrspace(4)* %idx.ptr 1157 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr 1158 %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx 1159 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out 1160 ret void 1161} 1162 1163define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %idx) #0 { 1164; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr: 1165; GFX9: ; %bb.0: 1166; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1167; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 1168; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1169; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1170; GFX9-NEXT: v_mov_b32_e32 v1, s3 1171; GFX9-NEXT: 
v_add_co_u32_e32 v0, vcc, s2, v2 1172; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1173; GFX9-NEXT: global_load_dword v0, v[0:1], off 1174; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 1175; GFX9-NEXT: s_lshl_b32 s0, s4, 4 1176; GFX9-NEXT: v_mov_b32_e32 v3, s1 1177; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 1178; GFX9-NEXT: v_mov_b32_e32 v1, 0x3e703e7 1179; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 1180; GFX9-NEXT: s_waitcnt vmcnt(0) 1181; GFX9-NEXT: v_bfi_b32 v0, s0, v1, v0 1182; GFX9-NEXT: global_store_dword v[2:3], v0, off 1183; GFX9-NEXT: s_endpgm 1184; 1185; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr: 1186; VI: ; %bb.0: 1187; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1188; VI-NEXT: s_load_dword s4, s[4:5], 0x10 1189; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1190; VI-NEXT: s_waitcnt lgkmcnt(0) 1191; VI-NEXT: v_mov_b32_e32 v1, s3 1192; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1193; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1194; VI-NEXT: flat_load_dword v0, v[0:1] 1195; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1196; VI-NEXT: s_lshl_b32 s0, s4, 4 1197; VI-NEXT: v_mov_b32_e32 v3, s1 1198; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 1199; VI-NEXT: v_mov_b32_e32 v1, 0x3e703e7 1200; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1201; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1202; VI-NEXT: v_bfi_b32 v0, s0, v1, v0 1203; VI-NEXT: flat_store_dword v[2:3], v0 1204; VI-NEXT: s_endpgm 1205; 1206; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr: 1207; CI: ; %bb.0: 1208; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1209; CI-NEXT: s_load_dword s4, s[4:5], 0x4 1210; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1211; CI-NEXT: s_waitcnt lgkmcnt(0) 1212; CI-NEXT: v_mov_b32_e32 v1, s3 1213; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1214; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1215; CI-NEXT: flat_load_dword v0, v[0:1] 1216; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 1217; CI-NEXT: s_lshl_b32 s0, s4, 4 1218; CI-NEXT: v_mov_b32_e32 v3, s1 1219; CI-NEXT: s_lshl_b32 s0, 0xffff, s0 1220; CI-NEXT: 
v_mov_b32_e32 v1, 0x3e703e7 1221; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1222; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1223; CI-NEXT: v_bfi_b32 v0, s0, v1, v0 1224; CI-NEXT: flat_store_dword v[2:3], v0 1225; CI-NEXT: s_endpgm 1226 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1227 %tid.ext = sext i32 %tid to i64 1228 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext 1229 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext 1230 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep 1231 %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx 1232 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep 1233 ret void 1234} 1235 1236define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { 1237; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr: 1238; GFX9: ; %bb.0: 1239; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1240; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 1241; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 1242; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1243; GFX9-NEXT: v_mov_b32_e32 v1, s3 1244; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 1245; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1246; GFX9-NEXT: v_mov_b32_e32 v3, s5 1247; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v4 1248; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 1249; GFX9-NEXT: global_load_dword v0, v[0:1], off 1250; GFX9-NEXT: global_load_dword v1, v[2:3], off 1251; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4 1252; GFX9-NEXT: s_mov_b32 s0, 0xffff 1253; GFX9-NEXT: v_mov_b32_e32 v5, s1 1254; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc 1255; GFX9-NEXT: s_waitcnt vmcnt(0) 1256; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 1257; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 1258; GFX9-NEXT: s_mov_b32 s0, 0x12341234 1259; GFX9-NEXT: v_bfi_b32 v0, v1, s0, v0 1260; GFX9-NEXT: global_store_dword v[4:5], v0, off 1261; GFX9-NEXT: s_endpgm 
1262; 1263; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr: 1264; VI: ; %bb.0: 1265; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1266; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 1267; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 1268; VI-NEXT: s_waitcnt lgkmcnt(0) 1269; VI-NEXT: v_mov_b32_e32 v1, s3 1270; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 1271; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1272; VI-NEXT: v_mov_b32_e32 v3, s5 1273; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 1274; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1275; VI-NEXT: flat_load_dword v0, v[0:1] 1276; VI-NEXT: flat_load_dword v1, v[2:3] 1277; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 1278; VI-NEXT: s_mov_b32 s0, 0xffff 1279; VI-NEXT: v_mov_b32_e32 v5, s1 1280; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 1281; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1282; VI-NEXT: v_lshlrev_b32_e32 v1, 4, v1 1283; VI-NEXT: v_lshlrev_b32_e64 v1, v1, s0 1284; VI-NEXT: s_mov_b32 s0, 0x12341234 1285; VI-NEXT: v_bfi_b32 v0, v1, s0, v0 1286; VI-NEXT: flat_store_dword v[4:5], v0 1287; VI-NEXT: s_endpgm 1288; 1289; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr: 1290; CI: ; %bb.0: 1291; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1292; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 1293; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 1294; CI-NEXT: s_waitcnt lgkmcnt(0) 1295; CI-NEXT: v_mov_b32_e32 v1, s3 1296; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 1297; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1298; CI-NEXT: v_mov_b32_e32 v3, s5 1299; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 1300; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1301; CI-NEXT: flat_load_dword v2, v[2:3] 1302; CI-NEXT: flat_load_dword v0, v[0:1] 1303; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4 1304; CI-NEXT: v_mov_b32_e32 v5, s1 1305; CI-NEXT: s_mov_b32 s0, 0x12341234 1306; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 1307; CI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) 1308; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v2 1309; CI-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 1310; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
1311; CI-NEXT: v_bfi_b32 v0, v1, s0, v0 1312; CI-NEXT: flat_store_dword v[4:5], v0 1313; CI-NEXT: s_endpgm 1314 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1315 %tid.ext = sext i32 %tid to i64 1316 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext 1317 %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext 1318 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext 1319 %idx = load i32, i32 addrspace(1)* %idx.gep 1320 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep 1321 %vecins = insertelement <2 x half> %vec, half 0xH1234, i32 %idx 1322 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep 1323 ret void 1324} 1325 1326define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 { 1327; GFX9-LABEL: v_insertelement_v4f16_0: 1328; GFX9: ; %bb.0: 1329; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1330; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 1331; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1332; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff 1333; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1334; GFX9-NEXT: v_mov_b32_e32 v1, s3 1335; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 1336; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1337; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1338; GFX9-NEXT: v_mov_b32_e32 v3, s1 1339; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 1340; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 1341; GFX9-NEXT: s_waitcnt vmcnt(0) 1342; GFX9-NEXT: v_bfi_b32 v0, v4, s4, v0 1343; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 1344; GFX9-NEXT: s_endpgm 1345; 1346; VI-LABEL: v_insertelement_v4f16_0: 1347; VI: ; %bb.0: 1348; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1349; VI-NEXT: s_load_dword s4, s[4:5], 0x30 1350; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1351; VI-NEXT: s_waitcnt lgkmcnt(0) 1352; VI-NEXT: v_mov_b32_e32 v1, s3 1353; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 
1354; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1355; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1356; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1357; VI-NEXT: v_mov_b32_e32 v3, s1 1358; VI-NEXT: s_and_b32 s0, s4, 0xffff 1359; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1360; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1361; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 1362; VI-NEXT: v_or_b32_e32 v0, s0, v0 1363; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1364; VI-NEXT: s_endpgm 1365; 1366; CI-LABEL: v_insertelement_v4f16_0: 1367; CI: ; %bb.0: 1368; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1369; CI-NEXT: s_load_dword s4, s[4:5], 0xc 1370; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1371; CI-NEXT: s_waitcnt lgkmcnt(0) 1372; CI-NEXT: v_mov_b32_e32 v1, s3 1373; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1374; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1375; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1376; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 1377; CI-NEXT: v_mov_b32_e32 v3, s1 1378; CI-NEXT: s_and_b32 s0, s4, 0xffff 1379; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1380; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1381; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 1382; CI-NEXT: v_or_b32_e32 v0, s0, v0 1383; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1384; CI-NEXT: s_endpgm 1385 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1386 %tid.ext = sext i32 %tid to i64 1387 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext 1388 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext 1389 %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep 1390 %val.trunc = trunc i32 %val to i16 1391 %val.cvt = bitcast i16 %val.trunc to half 1392 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 0 1393 store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep 1394 ret void 1395} 1396 1397define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 { 1398; 
GFX9-LABEL: v_insertelement_v4f16_1: 1399; GFX9: ; %bb.0: 1400; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1401; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 1402; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1403; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1404; GFX9-NEXT: v_mov_b32_e32 v1, s3 1405; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 1406; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1407; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1408; GFX9-NEXT: v_mov_b32_e32 v3, s1 1409; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 1410; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 1411; GFX9-NEXT: s_waitcnt vmcnt(0) 1412; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 1413; GFX9-NEXT: v_lshl_or_b32 v0, s4, 16, v0 1414; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 1415; GFX9-NEXT: s_endpgm 1416; 1417; VI-LABEL: v_insertelement_v4f16_1: 1418; VI: ; %bb.0: 1419; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1420; VI-NEXT: s_load_dword s4, s[4:5], 0x10 1421; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1422; VI-NEXT: s_waitcnt lgkmcnt(0) 1423; VI-NEXT: v_mov_b32_e32 v1, s3 1424; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1425; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1426; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1427; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1428; VI-NEXT: s_lshl_b32 s0, s4, 16 1429; VI-NEXT: v_mov_b32_e32 v3, s1 1430; VI-NEXT: v_mov_b32_e32 v4, s0 1431; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1432; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1433; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 1434; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1435; VI-NEXT: s_endpgm 1436; 1437; CI-LABEL: v_insertelement_v4f16_1: 1438; CI: ; %bb.0: 1439; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1440; CI-NEXT: s_load_dword s4, s[4:5], 0x4 1441; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1442; CI-NEXT: s_waitcnt lgkmcnt(0) 1443; CI-NEXT: v_mov_b32_e32 v1, s3 1444; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1445; CI-NEXT: v_addc_u32_e32 v1, 
vcc, 0, v1, vcc 1446; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1447; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 1448; CI-NEXT: v_mov_b32_e32 v3, s1 1449; CI-NEXT: s_lshl_b32 s0, s4, 16 1450; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1451; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1452; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 1453; CI-NEXT: v_or_b32_e32 v0, s0, v0 1454; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1455; CI-NEXT: s_endpgm 1456 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1457 %tid.ext = sext i32 %tid to i64 1458 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext 1459 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext 1460 %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep 1461 %val.trunc = trunc i32 %val to i16 1462 %val.cvt = bitcast i16 %val.trunc to half 1463 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 1 1464 store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep 1465 ret void 1466} 1467 1468define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 { 1469; GFX9-LABEL: v_insertelement_v4f16_2: 1470; GFX9: ; %bb.0: 1471; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1472; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 1473; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1474; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff 1475; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1476; GFX9-NEXT: v_mov_b32_e32 v1, s3 1477; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 1478; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1479; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 1480; GFX9-NEXT: v_mov_b32_e32 v3, s1 1481; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 1482; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 1483; GFX9-NEXT: s_waitcnt vmcnt(0) 1484; GFX9-NEXT: v_bfi_b32 v1, v4, s4, v1 1485; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 1486; GFX9-NEXT: s_endpgm 1487; 1488; VI-LABEL: v_insertelement_v4f16_2: 
1489; VI: ; %bb.0: 1490; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1491; VI-NEXT: s_load_dword s4, s[4:5], 0x30 1492; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1493; VI-NEXT: s_waitcnt lgkmcnt(0) 1494; VI-NEXT: v_mov_b32_e32 v1, s3 1495; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1496; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1497; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1498; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1499; VI-NEXT: v_mov_b32_e32 v3, s1 1500; VI-NEXT: s_and_b32 s0, s4, 0xffff 1501; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1502; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1503; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 1504; VI-NEXT: v_or_b32_e32 v1, s0, v1 1505; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1506; VI-NEXT: s_endpgm 1507; 1508; CI-LABEL: v_insertelement_v4f16_2: 1509; CI: ; %bb.0: 1510; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1511; CI-NEXT: s_load_dword s4, s[4:5], 0xc 1512; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1513; CI-NEXT: s_waitcnt lgkmcnt(0) 1514; CI-NEXT: v_mov_b32_e32 v1, s3 1515; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1516; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1517; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1518; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 1519; CI-NEXT: v_mov_b32_e32 v3, s1 1520; CI-NEXT: s_and_b32 s0, s4, 0xffff 1521; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1522; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1523; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 1524; CI-NEXT: v_or_b32_e32 v1, s0, v1 1525; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1526; CI-NEXT: s_endpgm 1527 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1528 %tid.ext = sext i32 %tid to i64 1529 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext 1530 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext 1531 %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep 1532 %val.trunc = trunc i32 %val to i16 1533 %val.cvt = bitcast i16 %val.trunc to half 1534 %vecins = insertelement 
<4 x half> %vec, half %val.cvt, i32 2
  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
  ret void
}

; Constant-index insert into a <4 x half> that is loaded per workitem.
; Each workitem (id from llvm.amdgcn.workitem.id.x, sign-extended to i64)
; loads the vector at %in[tid], replaces element 3 with the low 16 bits of the
; i32 kernel argument %val reinterpreted as half, and stores the result to
; %out[tid].
; The per-target check blocks below are autogenerated (see the NOTE on the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
; GFX9-LABEL: v_insertelement_v4f16_3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v1, s4, 16, v1
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v4f16_3:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: s_lshl_b32 s0, s4, 16
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v4f16_3:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: s_lshl_b32 s0, s4, 16
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; CI-NEXT: v_or_b32_e32 v1, s0, v1
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
  ; Only the low 16 bits of %val are used; they are reinterpreted as half.
  %val.trunc = trunc i32 %val to i16
  %val.cvt = bitcast i16 %val.trunc to half
  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 3
  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
  ret void
}

; Constant-index insert into a <4 x i16> that is loaded per workitem.
; Same shape as the half kernel above, but the element type is i16 and the
; low 16 bits of %val replace element 2 of the vector loaded from %in[tid];
; the result is stored to %out[tid].
define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
; GFX9-LABEL: v_insertelement_v4i16_2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v1, v4, s4, v1
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v4i16_2:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_and_b32 s0, s4, 0xffff
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; VI-NEXT: v_or_b32_e32 v1, s0, v1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v4i16_2:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: s_and_b32 s0, s4, 0xffff
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; CI-NEXT: v_or_b32_e32 v1, s0, v1
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  %val.trunc = trunc i32 %val to i16
  ; Same-type (i16 -> i16) bitcast; presumably kept so this kernel mirrors the
  ; half variants line for line -- confirm before removing.
  %val.cvt = bitcast i16 %val.trunc to i16
  %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 2
  store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep
  ret void
}

; Dynamic-index insert into a <4 x i16> where the index is itself loaded from
; memory: %idx.val comes from a volatile i32 load through an undef
; addrspace(1) pointer, so it is not a compile-time constant.
; In all three check blocks the expected code scales the loaded index by 16
; (shift left by 4), shifts a 64-bit 0xffff mask left by that amount, and then
; merges the inserted value into each dword of the vector with v_bfi_b32.
; FIXME: Better code on CI?
define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v4, v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: s_mov_b32 s0, 0xffff
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v4
; GFX9-NEXT: v_lshlrev_b64 v[4:5], v4, s[0:1]
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v1, v5, s0, v1
; GFX9-NEXT: v_bfi_b32 v0, v4, s0, v0
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: s_mov_b32 s0, 0xffff
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_and_b32 s2, s4, s0
; VI-NEXT: s_mov_b32 s1, 0
; VI-NEXT: s_lshl_b32 s3, s2, 16
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4
; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[0:1]
; VI-NEXT: s_or_b32 s0, s2, s3
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_bfi_b32 v1, v5, s0, v1
; VI-NEXT: v_bfi_b32 v0, v4, s0, v0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
; CI: ; %bb.0:
; CI-NEXT: flat_load_dword v4, v[0:1]
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; CI-NEXT: s_mov_b32 s6, 0xffff
; CI-NEXT: s_mov_b32 s7, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: s_lshl_b32 s1, s4, 16
; CI-NEXT: s_and_b32 s3, s4, s6
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: s_or_b32 s0, s3, s1
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v4
; CI-NEXT: v_lshl_b64 v[4:5], s[6:7], v4
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: v_bfi_b32 v1, v5, s0, v1
; CI-NEXT: v_bfi_b32 v0, v4, s0, v0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
  ; Volatile load through an undef pointer: the value is only used as an
  ; insert index that cannot be folded to a constant.
  %idx.val = load volatile i32, i32 addrspace(1)* undef
  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  %val.trunc = trunc i32 %val to i16
  ; Same-type (i16 -> i16) bitcast; presumably kept so this kernel mirrors the
  ; half variants line for line -- confirm before removing.
  %val.cvt = bitcast i16 %val.trunc to i16
  %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 %idx.val
  store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep
  ret void
}

; Dynamic-index insert into a <4 x half> where the index is the i32 kernel
; argument %idxval. The vector is still loaded per workitem from %in[tid] and
; the result stored to %out[tid]; the inserted value is the low 16 bits of
; %val reinterpreted as half.
; In all three check blocks the index is scaled with s_lshl_b32 by 4, the
; 64-bit 0xffff mask is shifted with s_lshl_b64, and the merge is done per
; dword with v_bfi_b32 using that scalar mask.
define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val, i32 %idxval) #0 {
; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: s_mov_b32 s0, 0xffff
; GFX9-NEXT: s_lshl_b32 s3, s5, 4
; GFX9-NEXT: v_mov_b32_e32 v4, s2
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s3
; GFX9-NEXT: v_mov_b32_e32 v5, s2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v1, s1, v5, v1
; GFX9-NEXT: v_bfi_b32 v0, s0, v4, v0
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: s_mov_b32 s0, 0xffff
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_mov_b32 s1, 0
; VI-NEXT: s_lshl_b32 s2, s5, 4
; VI-NEXT: s_and_b32 s3, s4, s0
; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
; VI-NEXT: s_lshl_b32 s2, s3, 16
; VI-NEXT: s_or_b32 s2, s3, s2
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_mov_b32_e32 v5, s2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_bfi_b32 v1, s1, v4, v1
; VI-NEXT: v_bfi_b32 v0, s0, v5, v0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_and_b32 s2, s4, s0
; CI-NEXT: s_lshl_b32 s4, s4, 16
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: s_or_b32 s2, s2, s4
; CI-NEXT: s_mov_b32 s1, 0
; CI-NEXT: s_lshl_b32 s3, s5, 4
; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], s3
; CI-NEXT: v_mov_b32_e32 v4, s2
; CI-NEXT: v_mov_b32_e32 v5, s2
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: v_bfi_b32 v1, s1, v4, v1
; CI-NEXT: v_bfi_b32 v0, s0, v5, v0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
  ; Only the low 16 bits of %val are used; they are reinterpreted as half.
  %val.trunc = trunc i32 %val to i16
  %val.cvt = bitcast i16 %val.trunc to half
  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 %idxval
  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
  ret void
}

; Workitem-id intrinsic called by the kernels above to select the per-workitem
; vector; it carries attribute group #1.
declare i32 @llvm.amdgcn.workitem.id.x() #1

; #0 is attached to the kernel definitions; #1 to the intrinsic declaration
; and its call sites.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }