; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=CIVI,VI %s
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CIVI,CI %s

; Insert the constant 999 (0x3e7) into element 0 of a <2 x i16> loaded through
; an addrspace(4) pointer and store the result to %out. The gfx900 checks expect
; a single s_pack_lh_b32_b16; the fiji/hawaii checks expect the high half to be
; masked with 0xffff0000 and the constant OR'ed in.
define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_pack_lh_b32_b16 s2, 0x3e7, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; CIVI-LABEL: s_insertelement_v2i16_0:
; CIVI:       ; %bb.0:
; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT:    v_mov_b32_e32 v0, s0
; CIVI-NEXT:    v_mov_b32_e32 v1, s1
; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
; CIVI-NEXT:    s_and_b32 s0, s2, 0xffff0000
; CIVI-NEXT:    s_or_b32 s0, s0, 0x3e7
; CIVI-NEXT:    v_mov_b32_e32 v2, s0
; CIVI-NEXT:    flat_store_dword v[0:1], v2
; CIVI-NEXT:    s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ret void
}


; Same as above, but the inserted element is the i16 kernel argument %elt.
; An unnamed [8 x i32] argument sits between %vec.ptr and %elt; the checks
; expect %elt to be loaded from offset 0x30 on gfx900/fiji and 0xc on hawaii.
define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0_reg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x30
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_pack_lh_b32_b16 s2, s6, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_0_reg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_and_b32 s0, s4, 0xffff
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s1, s2, 0xffff0000
; VI-NEXT:    s_or_b32 s0, s0, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_0_reg:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    s_and_b32 s1, s4, 0xffff
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_and_b32 s0, s2, 0xffff0000
; CI-NEXT:    s_or_b32 s0, s1, s0
; CI-NEXT:    v_mov_b32_e32 v2, s0
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  ret void
}

; Inserts %elt into element 0 while element 1 of the loaded vector has a second
; use: it is extracted, zero-extended and passed to an inline asm with an "s"
; constraint. The gfx900 checks expect the shifted-down high half to be kept in
; its own register and the store value to be built with s_pack_ll_b32_b16.
define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x30
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s6, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use s2
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_and_b32 s0, s4, 0xffff
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s1, s2, 16
; VI-NEXT:    s_and_b32 s2, s2, 0xffff0000
; VI-NEXT:    s_or_b32 s0, s0, s2
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    ;;#ASMSTART
; VI-NEXT:    ; use s1
; VI-NEXT:    ;;#ASMEND
; VI-NEXT:    s_endpgm
;
; CI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    s_and_b32 s0, s4, 0xffff
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_lshr_b32 s1, s2, 16
; CI-NEXT:    s_lshl_b32 s2, s1, 16
; CI-NEXT:    s_or_b32 s0, s0, s2
; CI-NEXT:    v_mov_b32_e32 v2, s0
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    ;;#ASMSTART
; CI-NEXT:    ; use s1
; CI-NEXT:    ;;#ASMEND
; CI-NEXT:    s_endpgm
  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  %elt1 = extractelement <2 x i16> %vec, i32 1
  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
  %use1 = zext i16 %elt1 to i32
  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
  ret void
}
155 156define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i32 %elt.arg) #0 { 157; GFX9-LABEL: s_insertelement_v2i16_0_reghi: 158; GFX9: ; %bb.0: 159; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 160; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 161; GFX9-NEXT: v_mov_b32_e32 v0, 0 162; GFX9-NEXT: s_waitcnt lgkmcnt(0) 163; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 164; GFX9-NEXT: s_waitcnt lgkmcnt(0) 165; GFX9-NEXT: s_pack_hh_b32_b16 s2, s6, s2 166; GFX9-NEXT: v_mov_b32_e32 v1, s2 167; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 168; GFX9-NEXT: s_endpgm 169; 170; VI-LABEL: s_insertelement_v2i16_0_reghi: 171; VI: ; %bb.0: 172; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 173; VI-NEXT: s_load_dword s4, s[4:5], 0x30 174; VI-NEXT: s_waitcnt lgkmcnt(0) 175; VI-NEXT: s_load_dword s2, s[2:3], 0x0 176; VI-NEXT: v_mov_b32_e32 v0, s0 177; VI-NEXT: v_mov_b32_e32 v2, s4 178; VI-NEXT: v_mov_b32_e32 v1, s1 179; VI-NEXT: s_waitcnt lgkmcnt(0) 180; VI-NEXT: s_lshr_b32 s0, s2, 16 181; VI-NEXT: v_alignbit_b32 v2, s0, v2, 16 182; VI-NEXT: flat_store_dword v[0:1], v2 183; VI-NEXT: s_endpgm 184; 185; CI-LABEL: s_insertelement_v2i16_0_reghi: 186; CI: ; %bb.0: 187; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 188; CI-NEXT: s_load_dword s4, s[4:5], 0xc 189; CI-NEXT: s_waitcnt lgkmcnt(0) 190; CI-NEXT: s_load_dword s2, s[2:3], 0x0 191; CI-NEXT: v_mov_b32_e32 v0, s0 192; CI-NEXT: v_mov_b32_e32 v1, s1 193; CI-NEXT: s_lshr_b32 s1, s4, 16 194; CI-NEXT: s_waitcnt lgkmcnt(0) 195; CI-NEXT: s_and_b32 s0, s2, 0xffff0000 196; CI-NEXT: s_or_b32 s0, s1, s0 197; CI-NEXT: v_mov_b32_e32 v2, s0 198; CI-NEXT: flat_store_dword v[0:1], v2 199; CI-NEXT: s_endpgm 200 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr 201 %elt.hi = lshr i32 %elt.arg, 16 202 %elt = trunc i32 %elt.hi to i16 203 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 204 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out 205 ret void 206} 207 208define 
amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 { 209; GFX9-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: 210; GFX9: ; %bb.0: 211; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 212; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 213; GFX9-NEXT: v_mov_b32_e32 v0, 0 214; GFX9-NEXT: s_waitcnt lgkmcnt(0) 215; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 216; GFX9-NEXT: s_lshr_b32 s3, s6, 16 217; GFX9-NEXT: s_waitcnt lgkmcnt(0) 218; GFX9-NEXT: s_pack_lh_b32_b16 s2, s3, s2 219; GFX9-NEXT: v_mov_b32_e32 v1, s2 220; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 221; GFX9-NEXT: ;;#ASMSTART 222; GFX9-NEXT: ; use s3 223; GFX9-NEXT: ;;#ASMEND 224; GFX9-NEXT: s_endpgm 225; 226; VI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: 227; VI: ; %bb.0: 228; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 229; VI-NEXT: s_load_dword s4, s[4:5], 0x10 230; VI-NEXT: s_waitcnt lgkmcnt(0) 231; VI-NEXT: s_load_dword s2, s[2:3], 0x0 232; VI-NEXT: v_mov_b32_e32 v1, s1 233; VI-NEXT: v_mov_b32_e32 v2, s4 234; VI-NEXT: v_mov_b32_e32 v0, s0 235; VI-NEXT: s_lshr_b32 s0, s4, 16 236; VI-NEXT: s_waitcnt lgkmcnt(0) 237; VI-NEXT: s_lshr_b32 s1, s2, 16 238; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16 239; VI-NEXT: flat_store_dword v[0:1], v2 240; VI-NEXT: ;;#ASMSTART 241; VI-NEXT: ; use s0 242; VI-NEXT: ;;#ASMEND 243; VI-NEXT: s_endpgm 244; 245; CI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: 246; CI: ; %bb.0: 247; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 248; CI-NEXT: s_load_dword s4, s[4:5], 0x4 249; CI-NEXT: s_waitcnt lgkmcnt(0) 250; CI-NEXT: s_load_dword s2, s[2:3], 0x0 251; CI-NEXT: v_mov_b32_e32 v0, s0 252; CI-NEXT: v_mov_b32_e32 v1, s1 253; CI-NEXT: s_lshr_b32 s0, s4, 16 254; CI-NEXT: s_waitcnt lgkmcnt(0) 255; CI-NEXT: s_and_b32 s1, s2, 0xffff0000 256; CI-NEXT: s_or_b32 s1, s0, s1 257; CI-NEXT: v_mov_b32_e32 v2, s1 258; CI-NEXT: flat_store_dword v[0:1], v2 259; CI-NEXT: ;;#ASMSTART 260; CI-NEXT: ; use s0 261; 
CI-NEXT: ;;#ASMEND 262; CI-NEXT: s_endpgm 263 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr 264 %elt.hi = lshr i32 %elt.arg, 16 265 %elt = trunc i32 %elt.hi to i16 266 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 267 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out 268 %use1 = zext i16 %elt to i32 269 call void asm sideeffect "; use $0", "s"(i32 %use1) #0 270 ret void 271} 272 273define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 { 274; GFX9-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: 275; GFX9: ; %bb.0: 276; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 277; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 278; GFX9-NEXT: v_mov_b32_e32 v0, 0 279; GFX9-NEXT: s_waitcnt lgkmcnt(0) 280; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 281; GFX9-NEXT: s_lshr_b32 s3, s6, 16 282; GFX9-NEXT: s_waitcnt lgkmcnt(0) 283; GFX9-NEXT: s_lshr_b32 s2, s2, 16 284; GFX9-NEXT: s_pack_ll_b32_b16 s4, s3, s2 285; GFX9-NEXT: v_mov_b32_e32 v1, s4 286; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 287; GFX9-NEXT: ;;#ASMSTART 288; GFX9-NEXT: ; use s3 289; GFX9-NEXT: ;;#ASMEND 290; GFX9-NEXT: ;;#ASMSTART 291; GFX9-NEXT: ; use s2 292; GFX9-NEXT: ;;#ASMEND 293; GFX9-NEXT: s_endpgm 294; 295; VI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: 296; VI: ; %bb.0: 297; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 298; VI-NEXT: s_load_dword s4, s[4:5], 0x10 299; VI-NEXT: s_waitcnt lgkmcnt(0) 300; VI-NEXT: s_load_dword s2, s[2:3], 0x0 301; VI-NEXT: v_mov_b32_e32 v1, s1 302; VI-NEXT: v_mov_b32_e32 v2, s4 303; VI-NEXT: v_mov_b32_e32 v0, s0 304; VI-NEXT: s_lshr_b32 s0, s4, 16 305; VI-NEXT: s_waitcnt lgkmcnt(0) 306; VI-NEXT: s_lshr_b32 s1, s2, 16 307; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16 308; VI-NEXT: flat_store_dword v[0:1], v2 309; VI-NEXT: ;;#ASMSTART 310; VI-NEXT: ; use s0 311; VI-NEXT: ;;#ASMEND 312; VI-NEXT: ;;#ASMSTART 313; VI-NEXT: ; use s1 314; VI-NEXT: ;;#ASMEND 315; 
VI-NEXT: s_endpgm 316; 317; CI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: 318; CI: ; %bb.0: 319; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 320; CI-NEXT: s_load_dword s4, s[4:5], 0x4 321; CI-NEXT: s_waitcnt lgkmcnt(0) 322; CI-NEXT: s_load_dword s2, s[2:3], 0x0 323; CI-NEXT: v_mov_b32_e32 v1, s1 324; CI-NEXT: v_mov_b32_e32 v2, s4 325; CI-NEXT: v_mov_b32_e32 v0, s0 326; CI-NEXT: s_lshr_b32 s0, s4, 16 327; CI-NEXT: s_waitcnt lgkmcnt(0) 328; CI-NEXT: s_lshr_b32 s1, s2, 16 329; CI-NEXT: v_alignbit_b32 v2, s1, v2, 16 330; CI-NEXT: flat_store_dword v[0:1], v2 331; CI-NEXT: ;;#ASMSTART 332; CI-NEXT: ; use s0 333; CI-NEXT: ;;#ASMEND 334; CI-NEXT: ;;#ASMSTART 335; CI-NEXT: ; use s1 336; CI-NEXT: ;;#ASMEND 337; CI-NEXT: s_endpgm 338 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr 339 %elt.hi = lshr i32 %elt.arg, 16 340 %elt = trunc i32 %elt.hi to i16 341 %vec.hi = extractelement <2 x i16> %vec, i32 1 342 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 343 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out 344 %use1 = zext i16 %elt to i32 345 %vec.hi.use1 = zext i16 %vec.hi to i32 346 347 call void asm sideeffect "; use $0", "s"(i32 %use1) #0 348 call void asm sideeffect "; use $0", "s"(i32 %vec.hi.use1) #0 349 ret void 350} 351 352define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 { 353; GFX9-LABEL: s_insertelement_v2i16_1: 354; GFX9: ; %bb.0: 355; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 356; GFX9-NEXT: v_mov_b32_e32 v0, 0 357; GFX9-NEXT: s_waitcnt lgkmcnt(0) 358; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 359; GFX9-NEXT: s_waitcnt lgkmcnt(0) 360; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, 0x3e7 361; GFX9-NEXT: v_mov_b32_e32 v1, s2 362; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 363; GFX9-NEXT: s_endpgm 364; 365; CIVI-LABEL: s_insertelement_v2i16_1: 366; CIVI: ; %bb.0: 367; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 368; CIVI-NEXT: s_waitcnt lgkmcnt(0) 369; CIVI-NEXT: 
s_load_dword s2, s[2:3], 0x0 370; CIVI-NEXT: v_mov_b32_e32 v0, s0 371; CIVI-NEXT: v_mov_b32_e32 v1, s1 372; CIVI-NEXT: s_waitcnt lgkmcnt(0) 373; CIVI-NEXT: s_and_b32 s0, s2, 0xffff 374; CIVI-NEXT: s_or_b32 s0, s0, 0x3e70000 375; CIVI-NEXT: v_mov_b32_e32 v2, s0 376; CIVI-NEXT: flat_store_dword v[0:1], v2 377; CIVI-NEXT: s_endpgm 378 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr 379 %vecins = insertelement <2 x i16> %vec, i16 999, i32 1 380 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out 381 ret void 382} 383 384define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 { 385; GFX9-LABEL: s_insertelement_v2i16_1_reg: 386; GFX9: ; %bb.0: 387; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 388; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 389; GFX9-NEXT: v_mov_b32_e32 v0, 0 390; GFX9-NEXT: s_waitcnt lgkmcnt(0) 391; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 392; GFX9-NEXT: s_waitcnt lgkmcnt(0) 393; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s6 394; GFX9-NEXT: v_mov_b32_e32 v1, s2 395; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 396; GFX9-NEXT: s_endpgm 397; 398; VI-LABEL: s_insertelement_v2i16_1_reg: 399; VI: ; %bb.0: 400; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 401; VI-NEXT: s_load_dword s4, s[4:5], 0x30 402; VI-NEXT: s_waitcnt lgkmcnt(0) 403; VI-NEXT: s_load_dword s2, s[2:3], 0x0 404; VI-NEXT: v_mov_b32_e32 v0, s0 405; VI-NEXT: v_mov_b32_e32 v1, s1 406; VI-NEXT: s_lshl_b32 s0, s4, 16 407; VI-NEXT: s_waitcnt lgkmcnt(0) 408; VI-NEXT: s_and_b32 s1, s2, 0xffff 409; VI-NEXT: s_or_b32 s0, s1, s0 410; VI-NEXT: v_mov_b32_e32 v2, s0 411; VI-NEXT: flat_store_dword v[0:1], v2 412; VI-NEXT: s_endpgm 413; 414; CI-LABEL: s_insertelement_v2i16_1_reg: 415; CI: ; %bb.0: 416; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 417; CI-NEXT: s_load_dword s4, s[4:5], 0xc 418; CI-NEXT: s_waitcnt lgkmcnt(0) 419; CI-NEXT: s_load_dword s2, s[2:3], 0x0 420; CI-NEXT: v_mov_b32_e32 v0, s0 421; CI-NEXT: 
v_mov_b32_e32 v1, s1 422; CI-NEXT: s_lshl_b32 s1, s4, 16 423; CI-NEXT: s_waitcnt lgkmcnt(0) 424; CI-NEXT: s_and_b32 s0, s2, 0xffff 425; CI-NEXT: s_or_b32 s0, s0, s1 426; CI-NEXT: v_mov_b32_e32 v2, s0 427; CI-NEXT: flat_store_dword v[0:1], v2 428; CI-NEXT: s_endpgm 429 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr 430 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1 431 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out 432 ret void 433} 434 435define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 { 436; GFX9-LABEL: s_insertelement_v2f16_0: 437; GFX9: ; %bb.0: 438; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 439; GFX9-NEXT: v_mov_b32_e32 v0, 0 440; GFX9-NEXT: s_waitcnt lgkmcnt(0) 441; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 442; GFX9-NEXT: s_waitcnt lgkmcnt(0) 443; GFX9-NEXT: s_lshr_b32 s2, s2, 16 444; GFX9-NEXT: s_pack_ll_b32_b16 s2, 0x4500, s2 445; GFX9-NEXT: v_mov_b32_e32 v1, s2 446; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 447; GFX9-NEXT: s_endpgm 448; 449; CIVI-LABEL: s_insertelement_v2f16_0: 450; CIVI: ; %bb.0: 451; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 452; CIVI-NEXT: s_waitcnt lgkmcnt(0) 453; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 454; CIVI-NEXT: v_mov_b32_e32 v0, s0 455; CIVI-NEXT: v_mov_b32_e32 v1, s1 456; CIVI-NEXT: s_waitcnt lgkmcnt(0) 457; CIVI-NEXT: s_and_b32 s0, s2, 0xffff0000 458; CIVI-NEXT: s_or_b32 s0, s0, 0x4500 459; CIVI-NEXT: v_mov_b32_e32 v2, s0 460; CIVI-NEXT: flat_store_dword v[0:1], v2 461; CIVI-NEXT: s_endpgm 462 %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr 463 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0 464 store <2 x half> %vecins, <2 x half> addrspace(1)* %out 465 ret void 466} 467 468define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 { 469; GFX9-LABEL: s_insertelement_v2f16_1: 470; GFX9: ; %bb.0: 471; GFX9-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x0 472; GFX9-NEXT: v_mov_b32_e32 v0, 0 473; GFX9-NEXT: s_waitcnt lgkmcnt(0) 474; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 475; GFX9-NEXT: s_waitcnt lgkmcnt(0) 476; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, 0x4500 477; GFX9-NEXT: v_mov_b32_e32 v1, s2 478; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 479; GFX9-NEXT: s_endpgm 480; 481; CIVI-LABEL: s_insertelement_v2f16_1: 482; CIVI: ; %bb.0: 483; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 484; CIVI-NEXT: s_waitcnt lgkmcnt(0) 485; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 486; CIVI-NEXT: v_mov_b32_e32 v0, s0 487; CIVI-NEXT: v_mov_b32_e32 v1, s1 488; CIVI-NEXT: s_waitcnt lgkmcnt(0) 489; CIVI-NEXT: s_and_b32 s0, s2, 0xffff 490; CIVI-NEXT: s_or_b32 s0, s0, 0x45000000 491; CIVI-NEXT: v_mov_b32_e32 v2, s0 492; CIVI-NEXT: flat_store_dword v[0:1], v2 493; CIVI-NEXT: s_endpgm 494 %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr 495 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1 496 store <2 x half> %vecins, <2 x half> addrspace(1)* %out 497 ret void 498} 499 500define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { 501; GFX9-LABEL: v_insertelement_v2i16_0: 502; GFX9: ; %bb.0: 503; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 504; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 505; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff 506; GFX9-NEXT: s_waitcnt lgkmcnt(0) 507; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 508; GFX9-NEXT: s_movk_i32 s2, 0x3e7 509; GFX9-NEXT: s_waitcnt vmcnt(0) 510; GFX9-NEXT: v_bfi_b32 v1, v2, s2, v1 511; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 512; GFX9-NEXT: s_endpgm 513; 514; VI-LABEL: v_insertelement_v2i16_0: 515; VI: ; %bb.0: 516; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 517; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 518; VI-NEXT: s_waitcnt lgkmcnt(0) 519; VI-NEXT: v_mov_b32_e32 v1, s3 520; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 521; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 522; VI-NEXT: flat_load_dword v3, v[0:1] 523; 
VI-NEXT: v_mov_b32_e32 v1, s1 524; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 525; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 526; VI-NEXT: s_waitcnt vmcnt(0) 527; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 528; VI-NEXT: v_or_b32_e32 v2, 0x3e7, v2 529; VI-NEXT: flat_store_dword v[0:1], v2 530; VI-NEXT: s_endpgm 531; 532; CI-LABEL: v_insertelement_v2i16_0: 533; CI: ; %bb.0: 534; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 535; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 536; CI-NEXT: s_waitcnt lgkmcnt(0) 537; CI-NEXT: v_mov_b32_e32 v1, s3 538; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 539; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 540; CI-NEXT: flat_load_dword v3, v[0:1] 541; CI-NEXT: v_mov_b32_e32 v1, s1 542; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 543; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 544; CI-NEXT: s_waitcnt vmcnt(0) 545; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 546; CI-NEXT: v_or_b32_e32 v2, 0x3e7, v2 547; CI-NEXT: flat_store_dword v[0:1], v2 548; CI-NEXT: s_endpgm 549 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 550 %tid.ext = sext i32 %tid to i64 551 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext 552 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext 553 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep 554 %vecins = insertelement <2 x i16> %vec, i16 999, i32 0 555 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep 556 ret void 557} 558 559define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %elt.arg) #0 { 560; GFX9-LABEL: v_insertelement_v2i16_0_reghi: 561; GFX9: ; %bb.0: 562; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 563; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 564; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 565; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff0000 566; GFX9-NEXT: s_waitcnt lgkmcnt(0) 567; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 568; GFX9-NEXT: v_lshrrev_b32_e64 v2, 16, s6 569; GFX9-NEXT: 
s_waitcnt vmcnt(0) 570; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 571; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 572; GFX9-NEXT: s_endpgm 573; 574; VI-LABEL: v_insertelement_v2i16_0_reghi: 575; VI: ; %bb.0: 576; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 577; VI-NEXT: s_load_dword s4, s[4:5], 0x10 578; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 579; VI-NEXT: s_waitcnt lgkmcnt(0) 580; VI-NEXT: v_mov_b32_e32 v1, s3 581; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 582; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 583; VI-NEXT: flat_load_dword v3, v[0:1] 584; VI-NEXT: v_mov_b32_e32 v1, s1 585; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 586; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 587; VI-NEXT: s_waitcnt vmcnt(0) 588; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 589; VI-NEXT: v_alignbit_b32 v2, v2, s4, 16 590; VI-NEXT: flat_store_dword v[0:1], v2 591; VI-NEXT: s_endpgm 592; 593; CI-LABEL: v_insertelement_v2i16_0_reghi: 594; CI: ; %bb.0: 595; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 596; CI-NEXT: s_load_dword s4, s[4:5], 0x4 597; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 598; CI-NEXT: s_waitcnt lgkmcnt(0) 599; CI-NEXT: v_mov_b32_e32 v1, s3 600; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 601; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 602; CI-NEXT: flat_load_dword v3, v[0:1] 603; CI-NEXT: v_mov_b32_e32 v1, s1 604; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 605; CI-NEXT: s_lshr_b32 s0, s4, 16 606; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 607; CI-NEXT: s_waitcnt vmcnt(0) 608; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 609; CI-NEXT: v_or_b32_e32 v2, s0, v2 610; CI-NEXT: flat_store_dword v[0:1], v2 611; CI-NEXT: s_endpgm 612 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 613 %tid.ext = sext i32 %tid to i64 614 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext 615 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext 616 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep 617 %elt.hi = lshr i32 %elt.arg, 16 618 %elt = trunc i32 
%elt.hi to i16 619 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 620 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep 621 ret void 622} 623 624define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { 625; GFX9-LABEL: v_insertelement_v2i16_0_inlineimm: 626; GFX9: ; %bb.0: 627; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 628; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 629; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff 630; GFX9-NEXT: s_waitcnt lgkmcnt(0) 631; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 632; GFX9-NEXT: s_waitcnt vmcnt(0) 633; GFX9-NEXT: v_bfi_b32 v1, v2, 53, v1 634; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 635; GFX9-NEXT: s_endpgm 636; 637; VI-LABEL: v_insertelement_v2i16_0_inlineimm: 638; VI: ; %bb.0: 639; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 640; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 641; VI-NEXT: s_waitcnt lgkmcnt(0) 642; VI-NEXT: v_mov_b32_e32 v1, s3 643; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 644; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 645; VI-NEXT: flat_load_dword v3, v[0:1] 646; VI-NEXT: v_mov_b32_e32 v1, s1 647; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 648; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 649; VI-NEXT: s_waitcnt vmcnt(0) 650; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 651; VI-NEXT: v_or_b32_e32 v2, 53, v2 652; VI-NEXT: flat_store_dword v[0:1], v2 653; VI-NEXT: s_endpgm 654; 655; CI-LABEL: v_insertelement_v2i16_0_inlineimm: 656; CI: ; %bb.0: 657; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 658; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 659; CI-NEXT: s_waitcnt lgkmcnt(0) 660; CI-NEXT: v_mov_b32_e32 v1, s3 661; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 662; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 663; CI-NEXT: flat_load_dword v3, v[0:1] 664; CI-NEXT: v_mov_b32_e32 v1, s1 665; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 666; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 667; CI-NEXT: s_waitcnt vmcnt(0) 668; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 669; 
CI-NEXT: v_or_b32_e32 v2, 53, v2 670; CI-NEXT: flat_store_dword v[0:1], v2 671; CI-NEXT: s_endpgm 672 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 673 %tid.ext = sext i32 %tid to i64 674 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext 675 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext 676 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep 677 %vecins = insertelement <2 x i16> %vec, i16 53, i32 0 678 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep 679 ret void 680} 681 682; FIXME: fold lshl_or c0, c1, v0 -> or (c0 << c1), v0 683define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { 684; GFX9-LABEL: v_insertelement_v2i16_1: 685; GFX9: ; %bb.0: 686; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 687; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 688; GFX9-NEXT: s_waitcnt lgkmcnt(0) 689; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 690; GFX9-NEXT: s_movk_i32 s2, 0x3e7 691; GFX9-NEXT: s_waitcnt vmcnt(0) 692; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 693; GFX9-NEXT: v_lshl_or_b32 v1, s2, 16, v1 694; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 695; GFX9-NEXT: s_endpgm 696; 697; VI-LABEL: v_insertelement_v2i16_1: 698; VI: ; %bb.0: 699; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 700; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 701; VI-NEXT: s_waitcnt lgkmcnt(0) 702; VI-NEXT: v_mov_b32_e32 v1, s3 703; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 704; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 705; VI-NEXT: flat_load_dword v3, v[0:1] 706; VI-NEXT: v_mov_b32_e32 v1, s1 707; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 708; VI-NEXT: v_mov_b32_e32 v2, 0x3e70000 709; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 710; VI-NEXT: s_waitcnt vmcnt(0) 711; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 712; VI-NEXT: flat_store_dword v[0:1], v2 713; VI-NEXT: s_endpgm 714; 715; CI-LABEL: 
v_insertelement_v2i16_1: 716; CI: ; %bb.0: 717; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 718; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 719; CI-NEXT: s_waitcnt lgkmcnt(0) 720; CI-NEXT: v_mov_b32_e32 v1, s3 721; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 722; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 723; CI-NEXT: flat_load_dword v3, v[0:1] 724; CI-NEXT: v_mov_b32_e32 v1, s1 725; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 726; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 727; CI-NEXT: s_waitcnt vmcnt(0) 728; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 729; CI-NEXT: v_or_b32_e32 v2, 0x3e70000, v2 730; CI-NEXT: flat_store_dword v[0:1], v2 731; CI-NEXT: s_endpgm 732 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 733 %tid.ext = sext i32 %tid to i64 734 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext 735 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext 736 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep 737 %vecins = insertelement <2 x i16> %vec, i16 999, i32 1 738 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep 739 ret void 740} 741 742define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { 743; GFX9-LABEL: v_insertelement_v2i16_1_inlineimm: 744; GFX9: ; %bb.0: 745; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 746; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 747; GFX9-NEXT: s_waitcnt lgkmcnt(0) 748; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 749; GFX9-NEXT: s_waitcnt vmcnt(0) 750; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 751; GFX9-NEXT: v_lshl_or_b32 v1, -15, 16, v1 752; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 753; GFX9-NEXT: s_endpgm 754; 755; VI-LABEL: v_insertelement_v2i16_1_inlineimm: 756; VI: ; %bb.0: 757; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 758; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 759; VI-NEXT: s_waitcnt lgkmcnt(0) 760; VI-NEXT: v_mov_b32_e32 v1, s3 761; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 762; VI-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc 763; VI-NEXT: flat_load_dword v3, v[0:1] 764; VI-NEXT: v_mov_b32_e32 v1, s1 765; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 766; VI-NEXT: v_mov_b32_e32 v2, 0xfff10000 767; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 768; VI-NEXT: s_waitcnt vmcnt(0) 769; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 770; VI-NEXT: flat_store_dword v[0:1], v2 771; VI-NEXT: s_endpgm 772; 773; CI-LABEL: v_insertelement_v2i16_1_inlineimm: 774; CI: ; %bb.0: 775; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 776; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 777; CI-NEXT: s_waitcnt lgkmcnt(0) 778; CI-NEXT: v_mov_b32_e32 v1, s3 779; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 780; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 781; CI-NEXT: flat_load_dword v3, v[0:1] 782; CI-NEXT: v_mov_b32_e32 v1, s1 783; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 784; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 785; CI-NEXT: s_waitcnt vmcnt(0) 786; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 787; CI-NEXT: v_or_b32_e32 v2, 0xfff10000, v2 788; CI-NEXT: flat_store_dword v[0:1], v2 789; CI-NEXT: s_endpgm 790 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 791 %tid.ext = sext i32 %tid to i64 792 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext 793 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext 794 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep 795 %vecins = insertelement <2 x i16> %vec, i16 -15, i32 1 796 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep 797 ret void 798} 799 800define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { 801; GFX9-LABEL: v_insertelement_v2f16_0: 802; GFX9: ; %bb.0: 803; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 804; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 805; GFX9-NEXT: v_mov_b32_e32 v2, 0x4500 806; GFX9-NEXT: s_waitcnt lgkmcnt(0) 807; GFX9-NEXT: global_load_dword v1, 
v0, s[2:3] 808; GFX9-NEXT: s_waitcnt vmcnt(0) 809; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 810; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 811; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 812; GFX9-NEXT: s_endpgm 813; 814; VI-LABEL: v_insertelement_v2f16_0: 815; VI: ; %bb.0: 816; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 817; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 818; VI-NEXT: s_waitcnt lgkmcnt(0) 819; VI-NEXT: v_mov_b32_e32 v1, s3 820; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 821; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 822; VI-NEXT: flat_load_dword v3, v[0:1] 823; VI-NEXT: v_mov_b32_e32 v1, s1 824; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 825; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 826; VI-NEXT: s_waitcnt vmcnt(0) 827; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 828; VI-NEXT: v_or_b32_e32 v2, 0x4500, v2 829; VI-NEXT: flat_store_dword v[0:1], v2 830; VI-NEXT: s_endpgm 831; 832; CI-LABEL: v_insertelement_v2f16_0: 833; CI: ; %bb.0: 834; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 835; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 836; CI-NEXT: s_waitcnt lgkmcnt(0) 837; CI-NEXT: v_mov_b32_e32 v1, s3 838; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 839; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 840; CI-NEXT: flat_load_dword v3, v[0:1] 841; CI-NEXT: v_mov_b32_e32 v1, s1 842; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 843; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 844; CI-NEXT: s_waitcnt vmcnt(0) 845; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 846; CI-NEXT: v_or_b32_e32 v2, 0x4500, v2 847; CI-NEXT: flat_store_dword v[0:1], v2 848; CI-NEXT: s_endpgm 849 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 850 %tid.ext = sext i32 %tid to i64 851 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext 852 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext 853 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep 854 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0 855 store <2 x half> %vecins, <2 x 
half> addrspace(1)* %out.gep 856 ret void 857} 858 859define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { 860; GFX9-LABEL: v_insertelement_v2f16_0_inlineimm: 861; GFX9: ; %bb.0: 862; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 863; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 864; GFX9-NEXT: s_waitcnt lgkmcnt(0) 865; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 866; GFX9-NEXT: s_waitcnt vmcnt(0) 867; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 868; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, 53 869; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 870; GFX9-NEXT: s_endpgm 871; 872; VI-LABEL: v_insertelement_v2f16_0_inlineimm: 873; VI: ; %bb.0: 874; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 875; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 876; VI-NEXT: s_waitcnt lgkmcnt(0) 877; VI-NEXT: v_mov_b32_e32 v1, s3 878; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 879; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 880; VI-NEXT: flat_load_dword v3, v[0:1] 881; VI-NEXT: v_mov_b32_e32 v1, s1 882; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 883; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 884; VI-NEXT: s_waitcnt vmcnt(0) 885; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 886; VI-NEXT: v_or_b32_e32 v2, 53, v2 887; VI-NEXT: flat_store_dword v[0:1], v2 888; VI-NEXT: s_endpgm 889; 890; CI-LABEL: v_insertelement_v2f16_0_inlineimm: 891; CI: ; %bb.0: 892; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 893; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 894; CI-NEXT: s_waitcnt lgkmcnt(0) 895; CI-NEXT: v_mov_b32_e32 v1, s3 896; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 897; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 898; CI-NEXT: flat_load_dword v3, v[0:1] 899; CI-NEXT: v_mov_b32_e32 v1, s1 900; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 901; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 902; CI-NEXT: s_waitcnt vmcnt(0) 903; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 904; CI-NEXT: v_or_b32_e32 v2, 53, v2 905; CI-NEXT: flat_store_dword v[0:1], v2 906; CI-NEXT: s_endpgm 907 
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1 908 %tid.ext = sext i32 %tid to i64 909 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext 910 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext 911 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep 912 %vecins = insertelement <2 x half> %vec, half 0xH0035, i32 0 913 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep 914 ret void 915} 916 917define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { 918; GFX9-LABEL: v_insertelement_v2f16_1: 919; GFX9: ; %bb.0: 920; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 921; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 922; GFX9-NEXT: s_waitcnt lgkmcnt(0) 923; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 924; GFX9-NEXT: s_movk_i32 s2, 0x4500 925; GFX9-NEXT: s_waitcnt vmcnt(0) 926; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 927; GFX9-NEXT: v_lshl_or_b32 v1, s2, 16, v1 928; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 929; GFX9-NEXT: s_endpgm 930; 931; VI-LABEL: v_insertelement_v2f16_1: 932; VI: ; %bb.0: 933; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 934; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 935; VI-NEXT: s_waitcnt lgkmcnt(0) 936; VI-NEXT: v_mov_b32_e32 v1, s3 937; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 938; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 939; VI-NEXT: flat_load_dword v3, v[0:1] 940; VI-NEXT: v_mov_b32_e32 v1, s1 941; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 942; VI-NEXT: v_mov_b32_e32 v2, 0x45000000 943; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 944; VI-NEXT: s_waitcnt vmcnt(0) 945; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 946; VI-NEXT: flat_store_dword v[0:1], v2 947; VI-NEXT: s_endpgm 948; 949; CI-LABEL: v_insertelement_v2f16_1: 950; CI: ; %bb.0: 951; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 952; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 953; CI-NEXT: s_waitcnt 
lgkmcnt(0) 954; CI-NEXT: v_mov_b32_e32 v1, s3 955; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 956; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 957; CI-NEXT: flat_load_dword v3, v[0:1] 958; CI-NEXT: v_mov_b32_e32 v1, s1 959; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 960; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 961; CI-NEXT: s_waitcnt vmcnt(0) 962; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 963; CI-NEXT: v_or_b32_e32 v2, 0x45000000, v2 964; CI-NEXT: flat_store_dword v[0:1], v2 965; CI-NEXT: s_endpgm 966 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 967 %tid.ext = sext i32 %tid to i64 968 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext 969 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext 970 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep 971 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1 972 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep 973 ret void 974} 975 976define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { 977; GFX9-LABEL: v_insertelement_v2f16_1_inlineimm: 978; GFX9: ; %bb.0: 979; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 980; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 981; GFX9-NEXT: s_waitcnt lgkmcnt(0) 982; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 983; GFX9-NEXT: s_waitcnt vmcnt(0) 984; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 985; GFX9-NEXT: v_lshl_or_b32 v1, 35, 16, v1 986; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 987; GFX9-NEXT: s_endpgm 988; 989; VI-LABEL: v_insertelement_v2f16_1_inlineimm: 990; VI: ; %bb.0: 991; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 992; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 993; VI-NEXT: s_waitcnt lgkmcnt(0) 994; VI-NEXT: v_mov_b32_e32 v1, s3 995; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 996; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 997; VI-NEXT: flat_load_dword v3, v[0:1] 998; VI-NEXT: v_mov_b32_e32 v1, s1 999; VI-NEXT: v_add_u32_e32 
v0, vcc, s0, v2 1000; VI-NEXT: v_mov_b32_e32 v2, 0x230000 1001; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1002; VI-NEXT: s_waitcnt vmcnt(0) 1003; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1004; VI-NEXT: flat_store_dword v[0:1], v2 1005; VI-NEXT: s_endpgm 1006; 1007; CI-LABEL: v_insertelement_v2f16_1_inlineimm: 1008; CI: ; %bb.0: 1009; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1010; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1011; CI-NEXT: s_waitcnt lgkmcnt(0) 1012; CI-NEXT: v_mov_b32_e32 v1, s3 1013; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1014; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1015; CI-NEXT: flat_load_dword v3, v[0:1] 1016; CI-NEXT: v_mov_b32_e32 v1, s1 1017; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 1018; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1019; CI-NEXT: s_waitcnt vmcnt(0) 1020; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 1021; CI-NEXT: v_or_b32_e32 v2, 0x230000, v2 1022; CI-NEXT: flat_store_dword v[0:1], v2 1023; CI-NEXT: s_endpgm 1024 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1025 %tid.ext = sext i32 %tid to i64 1026 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext 1027 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext 1028 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep 1029 %vecins = insertelement <2 x half> %vec, half 0xH0023, i32 1 1030 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep 1031 ret void 1032} 1033 1034; FIXME: Enable for others when argument load not split 1035define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(4)* %idx.ptr) #0 { 1036; GFX9-LABEL: s_insertelement_v2i16_dynamic: 1037; GFX9: ; %bb.0: 1038; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 1039; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1040; GFX9-NEXT: v_mov_b32_e32 v0, 0 1041; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1042; GFX9-NEXT: 
s_load_dword s4, s[6:7], 0x0 1043; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0 1044; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1045; GFX9-NEXT: s_lshl_b32 s2, s4, 4 1046; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2 1047; GFX9-NEXT: s_andn2_b32 s3, s5, s2 1048; GFX9-NEXT: s_and_b32 s2, s2, 0x3e703e7 1049; GFX9-NEXT: s_or_b32 s2, s2, s3 1050; GFX9-NEXT: v_mov_b32_e32 v1, s2 1051; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1052; GFX9-NEXT: s_endpgm 1053; 1054; VI-LABEL: s_insertelement_v2i16_dynamic: 1055; VI: ; %bb.0: 1056; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 1057; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1058; VI-NEXT: s_waitcnt lgkmcnt(0) 1059; VI-NEXT: s_load_dword s4, s[6:7], 0x0 1060; VI-NEXT: s_load_dword s2, s[2:3], 0x0 1061; VI-NEXT: v_mov_b32_e32 v0, s0 1062; VI-NEXT: v_mov_b32_e32 v1, s1 1063; VI-NEXT: s_waitcnt lgkmcnt(0) 1064; VI-NEXT: s_lshl_b32 s0, s4, 4 1065; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 1066; VI-NEXT: s_andn2_b32 s1, s2, s0 1067; VI-NEXT: s_and_b32 s0, s0, 0x3e703e7 1068; VI-NEXT: s_or_b32 s0, s0, s1 1069; VI-NEXT: v_mov_b32_e32 v2, s0 1070; VI-NEXT: flat_store_dword v[0:1], v2 1071; VI-NEXT: s_endpgm 1072; 1073; CI-LABEL: s_insertelement_v2i16_dynamic: 1074; CI: ; %bb.0: 1075; CI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4 1076; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1077; CI-NEXT: s_waitcnt lgkmcnt(0) 1078; CI-NEXT: s_load_dword s4, s[6:7], 0x0 1079; CI-NEXT: s_load_dword s2, s[2:3], 0x0 1080; CI-NEXT: v_mov_b32_e32 v0, s0 1081; CI-NEXT: v_mov_b32_e32 v1, s1 1082; CI-NEXT: s_waitcnt lgkmcnt(0) 1083; CI-NEXT: s_lshl_b32 s0, s4, 4 1084; CI-NEXT: s_lshl_b32 s0, 0xffff, s0 1085; CI-NEXT: s_andn2_b32 s1, s2, s0 1086; CI-NEXT: s_and_b32 s0, s0, 0x3e703e7 1087; CI-NEXT: s_or_b32 s0, s0, s1 1088; CI-NEXT: v_mov_b32_e32 v2, s0 1089; CI-NEXT: flat_store_dword v[0:1], v2 1090; CI-NEXT: s_endpgm 1091 %idx = load volatile i32, i32 addrspace(4)* %idx.ptr 1092 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr 1093 %vecins = insertelement <2 x i16> %vec, i16 
999, i32 %idx 1094 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out 1095 ret void 1096} 1097 1098define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %idx) #0 { 1099; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr: 1100; GFX9: ; %bb.0: 1101; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1102; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 1103; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1104; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7 1105; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1106; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1107; GFX9-NEXT: s_lshl_b32 s2, s6, 4 1108; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2 1109; GFX9-NEXT: s_waitcnt vmcnt(0) 1110; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1 1111; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1112; GFX9-NEXT: s_endpgm 1113; 1114; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr: 1115; VI: ; %bb.0: 1116; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1117; VI-NEXT: s_load_dword s4, s[4:5], 0x10 1118; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1119; VI-NEXT: s_waitcnt lgkmcnt(0) 1120; VI-NEXT: v_mov_b32_e32 v1, s3 1121; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1122; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1123; VI-NEXT: flat_load_dword v3, v[0:1] 1124; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1125; VI-NEXT: s_lshl_b32 s0, s4, 4 1126; VI-NEXT: v_mov_b32_e32 v1, s1 1127; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 1128; VI-NEXT: v_mov_b32_e32 v2, 0x3e703e7 1129; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1130; VI-NEXT: s_waitcnt vmcnt(0) 1131; VI-NEXT: v_bfi_b32 v2, s0, v2, v3 1132; VI-NEXT: flat_store_dword v[0:1], v2 1133; VI-NEXT: s_endpgm 1134; 1135; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr: 1136; CI: ; %bb.0: 1137; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1138; CI-NEXT: s_load_dword s4, s[4:5], 0x4 1139; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1140; CI-NEXT: s_waitcnt lgkmcnt(0) 1141; CI-NEXT: v_mov_b32_e32 v1, s3 1142; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1143; CI-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc 1144; CI-NEXT: flat_load_dword v3, v[0:1] 1145; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 1146; CI-NEXT: s_lshl_b32 s0, s4, 4 1147; CI-NEXT: v_mov_b32_e32 v1, s1 1148; CI-NEXT: s_lshl_b32 s0, 0xffff, s0 1149; CI-NEXT: v_mov_b32_e32 v2, 0x3e703e7 1150; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1151; CI-NEXT: s_waitcnt vmcnt(0) 1152; CI-NEXT: v_bfi_b32 v2, s0, v2, v3 1153; CI-NEXT: flat_store_dword v[0:1], v2 1154; CI-NEXT: s_endpgm 1155 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1156 %tid.ext = sext i32 %tid to i64 1157 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext 1158 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext 1159 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep 1160 %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx 1161 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep 1162 ret void 1163} 1164 1165define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { 1166; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr: 1167; GFX9: ; %bb.0: 1168; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 1169; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1170; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1171; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1172; GFX9-NEXT: global_load_dword v1, v0, s[6:7] 1173; GFX9-NEXT: global_load_dword v2, v0, s[2:3] 1174; GFX9-NEXT: s_mov_b32 s2, 0xffff 1175; GFX9-NEXT: s_waitcnt vmcnt(1) 1176; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 1177; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2 1178; GFX9-NEXT: s_mov_b32 s2, 0x12341234 1179; GFX9-NEXT: s_waitcnt vmcnt(0) 1180; GFX9-NEXT: v_bfi_b32 v1, v1, s2, v2 1181; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1182; GFX9-NEXT: s_endpgm 1183; 1184; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr: 1185; VI: ; %bb.0: 1186; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1187; VI-NEXT: s_load_dwordx2 
s[4:5], s[4:5], 0x10 1188; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1189; VI-NEXT: s_waitcnt lgkmcnt(0) 1190; VI-NEXT: v_mov_b32_e32 v3, s3 1191; VI-NEXT: v_mov_b32_e32 v1, s5 1192; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 1193; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1194; VI-NEXT: flat_load_dword v4, v[0:1] 1195; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1196; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc 1197; VI-NEXT: flat_load_dword v3, v[0:1] 1198; VI-NEXT: s_mov_b32 s2, 0xffff 1199; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1200; VI-NEXT: v_mov_b32_e32 v1, s1 1201; VI-NEXT: s_mov_b32 s0, 0x12341234 1202; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1203; VI-NEXT: s_waitcnt vmcnt(1) 1204; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v4 1205; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s2 1206; VI-NEXT: s_waitcnt vmcnt(0) 1207; VI-NEXT: v_bfi_b32 v2, v2, s0, v3 1208; VI-NEXT: flat_store_dword v[0:1], v2 1209; VI-NEXT: s_endpgm 1210; 1211; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr: 1212; CI: ; %bb.0: 1213; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1214; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 1215; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1216; CI-NEXT: s_waitcnt lgkmcnt(0) 1217; CI-NEXT: v_mov_b32_e32 v3, s3 1218; CI-NEXT: v_mov_b32_e32 v1, s5 1219; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2 1220; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1221; CI-NEXT: flat_load_dword v4, v[0:1] 1222; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1223; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc 1224; CI-NEXT: flat_load_dword v3, v[0:1] 1225; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 1226; CI-NEXT: v_mov_b32_e32 v1, s1 1227; CI-NEXT: s_mov_b32 s0, 0x12341234 1228; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1229; CI-NEXT: s_waitcnt vmcnt(1) 1230; CI-NEXT: v_lshlrev_b32_e32 v2, 4, v4 1231; CI-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 1232; CI-NEXT: s_waitcnt vmcnt(0) 1233; CI-NEXT: v_bfi_b32 v2, v2, s0, v3 1234; CI-NEXT: flat_store_dword v[0:1], v2 1235; CI-NEXT: s_endpgm 1236 %tid = call i32 
@llvm.amdgcn.workitem.id.x() #1 1237 %tid.ext = sext i32 %tid to i64 1238 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext 1239 %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext 1240 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext 1241 %idx = load i32, i32 addrspace(1)* %idx.gep 1242 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep 1243 %vecins = insertelement <2 x half> %vec, half 0xH1234, i32 %idx 1244 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep 1245 ret void 1246} 1247 1248define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 { 1249; GFX9-LABEL: v_insertelement_v4f16_0: 1250; GFX9: ; %bb.0: 1251; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1252; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 1253; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1254; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff 1255; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1256; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 1257; GFX9-NEXT: s_waitcnt vmcnt(0) 1258; GFX9-NEXT: v_bfi_b32 v0, v3, s6, v0 1259; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1260; GFX9-NEXT: s_endpgm 1261; 1262; VI-LABEL: v_insertelement_v4f16_0: 1263; VI: ; %bb.0: 1264; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1265; VI-NEXT: s_load_dword s4, s[4:5], 0x30 1266; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1267; VI-NEXT: s_waitcnt lgkmcnt(0) 1268; VI-NEXT: v_mov_b32_e32 v1, s3 1269; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1270; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1271; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1272; VI-NEXT: v_mov_b32_e32 v3, s1 1273; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1274; VI-NEXT: s_mov_b32 s0, 0xffff 1275; VI-NEXT: v_mov_b32_e32 v4, s4 1276; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1277; VI-NEXT: s_waitcnt vmcnt(0) 1278; VI-NEXT: v_bfi_b32 v0, s0, v4, v0 1279; VI-NEXT: flat_store_dwordx2 v[2:3], 
v[0:1] 1280; VI-NEXT: s_endpgm 1281; 1282; CI-LABEL: v_insertelement_v4f16_0: 1283; CI: ; %bb.0: 1284; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1285; CI-NEXT: s_load_dword s4, s[4:5], 0xc 1286; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1287; CI-NEXT: s_waitcnt lgkmcnt(0) 1288; CI-NEXT: v_mov_b32_e32 v1, s3 1289; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1290; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1291; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1292; CI-NEXT: v_mov_b32_e32 v3, s1 1293; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 1294; CI-NEXT: s_mov_b32 s0, 0xffff 1295; CI-NEXT: v_mov_b32_e32 v4, s4 1296; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1297; CI-NEXT: s_waitcnt vmcnt(0) 1298; CI-NEXT: v_bfi_b32 v0, s0, v4, v0 1299; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1300; CI-NEXT: s_endpgm 1301 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1302 %tid.ext = sext i32 %tid to i64 1303 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext 1304 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext 1305 %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep 1306 %val.trunc = trunc i32 %val to i16 1307 %val.cvt = bitcast i16 %val.trunc to half 1308 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 0 1309 store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep 1310 ret void 1311} 1312 1313define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 { 1314; GFX9-LABEL: v_insertelement_v4f16_1: 1315; GFX9: ; %bb.0: 1316; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1317; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 1318; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1319; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1320; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 1321; GFX9-NEXT: s_waitcnt vmcnt(0) 1322; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 1323; GFX9-NEXT: v_lshl_or_b32 v0, s6, 16, v0 1324; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], 
s[0:1] 1325; GFX9-NEXT: s_endpgm 1326; 1327; VI-LABEL: v_insertelement_v4f16_1: 1328; VI: ; %bb.0: 1329; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1330; VI-NEXT: s_load_dword s4, s[4:5], 0x10 1331; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1332; VI-NEXT: s_waitcnt lgkmcnt(0) 1333; VI-NEXT: v_mov_b32_e32 v1, s3 1334; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1335; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1336; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1337; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1338; VI-NEXT: s_lshl_b32 s0, s4, 16 1339; VI-NEXT: v_mov_b32_e32 v3, s1 1340; VI-NEXT: v_mov_b32_e32 v4, s0 1341; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1342; VI-NEXT: s_waitcnt vmcnt(0) 1343; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1344; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1345; VI-NEXT: s_endpgm 1346; 1347; CI-LABEL: v_insertelement_v4f16_1: 1348; CI: ; %bb.0: 1349; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1350; CI-NEXT: s_load_dword s4, s[4:5], 0x4 1351; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1352; CI-NEXT: s_waitcnt lgkmcnt(0) 1353; CI-NEXT: v_mov_b32_e32 v1, s3 1354; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1355; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1356; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1357; CI-NEXT: v_mov_b32_e32 v3, s1 1358; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 1359; CI-NEXT: s_lshl_b32 s0, s4, 16 1360; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1361; CI-NEXT: s_waitcnt vmcnt(0) 1362; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 1363; CI-NEXT: v_or_b32_e32 v0, s0, v0 1364; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1365; CI-NEXT: s_endpgm 1366 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1367 %tid.ext = sext i32 %tid to i64 1368 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext 1369 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext 1370 %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep 1371 
%val.trunc = trunc i32 %val to i16 1372 %val.cvt = bitcast i16 %val.trunc to half 1373 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 1 1374 store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep 1375 ret void 1376} 1377 1378define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 { 1379; GFX9-LABEL: v_insertelement_v4f16_2: 1380; GFX9: ; %bb.0: 1381; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1382; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30 1383; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1384; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff 1385; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1386; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 1387; GFX9-NEXT: s_waitcnt vmcnt(0) 1388; GFX9-NEXT: v_bfi_b32 v1, v3, s6, v1 1389; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1390; GFX9-NEXT: s_endpgm 1391; 1392; VI-LABEL: v_insertelement_v4f16_2: 1393; VI: ; %bb.0: 1394; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1395; VI-NEXT: s_load_dword s4, s[4:5], 0x30 1396; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1397; VI-NEXT: s_waitcnt lgkmcnt(0) 1398; VI-NEXT: v_mov_b32_e32 v1, s3 1399; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1400; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1401; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1402; VI-NEXT: v_mov_b32_e32 v3, s1 1403; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1404; VI-NEXT: s_mov_b32 s0, 0xffff 1405; VI-NEXT: v_mov_b32_e32 v4, s4 1406; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1407; VI-NEXT: s_waitcnt vmcnt(0) 1408; VI-NEXT: v_bfi_b32 v1, s0, v4, v1 1409; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1410; VI-NEXT: s_endpgm 1411; 1412; CI-LABEL: v_insertelement_v4f16_2: 1413; CI: ; %bb.0: 1414; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1415; CI-NEXT: s_load_dword s4, s[4:5], 0xc 1416; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1417; CI-NEXT: s_waitcnt lgkmcnt(0) 1418; CI-NEXT: v_mov_b32_e32 v1, s3 1419; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1420; CI-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc 1421; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1422; CI-NEXT: v_mov_b32_e32 v3, s1 1423; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 1424; CI-NEXT: s_mov_b32 s0, 0xffff 1425; CI-NEXT: v_mov_b32_e32 v4, s4 1426; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1427; CI-NEXT: s_waitcnt vmcnt(0) 1428; CI-NEXT: v_bfi_b32 v1, s0, v4, v1 1429; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1430; CI-NEXT: s_endpgm 1431 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1432 %tid.ext = sext i32 %tid to i64 1433 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext 1434 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext 1435 %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep 1436 %val.trunc = trunc i32 %val to i16 1437 %val.cvt = bitcast i16 %val.trunc to half 1438 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 2 1439 store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep 1440 ret void 1441} 1442 1443define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 { 1444; GFX9-LABEL: v_insertelement_v4f16_3: 1445; GFX9: ; %bb.0: 1446; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1447; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 1448; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1449; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1450; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 1451; GFX9-NEXT: s_waitcnt vmcnt(0) 1452; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 1453; GFX9-NEXT: v_lshl_or_b32 v1, s6, 16, v1 1454; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1455; GFX9-NEXT: s_endpgm 1456; 1457; VI-LABEL: v_insertelement_v4f16_3: 1458; VI: ; %bb.0: 1459; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1460; VI-NEXT: s_load_dword s4, s[4:5], 0x10 1461; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1462; VI-NEXT: s_waitcnt lgkmcnt(0) 1463; VI-NEXT: v_mov_b32_e32 v1, s3 1464; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1465; VI-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc 1466; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1467; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1468; VI-NEXT: s_lshl_b32 s0, s4, 16 1469; VI-NEXT: v_mov_b32_e32 v3, s1 1470; VI-NEXT: v_mov_b32_e32 v4, s0 1471; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1472; VI-NEXT: s_waitcnt vmcnt(0) 1473; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1474; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1475; VI-NEXT: s_endpgm 1476; 1477; CI-LABEL: v_insertelement_v4f16_3: 1478; CI: ; %bb.0: 1479; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1480; CI-NEXT: s_load_dword s4, s[4:5], 0x4 1481; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1482; CI-NEXT: s_waitcnt lgkmcnt(0) 1483; CI-NEXT: v_mov_b32_e32 v1, s3 1484; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1485; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1486; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1487; CI-NEXT: v_mov_b32_e32 v3, s1 1488; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 1489; CI-NEXT: s_lshl_b32 s0, s4, 16 1490; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1491; CI-NEXT: s_waitcnt vmcnt(0) 1492; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 1493; CI-NEXT: v_or_b32_e32 v1, s0, v1 1494; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1495; CI-NEXT: s_endpgm 1496 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1497 %tid.ext = sext i32 %tid to i64 1498 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext 1499 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext 1500 %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep 1501 %val.trunc = trunc i32 %val to i16 1502 %val.cvt = bitcast i16 %val.trunc to half 1503 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 3 1504 store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep 1505 ret void 1506} 1507 1508define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 { 1509; 
GFX9-LABEL: v_insertelement_v4i16_2: 1510; GFX9: ; %bb.0: 1511; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1512; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 1513; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1514; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff 1515; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1516; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 1517; GFX9-NEXT: s_waitcnt vmcnt(0) 1518; GFX9-NEXT: v_bfi_b32 v1, v3, s6, v1 1519; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1520; GFX9-NEXT: s_endpgm 1521; 1522; VI-LABEL: v_insertelement_v4i16_2: 1523; VI: ; %bb.0: 1524; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1525; VI-NEXT: s_load_dword s4, s[4:5], 0x10 1526; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1527; VI-NEXT: s_waitcnt lgkmcnt(0) 1528; VI-NEXT: v_mov_b32_e32 v1, s3 1529; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1530; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1531; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1532; VI-NEXT: v_mov_b32_e32 v3, s1 1533; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1534; VI-NEXT: s_mov_b32 s0, 0xffff 1535; VI-NEXT: v_mov_b32_e32 v4, s4 1536; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1537; VI-NEXT: s_waitcnt vmcnt(0) 1538; VI-NEXT: v_bfi_b32 v1, s0, v4, v1 1539; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1540; VI-NEXT: s_endpgm 1541; 1542; CI-LABEL: v_insertelement_v4i16_2: 1543; CI: ; %bb.0: 1544; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1545; CI-NEXT: s_load_dword s4, s[4:5], 0x4 1546; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1547; CI-NEXT: s_waitcnt lgkmcnt(0) 1548; CI-NEXT: v_mov_b32_e32 v1, s3 1549; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1550; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1551; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1552; CI-NEXT: v_mov_b32_e32 v3, s1 1553; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 1554; CI-NEXT: s_mov_b32 s0, 0xffff 1555; CI-NEXT: v_mov_b32_e32 v4, s4 1556; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1557; CI-NEXT: s_waitcnt vmcnt(0) 1558; CI-NEXT: v_bfi_b32 v1, s0, v4, v1 1559; CI-NEXT: flat_store_dwordx2 
v[2:3], v[0:1] 1560; CI-NEXT: s_endpgm 1561 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1562 %tid.ext = sext i32 %tid to i64 1563 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext 1564 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext 1565 %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep 1566 %val.trunc = trunc i32 %val to i16 1567 %val.cvt = bitcast i16 %val.trunc to i16 1568 %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 2 1569 store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep 1570 ret void 1571} 1572 1573; FIXME: Better code on CI? 1574define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 { 1575; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr: 1576; GFX9: ; %bb.0: 1577; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1578; GFX9-NEXT: global_load_dword v2, v[0:1], off glc 1579; GFX9-NEXT: s_waitcnt vmcnt(0) 1580; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1581; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 1582; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1583; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] 1584; GFX9-NEXT: s_mov_b64 s[2:3], 0xffff 1585; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 1586; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] 1587; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s4 1588; GFX9-NEXT: s_waitcnt vmcnt(0) 1589; GFX9-NEXT: v_bfi_b32 v1, v3, s2, v1 1590; GFX9-NEXT: v_bfi_b32 v0, v2, s2, v0 1591; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 1592; GFX9-NEXT: s_endpgm 1593; 1594; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr: 1595; VI: ; %bb.0: 1596; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1597; VI-NEXT: flat_load_dword v4, v[0:1] glc 1598; VI-NEXT: s_waitcnt vmcnt(0) 1599; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1600; VI-NEXT: s_load_dword s4, s[4:5], 0x10 1601; VI-NEXT: s_waitcnt lgkmcnt(0) 1602; VI-NEXT: v_mov_b32_e32 v1, s3 1603; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1604; 
VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1605; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1606; VI-NEXT: s_mov_b64 s[2:3], 0xffff 1607; VI-NEXT: v_mov_b32_e32 v3, s1 1608; VI-NEXT: s_lshl_b32 s1, s4, 16 1609; VI-NEXT: s_and_b32 s4, s4, s2 1610; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 1611; VI-NEXT: s_or_b32 s0, s4, s1 1612; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1613; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 1614; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[2:3] 1615; VI-NEXT: s_waitcnt vmcnt(0) 1616; VI-NEXT: v_bfi_b32 v1, v5, s0, v1 1617; VI-NEXT: v_bfi_b32 v0, v4, s0, v0 1618; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1619; VI-NEXT: s_endpgm 1620; 1621; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr: 1622; CI: ; %bb.0: 1623; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1624; CI-NEXT: flat_load_dword v4, v[0:1] glc 1625; CI-NEXT: s_waitcnt vmcnt(0) 1626; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1627; CI-NEXT: s_load_dword s4, s[4:5], 0x4 1628; CI-NEXT: s_waitcnt lgkmcnt(0) 1629; CI-NEXT: v_mov_b32_e32 v1, s3 1630; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1631; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1632; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1633; CI-NEXT: s_mov_b64 s[2:3], 0xffff 1634; CI-NEXT: v_mov_b32_e32 v3, s1 1635; CI-NEXT: s_lshl_b32 s1, s4, 16 1636; CI-NEXT: s_and_b32 s4, s4, s2 1637; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 1638; CI-NEXT: s_or_b32 s0, s4, s1 1639; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1640; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 1641; CI-NEXT: v_lshl_b64 v[4:5], s[2:3], v4 1642; CI-NEXT: s_waitcnt vmcnt(0) 1643; CI-NEXT: v_bfi_b32 v1, v5, s0, v1 1644; CI-NEXT: v_bfi_b32 v0, v4, s0, v0 1645; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1646; CI-NEXT: s_endpgm 1647 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1648 %tid.ext = sext i32 %tid to i64 1649 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext 1650 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext 1651 %idx.val 
= load volatile i32, i32 addrspace(1)* undef
  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  %val.trunc = trunc i32 %val to i16
  %val.cvt = bitcast i16 %val.trunc to i16
  %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 %idx.val
  store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep
  ret void
}

; Dynamic-index insert into <4 x half> where both the inserted value and the
; index are kernel arguments. Each work-item loads in[workitem.id.x], replaces
; lane %idxval with the low 16 bits of %val reinterpreted as half, and stores
; the result to out[workitem.id.x].
define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val, i32 %idxval) #0 {
; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT: s_mov_b64 s[2:3], 0xffff
; GFX9-NEXT: s_lshl_b32 s4, s7, 4
; GFX9-NEXT: s_pack_ll_b32_b16 s5, s6, s6
; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: v_mov_b32_e32 v4, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v1, s3, v3, v1
; GFX9-NEXT: v_bfi_b32 v0, s2, v4, v0
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_mov_b64 s[2:3], 0xffff
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_lshl_b32 s1, s5, 4
; VI-NEXT: s_lshl_b32 s5, s4, 16
; VI-NEXT: s_and_b32 s4, s4, s2
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], s1
; VI-NEXT: s_or_b32 s2, s4, s5
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_mov_b32_e32 v5, s2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_bfi_b32 v1, s1, v4, v1
; VI-NEXT: v_bfi_b32 v0, s0, v5, v0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: s_mov_b64 s[2:3], 0xffff
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: s_and_b32 s6, s4, s2
; CI-NEXT: s_lshl_b32 s1, s5, 4
; CI-NEXT: s_lshl_b32 s4, s4, 16
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: s_lshl_b64 s[0:1], s[2:3], s1
; CI-NEXT: s_or_b32 s2, s6, s4
; CI-NEXT: v_mov_b32_e32 v4, s2
; CI-NEXT: v_mov_b32_e32 v5, s2
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_bfi_b32 v1, s1, v4, v1
; CI-NEXT: v_bfi_b32 v0, s0, v5, v0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
  %lane = call i32 @llvm.amdgcn.workitem.id.x() #1
  %lane.i64 = sext i32 %lane to i64
  %src.ptr = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %lane.i64
  %dst.ptr = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %lane.i64
  %orig = load <4 x half>, <4 x half> addrspace(1)* %src.ptr
  %elt.bits = trunc i32 %val to i16
  %elt = bitcast i16 %elt.bits to half
  %updated = insertelement <4 x half> %orig, half %elt, i32 %idxval
  store <4 x half> %updated, <4 x half> addrspace(1)* %dst.ptr
  ret void
}

; Constant-index insert into <8 x half>: lane 3 of in[workitem.id.x] is
; replaced with the low 16 bits of %val reinterpreted as half, and the vector
; is written to out[workitem.id.x].
define amdgpu_kernel void @v_insertelement_v8f16_3(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in, i32 %val) {
; GFX9-LABEL: v_insertelement_v8f16_3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v1, s6, 16, v1
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v8f16_3:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_lshl_b32 s1, s4, 16
; VI-NEXT: s_mov_b32 s2, 0xffff
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT: v_mov_b32_e32 v6, s1
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_bfi_b32 v3, s2, v3, v3
; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v8f16_3:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4
; CI-NEXT: s_lshl_b32 s0, s4, 16
; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; CI-NEXT: v_or_b32_e32 v1, s0, v1
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: s_endpgm
  %lane = call i32 @llvm.amdgcn.workitem.id.x() #1
  %lane.i64 = sext i32 %lane to i64
  %src.ptr = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %in, i64 %lane.i64
  %dst.ptr = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %out, i64 %lane.i64
  %orig = load <8 x half>, <8 x half> addrspace(1)* %src.ptr
  %elt.bits = trunc i32 %val to i16
  %elt = bitcast i16 %elt.bits to half
  %updated = insertelement <8 x half> %orig, half %elt, i32 3
  store <8 x half> %updated, <8 x half> addrspace(1)* %dst.ptr
  ret void
}

; Constant-index insert into <8 x i16>: lane 6 of in[workitem.id.x] is
; replaced with %val truncated to i16, and the vector is written to
; out[workitem.id.x].
define amdgpu_kernel void @v_insertelement_v8i16_6(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in, i32 %val) {
; GFX9-LABEL: v_insertelement_v8i16_6:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v3, v5, s6, v3
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v8i16_6:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: s_mov_b32 s2, 0xffff
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v6, s4
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_bfi_b32 v3, s2, v6, v3
; VI-NEXT: v_bfi_b32 v1, s2, v1, v1
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v8i16_6:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: v_mov_b32_e32 v6, s4
; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_bfi_b32 v3, s0, v6, v3
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: s_endpgm
  %lane = call i32 @llvm.amdgcn.workitem.id.x() #1
  %lane.i64 = sext i32 %lane to i64
  %src.ptr = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %in, i64 %lane.i64
  %dst.ptr = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %out, i64 %lane.i64
  %orig = load <8 x i16>, <8 x i16> addrspace(1)* %src.ptr
  %elt.bits = trunc i32 %val to i16
  ; Same-type bitcast is a no-op; presumably kept so the IR shape mirrors the
  ; half-typed variants of this test.
  %elt = bitcast i16 %elt.bits to i16
  %updated = insertelement <8 x i16> %orig, i16 %elt, i32 6
  store <8 x i16> %updated, <8 x i16> addrspace(1)* %dst.ptr
  ret void
}

; Dynamic-index insert into <8 x half>: lane %n of in[workitem.id.x] is
; replaced with the low 16 bits of %val reinterpreted as half, and the vector
; is written to out[workitem.id.x]. The expected output for every target is a
; per-lane compare/select sequence over indices 0 through 7.
define amdgpu_kernel void @v_insertelement_v8f16_dynamic(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in, i32 %val, i32 %n) {
; GFX9-LABEL: v_insertelement_v8f16_dynamic:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
; GFX9-NEXT: s_cmp_eq_u32 s7, 7
; GFX9-NEXT: v_mov_b32_e32 v6, s6
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s7, 6
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s7, 5
; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v2
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s7, 4
; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s7, 3
; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v1
; GFX9-NEXT: v_and_b32_e32 v3, v5, v3
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s7, 2
; GFX9-NEXT: v_lshl_or_b32 v3, v7, 16, v3
; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s7, 1
; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v0
; GFX9-NEXT: v_and_b32_e32 v2, v5, v2
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s7, 0
; GFX9-NEXT: v_lshl_or_b32 v2, v8, 16, v2
; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v6, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
; GFX9-NEXT: v_and_b32_e32 v1, v5, v1
; GFX9-NEXT: v_and_b32_e32 v0, v5, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v7, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v8, 16, v0
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v8f16_dynamic:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT: s_cmp_eq_u32 s5, 6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: v_mov_b32_e32 v6, s4
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_cmp_eq_u32 s5, 7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_cmp_eq_u32 s5, 4
; VI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_cmp_eq_u32 s5, 5
; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v2
; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_cmp_eq_u32 s5, 2
; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_cmp_eq_u32 s5, 3
; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v1
; VI-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v8
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_cmp_eq_u32 s5, 0
; VI-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_cmp_eq_u32 s5, 1
; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc
; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v8f16_dynamic:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4
; CI-NEXT: v_cvt_f32_f16_e32 v6, s4
; CI-NEXT: s_cmp_eq_u32 s5, 7
; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; CI-NEXT: s_cselect_b64 vcc, -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 6
; CI-NEXT: s_cselect_b64 s[0:1], -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 5
; CI-NEXT: s_cselect_b64 s[2:3], -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 4
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v1
; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v0
; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
; CI-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
; CI-NEXT: s_cselect_b64 s[0:1], -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 3
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
; CI-NEXT: s_cselect_b64 vcc, -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 2
; CI-NEXT: v_cndmask_b32_e32 v9, v9, v6, vcc
; CI-NEXT: s_cselect_b64 vcc, -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 1
; CI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; CI-NEXT: s_cselect_b64 vcc, -1, 0
; CI-NEXT: s_cmp_eq_u32 s5, 0
; CI-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[2:3]
; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
; CI-NEXT: v_cndmask_b32_e32 v10, v10, v6, vcc
; CI-NEXT: s_cselect_b64 vcc, -1, 0
; CI-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
; CI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v8
; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9
; CI-NEXT: v_or_b32_e32 v3, v3, v6
; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v10
; CI-NEXT: v_or_b32_e32 v2, v2, v7
; CI-NEXT: v_or_b32_e32 v1, v1, v8
; CI-NEXT: v_or_b32_e32 v0, v0, v6
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: s_endpgm
  %lane = call i32 @llvm.amdgcn.workitem.id.x() #1
  %lane.i64 = sext i32 %lane to i64
  %src.ptr = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %in, i64 %lane.i64
  %dst.ptr = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %out, i64 %lane.i64
  %orig = load <8 x half>, <8 x half> addrspace(1)* %src.ptr
  %elt.bits = trunc i32 %val to i16
  %elt = bitcast i16 %elt.bits to half
  %updated = insertelement <8 x half> %orig, half %elt, i32 %n
  store <8 x half> %updated, <8 x half> addrspace(1)* %dst.ptr
  ret void
}

; Work-item id intrinsic used by the kernels above to index their buffers.
declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }