; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX9
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX10
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX11

; Subtracts two <2 x i16> values that are loaded (volatile) from %in0 and %in1
; at an offset selected by the workitem id. Note that %gep.out is computed but
; the result is stored through %out directly.
; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    v_pk_sub_i16 v0, v1, v2
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_dword v1, v[2:3] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    v_sub_u16_e32 v2, v0, v1
; VI-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v2
; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    v_pk_sub_i16 v0, v1, v0
; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; Same subtract, but both operands are loaded from addrspace(4) pointers with
; no workitem-id indexing.
define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0, <2 x i16> addrspace(4)* %in1) #1 {
; GFX9-LABEL: s_test_sub_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s10, s[8:9], 0x0
; GFX9-NEXT:    s_load_dword s11, s[6:7], 0x0
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s10
; GFX9-NEXT:    v_pk_sub_i16 v0, s11, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_test_sub_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s6, s[6:7], 0x0
; VI-NEXT:    s_load_dword s7, s[0:1], 0x0
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s4, s6, 16
; VI-NEXT:    s_lshr_b32 s5, s7, 16
; VI-NEXT:    s_sub_i32 s6, s6, s7
; VI-NEXT:    s_sub_i32 s4, s4, s5
; VI-NEXT:    s_and_b32 s5, s6, 0xffff
; VI-NEXT:    s_lshl_b32 s4, s4, 16
; VI-NEXT:    s_or_b32 s4, s5, s4
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: s_test_sub_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_pk_sub_i16 v0, s0, s1
; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: s_test_sub_v2i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b32 s2, s[6:7], 0x0
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_pk_sub_i16 v0, s2, s0
; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
  %b = load <2 x i16>, <2 x i16> addrspace(4)* %in1
  %add = sub <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; Subtracts a loaded value from itself. The checks below expect only a store
; of constant zero, with no load or subtract left in the output.
define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0) #1 {
; GCN-LABEL: s_test_sub_self_v2i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; GFX10-LABEL: s_test_sub_self_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: s_test_sub_self_v2i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
  %add = sub <2 x i16> %a, %a
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; Both <2 x i16> operands are passed directly as kernel arguments.
; FIXME: VI should not scalarize arg access.
define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
; GFX9-LABEL: s_test_sub_v2i16_kernarg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s3
; GFX9-NEXT:    v_pk_sub_i16 v0, s2, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_test_sub_v2i16_kernarg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s6, s4, 16
; VI-NEXT:    s_lshr_b32 s7, s5, 16
; VI-NEXT:    s_sub_i32 s6, s6, s7
; VI-NEXT:    s_sub_i32 s4, s4, s5
; VI-NEXT:    s_lshl_b32 s5, s6, 16
; VI-NEXT:    s_and_b32 s4, s4, 0xffff
; VI-NEXT:    s_or_b32 s4, s4, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: s_test_sub_v2i16_kernarg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_pk_sub_i16 v0, s2, s3
; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: s_test_sub_v2i16_kernarg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_pk_sub_i16 v0, s2, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %add = sub <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; Subtracts the literal <123, 456> (0x7b and 0x1c8, packed as 0x1c8007b) from
; a value loaded per workitem. The tonga checks add the negated halves
; (0xff85 and 0xfe38 in the low 16 bits) instead of subtracting.
define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_constant:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s4, 0x1c8007b
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_constant:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0xfffffe38
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_add_u16_e32 v2, 0xff85, v0
; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_constant:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0x1c8007b
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_constant:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    v_pk_sub_i16 v0, v0, 0x1c8007b
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 123, i16 456>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; Subtracts the literal <-845, -991> (packed as 0xfc21fcb3) from a value
; loaded per workitem. The tonga checks add the positive values 0x34d (845)
; and 0x3df (991) instead of subtracting.
define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_neg_constant:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s4, 0xfc21fcb3
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_neg_constant:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0x3df
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_add_u16_e32 v2, 0x34d, v0
; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_neg_constant:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0xfc21fcb3
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_neg_constant:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    v_pk_sub_i16 v0, v0, 0xfc21fcb3
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 -845, i16 -991>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; Subtracts <-1, -1>. The v_pk_sub_i16 checks take -1 as an immediate operand
; with op_sel_hi:[1,0]; the tonga checks add 1 to each half instead.
define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_neg1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0]
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 1
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_add_u16_e32 v2, 1, v0
; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_inline_neg1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0]
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_inline_neg1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0]
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 -1, i16 -1>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; Subtracts <32, 0>: only the low element has a non-zero constant. The tonga
; checks subtract 32 from the low half and keep the high half unchanged.
define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, 32
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT:    v_subrev_u16_e32 v0, 32, v0
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 32
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    v_pk_sub_i16 v0, v0, 32
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 32, i16 0>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; The high element, 16256 (0x3f80), is the upper 16 bits of the f32 bit
; pattern of 1.0 (0x3f800000). The packed constant <0, 0x3f80> therefore
; equals the fp immediate 1.0 (see the s_mov_b32 in the gfx900 checks).
define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s4, 1.0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_fp_split:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0xffffc080
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_inline_fp_split:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0x3f80 op_sel:[0,1] op_sel_hi:[1,0]
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_inline_fp_split:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    v_pk_sub_i16 v0, v0, 0x3f80 op_sel:[0,1] op_sel_hi:[1,0]
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 0, i16 16256>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; Subtracts two <2 x i16> values loaded per workitem, then zero-extends each
; lane to i32 before storing the <2 x i32> result through %out.
define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    v_pk_sub_i16 v0, v1, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v1, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_dword v2, v[2:3] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    v_sub_u16_e32 v0, v1, v2
; VI-NEXT:    v_sub_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v2
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    v_pk_sub_i16 v0, v1, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = zext <2 x i16> %add to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
}

; Same as the zext-to-v2i32 case, but each lane is zero-extended to i64 and a
; <2 x i64> result is stored through %out.
; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dword v3, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    v_pk_sub_i16 v2, v2, v3
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v2
; GFX9-NEXT:    v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_mov_b32_e32 v3, v1
; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v4, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_dword v2, v[2:3] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    v_mov_b32_e32 v3, v1
; VI-NEXT:    v_sub_u16_e32 v0, v4, v2
; VI-NEXT:    v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    v_pk_sub_i16 v2, v1, v2
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v2
; GFX10-NEXT:    v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT:    v_mov_b32_e32 v3, v1
; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    v_pk_sub_i16 v0, v1, v0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_lshl_or_b32 v2, 0, 16, v2
; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = zext <2 x i16> %add to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; Volatile-loads one <2 x i16> from %in0 and one from %in1, each indexed by the
; workitem id, subtracts them, sign-extends the difference to <2 x i32> and
; stores the result.
; Review note - %out.gep is computed but never used; the store writes through
; %out itself. Redirecting the store would change the generated code, so the
; assertions would have to be regenerated with utils/update_llc_test_checks.py.
define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    v_pk_sub_i16 v0, v1, v2
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_dword v1, v[2:3] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_sub_u16_e32 v0, v0, v1
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; VI-NEXT:    v_bfe_i32 v1, v2, 0, 16
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v2
; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
; GFX10-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    v_pk_sub_i16 v0, v1, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %id.x
  %in0.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %id.x
  %in1.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %id.x
  %lhs = load volatile <2 x i16>, <2 x i16> addrspace(1)* %in0.gep
  %rhs = load volatile <2 x i16>, <2 x i16> addrspace(1)* %in1.gep
  %diff = sub <2 x i16> %lhs, %rhs
  %wide = sext <2 x i16> %diff to <2 x i32>
  store <2 x i32> %wide, <2 x i32> addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; Same subtract-then-sign-extend shape, widened to <2 x i64>. The two loads
; here are plain (not volatile).
; Review note - %out.gep is computed but never used; the store writes through
; %out itself. Redirecting the store would change the generated code, so the
; assertions would have to be regenerated with utils/update_llc_test_checks.py.
define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; GFX9-NEXT:    v_bfe_i32 v0, v1, 0, 16
; GFX9-NEXT:    v_bfe_i32 v2, v2, 0, 16
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    flat_load_dword v1, v[2:3]
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_sub_u16_e32 v0, v0, v1
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; VI-NEXT:    v_bfe_i32 v2, v2, 0, 16
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX10-NEXT:    global_load_dword v2, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v2
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX10-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX10-NEXT:    v_bfe_i32 v2, v1, 0, 16
; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1]
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_sub_i16 v0, v1, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX11-NEXT:    v_bfe_i32 v2, v1, 0, 16
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %id.x
  %in0.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %id.x
  %in1.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %id.x
  %lhs = load <2 x i16>, <2 x i16> addrspace(1)* %in0.gep
  %rhs = load <2 x i16>, <2 x i16> addrspace(1)* %in1.gep
  %diff = sub <2 x i16> %lhs, %rhs
  %wide = sext <2 x i16> %diff to <2 x i64>
  store <2 x i64> %wide, <2 x i64> addrspace(1)* %out
  ret void
}

; Intrinsic that supplies the per-workitem index used by the kernels above.
declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }