; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX9
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX10

; Codegen tests for 'sub <2 x i16>': VGPR/VGPR, SGPR (uniform) and
; constant/inline-immediate operands, plus zext/sext of the packed result.
; GFX9/GFX10 have a native v_pk_sub_i16; VI (tonga) must lower to per-half
; 16-bit ops (v_sub_u16 + SDWA) or scalar code.

; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    v_pk_sub_i16 v0, v1, v2
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_dword v1, v[2:3] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    v_sub_u16_e32 v2, v0, v1
; VI-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v2
; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; Uniform inputs from the constant address space: GFX9/GFX10 still use
; v_pk_sub_i16; VI scalarizes into s_sub_i32 on the split halves.
define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0, <2 x i16> addrspace(4)* %in1) #1 {
; GFX9-LABEL: s_test_sub_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s10, s[8:9], 0x0
; GFX9-NEXT:    s_load_dword s11, s[6:7], 0x0
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s10
; GFX9-NEXT:    v_pk_sub_i16 v0, s11, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_test_sub_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s6, s[6:7], 0x0
; VI-NEXT:    s_load_dword s7, s[0:1], 0x0
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s4, s6, 16
; VI-NEXT:    s_lshr_b32 s5, s7, 16
; VI-NEXT:    s_sub_i32 s6, s6, s7
; VI-NEXT:    s_sub_i32 s4, s4, s5
; VI-NEXT:    s_and_b32 s5, s6, 0xffff
; VI-NEXT:    s_lshl_b32 s4, s4, 16
; VI-NEXT:    s_or_b32 s4, s5, s4
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: s_test_sub_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_pk_sub_i16 v0, s0, s1
; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
  %b = load <2 x i16>, <2 x i16> addrspace(4)* %in1
  %add = sub <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; x - x folds to a store of zero on all targets.
define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0) #1 {
; GCN-LABEL: s_test_sub_self_v2i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
;
; GFX10-LABEL: s_test_sub_self_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
  %add = sub <2 x i16> %a, %a
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; FIXME: VI should not scalarize arg access.
define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
; GFX9-LABEL: s_test_sub_v2i16_kernarg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x30
; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_pk_sub_i16 v0, s3, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_test_sub_v2i16_kernarg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
; VI-NEXT:    s_load_dword s5, s[0:1], 0x30
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s6, s4, 16
; VI-NEXT:    s_lshr_b32 s7, s5, 16
; VI-NEXT:    s_sub_i32 s6, s6, s7
; VI-NEXT:    s_sub_i32 s4, s4, s5
; VI-NEXT:    s_lshl_b32 s5, s6, 16
; VI-NEXT:    s_and_b32 s4, s4, 0xffff
; VI-NEXT:    s_or_b32 s4, s4, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: s_test_sub_v2i16_kernarg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX10-NEXT:    s_load_dword s3, s[0:1], 0x30
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_pk_sub_i16 v0, s2, s3
; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
  %add = sub <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; Subtraction of a non-inline constant <123, 456>; VI rewrites it as an add
; of the negated halves (0xff85 = -123, 0xfffffe38 = -456 as i16).
define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_constant:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s4, 0x1c8007b
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_constant:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0xfffffe38
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_add_u16_e32 v2, 0xff85, v0
; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_constant:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0x1c8007b
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 123, i16 456>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_neg_constant:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s4, 0xfc21fcb3
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_neg_constant:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0x3df
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_add_u16_e32 v2, 0x34d, v0
; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_neg_constant:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0xfc21fcb3
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 -845, i16 -991>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; -1 is an inline immediate for v_pk_sub_i16; VI instead adds +1 per half.
define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_neg1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0]
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 1
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_add_u16_e32 v2, 1, v0
; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_inline_neg1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0]
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 -1, i16 -1>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; Only the low half has a nonzero operand (<32, 0>); VI preserves the high
; half with a mask and subtracts only in the low 16 bits.
define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, 32
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT:    v_subrev_u16_e32 v0, 32, v0
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 32
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 32, i16 0>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; The high element gives fp
define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s4, 1.0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_fp_split:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0xffffc080
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_inline_fp_split:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    v_pk_sub_i16 v0, v0, 0x3f80 op_sel:[0,1] op_sel_hi:[1,0]
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 0, i16 16256>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    v_pk_sub_i16 v0, v1, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v1, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_dword v2, v[2:3] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    v_sub_u16_e32 v0, v1, v2
; VI-NEXT:    v_sub_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v2
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = zext <2 x i16> %add to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dword v3, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    v_pk_sub_i16 v2, v2, v3
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT:    v_mov_b32_e32 v3, v1
; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v4, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_dword v2, v[2:3] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    v_mov_b32_e32 v3, v1
; VI-NEXT:    v_sub_u16_e32 v0, v4, v2
; VI-NEXT:    v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    v_pk_sub_i16 v2, v1, v2
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v2
; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; GFX10-NEXT:    v_mov_b32_e32 v3, v1
; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = zext <2 x i16> %add to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    v_pk_sub_i16 v0, v1, v2
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_load_dword v1, v[2:3] glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_sub_u16_e32 v0, v0, v1
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; VI-NEXT:    v_bfe_i32 v1, v2, 0, 16
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v2
; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
; GFX10-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = sext <2 x i16> %add to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; GFX9-NEXT:    v_bfe_i32 v0, v1, 0, 16
; GFX9-NEXT:    v_bfe_i32 v2, v2, 0, 16
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    flat_load_dword v1, v[2:3]
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_sub_u16_e32 v0, v0, v1
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; VI-NEXT:    v_bfe_i32 v2, v2, 0, 16
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX10-NEXT:    global_load_dword v2, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_sub_i16 v0, v1, v2
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX10-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX10-NEXT:    v_bfe_i32 v2, v1, 0, 16
; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = sext <2 x i16> %add to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }