; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9PLUS,GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9PLUS,GFX10 %s

; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_add_v2i16:
; GFX9PLUS: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

; FIXME: or should be unnecessary
; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_or_b32
define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = add <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}s_test_add_v2i16:
; GFX9PLUS: s_load_dword [[VAL0:s[0-9]+]]
; GFX9PLUS: s_load_dword [[VAL1:s[0-9]+]]
; GFX9: v_mov_b32_e32 [[VVAL1:v[0-9]+]]
; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[VAL1]], [[VVAL1]]
; GFX10: v_pk_add_u16 v{{[0-9]+}}, [[VAL0]], [[VAL1]]

; VI: s_add_i32
; VI: s_add_i32
define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0, <2 x i16> addrspace(4)* %in1) #1 {
  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
  %b = load <2 x i16>, <2 x i16> addrspace(4)* %in1
  %add = add <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}s_test_add_self_v2i16:
; GFX9PLUS: s_load_dword [[VAL:s[0-9]+]]
; GFX9PLUS: v_pk_add_u16 v{{[0-9]+}}, [[VAL]], [[VAL]]

; VI: s_add_i32
; VI: s_add_i32
define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0) #1 {
  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
  %add = add <2 x i16> %a, %a
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; FIXME: VI should not scalarize arg access.
; GCN-LABEL: {{^}}s_test_add_v2i16_kernarg:
; GFX9: v_pk_add_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX10: v_pk_add_u16 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}

; VI: s_add_i32
; VI: s_add_i32
; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; VI: s_and_b32
; VI: s_or_b32
define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
  %add = add <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; FIXME: Eliminate or with sdwa
; GCN-LABEL: {{^}}v_test_add_v2i16_constant:
; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
; GFX10: v_pk_add_u16 v{{[0-9]+}}, 0x1c8007b, v{{[0-9]+}}

; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0x1c8
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI: v_or_b32_e32
define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = add <2 x i16> %a, <i16 123, i16 456>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_add_v2i16_neg_constant:
; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0xfc21fcb3{{$}}
; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
; GFX10: v_pk_add_u16 v{{[0-9]+}}, 0xfc21fcb3, v{{[0-9]+}}

; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfcb3, v{{[0-9]+}}
; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0xfffffc21
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = add <2 x i16> %a, <i16 -845, i16 -991>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_test_add_v2i16_inline_neg1:
; GFX9PLUS: v_pk_sub_u16 v{{[0-9]+}}, v{{[0-9]+}}, 1 op_sel_hi:[1,0]{{$}}

; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], -1
; VI-DAG: flat_load_dword [[LOAD:v[0-9]+]]
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD]], v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD]]
; VI: v_or_b32_e32
define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = add <2 x i16> %a, <i16 -1, i16 -1>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}v_test_add_v2i16_inline_lo_zero_hi:
; GFX9PLUS: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, 32{{$}}

; VI: flat_load_dword
; VI-NOT: v_add_u16
; VI: v_and_b32_e32 v{{[0-9]+}}, 0xffff0000,
; VI: v_add_u16_e32 v{{[0-9]+}}, 32, v{{[0-9]+}}
; VI-NOT: v_add_u16
; VI: v_or_b32_e32
define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = add <2 x i16> %a, <i16 32, i16 0>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; The high element gives fp
; GCN-LABEL: {{^}}v_test_add_v2i16_inline_fp_split:
; GFX9: s_mov_b32 [[K:s[0-9]+]], 1.0
; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]{{$}}
; GFX10: v_pk_add_u16 v{{[0-9]+}}, 0x3f80, v{{[0-9]+}}

; VI-NOT: v_add_u16
; VI: v_mov_b32_e32 v[[K:[0-9]+]], 0x3f80
; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NOT: v_add_u16
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = add <2 x i16> %a, <i16 0, i16 16256>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_add_v2i16_zext_to_v2i32:
; GFX9PLUS: global_load_dword [[A:v[0-9]+]]
; GFX9PLUS: global_load_dword [[B:v[0-9]+]]

; GFX9PLUS: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
; GFX9PLUS-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
; GFX9PLUS-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
; GFX9PLUS: buffer_store_dwordx2 v[[[ELT0]]:[[ELT1]]]

; VI: flat_load_dword v[[A:[0-9]+]]
; VI: flat_load_dword v[[B:[0-9]+]]

; VI-NOT: and
; VI-NOT: shl
; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[A]], v[[B]]
; VI: v_add_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NOT: and
; VI-NOT: shl
; VI: buffer_store_dwordx2 v[[[ADD_LO]]:[[ADD_HI]]]
define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = add <2 x i16> %a, %b
  %ext = zext <2 x i16> %add to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_add_v2i16_zext_to_v2i64:
; GFX9PLUS: v_mov_b32_e32 [[MASK:v[0-9+]]], 0xffff
; GFX9PLUS: global_load_dword [[A:v[0-9]+]]
; GFX9PLUS: global_load_dword [[B:v[0-9]+]]

; GFX9PLUS: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
; GFX9PLUS-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
; GFX9PLUS-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9PLUS: buffer_store_dwordx4

; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; VI-DAG: flat_load_dword v[[A:[0-9]+]]
; VI-DAG: flat_load_dword v[[B:[0-9]+]]

; VI-DAG: v_add_u16_e32
; VI: v_add_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1

; VI: buffer_store_dwordx4
define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = add <2 x i16> %a, %b
  %ext = zext <2 x i16> %add to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_add_v2i16_sext_to_v2i32:
; GFX9PLUS: global_load_dword [[A:v[0-9]+]]
; GFX9PLUS: global_load_dword [[B:v[0-9]+]]

; GFX9PLUS: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
; GFX9PLUS-DAG: v_bfe_i32 v[[ELT0:[0-9]+]], [[ADD]], 0, 16
; GFX9PLUS-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
; GFX9PLUS: buffer_store_dwordx2 v[[[ELT0]]:[[ELT1]]]

; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_add_u16_e32

; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; VI: buffer_store_dwordx2
define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = add <2 x i16> %a, %b
  %ext = sext <2 x i16> %add to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_add_v2i16_sext_to_v2i64:
; GCN: {{flat|global}}_load_dword
; GCN: {{flat|global}}_load_dword

; GFX9PLUS: v_pk_add_u16
; GFX9PLUS: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}

; VI: v_add_u16_sdwa
; VI: v_add_u16_e32

; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = add <2 x i16> %a, %b
  %ext = sext <2 x i16> %add to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }