; Codegen tests for the llvm.amdgcn.raw.buffer.load.* intrinsics on verde,
; tonga, gfx1010 and gfx1100.
;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,PREGFX10
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,VI,PREGFX10
;RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,GFX10
;RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,GFX10

;CHECK-LABEL: {{^}}buffer_load:
;CHECK: buffer_load_{{dwordx4|b128}} v[0:3], off, s[0:3], 0{{$}}
;CHECK: buffer_load_{{dwordx4|b128}} v[4:7], off, s[0:3], 0 glc{{$}}
;CHECK: buffer_load_{{dwordx4|b128}} v[8:11], off, s[0:3], 0 slc{{$}}
;CHECK: s_waitcnt
; Three v4f32 loads whose last intrinsic operand is 0, 1 and 2; the checks
; above expect no modifier, glc and slc respectively.
define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
main_body:
  %plain = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0)
  %glc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 1)
  %slc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 2)
  %agg0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %plain, 0
  %agg1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %agg0, <4 x float> %glc, 1
  %agg2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %agg1, <4 x float> %slc, 2
  ret {<4 x float>, <4 x float>, <4 x float>} %agg2
}

;CHECK-LABEL: {{^}}buffer_load_dlc:
;PREGFX10: buffer_load_dwordx4 v[0:3], off, s[0:3], 0{{$}}
;PREGFX10: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc{{$}}
;PREGFX10: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc{{$}}
;GFX10: buffer_load_{{dwordx4|b128}} v[0:3], off, s[0:3], 0 dlc{{$}}
;GFX10: buffer_load_{{dwordx4|b128}} v[4:7], off, s[0:3], 0 glc dlc{{$}}
;GFX10: buffer_load_{{dwordx4|b128}} v[8:11], off, s[0:3], 0 slc dlc{{$}}
;CHECK: s_waitcnt
; Same shape as @buffer_load but with the last operand set to 4, 5 and 6.
; The checks expect an additional dlc modifier on the gfx10+ runs only.
define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_dlc(<4 x i32> inreg) {
main_body:
  %dlc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 4)
  %glc_dlc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 5)
  %slc_dlc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 6)
  %agg0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %dlc, 0
  %agg1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %agg0, <4 x float> %glc_dlc, 1
  %agg2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %agg1, <4 x float> %slc_dlc, 2
  ret {<4 x float>, <4 x float>, <4 x float>} %agg2
}

;CHECK-LABEL: {{^}}buffer_load_immoffs:
;CHECK: buffer_load_{{dwordx4|b128}} v[0:3], off, s[0:3], 0 offset:40
;CHECK: s_waitcnt
; Constant 40 in the second intrinsic operand; expected as offset:40.
define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
main_body:
  %loaded = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 40, i32 0, i32 0)
  ret <4 x float> %loaded
}

;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc
;CHECK: buffer_load_{{dwordx4|b128}} v[0:3], off, s[0:3], [[OFFSET]] offset:4
;CHECK: s_waitcnt
; Second operand 4 and third operand 8188 (0x1ffc); the checks expect the
; latter in an SGPR and the former as offset:4.
define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
main_body:
  %loaded = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 4, i32 8188, i32 0)
  ret <4 x float> %loaded
}

;CHECK-LABEL: {{^}}buffer_load_ofs:
;CHECK: buffer_load_{{dwordx4|b128}} v[0:3], v0, s[0:3], 0 offen
;CHECK: s_waitcnt
; The second intrinsic operand is the function's i32 argument; expected as an
; offen access through v0.
define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
main_body:
  %loaded = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0)
  ret <4 x float> %loaded
}

;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
;CHECK: buffer_load_{{dwordx4|b128}} v[0:3], v0, s[0:3], 0 offen offset:60
;CHECK: s_waitcnt
; Argument plus constant 60; the checks expect the constant folded into
; offset:60.
define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
main_body:
  %voffset = add i32 %1, 60
  %loaded = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %voffset, i32 0, i32 0)
  ret <4 x float> %loaded
}

;CHECK-LABEL: {{^}}buffer_load_x1:
;CHECK: buffer_load_{{dword|b32}} v0, v0, s[0:3], 0 offen
;CHECK: s_waitcnt
; Single-float load with a variable offset.
define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %ofs) {
main_body:
  %loaded = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0, i32 0)
  ret float %loaded
}

;CHECK-LABEL: {{^}}buffer_load_x2:
;CHECK: buffer_load_{{dwordx2|b64}} v[0:1], v0, s[0:3], 0 offen
;CHECK: s_waitcnt
; Two-float load with a variable offset.
define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %ofs) {
main_body:
  %loaded = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0, i32 0)
  ret <2 x float> %loaded
}

;CHECK-LABEL: {{^}}buffer_load_negative_offset:
;PREGFX10: v_add_{{[iu]}}32_e32 [[VOFS:v[0-9]+]], vcc, -16, v0
;GFX10: v_add_nc_{{[iu]}}32_e32 [[VOFS:v[0-9]+]], -16, v0
;CHECK: buffer_load_{{dwordx4|b128}} v[0:3], [[VOFS]], s[0:3], 0 offen
; Argument plus -16; the checks expect a separate vector add and no offset
; field on the load.
define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) {
main_body:
  %voffset = add i32 %ofs, -16
  %loaded = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %voffset, i32 0, i32 0)
  ret <4 x float> %loaded
}

; SI won't merge ds memory operations, because of the signed offset bug, so
; we only have check lines for VI.
; CHECK-LABEL: {{^}}buffer_load_mmo:
; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
; Two LDS stores of 0.0 (at %lds and at %lds + 4 floats) with a buffer load
; placed between them. The tonga-only checks expect the two stores to be
; combined into one ds_write2_b32.
define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, float addrspace(3)* %lds) {
entry:
  store float 0.0, float addrspace(3)* %lds
  %loaded = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
  %lds.plus4 = getelementptr float, float addrspace(3)* %lds, i32 4
  store float 0.0, float addrspace(3)* %lds.plus4
  ret float %loaded
}

;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged_and:
;CHECK-NEXT: %bb.
;GFX10-NEXT: s_clause
;CHECK-NEXT: buffer_load_{{dwordx4|b128}} v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
;CHECK-NEXT: buffer_load_{{dwordx2|b64}} v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
;CHECK: s_waitcnt
; Six single-float loads at %a + {4, 8, 12, 16, 28, 32}, offsets formed with
; add. The checks expect one four-dword load at offset:4 and one two-dword
; load at offset:28.
define amdgpu_ps void @buffer_load_x1_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a) {
main_body:
  %off4 = add i32 %a, 4
  %off8 = add i32 %a, 8
  %off12 = add i32 %a, 12
  %off16 = add i32 %a, 16
  %off28 = add i32 %a, 28
  %off32 = add i32 %a, 32
  %v4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %off4, i32 0, i32 0)
  %v8 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %off8, i32 0, i32 0)
  %v12 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %off12, i32 0, i32 0)
  %v16 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %off16, i32 0, i32 0)
  %v28 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %off28, i32 0, i32 0)
  %v32 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %off32, i32 0, i32 0)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %v4, float %v8, float %v12, float %v16, i1 true, i1 true)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %v28, float %v32, float undef, float undef, i1 true, i1 true)
  ret void
}

;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged_or:
;CHECK-NEXT: %bb.
;CHECK-NEXT: v_lshlrev_b32_e32 v{{[0-9]}}, 6, v0
;GFX10-NEXT: s_clause
;CHECK-NEXT: buffer_load_{{dwordx4|b128}} v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4
;CHECK-NEXT: buffer_load_{{dwordx2|b64}} v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:28
;CHECK: s_waitcnt
; Six single-float loads whose offsets are (%inp << 6) or-ed with
; {4, 8, 12, 16, 28, 32}. The checks expect the same two merged loads as when
; the offsets are formed with add.
define amdgpu_ps void @buffer_load_x1_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp) {
main_body:
  %base = shl i32 %inp, 6
  %off4 = or i32 %base, 4
  %off8 = or i32 %base, 8
  %off12 = or i32 %base, 12
  %off16 = or i32 %base, 16
  %off28 = or i32 %base, 28
  %off32 = or i32 %base, 32
  %v4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %off4, i32 0, i32 0)
  %v8 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %off8, i32 0, i32 0)
  %v12 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %off12, i32 0, i32 0)
  %v16 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %off16, i32 0, i32 0)
  %v28 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %off28, i32 0, i32 0)
  %v32 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %off32, i32 0, i32 0)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %v4, float %v8, float %v12, float %v16, i1 true, i1 true)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %v28, float %v32, float undef, float undef, i1 true, i1 true)
  ret void
}

;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged_glc_slc:
;CHECK-NEXT: %bb.
;GFX10-NEXT: s_clause
;CHECK-NEXT: buffer_load_{{dwordx2|b64}} v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}}
;CHECK-NEXT: buffer_load_{{dwordx2|b64}} v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}}
;CHECK-NEXT: buffer_load_{{dwordx2|b64}} v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}}
;CHECK: s_waitcnt
; Six single-float loads in three pairs whose last operand is 0, 1 and 3.
; The checks expect one two-dword load per pair, carrying no modifier, glc,
; and glc slc respectively.
define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a) {
main_body:
  %off4 = add i32 %a, 4
  %off8 = add i32 %a, 8
  %off12 = add i32 %a, 12
  %off16 = add i32 %a, 16
  %off28 = add i32 %a, 28
  %off32 = add i32 %a, 32
  %v4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %off4, i32 0, i32 0)
  %v8 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %off8, i32 0, i32 0)
  %v12 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %off12, i32 0, i32 1)
  %v16 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %off16, i32 0, i32 1)
  %v28 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %off28, i32 0, i32 3)
  %v32 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %off32, i32 0, i32 3)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %v4, float %v8, float %v12, float %v16, i1 true, i1 true)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %v28, float %v32, float undef, float undef, i1 true, i1 true)
  ret void
}

;CHECK-LABEL: {{^}}buffer_load_x2_offen_merged_and:
;CHECK-NEXT: %bb.
;CHECK-NEXT: buffer_load_{{dwordx4|b128}} v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
;CHECK: s_waitcnt
; Two two-float loads at %a + 4 and %a + 12, offsets formed with add. The
; checks expect a single four-dword load at offset:4.
define amdgpu_ps void @buffer_load_x2_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a) {
main_body:
  %off4 = add i32 %a, 4
  %off12 = add i32 %a, 12
  %lo = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %off4, i32 0, i32 0)
  %hi = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %off12, i32 0, i32 0)
  %e0 = extractelement <2 x float> %lo, i32 0
  %e1 = extractelement <2 x float> %lo, i32 1
  %e2 = extractelement <2 x float> %hi, i32 0
  %e3 = extractelement <2 x float> %hi, i32 1
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %e0, float %e1, float %e2, float %e3, i1 true, i1 true)
  ret void
}

;CHECK-LABEL: {{^}}buffer_load_x2_offen_merged_or:
;CHECK-NEXT: %bb.
;CHECK-NEXT: v_lshlrev_b32_e32 v{{[0-9]}}, 4, v0
;CHECK-NEXT: buffer_load_{{dwordx4|b128}} v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4
;CHECK: s_waitcnt
; Two two-float loads whose offsets are (%inp << 4) or-ed with 4 and 12.
; The offsets are built with `or` so that this test covers the or-as-add
; case its name refers to, mirroring @buffer_load_x1_offen_merged_or. The
; shift by 4 leaves the low four bits zero, so or-ing in 4 or 12 cannot
; overlap any set bit of the base. The checks expect a single four-dword load
; at offset:4.
define amdgpu_ps void @buffer_load_x2_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp) {
main_body:
  %base = shl i32 %inp, 4
  %off4 = or i32 %base, 4
  %off12 = or i32 %base, 12
  %lo = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %off4, i32 0, i32 0)
  %hi = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %off12, i32 0, i32 0)
  %e0 = extractelement <2 x float> %lo, i32 0
  %e1 = extractelement <2 x float> %lo, i32 1
  %e2 = extractelement <2 x float> %hi, i32 0
  %e3 = extractelement <2 x float> %hi, i32 1
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %e0, float %e1, float %e2, float %e3, i1 true, i1 true)
  ret void
}

;CHECK-LABEL: {{^}}buffer_load_x1_offset_merged:
;CHECK-NEXT: %bb.
;GFX10-NEXT: s_clause
;CHECK-NEXT: buffer_load_{{dwordx4|b128}} v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
;CHECK-NEXT: buffer_load_{{dwordx2|b64}} v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28
;CHECK: s_waitcnt
; Six single-float loads at constant offsets {4, 8, 12, 16, 28, 32}. The
; checks expect one four-dword load at offset:4 and one two-dword load at
; offset:28.
define amdgpu_ps void @buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) {
main_body:
  %v4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0)
  %v8 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 0)
  %v12 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 12, i32 0, i32 0)
  %v16 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 16, i32 0, i32 0)
  %v28 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 28, i32 0, i32 0)
  %v32 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 32, i32 0, i32 0)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %v4, float %v8, float %v12, float %v16, i1 true, i1 true)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %v28, float %v32, float undef, float undef, i1 true, i1 true)
  ret void
}

;CHECK-LABEL: {{^}}buffer_load_x2_offset_merged:
;CHECK-NEXT: %bb.
;CHECK-NEXT: buffer_load_{{dwordx4|b128}} v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
;CHECK: s_waitcnt
; Two two-float loads at constant offsets 4 and 12. The checks expect a
; single four-dword load at offset:4.
define amdgpu_ps void @buffer_load_x2_offset_merged(<4 x i32> inreg %rsrc) {
main_body:
  %lo = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0)
  %hi = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 12, i32 0, i32 0)
  %e0 = extractelement <2 x float> %lo, i32 0
  %e1 = extractelement <2 x float> %lo, i32 1
  %e2 = extractelement <2 x float> %hi, i32 0
  %e3 = extractelement <2 x float> %hi, i32 1
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %e0, float %e1, float %e2, float %e3, i1 true, i1 true)
  ret void
}

;CHECK-LABEL: {{^}}buffer_load_int:
;CHECK: buffer_load_{{dwordx4|b128}} v[0:3], off, s[0:3], 0
;CHECK: buffer_load_{{dwordx2|b64}} v[4:5], off, s[0:3], 0 glc
;CHECK: buffer_load_{{dword|b32}} v6, off, s[0:3], 0 slc
;CHECK: s_waitcnt
; Integer-typed variants (v4i32, v2i32, i32) with the last operand set to 0,
; 1 and 2. Results are bitcast to float types for the return value.
define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(<4 x i32> inreg) {
main_body:
  %i4 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0)
  %i2 = call <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32> %0, i32 0, i32 0, i32 1)
  %i1 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %0, i32 0, i32 0, i32 2)
  %f4 = bitcast <4 x i32> %i4 to <4 x float>
  %f2 = bitcast <2 x i32> %i2 to <2 x float>
  %f1 = bitcast i32 %i1 to float
  %agg0 = insertvalue {<4 x float>, <2 x float>, float} undef, <4 x float> %f4, 0
  %agg1 = insertvalue {<4 x float>, <2 x float>, float} %agg0, <2 x float> %f2, 1
  %agg2 = insertvalue {<4 x float>, <2 x float>, float} %agg1, float %f1, 2
  ret {<4 x float>, <2 x float>, float} %agg2
}

;CHECK-LABEL: {{^}}raw_buffer_load_ubyte:
;CHECK-NEXT: %bb.
;CHECK-NEXT: buffer_load_{{ubyte|u8}} v{{[0-9]}}, off, s[0:3], 0
;CHECK: s_waitcnt vmcnt(0)
;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
;CHECK-NEXT: ; return to shader part epilog
; i8 load, zero-extended to i32 and converted to float as unsigned.
define amdgpu_ps float @raw_buffer_load_ubyte(<4 x i32> inreg %rsrc) {
main_body:
  %byte = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
  %wide = zext i8 %byte to i32
  %result = uitofp i32 %wide to float
  ret float %result
}

;CHECK-LABEL: {{^}}raw_buffer_load_i16:
;CHECK-NEXT: %bb.
;CHECK-NEXT: buffer_load_{{ushort|u16}} v{{[0-9]}}, off, s[0:3], 0
;CHECK: s_waitcnt vmcnt(0)
;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
;CHECK-NEXT: ; return to shader part epilog
; i16 load, zero-extended to i32 and converted to float as unsigned.
define amdgpu_ps float @raw_buffer_load_i16(<4 x i32> inreg %rsrc) {
main_body:
  %half_word = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
  %wide = zext i16 %half_word to i32
  %result = uitofp i32 %wide to float
  ret float %result
}

;CHECK-LABEL: {{^}}raw_buffer_load_sbyte:
;CHECK-NEXT: %bb.
;CHECK-NEXT: buffer_load_{{sbyte|i8}} v{{[0-9]}}, off, s[0:3], 0
;CHECK: s_waitcnt vmcnt(0)
;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
;CHECK-NEXT: ; return to shader part epilog
; i8 load, sign-extended to i32 and converted to float as signed.
define amdgpu_ps float @raw_buffer_load_sbyte(<4 x i32> inreg %rsrc) {
main_body:
  %byte = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
  %wide = sext i8 %byte to i32
  %result = sitofp i32 %wide to float
  ret float %result
}

;CHECK-LABEL: {{^}}raw_buffer_load_sshort:
;CHECK-NEXT: %bb.
;CHECK-NEXT: buffer_load_{{sshort|i16}} v{{[0-9]}}, off, s[0:3], 0
;CHECK: s_waitcnt vmcnt(0)
;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
;CHECK-NEXT: ; return to shader part epilog
; i16 load, sign-extended to i32 and converted to float as signed.
define amdgpu_ps float @raw_buffer_load_sshort(<4 x i32> inreg %rsrc) {
main_body:
  %half_word = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
  %wide = sext i16 %half_word to i32
  %result = sitofp i32 %wide to float
  ret float %result
}

;CHECK-LABEL: {{^}}raw_buffer_load_f16:
;CHECK-NEXT: %bb.
;CHECK-NEXT: buffer_load_{{ushort|u16}} [[VAL:v[0-9]+]], off, s[0:3], 0
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_{{write|store}}_b16 v0, [[VAL]]
; half load whose result is stored to the addrspace(3) pointer argument.
define amdgpu_ps void @raw_buffer_load_f16(<4 x i32> inreg %rsrc, half addrspace(3)* %ptr) {
main_body:
  %loaded = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
  store half %loaded, half addrspace(3)* %ptr
  ret void
}

;CHECK-LABEL: {{^}}raw_buffer_load_v2f16:
;CHECK-NEXT: %bb.
;CHECK-NEXT: buffer_load_{{dword|b32}} [[VAL:v[0-9]+]], off, s[0:3], 0
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_{{write|store}}_b32 v0, [[VAL]]
; <2 x half> load whose result is stored to the addrspace(3) pointer argument.
define amdgpu_ps void @raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, <2 x half> addrspace(3)* %ptr) {
main_body:
  %loaded = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
  store <2 x half> %loaded, <2 x half> addrspace(3)* %ptr
  ret void
}

;CHECK-LABEL: {{^}}raw_buffer_load_v4f16:
;CHECK-NEXT: %bb.
;CHECK-NEXT: buffer_load_{{dwordx2|b64}} [[VAL:v\[[0-9]+:[0-9]+\]]], off, s[0:3], 0
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_{{write|store}}_b64 v0, [[VAL]]
; <4 x half> load whose result is stored to the addrspace(3) pointer argument.
define amdgpu_ps void @raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, <4 x half> addrspace(3)* %ptr) {
main_body:
  %loaded = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
  store <4 x half> %loaded, <4 x half> addrspace(3)* %ptr
  ret void
}

;CHECK-LABEL: {{^}}raw_buffer_load_v2i16:
;CHECK-NEXT: %bb.
;CHECK-NEXT: buffer_load_{{dword|b32}} [[VAL:v[0-9]+]], off, s[0:3], 0
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_{{write|store}}_b32 v0, [[VAL]]
; <2 x i16> load whose result is stored to the addrspace(3) pointer argument.
define amdgpu_ps void @raw_buffer_load_v2i16(<4 x i32> inreg %rsrc, <2 x i16> addrspace(3)* %ptr) {
main_body:
  %loaded = call <2 x i16> @llvm.amdgcn.raw.buffer.load.v2i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
  store <2 x i16> %loaded, <2 x i16> addrspace(3)* %ptr
  ret void
}

;CHECK-LABEL: {{^}}raw_buffer_load_v4i16:
;CHECK-NEXT: %bb.
;CHECK-NEXT: buffer_load_{{dwordx2|b64}} [[VAL:v\[[0-9]+:[0-9]+\]]], off, s[0:3], 0
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_{{write|store}}_b64 v0, [[VAL]]
; <4 x i16> load whose result is stored to the addrspace(3) pointer argument.
define amdgpu_ps void @raw_buffer_load_v4i16(<4 x i32> inreg %rsrc, <4 x i16> addrspace(3)* %ptr) {
main_body:
  %loaded = call <4 x i16> @llvm.amdgcn.raw.buffer.load.v4i16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
  store <4 x i16> %loaded, <4 x i16> addrspace(3)* %ptr
  ret void
}

;CHECK-LABEL: {{^}}raw_buffer_load_x1_offset_merged:
;CHECK-NEXT: %bb.
;GFX10-NEXT: s_clause
;CHECK-NEXT: buffer_load_{{dwordx4|b128}} v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
;CHECK-NEXT: buffer_load_{{dwordx2|b64}} v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28
;CHECK: s_waitcnt
; Six single-float loads at constant offsets {4, 8, 12, 16, 28, 32}, all with
; a last operand of 0. The checks expect one four-dword load at offset:4 and
; one two-dword load at offset:28.
define amdgpu_ps void @raw_buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) {
main_body:
  %v4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0)
  %v8 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 0)
  %v12 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 12, i32 0, i32 0)
  %v16 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 16, i32 0, i32 0)
  %v28 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 28, i32 0, i32 0)
  %v32 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 32, i32 0, i32 0)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %v4, float %v8, float %v12, float %v16, i1 true, i1 true)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %v28, float %v32, float undef, float undef, i1 true, i1 true)
  ret void
}

;CHECK-LABEL: {{^}}raw_buffer_load_x1_offset_swizzled_not_merged:
;CHECK-NEXT: %bb.
;GFX10-NEXT: s_clause
;CHECK-NEXT: buffer_load_{{dword|b32}} v{{[0-9]}}, off, s[0:3], 0 offset:4
;CHECK-NEXT: buffer_load_{{dword|b32}} v{{[0-9]}}, off, s[0:3], 0 offset:8
;CHECK-NEXT: buffer_load_{{dword|b32}} v{{[0-9]}}, off, s[0:3], 0 offset:12
;CHECK-NEXT: buffer_load_{{dword|b32}} v{{[0-9]}}, off, s[0:3], 0 offset:16
;CHECK-NEXT: buffer_load_{{dword|b32}} v{{[0-9]}}, off, s[0:3], 0 offset:28
;CHECK-NEXT: buffer_load_{{dword|b32}} v{{[0-9]}}, off, s[0:3], 0 offset:32
;CHECK: s_waitcnt
; Same six constant offsets as @raw_buffer_load_x1_offset_merged, but every
; call passes 8 as its last operand. The checks expect six individual dword
; loads rather than merged ones.
define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc) {
main_body:
  %v4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 8)
  %v8 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 8)
  %v12 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 12, i32 0, i32 8)
  %v16 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 16, i32 0, i32 8)
  %v28 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 28, i32 0, i32 8)
  %v32 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 32, i32 0, i32 8)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %v4, float %v8, float %v12, float %v16, i1 true, i1 true)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %v28, float %v32, float undef, float undef, i1 true, i1 true)
  ret void
}

; Intrinsic declarations used by the tests in this file.
declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #0
declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) #0
declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32) #0
declare <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32>, i32, i32, i32) #0
declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32) #0
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
declare i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32>, i32, i32, i32) #0
declare i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32>, i32, i32, i32) #0
declare <2 x i16> @llvm.amdgcn.raw.buffer.load.v2i16(<4 x i32>, i32, i32, i32) #0
declare <4 x i16> @llvm.amdgcn.raw.buffer.load.v4i16(<4 x i32>, i32, i32, i32) #0
declare half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32>, i32, i32, i32) #0
declare <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32>, i32, i32, i32) #0
declare <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32>, i32, i32, i32) #0

attributes #0 = { nounwind readonly }