;RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,SI
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,VI

; Codegen checks for the llvm.amdgcn.s.buffer.load.* intrinsics, run once for
; -mcpu=tahiti and once for -mcpu=tonga. Every test loads through the inreg
; descriptor %desc and feeds the result to llvm.amdgcn.exp.f32 so the load is
; kept alive.
;
; NOTE(review): the negative patterns below are the literal text "s_waitcnt;"
; including the trailing semicolon, so a plain s_waitcnt instruction would not
; match them - confirm whether the semicolon is intended.

; i32 load at constant offset 4. The expected immediate is 0x1 for tahiti and
; 0x4 for tonga.
;GCN-LABEL: {{^}}s_buffer_load_imm:
;GCN-NOT: s_waitcnt;
;SI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x1
;VI: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x4
define amdgpu_ps void @s_buffer_load_imm(<4 x i32> inreg %desc) {
main_body:
  %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0)
  %bitcast = bitcast i32 %load to float
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true)
  ret void
}

; i32 load whose offset is the inreg argument %index. The expected instruction
; takes the offset as an s-register operand.
;GCN-LABEL: {{^}}s_buffer_load_index:
;GCN-NOT: s_waitcnt;
;GCN: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
define amdgpu_ps void @s_buffer_load_index(<4 x i32> inreg %desc, i32 inreg %index) {
main_body:
  %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %index, i32 0)
  %bitcast = bitcast i32 %load to float
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true)
  ret void
}

; i32 load whose offset argument %index is not inreg. The expected instruction
; is buffer_load_dword with a v-register offset and the offen modifier.
;GCN-LABEL: {{^}}s_buffer_load_index_divergent:
;GCN-NOT: s_waitcnt;
;GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
define amdgpu_ps void @s_buffer_load_index_divergent(<4 x i32> inreg %desc, i32 %index) {
main_body:
  %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %index, i32 0)
  %bitcast = bitcast i32 %load to float
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true)
  ret void
}

; <2 x i32> load at constant offset 64. The expected immediate is 0x10 for
; tahiti and 0x40 for tonga.
;GCN-LABEL: {{^}}s_buffer_loadx2_imm:
;GCN-NOT: s_waitcnt;
;SI: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x10
;VI: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x40
define amdgpu_ps void @s_buffer_loadx2_imm(<4 x i32> inreg %desc) {
main_body:
  %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 64, i32 0)
  %bitcast = bitcast <2 x i32> %load to <2 x float>
  %x = extractelement <2 x float> %bitcast, i32 0
  %y = extractelement <2 x float> %bitcast, i32 1
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true)
  ret void
}

; <2 x i32> load with an inreg offset argument. The expected instruction is
; s_buffer_load_dwordx2 with an s-register offset operand.
;GCN-LABEL: {{^}}s_buffer_loadx2_index:
;GCN-NOT: s_waitcnt;
;GCN: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
define amdgpu_ps void @s_buffer_loadx2_index(<4 x i32> inreg %desc, i32 inreg %index) {
main_body:
  %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %index, i32 0)
  %bitcast = bitcast <2 x i32> %load to <2 x float>
  %x = extractelement <2 x float> %bitcast, i32 0
  %y = extractelement <2 x float> %bitcast, i32 1
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true)
  ret void
}

; <2 x i32> load with a non-inreg offset argument. The expected instruction is
; buffer_load_dwordx2 with a v-register offset and offen.
;GCN-LABEL: {{^}}s_buffer_loadx2_index_divergent:
;GCN-NOT: s_waitcnt;
;GCN: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
define amdgpu_ps void @s_buffer_loadx2_index_divergent(<4 x i32> inreg %desc, i32 %index) {
main_body:
  %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %index, i32 0)
  %bitcast = bitcast <2 x i32> %load to <2 x float>
  %x = extractelement <2 x float> %bitcast, i32 0
  %y = extractelement <2 x float> %bitcast, i32 1
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true)
  ret void
}

; <3 x i32> load at constant offset 64. Both targets are expected to emit
; s_buffer_load_dwordx4, with immediate 0x10 for tahiti and 0x40 for tonga.
;GCN-LABEL: {{^}}s_buffer_loadx3_imm:
;GCN-NOT: s_waitcnt;
;SI: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x10
;VI: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x40
define amdgpu_ps void @s_buffer_loadx3_imm(<4 x i32> inreg %desc) {
main_body:
  %load = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %desc, i32 64, i32 0)
  %bitcast = bitcast <3 x i32> %load to <3 x float>
  %x = extractelement <3 x float> %bitcast, i32 0
  %y = extractelement <3 x float> %bitcast, i32 1
  %z = extractelement <3 x float> %bitcast, i32 2
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float undef, i1 true, i1 true)
  ret void
}

; <3 x i32> load with an inreg offset argument. The expected instruction is
; s_buffer_load_dwordx4 with an s-register offset operand.
;GCN-LABEL: {{^}}s_buffer_loadx3_index:
;GCN-NOT: s_waitcnt;
;GCN: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
define amdgpu_ps void @s_buffer_loadx3_index(<4 x i32> inreg %desc, i32 inreg %index) {
main_body:
  %load = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %desc, i32 %index, i32 0)
  %bitcast = bitcast <3 x i32> %load to <3 x float>
  %x = extractelement <3 x float> %bitcast, i32 0
  %y = extractelement <3 x float> %bitcast, i32 1
  %z = extractelement <3 x float> %bitcast, i32 2
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float undef, i1 true, i1 true)
  ret void
}

; <3 x i32> load with a non-inreg offset argument. The tahiti run expects
; buffer_load_dwordx4, while the tonga run expects buffer_load_dwordx3.
;GCN-LABEL: {{^}}s_buffer_loadx3_index_divergent:
;GCN-NOT: s_waitcnt;
;SI: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
;VI: buffer_load_dwordx3 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
define amdgpu_ps void @s_buffer_loadx3_index_divergent(<4 x i32> inreg %desc, i32 %index) {
main_body:
  %load = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %desc, i32 %index, i32 0)
  %bitcast = bitcast <3 x i32> %load to <3 x float>
  %x = extractelement <3 x float> %bitcast, i32 0
  %y = extractelement <3 x float> %bitcast, i32 1
  %z = extractelement <3 x float> %bitcast, i32 2
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float undef, i1 true, i1 true)
  ret void
}

; <4 x i32> load at constant offset 200. The expected immediate is 0x32 for
; tahiti and 0xc8 for tonga.
;GCN-LABEL: {{^}}s_buffer_loadx4_imm:
;GCN-NOT: s_waitcnt;
;SI: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x32
;VI: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0xc8
define amdgpu_ps void @s_buffer_loadx4_imm(<4 x i32> inreg %desc) {
main_body:
  %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 200, i32 0)
  %bitcast = bitcast <4 x i32> %load to <4 x float>
  %x = extractelement <4 x float> %bitcast, i32 0
  %y = extractelement <4 x float> %bitcast, i32 1
  %z = extractelement <4 x float> %bitcast, i32 2
  %w = extractelement <4 x float> %bitcast, i32 3
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true)
  ret void
}

; <4 x i32> load with an inreg offset argument. The scalar mnemonic is spelled
; out in full (s_buffer_load_dwordx4) so the pattern does not depend on
; substring matching and agrees with the constant-offset test above.
;GCN-LABEL: {{^}}s_buffer_loadx4_index:
;GCN-NOT: s_waitcnt;
;GCN: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
define amdgpu_ps void @s_buffer_loadx4_index(<4 x i32> inreg %desc, i32 inreg %index) {
main_body:
  %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %index, i32 0)
  %bitcast = bitcast <4 x i32> %load to <4 x float>
  %x = extractelement <4 x float> %bitcast, i32 0
  %y = extractelement <4 x float> %bitcast, i32 1
  %z = extractelement <4 x float> %bitcast, i32 2
  %w = extractelement <4 x float> %bitcast, i32 3
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true)
  ret void
}

; <4 x i32> load with a non-inreg offset argument. The expected instruction is
; buffer_load_dwordx4 with a v-register offset and offen.
;GCN-LABEL: {{^}}s_buffer_loadx4_index_divergent:
;GCN-NOT: s_waitcnt;
;GCN: buffer_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
define amdgpu_ps void @s_buffer_loadx4_index_divergent(<4 x i32> inreg %desc, i32 %index) {
main_body:
  %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %index, i32 0)
  %bitcast = bitcast <4 x i32> %load to <4 x float>
  %x = extractelement <4 x float> %bitcast, i32 0
  %y = extractelement <4 x float> %bitcast, i32 1
  %z = extractelement <4 x float> %bitcast, i32 2
  %w = extractelement <4 x float> %bitcast, i32 3
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true)
  ret void
}

; Two i32 loads from the same descriptor at constant offsets 4 and 8. A single
; s_buffer_load_dwordx2 is expected, with immediate 0x1 for tahiti and 0x4 for
; tonga.
;GCN-LABEL: {{^}}s_buffer_load_imm_mergex2:
;GCN-NOT: s_waitcnt;
;SI: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x1
;VI: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x4
define amdgpu_ps void @s_buffer_load_imm_mergex2(<4 x i32> inreg %desc) {
main_body:
  %load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0)
  %load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0)
  %x = bitcast i32 %load0 to float
  %y = bitcast i32 %load1 to float
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true)
  ret void
}

; Four i32 loads from the same descriptor at constant offsets 8, 12, 16 and 20.
; A single s_buffer_load_dwordx4 is expected, with immediate 0x2 for tahiti and
; 0x8 for tonga.
;GCN-LABEL: {{^}}s_buffer_load_imm_mergex4:
;GCN-NOT: s_waitcnt;
;SI: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x2
;VI: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x8
define amdgpu_ps void @s_buffer_load_imm_mergex4(<4 x i32> inreg %desc) {
main_body:
  %load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0)
  %load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 12, i32 0)
  %load2 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 16, i32 0)
  %load3 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 20, i32 0)
  %x = bitcast i32 %load0 to float
  %y = bitcast i32 %load1 to float
  %z = bitcast i32 %load2 to float
  %w = bitcast i32 %load3 to float
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true)
  ret void
}

; The offset is built from a non-inreg argument: the shl sits in main_body and
; the or in bb1, next to the load. A v_or_b32 followed by buffer_load_dword
; with offen is expected.
;GCN-LABEL: {{^}}s_buffer_load_index_across_bb:
;GCN-NOT: s_waitcnt;
;GCN: v_or_b32
;GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
define amdgpu_ps void @s_buffer_load_index_across_bb(<4 x i32> inreg %desc, i32 %index) {
main_body:
  %tmp = shl i32 %index, 4
  br label %bb1

bb1:                                              ; preds = %main_body
  %tmp1 = or i32 %tmp, 8
  %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %tmp1, i32 0)
  %bitcast = bitcast i32 %load to float
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true)
  ret void
}

; Same shape as the previous test, but bb1 issues two loads whose offsets are
; %tmp1 and %tmp1 or'ed with 4. Two v_or_b32 instructions followed by two
; separate buffer_load_dword instructions are expected.
;GCN-LABEL: {{^}}s_buffer_load_index_across_bb_merged:
;GCN-NOT: s_waitcnt;
;GCN: v_or_b32
;GCN: v_or_b32
;GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
;GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
define amdgpu_ps void @s_buffer_load_index_across_bb_merged(<4 x i32> inreg %desc, i32 %index) {
main_body:
  %tmp = shl i32 %index, 4
  br label %bb1

bb1:                                              ; preds = %main_body
  %tmp1 = or i32 %tmp, 8
  %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %tmp1, i32 0)
  %tmp2 = or i32 %tmp1, 4
  %load2 = tail call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %tmp2, i32 0)
  %bitcast = bitcast i32 %load to float
  %bitcast2 = bitcast i32 %load2 to float
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float %bitcast2, float undef, float undef, i1 true, i1 true)
  ret void
}

; Intrinsic declarations used by the tests in this file.
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1)
declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)
declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32)
declare <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32>, i32, i32)
declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32)