1;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
2
; Single-dword scalar buffer load with a constant byte offset of 4; the
; expected instruction carries that offset as the immediate 0x4.
; The negative pattern below deliberately has no trailing ';': s_waitcnt is
; printed with a counter operand, so a pattern ending in ';' could never
; match and would verify nothing.
;CHECK-LABEL: {{^}}s_buffer_load_imm:
;CHECK-NOT: s_waitcnt
;CHECK: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x4
define amdgpu_ps void @s_buffer_load_imm(<4 x i32> inreg %desc) {
main_body:
  %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0)
  ; Feed the loaded value to the export call so the load has a user.
  %bitcast = bitcast i32 %load to float
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true)
  ret void
}
13
; Single-dword scalar buffer load whose offset is the inreg argument %index;
; the expected instruction takes the offset from an s-register.
; The negative pattern has no trailing ';' (a pattern ending in ';' could
; never match the printed s_waitcnt, which is followed by a counter operand).
;CHECK-LABEL: {{^}}s_buffer_load_index:
;CHECK-NOT: s_waitcnt
;CHECK: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
define amdgpu_ps void @s_buffer_load_index(<4 x i32> inreg %desc, i32 inreg %index) {
main_body:
  %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %index, i32 0)
  ; Feed the loaded value to the export call so the load has a user.
  %bitcast = bitcast i32 %load to float
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true)
  ret void
}
24
; Two-dword scalar buffer load with a constant byte offset of 64; the
; expected instruction is the dwordx2 form with immediate 0x40.
; The negative pattern has no trailing ';' (a pattern ending in ';' could
; never match the printed s_waitcnt, which is followed by a counter operand).
;CHECK-LABEL: {{^}}s_buffer_loadx2_imm:
;CHECK-NOT: s_waitcnt
;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x40
define amdgpu_ps void @s_buffer_loadx2_imm(<4 x i32> inreg %desc) {
main_body:
  %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 64, i32 0)
  %bitcast = bitcast <2 x i32> %load to <2 x float>
  ; Both lanes are passed to the export call so each loaded dword is used.
  %x = extractelement <2 x float> %bitcast, i32 0
  %y = extractelement <2 x float> %bitcast, i32 1
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true)
  ret void
}
37
; Two-dword scalar buffer load whose offset is the inreg argument %index;
; the expected dwordx2 instruction takes the offset from an s-register.
; The negative pattern has no trailing ';' (a pattern ending in ';' could
; never match the printed s_waitcnt, which is followed by a counter operand).
;CHECK-LABEL: {{^}}s_buffer_loadx2_index:
;CHECK-NOT: s_waitcnt
;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
define amdgpu_ps void @s_buffer_loadx2_index(<4 x i32> inreg %desc, i32 inreg %index) {
main_body:
  %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %index, i32 0)
  %bitcast = bitcast <2 x i32> %load to <2 x float>
  ; Both lanes are passed to the export call so each loaded dword is used.
  %x = extractelement <2 x float> %bitcast, i32 0
  %y = extractelement <2 x float> %bitcast, i32 1
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true)
  ret void
}
50
; Four-dword scalar buffer load with a constant byte offset of 200; the
; expected instruction is the dwordx4 form with immediate 0xc8.
; The negative pattern has no trailing ';' (a pattern ending in ';' could
; never match the printed s_waitcnt, which is followed by a counter operand).
;CHECK-LABEL: {{^}}s_buffer_loadx4_imm:
;CHECK-NOT: s_waitcnt
;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0xc8
define amdgpu_ps void @s_buffer_loadx4_imm(<4 x i32> inreg %desc) {
main_body:
  %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 200, i32 0)
  %bitcast = bitcast <4 x i32> %load to <4 x float>
  ; All four lanes are passed to the export call so each loaded dword is used.
  %x = extractelement <4 x float> %bitcast, i32 0
  %y = extractelement <4 x float> %bitcast, i32 1
  %z = extractelement <4 x float> %bitcast, i32 2
  %w = extractelement <4 x float> %bitcast, i32 3
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true)
  ret void
}
65
; Four-dword scalar buffer load whose offset is the inreg argument %index;
; the expected dwordx4 instruction takes the offset from an s-register.
; The negative pattern has no trailing ';' (a pattern ending in ';' could
; never match the printed s_waitcnt, which is followed by a counter operand).
;CHECK-LABEL: {{^}}s_buffer_loadx4_index:
;CHECK-NOT: s_waitcnt
;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
define amdgpu_ps void @s_buffer_loadx4_index(<4 x i32> inreg %desc, i32 inreg %index) {
main_body:
  %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %index, i32 0)
  %bitcast = bitcast <4 x i32> %load to <4 x float>
  ; All four lanes are passed to the export call so each loaded dword is used.
  %x = extractelement <4 x float> %bitcast, i32 0
  %y = extractelement <4 x float> %bitcast, i32 1
  %z = extractelement <4 x float> %bitcast, i32 2
  %w = extractelement <4 x float> %bitcast, i32 3
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true)
  ret void
}
80
; Two single-dword loads from the same descriptor at byte offsets 4 and 8;
; the expected output is one dwordx2 load at immediate 0x4.
; The negative pattern has no trailing ';' (a pattern ending in ';' could
; never match the printed s_waitcnt, which is followed by a counter operand).
;CHECK-LABEL: {{^}}s_buffer_load_imm_mergex2:
;CHECK-NOT: s_waitcnt
;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x4
define amdgpu_ps void @s_buffer_load_imm_mergex2(<4 x i32> inreg %desc) {
main_body:
  ; Consecutive offsets: 4, then 4 + 4 = 8.
  %load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0)
  %load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0)
  %x = bitcast i32 %load0 to float
  %y = bitcast i32 %load1 to float
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true)
  ret void
}
93
; Four single-dword loads from the same descriptor at byte offsets 8, 12, 16
; and 20; the expected output is one dwordx4 load at immediate 0x8.
; The negative pattern has no trailing ';' (a pattern ending in ';' could
; never match the printed s_waitcnt, which is followed by a counter operand).
;CHECK-LABEL: {{^}}s_buffer_load_imm_mergex4:
;CHECK-NOT: s_waitcnt
;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x8
define amdgpu_ps void @s_buffer_load_imm_mergex4(<4 x i32> inreg %desc) {
main_body:
  ; Consecutive offsets, 4 bytes apart, starting at 8.
  %load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0)
  %load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 12, i32 0)
  %load2 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 16, i32 0)
  %load3 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 20, i32 0)
  %x = bitcast i32 %load0 to float
  %y = bitcast i32 %load1 to float
  %z = bitcast i32 %load2 to float
  %w = bitcast i32 %load3 to float
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true)
  ret void
}
110
; %index is not marked inreg here, and the offset is built across two basic
; blocks (shl in main_body, or in bb1). The expected output computes the
; offset with v_or_b32 and uses buffer_load_dword ... offen instead of an
; s_buffer_load.
; The negative pattern has no trailing ';' (a pattern ending in ';' could
; never match the printed s_waitcnt, which is followed by a counter operand).
;CHECK-LABEL: {{^}}s_buffer_load_index_across_bb:
;CHECK-NOT: s_waitcnt
;CHECK: v_or_b32
;CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
define amdgpu_ps void @s_buffer_load_index_across_bb(<4 x i32> inreg %desc, i32 %index) {
main_body:
  %tmp = shl i32 %index, 4
  br label %bb1

bb1:                                              ; preds = %main_body
  ; offset = (%index << 4) | 8
  %tmp1 = or i32 %tmp, 8
  %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %tmp1, i32 0)
  %bitcast = bitcast i32 %load to float
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true)
  ret void
}
127
; Same shape as s_buffer_load_index_across_bb, but with a second load whose
; offset is the first offset or'ed with 4. %index is not marked inreg; the
; expected output contains two v_or_b32 followed by two separate
; buffer_load_dword ... offen instructions.
; The negative pattern has no trailing ';' (a pattern ending in ';' could
; never match the printed s_waitcnt, which is followed by a counter operand).
;CHECK-LABEL: {{^}}s_buffer_load_index_across_bb_merged:
;CHECK-NOT: s_waitcnt
;CHECK: v_or_b32
;CHECK: v_or_b32
;CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
;CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
define amdgpu_ps void @s_buffer_load_index_across_bb_merged(<4 x i32> inreg %desc, i32 %index) {
main_body:
  %tmp = shl i32 %index, 4
  br label %bb1

bb1:                                              ; preds = %main_body
  ; first offset  = (%index << 4) | 8
  %tmp1 = or i32 %tmp, 8
  %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %tmp1, i32 0)
  ; second offset = first offset | 4
  %tmp2 = or i32 %tmp1, 4
  %load2 = tail call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %tmp2, i32 0)
  %bitcast = bitcast i32 %load to float
  %bitcast2 = bitcast i32 %load2 to float
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float %bitcast2, float undef, float undef, i1 true, i1 true)
  ret void
}
149
; Scalar buffer-load intrinsics exercised by the tests in this file, ordered
; by result width. Every call site passes a <4 x i32> descriptor, an i32
; offset and a constant 0 as the last operand.
declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)
declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32)
declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32)

; Consumer of the loaded values in every test function of this file.
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1)
154