1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=TONGA %s
3; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GFX81 %s
4; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GFX9 %s
5; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
6; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
7
; Samples a 2D image at (%s, %t) through llvm.amdgcn.image.sample.2d.f16.f32
; with dmask operand 1 and returns the single half result.
; Expected output on every checked target: an exec save / s_wqm / s_and
; sequence, then one image_sample writing v0 with dmask:0x1 and the d16
; modifier (gfx1010/gfx1100 additionally print dim:SQ_RSRC_IMG_2D).
8define amdgpu_ps half @image_sample_2d_f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
9; TONGA-LABEL: image_sample_2d_f16:
10; TONGA:       ; %bb.0: ; %main_body
11; TONGA-NEXT:    s_mov_b64 s[12:13], exec
12; TONGA-NEXT:    s_wqm_b64 exec, exec
13; TONGA-NEXT:    s_and_b64 exec, exec, s[12:13]
14; TONGA-NEXT:    image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 d16
15; TONGA-NEXT:    s_waitcnt vmcnt(0)
16; TONGA-NEXT:    ; return to shader part epilog
17;
18; GFX81-LABEL: image_sample_2d_f16:
19; GFX81:       ; %bb.0: ; %main_body
20; GFX81-NEXT:    s_mov_b64 s[12:13], exec
21; GFX81-NEXT:    s_wqm_b64 exec, exec
22; GFX81-NEXT:    s_and_b64 exec, exec, s[12:13]
23; GFX81-NEXT:    image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 d16
24; GFX81-NEXT:    s_waitcnt vmcnt(0)
25; GFX81-NEXT:    ; return to shader part epilog
26;
27; GFX9-LABEL: image_sample_2d_f16:
28; GFX9:       ; %bb.0: ; %main_body
29; GFX9-NEXT:    s_mov_b64 s[12:13], exec
30; GFX9-NEXT:    s_wqm_b64 exec, exec
31; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
32; GFX9-NEXT:    image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 d16
33; GFX9-NEXT:    s_waitcnt vmcnt(0)
34; GFX9-NEXT:    ; return to shader part epilog
35;
36; GFX10PLUS-LABEL: image_sample_2d_f16:
37; GFX10PLUS:       ; %bb.0: ; %main_body
38; GFX10PLUS-NEXT:    s_mov_b32 s12, exec_lo
39; GFX10PLUS-NEXT:    s_wqm_b32 exec_lo, exec_lo
40; GFX10PLUS-NEXT:    s_and_b32 exec_lo, exec_lo, s12
41; GFX10PLUS-NEXT:    image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D d16
42; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0)
43; GFX10PLUS-NEXT:    ; return to shader part epilog
44main_body:
45  %tex = call half @llvm.amdgcn.image.sample.2d.f16.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
46  ret half %tex
47}
48
; Same 2D sample as image_sample_2d_f16, but the intrinsic returns {half, i32}
; and its second-to-last i32 operand is 1 instead of 0; the expected
; instructions carry the tfe modifier. The half is returned and the i32 is
; stored to %out.
; Expected output: the two destination VGPRs are set to 0 before the sample,
; which writes v[2:3]; v3 is then stored (flat_store_dword on tonga/gfx810,
; global_store_* on gfx900 and newer) and v2 is moved to v0 for the return.
49define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, i32 addrspace(1)* inreg %out) {
50; TONGA-LABEL: image_sample_2d_f16_tfe:
51; TONGA:       ; %bb.0: ; %main_body
52; TONGA-NEXT:    s_mov_b64 s[14:15], exec
53; TONGA-NEXT:    s_wqm_b64 exec, exec
54; TONGA-NEXT:    v_mov_b32_e32 v2, 0
55; TONGA-NEXT:    v_mov_b32_e32 v3, v2
56; TONGA-NEXT:    s_and_b64 exec, exec, s[14:15]
57; TONGA-NEXT:    image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16
58; TONGA-NEXT:    v_mov_b32_e32 v0, s12
59; TONGA-NEXT:    v_mov_b32_e32 v1, s13
60; TONGA-NEXT:    s_waitcnt vmcnt(0)
61; TONGA-NEXT:    flat_store_dword v[0:1], v3
62; TONGA-NEXT:    v_mov_b32_e32 v0, v2
63; TONGA-NEXT:    s_waitcnt vmcnt(0)
64; TONGA-NEXT:    ; return to shader part epilog
65;
66; GFX81-LABEL: image_sample_2d_f16_tfe:
67; GFX81:       ; %bb.0: ; %main_body
68; GFX81-NEXT:    s_mov_b64 s[14:15], exec
69; GFX81-NEXT:    s_wqm_b64 exec, exec
70; GFX81-NEXT:    v_mov_b32_e32 v2, 0
71; GFX81-NEXT:    v_mov_b32_e32 v3, v2
72; GFX81-NEXT:    s_and_b64 exec, exec, s[14:15]
73; GFX81-NEXT:    image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16
74; GFX81-NEXT:    v_mov_b32_e32 v0, s12
75; GFX81-NEXT:    v_mov_b32_e32 v1, s13
76; GFX81-NEXT:    s_waitcnt vmcnt(0)
77; GFX81-NEXT:    flat_store_dword v[0:1], v3
78; GFX81-NEXT:    v_mov_b32_e32 v0, v2
79; GFX81-NEXT:    s_waitcnt vmcnt(0)
80; GFX81-NEXT:    ; return to shader part epilog
81;
82; GFX9-LABEL: image_sample_2d_f16_tfe:
83; GFX9:       ; %bb.0: ; %main_body
84; GFX9-NEXT:    s_mov_b64 s[14:15], exec
85; GFX9-NEXT:    s_wqm_b64 exec, exec
86; GFX9-NEXT:    v_mov_b32_e32 v4, 0
87; GFX9-NEXT:    v_mov_b32_e32 v5, v4
88; GFX9-NEXT:    v_mov_b32_e32 v2, v4
89; GFX9-NEXT:    v_mov_b32_e32 v3, v5
90; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
91; GFX9-NEXT:    image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16
92; GFX9-NEXT:    s_waitcnt vmcnt(0)
93; GFX9-NEXT:    v_mov_b32_e32 v0, v2
94; GFX9-NEXT:    global_store_dword v4, v3, s[12:13]
95; GFX9-NEXT:    s_waitcnt vmcnt(0)
96; GFX9-NEXT:    ; return to shader part epilog
97;
98; GFX10-LABEL: image_sample_2d_f16_tfe:
99; GFX10:       ; %bb.0: ; %main_body
100; GFX10-NEXT:    s_mov_b32 s14, exec_lo
101; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
102; GFX10-NEXT:    v_mov_b32_e32 v4, 0
103; GFX10-NEXT:    v_mov_b32_e32 v5, v4
104; GFX10-NEXT:    v_mov_b32_e32 v2, v4
105; GFX10-NEXT:    v_mov_b32_e32 v3, v5
106; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s14
107; GFX10-NEXT:    image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
108; GFX10-NEXT:    s_waitcnt vmcnt(0)
109; GFX10-NEXT:    v_mov_b32_e32 v0, v2
110; GFX10-NEXT:    global_store_dword v4, v3, s[12:13]
111; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
112; GFX10-NEXT:    ; return to shader part epilog
113;
114; GFX11-LABEL: image_sample_2d_f16_tfe:
115; GFX11:       ; %bb.0: ; %main_body
116; GFX11-NEXT:    s_mov_b32 s14, exec_lo
117; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
118; GFX11-NEXT:    v_mov_b32_e32 v4, 0
119; GFX11-NEXT:    v_mov_b32_e32 v5, v4
120; GFX11-NEXT:    v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5
121; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s14
122; GFX11-NEXT:    image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
123; GFX11-NEXT:    s_waitcnt vmcnt(0)
124; GFX11-NEXT:    v_mov_b32_e32 v0, v2
125; GFX11-NEXT:    global_store_b32 v4, v3, s[12:13]
126; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
127; GFX11-NEXT:    ; return to shader part epilog
128main_body:
129  %tex = call {half,i32} @llvm.amdgcn.image.sample.2d.f16i32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
130  %tex.vec = extractvalue {half, i32} %tex, 0
131  %tex.err = extractvalue {half, i32} %tex, 1
132  store i32 %tex.err, i32 addrspace(1)* %out, align 4
133  ret half %tex.vec
134}
135
; Samples a 1D image with llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32
; (operands %zcompare, %dsdh, %dsdv, %s; dmask operand 3) and returns the
; <2 x half> result bitcast to float.
; Expected output: tonga receives the two halves in separate registers
; (v[0:1]) and packs them with v_lshlrev_b32 + v_or_b32_sdwa; the other
; targets receive the pair in v0 alone. No s_wqm/exec sequence is expected
; for this function.
136define amdgpu_ps float @image_sample_c_d_1d_v2f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) {
137; TONGA-LABEL: image_sample_c_d_1d_v2f16:
138; TONGA:       ; %bb.0: ; %main_body
139; TONGA-NEXT:    image_sample_c_d v[0:1], v[0:3], s[0:7], s[8:11] dmask:0x3 d16
140; TONGA-NEXT:    s_waitcnt vmcnt(0)
141; TONGA-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
142; TONGA-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
143; TONGA-NEXT:    ; return to shader part epilog
144;
145; GFX81-LABEL: image_sample_c_d_1d_v2f16:
146; GFX81:       ; %bb.0: ; %main_body
147; GFX81-NEXT:    image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 d16
148; GFX81-NEXT:    s_waitcnt vmcnt(0)
149; GFX81-NEXT:    ; return to shader part epilog
150;
151; GFX9-LABEL: image_sample_c_d_1d_v2f16:
152; GFX9:       ; %bb.0: ; %main_body
153; GFX9-NEXT:    image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 d16
154; GFX9-NEXT:    s_waitcnt vmcnt(0)
155; GFX9-NEXT:    ; return to shader part epilog
156;
157; GFX10PLUS-LABEL: image_sample_c_d_1d_v2f16:
158; GFX10PLUS:       ; %bb.0: ; %main_body
159; GFX10PLUS-NEXT:    image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D d16
160; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0)
161; GFX10PLUS-NEXT:    ; return to shader part epilog
162main_body:
163  %tex = call <2 x half> @llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
164  %r = bitcast <2 x half> %tex to float
165  ret float %r
166}
167
; tfe variant of image_sample_c_d_1d_v2f16: the intrinsic returns
; {<2 x half>, i32} and its second-to-last i32 operand is 1. The function
; returns <2 x float> with element 0 = the <2 x half> bitcast to float and
; element 1 = the i32 bitcast to float.
; Expected output: destination registers are zeroed before the sample; tonga
; uses three result registers (v[4:6]) and packs the two halves, the other
; targets use two. gfx1010/gfx1100 pass the address as a bracketed list of
; individual registers, [v5, v4, v2, v3].
168define amdgpu_ps <2 x float> @image_sample_c_d_1d_v2f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) {
169; TONGA-LABEL: image_sample_c_d_1d_v2f16_tfe:
170; TONGA:       ; %bb.0: ; %main_body
171; TONGA-NEXT:    v_mov_b32_e32 v4, 0
172; TONGA-NEXT:    v_mov_b32_e32 v5, v4
173; TONGA-NEXT:    v_mov_b32_e32 v6, v4
174; TONGA-NEXT:    image_sample_c_d v[4:6], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16
175; TONGA-NEXT:    s_waitcnt vmcnt(0)
176; TONGA-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
177; TONGA-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
178; TONGA-NEXT:    v_mov_b32_e32 v1, v6
179; TONGA-NEXT:    ; return to shader part epilog
180;
181; GFX81-LABEL: image_sample_c_d_1d_v2f16_tfe:
182; GFX81:       ; %bb.0: ; %main_body
183; GFX81-NEXT:    v_mov_b32_e32 v4, 0
184; GFX81-NEXT:    v_mov_b32_e32 v5, v4
185; GFX81-NEXT:    image_sample_c_d v[4:5], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16
186; GFX81-NEXT:    s_waitcnt vmcnt(0)
187; GFX81-NEXT:    v_mov_b32_e32 v0, v4
188; GFX81-NEXT:    v_mov_b32_e32 v1, v5
189; GFX81-NEXT:    ; return to shader part epilog
190;
191; GFX9-LABEL: image_sample_c_d_1d_v2f16_tfe:
192; GFX9:       ; %bb.0: ; %main_body
193; GFX9-NEXT:    v_mov_b32_e32 v4, 0
194; GFX9-NEXT:    v_mov_b32_e32 v5, v4
195; GFX9-NEXT:    image_sample_c_d v[4:5], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16
196; GFX9-NEXT:    s_waitcnt vmcnt(0)
197; GFX9-NEXT:    v_mov_b32_e32 v0, v4
198; GFX9-NEXT:    v_mov_b32_e32 v1, v5
199; GFX9-NEXT:    ; return to shader part epilog
200;
201; GFX10-LABEL: image_sample_c_d_1d_v2f16_tfe:
202; GFX10:       ; %bb.0: ; %main_body
203; GFX10-NEXT:    v_mov_b32_e32 v5, v0
204; GFX10-NEXT:    v_mov_b32_e32 v0, 0
205; GFX10-NEXT:    v_mov_b32_e32 v4, v1
206; GFX10-NEXT:    v_mov_b32_e32 v1, v0
207; GFX10-NEXT:    image_sample_c_d v[0:1], [v5, v4, v2, v3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe d16
208; GFX10-NEXT:    s_waitcnt vmcnt(0)
209; GFX10-NEXT:    ; return to shader part epilog
210;
211; GFX11-LABEL: image_sample_c_d_1d_v2f16_tfe:
212; GFX11:       ; %bb.0: ; %main_body
213; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
214; GFX11-NEXT:    v_mov_b32_e32 v0, 0
215; GFX11-NEXT:    v_mov_b32_e32 v1, v0
216; GFX11-NEXT:    image_sample_c_d v[0:1], [v5, v4, v2, v3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe d16
217; GFX11-NEXT:    s_waitcnt vmcnt(0)
218; GFX11-NEXT:    ; return to shader part epilog
219main_body:
220  %tex = call {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
221  %tex.vec = extractvalue {<2 x half>, i32} %tex, 0
222  %tex.err = extractvalue {<2 x half>, i32} %tex, 1
223  %tex.vecf = bitcast <2 x half> %tex.vec to float
224  %r.0 = insertelement <2 x float> undef, float %tex.vecf, i32 0
225  %tex.errf = bitcast i32 %tex.err to float
226  %r = insertelement <2 x float> %r.0, float %tex.errf, i32 1
227  ret <2 x float> %r
228}
229
; Samples a 2D image with llvm.amdgcn.image.sample.b.2d.v3f16.f32.f32
; (operands %bias, %s, %t; dmask operand 7). The <3 x half> result is
; widened to <4 x half> by a shufflevector whose last mask index (3) selects
; from the undef second operand, then bitcast to <2 x float> and returned.
; Expected output: exec save / s_wqm / s_and, then image_sample_b with
; dmask:0x7 d16; tonga gets three result registers (v[0:2]) and packs the
; first two halves into v0, the other targets get v[0:1].
230define amdgpu_ps <2 x float> @image_sample_b_2d_v3f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
231; TONGA-LABEL: image_sample_b_2d_v3f16:
232; TONGA:       ; %bb.0: ; %main_body
233; TONGA-NEXT:    s_mov_b64 s[12:13], exec
234; TONGA-NEXT:    s_wqm_b64 exec, exec
235; TONGA-NEXT:    s_and_b64 exec, exec, s[12:13]
236; TONGA-NEXT:    image_sample_b v[0:2], v[0:2], s[0:7], s[8:11] dmask:0x7 d16
237; TONGA-NEXT:    s_waitcnt vmcnt(0)
238; TONGA-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
239; TONGA-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
240; TONGA-NEXT:    v_mov_b32_e32 v1, v2
241; TONGA-NEXT:    ; return to shader part epilog
242;
243; GFX81-LABEL: image_sample_b_2d_v3f16:
244; GFX81:       ; %bb.0: ; %main_body
245; GFX81-NEXT:    s_mov_b64 s[12:13], exec
246; GFX81-NEXT:    s_wqm_b64 exec, exec
247; GFX81-NEXT:    s_and_b64 exec, exec, s[12:13]
248; GFX81-NEXT:    image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x7 d16
249; GFX81-NEXT:    s_waitcnt vmcnt(0)
250; GFX81-NEXT:    ; return to shader part epilog
251;
252; GFX9-LABEL: image_sample_b_2d_v3f16:
253; GFX9:       ; %bb.0: ; %main_body
254; GFX9-NEXT:    s_mov_b64 s[12:13], exec
255; GFX9-NEXT:    s_wqm_b64 exec, exec
256; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
257; GFX9-NEXT:    image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x7 d16
258; GFX9-NEXT:    s_waitcnt vmcnt(0)
259; GFX9-NEXT:    ; return to shader part epilog
260;
261; GFX10PLUS-LABEL: image_sample_b_2d_v3f16:
262; GFX10PLUS:       ; %bb.0: ; %main_body
263; GFX10PLUS-NEXT:    s_mov_b32 s12, exec_lo
264; GFX10PLUS-NEXT:    s_wqm_b32 exec_lo, exec_lo
265; GFX10PLUS-NEXT:    s_and_b32 exec_lo, exec_lo, s12
266; GFX10PLUS-NEXT:    image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D d16
267; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0)
268; GFX10PLUS-NEXT:    ; return to shader part epilog
269main_body:
270  %tex = call <3 x half> @llvm.amdgcn.image.sample.b.2d.v3f16.f32.f32(i32 7, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
271  %tex_wide = shufflevector <3 x half> %tex, <3 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
272  %r = bitcast <4 x half> %tex_wide to <2 x float>
273  ret <2 x float> %r
274}
275
; tfe variant of image_sample_b_2d_v3f16: the intrinsic returns
; {<3 x half>, i32} and its second-to-last i32 operand is 1. The halves are
; widened to <4 x half> and bitcast to <2 x float>, which fills elements 0
; and 1 of the returned <4 x float>; the i32 (bitcast to float) fills
; element 2, and element 3 is left undef.
; Expected output: destination registers are zeroed before the sample; tonga
; uses four result registers (v[3:6]) and packs the first two halves, the
; other targets use three result registers.
276define amdgpu_ps <4 x float> @image_sample_b_2d_v3f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
277; TONGA-LABEL: image_sample_b_2d_v3f16_tfe:
278; TONGA:       ; %bb.0: ; %main_body
279; TONGA-NEXT:    s_mov_b64 s[12:13], exec
280; TONGA-NEXT:    s_wqm_b64 exec, exec
281; TONGA-NEXT:    v_mov_b32_e32 v3, 0
282; TONGA-NEXT:    v_mov_b32_e32 v4, v3
283; TONGA-NEXT:    v_mov_b32_e32 v5, v3
284; TONGA-NEXT:    v_mov_b32_e32 v6, v3
285; TONGA-NEXT:    s_and_b64 exec, exec, s[12:13]
286; TONGA-NEXT:    image_sample_b v[3:6], v[0:2], s[0:7], s[8:11] dmask:0x7 tfe d16
287; TONGA-NEXT:    s_waitcnt vmcnt(0)
288; TONGA-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
289; TONGA-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
290; TONGA-NEXT:    v_mov_b32_e32 v1, v5
291; TONGA-NEXT:    v_mov_b32_e32 v2, v6
292; TONGA-NEXT:    ; return to shader part epilog
293;
294; GFX81-LABEL: image_sample_b_2d_v3f16_tfe:
295; GFX81:       ; %bb.0: ; %main_body
296; GFX81-NEXT:    s_mov_b64 s[12:13], exec
297; GFX81-NEXT:    s_wqm_b64 exec, exec
298; GFX81-NEXT:    v_mov_b32_e32 v3, 0
299; GFX81-NEXT:    v_mov_b32_e32 v4, v3
300; GFX81-NEXT:    v_mov_b32_e32 v5, v3
301; GFX81-NEXT:    s_and_b64 exec, exec, s[12:13]
302; GFX81-NEXT:    image_sample_b v[3:5], v[0:2], s[0:7], s[8:11] dmask:0x7 tfe d16
303; GFX81-NEXT:    s_waitcnt vmcnt(0)
304; GFX81-NEXT:    v_mov_b32_e32 v0, v3
305; GFX81-NEXT:    v_mov_b32_e32 v1, v4
306; GFX81-NEXT:    v_mov_b32_e32 v2, v5
307; GFX81-NEXT:    ; return to shader part epilog
308;
309; GFX9-LABEL: image_sample_b_2d_v3f16_tfe:
310; GFX9:       ; %bb.0: ; %main_body
311; GFX9-NEXT:    s_mov_b64 s[12:13], exec
312; GFX9-NEXT:    s_wqm_b64 exec, exec
313; GFX9-NEXT:    v_mov_b32_e32 v3, 0
314; GFX9-NEXT:    v_mov_b32_e32 v4, v3
315; GFX9-NEXT:    v_mov_b32_e32 v5, v3
316; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
317; GFX9-NEXT:    image_sample_b v[3:5], v[0:2], s[0:7], s[8:11] dmask:0x7 tfe d16
318; GFX9-NEXT:    s_waitcnt vmcnt(0)
319; GFX9-NEXT:    v_mov_b32_e32 v0, v3
320; GFX9-NEXT:    v_mov_b32_e32 v1, v4
321; GFX9-NEXT:    v_mov_b32_e32 v2, v5
322; GFX9-NEXT:    ; return to shader part epilog
323;
324; GFX10-LABEL: image_sample_b_2d_v3f16_tfe:
325; GFX10:       ; %bb.0: ; %main_body
326; GFX10-NEXT:    s_mov_b32 s12, exec_lo
327; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
328; GFX10-NEXT:    v_mov_b32_e32 v3, v0
329; GFX10-NEXT:    v_mov_b32_e32 v0, 0
330; GFX10-NEXT:    v_mov_b32_e32 v5, v2
331; GFX10-NEXT:    v_mov_b32_e32 v4, v1
332; GFX10-NEXT:    v_mov_b32_e32 v1, v0
333; GFX10-NEXT:    v_mov_b32_e32 v2, v0
334; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s12
335; GFX10-NEXT:    image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D tfe d16
336; GFX10-NEXT:    s_waitcnt vmcnt(0)
337; GFX10-NEXT:    ; return to shader part epilog
338;
339; GFX11-LABEL: image_sample_b_2d_v3f16_tfe:
340; GFX11:       ; %bb.0: ; %main_body
341; GFX11-NEXT:    s_mov_b32 s12, exec_lo
342; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
343; GFX11-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v0, 0
344; GFX11-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
345; GFX11-NEXT:    v_mov_b32_e32 v1, v0
346; GFX11-NEXT:    v_mov_b32_e32 v2, v0
347; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s12
348; GFX11-NEXT:    image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D tfe d16
349; GFX11-NEXT:    s_waitcnt vmcnt(0)
350; GFX11-NEXT:    ; return to shader part epilog
351main_body:
352  %tex = call {<3 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v3f16i32.f32.f32(i32 7, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
353  %tex.vec = extractvalue {<3 x half>, i32} %tex, 0
354  %tex.vec_wide = shufflevector <3 x half> %tex.vec, <3 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
355  %tex.err = extractvalue {<3 x half>, i32} %tex, 1
356  %tex.vecf = bitcast <4 x half> %tex.vec_wide to <2 x float>
357  %tex.vecf.0 = extractelement <2 x float> %tex.vecf, i32 0
358  %tex.vecf.1 = extractelement <2 x float> %tex.vecf, i32 1
359  %r.0 = insertelement <4 x float> undef, float %tex.vecf.0, i32 0
360  %r.1 = insertelement <4 x float> %r.0, float %tex.vecf.1, i32 1
361  %tex.errf = bitcast i32 %tex.err to float
362  %r = insertelement <4 x float> %r.1, float %tex.errf, i32 2
363  ret <4 x float> %r
364}
365
; Samples a 2D image with llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32
; (operands %bias, %s, %t; dmask operand 15) and returns the <4 x half>
; result bitcast to <2 x float>.
; Expected output: exec save / s_wqm / s_and, then image_sample_b with
; dmask:0xf d16; tonga gets four result registers (v[0:3]) and packs them
; pairwise into v0 and v1, the other targets get v[0:1] directly.
366define amdgpu_ps <2 x float> @image_sample_b_2d_v4f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
367; TONGA-LABEL: image_sample_b_2d_v4f16:
368; TONGA:       ; %bb.0: ; %main_body
369; TONGA-NEXT:    s_mov_b64 s[12:13], exec
370; TONGA-NEXT:    s_wqm_b64 exec, exec
371; TONGA-NEXT:    s_and_b64 exec, exec, s[12:13]
372; TONGA-NEXT:    image_sample_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf d16
373; TONGA-NEXT:    s_waitcnt vmcnt(0)
374; TONGA-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
375; TONGA-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
376; TONGA-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
377; TONGA-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
378; TONGA-NEXT:    ; return to shader part epilog
379;
380; GFX81-LABEL: image_sample_b_2d_v4f16:
381; GFX81:       ; %bb.0: ; %main_body
382; GFX81-NEXT:    s_mov_b64 s[12:13], exec
383; GFX81-NEXT:    s_wqm_b64 exec, exec
384; GFX81-NEXT:    s_and_b64 exec, exec, s[12:13]
385; GFX81-NEXT:    image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0xf d16
386; GFX81-NEXT:    s_waitcnt vmcnt(0)
387; GFX81-NEXT:    ; return to shader part epilog
388;
389; GFX9-LABEL: image_sample_b_2d_v4f16:
390; GFX9:       ; %bb.0: ; %main_body
391; GFX9-NEXT:    s_mov_b64 s[12:13], exec
392; GFX9-NEXT:    s_wqm_b64 exec, exec
393; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
394; GFX9-NEXT:    image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0xf d16
395; GFX9-NEXT:    s_waitcnt vmcnt(0)
396; GFX9-NEXT:    ; return to shader part epilog
397;
398; GFX10PLUS-LABEL: image_sample_b_2d_v4f16:
399; GFX10PLUS:       ; %bb.0: ; %main_body
400; GFX10PLUS-NEXT:    s_mov_b32 s12, exec_lo
401; GFX10PLUS-NEXT:    s_wqm_b32 exec_lo, exec_lo
402; GFX10PLUS-NEXT:    s_and_b32 exec_lo, exec_lo, s12
403; GFX10PLUS-NEXT:    image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D d16
404; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0)
405; GFX10PLUS-NEXT:    ; return to shader part epilog
406main_body:
407  %tex = call <4 x half> @llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
408  %r = bitcast <4 x half> %tex to <2 x float>
409  ret <2 x float> %r
410}
411
; tfe variant of image_sample_b_2d_v4f16: the intrinsic returns
; {<4 x half>, i32} and its second-to-last i32 operand is 1. The halves are
; bitcast to <2 x float>, which fills elements 0 and 1 of the returned
; <4 x float>; the i32 (bitcast to float) fills element 2, and element 3 is
; left undef.
; Expected output: destination registers are zeroed before the sample; tonga
; uses five result registers (v[3:7]) and packs the halves pairwise, the
; other targets use three result registers.
412define amdgpu_ps <4 x float> @image_sample_b_2d_v4f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
413; TONGA-LABEL: image_sample_b_2d_v4f16_tfe:
414; TONGA:       ; %bb.0: ; %main_body
415; TONGA-NEXT:    s_mov_b64 s[12:13], exec
416; TONGA-NEXT:    s_wqm_b64 exec, exec
417; TONGA-NEXT:    v_mov_b32_e32 v3, 0
418; TONGA-NEXT:    v_mov_b32_e32 v4, v3
419; TONGA-NEXT:    v_mov_b32_e32 v5, v3
420; TONGA-NEXT:    v_mov_b32_e32 v6, v3
421; TONGA-NEXT:    v_mov_b32_e32 v7, v3
422; TONGA-NEXT:    s_and_b64 exec, exec, s[12:13]
423; TONGA-NEXT:    image_sample_b v[3:7], v[0:2], s[0:7], s[8:11] dmask:0xf tfe d16
424; TONGA-NEXT:    s_waitcnt vmcnt(0)
425; TONGA-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
426; TONGA-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
427; TONGA-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
428; TONGA-NEXT:    v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
429; TONGA-NEXT:    v_mov_b32_e32 v2, v7
430; TONGA-NEXT:    ; return to shader part epilog
431;
432; GFX81-LABEL: image_sample_b_2d_v4f16_tfe:
433; GFX81:       ; %bb.0: ; %main_body
434; GFX81-NEXT:    s_mov_b64 s[12:13], exec
435; GFX81-NEXT:    s_wqm_b64 exec, exec
436; GFX81-NEXT:    v_mov_b32_e32 v3, 0
437; GFX81-NEXT:    v_mov_b32_e32 v4, v3
438; GFX81-NEXT:    v_mov_b32_e32 v5, v3
439; GFX81-NEXT:    s_and_b64 exec, exec, s[12:13]
440; GFX81-NEXT:    image_sample_b v[3:5], v[0:2], s[0:7], s[8:11] dmask:0xf tfe d16
441; GFX81-NEXT:    s_waitcnt vmcnt(0)
442; GFX81-NEXT:    v_mov_b32_e32 v0, v3
443; GFX81-NEXT:    v_mov_b32_e32 v1, v4
444; GFX81-NEXT:    v_mov_b32_e32 v2, v5
445; GFX81-NEXT:    ; return to shader part epilog
446;
447; GFX9-LABEL: image_sample_b_2d_v4f16_tfe:
448; GFX9:       ; %bb.0: ; %main_body
449; GFX9-NEXT:    s_mov_b64 s[12:13], exec
450; GFX9-NEXT:    s_wqm_b64 exec, exec
451; GFX9-NEXT:    v_mov_b32_e32 v3, 0
452; GFX9-NEXT:    v_mov_b32_e32 v4, v3
453; GFX9-NEXT:    v_mov_b32_e32 v5, v3
454; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
455; GFX9-NEXT:    image_sample_b v[3:5], v[0:2], s[0:7], s[8:11] dmask:0xf tfe d16
456; GFX9-NEXT:    s_waitcnt vmcnt(0)
457; GFX9-NEXT:    v_mov_b32_e32 v0, v3
458; GFX9-NEXT:    v_mov_b32_e32 v1, v4
459; GFX9-NEXT:    v_mov_b32_e32 v2, v5
460; GFX9-NEXT:    ; return to shader part epilog
461;
462; GFX10-LABEL: image_sample_b_2d_v4f16_tfe:
463; GFX10:       ; %bb.0: ; %main_body
464; GFX10-NEXT:    s_mov_b32 s12, exec_lo
465; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
466; GFX10-NEXT:    v_mov_b32_e32 v3, v0
467; GFX10-NEXT:    v_mov_b32_e32 v0, 0
468; GFX10-NEXT:    v_mov_b32_e32 v5, v2
469; GFX10-NEXT:    v_mov_b32_e32 v4, v1
470; GFX10-NEXT:    v_mov_b32_e32 v1, v0
471; GFX10-NEXT:    v_mov_b32_e32 v2, v0
472; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s12
473; GFX10-NEXT:    image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D tfe d16
474; GFX10-NEXT:    s_waitcnt vmcnt(0)
475; GFX10-NEXT:    ; return to shader part epilog
476;
477; GFX11-LABEL: image_sample_b_2d_v4f16_tfe:
478; GFX11:       ; %bb.0: ; %main_body
479; GFX11-NEXT:    s_mov_b32 s12, exec_lo
480; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
481; GFX11-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v0, 0
482; GFX11-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
483; GFX11-NEXT:    v_mov_b32_e32 v1, v0
484; GFX11-NEXT:    v_mov_b32_e32 v2, v0
485; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s12
486; GFX11-NEXT:    image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D tfe d16
487; GFX11-NEXT:    s_waitcnt vmcnt(0)
488; GFX11-NEXT:    ; return to shader part epilog
489main_body:
490  %tex = call {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
491  %tex.vec = extractvalue {<4 x half>, i32} %tex, 0
492  %tex.err = extractvalue {<4 x half>, i32} %tex, 1
493  %tex.vecf = bitcast <4 x half> %tex.vec to <2 x float>
494  %tex.vecf.0 = extractelement <2 x float> %tex.vecf, i32 0
495  %tex.vecf.1 = extractelement <2 x float> %tex.vecf, i32 1
496  %r.0 = insertelement <4 x float> undef, float %tex.vecf.0, i32 0
497  %r.1 = insertelement <4 x float> %r.0, float %tex.vecf.1, i32 1
498  %tex.errf = bitcast i32 %tex.err to float
499  %r = insertelement <4 x float> %r.1, float %tex.errf, i32 2
500  ret <4 x float> %r
501}
502
; Intrinsic declarations used by the tests in this file; all of them carry
; attribute group #1 (nounwind readonly).
; NOTE(review): the three llvm.amdgcn.image.sample.2d.* declarations for
; v3f16, v4f16 and v2f16i32 are not called by any function visible in this
; file, and attribute groups #0 and #2 are not referenced by anything visible
; here - confirm whether they are leftovers before removing them.
503declare half @llvm.amdgcn.image.sample.2d.f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
504declare {half,i32} @llvm.amdgcn.image.sample.2d.f16i32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
505declare <3 x half> @llvm.amdgcn.image.sample.2d.v3f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
506declare <4 x half> @llvm.amdgcn.image.sample.2d.v4f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
507declare {<2 x half>,i32} @llvm.amdgcn.image.sample.2d.v2f16i32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
508declare <2 x half> @llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
509declare {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
510declare <3 x half> @llvm.amdgcn.image.sample.b.2d.v3f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
511declare {<3 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v3f16i32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
512declare <4 x half> @llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
513declare {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
514
515attributes #0 = { nounwind }
516attributes #1 = { nounwind readonly }
517attributes #2 = { nounwind readnone }
518