1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=TONGA %s
3; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GFX81 %s
4; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GFX9 %s
5; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10 %s
6
; Scalar half result from a 2D sample. The first call operand (i32 1) appears as
; dmask:0x1 in the expected output, and every target is expected to emit a
; single destination register (v0) with the d16 modifier. The sample is
; preceded by a save / s_wqm / s_and sequence on exec; with -mcpu=gfx1010 the
; 32-bit forms on exec_lo are expected and dim:SQ_RSRC_IMG_2D is printed.
7define amdgpu_ps half @image_sample_2d_f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
8; TONGA-LABEL: image_sample_2d_f16:
9; TONGA:       ; %bb.0: ; %main_body
10; TONGA-NEXT:    s_mov_b64 s[12:13], exec
11; TONGA-NEXT:    s_wqm_b64 exec, exec
12; TONGA-NEXT:    s_and_b64 exec, exec, s[12:13]
13; TONGA-NEXT:    image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 d16
14; TONGA-NEXT:    s_waitcnt vmcnt(0)
15; TONGA-NEXT:    ; return to shader part epilog
16;
17; GFX81-LABEL: image_sample_2d_f16:
18; GFX81:       ; %bb.0: ; %main_body
19; GFX81-NEXT:    s_mov_b64 s[12:13], exec
20; GFX81-NEXT:    s_wqm_b64 exec, exec
21; GFX81-NEXT:    s_and_b64 exec, exec, s[12:13]
22; GFX81-NEXT:    image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 d16
23; GFX81-NEXT:    s_waitcnt vmcnt(0)
24; GFX81-NEXT:    ; return to shader part epilog
25;
26; GFX9-LABEL: image_sample_2d_f16:
27; GFX9:       ; %bb.0: ; %main_body
28; GFX9-NEXT:    s_mov_b64 s[12:13], exec
29; GFX9-NEXT:    s_wqm_b64 exec, exec
30; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
31; GFX9-NEXT:    image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 d16
32; GFX9-NEXT:    s_waitcnt vmcnt(0)
33; GFX9-NEXT:    ; return to shader part epilog
34;
35; GFX10-LABEL: image_sample_2d_f16:
36; GFX10:       ; %bb.0: ; %main_body
37; GFX10-NEXT:    s_mov_b32 s12, exec_lo
38; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
39; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s12
40; GFX10-NEXT:    image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D d16
41; GFX10-NEXT:    s_waitcnt vmcnt(0)
42; GFX10-NEXT:    ; return to shader part epilog
43main_body:
44  %tex = call half @llvm.amdgcn.image.sample.2d.f16.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
45  ret half %tex
46}
47
; TFE variant of the 2D half sample: the intrinsic returns {half, i32} and the
; second-to-last call operand is 1 instead of 0, which coincides with the tfe
; modifier and a two-register destination (v[2:3]) in the expected output.
; Both destination registers are expected to be written with 0 before the
; sample. Element 1 of the result is stored through %out and element 0 is
; returned.
48define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, i32 addrspace(1)* inreg %out) {
49; TONGA-LABEL: image_sample_2d_f16_tfe:
50; TONGA:       ; %bb.0: ; %main_body
51; TONGA-NEXT:    s_mov_b64 s[14:15], exec
52; TONGA-NEXT:    s_wqm_b64 exec, exec
53; TONGA-NEXT:    v_mov_b32_e32 v2, 0
54; TONGA-NEXT:    v_mov_b32_e32 v3, v2
55; TONGA-NEXT:    s_and_b64 exec, exec, s[14:15]
56; TONGA-NEXT:    image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16
57; TONGA-NEXT:    v_mov_b32_e32 v0, s12
58; TONGA-NEXT:    v_mov_b32_e32 v1, s13
59; TONGA-NEXT:    s_waitcnt vmcnt(0)
60; TONGA-NEXT:    flat_store_dword v[0:1], v3
61; TONGA-NEXT:    v_mov_b32_e32 v0, v2
62; TONGA-NEXT:    s_waitcnt vmcnt(0)
63; TONGA-NEXT:    ; return to shader part epilog
64;
65; GFX81-LABEL: image_sample_2d_f16_tfe:
66; GFX81:       ; %bb.0: ; %main_body
67; GFX81-NEXT:    s_mov_b64 s[14:15], exec
68; GFX81-NEXT:    s_wqm_b64 exec, exec
69; GFX81-NEXT:    v_mov_b32_e32 v2, 0
70; GFX81-NEXT:    v_mov_b32_e32 v3, v2
71; GFX81-NEXT:    s_and_b64 exec, exec, s[14:15]
72; GFX81-NEXT:    image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16
73; GFX81-NEXT:    v_mov_b32_e32 v0, s12
74; GFX81-NEXT:    v_mov_b32_e32 v1, s13
75; GFX81-NEXT:    s_waitcnt vmcnt(0)
76; GFX81-NEXT:    flat_store_dword v[0:1], v3
77; GFX81-NEXT:    v_mov_b32_e32 v0, v2
78; GFX81-NEXT:    s_waitcnt vmcnt(0)
79; GFX81-NEXT:    ; return to shader part epilog
80;
81; GFX9-LABEL: image_sample_2d_f16_tfe:
82; GFX9:       ; %bb.0: ; %main_body
83; GFX9-NEXT:    s_mov_b64 s[14:15], exec
84; GFX9-NEXT:    s_wqm_b64 exec, exec
85; GFX9-NEXT:    v_mov_b32_e32 v4, 0
86; GFX9-NEXT:    v_mov_b32_e32 v5, v4
87; GFX9-NEXT:    v_mov_b32_e32 v2, v4
88; GFX9-NEXT:    v_mov_b32_e32 v3, v5
89; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
90; GFX9-NEXT:    image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16
91; GFX9-NEXT:    s_waitcnt vmcnt(0)
92; GFX9-NEXT:    v_mov_b32_e32 v0, v2
93; GFX9-NEXT:    global_store_dword v4, v3, s[12:13]
94; GFX9-NEXT:    s_waitcnt vmcnt(0)
95; GFX9-NEXT:    ; return to shader part epilog
96;
97; GFX10-LABEL: image_sample_2d_f16_tfe:
98; GFX10:       ; %bb.0: ; %main_body
99; GFX10-NEXT:    s_mov_b32 s14, exec_lo
100; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
101; GFX10-NEXT:    v_mov_b32_e32 v4, 0
102; GFX10-NEXT:    v_mov_b32_e32 v5, v4
103; GFX10-NEXT:    v_mov_b32_e32 v2, v4
104; GFX10-NEXT:    v_mov_b32_e32 v3, v5
105; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s14
106; GFX10-NEXT:    image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
107; GFX10-NEXT:    s_waitcnt vmcnt(0)
108; GFX10-NEXT:    v_mov_b32_e32 v0, v2
109; GFX10-NEXT:    global_store_dword v4, v3, s[12:13]
110; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
111; GFX10-NEXT:    ; return to shader part epilog
112main_body:
113  %tex = call {half,i32} @llvm.amdgcn.image.sample.2d.f16i32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
114  %tex.vec = extractvalue {half, i32} %tex, 0
115  %tex.err = extractvalue {half, i32} %tex, 1
116  store i32 %tex.err, i32 addrspace(1)* %out, align 4
117  ret half %tex.vec
118}
119
; <2 x half> result of a 1D sample_c_d, returned as a float bitcast.
; With -mcpu=tonga the checks expect two destination registers (v[0:1]) that
; are merged into one dword by v_lshlrev_b32 + v_or_b32_sdwa; the other three
; targets expect a single destination register (v0) and no merge.
; NOTE(review): presumably this is the unpacked vs. packed d16 layout - confirm.
; Unlike the sample and sample_b tests in this file, no s_wqm sequence is
; expected here.
120define amdgpu_ps float @image_sample_c_d_1d_v2f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) {
121; TONGA-LABEL: image_sample_c_d_1d_v2f16:
122; TONGA:       ; %bb.0: ; %main_body
123; TONGA-NEXT:    image_sample_c_d v[0:1], v[0:3], s[0:7], s[8:11] dmask:0x3 d16
124; TONGA-NEXT:    s_waitcnt vmcnt(0)
125; TONGA-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
126; TONGA-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
127; TONGA-NEXT:    ; return to shader part epilog
128;
129; GFX81-LABEL: image_sample_c_d_1d_v2f16:
130; GFX81:       ; %bb.0: ; %main_body
131; GFX81-NEXT:    image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 d16
132; GFX81-NEXT:    s_waitcnt vmcnt(0)
133; GFX81-NEXT:    ; return to shader part epilog
134;
135; GFX9-LABEL: image_sample_c_d_1d_v2f16:
136; GFX9:       ; %bb.0: ; %main_body
137; GFX9-NEXT:    image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 d16
138; GFX9-NEXT:    s_waitcnt vmcnt(0)
139; GFX9-NEXT:    ; return to shader part epilog
140;
141; GFX10-LABEL: image_sample_c_d_1d_v2f16:
142; GFX10:       ; %bb.0: ; %main_body
143; GFX10-NEXT:    image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D d16
144; GFX10-NEXT:    s_waitcnt vmcnt(0)
145; GFX10-NEXT:    ; return to shader part epilog
146main_body:
147  %tex = call <2 x half> @llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
148  %r = bitcast <2 x half> %tex to float
149  ret float %r
150}
151
; TFE variant of the 1D sample_c_d <2 x half> test. Expected destination sizes
; are three registers with -mcpu=tonga (v[4:6]) and two on the other targets,
; all written with 0 before the sample. With -mcpu=gfx1010 the address operands
; are expected as a bracketed register list ([v5, v4, v2, v3]) and the result
; lands directly in v[0:1], so no copies follow the wait.
; The returned <2 x float> carries the bitcast texel data in lane 0 and the
; bitcast i32 from result element 1 in lane 1.
152define amdgpu_ps <2 x float> @image_sample_c_d_1d_v2f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) {
153; TONGA-LABEL: image_sample_c_d_1d_v2f16_tfe:
154; TONGA:       ; %bb.0: ; %main_body
155; TONGA-NEXT:    v_mov_b32_e32 v4, 0
156; TONGA-NEXT:    v_mov_b32_e32 v5, v4
157; TONGA-NEXT:    v_mov_b32_e32 v6, v4
158; TONGA-NEXT:    image_sample_c_d v[4:6], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16
159; TONGA-NEXT:    s_waitcnt vmcnt(0)
160; TONGA-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
161; TONGA-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
162; TONGA-NEXT:    v_mov_b32_e32 v1, v6
163; TONGA-NEXT:    ; return to shader part epilog
164;
165; GFX81-LABEL: image_sample_c_d_1d_v2f16_tfe:
166; GFX81:       ; %bb.0: ; %main_body
167; GFX81-NEXT:    v_mov_b32_e32 v4, 0
168; GFX81-NEXT:    v_mov_b32_e32 v5, v4
169; GFX81-NEXT:    image_sample_c_d v[4:5], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16
170; GFX81-NEXT:    s_waitcnt vmcnt(0)
171; GFX81-NEXT:    v_mov_b32_e32 v0, v4
172; GFX81-NEXT:    v_mov_b32_e32 v1, v5
173; GFX81-NEXT:    ; return to shader part epilog
174;
175; GFX9-LABEL: image_sample_c_d_1d_v2f16_tfe:
176; GFX9:       ; %bb.0: ; %main_body
177; GFX9-NEXT:    v_mov_b32_e32 v4, 0
178; GFX9-NEXT:    v_mov_b32_e32 v5, v4
179; GFX9-NEXT:    image_sample_c_d v[4:5], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16
180; GFX9-NEXT:    s_waitcnt vmcnt(0)
181; GFX9-NEXT:    v_mov_b32_e32 v0, v4
182; GFX9-NEXT:    v_mov_b32_e32 v1, v5
183; GFX9-NEXT:    ; return to shader part epilog
184;
185; GFX10-LABEL: image_sample_c_d_1d_v2f16_tfe:
186; GFX10:       ; %bb.0: ; %main_body
187; GFX10-NEXT:    v_mov_b32_e32 v5, v0
188; GFX10-NEXT:    v_mov_b32_e32 v0, 0
189; GFX10-NEXT:    v_mov_b32_e32 v4, v1
190; GFX10-NEXT:    v_mov_b32_e32 v1, v0
191; GFX10-NEXT:    image_sample_c_d v[0:1], [v5, v4, v2, v3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe d16
192; GFX10-NEXT:    s_waitcnt vmcnt(0)
193; GFX10-NEXT:    ; return to shader part epilog
194main_body:
195  %tex = call {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
196  %tex.vec = extractvalue {<2 x half>, i32} %tex, 0
197  %tex.err = extractvalue {<2 x half>, i32} %tex, 1
198  %tex.vecf = bitcast <2 x half> %tex.vec to float
199  %r.0 = insertelement <2 x float> undef, float %tex.vecf, i32 0
200  %tex.errf = bitcast i32 %tex.err to float
201  %r = insertelement <2 x float> %r.0, float %tex.errf, i32 1
202  ret <2 x float> %r
203}
204
; <3 x half> result of a 2D sample_b with first call operand i32 7 (dmask:0x7
; in the expected output). With -mcpu=tonga the checks expect three destination
; registers (v[0:2]) and a v_lshlrev_b32 + v_or_b32_sdwa merge of the first
; two; the other targets expect two destination registers (v[0:1]) and no
; post-processing.
205define amdgpu_ps <2 x float> @image_sample_b_2d_v3f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
206; TONGA-LABEL: image_sample_b_2d_v3f16:
207; TONGA:       ; %bb.0: ; %main_body
208; TONGA-NEXT:    s_mov_b64 s[12:13], exec
209; TONGA-NEXT:    s_wqm_b64 exec, exec
210; TONGA-NEXT:    s_and_b64 exec, exec, s[12:13]
211; TONGA-NEXT:    image_sample_b v[0:2], v[0:2], s[0:7], s[8:11] dmask:0x7 d16
212; TONGA-NEXT:    s_waitcnt vmcnt(0)
213; TONGA-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
214; TONGA-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
215; TONGA-NEXT:    v_mov_b32_e32 v1, v2
216; TONGA-NEXT:    ; return to shader part epilog
217;
218; GFX81-LABEL: image_sample_b_2d_v3f16:
219; GFX81:       ; %bb.0: ; %main_body
220; GFX81-NEXT:    s_mov_b64 s[12:13], exec
221; GFX81-NEXT:    s_wqm_b64 exec, exec
222; GFX81-NEXT:    s_and_b64 exec, exec, s[12:13]
223; GFX81-NEXT:    image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x7 d16
224; GFX81-NEXT:    s_waitcnt vmcnt(0)
225; GFX81-NEXT:    ; return to shader part epilog
226;
227; GFX9-LABEL: image_sample_b_2d_v3f16:
228; GFX9:       ; %bb.0: ; %main_body
229; GFX9-NEXT:    s_mov_b64 s[12:13], exec
230; GFX9-NEXT:    s_wqm_b64 exec, exec
231; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
232; GFX9-NEXT:    image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x7 d16
233; GFX9-NEXT:    s_waitcnt vmcnt(0)
234; GFX9-NEXT:    ; return to shader part epilog
235;
236; GFX10-LABEL: image_sample_b_2d_v3f16:
237; GFX10:       ; %bb.0: ; %main_body
238; GFX10-NEXT:    s_mov_b32 s12, exec_lo
239; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
240; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s12
241; GFX10-NEXT:    image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D d16
242; GFX10-NEXT:    s_waitcnt vmcnt(0)
243; GFX10-NEXT:    ; return to shader part epilog
244main_body:
245  %tex = call <3 x half> @llvm.amdgcn.image.sample.b.2d.v3f16.f32.f32(i32 7, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
; Widen to <4 x half> so the value can be bitcast to <2 x float>; mask index 3
; selects from the undef second operand.
246  %tex_wide = shufflevector <3 x half> %tex, <3 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
247  %r = bitcast <4 x half> %tex_wide to <2 x float>
248  ret <2 x float> %r
249}
250
; TFE variant of the 2D sample_b <3 x half> test. Expected destination sizes
; are four registers with -mcpu=tonga (v[3:6]) and three on the other targets
; (v[3:5], or v[0:2] with -mcpu=gfx1010), all written with 0 before the sample.
; The returned <4 x float> holds the bitcast texel data in lanes 0-1 and the
; bitcast i32 from result element 1 in lane 2; lane 3 is left undef.
251define amdgpu_ps <4 x float> @image_sample_b_2d_v3f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
252; TONGA-LABEL: image_sample_b_2d_v3f16_tfe:
253; TONGA:       ; %bb.0: ; %main_body
254; TONGA-NEXT:    s_mov_b64 s[12:13], exec
255; TONGA-NEXT:    s_wqm_b64 exec, exec
256; TONGA-NEXT:    v_mov_b32_e32 v3, 0
257; TONGA-NEXT:    v_mov_b32_e32 v4, v3
258; TONGA-NEXT:    v_mov_b32_e32 v5, v3
259; TONGA-NEXT:    v_mov_b32_e32 v6, v3
260; TONGA-NEXT:    s_and_b64 exec, exec, s[12:13]
261; TONGA-NEXT:    image_sample_b v[3:6], v[0:2], s[0:7], s[8:11] dmask:0x7 tfe d16
262; TONGA-NEXT:    s_waitcnt vmcnt(0)
263; TONGA-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
264; TONGA-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
265; TONGA-NEXT:    v_mov_b32_e32 v1, v5
266; TONGA-NEXT:    v_mov_b32_e32 v2, v6
267; TONGA-NEXT:    ; return to shader part epilog
268;
269; GFX81-LABEL: image_sample_b_2d_v3f16_tfe:
270; GFX81:       ; %bb.0: ; %main_body
271; GFX81-NEXT:    s_mov_b64 s[12:13], exec
272; GFX81-NEXT:    s_wqm_b64 exec, exec
273; GFX81-NEXT:    v_mov_b32_e32 v3, 0
274; GFX81-NEXT:    v_mov_b32_e32 v4, v3
275; GFX81-NEXT:    v_mov_b32_e32 v5, v3
276; GFX81-NEXT:    s_and_b64 exec, exec, s[12:13]
277; GFX81-NEXT:    image_sample_b v[3:5], v[0:2], s[0:7], s[8:11] dmask:0x7 tfe d16
278; GFX81-NEXT:    s_waitcnt vmcnt(0)
279; GFX81-NEXT:    v_mov_b32_e32 v0, v3
280; GFX81-NEXT:    v_mov_b32_e32 v1, v4
281; GFX81-NEXT:    v_mov_b32_e32 v2, v5
282; GFX81-NEXT:    ; return to shader part epilog
283;
284; GFX9-LABEL: image_sample_b_2d_v3f16_tfe:
285; GFX9:       ; %bb.0: ; %main_body
286; GFX9-NEXT:    s_mov_b64 s[12:13], exec
287; GFX9-NEXT:    s_wqm_b64 exec, exec
288; GFX9-NEXT:    v_mov_b32_e32 v3, 0
289; GFX9-NEXT:    v_mov_b32_e32 v4, v3
290; GFX9-NEXT:    v_mov_b32_e32 v5, v3
291; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
292; GFX9-NEXT:    image_sample_b v[3:5], v[0:2], s[0:7], s[8:11] dmask:0x7 tfe d16
293; GFX9-NEXT:    s_waitcnt vmcnt(0)
294; GFX9-NEXT:    v_mov_b32_e32 v0, v3
295; GFX9-NEXT:    v_mov_b32_e32 v1, v4
296; GFX9-NEXT:    v_mov_b32_e32 v2, v5
297; GFX9-NEXT:    ; return to shader part epilog
298;
299; GFX10-LABEL: image_sample_b_2d_v3f16_tfe:
300; GFX10:       ; %bb.0: ; %main_body
301; GFX10-NEXT:    s_mov_b32 s12, exec_lo
302; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
303; GFX10-NEXT:    v_mov_b32_e32 v3, v0
304; GFX10-NEXT:    v_mov_b32_e32 v0, 0
305; GFX10-NEXT:    v_mov_b32_e32 v5, v2
306; GFX10-NEXT:    v_mov_b32_e32 v4, v1
307; GFX10-NEXT:    v_mov_b32_e32 v1, v0
308; GFX10-NEXT:    v_mov_b32_e32 v2, v0
309; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s12
310; GFX10-NEXT:    image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D tfe d16
311; GFX10-NEXT:    s_waitcnt vmcnt(0)
312; GFX10-NEXT:    ; return to shader part epilog
313main_body:
314  %tex = call {<3 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v3f16i32.f32.f32(i32 7, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
315  %tex.vec = extractvalue {<3 x half>, i32} %tex, 0
; Widen to <4 x half> so the value can be bitcast to <2 x float>; mask index 3
; selects from the undef second operand.
316  %tex.vec_wide = shufflevector <3 x half> %tex.vec, <3 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
317  %tex.err = extractvalue {<3 x half>, i32} %tex, 1
318  %tex.vecf = bitcast <4 x half> %tex.vec_wide to <2 x float>
319  %tex.vecf.0 = extractelement <2 x float> %tex.vecf, i32 0
320  %tex.vecf.1 = extractelement <2 x float> %tex.vecf, i32 1
321  %r.0 = insertelement <4 x float> undef, float %tex.vecf.0, i32 0
322  %r.1 = insertelement <4 x float> %r.0, float %tex.vecf.1, i32 1
323  %tex.errf = bitcast i32 %tex.err to float
324  %r = insertelement <4 x float> %r.1, float %tex.errf, i32 2
325  ret <4 x float> %r
326}
327
; <4 x half> result of a 2D sample_b with first call operand i32 15 (dmask:0xf
; in the expected output), returned as a <2 x float> bitcast. With -mcpu=tonga
; the checks expect four destination registers (v[0:3]) merged pairwise into
; two dwords by v_lshlrev_b32 + v_or_b32_sdwa; the other targets expect two
; destination registers (v[0:1]) and no merge.
328define amdgpu_ps <2 x float> @image_sample_b_2d_v4f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
329; TONGA-LABEL: image_sample_b_2d_v4f16:
330; TONGA:       ; %bb.0: ; %main_body
331; TONGA-NEXT:    s_mov_b64 s[12:13], exec
332; TONGA-NEXT:    s_wqm_b64 exec, exec
333; TONGA-NEXT:    s_and_b64 exec, exec, s[12:13]
334; TONGA-NEXT:    image_sample_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf d16
335; TONGA-NEXT:    s_waitcnt vmcnt(0)
336; TONGA-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
337; TONGA-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
338; TONGA-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
339; TONGA-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
340; TONGA-NEXT:    ; return to shader part epilog
341;
342; GFX81-LABEL: image_sample_b_2d_v4f16:
343; GFX81:       ; %bb.0: ; %main_body
344; GFX81-NEXT:    s_mov_b64 s[12:13], exec
345; GFX81-NEXT:    s_wqm_b64 exec, exec
346; GFX81-NEXT:    s_and_b64 exec, exec, s[12:13]
347; GFX81-NEXT:    image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0xf d16
348; GFX81-NEXT:    s_waitcnt vmcnt(0)
349; GFX81-NEXT:    ; return to shader part epilog
350;
351; GFX9-LABEL: image_sample_b_2d_v4f16:
352; GFX9:       ; %bb.0: ; %main_body
353; GFX9-NEXT:    s_mov_b64 s[12:13], exec
354; GFX9-NEXT:    s_wqm_b64 exec, exec
355; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
356; GFX9-NEXT:    image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0xf d16
357; GFX9-NEXT:    s_waitcnt vmcnt(0)
358; GFX9-NEXT:    ; return to shader part epilog
359;
360; GFX10-LABEL: image_sample_b_2d_v4f16:
361; GFX10:       ; %bb.0: ; %main_body
362; GFX10-NEXT:    s_mov_b32 s12, exec_lo
363; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
364; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s12
365; GFX10-NEXT:    image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D d16
366; GFX10-NEXT:    s_waitcnt vmcnt(0)
367; GFX10-NEXT:    ; return to shader part epilog
368main_body:
369  %tex = call <4 x half> @llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
370  %r = bitcast <4 x half> %tex to <2 x float>
371  ret <2 x float> %r
372}
373
; TFE variant of the 2D sample_b <4 x half> test. Expected destination sizes
; are five registers with -mcpu=tonga (v[3:7]) and three on the other targets
; (v[3:5], or v[0:2] with -mcpu=gfx1010), all written with 0 before the sample.
; The returned <4 x float> holds the bitcast texel data in lanes 0-1 and the
; bitcast i32 from result element 1 in lane 2; lane 3 is left undef.
374define amdgpu_ps <4 x float> @image_sample_b_2d_v4f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
375; TONGA-LABEL: image_sample_b_2d_v4f16_tfe:
376; TONGA:       ; %bb.0: ; %main_body
377; TONGA-NEXT:    s_mov_b64 s[12:13], exec
378; TONGA-NEXT:    s_wqm_b64 exec, exec
379; TONGA-NEXT:    v_mov_b32_e32 v3, 0
380; TONGA-NEXT:    v_mov_b32_e32 v4, v3
381; TONGA-NEXT:    v_mov_b32_e32 v5, v3
382; TONGA-NEXT:    v_mov_b32_e32 v6, v3
383; TONGA-NEXT:    v_mov_b32_e32 v7, v3
384; TONGA-NEXT:    s_and_b64 exec, exec, s[12:13]
385; TONGA-NEXT:    image_sample_b v[3:7], v[0:2], s[0:7], s[8:11] dmask:0xf tfe d16
386; TONGA-NEXT:    s_waitcnt vmcnt(0)
387; TONGA-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
388; TONGA-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
389; TONGA-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
390; TONGA-NEXT:    v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
391; TONGA-NEXT:    v_mov_b32_e32 v2, v7
392; TONGA-NEXT:    ; return to shader part epilog
393;
394; GFX81-LABEL: image_sample_b_2d_v4f16_tfe:
395; GFX81:       ; %bb.0: ; %main_body
396; GFX81-NEXT:    s_mov_b64 s[12:13], exec
397; GFX81-NEXT:    s_wqm_b64 exec, exec
398; GFX81-NEXT:    v_mov_b32_e32 v3, 0
399; GFX81-NEXT:    v_mov_b32_e32 v4, v3
400; GFX81-NEXT:    v_mov_b32_e32 v5, v3
401; GFX81-NEXT:    s_and_b64 exec, exec, s[12:13]
402; GFX81-NEXT:    image_sample_b v[3:5], v[0:2], s[0:7], s[8:11] dmask:0xf tfe d16
403; GFX81-NEXT:    s_waitcnt vmcnt(0)
404; GFX81-NEXT:    v_mov_b32_e32 v0, v3
405; GFX81-NEXT:    v_mov_b32_e32 v1, v4
406; GFX81-NEXT:    v_mov_b32_e32 v2, v5
407; GFX81-NEXT:    ; return to shader part epilog
408;
409; GFX9-LABEL: image_sample_b_2d_v4f16_tfe:
410; GFX9:       ; %bb.0: ; %main_body
411; GFX9-NEXT:    s_mov_b64 s[12:13], exec
412; GFX9-NEXT:    s_wqm_b64 exec, exec
413; GFX9-NEXT:    v_mov_b32_e32 v3, 0
414; GFX9-NEXT:    v_mov_b32_e32 v4, v3
415; GFX9-NEXT:    v_mov_b32_e32 v5, v3
416; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
417; GFX9-NEXT:    image_sample_b v[3:5], v[0:2], s[0:7], s[8:11] dmask:0xf tfe d16
418; GFX9-NEXT:    s_waitcnt vmcnt(0)
419; GFX9-NEXT:    v_mov_b32_e32 v0, v3
420; GFX9-NEXT:    v_mov_b32_e32 v1, v4
421; GFX9-NEXT:    v_mov_b32_e32 v2, v5
422; GFX9-NEXT:    ; return to shader part epilog
423;
424; GFX10-LABEL: image_sample_b_2d_v4f16_tfe:
425; GFX10:       ; %bb.0: ; %main_body
426; GFX10-NEXT:    s_mov_b32 s12, exec_lo
427; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
428; GFX10-NEXT:    v_mov_b32_e32 v3, v0
429; GFX10-NEXT:    v_mov_b32_e32 v0, 0
430; GFX10-NEXT:    v_mov_b32_e32 v5, v2
431; GFX10-NEXT:    v_mov_b32_e32 v4, v1
432; GFX10-NEXT:    v_mov_b32_e32 v1, v0
433; GFX10-NEXT:    v_mov_b32_e32 v2, v0
434; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s12
435; GFX10-NEXT:    image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D tfe d16
436; GFX10-NEXT:    s_waitcnt vmcnt(0)
437; GFX10-NEXT:    ; return to shader part epilog
438main_body:
439  %tex = call {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
440  %tex.vec = extractvalue {<4 x half>, i32} %tex, 0
441  %tex.err = extractvalue {<4 x half>, i32} %tex, 1
442  %tex.vecf = bitcast <4 x half> %tex.vec to <2 x float>
443  %tex.vecf.0 = extractelement <2 x float> %tex.vecf, i32 0
444  %tex.vecf.1 = extractelement <2 x float> %tex.vecf, i32 1
445  %r.0 = insertelement <4 x float> undef, float %tex.vecf.0, i32 0
446  %r.1 = insertelement <4 x float> %r.0, float %tex.vecf.1, i32 1
447  %tex.errf = bitcast i32 %tex.err to float
448  %r = insertelement <4 x float> %r.1, float %tex.errf, i32 2
449  ret <4 x float> %r
450}
451
; Declarations of the image sample intrinsics; every one is tagged with
; attribute group #1.
; NOTE(review): the 2d.v3f16.f32, 2d.v4f16.f32 and 2d.v2f16i32.f32 declarations
; are not called by any function visible in this file - confirm whether they
; are still needed.
452declare half @llvm.amdgcn.image.sample.2d.f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
453declare {half,i32} @llvm.amdgcn.image.sample.2d.f16i32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
454declare <3 x half> @llvm.amdgcn.image.sample.2d.v3f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
455declare <4 x half> @llvm.amdgcn.image.sample.2d.v4f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
456declare {<2 x half>,i32} @llvm.amdgcn.image.sample.2d.v2f16i32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
457declare <2 x half> @llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
458declare {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
459declare <3 x half> @llvm.amdgcn.image.sample.b.2d.v3f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
460declare {<3 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v3f16i32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
461declare <4 x half> @llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
462declare {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
463
; Attribute groups. #1 is the group applied to the intrinsic declarations in
; this file.
; NOTE(review): #0 and #2 have no users in the visible source - confirm before
; removing them.
464attributes #0 = { nounwind }
465attributes #1 = { nounwind readonly }
466attributes #2 = { nounwind readnone }
467