1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
3; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
4; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
5; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
6; FIXME: Promotion of f16 is not handled for targets without f16 instructions.
7
; Scalar half constrained fsub with round.tonearest and fpexcept.strict.
; All four RUN configurations are expected to select a single v_sub_f16_e32
; on v0 and v1.
8define half @v_constained_fsub_f16_fpexcept_strict(half %x, half %y) #0 {
9; GCN-LABEL: v_constained_fsub_f16_fpexcept_strict:
10; GCN:       ; %bb.0:
11; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12; GCN-NEXT:    v_sub_f16_e32 v0, v0, v1
13; GCN-NEXT:    s_setpc_b64 s[30:31]
14;
15; GFX10PLUS-LABEL: v_constained_fsub_f16_fpexcept_strict:
16; GFX10PLUS:       ; %bb.0:
17; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
19; GFX10PLUS-NEXT:    v_sub_f16_e32 v0, v0, v1
20; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
21  %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
22  ret half %val
23}
24
; Scalar half constrained fsub with round.tonearest and fpexcept.ignore.
; The expected output is the same single v_sub_f16_e32 as the fpexcept.strict
; scalar test.
25define half @v_constained_fsub_f16_fpexcept_ignore(half %x, half %y) #0 {
26; GCN-LABEL: v_constained_fsub_f16_fpexcept_ignore:
27; GCN:       ; %bb.0:
28; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29; GCN-NEXT:    v_sub_f16_e32 v0, v0, v1
30; GCN-NEXT:    s_setpc_b64 s[30:31]
31;
32; GFX10PLUS-LABEL: v_constained_fsub_f16_fpexcept_ignore:
33; GFX10PLUS:       ; %bb.0:
34; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
36; GFX10PLUS-NEXT:    v_sub_f16_e32 v0, v0, v1
37; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
38  %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
39  ret half %val
40}
41
; Scalar half constrained fsub with round.tonearest and fpexcept.maytrap.
; The expected output is the same single v_sub_f16_e32 as the fpexcept.strict
; scalar test.
42define half @v_constained_fsub_f16_fpexcept_maytrap(half %x, half %y) #0 {
43; GCN-LABEL: v_constained_fsub_f16_fpexcept_maytrap:
44; GCN:       ; %bb.0:
45; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
46; GCN-NEXT:    v_sub_f16_e32 v0, v0, v1
47; GCN-NEXT:    s_setpc_b64 s[30:31]
48;
49; GFX10PLUS-LABEL: v_constained_fsub_f16_fpexcept_maytrap:
50; GFX10PLUS:       ; %bb.0:
51; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
52; GFX10PLUS-NEXT:    s_waitcnt_vscnt null, 0x0
53; GFX10PLUS-NEXT:    v_sub_f16_e32 v0, v0, v1
54; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
55  %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
56  ret half %val
57}
58
; <2 x half> constrained fsub with round.tonearest and fpexcept.strict.
; The checks expect two separate f16 subtractions (low and high halves of
; v0 and v1) whose results are repacked into v0; no packed subtract appears
; in the expected output of any RUN configuration.
59define <2 x half> @v_constained_fsub_v2f16_fpexcept_strict(<2 x half> %x, <2 x half> %y) #0 {
60; GFX9-LABEL: v_constained_fsub_v2f16_fpexcept_strict:
61; GFX9:       ; %bb.0:
62; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
63; GFX9-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
64; GFX9-NEXT:    v_sub_f16_e32 v0, v0, v1
65; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
66; GFX9-NEXT:    s_setpc_b64 s[30:31]
67;
68; GFX8-LABEL: v_constained_fsub_v2f16_fpexcept_strict:
69; GFX8:       ; %bb.0:
70; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
71; GFX8-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
72; GFX8-NEXT:    v_sub_f16_e32 v0, v0, v1
73; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
74; GFX8-NEXT:    s_setpc_b64 s[30:31]
75;
76; GFX10-LABEL: v_constained_fsub_v2f16_fpexcept_strict:
77; GFX10:       ; %bb.0:
78; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
79; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
80; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v1
81; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
82; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v2
83; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
84; GFX10-NEXT:    s_setpc_b64 s[30:31]
85;
86; GFX11-LABEL: v_constained_fsub_v2f16_fpexcept_strict:
87; GFX11:       ; %bb.0:
88; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
89; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
90; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
91; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
92; GFX11-NEXT:    v_sub_f16_e32 v0, v0, v1
93; GFX11-NEXT:    v_sub_f16_e32 v1, v3, v2
94; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
95; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
96; GFX11-NEXT:    s_setpc_b64 s[30:31]
97  %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
98  ret <2 x half> %val
99}
100
; <2 x half> constrained fsub with round.tonearest and fpexcept.ignore.
; The expected instruction sequences match the fpexcept.strict v2f16 test:
; two separate f16 subtractions repacked into v0.
101define <2 x half> @v_constained_fsub_v2f16_fpexcept_ignore(<2 x half> %x, <2 x half> %y) #0 {
102; GFX9-LABEL: v_constained_fsub_v2f16_fpexcept_ignore:
103; GFX9:       ; %bb.0:
104; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
105; GFX9-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
106; GFX9-NEXT:    v_sub_f16_e32 v0, v0, v1
107; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
108; GFX9-NEXT:    s_setpc_b64 s[30:31]
109;
110; GFX8-LABEL: v_constained_fsub_v2f16_fpexcept_ignore:
111; GFX8:       ; %bb.0:
112; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
113; GFX8-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
114; GFX8-NEXT:    v_sub_f16_e32 v0, v0, v1
115; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
116; GFX8-NEXT:    s_setpc_b64 s[30:31]
117;
118; GFX10-LABEL: v_constained_fsub_v2f16_fpexcept_ignore:
119; GFX10:       ; %bb.0:
120; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
121; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
122; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v1
123; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
124; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v2
125; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
126; GFX10-NEXT:    s_setpc_b64 s[30:31]
127;
128; GFX11-LABEL: v_constained_fsub_v2f16_fpexcept_ignore:
129; GFX11:       ; %bb.0:
130; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
131; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
132; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
133; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
134; GFX11-NEXT:    v_sub_f16_e32 v0, v0, v1
135; GFX11-NEXT:    v_sub_f16_e32 v1, v3, v2
136; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
137; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
138; GFX11-NEXT:    s_setpc_b64 s[30:31]
139  %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore")
140  ret <2 x half> %val
141}
142
; <2 x half> constrained fsub with round.tonearest and fpexcept.maytrap.
; The expected instruction sequences match the fpexcept.strict v2f16 test:
; two separate f16 subtractions repacked into v0.
143define <2 x half> @v_constained_fsub_v2f16_fpexcept_maytrap(<2 x half> %x, <2 x half> %y) #0 {
144; GFX9-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap:
145; GFX9:       ; %bb.0:
146; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
147; GFX9-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
148; GFX9-NEXT:    v_sub_f16_e32 v0, v0, v1
149; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
150; GFX9-NEXT:    s_setpc_b64 s[30:31]
151;
152; GFX8-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap:
153; GFX8:       ; %bb.0:
154; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
155; GFX8-NEXT:    v_sub_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
156; GFX8-NEXT:    v_sub_f16_e32 v0, v0, v1
157; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
158; GFX8-NEXT:    s_setpc_b64 s[30:31]
159;
160; GFX10-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap:
161; GFX10:       ; %bb.0:
162; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
163; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
164; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v1
165; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
166; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v2
167; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
168; GFX10-NEXT:    s_setpc_b64 s[30:31]
169;
170; GFX11-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap:
171; GFX11:       ; %bb.0:
172; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
173; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
174; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
175; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
176; GFX11-NEXT:    v_sub_f16_e32 v0, v0, v1
177; GFX11-NEXT:    v_sub_f16_e32 v1, v3, v2
178; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
179; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
180; GFX11-NEXT:    s_setpc_b64 s[30:31]
181  %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap")
182  ret <2 x half> %val
183}
184
; <3 x half> constrained fsub with round.tonearest and fpexcept.strict.
; The checks expect three f16 subtractions: the two halves of v0 and v2 are
; subtracted separately and repacked into v0, and the remaining element is
; computed as v1 - v3.
185define <3 x half> @v_constained_fsub_v3f16_fpexcept_strict(<3 x half> %x, <3 x half> %y) #0 {
186; GFX9-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
187; GFX9:       ; %bb.0:
188; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
189; GFX9-NEXT:    v_sub_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
190; GFX9-NEXT:    v_sub_f16_e32 v0, v0, v2
191; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
192; GFX9-NEXT:    v_sub_f16_e32 v1, v1, v3
193; GFX9-NEXT:    s_setpc_b64 s[30:31]
194;
195; GFX8-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
196; GFX8:       ; %bb.0:
197; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
198; GFX8-NEXT:    v_sub_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
199; GFX8-NEXT:    v_sub_f16_e32 v0, v0, v2
200; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
201; GFX8-NEXT:    v_sub_f16_e32 v1, v1, v3
202; GFX8-NEXT:    s_setpc_b64 s[30:31]
203;
204; GFX10-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
205; GFX10:       ; %bb.0:
206; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
207; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
208; GFX10-NEXT:    v_sub_f16_e32 v4, v0, v2
209; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
210; GFX10-NEXT:    v_sub_f16_e32 v1, v1, v3
211; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v4
212; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
213; GFX10-NEXT:    s_setpc_b64 s[30:31]
214;
215; GFX11-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
216; GFX11:       ; %bb.0:
217; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
218; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
219; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
220; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
221; GFX11-NEXT:    v_sub_f16_e32 v0, v0, v2
222; GFX11-NEXT:    v_sub_f16_e32 v1, v1, v3
223; GFX11-NEXT:    v_sub_f16_e32 v2, v5, v4
224; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
225; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
226; GFX11-NEXT:    s_setpc_b64 s[30:31]
227  %val = call <3 x half> @llvm.experimental.constrained.fsub.v3f16(<3 x half> %x, <3 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
228  ret <3 x half> %val
229}
230
231; FIXME: The <4 x half> operation is scalarized into four f16 subtractions.
; <4 x half> constrained fsub with round.tonearest and fpexcept.strict.
; The checks expect four f16 subtractions (low and high halves of v0 - v2
; and of v1 - v3), with the results repacked pairwise into v0 and v1.
232define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x half> %y) #0 {
233; GFX9-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
234; GFX9:       ; %bb.0:
235; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
236; GFX9-NEXT:    v_sub_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
237; GFX9-NEXT:    v_sub_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
238; GFX9-NEXT:    v_sub_f16_e32 v1, v1, v3
239; GFX9-NEXT:    v_sub_f16_e32 v0, v0, v2
240; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
241; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
242; GFX9-NEXT:    s_setpc_b64 s[30:31]
243;
244; GFX8-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
245; GFX8:       ; %bb.0:
246; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
247; GFX8-NEXT:    v_sub_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
248; GFX8-NEXT:    v_sub_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
249; GFX8-NEXT:    v_sub_f16_e32 v1, v1, v3
250; GFX8-NEXT:    v_sub_f16_e32 v0, v0, v2
251; GFX8-NEXT:    v_or_b32_e32 v0, v0, v5
252; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
253; GFX8-NEXT:    s_setpc_b64 s[30:31]
254;
255; GFX10-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
256; GFX10:       ; %bb.0:
257; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
258; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
259; GFX10-NEXT:    v_sub_f16_e32 v4, v0, v2
260; GFX10-NEXT:    v_sub_f16_e32 v5, v1, v3
261; GFX10-NEXT:    v_sub_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
262; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
263; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v4
264; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v5
265; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
266; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
267; GFX10-NEXT:    s_setpc_b64 s[30:31]
268;
269; GFX11-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
270; GFX11:       ; %bb.0:
271; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
272; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
273; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
274; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
275; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
276; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
277; GFX11-NEXT:    v_sub_f16_e32 v0, v0, v2
278; GFX11-NEXT:    v_sub_f16_e32 v1, v1, v3
279; GFX11-NEXT:    v_sub_f16_e32 v2, v5, v4
280; GFX11-NEXT:    v_sub_f16_e32 v3, v7, v6
281; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
282; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
283; GFX11-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
284; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
285; GFX11-NEXT:    s_setpc_b64 s[30:31]
286  %val = call <4 x half> @llvm.experimental.constrained.fsub.v4f16(<4 x half> %x, <4 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
287  ret <4 x half> %val
288}
289
; amdgpu_ps variant of the scalar half test with inreg arguments; the checks
; read the operands from s2 and s3. The gfx900 and fiji checks first copy s3
; into a VGPR, while the gfx1010 and gfx1100 checks use v_sub_f16_e64 with
; both SGPR operands directly.
290define amdgpu_ps half @s_constained_fsub_f16_fpexcept_strict(half inreg %x, half inreg %y) #0 {
291; GCN-LABEL: s_constained_fsub_f16_fpexcept_strict:
292; GCN:       ; %bb.0:
293; GCN-NEXT:    v_mov_b32_e32 v0, s3
294; GCN-NEXT:    v_sub_f16_e32 v0, s2, v0
295; GCN-NEXT:    ; return to shader part epilog
296;
297; GFX10PLUS-LABEL: s_constained_fsub_f16_fpexcept_strict:
298; GFX10PLUS:       ; %bb.0:
299; GFX10PLUS-NEXT:    v_sub_f16_e64 v0, s2, s3
300; GFX10PLUS-NEXT:    ; return to shader part epilog
301  %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
302  ret half %val
303}
304
; amdgpu_ps variant of the <2 x half> test with inreg arguments; the checks
; read the operands from s2 and s3. In every expected sequence the high
; halves are extracted with s_lshr_b32, each half is subtracted separately,
; and the two results are repacked into v0.
305define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half> inreg %x, <2 x half> inreg %y) #0 {
306; GFX9-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
307; GFX9:       ; %bb.0:
308; GFX9-NEXT:    s_lshr_b32 s0, s3, 16
309; GFX9-NEXT:    s_lshr_b32 s1, s2, 16
310; GFX9-NEXT:    v_mov_b32_e32 v0, s0
311; GFX9-NEXT:    v_mov_b32_e32 v1, s3
312; GFX9-NEXT:    v_sub_f16_e32 v0, s1, v0
313; GFX9-NEXT:    v_sub_f16_e32 v1, s2, v1
314; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
315; GFX9-NEXT:    ; return to shader part epilog
316;
317; GFX8-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
318; GFX8:       ; %bb.0:
319; GFX8-NEXT:    s_lshr_b32 s0, s3, 16
320; GFX8-NEXT:    s_lshr_b32 s1, s2, 16
321; GFX8-NEXT:    v_mov_b32_e32 v0, s0
322; GFX8-NEXT:    v_mov_b32_e32 v1, s1
323; GFX8-NEXT:    v_sub_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
324; GFX8-NEXT:    v_mov_b32_e32 v1, s3
325; GFX8-NEXT:    v_sub_f16_e32 v1, s2, v1
326; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
327; GFX8-NEXT:    ; return to shader part epilog
328;
329; GFX10PLUS-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
330; GFX10PLUS:       ; %bb.0:
331; GFX10PLUS-NEXT:    v_sub_f16_e64 v0, s2, s3
332; GFX10PLUS-NEXT:    s_lshr_b32 s0, s3, 16
333; GFX10PLUS-NEXT:    s_lshr_b32 s1, s2, 16
334; GFX10PLUS-NEXT:    v_sub_f16_e64 v1, s1, s0
335; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0xffff, v0
336; GFX10PLUS-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
337; GFX10PLUS-NEXT:    ; return to shader part epilog
338  %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
339  ret <2 x half> %val
340}
341
; Overloads of the constrained fsub intrinsic called by the tests in this
; file (half, <2 x half>, <3 x half> and <4 x half>). Each takes the two
; operands followed by rounding-mode and exception-behavior metadata.
342declare half @llvm.experimental.constrained.fsub.f16(half, half, metadata, metadata) #1
343declare <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half>, <2 x half>, metadata, metadata) #1
344declare <3 x half> @llvm.experimental.constrained.fsub.v3f16(<3 x half>, <3 x half>, metadata, metadata) #1
345declare <4 x half> @llvm.experimental.constrained.fsub.v4f16(<4 x half>, <4 x half>, metadata, metadata) #1
346
; Attribute group #0 is attached to the test functions; #1 is attached to the
; intrinsic declarations.
347attributes #0 = { strictfp }
348attributes #1 = { inaccessiblememonly nounwind willreturn }
349