1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=SI %s
3; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
4
; extract_4xi16: branches on an undef condition to %T or %F. Each of those
; blocks does one volatile load of an <8 x i16> through its own addrspace(1)
; pointer (%p0 in %T, %p1 in %F). The phi in %exit merges the two loaded
; vectors, a shufflevector keeps elements 0, 1, 2, 2, and every resulting lane
; is mapped to -32768 when it is signed-greater-than -1 and to -1 otherwise.
;
; The assertions inside the body are autogenerated (see the note on the first
; line of this file); regenerate them with that script instead of hand-editing.
; NOTE(review): presumably a subvector-extraction codegen test, going by the
; function name - confirm against the change that introduced it.
5define <4 x i16> @extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspace(1) * %p1) {
6; SI-LABEL: extract_4xi16:
7; SI:       ; %bb.0:
8; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9; SI-NEXT:    s_cbranch_scc0 .LBB0_2
10; SI-NEXT:  ; %bb.1: ; %F
11; SI-NEXT:    s_mov_b32 s6, 0
12; SI-NEXT:    s_mov_b32 s7, 0xf000
13; SI-NEXT:    s_mov_b32 s4, s6
14; SI-NEXT:    s_mov_b32 s5, s6
15; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
16; SI-NEXT:    s_waitcnt vmcnt(0)
17; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
18; SI-NEXT:    s_waitcnt vmcnt(0)
19; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
20; SI-NEXT:    s_waitcnt vmcnt(0)
21; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
22; SI-NEXT:    s_waitcnt vmcnt(0)
23; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
24; SI-NEXT:    s_waitcnt vmcnt(0)
25; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
26; SI-NEXT:    s_waitcnt vmcnt(0)
27; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
28; SI-NEXT:    s_waitcnt vmcnt(0)
29; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
30; SI-NEXT:    s_waitcnt vmcnt(0)
31; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
32; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
33; SI-NEXT:    v_or_b32_e32 v2, v6, v2
34; SI-NEXT:    v_or_b32_e32 v3, v4, v3
35; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
36; SI-NEXT:    s_mov_b64 vcc, exec
37; SI-NEXT:    s_cbranch_execz .LBB0_3
38; SI-NEXT:    s_branch .LBB0_4
39; SI-NEXT:  .LBB0_2:
40; SI-NEXT:    ; implicit-def: $vgpr3
41; SI-NEXT:    ; implicit-def: $vgpr4
42; SI-NEXT:    ; implicit-def: $vgpr2
43; SI-NEXT:    s_mov_b64 vcc, 0
44; SI-NEXT:  .LBB0_3: ; %T
45; SI-NEXT:    s_mov_b32 s6, 0
46; SI-NEXT:    s_mov_b32 s7, 0xf000
47; SI-NEXT:    s_mov_b32 s4, s6
48; SI-NEXT:    s_mov_b32 s5, s6
49; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
50; SI-NEXT:    s_waitcnt vmcnt(0)
51; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
52; SI-NEXT:    s_waitcnt vmcnt(0)
53; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
54; SI-NEXT:    s_waitcnt vmcnt(0)
55; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
56; SI-NEXT:    s_waitcnt vmcnt(0)
57; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
58; SI-NEXT:    s_waitcnt vmcnt(0)
59; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
60; SI-NEXT:    s_waitcnt vmcnt(0)
61; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
62; SI-NEXT:    s_waitcnt vmcnt(0)
63; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
64; SI-NEXT:    s_waitcnt vmcnt(0)
65; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
66; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
67; SI-NEXT:    v_or_b32_e32 v2, v4, v0
68; SI-NEXT:    v_or_b32_e32 v3, v3, v1
69; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
70; SI-NEXT:  .LBB0_4: ; %exit
71; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
72; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
73; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
74; SI-NEXT:    s_mov_b32 s4, 0xffff
75; SI-NEXT:    v_mov_b32_e32 v3, 0x8000
76; SI-NEXT:    v_mov_b32_e32 v4, 0xffff0000
77; SI-NEXT:    v_bfrev_b32_e32 v5, 1
78; SI-NEXT:    v_mov_b32_e32 v6, 0xffff8000
79; SI-NEXT:    v_mov_b32_e32 v7, s4
80; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
81; SI-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
82; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
83; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc
84; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
85; SI-NEXT:    v_cndmask_b32_e32 v2, -1, v6, vcc
86; SI-NEXT:    v_or_b32_e32 v0, v0, v1
87; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
88; SI-NEXT:    v_and_b32_e32 v2, s4, v2
89; SI-NEXT:    v_or_b32_e32 v2, v2, v3
90; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
91; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
92; SI-NEXT:    s_setpc_b64 s[30:31]
93;
94; GFX9-LABEL: extract_4xi16:
95; GFX9:       ; %bb.0:
96; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
97; GFX9-NEXT:    s_cbranch_scc0 .LBB0_2
98; GFX9-NEXT:  ; %bb.1: ; %F
99; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
100; GFX9-NEXT:    s_waitcnt vmcnt(0)
101; GFX9-NEXT:    s_cbranch_execz .LBB0_3
102; GFX9-NEXT:    s_branch .LBB0_4
103; GFX9-NEXT:  .LBB0_2:
104; GFX9-NEXT:    s_mov_b32 s8, 0
105; GFX9-NEXT:    s_mov_b32 s9, s8
106; GFX9-NEXT:    s_mov_b32 s10, s8
107; GFX9-NEXT:    s_mov_b32 s11, s8
108; GFX9-NEXT:    v_mov_b32_e32 v2, s8
109; GFX9-NEXT:    v_mov_b32_e32 v3, s9
110; GFX9-NEXT:    v_mov_b32_e32 v4, s10
111; GFX9-NEXT:    v_mov_b32_e32 v5, s11
112; GFX9-NEXT:  .LBB0_3: ; %T
113; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
114; GFX9-NEXT:    s_waitcnt vmcnt(0)
115; GFX9-NEXT:  .LBB0_4: ; %exit
116; GFX9-NEXT:    s_waitcnt vmcnt(0)
117; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0]
118; GFX9-NEXT:    s_movk_i32 s4, 0x8000
119; GFX9-NEXT:    v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
120; GFX9-NEXT:    v_or_b32_e32 v3, s4, v0
121; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
122; GFX9-NEXT:    v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
123; GFX9-NEXT:    v_or_b32_e32 v0, s4, v0
124; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
125; GFX9-NEXT:    v_and_b32_e32 v0, v4, v0
126; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
127; GFX9-NEXT:    v_and_b32_e32 v2, v4, v3
128; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
129; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; The branch condition is undef; the expected output above still contains
  ; code for both the %T and the %F block.
130  br i1 undef, label %T, label %F
131
132T:
  ; Volatile load of the whole 8-element vector through %p0.
133  %t = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p0
134  br label %exit
135
136F:
  ; Volatile load of the whole 8-element vector through %p1.
137  %f = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p1
138  br label %exit
139
140exit:
141  %m = phi <8 x i16> [ %t, %T ], [ %f, %F ]
  ; Mask <0, 1, 2, 2>: the three lowest elements, with element 2 repeated in
  ; the last lane; elements 3 through 7 of %m are not referenced.
142  %v2 = shufflevector <8 x i16> %m, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ; Per lane: (x > -1, signed) ? -32768 : -1.
143  %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
144  %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
145  ret <4 x i16> %r2
146}
147
; extract_4xi16_2: same control flow as a two-way volatile-load diamond -
; branch on an undef condition to %T or %F, volatile-load an <8 x i16> through
; %p0 (in %T) or %p1 (in %F), and merge the two with a phi in %exit. Here the
; shufflevector keeps elements 4, 5, 6, 7 of the merged vector, and every
; resulting lane is mapped to -32768 when it is signed-greater-than -1 and to
; -1 otherwise.
;
; The assertions inside the body are autogenerated (see the note on the first
; line of this file); regenerate them with that script instead of hand-editing.
148define <4 x i16> @extract_4xi16_2(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspace(1) * %p1) {
149; SI-LABEL: extract_4xi16_2:
150; SI:       ; %bb.0:
151; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
152; SI-NEXT:    s_cbranch_scc0 .LBB1_2
153; SI-NEXT:  ; %bb.1: ; %F
154; SI-NEXT:    s_mov_b32 s6, 0
155; SI-NEXT:    s_mov_b32 s7, 0xf000
156; SI-NEXT:    s_mov_b32 s4, s6
157; SI-NEXT:    s_mov_b32 s5, s6
158; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
159; SI-NEXT:    s_waitcnt vmcnt(0)
160; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
161; SI-NEXT:    s_waitcnt vmcnt(0)
162; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:4 glc
163; SI-NEXT:    s_waitcnt vmcnt(0)
164; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
165; SI-NEXT:    s_waitcnt vmcnt(0)
166; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
167; SI-NEXT:    s_waitcnt vmcnt(0)
168; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
169; SI-NEXT:    s_waitcnt vmcnt(0)
170; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
171; SI-NEXT:    s_waitcnt vmcnt(0)
172; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
173; SI-NEXT:    s_waitcnt vmcnt(0)
174; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
175; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
176; SI-NEXT:    v_or_b32_e32 v2, v6, v2
177; SI-NEXT:    v_or_b32_e32 v3, v4, v3
178; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
179; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
180; SI-NEXT:    s_mov_b64 vcc, exec
181; SI-NEXT:    s_cbranch_execz .LBB1_3
182; SI-NEXT:    s_branch .LBB1_4
183; SI-NEXT:  .LBB1_2:
184; SI-NEXT:    ; implicit-def: $vgpr3
185; SI-NEXT:    ; implicit-def: $vgpr5
186; SI-NEXT:    ; implicit-def: $vgpr2
187; SI-NEXT:    ; implicit-def: $vgpr4
188; SI-NEXT:    s_mov_b64 vcc, 0
189; SI-NEXT:  .LBB1_3: ; %T
190; SI-NEXT:    s_mov_b32 s6, 0
191; SI-NEXT:    s_mov_b32 s7, 0xf000
192; SI-NEXT:    s_mov_b32 s4, s6
193; SI-NEXT:    s_mov_b32 s5, s6
194; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
195; SI-NEXT:    s_waitcnt vmcnt(0)
196; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
197; SI-NEXT:    s_waitcnt vmcnt(0)
198; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
199; SI-NEXT:    s_waitcnt vmcnt(0)
200; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
201; SI-NEXT:    s_waitcnt vmcnt(0)
202; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc
203; SI-NEXT:    s_waitcnt vmcnt(0)
204; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:10 glc
205; SI-NEXT:    s_waitcnt vmcnt(0)
206; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
207; SI-NEXT:    s_waitcnt vmcnt(0)
208; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
209; SI-NEXT:    s_waitcnt vmcnt(0)
210; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
211; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
212; SI-NEXT:    v_or_b32_e32 v2, v4, v0
213; SI-NEXT:    v_or_b32_e32 v3, v3, v1
214; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
215; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
216; SI-NEXT:  .LBB1_4: ; %exit
217; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
218; SI-NEXT:    v_bfe_i32 v1, v5, 0, 16
219; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
220; SI-NEXT:    v_bfe_i32 v3, v4, 0, 16
221; SI-NEXT:    v_mov_b32_e32 v4, 0xffff
222; SI-NEXT:    v_mov_b32_e32 v5, 0x8000
223; SI-NEXT:    v_mov_b32_e32 v6, 0xffff0000
224; SI-NEXT:    v_bfrev_b32_e32 v7, 1
225; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
226; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
227; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
228; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
229; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
230; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
231; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v3
232; SI-NEXT:    v_cndmask_b32_e32 v3, v6, v7, vcc
233; SI-NEXT:    v_or_b32_e32 v0, v0, v1
234; SI-NEXT:    v_or_b32_e32 v2, v2, v3
235; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
236; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
237; SI-NEXT:    s_setpc_b64 s[30:31]
238;
239; GFX9-LABEL: extract_4xi16_2:
240; GFX9:       ; %bb.0:
241; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
242; GFX9-NEXT:    s_cbranch_scc0 .LBB1_2
243; GFX9-NEXT:  ; %bb.1: ; %F
244; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
245; GFX9-NEXT:    s_waitcnt vmcnt(0)
246; GFX9-NEXT:    s_cbranch_execz .LBB1_3
247; GFX9-NEXT:    s_branch .LBB1_4
248; GFX9-NEXT:  .LBB1_2:
249; GFX9-NEXT:    s_mov_b32 s8, 0
250; GFX9-NEXT:    s_mov_b32 s9, s8
251; GFX9-NEXT:    s_mov_b32 s10, s8
252; GFX9-NEXT:    s_mov_b32 s11, s8
253; GFX9-NEXT:    v_mov_b32_e32 v2, s8
254; GFX9-NEXT:    v_mov_b32_e32 v3, s9
255; GFX9-NEXT:    v_mov_b32_e32 v4, s10
256; GFX9-NEXT:    v_mov_b32_e32 v5, s11
257; GFX9-NEXT:  .LBB1_3: ; %T
258; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
259; GFX9-NEXT:    s_waitcnt vmcnt(0)
260; GFX9-NEXT:  .LBB1_4: ; %exit
261; GFX9-NEXT:    s_waitcnt vmcnt(0)
262; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1]
263; GFX9-NEXT:    s_movk_i32 s4, 0x8000
264; GFX9-NEXT:    v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
265; GFX9-NEXT:    v_or_b32_e32 v2, s4, v0
266; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
267; GFX9-NEXT:    v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
268; GFX9-NEXT:    v_or_b32_e32 v0, s4, v0
269; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
270; GFX9-NEXT:    v_and_b32_e32 v0, v4, v0
271; GFX9-NEXT:    v_and_b32_e32 v2, v4, v2
272; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
273; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
274; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; The branch condition is undef; the expected output above still contains
  ; code for both the %T and the %F block.
275  br i1 undef, label %T, label %F
276
277T:
  ; Volatile load of the whole 8-element vector through %p0.
278  %t = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p0
279  br label %exit
280
281F:
  ; Volatile load of the whole 8-element vector through %p1.
282  %f = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p1
283  br label %exit
284
285exit:
286  %m = phi <8 x i16> [ %t, %T ], [ %f, %F ]
  ; Mask <4, 5, 6, 7>: the last four of the eight elements, in order;
  ; elements 0 through 3 of %m are not referenced.
287  %v2 = shufflevector <8 x i16> %m, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ; Per lane: (x > -1, signed) ? -32768 : -1.
288  %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
289  %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
290  ret <4 x i16> %r2
291}
292
; extract_4xf16: half-precision variant. Branches on an undef condition to %T
; or %F, volatile-loads an <8 x half> through %p0 (in %T) or %p1 (in %F), and
; merges the two with a phi in %exit. A shufflevector keeps elements 0, 1, 2, 2
; of the merged vector; every resulting lane is compared with `fcmp ugt`
; against 0xH3800 and becomes 0xH3900 when the compare is true and 0xH3D00
; otherwise.
;
; The assertions inside the body are autogenerated (see the note on the first
; line of this file); regenerate them with that script instead of hand-editing.
293define <4 x half> @extract_4xf16(<8 x half> addrspace(1) * %p0, <8 x half> addrspace(1) * %p1) {
294; SI-LABEL: extract_4xf16:
295; SI:       ; %bb.0:
296; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
297; SI-NEXT:    s_cbranch_scc0 .LBB2_2
298; SI-NEXT:  ; %bb.1: ; %F
299; SI-NEXT:    s_mov_b32 s6, 0
300; SI-NEXT:    s_mov_b32 s7, 0xf000
301; SI-NEXT:    s_mov_b32 s4, s6
302; SI-NEXT:    s_mov_b32 s5, s6
303; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
304; SI-NEXT:    s_waitcnt vmcnt(0)
305; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
306; SI-NEXT:    s_waitcnt vmcnt(0)
307; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
308; SI-NEXT:    s_waitcnt vmcnt(0)
309; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
310; SI-NEXT:    s_waitcnt vmcnt(0)
311; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
312; SI-NEXT:    s_waitcnt vmcnt(0)
313; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
314; SI-NEXT:    s_waitcnt vmcnt(0)
315; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
316; SI-NEXT:    s_waitcnt vmcnt(0)
317; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
318; SI-NEXT:    s_waitcnt vmcnt(0)
319; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
320; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
321; SI-NEXT:    v_or_b32_e32 v2, v6, v2
322; SI-NEXT:    v_or_b32_e32 v4, v4, v3
323; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
324; SI-NEXT:    v_cvt_f32_f16_e32 v3, v4
325; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
326; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
327; SI-NEXT:    s_mov_b64 vcc, exec
328; SI-NEXT:    s_cbranch_execz .LBB2_3
329; SI-NEXT:    s_branch .LBB2_4
330; SI-NEXT:  .LBB2_2:
331; SI-NEXT:    ; implicit-def: $vgpr3
332; SI-NEXT:    ; implicit-def: $vgpr4
333; SI-NEXT:    ; implicit-def: $vgpr2
334; SI-NEXT:    s_mov_b64 vcc, 0
335; SI-NEXT:  .LBB2_3: ; %T
336; SI-NEXT:    s_mov_b32 s6, 0
337; SI-NEXT:    s_mov_b32 s7, 0xf000
338; SI-NEXT:    s_mov_b32 s4, s6
339; SI-NEXT:    s_mov_b32 s5, s6
340; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
341; SI-NEXT:    s_waitcnt vmcnt(0)
342; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
343; SI-NEXT:    s_waitcnt vmcnt(0)
344; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
345; SI-NEXT:    s_waitcnt vmcnt(0)
346; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
347; SI-NEXT:    s_waitcnt vmcnt(0)
348; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
349; SI-NEXT:    s_waitcnt vmcnt(0)
350; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
351; SI-NEXT:    s_waitcnt vmcnt(0)
352; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
353; SI-NEXT:    s_waitcnt vmcnt(0)
354; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
355; SI-NEXT:    s_waitcnt vmcnt(0)
356; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
357; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
358; SI-NEXT:    v_or_b32_e32 v0, v4, v0
359; SI-NEXT:    v_or_b32_e32 v1, v2, v1
360; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
361; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
362; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
363; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
364; SI-NEXT:  .LBB2_4: ; %exit
365; SI-NEXT:    v_cvt_f16_f32_e32 v0, v3
366; SI-NEXT:    v_cvt_f16_f32_e32 v1, v4
367; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
368; SI-NEXT:    v_mov_b32_e32 v3, 0x3fa00000
369; SI-NEXT:    v_mov_b32_e32 v4, 0x3f200000
370; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
371; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
372; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
373; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v0
374; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
375; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v1
376; SI-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
377; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v2
378; SI-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
379; SI-NEXT:    v_mov_b32_e32 v3, v2
380; SI-NEXT:    s_setpc_b64 s[30:31]
381;
382; GFX9-LABEL: extract_4xf16:
383; GFX9:       ; %bb.0:
384; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
385; GFX9-NEXT:    s_cbranch_scc0 .LBB2_2
386; GFX9-NEXT:  ; %bb.1: ; %F
387; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
388; GFX9-NEXT:    s_waitcnt vmcnt(0)
389; GFX9-NEXT:    s_cbranch_execz .LBB2_3
390; GFX9-NEXT:    s_branch .LBB2_4
391; GFX9-NEXT:  .LBB2_2:
392; GFX9-NEXT:    s_mov_b32 s8, 0
393; GFX9-NEXT:    s_mov_b32 s9, s8
394; GFX9-NEXT:    s_mov_b32 s10, s8
395; GFX9-NEXT:    s_mov_b32 s11, s8
396; GFX9-NEXT:    v_mov_b32_e32 v2, s8
397; GFX9-NEXT:    v_mov_b32_e32 v3, s9
398; GFX9-NEXT:    v_mov_b32_e32 v4, s10
399; GFX9-NEXT:    v_mov_b32_e32 v5, s11
400; GFX9-NEXT:  .LBB2_3: ; %T
401; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
402; GFX9-NEXT:    s_waitcnt vmcnt(0)
403; GFX9-NEXT:  .LBB2_4: ; %exit
404; GFX9-NEXT:    s_waitcnt vmcnt(0)
405; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v3
406; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
407; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3800
408; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3900
409; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3d00
410; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v0
411; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v4, vcc
412; GFX9-NEXT:    v_cmp_nle_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
413; GFX9-NEXT:    v_cndmask_b32_e32 v6, v4, v3, vcc
414; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v2
415; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
416; GFX9-NEXT:    v_cmp_le_f16_sdwa vcc, v2, v1 src0_sel:WORD_1 src1_sel:DWORD
417; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
418; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
419; GFX9-NEXT:    v_pack_b32_f16 v1, v5, v6
420; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; The branch condition is undef; the expected output above still contains
  ; code for both the %T and the %F block.
421  br i1 undef, label %T, label %F
422
423T:
  ; Volatile load of the whole 8-element vector through %p0.
424  %t = load volatile <8 x half>, <8 x half> addrspace(1) * %p0
425  br label %exit
426
427F:
  ; Volatile load of the whole 8-element vector through %p1.
428  %f = load volatile <8 x half>, <8 x half> addrspace(1) * %p1
429  br label %exit
430
431exit:
432  %m = phi <8 x half> [ %t, %T ], [ %f, %F ]
  ; Mask <0, 1, 2, 2>: the three lowest elements, with element 2 repeated in
  ; the last lane; elements 3 through 7 of %m are not referenced.
433  %v2 = shufflevector <8 x half> %m, <8 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ; `ugt` is the unordered-or-greater-than predicate. The threshold 0xH3800
  ; is printed as 0.5 in the compare instructions expected above.
434  %b2 = fcmp ugt <4 x half> %v2, <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800>
  ; Per lane: compare true -> 0xH3900, compare false -> 0xH3D00.
435  %r2 = select <4 x i1> %b2, <4 x half> <half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900>, <4 x half> <half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00>
436  ret <4 x half> %r2
437}
438