1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=SI %s
3; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
4
; Extract the low lanes of an <8 x i16> that reaches the use through a phi.
;
; %p0 and %p1 point to <8 x i16> values in addrspace(1). One of them is loaded
; (volatile) depending on an undef branch condition, the two loads are merged
; by a phi in %exit, and lanes <0, 1, 2, 2> are shuffled out as a <4 x i16>.
; Each extracted lane is then compared "sgt -1" and replaced by -32768 when the
; compare is true and by -1 otherwise.
;
; The checks for the run without -mcpu expect one buffer_load_ushort per
; element; the gfx900 checks expect a single global_load_dwordx4 per path.
define <4 x i16> @vec_8xi16_extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspace(1) * %p1) {
; SI-LABEL: vec_8xi16_extract_4xi16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_cbranch_scc0 .LBB0_2
; SI-NEXT:  ; %bb.1: ; %F
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
; SI-NEXT:    v_or_b32_e32 v2, v6, v2
; SI-NEXT:    v_or_b32_e32 v3, v4, v3
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
; SI-NEXT:    s_mov_b64 vcc, exec
; SI-NEXT:    s_cbranch_execz .LBB0_3
; SI-NEXT:    s_branch .LBB0_4
; SI-NEXT:  .LBB0_2:
; SI-NEXT:    ; implicit-def: $vgpr3
; SI-NEXT:    ; implicit-def: $vgpr4
; SI-NEXT:    ; implicit-def: $vgpr2
; SI-NEXT:    s_mov_b64 vcc, 0
; SI-NEXT:  .LBB0_3: ; %T
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v2, v4, v0
; SI-NEXT:    v_or_b32_e32 v3, v3, v1
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
; SI-NEXT:  .LBB0_4: ; %exit
; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
; SI-NEXT:    v_mov_b32_e32 v3, 0xffff
; SI-NEXT:    v_mov_b32_e32 v4, 0x8000
; SI-NEXT:    v_mov_b32_e32 v5, 0xffff0000
; SI-NEXT:    v_bfrev_b32_e32 v6, 1
; SI-NEXT:    v_mov_b32_e32 v7, 0xffff8000
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
; SI-NEXT:    v_cndmask_b32_e32 v2, -1, v7, vcc
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_8xi16_extract_4xi16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_cbranch_scc0 .LBB0_2
; GFX9-NEXT:  ; %bb.1: ; %F
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_cbranch_execz .LBB0_3
; GFX9-NEXT:    s_branch .LBB0_4
; GFX9-NEXT:  .LBB0_2:
; GFX9-NEXT:    s_mov_b32 s8, 0
; GFX9-NEXT:    s_mov_b32 s9, s8
; GFX9-NEXT:    s_mov_b32 s10, s8
; GFX9-NEXT:    s_mov_b32 s11, s8
; GFX9-NEXT:    v_mov_b32_e32 v2, s8
; GFX9-NEXT:    v_mov_b32_e32 v3, s9
; GFX9-NEXT:    v_mov_b32_e32 v4, s10
; GFX9-NEXT:    v_mov_b32_e32 v5, s11
; GFX9-NEXT:  .LBB0_3: ; %T
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:  .LBB0_4: ; %exit
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0]
; GFX9-NEXT:    s_movk_i32 s4, 0x8000
; GFX9-NEXT:    v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v3, 0xffff8000, v0
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
; GFX9-NEXT:    v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v3
; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Undef condition: the checked output keeps code for both %T and %F.
  br i1 undef, label %T, label %F

T:
  ; Volatile load of the whole vector from %p0; the checks show every element
  ; still being loaded although only the low lanes are consumed.
  %t = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p0
  br label %exit

F:
  ; Same as %T, but reading from %p1.
  %f = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p1
  br label %exit

exit:
  %m = phi <8 x i16> [ %t, %T ], [ %f, %F ]
  ; Take lanes 0, 1, 2 and repeat lane 2 in the last position.
  %v2 = shufflevector <8 x i16> %m, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ; Per lane: (x > -1) ? -32768 : -1
  %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
  %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
  ret <4 x i16> %r2
}
145
; Extract the high half of an <8 x i16> that reaches the use through a phi.
;
; %p0 and %p1 point to <8 x i16> values in addrspace(1). One of them is loaded
; (volatile) depending on an undef branch condition, the two loads are merged
; by a phi in %exit, and lanes <4, 5, 6, 7> are shuffled out as a <4 x i16>.
; Each extracted lane is then compared "sgt -1" and replaced by -32768 when the
; compare is true and by -1 otherwise.
;
; The checks for the run without -mcpu expect one buffer_load_ushort per
; element; the gfx900 checks expect a single global_load_dwordx4 per path and
; consume the two upper result registers (v4, v5).
define <4 x i16> @vec_8xi16_extract_4xi16_2(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspace(1) * %p1) {
; SI-LABEL: vec_8xi16_extract_4xi16_2:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_cbranch_scc0 .LBB1_2
; SI-NEXT:  ; %bb.1: ; %F
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
; SI-NEXT:    v_or_b32_e32 v2, v6, v2
; SI-NEXT:    v_or_b32_e32 v3, v4, v3
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; SI-NEXT:    s_mov_b64 vcc, exec
; SI-NEXT:    s_cbranch_execz .LBB1_3
; SI-NEXT:    s_branch .LBB1_4
; SI-NEXT:  .LBB1_2:
; SI-NEXT:    ; implicit-def: $vgpr3
; SI-NEXT:    ; implicit-def: $vgpr5
; SI-NEXT:    ; implicit-def: $vgpr2
; SI-NEXT:    ; implicit-def: $vgpr4
; SI-NEXT:    s_mov_b64 vcc, 0
; SI-NEXT:  .LBB1_3: ; %T
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v2, v4, v0
; SI-NEXT:    v_or_b32_e32 v3, v3, v1
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; SI-NEXT:  .LBB1_4: ; %exit
; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
; SI-NEXT:    v_bfe_i32 v1, v5, 0, 16
; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
; SI-NEXT:    v_bfe_i32 v3, v4, 0, 16
; SI-NEXT:    v_mov_b32_e32 v4, 0xffff
; SI-NEXT:    v_mov_b32_e32 v5, 0x8000
; SI-NEXT:    v_mov_b32_e32 v6, 0xffff0000
; SI-NEXT:    v_bfrev_b32_e32 v7, 1
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v3
; SI-NEXT:    v_cndmask_b32_e32 v3, v6, v7, vcc
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_8xi16_extract_4xi16_2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_cbranch_scc0 .LBB1_2
; GFX9-NEXT:  ; %bb.1: ; %F
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_cbranch_execz .LBB1_3
; GFX9-NEXT:    s_branch .LBB1_4
; GFX9-NEXT:  .LBB1_2:
; GFX9-NEXT:    s_mov_b32 s8, 0
; GFX9-NEXT:    s_mov_b32 s9, s8
; GFX9-NEXT:    s_mov_b32 s10, s8
; GFX9-NEXT:    s_mov_b32 s11, s8
; GFX9-NEXT:    v_mov_b32_e32 v2, s8
; GFX9-NEXT:    v_mov_b32_e32 v3, s9
; GFX9-NEXT:    v_mov_b32_e32 v4, s10
; GFX9-NEXT:    v_mov_b32_e32 v5, s11
; GFX9-NEXT:  .LBB1_3: ; %T
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:  .LBB1_4: ; %exit
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1]
; GFX9-NEXT:    s_movk_i32 s4, 0x8000
; GFX9-NEXT:    v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v2, 0xffff8000, v0
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
; GFX9-NEXT:    v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Undef condition: the checked output keeps code for both %T and %F.
  br i1 undef, label %T, label %F

T:
  ; Volatile load of the whole vector from %p0; the checks show every element
  ; still being loaded although only the high lanes are consumed.
  %t = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p0
  br label %exit

F:
  ; Same as %T, but reading from %p1.
  %f = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p1
  br label %exit

exit:
  %m = phi <8 x i16> [ %t, %T ], [ %f, %F ]
  ; Take the upper four lanes (4..7) in order.
  %v2 = shufflevector <8 x i16> %m, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ; Per lane: (x > -1) ? -32768 : -1
  %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
  %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
  ret <4 x i16> %r2
}
289
; Half-precision variant: extract the low lanes of an <8 x half> that reaches
; the use through a phi.
;
; %p0 and %p1 point to <8 x half> values in addrspace(1). One of them is loaded
; (volatile) depending on an undef branch condition, the two loads are merged
; by a phi in %exit, and lanes <0, 1, 2, 2> are shuffled out as a <4 x half>.
; Each extracted lane is compared "ugt" against 0xH3800 (the checks compare
; against the immediate 0.5) and replaced by 0xH3900 when the compare is true
; and by 0xH3D00 otherwise.
;
; The checks for the run without -mcpu convert each lane to f32 before the
; compare; the gfx900 checks compare in f16 directly.
define <4 x half> @vec_8xf16_extract_4xf16(<8 x half> addrspace(1) * %p0, <8 x half> addrspace(1) * %p1) {
; SI-LABEL: vec_8xf16_extract_4xf16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_cbranch_scc0 .LBB2_2
; SI-NEXT:  ; %bb.1: ; %F
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
; SI-NEXT:    v_or_b32_e32 v2, v6, v2
; SI-NEXT:    v_or_b32_e32 v4, v4, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v4
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    s_mov_b64 vcc, exec
; SI-NEXT:    s_cbranch_execz .LBB2_3
; SI-NEXT:    s_branch .LBB2_4
; SI-NEXT:  .LBB2_2:
; SI-NEXT:    ; implicit-def: $vgpr3
; SI-NEXT:    ; implicit-def: $vgpr4
; SI-NEXT:    ; implicit-def: $vgpr2
; SI-NEXT:    s_mov_b64 vcc, 0
; SI-NEXT:  .LBB2_3: ; %T
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v4, v0
; SI-NEXT:    v_or_b32_e32 v1, v2, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
; SI-NEXT:  .LBB2_4: ; %exit
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v4
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_mov_b32_e32 v3, 0x3fa00000
; SI-NEXT:    v_mov_b32_e32 v4, 0x3f200000
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v2
; SI-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v3, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_8xf16_extract_4xf16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_cbranch_scc0 .LBB2_2
; GFX9-NEXT:  ; %bb.1: ; %F
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_cbranch_execz .LBB2_3
; GFX9-NEXT:    s_branch .LBB2_4
; GFX9-NEXT:  .LBB2_2:
; GFX9-NEXT:    s_mov_b32 s8, 0
; GFX9-NEXT:    s_mov_b32 s9, s8
; GFX9-NEXT:    s_mov_b32 s10, s8
; GFX9-NEXT:    s_mov_b32 s11, s8
; GFX9-NEXT:    v_mov_b32_e32 v2, s8
; GFX9-NEXT:    v_mov_b32_e32 v3, s9
; GFX9-NEXT:    v_mov_b32_e32 v4, s10
; GFX9-NEXT:    v_mov_b32_e32 v5, s11
; GFX9-NEXT:  .LBB2_3: ; %T
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:  .LBB2_4: ; %exit
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v3
; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3800
; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3900
; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3d00
; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v0
; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v4, vcc
; GFX9-NEXT:    v_cmp_nle_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_cndmask_b32_e32 v6, v4, v3, vcc
; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v2
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
; GFX9-NEXT:    v_cmp_le_f16_sdwa vcc, v2, v1 src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT:    v_pack_b32_f16 v1, v5, v6
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Undef condition: the checked output keeps code for both %T and %F.
  br i1 undef, label %T, label %F

T:
  ; Volatile load of the whole vector from %p0; the checks show every element
  ; still being loaded although only the low lanes are consumed.
  %t = load volatile <8 x half>, <8 x half> addrspace(1) * %p0
  br label %exit

F:
  ; Same as %T, but reading from %p1.
  %f = load volatile <8 x half>, <8 x half> addrspace(1) * %p1
  br label %exit

exit:
  %m = phi <8 x half> [ %t, %T ], [ %f, %F ]
  ; Take lanes 0, 1, 2 and repeat lane 2 in the last position.
  %v2 = shufflevector <8 x half> %m, <8 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ; Per lane: (x ugt 0xH3800) ? 0xH3900 : 0xH3D00 (ugt is also true for NaN).
  %b2 = fcmp ugt <4 x half> %v2, <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800>
  %r2 = select <4 x i1> %b2, <4 x half> <half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900>, <4 x half> <half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00>
  ret <4 x half> %r2
}
435
; Wider source vector: extract the low lanes of a <16 x i16> that reaches the
; use through a phi.
;
; %p0 and %p1 point to <16 x i16> values in addrspace(1). One of them is loaded
; (volatile) depending on an undef branch condition, the two loads are merged
; by a phi in %exit, and lanes <0, 1, 2, 2> are shuffled out as a <4 x i16>.
; Each extracted lane is then compared "sgt -1" and replaced by -32768 when the
; compare is true and by -1 otherwise.
;
; The checks for the run without -mcpu expect sixteen buffer_load_ushort per
; path; the gfx900 checks expect two global_load_dwordx4 per path (offset 16
; first, then offset 0).
define <4 x i16> @vec_16xi16_extract_4xi16(<16 x i16> addrspace(1) * %p0, <16 x i16> addrspace(1) * %p1) {
;
; SI-LABEL: vec_16xi16_extract_4xi16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_cbranch_scc0 .LBB3_2
; SI-NEXT:  ; %bb.1: ; %F
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:20 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:24 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:26 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
; SI-NEXT:    v_or_b32_e32 v2, v6, v2
; SI-NEXT:    v_or_b32_e32 v3, v4, v3
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
; SI-NEXT:    s_mov_b64 vcc, exec
; SI-NEXT:    s_cbranch_execz .LBB3_3
; SI-NEXT:    s_branch .LBB3_4
; SI-NEXT:  .LBB3_2:
; SI-NEXT:    ; implicit-def: $vgpr3
; SI-NEXT:    ; implicit-def: $vgpr4
; SI-NEXT:    ; implicit-def: $vgpr2
; SI-NEXT:    s_mov_b64 vcc, 0
; SI-NEXT:  .LBB3_3: ; %T
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:14 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v2, v4, v0
; SI-NEXT:    v_or_b32_e32 v3, v3, v1
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
; SI-NEXT:  .LBB3_4: ; %exit
; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
; SI-NEXT:    v_mov_b32_e32 v3, 0xffff
; SI-NEXT:    v_mov_b32_e32 v4, 0x8000
; SI-NEXT:    v_mov_b32_e32 v5, 0xffff0000
; SI-NEXT:    v_bfrev_b32_e32 v6, 1
; SI-NEXT:    v_mov_b32_e32 v7, 0xffff8000
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
; SI-NEXT:    v_cndmask_b32_e32 v2, -1, v7, vcc
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: vec_16xi16_extract_4xi16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_cbranch_scc0 .LBB3_2
; GFX9-NEXT:  ; %bb.1: ; %F
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
; GFX9-NEXT:    s_cbranch_execz .LBB3_3
; GFX9-NEXT:    s_branch .LBB3_4
; GFX9-NEXT:  .LBB3_2:
; GFX9-NEXT:    s_mov_b32 s8, 0
; GFX9-NEXT:    s_mov_b32 s9, s8
; GFX9-NEXT:    s_mov_b32 s10, s8
; GFX9-NEXT:    s_mov_b32 s11, s8
; GFX9-NEXT:    s_mov_b32 s12, s8
; GFX9-NEXT:    s_mov_b32 s13, s8
; GFX9-NEXT:    s_mov_b32 s14, s8
; GFX9-NEXT:    s_mov_b32 s15, s8
; GFX9-NEXT:    v_mov_b32_e32 v4, s8
; GFX9-NEXT:    v_mov_b32_e32 v5, s9
; GFX9-NEXT:    v_mov_b32_e32 v6, s10
; GFX9-NEXT:    v_mov_b32_e32 v7, s11
; GFX9-NEXT:    v_mov_b32_e32 v8, s12
; GFX9-NEXT:    v_mov_b32_e32 v9, s13
; GFX9-NEXT:    v_mov_b32_e32 v10, s14
; GFX9-NEXT:    v_mov_b32_e32 v11, s15
; GFX9-NEXT:  .LBB3_3: ; %T
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
; GFX9-NEXT:  .LBB3_4: ; %exit
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,0]
; GFX9-NEXT:    s_movk_i32 s4, 0x8000
; GFX9-NEXT:    v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v2, 0xffff8000, v0
; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
; GFX9-NEXT:    v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; Undef condition: the checked output keeps code for both %T and %F.
  br i1 undef, label %T, label %F

T:
  ; Volatile load of the whole vector from %p0; the checks show every element
  ; still being loaded although only the low lanes are consumed.
  %t = load volatile <16 x i16>, <16 x i16> addrspace(1) * %p0
  br label %exit

F:
  ; Same as %T, but reading from %p1.
  %f = load volatile <16 x i16>, <16 x i16> addrspace(1) * %p1
  br label %exit

exit:
  %m = phi <16 x i16> [ %t, %T ], [ %f, %F ]
  ; Take lanes 0, 1, 2 and repeat lane 2 in the last position.
  %v2 = shufflevector <16 x i16> %m, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ; Per lane: (x > -1) ? -32768 : -1
  %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
  %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
  ret <4 x i16> %r2
}
624
625define <4 x i16> @vec_16xi16_extract_4xi16_2(<16 x i16> addrspace(1) * %p0, <16 x i16> addrspace(1) * %p1) {
626;
627; SI-LABEL: vec_16xi16_extract_4xi16_2:
628; SI:       ; %bb.0:
629; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
630; SI-NEXT:    s_cbranch_scc0 .LBB4_2
631; SI-NEXT:  ; %bb.1: ; %F
632; SI-NEXT:    s_mov_b32 s6, 0
633; SI-NEXT:    s_mov_b32 s7, 0xf000
634; SI-NEXT:    s_mov_b32 s4, s6
635; SI-NEXT:    s_mov_b32 s5, s6
636; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
637; SI-NEXT:    s_waitcnt vmcnt(0)
638; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
639; SI-NEXT:    s_waitcnt vmcnt(0)
640; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:4 glc
641; SI-NEXT:    s_waitcnt vmcnt(0)
642; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
643; SI-NEXT:    s_waitcnt vmcnt(0)
644; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
645; SI-NEXT:    s_waitcnt vmcnt(0)
646; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
647; SI-NEXT:    s_waitcnt vmcnt(0)
648; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
649; SI-NEXT:    s_waitcnt vmcnt(0)
650; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:14 glc
651; SI-NEXT:    s_waitcnt vmcnt(0)
652; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc
653; SI-NEXT:    s_waitcnt vmcnt(0)
654; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc
655; SI-NEXT:    s_waitcnt vmcnt(0)
656; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:20 glc
657; SI-NEXT:    s_waitcnt vmcnt(0)
658; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 glc
659; SI-NEXT:    s_waitcnt vmcnt(0)
660; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:24 glc
661; SI-NEXT:    s_waitcnt vmcnt(0)
662; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:26 glc
663; SI-NEXT:    s_waitcnt vmcnt(0)
664; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc
665; SI-NEXT:    s_waitcnt vmcnt(0)
666; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
667; SI-NEXT:    s_waitcnt vmcnt(0)
668; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
669; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
670; SI-NEXT:    v_or_b32_e32 v2, v6, v2
671; SI-NEXT:    v_or_b32_e32 v3, v4, v3
672; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
673; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
674; SI-NEXT:    s_mov_b64 vcc, exec
675; SI-NEXT:    s_cbranch_execz .LBB4_3
676; SI-NEXT:    s_branch .LBB4_4
677; SI-NEXT:  .LBB4_2:
678; SI-NEXT:    ; implicit-def: $vgpr3
679; SI-NEXT:    ; implicit-def: $vgpr5
680; SI-NEXT:    ; implicit-def: $vgpr2
681; SI-NEXT:    ; implicit-def: $vgpr4
682; SI-NEXT:    s_mov_b64 vcc, 0
683; SI-NEXT:  .LBB4_3: ; %T
684; SI-NEXT:    s_mov_b32 s6, 0
685; SI-NEXT:    s_mov_b32 s7, 0xf000
686; SI-NEXT:    s_mov_b32 s4, s6
687; SI-NEXT:    s_mov_b32 s5, s6
688; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
689; SI-NEXT:    s_waitcnt vmcnt(0)
690; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
691; SI-NEXT:    s_waitcnt vmcnt(0)
692; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
693; SI-NEXT:    s_waitcnt vmcnt(0)
694; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
695; SI-NEXT:    s_waitcnt vmcnt(0)
696; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc
697; SI-NEXT:    s_waitcnt vmcnt(0)
698; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:10 glc
699; SI-NEXT:    s_waitcnt vmcnt(0)
700; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
701; SI-NEXT:    s_waitcnt vmcnt(0)
702; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:14 glc
703; SI-NEXT:    s_waitcnt vmcnt(0)
704; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc
705; SI-NEXT:    s_waitcnt vmcnt(0)
706; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc
707; SI-NEXT:    s_waitcnt vmcnt(0)
708; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc
709; SI-NEXT:    s_waitcnt vmcnt(0)
710; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc
711; SI-NEXT:    s_waitcnt vmcnt(0)
712; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc
713; SI-NEXT:    s_waitcnt vmcnt(0)
714; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc
715; SI-NEXT:    s_waitcnt vmcnt(0)
716; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc
717; SI-NEXT:    s_waitcnt vmcnt(0)
718; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
719; SI-NEXT:    s_waitcnt vmcnt(0)
720; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
721; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
722; SI-NEXT:    v_or_b32_e32 v2, v4, v0
723; SI-NEXT:    v_or_b32_e32 v3, v3, v1
724; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
725; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
726; SI-NEXT:  .LBB4_4: ; %exit
727; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
728; SI-NEXT:    v_bfe_i32 v1, v5, 0, 16
729; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
730; SI-NEXT:    v_bfe_i32 v3, v4, 0, 16
731; SI-NEXT:    v_mov_b32_e32 v4, 0xffff
732; SI-NEXT:    v_mov_b32_e32 v5, 0x8000
733; SI-NEXT:    v_mov_b32_e32 v6, 0xffff0000
734; SI-NEXT:    v_bfrev_b32_e32 v7, 1
735; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
736; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
737; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
738; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
739; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
740; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
741; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v3
742; SI-NEXT:    v_cndmask_b32_e32 v3, v6, v7, vcc
743; SI-NEXT:    v_or_b32_e32 v0, v0, v1
744; SI-NEXT:    v_or_b32_e32 v2, v2, v3
745; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
746; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
747; SI-NEXT:    s_setpc_b64 s[30:31]
748;
749; GFX9-LABEL: vec_16xi16_extract_4xi16_2:
750; GFX9:       ; %bb.0:
751; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
752; GFX9-NEXT:    s_cbranch_scc0 .LBB4_2
753; GFX9-NEXT:  ; %bb.1: ; %F
754; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
755; GFX9-NEXT:    s_waitcnt vmcnt(0)
756; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off glc
757; GFX9-NEXT:    s_waitcnt vmcnt(0)
758; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
759; GFX9-NEXT:    s_cbranch_execz .LBB4_3
760; GFX9-NEXT:    s_branch .LBB4_4
761; GFX9-NEXT:  .LBB4_2:
762; GFX9-NEXT:    s_mov_b32 s8, 0
763; GFX9-NEXT:    s_mov_b32 s9, s8
764; GFX9-NEXT:    s_mov_b32 s10, s8
765; GFX9-NEXT:    s_mov_b32 s11, s8
766; GFX9-NEXT:    s_mov_b32 s12, s8
767; GFX9-NEXT:    s_mov_b32 s13, s8
768; GFX9-NEXT:    s_mov_b32 s14, s8
769; GFX9-NEXT:    s_mov_b32 s15, s8
770; GFX9-NEXT:    v_mov_b32_e32 v4, s8
771; GFX9-NEXT:    v_mov_b32_e32 v5, s9
772; GFX9-NEXT:    v_mov_b32_e32 v6, s10
773; GFX9-NEXT:    v_mov_b32_e32 v7, s11
774; GFX9-NEXT:    v_mov_b32_e32 v8, s12
775; GFX9-NEXT:    v_mov_b32_e32 v9, s13
776; GFX9-NEXT:    v_mov_b32_e32 v10, s14
777; GFX9-NEXT:    v_mov_b32_e32 v11, s15
778; GFX9-NEXT:  .LBB4_3: ; %T
779; GFX9-NEXT:    s_waitcnt vmcnt(0)
780; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
781; GFX9-NEXT:    s_waitcnt vmcnt(0)
782; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off glc
783; GFX9-NEXT:    s_waitcnt vmcnt(0)
784; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
785; GFX9-NEXT:  .LBB4_4: ; %exit
786; GFX9-NEXT:    s_waitcnt vmcnt(0)
787; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v7 op_sel_hi:[0,1]
788; GFX9-NEXT:    s_movk_i32 s4, 0x8000
789; GFX9-NEXT:    v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
790; GFX9-NEXT:    v_or_b32_e32 v2, 0xffff8000, v0
791; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v6 op_sel_hi:[0,1]
792; GFX9-NEXT:    v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
793; GFX9-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
794; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
795; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
796; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
797; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
798; GFX9-NEXT:    s_setpc_b64 s[30:31]
799  br i1 undef, label %T, label %F
800
801T:
802  %t = load volatile <16 x i16>, <16 x i16> addrspace(1) * %p0
803  br label %exit
804
805F:
806  %f = load volatile <16 x i16>, <16 x i16> addrspace(1) * %p1
807  br label %exit
808
809exit:
810  %m = phi <16 x i16> [ %t, %T ], [ %f, %F ]
811  %v2 = shufflevector <16 x i16> %m, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
812  %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
813  %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
814  ret <4 x i16> %r2
815}
816
; vec_16xf16_extract_4xf16: loads a volatile <16 x half> from %p0 (block %T)
; or from %p1 (block %F), merges the two values with a phi, and keeps only
; elements 0, 1, 2, 2 of the merged vector through a shufflevector to
; <4 x half>. Each kept lane is compared "ugt" against 0xH3800 (0.5) and
; mapped to 0xH3900 (0.625) when the compare is true, else to 0xH3D00 (1.25).
;
; The SI and GFX9 lines below are FileCheck assertions on the llc output for
; the two RUN configurations at the top of the file. They are autogenerated
; (see the NOTE on the first line of the file): regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
817define <4 x half> @vec_16xf16_extract_4xf16(<16 x half> addrspace(1) * %p0, <16 x half> addrspace(1) * %p1) {
818;
819; SI-LABEL: vec_16xf16_extract_4xf16:
820; SI:       ; %bb.0:
821; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
822; SI-NEXT:    s_cbranch_scc0 .LBB5_2
823; SI-NEXT:  ; %bb.1: ; %F
824; SI-NEXT:    s_mov_b32 s6, 0
825; SI-NEXT:    s_mov_b32 s7, 0xf000
826; SI-NEXT:    s_mov_b32 s4, s6
827; SI-NEXT:    s_mov_b32 s5, s6
828; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
829; SI-NEXT:    s_waitcnt vmcnt(0)
830; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
831; SI-NEXT:    s_waitcnt vmcnt(0)
832; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
833; SI-NEXT:    s_waitcnt vmcnt(0)
834; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
835; SI-NEXT:    s_waitcnt vmcnt(0)
836; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
837; SI-NEXT:    s_waitcnt vmcnt(0)
838; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
839; SI-NEXT:    s_waitcnt vmcnt(0)
840; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
841; SI-NEXT:    s_waitcnt vmcnt(0)
842; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:14 glc
843; SI-NEXT:    s_waitcnt vmcnt(0)
844; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc
845; SI-NEXT:    s_waitcnt vmcnt(0)
846; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc
847; SI-NEXT:    s_waitcnt vmcnt(0)
848; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:20 glc
849; SI-NEXT:    s_waitcnt vmcnt(0)
850; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 glc
851; SI-NEXT:    s_waitcnt vmcnt(0)
852; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:24 glc
853; SI-NEXT:    s_waitcnt vmcnt(0)
854; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:26 glc
855; SI-NEXT:    s_waitcnt vmcnt(0)
856; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc
857; SI-NEXT:    s_waitcnt vmcnt(0)
858; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
859; SI-NEXT:    s_waitcnt vmcnt(0)
860; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
861; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
862; SI-NEXT:    v_or_b32_e32 v2, v6, v2
863; SI-NEXT:    v_or_b32_e32 v4, v4, v3
864; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
865; SI-NEXT:    v_cvt_f32_f16_e32 v3, v4
866; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
867; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
868; SI-NEXT:    s_mov_b64 vcc, exec
869; SI-NEXT:    s_cbranch_execz .LBB5_3
870; SI-NEXT:    s_branch .LBB5_4
871; SI-NEXT:  .LBB5_2:
872; SI-NEXT:    ; implicit-def: $vgpr3
873; SI-NEXT:    ; implicit-def: $vgpr4
874; SI-NEXT:    ; implicit-def: $vgpr2
875; SI-NEXT:    s_mov_b64 vcc, 0
876; SI-NEXT:  .LBB5_3: ; %T
877; SI-NEXT:    s_mov_b32 s6, 0
878; SI-NEXT:    s_mov_b32 s7, 0xf000
879; SI-NEXT:    s_mov_b32 s4, s6
880; SI-NEXT:    s_mov_b32 s5, s6
881; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
882; SI-NEXT:    s_waitcnt vmcnt(0)
883; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
884; SI-NEXT:    s_waitcnt vmcnt(0)
885; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
886; SI-NEXT:    s_waitcnt vmcnt(0)
887; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
888; SI-NEXT:    s_waitcnt vmcnt(0)
889; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
890; SI-NEXT:    s_waitcnt vmcnt(0)
891; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
892; SI-NEXT:    s_waitcnt vmcnt(0)
893; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
894; SI-NEXT:    s_waitcnt vmcnt(0)
895; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:14 glc
896; SI-NEXT:    s_waitcnt vmcnt(0)
897; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc
898; SI-NEXT:    s_waitcnt vmcnt(0)
899; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc
900; SI-NEXT:    s_waitcnt vmcnt(0)
901; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc
902; SI-NEXT:    s_waitcnt vmcnt(0)
903; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc
904; SI-NEXT:    s_waitcnt vmcnt(0)
905; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc
906; SI-NEXT:    s_waitcnt vmcnt(0)
907; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc
908; SI-NEXT:    s_waitcnt vmcnt(0)
909; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc
910; SI-NEXT:    s_waitcnt vmcnt(0)
911; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
912; SI-NEXT:    s_waitcnt vmcnt(0)
913; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
914; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
915; SI-NEXT:    v_or_b32_e32 v0, v4, v0
916; SI-NEXT:    v_or_b32_e32 v1, v2, v1
917; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
918; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
919; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
920; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
921; SI-NEXT:  .LBB5_4: ; %exit
922; SI-NEXT:    v_cvt_f16_f32_e32 v0, v3
923; SI-NEXT:    v_cvt_f16_f32_e32 v1, v4
924; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
925; SI-NEXT:    v_mov_b32_e32 v3, 0x3fa00000
926; SI-NEXT:    v_mov_b32_e32 v4, 0x3f200000
927; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
928; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
929; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
930; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v0
931; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
932; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v1
933; SI-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
934; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v2
935; SI-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
936; SI-NEXT:    v_mov_b32_e32 v3, v2
937; SI-NEXT:    s_setpc_b64 s[30:31]
938;
939; GFX9-LABEL: vec_16xf16_extract_4xf16:
940; GFX9:       ; %bb.0:
941; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
942; GFX9-NEXT:    s_cbranch_scc0 .LBB5_2
943; GFX9-NEXT:  ; %bb.1: ; %F
944; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
945; GFX9-NEXT:    s_waitcnt vmcnt(0)
946; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off glc
947; GFX9-NEXT:    s_waitcnt vmcnt(0)
948; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
949; GFX9-NEXT:    s_cbranch_execz .LBB5_3
950; GFX9-NEXT:    s_branch .LBB5_4
951; GFX9-NEXT:  .LBB5_2:
952; GFX9-NEXT:    s_mov_b32 s8, 0
953; GFX9-NEXT:    s_mov_b32 s9, s8
954; GFX9-NEXT:    s_mov_b32 s10, s8
955; GFX9-NEXT:    s_mov_b32 s11, s8
956; GFX9-NEXT:    s_mov_b32 s12, s8
957; GFX9-NEXT:    s_mov_b32 s13, s8
958; GFX9-NEXT:    s_mov_b32 s14, s8
959; GFX9-NEXT:    s_mov_b32 s15, s8
960; GFX9-NEXT:    v_mov_b32_e32 v4, s8
961; GFX9-NEXT:    v_mov_b32_e32 v5, s9
962; GFX9-NEXT:    v_mov_b32_e32 v6, s10
963; GFX9-NEXT:    v_mov_b32_e32 v7, s11
964; GFX9-NEXT:    v_mov_b32_e32 v8, s12
965; GFX9-NEXT:    v_mov_b32_e32 v9, s13
966; GFX9-NEXT:    v_mov_b32_e32 v10, s14
967; GFX9-NEXT:    v_mov_b32_e32 v11, s15
968; GFX9-NEXT:  .LBB5_3: ; %T
969; GFX9-NEXT:    s_waitcnt vmcnt(0)
970; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
971; GFX9-NEXT:    s_waitcnt vmcnt(0)
972; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off glc
973; GFX9-NEXT:    s_waitcnt vmcnt(0)
974; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
975; GFX9-NEXT:  .LBB5_4: ; %exit
976; GFX9-NEXT:    s_waitcnt vmcnt(0)
977; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v5
978; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
979; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3800
980; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3900
981; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3d00
982; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v0
983; GFX9-NEXT:    v_cndmask_b32_e32 v5, v2, v3, vcc
984; GFX9-NEXT:    v_cmp_nle_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
985; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v2, vcc
986; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v4
987; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
988; GFX9-NEXT:    v_cmp_le_f16_sdwa vcc, v4, v1 src0_sel:WORD_1 src1_sel:DWORD
989; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
990; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
991; GFX9-NEXT:    v_pack_b32_f16 v1, v5, v6
992; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; The branch condition is undef; the checked output above contains both the
  ; %T and the %F load sequences.
993  br i1 undef, label %T, label %F
994
995T:
  ; Volatile load of the whole 16-element vector from the first pointer.
996  %t = load volatile <16 x half>, <16 x half> addrspace(1) * %p0
997  br label %exit
998
999F:
  ; Volatile load of the whole 16-element vector from the second pointer.
1000  %f = load volatile <16 x half>, <16 x half> addrspace(1) * %p1
1001  br label %exit
1002
1003exit:
1004  %m = phi <16 x half> [ %t, %T ], [ %f, %F ]
  ; Mask <0, 1, 2, 2>: result lane 3 repeats source element 2, so only the
  ; first three source elements are used.
1005  %v2 = shufflevector <16 x half> %m, <16 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ; "ugt" is true when a lane is greater than 0.5 (0xH3800) or unordered.
1006  %b2 = fcmp ugt <4 x half> %v2, <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800>
  ; True lanes become 0xH3900 (0.625); false lanes become 0xH3D00 (1.25).
1007  %r2 = select <4 x i1> %b2, <4 x half> <half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900>, <4 x half> <half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00>
1008  ret <4 x half> %r2
1009}
1010