1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=SI %s
3; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
4
; Extracts a <4 x i16> from a volatile <8 x i16> load with shuffle mask
; <0, 1, 2, 2> (elements 0..2, element 2 repeated in the last lane), then maps
; every lane to -32768 when it is greater than -1 and to -1 otherwise.
; The vector is loaded from %p0 in block T or from %p1 in block F. The branch
; condition is undef -- presumably so that neither load can be folded away
; (TODO confirm intent).
; The assertions below expect eight buffer_load_ushort per path for the default
; (SI-prefixed) run line and one global_load_dwordx4 per path for gfx900.
5define <4 x i16> @vec_8xi16_extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspace(1) * %p1) {
6; SI-LABEL: vec_8xi16_extract_4xi16:
7; SI:       ; %bb.0:
8; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9; SI-NEXT:    s_cbranch_scc0 .LBB0_2
10; SI-NEXT:  ; %bb.1: ; %F
11; SI-NEXT:    s_mov_b32 s6, 0
12; SI-NEXT:    s_mov_b32 s7, 0xf000
13; SI-NEXT:    s_mov_b32 s4, s6
14; SI-NEXT:    s_mov_b32 s5, s6
15; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
16; SI-NEXT:    s_waitcnt vmcnt(0)
17; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
18; SI-NEXT:    s_waitcnt vmcnt(0)
19; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
20; SI-NEXT:    s_waitcnt vmcnt(0)
21; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
22; SI-NEXT:    s_waitcnt vmcnt(0)
23; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
24; SI-NEXT:    s_waitcnt vmcnt(0)
25; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
26; SI-NEXT:    s_waitcnt vmcnt(0)
27; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
28; SI-NEXT:    s_waitcnt vmcnt(0)
29; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
30; SI-NEXT:    s_waitcnt vmcnt(0)
31; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
32; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
33; SI-NEXT:    v_or_b32_e32 v2, v6, v2
34; SI-NEXT:    v_or_b32_e32 v3, v4, v3
35; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
36; SI-NEXT:    s_mov_b64 vcc, exec
37; SI-NEXT:    s_cbranch_execz .LBB0_3
38; SI-NEXT:    s_branch .LBB0_4
39; SI-NEXT:  .LBB0_2:
40; SI-NEXT:    ; implicit-def: $vgpr3
41; SI-NEXT:    ; implicit-def: $vgpr4
42; SI-NEXT:    ; implicit-def: $vgpr2
43; SI-NEXT:    s_mov_b64 vcc, 0
44; SI-NEXT:  .LBB0_3: ; %T
45; SI-NEXT:    s_mov_b32 s6, 0
46; SI-NEXT:    s_mov_b32 s7, 0xf000
47; SI-NEXT:    s_mov_b32 s4, s6
48; SI-NEXT:    s_mov_b32 s5, s6
49; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
50; SI-NEXT:    s_waitcnt vmcnt(0)
51; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
52; SI-NEXT:    s_waitcnt vmcnt(0)
53; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
54; SI-NEXT:    s_waitcnt vmcnt(0)
55; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
56; SI-NEXT:    s_waitcnt vmcnt(0)
57; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
58; SI-NEXT:    s_waitcnt vmcnt(0)
59; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
60; SI-NEXT:    s_waitcnt vmcnt(0)
61; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
62; SI-NEXT:    s_waitcnt vmcnt(0)
63; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
64; SI-NEXT:    s_waitcnt vmcnt(0)
65; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
66; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
67; SI-NEXT:    v_or_b32_e32 v2, v4, v0
68; SI-NEXT:    v_or_b32_e32 v3, v3, v1
69; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
70; SI-NEXT:  .LBB0_4: ; %exit
71; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
72; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
73; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
74; SI-NEXT:    v_mov_b32_e32 v3, 0xffff
75; SI-NEXT:    v_mov_b32_e32 v4, 0x8000
76; SI-NEXT:    v_mov_b32_e32 v5, 0xffff0000
77; SI-NEXT:    v_bfrev_b32_e32 v6, 1
78; SI-NEXT:    v_mov_b32_e32 v7, 0xffff8000
79; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
80; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
81; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
82; SI-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
83; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
84; SI-NEXT:    v_cndmask_b32_e32 v2, -1, v7, vcc
85; SI-NEXT:    v_or_b32_e32 v0, v0, v1
86; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
87; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
88; SI-NEXT:    v_or_b32_e32 v2, v2, v3
89; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
90; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
91; SI-NEXT:    s_setpc_b64 s[30:31]
92;
93; GFX9-LABEL: vec_8xi16_extract_4xi16:
94; GFX9:       ; %bb.0:
95; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
96; GFX9-NEXT:    s_cbranch_scc0 .LBB0_2
97; GFX9-NEXT:  ; %bb.1: ; %F
98; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
99; GFX9-NEXT:    s_waitcnt vmcnt(0)
100; GFX9-NEXT:    s_cbranch_execz .LBB0_3
101; GFX9-NEXT:    s_branch .LBB0_4
102; GFX9-NEXT:  .LBB0_2:
103; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
104; GFX9-NEXT:  .LBB0_3: ; %T
105; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
106; GFX9-NEXT:    s_waitcnt vmcnt(0)
107; GFX9-NEXT:  .LBB0_4: ; %exit
108; GFX9-NEXT:    s_waitcnt vmcnt(0)
109; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0]
110; GFX9-NEXT:    s_movk_i32 s4, 0x8000
111; GFX9-NEXT:    v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
112; GFX9-NEXT:    v_or_b32_e32 v3, 0xffff8000, v0
113; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
114; GFX9-NEXT:    v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
115; GFX9-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
116; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
117; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
118; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v3
119; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
120; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; The condition is undef, so either successor may be taken.
121  br i1 undef, label %T, label %F
122
123T:
  ; Volatile load of the whole 8-element vector from %p0.
124  %t = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p0
125  br label %exit
126
127F:
  ; Volatile load of the whole 8-element vector from %p1.
128  %f = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p1
129  br label %exit
130
131exit:
  ; %m is whichever vector was loaded on the path that was taken.
132  %m = phi <8 x i16> [ %t, %T ], [ %f, %F ]
  ; Keep elements 0, 1 and 2; the last lane repeats element 2.
133  %v2 = shufflevector <8 x i16> %m, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ; Per-lane signed test: true when the element is > -1 (i.e. non-negative).
134  %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
  ; True lanes become -32768, false lanes become -1.
135  %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
136  ret <4 x i16> %r2
137}
138
; Extracts the upper half of a volatile <8 x i16> load (shuffle mask
; <4, 5, 6, 7>), then maps every lane to -32768 when it is greater than -1 and
; to -1 otherwise.
; The vector is loaded from %p0 in block T or from %p1 in block F. The branch
; condition is undef -- presumably so that neither load can be folded away
; (TODO confirm intent).
; The assertions below expect eight buffer_load_ushort per path for the default
; (SI-prefixed) run line and one global_load_dwordx4 per path for gfx900.
139define <4 x i16> @vec_8xi16_extract_4xi16_2(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspace(1) * %p1) {
140; SI-LABEL: vec_8xi16_extract_4xi16_2:
141; SI:       ; %bb.0:
142; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
143; SI-NEXT:    s_cbranch_scc0 .LBB1_2
144; SI-NEXT:  ; %bb.1: ; %F
145; SI-NEXT:    s_mov_b32 s6, 0
146; SI-NEXT:    s_mov_b32 s7, 0xf000
147; SI-NEXT:    s_mov_b32 s4, s6
148; SI-NEXT:    s_mov_b32 s5, s6
149; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
150; SI-NEXT:    s_waitcnt vmcnt(0)
151; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
152; SI-NEXT:    s_waitcnt vmcnt(0)
153; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:4 glc
154; SI-NEXT:    s_waitcnt vmcnt(0)
155; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
156; SI-NEXT:    s_waitcnt vmcnt(0)
157; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
158; SI-NEXT:    s_waitcnt vmcnt(0)
159; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
160; SI-NEXT:    s_waitcnt vmcnt(0)
161; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
162; SI-NEXT:    s_waitcnt vmcnt(0)
163; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
164; SI-NEXT:    s_waitcnt vmcnt(0)
165; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
166; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
167; SI-NEXT:    v_or_b32_e32 v2, v6, v2
168; SI-NEXT:    v_or_b32_e32 v3, v4, v3
169; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
170; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
171; SI-NEXT:    s_mov_b64 vcc, exec
172; SI-NEXT:    s_cbranch_execz .LBB1_3
173; SI-NEXT:    s_branch .LBB1_4
174; SI-NEXT:  .LBB1_2:
175; SI-NEXT:    ; implicit-def: $vgpr3
176; SI-NEXT:    ; implicit-def: $vgpr5
177; SI-NEXT:    ; implicit-def: $vgpr2
178; SI-NEXT:    ; implicit-def: $vgpr4
179; SI-NEXT:    s_mov_b64 vcc, 0
180; SI-NEXT:  .LBB1_3: ; %T
181; SI-NEXT:    s_mov_b32 s6, 0
182; SI-NEXT:    s_mov_b32 s7, 0xf000
183; SI-NEXT:    s_mov_b32 s4, s6
184; SI-NEXT:    s_mov_b32 s5, s6
185; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
186; SI-NEXT:    s_waitcnt vmcnt(0)
187; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
188; SI-NEXT:    s_waitcnt vmcnt(0)
189; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
190; SI-NEXT:    s_waitcnt vmcnt(0)
191; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
192; SI-NEXT:    s_waitcnt vmcnt(0)
193; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc
194; SI-NEXT:    s_waitcnt vmcnt(0)
195; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:10 glc
196; SI-NEXT:    s_waitcnt vmcnt(0)
197; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
198; SI-NEXT:    s_waitcnt vmcnt(0)
199; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
200; SI-NEXT:    s_waitcnt vmcnt(0)
201; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
202; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
203; SI-NEXT:    v_or_b32_e32 v2, v4, v0
204; SI-NEXT:    v_or_b32_e32 v3, v3, v1
205; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
206; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
207; SI-NEXT:  .LBB1_4: ; %exit
208; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
209; SI-NEXT:    v_bfe_i32 v1, v5, 0, 16
210; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
211; SI-NEXT:    v_bfe_i32 v3, v4, 0, 16
212; SI-NEXT:    v_mov_b32_e32 v4, 0xffff
213; SI-NEXT:    v_mov_b32_e32 v5, 0x8000
214; SI-NEXT:    v_mov_b32_e32 v6, 0xffff0000
215; SI-NEXT:    v_bfrev_b32_e32 v7, 1
216; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
217; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
218; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
219; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
220; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
221; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
222; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v3
223; SI-NEXT:    v_cndmask_b32_e32 v3, v6, v7, vcc
224; SI-NEXT:    v_or_b32_e32 v0, v0, v1
225; SI-NEXT:    v_or_b32_e32 v2, v2, v3
226; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
227; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
228; SI-NEXT:    s_setpc_b64 s[30:31]
229;
230; GFX9-LABEL: vec_8xi16_extract_4xi16_2:
231; GFX9:       ; %bb.0:
232; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
233; GFX9-NEXT:    s_cbranch_scc0 .LBB1_2
234; GFX9-NEXT:  ; %bb.1: ; %F
235; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
236; GFX9-NEXT:    s_waitcnt vmcnt(0)
237; GFX9-NEXT:    s_cbranch_execz .LBB1_3
238; GFX9-NEXT:    s_branch .LBB1_4
239; GFX9-NEXT:  .LBB1_2:
240; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
241; GFX9-NEXT:  .LBB1_3: ; %T
242; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
243; GFX9-NEXT:    s_waitcnt vmcnt(0)
244; GFX9-NEXT:  .LBB1_4: ; %exit
245; GFX9-NEXT:    s_waitcnt vmcnt(0)
246; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1]
247; GFX9-NEXT:    s_movk_i32 s4, 0x8000
248; GFX9-NEXT:    v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
249; GFX9-NEXT:    v_or_b32_e32 v2, 0xffff8000, v0
250; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
251; GFX9-NEXT:    v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
252; GFX9-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
253; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
254; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
255; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
256; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
257; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; The condition is undef, so either successor may be taken.
258  br i1 undef, label %T, label %F
259
260T:
  ; Volatile load of the whole 8-element vector from %p0.
261  %t = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p0
262  br label %exit
263
264F:
  ; Volatile load of the whole 8-element vector from %p1.
265  %f = load volatile <8 x i16>, <8 x i16> addrspace(1) * %p1
266  br label %exit
267
268exit:
  ; %m is whichever vector was loaded on the path that was taken.
269  %m = phi <8 x i16> [ %t, %T ], [ %f, %F ]
  ; Keep elements 4..7, i.e. the upper four lanes of the loaded vector.
270  %v2 = shufflevector <8 x i16> %m, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ; Per-lane signed test: true when the element is > -1 (i.e. non-negative).
271  %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
  ; True lanes become -32768, false lanes become -1.
272  %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
273  ret <4 x i16> %r2
274}
275
; Half-precision variant: extracts a <4 x half> from a volatile <8 x half> load
; with shuffle mask <0, 1, 2, 2>, then selects per lane 0xH3900 (0.625) when the
; element compares `ugt` (unordered or greater than) against 0xH3800 (0.5) and
; 0xH3D00 (1.25) otherwise.
; The vector is loaded from %p0 in block T or from %p1 in block F. The branch
; condition is undef -- presumably so that neither load can be folded away
; (TODO confirm intent).
; The assertions below expect, for the default (SI-prefixed) run line, eight
; buffer_load_ushort per path plus v_cvt_f32_f16 conversions and f32 compares;
; for gfx900 they expect one global_load_dwordx4 per path and f16 compares.
276define <4 x half> @vec_8xf16_extract_4xf16(<8 x half> addrspace(1) * %p0, <8 x half> addrspace(1) * %p1) {
277; SI-LABEL: vec_8xf16_extract_4xf16:
278; SI:       ; %bb.0:
279; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
280; SI-NEXT:    s_cbranch_scc0 .LBB2_2
281; SI-NEXT:  ; %bb.1: ; %F
282; SI-NEXT:    s_mov_b32 s6, 0
283; SI-NEXT:    s_mov_b32 s7, 0xf000
284; SI-NEXT:    s_mov_b32 s4, s6
285; SI-NEXT:    s_mov_b32 s5, s6
286; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
287; SI-NEXT:    s_waitcnt vmcnt(0)
288; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
289; SI-NEXT:    s_waitcnt vmcnt(0)
290; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
291; SI-NEXT:    s_waitcnt vmcnt(0)
292; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
293; SI-NEXT:    s_waitcnt vmcnt(0)
294; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
295; SI-NEXT:    s_waitcnt vmcnt(0)
296; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
297; SI-NEXT:    s_waitcnt vmcnt(0)
298; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
299; SI-NEXT:    s_waitcnt vmcnt(0)
300; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
301; SI-NEXT:    s_waitcnt vmcnt(0)
302; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
303; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
304; SI-NEXT:    v_or_b32_e32 v2, v6, v2
305; SI-NEXT:    v_or_b32_e32 v4, v4, v3
306; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
307; SI-NEXT:    v_cvt_f32_f16_e32 v3, v4
308; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
309; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
310; SI-NEXT:    s_mov_b64 vcc, exec
311; SI-NEXT:    s_cbranch_execz .LBB2_3
312; SI-NEXT:    s_branch .LBB2_4
313; SI-NEXT:  .LBB2_2:
314; SI-NEXT:    ; implicit-def: $vgpr3
315; SI-NEXT:    ; implicit-def: $vgpr4
316; SI-NEXT:    ; implicit-def: $vgpr2
317; SI-NEXT:    s_mov_b64 vcc, 0
318; SI-NEXT:  .LBB2_3: ; %T
319; SI-NEXT:    s_mov_b32 s6, 0
320; SI-NEXT:    s_mov_b32 s7, 0xf000
321; SI-NEXT:    s_mov_b32 s4, s6
322; SI-NEXT:    s_mov_b32 s5, s6
323; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
324; SI-NEXT:    s_waitcnt vmcnt(0)
325; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
326; SI-NEXT:    s_waitcnt vmcnt(0)
327; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
328; SI-NEXT:    s_waitcnt vmcnt(0)
329; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
330; SI-NEXT:    s_waitcnt vmcnt(0)
331; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
332; SI-NEXT:    s_waitcnt vmcnt(0)
333; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
334; SI-NEXT:    s_waitcnt vmcnt(0)
335; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
336; SI-NEXT:    s_waitcnt vmcnt(0)
337; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
338; SI-NEXT:    s_waitcnt vmcnt(0)
339; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
340; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
341; SI-NEXT:    v_or_b32_e32 v0, v4, v0
342; SI-NEXT:    v_or_b32_e32 v1, v2, v1
343; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
344; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
345; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
346; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
347; SI-NEXT:  .LBB2_4: ; %exit
348; SI-NEXT:    v_cvt_f16_f32_e32 v0, v3
349; SI-NEXT:    v_cvt_f16_f32_e32 v1, v4
350; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
351; SI-NEXT:    v_mov_b32_e32 v3, 0x3fa00000
352; SI-NEXT:    v_mov_b32_e32 v4, 0x3f200000
353; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
354; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
355; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
356; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v0
357; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
358; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v1
359; SI-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
360; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v2
361; SI-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
362; SI-NEXT:    v_mov_b32_e32 v3, v2
363; SI-NEXT:    s_setpc_b64 s[30:31]
364;
365; GFX9-LABEL: vec_8xf16_extract_4xf16:
366; GFX9:       ; %bb.0:
367; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
368; GFX9-NEXT:    s_cbranch_scc0 .LBB2_2
369; GFX9-NEXT:  ; %bb.1: ; %F
370; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off glc
371; GFX9-NEXT:    s_waitcnt vmcnt(0)
372; GFX9-NEXT:    s_cbranch_execz .LBB2_3
373; GFX9-NEXT:    s_branch .LBB2_4
374; GFX9-NEXT:  .LBB2_2:
375; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
376; GFX9-NEXT:  .LBB2_3: ; %T
377; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
378; GFX9-NEXT:    s_waitcnt vmcnt(0)
379; GFX9-NEXT:  .LBB2_4: ; %exit
380; GFX9-NEXT:    s_waitcnt vmcnt(0)
381; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v3
382; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
383; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3800
384; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3900
385; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3d00
386; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v0
387; GFX9-NEXT:    v_cndmask_b32_e32 v5, v3, v4, vcc
388; GFX9-NEXT:    v_cmp_nle_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
389; GFX9-NEXT:    v_cndmask_b32_e32 v6, v4, v3, vcc
390; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v2
391; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
392; GFX9-NEXT:    v_cmp_le_f16_sdwa vcc, v2, v1 src0_sel:WORD_1 src1_sel:DWORD
393; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
394; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
395; GFX9-NEXT:    v_pack_b32_f16 v1, v5, v6
396; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; The condition is undef, so either successor may be taken.
397  br i1 undef, label %T, label %F
398
399T:
  ; Volatile load of the whole 8-element vector from %p0.
400  %t = load volatile <8 x half>, <8 x half> addrspace(1) * %p0
401  br label %exit
402
403F:
  ; Volatile load of the whole 8-element vector from %p1.
404  %f = load volatile <8 x half>, <8 x half> addrspace(1) * %p1
405  br label %exit
406
407exit:
  ; %m is whichever vector was loaded on the path that was taken.
408  %m = phi <8 x half> [ %t, %T ], [ %f, %F ]
  ; Keep elements 0, 1 and 2; the last lane repeats element 2.
409  %v2 = shufflevector <8 x half> %m, <8 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ; Per-lane test against 0.5; `ugt` is also true when the element is NaN.
410  %b2 = fcmp ugt <4 x half> %v2, <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800>
  ; True lanes become 0xH3900, false lanes become 0xH3D00.
411  %r2 = select <4 x i1> %b2, <4 x half> <half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900>, <4 x half> <half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00>
412  ret <4 x half> %r2
413}
414
; Wider-source variant: extracts a <4 x i16> from a volatile <16 x i16> load
; with shuffle mask <0, 1, 2, 2> (elements 0..2, element 2 repeated in the last
; lane), then maps every lane to -32768 when it is greater than -1 and to -1
; otherwise.
; The vector is loaded from %p0 in block T or from %p1 in block F. The branch
; condition is undef -- presumably so that neither load can be folded away
; (TODO confirm intent).
; The assertions below expect sixteen buffer_load_ushort per path for the
; default (SI-prefixed) run line and two global_load_dwordx4 per path (one at
; offset:16, one at offset 0) for gfx900.
415define <4 x i16> @vec_16xi16_extract_4xi16(<16 x i16> addrspace(1) * %p0, <16 x i16> addrspace(1) * %p1) {
416;
417; SI-LABEL: vec_16xi16_extract_4xi16:
418; SI:       ; %bb.0:
419; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
420; SI-NEXT:    s_cbranch_scc0 .LBB3_2
421; SI-NEXT:  ; %bb.1: ; %F
422; SI-NEXT:    s_mov_b32 s6, 0
423; SI-NEXT:    s_mov_b32 s7, 0xf000
424; SI-NEXT:    s_mov_b32 s4, s6
425; SI-NEXT:    s_mov_b32 s5, s6
426; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
427; SI-NEXT:    s_waitcnt vmcnt(0)
428; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
429; SI-NEXT:    s_waitcnt vmcnt(0)
430; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
431; SI-NEXT:    s_waitcnt vmcnt(0)
432; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
433; SI-NEXT:    s_waitcnt vmcnt(0)
434; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
435; SI-NEXT:    s_waitcnt vmcnt(0)
436; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
437; SI-NEXT:    s_waitcnt vmcnt(0)
438; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
439; SI-NEXT:    s_waitcnt vmcnt(0)
440; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:14 glc
441; SI-NEXT:    s_waitcnt vmcnt(0)
442; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc
443; SI-NEXT:    s_waitcnt vmcnt(0)
444; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc
445; SI-NEXT:    s_waitcnt vmcnt(0)
446; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:20 glc
447; SI-NEXT:    s_waitcnt vmcnt(0)
448; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 glc
449; SI-NEXT:    s_waitcnt vmcnt(0)
450; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:24 glc
451; SI-NEXT:    s_waitcnt vmcnt(0)
452; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:26 glc
453; SI-NEXT:    s_waitcnt vmcnt(0)
454; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc
455; SI-NEXT:    s_waitcnt vmcnt(0)
456; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
457; SI-NEXT:    s_waitcnt vmcnt(0)
458; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
459; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
460; SI-NEXT:    v_or_b32_e32 v2, v6, v2
461; SI-NEXT:    v_or_b32_e32 v3, v4, v3
462; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
463; SI-NEXT:    s_mov_b64 vcc, exec
464; SI-NEXT:    s_cbranch_execz .LBB3_3
465; SI-NEXT:    s_branch .LBB3_4
466; SI-NEXT:  .LBB3_2:
467; SI-NEXT:    ; implicit-def: $vgpr3
468; SI-NEXT:    ; implicit-def: $vgpr4
469; SI-NEXT:    ; implicit-def: $vgpr2
470; SI-NEXT:    s_mov_b64 vcc, 0
471; SI-NEXT:  .LBB3_3: ; %T
472; SI-NEXT:    s_mov_b32 s6, 0
473; SI-NEXT:    s_mov_b32 s7, 0xf000
474; SI-NEXT:    s_mov_b32 s4, s6
475; SI-NEXT:    s_mov_b32 s5, s6
476; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc
477; SI-NEXT:    s_waitcnt vmcnt(0)
478; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
479; SI-NEXT:    s_waitcnt vmcnt(0)
480; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
481; SI-NEXT:    s_waitcnt vmcnt(0)
482; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
483; SI-NEXT:    s_waitcnt vmcnt(0)
484; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
485; SI-NEXT:    s_waitcnt vmcnt(0)
486; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
487; SI-NEXT:    s_waitcnt vmcnt(0)
488; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
489; SI-NEXT:    s_waitcnt vmcnt(0)
490; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:14 glc
491; SI-NEXT:    s_waitcnt vmcnt(0)
492; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc
493; SI-NEXT:    s_waitcnt vmcnt(0)
494; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc
495; SI-NEXT:    s_waitcnt vmcnt(0)
496; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc
497; SI-NEXT:    s_waitcnt vmcnt(0)
498; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc
499; SI-NEXT:    s_waitcnt vmcnt(0)
500; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc
501; SI-NEXT:    s_waitcnt vmcnt(0)
502; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc
503; SI-NEXT:    s_waitcnt vmcnt(0)
504; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc
505; SI-NEXT:    s_waitcnt vmcnt(0)
506; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
507; SI-NEXT:    s_waitcnt vmcnt(0)
508; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
509; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
510; SI-NEXT:    v_or_b32_e32 v2, v4, v0
511; SI-NEXT:    v_or_b32_e32 v3, v3, v1
512; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
513; SI-NEXT:  .LBB3_4: ; %exit
514; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
515; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
516; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
517; SI-NEXT:    v_mov_b32_e32 v3, 0xffff
518; SI-NEXT:    v_mov_b32_e32 v4, 0x8000
519; SI-NEXT:    v_mov_b32_e32 v5, 0xffff0000
520; SI-NEXT:    v_bfrev_b32_e32 v6, 1
521; SI-NEXT:    v_mov_b32_e32 v7, 0xffff8000
522; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
523; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
524; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
525; SI-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
526; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
527; SI-NEXT:    v_cndmask_b32_e32 v2, -1, v7, vcc
528; SI-NEXT:    v_or_b32_e32 v0, v0, v1
529; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
530; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
531; SI-NEXT:    v_or_b32_e32 v2, v2, v3
532; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
533; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
534; SI-NEXT:    s_setpc_b64 s[30:31]
535;
536; GFX9-LABEL: vec_16xi16_extract_4xi16:
537; GFX9:       ; %bb.0:
538; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
539; GFX9-NEXT:    s_cbranch_scc0 .LBB3_2
540; GFX9-NEXT:  ; %bb.1: ; %F
541; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
542; GFX9-NEXT:    s_waitcnt vmcnt(0)
543; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off glc
544; GFX9-NEXT:    s_waitcnt vmcnt(0)
545; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
546; GFX9-NEXT:    s_cbranch_execz .LBB3_3
547; GFX9-NEXT:    s_branch .LBB3_4
548; GFX9-NEXT:  .LBB3_2:
549; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
550; GFX9-NEXT:  .LBB3_3: ; %T
551; GFX9-NEXT:    s_waitcnt vmcnt(0)
552; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
553; GFX9-NEXT:    s_waitcnt vmcnt(0)
554; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off glc
555; GFX9-NEXT:    s_waitcnt vmcnt(0)
556; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
557; GFX9-NEXT:  .LBB3_4: ; %exit
558; GFX9-NEXT:    s_waitcnt vmcnt(0)
559; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,0]
560; GFX9-NEXT:    s_movk_i32 s4, 0x8000
561; GFX9-NEXT:    v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
562; GFX9-NEXT:    v_or_b32_e32 v2, 0xffff8000, v0
563; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
564; GFX9-NEXT:    v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
565; GFX9-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
566; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
567; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
568; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
569; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
570; GFX9-NEXT:    s_setpc_b64 s[30:31]
  ; The condition is undef, so either successor may be taken.
571  br i1 undef, label %T, label %F
572
573T:
  ; Volatile load of the whole 16-element vector from %p0.
574  %t = load volatile <16 x i16>, <16 x i16> addrspace(1) * %p0
575  br label %exit
576
577F:
  ; Volatile load of the whole 16-element vector from %p1.
578  %f = load volatile <16 x i16>, <16 x i16> addrspace(1) * %p1
579  br label %exit
580
581exit:
  ; %m is whichever vector was loaded on the path that was taken.
582  %m = phi <16 x i16> [ %t, %T ], [ %f, %F ]
  ; Keep elements 0, 1 and 2; the last lane repeats element 2.
583  %v2 = shufflevector <16 x i16> %m, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ; Per-lane signed test: true when the element is > -1 (i.e. non-negative).
584  %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
  ; True lanes become -32768, false lanes become -1.
585  %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
586  ret <4 x i16> %r2
587}
588
589define <4 x i16> @vec_16xi16_extract_4xi16_2(<16 x i16> addrspace(1) * %p0, <16 x i16> addrspace(1) * %p1) {
590;
591; SI-LABEL: vec_16xi16_extract_4xi16_2:
592; SI:       ; %bb.0:
593; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
594; SI-NEXT:    s_cbranch_scc0 .LBB4_2
595; SI-NEXT:  ; %bb.1: ; %F
596; SI-NEXT:    s_mov_b32 s6, 0
597; SI-NEXT:    s_mov_b32 s7, 0xf000
598; SI-NEXT:    s_mov_b32 s4, s6
599; SI-NEXT:    s_mov_b32 s5, s6
600; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
601; SI-NEXT:    s_waitcnt vmcnt(0)
602; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
603; SI-NEXT:    s_waitcnt vmcnt(0)
604; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:4 glc
605; SI-NEXT:    s_waitcnt vmcnt(0)
606; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
607; SI-NEXT:    s_waitcnt vmcnt(0)
608; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
609; SI-NEXT:    s_waitcnt vmcnt(0)
610; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
611; SI-NEXT:    s_waitcnt vmcnt(0)
612; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
613; SI-NEXT:    s_waitcnt vmcnt(0)
614; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:14 glc
615; SI-NEXT:    s_waitcnt vmcnt(0)
616; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc
617; SI-NEXT:    s_waitcnt vmcnt(0)
618; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc
619; SI-NEXT:    s_waitcnt vmcnt(0)
620; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:20 glc
621; SI-NEXT:    s_waitcnt vmcnt(0)
622; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 glc
623; SI-NEXT:    s_waitcnt vmcnt(0)
624; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:24 glc
625; SI-NEXT:    s_waitcnt vmcnt(0)
626; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:26 glc
627; SI-NEXT:    s_waitcnt vmcnt(0)
628; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc
629; SI-NEXT:    s_waitcnt vmcnt(0)
630; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
631; SI-NEXT:    s_waitcnt vmcnt(0)
632; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
633; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
634; SI-NEXT:    v_or_b32_e32 v2, v6, v2
635; SI-NEXT:    v_or_b32_e32 v3, v4, v3
636; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
637; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
638; SI-NEXT:    s_mov_b64 vcc, exec
639; SI-NEXT:    s_cbranch_execz .LBB4_3
640; SI-NEXT:    s_branch .LBB4_4
641; SI-NEXT:  .LBB4_2:
642; SI-NEXT:    ; implicit-def: $vgpr3
643; SI-NEXT:    ; implicit-def: $vgpr5
644; SI-NEXT:    ; implicit-def: $vgpr2
645; SI-NEXT:    ; implicit-def: $vgpr4
646; SI-NEXT:    s_mov_b64 vcc, 0
647; SI-NEXT:  .LBB4_3: ; %T
648; SI-NEXT:    s_mov_b32 s6, 0
649; SI-NEXT:    s_mov_b32 s7, 0xf000
650; SI-NEXT:    s_mov_b32 s4, s6
651; SI-NEXT:    s_mov_b32 s5, s6
652; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
653; SI-NEXT:    s_waitcnt vmcnt(0)
654; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
655; SI-NEXT:    s_waitcnt vmcnt(0)
656; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
657; SI-NEXT:    s_waitcnt vmcnt(0)
658; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
659; SI-NEXT:    s_waitcnt vmcnt(0)
660; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:8 glc
661; SI-NEXT:    s_waitcnt vmcnt(0)
662; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:10 glc
663; SI-NEXT:    s_waitcnt vmcnt(0)
664; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
665; SI-NEXT:    s_waitcnt vmcnt(0)
666; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:14 glc
667; SI-NEXT:    s_waitcnt vmcnt(0)
668; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc
669; SI-NEXT:    s_waitcnt vmcnt(0)
670; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc
671; SI-NEXT:    s_waitcnt vmcnt(0)
672; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc
673; SI-NEXT:    s_waitcnt vmcnt(0)
674; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc
675; SI-NEXT:    s_waitcnt vmcnt(0)
676; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc
677; SI-NEXT:    s_waitcnt vmcnt(0)
678; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc
679; SI-NEXT:    s_waitcnt vmcnt(0)
680; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc
681; SI-NEXT:    s_waitcnt vmcnt(0)
682; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
683; SI-NEXT:    s_waitcnt vmcnt(0)
684; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
685; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
686; SI-NEXT:    v_or_b32_e32 v2, v4, v0
687; SI-NEXT:    v_or_b32_e32 v3, v3, v1
688; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
689; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
690; SI-NEXT:  .LBB4_4: ; %exit
691; SI-NEXT:    v_bfe_i32 v0, v3, 0, 16
692; SI-NEXT:    v_bfe_i32 v1, v5, 0, 16
693; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
694; SI-NEXT:    v_bfe_i32 v3, v4, 0, 16
695; SI-NEXT:    v_mov_b32_e32 v4, 0xffff
696; SI-NEXT:    v_mov_b32_e32 v5, 0x8000
697; SI-NEXT:    v_mov_b32_e32 v6, 0xffff0000
698; SI-NEXT:    v_bfrev_b32_e32 v7, 1
699; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v0
700; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
701; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v1
702; SI-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
703; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v2
704; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
705; SI-NEXT:    v_cmp_lt_i32_e32 vcc, -1, v3
706; SI-NEXT:    v_cndmask_b32_e32 v3, v6, v7, vcc
707; SI-NEXT:    v_or_b32_e32 v0, v0, v1
708; SI-NEXT:    v_or_b32_e32 v2, v2, v3
709; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
710; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
711; SI-NEXT:    s_setpc_b64 s[30:31]
712;
713; GFX9-LABEL: vec_16xi16_extract_4xi16_2:
714; GFX9:       ; %bb.0:
715; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
716; GFX9-NEXT:    s_cbranch_scc0 .LBB4_2
717; GFX9-NEXT:  ; %bb.1: ; %F
718; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
719; GFX9-NEXT:    s_waitcnt vmcnt(0)
720; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off glc
721; GFX9-NEXT:    s_waitcnt vmcnt(0)
722; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
723; GFX9-NEXT:    s_cbranch_execz .LBB4_3
724; GFX9-NEXT:    s_branch .LBB4_4
725; GFX9-NEXT:  .LBB4_2:
726; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
727; GFX9-NEXT:  .LBB4_3: ; %T
728; GFX9-NEXT:    s_waitcnt vmcnt(0)
729; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
730; GFX9-NEXT:    s_waitcnt vmcnt(0)
731; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off glc
732; GFX9-NEXT:    s_waitcnt vmcnt(0)
733; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
734; GFX9-NEXT:  .LBB4_4: ; %exit
735; GFX9-NEXT:    s_waitcnt vmcnt(0)
736; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v7 op_sel_hi:[0,1]
737; GFX9-NEXT:    s_movk_i32 s4, 0x8000
738; GFX9-NEXT:    v_or_b32_sdwa v1, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
739; GFX9-NEXT:    v_or_b32_e32 v2, 0xffff8000, v0
740; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 15, v6 op_sel_hi:[0,1]
741; GFX9-NEXT:    v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
742; GFX9-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
743; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
744; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
745; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
746; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
747; GFX9-NEXT:    s_setpc_b64 s[30:31]
748  br i1 undef, label %T, label %F
749
750T:
751  %t = load volatile <16 x i16>, <16 x i16> addrspace(1) * %p0
752  br label %exit
753
754F:
755  %f = load volatile <16 x i16>, <16 x i16> addrspace(1) * %p1
756  br label %exit
757
758exit:
759  %m = phi <16 x i16> [ %t, %T ], [ %f, %F ]
760  %v2 = shufflevector <16 x i16> %m, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
761  %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
762  %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
763  ret <4 x i16> %r2
764}
765
; Test function vec_16xf16_extract_4xf16.
; Takes two addrspace(1) pointers to <16 x half>. One of them is read with a
; volatile load (%p0 in block T, %p1 in block F), the two loaded values are
; merged by a phi, and source lanes 0, 1, 2, 2 are shuffled out into a
; <4 x half>. Each lane is compared with `fcmp ugt` against 0xH3800, and the
; function returns 0xH3900 in lanes where the compare holds and 0xH3D00 in the
; other lanes.
; The expected-output lines below are matched under the two check prefixes
; named on the RUN lines at the top of the file. Per the NOTE on the first line
; of the file they are produced by utils/update_llc_test_checks.py, so they
; should be regenerated with that script rather than edited by hand.
766define <4 x half> @vec_16xf16_extract_4xf16(<16 x half> addrspace(1) * %p0, <16 x half> addrspace(1) * %p1) {
767;
768; SI-LABEL: vec_16xf16_extract_4xf16:
769; SI:       ; %bb.0:
770; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
771; SI-NEXT:    s_cbranch_scc0 .LBB5_2
772; SI-NEXT:  ; %bb.1: ; %F
773; SI-NEXT:    s_mov_b32 s6, 0
774; SI-NEXT:    s_mov_b32 s7, 0xf000
775; SI-NEXT:    s_mov_b32 s4, s6
776; SI-NEXT:    s_mov_b32 s5, s6
777; SI-NEXT:    buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
778; SI-NEXT:    s_waitcnt vmcnt(0)
779; SI-NEXT:    buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
780; SI-NEXT:    s_waitcnt vmcnt(0)
781; SI-NEXT:    buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
782; SI-NEXT:    s_waitcnt vmcnt(0)
783; SI-NEXT:    buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
784; SI-NEXT:    s_waitcnt vmcnt(0)
785; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
786; SI-NEXT:    s_waitcnt vmcnt(0)
787; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
788; SI-NEXT:    s_waitcnt vmcnt(0)
789; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
790; SI-NEXT:    s_waitcnt vmcnt(0)
791; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:14 glc
792; SI-NEXT:    s_waitcnt vmcnt(0)
793; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc
794; SI-NEXT:    s_waitcnt vmcnt(0)
795; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc
796; SI-NEXT:    s_waitcnt vmcnt(0)
797; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:20 glc
798; SI-NEXT:    s_waitcnt vmcnt(0)
799; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 glc
800; SI-NEXT:    s_waitcnt vmcnt(0)
801; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:24 glc
802; SI-NEXT:    s_waitcnt vmcnt(0)
803; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:26 glc
804; SI-NEXT:    s_waitcnt vmcnt(0)
805; SI-NEXT:    buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc
806; SI-NEXT:    s_waitcnt vmcnt(0)
807; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
808; SI-NEXT:    s_waitcnt vmcnt(0)
809; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
810; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
811; SI-NEXT:    v_or_b32_e32 v2, v6, v2
812; SI-NEXT:    v_or_b32_e32 v4, v4, v3
813; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
814; SI-NEXT:    v_cvt_f32_f16_e32 v3, v4
815; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
816; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
817; SI-NEXT:    s_mov_b64 vcc, exec
818; SI-NEXT:    s_cbranch_execz .LBB5_3
819; SI-NEXT:    s_branch .LBB5_4
820; SI-NEXT:  .LBB5_2:
821; SI-NEXT:    ; implicit-def: $vgpr3
822; SI-NEXT:    ; implicit-def: $vgpr4
823; SI-NEXT:    ; implicit-def: $vgpr2
824; SI-NEXT:    s_mov_b64 vcc, 0
825; SI-NEXT:  .LBB5_3: ; %T
826; SI-NEXT:    s_mov_b32 s6, 0
827; SI-NEXT:    s_mov_b32 s7, 0xf000
828; SI-NEXT:    s_mov_b32 s4, s6
829; SI-NEXT:    s_mov_b32 s5, s6
830; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
831; SI-NEXT:    s_waitcnt vmcnt(0)
832; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
833; SI-NEXT:    s_waitcnt vmcnt(0)
834; SI-NEXT:    buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
835; SI-NEXT:    s_waitcnt vmcnt(0)
836; SI-NEXT:    buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
837; SI-NEXT:    s_waitcnt vmcnt(0)
838; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
839; SI-NEXT:    s_waitcnt vmcnt(0)
840; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
841; SI-NEXT:    s_waitcnt vmcnt(0)
842; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
843; SI-NEXT:    s_waitcnt vmcnt(0)
844; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:14 glc
845; SI-NEXT:    s_waitcnt vmcnt(0)
846; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc
847; SI-NEXT:    s_waitcnt vmcnt(0)
848; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc
849; SI-NEXT:    s_waitcnt vmcnt(0)
850; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc
851; SI-NEXT:    s_waitcnt vmcnt(0)
852; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc
853; SI-NEXT:    s_waitcnt vmcnt(0)
854; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc
855; SI-NEXT:    s_waitcnt vmcnt(0)
856; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc
857; SI-NEXT:    s_waitcnt vmcnt(0)
858; SI-NEXT:    buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc
859; SI-NEXT:    s_waitcnt vmcnt(0)
860; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
861; SI-NEXT:    s_waitcnt vmcnt(0)
862; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
863; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
864; SI-NEXT:    v_or_b32_e32 v0, v4, v0
865; SI-NEXT:    v_or_b32_e32 v1, v2, v1
866; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
867; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
868; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
869; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
870; SI-NEXT:  .LBB5_4: ; %exit
871; SI-NEXT:    v_cvt_f16_f32_e32 v0, v3
872; SI-NEXT:    v_cvt_f16_f32_e32 v1, v4
873; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
874; SI-NEXT:    v_mov_b32_e32 v3, 0x3fa00000
875; SI-NEXT:    v_mov_b32_e32 v4, 0x3f200000
876; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
877; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
878; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
879; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v0
880; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
881; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v1
882; SI-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
883; SI-NEXT:    v_cmp_nge_f32_e32 vcc, 0.5, v2
884; SI-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
885; SI-NEXT:    v_mov_b32_e32 v3, v2
886; SI-NEXT:    s_setpc_b64 s[30:31]
887;
888; GFX9-LABEL: vec_16xf16_extract_4xf16:
889; GFX9:       ; %bb.0:
890; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
891; GFX9-NEXT:    s_cbranch_scc0 .LBB5_2
892; GFX9-NEXT:  ; %bb.1: ; %F
893; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
894; GFX9-NEXT:    s_waitcnt vmcnt(0)
895; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off glc
896; GFX9-NEXT:    s_waitcnt vmcnt(0)
897; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
898; GFX9-NEXT:    s_cbranch_execz .LBB5_3
899; GFX9-NEXT:    s_branch .LBB5_4
900; GFX9-NEXT:  .LBB5_2:
901; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
902; GFX9-NEXT:  .LBB5_3: ; %T
903; GFX9-NEXT:    s_waitcnt vmcnt(0)
904; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
905; GFX9-NEXT:    s_waitcnt vmcnt(0)
906; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off glc
907; GFX9-NEXT:    s_waitcnt vmcnt(0)
908; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
909; GFX9-NEXT:  .LBB5_4: ; %exit
910; GFX9-NEXT:    s_waitcnt vmcnt(0)
911; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v5
912; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
913; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3800
914; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3900
915; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3d00
916; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v0
917; GFX9-NEXT:    v_cndmask_b32_e32 v5, v2, v3, vcc
918; GFX9-NEXT:    v_cmp_nle_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
919; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v2, vcc
920; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v4
921; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
922; GFX9-NEXT:    v_cmp_le_f16_sdwa vcc, v4, v1 src0_sel:WORD_1 src1_sel:DWORD
923; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
924; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
925; GFX9-NEXT:    v_pack_b32_f16 v1, v5, v6
926; GFX9-NEXT:    s_setpc_b64 s[30:31]
; The branch condition is undef, so no particular successor is selected by the
; IR; the expected output above contains code for both the %T and %F blocks.
927  br i1 undef, label %T, label %F
928
929T:
; T path: volatile load of the whole 16-element vector through %p0.
930  %t = load volatile <16 x half>, <16 x half> addrspace(1) * %p0
931  br label %exit
932
933F:
; F path: volatile load of the whole 16-element vector through %p1.
934  %f = load volatile <16 x half>, <16 x half> addrspace(1) * %p1
935  br label %exit
936
937exit:
; Merge whichever vector was loaded on the incoming path.
938  %m = phi <16 x half> [ %t, %T ], [ %f, %F ]
; Build the 4-element result source from lanes 0, 1, 2 and lane 2 again; the
; second shuffle operand is undef and the mask never indexes into it.
939  %v2 = shufflevector <16 x half> %m, <16 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; Per-lane unordered-or-greater-than compare against 0xH3800, which appears as
; the 0.5 operand of the compare instructions in the expected output above.
940  %b2 = fcmp ugt <4 x half> %v2, <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800>
; Lanes where the compare is true get 0xH3900; all other lanes get 0xH3D00.
941  %r2 = select <4 x i1> %b2, <4 x half> <half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900>, <4 x half> <half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00>
942  ret <4 x half> %r2
943}
944