1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
3
; Insert the constant float 1.0 into <4 x float> %vec at the run-time index
; %sel and store the resulting vector to %out.
; The expected ISA below contains no indirect register write for this width:
; the index register is compared against each lane number (3, 2, 1, 0) with
; s_cmp_lg_u32, and one v_cndmask_b32 per lane either keeps the original
; element or substitutes 1.0.
4define amdgpu_kernel void @float4_inselt(<4 x float> addrspace(1)* %out, <4 x float> %vec, i32 %sel) {
5; GCN-LABEL: float4_inselt:
6; GCN:       ; %bb.0: ; %entry
7; GCN-NEXT:    s_load_dword s2, s[0:1], 0x44
8; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
9; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
10; GCN-NEXT:    s_waitcnt lgkmcnt(0)
11; GCN-NEXT:    s_cmp_lg_u32 s2, 3
12; GCN-NEXT:    v_mov_b32_e32 v0, s7
13; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
14; GCN-NEXT:    s_cmp_lg_u32 s2, 2
15; GCN-NEXT:    v_cndmask_b32_e32 v3, 1.0, v0, vcc
16; GCN-NEXT:    v_mov_b32_e32 v0, s6
17; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
18; GCN-NEXT:    s_cmp_lg_u32 s2, 1
19; GCN-NEXT:    v_cndmask_b32_e32 v2, 1.0, v0, vcc
20; GCN-NEXT:    v_mov_b32_e32 v0, s5
21; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
22; GCN-NEXT:    s_cmp_lg_u32 s2, 0
23; GCN-NEXT:    v_cndmask_b32_e32 v1, 1.0, v0, vcc
24; GCN-NEXT:    v_mov_b32_e32 v0, s4
25; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
26; GCN-NEXT:    v_mov_b32_e32 v5, s1
27; GCN-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
28; GCN-NEXT:    v_mov_b32_e32 v4, s0
29; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
30; GCN-NEXT:    s_endpgm
31entry:
32  %v = insertelement <4 x float> %vec, float 1.000000e+00, i32 %sel
33  store <4 x float> %v, <4 x float> addrspace(1)* %out
34  ret void
35}
36
; Insert float 1.0 at the run-time index %sel into an undef <4 x float> and
; store the result to %out.
; Because the source vector is undef, the expected ISA never loads or compares
; the index: it only loads %out and writes 1.0 into all four result lanes.
37define amdgpu_kernel void @float4_inselt_undef(<4 x float> addrspace(1)* %out, i32 %sel) {
38; GCN-LABEL: float4_inselt_undef:
39; GCN:       ; %bb.0: ; %entry
40; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
41; GCN-NEXT:    v_mov_b32_e32 v0, 1.0
42; GCN-NEXT:    v_mov_b32_e32 v1, v0
43; GCN-NEXT:    v_mov_b32_e32 v2, v0
44; GCN-NEXT:    v_mov_b32_e32 v3, v0
45; GCN-NEXT:    s_waitcnt lgkmcnt(0)
46; GCN-NEXT:    v_mov_b32_e32 v5, s1
47; GCN-NEXT:    v_mov_b32_e32 v4, s0
48; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
49; GCN-NEXT:    s_endpgm
50entry:
51  %v = insertelement <4 x float> undef, float 1.000000e+00, i32 %sel
52  store <4 x float> %v, <4 x float> addrspace(1)* %out
53  ret void
54}
55
; Integer counterpart of the <4 x float> case: insert i32 1 into <4 x i32> %vec
; at the run-time index %sel and store the result to %out.
; The expected ISA performs the per-lane selection with scalar instructions
; (s_cmp_lg_u32 followed by s_cselect_b32 <lane>, 1) and only afterwards copies
; the four selected values into VGPRs for the flat store.
56define amdgpu_kernel void @int4_inselt(<4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %sel) {
57; GCN-LABEL: int4_inselt:
58; GCN:       ; %bb.0: ; %entry
59; GCN-NEXT:    s_load_dword s2, s[0:1], 0x44
60; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
61; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
62; GCN-NEXT:    s_waitcnt lgkmcnt(0)
63; GCN-NEXT:    s_cmp_lg_u32 s2, 3
64; GCN-NEXT:    s_cselect_b32 s3, s7, 1
65; GCN-NEXT:    s_cmp_lg_u32 s2, 2
66; GCN-NEXT:    s_cselect_b32 s6, s6, 1
67; GCN-NEXT:    s_cmp_lg_u32 s2, 1
68; GCN-NEXT:    s_cselect_b32 s5, s5, 1
69; GCN-NEXT:    s_cmp_lg_u32 s2, 0
70; GCN-NEXT:    s_cselect_b32 s2, s4, 1
71; GCN-NEXT:    v_mov_b32_e32 v5, s1
72; GCN-NEXT:    v_mov_b32_e32 v0, s2
73; GCN-NEXT:    v_mov_b32_e32 v1, s5
74; GCN-NEXT:    v_mov_b32_e32 v2, s6
75; GCN-NEXT:    v_mov_b32_e32 v3, s3
76; GCN-NEXT:    v_mov_b32_e32 v4, s0
77; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
78; GCN-NEXT:    s_endpgm
79entry:
80  %v = insertelement <4 x i32> %vec, i32 1, i32 %sel
81  store <4 x i32> %v, <4 x i32> addrspace(1)* %out
82  ret void
83}
84
; Insert float 1.0 into <2 x float> %vec at the run-time index %sel and store
; the result to %out.
; The expected ISA uses the compare-and-select pattern with two lanes: the
; index is tested against 1 and 0, and a v_cndmask_b32 per lane picks the
; original element or 1.0.
85define amdgpu_kernel void @float2_inselt(<2 x float> addrspace(1)* %out, <2 x float> %vec, i32 %sel) {
86; GCN-LABEL: float2_inselt:
87; GCN:       ; %bb.0: ; %entry
88; GCN-NEXT:    s_load_dword s4, s[0:1], 0x34
89; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
90; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
91; GCN-NEXT:    s_waitcnt lgkmcnt(0)
92; GCN-NEXT:    s_cmp_lg_u32 s4, 1
93; GCN-NEXT:    v_mov_b32_e32 v0, s3
94; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
95; GCN-NEXT:    s_cmp_lg_u32 s4, 0
96; GCN-NEXT:    v_cndmask_b32_e32 v1, 1.0, v0, vcc
97; GCN-NEXT:    v_mov_b32_e32 v0, s2
98; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
99; GCN-NEXT:    v_mov_b32_e32 v3, s1
100; GCN-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
101; GCN-NEXT:    v_mov_b32_e32 v2, s0
102; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
103; GCN-NEXT:    s_endpgm
104entry:
105  %v = insertelement <2 x float> %vec, float 1.000000e+00, i32 %sel
106  store <2 x float> %v, <2 x float> addrspace(1)* %out
107  ret void
108}
109
; Insert float 1.0 into <8 x float> %vec at the run-time index %sel and store
; the result to %out.
; At eight lanes the expected ISA is still a chain of compare-and-select pairs
; (s_cmp_lg_u32 against 0..7, one v_cndmask_b32 per lane). The 32-byte result
; is written with two flat_store_dwordx4 instructions: lanes 4-7 at %out + 16
; first, then lanes 0-3 at %out.
110define amdgpu_kernel void @float8_inselt(<8 x float> addrspace(1)* %out, <8 x float> %vec, i32 %sel) {
111; GCN-LABEL: float8_inselt:
112; GCN:       ; %bb.0: ; %entry
113; GCN-NEXT:    s_load_dword s2, s[0:1], 0x64
114; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x44
115; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
116; GCN-NEXT:    s_waitcnt lgkmcnt(0)
117; GCN-NEXT:    s_cmp_lg_u32 s2, 3
118; GCN-NEXT:    v_mov_b32_e32 v0, s7
119; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
120; GCN-NEXT:    s_cmp_lg_u32 s2, 2
121; GCN-NEXT:    v_cndmask_b32_e32 v3, 1.0, v0, vcc
122; GCN-NEXT:    v_mov_b32_e32 v0, s6
123; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
124; GCN-NEXT:    s_cmp_lg_u32 s2, 1
125; GCN-NEXT:    v_cndmask_b32_e32 v2, 1.0, v0, vcc
126; GCN-NEXT:    v_mov_b32_e32 v0, s5
127; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
128; GCN-NEXT:    s_cmp_lg_u32 s2, 0
129; GCN-NEXT:    v_cndmask_b32_e32 v1, 1.0, v0, vcc
130; GCN-NEXT:    v_mov_b32_e32 v0, s4
131; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
132; GCN-NEXT:    s_cmp_lg_u32 s2, 7
133; GCN-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
134; GCN-NEXT:    v_mov_b32_e32 v4, s11
135; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
136; GCN-NEXT:    s_cmp_lg_u32 s2, 6
137; GCN-NEXT:    v_cndmask_b32_e32 v7, 1.0, v4, vcc
138; GCN-NEXT:    v_mov_b32_e32 v4, s10
139; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
140; GCN-NEXT:    s_cmp_lg_u32 s2, 5
141; GCN-NEXT:    v_cndmask_b32_e32 v6, 1.0, v4, vcc
142; GCN-NEXT:    v_mov_b32_e32 v4, s9
143; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
144; GCN-NEXT:    s_cmp_lg_u32 s2, 4
145; GCN-NEXT:    v_cndmask_b32_e32 v5, 1.0, v4, vcc
146; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
147; GCN-NEXT:    s_add_u32 s2, s0, 16
148; GCN-NEXT:    s_addc_u32 s3, s1, 0
149; GCN-NEXT:    v_mov_b32_e32 v4, s8
150; GCN-NEXT:    v_mov_b32_e32 v9, s3
151; GCN-NEXT:    v_cndmask_b32_e32 v4, 1.0, v4, vcc
152; GCN-NEXT:    v_mov_b32_e32 v8, s2
153; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
154; GCN-NEXT:    s_nop 0
155; GCN-NEXT:    v_mov_b32_e32 v5, s1
156; GCN-NEXT:    v_mov_b32_e32 v4, s0
157; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
158; GCN-NEXT:    s_endpgm
159entry:
160  %v = insertelement <8 x float> %vec, float 1.000000e+00, i32 %sel
161  store <8 x float> %v, <8 x float> addrspace(1)* %out
162  ret void
163}
164
; Insert float 1.0 into <16 x float> %vec at the run-time index %sel and store
; the result to %out.
; At sixteen lanes the expected ISA no longer compares the index per lane.
; The whole vector is copied into v0..v15, the index is moved into m0, and a
; single v_movreld_b32 (M0-relative destination, base v0) writes 1.0 into the
; selected register. The result is stored as four flat_store_dwordx4, from
; %out + 48 down to %out.
165define amdgpu_kernel void @float16_inselt(<16 x float> addrspace(1)* %out, <16 x float> %vec, i32 %sel) {
166; GCN-LABEL: float16_inselt:
167; GCN:       ; %bb.0: ; %entry
168; GCN-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x64
169; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
170; GCN-NEXT:    s_load_dword s20, s[0:1], 0xa4
171; GCN-NEXT:    s_waitcnt lgkmcnt(0)
172; GCN-NEXT:    v_mov_b32_e32 v0, s4
173; GCN-NEXT:    s_add_u32 s0, s2, 48
174; GCN-NEXT:    s_addc_u32 s1, s3, 0
175; GCN-NEXT:    v_mov_b32_e32 v17, s1
176; GCN-NEXT:    v_mov_b32_e32 v1, s5
177; GCN-NEXT:    v_mov_b32_e32 v2, s6
178; GCN-NEXT:    v_mov_b32_e32 v3, s7
179; GCN-NEXT:    v_mov_b32_e32 v4, s8
180; GCN-NEXT:    v_mov_b32_e32 v5, s9
181; GCN-NEXT:    v_mov_b32_e32 v6, s10
182; GCN-NEXT:    v_mov_b32_e32 v7, s11
183; GCN-NEXT:    v_mov_b32_e32 v8, s12
184; GCN-NEXT:    v_mov_b32_e32 v9, s13
185; GCN-NEXT:    v_mov_b32_e32 v10, s14
186; GCN-NEXT:    v_mov_b32_e32 v11, s15
187; GCN-NEXT:    v_mov_b32_e32 v12, s16
188; GCN-NEXT:    v_mov_b32_e32 v13, s17
189; GCN-NEXT:    v_mov_b32_e32 v14, s18
190; GCN-NEXT:    v_mov_b32_e32 v15, s19
191; GCN-NEXT:    s_mov_b32 m0, s20
192; GCN-NEXT:    v_mov_b32_e32 v16, s0
193; GCN-NEXT:    s_add_u32 s0, s2, 32
194; GCN-NEXT:    v_movreld_b32_e32 v0, 1.0
195; GCN-NEXT:    s_addc_u32 s1, s3, 0
196; GCN-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
197; GCN-NEXT:    s_nop 0
198; GCN-NEXT:    v_mov_b32_e32 v13, s1
199; GCN-NEXT:    v_mov_b32_e32 v12, s0
200; GCN-NEXT:    s_add_u32 s0, s2, 16
201; GCN-NEXT:    s_addc_u32 s1, s3, 0
202; GCN-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
203; GCN-NEXT:    s_nop 0
204; GCN-NEXT:    v_mov_b32_e32 v9, s1
205; GCN-NEXT:    v_mov_b32_e32 v8, s0
206; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
207; GCN-NEXT:    s_nop 0
208; GCN-NEXT:    v_mov_b32_e32 v5, s3
209; GCN-NEXT:    v_mov_b32_e32 v4, s2
210; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
211; GCN-NEXT:    s_endpgm
212entry:
213  %v = insertelement <16 x float> %vec, float 1.000000e+00, i32 %sel
214  store <16 x float> %v, <16 x float> addrspace(1)* %out
215  ret void
216}
217
; Insert float 1.0 into <32 x float> %vec at the run-time index %sel and store
; the result to %out.
; The expected ISA uses the same m0 + v_movreld_b32 scheme as the 16-lane
; case: the vector arrives via two s_load_dwordx16, is copied into v0..v31,
; the index goes into m0, and one v_movreld_b32 (base v0) writes 1.0. The
; 128-byte result is stored as eight flat_store_dwordx4, from %out + 0x70 down
; to %out.
218define amdgpu_kernel void @float32_inselt(<32 x float> addrspace(1)* %out, <32 x float> %vec, i32 %sel) {
219; GCN-LABEL: float32_inselt:
220; GCN:       ; %bb.0: ; %entry
221; GCN-NEXT:    s_load_dwordx16 s[36:51], s[0:1], 0xa4
222; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
223; GCN-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0xe4
224; GCN-NEXT:    s_load_dword s0, s[0:1], 0x124
225; GCN-NEXT:    s_waitcnt lgkmcnt(0)
226; GCN-NEXT:    v_mov_b32_e32 v0, s36
227; GCN-NEXT:    v_mov_b32_e32 v1, s37
228; GCN-NEXT:    v_mov_b32_e32 v2, s38
229; GCN-NEXT:    s_mov_b32 m0, s0
230; GCN-NEXT:    s_add_u32 s0, s2, 0x70
231; GCN-NEXT:    s_addc_u32 s1, s3, 0
232; GCN-NEXT:    v_mov_b32_e32 v33, s1
233; GCN-NEXT:    v_mov_b32_e32 v3, s39
234; GCN-NEXT:    v_mov_b32_e32 v4, s40
235; GCN-NEXT:    v_mov_b32_e32 v5, s41
236; GCN-NEXT:    v_mov_b32_e32 v6, s42
237; GCN-NEXT:    v_mov_b32_e32 v7, s43
238; GCN-NEXT:    v_mov_b32_e32 v8, s44
239; GCN-NEXT:    v_mov_b32_e32 v9, s45
240; GCN-NEXT:    v_mov_b32_e32 v10, s46
241; GCN-NEXT:    v_mov_b32_e32 v11, s47
242; GCN-NEXT:    v_mov_b32_e32 v12, s48
243; GCN-NEXT:    v_mov_b32_e32 v13, s49
244; GCN-NEXT:    v_mov_b32_e32 v14, s50
245; GCN-NEXT:    v_mov_b32_e32 v15, s51
246; GCN-NEXT:    v_mov_b32_e32 v16, s4
247; GCN-NEXT:    v_mov_b32_e32 v17, s5
248; GCN-NEXT:    v_mov_b32_e32 v18, s6
249; GCN-NEXT:    v_mov_b32_e32 v19, s7
250; GCN-NEXT:    v_mov_b32_e32 v20, s8
251; GCN-NEXT:    v_mov_b32_e32 v21, s9
252; GCN-NEXT:    v_mov_b32_e32 v22, s10
253; GCN-NEXT:    v_mov_b32_e32 v23, s11
254; GCN-NEXT:    v_mov_b32_e32 v24, s12
255; GCN-NEXT:    v_mov_b32_e32 v25, s13
256; GCN-NEXT:    v_mov_b32_e32 v26, s14
257; GCN-NEXT:    v_mov_b32_e32 v27, s15
258; GCN-NEXT:    v_mov_b32_e32 v28, s16
259; GCN-NEXT:    v_mov_b32_e32 v29, s17
260; GCN-NEXT:    v_mov_b32_e32 v30, s18
261; GCN-NEXT:    v_mov_b32_e32 v31, s19
262; GCN-NEXT:    v_mov_b32_e32 v32, s0
263; GCN-NEXT:    s_add_u32 s0, s2, 0x60
264; GCN-NEXT:    v_movreld_b32_e32 v0, 1.0
265; GCN-NEXT:    s_addc_u32 s1, s3, 0
266; GCN-NEXT:    flat_store_dwordx4 v[32:33], v[28:31]
267; GCN-NEXT:    s_nop 0
268; GCN-NEXT:    v_mov_b32_e32 v29, s1
269; GCN-NEXT:    v_mov_b32_e32 v28, s0
270; GCN-NEXT:    s_add_u32 s0, s2, 0x50
271; GCN-NEXT:    s_addc_u32 s1, s3, 0
272; GCN-NEXT:    flat_store_dwordx4 v[28:29], v[24:27]
273; GCN-NEXT:    s_nop 0
274; GCN-NEXT:    v_mov_b32_e32 v25, s1
275; GCN-NEXT:    v_mov_b32_e32 v24, s0
276; GCN-NEXT:    s_add_u32 s0, s2, 64
277; GCN-NEXT:    s_addc_u32 s1, s3, 0
278; GCN-NEXT:    flat_store_dwordx4 v[24:25], v[20:23]
279; GCN-NEXT:    s_nop 0
280; GCN-NEXT:    v_mov_b32_e32 v21, s1
281; GCN-NEXT:    v_mov_b32_e32 v20, s0
282; GCN-NEXT:    s_add_u32 s0, s2, 48
283; GCN-NEXT:    s_addc_u32 s1, s3, 0
284; GCN-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
285; GCN-NEXT:    s_nop 0
286; GCN-NEXT:    v_mov_b32_e32 v17, s1
287; GCN-NEXT:    v_mov_b32_e32 v16, s0
288; GCN-NEXT:    s_add_u32 s0, s2, 32
289; GCN-NEXT:    s_addc_u32 s1, s3, 0
290; GCN-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
291; GCN-NEXT:    s_nop 0
292; GCN-NEXT:    v_mov_b32_e32 v13, s1
293; GCN-NEXT:    v_mov_b32_e32 v12, s0
294; GCN-NEXT:    s_add_u32 s0, s2, 16
295; GCN-NEXT:    s_addc_u32 s1, s3, 0
296; GCN-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
297; GCN-NEXT:    s_nop 0
298; GCN-NEXT:    v_mov_b32_e32 v9, s1
299; GCN-NEXT:    v_mov_b32_e32 v8, s0
300; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
301; GCN-NEXT:    s_nop 0
302; GCN-NEXT:    v_mov_b32_e32 v5, s3
303; GCN-NEXT:    v_mov_b32_e32 v4, s2
304; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
305; GCN-NEXT:    s_endpgm
306entry:
307  %v = insertelement <32 x float> %vec, float 1.000000e+00, i32 %sel
308  store <32 x float> %v, <32 x float> addrspace(1)* %out
309  ret void
310}
311
; Insert half 1.0 into <4 x half> %vec at the run-time index %sel and store
; the result to %out.
; The expected ISA treats the 64-bit vector as one scalar bit field:
;   mask   = 0xffff << (index << 4)          (s_lshl_b32 / s_lshl_b64)
;   result = (vec & ~mask) | (mask & splat)  (s_andn2_b64 / s_and_b64 / s_or_b64)
; where both dwords of the splat are 0x3c003c00 (0x3c00 is the IEEE half bit
; pattern of 1.0, repeated in each 16-bit lane).
312define amdgpu_kernel void @half4_inselt(<4 x half> addrspace(1)* %out, <4 x half> %vec, i32 %sel) {
313; GCN-LABEL: half4_inselt:
314; GCN:       ; %bb.0: ; %entry
315; GCN-NEXT:    s_load_dword s6, s[0:1], 0x34
316; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
317; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
318; GCN-NEXT:    s_mov_b64 s[4:5], 0xffff
319; GCN-NEXT:    s_waitcnt lgkmcnt(0)
320; GCN-NEXT:    s_lshl_b32 s6, s6, 4
321; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
322; GCN-NEXT:    s_mov_b32 s6, 0x3c003c00
323; GCN-NEXT:    s_mov_b32 s7, s6
324; GCN-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
325; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
326; GCN-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
327; GCN-NEXT:    v_mov_b32_e32 v0, s2
328; GCN-NEXT:    v_mov_b32_e32 v3, s1
329; GCN-NEXT:    v_mov_b32_e32 v1, s3
330; GCN-NEXT:    v_mov_b32_e32 v2, s0
331; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
332; GCN-NEXT:    s_endpgm
333entry:
334  %v = insertelement <4 x half> %vec, half 1.000000e+00, i32 %sel
335  store <4 x half> %v, <4 x half> addrspace(1)* %out
336  ret void
337}
338
; Insert half 1.0 into <2 x half> %vec at the run-time index %sel and store
; the result to %out.
; The expected ISA is the 32-bit form of the bit-field insert: the index is
; scaled by 16 (s_lshl_b32 ..., 4), 0xffff is shifted left by that amount to
; form the lane mask, the old lane is cleared with s_andn2_b32, and the masked
; splat 0x3c003c00 is merged in with s_or_b32.
339define amdgpu_kernel void @half2_inselt(<2 x half> addrspace(1)* %out, <2 x half> %vec, i32 %sel) {
340; GCN-LABEL: half2_inselt:
341; GCN:       ; %bb.0: ; %entry
342; GCN-NEXT:    s_load_dword s2, s[0:1], 0x30
343; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
344; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
345; GCN-NEXT:    s_waitcnt lgkmcnt(0)
346; GCN-NEXT:    s_lshl_b32 s2, s2, 4
347; GCN-NEXT:    s_lshl_b32 s2, 0xffff, s2
348; GCN-NEXT:    s_andn2_b32 s3, s3, s2
349; GCN-NEXT:    s_and_b32 s2, s2, 0x3c003c00
350; GCN-NEXT:    s_or_b32 s2, s2, s3
351; GCN-NEXT:    v_mov_b32_e32 v0, s0
352; GCN-NEXT:    v_mov_b32_e32 v1, s1
353; GCN-NEXT:    v_mov_b32_e32 v2, s2
354; GCN-NEXT:    flat_store_dword v[0:1], v2
355; GCN-NEXT:    s_endpgm
356entry:
357  %v = insertelement <2 x half> %vec, half 1.000000e+00, i32 %sel
358  store <2 x half> %v, <2 x half> addrspace(1)* %out
359  ret void
360}
361
; Insert half 1.0 into <8 x half> %vec at the run-time index %sel and store
; the result to %out.
; The 128-bit vector is not handled as a single bit field in the expected ISA.
; Instead, every 16-bit element is selected individually: the high half of
; each dword is extracted with s_lshr_b32 ..., 16, the index is compared
; against 7..0, and v_cndmask_b32 picks the element or the constant 0x3c00
; held in v0. Each pair of results is repacked into a dword with
; v_lshlrev_b32 16 followed by v_or_b32_sdwa (src0_sel:WORD_0).
362define amdgpu_kernel void @half8_inselt(<8 x half> addrspace(1)* %out, <8 x half> %vec, i32 %sel) {
363; GCN-LABEL: half8_inselt:
364; GCN:       ; %bb.0: ; %entry
365; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
366; GCN-NEXT:    s_load_dword s2, s[0:1], 0x44
367; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
368; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c00
369; GCN-NEXT:    s_waitcnt lgkmcnt(0)
370; GCN-NEXT:    s_lshr_b32 s3, s7, 16
371; GCN-NEXT:    s_cmp_lg_u32 s2, 7
372; GCN-NEXT:    v_mov_b32_e32 v1, s3
373; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
374; GCN-NEXT:    s_cmp_lg_u32 s2, 6
375; GCN-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
376; GCN-NEXT:    v_mov_b32_e32 v2, s7
377; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
378; GCN-NEXT:    s_lshr_b32 s3, s6, 16
379; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
380; GCN-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
381; GCN-NEXT:    s_cmp_lg_u32 s2, 5
382; GCN-NEXT:    v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
383; GCN-NEXT:    v_mov_b32_e32 v1, s3
384; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
385; GCN-NEXT:    s_cmp_lg_u32 s2, 4
386; GCN-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
387; GCN-NEXT:    v_mov_b32_e32 v2, s6
388; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
389; GCN-NEXT:    s_lshr_b32 s3, s5, 16
390; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
391; GCN-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
392; GCN-NEXT:    s_cmp_lg_u32 s2, 3
393; GCN-NEXT:    v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
394; GCN-NEXT:    v_mov_b32_e32 v1, s3
395; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
396; GCN-NEXT:    s_cmp_lg_u32 s2, 2
397; GCN-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
398; GCN-NEXT:    v_mov_b32_e32 v4, s5
399; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
400; GCN-NEXT:    s_lshr_b32 s3, s4, 16
401; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
402; GCN-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
403; GCN-NEXT:    s_cmp_lg_u32 s2, 1
404; GCN-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
405; GCN-NEXT:    v_mov_b32_e32 v4, s3
406; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
407; GCN-NEXT:    s_cmp_lg_u32 s2, 0
408; GCN-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
409; GCN-NEXT:    v_mov_b32_e32 v5, s4
410; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
411; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
412; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
413; GCN-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
414; GCN-NEXT:    v_mov_b32_e32 v5, s1
415; GCN-NEXT:    v_mov_b32_e32 v4, s0
416; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
417; GCN-NEXT:    s_endpgm
418entry:
419  %v = insertelement <8 x half> %vec, half 1.000000e+00, i32 %sel
420  store <8 x half> %v, <8 x half> addrspace(1)* %out
421  ret void
422}
423
; Insert i16 1 into <2 x i16> %vec at the run-time index %sel and store the
; result to %out.
; The expected ISA is the same 32-bit mask-and-merge sequence as for
; <2 x half>; only the splat constant differs: 0x10001 (the value 1 in each
; 16-bit lane).
424define amdgpu_kernel void @short2_inselt(<2 x i16> addrspace(1)* %out, <2 x i16> %vec, i32 %sel) {
425; GCN-LABEL: short2_inselt:
426; GCN:       ; %bb.0: ; %entry
427; GCN-NEXT:    s_load_dword s2, s[0:1], 0x30
428; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
429; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
430; GCN-NEXT:    s_waitcnt lgkmcnt(0)
431; GCN-NEXT:    s_lshl_b32 s2, s2, 4
432; GCN-NEXT:    s_lshl_b32 s2, 0xffff, s2
433; GCN-NEXT:    s_andn2_b32 s3, s3, s2
434; GCN-NEXT:    s_and_b32 s2, s2, 0x10001
435; GCN-NEXT:    s_or_b32 s2, s2, s3
436; GCN-NEXT:    v_mov_b32_e32 v0, s0
437; GCN-NEXT:    v_mov_b32_e32 v1, s1
438; GCN-NEXT:    v_mov_b32_e32 v2, s2
439; GCN-NEXT:    flat_store_dword v[0:1], v2
440; GCN-NEXT:    s_endpgm
441entry:
442  %v = insertelement <2 x i16> %vec, i16 1, i32 %sel
443  store <2 x i16> %v, <2 x i16> addrspace(1)* %out
444  ret void
445}
446
; Insert i16 1 into <4 x i16> %vec at the run-time index %sel and store the
; result to %out.
; The expected ISA is the same 64-bit mask-and-merge sequence as for
; <4 x half> (0xffff shifted left by index * 16, s_andn2_b64 / s_and_b64 /
; s_or_b64), with 0x10001 in both dwords of the splat.
447define amdgpu_kernel void @short4_inselt(<4 x i16> addrspace(1)* %out, <4 x i16> %vec, i32 %sel) {
448; GCN-LABEL: short4_inselt:
449; GCN:       ; %bb.0: ; %entry
450; GCN-NEXT:    s_load_dword s6, s[0:1], 0x34
451; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
452; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
453; GCN-NEXT:    s_mov_b64 s[4:5], 0xffff
454; GCN-NEXT:    s_waitcnt lgkmcnt(0)
455; GCN-NEXT:    s_lshl_b32 s6, s6, 4
456; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
457; GCN-NEXT:    s_mov_b32 s6, 0x10001
458; GCN-NEXT:    s_mov_b32 s7, s6
459; GCN-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
460; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
461; GCN-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
462; GCN-NEXT:    v_mov_b32_e32 v0, s2
463; GCN-NEXT:    v_mov_b32_e32 v3, s1
464; GCN-NEXT:    v_mov_b32_e32 v1, s3
465; GCN-NEXT:    v_mov_b32_e32 v2, s0
466; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
467; GCN-NEXT:    s_endpgm
468entry:
469  %v = insertelement <4 x i16> %vec, i16 1, i32 %sel
470  store <4 x i16> %v, <4 x i16> addrspace(1)* %out
471  ret void
472}
473
; Insert i8 1 into <8 x i8> %vec at the run-time index %sel and store the
; result to %out.
; The expected ISA uses the 64-bit mask-and-merge form: the index is scaled by
; 8 (s_lshl_b32 ..., 3), a mask constant is shifted left by that amount, each
; half of the mask is ANDed with the splat 0x1010101 (the value 1 in every
; byte), the old bits are cleared with s_andn2_b64, and the two are ORed.
; NOTE(review): the mask constant being shifted is 0xffff even though the
; elements here are 8 bits wide, so the mask appears to cover two adjacent
; bytes rather than one - confirm this expected output reflects the intended
; lowering before relying on it.
474define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %vec, i32 %sel) {
475; GCN-LABEL: byte8_inselt:
476; GCN:       ; %bb.0: ; %entry
477; GCN-NEXT:    s_load_dword s6, s[0:1], 0x34
478; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
479; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
480; GCN-NEXT:    s_mov_b64 s[4:5], 0xffff
481; GCN-NEXT:    s_waitcnt lgkmcnt(0)
482; GCN-NEXT:    s_lshl_b32 s6, s6, 3
483; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
484; GCN-NEXT:    s_and_b32 s7, s5, 0x1010101
485; GCN-NEXT:    s_and_b32 s6, s4, 0x1010101
486; GCN-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
487; GCN-NEXT:    s_or_b64 s[0:1], s[6:7], s[0:1]
488; GCN-NEXT:    v_mov_b32_e32 v2, s2
489; GCN-NEXT:    v_mov_b32_e32 v0, s0
490; GCN-NEXT:    v_mov_b32_e32 v1, s1
491; GCN-NEXT:    v_mov_b32_e32 v3, s3
492; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
493; GCN-NEXT:    s_endpgm
494entry:
495  %v = insertelement <8 x i8> %vec, i8 1, i32 %sel
496  store <8 x i8> %v, <8 x i8> addrspace(1)* %out
497  ret void
498}
499
; Insert i8 1 into <16 x i8> %vec at the run-time index %sel and store the
; result to %out.
; The expected ISA selects every byte individually. For each of the four
; source dwords (s7 down to s4) the upper bytes are extracted with
; s_lshr_b32 by 24, 16 and 8, the index is compared against the byte number
; (15 down to 0) with s_cmp_lg_u32, and v_cndmask_b32 picks the original byte
; or the constant 1. The four selected bytes of a dword are then repacked with
; v_lshlrev_b16 8 and three v_or_b32_sdwa instructions (byte pairs into words,
; then the two words into the dword).
500define amdgpu_kernel void @byte16_inselt(<16 x i8> addrspace(1)* %out, <16 x i8> %vec, i32 %sel) {
501; GCN-LABEL: byte16_inselt:
502; GCN:       ; %bb.0: ; %entry
503; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
504; GCN-NEXT:    s_load_dword s2, s[0:1], 0x44
505; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
506; GCN-NEXT:    s_waitcnt lgkmcnt(0)
507; GCN-NEXT:    s_lshr_b32 s3, s7, 24
508; GCN-NEXT:    s_cmp_lg_u32 s2, 15
509; GCN-NEXT:    v_mov_b32_e32 v0, s3
510; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
511; GCN-NEXT:    s_lshr_b32 s3, s7, 16
512; GCN-NEXT:    s_cmp_lg_u32 s2, 14
513; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
514; GCN-NEXT:    v_mov_b32_e32 v1, s3
515; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
516; GCN-NEXT:    s_lshr_b32 s3, s7, 8
517; GCN-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
518; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
519; GCN-NEXT:    s_cmp_lg_u32 s2, 13
520; GCN-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
521; GCN-NEXT:    v_mov_b32_e32 v1, s3
522; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
523; GCN-NEXT:    s_cmp_lg_u32 s2, 12
524; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
525; GCN-NEXT:    v_mov_b32_e32 v2, s7
526; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
527; GCN-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
528; GCN-NEXT:    v_cndmask_b32_e32 v2, 1, v2, vcc
529; GCN-NEXT:    s_lshr_b32 s3, s6, 24
530; GCN-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
531; GCN-NEXT:    s_cmp_lg_u32 s2, 11
532; GCN-NEXT:    v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
533; GCN-NEXT:    v_mov_b32_e32 v0, s3
534; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
535; GCN-NEXT:    s_lshr_b32 s3, s6, 16
536; GCN-NEXT:    s_cmp_lg_u32 s2, 10
537; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
538; GCN-NEXT:    v_mov_b32_e32 v1, s3
539; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
540; GCN-NEXT:    s_lshr_b32 s3, s6, 8
541; GCN-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
542; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
543; GCN-NEXT:    s_cmp_lg_u32 s2, 9
544; GCN-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
545; GCN-NEXT:    v_mov_b32_e32 v1, s3
546; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
547; GCN-NEXT:    s_cmp_lg_u32 s2, 8
548; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
549; GCN-NEXT:    v_mov_b32_e32 v2, s6
550; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
551; GCN-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
552; GCN-NEXT:    v_cndmask_b32_e32 v2, 1, v2, vcc
553; GCN-NEXT:    s_lshr_b32 s3, s5, 24
554; GCN-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
555; GCN-NEXT:    s_cmp_lg_u32 s2, 7
556; GCN-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
557; GCN-NEXT:    v_mov_b32_e32 v0, s3
558; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
559; GCN-NEXT:    s_lshr_b32 s3, s5, 16
560; GCN-NEXT:    s_cmp_lg_u32 s2, 6
561; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
562; GCN-NEXT:    v_mov_b32_e32 v1, s3
563; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
564; GCN-NEXT:    s_lshr_b32 s3, s5, 8
565; GCN-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
566; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
567; GCN-NEXT:    s_cmp_lg_u32 s2, 5
568; GCN-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
569; GCN-NEXT:    v_mov_b32_e32 v1, s3
570; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
571; GCN-NEXT:    s_cmp_lg_u32 s2, 4
572; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
573; GCN-NEXT:    v_mov_b32_e32 v4, s5
574; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
575; GCN-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
576; GCN-NEXT:    v_cndmask_b32_e32 v4, 1, v4, vcc
577; GCN-NEXT:    s_lshr_b32 s3, s4, 24
578; GCN-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
579; GCN-NEXT:    s_cmp_lg_u32 s2, 3
580; GCN-NEXT:    v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
581; GCN-NEXT:    v_mov_b32_e32 v0, s3
582; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
583; GCN-NEXT:    s_lshr_b32 s3, s4, 16
584; GCN-NEXT:    s_cmp_lg_u32 s2, 2
585; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
586; GCN-NEXT:    v_mov_b32_e32 v4, s3
587; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
588; GCN-NEXT:    s_lshr_b32 s3, s4, 8
589; GCN-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
590; GCN-NEXT:    v_cndmask_b32_e32 v4, 1, v4, vcc
591; GCN-NEXT:    s_cmp_lg_u32 s2, 1
592; GCN-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
593; GCN-NEXT:    v_mov_b32_e32 v4, s3
594; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
595; GCN-NEXT:    s_cmp_lg_u32 s2, 0
596; GCN-NEXT:    v_cndmask_b32_e32 v4, 1, v4, vcc
597; GCN-NEXT:    v_mov_b32_e32 v5, s4
598; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
599; GCN-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
600; GCN-NEXT:    v_cndmask_b32_e32 v5, 1, v5, vcc
601; GCN-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
602; GCN-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
603; GCN-NEXT:    v_mov_b32_e32 v5, s1
604; GCN-NEXT:    v_mov_b32_e32 v4, s0
605; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
606; GCN-NEXT:    s_endpgm
607entry:
608  %v = insertelement <16 x i8> %vec, i8 1, i32 %sel
609  store <16 x i8> %v, <16 x i8> addrspace(1)* %out
610  ret void
611}
612
; Insert double 1.0 into <2 x double> %vec at the run-time index %sel and
; store the result to %out.
; The expected ISA compares the index for equality with each element number
; (s_cmp_eq_u32 ... 1, then 0) and reuses one condition for both dwords of the
; element: the high dword selects 0x3ff00000 and the low dword selects 0,
; which together form the IEEE-754 bit pattern of double 1.0.
613define amdgpu_kernel void @double2_inselt(<2 x double> addrspace(1)* %out, <2 x double> %vec, i32 %sel) {
614; GCN-LABEL: double2_inselt:
615; GCN:       ; %bb.0: ; %entry
616; GCN-NEXT:    s_load_dword s2, s[0:1], 0x44
617; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
618; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
619; GCN-NEXT:    v_mov_b32_e32 v0, 0x3ff00000
620; GCN-NEXT:    s_waitcnt lgkmcnt(0)
621; GCN-NEXT:    s_cmp_eq_u32 s2, 1
622; GCN-NEXT:    v_mov_b32_e32 v1, s7
623; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
624; GCN-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
625; GCN-NEXT:    v_mov_b32_e32 v1, s6
626; GCN-NEXT:    s_cmp_eq_u32 s2, 0
627; GCN-NEXT:    v_cndmask_b32_e64 v2, v1, 0, vcc
628; GCN-NEXT:    v_mov_b32_e32 v1, s5
629; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
630; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
631; GCN-NEXT:    v_mov_b32_e32 v0, s4
632; GCN-NEXT:    v_mov_b32_e32 v5, s1
633; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
634; GCN-NEXT:    v_mov_b32_e32 v4, s0
635; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
636; GCN-NEXT:    s_endpgm
637entry:
638  %v = insertelement <2 x double> %vec, double 1.000000e+00, i32 %sel
639  store <2 x double> %v, <2 x double> addrspace(1)* %out
640  ret void
641}
642
; Insert double 1.0 into the non-power-of-two vector <5 x double> %vec at the
; run-time index %sel and store the result to %out.
; The expected ISA uses the per-element equality compare (s_cmp_eq_u32 against
; 4, 1, 0, 3, 2) with a pair of v_cndmask_b32 per element: 0x3ff00000 for the
; high dword and 0 for the low dword. The 40-byte result is written with two
; flat_store_dwordx4 (elements 2-3 at %out + 16, elements 0-1 at %out) and one
; flat_store_dwordx2 (element 4 at %out + 32).
643define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x double> %vec, i32 %sel) {
644; GCN-LABEL: double5_inselt:
645; GCN:       ; %bb.0: ; %entry
646; GCN-NEXT:    s_load_dword s12, s[0:1], 0xa4
647; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x84
648; GCN-NEXT:    s_load_dwordx2 s[10:11], s[0:1], 0x24
649; GCN-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x64
650; GCN-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
651; GCN-NEXT:    s_waitcnt lgkmcnt(0)
652; GCN-NEXT:    s_cmp_eq_u32 s12, 4
653; GCN-NEXT:    v_mov_b32_e32 v0, s9
654; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
655; GCN-NEXT:    v_cndmask_b32_e32 v9, v0, v4, vcc
656; GCN-NEXT:    v_mov_b32_e32 v0, s8
657; GCN-NEXT:    s_cmp_eq_u32 s12, 1
658; GCN-NEXT:    v_cndmask_b32_e64 v8, v0, 0, vcc
659; GCN-NEXT:    v_mov_b32_e32 v0, s3
660; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
661; GCN-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
662; GCN-NEXT:    v_mov_b32_e32 v0, s2
663; GCN-NEXT:    s_cmp_eq_u32 s12, 0
664; GCN-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
665; GCN-NEXT:    v_mov_b32_e32 v0, s1
666; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
667; GCN-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
668; GCN-NEXT:    v_mov_b32_e32 v0, s0
669; GCN-NEXT:    s_cmp_eq_u32 s12, 3
670; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
671; GCN-NEXT:    v_mov_b32_e32 v5, s7
672; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
673; GCN-NEXT:    v_cndmask_b32_e32 v7, v5, v4, vcc
674; GCN-NEXT:    v_mov_b32_e32 v5, s6
675; GCN-NEXT:    s_cmp_eq_u32 s12, 2
676; GCN-NEXT:    v_cndmask_b32_e64 v6, v5, 0, vcc
677; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
678; GCN-NEXT:    s_add_u32 s0, s10, 16
679; GCN-NEXT:    v_mov_b32_e32 v5, s5
680; GCN-NEXT:    s_addc_u32 s1, s11, 0
681; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
682; GCN-NEXT:    v_mov_b32_e32 v4, s4
683; GCN-NEXT:    v_mov_b32_e32 v11, s1
684; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
685; GCN-NEXT:    v_mov_b32_e32 v10, s0
686; GCN-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
687; GCN-NEXT:    s_add_u32 s0, s10, 32
688; GCN-NEXT:    v_mov_b32_e32 v4, s10
689; GCN-NEXT:    v_mov_b32_e32 v5, s11
690; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
691; GCN-NEXT:    s_addc_u32 s1, s11, 0
692; GCN-NEXT:    v_mov_b32_e32 v0, s0
693; GCN-NEXT:    v_mov_b32_e32 v1, s1
694; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[8:9]
695; GCN-NEXT:    s_endpgm
696entry:
697  %v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel
698  store <5 x double> %v, <5 x double> addrspace(1)* %out
699  ret void
700}
701
; Insert double 1.0 into <8 x double> %vec at the run-time index %sel and
; store the result to %out.
; The expected ISA uses M0-relative register writes. The vector is copied into
; v0..v15, the index is doubled (s_lshl_b32 ..., 1) because each element spans
; two 32-bit registers, and the doubled index is moved into m0. Two
; v_movreld_b32 then write the element: base v0 receives the low dword 0 and
; base v1 receives the high dword 0x3ff00000 (held in v16). The result is
; stored as four flat_store_dwordx4, from %out + 48 down to %out.
702define amdgpu_kernel void @double8_inselt(<8 x double> addrspace(1)* %out, <8 x double> %vec, i32 %sel) {
703; GCN-LABEL: double8_inselt:
704; GCN:       ; %bb.0: ; %entry
705; GCN-NEXT:    s_load_dword s2, s[0:1], 0xa4
706; GCN-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x64
707; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
708; GCN-NEXT:    v_mov_b32_e32 v16, 0x3ff00000
709; GCN-NEXT:    s_waitcnt lgkmcnt(0)
710; GCN-NEXT:    s_lshl_b32 s2, s2, 1
711; GCN-NEXT:    v_mov_b32_e32 v0, s4
712; GCN-NEXT:    v_mov_b32_e32 v1, s5
713; GCN-NEXT:    v_mov_b32_e32 v2, s6
714; GCN-NEXT:    v_mov_b32_e32 v3, s7
715; GCN-NEXT:    v_mov_b32_e32 v4, s8
716; GCN-NEXT:    v_mov_b32_e32 v5, s9
717; GCN-NEXT:    v_mov_b32_e32 v6, s10
718; GCN-NEXT:    v_mov_b32_e32 v7, s11
719; GCN-NEXT:    v_mov_b32_e32 v8, s12
720; GCN-NEXT:    v_mov_b32_e32 v9, s13
721; GCN-NEXT:    v_mov_b32_e32 v10, s14
722; GCN-NEXT:    v_mov_b32_e32 v11, s15
723; GCN-NEXT:    v_mov_b32_e32 v12, s16
724; GCN-NEXT:    v_mov_b32_e32 v13, s17
725; GCN-NEXT:    v_mov_b32_e32 v14, s18
726; GCN-NEXT:    v_mov_b32_e32 v15, s19
727; GCN-NEXT:    s_mov_b32 m0, s2
728; GCN-NEXT:    s_add_u32 s2, s0, 48
729; GCN-NEXT:    v_movreld_b32_e32 v0, 0
730; GCN-NEXT:    s_addc_u32 s3, s1, 0
731; GCN-NEXT:    v_movreld_b32_e32 v1, v16
732; GCN-NEXT:    v_mov_b32_e32 v17, s3
733; GCN-NEXT:    v_mov_b32_e32 v16, s2
734; GCN-NEXT:    s_add_u32 s2, s0, 32
735; GCN-NEXT:    s_addc_u32 s3, s1, 0
736; GCN-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
737; GCN-NEXT:    s_nop 0
738; GCN-NEXT:    v_mov_b32_e32 v13, s3
739; GCN-NEXT:    v_mov_b32_e32 v12, s2
740; GCN-NEXT:    s_add_u32 s2, s0, 16
741; GCN-NEXT:    s_addc_u32 s3, s1, 0
742; GCN-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
743; GCN-NEXT:    s_nop 0
744; GCN-NEXT:    v_mov_b32_e32 v9, s3
745; GCN-NEXT:    v_mov_b32_e32 v8, s2
746; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
747; GCN-NEXT:    s_nop 0
748; GCN-NEXT:    v_mov_b32_e32 v5, s1
749; GCN-NEXT:    v_mov_b32_e32 v4, s0
750; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
751; GCN-NEXT:    s_endpgm
752entry:
753  %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel
754  store <8 x double> %v, <8 x double> addrspace(1)* %out
755  ret void
756}
757
; double7_inselt: same dynamic insert of double 1.0 at index %sel, but on the
; non-power-of-two vector type <7 x double>.
; Expected lowering (see the autogenerated assertions below): the 14 dwords of
; the vector are fetched in pieces (dwordx8 + dwordx4 + dwordx2) and copied
; into v0-v13; the index is doubled into m0 and the element is patched with
; two v_movreld_b32 writes (0 and 0x3ff00000). The store is split into three
; flat_store_dwordx4 (offsets 16, 0 and 32) plus one flat_store_dwordx2 at
; offset 48 for the seventh element.
; Do not edit the assertions by hand; regenerate them with
; utils/update_llc_test_checks.py.
758define amdgpu_kernel void @double7_inselt(<7 x double> addrspace(1)* %out, <7 x double> %vec, i32 %sel) {
759; GCN-LABEL: double7_inselt:
760; GCN:       ; %bb.0: ; %entry
761; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x64
762; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
763; GCN-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x94
764; GCN-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x84
765; GCN-NEXT:    s_load_dword s0, s[0:1], 0xa4
766; GCN-NEXT:    s_waitcnt lgkmcnt(0)
767; GCN-NEXT:    v_mov_b32_e32 v0, s4
768; GCN-NEXT:    v_mov_b32_e32 v1, s5
769; GCN-NEXT:    v_mov_b32_e32 v2, s6
770; GCN-NEXT:    v_mov_b32_e32 v3, s7
771; GCN-NEXT:    s_lshl_b32 s0, s0, 1
772; GCN-NEXT:    v_mov_b32_e32 v4, s8
773; GCN-NEXT:    v_mov_b32_e32 v5, s9
774; GCN-NEXT:    v_mov_b32_e32 v6, s10
775; GCN-NEXT:    v_mov_b32_e32 v7, s11
776; GCN-NEXT:    v_mov_b32_e32 v8, s12
777; GCN-NEXT:    v_mov_b32_e32 v9, s13
778; GCN-NEXT:    v_mov_b32_e32 v10, s14
779; GCN-NEXT:    v_mov_b32_e32 v11, s15
780; GCN-NEXT:    v_mov_b32_e32 v12, s16
781; GCN-NEXT:    v_mov_b32_e32 v13, s17
782; GCN-NEXT:    s_mov_b32 m0, s0
783; GCN-NEXT:    v_movreld_b32_e32 v0, 0
784; GCN-NEXT:    v_mov_b32_e32 v16, 0x3ff00000
785; GCN-NEXT:    s_add_u32 s0, s2, 16
786; GCN-NEXT:    v_movreld_b32_e32 v1, v16
787; GCN-NEXT:    s_addc_u32 s1, s3, 0
788; GCN-NEXT:    v_mov_b32_e32 v15, s1
789; GCN-NEXT:    v_mov_b32_e32 v14, s0
790; GCN-NEXT:    flat_store_dwordx4 v[14:15], v[4:7]
791; GCN-NEXT:    s_add_u32 s0, s2, 48
792; GCN-NEXT:    v_mov_b32_e32 v5, s3
793; GCN-NEXT:    v_mov_b32_e32 v4, s2
794; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
795; GCN-NEXT:    s_addc_u32 s1, s3, 0
796; GCN-NEXT:    v_mov_b32_e32 v0, s0
797; GCN-NEXT:    v_mov_b32_e32 v1, s1
798; GCN-NEXT:    s_add_u32 s0, s2, 32
799; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[12:13]
800; GCN-NEXT:    s_addc_u32 s1, s3, 0
801; GCN-NEXT:    v_mov_b32_e32 v0, s0
802; GCN-NEXT:    v_mov_b32_e32 v1, s1
803; GCN-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
804; GCN-NEXT:    s_endpgm
805entry:
806  %v = insertelement <7 x double> %vec, double 1.000000e+00, i32 %sel
807  store <7 x double> %v, <7 x double> addrspace(1)* %out
808  ret void
809}
810
; double16_inselt: insert the constant double 1.0 into <16 x double> %vec at
; the run-time index %sel and store the whole vector to %out.
; Expected lowering (see the autogenerated assertions below): the vector is
; fetched with two s_load_dwordx16 and copied into v0-v31; the index is
; doubled (s_lshl_b32 by 1) into m0 and the selected element is patched with
; two v_movreld_b32 writes (0 and 0x3ff00000, held in v32). The result is
; written back with eight flat_store_dwordx4, from offset 0x70 down to 0.
; Do not edit the assertions by hand; regenerate them with
; utils/update_llc_test_checks.py.
811define amdgpu_kernel void @double16_inselt(<16 x double> addrspace(1)* %out, <16 x double> %vec, i32 %sel) {
812; GCN-LABEL: double16_inselt:
813; GCN:       ; %bb.0: ; %entry
814; GCN-NEXT:    s_load_dword s2, s[0:1], 0x124
815; GCN-NEXT:    s_load_dwordx16 s[36:51], s[0:1], 0xa4
816; GCN-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0xe4
817; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
818; GCN-NEXT:    v_mov_b32_e32 v32, 0x3ff00000
819; GCN-NEXT:    s_waitcnt lgkmcnt(0)
820; GCN-NEXT:    v_mov_b32_e32 v0, s36
821; GCN-NEXT:    s_lshl_b32 s2, s2, 1
822; GCN-NEXT:    v_mov_b32_e32 v1, s37
823; GCN-NEXT:    v_mov_b32_e32 v2, s38
824; GCN-NEXT:    v_mov_b32_e32 v3, s39
825; GCN-NEXT:    v_mov_b32_e32 v4, s40
826; GCN-NEXT:    v_mov_b32_e32 v5, s41
827; GCN-NEXT:    v_mov_b32_e32 v6, s42
828; GCN-NEXT:    v_mov_b32_e32 v7, s43
829; GCN-NEXT:    v_mov_b32_e32 v8, s44
830; GCN-NEXT:    v_mov_b32_e32 v9, s45
831; GCN-NEXT:    v_mov_b32_e32 v10, s46
832; GCN-NEXT:    v_mov_b32_e32 v11, s47
833; GCN-NEXT:    v_mov_b32_e32 v12, s48
834; GCN-NEXT:    v_mov_b32_e32 v13, s49
835; GCN-NEXT:    v_mov_b32_e32 v14, s50
836; GCN-NEXT:    v_mov_b32_e32 v15, s51
837; GCN-NEXT:    v_mov_b32_e32 v16, s4
838; GCN-NEXT:    v_mov_b32_e32 v17, s5
839; GCN-NEXT:    v_mov_b32_e32 v18, s6
840; GCN-NEXT:    v_mov_b32_e32 v19, s7
841; GCN-NEXT:    v_mov_b32_e32 v20, s8
842; GCN-NEXT:    v_mov_b32_e32 v21, s9
843; GCN-NEXT:    v_mov_b32_e32 v22, s10
844; GCN-NEXT:    v_mov_b32_e32 v23, s11
845; GCN-NEXT:    v_mov_b32_e32 v24, s12
846; GCN-NEXT:    v_mov_b32_e32 v25, s13
847; GCN-NEXT:    v_mov_b32_e32 v26, s14
848; GCN-NEXT:    v_mov_b32_e32 v27, s15
849; GCN-NEXT:    v_mov_b32_e32 v28, s16
850; GCN-NEXT:    v_mov_b32_e32 v29, s17
851; GCN-NEXT:    v_mov_b32_e32 v30, s18
852; GCN-NEXT:    v_mov_b32_e32 v31, s19
853; GCN-NEXT:    s_mov_b32 m0, s2
854; GCN-NEXT:    s_add_u32 s2, s0, 0x70
855; GCN-NEXT:    v_movreld_b32_e32 v0, 0
856; GCN-NEXT:    s_addc_u32 s3, s1, 0
857; GCN-NEXT:    v_movreld_b32_e32 v1, v32
858; GCN-NEXT:    v_mov_b32_e32 v33, s3
859; GCN-NEXT:    v_mov_b32_e32 v32, s2
860; GCN-NEXT:    s_add_u32 s2, s0, 0x60
861; GCN-NEXT:    s_addc_u32 s3, s1, 0
862; GCN-NEXT:    flat_store_dwordx4 v[32:33], v[28:31]
863; GCN-NEXT:    s_nop 0
864; GCN-NEXT:    v_mov_b32_e32 v29, s3
865; GCN-NEXT:    v_mov_b32_e32 v28, s2
866; GCN-NEXT:    s_add_u32 s2, s0, 0x50
867; GCN-NEXT:    s_addc_u32 s3, s1, 0
868; GCN-NEXT:    flat_store_dwordx4 v[28:29], v[24:27]
869; GCN-NEXT:    s_nop 0
870; GCN-NEXT:    v_mov_b32_e32 v25, s3
871; GCN-NEXT:    v_mov_b32_e32 v24, s2
872; GCN-NEXT:    s_add_u32 s2, s0, 64
873; GCN-NEXT:    s_addc_u32 s3, s1, 0
874; GCN-NEXT:    flat_store_dwordx4 v[24:25], v[20:23]
875; GCN-NEXT:    s_nop 0
876; GCN-NEXT:    v_mov_b32_e32 v21, s3
877; GCN-NEXT:    v_mov_b32_e32 v20, s2
878; GCN-NEXT:    s_add_u32 s2, s0, 48
879; GCN-NEXT:    s_addc_u32 s3, s1, 0
880; GCN-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
881; GCN-NEXT:    s_nop 0
882; GCN-NEXT:    v_mov_b32_e32 v17, s3
883; GCN-NEXT:    v_mov_b32_e32 v16, s2
884; GCN-NEXT:    s_add_u32 s2, s0, 32
885; GCN-NEXT:    s_addc_u32 s3, s1, 0
886; GCN-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
887; GCN-NEXT:    s_nop 0
888; GCN-NEXT:    v_mov_b32_e32 v13, s3
889; GCN-NEXT:    v_mov_b32_e32 v12, s2
890; GCN-NEXT:    s_add_u32 s2, s0, 16
891; GCN-NEXT:    s_addc_u32 s3, s1, 0
892; GCN-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
893; GCN-NEXT:    s_nop 0
894; GCN-NEXT:    v_mov_b32_e32 v9, s3
895; GCN-NEXT:    v_mov_b32_e32 v8, s2
896; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
897; GCN-NEXT:    s_nop 0
898; GCN-NEXT:    v_mov_b32_e32 v5, s1
899; GCN-NEXT:    v_mov_b32_e32 v4, s0
900; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
901; GCN-NEXT:    s_endpgm
902entry:
903  %v = insertelement <16 x double> %vec, double 1.000000e+00, i32 %sel
904  store <16 x double> %v, <16 x double> addrspace(1)* %out
905  ret void
906}
907
; double15_inselt: same dynamic insert of double 1.0 at index %sel, but on the
; non-power-of-two vector type <15 x double>.
; Expected lowering (see the autogenerated assertions below): the 30 dwords of
; the vector are fetched in pieces (dwordx16 + dwordx8 + dwordx4 + dwordx2)
; and copied into v0-v29; the index is doubled into m0 and the element is
; patched with two v_movreld_b32 writes (0 and 0x3ff00000, held in v32). The
; store is split into seven flat_store_dwordx4 plus one flat_store_dwordx2 at
; offset 0x70 for the fifteenth element.
; Do not edit the assertions by hand; regenerate them with
; utils/update_llc_test_checks.py.
908define amdgpu_kernel void @double15_inselt(<15 x double> addrspace(1)* %out, <15 x double> %vec, i32 %sel) {
909; GCN-LABEL: double15_inselt:
910; GCN:       ; %bb.0: ; %entry
911; GCN-NEXT:    s_load_dwordx16 s[8:23], s[0:1], 0xa4
912; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x114
913; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x104
914; GCN-NEXT:    s_load_dwordx8 s[24:31], s[0:1], 0xe4
915; GCN-NEXT:    v_mov_b32_e32 v32, 0x3ff00000
916; GCN-NEXT:    s_waitcnt lgkmcnt(0)
917; GCN-NEXT:    v_mov_b32_e32 v0, s8
918; GCN-NEXT:    v_mov_b32_e32 v28, s2
919; GCN-NEXT:    v_mov_b32_e32 v24, s4
920; GCN-NEXT:    s_load_dword s4, s[0:1], 0x124
921; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
922; GCN-NEXT:    v_mov_b32_e32 v1, s9
923; GCN-NEXT:    v_mov_b32_e32 v2, s10
924; GCN-NEXT:    v_mov_b32_e32 v3, s11
925; GCN-NEXT:    s_waitcnt lgkmcnt(0)
926; GCN-NEXT:    s_lshl_b32 s2, s4, 1
927; GCN-NEXT:    v_mov_b32_e32 v4, s12
928; GCN-NEXT:    v_mov_b32_e32 v5, s13
929; GCN-NEXT:    v_mov_b32_e32 v6, s14
930; GCN-NEXT:    v_mov_b32_e32 v7, s15
931; GCN-NEXT:    v_mov_b32_e32 v8, s16
932; GCN-NEXT:    v_mov_b32_e32 v9, s17
933; GCN-NEXT:    v_mov_b32_e32 v10, s18
934; GCN-NEXT:    v_mov_b32_e32 v11, s19
935; GCN-NEXT:    v_mov_b32_e32 v12, s20
936; GCN-NEXT:    v_mov_b32_e32 v13, s21
937; GCN-NEXT:    v_mov_b32_e32 v14, s22
938; GCN-NEXT:    v_mov_b32_e32 v15, s23
939; GCN-NEXT:    v_mov_b32_e32 v16, s24
940; GCN-NEXT:    v_mov_b32_e32 v17, s25
941; GCN-NEXT:    v_mov_b32_e32 v18, s26
942; GCN-NEXT:    v_mov_b32_e32 v19, s27
943; GCN-NEXT:    v_mov_b32_e32 v20, s28
944; GCN-NEXT:    v_mov_b32_e32 v21, s29
945; GCN-NEXT:    v_mov_b32_e32 v22, s30
946; GCN-NEXT:    v_mov_b32_e32 v23, s31
947; GCN-NEXT:    v_mov_b32_e32 v25, s5
948; GCN-NEXT:    v_mov_b32_e32 v26, s6
949; GCN-NEXT:    v_mov_b32_e32 v27, s7
950; GCN-NEXT:    v_mov_b32_e32 v29, s3
951; GCN-NEXT:    s_mov_b32 m0, s2
952; GCN-NEXT:    v_movreld_b32_e32 v0, 0
953; GCN-NEXT:    s_add_u32 s2, s0, 0x50
954; GCN-NEXT:    v_movreld_b32_e32 v1, v32
955; GCN-NEXT:    s_addc_u32 s3, s1, 0
956; GCN-NEXT:    v_mov_b32_e32 v31, s3
957; GCN-NEXT:    v_mov_b32_e32 v30, s2
958; GCN-NEXT:    s_add_u32 s2, s0, 64
959; GCN-NEXT:    s_addc_u32 s3, s1, 0
960; GCN-NEXT:    flat_store_dwordx4 v[30:31], v[20:23]
961; GCN-NEXT:    s_nop 0
962; GCN-NEXT:    v_mov_b32_e32 v21, s3
963; GCN-NEXT:    v_mov_b32_e32 v20, s2
964; GCN-NEXT:    s_add_u32 s2, s0, 48
965; GCN-NEXT:    s_addc_u32 s3, s1, 0
966; GCN-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
967; GCN-NEXT:    s_nop 0
968; GCN-NEXT:    v_mov_b32_e32 v17, s3
969; GCN-NEXT:    v_mov_b32_e32 v16, s2
970; GCN-NEXT:    s_add_u32 s2, s0, 32
971; GCN-NEXT:    s_addc_u32 s3, s1, 0
972; GCN-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
973; GCN-NEXT:    s_nop 0
974; GCN-NEXT:    v_mov_b32_e32 v13, s3
975; GCN-NEXT:    v_mov_b32_e32 v12, s2
976; GCN-NEXT:    s_add_u32 s2, s0, 16
977; GCN-NEXT:    s_addc_u32 s3, s1, 0
978; GCN-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
979; GCN-NEXT:    s_nop 0
980; GCN-NEXT:    v_mov_b32_e32 v9, s3
981; GCN-NEXT:    v_mov_b32_e32 v8, s2
982; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
983; GCN-NEXT:    s_add_u32 s2, s0, 0x70
984; GCN-NEXT:    v_mov_b32_e32 v5, s1
985; GCN-NEXT:    v_mov_b32_e32 v4, s0
986; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
987; GCN-NEXT:    s_addc_u32 s3, s1, 0
988; GCN-NEXT:    v_mov_b32_e32 v0, s2
989; GCN-NEXT:    v_mov_b32_e32 v1, s3
990; GCN-NEXT:    s_add_u32 s0, s0, 0x60
991; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[28:29]
992; GCN-NEXT:    s_addc_u32 s1, s1, 0
993; GCN-NEXT:    v_mov_b32_e32 v0, s0
994; GCN-NEXT:    v_mov_b32_e32 v1, s1
995; GCN-NEXT:    flat_store_dwordx4 v[0:1], v[24:27]
996; GCN-NEXT:    s_endpgm
997entry:
998  %v = insertelement <15 x double> %vec, double 1.000000e+00, i32 %sel
999  store <15 x double> %v, <15 x double> addrspace(1)* %out
1000  ret void
1001}
1002
; bit4_inselt: insert the constant i1 1 into <4 x i1> %vec at the run-time
; index %sel and store the resulting vector to %out.
; Expected lowering (see the autogenerated assertions below): unlike the
; double cases above, this one goes through the scratch buffer (s[4:7] is
; built from SCRATCH_RSRC_DWORD0/1). Each of the four bits is written as a
; separate byte at scratch offsets 4-7; the index is masked with 3 and OR'ed
; with 4 to form the byte address for an "offen" store of the value 1; the
; four bytes are then reloaded, masked/shifted back into one 4-bit value
; (final mask 15) and written with a single flat_store_byte.
; Do not edit the assertions by hand; regenerate them with
; utils/update_llc_test_checks.py.
1003define amdgpu_kernel void @bit4_inselt(<4 x i1> addrspace(1)* %out, <4 x i1> %vec, i32 %sel) {
1004; GCN-LABEL: bit4_inselt:
1005; GCN:       ; %bb.0: ; %entry
1006; GCN-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
1007; GCN-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
1008; GCN-NEXT:    s_mov_b32 s6, -1
1009; GCN-NEXT:    s_mov_b32 s7, 0xe80000
1010; GCN-NEXT:    s_add_u32 s4, s4, s3
1011; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1012; GCN-NEXT:    s_addc_u32 s5, s5, 0
1013; GCN-NEXT:    v_mov_b32_e32 v0, 4
1014; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1015; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1016; GCN-NEXT:    s_and_b32 s3, s3, 3
1017; GCN-NEXT:    v_mov_b32_e32 v1, s2
1018; GCN-NEXT:    v_lshrrev_b16_e64 v2, 1, s2
1019; GCN-NEXT:    v_lshrrev_b16_e64 v3, 2, s2
1020; GCN-NEXT:    v_lshrrev_b16_e64 v4, 3, s2
1021; GCN-NEXT:    v_or_b32_e32 v0, s3, v0
1022; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
1023; GCN-NEXT:    v_and_b32_e32 v3, 3, v3
1024; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
1025; GCN-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:4
1026; GCN-NEXT:    buffer_store_byte v4, off, s[4:7], 0 offset:7
1027; GCN-NEXT:    buffer_store_byte v3, off, s[4:7], 0 offset:6
1028; GCN-NEXT:    buffer_store_byte v2, off, s[4:7], 0 offset:5
1029; GCN-NEXT:    v_mov_b32_e32 v1, 1
1030; GCN-NEXT:    buffer_store_byte v1, v0, s[4:7], 0 offen
1031; GCN-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 offset:4
1032; GCN-NEXT:    buffer_load_ubyte v1, off, s[4:7], 0 offset:5
1033; GCN-NEXT:    buffer_load_ubyte v2, off, s[4:7], 0 offset:6
1034; GCN-NEXT:    buffer_load_ubyte v3, off, s[4:7], 0 offset:7
1035; GCN-NEXT:    s_waitcnt vmcnt(3)
1036; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
1037; GCN-NEXT:    s_waitcnt vmcnt(2)
1038; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
1039; GCN-NEXT:    s_waitcnt vmcnt(1)
1040; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
1041; GCN-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
1042; GCN-NEXT:    v_lshlrev_b16_e32 v2, 2, v2
1043; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
1044; GCN-NEXT:    s_waitcnt vmcnt(0)
1045; GCN-NEXT:    v_lshlrev_b16_e32 v3, 3, v3
1046; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
1047; GCN-NEXT:    v_or_b32_e32 v0, v0, v3
1048; GCN-NEXT:    v_and_b32_e32 v2, 15, v0
1049; GCN-NEXT:    v_mov_b32_e32 v0, s0
1050; GCN-NEXT:    v_mov_b32_e32 v1, s1
1051; GCN-NEXT:    flat_store_byte v[0:1], v2
1052; GCN-NEXT:    s_endpgm
1053entry:
1054  %v = insertelement <4 x i1> %vec, i1 1, i32 %sel
1055  store <4 x i1> %v, <4 x i1> addrspace(1)* %out
1056  ret void
1057}
1058
1059define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i1> %vec, i32 %sel) {
1060; GCN-LABEL: bit128_inselt:
1061; GCN:       ; %bb.0: ; %entry
1062; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1063; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1064; GCN-NEXT:    s_load_dword s0, s[0:1], 0x44
1065; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1066; GCN-NEXT:    s_lshr_b32 s1, s4, 24
1067; GCN-NEXT:    s_lshr_b32 s8, s4, 16
1068; GCN-NEXT:    s_lshr_b32 s9, s4, 17
1069; GCN-NEXT:    s_lshr_b32 s10, s4, 18
1070; GCN-NEXT:    s_lshr_b32 s11, s4, 19
1071; GCN-NEXT:    s_lshr_b32 s12, s4, 20
1072; GCN-NEXT:    s_lshr_b32 s13, s4, 21
1073; GCN-NEXT:    s_lshr_b32 s14, s4, 22
1074; GCN-NEXT:    s_lshr_b32 s15, s4, 23
1075; GCN-NEXT:    s_lshr_b32 s16, s5, 24
1076; GCN-NEXT:    s_lshr_b32 s17, s5, 16
1077; GCN-NEXT:    s_lshr_b32 s18, s5, 17
1078; GCN-NEXT:    s_lshr_b32 s19, s5, 18
1079; GCN-NEXT:    s_lshr_b32 s20, s5, 19
1080; GCN-NEXT:    s_lshr_b32 s21, s5, 20
1081; GCN-NEXT:    s_lshr_b32 s22, s5, 21
1082; GCN-NEXT:    s_lshr_b32 s23, s5, 22
1083; GCN-NEXT:    s_lshr_b32 s24, s5, 23
1084; GCN-NEXT:    s_lshr_b32 s25, s6, 24
1085; GCN-NEXT:    s_lshr_b32 s26, s6, 16
1086; GCN-NEXT:    s_lshr_b32 s27, s6, 17
1087; GCN-NEXT:    s_lshr_b32 s28, s6, 18
1088; GCN-NEXT:    s_lshr_b32 s29, s6, 19
1089; GCN-NEXT:    s_lshr_b32 s30, s6, 20
1090; GCN-NEXT:    s_lshr_b32 s31, s6, 21
1091; GCN-NEXT:    s_lshr_b32 s33, s6, 22
1092; GCN-NEXT:    s_lshr_b32 s34, s6, 23
1093; GCN-NEXT:    s_lshr_b32 s35, s7, 24
1094; GCN-NEXT:    s_lshr_b32 s36, s7, 16
1095; GCN-NEXT:    s_lshr_b32 s37, s7, 17
1096; GCN-NEXT:    s_lshr_b32 s38, s7, 18
1097; GCN-NEXT:    s_lshr_b32 s39, s7, 19
1098; GCN-NEXT:    s_lshr_b32 s40, s7, 20
1099; GCN-NEXT:    s_lshr_b32 s41, s7, 21
1100; GCN-NEXT:    s_lshr_b32 s42, s7, 22
1101; GCN-NEXT:    s_lshr_b32 s43, s7, 23
1102; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x77
1103; GCN-NEXT:    v_mov_b32_e32 v16, s43
1104; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1105; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x76
1106; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1107; GCN-NEXT:    v_mov_b32_e32 v17, s42
1108; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1109; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1110; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1111; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1112; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x75
1113; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1114; GCN-NEXT:    v_mov_b32_e32 v17, s41
1115; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1116; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x74
1117; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1118; GCN-NEXT:    v_mov_b32_e32 v18, s40
1119; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1120; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1121; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1122; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1123; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1124; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
1125; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
1126; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x73
1127; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1128; GCN-NEXT:    v_mov_b32_e32 v17, s39
1129; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1130; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x72
1131; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1132; GCN-NEXT:    v_mov_b32_e32 v18, s38
1133; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1134; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1135; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1136; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1137; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x71
1138; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1139; GCN-NEXT:    v_mov_b32_e32 v18, s37
1140; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1141; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x70
1142; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1143; GCN-NEXT:    v_mov_b32_e32 v19, s36
1144; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1145; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1146; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1147; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1148; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1149; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
1150; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
1151; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1152; GCN-NEXT:    v_lshlrev_b16_e32 v16, 4, v16
1153; GCN-NEXT:    v_and_b32_e32 v17, 15, v17
1154; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7f
1155; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1156; GCN-NEXT:    v_lshrrev_b16_e64 v17, 7, s35
1157; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1158; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7e
1159; GCN-NEXT:    v_lshrrev_b16_e64 v18, 6, s35
1160; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1161; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1162; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1163; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1164; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1165; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7d
1166; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1167; GCN-NEXT:    v_lshrrev_b16_e64 v18, 5, s35
1168; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1169; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7c
1170; GCN-NEXT:    v_lshrrev_b16_e64 v19, 4, s35
1171; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1172; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1173; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1174; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1175; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1176; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1177; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
1178; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
1179; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7b
1180; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1181; GCN-NEXT:    v_lshrrev_b16_e64 v18, 3, s35
1182; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1183; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7a
1184; GCN-NEXT:    v_lshrrev_b16_e64 v19, 2, s35
1185; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1186; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1187; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1188; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x78
1189; GCN-NEXT:    v_mov_b32_e32 v14, s35
1190; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1191; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1192; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1193; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x79
1194; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1195; GCN-NEXT:    v_lshrrev_b16_e64 v19, 1, s35
1196; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
1197; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1198; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1199; GCN-NEXT:    v_and_b32_e32 v14, 1, v14
1200; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
1201; GCN-NEXT:    v_or_b32_e32 v14, v14, v19
1202; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
1203; GCN-NEXT:    v_and_b32_e32 v14, 3, v14
1204; GCN-NEXT:    v_or_b32_e32 v14, v14, v18
1205; GCN-NEXT:    v_lshlrev_b16_e32 v17, 4, v17
1206; GCN-NEXT:    v_and_b32_e32 v14, 15, v14
1207; GCN-NEXT:    v_or_b32_sdwa v14, v14, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1208; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6f
1209; GCN-NEXT:    v_or_b32_sdwa v14, v16, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1210; GCN-NEXT:    v_lshrrev_b16_e64 v16, 15, s7
1211; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1212; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6e
1213; GCN-NEXT:    v_lshrrev_b16_e64 v17, 14, s7
1214; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1215; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1216; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1217; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1218; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1219; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6d
1220; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1221; GCN-NEXT:    v_lshrrev_b16_e64 v17, 13, s7
1222; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1223; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6c
1224; GCN-NEXT:    v_lshrrev_b16_e64 v18, 12, s7
1225; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1226; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1227; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1228; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1229; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1230; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1231; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
1232; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
1233; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6b
1234; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1235; GCN-NEXT:    v_lshrrev_b16_e64 v17, 11, s7
1236; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1237; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6a
1238; GCN-NEXT:    v_lshrrev_b16_e64 v18, 10, s7
1239; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1240; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1241; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1242; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1243; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1244; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x69
1245; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1246; GCN-NEXT:    v_lshrrev_b16_e64 v18, 9, s7
1247; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1248; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x68
1249; GCN-NEXT:    v_lshrrev_b16_e64 v19, 8, s7
1250; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1251; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1252; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1253; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1254; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1255; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1256; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
1257; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
1258; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1259; GCN-NEXT:    v_lshlrev_b16_e32 v16, 4, v16
1260; GCN-NEXT:    v_and_b32_e32 v17, 15, v17
1261; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x67
1262; GCN-NEXT:    v_or_b32_sdwa v16, v17, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1263; GCN-NEXT:    v_lshrrev_b16_e64 v17, 7, s7
1264; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1265; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x66
1266; GCN-NEXT:    v_lshrrev_b16_e64 v18, 6, s7
1267; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1268; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1269; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1270; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1271; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1272; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x65
1273; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1274; GCN-NEXT:    v_lshrrev_b16_e64 v18, 5, s7
1275; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1276; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x64
1277; GCN-NEXT:    v_lshrrev_b16_e64 v19, 4, s7
1278; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1279; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1280; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1281; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1282; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1283; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1284; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
1285; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
1286; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x63
1287; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1288; GCN-NEXT:    v_lshrrev_b16_e64 v18, 3, s7
1289; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1290; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x62
1291; GCN-NEXT:    v_lshrrev_b16_e64 v19, 2, s7
1292; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1293; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1294; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1295; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1296; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1297; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x61
1298; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1299; GCN-NEXT:    v_lshrrev_b16_e64 v19, 1, s7
1300; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1301; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x60
1302; GCN-NEXT:    v_mov_b32_e32 v15, s7
1303; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1304; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1305; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1306; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
1307; GCN-NEXT:    v_and_b32_e32 v15, 1, v15
1308; GCN-NEXT:    v_or_b32_e32 v15, v15, v19
1309; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
1310; GCN-NEXT:    v_and_b32_e32 v15, 3, v15
1311; GCN-NEXT:    v_or_b32_e32 v15, v15, v18
1312; GCN-NEXT:    v_lshlrev_b16_e32 v17, 4, v17
1313; GCN-NEXT:    v_and_b32_e32 v15, 15, v15
1314; GCN-NEXT:    v_or_b32_e32 v15, v15, v17
1315; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x57
1316; GCN-NEXT:    v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1317; GCN-NEXT:    v_mov_b32_e32 v16, s34
1318; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1319; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x56
1320; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1321; GCN-NEXT:    v_mov_b32_e32 v17, s33
1322; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1323; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1324; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1325; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1326; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x55
1327; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1328; GCN-NEXT:    v_mov_b32_e32 v17, s31
1329; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1330; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x54
1331; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1332; GCN-NEXT:    v_mov_b32_e32 v18, s30
1333; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1334; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1335; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1336; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1337; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1338; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
1339; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
1340; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x53
1341; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1342; GCN-NEXT:    v_mov_b32_e32 v17, s29
1343; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1344; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x52
1345; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1346; GCN-NEXT:    v_mov_b32_e32 v18, s28
1347; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1348; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1349; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1350; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1351; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x51
1352; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1353; GCN-NEXT:    v_mov_b32_e32 v18, s27
1354; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1355; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x50
1356; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1357; GCN-NEXT:    v_mov_b32_e32 v19, s26
1358; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1359; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1360; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1361; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1362; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1363; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
1364; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
1365; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1366; GCN-NEXT:    v_lshlrev_b16_e32 v16, 4, v16
1367; GCN-NEXT:    v_and_b32_e32 v17, 15, v17
1368; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5f
1369; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1370; GCN-NEXT:    v_lshrrev_b16_e64 v17, 7, s25
1371; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1372; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5e
1373; GCN-NEXT:    v_lshrrev_b16_e64 v18, 6, s25
1374; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1375; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1376; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1377; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1378; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1379; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5d
1380; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1381; GCN-NEXT:    v_lshrrev_b16_e64 v18, 5, s25
1382; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1383; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5c
1384; GCN-NEXT:    v_lshrrev_b16_e64 v19, 4, s25
1385; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1386; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1387; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1388; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1389; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1390; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1391; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
1392; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
1393; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5b
1394; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1395; GCN-NEXT:    v_lshrrev_b16_e64 v18, 3, s25
1396; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1397; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5a
1398; GCN-NEXT:    v_lshrrev_b16_e64 v19, 2, s25
1399; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1400; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1401; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1402; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x58
1403; GCN-NEXT:    v_mov_b32_e32 v3, s25
1404; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1405; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1406; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1407; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x59
1408; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1409; GCN-NEXT:    v_lshrrev_b16_e64 v19, 1, s25
1410; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
1411; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1412; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1413; GCN-NEXT:    v_and_b32_e32 v3, 1, v3
1414; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
1415; GCN-NEXT:    v_or_b32_e32 v3, v3, v19
1416; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
1417; GCN-NEXT:    v_and_b32_e32 v3, 3, v3
1418; GCN-NEXT:    v_or_b32_e32 v3, v3, v18
1419; GCN-NEXT:    v_lshlrev_b16_e32 v17, 4, v17
1420; GCN-NEXT:    v_and_b32_e32 v3, 15, v3
1421; GCN-NEXT:    v_or_b32_sdwa v3, v3, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1422; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4f
1423; GCN-NEXT:    v_or_b32_sdwa v16, v16, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1424; GCN-NEXT:    v_lshrrev_b16_e64 v3, 15, s6
1425; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1426; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4e
1427; GCN-NEXT:    v_lshrrev_b16_e64 v17, 14, s6
1428; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
1429; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1430; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1431; GCN-NEXT:    v_lshlrev_b16_e32 v3, 1, v3
1432; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1433; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4d
1434; GCN-NEXT:    v_or_b32_e32 v3, v17, v3
1435; GCN-NEXT:    v_lshrrev_b16_e64 v17, 13, s6
1436; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1437; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4c
1438; GCN-NEXT:    v_lshrrev_b16_e64 v18, 12, s6
1439; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1440; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1441; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1442; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1443; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1444; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1445; GCN-NEXT:    v_lshlrev_b16_e32 v3, 2, v3
1446; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
1447; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4b
1448; GCN-NEXT:    v_or_b32_e32 v3, v17, v3
1449; GCN-NEXT:    v_lshrrev_b16_e64 v17, 11, s6
1450; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1451; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4a
1452; GCN-NEXT:    v_lshrrev_b16_e64 v18, 10, s6
1453; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1454; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1455; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1456; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1457; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1458; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x49
1459; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1460; GCN-NEXT:    v_lshrrev_b16_e64 v18, 9, s6
1461; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1462; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x48
1463; GCN-NEXT:    v_lshrrev_b16_e64 v19, 8, s6
1464; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1465; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1466; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1467; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1468; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1469; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1470; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
1471; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
1472; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1473; GCN-NEXT:    v_lshlrev_b16_e32 v3, 4, v3
1474; GCN-NEXT:    v_and_b32_e32 v17, 15, v17
1475; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x47
1476; GCN-NEXT:    v_or_b32_sdwa v17, v17, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1477; GCN-NEXT:    v_lshrrev_b16_e64 v3, 7, s6
1478; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1479; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x46
1480; GCN-NEXT:    v_lshrrev_b16_e64 v18, 6, s6
1481; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
1482; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1483; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1484; GCN-NEXT:    v_lshlrev_b16_e32 v3, 1, v3
1485; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1486; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x45
1487; GCN-NEXT:    v_or_b32_e32 v3, v18, v3
1488; GCN-NEXT:    v_lshrrev_b16_e64 v18, 5, s6
1489; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1490; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x44
1491; GCN-NEXT:    v_lshrrev_b16_e64 v19, 4, s6
1492; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1493; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1494; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1495; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1496; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1497; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1498; GCN-NEXT:    v_lshlrev_b16_e32 v3, 2, v3
1499; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
1500; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x43
1501; GCN-NEXT:    v_or_b32_e32 v18, v18, v3
1502; GCN-NEXT:    v_lshrrev_b16_e64 v3, 3, s6
1503; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1504; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x42
1505; GCN-NEXT:    v_lshrrev_b16_e64 v19, 2, s6
1506; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
1507; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1508; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1509; GCN-NEXT:    v_lshlrev_b16_e32 v3, 1, v3
1510; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1511; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x41
1512; GCN-NEXT:    v_or_b32_e32 v3, v19, v3
1513; GCN-NEXT:    v_lshrrev_b16_e64 v19, 1, s6
1514; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1515; GCN-NEXT:    s_cmp_lg_u32 s0, 64
1516; GCN-NEXT:    v_mov_b32_e32 v2, s6
1517; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1518; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1519; GCN-NEXT:    v_cndmask_b32_e32 v2, 1, v2, vcc
1520; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
1521; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
1522; GCN-NEXT:    v_or_b32_e32 v2, v2, v19
1523; GCN-NEXT:    v_lshlrev_b16_e32 v3, 2, v3
1524; GCN-NEXT:    v_and_b32_e32 v2, 3, v2
1525; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
1526; GCN-NEXT:    v_or_b32_sdwa v3, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1527; GCN-NEXT:    v_lshlrev_b16_e32 v14, 4, v18
1528; GCN-NEXT:    v_and_b32_e32 v2, 15, v2
1529; GCN-NEXT:    s_cmp_lg_u32 s0, 55
1530; GCN-NEXT:    v_or_b32_e32 v2, v2, v14
1531; GCN-NEXT:    v_mov_b32_e32 v14, s24
1532; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1533; GCN-NEXT:    s_cmp_lg_u32 s0, 54
1534; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
1535; GCN-NEXT:    v_mov_b32_e32 v15, s23
1536; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1537; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1538; GCN-NEXT:    v_lshlrev_b16_e32 v14, 1, v14
1539; GCN-NEXT:    v_and_b32_e32 v15, 1, v15
1540; GCN-NEXT:    s_cmp_lg_u32 s0, 53
1541; GCN-NEXT:    v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1542; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
1543; GCN-NEXT:    v_mov_b32_e32 v15, s22
1544; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1545; GCN-NEXT:    s_cmp_lg_u32 s0, 52
1546; GCN-NEXT:    v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1547; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1548; GCN-NEXT:    v_mov_b32_e32 v16, s21
1549; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1550; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1551; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1552; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1553; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1554; GCN-NEXT:    v_lshlrev_b16_e32 v14, 2, v14
1555; GCN-NEXT:    v_and_b32_e32 v15, 3, v15
1556; GCN-NEXT:    s_cmp_lg_u32 s0, 51
1557; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
1558; GCN-NEXT:    v_mov_b32_e32 v15, s20
1559; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1560; GCN-NEXT:    s_cmp_lg_u32 s0, 50
1561; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1562; GCN-NEXT:    v_mov_b32_e32 v16, s19
1563; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1564; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1565; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1566; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1567; GCN-NEXT:    s_cmp_lg_u32 s0, 49
1568; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1569; GCN-NEXT:    v_mov_b32_e32 v16, s18
1570; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1571; GCN-NEXT:    s_cmp_lg_u32 s0, 48
1572; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1573; GCN-NEXT:    v_mov_b32_e32 v17, s17
1574; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1575; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1576; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1577; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1578; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1579; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
1580; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
1581; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1582; GCN-NEXT:    v_lshlrev_b16_e32 v14, 4, v14
1583; GCN-NEXT:    v_and_b32_e32 v15, 15, v15
1584; GCN-NEXT:    s_cmp_lg_u32 s0, 63
1585; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
1586; GCN-NEXT:    v_lshrrev_b16_e64 v15, 7, s16
1587; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1588; GCN-NEXT:    s_cmp_lg_u32 s0, 62
1589; GCN-NEXT:    v_lshrrev_b16_e64 v16, 6, s16
1590; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1591; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1592; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1593; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1594; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1595; GCN-NEXT:    s_cmp_lg_u32 s0, 61
1596; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1597; GCN-NEXT:    v_lshrrev_b16_e64 v16, 5, s16
1598; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1599; GCN-NEXT:    s_cmp_lg_u32 s0, 60
1600; GCN-NEXT:    v_lshrrev_b16_e64 v17, 4, s16
1601; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1602; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1603; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1604; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1605; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1606; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1607; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
1608; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
1609; GCN-NEXT:    s_cmp_lg_u32 s0, 59
1610; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1611; GCN-NEXT:    v_lshrrev_b16_e64 v16, 3, s16
1612; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1613; GCN-NEXT:    s_cmp_lg_u32 s0, 58
1614; GCN-NEXT:    v_lshrrev_b16_e64 v17, 2, s16
1615; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1616; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1617; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1618; GCN-NEXT:    s_cmp_lg_u32 s0, 56
1619; GCN-NEXT:    v_mov_b32_e32 v13, s16
1620; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1621; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1622; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1623; GCN-NEXT:    s_cmp_lg_u32 s0, 57
1624; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1625; GCN-NEXT:    v_lshrrev_b16_e64 v17, 1, s16
1626; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
1627; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1628; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1629; GCN-NEXT:    v_and_b32_e32 v13, 1, v13
1630; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1631; GCN-NEXT:    v_or_b32_e32 v13, v13, v17
1632; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
1633; GCN-NEXT:    v_and_b32_e32 v13, 3, v13
1634; GCN-NEXT:    v_or_b32_e32 v13, v13, v16
1635; GCN-NEXT:    v_lshlrev_b16_e32 v15, 4, v15
1636; GCN-NEXT:    v_and_b32_e32 v13, 15, v13
1637; GCN-NEXT:    v_or_b32_sdwa v13, v13, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1638; GCN-NEXT:    s_cmp_lg_u32 s0, 47
1639; GCN-NEXT:    v_or_b32_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1640; GCN-NEXT:    v_lshrrev_b16_e64 v13, 15, s5
1641; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1642; GCN-NEXT:    s_cmp_lg_u32 s0, 46
1643; GCN-NEXT:    v_lshrrev_b16_e64 v15, 14, s5
1644; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
1645; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1646; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1647; GCN-NEXT:    v_lshlrev_b16_e32 v13, 1, v13
1648; GCN-NEXT:    v_and_b32_e32 v15, 1, v15
1649; GCN-NEXT:    s_cmp_lg_u32 s0, 45
1650; GCN-NEXT:    v_or_b32_e32 v13, v15, v13
1651; GCN-NEXT:    v_lshrrev_b16_e64 v15, 13, s5
1652; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1653; GCN-NEXT:    s_cmp_lg_u32 s0, 44
1654; GCN-NEXT:    v_lshrrev_b16_e64 v16, 12, s5
1655; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1656; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1657; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1658; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1659; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1660; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1661; GCN-NEXT:    v_lshlrev_b16_e32 v13, 2, v13
1662; GCN-NEXT:    v_and_b32_e32 v15, 3, v15
1663; GCN-NEXT:    s_cmp_lg_u32 s0, 43
1664; GCN-NEXT:    v_or_b32_e32 v13, v15, v13
1665; GCN-NEXT:    v_lshrrev_b16_e64 v15, 11, s5
1666; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1667; GCN-NEXT:    s_cmp_lg_u32 s0, 42
1668; GCN-NEXT:    v_lshrrev_b16_e64 v16, 10, s5
1669; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1670; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1671; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1672; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1673; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1674; GCN-NEXT:    s_cmp_lg_u32 s0, 41
1675; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1676; GCN-NEXT:    v_lshrrev_b16_e64 v16, 9, s5
1677; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1678; GCN-NEXT:    s_cmp_lg_u32 s0, 40
1679; GCN-NEXT:    v_lshrrev_b16_e64 v17, 8, s5
1680; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1681; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1682; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1683; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1684; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1685; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1686; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
1687; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
1688; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1689; GCN-NEXT:    v_lshlrev_b16_e32 v13, 4, v13
1690; GCN-NEXT:    v_and_b32_e32 v15, 15, v15
1691; GCN-NEXT:    s_cmp_lg_u32 s0, 39
1692; GCN-NEXT:    v_or_b32_sdwa v15, v15, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1693; GCN-NEXT:    v_lshrrev_b16_e64 v13, 7, s5
1694; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1695; GCN-NEXT:    s_cmp_lg_u32 s0, 38
1696; GCN-NEXT:    v_lshrrev_b16_e64 v16, 6, s5
1697; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
1698; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1699; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1700; GCN-NEXT:    v_lshlrev_b16_e32 v13, 1, v13
1701; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1702; GCN-NEXT:    s_cmp_lg_u32 s0, 37
1703; GCN-NEXT:    v_or_b32_e32 v13, v16, v13
1704; GCN-NEXT:    v_lshrrev_b16_e64 v16, 5, s5
1705; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1706; GCN-NEXT:    s_cmp_lg_u32 s0, 36
1707; GCN-NEXT:    v_lshrrev_b16_e64 v17, 4, s5
1708; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1709; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1710; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1711; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1712; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1713; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1714; GCN-NEXT:    v_lshlrev_b16_e32 v13, 2, v13
1715; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
1716; GCN-NEXT:    s_cmp_lg_u32 s0, 35
1717; GCN-NEXT:    v_or_b32_e32 v16, v16, v13
1718; GCN-NEXT:    v_lshrrev_b16_e64 v13, 3, s5
1719; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1720; GCN-NEXT:    s_cmp_lg_u32 s0, 34
1721; GCN-NEXT:    v_lshrrev_b16_e64 v17, 2, s5
1722; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
1723; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1724; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1725; GCN-NEXT:    v_lshlrev_b16_e32 v13, 1, v13
1726; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1727; GCN-NEXT:    s_cmp_lg_u32 s0, 33
1728; GCN-NEXT:    v_or_b32_e32 v17, v17, v13
1729; GCN-NEXT:    v_lshrrev_b16_e64 v13, 1, s5
1730; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1731; GCN-NEXT:    s_cmp_lg_u32 s0, 32
1732; GCN-NEXT:    v_mov_b32_e32 v1, s5
1733; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
1734; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1735; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
1736; GCN-NEXT:    v_lshlrev_b16_e32 v13, 1, v13
1737; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
1738; GCN-NEXT:    v_or_b32_e32 v1, v1, v13
1739; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
1740; GCN-NEXT:    v_and_b32_e32 v1, 3, v1
1741; GCN-NEXT:    v_or_b32_e32 v1, v1, v17
1742; GCN-NEXT:    v_lshlrev_b16_e32 v16, 4, v16
1743; GCN-NEXT:    v_and_b32_e32 v1, 15, v1
1744; GCN-NEXT:    v_or_b32_e32 v1, v1, v16
1745; GCN-NEXT:    v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1746; GCN-NEXT:    s_cmp_lg_u32 s0, 23
1747; GCN-NEXT:    v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1748; GCN-NEXT:    v_mov_b32_e32 v14, s15
1749; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1750; GCN-NEXT:    s_cmp_lg_u32 s0, 22
1751; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
1752; GCN-NEXT:    v_mov_b32_e32 v15, s14
1753; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1754; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1755; GCN-NEXT:    v_lshlrev_b16_e32 v14, 1, v14
1756; GCN-NEXT:    v_and_b32_e32 v15, 1, v15
1757; GCN-NEXT:    s_cmp_lg_u32 s0, 21
1758; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
1759; GCN-NEXT:    v_mov_b32_e32 v15, s13
1760; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1761; GCN-NEXT:    s_cmp_lg_u32 s0, 20
1762; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1763; GCN-NEXT:    v_mov_b32_e32 v16, s12
1764; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1765; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1766; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1767; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1768; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1769; GCN-NEXT:    v_lshlrev_b16_e32 v14, 2, v14
1770; GCN-NEXT:    v_and_b32_e32 v15, 3, v15
1771; GCN-NEXT:    s_cmp_lg_u32 s0, 19
1772; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
1773; GCN-NEXT:    v_mov_b32_e32 v15, s11
1774; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1775; GCN-NEXT:    s_cmp_lg_u32 s0, 18
1776; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1777; GCN-NEXT:    v_mov_b32_e32 v16, s10
1778; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1779; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1780; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1781; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1782; GCN-NEXT:    s_cmp_lg_u32 s0, 17
1783; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1784; GCN-NEXT:    v_mov_b32_e32 v16, s9
1785; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1786; GCN-NEXT:    s_cmp_lg_u32 s0, 16
1787; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1788; GCN-NEXT:    v_mov_b32_e32 v18, s8
1789; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1790; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1791; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1792; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1793; GCN-NEXT:    v_or_b32_e32 v16, v18, v16
1794; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
1795; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
1796; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1797; GCN-NEXT:    v_lshlrev_b16_e32 v14, 4, v14
1798; GCN-NEXT:    v_and_b32_e32 v15, 15, v15
1799; GCN-NEXT:    s_cmp_lg_u32 s0, 31
1800; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
1801; GCN-NEXT:    v_lshrrev_b16_e64 v15, 7, s1
1802; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1803; GCN-NEXT:    s_cmp_lg_u32 s0, 30
1804; GCN-NEXT:    v_lshrrev_b16_e64 v16, 6, s1
1805; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1806; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1807; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1808; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1809; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1810; GCN-NEXT:    s_cmp_lg_u32 s0, 29
1811; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1812; GCN-NEXT:    v_lshrrev_b16_e64 v16, 5, s1
1813; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1814; GCN-NEXT:    s_cmp_lg_u32 s0, 28
1815; GCN-NEXT:    v_lshrrev_b16_e64 v18, 4, s1
1816; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1817; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1818; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1819; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1820; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1821; GCN-NEXT:    v_or_b32_e32 v16, v18, v16
1822; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
1823; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
1824; GCN-NEXT:    s_cmp_lg_u32 s0, 27
1825; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1826; GCN-NEXT:    v_lshrrev_b16_e64 v16, 3, s1
1827; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1828; GCN-NEXT:    s_cmp_lg_u32 s0, 26
1829; GCN-NEXT:    v_lshrrev_b16_e64 v18, 2, s1
1830; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1831; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1832; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1833; GCN-NEXT:    s_cmp_lg_u32 s0, 24
1834; GCN-NEXT:    v_mov_b32_e32 v17, s1
1835; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1836; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1837; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1838; GCN-NEXT:    s_cmp_lg_u32 s0, 25
1839; GCN-NEXT:    v_or_b32_e32 v16, v18, v16
1840; GCN-NEXT:    v_lshrrev_b16_e64 v18, 1, s1
1841; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1842; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1843; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1844; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1845; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1846; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
1847; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
1848; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
1849; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1850; GCN-NEXT:    v_lshlrev_b16_e32 v15, 4, v15
1851; GCN-NEXT:    v_and_b32_e32 v16, 15, v16
1852; GCN-NEXT:    v_or_b32_sdwa v15, v16, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1853; GCN-NEXT:    s_cmp_lg_u32 s0, 15
1854; GCN-NEXT:    v_or_b32_sdwa v14, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1855; GCN-NEXT:    v_lshrrev_b16_e64 v15, 15, s4
1856; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1857; GCN-NEXT:    s_cmp_lg_u32 s0, 14
1858; GCN-NEXT:    v_lshrrev_b16_e64 v16, 14, s4
1859; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1860; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1861; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1862; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1863; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1864; GCN-NEXT:    s_cmp_lg_u32 s0, 13
1865; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1866; GCN-NEXT:    v_lshrrev_b16_e64 v16, 13, s4
1867; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1868; GCN-NEXT:    s_cmp_lg_u32 s0, 12
1869; GCN-NEXT:    v_lshrrev_b16_e64 v17, 12, s4
1870; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1871; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1872; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1873; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1874; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1875; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1876; GCN-NEXT:    s_cmp_lg_u32 s0, 11
1877; GCN-NEXT:    v_lshrrev_b16_e64 v18, 11, s4
1878; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
1879; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
1880; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1881; GCN-NEXT:    s_cmp_lg_u32 s0, 10
1882; GCN-NEXT:    v_lshrrev_b16_e64 v13, 10, s4
1883; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1884; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v18, vcc
1885; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1886; GCN-NEXT:    s_cmp_lg_u32 s0, 9
1887; GCN-NEXT:    v_lshrrev_b16_e64 v12, 9, s4
1888; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
1889; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1890; GCN-NEXT:    s_cmp_lg_u32 s0, 8
1891; GCN-NEXT:    v_lshrrev_b16_e64 v11, 8, s4
1892; GCN-NEXT:    v_cndmask_b32_e32 v12, 1, v12, vcc
1893; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1894; GCN-NEXT:    s_cmp_lg_u32 s0, 7
1895; GCN-NEXT:    v_lshrrev_b16_e64 v10, 7, s4
1896; GCN-NEXT:    v_cndmask_b32_e32 v11, 1, v11, vcc
1897; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1898; GCN-NEXT:    s_cmp_lg_u32 s0, 6
1899; GCN-NEXT:    v_lshrrev_b16_e64 v9, 6, s4
1900; GCN-NEXT:    v_cndmask_b32_e32 v10, 1, v10, vcc
1901; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1902; GCN-NEXT:    s_cmp_lg_u32 s0, 5
1903; GCN-NEXT:    v_lshrrev_b16_e64 v8, 5, s4
1904; GCN-NEXT:    v_cndmask_b32_e32 v9, 1, v9, vcc
1905; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1906; GCN-NEXT:    s_cmp_lg_u32 s0, 4
1907; GCN-NEXT:    v_lshrrev_b16_e64 v7, 4, s4
1908; GCN-NEXT:    v_cndmask_b32_e32 v8, 1, v8, vcc
1909; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1910; GCN-NEXT:    s_cmp_lg_u32 s0, 3
1911; GCN-NEXT:    v_lshrrev_b16_e64 v6, 3, s4
1912; GCN-NEXT:    v_cndmask_b32_e32 v7, 1, v7, vcc
1913; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1914; GCN-NEXT:    s_cmp_lg_u32 s0, 2
1915; GCN-NEXT:    v_lshrrev_b16_e64 v5, 2, s4
1916; GCN-NEXT:    v_cndmask_b32_e32 v6, 1, v6, vcc
1917; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1918; GCN-NEXT:    s_cmp_lg_u32 s0, 1
1919; GCN-NEXT:    v_lshrrev_b16_e64 v4, 1, s4
1920; GCN-NEXT:    v_cndmask_b32_e32 v5, 1, v5, vcc
1921; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1922; GCN-NEXT:    s_cmp_lg_u32 s0, 0
1923; GCN-NEXT:    v_mov_b32_e32 v0, s4
1924; GCN-NEXT:    v_cndmask_b32_e32 v4, 1, v4, vcc
1925; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1926; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
1927; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1928; GCN-NEXT:    v_and_b32_e32 v13, 1, v13
1929; GCN-NEXT:    v_lshlrev_b16_e32 v12, 1, v12
1930; GCN-NEXT:    v_and_b32_e32 v11, 1, v11
1931; GCN-NEXT:    v_lshlrev_b16_e32 v10, 1, v10
1932; GCN-NEXT:    v_and_b32_e32 v9, 1, v9
1933; GCN-NEXT:    v_lshlrev_b16_e32 v8, 1, v8
1934; GCN-NEXT:    v_and_b32_e32 v7, 1, v7
1935; GCN-NEXT:    v_lshlrev_b16_e32 v6, 1, v6
1936; GCN-NEXT:    v_and_b32_e32 v5, 1, v5
1937; GCN-NEXT:    v_lshlrev_b16_e32 v4, 1, v4
1938; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
1939; GCN-NEXT:    v_or_b32_e32 v13, v13, v16
1940; GCN-NEXT:    v_or_b32_e32 v11, v11, v12
1941; GCN-NEXT:    v_or_b32_e32 v9, v9, v10
1942; GCN-NEXT:    v_or_b32_e32 v7, v7, v8
1943; GCN-NEXT:    v_or_b32_e32 v5, v5, v6
1944; GCN-NEXT:    v_or_b32_e32 v0, v0, v4
1945; GCN-NEXT:    v_lshlrev_b16_e32 v13, 2, v13
1946; GCN-NEXT:    v_and_b32_e32 v11, 3, v11
1947; GCN-NEXT:    v_lshlrev_b16_e32 v9, 2, v9
1948; GCN-NEXT:    v_and_b32_e32 v7, 3, v7
1949; GCN-NEXT:    v_lshlrev_b16_e32 v5, 2, v5
1950; GCN-NEXT:    v_and_b32_e32 v0, 3, v0
1951; GCN-NEXT:    v_or_b32_e32 v11, v11, v13
1952; GCN-NEXT:    v_or_b32_e32 v7, v7, v9
1953; GCN-NEXT:    v_or_b32_e32 v0, v0, v5
1954; GCN-NEXT:    v_lshlrev_b16_e32 v15, 4, v15
1955; GCN-NEXT:    v_and_b32_e32 v11, 15, v11
1956; GCN-NEXT:    v_lshlrev_b16_e32 v7, 4, v7
1957; GCN-NEXT:    v_and_b32_e32 v0, 15, v0
1958; GCN-NEXT:    v_or_b32_sdwa v11, v11, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1959; GCN-NEXT:    v_or_b32_e32 v0, v0, v7
1960; GCN-NEXT:    v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1961; GCN-NEXT:    v_mov_b32_e32 v5, s3
1962; GCN-NEXT:    v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1963; GCN-NEXT:    v_mov_b32_e32 v4, s2
1964; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1965; GCN-NEXT:    s_endpgm
1966entry:
1967  %v = insertelement <128 x i1> %vec, i1 1, i32 %sel
1968  store <128 x i1> %v, <128 x i1> addrspace(1)* %out
1969  ret void
1970}
1971
; Shader (amdgpu_ps) variant of the float insert test: the constant 1.0 is
; inserted into element %sel of a <32 x float> argument and the whole vector
; is returned. The expected code is fully unrolled: 32 v_cmp_ne_u32 compares
; of v32 (evidently the register carrying %sel) against the constants 0..31,
; each producing its own condition mask, followed by 32 v_cndmask_b32 selects
; that keep vN when the index differs from N and substitute 1.0 when it
; matches.
1972define amdgpu_ps <32 x float> @float32_inselt_vec(<32 x float> %vec, i32 %sel) {
1973; GCN-LABEL: float32_inselt_vec:
1974; GCN:       ; %bb.0: ; %entry
1975; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v32
1976; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 2, v32
1977; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 3, v32
1978; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 4, v32
1979; GCN-NEXT:    v_cmp_ne_u32_e64 s[6:7], 5, v32
1980; GCN-NEXT:    v_cmp_ne_u32_e64 s[8:9], 6, v32
1981; GCN-NEXT:    v_cmp_ne_u32_e64 s[10:11], 7, v32
1982; GCN-NEXT:    v_cmp_ne_u32_e64 s[12:13], 8, v32
1983; GCN-NEXT:    v_cmp_ne_u32_e64 s[14:15], 9, v32
1984; GCN-NEXT:    v_cmp_ne_u32_e64 s[16:17], 10, v32
1985; GCN-NEXT:    v_cmp_ne_u32_e64 s[18:19], 11, v32
1986; GCN-NEXT:    v_cmp_ne_u32_e64 s[20:21], 12, v32
1987; GCN-NEXT:    v_cmp_ne_u32_e64 s[22:23], 13, v32
1988; GCN-NEXT:    v_cmp_ne_u32_e64 s[24:25], 14, v32
1989; GCN-NEXT:    v_cmp_ne_u32_e64 s[26:27], 15, v32
1990; GCN-NEXT:    v_cmp_ne_u32_e64 s[28:29], 16, v32
1991; GCN-NEXT:    v_cmp_ne_u32_e64 s[30:31], 17, v32
1992; GCN-NEXT:    v_cmp_ne_u32_e64 s[34:35], 18, v32
1993; GCN-NEXT:    v_cmp_ne_u32_e64 s[36:37], 19, v32
1994; GCN-NEXT:    v_cmp_ne_u32_e64 s[38:39], 20, v32
1995; GCN-NEXT:    v_cmp_ne_u32_e64 s[40:41], 21, v32
1996; GCN-NEXT:    v_cmp_ne_u32_e64 s[42:43], 22, v32
1997; GCN-NEXT:    v_cmp_ne_u32_e64 s[44:45], 23, v32
1998; GCN-NEXT:    v_cmp_ne_u32_e64 s[46:47], 24, v32
1999; GCN-NEXT:    v_cmp_ne_u32_e64 s[48:49], 25, v32
2000; GCN-NEXT:    v_cmp_ne_u32_e64 s[50:51], 26, v32
2001; GCN-NEXT:    v_cmp_ne_u32_e64 s[52:53], 27, v32
2002; GCN-NEXT:    v_cmp_ne_u32_e64 s[54:55], 28, v32
2003; GCN-NEXT:    v_cmp_ne_u32_e64 s[56:57], 29, v32
2004; GCN-NEXT:    v_cmp_ne_u32_e64 s[58:59], 30, v32
2005; GCN-NEXT:    v_cmp_ne_u32_e64 s[60:61], 31, v32
2006; GCN-NEXT:    v_cmp_ne_u32_e64 s[62:63], 0, v32
2007; GCN-NEXT:    v_cndmask_b32_e64 v0, 1.0, v0, s[62:63]
2008; GCN-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
2009; GCN-NEXT:    v_cndmask_b32_e64 v2, 1.0, v2, s[0:1]
2010; GCN-NEXT:    v_cndmask_b32_e64 v3, 1.0, v3, s[2:3]
2011; GCN-NEXT:    v_cndmask_b32_e64 v4, 1.0, v4, s[4:5]
2012; GCN-NEXT:    v_cndmask_b32_e64 v5, 1.0, v5, s[6:7]
2013; GCN-NEXT:    v_cndmask_b32_e64 v6, 1.0, v6, s[8:9]
2014; GCN-NEXT:    v_cndmask_b32_e64 v7, 1.0, v7, s[10:11]
2015; GCN-NEXT:    v_cndmask_b32_e64 v8, 1.0, v8, s[12:13]
2016; GCN-NEXT:    v_cndmask_b32_e64 v9, 1.0, v9, s[14:15]
2017; GCN-NEXT:    v_cndmask_b32_e64 v10, 1.0, v10, s[16:17]
2018; GCN-NEXT:    v_cndmask_b32_e64 v11, 1.0, v11, s[18:19]
2019; GCN-NEXT:    v_cndmask_b32_e64 v12, 1.0, v12, s[20:21]
2020; GCN-NEXT:    v_cndmask_b32_e64 v13, 1.0, v13, s[22:23]
2021; GCN-NEXT:    v_cndmask_b32_e64 v14, 1.0, v14, s[24:25]
2022; GCN-NEXT:    v_cndmask_b32_e64 v15, 1.0, v15, s[26:27]
2023; GCN-NEXT:    v_cndmask_b32_e64 v16, 1.0, v16, s[28:29]
2024; GCN-NEXT:    v_cndmask_b32_e64 v17, 1.0, v17, s[30:31]
2025; GCN-NEXT:    v_cndmask_b32_e64 v18, 1.0, v18, s[34:35]
2026; GCN-NEXT:    v_cndmask_b32_e64 v19, 1.0, v19, s[36:37]
2027; GCN-NEXT:    v_cndmask_b32_e64 v20, 1.0, v20, s[38:39]
2028; GCN-NEXT:    v_cndmask_b32_e64 v21, 1.0, v21, s[40:41]
2029; GCN-NEXT:    v_cndmask_b32_e64 v22, 1.0, v22, s[42:43]
2030; GCN-NEXT:    v_cndmask_b32_e64 v23, 1.0, v23, s[44:45]
2031; GCN-NEXT:    v_cndmask_b32_e64 v24, 1.0, v24, s[46:47]
2032; GCN-NEXT:    v_cndmask_b32_e64 v25, 1.0, v25, s[48:49]
2033; GCN-NEXT:    v_cndmask_b32_e64 v26, 1.0, v26, s[50:51]
2034; GCN-NEXT:    v_cndmask_b32_e64 v27, 1.0, v27, s[52:53]
2035; GCN-NEXT:    v_cndmask_b32_e64 v28, 1.0, v28, s[54:55]
2036; GCN-NEXT:    v_cndmask_b32_e64 v29, 1.0, v29, s[56:57]
2037; GCN-NEXT:    v_cndmask_b32_e64 v30, 1.0, v30, s[58:59]
2038; GCN-NEXT:    v_cndmask_b32_e64 v31, 1.0, v31, s[60:61]
2039; GCN-NEXT:    ; return to shader part epilog
2040entry:
; The insert position %sel is a runtime value, not a constant.
2041  %v = insertelement <32 x float> %vec, float 1.000000e+00, i32 %sel
2042  ret <32 x float> %v
2043}
2044
; Default-calling-convention function (the expected code returns through
; s_setpc_b64 s[30:31]) that inserts double 1.0 into element %sel of an
; <8 x double> argument. Each 64-bit element occupies a pair of 32-bit VGPRs,
; so the expected code handles the halves separately: for every index N in
; 0..7 a single v_cmp_eq_u32 of v16 against N drives two v_cndmask_b32
; selects, writing 0 to the low dword and 0x3ff00000 (held in v17, the high
; dword of the IEEE-754 encoding of 1.0) to the high dword when the index
; matches, and leaving both halves unchanged otherwise.
2045define <8 x double> @double8_inselt_vec(<8 x double> %vec, i32 %sel) {
2046; GCN-LABEL: double8_inselt_vec:
2047; GCN:       ; %bb.0: ; %entry
2048; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2049; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v16
2050; GCN-NEXT:    v_mov_b32_e32 v17, 0x3ff00000
2051; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
2052; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
2053; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v16
2054; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
2055; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v17, vcc
2056; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v16
2057; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
2058; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v17, vcc
2059; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v16
2060; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
2061; GCN-NEXT:    v_cndmask_b32_e32 v7, v7, v17, vcc
2062; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v16
2063; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, 0, vcc
2064; GCN-NEXT:    v_cndmask_b32_e32 v9, v9, v17, vcc
2065; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v16
2066; GCN-NEXT:    v_cndmask_b32_e64 v10, v10, 0, vcc
2067; GCN-NEXT:    v_cndmask_b32_e32 v11, v11, v17, vcc
2068; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v16
2069; GCN-NEXT:    v_cndmask_b32_e64 v12, v12, 0, vcc
2070; GCN-NEXT:    v_cndmask_b32_e32 v13, v13, v17, vcc
2071; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v16
2072; GCN-NEXT:    v_cndmask_b32_e64 v14, v14, 0, vcc
2073; GCN-NEXT:    v_cndmask_b32_e32 v15, v15, v17, vcc
2074; GCN-NEXT:    s_setpc_b64 s[30:31]
2075entry:
; The insert position %sel is a runtime value, not a constant.
2076  %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel
2077  ret <8 x double> %v
2078}
2079