1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
3
4define amdgpu_kernel void @float4_inselt(<4 x float> addrspace(1)* %out, <4 x float> %vec, i32 %sel) {
5; GCN-LABEL: float4_inselt:
6; GCN:       ; %bb.0: ; %entry
7; GCN-NEXT:    s_load_dword s2, s[0:1], 0x44
8; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
9; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
10; GCN-NEXT:    s_waitcnt lgkmcnt(0)
11; GCN-NEXT:    s_cmp_lg_u32 s2, 3
12; GCN-NEXT:    v_mov_b32_e32 v0, s7
13; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
14; GCN-NEXT:    s_cmp_lg_u32 s2, 2
15; GCN-NEXT:    v_cndmask_b32_e32 v3, 1.0, v0, vcc
16; GCN-NEXT:    v_mov_b32_e32 v0, s6
17; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
18; GCN-NEXT:    s_cmp_lg_u32 s2, 1
19; GCN-NEXT:    v_cndmask_b32_e32 v2, 1.0, v0, vcc
20; GCN-NEXT:    v_mov_b32_e32 v0, s5
21; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
22; GCN-NEXT:    s_cmp_lg_u32 s2, 0
23; GCN-NEXT:    v_cndmask_b32_e32 v1, 1.0, v0, vcc
24; GCN-NEXT:    v_mov_b32_e32 v0, s4
25; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
26; GCN-NEXT:    v_mov_b32_e32 v5, s1
27; GCN-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
28; GCN-NEXT:    v_mov_b32_e32 v4, s0
29; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
30; GCN-NEXT:    s_endpgm
31entry:
32  %v = insertelement <4 x float> %vec, float 1.000000e+00, i32 %sel
33  store <4 x float> %v, <4 x float> addrspace(1)* %out
34  ret void
35}
36
37define amdgpu_kernel void @float4_inselt_undef(<4 x float> addrspace(1)* %out, i32 %sel) {
38; GCN-LABEL: float4_inselt_undef:
39; GCN:       ; %bb.0: ; %entry
40; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
41; GCN-NEXT:    v_mov_b32_e32 v0, 1.0
42; GCN-NEXT:    v_mov_b32_e32 v1, v0
43; GCN-NEXT:    v_mov_b32_e32 v2, v0
44; GCN-NEXT:    v_mov_b32_e32 v3, v0
45; GCN-NEXT:    s_waitcnt lgkmcnt(0)
46; GCN-NEXT:    v_mov_b32_e32 v5, s1
47; GCN-NEXT:    v_mov_b32_e32 v4, s0
48; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
49; GCN-NEXT:    s_endpgm
50entry:
51  %v = insertelement <4 x float> undef, float 1.000000e+00, i32 %sel
52  store <4 x float> %v, <4 x float> addrspace(1)* %out
53  ret void
54}
55
56define amdgpu_kernel void @int4_inselt(<4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %sel) {
57; GCN-LABEL: int4_inselt:
58; GCN:       ; %bb.0: ; %entry
59; GCN-NEXT:    s_load_dword s2, s[0:1], 0x44
60; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
61; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
62; GCN-NEXT:    s_waitcnt lgkmcnt(0)
63; GCN-NEXT:    s_cmp_lg_u32 s2, 3
64; GCN-NEXT:    s_cselect_b32 s3, s7, 1
65; GCN-NEXT:    s_cmp_lg_u32 s2, 2
66; GCN-NEXT:    s_cselect_b32 s6, s6, 1
67; GCN-NEXT:    s_cmp_lg_u32 s2, 1
68; GCN-NEXT:    s_cselect_b32 s5, s5, 1
69; GCN-NEXT:    s_cmp_lg_u32 s2, 0
70; GCN-NEXT:    s_cselect_b32 s2, s4, 1
71; GCN-NEXT:    v_mov_b32_e32 v5, s1
72; GCN-NEXT:    v_mov_b32_e32 v0, s2
73; GCN-NEXT:    v_mov_b32_e32 v1, s5
74; GCN-NEXT:    v_mov_b32_e32 v2, s6
75; GCN-NEXT:    v_mov_b32_e32 v3, s3
76; GCN-NEXT:    v_mov_b32_e32 v4, s0
77; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
78; GCN-NEXT:    s_endpgm
79entry:
80  %v = insertelement <4 x i32> %vec, i32 1, i32 %sel
81  store <4 x i32> %v, <4 x i32> addrspace(1)* %out
82  ret void
83}
84
85define amdgpu_kernel void @float2_inselt(<2 x float> addrspace(1)* %out, <2 x float> %vec, i32 %sel) {
86; GCN-LABEL: float2_inselt:
87; GCN:       ; %bb.0: ; %entry
88; GCN-NEXT:    s_load_dword s4, s[0:1], 0x34
89; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
90; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
91; GCN-NEXT:    s_waitcnt lgkmcnt(0)
92; GCN-NEXT:    s_cmp_lg_u32 s4, 1
93; GCN-NEXT:    v_mov_b32_e32 v0, s3
94; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
95; GCN-NEXT:    s_cmp_lg_u32 s4, 0
96; GCN-NEXT:    v_cndmask_b32_e32 v1, 1.0, v0, vcc
97; GCN-NEXT:    v_mov_b32_e32 v0, s2
98; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
99; GCN-NEXT:    v_mov_b32_e32 v3, s1
100; GCN-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
101; GCN-NEXT:    v_mov_b32_e32 v2, s0
102; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
103; GCN-NEXT:    s_endpgm
104entry:
105  %v = insertelement <2 x float> %vec, float 1.000000e+00, i32 %sel
106  store <2 x float> %v, <2 x float> addrspace(1)* %out
107  ret void
108}
109
110define amdgpu_kernel void @float8_inselt(<8 x float> addrspace(1)* %out, <8 x float> %vec, i32 %sel) {
111; GCN-LABEL: float8_inselt:
112; GCN:       ; %bb.0: ; %entry
113; GCN-NEXT:    s_load_dword s2, s[0:1], 0x64
114; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x44
115; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
116; GCN-NEXT:    s_waitcnt lgkmcnt(0)
117; GCN-NEXT:    s_cmp_lg_u32 s2, 3
118; GCN-NEXT:    v_mov_b32_e32 v0, s7
119; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
120; GCN-NEXT:    s_cmp_lg_u32 s2, 2
121; GCN-NEXT:    v_cndmask_b32_e32 v3, 1.0, v0, vcc
122; GCN-NEXT:    v_mov_b32_e32 v0, s6
123; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
124; GCN-NEXT:    s_cmp_lg_u32 s2, 1
125; GCN-NEXT:    v_cndmask_b32_e32 v2, 1.0, v0, vcc
126; GCN-NEXT:    v_mov_b32_e32 v0, s5
127; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
128; GCN-NEXT:    s_cmp_lg_u32 s2, 0
129; GCN-NEXT:    v_cndmask_b32_e32 v1, 1.0, v0, vcc
130; GCN-NEXT:    v_mov_b32_e32 v0, s4
131; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
132; GCN-NEXT:    s_cmp_lg_u32 s2, 7
133; GCN-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
134; GCN-NEXT:    v_mov_b32_e32 v4, s11
135; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
136; GCN-NEXT:    s_cmp_lg_u32 s2, 6
137; GCN-NEXT:    v_cndmask_b32_e32 v7, 1.0, v4, vcc
138; GCN-NEXT:    v_mov_b32_e32 v4, s10
139; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
140; GCN-NEXT:    s_cmp_lg_u32 s2, 5
141; GCN-NEXT:    v_cndmask_b32_e32 v6, 1.0, v4, vcc
142; GCN-NEXT:    v_mov_b32_e32 v4, s9
143; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
144; GCN-NEXT:    s_cmp_lg_u32 s2, 4
145; GCN-NEXT:    v_cndmask_b32_e32 v5, 1.0, v4, vcc
146; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
147; GCN-NEXT:    s_add_u32 s2, s0, 16
148; GCN-NEXT:    s_addc_u32 s3, s1, 0
149; GCN-NEXT:    v_mov_b32_e32 v4, s8
150; GCN-NEXT:    v_mov_b32_e32 v9, s3
151; GCN-NEXT:    v_cndmask_b32_e32 v4, 1.0, v4, vcc
152; GCN-NEXT:    v_mov_b32_e32 v8, s2
153; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
154; GCN-NEXT:    s_nop 0
155; GCN-NEXT:    v_mov_b32_e32 v5, s1
156; GCN-NEXT:    v_mov_b32_e32 v4, s0
157; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
158; GCN-NEXT:    s_endpgm
159entry:
160  %v = insertelement <8 x float> %vec, float 1.000000e+00, i32 %sel
161  store <8 x float> %v, <8 x float> addrspace(1)* %out
162  ret void
163}
164
165define amdgpu_kernel void @float16_inselt(<16 x float> addrspace(1)* %out, <16 x float> %vec, i32 %sel) {
166; GCN-LABEL: float16_inselt:
167; GCN:       ; %bb.0: ; %entry
168; GCN-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x64
169; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
170; GCN-NEXT:    s_load_dword s20, s[0:1], 0xa4
171; GCN-NEXT:    s_waitcnt lgkmcnt(0)
172; GCN-NEXT:    v_mov_b32_e32 v0, s4
173; GCN-NEXT:    s_add_u32 s0, s2, 48
174; GCN-NEXT:    s_addc_u32 s1, s3, 0
175; GCN-NEXT:    v_mov_b32_e32 v17, s1
176; GCN-NEXT:    v_mov_b32_e32 v1, s5
177; GCN-NEXT:    v_mov_b32_e32 v2, s6
178; GCN-NEXT:    v_mov_b32_e32 v3, s7
179; GCN-NEXT:    v_mov_b32_e32 v4, s8
180; GCN-NEXT:    v_mov_b32_e32 v5, s9
181; GCN-NEXT:    v_mov_b32_e32 v6, s10
182; GCN-NEXT:    v_mov_b32_e32 v7, s11
183; GCN-NEXT:    v_mov_b32_e32 v8, s12
184; GCN-NEXT:    v_mov_b32_e32 v9, s13
185; GCN-NEXT:    v_mov_b32_e32 v10, s14
186; GCN-NEXT:    v_mov_b32_e32 v11, s15
187; GCN-NEXT:    v_mov_b32_e32 v12, s16
188; GCN-NEXT:    v_mov_b32_e32 v13, s17
189; GCN-NEXT:    v_mov_b32_e32 v14, s18
190; GCN-NEXT:    v_mov_b32_e32 v15, s19
191; GCN-NEXT:    s_mov_b32 m0, s20
192; GCN-NEXT:    v_mov_b32_e32 v16, s0
193; GCN-NEXT:    s_add_u32 s0, s2, 32
194; GCN-NEXT:    v_movreld_b32_e32 v0, 1.0
195; GCN-NEXT:    s_addc_u32 s1, s3, 0
196; GCN-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
197; GCN-NEXT:    s_nop 0
198; GCN-NEXT:    v_mov_b32_e32 v13, s1
199; GCN-NEXT:    v_mov_b32_e32 v12, s0
200; GCN-NEXT:    s_add_u32 s0, s2, 16
201; GCN-NEXT:    s_addc_u32 s1, s3, 0
202; GCN-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
203; GCN-NEXT:    s_nop 0
204; GCN-NEXT:    v_mov_b32_e32 v9, s1
205; GCN-NEXT:    v_mov_b32_e32 v8, s0
206; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
207; GCN-NEXT:    s_nop 0
208; GCN-NEXT:    v_mov_b32_e32 v5, s3
209; GCN-NEXT:    v_mov_b32_e32 v4, s2
210; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
211; GCN-NEXT:    s_endpgm
212entry:
213  %v = insertelement <16 x float> %vec, float 1.000000e+00, i32 %sel
214  store <16 x float> %v, <16 x float> addrspace(1)* %out
215  ret void
216}
217
218define amdgpu_kernel void @float32_inselt(<32 x float> addrspace(1)* %out, <32 x float> %vec, i32 %sel) {
219; GCN-LABEL: float32_inselt:
220; GCN:       ; %bb.0: ; %entry
221; GCN-NEXT:    s_load_dwordx16 s[36:51], s[0:1], 0xa4
222; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
223; GCN-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0xe4
224; GCN-NEXT:    s_load_dword s0, s[0:1], 0x124
225; GCN-NEXT:    s_waitcnt lgkmcnt(0)
226; GCN-NEXT:    v_mov_b32_e32 v0, s36
227; GCN-NEXT:    v_mov_b32_e32 v1, s37
228; GCN-NEXT:    v_mov_b32_e32 v2, s38
229; GCN-NEXT:    s_mov_b32 m0, s0
230; GCN-NEXT:    s_add_u32 s0, s2, 0x70
231; GCN-NEXT:    s_addc_u32 s1, s3, 0
232; GCN-NEXT:    v_mov_b32_e32 v33, s1
233; GCN-NEXT:    v_mov_b32_e32 v3, s39
234; GCN-NEXT:    v_mov_b32_e32 v4, s40
235; GCN-NEXT:    v_mov_b32_e32 v5, s41
236; GCN-NEXT:    v_mov_b32_e32 v6, s42
237; GCN-NEXT:    v_mov_b32_e32 v7, s43
238; GCN-NEXT:    v_mov_b32_e32 v8, s44
239; GCN-NEXT:    v_mov_b32_e32 v9, s45
240; GCN-NEXT:    v_mov_b32_e32 v10, s46
241; GCN-NEXT:    v_mov_b32_e32 v11, s47
242; GCN-NEXT:    v_mov_b32_e32 v12, s48
243; GCN-NEXT:    v_mov_b32_e32 v13, s49
244; GCN-NEXT:    v_mov_b32_e32 v14, s50
245; GCN-NEXT:    v_mov_b32_e32 v15, s51
246; GCN-NEXT:    v_mov_b32_e32 v16, s4
247; GCN-NEXT:    v_mov_b32_e32 v17, s5
248; GCN-NEXT:    v_mov_b32_e32 v18, s6
249; GCN-NEXT:    v_mov_b32_e32 v19, s7
250; GCN-NEXT:    v_mov_b32_e32 v20, s8
251; GCN-NEXT:    v_mov_b32_e32 v21, s9
252; GCN-NEXT:    v_mov_b32_e32 v22, s10
253; GCN-NEXT:    v_mov_b32_e32 v23, s11
254; GCN-NEXT:    v_mov_b32_e32 v24, s12
255; GCN-NEXT:    v_mov_b32_e32 v25, s13
256; GCN-NEXT:    v_mov_b32_e32 v26, s14
257; GCN-NEXT:    v_mov_b32_e32 v27, s15
258; GCN-NEXT:    v_mov_b32_e32 v28, s16
259; GCN-NEXT:    v_mov_b32_e32 v29, s17
260; GCN-NEXT:    v_mov_b32_e32 v30, s18
261; GCN-NEXT:    v_mov_b32_e32 v31, s19
262; GCN-NEXT:    v_mov_b32_e32 v32, s0
263; GCN-NEXT:    s_add_u32 s0, s2, 0x60
264; GCN-NEXT:    v_movreld_b32_e32 v0, 1.0
265; GCN-NEXT:    s_addc_u32 s1, s3, 0
266; GCN-NEXT:    flat_store_dwordx4 v[32:33], v[28:31]
267; GCN-NEXT:    s_nop 0
268; GCN-NEXT:    v_mov_b32_e32 v29, s1
269; GCN-NEXT:    v_mov_b32_e32 v28, s0
270; GCN-NEXT:    s_add_u32 s0, s2, 0x50
271; GCN-NEXT:    s_addc_u32 s1, s3, 0
272; GCN-NEXT:    flat_store_dwordx4 v[28:29], v[24:27]
273; GCN-NEXT:    s_nop 0
274; GCN-NEXT:    v_mov_b32_e32 v25, s1
275; GCN-NEXT:    v_mov_b32_e32 v24, s0
276; GCN-NEXT:    s_add_u32 s0, s2, 64
277; GCN-NEXT:    s_addc_u32 s1, s3, 0
278; GCN-NEXT:    flat_store_dwordx4 v[24:25], v[20:23]
279; GCN-NEXT:    s_nop 0
280; GCN-NEXT:    v_mov_b32_e32 v21, s1
281; GCN-NEXT:    v_mov_b32_e32 v20, s0
282; GCN-NEXT:    s_add_u32 s0, s2, 48
283; GCN-NEXT:    s_addc_u32 s1, s3, 0
284; GCN-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
285; GCN-NEXT:    s_nop 0
286; GCN-NEXT:    v_mov_b32_e32 v17, s1
287; GCN-NEXT:    v_mov_b32_e32 v16, s0
288; GCN-NEXT:    s_add_u32 s0, s2, 32
289; GCN-NEXT:    s_addc_u32 s1, s3, 0
290; GCN-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
291; GCN-NEXT:    s_nop 0
292; GCN-NEXT:    v_mov_b32_e32 v13, s1
293; GCN-NEXT:    v_mov_b32_e32 v12, s0
294; GCN-NEXT:    s_add_u32 s0, s2, 16
295; GCN-NEXT:    s_addc_u32 s1, s3, 0
296; GCN-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
297; GCN-NEXT:    s_nop 0
298; GCN-NEXT:    v_mov_b32_e32 v9, s1
299; GCN-NEXT:    v_mov_b32_e32 v8, s0
300; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
301; GCN-NEXT:    s_nop 0
302; GCN-NEXT:    v_mov_b32_e32 v5, s3
303; GCN-NEXT:    v_mov_b32_e32 v4, s2
304; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
305; GCN-NEXT:    s_endpgm
306entry:
307  %v = insertelement <32 x float> %vec, float 1.000000e+00, i32 %sel
308  store <32 x float> %v, <32 x float> addrspace(1)* %out
309  ret void
310}
311
312define amdgpu_kernel void @half4_inselt(<4 x half> addrspace(1)* %out, <4 x half> %vec, i32 %sel) {
313; GCN-LABEL: half4_inselt:
314; GCN:       ; %bb.0: ; %entry
315; GCN-NEXT:    s_load_dword s6, s[0:1], 0x34
316; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
317; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
318; GCN-NEXT:    s_mov_b64 s[4:5], 0xffff
319; GCN-NEXT:    s_waitcnt lgkmcnt(0)
320; GCN-NEXT:    s_lshl_b32 s6, s6, 4
321; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
322; GCN-NEXT:    s_mov_b32 s6, 0x3c003c00
323; GCN-NEXT:    s_mov_b32 s7, s6
324; GCN-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
325; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
326; GCN-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
327; GCN-NEXT:    v_mov_b32_e32 v0, s2
328; GCN-NEXT:    v_mov_b32_e32 v3, s1
329; GCN-NEXT:    v_mov_b32_e32 v1, s3
330; GCN-NEXT:    v_mov_b32_e32 v2, s0
331; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
332; GCN-NEXT:    s_endpgm
333entry:
334  %v = insertelement <4 x half> %vec, half 1.000000e+00, i32 %sel
335  store <4 x half> %v, <4 x half> addrspace(1)* %out
336  ret void
337}
338
339define amdgpu_kernel void @half2_inselt(<2 x half> addrspace(1)* %out, <2 x half> %vec, i32 %sel) {
340; GCN-LABEL: half2_inselt:
341; GCN:       ; %bb.0: ; %entry
342; GCN-NEXT:    s_load_dword s2, s[0:1], 0x30
343; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
344; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
345; GCN-NEXT:    s_waitcnt lgkmcnt(0)
346; GCN-NEXT:    s_lshl_b32 s2, s2, 4
347; GCN-NEXT:    s_lshl_b32 s2, 0xffff, s2
348; GCN-NEXT:    s_andn2_b32 s3, s3, s2
349; GCN-NEXT:    s_and_b32 s2, s2, 0x3c003c00
350; GCN-NEXT:    s_or_b32 s2, s2, s3
351; GCN-NEXT:    v_mov_b32_e32 v0, s0
352; GCN-NEXT:    v_mov_b32_e32 v1, s1
353; GCN-NEXT:    v_mov_b32_e32 v2, s2
354; GCN-NEXT:    flat_store_dword v[0:1], v2
355; GCN-NEXT:    s_endpgm
356entry:
357  %v = insertelement <2 x half> %vec, half 1.000000e+00, i32 %sel
358  store <2 x half> %v, <2 x half> addrspace(1)* %out
359  ret void
360}
361
362define amdgpu_kernel void @half8_inselt(<8 x half> addrspace(1)* %out, <8 x half> %vec, i32 %sel) {
363; GCN-LABEL: half8_inselt:
364; GCN:       ; %bb.0: ; %entry
365; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
366; GCN-NEXT:    s_load_dword s2, s[0:1], 0x44
367; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
368; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c00
369; GCN-NEXT:    s_waitcnt lgkmcnt(0)
370; GCN-NEXT:    s_lshr_b32 s3, s7, 16
371; GCN-NEXT:    s_cmp_lg_u32 s2, 7
372; GCN-NEXT:    v_mov_b32_e32 v1, s3
373; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
374; GCN-NEXT:    s_cmp_lg_u32 s2, 6
375; GCN-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
376; GCN-NEXT:    v_mov_b32_e32 v2, s7
377; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
378; GCN-NEXT:    s_lshr_b32 s3, s6, 16
379; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
380; GCN-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
381; GCN-NEXT:    s_cmp_lg_u32 s2, 5
382; GCN-NEXT:    v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
383; GCN-NEXT:    v_mov_b32_e32 v1, s3
384; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
385; GCN-NEXT:    s_cmp_lg_u32 s2, 4
386; GCN-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
387; GCN-NEXT:    v_mov_b32_e32 v2, s6
388; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
389; GCN-NEXT:    s_lshr_b32 s3, s5, 16
390; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
391; GCN-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
392; GCN-NEXT:    s_cmp_lg_u32 s2, 3
393; GCN-NEXT:    v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
394; GCN-NEXT:    v_mov_b32_e32 v1, s3
395; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
396; GCN-NEXT:    s_cmp_lg_u32 s2, 2
397; GCN-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
398; GCN-NEXT:    v_mov_b32_e32 v4, s5
399; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
400; GCN-NEXT:    s_lshr_b32 s3, s4, 16
401; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
402; GCN-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
403; GCN-NEXT:    s_cmp_lg_u32 s2, 1
404; GCN-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
405; GCN-NEXT:    v_mov_b32_e32 v4, s3
406; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
407; GCN-NEXT:    s_cmp_lg_u32 s2, 0
408; GCN-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
409; GCN-NEXT:    v_mov_b32_e32 v5, s4
410; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
411; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
412; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
413; GCN-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
414; GCN-NEXT:    v_mov_b32_e32 v5, s1
415; GCN-NEXT:    v_mov_b32_e32 v4, s0
416; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
417; GCN-NEXT:    s_endpgm
418entry:
419  %v = insertelement <8 x half> %vec, half 1.000000e+00, i32 %sel
420  store <8 x half> %v, <8 x half> addrspace(1)* %out
421  ret void
422}
423
424define amdgpu_kernel void @short2_inselt(<2 x i16> addrspace(1)* %out, <2 x i16> %vec, i32 %sel) {
425; GCN-LABEL: short2_inselt:
426; GCN:       ; %bb.0: ; %entry
427; GCN-NEXT:    s_load_dword s2, s[0:1], 0x30
428; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
429; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
430; GCN-NEXT:    s_waitcnt lgkmcnt(0)
431; GCN-NEXT:    s_lshl_b32 s2, s2, 4
432; GCN-NEXT:    s_lshl_b32 s2, 0xffff, s2
433; GCN-NEXT:    s_andn2_b32 s3, s3, s2
434; GCN-NEXT:    s_and_b32 s2, s2, 0x10001
435; GCN-NEXT:    s_or_b32 s2, s2, s3
436; GCN-NEXT:    v_mov_b32_e32 v0, s0
437; GCN-NEXT:    v_mov_b32_e32 v1, s1
438; GCN-NEXT:    v_mov_b32_e32 v2, s2
439; GCN-NEXT:    flat_store_dword v[0:1], v2
440; GCN-NEXT:    s_endpgm
441entry:
442  %v = insertelement <2 x i16> %vec, i16 1, i32 %sel
443  store <2 x i16> %v, <2 x i16> addrspace(1)* %out
444  ret void
445}
446
447define amdgpu_kernel void @short4_inselt(<4 x i16> addrspace(1)* %out, <4 x i16> %vec, i32 %sel) {
448; GCN-LABEL: short4_inselt:
449; GCN:       ; %bb.0: ; %entry
450; GCN-NEXT:    s_load_dword s6, s[0:1], 0x34
451; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
452; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
453; GCN-NEXT:    s_mov_b64 s[4:5], 0xffff
454; GCN-NEXT:    s_waitcnt lgkmcnt(0)
455; GCN-NEXT:    s_lshl_b32 s6, s6, 4
456; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
457; GCN-NEXT:    s_mov_b32 s6, 0x10001
458; GCN-NEXT:    s_mov_b32 s7, s6
459; GCN-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
460; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
461; GCN-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
462; GCN-NEXT:    v_mov_b32_e32 v0, s2
463; GCN-NEXT:    v_mov_b32_e32 v3, s1
464; GCN-NEXT:    v_mov_b32_e32 v1, s3
465; GCN-NEXT:    v_mov_b32_e32 v2, s0
466; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
467; GCN-NEXT:    s_endpgm
468entry:
469  %v = insertelement <4 x i16> %vec, i16 1, i32 %sel
470  store <4 x i16> %v, <4 x i16> addrspace(1)* %out
471  ret void
472}
473
474define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %vec, i32 %sel) {
475; GCN-LABEL: byte8_inselt:
476; GCN:       ; %bb.0: ; %entry
477; GCN-NEXT:    s_load_dword s6, s[0:1], 0x34
478; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
479; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
480; GCN-NEXT:    s_mov_b64 s[4:5], 0xffff
481; GCN-NEXT:    s_waitcnt lgkmcnt(0)
482; GCN-NEXT:    s_lshl_b32 s6, s6, 3
483; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
484; GCN-NEXT:    s_mov_b32 s6, 0x1010101
485; GCN-NEXT:    s_and_b32 s7, s5, s6
486; GCN-NEXT:    s_and_b32 s6, s4, s6
487; GCN-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
488; GCN-NEXT:    s_or_b64 s[0:1], s[6:7], s[0:1]
489; GCN-NEXT:    v_mov_b32_e32 v2, s2
490; GCN-NEXT:    v_mov_b32_e32 v0, s0
491; GCN-NEXT:    v_mov_b32_e32 v1, s1
492; GCN-NEXT:    v_mov_b32_e32 v3, s3
493; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
494; GCN-NEXT:    s_endpgm
495entry:
496  %v = insertelement <8 x i8> %vec, i8 1, i32 %sel
497  store <8 x i8> %v, <8 x i8> addrspace(1)* %out
498  ret void
499}
500
501define amdgpu_kernel void @byte16_inselt(<16 x i8> addrspace(1)* %out, <16 x i8> %vec, i32 %sel) {
502; GCN-LABEL: byte16_inselt:
503; GCN:       ; %bb.0: ; %entry
504; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
505; GCN-NEXT:    s_load_dword s2, s[0:1], 0x44
506; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
507; GCN-NEXT:    s_waitcnt lgkmcnt(0)
508; GCN-NEXT:    s_lshr_b32 s3, s7, 24
509; GCN-NEXT:    s_cmp_lg_u32 s2, 15
510; GCN-NEXT:    v_mov_b32_e32 v0, s3
511; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
512; GCN-NEXT:    s_lshr_b32 s3, s7, 16
513; GCN-NEXT:    s_cmp_lg_u32 s2, 14
514; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
515; GCN-NEXT:    v_mov_b32_e32 v1, s3
516; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
517; GCN-NEXT:    s_lshr_b32 s3, s7, 8
518; GCN-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
519; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
520; GCN-NEXT:    s_cmp_lg_u32 s2, 13
521; GCN-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
522; GCN-NEXT:    v_mov_b32_e32 v1, s3
523; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
524; GCN-NEXT:    s_cmp_lg_u32 s2, 12
525; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
526; GCN-NEXT:    v_mov_b32_e32 v2, s7
527; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
528; GCN-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
529; GCN-NEXT:    v_cndmask_b32_e32 v2, 1, v2, vcc
530; GCN-NEXT:    s_lshr_b32 s3, s6, 24
531; GCN-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
532; GCN-NEXT:    s_cmp_lg_u32 s2, 11
533; GCN-NEXT:    v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
534; GCN-NEXT:    v_mov_b32_e32 v0, s3
535; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
536; GCN-NEXT:    s_lshr_b32 s3, s6, 16
537; GCN-NEXT:    s_cmp_lg_u32 s2, 10
538; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
539; GCN-NEXT:    v_mov_b32_e32 v1, s3
540; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
541; GCN-NEXT:    s_lshr_b32 s3, s6, 8
542; GCN-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
543; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
544; GCN-NEXT:    s_cmp_lg_u32 s2, 9
545; GCN-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
546; GCN-NEXT:    v_mov_b32_e32 v1, s3
547; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
548; GCN-NEXT:    s_cmp_lg_u32 s2, 8
549; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
550; GCN-NEXT:    v_mov_b32_e32 v2, s6
551; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
552; GCN-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
553; GCN-NEXT:    v_cndmask_b32_e32 v2, 1, v2, vcc
554; GCN-NEXT:    s_lshr_b32 s3, s5, 24
555; GCN-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
556; GCN-NEXT:    s_cmp_lg_u32 s2, 7
557; GCN-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
558; GCN-NEXT:    v_mov_b32_e32 v0, s3
559; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
560; GCN-NEXT:    s_lshr_b32 s3, s5, 16
561; GCN-NEXT:    s_cmp_lg_u32 s2, 6
562; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
563; GCN-NEXT:    v_mov_b32_e32 v1, s3
564; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
565; GCN-NEXT:    s_lshr_b32 s3, s5, 8
566; GCN-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
567; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
568; GCN-NEXT:    s_cmp_lg_u32 s2, 5
569; GCN-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
570; GCN-NEXT:    v_mov_b32_e32 v1, s3
571; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
572; GCN-NEXT:    s_cmp_lg_u32 s2, 4
573; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
574; GCN-NEXT:    v_mov_b32_e32 v4, s5
575; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
576; GCN-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
577; GCN-NEXT:    v_cndmask_b32_e32 v4, 1, v4, vcc
578; GCN-NEXT:    s_lshr_b32 s3, s4, 24
579; GCN-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
580; GCN-NEXT:    s_cmp_lg_u32 s2, 3
581; GCN-NEXT:    v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
582; GCN-NEXT:    v_mov_b32_e32 v0, s3
583; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
584; GCN-NEXT:    s_lshr_b32 s3, s4, 16
585; GCN-NEXT:    s_cmp_lg_u32 s2, 2
586; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
587; GCN-NEXT:    v_mov_b32_e32 v4, s3
588; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
589; GCN-NEXT:    s_lshr_b32 s3, s4, 8
590; GCN-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
591; GCN-NEXT:    v_cndmask_b32_e32 v4, 1, v4, vcc
592; GCN-NEXT:    s_cmp_lg_u32 s2, 1
593; GCN-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
594; GCN-NEXT:    v_mov_b32_e32 v4, s3
595; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
596; GCN-NEXT:    s_cmp_lg_u32 s2, 0
597; GCN-NEXT:    v_cndmask_b32_e32 v4, 1, v4, vcc
598; GCN-NEXT:    v_mov_b32_e32 v5, s4
599; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
600; GCN-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
601; GCN-NEXT:    v_cndmask_b32_e32 v5, 1, v5, vcc
602; GCN-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
603; GCN-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
604; GCN-NEXT:    v_mov_b32_e32 v5, s1
605; GCN-NEXT:    v_mov_b32_e32 v4, s0
606; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
607; GCN-NEXT:    s_endpgm
608entry:
609  %v = insertelement <16 x i8> %vec, i8 1, i32 %sel
610  store <16 x i8> %v, <16 x i8> addrspace(1)* %out
611  ret void
612}
613
614define amdgpu_kernel void @double2_inselt(<2 x double> addrspace(1)* %out, <2 x double> %vec, i32 %sel) {
615; GCN-LABEL: double2_inselt:
616; GCN:       ; %bb.0: ; %entry
617; GCN-NEXT:    s_load_dword s2, s[0:1], 0x44
618; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
619; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
620; GCN-NEXT:    v_mov_b32_e32 v0, 0x3ff00000
621; GCN-NEXT:    s_waitcnt lgkmcnt(0)
622; GCN-NEXT:    s_cmp_eq_u32 s2, 1
623; GCN-NEXT:    v_mov_b32_e32 v1, s7
624; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
625; GCN-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
626; GCN-NEXT:    v_mov_b32_e32 v1, s6
627; GCN-NEXT:    s_cmp_eq_u32 s2, 0
628; GCN-NEXT:    v_cndmask_b32_e64 v2, v1, 0, vcc
629; GCN-NEXT:    v_mov_b32_e32 v1, s5
630; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
631; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
632; GCN-NEXT:    v_mov_b32_e32 v0, s4
633; GCN-NEXT:    v_mov_b32_e32 v5, s1
634; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
635; GCN-NEXT:    v_mov_b32_e32 v4, s0
636; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
637; GCN-NEXT:    s_endpgm
638entry:
639  %v = insertelement <2 x double> %vec, double 1.000000e+00, i32 %sel
640  store <2 x double> %v, <2 x double> addrspace(1)* %out
641  ret void
642}
643
644define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x double> %vec, i32 %sel) {
645; GCN-LABEL: double5_inselt:
646; GCN:       ; %bb.0: ; %entry
647; GCN-NEXT:    s_load_dword s12, s[0:1], 0xa4
648; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x84
649; GCN-NEXT:    s_load_dwordx2 s[10:11], s[0:1], 0x24
650; GCN-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x64
651; GCN-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
652; GCN-NEXT:    s_waitcnt lgkmcnt(0)
653; GCN-NEXT:    s_cmp_eq_u32 s12, 4
654; GCN-NEXT:    v_mov_b32_e32 v0, s9
655; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
656; GCN-NEXT:    v_cndmask_b32_e32 v9, v0, v4, vcc
657; GCN-NEXT:    v_mov_b32_e32 v0, s8
658; GCN-NEXT:    s_cmp_eq_u32 s12, 1
659; GCN-NEXT:    v_cndmask_b32_e64 v8, v0, 0, vcc
660; GCN-NEXT:    v_mov_b32_e32 v0, s3
661; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
662; GCN-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
663; GCN-NEXT:    v_mov_b32_e32 v0, s2
664; GCN-NEXT:    s_cmp_eq_u32 s12, 0
665; GCN-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
666; GCN-NEXT:    v_mov_b32_e32 v0, s1
667; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
668; GCN-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
669; GCN-NEXT:    v_mov_b32_e32 v0, s0
670; GCN-NEXT:    s_cmp_eq_u32 s12, 3
671; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
672; GCN-NEXT:    v_mov_b32_e32 v5, s7
673; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
674; GCN-NEXT:    v_cndmask_b32_e32 v7, v5, v4, vcc
675; GCN-NEXT:    v_mov_b32_e32 v5, s6
676; GCN-NEXT:    s_cmp_eq_u32 s12, 2
677; GCN-NEXT:    v_cndmask_b32_e64 v6, v5, 0, vcc
678; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
679; GCN-NEXT:    s_add_u32 s0, s10, 16
680; GCN-NEXT:    v_mov_b32_e32 v5, s5
681; GCN-NEXT:    s_addc_u32 s1, s11, 0
682; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
683; GCN-NEXT:    v_mov_b32_e32 v4, s4
684; GCN-NEXT:    v_mov_b32_e32 v11, s1
685; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
686; GCN-NEXT:    v_mov_b32_e32 v10, s0
687; GCN-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
688; GCN-NEXT:    s_add_u32 s0, s10, 32
689; GCN-NEXT:    v_mov_b32_e32 v4, s10
690; GCN-NEXT:    v_mov_b32_e32 v5, s11
691; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
692; GCN-NEXT:    s_addc_u32 s1, s11, 0
693; GCN-NEXT:    v_mov_b32_e32 v0, s0
694; GCN-NEXT:    v_mov_b32_e32 v1, s1
695; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[8:9]
696; GCN-NEXT:    s_endpgm
697entry:
698  %v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel
699  store <5 x double> %v, <5 x double> addrspace(1)* %out
700  ret void
701}
702
703define amdgpu_kernel void @double8_inselt(<8 x double> addrspace(1)* %out, <8 x double> %vec, i32 %sel) {
704; GCN-LABEL: double8_inselt:
705; GCN:       ; %bb.0: ; %entry
706; GCN-NEXT:    s_load_dword s2, s[0:1], 0xa4
707; GCN-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x64
708; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
709; GCN-NEXT:    v_mov_b32_e32 v16, 0x3ff00000
710; GCN-NEXT:    s_waitcnt lgkmcnt(0)
711; GCN-NEXT:    s_lshl_b32 s2, s2, 1
712; GCN-NEXT:    v_mov_b32_e32 v0, s4
713; GCN-NEXT:    v_mov_b32_e32 v1, s5
714; GCN-NEXT:    v_mov_b32_e32 v2, s6
715; GCN-NEXT:    v_mov_b32_e32 v3, s7
716; GCN-NEXT:    v_mov_b32_e32 v4, s8
717; GCN-NEXT:    v_mov_b32_e32 v5, s9
718; GCN-NEXT:    v_mov_b32_e32 v6, s10
719; GCN-NEXT:    v_mov_b32_e32 v7, s11
720; GCN-NEXT:    v_mov_b32_e32 v8, s12
721; GCN-NEXT:    v_mov_b32_e32 v9, s13
722; GCN-NEXT:    v_mov_b32_e32 v10, s14
723; GCN-NEXT:    v_mov_b32_e32 v11, s15
724; GCN-NEXT:    v_mov_b32_e32 v12, s16
725; GCN-NEXT:    v_mov_b32_e32 v13, s17
726; GCN-NEXT:    v_mov_b32_e32 v14, s18
727; GCN-NEXT:    v_mov_b32_e32 v15, s19
728; GCN-NEXT:    s_mov_b32 m0, s2
729; GCN-NEXT:    s_add_u32 s2, s0, 48
730; GCN-NEXT:    v_movreld_b32_e32 v0, 0
731; GCN-NEXT:    s_addc_u32 s3, s1, 0
732; GCN-NEXT:    v_movreld_b32_e32 v1, v16
733; GCN-NEXT:    v_mov_b32_e32 v17, s3
734; GCN-NEXT:    v_mov_b32_e32 v16, s2
735; GCN-NEXT:    s_add_u32 s2, s0, 32
736; GCN-NEXT:    s_addc_u32 s3, s1, 0
737; GCN-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
738; GCN-NEXT:    s_nop 0
739; GCN-NEXT:    v_mov_b32_e32 v13, s3
740; GCN-NEXT:    v_mov_b32_e32 v12, s2
741; GCN-NEXT:    s_add_u32 s2, s0, 16
742; GCN-NEXT:    s_addc_u32 s3, s1, 0
743; GCN-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
744; GCN-NEXT:    s_nop 0
745; GCN-NEXT:    v_mov_b32_e32 v9, s3
746; GCN-NEXT:    v_mov_b32_e32 v8, s2
747; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
748; GCN-NEXT:    s_nop 0
749; GCN-NEXT:    v_mov_b32_e32 v5, s1
750; GCN-NEXT:    v_mov_b32_e32 v4, s0
751; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
752; GCN-NEXT:    s_endpgm
753entry:
754  %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel
755  store <8 x double> %v, <8 x double> addrspace(1)* %out
756  ret void
757}
758
759define amdgpu_kernel void @double7_inselt(<7 x double> addrspace(1)* %out, <7 x double> %vec, i32 %sel) {
760; GCN-LABEL: double7_inselt:
761; GCN:       ; %bb.0: ; %entry
762; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x64
763; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
764; GCN-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x94
765; GCN-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x84
766; GCN-NEXT:    s_load_dword s0, s[0:1], 0xa4
767; GCN-NEXT:    s_waitcnt lgkmcnt(0)
768; GCN-NEXT:    v_mov_b32_e32 v0, s4
769; GCN-NEXT:    v_mov_b32_e32 v1, s5
770; GCN-NEXT:    v_mov_b32_e32 v2, s6
771; GCN-NEXT:    v_mov_b32_e32 v3, s7
772; GCN-NEXT:    s_lshl_b32 s0, s0, 1
773; GCN-NEXT:    v_mov_b32_e32 v4, s8
774; GCN-NEXT:    v_mov_b32_e32 v5, s9
775; GCN-NEXT:    v_mov_b32_e32 v6, s10
776; GCN-NEXT:    v_mov_b32_e32 v7, s11
777; GCN-NEXT:    v_mov_b32_e32 v8, s12
778; GCN-NEXT:    v_mov_b32_e32 v9, s13
779; GCN-NEXT:    v_mov_b32_e32 v10, s14
780; GCN-NEXT:    v_mov_b32_e32 v11, s15
781; GCN-NEXT:    v_mov_b32_e32 v12, s16
782; GCN-NEXT:    v_mov_b32_e32 v13, s17
783; GCN-NEXT:    s_mov_b32 m0, s0
784; GCN-NEXT:    v_movreld_b32_e32 v0, 0
785; GCN-NEXT:    v_mov_b32_e32 v16, 0x3ff00000
786; GCN-NEXT:    s_add_u32 s0, s2, 16
787; GCN-NEXT:    v_movreld_b32_e32 v1, v16
788; GCN-NEXT:    s_addc_u32 s1, s3, 0
789; GCN-NEXT:    v_mov_b32_e32 v15, s1
790; GCN-NEXT:    v_mov_b32_e32 v14, s0
791; GCN-NEXT:    flat_store_dwordx4 v[14:15], v[4:7]
792; GCN-NEXT:    s_add_u32 s0, s2, 48
793; GCN-NEXT:    v_mov_b32_e32 v5, s3
794; GCN-NEXT:    v_mov_b32_e32 v4, s2
795; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
796; GCN-NEXT:    s_addc_u32 s1, s3, 0
797; GCN-NEXT:    v_mov_b32_e32 v0, s0
798; GCN-NEXT:    v_mov_b32_e32 v1, s1
799; GCN-NEXT:    s_add_u32 s0, s2, 32
800; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[12:13]
801; GCN-NEXT:    s_addc_u32 s1, s3, 0
802; GCN-NEXT:    v_mov_b32_e32 v0, s0
803; GCN-NEXT:    v_mov_b32_e32 v1, s1
804; GCN-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
805; GCN-NEXT:    s_endpgm
806entry:
807  %v = insertelement <7 x double> %vec, double 1.000000e+00, i32 %sel
808  store <7 x double> %v, <7 x double> addrspace(1)* %out
809  ret void
810}
811
812define amdgpu_kernel void @double16_inselt(<16 x double> addrspace(1)* %out, <16 x double> %vec, i32 %sel) {
813; GCN-LABEL: double16_inselt:
814; GCN:       ; %bb.0: ; %entry
815; GCN-NEXT:    s_load_dword s2, s[0:1], 0x124
816; GCN-NEXT:    s_load_dwordx16 s[36:51], s[0:1], 0xa4
817; GCN-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0xe4
818; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
819; GCN-NEXT:    v_mov_b32_e32 v32, 0x3ff00000
820; GCN-NEXT:    s_waitcnt lgkmcnt(0)
821; GCN-NEXT:    v_mov_b32_e32 v0, s36
822; GCN-NEXT:    s_lshl_b32 s2, s2, 1
823; GCN-NEXT:    v_mov_b32_e32 v1, s37
824; GCN-NEXT:    v_mov_b32_e32 v2, s38
825; GCN-NEXT:    v_mov_b32_e32 v3, s39
826; GCN-NEXT:    v_mov_b32_e32 v4, s40
827; GCN-NEXT:    v_mov_b32_e32 v5, s41
828; GCN-NEXT:    v_mov_b32_e32 v6, s42
829; GCN-NEXT:    v_mov_b32_e32 v7, s43
830; GCN-NEXT:    v_mov_b32_e32 v8, s44
831; GCN-NEXT:    v_mov_b32_e32 v9, s45
832; GCN-NEXT:    v_mov_b32_e32 v10, s46
833; GCN-NEXT:    v_mov_b32_e32 v11, s47
834; GCN-NEXT:    v_mov_b32_e32 v12, s48
835; GCN-NEXT:    v_mov_b32_e32 v13, s49
836; GCN-NEXT:    v_mov_b32_e32 v14, s50
837; GCN-NEXT:    v_mov_b32_e32 v15, s51
838; GCN-NEXT:    v_mov_b32_e32 v16, s4
839; GCN-NEXT:    v_mov_b32_e32 v17, s5
840; GCN-NEXT:    v_mov_b32_e32 v18, s6
841; GCN-NEXT:    v_mov_b32_e32 v19, s7
842; GCN-NEXT:    v_mov_b32_e32 v20, s8
843; GCN-NEXT:    v_mov_b32_e32 v21, s9
844; GCN-NEXT:    v_mov_b32_e32 v22, s10
845; GCN-NEXT:    v_mov_b32_e32 v23, s11
846; GCN-NEXT:    v_mov_b32_e32 v24, s12
847; GCN-NEXT:    v_mov_b32_e32 v25, s13
848; GCN-NEXT:    v_mov_b32_e32 v26, s14
849; GCN-NEXT:    v_mov_b32_e32 v27, s15
850; GCN-NEXT:    v_mov_b32_e32 v28, s16
851; GCN-NEXT:    v_mov_b32_e32 v29, s17
852; GCN-NEXT:    v_mov_b32_e32 v30, s18
853; GCN-NEXT:    v_mov_b32_e32 v31, s19
854; GCN-NEXT:    s_mov_b32 m0, s2
855; GCN-NEXT:    s_add_u32 s2, s0, 0x70
856; GCN-NEXT:    v_movreld_b32_e32 v0, 0
857; GCN-NEXT:    s_addc_u32 s3, s1, 0
858; GCN-NEXT:    v_movreld_b32_e32 v1, v32
859; GCN-NEXT:    v_mov_b32_e32 v33, s3
860; GCN-NEXT:    v_mov_b32_e32 v32, s2
861; GCN-NEXT:    s_add_u32 s2, s0, 0x60
862; GCN-NEXT:    s_addc_u32 s3, s1, 0
863; GCN-NEXT:    flat_store_dwordx4 v[32:33], v[28:31]
864; GCN-NEXT:    s_nop 0
865; GCN-NEXT:    v_mov_b32_e32 v29, s3
866; GCN-NEXT:    v_mov_b32_e32 v28, s2
867; GCN-NEXT:    s_add_u32 s2, s0, 0x50
868; GCN-NEXT:    s_addc_u32 s3, s1, 0
869; GCN-NEXT:    flat_store_dwordx4 v[28:29], v[24:27]
870; GCN-NEXT:    s_nop 0
871; GCN-NEXT:    v_mov_b32_e32 v25, s3
872; GCN-NEXT:    v_mov_b32_e32 v24, s2
873; GCN-NEXT:    s_add_u32 s2, s0, 64
874; GCN-NEXT:    s_addc_u32 s3, s1, 0
875; GCN-NEXT:    flat_store_dwordx4 v[24:25], v[20:23]
876; GCN-NEXT:    s_nop 0
877; GCN-NEXT:    v_mov_b32_e32 v21, s3
878; GCN-NEXT:    v_mov_b32_e32 v20, s2
879; GCN-NEXT:    s_add_u32 s2, s0, 48
880; GCN-NEXT:    s_addc_u32 s3, s1, 0
881; GCN-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
882; GCN-NEXT:    s_nop 0
883; GCN-NEXT:    v_mov_b32_e32 v17, s3
884; GCN-NEXT:    v_mov_b32_e32 v16, s2
885; GCN-NEXT:    s_add_u32 s2, s0, 32
886; GCN-NEXT:    s_addc_u32 s3, s1, 0
887; GCN-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
888; GCN-NEXT:    s_nop 0
889; GCN-NEXT:    v_mov_b32_e32 v13, s3
890; GCN-NEXT:    v_mov_b32_e32 v12, s2
891; GCN-NEXT:    s_add_u32 s2, s0, 16
892; GCN-NEXT:    s_addc_u32 s3, s1, 0
893; GCN-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
894; GCN-NEXT:    s_nop 0
895; GCN-NEXT:    v_mov_b32_e32 v9, s3
896; GCN-NEXT:    v_mov_b32_e32 v8, s2
897; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
898; GCN-NEXT:    s_nop 0
899; GCN-NEXT:    v_mov_b32_e32 v5, s1
900; GCN-NEXT:    v_mov_b32_e32 v4, s0
901; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
902; GCN-NEXT:    s_endpgm
903entry:
904  %v = insertelement <16 x double> %vec, double 1.000000e+00, i32 %sel
905  store <16 x double> %v, <16 x double> addrspace(1)* %out
906  ret void
907}
908
909define amdgpu_kernel void @double15_inselt(<15 x double> addrspace(1)* %out, <15 x double> %vec, i32 %sel) {
910; GCN-LABEL: double15_inselt:
911; GCN:       ; %bb.0: ; %entry
912; GCN-NEXT:    s_load_dwordx16 s[8:23], s[0:1], 0xa4
913; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x114
914; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x104
915; GCN-NEXT:    s_load_dwordx8 s[24:31], s[0:1], 0xe4
916; GCN-NEXT:    v_mov_b32_e32 v32, 0x3ff00000
917; GCN-NEXT:    s_waitcnt lgkmcnt(0)
918; GCN-NEXT:    v_mov_b32_e32 v0, s8
919; GCN-NEXT:    v_mov_b32_e32 v28, s2
920; GCN-NEXT:    v_mov_b32_e32 v24, s4
921; GCN-NEXT:    s_load_dword s4, s[0:1], 0x124
922; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
923; GCN-NEXT:    v_mov_b32_e32 v1, s9
924; GCN-NEXT:    v_mov_b32_e32 v2, s10
925; GCN-NEXT:    v_mov_b32_e32 v3, s11
926; GCN-NEXT:    s_waitcnt lgkmcnt(0)
927; GCN-NEXT:    s_lshl_b32 s2, s4, 1
928; GCN-NEXT:    v_mov_b32_e32 v4, s12
929; GCN-NEXT:    v_mov_b32_e32 v5, s13
930; GCN-NEXT:    v_mov_b32_e32 v6, s14
931; GCN-NEXT:    v_mov_b32_e32 v7, s15
932; GCN-NEXT:    v_mov_b32_e32 v8, s16
933; GCN-NEXT:    v_mov_b32_e32 v9, s17
934; GCN-NEXT:    v_mov_b32_e32 v10, s18
935; GCN-NEXT:    v_mov_b32_e32 v11, s19
936; GCN-NEXT:    v_mov_b32_e32 v12, s20
937; GCN-NEXT:    v_mov_b32_e32 v13, s21
938; GCN-NEXT:    v_mov_b32_e32 v14, s22
939; GCN-NEXT:    v_mov_b32_e32 v15, s23
940; GCN-NEXT:    v_mov_b32_e32 v16, s24
941; GCN-NEXT:    v_mov_b32_e32 v17, s25
942; GCN-NEXT:    v_mov_b32_e32 v18, s26
943; GCN-NEXT:    v_mov_b32_e32 v19, s27
944; GCN-NEXT:    v_mov_b32_e32 v20, s28
945; GCN-NEXT:    v_mov_b32_e32 v21, s29
946; GCN-NEXT:    v_mov_b32_e32 v22, s30
947; GCN-NEXT:    v_mov_b32_e32 v23, s31
948; GCN-NEXT:    v_mov_b32_e32 v25, s5
949; GCN-NEXT:    v_mov_b32_e32 v26, s6
950; GCN-NEXT:    v_mov_b32_e32 v27, s7
951; GCN-NEXT:    v_mov_b32_e32 v29, s3
952; GCN-NEXT:    s_mov_b32 m0, s2
953; GCN-NEXT:    v_movreld_b32_e32 v0, 0
954; GCN-NEXT:    s_add_u32 s2, s0, 0x50
955; GCN-NEXT:    v_movreld_b32_e32 v1, v32
956; GCN-NEXT:    s_addc_u32 s3, s1, 0
957; GCN-NEXT:    v_mov_b32_e32 v31, s3
958; GCN-NEXT:    v_mov_b32_e32 v30, s2
959; GCN-NEXT:    s_add_u32 s2, s0, 64
960; GCN-NEXT:    s_addc_u32 s3, s1, 0
961; GCN-NEXT:    flat_store_dwordx4 v[30:31], v[20:23]
962; GCN-NEXT:    s_nop 0
963; GCN-NEXT:    v_mov_b32_e32 v21, s3
964; GCN-NEXT:    v_mov_b32_e32 v20, s2
965; GCN-NEXT:    s_add_u32 s2, s0, 48
966; GCN-NEXT:    s_addc_u32 s3, s1, 0
967; GCN-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
968; GCN-NEXT:    s_nop 0
969; GCN-NEXT:    v_mov_b32_e32 v17, s3
970; GCN-NEXT:    v_mov_b32_e32 v16, s2
971; GCN-NEXT:    s_add_u32 s2, s0, 32
972; GCN-NEXT:    s_addc_u32 s3, s1, 0
973; GCN-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
974; GCN-NEXT:    s_nop 0
975; GCN-NEXT:    v_mov_b32_e32 v13, s3
976; GCN-NEXT:    v_mov_b32_e32 v12, s2
977; GCN-NEXT:    s_add_u32 s2, s0, 16
978; GCN-NEXT:    s_addc_u32 s3, s1, 0
979; GCN-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
980; GCN-NEXT:    s_nop 0
981; GCN-NEXT:    v_mov_b32_e32 v9, s3
982; GCN-NEXT:    v_mov_b32_e32 v8, s2
983; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
984; GCN-NEXT:    s_add_u32 s2, s0, 0x70
985; GCN-NEXT:    v_mov_b32_e32 v5, s1
986; GCN-NEXT:    v_mov_b32_e32 v4, s0
987; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
988; GCN-NEXT:    s_addc_u32 s3, s1, 0
989; GCN-NEXT:    v_mov_b32_e32 v0, s2
990; GCN-NEXT:    v_mov_b32_e32 v1, s3
991; GCN-NEXT:    s_add_u32 s0, s0, 0x60
992; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[28:29]
993; GCN-NEXT:    s_addc_u32 s1, s1, 0
994; GCN-NEXT:    v_mov_b32_e32 v0, s0
995; GCN-NEXT:    v_mov_b32_e32 v1, s1
996; GCN-NEXT:    flat_store_dwordx4 v[0:1], v[24:27]
997; GCN-NEXT:    s_endpgm
998entry:
999  %v = insertelement <15 x double> %vec, double 1.000000e+00, i32 %sel
1000  store <15 x double> %v, <15 x double> addrspace(1)* %out
1001  ret void
1002}
1003
1004define amdgpu_kernel void @bit4_inselt(<4 x i1> addrspace(1)* %out, <4 x i1> %vec, i32 %sel) {
1005; GCN-LABEL: bit4_inselt:
1006; GCN:       ; %bb.0: ; %entry
1007; GCN-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
1008; GCN-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
1009; GCN-NEXT:    s_mov_b32 s6, -1
1010; GCN-NEXT:    s_mov_b32 s7, 0xe80000
1011; GCN-NEXT:    s_add_u32 s4, s4, s3
1012; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
1013; GCN-NEXT:    s_addc_u32 s5, s5, 0
1014; GCN-NEXT:    v_mov_b32_e32 v0, 4
1015; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1016; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1017; GCN-NEXT:    s_and_b32 s3, s3, 3
1018; GCN-NEXT:    v_mov_b32_e32 v1, s2
1019; GCN-NEXT:    v_lshrrev_b16_e64 v2, 1, s2
1020; GCN-NEXT:    v_lshrrev_b16_e64 v3, 2, s2
1021; GCN-NEXT:    v_lshrrev_b16_e64 v4, 3, s2
1022; GCN-NEXT:    v_or_b32_e32 v0, s3, v0
1023; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
1024; GCN-NEXT:    v_and_b32_e32 v3, 3, v3
1025; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
1026; GCN-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:4
1027; GCN-NEXT:    buffer_store_byte v4, off, s[4:7], 0 offset:7
1028; GCN-NEXT:    buffer_store_byte v3, off, s[4:7], 0 offset:6
1029; GCN-NEXT:    buffer_store_byte v2, off, s[4:7], 0 offset:5
1030; GCN-NEXT:    v_mov_b32_e32 v1, 1
1031; GCN-NEXT:    buffer_store_byte v1, v0, s[4:7], 0 offen
1032; GCN-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 offset:4
1033; GCN-NEXT:    buffer_load_ubyte v1, off, s[4:7], 0 offset:5
1034; GCN-NEXT:    buffer_load_ubyte v2, off, s[4:7], 0 offset:6
1035; GCN-NEXT:    buffer_load_ubyte v3, off, s[4:7], 0 offset:7
1036; GCN-NEXT:    s_waitcnt vmcnt(3)
1037; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
1038; GCN-NEXT:    s_waitcnt vmcnt(2)
1039; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
1040; GCN-NEXT:    s_waitcnt vmcnt(1)
1041; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
1042; GCN-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
1043; GCN-NEXT:    v_lshlrev_b16_e32 v2, 2, v2
1044; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
1045; GCN-NEXT:    s_waitcnt vmcnt(0)
1046; GCN-NEXT:    v_lshlrev_b16_e32 v3, 3, v3
1047; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
1048; GCN-NEXT:    v_or_b32_e32 v0, v0, v3
1049; GCN-NEXT:    v_and_b32_e32 v2, 15, v0
1050; GCN-NEXT:    v_mov_b32_e32 v0, s0
1051; GCN-NEXT:    v_mov_b32_e32 v1, s1
1052; GCN-NEXT:    flat_store_byte v[0:1], v2
1053; GCN-NEXT:    s_endpgm
1054entry:
1055  %v = insertelement <4 x i1> %vec, i1 1, i32 %sel
1056  store <4 x i1> %v, <4 x i1> addrspace(1)* %out
1057  ret void
1058}
1059
1060define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i1> %vec, i32 %sel) {
1061; GCN-LABEL: bit128_inselt:
1062; GCN:       ; %bb.0: ; %entry
1063; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1064; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1065; GCN-NEXT:    s_load_dword s0, s[0:1], 0x44
1066; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1067; GCN-NEXT:    s_lshr_b32 s1, s4, 24
1068; GCN-NEXT:    s_lshr_b32 s8, s4, 16
1069; GCN-NEXT:    s_lshr_b32 s9, s4, 17
1070; GCN-NEXT:    s_lshr_b32 s10, s4, 18
1071; GCN-NEXT:    s_lshr_b32 s11, s4, 19
1072; GCN-NEXT:    s_lshr_b32 s12, s4, 20
1073; GCN-NEXT:    s_lshr_b32 s13, s4, 21
1074; GCN-NEXT:    s_lshr_b32 s14, s4, 22
1075; GCN-NEXT:    s_lshr_b32 s15, s4, 23
1076; GCN-NEXT:    s_lshr_b32 s16, s5, 24
1077; GCN-NEXT:    s_lshr_b32 s17, s5, 16
1078; GCN-NEXT:    s_lshr_b32 s18, s5, 17
1079; GCN-NEXT:    s_lshr_b32 s19, s5, 18
1080; GCN-NEXT:    s_lshr_b32 s20, s5, 19
1081; GCN-NEXT:    s_lshr_b32 s21, s5, 20
1082; GCN-NEXT:    s_lshr_b32 s22, s5, 21
1083; GCN-NEXT:    s_lshr_b32 s23, s5, 22
1084; GCN-NEXT:    s_lshr_b32 s24, s5, 23
1085; GCN-NEXT:    s_lshr_b32 s25, s6, 24
1086; GCN-NEXT:    s_lshr_b32 s26, s6, 16
1087; GCN-NEXT:    s_lshr_b32 s27, s6, 17
1088; GCN-NEXT:    s_lshr_b32 s28, s6, 18
1089; GCN-NEXT:    s_lshr_b32 s29, s6, 19
1090; GCN-NEXT:    s_lshr_b32 s30, s6, 20
1091; GCN-NEXT:    s_lshr_b32 s31, s6, 21
1092; GCN-NEXT:    s_lshr_b32 s33, s6, 22
1093; GCN-NEXT:    s_lshr_b32 s34, s6, 23
1094; GCN-NEXT:    s_lshr_b32 s35, s7, 24
1095; GCN-NEXT:    s_lshr_b32 s36, s7, 16
1096; GCN-NEXT:    s_lshr_b32 s37, s7, 17
1097; GCN-NEXT:    s_lshr_b32 s38, s7, 18
1098; GCN-NEXT:    s_lshr_b32 s39, s7, 19
1099; GCN-NEXT:    s_lshr_b32 s40, s7, 20
1100; GCN-NEXT:    s_lshr_b32 s41, s7, 21
1101; GCN-NEXT:    s_lshr_b32 s42, s7, 22
1102; GCN-NEXT:    s_lshr_b32 s43, s7, 23
1103; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x77
1104; GCN-NEXT:    v_mov_b32_e32 v16, s43
1105; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1106; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x76
1107; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1108; GCN-NEXT:    v_mov_b32_e32 v17, s42
1109; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1110; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1111; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1112; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1113; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x75
1114; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1115; GCN-NEXT:    v_mov_b32_e32 v17, s41
1116; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1117; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x74
1118; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1119; GCN-NEXT:    v_mov_b32_e32 v18, s40
1120; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1121; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1122; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1123; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1124; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1125; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
1126; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
1127; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x73
1128; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1129; GCN-NEXT:    v_mov_b32_e32 v17, s39
1130; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1131; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x72
1132; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1133; GCN-NEXT:    v_mov_b32_e32 v18, s38
1134; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1135; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1136; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1137; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1138; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x71
1139; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1140; GCN-NEXT:    v_mov_b32_e32 v18, s37
1141; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1142; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x70
1143; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1144; GCN-NEXT:    v_mov_b32_e32 v19, s36
1145; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1146; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1147; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1148; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1149; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1150; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
1151; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
1152; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1153; GCN-NEXT:    v_lshlrev_b16_e32 v16, 4, v16
1154; GCN-NEXT:    v_and_b32_e32 v17, 15, v17
1155; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7f
1156; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1157; GCN-NEXT:    v_lshrrev_b16_e64 v17, 7, s35
1158; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1159; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7e
1160; GCN-NEXT:    v_lshrrev_b16_e64 v18, 6, s35
1161; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1162; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1163; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1164; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1165; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1166; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7d
1167; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1168; GCN-NEXT:    v_lshrrev_b16_e64 v18, 5, s35
1169; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1170; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7c
1171; GCN-NEXT:    v_lshrrev_b16_e64 v19, 4, s35
1172; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1173; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1174; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1175; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1176; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1177; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1178; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
1179; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
1180; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7b
1181; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1182; GCN-NEXT:    v_lshrrev_b16_e64 v18, 3, s35
1183; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1184; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7a
1185; GCN-NEXT:    v_lshrrev_b16_e64 v19, 2, s35
1186; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1187; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1188; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1189; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x78
1190; GCN-NEXT:    v_mov_b32_e32 v14, s35
1191; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1192; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1193; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1194; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x79
1195; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1196; GCN-NEXT:    v_lshrrev_b16_e64 v19, 1, s35
1197; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
1198; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1199; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1200; GCN-NEXT:    v_and_b32_e32 v14, 1, v14
1201; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
1202; GCN-NEXT:    v_or_b32_e32 v14, v14, v19
1203; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
1204; GCN-NEXT:    v_and_b32_e32 v14, 3, v14
1205; GCN-NEXT:    v_or_b32_e32 v14, v14, v18
1206; GCN-NEXT:    v_lshlrev_b16_e32 v17, 4, v17
1207; GCN-NEXT:    v_and_b32_e32 v14, 15, v14
1208; GCN-NEXT:    v_or_b32_sdwa v14, v14, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1209; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6f
1210; GCN-NEXT:    v_or_b32_sdwa v14, v16, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1211; GCN-NEXT:    v_lshrrev_b16_e64 v16, 15, s7
1212; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1213; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6e
1214; GCN-NEXT:    v_lshrrev_b16_e64 v17, 14, s7
1215; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1216; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1217; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1218; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1219; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1220; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6d
1221; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1222; GCN-NEXT:    v_lshrrev_b16_e64 v17, 13, s7
1223; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1224; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6c
1225; GCN-NEXT:    v_lshrrev_b16_e64 v18, 12, s7
1226; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1227; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1228; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1229; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1230; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1231; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1232; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
1233; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
1234; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6b
1235; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1236; GCN-NEXT:    v_lshrrev_b16_e64 v17, 11, s7
1237; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1238; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6a
1239; GCN-NEXT:    v_lshrrev_b16_e64 v18, 10, s7
1240; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1241; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1242; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1243; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1244; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1245; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x69
1246; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1247; GCN-NEXT:    v_lshrrev_b16_e64 v18, 9, s7
1248; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1249; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x68
1250; GCN-NEXT:    v_lshrrev_b16_e64 v19, 8, s7
1251; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1252; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1253; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1254; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1255; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1256; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1257; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
1258; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
1259; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1260; GCN-NEXT:    v_lshlrev_b16_e32 v16, 4, v16
1261; GCN-NEXT:    v_and_b32_e32 v17, 15, v17
1262; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x67
1263; GCN-NEXT:    v_or_b32_sdwa v16, v17, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1264; GCN-NEXT:    v_lshrrev_b16_e64 v17, 7, s7
1265; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1266; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x66
1267; GCN-NEXT:    v_lshrrev_b16_e64 v18, 6, s7
1268; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1269; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1270; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1271; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1272; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1273; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x65
1274; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1275; GCN-NEXT:    v_lshrrev_b16_e64 v18, 5, s7
1276; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1277; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x64
1278; GCN-NEXT:    v_lshrrev_b16_e64 v19, 4, s7
1279; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1280; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1281; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1282; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1283; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1284; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1285; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
1286; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
1287; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x63
1288; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1289; GCN-NEXT:    v_lshrrev_b16_e64 v18, 3, s7
1290; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1291; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x62
1292; GCN-NEXT:    v_lshrrev_b16_e64 v19, 2, s7
1293; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1294; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1295; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1296; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1297; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1298; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x61
1299; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1300; GCN-NEXT:    v_lshrrev_b16_e64 v19, 1, s7
1301; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1302; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x60
1303; GCN-NEXT:    v_mov_b32_e32 v15, s7
1304; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1305; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1306; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1307; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
1308; GCN-NEXT:    v_and_b32_e32 v15, 1, v15
1309; GCN-NEXT:    v_or_b32_e32 v15, v15, v19
1310; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
1311; GCN-NEXT:    v_and_b32_e32 v15, 3, v15
1312; GCN-NEXT:    v_or_b32_e32 v15, v15, v18
1313; GCN-NEXT:    v_lshlrev_b16_e32 v17, 4, v17
1314; GCN-NEXT:    v_and_b32_e32 v15, 15, v15
1315; GCN-NEXT:    v_or_b32_e32 v15, v15, v17
1316; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x57
1317; GCN-NEXT:    v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1318; GCN-NEXT:    v_mov_b32_e32 v16, s34
1319; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1320; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x56
1321; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1322; GCN-NEXT:    v_mov_b32_e32 v17, s33
1323; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1324; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1325; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1326; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1327; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x55
1328; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1329; GCN-NEXT:    v_mov_b32_e32 v17, s31
1330; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1331; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x54
1332; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1333; GCN-NEXT:    v_mov_b32_e32 v18, s30
1334; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1335; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1336; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1337; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1338; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1339; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
1340; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
1341; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x53
1342; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1343; GCN-NEXT:    v_mov_b32_e32 v17, s29
1344; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1345; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x52
1346; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1347; GCN-NEXT:    v_mov_b32_e32 v18, s28
1348; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1349; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1350; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1351; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1352; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x51
1353; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1354; GCN-NEXT:    v_mov_b32_e32 v18, s27
1355; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1356; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x50
1357; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1358; GCN-NEXT:    v_mov_b32_e32 v19, s26
1359; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1360; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1361; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1362; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1363; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1364; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
1365; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
1366; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1367; GCN-NEXT:    v_lshlrev_b16_e32 v16, 4, v16
1368; GCN-NEXT:    v_and_b32_e32 v17, 15, v17
1369; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5f
1370; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1371; GCN-NEXT:    v_lshrrev_b16_e64 v17, 7, s25
1372; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1373; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5e
1374; GCN-NEXT:    v_lshrrev_b16_e64 v18, 6, s25
1375; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1376; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1377; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1378; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1379; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1380; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5d
1381; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1382; GCN-NEXT:    v_lshrrev_b16_e64 v18, 5, s25
1383; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1384; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5c
1385; GCN-NEXT:    v_lshrrev_b16_e64 v19, 4, s25
1386; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1387; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1388; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1389; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1390; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1391; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1392; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
1393; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
1394; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5b
1395; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1396; GCN-NEXT:    v_lshrrev_b16_e64 v18, 3, s25
1397; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1398; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5a
1399; GCN-NEXT:    v_lshrrev_b16_e64 v19, 2, s25
1400; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1401; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1402; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1403; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x58
1404; GCN-NEXT:    v_mov_b32_e32 v3, s25
1405; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1406; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1407; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1408; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x59
1409; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1410; GCN-NEXT:    v_lshrrev_b16_e64 v19, 1, s25
1411; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
1412; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1413; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1414; GCN-NEXT:    v_and_b32_e32 v3, 1, v3
1415; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
1416; GCN-NEXT:    v_or_b32_e32 v3, v3, v19
1417; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
1418; GCN-NEXT:    v_and_b32_e32 v3, 3, v3
1419; GCN-NEXT:    v_or_b32_e32 v3, v3, v18
1420; GCN-NEXT:    v_lshlrev_b16_e32 v17, 4, v17
1421; GCN-NEXT:    v_and_b32_e32 v3, 15, v3
1422; GCN-NEXT:    v_or_b32_sdwa v3, v3, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1423; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4f
1424; GCN-NEXT:    v_or_b32_sdwa v16, v16, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1425; GCN-NEXT:    v_lshrrev_b16_e64 v3, 15, s6
1426; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1427; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4e
1428; GCN-NEXT:    v_lshrrev_b16_e64 v17, 14, s6
1429; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
1430; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1431; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1432; GCN-NEXT:    v_lshlrev_b16_e32 v3, 1, v3
1433; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1434; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4d
1435; GCN-NEXT:    v_or_b32_e32 v3, v17, v3
1436; GCN-NEXT:    v_lshrrev_b16_e64 v17, 13, s6
1437; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1438; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4c
1439; GCN-NEXT:    v_lshrrev_b16_e64 v18, 12, s6
1440; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1441; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1442; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1443; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1444; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1445; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1446; GCN-NEXT:    v_lshlrev_b16_e32 v3, 2, v3
1447; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
1448; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4b
1449; GCN-NEXT:    v_or_b32_e32 v3, v17, v3
1450; GCN-NEXT:    v_lshrrev_b16_e64 v17, 11, s6
1451; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1452; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4a
1453; GCN-NEXT:    v_lshrrev_b16_e64 v18, 10, s6
1454; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1455; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1456; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1457; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1458; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1459; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x49
1460; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1461; GCN-NEXT:    v_lshrrev_b16_e64 v18, 9, s6
1462; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1463; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x48
1464; GCN-NEXT:    v_lshrrev_b16_e64 v19, 8, s6
1465; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1466; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1467; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1468; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1469; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1470; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1471; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
1472; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
1473; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1474; GCN-NEXT:    v_lshlrev_b16_e32 v3, 4, v3
1475; GCN-NEXT:    v_and_b32_e32 v17, 15, v17
1476; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x47
1477; GCN-NEXT:    v_or_b32_sdwa v17, v17, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1478; GCN-NEXT:    v_lshrrev_b16_e64 v3, 7, s6
1479; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1480; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x46
1481; GCN-NEXT:    v_lshrrev_b16_e64 v18, 6, s6
1482; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
1483; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1484; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1485; GCN-NEXT:    v_lshlrev_b16_e32 v3, 1, v3
1486; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1487; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x45
1488; GCN-NEXT:    v_or_b32_e32 v3, v18, v3
1489; GCN-NEXT:    v_lshrrev_b16_e64 v18, 5, s6
1490; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1491; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x44
1492; GCN-NEXT:    v_lshrrev_b16_e64 v19, 4, s6
1493; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1494; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1495; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1496; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1497; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1498; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1499; GCN-NEXT:    v_lshlrev_b16_e32 v3, 2, v3
1500; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
1501; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x43
1502; GCN-NEXT:    v_or_b32_e32 v18, v18, v3
1503; GCN-NEXT:    v_lshrrev_b16_e64 v3, 3, s6
1504; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1505; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x42
1506; GCN-NEXT:    v_lshrrev_b16_e64 v19, 2, s6
1507; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
1508; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1509; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1510; GCN-NEXT:    v_lshlrev_b16_e32 v3, 1, v3
1511; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1512; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x41
1513; GCN-NEXT:    v_or_b32_e32 v3, v19, v3
1514; GCN-NEXT:    v_lshrrev_b16_e64 v19, 1, s6
1515; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1516; GCN-NEXT:    s_cmp_lg_u32 s0, 64
1517; GCN-NEXT:    v_mov_b32_e32 v2, s6
1518; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1519; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1520; GCN-NEXT:    v_cndmask_b32_e32 v2, 1, v2, vcc
1521; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
1522; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
1523; GCN-NEXT:    v_or_b32_e32 v2, v2, v19
1524; GCN-NEXT:    v_lshlrev_b16_e32 v3, 2, v3
1525; GCN-NEXT:    v_and_b32_e32 v2, 3, v2
1526; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
1527; GCN-NEXT:    v_or_b32_sdwa v3, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1528; GCN-NEXT:    v_lshlrev_b16_e32 v14, 4, v18
1529; GCN-NEXT:    v_and_b32_e32 v2, 15, v2
1530; GCN-NEXT:    s_cmp_lg_u32 s0, 55
1531; GCN-NEXT:    v_or_b32_e32 v2, v2, v14
1532; GCN-NEXT:    v_mov_b32_e32 v14, s24
1533; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1534; GCN-NEXT:    s_cmp_lg_u32 s0, 54
1535; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
1536; GCN-NEXT:    v_mov_b32_e32 v15, s23
1537; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1538; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1539; GCN-NEXT:    v_lshlrev_b16_e32 v14, 1, v14
1540; GCN-NEXT:    v_and_b32_e32 v15, 1, v15
1541; GCN-NEXT:    s_cmp_lg_u32 s0, 53
1542; GCN-NEXT:    v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1543; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
1544; GCN-NEXT:    v_mov_b32_e32 v15, s22
1545; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1546; GCN-NEXT:    s_cmp_lg_u32 s0, 52
1547; GCN-NEXT:    v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1548; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1549; GCN-NEXT:    v_mov_b32_e32 v16, s21
1550; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1551; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1552; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1553; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1554; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1555; GCN-NEXT:    v_lshlrev_b16_e32 v14, 2, v14
1556; GCN-NEXT:    v_and_b32_e32 v15, 3, v15
1557; GCN-NEXT:    s_cmp_lg_u32 s0, 51
1558; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
1559; GCN-NEXT:    v_mov_b32_e32 v15, s20
1560; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1561; GCN-NEXT:    s_cmp_lg_u32 s0, 50
1562; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1563; GCN-NEXT:    v_mov_b32_e32 v16, s19
1564; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1565; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1566; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1567; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1568; GCN-NEXT:    s_cmp_lg_u32 s0, 49
1569; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1570; GCN-NEXT:    v_mov_b32_e32 v16, s18
1571; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1572; GCN-NEXT:    s_cmp_lg_u32 s0, 48
1573; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1574; GCN-NEXT:    v_mov_b32_e32 v17, s17
1575; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1576; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1577; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1578; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1579; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1580; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
1581; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
1582; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1583; GCN-NEXT:    v_lshlrev_b16_e32 v14, 4, v14
1584; GCN-NEXT:    v_and_b32_e32 v15, 15, v15
1585; GCN-NEXT:    s_cmp_lg_u32 s0, 63
1586; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
1587; GCN-NEXT:    v_lshrrev_b16_e64 v15, 7, s16
1588; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1589; GCN-NEXT:    s_cmp_lg_u32 s0, 62
1590; GCN-NEXT:    v_lshrrev_b16_e64 v16, 6, s16
1591; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1592; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1593; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1594; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1595; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1596; GCN-NEXT:    s_cmp_lg_u32 s0, 61
1597; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1598; GCN-NEXT:    v_lshrrev_b16_e64 v16, 5, s16
1599; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1600; GCN-NEXT:    s_cmp_lg_u32 s0, 60
1601; GCN-NEXT:    v_lshrrev_b16_e64 v17, 4, s16
1602; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1603; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1604; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1605; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1606; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1607; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1608; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
1609; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
1610; GCN-NEXT:    s_cmp_lg_u32 s0, 59
1611; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1612; GCN-NEXT:    v_lshrrev_b16_e64 v16, 3, s16
1613; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1614; GCN-NEXT:    s_cmp_lg_u32 s0, 58
1615; GCN-NEXT:    v_lshrrev_b16_e64 v17, 2, s16
1616; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1617; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1618; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1619; GCN-NEXT:    s_cmp_lg_u32 s0, 56
1620; GCN-NEXT:    v_mov_b32_e32 v13, s16
1621; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1622; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1623; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1624; GCN-NEXT:    s_cmp_lg_u32 s0, 57
1625; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1626; GCN-NEXT:    v_lshrrev_b16_e64 v17, 1, s16
1627; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
1628; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1629; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1630; GCN-NEXT:    v_and_b32_e32 v13, 1, v13
1631; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1632; GCN-NEXT:    v_or_b32_e32 v13, v13, v17
1633; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
1634; GCN-NEXT:    v_and_b32_e32 v13, 3, v13
1635; GCN-NEXT:    v_or_b32_e32 v13, v13, v16
1636; GCN-NEXT:    v_lshlrev_b16_e32 v15, 4, v15
1637; GCN-NEXT:    v_and_b32_e32 v13, 15, v13
1638; GCN-NEXT:    v_or_b32_sdwa v13, v13, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1639; GCN-NEXT:    s_cmp_lg_u32 s0, 47
1640; GCN-NEXT:    v_or_b32_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1641; GCN-NEXT:    v_lshrrev_b16_e64 v13, 15, s5
1642; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1643; GCN-NEXT:    s_cmp_lg_u32 s0, 46
1644; GCN-NEXT:    v_lshrrev_b16_e64 v15, 14, s5
1645; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
1646; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1647; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1648; GCN-NEXT:    v_lshlrev_b16_e32 v13, 1, v13
1649; GCN-NEXT:    v_and_b32_e32 v15, 1, v15
1650; GCN-NEXT:    s_cmp_lg_u32 s0, 45
1651; GCN-NEXT:    v_or_b32_e32 v13, v15, v13
1652; GCN-NEXT:    v_lshrrev_b16_e64 v15, 13, s5
1653; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1654; GCN-NEXT:    s_cmp_lg_u32 s0, 44
1655; GCN-NEXT:    v_lshrrev_b16_e64 v16, 12, s5
1656; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1657; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1658; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1659; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1660; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1661; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1662; GCN-NEXT:    v_lshlrev_b16_e32 v13, 2, v13
1663; GCN-NEXT:    v_and_b32_e32 v15, 3, v15
1664; GCN-NEXT:    s_cmp_lg_u32 s0, 43
1665; GCN-NEXT:    v_or_b32_e32 v13, v15, v13
1666; GCN-NEXT:    v_lshrrev_b16_e64 v15, 11, s5
1667; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1668; GCN-NEXT:    s_cmp_lg_u32 s0, 42
1669; GCN-NEXT:    v_lshrrev_b16_e64 v16, 10, s5
1670; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1671; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1672; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1673; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1674; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1675; GCN-NEXT:    s_cmp_lg_u32 s0, 41
1676; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1677; GCN-NEXT:    v_lshrrev_b16_e64 v16, 9, s5
1678; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1679; GCN-NEXT:    s_cmp_lg_u32 s0, 40
1680; GCN-NEXT:    v_lshrrev_b16_e64 v17, 8, s5
1681; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1682; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1683; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1684; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1685; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1686; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1687; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
1688; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
1689; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1690; GCN-NEXT:    v_lshlrev_b16_e32 v13, 4, v13
1691; GCN-NEXT:    v_and_b32_e32 v15, 15, v15
1692; GCN-NEXT:    s_cmp_lg_u32 s0, 39
1693; GCN-NEXT:    v_or_b32_sdwa v15, v15, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1694; GCN-NEXT:    v_lshrrev_b16_e64 v13, 7, s5
1695; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1696; GCN-NEXT:    s_cmp_lg_u32 s0, 38
1697; GCN-NEXT:    v_lshrrev_b16_e64 v16, 6, s5
1698; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
1699; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1700; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1701; GCN-NEXT:    v_lshlrev_b16_e32 v13, 1, v13
1702; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1703; GCN-NEXT:    s_cmp_lg_u32 s0, 37
1704; GCN-NEXT:    v_or_b32_e32 v13, v16, v13
1705; GCN-NEXT:    v_lshrrev_b16_e64 v16, 5, s5
1706; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1707; GCN-NEXT:    s_cmp_lg_u32 s0, 36
1708; GCN-NEXT:    v_lshrrev_b16_e64 v17, 4, s5
1709; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1710; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1711; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1712; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1713; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1714; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1715; GCN-NEXT:    v_lshlrev_b16_e32 v13, 2, v13
1716; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
1717; GCN-NEXT:    s_cmp_lg_u32 s0, 35
1718; GCN-NEXT:    v_or_b32_e32 v16, v16, v13
1719; GCN-NEXT:    v_lshrrev_b16_e64 v13, 3, s5
1720; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1721; GCN-NEXT:    s_cmp_lg_u32 s0, 34
1722; GCN-NEXT:    v_lshrrev_b16_e64 v17, 2, s5
1723; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
1724; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1725; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1726; GCN-NEXT:    v_lshlrev_b16_e32 v13, 1, v13
1727; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1728; GCN-NEXT:    s_cmp_lg_u32 s0, 33
1729; GCN-NEXT:    v_or_b32_e32 v17, v17, v13
1730; GCN-NEXT:    v_lshrrev_b16_e64 v13, 1, s5
1731; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1732; GCN-NEXT:    s_cmp_lg_u32 s0, 32
1733; GCN-NEXT:    v_mov_b32_e32 v1, s5
1734; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
1735; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1736; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
1737; GCN-NEXT:    v_lshlrev_b16_e32 v13, 1, v13
1738; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
1739; GCN-NEXT:    v_or_b32_e32 v1, v1, v13
1740; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
1741; GCN-NEXT:    v_and_b32_e32 v1, 3, v1
1742; GCN-NEXT:    v_or_b32_e32 v1, v1, v17
1743; GCN-NEXT:    v_lshlrev_b16_e32 v16, 4, v16
1744; GCN-NEXT:    v_and_b32_e32 v1, 15, v1
1745; GCN-NEXT:    v_or_b32_e32 v1, v1, v16
1746; GCN-NEXT:    v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1747; GCN-NEXT:    s_cmp_lg_u32 s0, 23
1748; GCN-NEXT:    v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1749; GCN-NEXT:    v_mov_b32_e32 v14, s15
1750; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1751; GCN-NEXT:    s_cmp_lg_u32 s0, 22
1752; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
1753; GCN-NEXT:    v_mov_b32_e32 v15, s14
1754; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1755; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1756; GCN-NEXT:    v_lshlrev_b16_e32 v14, 1, v14
1757; GCN-NEXT:    v_and_b32_e32 v15, 1, v15
1758; GCN-NEXT:    s_cmp_lg_u32 s0, 21
1759; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
1760; GCN-NEXT:    v_mov_b32_e32 v15, s13
1761; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1762; GCN-NEXT:    s_cmp_lg_u32 s0, 20
1763; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1764; GCN-NEXT:    v_mov_b32_e32 v16, s12
1765; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1766; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1767; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1768; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1769; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1770; GCN-NEXT:    v_lshlrev_b16_e32 v14, 2, v14
1771; GCN-NEXT:    v_and_b32_e32 v15, 3, v15
1772; GCN-NEXT:    s_cmp_lg_u32 s0, 19
1773; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
1774; GCN-NEXT:    v_mov_b32_e32 v15, s11
1775; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1776; GCN-NEXT:    s_cmp_lg_u32 s0, 18
1777; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1778; GCN-NEXT:    v_mov_b32_e32 v16, s10
1779; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1780; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1781; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1782; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1783; GCN-NEXT:    s_cmp_lg_u32 s0, 17
1784; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1785; GCN-NEXT:    v_mov_b32_e32 v16, s9
1786; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1787; GCN-NEXT:    s_cmp_lg_u32 s0, 16
1788; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1789; GCN-NEXT:    v_mov_b32_e32 v18, s8
1790; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1791; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1792; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1793; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1794; GCN-NEXT:    v_or_b32_e32 v16, v18, v16
1795; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
1796; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
1797; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1798; GCN-NEXT:    v_lshlrev_b16_e32 v14, 4, v14
1799; GCN-NEXT:    v_and_b32_e32 v15, 15, v15
1800; GCN-NEXT:    s_cmp_lg_u32 s0, 31
1801; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
1802; GCN-NEXT:    v_lshrrev_b16_e64 v15, 7, s1
1803; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1804; GCN-NEXT:    s_cmp_lg_u32 s0, 30
1805; GCN-NEXT:    v_lshrrev_b16_e64 v16, 6, s1
1806; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1807; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1808; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1809; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1810; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1811; GCN-NEXT:    s_cmp_lg_u32 s0, 29
1812; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1813; GCN-NEXT:    v_lshrrev_b16_e64 v16, 5, s1
1814; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1815; GCN-NEXT:    s_cmp_lg_u32 s0, 28
1816; GCN-NEXT:    v_lshrrev_b16_e64 v18, 4, s1
1817; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1818; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1819; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1820; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1821; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1822; GCN-NEXT:    v_or_b32_e32 v16, v18, v16
1823; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
1824; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
1825; GCN-NEXT:    s_cmp_lg_u32 s0, 27
1826; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1827; GCN-NEXT:    v_lshrrev_b16_e64 v16, 3, s1
1828; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1829; GCN-NEXT:    s_cmp_lg_u32 s0, 26
1830; GCN-NEXT:    v_lshrrev_b16_e64 v18, 2, s1
1831; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1832; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1833; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1834; GCN-NEXT:    s_cmp_lg_u32 s0, 24
1835; GCN-NEXT:    v_mov_b32_e32 v17, s1
1836; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1837; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1838; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1839; GCN-NEXT:    s_cmp_lg_u32 s0, 25
1840; GCN-NEXT:    v_or_b32_e32 v16, v18, v16
1841; GCN-NEXT:    v_lshrrev_b16_e64 v18, 1, s1
1842; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1843; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1844; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1845; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1846; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1847; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
1848; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
1849; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
1850; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1851; GCN-NEXT:    v_lshlrev_b16_e32 v15, 4, v15
1852; GCN-NEXT:    v_and_b32_e32 v16, 15, v16
1853; GCN-NEXT:    v_or_b32_sdwa v15, v16, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1854; GCN-NEXT:    s_cmp_lg_u32 s0, 15
1855; GCN-NEXT:    v_or_b32_sdwa v14, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1856; GCN-NEXT:    v_lshrrev_b16_e64 v15, 15, s4
1857; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1858; GCN-NEXT:    s_cmp_lg_u32 s0, 14
1859; GCN-NEXT:    v_lshrrev_b16_e64 v16, 14, s4
1860; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1861; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1862; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1863; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1864; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1865; GCN-NEXT:    s_cmp_lg_u32 s0, 13
1866; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1867; GCN-NEXT:    v_lshrrev_b16_e64 v16, 13, s4
1868; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1869; GCN-NEXT:    s_cmp_lg_u32 s0, 12
1870; GCN-NEXT:    v_lshrrev_b16_e64 v17, 12, s4
1871; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1872; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1873; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1874; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1875; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1876; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1877; GCN-NEXT:    s_cmp_lg_u32 s0, 11
1878; GCN-NEXT:    v_lshrrev_b16_e64 v18, 11, s4
1879; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
1880; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
1881; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1882; GCN-NEXT:    s_cmp_lg_u32 s0, 10
1883; GCN-NEXT:    v_lshrrev_b16_e64 v13, 10, s4
1884; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1885; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v18, vcc
1886; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1887; GCN-NEXT:    s_cmp_lg_u32 s0, 9
1888; GCN-NEXT:    v_lshrrev_b16_e64 v12, 9, s4
1889; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
1890; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1891; GCN-NEXT:    s_cmp_lg_u32 s0, 8
1892; GCN-NEXT:    v_lshrrev_b16_e64 v11, 8, s4
1893; GCN-NEXT:    v_cndmask_b32_e32 v12, 1, v12, vcc
1894; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1895; GCN-NEXT:    s_cmp_lg_u32 s0, 7
1896; GCN-NEXT:    v_lshrrev_b16_e64 v10, 7, s4
1897; GCN-NEXT:    v_cndmask_b32_e32 v11, 1, v11, vcc
1898; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1899; GCN-NEXT:    s_cmp_lg_u32 s0, 6
1900; GCN-NEXT:    v_lshrrev_b16_e64 v9, 6, s4
1901; GCN-NEXT:    v_cndmask_b32_e32 v10, 1, v10, vcc
1902; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1903; GCN-NEXT:    s_cmp_lg_u32 s0, 5
1904; GCN-NEXT:    v_lshrrev_b16_e64 v8, 5, s4
1905; GCN-NEXT:    v_cndmask_b32_e32 v9, 1, v9, vcc
1906; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1907; GCN-NEXT:    s_cmp_lg_u32 s0, 4
1908; GCN-NEXT:    v_lshrrev_b16_e64 v7, 4, s4
1909; GCN-NEXT:    v_cndmask_b32_e32 v8, 1, v8, vcc
1910; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1911; GCN-NEXT:    s_cmp_lg_u32 s0, 3
1912; GCN-NEXT:    v_lshrrev_b16_e64 v6, 3, s4
1913; GCN-NEXT:    v_cndmask_b32_e32 v7, 1, v7, vcc
1914; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1915; GCN-NEXT:    s_cmp_lg_u32 s0, 2
1916; GCN-NEXT:    v_lshrrev_b16_e64 v5, 2, s4
1917; GCN-NEXT:    v_cndmask_b32_e32 v6, 1, v6, vcc
1918; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1919; GCN-NEXT:    s_cmp_lg_u32 s0, 1
1920; GCN-NEXT:    v_lshrrev_b16_e64 v4, 1, s4
1921; GCN-NEXT:    v_cndmask_b32_e32 v5, 1, v5, vcc
1922; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1923; GCN-NEXT:    s_cmp_lg_u32 s0, 0
1924; GCN-NEXT:    v_mov_b32_e32 v0, s4
1925; GCN-NEXT:    v_cndmask_b32_e32 v4, 1, v4, vcc
1926; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1927; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
1928; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1929; GCN-NEXT:    v_and_b32_e32 v13, 1, v13
1930; GCN-NEXT:    v_lshlrev_b16_e32 v12, 1, v12
1931; GCN-NEXT:    v_and_b32_e32 v11, 1, v11
1932; GCN-NEXT:    v_lshlrev_b16_e32 v10, 1, v10
1933; GCN-NEXT:    v_and_b32_e32 v9, 1, v9
1934; GCN-NEXT:    v_lshlrev_b16_e32 v8, 1, v8
1935; GCN-NEXT:    v_and_b32_e32 v7, 1, v7
1936; GCN-NEXT:    v_lshlrev_b16_e32 v6, 1, v6
1937; GCN-NEXT:    v_and_b32_e32 v5, 1, v5
1938; GCN-NEXT:    v_lshlrev_b16_e32 v4, 1, v4
1939; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
1940; GCN-NEXT:    v_or_b32_e32 v13, v13, v16
1941; GCN-NEXT:    v_or_b32_e32 v11, v11, v12
1942; GCN-NEXT:    v_or_b32_e32 v9, v9, v10
1943; GCN-NEXT:    v_or_b32_e32 v7, v7, v8
1944; GCN-NEXT:    v_or_b32_e32 v5, v5, v6
1945; GCN-NEXT:    v_or_b32_e32 v0, v0, v4
1946; GCN-NEXT:    v_lshlrev_b16_e32 v13, 2, v13
1947; GCN-NEXT:    v_and_b32_e32 v11, 3, v11
1948; GCN-NEXT:    v_lshlrev_b16_e32 v9, 2, v9
1949; GCN-NEXT:    v_and_b32_e32 v7, 3, v7
1950; GCN-NEXT:    v_lshlrev_b16_e32 v5, 2, v5
1951; GCN-NEXT:    v_and_b32_e32 v0, 3, v0
1952; GCN-NEXT:    v_or_b32_e32 v11, v11, v13
1953; GCN-NEXT:    v_or_b32_e32 v7, v7, v9
1954; GCN-NEXT:    v_or_b32_e32 v0, v0, v5
1955; GCN-NEXT:    v_lshlrev_b16_e32 v15, 4, v15
1956; GCN-NEXT:    v_and_b32_e32 v11, 15, v11
1957; GCN-NEXT:    v_lshlrev_b16_e32 v7, 4, v7
1958; GCN-NEXT:    v_and_b32_e32 v0, 15, v0
1959; GCN-NEXT:    v_or_b32_sdwa v11, v11, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1960; GCN-NEXT:    v_or_b32_e32 v0, v0, v7
1961; GCN-NEXT:    v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1962; GCN-NEXT:    v_mov_b32_e32 v5, s3
1963; GCN-NEXT:    v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1964; GCN-NEXT:    v_mov_b32_e32 v4, s2
1965; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1966; GCN-NEXT:    s_endpgm
1967entry:
1968  %v = insertelement <128 x i1> %vec, i1 1, i32 %sel
1969  store <128 x i1> %v, <128 x i1> addrspace(1)* %out
1970  ret void
1971}
1972
1973define amdgpu_ps <32 x float> @float32_inselt_vec(<32 x float> %vec, i32 %sel) {
1974; GCN-LABEL: float32_inselt_vec:
1975; GCN:       ; %bb.0: ; %entry
1976; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v32
1977; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 2, v32
1978; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 3, v32
1979; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 4, v32
1980; GCN-NEXT:    v_cmp_ne_u32_e64 s[6:7], 5, v32
1981; GCN-NEXT:    v_cmp_ne_u32_e64 s[8:9], 6, v32
1982; GCN-NEXT:    v_cmp_ne_u32_e64 s[10:11], 7, v32
1983; GCN-NEXT:    v_cmp_ne_u32_e64 s[12:13], 8, v32
1984; GCN-NEXT:    v_cmp_ne_u32_e64 s[14:15], 9, v32
1985; GCN-NEXT:    v_cmp_ne_u32_e64 s[16:17], 10, v32
1986; GCN-NEXT:    v_cmp_ne_u32_e64 s[18:19], 11, v32
1987; GCN-NEXT:    v_cmp_ne_u32_e64 s[20:21], 12, v32
1988; GCN-NEXT:    v_cmp_ne_u32_e64 s[22:23], 13, v32
1989; GCN-NEXT:    v_cmp_ne_u32_e64 s[24:25], 14, v32
1990; GCN-NEXT:    v_cmp_ne_u32_e64 s[26:27], 15, v32
1991; GCN-NEXT:    v_cmp_ne_u32_e64 s[28:29], 16, v32
1992; GCN-NEXT:    v_cmp_ne_u32_e64 s[30:31], 17, v32
1993; GCN-NEXT:    v_cmp_ne_u32_e64 s[34:35], 18, v32
1994; GCN-NEXT:    v_cmp_ne_u32_e64 s[36:37], 19, v32
1995; GCN-NEXT:    v_cmp_ne_u32_e64 s[38:39], 20, v32
1996; GCN-NEXT:    v_cmp_ne_u32_e64 s[40:41], 21, v32
1997; GCN-NEXT:    v_cmp_ne_u32_e64 s[42:43], 22, v32
1998; GCN-NEXT:    v_cmp_ne_u32_e64 s[44:45], 23, v32
1999; GCN-NEXT:    v_cmp_ne_u32_e64 s[46:47], 24, v32
2000; GCN-NEXT:    v_cmp_ne_u32_e64 s[48:49], 25, v32
2001; GCN-NEXT:    v_cmp_ne_u32_e64 s[50:51], 26, v32
2002; GCN-NEXT:    v_cmp_ne_u32_e64 s[52:53], 27, v32
2003; GCN-NEXT:    v_cmp_ne_u32_e64 s[54:55], 28, v32
2004; GCN-NEXT:    v_cmp_ne_u32_e64 s[56:57], 29, v32
2005; GCN-NEXT:    v_cmp_ne_u32_e64 s[58:59], 30, v32
2006; GCN-NEXT:    v_cmp_ne_u32_e64 s[60:61], 31, v32
2007; GCN-NEXT:    v_cmp_ne_u32_e64 s[62:63], 0, v32
2008; GCN-NEXT:    v_cndmask_b32_e64 v0, 1.0, v0, s[62:63]
2009; GCN-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
2010; GCN-NEXT:    v_cndmask_b32_e64 v2, 1.0, v2, s[0:1]
2011; GCN-NEXT:    v_cndmask_b32_e64 v3, 1.0, v3, s[2:3]
2012; GCN-NEXT:    v_cndmask_b32_e64 v4, 1.0, v4, s[4:5]
2013; GCN-NEXT:    v_cndmask_b32_e64 v5, 1.0, v5, s[6:7]
2014; GCN-NEXT:    v_cndmask_b32_e64 v6, 1.0, v6, s[8:9]
2015; GCN-NEXT:    v_cndmask_b32_e64 v7, 1.0, v7, s[10:11]
2016; GCN-NEXT:    v_cndmask_b32_e64 v8, 1.0, v8, s[12:13]
2017; GCN-NEXT:    v_cndmask_b32_e64 v9, 1.0, v9, s[14:15]
2018; GCN-NEXT:    v_cndmask_b32_e64 v10, 1.0, v10, s[16:17]
2019; GCN-NEXT:    v_cndmask_b32_e64 v11, 1.0, v11, s[18:19]
2020; GCN-NEXT:    v_cndmask_b32_e64 v12, 1.0, v12, s[20:21]
2021; GCN-NEXT:    v_cndmask_b32_e64 v13, 1.0, v13, s[22:23]
2022; GCN-NEXT:    v_cndmask_b32_e64 v14, 1.0, v14, s[24:25]
2023; GCN-NEXT:    v_cndmask_b32_e64 v15, 1.0, v15, s[26:27]
2024; GCN-NEXT:    v_cndmask_b32_e64 v16, 1.0, v16, s[28:29]
2025; GCN-NEXT:    v_cndmask_b32_e64 v17, 1.0, v17, s[30:31]
2026; GCN-NEXT:    v_cndmask_b32_e64 v18, 1.0, v18, s[34:35]
2027; GCN-NEXT:    v_cndmask_b32_e64 v19, 1.0, v19, s[36:37]
2028; GCN-NEXT:    v_cndmask_b32_e64 v20, 1.0, v20, s[38:39]
2029; GCN-NEXT:    v_cndmask_b32_e64 v21, 1.0, v21, s[40:41]
2030; GCN-NEXT:    v_cndmask_b32_e64 v22, 1.0, v22, s[42:43]
2031; GCN-NEXT:    v_cndmask_b32_e64 v23, 1.0, v23, s[44:45]
2032; GCN-NEXT:    v_cndmask_b32_e64 v24, 1.0, v24, s[46:47]
2033; GCN-NEXT:    v_cndmask_b32_e64 v25, 1.0, v25, s[48:49]
2034; GCN-NEXT:    v_cndmask_b32_e64 v26, 1.0, v26, s[50:51]
2035; GCN-NEXT:    v_cndmask_b32_e64 v27, 1.0, v27, s[52:53]
2036; GCN-NEXT:    v_cndmask_b32_e64 v28, 1.0, v28, s[54:55]
2037; GCN-NEXT:    v_cndmask_b32_e64 v29, 1.0, v29, s[56:57]
2038; GCN-NEXT:    v_cndmask_b32_e64 v30, 1.0, v30, s[58:59]
2039; GCN-NEXT:    v_cndmask_b32_e64 v31, 1.0, v31, s[60:61]
2040; GCN-NEXT:    ; return to shader part epilog
2041entry:
2042  %v = insertelement <32 x float> %vec, float 1.000000e+00, i32 %sel
2043  ret <32 x float> %v
2044}
2045
2046define <8 x double> @double8_inselt_vec(<8 x double> %vec, i32 %sel) {
2047; GCN-LABEL: double8_inselt_vec:
2048; GCN:       ; %bb.0: ; %entry
2049; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2050; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v16
2051; GCN-NEXT:    v_mov_b32_e32 v17, 0x3ff00000
2052; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
2053; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
2054; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v16
2055; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
2056; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v17, vcc
2057; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v16
2058; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
2059; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v17, vcc
2060; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v16
2061; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
2062; GCN-NEXT:    v_cndmask_b32_e32 v7, v7, v17, vcc
2063; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v16
2064; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, 0, vcc
2065; GCN-NEXT:    v_cndmask_b32_e32 v9, v9, v17, vcc
2066; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v16
2067; GCN-NEXT:    v_cndmask_b32_e64 v10, v10, 0, vcc
2068; GCN-NEXT:    v_cndmask_b32_e32 v11, v11, v17, vcc
2069; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v16
2070; GCN-NEXT:    v_cndmask_b32_e64 v12, v12, 0, vcc
2071; GCN-NEXT:    v_cndmask_b32_e32 v13, v13, v17, vcc
2072; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v16
2073; GCN-NEXT:    v_cndmask_b32_e64 v14, v14, 0, vcc
2074; GCN-NEXT:    v_cndmask_b32_e32 v15, v15, v17, vcc
2075; GCN-NEXT:    s_setpc_b64 s[30:31]
2076entry:
2077  %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel
2078  ret <8 x double> %v
2079}
2080