1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
3
; Inserts the constant 1.0 into a <4 x float> kernel argument at the runtime
; index %sel and stores the resulting vector to %out.
; The expected code does not use indirect register addressing: each lane is
; chosen with an s_cmp_lg_u32 against its lane number plus a v_cndmask_b32.
4define amdgpu_kernel void @float4_inselt(<4 x float> addrspace(1)* %out, <4 x float> %vec, i32 %sel) {
5; GCN-LABEL: float4_inselt:
6; GCN:       ; %bb.0: ; %entry
7; GCN-NEXT:    s_load_dword s2, s[0:1], 0x44
8; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
9; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
10; GCN-NEXT:    s_waitcnt lgkmcnt(0)
11; GCN-NEXT:    s_cmp_lg_u32 s2, 3
12; GCN-NEXT:    v_mov_b32_e32 v0, s7
13; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
14; GCN-NEXT:    s_cmp_lg_u32 s2, 2
15; GCN-NEXT:    v_cndmask_b32_e32 v3, 1.0, v0, vcc
16; GCN-NEXT:    v_mov_b32_e32 v0, s6
17; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
18; GCN-NEXT:    s_cmp_lg_u32 s2, 1
19; GCN-NEXT:    v_cndmask_b32_e32 v2, 1.0, v0, vcc
20; GCN-NEXT:    v_mov_b32_e32 v0, s5
21; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
22; GCN-NEXT:    s_cmp_lg_u32 s2, 0
23; GCN-NEXT:    v_cndmask_b32_e32 v1, 1.0, v0, vcc
24; GCN-NEXT:    v_mov_b32_e32 v0, s4
25; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
26; GCN-NEXT:    v_mov_b32_e32 v5, s1
27; GCN-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
28; GCN-NEXT:    v_mov_b32_e32 v4, s0
29; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
30; GCN-NEXT:    s_endpgm
31entry:
32  %v = insertelement <4 x float> %vec, float 1.000000e+00, i32 %sel
33  store <4 x float> %v, <4 x float> addrspace(1)* %out
34  ret void
35}
36
; Inserts 1.0 at the runtime index %sel into an undef <4 x float> and stores
; the result to %out.
; Because the source vector is undef, the expected code never loads %sel and
; simply writes 1.0 to all four lanes before the store.
37define amdgpu_kernel void @float4_inselt_undef(<4 x float> addrspace(1)* %out, i32 %sel) {
38; GCN-LABEL: float4_inselt_undef:
39; GCN:       ; %bb.0: ; %entry
40; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
41; GCN-NEXT:    v_mov_b32_e32 v0, 1.0
42; GCN-NEXT:    v_mov_b32_e32 v1, v0
43; GCN-NEXT:    v_mov_b32_e32 v2, v0
44; GCN-NEXT:    v_mov_b32_e32 v3, v0
45; GCN-NEXT:    s_waitcnt lgkmcnt(0)
46; GCN-NEXT:    v_mov_b32_e32 v5, s1
47; GCN-NEXT:    v_mov_b32_e32 v4, s0
48; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
49; GCN-NEXT:    s_endpgm
50entry:
51  %v = insertelement <4 x float> undef, float 1.000000e+00, i32 %sel
52  store <4 x float> %v, <4 x float> addrspace(1)* %out
53  ret void
54}
55
; Inserts the integer constant 1 into a <4 x i32> kernel argument at the
; runtime index %sel and stores the resulting vector to %out.
; The expected code stays on the scalar unit: one s_cmp_lg_u32 / s_cselect_b32
; pair per lane, followed by copies into VGPRs for the store.
56define amdgpu_kernel void @int4_inselt(<4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %sel) {
57; GCN-LABEL: int4_inselt:
58; GCN:       ; %bb.0: ; %entry
59; GCN-NEXT:    s_load_dword s2, s[0:1], 0x44
60; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
61; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
62; GCN-NEXT:    s_waitcnt lgkmcnt(0)
63; GCN-NEXT:    s_cmp_lg_u32 s2, 3
64; GCN-NEXT:    s_cselect_b32 s3, s7, 1
65; GCN-NEXT:    s_cmp_lg_u32 s2, 2
66; GCN-NEXT:    s_cselect_b32 s6, s6, 1
67; GCN-NEXT:    s_cmp_lg_u32 s2, 1
68; GCN-NEXT:    s_cselect_b32 s5, s5, 1
69; GCN-NEXT:    s_cmp_lg_u32 s2, 0
70; GCN-NEXT:    s_cselect_b32 s2, s4, 1
71; GCN-NEXT:    v_mov_b32_e32 v5, s1
72; GCN-NEXT:    v_mov_b32_e32 v0, s2
73; GCN-NEXT:    v_mov_b32_e32 v1, s5
74; GCN-NEXT:    v_mov_b32_e32 v2, s6
75; GCN-NEXT:    v_mov_b32_e32 v3, s3
76; GCN-NEXT:    v_mov_b32_e32 v4, s0
77; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
78; GCN-NEXT:    s_endpgm
79entry:
80  %v = insertelement <4 x i32> %vec, i32 1, i32 %sel
81  store <4 x i32> %v, <4 x i32> addrspace(1)* %out
82  ret void
83}
84
; Inserts the constant 1.0 into a <2 x float> kernel argument at the runtime
; index %sel and stores the resulting vector to %out.
; The expected code uses one s_cmp_lg_u32 / v_cndmask_b32 pair per lane.
85define amdgpu_kernel void @float2_inselt(<2 x float> addrspace(1)* %out, <2 x float> %vec, i32 %sel) {
86; GCN-LABEL: float2_inselt:
87; GCN:       ; %bb.0: ; %entry
88; GCN-NEXT:    s_load_dword s4, s[0:1], 0x34
89; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
90; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
91; GCN-NEXT:    s_waitcnt lgkmcnt(0)
92; GCN-NEXT:    s_cmp_lg_u32 s4, 1
93; GCN-NEXT:    v_mov_b32_e32 v0, s3
94; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
95; GCN-NEXT:    s_cmp_lg_u32 s4, 0
96; GCN-NEXT:    v_cndmask_b32_e32 v1, 1.0, v0, vcc
97; GCN-NEXT:    v_mov_b32_e32 v0, s2
98; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
99; GCN-NEXT:    v_mov_b32_e32 v3, s1
100; GCN-NEXT:    v_cndmask_b32_e32 v0, 1.0, v0, vcc
101; GCN-NEXT:    v_mov_b32_e32 v2, s0
102; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
103; GCN-NEXT:    s_endpgm
104entry:
105  %v = insertelement <2 x float> %vec, float 1.000000e+00, i32 %sel
106  store <2 x float> %v, <2 x float> addrspace(1)* %out
107  ret void
108}
109
; Inserts the constant 1.0 into an <8 x float> kernel argument at the runtime
; index %sel and stores the resulting vector to %out.
; At this width the expected code switches to indirect register addressing:
; the vector is copied into v0..v7, %sel is moved into m0, and a single
; v_movreld_b32 writes 1.0. The result is stored as two dwordx4 halves.
110define amdgpu_kernel void @float8_inselt(<8 x float> addrspace(1)* %out, <8 x float> %vec, i32 %sel) {
111; GCN-LABEL: float8_inselt:
112; GCN:       ; %bb.0: ; %entry
113; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x44
114; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
115; GCN-NEXT:    s_load_dword s1, s[0:1], 0x64
116; GCN-NEXT:    s_waitcnt lgkmcnt(0)
117; GCN-NEXT:    v_mov_b32_e32 v0, s4
118; GCN-NEXT:    s_add_u32 s0, s2, 16
119; GCN-NEXT:    s_mov_b32 m0, s1
120; GCN-NEXT:    s_addc_u32 s1, s3, 0
121; GCN-NEXT:    v_mov_b32_e32 v1, s5
122; GCN-NEXT:    v_mov_b32_e32 v2, s6
123; GCN-NEXT:    v_mov_b32_e32 v3, s7
124; GCN-NEXT:    v_mov_b32_e32 v4, s8
125; GCN-NEXT:    v_mov_b32_e32 v5, s9
126; GCN-NEXT:    v_mov_b32_e32 v6, s10
127; GCN-NEXT:    v_mov_b32_e32 v7, s11
128; GCN-NEXT:    v_mov_b32_e32 v9, s1
129; GCN-NEXT:    v_movreld_b32_e32 v0, 1.0
130; GCN-NEXT:    v_mov_b32_e32 v8, s0
131; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
132; GCN-NEXT:    s_nop 0
133; GCN-NEXT:    v_mov_b32_e32 v5, s3
134; GCN-NEXT:    v_mov_b32_e32 v4, s2
135; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
136; GCN-NEXT:    s_endpgm
137entry:
138  %v = insertelement <8 x float> %vec, float 1.000000e+00, i32 %sel
139  store <8 x float> %v, <8 x float> addrspace(1)* %out
140  ret void
141}
142
; Inserts the constant 1.0 into a <16 x float> kernel argument at the runtime
; index %sel and stores the resulting vector to %out.
; The expected code copies the vector into v0..v15, moves %sel into m0 and
; writes 1.0 with one v_movreld_b32, then stores four dwordx4 chunks at
; offsets 48, 32, 16 and 0 from %out.
143define amdgpu_kernel void @float16_inselt(<16 x float> addrspace(1)* %out, <16 x float> %vec, i32 %sel) {
144; GCN-LABEL: float16_inselt:
145; GCN:       ; %bb.0: ; %entry
146; GCN-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x64
147; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
148; GCN-NEXT:    s_load_dword s20, s[0:1], 0xa4
149; GCN-NEXT:    s_waitcnt lgkmcnt(0)
150; GCN-NEXT:    v_mov_b32_e32 v0, s4
151; GCN-NEXT:    s_add_u32 s0, s2, 48
152; GCN-NEXT:    s_addc_u32 s1, s3, 0
153; GCN-NEXT:    v_mov_b32_e32 v17, s1
154; GCN-NEXT:    v_mov_b32_e32 v1, s5
155; GCN-NEXT:    v_mov_b32_e32 v2, s6
156; GCN-NEXT:    v_mov_b32_e32 v3, s7
157; GCN-NEXT:    v_mov_b32_e32 v4, s8
158; GCN-NEXT:    v_mov_b32_e32 v5, s9
159; GCN-NEXT:    v_mov_b32_e32 v6, s10
160; GCN-NEXT:    v_mov_b32_e32 v7, s11
161; GCN-NEXT:    v_mov_b32_e32 v8, s12
162; GCN-NEXT:    v_mov_b32_e32 v9, s13
163; GCN-NEXT:    v_mov_b32_e32 v10, s14
164; GCN-NEXT:    v_mov_b32_e32 v11, s15
165; GCN-NEXT:    v_mov_b32_e32 v12, s16
166; GCN-NEXT:    v_mov_b32_e32 v13, s17
167; GCN-NEXT:    v_mov_b32_e32 v14, s18
168; GCN-NEXT:    v_mov_b32_e32 v15, s19
169; GCN-NEXT:    s_mov_b32 m0, s20
170; GCN-NEXT:    v_mov_b32_e32 v16, s0
171; GCN-NEXT:    s_add_u32 s0, s2, 32
172; GCN-NEXT:    v_movreld_b32_e32 v0, 1.0
173; GCN-NEXT:    s_addc_u32 s1, s3, 0
174; GCN-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
175; GCN-NEXT:    s_nop 0
176; GCN-NEXT:    v_mov_b32_e32 v13, s1
177; GCN-NEXT:    v_mov_b32_e32 v12, s0
178; GCN-NEXT:    s_add_u32 s0, s2, 16
179; GCN-NEXT:    s_addc_u32 s1, s3, 0
180; GCN-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
181; GCN-NEXT:    s_nop 0
182; GCN-NEXT:    v_mov_b32_e32 v9, s1
183; GCN-NEXT:    v_mov_b32_e32 v8, s0
184; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
185; GCN-NEXT:    s_nop 0
186; GCN-NEXT:    v_mov_b32_e32 v5, s3
187; GCN-NEXT:    v_mov_b32_e32 v4, s2
188; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
189; GCN-NEXT:    s_endpgm
190entry:
191  %v = insertelement <16 x float> %vec, float 1.000000e+00, i32 %sel
192  store <16 x float> %v, <16 x float> addrspace(1)* %out
193  ret void
194}
195
; Inserts the constant 1.0 into a <32 x float> kernel argument at the runtime
; index %sel and stores the resulting vector to %out.
; The expected code loads the vector with two s_load_dwordx16, copies it into
; v0..v31, moves %sel into m0 and writes 1.0 with one v_movreld_b32, then
; stores eight dwordx4 chunks from offset 0x70 down to 0.
196define amdgpu_kernel void @float32_inselt(<32 x float> addrspace(1)* %out, <32 x float> %vec, i32 %sel) {
197; GCN-LABEL: float32_inselt:
198; GCN:       ; %bb.0: ; %entry
199; GCN-NEXT:    s_load_dwordx16 s[36:51], s[0:1], 0xa4
200; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
201; GCN-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0xe4
202; GCN-NEXT:    s_load_dword s0, s[0:1], 0x124
203; GCN-NEXT:    s_waitcnt lgkmcnt(0)
204; GCN-NEXT:    v_mov_b32_e32 v0, s36
205; GCN-NEXT:    v_mov_b32_e32 v1, s37
206; GCN-NEXT:    v_mov_b32_e32 v2, s38
207; GCN-NEXT:    s_mov_b32 m0, s0
208; GCN-NEXT:    s_add_u32 s0, s2, 0x70
209; GCN-NEXT:    s_addc_u32 s1, s3, 0
210; GCN-NEXT:    v_mov_b32_e32 v33, s1
211; GCN-NEXT:    v_mov_b32_e32 v3, s39
212; GCN-NEXT:    v_mov_b32_e32 v4, s40
213; GCN-NEXT:    v_mov_b32_e32 v5, s41
214; GCN-NEXT:    v_mov_b32_e32 v6, s42
215; GCN-NEXT:    v_mov_b32_e32 v7, s43
216; GCN-NEXT:    v_mov_b32_e32 v8, s44
217; GCN-NEXT:    v_mov_b32_e32 v9, s45
218; GCN-NEXT:    v_mov_b32_e32 v10, s46
219; GCN-NEXT:    v_mov_b32_e32 v11, s47
220; GCN-NEXT:    v_mov_b32_e32 v12, s48
221; GCN-NEXT:    v_mov_b32_e32 v13, s49
222; GCN-NEXT:    v_mov_b32_e32 v14, s50
223; GCN-NEXT:    v_mov_b32_e32 v15, s51
224; GCN-NEXT:    v_mov_b32_e32 v16, s4
225; GCN-NEXT:    v_mov_b32_e32 v17, s5
226; GCN-NEXT:    v_mov_b32_e32 v18, s6
227; GCN-NEXT:    v_mov_b32_e32 v19, s7
228; GCN-NEXT:    v_mov_b32_e32 v20, s8
229; GCN-NEXT:    v_mov_b32_e32 v21, s9
230; GCN-NEXT:    v_mov_b32_e32 v22, s10
231; GCN-NEXT:    v_mov_b32_e32 v23, s11
232; GCN-NEXT:    v_mov_b32_e32 v24, s12
233; GCN-NEXT:    v_mov_b32_e32 v25, s13
234; GCN-NEXT:    v_mov_b32_e32 v26, s14
235; GCN-NEXT:    v_mov_b32_e32 v27, s15
236; GCN-NEXT:    v_mov_b32_e32 v28, s16
237; GCN-NEXT:    v_mov_b32_e32 v29, s17
238; GCN-NEXT:    v_mov_b32_e32 v30, s18
239; GCN-NEXT:    v_mov_b32_e32 v31, s19
240; GCN-NEXT:    v_mov_b32_e32 v32, s0
241; GCN-NEXT:    s_add_u32 s0, s2, 0x60
242; GCN-NEXT:    v_movreld_b32_e32 v0, 1.0
243; GCN-NEXT:    s_addc_u32 s1, s3, 0
244; GCN-NEXT:    flat_store_dwordx4 v[32:33], v[28:31]
245; GCN-NEXT:    s_nop 0
246; GCN-NEXT:    v_mov_b32_e32 v29, s1
247; GCN-NEXT:    v_mov_b32_e32 v28, s0
248; GCN-NEXT:    s_add_u32 s0, s2, 0x50
249; GCN-NEXT:    s_addc_u32 s1, s3, 0
250; GCN-NEXT:    flat_store_dwordx4 v[28:29], v[24:27]
251; GCN-NEXT:    s_nop 0
252; GCN-NEXT:    v_mov_b32_e32 v25, s1
253; GCN-NEXT:    v_mov_b32_e32 v24, s0
254; GCN-NEXT:    s_add_u32 s0, s2, 64
255; GCN-NEXT:    s_addc_u32 s1, s3, 0
256; GCN-NEXT:    flat_store_dwordx4 v[24:25], v[20:23]
257; GCN-NEXT:    s_nop 0
258; GCN-NEXT:    v_mov_b32_e32 v21, s1
259; GCN-NEXT:    v_mov_b32_e32 v20, s0
260; GCN-NEXT:    s_add_u32 s0, s2, 48
261; GCN-NEXT:    s_addc_u32 s1, s3, 0
262; GCN-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
263; GCN-NEXT:    s_nop 0
264; GCN-NEXT:    v_mov_b32_e32 v17, s1
265; GCN-NEXT:    v_mov_b32_e32 v16, s0
266; GCN-NEXT:    s_add_u32 s0, s2, 32
267; GCN-NEXT:    s_addc_u32 s1, s3, 0
268; GCN-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
269; GCN-NEXT:    s_nop 0
270; GCN-NEXT:    v_mov_b32_e32 v13, s1
271; GCN-NEXT:    v_mov_b32_e32 v12, s0
272; GCN-NEXT:    s_add_u32 s0, s2, 16
273; GCN-NEXT:    s_addc_u32 s1, s3, 0
274; GCN-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
275; GCN-NEXT:    s_nop 0
276; GCN-NEXT:    v_mov_b32_e32 v9, s1
277; GCN-NEXT:    v_mov_b32_e32 v8, s0
278; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
279; GCN-NEXT:    s_nop 0
280; GCN-NEXT:    v_mov_b32_e32 v5, s3
281; GCN-NEXT:    v_mov_b32_e32 v4, s2
282; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
283; GCN-NEXT:    s_endpgm
284entry:
285  %v = insertelement <32 x float> %vec, float 1.000000e+00, i32 %sel
286  store <32 x float> %v, <32 x float> addrspace(1)* %out
287  ret void
288}
289
; Inserts half 1.0 into a <4 x half> kernel argument at the runtime index
; %sel and stores the resulting vector to %out.
; The expected code treats the vector as one 64-bit scalar: a 0xffff mask is
; shifted left by %sel * 16, the old lane is cleared with s_andn2_b64, and the
; masked 0x3c00 splat (half 1.0 in every lane) is merged in with s_or_b64.
290define amdgpu_kernel void @half4_inselt(<4 x half> addrspace(1)* %out, <4 x half> %vec, i32 %sel) {
291; GCN-LABEL: half4_inselt:
292; GCN:       ; %bb.0: ; %entry
293; GCN-NEXT:    s_load_dword s6, s[0:1], 0x34
294; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
295; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
296; GCN-NEXT:    s_mov_b64 s[4:5], 0xffff
297; GCN-NEXT:    s_waitcnt lgkmcnt(0)
298; GCN-NEXT:    s_lshl_b32 s6, s6, 4
299; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
300; GCN-NEXT:    s_mov_b32 s6, 0x3c003c00
301; GCN-NEXT:    s_mov_b32 s7, s6
302; GCN-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
303; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
304; GCN-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
305; GCN-NEXT:    v_mov_b32_e32 v0, s2
306; GCN-NEXT:    v_mov_b32_e32 v3, s1
307; GCN-NEXT:    v_mov_b32_e32 v1, s3
308; GCN-NEXT:    v_mov_b32_e32 v2, s0
309; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
310; GCN-NEXT:    s_endpgm
311entry:
312  %v = insertelement <4 x half> %vec, half 1.000000e+00, i32 %sel
313  store <4 x half> %v, <4 x half> addrspace(1)* %out
314  ret void
315}
316
; Inserts half 1.0 into a <2 x half> kernel argument at the runtime index
; %sel and stores the resulting vector to %out.
; The expected code treats the vector as one 32-bit scalar: a 0xffff mask is
; shifted left by %sel * 16, the old lane is cleared with s_andn2_b32, and the
; masked 0x3c003c00 splat is merged in with s_or_b32.
317define amdgpu_kernel void @half2_inselt(<2 x half> addrspace(1)* %out, <2 x half> %vec, i32 %sel) {
318; GCN-LABEL: half2_inselt:
319; GCN:       ; %bb.0: ; %entry
320; GCN-NEXT:    s_load_dword s2, s[0:1], 0x30
321; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
322; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
323; GCN-NEXT:    s_waitcnt lgkmcnt(0)
324; GCN-NEXT:    s_lshl_b32 s2, s2, 4
325; GCN-NEXT:    s_lshl_b32 s2, 0xffff, s2
326; GCN-NEXT:    s_andn2_b32 s3, s3, s2
327; GCN-NEXT:    s_and_b32 s2, s2, 0x3c003c00
328; GCN-NEXT:    s_or_b32 s2, s2, s3
329; GCN-NEXT:    v_mov_b32_e32 v0, s0
330; GCN-NEXT:    v_mov_b32_e32 v1, s1
331; GCN-NEXT:    v_mov_b32_e32 v2, s2
332; GCN-NEXT:    flat_store_dword v[0:1], v2
333; GCN-NEXT:    s_endpgm
334entry:
335  %v = insertelement <2 x half> %vec, half 1.000000e+00, i32 %sel
336  store <2 x half> %v, <2 x half> addrspace(1)* %out
337  ret void
338}
339
; Inserts half 1.0 into an <8 x half> kernel argument at the runtime index
; %sel and stores the resulting vector to %out.
; The expected code handles every 16-bit lane separately: the upper half of
; each dword is extracted with s_lshr_b32, each lane is chosen against 0x3c00
; (half 1.0) with s_cmp_lg_u32 / v_cndmask_b32, and lane pairs are repacked
; into dwords with v_lshlrev_b32 and v_or_b32_sdwa.
340define amdgpu_kernel void @half8_inselt(<8 x half> addrspace(1)* %out, <8 x half> %vec, i32 %sel) {
341; GCN-LABEL: half8_inselt:
342; GCN:       ; %bb.0: ; %entry
343; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
344; GCN-NEXT:    s_load_dword s2, s[0:1], 0x44
345; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
346; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c00
347; GCN-NEXT:    s_waitcnt lgkmcnt(0)
348; GCN-NEXT:    s_lshr_b32 s3, s7, 16
349; GCN-NEXT:    s_cmp_lg_u32 s2, 7
350; GCN-NEXT:    v_mov_b32_e32 v1, s3
351; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
352; GCN-NEXT:    s_cmp_lg_u32 s2, 6
353; GCN-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
354; GCN-NEXT:    v_mov_b32_e32 v2, s7
355; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
356; GCN-NEXT:    s_lshr_b32 s3, s6, 16
357; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
358; GCN-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
359; GCN-NEXT:    s_cmp_lg_u32 s2, 5
360; GCN-NEXT:    v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
361; GCN-NEXT:    v_mov_b32_e32 v1, s3
362; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
363; GCN-NEXT:    s_cmp_lg_u32 s2, 4
364; GCN-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
365; GCN-NEXT:    v_mov_b32_e32 v2, s6
366; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
367; GCN-NEXT:    s_lshr_b32 s3, s5, 16
368; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
369; GCN-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
370; GCN-NEXT:    s_cmp_lg_u32 s2, 3
371; GCN-NEXT:    v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
372; GCN-NEXT:    v_mov_b32_e32 v1, s3
373; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
374; GCN-NEXT:    s_cmp_lg_u32 s2, 2
375; GCN-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
376; GCN-NEXT:    v_mov_b32_e32 v4, s5
377; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
378; GCN-NEXT:    s_lshr_b32 s3, s4, 16
379; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
380; GCN-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
381; GCN-NEXT:    s_cmp_lg_u32 s2, 1
382; GCN-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
383; GCN-NEXT:    v_mov_b32_e32 v4, s3
384; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
385; GCN-NEXT:    s_cmp_lg_u32 s2, 0
386; GCN-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
387; GCN-NEXT:    v_mov_b32_e32 v5, s4
388; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
389; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
390; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
391; GCN-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
392; GCN-NEXT:    v_mov_b32_e32 v5, s1
393; GCN-NEXT:    v_mov_b32_e32 v4, s0
394; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
395; GCN-NEXT:    s_endpgm
396entry:
397  %v = insertelement <8 x half> %vec, half 1.000000e+00, i32 %sel
398  store <8 x half> %v, <8 x half> addrspace(1)* %out
399  ret void
400}
401
; Inserts the i16 constant 1 into a <2 x i16> kernel argument at the runtime
; index %sel and stores the resulting vector to %out.
; The expected code treats the vector as one 32-bit scalar: a 0xffff mask is
; shifted left by %sel * 16, the old lane is cleared with s_andn2_b32, and the
; masked 0x10001 splat (1 in both lanes) is merged in with s_or_b32.
402define amdgpu_kernel void @short2_inselt(<2 x i16> addrspace(1)* %out, <2 x i16> %vec, i32 %sel) {
403; GCN-LABEL: short2_inselt:
404; GCN:       ; %bb.0: ; %entry
405; GCN-NEXT:    s_load_dword s2, s[0:1], 0x30
406; GCN-NEXT:    s_load_dword s3, s[0:1], 0x2c
407; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
408; GCN-NEXT:    s_waitcnt lgkmcnt(0)
409; GCN-NEXT:    s_lshl_b32 s2, s2, 4
410; GCN-NEXT:    s_lshl_b32 s2, 0xffff, s2
411; GCN-NEXT:    s_andn2_b32 s3, s3, s2
412; GCN-NEXT:    s_and_b32 s2, s2, 0x10001
413; GCN-NEXT:    s_or_b32 s2, s2, s3
414; GCN-NEXT:    v_mov_b32_e32 v0, s0
415; GCN-NEXT:    v_mov_b32_e32 v1, s1
416; GCN-NEXT:    v_mov_b32_e32 v2, s2
417; GCN-NEXT:    flat_store_dword v[0:1], v2
418; GCN-NEXT:    s_endpgm
419entry:
420  %v = insertelement <2 x i16> %vec, i16 1, i32 %sel
421  store <2 x i16> %v, <2 x i16> addrspace(1)* %out
422  ret void
423}
424
; Inserts the i16 constant 1 into a <4 x i16> kernel argument at the runtime
; index %sel and stores the resulting vector to %out.
; The expected code treats the vector as one 64-bit scalar: a 0xffff mask is
; shifted left by %sel * 16, the old lane is cleared with s_andn2_b64, and the
; masked 0x10001 splat (1 in every lane) is merged in with s_or_b64.
425define amdgpu_kernel void @short4_inselt(<4 x i16> addrspace(1)* %out, <4 x i16> %vec, i32 %sel) {
426; GCN-LABEL: short4_inselt:
427; GCN:       ; %bb.0: ; %entry
428; GCN-NEXT:    s_load_dword s6, s[0:1], 0x34
429; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
430; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
431; GCN-NEXT:    s_mov_b64 s[4:5], 0xffff
432; GCN-NEXT:    s_waitcnt lgkmcnt(0)
433; GCN-NEXT:    s_lshl_b32 s6, s6, 4
434; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
435; GCN-NEXT:    s_mov_b32 s6, 0x10001
436; GCN-NEXT:    s_mov_b32 s7, s6
437; GCN-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
438; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
439; GCN-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
440; GCN-NEXT:    v_mov_b32_e32 v0, s2
441; GCN-NEXT:    v_mov_b32_e32 v3, s1
442; GCN-NEXT:    v_mov_b32_e32 v1, s3
443; GCN-NEXT:    v_mov_b32_e32 v2, s0
444; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
445; GCN-NEXT:    s_endpgm
446entry:
447  %v = insertelement <4 x i16> %vec, i16 1, i32 %sel
448  store <4 x i16> %v, <4 x i16> addrspace(1)* %out
449  ret void
450}
451
; Inserts the i8 constant 1 into an <8 x i8> kernel argument at the runtime
; index %sel and stores the resulting vector to %out.
; The expected code treats the vector as one 64-bit scalar: a mask is shifted
; left by %sel * 8, the old bits are cleared with s_andn2_b64, and the masked
; 0x1010101 splat (1 in every byte) is merged in with s_or_b64.
; NOTE(review): the mask constant in the expected output is 0xffff although
; the elements are only 8 bits wide, so two adjacent bytes appear to be
; overwritten. This looks like a recorded codegen issue rather than the
; intended lowering - confirm against the backend before relying on it.
452define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %vec, i32 %sel) {
453; GCN-LABEL: byte8_inselt:
454; GCN:       ; %bb.0: ; %entry
455; GCN-NEXT:    s_load_dword s6, s[0:1], 0x34
456; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
457; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
458; GCN-NEXT:    s_mov_b64 s[4:5], 0xffff
459; GCN-NEXT:    s_waitcnt lgkmcnt(0)
460; GCN-NEXT:    s_lshl_b32 s6, s6, 3
461; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
462; GCN-NEXT:    s_and_b32 s7, s5, 0x1010101
463; GCN-NEXT:    s_and_b32 s6, s4, 0x1010101
464; GCN-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
465; GCN-NEXT:    s_or_b64 s[0:1], s[6:7], s[0:1]
466; GCN-NEXT:    v_mov_b32_e32 v2, s2
467; GCN-NEXT:    v_mov_b32_e32 v0, s0
468; GCN-NEXT:    v_mov_b32_e32 v1, s1
469; GCN-NEXT:    v_mov_b32_e32 v3, s3
470; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
471; GCN-NEXT:    s_endpgm
472entry:
473  %v = insertelement <8 x i8> %vec, i8 1, i32 %sel
474  store <8 x i8> %v, <8 x i8> addrspace(1)* %out
475  ret void
476}
477
; Inserts the i8 constant 1 into a <16 x i8> kernel argument at the runtime
; index %sel and stores the resulting vector to %out.
; The expected code handles every byte separately: each byte is extracted
; with s_lshr_b32, chosen against the constant 1 with s_cmp_lg_u32 /
; v_cndmask_b32 (indices 15 down to 0), and the bytes are repacked into
; dwords with v_lshlrev_b16 and v_or_b32_sdwa.
478define amdgpu_kernel void @byte16_inselt(<16 x i8> addrspace(1)* %out, <16 x i8> %vec, i32 %sel) {
479; GCN-LABEL: byte16_inselt:
480; GCN:       ; %bb.0: ; %entry
481; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
482; GCN-NEXT:    s_load_dword s2, s[0:1], 0x44
483; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
484; GCN-NEXT:    s_waitcnt lgkmcnt(0)
485; GCN-NEXT:    s_lshr_b32 s3, s7, 24
486; GCN-NEXT:    s_cmp_lg_u32 s2, 15
487; GCN-NEXT:    v_mov_b32_e32 v0, s3
488; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
489; GCN-NEXT:    s_lshr_b32 s3, s7, 16
490; GCN-NEXT:    s_cmp_lg_u32 s2, 14
491; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
492; GCN-NEXT:    v_mov_b32_e32 v1, s3
493; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
494; GCN-NEXT:    s_lshr_b32 s3, s7, 8
495; GCN-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
496; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
497; GCN-NEXT:    s_cmp_lg_u32 s2, 13
498; GCN-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
499; GCN-NEXT:    v_mov_b32_e32 v1, s3
500; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
501; GCN-NEXT:    s_cmp_lg_u32 s2, 12
502; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
503; GCN-NEXT:    v_mov_b32_e32 v2, s7
504; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
505; GCN-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
506; GCN-NEXT:    v_cndmask_b32_e32 v2, 1, v2, vcc
507; GCN-NEXT:    s_lshr_b32 s3, s6, 24
508; GCN-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
509; GCN-NEXT:    s_cmp_lg_u32 s2, 11
510; GCN-NEXT:    v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
511; GCN-NEXT:    v_mov_b32_e32 v0, s3
512; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
513; GCN-NEXT:    s_lshr_b32 s3, s6, 16
514; GCN-NEXT:    s_cmp_lg_u32 s2, 10
515; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
516; GCN-NEXT:    v_mov_b32_e32 v1, s3
517; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
518; GCN-NEXT:    s_lshr_b32 s3, s6, 8
519; GCN-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
520; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
521; GCN-NEXT:    s_cmp_lg_u32 s2, 9
522; GCN-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
523; GCN-NEXT:    v_mov_b32_e32 v1, s3
524; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
525; GCN-NEXT:    s_cmp_lg_u32 s2, 8
526; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
527; GCN-NEXT:    v_mov_b32_e32 v2, s6
528; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
529; GCN-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
530; GCN-NEXT:    v_cndmask_b32_e32 v2, 1, v2, vcc
531; GCN-NEXT:    s_lshr_b32 s3, s5, 24
532; GCN-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
533; GCN-NEXT:    s_cmp_lg_u32 s2, 7
534; GCN-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
535; GCN-NEXT:    v_mov_b32_e32 v0, s3
536; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
537; GCN-NEXT:    s_lshr_b32 s3, s5, 16
538; GCN-NEXT:    s_cmp_lg_u32 s2, 6
539; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
540; GCN-NEXT:    v_mov_b32_e32 v1, s3
541; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
542; GCN-NEXT:    s_lshr_b32 s3, s5, 8
543; GCN-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
544; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
545; GCN-NEXT:    s_cmp_lg_u32 s2, 5
546; GCN-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
547; GCN-NEXT:    v_mov_b32_e32 v1, s3
548; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
549; GCN-NEXT:    s_cmp_lg_u32 s2, 4
550; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
551; GCN-NEXT:    v_mov_b32_e32 v4, s5
552; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
553; GCN-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
554; GCN-NEXT:    v_cndmask_b32_e32 v4, 1, v4, vcc
555; GCN-NEXT:    s_lshr_b32 s3, s4, 24
556; GCN-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
557; GCN-NEXT:    s_cmp_lg_u32 s2, 3
558; GCN-NEXT:    v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
559; GCN-NEXT:    v_mov_b32_e32 v0, s3
560; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
561; GCN-NEXT:    s_lshr_b32 s3, s4, 16
562; GCN-NEXT:    s_cmp_lg_u32 s2, 2
563; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
564; GCN-NEXT:    v_mov_b32_e32 v4, s3
565; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
566; GCN-NEXT:    s_lshr_b32 s3, s4, 8
567; GCN-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
568; GCN-NEXT:    v_cndmask_b32_e32 v4, 1, v4, vcc
569; GCN-NEXT:    s_cmp_lg_u32 s2, 1
570; GCN-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
571; GCN-NEXT:    v_mov_b32_e32 v4, s3
572; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
573; GCN-NEXT:    s_cmp_lg_u32 s2, 0
574; GCN-NEXT:    v_cndmask_b32_e32 v4, 1, v4, vcc
575; GCN-NEXT:    v_mov_b32_e32 v5, s4
576; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
577; GCN-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
578; GCN-NEXT:    v_cndmask_b32_e32 v5, 1, v5, vcc
579; GCN-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
580; GCN-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
581; GCN-NEXT:    v_mov_b32_e32 v5, s1
582; GCN-NEXT:    v_mov_b32_e32 v4, s0
583; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
584; GCN-NEXT:    s_endpgm
585entry:
586  %v = insertelement <16 x i8> %vec, i8 1, i32 %sel
587  store <16 x i8> %v, <16 x i8> addrspace(1)* %out
588  ret void
589}
590
; Inserts double 1.0 into a <2 x double> kernel argument at the runtime index
; %sel and stores the resulting vector to %out.
; The expected code uses one s_cmp_eq_u32 per element and two v_cndmask_b32
; per element: the high dword selects 0x3ff00000 and the low dword selects 0,
; which together form the bit pattern of double 1.0.
591define amdgpu_kernel void @double2_inselt(<2 x double> addrspace(1)* %out, <2 x double> %vec, i32 %sel) {
592; GCN-LABEL: double2_inselt:
593; GCN:       ; %bb.0: ; %entry
594; GCN-NEXT:    s_load_dword s2, s[0:1], 0x44
595; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
596; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
597; GCN-NEXT:    v_mov_b32_e32 v0, 0x3ff00000
598; GCN-NEXT:    s_waitcnt lgkmcnt(0)
599; GCN-NEXT:    s_cmp_eq_u32 s2, 1
600; GCN-NEXT:    v_mov_b32_e32 v1, s7
601; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
602; GCN-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
603; GCN-NEXT:    v_mov_b32_e32 v1, s6
604; GCN-NEXT:    s_cmp_eq_u32 s2, 0
605; GCN-NEXT:    v_cndmask_b32_e64 v2, v1, 0, vcc
606; GCN-NEXT:    v_mov_b32_e32 v1, s5
607; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
608; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
609; GCN-NEXT:    v_mov_b32_e32 v0, s4
610; GCN-NEXT:    v_mov_b32_e32 v5, s1
611; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
612; GCN-NEXT:    v_mov_b32_e32 v4, s0
613; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
614; GCN-NEXT:    s_endpgm
615entry:
616  %v = insertelement <2 x double> %vec, double 1.000000e+00, i32 %sel
617  store <2 x double> %v, <2 x double> addrspace(1)* %out
618  ret void
619}
620
; Inserts double 1.0 into a <5 x double> kernel argument at the runtime index
; %sel and stores the resulting vector to %out.
; The expected code uses one s_cmp_eq_u32 per element and a pair of
; v_cndmask_b32 per element (0x3ff00000 for the high dword, 0 for the low
; dword). The result is stored as two dwordx4 and one dwordx2.
621define amdgpu_kernel void @double5_inselt(<5 x double> addrspace(1)* %out, <5 x double> %vec, i32 %sel) {
622; GCN-LABEL: double5_inselt:
623; GCN:       ; %bb.0: ; %entry
624; GCN-NEXT:    s_load_dword s12, s[0:1], 0xa4
625; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x84
626; GCN-NEXT:    s_load_dwordx2 s[10:11], s[0:1], 0x24
627; GCN-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x64
628; GCN-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
629; GCN-NEXT:    s_waitcnt lgkmcnt(0)
630; GCN-NEXT:    s_cmp_eq_u32 s12, 4
631; GCN-NEXT:    v_mov_b32_e32 v0, s9
632; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
633; GCN-NEXT:    v_cndmask_b32_e32 v9, v0, v4, vcc
634; GCN-NEXT:    v_mov_b32_e32 v0, s8
635; GCN-NEXT:    s_cmp_eq_u32 s12, 1
636; GCN-NEXT:    v_cndmask_b32_e64 v8, v0, 0, vcc
637; GCN-NEXT:    v_mov_b32_e32 v0, s3
638; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
639; GCN-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
640; GCN-NEXT:    v_mov_b32_e32 v0, s2
641; GCN-NEXT:    s_cmp_eq_u32 s12, 0
642; GCN-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
643; GCN-NEXT:    v_mov_b32_e32 v0, s1
644; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
645; GCN-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
646; GCN-NEXT:    v_mov_b32_e32 v0, s0
647; GCN-NEXT:    s_cmp_eq_u32 s12, 3
648; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
649; GCN-NEXT:    v_mov_b32_e32 v5, s7
650; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
651; GCN-NEXT:    v_cndmask_b32_e32 v7, v5, v4, vcc
652; GCN-NEXT:    v_mov_b32_e32 v5, s6
653; GCN-NEXT:    s_cmp_eq_u32 s12, 2
654; GCN-NEXT:    v_cndmask_b32_e64 v6, v5, 0, vcc
655; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
656; GCN-NEXT:    s_add_u32 s0, s10, 16
657; GCN-NEXT:    v_mov_b32_e32 v5, s5
658; GCN-NEXT:    s_addc_u32 s1, s11, 0
659; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
660; GCN-NEXT:    v_mov_b32_e32 v4, s4
661; GCN-NEXT:    v_mov_b32_e32 v11, s1
662; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
663; GCN-NEXT:    v_mov_b32_e32 v10, s0
664; GCN-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
665; GCN-NEXT:    s_add_u32 s0, s10, 32
666; GCN-NEXT:    v_mov_b32_e32 v4, s10
667; GCN-NEXT:    v_mov_b32_e32 v5, s11
668; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
669; GCN-NEXT:    s_addc_u32 s1, s11, 0
670; GCN-NEXT:    v_mov_b32_e32 v0, s0
671; GCN-NEXT:    v_mov_b32_e32 v1, s1
672; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[8:9]
673; GCN-NEXT:    s_endpgm
674entry:
675  %v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel
676  store <5 x double> %v, <5 x double> addrspace(1)* %out
677  ret void
678}
679
; Inserts double 1.0 into an <8 x double> kernel argument at the runtime
; index %sel and stores the resulting vector to %out.
; The expected code uses indirect register addressing on 32-bit halves: the
; index is doubled (s_lshl_b32 by 1) and moved into m0, then two v_movreld_b32
; write 0 to the low dword and 0x3ff00000 to the high dword of the element.
680define amdgpu_kernel void @double8_inselt(<8 x double> addrspace(1)* %out, <8 x double> %vec, i32 %sel) {
681; GCN-LABEL: double8_inselt:
682; GCN:       ; %bb.0: ; %entry
683; GCN-NEXT:    s_load_dword s2, s[0:1], 0xa4
684; GCN-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x64
685; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
686; GCN-NEXT:    v_mov_b32_e32 v16, 0x3ff00000
687; GCN-NEXT:    s_waitcnt lgkmcnt(0)
688; GCN-NEXT:    s_lshl_b32 s2, s2, 1
689; GCN-NEXT:    v_mov_b32_e32 v0, s4
690; GCN-NEXT:    v_mov_b32_e32 v1, s5
691; GCN-NEXT:    v_mov_b32_e32 v2, s6
692; GCN-NEXT:    v_mov_b32_e32 v3, s7
693; GCN-NEXT:    v_mov_b32_e32 v4, s8
694; GCN-NEXT:    v_mov_b32_e32 v5, s9
695; GCN-NEXT:    v_mov_b32_e32 v6, s10
696; GCN-NEXT:    v_mov_b32_e32 v7, s11
697; GCN-NEXT:    v_mov_b32_e32 v8, s12
698; GCN-NEXT:    v_mov_b32_e32 v9, s13
699; GCN-NEXT:    v_mov_b32_e32 v10, s14
700; GCN-NEXT:    v_mov_b32_e32 v11, s15
701; GCN-NEXT:    v_mov_b32_e32 v12, s16
702; GCN-NEXT:    v_mov_b32_e32 v13, s17
703; GCN-NEXT:    v_mov_b32_e32 v14, s18
704; GCN-NEXT:    v_mov_b32_e32 v15, s19
705; GCN-NEXT:    s_mov_b32 m0, s2
706; GCN-NEXT:    s_add_u32 s2, s0, 48
707; GCN-NEXT:    v_movreld_b32_e32 v0, 0
708; GCN-NEXT:    s_addc_u32 s3, s1, 0
709; GCN-NEXT:    v_movreld_b32_e32 v1, v16
710; GCN-NEXT:    v_mov_b32_e32 v17, s3
711; GCN-NEXT:    v_mov_b32_e32 v16, s2
712; GCN-NEXT:    s_add_u32 s2, s0, 32
713; GCN-NEXT:    s_addc_u32 s3, s1, 0
714; GCN-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
715; GCN-NEXT:    s_nop 0
716; GCN-NEXT:    v_mov_b32_e32 v13, s3
717; GCN-NEXT:    v_mov_b32_e32 v12, s2
718; GCN-NEXT:    s_add_u32 s2, s0, 16
719; GCN-NEXT:    s_addc_u32 s3, s1, 0
720; GCN-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
721; GCN-NEXT:    s_nop 0
722; GCN-NEXT:    v_mov_b32_e32 v9, s3
723; GCN-NEXT:    v_mov_b32_e32 v8, s2
724; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
725; GCN-NEXT:    s_nop 0
726; GCN-NEXT:    v_mov_b32_e32 v5, s1
727; GCN-NEXT:    v_mov_b32_e32 v4, s0
728; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
729; GCN-NEXT:    s_endpgm
730entry:
731  %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel
732  store <8 x double> %v, <8 x double> addrspace(1)* %out
733  ret void
734}
735
; Inserts double 1.0 into a <7 x double> kernel argument at the runtime index
; %sel and stores the resulting vector to %out.
; The expected code uses indirect register addressing on 32-bit halves: the
; index is doubled (s_lshl_b32 by 1) and moved into m0, then two v_movreld_b32
; write 0 to the low dword and 0x3ff00000 to the high dword of the element.
; The 56-byte result is stored as three dwordx4 and one dwordx2.
736define amdgpu_kernel void @double7_inselt(<7 x double> addrspace(1)* %out, <7 x double> %vec, i32 %sel) {
737; GCN-LABEL: double7_inselt:
738; GCN:       ; %bb.0: ; %entry
739; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x64
740; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
741; GCN-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x94
742; GCN-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x84
743; GCN-NEXT:    s_load_dword s0, s[0:1], 0xa4
744; GCN-NEXT:    s_waitcnt lgkmcnt(0)
745; GCN-NEXT:    v_mov_b32_e32 v0, s4
746; GCN-NEXT:    v_mov_b32_e32 v1, s5
747; GCN-NEXT:    v_mov_b32_e32 v2, s6
748; GCN-NEXT:    v_mov_b32_e32 v3, s7
749; GCN-NEXT:    s_lshl_b32 s0, s0, 1
750; GCN-NEXT:    v_mov_b32_e32 v4, s8
751; GCN-NEXT:    v_mov_b32_e32 v5, s9
752; GCN-NEXT:    v_mov_b32_e32 v6, s10
753; GCN-NEXT:    v_mov_b32_e32 v7, s11
754; GCN-NEXT:    v_mov_b32_e32 v8, s12
755; GCN-NEXT:    v_mov_b32_e32 v9, s13
756; GCN-NEXT:    v_mov_b32_e32 v10, s14
757; GCN-NEXT:    v_mov_b32_e32 v11, s15
758; GCN-NEXT:    v_mov_b32_e32 v12, s16
759; GCN-NEXT:    v_mov_b32_e32 v13, s17
760; GCN-NEXT:    s_mov_b32 m0, s0
761; GCN-NEXT:    v_movreld_b32_e32 v0, 0
762; GCN-NEXT:    v_mov_b32_e32 v16, 0x3ff00000
763; GCN-NEXT:    s_add_u32 s0, s2, 16
764; GCN-NEXT:    v_movreld_b32_e32 v1, v16
765; GCN-NEXT:    s_addc_u32 s1, s3, 0
766; GCN-NEXT:    v_mov_b32_e32 v15, s1
767; GCN-NEXT:    v_mov_b32_e32 v14, s0
768; GCN-NEXT:    flat_store_dwordx4 v[14:15], v[4:7]
769; GCN-NEXT:    s_add_u32 s0, s2, 48
770; GCN-NEXT:    v_mov_b32_e32 v5, s3
771; GCN-NEXT:    v_mov_b32_e32 v4, s2
772; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
773; GCN-NEXT:    s_addc_u32 s1, s3, 0
774; GCN-NEXT:    v_mov_b32_e32 v0, s0
775; GCN-NEXT:    v_mov_b32_e32 v1, s1
776; GCN-NEXT:    s_add_u32 s0, s2, 32
777; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[12:13]
778; GCN-NEXT:    s_addc_u32 s1, s3, 0
779; GCN-NEXT:    v_mov_b32_e32 v0, s0
780; GCN-NEXT:    v_mov_b32_e32 v1, s1
781; GCN-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
782; GCN-NEXT:    s_endpgm
783entry:
784  %v = insertelement <7 x double> %vec, double 1.000000e+00, i32 %sel
785  store <7 x double> %v, <7 x double> addrspace(1)* %out
786  ret void
787}
788
; double16_inselt: insertelement of double 1.0 at the run-time index %sel into
; the <16 x double> kernel argument %vec; the resulting vector is stored to
; %out.
; Expected code, as pinned by the autogenerated assertions below:
;   * the vector is fetched with two s_load_dwordx16 and copied into v0..v31;
;   * the loaded index dword is shifted left by one (s_lshl_b32) and written
;     to m0, then two v_movreld_b32 instructions write 0 and 0x3ff00000 (held
;     in v32), the low and high dwords of double 1.0;
;   * the result is written with eight flat_store_dwordx4, from offset 0x70
;     down to offset 0 in steps of 16 bytes.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
789define amdgpu_kernel void @double16_inselt(<16 x double> addrspace(1)* %out, <16 x double> %vec, i32 %sel) {
790; GCN-LABEL: double16_inselt:
791; GCN:       ; %bb.0: ; %entry
792; GCN-NEXT:    s_load_dword s2, s[0:1], 0x124
793; GCN-NEXT:    s_load_dwordx16 s[36:51], s[0:1], 0xa4
794; GCN-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0xe4
795; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
796; GCN-NEXT:    v_mov_b32_e32 v32, 0x3ff00000
797; GCN-NEXT:    s_waitcnt lgkmcnt(0)
798; GCN-NEXT:    v_mov_b32_e32 v0, s36
799; GCN-NEXT:    s_lshl_b32 s2, s2, 1
800; GCN-NEXT:    v_mov_b32_e32 v1, s37
801; GCN-NEXT:    v_mov_b32_e32 v2, s38
802; GCN-NEXT:    v_mov_b32_e32 v3, s39
803; GCN-NEXT:    v_mov_b32_e32 v4, s40
804; GCN-NEXT:    v_mov_b32_e32 v5, s41
805; GCN-NEXT:    v_mov_b32_e32 v6, s42
806; GCN-NEXT:    v_mov_b32_e32 v7, s43
807; GCN-NEXT:    v_mov_b32_e32 v8, s44
808; GCN-NEXT:    v_mov_b32_e32 v9, s45
809; GCN-NEXT:    v_mov_b32_e32 v10, s46
810; GCN-NEXT:    v_mov_b32_e32 v11, s47
811; GCN-NEXT:    v_mov_b32_e32 v12, s48
812; GCN-NEXT:    v_mov_b32_e32 v13, s49
813; GCN-NEXT:    v_mov_b32_e32 v14, s50
814; GCN-NEXT:    v_mov_b32_e32 v15, s51
815; GCN-NEXT:    v_mov_b32_e32 v16, s4
816; GCN-NEXT:    v_mov_b32_e32 v17, s5
817; GCN-NEXT:    v_mov_b32_e32 v18, s6
818; GCN-NEXT:    v_mov_b32_e32 v19, s7
819; GCN-NEXT:    v_mov_b32_e32 v20, s8
820; GCN-NEXT:    v_mov_b32_e32 v21, s9
821; GCN-NEXT:    v_mov_b32_e32 v22, s10
822; GCN-NEXT:    v_mov_b32_e32 v23, s11
823; GCN-NEXT:    v_mov_b32_e32 v24, s12
824; GCN-NEXT:    v_mov_b32_e32 v25, s13
825; GCN-NEXT:    v_mov_b32_e32 v26, s14
826; GCN-NEXT:    v_mov_b32_e32 v27, s15
827; GCN-NEXT:    v_mov_b32_e32 v28, s16
828; GCN-NEXT:    v_mov_b32_e32 v29, s17
829; GCN-NEXT:    v_mov_b32_e32 v30, s18
830; GCN-NEXT:    v_mov_b32_e32 v31, s19
831; GCN-NEXT:    s_mov_b32 m0, s2
832; GCN-NEXT:    s_add_u32 s2, s0, 0x70
833; GCN-NEXT:    v_movreld_b32_e32 v0, 0
834; GCN-NEXT:    s_addc_u32 s3, s1, 0
835; GCN-NEXT:    v_movreld_b32_e32 v1, v32
836; GCN-NEXT:    v_mov_b32_e32 v33, s3
837; GCN-NEXT:    v_mov_b32_e32 v32, s2
838; GCN-NEXT:    s_add_u32 s2, s0, 0x60
839; GCN-NEXT:    s_addc_u32 s3, s1, 0
840; GCN-NEXT:    flat_store_dwordx4 v[32:33], v[28:31]
841; GCN-NEXT:    s_nop 0
842; GCN-NEXT:    v_mov_b32_e32 v29, s3
843; GCN-NEXT:    v_mov_b32_e32 v28, s2
844; GCN-NEXT:    s_add_u32 s2, s0, 0x50
845; GCN-NEXT:    s_addc_u32 s3, s1, 0
846; GCN-NEXT:    flat_store_dwordx4 v[28:29], v[24:27]
847; GCN-NEXT:    s_nop 0
848; GCN-NEXT:    v_mov_b32_e32 v25, s3
849; GCN-NEXT:    v_mov_b32_e32 v24, s2
850; GCN-NEXT:    s_add_u32 s2, s0, 64
851; GCN-NEXT:    s_addc_u32 s3, s1, 0
852; GCN-NEXT:    flat_store_dwordx4 v[24:25], v[20:23]
853; GCN-NEXT:    s_nop 0
854; GCN-NEXT:    v_mov_b32_e32 v21, s3
855; GCN-NEXT:    v_mov_b32_e32 v20, s2
856; GCN-NEXT:    s_add_u32 s2, s0, 48
857; GCN-NEXT:    s_addc_u32 s3, s1, 0
858; GCN-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
859; GCN-NEXT:    s_nop 0
860; GCN-NEXT:    v_mov_b32_e32 v17, s3
861; GCN-NEXT:    v_mov_b32_e32 v16, s2
862; GCN-NEXT:    s_add_u32 s2, s0, 32
863; GCN-NEXT:    s_addc_u32 s3, s1, 0
864; GCN-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
865; GCN-NEXT:    s_nop 0
866; GCN-NEXT:    v_mov_b32_e32 v13, s3
867; GCN-NEXT:    v_mov_b32_e32 v12, s2
868; GCN-NEXT:    s_add_u32 s2, s0, 16
869; GCN-NEXT:    s_addc_u32 s3, s1, 0
870; GCN-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
871; GCN-NEXT:    s_nop 0
872; GCN-NEXT:    v_mov_b32_e32 v9, s3
873; GCN-NEXT:    v_mov_b32_e32 v8, s2
874; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
875; GCN-NEXT:    s_nop 0
876; GCN-NEXT:    v_mov_b32_e32 v5, s1
877; GCN-NEXT:    v_mov_b32_e32 v4, s0
878; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
879; GCN-NEXT:    s_endpgm
; IR under test: one dynamically indexed insertelement followed by a full
; vector store.
880entry:
881  %v = insertelement <16 x double> %vec, double 1.000000e+00, i32 %sel
882  store <16 x double> %v, <16 x double> addrspace(1)* %out
883  ret void
884}
885
; double15_inselt: insertelement of double 1.0 at the run-time index %sel into
; the <15 x double> kernel argument %vec; the resulting vector is stored to
; %out. This is the odd-element-count counterpart of the 16-element test.
; Expected code, as pinned by the autogenerated assertions below:
;   * the vector is fetched in pieces (dwordx16, dwordx8, dwordx4, dwordx2) and
;     copied into v0..v29;
;   * the index dword and the output pointer are loaded after the first
;     s_waitcnt, so a second s_waitcnt precedes the s_lshl_b32 that doubles
;     the index before it is written to m0;
;   * two v_movreld_b32 instructions write 0 and 0x3ff00000 (held in v32), the
;     low and high dwords of double 1.0;
;   * the result is written with seven flat_store_dwordx4 and one
;     flat_store_dwordx2 (offset 0x70) for the odd fifteenth element.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
886define amdgpu_kernel void @double15_inselt(<15 x double> addrspace(1)* %out, <15 x double> %vec, i32 %sel) {
887; GCN-LABEL: double15_inselt:
888; GCN:       ; %bb.0: ; %entry
889; GCN-NEXT:    s_load_dwordx16 s[8:23], s[0:1], 0xa4
890; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x114
891; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x104
892; GCN-NEXT:    s_load_dwordx8 s[24:31], s[0:1], 0xe4
893; GCN-NEXT:    v_mov_b32_e32 v32, 0x3ff00000
894; GCN-NEXT:    s_waitcnt lgkmcnt(0)
895; GCN-NEXT:    v_mov_b32_e32 v0, s8
896; GCN-NEXT:    v_mov_b32_e32 v28, s2
897; GCN-NEXT:    v_mov_b32_e32 v24, s4
898; GCN-NEXT:    s_load_dword s4, s[0:1], 0x124
899; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
900; GCN-NEXT:    v_mov_b32_e32 v1, s9
901; GCN-NEXT:    v_mov_b32_e32 v2, s10
902; GCN-NEXT:    v_mov_b32_e32 v3, s11
903; GCN-NEXT:    s_waitcnt lgkmcnt(0)
904; GCN-NEXT:    s_lshl_b32 s2, s4, 1
905; GCN-NEXT:    v_mov_b32_e32 v4, s12
906; GCN-NEXT:    v_mov_b32_e32 v5, s13
907; GCN-NEXT:    v_mov_b32_e32 v6, s14
908; GCN-NEXT:    v_mov_b32_e32 v7, s15
909; GCN-NEXT:    v_mov_b32_e32 v8, s16
910; GCN-NEXT:    v_mov_b32_e32 v9, s17
911; GCN-NEXT:    v_mov_b32_e32 v10, s18
912; GCN-NEXT:    v_mov_b32_e32 v11, s19
913; GCN-NEXT:    v_mov_b32_e32 v12, s20
914; GCN-NEXT:    v_mov_b32_e32 v13, s21
915; GCN-NEXT:    v_mov_b32_e32 v14, s22
916; GCN-NEXT:    v_mov_b32_e32 v15, s23
917; GCN-NEXT:    v_mov_b32_e32 v16, s24
918; GCN-NEXT:    v_mov_b32_e32 v17, s25
919; GCN-NEXT:    v_mov_b32_e32 v18, s26
920; GCN-NEXT:    v_mov_b32_e32 v19, s27
921; GCN-NEXT:    v_mov_b32_e32 v20, s28
922; GCN-NEXT:    v_mov_b32_e32 v21, s29
923; GCN-NEXT:    v_mov_b32_e32 v22, s30
924; GCN-NEXT:    v_mov_b32_e32 v23, s31
925; GCN-NEXT:    v_mov_b32_e32 v25, s5
926; GCN-NEXT:    v_mov_b32_e32 v26, s6
927; GCN-NEXT:    v_mov_b32_e32 v27, s7
928; GCN-NEXT:    v_mov_b32_e32 v29, s3
929; GCN-NEXT:    s_mov_b32 m0, s2
930; GCN-NEXT:    v_movreld_b32_e32 v0, 0
931; GCN-NEXT:    s_add_u32 s2, s0, 0x50
932; GCN-NEXT:    v_movreld_b32_e32 v1, v32
933; GCN-NEXT:    s_addc_u32 s3, s1, 0
934; GCN-NEXT:    v_mov_b32_e32 v31, s3
935; GCN-NEXT:    v_mov_b32_e32 v30, s2
936; GCN-NEXT:    s_add_u32 s2, s0, 64
937; GCN-NEXT:    s_addc_u32 s3, s1, 0
938; GCN-NEXT:    flat_store_dwordx4 v[30:31], v[20:23]
939; GCN-NEXT:    s_nop 0
940; GCN-NEXT:    v_mov_b32_e32 v21, s3
941; GCN-NEXT:    v_mov_b32_e32 v20, s2
942; GCN-NEXT:    s_add_u32 s2, s0, 48
943; GCN-NEXT:    s_addc_u32 s3, s1, 0
944; GCN-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
945; GCN-NEXT:    s_nop 0
946; GCN-NEXT:    v_mov_b32_e32 v17, s3
947; GCN-NEXT:    v_mov_b32_e32 v16, s2
948; GCN-NEXT:    s_add_u32 s2, s0, 32
949; GCN-NEXT:    s_addc_u32 s3, s1, 0
950; GCN-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
951; GCN-NEXT:    s_nop 0
952; GCN-NEXT:    v_mov_b32_e32 v13, s3
953; GCN-NEXT:    v_mov_b32_e32 v12, s2
954; GCN-NEXT:    s_add_u32 s2, s0, 16
955; GCN-NEXT:    s_addc_u32 s3, s1, 0
956; GCN-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
957; GCN-NEXT:    s_nop 0
958; GCN-NEXT:    v_mov_b32_e32 v9, s3
959; GCN-NEXT:    v_mov_b32_e32 v8, s2
960; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
961; GCN-NEXT:    s_add_u32 s2, s0, 0x70
962; GCN-NEXT:    v_mov_b32_e32 v5, s1
963; GCN-NEXT:    v_mov_b32_e32 v4, s0
964; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
965; GCN-NEXT:    s_addc_u32 s3, s1, 0
966; GCN-NEXT:    v_mov_b32_e32 v0, s2
967; GCN-NEXT:    v_mov_b32_e32 v1, s3
968; GCN-NEXT:    s_add_u32 s0, s0, 0x60
969; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[28:29]
970; GCN-NEXT:    s_addc_u32 s1, s1, 0
971; GCN-NEXT:    v_mov_b32_e32 v0, s0
972; GCN-NEXT:    v_mov_b32_e32 v1, s1
973; GCN-NEXT:    flat_store_dwordx4 v[0:1], v[24:27]
974; GCN-NEXT:    s_endpgm
; IR under test: one dynamically indexed insertelement followed by a full
; vector store.
975entry:
976  %v = insertelement <15 x double> %vec, double 1.000000e+00, i32 %sel
977  store <15 x double> %v, <15 x double> addrspace(1)* %out
978  ret void
979}
980
; bit4_inselt: insertelement of i1 1 at the run-time index %sel into the
; <4 x i1> kernel argument %vec; the resulting vector is stored to %out.
; Expected code, as pinned by the autogenerated assertions below, goes through
; memory instead of using relative register moves:
;   * a buffer descriptor is built in s[4:7] from SCRATCH_RSRC_DWORD0/1;
;   * the four bits of the argument are split out with 16-bit right shifts and
;     stored as individual bytes at buffer offsets 4..7;
;   * the index is masked with 3 and OR-ed with 4 to form the byte offset, and
;     the value 1 is stored there with an offen buffer_store_byte;
;   * the four bytes are loaded back, recombined with shifts and ORs, masked
;     with 15 and written to the output with a single flat_store_byte.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
981define amdgpu_kernel void @bit4_inselt(<4 x i1> addrspace(1)* %out, <4 x i1> %vec, i32 %sel) {
982; GCN-LABEL: bit4_inselt:
983; GCN:       ; %bb.0: ; %entry
984; GCN-NEXT:    s_mov_b32 s4, SCRATCH_RSRC_DWORD0
985; GCN-NEXT:    s_mov_b32 s5, SCRATCH_RSRC_DWORD1
986; GCN-NEXT:    s_mov_b32 s6, -1
987; GCN-NEXT:    s_mov_b32 s7, 0xe80000
988; GCN-NEXT:    s_add_u32 s4, s4, s3
989; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
990; GCN-NEXT:    s_addc_u32 s5, s5, 0
991; GCN-NEXT:    v_mov_b32_e32 v0, 4
992; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
993; GCN-NEXT:    s_waitcnt lgkmcnt(0)
994; GCN-NEXT:    s_and_b32 s3, s3, 3
995; GCN-NEXT:    v_mov_b32_e32 v1, s2
996; GCN-NEXT:    v_lshrrev_b16_e64 v2, 1, s2
997; GCN-NEXT:    v_lshrrev_b16_e64 v3, 2, s2
998; GCN-NEXT:    v_lshrrev_b16_e64 v4, 3, s2
999; GCN-NEXT:    v_or_b32_e32 v0, s3, v0
1000; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
1001; GCN-NEXT:    v_and_b32_e32 v3, 3, v3
1002; GCN-NEXT:    v_and_b32_e32 v4, 1, v4
1003; GCN-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:4
1004; GCN-NEXT:    buffer_store_byte v4, off, s[4:7], 0 offset:7
1005; GCN-NEXT:    buffer_store_byte v3, off, s[4:7], 0 offset:6
1006; GCN-NEXT:    buffer_store_byte v2, off, s[4:7], 0 offset:5
1007; GCN-NEXT:    v_mov_b32_e32 v1, 1
1008; GCN-NEXT:    buffer_store_byte v1, v0, s[4:7], 0 offen
1009; GCN-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 offset:4
1010; GCN-NEXT:    buffer_load_ubyte v1, off, s[4:7], 0 offset:5
1011; GCN-NEXT:    buffer_load_ubyte v2, off, s[4:7], 0 offset:6
1012; GCN-NEXT:    buffer_load_ubyte v3, off, s[4:7], 0 offset:7
1013; GCN-NEXT:    s_waitcnt vmcnt(3)
1014; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
1015; GCN-NEXT:    s_waitcnt vmcnt(2)
1016; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
1017; GCN-NEXT:    s_waitcnt vmcnt(1)
1018; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
1019; GCN-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
1020; GCN-NEXT:    v_lshlrev_b16_e32 v2, 2, v2
1021; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
1022; GCN-NEXT:    s_waitcnt vmcnt(0)
1023; GCN-NEXT:    v_lshlrev_b16_e32 v3, 3, v3
1024; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
1025; GCN-NEXT:    v_or_b32_e32 v0, v0, v3
1026; GCN-NEXT:    v_and_b32_e32 v2, 15, v0
1027; GCN-NEXT:    v_mov_b32_e32 v0, s0
1028; GCN-NEXT:    v_mov_b32_e32 v1, s1
1029; GCN-NEXT:    flat_store_byte v[0:1], v2
1030; GCN-NEXT:    s_endpgm
; IR under test: one dynamically indexed insertelement on an i1 vector
; followed by a full vector store.
1031entry:
1032  %v = insertelement <4 x i1> %vec, i1 1, i32 %sel
1033  store <4 x i1> %v, <4 x i1> addrspace(1)* %out
1034  ret void
1035}
1036
1037define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i1> %vec, i32 %sel) {
1038; GCN-LABEL: bit128_inselt:
1039; GCN:       ; %bb.0: ; %entry
1040; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1041; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1042; GCN-NEXT:    s_load_dword s0, s[0:1], 0x44
1043; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1044; GCN-NEXT:    s_lshr_b32 s1, s4, 24
1045; GCN-NEXT:    s_lshr_b32 s8, s4, 16
1046; GCN-NEXT:    s_lshr_b32 s9, s4, 17
1047; GCN-NEXT:    s_lshr_b32 s10, s4, 18
1048; GCN-NEXT:    s_lshr_b32 s11, s4, 19
1049; GCN-NEXT:    s_lshr_b32 s12, s4, 20
1050; GCN-NEXT:    s_lshr_b32 s13, s4, 21
1051; GCN-NEXT:    s_lshr_b32 s14, s4, 22
1052; GCN-NEXT:    s_lshr_b32 s15, s4, 23
1053; GCN-NEXT:    s_lshr_b32 s16, s5, 24
1054; GCN-NEXT:    s_lshr_b32 s17, s5, 16
1055; GCN-NEXT:    s_lshr_b32 s18, s5, 17
1056; GCN-NEXT:    s_lshr_b32 s19, s5, 18
1057; GCN-NEXT:    s_lshr_b32 s20, s5, 19
1058; GCN-NEXT:    s_lshr_b32 s21, s5, 20
1059; GCN-NEXT:    s_lshr_b32 s22, s5, 21
1060; GCN-NEXT:    s_lshr_b32 s23, s5, 22
1061; GCN-NEXT:    s_lshr_b32 s24, s5, 23
1062; GCN-NEXT:    s_lshr_b32 s25, s6, 24
1063; GCN-NEXT:    s_lshr_b32 s26, s6, 16
1064; GCN-NEXT:    s_lshr_b32 s27, s6, 17
1065; GCN-NEXT:    s_lshr_b32 s28, s6, 18
1066; GCN-NEXT:    s_lshr_b32 s29, s6, 19
1067; GCN-NEXT:    s_lshr_b32 s30, s6, 20
1068; GCN-NEXT:    s_lshr_b32 s31, s6, 21
1069; GCN-NEXT:    s_lshr_b32 s33, s6, 22
1070; GCN-NEXT:    s_lshr_b32 s34, s6, 23
1071; GCN-NEXT:    s_lshr_b32 s35, s7, 24
1072; GCN-NEXT:    s_lshr_b32 s36, s7, 16
1073; GCN-NEXT:    s_lshr_b32 s37, s7, 17
1074; GCN-NEXT:    s_lshr_b32 s38, s7, 18
1075; GCN-NEXT:    s_lshr_b32 s39, s7, 19
1076; GCN-NEXT:    s_lshr_b32 s40, s7, 20
1077; GCN-NEXT:    s_lshr_b32 s41, s7, 21
1078; GCN-NEXT:    s_lshr_b32 s42, s7, 22
1079; GCN-NEXT:    s_lshr_b32 s43, s7, 23
1080; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x77
1081; GCN-NEXT:    v_mov_b32_e32 v16, s43
1082; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1083; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x76
1084; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1085; GCN-NEXT:    v_mov_b32_e32 v17, s42
1086; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1087; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1088; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1089; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1090; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x75
1091; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1092; GCN-NEXT:    v_mov_b32_e32 v17, s41
1093; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1094; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x74
1095; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1096; GCN-NEXT:    v_mov_b32_e32 v18, s40
1097; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1098; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1099; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1100; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1101; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1102; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
1103; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
1104; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x73
1105; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1106; GCN-NEXT:    v_mov_b32_e32 v17, s39
1107; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1108; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x72
1109; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1110; GCN-NEXT:    v_mov_b32_e32 v18, s38
1111; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1112; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1113; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1114; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1115; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x71
1116; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1117; GCN-NEXT:    v_mov_b32_e32 v18, s37
1118; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1119; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x70
1120; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1121; GCN-NEXT:    v_mov_b32_e32 v19, s36
1122; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1123; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1124; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1125; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1126; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1127; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
1128; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
1129; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1130; GCN-NEXT:    v_lshlrev_b16_e32 v16, 4, v16
1131; GCN-NEXT:    v_and_b32_e32 v17, 15, v17
1132; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7f
1133; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1134; GCN-NEXT:    v_lshrrev_b16_e64 v17, 7, s35
1135; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1136; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7e
1137; GCN-NEXT:    v_lshrrev_b16_e64 v18, 6, s35
1138; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1139; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1140; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1141; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1142; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1143; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7d
1144; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1145; GCN-NEXT:    v_lshrrev_b16_e64 v18, 5, s35
1146; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1147; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7c
1148; GCN-NEXT:    v_lshrrev_b16_e64 v19, 4, s35
1149; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1150; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1151; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1152; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1153; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1154; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1155; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
1156; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
1157; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7b
1158; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1159; GCN-NEXT:    v_lshrrev_b16_e64 v18, 3, s35
1160; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1161; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x7a
1162; GCN-NEXT:    v_lshrrev_b16_e64 v19, 2, s35
1163; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1164; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1165; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1166; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x78
1167; GCN-NEXT:    v_mov_b32_e32 v14, s35
1168; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1169; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1170; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1171; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x79
1172; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1173; GCN-NEXT:    v_lshrrev_b16_e64 v19, 1, s35
1174; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
1175; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1176; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1177; GCN-NEXT:    v_and_b32_e32 v14, 1, v14
1178; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
1179; GCN-NEXT:    v_or_b32_e32 v14, v14, v19
1180; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
1181; GCN-NEXT:    v_and_b32_e32 v14, 3, v14
1182; GCN-NEXT:    v_or_b32_e32 v14, v14, v18
1183; GCN-NEXT:    v_lshlrev_b16_e32 v17, 4, v17
1184; GCN-NEXT:    v_and_b32_e32 v14, 15, v14
1185; GCN-NEXT:    v_or_b32_sdwa v14, v14, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1186; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6f
1187; GCN-NEXT:    v_or_b32_sdwa v14, v16, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1188; GCN-NEXT:    v_lshrrev_b16_e64 v16, 15, s7
1189; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1190; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6e
1191; GCN-NEXT:    v_lshrrev_b16_e64 v17, 14, s7
1192; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1193; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1194; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1195; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1196; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1197; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6d
1198; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1199; GCN-NEXT:    v_lshrrev_b16_e64 v17, 13, s7
1200; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1201; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6c
1202; GCN-NEXT:    v_lshrrev_b16_e64 v18, 12, s7
1203; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1204; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1205; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1206; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1207; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1208; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1209; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
1210; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
1211; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6b
1212; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1213; GCN-NEXT:    v_lshrrev_b16_e64 v17, 11, s7
1214; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1215; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x6a
1216; GCN-NEXT:    v_lshrrev_b16_e64 v18, 10, s7
1217; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1218; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1219; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1220; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1221; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1222; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x69
1223; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1224; GCN-NEXT:    v_lshrrev_b16_e64 v18, 9, s7
1225; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1226; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x68
1227; GCN-NEXT:    v_lshrrev_b16_e64 v19, 8, s7
1228; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1229; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1230; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1231; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1232; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1233; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1234; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
1235; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
1236; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1237; GCN-NEXT:    v_lshlrev_b16_e32 v16, 4, v16
1238; GCN-NEXT:    v_and_b32_e32 v17, 15, v17
1239; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x67
1240; GCN-NEXT:    v_or_b32_sdwa v16, v17, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1241; GCN-NEXT:    v_lshrrev_b16_e64 v17, 7, s7
1242; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1243; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x66
1244; GCN-NEXT:    v_lshrrev_b16_e64 v18, 6, s7
1245; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1246; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1247; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1248; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1249; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1250; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x65
1251; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1252; GCN-NEXT:    v_lshrrev_b16_e64 v18, 5, s7
1253; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1254; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x64
1255; GCN-NEXT:    v_lshrrev_b16_e64 v19, 4, s7
1256; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1257; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1258; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1259; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1260; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1261; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1262; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
1263; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
1264; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x63
1265; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1266; GCN-NEXT:    v_lshrrev_b16_e64 v18, 3, s7
1267; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1268; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x62
1269; GCN-NEXT:    v_lshrrev_b16_e64 v19, 2, s7
1270; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1271; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1272; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1273; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1274; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1275; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x61
1276; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1277; GCN-NEXT:    v_lshrrev_b16_e64 v19, 1, s7
1278; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1279; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x60
1280; GCN-NEXT:    v_mov_b32_e32 v15, s7
1281; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1282; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1283; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1284; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
1285; GCN-NEXT:    v_and_b32_e32 v15, 1, v15
1286; GCN-NEXT:    v_or_b32_e32 v15, v15, v19
1287; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
1288; GCN-NEXT:    v_and_b32_e32 v15, 3, v15
1289; GCN-NEXT:    v_or_b32_e32 v15, v15, v18
1290; GCN-NEXT:    v_lshlrev_b16_e32 v17, 4, v17
1291; GCN-NEXT:    v_and_b32_e32 v15, 15, v15
1292; GCN-NEXT:    v_or_b32_e32 v15, v15, v17
1293; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x57
1294; GCN-NEXT:    v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1295; GCN-NEXT:    v_mov_b32_e32 v16, s34
1296; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1297; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x56
1298; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1299; GCN-NEXT:    v_mov_b32_e32 v17, s33
1300; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1301; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1302; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1303; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1304; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x55
1305; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1306; GCN-NEXT:    v_mov_b32_e32 v17, s31
1307; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1308; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x54
1309; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1310; GCN-NEXT:    v_mov_b32_e32 v18, s30
1311; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1312; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1313; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1314; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1315; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1316; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
1317; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
1318; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x53
1319; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1320; GCN-NEXT:    v_mov_b32_e32 v17, s29
1321; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1322; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x52
1323; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1324; GCN-NEXT:    v_mov_b32_e32 v18, s28
1325; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1326; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1327; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1328; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1329; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x51
1330; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1331; GCN-NEXT:    v_mov_b32_e32 v18, s27
1332; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1333; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x50
1334; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1335; GCN-NEXT:    v_mov_b32_e32 v19, s26
1336; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1337; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1338; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1339; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1340; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1341; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
1342; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
1343; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1344; GCN-NEXT:    v_lshlrev_b16_e32 v16, 4, v16
1345; GCN-NEXT:    v_and_b32_e32 v17, 15, v17
1346; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5f
1347; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1348; GCN-NEXT:    v_lshrrev_b16_e64 v17, 7, s25
1349; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1350; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5e
1351; GCN-NEXT:    v_lshrrev_b16_e64 v18, 6, s25
1352; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1353; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1354; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1355; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1356; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1357; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5d
1358; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1359; GCN-NEXT:    v_lshrrev_b16_e64 v18, 5, s25
1360; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1361; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5c
1362; GCN-NEXT:    v_lshrrev_b16_e64 v19, 4, s25
1363; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1364; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1365; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1366; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1367; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1368; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1369; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
1370; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
1371; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5b
1372; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1373; GCN-NEXT:    v_lshrrev_b16_e64 v18, 3, s25
1374; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1375; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x5a
1376; GCN-NEXT:    v_lshrrev_b16_e64 v19, 2, s25
1377; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1378; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1379; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1380; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x58
1381; GCN-NEXT:    v_mov_b32_e32 v3, s25
1382; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1383; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1384; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1385; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x59
1386; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1387; GCN-NEXT:    v_lshrrev_b16_e64 v19, 1, s25
1388; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
1389; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1390; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1391; GCN-NEXT:    v_and_b32_e32 v3, 1, v3
1392; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
1393; GCN-NEXT:    v_or_b32_e32 v3, v3, v19
1394; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
1395; GCN-NEXT:    v_and_b32_e32 v3, 3, v3
1396; GCN-NEXT:    v_or_b32_e32 v3, v3, v18
1397; GCN-NEXT:    v_lshlrev_b16_e32 v17, 4, v17
1398; GCN-NEXT:    v_and_b32_e32 v3, 15, v3
1399; GCN-NEXT:    v_or_b32_sdwa v3, v3, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1400; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4f
1401; GCN-NEXT:    v_or_b32_sdwa v16, v16, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1402; GCN-NEXT:    v_lshrrev_b16_e64 v3, 15, s6
1403; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1404; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4e
1405; GCN-NEXT:    v_lshrrev_b16_e64 v17, 14, s6
1406; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
1407; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1408; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1409; GCN-NEXT:    v_lshlrev_b16_e32 v3, 1, v3
1410; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1411; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4d
1412; GCN-NEXT:    v_or_b32_e32 v3, v17, v3
1413; GCN-NEXT:    v_lshrrev_b16_e64 v17, 13, s6
1414; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1415; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4c
1416; GCN-NEXT:    v_lshrrev_b16_e64 v18, 12, s6
1417; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1418; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1419; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1420; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1421; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1422; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1423; GCN-NEXT:    v_lshlrev_b16_e32 v3, 2, v3
1424; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
1425; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4b
1426; GCN-NEXT:    v_or_b32_e32 v3, v17, v3
1427; GCN-NEXT:    v_lshrrev_b16_e64 v17, 11, s6
1428; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1429; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x4a
1430; GCN-NEXT:    v_lshrrev_b16_e64 v18, 10, s6
1431; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1432; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1433; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1434; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1435; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1436; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x49
1437; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1438; GCN-NEXT:    v_lshrrev_b16_e64 v18, 9, s6
1439; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1440; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x48
1441; GCN-NEXT:    v_lshrrev_b16_e64 v19, 8, s6
1442; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1443; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1444; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1445; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1446; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1447; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1448; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
1449; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
1450; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
1451; GCN-NEXT:    v_lshlrev_b16_e32 v3, 4, v3
1452; GCN-NEXT:    v_and_b32_e32 v17, 15, v17
1453; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x47
1454; GCN-NEXT:    v_or_b32_sdwa v17, v17, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1455; GCN-NEXT:    v_lshrrev_b16_e64 v3, 7, s6
1456; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1457; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x46
1458; GCN-NEXT:    v_lshrrev_b16_e64 v18, 6, s6
1459; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
1460; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1461; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1462; GCN-NEXT:    v_lshlrev_b16_e32 v3, 1, v3
1463; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1464; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x45
1465; GCN-NEXT:    v_or_b32_e32 v3, v18, v3
1466; GCN-NEXT:    v_lshrrev_b16_e64 v18, 5, s6
1467; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1468; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x44
1469; GCN-NEXT:    v_lshrrev_b16_e64 v19, 4, s6
1470; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1471; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1472; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1473; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1474; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1475; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
1476; GCN-NEXT:    v_lshlrev_b16_e32 v3, 2, v3
1477; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
1478; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x43
1479; GCN-NEXT:    v_or_b32_e32 v18, v18, v3
1480; GCN-NEXT:    v_lshrrev_b16_e64 v3, 3, s6
1481; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1482; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x42
1483; GCN-NEXT:    v_lshrrev_b16_e64 v19, 2, s6
1484; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
1485; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1486; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1487; GCN-NEXT:    v_lshlrev_b16_e32 v3, 1, v3
1488; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
1489; GCN-NEXT:    s_cmpk_lg_i32 s0, 0x41
1490; GCN-NEXT:    v_or_b32_e32 v3, v19, v3
1491; GCN-NEXT:    v_lshrrev_b16_e64 v19, 1, s6
1492; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1493; GCN-NEXT:    s_cmp_lg_u32 s0, 64
1494; GCN-NEXT:    v_mov_b32_e32 v2, s6
1495; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
1496; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1497; GCN-NEXT:    v_cndmask_b32_e32 v2, 1, v2, vcc
1498; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
1499; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
1500; GCN-NEXT:    v_or_b32_e32 v2, v2, v19
1501; GCN-NEXT:    v_lshlrev_b16_e32 v3, 2, v3
1502; GCN-NEXT:    v_and_b32_e32 v2, 3, v2
1503; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
1504; GCN-NEXT:    v_or_b32_sdwa v3, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1505; GCN-NEXT:    v_lshlrev_b16_e32 v14, 4, v18
1506; GCN-NEXT:    v_and_b32_e32 v2, 15, v2
1507; GCN-NEXT:    s_cmp_lg_u32 s0, 55
1508; GCN-NEXT:    v_or_b32_e32 v2, v2, v14
1509; GCN-NEXT:    v_mov_b32_e32 v14, s24
1510; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1511; GCN-NEXT:    s_cmp_lg_u32 s0, 54
1512; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
1513; GCN-NEXT:    v_mov_b32_e32 v15, s23
1514; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1515; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1516; GCN-NEXT:    v_lshlrev_b16_e32 v14, 1, v14
1517; GCN-NEXT:    v_and_b32_e32 v15, 1, v15
1518; GCN-NEXT:    s_cmp_lg_u32 s0, 53
1519; GCN-NEXT:    v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1520; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
1521; GCN-NEXT:    v_mov_b32_e32 v15, s22
1522; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1523; GCN-NEXT:    s_cmp_lg_u32 s0, 52
1524; GCN-NEXT:    v_or_b32_sdwa v2, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1525; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1526; GCN-NEXT:    v_mov_b32_e32 v16, s21
1527; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1528; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1529; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1530; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1531; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1532; GCN-NEXT:    v_lshlrev_b16_e32 v14, 2, v14
1533; GCN-NEXT:    v_and_b32_e32 v15, 3, v15
1534; GCN-NEXT:    s_cmp_lg_u32 s0, 51
1535; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
1536; GCN-NEXT:    v_mov_b32_e32 v15, s20
1537; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1538; GCN-NEXT:    s_cmp_lg_u32 s0, 50
1539; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1540; GCN-NEXT:    v_mov_b32_e32 v16, s19
1541; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1542; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1543; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1544; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1545; GCN-NEXT:    s_cmp_lg_u32 s0, 49
1546; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1547; GCN-NEXT:    v_mov_b32_e32 v16, s18
1548; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1549; GCN-NEXT:    s_cmp_lg_u32 s0, 48
1550; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1551; GCN-NEXT:    v_mov_b32_e32 v17, s17
1552; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1553; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1554; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1555; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1556; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1557; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
1558; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
1559; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1560; GCN-NEXT:    v_lshlrev_b16_e32 v14, 4, v14
1561; GCN-NEXT:    v_and_b32_e32 v15, 15, v15
1562; GCN-NEXT:    s_cmp_lg_u32 s0, 63
1563; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
1564; GCN-NEXT:    v_lshrrev_b16_e64 v15, 7, s16
1565; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1566; GCN-NEXT:    s_cmp_lg_u32 s0, 62
1567; GCN-NEXT:    v_lshrrev_b16_e64 v16, 6, s16
1568; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1569; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1570; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1571; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1572; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1573; GCN-NEXT:    s_cmp_lg_u32 s0, 61
1574; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1575; GCN-NEXT:    v_lshrrev_b16_e64 v16, 5, s16
1576; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1577; GCN-NEXT:    s_cmp_lg_u32 s0, 60
1578; GCN-NEXT:    v_lshrrev_b16_e64 v17, 4, s16
1579; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1580; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1581; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1582; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1583; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1584; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1585; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
1586; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
1587; GCN-NEXT:    s_cmp_lg_u32 s0, 59
1588; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1589; GCN-NEXT:    v_lshrrev_b16_e64 v16, 3, s16
1590; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1591; GCN-NEXT:    s_cmp_lg_u32 s0, 58
1592; GCN-NEXT:    v_lshrrev_b16_e64 v17, 2, s16
1593; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1594; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1595; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1596; GCN-NEXT:    s_cmp_lg_u32 s0, 56
1597; GCN-NEXT:    v_mov_b32_e32 v13, s16
1598; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1599; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1600; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1601; GCN-NEXT:    s_cmp_lg_u32 s0, 57
1602; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1603; GCN-NEXT:    v_lshrrev_b16_e64 v17, 1, s16
1604; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
1605; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1606; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1607; GCN-NEXT:    v_and_b32_e32 v13, 1, v13
1608; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
1609; GCN-NEXT:    v_or_b32_e32 v13, v13, v17
1610; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
1611; GCN-NEXT:    v_and_b32_e32 v13, 3, v13
1612; GCN-NEXT:    v_or_b32_e32 v13, v13, v16
1613; GCN-NEXT:    v_lshlrev_b16_e32 v15, 4, v15
1614; GCN-NEXT:    v_and_b32_e32 v13, 15, v13
1615; GCN-NEXT:    v_or_b32_sdwa v13, v13, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1616; GCN-NEXT:    s_cmp_lg_u32 s0, 47
1617; GCN-NEXT:    v_or_b32_sdwa v14, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1618; GCN-NEXT:    v_lshrrev_b16_e64 v13, 15, s5
1619; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1620; GCN-NEXT:    s_cmp_lg_u32 s0, 46
1621; GCN-NEXT:    v_lshrrev_b16_e64 v15, 14, s5
1622; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
1623; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1624; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1625; GCN-NEXT:    v_lshlrev_b16_e32 v13, 1, v13
1626; GCN-NEXT:    v_and_b32_e32 v15, 1, v15
1627; GCN-NEXT:    s_cmp_lg_u32 s0, 45
1628; GCN-NEXT:    v_or_b32_e32 v13, v15, v13
1629; GCN-NEXT:    v_lshrrev_b16_e64 v15, 13, s5
1630; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1631; GCN-NEXT:    s_cmp_lg_u32 s0, 44
1632; GCN-NEXT:    v_lshrrev_b16_e64 v16, 12, s5
1633; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1634; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1635; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1636; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1637; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1638; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1639; GCN-NEXT:    v_lshlrev_b16_e32 v13, 2, v13
1640; GCN-NEXT:    v_and_b32_e32 v15, 3, v15
1641; GCN-NEXT:    s_cmp_lg_u32 s0, 43
1642; GCN-NEXT:    v_or_b32_e32 v13, v15, v13
1643; GCN-NEXT:    v_lshrrev_b16_e64 v15, 11, s5
1644; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1645; GCN-NEXT:    s_cmp_lg_u32 s0, 42
1646; GCN-NEXT:    v_lshrrev_b16_e64 v16, 10, s5
1647; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1648; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1649; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1650; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1651; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1652; GCN-NEXT:    s_cmp_lg_u32 s0, 41
1653; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1654; GCN-NEXT:    v_lshrrev_b16_e64 v16, 9, s5
1655; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1656; GCN-NEXT:    s_cmp_lg_u32 s0, 40
1657; GCN-NEXT:    v_lshrrev_b16_e64 v17, 8, s5
1658; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1659; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1660; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1661; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1662; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1663; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1664; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
1665; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
1666; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1667; GCN-NEXT:    v_lshlrev_b16_e32 v13, 4, v13
1668; GCN-NEXT:    v_and_b32_e32 v15, 15, v15
1669; GCN-NEXT:    s_cmp_lg_u32 s0, 39
1670; GCN-NEXT:    v_or_b32_sdwa v15, v15, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1671; GCN-NEXT:    v_lshrrev_b16_e64 v13, 7, s5
1672; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1673; GCN-NEXT:    s_cmp_lg_u32 s0, 38
1674; GCN-NEXT:    v_lshrrev_b16_e64 v16, 6, s5
1675; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
1676; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1677; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1678; GCN-NEXT:    v_lshlrev_b16_e32 v13, 1, v13
1679; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1680; GCN-NEXT:    s_cmp_lg_u32 s0, 37
1681; GCN-NEXT:    v_or_b32_e32 v13, v16, v13
1682; GCN-NEXT:    v_lshrrev_b16_e64 v16, 5, s5
1683; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1684; GCN-NEXT:    s_cmp_lg_u32 s0, 36
1685; GCN-NEXT:    v_lshrrev_b16_e64 v17, 4, s5
1686; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1687; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1688; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1689; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1690; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1691; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1692; GCN-NEXT:    v_lshlrev_b16_e32 v13, 2, v13
1693; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
1694; GCN-NEXT:    s_cmp_lg_u32 s0, 35
1695; GCN-NEXT:    v_or_b32_e32 v16, v16, v13
1696; GCN-NEXT:    v_lshrrev_b16_e64 v13, 3, s5
1697; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1698; GCN-NEXT:    s_cmp_lg_u32 s0, 34
1699; GCN-NEXT:    v_lshrrev_b16_e64 v17, 2, s5
1700; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
1701; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1702; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1703; GCN-NEXT:    v_lshlrev_b16_e32 v13, 1, v13
1704; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1705; GCN-NEXT:    s_cmp_lg_u32 s0, 33
1706; GCN-NEXT:    v_or_b32_e32 v17, v17, v13
1707; GCN-NEXT:    v_lshrrev_b16_e64 v13, 1, s5
1708; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1709; GCN-NEXT:    s_cmp_lg_u32 s0, 32
1710; GCN-NEXT:    v_mov_b32_e32 v1, s5
1711; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
1712; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1713; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
1714; GCN-NEXT:    v_lshlrev_b16_e32 v13, 1, v13
1715; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
1716; GCN-NEXT:    v_or_b32_e32 v1, v1, v13
1717; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
1718; GCN-NEXT:    v_and_b32_e32 v1, 3, v1
1719; GCN-NEXT:    v_or_b32_e32 v1, v1, v17
1720; GCN-NEXT:    v_lshlrev_b16_e32 v16, 4, v16
1721; GCN-NEXT:    v_and_b32_e32 v1, 15, v1
1722; GCN-NEXT:    v_or_b32_e32 v1, v1, v16
1723; GCN-NEXT:    v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1724; GCN-NEXT:    s_cmp_lg_u32 s0, 23
1725; GCN-NEXT:    v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1726; GCN-NEXT:    v_mov_b32_e32 v14, s15
1727; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1728; GCN-NEXT:    s_cmp_lg_u32 s0, 22
1729; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
1730; GCN-NEXT:    v_mov_b32_e32 v15, s14
1731; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1732; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1733; GCN-NEXT:    v_lshlrev_b16_e32 v14, 1, v14
1734; GCN-NEXT:    v_and_b32_e32 v15, 1, v15
1735; GCN-NEXT:    s_cmp_lg_u32 s0, 21
1736; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
1737; GCN-NEXT:    v_mov_b32_e32 v15, s13
1738; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1739; GCN-NEXT:    s_cmp_lg_u32 s0, 20
1740; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1741; GCN-NEXT:    v_mov_b32_e32 v16, s12
1742; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1743; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1744; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1745; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1746; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1747; GCN-NEXT:    v_lshlrev_b16_e32 v14, 2, v14
1748; GCN-NEXT:    v_and_b32_e32 v15, 3, v15
1749; GCN-NEXT:    s_cmp_lg_u32 s0, 19
1750; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
1751; GCN-NEXT:    v_mov_b32_e32 v15, s11
1752; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1753; GCN-NEXT:    s_cmp_lg_u32 s0, 18
1754; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1755; GCN-NEXT:    v_mov_b32_e32 v16, s10
1756; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1757; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1758; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1759; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1760; GCN-NEXT:    s_cmp_lg_u32 s0, 17
1761; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1762; GCN-NEXT:    v_mov_b32_e32 v16, s9
1763; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1764; GCN-NEXT:    s_cmp_lg_u32 s0, 16
1765; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1766; GCN-NEXT:    v_mov_b32_e32 v18, s8
1767; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1768; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1769; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1770; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1771; GCN-NEXT:    v_or_b32_e32 v16, v18, v16
1772; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
1773; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
1774; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1775; GCN-NEXT:    v_lshlrev_b16_e32 v14, 4, v14
1776; GCN-NEXT:    v_and_b32_e32 v15, 15, v15
1777; GCN-NEXT:    s_cmp_lg_u32 s0, 31
1778; GCN-NEXT:    v_or_b32_e32 v14, v15, v14
1779; GCN-NEXT:    v_lshrrev_b16_e64 v15, 7, s1
1780; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1781; GCN-NEXT:    s_cmp_lg_u32 s0, 30
1782; GCN-NEXT:    v_lshrrev_b16_e64 v16, 6, s1
1783; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1784; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1785; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1786; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1787; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1788; GCN-NEXT:    s_cmp_lg_u32 s0, 29
1789; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1790; GCN-NEXT:    v_lshrrev_b16_e64 v16, 5, s1
1791; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1792; GCN-NEXT:    s_cmp_lg_u32 s0, 28
1793; GCN-NEXT:    v_lshrrev_b16_e64 v18, 4, s1
1794; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1795; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1796; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1797; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1798; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1799; GCN-NEXT:    v_or_b32_e32 v16, v18, v16
1800; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
1801; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
1802; GCN-NEXT:    s_cmp_lg_u32 s0, 27
1803; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1804; GCN-NEXT:    v_lshrrev_b16_e64 v16, 3, s1
1805; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1806; GCN-NEXT:    s_cmp_lg_u32 s0, 26
1807; GCN-NEXT:    v_lshrrev_b16_e64 v18, 2, s1
1808; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1809; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1810; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1811; GCN-NEXT:    s_cmp_lg_u32 s0, 24
1812; GCN-NEXT:    v_mov_b32_e32 v17, s1
1813; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1814; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
1815; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1816; GCN-NEXT:    s_cmp_lg_u32 s0, 25
1817; GCN-NEXT:    v_or_b32_e32 v16, v18, v16
1818; GCN-NEXT:    v_lshrrev_b16_e64 v18, 1, s1
1819; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1820; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1821; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
1822; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1823; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
1824; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
1825; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
1826; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
1827; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1828; GCN-NEXT:    v_lshlrev_b16_e32 v15, 4, v15
1829; GCN-NEXT:    v_and_b32_e32 v16, 15, v16
1830; GCN-NEXT:    v_or_b32_sdwa v15, v16, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1831; GCN-NEXT:    s_cmp_lg_u32 s0, 15
1832; GCN-NEXT:    v_or_b32_sdwa v14, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1833; GCN-NEXT:    v_lshrrev_b16_e64 v15, 15, s4
1834; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1835; GCN-NEXT:    s_cmp_lg_u32 s0, 14
1836; GCN-NEXT:    v_lshrrev_b16_e64 v16, 14, s4
1837; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
1838; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1839; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1840; GCN-NEXT:    v_lshlrev_b16_e32 v15, 1, v15
1841; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
1842; GCN-NEXT:    s_cmp_lg_u32 s0, 13
1843; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1844; GCN-NEXT:    v_lshrrev_b16_e64 v16, 13, s4
1845; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1846; GCN-NEXT:    s_cmp_lg_u32 s0, 12
1847; GCN-NEXT:    v_lshrrev_b16_e64 v17, 12, s4
1848; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
1849; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1850; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
1851; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1852; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
1853; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
1854; GCN-NEXT:    s_cmp_lg_u32 s0, 11
1855; GCN-NEXT:    v_lshrrev_b16_e64 v18, 11, s4
1856; GCN-NEXT:    v_lshlrev_b16_e32 v15, 2, v15
1857; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
1858; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1859; GCN-NEXT:    s_cmp_lg_u32 s0, 10
1860; GCN-NEXT:    v_lshrrev_b16_e64 v13, 10, s4
1861; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
1862; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v18, vcc
1863; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1864; GCN-NEXT:    s_cmp_lg_u32 s0, 9
1865; GCN-NEXT:    v_lshrrev_b16_e64 v12, 9, s4
1866; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
1867; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1868; GCN-NEXT:    s_cmp_lg_u32 s0, 8
1869; GCN-NEXT:    v_lshrrev_b16_e64 v11, 8, s4
1870; GCN-NEXT:    v_cndmask_b32_e32 v12, 1, v12, vcc
1871; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1872; GCN-NEXT:    s_cmp_lg_u32 s0, 7
1873; GCN-NEXT:    v_lshrrev_b16_e64 v10, 7, s4
1874; GCN-NEXT:    v_cndmask_b32_e32 v11, 1, v11, vcc
1875; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1876; GCN-NEXT:    s_cmp_lg_u32 s0, 6
1877; GCN-NEXT:    v_lshrrev_b16_e64 v9, 6, s4
1878; GCN-NEXT:    v_cndmask_b32_e32 v10, 1, v10, vcc
1879; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1880; GCN-NEXT:    s_cmp_lg_u32 s0, 5
1881; GCN-NEXT:    v_lshrrev_b16_e64 v8, 5, s4
1882; GCN-NEXT:    v_cndmask_b32_e32 v9, 1, v9, vcc
1883; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1884; GCN-NEXT:    s_cmp_lg_u32 s0, 4
1885; GCN-NEXT:    v_lshrrev_b16_e64 v7, 4, s4
1886; GCN-NEXT:    v_cndmask_b32_e32 v8, 1, v8, vcc
1887; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1888; GCN-NEXT:    s_cmp_lg_u32 s0, 3
1889; GCN-NEXT:    v_lshrrev_b16_e64 v6, 3, s4
1890; GCN-NEXT:    v_cndmask_b32_e32 v7, 1, v7, vcc
1891; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1892; GCN-NEXT:    s_cmp_lg_u32 s0, 2
1893; GCN-NEXT:    v_lshrrev_b16_e64 v5, 2, s4
1894; GCN-NEXT:    v_cndmask_b32_e32 v6, 1, v6, vcc
1895; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1896; GCN-NEXT:    s_cmp_lg_u32 s0, 1
1897; GCN-NEXT:    v_lshrrev_b16_e64 v4, 1, s4
1898; GCN-NEXT:    v_cndmask_b32_e32 v5, 1, v5, vcc
1899; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1900; GCN-NEXT:    s_cmp_lg_u32 s0, 0
1901; GCN-NEXT:    v_mov_b32_e32 v0, s4
1902; GCN-NEXT:    v_cndmask_b32_e32 v4, 1, v4, vcc
1903; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
1904; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
1905; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
1906; GCN-NEXT:    v_and_b32_e32 v13, 1, v13
1907; GCN-NEXT:    v_lshlrev_b16_e32 v12, 1, v12
1908; GCN-NEXT:    v_and_b32_e32 v11, 1, v11
1909; GCN-NEXT:    v_lshlrev_b16_e32 v10, 1, v10
1910; GCN-NEXT:    v_and_b32_e32 v9, 1, v9
1911; GCN-NEXT:    v_lshlrev_b16_e32 v8, 1, v8
1912; GCN-NEXT:    v_and_b32_e32 v7, 1, v7
1913; GCN-NEXT:    v_lshlrev_b16_e32 v6, 1, v6
1914; GCN-NEXT:    v_and_b32_e32 v5, 1, v5
1915; GCN-NEXT:    v_lshlrev_b16_e32 v4, 1, v4
1916; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
1917; GCN-NEXT:    v_or_b32_e32 v13, v13, v16
1918; GCN-NEXT:    v_or_b32_e32 v11, v11, v12
1919; GCN-NEXT:    v_or_b32_e32 v9, v9, v10
1920; GCN-NEXT:    v_or_b32_e32 v7, v7, v8
1921; GCN-NEXT:    v_or_b32_e32 v5, v5, v6
1922; GCN-NEXT:    v_or_b32_e32 v0, v0, v4
1923; GCN-NEXT:    v_lshlrev_b16_e32 v13, 2, v13
1924; GCN-NEXT:    v_and_b32_e32 v11, 3, v11
1925; GCN-NEXT:    v_lshlrev_b16_e32 v9, 2, v9
1926; GCN-NEXT:    v_and_b32_e32 v7, 3, v7
1927; GCN-NEXT:    v_lshlrev_b16_e32 v5, 2, v5
1928; GCN-NEXT:    v_and_b32_e32 v0, 3, v0
1929; GCN-NEXT:    v_or_b32_e32 v11, v11, v13
1930; GCN-NEXT:    v_or_b32_e32 v7, v7, v9
1931; GCN-NEXT:    v_or_b32_e32 v0, v0, v5
1932; GCN-NEXT:    v_lshlrev_b16_e32 v15, 4, v15
1933; GCN-NEXT:    v_and_b32_e32 v11, 15, v11
1934; GCN-NEXT:    v_lshlrev_b16_e32 v7, 4, v7
1935; GCN-NEXT:    v_and_b32_e32 v0, 15, v0
1936; GCN-NEXT:    v_or_b32_sdwa v11, v11, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1937; GCN-NEXT:    v_or_b32_e32 v0, v0, v7
1938; GCN-NEXT:    v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1939; GCN-NEXT:    v_mov_b32_e32 v5, s3
1940; GCN-NEXT:    v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1941; GCN-NEXT:    v_mov_b32_e32 v4, s2
1942; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1943; GCN-NEXT:    s_endpgm
1944entry:
1945  %v = insertelement <128 x i1> %vec, i1 1, i32 %sel
1946  store <128 x i1> %v, <128 x i1> addrspace(1)* %out
1947  ret void
1948}
1949
; float32_inselt_vec: insertelement of the constant 1.0 into a <32 x float>
; at a run-time index, in an amdgpu_ps function that takes the vector and
; the index as arguments and returns the updated vector.
;
; The autogenerated assertions below expect no memory traffic and no loop:
;   * 32 v_cmp_ne_u32 compares of the index (in v32) against the constants
;     0..31, each producing its own mask (vcc for index 1, SGPR pairs for
;     the others, s[62:63] for index 0);
;   * 32 v_cndmask_b32 selects, one per element register v0..v31, each
;     choosing between the inline constant 1.0 and the incoming element
;     under the matching "index != N" mask.
; Do not hand-edit the assertion lines; regenerate them with
; utils/update_llc_test_checks.py.
1950define amdgpu_ps <32 x float> @float32_inselt_vec(<32 x float> %vec, i32 %sel) {
1951; GCN-LABEL: float32_inselt_vec:
1952; GCN:       ; %bb.0: ; %entry
1953; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v32
1954; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 2, v32
1955; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 3, v32
1956; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 4, v32
1957; GCN-NEXT:    v_cmp_ne_u32_e64 s[6:7], 5, v32
1958; GCN-NEXT:    v_cmp_ne_u32_e64 s[8:9], 6, v32
1959; GCN-NEXT:    v_cmp_ne_u32_e64 s[10:11], 7, v32
1960; GCN-NEXT:    v_cmp_ne_u32_e64 s[12:13], 8, v32
1961; GCN-NEXT:    v_cmp_ne_u32_e64 s[14:15], 9, v32
1962; GCN-NEXT:    v_cmp_ne_u32_e64 s[16:17], 10, v32
1963; GCN-NEXT:    v_cmp_ne_u32_e64 s[18:19], 11, v32
1964; GCN-NEXT:    v_cmp_ne_u32_e64 s[20:21], 12, v32
1965; GCN-NEXT:    v_cmp_ne_u32_e64 s[22:23], 13, v32
1966; GCN-NEXT:    v_cmp_ne_u32_e64 s[24:25], 14, v32
1967; GCN-NEXT:    v_cmp_ne_u32_e64 s[26:27], 15, v32
1968; GCN-NEXT:    v_cmp_ne_u32_e64 s[28:29], 16, v32
1969; GCN-NEXT:    v_cmp_ne_u32_e64 s[30:31], 17, v32
1970; GCN-NEXT:    v_cmp_ne_u32_e64 s[34:35], 18, v32
1971; GCN-NEXT:    v_cmp_ne_u32_e64 s[36:37], 19, v32
1972; GCN-NEXT:    v_cmp_ne_u32_e64 s[38:39], 20, v32
1973; GCN-NEXT:    v_cmp_ne_u32_e64 s[40:41], 21, v32
1974; GCN-NEXT:    v_cmp_ne_u32_e64 s[42:43], 22, v32
1975; GCN-NEXT:    v_cmp_ne_u32_e64 s[44:45], 23, v32
1976; GCN-NEXT:    v_cmp_ne_u32_e64 s[46:47], 24, v32
1977; GCN-NEXT:    v_cmp_ne_u32_e64 s[48:49], 25, v32
1978; GCN-NEXT:    v_cmp_ne_u32_e64 s[50:51], 26, v32
1979; GCN-NEXT:    v_cmp_ne_u32_e64 s[52:53], 27, v32
1980; GCN-NEXT:    v_cmp_ne_u32_e64 s[54:55], 28, v32
1981; GCN-NEXT:    v_cmp_ne_u32_e64 s[56:57], 29, v32
1982; GCN-NEXT:    v_cmp_ne_u32_e64 s[58:59], 30, v32
1983; GCN-NEXT:    v_cmp_ne_u32_e64 s[60:61], 31, v32
1984; GCN-NEXT:    v_cmp_ne_u32_e64 s[62:63], 0, v32
1985; GCN-NEXT:    v_cndmask_b32_e64 v0, 1.0, v0, s[62:63]
1986; GCN-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
1987; GCN-NEXT:    v_cndmask_b32_e64 v2, 1.0, v2, s[0:1]
1988; GCN-NEXT:    v_cndmask_b32_e64 v3, 1.0, v3, s[2:3]
1989; GCN-NEXT:    v_cndmask_b32_e64 v4, 1.0, v4, s[4:5]
1990; GCN-NEXT:    v_cndmask_b32_e64 v5, 1.0, v5, s[6:7]
1991; GCN-NEXT:    v_cndmask_b32_e64 v6, 1.0, v6, s[8:9]
1992; GCN-NEXT:    v_cndmask_b32_e64 v7, 1.0, v7, s[10:11]
1993; GCN-NEXT:    v_cndmask_b32_e64 v8, 1.0, v8, s[12:13]
1994; GCN-NEXT:    v_cndmask_b32_e64 v9, 1.0, v9, s[14:15]
1995; GCN-NEXT:    v_cndmask_b32_e64 v10, 1.0, v10, s[16:17]
1996; GCN-NEXT:    v_cndmask_b32_e64 v11, 1.0, v11, s[18:19]
1997; GCN-NEXT:    v_cndmask_b32_e64 v12, 1.0, v12, s[20:21]
1998; GCN-NEXT:    v_cndmask_b32_e64 v13, 1.0, v13, s[22:23]
1999; GCN-NEXT:    v_cndmask_b32_e64 v14, 1.0, v14, s[24:25]
2000; GCN-NEXT:    v_cndmask_b32_e64 v15, 1.0, v15, s[26:27]
2001; GCN-NEXT:    v_cndmask_b32_e64 v16, 1.0, v16, s[28:29]
2002; GCN-NEXT:    v_cndmask_b32_e64 v17, 1.0, v17, s[30:31]
2003; GCN-NEXT:    v_cndmask_b32_e64 v18, 1.0, v18, s[34:35]
2004; GCN-NEXT:    v_cndmask_b32_e64 v19, 1.0, v19, s[36:37]
2005; GCN-NEXT:    v_cndmask_b32_e64 v20, 1.0, v20, s[38:39]
2006; GCN-NEXT:    v_cndmask_b32_e64 v21, 1.0, v21, s[40:41]
2007; GCN-NEXT:    v_cndmask_b32_e64 v22, 1.0, v22, s[42:43]
2008; GCN-NEXT:    v_cndmask_b32_e64 v23, 1.0, v23, s[44:45]
2009; GCN-NEXT:    v_cndmask_b32_e64 v24, 1.0, v24, s[46:47]
2010; GCN-NEXT:    v_cndmask_b32_e64 v25, 1.0, v25, s[48:49]
2011; GCN-NEXT:    v_cndmask_b32_e64 v26, 1.0, v26, s[50:51]
2012; GCN-NEXT:    v_cndmask_b32_e64 v27, 1.0, v27, s[52:53]
2013; GCN-NEXT:    v_cndmask_b32_e64 v28, 1.0, v28, s[54:55]
2014; GCN-NEXT:    v_cndmask_b32_e64 v29, 1.0, v29, s[56:57]
2015; GCN-NEXT:    v_cndmask_b32_e64 v30, 1.0, v30, s[58:59]
2016; GCN-NEXT:    v_cndmask_b32_e64 v31, 1.0, v31, s[60:61]
2017; GCN-NEXT:    ; return to shader part epilog
2018entry:
  ; Replace the element selected by the run-time index %sel with 1.0.
2019  %v = insertelement <32 x float> %vec, float 1.000000e+00, i32 %sel
2020  ret <32 x float> %v
2021}
2022
; double8_inselt_vec: insertelement of the constant double 1.0 into an
; <8 x double> at a run-time index, in a function declared without an
; explicit calling-convention keyword (it ends in s_setpc_b64 s[30:31]
; rather than s_endpgm).
;
; The autogenerated assertions below expect one compare/select group per
; element N = 0..7:
;   * v_cmp_eq_u32 of the index (in v16) against N, result in vcc;
;   * a v_cndmask_b32 on the even register of the pair with the inline
;     constant 0 (low 32 bits of double 1.0);
;   * a v_cndmask_b32 on the odd register of the pair with v17, which is
;     loaded once with 0x3ff00000 (high 32 bits of double 1.0).
; Do not hand-edit the assertion lines; regenerate them with
; utils/update_llc_test_checks.py.
2023define <8 x double> @double8_inselt_vec(<8 x double> %vec, i32 %sel) {
2024; GCN-LABEL: double8_inselt_vec:
2025; GCN:       ; %bb.0: ; %entry
2026; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2027; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v16
2028; GCN-NEXT:    v_mov_b32_e32 v17, 0x3ff00000
2029; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
2030; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v17, vcc
2031; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v16
2032; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
2033; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v17, vcc
2034; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v16
2035; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
2036; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v17, vcc
2037; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v16
2038; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, 0, vcc
2039; GCN-NEXT:    v_cndmask_b32_e32 v7, v7, v17, vcc
2040; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 4, v16
2041; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, 0, vcc
2042; GCN-NEXT:    v_cndmask_b32_e32 v9, v9, v17, vcc
2043; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v16
2044; GCN-NEXT:    v_cndmask_b32_e64 v10, v10, 0, vcc
2045; GCN-NEXT:    v_cndmask_b32_e32 v11, v11, v17, vcc
2046; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v16
2047; GCN-NEXT:    v_cndmask_b32_e64 v12, v12, 0, vcc
2048; GCN-NEXT:    v_cndmask_b32_e32 v13, v13, v17, vcc
2049; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v16
2050; GCN-NEXT:    v_cndmask_b32_e64 v14, v14, 0, vcc
2051; GCN-NEXT:    v_cndmask_b32_e32 v15, v15, v17, vcc
2052; GCN-NEXT:    s_setpc_b64 s[30:31]
2053entry:
  ; Replace the element selected by the run-time index %sel with 1.0.
2054  %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel
2055  ret <8 x double> %v
2056}
2057