1; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GCN-NO-TONGA %s
2; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-TONGA %s
3
4; FIXME: Broken on evergreen
5; FIXME: For some reason the 8 and 16 vectors are being stored as
6; individual elements instead of 128-bit stores.
7
8
9; FIXME: Why is the constant moved into the intermediate register and
10; not just directly into the vector component?
11
; Static insert of f32 5.0 into element 0 of a <4 x float> kernel argument.
; Expects the constant staged through an SGPR ([[CONSTREG]]) into the low
; VGPR of the result quad, then a single 128-bit store.
; GCN-LABEL: {{^}}insertelement_v4f32_0:
; GCN: s_load_dwordx4
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: s_mov_b32 [[CONSTREG:s[0-9]+]], 0x40a00000
; GCN-DAG: v_mov_b32_e32 v[[LOW_REG:[0-9]+]], [[CONSTREG]]
; GCN: buffer_store_dwordx4 v{{\[}}[[LOW_REG]]:
define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}
26
; Static insert of f32 5.0 into element 1; label-only check (compile test).
; GCN-LABEL: {{^}}insertelement_v4f32_1:
define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}
33
; Static insert of f32 5.0 into element 2; label-only check (compile test).
; GCN-LABEL: {{^}}insertelement_v4f32_2:
define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}
40
; Static insert of f32 5.0 into element 3; label-only check (compile test).
; GCN-LABEL: {{^}}insertelement_v4f32_3:
define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}
47
; Static insert of i32 999 (non-inline constant) into element 0 of <4 x i32>;
; label-only check (compile test).
; GCN-LABEL: {{^}}insertelement_v4i32_0:
define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
  %vecins = insertelement <4 x i32> %a, i32 999, i32 0
  store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
  ret void
}
54
; Static insert into element 1 of a <3 x float>; label-only check.
; GCN-LABEL: {{^}}insertelement_v3f32_1:
define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1
  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
  ret void
}
61
; Static insert into element 2 of a <3 x float>; label-only check.
; GCN-LABEL: {{^}}insertelement_v3f32_2:
define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2
  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
  ret void
}
68
; NOTE: index 3 is out of range for <3 x float>, so per the LangRef the
; insert result is undefined; the test only checks this compiles without
; crashing. Label-only check.
; GCN-LABEL: {{^}}insertelement_v3f32_3:
define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3
  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
  ret void
}
75
; The <4 x i32> operand of the image gather intrinsic must stay in SGPRs, so
; inserting into it should be done entirely on the scalar side without
; bouncing the vector through VGPRs and back via v_readfirstlane.
; GCN-LABEL: {{^}}insertelement_to_sgpr:
; GCN-NOT: v_readfirstlane
define <4 x float> @insertelement_to_sgpr() nounwind {
  %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef
  %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
  %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0)
  ret <4 x float> %tmp2
}
84
; Dynamic (variable-index) insert into <2 x float>: lowered to one compare +
; conditional move per element against the index, then a 64-bit store.
; GCN-LABEL: {{^}}dynamic_insertelement_v2f32:
; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]], v{{[0-9]+}}, [[CC1]]
; GCN: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
  store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8
  ret void
}
97
; Dynamic insert into <3 x float>: cmp+cndmask per element; the odd-sized
; vector is stored as a dwordx2 plus a dword.
; GCN-LABEL: {{^}}dynamic_insertelement_v3f32:
; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX:s[0-9]+]], 2
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC3]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC1]]
; GCN-DAG: buffer_store_dwordx2 v
; GCN-DAG: buffer_store_dword v
define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
  %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
  store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
  ret void
}
113
; Dynamic insert into <4 x float>: four cmp+cndmask pairs, one per lane,
; then a single 128-bit store.
; GCN-LABEL: {{^}}dynamic_insertelement_v4f32:
; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX:s[0-9]+]], 3
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC4]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC3]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]], v{{[0-9]+}}, [[CC1]]
; GCN: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]:
define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}
130
; Dynamic insert into <8 x float>: still uses the cmp+cndmask expansion.
; Only the first (index 0) and last (index 7) lane checks are asserted; the
; result is written with two 128-bit stores.
; GCN-LABEL: {{^}}dynamic_insertelement_v8f32:
; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
; GCN-DAG: v_cmp_ne_u32_e64 [[CCL:[^,]+]], [[IDX:s[0-9]+]], 7
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CCL]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC1]]
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
  %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
  store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32
  ret void
}
144
; Dynamic insert into <16 x float>: at this size the backend switches to the
; v_movreld_b32 indirect-register-write path instead of per-lane selects.
; GCN-LABEL: {{^}}dynamic_insertelement_v16f32:
; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
  %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
  store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
  ret void
}
156
; Dynamic insert of i32 5 into <2 x i32>: 5 is an inline constant, so it is
; used directly as a cndmask source with no materializing v_mov.
; GCN-LABEL: {{^}}dynamic_insertelement_v2i32:
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], 5, v{{[0-9]+}}, [[CC1]]
; GCN: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
  %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
  store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
  ret void
}
168
; Dynamic insert of inline-constant 5 into <3 x i32>; cmp+cndmask per lane,
; stored as dwordx2 + dword.
; GCN-LABEL: {{^}}dynamic_insertelement_v3i32:
; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX:s[0-9]+]], 2
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC3]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: buffer_store_dwordx2 v
; GCN-DAG: buffer_store_dword v
define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
  %vecins = insertelement <3 x i32> %a, i32 5, i32 %b
  store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16
  ret void
}
183
; Dynamic insert of a non-constant value (%val, loaded as a kernel arg) into
; <4 x i32>. Note the compares are v_cmp_eq (select the new value on match)
; rather than the v_cmp_ne form used when inserting a constant.
; GCN-LABEL: {{^}}dynamic_insertelement_v4i32:
; GCN: s_load_dword [[SVAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}
; GCN-DAG: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC4:[^,]+]], [[IDX:s[0-9]+]], 3
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC4]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC3:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC3]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC2]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC1]]
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
  %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
  store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
  ret void
}
201
; Dynamic insert of inline-constant 5 into <8 x i32>; only the lane-0 and
; lane-7 selects are asserted, plus two 128-bit stores.
; GCN-LABEL: {{^}}dynamic_insertelement_v8i32:
; GCN-DAG: v_cmp_ne_u32_e64 [[CCL:[^,]+]], [[IDX:s[0-9]+]], 7
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CCL]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC1]]
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
  %vecins = insertelement <8 x i32> %a, i32 5, i32 %b
  store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32
  ret void
}
214
; Dynamic insert into <16 x i32>: uses the v_movreld_b32 indirect write
; path, like the v16f32 case.
; GCN-LABEL: {{^}}dynamic_insertelement_v16i32:
; GCN: v_movreld_b32
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
  %vecins = insertelement <16 x i32> %a, i32 5, i32 %b
  store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64
  ret void
}
226
; Dynamic insert into a sub-dword <2 x i16> vector; label-only check.
; GCN-LABEL: {{^}}dynamic_insertelement_v2i16:
define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
  %vecins = insertelement <2 x i16> %a, i16 5, i32 %b
  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8
  ret void
}
233
; Dynamic insert into a sub-dword <3 x i16> vector; label-only check.
; GCN-LABEL: {{^}}dynamic_insertelement_v3i16:
define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind {
  %vecins = insertelement <3 x i16> %a, i16 5, i32 %b
  store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8
  ret void
}
240
; Dynamic insert into <2 x i8>: lowered as bit manipulation on the packed
; 16-bit value — build a lane mask from the scaled index, AND in the new
; value, merge with the inverse-masked original, store as a short.
; Only VI patterns are checked here.
; GCN-LABEL: {{^}}dynamic_insertelement_v2i8:
; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
; VI-NOT: _load
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: v_lshlrev_b16_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], -1
; VI: v_and_b32_e32 [[INSERT:v[0-9]+]], 5, [[MASK]]
; VI: v_xor_b32_e32 [[NOT_MASK:v[0-9]+]], -1, [[MASK]]
; VI: v_and_b32_e32 [[AND_NOT_MASK:v[0-9]+]], [[LOAD]], [[NOT_MASK]]
; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[INSERT]], [[AND_NOT_MASK]]
; VI: buffer_store_short [[OR]]
define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
  %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
  store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
  ret void
}
257
; FIXME: post legalize i16 and i32 shifts aren't merged because of
; isTypeDesirableForOp in SimplifyDemandedBits

; Dynamic insert into <3 x i8>: a 16-bit mask shifted by the scaled index is
; used with v_bfi to merge in the constant; the low two bytes go out as a
; short and the third byte is extracted with a shift and stored separately.
; GCN-LABEL: {{^}}dynamic_insertelement_v3i8:
; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
; VI-NOT: _load

; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
; VI: s_not_b32 [[NOT_MASK:s[0-9]+]], [[SHIFTED_MASK]]
; VI: s_and_b32 [[AND_NOT_MASK:s[0-9]+]], [[NOT_MASK]], [[LOAD]]
; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]]
; VI: s_lshr_b32 [[HI2:s[0-9]+]], [[AND_NOT_MASK]], 16

; VI-DAG: buffer_store_short [[BFI]]
; VI-DAG: v_mov_b32_e32 [[V_HI2:v[0-9]+]], [[HI2]]
; VI: buffer_store_byte [[V_HI2]]
define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
  %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
  store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
  ret void
}
282
; Dynamic insert into <4 x i8>: single v_bfi merge of the constant under a
; shifted byte mask, then one dword store of the packed vector.
; GCN-LABEL: {{^}}dynamic_insertelement_v4i8:
; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
; VI-NOT: _load

; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]]
; VI: buffer_store_dword [[BFI]]
define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
  %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
  store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
  ret void
}
298
; Dynamic insert into a <8 x i8> loaded from constant memory: the whole
; operation stays on the scalar unit — a 64-bit 0xffff mask is shifted by
; the scaled index, the old bytes are cleared with s_and of the inverted
; mask, and the constant is OR'd in; no vector loads should appear.
; GCN-LABEL: {{^}}s_dynamic_insertelement_v8i8:
; VI-NOT: {{buffer|flat|global}}_load
; VI-DAG: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
; VI-DAG: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x10
; VI-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0{{$}}
; VI-DAG: s_load_dwordx2 [[VEC:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0

; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff
; VI: s_lshl_b64 s{{\[}}[[MASK_SHIFT_LO:[0-9]+]]:[[MASK_SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]]
; VI: s_not_b64 [[NOT_MASK:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[MASK_SHIFT_LO]]:[[MASK_SHIFT_HI]]{{\]}}
; VI: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[NOT_MASK]], [[VEC]]
; VI: s_and_b32 s[[INS:[0-9]+]], s[[MASK_SHIFT_LO]], 5
; VI: s_or_b64 s{{\[}}[[RESULT0:[0-9]+]]:[[RESULT1:[0-9]+]]{{\]}}, s{{\[}}[[INS]]:[[MASK_HI]]{{\]}}, [[AND]]
; VI: v_mov_b32_e32 v[[V_RESULT0:[0-9]+]], s[[RESULT0]]
; VI: v_mov_b32_e32 v[[V_RESULT1:[0-9]+]], s[[RESULT1]]
; VI: buffer_store_dwordx2 v{{\[}}[[V_RESULT0]]:[[V_RESULT1]]{{\]}}
define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind {
  %a = load <8 x i8>, <8 x i8> addrspace(4)* %a.ptr, align 4
  %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
  store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8
  ret void
}
322
; Dynamic insert into <16 x i8> passed by value: must not degrade into
; per-byte stores; lanes are selected with cmp+cndmask (only the first and
; last lanes are asserted) and the result goes out as a 128-bit store.
; GCN-LABEL: {{^}}dynamic_insertelement_v16i8:
; GCN: s_load_dwordx2
; GCN: s_load_dwordx4
; GCN: s_load_dword s

; GCN-NOT: buffer_store_byte

; GCN-DAG: v_cmp_ne_u32_e64 [[CCL:[^,]+]], [[IDX:s[0-9]+]], 15
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CCL]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC1]]

; GCN: buffer_store_dwordx4
define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
  %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
  store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
  ret void
}
341
; This test requires handling INSERT_SUBREG in SIFixSGPRCopies.  Check that
; the compiler doesn't crash.
; GCN-LABEL: {{^}}insert_split_bb:
define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
entry:
  ; Element 0 is inserted in the entry block; element 1 is inserted on both
  ; sides of the branch, so the vector build is split across basic blocks
  ; and merged by the phi below.
  %0 = insertelement <2 x i32> undef, i32 %a, i32 0
  %1 = icmp eq i32 %a, 0
  br i1 %1, label %if, label %else

if:
  %2 = load i32, i32 addrspace(1)* %in
  %3 = insertelement <2 x i32> %0, i32 %2, i32 1
  br label %endif

else:
  %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %5 = load i32, i32 addrspace(1)* %4
  %6 = insertelement <2 x i32> %0, i32 %5, i32 1
  br label %endif

endif:
  %7 = phi <2 x i32> [%3, %if], [%6, %else]
  store <2 x i32> %7, <2 x i32> addrspace(1)* %out
  ret void
}
367
; Dynamic insert of f64 8.0 into <2 x double>: each 64-bit element needs two
; selects — the high dword gets the constant's hi word 0x40200000 and the
; low dword gets 0 — guarded by one index compare per element.
; GCN-LABEL: {{^}}dynamic_insertelement_v2f64:
; GCN-DAG: s_load_dwordx4 s{{\[}}[[A_ELT0:[0-9]+]]:[[A_ELT3:[0-9]+]]{{\]}}
; GCN-DAG: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x18|0x60}}{{$}}

; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000

; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[ELT1]], [[CC2]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[ELT1]], [[CC1]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]

; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind {
  %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
  store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
  ret void
}
392
; Dynamic insert of i64 5 into <2 x i64>: two cndmask selects per element
; (lo dword = 5, hi dword = 0), both inline constants, then one 128-bit
; store.
; GCN-LABEL: {{^}}dynamic_insertelement_v2i64:

; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC2]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC1]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]

; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
  %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
  store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
  ret void
}
409
; Dynamic insert of i64 5 into <3 x i64>: per-element compare plus paired
; lo/hi dword selects, as in the v2i64 case but with a third element.
; GCN-LABEL: {{^}}dynamic_insertelement_v3i64:
; GCN-DAG: v_cmp_eq_u32_e64 [[CC3:[^,]+]], [[IDX:s[0-9]+]], 2
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC3]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC3]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC2]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC1]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]
define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
  %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
  store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32
  ret void
}
425
; Dynamic insert of f64 8.0 into <4 x double>: per-element compare plus
; paired hi/lo selects (hi word 0x40200000, lo word 0). The ScratchSize: 0
; check asserts the expansion never spills to private memory.
; GCN-LABEL: {{^}}dynamic_insertelement_v4f64:

; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40200000
; GCN-DAG: v_cmp_eq_u32_e64 [[CC4:[^,]+]], [[IDX:s[0-9]+]], 3
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC4]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC4]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC3:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC3]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC3]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC2]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC1]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]

; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
; GCN: ScratchSize: 0

define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
  %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
  store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
  ret void
}
452
; Dynamic insert into <8 x double>: at this size the vector is spilled to a
; private-memory stack slot (four 16-byte stores at offsets 64..112), the
; element is overwritten with an offen store at the variable index, and the
; whole vector is reloaded before the final global stores. ScratchSize: 128
; pins the expected stack usage.
; GCN-LABEL: {{^}}dynamic_insertelement_v8f64:
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:64{{$}}
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:80{{$}}
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:96{{$}}
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:112{{$}}

; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], {{s[0-9]+}} offen{{$}}

; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:64{{$}}
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:80{{$}}
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:96{{$}}
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:112{{$}}

; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
; GCN: ScratchSize: 128
define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 {
  %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
  store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
  ret void
}
477
478declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
479
480attributes #0 = { nounwind }
481attributes #1 = { nounwind readnone }
482