1; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
2
; Dynamic-index extractelement from the constant vector <0.0, 1.0, 2.0, 4.0>.
; The checks expect scalar compares of the index against 1, 2 and 3, each
; turned into a 64-bit mask by s_cselect_b64, feeding a chain of three
; v_cndmask_b32 selects whose final result is stored. A negative check
; guards against `buffer_` instructions after the label.
3; GCN-LABEL: {{^}}float4_extelt:
4; GCN-NOT: buffer_
5; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
6; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
7; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
8; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
9; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
10; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
11; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1.0, [[C1]]
12; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], 2.0, [[V1]], [[C2]]
13; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], 4.0, [[V2]], [[C3]]
14; GCN:     store_dword v[{{[0-9:]+}}], [[V3]]
15define amdgpu_kernel void @float4_extelt(float addrspace(1)* %out, i32 %sel) {
16entry:
17  %ext = extractelement <4 x float> <float 0.0, float 1.0, float 2.0, float 4.0>, i32 %sel
18  store float %ext, float addrspace(1)* %out
19  ret void
20}
21
; Integer counterpart of the <4 x float> case: dynamic-index extractelement
; from the constant vector <0, 1, 2, 4>. Only the first select's condition
; is captured ([[C1]]); the second and third selects are expected to read
; their condition from vcc.
22; GCN-LABEL: {{^}}int4_extelt:
23; GCN-NOT: buffer_
24; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 2
25; GCN-DAG: s_cmp_eq_u32 [[IDX]], 1
26; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
27; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
28; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1, [[C1]]
29; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], 2, [[V1]], vcc
30; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], 4, [[V2]], vcc
31; GCN: store_dword v[{{[0-9:]+}}], [[V3]]
32define amdgpu_kernel void @int4_extelt(i32 addrspace(1)* %out, i32 %sel) {
33entry:
34  %ext = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 4>, i32 %sel
35  store i32 %ext, i32 addrspace(1)* %out
36  ret void
37}
38
; Dynamic-index extractelement from a constant <4 x double>. The checks
; expect equality compares of the index against 1, 2 and 3 and one
; v_cndmask_b32 per captured condition; the select operands themselves are
; left unconstrained. The result is written with a dwordx2 store.
39; GCN-LABEL: {{^}}double4_extelt:
40; GCN-NOT: buffer_
41; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
42; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
43; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
44; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
45; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3
46; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
47; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
48; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]]
49; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]]
50; GCN: store_dwordx2 v[{{[0-9:]+}}]
51define amdgpu_kernel void @double4_extelt(double addrspace(1)* %out, i32 %sel) {
52entry:
53  %ext = extractelement <4 x double> <double 0.01, double 1.01, double 2.01, double 4.01>, i32 %sel
54  store double %ext, double addrspace(1)* %out
55  ret void
56}
57
; Same shape as the <4 x double> case but with a non-power-of-two element
; count: dynamic-index extractelement from a constant <5 x double>. The
; checks expect equality compares against indices 1 through 4 and one
; v_cndmask_b32 per captured condition, followed by a dwordx2 store.
58; GCN-LABEL: {{^}}double5_extelt:
59; GCN-NOT: buffer_
60; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
61; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
62; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
63; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
64; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3
65; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
66; GCN-DAG: s_cmp_eq_u32 [[IDX]], 4
67; GCN-DAG: s_cselect_b64 [[C4:[^,]+]], -1, 0
68; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
69; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]]
70; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]]
71; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C4]]
72; GCN: store_dwordx2 v[{{[0-9:]+}}]
73define amdgpu_kernel void @double5_extelt(double addrspace(1)* %out, i32 %sel) {
74entry:
75  %ext = extractelement <5 x double> <double 0.01, double 1.01, double 2.01, double 4.01, double 5.01>, i32 %sel
76  store double %ext, double addrspace(1)* %out
77  ret void
78}
79
; Dynamic-index extractelement from the constant <4 x half> <1.0, 2.0, 3.0,
; 4.0>. The checks expect the four halves to be materialized as one 64-bit
; scalar constant (0x40003c00 in the low dword, 0x44004200 in the high
; dword), the index to be scaled by 16 via a left shift by 4, and the
; element to be brought to bit 0 with a 64-bit right shift before the low
; dword is copied to a VGPR and stored as a short.
80; GCN-LABEL: {{^}}half4_extelt:
81; GCN-NOT: buffer_
82; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
83; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x44004200
84; GCN-DAG: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
85; GCN:     s_lshr_b64 s[[[RL:[0-9]+]]:{{[0-9]+}}], s[[[SL]]:[[SH]]], [[SEL]]
86; GCN-DAG: v_mov_b32_e32 v[[VRL:[0-9]+]], s[[RL]]
87; GCN:     store_short v[{{[0-9:]+}}], v[[VRL]]
88define amdgpu_kernel void @half4_extelt(half addrspace(1)* %out, i32 %sel) {
89entry:
90  %ext = extractelement <4 x half> <half 1.0, half 2.0, half 3.0, half 4.0>, i32 %sel
91  store half %ext, half addrspace(1)* %out
92  ret void
93}
94
; Smallest float case: dynamic-index extractelement from the constant
; vector <0.0, 1.0>. The checks expect a single compare of the index
; against 1 and a single v_cndmask_b32 choosing between 0 and 1.0, whose
; result is stored.
95; GCN-LABEL: {{^}}float2_extelt:
96; GCN-NOT: buffer_
97; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
98; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
99; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1.0, [[C1]]
100; GCN: store_dword v[{{[0-9:]+}}], [[V1]]
101define amdgpu_kernel void @float2_extelt(float addrspace(1)* %out, i32 %sel) {
102entry:
103  %ext = extractelement <2 x float> <float 0.0, float 1.0>, i32 %sel
104  store float %ext, float addrspace(1)* %out
105  ret void
106}
107
; Dynamic-index extractelement from a constant <2 x double>. The checks
; expect one compare of the index against 1 and two v_cndmask_b32 selects
; that both use the same captured condition (one per 32-bit half of the
; result, judging by the dwordx2 store that follows).
108; GCN-LABEL: {{^}}double2_extelt:
109; GCN-NOT: buffer_
110; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
111; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
112; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
113; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
114; GCN: store_dwordx2 v[{{[0-9:]+}}]
115define amdgpu_kernel void @double2_extelt(double addrspace(1)* %out, i32 %sel) {
116entry:
117  %ext = extractelement <2 x double> <double 0.01, double 1.01>, i32 %sel
118  store double %ext, double addrspace(1)* %out
119  ret void
120}
121
; Dynamic-index extractelement from a constant <8 x half>. The checks
; expect seven index compares (against 1 through 7) and a chain of seven
; v_cndmask_b32 selects in which each select consumes the previous
; select's result; the last result is stored as a short.
122; GCN-LABEL: {{^}}half8_extelt:
123; GCN-NOT: buffer_
124; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
125; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
126; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
127; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
128; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
129; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
130; GCN-DAG: s_cmp_lg_u32 [[IDX]], 4
131; GCN-DAG: s_cselect_b64 [[C4:[^,]+]], -1, 0
132; GCN-DAG: s_cmp_lg_u32 [[IDX]], 5
133; GCN-DAG: s_cselect_b64 [[C5:[^,]+]], -1, 0
134; GCN-DAG: s_cmp_lg_u32 [[IDX]], 6
135; GCN-DAG: s_cselect_b64 [[C6:[^,]+]], -1, 0
136; GCN-DAG: s_cmp_lg_u32 [[IDX]], 7
137; GCN-DAG: s_cselect_b64 [[C7:[^,]+]], -1, 0
138; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
139; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
140; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
141; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]]
142; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]]
143; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]]
144; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]]
145; GCN:     store_short v[{{[0-9:]+}}], [[V7]]
146define amdgpu_kernel void @half8_extelt(half addrspace(1)* %out, i32 %sel) {
147entry:
148  %ext = extractelement <8 x half> <half 1.0, half 2.0, half 3.0, half 4.0, half 5.0, half 6.0, half 7.0, half 8.0>, i32 %sel
149  store half %ext, half addrspace(1)* %out
150  ret void
151}
152
; Integer counterpart of the <8 x half> case: dynamic-index extractelement
; from the constant <8 x i16> <1..8>. The checks expect the same pattern of
; seven index compares and a seven-deep v_cndmask_b32 chain ending in a
; short store.
153; GCN-LABEL: {{^}}short8_extelt:
154; GCN-NOT: buffer_
155; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
156; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
157; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
158; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
159; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
160; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
161; GCN-DAG: s_cmp_lg_u32 [[IDX]], 4
162; GCN-DAG: s_cselect_b64 [[C4:[^,]+]], -1, 0
163; GCN-DAG: s_cmp_lg_u32 [[IDX]], 5
164; GCN-DAG: s_cselect_b64 [[C5:[^,]+]], -1, 0
165; GCN-DAG: s_cmp_lg_u32 [[IDX]], 6
166; GCN-DAG: s_cselect_b64 [[C6:[^,]+]], -1, 0
167; GCN-DAG: s_cmp_lg_u32 [[IDX]], 7
168; GCN-DAG: s_cselect_b64 [[C7:[^,]+]], -1, 0
169; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
170; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
171; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
172; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]]
173; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]]
174; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]]
175; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]]
176; GCN:     store_short v[{{[0-9:]+}}], [[V7]]
177define amdgpu_kernel void @short8_extelt(i16 addrspace(1)* %out, i32 %sel) {
178entry:
179  %ext = extractelement <8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, i32 %sel
180  store i16 %ext, i16 addrspace(1)* %out
181  ret void
182}
183
; Dynamic-index extractelement from the constant <8 x float> <1.0..8.0>.
; Unlike the smaller vectors, the checks here expect all eight elements to
; be moved into VGPRs, the loaded index to be copied into m0, and the
; element to be read with v_movrels_b32 before a flat store.
; NOTE(review): the two s_load checks pin exact SGPR numbers and offsets
; (s[2:3], s[0:1], 0x24, 0x2c), which looks brittle against register
; allocation changes - confirm this is intentional.
184; GCN-LABEL: {{^}}float8_extelt:
185; GCN-DAG: s_load_dwordx2 s[2:3], s[0:1], 0x24
186; GCN-DAG: s_load_dword [[S0:s[0-9]+]], s[0:1], 0x2c
187; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0
188; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
189; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
190; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
191; GCN-DAG: s_waitcnt lgkmcnt(0)
192; GCN-DAG: s_mov_b32 m0, [[S0]]
193; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40a00000
194; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40c00000
195; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40e00000
196; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41000000
197; GCN-DAG: v_movrels_b32_e32 [[RES:v[0-9]+]], v{{[0-9]+}}
198; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
199; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
200; GCN:     flat_store_dword v[{{[0-9:]+}}], [[RES]]
201define amdgpu_kernel void @float8_extelt(float addrspace(1)* %out, i32 %sel) {
202entry:
203  %ext = extractelement <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, i32 %sel
204  store float %ext, float addrspace(1)* %out
205  ret void
206}
207
; Dynamic-index extractelement from a constant <8 x double>. The checks
; expect the index to be placed in m0 and the two 32-bit halves of the
; result to be read with v_movrels_b32 from consecutive base registers
; (BASE and BASE+1), then stored together as a dwordx2. Negative checks
; guard against `buffer_` and `s_or_b32`.
208; GCN-LABEL: {{^}}double8_extelt:
209; GCN-NOT: buffer_
210; GCN-NOT: s_or_b32
211; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
212; GCN-DAG: v_mov_b32_e32 v[[#BASE:]], [[ZERO]]
213; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
214; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]]
215; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]]
216; GCN:     store_dwordx2 v[{{[0-9:]+}}], v[[[RES_LO]]:[[RES_HI]]]
217define amdgpu_kernel void @double8_extelt(double addrspace(1)* %out, i32 %sel) {
218entry:
219  %ext = extractelement <8 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0>, i32 %sel
220  store double %ext, double addrspace(1)* %out
221  ret void
222}
223
; Non-power-of-two variant of the <8 x double> case: dynamic-index
; extractelement from a constant <7 x double>. The expected pattern is the
; same: index in m0, two v_movrels_b32 reads from BASE and BASE+1, and a
; dwordx2 store of the resulting register pair.
224; GCN-LABEL: {{^}}double7_extelt:
225; GCN-NOT: buffer_
226; GCN-NOT: s_or_b32
227; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
228; GCN-DAG: v_mov_b32_e32 v[[#BASE:]], [[ZERO]]
229; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
230; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]]
231; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]]
232; GCN:     store_dwordx2 v[{{[0-9:]+}}], v[[[RES_LO]]:[[RES_HI]]]
233define amdgpu_kernel void @double7_extelt(double addrspace(1)* %out, i32 %sel) {
234entry:
235  %ext = extractelement <7 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0>, i32 %sel
236  store double %ext, double addrspace(1)* %out
237  ret void
238}
239
; Dynamic-index extractelement from the constant <16 x float> <1.0..16.0>.
; The checks expect a write to m0, one v_mov_b32 per element (sixteen in
; total), and a v_movrels_b32 whose source is the register holding element
; 0 (the 1.0 move captured as VLO); the movrels result is stored.
240; GCN-LABEL: {{^}}float16_extelt:
241; GCN-NOT: buffer_
242; GCN-DAG: s_mov_b32 m0,
243; GCN-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], 1.0
244; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
245; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
246; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
247; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40a00000
248; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40c00000
249; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40e00000
250; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41000000
251; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41100000
252; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41200000
253; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41300000
254; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41400000
255; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41500000
256; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41600000
257; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41700000
258; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41800000
259; GCN-DAG: v_movrels_b32_e32 [[RES:v[0-9]+]], [[VLO]]
260; GCN:     store_dword v[{{[0-9:]+}}], [[RES]]
261define amdgpu_kernel void @float16_extelt(float addrspace(1)* %out, i32 %sel) {
262entry:
263  %ext = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %sel
264  store float %ext, float addrspace(1)* %out
265  ret void
266}
267
; Dynamic-index extractelement from a constant <15 x double> (odd element
; count). The checks expect the index in m0 and two v_movrels_b32 reads
; from consecutive base registers (BASE and BASE+1) forming the stored
; 64-bit result; `buffer_` and `s_or_b32` are guarded against.
268; GCN-LABEL: {{^}}double15_extelt:
269; GCN-NOT: buffer_
270; GCN-NOT: s_or_b32
271; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
272; GCN-DAG: v_mov_b32_e32 v[[#BASE:]], [[ZERO]]
273; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
274; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]]
275; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]]
276; GCN:     store_dwordx2 v[{{[0-9:]+}}], v[[[RES_LO]]:[[RES_HI]]]
277define amdgpu_kernel void @double15_extelt(double addrspace(1)* %out, i32 %sel) {
278entry:
279  %ext = extractelement <15 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0, double 9.0, double 10.0, double 11.0, double 12.0, double 13.0, double 14.0, double 15.0>, i32 %sel
280  store double %ext, double addrspace(1)* %out
281  ret void
282}
283
; Dynamic-index extractelement from a constant <16 x double>. The expected
; pattern matches the other wide double cases: index in m0, two
; v_movrels_b32 reads from BASE and BASE+1, and a dwordx2 store of the
; resulting register pair.
284; GCN-LABEL: {{^}}double16_extelt:
285; GCN-NOT: buffer_
286; GCN-NOT: s_or_b32
287; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
288; GCN-DAG: v_mov_b32_e32 v[[#BASE:]], [[ZERO]]
289; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
290; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]]
291; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]]
292; GCN:     store_dwordx2 v[{{[0-9:]+}}], v[[[RES_LO]]:[[RES_HI]]]
293define amdgpu_kernel void @double16_extelt(double addrspace(1)* %out, i32 %sel) {
294entry:
295  %ext = extractelement <16 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0, double 9.0, double 10.0, double 11.0, double 12.0, double 13.0, double 14.0, double 15.0, double 16.0>, i32 %sel
296  store double %ext, double addrspace(1)* %out
297  ret void
298}
299
; Dynamic-index extractelement from the constant <32 x float> <1.0..32.0>
; with a kernel (scalar) index. The checks expect a write to m0, one
; v_mov_b32 per element (thirty-two in total), and a v_movrels_b32 whose
; source is the register holding element 0 (captured as VLO); the movrels
; result is stored.
300; GCN-LABEL: {{^}}float32_extelt:
301; GCN-NOT: buffer_
302; GCN-DAG: s_mov_b32 m0,
303; GCN-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], 1.0
304; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
305; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
306; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
307; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40a00000
308; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40c00000
309; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40e00000
310; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41000000
311; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41100000
312; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41200000
313; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41300000
314; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41400000
315; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41500000
316; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41600000
317; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41700000
318; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41800000
319; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41880000
320; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
321; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41980000
322; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41a00000
323; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41a80000
324; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41b00000
325; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41b80000
326; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41c00000
327; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41c80000
328; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41d00000
329; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41d80000
330; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41e00000
331; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41e80000
332; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41f00000
333; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41f80000
334; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x42000000
335; GCN-DAG: v_movrels_b32_e32 [[RES:v[0-9]+]], [[VLO]]
336; GCN:     store_dword v[{{[0-9:]+}}], [[RES]]
337define amdgpu_kernel void @float32_extelt(float addrspace(1)* %out, i32 %sel) {
338entry:
339  %ext = extractelement <32 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0, float 17.0, float 18.0, float 19.0, float 20.0, float 21.0, float 22.0, float 23.0, float 24.0, float 25.0, float 26.0, float 27.0, float 28.0, float 29.0, float 30.0, float 31.0, float 32.0>, i32 %sel
340  store float %ext, float addrspace(1)* %out
341  ret void
342}
343
; Dynamic-index extractelement from the constant <8 x i8> <1..8>. The
; checks expect the eight bytes to be materialized as one 64-bit scalar
; constant (0x4030201 in the low dword, 0x8070605 in the high dword), the
; index to be scaled by 8 via a left shift by 3, and the element to be
; brought to bit 0 with a 64-bit right shift before the low dword is
; copied to a VGPR and stored as a byte.
344; GCN-LABEL: {{^}}byte8_extelt:
345; GCN-NOT: buffer_
346; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x4030201
347; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x8070605
348; GCN-DAG: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 3
349; GCN:     s_lshr_b64 s[[[RL:[0-9]+]]:{{[0-9]+}}], s[[[SL]]:[[SH]]], [[SEL]]
350; GCN-DAG: v_mov_b32_e32 v[[VRL:[0-9]+]], s[[RL]]
351; GCN:     store_byte v[{{[0-9:]+}}], v[[VRL]]
352define amdgpu_kernel void @byte8_extelt(i8 addrspace(1)* %out, i32 %sel) {
353entry:
354  %ext = extractelement <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, i32 %sel
355  store i8 %ext, i8 addrspace(1)* %out
356  ret void
357}
358
; Dynamic-index extractelement from the constant <16 x i8> <1..16>. The
; checks expect fifteen index compares (against 1 through 15), each turned
; into a 64-bit mask, and a chain of fifteen v_cndmask_b32 selects in which
; select N consumes the result of select N-1 and the condition captured
; for index N; the last result is stored as a byte.
359; GCN-LABEL: {{^}}byte16_extelt:
360; GCN-NOT: buffer_
361; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
362; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
363; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
364; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
365; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
366; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
367; GCN-DAG: s_cmp_lg_u32 [[IDX]], 4
368; GCN-DAG: s_cselect_b64 [[C4:[^,]+]], -1, 0
369; GCN-DAG: s_cmp_lg_u32 [[IDX]], 5
370; GCN-DAG: s_cselect_b64 [[C5:[^,]+]], -1, 0
371; GCN-DAG: s_cmp_lg_u32 [[IDX]], 6
372; GCN-DAG: s_cselect_b64 [[C6:[^,]+]], -1, 0
373; GCN-DAG: s_cmp_lg_u32 [[IDX]], 7
374; GCN-DAG: s_cselect_b64 [[C7:[^,]+]], -1, 0
375; GCN-DAG: s_cmp_lg_u32 [[IDX]], 8
376; GCN-DAG: s_cselect_b64 [[C8:[^,]+]], -1, 0
377; GCN-DAG: s_cmp_lg_u32 [[IDX]], 9
378; GCN-DAG: s_cselect_b64 [[C9:[^,]+]], -1, 0
379; GCN-DAG: s_cmp_lg_u32 [[IDX]], 10
380; GCN-DAG: s_cselect_b64 [[C10:[^,]+]], -1, 0
381; GCN-DAG: s_cmp_lg_u32 [[IDX]], 11
382; GCN-DAG: s_cselect_b64 [[C11:[^,]+]], -1, 0
383; GCN-DAG: s_cmp_lg_u32 [[IDX]], 12
384; GCN-DAG: s_cselect_b64 [[C12:[^,]+]], -1, 0
385; GCN-DAG: s_cmp_lg_u32 [[IDX]], 13
386; GCN-DAG: s_cselect_b64 [[C13:[^,]+]], -1, 0
387; GCN-DAG: s_cmp_lg_u32 [[IDX]], 14
388; GCN-DAG: s_cselect_b64 [[C14:[^,]+]], -1, 0
389; GCN-DAG: s_cmp_lg_u32 [[IDX]], 15
390; GCN-DAG: s_cselect_b64 [[C15:[^,]+]], -1, 0
391; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
392; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
393; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
394; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]]
395; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]]
396; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]]
397; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]]
398; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V8:v[0-9]+]], {{[^,]+}}, [[V7]], [[C8]]
399; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V9:v[0-9]+]], {{[^,]+}}, [[V8]], [[C9]]
400; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V10:v[0-9]+]], {{[^,]+}}, [[V9]], [[C10]]
401; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V11:v[0-9]+]], {{[^,]+}}, [[V10]], [[C11]]
402; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V12:v[0-9]+]], {{[^,]+}}, [[V11]], [[C12]]
403; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V13:v[0-9]+]], {{[^,]+}}, [[V12]], [[C13]]
404; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V14:v[0-9]+]], {{[^,]+}}, [[V13]], [[C14]]
405; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V15:v[0-9]+]], {{[^,]+}}, [[V14]], [[C15]]
406; GCN:     store_byte v[{{[0-9:]+}}], [[V15]]
407define amdgpu_kernel void @byte16_extelt(i8 addrspace(1)* %out, i32 %sel) {
408entry:
409  %ext = extractelement <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>, i32 %sel
410  store i8 %ext, i8 addrspace(1)* %out
411  ret void
412}
413
; Dynamic-index extractelement from the constant <4 x i1> <0, 1, 0, 1>,
; zero-extended to i32. Unlike the other cases in this file, the checks
; here expect buffer instructions: four byte stores of the 0/1 element
; values, then one byte load, a mask with 1, and a flat store of the
; result.
414; GCN-LABEL: {{^}}bit4_extelt:
415; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
416; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
417; GCN-DAG: buffer_store_byte [[ZERO]],
418; GCN-DAG: buffer_store_byte [[ONE]],
419; GCN-DAG: buffer_store_byte [[ZERO]],
420; GCN-DAG: buffer_store_byte [[ONE]],
421; GCN:     buffer_load_ubyte [[LOAD:v[0-9]+]],
422; GCN:     v_and_b32_e32 [[RES:v[0-9]+]], 1, [[LOAD]]
423; GCN:     flat_store_dword v[{{[0-9:]+}}], [[RES]]
424define amdgpu_kernel void @bit4_extelt(i32 addrspace(1)* %out, i32 %sel) {
425entry:
426  %ext = extractelement <4 x i1> <i1 0, i1 1, i1 0, i1 1>, i32 %sel
427  %zext = zext i1 %ext to i32
428  store i32 %zext, i32 addrspace(1)* %out
429  ret void
430}
431
432; GCN-LABEL: {{^}}bit128_extelt:
433; GCN-NOT: buffer_
434; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1
435; GCN: s_cmpk_lg_i32 {{s[0-9]+}}, 0x7f
436; GCN: s_cselect_b64 [[CL:[^,]+]], -1, 0
437; GCN: v_cndmask_b32_e{{32|64}} [[VL:v[0-9]+]], 0, [[V1]], [[CL]]
438; GCN:     v_and_b32_e32 [[RES:v[0-9]+]], 1, [[VL]]
439; GCN:     store_dword v[{{[0-9:]+}}], [[RES]]
440define amdgpu_kernel void @bit128_extelt(i32 addrspace(1)* %out, i32 %sel) {
441entry:
442  %ext = extractelement <128 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, i32 %sel
443  %zext = zext i1 %ext to i32
444  store i32 %zext, i32 addrspace(1)* %out
445  ret void
446}
447
; Non-kernel variant of the <32 x float> case: the index arrives as a
; function argument and the checks compare against v0 with vector compares
; instead of using m0. Only the two ends of the expected select chain are
; checked: the first select between 1.0 and 2.0 on index == 1, and the last
; select on index != 31 whose alternative is 0x42000000 (the bit pattern of
; the final element, 32.0) and whose result lands in v0.
448; GCN-LABEL: {{^}}float32_extelt_vec:
449; GCN-NOT: buffer_
450; GCN-DAG: v_cmp_eq_u32_e{{32|64}} [[CC1:[^,]+]], 1, v0
451; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 1.0, 2.0, [[CC1]]
452; GCN-DAG: v_mov_b32_e32 [[LASTVAL:v[0-9]+]], 0x42000000
453; GCN-DAG: v_cmp_ne_u32_e32 [[LASTCC:[^,]+]], 31, v0
454; GCN-DAG: v_cndmask_b32_e{{32|64}} v0, [[LASTVAL]], v{{[0-9]+}}, [[LASTCC]]
455define float @float32_extelt_vec(i32 %sel) {
456entry:
457  %ext = extractelement <32 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0, float 17.0, float 18.0, float 19.0, float 20.0, float 21.0, float 22.0, float 23.0, float 24.0, float 25.0, float 26.0, float 27.0, float 28.0, float 29.0, float 30.0, float 31.0, float 32.0>, i32 %sel
458  ret float %ext
459}
460
; Non-kernel variant of the <16 x double> case with the index in v0. Only
; the first step of the expected select chain is checked: the high and low
; dwords of elements 0 and 1 (1.1 and 2.1) are moved into VGPRs, and two
; v_cndmask_b32 selects sharing one index == 1 condition pick between
; them, one select per 32-bit half.
461; GCN-LABEL: {{^}}double16_extelt_vec:
462; GCN-NOT: buffer_
463; GCN-DAG: v_mov_b32_e32 [[V1HI:v[0-9]+]], 0x3ff19999
464; GCN-DAG: v_mov_b32_e32 [[V1LO:v[0-9]+]], 0x9999999a
465; GCN-DAG: v_mov_b32_e32 [[V2HI:v[0-9]+]], 0x4000cccc
466; GCN-DAG: v_mov_b32_e32 [[V2LO:v[0-9]+]], 0xcccccccd
467; GCN-DAG: v_cmp_eq_u32_e{{32|64}} [[CC1:[^,]+]], 1, v0
468; GCN-DAG: v_cndmask_b32_e{{32|64}} [[R1HI:v[0-9]+]], [[V1HI]], [[V2HI]], [[CC1]]
469; GCN-DAG: v_cndmask_b32_e{{32|64}} [[R1LO:v[0-9]+]], [[V1LO]], [[V2LO]], [[CC1]]
470define double @double16_extelt_vec(i32 %sel) {
471entry:
472  %ext = extractelement <16 x double> <double 1.1, double 2.1, double 3.1, double 4.1, double 5.1, double 6.1, double 7.1, double 8.1, double 9.1, double 10.1, double 11.1, double 12.1, double 13.1, double 14.1, double 15.1, double 16.1>, i32 %sel
473  ret double %ext
474}
475