1; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
2
; Dynamic extract from a constant <4 x float>. The checks expect the index to be
; compared against 1, 2 and 3, each compare producing a mask that drives one link
; in a chain of v_cndmask selects, and forbid buffer_ instructions before that
; sequence. The final select result must be the value that is stored.
3; GCN-LABEL: {{^}}float4_extelt:
4; GCN-NOT: buffer_
5; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
6; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
7; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
8; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
9; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
10; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
11; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1.0, [[C1]]
12; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], 2.0, [[V1]], [[C2]]
13; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], 4.0, [[V2]], [[C3]]
14; GCN:     store_dword v[{{[0-9:]+}}], [[V3]]
15define amdgpu_kernel void @float4_extelt(float addrspace(1)* %out, i32 %sel) {
16entry:
17  %ext = extractelement <4 x float> <float 0.0, float 1.0, float 2.0, float 4.0>, i32 %sel
18  store float %ext, float addrspace(1)* %out
19  ret void
20}
21
; Dynamic extract from a constant <4 x i32>. Same compare-and-select shape as
; float4_extelt, but only the first mask is captured from s_cselect_b64; the
; second and third selects are expected to take their condition from vcc.
22; GCN-LABEL: {{^}}int4_extelt:
23; GCN-NOT: buffer_
24; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 2
25; GCN-DAG: s_cmp_eq_u32 [[IDX]], 1
26; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
27; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
28; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1, [[C1]]
29; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], 2, [[V1]], vcc
30; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], 4, [[V2]], vcc
31; GCN: store_dword v[{{[0-9:]+}}], [[V3]]
32define amdgpu_kernel void @int4_extelt(i32 addrspace(1)* %out, i32 %sel) {
33entry:
34  %ext = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 4>, i32 %sel
35  store i32 %ext, i32 addrspace(1)* %out
36  ret void
37}
38
; Dynamic extract from a constant <4 x double>. Each 64-bit literal is expected
; to be materialized as a low/high pair of s_mov_b32 (0.01, 1.01, 2.01, 4.01),
; and the element is chosen by a chain of three s_cselect_b64, each consuming
; the previous result. The selected scalar pair is then copied into v registers
; for the 64-bit store.
39; GCN-LABEL: {{^}}double4_extelt:
40; GCN-NOT: buffer_
41; GCN-DAG: s_mov_b32 s[[L0LO:[0-9]+]], 0x47ae147b
42; GCN-DAG: s_mov_b32 s[[L0HI:[0-9]+]], 0x3f847ae1
43; GCN-DAG: s_mov_b32 s[[L1LO:[0-9]+]], 0xc28f5c29
44; GCN-DAG: s_mov_b32 s[[L1HI:[0-9]+]], 0x3ff028f5
45; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
46; GCN: s_cselect_b64 s{{\[}}[[T0LO:[0-9]+]]:[[T0HI:[0-9]+]]{{\]}}, s{{\[}}[[L1LO]]:[[L1HI]]{{\]}}, s{{\[}}[[L0LO]]:[[L0HI]]{{\]}}
47; GCN-DAG: s_mov_b32 s[[L2LO:[0-9]+]], 0xe147ae14
48; GCN-DAG: s_mov_b32 s[[L2HI:[0-9]+]], 0x4000147a
49; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
50; GCN: s_cselect_b64 s{{\[}}[[T1LO:[0-9]+]]:[[T1HI:[0-9]+]]{{\]}}, s{{\[}}[[T0LO]]:[[T0HI]]{{\]}}, s{{\[}}[[L2LO]]:[[L2HI]]{{\]}}
51; GCN-DAG: s_mov_b32 s[[L3LO:[0-9]+]], 0x70a3d70a
52; GCN-DAG: s_mov_b32 s[[L3HI:[0-9]+]], 0x40100a3d
53; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
54; GCN: s_cselect_b64 s{{\[}}[[T2LO:[0-9]+]]:[[T2HI:[0-9]+]]{{\]}}, s{{\[}}[[T1LO]]:[[T1HI]]{{\]}}, s{{\[}}[[L3LO]]:[[L3HI]]{{\]}}
55; GCN-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[T2LO]]
56; GCN-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[T2HI]]
57; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
58define amdgpu_kernel void @double4_extelt(double addrspace(1)* %out, i32 %sel) {
59entry:
60  %ext = extractelement <4 x double> <double 0.01, double 1.01, double 2.01, double 4.01>, i32 %sel
61  store double %ext, double addrspace(1)* %out
62  ret void
63}
64
; Dynamic extract from a constant <5 x double> (a non-power-of-two element
; count). The expected shape is the double4_extelt chain extended by one more
; s_cselect_b64 stage for index 4. Only the high dword of the fifth literal is
; checked as a new s_mov_b32; its low dword reuses the register captured for the
; fourth literal.
65; GCN-LABEL: {{^}}double5_extelt:
66; GCN-NOT: buffer_
67; GCN-DAG: s_mov_b32 s[[L0LO:[0-9]+]], 0x47ae147b
68; GCN-DAG: s_mov_b32 s[[L0HI:[0-9]+]], 0x3f847ae1
69; GCN-DAG: s_mov_b32 s[[L1LO:[0-9]+]], 0xc28f5c29
70; GCN-DAG: s_mov_b32 s[[L1HI:[0-9]+]], 0x3ff028f5
71; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
72; GCN: s_cselect_b64 s{{\[}}[[T0LO:[0-9]+]]:[[T0HI:[0-9]+]]{{\]}}, s{{\[}}[[L1LO]]:[[L1HI]]{{\]}}, s{{\[}}[[L0LO]]:[[L0HI]]{{\]}}
73; GCN-DAG: s_mov_b32 s[[L2LO:[0-9]+]], 0xe147ae14
74; GCN-DAG: s_mov_b32 s[[L2HI:[0-9]+]], 0x4000147a
75; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
76; GCN: s_cselect_b64 s{{\[}}[[T1LO:[0-9]+]]:[[T1HI:[0-9]+]]{{\]}}, s{{\[}}[[T0LO]]:[[T0HI]]{{\]}}, s{{\[}}[[L2LO]]:[[L2HI]]{{\]}}
77; GCN-DAG: s_mov_b32 s[[L3LO:[0-9]+]], 0x70a3d70a
78; GCN-DAG: s_mov_b32 s[[L3HI:[0-9]+]], 0x40100a3d
79; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
80; GCN: s_cselect_b64 s{{\[}}[[T2LO:[0-9]+]]:[[T2HI:[0-9]+]]{{\]}}, s{{\[}}[[T1LO]]:[[T1HI]]{{\]}}, s{{\[}}[[L3LO]]:[[L3HI]]{{\]}}
81; Double literals 5.01 and 4.01 share the same low 32 bits.
82; GCN-DAG: s_mov_b32 s[[L4HI:[0-9]+]], 0x40140a3d
83; GCN-DAG: s_cmp_lg_u32 [[IDX]], 4
84; GCN: s_cselect_b64 s{{\[}}[[T3LO:[0-9]+]]:[[T3HI:[0-9]+]]{{\]}}, s{{\[}}[[T2LO]]:[[T2HI]]{{\]}}, s{{\[}}[[L3LO]]:[[L4HI]]{{\]}}
85; GCN-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[T3LO]]
86; GCN-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[T3HI]]
87; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
88define amdgpu_kernel void @double5_extelt(double addrspace(1)* %out, i32 %sel) {
89entry:
90  %ext = extractelement <5 x double> <double 0.01, double 1.01, double 2.01, double 4.01, double 5.01>, i32 %sel
91  store double %ext, double addrspace(1)* %out
92  ret void
93}
94
; Dynamic extract from a constant <4 x half>. The four 16-bit constants are
; expected to be packed into a 64-bit scalar pair (0x40003c00 = {1.0, 2.0},
; 0x44004200 = {3.0, 4.0}); the index is scaled to a bit offset with a left
; shift by 4, and the element is brought down with a single s_lshr_b64 whose
; low dword is stored as a short.
95; GCN-LABEL: {{^}}half4_extelt:
96; GCN-NOT: buffer_
97; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
98; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x44004200
99; GCN-DAG: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
100; GCN:     s_lshr_b64 s{{\[}}[[RL:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SL]]:[[SH]]], [[SEL]]
101; GCN-DAG: v_mov_b32_e32 v[[VRL:[0-9]+]], s[[RL]]
102; GCN:     store_short v[{{[0-9:]+}}], v[[VRL]]
103define amdgpu_kernel void @half4_extelt(half addrspace(1)* %out, i32 %sel) {
104entry:
105  %ext = extractelement <4 x half> <half 1.0, half 2.0, half 3.0, half 4.0>, i32 %sel
106  store half %ext, half addrspace(1)* %out
107  ret void
108}
109
; Dynamic extract from a constant <2 x float>. The checks expect a single
; compare of the index against 1 and one v_cndmask choosing between 0 and 1.0,
; whose result is stored directly.
110; GCN-LABEL: {{^}}float2_extelt:
111; GCN-NOT: buffer_
112; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
113; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
114; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1.0, [[C1]]
115; GCN: store_dword v[{{[0-9:]+}}], [[V1]]
116define amdgpu_kernel void @float2_extelt(float addrspace(1)* %out, i32 %sel) {
117entry:
118  %ext = extractelement <2 x float> <float 0.0, float 1.0>, i32 %sel
119  store float %ext, float addrspace(1)* %out
120  ret void
121}
122
; Dynamic extract from a constant <2 x double>. Both literals (0.01 and 1.01)
; are expected as low/high s_mov_b32 pairs, with a single s_cselect_b64 on
; "index == 1" picking between them before the result is copied to v registers
; and stored.
123; GCN-LABEL: {{^}}double2_extelt:
124; GCN-NOT: buffer_
125; GCN-DAG: s_mov_b32 s[[L0LO:[0-9]+]], 0x47ae147b
126; GCN-DAG: s_mov_b32 s[[L0HI:[0-9]+]], 0x3f847ae1
127; GCN-DAG: s_mov_b32 s[[L1LO:[0-9]+]], 0xc28f5c29
128; GCN-DAG: s_mov_b32 s[[L1HI:[0-9]+]], 0x3ff028f5
129; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
130; GCN: s_cselect_b64 s{{\[}}[[T0LO:[0-9]+]]:[[T0HI:[0-9]+]]{{\]}}, s{{\[}}[[L1LO]]:[[L1HI]]{{\]}}, s{{\[}}[[L0LO]]:[[L0HI]]{{\]}}
131; GCN-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[T0LO]]
132; GCN-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[T0HI]]
133; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
134define amdgpu_kernel void @double2_extelt(double addrspace(1)* %out, i32 %sel) {
135entry:
136  %ext = extractelement <2 x double> <double 0.01, double 1.01>, i32 %sel
137  store double %ext, double addrspace(1)* %out
138  ret void
139}
140
; Dynamic extract from a constant <8 x half>. The checks expect seven compares
; of the index (against 1 through 7), each producing a mask for one link of a
; seven-deep v_cndmask chain. The constant operands of the selects are left
; unconstrained; only the chaining of results and masks is verified.
141; GCN-LABEL: {{^}}half8_extelt:
142; GCN-NOT: buffer_
143; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
144; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
145; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
146; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
147; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
148; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
149; GCN-DAG: s_cmp_lg_u32 [[IDX]], 4
150; GCN-DAG: s_cselect_b64 [[C4:[^,]+]], -1, 0
151; GCN-DAG: s_cmp_lg_u32 [[IDX]], 5
152; GCN-DAG: s_cselect_b64 [[C5:[^,]+]], -1, 0
153; GCN-DAG: s_cmp_lg_u32 [[IDX]], 6
154; GCN-DAG: s_cselect_b64 [[C6:[^,]+]], -1, 0
155; GCN-DAG: s_cmp_lg_u32 [[IDX]], 7
156; GCN-DAG: s_cselect_b64 [[C7:[^,]+]], -1, 0
157; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
158; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
159; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
160; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]]
161; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]]
162; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]]
163; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]]
164; GCN:     store_short v[{{[0-9:]+}}], [[V7]]
165define amdgpu_kernel void @half8_extelt(half addrspace(1)* %out, i32 %sel) {
166entry:
167  %ext = extractelement <8 x half> <half 1.0, half 2.0, half 3.0, half 4.0, half 5.0, half 6.0, half 7.0, half 8.0>, i32 %sel
168  store half %ext, half addrspace(1)* %out
169  ret void
170}
171
; Dynamic extract from a constant <8 x i16>. Uses the same check structure as
; half8_extelt: seven index compares (1 through 7) feeding a seven-deep
; v_cndmask chain, with the select constants left unconstrained and the final
; result stored as a short.
172; GCN-LABEL: {{^}}short8_extelt:
173; GCN-NOT: buffer_
174; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
175; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
176; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
177; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
178; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
179; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
180; GCN-DAG: s_cmp_lg_u32 [[IDX]], 4
181; GCN-DAG: s_cselect_b64 [[C4:[^,]+]], -1, 0
182; GCN-DAG: s_cmp_lg_u32 [[IDX]], 5
183; GCN-DAG: s_cselect_b64 [[C5:[^,]+]], -1, 0
184; GCN-DAG: s_cmp_lg_u32 [[IDX]], 6
185; GCN-DAG: s_cselect_b64 [[C6:[^,]+]], -1, 0
186; GCN-DAG: s_cmp_lg_u32 [[IDX]], 7
187; GCN-DAG: s_cselect_b64 [[C7:[^,]+]], -1, 0
188; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
189; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
190; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
191; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]]
192; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]]
193; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]]
194; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]]
195; GCN:     store_short v[{{[0-9:]+}}], [[V7]]
196define amdgpu_kernel void @short8_extelt(i16 addrspace(1)* %out, i32 %sel) {
197entry:
198  %ext = extractelement <8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, i32 %sel
199  store i16 %ext, i16 addrspace(1)* %out
200  ret void
201}
202
; Dynamic extract from a constant <8 x float>. The checks still expect the
; compare-and-select lowering at this width: seven index compares (1 through 7)
; driving a seven-deep v_cndmask chain, ending in a dword store of the last
; select result.
203; GCN-LABEL: {{^}}float8_extelt:
204; GCN-NOT: buffer_
205; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
206; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
207; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
208; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
209; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
210; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
211; GCN-DAG: s_cmp_lg_u32 [[IDX]], 4
212; GCN-DAG: s_cselect_b64 [[C4:[^,]+]], -1, 0
213; GCN-DAG: s_cmp_lg_u32 [[IDX]], 5
214; GCN-DAG: s_cselect_b64 [[C5:[^,]+]], -1, 0
215; GCN-DAG: s_cmp_lg_u32 [[IDX]], 6
216; GCN-DAG: s_cselect_b64 [[C6:[^,]+]], -1, 0
217; GCN-DAG: s_cmp_lg_u32 [[IDX]], 7
218; GCN-DAG: s_cselect_b64 [[C7:[^,]+]], -1, 0
219; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
220; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
221; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
222; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]]
223; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]]
224; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]]
225; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]]
226; GCN:     store_dword v[{{[0-9:]+}}], [[V7]]
227define amdgpu_kernel void @float8_extelt(float addrspace(1)* %out, i32 %sel) {
228entry:
229  %ext = extractelement <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, i32 %sel
230  store float %ext, float addrspace(1)* %out
231  ret void
232}
233
; Dynamic extract from a constant <8 x double>. Instead of a select chain, the
; checks expect m0 to be written from a scalar register and two
; v_movrels_b32 reads at consecutive base registers (BASE and BASE+1) producing
; the low and high dwords of the result. BASE is the v register holding the
; zero low dword of the first element. s_or_b32 and buffer_ instructions are
; forbidden before that sequence.
234; GCN-LABEL: {{^}}double8_extelt:
235; GCN-NOT: buffer_
236; GCN-NOT: s_or_b32
237; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
238; GCN-DAG: v_mov_b32_e32 v[[#BASE:]], [[ZERO]]
239; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
240; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]]
241; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]]
242; GCN:     store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[RES_LO]]:[[RES_HI]]]
243define amdgpu_kernel void @double8_extelt(double addrspace(1)* %out, i32 %sel) {
244entry:
245  %ext = extractelement <8 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0>, i32 %sel
246  store double %ext, double addrspace(1)* %out
247  ret void
248}
249
; Dynamic extract from a constant <7 x double> (non-power-of-two element count).
; The expected code has the same shape as double8_extelt: m0 written from a
; scalar register, then two v_movrels_b32 reads at BASE and BASE+1 forming the
; low/high dwords that are stored, with no s_or_b32 or buffer_ instructions
; before them.
250; GCN-LABEL: {{^}}double7_extelt:
251; GCN-NOT: buffer_
252; GCN-NOT: s_or_b32
253; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
254; GCN-DAG: v_mov_b32_e32 v[[#BASE:]], [[ZERO]]
255; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
256; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]]
257; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]]
258; GCN:     store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[RES_LO]]:[[RES_HI]]]
259define amdgpu_kernel void @double7_extelt(double addrspace(1)* %out, i32 %sel) {
260entry:
261  %ext = extractelement <7 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0>, i32 %sel
262  store double %ext, double addrspace(1)* %out
263  ret void
264}
265
; Dynamic extract from a constant <16 x float>. All sixteen element values
; (1.0 through 16.0, as inline constants or hex bit patterns) are expected to be
; moved into v registers, m0 is written, and a single v_movrels_b32 reads
; relative to the register holding 1.0 (the first element) to produce the
; stored result.
266; GCN-LABEL: {{^}}float16_extelt:
267; GCN-NOT: buffer_
268; GCN-DAG: s_mov_b32 m0,
269; GCN-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], 1.0
270; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
271; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
272; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
273; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40a00000
274; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40c00000
275; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40e00000
276; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41000000
277; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41100000
278; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41200000
279; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41300000
280; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41400000
281; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41500000
282; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41600000
283; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41700000
284; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41800000
285; GCN-DAG: v_movrels_b32_e32 [[RES:v[0-9]+]], [[VLO]]
286; GCN:     store_dword v[{{[0-9:]+}}], [[RES]]
287define amdgpu_kernel void @float16_extelt(float addrspace(1)* %out, i32 %sel) {
288entry:
289  %ext = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %sel
290  store float %ext, float addrspace(1)* %out
291  ret void
292}
293
; Dynamic extract from a constant <15 x double> (non-power-of-two element
; count). Expected shape matches the other wide double tests: m0 written from a
; scalar register and two v_movrels_b32 reads at BASE and BASE+1 giving the
; low/high dwords of the stored value, with s_or_b32 and buffer_ instructions
; forbidden before them.
294; GCN-LABEL: {{^}}double15_extelt:
295; GCN-NOT: buffer_
296; GCN-NOT: s_or_b32
297; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
298; GCN-DAG: v_mov_b32_e32 v[[#BASE:]], [[ZERO]]
299; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
300; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]]
301; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]]
302; GCN:     store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[RES_LO]]:[[RES_HI]]]
303define amdgpu_kernel void @double15_extelt(double addrspace(1)* %out, i32 %sel) {
304entry:
305  %ext = extractelement <15 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0, double 9.0, double 10.0, double 11.0, double 12.0, double 13.0, double 14.0, double 15.0>, i32 %sel
306  store double %ext, double addrspace(1)* %out
307  ret void
308}
309
; Dynamic extract from a constant <16 x double>. Expected shape: m0 written
; from a scalar register, then two v_movrels_b32 reads at BASE and BASE+1
; yielding the low/high dwords that are stored as a 64-bit value. s_or_b32 and
; buffer_ instructions are forbidden before that sequence.
310; GCN-LABEL: {{^}}double16_extelt:
311; GCN-NOT: buffer_
312; GCN-NOT: s_or_b32
313; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
314; GCN-DAG: v_mov_b32_e32 v[[#BASE:]], [[ZERO]]
315; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
316; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]]
317; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]]
318; GCN:     store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[RES_LO]]:[[RES_HI]]]
319define amdgpu_kernel void @double16_extelt(double addrspace(1)* %out, i32 %sel) {
320entry:
321  %ext = extractelement <16 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0, double 9.0, double 10.0, double 11.0, double 12.0, double 13.0, double 14.0, double 15.0, double 16.0>, i32 %sel
322  store double %ext, double addrspace(1)* %out
323  ret void
324}
325
; Dynamic extract from a constant <32 x float> with a scalar (kernel argument)
; index. All thirty-two element values (1.0 through 32.0) are expected to be
; moved into v registers, m0 is written, and one v_movrels_b32 reads relative
; to the register holding 1.0 (the first element) to produce the stored dword.
326; GCN-LABEL: {{^}}float32_extelt:
327; GCN-NOT: buffer_
328; GCN-DAG: s_mov_b32 m0,
329; GCN-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], 1.0
330; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
331; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
332; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
333; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40a00000
334; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40c00000
335; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40e00000
336; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41000000
337; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41100000
338; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41200000
339; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41300000
340; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41400000
341; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41500000
342; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41600000
343; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41700000
344; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41800000
345; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41880000
346; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
347; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41980000
348; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41a00000
349; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41a80000
350; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41b00000
351; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41b80000
352; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41c00000
353; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41c80000
354; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41d00000
355; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41d80000
356; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41e00000
357; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41e80000
358; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41f00000
359; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41f80000
360; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x42000000
361; GCN-DAG: v_movrels_b32_e32 [[RES:v[0-9]+]], [[VLO]]
362; GCN:     store_dword v[{{[0-9:]+}}], [[RES]]
363define amdgpu_kernel void @float32_extelt(float addrspace(1)* %out, i32 %sel) {
364entry:
365  %ext = extractelement <32 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0, float 17.0, float 18.0, float 19.0, float 20.0, float 21.0, float 22.0, float 23.0, float 24.0, float 25.0, float 26.0, float 27.0, float 28.0, float 29.0, float 30.0, float 31.0, float 32.0>, i32 %sel
366  store float %ext, float addrspace(1)* %out
367  ret void
368}
369
; Dynamic extract from a constant <8 x i8>. The eight bytes are expected to be
; packed into a 64-bit scalar pair (0x4030201 / 0x8070605); the index is scaled
; to a bit offset with a left shift by 3, and the element is brought down with
; a single s_lshr_b64 whose low dword is stored as a byte.
370; GCN-LABEL: {{^}}byte8_extelt:
371; GCN-NOT: buffer_
372; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x4030201
373; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x8070605
374; GCN-DAG: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 3
375; GCN:     s_lshr_b64 s{{\[}}[[RL:[0-9]+]]:{{[0-9]+}}], s{{\[}}[[SL]]:[[SH]]], [[SEL]]
376; GCN-DAG: v_mov_b32_e32 v[[VRL:[0-9]+]], s[[RL]]
377; GCN:     store_byte v[{{[0-9:]+}}], v[[VRL]]
378define amdgpu_kernel void @byte8_extelt(i8 addrspace(1)* %out, i32 %sel) {
379entry:
380  %ext = extractelement <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, i32 %sel
381  store i8 %ext, i8 addrspace(1)* %out
382  ret void
383}
384
; Dynamic extract from a constant <16 x i8>. The checks expect fifteen compares
; of the index (against 1 through 15), each producing a mask C<n> that drives
; the n-th link V<n> of a fifteen-deep v_cndmask chain. The select constants are
; left unconstrained; only the pairing of each V<n> with V<n-1> and C<n> is
; verified, and the last select result is stored as a byte.
385; GCN-LABEL: {{^}}byte16_extelt:
386; GCN-NOT: buffer_
387; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
388; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
389; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
390; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
391; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
392; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
393; GCN-DAG: s_cmp_lg_u32 [[IDX]], 4
394; GCN-DAG: s_cselect_b64 [[C4:[^,]+]], -1, 0
395; GCN-DAG: s_cmp_lg_u32 [[IDX]], 5
396; GCN-DAG: s_cselect_b64 [[C5:[^,]+]], -1, 0
397; GCN-DAG: s_cmp_lg_u32 [[IDX]], 6
398; GCN-DAG: s_cselect_b64 [[C6:[^,]+]], -1, 0
399; GCN-DAG: s_cmp_lg_u32 [[IDX]], 7
400; GCN-DAG: s_cselect_b64 [[C7:[^,]+]], -1, 0
401; GCN-DAG: s_cmp_lg_u32 [[IDX]], 8
402; GCN-DAG: s_cselect_b64 [[C8:[^,]+]], -1, 0
403; GCN-DAG: s_cmp_lg_u32 [[IDX]], 9
404; GCN-DAG: s_cselect_b64 [[C9:[^,]+]], -1, 0
405; GCN-DAG: s_cmp_lg_u32 [[IDX]], 10
406; GCN-DAG: s_cselect_b64 [[C10:[^,]+]], -1, 0
407; GCN-DAG: s_cmp_lg_u32 [[IDX]], 11
408; GCN-DAG: s_cselect_b64 [[C11:[^,]+]], -1, 0
409; GCN-DAG: s_cmp_lg_u32 [[IDX]], 12
410; GCN-DAG: s_cselect_b64 [[C12:[^,]+]], -1, 0
411; GCN-DAG: s_cmp_lg_u32 [[IDX]], 13
412; GCN-DAG: s_cselect_b64 [[C13:[^,]+]], -1, 0
413; GCN-DAG: s_cmp_lg_u32 [[IDX]], 14
414; GCN-DAG: s_cselect_b64 [[C14:[^,]+]], -1, 0
415; GCN-DAG: s_cmp_lg_u32 [[IDX]], 15
416; GCN-DAG: s_cselect_b64 [[C15:[^,]+]], -1, 0
417; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
418; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
419; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
420; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]]
421; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]]
422; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]]
423; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]]
424; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V8:v[0-9]+]], {{[^,]+}}, [[V7]], [[C8]]
425; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V9:v[0-9]+]], {{[^,]+}}, [[V8]], [[C9]]
426; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V10:v[0-9]+]], {{[^,]+}}, [[V9]], [[C10]]
427; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V11:v[0-9]+]], {{[^,]+}}, [[V10]], [[C11]]
428; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V12:v[0-9]+]], {{[^,]+}}, [[V11]], [[C12]]
429; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V13:v[0-9]+]], {{[^,]+}}, [[V12]], [[C13]]
430; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V14:v[0-9]+]], {{[^,]+}}, [[V13]], [[C14]]
431; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V15:v[0-9]+]], {{[^,]+}}, [[V14]], [[C15]]
432; GCN:     store_byte v[{{[0-9:]+}}], [[V15]]
433define amdgpu_kernel void @byte16_extelt(i8 addrspace(1)* %out, i32 %sel) {
434entry:
435  %ext = extractelement <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>, i32 %sel
436  store i8 %ext, i8 addrspace(1)* %out
437  ret void
438}
439
; Dynamic extract from a constant <4 x i1>, zero-extended to i32. Unlike the
; other tests in this file there is no buffer_ negative check: the expected code
; writes the four elements out as bytes (0, 1, 0, 1) with buffer_store_byte,
; reads one byte back with buffer_load_ubyte, masks it with 1 and stores the
; result. NOTE(review): the buffer accesses presumably target a stack slot;
; the checks do not pin the address operands.
440; GCN-LABEL: {{^}}bit4_extelt:
441; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
442; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
443; GCN-DAG: buffer_store_byte [[ZERO]],
444; GCN-DAG: buffer_store_byte [[ONE]],
445; GCN-DAG: buffer_store_byte [[ZERO]],
446; GCN-DAG: buffer_store_byte [[ONE]],
447; GCN:     buffer_load_ubyte [[LOAD:v[0-9]+]],
448; GCN:     v_and_b32_e32 [[RES:v[0-9]+]], 1, [[LOAD]]
449; GCN:     flat_store_dword v[{{[0-9:]+}}], [[RES]]
450define amdgpu_kernel void @bit4_extelt(i32 addrspace(1)* %out, i32 %sel) {
451entry:
452  %ext = extractelement <4 x i1> <i1 0, i1 1, i1 0, i1 1>, i32 %sel
453  %zext = zext i1 %ext to i32
454  store i32 %zext, i32 addrspace(1)* %out
455  ret void
456}
457
458; GCN-LABEL: {{^}}bit128_extelt:
459; GCN-NOT: buffer_
460; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1
461; GCN: s_cmpk_lg_i32 {{s[0-9]+}}, 0x7f
462; GCN: s_cselect_b64 [[CL:[^,]+]], -1, 0
463; GCN: v_cndmask_b32_e{{32|64}} [[VL:v[0-9]+]], 0, [[V1]], [[CL]]
464; GCN:     v_and_b32_e32 [[RES:v[0-9]+]], 1, [[VL]]
465; GCN:     store_dword v[{{[0-9:]+}}], [[RES]]
466define amdgpu_kernel void @bit128_extelt(i32 addrspace(1)* %out, i32 %sel) {
467entry:
468  %ext = extractelement <128 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, i32 %sel
469  %zext = zext i1 %ext to i32
470  store i32 %zext, i32 addrspace(1)* %out
471  ret void
472}
473
; Dynamic extract from a constant <32 x float> in a non-kernel function, where
; the checks expect the index in v0 and the result written back to v0. The
; expected lowering is a v_cmp/v_cndmask chain; only its first link (1.0 vs 2.0
; on "index == 1") and its last link (0x42000000, i.e. 32.0, on "index != 31")
; are checked.
474; GCN-LABEL: {{^}}float32_extelt_vec:
475; GCN-NOT: buffer_
476; GCN-DAG: v_cmp_eq_u32_e{{32|64}} [[CC1:[^,]+]], 1, v0
477; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 1.0, 2.0, [[CC1]]
478; GCN-DAG: v_mov_b32_e32 [[LASTVAL:v[0-9]+]], 0x42000000
479; GCN-DAG: v_cmp_ne_u32_e32 [[LASTCC:[^,]+]], 31, v0
480; GCN-DAG: v_cndmask_b32_e{{32|64}} v0, [[LASTVAL]], v{{[0-9]+}}, [[LASTCC]]
481define float @float32_extelt_vec(i32 %sel) {
482entry:
483  %ext = extractelement <32 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0, float 17.0, float 18.0, float 19.0, float 20.0, float 21.0, float 22.0, float 23.0, float 24.0, float 25.0, float 26.0, float 27.0, float 28.0, float 29.0, float 30.0, float 31.0, float 32.0>, i32 %sel
484  ret float %ext
485}
486
; Dynamic extract from a constant <16 x double> in a non-kernel function, with
; the index compared in v0. Only the first stage of the expected select chain is
; checked: the high and low dwords of 1.1 and 2.1 are moved into v registers
; and chosen by two v_cndmask sharing the "index == 1" condition.
487; GCN-LABEL: {{^}}double16_extelt_vec:
488; GCN-NOT: buffer_
489; GCN-DAG: v_mov_b32_e32 [[V1HI:v[0-9]+]], 0x3ff19999
490; GCN-DAG: v_mov_b32_e32 [[V1LO:v[0-9]+]], 0x9999999a
491; GCN-DAG: v_mov_b32_e32 [[V2HI:v[0-9]+]], 0x4000cccc
492; GCN-DAG: v_mov_b32_e32 [[V2LO:v[0-9]+]], 0xcccccccd
493; GCN-DAG: v_cmp_eq_u32_e{{32|64}} [[CC1:[^,]+]], 1, v0
494; GCN-DAG: v_cndmask_b32_e{{32|64}} [[R1HI:v[0-9]+]], [[V1HI]], [[V2HI]], [[CC1]]
495; GCN-DAG: v_cndmask_b32_e{{32|64}} [[R1LO:v[0-9]+]], [[V1LO]], [[V2LO]], [[CC1]]
496define double @double16_extelt_vec(i32 %sel) {
497entry:
498  %ext = extractelement <16 x double> <double 1.1, double 2.1, double 3.1, double 4.1, double 5.1, double 6.1, double 7.1, double 8.1, double 9.1, double 10.1, double 11.1, double 12.1, double 13.1, double 14.1, double 15.1, double 16.1>, i32 %sel
499  ret double %ext
500}
501