1; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SI,SICIVI,FUNC %s
2; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,VI,SICIVI,FUNC %s
3; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,FUNC %s
4; RUN: llc -march=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s
5
6; Testing for ds_read/write_b128
7; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s
8; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s
9
10; FUNC-LABEL: {{^}}local_load_i8:
11; GCN-NOT: s_wqm_b64
12; SICIVI: s_mov_b32 m0
13; GFX9-NOT: m0
14; GCN: ds_read_u8
15
16; EG: LDS_UBYTE_READ_RET
; Kernel under test: loads a single i8 from %in and stores it unchanged to %out (both addrspace(3) pointers).
17define amdgpu_kernel void @local_load_i8(i8 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
18entry:
19  %ld = load i8, i8 addrspace(3)* %in
20  store i8 %ld, i8 addrspace(3)* %out
21  ret void
22}
23
24; FUNC-LABEL: {{^}}local_load_v2i8:
25; GCN-NOT: s_wqm_b64
26; SICIVI: s_mov_b32 m0
27; GFX9-NOT: m0
28; GCN: ds_read_u16
29
30; EG: LDS_USHORT_READ_RET
; Kernel under test: loads a <2 x i8> vector from %in and stores it unchanged to %out (both addrspace(3) pointers).
31define amdgpu_kernel void @local_load_v2i8(<2 x i8> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
32entry:
33  %ld = load <2 x i8>, <2 x i8> addrspace(3)* %in
34  store <2 x i8> %ld, <2 x i8> addrspace(3)* %out
35  ret void
36}
37
38; FUNC-LABEL: {{^}}local_load_v3i8:
39; GFX9-NOT: m0
40; GCN: ds_read_b32
41
42; EG: LDS_READ_RET
; Kernel under test: loads a <3 x i8> vector from %in and stores it unchanged to %out (both addrspace(3) pointers).
; The r600 check above spells the mnemonic in full (LDS_READ_RET), consistent with the other tests in this file.
43define amdgpu_kernel void @local_load_v3i8(<3 x i8> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
44entry:
45  %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
46  store <3 x i8> %ld, <3 x i8> addrspace(3)* %out
47  ret void
48}
49
50; FUNC-LABEL: {{^}}local_load_v4i8:
51; GFX9-NOT: m0
52; GCN: ds_read_b32
53
54; EG: LDS_READ_RET
; Kernel under test: loads a <4 x i8> vector from %in and stores it unchanged to %out (both addrspace(3) pointers).
55define amdgpu_kernel void @local_load_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
56entry:
57  %ld = load <4 x i8>, <4 x i8> addrspace(3)* %in
58  store <4 x i8> %ld, <4 x i8> addrspace(3)* %out
59  ret void
60}
61
62; FUNC-LABEL: {{^}}local_load_v8i8:
63; GFX9-NOT: m0
64; GCN: ds_read_b64
65
66; EG: LDS_READ_RET
67; EG: LDS_READ_RET
; Kernel under test: loads a <8 x i8> vector from %in and stores it unchanged to %out (both addrspace(3) pointers).
68define amdgpu_kernel void @local_load_v8i8(<8 x i8> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
69entry:
70  %ld = load <8 x i8>, <8 x i8> addrspace(3)* %in
71  store <8 x i8> %ld, <8 x i8> addrspace(3)* %out
72  ret void
73}
74
75; FUNC-LABEL: {{^}}local_load_v16i8:
76; GFX9-NOT: m0
77; GCN: ds_read2_b64  v[[[LO:[0-9]+]]:[[HI:[0-9]+]]], v{{[0-9]+}} offset1:1{{$}}
78; GCN: ds_write2_b64 v{{[0-9]+}}, v[[[LO]]:{{[0-9]+}}], v[{{[0-9]+}}:[[HI]]] offset1:1{{$}}
79
80; EG: LDS_READ_RET
81; EG: LDS_READ_RET
82; EG: LDS_READ_RET
83; EG: LDS_READ_RET
; Kernel under test: loads a <16 x i8> vector from %in and stores it unchanged to %out (both addrspace(3) pointers).
; No explicit alignment is given on either access (contrast with @local_v16i8_to_128).
84define amdgpu_kernel void @local_load_v16i8(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
85entry:
86  %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in
87  store <16 x i8> %ld, <16 x i8> addrspace(3)* %out
88  ret void
89}
90
91; FUNC-LABEL: {{^}}local_zextload_i8_to_i32:
92; GFX9-NOT: m0
93; GCN-NOT: s_wqm_b64
94; SICIVI: s_mov_b32 m0
95; GCN: ds_read_u8
96
97; EG: LDS_UBYTE_READ_RET
; Kernel under test: loads one i8 from addrspace(3) %in, zero-extends it to i32, and stores the result to addrspace(3) %out.
98define amdgpu_kernel void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
99  %a = load i8, i8 addrspace(3)* %in
100  %ext = zext i8 %a to i32
101  store i32 %ext, i32 addrspace(3)* %out
102  ret void
103}
104
105; FUNC-LABEL: {{^}}local_sextload_i8_to_i32:
106; GCN-NOT: s_wqm_b64
107; GFX9-NOT: m0
108; SICIVI: s_mov_b32 m0
109; GCN: ds_read_i8
110
111; EG: LDS_UBYTE_READ_RET
112; EG: BFE_INT
; Kernel under test: loads one i8 from addrspace(3) %in, sign-extends it to i32, and stores the result to addrspace(3) %out.
113define amdgpu_kernel void @local_sextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
114  %ld = load i8, i8 addrspace(3)* %in
115  %ext = sext i8 %ld to i32
116  store i32 %ext, i32 addrspace(3)* %out
117  ret void
118}
119
120; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i32:
121
122; EG: LDS_UBYTE_READ_RET
; Kernel under test: loads a <1 x i8> from addrspace(3) %in, zero-extends it to <1 x i32>, and stores the result to addrspace(3) %out.
123define amdgpu_kernel void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
124  %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
125  %ext = zext <1 x i8> %load to <1 x i32>
126  store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
127  ret void
128}
129
130; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i32:
131; GFX9-NOT: m0
132
133; EG: LDS_UBYTE_READ_RET
134; EG: BFE_INT
; Kernel under test: loads a <1 x i8> from addrspace(3) %in, sign-extends it to <1 x i32>, and stores the result to addrspace(3) %out.
135define amdgpu_kernel void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
136  %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
137  %ext = sext <1 x i8> %load to <1 x i32>
138  store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
139  ret void
140}
141
142; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i32:
143; GFX9-NOT: m0
144; GCN: ds_read_u16
145
146; EG: LDS_USHORT_READ_RET
; Kernel under test: loads a <2 x i8> from addrspace(3) %in, zero-extends each lane to i32, and stores the <2 x i32> to addrspace(3) %out.
147define amdgpu_kernel void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
148  %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
149  %ext = zext <2 x i8> %load to <2 x i32>
150  store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
151  ret void
152}
153
154; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i32:
155; GCN-NOT: s_wqm_b64
156; GFX9-NOT: m0
157; SICIVI: s_mov_b32 m0
158; GCN: ds_read_u16
159; FIXME: Need to optimize this sequence to avoid extra shift on VI.
160;         t23: i16 = srl t39, Constant:i32<8>
161;          t31: i32 = any_extend t23
162;        t33: i32 = sign_extend_inreg t31, ValueType:ch:i8
163
164; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
165; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
166
167; VI-DAG: v_lshrrev_b16_e32 [[SHIFT:v[0-9]+]], 8, v{{[0-9]+}}
168; VI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
169; VI-DAG: v_bfe_i32 v{{[0-9]+}}, [[SHIFT]], 0, 8
170
171; EG: LDS_USHORT_READ_RET
172; EG-DAG: BFE_INT
173; EG-DAG: BFE_INT
; Kernel under test: loads a <2 x i8> from addrspace(3) %in, sign-extends each lane to i32, and stores the <2 x i32> to addrspace(3) %out.
174define amdgpu_kernel void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
175  %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
176  %ext = sext <2 x i8> %load to <2 x i32>
177  store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
178  ret void
179}
180
181; FUNC-LABEL: {{^}}local_zextload_v3i8_to_v3i32:
182; GFX9-NOT: m0
183; GCN: ds_read_b32
184
185; SI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
186; VI-DAG: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, {{v[0-9]+}}
187; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8
188; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff,
189
190; EG: LDS_READ_RET
; Kernel under test: loads a <3 x i8> from addrspace(3) %in, zero-extends each lane to i32, and stores the <3 x i32> to addrspace(3) %out.
191define amdgpu_kernel void @local_zextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
192entry:
193  %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
194  %ext = zext <3 x i8> %ld to <3 x i32>
195  store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
196  ret void
197}
198
199; FUNC-LABEL: {{^}}local_sextload_v3i8_to_v3i32:
200; GCN-NOT: s_wqm_b64
201; GFX9-NOT: m0
202; SICIVI: s_mov_b32 m0
203; GCN: ds_read_b32
204
205; GCN-DAG: v_bfe_i32
206; GCN-DAG: v_bfe_i32
207; GCN-DAG: v_bfe_i32
208; GCN-DAG: v_bfe_i32
209
210; SI-DAG: ds_write_b64
211; SI-DAG: ds_write_b32
212; CIVI-DAG: ds_write_b96
213; GFX9-DAG: ds_write_b96
214
215; EG: LDS_READ_RET
216; EG-DAG: BFE_INT
217; EG-DAG: BFE_INT
218; EG-DAG: BFE_INT
; Kernel under test: loads a <3 x i8> from addrspace(3) %in, sign-extends each lane to i32, and stores the <3 x i32> to addrspace(3) %out.
219define amdgpu_kernel void @local_sextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
220entry:
221  %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
222  %ext = sext <3 x i8> %ld to <3 x i32>
223  store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
224  ret void
225}
226
227; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i32:
228; GCN-NOT: s_wqm_b64
229; GFX9-NOT: m0
230; SICIVI: s_mov_b32 m0
231; GCN: ds_read_b32
232
233; EG: LDS_READ_RET
234; EG-DAG: BFE_UINT
235; EG-DAG: BFE_UINT
236; EG-DAG: BFE_UINT
; Kernel under test: loads a <4 x i8> from addrspace(3) %in, zero-extends each lane to i32, and stores the <4 x i32> to addrspace(3) %out.
237define amdgpu_kernel void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
238  %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
239  %ext = zext <4 x i8> %load to <4 x i32>
240  store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
241  ret void
242}
243
244; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i32:
245; GCN-NOT: s_wqm_b64
246; GFX9-NOT: m0
247; SICIVI: s_mov_b32 m0
248; GCN: ds_read_b32
249
250; EG-DAG: LDS_READ_RET
251; EG-DAG: BFE_INT
252; EG-DAG: BFE_INT
253; EG-DAG: BFE_INT
254; EG-DAG: BFE_INT
; Kernel under test: loads a <4 x i8> from addrspace(3) %in, sign-extends each lane to i32, and stores the <4 x i32> to addrspace(3) %out.
255define amdgpu_kernel void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
256  %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
257  %ext = sext <4 x i8> %load to <4 x i32>
258  store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
259  ret void
260}
261
262; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i32:
263; SICIVI: s_mov_b32 m0
264; GFX9-NOT: m0
265
266; EG-DAG: LDS_READ_RET
267; EG-DAG: LDS_READ_RET
268; EG-DAG: BFE_UINT
269; EG-DAG: BFE_UINT
270; EG-DAG: BFE_UINT
271; EG-DAG: BFE_UINT
272; EG-DAG: BFE_UINT
273; EG-DAG: BFE_UINT
; Kernel under test: loads a <8 x i8> from addrspace(3) %in, zero-extends each lane to i32, and stores the <8 x i32> to addrspace(3) %out.
274define amdgpu_kernel void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
275  %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
276  %ext = zext <8 x i8> %load to <8 x i32>
277  store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
278  ret void
279}
280
281; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i32:
282; SICIVI: s_mov_b32 m0
283; GFX9-NOT: m0
284
285; EG-DAG: LDS_READ_RET
286; EG-DAG: LDS_READ_RET
287; EG-DAG: BFE_INT
288; EG-DAG: BFE_INT
289; EG-DAG: BFE_INT
290; EG-DAG: BFE_INT
291; EG-DAG: BFE_INT
292; EG-DAG: BFE_INT
293; EG-DAG: BFE_INT
294; EG-DAG: BFE_INT
; Kernel under test: loads a <8 x i8> from addrspace(3) %in, sign-extends each lane to i32, and stores the <8 x i32> to addrspace(3) %out.
295define amdgpu_kernel void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
296  %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
297  %ext = sext <8 x i8> %load to <8 x i32>
298  store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
299  ret void
300}
301
302; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i32:
303; SICIVI: s_mov_b32 m0
304; GFX9-NOT: m0
305
306; EG-DAG: LDS_READ_RET
307; EG-DAG: LDS_READ_RET
308; EG-DAG: LDS_READ_RET
309; EG-DAG: LDS_READ_RET
310; EG-DAG: BFE_UINT
311; EG-DAG: BFE_UINT
312; EG-DAG: BFE_UINT
313; EG-DAG: BFE_UINT
314; EG-DAG: BFE_UINT
315; EG-DAG: BFE_UINT
316; EG-DAG: BFE_UINT
317; EG-DAG: BFE_UINT
318; EG-DAG: BFE_UINT
319; EG-DAG: BFE_UINT
320; EG-DAG: BFE_UINT
321; EG-DAG: BFE_UINT
; Kernel under test: loads a <16 x i8> from addrspace(3) %in, zero-extends each lane to i32, and stores the <16 x i32> to addrspace(3) %out.
322define amdgpu_kernel void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
323  %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
324  %ext = zext <16 x i8> %load to <16 x i32>
325  store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
326  ret void
327}
328
329; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i32:
330; SICIVI: s_mov_b32 m0
331; GFX9-NOT: m0
332
333; EG-DAG: LDS_READ_RET
334; EG-DAG: LDS_READ_RET
335; EG-DAG: LDS_READ_RET
336; EG-DAG: LDS_READ_RET
337; EG-DAG: BFE_INT
338; EG-DAG: BFE_INT
339; EG-DAG: BFE_INT
340; EG-DAG: BFE_INT
341; EG-DAG: BFE_INT
342; EG-DAG: BFE_INT
343; EG-DAG: BFE_INT
344; EG-DAG: BFE_INT
345; EG-DAG: BFE_INT
346; EG-DAG: BFE_INT
347; EG-DAG: BFE_INT
348; EG-DAG: BFE_INT
349; EG-DAG: BFE_INT
350; EG-DAG: BFE_INT
351; EG-DAG: BFE_INT
352; EG-DAG: BFE_INT
; Kernel under test: loads a <16 x i8> from addrspace(3) %in, sign-extends each lane to i32, and stores the <16 x i32> to addrspace(3) %out.
353define amdgpu_kernel void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
354  %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
355  %ext = sext <16 x i8> %load to <16 x i32>
356  store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
357  ret void
358}
359
360; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i32:
361; SICIVI: s_mov_b32 m0
362; GFX9-NOT: m0
363
364; EG-DAG: LDS_READ_RET
365; EG-DAG: LDS_READ_RET
366; EG-DAG: LDS_READ_RET
367; EG-DAG: LDS_READ_RET
368; EG-DAG: LDS_READ_RET
369; EG-DAG: LDS_READ_RET
370; EG-DAG: LDS_READ_RET
371; EG-DAG: LDS_READ_RET
; Kernel under test: loads a <32 x i8> from addrspace(3) %in, zero-extends each lane to i32, and stores the <32 x i32> to addrspace(3) %out.
372define amdgpu_kernel void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
373  %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
374  %ext = zext <32 x i8> %load to <32 x i32>
375  store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
376  ret void
377}
378
379; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i32:
380; SICIVI: s_mov_b32 m0
381; GFX9-NOT: m0
382
383; EG-DAG: LDS_READ_RET
384; EG-DAG: LDS_READ_RET
385; EG-DAG: LDS_READ_RET
386; EG-DAG: LDS_READ_RET
387; EG-DAG: LDS_READ_RET
388; EG-DAG: LDS_READ_RET
389; EG-DAG: LDS_READ_RET
390; EG-DAG: LDS_READ_RET
; Kernel under test: loads a <32 x i8> from addrspace(3) %in, sign-extends each lane to i32, and stores the <32 x i32> to addrspace(3) %out.
391define amdgpu_kernel void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
392  %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
393  %ext = sext <32 x i8> %load to <32 x i32>
394  store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
395  ret void
396}
397
398; FUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i32:
399; SICIVI: s_mov_b32 m0
400; GFX9-NOT: m0
401
402; EG-DAG: LDS_READ_RET
403; EG-DAG: LDS_READ_RET
404; EG-DAG: LDS_READ_RET
405; EG-DAG: LDS_READ_RET
406; EG-DAG: LDS_READ_RET
407; EG-DAG: LDS_READ_RET
408; EG-DAG: LDS_READ_RET
409; EG-DAG: LDS_READ_RET
410; EG-DAG: LDS_READ_RET
411; EG-DAG: LDS_READ_RET
412; EG-DAG: LDS_READ_RET
413; EG-DAG: LDS_READ_RET
414; EG-DAG: LDS_READ_RET
415; EG-DAG: LDS_READ_RET
416; EG-DAG: LDS_READ_RET
417; EG-DAG: LDS_READ_RET
; Kernel under test: loads a <64 x i8> from addrspace(3) %in, zero-extends each lane to i32, and stores the <64 x i32> to addrspace(3) %out.
418define amdgpu_kernel void @local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
419  %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
420  %ext = zext <64 x i8> %load to <64 x i32>
421  store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
422  ret void
423}
424
425; FUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i32:
426; SICIVI: s_mov_b32 m0
427; GFX9-NOT: m0
428
429; EG-DAG: LDS_READ_RET
430; EG-DAG: LDS_READ_RET
431; EG-DAG: LDS_READ_RET
432; EG-DAG: LDS_READ_RET
433; EG-DAG: LDS_READ_RET
434; EG-DAG: LDS_READ_RET
435; EG-DAG: LDS_READ_RET
436; EG-DAG: LDS_READ_RET
437; EG-DAG: LDS_READ_RET
438; EG-DAG: LDS_READ_RET
439; EG-DAG: LDS_READ_RET
440; EG-DAG: LDS_READ_RET
441; EG-DAG: LDS_READ_RET
442; EG-DAG: LDS_READ_RET
443; EG-DAG: LDS_READ_RET
444; EG-DAG: LDS_READ_RET
; Kernel under test: loads a <64 x i8> from addrspace(3) %in, sign-extends each lane to i32, and stores the <64 x i32> to addrspace(3) %out.
445define amdgpu_kernel void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
446  %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
447  %ext = sext <64 x i8> %load to <64 x i32>
448  store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
449  ret void
450}
451
452; FUNC-LABEL: {{^}}local_zextload_i8_to_i64:
453; SICIVI: s_mov_b32 m0
454; GFX9-NOT: m0
455
456; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
457; GCN-DAG: ds_read_u8 v[[LO:[0-9]+]],
458; GCN: ds_write_b64 v{{[0-9]+}}, v[[[LO]]:[[HI]]]
459
460; EG: LDS_UBYTE_READ_RET
461; EG: MOV {{.*}}, literal
462; EG: 0.0
; Kernel under test: loads one i8 from addrspace(3) %in, zero-extends it to i64, and stores the result to addrspace(3) %out.
463define amdgpu_kernel void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
464  %a = load i8, i8 addrspace(3)* %in
465  %ext = zext i8 %a to i64
466  store i64 %ext, i64 addrspace(3)* %out
467  ret void
468}
469
470; FUNC-LABEL: {{^}}local_sextload_i8_to_i64:
471; SICIVI: s_mov_b32 m0
472; GFX9-NOT: m0
473
474; GCN: ds_read_i8 v[[LO:[0-9]+]],
475; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
476
477; GCN: ds_write_b64 v{{[0-9]+}}, v[[[LO]]:[[HI]]]
478
479; EG: LDS_UBYTE_READ_RET
480; EG: ASHR
481; TODO: why not 7?
482; EG: 31
; Kernel under test: loads one i8 from addrspace(3) %in, sign-extends it to i64, and stores the result to addrspace(3) %out.
483define amdgpu_kernel void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
484  %a = load i8, i8 addrspace(3)* %in
485  %ext = sext i8 %a to i64
486  store i64 %ext, i64 addrspace(3)* %out
487  ret void
488}
489
490; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i64:
491; SICIVI: s_mov_b32 m0
492; GFX9-NOT: m0
493
494; EG: LDS_UBYTE_READ_RET
495; EG: MOV {{.*}}, literal
496; TODO: merge?
497; EG: 0.0
; Kernel under test: loads a <1 x i8> from addrspace(3) %in, zero-extends it to <1 x i64>, and stores the result to addrspace(3) %out.
498define amdgpu_kernel void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
499  %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
500  %ext = zext <1 x i8> %load to <1 x i64>
501  store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
502  ret void
503}
504
505; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i64:
506; SICIVI: s_mov_b32 m0
507; GFX9-NOT: m0
508
509; EG: LDS_UBYTE_READ_RET
510; EG: ASHR
511; TODO: why not 7?
512; EG: 31
; Kernel under test: loads a <1 x i8> from addrspace(3) %in, sign-extends it to <1 x i64>, and stores the result to addrspace(3) %out.
513define amdgpu_kernel void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
514  %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
515  %ext = sext <1 x i8> %load to <1 x i64>
516  store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
517  ret void
518}
519
520; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i64:
521; SICIVI: s_mov_b32 m0
522; GFX9-NOT: m0
523
524; EG: LDS_USHORT_READ_RET
; Kernel under test: loads a <2 x i8> from addrspace(3) %in, zero-extends each lane to i64, and stores the <2 x i64> to addrspace(3) %out.
525define amdgpu_kernel void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
526  %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
527  %ext = zext <2 x i8> %load to <2 x i64>
528  store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
529  ret void
530}
531
532; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i64:
533; SICIVI: s_mov_b32 m0
534; GFX9-NOT: m0
535
536; EG: LDS_USHORT_READ_RET
537; EG: BFE_INT
538; EG: BFE_INT
; Kernel under test: loads a <2 x i8> from addrspace(3) %in, sign-extends each lane to i64, and stores the <2 x i64> to addrspace(3) %out.
539define amdgpu_kernel void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
540  %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
541  %ext = sext <2 x i8> %load to <2 x i64>
542  store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
543  ret void
544}
545
546; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i64:
547; SICIVI: s_mov_b32 m0
548; GFX9-NOT: m0
549
550; EG: LDS_READ_RET
; Kernel under test: loads a <4 x i8> from addrspace(3) %in, zero-extends each lane to i64, and stores the <4 x i64> to addrspace(3) %out.
551define amdgpu_kernel void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
552  %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
553  %ext = zext <4 x i8> %load to <4 x i64>
554  store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
555  ret void
556}
557
558; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i64:
559; SICIVI: s_mov_b32 m0
560; GFX9-NOT: m0
561
562; EG: LDS_READ_RET
; Kernel under test: loads a <4 x i8> from addrspace(3) %in, sign-extends each lane to i64, and stores the <4 x i64> to addrspace(3) %out.
563define amdgpu_kernel void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
564  %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
565  %ext = sext <4 x i8> %load to <4 x i64>
566  store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
567  ret void
568}
569
570; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i64:
571; SICIVI: s_mov_b32 m0
572; GFX9-NOT: m0
573
574; EG: LDS_READ_RET
575; EG: LDS_READ_RET
; Kernel under test: loads a <8 x i8> from addrspace(3) %in, zero-extends each lane to i64, and stores the <8 x i64> to addrspace(3) %out.
576define amdgpu_kernel void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
577  %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
578  %ext = zext <8 x i8> %load to <8 x i64>
579  store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
580  ret void
581}
582
583; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i64:
584; SICIVI: s_mov_b32 m0
585; GFX9-NOT: m0
586
587; EG: LDS_READ_RET
588; EG: LDS_READ_RET
589; EG-DAG: ASHR
590; EG-DAG: ASHR
591; EG-DAG: BFE_INT
592; EG-DAG: BFE_INT
593; EG-DAG: BFE_INT
594; EG-DAG: BFE_INT
595; EG-DAG: BFE_INT
596; EG-DAG: BFE_INT
597; EG-DAG: BFE_INT
; Kernel under test: loads a <8 x i8> from addrspace(3) %in, sign-extends each lane to i64, and stores the <8 x i64> to addrspace(3) %out.
598define amdgpu_kernel void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
599  %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
600  %ext = sext <8 x i8> %load to <8 x i64>
601  store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
602  ret void
603}
604
605; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i64:
606; SICIVI: s_mov_b32 m0
607; GFX9-NOT: m0
608
609; EG: LDS_READ_RET
610; EG: LDS_READ_RET
611; EG: LDS_READ_RET
612; EG: LDS_READ_RET
; Kernel under test: loads a <16 x i8> from addrspace(3) %in, zero-extends each lane to i64, and stores the <16 x i64> to addrspace(3) %out.
613define amdgpu_kernel void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
614  %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
615  %ext = zext <16 x i8> %load to <16 x i64>
616  store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
617  ret void
618}
619
620; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i64:
621; SICIVI: s_mov_b32 m0
622; GFX9-NOT: m0
623
624; EG: LDS_READ_RET
625; EG: LDS_READ_RET
626; EG: LDS_READ_RET
627; EG: LDS_READ_RET
; Kernel under test: loads a <16 x i8> from addrspace(3) %in, sign-extends each lane to i64, and stores the <16 x i64> to addrspace(3) %out.
628define amdgpu_kernel void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
629  %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
630  %ext = sext <16 x i8> %load to <16 x i64>
631  store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
632  ret void
633}
634
635; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i64:
636; SICIVI: s_mov_b32 m0
637; GFX9-NOT: m0
638
639; EG: LDS_READ_RET
640; EG: LDS_READ_RET
641; EG: LDS_READ_RET
642; EG: LDS_READ_RET
643; EG: LDS_READ_RET
644; EG: LDS_READ_RET
645; EG: LDS_READ_RET
646; EG: LDS_READ_RET
; Kernel under test: loads a <32 x i8> from addrspace(3) %in, zero-extends each lane to i64, and stores the <32 x i64> to addrspace(3) %out.
647define amdgpu_kernel void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
648  %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
649  %ext = zext <32 x i8> %load to <32 x i64>
650  store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
651  ret void
652}
653
654; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i64:
655; SICIVI: s_mov_b32 m0
656; GFX9-NOT: m0
657
658; EG: LDS_READ_RET
659; EG: LDS_READ_RET
660; EG: LDS_READ_RET
661; EG: LDS_READ_RET
662; EG: LDS_READ_RET
663; EG: LDS_READ_RET
664; EG: LDS_READ_RET
665; EG: LDS_READ_RET
; Kernel under test: loads a <32 x i8> from addrspace(3) %in, sign-extends each lane to i64, and stores the <32 x i64> to addrspace(3) %out.
666define amdgpu_kernel void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
667  %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
668  %ext = sext <32 x i8> %load to <32 x i64>
669  store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
670  ret void
671}
672
673; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i64:
674; define amdgpu_kernel void @local_zextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
675;   %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
676;   %ext = zext <64 x i8> %load to <64 x i64>
677;   store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
678;   ret void
679; }
680
681; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i64:
682; define amdgpu_kernel void @local_sextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
683;   %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
684;   %ext = sext <64 x i8> %load to <64 x i64>
685;   store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
686;   ret void
687; }
688
689; FUNC-LABEL: {{^}}local_zextload_i8_to_i16:
690; SICIVI: s_mov_b32 m0
691; GFX9-NOT: m0
692; GCN: ds_read_u8 v[[VAL:[0-9]+]],
693; GCN: ds_write_b16 v{{[0-9]+}}, v[[VAL]]
694
695; EG: LDS_UBYTE_READ_RET
696; EG: LDS_SHORT_WRITE
; Kernel under test: loads one i8 from addrspace(3) %in, zero-extends it to i16, and stores the result to addrspace(3) %out.
; The ds_write_b16 check uses the VAL capture (rather than redefining it) so the stored data register
; must be the one produced by ds_read_u8, mirroring the sign-extending variant of this test.
697define amdgpu_kernel void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
698  %a = load i8, i8 addrspace(3)* %in
699  %ext = zext i8 %a to i16
700  store i16 %ext, i16 addrspace(3)* %out
701  ret void
702}
703
704; FUNC-LABEL: {{^}}local_sextload_i8_to_i16:
705; SICIVI: s_mov_b32 m0
706; GFX9-NOT: m0
707; GCN: ds_read_i8 v[[VAL:[0-9]+]],
708; GCN: ds_write_b16 v{{[0-9]+}}, v[[VAL]]
709
710; EG: LDS_UBYTE_READ_RET
711; EG: BFE_INT
712; EG: LDS_SHORT_WRITE
; Kernel under test: loads one i8 from addrspace(3) %in, sign-extends it to i16, and stores the result to addrspace(3) %out.
713define amdgpu_kernel void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
714  %a = load i8, i8 addrspace(3)* %in
715  %ext = sext i8 %a to i16
716  store i16 %ext, i16 addrspace(3)* %out
717  ret void
718}
719
720; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i16:
721; SICIVI: s_mov_b32 m0
722; GFX9-NOT: m0
723
724; EG: LDS_UBYTE_READ_RET
725; EG: LDS_SHORT_WRITE
; Kernel under test: loads a <1 x i8> from addrspace(3) %in, zero-extends it to <1 x i16>, and stores the result to addrspace(3) %out.
726define amdgpu_kernel void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
727  %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
728  %ext = zext <1 x i8> %load to <1 x i16>
729  store <1 x i16> %ext, <1 x i16> addrspace(3)* %out
730  ret void
731}
732
733; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i16:
734; SICIVI: s_mov_b32 m0
735; GFX9-NOT: m0
736
737; EG: LDS_UBYTE_READ_RET
738; EG: BFE_INT
739; EG: LDS_SHORT_WRITE
; Kernel under test: loads a <1 x i8> from addrspace(3) %in, sign-extends it to <1 x i16>, and stores the result to addrspace(3) %out.
740define amdgpu_kernel void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
741  %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
742  %ext = sext <1 x i8> %load to <1 x i16>
743  store <1 x i16> %ext, <1 x i16> addrspace(3)* %out
744  ret void
745}
746
747; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i16:
748; SICIVI: s_mov_b32 m0
749; GFX9-NOT: m0
750
751; EG: LDS_USHORT_READ_RET
752; EG: LDS_WRITE
; Kernel under test: loads a <2 x i8> from addrspace(3) %in, zero-extends each lane to i16, and stores the <2 x i16> to addrspace(3) %out.
753define amdgpu_kernel void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
754  %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
755  %ext = zext <2 x i8> %load to <2 x i16>
756  store <2 x i16> %ext, <2 x i16> addrspace(3)* %out
757  ret void
758}
759
760; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i16:
761; SICIVI: s_mov_b32 m0
762; GFX9-NOT: m0
763
764; EG: LDS_USHORT_READ_RET
765; EG: BFE_INT
766; EG: BFE_INT
767; EG: LDS_WRITE
; Kernel under test: loads a <2 x i8> from addrspace(3) %in, sign-extends each lane to i16, and stores the <2 x i16> to addrspace(3) %out.
768define amdgpu_kernel void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
769  %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
770  %ext = sext <2 x i8> %load to <2 x i16>
771  store <2 x i16> %ext, <2 x i16> addrspace(3)* %out
772  ret void
773}
774
775; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i16:
776; SICIVI: s_mov_b32 m0
777; GFX9-NOT: m0
778
779; EG: LDS_READ_RET
780; EG: LDS_WRITE
781; EG: LDS_WRITE
; Kernel under test: loads a <4 x i8> from addrspace(3) %in, zero-extends each lane to i16, and stores the <4 x i16> to addrspace(3) %out.
782define amdgpu_kernel void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
783  %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
784  %ext = zext <4 x i8> %load to <4 x i16>
785  store <4 x i16> %ext, <4 x i16> addrspace(3)* %out
786  ret void
787}
788
789; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i16:
790; SICIVI: s_mov_b32 m0
791; GFX9-NOT: m0
792
793; EG: LDS_READ_RET
794; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
795; EG-DAG: BFE_INT
796; EG-DAG: BFE_INT
797; EG-DAG: BFE_INT
798; EG-DAG: BFE_INT
799; EG: LDS_WRITE
800; EG: LDS_WRITE
; Kernel under test: loads a <4 x i8> from addrspace(3) %in, sign-extends each lane to i16, and stores the <4 x i16> to addrspace(3) %out.
801define amdgpu_kernel void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
802  %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
803  %ext = sext <4 x i8> %load to <4 x i16>
804  store <4 x i16> %ext, <4 x i16> addrspace(3)* %out
805  ret void
806}
807
808; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i16:
809; SICIVI: s_mov_b32 m0
810; GFX9-NOT: m0
811
812; EG: LDS_READ_RET
813; EG: LDS_READ_RET
814; EG: LDS_WRITE
815; EG: LDS_WRITE
816; EG: LDS_WRITE
817; EG: LDS_WRITE
; Kernel under test: loads a <8 x i8> from addrspace(3) %in, zero-extends each lane to i16, and stores the <8 x i16> to addrspace(3) %out.
818define amdgpu_kernel void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
819  %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
820  %ext = zext <8 x i8> %load to <8 x i16>
821  store <8 x i16> %ext, <8 x i16> addrspace(3)* %out
822  ret void
823}
824
825; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i16:
826; SICIVI: s_mov_b32 m0
827; GFX9-NOT: m0
828
829; EG: LDS_READ_RET
830; EG: LDS_READ_RET
831; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
832; EG-DAG: BFE_INT
833; EG-DAG: BFE_INT
834; EG-DAG: BFE_INT
835; EG-DAG: BFE_INT
836; EG-DAG: BFE_INT
837; EG-DAG: BFE_INT
838; EG-DAG: BFE_INT
839; EG-DAG: BFE_INT
840; EG: LDS_WRITE
841; EG: LDS_WRITE
842; EG: LDS_WRITE
843; EG: LDS_WRITE
; Kernel under test: loads a <8 x i8> from addrspace(3) %in, sign-extends each lane to i16, and stores the <8 x i16> to addrspace(3) %out.
844define amdgpu_kernel void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
845  %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
846  %ext = sext <8 x i8> %load to <8 x i16>
847  store <8 x i16> %ext, <8 x i16> addrspace(3)* %out
848  ret void
849}
850
851; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i16:
852; SICIVI: s_mov_b32 m0
853; GFX9-NOT: m0
854
855; EG: LDS_READ_RET
856; EG: LDS_READ_RET
857; EG: LDS_READ_RET
858; EG: LDS_READ_RET
859; EG: LDS_WRITE
860; EG: LDS_WRITE
861; EG: LDS_WRITE
862; EG: LDS_WRITE
863; EG: LDS_WRITE
864; EG: LDS_WRITE
865; EG: LDS_WRITE
866; EG: LDS_WRITE
; Kernel under test: loads a <16 x i8> from addrspace(3) %in, zero-extends each lane to i16, and stores the <16 x i16> to addrspace(3) %out.
867define amdgpu_kernel void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
868  %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
869  %ext = zext <16 x i8> %load to <16 x i16>
870  store <16 x i16> %ext, <16 x i16> addrspace(3)* %out
871  ret void
872}
873
874; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i16:
875; SICIVI: s_mov_b32 m0
876; GFX9-NOT: m0
877
878; EG: LDS_READ_RET
879; EG: LDS_READ_RET
880; EG: LDS_READ_RET
881; EG: LDS_READ_RET
882; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
883; EG-DAG: BFE_INT
884; EG-DAG: BFE_INT
885; EG-DAG: BFE_INT
886; EG-DAG: BFE_INT
887; EG-DAG: BFE_INT
888; EG-DAG: BFE_INT
889; EG-DAG: BFE_INT
890; EG-DAG: BFE_INT
891; EG-DAG: BFE_INT
892; EG-DAG: BFE_INT
893; EG-DAG: BFE_INT
894; EG-DAG: BFE_INT
895; EG-DAG: BFE_INT
896; EG-DAG: BFE_INT
897; EG-DAG: BFE_INT
898; EG-DAG: BFE_INT
899; EG: LDS_WRITE
900; EG: LDS_WRITE
901; EG: LDS_WRITE
902; EG: LDS_WRITE
903; EG: LDS_WRITE
904; EG: LDS_WRITE
905; EG: LDS_WRITE
906; EG: LDS_WRITE
; Kernel under test: loads a <16 x i8> from addrspace(3) %in, sign-extends each lane to i16, and stores the <16 x i16> to addrspace(3) %out.
907define amdgpu_kernel void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
908  %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
909  %ext = sext <16 x i8> %load to <16 x i16>
910  store <16 x i16> %ext, <16 x i16> addrspace(3)* %out
911  ret void
912}
913
914; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i16:
915; SICIVI: s_mov_b32 m0
916; GFX9-NOT: m0
917
918; EG: LDS_READ_RET
919; EG: LDS_READ_RET
920; EG: LDS_READ_RET
921; EG: LDS_READ_RET
922; EG: LDS_READ_RET
923; EG: LDS_READ_RET
924; EG: LDS_READ_RET
925; EG: LDS_READ_RET
926; EG: LDS_WRITE
927; EG: LDS_WRITE
928; EG: LDS_WRITE
929; EG: LDS_WRITE
930; EG: LDS_WRITE
931; EG: LDS_WRITE
932; EG: LDS_WRITE
933; EG: LDS_WRITE
934; EG: LDS_WRITE
935; EG: LDS_WRITE
936; EG: LDS_WRITE
937; EG: LDS_WRITE
938; EG: LDS_WRITE
939; EG: LDS_WRITE
940; EG: LDS_WRITE
941; EG: LDS_WRITE
; Kernel under test: loads a <32 x i8> from addrspace(3) %in, zero-extends each lane to i16, and stores the <32 x i16> to addrspace(3) %out.
942define amdgpu_kernel void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
943  %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
944  %ext = zext <32 x i8> %load to <32 x i16>
945  store <32 x i16> %ext, <32 x i16> addrspace(3)* %out
946  ret void
947}
948
949; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i16:
950; SICIVI: s_mov_b32 m0
951; GFX9-NOT: m0
952
953; EG: LDS_READ_RET
954; EG: LDS_READ_RET
955; EG: LDS_READ_RET
956; EG: LDS_READ_RET
957; EG: LDS_READ_RET
958; EG: LDS_READ_RET
959; EG: LDS_READ_RET
960; EG: LDS_READ_RET
961; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
962; EG-DAG: BFE_INT
963; EG-DAG: BFE_INT
964; EG-DAG: BFE_INT
965; EG-DAG: BFE_INT
966; EG-DAG: BFE_INT
967; EG-DAG: BFE_INT
968; EG-DAG: BFE_INT
969; EG-DAG: BFE_INT
970; EG-DAG: BFE_INT
971; EG-DAG: BFE_INT
972; EG-DAG: BFE_INT
973; EG-DAG: BFE_INT
974; EG-DAG: BFE_INT
975; EG-DAG: BFE_INT
976; EG-DAG: BFE_INT
977; EG-DAG: BFE_INT
978; EG-DAG: BFE_INT
979; EG-DAG: BFE_INT
980; EG-DAG: BFE_INT
981; EG-DAG: BFE_INT
982; EG-DAG: BFE_INT
983; EG-DAG: BFE_INT
984; EG-DAG: BFE_INT
985; EG-DAG: BFE_INT
986; EG-DAG: BFE_INT
987; EG-DAG: BFE_INT
988; EG-DAG: BFE_INT
989; EG-DAG: BFE_INT
990; EG: LDS_WRITE
991; EG: LDS_WRITE
992; EG: LDS_WRITE
993; EG: LDS_WRITE
994; EG: LDS_WRITE
995; EG: LDS_WRITE
996; EG: LDS_WRITE
997; EG: LDS_WRITE
998; EG: LDS_WRITE
999; EG: LDS_WRITE
1000; EG: LDS_WRITE
1001; EG: LDS_WRITE
1002; EG: LDS_WRITE
1003; EG: LDS_WRITE
1004; EG: LDS_WRITE
1005; EG: LDS_WRITE
; Kernel under test: loads a <32 x i8> from addrspace(3) %in, sign-extends each lane to i16, and stores the <32 x i16> to addrspace(3) %out.
1006define amdgpu_kernel void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
1007  %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
1008  %ext = sext <32 x i8> %load to <32 x i16>
1009  store <32 x i16> %ext, <32 x i16> addrspace(3)* %out
1010  ret void
1011}
1012
1013; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i16:
1014; define amdgpu_kernel void @local_zextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
1015;   %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
1016;   %ext = zext <64 x i8> %load to <64 x i16>
1017;   store <64 x i16> %ext, <64 x i16> addrspace(3)* %out
1018;   ret void
1019; }
1020
1021; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i16:
1022; define amdgpu_kernel void @local_sextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
1023;   %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
1024;   %ext = sext <64 x i8> %load to <64 x i16>
1025;   store <64 x i16> %ext, <64 x i16> addrspace(3)* %out
1026;   ret void
1027; }
1028
1029; Tests that ds_read_b128/ds_write_b128 are generated for a 16-byte-aligned load and store.
1030; FUNC-LABEL: {{^}}local_v16i8_to_128:
1031
1032; SI-NOT: ds_read_b128
1033; SI-NOT: ds_write_b128
1034
1035; CIVI: ds_read_b128
1036; CIVI: ds_write_b128
1037
1038; EG: LDS_READ_RET
1039; EG: LDS_READ_RET
1040; EG: LDS_READ_RET
1041; EG: LDS_READ_RET
; Kernel under test: loads a <16 x i8> from %in and stores it unchanged to %out (both addrspace(3) pointers),
; with both accesses explicitly marked align 16. Unlike the other kernels in this file it carries no #0 attribute group.
1042define amdgpu_kernel void @local_v16i8_to_128(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) {
1043  %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in, align 16
1044  store <16 x i8> %ld, <16 x i8> addrspace(3)* %out, align 16
1045  ret void
1046}
1048attributes #0 = { nounwind }
1049