1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,GFX900-MUBUF %s
3; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck --check-prefix=GFX906 %s
4; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck --check-prefix=GFX803 %s
5; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs --mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX900,GFX900-FLATSCR %s
6
; Loads an i16 through an addrspace(3) pointer and inserts it into element 0
; of an otherwise-undef <2 x i16>, which is returned.
; Recorded expectations: the gfx900 runs use ds_read_u16_d16, while the
; gfx906 run (which enables +sram-ecc) and the fiji run use plain ds_read_u16.
7define <2 x i16> @load_local_lo_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
8; GFX900-LABEL: load_local_lo_v2i16_undeflo:
9; GFX900:       ; %bb.0: ; %entry
10; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11; GFX900-NEXT:    ds_read_u16_d16 v0, v0
12; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
13; GFX900-NEXT:    s_setpc_b64 s[30:31]
14;
15; GFX906-LABEL: load_local_lo_v2i16_undeflo:
16; GFX906:       ; %bb.0: ; %entry
17; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18; GFX906-NEXT:    ds_read_u16 v0, v0
19; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
20; GFX906-NEXT:    s_setpc_b64 s[30:31]
21;
22; GFX803-LABEL: load_local_lo_v2i16_undeflo:
23; GFX803:       ; %bb.0: ; %entry
24; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25; GFX803-NEXT:    s_mov_b32 m0, -1
26; GFX803-NEXT:    ds_read_u16 v0, v0
27; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
28; GFX803-NEXT:    s_setpc_b64 s[30:31]
29entry:
30  %load = load i16, i16 addrspace(3)* %in
31  %build = insertelement <2 x i16> undef, i16 %load, i32 0
32  ret <2 x i16> %build
33}
34
; Builds a <2 x i16> from the i16 argument %reg (element 1) and an i16 loaded
; through an addrspace(3) pointer (element 0), then returns the vector.
; Recorded expectations: no d16 load for any run; each target reads with
; ds_read_u16 and packs the halves with ALU ops (and + lshl_or on the
; gfx900/gfx906 runs, lshlrev + or on the fiji run).
35define <2 x i16> @load_local_lo_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
36; GFX900-LABEL: load_local_lo_v2i16_reglo:
37; GFX900:       ; %bb.0: ; %entry
38; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
39; GFX900-NEXT:    ds_read_u16 v0, v0
40; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
41; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff, v0
42; GFX900-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
43; GFX900-NEXT:    s_setpc_b64 s[30:31]
44;
45; GFX906-LABEL: load_local_lo_v2i16_reglo:
46; GFX906:       ; %bb.0: ; %entry
47; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
48; GFX906-NEXT:    ds_read_u16 v0, v0
49; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
50; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
51; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
52; GFX906-NEXT:    s_setpc_b64 s[30:31]
53;
54; GFX803-LABEL: load_local_lo_v2i16_reglo:
55; GFX803:       ; %bb.0: ; %entry
56; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
57; GFX803-NEXT:    s_mov_b32 m0, -1
58; GFX803-NEXT:    ds_read_u16 v0, v0
59; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
60; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
61; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
62; GFX803-NEXT:    s_setpc_b64 s[30:31]
63entry:
64  %load = load i16, i16 addrspace(3)* %in
65  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
66  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
67  ret <2 x i16> %build1
68}
69
70; Show that we get reasonable regalloc without physreg constraints.
; Same vector construction as @load_local_lo_v2i16_reglo (i16 %reg in
; element 1, addrspace(3) i16 load in element 0), but the result is stored
; through an undef addrspace(1) pointer instead of being returned.
; Recorded expectations: plain ds_read_u16 plus ALU packing on every run,
; followed by a dword store (global_store_dword on the gfx900/gfx906 runs,
; flat_store_dword on the fiji run).
71define void @load_local_lo_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
72; GFX900-LABEL: load_local_lo_v2i16_reglo_vreg:
73; GFX900:       ; %bb.0: ; %entry
74; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
75; GFX900-NEXT:    ds_read_u16 v0, v0
76; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
77; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff, v0
78; GFX900-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
79; GFX900-NEXT:    global_store_dword v[0:1], v0, off
80; GFX900-NEXT:    s_waitcnt vmcnt(0)
81; GFX900-NEXT:    s_setpc_b64 s[30:31]
82;
83; GFX906-LABEL: load_local_lo_v2i16_reglo_vreg:
84; GFX906:       ; %bb.0: ; %entry
85; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
86; GFX906-NEXT:    ds_read_u16 v0, v0
87; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
88; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
89; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
90; GFX906-NEXT:    global_store_dword v[0:1], v0, off
91; GFX906-NEXT:    s_waitcnt vmcnt(0)
92; GFX906-NEXT:    s_setpc_b64 s[30:31]
93;
94; GFX803-LABEL: load_local_lo_v2i16_reglo_vreg:
95; GFX803:       ; %bb.0: ; %entry
96; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
97; GFX803-NEXT:    s_mov_b32 m0, -1
98; GFX803-NEXT:    ds_read_u16 v0, v0
99; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
100; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
101; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
102; GFX803-NEXT:    flat_store_dword v[0:1], v0
103; GFX803-NEXT:    s_waitcnt vmcnt(0)
104; GFX803-NEXT:    s_setpc_b64 s[30:31]
105entry:
106  %load = load i16, i16 addrspace(3)* %in
107  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
108  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
109  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
110  ret void
111}
112
; Loads an i16 through an addrspace(3) pointer and inserts it into element 0
; of a zeroinitializer <2 x i16>, so element 1 of the returned vector is 0.
; Recorded expectations: the gfx900 runs zero v1, load into it with
; ds_read_u16_d16 and copy v1 to v0; the gfx906 run uses ds_read_u16 and masks
; with 0xffff; the fiji run uses ds_read_u16 with no further masking.
113define <2 x i16> @load_local_lo_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
114; GFX900-LABEL: load_local_lo_v2i16_zerolo:
115; GFX900:       ; %bb.0: ; %entry
116; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
117; GFX900-NEXT:    v_mov_b32_e32 v1, 0
118; GFX900-NEXT:    ds_read_u16_d16 v1, v0
119; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
120; GFX900-NEXT:    v_mov_b32_e32 v0, v1
121; GFX900-NEXT:    s_setpc_b64 s[30:31]
122;
123; GFX906-LABEL: load_local_lo_v2i16_zerolo:
124; GFX906:       ; %bb.0: ; %entry
125; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
126; GFX906-NEXT:    ds_read_u16 v0, v0
127; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
128; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
129; GFX906-NEXT:    s_setpc_b64 s[30:31]
130;
131; GFX803-LABEL: load_local_lo_v2i16_zerolo:
132; GFX803:       ; %bb.0: ; %entry
133; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
134; GFX803-NEXT:    s_mov_b32 m0, -1
135; GFX803-NEXT:    ds_read_u16 v0, v0
136; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
137; GFX803-NEXT:    s_setpc_b64 s[30:31]
138entry:
139  %load = load i16, i16 addrspace(3)* %in
140  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
141  ret <2 x i16> %build
142}
143
; Loads a half through an addrspace(3) pointer and inserts it into element 0
; of the constant vector <half 0.0, half 2.0>, so element 1 of the returned
; vector is the immediate 2.0.
; Recorded expectations: the gfx900 runs move 2.0 into v1 and then load into
; it with ds_read_u16_d16; the gfx906 run uses ds_read_u16 and combines the
; masked result with 0x4000 shifted left by 16; the fiji run ORs the loaded
; value with the operand printed as 2.0.
144define <2 x half> @load_local_lo_v2f16_fpimm(half addrspace(3)* %in) #0 {
145; GFX900-LABEL: load_local_lo_v2f16_fpimm:
146; GFX900:       ; %bb.0: ; %entry
147; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
148; GFX900-NEXT:    v_mov_b32_e32 v1, 2.0
149; GFX900-NEXT:    ds_read_u16_d16 v1, v0
150; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
151; GFX900-NEXT:    v_mov_b32_e32 v0, v1
152; GFX900-NEXT:    s_setpc_b64 s[30:31]
153;
154; GFX906-LABEL: load_local_lo_v2f16_fpimm:
155; GFX906:       ; %bb.0: ; %entry
156; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
157; GFX906-NEXT:    ds_read_u16 v0, v0
158; GFX906-NEXT:    s_movk_i32 s4, 0x4000
159; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
160; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
161; GFX906-NEXT:    v_lshl_or_b32 v0, s4, 16, v0
162; GFX906-NEXT:    s_setpc_b64 s[30:31]
163;
164; GFX803-LABEL: load_local_lo_v2f16_fpimm:
165; GFX803:       ; %bb.0: ; %entry
166; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
167; GFX803-NEXT:    s_mov_b32 m0, -1
168; GFX803-NEXT:    ds_read_u16 v0, v0
169; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
170; GFX803-NEXT:    v_or_b32_e32 v0, 2.0, v0
171; GFX803-NEXT:    s_setpc_b64 s[30:31]
172entry:
173  %load = load half, half addrspace(3)* %in
174  %build = insertelement <2 x half> <half 0.0, half 2.0>, half %load, i32 0
175  ret <2 x half> %build
176}
177
; Reinterprets the i32 argument %reg as <2 x half>, replaces element 0 with a
; half loaded through an addrspace(3) pointer (element 1 of %reg is kept), and
; stores the result through an undef addrspace(1) pointer.
; Recorded expectations: the gfx900 runs load straight into v1 with
; ds_read_u16_d16 and store v1; the gfx906 and fiji runs use ds_read_u16 and
; merge the halves with ALU ops before the store.
178define void @load_local_lo_v2f16_reghi_vreg(half addrspace(3)* %in, i32 %reg) #0 {
179; GFX900-LABEL: load_local_lo_v2f16_reghi_vreg:
180; GFX900:       ; %bb.0: ; %entry
181; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
182; GFX900-NEXT:    ds_read_u16_d16 v1, v0
183; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
184; GFX900-NEXT:    global_store_dword v[0:1], v1, off
185; GFX900-NEXT:    s_waitcnt vmcnt(0)
186; GFX900-NEXT:    s_setpc_b64 s[30:31]
187;
188; GFX906-LABEL: load_local_lo_v2f16_reghi_vreg:
189; GFX906:       ; %bb.0: ; %entry
190; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
191; GFX906-NEXT:    ds_read_u16 v0, v0
192; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
193; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
194; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
195; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
196; GFX906-NEXT:    global_store_dword v[0:1], v0, off
197; GFX906-NEXT:    s_waitcnt vmcnt(0)
198; GFX906-NEXT:    s_setpc_b64 s[30:31]
199;
200; GFX803-LABEL: load_local_lo_v2f16_reghi_vreg:
201; GFX803:       ; %bb.0: ; %entry
202; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
203; GFX803-NEXT:    s_mov_b32 m0, -1
204; GFX803-NEXT:    ds_read_u16 v0, v0
205; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
206; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
207; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
208; GFX803-NEXT:    flat_store_dword v[0:1], v0
209; GFX803-NEXT:    s_waitcnt vmcnt(0)
210; GFX803-NEXT:    s_setpc_b64 s[30:31]
211entry:
212  %reg.bc = bitcast i32 %reg to <2 x half>
213  %load = load half, half addrspace(3)* %in
214  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
215  store <2 x half> %build1, <2 x half> addrspace(1)* undef
216  ret void
217}
218
; Builds a <2 x half> from the half argument %reg (element 1) and a half
; loaded through an addrspace(3) pointer (element 0), then stores it through
; an undef addrspace(1) pointer.
; Recorded expectations: no d16 load on any run; every target reads with
; ds_read_u16 and packs the two halves with ALU ops before the store.
219define void @load_local_lo_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
220; GFX900-LABEL: load_local_lo_v2f16_reglo_vreg:
221; GFX900:       ; %bb.0: ; %entry
222; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
223; GFX900-NEXT:    ds_read_u16 v0, v0
224; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
225; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff, v0
226; GFX900-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
227; GFX900-NEXT:    global_store_dword v[0:1], v0, off
228; GFX900-NEXT:    s_waitcnt vmcnt(0)
229; GFX900-NEXT:    s_setpc_b64 s[30:31]
230;
231; GFX906-LABEL: load_local_lo_v2f16_reglo_vreg:
232; GFX906:       ; %bb.0: ; %entry
233; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
234; GFX906-NEXT:    ds_read_u16 v0, v0
235; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
236; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
237; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
238; GFX906-NEXT:    global_store_dword v[0:1], v0, off
239; GFX906-NEXT:    s_waitcnt vmcnt(0)
240; GFX906-NEXT:    s_setpc_b64 s[30:31]
241;
242; GFX803-LABEL: load_local_lo_v2f16_reglo_vreg:
243; GFX803:       ; %bb.0: ; %entry
244; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
245; GFX803-NEXT:    s_mov_b32 m0, -1
246; GFX803-NEXT:    ds_read_u16 v0, v0
247; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
248; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
249; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
250; GFX803-NEXT:    flat_store_dword v[0:1], v0
251; GFX803-NEXT:    s_waitcnt vmcnt(0)
252; GFX803-NEXT:    s_setpc_b64 s[30:31]
253entry:
254  %load = load half, half addrspace(3)* %in
255  %build0 = insertelement <2 x half> undef, half %reg, i32 1
256  %build1 = insertelement <2 x half> %build0, half %load, i32 0
257  store <2 x half> %build1, <2 x half> addrspace(1)* undef
258  ret void
259}
260
; Reinterprets the i32 argument %reg as <2 x i16>, loads an i8 through an
; addrspace(3) pointer, zero-extends it to i16 and writes it into element 0
; (element 1 of %reg is kept). The result is stored through an undef
; addrspace(1) pointer.
; Recorded expectations: the gfx900 runs use ds_read_u8_d16 into v1 and store
; v1 directly; the gfx906 run uses ds_read_u8 plus v_bfi_b32 with a 0xffff
; mask; the fiji run uses ds_read_u8 plus and/or.
261define void @load_local_lo_v2i16_reghi_vreg_zexti8(i8 addrspace(3)* %in, i32 %reg) #0 {
262; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_zexti8:
263; GFX900:       ; %bb.0: ; %entry
264; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
265; GFX900-NEXT:    ds_read_u8_d16 v1, v0
266; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
267; GFX900-NEXT:    global_store_dword v[0:1], v1, off
268; GFX900-NEXT:    s_waitcnt vmcnt(0)
269; GFX900-NEXT:    s_setpc_b64 s[30:31]
270;
271; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_zexti8:
272; GFX906:       ; %bb.0: ; %entry
273; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
274; GFX906-NEXT:    ds_read_u8 v0, v0
275; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
276; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
277; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
278; GFX906-NEXT:    global_store_dword v[0:1], v0, off
279; GFX906-NEXT:    s_waitcnt vmcnt(0)
280; GFX906-NEXT:    s_setpc_b64 s[30:31]
281;
282; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_zexti8:
283; GFX803:       ; %bb.0: ; %entry
284; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
285; GFX803-NEXT:    s_mov_b32 m0, -1
286; GFX803-NEXT:    ds_read_u8 v0, v0
287; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
288; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
289; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
290; GFX803-NEXT:    flat_store_dword v[0:1], v0
291; GFX803-NEXT:    s_waitcnt vmcnt(0)
292; GFX803-NEXT:    s_setpc_b64 s[30:31]
293entry:
294  %reg.bc = bitcast i32 %reg to <2 x i16>
295  %load = load i8, i8 addrspace(3)* %in
296  %ext = zext i8 %load to i16
297  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
298  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
299  ret void
300}
301
; Builds a <2 x i16> from the i16 argument %reg (element 1) and a
; zero-extended i8 loaded through an addrspace(3) pointer (element 0), then
; stores it through an undef addrspace(1) pointer.
; Recorded expectations: no d16 load on any run; every target reads with
; ds_read_u8 and packs the halves with ALU ops before the store.
302define void @load_local_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
303; GFX900-LABEL: load_local_lo_v2i16_reglo_vreg_zexti8:
304; GFX900:       ; %bb.0: ; %entry
305; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
306; GFX900-NEXT:    ds_read_u8 v0, v0
307; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
308; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff, v0
309; GFX900-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
310; GFX900-NEXT:    global_store_dword v[0:1], v0, off
311; GFX900-NEXT:    s_waitcnt vmcnt(0)
312; GFX900-NEXT:    s_setpc_b64 s[30:31]
313;
314; GFX906-LABEL: load_local_lo_v2i16_reglo_vreg_zexti8:
315; GFX906:       ; %bb.0: ; %entry
316; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
317; GFX906-NEXT:    ds_read_u8 v0, v0
318; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
319; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
320; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
321; GFX906-NEXT:    global_store_dword v[0:1], v0, off
322; GFX906-NEXT:    s_waitcnt vmcnt(0)
323; GFX906-NEXT:    s_setpc_b64 s[30:31]
324;
325; GFX803-LABEL: load_local_lo_v2i16_reglo_vreg_zexti8:
326; GFX803:       ; %bb.0: ; %entry
327; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
328; GFX803-NEXT:    s_mov_b32 m0, -1
329; GFX803-NEXT:    ds_read_u8 v0, v0
330; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
331; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
332; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
333; GFX803-NEXT:    flat_store_dword v[0:1], v0
334; GFX803-NEXT:    s_waitcnt vmcnt(0)
335; GFX803-NEXT:    s_setpc_b64 s[30:31]
336entry:
337  %load = load i8, i8 addrspace(3)* %in
338  %ext = zext i8 %load to i16
339  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
340  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0
341  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
342  ret void
343}
344
; Reinterprets the i32 argument %reg as <2 x i16>, loads an i8 through an
; addrspace(3) pointer, sign-extends it to i16 and writes it into element 0
; (element 1 of %reg is kept). The result is stored through an undef
; addrspace(1) pointer.
; Recorded expectations: the gfx900 runs use ds_read_i8_d16 into v1 and store
; v1 directly; the gfx906 run uses ds_read_i8 plus v_bfi_b32 with a 0xffff
; mask; the fiji run uses ds_read_i8 plus an SDWA or that selects WORD_0 of
; the loaded value.
345define void @load_local_lo_v2i16_reghi_vreg_sexti8(i8 addrspace(3)* %in, i32 %reg) #0 {
346; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_sexti8:
347; GFX900:       ; %bb.0: ; %entry
348; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
349; GFX900-NEXT:    ds_read_i8_d16 v1, v0
350; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
351; GFX900-NEXT:    global_store_dword v[0:1], v1, off
352; GFX900-NEXT:    s_waitcnt vmcnt(0)
353; GFX900-NEXT:    s_setpc_b64 s[30:31]
354;
355; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_sexti8:
356; GFX906:       ; %bb.0: ; %entry
357; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
358; GFX906-NEXT:    ds_read_i8 v0, v0
359; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
360; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
361; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
362; GFX906-NEXT:    global_store_dword v[0:1], v0, off
363; GFX906-NEXT:    s_waitcnt vmcnt(0)
364; GFX906-NEXT:    s_setpc_b64 s[30:31]
365;
366; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_sexti8:
367; GFX803:       ; %bb.0: ; %entry
368; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
369; GFX803-NEXT:    s_mov_b32 m0, -1
370; GFX803-NEXT:    ds_read_i8 v0, v0
371; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
372; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
373; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
374; GFX803-NEXT:    flat_store_dword v[0:1], v0
375; GFX803-NEXT:    s_waitcnt vmcnt(0)
376; GFX803-NEXT:    s_setpc_b64 s[30:31]
377entry:
378  %reg.bc = bitcast i32 %reg to <2 x i16>
379  %load = load i8, i8 addrspace(3)* %in
380  %ext = sext i8 %load to i16
381  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
382  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
383  ret void
384}
385
; Builds a <2 x i16> from the i16 argument %reg (element 1) and a
; sign-extended i8 loaded through an addrspace(3) pointer (element 0), then
; stores it through an undef addrspace(1) pointer.
; Recorded expectations: no d16 load on any run; every target reads with
; ds_read_i8 and packs the halves with ALU ops (the fiji run uses an SDWA or
; selecting WORD_0 of the loaded value).
386define void @load_local_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
387; GFX900-LABEL: load_local_lo_v2i16_reglo_vreg_sexti8:
388; GFX900:       ; %bb.0: ; %entry
389; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
390; GFX900-NEXT:    ds_read_i8 v0, v0
391; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
392; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff, v0
393; GFX900-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
394; GFX900-NEXT:    global_store_dword v[0:1], v0, off
395; GFX900-NEXT:    s_waitcnt vmcnt(0)
396; GFX900-NEXT:    s_setpc_b64 s[30:31]
397;
398; GFX906-LABEL: load_local_lo_v2i16_reglo_vreg_sexti8:
399; GFX906:       ; %bb.0: ; %entry
400; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
401; GFX906-NEXT:    ds_read_i8 v0, v0
402; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
403; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
404; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
405; GFX906-NEXT:    global_store_dword v[0:1], v0, off
406; GFX906-NEXT:    s_waitcnt vmcnt(0)
407; GFX906-NEXT:    s_setpc_b64 s[30:31]
408;
409; GFX803-LABEL: load_local_lo_v2i16_reglo_vreg_sexti8:
410; GFX803:       ; %bb.0: ; %entry
411; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
412; GFX803-NEXT:    s_mov_b32 m0, -1
413; GFX803-NEXT:    ds_read_i8 v0, v0
414; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
415; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
416; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
417; GFX803-NEXT:    flat_store_dword v[0:1], v0
418; GFX803-NEXT:    s_waitcnt vmcnt(0)
419; GFX803-NEXT:    s_setpc_b64 s[30:31]
420entry:
421  %load = load i8, i8 addrspace(3)* %in
422  %ext = sext i8 %load to i16
423  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
424  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0
425  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
426  ret void
427}
428
; Half-typed variant of the zext-i8 "reglo" case: an i8 is loaded through an
; addrspace(3) pointer, zero-extended to i16 and bitcast to half, then placed
; in element 0 of a <2 x half> whose element 1 is the half argument %reg. The
; vector is stored through an undef addrspace(1) pointer.
; Recorded expectations: no d16 load on any run; every target reads with
; ds_read_u8 and packs the halves with ALU ops before the store.
429define void @load_local_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(3)* %in, half %reg) #0 {
430; GFX900-LABEL: load_local_lo_v2f16_reglo_vreg_zexti8:
431; GFX900:       ; %bb.0: ; %entry
432; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
433; GFX900-NEXT:    ds_read_u8 v0, v0
434; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
435; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff, v0
436; GFX900-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
437; GFX900-NEXT:    global_store_dword v[0:1], v0, off
438; GFX900-NEXT:    s_waitcnt vmcnt(0)
439; GFX900-NEXT:    s_setpc_b64 s[30:31]
440;
441; GFX906-LABEL: load_local_lo_v2f16_reglo_vreg_zexti8:
442; GFX906:       ; %bb.0: ; %entry
443; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
444; GFX906-NEXT:    ds_read_u8 v0, v0
445; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
446; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
447; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
448; GFX906-NEXT:    global_store_dword v[0:1], v0, off
449; GFX906-NEXT:    s_waitcnt vmcnt(0)
450; GFX906-NEXT:    s_setpc_b64 s[30:31]
451;
452; GFX803-LABEL: load_local_lo_v2f16_reglo_vreg_zexti8:
453; GFX803:       ; %bb.0: ; %entry
454; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
455; GFX803-NEXT:    s_mov_b32 m0, -1
456; GFX803-NEXT:    ds_read_u8 v0, v0
457; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
458; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
459; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
460; GFX803-NEXT:    flat_store_dword v[0:1], v0
461; GFX803-NEXT:    s_waitcnt vmcnt(0)
462; GFX803-NEXT:    s_setpc_b64 s[30:31]
463entry:
464  %load = load i8, i8 addrspace(3)* %in
465  %ext = zext i8 %load to i16
466  %bitcast = bitcast i16 %ext to half
467  %build0 = insertelement <2 x half> undef, half %reg, i32 1
468  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 0
469  store <2 x half> %build1, <2 x half> addrspace(1)* undef
470  ret void
471}
472
; Half-typed variant of the sext-i8 "reglo" case: an i8 is loaded through an
; addrspace(3) pointer, sign-extended to i16 and bitcast to half, then placed
; in element 0 of a <2 x half> whose element 1 is the half argument %reg. The
; vector is stored through an undef addrspace(1) pointer.
; Recorded expectations: no d16 load on any run; every target reads with
; ds_read_i8 and packs the halves with ALU ops (the fiji run uses an SDWA or
; selecting WORD_0 of the loaded value).
473define void @load_local_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(3)* %in, half %reg) #0 {
474; GFX900-LABEL: load_local_lo_v2f16_reglo_vreg_sexti8:
475; GFX900:       ; %bb.0: ; %entry
476; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
477; GFX900-NEXT:    ds_read_i8 v0, v0
478; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
479; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff, v0
480; GFX900-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
481; GFX900-NEXT:    global_store_dword v[0:1], v0, off
482; GFX900-NEXT:    s_waitcnt vmcnt(0)
483; GFX900-NEXT:    s_setpc_b64 s[30:31]
484;
485; GFX906-LABEL: load_local_lo_v2f16_reglo_vreg_sexti8:
486; GFX906:       ; %bb.0: ; %entry
487; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
488; GFX906-NEXT:    ds_read_i8 v0, v0
489; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
490; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
491; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
492; GFX906-NEXT:    global_store_dword v[0:1], v0, off
493; GFX906-NEXT:    s_waitcnt vmcnt(0)
494; GFX906-NEXT:    s_setpc_b64 s[30:31]
495;
496; GFX803-LABEL: load_local_lo_v2f16_reglo_vreg_sexti8:
497; GFX803:       ; %bb.0: ; %entry
498; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
499; GFX803-NEXT:    s_mov_b32 m0, -1
500; GFX803-NEXT:    ds_read_i8 v0, v0
501; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
502; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
503; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
504; GFX803-NEXT:    flat_store_dword v[0:1], v0
505; GFX803-NEXT:    s_waitcnt vmcnt(0)
506; GFX803-NEXT:    s_setpc_b64 s[30:31]
507entry:
508  %load = load i8, i8 addrspace(3)* %in
509  %ext = sext i8 %load to i16
510  %bitcast = bitcast i16 %ext to half
511  %build0 = insertelement <2 x half> undef, half %reg, i32 1
512  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 0
513  store <2 x half> %build1, <2 x half> addrspace(1)* undef
514  ret void
515}
516
; The loaded i16 has two uses: it is stored through a null addrspace(3)
; pointer and it replaces element 0 of the <2 x i16> argument %reg, which is
; then stored through an undef addrspace(1) pointer.
; Recorded expectations: no d16 load on any run; every target reads with
; ds_read_u16, writes the value back with ds_write_b16, and merges it with
; %reg using ALU ops (v_bfi_b32 on the gfx900/gfx906 runs, and + SDWA or on
; the fiji run).
; NOTE(review): %elt1 below is defined but has no users in this function;
; it looks like a leftover from the sibling multi_use_hi test -- confirm
; before removing, since the assertions are autogenerated.
517define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(i16 addrspace(3)* %in, <2 x i16> %reg) #0 {
518; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo:
519; GFX900:       ; %bb.0: ; %entry
520; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
521; GFX900-NEXT:    ds_read_u16 v0, v0
522; GFX900-NEXT:    v_mov_b32_e32 v2, 0
523; GFX900-NEXT:    v_mov_b32_e32 v3, 0xffff
524; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
525; GFX900-NEXT:    ds_write_b16 v2, v0
526; GFX900-NEXT:    v_bfi_b32 v0, v3, v0, v1
527; GFX900-NEXT:    global_store_dword v[0:1], v0, off
528; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
529; GFX900-NEXT:    s_setpc_b64 s[30:31]
530;
531; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo:
532; GFX906:       ; %bb.0: ; %entry
533; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
534; GFX906-NEXT:    ds_read_u16 v0, v0
535; GFX906-NEXT:    v_mov_b32_e32 v2, 0
536; GFX906-NEXT:    v_mov_b32_e32 v3, 0xffff
537; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
538; GFX906-NEXT:    ds_write_b16 v2, v0
539; GFX906-NEXT:    v_bfi_b32 v0, v3, v0, v1
540; GFX906-NEXT:    global_store_dword v[0:1], v0, off
541; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
542; GFX906-NEXT:    s_setpc_b64 s[30:31]
543;
544; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo:
545; GFX803:       ; %bb.0: ; %entry
546; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
547; GFX803-NEXT:    s_mov_b32 m0, -1
548; GFX803-NEXT:    ds_read_u16 v0, v0
549; GFX803-NEXT:    v_mov_b32_e32 v2, 0
550; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
551; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
552; GFX803-NEXT:    ds_write_b16 v2, v0
553; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
554; GFX803-NEXT:    flat_store_dword v[0:1], v0
555; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
556; GFX803-NEXT:    s_setpc_b64 s[30:31]
557entry:
558  %load = load i16, i16 addrspace(3)* %in
559  %elt1 = extractelement <2 x i16> %reg, i32 1
560  store i16 %load, i16 addrspace(3)* null
561  %build1 = insertelement <2 x i16> %reg, i16 %load, i32 0
562  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
563  ret void
564}
565
; Element 1 of the <2 x i16> argument %reg has two uses: it is extracted and
; stored through a null addrspace(3) pointer, and it survives in the vector
; whose element 0 is replaced by an i16 loaded through %in. The vector is
; stored through an undef addrspace(1) pointer.
; Recorded expectations: the gfx900 runs first shift the high half of v1 into
; v2, then load into v1 with ds_read_u16_d16 and store v1; the gfx906 and fiji
; runs use plain ds_read_u16 and merge with ALU ops. All runs wait with
; lgkmcnt(1) before consuming the loaded value.
566define void @load_local_lo_v2i16_reghi_vreg_multi_use_hi(i16 addrspace(3)* %in, <2 x i16> %reg) #0 {
567; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_hi:
568; GFX900:       ; %bb.0: ; %entry
569; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
570; GFX900-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
571; GFX900-NEXT:    ds_read_u16_d16 v1, v0
572; GFX900-NEXT:    v_mov_b32_e32 v0, 0
573; GFX900-NEXT:    ds_write_b16 v0, v2
574; GFX900-NEXT:    s_waitcnt lgkmcnt(1)
575; GFX900-NEXT:    global_store_dword v[0:1], v1, off
576; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
577; GFX900-NEXT:    s_setpc_b64 s[30:31]
578;
579; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_hi:
580; GFX906:       ; %bb.0: ; %entry
581; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
582; GFX906-NEXT:    ds_read_u16 v0, v0
583; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
584; GFX906-NEXT:    v_mov_b32_e32 v3, 0
585; GFX906-NEXT:    ds_write_b16 v3, v2
586; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
587; GFX906-NEXT:    s_waitcnt lgkmcnt(1)
588; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
589; GFX906-NEXT:    global_store_dword v[0:1], v0, off
590; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
591; GFX906-NEXT:    s_setpc_b64 s[30:31]
592;
593; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_hi:
594; GFX803:       ; %bb.0: ; %entry
595; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
596; GFX803-NEXT:    s_mov_b32 m0, -1
597; GFX803-NEXT:    ds_read_u16 v0, v0
598; GFX803-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
599; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
600; GFX803-NEXT:    v_mov_b32_e32 v3, 0
601; GFX803-NEXT:    ds_write_b16 v3, v2
602; GFX803-NEXT:    s_waitcnt lgkmcnt(1)
603; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
604; GFX803-NEXT:    flat_store_dword v[0:1], v0
605; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
606; GFX803-NEXT:    s_setpc_b64 s[30:31]
607entry:
608  %load = load i16, i16 addrspace(3)* %in
609  %elt1 = extractelement <2 x i16> %reg, i32 1
610  store i16 %elt1, i16 addrspace(3)* null
611  %build1 = insertelement <2 x i16> %reg, i16 %load, i32 0
612  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
613  ret void
614}
615
; Both halves have extra uses: the loaded i16 is stored through %out0 and
; element 1 of %reg is stored through %out1 (all three addrspace(3) pointer
; arguments are marked noalias). The load also replaces element 0 of %reg,
; and the resulting vector is stored through an undef addrspace(1) pointer.
; Recorded expectations: no d16 load on any run; every target reads with
; ds_read_u16, issues two ds_write_b16, and merges with ALU ops (v_bfi_b32 on
; the gfx900/gfx906 runs, and + SDWA or on the fiji run).
616define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(i16 addrspace(3)* noalias %in, <2 x i16> %reg, i16 addrspace(3)* noalias %out0, i16 addrspace(3)* noalias %out1) #0 {
617; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi:
618; GFX900:       ; %bb.0: ; %entry
619; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
620; GFX900-NEXT:    ds_read_u16 v0, v0
621; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
622; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
623; GFX900-NEXT:    ds_write_b16 v2, v0
624; GFX900-NEXT:    ds_write_b16 v3, v4
625; GFX900-NEXT:    v_mov_b32_e32 v2, 0xffff
626; GFX900-NEXT:    v_bfi_b32 v0, v2, v0, v1
627; GFX900-NEXT:    global_store_dword v[0:1], v0, off
628; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
629; GFX900-NEXT:    s_setpc_b64 s[30:31]
630;
631; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi:
632; GFX906:       ; %bb.0: ; %entry
633; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
634; GFX906-NEXT:    ds_read_u16 v0, v0
635; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
636; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
637; GFX906-NEXT:    ds_write_b16 v2, v0
638; GFX906-NEXT:    ds_write_b16 v3, v4
639; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
640; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
641; GFX906-NEXT:    global_store_dword v[0:1], v0, off
642; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
643; GFX906-NEXT:    s_setpc_b64 s[30:31]
644;
645; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi:
646; GFX803:       ; %bb.0: ; %entry
647; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
648; GFX803-NEXT:    s_mov_b32 m0, -1
649; GFX803-NEXT:    ds_read_u16 v0, v0
650; GFX803-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
651; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
652; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
653; GFX803-NEXT:    ds_write_b16 v2, v0
654; GFX803-NEXT:    ds_write_b16 v3, v4
655; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
656; GFX803-NEXT:    flat_store_dword v[0:1], v0
657; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
658; GFX803-NEXT:    s_setpc_b64 s[30:31]
659entry:
660  %load = load i16, i16 addrspace(3)* %in
661  %elt1 = extractelement <2 x i16> %reg, i32 1
662  store i16 %load, i16 addrspace(3)* %out0
663  store i16 %elt1, i16 addrspace(3)* %out1
664  %build1 = insertelement <2 x i16> %reg, i16 %load, i32 0
665  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
666  ret void
667}
668
; addrspace(1) counterpart of the local tests: reinterprets the i32 argument
; %reg as <2 x i16>, loads an i16 from %in offset by -2047 elements, writes it
; into element 0 (element 1 of %reg is kept) and stores the vector through an
; undef addrspace(1) pointer.
; Recorded expectations: the gfx900 runs use global_load_short_d16 into v2 with
; an immediate offset of -4094 and store v2; the gfx906 run uses
; global_load_ushort with the same offset plus v_bfi_b32; the fiji run adds
; 0xfffff002 / -1 to the 64-bit address and uses flat_load_ushort plus and/or.
; NOTE(review): the name says "reglo", but the IR shape (i32 %reg bitcast,
; insert at index 0) matches the local "reghi" tests in this file -- the name
; is presumably historical; renaming would require regenerating the checks.
669define void @load_global_lo_v2i16_reglo_vreg(i16 addrspace(1)* %in, i32 %reg) #0 {
670; GFX900-LABEL: load_global_lo_v2i16_reglo_vreg:
671; GFX900:       ; %bb.0: ; %entry
672; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
673; GFX900-NEXT:    global_load_short_d16 v2, v[0:1], off offset:-4094
674; GFX900-NEXT:    s_waitcnt vmcnt(0)
675; GFX900-NEXT:    global_store_dword v[0:1], v2, off
676; GFX900-NEXT:    s_waitcnt vmcnt(0)
677; GFX900-NEXT:    s_setpc_b64 s[30:31]
678;
679; GFX906-LABEL: load_global_lo_v2i16_reglo_vreg:
680; GFX906:       ; %bb.0: ; %entry
681; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
682; GFX906-NEXT:    global_load_ushort v0, v[0:1], off offset:-4094
683; GFX906-NEXT:    v_mov_b32_e32 v1, 0xffff
684; GFX906-NEXT:    s_waitcnt vmcnt(0)
685; GFX906-NEXT:    v_bfi_b32 v0, v1, v0, v2
686; GFX906-NEXT:    global_store_dword v[0:1], v0, off
687; GFX906-NEXT:    s_waitcnt vmcnt(0)
688; GFX906-NEXT:    s_setpc_b64 s[30:31]
689;
690; GFX803-LABEL: load_global_lo_v2i16_reglo_vreg:
691; GFX803:       ; %bb.0: ; %entry
692; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
693; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
694; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
695; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
696; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
697; GFX803-NEXT:    s_waitcnt vmcnt(0)
698; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
699; GFX803-NEXT:    flat_store_dword v[0:1], v0
700; GFX803-NEXT:    s_waitcnt vmcnt(0)
701; GFX803-NEXT:    s_setpc_b64 s[30:31]
702entry:
703  %reg.bc = bitcast i32 %reg to <2 x i16>
704  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
705  %load = load i16, i16 addrspace(1)* %gep
706  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
707  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
708  ret void
709}
710
; Half-typed addrspace(1) variant: reinterprets the i32 argument %reg as
; <2 x half>, loads a half from %in offset by -2047 elements, writes it into
; element 0 (element 1 of %reg is kept) and stores the vector through an
; undef addrspace(1) pointer.
; Recorded expectations: the gfx900 runs use global_load_short_d16 into v2 with
; an immediate offset of -4094 and store v2; the gfx906 run uses
; global_load_ushort with the same offset and packs via lshrrev / and /
; lshl_or; the fiji run adds 0xfffff002 / -1 to the 64-bit address and uses
; flat_load_ushort plus and/or.
; NOTE(review): as with the v2i16 global test, "reglo" in the name does not
; match the IR shape, which preserves element 1 of %reg -- presumably a
; historical name; confirm before renaming.
711define void @load_global_lo_v2f16_reglo_vreg(half addrspace(1)* %in, i32 %reg) #0 {
712; GFX900-LABEL: load_global_lo_v2f16_reglo_vreg:
713; GFX900:       ; %bb.0: ; %entry
714; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
715; GFX900-NEXT:    global_load_short_d16 v2, v[0:1], off offset:-4094
716; GFX900-NEXT:    s_waitcnt vmcnt(0)
717; GFX900-NEXT:    global_store_dword v[0:1], v2, off
718; GFX900-NEXT:    s_waitcnt vmcnt(0)
719; GFX900-NEXT:    s_setpc_b64 s[30:31]
720;
721; GFX906-LABEL: load_global_lo_v2f16_reglo_vreg:
722; GFX906:       ; %bb.0: ; %entry
723; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
724; GFX906-NEXT:    global_load_ushort v0, v[0:1], off offset:-4094
725; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
726; GFX906-NEXT:    s_waitcnt vmcnt(0)
727; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
728; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
729; GFX906-NEXT:    global_store_dword v[0:1], v0, off
730; GFX906-NEXT:    s_waitcnt vmcnt(0)
731; GFX906-NEXT:    s_setpc_b64 s[30:31]
732;
733; GFX803-LABEL: load_global_lo_v2f16_reglo_vreg:
734; GFX803:       ; %bb.0: ; %entry
735; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
736; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
737; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
738; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
739; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
740; GFX803-NEXT:    s_waitcnt vmcnt(0)
741; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
742; GFX803-NEXT:    flat_store_dword v[0:1], v0
743; GFX803-NEXT:    s_waitcnt vmcnt(0)
744; GFX803-NEXT:    s_setpc_b64 s[30:31]
745entry:
746  %reg.bc = bitcast i32 %reg to <2 x half>
747  %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
748  %load = load half, half addrspace(1)* %gep
749  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
750  store <2 x half> %build1, <2 x half> addrspace(1)* undef
751  ret void
752}
753
; Test: load an i8 from global memory at byte offset -4095, zero-extend it
; to i16 and insert it into element 0 of a <2 x i16> built from the bitcast
; i32 argument %reg.
; What the autogenerated checks below pin down:
;   gfx900 - one global_load_ubyte_d16 with the offset folded in.
;   gfx906 - (RUN line enables +sram-ecc) global_load_ubyte, then
;            v_bfi_b32 with a 0xffff mask to merge with %reg.
;   fiji   - 64-bit add of 0xfffff001 (= -4095), flat_load_ubyte, and an
;            and/or merge.
754define void @load_global_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
755; GFX900-LABEL: load_global_lo_v2i16_reglo_vreg_zexti8:
756; GFX900:       ; %bb.0: ; %entry
757; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
758; GFX900-NEXT:    global_load_ubyte_d16 v2, v[0:1], off offset:-4095
759; GFX900-NEXT:    s_waitcnt vmcnt(0)
760; GFX900-NEXT:    global_store_dword v[0:1], v2, off
761; GFX900-NEXT:    s_waitcnt vmcnt(0)
762; GFX900-NEXT:    s_setpc_b64 s[30:31]
763;
764; GFX906-LABEL: load_global_lo_v2i16_reglo_vreg_zexti8:
765; GFX906:       ; %bb.0: ; %entry
766; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
767; GFX906-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4095
768; GFX906-NEXT:    v_mov_b32_e32 v1, 0xffff
769; GFX906-NEXT:    s_waitcnt vmcnt(0)
770; GFX906-NEXT:    v_bfi_b32 v0, v1, v0, v2
771; GFX906-NEXT:    global_store_dword v[0:1], v0, off
772; GFX906-NEXT:    s_waitcnt vmcnt(0)
773; GFX906-NEXT:    s_setpc_b64 s[30:31]
774;
775; GFX803-LABEL: load_global_lo_v2i16_reglo_vreg_zexti8:
776; GFX803:       ; %bb.0: ; %entry
777; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
778; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
779; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
780; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
781; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
782; GFX803-NEXT:    s_waitcnt vmcnt(0)
783; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
784; GFX803-NEXT:    flat_store_dword v[0:1], v0
785; GFX803-NEXT:    s_waitcnt vmcnt(0)
786; GFX803-NEXT:    s_setpc_b64 s[30:31]
787entry:
788  %reg.bc = bitcast i32 %reg to <2 x i16>
789  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
790  %load = load i8, i8 addrspace(1)* %gep
  ; The zext is what the ubyte (unsigned) load forms in the checks reflect.
791  %ext = zext i8 %load to i16
792  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
793  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
794  ret void
795}
796
; Test: load an i8 from global memory at byte offset -4095, sign-extend it
; to i16 and insert it into element 0 of a <2 x i16> built from the bitcast
; i32 argument %reg.
; What the autogenerated checks below pin down:
;   gfx900 - one global_load_sbyte_d16 with the offset folded in.
;   gfx906 - (RUN line enables +sram-ecc) global_load_sbyte, then
;            v_bfi_b32 with a 0xffff mask to merge with %reg.
;   fiji   - 64-bit add of 0xfffff001 (= -4095), flat_load_sbyte, and a
;            v_or_b32_sdwa that takes only WORD_0 of the loaded value.
797define void @load_global_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
798; GFX900-LABEL: load_global_lo_v2i16_reglo_vreg_sexti8:
799; GFX900:       ; %bb.0: ; %entry
800; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
801; GFX900-NEXT:    global_load_sbyte_d16 v2, v[0:1], off offset:-4095
802; GFX900-NEXT:    s_waitcnt vmcnt(0)
803; GFX900-NEXT:    global_store_dword v[0:1], v2, off
804; GFX900-NEXT:    s_waitcnt vmcnt(0)
805; GFX900-NEXT:    s_setpc_b64 s[30:31]
806;
807; GFX906-LABEL: load_global_lo_v2i16_reglo_vreg_sexti8:
808; GFX906:       ; %bb.0: ; %entry
809; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
810; GFX906-NEXT:    global_load_sbyte v0, v[0:1], off offset:-4095
811; GFX906-NEXT:    v_mov_b32_e32 v1, 0xffff
812; GFX906-NEXT:    s_waitcnt vmcnt(0)
813; GFX906-NEXT:    v_bfi_b32 v0, v1, v0, v2
814; GFX906-NEXT:    global_store_dword v[0:1], v0, off
815; GFX906-NEXT:    s_waitcnt vmcnt(0)
816; GFX906-NEXT:    s_setpc_b64 s[30:31]
817;
818; GFX803-LABEL: load_global_lo_v2i16_reglo_vreg_sexti8:
819; GFX803:       ; %bb.0: ; %entry
820; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
821; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
822; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
823; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
824; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
825; GFX803-NEXT:    s_waitcnt vmcnt(0)
826; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
827; GFX803-NEXT:    flat_store_dword v[0:1], v0
828; GFX803-NEXT:    s_waitcnt vmcnt(0)
829; GFX803-NEXT:    s_setpc_b64 s[30:31]
830entry:
831  %reg.bc = bitcast i32 %reg to <2 x i16>
832  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
833  %load = load i8, i8 addrspace(1)* %gep
  ; The sext is what the sbyte (signed) load forms in the checks reflect.
834  %ext = sext i8 %load to i16
835  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
836  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
837  ret void
838}
839
; Test: same as the <2 x i16> zero-extended i8 global load case, but the
; extended i16 is bitcast to half and inserted into element 0 of a
; <2 x half> built from the bitcast i32 argument %reg.
; What the autogenerated checks below pin down:
;   gfx900 - one global_load_ubyte_d16 at offset -4095.
;   gfx906 - (RUN line enables +sram-ecc) global_load_ubyte, then
;            shift / and / v_lshl_or_b32 to rebuild the packed value.
;   fiji   - 64-bit add of 0xfffff001 (= -4095), flat_load_ubyte, and an
;            and/or merge.
840define void @load_global_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
841; GFX900-LABEL: load_global_lo_v2f16_reglo_vreg_zexti8:
842; GFX900:       ; %bb.0: ; %entry
843; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
844; GFX900-NEXT:    global_load_ubyte_d16 v2, v[0:1], off offset:-4095
845; GFX900-NEXT:    s_waitcnt vmcnt(0)
846; GFX900-NEXT:    global_store_dword v[0:1], v2, off
847; GFX900-NEXT:    s_waitcnt vmcnt(0)
848; GFX900-NEXT:    s_setpc_b64 s[30:31]
849;
850; GFX906-LABEL: load_global_lo_v2f16_reglo_vreg_zexti8:
851; GFX906:       ; %bb.0: ; %entry
852; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
853; GFX906-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4095
854; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
855; GFX906-NEXT:    s_waitcnt vmcnt(0)
856; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
857; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
858; GFX906-NEXT:    global_store_dword v[0:1], v0, off
859; GFX906-NEXT:    s_waitcnt vmcnt(0)
860; GFX906-NEXT:    s_setpc_b64 s[30:31]
861;
862; GFX803-LABEL: load_global_lo_v2f16_reglo_vreg_zexti8:
863; GFX803:       ; %bb.0: ; %entry
864; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
865; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
866; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
867; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
868; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
869; GFX803-NEXT:    s_waitcnt vmcnt(0)
870; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
871; GFX803-NEXT:    flat_store_dword v[0:1], v0
872; GFX803-NEXT:    s_waitcnt vmcnt(0)
873; GFX803-NEXT:    s_setpc_b64 s[30:31]
874entry:
875  %reg.bc = bitcast i32 %reg to <2 x half>
876  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
877  %load = load i8, i8 addrspace(1)* %gep
878  %ext = zext i8 %load to i16
  ; Reinterpret the integer bits as a half; no numeric conversion happens.
879  %bitcast = bitcast i16 %ext to half
880  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
881  store <2 x half> %build1, <2 x half> addrspace(1)* undef
882  ret void
883}
884
; Test: same as the <2 x i16> sign-extended i8 global load case, but the
; extended i16 is bitcast to half and inserted into element 0 of a
; <2 x half> built from the bitcast i32 argument %reg.
; What the autogenerated checks below pin down:
;   gfx900 - one global_load_sbyte_d16 at offset -4095.
;   gfx906 - (RUN line enables +sram-ecc) global_load_sbyte, then
;            shift / and / v_lshl_or_b32 to rebuild the packed value.
;   fiji   - 64-bit add of 0xfffff001 (= -4095), flat_load_sbyte, and a
;            v_or_b32_sdwa that takes only WORD_0 of the loaded value.
885define void @load_global_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
886; GFX900-LABEL: load_global_lo_v2f16_reglo_vreg_sexti8:
887; GFX900:       ; %bb.0: ; %entry
888; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
889; GFX900-NEXT:    global_load_sbyte_d16 v2, v[0:1], off offset:-4095
890; GFX900-NEXT:    s_waitcnt vmcnt(0)
891; GFX900-NEXT:    global_store_dword v[0:1], v2, off
892; GFX900-NEXT:    s_waitcnt vmcnt(0)
893; GFX900-NEXT:    s_setpc_b64 s[30:31]
894;
895; GFX906-LABEL: load_global_lo_v2f16_reglo_vreg_sexti8:
896; GFX906:       ; %bb.0: ; %entry
897; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
898; GFX906-NEXT:    global_load_sbyte v0, v[0:1], off offset:-4095
899; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
900; GFX906-NEXT:    s_waitcnt vmcnt(0)
901; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
902; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
903; GFX906-NEXT:    global_store_dword v[0:1], v0, off
904; GFX906-NEXT:    s_waitcnt vmcnt(0)
905; GFX906-NEXT:    s_setpc_b64 s[30:31]
906;
907; GFX803-LABEL: load_global_lo_v2f16_reglo_vreg_sexti8:
908; GFX803:       ; %bb.0: ; %entry
909; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
910; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
911; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
912; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
913; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
914; GFX803-NEXT:    s_waitcnt vmcnt(0)
915; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
916; GFX803-NEXT:    flat_store_dword v[0:1], v0
917; GFX803-NEXT:    s_waitcnt vmcnt(0)
918; GFX803-NEXT:    s_setpc_b64 s[30:31]
919entry:
920  %reg.bc = bitcast i32 %reg to <2 x half>
921  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
922  %load = load i8, i8 addrspace(1)* %gep
923  %ext = sext i8 %load to i16
  ; Reinterpret the integer bits as a half; no numeric conversion happens.
924  %bitcast = bitcast i16 %ext to half
925  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
926  store <2 x half> %build1, <2 x half> addrspace(1)* undef
927  ret void
928}
929
; Test: load an i16 through a flat (default address space) pointer with no
; offset and insert it into element 0 of a <2 x i16> built from the bitcast
; i32 argument %reg.
; What the autogenerated checks below pin down:
;   gfx900 - one flat_load_short_d16.
;   gfx906 - (RUN line enables +sram-ecc) flat_load_ushort, then
;            v_bfi_b32 with a 0xffff mask to merge with %reg.
;   fiji   - flat_load_ushort and an and/or merge.
; In every configuration the wait after the flat load covers both vmcnt
; and lgkmcnt.
; NOTE(review): the name says "reghi", but the IR takes an i32 %reg and
; inserts at index 0, the same shape as the *_reglo_* tests (compare
; load_private_lo_v2i16_reghi_vreg, which takes an i16 %reg placed at
; index 1) - confirm whether the name or the body is what was intended.
930define void @load_flat_lo_v2i16_reghi_vreg(i16* %in, i32 %reg) #0 {
931; GFX900-LABEL: load_flat_lo_v2i16_reghi_vreg:
932; GFX900:       ; %bb.0: ; %entry
933; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
934; GFX900-NEXT:    flat_load_short_d16 v2, v[0:1]
935; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
936; GFX900-NEXT:    global_store_dword v[0:1], v2, off
937; GFX900-NEXT:    s_waitcnt vmcnt(0)
938; GFX900-NEXT:    s_setpc_b64 s[30:31]
939;
940; GFX906-LABEL: load_flat_lo_v2i16_reghi_vreg:
941; GFX906:       ; %bb.0: ; %entry
942; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
943; GFX906-NEXT:    flat_load_ushort v0, v[0:1]
944; GFX906-NEXT:    v_mov_b32_e32 v1, 0xffff
945; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
946; GFX906-NEXT:    v_bfi_b32 v0, v1, v0, v2
947; GFX906-NEXT:    global_store_dword v[0:1], v0, off
948; GFX906-NEXT:    s_waitcnt vmcnt(0)
949; GFX906-NEXT:    s_setpc_b64 s[30:31]
950;
951; GFX803-LABEL: load_flat_lo_v2i16_reghi_vreg:
952; GFX803:       ; %bb.0: ; %entry
953; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
954; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
955; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
956; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
957; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
958; GFX803-NEXT:    flat_store_dword v[0:1], v0
959; GFX803-NEXT:    s_waitcnt vmcnt(0)
960; GFX803-NEXT:    s_setpc_b64 s[30:31]
961entry:
962  %reg.bc = bitcast i32 %reg to <2 x i16>
963  %load = load i16, i16* %in
964  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
965  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
966  ret void
967}
968
; Test: load a half through a flat (default address space) pointer with no
; offset and insert it into element 0 of a <2 x half> built from the
; bitcast i32 argument %reg.
; What the autogenerated checks below pin down:
;   gfx900 - one flat_load_short_d16.
;   gfx906 - (RUN line enables +sram-ecc) flat_load_ushort, then
;            shift / and / v_lshl_or_b32 to rebuild the packed value.
;   fiji   - flat_load_ushort and an and/or merge.
; NOTE(review): the name says "reghi", but the IR takes an i32 %reg and
; inserts at index 0, the same shape as the *_reglo_* tests - confirm
; whether the name or the body is what was intended.
969define void @load_flat_lo_v2f16_reghi_vreg(half* %in, i32 %reg) #0 {
970; GFX900-LABEL: load_flat_lo_v2f16_reghi_vreg:
971; GFX900:       ; %bb.0: ; %entry
972; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
973; GFX900-NEXT:    flat_load_short_d16 v2, v[0:1]
974; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
975; GFX900-NEXT:    global_store_dword v[0:1], v2, off
976; GFX900-NEXT:    s_waitcnt vmcnt(0)
977; GFX900-NEXT:    s_setpc_b64 s[30:31]
978;
979; GFX906-LABEL: load_flat_lo_v2f16_reghi_vreg:
980; GFX906:       ; %bb.0: ; %entry
981; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
982; GFX906-NEXT:    flat_load_ushort v0, v[0:1]
983; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
984; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
985; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
986; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
987; GFX906-NEXT:    global_store_dword v[0:1], v0, off
988; GFX906-NEXT:    s_waitcnt vmcnt(0)
989; GFX906-NEXT:    s_setpc_b64 s[30:31]
990;
991; GFX803-LABEL: load_flat_lo_v2f16_reghi_vreg:
992; GFX803:       ; %bb.0: ; %entry
993; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
994; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
995; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
996; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
997; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
998; GFX803-NEXT:    flat_store_dword v[0:1], v0
999; GFX803-NEXT:    s_waitcnt vmcnt(0)
1000; GFX803-NEXT:    s_setpc_b64 s[30:31]
1001
1002; FIXME: the and above should be removable
1003entry:
1004  %reg.bc = bitcast i32 %reg to <2 x half>
1005  %load = load half, half* %in
1006  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
1007  store <2 x half> %build1, <2 x half> addrspace(1)* undef
1008  ret void
1009}
1010
; Test: load an i8 through a flat pointer, zero-extend it to i16 and insert
; it into element 0 of a <2 x i16> built from the bitcast i32 argument
; %reg. No address offset is involved.
; What the autogenerated checks below pin down:
;   gfx900 - one flat_load_ubyte_d16.
;   gfx906 - (RUN line enables +sram-ecc) flat_load_ubyte, then
;            v_bfi_b32 with a 0xffff mask to merge with %reg.
;   fiji   - flat_load_ubyte and an and/or merge.
1011define void @load_flat_lo_v2i16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
1012; GFX900-LABEL: load_flat_lo_v2i16_reglo_vreg_zexti8:
1013; GFX900:       ; %bb.0: ; %entry
1014; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1015; GFX900-NEXT:    flat_load_ubyte_d16 v2, v[0:1]
1016; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1017; GFX900-NEXT:    global_store_dword v[0:1], v2, off
1018; GFX900-NEXT:    s_waitcnt vmcnt(0)
1019; GFX900-NEXT:    s_setpc_b64 s[30:31]
1020;
1021; GFX906-LABEL: load_flat_lo_v2i16_reglo_vreg_zexti8:
1022; GFX906:       ; %bb.0: ; %entry
1023; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1024; GFX906-NEXT:    flat_load_ubyte v0, v[0:1]
1025; GFX906-NEXT:    v_mov_b32_e32 v1, 0xffff
1026; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1027; GFX906-NEXT:    v_bfi_b32 v0, v1, v0, v2
1028; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1029; GFX906-NEXT:    s_waitcnt vmcnt(0)
1030; GFX906-NEXT:    s_setpc_b64 s[30:31]
1031;
1032; GFX803-LABEL: load_flat_lo_v2i16_reglo_vreg_zexti8:
1033; GFX803:       ; %bb.0: ; %entry
1034; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1035; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
1036; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
1037; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1038; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
1039; GFX803-NEXT:    flat_store_dword v[0:1], v0
1040; GFX803-NEXT:    s_waitcnt vmcnt(0)
1041; GFX803-NEXT:    s_setpc_b64 s[30:31]
1042entry:
1043  %reg.bc = bitcast i32 %reg to <2 x i16>
1044  %load = load i8, i8* %in
1045  %ext = zext i8 %load to i16
1046  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
1047  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
1048  ret void
1049}
1050
; Test: load an i8 through a flat pointer, sign-extend it to i16 and insert
; it into element 0 of a <2 x i16> built from the bitcast i32 argument
; %reg. No address offset is involved.
; What the autogenerated checks below pin down:
;   gfx900 - one flat_load_sbyte_d16.
;   gfx906 - (RUN line enables +sram-ecc) flat_load_sbyte, then
;            v_bfi_b32 with a 0xffff mask to merge with %reg.
;   fiji   - flat_load_sbyte and a v_or_b32_sdwa that takes only WORD_0 of
;            the loaded value.
1051define void @load_flat_lo_v2i16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 {
1052; GFX900-LABEL: load_flat_lo_v2i16_reglo_vreg_sexti8:
1053; GFX900:       ; %bb.0: ; %entry
1054; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1055; GFX900-NEXT:    flat_load_sbyte_d16 v2, v[0:1]
1056; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1057; GFX900-NEXT:    global_store_dword v[0:1], v2, off
1058; GFX900-NEXT:    s_waitcnt vmcnt(0)
1059; GFX900-NEXT:    s_setpc_b64 s[30:31]
1060;
1061; GFX906-LABEL: load_flat_lo_v2i16_reglo_vreg_sexti8:
1062; GFX906:       ; %bb.0: ; %entry
1063; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1064; GFX906-NEXT:    flat_load_sbyte v0, v[0:1]
1065; GFX906-NEXT:    v_mov_b32_e32 v1, 0xffff
1066; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1067; GFX906-NEXT:    v_bfi_b32 v0, v1, v0, v2
1068; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1069; GFX906-NEXT:    s_waitcnt vmcnt(0)
1070; GFX906-NEXT:    s_setpc_b64 s[30:31]
1071;
1072; GFX803-LABEL: load_flat_lo_v2i16_reglo_vreg_sexti8:
1073; GFX803:       ; %bb.0: ; %entry
1074; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1075; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
1076; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
1077; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1078; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1079; GFX803-NEXT:    flat_store_dword v[0:1], v0
1080; GFX803-NEXT:    s_waitcnt vmcnt(0)
1081; GFX803-NEXT:    s_setpc_b64 s[30:31]
1082entry:
1083  %reg.bc = bitcast i32 %reg to <2 x i16>
1084  %load = load i8, i8* %in
1085  %ext = sext i8 %load to i16
1086  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
1087  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
1088  ret void
1089}
1090
; Test: load an i8 through a flat pointer, zero-extend it to i16, bitcast
; that to half and insert it into element 0 of a <2 x half> built from the
; bitcast i32 argument %reg.
; What the autogenerated checks below pin down:
;   gfx900 - one flat_load_ubyte_d16.
;   gfx906 - (RUN line enables +sram-ecc) flat_load_ubyte, then
;            shift / and / v_lshl_or_b32 to rebuild the packed value.
;   fiji   - flat_load_ubyte and an and/or merge.
1091define void @load_flat_lo_v2f16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
1092; GFX900-LABEL: load_flat_lo_v2f16_reglo_vreg_zexti8:
1093; GFX900:       ; %bb.0: ; %entry
1094; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1095; GFX900-NEXT:    flat_load_ubyte_d16 v2, v[0:1]
1096; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1097; GFX900-NEXT:    global_store_dword v[0:1], v2, off
1098; GFX900-NEXT:    s_waitcnt vmcnt(0)
1099; GFX900-NEXT:    s_setpc_b64 s[30:31]
1100;
1101; GFX906-LABEL: load_flat_lo_v2f16_reglo_vreg_zexti8:
1102; GFX906:       ; %bb.0: ; %entry
1103; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1104; GFX906-NEXT:    flat_load_ubyte v0, v[0:1]
1105; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1106; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1107; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1108; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
1109; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1110; GFX906-NEXT:    s_waitcnt vmcnt(0)
1111; GFX906-NEXT:    s_setpc_b64 s[30:31]
1112;
1113; GFX803-LABEL: load_flat_lo_v2f16_reglo_vreg_zexti8:
1114; GFX803:       ; %bb.0: ; %entry
1115; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1116; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
1117; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
1118; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1119; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
1120; GFX803-NEXT:    flat_store_dword v[0:1], v0
1121; GFX803-NEXT:    s_waitcnt vmcnt(0)
1122; GFX803-NEXT:    s_setpc_b64 s[30:31]
1123entry:
1124  %reg.bc = bitcast i32 %reg to <2 x half>
1125  %load = load i8, i8* %in
1126  %ext = zext i8 %load to i16
  ; Reinterpret the integer bits as a half; no numeric conversion happens.
1127  %bitcast = bitcast i16 %ext to half
1128  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
1129  store <2 x half> %build1, <2 x half> addrspace(1)* undef
1130  ret void
1131}
1132
; Test: load an i8 through a flat pointer, sign-extend it to i16, bitcast
; that to half and insert it into element 0 of a <2 x half> built from the
; bitcast i32 argument %reg.
; What the autogenerated checks below pin down:
;   gfx900 - one flat_load_sbyte_d16.
;   gfx906 - (RUN line enables +sram-ecc) flat_load_sbyte, then
;            shift / and / v_lshl_or_b32 to rebuild the packed value.
;   fiji   - flat_load_sbyte and a v_or_b32_sdwa that takes only WORD_0 of
;            the loaded value.
1133define void @load_flat_lo_v2f16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 {
1134; GFX900-LABEL: load_flat_lo_v2f16_reglo_vreg_sexti8:
1135; GFX900:       ; %bb.0: ; %entry
1136; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1137; GFX900-NEXT:    flat_load_sbyte_d16 v2, v[0:1]
1138; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1139; GFX900-NEXT:    global_store_dword v[0:1], v2, off
1140; GFX900-NEXT:    s_waitcnt vmcnt(0)
1141; GFX900-NEXT:    s_setpc_b64 s[30:31]
1142;
1143; GFX906-LABEL: load_flat_lo_v2f16_reglo_vreg_sexti8:
1144; GFX906:       ; %bb.0: ; %entry
1145; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1146; GFX906-NEXT:    flat_load_sbyte v0, v[0:1]
1147; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1148; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1149; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1150; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
1151; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1152; GFX906-NEXT:    s_waitcnt vmcnt(0)
1153; GFX906-NEXT:    s_setpc_b64 s[30:31]
1154;
1155; GFX803-LABEL: load_flat_lo_v2f16_reglo_vreg_sexti8:
1156; GFX803:       ; %bb.0: ; %entry
1157; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1158; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
1159; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
1160; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1161; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1162; GFX803-NEXT:    flat_store_dword v[0:1], v0
1163; GFX803-NEXT:    s_waitcnt vmcnt(0)
1164; GFX803-NEXT:    s_setpc_b64 s[30:31]
1165entry:
1166  %reg.bc = bitcast i32 %reg to <2 x half>
1167  %load = load i8, i8* %in
1168  %ext = sext i8 %load to i16
  ; Reinterpret the integer bits as a half; no numeric conversion happens.
1169  %bitcast = bitcast i16 %ext to half
1170  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
1171  store <2 x half> %build1, <2 x half> addrspace(1)* undef
1172  ret void
1173}
1174
; Test: load an i16 from private memory (addrspace(5)) and insert it into
; element 0 of a <2 x i16> built from the bitcast i32 argument %reg.
; %in is a byval argument and the load is 2047 i16 elements past it, i.e.
; byte offset 4094, which the checks show folded into the instruction
; relative to s32.
; What the autogenerated checks below pin down:
;   gfx900, MUBUF scratch   - one buffer_load_short_d16 at offset 4094.
;   gfx900, flat scratch    - one scratch_load_short_d16 at offset 4094.
;   gfx906 (+sram-ecc)      - buffer_load_ushort, then v_bfi_b32 with a
;                             0xffff mask.
;   fiji                    - buffer_load_ushort and an and/or merge.
1175define void @load_private_lo_v2i16_reglo_vreg(i16 addrspace(5)* byval(i16) %in, i32 %reg) #0 {
1176; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg:
1177; GFX900-MUBUF:       ; %bb.0: ; %entry
1178; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1179; GFX900-MUBUF-NEXT:    buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094
1180; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1181; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
1182; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1183; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1184;
1185; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg:
1186; GFX906:       ; %bb.0: ; %entry
1187; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1188; GFX906-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
1189; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
1190; GFX906-NEXT:    s_waitcnt vmcnt(0)
1191; GFX906-NEXT:    v_bfi_b32 v0, v2, v1, v0
1192; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1193; GFX906-NEXT:    s_waitcnt vmcnt(0)
1194; GFX906-NEXT:    s_setpc_b64 s[30:31]
1195;
1196; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg:
1197; GFX803:       ; %bb.0: ; %entry
1198; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1199; GFX803-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
1200; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
1201; GFX803-NEXT:    s_waitcnt vmcnt(0)
1202; GFX803-NEXT:    v_or_b32_e32 v0, v1, v0
1203; GFX803-NEXT:    flat_store_dword v[0:1], v0
1204; GFX803-NEXT:    s_waitcnt vmcnt(0)
1205; GFX803-NEXT:    s_setpc_b64 s[30:31]
1206;
1207; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg:
1208; GFX900-FLATSCR:       ; %bb.0: ; %entry
1209; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1210; GFX900-FLATSCR-NEXT:    scratch_load_short_d16 v0, off, s32 offset:4094
1211; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1212; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
1213; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1214; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
1215entry:
1216  %reg.bc = bitcast i32 %reg to <2 x i16>
  ; 2047 i16 elements == 4094 bytes, the offset seen in the checks above.
1217  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
1218  %load = load i16, i16 addrspace(5)* %gep
1219  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
1220  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
1221  ret void
1222}
1223
; Test: build a <2 x i16> whose high element (index 1) is the i16 argument
; %reg and whose low element (index 0) is an i16 loaded from private memory
; at byte offset 4094 past the byval argument %in.
; Unlike the *_reglo_* tests, %reg here is a scalar i16 rather than an
; already-packed i32. The checks below show that no d16 load is selected in
; any configuration:
;   gfx900 (MUBUF and flat scratch) and gfx906 - plain ushort load, then
;            v_and_b32 with 0xffff and v_lshl_or_b32 to pack %reg on top.
;   fiji   - buffer_load_ushort, v_lshlrev_b32 of %reg by 16, then v_or.
1224define void @load_private_lo_v2i16_reghi_vreg(i16 addrspace(5)* byval(i16) %in, i16 %reg) #0 {
1225; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reghi_vreg:
1226; GFX900-MUBUF:       ; %bb.0: ; %entry
1227; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1228; GFX900-MUBUF-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
1229; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1230; GFX900-MUBUF-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1231; GFX900-MUBUF-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
1232; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
1233; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1234; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1235;
1236; GFX906-LABEL: load_private_lo_v2i16_reghi_vreg:
1237; GFX906:       ; %bb.0: ; %entry
1238; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1239; GFX906-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
1240; GFX906-NEXT:    s_waitcnt vmcnt(0)
1241; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1242; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
1243; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1244; GFX906-NEXT:    s_waitcnt vmcnt(0)
1245; GFX906-NEXT:    s_setpc_b64 s[30:31]
1246;
1247; GFX803-LABEL: load_private_lo_v2i16_reghi_vreg:
1248; GFX803:       ; %bb.0: ; %entry
1249; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1250; GFX803-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
1251; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
1252; GFX803-NEXT:    s_waitcnt vmcnt(0)
1253; GFX803-NEXT:    v_or_b32_e32 v0, v1, v0
1254; GFX803-NEXT:    flat_store_dword v[0:1], v0
1255; GFX803-NEXT:    s_waitcnt vmcnt(0)
1256; GFX803-NEXT:    s_setpc_b64 s[30:31]
1257;
1258; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reghi_vreg:
1259; GFX900-FLATSCR:       ; %bb.0: ; %entry
1260; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1261; GFX900-FLATSCR-NEXT:    scratch_load_ushort v1, off, s32 offset:4094
1262; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1263; GFX900-FLATSCR-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1264; GFX900-FLATSCR-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
1265; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
1266; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1267; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
1268entry:
1269  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
1270  %load = load i16, i16 addrspace(5)* %gep
  ; %reg goes into the high element first; the loaded value then fills the
  ; low element, so no lane of the result is left undef.
1271  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
1272  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
1273  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
1274  ret void
1275}
1276
; Test: load a half from private memory (addrspace(5)) and insert it into
; element 0 of a <2 x half> built from the bitcast i32 argument %reg.
; %in is a byval argument and the load is 2047 halves past it, i.e. byte
; offset 4094.
; What the autogenerated checks below pin down:
;   gfx900, MUBUF scratch   - one buffer_load_short_d16 at offset 4094.
;   gfx900, flat scratch    - one scratch_load_short_d16 at offset 4094.
;   gfx906 (+sram-ecc)      - buffer_load_ushort, then shift / and /
;                             v_lshl_or_b32 to rebuild the packed value.
;   fiji                    - buffer_load_ushort and an and/or merge.
1277define void @load_private_lo_v2f16_reglo_vreg(half addrspace(5)* byval(half) %in, i32 %reg) #0 {
1278; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg:
1279; GFX900-MUBUF:       ; %bb.0: ; %entry
1280; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1281; GFX900-MUBUF-NEXT:    buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094
1282; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1283; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
1284; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1285; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1286;
1287; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg:
1288; GFX906:       ; %bb.0: ; %entry
1289; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1290; GFX906-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
1291; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1292; GFX906-NEXT:    s_waitcnt vmcnt(0)
1293; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1294; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
1295; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1296; GFX906-NEXT:    s_waitcnt vmcnt(0)
1297; GFX906-NEXT:    s_setpc_b64 s[30:31]
1298;
1299; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg:
1300; GFX803:       ; %bb.0: ; %entry
1301; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1302; GFX803-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
1303; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
1304; GFX803-NEXT:    s_waitcnt vmcnt(0)
1305; GFX803-NEXT:    v_or_b32_e32 v0, v1, v0
1306; GFX803-NEXT:    flat_store_dword v[0:1], v0
1307; GFX803-NEXT:    s_waitcnt vmcnt(0)
1308; GFX803-NEXT:    s_setpc_b64 s[30:31]
1309;
1310; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg:
1311; GFX900-FLATSCR:       ; %bb.0: ; %entry
1312; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1313; GFX900-FLATSCR-NEXT:    scratch_load_short_d16 v0, off, s32 offset:4094
1314; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1315; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
1316; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1317; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
1318entry:
1319  %reg.bc = bitcast i32 %reg to <2 x half>
  ; 2047 halves == 4094 bytes, the offset seen in the checks above.
1320  %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2047
1321  %load = load half, half addrspace(5)* %gep
1322  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
1323  store <2 x half> %build1, <2 x half> addrspace(1)* undef
1324  ret void
1325}
1326
; Test: volatile i16 load from the constant private address 4094 (built
; with inttoptr), inserted into element 0 of a <2 x i16> built from the
; bitcast i32 argument %reg. The %in argument is not used by the body.
; What the autogenerated checks below pin down:
;   gfx900, MUBUF scratch   - buffer_load_short_d16 with soffset 0 and the
;                             constant as the immediate offset, glc set.
;   gfx900, flat scratch    - the address 0xffe is first moved into s0 with
;                             s_movk_i32, then scratch_load_short_d16 ...
;                             glc uses s0 with no immediate offset.
;   gfx906 (+sram-ecc)      - buffer_load_ushort ... glc, then v_bfi_b32.
;   fiji                    - buffer_load_ushort ... glc, then and/or.
; In the gfx906 and fiji checks the wait for the load comes directly after
; the load, before the instructions that prepare the merge.
1327define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 {
1328; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
1329; GFX900-MUBUF:       ; %bb.0: ; %entry
1330; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1331; GFX900-MUBUF-NEXT:    buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 glc
1332; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1333; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v1, off
1334; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1335; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1336;
1337; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
1338; GFX906:       ; %bb.0: ; %entry
1339; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1340; GFX906-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
1341; GFX906-NEXT:    s_waitcnt vmcnt(0)
1342; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
1343; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
1344; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1345; GFX906-NEXT:    s_waitcnt vmcnt(0)
1346; GFX906-NEXT:    s_setpc_b64 s[30:31]
1347;
1348; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
1349; GFX803:       ; %bb.0: ; %entry
1350; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1351; GFX803-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
1352; GFX803-NEXT:    s_waitcnt vmcnt(0)
1353; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
1354; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
1355; GFX803-NEXT:    flat_store_dword v[0:1], v0
1356; GFX803-NEXT:    s_waitcnt vmcnt(0)
1357; GFX803-NEXT:    s_setpc_b64 s[30:31]
1358;
1359; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
1360; GFX900-FLATSCR:       ; %bb.0: ; %entry
1361; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1362; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
1363; GFX900-FLATSCR-NEXT:    scratch_load_short_d16 v1, off, s0 glc
1364; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1365; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
1366; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1367; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
1368entry:
1369  %reg.bc = bitcast i32 %reg to <2 x i16>
  ; Absolute address 4094 (0xffe); the load is volatile, and every load in
  ; the checks above carries glc.
1370  %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
1371  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
1372  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
1373  ret void
1374}
1375
; Test: volatile i16 load from the constant private address 4094 (built
; with inttoptr), inserted into element 0 of a <2 x i16> built from the
; bitcast i32 argument %reg. The %in argument is not used by the body.
; What the autogenerated checks below pin down:
;   gfx900, MUBUF scratch   - buffer_load_short_d16 with soffset 0 and the
;                             constant as the immediate offset, glc set.
;   gfx900, flat scratch    - s_movk_i32 of 0xffe into s0, then
;                             scratch_load_short_d16 ... glc through s0.
;   gfx906 (+sram-ecc)      - buffer_load_ushort ... glc, then v_bfi_b32.
;   fiji                    - buffer_load_ushort ... glc, then and/or.
; NOTE(review): the signature and IR body are identical to
; load_private_lo_v2i16_reglo_vreg_nooff, and so is the expected output.
; The "reghi" name suggests an i16 %reg placed at index 1 (as in
; load_private_lo_v2i16_reghi_vreg) may have been intended - confirm before
; relying on this test for separate coverage.
1376define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 {
1377; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
1378; GFX900-MUBUF:       ; %bb.0: ; %entry
1379; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1380; GFX900-MUBUF-NEXT:    buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 glc
1381; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1382; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v1, off
1383; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1384; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1385;
1386; GFX906-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
1387; GFX906:       ; %bb.0: ; %entry
1388; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1389; GFX906-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
1390; GFX906-NEXT:    s_waitcnt vmcnt(0)
1391; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
1392; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
1393; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1394; GFX906-NEXT:    s_waitcnt vmcnt(0)
1395; GFX906-NEXT:    s_setpc_b64 s[30:31]
1396;
1397; GFX803-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
1398; GFX803:       ; %bb.0: ; %entry
1399; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1400; GFX803-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
1401; GFX803-NEXT:    s_waitcnt vmcnt(0)
1402; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
1403; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
1404; GFX803-NEXT:    flat_store_dword v[0:1], v0
1405; GFX803-NEXT:    s_waitcnt vmcnt(0)
1406; GFX803-NEXT:    s_setpc_b64 s[30:31]
1407;
1408; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
1409; GFX900-FLATSCR:       ; %bb.0: ; %entry
1410; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1411; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
1412; GFX900-FLATSCR-NEXT:    scratch_load_short_d16 v1, off, s0 glc
1413; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1414; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
1415; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1416; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
1417entry:
1418  %reg.bc = bitcast i32 %reg to <2 x i16>
  ; Absolute address 4094 (0xffe); the load is volatile, and every load in
  ; the checks above carries glc.
1419  %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
1420  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
1421  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
1422  ret void
1423}
1424
; Low-element insert of a half loaded from a fixed private (addrspace 5) address.
; The body volatile-loads a half from the constant address 4094 and writes it
; into element 0 of %reg reinterpreted as <2 x half>; element 1 keeps the value
; it had in %reg. The %in argument is not referenced by the body.
; Per the checks below, both gfx900 runs load with a *_load_short_d16 into v1
; and store v1 with no separate merge, whereas the gfx906 (+sram-ecc) and fiji
; runs load a plain ushort and combine it with the upper half of v1 by hand.
1425define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32 %reg) #0 {
1426; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
1427; GFX900-MUBUF:       ; %bb.0: ; %entry
1428; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1429; GFX900-MUBUF-NEXT:    buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 glc
1430; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1431; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v1, off
1432; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1433; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1434;
1435; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
1436; GFX906:       ; %bb.0: ; %entry
1437; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1438; GFX906-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
1439; GFX906-NEXT:    s_waitcnt vmcnt(0)
1440; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1441; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1442; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
1443; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1444; GFX906-NEXT:    s_waitcnt vmcnt(0)
1445; GFX906-NEXT:    s_setpc_b64 s[30:31]
1446;
1447; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
1448; GFX803:       ; %bb.0: ; %entry
1449; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1450; GFX803-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc
1451; GFX803-NEXT:    s_waitcnt vmcnt(0)
1452; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
1453; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
1454; GFX803-NEXT:    flat_store_dword v[0:1], v0
1455; GFX803-NEXT:    s_waitcnt vmcnt(0)
1456; GFX803-NEXT:    s_setpc_b64 s[30:31]
1457;
1458; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
1459; GFX900-FLATSCR:       ; %bb.0: ; %entry
1460; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1461; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
1462; GFX900-FLATSCR-NEXT:    scratch_load_short_d16 v1, off, s0 glc
1463; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1464; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
1465; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1466; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
1467entry:
1468  %reg.bc = bitcast i32 %reg to <2 x half>
  ; Volatile load from the absolute address 4094 (0xffe); every run's checks
  ; show this load carrying the glc modifier.
1469  %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
  ; Index 0 is the low element; only that lane of %reg.bc is replaced.
1470  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
  ; The destination pointer is undef.
1471  store <2 x half> %build1, <2 x half> addrspace(1)* undef
1472  ret void
1473}
1474
; Low-element insert of a zero-extended i8 loaded from a byval private argument.
; The body loads one byte at %in + 4095 (non-volatile), zero-extends it to i16
; and writes it into element 0 of %reg reinterpreted as <2 x i16>.
; In the checks the load is addressed relative to s32 with the 4095 byte
; distance carried in the instruction's offset field. The gfx900 runs use a
; *_load_ubyte_d16 straight into v0; the gfx906 (+sram-ecc) and fiji runs load
; a plain ubyte into v1 and merge it with the upper half of v0 (v_bfi_b32 or
; v_and/v_or respectively).
1475define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval(i8) %in, i32 %reg) #0 {
1476; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8:
1477; GFX900-MUBUF:       ; %bb.0: ; %entry
1478; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1479; GFX900-MUBUF-NEXT:    buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095
1480; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1481; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
1482; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1483; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1484;
1485; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8:
1486; GFX906:       ; %bb.0: ; %entry
1487; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1488; GFX906-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
1489; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
1490; GFX906-NEXT:    s_waitcnt vmcnt(0)
1491; GFX906-NEXT:    v_bfi_b32 v0, v2, v1, v0
1492; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1493; GFX906-NEXT:    s_waitcnt vmcnt(0)
1494; GFX906-NEXT:    s_setpc_b64 s[30:31]
1495;
1496; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8:
1497; GFX803:       ; %bb.0: ; %entry
1498; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1499; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
1500; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
1501; GFX803-NEXT:    s_waitcnt vmcnt(0)
1502; GFX803-NEXT:    v_or_b32_e32 v0, v1, v0
1503; GFX803-NEXT:    flat_store_dword v[0:1], v0
1504; GFX803-NEXT:    s_waitcnt vmcnt(0)
1505; GFX803-NEXT:    s_setpc_b64 s[30:31]
1506;
1507; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8:
1508; GFX900-FLATSCR:       ; %bb.0: ; %entry
1509; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1510; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16 v0, off, s32 offset:4095
1511; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1512; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
1513; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1514; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
1515entry:
1516  %reg.bc = bitcast i32 %reg to <2 x i16>
  ; Byte offset 4095 from the byval argument; it appears as offset:4095 in
  ; every run's load.
1517  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
  ; Non-volatile load: none of the expected loads carries glc here.
1518  %load = load i8, i8 addrspace(5)* %gep
1519  %ext = zext i8 %load to i16
  ; Index 0 is the low element; element 1 of %reg.bc is left as is.
1520  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
  ; The destination pointer is undef.
1521  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
1522  ret void
1523}
1524
; Low-element insert of a sign-extended i8 loaded from a byval private argument.
; Same shape as the zext variant, but the byte at %in + 4095 is sign-extended
; to i16 before being written into element 0 of %reg viewed as <2 x i16>.
; In the checks the gfx900 runs use *_load_sbyte_d16 into v0. The gfx906
; (+sram-ecc) run loads a plain sbyte and merges with v_bfi_b32 under a 0xffff
; mask. The fiji run merges with an SDWA v_or that reads only WORD_0 of the
; loaded value (NOTE(review): presumably because the sign-extending load also
; sets bits above bit 15 - confirm against the ISA documentation).
1525define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval(i8) %in, i32 %reg) #0 {
1526; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8:
1527; GFX900-MUBUF:       ; %bb.0: ; %entry
1528; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1529; GFX900-MUBUF-NEXT:    buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095
1530; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1531; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
1532; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1533; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1534;
1535; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8:
1536; GFX906:       ; %bb.0: ; %entry
1537; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1538; GFX906-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
1539; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
1540; GFX906-NEXT:    s_waitcnt vmcnt(0)
1541; GFX906-NEXT:    v_bfi_b32 v0, v2, v1, v0
1542; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1543; GFX906-NEXT:    s_waitcnt vmcnt(0)
1544; GFX906-NEXT:    s_setpc_b64 s[30:31]
1545;
1546; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8:
1547; GFX803:       ; %bb.0: ; %entry
1548; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1549; GFX803-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
1550; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
1551; GFX803-NEXT:    s_waitcnt vmcnt(0)
1552; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1553; GFX803-NEXT:    flat_store_dword v[0:1], v0
1554; GFX803-NEXT:    s_waitcnt vmcnt(0)
1555; GFX803-NEXT:    s_setpc_b64 s[30:31]
1556;
1557; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8:
1558; GFX900-FLATSCR:       ; %bb.0: ; %entry
1559; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1560; GFX900-FLATSCR-NEXT:    scratch_load_sbyte_d16 v0, off, s32 offset:4095
1561; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1562; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
1563; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1564; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
1565entry:
1566  %reg.bc = bitcast i32 %reg to <2 x i16>
  ; Byte offset 4095 from the byval argument; it appears as offset:4095 in
  ; every run's load.
1567  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
1568  %load = load i8, i8 addrspace(5)* %gep
  ; The sign extension is what turns the expected loads into the sbyte forms.
1569  %ext = sext i8 %load to i16
  ; Index 0 is the low element; element 1 of %reg.bc is left as is.
1570  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
  ; The destination pointer is undef.
1571  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
1572  ret void
1573}
1574
; Low-element insert of a zero-extended i8 loaded from a fixed private address.
; The body volatile-loads one byte from the constant addrspace(5) address 4094,
; zero-extends it to i16 and writes it into element 0 of %reg viewed as
; <2 x i16>. The %in argument is not referenced by the body.
; In the checks the gfx900 runs use *_load_ubyte_d16 into v1 (MUBUF encodes the
; address as offset:4094, flat-scratch materialises 0xffe in s0 first); the
; gfx906 (+sram-ecc) and fiji runs load a plain ubyte and merge it with the
; upper half of v1.
1575define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 {
1576; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
1577; GFX900-MUBUF:       ; %bb.0: ; %entry
1578; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1579; GFX900-MUBUF-NEXT:    buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 glc
1580; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1581; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v1, off
1582; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1583; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1584;
1585; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
1586; GFX906:       ; %bb.0: ; %entry
1587; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1588; GFX906-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
1589; GFX906-NEXT:    s_waitcnt vmcnt(0)
1590; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
1591; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
1592; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1593; GFX906-NEXT:    s_waitcnt vmcnt(0)
1594; GFX906-NEXT:    s_setpc_b64 s[30:31]
1595;
1596; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
1597; GFX803:       ; %bb.0: ; %entry
1598; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1599; GFX803-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
1600; GFX803-NEXT:    s_waitcnt vmcnt(0)
1601; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
1602; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
1603; GFX803-NEXT:    flat_store_dword v[0:1], v0
1604; GFX803-NEXT:    s_waitcnt vmcnt(0)
1605; GFX803-NEXT:    s_setpc_b64 s[30:31]
1606;
1607; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
1608; GFX900-FLATSCR:       ; %bb.0: ; %entry
1609; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1610; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
1611; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16 v1, off, s0 glc
1612; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1613; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
1614; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1615; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
1616entry:
1617  %reg.bc = bitcast i32 %reg to <2 x i16>
  ; Volatile load from the absolute address 4094 (0xffe); every run's checks
  ; show this load carrying the glc modifier.
1618  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
1619  %ext = zext i8 %load to i16
  ; Index 0 is the low element; element 1 of %reg.bc is left as is.
1620  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
  ; The destination pointer is undef.
1621  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
1622  ret void
1623}
1624
; Low-element insert of a sign-extended i8 loaded from a fixed private address.
; The body volatile-loads one byte from the constant addrspace(5) address 4094,
; sign-extends it to i16 and writes it into element 0 of %reg viewed as
; <2 x i16>. The %in argument is not referenced by the body.
; In the checks the gfx900 runs use *_load_sbyte_d16 into v1; the gfx906
; (+sram-ecc) run merges a plain sbyte load with v_bfi_b32; the fiji run merges
; with an SDWA v_or that reads only WORD_0 of the loaded value.
1625define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i32 %reg) #0 {
1626; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
1627; GFX900-MUBUF:       ; %bb.0: ; %entry
1628; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1629; GFX900-MUBUF-NEXT:    buffer_load_sbyte_d16 v1, off, s[0:3], 0 offset:4094 glc
1630; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1631; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v1, off
1632; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1633; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1634;
1635; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
1636; GFX906:       ; %bb.0: ; %entry
1637; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1638; GFX906-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc
1639; GFX906-NEXT:    s_waitcnt vmcnt(0)
1640; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
1641; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
1642; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1643; GFX906-NEXT:    s_waitcnt vmcnt(0)
1644; GFX906-NEXT:    s_setpc_b64 s[30:31]
1645;
1646; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
1647; GFX803:       ; %bb.0: ; %entry
1648; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1649; GFX803-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc
1650; GFX803-NEXT:    s_waitcnt vmcnt(0)
1651; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
1652; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1653; GFX803-NEXT:    flat_store_dword v[0:1], v0
1654; GFX803-NEXT:    s_waitcnt vmcnt(0)
1655; GFX803-NEXT:    s_setpc_b64 s[30:31]
1656;
1657; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
1658; GFX900-FLATSCR:       ; %bb.0: ; %entry
1659; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1660; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
1661; GFX900-FLATSCR-NEXT:    scratch_load_sbyte_d16 v1, off, s0 glc
1662; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1663; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
1664; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1665; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
1666entry:
1667  %reg.bc = bitcast i32 %reg to <2 x i16>
  ; Volatile load from the absolute address 4094 (0xffe); every run's checks
  ; show this load carrying the glc modifier.
1668  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  ; The sign extension is what turns the expected loads into the sbyte forms.
1669  %ext = sext i8 %load to i16
  ; Index 0 is the low element; element 1 of %reg.bc is left as is.
1670  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
  ; The destination pointer is undef.
1671  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
1672  ret void
1673}
1674
; Low-element insert into a <2 x half> of a zero-extended i8 from a fixed
; private address. The body volatile-loads one byte from the constant
; addrspace(5) address 4094, zero-extends it to i16, bitcasts that i16 to half
; and writes it into element 0 of %reg viewed as <2 x half>. The %in argument
; is not referenced by the body.
; In the checks the gfx900 runs still use *_load_ubyte_d16 into v1 despite the
; half element type; the gfx906 (+sram-ecc) run rebuilds the dword with
; v_lshrrev/v_and/v_lshl_or, and the fiji run uses v_and/v_or.
1675define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 {
1676; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
1677; GFX900-MUBUF:       ; %bb.0: ; %entry
1678; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1679; GFX900-MUBUF-NEXT:    buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 glc
1680; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1681; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v1, off
1682; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1683; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1684;
1685; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
1686; GFX906:       ; %bb.0: ; %entry
1687; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1688; GFX906-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
1689; GFX906-NEXT:    s_waitcnt vmcnt(0)
1690; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1691; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1692; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
1693; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1694; GFX906-NEXT:    s_waitcnt vmcnt(0)
1695; GFX906-NEXT:    s_setpc_b64 s[30:31]
1696;
1697; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
1698; GFX803:       ; %bb.0: ; %entry
1699; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1700; GFX803-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc
1701; GFX803-NEXT:    s_waitcnt vmcnt(0)
1702; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
1703; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
1704; GFX803-NEXT:    flat_store_dword v[0:1], v0
1705; GFX803-NEXT:    s_waitcnt vmcnt(0)
1706; GFX803-NEXT:    s_setpc_b64 s[30:31]
1707;
1708; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
1709; GFX900-FLATSCR:       ; %bb.0: ; %entry
1710; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1711; GFX900-FLATSCR-NEXT:    s_movk_i32 s0, 0xffe
1712; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16 v1, off, s0 glc
1713; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1714; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v1, off
1715; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1716; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
1717entry:
1718  %reg.bc = bitcast i32 %reg to <2 x half>
  ; Volatile load from the absolute address 4094 (0xffe); every run's checks
  ; show this load carrying the glc modifier.
1719  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
1720  %ext = zext i8 %load to i16
  ; Reinterpret the 16 integer bits as a half; this is a bitcast, not a
  ; numeric int-to-float conversion.
1721  %bc.ext = bitcast i16 %ext to half
  ; Index 0 is the low element; element 1 of %reg.bc is left as is.
1722  %build1 = insertelement <2 x half> %reg.bc, half %bc.ext, i32 0
  ; The destination pointer is undef.
1723  store <2 x half> %build1, <2 x half> addrspace(1)* undef
1724  ret void
1725}
1726
; Low-element insert of an i16 loaded from constant address space (addrspace 4).
; The body loads the i16 located 2047 elements (4094 bytes) before %in and
; writes it into element 0 of %reg viewed as <2 x i16>.
; Both gfx900 runs share one check prefix here. They expect a
; global_load_short_d16 into v2 with the displacement carried as offset:-4094.
; The gfx906 (+sram-ecc) run expects a plain global_load_ushort with the same
; offset followed by a v_bfi_b32 merge. The fiji run expects the address to be
; formed with a 64-bit add of 0xfffff002 before a flat_load_ushort, then a
; v_and/v_or merge.
1727define void @load_constant_lo_v2i16_reglo_vreg(i16 addrspace(4)* %in, i32 %reg) #0 {
1728; GFX900-LABEL: load_constant_lo_v2i16_reglo_vreg:
1729; GFX900:       ; %bb.0: ; %entry
1730; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1731; GFX900-NEXT:    global_load_short_d16 v2, v[0:1], off offset:-4094
1732; GFX900-NEXT:    s_waitcnt vmcnt(0)
1733; GFX900-NEXT:    global_store_dword v[0:1], v2, off
1734; GFX900-NEXT:    s_waitcnt vmcnt(0)
1735; GFX900-NEXT:    s_setpc_b64 s[30:31]
1736;
1737; GFX906-LABEL: load_constant_lo_v2i16_reglo_vreg:
1738; GFX906:       ; %bb.0: ; %entry
1739; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1740; GFX906-NEXT:    global_load_ushort v0, v[0:1], off offset:-4094
1741; GFX906-NEXT:    v_mov_b32_e32 v1, 0xffff
1742; GFX906-NEXT:    s_waitcnt vmcnt(0)
1743; GFX906-NEXT:    v_bfi_b32 v0, v1, v0, v2
1744; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1745; GFX906-NEXT:    s_waitcnt vmcnt(0)
1746; GFX906-NEXT:    s_setpc_b64 s[30:31]
1747;
1748; GFX803-LABEL: load_constant_lo_v2i16_reglo_vreg:
1749; GFX803:       ; %bb.0: ; %entry
1750; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1751; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
1752; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
1753; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
1754; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
1755; GFX803-NEXT:    s_waitcnt vmcnt(0)
1756; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
1757; GFX803-NEXT:    flat_store_dword v[0:1], v0
1758; GFX803-NEXT:    s_waitcnt vmcnt(0)
1759; GFX803-NEXT:    s_setpc_b64 s[30:31]
1760entry:
1761  %reg.bc = bitcast i32 %reg to <2 x i16>
  ; -2047 i16 elements = -4094 bytes, matching offset:-4094 in the gfx9 checks
  ; and the 0xfffff002 addend in the fiji checks.
1762  %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047
  ; Non-volatile load: none of the expected loads carries glc.
1763  %load = load i16, i16 addrspace(4)* %gep
  ; Index 0 is the low element; element 1 of %reg.bc is left as is.
1764  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
  ; The destination pointer is undef.
1765  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
1766  ret void
1767}
1768
; Low-element insert of a half loaded from constant address space (addrspace 4).
; Half-typed counterpart of the i16 constant-load case: the half located 2047
; elements (4094 bytes) before %in is written into element 0 of %reg viewed as
; <2 x half>.
; Both gfx900 runs share one check prefix and expect a global_load_short_d16
; into v2 with offset:-4094. The gfx906 (+sram-ecc) run expects a plain ushort
; load and a v_lshrrev/v_and/v_lshl_or rebuild of the dword. The fiji run forms
; the address with a 64-bit add of 0xfffff002, loads with flat_load_ushort and
; merges with v_and/v_or.
1769define void @load_constant_lo_v2f16_reglo_vreg(half addrspace(4)* %in, i32 %reg) #0 {
1770; GFX900-LABEL: load_constant_lo_v2f16_reglo_vreg:
1771; GFX900:       ; %bb.0: ; %entry
1772; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1773; GFX900-NEXT:    global_load_short_d16 v2, v[0:1], off offset:-4094
1774; GFX900-NEXT:    s_waitcnt vmcnt(0)
1775; GFX900-NEXT:    global_store_dword v[0:1], v2, off
1776; GFX900-NEXT:    s_waitcnt vmcnt(0)
1777; GFX900-NEXT:    s_setpc_b64 s[30:31]
1778;
1779; GFX906-LABEL: load_constant_lo_v2f16_reglo_vreg:
1780; GFX906:       ; %bb.0: ; %entry
1781; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1782; GFX906-NEXT:    global_load_ushort v0, v[0:1], off offset:-4094
1783; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1784; GFX906-NEXT:    s_waitcnt vmcnt(0)
1785; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1786; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
1787; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1788; GFX906-NEXT:    s_waitcnt vmcnt(0)
1789; GFX906-NEXT:    s_setpc_b64 s[30:31]
1790;
1791; GFX803-LABEL: load_constant_lo_v2f16_reglo_vreg:
1792; GFX803:       ; %bb.0: ; %entry
1793; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1794; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
1795; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
1796; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
1797; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
1798; GFX803-NEXT:    s_waitcnt vmcnt(0)
1799; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
1800; GFX803-NEXT:    flat_store_dword v[0:1], v0
1801; GFX803-NEXT:    s_waitcnt vmcnt(0)
1802; GFX803-NEXT:    s_setpc_b64 s[30:31]
1803entry:
1804  %reg.bc = bitcast i32 %reg to <2 x half>
  ; -2047 half elements = -4094 bytes, matching offset:-4094 in the gfx9 checks
  ; and the 0xfffff002 addend in the fiji checks.
1805  %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047
  ; Non-volatile load: none of the expected loads carries glc.
1806  %load = load half, half addrspace(4)* %gep
  ; Index 0 is the low element; element 1 of %reg.bc is left as is.
1807  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
  ; The destination pointer is undef.
1808  store <2 x half> %build1, <2 x half> addrspace(1)* undef
1809  ret void
1810}
1811
; Low-element insert into a <2 x half> of a zero-extended i8 from constant
; address space (addrspace 4). The byte located 4095 bytes before %in is
; zero-extended to i16, bitcast to half and written into element 0 of %reg
; viewed as <2 x half>.
; Both gfx900 runs share one check prefix and expect a global_load_ubyte_d16
; into v2 with offset:-4095. The gfx906 (+sram-ecc) run expects a plain ubyte
; load and a v_lshrrev/v_and/v_lshl_or rebuild. The fiji run forms the address
; with a 64-bit add of 0xfffff001, loads with flat_load_ubyte and merges with
; v_and/v_or.
1812define void @load_constant_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(4)* %in, i32 %reg) #0 {
1813; GFX900-LABEL: load_constant_lo_v2f16_reglo_vreg_zexti8:
1814; GFX900:       ; %bb.0: ; %entry
1815; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1816; GFX900-NEXT:    global_load_ubyte_d16 v2, v[0:1], off offset:-4095
1817; GFX900-NEXT:    s_waitcnt vmcnt(0)
1818; GFX900-NEXT:    global_store_dword v[0:1], v2, off
1819; GFX900-NEXT:    s_waitcnt vmcnt(0)
1820; GFX900-NEXT:    s_setpc_b64 s[30:31]
1821;
1822; GFX906-LABEL: load_constant_lo_v2f16_reglo_vreg_zexti8:
1823; GFX906:       ; %bb.0: ; %entry
1824; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1825; GFX906-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4095
1826; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1827; GFX906-NEXT:    s_waitcnt vmcnt(0)
1828; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1829; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
1830; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1831; GFX906-NEXT:    s_waitcnt vmcnt(0)
1832; GFX906-NEXT:    s_setpc_b64 s[30:31]
1833;
1834; GFX803-LABEL: load_constant_lo_v2f16_reglo_vreg_zexti8:
1835; GFX803:       ; %bb.0: ; %entry
1836; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1837; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
1838; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
1839; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
1840; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
1841; GFX803-NEXT:    s_waitcnt vmcnt(0)
1842; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
1843; GFX803-NEXT:    flat_store_dword v[0:1], v0
1844; GFX803-NEXT:    s_waitcnt vmcnt(0)
1845; GFX803-NEXT:    s_setpc_b64 s[30:31]
1846entry:
1847  %reg.bc = bitcast i32 %reg to <2 x half>
  ; -4095 bytes, matching offset:-4095 in the gfx9 checks and the 0xfffff001
  ; addend in the fiji checks.
1848  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
1849  %load = load i8, i8 addrspace(4)* %gep
1850  %ext = zext i8 %load to i16
  ; Reinterpret the 16 integer bits as a half; this is a bitcast, not a
  ; numeric int-to-float conversion.
1851  %bitcast = bitcast i16 %ext to half
  ; Index 0 is the low element; element 1 of %reg.bc is left as is.
1852  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
  ; The destination pointer is undef.
1853  store <2 x half> %build1, <2 x half> addrspace(1)* undef
1854  ret void
1855}
1856
; Low-element insert into a <2 x half> of a sign-extended i8 from constant
; address space (addrspace 4). The byte located 4095 bytes before %in is
; sign-extended to i16, bitcast to half and written into element 0 of %reg
; viewed as <2 x half>.
; Both gfx900 runs share one check prefix and expect a global_load_sbyte_d16
; into v2 with offset:-4095. The gfx906 (+sram-ecc) run expects a plain sbyte
; load, masked to 16 bits with v_and before v_lshl_or. The fiji run forms the
; address with a 64-bit add of 0xfffff001, loads with flat_load_sbyte and
; merges with an SDWA v_or that reads only WORD_0 of the loaded value.
1857define void @load_constant_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(4)* %in, i32 %reg) #0 {
1858; GFX900-LABEL: load_constant_lo_v2f16_reglo_vreg_sexti8:
1859; GFX900:       ; %bb.0: ; %entry
1860; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1861; GFX900-NEXT:    global_load_sbyte_d16 v2, v[0:1], off offset:-4095
1862; GFX900-NEXT:    s_waitcnt vmcnt(0)
1863; GFX900-NEXT:    global_store_dword v[0:1], v2, off
1864; GFX900-NEXT:    s_waitcnt vmcnt(0)
1865; GFX900-NEXT:    s_setpc_b64 s[30:31]
1866;
1867; GFX906-LABEL: load_constant_lo_v2f16_reglo_vreg_sexti8:
1868; GFX906:       ; %bb.0: ; %entry
1869; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1870; GFX906-NEXT:    global_load_sbyte v0, v[0:1], off offset:-4095
1871; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1872; GFX906-NEXT:    s_waitcnt vmcnt(0)
1873; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1874; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
1875; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1876; GFX906-NEXT:    s_waitcnt vmcnt(0)
1877; GFX906-NEXT:    s_setpc_b64 s[30:31]
1878;
1879; GFX803-LABEL: load_constant_lo_v2f16_reglo_vreg_sexti8:
1880; GFX803:       ; %bb.0: ; %entry
1881; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1882; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
1883; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
1884; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
1885; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
1886; GFX803-NEXT:    s_waitcnt vmcnt(0)
1887; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1888; GFX803-NEXT:    flat_store_dword v[0:1], v0
1889; GFX803-NEXT:    s_waitcnt vmcnt(0)
1890; GFX803-NEXT:    s_setpc_b64 s[30:31]
1891entry:
1892  %reg.bc = bitcast i32 %reg to <2 x half>
  ; -4095 bytes, matching offset:-4095 in the gfx9 checks and the 0xfffff001
  ; addend in the fiji checks.
1893  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
1894  %load = load i8, i8 addrspace(4)* %gep
  ; The sign extension is what turns the expected loads into the sbyte forms.
1895  %ext = sext i8 %load to i16
  ; Reinterpret the 16 integer bits as a half; this is a bitcast, not a
  ; numeric int-to-float conversion.
1896  %bitcast = bitcast i16 %ext to half
  ; Index 0 is the low element; element 1 of %reg.bc is left as is.
1897  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
  ; The destination pointer is undef.
1898  store <2 x half> %build1, <2 x half> addrspace(1)* undef
1899  ret void
1900}
1901
; Low-element insert of an i16 loaded from a stack object at a large offset.
; Two private allocas are created: a 40-byte [10 x i32] (%obj0) and an 8192-byte
; [4096 x i16] (%obj1). A volatile store of 123 to %obj0 is followed by a
; volatile i16 load from element 2027 of %obj1 (byte offset 4054), whose value
; replaces element 0 of %reg viewed as <2 x i16>.
; In the checks the store goes to s32 offset:4 and the load is addressed as a
; base of 44 plus an immediate offset:4054. MUBUF runs put 44 in a VGPR and use
; offen; the flat-scratch run computes s32 + 44 into vcc_hi. The gfx900 runs
; use *_load_short_d16 into v0; the gfx906 (+sram-ecc) and fiji runs load a
; plain ushort and merge it with the upper half of v0.
1902define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 {
1903; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset:
1904; GFX900-MUBUF:       ; %bb.0: ; %entry
1905; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1906; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 0x7b
1907; GFX900-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
1908; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1909; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 44
1910; GFX900-MUBUF-NEXT:    buffer_load_short_d16 v0, v1, s[0:3], s32 offen offset:4054 glc
1911; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1912; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
1913; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1914; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1915;
1916; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset:
1917; GFX906:       ; %bb.0: ; %entry
1918; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1919; GFX906-NEXT:    v_mov_b32_e32 v1, 0x7b
1920; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
1921; GFX906-NEXT:    s_waitcnt vmcnt(0)
1922; GFX906-NEXT:    v_mov_b32_e32 v3, 44
1923; GFX906-NEXT:    buffer_load_ushort v1, v3, s[0:3], s32 offen offset:4054 glc
1924; GFX906-NEXT:    s_waitcnt vmcnt(0)
1925; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
1926; GFX906-NEXT:    v_bfi_b32 v0, v2, v1, v0
1927; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1928; GFX906-NEXT:    s_waitcnt vmcnt(0)
1929; GFX906-NEXT:    s_setpc_b64 s[30:31]
1930;
1931; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset:
1932; GFX803:       ; %bb.0: ; %entry
1933; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1934; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b
1935; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
1936; GFX803-NEXT:    s_waitcnt vmcnt(0)
1937; GFX803-NEXT:    v_mov_b32_e32 v2, 44
1938; GFX803-NEXT:    buffer_load_ushort v1, v2, s[0:3], s32 offen offset:4054 glc
1939; GFX803-NEXT:    s_waitcnt vmcnt(0)
1940; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
1941; GFX803-NEXT:    v_or_b32_e32 v0, v1, v0
1942; GFX803-NEXT:    flat_store_dword v[0:1], v0
1943; GFX803-NEXT:    s_waitcnt vmcnt(0)
1944; GFX803-NEXT:    s_setpc_b64 s[30:31]
1945;
1946; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset:
1947; GFX900-FLATSCR:       ; %bb.0: ; %entry
1948; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1949; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0x7b
1950; GFX900-FLATSCR-NEXT:    scratch_store_dword off, v1, s32 offset:4
1951; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1952; GFX900-FLATSCR-NEXT:    s_add_i32 vcc_hi, s32, 44
1953; GFX900-FLATSCR-NEXT:    scratch_load_short_d16 v0, off, vcc_hi offset:4054 glc
1954; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1955; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
1956; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
1957; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
1958entry:
  ; 10 * 4 = 40 bytes; the checks place its first dword at s32 offset:4.
1959  %obj0 = alloca [10 x i32], align 4, addrspace(5)
  ; 4096 * 2 = 8192 bytes; the base of 44 in the checks is consistent with this
  ; object following %obj0 (4 + 40).
1960  %obj1 = alloca [4096 x i16], align 2, addrspace(5)
1961  %reg.bc = bitcast i32 %reg to <2 x i16>
1962  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  ; 123 is the 0x7b materialised in v1 by every run's checks.
1963  store volatile i32 123, i32 addrspace(5)* %bc
  ; Element 2027 of an i16 array = byte offset 4054, the immediate offset seen
  ; on every expected load.
1964  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027
  ; Volatile load: every run's checks show it carrying the glc modifier.
1965  %load = load volatile i16, i16 addrspace(5)* %gep
  ; Index 0 is the low element; element 1 of %reg.bc is left as is.
1966  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
  ; The destination pointer is undef.
1967  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
1968  ret void
1969}
1970
; Low-element insert of a sign-extended i8 loaded from a stack object at a
; large offset. Two private allocas are created: a 40-byte [10 x i32] (%obj0)
; and a 4096-byte [4096 x i8] (%obj1). A volatile store of 123 to %obj0 is
; followed by a volatile i8 load from element 4055 of %obj1, sign-extended to
; i16 and written into element 0 of %reg viewed as <2 x i16>.
; In the checks the store goes to s32 offset:4 and the load is addressed as a
; base of 44 plus an immediate offset:4055. MUBUF runs put 44 in a VGPR and use
; offen; the flat-scratch run computes s32 + 44 into vcc_hi. The gfx900 runs
; use *_load_sbyte_d16 into v0; the gfx906 (+sram-ecc) run merges a plain sbyte
; load with v_bfi_b32; the fiji run merges with an SDWA v_or that reads only
; WORD_0 of the loaded value.
1971define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
1972; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset:
1973; GFX900-MUBUF:       ; %bb.0: ; %entry
1974; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1975; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 0x7b
1976; GFX900-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
1977; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1978; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 44
1979; GFX900-MUBUF-NEXT:    buffer_load_sbyte_d16 v0, v1, s[0:3], s32 offen offset:4055 glc
1980; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1981; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
1982; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
1983; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
1984;
1985; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset:
1986; GFX906:       ; %bb.0: ; %entry
1987; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1988; GFX906-NEXT:    v_mov_b32_e32 v1, 0x7b
1989; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
1990; GFX906-NEXT:    s_waitcnt vmcnt(0)
1991; GFX906-NEXT:    v_mov_b32_e32 v3, 44
1992; GFX906-NEXT:    buffer_load_sbyte v1, v3, s[0:3], s32 offen offset:4055 glc
1993; GFX906-NEXT:    s_waitcnt vmcnt(0)
1994; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
1995; GFX906-NEXT:    v_bfi_b32 v0, v2, v1, v0
1996; GFX906-NEXT:    global_store_dword v[0:1], v0, off
1997; GFX906-NEXT:    s_waitcnt vmcnt(0)
1998; GFX906-NEXT:    s_setpc_b64 s[30:31]
1999;
2000; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset:
2001; GFX803:       ; %bb.0: ; %entry
2002; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2003; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b
2004; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
2005; GFX803-NEXT:    s_waitcnt vmcnt(0)
2006; GFX803-NEXT:    v_mov_b32_e32 v2, 44
2007; GFX803-NEXT:    buffer_load_sbyte v1, v2, s[0:3], s32 offen offset:4055 glc
2008; GFX803-NEXT:    s_waitcnt vmcnt(0)
2009; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
2010; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2011; GFX803-NEXT:    flat_store_dword v[0:1], v0
2012; GFX803-NEXT:    s_waitcnt vmcnt(0)
2013; GFX803-NEXT:    s_setpc_b64 s[30:31]
2014;
2015; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset:
2016; GFX900-FLATSCR:       ; %bb.0: ; %entry
2017; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2018; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0x7b
2019; GFX900-FLATSCR-NEXT:    scratch_store_dword off, v1, s32 offset:4
2020; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
2021; GFX900-FLATSCR-NEXT:    s_add_i32 vcc_hi, s32, 44
2022; GFX900-FLATSCR-NEXT:    scratch_load_sbyte_d16 v0, off, vcc_hi offset:4055 glc
2023; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
2024; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
2025; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
2026; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
2027entry:
  ; 10 * 4 = 40 bytes; the checks place its first dword at s32 offset:4.
2028  %obj0 = alloca [10 x i32], align 4, addrspace(5)
  ; 4096 bytes; the base of 44 in the checks is consistent with this object
  ; following %obj0 (4 + 40).
2029  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
2030  %reg.bc = bitcast i32 %reg to <2 x i16>
2031  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  ; 123 is the 0x7b materialised in v1 by every run's checks.
2032  store volatile i32 123, i32 addrspace(5)* %bc
  ; Element 4055 of an i8 array = byte offset 4055, the immediate offset seen
  ; on every expected load.
2033  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
  ; Volatile load: every run's checks show it carrying the glc modifier.
2034  %load = load volatile i8, i8 addrspace(5)* %gep
  ; The sign extension is what turns the expected loads into the sbyte forms.
2035  %load.ext = sext i8 %load to i16
  ; Index 0 is the low element; element 1 of %reg.bc is left as is.
2036  %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0
  ; The destination pointer is undef.
2037  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
2038  ret void
2039}
2040
; Test: insert a zero-extended i8 loaded from private (addrspace(5)) memory
; into the low element of a <2 x i16> whose contents come from %reg.
;
; The i8 is volatile-loaded from byte index 4055 of %obj1, zero-extended to
; i16 and written to element 0; element 1 keeps the value from %reg.
; %obj0 is a second stack object that receives a volatile store first
; (presumably so that %obj1 does not sit at the start of the frame and the
; load needs a base plus the 4055 offset -- TODO confirm intent).
;
; What the CHECK lines below record:
;   * GFX900 (MUBUF and FLATSCR): a single *_load_ubyte_d16 into v0 with
;     offset:4055, no separate merge instruction.
;   * GFX906: buffer_load_ubyte into v1, then v_bfi_b32 to merge with v0.
;     NOTE(review): this run uses +sram-ecc (see RUN line); presumably that is
;     why no d16 load is selected -- confirm against the backend.
;   * GFX803: buffer_load_ubyte into v1, then v_and_b32 / v_or_b32 to merge.
2041define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
2042; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset:
2043; GFX900-MUBUF:       ; %bb.0: ; %entry
2044; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2045; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 0x7b
2046; GFX900-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
2047; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
2048; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 44
2049; GFX900-MUBUF-NEXT:    buffer_load_ubyte_d16 v0, v1, s[0:3], s32 offen offset:4055 glc
2050; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
2051; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
2052; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
2053; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
2054;
2055; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset:
2056; GFX906:       ; %bb.0: ; %entry
2057; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2058; GFX906-NEXT:    v_mov_b32_e32 v1, 0x7b
2059; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
2060; GFX906-NEXT:    s_waitcnt vmcnt(0)
2061; GFX906-NEXT:    v_mov_b32_e32 v3, 44
2062; GFX906-NEXT:    buffer_load_ubyte v1, v3, s[0:3], s32 offen offset:4055 glc
2063; GFX906-NEXT:    s_waitcnt vmcnt(0)
2064; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
2065; GFX906-NEXT:    v_bfi_b32 v0, v2, v1, v0
2066; GFX906-NEXT:    global_store_dword v[0:1], v0, off
2067; GFX906-NEXT:    s_waitcnt vmcnt(0)
2068; GFX906-NEXT:    s_setpc_b64 s[30:31]
2069;
2070; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset:
2071; GFX803:       ; %bb.0: ; %entry
2072; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2073; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b
2074; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
2075; GFX803-NEXT:    s_waitcnt vmcnt(0)
2076; GFX803-NEXT:    v_mov_b32_e32 v2, 44
2077; GFX803-NEXT:    buffer_load_ubyte v1, v2, s[0:3], s32 offen offset:4055 glc
2078; GFX803-NEXT:    s_waitcnt vmcnt(0)
2079; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
2080; GFX803-NEXT:    v_or_b32_e32 v0, v1, v0
2081; GFX803-NEXT:    flat_store_dword v[0:1], v0
2082; GFX803-NEXT:    s_waitcnt vmcnt(0)
2083; GFX803-NEXT:    s_setpc_b64 s[30:31]
2084;
2085; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset:
2086; GFX900-FLATSCR:       ; %bb.0: ; %entry
2087; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2088; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0x7b
2089; GFX900-FLATSCR-NEXT:    scratch_store_dword off, v1, s32 offset:4
2090; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
2091; GFX900-FLATSCR-NEXT:    s_add_i32 vcc_hi, s32, 44
2092; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16 v0, off, vcc_hi offset:4055 glc
2093; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
2094; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
2095; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
2096; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
2097entry:
; Two addrspace(5) stack objects: a 40-byte i32 array and a 4096-byte i8 array.
2098  %obj0 = alloca [10 x i32], align 4, addrspace(5)
2099  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
; Reinterpret the 32-bit argument as the vector whose high half must survive.
2100  %reg.bc = bitcast i32 %reg to <2 x i16>
; Volatile store of 123 (0x7b in the CHECK lines) to the start of %obj0.
2101  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
2102  store volatile i32 123, i32 addrspace(5)* %bc
; Volatile byte load from index 4055 of %obj1 (the offset:4055 in the checks).
2103  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
2104  %load = load volatile i8, i8 addrspace(5)* %gep
; Zero-extend to 16 bits and overwrite only element 0 of the vector.
2105  %load.ext = zext i8 %load to i16
2106  %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0
; Store the combined vector through an undef addrspace(1) pointer.
2107  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
2108  ret void
2109}
2110
; Test: insert a sign-extended i8 loaded from private (addrspace(5)) memory
; into the low element of a <2 x half> whose contents come from %reg.
;
; The i8 is volatile-loaded from byte index 4055 of %obj1, sign-extended to
; i16, bitcast to half and written to element 0; element 1 keeps the value
; from %reg. %obj0 is a second stack object that receives a volatile store
; first (presumably so that %obj1 does not sit at the start of the frame --
; TODO confirm intent).
;
; What the CHECK lines below record:
;   * GFX900 (MUBUF and FLATSCR): a single *_load_sbyte_d16 into v0 with
;     offset:4055, no separate merge instruction.
;   * GFX906: buffer_load_sbyte into v1, then v_lshrrev_b32 / v_and_b32 /
;     v_lshl_or_b32 to rebuild the 32-bit value.
;     NOTE(review): this run uses +sram-ecc (see RUN line); presumably that is
;     why no d16 load is selected -- confirm against the backend.
;   * GFX803: buffer_load_sbyte into v1, then v_and_b32 and an SDWA v_or_b32
;     that reads only WORD_0 of the loaded value.
2111define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
2112; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset:
2113; GFX900-MUBUF:       ; %bb.0: ; %entry
2114; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2115; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 0x7b
2116; GFX900-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
2117; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
2118; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 44
2119; GFX900-MUBUF-NEXT:    buffer_load_sbyte_d16 v0, v1, s[0:3], s32 offen offset:4055 glc
2120; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
2121; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
2122; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
2123; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
2124;
2125; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset:
2126; GFX906:       ; %bb.0: ; %entry
2127; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2128; GFX906-NEXT:    v_mov_b32_e32 v1, 0x7b
2129; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
2130; GFX906-NEXT:    s_waitcnt vmcnt(0)
2131; GFX906-NEXT:    v_mov_b32_e32 v2, 44
2132; GFX906-NEXT:    buffer_load_sbyte v1, v2, s[0:3], s32 offen offset:4055 glc
2133; GFX906-NEXT:    s_waitcnt vmcnt(0)
2134; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2135; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2136; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
2137; GFX906-NEXT:    global_store_dword v[0:1], v0, off
2138; GFX906-NEXT:    s_waitcnt vmcnt(0)
2139; GFX906-NEXT:    s_setpc_b64 s[30:31]
2140;
2141; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset:
2142; GFX803:       ; %bb.0: ; %entry
2143; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2144; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b
2145; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
2146; GFX803-NEXT:    s_waitcnt vmcnt(0)
2147; GFX803-NEXT:    v_mov_b32_e32 v2, 44
2148; GFX803-NEXT:    buffer_load_sbyte v1, v2, s[0:3], s32 offen offset:4055 glc
2149; GFX803-NEXT:    s_waitcnt vmcnt(0)
2150; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
2151; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2152; GFX803-NEXT:    flat_store_dword v[0:1], v0
2153; GFX803-NEXT:    s_waitcnt vmcnt(0)
2154; GFX803-NEXT:    s_setpc_b64 s[30:31]
2155;
2156; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset:
2157; GFX900-FLATSCR:       ; %bb.0: ; %entry
2158; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2159; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0x7b
2160; GFX900-FLATSCR-NEXT:    scratch_store_dword off, v1, s32 offset:4
2161; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
2162; GFX900-FLATSCR-NEXT:    s_add_i32 vcc_hi, s32, 44
2163; GFX900-FLATSCR-NEXT:    scratch_load_sbyte_d16 v0, off, vcc_hi offset:4055 glc
2164; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
2165; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
2166; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
2167; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
2168entry:
; Two addrspace(5) stack objects: a 40-byte i32 array and a 4096-byte i8 array.
2169  %obj0 = alloca [10 x i32], align 4, addrspace(5)
2170  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
; Reinterpret the 32-bit argument as the vector whose high half must survive.
2171  %reg.bc = bitcast i32 %reg to <2 x half>
; Volatile store of 123 (0x7b in the CHECK lines) to the start of %obj0.
2172  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
2173  store volatile i32 123, i32 addrspace(5)* %bc
; Volatile byte load from index 4055 of %obj1 (the offset:4055 in the checks).
2174  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
2175  %load = load volatile i8, i8 addrspace(5)* %gep
; Sign-extend to 16 bits, then reinterpret those bits as a half (no conversion).
2176  %load.ext = sext i8 %load to i16
2177  %bitcast = bitcast i16 %load.ext to half
; Overwrite only element 0 of the vector.
2178  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
; Store the combined vector through an undef addrspace(1) pointer.
2179  store <2 x half> %build1, <2 x half> addrspace(1)* undef
2180  ret void
2181}
2182
; Test: insert a zero-extended i8 loaded from private (addrspace(5)) memory
; into the low element of a <2 x half> whose contents come from %reg.
;
; The i8 is volatile-loaded from byte index 4055 of %obj1, zero-extended to
; i16, bitcast to half and written to element 0; element 1 keeps the value
; from %reg. %obj0 is a second stack object that receives a volatile store
; first (presumably so that %obj1 does not sit at the start of the frame --
; TODO confirm intent).
;
; What the CHECK lines below record:
;   * GFX900 (MUBUF and FLATSCR): a single *_load_ubyte_d16 into v0 with
;     offset:4055, no separate merge instruction.
;   * GFX906: buffer_load_ubyte into v1, then v_lshrrev_b32 / v_and_b32 /
;     v_lshl_or_b32 to rebuild the 32-bit value.
;     NOTE(review): this run uses +sram-ecc (see RUN line); presumably that is
;     why no d16 load is selected -- confirm against the backend.
;   * GFX803: buffer_load_ubyte into v1, then v_and_b32 / v_or_b32 to merge.
2183define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
2184; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset:
2185; GFX900-MUBUF:       ; %bb.0: ; %entry
2186; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2187; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 0x7b
2188; GFX900-MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
2189; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
2190; GFX900-MUBUF-NEXT:    v_mov_b32_e32 v1, 44
2191; GFX900-MUBUF-NEXT:    buffer_load_ubyte_d16 v0, v1, s[0:3], s32 offen offset:4055 glc
2192; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
2193; GFX900-MUBUF-NEXT:    global_store_dword v[0:1], v0, off
2194; GFX900-MUBUF-NEXT:    s_waitcnt vmcnt(0)
2195; GFX900-MUBUF-NEXT:    s_setpc_b64 s[30:31]
2196;
2197; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset:
2198; GFX906:       ; %bb.0: ; %entry
2199; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2200; GFX906-NEXT:    v_mov_b32_e32 v1, 0x7b
2201; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
2202; GFX906-NEXT:    s_waitcnt vmcnt(0)
2203; GFX906-NEXT:    v_mov_b32_e32 v2, 44
2204; GFX906-NEXT:    buffer_load_ubyte v1, v2, s[0:3], s32 offen offset:4055 glc
2205; GFX906-NEXT:    s_waitcnt vmcnt(0)
2206; GFX906-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2207; GFX906-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2208; GFX906-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
2209; GFX906-NEXT:    global_store_dword v[0:1], v0, off
2210; GFX906-NEXT:    s_waitcnt vmcnt(0)
2211; GFX906-NEXT:    s_setpc_b64 s[30:31]
2212;
2213; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset:
2214; GFX803:       ; %bb.0: ; %entry
2215; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2216; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b
2217; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
2218; GFX803-NEXT:    s_waitcnt vmcnt(0)
2219; GFX803-NEXT:    v_mov_b32_e32 v2, 44
2220; GFX803-NEXT:    buffer_load_ubyte v1, v2, s[0:3], s32 offen offset:4055 glc
2221; GFX803-NEXT:    s_waitcnt vmcnt(0)
2222; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
2223; GFX803-NEXT:    v_or_b32_e32 v0, v1, v0
2224; GFX803-NEXT:    flat_store_dword v[0:1], v0
2225; GFX803-NEXT:    s_waitcnt vmcnt(0)
2226; GFX803-NEXT:    s_setpc_b64 s[30:31]
2227;
2228; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset:
2229; GFX900-FLATSCR:       ; %bb.0: ; %entry
2230; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2231; GFX900-FLATSCR-NEXT:    v_mov_b32_e32 v1, 0x7b
2232; GFX900-FLATSCR-NEXT:    scratch_store_dword off, v1, s32 offset:4
2233; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
2234; GFX900-FLATSCR-NEXT:    s_add_i32 vcc_hi, s32, 44
2235; GFX900-FLATSCR-NEXT:    scratch_load_ubyte_d16 v0, off, vcc_hi offset:4055 glc
2236; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
2237; GFX900-FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
2238; GFX900-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
2239; GFX900-FLATSCR-NEXT:    s_setpc_b64 s[30:31]
2240entry:
; Two addrspace(5) stack objects: a 40-byte i32 array and a 4096-byte i8 array.
2241  %obj0 = alloca [10 x i32], align 4, addrspace(5)
2242  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
; Reinterpret the 32-bit argument as the vector whose high half must survive.
2243  %reg.bc = bitcast i32 %reg to <2 x half>
; Volatile store of 123 (0x7b in the CHECK lines) to the start of %obj0.
2244  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
2245  store volatile i32 123, i32 addrspace(5)* %bc
; Volatile byte load from index 4055 of %obj1 (the offset:4055 in the checks).
2246  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
2247  %load = load volatile i8, i8 addrspace(5)* %gep
; Zero-extend to 16 bits, then reinterpret those bits as a half (no conversion).
2248  %load.ext = zext i8 %load to i16
2249  %bitcast = bitcast i16 %load.ext to half
; Overwrite only element 0 of the vector.
2250  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
; Store the combined vector through an undef addrspace(1) pointer.
2251  store <2 x half> %build1, <2 x half> addrspace(1)* undef
2252  ret void
2253}
2254
; Attribute group #0, referenced as `#0` by the function definitions above:
; marks them nounwind.
2255attributes #0 = { nounwind }
2256