1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
3; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
4; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s
5
6; Test using saddr addressing mode of global_*load_* flat instructions.
7
8; --------------------------------------------------------------------------------
9; No vgpr offset, constants
10; --------------------------------------------------------------------------------
11
12; SGPR base only
; Byte load straight through the inreg (SGPR) base pointer, no offset at all.
; Every run line expects a zeroed VGPR offset operand and s[2:3] as the
; scalar base of the global load.
13define amdgpu_ps float @global_load_saddr_i8_offset_0(i8 addrspace(1)* inreg %sbase) {
14; GCN-LABEL: global_load_saddr_i8_offset_0:
15; GCN:       ; %bb.0:
16; GCN-NEXT:    v_mov_b32_e32 v0, 0
17; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
18; GCN-NEXT:    s_waitcnt vmcnt(0)
19; GCN-NEXT:    ; return to shader part epilog
20;
21; GFX11-LABEL: global_load_saddr_i8_offset_0:
22; GFX11:       ; %bb.0:
23; GFX11-NEXT:    v_mov_b32_e32 v0, 0
24; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
25; GFX11-NEXT:    s_waitcnt vmcnt(0)
26; GFX11-NEXT:    ; return to shader part epilog
27  %load = load i8, i8 addrspace(1)* %sbase
; Widen the byte to i32 and reinterpret the bits as float for the return value.
28  %zext = zext i8 %load to i32
29  %to.vgpr = bitcast i32 %zext to float
30  ret float %to.vgpr
31}
32
33; SGPR base with maximum gfx9 immediate offset
; Constant byte offset 4095 from the SGPR base. The checks expect GFX9 and
; GFX11 to encode it entirely as offset:4095, while GFX10 is expected to
; split it into 0x800 in the VGPR offset plus offset:2047.
34define amdgpu_ps float @global_load_saddr_i8_offset_4095(i8 addrspace(1)* inreg %sbase) {
35; GFX9-LABEL: global_load_saddr_i8_offset_4095:
36; GFX9:       ; %bb.0:
37; GFX9-NEXT:    v_mov_b32_e32 v0, 0
38; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:4095
39; GFX9-NEXT:    s_waitcnt vmcnt(0)
40; GFX9-NEXT:    ; return to shader part epilog
41;
42; GFX10-LABEL: global_load_saddr_i8_offset_4095:
43; GFX10:       ; %bb.0:
44; GFX10-NEXT:    v_mov_b32_e32 v0, 0x800
45; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2047
46; GFX10-NEXT:    s_waitcnt vmcnt(0)
47; GFX10-NEXT:    ; return to shader part epilog
48;
49; GFX11-LABEL: global_load_saddr_i8_offset_4095:
50; GFX11:       ; %bb.0:
51; GFX11-NEXT:    v_mov_b32_e32 v0, 0
52; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:4095
53; GFX11-NEXT:    s_waitcnt vmcnt(0)
54; GFX11-NEXT:    ; return to shader part epilog
55  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4095
56  %load = load i8, i8 addrspace(1)* %gep0
; Widen the byte to i32 and reinterpret the bits as float for the return value.
57  %zext = zext i8 %load to i32
58  %to.vgpr = bitcast i32 %zext to float
59  ret float %to.vgpr
60}
61
62; SGPR base with maximum gfx9 immediate offset + 1
; Constant byte offset 4096 from the SGPR base. Every run line expects the
; whole constant (0x1000) to be moved into the VGPR offset operand, with no
; immediate offset on the load.
63define amdgpu_ps float @global_load_saddr_i8_offset_4096(i8 addrspace(1)* inreg %sbase) {
64; GCN-LABEL: global_load_saddr_i8_offset_4096:
65; GCN:       ; %bb.0:
66; GCN-NEXT:    v_mov_b32_e32 v0, 0x1000
67; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
68; GCN-NEXT:    s_waitcnt vmcnt(0)
69; GCN-NEXT:    ; return to shader part epilog
70;
71; GFX11-LABEL: global_load_saddr_i8_offset_4096:
72; GFX11:       ; %bb.0:
73; GFX11-NEXT:    v_mov_b32_e32 v0, 0x1000
74; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
75; GFX11-NEXT:    s_waitcnt vmcnt(0)
76; GFX11-NEXT:    ; return to shader part epilog
77  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4096
78  %load = load i8, i8 addrspace(1)* %gep0
; Widen the byte to i32 and reinterpret the bits as float for the return value.
79  %zext = zext i8 %load to i32
80  %to.vgpr = bitcast i32 %zext to float
81  ret float %to.vgpr
82}
83
84; SGPR base with maximum gfx9 immediate offset + 2
; Constant byte offset 4097 from the SGPR base. Every run line expects the
; constant to be split into 0x1000 in the VGPR offset operand plus offset:1
; on the load.
85define amdgpu_ps float @global_load_saddr_i8_offset_4097(i8 addrspace(1)* inreg %sbase) {
86; GCN-LABEL: global_load_saddr_i8_offset_4097:
87; GCN:       ; %bb.0:
88; GCN-NEXT:    v_mov_b32_e32 v0, 0x1000
89; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:1
90; GCN-NEXT:    s_waitcnt vmcnt(0)
91; GCN-NEXT:    ; return to shader part epilog
92;
93; GFX11-LABEL: global_load_saddr_i8_offset_4097:
94; GFX11:       ; %bb.0:
95; GFX11-NEXT:    v_mov_b32_e32 v0, 0x1000
96; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:1
97; GFX11-NEXT:    s_waitcnt vmcnt(0)
98; GFX11-NEXT:    ; return to shader part epilog
99  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4097
100  %load = load i8, i8 addrspace(1)* %gep0
; Widen the byte to i32 and reinterpret the bits as float for the return value.
101  %zext = zext i8 %load to i32
102  %to.vgpr = bitcast i32 %zext to float
103  ret float %to.vgpr
104}
105
106; SGPR base with maximum negative gfx9 immediate offset
; Constant byte offset -4096 from the SGPR base. GFX9 and GFX11 are expected
; to keep the scalar-base form with offset:-4096. GFX10 is expected to build
; the full 64-bit address with a VALU add / add-with-carry pair and load
; through v[0:1] with no scalar base.
107define amdgpu_ps float @global_load_saddr_i8_offset_neg4096(i8 addrspace(1)* inreg %sbase) {
108; GFX9-LABEL: global_load_saddr_i8_offset_neg4096:
109; GFX9:       ; %bb.0:
110; GFX9-NEXT:    v_mov_b32_e32 v0, 0
111; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-4096
112; GFX9-NEXT:    s_waitcnt vmcnt(0)
113; GFX9-NEXT:    ; return to shader part epilog
114;
115; GFX10-LABEL: global_load_saddr_i8_offset_neg4096:
116; GFX10:       ; %bb.0:
117; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0xfffff000, s2
118; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
119; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
120; GFX10-NEXT:    s_waitcnt vmcnt(0)
121; GFX10-NEXT:    ; return to shader part epilog
122;
123; GFX11-LABEL: global_load_saddr_i8_offset_neg4096:
124; GFX11:       ; %bb.0:
125; GFX11-NEXT:    v_mov_b32_e32 v0, 0
126; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-4096
127; GFX11-NEXT:    s_waitcnt vmcnt(0)
128; GFX11-NEXT:    ; return to shader part epilog
129  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4096
130  %load = load i8, i8 addrspace(1)* %gep0
; Widen the byte to i32 and reinterpret the bits as float for the return value.
131  %zext = zext i8 %load to i32
132  %to.vgpr = bitcast i32 %zext to float
133  ret float %to.vgpr
134}
135
136; SGPR base with maximum negative gfx9 immediate offset -1
; Constant byte offset -4097 from the SGPR base. No run line expects this to
; fold into the original base plus an immediate. GFX9 is expected to adjust
; the base with scalar adds (low word 0xffffefff, high word -1) and keep the
; scalar-base form; GFX10 and GFX11 are expected to add 0xfffff000 / -1 with
; VALU instructions and load through v[0:1] with offset:-1.
137define amdgpu_ps float @global_load_saddr_i8_offset_neg4097(i8 addrspace(1)* inreg %sbase) {
138; GFX9-LABEL: global_load_saddr_i8_offset_neg4097:
139; GFX9:       ; %bb.0:
140; GFX9-NEXT:    s_add_u32 s0, s2, 0xffffefff
141; GFX9-NEXT:    s_addc_u32 s1, s3, -1
142; GFX9-NEXT:    v_mov_b32_e32 v0, 0
143; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1]
144; GFX9-NEXT:    s_waitcnt vmcnt(0)
145; GFX9-NEXT:    ; return to shader part epilog
146;
147; GFX10-LABEL: global_load_saddr_i8_offset_neg4097:
148; GFX10:       ; %bb.0:
149; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0xfffff000, s2
150; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
151; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
152; GFX10-NEXT:    s_waitcnt vmcnt(0)
153; GFX10-NEXT:    ; return to shader part epilog
154;
155; GFX11-LABEL: global_load_saddr_i8_offset_neg4097:
156; GFX11:       ; %bb.0:
157; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], 0xfffff000, s2
158; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
159; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
160; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-1
161; GFX11-NEXT:    s_waitcnt vmcnt(0)
162; GFX11-NEXT:    ; return to shader part epilog
163  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4097
164  %load = load i8, i8 addrspace(1)* %gep0
; Widen the byte to i32 and reinterpret the bits as float for the return value.
165  %zext = zext i8 %load to i32
166  %to.vgpr = bitcast i32 %zext to float
167  ret float %to.vgpr
168}
169
170; SGPR base with maximum negative gfx9 immediate offset -2
; Constant byte offset -4098 from the SGPR base. GFX9 is expected to adjust
; the base with scalar adds (low word 0xffffeffe, high word -1) and keep the
; scalar-base form; GFX10 and GFX11 are expected to add 0xfffff000 / -1 with
; VALU instructions and load through v[0:1] with offset:-2.
171define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(i8 addrspace(1)* inreg %sbase) {
172; GFX9-LABEL: global_load_saddr_i8_offset_neg4098:
173; GFX9:       ; %bb.0:
174; GFX9-NEXT:    s_add_u32 s0, s2, 0xffffeffe
175; GFX9-NEXT:    s_addc_u32 s1, s3, -1
176; GFX9-NEXT:    v_mov_b32_e32 v0, 0
177; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1]
178; GFX9-NEXT:    s_waitcnt vmcnt(0)
179; GFX9-NEXT:    ; return to shader part epilog
180;
181; GFX10-LABEL: global_load_saddr_i8_offset_neg4098:
182; GFX10:       ; %bb.0:
183; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0xfffff000, s2
184; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
185; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2
186; GFX10-NEXT:    s_waitcnt vmcnt(0)
187; GFX10-NEXT:    ; return to shader part epilog
188;
189; GFX11-LABEL: global_load_saddr_i8_offset_neg4098:
190; GFX11:       ; %bb.0:
191; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], 0xfffff000, s2
192; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
193; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
194; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-2
195; GFX11-NEXT:    s_waitcnt vmcnt(0)
196; GFX11-NEXT:    ; return to shader part epilog
197  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4098
198  %load = load i8, i8 addrspace(1)* %gep0
; Widen the byte to i32 and reinterpret the bits as float for the return value.
199  %zext = zext i8 %load to i32
200  %to.vgpr = bitcast i32 %zext to float
201  ret float %to.vgpr
202}
203
204; SGPR base with maximum gfx10 immediate offset + 1
; Constant byte offset 2048 from the SGPR base. GFX9 and GFX11 are expected
; to encode it as offset:2048. GFX10 is expected to move 0x800 into the VGPR
; offset operand instead and emit no immediate offset.
205define amdgpu_ps float @global_load_saddr_i8_offset_2048(i8 addrspace(1)* inreg %sbase) {
206; GFX9-LABEL: global_load_saddr_i8_offset_2048:
207; GFX9:       ; %bb.0:
208; GFX9-NEXT:    v_mov_b32_e32 v0, 0
209; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2048
210; GFX9-NEXT:    s_waitcnt vmcnt(0)
211; GFX9-NEXT:    ; return to shader part epilog
212;
213; GFX10-LABEL: global_load_saddr_i8_offset_2048:
214; GFX10:       ; %bb.0:
215; GFX10-NEXT:    v_mov_b32_e32 v0, 0x800
216; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
217; GFX10-NEXT:    s_waitcnt vmcnt(0)
218; GFX10-NEXT:    ; return to shader part epilog
219;
220; GFX11-LABEL: global_load_saddr_i8_offset_2048:
221; GFX11:       ; %bb.0:
222; GFX11-NEXT:    v_mov_b32_e32 v0, 0
223; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:2048
224; GFX11-NEXT:    s_waitcnt vmcnt(0)
225; GFX11-NEXT:    ; return to shader part epilog
226  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2048
227  %load = load i8, i8 addrspace(1)* %gep0
; Widen the byte to i32 and reinterpret the bits as float for the return value.
228  %zext = zext i8 %load to i32
229  %to.vgpr = bitcast i32 %zext to float
230  ret float %to.vgpr
231}
232
233; SGPR base with maximum gfx10 immediate offset + 2
; Constant byte offset 2049 from the SGPR base. GFX9 and GFX11 are expected
; to encode it as offset:2049. GFX10 is expected to split it into 0x800 in
; the VGPR offset operand plus offset:1.
234define amdgpu_ps float @global_load_saddr_i8_offset_2049(i8 addrspace(1)* inreg %sbase) {
235; GFX9-LABEL: global_load_saddr_i8_offset_2049:
236; GFX9:       ; %bb.0:
237; GFX9-NEXT:    v_mov_b32_e32 v0, 0
238; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2049
239; GFX9-NEXT:    s_waitcnt vmcnt(0)
240; GFX9-NEXT:    ; return to shader part epilog
241;
242; GFX10-LABEL: global_load_saddr_i8_offset_2049:
243; GFX10:       ; %bb.0:
244; GFX10-NEXT:    v_mov_b32_e32 v0, 0x800
245; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:1
246; GFX10-NEXT:    s_waitcnt vmcnt(0)
247; GFX10-NEXT:    ; return to shader part epilog
248;
249; GFX11-LABEL: global_load_saddr_i8_offset_2049:
250; GFX11:       ; %bb.0:
251; GFX11-NEXT:    v_mov_b32_e32 v0, 0
252; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:2049
253; GFX11-NEXT:    s_waitcnt vmcnt(0)
254; GFX11-NEXT:    ; return to shader part epilog
255  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2049
256  %load = load i8, i8 addrspace(1)* %gep0
; Widen the byte to i32 and reinterpret the bits as float for the return value.
257  %zext = zext i8 %load to i32
258  %to.vgpr = bitcast i32 %zext to float
259  ret float %to.vgpr
260}
261
262; SGPR base with maximum gfx10 immediate offset + 3
; Constant byte offset 2050 from the SGPR base. GFX9 and GFX11 are expected
; to encode it as offset:2050. GFX10 is expected to split it into 0x800 in
; the VGPR offset operand plus offset:2.
263define amdgpu_ps float @global_load_saddr_i8_offset_2050(i8 addrspace(1)* inreg %sbase) {
264; GFX9-LABEL: global_load_saddr_i8_offset_2050:
265; GFX9:       ; %bb.0:
266; GFX9-NEXT:    v_mov_b32_e32 v0, 0
267; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2050
268; GFX9-NEXT:    s_waitcnt vmcnt(0)
269; GFX9-NEXT:    ; return to shader part epilog
270;
271; GFX10-LABEL: global_load_saddr_i8_offset_2050:
272; GFX10:       ; %bb.0:
273; GFX10-NEXT:    v_mov_b32_e32 v0, 0x800
274; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2
275; GFX10-NEXT:    s_waitcnt vmcnt(0)
276; GFX10-NEXT:    ; return to shader part epilog
277;
278; GFX11-LABEL: global_load_saddr_i8_offset_2050:
279; GFX11:       ; %bb.0:
280; GFX11-NEXT:    v_mov_b32_e32 v0, 0
281; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:2050
282; GFX11-NEXT:    s_waitcnt vmcnt(0)
283; GFX11-NEXT:    ; return to shader part epilog
284  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2050
285  %load = load i8, i8 addrspace(1)* %gep0
; Widen the byte to i32 and reinterpret the bits as float for the return value.
286  %zext = zext i8 %load to i32
287  %to.vgpr = bitcast i32 %zext to float
288  ret float %to.vgpr
289}
290
291; SGPR base with maximum negative gfx10 immediate offset
; Constant byte offset -2048 from the SGPR base. Every run line expects a
; zeroed VGPR offset operand and the constant encoded as offset:-2048.
292define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(i8 addrspace(1)* inreg %sbase) {
293; GCN-LABEL: global_load_saddr_i8_offset_neg2048:
294; GCN:       ; %bb.0:
295; GCN-NEXT:    v_mov_b32_e32 v0, 0
296; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-2048
297; GCN-NEXT:    s_waitcnt vmcnt(0)
298; GCN-NEXT:    ; return to shader part epilog
299;
300; GFX11-LABEL: global_load_saddr_i8_offset_neg2048:
301; GFX11:       ; %bb.0:
302; GFX11-NEXT:    v_mov_b32_e32 v0, 0
303; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-2048
304; GFX11-NEXT:    s_waitcnt vmcnt(0)
305; GFX11-NEXT:    ; return to shader part epilog
306  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2048
307  %load = load i8, i8 addrspace(1)* %gep0
; Widen the byte to i32 and reinterpret the bits as float for the return value.
308  %zext = zext i8 %load to i32
309  %to.vgpr = bitcast i32 %zext to float
310  ret float %to.vgpr
311}
312
313; SGPR base with maximum negative gfx10 immediate offset - 1
; Constant byte offset -2049 from the SGPR base. GFX9 and GFX11 are expected
; to encode it as offset:-2049 on the scalar-base form. GFX10 is expected to
; add 0xfffff800 / -1 to the base with VALU instructions and load through
; v[0:1] with offset:-1.
314define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(i8 addrspace(1)* inreg %sbase) {
315; GFX9-LABEL: global_load_saddr_i8_offset_neg2049:
316; GFX9:       ; %bb.0:
317; GFX9-NEXT:    v_mov_b32_e32 v0, 0
318; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-2049
319; GFX9-NEXT:    s_waitcnt vmcnt(0)
320; GFX9-NEXT:    ; return to shader part epilog
321;
322; GFX10-LABEL: global_load_saddr_i8_offset_neg2049:
323; GFX10:       ; %bb.0:
324; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0xfffff800, s2
325; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
326; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
327; GFX10-NEXT:    s_waitcnt vmcnt(0)
328; GFX10-NEXT:    ; return to shader part epilog
329;
330; GFX11-LABEL: global_load_saddr_i8_offset_neg2049:
331; GFX11:       ; %bb.0:
332; GFX11-NEXT:    v_mov_b32_e32 v0, 0
333; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-2049
334; GFX11-NEXT:    s_waitcnt vmcnt(0)
335; GFX11-NEXT:    ; return to shader part epilog
336  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2049
337  %load = load i8, i8 addrspace(1)* %gep0
; Widen the byte to i32 and reinterpret the bits as float for the return value.
338  %zext = zext i8 %load to i32
339  %to.vgpr = bitcast i32 %zext to float
340  ret float %to.vgpr
341}
342
343; SGPR base with maximum negative gfx10 immediate offset - 2
; Constant byte offset -2050 from the SGPR base. GFX9 and GFX11 are expected
; to encode it as offset:-2050 on the scalar-base form. GFX10 is expected to
; add 0xfffff800 / -1 to the base with VALU instructions and load through
; v[0:1] with offset:-2.
344define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(i8 addrspace(1)* inreg %sbase) {
345; GFX9-LABEL: global_load_saddr_i8_offset_neg2050:
346; GFX9:       ; %bb.0:
347; GFX9-NEXT:    v_mov_b32_e32 v0, 0
348; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-2050
349; GFX9-NEXT:    s_waitcnt vmcnt(0)
350; GFX9-NEXT:    ; return to shader part epilog
351;
352; GFX10-LABEL: global_load_saddr_i8_offset_neg2050:
353; GFX10:       ; %bb.0:
354; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0xfffff800, s2
355; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
356; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2
357; GFX10-NEXT:    s_waitcnt vmcnt(0)
358; GFX10-NEXT:    ; return to shader part epilog
359;
360; GFX11-LABEL: global_load_saddr_i8_offset_neg2050:
361; GFX11:       ; %bb.0:
362; GFX11-NEXT:    v_mov_b32_e32 v0, 0
363; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-2050
364; GFX11-NEXT:    s_waitcnt vmcnt(0)
365; GFX11-NEXT:    ; return to shader part epilog
366  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2050
367  %load = load i8, i8 addrspace(1)* %gep0
; Widen the byte to i32 and reinterpret the bits as float for the return value.
368  %zext = zext i8 %load to i32
369  %to.vgpr = bitcast i32 %zext to float
370  ret float %to.vgpr
371}
372
; Constant byte offset 4294967295 (2^32 - 1) from the SGPR base. Every run
; line expects the scalar-base form to be kept, with the constant split
; between the 32-bit VGPR offset operand and the immediate:
; 0xfffff000 + offset:4095 on GFX9/GFX11, 0xfffff800 + offset:2047 on GFX10.
373define amdgpu_ps float @global_load_saddr_i8_offset_4294967295(i8 addrspace(1)* inreg %sbase) {
374; GFX9-LABEL: global_load_saddr_i8_offset_4294967295:
375; GFX9:       ; %bb.0:
376; GFX9-NEXT:    v_mov_b32_e32 v0, 0xfffff000
377; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:4095
378; GFX9-NEXT:    s_waitcnt vmcnt(0)
379; GFX9-NEXT:    ; return to shader part epilog
380;
381; GFX10-LABEL: global_load_saddr_i8_offset_4294967295:
382; GFX10:       ; %bb.0:
383; GFX10-NEXT:    v_mov_b32_e32 v0, 0xfffff800
384; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2047
385; GFX10-NEXT:    s_waitcnt vmcnt(0)
386; GFX10-NEXT:    ; return to shader part epilog
387;
388; GFX11-LABEL: global_load_saddr_i8_offset_4294967295:
389; GFX11:       ; %bb.0:
390; GFX11-NEXT:    v_mov_b32_e32 v0, 0xfffff000
391; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:4095
392; GFX11-NEXT:    s_waitcnt vmcnt(0)
393; GFX11-NEXT:    ; return to shader part epilog
394  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967295
395  %load = load i8, i8 addrspace(1)* %gep0
; Widen the byte to i32 and reinterpret the bits as float for the return value.
396  %zext = zext i8 %load to i32
397  %to.vgpr = bitcast i32 %zext to float
398  ret float %to.vgpr
399}
400
; Constant byte offset 4294967296 (2^32) from the SGPR base. Every run line
; expects the scalar-base form to be dropped: the 64-bit address is built
; with a VALU add / add-with-carry pair (0 to the low half, 1 to the high
; half) and the load goes through v[0:1] with no immediate offset.
401define amdgpu_ps float @global_load_saddr_i8_offset_4294967296(i8 addrspace(1)* inreg %sbase) {
402; GFX9-LABEL: global_load_saddr_i8_offset_4294967296:
403; GFX9:       ; %bb.0:
404; GFX9-NEXT:    v_mov_b32_e32 v1, s3
405; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s2
406; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
407; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
408; GFX9-NEXT:    s_waitcnt vmcnt(0)
409; GFX9-NEXT:    ; return to shader part epilog
410;
411; GFX10-LABEL: global_load_saddr_i8_offset_4294967296:
412; GFX10:       ; %bb.0:
413; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
414; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
415; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
416; GFX10-NEXT:    s_waitcnt vmcnt(0)
417; GFX10-NEXT:    ; return to shader part epilog
418;
419; GFX11-LABEL: global_load_saddr_i8_offset_4294967296:
420; GFX11:       ; %bb.0:
421; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
422; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
423; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
424; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
425; GFX11-NEXT:    s_waitcnt vmcnt(0)
426; GFX11-NEXT:    ; return to shader part epilog
427  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967296
428  %load = load i8, i8 addrspace(1)* %gep0
; Widen the byte to i32 and reinterpret the bits as float for the return value.
429  %zext = zext i8 %load to i32
430  %to.vgpr = bitcast i32 %zext to float
431  ret float %to.vgpr
432}
433
; Constant byte offset 4294967297 (2^32 + 1) from the SGPR base. Every run
; line expects a VALU add / add-with-carry pair (0 to the low half, 1 to the
; high half) followed by a load through v[0:1] with offset:1.
434define amdgpu_ps float @global_load_saddr_i8_offset_4294967297(i8 addrspace(1)* inreg %sbase) {
435; GFX9-LABEL: global_load_saddr_i8_offset_4294967297:
436; GFX9:       ; %bb.0:
437; GFX9-NEXT:    v_mov_b32_e32 v1, s3
438; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s2
439; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
440; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
441; GFX9-NEXT:    s_waitcnt vmcnt(0)
442; GFX9-NEXT:    ; return to shader part epilog
443;
444; GFX10-LABEL: global_load_saddr_i8_offset_4294967297:
445; GFX10:       ; %bb.0:
446; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
447; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
448; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
449; GFX10-NEXT:    s_waitcnt vmcnt(0)
450; GFX10-NEXT:    ; return to shader part epilog
451;
452; GFX11-LABEL: global_load_saddr_i8_offset_4294967297:
453; GFX11:       ; %bb.0:
454; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
455; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
456; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
457; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:1
458; GFX11-NEXT:    s_waitcnt vmcnt(0)
459; GFX11-NEXT:    ; return to shader part epilog
460  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967297
461  %load = load i8, i8 addrspace(1)* %gep0
; Widen the byte to i32 and reinterpret the bits as float for the return value.
462  %zext = zext i8 %load to i32
463  %to.vgpr = bitcast i32 %zext to float
464  ret float %to.vgpr
465}
466
; Constant byte offset 4294971391 (2^32 + 4095) from the SGPR base.
; GFX9 is expected to fold the whole constant into the base with scalar adds
; (0xfff low, 1 high) and keep the scalar-base form with a zeroed VGPR.
; GFX10 is expected to add 0x800 / 1 with VALU and use offset:2047; GFX11 is
; expected to add 0 / 1 with VALU and use offset:4095, both through v[0:1].
467define amdgpu_ps float @global_load_saddr_i8_offset_4294971391(i8 addrspace(1)* inreg %sbase) {
468; GFX9-LABEL: global_load_saddr_i8_offset_4294971391:
469; GFX9:       ; %bb.0:
470; GFX9-NEXT:    s_add_u32 s0, s2, 0xfff
471; GFX9-NEXT:    s_addc_u32 s1, s3, 1
472; GFX9-NEXT:    v_mov_b32_e32 v0, 0
473; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1]
474; GFX9-NEXT:    s_waitcnt vmcnt(0)
475; GFX9-NEXT:    ; return to shader part epilog
476;
477; GFX10-LABEL: global_load_saddr_i8_offset_4294971391:
478; GFX10:       ; %bb.0:
479; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0x800, s2
480; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
481; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
482; GFX10-NEXT:    s_waitcnt vmcnt(0)
483; GFX10-NEXT:    ; return to shader part epilog
484;
485; GFX11-LABEL: global_load_saddr_i8_offset_4294971391:
486; GFX11:       ; %bb.0:
487; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
488; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
489; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
490; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
491; GFX11-NEXT:    s_waitcnt vmcnt(0)
492; GFX11-NEXT:    ; return to shader part epilog
493  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294971391
494  %load = load i8, i8 addrspace(1)* %gep0
; Widen the byte to i32 and reinterpret the bits as float for the return value.
495  %zext = zext i8 %load to i32
496  %to.vgpr = bitcast i32 %zext to float
497  ret float %to.vgpr
498}
499
; Constant byte offset 4294971392 (2^32 + 4096) from the SGPR base.
; GFX9 is expected to fold the constant into the base with scalar adds
; (0x1000 low, 1 high) and keep the scalar-base form with a zeroed VGPR.
; GFX10 and GFX11 are expected to add 0x1000 / 1 with VALU instructions and
; load through v[0:1] with no immediate offset.
500define amdgpu_ps float @global_load_saddr_i8_offset_4294971392(i8 addrspace(1)* inreg %sbase) {
501; GFX9-LABEL: global_load_saddr_i8_offset_4294971392:
502; GFX9:       ; %bb.0:
503; GFX9-NEXT:    s_add_u32 s0, s2, 0x1000
504; GFX9-NEXT:    s_addc_u32 s1, s3, 1
505; GFX9-NEXT:    v_mov_b32_e32 v0, 0
506; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1]
507; GFX9-NEXT:    s_waitcnt vmcnt(0)
508; GFX9-NEXT:    ; return to shader part epilog
509;
510; GFX10-LABEL: global_load_saddr_i8_offset_4294971392:
511; GFX10:       ; %bb.0:
512; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0x1000, s2
513; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
514; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
515; GFX10-NEXT:    s_waitcnt vmcnt(0)
516; GFX10-NEXT:    ; return to shader part epilog
517;
518; GFX11-LABEL: global_load_saddr_i8_offset_4294971392:
519; GFX11:       ; %bb.0:
520; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], 0x1000, s2
521; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
522; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
523; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
524; GFX11-NEXT:    s_waitcnt vmcnt(0)
525; GFX11-NEXT:    ; return to shader part epilog
526  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294971392
527  %load = load i8, i8 addrspace(1)* %gep0
; Widen the byte to i32 and reinterpret the bits as float for the return value.
528  %zext = zext i8 %load to i32
529  %to.vgpr = bitcast i32 %zext to float
530  ret float %to.vgpr
531}
532
; Constant byte offset -4294967295 (-(2^32 - 1)) from the SGPR base. Every
; run line expects a 64-bit VALU address computation and a load through
; v[0:1]: GFX9 and GFX11 add 0x1000 / -1 and use offset:-4095, GFX10 adds
; 0x800 / -1 and uses offset:-2047.
533define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967295(i8 addrspace(1)* inreg %sbase) {
534; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967295:
535; GFX9:       ; %bb.0:
536; GFX9-NEXT:    v_mov_b32_e32 v0, s2
537; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
538; GFX9-NEXT:    v_mov_b32_e32 v1, s3
539; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
540; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4095
541; GFX9-NEXT:    s_waitcnt vmcnt(0)
542; GFX9-NEXT:    ; return to shader part epilog
543;
544; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967295:
545; GFX10:       ; %bb.0:
546; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0x800, s2
547; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
548; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2047
549; GFX10-NEXT:    s_waitcnt vmcnt(0)
550; GFX10-NEXT:    ; return to shader part epilog
551;
552; GFX11-LABEL: global_load_saddr_i8_offset_neg4294967295:
553; GFX11:       ; %bb.0:
554; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], 0x1000, s2
555; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
556; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
557; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-4095
558; GFX11-NEXT:    s_waitcnt vmcnt(0)
559; GFX11-NEXT:    ; return to shader part epilog
560  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967295
561  %load = load i8, i8 addrspace(1)* %gep0
; Widen the byte to i32 and reinterpret the bits as float for the return value.
562  %zext = zext i8 %load to i32
563  %to.vgpr = bitcast i32 %zext to float
564  ret float %to.vgpr
565}
566
; Constant byte offset -4294967296 (-(2^32)) from the SGPR base. Every run
; line expects a VALU add / add-with-carry pair (0 to the low half, -1 to
; the high half) and a load through v[0:1] with no immediate offset.
567define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967296(i8 addrspace(1)* inreg %sbase) {
568; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967296:
569; GFX9:       ; %bb.0:
570; GFX9-NEXT:    v_mov_b32_e32 v1, s3
571; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s2
572; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
573; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
574; GFX9-NEXT:    s_waitcnt vmcnt(0)
575; GFX9-NEXT:    ; return to shader part epilog
576;
577; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967296:
578; GFX10:       ; %bb.0:
579; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
580; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
581; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
582; GFX10-NEXT:    s_waitcnt vmcnt(0)
583; GFX10-NEXT:    ; return to shader part epilog
584;
585; GFX11-LABEL: global_load_saddr_i8_offset_neg4294967296:
586; GFX11:       ; %bb.0:
587; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
588; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
589; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
590; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
591; GFX11-NEXT:    s_waitcnt vmcnt(0)
592; GFX11-NEXT:    ; return to shader part epilog
593  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967296
594  %load = load i8, i8 addrspace(1)* %gep0
; Widen the byte to i32 and reinterpret the bits as float for the return value.
595  %zext = zext i8 %load to i32
596  %to.vgpr = bitcast i32 %zext to float
597  ret float %to.vgpr
598}
599
; Constant byte offset -4294967297 (-(2^32 + 1)) from the SGPR base. Every
; run line expects a VALU add / add-with-carry pair (0 to the low half, -1
; to the high half) and a load through v[0:1] with offset:-1.
600define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967297(i8 addrspace(1)* inreg %sbase) {
601; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967297:
602; GFX9:       ; %bb.0:
603; GFX9-NEXT:    v_mov_b32_e32 v1, s3
604; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s2
605; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
606; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
607; GFX9-NEXT:    s_waitcnt vmcnt(0)
608; GFX9-NEXT:    ; return to shader part epilog
609;
610; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967297:
611; GFX10:       ; %bb.0:
612; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
613; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
614; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
615; GFX10-NEXT:    s_waitcnt vmcnt(0)
616; GFX10-NEXT:    ; return to shader part epilog
617;
618; GFX11-LABEL: global_load_saddr_i8_offset_neg4294967297:
619; GFX11:       ; %bb.0:
620; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
621; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
622; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
623; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-1
624; GFX11-NEXT:    s_waitcnt vmcnt(0)
625; GFX11-NEXT:    ; return to shader part epilog
626  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967297
627  %load = load i8, i8 addrspace(1)* %gep0
; Widen the byte to i32 and reinterpret the bits as float for the return value.
628  %zext = zext i8 %load to i32
629  %to.vgpr = bitcast i32 %zext to float
630  ret float %to.vgpr
631}
632
633; --------------------------------------------------------------------------------
634; Basic addressing patterns
635; --------------------------------------------------------------------------------
636
637; Basic pattern, no immediate offset.
; Address is the inreg (SGPR) base plus a zero-extended 32-bit %voffset.
; Every run line expects this to select directly to a single global load
; with v0 as the offset operand and s[2:3] as the scalar base, with no
; address arithmetic beforehand.
638define amdgpu_ps float @global_load_saddr_i8_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
639; GCN-LABEL: global_load_saddr_i8_zext_vgpr:
640; GCN:       ; %bb.0:
641; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
642; GCN-NEXT:    s_waitcnt vmcnt(0)
643; GCN-NEXT:    ; return to shader part epilog
644;
645; GFX11-LABEL: global_load_saddr_i8_zext_vgpr:
646; GFX11:       ; %bb.0:
647; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
648; GFX11-NEXT:    s_waitcnt vmcnt(0)
649; GFX11-NEXT:    ; return to shader part epilog
; The 32-bit offset is zero-extended to i64 before it indexes the base pointer.
650  %zext.offset = zext i32 %voffset to i64
651  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
652  %load = load i8, i8 addrspace(1)* %gep0
; Widen the byte to i32 and reinterpret the bits as float for the return value.
653  %zext = zext i8 %load to i32
654  %to.vgpr = bitcast i32 %zext to float
655  ret float %to.vgpr
656}
657
658; Maximum positive offset on gfx9
; SGPR base + zero-extended %voffset + constant 4095. GFX9 and GFX11 are
; expected to emit a single scalar-base load with offset:4095. GFX10 is
; expected to compute the 64-bit address with VALU adds (base + voffset,
; then + 0x800) and load through v[0:1] with offset:2047.
659define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
660; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095:
661; GFX9:       ; %bb.0:
662; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:4095
663; GFX9-NEXT:    s_waitcnt vmcnt(0)
664; GFX9-NEXT:    ; return to shader part epilog
665;
666; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095:
667; GFX10:       ; %bb.0:
668; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], s2, v0
669; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
670; GFX10-NEXT:    v_add_co_u32 v0, vcc, 0x800, v0
671; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
672; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
673; GFX10-NEXT:    s_waitcnt vmcnt(0)
674; GFX10-NEXT:    ; return to shader part epilog
675;
676; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095:
677; GFX11:       ; %bb.0:
678; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:4095
679; GFX11-NEXT:    s_waitcnt vmcnt(0)
680; GFX11-NEXT:    ; return to shader part epilog
; The variable offset is applied first (%gep0), the constant second (%gep1).
681  %zext.offset = zext i32 %voffset to i64
682  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
683  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4095
684  %load = load i8, i8 addrspace(1)* %gep1
; Widen the byte to i32 and reinterpret the bits as float for the return value.
685  %zext = zext i8 %load to i32
686  %to.vgpr = bitcast i32 %zext to float
687  ret float %to.vgpr
688}
689
690; Maximum positive offset on gfx9 + 1
; SGPR base + zero-extended %voffset + constant 4096. No run line expects
; the scalar-base form here: every target is expected to compute the 64-bit
; address with VALU adds (base + voffset, then + 0x1000) and load through
; v[0:1] with no immediate offset.
691define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4096(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
692; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096:
693; GFX9:       ; %bb.0:
694; GFX9-NEXT:    v_mov_b32_e32 v1, s3
695; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
696; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
697; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
698; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
699; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
700; GFX9-NEXT:    s_waitcnt vmcnt(0)
701; GFX9-NEXT:    ; return to shader part epilog
702;
703; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096:
704; GFX10:       ; %bb.0:
705; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], s2, v0
706; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
707; GFX10-NEXT:    v_add_co_u32 v0, vcc, 0x1000, v0
708; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
709; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
710; GFX10-NEXT:    s_waitcnt vmcnt(0)
711; GFX10-NEXT:    ; return to shader part epilog
712;
713; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096:
714; GFX11:       ; %bb.0:
715; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], s2, v0
716; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
717; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
718; GFX11-NEXT:    v_add_co_u32 v0, vcc, 0x1000, v0
719; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
720; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
721; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
722; GFX11-NEXT:    s_waitcnt vmcnt(0)
723; GFX11-NEXT:    ; return to shader part epilog
; The variable offset is applied first (%gep0), the constant second (%gep1).
724  %zext.offset = zext i32 %voffset to i64
725  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
726  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4096
727  %load = load i8, i8 addrspace(1)* %gep1
; Widen the byte to i32 and reinterpret the bits as float for the return value.
728  %zext = zext i8 %load to i32
729  %to.vgpr = bitcast i32 %zext to float
730  ret float %to.vgpr
731}
732
; Maximum negative offset on gfx9 (-4096).
; gfx9 and gfx11 fold the constant into the saddr load's immediate offset;
; gfx10 instead computes the 64-bit address in VGPRs and loads with `off`.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4096(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-4096
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    v_add_co_u32 v0, vcc, 0xfffff000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-4096
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -4096
  %load = load i8, i8 addrspace(1)* %gep1
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Maximum negative offset on gfx9 - 1 (-4097).
; No checked target uses the saddr form: each adds 0xfffff000 (with -1 carried
; into the high half) to the VGPR address and keeps the remaining -1 as the
; load's immediate offset.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4097(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    v_add_co_u32 v0, vcc, 0xfffff000, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_add_co_u32 v0, s[0:1], s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
; GFX11-NEXT:    v_add_co_u32 v0, vcc, 0xfffff000, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -4097
  %load = load i8, i8 addrspace(1)* %gep1
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Maximum positive offset on gfx10 (2047).
; Every checked target folds the constant into the saddr load's immediate.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2047(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2047
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:2047
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2047
  %load = load i8, i8 addrspace(1)* %gep1
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Maximum positive offset on gfx10 + 1 (2048).
; gfx9 and gfx11 still fold the constant into the saddr load; gfx10 adds 0x800
; to the 64-bit address in VGPRs and loads with `off`.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2048(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2048
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    v_add_co_u32 v0, vcc, 0x800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:2048
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2048
  %load = load i8, i8 addrspace(1)* %gep1
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Maximum negative offset on gfx10 (-2048).
; Every checked target folds the constant into the saddr load's immediate.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2048(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-2048
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-2048
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2048
  %load = load i8, i8 addrspace(1)* %gep1
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Maximum negative offset on gfx10 - 1 (-2049).
; gfx9 and gfx11 fold the constant into the saddr load; gfx10 adds 0xfffff800
; (with -1 carried into the high half) in VGPRs and keeps -1 as the immediate.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2049(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-2049
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    v_add_co_u32 v0, vcc, 0xfffff800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-2049
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2049
  %load = load i8, i8 addrspace(1)* %gep1
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Maximum positive offset on gfx9, and immediate needs to be moved lower.
; Here the constant GEP (4095) is applied to the base before the variable GEP.
; gfx9 and gfx11 still select the saddr form with offset:4095; gfx10 splits the
; constant into a 0x800 VGPR add plus offset:2047.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095_gep_order(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT:    v_add_co_u32 v0, vcc, 0x800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:4095
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  ; Constant offset first, variable offset second (the reverse of the tests above).
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4095
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 %zext.offset
  %load = load i8, i8 addrspace(1)* %gep1
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Pointer addressing done in integers.
; The address is formed with ptrtoint/add/inttoptr instead of a GEP; the saddr
; form is still selected on every checked target.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
  %add = add i64 %sbase.as.int, %zext.offset
  %dirty.gep = inttoptr i64 %add to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %dirty.gep
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; zext forced to LHS of addressing expression.
; Same integer addressing as the ptrtoint test, but with the add operands
; swapped; the saddr form is still selected on every checked target.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
  %add = add i64 %zext.offset, %sbase.as.int
  %dirty.gep = inttoptr i64 %add to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %dirty.gep
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; zext forced to LHS of addressing expression, with immediate offset.
; The constant 128 is added after the zext + base add; every checked target
; folds it into the saddr load as offset:128.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
  %add = add i64 %zext.offset, %sbase.as.int
  %add.immoffset = add i64 %add, 128
  %dirty.gep = inttoptr i64 %add.immoffset to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %dirty.gep
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; zext forced to LHS of addressing expression, with immediate offset in non-canonical position.
; The constant 128 is added to the base before the zext'd offset is added;
; every checked target still folds it into the saddr load as offset:128.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
  %add.immoffset = add i64 %sbase.as.int, 128
  %add = add i64 %zext.offset, %add.immoffset
  %dirty.gep = inttoptr i64 %add to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %dirty.gep
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; --------------------------------------------------------------------------------
; Uniformity edge cases
; --------------------------------------------------------------------------------

; Global pointer stored in addrspace(3) memory. The uniform_ptr_in_vgprs tests
; load their base pointer from here (checked as a ds_read_b64 / ds_load_b64),
; so the base arrives in VGPRs.
@ptr.in.lds = internal addrspace(3) global i8 addrspace(1)* undef

; Base pointer is uniform, but also in VGPRs.
; The base is loaded from @ptr.in.lds into v[1:2]; each checked target copies it
; to s[0:1] with v_readfirstlane_b32 and then uses the saddr form.
define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
; GFX9-LABEL: global_load_saddr_uniform_ptr_in_vgprs:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    ds_read_b64 v[1:2], v1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
; GFX9-NEXT:    s_nop 4
; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_uniform_ptr_in_vgprs:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    ds_read_b64 v[1:2], v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
; GFX10-NEXT:    v_readfirstlane_b32 s1, v2
; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_uniform_ptr_in_vgprs:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    ds_load_b64 v[1:2], v1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_readfirstlane_b32 s0, v1
; GFX11-NEXT:    v_readfirstlane_b32 s1, v2
; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %load = load i8, i8 addrspace(1)* %gep0
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Base pointer is uniform, but also in VGPRs, with imm offset.
; As in the test without an offset, the base is moved to s[0:1] with
; v_readfirstlane_b32; the constant 42 is folded into the saddr load.
define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset) {
; GFX9-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    ds_read_b64 v[1:2], v1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
; GFX9-NEXT:    s_nop 4
; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:42
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    ds_read_b64 v[1:2], v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
; GFX10-NEXT:    v_readfirstlane_b32 s1, v2
; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:42
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    ds_load_b64 v[1:2], v1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_readfirstlane_b32 s0, v1
; GFX11-NEXT:    v_readfirstlane_b32 s1, v2
; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:42
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 42
  %load = load i8, i8 addrspace(1)* %gep1
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Both 64-bit base and 32-bit offset are scalar.
; The saddr form is still used: the scalar offset (s4) is copied into v0 to
; serve as the VGPR offset operand.
define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_mov_b32_e32 v0, s4
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_uniform_offset:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v0, s4
; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %soffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %load = load i8, i8 addrspace(1)* %gep0
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Both 64-bit base and 32-bit offset are scalar, with immediate offset.
; The scalar offset (s4) is copied into v0 and the constant -24 is folded into
; the saddr load's immediate on every checked target.
define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset_immoffset(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_mov_b32_e32 v0, s4
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-24
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v0, s4
; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-24
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %soffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -24
  %load = load i8, i8 addrspace(1)* %gep1
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Both components uniform, zext forced to LHS of addressing expression.
; Integer (ptrtoint/add/inttoptr) addressing with the zext as the first add
; operand; the saddr form is selected with the scalar offset copied into v0.
define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_mov_b32_e32 v0, s4
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v0, s4
; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %soffset to i64
  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
  %add = add i64 %zext.offset, %sbase.as.int
  %dirty.gep = inttoptr i64 %add to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %dirty.gep
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Both components uniform, zext forced to LHS of addressing expression, with immediate offset.
; The trailing constant 128 is folded into the saddr load as offset:128 on
; every checked target.
define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_mov_b32_e32 v0, s4
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v0, s4
; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %soffset to i64
  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
  %add = add i64 %zext.offset, %sbase.as.int
  %add.immoffset = add i64 %add, 128
  %dirty.gep = inttoptr i64 %add.immoffset to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %dirty.gep
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; divergent 64-bit base, 32-bit scalar offset.
; The base is in VGPRs, so the saddr form is not used: the scalar offset is
; added into v[0:1] and the load uses `off`.
define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(i8 addrspace(1)* %vbase, i32 inreg %soffset) {
; GFX9-LABEL: global_load_i8_vgpr64_sgpr32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_load_i8_vgpr64_sgpr32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_add_co_u32 v0, vcc, v0, s2
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_i8_vgpr64_sgpr32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_add_co_u32 v0, vcc, v0, s2
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %soffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %vbase, i64 %zext.offset
  %load = load i8, i8 addrspace(1)* %gep0
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; divergent 64-bit base, 32-bit scalar offset, with imm offset.
; The address is formed in VGPRs as in the test without an offset. gfx9 and
; gfx11 fold 4095 into the load's immediate; gfx10 splits it into a 0x800 add
; plus offset:2047.
define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(i8 addrspace(1)* %vbase, i32 inreg %soffset) {
; GFX9-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_add_co_u32 v0, vcc, v0, s2
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT:    v_add_co_u32 v0, vcc, 0x800, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_add_co_u32 v0, vcc, v0, s2
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %soffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %vbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4095
  %load = load i8, i8 addrspace(1)* %gep1
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; --------------------------------------------------------------------------------
; Natural addressing shifts with restricted range
; --------------------------------------------------------------------------------

; Cannot push the shift into 32-bits, and cannot match.
; The offset is loaded with no !range metadata and indexes a float GEP, so the
; scale by 4 is done as a 64-bit shift (v_lshlrev_b64) and the address is
; formed in VGPRs; the saddr form is not selected.
define amdgpu_ps float @global_load_saddr_f32_natural_addressing(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
; GFX9-LABEL: global_load_saddr_f32_natural_addressing:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_f32_natural_addressing:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX10-NEXT:    v_add_co_u32 v0, vcc, s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_f32_natural_addressing:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX11-NEXT:    v_add_co_u32 v0, vcc, s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %voffset = load i32, i32 addrspace(1)* %voffset.ptr
  %zext.offset = zext i32 %voffset to i64
  %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset
  %load = load float, float addrspace(1)* %gep
  ret float %load
}

; Unrestricted-range loaded offset, with an immediate offset.
; Unlike global_load_saddr_f32_natural_addressing, the GEPs here are on i8, so
; there is no scale shift to push into 32 bits. The checks show the saddr form
; being matched, with the constant 128 folded into the immediate.
define amdgpu_ps float @global_load_saddr_f32_natural_addressing_immoffset(i8 addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
; GCN-LABEL: global_load_saddr_f32_natural_addressing_immoffset:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v[0:1], off
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_f32_natural_addressing_immoffset:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %voffset = load i32, i32 addrspace(1)* %voffset.ptr
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 128
  ; Byte-addressed pointer is reinterpreted as float* only for the final load.
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to float addrspace(1)*
  %load = load float, float addrspace(1)* %gep1.cast
  ret float %load
}

; Range is sufficiently restricted to push the shift into 32-bits.
; The offset load carries !range !0 (defined elsewhere in this file), and the
; checks show the scale by 4 done as a 32-bit shift (v_lshlrev_b32) feeding the
; saddr form.
define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v[0:1], off
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_f32_saddr_zext_vgpr_range:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !0
  %zext.offset = zext i32 %voffset to i64
  %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset
  %load = load float, float addrspace(1)* %gep
  ret float %load
}

; Range is sufficiently restricted to push the shift into 32-bits, with an imm offset.
; The extra GEP of 100 floats appears as offset:400 on the saddr load; the
; variable index is still scaled with a 32-bit shift.
define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_imm_offset(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v[0:1], off
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:400
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:400
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !0
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds float, float addrspace(1)* %gep0, i64 100
  %load = load float, float addrspace(1)* %gep1
  ret float %load
}

; Range is 1 beyond the limit where we can move the shift into 32-bits.
; The offset load carries !range !1 (defined outside this excerpt). The checks
; expect the fallback sequence instead of the saddr form: a 64-bit
; v_lshlrev_b64 by 2, a 64-bit add of s2/s3 with the carry passed through vcc
; (gfx900 first copies s3 into v2), and a load addressed by v[0:1] with `off`.
define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
; GFX9-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX10-NEXT:    v_add_co_u32 v0, vcc, s2, v0
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX11-NEXT:    v_add_co_u32 v0, vcc, s2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Only the range metadata (!1 instead of !0) differs from the in-range test.
  %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !1
  %zext.offset = zext i32 %voffset to i64
  %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset
  %load = load float, float addrspace(1)* %gep
  ret float %load
}
1480
; --------------------------------------------------------------------------------
; Stress loads of various types
; --------------------------------------------------------------------------------
1484
; Loads an i16 from %sbase + zext(%voffset) and returns its bits as half.
; The checks expect a single global_load_ushort (global_load_u16 on gfx1100)
; in saddr form: v0 as the offset and s[2:3] as the base.
define amdgpu_ps half @global_load_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ushort v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Byte address = %sbase + zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep0.cast
  ; Reinterpret the 16 loaded bits as half for the return value.
  %cast.load = bitcast i16 %load to half
  ret half %cast.load
}
1504
; Loads an i16 from %sbase + zext(%voffset) - 128 and returns its bits as half.
; The checks expect a single global_load_ushort (global_load_u16 on gfx1100)
; in saddr form with the constant folded into the instruction as offset:-128.
define amdgpu_ps half @global_load_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ushort v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Variable part of the address: zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  ; Constant part of the address: -128 bytes.
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep1.cast
  %cast.load = bitcast i16 %load to half
  ret half %cast.load
}
1525
; Loads a half from %sbase + zext(%voffset) and returns it directly.
; The checks expect a single global_load_ushort (global_load_u16 on gfx1100)
; in saddr form: v0 as the offset and s[2:3] as the base.
define amdgpu_ps half @global_load_saddr_f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f16:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ushort v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Byte address = %sbase + zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to half addrspace(1)*
  %load = load half, half addrspace(1)* %gep0.cast
  ret half %load
}
1544
; Loads a half from %sbase + zext(%voffset) - 128 and returns it directly.
; The checks expect a single global_load_ushort (global_load_u16 on gfx1100)
; in saddr form with the constant folded into the instruction as offset:-128.
define amdgpu_ps half @global_load_saddr_f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f16_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ushort v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_f16_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Variable part of the address: zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  ; Constant part of the address: -128 bytes.
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to half addrspace(1)*
  %load = load half, half addrspace(1)* %gep1.cast
  ret half %load
}
1564
; Loads an i32 from %sbase + zext(%voffset) and returns its bits as float.
; The checks expect a single global_load_dword (global_load_b32 on gfx1100)
; in saddr form: v0 as the offset and s[2:3] as the base.
define amdgpu_ps float @global_load_saddr_i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Byte address = %sbase + zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  %load = load i32, i32 addrspace(1)* %gep0.cast
  ; Reinterpret the 32 loaded bits as float for the return value.
  %cast.load = bitcast i32 %load to float
  ret float %cast.load
}
1584
; Loads an i32 from %sbase + zext(%voffset) - 128 and returns its bits as float.
; The checks expect a single global_load_dword (global_load_b32 on gfx1100)
; in saddr form with the constant folded into the instruction as offset:-128.
define amdgpu_ps float @global_load_saddr_i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i32_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i32_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Variable part of the address: zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  ; Constant part of the address: -128 bytes.
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %load = load i32, i32 addrspace(1)* %gep1.cast
  %cast.load = bitcast i32 %load to float
  ret float %cast.load
}
1605
; Loads a float from %sbase + zext(%voffset) and returns it directly.
; The checks expect a single global_load_dword (global_load_b32 on gfx1100)
; in saddr form: v0 as the offset and s[2:3] as the base.
define amdgpu_ps float @global_load_saddr_f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Byte address = %sbase + zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to float addrspace(1)*
  %load = load float, float addrspace(1)* %gep0.cast
  ret float %load
}
1624
; Loads a float from %sbase + zext(%voffset) - 128 and returns it directly.
; The checks expect a single global_load_dword (global_load_b32 on gfx1100)
; in saddr form with the constant folded into the instruction as offset:-128.
define amdgpu_ps float @global_load_saddr_f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f32_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_f32_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Variable part of the address: zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  ; Constant part of the address: -128 bytes.
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to float addrspace(1)*
  %load = load float, float addrspace(1)* %gep1.cast
  ret float %load
}
1644
; Loads a <2 x i16> from %sbase + zext(%voffset) and returns its bits as
; <2 x half>. The checks expect a single global_load_dword (global_load_b32
; on gfx1100) in saddr form: v0 as the offset and s[2:3] as the base.
define amdgpu_ps <2 x half> @global_load_saddr_v2i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Byte address = %sbase + zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i16> addrspace(1)*
  %load = load <2 x i16>, <2 x i16> addrspace(1)* %gep0.cast
  ; Reinterpret the two 16-bit lanes as halves for the return value.
  %cast.load = bitcast <2 x i16> %load to <2 x half>
  ret <2 x half> %cast.load
}
1664
; Loads a <2 x i16> from %sbase + zext(%voffset) - 128 and returns its bits as
; <2 x half>. The checks expect a single global_load_dword (global_load_b32
; on gfx1100) in saddr form with the constant folded in as offset:-128.
define amdgpu_ps <2 x half> @global_load_saddr_v2i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i16_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2i16_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Variable part of the address: zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  ; Constant part of the address: -128 bytes.
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i16> addrspace(1)*
  %load = load <2 x i16>, <2 x i16> addrspace(1)* %gep1.cast
  %cast.load = bitcast <2 x i16> %load to <2 x half>
  ret <2 x half> %cast.load
}
1685
; Loads a <2 x half> from %sbase + zext(%voffset) and returns it directly.
; The checks expect a single global_load_dword (global_load_b32 on gfx1100)
; in saddr form: v0 as the offset and s[2:3] as the base.
define amdgpu_ps <2 x half> @global_load_saddr_v2f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2f16:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Byte address = %sbase + zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x half> addrspace(1)*
  %load = load <2 x half>, <2 x half> addrspace(1)* %gep0.cast
  ret <2 x half> %load
}
1704
; Loads a <2 x half> from %sbase + zext(%voffset) - 128 and returns it
; directly. The checks expect a single global_load_dword (global_load_b32 on
; gfx1100) in saddr form with the constant folded in as offset:-128.
define amdgpu_ps <2 x half> @global_load_saddr_v2f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2f16_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2f16_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Variable part of the address: zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  ; Constant part of the address: -128 bytes.
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x half> addrspace(1)*
  %load = load <2 x half>, <2 x half> addrspace(1)* %gep1.cast
  ret <2 x half> %load
}
1724
; Loads an addrspace(3) pointer from %sbase + zext(%voffset) and returns its
; 32 bits as <2 x half>. The checks expect a single global_load_dword
; (global_load_b32 on gfx1100) in saddr form: v0 offset, s[2:3] base.
define amdgpu_ps <2 x half> @global_load_saddr_p3(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_p3:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_p3:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Byte address = %sbase + zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(3)* addrspace(1)*
  %load = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %gep0.cast
  ; Pointer -> i32 -> <2 x half>, so the loaded value can be returned.
  %cast.load0 = ptrtoint i8 addrspace(3)* %load to i32
  %cast.load1 = bitcast i32 %cast.load0 to <2 x half>
  ret <2 x half> %cast.load1
}
1745
; Loads an addrspace(3) pointer from %sbase + zext(%voffset) - 128 and returns
; its 32 bits as <2 x half>. The checks expect a single global_load_dword
; (global_load_b32 on gfx1100) in saddr form with offset:-128 folded in.
define amdgpu_ps <2 x half> @global_load_saddr_p3_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_p3_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_p3_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Variable part of the address: zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  ; Constant part of the address: -128 bytes.
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(3)* addrspace(1)*
  %load = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %gep1.cast
  ; Pointer -> i32 -> <2 x half>, so the loaded value can be returned.
  %cast.load0 = ptrtoint i8 addrspace(3)* %load to i32
  %cast.load1 = bitcast i32 %cast.load0 to <2 x half>
  ret <2 x half> %cast.load1
}
1767
; Loads a double from %sbase + zext(%voffset) and returns its bits as
; <2 x float>. The checks expect a single global_load_dwordx2
; (global_load_b64 on gfx1100) into v[0:1] in saddr form: v0 offset, s[2:3] base.
define amdgpu_ps <2 x float> @global_load_saddr_f64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f64:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_f64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Byte address = %sbase + zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to double addrspace(1)*
  %load = load double, double addrspace(1)* %gep0.cast
  ; Reinterpret the 64 loaded bits as two floats for the return value.
  %cast.load = bitcast double %load to <2 x float>
  ret <2 x float> %cast.load
}
1787
; Loads a double from %sbase + zext(%voffset) - 128 and returns its bits as
; <2 x float>. The checks expect a single global_load_dwordx2
; (global_load_b64 on gfx1100) in saddr form with offset:-128 folded in.
define amdgpu_ps <2 x float> @global_load_saddr_f64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f64_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_f64_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Variable part of the address: zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  ; Constant part of the address: -128 bytes.
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to double addrspace(1)*
  %load = load double, double addrspace(1)* %gep1.cast
  %cast.load = bitcast double %load to <2 x float>
  ret <2 x float> %cast.load
}
1808
; Loads an i64 from %sbase + zext(%voffset) and returns its bits as
; <2 x float>. The checks expect a single global_load_dwordx2
; (global_load_b64 on gfx1100) into v[0:1] in saddr form: v0 offset, s[2:3] base.
define amdgpu_ps <2 x float> @global_load_saddr_i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i64:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Byte address = %sbase + zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
  %load = load i64, i64 addrspace(1)* %gep0.cast
  ; Reinterpret the 64 loaded bits as two floats for the return value.
  %cast.load = bitcast i64 %load to <2 x float>
  ret <2 x float> %cast.load
}
1828
; Loads an i64 from %sbase + zext(%voffset) - 128 and returns its bits as
; <2 x float>. The checks expect a single global_load_dwordx2
; (global_load_b64 on gfx1100) in saddr form with offset:-128 folded in.
define amdgpu_ps <2 x float> @global_load_saddr_i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i64_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i64_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Variable part of the address: zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  ; Constant part of the address: -128 bytes.
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
  %load = load i64, i64 addrspace(1)* %gep1.cast
  %cast.load = bitcast i64 %load to <2 x float>
  ret <2 x float> %cast.load
}
1849
; Loads a <2 x float> from %sbase + zext(%voffset) and returns it directly.
; The checks expect a single global_load_dwordx2 (global_load_b64 on gfx1100)
; into v[0:1] in saddr form: v0 as the offset and s[2:3] as the base.
define amdgpu_ps <2 x float> @global_load_saddr_v2f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Byte address = %sbase + zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x float> addrspace(1)*
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep0.cast
  ret <2 x float> %load
}
1868
; Loads a <2 x float> from %sbase + zext(%voffset) - 128 and returns it
; directly. The checks expect a single global_load_dwordx2 (global_load_b64
; on gfx1100) in saddr form with the constant folded in as offset:-128.
define amdgpu_ps <2 x float> @global_load_saddr_v2f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2f32_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2f32_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Variable part of the address: zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  ; Constant part of the address: -128 bytes.
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x float> addrspace(1)*
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep1.cast
  ret <2 x float> %load
}
1888
; Loads a <2 x i32> from %sbase + zext(%voffset) and returns its bits as
; <2 x float>. The checks expect a single global_load_dwordx2
; (global_load_b64 on gfx1100) into v[0:1] in saddr form: v0 offset, s[2:3] base.
define amdgpu_ps <2 x float> @global_load_saddr_v2i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Byte address = %sbase + zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i32> addrspace(1)*
  %load = load <2 x i32>, <2 x i32> addrspace(1)* %gep0.cast
  ; Reinterpret the two 32-bit lanes as floats for the return value.
  %cast.load = bitcast <2 x i32> %load to <2 x float>
  ret <2 x float> %cast.load
}
1908
; Loads a <2 x i32> from %sbase + zext(%voffset) - 128 and returns its bits as
; <2 x float>. The checks expect a single global_load_dwordx2
; (global_load_b64 on gfx1100) in saddr form with offset:-128 folded in.
define amdgpu_ps <2 x float> @global_load_saddr_v2i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i32_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2i32_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Variable part of the address: zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  ; Constant part of the address: -128 bytes.
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i32> addrspace(1)*
  %load = load <2 x i32>, <2 x i32> addrspace(1)* %gep1.cast
  %cast.load = bitcast <2 x i32> %load to <2 x float>
  ret <2 x float> %cast.load
}
1929
; Loads a <4 x i16> from %sbase + zext(%voffset) and returns its bits as
; <2 x float>. The checks expect a single global_load_dwordx2
; (global_load_b64 on gfx1100) into v[0:1] in saddr form: v0 offset, s[2:3] base.
define amdgpu_ps <2 x float> @global_load_saddr_v4i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v4i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Byte address = %sbase + zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i16> addrspace(1)*
  %load = load <4 x i16>, <4 x i16> addrspace(1)* %gep0.cast
  ; Reinterpret the 64 loaded bits as two floats for the return value.
  %cast.load = bitcast <4 x i16> %load to <2 x float>
  ret <2 x float> %cast.load
}
1949
; Loads a <4 x i16> from %sbase + zext(%voffset) - 128 and returns its bits as
; <2 x float>. The checks expect a single global_load_dwordx2
; (global_load_b64 on gfx1100) in saddr form with offset:-128 folded in.
define amdgpu_ps <2 x float> @global_load_saddr_v4i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4i16_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v4i16_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Variable part of the address: zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  ; Constant part of the address: -128 bytes.
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i16> addrspace(1)*
  %load = load <4 x i16>, <4 x i16> addrspace(1)* %gep1.cast
  %cast.load = bitcast <4 x i16> %load to <2 x float>
  ret <2 x float> %cast.load
}
1970
; Loads a <4 x half> from %sbase + zext(%voffset) and returns its bits as
; <2 x float>. The checks expect a single global_load_dwordx2
; (global_load_b64 on gfx1100) into v[0:1] in saddr form: v0 offset, s[2:3] base.
define amdgpu_ps <2 x float> @global_load_saddr_v4f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4f16:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v4f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Byte address = %sbase + zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x half> addrspace(1)*
  %load = load <4 x half>, <4 x half> addrspace(1)* %gep0.cast
  ; Reinterpret the 64 loaded bits as two floats for the return value.
  %cast.load = bitcast <4 x half> %load to <2 x float>
  ret <2 x float> %cast.load
}
1990
; Loads a <4 x half> from %sbase + zext(%voffset) - 128 and returns its bits
; as <2 x float>. The checks expect a single global_load_dwordx2
; (global_load_b64 on gfx1100) in saddr form with offset:-128 folded in.
define amdgpu_ps <2 x float> @global_load_saddr_v4f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4f16_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v4f16_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Variable part of the address: zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  ; Constant part of the address: -128 bytes.
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x half> addrspace(1)*
  %load = load <4 x half>, <4 x half> addrspace(1)* %gep1.cast
  %cast.load = bitcast <4 x half> %load to <2 x float>
  ret <2 x float> %cast.load
}
2011
; Loads an addrspace(1) pointer from %sbase + zext(%voffset) and returns its
; 64 bits as <2 x float>. The checks expect a single global_load_dwordx2
; (global_load_b64 on gfx1100) into v[0:1] in saddr form: v0 offset, s[2:3] base.
define amdgpu_ps <2 x float> @global_load_saddr_p1(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_p1:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_p1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Byte address = %sbase + zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* addrspace(1)*
  %load = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %gep0.cast
  ; Pointer -> i64 -> <2 x float>, so the loaded value can be returned.
  %cast.load0 = ptrtoint i8 addrspace(1)* %load to i64
  %cast.load1 = bitcast i64 %cast.load0 to <2 x float>
  ret <2 x float> %cast.load1
}
2032
; Loads an addrspace(1) pointer from %sbase + zext(%voffset) - 128 and returns
; its 64 bits as <2 x float>. The checks expect a single global_load_dwordx2
; (global_load_b64 on gfx1100) in saddr form with offset:-128 folded in.
define amdgpu_ps <2 x float> @global_load_saddr_p1_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_p1_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_p1_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Variable part of the address: zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  ; Constant part of the address: -128 bytes.
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* addrspace(1)*
  %load = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %gep1.cast
  ; Pointer -> i64 -> <2 x float>, so the loaded value can be returned.
  %cast.load0 = ptrtoint i8 addrspace(1)* %load to i64
  %cast.load1 = bitcast i64 %cast.load0 to <2 x float>
  ret <2 x float> %cast.load1
}
2054
; Loads a <3 x float> from %sbase + zext(%voffset) and returns it directly.
; The checks expect a single global_load_dwordx3 (global_load_b96 on gfx1100)
; into v[0:2] in saddr form: v0 as the offset and s[2:3] as the base.
define amdgpu_ps <3 x float> @global_load_saddr_v3f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v3f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v3f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b96 v[0:2], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Byte address = %sbase + zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x float> addrspace(1)*
  %load = load <3 x float>, <3 x float> addrspace(1)* %gep0.cast
  ret <3 x float> %load
}
2073
; Loads a <3 x float> from %sbase + zext(%voffset) - 128 and returns it
; directly. The checks expect a single global_load_dwordx3 (global_load_b96
; on gfx1100) in saddr form with the constant folded in as offset:-128.
define amdgpu_ps <3 x float> @global_load_saddr_v3f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v3f32_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v3f32_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b96 v[0:2], v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  ; Variable part of the address: zero-extended 32-bit %voffset.
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  ; Constant part of the address: -128 bytes.
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <3 x float> addrspace(1)*
  %load = load <3 x float>, <3 x float> addrspace(1)* %gep1.cast
  ret <3 x float> %load
}
2093
; Loads <3 x i32> from %sbase + zext(%voffset) and returns the bits as <3 x float>.
; The checks expect the same single 96-bit load as the <3 x float> variant
; (global_load_dwordx3, or global_load_b96 under the GFX11 prefix).
2094define amdgpu_ps <3 x float> @global_load_saddr_v3i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2095; GCN-LABEL: global_load_saddr_v3i32:
2096; GCN:       ; %bb.0:
2097; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3]
2098; GCN-NEXT:    s_waitcnt vmcnt(0)
2099; GCN-NEXT:    ; return to shader part epilog
2100;
2101; GFX11-LABEL: global_load_saddr_v3i32:
2102; GFX11:       ; %bb.0:
2103; GFX11-NEXT:    global_load_b96 v[0:2], v0, s[2:3]
2104; GFX11-NEXT:    s_waitcnt vmcnt(0)
2105; GFX11-NEXT:    ; return to shader part epilog
2106  %zext.offset = zext i32 %voffset to i64
2107  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2108  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x i32> addrspace(1)*
2109  %load = load <3 x i32>, <3 x i32> addrspace(1)* %gep0.cast
  ; Reinterpret the loaded integers so the value matches the float return type.
2110  %cast.load = bitcast <3 x i32> %load to <3 x float>
2111  ret <3 x float> %cast.load
2112}
2113
; Loads <3 x i32> from %sbase + zext(%voffset) - 128 bytes, returned as <3 x float>.
; The checks expect the -128 constant to appear as offset:-128 on the 96-bit load.
2114define amdgpu_ps <3 x float> @global_load_saddr_v3i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2115; GCN-LABEL: global_load_saddr_v3i32_immneg128:
2116; GCN:       ; %bb.0:
2117; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
2118; GCN-NEXT:    s_waitcnt vmcnt(0)
2119; GCN-NEXT:    ; return to shader part epilog
2120;
2121; GFX11-LABEL: global_load_saddr_v3i32_immneg128:
2122; GFX11:       ; %bb.0:
2123; GFX11-NEXT:    global_load_b96 v[0:2], v0, s[2:3] offset:-128
2124; GFX11-NEXT:    s_waitcnt vmcnt(0)
2125; GFX11-NEXT:    ; return to shader part epilog
  ; Two chained byte GEPs: the variable offset first, then the constant -128.
2126  %zext.offset = zext i32 %voffset to i64
2127  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2128  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2129  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <3 x i32> addrspace(1)*
2130  %load = load <3 x i32>, <3 x i32> addrspace(1)* %gep1.cast
2131  %cast.load = bitcast <3 x i32> %load to <3 x float>
2132  ret <3 x float> %cast.load
2133}
2134
; Loads <6 x half> (96 bits) from %sbase + zext(%voffset).
; The checks expect one 96-bit load into v[0:2]: global_load_dwordx3, or
; global_load_b96 under the GFX11 prefix.
2135define amdgpu_ps <6 x half> @global_load_saddr_v6f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2136; GCN-LABEL: global_load_saddr_v6f16:
2137; GCN:       ; %bb.0:
2138; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3]
2139; GCN-NEXT:    s_waitcnt vmcnt(0)
2140; GCN-NEXT:    ; return to shader part epilog
2141;
2142; GFX11-LABEL: global_load_saddr_v6f16:
2143; GFX11:       ; %bb.0:
2144; GFX11-NEXT:    global_load_b96 v[0:2], v0, s[2:3]
2145; GFX11-NEXT:    s_waitcnt vmcnt(0)
2146; GFX11-NEXT:    ; return to shader part epilog
  ; Byte-wise GEP on the i8 base, then reinterpret the pointer for the vector load.
2147  %zext.offset = zext i32 %voffset to i64
2148  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2149  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <6 x half> addrspace(1)*
2150  %load = load <6 x half>, <6 x half> addrspace(1)* %gep0.cast
2151  ret <6 x half> %load
2152}
2153
; Loads <6 x half> from %sbase + zext(%voffset) - 128 bytes.
; The checks expect the -128 constant to appear as offset:-128 on the 96-bit load.
2154define amdgpu_ps <6 x half> @global_load_saddr_v6f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2155; GCN-LABEL: global_load_saddr_v6f16_immneg128:
2156; GCN:       ; %bb.0:
2157; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
2158; GCN-NEXT:    s_waitcnt vmcnt(0)
2159; GCN-NEXT:    ; return to shader part epilog
2160;
2161; GFX11-LABEL: global_load_saddr_v6f16_immneg128:
2162; GFX11:       ; %bb.0:
2163; GFX11-NEXT:    global_load_b96 v[0:2], v0, s[2:3] offset:-128
2164; GFX11-NEXT:    s_waitcnt vmcnt(0)
2165; GFX11-NEXT:    ; return to shader part epilog
  ; Two chained byte GEPs: the variable offset first, then the constant -128.
2166  %zext.offset = zext i32 %voffset to i64
2167  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2168  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2169  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <6 x half> addrspace(1)*
2170  %load = load <6 x half>, <6 x half> addrspace(1)* %gep1.cast
2171  ret <6 x half> %load
2172}
2173
; Loads <4 x float> from %sbase + zext(%voffset).
; The checks expect one 128-bit load into v[0:3]: global_load_dwordx4, or
; global_load_b128 under the GFX11 prefix, with v0 and s[2:3] as address operands.
2174define amdgpu_ps <4 x float> @global_load_saddr_v4f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2175; GCN-LABEL: global_load_saddr_v4f32:
2176; GCN:       ; %bb.0:
2177; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
2178; GCN-NEXT:    s_waitcnt vmcnt(0)
2179; GCN-NEXT:    ; return to shader part epilog
2180;
2181; GFX11-LABEL: global_load_saddr_v4f32:
2182; GFX11:       ; %bb.0:
2183; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
2184; GFX11-NEXT:    s_waitcnt vmcnt(0)
2185; GFX11-NEXT:    ; return to shader part epilog
  ; Byte-wise GEP on the i8 base, then reinterpret the pointer for the vector load.
2186  %zext.offset = zext i32 %voffset to i64
2187  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2188  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x float> addrspace(1)*
2189  %load = load <4 x float>, <4 x float> addrspace(1)* %gep0.cast
2190  ret <4 x float> %load
2191}
2192
; Loads <4 x float> from %sbase + zext(%voffset) - 128 bytes.
; The checks expect the -128 constant to appear as offset:-128 on the 128-bit load.
2193define amdgpu_ps <4 x float> @global_load_saddr_v4f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2194; GCN-LABEL: global_load_saddr_v4f32_immneg128:
2195; GCN:       ; %bb.0:
2196; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
2197; GCN-NEXT:    s_waitcnt vmcnt(0)
2198; GCN-NEXT:    ; return to shader part epilog
2199;
2200; GFX11-LABEL: global_load_saddr_v4f32_immneg128:
2201; GFX11:       ; %bb.0:
2202; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
2203; GFX11-NEXT:    s_waitcnt vmcnt(0)
2204; GFX11-NEXT:    ; return to shader part epilog
  ; Two chained byte GEPs: the variable offset first, then the constant -128.
2205  %zext.offset = zext i32 %voffset to i64
2206  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2207  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2208  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x float> addrspace(1)*
2209  %load = load <4 x float>, <4 x float> addrspace(1)* %gep1.cast
2210  ret <4 x float> %load
2211}
2212
; Loads <4 x i32> from %sbase + zext(%voffset) and returns the bits as <4 x float>.
; The checks expect one 128-bit load: global_load_dwordx4, or global_load_b128
; under the GFX11 prefix.
2213define amdgpu_ps <4 x float> @global_load_saddr_v4i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2214; GCN-LABEL: global_load_saddr_v4i32:
2215; GCN:       ; %bb.0:
2216; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
2217; GCN-NEXT:    s_waitcnt vmcnt(0)
2218; GCN-NEXT:    ; return to shader part epilog
2219;
2220; GFX11-LABEL: global_load_saddr_v4i32:
2221; GFX11:       ; %bb.0:
2222; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
2223; GFX11-NEXT:    s_waitcnt vmcnt(0)
2224; GFX11-NEXT:    ; return to shader part epilog
2225  %zext.offset = zext i32 %voffset to i64
2226  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2227  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i32> addrspace(1)*
2228  %load = load <4 x i32>, <4 x i32> addrspace(1)* %gep0.cast
  ; Reinterpret the loaded integers so the value matches the float return type.
2229  %cast.load = bitcast <4 x i32> %load to <4 x float>
2230  ret <4 x float> %cast.load
2231}
2232
; Loads <4 x i32> from %sbase + zext(%voffset) - 128 bytes, returned as <4 x float>.
; The checks expect the -128 constant to appear as offset:-128 on the 128-bit load.
2233define amdgpu_ps <4 x float> @global_load_saddr_v4i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2234; GCN-LABEL: global_load_saddr_v4i32_immneg128:
2235; GCN:       ; %bb.0:
2236; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
2237; GCN-NEXT:    s_waitcnt vmcnt(0)
2238; GCN-NEXT:    ; return to shader part epilog
2239;
2240; GFX11-LABEL: global_load_saddr_v4i32_immneg128:
2241; GFX11:       ; %bb.0:
2242; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
2243; GFX11-NEXT:    s_waitcnt vmcnt(0)
2244; GFX11-NEXT:    ; return to shader part epilog
  ; Two chained byte GEPs: the variable offset first, then the constant -128.
2245  %zext.offset = zext i32 %voffset to i64
2246  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2247  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2248  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i32> addrspace(1)*
2249  %load = load <4 x i32>, <4 x i32> addrspace(1)* %gep1.cast
2250  %cast.load = bitcast <4 x i32> %load to <4 x float>
2251  ret <4 x float> %cast.load
2252}
2253
; Loads <2 x i64> from %sbase + zext(%voffset) and returns the bits as <4 x float>.
; The checks expect one 128-bit load: global_load_dwordx4, or global_load_b128
; under the GFX11 prefix.
2254define amdgpu_ps <4 x float> @global_load_saddr_v2i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2255; GCN-LABEL: global_load_saddr_v2i64:
2256; GCN:       ; %bb.0:
2257; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
2258; GCN-NEXT:    s_waitcnt vmcnt(0)
2259; GCN-NEXT:    ; return to shader part epilog
2260;
2261; GFX11-LABEL: global_load_saddr_v2i64:
2262; GFX11:       ; %bb.0:
2263; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
2264; GFX11-NEXT:    s_waitcnt vmcnt(0)
2265; GFX11-NEXT:    ; return to shader part epilog
2266  %zext.offset = zext i32 %voffset to i64
2267  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2268  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i64> addrspace(1)*
2269  %load = load <2 x i64>, <2 x i64> addrspace(1)* %gep0.cast
  ; Same 128 bits, viewed as four floats to match the return type.
2270  %cast.load = bitcast <2 x i64> %load to <4 x float>
2271  ret <4 x float> %cast.load
2272}
2273
; Loads <2 x i64> from %sbase + zext(%voffset) - 128 bytes, returned as <4 x float>.
; The checks expect the -128 constant to appear as offset:-128 on the 128-bit load.
2274define amdgpu_ps <4 x float> @global_load_saddr_v2i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2275; GCN-LABEL: global_load_saddr_v2i64_immneg128:
2276; GCN:       ; %bb.0:
2277; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
2278; GCN-NEXT:    s_waitcnt vmcnt(0)
2279; GCN-NEXT:    ; return to shader part epilog
2280;
2281; GFX11-LABEL: global_load_saddr_v2i64_immneg128:
2282; GFX11:       ; %bb.0:
2283; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
2284; GFX11-NEXT:    s_waitcnt vmcnt(0)
2285; GFX11-NEXT:    ; return to shader part epilog
  ; Two chained byte GEPs: the variable offset first, then the constant -128.
2286  %zext.offset = zext i32 %voffset to i64
2287  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2288  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2289  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i64> addrspace(1)*
2290  %load = load <2 x i64>, <2 x i64> addrspace(1)* %gep1.cast
2291  %cast.load = bitcast <2 x i64> %load to <4 x float>
2292  ret <4 x float> %cast.load
2293}
2294
; Loads a scalar i128 from %sbase + zext(%voffset) and returns the bits as <4 x float>.
; The checks expect one 128-bit load: global_load_dwordx4, or global_load_b128
; under the GFX11 prefix.
2295define amdgpu_ps <4 x float> @global_load_saddr_i128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2296; GCN-LABEL: global_load_saddr_i128:
2297; GCN:       ; %bb.0:
2298; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
2299; GCN-NEXT:    s_waitcnt vmcnt(0)
2300; GCN-NEXT:    ; return to shader part epilog
2301;
2302; GFX11-LABEL: global_load_saddr_i128:
2303; GFX11:       ; %bb.0:
2304; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
2305; GFX11-NEXT:    s_waitcnt vmcnt(0)
2306; GFX11-NEXT:    ; return to shader part epilog
2307  %zext.offset = zext i32 %voffset to i64
2308  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2309  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i128 addrspace(1)*
2310  %load = load i128, i128 addrspace(1)* %gep0.cast
  ; Same 128 bits, viewed as four floats to match the return type.
2311  %cast.load = bitcast i128 %load to <4 x float>
2312  ret <4 x float> %cast.load
2313}
2314
; Loads a scalar i128 from %sbase + zext(%voffset) - 128 bytes, returned as <4 x float>.
; The checks expect the -128 constant to appear as offset:-128 on the 128-bit load.
2315define amdgpu_ps <4 x float> @global_load_saddr_i128_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2316; GCN-LABEL: global_load_saddr_i128_immneg128:
2317; GCN:       ; %bb.0:
2318; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
2319; GCN-NEXT:    s_waitcnt vmcnt(0)
2320; GCN-NEXT:    ; return to shader part epilog
2321;
2322; GFX11-LABEL: global_load_saddr_i128_immneg128:
2323; GFX11:       ; %bb.0:
2324; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
2325; GFX11-NEXT:    s_waitcnt vmcnt(0)
2326; GFX11-NEXT:    ; return to shader part epilog
  ; Two chained byte GEPs: the variable offset first, then the constant -128.
2327  %zext.offset = zext i32 %voffset to i64
2328  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2329  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2330  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i128 addrspace(1)*
2331  %load = load i128, i128 addrspace(1)* %gep1.cast
2332  %cast.load = bitcast i128 %load to <4 x float>
2333  ret <4 x float> %cast.load
2334}
2335
; Loads a vector of two addrspace(1) pointers from %sbase + zext(%voffset) and
; returns the bits as <4 x float>.
; The checks expect one 128-bit load: global_load_dwordx4, or global_load_b128
; under the GFX11 prefix.
2336define amdgpu_ps <4 x float> @global_load_saddr_v2p1(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2337; GCN-LABEL: global_load_saddr_v2p1:
2338; GCN:       ; %bb.0:
2339; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
2340; GCN-NEXT:    s_waitcnt vmcnt(0)
2341; GCN-NEXT:    ; return to shader part epilog
2342;
2343; GFX11-LABEL: global_load_saddr_v2p1:
2344; GFX11:       ; %bb.0:
2345; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
2346; GFX11-NEXT:    s_waitcnt vmcnt(0)
2347; GFX11-NEXT:    ; return to shader part epilog
2348  %zext.offset = zext i32 %voffset to i64
2349  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2350  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i8 addrspace(1)*> addrspace(1)*
2351  %load = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %gep0.cast
  ; Pointers are converted with ptrtoint to <2 x i64> first, then bitcast to floats.
2352  %cast.load0 = ptrtoint <2 x i8 addrspace(1)*> %load to <2 x i64>
2353  %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float>
2354  ret <4 x float> %cast.load1
2355}
2356
; Loads a vector of two addrspace(1) pointers from %sbase + zext(%voffset) - 128
; bytes and returns the bits as <4 x float>.
; The checks expect the -128 constant to appear as offset:-128 on the 128-bit load.
2357define amdgpu_ps <4 x float> @global_load_saddr_v2p1_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2358; GCN-LABEL: global_load_saddr_v2p1_immneg128:
2359; GCN:       ; %bb.0:
2360; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
2361; GCN-NEXT:    s_waitcnt vmcnt(0)
2362; GCN-NEXT:    ; return to shader part epilog
2363;
2364; GFX11-LABEL: global_load_saddr_v2p1_immneg128:
2365; GFX11:       ; %bb.0:
2366; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
2367; GFX11-NEXT:    s_waitcnt vmcnt(0)
2368; GFX11-NEXT:    ; return to shader part epilog
  ; Two chained byte GEPs: the variable offset first, then the constant -128.
2369  %zext.offset = zext i32 %voffset to i64
2370  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2371  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2372  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i8 addrspace(1)*> addrspace(1)*
2373  %load = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %gep1.cast
  ; Pointers are converted with ptrtoint to <2 x i64> first, then bitcast to floats.
2374  %cast.load0 = ptrtoint <2 x i8 addrspace(1)*> %load to <2 x i64>
2375  %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float>
2376  ret <4 x float> %cast.load1
2377}
2378
; Loads a vector of four addrspace(3) pointers from global memory at
; %sbase + zext(%voffset) and returns the bits as <4 x float>.
; The checks expect one 128-bit load: global_load_dwordx4, or global_load_b128
; under the GFX11 prefix.
2379define amdgpu_ps <4 x float> @global_load_saddr_v4p3(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2380; GCN-LABEL: global_load_saddr_v4p3:
2381; GCN:       ; %bb.0:
2382; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
2383; GCN-NEXT:    s_waitcnt vmcnt(0)
2384; GCN-NEXT:    ; return to shader part epilog
2385;
2386; GFX11-LABEL: global_load_saddr_v4p3:
2387; GFX11:       ; %bb.0:
2388; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
2389; GFX11-NEXT:    s_waitcnt vmcnt(0)
2390; GFX11-NEXT:    ; return to shader part epilog
2391  %zext.offset = zext i32 %voffset to i64
2392  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2393  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i8 addrspace(3)*> addrspace(1)*
2394  %load = load <4 x i8 addrspace(3)*>, <4 x i8 addrspace(3)*> addrspace(1)* %gep0.cast
  ; The addrspace(3) pointers are converted to i32 here, giving <4 x i32> before the float bitcast.
2395  %cast.load0 = ptrtoint <4 x i8 addrspace(3)*> %load to <4 x i32>
2396  %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float>
2397  ret <4 x float> %cast.load1
2398}
2399
; Loads a vector of four addrspace(3) pointers from global memory at
; %sbase + zext(%voffset) - 128 bytes and returns the bits as <4 x float>.
; The checks expect the -128 constant to appear as offset:-128 on the 128-bit load.
2400define amdgpu_ps <4 x float> @global_load_saddr_v4p3_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2401; GCN-LABEL: global_load_saddr_v4p3_immneg128:
2402; GCN:       ; %bb.0:
2403; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
2404; GCN-NEXT:    s_waitcnt vmcnt(0)
2405; GCN-NEXT:    ; return to shader part epilog
2406;
2407; GFX11-LABEL: global_load_saddr_v4p3_immneg128:
2408; GFX11:       ; %bb.0:
2409; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
2410; GFX11-NEXT:    s_waitcnt vmcnt(0)
2411; GFX11-NEXT:    ; return to shader part epilog
  ; Two chained byte GEPs: the variable offset first, then the constant -128.
2412  %zext.offset = zext i32 %voffset to i64
2413  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2414  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2415  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i8 addrspace(3)*> addrspace(1)*
2416  %load = load <4 x i8 addrspace(3)*>, <4 x i8 addrspace(3)*> addrspace(1)* %gep1.cast
  ; The addrspace(3) pointers are converted to i32 here, giving <4 x i32> before the float bitcast.
2417  %cast.load0 = ptrtoint <4 x i8 addrspace(3)*> %load to <4 x i32>
2418  %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float>
2419  ret <4 x float> %cast.load1
2420}
2421
2422; --------------------------------------------------------------------------------
2423; Extending loads
2424; --------------------------------------------------------------------------------
2425
; Loads an i8 from %sbase + zext(%voffset) and sign-extends it to i32.
; The checks expect the extension to be part of the load itself
; (global_load_sbyte, or global_load_i8 under the GFX11 prefix), with no
; separate extend instruction.
2426define amdgpu_ps float @global_sextload_saddr_i8(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2427; GCN-LABEL: global_sextload_saddr_i8:
2428; GCN:       ; %bb.0:
2429; GCN-NEXT:    global_load_sbyte v0, v0, s[2:3]
2430; GCN-NEXT:    s_waitcnt vmcnt(0)
2431; GCN-NEXT:    ; return to shader part epilog
2432;
2433; GFX11-LABEL: global_sextload_saddr_i8:
2434; GFX11:       ; %bb.0:
2435; GFX11-NEXT:    global_load_i8 v0, v0, s[2:3]
2436; GFX11-NEXT:    s_waitcnt vmcnt(0)
2437; GFX11-NEXT:    ; return to shader part epilog
2438  %zext.offset = zext i32 %voffset to i64
2439  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2440  %load = load i8, i8 addrspace(1)* %gep0
2441  %sextload = sext i8 %load to i32
  ; Reinterpret the extended integer as float to match the return type.
2442  %cast.load = bitcast i32 %sextload to float
2443  ret float %cast.load
2444}
2445
; Sign-extending i8 load from %sbase + zext(%voffset) - 128 bytes.
; The checks expect the -128 constant to appear as offset:-128 on the
; sign-extending byte load.
2446define amdgpu_ps float @global_sextload_saddr_i8_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2447; GCN-LABEL: global_sextload_saddr_i8_immneg128:
2448; GCN:       ; %bb.0:
2449; GCN-NEXT:    global_load_sbyte v0, v0, s[2:3] offset:-128
2450; GCN-NEXT:    s_waitcnt vmcnt(0)
2451; GCN-NEXT:    ; return to shader part epilog
2452;
2453; GFX11-LABEL: global_sextload_saddr_i8_immneg128:
2454; GFX11:       ; %bb.0:
2455; GFX11-NEXT:    global_load_i8 v0, v0, s[2:3] offset:-128
2456; GFX11-NEXT:    s_waitcnt vmcnt(0)
2457; GFX11-NEXT:    ; return to shader part epilog
  ; Two chained byte GEPs: the variable offset first, then the constant -128.
2458  %zext.offset = zext i32 %voffset to i64
2459  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2460  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2461  %load = load i8, i8 addrspace(1)* %gep1
2462  %sextload = sext i8 %load to i32
2463  %cast.load = bitcast i32 %sextload to float
2464  ret float %cast.load
2465}
2466
; Loads an i16 from %sbase + zext(%voffset) and sign-extends it to i32.
; The checks expect the extension to be part of the load itself
; (global_load_sshort, or global_load_i16 under the GFX11 prefix).
2467define amdgpu_ps float @global_sextload_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2468; GCN-LABEL: global_sextload_saddr_i16:
2469; GCN:       ; %bb.0:
2470; GCN-NEXT:    global_load_sshort v0, v0, s[2:3]
2471; GCN-NEXT:    s_waitcnt vmcnt(0)
2472; GCN-NEXT:    ; return to shader part epilog
2473;
2474; GFX11-LABEL: global_sextload_saddr_i16:
2475; GFX11:       ; %bb.0:
2476; GFX11-NEXT:    global_load_i16 v0, v0, s[2:3]
2477; GFX11-NEXT:    s_waitcnt vmcnt(0)
2478; GFX11-NEXT:    ; return to shader part epilog
2479  %zext.offset = zext i32 %voffset to i64
2480  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2481  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
2482  %load = load i16, i16 addrspace(1)* %gep0.cast
2483  %sextload = sext i16 %load to i32
  ; Reinterpret the extended integer as float to match the return type.
2484  %cast.load = bitcast i32 %sextload to float
2485  ret float %cast.load
2486}
2487
; Sign-extending i16 load from %sbase + zext(%voffset) - 128 bytes.
; The checks expect the -128 constant to appear as offset:-128 on the
; sign-extending short load.
2488define amdgpu_ps float @global_sextload_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2489; GCN-LABEL: global_sextload_saddr_i16_immneg128:
2490; GCN:       ; %bb.0:
2491; GCN-NEXT:    global_load_sshort v0, v0, s[2:3] offset:-128
2492; GCN-NEXT:    s_waitcnt vmcnt(0)
2493; GCN-NEXT:    ; return to shader part epilog
2494;
2495; GFX11-LABEL: global_sextload_saddr_i16_immneg128:
2496; GFX11:       ; %bb.0:
2497; GFX11-NEXT:    global_load_i16 v0, v0, s[2:3] offset:-128
2498; GFX11-NEXT:    s_waitcnt vmcnt(0)
2499; GFX11-NEXT:    ; return to shader part epilog
  ; Two chained byte GEPs: the variable offset first, then the constant -128.
2500  %zext.offset = zext i32 %voffset to i64
2501  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2502  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2503  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
2504  %load = load i16, i16 addrspace(1)* %gep1.cast
2505  %sextload = sext i16 %load to i32
2506  %cast.load = bitcast i32 %sextload to float
2507  ret float %cast.load
2508}
2509
; Loads an i8 from %sbase + zext(%voffset) and zero-extends it to i32.
; The checks expect the extension to be part of the load itself
; (global_load_ubyte, or global_load_u8 under the GFX11 prefix).
2510define amdgpu_ps float @global_zextload_saddr_i8(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2511; GCN-LABEL: global_zextload_saddr_i8:
2512; GCN:       ; %bb.0:
2513; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
2514; GCN-NEXT:    s_waitcnt vmcnt(0)
2515; GCN-NEXT:    ; return to shader part epilog
2516;
2517; GFX11-LABEL: global_zextload_saddr_i8:
2518; GFX11:       ; %bb.0:
2519; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
2520; GFX11-NEXT:    s_waitcnt vmcnt(0)
2521; GFX11-NEXT:    ; return to shader part epilog
2522  %zext.offset = zext i32 %voffset to i64
2523  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2524  %load = load i8, i8 addrspace(1)* %gep0
2525  %zextload = zext i8 %load to i32
  ; Reinterpret the extended integer as float to match the return type.
2526  %cast.load = bitcast i32 %zextload to float
2527  ret float %cast.load
2528}
2529
; Zero-extending i8 load from %sbase + zext(%voffset) - 128 bytes.
; The checks expect the -128 constant to appear as offset:-128 on the
; zero-extending byte load.
2530define amdgpu_ps float @global_zextload_saddr_i8_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2531; GCN-LABEL: global_zextload_saddr_i8_immneg128:
2532; GCN:       ; %bb.0:
2533; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-128
2534; GCN-NEXT:    s_waitcnt vmcnt(0)
2535; GCN-NEXT:    ; return to shader part epilog
2536;
2537; GFX11-LABEL: global_zextload_saddr_i8_immneg128:
2538; GFX11:       ; %bb.0:
2539; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-128
2540; GFX11-NEXT:    s_waitcnt vmcnt(0)
2541; GFX11-NEXT:    ; return to shader part epilog
  ; Two chained byte GEPs: the variable offset first, then the constant -128.
2542  %zext.offset = zext i32 %voffset to i64
2543  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2544  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2545  %load = load i8, i8 addrspace(1)* %gep1
2546  %zextload = zext i8 %load to i32
2547  %cast.load = bitcast i32 %zextload to float
2548  ret float %cast.load
2549}
2550
; Loads an i16 from %sbase + zext(%voffset) and zero-extends it to i32.
; The checks expect the extension to be part of the load itself
; (global_load_ushort, or global_load_u16 under the GFX11 prefix).
2551define amdgpu_ps float @global_zextload_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2552; GCN-LABEL: global_zextload_saddr_i16:
2553; GCN:       ; %bb.0:
2554; GCN-NEXT:    global_load_ushort v0, v0, s[2:3]
2555; GCN-NEXT:    s_waitcnt vmcnt(0)
2556; GCN-NEXT:    ; return to shader part epilog
2557;
2558; GFX11-LABEL: global_zextload_saddr_i16:
2559; GFX11:       ; %bb.0:
2560; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3]
2561; GFX11-NEXT:    s_waitcnt vmcnt(0)
2562; GFX11-NEXT:    ; return to shader part epilog
2563  %zext.offset = zext i32 %voffset to i64
2564  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2565  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
2566  %load = load i16, i16 addrspace(1)* %gep0.cast
2567  %zextload = zext i16 %load to i32
  ; Reinterpret the extended integer as float to match the return type.
2568  %cast.load = bitcast i32 %zextload to float
2569  ret float %cast.load
2570}
2571
; Zero-extending i16 load from %sbase + zext(%voffset) - 128 bytes.
; The checks expect the -128 constant to appear as offset:-128 on the
; zero-extending short load.
2572define amdgpu_ps float @global_zextload_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2573; GCN-LABEL: global_zextload_saddr_i16_immneg128:
2574; GCN:       ; %bb.0:
2575; GCN-NEXT:    global_load_ushort v0, v0, s[2:3] offset:-128
2576; GCN-NEXT:    s_waitcnt vmcnt(0)
2577; GCN-NEXT:    ; return to shader part epilog
2578;
2579; GFX11-LABEL: global_zextload_saddr_i16_immneg128:
2580; GFX11:       ; %bb.0:
2581; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
2582; GFX11-NEXT:    s_waitcnt vmcnt(0)
2583; GFX11-NEXT:    ; return to shader part epilog
  ; Two chained byte GEPs: the variable offset first, then the constant -128.
2584  %zext.offset = zext i32 %voffset to i64
2585  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2586  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2587  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
2588  %load = load i16, i16 addrspace(1)* %gep1.cast
2589  %zextload = zext i16 %load to i32
2590  %cast.load = bitcast i32 %zextload to float
2591  ret float %cast.load
2592}
2593
2594; --------------------------------------------------------------------------------
2595; Atomic load
2596; --------------------------------------------------------------------------------
2597
; seq_cst atomic i32 load (align 4) from %sbase + zext(%voffset).
; Output differs per target, so the checks use the GFX9, GFX10 and GFX11
; prefixes separately. All three expect a wait before the load, the glc bit on
; the load (glc dlc under GFX10) and a cache invalidate after it: buffer_wbinvl1
; under GFX9, buffer_gl0_inv plus buffer_gl1_inv under GFX10 and GFX11.
2598define amdgpu_ps float @atomic_global_load_saddr_i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2599; GFX9-LABEL: atomic_global_load_saddr_i32:
2600; GFX9:       ; %bb.0:
2601; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2602; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
2603; GFX9-NEXT:    s_waitcnt vmcnt(0)
2604; GFX9-NEXT:    buffer_wbinvl1
2605; GFX9-NEXT:    ; return to shader part epilog
2606;
2607; GFX10-LABEL: atomic_global_load_saddr_i32:
2608; GFX10:       ; %bb.0:
2609; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2610; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2611; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
2612; GFX10-NEXT:    s_waitcnt vmcnt(0)
2613; GFX10-NEXT:    buffer_gl0_inv
2614; GFX10-NEXT:    buffer_gl1_inv
2615; GFX10-NEXT:    ; return to shader part epilog
2616;
2617; GFX11-LABEL: atomic_global_load_saddr_i32:
2618; GFX11:       ; %bb.0:
2619; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2620; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2621; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] glc
2622; GFX11-NEXT:    s_waitcnt vmcnt(0)
2623; GFX11-NEXT:    buffer_gl0_inv
2624; GFX11-NEXT:    buffer_gl1_inv
2625; GFX11-NEXT:    ; return to shader part epilog
2626  %zext.offset = zext i32 %voffset to i64
2627  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2628  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  ; The seq_cst ordering on this load is what the wait and invalidate checks above correspond to.
2629  %load = load atomic i32, i32 addrspace(1)* %gep0.cast seq_cst, align 4
2630  %cast.load = bitcast i32 %load to float
2631  ret float %cast.load
2632}
2633
; seq_cst atomic i32 load (align 4) from %sbase + zext(%voffset) - 128 bytes.
; Same wait / glc / invalidate expectations as the variant without the constant
; offset, and the -128 is expected as offset:-128 on the atomic load itself.
2634define amdgpu_ps float @atomic_global_load_saddr_i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2635; GFX9-LABEL: atomic_global_load_saddr_i32_immneg128:
2636; GFX9:       ; %bb.0:
2637; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2638; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128 glc
2639; GFX9-NEXT:    s_waitcnt vmcnt(0)
2640; GFX9-NEXT:    buffer_wbinvl1
2641; GFX9-NEXT:    ; return to shader part epilog
2642;
2643; GFX10-LABEL: atomic_global_load_saddr_i32_immneg128:
2644; GFX10:       ; %bb.0:
2645; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2646; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2647; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128 glc dlc
2648; GFX10-NEXT:    s_waitcnt vmcnt(0)
2649; GFX10-NEXT:    buffer_gl0_inv
2650; GFX10-NEXT:    buffer_gl1_inv
2651; GFX10-NEXT:    ; return to shader part epilog
2652;
2653; GFX11-LABEL: atomic_global_load_saddr_i32_immneg128:
2654; GFX11:       ; %bb.0:
2655; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2656; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2657; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128 glc
2658; GFX11-NEXT:    s_waitcnt vmcnt(0)
2659; GFX11-NEXT:    buffer_gl0_inv
2660; GFX11-NEXT:    buffer_gl1_inv
2661; GFX11-NEXT:    ; return to shader part epilog
  ; Two chained byte GEPs: the variable offset first, then the constant -128.
2662  %zext.offset = zext i32 %voffset to i64
2663  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2664  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2665  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
2666  %load = load atomic i32, i32 addrspace(1)* %gep1.cast seq_cst, align 4
2667  %cast.load = bitcast i32 %load to float
2668  ret float %cast.load
2669}
2670
; seq_cst atomic i64 load (align 8) from %sbase + zext(%voffset), returned as
; <2 x float>.
; Per-target checks expect a wait before the load, a 64-bit load into v[0:1]
; with the glc bit (glc dlc under GFX10), and a cache invalidate after it:
; buffer_wbinvl1 under GFX9, buffer_gl0_inv plus buffer_gl1_inv under GFX10 and GFX11.
2671define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2672; GFX9-LABEL: atomic_global_load_saddr_i64:
2673; GFX9:       ; %bb.0:
2674; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2675; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] glc
2676; GFX9-NEXT:    s_waitcnt vmcnt(0)
2677; GFX9-NEXT:    buffer_wbinvl1
2678; GFX9-NEXT:    ; return to shader part epilog
2679;
2680; GFX10-LABEL: atomic_global_load_saddr_i64:
2681; GFX10:       ; %bb.0:
2682; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2683; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2684; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] glc dlc
2685; GFX10-NEXT:    s_waitcnt vmcnt(0)
2686; GFX10-NEXT:    buffer_gl0_inv
2687; GFX10-NEXT:    buffer_gl1_inv
2688; GFX10-NEXT:    ; return to shader part epilog
2689;
2690; GFX11-LABEL: atomic_global_load_saddr_i64:
2691; GFX11:       ; %bb.0:
2692; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2693; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2694; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] glc
2695; GFX11-NEXT:    s_waitcnt vmcnt(0)
2696; GFX11-NEXT:    buffer_gl0_inv
2697; GFX11-NEXT:    buffer_gl1_inv
2698; GFX11-NEXT:    ; return to shader part epilog
2699  %zext.offset = zext i32 %voffset to i64
2700  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2701  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
  ; The seq_cst ordering on this load is what the wait and invalidate checks above correspond to.
2702  %load = load atomic i64, i64 addrspace(1)* %gep0.cast seq_cst, align 8
2703  %cast.load = bitcast i64 %load to <2 x float>
2704  ret <2 x float> %cast.load
2705}
2706
; seq_cst atomic i64 load (align 8) from %sbase + zext(%voffset) - 128 bytes,
; returned as <2 x float>.
; Same wait / glc / invalidate expectations as the variant without the constant
; offset, and the -128 is expected as offset:-128 on the atomic load itself.
2707define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2708; GFX9-LABEL: atomic_global_load_saddr_i64_immneg128:
2709; GFX9:       ; %bb.0:
2710; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2711; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 glc
2712; GFX9-NEXT:    s_waitcnt vmcnt(0)
2713; GFX9-NEXT:    buffer_wbinvl1
2714; GFX9-NEXT:    ; return to shader part epilog
2715;
2716; GFX10-LABEL: atomic_global_load_saddr_i64_immneg128:
2717; GFX10:       ; %bb.0:
2718; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2719; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2720; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 glc dlc
2721; GFX10-NEXT:    s_waitcnt vmcnt(0)
2722; GFX10-NEXT:    buffer_gl0_inv
2723; GFX10-NEXT:    buffer_gl1_inv
2724; GFX10-NEXT:    ; return to shader part epilog
2725;
2726; GFX11-LABEL: atomic_global_load_saddr_i64_immneg128:
2727; GFX11:       ; %bb.0:
2728; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2729; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2730; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128 glc
2731; GFX11-NEXT:    s_waitcnt vmcnt(0)
2732; GFX11-NEXT:    buffer_gl0_inv
2733; GFX11-NEXT:    buffer_gl1_inv
2734; GFX11-NEXT:    ; return to shader part epilog
  ; Two chained byte GEPs: the variable offset first, then the constant -128.
2735  %zext.offset = zext i32 %voffset to i64
2736  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2737  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2738  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
2739  %load = load atomic i64, i64 addrspace(1)* %gep1.cast seq_cst, align 8
2740  %cast.load = bitcast i64 %load to <2 x float>
2741  ret <2 x float> %cast.load
2742}
2743
2744; --------------------------------------------------------------------------------
2745; D16 load (low 16)
2746; --------------------------------------------------------------------------------
2747
; Loads an i16 from %sbase + zext(%voffset) into element 0 of a <2 x i16> whose
; other element is undef, returned as <2 x half>.
; The checks expect a single d16 load (global_load_short_d16, or
; global_load_d16_b16 under the GFX11 prefix) writing v0, with no extra moves.
2748define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2749; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi:
2750; GCN:       ; %bb.0:
2751; GCN-NEXT:    global_load_short_d16 v0, v0, s[2:3]
2752; GCN-NEXT:    s_waitcnt vmcnt(0)
2753; GCN-NEXT:    ; return to shader part epilog
2754;
2755; GFX11-LABEL: global_load_saddr_i16_d16lo_undef_hi:
2756; GFX11:       ; %bb.0:
2757; GFX11-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
2758; GFX11-NEXT:    s_waitcnt vmcnt(0)
2759; GFX11-NEXT:    ; return to shader part epilog
2760  %zext.offset = zext i32 %voffset to i64
2761  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2762  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
2763  %load = load i16, i16 addrspace(1)* %gep0.cast
  ; Only element 0 is defined; element 1 stays undef.
2764  %build = insertelement <2 x i16> undef, i16 %load, i32 0
2765  %cast = bitcast <2 x i16> %build to <2 x half>
2766  ret <2 x half> %cast
2767}
2768
; Loads an i16 from %sbase + zext(%voffset) - 128 bytes into element 0 of a
; <2 x i16> whose other element is undef, returned as <2 x half>.
; The checks expect the -128 constant to appear as offset:-128 on the d16 load.
2769define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2770; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
2771; GCN:       ; %bb.0:
2772; GCN-NEXT:    global_load_short_d16 v0, v0, s[2:3] offset:-128
2773; GCN-NEXT:    s_waitcnt vmcnt(0)
2774; GCN-NEXT:    ; return to shader part epilog
2775;
2776; GFX11-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
2777; GFX11:       ; %bb.0:
2778; GFX11-NEXT:    global_load_d16_b16 v0, v0, s[2:3] offset:-128
2779; GFX11-NEXT:    s_waitcnt vmcnt(0)
2780; GFX11-NEXT:    ; return to shader part epilog
  ; Two chained byte GEPs: the variable offset first, then the constant -128.
2781  %zext.offset = zext i32 %voffset to i64
2782  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2783  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2784  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
2785  %load = load i16, i16 addrspace(1)* %gep1.cast
  ; Only element 0 is defined; element 1 stays undef.
2786  %build = insertelement <2 x i16> undef, i16 %load, i32 0
2787  %cast = bitcast <2 x i16> %build to <2 x half>
2788  ret <2 x half> %cast
2789}
2790
; Loads an i16 from %sbase + zext(%voffset) and inserts it into element 0 of a
; zeroinitializer <2 x i16>, so element 1 of the result is zero. The check lines
; expect v1 to be set to 0 first, the d16 load to write into v1, and the result
; to be copied to v0 afterwards.
2791define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2792; GCN-LABEL: global_load_saddr_i16_d16lo_zero_hi:
2793; GCN:       ; %bb.0:
2794; GCN-NEXT:    v_mov_b32_e32 v1, 0
2795; GCN-NEXT:    global_load_short_d16 v1, v0, s[2:3]
2796; GCN-NEXT:    s_waitcnt vmcnt(0)
2797; GCN-NEXT:    v_mov_b32_e32 v0, v1
2798; GCN-NEXT:    ; return to shader part epilog
2799;
2800; GFX11-LABEL: global_load_saddr_i16_d16lo_zero_hi:
2801; GFX11:       ; %bb.0:
2802; GFX11-NEXT:    v_mov_b32_e32 v1, 0
2803; GFX11-NEXT:    global_load_d16_b16 v1, v0, s[2:3]
2804; GFX11-NEXT:    s_waitcnt vmcnt(0)
2805; GFX11-NEXT:    v_mov_b32_e32 v0, v1
2806; GFX11-NEXT:    ; return to shader part epilog
2807  %zext.offset = zext i32 %voffset to i64
2808  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2809  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
2810  %load = load i16, i16 addrspace(1)* %gep0.cast
2811  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
2812  %cast = bitcast <2 x i16> %build to <2 x half>
2813  ret <2 x half> %cast
2814}
2815
; Loads an i16 from %sbase + zext(%voffset) - 128 and inserts it into element 0
; of a zeroinitializer <2 x i16>. The check lines expect v1 to be zeroed, the
; d16 load into v1 to carry offset:-128, and the result to be copied to v0.
2816define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2817; GCN-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
2818; GCN:       ; %bb.0:
2819; GCN-NEXT:    v_mov_b32_e32 v1, 0
2820; GCN-NEXT:    global_load_short_d16 v1, v0, s[2:3] offset:-128
2821; GCN-NEXT:    s_waitcnt vmcnt(0)
2822; GCN-NEXT:    v_mov_b32_e32 v0, v1
2823; GCN-NEXT:    ; return to shader part epilog
2824;
2825; GFX11-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
2826; GFX11:       ; %bb.0:
2827; GFX11-NEXT:    v_mov_b32_e32 v1, 0
2828; GFX11-NEXT:    global_load_d16_b16 v1, v0, s[2:3] offset:-128
2829; GFX11-NEXT:    s_waitcnt vmcnt(0)
2830; GFX11-NEXT:    v_mov_b32_e32 v0, v1
2831; GFX11-NEXT:    ; return to shader part epilog
2832  %zext.offset = zext i32 %voffset to i64
2833  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2834  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2835  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
2836  %load = load i16, i16 addrspace(1)* %gep1.cast
2837  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
2838  %cast = bitcast <2 x i16> %build to <2 x half>
2839  ret <2 x half> %cast
2840}
2841
; Loads an i16 from %sbase + zext(%voffset) and inserts it into element 0 of the
; incoming vector %reg, so element 1 of %reg is preserved in the result. The
; check lines expect the d16 load to write into v1 and the result to be copied
; to v0. NOTE(review): v1 is presumably where %reg arrives - confirm against
; the calling convention.
2842define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2843; GCN-LABEL: global_load_saddr_i16_d16lo_reg_hi:
2844; GCN:       ; %bb.0:
2845; GCN-NEXT:    global_load_short_d16 v1, v0, s[2:3]
2846; GCN-NEXT:    s_waitcnt vmcnt(0)
2847; GCN-NEXT:    v_mov_b32_e32 v0, v1
2848; GCN-NEXT:    ; return to shader part epilog
2849;
2850; GFX11-LABEL: global_load_saddr_i16_d16lo_reg_hi:
2851; GFX11:       ; %bb.0:
2852; GFX11-NEXT:    global_load_d16_b16 v1, v0, s[2:3]
2853; GFX11-NEXT:    s_waitcnt vmcnt(0)
2854; GFX11-NEXT:    v_mov_b32_e32 v0, v1
2855; GFX11-NEXT:    ; return to shader part epilog
2856  %zext.offset = zext i32 %voffset to i64
2857  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2858  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
2859  %load = load i16, i16 addrspace(1)* %gep0.cast
2860  %build = insertelement <2 x i16> %reg, i16 %load, i32 0
2861  %cast = bitcast <2 x i16> %build to <2 x half>
2862  ret <2 x half> %cast
2863}
2864
; Loads an i16 from %sbase + zext(%voffset) - 128 and inserts it into element 0
; of the incoming vector %reg, keeping element 1 of %reg. The check lines expect
; the d16 load into v1 to carry offset:-128, followed by a copy of v1 to v0.
2865define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2866; GCN-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128:
2867; GCN:       ; %bb.0:
2868; GCN-NEXT:    global_load_short_d16 v1, v0, s[2:3] offset:-128
2869; GCN-NEXT:    s_waitcnt vmcnt(0)
2870; GCN-NEXT:    v_mov_b32_e32 v0, v1
2871; GCN-NEXT:    ; return to shader part epilog
2872;
2873; GFX11-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128:
2874; GFX11:       ; %bb.0:
2875; GFX11-NEXT:    global_load_d16_b16 v1, v0, s[2:3] offset:-128
2876; GFX11-NEXT:    s_waitcnt vmcnt(0)
2877; GFX11-NEXT:    v_mov_b32_e32 v0, v1
2878; GFX11-NEXT:    ; return to shader part epilog
2879  %zext.offset = zext i32 %voffset to i64
2880  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2881  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2882  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
2883  %load = load i16, i16 addrspace(1)* %gep1.cast
2884  %build = insertelement <2 x i16> %reg, i16 %load, i32 0
2885  %cast = bitcast <2 x i16> %build to <2 x half>
2886  ret <2 x half> %cast
2887}
2888
; Loads an i8 from %sbase + zext(%voffset), zero-extends it to i16 and inserts
; it into element 0 of %reg. The check lines expect the load plus zext to become
; one unsigned-byte d16 load (global_load_ubyte_d16 / global_load_d16_u8) into
; v1, followed by a copy of v1 to v0.
; %gep0.cast casts the pointer to its own type, so it does not change the address.
2889define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2890; GCN-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi:
2891; GCN:       ; %bb.0:
2892; GCN-NEXT:    global_load_ubyte_d16 v1, v0, s[2:3]
2893; GCN-NEXT:    s_waitcnt vmcnt(0)
2894; GCN-NEXT:    v_mov_b32_e32 v0, v1
2895; GCN-NEXT:    ; return to shader part epilog
2896;
2897; GFX11-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi:
2898; GFX11:       ; %bb.0:
2899; GFX11-NEXT:    global_load_d16_u8 v1, v0, s[2:3]
2900; GFX11-NEXT:    s_waitcnt vmcnt(0)
2901; GFX11-NEXT:    v_mov_b32_e32 v0, v1
2902; GFX11-NEXT:    ; return to shader part epilog
2903  %zext.offset = zext i32 %voffset to i64
2904  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2905  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)*
2906  %load = load i8, i8 addrspace(1)* %gep0.cast
2907  %zext.load = zext i8 %load to i16
2908  %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0
2909  %cast = bitcast <2 x i16> %build to <2 x half>
2910  ret <2 x half> %cast
2911}
2912
; Loads an i8 from %sbase + zext(%voffset) - 128, zero-extends it to i16 and
; inserts it into element 0 of %reg. The check lines expect one unsigned-byte
; d16 load into v1 carrying offset:-128, followed by a copy of v1 to v0.
; %gep1.cast casts the pointer to its own type, so it does not change the address.
2913define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2914; GCN-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
2915; GCN:       ; %bb.0:
2916; GCN-NEXT:    global_load_ubyte_d16 v1, v0, s[2:3] offset:-128
2917; GCN-NEXT:    s_waitcnt vmcnt(0)
2918; GCN-NEXT:    v_mov_b32_e32 v0, v1
2919; GCN-NEXT:    ; return to shader part epilog
2920;
2921; GFX11-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
2922; GFX11:       ; %bb.0:
2923; GFX11-NEXT:    global_load_d16_u8 v1, v0, s[2:3] offset:-128
2924; GFX11-NEXT:    s_waitcnt vmcnt(0)
2925; GFX11-NEXT:    v_mov_b32_e32 v0, v1
2926; GFX11-NEXT:    ; return to shader part epilog
2927  %zext.offset = zext i32 %voffset to i64
2928  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2929  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2930  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)*
2931  %load = load i8, i8 addrspace(1)* %gep1.cast
2932  %zext.load = zext i8 %load to i16
2933  %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0
2934  %cast = bitcast <2 x i16> %build to <2 x half>
2935  ret <2 x half> %cast
2936}
2937
; Loads an i8 from %sbase + zext(%voffset), sign-extends it to i16 and inserts
; it into element 0 of %reg. The check lines expect the load plus sext to become
; one signed-byte d16 load (global_load_sbyte_d16 / global_load_d16_i8) into v1,
; followed by a copy of v1 to v0.
; %gep0.cast casts the pointer to its own type, so it does not change the address.
2938define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2939; GCN-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
2940; GCN:       ; %bb.0:
2941; GCN-NEXT:    global_load_sbyte_d16 v1, v0, s[2:3]
2942; GCN-NEXT:    s_waitcnt vmcnt(0)
2943; GCN-NEXT:    v_mov_b32_e32 v0, v1
2944; GCN-NEXT:    ; return to shader part epilog
2945;
2946; GFX11-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
2947; GFX11:       ; %bb.0:
2948; GFX11-NEXT:    global_load_d16_i8 v1, v0, s[2:3]
2949; GFX11-NEXT:    s_waitcnt vmcnt(0)
2950; GFX11-NEXT:    v_mov_b32_e32 v0, v1
2951; GFX11-NEXT:    ; return to shader part epilog
2952  %zext.offset = zext i32 %voffset to i64
2953  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2954  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)*
2955  %load = load i8, i8 addrspace(1)* %gep0.cast
2956  %sext.load = sext i8 %load to i16
2957  %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0
2958  %cast = bitcast <2 x i16> %build to <2 x half>
2959  ret <2 x half> %cast
2960}
2961
; Loads an i8 from %sbase + zext(%voffset) - 128, sign-extends it to i16 and
; inserts it into element 0 of %reg. The check lines expect one signed-byte d16
; load into v1 carrying offset:-128, followed by a copy of v1 to v0.
; %gep1.cast casts the pointer to its own type, so it does not change the address.
2962define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2963; GCN-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
2964; GCN:       ; %bb.0:
2965; GCN-NEXT:    global_load_sbyte_d16 v1, v0, s[2:3] offset:-128
2966; GCN-NEXT:    s_waitcnt vmcnt(0)
2967; GCN-NEXT:    v_mov_b32_e32 v0, v1
2968; GCN-NEXT:    ; return to shader part epilog
2969;
2970; GFX11-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
2971; GFX11:       ; %bb.0:
2972; GFX11-NEXT:    global_load_d16_i8 v1, v0, s[2:3] offset:-128
2973; GFX11-NEXT:    s_waitcnt vmcnt(0)
2974; GFX11-NEXT:    v_mov_b32_e32 v0, v1
2975; GFX11-NEXT:    ; return to shader part epilog
2976  %zext.offset = zext i32 %voffset to i64
2977  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2978  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2979  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)*
2980  %load = load i8, i8 addrspace(1)* %gep1.cast
2981  %sext.load = sext i8 %load to i16
2982  %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0
2983  %cast = bitcast <2 x i16> %build to <2 x half>
2984  ret <2 x half> %cast
2985}
2986
2987; --------------------------------------------------------------------------------
2988; D16 hi load (hi16)
2989; --------------------------------------------------------------------------------
2990
; Loads an i16 from %sbase + zext(%voffset) and inserts it into element 1 of an
; undef <2 x i16>, returned as <2 x half>. The check lines expect a single
; "_hi" d16 load (global_load_short_d16_hi / global_load_d16_hi_b16) addressed
; by s[2:3] plus v0, written straight into v0.
; Note: despite the "_undef_hi" suffix in the name, the element left undef here
; is element 0; element 1 receives the loaded value.
2991define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2992; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi:
2993; GCN:       ; %bb.0:
2994; GCN-NEXT:    global_load_short_d16_hi v0, v0, s[2:3]
2995; GCN-NEXT:    s_waitcnt vmcnt(0)
2996; GCN-NEXT:    ; return to shader part epilog
2997;
2998; GFX11-LABEL: global_load_saddr_i16_d16hi_undef_hi:
2999; GFX11:       ; %bb.0:
3000; GFX11-NEXT:    global_load_d16_hi_b16 v0, v0, s[2:3]
3001; GFX11-NEXT:    s_waitcnt vmcnt(0)
3002; GFX11-NEXT:    ; return to shader part epilog
3003  %zext.offset = zext i32 %voffset to i64
3004  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
3005  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
3006  %load = load i16, i16 addrspace(1)* %gep0.cast
3007  %build = insertelement <2 x i16> undef, i16 %load, i32 1
3008  %cast = bitcast <2 x i16> %build to <2 x half>
3009  ret <2 x half> %cast
3010}
3011
; Loads an i16 from %sbase + zext(%voffset) - 128 and inserts it into element 1
; of an undef <2 x i16>, returned as <2 x half>. The check lines expect a single
; "_hi" d16 load into v0 carrying offset:-128.
; Note: despite the "_undef_hi" suffix in the name, the element left undef here
; is element 0; element 1 receives the loaded value.
3012define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
3013; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
3014; GCN:       ; %bb.0:
3015; GCN-NEXT:    global_load_short_d16_hi v0, v0, s[2:3] offset:-128
3016; GCN-NEXT:    s_waitcnt vmcnt(0)
3017; GCN-NEXT:    ; return to shader part epilog
3018;
3019; GFX11-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
3020; GFX11:       ; %bb.0:
3021; GFX11-NEXT:    global_load_d16_hi_b16 v0, v0, s[2:3] offset:-128
3022; GFX11-NEXT:    s_waitcnt vmcnt(0)
3023; GFX11-NEXT:    ; return to shader part epilog
3024  %zext.offset = zext i32 %voffset to i64
3025  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
3026  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
3027  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
3028  %load = load i16, i16 addrspace(1)* %gep1.cast
3029  %build = insertelement <2 x i16> undef, i16 %load, i32 1
3030  %cast = bitcast <2 x i16> %build to <2 x half>
3031  ret <2 x half> %cast
3032}
3033
; Loads an i16 from %sbase + zext(%voffset) and inserts it into element 1 of a
; zeroinitializer <2 x i16>, so element 0 of the result is zero. The check lines
; expect v1 to be set to 0, the "_hi" d16 load to write into v1, and the result
; to be copied to v0.
; Note: despite the "_zero_hi" suffix in the name, the zero element here is
; element 0; element 1 receives the loaded value.
3034define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
3035; GCN-LABEL: global_load_saddr_i16_d16hi_zero_hi:
3036; GCN:       ; %bb.0:
3037; GCN-NEXT:    v_mov_b32_e32 v1, 0
3038; GCN-NEXT:    global_load_short_d16_hi v1, v0, s[2:3]
3039; GCN-NEXT:    s_waitcnt vmcnt(0)
3040; GCN-NEXT:    v_mov_b32_e32 v0, v1
3041; GCN-NEXT:    ; return to shader part epilog
3042;
3043; GFX11-LABEL: global_load_saddr_i16_d16hi_zero_hi:
3044; GFX11:       ; %bb.0:
3045; GFX11-NEXT:    v_mov_b32_e32 v1, 0
3046; GFX11-NEXT:    global_load_d16_hi_b16 v1, v0, s[2:3]
3047; GFX11-NEXT:    s_waitcnt vmcnt(0)
3048; GFX11-NEXT:    v_mov_b32_e32 v0, v1
3049; GFX11-NEXT:    ; return to shader part epilog
3050  %zext.offset = zext i32 %voffset to i64
3051  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
3052  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
3053  %load = load i16, i16 addrspace(1)* %gep0.cast
3054  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
3055  %cast = bitcast <2 x i16> %build to <2 x half>
3056  ret <2 x half> %cast
3057}
3058
; Loads an i16 from %sbase + zext(%voffset) - 128 and inserts it into element 1
; of a zeroinitializer <2 x i16>, so element 0 of the result is zero. The check
; lines expect v1 to be zeroed, the "_hi" d16 load into v1 to carry offset:-128,
; and the result to be copied to v0.
3059define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
3060; GCN-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
3061; GCN:       ; %bb.0:
3062; GCN-NEXT:    v_mov_b32_e32 v1, 0
3063; GCN-NEXT:    global_load_short_d16_hi v1, v0, s[2:3] offset:-128
3064; GCN-NEXT:    s_waitcnt vmcnt(0)
3065; GCN-NEXT:    v_mov_b32_e32 v0, v1
3066; GCN-NEXT:    ; return to shader part epilog
3067;
3068; GFX11-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
3069; GFX11:       ; %bb.0:
3070; GFX11-NEXT:    v_mov_b32_e32 v1, 0
3071; GFX11-NEXT:    global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
3072; GFX11-NEXT:    s_waitcnt vmcnt(0)
3073; GFX11-NEXT:    v_mov_b32_e32 v0, v1
3074; GFX11-NEXT:    ; return to shader part epilog
3075  %zext.offset = zext i32 %voffset to i64
3076  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
3077  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
3078  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
3079  %load = load i16, i16 addrspace(1)* %gep1.cast
3080  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
3081  %cast = bitcast <2 x i16> %build to <2 x half>
3082  ret <2 x half> %cast
3083}
3084
; Loads an i16 from %sbase + zext(%voffset) and inserts it into element 1 of the
; incoming vector %reg, so element 0 of %reg is preserved in the result. The
; check lines expect the "_hi" d16 load to write into v1 and the result to be
; copied to v0. NOTE(review): v1 is presumably where %reg arrives - confirm
; against the calling convention.
3085define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
3086; GCN-LABEL: global_load_saddr_i16_d16hi_reg_hi:
3087; GCN:       ; %bb.0:
3088; GCN-NEXT:    global_load_short_d16_hi v1, v0, s[2:3]
3089; GCN-NEXT:    s_waitcnt vmcnt(0)
3090; GCN-NEXT:    v_mov_b32_e32 v0, v1
3091; GCN-NEXT:    ; return to shader part epilog
3092;
3093; GFX11-LABEL: global_load_saddr_i16_d16hi_reg_hi:
3094; GFX11:       ; %bb.0:
3095; GFX11-NEXT:    global_load_d16_hi_b16 v1, v0, s[2:3]
3096; GFX11-NEXT:    s_waitcnt vmcnt(0)
3097; GFX11-NEXT:    v_mov_b32_e32 v0, v1
3098; GFX11-NEXT:    ; return to shader part epilog
3099  %zext.offset = zext i32 %voffset to i64
3100  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
3101  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
3102  %load = load i16, i16 addrspace(1)* %gep0.cast
3103  %build = insertelement <2 x i16> %reg, i16 %load, i32 1
3104  %cast = bitcast <2 x i16> %build to <2 x half>
3105  ret <2 x half> %cast
3106}
3107
; Loads an i16 from %sbase + zext(%voffset) - 128 and inserts it into element 1
; of the incoming vector %reg, keeping element 0 of %reg. The check lines expect
; the "_hi" d16 load into v1 to carry offset:-128, followed by a copy of v1 to v0.
3108define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
3109; GCN-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128:
3110; GCN:       ; %bb.0:
3111; GCN-NEXT:    global_load_short_d16_hi v1, v0, s[2:3] offset:-128
3112; GCN-NEXT:    s_waitcnt vmcnt(0)
3113; GCN-NEXT:    v_mov_b32_e32 v0, v1
3114; GCN-NEXT:    ; return to shader part epilog
3115;
3116; GFX11-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128:
3117; GFX11:       ; %bb.0:
3118; GFX11-NEXT:    global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
3119; GFX11-NEXT:    s_waitcnt vmcnt(0)
3120; GFX11-NEXT:    v_mov_b32_e32 v0, v1
3121; GFX11-NEXT:    ; return to shader part epilog
3122  %zext.offset = zext i32 %voffset to i64
3123  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
3124  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
3125  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
3126  %load = load i16, i16 addrspace(1)* %gep1.cast
3127  %build = insertelement <2 x i16> %reg, i16 %load, i32 1
3128  %cast = bitcast <2 x i16> %build to <2 x half>
3129  ret <2 x half> %cast
3130}
3131
; Loads an i8 from %sbase + zext(%voffset), zero-extends it to i16 and inserts
; it into element 1 of %reg. The check lines expect the load plus zext to become
; one unsigned-byte "_hi" d16 load (global_load_ubyte_d16_hi /
; global_load_d16_hi_u8) into v1, followed by a copy of v1 to v0.
; %gep0.cast casts the pointer to its own type, so it does not change the address.
3132define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
3133; GCN-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi:
3134; GCN:       ; %bb.0:
3135; GCN-NEXT:    global_load_ubyte_d16_hi v1, v0, s[2:3]
3136; GCN-NEXT:    s_waitcnt vmcnt(0)
3137; GCN-NEXT:    v_mov_b32_e32 v0, v1
3138; GCN-NEXT:    ; return to shader part epilog
3139;
3140; GFX11-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi:
3141; GFX11:       ; %bb.0:
3142; GFX11-NEXT:    global_load_d16_hi_u8 v1, v0, s[2:3]
3143; GFX11-NEXT:    s_waitcnt vmcnt(0)
3144; GFX11-NEXT:    v_mov_b32_e32 v0, v1
3145; GFX11-NEXT:    ; return to shader part epilog
3146  %zext.offset = zext i32 %voffset to i64
3147  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
3148  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)*
3149  %load = load i8, i8 addrspace(1)* %gep0.cast
3150  %zext.load = zext i8 %load to i16
3151  %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1
3152  %cast = bitcast <2 x i16> %build to <2 x half>
3153  ret <2 x half> %cast
3154}
3155
; Loads an i8 from %sbase + zext(%voffset) - 128, zero-extends it to i16 and
; inserts it into element 1 of %reg. The check lines expect one unsigned-byte
; "_hi" d16 load into v1 carrying offset:-128, followed by a copy of v1 to v0.
; %gep1.cast casts the pointer to its own type, so it does not change the address.
3156define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
3157; GCN-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128:
3158; GCN:       ; %bb.0:
3159; GCN-NEXT:    global_load_ubyte_d16_hi v1, v0, s[2:3] offset:-128
3160; GCN-NEXT:    s_waitcnt vmcnt(0)
3161; GCN-NEXT:    v_mov_b32_e32 v0, v1
3162; GCN-NEXT:    ; return to shader part epilog
3163;
3164; GFX11-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128:
3165; GFX11:       ; %bb.0:
3166; GFX11-NEXT:    global_load_d16_hi_u8 v1, v0, s[2:3] offset:-128
3167; GFX11-NEXT:    s_waitcnt vmcnt(0)
3168; GFX11-NEXT:    v_mov_b32_e32 v0, v1
3169; GFX11-NEXT:    ; return to shader part epilog
3170  %zext.offset = zext i32 %voffset to i64
3171  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
3172  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
3173  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)*
3174  %load = load i8, i8 addrspace(1)* %gep1.cast
3175  %zext.load = zext i8 %load to i16
3176  %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1
3177  %cast = bitcast <2 x i16> %build to <2 x half>
3178  ret <2 x half> %cast
3179}
3180
; Loads an i8 from %sbase + zext(%voffset), sign-extends it to i16 and inserts
; it into element 1 of %reg. The check lines expect the load plus sext to become
; one signed-byte "_hi" d16 load (global_load_sbyte_d16_hi /
; global_load_d16_hi_i8) into v1, followed by a copy of v1 to v0.
; %gep0.cast casts the pointer to its own type, so it does not change the address.
3181define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
3182; GCN-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi:
3183; GCN:       ; %bb.0:
3184; GCN-NEXT:    global_load_sbyte_d16_hi v1, v0, s[2:3]
3185; GCN-NEXT:    s_waitcnt vmcnt(0)
3186; GCN-NEXT:    v_mov_b32_e32 v0, v1
3187; GCN-NEXT:    ; return to shader part epilog
3188;
3189; GFX11-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi:
3190; GFX11:       ; %bb.0:
3191; GFX11-NEXT:    global_load_d16_hi_i8 v1, v0, s[2:3]
3192; GFX11-NEXT:    s_waitcnt vmcnt(0)
3193; GFX11-NEXT:    v_mov_b32_e32 v0, v1
3194; GFX11-NEXT:    ; return to shader part epilog
3195  %zext.offset = zext i32 %voffset to i64
3196  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
3197  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)*
3198  %load = load i8, i8 addrspace(1)* %gep0.cast
3199  %sext.load = sext i8 %load to i16
3200  %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1
3201  %cast = bitcast <2 x i16> %build to <2 x half>
3202  ret <2 x half> %cast
3203}
3204
; Loads an i8 from %sbase + zext(%voffset) - 128, sign-extends it to i16 and
; inserts it into element 1 of %reg. The check lines expect one signed-byte
; "_hi" d16 load into v1 carrying offset:-128, followed by a copy of v1 to v0.
; %gep1.cast casts the pointer to its own type, so it does not change the address.
3205define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
3206; GCN-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
3207; GCN:       ; %bb.0:
3208; GCN-NEXT:    global_load_sbyte_d16_hi v1, v0, s[2:3] offset:-128
3209; GCN-NEXT:    s_waitcnt vmcnt(0)
3210; GCN-NEXT:    v_mov_b32_e32 v0, v1
3211; GCN-NEXT:    ; return to shader part epilog
3212;
3213; GFX11-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
3214; GFX11:       ; %bb.0:
3215; GFX11-NEXT:    global_load_d16_hi_i8 v1, v0, s[2:3] offset:-128
3216; GFX11-NEXT:    s_waitcnt vmcnt(0)
3217; GFX11-NEXT:    v_mov_b32_e32 v0, v1
3218; GFX11-NEXT:    ; return to shader part epilog
3219  %zext.offset = zext i32 %voffset to i64
3220  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
3221  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
3222  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)*
3223  %load = load i8, i8 addrspace(1)* %gep1.cast
3224  %sext.load = sext i8 %load to i16
3225  %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1
3226  %cast = bitcast <2 x i16> %build to <2 x half>
3227  ret <2 x half> %cast
3228}
3229
3230; --------------------------------------------------------------------------------
3231; or-with-constant as add
3232; --------------------------------------------------------------------------------
3233
3234; Check add-as-or with split 64-bit or.
; The address is inttoptr(zext(%idx) | 16); the %sbase argument (an addrspace(6)
; pointer) is not used by the body. The check lines expect the or to remain a
; v_or_b32 on v0, v1 to be zeroed as the upper address half, and the byte load
; to use the 64-bit VGPR address v[0:1] with "off" - i.e. no saddr operand and
; no immediate offset. The loaded byte is zero-extended and returned as float bits.
3235define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_16(i8 addrspace(6)* inreg %sbase, i32 %idx) {
3236; GCN-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
3237; GCN:       ; %bb.0:
3238; GCN-NEXT:    v_or_b32_e32 v0, 16, v0
3239; GCN-NEXT:    v_mov_b32_e32 v1, 0
3240; GCN-NEXT:    global_load_ubyte v0, v[0:1], off
3241; GCN-NEXT:    s_waitcnt vmcnt(0)
3242; GCN-NEXT:    ; return to shader part epilog
3243;
3244; GFX11-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
3245; GFX11:       ; %bb.0:
3246; GFX11-NEXT:    v_or_b32_e32 v0, 16, v0
3247; GFX11-NEXT:    v_mov_b32_e32 v1, 0
3248; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
3249; GFX11-NEXT:    s_waitcnt vmcnt(0)
3250; GFX11-NEXT:    ; return to shader part epilog
3251  %zext.idx = zext i32 %idx to i64
3252  %or = or i64 %zext.idx, 16
3253  %addr = inttoptr i64 %or to i8 addrspace(1)*
3254  %load = load i8, i8 addrspace(1)* %addr
3255  %zext = zext i8 %load to i32
3256  %to.vgpr = bitcast i32 %zext to float
3257  ret float %to.vgpr
3258}
3259
; The address is inttoptr(zext(%idx) | 4160); the %sbase argument (an
; addrspace(6) pointer) is not used by the body. 4160 is 0x1040, which is the
; literal the check lines expect on the v_or_b32. As with the smaller constant,
; the expected load uses the 64-bit VGPR address v[0:1] with "off" and no
; immediate offset. The loaded byte is zero-extended and returned as float bits.
3260define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(i8 addrspace(6)* inreg %sbase, i32 %idx) {
3261; GCN-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
3262; GCN:       ; %bb.0:
3263; GCN-NEXT:    v_or_b32_e32 v0, 0x1040, v0
3264; GCN-NEXT:    v_mov_b32_e32 v1, 0
3265; GCN-NEXT:    global_load_ubyte v0, v[0:1], off
3266; GCN-NEXT:    s_waitcnt vmcnt(0)
3267; GCN-NEXT:    ; return to shader part epilog
3268;
3269; GFX11-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
3270; GFX11:       ; %bb.0:
3271; GFX11-NEXT:    v_or_b32_e32 v0, 0x1040, v0
3272; GFX11-NEXT:    v_mov_b32_e32 v1, 0
3273; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
3274; GFX11-NEXT:    s_waitcnt vmcnt(0)
3275; GFX11-NEXT:    ; return to shader part epilog
3276  %zext.idx = zext i32 %idx to i64
3277  %or = or i64 %zext.idx, 4160
3278  %addr = inttoptr i64 %or to i8 addrspace(1)*
3279  %load = load i8, i8 addrspace(1)* %addr
3280  %zext = zext i8 %load to i32
3281  %to.vgpr = bitcast i32 %zext to float
3282  ret float %to.vgpr
3283}
3284
3285; --------------------------------------------------------------------------------
3286; Full 64-bit scalar add.
3287; --------------------------------------------------------------------------------
3288
; Loop that performs a volatile float load from %arg[%i] for %i = 0 .. 255 (the
; loop exits once %i + 1 equals 256) and discards the value.
; On all three targets the check lines expect:
;   - a 64-bit scalar byte offset in s[0:1], started at 0 and stepped by 4 with
;     s_add_u32 / s_addc_u32, compared against 0x400 (256 elements * 4 bytes);
;   - the per-iteration address s[4:5] formed by a full 64-bit scalar add of the
;     base s[2:3] and that offset;
;   - the load using s[4:5] as saddr with v0 (set to 0 once, before the loop) as
;     the VGPR offset operand.
; The block labels bb / bb2 / bb3 appear in the expected output comments, so
; renaming them requires regenerating the assertions.
3289define amdgpu_ps void @global_addr_64bit_lsr_iv(float addrspace(1)* inreg %arg) {
3290; GFX9-LABEL: global_addr_64bit_lsr_iv:
3291; GFX9:       ; %bb.0: ; %bb
3292; GFX9-NEXT:    s_mov_b64 s[0:1], 0
3293; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3294; GFX9-NEXT:  .LBB128_1: ; %bb3
3295; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
3296; GFX9-NEXT:    s_add_u32 s4, s2, s0
3297; GFX9-NEXT:    s_addc_u32 s5, s3, s1
3298; GFX9-NEXT:    global_load_dword v1, v0, s[4:5] glc
3299; GFX9-NEXT:    s_waitcnt vmcnt(0)
3300; GFX9-NEXT:    s_add_u32 s0, s0, 4
3301; GFX9-NEXT:    s_addc_u32 s1, s1, 0
3302; GFX9-NEXT:    s_cmpk_eq_i32 s0, 0x400
3303; GFX9-NEXT:    s_cbranch_scc0 .LBB128_1
3304; GFX9-NEXT:  ; %bb.2: ; %bb2
3305; GFX9-NEXT:    s_endpgm
3306;
3307; GFX10-LABEL: global_addr_64bit_lsr_iv:
3308; GFX10:       ; %bb.0: ; %bb
3309; GFX10-NEXT:    v_mov_b32_e32 v0, 0
3310; GFX10-NEXT:    s_mov_b64 s[0:1], 0
3311; GFX10-NEXT:  .LBB128_1: ; %bb3
3312; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
3313; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
3314; GFX10-NEXT:    s_add_u32 s4, s2, s0
3315; GFX10-NEXT:    s_addc_u32 s5, s3, s1
3316; GFX10-NEXT:    s_add_u32 s0, s0, 4
3317; GFX10-NEXT:    global_load_dword v1, v0, s[4:5] glc dlc
3318; GFX10-NEXT:    s_waitcnt vmcnt(0)
3319; GFX10-NEXT:    s_addc_u32 s1, s1, 0
3320; GFX10-NEXT:    s_cmpk_eq_i32 s0, 0x400
3321; GFX10-NEXT:    s_cbranch_scc0 .LBB128_1
3322; GFX10-NEXT:  ; %bb.2: ; %bb2
3323; GFX10-NEXT:    s_endpgm
3324;
3325; GFX11-LABEL: global_addr_64bit_lsr_iv:
3326; GFX11:       ; %bb.0: ; %bb
3327; GFX11-NEXT:    v_mov_b32_e32 v0, 0
3328; GFX11-NEXT:    s_mov_b64 s[0:1], 0
3329; GFX11-NEXT:  .LBB128_1: ; %bb3
3330; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
3331; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3332; GFX11-NEXT:    s_add_u32 s4, s2, s0
3333; GFX11-NEXT:    s_addc_u32 s5, s3, s1
3334; GFX11-NEXT:    s_add_u32 s0, s0, 4
3335; GFX11-NEXT:    global_load_b32 v1, v0, s[4:5] glc dlc
3336; GFX11-NEXT:    s_waitcnt vmcnt(0)
3337; GFX11-NEXT:    s_addc_u32 s1, s1, 0
3338; GFX11-NEXT:    s_cmpk_eq_i32 s0, 0x400
3339; GFX11-NEXT:    s_cbranch_scc0 .LBB128_1
3340; GFX11-NEXT:  ; %bb.2: ; %bb2
3341; GFX11-NEXT:    s_endpgm
3342bb:
3343  br label %bb3
3344
3345bb2:                                              ; preds = %bb3
3346  ret void
3347
3348bb3:                                              ; preds = %bb3, %bb
3349  %i = phi i32 [ 0, %bb ], [ %i8, %bb3 ]
3350  %i4 = zext i32 %i to i64
3351  %i5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %i4
3352  %i6 = load volatile float, float addrspace(1)* %i5, align 4
3353  %i8 = add nuw nsw i32 %i, 1
3354  %i9 = icmp eq i32 %i8, 256
3355  br i1 %i9, label %bb2, label %bb3
3356}
3357
3358; Make sure we only have a single zero vaddr initialization.
3359
; Loop with two volatile float loads per iteration for %i = 0 .. 255 (exit once
; %i + 1 equals 256). The check lines expect v0 to be set to 0 exactly once,
; before the loop, and reused as the VGPR offset operand of both loads; the
; scalar address s[4:5] is the 64-bit sum of the base s[2:3] and the byte
; offset in s[0:1], which is stepped by 4 and compared against 0x400.
;
; NOTE(review): the second load reads %i5 again, not %i5.1, so %i5.1 and the
; %arg.1 argument are dead and both expected loads use the same s[4:5] address.
; This looks like a typo for %i5.1 - confirm the intent. Changing it alters the
; generated code, so the assertions must be regenerated with
; utils/update_llc_test_checks.py in the same change.
3360define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(float addrspace(1)* inreg %arg, float addrspace(1)* inreg %arg.1) {
3361; GFX9-LABEL: global_addr_64bit_lsr_iv_multiload:
3362; GFX9:       ; %bb.0: ; %bb
3363; GFX9-NEXT:    s_mov_b64 s[0:1], 0
3364; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3365; GFX9-NEXT:  .LBB129_1: ; %bb3
3366; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
3367; GFX9-NEXT:    s_add_u32 s4, s2, s0
3368; GFX9-NEXT:    s_addc_u32 s5, s3, s1
3369; GFX9-NEXT:    global_load_dword v1, v0, s[4:5] glc
3370; GFX9-NEXT:    s_waitcnt vmcnt(0)
3371; GFX9-NEXT:    global_load_dword v1, v0, s[4:5] glc
3372; GFX9-NEXT:    s_waitcnt vmcnt(0)
3373; GFX9-NEXT:    s_add_u32 s0, s0, 4
3374; GFX9-NEXT:    s_addc_u32 s1, s1, 0
3375; GFX9-NEXT:    s_cmpk_eq_i32 s0, 0x400
3376; GFX9-NEXT:    ; kill: killed $sgpr4 killed $sgpr5
3377; GFX9-NEXT:    s_cbranch_scc0 .LBB129_1
3378; GFX9-NEXT:  ; %bb.2: ; %bb2
3379; GFX9-NEXT:    s_endpgm
3380;
3381; GFX10-LABEL: global_addr_64bit_lsr_iv_multiload:
3382; GFX10:       ; %bb.0: ; %bb
3383; GFX10-NEXT:    v_mov_b32_e32 v0, 0
3384; GFX10-NEXT:    s_mov_b64 s[0:1], 0
3385; GFX10-NEXT:  .LBB129_1: ; %bb3
3386; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
3387; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
3388; GFX10-NEXT:    s_add_u32 s4, s2, s0
3389; GFX10-NEXT:    s_addc_u32 s5, s3, s1
3390; GFX10-NEXT:    s_add_u32 s0, s0, 4
3391; GFX10-NEXT:    global_load_dword v1, v0, s[4:5] glc dlc
3392; GFX10-NEXT:    s_waitcnt vmcnt(0)
3393; GFX10-NEXT:    global_load_dword v1, v0, s[4:5] glc dlc
3394; GFX10-NEXT:    s_waitcnt vmcnt(0)
3395; GFX10-NEXT:    s_addc_u32 s1, s1, 0
3396; GFX10-NEXT:    s_cmpk_eq_i32 s0, 0x400
3397; GFX10-NEXT:    ; kill: killed $sgpr4 killed $sgpr5
3398; GFX10-NEXT:    s_cbranch_scc0 .LBB129_1
3399; GFX10-NEXT:  ; %bb.2: ; %bb2
3400; GFX10-NEXT:    s_endpgm
3401;
3402; GFX11-LABEL: global_addr_64bit_lsr_iv_multiload:
3403; GFX11:       ; %bb.0: ; %bb
3404; GFX11-NEXT:    v_mov_b32_e32 v0, 0
3405; GFX11-NEXT:    s_mov_b64 s[0:1], 0
3406; GFX11-NEXT:  .LBB129_1: ; %bb3
3407; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
3408; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3409; GFX11-NEXT:    s_add_u32 s4, s2, s0
3410; GFX11-NEXT:    s_addc_u32 s5, s3, s1
3411; GFX11-NEXT:    s_add_u32 s0, s0, 4
3412; GFX11-NEXT:    global_load_b32 v1, v0, s[4:5] glc dlc
3413; GFX11-NEXT:    s_waitcnt vmcnt(0)
3414; GFX11-NEXT:    global_load_b32 v1, v0, s[4:5] glc dlc
3415; GFX11-NEXT:    s_waitcnt vmcnt(0)
3416; GFX11-NEXT:    s_addc_u32 s1, s1, 0
3417; GFX11-NEXT:    s_cmpk_eq_i32 s0, 0x400
3418; GFX11-NEXT:    s_cbranch_scc0 .LBB129_1
3419; GFX11-NEXT:  ; %bb.2: ; %bb2
3420; GFX11-NEXT:    s_endpgm
3421bb:
3422  br label %bb3
3423
3424bb2:                                              ; preds = %bb3
3425  ret void
3426
3427bb3:                                              ; preds = %bb3, %bb
3428  %i = phi i32 [ 0, %bb ], [ %i8, %bb3 ]
3429  %i4 = zext i32 %i to i64
3430  %i5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %i4
3431  %i6 = load volatile float, float addrspace(1)* %i5, align 4
3432  %i5.1 = getelementptr inbounds float, float addrspace(1)* %arg.1, i64 %i4
3433  %i6.1 = load volatile float, float addrspace(1)* %i5, align 4
3434  %i8 = add nuw nsw i32 %i, 1
3435  %i9 = icmp eq i32 %i8, 256
3436  br i1 %i9, label %bb2, label %bb3
3437}
3438
; Metadata nodes holding i32 pairs {0, 1 << 30} and {0, (1 << 30) + 1}.
; NOTE(review): nothing from the or-with-constant section to the end of the
; file references them; presumably they are !range operands for tests earlier
; in the file - confirm before changing or removing.
3439!0 = !{i32 0, i32 1073741824} ; (1 << 30)
3440!1 = !{i32 0, i32 1073741825} ; (1 << 30) + 1
3441