1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
3; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
4
5; Test splitting flat instruction offsets into the low and high bits
6; when the offset doesn't fit in the offset field.
7
; Loads one byte at %p + 1; the pointer arrives in v[0:1]. The checks for both
; gfx900 and gfx1010 expect the whole displacement to stay in the load's
; immediate (offset:1), with no separate address arithmetic.
8define i8 @global_inst_valu_offset_1(i8 addrspace(1)* %p) {
9; GFX9-LABEL: global_inst_valu_offset_1:
10; GFX9:       ; %bb.0:
11; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
13; GFX9-NEXT:    s_waitcnt vmcnt(0)
14; GFX9-NEXT:    s_setpc_b64 s[30:31]
15;
16; GFX10-LABEL: global_inst_valu_offset_1:
17; GFX10:       ; %bb.0:
18; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
20; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
21; GFX10-NEXT:    s_waitcnt vmcnt(0)
22; GFX10-NEXT:    s_setpc_b64 s[30:31]
23  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 1
24  %load = load i8, i8 addrspace(1)* %gep, align 4
25  ret i8 %load
26}
27
; Loads one byte at %p + 2047. The checks for both gfx900 and gfx1010 expect
; the displacement to be carried entirely by the immediate (offset:2047).
28define i8 @global_inst_valu_offset_11bit_max(i8 addrspace(1)* %p) {
29; GFX9-LABEL: global_inst_valu_offset_11bit_max:
30; GFX9:       ; %bb.0:
31; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
33; GFX9-NEXT:    s_waitcnt vmcnt(0)
34; GFX9-NEXT:    s_setpc_b64 s[30:31]
35;
36; GFX10-LABEL: global_inst_valu_offset_11bit_max:
37; GFX10:       ; %bb.0:
38; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
39; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
40; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
41; GFX10-NEXT:    s_waitcnt vmcnt(0)
42; GFX10-NEXT:    s_setpc_b64 s[30:31]
43  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 2047
44  %load = load i8, i8 addrspace(1)* %gep, align 4
45  ret i8 %load
46}
47
; Loads one byte at %p + 4095. The gfx900 checks keep it all in the immediate
; (offset:4095). The gfx1010 checks expect a split: 0x800 is added to the
; 64-bit address and the remaining 2047 goes in the immediate
; (0x800 + 2047 = 4095).
48define i8 @global_inst_valu_offset_12bit_max(i8 addrspace(1)* %p) {
49; GFX9-LABEL: global_inst_valu_offset_12bit_max:
50; GFX9:       ; %bb.0:
51; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
52; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
53; GFX9-NEXT:    s_waitcnt vmcnt(0)
54; GFX9-NEXT:    s_setpc_b64 s[30:31]
55;
56; GFX10-LABEL: global_inst_valu_offset_12bit_max:
57; GFX10:       ; %bb.0:
58; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
59; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
60; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
61; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
62; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
63; GFX10-NEXT:    s_waitcnt vmcnt(0)
64; GFX10-NEXT:    s_setpc_b64 s[30:31]
65  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
66  %load = load i8, i8 addrspace(1)* %gep, align 4
67  ret i8 %load
68}
69
; Loads one byte at %p + 8191. Both targets are expected to split the
; displacement between a 64-bit address add and the immediate:
;   gfx900  adds 0x1000 and uses offset:4095 (0x1000 + 4095 = 8191)
;   gfx1010 adds 0x1800 and uses offset:2047 (0x1800 + 2047 = 8191)
70define i8 @global_inst_valu_offset_13bit_max(i8 addrspace(1)* %p) {
71; GFX9-LABEL: global_inst_valu_offset_13bit_max:
72; GFX9:       ; %bb.0:
73; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
74; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
75; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
76; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
77; GFX9-NEXT:    s_waitcnt vmcnt(0)
78; GFX9-NEXT:    s_setpc_b64 s[30:31]
79;
80; GFX10-LABEL: global_inst_valu_offset_13bit_max:
81; GFX10:       ; %bb.0:
82; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
84; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1800, v0
85; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
86; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
87; GFX10-NEXT:    s_waitcnt vmcnt(0)
88; GFX10-NEXT:    s_setpc_b64 s[30:31]
89  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
90  %load = load i8, i8 addrspace(1)* %gep, align 4
91  ret i8 %load
92}
93
; Loads one byte at %p - 2048. The checks for both gfx900 and gfx1010 expect
; the negative displacement to be encoded directly as offset:-2048.
94define i8 @global_inst_valu_offset_neg_11bit_max(i8 addrspace(1)* %p) {
95; GFX9-LABEL: global_inst_valu_offset_neg_11bit_max:
96; GFX9:       ; %bb.0:
97; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
98; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
99; GFX9-NEXT:    s_waitcnt vmcnt(0)
100; GFX9-NEXT:    s_setpc_b64 s[30:31]
101;
102; GFX10-LABEL: global_inst_valu_offset_neg_11bit_max:
103; GFX10:       ; %bb.0:
104; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
105; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
106; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
107; GFX10-NEXT:    s_waitcnt vmcnt(0)
108; GFX10-NEXT:    s_setpc_b64 s[30:31]
109  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -2048
110  %load = load i8, i8 addrspace(1)* %gep, align 4
111  ret i8 %load
112}
113
; Loads one byte at %p - 4096. The gfx900 checks encode it directly as
; offset:-4096. The gfx1010 checks expect the full displacement to be applied
; with a 64-bit add (0xfffff000 to the low half, -1 plus carry to the high
; half) and the load to carry no immediate offset.
114define i8 @global_inst_valu_offset_neg_12bit_max(i8 addrspace(1)* %p) {
115; GFX9-LABEL: global_inst_valu_offset_neg_12bit_max:
116; GFX9:       ; %bb.0:
117; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
118; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4096
119; GFX9-NEXT:    s_waitcnt vmcnt(0)
120; GFX9-NEXT:    s_setpc_b64 s[30:31]
121;
122; GFX10-LABEL: global_inst_valu_offset_neg_12bit_max:
123; GFX10:       ; %bb.0:
124; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
125; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
126; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
127; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
128; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
129; GFX10-NEXT:    s_waitcnt vmcnt(0)
130; GFX10-NEXT:    s_setpc_b64 s[30:31]
131  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
132  %load = load i8, i8 addrspace(1)* %gep, align 4
133  ret i8 %load
134}
135
; Loads one byte at %p - 8192. On both gfx900 and gfx1010 the checks expect
; the whole displacement to be applied by a 64-bit add (0xffffe000 low, -1 plus
; carry high), leaving the load with no immediate offset.
136define i8 @global_inst_valu_offset_neg_13bit_max(i8 addrspace(1)* %p) {
137; GFX9-LABEL: global_inst_valu_offset_neg_13bit_max:
138; GFX9:       ; %bb.0:
139; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
140; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
141; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
142; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
143; GFX9-NEXT:    s_waitcnt vmcnt(0)
144; GFX9-NEXT:    s_setpc_b64 s[30:31]
145;
146; GFX10-LABEL: global_inst_valu_offset_neg_13bit_max:
147; GFX10:       ; %bb.0:
148; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
149; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
150; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
151; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
152; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
153; GFX10-NEXT:    s_waitcnt vmcnt(0)
154; GFX10-NEXT:    s_setpc_b64 s[30:31]
155  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192
156  %load = load i8, i8 addrspace(1)* %gep, align 4
157  ret i8 %load
158}
159
; Loads one byte at %p + 4095. The gfx900 checks fold the whole displacement
; into the immediate (offset:4095); the gfx1010 checks add 0x800 to the address
; and keep 2047 in the immediate.
160define i8 @global_inst_valu_offset_2x_11bit_max(i8 addrspace(1)* %p) {
161; GFX9-LABEL: global_inst_valu_offset_2x_11bit_max:
162; GFX9:       ; %bb.0:
163; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
164; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
165; GFX9-NEXT:    s_waitcnt vmcnt(0)
166; GFX9-NEXT:    s_setpc_b64 s[30:31]
167;
168; GFX10-LABEL: global_inst_valu_offset_2x_11bit_max:
169; GFX10:       ; %bb.0:
170; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
171; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
172; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
173; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
174; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
175; GFX10-NEXT:    s_waitcnt vmcnt(0)
176; GFX10-NEXT:    s_setpc_b64 s[30:31]
177  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
178  %load = load i8, i8 addrspace(1)* %gep, align 4
179  ret i8 %load
180}
181
; Loads one byte at %p + 8191. Expected split per target:
;   gfx900  adds 0x1000 to the address, immediate offset:4095
;   gfx1010 adds 0x1800 to the address, immediate offset:2047
182define i8 @global_inst_valu_offset_2x_12bit_max(i8 addrspace(1)* %p) {
183; GFX9-LABEL: global_inst_valu_offset_2x_12bit_max:
184; GFX9:       ; %bb.0:
185; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
186; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
187; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
188; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
189; GFX9-NEXT:    s_waitcnt vmcnt(0)
190; GFX9-NEXT:    s_setpc_b64 s[30:31]
191;
192; GFX10-LABEL: global_inst_valu_offset_2x_12bit_max:
193; GFX10:       ; %bb.0:
194; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
195; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
196; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1800, v0
197; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
198; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
199; GFX10-NEXT:    s_waitcnt vmcnt(0)
200; GFX10-NEXT:    s_setpc_b64 s[30:31]
201  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
202  %load = load i8, i8 addrspace(1)* %gep, align 4
203  ret i8 %load
204}
205
; Loads one byte at %p + 16383. Expected split per target:
;   gfx900  adds 0x3000 to the address, immediate offset:4095
;   gfx1010 adds 0x3800 to the address, immediate offset:2047
; In both cases the two parts sum to 16383.
206define i8 @global_inst_valu_offset_2x_13bit_max(i8 addrspace(1)* %p) {
207; GFX9-LABEL: global_inst_valu_offset_2x_13bit_max:
208; GFX9:       ; %bb.0:
209; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
210; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v0
211; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
212; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
213; GFX9-NEXT:    s_waitcnt vmcnt(0)
214; GFX9-NEXT:    s_setpc_b64 s[30:31]
215;
216; GFX10-LABEL: global_inst_valu_offset_2x_13bit_max:
217; GFX10:       ; %bb.0:
218; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
219; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
220; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3800, v0
221; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
222; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
223; GFX10-NEXT:    s_waitcnt vmcnt(0)
224; GFX10-NEXT:    s_setpc_b64 s[30:31]
225  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 16383
226  %load = load i8, i8 addrspace(1)* %gep, align 4
227  ret i8 %load
228}
229
; Loads one byte at %p - 4096. The gfx900 checks encode it as offset:-4096;
; the gfx1010 checks apply it with a 64-bit add (0xfffff000 low, -1 plus carry
; high) and use no immediate offset on the load.
230define i8 @global_inst_valu_offset_2x_neg_11bit_max(i8 addrspace(1)* %p) {
231; GFX9-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
232; GFX9:       ; %bb.0:
233; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
234; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4096
235; GFX9-NEXT:    s_waitcnt vmcnt(0)
236; GFX9-NEXT:    s_setpc_b64 s[30:31]
237;
238; GFX10-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
239; GFX10:       ; %bb.0:
240; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
241; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
242; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
243; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
244; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
245; GFX10-NEXT:    s_waitcnt vmcnt(0)
246; GFX10-NEXT:    s_setpc_b64 s[30:31]
247  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
248  %load = load i8, i8 addrspace(1)* %gep, align 4
249  ret i8 %load
250}
251
; Loads one byte at %p - 8192. Both gfx900 and gfx1010 checks expect a 64-bit
; add of the full displacement (0xffffe000 low, -1 plus carry high) followed by
; a load with no immediate offset.
252define i8 @global_inst_valu_offset_2x_neg_12bit_max(i8 addrspace(1)* %p) {
253; GFX9-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
254; GFX9:       ; %bb.0:
255; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
256; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
257; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
258; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
259; GFX9-NEXT:    s_waitcnt vmcnt(0)
260; GFX9-NEXT:    s_setpc_b64 s[30:31]
261;
262; GFX10-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
263; GFX10:       ; %bb.0:
264; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
265; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
266; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
267; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
268; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
269; GFX10-NEXT:    s_waitcnt vmcnt(0)
270; GFX10-NEXT:    s_setpc_b64 s[30:31]
271  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192
272  %load = load i8, i8 addrspace(1)* %gep, align 4
273  ret i8 %load
274}
275
; Loads one byte at %p - 16384. Both gfx900 and gfx1010 checks expect a 64-bit
; add of the full displacement (0xffffc000 low, -1 plus carry high) followed by
; a load with no immediate offset.
276define i8 @global_inst_valu_offset_2x_neg_13bit_max(i8 addrspace(1)* %p) {
277; GFX9-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
278; GFX9:       ; %bb.0:
279; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
280; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffc000, v0
281; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
282; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
283; GFX9-NEXT:    s_waitcnt vmcnt(0)
284; GFX9-NEXT:    s_setpc_b64 s[30:31]
285;
286; GFX10-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
287; GFX10:       ; %bb.0:
288; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
289; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
290; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
291; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
292; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
293; GFX10-NEXT:    s_waitcnt vmcnt(0)
294; GFX10-NEXT:    s_setpc_b64 s[30:31]
295  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -16384
296  %load = load i8, i8 addrspace(1)* %gep, align 4
297  ret i8 %load
298}
299
300; Fill 11-bit low-bits (1ull << 33) | 2047
; The GEP constant 8589936639 equals 2^33 + 2047. Expected lowering on both
; gfx900 and gfx1010: the low-half add uses 0, the high-half add-with-carry
; uses 2 (the 2^33 part), and the remaining 2047 sits in the load's immediate.
301define i8 @global_inst_valu_offset_64bit_11bit_split0(i8 addrspace(1)* %p) {
302; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_split0:
303; GFX9:       ; %bb.0:
304; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
305; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
306; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
307; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
308; GFX9-NEXT:    s_waitcnt vmcnt(0)
309; GFX9-NEXT:    s_setpc_b64 s[30:31]
310;
311; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_split0:
312; GFX10:       ; %bb.0:
313; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
314; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
315; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
316; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
317; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
318; GFX10-NEXT:    s_waitcnt vmcnt(0)
319; GFX10-NEXT:    s_setpc_b64 s[30:31]
320  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936639
321  %load = load i8, i8 addrspace(1)* %gep, align 4
322  ret i8 %load
323}
324
325; Fill 11-bit low-bits (1ull << 33) | 2048
; The GEP constant 8589936640 equals 2^33 + 2048. Both targets add 2 to the
; high half. For the low 2048, the gfx900 checks keep it in the immediate
; (offset:2048, low-half add of 0), while the gfx1010 checks add 0x800 to the
; low half and use no immediate offset.
326define i8 @global_inst_valu_offset_64bit_11bit_split1(i8 addrspace(1)* %p) {
327; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_split1:
328; GFX9:       ; %bb.0:
329; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
330; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
331; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
332; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2048
333; GFX9-NEXT:    s_waitcnt vmcnt(0)
334; GFX9-NEXT:    s_setpc_b64 s[30:31]
335;
336; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_split1:
337; GFX10:       ; %bb.0:
338; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
339; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
340; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
341; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
342; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
343; GFX10-NEXT:    s_waitcnt vmcnt(0)
344; GFX10-NEXT:    s_setpc_b64 s[30:31]
345  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936640
346  %load = load i8, i8 addrspace(1)* %gep, align 4
347  ret i8 %load
348}
349
350; Fill 12-bit low-bits (1ull << 33) | 4095
; The GEP constant 8589938687 equals 2^33 + 4095. Both targets add 2 to the
; high half. For the low 4095, the gfx900 checks keep it in the immediate
; (offset:4095, low-half add of 0), while the gfx1010 checks add 0x800 to the
; low half and keep 2047 in the immediate.
351define i8 @global_inst_valu_offset_64bit_12bit_split0(i8 addrspace(1)* %p) {
352; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_split0:
353; GFX9:       ; %bb.0:
354; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
355; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
356; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
357; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
358; GFX9-NEXT:    s_waitcnt vmcnt(0)
359; GFX9-NEXT:    s_setpc_b64 s[30:31]
360;
361; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_split0:
362; GFX10:       ; %bb.0:
363; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
364; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
365; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
366; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
367; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
368; GFX10-NEXT:    s_waitcnt vmcnt(0)
369; GFX10-NEXT:    s_setpc_b64 s[30:31]
370  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938687
371  %load = load i8, i8 addrspace(1)* %gep, align 4
372  ret i8 %load
373}
374
375; Fill 12-bit low-bits (1ull << 33) | 4096
; The GEP constant 8589938688 equals 2^33 + 4096. On both gfx900 and gfx1010
; the checks expect the entire displacement in the 64-bit add (0x1000 low,
; 2 plus carry high) and no immediate offset on the load.
376define i8 @global_inst_valu_offset_64bit_12bit_split1(i8 addrspace(1)* %p) {
377; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_split1:
378; GFX9:       ; %bb.0:
379; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
380; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
381; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
382; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
383; GFX9-NEXT:    s_waitcnt vmcnt(0)
384; GFX9-NEXT:    s_setpc_b64 s[30:31]
385;
386; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_split1:
387; GFX10:       ; %bb.0:
388; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
389; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
390; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
391; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
392; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
393; GFX10-NEXT:    s_waitcnt vmcnt(0)
394; GFX10-NEXT:    s_setpc_b64 s[30:31]
395  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938688
396  %load = load i8, i8 addrspace(1)* %gep, align 4
397  ret i8 %load
398}
399
400; Fill 13-bit low-bits (1ull << 33) | 8191
; The GEP constant 8589942783 equals 2^33 + 8191. Both targets add 2 to the
; high half and split the low 8191:
;   gfx900  adds 0x1000 to the low half, immediate offset:4095
;   gfx1010 adds 0x1800 to the low half, immediate offset:2047
401define i8 @global_inst_valu_offset_64bit_13bit_split0(i8 addrspace(1)* %p) {
402; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_split0:
403; GFX9:       ; %bb.0:
404; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
405; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
406; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
407; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
408; GFX9-NEXT:    s_waitcnt vmcnt(0)
409; GFX9-NEXT:    s_setpc_b64 s[30:31]
410;
411; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_split0:
412; GFX10:       ; %bb.0:
413; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
414; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
415; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1800, v0
416; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
417; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
418; GFX10-NEXT:    s_waitcnt vmcnt(0)
419; GFX10-NEXT:    s_setpc_b64 s[30:31]
420  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942783
421  %load = load i8, i8 addrspace(1)* %gep, align 4
422  ret i8 %load
423}
424
425; Fill 13-bit low-bits (1ull << 33) | 8192
; The GEP constant 8589942784 equals 2^33 + 8192. On both gfx900 and gfx1010
; the checks expect the entire displacement in the 64-bit add (0x2000 low,
; 2 plus carry high) and no immediate offset on the load.
426define i8 @global_inst_valu_offset_64bit_13bit_split1(i8 addrspace(1)* %p) {
427; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_split1:
428; GFX9:       ; %bb.0:
429; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
430; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
431; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
432; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
433; GFX9-NEXT:    s_waitcnt vmcnt(0)
434; GFX9-NEXT:    s_setpc_b64 s[30:31]
435;
436; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_split1:
437; GFX10:       ; %bb.0:
438; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
439; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
440; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
441; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
442; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
443; GFX10-NEXT:    s_waitcnt vmcnt(0)
444; GFX10-NEXT:    s_setpc_b64 s[30:31]
445  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942784
446  %load = load i8, i8 addrspace(1)* %gep, align 4
447  ret i8 %load
448}
449
450; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047
; The GEP constant -9223372036854773761 equals -2^63 + 2047. Expected lowering:
;   gfx900  adds 0x1000 to the low half and uses offset:-2049
;           (0x1000 - 2049 = 2047); the high-half addend comes from v2, which
;           is set up by v_bfrev_b32 with operand 1.
;   gfx1010 adds 0x800 to the low half and uses offset:-1 (0x800 - 1 = 2047);
;           the high-half addend is the literal 0x80000000.
451define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(i8 addrspace(1)* %p) {
452; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
453; GFX9:       ; %bb.0:
454; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
455; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
456; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
457; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
458; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2049
459; GFX9-NEXT:    s_waitcnt vmcnt(0)
460; GFX9-NEXT:    s_setpc_b64 s[30:31]
461;
462; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
463; GFX10:       ; %bb.0:
464; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
465; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
466; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
467; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
468; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
469; GFX10-NEXT:    s_waitcnt vmcnt(0)
470; GFX10-NEXT:    s_setpc_b64 s[30:31]
471  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773761
472  %load = load i8, i8 addrspace(1)* %gep, align 4
473  ret i8 %load
474}
475
476; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048
; The GEP constant -9223372036854773760 equals -2^63 + 2048. Expected lowering:
;   gfx900  adds 0x1000 to the low half and uses offset:-2048
;           (0x1000 - 2048 = 2048); the high-half addend comes from v2, which
;           is set up by v_bfrev_b32 with operand 1.
;   gfx1010 adds 0x800 to the low half with no immediate offset; the high-half
;           addend is the literal 0x80000000.
477define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(i8 addrspace(1)* %p) {
478; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
479; GFX9:       ; %bb.0:
480; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
481; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
482; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
483; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
484; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
485; GFX9-NEXT:    s_waitcnt vmcnt(0)
486; GFX9-NEXT:    s_setpc_b64 s[30:31]
487;
488; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
489; GFX10:       ; %bb.0:
490; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
491; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
492; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
493; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
494; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
495; GFX10-NEXT:    s_waitcnt vmcnt(0)
496; GFX10-NEXT:    s_setpc_b64 s[30:31]
497  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773760
498  %load = load i8, i8 addrspace(1)* %gep, align 4
499  ret i8 %load
500}
501
502; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095
; The GEP constant -9223372036854771713 equals -2^63 + 4095. Both targets are
; expected to add 0x1000 to the low half and use offset:-1 (0x1000 - 1 = 4095).
; The high-half addend is v2 (set up by v_bfrev_b32 with operand 1) on gfx900
; and the literal 0x80000000 on gfx1010.
503define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(i8 addrspace(1)* %p) {
504; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
505; GFX9:       ; %bb.0:
506; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
507; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
508; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
509; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
510; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
511; GFX9-NEXT:    s_waitcnt vmcnt(0)
512; GFX9-NEXT:    s_setpc_b64 s[30:31]
513;
514; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
515; GFX10:       ; %bb.0:
516; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
517; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
518; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
519; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
520; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
521; GFX10-NEXT:    s_waitcnt vmcnt(0)
522; GFX10-NEXT:    s_setpc_b64 s[30:31]
523  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771713
524  %load = load i8, i8 addrspace(1)* %gep, align 4
525  ret i8 %load
526}
527
528; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096
; The GEP constant -9223372036854771712 equals -2^63 + 4096. Both targets are
; expected to add 0x1000 to the low half and load with no immediate offset.
; The high-half addend is v2 (set up by v_bfrev_b32 with operand 1) on gfx900
; and the literal 0x80000000 on gfx1010.
529define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(i8 addrspace(1)* %p) {
530; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
531; GFX9:       ; %bb.0:
532; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
533; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
534; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
535; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
536; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
537; GFX9-NEXT:    s_waitcnt vmcnt(0)
538; GFX9-NEXT:    s_setpc_b64 s[30:31]
539;
540; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
541; GFX10:       ; %bb.0:
542; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
543; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
544; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
545; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
546; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
547; GFX10-NEXT:    s_waitcnt vmcnt(0)
548; GFX10-NEXT:    s_setpc_b64 s[30:31]
549  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771712
550  %load = load i8, i8 addrspace(1)* %gep, align 4
551  ret i8 %load
552}
553
554; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191
; The GEP constant -9223372036854767617 equals -2^63 + 8191. Both targets are
; expected to add 0x2000 to the low half and use offset:-1 (0x2000 - 1 = 8191).
; The high-half addend is v2 (set up by v_bfrev_b32 with operand 1) on gfx900
; and the literal 0x80000000 on gfx1010.
555define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(i8 addrspace(1)* %p) {
556; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
557; GFX9:       ; %bb.0:
558; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
559; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
560; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
561; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
562; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
563; GFX9-NEXT:    s_waitcnt vmcnt(0)
564; GFX9-NEXT:    s_setpc_b64 s[30:31]
565;
566; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
567; GFX10:       ; %bb.0:
568; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
569; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
570; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
571; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
572; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
573; GFX10-NEXT:    s_waitcnt vmcnt(0)
574; GFX10-NEXT:    s_setpc_b64 s[30:31]
575  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767617
576  %load = load i8, i8 addrspace(1)* %gep, align 4
577  ret i8 %load
578}
579
580; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192
; The GEP constant -9223372036854767616 equals -2^63 + 8192. Both targets are
; expected to add 0x2000 to the low half and load with no immediate offset.
; The high-half addend is v2 (set up by v_bfrev_b32 with operand 1) on gfx900
; and the literal 0x80000000 on gfx1010.
581define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(i8 addrspace(1)* %p) {
582; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
583; GFX9:       ; %bb.0:
584; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
585; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
586; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
587; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
588; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
589; GFX9-NEXT:    s_waitcnt vmcnt(0)
590; GFX9-NEXT:    s_setpc_b64 s[30:31]
591;
592; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
593; GFX10:       ; %bb.0:
594; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
595; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
596; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
597; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
598; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
599; GFX10-NEXT:    s_waitcnt vmcnt(0)
600; GFX10-NEXT:    s_setpc_b64 s[30:31]
601  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767616
602  %load = load i8, i8 addrspace(1)* %gep, align 4
603  ret i8 %load
604}
605
; Kernel variant: the checks fetch %p with s_load_dwordx2 into s[0:1]. A
; volatile byte load at %p + 1 is stored to an undef pointer. Both targets are
; expected to use s[0:1] as the base with a zero VGPR offset and offset:1 in
; the immediate. The load carries glc on gfx900 and glc dlc on gfx1010.
606define amdgpu_kernel void @global_inst_salu_offset_1(i8 addrspace(1)* %p) {
607; GFX9-LABEL: global_inst_salu_offset_1:
608; GFX9:       ; %bb.0:
609; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
610; GFX9-NEXT:    v_mov_b32_e32 v0, 0
611; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
612; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:1 glc
613; GFX9-NEXT:    s_waitcnt vmcnt(0)
614; GFX9-NEXT:    global_store_byte v[0:1], v0, off
615; GFX9-NEXT:    s_endpgm
616;
617; GFX10-LABEL: global_inst_salu_offset_1:
618; GFX10:       ; %bb.0:
619; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
620; GFX10-NEXT:    v_mov_b32_e32 v0, 0
621; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
622; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:1 glc dlc
623; GFX10-NEXT:    s_waitcnt vmcnt(0)
624; GFX10-NEXT:    global_store_byte v[0:1], v0, off
625; GFX10-NEXT:    s_endpgm
626  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 1
627  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
628  store i8 %load, i8 addrspace(1)* undef
629  ret void
630}
631
; Kernel variant with the pointer in s[0:1]: volatile byte load at %p + 2047,
; stored to an undef pointer. Both targets are expected to keep the whole
; displacement in the immediate (offset:2047) with a zero VGPR offset.
632define amdgpu_kernel void @global_inst_salu_offset_11bit_max(i8 addrspace(1)* %p) {
633; GFX9-LABEL: global_inst_salu_offset_11bit_max:
634; GFX9:       ; %bb.0:
635; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
636; GFX9-NEXT:    v_mov_b32_e32 v0, 0
637; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
638; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:2047 glc
639; GFX9-NEXT:    s_waitcnt vmcnt(0)
640; GFX9-NEXT:    global_store_byte v[0:1], v0, off
641; GFX9-NEXT:    s_endpgm
642;
643; GFX10-LABEL: global_inst_salu_offset_11bit_max:
644; GFX10:       ; %bb.0:
645; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
646; GFX10-NEXT:    v_mov_b32_e32 v0, 0
647; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
648; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc
649; GFX10-NEXT:    s_waitcnt vmcnt(0)
650; GFX10-NEXT:    global_store_byte v[0:1], v0, off
651; GFX10-NEXT:    s_endpgm
652  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 2047
653  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
654  store i8 %load, i8 addrspace(1)* undef
655  ret void
656}
657
; Kernel variant with the pointer in s[0:1]: volatile byte load at %p + 4095,
; stored to an undef pointer. The gfx900 checks keep the displacement in the
; immediate (offset:4095, VGPR offset 0). The gfx1010 checks put 0x800 in the
; VGPR offset and keep 2047 in the immediate.
658define amdgpu_kernel void @global_inst_salu_offset_12bit_max(i8 addrspace(1)* %p) {
659; GFX9-LABEL: global_inst_salu_offset_12bit_max:
660; GFX9:       ; %bb.0:
661; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
662; GFX9-NEXT:    v_mov_b32_e32 v0, 0
663; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
664; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:4095 glc
665; GFX9-NEXT:    s_waitcnt vmcnt(0)
666; GFX9-NEXT:    global_store_byte v[0:1], v0, off
667; GFX9-NEXT:    s_endpgm
668;
669; GFX10-LABEL: global_inst_salu_offset_12bit_max:
670; GFX10:       ; %bb.0:
671; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
672; GFX10-NEXT:    v_mov_b32_e32 v0, 0x800
673; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
674; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc
675; GFX10-NEXT:    s_waitcnt vmcnt(0)
676; GFX10-NEXT:    global_store_byte v[0:1], v0, off
677; GFX10-NEXT:    s_endpgm
678  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
679  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
680  store i8 %load, i8 addrspace(1)* undef
681  ret void
682}
683
; Kernel variant with the pointer in s[0:1]: volatile byte load at %p + 8191,
; stored to an undef pointer. Expected split per target:
;   gfx900  VGPR offset 0x1000, immediate offset:4095
;   gfx1010 VGPR offset 0x1800, immediate offset:2047
684define amdgpu_kernel void @global_inst_salu_offset_13bit_max(i8 addrspace(1)* %p) {
685; GFX9-LABEL: global_inst_salu_offset_13bit_max:
686; GFX9:       ; %bb.0:
687; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
688; GFX9-NEXT:    v_mov_b32_e32 v0, 0x1000
689; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
690; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:4095 glc
691; GFX9-NEXT:    s_waitcnt vmcnt(0)
692; GFX9-NEXT:    global_store_byte v[0:1], v0, off
693; GFX9-NEXT:    s_endpgm
694;
695; GFX10-LABEL: global_inst_salu_offset_13bit_max:
696; GFX10:       ; %bb.0:
697; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
698; GFX10-NEXT:    v_mov_b32_e32 v0, 0x1800
699; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
700; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc
701; GFX10-NEXT:    s_waitcnt vmcnt(0)
702; GFX10-NEXT:    global_store_byte v[0:1], v0, off
703; GFX10-NEXT:    s_endpgm
704  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
705  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
706  store i8 %load, i8 addrspace(1)* undef
707  ret void
708}
709
; Kernel variant with the pointer in s[0:1]: volatile byte load at %p - 2048,
; stored to an undef pointer. Both targets are expected to encode the negative
; displacement directly as offset:-2048 with a zero VGPR offset.
710define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(i8 addrspace(1)* %p) {
711; GFX9-LABEL: global_inst_salu_offset_neg_11bit_max:
712; GFX9:       ; %bb.0:
713; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
714; GFX9-NEXT:    v_mov_b32_e32 v0, 0
715; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
716; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:-2048 glc
717; GFX9-NEXT:    s_waitcnt vmcnt(0)
718; GFX9-NEXT:    global_store_byte v[0:1], v0, off
719; GFX9-NEXT:    s_endpgm
720;
721; GFX10-LABEL: global_inst_salu_offset_neg_11bit_max:
722; GFX10:       ; %bb.0:
723; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
724; GFX10-NEXT:    v_mov_b32_e32 v0, 0
725; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
726; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:-2048 glc dlc
727; GFX10-NEXT:    s_waitcnt vmcnt(0)
728; GFX10-NEXT:    global_store_byte v[0:1], v0, off
729; GFX10-NEXT:    s_endpgm
730  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -2048
731  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
732  store i8 %load, i8 addrspace(1)* undef
733  ret void
734}
735
; Volatile byte load from the kernel pointer argument minus 4096.
; gfx900 is expected to place -4096 directly in the immediate offset.
; gfx1010 is expected to add 0xfffff000 / -1 (with carry) to the 64-bit base
; in VGPRs and issue the load with no immediate offset.
736define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(i8 addrspace(1)* %p) {
737; GFX9-LABEL: global_inst_salu_offset_neg_12bit_max:
738; GFX9:       ; %bb.0:
739; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
740; GFX9-NEXT:    v_mov_b32_e32 v0, 0
741; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
742; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:-4096 glc
743; GFX9-NEXT:    s_waitcnt vmcnt(0)
744; GFX9-NEXT:    global_store_byte v[0:1], v0, off
745; GFX9-NEXT:    s_endpgm
746;
747; GFX10-LABEL: global_inst_salu_offset_neg_12bit_max:
748; GFX10:       ; %bb.0:
749; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
750; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
751; GFX10-NEXT:    v_add_co_u32 v0, s0, 0xfffff000, s0
752; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
753; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off glc dlc
754; GFX10-NEXT:    s_waitcnt vmcnt(0)
755; GFX10-NEXT:    global_store_byte v[0:1], v0, off
756; GFX10-NEXT:    s_endpgm
757  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
758  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
759  store i8 %load, i8 addrspace(1)* undef
760  ret void
761}
762
; Volatile byte load from the kernel pointer argument minus 8192.
; Neither target is expected to use an immediate offset here: gfx900 adjusts
; the SGPR base with an s_add_u32/s_addc_u32 pair (0xffffe000 / -1), while
; gfx1010 performs the same 64-bit add with v_add_co_u32/v_add_co_ci_u32.
763define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(i8 addrspace(1)* %p) {
764; GFX9-LABEL: global_inst_salu_offset_neg_13bit_max:
765; GFX9:       ; %bb.0:
766; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
767; GFX9-NEXT:    v_mov_b32_e32 v0, 0
768; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
769; GFX9-NEXT:    s_add_u32 s0, s0, 0xffffe000
770; GFX9-NEXT:    s_addc_u32 s1, s1, -1
771; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] glc
772; GFX9-NEXT:    s_waitcnt vmcnt(0)
773; GFX9-NEXT:    global_store_byte v[0:1], v0, off
774; GFX9-NEXT:    s_endpgm
775;
776; GFX10-LABEL: global_inst_salu_offset_neg_13bit_max:
777; GFX10:       ; %bb.0:
778; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
779; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
780; GFX10-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
781; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
782; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off glc dlc
783; GFX10-NEXT:    s_waitcnt vmcnt(0)
784; GFX10-NEXT:    global_store_byte v[0:1], v0, off
785; GFX10-NEXT:    s_endpgm
786  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192
787  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
788  store i8 %load, i8 addrspace(1)* undef
789  ret void
790}
791
; Volatile byte load from the kernel pointer argument plus 4095.
; gfx900 is expected to place 4095 entirely in the immediate offset with a
; zero VGPR offset; gfx1010 is expected to split it as VGPR offset 0x800
; plus immediate 2047.
792define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(i8 addrspace(1)* %p) {
793; GFX9-LABEL: global_inst_salu_offset_2x_11bit_max:
794; GFX9:       ; %bb.0:
795; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
796; GFX9-NEXT:    v_mov_b32_e32 v0, 0
797; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
798; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:4095 glc
799; GFX9-NEXT:    s_waitcnt vmcnt(0)
800; GFX9-NEXT:    global_store_byte v[0:1], v0, off
801; GFX9-NEXT:    s_endpgm
802;
803; GFX10-LABEL: global_inst_salu_offset_2x_11bit_max:
804; GFX10:       ; %bb.0:
805; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
806; GFX10-NEXT:    v_mov_b32_e32 v0, 0x800
807; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
808; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc
809; GFX10-NEXT:    s_waitcnt vmcnt(0)
810; GFX10-NEXT:    global_store_byte v[0:1], v0, off
811; GFX10-NEXT:    s_endpgm
812  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
813  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
814  store i8 %load, i8 addrspace(1)* undef
815  ret void
816}
817
; Volatile byte load from the kernel pointer argument plus 8191.
; Expected split: gfx900 uses VGPR offset 0x1000 plus immediate 4095;
; gfx1010 uses VGPR offset 0x1800 plus immediate 2047. This is the same
; offset and expected output as global_inst_salu_offset_13bit_max.
818define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(i8 addrspace(1)* %p) {
819; GFX9-LABEL: global_inst_salu_offset_2x_12bit_max:
820; GFX9:       ; %bb.0:
821; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
822; GFX9-NEXT:    v_mov_b32_e32 v0, 0x1000
823; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
824; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:4095 glc
825; GFX9-NEXT:    s_waitcnt vmcnt(0)
826; GFX9-NEXT:    global_store_byte v[0:1], v0, off
827; GFX9-NEXT:    s_endpgm
828;
829; GFX10-LABEL: global_inst_salu_offset_2x_12bit_max:
830; GFX10:       ; %bb.0:
831; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
832; GFX10-NEXT:    v_mov_b32_e32 v0, 0x1800
833; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
834; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc
835; GFX10-NEXT:    s_waitcnt vmcnt(0)
836; GFX10-NEXT:    global_store_byte v[0:1], v0, off
837; GFX10-NEXT:    s_endpgm
838  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
839  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
840  store i8 %load, i8 addrspace(1)* undef
841  ret void
842}
843
; Volatile byte load from the kernel pointer argument plus 16383.
; Expected split: gfx900 uses VGPR offset 0x3000 plus immediate 4095;
; gfx1010 uses VGPR offset 0x3800 plus immediate 2047.
844define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(i8 addrspace(1)* %p) {
845; GFX9-LABEL: global_inst_salu_offset_2x_13bit_max:
846; GFX9:       ; %bb.0:
847; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
848; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3000
849; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
850; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:4095 glc
851; GFX9-NEXT:    s_waitcnt vmcnt(0)
852; GFX9-NEXT:    global_store_byte v[0:1], v0, off
853; GFX9-NEXT:    s_endpgm
854;
855; GFX10-LABEL: global_inst_salu_offset_2x_13bit_max:
856; GFX10:       ; %bb.0:
857; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
858; GFX10-NEXT:    v_mov_b32_e32 v0, 0x3800
859; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
860; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc
861; GFX10-NEXT:    s_waitcnt vmcnt(0)
862; GFX10-NEXT:    global_store_byte v[0:1], v0, off
863; GFX10-NEXT:    s_endpgm
864  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 16383
865  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
866  store i8 %load, i8 addrspace(1)* undef
867  ret void
868}
869
; Volatile byte load from the kernel pointer argument minus 4096.
; gfx900 is expected to place -4096 directly in the immediate offset.
; gfx1010 is expected to add 0xfffff000 / -1 (with carry) to the 64-bit base
; in VGPRs and load with no immediate offset. This is the same offset and
; expected output as global_inst_salu_offset_neg_12bit_max.
870define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(i8 addrspace(1)* %p) {
871; GFX9-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
872; GFX9:       ; %bb.0:
873; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
874; GFX9-NEXT:    v_mov_b32_e32 v0, 0
875; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
876; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:-4096 glc
877; GFX9-NEXT:    s_waitcnt vmcnt(0)
878; GFX9-NEXT:    global_store_byte v[0:1], v0, off
879; GFX9-NEXT:    s_endpgm
880;
881; GFX10-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
882; GFX10:       ; %bb.0:
883; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
884; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
885; GFX10-NEXT:    v_add_co_u32 v0, s0, 0xfffff000, s0
886; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
887; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off glc dlc
888; GFX10-NEXT:    s_waitcnt vmcnt(0)
889; GFX10-NEXT:    global_store_byte v[0:1], v0, off
890; GFX10-NEXT:    s_endpgm
891  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
892  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
893  store i8 %load, i8 addrspace(1)* undef
894  ret void
895}
896
; Volatile byte load from the kernel pointer argument minus 8192.
; Neither target is expected to use an immediate offset: gfx900 adjusts the
; SGPR base with s_add_u32/s_addc_u32 (0xffffe000 / -1), while gfx1010 does
; the 64-bit add with v_add_co_u32/v_add_co_ci_u32. This is the same offset
; and expected output as global_inst_salu_offset_neg_13bit_max.
897define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(i8 addrspace(1)* %p) {
898; GFX9-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
899; GFX9:       ; %bb.0:
900; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
901; GFX9-NEXT:    v_mov_b32_e32 v0, 0
902; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
903; GFX9-NEXT:    s_add_u32 s0, s0, 0xffffe000
904; GFX9-NEXT:    s_addc_u32 s1, s1, -1
905; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] glc
906; GFX9-NEXT:    s_waitcnt vmcnt(0)
907; GFX9-NEXT:    global_store_byte v[0:1], v0, off
908; GFX9-NEXT:    s_endpgm
909;
910; GFX10-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
911; GFX10:       ; %bb.0:
912; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
913; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
914; GFX10-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
915; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
916; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off glc dlc
917; GFX10-NEXT:    s_waitcnt vmcnt(0)
918; GFX10-NEXT:    global_store_byte v[0:1], v0, off
919; GFX10-NEXT:    s_endpgm
920  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192
921  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
922  store i8 %load, i8 addrspace(1)* undef
923  ret void
924}
925
; Volatile byte load from the kernel pointer argument minus 16384.
; Neither target is expected to use an immediate offset: gfx900 adjusts the
; SGPR base with s_add_u32/s_addc_u32 (0xffffc000 / -1), while gfx1010 does
; the 64-bit add with v_add_co_u32/v_add_co_ci_u32.
926define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(i8 addrspace(1)* %p) {
927; GFX9-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
928; GFX9:       ; %bb.0:
929; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
930; GFX9-NEXT:    v_mov_b32_e32 v0, 0
931; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
932; GFX9-NEXT:    s_add_u32 s0, s0, 0xffffc000
933; GFX9-NEXT:    s_addc_u32 s1, s1, -1
934; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] glc
935; GFX9-NEXT:    s_waitcnt vmcnt(0)
936; GFX9-NEXT:    global_store_byte v[0:1], v0, off
937; GFX9-NEXT:    s_endpgm
938;
939; GFX10-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
940; GFX10:       ; %bb.0:
941; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
942; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
943; GFX10-NEXT:    v_add_co_u32 v0, s0, 0xffffc000, s0
944; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
945; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off glc dlc
946; GFX10-NEXT:    s_waitcnt vmcnt(0)
947; GFX10-NEXT:    global_store_byte v[0:1], v0, off
948; GFX10-NEXT:    s_endpgm
949  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -16384
950  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
951  store i8 %load, i8 addrspace(1)* undef
952  ret void
953}
954
955; Fill 11-bit low-bits (1ull << 33) | 2047
; The GEP constant 8589936639 equals 2^33 + 2047, so the high dword of the
; offset is 2. gfx900 is expected to add 0x7ff / 2 to the SGPR base and load
; with no immediate; gfx1010 is expected to add 0 / 2 in VGPRs and keep 2047
; in the immediate offset.
956define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(i8 addrspace(1)* %p) {
957; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split0:
958; GFX9:       ; %bb.0:
959; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
960; GFX9-NEXT:    v_mov_b32_e32 v0, 0
961; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
962; GFX9-NEXT:    s_add_u32 s0, s0, 0x7ff
963; GFX9-NEXT:    s_addc_u32 s1, s1, 2
964; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] glc
965; GFX9-NEXT:    s_waitcnt vmcnt(0)
966; GFX9-NEXT:    global_store_byte v[0:1], v0, off
967; GFX9-NEXT:    s_endpgm
968;
969; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_split0:
970; GFX10:       ; %bb.0:
971; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
972; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
973; GFX10-NEXT:    v_add_co_u32 v0, s0, 0, s0
974; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
975; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047 glc dlc
976; GFX10-NEXT:    s_waitcnt vmcnt(0)
977; GFX10-NEXT:    global_store_byte v[0:1], v0, off
978; GFX10-NEXT:    s_endpgm
979  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936639
980  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
981  store i8 %load, i8 addrspace(1)* undef
982  ret void
983}
984
985; Fill 11-bit low-bits (1ull << 33) | 2048
; The GEP constant 8589936640 equals 2^33 + 2048, so the high dword of the
; offset is 2. Neither target is expected to use an immediate offset: gfx900
; adds 0x800 / 2 to the SGPR base, gfx1010 adds 0x800 / 2 in VGPRs.
986define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(i8 addrspace(1)* %p) {
987; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split1:
988; GFX9:       ; %bb.0:
989; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
990; GFX9-NEXT:    v_mov_b32_e32 v0, 0
991; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
992; GFX9-NEXT:    s_add_u32 s0, s0, 0x800
993; GFX9-NEXT:    s_addc_u32 s1, s1, 2
994; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] glc
995; GFX9-NEXT:    s_waitcnt vmcnt(0)
996; GFX9-NEXT:    global_store_byte v[0:1], v0, off
997; GFX9-NEXT:    s_endpgm
998;
999; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_split1:
1000; GFX10:       ; %bb.0:
1001; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1002; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1003; GFX10-NEXT:    v_add_co_u32 v0, s0, 0x800, s0
1004; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
1005; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off glc dlc
1006; GFX10-NEXT:    s_waitcnt vmcnt(0)
1007; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1008; GFX10-NEXT:    s_endpgm
1009  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936640
1010  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1011  store i8 %load, i8 addrspace(1)* undef
1012  ret void
1013}
1014
1015; Fill 12-bit low-bits (1ull << 33) | 4095
; The GEP constant 8589938687 equals 2^33 + 4095, so the high dword of the
; offset is 2. gfx900 is expected to add 0xfff / 2 to the SGPR base and load
; with no immediate; gfx1010 is expected to add 0x800 / 2 in VGPRs and keep
; 2047 in the immediate offset.
1016define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(i8 addrspace(1)* %p) {
1017; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split0:
1018; GFX9:       ; %bb.0:
1019; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1020; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1021; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1022; GFX9-NEXT:    s_add_u32 s0, s0, 0xfff
1023; GFX9-NEXT:    s_addc_u32 s1, s1, 2
1024; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] glc
1025; GFX9-NEXT:    s_waitcnt vmcnt(0)
1026; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1027; GFX9-NEXT:    s_endpgm
1028;
1029; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_split0:
1030; GFX10:       ; %bb.0:
1031; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1032; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1033; GFX10-NEXT:    v_add_co_u32 v0, s0, 0x800, s0
1034; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
1035; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047 glc dlc
1036; GFX10-NEXT:    s_waitcnt vmcnt(0)
1037; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1038; GFX10-NEXT:    s_endpgm
1039  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938687
1040  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1041  store i8 %load, i8 addrspace(1)* undef
1042  ret void
1043}
1044
1045; Fill 12-bit low-bits (1ull << 33) | 4096
; The GEP constant 8589938688 equals 2^33 + 4096, so the high dword of the
; offset is 2. Neither target is expected to use an immediate offset: gfx900
; adds 0x1000 / 2 to the SGPR base, gfx1010 adds 0x1000 / 2 in VGPRs.
1046define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(i8 addrspace(1)* %p) {
1047; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split1:
1048; GFX9:       ; %bb.0:
1049; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1050; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1051; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1052; GFX9-NEXT:    s_add_u32 s0, s0, 0x1000
1053; GFX9-NEXT:    s_addc_u32 s1, s1, 2
1054; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] glc
1055; GFX9-NEXT:    s_waitcnt vmcnt(0)
1056; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1057; GFX9-NEXT:    s_endpgm
1058;
1059; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_split1:
1060; GFX10:       ; %bb.0:
1061; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1062; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1063; GFX10-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
1064; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
1065; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off glc dlc
1066; GFX10-NEXT:    s_waitcnt vmcnt(0)
1067; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1068; GFX10-NEXT:    s_endpgm
1069  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938688
1070  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1071  store i8 %load, i8 addrspace(1)* undef
1072  ret void
1073}
1074
1075; Fill 13-bit low-bits (1ull << 33) | 8191
; The GEP constant 8589942783 equals 2^33 + 8191, so the high dword of the
; offset is 2. gfx900 is expected to add 0x1fff / 2 to the SGPR base and load
; with no immediate; gfx1010 is expected to add 0x1800 / 2 in VGPRs and keep
; 2047 in the immediate offset.
1076define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(i8 addrspace(1)* %p) {
1077; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split0:
1078; GFX9:       ; %bb.0:
1079; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1080; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1081; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1082; GFX9-NEXT:    s_add_u32 s0, s0, 0x1fff
1083; GFX9-NEXT:    s_addc_u32 s1, s1, 2
1084; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] glc
1085; GFX9-NEXT:    s_waitcnt vmcnt(0)
1086; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1087; GFX9-NEXT:    s_endpgm
1088;
1089; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_split0:
1090; GFX10:       ; %bb.0:
1091; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1092; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1093; GFX10-NEXT:    v_add_co_u32 v0, s0, 0x1800, s0
1094; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
1095; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047 glc dlc
1096; GFX10-NEXT:    s_waitcnt vmcnt(0)
1097; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1098; GFX10-NEXT:    s_endpgm
1099  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942783
1100  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1101  store i8 %load, i8 addrspace(1)* undef
1102  ret void
1103}
1104
1105; Fill 13-bit low-bits (1ull << 33) | 8192
; The GEP constant 8589942784 equals 2^33 + 8192, so the high dword of the
; offset is 2. Neither target is expected to use an immediate offset: gfx900
; adds 0x2000 / 2 to the SGPR base, gfx1010 adds 0x2000 / 2 in VGPRs.
1106define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(i8 addrspace(1)* %p) {
1107; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split1:
1108; GFX9:       ; %bb.0:
1109; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1110; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1111; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1112; GFX9-NEXT:    s_add_u32 s0, s0, 0x2000
1113; GFX9-NEXT:    s_addc_u32 s1, s1, 2
1114; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] glc
1115; GFX9-NEXT:    s_waitcnt vmcnt(0)
1116; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1117; GFX9-NEXT:    s_endpgm
1118;
1119; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_split1:
1120; GFX10:       ; %bb.0:
1121; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1122; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1123; GFX10-NEXT:    v_add_co_u32 v0, s0, 0x2000, s0
1124; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
1125; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off glc dlc
1126; GFX10-NEXT:    s_waitcnt vmcnt(0)
1127; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1128; GFX10-NEXT:    s_endpgm
1129  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942784
1130  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1131  store i8 %load, i8 addrspace(1)* undef
1132  ret void
1133}
1134
1135; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047
; The GEP constant -9223372036854773761 has the bit pattern 2^63 + 2047:
; low dword 0x7ff, high dword 0x80000000. Both gfx900 and gfx1010 are
; expected to add the whole offset to the SGPR base with
; s_add_u32/s_addc_u32 and load with no immediate offset.
1136define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(i8 addrspace(1)* %p) {
1137; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
1138; GFX9:       ; %bb.0:
1139; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1140; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1141; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1142; GFX9-NEXT:    s_add_u32 s0, s0, 0x7ff
1143; GFX9-NEXT:    s_addc_u32 s1, s1, 0x80000000
1144; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] glc
1145; GFX9-NEXT:    s_waitcnt vmcnt(0)
1146; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1147; GFX9-NEXT:    s_endpgm
1148;
1149; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
1150; GFX10:       ; %bb.0:
1151; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1152; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1153; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1154; GFX10-NEXT:    s_add_u32 s0, s0, 0x7ff
1155; GFX10-NEXT:    s_addc_u32 s1, s1, 0x80000000
1156; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] glc dlc
1157; GFX10-NEXT:    s_waitcnt vmcnt(0)
1158; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1159; GFX10-NEXT:    s_endpgm
1160  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773761
1161  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1162  store i8 %load, i8 addrspace(1)* undef
1163  ret void
1164}
1165
1166; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048
; The GEP constant -9223372036854773760 has the bit pattern 2^63 + 2048:
; low dword 0x800, high dword 0x80000000. Both gfx900 and gfx1010 are
; expected to add the whole offset to the SGPR base with
; s_add_u32/s_addc_u32 and load with no immediate offset.
1167define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(i8 addrspace(1)* %p) {
1168; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
1169; GFX9:       ; %bb.0:
1170; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1171; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1172; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1173; GFX9-NEXT:    s_add_u32 s0, s0, 0x800
1174; GFX9-NEXT:    s_addc_u32 s1, s1, 0x80000000
1175; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] glc
1176; GFX9-NEXT:    s_waitcnt vmcnt(0)
1177; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1178; GFX9-NEXT:    s_endpgm
1179;
1180; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
1181; GFX10:       ; %bb.0:
1182; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1183; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1184; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1185; GFX10-NEXT:    s_add_u32 s0, s0, 0x800
1186; GFX10-NEXT:    s_addc_u32 s1, s1, 0x80000000
1187; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] glc dlc
1188; GFX10-NEXT:    s_waitcnt vmcnt(0)
1189; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1190; GFX10-NEXT:    s_endpgm
1191  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773760
1192  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1193  store i8 %load, i8 addrspace(1)* undef
1194  ret void
1195}
1196
1197; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095
; The GEP constant -9223372036854771713 has the bit pattern 2^63 + 4095:
; low dword 0xfff, high dword 0x80000000. Both gfx900 and gfx1010 are
; expected to add the whole offset to the SGPR base with
; s_add_u32/s_addc_u32 and load with no immediate offset.
1198define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(i8 addrspace(1)* %p) {
1199; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
1200; GFX9:       ; %bb.0:
1201; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1202; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1203; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1204; GFX9-NEXT:    s_add_u32 s0, s0, 0xfff
1205; GFX9-NEXT:    s_addc_u32 s1, s1, 0x80000000
1206; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] glc
1207; GFX9-NEXT:    s_waitcnt vmcnt(0)
1208; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1209; GFX9-NEXT:    s_endpgm
1210;
1211; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
1212; GFX10:       ; %bb.0:
1213; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1214; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1215; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1216; GFX10-NEXT:    s_add_u32 s0, s0, 0xfff
1217; GFX10-NEXT:    s_addc_u32 s1, s1, 0x80000000
1218; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] glc dlc
1219; GFX10-NEXT:    s_waitcnt vmcnt(0)
1220; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1221; GFX10-NEXT:    s_endpgm
1222  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771713
1223  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1224  store i8 %load, i8 addrspace(1)* undef
1225  ret void
1226}
1227
1228; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096
; The GEP constant -9223372036854771712 has the bit pattern 2^63 + 4096:
; low dword 0x1000, high dword 0x80000000. Both gfx900 and gfx1010 are
; expected to add the whole offset to the SGPR base with
; s_add_u32/s_addc_u32 and load with no immediate offset.
1229define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(i8 addrspace(1)* %p) {
1230; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
1231; GFX9:       ; %bb.0:
1232; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1233; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1234; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1235; GFX9-NEXT:    s_add_u32 s0, s0, 0x1000
1236; GFX9-NEXT:    s_addc_u32 s1, s1, 0x80000000
1237; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] glc
1238; GFX9-NEXT:    s_waitcnt vmcnt(0)
1239; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1240; GFX9-NEXT:    s_endpgm
1241;
1242; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
1243; GFX10:       ; %bb.0:
1244; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1245; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1246; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1247; GFX10-NEXT:    s_add_u32 s0, s0, 0x1000
1248; GFX10-NEXT:    s_addc_u32 s1, s1, 0x80000000
1249; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] glc dlc
1250; GFX10-NEXT:    s_waitcnt vmcnt(0)
1251; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1252; GFX10-NEXT:    s_endpgm
1253  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771712
1254  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1255  store i8 %load, i8 addrspace(1)* undef
1256  ret void
1257}
1258
1259; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191
; The GEP constant -9223372036854767617 has the bit pattern 2^63 + 8191:
; low dword 0x1fff, high dword 0x80000000. Both gfx900 and gfx1010 are
; expected to add the whole offset to the SGPR base with
; s_add_u32/s_addc_u32 and load with no immediate offset.
1260define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(i8 addrspace(1)* %p) {
1261; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
1262; GFX9:       ; %bb.0:
1263; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1264; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1265; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1266; GFX9-NEXT:    s_add_u32 s0, s0, 0x1fff
1267; GFX9-NEXT:    s_addc_u32 s1, s1, 0x80000000
1268; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] glc
1269; GFX9-NEXT:    s_waitcnt vmcnt(0)
1270; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1271; GFX9-NEXT:    s_endpgm
1272;
1273; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
1274; GFX10:       ; %bb.0:
1275; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1276; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1277; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1278; GFX10-NEXT:    s_add_u32 s0, s0, 0x1fff
1279; GFX10-NEXT:    s_addc_u32 s1, s1, 0x80000000
1280; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] glc dlc
1281; GFX10-NEXT:    s_waitcnt vmcnt(0)
1282; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1283; GFX10-NEXT:    s_endpgm
1284  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767617
1285  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1286  store i8 %load, i8 addrspace(1)* undef
1287  ret void
1288}
1289
1290; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192
; The GEP constant -9223372036854767616 has the bit pattern 2^63 + 8192:
; low dword 0x2000, high dword 0x80000000. Both gfx900 and gfx1010 are
; expected to add the whole offset to the SGPR base with
; s_add_u32/s_addc_u32 and load with no immediate offset.
1291define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(i8 addrspace(1)* %p) {
1292; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
1293; GFX9:       ; %bb.0:
1294; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1295; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1296; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1297; GFX9-NEXT:    s_add_u32 s0, s0, 0x2000
1298; GFX9-NEXT:    s_addc_u32 s1, s1, 0x80000000
1299; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] glc
1300; GFX9-NEXT:    s_waitcnt vmcnt(0)
1301; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1302; GFX9-NEXT:    s_endpgm
1303;
1304; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
1305; GFX10:       ; %bb.0:
1306; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1307; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1308; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1309; GFX10-NEXT:    s_add_u32 s0, s0, 0x2000
1310; GFX10-NEXT:    s_addc_u32 s1, s1, 0x80000000
1311; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] glc dlc
1312; GFX10-NEXT:    s_waitcnt vmcnt(0)
1313; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1314; GFX10-NEXT:    s_endpgm
1315  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767616
1316  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1317  store i8 %load, i8 addrspace(1)* undef
1318  ret void
1319}
1320