1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
3; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
4
5; Test splitting global instruction offsets into the low and high bits
6; when the offset doesn't fit in the offset field.
7
; Byte offset 1: both the gfx900 and gfx1010 checks expect the whole offset in
; the load's immediate field (offset:1) with no extra address arithmetic.
8define i8 @global_inst_valu_offset_1(i8 addrspace(1)* %p) {
9; GFX9-LABEL: global_inst_valu_offset_1:
10; GFX9:       ; %bb.0:
11; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
13; GFX9-NEXT:    s_waitcnt vmcnt(0)
14; GFX9-NEXT:    s_setpc_b64 s[30:31]
15;
16; GFX10-LABEL: global_inst_valu_offset_1:
17; GFX10:       ; %bb.0:
18; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
20; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
21; GFX10-NEXT:    ; implicit-def: $vcc_hi
22; GFX10-NEXT:    s_waitcnt vmcnt(0)
23; GFX10-NEXT:    s_setpc_b64 s[30:31]
24  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 1
25  %load = load i8, i8 addrspace(1)* %gep, align 4
26  ret i8 %load
27}
28
; Byte offset 2047: both the gfx900 and gfx1010 checks expect the offset folded
; entirely into the load's immediate field (offset:2047).
29define i8 @global_inst_valu_offset_11bit_max(i8 addrspace(1)* %p) {
30; GFX9-LABEL: global_inst_valu_offset_11bit_max:
31; GFX9:       ; %bb.0:
32; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
34; GFX9-NEXT:    s_waitcnt vmcnt(0)
35; GFX9-NEXT:    s_setpc_b64 s[30:31]
36;
37; GFX10-LABEL: global_inst_valu_offset_11bit_max:
38; GFX10:       ; %bb.0:
39; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
41; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
42; GFX10-NEXT:    ; implicit-def: $vcc_hi
43; GFX10-NEXT:    s_waitcnt vmcnt(0)
44; GFX10-NEXT:    s_setpc_b64 s[30:31]
45  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 2047
46  %load = load i8, i8 addrspace(1)* %gep, align 4
47  ret i8 %load
48}
49
; Byte offset 4095: the gfx900 checks expect it as a single immediate
; (offset:4095); the gfx1010 checks expect it split into a 64-bit add of 0x800
; to the address plus an immediate of offset:2047.
50define i8 @global_inst_valu_offset_12bit_max(i8 addrspace(1)* %p) {
51; GFX9-LABEL: global_inst_valu_offset_12bit_max:
52; GFX9:       ; %bb.0:
53; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
54; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
55; GFX9-NEXT:    s_waitcnt vmcnt(0)
56; GFX9-NEXT:    s_setpc_b64 s[30:31]
57;
58; GFX10-LABEL: global_inst_valu_offset_12bit_max:
59; GFX10:       ; %bb.0:
60; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
61; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
62; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
63; GFX10-NEXT:    ; implicit-def: $vcc_hi
64; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
65; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
66; GFX10-NEXT:    s_waitcnt vmcnt(0)
67; GFX10-NEXT:    s_setpc_b64 s[30:31]
68  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
69  %load = load i8, i8 addrspace(1)* %gep, align 4
70  ret i8 %load
71}
72
; Byte offset 8191: both targets are expected to split it. The gfx900 checks
; add 0x1000 to the address and use offset:4095; the gfx1010 checks add 0x1800
; and use offset:2047.
73define i8 @global_inst_valu_offset_13bit_max(i8 addrspace(1)* %p) {
74; GFX9-LABEL: global_inst_valu_offset_13bit_max:
75; GFX9:       ; %bb.0:
76; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
77; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
78; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
79; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
80; GFX9-NEXT:    s_waitcnt vmcnt(0)
81; GFX9-NEXT:    s_setpc_b64 s[30:31]
82;
83; GFX10-LABEL: global_inst_valu_offset_13bit_max:
84; GFX10:       ; %bb.0:
85; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
86; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
87; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1800, v0
88; GFX10-NEXT:    ; implicit-def: $vcc_hi
89; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
90; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
91; GFX10-NEXT:    s_waitcnt vmcnt(0)
92; GFX10-NEXT:    s_setpc_b64 s[30:31]
93  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
94  %load = load i8, i8 addrspace(1)* %gep, align 4
95  ret i8 %load
96}
97
; Byte offset -2048: both the gfx900 and gfx1010 checks expect the negative
; offset folded entirely into the load's immediate field (offset:-2048).
98define i8 @global_inst_valu_offset_neg_11bit_max(i8 addrspace(1)* %p) {
99; GFX9-LABEL: global_inst_valu_offset_neg_11bit_max:
100; GFX9:       ; %bb.0:
101; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
102; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
103; GFX9-NEXT:    s_waitcnt vmcnt(0)
104; GFX9-NEXT:    s_setpc_b64 s[30:31]
105;
106; GFX10-LABEL: global_inst_valu_offset_neg_11bit_max:
107; GFX10:       ; %bb.0:
108; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
109; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
110; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
111; GFX10-NEXT:    ; implicit-def: $vcc_hi
112; GFX10-NEXT:    s_waitcnt vmcnt(0)
113; GFX10-NEXT:    s_setpc_b64 s[30:31]
114  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -2048
115  %load = load i8, i8 addrspace(1)* %gep, align 4
116  ret i8 %load
117}
118
; Byte offset -4096: the gfx900 checks expect a single immediate
; (offset:-4096); the gfx1010 checks expect the whole offset applied by a
; 64-bit add (0xfffff000 low, -1 high) and a load with no immediate offset.
119define i8 @global_inst_valu_offset_neg_12bit_max(i8 addrspace(1)* %p) {
120; GFX9-LABEL: global_inst_valu_offset_neg_12bit_max:
121; GFX9:       ; %bb.0:
122; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
123; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4096
124; GFX9-NEXT:    s_waitcnt vmcnt(0)
125; GFX9-NEXT:    s_setpc_b64 s[30:31]
126;
127; GFX10-LABEL: global_inst_valu_offset_neg_12bit_max:
128; GFX10:       ; %bb.0:
129; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
130; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
131; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0
132; GFX10-NEXT:    ; implicit-def: $vcc_hi
133; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
134; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
135; GFX10-NEXT:    s_waitcnt vmcnt(0)
136; GFX10-NEXT:    s_setpc_b64 s[30:31]
137  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
138  %load = load i8, i8 addrspace(1)* %gep, align 4
139  ret i8 %load
140}
141
; Byte offset -8192: both the gfx900 and gfx1010 checks expect the whole offset
; applied by a 64-bit add (0xffffe000 low, -1 high) followed by a load with no
; immediate offset.
142define i8 @global_inst_valu_offset_neg_13bit_max(i8 addrspace(1)* %p) {
143; GFX9-LABEL: global_inst_valu_offset_neg_13bit_max:
144; GFX9:       ; %bb.0:
145; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
146; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
147; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
148; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
149; GFX9-NEXT:    s_waitcnt vmcnt(0)
150; GFX9-NEXT:    s_setpc_b64 s[30:31]
151;
152; GFX10-LABEL: global_inst_valu_offset_neg_13bit_max:
153; GFX10:       ; %bb.0:
154; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
155; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
156; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0
157; GFX10-NEXT:    ; implicit-def: $vcc_hi
158; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
159; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
160; GFX10-NEXT:    s_waitcnt vmcnt(0)
161; GFX10-NEXT:    s_setpc_b64 s[30:31]
162  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192
163  %load = load i8, i8 addrspace(1)* %gep, align 4
164  ret i8 %load
165}
166
; Byte offset 4095: the gfx900 checks expect a single immediate (offset:4095);
; the gfx1010 checks expect a 64-bit add of 0x800 plus offset:2047.
167define i8 @global_inst_valu_offset_2x_11bit_max(i8 addrspace(1)* %p) {
168; GFX9-LABEL: global_inst_valu_offset_2x_11bit_max:
169; GFX9:       ; %bb.0:
170; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
171; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
172; GFX9-NEXT:    s_waitcnt vmcnt(0)
173; GFX9-NEXT:    s_setpc_b64 s[30:31]
174;
175; GFX10-LABEL: global_inst_valu_offset_2x_11bit_max:
176; GFX10:       ; %bb.0:
177; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
178; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
179; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
180; GFX10-NEXT:    ; implicit-def: $vcc_hi
181; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
182; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
183; GFX10-NEXT:    s_waitcnt vmcnt(0)
184; GFX10-NEXT:    s_setpc_b64 s[30:31]
185  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
186  %load = load i8, i8 addrspace(1)* %gep, align 4
187  ret i8 %load
188}
189
; Byte offset 8191: the gfx900 checks expect a 64-bit add of 0x1000 plus
; offset:4095; the gfx1010 checks expect an add of 0x1800 plus offset:2047.
190define i8 @global_inst_valu_offset_2x_12bit_max(i8 addrspace(1)* %p) {
191; GFX9-LABEL: global_inst_valu_offset_2x_12bit_max:
192; GFX9:       ; %bb.0:
193; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
194; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
195; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
196; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
197; GFX9-NEXT:    s_waitcnt vmcnt(0)
198; GFX9-NEXT:    s_setpc_b64 s[30:31]
199;
200; GFX10-LABEL: global_inst_valu_offset_2x_12bit_max:
201; GFX10:       ; %bb.0:
202; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
203; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
204; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1800, v0
205; GFX10-NEXT:    ; implicit-def: $vcc_hi
206; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
207; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
208; GFX10-NEXT:    s_waitcnt vmcnt(0)
209; GFX10-NEXT:    s_setpc_b64 s[30:31]
210  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
211  %load = load i8, i8 addrspace(1)* %gep, align 4
212  ret i8 %load
213}
214
; Byte offset 16383: the gfx900 checks expect a 64-bit add of 0x3000 plus
; offset:4095; the gfx1010 checks expect an add of 0x3800 plus offset:2047.
215define i8 @global_inst_valu_offset_2x_13bit_max(i8 addrspace(1)* %p) {
216; GFX9-LABEL: global_inst_valu_offset_2x_13bit_max:
217; GFX9:       ; %bb.0:
218; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
219; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v0
220; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
221; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
222; GFX9-NEXT:    s_waitcnt vmcnt(0)
223; GFX9-NEXT:    s_setpc_b64 s[30:31]
224;
225; GFX10-LABEL: global_inst_valu_offset_2x_13bit_max:
226; GFX10:       ; %bb.0:
227; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
228; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
229; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x3800, v0
230; GFX10-NEXT:    ; implicit-def: $vcc_hi
231; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
232; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
233; GFX10-NEXT:    s_waitcnt vmcnt(0)
234; GFX10-NEXT:    s_setpc_b64 s[30:31]
235  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 16383
236  %load = load i8, i8 addrspace(1)* %gep, align 4
237  ret i8 %load
238}
239
; Byte offset -4096: the gfx900 checks expect a single immediate
; (offset:-4096); the gfx1010 checks expect a 64-bit add (0xfffff000 low,
; -1 high) and a load with no immediate offset.
240define i8 @global_inst_valu_offset_2x_neg_11bit_max(i8 addrspace(1)* %p) {
241; GFX9-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
242; GFX9:       ; %bb.0:
243; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
244; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4096
245; GFX9-NEXT:    s_waitcnt vmcnt(0)
246; GFX9-NEXT:    s_setpc_b64 s[30:31]
247;
248; GFX10-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
249; GFX10:       ; %bb.0:
250; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
251; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
252; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0
253; GFX10-NEXT:    ; implicit-def: $vcc_hi
254; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
255; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
256; GFX10-NEXT:    s_waitcnt vmcnt(0)
257; GFX10-NEXT:    s_setpc_b64 s[30:31]
258  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
259  %load = load i8, i8 addrspace(1)* %gep, align 4
260  ret i8 %load
261}
262
; Byte offset -8192: both the gfx900 and gfx1010 checks expect a 64-bit add
; (0xffffe000 low, -1 high) and a load with no immediate offset.
263define i8 @global_inst_valu_offset_2x_neg_12bit_max(i8 addrspace(1)* %p) {
264; GFX9-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
265; GFX9:       ; %bb.0:
266; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
267; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
268; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
269; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
270; GFX9-NEXT:    s_waitcnt vmcnt(0)
271; GFX9-NEXT:    s_setpc_b64 s[30:31]
272;
273; GFX10-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
274; GFX10:       ; %bb.0:
275; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
276; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
277; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0
278; GFX10-NEXT:    ; implicit-def: $vcc_hi
279; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
280; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
281; GFX10-NEXT:    s_waitcnt vmcnt(0)
282; GFX10-NEXT:    s_setpc_b64 s[30:31]
283  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192
284  %load = load i8, i8 addrspace(1)* %gep, align 4
285  ret i8 %load
286}
287
; Byte offset -16384: both the gfx900 and gfx1010 checks expect a 64-bit add
; (0xffffc000 low, -1 high) and a load with no immediate offset.
288define i8 @global_inst_valu_offset_2x_neg_13bit_max(i8 addrspace(1)* %p) {
289; GFX9-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
290; GFX9:       ; %bb.0:
291; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
292; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffc000, v0
293; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
294; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
295; GFX9-NEXT:    s_waitcnt vmcnt(0)
296; GFX9-NEXT:    s_setpc_b64 s[30:31]
297;
298; GFX10-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
299; GFX10:       ; %bb.0:
300; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
301; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
302; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xffffc000, v0
303; GFX10-NEXT:    ; implicit-def: $vcc_hi
304; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
305; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
306; GFX10-NEXT:    s_waitcnt vmcnt(0)
307; GFX10-NEXT:    s_setpc_b64 s[30:31]
308  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -16384
309  %load = load i8, i8 addrspace(1)* %gep, align 4
310  ret i8 %load
311}
312
313; Fill 11-bit low-bits (1ull << 33) | 2047
; GEP constant is 8589936639. Both the gfx900 and gfx1010 checks expect a
; 64-bit add of 0 (low half) and 2 (high half), with the remaining 2047 in the
; load's immediate offset.
314define i8 @global_inst_valu_offset_64bit_11bit_split0(i8 addrspace(1)* %p) {
315; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_split0:
316; GFX9:       ; %bb.0:
317; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
318; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
319; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
320; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
321; GFX9-NEXT:    s_waitcnt vmcnt(0)
322; GFX9-NEXT:    s_setpc_b64 s[30:31]
323;
324; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_split0:
325; GFX10:       ; %bb.0:
326; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
327; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
328; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0, v0
329; GFX10-NEXT:    ; implicit-def: $vcc_hi
330; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
331; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
332; GFX10-NEXT:    s_waitcnt vmcnt(0)
333; GFX10-NEXT:    s_setpc_b64 s[30:31]
334  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936639
335  %load = load i8, i8 addrspace(1)* %gep, align 4
336  ret i8 %load
337}
338
339; Fill 11-bit low-bits (1ull << 33) | 2048
; GEP constant is 8589936640. The gfx900 checks expect a 64-bit add of 0 (low)
; and 2 (high) plus offset:2048; the gfx1010 checks expect 0x800 (low) and
; 2 (high) added to the address and a load with no immediate offset.
340define i8 @global_inst_valu_offset_64bit_11bit_split1(i8 addrspace(1)* %p) {
341; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_split1:
342; GFX9:       ; %bb.0:
343; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
344; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
345; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
346; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2048
347; GFX9-NEXT:    s_waitcnt vmcnt(0)
348; GFX9-NEXT:    s_setpc_b64 s[30:31]
349;
350; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_split1:
351; GFX10:       ; %bb.0:
352; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
353; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
354; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
355; GFX10-NEXT:    ; implicit-def: $vcc_hi
356; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
357; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
358; GFX10-NEXT:    s_waitcnt vmcnt(0)
359; GFX10-NEXT:    s_setpc_b64 s[30:31]
360  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936640
361  %load = load i8, i8 addrspace(1)* %gep, align 4
362  ret i8 %load
363}
364
365; Fill 12-bit low-bits (1ull << 33) | 4095
; GEP constant is 8589938687. The gfx900 checks expect a 64-bit add of 0 (low)
; and 2 (high) plus offset:4095; the gfx1010 checks expect 0x800 (low) and
; 2 (high) plus offset:2047.
366define i8 @global_inst_valu_offset_64bit_12bit_split0(i8 addrspace(1)* %p) {
367; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_split0:
368; GFX9:       ; %bb.0:
369; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
370; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
371; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
372; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
373; GFX9-NEXT:    s_waitcnt vmcnt(0)
374; GFX9-NEXT:    s_setpc_b64 s[30:31]
375;
376; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_split0:
377; GFX10:       ; %bb.0:
378; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
379; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
380; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
381; GFX10-NEXT:    ; implicit-def: $vcc_hi
382; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
383; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
384; GFX10-NEXT:    s_waitcnt vmcnt(0)
385; GFX10-NEXT:    s_setpc_b64 s[30:31]
386  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938687
387  %load = load i8, i8 addrspace(1)* %gep, align 4
388  ret i8 %load
389}
390
391; Fill 12-bit low-bits (1ull << 33) | 4096
; GEP constant is 8589938688. Both the gfx900 and gfx1010 checks expect the
; whole offset applied by a 64-bit add (0x1000 low, 2 high) and a load with no
; immediate offset.
392define i8 @global_inst_valu_offset_64bit_12bit_split1(i8 addrspace(1)* %p) {
393; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_split1:
394; GFX9:       ; %bb.0:
395; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
396; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
397; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
398; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
399; GFX9-NEXT:    s_waitcnt vmcnt(0)
400; GFX9-NEXT:    s_setpc_b64 s[30:31]
401;
402; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_split1:
403; GFX10:       ; %bb.0:
404; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
405; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
406; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
407; GFX10-NEXT:    ; implicit-def: $vcc_hi
408; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
409; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
410; GFX10-NEXT:    s_waitcnt vmcnt(0)
411; GFX10-NEXT:    s_setpc_b64 s[30:31]
412  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938688
413  %load = load i8, i8 addrspace(1)* %gep, align 4
414  ret i8 %load
415}
416
417; Fill 13-bit low-bits (1ull << 33) | 8191
; GEP constant is 8589942783. The gfx900 checks expect a 64-bit add of 0x1000
; (low) and 2 (high) plus offset:4095; the gfx1010 checks expect 0x1800 (low)
; and 2 (high) plus offset:2047.
418define i8 @global_inst_valu_offset_64bit_13bit_split0(i8 addrspace(1)* %p) {
419; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_split0:
420; GFX9:       ; %bb.0:
421; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
422; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
423; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
424; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
425; GFX9-NEXT:    s_waitcnt vmcnt(0)
426; GFX9-NEXT:    s_setpc_b64 s[30:31]
427;
428; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_split0:
429; GFX10:       ; %bb.0:
430; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
431; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
432; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1800, v0
433; GFX10-NEXT:    ; implicit-def: $vcc_hi
434; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
435; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
436; GFX10-NEXT:    s_waitcnt vmcnt(0)
437; GFX10-NEXT:    s_setpc_b64 s[30:31]
438  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942783
439  %load = load i8, i8 addrspace(1)* %gep, align 4
440  ret i8 %load
441}
442
443; Fill 13-bit low-bits (1ull << 33) | 8192
; GEP constant is 8589942784. Both the gfx900 and gfx1010 checks expect the
; whole offset applied by a 64-bit add (0x2000 low, 2 high) and a load with no
; immediate offset.
444define i8 @global_inst_valu_offset_64bit_13bit_split1(i8 addrspace(1)* %p) {
445; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_split1:
446; GFX9:       ; %bb.0:
447; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
448; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
449; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
450; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
451; GFX9-NEXT:    s_waitcnt vmcnt(0)
452; GFX9-NEXT:    s_setpc_b64 s[30:31]
453;
454; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_split1:
455; GFX10:       ; %bb.0:
456; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
457; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
458; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
459; GFX10-NEXT:    ; implicit-def: $vcc_hi
460; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
461; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
462; GFX10-NEXT:    s_waitcnt vmcnt(0)
463; GFX10-NEXT:    s_setpc_b64 s[30:31]
464  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942784
465  %load = load i8, i8 addrspace(1)* %gep, align 4
466  ret i8 %load
467}
468
469; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047
; GEP constant is -9223372036854773761. Both targets are expected to add 0 to
; the low half and keep 2047 in the immediate offset. The high-half addend is
; v2 (written by v_bfrev_b32 with operand 1) in the gfx900 checks and the
; literal 0x80000000 in the gfx1010 checks.
470define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(i8 addrspace(1)* %p) {
471; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
472; GFX9:       ; %bb.0:
473; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
474; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
475; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
476; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
477; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
478; GFX9-NEXT:    s_waitcnt vmcnt(0)
479; GFX9-NEXT:    s_setpc_b64 s[30:31]
480;
481; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
482; GFX10:       ; %bb.0:
483; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
484; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
485; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0, v0
486; GFX10-NEXT:    ; implicit-def: $vcc_hi
487; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
488; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
489; GFX10-NEXT:    s_waitcnt vmcnt(0)
490; GFX10-NEXT:    s_setpc_b64 s[30:31]
491  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773761
492  %load = load i8, i8 addrspace(1)* %gep, align 4
493  ret i8 %load
494}
495
496; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048
; GEP constant is -9223372036854773760. The gfx900 checks expect 0 added to
; the low half with offset:2048; the gfx1010 checks expect 0x1000 added to the
; low half with offset:-2048. The high-half addend is v2 (from v_bfrev_b32 with
; operand 1) on gfx900 and the literal 0x80000000 on gfx1010.
497define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(i8 addrspace(1)* %p) {
498; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
499; GFX9:       ; %bb.0:
500; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
501; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
502; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
503; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
504; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2048
505; GFX9-NEXT:    s_waitcnt vmcnt(0)
506; GFX9-NEXT:    s_setpc_b64 s[30:31]
507;
508; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
509; GFX10:       ; %bb.0:
510; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
511; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
512; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
513; GFX10-NEXT:    ; implicit-def: $vcc_hi
514; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
515; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
516; GFX10-NEXT:    s_waitcnt vmcnt(0)
517; GFX10-NEXT:    s_setpc_b64 s[30:31]
518  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773760
519  %load = load i8, i8 addrspace(1)* %gep, align 4
520  ret i8 %load
521}
522
523; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095
; GEP constant is -9223372036854771713. The gfx900 checks expect 0 added to
; the low half with offset:4095; the gfx1010 checks expect 0x1000 added to the
; low half with offset:-1. The high-half addend is v2 (from v_bfrev_b32 with
; operand 1) on gfx900 and the literal 0x80000000 on gfx1010.
524define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(i8 addrspace(1)* %p) {
525; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
526; GFX9:       ; %bb.0:
527; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
528; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
529; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
530; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
531; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
532; GFX9-NEXT:    s_waitcnt vmcnt(0)
533; GFX9-NEXT:    s_setpc_b64 s[30:31]
534;
535; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
536; GFX10:       ; %bb.0:
537; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
538; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
539; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
540; GFX10-NEXT:    ; implicit-def: $vcc_hi
541; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
542; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
543; GFX10-NEXT:    s_waitcnt vmcnt(0)
544; GFX10-NEXT:    s_setpc_b64 s[30:31]
545  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771713
546  %load = load i8, i8 addrspace(1)* %gep, align 4
547  ret i8 %load
548}
549
550; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096
; GEP constant is -9223372036854771712. The gfx900 checks expect 0x2000 added
; to the low half with offset:-4096; the gfx1010 checks expect 0x1000 added to
; the low half and no immediate offset. The high-half addend is v2 (from
; v_bfrev_b32 with operand 1) on gfx900 and the literal 0x80000000 on gfx1010.
551define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(i8 addrspace(1)* %p) {
552; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
553; GFX9:       ; %bb.0:
554; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
555; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
556; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
557; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
558; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4096
559; GFX9-NEXT:    s_waitcnt vmcnt(0)
560; GFX9-NEXT:    s_setpc_b64 s[30:31]
561;
562; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
563; GFX10:       ; %bb.0:
564; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
565; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
566; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
567; GFX10-NEXT:    ; implicit-def: $vcc_hi
568; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
569; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
570; GFX10-NEXT:    s_waitcnt vmcnt(0)
571; GFX10-NEXT:    s_setpc_b64 s[30:31]
572  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771712
573  %load = load i8, i8 addrspace(1)* %gep, align 4
574  ret i8 %load
575}
576
577; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191
; GEP constant is -9223372036854767617. Both targets are expected to add
; 0x2000 to the low half and load with offset:-1. The high-half addend is v2
; (from v_bfrev_b32 with operand 1) in the gfx900 checks and the literal
; 0x80000000 in the gfx1010 checks.
578define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(i8 addrspace(1)* %p) {
579; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
580; GFX9:       ; %bb.0:
581; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
582; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
583; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
584; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
585; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
586; GFX9-NEXT:    s_waitcnt vmcnt(0)
587; GFX9-NEXT:    s_setpc_b64 s[30:31]
588;
589; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
590; GFX10:       ; %bb.0:
591; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
592; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
593; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
594; GFX10-NEXT:    ; implicit-def: $vcc_hi
595; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
596; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
597; GFX10-NEXT:    s_waitcnt vmcnt(0)
598; GFX10-NEXT:    s_setpc_b64 s[30:31]
599  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767617
600  %load = load i8, i8 addrspace(1)* %gep, align 4
601  ret i8 %load
602}
603
604; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192
; GEP constant is -9223372036854767616. Both targets are expected to add
; 0x2000 to the low half and load with no immediate offset. The high-half
; addend is v2 (from v_bfrev_b32 with operand 1) in the gfx900 checks and the
; literal 0x80000000 in the gfx1010 checks.
605define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(i8 addrspace(1)* %p) {
606; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
607; GFX9:       ; %bb.0:
608; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
609; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
610; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
611; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
612; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
613; GFX9-NEXT:    s_waitcnt vmcnt(0)
614; GFX9-NEXT:    s_setpc_b64 s[30:31]
615;
616; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
617; GFX10:       ; %bb.0:
618; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
619; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
620; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
621; GFX10-NEXT:    ; implicit-def: $vcc_hi
622; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
623; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
624; GFX10-NEXT:    s_waitcnt vmcnt(0)
625; GFX10-NEXT:    s_setpc_b64 s[30:31]
626  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767616
627  %load = load i8, i8 addrspace(1)* %gep, align 4
628  ret i8 %load
629}
630
; Kernel variant with byte offset 1. In both check sequences the pointer is
; fetched with s_load_dwordx2, copied into v0/v1, and the offset stays in the
; load's immediate field (offset:1). The IR load is volatile and its result is
; stored through an undef pointer.
631define amdgpu_kernel void @global_inst_salu_offset_1(i8 addrspace(1)* %p) {
632; GFX9-LABEL: global_inst_salu_offset_1:
633; GFX9:       ; %bb.0:
634; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
635; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
636; GFX9-NEXT:    v_mov_b32_e32 v0, s0
637; GFX9-NEXT:    v_mov_b32_e32 v1, s1
638; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
639; GFX9-NEXT:    s_waitcnt vmcnt(0)
640; GFX9-NEXT:    global_store_byte v[0:1], v0, off
641; GFX9-NEXT:    s_endpgm
642;
643; GFX10-LABEL: global_inst_salu_offset_1:
644; GFX10:       ; %bb.0:
645; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
646; GFX10-NEXT:    ; implicit-def: $vcc_hi
647; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
648; GFX10-NEXT:    v_mov_b32_e32 v0, s0
649; GFX10-NEXT:    v_mov_b32_e32 v1, s1
650; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
651; GFX10-NEXT:    s_waitcnt vmcnt(0)
652; GFX10-NEXT:    global_store_byte v[0:1], v0, off
653; GFX10-NEXT:    s_endpgm
654  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 1
655  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
656  store i8 %load, i8 addrspace(1)* undef
657  ret void
658}
659
; Kernel variant with byte offset 2047. In both check sequences the pointer is
; fetched with s_load_dwordx2, copied into v0/v1, and the offset stays in the
; load's immediate field (offset:2047).
660define amdgpu_kernel void @global_inst_salu_offset_11bit_max(i8 addrspace(1)* %p) {
661; GFX9-LABEL: global_inst_salu_offset_11bit_max:
662; GFX9:       ; %bb.0:
663; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
664; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
665; GFX9-NEXT:    v_mov_b32_e32 v0, s0
666; GFX9-NEXT:    v_mov_b32_e32 v1, s1
667; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
668; GFX9-NEXT:    s_waitcnt vmcnt(0)
669; GFX9-NEXT:    global_store_byte v[0:1], v0, off
670; GFX9-NEXT:    s_endpgm
671;
672; GFX10-LABEL: global_inst_salu_offset_11bit_max:
673; GFX10:       ; %bb.0:
674; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
675; GFX10-NEXT:    ; implicit-def: $vcc_hi
676; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
677; GFX10-NEXT:    v_mov_b32_e32 v0, s0
678; GFX10-NEXT:    v_mov_b32_e32 v1, s1
679; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
680; GFX10-NEXT:    s_waitcnt vmcnt(0)
681; GFX10-NEXT:    global_store_byte v[0:1], v0, off
682; GFX10-NEXT:    s_endpgm
683  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 2047
684  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
685  store i8 %load, i8 addrspace(1)* undef
686  ret void
687}
688
; Kernel variant with byte offset 4095. The gfx900 checks copy the pointer
; into v0/v1 and use a single immediate (offset:4095). The gfx1010 checks
; instead add 0x800 to s0/s1 directly (carry held in s0), writing v0/v1, and
; use offset:2047.
689define amdgpu_kernel void @global_inst_salu_offset_12bit_max(i8 addrspace(1)* %p) {
690; GFX9-LABEL: global_inst_salu_offset_12bit_max:
691; GFX9:       ; %bb.0:
692; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
693; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
694; GFX9-NEXT:    v_mov_b32_e32 v0, s0
695; GFX9-NEXT:    v_mov_b32_e32 v1, s1
696; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
697; GFX9-NEXT:    s_waitcnt vmcnt(0)
698; GFX9-NEXT:    global_store_byte v[0:1], v0, off
699; GFX9-NEXT:    s_endpgm
700;
701; GFX10-LABEL: global_inst_salu_offset_12bit_max:
702; GFX10:       ; %bb.0:
703; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
704; GFX10-NEXT:    ; implicit-def: $vcc_hi
705; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
706; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0x800, s0
707; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 0, s1, s0
708; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
709; GFX10-NEXT:    s_waitcnt vmcnt(0)
710; GFX10-NEXT:    global_store_byte v[0:1], v0, off
711; GFX10-NEXT:    s_endpgm
712  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
713  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
714  store i8 %load, i8 addrspace(1)* undef
715  ret void
716}
717
; Kernel test: volatile byte load from %p + 8191, result stored to an undef
; pointer (presumably only to give the load a use).
; Expected split: GFX9 adds 0x1000 to the base and uses offset:4095; GFX10
; adds 0x1800 to the base and uses offset:2047.
718define amdgpu_kernel void @global_inst_salu_offset_13bit_max(i8 addrspace(1)* %p) {
719; GFX9-LABEL: global_inst_salu_offset_13bit_max:
720; GFX9:       ; %bb.0:
721; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
722; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
723; GFX9-NEXT:    v_mov_b32_e32 v0, s0
724; GFX9-NEXT:    v_mov_b32_e32 v1, s1
725; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
726; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
727; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
728; GFX9-NEXT:    s_waitcnt vmcnt(0)
729; GFX9-NEXT:    global_store_byte v[0:1], v0, off
730; GFX9-NEXT:    s_endpgm
731;
732; GFX10-LABEL: global_inst_salu_offset_13bit_max:
733; GFX10:       ; %bb.0:
734; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
735; GFX10-NEXT:    ; implicit-def: $vcc_hi
736; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
737; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0x1800, s0
738; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 0, s1, s0
739; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
740; GFX10-NEXT:    s_waitcnt vmcnt(0)
741; GFX10-NEXT:    global_store_byte v[0:1], v0, off
742; GFX10-NEXT:    s_endpgm
743  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
744  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
745  store i8 %load, i8 addrspace(1)* undef
746  ret void
747}
748
; Kernel test: volatile byte load from %p - 2048, result stored to an undef
; pointer (presumably only to give the load a use).
; Expected: both GFX9 and GFX10 keep the base unchanged and use offset:-2048.
749define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(i8 addrspace(1)* %p) {
750; GFX9-LABEL: global_inst_salu_offset_neg_11bit_max:
751; GFX9:       ; %bb.0:
752; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
753; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
754; GFX9-NEXT:    v_mov_b32_e32 v0, s0
755; GFX9-NEXT:    v_mov_b32_e32 v1, s1
756; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
757; GFX9-NEXT:    s_waitcnt vmcnt(0)
758; GFX9-NEXT:    global_store_byte v[0:1], v0, off
759; GFX9-NEXT:    s_endpgm
760;
761; GFX10-LABEL: global_inst_salu_offset_neg_11bit_max:
762; GFX10:       ; %bb.0:
763; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
764; GFX10-NEXT:    ; implicit-def: $vcc_hi
765; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
766; GFX10-NEXT:    v_mov_b32_e32 v0, s0
767; GFX10-NEXT:    v_mov_b32_e32 v1, s1
768; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
769; GFX10-NEXT:    s_waitcnt vmcnt(0)
770; GFX10-NEXT:    global_store_byte v[0:1], v0, off
771; GFX10-NEXT:    s_endpgm
772  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -2048
773  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
774  store i8 %load, i8 addrspace(1)* undef
775  ret void
776}
777
; Kernel test: volatile byte load from %p - 4096, result stored to an undef
; pointer (presumably only to give the load a use).
; Expected split: GFX9 keeps the base unchanged and uses offset:-4096; GFX10
; adds 0xfffff000 (with -1 into the high half) to the base and uses no
; immediate offset.
778define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(i8 addrspace(1)* %p) {
779; GFX9-LABEL: global_inst_salu_offset_neg_12bit_max:
780; GFX9:       ; %bb.0:
781; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
782; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
783; GFX9-NEXT:    v_mov_b32_e32 v0, s0
784; GFX9-NEXT:    v_mov_b32_e32 v1, s1
785; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4096
786; GFX9-NEXT:    s_waitcnt vmcnt(0)
787; GFX9-NEXT:    global_store_byte v[0:1], v0, off
788; GFX9-NEXT:    s_endpgm
789;
790; GFX10-LABEL: global_inst_salu_offset_neg_12bit_max:
791; GFX10:       ; %bb.0:
792; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
793; GFX10-NEXT:    ; implicit-def: $vcc_hi
794; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
795; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0xfffff000, s0
796; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
797; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
798; GFX10-NEXT:    s_waitcnt vmcnt(0)
799; GFX10-NEXT:    global_store_byte v[0:1], v0, off
800; GFX10-NEXT:    s_endpgm
801  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
802  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
803  store i8 %load, i8 addrspace(1)* undef
804  ret void
805}
806
; Kernel test: volatile byte load from %p - 8192, result stored to an undef
; pointer (presumably only to give the load a use).
; Expected: both GFX9 and GFX10 add 0xffffe000 (with -1 into the high half) to
; the base and use no immediate offset.
807define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(i8 addrspace(1)* %p) {
808; GFX9-LABEL: global_inst_salu_offset_neg_13bit_max:
809; GFX9:       ; %bb.0:
810; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
811; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
812; GFX9-NEXT:    v_mov_b32_e32 v0, s0
813; GFX9-NEXT:    v_mov_b32_e32 v1, s1
814; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
815; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
816; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
817; GFX9-NEXT:    s_waitcnt vmcnt(0)
818; GFX9-NEXT:    global_store_byte v[0:1], v0, off
819; GFX9-NEXT:    s_endpgm
820;
821; GFX10-LABEL: global_inst_salu_offset_neg_13bit_max:
822; GFX10:       ; %bb.0:
823; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
824; GFX10-NEXT:    ; implicit-def: $vcc_hi
825; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
826; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0xffffe000, s0
827; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
828; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
829; GFX10-NEXT:    s_waitcnt vmcnt(0)
830; GFX10-NEXT:    global_store_byte v[0:1], v0, off
831; GFX10-NEXT:    s_endpgm
832  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192
833  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
834  store i8 %load, i8 addrspace(1)* undef
835  ret void
836}
837
; Kernel test: volatile byte load from %p + 4095, result stored to an undef
; pointer (presumably only to give the load a use).
; Expected split: GFX9 keeps the base unchanged and uses offset:4095; GFX10
; adds 0x800 to the base and uses offset:2047.
; NOTE(review): uses the same offset (4095) as
; @global_inst_salu_offset_12bit_max - confirm the duplication is intended.
838define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(i8 addrspace(1)* %p) {
839; GFX9-LABEL: global_inst_salu_offset_2x_11bit_max:
840; GFX9:       ; %bb.0:
841; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
842; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
843; GFX9-NEXT:    v_mov_b32_e32 v0, s0
844; GFX9-NEXT:    v_mov_b32_e32 v1, s1
845; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
846; GFX9-NEXT:    s_waitcnt vmcnt(0)
847; GFX9-NEXT:    global_store_byte v[0:1], v0, off
848; GFX9-NEXT:    s_endpgm
849;
850; GFX10-LABEL: global_inst_salu_offset_2x_11bit_max:
851; GFX10:       ; %bb.0:
852; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
853; GFX10-NEXT:    ; implicit-def: $vcc_hi
854; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
855; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0x800, s0
856; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 0, s1, s0
857; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
858; GFX10-NEXT:    s_waitcnt vmcnt(0)
859; GFX10-NEXT:    global_store_byte v[0:1], v0, off
860; GFX10-NEXT:    s_endpgm
861  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
862  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
863  store i8 %load, i8 addrspace(1)* undef
864  ret void
865}
866
; Kernel test: volatile byte load from %p + 8191, result stored to an undef
; pointer (presumably only to give the load a use).
; Expected split: GFX9 adds 0x1000 to the base and uses offset:4095; GFX10
; adds 0x1800 to the base and uses offset:2047.
; NOTE(review): uses the same offset (8191) as
; @global_inst_salu_offset_13bit_max - confirm the duplication is intended.
867define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(i8 addrspace(1)* %p) {
868; GFX9-LABEL: global_inst_salu_offset_2x_12bit_max:
869; GFX9:       ; %bb.0:
870; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
871; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
872; GFX9-NEXT:    v_mov_b32_e32 v0, s0
873; GFX9-NEXT:    v_mov_b32_e32 v1, s1
874; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
875; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
876; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
877; GFX9-NEXT:    s_waitcnt vmcnt(0)
878; GFX9-NEXT:    global_store_byte v[0:1], v0, off
879; GFX9-NEXT:    s_endpgm
880;
881; GFX10-LABEL: global_inst_salu_offset_2x_12bit_max:
882; GFX10:       ; %bb.0:
883; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
884; GFX10-NEXT:    ; implicit-def: $vcc_hi
885; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
886; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0x1800, s0
887; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 0, s1, s0
888; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
889; GFX10-NEXT:    s_waitcnt vmcnt(0)
890; GFX10-NEXT:    global_store_byte v[0:1], v0, off
891; GFX10-NEXT:    s_endpgm
892  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
893  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
894  store i8 %load, i8 addrspace(1)* undef
895  ret void
896}
897
; Kernel test: volatile byte load from %p + 16383, result stored to an undef
; pointer (presumably only to give the load a use).
; Expected split: GFX9 adds 0x3000 to the base and uses offset:4095; GFX10
; adds 0x3800 to the base and uses offset:2047.
898define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(i8 addrspace(1)* %p) {
899; GFX9-LABEL: global_inst_salu_offset_2x_13bit_max:
900; GFX9:       ; %bb.0:
901; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
902; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
903; GFX9-NEXT:    v_mov_b32_e32 v0, s0
904; GFX9-NEXT:    v_mov_b32_e32 v1, s1
905; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v0
906; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
907; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
908; GFX9-NEXT:    s_waitcnt vmcnt(0)
909; GFX9-NEXT:    global_store_byte v[0:1], v0, off
910; GFX9-NEXT:    s_endpgm
911;
912; GFX10-LABEL: global_inst_salu_offset_2x_13bit_max:
913; GFX10:       ; %bb.0:
914; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
915; GFX10-NEXT:    ; implicit-def: $vcc_hi
916; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
917; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0x3800, s0
918; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 0, s1, s0
919; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
920; GFX10-NEXT:    s_waitcnt vmcnt(0)
921; GFX10-NEXT:    global_store_byte v[0:1], v0, off
922; GFX10-NEXT:    s_endpgm
923  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 16383
924  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
925  store i8 %load, i8 addrspace(1)* undef
926  ret void
927}
928
; Kernel test: volatile byte load from %p - 4096, result stored to an undef
; pointer (presumably only to give the load a use).
; Expected split: GFX9 keeps the base unchanged and uses offset:-4096; GFX10
; adds 0xfffff000 (with -1 into the high half) to the base and uses no
; immediate offset.
; NOTE(review): uses the same offset (-4096) as
; @global_inst_salu_offset_neg_12bit_max - confirm the duplication is intended.
929define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(i8 addrspace(1)* %p) {
930; GFX9-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
931; GFX9:       ; %bb.0:
932; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
933; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
934; GFX9-NEXT:    v_mov_b32_e32 v0, s0
935; GFX9-NEXT:    v_mov_b32_e32 v1, s1
936; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4096
937; GFX9-NEXT:    s_waitcnt vmcnt(0)
938; GFX9-NEXT:    global_store_byte v[0:1], v0, off
939; GFX9-NEXT:    s_endpgm
940;
941; GFX10-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
942; GFX10:       ; %bb.0:
943; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
944; GFX10-NEXT:    ; implicit-def: $vcc_hi
945; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
946; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0xfffff000, s0
947; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
948; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
949; GFX10-NEXT:    s_waitcnt vmcnt(0)
950; GFX10-NEXT:    global_store_byte v[0:1], v0, off
951; GFX10-NEXT:    s_endpgm
952  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
953  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
954  store i8 %load, i8 addrspace(1)* undef
955  ret void
956}
957
; Kernel test: volatile byte load from %p - 8192, result stored to an undef
; pointer (presumably only to give the load a use).
; Expected: both GFX9 and GFX10 add 0xffffe000 (with -1 into the high half) to
; the base and use no immediate offset.
; NOTE(review): uses the same offset (-8192) as
; @global_inst_salu_offset_neg_13bit_max - confirm the duplication is intended.
958define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(i8 addrspace(1)* %p) {
959; GFX9-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
960; GFX9:       ; %bb.0:
961; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
962; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
963; GFX9-NEXT:    v_mov_b32_e32 v0, s0
964; GFX9-NEXT:    v_mov_b32_e32 v1, s1
965; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
966; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
967; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
968; GFX9-NEXT:    s_waitcnt vmcnt(0)
969; GFX9-NEXT:    global_store_byte v[0:1], v0, off
970; GFX9-NEXT:    s_endpgm
971;
972; GFX10-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
973; GFX10:       ; %bb.0:
974; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
975; GFX10-NEXT:    ; implicit-def: $vcc_hi
976; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
977; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0xffffe000, s0
978; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
979; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
980; GFX10-NEXT:    s_waitcnt vmcnt(0)
981; GFX10-NEXT:    global_store_byte v[0:1], v0, off
982; GFX10-NEXT:    s_endpgm
983  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192
984  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
985  store i8 %load, i8 addrspace(1)* undef
986  ret void
987}
988
; Kernel test: volatile byte load from %p - 16384, result stored to an undef
; pointer (presumably only to give the load a use).
; Expected: both GFX9 and GFX10 add 0xffffc000 (with -1 into the high half) to
; the base and use no immediate offset.
989define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(i8 addrspace(1)* %p) {
990; GFX9-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
991; GFX9:       ; %bb.0:
992; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
993; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
994; GFX9-NEXT:    v_mov_b32_e32 v0, s0
995; GFX9-NEXT:    v_mov_b32_e32 v1, s1
996; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffc000, v0
997; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
998; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
999; GFX9-NEXT:    s_waitcnt vmcnt(0)
1000; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1001; GFX9-NEXT:    s_endpgm
1002;
1003; GFX10-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
1004; GFX10:       ; %bb.0:
1005; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1006; GFX10-NEXT:    ; implicit-def: $vcc_hi
1007; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1008; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0xffffc000, s0
1009; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
1010; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
1011; GFX10-NEXT:    s_waitcnt vmcnt(0)
1012; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1013; GFX10-NEXT:    s_endpgm
1014  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -16384
1015  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1016  store i8 %load, i8 addrspace(1)* undef
1017  ret void
1018}
1019
1020; Fill 11-bit low-bits (1ull << 33) | 2047
; Kernel test: volatile byte load from %p + 8589936639, result stored to an
; undef pointer (presumably only to give the load a use).
; Expected: both GFX9 and GFX10 add 0 to the low half and 2 to the high half
; of the base, then use offset:2047.
1021define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(i8 addrspace(1)* %p) {
1022; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split0:
1023; GFX9:       ; %bb.0:
1024; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1025; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1026; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1027; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s0
1028; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1029; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
1030; GFX9-NEXT:    s_waitcnt vmcnt(0)
1031; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1032; GFX9-NEXT:    s_endpgm
1033;
1034; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_split0:
1035; GFX10:       ; %bb.0:
1036; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1037; GFX10-NEXT:    ; implicit-def: $vcc_hi
1038; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1039; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0, s0
1040; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
1041; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
1042; GFX10-NEXT:    s_waitcnt vmcnt(0)
1043; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1044; GFX10-NEXT:    s_endpgm
1045  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936639
1046  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1047  store i8 %load, i8 addrspace(1)* undef
1048  ret void
1049}
1050
1051; One past the 11-bit low-bits maximum: (1ull << 33) | 2048
; Kernel test: volatile byte load from %p + 8589936640, result stored to an
; undef pointer (presumably only to give the load a use).
; Expected split: GFX9 adds 0 to the low half and 2 to the high half of the
; base and uses offset:2048; GFX10 adds 0x800 to the low half and 2 to the
; high half and uses no immediate offset.
1052define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(i8 addrspace(1)* %p) {
1053; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split1:
1054; GFX9:       ; %bb.0:
1055; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1056; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1057; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1058; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s0
1059; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1060; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2048
1061; GFX9-NEXT:    s_waitcnt vmcnt(0)
1062; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1063; GFX9-NEXT:    s_endpgm
1064;
1065; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_split1:
1066; GFX10:       ; %bb.0:
1067; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1068; GFX10-NEXT:    ; implicit-def: $vcc_hi
1069; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1070; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0x800, s0
1071; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
1072; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
1073; GFX10-NEXT:    s_waitcnt vmcnt(0)
1074; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1075; GFX10-NEXT:    s_endpgm
1076  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936640
1077  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1078  store i8 %load, i8 addrspace(1)* undef
1079  ret void
1080}
1081
1082; Fill 12-bit low-bits (1ull << 33) | 4095
; Kernel test: volatile byte load from %p + 8589938687, result stored to an
; undef pointer (presumably only to give the load a use).
; Expected split: GFX9 adds 0 to the low half and 2 to the high half of the
; base and uses offset:4095; GFX10 adds 0x800 to the low half and 2 to the
; high half and uses offset:2047.
1083define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(i8 addrspace(1)* %p) {
1084; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split0:
1085; GFX9:       ; %bb.0:
1086; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1087; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1088; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1089; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s0
1090; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1091; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
1092; GFX9-NEXT:    s_waitcnt vmcnt(0)
1093; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1094; GFX9-NEXT:    s_endpgm
1095;
1096; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_split0:
1097; GFX10:       ; %bb.0:
1098; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1099; GFX10-NEXT:    ; implicit-def: $vcc_hi
1100; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1101; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0x800, s0
1102; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
1103; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
1104; GFX10-NEXT:    s_waitcnt vmcnt(0)
1105; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1106; GFX10-NEXT:    s_endpgm
1107  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938687
1108  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1109  store i8 %load, i8 addrspace(1)* undef
1110  ret void
1111}
1112
1113; One past the 12-bit low-bits maximum: (1ull << 33) | 4096
; Kernel test: volatile byte load from %p + 8589938688, result stored to an
; undef pointer (presumably only to give the load a use).
; Expected: both GFX9 and GFX10 add 0x1000 to the low half and 2 to the high
; half of the base and use no immediate offset.
1114define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(i8 addrspace(1)* %p) {
1115; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split1:
1116; GFX9:       ; %bb.0:
1117; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1118; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1119; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1120; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1121; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
1122; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1123; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
1124; GFX9-NEXT:    s_waitcnt vmcnt(0)
1125; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1126; GFX9-NEXT:    s_endpgm
1127;
1128; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_split1:
1129; GFX10:       ; %bb.0:
1130; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1131; GFX10-NEXT:    ; implicit-def: $vcc_hi
1132; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1133; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0x1000, s0
1134; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
1135; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
1136; GFX10-NEXT:    s_waitcnt vmcnt(0)
1137; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1138; GFX10-NEXT:    s_endpgm
1139  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938688
1140  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1141  store i8 %load, i8 addrspace(1)* undef
1142  ret void
1143}
1144
1145; Fill 13-bit low-bits (1ull << 33) | 8191
; Kernel test: volatile byte load from %p + 8589942783, result stored to an
; undef pointer (presumably only to give the load a use).
; Expected split: GFX9 adds 0x1000 to the low half and 2 to the high half of
; the base and uses offset:4095; GFX10 adds 0x1800 to the low half and 2 to
; the high half and uses offset:2047.
1146define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(i8 addrspace(1)* %p) {
1147; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split0:
1148; GFX9:       ; %bb.0:
1149; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1150; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1151; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1152; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1153; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
1154; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1155; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
1156; GFX9-NEXT:    s_waitcnt vmcnt(0)
1157; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1158; GFX9-NEXT:    s_endpgm
1159;
1160; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_split0:
1161; GFX10:       ; %bb.0:
1162; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1163; GFX10-NEXT:    ; implicit-def: $vcc_hi
1164; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1165; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0x1800, s0
1166; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
1167; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
1168; GFX10-NEXT:    s_waitcnt vmcnt(0)
1169; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1170; GFX10-NEXT:    s_endpgm
1171  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942783
1172  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1173  store i8 %load, i8 addrspace(1)* undef
1174  ret void
1175}
1176
1177; One past the 13-bit low-bits maximum: (1ull << 33) | 8192
; Kernel test: volatile byte load from %p + 8589942784, result stored to an
; undef pointer (presumably only to give the load a use).
; Expected: both GFX9 and GFX10 add 0x2000 to the low half and 2 to the high
; half of the base and use no immediate offset.
1178define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(i8 addrspace(1)* %p) {
1179; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split1:
1180; GFX9:       ; %bb.0:
1181; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1182; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1183; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1184; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1185; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
1186; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1187; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
1188; GFX9-NEXT:    s_waitcnt vmcnt(0)
1189; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1190; GFX9-NEXT:    s_endpgm
1191;
1192; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_split1:
1193; GFX10:       ; %bb.0:
1194; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1195; GFX10-NEXT:    ; implicit-def: $vcc_hi
1196; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1197; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0x2000, s0
1198; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
1199; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
1200; GFX10-NEXT:    s_waitcnt vmcnt(0)
1201; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1202; GFX10-NEXT:    s_endpgm
1203  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942784
1204  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1205  store i8 %load, i8 addrspace(1)* undef
1206  ret void
1207}
1208
1209; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047
; Kernel test: volatile byte load from %p - 9223372036854773761, result stored
; to an undef pointer (presumably only to give the load a use).
; Expected: both targets add 0 to the low half and 0x80000000 to the high half
; of the base, then use offset:2047. GFX9 takes the high-half addend from
; v_bfrev_b32 v1, 1 (bit-reversed 1); GFX10 uses the literal 0x80000000.
1210define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(i8 addrspace(1)* %p) {
1211; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
1212; GFX9:       ; %bb.0:
1213; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1214; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1215; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1216; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1217; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s0
1218; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1219; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
1220; GFX9-NEXT:    s_waitcnt vmcnt(0)
1221; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1222; GFX9-NEXT:    s_endpgm
1223;
1224; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
1225; GFX10:       ; %bb.0:
1226; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1227; GFX10-NEXT:    ; implicit-def: $vcc_hi
1228; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1229; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1230; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0, s0
1231; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
1232; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
1233; GFX10-NEXT:    s_waitcnt vmcnt(0)
1234; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1235; GFX10-NEXT:    s_endpgm
1236  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773761
1237  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1238  store i8 %load, i8 addrspace(1)* undef
1239  ret void
1240}
1241
1242; One past the 11-bit low-bits maximum, negative high bits: (1ull << 63) | 2048
; Kernel test: volatile byte load from %p - 9223372036854773760, result stored
; to an undef pointer (presumably only to give the load a use).
; Expected split: GFX9 adds 0 to the low half and uses offset:2048; GFX10 adds
; 0x1000 to the low half and uses offset:-2048. Both add 0x80000000 to the
; high half (GFX9 via v_bfrev_b32 v1, 1; GFX10 as a literal).
1243define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(i8 addrspace(1)* %p) {
1244; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
1245; GFX9:       ; %bb.0:
1246; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1247; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1248; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1249; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1250; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s0
1251; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1252; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2048
1253; GFX9-NEXT:    s_waitcnt vmcnt(0)
1254; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1255; GFX9-NEXT:    s_endpgm
1256;
1257; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
1258; GFX10:       ; %bb.0:
1259; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1260; GFX10-NEXT:    ; implicit-def: $vcc_hi
1261; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1262; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1263; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1000, s0
1264; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
1265; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
1266; GFX10-NEXT:    s_waitcnt vmcnt(0)
1267; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1268; GFX10-NEXT:    s_endpgm
1269  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773760
1270  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1271  store i8 %load, i8 addrspace(1)* undef
1272  ret void
1273}
1274
1275; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095
; Kernel test: volatile byte load from %p - 9223372036854771713, result stored
; to an undef pointer (presumably only to give the load a use).
; Expected split: GFX9 adds 0 to the low half and uses offset:4095; GFX10 adds
; 0x1000 to the low half and uses offset:-1. Both add 0x80000000 to the high
; half (GFX9 via v_bfrev_b32 v1, 1; GFX10 as a literal).
1276define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(i8 addrspace(1)* %p) {
1277; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
1278; GFX9:       ; %bb.0:
1279; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1280; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1281; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1282; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1283; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s0
1284; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1285; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
1286; GFX9-NEXT:    s_waitcnt vmcnt(0)
1287; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1288; GFX9-NEXT:    s_endpgm
1289;
1290; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
1291; GFX10:       ; %bb.0:
1292; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1293; GFX10-NEXT:    ; implicit-def: $vcc_hi
1294; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1295; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1296; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1000, s0
1297; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
1298; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
1299; GFX10-NEXT:    s_waitcnt vmcnt(0)
1300; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1301; GFX10-NEXT:    s_endpgm
1302  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771713
1303  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1304  store i8 %load, i8 addrspace(1)* undef
1305  ret void
1306}
1307
1308; One past the 12-bit low-bits maximum, negative high bits: (1ull << 63) | 4096
; Kernel test: volatile byte load from %p - 9223372036854771712, result stored
; to an undef pointer (presumably only to give the load a use).
; Expected split: GFX9 adds 0x2000 to the low half and uses offset:-4096;
; GFX10 adds 0x1000 to the low half and uses no immediate offset. Both add
; 0x80000000 to the high half (GFX9 via v_bfrev_b32 v1, 1; GFX10 as a literal).
1309define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(i8 addrspace(1)* %p) {
1310; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
1311; GFX9:       ; %bb.0:
1312; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1313; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1314; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1315; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1316; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1317; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
1318; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1319; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4096
1320; GFX9-NEXT:    s_waitcnt vmcnt(0)
1321; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1322; GFX9-NEXT:    s_endpgm
1323;
1324; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
1325; GFX10:       ; %bb.0:
1326; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1327; GFX10-NEXT:    ; implicit-def: $vcc_hi
1328; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1329; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1330; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1000, s0
1331; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
1332; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
1333; GFX10-NEXT:    s_waitcnt vmcnt(0)
1334; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1335; GFX10-NEXT:    s_endpgm
1336  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771712
1337  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1338  store i8 %load, i8 addrspace(1)* undef
1339  ret void
1340}
1341
1342; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191
; Kernel test: volatile byte load from %p - 9223372036854767617, result stored
; to an undef pointer (presumably only to give the load a use).
; Expected: both targets add 0x2000 to the low half and 0x80000000 to the high
; half of the base, then use offset:-1. GFX9 takes the high-half addend from
; v_bfrev_b32 v1, 1 (bit-reversed 1); GFX10 uses the literal 0x80000000.
1343define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(i8 addrspace(1)* %p) {
1344; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
1345; GFX9:       ; %bb.0:
1346; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1347; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1348; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1349; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1350; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1351; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
1352; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1353; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
1354; GFX9-NEXT:    s_waitcnt vmcnt(0)
1355; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1356; GFX9-NEXT:    s_endpgm
1357;
1358; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
1359; GFX10:       ; %bb.0:
1360; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1361; GFX10-NEXT:    ; implicit-def: $vcc_hi
1362; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1363; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1364; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x2000, s0
1365; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
1366; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
1367; GFX10-NEXT:    s_waitcnt vmcnt(0)
1368; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1369; GFX10-NEXT:    s_endpgm
1370  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767617
1371  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1372  store i8 %load, i8 addrspace(1)* undef
1373  ret void
1374}
1375
1376; One past the 13-bit low-bits maximum, negative high bits: (1ull << 63) | 8192
; Kernel test: volatile byte load from %p - 9223372036854767616, result stored
; to an undef pointer (presumably only to give the load a use).
; Expected: both targets add 0x2000 to the low half and 0x80000000 to the high
; half of the base and use no immediate offset. GFX9 takes the high-half addend
; from v_bfrev_b32 v1, 1 (bit-reversed 1); GFX10 uses the literal 0x80000000.
1377define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(i8 addrspace(1)* %p) {
1378; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
1379; GFX9:       ; %bb.0:
1380; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1381; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1382; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1383; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1384; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1385; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
1386; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1387; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
1388; GFX9-NEXT:    s_waitcnt vmcnt(0)
1389; GFX9-NEXT:    global_store_byte v[0:1], v0, off
1390; GFX9-NEXT:    s_endpgm
1391;
1392; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
1393; GFX10:       ; %bb.0:
1394; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1395; GFX10-NEXT:    ; implicit-def: $vcc_hi
1396; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1397; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1398; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x2000, s0
1399; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
1400; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
1401; GFX10-NEXT:    s_waitcnt vmcnt(0)
1402; GFX10-NEXT:    global_store_byte v[0:1], v0, off
1403; GFX10-NEXT:    s_endpgm
1404  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767616
1405  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
1406  store i8 %load, i8 addrspace(1)* undef
1407  ret void
1408}
1409