1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
3; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
4
5; Test splitting flat instruction offsets into the low and high bits
6; when the offset doesn't fit in the offset field.
7
; Loads one byte from %p + 1 in a non-kernel function.
; The gfx900 checks expect the 1 to be carried as the immediate offset of
; flat_load_ubyte. The gfx1010 checks expect a 64-bit VALU add of 1 to the
; pointer followed by a load with no immediate offset.
8define i8 @flat_inst_valu_offset_1(i8* %p) {
9; GFX9-LABEL: flat_inst_valu_offset_1:
10; GFX9:       ; %bb.0:
11; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:1
13; GFX9-NEXT:    s_setpc_b64 s[30:31]
14;
15; GFX10-LABEL: flat_inst_valu_offset_1:
16; GFX10:       ; %bb.0:
17; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
19; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, v0, 1
20; GFX10-NEXT:    ; implicit-def: $vcc_hi
21; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
22; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
23; GFX10-NEXT:    s_setpc_b64 s[30:31]
24  %gep = getelementptr i8, i8* %p, i64 1
25  %load = load i8, i8* %gep, align 4
26  ret i8 %load
27}
28
; Loads one byte from %p + 2047 (0x7ff).
; The gfx900 checks expect 2047 as the load's immediate offset. The gfx1010
; checks expect 0x7ff to be added to the pointer with a carry into the high
; half, and a load with no immediate offset.
29define i8 @flat_inst_valu_offset_11bit_max(i8* %p) {
30; GFX9-LABEL: flat_inst_valu_offset_11bit_max:
31; GFX9:       ; %bb.0:
32; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:2047
34; GFX9-NEXT:    s_setpc_b64 s[30:31]
35;
36; GFX10-LABEL: flat_inst_valu_offset_11bit_max:
37; GFX10:       ; %bb.0:
38; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
39; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
40; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x7ff, v0
41; GFX10-NEXT:    ; implicit-def: $vcc_hi
42; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
43; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
44; GFX10-NEXT:    s_setpc_b64 s[30:31]
45  %gep = getelementptr i8, i8* %p, i64 2047
46  %load = load i8, i8* %gep, align 4
47  ret i8 %load
48}
49
; Loads one byte from %p + 4095 (0xfff).
; The gfx900 checks expect 4095 as the load's immediate offset. The gfx1010
; checks expect 0xfff to be added to the pointer and a load with no immediate
; offset.
50define i8 @flat_inst_valu_offset_12bit_max(i8* %p) {
51; GFX9-LABEL: flat_inst_valu_offset_12bit_max:
52; GFX9:       ; %bb.0:
53; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
54; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
55; GFX9-NEXT:    s_setpc_b64 s[30:31]
56;
57; GFX10-LABEL: flat_inst_valu_offset_12bit_max:
58; GFX10:       ; %bb.0:
59; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
60; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
61; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0
62; GFX10-NEXT:    ; implicit-def: $vcc_hi
63; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
64; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
65; GFX10-NEXT:    s_setpc_b64 s[30:31]
66  %gep = getelementptr i8, i8* %p, i64 4095
67  %load = load i8, i8* %gep, align 4
68  ret i8 %load
69}
70
; Loads one byte from %p + 8191 (0x1fff).
; The gfx900 checks expect the offset to be split, with 0x1000 added to the
; pointer and the remaining 4095 used as the load's immediate offset. The
; gfx1010 checks expect the whole 0x1fff to be added to the pointer.
71define i8 @flat_inst_valu_offset_13bit_max(i8* %p) {
72; GFX9-LABEL: flat_inst_valu_offset_13bit_max:
73; GFX9:       ; %bb.0:
74; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
75; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
76; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
77; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
78; GFX9-NEXT:    s_setpc_b64 s[30:31]
79;
80; GFX10-LABEL: flat_inst_valu_offset_13bit_max:
81; GFX10:       ; %bb.0:
82; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
84; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0
85; GFX10-NEXT:    ; implicit-def: $vcc_hi
86; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
87; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
88; GFX10-NEXT:    s_setpc_b64 s[30:31]
89  %gep = getelementptr i8, i8* %p, i64 8191
90  %load = load i8, i8* %gep, align 4
91  ret i8 %load
92}
93
; Loads one byte from %p - 2048.
; Both runs are expected to add 0xfffff800 to the low half of the pointer and
; -1 (plus carry) to the high half, then load with no immediate offset.
94define i8 @flat_inst_valu_offset_neg_11bit_max(i8* %p) {
95; GFX9-LABEL: flat_inst_valu_offset_neg_11bit_max:
96; GFX9:       ; %bb.0:
97; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
98; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
99; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
100; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
101; GFX9-NEXT:    s_setpc_b64 s[30:31]
102;
103; GFX10-LABEL: flat_inst_valu_offset_neg_11bit_max:
104; GFX10:       ; %bb.0:
105; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
106; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
107; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfffff800, v0
108; GFX10-NEXT:    ; implicit-def: $vcc_hi
109; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
110; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
111; GFX10-NEXT:    s_setpc_b64 s[30:31]
112  %gep = getelementptr i8, i8* %p, i64 -2048
113  %load = load i8, i8* %gep, align 4
114  ret i8 %load
115}
116
; Loads one byte from %p - 4096.
; Both runs are expected to add 0xfffff000 to the low half of the pointer and
; -1 (plus carry) to the high half, then load with no immediate offset.
117define i8 @flat_inst_valu_offset_neg_12bit_max(i8* %p) {
118; GFX9-LABEL: flat_inst_valu_offset_neg_12bit_max:
119; GFX9:       ; %bb.0:
120; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
121; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
122; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
123; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
124; GFX9-NEXT:    s_setpc_b64 s[30:31]
125;
126; GFX10-LABEL: flat_inst_valu_offset_neg_12bit_max:
127; GFX10:       ; %bb.0:
128; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
129; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
130; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0
131; GFX10-NEXT:    ; implicit-def: $vcc_hi
132; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
133; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
134; GFX10-NEXT:    s_setpc_b64 s[30:31]
135  %gep = getelementptr i8, i8* %p, i64 -4096
136  %load = load i8, i8* %gep, align 4
137  ret i8 %load
138}
139
; Loads one byte from %p - 8192.
; Both runs are expected to add 0xffffe000 to the low half of the pointer and
; -1 (plus carry) to the high half, then load with no immediate offset.
140define i8 @flat_inst_valu_offset_neg_13bit_max(i8* %p) {
141; GFX9-LABEL: flat_inst_valu_offset_neg_13bit_max:
142; GFX9:       ; %bb.0:
143; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
144; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
145; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
146; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
147; GFX9-NEXT:    s_setpc_b64 s[30:31]
148;
149; GFX10-LABEL: flat_inst_valu_offset_neg_13bit_max:
150; GFX10:       ; %bb.0:
151; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
152; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
153; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0
154; GFX10-NEXT:    ; implicit-def: $vcc_hi
155; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
156; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
157; GFX10-NEXT:    s_setpc_b64 s[30:31]
158  %gep = getelementptr i8, i8* %p, i64 -8192
159  %load = load i8, i8* %gep, align 4
160  ret i8 %load
161}
162
; Loads one byte from %p + 4095 (0xfff).
; The gfx900 checks expect 4095 as the load's immediate offset. The gfx1010
; checks expect 0xfff to be added to the pointer.
; NOTE(review) this uses the same offset and expects the same code as
; flat_inst_valu_offset_12bit_max - confirm the duplication is intended.
163define i8 @flat_inst_valu_offset_2x_11bit_max(i8* %p) {
164; GFX9-LABEL: flat_inst_valu_offset_2x_11bit_max:
165; GFX9:       ; %bb.0:
166; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
167; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
168; GFX9-NEXT:    s_setpc_b64 s[30:31]
169;
170; GFX10-LABEL: flat_inst_valu_offset_2x_11bit_max:
171; GFX10:       ; %bb.0:
172; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
173; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
174; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0
175; GFX10-NEXT:    ; implicit-def: $vcc_hi
176; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
177; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
178; GFX10-NEXT:    s_setpc_b64 s[30:31]
179  %gep = getelementptr i8, i8* %p, i64 4095
180  %load = load i8, i8* %gep, align 4
181  ret i8 %load
182}
183
; Loads one byte from %p + 8191 (0x1fff).
; The gfx900 checks expect 0x1000 added to the pointer and 4095 left as the
; load's immediate offset. The gfx1010 checks expect the whole 0x1fff to be
; added to the pointer.
; NOTE(review) this uses the same offset and expects the same code as
; flat_inst_valu_offset_13bit_max - confirm the duplication is intended.
184define i8 @flat_inst_valu_offset_2x_12bit_max(i8* %p) {
185; GFX9-LABEL: flat_inst_valu_offset_2x_12bit_max:
186; GFX9:       ; %bb.0:
187; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
188; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
189; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
190; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
191; GFX9-NEXT:    s_setpc_b64 s[30:31]
192;
193; GFX10-LABEL: flat_inst_valu_offset_2x_12bit_max:
194; GFX10:       ; %bb.0:
195; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
196; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
197; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0
198; GFX10-NEXT:    ; implicit-def: $vcc_hi
199; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
200; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
201; GFX10-NEXT:    s_setpc_b64 s[30:31]
202  %gep = getelementptr i8, i8* %p, i64 8191
203  %load = load i8, i8* %gep, align 4
204  ret i8 %load
205}
206
; Loads one byte from %p + 16383 (0x3fff).
; The gfx900 checks expect 0x3000 added to the pointer and 4095 left as the
; load's immediate offset. The gfx1010 checks expect the whole 0x3fff to be
; added to the pointer.
207define i8 @flat_inst_valu_offset_2x_13bit_max(i8* %p) {
208; GFX9-LABEL: flat_inst_valu_offset_2x_13bit_max:
209; GFX9:       ; %bb.0:
210; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
211; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v0
212; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
213; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
214; GFX9-NEXT:    s_setpc_b64 s[30:31]
215;
216; GFX10-LABEL: flat_inst_valu_offset_2x_13bit_max:
217; GFX10:       ; %bb.0:
218; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
219; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
220; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x3fff, v0
221; GFX10-NEXT:    ; implicit-def: $vcc_hi
222; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
223; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
224; GFX10-NEXT:    s_setpc_b64 s[30:31]
225  %gep = getelementptr i8, i8* %p, i64 16383
226  %load = load i8, i8* %gep, align 4
227  ret i8 %load
228}
229
; Loads one byte from %p - 4096.
; Both runs are expected to add 0xfffff000 to the low half of the pointer and
; -1 (plus carry) to the high half, then load with no immediate offset.
; NOTE(review) this uses the same offset and expects the same code as
; flat_inst_valu_offset_neg_12bit_max - confirm the duplication is intended.
230define i8 @flat_inst_valu_offset_2x_neg_11bit_max(i8* %p) {
231; GFX9-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
232; GFX9:       ; %bb.0:
233; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
234; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
235; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
236; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
237; GFX9-NEXT:    s_setpc_b64 s[30:31]
238;
239; GFX10-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
240; GFX10:       ; %bb.0:
241; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
242; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
243; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0
244; GFX10-NEXT:    ; implicit-def: $vcc_hi
245; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
246; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
247; GFX10-NEXT:    s_setpc_b64 s[30:31]
248  %gep = getelementptr i8, i8* %p, i64 -4096
249  %load = load i8, i8* %gep, align 4
250  ret i8 %load
251}
252
; Loads one byte from %p - 8192.
; Both runs are expected to add 0xffffe000 to the low half of the pointer and
; -1 (plus carry) to the high half, then load with no immediate offset.
; NOTE(review) this uses the same offset and expects the same code as
; flat_inst_valu_offset_neg_13bit_max - confirm the duplication is intended.
253define i8 @flat_inst_valu_offset_2x_neg_12bit_max(i8* %p) {
254; GFX9-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
255; GFX9:       ; %bb.0:
256; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
257; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
258; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
259; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
260; GFX9-NEXT:    s_setpc_b64 s[30:31]
261;
262; GFX10-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
263; GFX10:       ; %bb.0:
264; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
265; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
266; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0
267; GFX10-NEXT:    ; implicit-def: $vcc_hi
268; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
269; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
270; GFX10-NEXT:    s_setpc_b64 s[30:31]
271  %gep = getelementptr i8, i8* %p, i64 -8192
272  %load = load i8, i8* %gep, align 4
273  ret i8 %load
274}
275
; Loads one byte from %p - 16384.
; Both runs are expected to add 0xffffc000 to the low half of the pointer and
; -1 (plus carry) to the high half, then load with no immediate offset.
276define i8 @flat_inst_valu_offset_2x_neg_13bit_max(i8* %p) {
277; GFX9-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
278; GFX9:       ; %bb.0:
279; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
280; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffc000, v0
281; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
282; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
283; GFX9-NEXT:    s_setpc_b64 s[30:31]
284;
285; GFX10-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
286; GFX10:       ; %bb.0:
287; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
288; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
289; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xffffc000, v0
290; GFX10-NEXT:    ; implicit-def: $vcc_hi
291; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
292; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
293; GFX10-NEXT:    s_setpc_b64 s[30:31]
294  %gep = getelementptr i8, i8* %p, i64 -16384
295  %load = load i8, i8* %gep, align 4
296  ret i8 %load
297}
298
299; Fill 11-bit low-bits (1ull << 33) | 2047
; Loads one byte from %p + 8589936639, an offset with bit 33 set and 2047 in
; the low bits.
; The gfx900 checks expect 0 added to the low half and 2 to the high half of
; the pointer, with 2047 kept as the load's immediate offset. The gfx1010
; checks expect 0x7ff added to the low half, 2 to the high half, and a load
; with no immediate offset.
300define i8 @flat_inst_valu_offset_64bit_11bit_split0(i8* %p) {
301; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
302; GFX9:       ; %bb.0:
303; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
304; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
305; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
306; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:2047
307; GFX9-NEXT:    s_setpc_b64 s[30:31]
308;
309; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
310; GFX10:       ; %bb.0:
311; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
312; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
313; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x7ff, v0
314; GFX10-NEXT:    ; implicit-def: $vcc_hi
315; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
316; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
317; GFX10-NEXT:    s_setpc_b64 s[30:31]
318  %gep = getelementptr i8, i8* %p, i64 8589936639
319  %load = load i8, i8* %gep, align 4
320  ret i8 %load
321}
322
323; Fill 11-bit low-bits (1ull << 33) | 2048
; Loads one byte from %p + 8589936640, an offset with bit 33 set and 2048 in
; the low bits.
; The gfx900 checks expect 0 added to the low half and 2 to the high half of
; the pointer, with 2048 kept as the load's immediate offset. The gfx1010
; checks expect 0x800 added to the low half, 2 to the high half, and a load
; with no immediate offset.
324define i8 @flat_inst_valu_offset_64bit_11bit_split1(i8* %p) {
325; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
326; GFX9:       ; %bb.0:
327; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
328; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
329; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
330; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:2048
331; GFX9-NEXT:    s_setpc_b64 s[30:31]
332;
333; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
334; GFX10:       ; %bb.0:
335; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
336; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
337; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
338; GFX10-NEXT:    ; implicit-def: $vcc_hi
339; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
340; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
341; GFX10-NEXT:    s_setpc_b64 s[30:31]
342  %gep = getelementptr i8, i8* %p, i64 8589936640
343  %load = load i8, i8* %gep, align 4
344  ret i8 %load
345}
346
347; Fill 12-bit low-bits (1ull << 33) | 4095
; Loads one byte from %p + 8589938687, an offset with bit 33 set and 4095 in
; the low bits.
; The gfx900 checks expect 0 added to the low half and 2 to the high half of
; the pointer, with 4095 kept as the load's immediate offset. The gfx1010
; checks expect 0xfff added to the low half, 2 to the high half, and a load
; with no immediate offset.
348define i8 @flat_inst_valu_offset_64bit_12bit_split0(i8* %p) {
349; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
350; GFX9:       ; %bb.0:
351; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
352; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
353; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
354; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
355; GFX9-NEXT:    s_setpc_b64 s[30:31]
356;
357; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
358; GFX10:       ; %bb.0:
359; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
360; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
361; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0
362; GFX10-NEXT:    ; implicit-def: $vcc_hi
363; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
364; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
365; GFX10-NEXT:    s_setpc_b64 s[30:31]
366  %gep = getelementptr i8, i8* %p, i64 8589938687
367  %load = load i8, i8* %gep, align 4
368  ret i8 %load
369}
370
371; Fill 12-bit low-bits (1ull << 33) | 4096
; Loads one byte from %p + 8589938688, an offset with bit 33 set and 4096 in
; the low bits.
; Both runs are expected to add 0x1000 to the low half and 2 to the high half
; of the pointer, then load with no immediate offset.
372define i8 @flat_inst_valu_offset_64bit_12bit_split1(i8* %p) {
373; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
374; GFX9:       ; %bb.0:
375; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
376; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
377; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
378; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
379; GFX9-NEXT:    s_setpc_b64 s[30:31]
380;
381; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
382; GFX10:       ; %bb.0:
383; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
384; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
385; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
386; GFX10-NEXT:    ; implicit-def: $vcc_hi
387; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
388; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
389; GFX10-NEXT:    s_setpc_b64 s[30:31]
390  %gep = getelementptr i8, i8* %p, i64 8589938688
391  %load = load i8, i8* %gep, align 4
392  ret i8 %load
393}
394
395; Fill 13-bit low-bits (1ull << 33) | 8191
; Loads one byte from %p + 8589942783, an offset with bit 33 set and 8191 in
; the low bits.
; The gfx900 checks expect 0x1000 added to the low half and 2 to the high half
; of the pointer, with 4095 kept as the load's immediate offset. The gfx1010
; checks expect 0x1fff added to the low half, 2 to the high half, and a load
; with no immediate offset.
396define i8 @flat_inst_valu_offset_64bit_13bit_split0(i8* %p) {
397; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
398; GFX9:       ; %bb.0:
399; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
400; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
401; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
402; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
403; GFX9-NEXT:    s_setpc_b64 s[30:31]
404;
405; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
406; GFX10:       ; %bb.0:
407; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
408; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
409; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0
410; GFX10-NEXT:    ; implicit-def: $vcc_hi
411; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
412; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
413; GFX10-NEXT:    s_setpc_b64 s[30:31]
414  %gep = getelementptr i8, i8* %p, i64 8589942783
415  %load = load i8, i8* %gep, align 4
416  ret i8 %load
417}
418
419; Fill 13-bit low-bits (1ull << 33) | 8192
; Loads one byte from %p + 8589942784, an offset with bit 33 set and 8192 in
; the low bits.
; Both runs are expected to add 0x2000 to the low half and 2 to the high half
; of the pointer, then load with no immediate offset.
420define i8 @flat_inst_valu_offset_64bit_13bit_split1(i8* %p) {
421; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
422; GFX9:       ; %bb.0:
423; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
424; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
425; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
426; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
427; GFX9-NEXT:    s_setpc_b64 s[30:31]
428;
429; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
430; GFX10:       ; %bb.0:
431; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
432; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
433; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
434; GFX10-NEXT:    ; implicit-def: $vcc_hi
435; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
436; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
437; GFX10-NEXT:    s_setpc_b64 s[30:31]
438  %gep = getelementptr i8, i8* %p, i64 8589942784
439  %load = load i8, i8* %gep, align 4
440  ret i8 %load
441}
442
443; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047
; Loads one byte from %p plus an offset with bit 63 set and 2047 in the low
; bits (written as a negative i64 in the IR).
; Both runs are expected to add 0x7ff to the low half of the pointer and to
; load with no immediate offset. For the high half the gfx900 checks add v2,
; which is set by v_bfrev_b32 v2, 1 (presumably 0x80000000, matching the
; literal the gfx1010 checks add directly).
444define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(i8* %p) {
445; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
446; GFX9:       ; %bb.0:
447; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
448; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7ff, v0
449; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
450; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
451; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
452; GFX9-NEXT:    s_setpc_b64 s[30:31]
453;
454; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
455; GFX10:       ; %bb.0:
456; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
457; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
458; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x7ff, v0
459; GFX10-NEXT:    ; implicit-def: $vcc_hi
460; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
461; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
462; GFX10-NEXT:    s_setpc_b64 s[30:31]
463  %gep = getelementptr i8, i8* %p, i64 -9223372036854773761
464  %load = load i8, i8* %gep, align 4
465  ret i8 %load
466}
467
468; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048
; Loads one byte from %p plus an offset with bit 63 set and 2048 in the low
; bits (written as a negative i64 in the IR).
; Both runs are expected to add 0x800 to the low half of the pointer and to
; load with no immediate offset. For the high half the gfx900 checks add v2,
; which is set by v_bfrev_b32 v2, 1 (presumably 0x80000000, matching the
; literal the gfx1010 checks add directly).
469define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(i8* %p) {
470; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
471; GFX9:       ; %bb.0:
472; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
473; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x800, v0
474; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
475; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
476; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
477; GFX9-NEXT:    s_setpc_b64 s[30:31]
478;
479; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
480; GFX10:       ; %bb.0:
481; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
482; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
483; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
484; GFX10-NEXT:    ; implicit-def: $vcc_hi
485; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
486; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
487; GFX10-NEXT:    s_setpc_b64 s[30:31]
488  %gep = getelementptr i8, i8* %p, i64 -9223372036854773760
489  %load = load i8, i8* %gep, align 4
490  ret i8 %load
491}
492
493; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095
; Loads one byte from %p plus an offset with bit 63 set and 4095 in the low
; bits (written as a negative i64 in the IR).
; Both runs are expected to add 0xfff to the low half of the pointer and to
; load with no immediate offset. For the high half the gfx900 checks add v2,
; which is set by v_bfrev_b32 v2, 1 (presumably 0x80000000, matching the
; literal the gfx1010 checks add directly).
494define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(i8* %p) {
495; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
496; GFX9:       ; %bb.0:
497; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
498; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfff, v0
499; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
500; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
501; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
502; GFX9-NEXT:    s_setpc_b64 s[30:31]
503;
504; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
505; GFX10:       ; %bb.0:
506; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
507; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
508; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0
509; GFX10-NEXT:    ; implicit-def: $vcc_hi
510; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
511; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
512; GFX10-NEXT:    s_setpc_b64 s[30:31]
513  %gep = getelementptr i8, i8* %p, i64 -9223372036854771713
514  %load = load i8, i8* %gep, align 4
515  ret i8 %load
516}
517
518; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096
; Loads one byte from %p plus an offset with bit 63 set and 4096 in the low
; bits (written as a negative i64 in the IR).
; Both runs are expected to add 0x1000 to the low half of the pointer and to
; load with no immediate offset. For the high half the gfx900 checks add v2,
; which is set by v_bfrev_b32 v2, 1 (presumably 0x80000000, matching the
; literal the gfx1010 checks add directly).
519define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(i8* %p) {
520; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
521; GFX9:       ; %bb.0:
522; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
523; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
524; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
525; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
526; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
527; GFX9-NEXT:    s_setpc_b64 s[30:31]
528;
529; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
530; GFX10:       ; %bb.0:
531; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
532; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
533; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
534; GFX10-NEXT:    ; implicit-def: $vcc_hi
535; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
536; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
537; GFX10-NEXT:    s_setpc_b64 s[30:31]
538  %gep = getelementptr i8, i8* %p, i64 -9223372036854771712
539  %load = load i8, i8* %gep, align 4
540  ret i8 %load
541}
542
543; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191
; Loads one byte from %p plus an offset with bit 63 set and 8191 in the low
; bits (written as a negative i64 in the IR).
; Both runs are expected to add 0x1fff to the low half of the pointer and to
; load with no immediate offset. For the high half the gfx900 checks add v2,
; which is set by v_bfrev_b32 v2, 1 (presumably 0x80000000, matching the
; literal the gfx1010 checks add directly).
544define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(i8* %p) {
545; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
546; GFX9:       ; %bb.0:
547; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
548; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1fff, v0
549; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
550; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
551; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
552; GFX9-NEXT:    s_setpc_b64 s[30:31]
553;
554; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
555; GFX10:       ; %bb.0:
556; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
557; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
558; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0
559; GFX10-NEXT:    ; implicit-def: $vcc_hi
560; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
561; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
562; GFX10-NEXT:    s_setpc_b64 s[30:31]
563  %gep = getelementptr i8, i8* %p, i64 -9223372036854767617
564  %load = load i8, i8* %gep, align 4
565  ret i8 %load
566}
567
568; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192
; Loads one byte from %p plus an offset with bit 63 set and 8192 in the low
; bits (written as a negative i64 in the IR).
; Both runs are expected to add 0x2000 to the low half of the pointer and to
; load with no immediate offset. For the high half the gfx900 checks add v2,
; which is set by v_bfrev_b32 v2, 1 (presumably 0x80000000, matching the
; literal the gfx1010 checks add directly).
569define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(i8* %p) {
570; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
571; GFX9:       ; %bb.0:
572; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
573; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
574; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
575; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
576; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
577; GFX9-NEXT:    s_setpc_b64 s[30:31]
578;
579; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
580; GFX10:       ; %bb.0:
581; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
582; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
583; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
584; GFX10-NEXT:    ; implicit-def: $vcc_hi
585; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
586; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
587; GFX10-NEXT:    s_setpc_b64 s[30:31]
588  %gep = getelementptr i8, i8* %p, i64 -9223372036854767616
589  %load = load i8, i8* %gep, align 4
590  ret i8 %load
591}
592
; Kernel variant. %p is a kernel argument, which the checks expect to be
; fetched into SGPRs with s_load_dwordx2. The byte at %p + 1 is loaded with a
; volatile load and then stored through an undef pointer.
; The gfx900 checks expect the pointer to be copied to VGPRs unchanged and the
; 1 to be carried as the load's immediate offset. The gfx1010 checks expect
; the 1 to be added with s_add_u32 and s_addc_u32 before the copy, and a load
; with no immediate offset.
593define amdgpu_kernel void @flat_inst_salu_offset_1(i8* %p) {
594; GFX9-LABEL: flat_inst_salu_offset_1:
595; GFX9:       ; %bb.0:
596; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
597; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
598; GFX9-NEXT:    v_mov_b32_e32 v0, s0
599; GFX9-NEXT:    v_mov_b32_e32 v1, s1
600; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:1
601; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
602; GFX9-NEXT:    flat_store_byte v[0:1], v0
603; GFX9-NEXT:    s_endpgm
604;
605; GFX10-LABEL: flat_inst_salu_offset_1:
606; GFX10:       ; %bb.0:
607; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
608; GFX10-NEXT:    ; implicit-def: $vcc_hi
609; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
610; GFX10-NEXT:    s_add_u32 s0, s0, 1
611; GFX10-NEXT:    s_addc_u32 s1, s1, 0
612; GFX10-NEXT:    v_mov_b32_e32 v0, s0
613; GFX10-NEXT:    v_mov_b32_e32 v1, s1
614; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
615; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
616; GFX10-NEXT:    flat_store_byte v[0:1], v0
617; GFX10-NEXT:    s_endpgm
618  %gep = getelementptr i8, i8* %p, i64 1
619  %load = load volatile i8, i8* %gep, align 1
620  store i8 %load, i8* undef
621  ret void
622}
623
; Kernel variant. The byte at %p + 2047 (0x7ff) is loaded with a volatile load
; and then stored through an undef pointer.
; The gfx900 checks expect the pointer to be copied to VGPRs unchanged and
; 2047 to be carried as the load's immediate offset. The gfx1010 checks expect
; 0x7ff to be added with s_add_u32 and s_addc_u32 before the copy, and a load
; with no immediate offset.
624define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(i8* %p) {
625; GFX9-LABEL: flat_inst_salu_offset_11bit_max:
626; GFX9:       ; %bb.0:
627; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
628; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
629; GFX9-NEXT:    v_mov_b32_e32 v0, s0
630; GFX9-NEXT:    v_mov_b32_e32 v1, s1
631; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:2047
632; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
633; GFX9-NEXT:    flat_store_byte v[0:1], v0
634; GFX9-NEXT:    s_endpgm
635;
636; GFX10-LABEL: flat_inst_salu_offset_11bit_max:
637; GFX10:       ; %bb.0:
638; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
639; GFX10-NEXT:    ; implicit-def: $vcc_hi
640; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
641; GFX10-NEXT:    s_add_u32 s0, s0, 0x7ff
642; GFX10-NEXT:    s_addc_u32 s1, s1, 0
643; GFX10-NEXT:    v_mov_b32_e32 v0, s0
644; GFX10-NEXT:    v_mov_b32_e32 v1, s1
645; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
646; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
647; GFX10-NEXT:    flat_store_byte v[0:1], v0
648; GFX10-NEXT:    s_endpgm
649  %gep = getelementptr i8, i8* %p, i64 2047
650  %load = load volatile i8, i8* %gep, align 1
651  store i8 %load, i8* undef
652  ret void
653}
654
; Kernel variant. The byte at %p + 4095 (0xfff) is loaded with a volatile load
; and then stored through an undef pointer.
; The gfx900 checks expect the pointer to be copied to VGPRs unchanged and
; 4095 to be carried as the load's immediate offset. The gfx1010 checks expect
; 0xfff to be added with s_add_u32 and s_addc_u32 before the copy, and a load
; with no immediate offset.
655define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(i8* %p) {
656; GFX9-LABEL: flat_inst_salu_offset_12bit_max:
657; GFX9:       ; %bb.0:
658; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
659; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
660; GFX9-NEXT:    v_mov_b32_e32 v0, s0
661; GFX9-NEXT:    v_mov_b32_e32 v1, s1
662; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
663; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
664; GFX9-NEXT:    flat_store_byte v[0:1], v0
665; GFX9-NEXT:    s_endpgm
666;
667; GFX10-LABEL: flat_inst_salu_offset_12bit_max:
668; GFX10:       ; %bb.0:
669; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
670; GFX10-NEXT:    ; implicit-def: $vcc_hi
671; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
672; GFX10-NEXT:    s_add_u32 s0, s0, 0xfff
673; GFX10-NEXT:    s_addc_u32 s1, s1, 0
674; GFX10-NEXT:    v_mov_b32_e32 v0, s0
675; GFX10-NEXT:    v_mov_b32_e32 v1, s1
676; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
677; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
678; GFX10-NEXT:    flat_store_byte v[0:1], v0
679; GFX10-NEXT:    s_endpgm
680  %gep = getelementptr i8, i8* %p, i64 4095
681  %load = load volatile i8, i8* %gep, align 1
682  store i8 %load, i8* undef
683  ret void
684}
685
; Kernel variant. The byte at %p + 8191 (0x1fff) is loaded with a volatile
; load and then stored through an undef pointer.
; The gfx900 checks expect the pointer to be copied to VGPRs first, then
; 0x1000 added with VALU adds and 4095 kept as the load's immediate offset.
; The gfx1010 checks expect the whole 0x1fff to be added with s_add_u32 and
; s_addc_u32 before the copy, and a load with no immediate offset.
686define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(i8* %p) {
687; GFX9-LABEL: flat_inst_salu_offset_13bit_max:
688; GFX9:       ; %bb.0:
689; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
690; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
691; GFX9-NEXT:    v_mov_b32_e32 v0, s0
692; GFX9-NEXT:    v_mov_b32_e32 v1, s1
693; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
694; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
695; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
696; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
697; GFX9-NEXT:    flat_store_byte v[0:1], v0
698; GFX9-NEXT:    s_endpgm
699;
700; GFX10-LABEL: flat_inst_salu_offset_13bit_max:
701; GFX10:       ; %bb.0:
702; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
703; GFX10-NEXT:    ; implicit-def: $vcc_hi
704; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
705; GFX10-NEXT:    s_add_u32 s0, s0, 0x1fff
706; GFX10-NEXT:    s_addc_u32 s1, s1, 0
707; GFX10-NEXT:    v_mov_b32_e32 v0, s0
708; GFX10-NEXT:    v_mov_b32_e32 v1, s1
709; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
710; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
711; GFX10-NEXT:    flat_store_byte v[0:1], v0
712; GFX10-NEXT:    s_endpgm
713  %gep = getelementptr i8, i8* %p, i64 8191
714  %load = load volatile i8, i8* %gep, align 1
715  store i8 %load, i8* undef
716  ret void
717}
718
; Kernel variant. The byte at %p - 2048 is loaded with a volatile load and
; then stored through an undef pointer.
; Neither run is expected to use the load's immediate offset. The gfx900
; checks expect the pointer to be copied to VGPRs and the offset added there
; (0xfffff800 low, -1 high). The gfx1010 checks expect the same constants to
; be added with s_add_u32 and s_addc_u32 before the copy.
719define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(i8* %p) {
720; GFX9-LABEL: flat_inst_salu_offset_neg_11bit_max:
721; GFX9:       ; %bb.0:
722; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
723; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
724; GFX9-NEXT:    v_mov_b32_e32 v0, s0
725; GFX9-NEXT:    v_mov_b32_e32 v1, s1
726; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
727; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
728; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
729; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
730; GFX9-NEXT:    flat_store_byte v[0:1], v0
731; GFX9-NEXT:    s_endpgm
732;
733; GFX10-LABEL: flat_inst_salu_offset_neg_11bit_max:
734; GFX10:       ; %bb.0:
735; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
736; GFX10-NEXT:    ; implicit-def: $vcc_hi
737; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
738; GFX10-NEXT:    s_add_u32 s0, s0, 0xfffff800
739; GFX10-NEXT:    s_addc_u32 s1, s1, -1
740; GFX10-NEXT:    v_mov_b32_e32 v0, s0
741; GFX10-NEXT:    v_mov_b32_e32 v1, s1
742; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
743; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
744; GFX10-NEXT:    flat_store_byte v[0:1], v0
745; GFX10-NEXT:    s_endpgm
746  %gep = getelementptr i8, i8* %p, i64 -2048
747  %load = load volatile i8, i8* %gep, align 1
748  store i8 %load, i8* undef
749  ret void
750}
751
; Kernel variant. The byte at %p - 4096 is loaded with a volatile load and
; then stored through an undef pointer.
; Neither run is expected to use the load's immediate offset. The gfx900
; checks expect the pointer to be copied to VGPRs and the offset added there
; (0xfffff000 low, -1 high). The gfx1010 checks expect the same constants to
; be added with s_add_u32 and s_addc_u32 before the copy.
752define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(i8* %p) {
753; GFX9-LABEL: flat_inst_salu_offset_neg_12bit_max:
754; GFX9:       ; %bb.0:
755; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
756; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
757; GFX9-NEXT:    v_mov_b32_e32 v0, s0
758; GFX9-NEXT:    v_mov_b32_e32 v1, s1
759; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
760; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
761; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
762; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
763; GFX9-NEXT:    flat_store_byte v[0:1], v0
764; GFX9-NEXT:    s_endpgm
765;
766; GFX10-LABEL: flat_inst_salu_offset_neg_12bit_max:
767; GFX10:       ; %bb.0:
768; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
769; GFX10-NEXT:    ; implicit-def: $vcc_hi
770; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
771; GFX10-NEXT:    s_add_u32 s0, s0, 0xfffff000
772; GFX10-NEXT:    s_addc_u32 s1, s1, -1
773; GFX10-NEXT:    v_mov_b32_e32 v0, s0
774; GFX10-NEXT:    v_mov_b32_e32 v1, s1
775; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
776; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
777; GFX10-NEXT:    flat_store_byte v[0:1], v0
778; GFX10-NEXT:    s_endpgm
779  %gep = getelementptr i8, i8* %p, i64 -4096
780  %load = load volatile i8, i8* %gep, align 1
781  store i8 %load, i8* undef
782  ret void
783}
784
; Kernel variant (pointer arrives in SGPRs) with byte offset -8192 = -(1 << 13).
; Expected output: the negative offset is never folded into the load.
;   gfx900  - 64-bit VALU add of 0xffffe000 (low) / -1 (high) on the copied
;             pointer, load without an offset.
;   gfx1010 - 64-bit SALU add of the same constants, then copy to VGPRs and
;             load without an offset.
785define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(i8* %p) {
786; GFX9-LABEL: flat_inst_salu_offset_neg_13bit_max:
787; GFX9:       ; %bb.0:
788; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
789; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
790; GFX9-NEXT:    v_mov_b32_e32 v0, s0
791; GFX9-NEXT:    v_mov_b32_e32 v1, s1
792; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
793; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
794; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
795; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
796; GFX9-NEXT:    flat_store_byte v[0:1], v0
797; GFX9-NEXT:    s_endpgm
798;
799; GFX10-LABEL: flat_inst_salu_offset_neg_13bit_max:
800; GFX10:       ; %bb.0:
801; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
802; GFX10-NEXT:    ; implicit-def: $vcc_hi
803; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
804; GFX10-NEXT:    s_add_u32 s0, s0, 0xffffe000
805; GFX10-NEXT:    s_addc_u32 s1, s1, -1
806; GFX10-NEXT:    v_mov_b32_e32 v0, s0
807; GFX10-NEXT:    v_mov_b32_e32 v1, s1
808; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
809; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
810; GFX10-NEXT:    flat_store_byte v[0:1], v0
811; GFX10-NEXT:    s_endpgm
812  %gep = getelementptr i8, i8* %p, i64 -8192 ; -(1 << 13)
813  %load = load volatile i8, i8* %gep, align 1
814  store i8 %load, i8* undef
815  ret void
816}
817
; Kernel variant (pointer arrives in SGPRs) with byte offset
; 4095 = (1 << 12) - 1 = 2 * 2047 + 1.
; Expected output:
;   gfx900  - the whole offset is folded into the load as offset:4095; no
;             address arithmetic is emitted.
;   gfx1010 - nothing is folded: a 64-bit SALU add of 0xfff / 0 precedes the
;             copy to VGPRs and the load carries no offset.
818define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(i8* %p) {
819; GFX9-LABEL: flat_inst_salu_offset_2x_11bit_max:
820; GFX9:       ; %bb.0:
821; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
822; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
823; GFX9-NEXT:    v_mov_b32_e32 v0, s0
824; GFX9-NEXT:    v_mov_b32_e32 v1, s1
825; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
826; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
827; GFX9-NEXT:    flat_store_byte v[0:1], v0
828; GFX9-NEXT:    s_endpgm
829;
830; GFX10-LABEL: flat_inst_salu_offset_2x_11bit_max:
831; GFX10:       ; %bb.0:
832; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
833; GFX10-NEXT:    ; implicit-def: $vcc_hi
834; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
835; GFX10-NEXT:    s_add_u32 s0, s0, 0xfff
836; GFX10-NEXT:    s_addc_u32 s1, s1, 0
837; GFX10-NEXT:    v_mov_b32_e32 v0, s0
838; GFX10-NEXT:    v_mov_b32_e32 v1, s1
839; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
840; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
841; GFX10-NEXT:    flat_store_byte v[0:1], v0
842; GFX10-NEXT:    s_endpgm
843  %gep = getelementptr i8, i8* %p, i64 4095 ; (1 << 12) - 1
844  %load = load volatile i8, i8* %gep, align 1
845  store i8 %load, i8* undef
846  ret void
847}
848
; Kernel variant (pointer arrives in SGPRs) with byte offset
; 8191 = (1 << 13) - 1 = 2 * 4095 + 1.
; Expected output:
;   gfx900  - the offset is split: 0x1000 is added to the pointer with a
;             64-bit VALU add and the remaining 4095 is folded into the load
;             (0x1000 + 4095 = 8191).
;   gfx1010 - nothing is folded: a 64-bit SALU add of 0x1fff / 0 precedes the
;             copy to VGPRs and the load carries no offset.
849define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(i8* %p) {
850; GFX9-LABEL: flat_inst_salu_offset_2x_12bit_max:
851; GFX9:       ; %bb.0:
852; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
853; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
854; GFX9-NEXT:    v_mov_b32_e32 v0, s0
855; GFX9-NEXT:    v_mov_b32_e32 v1, s1
856; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
857; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
858; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
859; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
860; GFX9-NEXT:    flat_store_byte v[0:1], v0
861; GFX9-NEXT:    s_endpgm
862;
863; GFX10-LABEL: flat_inst_salu_offset_2x_12bit_max:
864; GFX10:       ; %bb.0:
865; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
866; GFX10-NEXT:    ; implicit-def: $vcc_hi
867; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
868; GFX10-NEXT:    s_add_u32 s0, s0, 0x1fff
869; GFX10-NEXT:    s_addc_u32 s1, s1, 0
870; GFX10-NEXT:    v_mov_b32_e32 v0, s0
871; GFX10-NEXT:    v_mov_b32_e32 v1, s1
872; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
873; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
874; GFX10-NEXT:    flat_store_byte v[0:1], v0
875; GFX10-NEXT:    s_endpgm
876  %gep = getelementptr i8, i8* %p, i64 8191 ; (1 << 13) - 1
877  %load = load volatile i8, i8* %gep, align 1
878  store i8 %load, i8* undef
879  ret void
880}
881
; Kernel variant (pointer arrives in SGPRs) with byte offset
; 16383 = (1 << 14) - 1 = 2 * 8191 + 1.
; Expected output:
;   gfx900  - the offset is split: 0x3000 is added to the pointer with a
;             64-bit VALU add and the remaining 4095 is folded into the load
;             (0x3000 + 4095 = 16383).
;   gfx1010 - nothing is folded: a 64-bit SALU add of 0x3fff / 0 precedes the
;             copy to VGPRs and the load carries no offset.
882define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(i8* %p) {
883; GFX9-LABEL: flat_inst_salu_offset_2x_13bit_max:
884; GFX9:       ; %bb.0:
885; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
886; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
887; GFX9-NEXT:    v_mov_b32_e32 v0, s0
888; GFX9-NEXT:    v_mov_b32_e32 v1, s1
889; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v0
890; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
891; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
892; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
893; GFX9-NEXT:    flat_store_byte v[0:1], v0
894; GFX9-NEXT:    s_endpgm
895;
896; GFX10-LABEL: flat_inst_salu_offset_2x_13bit_max:
897; GFX10:       ; %bb.0:
898; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
899; GFX10-NEXT:    ; implicit-def: $vcc_hi
900; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
901; GFX10-NEXT:    s_add_u32 s0, s0, 0x3fff
902; GFX10-NEXT:    s_addc_u32 s1, s1, 0
903; GFX10-NEXT:    v_mov_b32_e32 v0, s0
904; GFX10-NEXT:    v_mov_b32_e32 v1, s1
905; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
906; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
907; GFX10-NEXT:    flat_store_byte v[0:1], v0
908; GFX10-NEXT:    s_endpgm
909  %gep = getelementptr i8, i8* %p, i64 16383 ; (1 << 14) - 1
910  %load = load volatile i8, i8* %gep, align 1
911  store i8 %load, i8* undef
912  ret void
913}
914
; Kernel variant (pointer arrives in SGPRs) with byte offset
; -4096 = 2 * -2048 (twice the most negative 12-bit signed value).
; Expected output: the negative offset is never folded into the load.
;   gfx900  - 64-bit VALU add of 0xfffff000 (low) / -1 (high) on the copied
;             pointer, load without an offset.
;   gfx1010 - 64-bit SALU add of the same constants, then copy to VGPRs and
;             load without an offset.
915define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(i8* %p) {
916; GFX9-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
917; GFX9:       ; %bb.0:
918; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
919; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
920; GFX9-NEXT:    v_mov_b32_e32 v0, s0
921; GFX9-NEXT:    v_mov_b32_e32 v1, s1
922; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
923; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
924; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
925; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
926; GFX9-NEXT:    flat_store_byte v[0:1], v0
927; GFX9-NEXT:    s_endpgm
928;
929; GFX10-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
930; GFX10:       ; %bb.0:
931; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
932; GFX10-NEXT:    ; implicit-def: $vcc_hi
933; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
934; GFX10-NEXT:    s_add_u32 s0, s0, 0xfffff000
935; GFX10-NEXT:    s_addc_u32 s1, s1, -1
936; GFX10-NEXT:    v_mov_b32_e32 v0, s0
937; GFX10-NEXT:    v_mov_b32_e32 v1, s1
938; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
939; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
940; GFX10-NEXT:    flat_store_byte v[0:1], v0
941; GFX10-NEXT:    s_endpgm
942  %gep = getelementptr i8, i8* %p, i64 -4096 ; 2 * -(1 << 11)
943  %load = load volatile i8, i8* %gep, align 1
944  store i8 %load, i8* undef
945  ret void
946}
947
; Kernel variant (pointer arrives in SGPRs) with byte offset
; -8192 = 2 * -4096 (twice the most negative 13-bit signed value).
; Expected output: the negative offset is never folded into the load.
;   gfx900  - 64-bit VALU add of 0xffffe000 (low) / -1 (high) on the copied
;             pointer, load without an offset.
;   gfx1010 - 64-bit SALU add of the same constants, then copy to VGPRs and
;             load without an offset.
948define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(i8* %p) {
949; GFX9-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
950; GFX9:       ; %bb.0:
951; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
952; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
953; GFX9-NEXT:    v_mov_b32_e32 v0, s0
954; GFX9-NEXT:    v_mov_b32_e32 v1, s1
955; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
956; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
957; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
958; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
959; GFX9-NEXT:    flat_store_byte v[0:1], v0
960; GFX9-NEXT:    s_endpgm
961;
962; GFX10-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
963; GFX10:       ; %bb.0:
964; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
965; GFX10-NEXT:    ; implicit-def: $vcc_hi
966; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
967; GFX10-NEXT:    s_add_u32 s0, s0, 0xffffe000
968; GFX10-NEXT:    s_addc_u32 s1, s1, -1
969; GFX10-NEXT:    v_mov_b32_e32 v0, s0
970; GFX10-NEXT:    v_mov_b32_e32 v1, s1
971; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
972; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
973; GFX10-NEXT:    flat_store_byte v[0:1], v0
974; GFX10-NEXT:    s_endpgm
975  %gep = getelementptr i8, i8* %p, i64 -8192 ; 2 * -(1 << 12)
976  %load = load volatile i8, i8* %gep, align 1
977  store i8 %load, i8* undef
978  ret void
979}
980
; Kernel variant (pointer arrives in SGPRs) with byte offset
; -16384 = 2 * -8192 (twice the most negative 14-bit signed value).
; Expected output: the negative offset is never folded into the load.
;   gfx900  - 64-bit VALU add of 0xffffc000 (low) / -1 (high) on the copied
;             pointer, load without an offset.
;   gfx1010 - 64-bit SALU add of the same constants, then copy to VGPRs and
;             load without an offset.
981define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(i8* %p) {
982; GFX9-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
983; GFX9:       ; %bb.0:
984; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
985; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
986; GFX9-NEXT:    v_mov_b32_e32 v0, s0
987; GFX9-NEXT:    v_mov_b32_e32 v1, s1
988; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffc000, v0
989; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
990; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
991; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
992; GFX9-NEXT:    flat_store_byte v[0:1], v0
993; GFX9-NEXT:    s_endpgm
994;
995; GFX10-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
996; GFX10:       ; %bb.0:
997; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
998; GFX10-NEXT:    ; implicit-def: $vcc_hi
999; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1000; GFX10-NEXT:    s_add_u32 s0, s0, 0xffffc000
1001; GFX10-NEXT:    s_addc_u32 s1, s1, -1
1002; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1003; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1004; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1005; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1006; GFX10-NEXT:    flat_store_byte v[0:1], v0
1007; GFX10-NEXT:    s_endpgm
1008  %gep = getelementptr i8, i8* %p, i64 -16384 ; 2 * -(1 << 13)
1009  %load = load volatile i8, i8* %gep, align 1
1010  store i8 %load, i8* undef
1011  ret void
1012}
1013
1014; 64-bit offset (1ull << 33) | 2047: low 11 bits all ones, plus bit 33 set.
; Expected output:
;   gfx900  - only the high part is added to the pointer (low word + 0,
;             high word + 2 with carry) and the low 2047 is folded into the
;             load as offset:2047.
;   gfx1010 - nothing is folded: a 64-bit SALU add of 0x7ff / 2, then copy to
;             VGPRs and a load without an offset.
1015define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(i8* %p) {
1016; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
1017; GFX9:       ; %bb.0:
1018; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1019; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1020; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1021; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s0
1022; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1023; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:2047
1024; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1025; GFX9-NEXT:    flat_store_byte v[0:1], v0
1026; GFX9-NEXT:    s_endpgm
1027;
1028; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
1029; GFX10:       ; %bb.0:
1030; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1031; GFX10-NEXT:    ; implicit-def: $vcc_hi
1032; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1033; GFX10-NEXT:    s_add_u32 s0, s0, 0x7ff
1034; GFX10-NEXT:    s_addc_u32 s1, s1, 2
1035; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1036; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1037; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1038; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1039; GFX10-NEXT:    flat_store_byte v[0:1], v0
1040; GFX10-NEXT:    s_endpgm
1041  %gep = getelementptr i8, i8* %p, i64 8589936639 ; 0x2000007ff
1042  %load = load volatile i8, i8* %gep, align 1
1043  store i8 %load, i8* undef
1044  ret void
1045}
1046
1047; 64-bit offset (1ull << 33) | 2048: low part is one past the 11-bit maximum, plus bit 33 set.
; Expected output:
;   gfx900  - only the high part is added to the pointer (low word + 0,
;             high word + 2 with carry) and the low 2048 is folded into the
;             load as offset:2048.
;   gfx1010 - nothing is folded: a 64-bit SALU add of 0x800 / 2, then copy to
;             VGPRs and a load without an offset.
1048define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(i8* %p) {
1049; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
1050; GFX9:       ; %bb.0:
1051; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1052; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1053; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1054; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s0
1055; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1056; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:2048
1057; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1058; GFX9-NEXT:    flat_store_byte v[0:1], v0
1059; GFX9-NEXT:    s_endpgm
1060;
1061; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
1062; GFX10:       ; %bb.0:
1063; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1064; GFX10-NEXT:    ; implicit-def: $vcc_hi
1065; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1066; GFX10-NEXT:    s_add_u32 s0, s0, 0x800
1067; GFX10-NEXT:    s_addc_u32 s1, s1, 2
1068; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1069; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1070; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1071; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1072; GFX10-NEXT:    flat_store_byte v[0:1], v0
1073; GFX10-NEXT:    s_endpgm
1074  %gep = getelementptr i8, i8* %p, i64 8589936640 ; 0x200000800
1075  %load = load volatile i8, i8* %gep, align 1
1076  store i8 %load, i8* undef
1077  ret void
1078}
1079
1080; 64-bit offset (1ull << 33) | 4095: low 12 bits all ones, plus bit 33 set.
; Expected output:
;   gfx900  - only the high part is added to the pointer (low word + 0,
;             high word + 2 with carry) and the low 4095 is folded into the
;             load as offset:4095.
;   gfx1010 - nothing is folded: a 64-bit SALU add of 0xfff / 2, then copy to
;             VGPRs and a load without an offset.
1081define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(i8* %p) {
1082; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
1083; GFX9:       ; %bb.0:
1084; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1085; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1086; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1087; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s0
1088; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1089; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
1090; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1091; GFX9-NEXT:    flat_store_byte v[0:1], v0
1092; GFX9-NEXT:    s_endpgm
1093;
1094; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
1095; GFX10:       ; %bb.0:
1096; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1097; GFX10-NEXT:    ; implicit-def: $vcc_hi
1098; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1099; GFX10-NEXT:    s_add_u32 s0, s0, 0xfff
1100; GFX10-NEXT:    s_addc_u32 s1, s1, 2
1101; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1102; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1103; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1104; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1105; GFX10-NEXT:    flat_store_byte v[0:1], v0
1106; GFX10-NEXT:    s_endpgm
1107  %gep = getelementptr i8, i8* %p, i64 8589938687 ; 0x200000fff
1108  %load = load volatile i8, i8* %gep, align 1
1109  store i8 %load, i8* undef
1110  ret void
1111}
1112
1113; 64-bit offset (1ull << 33) | 4096: low part is one past the 12-bit maximum, plus bit 33 set.
; Expected output: nothing is folded into the load on either target.
;   gfx900  - 64-bit VALU add of 0x1000 (low) / 2 (high) on the copied
;             pointer, load without an offset.
;   gfx1010 - 64-bit SALU add of 0x1000 / 2, then copy to VGPRs and a load
;             without an offset.
1114define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(i8* %p) {
1115; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
1116; GFX9:       ; %bb.0:
1117; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1118; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1119; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1120; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1121; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
1122; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1123; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
1124; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1125; GFX9-NEXT:    flat_store_byte v[0:1], v0
1126; GFX9-NEXT:    s_endpgm
1127;
1128; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
1129; GFX10:       ; %bb.0:
1130; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1131; GFX10-NEXT:    ; implicit-def: $vcc_hi
1132; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1133; GFX10-NEXT:    s_add_u32 s0, s0, 0x1000
1134; GFX10-NEXT:    s_addc_u32 s1, s1, 2
1135; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1136; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1137; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1138; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1139; GFX10-NEXT:    flat_store_byte v[0:1], v0
1140; GFX10-NEXT:    s_endpgm
1141  %gep = getelementptr i8, i8* %p, i64 8589938688 ; 0x200001000
1142  %load = load volatile i8, i8* %gep, align 1
1143  store i8 %load, i8* undef
1144  ret void
1145}
1146
1147; 64-bit offset (1ull << 33) | 8191: low 13 bits all ones, plus bit 33 set.
; Expected output:
;   gfx900  - the offset is split: a 64-bit VALU add of 0x1000 (low) / 2
;             (high) on the copied pointer, with the remaining 4095 folded
;             into the load (0x1000 + 4095 = 8191).
;   gfx1010 - nothing is folded: a 64-bit SALU add of 0x1fff / 2, then copy
;             to VGPRs and a load without an offset.
1148define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(i8* %p) {
1149; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
1150; GFX9:       ; %bb.0:
1151; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1152; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1153; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1154; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1155; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
1156; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1157; GFX9-NEXT:    flat_load_ubyte v0, v[0:1] offset:4095
1158; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1159; GFX9-NEXT:    flat_store_byte v[0:1], v0
1160; GFX9-NEXT:    s_endpgm
1161;
1162; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
1163; GFX10:       ; %bb.0:
1164; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1165; GFX10-NEXT:    ; implicit-def: $vcc_hi
1166; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1167; GFX10-NEXT:    s_add_u32 s0, s0, 0x1fff
1168; GFX10-NEXT:    s_addc_u32 s1, s1, 2
1169; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1170; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1171; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1172; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1173; GFX10-NEXT:    flat_store_byte v[0:1], v0
1174; GFX10-NEXT:    s_endpgm
1175  %gep = getelementptr i8, i8* %p, i64 8589942783 ; 0x200001fff
1176  %load = load volatile i8, i8* %gep, align 1
1177  store i8 %load, i8* undef
1178  ret void
1179}
1180
1181; 64-bit offset (1ull << 33) | 8192: low part is one past the 13-bit maximum, plus bit 33 set.
; Expected output: nothing is folded into the load on either target.
;   gfx900  - 64-bit VALU add of 0x2000 (low) / 2 (high) on the copied
;             pointer, load without an offset.
;   gfx1010 - 64-bit SALU add of 0x2000 / 2, then copy to VGPRs and a load
;             without an offset.
1182define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(i8* %p) {
1183; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
1184; GFX9:       ; %bb.0:
1185; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1186; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1187; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1188; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1189; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
1190; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
1191; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
1192; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1193; GFX9-NEXT:    flat_store_byte v[0:1], v0
1194; GFX9-NEXT:    s_endpgm
1195;
1196; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
1197; GFX10:       ; %bb.0:
1198; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1199; GFX10-NEXT:    ; implicit-def: $vcc_hi
1200; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1201; GFX10-NEXT:    s_add_u32 s0, s0, 0x2000
1202; GFX10-NEXT:    s_addc_u32 s1, s1, 2
1203; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1204; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1205; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1206; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1207; GFX10-NEXT:    flat_store_byte v[0:1], v0
1208; GFX10-NEXT:    s_endpgm
1209  %gep = getelementptr i8, i8* %p, i64 8589942784 ; 0x200002000
1210  %load = load volatile i8, i8* %gep, align 1
1211  store i8 %load, i8* undef
1212  ret void
1213}
1214
1215; 64-bit offset (1ull << 63) | 2047: low 11 bits all ones, sign bit set (negative as i64).
; Expected output: nothing is folded into the load on either target, even
; though the low part alone is only 2047.
;   gfx900  - full 64-bit VALU add: 0x7ff into the low word, and the high
;             word added from a register built with v_bfrev_b32 of 1.
;   gfx1010 - 64-bit SALU add of 0x7ff / 0x80000000, then copy to VGPRs and a
;             load without an offset.
1216define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(i8* %p) {
1217; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
1218; GFX9:       ; %bb.0:
1219; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1220; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1221; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1222; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1223; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1224; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x7ff, v0
1225; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1226; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
1227; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1228; GFX9-NEXT:    flat_store_byte v[0:1], v0
1229; GFX9-NEXT:    s_endpgm
1230;
1231; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
1232; GFX10:       ; %bb.0:
1233; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1234; GFX10-NEXT:    ; implicit-def: $vcc_hi
1235; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1236; GFX10-NEXT:    s_add_u32 s0, s0, 0x7ff
1237; GFX10-NEXT:    s_addc_u32 s1, s1, 0x80000000
1238; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1239; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1240; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1241; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1242; GFX10-NEXT:    flat_store_byte v[0:1], v0
1243; GFX10-NEXT:    s_endpgm
1244  %gep = getelementptr i8, i8* %p, i64 -9223372036854773761 ; 0x80000000000007ff
1245  %load = load volatile i8, i8* %gep, align 1
1246  store i8 %load, i8* undef
1247  ret void
1248}
1249
1250; 64-bit offset (1ull << 63) | 2048: low part one past the 11-bit maximum, sign bit set (negative as i64).
; Expected output: nothing is folded into the load on either target.
;   gfx900  - full 64-bit VALU add: 0x800 into the low word, and the high
;             word added from a register built with v_bfrev_b32 of 1.
;   gfx1010 - 64-bit SALU add of 0x800 / 0x80000000, then copy to VGPRs and a
;             load without an offset.
1251define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(i8* %p) {
1252; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
1253; GFX9:       ; %bb.0:
1254; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1255; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1256; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1257; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1258; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1259; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x800, v0
1260; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1261; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
1262; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1263; GFX9-NEXT:    flat_store_byte v[0:1], v0
1264; GFX9-NEXT:    s_endpgm
1265;
1266; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
1267; GFX10:       ; %bb.0:
1268; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1269; GFX10-NEXT:    ; implicit-def: $vcc_hi
1270; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1271; GFX10-NEXT:    s_add_u32 s0, s0, 0x800
1272; GFX10-NEXT:    s_addc_u32 s1, s1, 0x80000000
1273; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1274; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1275; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1276; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1277; GFX10-NEXT:    flat_store_byte v[0:1], v0
1278; GFX10-NEXT:    s_endpgm
1279  %gep = getelementptr i8, i8* %p, i64 -9223372036854773760 ; 0x8000000000000800
1280  %load = load volatile i8, i8* %gep, align 1
1281  store i8 %load, i8* undef
1282  ret void
1283}
1284
1285; 64-bit offset (1ull << 63) | 4095: low 12 bits all ones, sign bit set (negative as i64).
; Expected output: nothing is folded into the load on either target.
;   gfx900  - full 64-bit VALU add: 0xfff into the low word, and the high
;             word added from a register built with v_bfrev_b32 of 1.
;   gfx1010 - 64-bit SALU add of 0xfff / 0x80000000, then copy to VGPRs and a
;             load without an offset.
1286define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(i8* %p) {
1287; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
1288; GFX9:       ; %bb.0:
1289; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1290; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1291; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1292; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1293; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1294; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfff, v0
1295; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1296; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
1297; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1298; GFX9-NEXT:    flat_store_byte v[0:1], v0
1299; GFX9-NEXT:    s_endpgm
1300;
1301; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
1302; GFX10:       ; %bb.0:
1303; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1304; GFX10-NEXT:    ; implicit-def: $vcc_hi
1305; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1306; GFX10-NEXT:    s_add_u32 s0, s0, 0xfff
1307; GFX10-NEXT:    s_addc_u32 s1, s1, 0x80000000
1308; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1309; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1310; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1311; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1312; GFX10-NEXT:    flat_store_byte v[0:1], v0
1313; GFX10-NEXT:    s_endpgm
1314  %gep = getelementptr i8, i8* %p, i64 -9223372036854771713 ; 0x8000000000000fff
1315  %load = load volatile i8, i8* %gep, align 1
1316  store i8 %load, i8* undef
1317  ret void
1318}
1319
1320; 64-bit offset (1ull << 63) | 4096: low part one past the 12-bit maximum, sign bit set (negative as i64).
; Expected output: nothing is folded into the load on either target.
;   gfx900  - full 64-bit VALU add: 0x1000 into the low word, and the high
;             word added from a register built with v_bfrev_b32 of 1.
;   gfx1010 - 64-bit SALU add of 0x1000 / 0x80000000, then copy to VGPRs and
;             a load without an offset.
1321define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(i8* %p) {
1322; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
1323; GFX9:       ; %bb.0:
1324; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1325; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1326; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1327; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1328; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1329; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
1330; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1331; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
1332; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1333; GFX9-NEXT:    flat_store_byte v[0:1], v0
1334; GFX9-NEXT:    s_endpgm
1335;
1336; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
1337; GFX10:       ; %bb.0:
1338; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1339; GFX10-NEXT:    ; implicit-def: $vcc_hi
1340; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1341; GFX10-NEXT:    s_add_u32 s0, s0, 0x1000
1342; GFX10-NEXT:    s_addc_u32 s1, s1, 0x80000000
1343; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1344; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1345; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1346; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1347; GFX10-NEXT:    flat_store_byte v[0:1], v0
1348; GFX10-NEXT:    s_endpgm
1349  %gep = getelementptr i8, i8* %p, i64 -9223372036854771712 ; 0x8000000000001000
1350  %load = load volatile i8, i8* %gep, align 1
1351  store i8 %load, i8* undef
1352  ret void
1353}
1354
1355; 64-bit offset (1ull << 63) | 8191: low 13 bits all ones, sign bit set (negative as i64).
; Expected output: nothing is folded into the load on either target.
;   gfx900  - full 64-bit VALU add: 0x1fff into the low word, and the high
;             word added from a register built with v_bfrev_b32 of 1.
;   gfx1010 - 64-bit SALU add of 0x1fff / 0x80000000, then copy to VGPRs and
;             a load without an offset.
1356define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(i8* %p) {
1357; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
1358; GFX9:       ; %bb.0:
1359; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1360; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1361; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1362; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1363; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1364; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1fff, v0
1365; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1366; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
1367; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1368; GFX9-NEXT:    flat_store_byte v[0:1], v0
1369; GFX9-NEXT:    s_endpgm
1370;
1371; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
1372; GFX10:       ; %bb.0:
1373; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1374; GFX10-NEXT:    ; implicit-def: $vcc_hi
1375; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1376; GFX10-NEXT:    s_add_u32 s0, s0, 0x1fff
1377; GFX10-NEXT:    s_addc_u32 s1, s1, 0x80000000
1378; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1379; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1380; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1381; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1382; GFX10-NEXT:    flat_store_byte v[0:1], v0
1383; GFX10-NEXT:    s_endpgm
1384  %gep = getelementptr i8, i8* %p, i64 -9223372036854767617 ; 0x8000000000001fff
1385  %load = load volatile i8, i8* %gep, align 1
1386  store i8 %load, i8* undef
1387  ret void
1388}
1389
1390; 64-bit offset (1ull << 63) | 8192: low part one past the 13-bit maximum, sign bit set (negative as i64).
; Expected output: nothing is folded into the load on either target.
;   gfx900  - full 64-bit VALU add: 0x2000 into the low word, and the high
;             word added from a register built with v_bfrev_b32 of 1.
;   gfx1010 - 64-bit SALU add of 0x2000 / 0x80000000, then copy to VGPRs and
;             a load without an offset.
1391define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(i8* %p) {
1392; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
1393; GFX9:       ; %bb.0:
1394; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1395; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
1396; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1397; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1398; GFX9-NEXT:    v_mov_b32_e32 v2, s1
1399; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
1400; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
1401; GFX9-NEXT:    flat_load_ubyte v0, v[0:1]
1402; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1403; GFX9-NEXT:    flat_store_byte v[0:1], v0
1404; GFX9-NEXT:    s_endpgm
1405;
1406; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
1407; GFX10:       ; %bb.0:
1408; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1409; GFX10-NEXT:    ; implicit-def: $vcc_hi
1410; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1411; GFX10-NEXT:    s_add_u32 s0, s0, 0x2000
1412; GFX10-NEXT:    s_addc_u32 s1, s1, 0x80000000
1413; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1414; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1415; GFX10-NEXT:    flat_load_ubyte v0, v[0:1]
1416; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1417; GFX10-NEXT:    flat_store_byte v[0:1], v0
1418; GFX10-NEXT:    s_endpgm
1419  %gep = getelementptr i8, i8* %p, i64 -9223372036854767616 ; 0x8000000000002000
1420  %load = load volatile i8, i8* %gep, align 1
1421  store i8 %load, i8* undef
1422  ret void
1423}
1424