; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s

; Test splitting flat instruction offsets into the low and high bits
; when the offset doesn't fit in the offset field.
;
; Layout of this file:
;   * flat_inst_valu_offset_*: ordinary functions; the pointer arrives in
;     v[0:1] and the loaded byte is returned.
;   * flat_inst_salu_offset_*: amdgpu_kernel entry points; the pointer is
;     loaded into s[0:1] and the volatile load result is stored to undef.
; The expectations recorded below show gfx900 using the immediate offset
; field of flat_load_ubyte for constants in [0, 4095], while gfx1010 always
; adds the whole constant to the pointer before the load.

; Offset 1: gfx900 encodes it as offset:1; gfx1010 uses a 64-bit VALU add.
define i8 @flat_inst_valu_offset_1(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, 1
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 1
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Offset 2047 (largest 11-bit value): fits the gfx900 offset field.
define i8 @flat_inst_valu_offset_11bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_11bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_11bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x7ff, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 2047
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Offset 4095 (largest 12-bit value): still a single gfx900 instruction.
define i8 @flat_inst_valu_offset_12bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_12bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_12bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 4095
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Offset 8191: gfx900 splits it into a 0x1000 pointer add plus offset:4095.
define i8 @flat_inst_valu_offset_13bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_13bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_13bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 8191
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Offset -2048: neither target uses the offset field; the negative constant
; is added to the pointer (high half adds -1 with carry).
define i8 @flat_inst_valu_offset_neg_11bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_neg_11bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_neg_11bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 -2048
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Offset -4096: full negative constant added to the pointer on both targets.
define i8 @flat_inst_valu_offset_neg_12bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_neg_12bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_neg_12bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 -4096
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Offset -8192: full negative constant added to the pointer on both targets.
define i8 @flat_inst_valu_offset_neg_13bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_neg_13bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_neg_13bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 -8192
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Offset 4095 (same constant as the 12bit_max case above).
define i8 @flat_inst_valu_offset_2x_11bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_2x_11bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_2x_11bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 4095
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Offset 8191 (same constant as the 13bit_max case above).
define i8 @flat_inst_valu_offset_2x_12bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_2x_12bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_2x_12bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 8191
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Offset 16383: gfx900 splits it into a 0x3000 pointer add plus offset:4095.
define i8 @flat_inst_valu_offset_2x_13bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_2x_13bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_2x_13bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x3fff, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 16383
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Offset -4096 (same constant as the neg_12bit_max case above).
define i8 @flat_inst_valu_offset_2x_neg_11bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 -4096
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Offset -8192 (same constant as the neg_13bit_max case above).
define i8 @flat_inst_valu_offset_2x_neg_12bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 -8192
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Offset -16384: full negative constant added to the pointer on both targets.
define i8 @flat_inst_valu_offset_2x_neg_13bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xffffc000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 -16384
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Fill 11-bit low-bits (1ull << 33) | 2047
; gfx900 keeps 2047 in the offset field and adds only the high dword (2).
define i8 @flat_inst_valu_offset_64bit_11bit_split0(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x7ff, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 8589936639
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Fill 11-bit low-bits (1ull << 33) | 2048
; gfx900 keeps 2048 in the offset field and adds only the high dword (2).
define i8 @flat_inst_valu_offset_64bit_11bit_split1(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 8589936640
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Fill 12-bit low-bits (1ull << 33) | 4095
; gfx900 keeps 4095 in the offset field and adds only the high dword (2).
define i8 @flat_inst_valu_offset_64bit_12bit_split0(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 8589938687
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Fill 12-bit low-bits (1ull << 33) | 4096
; gfx900 adds 0x1000 to the low dword and uses no instruction offset.
define i8 @flat_inst_valu_offset_64bit_12bit_split1(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 8589938688
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Fill 13-bit low-bits (1ull << 33) | 8191
; gfx900 adds 0x1000 to the low dword and keeps 4095 in the offset field.
define i8 @flat_inst_valu_offset_64bit_13bit_split0(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 8589942783
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Fill 13-bit low-bits (1ull << 33) | 8192
; gfx900 adds 0x2000 to the low dword and uses no instruction offset.
define i8 @flat_inst_valu_offset_64bit_13bit_split1(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 8589942784
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047
; Neither target uses the offset field here; gfx900 materializes the
; 0x80000000 high dword with v_bfrev_b32 of 1.
define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x7ff, v0
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x7ff, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 -9223372036854773761
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048
define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x800, v0
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 -9223372036854773760
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095
define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfff, v0
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 -9223372036854771713
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096
define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 -9223372036854771712
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191
define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1fff, v0
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 -9223372036854767617
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192
define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(i8* %p) {
; GFX9-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8* %p, i64 -9223372036854767616
  %load = load i8, i8* %gep, align 4
  ret i8 %load
}

; Kernel variant, offset 1: gfx900 uses offset:1; gfx1010 adds the constant
; with scalar s_add_u32/s_addc_u32 before copying the pointer to VGPRs.
define amdgpu_kernel void @flat_inst_salu_offset_1(i8* %p) {
; GFX9-LABEL: flat_inst_salu_offset_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_byte v[0:1], v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: flat_inst_salu_offset_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s0, s0, 1
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
  %gep = getelementptr i8, i8* %p, i64 1
  %load = load volatile i8, i8* %gep, align 1
  store i8 %load, i8* undef
  ret void
}

; Kernel variant, offset 2047: fits the gfx900 offset field.
define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_salu_offset_11bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_byte v[0:1], v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: flat_inst_salu_offset_11bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
  %gep = getelementptr i8, i8* %p, i64 2047
  %load = load volatile i8, i8* %gep, align 1
  store i8 %load, i8* undef
  ret void
}

; Kernel variant, offset 4095: fits the gfx900 offset field.
define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_salu_offset_12bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_byte v[0:1], v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: flat_inst_salu_offset_12bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s0, s0, 0xfff
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
  %gep = getelementptr i8, i8* %p, i64 4095
  %load = load volatile i8, i8* %gep, align 1
  store i8 %load, i8* undef
  ret void
}

; Kernel variant, offset 8191: gfx900 adds 0x1000 with VALU adds after the
; copy to VGPRs and keeps 4095 in the offset field.
define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_salu_offset_13bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_byte v[0:1], v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: flat_inst_salu_offset_13bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
  %gep = getelementptr i8, i8* %p, i64 8191
  %load = load volatile i8, i8* %gep, align 1
  store i8 %load, i8* undef
  ret void
}

; Kernel variant, offset -2048: no instruction offset on either target;
; gfx900 adds with VALU instructions, gfx1010 with SALU instructions.
define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_salu_offset_neg_11bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_byte v[0:1], v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: flat_inst_salu_offset_neg_11bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff800
; GFX10-NEXT: s_addc_u32 s1, s1, -1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
  %gep = getelementptr i8, i8* %p, i64 -2048
  %load = load volatile i8, i8* %gep, align 1
  store i8 %load, i8* undef
  ret void
}

; Kernel variant, offset -4096: no instruction offset on either target.
define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_salu_offset_neg_12bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_byte v[0:1], v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: flat_inst_salu_offset_neg_12bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff000
; GFX10-NEXT: s_addc_u32 s1, s1, -1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
  %gep = getelementptr i8, i8* %p, i64 -4096
  %load = load volatile i8, i8* %gep, align 1
  store i8 %load, i8* undef
  ret void
}

; Kernel variant, offset -8192: no instruction offset on either target.
define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_salu_offset_neg_13bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_byte v[0:1], v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: flat_inst_salu_offset_neg_13bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s0, s0, 0xffffe000
; GFX10-NEXT: s_addc_u32 s1, s1, -1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
  %gep = getelementptr i8, i8* %p, i64 -8192
  %load = load volatile i8, i8* %gep, align 1
  store i8 %load, i8* undef
  ret void
}

; Kernel variant, offset 4095 (same constant as salu_offset_12bit_max).
define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_salu_offset_2x_11bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_byte v[0:1], v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: flat_inst_salu_offset_2x_11bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s0, s0, 0xfff
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
  %gep = getelementptr i8, i8* %p, i64 4095
  %load = load volatile i8, i8* %gep, align 1
  store i8 %load, i8* undef
  ret void
}

; Kernel variant, offset 8191 (same constant as salu_offset_13bit_max).
define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_salu_offset_2x_12bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_byte v[0:1], v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: flat_inst_salu_offset_2x_12bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_store_byte v[0:1], v0
; GFX10-NEXT: s_endpgm
  %gep = getelementptr i8, i8* %p, i64 8191
  %load = load volatile i8, i8* %gep, align 1
  store i8 %load, i8* undef
  ret void
}

; Kernel variant whose gfx900 expectation adds 0x3000 and keeps 4095 in the
; offset field.
define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(i8* %p) {
; GFX9-LABEL: flat_inst_salu_offset_2x_13bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_byte v[0:1], v0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: flat_inst_salu_offset_2x_13bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: ; implicit-def: $vcc_hi
;
GFX10-NEXT: s_waitcnt lgkmcnt(0) 901; GFX10-NEXT: s_add_u32 s0, s0, 0x3fff 902; GFX10-NEXT: s_addc_u32 s1, s1, 0 903; GFX10-NEXT: v_mov_b32_e32 v0, s0 904; GFX10-NEXT: v_mov_b32_e32 v1, s1 905; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 906; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 907; GFX10-NEXT: flat_store_byte v[0:1], v0 908; GFX10-NEXT: s_endpgm 909 %gep = getelementptr i8, i8* %p, i64 16383 910 %load = load volatile i8, i8* %gep, align 1 911 store i8 %load, i8* undef 912 ret void 913} 914 915define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(i8* %p) { 916; GFX9-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: 917; GFX9: ; %bb.0: 918; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 919; GFX9-NEXT: s_waitcnt lgkmcnt(0) 920; GFX9-NEXT: v_mov_b32_e32 v0, s0 921; GFX9-NEXT: v_mov_b32_e32 v1, s1 922; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 923; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 924; GFX9-NEXT: flat_load_ubyte v0, v[0:1] 925; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 926; GFX9-NEXT: flat_store_byte v[0:1], v0 927; GFX9-NEXT: s_endpgm 928; 929; GFX10-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: 930; GFX10: ; %bb.0: 931; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 932; GFX10-NEXT: ; implicit-def: $vcc_hi 933; GFX10-NEXT: s_waitcnt lgkmcnt(0) 934; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff000 935; GFX10-NEXT: s_addc_u32 s1, s1, -1 936; GFX10-NEXT: v_mov_b32_e32 v0, s0 937; GFX10-NEXT: v_mov_b32_e32 v1, s1 938; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 939; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 940; GFX10-NEXT: flat_store_byte v[0:1], v0 941; GFX10-NEXT: s_endpgm 942 %gep = getelementptr i8, i8* %p, i64 -4096 943 %load = load volatile i8, i8* %gep, align 1 944 store i8 %load, i8* undef 945 ret void 946} 947 948define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(i8* %p) { 949; GFX9-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: 950; GFX9: ; %bb.0: 951; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 952; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) 953; GFX9-NEXT: v_mov_b32_e32 v0, s0 954; GFX9-NEXT: v_mov_b32_e32 v1, s1 955; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 956; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 957; GFX9-NEXT: flat_load_ubyte v0, v[0:1] 958; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 959; GFX9-NEXT: flat_store_byte v[0:1], v0 960; GFX9-NEXT: s_endpgm 961; 962; GFX10-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: 963; GFX10: ; %bb.0: 964; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 965; GFX10-NEXT: ; implicit-def: $vcc_hi 966; GFX10-NEXT: s_waitcnt lgkmcnt(0) 967; GFX10-NEXT: s_add_u32 s0, s0, 0xffffe000 968; GFX10-NEXT: s_addc_u32 s1, s1, -1 969; GFX10-NEXT: v_mov_b32_e32 v0, s0 970; GFX10-NEXT: v_mov_b32_e32 v1, s1 971; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 972; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 973; GFX10-NEXT: flat_store_byte v[0:1], v0 974; GFX10-NEXT: s_endpgm 975 %gep = getelementptr i8, i8* %p, i64 -8192 976 %load = load volatile i8, i8* %gep, align 1 977 store i8 %load, i8* undef 978 ret void 979} 980 981define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(i8* %p) { 982; GFX9-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: 983; GFX9: ; %bb.0: 984; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 985; GFX9-NEXT: s_waitcnt lgkmcnt(0) 986; GFX9-NEXT: v_mov_b32_e32 v0, s0 987; GFX9-NEXT: v_mov_b32_e32 v1, s1 988; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0 989; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 990; GFX9-NEXT: flat_load_ubyte v0, v[0:1] 991; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 992; GFX9-NEXT: flat_store_byte v[0:1], v0 993; GFX9-NEXT: s_endpgm 994; 995; GFX10-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: 996; GFX10: ; %bb.0: 997; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 998; GFX10-NEXT: ; implicit-def: $vcc_hi 999; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1000; GFX10-NEXT: s_add_u32 s0, s0, 0xffffc000 1001; GFX10-NEXT: s_addc_u32 s1, s1, -1 1002; GFX10-NEXT: v_mov_b32_e32 v0, s0 1003; GFX10-NEXT: 
v_mov_b32_e32 v1, s1 1004; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 1005; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1006; GFX10-NEXT: flat_store_byte v[0:1], v0 1007; GFX10-NEXT: s_endpgm 1008 %gep = getelementptr i8, i8* %p, i64 -16384 1009 %load = load volatile i8, i8* %gep, align 1 1010 store i8 %load, i8* undef 1011 ret void 1012} 1013 1014; Fill 11-bit low-bits (1ull << 33) | 2047 1015define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(i8* %p) { 1016; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_split0: 1017; GFX9: ; %bb.0: 1018; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1019; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1020; GFX9-NEXT: v_mov_b32_e32 v1, s1 1021; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 1022; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1023; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047 1024; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1025; GFX9-NEXT: flat_store_byte v[0:1], v0 1026; GFX9-NEXT: s_endpgm 1027; 1028; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_split0: 1029; GFX10: ; %bb.0: 1030; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1031; GFX10-NEXT: ; implicit-def: $vcc_hi 1032; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1033; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff 1034; GFX10-NEXT: s_addc_u32 s1, s1, 2 1035; GFX10-NEXT: v_mov_b32_e32 v0, s0 1036; GFX10-NEXT: v_mov_b32_e32 v1, s1 1037; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 1038; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1039; GFX10-NEXT: flat_store_byte v[0:1], v0 1040; GFX10-NEXT: s_endpgm 1041 %gep = getelementptr i8, i8* %p, i64 8589936639 1042 %load = load volatile i8, i8* %gep, align 1 1043 store i8 %load, i8* undef 1044 ret void 1045} 1046 1047; Fill 11-bit low-bits (1ull << 33) | 2048 1048define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(i8* %p) { 1049; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_split1: 1050; GFX9: ; %bb.0: 1051; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1052; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1053; GFX9-NEXT: v_mov_b32_e32 v1, 
s1 1054; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 1055; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1056; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048 1057; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1058; GFX9-NEXT: flat_store_byte v[0:1], v0 1059; GFX9-NEXT: s_endpgm 1060; 1061; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_split1: 1062; GFX10: ; %bb.0: 1063; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1064; GFX10-NEXT: ; implicit-def: $vcc_hi 1065; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1066; GFX10-NEXT: s_add_u32 s0, s0, 0x800 1067; GFX10-NEXT: s_addc_u32 s1, s1, 2 1068; GFX10-NEXT: v_mov_b32_e32 v0, s0 1069; GFX10-NEXT: v_mov_b32_e32 v1, s1 1070; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 1071; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1072; GFX10-NEXT: flat_store_byte v[0:1], v0 1073; GFX10-NEXT: s_endpgm 1074 %gep = getelementptr i8, i8* %p, i64 8589936640 1075 %load = load volatile i8, i8* %gep, align 1 1076 store i8 %load, i8* undef 1077 ret void 1078} 1079 1080; Fill 12-bit low-bits (1ull << 33) | 4095 1081define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(i8* %p) { 1082; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_split0: 1083; GFX9: ; %bb.0: 1084; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1085; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1086; GFX9-NEXT: v_mov_b32_e32 v1, s1 1087; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 1088; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1089; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 1090; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1091; GFX9-NEXT: flat_store_byte v[0:1], v0 1092; GFX9-NEXT: s_endpgm 1093; 1094; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_split0: 1095; GFX10: ; %bb.0: 1096; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1097; GFX10-NEXT: ; implicit-def: $vcc_hi 1098; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1099; GFX10-NEXT: s_add_u32 s0, s0, 0xfff 1100; GFX10-NEXT: s_addc_u32 s1, s1, 2 1101; GFX10-NEXT: v_mov_b32_e32 v0, s0 1102; GFX10-NEXT: v_mov_b32_e32 v1, s1 1103; 
GFX10-NEXT: flat_load_ubyte v0, v[0:1] 1104; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1105; GFX10-NEXT: flat_store_byte v[0:1], v0 1106; GFX10-NEXT: s_endpgm 1107 %gep = getelementptr i8, i8* %p, i64 8589938687 1108 %load = load volatile i8, i8* %gep, align 1 1109 store i8 %load, i8* undef 1110 ret void 1111} 1112 1113; Fill 12-bit low-bits (1ull << 33) | 4096 1114define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(i8* %p) { 1115; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_split1: 1116; GFX9: ; %bb.0: 1117; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1118; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1119; GFX9-NEXT: v_mov_b32_e32 v0, s0 1120; GFX9-NEXT: v_mov_b32_e32 v1, s1 1121; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 1122; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1123; GFX9-NEXT: flat_load_ubyte v0, v[0:1] 1124; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1125; GFX9-NEXT: flat_store_byte v[0:1], v0 1126; GFX9-NEXT: s_endpgm 1127; 1128; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_split1: 1129; GFX10: ; %bb.0: 1130; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1131; GFX10-NEXT: ; implicit-def: $vcc_hi 1132; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1133; GFX10-NEXT: s_add_u32 s0, s0, 0x1000 1134; GFX10-NEXT: s_addc_u32 s1, s1, 2 1135; GFX10-NEXT: v_mov_b32_e32 v0, s0 1136; GFX10-NEXT: v_mov_b32_e32 v1, s1 1137; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 1138; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1139; GFX10-NEXT: flat_store_byte v[0:1], v0 1140; GFX10-NEXT: s_endpgm 1141 %gep = getelementptr i8, i8* %p, i64 8589938688 1142 %load = load volatile i8, i8* %gep, align 1 1143 store i8 %load, i8* undef 1144 ret void 1145} 1146 1147; Fill 13-bit low-bits (1ull << 33) | 8191 1148define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(i8* %p) { 1149; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_split0: 1150; GFX9: ; %bb.0: 1151; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1152; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1153; GFX9-NEXT: 
v_mov_b32_e32 v0, s0 1154; GFX9-NEXT: v_mov_b32_e32 v1, s1 1155; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 1156; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1157; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 1158; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1159; GFX9-NEXT: flat_store_byte v[0:1], v0 1160; GFX9-NEXT: s_endpgm 1161; 1162; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_split0: 1163; GFX10: ; %bb.0: 1164; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1165; GFX10-NEXT: ; implicit-def: $vcc_hi 1166; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1167; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff 1168; GFX10-NEXT: s_addc_u32 s1, s1, 2 1169; GFX10-NEXT: v_mov_b32_e32 v0, s0 1170; GFX10-NEXT: v_mov_b32_e32 v1, s1 1171; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 1172; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1173; GFX10-NEXT: flat_store_byte v[0:1], v0 1174; GFX10-NEXT: s_endpgm 1175 %gep = getelementptr i8, i8* %p, i64 8589942783 1176 %load = load volatile i8, i8* %gep, align 1 1177 store i8 %load, i8* undef 1178 ret void 1179} 1180 1181; Fill 13-bit low-bits (1ull << 33) | 8192 1182define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(i8* %p) { 1183; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_split1: 1184; GFX9: ; %bb.0: 1185; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1186; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1187; GFX9-NEXT: v_mov_b32_e32 v0, s0 1188; GFX9-NEXT: v_mov_b32_e32 v1, s1 1189; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 1190; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1191; GFX9-NEXT: flat_load_ubyte v0, v[0:1] 1192; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1193; GFX9-NEXT: flat_store_byte v[0:1], v0 1194; GFX9-NEXT: s_endpgm 1195; 1196; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_split1: 1197; GFX10: ; %bb.0: 1198; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1199; GFX10-NEXT: ; implicit-def: $vcc_hi 1200; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1201; GFX10-NEXT: s_add_u32 s0, s0, 0x2000 1202; GFX10-NEXT: s_addc_u32 s1, s1, 
2 1203; GFX10-NEXT: v_mov_b32_e32 v0, s0 1204; GFX10-NEXT: v_mov_b32_e32 v1, s1 1205; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 1206; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1207; GFX10-NEXT: flat_store_byte v[0:1], v0 1208; GFX10-NEXT: s_endpgm 1209 %gep = getelementptr i8, i8* %p, i64 8589942784 1210 %load = load volatile i8, i8* %gep, align 1 1211 store i8 %load, i8* undef 1212 ret void 1213} 1214 1215; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047 1216define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(i8* %p) { 1217; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: 1218; GFX9: ; %bb.0: 1219; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1220; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1221; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1222; GFX9-NEXT: v_mov_b32_e32 v0, s0 1223; GFX9-NEXT: v_mov_b32_e32 v2, s1 1224; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x7ff, v0 1225; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1226; GFX9-NEXT: flat_load_ubyte v0, v[0:1] 1227; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1228; GFX9-NEXT: flat_store_byte v[0:1], v0 1229; GFX9-NEXT: s_endpgm 1230; 1231; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: 1232; GFX10: ; %bb.0: 1233; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1234; GFX10-NEXT: ; implicit-def: $vcc_hi 1235; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1236; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff 1237; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 1238; GFX10-NEXT: v_mov_b32_e32 v0, s0 1239; GFX10-NEXT: v_mov_b32_e32 v1, s1 1240; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 1241; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1242; GFX10-NEXT: flat_store_byte v[0:1], v0 1243; GFX10-NEXT: s_endpgm 1244 %gep = getelementptr i8, i8* %p, i64 -9223372036854773761 1245 %load = load volatile i8, i8* %gep, align 1 1246 store i8 %load, i8* undef 1247 ret void 1248} 1249 1250; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048 1251define amdgpu_kernel void 
@flat_inst_salu_offset_64bit_11bit_neg_high_split1(i8* %p) { 1252; GFX9-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: 1253; GFX9: ; %bb.0: 1254; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1255; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1256; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1257; GFX9-NEXT: v_mov_b32_e32 v0, s0 1258; GFX9-NEXT: v_mov_b32_e32 v2, s1 1259; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x800, v0 1260; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1261; GFX9-NEXT: flat_load_ubyte v0, v[0:1] 1262; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1263; GFX9-NEXT: flat_store_byte v[0:1], v0 1264; GFX9-NEXT: s_endpgm 1265; 1266; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: 1267; GFX10: ; %bb.0: 1268; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1269; GFX10-NEXT: ; implicit-def: $vcc_hi 1270; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1271; GFX10-NEXT: s_add_u32 s0, s0, 0x800 1272; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 1273; GFX10-NEXT: v_mov_b32_e32 v0, s0 1274; GFX10-NEXT: v_mov_b32_e32 v1, s1 1275; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 1276; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1277; GFX10-NEXT: flat_store_byte v[0:1], v0 1278; GFX10-NEXT: s_endpgm 1279 %gep = getelementptr i8, i8* %p, i64 -9223372036854773760 1280 %load = load volatile i8, i8* %gep, align 1 1281 store i8 %load, i8* undef 1282 ret void 1283} 1284 1285; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095 1286define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(i8* %p) { 1287; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: 1288; GFX9: ; %bb.0: 1289; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1290; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1291; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1292; GFX9-NEXT: v_mov_b32_e32 v0, s0 1293; GFX9-NEXT: v_mov_b32_e32 v2, s1 1294; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfff, v0 1295; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1296; GFX9-NEXT: flat_load_ubyte v0, v[0:1] 1297; GFX9-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 1298; GFX9-NEXT: flat_store_byte v[0:1], v0 1299; GFX9-NEXT: s_endpgm 1300; 1301; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: 1302; GFX10: ; %bb.0: 1303; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1304; GFX10-NEXT: ; implicit-def: $vcc_hi 1305; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1306; GFX10-NEXT: s_add_u32 s0, s0, 0xfff 1307; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 1308; GFX10-NEXT: v_mov_b32_e32 v0, s0 1309; GFX10-NEXT: v_mov_b32_e32 v1, s1 1310; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 1311; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1312; GFX10-NEXT: flat_store_byte v[0:1], v0 1313; GFX10-NEXT: s_endpgm 1314 %gep = getelementptr i8, i8* %p, i64 -9223372036854771713 1315 %load = load volatile i8, i8* %gep, align 1 1316 store i8 %load, i8* undef 1317 ret void 1318} 1319 1320; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096 1321define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(i8* %p) { 1322; GFX9-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: 1323; GFX9: ; %bb.0: 1324; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1325; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1326; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1327; GFX9-NEXT: v_mov_b32_e32 v0, s0 1328; GFX9-NEXT: v_mov_b32_e32 v2, s1 1329; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 1330; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1331; GFX9-NEXT: flat_load_ubyte v0, v[0:1] 1332; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1333; GFX9-NEXT: flat_store_byte v[0:1], v0 1334; GFX9-NEXT: s_endpgm 1335; 1336; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: 1337; GFX10: ; %bb.0: 1338; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1339; GFX10-NEXT: ; implicit-def: $vcc_hi 1340; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1341; GFX10-NEXT: s_add_u32 s0, s0, 0x1000 1342; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 1343; GFX10-NEXT: v_mov_b32_e32 v0, s0 1344; GFX10-NEXT: v_mov_b32_e32 v1, s1 1345; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 
1346; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1347; GFX10-NEXT: flat_store_byte v[0:1], v0 1348; GFX10-NEXT: s_endpgm 1349 %gep = getelementptr i8, i8* %p, i64 -9223372036854771712 1350 %load = load volatile i8, i8* %gep, align 1 1351 store i8 %load, i8* undef 1352 ret void 1353} 1354 1355; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191 1356define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(i8* %p) { 1357; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: 1358; GFX9: ; %bb.0: 1359; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1360; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1361; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1362; GFX9-NEXT: v_mov_b32_e32 v0, s0 1363; GFX9-NEXT: v_mov_b32_e32 v2, s1 1364; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1fff, v0 1365; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1366; GFX9-NEXT: flat_load_ubyte v0, v[0:1] 1367; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1368; GFX9-NEXT: flat_store_byte v[0:1], v0 1369; GFX9-NEXT: s_endpgm 1370; 1371; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: 1372; GFX10: ; %bb.0: 1373; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1374; GFX10-NEXT: ; implicit-def: $vcc_hi 1375; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1376; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff 1377; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 1378; GFX10-NEXT: v_mov_b32_e32 v0, s0 1379; GFX10-NEXT: v_mov_b32_e32 v1, s1 1380; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 1381; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1382; GFX10-NEXT: flat_store_byte v[0:1], v0 1383; GFX10-NEXT: s_endpgm 1384 %gep = getelementptr i8, i8* %p, i64 -9223372036854767617 1385 %load = load volatile i8, i8* %gep, align 1 1386 store i8 %load, i8* undef 1387 ret void 1388} 1389 1390; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192 1391define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(i8* %p) { 1392; GFX9-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: 1393; GFX9: ; %bb.0: 
1394; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1395; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1396; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1397; GFX9-NEXT: v_mov_b32_e32 v0, s0 1398; GFX9-NEXT: v_mov_b32_e32 v2, s1 1399; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 1400; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1401; GFX9-NEXT: flat_load_ubyte v0, v[0:1] 1402; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1403; GFX9-NEXT: flat_store_byte v[0:1], v0 1404; GFX9-NEXT: s_endpgm 1405; 1406; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: 1407; GFX10: ; %bb.0: 1408; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1409; GFX10-NEXT: ; implicit-def: $vcc_hi 1410; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1411; GFX10-NEXT: s_add_u32 s0, s0, 0x2000 1412; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000 1413; GFX10-NEXT: v_mov_b32_e32 v0, s0 1414; GFX10-NEXT: v_mov_b32_e32 v1, s1 1415; GFX10-NEXT: flat_load_ubyte v0, v[0:1] 1416; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1417; GFX10-NEXT: flat_store_byte v[0:1], v0 1418; GFX10-NEXT: s_endpgm 1419 %gep = getelementptr i8, i8* %p, i64 -9223372036854767616 1420 %load = load volatile i8, i8* %gep, align 1 1421 store i8 %load, i8* undef 1422 ret void 1423} 1424