; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s

; Test splitting global instruction offsets into the low and high bits
; when the offset doesn't fit in the offset field.
;
; What the boundary cases below show: gfx900 folds byte offsets from -4096
; through 4095 straight into the load's immediate, gfx1010 folds -2048 through
; 2047. Anything outside that window is split into an explicit add on the base
; address plus a residual immediate (possibly none).
;
; The *_valu_* functions receive the pointer in VGPRs v[0:1]; the *_salu_*
; kernels load it into s[0:1] with s_load_dwordx2 and address through SGPRs.

; Offset +1: both targets encode it as the immediate offset:1.
define i8 @global_inst_valu_offset_1(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 1
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Offset +2047: still a plain immediate on both targets.
define i8 @global_inst_valu_offset_11bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_11bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_11bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 2047
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Offset +4095: gfx900 keeps it as an immediate; gfx1010 splits it into
; base + 0x800 and offset:2047.
define i8 @global_inst_valu_offset_12bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_12bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_12bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Offset +8191: split on both targets (gfx900: 0x1000 + offset:4095,
; gfx1010: 0x1800 + offset:2047).
define i8 @global_inst_valu_offset_13bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_13bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_13bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1800, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Offset -2048: a plain immediate on both targets.
define i8 @global_inst_valu_offset_neg_11bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_neg_11bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_neg_11bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -2048
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Offset -4096: immediate on gfx900; gfx1010 adds the whole value to the
; base (0xfffff000 low, -1 high) and loads with no immediate.
define i8 @global_inst_valu_offset_neg_12bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_neg_12bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_neg_12bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Offset -8192: both targets add the whole value to the base
; (0xffffe000 low, -1 high) and load with no immediate.
define i8 @global_inst_valu_offset_neg_13bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_neg_13bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_neg_13bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Offset +4095 again (same constant and same expected code as
; global_inst_valu_offset_12bit_max).
define i8 @global_inst_valu_offset_2x_11bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_2x_11bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_2x_11bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Offset +8191 again (same constant and same expected code as
; global_inst_valu_offset_13bit_max).
define i8 @global_inst_valu_offset_2x_12bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_2x_12bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_2x_12bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1800, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Offset +16383: gfx900 splits into 0x3000 + offset:4095, gfx1010 into
; 0x3800 + offset:2047.
define i8 @global_inst_valu_offset_2x_13bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_2x_13bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_2x_13bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x3800, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 16383
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Offset -4096 again (same constant and same expected code as
; global_inst_valu_offset_neg_12bit_max).
define i8 @global_inst_valu_offset_2x_neg_11bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Offset -8192 again (same constant and same expected code as
; global_inst_valu_offset_neg_13bit_max).
define i8 @global_inst_valu_offset_2x_neg_12bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Offset -16384: both targets add the whole value to the base
; (0xffffc000 low, -1 high) and load with no immediate.
define i8 @global_inst_valu_offset_2x_neg_13bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xffffc000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -16384
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 11-bit low-bits (1ull << 33) | 2047
; Both targets add 2 to the high dword (low add is 0) and keep 2047 as the
; immediate.
define i8 @global_inst_valu_offset_64bit_11bit_split0(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_split0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_split0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936639
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 11-bit low-bits (1ull << 33) | 2048
; gfx900 keeps 2048 as the immediate; gfx1010 moves it into the low add
; (0x800) and loads with no immediate.
define i8 @global_inst_valu_offset_64bit_11bit_split1(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_split1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2048
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_split1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936640
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 12-bit low-bits (1ull << 33) | 4095
; gfx900 keeps 4095 as the immediate; gfx1010 splits the low part into
; 0x800 + offset:2047.
define i8 @global_inst_valu_offset_64bit_12bit_split0(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_split0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_split0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938687
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 12-bit low-bits (1ull << 33) | 4096
; Both targets put 0x1000 in the low add and load with no immediate.
define i8 @global_inst_valu_offset_64bit_12bit_split1(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_split1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_split1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938688
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 13-bit low-bits (1ull << 33) | 8191
; gfx900 splits the low part into 0x1000 + offset:4095, gfx1010 into
; 0x1800 + offset:2047.
define i8 @global_inst_valu_offset_64bit_13bit_split0(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_split0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_split0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1800, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942783
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 13-bit low-bits (1ull << 33) | 8192
; Both targets put 0x2000 in the low add and load with no immediate.
define i8 @global_inst_valu_offset_64bit_13bit_split1(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_split1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_split1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942784
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047
; The low part is rounded up and a negative immediate brings it back:
; gfx900 uses 0x1000 with offset:-2049, gfx1010 uses 0x800 with offset:-1.
; gfx900 materialises the high dword with v_bfrev_b32 of 1; gfx1010 passes
; 0x80000000 as a literal to the carry add.
define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2049
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773761
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048
; gfx900 adds 0x1000 and loads at offset:-2048; gfx1010 adds 0x800 and
; loads with no immediate.
define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773760
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095
; Both targets add 0x1000 to the low dword and load at offset:-1.
define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771713
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096
; Both targets add 0x1000 to the low dword and load with no immediate.
define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771712
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191
; Both targets add 0x2000 to the low dword and load at offset:-1.
define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767617
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192
; Both targets add 0x2000 to the low dword and load with no immediate.
define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767616
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Kernel variant, offset +1: the base stays in s[0:1] with a zero VGPR
; offset, and the constant is the immediate offset:1 on both targets.
define amdgpu_kernel void @global_inst_salu_offset_1(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 1
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

; Kernel variant, offset +2047: immediate on both targets.
define amdgpu_kernel void @global_inst_salu_offset_11bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_11bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_11bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 2047
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

; Kernel variant, offset +4095: immediate on gfx900; gfx1010 carries 0x800
; in the VGPR offset and 2047 in the immediate.
define amdgpu_kernel void @global_inst_salu_offset_12bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_12bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_12bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

; Kernel variant, offset +8191: the excess goes into the VGPR offset
; (gfx900: 0x1000 + offset:4095, gfx1010: 0x1800 + offset:2047).
define amdgpu_kernel void @global_inst_salu_offset_13bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_13bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x1000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_13bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0x1800
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

; Kernel variant, offset -2048: immediate on both targets.
define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_neg_11bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-2048
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_neg_11bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-2048
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -2048
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

; Kernel variant, offset -4096: immediate on gfx900; gfx1010 computes the
; full 64-bit address into v[0:1] and loads with no immediate.
define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_neg_12bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-4096
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_neg_12bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xfffff000, s0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

; Kernel variant, offset -8192: both targets compute the full 64-bit
; address into v[0:1] and load with no immediate.
define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_neg_13bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_neg_13bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xffffe000, s0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

; Kernel variant, offset +4095 again (same constant and same expected code
; as global_inst_salu_offset_12bit_max).
define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_11bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_2x_11bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

; Kernel variant, offset +8191 again (same constant and same expected code
; as global_inst_salu_offset_13bit_max).
define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_12bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x1000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_2x_12bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0x1800
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_13bit_max:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_2x_13bit_max:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0x3800
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p,
i64 16383 901 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 902 store i8 %load, i8 addrspace(1)* undef 903 ret void 904} 905 906define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(i8 addrspace(1)* %p) { 907; GFX9-LABEL: global_inst_salu_offset_2x_neg_11bit_max: 908; GFX9: ; %bb.0: 909; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 910; GFX9-NEXT: v_mov_b32_e32 v0, 0 911; GFX9-NEXT: s_waitcnt lgkmcnt(0) 912; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-4096 913; GFX9-NEXT: s_waitcnt vmcnt(0) 914; GFX9-NEXT: global_store_byte v[0:1], v0, off 915; GFX9-NEXT: s_endpgm 916; 917; GFX10-LABEL: global_inst_salu_offset_2x_neg_11bit_max: 918; GFX10: ; %bb.0: 919; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 920; GFX10-NEXT: ; implicit-def: $vcc_hi 921; GFX10-NEXT: s_waitcnt lgkmcnt(0) 922; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xfffff000, s0 923; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 924; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 925; GFX10-NEXT: s_waitcnt vmcnt(0) 926; GFX10-NEXT: global_store_byte v[0:1], v0, off 927; GFX10-NEXT: s_endpgm 928 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096 929 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 930 store i8 %load, i8 addrspace(1)* undef 931 ret void 932} 933 934define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(i8 addrspace(1)* %p) { 935; GFX9-LABEL: global_inst_salu_offset_2x_neg_12bit_max: 936; GFX9: ; %bb.0: 937; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 938; GFX9-NEXT: s_waitcnt lgkmcnt(0) 939; GFX9-NEXT: v_mov_b32_e32 v0, s0 940; GFX9-NEXT: v_mov_b32_e32 v1, s1 941; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 942; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 943; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 944; GFX9-NEXT: s_waitcnt vmcnt(0) 945; GFX9-NEXT: global_store_byte v[0:1], v0, off 946; GFX9-NEXT: s_endpgm 947; 948; GFX10-LABEL: global_inst_salu_offset_2x_neg_12bit_max: 949; GFX10: ; %bb.0: 950; GFX10-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 951; GFX10-NEXT: ; implicit-def: $vcc_hi 952; GFX10-NEXT: s_waitcnt lgkmcnt(0) 953; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xffffe000, s0 954; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 955; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 956; GFX10-NEXT: s_waitcnt vmcnt(0) 957; GFX10-NEXT: global_store_byte v[0:1], v0, off 958; GFX10-NEXT: s_endpgm 959 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192 960 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 961 store i8 %load, i8 addrspace(1)* undef 962 ret void 963} 964 965define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(i8 addrspace(1)* %p) { 966; GFX9-LABEL: global_inst_salu_offset_2x_neg_13bit_max: 967; GFX9: ; %bb.0: 968; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 969; GFX9-NEXT: s_waitcnt lgkmcnt(0) 970; GFX9-NEXT: v_mov_b32_e32 v0, s0 971; GFX9-NEXT: v_mov_b32_e32 v1, s1 972; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0 973; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 974; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 975; GFX9-NEXT: s_waitcnt vmcnt(0) 976; GFX9-NEXT: global_store_byte v[0:1], v0, off 977; GFX9-NEXT: s_endpgm 978; 979; GFX10-LABEL: global_inst_salu_offset_2x_neg_13bit_max: 980; GFX10: ; %bb.0: 981; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 982; GFX10-NEXT: ; implicit-def: $vcc_hi 983; GFX10-NEXT: s_waitcnt lgkmcnt(0) 984; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xffffc000, s0 985; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 986; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 987; GFX10-NEXT: s_waitcnt vmcnt(0) 988; GFX10-NEXT: global_store_byte v[0:1], v0, off 989; GFX10-NEXT: s_endpgm 990 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -16384 991 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 992 store i8 %load, i8 addrspace(1)* undef 993 ret void 994} 995 996; Fill 11-bit low-bits (1ull << 33) | 2047 997define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(i8 
addrspace(1)* %p) { 998; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split0: 999; GFX9: ; %bb.0: 1000; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1001; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1002; GFX9-NEXT: v_mov_b32_e32 v1, s1 1003; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 1004; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1005; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 1006; GFX9-NEXT: s_waitcnt vmcnt(0) 1007; GFX9-NEXT: global_store_byte v[0:1], v0, off 1008; GFX9-NEXT: s_endpgm 1009; 1010; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_split0: 1011; GFX10: ; %bb.0: 1012; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1013; GFX10-NEXT: ; implicit-def: $vcc_hi 1014; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1015; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0, s0 1016; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 1017; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 1018; GFX10-NEXT: s_waitcnt vmcnt(0) 1019; GFX10-NEXT: global_store_byte v[0:1], v0, off 1020; GFX10-NEXT: s_endpgm 1021 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936639 1022 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1023 store i8 %load, i8 addrspace(1)* undef 1024 ret void 1025} 1026 1027; Fill 11-bit low-bits (1ull << 33) | 2048 1028define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(i8 addrspace(1)* %p) { 1029; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split1: 1030; GFX9: ; %bb.0: 1031; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1032; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1033; GFX9-NEXT: v_mov_b32_e32 v1, s1 1034; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 1035; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1036; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2048 1037; GFX9-NEXT: s_waitcnt vmcnt(0) 1038; GFX9-NEXT: global_store_byte v[0:1], v0, off 1039; GFX9-NEXT: s_endpgm 1040; 1041; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_split1: 1042; GFX10: ; %bb.0: 1043; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 
1044; GFX10-NEXT: ; implicit-def: $vcc_hi 1045; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1046; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x800, s0 1047; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 1048; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 1049; GFX10-NEXT: s_waitcnt vmcnt(0) 1050; GFX10-NEXT: global_store_byte v[0:1], v0, off 1051; GFX10-NEXT: s_endpgm 1052 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936640 1053 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1054 store i8 %load, i8 addrspace(1)* undef 1055 ret void 1056} 1057 1058; Fill 12-bit low-bits (1ull << 33) | 4095 1059define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(i8 addrspace(1)* %p) { 1060; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split0: 1061; GFX9: ; %bb.0: 1062; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1063; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1064; GFX9-NEXT: v_mov_b32_e32 v1, s1 1065; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 1066; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1067; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 1068; GFX9-NEXT: s_waitcnt vmcnt(0) 1069; GFX9-NEXT: global_store_byte v[0:1], v0, off 1070; GFX9-NEXT: s_endpgm 1071; 1072; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_split0: 1073; GFX10: ; %bb.0: 1074; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1075; GFX10-NEXT: ; implicit-def: $vcc_hi 1076; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1077; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x800, s0 1078; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 1079; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 1080; GFX10-NEXT: s_waitcnt vmcnt(0) 1081; GFX10-NEXT: global_store_byte v[0:1], v0, off 1082; GFX10-NEXT: s_endpgm 1083 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938687 1084 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1085 store i8 %load, i8 addrspace(1)* undef 1086 ret void 1087} 1088 1089; Fill 12-bit low-bits (1ull << 33) | 4096 1090define amdgpu_kernel void 
@global_inst_salu_offset_64bit_12bit_split1(i8 addrspace(1)* %p) { 1091; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split1: 1092; GFX9: ; %bb.0: 1093; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1094; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1095; GFX9-NEXT: v_mov_b32_e32 v0, s0 1096; GFX9-NEXT: v_mov_b32_e32 v1, s1 1097; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 1098; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1099; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 1100; GFX9-NEXT: s_waitcnt vmcnt(0) 1101; GFX9-NEXT: global_store_byte v[0:1], v0, off 1102; GFX9-NEXT: s_endpgm 1103; 1104; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_split1: 1105; GFX10: ; %bb.0: 1106; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1107; GFX10-NEXT: ; implicit-def: $vcc_hi 1108; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1109; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x1000, s0 1110; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 1111; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 1112; GFX10-NEXT: s_waitcnt vmcnt(0) 1113; GFX10-NEXT: global_store_byte v[0:1], v0, off 1114; GFX10-NEXT: s_endpgm 1115 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938688 1116 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1117 store i8 %load, i8 addrspace(1)* undef 1118 ret void 1119} 1120 1121; Fill 13-bit low-bits (1ull << 33) | 8191 1122define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(i8 addrspace(1)* %p) { 1123; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split0: 1124; GFX9: ; %bb.0: 1125; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1126; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1127; GFX9-NEXT: v_mov_b32_e32 v0, s0 1128; GFX9-NEXT: v_mov_b32_e32 v1, s1 1129; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 1130; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1131; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 1132; GFX9-NEXT: s_waitcnt vmcnt(0) 1133; GFX9-NEXT: global_store_byte v[0:1], v0, off 1134; GFX9-NEXT: s_endpgm 1135; 1136; GFX10-LABEL: 
global_inst_salu_offset_64bit_13bit_split0: 1137; GFX10: ; %bb.0: 1138; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1139; GFX10-NEXT: ; implicit-def: $vcc_hi 1140; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1141; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x1800, s0 1142; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 1143; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 1144; GFX10-NEXT: s_waitcnt vmcnt(0) 1145; GFX10-NEXT: global_store_byte v[0:1], v0, off 1146; GFX10-NEXT: s_endpgm 1147 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942783 1148 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1149 store i8 %load, i8 addrspace(1)* undef 1150 ret void 1151} 1152 1153; Fill 13-bit low-bits (1ull << 33) | 8192 1154define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(i8 addrspace(1)* %p) { 1155; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split1: 1156; GFX9: ; %bb.0: 1157; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1158; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1159; GFX9-NEXT: v_mov_b32_e32 v0, s0 1160; GFX9-NEXT: v_mov_b32_e32 v1, s1 1161; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 1162; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1163; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 1164; GFX9-NEXT: s_waitcnt vmcnt(0) 1165; GFX9-NEXT: global_store_byte v[0:1], v0, off 1166; GFX9-NEXT: s_endpgm 1167; 1168; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_split1: 1169; GFX10: ; %bb.0: 1170; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1171; GFX10-NEXT: ; implicit-def: $vcc_hi 1172; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1173; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x2000, s0 1174; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 1175; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 1176; GFX10-NEXT: s_waitcnt vmcnt(0) 1177; GFX10-NEXT: global_store_byte v[0:1], v0, off 1178; GFX10-NEXT: s_endpgm 1179 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942784 1180 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1181 store 
i8 %load, i8 addrspace(1)* undef 1182 ret void 1183} 1184 1185; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047 1186define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(i8 addrspace(1)* %p) { 1187; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: 1188; GFX9: ; %bb.0: 1189; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1190; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1191; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1192; GFX9-NEXT: v_mov_b32_e32 v0, s0 1193; GFX9-NEXT: v_mov_b32_e32 v2, s1 1194; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 1195; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1196; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2049 1197; GFX9-NEXT: s_waitcnt vmcnt(0) 1198; GFX9-NEXT: global_store_byte v[0:1], v0, off 1199; GFX9-NEXT: s_endpgm 1200; 1201; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: 1202; GFX10: ; %bb.0: 1203; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1204; GFX10-NEXT: ; implicit-def: $vcc_hi 1205; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1206; GFX10-NEXT: v_mov_b32_e32 v1, s1 1207; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, s0 1208; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo 1209; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 1210; GFX10-NEXT: s_waitcnt vmcnt(0) 1211; GFX10-NEXT: global_store_byte v[0:1], v0, off 1212; GFX10-NEXT: s_endpgm 1213 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773761 1214 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1215 store i8 %load, i8 addrspace(1)* undef 1216 ret void 1217} 1218 1219; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048 1220define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(i8 addrspace(1)* %p) { 1221; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: 1222; GFX9: ; %bb.0: 1223; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1224; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1225; GFX9-NEXT: s_waitcnt lgkmcnt(0) 
1226; GFX9-NEXT: v_mov_b32_e32 v0, s0 1227; GFX9-NEXT: v_mov_b32_e32 v2, s1 1228; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 1229; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1230; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 1231; GFX9-NEXT: s_waitcnt vmcnt(0) 1232; GFX9-NEXT: global_store_byte v[0:1], v0, off 1233; GFX9-NEXT: s_endpgm 1234; 1235; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: 1236; GFX10: ; %bb.0: 1237; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1238; GFX10-NEXT: ; implicit-def: $vcc_hi 1239; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1240; GFX10-NEXT: v_mov_b32_e32 v1, s1 1241; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, s0 1242; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo 1243; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 1244; GFX10-NEXT: s_waitcnt vmcnt(0) 1245; GFX10-NEXT: global_store_byte v[0:1], v0, off 1246; GFX10-NEXT: s_endpgm 1247 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773760 1248 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1249 store i8 %load, i8 addrspace(1)* undef 1250 ret void 1251} 1252 1253; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095 1254define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(i8 addrspace(1)* %p) { 1255; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: 1256; GFX9: ; %bb.0: 1257; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1258; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1259; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1260; GFX9-NEXT: v_mov_b32_e32 v0, s0 1261; GFX9-NEXT: v_mov_b32_e32 v2, s1 1262; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 1263; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1264; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 1265; GFX9-NEXT: s_waitcnt vmcnt(0) 1266; GFX9-NEXT: global_store_byte v[0:1], v0, off 1267; GFX9-NEXT: s_endpgm 1268; 1269; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: 1270; GFX10: ; %bb.0: 
1271; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1272; GFX10-NEXT: ; implicit-def: $vcc_hi 1273; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1274; GFX10-NEXT: v_mov_b32_e32 v1, s1 1275; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, s0 1276; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo 1277; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 1278; GFX10-NEXT: s_waitcnt vmcnt(0) 1279; GFX10-NEXT: global_store_byte v[0:1], v0, off 1280; GFX10-NEXT: s_endpgm 1281 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771713 1282 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1283 store i8 %load, i8 addrspace(1)* undef 1284 ret void 1285} 1286 1287; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096 1288define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(i8 addrspace(1)* %p) { 1289; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: 1290; GFX9: ; %bb.0: 1291; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1292; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1293; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1294; GFX9-NEXT: v_mov_b32_e32 v0, s0 1295; GFX9-NEXT: v_mov_b32_e32 v2, s1 1296; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 1297; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1298; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 1299; GFX9-NEXT: s_waitcnt vmcnt(0) 1300; GFX9-NEXT: global_store_byte v[0:1], v0, off 1301; GFX9-NEXT: s_endpgm 1302; 1303; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: 1304; GFX10: ; %bb.0: 1305; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1306; GFX10-NEXT: ; implicit-def: $vcc_hi 1307; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1308; GFX10-NEXT: v_mov_b32_e32 v1, s1 1309; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, s0 1310; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo 1311; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 1312; GFX10-NEXT: s_waitcnt vmcnt(0) 1313; GFX10-NEXT: global_store_byte v[0:1], v0, off 1314; GFX10-NEXT: 
s_endpgm 1315 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771712 1316 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1317 store i8 %load, i8 addrspace(1)* undef 1318 ret void 1319} 1320 1321; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191 1322define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(i8 addrspace(1)* %p) { 1323; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: 1324; GFX9: ; %bb.0: 1325; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1326; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1327; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1328; GFX9-NEXT: v_mov_b32_e32 v0, s0 1329; GFX9-NEXT: v_mov_b32_e32 v2, s1 1330; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 1331; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1332; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 1333; GFX9-NEXT: s_waitcnt vmcnt(0) 1334; GFX9-NEXT: global_store_byte v[0:1], v0, off 1335; GFX9-NEXT: s_endpgm 1336; 1337; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: 1338; GFX10: ; %bb.0: 1339; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1340; GFX10-NEXT: ; implicit-def: $vcc_hi 1341; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1342; GFX10-NEXT: v_mov_b32_e32 v1, s1 1343; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, s0 1344; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo 1345; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 1346; GFX10-NEXT: s_waitcnt vmcnt(0) 1347; GFX10-NEXT: global_store_byte v[0:1], v0, off 1348; GFX10-NEXT: s_endpgm 1349 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767617 1350 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1351 store i8 %load, i8 addrspace(1)* undef 1352 ret void 1353} 1354 1355; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192 1356define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(i8 addrspace(1)* %p) { 1357; GFX9-LABEL: 
global_inst_salu_offset_64bit_13bit_neg_high_split1: 1358; GFX9: ; %bb.0: 1359; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1360; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1361; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1362; GFX9-NEXT: v_mov_b32_e32 v0, s0 1363; GFX9-NEXT: v_mov_b32_e32 v2, s1 1364; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 1365; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1366; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 1367; GFX9-NEXT: s_waitcnt vmcnt(0) 1368; GFX9-NEXT: global_store_byte v[0:1], v0, off 1369; GFX9-NEXT: s_endpgm 1370; 1371; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: 1372; GFX10: ; %bb.0: 1373; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1374; GFX10-NEXT: ; implicit-def: $vcc_hi 1375; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1376; GFX10-NEXT: v_mov_b32_e32 v1, s1 1377; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, s0 1378; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo 1379; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 1380; GFX10-NEXT: s_waitcnt vmcnt(0) 1381; GFX10-NEXT: global_store_byte v[0:1], v0, off 1382; GFX10-NEXT: s_endpgm 1383 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767616 1384 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1385 store i8 %load, i8 addrspace(1)* undef 1386 ret void 1387} 1388