; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s

; Test splitting global instruction offsets into the low and high bits
; when the offset doesn't fit in the offset field.
;
; As the expected output below shows, gfx900 folds constant offsets in
; [-4096, 4095] straight into the instruction's offset field, while gfx1010
; only folds [-2048, 2047]. Anything outside that window is added to the
; 64-bit base pointer first and only the remainder stays in the offset field.

; VALU cases: ordinary (non-kernel) functions. The expected code uses the
; incoming pointer directly from VGPRs v[0:1].

define i8 @global_inst_valu_offset_1(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 1
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

define i8 @global_inst_valu_offset_11bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_11bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_11bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 2047
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

define i8 @global_inst_valu_offset_12bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_12bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_12bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

define i8 @global_inst_valu_offset_13bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_13bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_13bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1800, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

define i8 @global_inst_valu_offset_neg_11bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_neg_11bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_neg_11bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -2048
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

define i8 @global_inst_valu_offset_neg_12bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_neg_12bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4096
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_neg_12bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

define i8 @global_inst_valu_offset_neg_13bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_neg_13bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_neg_13bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; The 2x_<N>bit cases use 2^(N+1)-1 for the positive offsets and -2^(N+1) for
; the negative ones, i.e. the same constants as the (N+1)-bit cases above
; (with 16383 / -16384 added for N = 13).

define i8 @global_inst_valu_offset_2x_11bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_2x_11bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_2x_11bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

define i8 @global_inst_valu_offset_2x_12bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_2x_12bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_2x_12bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1800, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

define i8 @global_inst_valu_offset_2x_13bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_2x_13bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_2x_13bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x3800, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 16383
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

define i8 @global_inst_valu_offset_2x_neg_11bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4096
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

define i8 @global_inst_valu_offset_2x_neg_12bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

define i8 @global_inst_valu_offset_2x_neg_13bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffc000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0xffffc000, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -16384
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 11-bit low-bits (1ull << 33) | 2047
define i8 @global_inst_valu_offset_64bit_11bit_split0(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_split0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_split0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936639
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 11-bit low-bits (1ull << 33) | 2048
define i8 @global_inst_valu_offset_64bit_11bit_split1(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_split1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2048
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_split1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936640
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 12-bit low-bits (1ull << 33) | 4095
define i8 @global_inst_valu_offset_64bit_12bit_split0(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_split0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_split0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938687
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 12-bit low-bits (1ull << 33) | 4096
define i8 @global_inst_valu_offset_64bit_12bit_split1(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_split1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_split1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938688
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 13-bit low-bits (1ull << 33) | 8191
define i8 @global_inst_valu_offset_64bit_13bit_split0(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_split0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_split0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1800, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942783
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 13-bit low-bits (1ull << 33) | 8192
define i8 @global_inst_valu_offset_64bit_13bit_split1(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_split1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_split1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942784
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047
define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773761
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048
define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2048
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773760
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095
define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771713
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096
define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4096
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771712
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191
define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767617
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192
define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x2000, v0
; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767616
  %load = load i8, i8 addrspace(1)* %gep, align 4
  ret i8 %load
}

; SALU cases: amdgpu_kernel entry points. The expected code first loads the
; pointer into SGPRs s[0:1] with s_load_dwordx2 and then moves (or adds) it
; into VGPRs v[0:1] before the global load. The loads are volatile and the
; loaded byte is stored to an undef pointer.

define amdgpu_kernel void @global_inst_salu_offset_1(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_byte v[0:1], v0, off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_byte v[0:1], v0, off
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 1
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

define amdgpu_kernel void @global_inst_salu_offset_11bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_11bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_byte v[0:1], v0, off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_11bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_byte v[0:1], v0, off
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 2047
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

define amdgpu_kernel void @global_inst_salu_offset_12bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_12bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_byte v[0:1], v0, off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_12bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0x800, s0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 0, s1, s0
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_byte v[0:1], v0, off
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

define amdgpu_kernel void @global_inst_salu_offset_13bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_13bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_byte v[0:1], v0, off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_13bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0x1800, s0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 0, s1, s0
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_byte v[0:1], v0, off
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_neg_11bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_byte v[0:1], v0, off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_neg_11bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2048
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_byte v[0:1], v0, off
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -2048
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_neg_12bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4096
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_byte v[0:1], v0, off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_neg_12bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0xfffff000, s0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_byte v[0:1], v0, off
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_neg_13bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_byte v[0:1], v0, off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_neg_13bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0xffffe000, s0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_byte v[0:1], v0, off
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_11bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_byte v[0:1], v0, off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_2x_11bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0x800, s0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 0, s1, s0
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_byte v[0:1], v0, off
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}

define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(i8 addrspace(1)* %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_12bit_max:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_byte v[0:1], v0, off
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_2x_12bit_max:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_add_co_u32_e64 v0, s0, 0x1800, s0
; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s0, 0, s1, s0
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_byte v[0:1], v0, off
; GFX10-NEXT:    s_endpgm
  %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191
  %load = load volatile i8, i8 addrspace(1)* %gep, align 1
  store i8 %load, i8 addrspace(1)* undef
  ret void
}
897 898define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(i8 addrspace(1)* %p) { 899; GFX9-LABEL: global_inst_salu_offset_2x_13bit_max: 900; GFX9: ; %bb.0: 901; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 902; GFX9-NEXT: s_waitcnt lgkmcnt(0) 903; GFX9-NEXT: v_mov_b32_e32 v0, s0 904; GFX9-NEXT: v_mov_b32_e32 v1, s1 905; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 906; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 907; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 908; GFX9-NEXT: s_waitcnt vmcnt(0) 909; GFX9-NEXT: global_store_byte v[0:1], v0, off 910; GFX9-NEXT: s_endpgm 911; 912; GFX10-LABEL: global_inst_salu_offset_2x_13bit_max: 913; GFX10: ; %bb.0: 914; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 915; GFX10-NEXT: ; implicit-def: $vcc_hi 916; GFX10-NEXT: s_waitcnt lgkmcnt(0) 917; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x3800, s0 918; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s1, s0 919; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 920; GFX10-NEXT: s_waitcnt vmcnt(0) 921; GFX10-NEXT: global_store_byte v[0:1], v0, off 922; GFX10-NEXT: s_endpgm 923 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 16383 924 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 925 store i8 %load, i8 addrspace(1)* undef 926 ret void 927} 928 929define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(i8 addrspace(1)* %p) { 930; GFX9-LABEL: global_inst_salu_offset_2x_neg_11bit_max: 931; GFX9: ; %bb.0: 932; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 933; GFX9-NEXT: s_waitcnt lgkmcnt(0) 934; GFX9-NEXT: v_mov_b32_e32 v0, s0 935; GFX9-NEXT: v_mov_b32_e32 v1, s1 936; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096 937; GFX9-NEXT: s_waitcnt vmcnt(0) 938; GFX9-NEXT: global_store_byte v[0:1], v0, off 939; GFX9-NEXT: s_endpgm 940; 941; GFX10-LABEL: global_inst_salu_offset_2x_neg_11bit_max: 942; GFX10: ; %bb.0: 943; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 944; GFX10-NEXT: ; implicit-def: $vcc_hi 945; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) 946; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xfffff000, s0 947; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 948; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 949; GFX10-NEXT: s_waitcnt vmcnt(0) 950; GFX10-NEXT: global_store_byte v[0:1], v0, off 951; GFX10-NEXT: s_endpgm 952 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096 953 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 954 store i8 %load, i8 addrspace(1)* undef 955 ret void 956} 957 958define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(i8 addrspace(1)* %p) { 959; GFX9-LABEL: global_inst_salu_offset_2x_neg_12bit_max: 960; GFX9: ; %bb.0: 961; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 962; GFX9-NEXT: s_waitcnt lgkmcnt(0) 963; GFX9-NEXT: v_mov_b32_e32 v0, s0 964; GFX9-NEXT: v_mov_b32_e32 v1, s1 965; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 966; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 967; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 968; GFX9-NEXT: s_waitcnt vmcnt(0) 969; GFX9-NEXT: global_store_byte v[0:1], v0, off 970; GFX9-NEXT: s_endpgm 971; 972; GFX10-LABEL: global_inst_salu_offset_2x_neg_12bit_max: 973; GFX10: ; %bb.0: 974; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 975; GFX10-NEXT: ; implicit-def: $vcc_hi 976; GFX10-NEXT: s_waitcnt lgkmcnt(0) 977; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xffffe000, s0 978; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 979; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 980; GFX10-NEXT: s_waitcnt vmcnt(0) 981; GFX10-NEXT: global_store_byte v[0:1], v0, off 982; GFX10-NEXT: s_endpgm 983 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192 984 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 985 store i8 %load, i8 addrspace(1)* undef 986 ret void 987} 988 989define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(i8 addrspace(1)* %p) { 990; GFX9-LABEL: global_inst_salu_offset_2x_neg_13bit_max: 991; GFX9: ; %bb.0: 992; GFX9-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 993; GFX9-NEXT: s_waitcnt lgkmcnt(0) 994; GFX9-NEXT: v_mov_b32_e32 v0, s0 995; GFX9-NEXT: v_mov_b32_e32 v1, s1 996; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0 997; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 998; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 999; GFX9-NEXT: s_waitcnt vmcnt(0) 1000; GFX9-NEXT: global_store_byte v[0:1], v0, off 1001; GFX9-NEXT: s_endpgm 1002; 1003; GFX10-LABEL: global_inst_salu_offset_2x_neg_13bit_max: 1004; GFX10: ; %bb.0: 1005; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1006; GFX10-NEXT: ; implicit-def: $vcc_hi 1007; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1008; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xffffc000, s0 1009; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0 1010; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 1011; GFX10-NEXT: s_waitcnt vmcnt(0) 1012; GFX10-NEXT: global_store_byte v[0:1], v0, off 1013; GFX10-NEXT: s_endpgm 1014 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -16384 1015 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1016 store i8 %load, i8 addrspace(1)* undef 1017 ret void 1018} 1019 1020; Fill 11-bit low-bits (1ull << 33) | 2047 1021define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(i8 addrspace(1)* %p) { 1022; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split0: 1023; GFX9: ; %bb.0: 1024; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1025; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1026; GFX9-NEXT: v_mov_b32_e32 v1, s1 1027; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 1028; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1029; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 1030; GFX9-NEXT: s_waitcnt vmcnt(0) 1031; GFX9-NEXT: global_store_byte v[0:1], v0, off 1032; GFX9-NEXT: s_endpgm 1033; 1034; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_split0: 1035; GFX10: ; %bb.0: 1036; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1037; GFX10-NEXT: ; implicit-def: $vcc_hi 1038; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1039; GFX10-NEXT: v_add_co_u32_e64 v0, 
s0, 0, s0 1040; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 1041; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 1042; GFX10-NEXT: s_waitcnt vmcnt(0) 1043; GFX10-NEXT: global_store_byte v[0:1], v0, off 1044; GFX10-NEXT: s_endpgm 1045 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936639 1046 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1047 store i8 %load, i8 addrspace(1)* undef 1048 ret void 1049} 1050 1051; Fill 11-bit low-bits (1ull << 33) | 2048 1052define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(i8 addrspace(1)* %p) { 1053; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split1: 1054; GFX9: ; %bb.0: 1055; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1056; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1057; GFX9-NEXT: v_mov_b32_e32 v1, s1 1058; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 1059; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1060; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2048 1061; GFX9-NEXT: s_waitcnt vmcnt(0) 1062; GFX9-NEXT: global_store_byte v[0:1], v0, off 1063; GFX9-NEXT: s_endpgm 1064; 1065; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_split1: 1066; GFX10: ; %bb.0: 1067; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1068; GFX10-NEXT: ; implicit-def: $vcc_hi 1069; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1070; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x800, s0 1071; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 1072; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 1073; GFX10-NEXT: s_waitcnt vmcnt(0) 1074; GFX10-NEXT: global_store_byte v[0:1], v0, off 1075; GFX10-NEXT: s_endpgm 1076 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936640 1077 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1078 store i8 %load, i8 addrspace(1)* undef 1079 ret void 1080} 1081 1082; Fill 12-bit low-bits (1ull << 33) | 4095 1083define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(i8 addrspace(1)* %p) { 1084; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split0: 1085; GFX9: ; 
%bb.0: 1086; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1087; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1088; GFX9-NEXT: v_mov_b32_e32 v1, s1 1089; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 1090; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1091; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 1092; GFX9-NEXT: s_waitcnt vmcnt(0) 1093; GFX9-NEXT: global_store_byte v[0:1], v0, off 1094; GFX9-NEXT: s_endpgm 1095; 1096; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_split0: 1097; GFX10: ; %bb.0: 1098; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1099; GFX10-NEXT: ; implicit-def: $vcc_hi 1100; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1101; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x800, s0 1102; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 1103; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 1104; GFX10-NEXT: s_waitcnt vmcnt(0) 1105; GFX10-NEXT: global_store_byte v[0:1], v0, off 1106; GFX10-NEXT: s_endpgm 1107 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938687 1108 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1109 store i8 %load, i8 addrspace(1)* undef 1110 ret void 1111} 1112 1113; Fill 12-bit low-bits (1ull << 33) | 4096 1114define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(i8 addrspace(1)* %p) { 1115; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split1: 1116; GFX9: ; %bb.0: 1117; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1118; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1119; GFX9-NEXT: v_mov_b32_e32 v0, s0 1120; GFX9-NEXT: v_mov_b32_e32 v1, s1 1121; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 1122; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1123; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 1124; GFX9-NEXT: s_waitcnt vmcnt(0) 1125; GFX9-NEXT: global_store_byte v[0:1], v0, off 1126; GFX9-NEXT: s_endpgm 1127; 1128; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_split1: 1129; GFX10: ; %bb.0: 1130; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1131; GFX10-NEXT: ; implicit-def: $vcc_hi 1132; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) 1133; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x1000, s0 1134; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 1135; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 1136; GFX10-NEXT: s_waitcnt vmcnt(0) 1137; GFX10-NEXT: global_store_byte v[0:1], v0, off 1138; GFX10-NEXT: s_endpgm 1139 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938688 1140 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1141 store i8 %load, i8 addrspace(1)* undef 1142 ret void 1143} 1144 1145; Fill 13-bit low-bits (1ull << 33) | 8191 1146define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(i8 addrspace(1)* %p) { 1147; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split0: 1148; GFX9: ; %bb.0: 1149; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1150; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1151; GFX9-NEXT: v_mov_b32_e32 v0, s0 1152; GFX9-NEXT: v_mov_b32_e32 v1, s1 1153; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 1154; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1155; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 1156; GFX9-NEXT: s_waitcnt vmcnt(0) 1157; GFX9-NEXT: global_store_byte v[0:1], v0, off 1158; GFX9-NEXT: s_endpgm 1159; 1160; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_split0: 1161; GFX10: ; %bb.0: 1162; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1163; GFX10-NEXT: ; implicit-def: $vcc_hi 1164; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1165; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x1800, s0 1166; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 1167; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 1168; GFX10-NEXT: s_waitcnt vmcnt(0) 1169; GFX10-NEXT: global_store_byte v[0:1], v0, off 1170; GFX10-NEXT: s_endpgm 1171 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942783 1172 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1173 store i8 %load, i8 addrspace(1)* undef 1174 ret void 1175} 1176 1177; Fill 13-bit low-bits (1ull << 33) | 8192 1178define amdgpu_kernel void 
@global_inst_salu_offset_64bit_13bit_split1(i8 addrspace(1)* %p) { 1179; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split1: 1180; GFX9: ; %bb.0: 1181; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1182; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1183; GFX9-NEXT: v_mov_b32_e32 v0, s0 1184; GFX9-NEXT: v_mov_b32_e32 v1, s1 1185; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 1186; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc 1187; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 1188; GFX9-NEXT: s_waitcnt vmcnt(0) 1189; GFX9-NEXT: global_store_byte v[0:1], v0, off 1190; GFX9-NEXT: s_endpgm 1191; 1192; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_split1: 1193; GFX10: ; %bb.0: 1194; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1195; GFX10-NEXT: ; implicit-def: $vcc_hi 1196; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1197; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x2000, s0 1198; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0 1199; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 1200; GFX10-NEXT: s_waitcnt vmcnt(0) 1201; GFX10-NEXT: global_store_byte v[0:1], v0, off 1202; GFX10-NEXT: s_endpgm 1203 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942784 1204 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1205 store i8 %load, i8 addrspace(1)* undef 1206 ret void 1207} 1208 1209; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2047 1210define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(i8 addrspace(1)* %p) { 1211; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: 1212; GFX9: ; %bb.0: 1213; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1214; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1215; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1216; GFX9-NEXT: v_mov_b32_e32 v2, s1 1217; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 1218; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1219; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 1220; GFX9-NEXT: s_waitcnt vmcnt(0) 1221; GFX9-NEXT: global_store_byte v[0:1], v0, off 1222; GFX9-NEXT: 
s_endpgm 1223; 1224; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: 1225; GFX10: ; %bb.0: 1226; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1227; GFX10-NEXT: ; implicit-def: $vcc_hi 1228; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1229; GFX10-NEXT: v_mov_b32_e32 v1, s1 1230; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0, s0 1231; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo 1232; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 1233; GFX10-NEXT: s_waitcnt vmcnt(0) 1234; GFX10-NEXT: global_store_byte v[0:1], v0, off 1235; GFX10-NEXT: s_endpgm 1236 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773761 1237 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1238 store i8 %load, i8 addrspace(1)* undef 1239 ret void 1240} 1241 1242; Fill 11-bit low-bits, negative high bits (1ull << 63) | 2048 1243define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(i8 addrspace(1)* %p) { 1244; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: 1245; GFX9: ; %bb.0: 1246; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1247; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1248; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1249; GFX9-NEXT: v_mov_b32_e32 v2, s1 1250; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 1251; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1252; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2048 1253; GFX9-NEXT: s_waitcnt vmcnt(0) 1254; GFX9-NEXT: global_store_byte v[0:1], v0, off 1255; GFX9-NEXT: s_endpgm 1256; 1257; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: 1258; GFX10: ; %bb.0: 1259; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1260; GFX10-NEXT: ; implicit-def: $vcc_hi 1261; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1262; GFX10-NEXT: v_mov_b32_e32 v1, s1 1263; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, s0 1264; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo 1265; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 1266; GFX10-NEXT: 
s_waitcnt vmcnt(0) 1267; GFX10-NEXT: global_store_byte v[0:1], v0, off 1268; GFX10-NEXT: s_endpgm 1269 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773760 1270 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1271 store i8 %load, i8 addrspace(1)* undef 1272 ret void 1273} 1274 1275; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4095 1276define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(i8 addrspace(1)* %p) { 1277; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: 1278; GFX9: ; %bb.0: 1279; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1280; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1281; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1282; GFX9-NEXT: v_mov_b32_e32 v2, s1 1283; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0 1284; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1285; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 1286; GFX9-NEXT: s_waitcnt vmcnt(0) 1287; GFX9-NEXT: global_store_byte v[0:1], v0, off 1288; GFX9-NEXT: s_endpgm 1289; 1290; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: 1291; GFX10: ; %bb.0: 1292; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1293; GFX10-NEXT: ; implicit-def: $vcc_hi 1294; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1295; GFX10-NEXT: v_mov_b32_e32 v1, s1 1296; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, s0 1297; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo 1298; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 1299; GFX10-NEXT: s_waitcnt vmcnt(0) 1300; GFX10-NEXT: global_store_byte v[0:1], v0, off 1301; GFX10-NEXT: s_endpgm 1302 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771713 1303 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1304 store i8 %load, i8 addrspace(1)* undef 1305 ret void 1306} 1307 1308; Fill 12-bit low-bits, negative high bits (1ull << 63) | 4096 1309define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(i8 addrspace(1)* %p) { 1310; GFX9-LABEL: 
global_inst_salu_offset_64bit_12bit_neg_high_split1: 1311; GFX9: ; %bb.0: 1312; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1313; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1314; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1315; GFX9-NEXT: v_mov_b32_e32 v0, s0 1316; GFX9-NEXT: v_mov_b32_e32 v2, s1 1317; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 1318; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1319; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096 1320; GFX9-NEXT: s_waitcnt vmcnt(0) 1321; GFX9-NEXT: global_store_byte v[0:1], v0, off 1322; GFX9-NEXT: s_endpgm 1323; 1324; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: 1325; GFX10: ; %bb.0: 1326; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1327; GFX10-NEXT: ; implicit-def: $vcc_hi 1328; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1329; GFX10-NEXT: v_mov_b32_e32 v1, s1 1330; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, s0 1331; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo 1332; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 1333; GFX10-NEXT: s_waitcnt vmcnt(0) 1334; GFX10-NEXT: global_store_byte v[0:1], v0, off 1335; GFX10-NEXT: s_endpgm 1336 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771712 1337 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1338 store i8 %load, i8 addrspace(1)* undef 1339 ret void 1340} 1341 1342; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8191 1343define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(i8 addrspace(1)* %p) { 1344; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: 1345; GFX9: ; %bb.0: 1346; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1347; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1348; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1349; GFX9-NEXT: v_mov_b32_e32 v0, s0 1350; GFX9-NEXT: v_mov_b32_e32 v2, s1 1351; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 1352; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1353; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 1354; 
GFX9-NEXT: s_waitcnt vmcnt(0) 1355; GFX9-NEXT: global_store_byte v[0:1], v0, off 1356; GFX9-NEXT: s_endpgm 1357; 1358; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: 1359; GFX10: ; %bb.0: 1360; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1361; GFX10-NEXT: ; implicit-def: $vcc_hi 1362; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1363; GFX10-NEXT: v_mov_b32_e32 v1, s1 1364; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, s0 1365; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo 1366; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 1367; GFX10-NEXT: s_waitcnt vmcnt(0) 1368; GFX10-NEXT: global_store_byte v[0:1], v0, off 1369; GFX10-NEXT: s_endpgm 1370 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767617 1371 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1372 store i8 %load, i8 addrspace(1)* undef 1373 ret void 1374} 1375 1376; Fill 13-bit low-bits, negative high bits (1ull << 63) | 8192 1377define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(i8 addrspace(1)* %p) { 1378; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: 1379; GFX9: ; %bb.0: 1380; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1381; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 1382; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1383; GFX9-NEXT: v_mov_b32_e32 v0, s0 1384; GFX9-NEXT: v_mov_b32_e32 v2, s1 1385; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 1386; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc 1387; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 1388; GFX9-NEXT: s_waitcnt vmcnt(0) 1389; GFX9-NEXT: global_store_byte v[0:1], v0, off 1390; GFX9-NEXT: s_endpgm 1391; 1392; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: 1393; GFX10: ; %bb.0: 1394; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1395; GFX10-NEXT: ; implicit-def: $vcc_hi 1396; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1397; GFX10-NEXT: v_mov_b32_e32 v1, s1 1398; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, s0 1399; GFX10-NEXT: 
v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo 1400; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 1401; GFX10-NEXT: s_waitcnt vmcnt(0) 1402; GFX10-NEXT: global_store_byte v[0:1], v0, off 1403; GFX10-NEXT: s_endpgm 1404 %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767616 1405 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 1406 store i8 %load, i8 addrspace(1)* undef 1407 ret void 1408} 1409