1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s 3; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s 4; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s 5 6; Test using saddr addressing mode of global_*load_* flat instructions. 7 8; -------------------------------------------------------------------------------- 9; No vgpr offset, constants 10; -------------------------------------------------------------------------------- 11 12; SGPR base only 13define amdgpu_ps float @global_load_saddr_i8_offset_0(i8 addrspace(1)* inreg %sbase) { 14; GCN-LABEL: global_load_saddr_i8_offset_0: 15; GCN: ; %bb.0: 16; GCN-NEXT: v_mov_b32_e32 v0, 0 17; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] 18; GCN-NEXT: s_waitcnt vmcnt(0) 19; GCN-NEXT: ; return to shader part epilog 20; 21; GFX11-LABEL: global_load_saddr_i8_offset_0: 22; GFX11: ; %bb.0: 23; GFX11-NEXT: v_mov_b32_e32 v0, 0 24; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] 25; GFX11-NEXT: s_waitcnt vmcnt(0) 26; GFX11-NEXT: ; return to shader part epilog 27 %load = load i8, i8 addrspace(1)* %sbase 28 %zext = zext i8 %load to i32 29 %to.vgpr = bitcast i32 %zext to float 30 ret float %to.vgpr 31} 32 33; SGPR base with maximum gfx9 immediate offset 34define amdgpu_ps float @global_load_saddr_i8_offset_4095(i8 addrspace(1)* inreg %sbase) { 35; GFX9-LABEL: global_load_saddr_i8_offset_4095: 36; GFX9: ; %bb.0: 37; GFX9-NEXT: v_mov_b32_e32 v0, 0 38; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 39; GFX9-NEXT: s_waitcnt vmcnt(0) 40; GFX9-NEXT: ; return to shader part epilog 41; 42; GFX10-LABEL: global_load_saddr_i8_offset_4095: 43; GFX10: ; %bb.0: 44; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 45; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 46; GFX10-NEXT: s_waitcnt 
vmcnt(0) 47; GFX10-NEXT: ; return to shader part epilog 48; 49; GFX11-LABEL: global_load_saddr_i8_offset_4095: 50; GFX11: ; %bb.0: 51; GFX11-NEXT: v_mov_b32_e32 v0, 0 52; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 53; GFX11-NEXT: s_waitcnt vmcnt(0) 54; GFX11-NEXT: ; return to shader part epilog 55 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4095 56 %load = load i8, i8 addrspace(1)* %gep0 57 %zext = zext i8 %load to i32 58 %to.vgpr = bitcast i32 %zext to float 59 ret float %to.vgpr 60} 61 62; SGPR base with maximum gfx9 immediate offset + 1 63define amdgpu_ps float @global_load_saddr_i8_offset_4096(i8 addrspace(1)* inreg %sbase) { 64; GCN-LABEL: global_load_saddr_i8_offset_4096: 65; GCN: ; %bb.0: 66; GCN-NEXT: v_mov_b32_e32 v0, 0x1000 67; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] 68; GCN-NEXT: s_waitcnt vmcnt(0) 69; GCN-NEXT: ; return to shader part epilog 70; 71; GFX11-LABEL: global_load_saddr_i8_offset_4096: 72; GFX11: ; %bb.0: 73; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000 74; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] 75; GFX11-NEXT: s_waitcnt vmcnt(0) 76; GFX11-NEXT: ; return to shader part epilog 77 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4096 78 %load = load i8, i8 addrspace(1)* %gep0 79 %zext = zext i8 %load to i32 80 %to.vgpr = bitcast i32 %zext to float 81 ret float %to.vgpr 82} 83 84; SGPR base with maximum gfx9 immediate offset + 2 85define amdgpu_ps float @global_load_saddr_i8_offset_4097(i8 addrspace(1)* inreg %sbase) { 86; GCN-LABEL: global_load_saddr_i8_offset_4097: 87; GCN: ; %bb.0: 88; GCN-NEXT: v_mov_b32_e32 v0, 0x1000 89; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1 90; GCN-NEXT: s_waitcnt vmcnt(0) 91; GCN-NEXT: ; return to shader part epilog 92; 93; GFX11-LABEL: global_load_saddr_i8_offset_4097: 94; GFX11: ; %bb.0: 95; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000 96; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:1 97; GFX11-NEXT: s_waitcnt vmcnt(0) 98; GFX11-NEXT: ; return to shader part epilog 
99 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4097 100 %load = load i8, i8 addrspace(1)* %gep0 101 %zext = zext i8 %load to i32 102 %to.vgpr = bitcast i32 %zext to float 103 ret float %to.vgpr 104} 105 106; SGPR base with maximum negative gfx9 immediate offset 107define amdgpu_ps float @global_load_saddr_i8_offset_neg4096(i8 addrspace(1)* inreg %sbase) { 108; GFX9-LABEL: global_load_saddr_i8_offset_neg4096: 109; GFX9: ; %bb.0: 110; GFX9-NEXT: v_mov_b32_e32 v0, 0 111; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-4096 112; GFX9-NEXT: s_waitcnt vmcnt(0) 113; GFX9-NEXT: ; return to shader part epilog 114; 115; GFX10-LABEL: global_load_saddr_i8_offset_neg4096: 116; GFX10: ; %bb.0: 117; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2 118; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] 119; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 120; GFX10-NEXT: s_waitcnt vmcnt(0) 121; GFX10-NEXT: ; return to shader part epilog 122; 123; GFX11-LABEL: global_load_saddr_i8_offset_neg4096: 124; GFX11: ; %bb.0: 125; GFX11-NEXT: v_mov_b32_e32 v0, 0 126; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096 127; GFX11-NEXT: s_waitcnt vmcnt(0) 128; GFX11-NEXT: ; return to shader part epilog 129 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4096 130 %load = load i8, i8 addrspace(1)* %gep0 131 %zext = zext i8 %load to i32 132 %to.vgpr = bitcast i32 %zext to float 133 ret float %to.vgpr 134} 135 136; SGPR base with maximum negative gfx9 immediate offset -1 137define amdgpu_ps float @global_load_saddr_i8_offset_neg4097(i8 addrspace(1)* inreg %sbase) { 138; GFX9-LABEL: global_load_saddr_i8_offset_neg4097: 139; GFX9: ; %bb.0: 140; GFX9-NEXT: s_add_u32 s0, s2, 0xffffefff 141; GFX9-NEXT: s_addc_u32 s1, s3, -1 142; GFX9-NEXT: v_mov_b32_e32 v0, 0 143; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] 144; GFX9-NEXT: s_waitcnt vmcnt(0) 145; GFX9-NEXT: ; return to shader part epilog 146; 147; GFX10-LABEL: global_load_saddr_i8_offset_neg4097: 
148; GFX10: ; %bb.0: 149; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2 150; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] 151; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 152; GFX10-NEXT: s_waitcnt vmcnt(0) 153; GFX10-NEXT: ; return to shader part epilog 154; 155; GFX11-LABEL: global_load_saddr_i8_offset_neg4097: 156; GFX11: ; %bb.0: 157; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2 158; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 159; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1] 160; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-1 161; GFX11-NEXT: s_waitcnt vmcnt(0) 162; GFX11-NEXT: ; return to shader part epilog 163 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4097 164 %load = load i8, i8 addrspace(1)* %gep0 165 %zext = zext i8 %load to i32 166 %to.vgpr = bitcast i32 %zext to float 167 ret float %to.vgpr 168} 169 170; SGPR base with maximum negative gfx9 immediate offset -2 171define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(i8 addrspace(1)* inreg %sbase) { 172; GFX9-LABEL: global_load_saddr_i8_offset_neg4098: 173; GFX9: ; %bb.0: 174; GFX9-NEXT: s_add_u32 s0, s2, 0xffffeffe 175; GFX9-NEXT: s_addc_u32 s1, s3, -1 176; GFX9-NEXT: v_mov_b32_e32 v0, 0 177; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] 178; GFX9-NEXT: s_waitcnt vmcnt(0) 179; GFX9-NEXT: ; return to shader part epilog 180; 181; GFX10-LABEL: global_load_saddr_i8_offset_neg4098: 182; GFX10: ; %bb.0: 183; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2 184; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] 185; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2 186; GFX10-NEXT: s_waitcnt vmcnt(0) 187; GFX10-NEXT: ; return to shader part epilog 188; 189; GFX11-LABEL: global_load_saddr_i8_offset_neg4098: 190; GFX11: ; %bb.0: 191; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2 192; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 193; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1] 194; GFX11-NEXT: 
global_load_u8 v0, v[0:1], off offset:-2 195; GFX11-NEXT: s_waitcnt vmcnt(0) 196; GFX11-NEXT: ; return to shader part epilog 197 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4098 198 %load = load i8, i8 addrspace(1)* %gep0 199 %zext = zext i8 %load to i32 200 %to.vgpr = bitcast i32 %zext to float 201 ret float %to.vgpr 202} 203 204; SGPR base with maximum gfx10 immediate offset + 1 205define amdgpu_ps float @global_load_saddr_i8_offset_2048(i8 addrspace(1)* inreg %sbase) { 206; GFX9-LABEL: global_load_saddr_i8_offset_2048: 207; GFX9: ; %bb.0: 208; GFX9-NEXT: v_mov_b32_e32 v0, 0 209; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2048 210; GFX9-NEXT: s_waitcnt vmcnt(0) 211; GFX9-NEXT: ; return to shader part epilog 212; 213; GFX10-LABEL: global_load_saddr_i8_offset_2048: 214; GFX10: ; %bb.0: 215; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 216; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] 217; GFX10-NEXT: s_waitcnt vmcnt(0) 218; GFX10-NEXT: ; return to shader part epilog 219; 220; GFX11-LABEL: global_load_saddr_i8_offset_2048: 221; GFX11: ; %bb.0: 222; GFX11-NEXT: v_mov_b32_e32 v0, 0 223; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2048 224; GFX11-NEXT: s_waitcnt vmcnt(0) 225; GFX11-NEXT: ; return to shader part epilog 226 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2048 227 %load = load i8, i8 addrspace(1)* %gep0 228 %zext = zext i8 %load to i32 229 %to.vgpr = bitcast i32 %zext to float 230 ret float %to.vgpr 231} 232 233; SGPR base with maximum gfx10 immediate offset + 2 234define amdgpu_ps float @global_load_saddr_i8_offset_2049(i8 addrspace(1)* inreg %sbase) { 235; GFX9-LABEL: global_load_saddr_i8_offset_2049: 236; GFX9: ; %bb.0: 237; GFX9-NEXT: v_mov_b32_e32 v0, 0 238; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2049 239; GFX9-NEXT: s_waitcnt vmcnt(0) 240; GFX9-NEXT: ; return to shader part epilog 241; 242; GFX10-LABEL: global_load_saddr_i8_offset_2049: 243; GFX10: ; %bb.0: 244; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 
245; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1 246; GFX10-NEXT: s_waitcnt vmcnt(0) 247; GFX10-NEXT: ; return to shader part epilog 248; 249; GFX11-LABEL: global_load_saddr_i8_offset_2049: 250; GFX11: ; %bb.0: 251; GFX11-NEXT: v_mov_b32_e32 v0, 0 252; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2049 253; GFX11-NEXT: s_waitcnt vmcnt(0) 254; GFX11-NEXT: ; return to shader part epilog 255 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2049 256 %load = load i8, i8 addrspace(1)* %gep0 257 %zext = zext i8 %load to i32 258 %to.vgpr = bitcast i32 %zext to float 259 ret float %to.vgpr 260} 261 262; SGPR base with maximum gfx10 immediate offset + 3 263define amdgpu_ps float @global_load_saddr_i8_offset_2050(i8 addrspace(1)* inreg %sbase) { 264; GFX9-LABEL: global_load_saddr_i8_offset_2050: 265; GFX9: ; %bb.0: 266; GFX9-NEXT: v_mov_b32_e32 v0, 0 267; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2050 268; GFX9-NEXT: s_waitcnt vmcnt(0) 269; GFX9-NEXT: ; return to shader part epilog 270; 271; GFX10-LABEL: global_load_saddr_i8_offset_2050: 272; GFX10: ; %bb.0: 273; GFX10-NEXT: v_mov_b32_e32 v0, 0x800 274; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2 275; GFX10-NEXT: s_waitcnt vmcnt(0) 276; GFX10-NEXT: ; return to shader part epilog 277; 278; GFX11-LABEL: global_load_saddr_i8_offset_2050: 279; GFX11: ; %bb.0: 280; GFX11-NEXT: v_mov_b32_e32 v0, 0 281; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2050 282; GFX11-NEXT: s_waitcnt vmcnt(0) 283; GFX11-NEXT: ; return to shader part epilog 284 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2050 285 %load = load i8, i8 addrspace(1)* %gep0 286 %zext = zext i8 %load to i32 287 %to.vgpr = bitcast i32 %zext to float 288 ret float %to.vgpr 289} 290 291; SGPR base with maximum negative gfx10 immediate offset 292define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(i8 addrspace(1)* inreg %sbase) { 293; GCN-LABEL: global_load_saddr_i8_offset_neg2048: 294; GCN: ; %bb.0: 295; 
GCN-NEXT: v_mov_b32_e32 v0, 0 296; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2048 297; GCN-NEXT: s_waitcnt vmcnt(0) 298; GCN-NEXT: ; return to shader part epilog 299; 300; GFX11-LABEL: global_load_saddr_i8_offset_neg2048: 301; GFX11: ; %bb.0: 302; GFX11-NEXT: v_mov_b32_e32 v0, 0 303; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2048 304; GFX11-NEXT: s_waitcnt vmcnt(0) 305; GFX11-NEXT: ; return to shader part epilog 306 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2048 307 %load = load i8, i8 addrspace(1)* %gep0 308 %zext = zext i8 %load to i32 309 %to.vgpr = bitcast i32 %zext to float 310 ret float %to.vgpr 311} 312 313; SGPR base with maximum negative gfx10 immediate offset - 1 314define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(i8 addrspace(1)* inreg %sbase) { 315; GFX9-LABEL: global_load_saddr_i8_offset_neg2049: 316; GFX9: ; %bb.0: 317; GFX9-NEXT: v_mov_b32_e32 v0, 0 318; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2049 319; GFX9-NEXT: s_waitcnt vmcnt(0) 320; GFX9-NEXT: ; return to shader part epilog 321; 322; GFX10-LABEL: global_load_saddr_i8_offset_neg2049: 323; GFX10: ; %bb.0: 324; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff800, s2 325; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] 326; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 327; GFX10-NEXT: s_waitcnt vmcnt(0) 328; GFX10-NEXT: ; return to shader part epilog 329; 330; GFX11-LABEL: global_load_saddr_i8_offset_neg2049: 331; GFX11: ; %bb.0: 332; GFX11-NEXT: v_mov_b32_e32 v0, 0 333; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2049 334; GFX11-NEXT: s_waitcnt vmcnt(0) 335; GFX11-NEXT: ; return to shader part epilog 336 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2049 337 %load = load i8, i8 addrspace(1)* %gep0 338 %zext = zext i8 %load to i32 339 %to.vgpr = bitcast i32 %zext to float 340 ret float %to.vgpr 341} 342 343; SGPR base with maximum negative gfx10 immediate offset - 2 344define amdgpu_ps float 
@global_load_saddr_i8_offset_neg2050(i8 addrspace(1)* inreg %sbase) { 345; GFX9-LABEL: global_load_saddr_i8_offset_neg2050: 346; GFX9: ; %bb.0: 347; GFX9-NEXT: v_mov_b32_e32 v0, 0 348; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2050 349; GFX9-NEXT: s_waitcnt vmcnt(0) 350; GFX9-NEXT: ; return to shader part epilog 351; 352; GFX10-LABEL: global_load_saddr_i8_offset_neg2050: 353; GFX10: ; %bb.0: 354; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff800, s2 355; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] 356; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2 357; GFX10-NEXT: s_waitcnt vmcnt(0) 358; GFX10-NEXT: ; return to shader part epilog 359; 360; GFX11-LABEL: global_load_saddr_i8_offset_neg2050: 361; GFX11: ; %bb.0: 362; GFX11-NEXT: v_mov_b32_e32 v0, 0 363; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2050 364; GFX11-NEXT: s_waitcnt vmcnt(0) 365; GFX11-NEXT: ; return to shader part epilog 366 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2050 367 %load = load i8, i8 addrspace(1)* %gep0 368 %zext = zext i8 %load to i32 369 %to.vgpr = bitcast i32 %zext to float 370 ret float %to.vgpr 371} 372 373define amdgpu_ps float @global_load_saddr_i8_offset_4294967295(i8 addrspace(1)* inreg %sbase) { 374; GFX9-LABEL: global_load_saddr_i8_offset_4294967295: 375; GFX9: ; %bb.0: 376; GFX9-NEXT: v_mov_b32_e32 v0, 0xfffff000 377; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 378; GFX9-NEXT: s_waitcnt vmcnt(0) 379; GFX9-NEXT: ; return to shader part epilog 380; 381; GFX10-LABEL: global_load_saddr_i8_offset_4294967295: 382; GFX10: ; %bb.0: 383; GFX10-NEXT: v_mov_b32_e32 v0, 0xfffff800 384; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 385; GFX10-NEXT: s_waitcnt vmcnt(0) 386; GFX10-NEXT: ; return to shader part epilog 387; 388; GFX11-LABEL: global_load_saddr_i8_offset_4294967295: 389; GFX11: ; %bb.0: 390; GFX11-NEXT: v_mov_b32_e32 v0, 0xfffff000 391; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 392; 
GFX11-NEXT: s_waitcnt vmcnt(0) 393; GFX11-NEXT: ; return to shader part epilog 394 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967295 395 %load = load i8, i8 addrspace(1)* %gep0 396 %zext = zext i8 %load to i32 397 %to.vgpr = bitcast i32 %zext to float 398 ret float %to.vgpr 399} 400 401define amdgpu_ps float @global_load_saddr_i8_offset_4294967296(i8 addrspace(1)* inreg %sbase) { 402; GFX9-LABEL: global_load_saddr_i8_offset_4294967296: 403; GFX9: ; %bb.0: 404; GFX9-NEXT: v_mov_b32_e32 v1, s3 405; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 406; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc 407; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 408; GFX9-NEXT: s_waitcnt vmcnt(0) 409; GFX9-NEXT: ; return to shader part epilog 410; 411; GFX10-LABEL: global_load_saddr_i8_offset_4294967296: 412; GFX10: ; %bb.0: 413; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 414; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1] 415; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 416; GFX10-NEXT: s_waitcnt vmcnt(0) 417; GFX10-NEXT: ; return to shader part epilog 418; 419; GFX11-LABEL: global_load_saddr_i8_offset_4294967296: 420; GFX11: ; %bb.0: 421; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 422; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 423; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1] 424; GFX11-NEXT: global_load_u8 v0, v[0:1], off 425; GFX11-NEXT: s_waitcnt vmcnt(0) 426; GFX11-NEXT: ; return to shader part epilog 427 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967296 428 %load = load i8, i8 addrspace(1)* %gep0 429 %zext = zext i8 %load to i32 430 %to.vgpr = bitcast i32 %zext to float 431 ret float %to.vgpr 432} 433 434define amdgpu_ps float @global_load_saddr_i8_offset_4294967297(i8 addrspace(1)* inreg %sbase) { 435; GFX9-LABEL: global_load_saddr_i8_offset_4294967297: 436; GFX9: ; %bb.0: 437; GFX9-NEXT: v_mov_b32_e32 v1, s3 438; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 439; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, 
v1, vcc 440; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:1 441; GFX9-NEXT: s_waitcnt vmcnt(0) 442; GFX9-NEXT: ; return to shader part epilog 443; 444; GFX10-LABEL: global_load_saddr_i8_offset_4294967297: 445; GFX10: ; %bb.0: 446; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 447; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1] 448; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1 449; GFX10-NEXT: s_waitcnt vmcnt(0) 450; GFX10-NEXT: ; return to shader part epilog 451; 452; GFX11-LABEL: global_load_saddr_i8_offset_4294967297: 453; GFX11: ; %bb.0: 454; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 455; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 456; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1] 457; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:1 458; GFX11-NEXT: s_waitcnt vmcnt(0) 459; GFX11-NEXT: ; return to shader part epilog 460 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967297 461 %load = load i8, i8 addrspace(1)* %gep0 462 %zext = zext i8 %load to i32 463 %to.vgpr = bitcast i32 %zext to float 464 ret float %to.vgpr 465} 466 467define amdgpu_ps float @global_load_saddr_i8_offset_4294971391(i8 addrspace(1)* inreg %sbase) { 468; GFX9-LABEL: global_load_saddr_i8_offset_4294971391: 469; GFX9: ; %bb.0: 470; GFX9-NEXT: s_add_u32 s0, s2, 0xfff 471; GFX9-NEXT: s_addc_u32 s1, s3, 1 472; GFX9-NEXT: v_mov_b32_e32 v0, 0 473; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] 474; GFX9-NEXT: s_waitcnt vmcnt(0) 475; GFX9-NEXT: ; return to shader part epilog 476; 477; GFX10-LABEL: global_load_saddr_i8_offset_4294971391: 478; GFX10: ; %bb.0: 479; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x800, s2 480; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1] 481; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 482; GFX10-NEXT: s_waitcnt vmcnt(0) 483; GFX10-NEXT: ; return to shader part epilog 484; 485; GFX11-LABEL: global_load_saddr_i8_offset_4294971391: 486; GFX11: ; %bb.0: 487; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 
488; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 489; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1] 490; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095 491; GFX11-NEXT: s_waitcnt vmcnt(0) 492; GFX11-NEXT: ; return to shader part epilog 493 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294971391 494 %load = load i8, i8 addrspace(1)* %gep0 495 %zext = zext i8 %load to i32 496 %to.vgpr = bitcast i32 %zext to float 497 ret float %to.vgpr 498} 499 500define amdgpu_ps float @global_load_saddr_i8_offset_4294971392(i8 addrspace(1)* inreg %sbase) { 501; GFX9-LABEL: global_load_saddr_i8_offset_4294971392: 502; GFX9: ; %bb.0: 503; GFX9-NEXT: s_add_u32 s0, s2, 0x1000 504; GFX9-NEXT: s_addc_u32 s1, s3, 1 505; GFX9-NEXT: v_mov_b32_e32 v0, 0 506; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] 507; GFX9-NEXT: s_waitcnt vmcnt(0) 508; GFX9-NEXT: ; return to shader part epilog 509; 510; GFX10-LABEL: global_load_saddr_i8_offset_4294971392: 511; GFX10: ; %bb.0: 512; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x1000, s2 513; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1] 514; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 515; GFX10-NEXT: s_waitcnt vmcnt(0) 516; GFX10-NEXT: ; return to shader part epilog 517; 518; GFX11-LABEL: global_load_saddr_i8_offset_4294971392: 519; GFX11: ; %bb.0: 520; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0x1000, s2 521; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 522; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1] 523; GFX11-NEXT: global_load_u8 v0, v[0:1], off 524; GFX11-NEXT: s_waitcnt vmcnt(0) 525; GFX11-NEXT: ; return to shader part epilog 526 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294971392 527 %load = load i8, i8 addrspace(1)* %gep0 528 %zext = zext i8 %load to i32 529 %to.vgpr = bitcast i32 %zext to float 530 ret float %to.vgpr 531} 532 533define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967295(i8 addrspace(1)* inreg %sbase) { 534; GFX9-LABEL: 
global_load_saddr_i8_offset_neg4294967295: 535; GFX9: ; %bb.0: 536; GFX9-NEXT: v_mov_b32_e32 v0, s2 537; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 538; GFX9-NEXT: v_mov_b32_e32 v1, s3 539; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 540; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4095 541; GFX9-NEXT: s_waitcnt vmcnt(0) 542; GFX9-NEXT: ; return to shader part epilog 543; 544; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967295: 545; GFX10: ; %bb.0: 546; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x800, s2 547; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] 548; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2047 549; GFX10-NEXT: s_waitcnt vmcnt(0) 550; GFX10-NEXT: ; return to shader part epilog 551; 552; GFX11-LABEL: global_load_saddr_i8_offset_neg4294967295: 553; GFX11: ; %bb.0: 554; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0x1000, s2 555; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 556; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1] 557; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-4095 558; GFX11-NEXT: s_waitcnt vmcnt(0) 559; GFX11-NEXT: ; return to shader part epilog 560 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967295 561 %load = load i8, i8 addrspace(1)* %gep0 562 %zext = zext i8 %load to i32 563 %to.vgpr = bitcast i32 %zext to float 564 ret float %to.vgpr 565} 566 567define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967296(i8 addrspace(1)* inreg %sbase) { 568; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967296: 569; GFX9: ; %bb.0: 570; GFX9-NEXT: v_mov_b32_e32 v1, s3 571; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 572; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 573; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 574; GFX9-NEXT: s_waitcnt vmcnt(0) 575; GFX9-NEXT: ; return to shader part epilog 576; 577; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967296: 578; GFX10: ; %bb.0: 579; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 580; GFX10-NEXT: v_add_co_ci_u32_e64 v1, 
s[0:1], -1, s3, s[0:1] 581; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 582; GFX10-NEXT: s_waitcnt vmcnt(0) 583; GFX10-NEXT: ; return to shader part epilog 584; 585; GFX11-LABEL: global_load_saddr_i8_offset_neg4294967296: 586; GFX11: ; %bb.0: 587; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 588; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 589; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1] 590; GFX11-NEXT: global_load_u8 v0, v[0:1], off 591; GFX11-NEXT: s_waitcnt vmcnt(0) 592; GFX11-NEXT: ; return to shader part epilog 593 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967296 594 %load = load i8, i8 addrspace(1)* %gep0 595 %zext = zext i8 %load to i32 596 %to.vgpr = bitcast i32 %zext to float 597 ret float %to.vgpr 598} 599 600define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967297(i8 addrspace(1)* inreg %sbase) { 601; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967297: 602; GFX9: ; %bb.0: 603; GFX9-NEXT: v_mov_b32_e32 v1, s3 604; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 605; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 606; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 607; GFX9-NEXT: s_waitcnt vmcnt(0) 608; GFX9-NEXT: ; return to shader part epilog 609; 610; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967297: 611; GFX10: ; %bb.0: 612; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 613; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] 614; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 615; GFX10-NEXT: s_waitcnt vmcnt(0) 616; GFX10-NEXT: ; return to shader part epilog 617; 618; GFX11-LABEL: global_load_saddr_i8_offset_neg4294967297: 619; GFX11: ; %bb.0: 620; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 621; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 622; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1] 623; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-1 624; GFX11-NEXT: s_waitcnt vmcnt(0) 625; GFX11-NEXT: ; return to shader part epilog 626 %gep0 = getelementptr inbounds 
i8, i8 addrspace(1)* %sbase, i64 -4294967297 627 %load = load i8, i8 addrspace(1)* %gep0 628 %zext = zext i8 %load to i32 629 %to.vgpr = bitcast i32 %zext to float 630 ret float %to.vgpr 631} 632 633; -------------------------------------------------------------------------------- 634; Basic addressing patterns 635; -------------------------------------------------------------------------------- 636 637; Basic pattern, no immediate offset. 638define amdgpu_ps float @global_load_saddr_i8_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 639; GCN-LABEL: global_load_saddr_i8_zext_vgpr: 640; GCN: ; %bb.0: 641; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] 642; GCN-NEXT: s_waitcnt vmcnt(0) 643; GCN-NEXT: ; return to shader part epilog 644; 645; GFX11-LABEL: global_load_saddr_i8_zext_vgpr: 646; GFX11: ; %bb.0: 647; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] 648; GFX11-NEXT: s_waitcnt vmcnt(0) 649; GFX11-NEXT: ; return to shader part epilog 650 %zext.offset = zext i32 %voffset to i64 651 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 652 %load = load i8, i8 addrspace(1)* %gep0 653 %zext = zext i8 %load to i32 654 %to.vgpr = bitcast i32 %zext to float 655 ret float %to.vgpr 656} 657 658; Maximum positive offset on gfx9 659define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 660; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095: 661; GFX9: ; %bb.0: 662; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 663; GFX9-NEXT: s_waitcnt vmcnt(0) 664; GFX9-NEXT: ; return to shader part epilog 665; 666; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095: 667; GFX10: ; %bb.0: 668; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 669; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] 670; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0 671; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc 672; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 673; GFX10-NEXT: 
s_waitcnt vmcnt(0) 674; GFX10-NEXT: ; return to shader part epilog 675; 676; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095: 677; GFX11: ; %bb.0: 678; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 679; GFX11-NEXT: s_waitcnt vmcnt(0) 680; GFX11-NEXT: ; return to shader part epilog 681 %zext.offset = zext i32 %voffset to i64 682 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 683 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4095 684 %load = load i8, i8 addrspace(1)* %gep1 685 %zext = zext i8 %load to i32 686 %to.vgpr = bitcast i32 %zext to float 687 ret float %to.vgpr 688} 689 690; Maximum positive offset on gfx9 + 1 691define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4096(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 692; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096: 693; GFX9: ; %bb.0: 694; GFX9-NEXT: v_mov_b32_e32 v1, s3 695; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 696; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 697; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 698; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 699; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 700; GFX9-NEXT: s_waitcnt vmcnt(0) 701; GFX9-NEXT: ; return to shader part epilog 702; 703; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096: 704; GFX10: ; %bb.0: 705; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 706; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] 707; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x1000, v0 708; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc 709; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 710; GFX10-NEXT: s_waitcnt vmcnt(0) 711; GFX10-NEXT: ; return to shader part epilog 712; 713; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096: 714; GFX11: ; %bb.0: 715; GFX11-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 716; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) 717; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1] 718; 
GFX11-NEXT: v_add_co_u32 v0, vcc, 0x1000, v0 719; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 720; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc 721; GFX11-NEXT: global_load_u8 v0, v[0:1], off 722; GFX11-NEXT: s_waitcnt vmcnt(0) 723; GFX11-NEXT: ; return to shader part epilog 724 %zext.offset = zext i32 %voffset to i64 725 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 726 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4096 727 %load = load i8, i8 addrspace(1)* %gep1 728 %zext = zext i8 %load to i32 729 %to.vgpr = bitcast i32 %zext to float 730 ret float %to.vgpr 731} 732 733; Maximum negative offset on gfx9 734define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4096(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 735; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096: 736; GFX9: ; %bb.0: 737; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-4096 738; GFX9-NEXT: s_waitcnt vmcnt(0) 739; GFX9-NEXT: ; return to shader part epilog 740; 741; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096: 742; GFX10: ; %bb.0: 743; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 744; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] 745; GFX10-NEXT: v_add_co_u32 v0, vcc, 0xfffff000, v0 746; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc 747; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 748; GFX10-NEXT: s_waitcnt vmcnt(0) 749; GFX10-NEXT: ; return to shader part epilog 750; 751; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096: 752; GFX11: ; %bb.0: 753; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096 754; GFX11-NEXT: s_waitcnt vmcnt(0) 755; GFX11-NEXT: ; return to shader part epilog 756 %zext.offset = zext i32 %voffset to i64 757 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 758 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -4096 759 %load = load i8, i8 addrspace(1)* %gep1 760 %zext = zext i8 %load to i32 761 %to.vgpr = bitcast 
i32 %zext to float 762 ret float %to.vgpr 763} 764 765; Maximum negative offset on gfx9 - 1 766define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4097(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 767; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097: 768; GFX9: ; %bb.0: 769; GFX9-NEXT: v_mov_b32_e32 v1, s3 770; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 771; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 772; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 773; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 774; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 775; GFX9-NEXT: s_waitcnt vmcnt(0) 776; GFX9-NEXT: ; return to shader part epilog 777; 778; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097: 779; GFX10: ; %bb.0: 780; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 781; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] 782; GFX10-NEXT: v_add_co_u32 v0, vcc, 0xfffff000, v0 783; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc 784; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 785; GFX10-NEXT: s_waitcnt vmcnt(0) 786; GFX10-NEXT: ; return to shader part epilog 787; 788; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097: 789; GFX11: ; %bb.0: 790; GFX11-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 791; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) 792; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1] 793; GFX11-NEXT: v_add_co_u32 v0, vcc, 0xfffff000, v0 794; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 795; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc 796; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-1 797; GFX11-NEXT: s_waitcnt vmcnt(0) 798; GFX11-NEXT: ; return to shader part epilog 799 %zext.offset = zext i32 %voffset to i64 800 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 801 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -4097 802 %load = load i8, i8 addrspace(1)* %gep1 803 %zext = 
zext i8 %load to i32 804 %to.vgpr = bitcast i32 %zext to float 805 ret float %to.vgpr 806} 807 808; Maximum positive offset on gfx10 809define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2047(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 810; GCN-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047: 811; GCN: ; %bb.0: 812; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 813; GCN-NEXT: s_waitcnt vmcnt(0) 814; GCN-NEXT: ; return to shader part epilog 815; 816; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047: 817; GFX11: ; %bb.0: 818; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2047 819; GFX11-NEXT: s_waitcnt vmcnt(0) 820; GFX11-NEXT: ; return to shader part epilog 821 %zext.offset = zext i32 %voffset to i64 822 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 823 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2047 824 %load = load i8, i8 addrspace(1)* %gep1 825 %zext = zext i8 %load to i32 826 %to.vgpr = bitcast i32 %zext to float 827 ret float %to.vgpr 828} 829 830; One byte past the maximum positive offset on gfx10 (2048) 831define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2048(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 832; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048: 833; GFX9: ; %bb.0: 834; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2048 835; GFX9-NEXT: s_waitcnt vmcnt(0) 836; GFX9-NEXT: ; return to shader part epilog 837; 838; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048: 839; GFX10: ; %bb.0: 840; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 841; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] 842; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0 843; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc 844; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 845; GFX10-NEXT: s_waitcnt vmcnt(0) 846; GFX10-NEXT: ; return to shader part epilog 847; 848; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048: 849; GFX11: ; %bb.0: 850; GFX11-NEXT: global_load_u8 v0, v0, 
s[2:3] offset:2048 851; GFX11-NEXT: s_waitcnt vmcnt(0) 852; GFX11-NEXT: ; return to shader part epilog 853 %zext.offset = zext i32 %voffset to i64 854 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 855 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2048 856 %load = load i8, i8 addrspace(1)* %gep1 857 %zext = zext i8 %load to i32 858 %to.vgpr = bitcast i32 %zext to float 859 ret float %to.vgpr 860} 861 862; Maximum negative offset on gfx10 863define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2048(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 864; GCN-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048: 865; GCN: ; %bb.0: 866; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2048 867; GCN-NEXT: s_waitcnt vmcnt(0) 868; GCN-NEXT: ; return to shader part epilog 869; 870; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048: 871; GFX11: ; %bb.0: 872; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2048 873; GFX11-NEXT: s_waitcnt vmcnt(0) 874; GFX11-NEXT: ; return to shader part epilog 875 %zext.offset = zext i32 %voffset to i64 876 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 877 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2048 878 %load = load i8, i8 addrspace(1)* %gep1 879 %zext = zext i8 %load to i32 880 %to.vgpr = bitcast i32 %zext to float 881 ret float %to.vgpr 882} 883 884; One byte below the maximum negative offset on gfx10 (-2049) 885define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2049(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 886; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049: 887; GFX9: ; %bb.0: 888; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2049 889; GFX9-NEXT: s_waitcnt vmcnt(0) 890; GFX9-NEXT: ; return to shader part epilog 891; 892; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049: 893; GFX10: ; %bb.0: 894; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 895; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 
0, s[0:1] 896; GFX10-NEXT: v_add_co_u32 v0, vcc, 0xfffff800, v0 897; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc 898; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 899; GFX10-NEXT: s_waitcnt vmcnt(0) 900; GFX10-NEXT: ; return to shader part epilog 901; 902; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049: 903; GFX11: ; %bb.0: 904; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2049 905; GFX11-NEXT: s_waitcnt vmcnt(0) 906; GFX11-NEXT: ; return to shader part epilog 907 %zext.offset = zext i32 %voffset to i64 908 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 909 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2049 910 %load = load i8, i8 addrspace(1)* %gep1 911 %zext = zext i8 %load to i32 912 %to.vgpr = bitcast i32 %zext to float 913 ret float %to.vgpr 914} 915 916; Maximum positive offset on gfx9, with the constant GEP applied to the base before the VGPR offset; the immediate has to be reassociated outward to fold into the instruction offset. 917define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095_gep_order(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 918; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order: 919; GFX9: ; %bb.0: 920; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 921; GFX9-NEXT: s_waitcnt vmcnt(0) 922; GFX9-NEXT: ; return to shader part epilog 923; 924; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order: 925; GFX10: ; %bb.0: 926; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 927; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] 928; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0 929; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc 930; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 931; GFX10-NEXT: s_waitcnt vmcnt(0) 932; GFX10-NEXT: ; return to shader part epilog 933; 934; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order: 935; GFX11: ; %bb.0: 936; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 937; GFX11-NEXT: s_waitcnt vmcnt(0) 938; GFX11-NEXT: ; return to shader part epilog 939 
%zext.offset = zext i32 %voffset to i64 940 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4095 941 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 %zext.offset 942 %load = load i8, i8 addrspace(1)* %gep1 943 %zext = zext i8 %load to i32 944 %to.vgpr = bitcast i32 %zext to float 945 ret float %to.vgpr 946} 947 948; pointer addressing done in integers 949define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 950; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint: 951; GCN: ; %bb.0: 952; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] 953; GCN-NEXT: s_waitcnt vmcnt(0) 954; GCN-NEXT: ; return to shader part epilog 955; 956; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint: 957; GFX11: ; %bb.0: 958; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] 959; GFX11-NEXT: s_waitcnt vmcnt(0) 960; GFX11-NEXT: ; return to shader part epilog 961 %zext.offset = zext i32 %voffset to i64 962 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 963 %add = add i64 %sbase.as.int, %zext.offset 964 %dirty.gep = inttoptr i64 %add to i8 addrspace(1)* 965 %load = load i8, i8 addrspace(1)* %dirty.gep 966 %zext = zext i8 %load to i32 967 %to.vgpr = bitcast i32 %zext to float 968 ret float %to.vgpr 969} 970 971; zext forced to LHS of addressing expression 972define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 973; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add: 974; GCN: ; %bb.0: 975; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] 976; GCN-NEXT: s_waitcnt vmcnt(0) 977; GCN-NEXT: ; return to shader part epilog 978; 979; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add: 980; GFX11: ; %bb.0: 981; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] 982; GFX11-NEXT: s_waitcnt vmcnt(0) 983; GFX11-NEXT: ; return to shader part epilog 984 %zext.offset = zext i32 %voffset to i64 985 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to 
i64 986 %add = add i64 %zext.offset, %sbase.as.int 987 %dirty.gep = inttoptr i64 %add to i8 addrspace(1)* 988 %load = load i8, i8 addrspace(1)* %dirty.gep 989 %zext = zext i8 %load to i32 990 %to.vgpr = bitcast i32 %zext to float 991 ret float %to.vgpr 992} 993 994; zext forced to LHS of addressing expression, with immediate offset 995define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 996; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0: 997; GCN: ; %bb.0: 998; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128 999; GCN-NEXT: s_waitcnt vmcnt(0) 1000; GCN-NEXT: ; return to shader part epilog 1001; 1002; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0: 1003; GFX11: ; %bb.0: 1004; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:128 1005; GFX11-NEXT: s_waitcnt vmcnt(0) 1006; GFX11-NEXT: ; return to shader part epilog 1007 %zext.offset = zext i32 %voffset to i64 1008 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 1009 %add = add i64 %zext.offset, %sbase.as.int 1010 %add.immoffset = add i64 %add, 128 1011 %dirty.gep = inttoptr i64 %add.immoffset to i8 addrspace(1)* 1012 %load = load i8, i8 addrspace(1)* %dirty.gep 1013 %zext = zext i8 %load to i32 1014 %to.vgpr = bitcast i32 %zext to float 1015 ret float %to.vgpr 1016} 1017 1018; zext forced to LHS of addressing expression, with immediate offset in non-canonical position 1019define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1020; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1: 1021; GCN: ; %bb.0: 1022; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128 1023; GCN-NEXT: s_waitcnt vmcnt(0) 1024; GCN-NEXT: ; return to shader part epilog 1025; 1026; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1: 1027; GFX11: ; %bb.0: 1028; 
GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:128 1029; GFX11-NEXT: s_waitcnt vmcnt(0) 1030; GFX11-NEXT: ; return to shader part epilog 1031 %zext.offset = zext i32 %voffset to i64 1032 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 1033 %add.immoffset = add i64 %sbase.as.int, 128 1034 %add = add i64 %zext.offset, %add.immoffset 1035 %dirty.gep = inttoptr i64 %add to i8 addrspace(1)* 1036 %load = load i8, i8 addrspace(1)* %dirty.gep 1037 %zext = zext i8 %load to i32 1038 %to.vgpr = bitcast i32 %zext to float 1039 ret float %to.vgpr 1040} 1041 1042; -------------------------------------------------------------------------------- 1043; Uniformity edge cases 1044; -------------------------------------------------------------------------------- 1045 1046@ptr.in.lds = internal addrspace(3) global i8 addrspace(1)* undef 1047 1048; Base pointer is uniform, but also in VGPRs 1049define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) { 1050; GFX9-LABEL: global_load_saddr_uniform_ptr_in_vgprs: 1051; GFX9: ; %bb.0: 1052; GFX9-NEXT: v_mov_b32_e32 v1, 0 1053; GFX9-NEXT: ds_read_b64 v[1:2], v1 1054; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1055; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1056; GFX9-NEXT: v_readfirstlane_b32 s1, v2 1057; GFX9-NEXT: s_nop 4 1058; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] 1059; GFX9-NEXT: s_waitcnt vmcnt(0) 1060; GFX9-NEXT: ; return to shader part epilog 1061; 1062; GFX10-LABEL: global_load_saddr_uniform_ptr_in_vgprs: 1063; GFX10: ; %bb.0: 1064; GFX10-NEXT: v_mov_b32_e32 v1, 0 1065; GFX10-NEXT: ds_read_b64 v[1:2], v1 1066; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1067; GFX10-NEXT: v_readfirstlane_b32 s0, v1 1068; GFX10-NEXT: v_readfirstlane_b32 s1, v2 1069; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] 1070; GFX10-NEXT: s_waitcnt vmcnt(0) 1071; GFX10-NEXT: ; return to shader part epilog 1072; 1073; GFX11-LABEL: global_load_saddr_uniform_ptr_in_vgprs: 1074; GFX11: ; %bb.0: 1075; GFX11-NEXT: v_mov_b32_e32 v1, 0 1076; GFX11-NEXT: 
ds_load_b64 v[1:2], v1 1077; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1078; GFX11-NEXT: v_readfirstlane_b32 s0, v1 1079; GFX11-NEXT: v_readfirstlane_b32 s1, v2 1080; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] 1081; GFX11-NEXT: s_waitcnt vmcnt(0) 1082; GFX11-NEXT: ; return to shader part epilog 1083 %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds 1084 %zext.offset = zext i32 %voffset to i64 1085 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1086 %load = load i8, i8 addrspace(1)* %gep0 1087 %zext = zext i8 %load to i32 1088 %to.vgpr = bitcast i32 %zext to float 1089 ret float %to.vgpr 1090} 1091 1092; Base pointer is uniform, but also in VGPRs, with imm offset 1093define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset) { 1094; GFX9-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset: 1095; GFX9: ; %bb.0: 1096; GFX9-NEXT: v_mov_b32_e32 v1, 0 1097; GFX9-NEXT: ds_read_b64 v[1:2], v1 1098; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1099; GFX9-NEXT: v_readfirstlane_b32 s0, v1 1100; GFX9-NEXT: v_readfirstlane_b32 s1, v2 1101; GFX9-NEXT: s_nop 4 1102; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:42 1103; GFX9-NEXT: s_waitcnt vmcnt(0) 1104; GFX9-NEXT: ; return to shader part epilog 1105; 1106; GFX10-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset: 1107; GFX10: ; %bb.0: 1108; GFX10-NEXT: v_mov_b32_e32 v1, 0 1109; GFX10-NEXT: ds_read_b64 v[1:2], v1 1110; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1111; GFX10-NEXT: v_readfirstlane_b32 s0, v1 1112; GFX10-NEXT: v_readfirstlane_b32 s1, v2 1113; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:42 1114; GFX10-NEXT: s_waitcnt vmcnt(0) 1115; GFX10-NEXT: ; return to shader part epilog 1116; 1117; GFX11-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset: 1118; GFX11: ; %bb.0: 1119; GFX11-NEXT: v_mov_b32_e32 v1, 0 1120; GFX11-NEXT: ds_load_b64 v[1:2], v1 1121; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1122; GFX11-NEXT: v_readfirstlane_b32 s0, v1 1123; GFX11-NEXT: 
v_readfirstlane_b32 s1, v2 1124; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:42 1125; GFX11-NEXT: s_waitcnt vmcnt(0) 1126; GFX11-NEXT: ; return to shader part epilog 1127 %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds 1128 %zext.offset = zext i32 %voffset to i64 1129 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1130 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 42 1131 %load = load i8, i8 addrspace(1)* %gep1 1132 %zext = zext i8 %load to i32 1133 %to.vgpr = bitcast i32 %zext to float 1134 ret float %to.vgpr 1135} 1136 1137; Both 64-bit base and 32-bit offset are scalar 1138define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) { 1139; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset: 1140; GCN: ; %bb.0: 1141; GCN-NEXT: v_mov_b32_e32 v0, s4 1142; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] 1143; GCN-NEXT: s_waitcnt vmcnt(0) 1144; GCN-NEXT: ; return to shader part epilog 1145; 1146; GFX11-LABEL: global_load_saddr_i8_zext_uniform_offset: 1147; GFX11: ; %bb.0: 1148; GFX11-NEXT: v_mov_b32_e32 v0, s4 1149; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] 1150; GFX11-NEXT: s_waitcnt vmcnt(0) 1151; GFX11-NEXT: ; return to shader part epilog 1152 %zext.offset = zext i32 %soffset to i64 1153 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1154 %load = load i8, i8 addrspace(1)* %gep0 1155 %zext = zext i8 %load to i32 1156 %to.vgpr = bitcast i32 %zext to float 1157 ret float %to.vgpr 1158} 1159 1160; Both 64-bit base and 32-bit offset are scalar, with immediate offset. 
1161define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset_immoffset(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) { 1162; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset: 1163; GCN: ; %bb.0: 1164; GCN-NEXT: v_mov_b32_e32 v0, s4 1165; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-24 1166; GCN-NEXT: s_waitcnt vmcnt(0) 1167; GCN-NEXT: ; return to shader part epilog 1168; 1169; GFX11-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset: 1170; GFX11: ; %bb.0: 1171; GFX11-NEXT: v_mov_b32_e32 v0, s4 1172; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-24 1173; GFX11-NEXT: s_waitcnt vmcnt(0) 1174; GFX11-NEXT: ; return to shader part epilog 1175 %zext.offset = zext i32 %soffset to i64 1176 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1177 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -24 1178 %load = load i8, i8 addrspace(1)* %gep1 1179 %zext = zext i8 %load to i32 1180 %to.vgpr = bitcast i32 %zext to float 1181 ret float %to.vgpr 1182} 1183 1184; Both components uniform, zext forced to LHS of addressing expression 1185define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) { 1186; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add: 1187; GCN: ; %bb.0: 1188; GCN-NEXT: v_mov_b32_e32 v0, s4 1189; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] 1190; GCN-NEXT: s_waitcnt vmcnt(0) 1191; GCN-NEXT: ; return to shader part epilog 1192; 1193; GFX11-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add: 1194; GFX11: ; %bb.0: 1195; GFX11-NEXT: v_mov_b32_e32 v0, s4 1196; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] 1197; GFX11-NEXT: s_waitcnt vmcnt(0) 1198; GFX11-NEXT: ; return to shader part epilog 1199 %zext.offset = zext i32 %soffset to i64 1200 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 1201 %add = add i64 %zext.offset, %sbase.as.int 1202 %dirty.gep = inttoptr i64 %add to i8 addrspace(1)* 1203 %load 
= load i8, i8 addrspace(1)* %dirty.gep 1204 %zext = zext i8 %load to i32 1205 %to.vgpr = bitcast i32 %zext to float 1206 ret float %to.vgpr 1207} 1208 1209; Both components uniform, zext forced to LHS of addressing expression, with immediate offset 1210define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) { 1211; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0: 1212; GCN: ; %bb.0: 1213; GCN-NEXT: v_mov_b32_e32 v0, s4 1214; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128 1215; GCN-NEXT: s_waitcnt vmcnt(0) 1216; GCN-NEXT: ; return to shader part epilog 1217; 1218; GFX11-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0: 1219; GFX11: ; %bb.0: 1220; GFX11-NEXT: v_mov_b32_e32 v0, s4 1221; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:128 1222; GFX11-NEXT: s_waitcnt vmcnt(0) 1223; GFX11-NEXT: ; return to shader part epilog 1224 %zext.offset = zext i32 %soffset to i64 1225 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 1226 %add = add i64 %zext.offset, %sbase.as.int 1227 %add.immoffset = add i64 %add, 128 1228 %dirty.gep = inttoptr i64 %add.immoffset to i8 addrspace(1)* 1229 %load = load i8, i8 addrspace(1)* %dirty.gep 1230 %zext = zext i8 %load to i32 1231 %to.vgpr = bitcast i32 %zext to float 1232 ret float %to.vgpr 1233} 1234 1235; divergent 64-bit base, 32-bit scalar offset. 
1236define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(i8 addrspace(1)* %vbase, i32 inreg %soffset) { 1237; GFX9-LABEL: global_load_i8_vgpr64_sgpr32: 1238; GFX9: ; %bb.0: 1239; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 1240; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1241; GFX9-NEXT: global_load_ubyte v0, v[0:1], off 1242; GFX9-NEXT: s_waitcnt vmcnt(0) 1243; GFX9-NEXT: ; return to shader part epilog 1244; 1245; GFX10-LABEL: global_load_i8_vgpr64_sgpr32: 1246; GFX10: ; %bb.0: 1247; GFX10-NEXT: v_add_co_u32 v0, vcc, v0, s2 1248; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc 1249; GFX10-NEXT: global_load_ubyte v0, v[0:1], off 1250; GFX10-NEXT: s_waitcnt vmcnt(0) 1251; GFX10-NEXT: ; return to shader part epilog 1252; 1253; GFX11-LABEL: global_load_i8_vgpr64_sgpr32: 1254; GFX11: ; %bb.0: 1255; GFX11-NEXT: v_add_co_u32 v0, vcc, v0, s2 1256; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc 1257; GFX11-NEXT: global_load_u8 v0, v[0:1], off 1258; GFX11-NEXT: s_waitcnt vmcnt(0) 1259; GFX11-NEXT: ; return to shader part epilog 1260 %zext.offset = zext i32 %soffset to i64 1261 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %vbase, i64 %zext.offset 1262 %load = load i8, i8 addrspace(1)* %gep0 1263 %zext = zext i8 %load to i32 1264 %to.vgpr = bitcast i32 %zext to float 1265 ret float %to.vgpr 1266} 1267 1268; divergent 64-bit base, 32-bit scalar offset, with imm offset 1269define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(i8 addrspace(1)* %vbase, i32 inreg %soffset) { 1270; GFX9-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095: 1271; GFX9: ; %bb.0: 1272; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 1273; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1274; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 1275; GFX9-NEXT: s_waitcnt vmcnt(0) 1276; GFX9-NEXT: ; return to shader part epilog 1277; 1278; GFX10-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095: 1279; GFX10: ; %bb.0: 1280; GFX10-NEXT: v_add_co_u32 v0, vcc, v0, s2 1281; 
GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc 1282; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0 1283; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc 1284; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 1285; GFX10-NEXT: s_waitcnt vmcnt(0) 1286; GFX10-NEXT: ; return to shader part epilog 1287; 1288; GFX11-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095: 1289; GFX11: ; %bb.0: 1290; GFX11-NEXT: v_add_co_u32 v0, vcc, v0, s2 1291; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc 1292; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095 1293; GFX11-NEXT: s_waitcnt vmcnt(0) 1294; GFX11-NEXT: ; return to shader part epilog 1295 %zext.offset = zext i32 %soffset to i64 1296 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %vbase, i64 %zext.offset 1297 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4095 1298 %load = load i8, i8 addrspace(1)* %gep1 1299 %zext = zext i8 %load to i32 1300 %to.vgpr = bitcast i32 %zext to float 1301 ret float %to.vgpr 1302} 1303 1304; -------------------------------------------------------------------------------- 1305; Natural addressing shifts with restricted range 1306; -------------------------------------------------------------------------------- 1307 1308; Cannot push the shift into 32-bits, and cannot match. 
1309define amdgpu_ps float @global_load_saddr_f32_natural_addressing(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) { 1310; GFX9-LABEL: global_load_saddr_f32_natural_addressing: 1311; GFX9: ; %bb.0: 1312; GFX9-NEXT: global_load_dword v0, v[0:1], off 1313; GFX9-NEXT: v_mov_b32_e32 v1, 0 1314; GFX9-NEXT: v_mov_b32_e32 v2, s3 1315; GFX9-NEXT: s_waitcnt vmcnt(0) 1316; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] 1317; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 1318; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc 1319; GFX9-NEXT: global_load_dword v0, v[0:1], off 1320; GFX9-NEXT: s_waitcnt vmcnt(0) 1321; GFX9-NEXT: ; return to shader part epilog 1322; 1323; GFX10-LABEL: global_load_saddr_f32_natural_addressing: 1324; GFX10: ; %bb.0: 1325; GFX10-NEXT: global_load_dword v0, v[0:1], off 1326; GFX10-NEXT: v_mov_b32_e32 v1, 0 1327; GFX10-NEXT: s_waitcnt vmcnt(0) 1328; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] 1329; GFX10-NEXT: v_add_co_u32 v0, vcc, s2, v0 1330; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc 1331; GFX10-NEXT: global_load_dword v0, v[0:1], off 1332; GFX10-NEXT: s_waitcnt vmcnt(0) 1333; GFX10-NEXT: ; return to shader part epilog 1334; 1335; GFX11-LABEL: global_load_saddr_f32_natural_addressing: 1336; GFX11: ; %bb.0: 1337; GFX11-NEXT: global_load_b32 v0, v[0:1], off 1338; GFX11-NEXT: v_mov_b32_e32 v1, 0 1339; GFX11-NEXT: s_waitcnt vmcnt(0) 1340; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1341; GFX11-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] 1342; GFX11-NEXT: v_add_co_u32 v0, vcc, s2, v0 1343; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 1344; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc 1345; GFX11-NEXT: global_load_b32 v0, v[0:1], off 1346; GFX11-NEXT: s_waitcnt vmcnt(0) 1347; GFX11-NEXT: ; return to shader part epilog 1348 %voffset = load i32, i32 addrspace(1)* %voffset.ptr 1349 %zext.offset = zext i32 %voffset to i64 1350 %gep = getelementptr inbounds float, float addrspace(1)* %sbase, 
i64 %zext.offset 1351 %load = load float, float addrspace(1)* %gep 1352 ret float %load 1353} 1354 1355; Loaded VGPR offset plus an immediate offset, indexed with i8 GEPs so no shift is involved; the saddr form matches. NOTE(review): confirm a float GEP was not intended here. 1356define amdgpu_ps float @global_load_saddr_f32_natural_addressing_immoffset(i8 addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) { 1357; GCN-LABEL: global_load_saddr_f32_natural_addressing_immoffset: 1358; GCN: ; %bb.0: 1359; GCN-NEXT: global_load_dword v0, v[0:1], off 1360; GCN-NEXT: s_waitcnt vmcnt(0) 1361; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:128 1362; GCN-NEXT: s_waitcnt vmcnt(0) 1363; GCN-NEXT: ; return to shader part epilog 1364; 1365; GFX11-LABEL: global_load_saddr_f32_natural_addressing_immoffset: 1366; GFX11: ; %bb.0: 1367; GFX11-NEXT: global_load_b32 v0, v[0:1], off 1368; GFX11-NEXT: s_waitcnt vmcnt(0) 1369; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:128 1370; GFX11-NEXT: s_waitcnt vmcnt(0) 1371; GFX11-NEXT: ; return to shader part epilog 1372 %voffset = load i32, i32 addrspace(1)* %voffset.ptr 1373 %zext.offset = zext i32 %voffset to i64 1374 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1375 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 128 1376 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to float addrspace(1)* 1377 %load = load float, float addrspace(1)* %gep1.cast 1378 ret float %load 1379} 1380 1381; Range is sufficiently restricted to push the shift into 32-bits. 
1382define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) { 1383; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range: 1384; GCN: ; %bb.0: 1385; GCN-NEXT: global_load_dword v0, v[0:1], off 1386; GCN-NEXT: s_waitcnt vmcnt(0) 1387; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1388; GCN-NEXT: global_load_dword v0, v0, s[2:3] 1389; GCN-NEXT: s_waitcnt vmcnt(0) 1390; GCN-NEXT: ; return to shader part epilog 1391; 1392; GFX11-LABEL: global_load_f32_saddr_zext_vgpr_range: 1393; GFX11: ; %bb.0: 1394; GFX11-NEXT: global_load_b32 v0, v[0:1], off 1395; GFX11-NEXT: s_waitcnt vmcnt(0) 1396; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1397; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] 1398; GFX11-NEXT: s_waitcnt vmcnt(0) 1399; GFX11-NEXT: ; return to shader part epilog 1400 %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !0 1401 %zext.offset = zext i32 %voffset to i64 1402 %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset 1403 %load = load float, float addrspace(1)* %gep 1404 ret float %load 1405} 1406 1407; Range is sufficiently restricted to push the shift into 32-bits, with an imm offset 1408define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_imm_offset(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) { 1409; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset: 1410; GCN: ; %bb.0: 1411; GCN-NEXT: global_load_dword v0, v[0:1], off 1412; GCN-NEXT: s_waitcnt vmcnt(0) 1413; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1414; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:400 1415; GCN-NEXT: s_waitcnt vmcnt(0) 1416; GCN-NEXT: ; return to shader part epilog 1417; 1418; GFX11-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset: 1419; GFX11: ; %bb.0: 1420; GFX11-NEXT: global_load_b32 v0, v[0:1], off 1421; GFX11-NEXT: s_waitcnt vmcnt(0) 1422; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1423; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:400 1424; 
GFX11-NEXT: s_waitcnt vmcnt(0) 1425; GFX11-NEXT: ; return to shader part epilog 1426 %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !0 1427 %zext.offset = zext i32 %voffset to i64 1428 %gep0 = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset 1429 %gep1 = getelementptr inbounds float, float addrspace(1)* %gep0, i64 100 1430 %load = load float, float addrspace(1)* %gep1 1431 ret float %load 1432} 1433 1434; Range is 1 beyond the limit where we can move the shift into 32-bits. 1435define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) { 1436; GFX9-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large: 1437; GFX9: ; %bb.0: 1438; GFX9-NEXT: global_load_dword v0, v[0:1], off 1439; GFX9-NEXT: v_mov_b32_e32 v1, 0 1440; GFX9-NEXT: v_mov_b32_e32 v2, s3 1441; GFX9-NEXT: s_waitcnt vmcnt(0) 1442; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] 1443; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 1444; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc 1445; GFX9-NEXT: global_load_dword v0, v[0:1], off 1446; GFX9-NEXT: s_waitcnt vmcnt(0) 1447; GFX9-NEXT: ; return to shader part epilog 1448; 1449; GFX10-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large: 1450; GFX10: ; %bb.0: 1451; GFX10-NEXT: global_load_dword v0, v[0:1], off 1452; GFX10-NEXT: v_mov_b32_e32 v1, 0 1453; GFX10-NEXT: s_waitcnt vmcnt(0) 1454; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] 1455; GFX10-NEXT: v_add_co_u32 v0, vcc, s2, v0 1456; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc 1457; GFX10-NEXT: global_load_dword v0, v[0:1], off 1458; GFX10-NEXT: s_waitcnt vmcnt(0) 1459; GFX10-NEXT: ; return to shader part epilog 1460; 1461; GFX11-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large: 1462; GFX11: ; %bb.0: 1463; GFX11-NEXT: global_load_b32 v0, v[0:1], off 1464; GFX11-NEXT: v_mov_b32_e32 v1, 0 1465; GFX11-NEXT: s_waitcnt vmcnt(0) 1466; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) 1467; GFX11-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] 1468; GFX11-NEXT: v_add_co_u32 v0, vcc, s2, v0 1469; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 1470; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc 1471; GFX11-NEXT: global_load_b32 v0, v[0:1], off 1472; GFX11-NEXT: s_waitcnt vmcnt(0) 1473; GFX11-NEXT: ; return to shader part epilog 1474 %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !1 1475 %zext.offset = zext i32 %voffset to i64 1476 %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset 1477 %load = load float, float addrspace(1)* %gep 1478 ret float %load 1479} 1480 1481; -------------------------------------------------------------------------------- 1482; Stress various type loads 1483; -------------------------------------------------------------------------------- 1484 1485define amdgpu_ps half @global_load_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1486; GCN-LABEL: global_load_saddr_i16: 1487; GCN: ; %bb.0: 1488; GCN-NEXT: global_load_ushort v0, v0, s[2:3] 1489; GCN-NEXT: s_waitcnt vmcnt(0) 1490; GCN-NEXT: ; return to shader part epilog 1491; 1492; GFX11-LABEL: global_load_saddr_i16: 1493; GFX11: ; %bb.0: 1494; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] 1495; GFX11-NEXT: s_waitcnt vmcnt(0) 1496; GFX11-NEXT: ; return to shader part epilog 1497 %zext.offset = zext i32 %voffset to i64 1498 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 1499 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* 1500 %load = load i16, i16 addrspace(1)* %gep0.cast 1501 %cast.load = bitcast i16 %load to half 1502 ret half %cast.load 1503} 1504 1505define amdgpu_ps half @global_load_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 1506; GCN-LABEL: global_load_saddr_i16_immneg128: 1507; GCN: ; %bb.0: 1508; GCN-NEXT: global_load_ushort v0, v0, s[2:3] offset:-128 1509; GCN-NEXT: s_waitcnt vmcnt(0) 1510; GCN-NEXT: ; return to 
shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep1.cast
  %cast.load = bitcast i16 %load to half
  ret half %cast.load
}

; The tests below share one shape: the inreg pointer %sbase (s[2:3] in the
; checks) is advanced by the zero-extended i32 %voffset (v0 in the checks) and
; the result is loaded as the type named in the function. Each type has a plain
; variant and an _immneg128 variant that adds a further constant -128 byte
; offset, which the checks expect to appear as "offset:-128" on the load.

; half loads: checked as a single 16-bit load (global_load_ushort on the GCN
; prefix, global_load_u16 on the GFX11 prefix).
define amdgpu_ps half @global_load_saddr_f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f16:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ushort v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to half addrspace(1)*
  %load = load half, half addrspace(1)* %gep0.cast
  ret half %load
}

define amdgpu_ps half @global_load_saddr_f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f16_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ushort v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_f16_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to half addrspace(1)*
  %load = load half, half addrspace(1)* %gep1.cast
  ret half %load
}

; 32-bit loads (i32, float, <2 x i16>, <2 x half>, addrspace(3) pointer):
; every variant is checked as a single global_load_dword (GCN prefix) or
; global_load_b32 (GFX11 prefix).
define amdgpu_ps float @global_load_saddr_i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  %load = load i32, i32 addrspace(1)* %gep0.cast
  %cast.load = bitcast i32 %load to float
  ret float %cast.load
}

define amdgpu_ps float @global_load_saddr_i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i32_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i32_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %load = load i32, i32 addrspace(1)* %gep1.cast
  %cast.load = bitcast i32 %load to float
  ret float %cast.load
}

define amdgpu_ps float @global_load_saddr_f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to float addrspace(1)*
  %load = load float, float addrspace(1)* %gep0.cast
  ret float %load
}

define amdgpu_ps float @global_load_saddr_f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f32_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_f32_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to float addrspace(1)*
  %load = load float, float addrspace(1)* %gep1.cast
  ret float %load
}

define amdgpu_ps <2 x half> @global_load_saddr_v2i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i16> addrspace(1)*
  %load = load <2 x i16>, <2 x i16> addrspace(1)* %gep0.cast
  %cast.load = bitcast <2 x i16> %load to <2 x half>
  ret <2 x half> %cast.load
}

define amdgpu_ps <2 x half> @global_load_saddr_v2i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i16_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2i16_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i16> addrspace(1)*
  %load = load <2 x i16>, <2 x i16> addrspace(1)* %gep1.cast
  %cast.load = bitcast <2 x i16> %load to <2 x half>
  ret <2 x half> %cast.load
}

define amdgpu_ps <2 x half> @global_load_saddr_v2f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2f16:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x half> addrspace(1)*
  %load = load <2 x half>, <2 x half> addrspace(1)* %gep0.cast
  ret <2 x half> %load
}

define amdgpu_ps <2 x half> @global_load_saddr_v2f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2f16_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2f16_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x half> addrspace(1)*
  %load = load <2 x half>, <2 x half> addrspace(1)* %gep1.cast
  ret <2 x half> %load
}

define amdgpu_ps <2 x half> @global_load_saddr_p3(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_p3:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_p3:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(3)* addrspace(1)*
  %load = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %gep0.cast
  %cast.load0 = ptrtoint i8 addrspace(3)* %load to i32
  %cast.load1 = bitcast i32 %cast.load0 to <2 x half>
  ret <2 x half> %cast.load1
}

define amdgpu_ps <2 x half> @global_load_saddr_p3_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_p3_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_p3_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(3)* addrspace(1)*
  %load = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %gep1.cast
  %cast.load0 = ptrtoint i8 addrspace(3)* %load to i32
  %cast.load1 = bitcast i32 %cast.load0 to <2 x half>
  ret <2 x half> %cast.load1
}

; 64-bit loads (double, i64, <2 x float>, <2 x i32>, <4 x i16>, <4 x half>,
; addrspace(1) pointer): every variant is checked as a single
; global_load_dwordx2 (GCN prefix) or global_load_b64 (GFX11 prefix) into
; v[0:1].
define amdgpu_ps <2 x float> @global_load_saddr_f64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f64:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_f64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to double addrspace(1)*
  %load = load double, double addrspace(1)* %gep0.cast
  %cast.load = bitcast double %load to <2 x float>
  ret <2 x float> %cast.load
}

define amdgpu_ps <2 x float> @global_load_saddr_f64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f64_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_f64_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to double addrspace(1)*
  %load = load double, double addrspace(1)* %gep1.cast
  %cast.load = bitcast double %load to <2 x float>
  ret <2 x float> %cast.load
}

define amdgpu_ps <2 x float> @global_load_saddr_i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i64:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
  %load = load i64, i64 addrspace(1)* %gep0.cast
  %cast.load = bitcast i64 %load to <2 x float>
  ret <2 x float> %cast.load
}

define amdgpu_ps <2 x float> @global_load_saddr_i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i64_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i64_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
  %load = load i64, i64 addrspace(1)* %gep1.cast
  %cast.load = bitcast i64 %load to <2 x float>
  ret <2 x float> %cast.load
}

define amdgpu_ps <2 x float> @global_load_saddr_v2f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x float> addrspace(1)*
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep0.cast
  ret <2 x float> %load
}

define amdgpu_ps <2 x float> @global_load_saddr_v2f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2f32_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2f32_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x float> addrspace(1)*
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep1.cast
  ret <2 x float> %load
}

define amdgpu_ps <2 x float> @global_load_saddr_v2i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i32> addrspace(1)*
  %load = load <2 x i32>, <2 x i32> addrspace(1)* %gep0.cast
  %cast.load = bitcast <2 x i32> %load to <2 x float>
  ret <2 x float> %cast.load
}

define amdgpu_ps <2 x float> @global_load_saddr_v2i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i32_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2i32_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i32> addrspace(1)*
  %load = load <2 x i32>, <2 x i32> addrspace(1)* %gep1.cast
  %cast.load = bitcast <2 x i32> %load to <2 x float>
  ret <2 x float> %cast.load
}

define amdgpu_ps <2 x float> @global_load_saddr_v4i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v4i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i16> addrspace(1)*
  %load = load <4 x i16>, <4 x i16> addrspace(1)* %gep0.cast
  %cast.load = bitcast <4 x i16> %load to <2 x float>
  ret <2 x float> %cast.load
}

define amdgpu_ps <2 x float> @global_load_saddr_v4i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4i16_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v4i16_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i16> addrspace(1)*
  %load = load <4 x i16>, <4 x i16> addrspace(1)* %gep1.cast
  %cast.load = bitcast <4 x i16> %load to <2 x float>
  ret <2 x float> %cast.load
}

define amdgpu_ps <2 x float> @global_load_saddr_v4f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4f16:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v4f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x half> addrspace(1)*
  %load = load <4 x half>, <4 x half> addrspace(1)* %gep0.cast
  %cast.load = bitcast <4 x half> %load to <2 x float>
  ret <2 x float> %cast.load
}

define amdgpu_ps <2 x float> @global_load_saddr_v4f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4f16_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v4f16_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x half> addrspace(1)*
  %load = load <4 x half>, <4 x half> addrspace(1)* %gep1.cast
  %cast.load = bitcast <4 x half> %load to <2 x float>
  ret <2 x float> %cast.load
}

define amdgpu_ps <2 x float> @global_load_saddr_p1(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_p1:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_p1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* addrspace(1)*
  %load = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %gep0.cast
  %cast.load0 = ptrtoint i8 addrspace(1)* %load to i64
  %cast.load1 = bitcast i64 %cast.load0 to <2 x float>
  ret <2 x float> %cast.load1
}

define amdgpu_ps <2 x float> @global_load_saddr_p1_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_p1_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_p1_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* addrspace(1)*
  %load = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %gep1.cast
  %cast.load0 = ptrtoint i8 addrspace(1)* %load to i64
  %cast.load1 = bitcast i64 %cast.load0 to <2 x float>
  ret <2 x float> %cast.load1
}

; 96-bit loads (<3 x float>, <3 x i32>, <6 x half>): every variant is checked
; as a single global_load_dwordx3 (GCN prefix) or global_load_b96 (GFX11
; prefix) into v[0:2].
define amdgpu_ps <3 x float> @global_load_saddr_v3f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v3f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v3f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b96 v[0:2], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x float> addrspace(1)*
  %load = load <3 x float>, <3 x float> addrspace(1)* %gep0.cast
  ret <3 x float> %load
}

define amdgpu_ps <3 x float> @global_load_saddr_v3f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v3f32_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v3f32_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b96 v[0:2], v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <3 x float> addrspace(1)*
  %load = load <3 x float>, <3 x float> addrspace(1)* %gep1.cast
  ret <3 x float> %load
}

define amdgpu_ps <3 x float> @global_load_saddr_v3i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v3i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v3i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b96 v[0:2], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x i32> addrspace(1)*
  %load = load <3 x i32>, <3 x i32> addrspace(1)* %gep0.cast
  %cast.load = bitcast <3 x i32> %load to <3 x float>
  ret <3 x float> %cast.load
}

define amdgpu_ps <3 x float> @global_load_saddr_v3i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v3i32_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v3i32_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b96 v[0:2], v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <3 x i32> addrspace(1)*
  %load = load <3 x i32>, <3 x i32> addrspace(1)* %gep1.cast
  %cast.load = bitcast <3 x i32> %load to <3 x float>
  ret <3 x float> %cast.load
}

define amdgpu_ps <6 x half> @global_load_saddr_v6f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v6f16:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v6f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b96 v[0:2], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <6 x half> addrspace(1)*
  %load = load <6 x half>, <6 x half> addrspace(1)* %gep0.cast
  ret <6 x half> %load
}

define amdgpu_ps <6 x half> @global_load_saddr_v6f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v6f16_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v6f16_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b96 v[0:2], v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <6 x half> addrspace(1)*
  %load = load <6 x half>, <6 x half> addrspace(1)* %gep1.cast
  ret <6 x half> %load
}

; 128-bit loads (<4 x float>, <4 x i32>, <2 x i64>, i128, vectors of
; pointers): every variant is checked as a single global_load_dwordx4 (GCN
; prefix) or global_load_b128 (GFX11 prefix) into v[0:3].
define amdgpu_ps <4 x float> @global_load_saddr_v4f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v4f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x float> addrspace(1)*
  %load = load <4 x float>, <4 x float> addrspace(1)* %gep0.cast
  ret <4 x float> %load
}

define amdgpu_ps <4 x float> @global_load_saddr_v4f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4f32_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v4f32_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x float> addrspace(1)*
  %load = load <4 x float>, <4 x float> addrspace(1)* %gep1.cast
  ret <4 x float> %load
}

define amdgpu_ps <4 x float> @global_load_saddr_v4i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v4i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i32> addrspace(1)*
  %load = load <4 x i32>, <4 x i32> addrspace(1)* %gep0.cast
  %cast.load = bitcast <4 x i32> %load to <4 x float>
  ret <4 x float> %cast.load
}

define amdgpu_ps <4 x float> @global_load_saddr_v4i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4i32_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v4i32_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i32> addrspace(1)*
  %load = load <4 x i32>, <4 x i32> addrspace(1)* %gep1.cast
  %cast.load = bitcast <4 x i32> %load to <4 x float>
  ret <4 x float> %cast.load
}

define amdgpu_ps <4 x float> @global_load_saddr_v2i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i64:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i64> addrspace(1)*
  %load = load <2 x i64>, <2 x i64> addrspace(1)* %gep0.cast
  %cast.load = bitcast <2 x i64> %load to <4 x float>
  ret <4 x float> %cast.load
}

define amdgpu_ps <4 x float> @global_load_saddr_v2i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i64_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2i64_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i64> addrspace(1)*
  %load = load <2 x i64>, <2 x i64> addrspace(1)* %gep1.cast
  %cast.load = bitcast <2 x i64> %load to <4 x float>
  ret <4 x float> %cast.load
}

define amdgpu_ps <4 x float> @global_load_saddr_i128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i128 addrspace(1)*
  %load = load i128, i128 addrspace(1)* %gep0.cast
  %cast.load = bitcast i128 %load to <4 x float>
  ret <4 x float> %cast.load
}

define amdgpu_ps <4 x float> @global_load_saddr_i128_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i128_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i128_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i128 addrspace(1)*
  %load = load i128, i128 addrspace(1)* %gep1.cast
  %cast.load = bitcast i128 %load to <4 x float>
  ret <4 x float> %cast.load
}

define amdgpu_ps <4 x float> @global_load_saddr_v2p1(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2p1:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2p1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i8 addrspace(1)*> addrspace(1)*
  %load = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %gep0.cast
  %cast.load0 = ptrtoint <2 x i8 addrspace(1)*> %load to <2 x i64>
  %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float>
  ret <4 x float> %cast.load1
}

define amdgpu_ps <4 x float> @global_load_saddr_v2p1_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2p1_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2p1_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i8 addrspace(1)*> addrspace(1)*
  %load = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %gep1.cast
  %cast.load0 = ptrtoint <2 x i8 addrspace(1)*> %load to <2 x i64>
  %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float>
  ret <4 x float> %cast.load1
}

define amdgpu_ps <4 x float> @global_load_saddr_v4p3(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4p3:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL:
global_load_saddr_v4p3: 2387; GFX11: ; %bb.0: 2388; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] 2389; GFX11-NEXT: s_waitcnt vmcnt(0) 2390; GFX11-NEXT: ; return to shader part epilog 2391 %zext.offset = zext i32 %voffset to i64 2392 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 2393 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i8 addrspace(3)*> addrspace(1)* 2394 %load = load <4 x i8 addrspace(3)*>, <4 x i8 addrspace(3)*> addrspace(1)* %gep0.cast 2395 %cast.load0 = ptrtoint <4 x i8 addrspace(3)*> %load to <4 x i32> 2396 %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float> 2397 ret <4 x float> %cast.load1 2398} 2399 2400define amdgpu_ps <4 x float> @global_load_saddr_v4p3_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 2401; GCN-LABEL: global_load_saddr_v4p3_immneg128: 2402; GCN: ; %bb.0: 2403; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 2404; GCN-NEXT: s_waitcnt vmcnt(0) 2405; GCN-NEXT: ; return to shader part epilog 2406; 2407; GFX11-LABEL: global_load_saddr_v4p3_immneg128: 2408; GFX11: ; %bb.0: 2409; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128 2410; GFX11-NEXT: s_waitcnt vmcnt(0) 2411; GFX11-NEXT: ; return to shader part epilog 2412 %zext.offset = zext i32 %voffset to i64 2413 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 2414 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 2415 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i8 addrspace(3)*> addrspace(1)* 2416 %load = load <4 x i8 addrspace(3)*>, <4 x i8 addrspace(3)*> addrspace(1)* %gep1.cast 2417 %cast.load0 = ptrtoint <4 x i8 addrspace(3)*> %load to <4 x i32> 2418 %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float> 2419 ret <4 x float> %cast.load1 2420} 2421 2422; -------------------------------------------------------------------------------- 2423; Extending loads 2424; -------------------------------------------------------------------------------- 2425 
; Sign-extending i8 load from sbase + zext(voffset). The expectations below
; require one signed-byte global load using the SGPR pair as base and v0 as the
; 32-bit offset.
define amdgpu_ps float @global_sextload_saddr_i8(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_sextload_saddr_i8:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_sbyte v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_sextload_saddr_i8:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_i8 v0, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %voffset.ext = zext i32 %voffset to i64
  %addr = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %voffset.ext
  %byte = load i8, i8 addrspace(1)* %addr
  %byte.sext = sext i8 %byte to i32
  %result = bitcast i32 %byte.sext to float
  ret float %result
}

; Same sign-extending i8 load, with an extra constant -128 byte GEP that the
; expectations require to appear as the instruction's immediate offset.
define amdgpu_ps float @global_sextload_saddr_i8_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_sextload_saddr_i8_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_sbyte v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_sextload_saddr_i8_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_i8 v0, v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %voffset.ext = zext i32 %voffset to i64
  %addr = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %voffset.ext
  %addr.m128 = getelementptr inbounds i8, i8 addrspace(1)* %addr, i64 -128
  %byte = load i8, i8 addrspace(1)* %addr.m128
  %byte.sext = sext i8 %byte to i32
  %result = bitcast i32 %byte.sext to float
  ret float %result
}

; Sign-extending i16 load from sbase + zext(voffset); expected to select a
; single signed-short global load in the SGPR-base form.
define amdgpu_ps float @global_sextload_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_sextload_saddr_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_sshort v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_sextload_saddr_i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_i16 v0, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %voffset.ext = zext i32 %voffset to i64
  %addr = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %voffset.ext
  %ptr = bitcast i8 addrspace(1)* %addr to i16 addrspace(1)*
  %short = load i16, i16 addrspace(1)* %ptr
  %short.sext = sext i16 %short to i32
  %result = bitcast i32 %short.sext to float
  ret float %result
}

; Sign-extending i16 load with an additional constant -128 byte offset, which
; the expectations require to be carried as the immediate offset.
define amdgpu_ps float @global_sextload_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_sextload_saddr_i16_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_sshort v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_sextload_saddr_i16_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_i16 v0, v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %voffset.ext = zext i32 %voffset to i64
  %addr = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %voffset.ext
  %addr.m128 = getelementptr inbounds i8, i8 addrspace(1)* %addr, i64 -128
  %ptr = bitcast i8 addrspace(1)* %addr.m128 to i16 addrspace(1)*
  %short = load i16, i16 addrspace(1)* %ptr
  %short.sext = sext i16 %short to i32
  %result = bitcast i32 %short.sext to float
  ret float %result
}

; Zero-extending i8 load from sbase + zext(voffset); expected to select a
; single unsigned-byte global load in the SGPR-base form.
define amdgpu_ps float @global_zextload_saddr_i8(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_zextload_saddr_i8:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_zextload_saddr_i8:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %voffset.ext = zext i32 %voffset to i64
  %addr = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %voffset.ext
  %byte = load i8, i8 addrspace(1)* %addr
  %byte.zext = zext i8 %byte to i32
  %result = bitcast i32 %byte.zext to float
  ret float %result
}

; Zero-extending i8 load with an additional constant -128 byte offset, which
; the expectations require to be carried as the immediate offset.
define amdgpu_ps float @global_zextload_saddr_i8_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_zextload_saddr_i8_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_zextload_saddr_i8_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %voffset.ext = zext i32 %voffset to i64
  %addr = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %voffset.ext
  %addr.m128 = getelementptr inbounds i8, i8 addrspace(1)* %addr, i64 -128
  %byte = load i8, i8 addrspace(1)* %addr.m128
  %byte.zext = zext i8 %byte to i32
  %result = bitcast i32 %byte.zext to float
  ret float %result
}

; Zero-extending i16 load from sbase + zext(voffset); expected to select a
; single unsigned-short global load in the SGPR-base form.
define amdgpu_ps float @global_zextload_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_zextload_saddr_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ushort v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_zextload_saddr_i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %voffset.ext = zext i32 %voffset to i64
  %addr = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %voffset.ext
  %ptr = bitcast i8 addrspace(1)* %addr to i16 addrspace(1)*
  %short = load i16, i16 addrspace(1)* %ptr
  %short.zext = zext i16 %short to i32
  %result = bitcast i32 %short.zext to float
  ret float %result
}

; Zero-extending i16 load with an additional constant -128 byte offset, which
; the expectations require to be carried as the immediate offset.
define amdgpu_ps float @global_zextload_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_zextload_saddr_i16_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ushort v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_zextload_saddr_i16_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %voffset.ext = zext i32 %voffset to i64
  %addr = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %voffset.ext
  %addr.m128 = getelementptr inbounds i8, i8 addrspace(1)* %addr, i64 -128
  %ptr = bitcast i8 addrspace(1)* %addr.m128 to i16 addrspace(1)*
  %short = load i16, i16 addrspace(1)* %ptr
  %short.zext = zext i16 %short to i32
  %result = bitcast i32 %short.zext to float
  ret float %result
}

; --------------------------------------------------------------------------------
; Atomic load
; --------------------------------------------------------------------------------

; seq_cst atomic i32 load from sbase + zext(voffset). Besides the SGPR-base
; addressing, the expectations pin the waits emitted before the load, the cache
; policy bits on the load itself (the gfx1010 run additionally sets dlc), and
; the cache invalidates that follow it.
define amdgpu_ps float @atomic_global_load_saddr_i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GFX9-LABEL: atomic_global_load_saddr_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: atomic_global_load_saddr_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: atomic_global_load_saddr_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    ; return to shader part epilog
  %voffset.ext = zext i32 %voffset to i64
  %addr = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %voffset.ext
  %ptr = bitcast i8 addrspace(1)* %addr to i32 addrspace(1)*
  %word = load atomic i32, i32 addrspace(1)* %ptr seq_cst, align 4
  %result = bitcast i32 %word to float
  ret float %result
}

; seq_cst atomic i32 load with an additional constant -128 byte offset; the
; expectations require the offset to be carried as the immediate on the atomic
; load alongside the cache policy bits.
define amdgpu_ps float @atomic_global_load_saddr_i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GFX9-LABEL: atomic_global_load_saddr_i32_immneg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: atomic_global_load_saddr_i32_immneg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: atomic_global_load_saddr_i32_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    ; return to shader part epilog
  %voffset.ext = zext i32 %voffset to i64
  %addr = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %voffset.ext
  %addr.m128 = getelementptr inbounds i8, i8 addrspace(1)* %addr, i64 -128
  %ptr = bitcast i8 addrspace(1)* %addr.m128 to i32 addrspace(1)*
  %word = load atomic i32, i32 addrspace(1)* %ptr seq_cst, align 4
  %result = bitcast i32 %word to float
  ret float %result
}

; seq_cst atomic i64 load from sbase + zext(voffset), returned as two floats;
; expected to select one 64-bit global load into v[0:1] in the SGPR-base form
; with the same wait and invalidate sequence as the i32 case.
define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GFX9-LABEL: atomic_global_load_saddr_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: atomic_global_load_saddr_i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: atomic_global_load_saddr_i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    ; return to shader part epilog
  %voffset.ext = zext i32 %voffset to i64
  %addr = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %voffset.ext
  %ptr = bitcast i8 addrspace(1)* %addr to i64 addrspace(1)*
  %dword = load atomic i64, i64 addrspace(1)* %ptr seq_cst, align 8
  %result = bitcast i64 %dword to <2 x float>
  ret <2 x float> %result
}

; seq_cst atomic i64 load with an additional constant -128 byte offset; the
; expectations require the offset to be carried as the immediate on the atomic
; load alongside the cache policy bits.
define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GFX9-LABEL: atomic_global_load_saddr_i64_immneg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: atomic_global_load_saddr_i64_immneg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: atomic_global_load_saddr_i64_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3] offset:-128 glc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    ; return to shader part epilog
  %voffset.ext = zext i32 %voffset to i64
  %addr = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %voffset.ext
  %addr.m128 = getelementptr inbounds i8, i8 addrspace(1)* %addr, i64 -128
  %ptr = bitcast i8 addrspace(1)* %addr.m128 to i64 addrspace(1)*
  %dword = load atomic i64, i64 addrspace(1)* %ptr seq_cst, align 8
  %result = bitcast i64 %dword to <2 x float>
  ret <2 x float> %result
}

; --------------------------------------------------------------------------------
; D16 load (low 16)
; --------------------------------------------------------------------------------

2748define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 2749; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi: 2750; GCN: ; %bb.0: 2751; GCN-NEXT: global_load_short_d16 v0, v0, s[2:3] 2752; GCN-NEXT: s_waitcnt vmcnt(0) 2753; GCN-NEXT: ; return to shader part epilog 2754; 2755; GFX11-LABEL: global_load_saddr_i16_d16lo_undef_hi: 2756; GFX11: ; %bb.0: 2757; GFX11-NEXT: global_load_d16_b16 v0, v0, s[2:3] 2758; GFX11-NEXT: s_waitcnt vmcnt(0) 2759; GFX11-NEXT: ; return to shader part epilog 2760 %zext.offset = zext i32 %voffset to i64 2761 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 2762 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* 2763 %load = load i16, i16 addrspace(1)* %gep0.cast 2764 %build = insertelement <2 x i16> undef, i16 %load, i32 0 2765 %cast = bitcast <2 x i16> %build to <2 x half> 2766 ret <2 x half> %cast 2767} 2768 2769define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 2770; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128: 2771; GCN: ; %bb.0: 2772; GCN-NEXT: global_load_short_d16 v0, v0, s[2:3] offset:-128 2773; GCN-NEXT: s_waitcnt vmcnt(0) 2774; GCN-NEXT: ; return to shader part epilog 2775; 2776; GFX11-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128: 2777; GFX11: ; %bb.0: 2778; GFX11-NEXT: global_load_d16_b16 v0, v0, s[2:3] offset:-128 2779; GFX11-NEXT: s_waitcnt vmcnt(0) 2780; GFX11-NEXT: ; return to shader part epilog 2781 %zext.offset = zext i32 %voffset to i64 2782 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 2783 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 2784 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)* 2785 %load = load i16, i16 addrspace(1)* %gep1.cast 2786 %build = insertelement <2 x i16> undef, i16 %load, i32 0 2787 %cast = bitcast <2 x i16> %build to <2 x half> 2788 
ret <2 x half> %cast 2789} 2790 2791define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 2792; GCN-LABEL: global_load_saddr_i16_d16lo_zero_hi: 2793; GCN: ; %bb.0: 2794; GCN-NEXT: v_mov_b32_e32 v1, 0 2795; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3] 2796; GCN-NEXT: s_waitcnt vmcnt(0) 2797; GCN-NEXT: v_mov_b32_e32 v0, v1 2798; GCN-NEXT: ; return to shader part epilog 2799; 2800; GFX11-LABEL: global_load_saddr_i16_d16lo_zero_hi: 2801; GFX11: ; %bb.0: 2802; GFX11-NEXT: v_mov_b32_e32 v1, 0 2803; GFX11-NEXT: global_load_d16_b16 v1, v0, s[2:3] 2804; GFX11-NEXT: s_waitcnt vmcnt(0) 2805; GFX11-NEXT: v_mov_b32_e32 v0, v1 2806; GFX11-NEXT: ; return to shader part epilog 2807 %zext.offset = zext i32 %voffset to i64 2808 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 2809 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* 2810 %load = load i16, i16 addrspace(1)* %gep0.cast 2811 %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0 2812 %cast = bitcast <2 x i16> %build to <2 x half> 2813 ret <2 x half> %cast 2814} 2815 2816define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { 2817; GCN-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128: 2818; GCN: ; %bb.0: 2819; GCN-NEXT: v_mov_b32_e32 v1, 0 2820; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3] offset:-128 2821; GCN-NEXT: s_waitcnt vmcnt(0) 2822; GCN-NEXT: v_mov_b32_e32 v0, v1 2823; GCN-NEXT: ; return to shader part epilog 2824; 2825; GFX11-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128: 2826; GFX11: ; %bb.0: 2827; GFX11-NEXT: v_mov_b32_e32 v1, 0 2828; GFX11-NEXT: global_load_d16_b16 v1, v0, s[2:3] offset:-128 2829; GFX11-NEXT: s_waitcnt vmcnt(0) 2830; GFX11-NEXT: v_mov_b32_e32 v0, v1 2831; GFX11-NEXT: ; return to shader part epilog 2832 %zext.offset = zext i32 %voffset to i64 2833 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* 
%sbase, i64 %zext.offset 2834 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 2835 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)* 2836 %load = load i16, i16 addrspace(1)* %gep1.cast 2837 %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0 2838 %cast = bitcast <2 x i16> %build to <2 x half> 2839 ret <2 x half> %cast 2840} 2841 2842define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { 2843; GCN-LABEL: global_load_saddr_i16_d16lo_reg_hi: 2844; GCN: ; %bb.0: 2845; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3] 2846; GCN-NEXT: s_waitcnt vmcnt(0) 2847; GCN-NEXT: v_mov_b32_e32 v0, v1 2848; GCN-NEXT: ; return to shader part epilog 2849; 2850; GFX11-LABEL: global_load_saddr_i16_d16lo_reg_hi: 2851; GFX11: ; %bb.0: 2852; GFX11-NEXT: global_load_d16_b16 v1, v0, s[2:3] 2853; GFX11-NEXT: s_waitcnt vmcnt(0) 2854; GFX11-NEXT: v_mov_b32_e32 v0, v1 2855; GFX11-NEXT: ; return to shader part epilog 2856 %zext.offset = zext i32 %voffset to i64 2857 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 2858 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* 2859 %load = load i16, i16 addrspace(1)* %gep0.cast 2860 %build = insertelement <2 x i16> %reg, i16 %load, i32 0 2861 %cast = bitcast <2 x i16> %build to <2 x half> 2862 ret <2 x half> %cast 2863} 2864 2865define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { 2866; GCN-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128: 2867; GCN: ; %bb.0: 2868; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3] offset:-128 2869; GCN-NEXT: s_waitcnt vmcnt(0) 2870; GCN-NEXT: v_mov_b32_e32 v0, v1 2871; GCN-NEXT: ; return to shader part epilog 2872; 2873; GFX11-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128: 2874; GFX11: ; %bb.0: 2875; GFX11-NEXT: global_load_d16_b16 v1, v0, s[2:3] offset:-128 2876; 
GFX11-NEXT: s_waitcnt vmcnt(0) 2877; GFX11-NEXT: v_mov_b32_e32 v0, v1 2878; GFX11-NEXT: ; return to shader part epilog 2879 %zext.offset = zext i32 %voffset to i64 2880 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 2881 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 2882 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)* 2883 %load = load i16, i16 addrspace(1)* %gep1.cast 2884 %build = insertelement <2 x i16> %reg, i16 %load, i32 0 2885 %cast = bitcast <2 x i16> %build to <2 x half> 2886 ret <2 x half> %cast 2887} 2888 2889define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { 2890; GCN-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi: 2891; GCN: ; %bb.0: 2892; GCN-NEXT: global_load_ubyte_d16 v1, v0, s[2:3] 2893; GCN-NEXT: s_waitcnt vmcnt(0) 2894; GCN-NEXT: v_mov_b32_e32 v0, v1 2895; GCN-NEXT: ; return to shader part epilog 2896; 2897; GFX11-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi: 2898; GFX11: ; %bb.0: 2899; GFX11-NEXT: global_load_d16_u8 v1, v0, s[2:3] 2900; GFX11-NEXT: s_waitcnt vmcnt(0) 2901; GFX11-NEXT: v_mov_b32_e32 v0, v1 2902; GFX11-NEXT: ; return to shader part epilog 2903 %zext.offset = zext i32 %voffset to i64 2904 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 2905 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* 2906 %load = load i8, i8 addrspace(1)* %gep0.cast 2907 %zext.load = zext i8 %load to i16 2908 %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0 2909 %cast = bitcast <2 x i16> %build to <2 x half> 2910 ret <2 x half> %cast 2911} 2912 2913define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { 2914; GCN-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: 2915; GCN: ; %bb.0: 2916; GCN-NEXT: global_load_ubyte_d16 v1, v0, s[2:3] offset:-128 2917; 
GCN-NEXT: s_waitcnt vmcnt(0) 2918; GCN-NEXT: v_mov_b32_e32 v0, v1 2919; GCN-NEXT: ; return to shader part epilog 2920; 2921; GFX11-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: 2922; GFX11: ; %bb.0: 2923; GFX11-NEXT: global_load_d16_u8 v1, v0, s[2:3] offset:-128 2924; GFX11-NEXT: s_waitcnt vmcnt(0) 2925; GFX11-NEXT: v_mov_b32_e32 v0, v1 2926; GFX11-NEXT: ; return to shader part epilog 2927 %zext.offset = zext i32 %voffset to i64 2928 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 2929 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 2930 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* 2931 %load = load i8, i8 addrspace(1)* %gep1.cast 2932 %zext.load = zext i8 %load to i16 2933 %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0 2934 %cast = bitcast <2 x i16> %build to <2 x half> 2935 ret <2 x half> %cast 2936} 2937 2938define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { 2939; GCN-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi: 2940; GCN: ; %bb.0: 2941; GCN-NEXT: global_load_sbyte_d16 v1, v0, s[2:3] 2942; GCN-NEXT: s_waitcnt vmcnt(0) 2943; GCN-NEXT: v_mov_b32_e32 v0, v1 2944; GCN-NEXT: ; return to shader part epilog 2945; 2946; GFX11-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi: 2947; GFX11: ; %bb.0: 2948; GFX11-NEXT: global_load_d16_i8 v1, v0, s[2:3] 2949; GFX11-NEXT: s_waitcnt vmcnt(0) 2950; GFX11-NEXT: v_mov_b32_e32 v0, v1 2951; GFX11-NEXT: ; return to shader part epilog 2952 %zext.offset = zext i32 %voffset to i64 2953 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 2954 %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* 2955 %load = load i8, i8 addrspace(1)* %gep0.cast 2956 %sext.load = sext i8 %load to i16 2957 %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0 2958 %cast = bitcast <2 x i16> %build to <2 x half> 2959 ret <2 x half> 
%cast
}

; d16 lo, sign-extended i8, existing register contents, immediate offset -128:
; loads an i8 from %sbase + zext(%voffset) - 128, sign-extends it to i16 and
; inserts it into element 0 of %reg. The checks expect one signed-byte d16
; load that uses s[2:3] as the scalar base and folds the -128 into the
; instruction offset. The i8-to-i8 bitcast of the pointer is a no-op.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_sbyte_d16 v1, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_d16_i8 v1, v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, v1
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %gep1.cast
  %sext.load = sext i8 %load to i16
  %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

; --------------------------------------------------------------------------------
; D16 hi load (hi16)
; --------------------------------------------------------------------------------

; i16 load from %sbase + zext(%voffset) inserted into element 1 of an undef
; <2 x i16>. The checks expect a d16-hi short load written directly into v0,
; with no initialization of the destination and no trailing copy.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_short_d16_hi v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16hi_undef_hi:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_d16_hi_b16 v0, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep0.cast
  %build = insertelement <2 x i16> undef, i16 %load, i32 1
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

; Same as the undef case above, with an extra constant -128 byte GEP. The
; checks expect the -128 to appear as the instruction's immediate offset.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_short_d16_hi v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_d16_hi_b16 v0, v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep1.cast
  %build = insertelement <2 x i16> undef, i16 %load, i32 1
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

; i16 load inserted into element 1 of a zeroinitializer vector. The checks
; expect the destination VGPR to be set to 0 before the d16-hi load, and the
; result to be copied to v0 afterwards.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16hi_zero_hi:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    global_load_short_d16_hi v1, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16hi_zero_hi:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    global_load_d16_hi_b16 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, v1
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep0.cast
  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

; Zero-initialized variant with an extra constant -128 byte GEP, expected to
; be folded into the load's immediate offset.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    global_load_short_d16_hi v1, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, v1
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep1.cast
  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

; i16 load inserted into element 1 of the incoming %reg argument. The checks
; expect the d16-hi load to write straight into v1 (where %reg arrives) and
; the merged value to be copied to v0 for the return.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16hi_reg_hi:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_short_d16_hi v1, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16hi_reg_hi:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_d16_hi_b16 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, v1
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep0.cast
  %build = insertelement <2 x i16> %reg, i16 %load, i32 1
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

; %reg variant with an extra constant -128 byte GEP, expected to be folded
; into the load's immediate offset.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_short_d16_hi v1, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, v1
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep1.cast
  %build = insertelement <2 x i16> %reg, i16 %load, i32 1
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

; i8 load zero-extended to i16 and inserted into element 1 of %reg. The
; checks expect an unsigned-byte d16-hi load. The i8-to-i8 pointer bitcast is
; a no-op.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte_d16_hi v1, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_d16_hi_u8 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, v1
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %gep0.cast
  %zext.load = zext i8 %load to i16
  %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

; Zero-extended i8 variant with an extra constant -128 byte GEP, expected to
; be folded into the load's immediate offset.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte_d16_hi v1, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_d16_hi_u8 v1, v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, v1
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %gep1.cast
  %zext.load = zext i8 %load to i16
  %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

; i8 load sign-extended to i16 and inserted into element 1 of %reg. The
; checks expect a signed-byte d16-hi load.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_sbyte_d16_hi v1, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_d16_hi_i8 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, v1
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %gep0.cast
  %sext.load = sext i8 %load to i16
  %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

; Sign-extended i8 variant with an extra constant -128 byte GEP, expected to
; be folded into the load's immediate offset.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_sbyte_d16_hi v1, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_d16_hi_i8 v1, v0, s[2:3] offset:-128
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, v1
; GFX11-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %gep1.cast
  %sext.load = sext i8 %load to i16
  %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

; --------------------------------------------------------------------------------
; or-with-constant as add
; --------------------------------------------------------------------------------

; Check add-as-or with split 64-bit or.
; The address is inttoptr(zext(%idx) | 16); %sbase is not used by the body.
; The checks expect the OR to stay a 32-bit v_or on the low half, the high
; half to be materialized as 0, and the load to use a VGPR pair address with
; "off" (no scalar base, no immediate offset).
define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_16(i8 addrspace(6)* inreg %sbase, i32 %idx) {
; GCN-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_or_b32_e32 v0, 16, v0
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    global_load_ubyte v0, v[0:1], off
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_or_b32_e32 v0, 16, v0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.idx = zext i32 %idx to i64
  %or = or i64 %zext.idx, 16
  %addr = inttoptr i64 %or to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %addr
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; Same pattern as the OR-with-16 test above, with the constant 4160 (0x1040
; in the expected output). %sbase is again unused by the body.
define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(i8 addrspace(6)* inreg %sbase, i32 %idx) {
; GCN-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_or_b32_e32 v0, 0x1040, v0
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    global_load_ubyte v0, v[0:1], off
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_or_b32_e32 v0, 0x1040, v0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
  %zext.idx = zext i32 %idx to i64
  %or = or i64 %zext.idx, 4160
  %addr = inttoptr i64 %or to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %addr
  %zext = zext i8 %load to i32
  %to.vgpr = bitcast i32 %zext to float
  ret float %to.vgpr
}

; --------------------------------------------------------------------------------
; Full 64-bit scalar add.
; --------------------------------------------------------------------------------

; Loop that issues one volatile float load from %arg[%i] for %i = 0..255.
; The checks expect the address to be formed with a 64-bit scalar add
; (s_add_u32 / s_addc_u32) of the base and a byte counter stepped by 4, a
; zero VGPR offset initialized once outside the loop, and the exit compare
; against 0x400 (256 * 4 bytes).
define amdgpu_ps void @global_addr_64bit_lsr_iv(float addrspace(1)* inreg %arg) {
; GFX9-LABEL: global_addr_64bit_lsr_iv:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:  .LBB128_1: ; %bb3
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_add_u32 s4, s2, s0
; GFX9-NEXT:    s_addc_u32 s5, s3, s1
; GFX9-NEXT:    global_load_dword v1, v0, s[4:5] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_u32 s0, s0, 4
; GFX9-NEXT:    s_addc_u32 s1, s1, 0
; GFX9-NEXT:    s_cmpk_eq_i32 s0, 0x400
; GFX9-NEXT:    s_cbranch_scc0 .LBB128_1
; GFX9-NEXT:  ; %bb.2: ; %bb2
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_addr_64bit_lsr_iv:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB128_1: ; %bb3
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_add_u32 s4, s2, s0
; GFX10-NEXT:    s_addc_u32 s5, s3, s1
; GFX10-NEXT:    s_add_u32 s0, s0, 4
; GFX10-NEXT:    global_load_dword v1, v0, s[4:5] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_cmpk_eq_i32 s0, 0x400
; GFX10-NEXT:    s_cbranch_scc0 .LBB128_1
; GFX10-NEXT:  ; %bb.2: ; %bb2
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: global_addr_64bit_lsr_iv:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:  .LBB128_1: ; %bb3
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_add_u32 s4, s2, s0
; GFX11-NEXT:    s_addc_u32 s5, s3, s1
; GFX11-NEXT:    s_add_u32 s0, s0, 4
; GFX11-NEXT:    global_load_b32 v1, v0, s[4:5] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_addc_u32 s1, s1, 0
; GFX11-NEXT:    s_cmpk_eq_i32 s0, 0x400
; GFX11-NEXT:    s_cbranch_scc0 .LBB128_1
; GFX11-NEXT:  ; %bb.2: ; %bb2
; GFX11-NEXT:    s_endpgm
bb:
  br label %bb3

bb2:                                              ; preds = %bb3
  ret void

bb3:                                              ; preds = %bb3, %bb
  %i = phi i32 [ 0, %bb ], [ %i8, %bb3 ]
  %i4 = zext i32 %i to i64
  %i5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %i4
  %i6 = load volatile float, float addrspace(1)* %i5, align 4
  %i8 = add nuw nsw i32 %i, 1
  %i9 = icmp eq i32 %i8, 256
  br i1 %i9, label %bb2, label %bb3
}

; Make sure we only have a single zero vaddr initialization.
; Same loop as above with two volatile loads per iteration; the checks expect
; one v_mov of 0 before the loop that both loads share as their VGPR offset.
; NOTE(review): %i5.1 (the GEP off %arg.1) is never used. The second load
; reads %i5 again, so both loads hit the same %arg-based address and %arg.1
; does not contribute to any address (the checks show both loads using
; s[4:5]). If a load from %arg.1 was intended, the load operand must change
; and the assertions must be regenerated - TODO confirm intent.
define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(float addrspace(1)* inreg %arg, float addrspace(1)* inreg %arg.1) {
; GFX9-LABEL: global_addr_64bit_lsr_iv_multiload:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:  .LBB129_1: ; %bb3
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_add_u32 s4, s2, s0
; GFX9-NEXT:    s_addc_u32 s5, s3, s1
; GFX9-NEXT:    global_load_dword v1, v0, s[4:5] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[4:5] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_u32 s0, s0, 4
; GFX9-NEXT:    s_addc_u32 s1, s1, 0
; GFX9-NEXT:    s_cmpk_eq_i32 s0, 0x400
; GFX9-NEXT:    ; kill: killed $sgpr4 killed $sgpr5
; GFX9-NEXT:    s_cbranch_scc0 .LBB129_1
; GFX9-NEXT:  ; %bb.2: ; %bb2
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: global_addr_64bit_lsr_iv_multiload:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-NEXT:  .LBB129_1: ; %bb3
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_add_u32 s4, s2, s0
; GFX10-NEXT:    s_addc_u32 s5, s3, s1
; GFX10-NEXT:    s_add_u32 s0, s0, 4
; GFX10-NEXT:    global_load_dword v1, v0, s[4:5] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[4:5] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_cmpk_eq_i32 s0, 0x400
; GFX10-NEXT:    ; kill: killed $sgpr4 killed $sgpr5
; GFX10-NEXT:    s_cbranch_scc0 .LBB129_1
; GFX10-NEXT:  ; %bb.2: ; %bb2
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: global_addr_64bit_lsr_iv_multiload:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:  .LBB129_1: ; %bb3
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_add_u32 s4, s2, s0
; GFX11-NEXT:    s_addc_u32 s5, s3, s1
; GFX11-NEXT:    s_add_u32 s0, s0, 4
; GFX11-NEXT:    global_load_b32 v1, v0, s[4:5] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[4:5] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_addc_u32 s1, s1, 0
; GFX11-NEXT:    s_cmpk_eq_i32 s0, 0x400
; GFX11-NEXT:    s_cbranch_scc0 .LBB129_1
; GFX11-NEXT:  ; %bb.2: ; %bb2
; GFX11-NEXT:    s_endpgm
bb:
  br label %bb3

bb2:                                              ; preds = %bb3
  ret void

bb3:                                              ; preds = %bb3, %bb
  %i = phi i32 [ 0, %bb ], [ %i8, %bb3 ]
  %i4 = zext i32 %i to i64
  %i5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %i4
  %i6 = load volatile float, float addrspace(1)* %i5, align 4
  %i5.1 = getelementptr inbounds float, float addrspace(1)* %arg.1, i64 %i4
  %i6.1 = load volatile float, float addrspace(1)* %i5, align 4
  %i8 = add nuw nsw i32 %i, 1
  %i9 = icmp eq i32 %i8, 256
  br i1 %i9, label %bb2, label %bb3
}

; Metadata nodes holding the i32 pairs [0, 1 << 30] and [0, (1 << 30) + 1].
; They are not referenced by any function in this part of the file;
; presumably they are attached as !range operands earlier - verify before
; removing.
!0 = !{i32 0, i32 1073741824} ; (1 << 30)
!1 = !{i32 0, i32 1073741825} ; (1 << 30) + 1