; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9-PAL %s
; RUN: llc -march=amdgcn -mcpu=gfx940 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck --check-prefix=GFX940 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1010-PAL %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1030-PAL %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11-PAL %s

; Each function below operates on an addrspace(5) alloca. The check lines are
; the llc output for every subtarget/OS combination listed in the run lines
; above; regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.

; Kernel entry point: memset of 64 zero bytes over a [32 x i16] addrspace(5)
; alloca.
define amdgpu_kernel void @zero_init_kernel() {
; GFX9-LABEL: zero_init_kernel:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: zero_init_kernel:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: zero_init_kernel:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:    s_mov_b32 s1, s0
; GFX11-NEXT:    s_mov_b32 s2, s0
; GFX11-NEXT:    s_mov_b32 s3, s0
; GFX11-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:64
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:48
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:32
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:16
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: zero_init_kernel:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: zero_init_kernel:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_mov_b32 s0, 0
; GFX940-NEXT:    s_mov_b32 s1, s0
; GFX940-NEXT:    s_mov_b32 s2, s0
; GFX940-NEXT:    s_mov_b32 s3, s0
; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: zero_init_kernel:
; GFX1010-PAL:       ; %bb.0:
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:64
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: zero_init_kernel:
; GFX1030-PAL:       ; %bb.0:
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
; GFX1030-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: zero_init_kernel:
; GFX11-PAL:       ; %bb.0:
; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-PAL-NEXT:    s_clause 0x3
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:64
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:48
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:32
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:16
; GFX11-PAL-NEXT:    s_endpgm
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

; Same 64-byte zero memset as @zero_init_kernel, but in a function with the
; default calling convention; the expected stores are addressed through s32.
define void @zero_init_foo() {
; GFX9-LABEL: zero_init_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: zero_init_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: zero_init_foo:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:    s_mov_b32 s1, s0
; GFX11-NEXT:    s_mov_b32 s2, s0
; GFX11-NEXT:    s_mov_b32 s3, s0
; GFX11-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:48
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:32
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: zero_init_foo:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: zero_init_foo:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    s_mov_b32 s0, 0
; GFX940-NEXT:    s_mov_b32 s1, s0
; GFX940-NEXT:    s_mov_b32 s2, s0
; GFX940-NEXT:    s_mov_b32 s3, s0
; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: zero_init_foo:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: zero_init_foo:
; GFX11-PAL:       ; %bb.0:
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-PAL-NEXT:    s_clause 0x3
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:48
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:32
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

; Kernel with an i32 argument %idx: volatile store of 15 to element %idx of a
; [32 x float] addrspace(5) alloca, then a volatile load from element
; (%idx & 15).
define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-NEXT:    s_and_b32 s0, s0, 15
; GFX9-NEXT:    s_add_i32 s1, s1, 4
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    scratch_store_dword off, v0, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_and_b32 s1, s0, 15
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-NEXT:    s_add_i32 s0, s0, 4
; GFX10-NEXT:    s_add_i32 s1, s1, 4
; GFX10-NEXT:    scratch_store_dword off, v0, s0
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_sindex_kernel:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_and_b32 s1, s0, 15
; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-NEXT:    s_add_i32 s0, s0, 4
; GFX11-NEXT:    s_add_i32 s1, s1, 4
; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_sindex_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT:    v_mov_b32_e32 v0, 15
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
; GFX940-NEXT:    s_and_b32 s0, s0, 15
; GFX940-NEXT:    s_add_i32 s1, s1, 4
; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_add_i32 s0, s0, 4
; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_sindex_kernel:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX10-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_sindex_kernel:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX11-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Same store/load pattern as @store_load_sindex_kernel, but as an amdgpu_ps
; function whose %idx argument is marked inreg.
define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX9-LABEL: store_load_sindex_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_and_b32 s0, s2, 15
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_and_b32 s0, s2, 15
; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_add_i32 s1, s1, 4
; GFX10-NEXT:    s_add_i32 s0, s0, 4
; GFX10-NEXT:    scratch_store_dword off, v0, s1
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_sindex_foo:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-NEXT:    s_and_b32 s1, s0, 15
; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-NEXT:    s_add_i32 s0, s0, 4
; GFX11-NEXT:    s_add_i32 s1, s1, 4
; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_sindex_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
; GFX940-NEXT:    s_and_b32 s0, s0, 15
; GFX940-NEXT:    s_add_i32 s1, s1, 4
; GFX940-NEXT:    v_mov_b32_e32 v0, 15
; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_add_i32 s0, s0, 4
; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_sindex_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX10-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_sindex_foo:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX11-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Kernel that indexes the [32 x float] addrspace(5) alloca with
; llvm.amdgcn.workitem.id.x: volatile store of 15 to element id, then a
; volatile load from element (31 - id).
define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9-LABEL: store_load_vindex_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    v_add_u32_e32 v1, 4, v0
; GFX9-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-NEXT:    scratch_store_dword v1, v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_sub_u32_e32 v0, 4, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vindex_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    v_add_nc_u32_e32 v1, 4, v0
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 4, v0
; GFX10-NEXT:    scratch_store_dword v1, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_vindex_kernel:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    v_mov_b32_e32 v1, 15
; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 4, v0
; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:4 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_vindex_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 4, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 4, v0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_vindex_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_sub_u32_e32 v0, 4, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_vindex_kernel:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v1, 4, v0
; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, 4, v0
; GFX10-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_vindex_kernel:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 4, v0
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i3 = zext i32 %i2 to i64
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

define void @store_load_vindex_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s32
; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, s32
; GFX10-NEXT:    scratch_store_dword v0, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_vindex_foo:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, 15
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-NEXT:    scratch_store_b32 v0, v2, s32 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s32
; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_vindex_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
lgkmcnt(0) 837; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 838; GFX940-NEXT: v_mov_b32_e32 v2, 15 839; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 840; GFX940-NEXT: scratch_store_dword v1, v2, s32 sc0 sc1 841; GFX940-NEXT: s_waitcnt vmcnt(0) 842; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 843; GFX940-NEXT: scratch_load_dword v0, v0, s32 sc0 sc1 844; GFX940-NEXT: s_waitcnt vmcnt(0) 845; GFX940-NEXT: s_setpc_b64 s[30:31] 846; 847; GFX10-PAL-LABEL: store_load_vindex_foo: 848; GFX10-PAL: ; %bb.0: ; %bb 849; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 850; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 851; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0 852; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s32 853; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 854; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, s32 855; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off 856; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 857; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc 858; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 859; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 860; 861; GFX11-PAL-LABEL: store_load_vindex_foo: 862; GFX11-PAL: ; %bb.0: ; %bb 863; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 864; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 865; GFX11-PAL-NEXT: v_and_b32_e32 v1, 15, v0 866; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 867; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 868; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 869; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, s32 dlc 870; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 871; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 glc dlc 872; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 873; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] 874; GCN-LABEL: store_load_vindex_foo: 875; GCN: ; %bb.0: ; %bb 876; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 877; GCN-NEXT: v_mov_b32_e32 v2, 15 878; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 879; GCN-NEXT: v_and_b32_e32 v0, v0, v2 880; GCN-NEXT: scratch_store_dword v1, v2, s32 sc0 sc1 881; GCN-NEXT: s_waitcnt vmcnt(0) 882; GCN-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0
883; GCN-NEXT: scratch_load_dword v0, v0, s32 sc0 sc1
884; GCN-NEXT: s_waitcnt vmcnt(0)
885; GCN-NEXT: s_setpc_b64 s[30:31]
886bb:
887 %i = alloca [32 x float], align 4, addrspace(5)
888 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
889 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
890 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
891 store volatile i32 15, i32 addrspace(5)* %i8, align 4
892 %i9 = and i32 %idx, 15
893 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
894 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
895 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
896 ret void
897}
898
; Stores float 10.0 (bit pattern 0x41200000 in the checks below) to element 1
; of the addrspace(5) pointer argument; every run line expects a single
; scratch store at byte offset 4 from the incoming pointer in v0.
899define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
900; GFX9-LABEL: private_ptr_foo:
901; GFX9: ; %bb.0:
902; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
903; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000
904; GFX9-NEXT: scratch_store_dword v0, v1, off offset:4
905; GFX9-NEXT: s_waitcnt vmcnt(0)
906; GFX9-NEXT: s_setpc_b64 s[30:31]
907;
908; GFX10-LABEL: private_ptr_foo:
909; GFX10: ; %bb.0:
910; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
911; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
912; GFX10-NEXT: v_mov_b32_e32 v1, 0x41200000
913; GFX10-NEXT: scratch_store_dword v0, v1, off offset:4
914; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
915; GFX10-NEXT: s_setpc_b64 s[30:31]
916;
917; GFX11-LABEL: private_ptr_foo:
918; GFX11: ; %bb.0:
919; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
920; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
921; GFX11-NEXT: v_mov_b32_e32 v1, 0x41200000
922; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4
923; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
924; GFX11-NEXT: s_setpc_b64 s[30:31]
925;
926; GFX9-PAL-LABEL: private_ptr_foo:
927; GFX9-PAL: ; %bb.0:
928; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
929; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000
930; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:4
931; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
932; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
933;
934; GFX940-LABEL: private_ptr_foo:
935; GFX940: ; %bb.0:
936; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
937; GFX940-NEXT: v_mov_b32_e32 v1, 0x41200000
938; GFX940-NEXT: scratch_store_dword v0, v1, off offset:4
939; GFX940-NEXT: s_waitcnt vmcnt(0)
940; GFX940-NEXT: s_setpc_b64 s[30:31]
941;
942; GFX10-PAL-LABEL: private_ptr_foo:
943; GFX10-PAL: ; %bb.0:
944; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
945; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
946; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000
947; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:4
948; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
949; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
950;
951; GFX11-PAL-LABEL: private_ptr_foo:
952; GFX11-PAL: ; %bb.0:
953; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
954; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
955; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000
956; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4
957; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
958; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
966 %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
967 store float 1.000000e+01, float addrspace(5)* %gep, align 4
968 ret void
969}
970
971define amdgpu_kernel void @zero_init_small_offset_kernel() {
972; GFX9-LABEL: zero_init_small_offset_kernel:
973; GFX9: ; %bb.0:
974; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
975; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
976; GFX9-NEXT: s_mov_b32 vcc_hi, 0
977; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc
978; GFX9-NEXT: s_waitcnt vmcnt(0)
979; GFX9-NEXT: s_mov_b32 s0, 0 980; GFX9-NEXT: s_mov_b32 s1, s0 981; GFX9-NEXT: s_mov_b32 s2, s0 982; GFX9-NEXT: s_mov_b32 s3, s0 983; GFX9-NEXT: v_mov_b32_e32 v0, s0 984; GFX9-NEXT: v_mov_b32_e32 v1, s1 985; GFX9-NEXT: v_mov_b32_e32 v2, s2 986; GFX9-NEXT: v_mov_b32_e32 v3, s3 987; GFX9-NEXT: s_mov_b32 vcc_hi, 0 988; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272 989; GFX9-NEXT: s_mov_b32 vcc_hi, 0 990; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288 991; GFX9-NEXT: s_mov_b32 vcc_hi, 0 992; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304 993; GFX9-NEXT: s_mov_b32 vcc_hi, 0 994; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320 995; GFX9-NEXT: s_endpgm 996; 997; GFX10-LABEL: zero_init_small_offset_kernel: 998; GFX10: ; %bb.0: 999; GFX10-NEXT: s_add_u32 s0, s0, s3 1000; GFX10-NEXT: s_addc_u32 s1, s1, 0 1001; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 1002; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1003; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1004; GFX10-NEXT: s_waitcnt vmcnt(0) 1005; GFX10-NEXT: s_mov_b32 s0, 0 1006; GFX10-NEXT: s_mov_b32 s1, s0 1007; GFX10-NEXT: s_mov_b32 s2, s0 1008; GFX10-NEXT: s_mov_b32 s3, s0 1009; GFX10-NEXT: v_mov_b32_e32 v0, s0 1010; GFX10-NEXT: v_mov_b32_e32 v1, s1 1011; GFX10-NEXT: v_mov_b32_e32 v2, s2 1012; GFX10-NEXT: v_mov_b32_e32 v3, s3 1013; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 1014; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 1015; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 1016; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320 1017; GFX10-NEXT: s_endpgm 1018; 1019; GFX11-LABEL: zero_init_small_offset_kernel: 1020; GFX11: ; %bb.0: 1021; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 1022; GFX11-NEXT: s_waitcnt vmcnt(0) 1023; GFX11-NEXT: s_mov_b32 s0, 0 1024; GFX11-NEXT: s_mov_b32 s1, s0 1025; GFX11-NEXT: s_mov_b32 s2, s0 1026; GFX11-NEXT: 
s_mov_b32 s3, s0 1027; GFX11-NEXT: v_mov_b32_e32 v0, s0 1028; GFX11-NEXT: v_mov_b32_e32 v1, s1 1029; GFX11-NEXT: v_mov_b32_e32 v2, s2 1030; GFX11-NEXT: v_mov_b32_e32 v3, s3 1031; GFX11-NEXT: s_clause 0x3 1032; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:272 1033; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:288 1034; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:304 1035; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:320 1036; GFX11-NEXT: s_endpgm 1037; 1038; GFX9-PAL-LABEL: zero_init_small_offset_kernel: 1039; GFX9-PAL: ; %bb.0: 1040; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1041; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1042; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1043; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1044; GFX9-PAL-NEXT: s_mov_b32 s0, 0 1045; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1046; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1047; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1048; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1049; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 1050; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1051; GFX9-PAL-NEXT: s_mov_b32 s1, s0 1052; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1053; GFX9-PAL-NEXT: s_mov_b32 s3, s0 1054; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 1055; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 1056; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 1057; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 1058; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1059; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272 1060; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1061; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288 1062; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1063; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304 1064; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1065; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320 1066; GFX9-PAL-NEXT: s_endpgm 1067; 1068; GFX940-LABEL: zero_init_small_offset_kernel: 1069; GFX940: ; %bb.0: 1070; GFX940-NEXT: scratch_load_dword v0, off, off 
offset:4 sc0 sc1 1071; GFX940-NEXT: s_waitcnt vmcnt(0) 1072; GFX940-NEXT: s_mov_b32 s0, 0 1073; GFX940-NEXT: s_mov_b32 s1, s0 1074; GFX940-NEXT: s_mov_b32 s2, s0 1075; GFX940-NEXT: s_mov_b32 s3, s0 1076; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 1077; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] 1078; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 1079; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 1080; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 1081; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320 1082; GFX940-NEXT: s_endpgm 1083; 1084; GFX1010-PAL-LABEL: zero_init_small_offset_kernel: 1085; GFX1010-PAL: ; %bb.0: 1086; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 1087; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1088; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1089; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1090; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1091; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 1092; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 1093; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1094; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1095; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1096; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 1097; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc 1098; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1099; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 1100; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1101; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 1102; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 1103; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 1104; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 1105; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 1106; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1107; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:272 1108; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 1109; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1110; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:288 1111; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 1112; 
GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1113; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:304 1114; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 1115; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1116; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:320 1117; GFX1010-PAL-NEXT: s_endpgm 1118; 1119; GFX1030-PAL-LABEL: zero_init_small_offset_kernel: 1120; GFX1030-PAL: ; %bb.0: 1121; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 1122; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1123; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1124; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1125; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1126; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 1127; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 1128; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1129; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1130; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1131; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1132; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 1133; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 1134; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1135; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 1136; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 1137; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 1138; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 1139; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 1140; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 1141; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 1142; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 1143; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320 1144; GFX1030-PAL-NEXT: s_endpgm 1145; 1146; GFX11-PAL-LABEL: zero_init_small_offset_kernel: 1147; GFX11-PAL: ; %bb.0: 1148; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 1149; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 1150; GFX11-PAL-NEXT: s_mov_b32 s0, 0 1151; GFX11-PAL-NEXT: s_mov_b32 s1, s0 1152; GFX11-PAL-NEXT: s_mov_b32 s2, s0 1153; GFX11-PAL-NEXT: s_mov_b32 
s3, s0
1154; GFX11-PAL-NEXT: v_mov_b32_e32 v0, s0
1155; GFX11-PAL-NEXT: v_mov_b32_e32 v1, s1
1156; GFX11-PAL-NEXT: v_mov_b32_e32 v2, s2
1157; GFX11-PAL-NEXT: v_mov_b32_e32 v3, s3
1158; GFX11-PAL-NEXT: s_clause 0x3
1159; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:272
1160; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:288
1161; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:304
1162; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:320
1163; GFX11-PAL-NEXT: s_endpgm
1164 %padding = alloca [64 x i32], align 4, addrspace(5)
1165 %alloca = alloca [32 x i16], align 2, addrspace(5)
1166 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
1167 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
1168 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
1169 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
1170 ret void
1171}
1172
; Non-kernel variant of the small-offset zero-init test: a volatile load from
; a 64 x i32 padding alloca, followed by a 64-byte zero memset of a 32 x i16
; alloca. Every run line expects the memset to become four 16-byte scratch
; stores addressed from s32 at offsets 256, 272, 288 and 304.
1173define void @zero_init_small_offset_foo() {
1174; GFX9-LABEL: zero_init_small_offset_foo:
1175; GFX9: ; %bb.0:
1176; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1177; GFX9-NEXT: scratch_load_dword v0, off, s32 glc
1178; GFX9-NEXT: s_waitcnt vmcnt(0)
1179; GFX9-NEXT: s_mov_b32 s0, 0
1180; GFX9-NEXT: s_mov_b32 s1, s0
1181; GFX9-NEXT: s_mov_b32 s2, s0
1182; GFX9-NEXT: s_mov_b32 s3, s0
1183; GFX9-NEXT: v_mov_b32_e32 v0, s0
1184; GFX9-NEXT: v_mov_b32_e32 v1, s1
1185; GFX9-NEXT: v_mov_b32_e32 v2, s2
1186; GFX9-NEXT: v_mov_b32_e32 v3, s3
1187; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256
1188; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272
1189; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288
1190; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304
1191; GFX9-NEXT: s_waitcnt vmcnt(0)
1192; GFX9-NEXT: s_setpc_b64 s[30:31]
1193;
1194; GFX10-LABEL: zero_init_small_offset_foo:
1195; GFX10: ; %bb.0:
1196; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1197; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1198; GFX10-NEXT: scratch_load_dword v0, off, s32 glc dlc
1199; GFX10-NEXT: s_waitcnt vmcnt(0)
1200; GFX10-NEXT: s_mov_b32 s0, 0
1201; GFX10-NEXT: s_mov_b32 s1, s0
1202; GFX10-NEXT: s_mov_b32 s2, s0
1203; GFX10-NEXT: s_mov_b32 s3, s0
1204; GFX10-NEXT: v_mov_b32_e32 v0, s0
1205; GFX10-NEXT: v_mov_b32_e32 v1, s1
1206; GFX10-NEXT: v_mov_b32_e32 v2, s2
1207; GFX10-NEXT: v_mov_b32_e32 v3, s3
1208; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256
1209; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272
1210; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288
1211; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304
1212; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1213; GFX10-NEXT: s_setpc_b64 s[30:31]
1214;
1215; GFX11-LABEL: zero_init_small_offset_foo:
1216; GFX11: ; %bb.0:
1217; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1218; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1219; GFX11-NEXT: scratch_load_b32 v0, off, s32 glc dlc
1220; GFX11-NEXT: s_waitcnt vmcnt(0)
1221; GFX11-NEXT: s_mov_b32 s0, 0
1222; GFX11-NEXT: s_mov_b32 s1, s0
1223; GFX11-NEXT: s_mov_b32 s2, s0
1224; GFX11-NEXT: s_mov_b32 s3, s0
1225; GFX11-NEXT: v_mov_b32_e32 v0, s0
1226; GFX11-NEXT: v_mov_b32_e32 v1, s1
1227; GFX11-NEXT: v_mov_b32_e32 v2, s2
1228; GFX11-NEXT: v_mov_b32_e32 v3, s3
1229; GFX11-NEXT: s_clause 0x3
1230; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:256
1231; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272
1232; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:288
1233; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304
1234; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1235; GFX11-NEXT: s_setpc_b64 s[30:31]
1236;
1237; GFX9-PAL-LABEL: zero_init_small_offset_foo:
1238; GFX9-PAL: ; %bb.0:
1239; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1240; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 glc
1241; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
1242; GFX9-PAL-NEXT: s_mov_b32 s0, 0
1243; GFX9-PAL-NEXT: s_mov_b32 s1, s0
1244; GFX9-PAL-NEXT: s_mov_b32 s2, s0
1245; GFX9-PAL-NEXT: s_mov_b32 s3, s0
1246; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0
1247; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1
1248; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2
1249; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3
1250; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256
1251; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272
1252; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288
1253; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304
1254; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
1255; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
1256;
1257; GFX940-LABEL: zero_init_small_offset_foo:
1258; GFX940: ; %bb.0:
1259; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1260; GFX940-NEXT: scratch_load_dword v0, off, s32 sc0 sc1
1261; GFX940-NEXT: s_waitcnt vmcnt(0)
1262; GFX940-NEXT: s_mov_b32 s0, 0
1263; GFX940-NEXT: s_mov_b32 s1, s0
1264; GFX940-NEXT: s_mov_b32 s2, s0
1265; GFX940-NEXT: s_mov_b32 s3, s0
1266; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
1267; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
1268; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256
1269; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272
1270; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288
1271; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304
1272; GFX940-NEXT: s_waitcnt vmcnt(0)
1273; GFX940-NEXT: s_setpc_b64 s[30:31]
1274;
1275; GFX10-PAL-LABEL: zero_init_small_offset_foo:
1276; GFX10-PAL: ; %bb.0:
1277; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1278; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
1279; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 glc dlc
1280; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
1281; GFX10-PAL-NEXT: s_mov_b32 s0, 0
1282; GFX10-PAL-NEXT: s_mov_b32 s1, s0
1283; GFX10-PAL-NEXT: s_mov_b32 s2, s0
1284; GFX10-PAL-NEXT: s_mov_b32 s3, s0
1285; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0
1286; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1
1287; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2
1288; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3
1289; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256
1290; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272
1291; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288
1292; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304
1293; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
1294; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
1295;
1296; GFX11-PAL-LABEL: zero_init_small_offset_foo:
1297; GFX11-PAL: ; %bb.0:
1298; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1299; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
1300; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s32 glc dlc
1301; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
1302; GFX11-PAL-NEXT: s_mov_b32 s0, 0
1303; GFX11-PAL-NEXT: s_mov_b32 s1, s0
1304; GFX11-PAL-NEXT: s_mov_b32 s2, s0
1305; GFX11-PAL-NEXT: s_mov_b32 s3, s0
1306; GFX11-PAL-NEXT: v_mov_b32_e32 v0, s0
1307; GFX11-PAL-NEXT: v_mov_b32_e32 v1, s1
1308; GFX11-PAL-NEXT: v_mov_b32_e32 v2, s2
1309; GFX11-PAL-NEXT: v_mov_b32_e32 v3, s3
1310; GFX11-PAL-NEXT: s_clause 0x3
1311; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:256
1312; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272
1313; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:288
1314; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304
1315; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
1316; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
1334 %padding = alloca [64 x i32], align 4, addrspace(5)
1335 %alloca = alloca [32 x i16], align 2, addrspace(5)
1336 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
1337 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
1338 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
1339 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
1340 ret void
1341}
1342
1343define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
1344; GFX9-LABEL: store_load_sindex_small_offset_kernel:
1345; GFX9: ; %bb.0: ; %bb
1346; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
1347; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
1348; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
1349; GFX9-NEXT: s_mov_b32 vcc_hi, 0
1350; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc
1351; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1352; GFX9-NEXT: s_lshl_b32 s1, s0, 2
1353; GFX9-NEXT: s_and_b32 s0, s0, 15
1354; GFX9-NEXT: v_mov_b32_e32 v0, 15
1355; GFX9-NEXT: s_addk_i32 s1, 0x104
1356; GFX9-NEXT: s_lshl_b32 s0, s0, 2
1357; GFX9-NEXT: scratch_store_dword off, v0, s1
1358; GFX9-NEXT: s_waitcnt vmcnt(0)
1359; GFX9-NEXT: s_addk_i32 s0, 0x104
1360; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
1361; GFX9-NEXT: s_waitcnt vmcnt(0)
1362; GFX9-NEXT: s_endpgm
1363;
1364; GFX10-LABEL: store_load_sindex_small_offset_kernel:
1365; GFX10: ; %bb.0: ; %bb
1366; GFX10-NEXT: s_add_u32 s2, s2, s5
1367; GFX10-NEXT: s_addc_u32 s3, s3, 0
1368; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1369; GFX10-NEXT: s_setreg_b32
hwreg(HW_REG_FLAT_SCR_HI), s3 1370; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 1371; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1372; GFX10-NEXT: s_waitcnt vmcnt(0) 1373; GFX10-NEXT: v_mov_b32_e32 v0, 15 1374; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1375; GFX10-NEXT: s_and_b32 s1, s0, 15 1376; GFX10-NEXT: s_lshl_b32 s0, s0, 2 1377; GFX10-NEXT: s_lshl_b32 s1, s1, 2 1378; GFX10-NEXT: s_addk_i32 s0, 0x104 1379; GFX10-NEXT: s_addk_i32 s1, 0x104 1380; GFX10-NEXT: scratch_store_dword off, v0, s0 1381; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1382; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc 1383; GFX10-NEXT: s_waitcnt vmcnt(0) 1384; GFX10-NEXT: s_endpgm 1385; 1386; GFX11-LABEL: store_load_sindex_small_offset_kernel: 1387; GFX11: ; %bb.0: ; %bb 1388; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 1389; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 1390; GFX11-NEXT: s_waitcnt vmcnt(0) 1391; GFX11-NEXT: v_mov_b32_e32 v0, 15 1392; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1393; GFX11-NEXT: s_and_b32 s1, s0, 15 1394; GFX11-NEXT: s_lshl_b32 s0, s0, 2 1395; GFX11-NEXT: s_lshl_b32 s1, s1, 2 1396; GFX11-NEXT: s_addk_i32 s0, 0x104 1397; GFX11-NEXT: s_addk_i32 s1, 0x104 1398; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc 1399; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1400; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc 1401; GFX11-NEXT: s_waitcnt vmcnt(0) 1402; GFX11-NEXT: s_endpgm 1403; 1404; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel: 1405; GFX9-PAL: ; %bb.0: ; %bb 1406; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 1407; GFX9-PAL-NEXT: s_mov_b32 s4, s0 1408; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1409; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1410; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 1411; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1412; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 1413; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 1414; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 1415; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 1416; 
GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1417; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 1418; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 1419; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 1420; GFX9-PAL-NEXT: s_addk_i32 s1, 0x104 1421; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 1422; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 1423; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1424; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104 1425; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 1426; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1427; GFX9-PAL-NEXT: s_endpgm 1428; 1429; GFX940-LABEL: store_load_sindex_small_offset_kernel: 1430; GFX940: ; %bb.0: ; %bb 1431; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 1432; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 1433; GFX940-NEXT: s_waitcnt vmcnt(0) 1434; GFX940-NEXT: v_mov_b32_e32 v0, 15 1435; GFX940-NEXT: s_waitcnt lgkmcnt(0) 1436; GFX940-NEXT: s_lshl_b32 s1, s0, 2 1437; GFX940-NEXT: s_and_b32 s0, s0, 15 1438; GFX940-NEXT: s_addk_i32 s1, 0x104 1439; GFX940-NEXT: s_lshl_b32 s0, s0, 2 1440; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 1441; GFX940-NEXT: s_waitcnt vmcnt(0) 1442; GFX940-NEXT: s_addk_i32 s0, 0x104 1443; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 1444; GFX940-NEXT: s_waitcnt vmcnt(0) 1445; GFX940-NEXT: s_endpgm 1446; 1447; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel: 1448; GFX1010-PAL: ; %bb.0: ; %bb 1449; GFX1010-PAL-NEXT: s_getpc_b64 s[4:5] 1450; GFX1010-PAL-NEXT: s_mov_b32 s4, s0 1451; GFX1010-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1452; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1453; GFX1010-PAL-NEXT: s_and_b32 s5, s5, 0xffff 1454; GFX1010-PAL-NEXT: s_add_u32 s4, s4, s3 1455; GFX1010-PAL-NEXT: s_addc_u32 s5, s5, 0 1456; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 1457; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 1458; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 1459; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1460; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc 1461; 
GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1462; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 1463; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1464; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 1465; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 1466; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 1467; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x104 1468; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x104 1469; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 1470; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1471; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1472; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1473; GFX1010-PAL-NEXT: s_endpgm 1474; 1475; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel: 1476; GFX1030-PAL: ; %bb.0: ; %bb 1477; GFX1030-PAL-NEXT: s_getpc_b64 s[4:5] 1478; GFX1030-PAL-NEXT: s_mov_b32 s4, s0 1479; GFX1030-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1480; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1481; GFX1030-PAL-NEXT: s_and_b32 s5, s5, 0xffff 1482; GFX1030-PAL-NEXT: s_add_u32 s4, s4, s3 1483; GFX1030-PAL-NEXT: s_addc_u32 s5, s5, 0 1484; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 1485; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 1486; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 1487; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1488; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1489; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 1490; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1491; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 1492; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 1493; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 1494; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x104 1495; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x104 1496; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 1497; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1498; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1499; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1500; GFX1030-PAL-NEXT: s_endpgm 1501; 1502; GFX11-PAL-LABEL: store_load_sindex_small_offset_kernel: 1503; GFX11-PAL: ; %bb.0: ; %bb 1504; 
GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 1505; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 1506; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 1507; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 1508; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) 1509; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 1510; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 1511; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 1512; GFX11-PAL-NEXT: s_addk_i32 s0, 0x104 1513; GFX11-PAL-NEXT: s_addk_i32 s1, 0x104 1514; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc 1515; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1516; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc 1517; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 1518; GFX11-PAL-NEXT: s_endpgm 1519bb: 1520 %padding = alloca [64 x i32], align 4, addrspace(5) 1521 %i = alloca [32 x float], align 4, addrspace(5) 1522 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 1523 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1524 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1525 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 1526 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1527 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1528 %i9 = and i32 %idx, 15 1529 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1530 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1531 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1532 ret void 1533} 1534 1535define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { 1536; GFX9-LABEL: store_load_sindex_small_offset_foo: 1537; GFX9: ; %bb.0: ; %bb 1538; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 1539; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 1540; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1541; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 1542; GFX9-NEXT: s_waitcnt vmcnt(0) 1543; GFX9-NEXT: s_lshl_b32 s0, s2, 2 1544; 
GFX9-NEXT: s_addk_i32 s0, 0x104 1545; GFX9-NEXT: v_mov_b32_e32 v0, 15 1546; GFX9-NEXT: scratch_store_dword off, v0, s0 1547; GFX9-NEXT: s_waitcnt vmcnt(0) 1548; GFX9-NEXT: s_and_b32 s0, s2, 15 1549; GFX9-NEXT: s_lshl_b32 s0, s0, 2 1550; GFX9-NEXT: s_addk_i32 s0, 0x104 1551; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 1552; GFX9-NEXT: s_waitcnt vmcnt(0) 1553; GFX9-NEXT: s_endpgm 1554; 1555; GFX10-LABEL: store_load_sindex_small_offset_foo: 1556; GFX10: ; %bb.0: ; %bb 1557; GFX10-NEXT: s_add_u32 s0, s0, s3 1558; GFX10-NEXT: s_addc_u32 s1, s1, 0 1559; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 1560; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1561; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1562; GFX10-NEXT: s_waitcnt vmcnt(0) 1563; GFX10-NEXT: v_mov_b32_e32 v0, 15 1564; GFX10-NEXT: s_and_b32 s0, s2, 15 1565; GFX10-NEXT: s_lshl_b32 s1, s2, 2 1566; GFX10-NEXT: s_lshl_b32 s0, s0, 2 1567; GFX10-NEXT: s_addk_i32 s1, 0x104 1568; GFX10-NEXT: s_addk_i32 s0, 0x104 1569; GFX10-NEXT: scratch_store_dword off, v0, s1 1570; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1571; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc 1572; GFX10-NEXT: s_waitcnt vmcnt(0) 1573; GFX10-NEXT: s_endpgm 1574; 1575; GFX11-LABEL: store_load_sindex_small_offset_foo: 1576; GFX11: ; %bb.0: ; %bb 1577; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 1578; GFX11-NEXT: s_waitcnt vmcnt(0) 1579; GFX11-NEXT: v_mov_b32_e32 v0, 15 1580; GFX11-NEXT: s_and_b32 s1, s0, 15 1581; GFX11-NEXT: s_lshl_b32 s0, s0, 2 1582; GFX11-NEXT: s_lshl_b32 s1, s1, 2 1583; GFX11-NEXT: s_addk_i32 s0, 0x104 1584; GFX11-NEXT: s_addk_i32 s1, 0x104 1585; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc 1586; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1587; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc 1588; GFX11-NEXT: s_waitcnt vmcnt(0) 1589; GFX11-NEXT: s_endpgm 1590; 1591; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo: 1592; GFX9-PAL: ; %bb.0: ; %bb 1593; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 
1594; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1595; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1596; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1597; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1598; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1599; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1600; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1601; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 1602; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1603; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 1604; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 1605; GFX9-PAL-NEXT: s_addk_i32 s1, 0x104 1606; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 1607; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 1608; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 1609; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1610; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104 1611; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 1612; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1613; GFX9-PAL-NEXT: s_endpgm 1614; 1615; GFX940-LABEL: store_load_sindex_small_offset_foo: 1616; GFX940: ; %bb.0: ; %bb 1617; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 1618; GFX940-NEXT: s_waitcnt vmcnt(0) 1619; GFX940-NEXT: s_lshl_b32 s1, s0, 2 1620; GFX940-NEXT: s_and_b32 s0, s0, 15 1621; GFX940-NEXT: s_addk_i32 s1, 0x104 1622; GFX940-NEXT: v_mov_b32_e32 v0, 15 1623; GFX940-NEXT: s_lshl_b32 s0, s0, 2 1624; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 1625; GFX940-NEXT: s_waitcnt vmcnt(0) 1626; GFX940-NEXT: s_addk_i32 s0, 0x104 1627; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 1628; GFX940-NEXT: s_waitcnt vmcnt(0) 1629; GFX940-NEXT: s_endpgm 1630; 1631; GFX1010-PAL-LABEL: store_load_sindex_small_offset_foo: 1632; GFX1010-PAL: ; %bb.0: ; %bb 1633; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 1634; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1635; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1636; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1637; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1638; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 1639; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 
1640; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1641; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1642; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1643; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 1644; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc 1645; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1646; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 1647; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 1648; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 1649; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x104 1650; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x104 1651; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 1652; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1653; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1654; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1655; GFX1010-PAL-NEXT: s_endpgm 1656; 1657; GFX1030-PAL-LABEL: store_load_sindex_small_offset_foo: 1658; GFX1030-PAL: ; %bb.0: ; %bb 1659; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 1660; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1661; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1662; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1663; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1664; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 1665; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 1666; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1667; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1668; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1669; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1670; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 1671; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 1672; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 1673; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 1674; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x104 1675; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x104 1676; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 1677; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1678; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1679; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1680; GFX1030-PAL-NEXT: 
s_endpgm 1681; 1682; GFX11-PAL-LABEL: store_load_sindex_small_offset_foo: 1683; GFX11-PAL: ; %bb.0: ; %bb 1684; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 1685; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 1686; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 1687; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 1688; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 1689; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 1690; GFX11-PAL-NEXT: s_addk_i32 s0, 0x104 1691; GFX11-PAL-NEXT: s_addk_i32 s1, 0x104 1692; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc 1693; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1694; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc 1695; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 1696; GFX11-PAL-NEXT: s_endpgm 1697bb: 1698 %padding = alloca [64 x i32], align 4, addrspace(5) 1699 %i = alloca [32 x float], align 4, addrspace(5) 1700 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 1701 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1702 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1703 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 1704 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1705 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1706 %i9 = and i32 %idx, 15 1707 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1708 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1709 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1710 ret void 1711} 1712 1713define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { 1714; GFX9-LABEL: store_load_vindex_small_offset_kernel: 1715; GFX9: ; %bb.0: ; %bb 1716; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 1717; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 1718; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1719; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc 1720; GFX9-NEXT: s_waitcnt vmcnt(0) 1721; GFX9-NEXT: v_lshlrev_b32_e32 
v0, 2, v0 1722; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v0 1723; GFX9-NEXT: v_mov_b32_e32 v2, 15 1724; GFX9-NEXT: scratch_store_dword v1, v2, off 1725; GFX9-NEXT: s_waitcnt vmcnt(0) 1726; GFX9-NEXT: v_sub_u32_e32 v0, 0x104, v0 1727; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc 1728; GFX9-NEXT: s_waitcnt vmcnt(0) 1729; GFX9-NEXT: s_endpgm 1730; 1731; GFX10-LABEL: store_load_vindex_small_offset_kernel: 1732; GFX10: ; %bb.0: ; %bb 1733; GFX10-NEXT: s_add_u32 s0, s0, s3 1734; GFX10-NEXT: s_addc_u32 s1, s1, 0 1735; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 1736; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1737; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1738; GFX10-NEXT: v_mov_b32_e32 v2, 15 1739; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc 1740; GFX10-NEXT: s_waitcnt vmcnt(0) 1741; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x104, v0 1742; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0 1743; GFX10-NEXT: scratch_store_dword v1, v2, off 1744; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1745; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 1746; GFX10-NEXT: s_waitcnt vmcnt(0) 1747; GFX10-NEXT: s_endpgm 1748; 1749; GFX11-LABEL: store_load_vindex_small_offset_kernel: 1750; GFX11: ; %bb.0: ; %bb 1751; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1752; GFX11-NEXT: v_mov_b32_e32 v1, 15 1753; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc 1754; GFX11-NEXT: s_waitcnt vmcnt(0) 1755; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 1756; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:260 dlc 1757; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1758; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc 1759; GFX11-NEXT: s_waitcnt vmcnt(0) 1760; GFX11-NEXT: s_endpgm 1761; 1762; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel: 1763; GFX9-PAL: ; %bb.0: ; %bb 1764; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1765; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1766; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1767; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1768; 
GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1769; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 1770; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1771; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1772; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1773; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1774; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc 1775; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1776; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x104, v0 1777; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off 1778; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1779; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x104, v0 1780; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc 1781; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1782; GFX9-PAL-NEXT: s_endpgm 1783; 1784; GFX940-LABEL: store_load_vindex_small_offset_kernel: 1785; GFX940: ; %bb.0: ; %bb 1786; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 1787; GFX940-NEXT: s_waitcnt vmcnt(0) 1788; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1789; GFX940-NEXT: v_mov_b32_e32 v1, 15 1790; GFX940-NEXT: scratch_store_dword v0, v1, off offset:260 sc0 sc1 1791; GFX940-NEXT: s_waitcnt vmcnt(0) 1792; GFX940-NEXT: v_sub_u32_e32 v0, 0x104, v0 1793; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 1794; GFX940-NEXT: s_waitcnt vmcnt(0) 1795; GFX940-NEXT: s_endpgm 1796; 1797; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel: 1798; GFX1010-PAL: ; %bb.0: ; %bb 1799; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 1800; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1801; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1802; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1803; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1804; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 1805; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 1806; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1807; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1808; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1809; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15 1810; GFX1010-PAL-NEXT: s_mov_b32 
vcc_lo, 0 1811; GFX1010-PAL-NEXT: scratch_load_dword v3, off, vcc_lo offset:4 glc dlc 1812; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1813; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x104, v0 1814; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0 1815; GFX1010-PAL-NEXT: scratch_store_dword v1, v2, off 1816; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1817; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 1818; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1819; GFX1010-PAL-NEXT: s_endpgm 1820; 1821; GFX1030-PAL-LABEL: store_load_vindex_small_offset_kernel: 1822; GFX1030-PAL: ; %bb.0: ; %bb 1823; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 1824; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1825; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1826; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1827; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1828; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 1829; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 1830; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1831; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1832; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1833; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15 1834; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc 1835; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1836; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x104, v0 1837; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0 1838; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off 1839; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1840; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 1841; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1842; GFX1030-PAL-NEXT: s_endpgm 1843; 1844; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel: 1845; GFX11-PAL: ; %bb.0: ; %bb 1846; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1847; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15 1848; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc 1849; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 1850; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:260 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i3 = zext i32 %i2 to i64
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Callable (non-kernel) variant of the small-offset vector-index test.
; %idx arrives in v0 (see the expected output below). The body does a volatile
; load through the [64 x i32] %padding alloca, a volatile store of 15 to
; %i[%idx], and a volatile load from %i[%idx & 15]. The expected output
; addresses %i at s32 + 0x100 (offset:256), i.e. directly after the 256-byte
; padding object.
; Only check prefixes named on a RUN line belong here: FileCheck ignores any
; other prefix, and update_llc_test_checks.py does not regenerate or remove it.
define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_small_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    scratch_load_dword v1, off, s32 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x100
; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_small_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
; GFX10-NEXT:    scratch_load_dword v3, off, s32 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
; GFX10-NEXT:    scratch_store_dword v0, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_vindex_small_offset_foo:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, 15
; GFX11-NEXT:    scratch_load_b32 v3, off, s32 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-NEXT:    scratch_store_b32 v0, v2, s32 offset:256 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x100
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_hi
; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_vindex_small_offset_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    scratch_load_dword v1, off, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX940-NEXT:    scratch_store_dword v1, v2, s32 offset:256 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:256 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x100
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x100
; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_vindex_small_offset_foo:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, s32 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, s32 offset:256 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

define amdgpu_kernel void @zero_init_large_offset_kernel() {
; GFX9-LABEL: zero_init_large_offset_kernel:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:16 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi
; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: zero_init_large_offset_kernel:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:16 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
; GFX10-NEXT:    s_movk_i32
vcc_lo, 0x4010 2066; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 2067; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 2068; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 2069; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 2070; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 2071; GFX10-NEXT: s_endpgm 2072; 2073; GFX11-LABEL: zero_init_large_offset_kernel: 2074; GFX11: ; %bb.0: 2075; GFX11-NEXT: scratch_load_b32 v0, off, off offset:16 glc dlc 2076; GFX11-NEXT: s_waitcnt vmcnt(0) 2077; GFX11-NEXT: s_mov_b32 s0, 0 2078; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4010 2079; GFX11-NEXT: s_mov_b32 s1, s0 2080; GFX11-NEXT: s_mov_b32 s2, s0 2081; GFX11-NEXT: s_mov_b32 s3, s0 2082; GFX11-NEXT: v_mov_b32_e32 v0, s0 2083; GFX11-NEXT: v_mov_b32_e32 v1, s1 2084; GFX11-NEXT: v_mov_b32_e32 v2, s2 2085; GFX11-NEXT: v_mov_b32_e32 v3, s3 2086; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo 2087; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4010 2088; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:16 2089; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4010 2090; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:32 2091; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4010 2092; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 2093; GFX11-NEXT: s_endpgm 2094; 2095; GFX9-PAL-LABEL: zero_init_large_offset_kernel: 2096; GFX9-PAL: ; %bb.0: 2097; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 2098; GFX9-PAL-NEXT: s_mov_b32 s2, s0 2099; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2100; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 2101; GFX9-PAL-NEXT: s_mov_b32 s0, 0 2102; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 2103; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2104; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 2105; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 2106; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:16 glc 2107; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2108; GFX9-PAL-NEXT: s_mov_b32 s1, s0 2109; GFX9-PAL-NEXT: s_mov_b32 s2, s0 2110; GFX9-PAL-NEXT: s_mov_b32 s3, s0 
2111; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 2112; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 2113; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 2114; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 2115; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 2116; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 2117; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 2118; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 2119; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 2120; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 2121; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 2122; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 2123; GFX9-PAL-NEXT: s_endpgm 2124; 2125; GFX940-LABEL: zero_init_large_offset_kernel: 2126; GFX940: ; %bb.0: 2127; GFX940-NEXT: scratch_load_dword v0, off, off offset:16 sc0 sc1 2128; GFX940-NEXT: s_waitcnt vmcnt(0) 2129; GFX940-NEXT: s_mov_b32 s0, 0 2130; GFX940-NEXT: s_mov_b32 s1, s0 2131; GFX940-NEXT: s_mov_b32 s2, s0 2132; GFX940-NEXT: s_mov_b32 s3, s0 2133; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2134; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] 2135; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4010 2136; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 2137; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4010 2138; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 2139; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4010 2140; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 2141; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4010 2142; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 2143; GFX940-NEXT: s_endpgm 2144; 2145; GFX1010-PAL-LABEL: zero_init_large_offset_kernel: 2146; GFX1010-PAL: ; %bb.0: 2147; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 2148; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 2149; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2150; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 2151; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2152; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 2153; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 2154; 
GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2155; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2156; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 2157; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 2158; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:16 glc dlc 2159; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2160; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 2161; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 2162; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 2163; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 2164; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 2165; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 2166; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 2167; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2168; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 2169; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 2170; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2171; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 2172; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 2173; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2174; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 2175; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 2176; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2177; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 2178; GFX1010-PAL-NEXT: s_endpgm 2179; 2180; GFX1030-PAL-LABEL: zero_init_large_offset_kernel: 2181; GFX1030-PAL: ; %bb.0: 2182; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 2183; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 2184; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2185; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 2186; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2187; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 2188; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 2189; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2190; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2191; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:16 glc dlc 2192; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2193; GFX1030-PAL-NEXT: 
s_mov_b32 s0, 0 2194; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2195; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 2196; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 2197; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 2198; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 2199; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 2200; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 2201; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 2202; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 2203; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2204; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 2205; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2206; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 2207; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2208; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 2209; GFX1030-PAL-NEXT: s_endpgm 2210; 2211; GFX11-PAL-LABEL: zero_init_large_offset_kernel: 2212; GFX11-PAL: ; %bb.0: 2213; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:16 glc dlc 2214; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2215; GFX11-PAL-NEXT: s_mov_b32 s0, 0 2216; GFX11-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2217; GFX11-PAL-NEXT: s_mov_b32 s1, s0 2218; GFX11-PAL-NEXT: s_mov_b32 s2, s0 2219; GFX11-PAL-NEXT: s_mov_b32 s3, s0 2220; GFX11-PAL-NEXT: v_mov_b32_e32 v0, s0 2221; GFX11-PAL-NEXT: v_mov_b32_e32 v1, s1 2222; GFX11-PAL-NEXT: v_mov_b32_e32 v2, s2 2223; GFX11-PAL-NEXT: v_mov_b32_e32 v3, s3 2224; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo 2225; GFX11-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2226; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:16 2227; GFX11-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2228; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:32 2229; GFX11-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2230; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 2231; GFX11-PAL-NEXT: s_endpgm 2232 %padding = alloca [4096 x i32], align 4, addrspace(5) 2233 %alloca = alloca [32 x i16], align 2, addrspace(5) 2234 %pad_gep = 
getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 2235 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 2236 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 2237 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 2238 ret void 2239} 2240 2241define void @zero_init_large_offset_foo() { 2242; GFX9-LABEL: zero_init_large_offset_foo: 2243; GFX9: ; %bb.0: 2244; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2245; GFX9-NEXT: scratch_load_dword v0, off, s32 offset:16 glc 2246; GFX9-NEXT: s_waitcnt vmcnt(0) 2247; GFX9-NEXT: s_mov_b32 s0, 0 2248; GFX9-NEXT: s_mov_b32 s1, s0 2249; GFX9-NEXT: s_mov_b32 s2, s0 2250; GFX9-NEXT: s_mov_b32 s3, s0 2251; GFX9-NEXT: v_mov_b32_e32 v0, s0 2252; GFX9-NEXT: v_mov_b32_e32 v1, s1 2253; GFX9-NEXT: v_mov_b32_e32 v2, s2 2254; GFX9-NEXT: v_mov_b32_e32 v3, s3 2255; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2256; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 2257; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2258; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 2259; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2260; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 2261; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2262; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 2263; GFX9-NEXT: s_waitcnt vmcnt(0) 2264; GFX9-NEXT: s_setpc_b64 s[30:31] 2265; 2266; GFX10-LABEL: zero_init_large_offset_foo: 2267; GFX10: ; %bb.0: 2268; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2269; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2270; GFX10-NEXT: scratch_load_dword v0, off, s32 offset:16 glc dlc 2271; GFX10-NEXT: s_waitcnt vmcnt(0) 2272; GFX10-NEXT: s_mov_b32 s0, 0 2273; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2274; GFX10-NEXT: s_mov_b32 s1, s0 2275; GFX10-NEXT: s_mov_b32 s2, s0 2276; GFX10-NEXT: s_mov_b32 s3, s0 2277; GFX10-NEXT: v_mov_b32_e32 v0, s0 2278; GFX10-NEXT: 
v_mov_b32_e32 v1, s1 2279; GFX10-NEXT: v_mov_b32_e32 v2, s2 2280; GFX10-NEXT: v_mov_b32_e32 v3, s3 2281; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 2282; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2283; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 2284; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2285; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 2286; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2287; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 2288; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2289; GFX10-NEXT: s_setpc_b64 s[30:31] 2290; 2291; GFX11-LABEL: zero_init_large_offset_foo: 2292; GFX11: ; %bb.0: 2293; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2294; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2295; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:16 glc dlc 2296; GFX11-NEXT: s_waitcnt vmcnt(0) 2297; GFX11-NEXT: s_mov_b32 s0, 0 2298; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2299; GFX11-NEXT: s_mov_b32 s1, s0 2300; GFX11-NEXT: s_mov_b32 s2, s0 2301; GFX11-NEXT: s_mov_b32 s3, s0 2302; GFX11-NEXT: v_mov_b32_e32 v0, s0 2303; GFX11-NEXT: v_mov_b32_e32 v1, s1 2304; GFX11-NEXT: v_mov_b32_e32 v2, s2 2305; GFX11-NEXT: v_mov_b32_e32 v3, s3 2306; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo 2307; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2308; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:16 2309; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2310; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:32 2311; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2312; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 2313; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2314; GFX11-NEXT: s_setpc_b64 s[30:31] 2315; 2316; GFX9-PAL-LABEL: zero_init_large_offset_foo: 2317; GFX9-PAL: ; %bb.0: 2318; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2319; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 offset:16 glc 2320; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2321; GFX9-PAL-NEXT: s_mov_b32 s0, 0 2322; 
GFX9-PAL-NEXT: s_mov_b32 s1, s0 2323; GFX9-PAL-NEXT: s_mov_b32 s2, s0 2324; GFX9-PAL-NEXT: s_mov_b32 s3, s0 2325; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 2326; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 2327; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 2328; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 2329; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2330; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 2331; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2332; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 2333; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2334; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 2335; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2336; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 2337; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2338; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 2339; 2340; GFX940-LABEL: zero_init_large_offset_foo: 2341; GFX940: ; %bb.0: 2342; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2343; GFX940-NEXT: scratch_load_dword v0, off, s32 offset:16 sc0 sc1 2344; GFX940-NEXT: s_waitcnt vmcnt(0) 2345; GFX940-NEXT: s_mov_b32 s0, 0 2346; GFX940-NEXT: s_mov_b32 s1, s0 2347; GFX940-NEXT: s_mov_b32 s2, s0 2348; GFX940-NEXT: s_mov_b32 s3, s0 2349; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2350; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] 2351; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2352; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 2353; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2354; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 2355; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2356; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 2357; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2358; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 2359; GFX940-NEXT: s_waitcnt vmcnt(0) 2360; GFX940-NEXT: s_setpc_b64 s[30:31] 2361; 2362; GFX1010-PAL-LABEL: zero_init_large_offset_foo: 2363; GFX1010-PAL: ; %bb.0: 2364; GFX1010-PAL-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 2365; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2366; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s32 offset:16 glc dlc 2367; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2368; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 2369; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2370; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 2371; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 2372; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 2373; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 2374; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 2375; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 2376; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 2377; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 2378; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 2379; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2380; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 2381; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 2382; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2383; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 2384; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 2385; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2386; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 2387; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2388; GFX1010-PAL-NEXT: s_setpc_b64 s[30:31] 2389; 2390; GFX1030-PAL-LABEL: zero_init_large_offset_foo: 2391; GFX1030-PAL: ; %bb.0: 2392; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2393; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2394; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s32 offset:16 glc dlc 2395; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2396; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 2397; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2398; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 2399; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 2400; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 2401; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 2402; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 2403; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 2404; GFX1030-PAL-NEXT: v_mov_b32_e32 
v3, s3 2405; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 2406; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2407; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 2408; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2409; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 2410; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2411; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 2412; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2413; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31] 2414; 2415; GFX11-PAL-LABEL: zero_init_large_offset_foo: 2416; GFX11-PAL: ; %bb.0: 2417; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2418; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2419; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s32 offset:16 glc dlc 2420; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2421; GFX11-PAL-NEXT: s_mov_b32 s0, 0 2422; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2423; GFX11-PAL-NEXT: s_mov_b32 s1, s0 2424; GFX11-PAL-NEXT: s_mov_b32 s2, s0 2425; GFX11-PAL-NEXT: s_mov_b32 s3, s0 2426; GFX11-PAL-NEXT: v_mov_b32_e32 v0, s0 2427; GFX11-PAL-NEXT: v_mov_b32_e32 v1, s1 2428; GFX11-PAL-NEXT: v_mov_b32_e32 v2, s2 2429; GFX11-PAL-NEXT: v_mov_b32_e32 v3, s3 2430; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo 2431; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2432; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:16 2433; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2434; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:32 2435; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2436; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 2437; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2438; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] 2439 %padding = alloca [4096 x i32], align 4, addrspace(5) 2440 %alloca = alloca [32 x i16], align 2, addrspace(5) 2441 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 2442 %pad_load 
= load volatile i32, i32 addrspace(5)* %pad_gep, align 4 2443 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 2444 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 2445 ret void 2446} 2447 2448define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { 2449; GFX9-LABEL: store_load_sindex_large_offset_kernel: 2450; GFX9: ; %bb.0: ; %bb 2451; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 2452; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 2453; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 2454; GFX9-NEXT: s_mov_b32 vcc_hi, 0 2455; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 2456; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2457; GFX9-NEXT: s_lshl_b32 s1, s0, 2 2458; GFX9-NEXT: s_and_b32 s0, s0, 15 2459; GFX9-NEXT: v_mov_b32_e32 v0, 15 2460; GFX9-NEXT: s_addk_i32 s1, 0x4004 2461; GFX9-NEXT: s_lshl_b32 s0, s0, 2 2462; GFX9-NEXT: scratch_store_dword off, v0, s1 2463; GFX9-NEXT: s_waitcnt vmcnt(0) 2464; GFX9-NEXT: s_addk_i32 s0, 0x4004 2465; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 2466; GFX9-NEXT: s_waitcnt vmcnt(0) 2467; GFX9-NEXT: s_endpgm 2468; 2469; GFX10-LABEL: store_load_sindex_large_offset_kernel: 2470; GFX10: ; %bb.0: ; %bb 2471; GFX10-NEXT: s_add_u32 s2, s2, s5 2472; GFX10-NEXT: s_addc_u32 s3, s3, 0 2473; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2474; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2475; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 2476; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 2477; GFX10-NEXT: s_waitcnt vmcnt(0) 2478; GFX10-NEXT: v_mov_b32_e32 v0, 15 2479; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2480; GFX10-NEXT: s_and_b32 s1, s0, 15 2481; GFX10-NEXT: s_lshl_b32 s0, s0, 2 2482; GFX10-NEXT: s_lshl_b32 s1, s1, 2 2483; GFX10-NEXT: s_addk_i32 s0, 0x4004 2484; GFX10-NEXT: s_addk_i32 s1, 0x4004 2485; GFX10-NEXT: scratch_store_dword off, v0, s0 2486; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2487; GFX10-NEXT: 
scratch_load_dword v0, off, s1 glc dlc 2488; GFX10-NEXT: s_waitcnt vmcnt(0) 2489; GFX10-NEXT: s_endpgm 2490; 2491; GFX11-LABEL: store_load_sindex_large_offset_kernel: 2492; GFX11: ; %bb.0: ; %bb 2493; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 2494; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 2495; GFX11-NEXT: s_waitcnt vmcnt(0) 2496; GFX11-NEXT: v_mov_b32_e32 v0, 15 2497; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2498; GFX11-NEXT: s_and_b32 s1, s0, 15 2499; GFX11-NEXT: s_lshl_b32 s0, s0, 2 2500; GFX11-NEXT: s_lshl_b32 s1, s1, 2 2501; GFX11-NEXT: s_addk_i32 s0, 0x4004 2502; GFX11-NEXT: s_addk_i32 s1, 0x4004 2503; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc 2504; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2505; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc 2506; GFX11-NEXT: s_waitcnt vmcnt(0) 2507; GFX11-NEXT: s_endpgm 2508; 2509; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel: 2510; GFX9-PAL: ; %bb.0: ; %bb 2511; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 2512; GFX9-PAL-NEXT: s_mov_b32 s4, s0 2513; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 2514; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 2515; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 2516; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 2517; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 2518; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 2519; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 2520; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 2521; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2522; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 2523; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 2524; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 2525; GFX9-PAL-NEXT: s_addk_i32 s1, 0x4004 2526; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 2527; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 2528; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2529; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004 2530; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 2531; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2532; GFX9-PAL-NEXT: s_endpgm 2533; 2534; GFX940-LABEL: 
store_load_sindex_large_offset_kernel: 2535; GFX940: ; %bb.0: ; %bb 2536; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 2537; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 2538; GFX940-NEXT: s_waitcnt vmcnt(0) 2539; GFX940-NEXT: v_mov_b32_e32 v0, 15 2540; GFX940-NEXT: s_waitcnt lgkmcnt(0) 2541; GFX940-NEXT: s_lshl_b32 s1, s0, 2 2542; GFX940-NEXT: s_and_b32 s0, s0, 15 2543; GFX940-NEXT: s_addk_i32 s1, 0x4004 2544; GFX940-NEXT: s_lshl_b32 s0, s0, 2 2545; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 2546; GFX940-NEXT: s_waitcnt vmcnt(0) 2547; GFX940-NEXT: s_addk_i32 s0, 0x4004 2548; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 2549; GFX940-NEXT: s_waitcnt vmcnt(0) 2550; GFX940-NEXT: s_endpgm 2551; 2552; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel: 2553; GFX1010-PAL: ; %bb.0: ; %bb 2554; GFX1010-PAL-NEXT: s_getpc_b64 s[4:5] 2555; GFX1010-PAL-NEXT: s_mov_b32 s4, s0 2556; GFX1010-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 2557; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 2558; GFX1010-PAL-NEXT: s_and_b32 s5, s5, 0xffff 2559; GFX1010-PAL-NEXT: s_add_u32 s4, s4, s3 2560; GFX1010-PAL-NEXT: s_addc_u32 s5, s5, 0 2561; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 2562; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 2563; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 2564; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 2565; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc 2566; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2567; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 2568; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 2569; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 2570; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 2571; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 2572; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x4004 2573; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x4004 2574; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 2575; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2576; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 2577; 
GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2578; GFX1010-PAL-NEXT: s_endpgm 2579; 2580; GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel: 2581; GFX1030-PAL: ; %bb.0: ; %bb 2582; GFX1030-PAL-NEXT: s_getpc_b64 s[4:5] 2583; GFX1030-PAL-NEXT: s_mov_b32 s4, s0 2584; GFX1030-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 2585; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 2586; GFX1030-PAL-NEXT: s_and_b32 s5, s5, 0xffff 2587; GFX1030-PAL-NEXT: s_add_u32 s4, s4, s3 2588; GFX1030-PAL-NEXT: s_addc_u32 s5, s5, 0 2589; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 2590; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 2591; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 2592; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 2593; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2594; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 2595; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 2596; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 2597; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 2598; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 2599; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x4004 2600; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x4004 2601; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 2602; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2603; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 2604; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2605; GFX1030-PAL-NEXT: s_endpgm 2606; 2607; GFX11-PAL-LABEL: store_load_sindex_large_offset_kernel: 2608; GFX11-PAL: ; %bb.0: ; %bb 2609; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 2610; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 2611; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2612; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 2613; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) 2614; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 2615; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 2616; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 2617; GFX11-PAL-NEXT: s_addk_i32 s0, 0x4004 2618; GFX11-PAL-NEXT: s_addk_i32 s1, 0x4004 2619; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc 2620; 
GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2621; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc 2622; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2623; GFX11-PAL-NEXT: s_endpgm 2624bb: 2625 %padding = alloca [4096 x i32], align 4, addrspace(5) 2626 %i = alloca [32 x float], align 4, addrspace(5) 2627 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 2628 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 2629 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 2630 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 2631 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 2632 store volatile i32 15, i32 addrspace(5)* %i8, align 4 2633 %i9 = and i32 %idx, 15 2634 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 2635 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 2636 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 2637 ret void 2638} 2639 2640define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { 2641; GFX9-LABEL: store_load_sindex_large_offset_foo: 2642; GFX9: ; %bb.0: ; %bb 2643; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 2644; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 2645; GFX9-NEXT: s_mov_b32 vcc_hi, 0 2646; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 2647; GFX9-NEXT: s_waitcnt vmcnt(0) 2648; GFX9-NEXT: s_lshl_b32 s0, s2, 2 2649; GFX9-NEXT: s_addk_i32 s0, 0x4004 2650; GFX9-NEXT: v_mov_b32_e32 v0, 15 2651; GFX9-NEXT: scratch_store_dword off, v0, s0 2652; GFX9-NEXT: s_waitcnt vmcnt(0) 2653; GFX9-NEXT: s_and_b32 s0, s2, 15 2654; GFX9-NEXT: s_lshl_b32 s0, s0, 2 2655; GFX9-NEXT: s_addk_i32 s0, 0x4004 2656; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 2657; GFX9-NEXT: s_waitcnt vmcnt(0) 2658; GFX9-NEXT: s_endpgm 2659; 2660; GFX10-LABEL: store_load_sindex_large_offset_foo: 2661; GFX10: ; %bb.0: ; %bb 2662; GFX10-NEXT: s_add_u32 s0, s0, s3 2663; 
GFX10-NEXT: s_addc_u32 s1, s1, 0 2664; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 2665; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 2666; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 2667; GFX10-NEXT: s_waitcnt vmcnt(0) 2668; GFX10-NEXT: v_mov_b32_e32 v0, 15 2669; GFX10-NEXT: s_and_b32 s0, s2, 15 2670; GFX10-NEXT: s_lshl_b32 s1, s2, 2 2671; GFX10-NEXT: s_lshl_b32 s0, s0, 2 2672; GFX10-NEXT: s_addk_i32 s1, 0x4004 2673; GFX10-NEXT: s_addk_i32 s0, 0x4004 2674; GFX10-NEXT: scratch_store_dword off, v0, s1 2675; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2676; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc 2677; GFX10-NEXT: s_waitcnt vmcnt(0) 2678; GFX10-NEXT: s_endpgm 2679; 2680; GFX11-LABEL: store_load_sindex_large_offset_foo: 2681; GFX11: ; %bb.0: ; %bb 2682; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 2683; GFX11-NEXT: s_waitcnt vmcnt(0) 2684; GFX11-NEXT: v_mov_b32_e32 v0, 15 2685; GFX11-NEXT: s_and_b32 s1, s0, 15 2686; GFX11-NEXT: s_lshl_b32 s0, s0, 2 2687; GFX11-NEXT: s_lshl_b32 s1, s1, 2 2688; GFX11-NEXT: s_addk_i32 s0, 0x4004 2689; GFX11-NEXT: s_addk_i32 s1, 0x4004 2690; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc 2691; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2692; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc 2693; GFX11-NEXT: s_waitcnt vmcnt(0) 2694; GFX11-NEXT: s_endpgm 2695; 2696; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo: 2697; GFX9-PAL: ; %bb.0: ; %bb 2698; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 2699; GFX9-PAL-NEXT: s_mov_b32 s2, s0 2700; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2701; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 2702; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 2703; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2704; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 2705; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 2706; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 2707; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2708; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 2709; GFX9-PAL-NEXT: 
s_and_b32 s0, s0, 15 2710; GFX9-PAL-NEXT: s_addk_i32 s1, 0x4004 2711; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 2712; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 2713; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 2714; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2715; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004 2716; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 2717; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2718; GFX9-PAL-NEXT: s_endpgm 2719; 2720; GFX940-LABEL: store_load_sindex_large_offset_foo: 2721; GFX940: ; %bb.0: ; %bb 2722; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 2723; GFX940-NEXT: s_waitcnt vmcnt(0) 2724; GFX940-NEXT: s_lshl_b32 s1, s0, 2 2725; GFX940-NEXT: s_and_b32 s0, s0, 15 2726; GFX940-NEXT: s_addk_i32 s1, 0x4004 2727; GFX940-NEXT: v_mov_b32_e32 v0, 15 2728; GFX940-NEXT: s_lshl_b32 s0, s0, 2 2729; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 2730; GFX940-NEXT: s_waitcnt vmcnt(0) 2731; GFX940-NEXT: s_addk_i32 s0, 0x4004 2732; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 2733; GFX940-NEXT: s_waitcnt vmcnt(0) 2734; GFX940-NEXT: s_endpgm 2735; 2736; GFX1010-PAL-LABEL: store_load_sindex_large_offset_foo: 2737; GFX1010-PAL: ; %bb.0: ; %bb 2738; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 2739; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 2740; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2741; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 2742; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2743; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 2744; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 2745; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2746; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2747; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 2748; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 2749; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc 2750; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2751; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 2752; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 2753; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 2754; GFX1010-PAL-NEXT: 
s_addk_i32 s0, 0x4004 2755; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x4004 2756; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 2757; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2758; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 2759; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2760; GFX1010-PAL-NEXT: s_endpgm 2761; 2762; GFX1030-PAL-LABEL: store_load_sindex_large_offset_foo: 2763; GFX1030-PAL: ; %bb.0: ; %bb 2764; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 2765; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 2766; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2767; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 2768; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2769; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 2770; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 2771; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2772; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2773; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 2774; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2775; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 2776; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 2777; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 2778; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 2779; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x4004 2780; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x4004 2781; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 2782; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2783; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 2784; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2785; GFX1030-PAL-NEXT: s_endpgm 2786; 2787; GFX11-PAL-LABEL: store_load_sindex_large_offset_foo: 2788; GFX11-PAL: ; %bb.0: ; %bb 2789; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 2790; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2791; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 2792; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 2793; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 2794; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 2795; GFX11-PAL-NEXT: s_addk_i32 s0, 0x4004 2796; GFX11-PAL-NEXT: s_addk_i32 s1, 0x4004 2797; GFX11-PAL-NEXT: 
scratch_store_b32 off, v0, s0 dlc 2798; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2799; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc 2800; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2801; GFX11-PAL-NEXT: s_endpgm 2802bb: 2803 %padding = alloca [4096 x i32], align 4, addrspace(5) 2804 %i = alloca [32 x float], align 4, addrspace(5) 2805 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 2806 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 2807 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 2808 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 2809 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 2810 store volatile i32 15, i32 addrspace(5)* %i8, align 4 2811 %i9 = and i32 %idx, 15 2812 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 2813 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 2814 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 2815 ret void 2816} 2817 2818define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { 2819; GFX9-LABEL: store_load_vindex_large_offset_kernel: 2820; GFX9: ; %bb.0: ; %bb 2821; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 2822; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 2823; GFX9-NEXT: s_mov_b32 vcc_hi, 0 2824; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc 2825; GFX9-NEXT: s_waitcnt vmcnt(0) 2826; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2827; GFX9-NEXT: v_add_u32_e32 v1, 0x4004, v0 2828; GFX9-NEXT: v_mov_b32_e32 v2, 15 2829; GFX9-NEXT: scratch_store_dword v1, v2, off 2830; GFX9-NEXT: s_waitcnt vmcnt(0) 2831; GFX9-NEXT: v_sub_u32_e32 v0, 0x4004, v0 2832; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc 2833; GFX9-NEXT: s_waitcnt vmcnt(0) 2834; GFX9-NEXT: s_endpgm 2835; 2836; GFX10-LABEL: store_load_vindex_large_offset_kernel: 2837; GFX10: ; %bb.0: ; %bb 2838; GFX10-NEXT: s_add_u32 s0, s0, s3 2839; 
GFX10-NEXT: s_addc_u32 s1, s1, 0 2840; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 2841; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 2842; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2843; GFX10-NEXT: v_mov_b32_e32 v2, 15 2844; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc 2845; GFX10-NEXT: s_waitcnt vmcnt(0) 2846; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 2847; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0 2848; GFX10-NEXT: scratch_store_dword v1, v2, off 2849; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2850; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 2851; GFX10-NEXT: s_waitcnt vmcnt(0) 2852; GFX10-NEXT: s_endpgm 2853; 2854; GFX11-LABEL: store_load_vindex_large_offset_kernel: 2855; GFX11: ; %bb.0: ; %bb 2856; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2857; GFX11-NEXT: v_mov_b32_e32 v1, 15 2858; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4004 2859; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc 2860; GFX11-NEXT: s_waitcnt vmcnt(0) 2861; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 2862; GFX11-NEXT: scratch_store_b32 v0, v1, vcc_lo dlc 2863; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2864; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc 2865; GFX11-NEXT: s_waitcnt vmcnt(0) 2866; GFX11-NEXT: s_endpgm 2867; 2868; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel: 2869; GFX9-PAL: ; %bb.0: ; %bb 2870; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 2871; GFX9-PAL-NEXT: s_mov_b32 s2, s0 2872; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2873; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 2874; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2875; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 2876; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 2877; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2878; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 2879; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 2880; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc 2881; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2882; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x4004, v0 2883; 
GFX9-PAL-NEXT: scratch_store_dword v1, v2, off 2884; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2885; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x4004, v0 2886; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc 2887; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2888; GFX9-PAL-NEXT: s_endpgm 2889; 2890; GFX940-LABEL: store_load_vindex_large_offset_kernel: 2891; GFX940: ; %bb.0: ; %bb 2892; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 2893; GFX940-NEXT: s_waitcnt vmcnt(0) 2894; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2895; GFX940-NEXT: v_mov_b32_e32 v1, 15 2896; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4004 2897; GFX940-NEXT: scratch_store_dword v0, v1, vcc_hi sc0 sc1 2898; GFX940-NEXT: s_waitcnt vmcnt(0) 2899; GFX940-NEXT: v_sub_u32_e32 v0, 0x4004, v0 2900; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 2901; GFX940-NEXT: s_waitcnt vmcnt(0) 2902; GFX940-NEXT: s_endpgm 2903; 2904; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel: 2905; GFX1010-PAL: ; %bb.0: ; %bb 2906; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 2907; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 2908; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2909; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 2910; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2911; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 2912; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 2913; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2914; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2915; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2916; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15 2917; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 2918; GFX1010-PAL-NEXT: scratch_load_dword v3, off, vcc_lo offset:4 glc dlc 2919; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2920; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 2921; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0 2922; GFX1010-PAL-NEXT: scratch_store_dword v1, v2, off 2923; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2924; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 
glc dlc 2925; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2926; GFX1010-PAL-NEXT: s_endpgm 2927; 2928; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel: 2929; GFX1030-PAL: ; %bb.0: ; %bb 2930; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 2931; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 2932; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2933; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 2934; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2935; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 2936; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 2937; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2938; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2939; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2940; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15 2941; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc 2942; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2943; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 2944; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0 2945; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off 2946; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2947; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 2948; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2949; GFX1030-PAL-NEXT: s_endpgm 2950; 2951; GFX11-PAL-LABEL: store_load_vindex_large_offset_kernel: 2952; GFX11-PAL: ; %bb.0: ; %bb 2953; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2954; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15 2955; GFX11-PAL-NEXT: s_movk_i32 vcc_lo, 0x4004 2956; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc 2957; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2958; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 2959; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, vcc_lo dlc 2960; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2961; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc 2962; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2963; GFX11-PAL-NEXT: s_endpgm 2964bb: 2965 %padding = alloca [4096 x i32], align 4, addrspace(5) 2966 %i = alloca [32 x float], align 4, 
addrspace(5) 2967 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 2968 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 2969 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 2970 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() 2971 %i3 = zext i32 %i2 to i64 2972 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 2973 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 2974 store volatile i32 15, i32 addrspace(5)* %i8, align 4 2975 %i9 = sub nsw i32 31, %i2 2976 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 2977 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 2978 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 2979 ret void 2980} 2981 2982define void @store_load_vindex_large_offset_foo(i32 %idx) { 2983; GFX9-LABEL: store_load_vindex_large_offset_foo: 2984; GFX9: ; %bb.0: ; %bb 2985; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2986; GFX9-NEXT: scratch_load_dword v1, off, s32 offset:4 glc 2987; GFX9-NEXT: s_waitcnt vmcnt(0) 2988; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4004 2989; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi 2990; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 2991; GFX9-NEXT: v_mov_b32_e32 v3, 15 2992; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 2993; GFX9-NEXT: scratch_store_dword v2, v3, off 2994; GFX9-NEXT: s_waitcnt vmcnt(0) 2995; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 2996; GFX9-NEXT: scratch_load_dword v0, v0, off glc 2997; GFX9-NEXT: s_waitcnt vmcnt(0) 2998; GFX9-NEXT: s_setpc_b64 s[30:31] 2999; 3000; GFX10-LABEL: store_load_vindex_large_offset_foo: 3001; GFX10: ; %bb.0: ; %bb 3002; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3003; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3004; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 3005; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 3006; GFX10-NEXT: v_mov_b32_e32 v2, 15 3007; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo 3008; 
GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 3009; GFX10-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc 3010; GFX10-NEXT: s_waitcnt vmcnt(0) 3011; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo 3012; GFX10-NEXT: scratch_store_dword v0, v2, off 3013; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3014; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc 3015; GFX10-NEXT: s_waitcnt vmcnt(0) 3016; GFX10-NEXT: s_setpc_b64 s[30:31] 3017; 3018; GFX11-LABEL: store_load_vindex_large_offset_foo: 3019; GFX11: ; %bb.0: ; %bb 3020; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3021; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3022; GFX11-NEXT: v_and_b32_e32 v1, 15, v0 3023; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3024; GFX11-NEXT: v_mov_b32_e32 v2, 15 3025; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4004 3026; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc 3027; GFX11-NEXT: s_waitcnt vmcnt(0) 3028; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 3029; GFX11-NEXT: scratch_store_b32 v0, v2, vcc_lo dlc 3030; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3031; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4004 3032; GFX11-NEXT: scratch_load_b32 v0, v1, vcc_lo glc dlc 3033; GFX11-NEXT: s_waitcnt vmcnt(0) 3034; GFX11-NEXT: s_setpc_b64 s[30:31] 3035; 3036; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo: 3037; GFX9-PAL: ; %bb.0: ; %bb 3038; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3039; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 offset:4 glc 3040; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3041; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4004 3042; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi 3043; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 3044; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 3045; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0 3046; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 3047; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3048; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 3049; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc 3050; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3051; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 
3052; 3053; GFX940-LABEL: store_load_vindex_large_offset_foo: 3054; GFX940: ; %bb.0: ; %bb 3055; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3056; GFX940-NEXT: scratch_load_dword v1, off, s32 offset:4 sc0 sc1 3057; GFX940-NEXT: s_waitcnt vmcnt(0) 3058; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 3059; GFX940-NEXT: v_mov_b32_e32 v2, 15 3060; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4004 3061; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 3062; GFX940-NEXT: scratch_store_dword v1, v2, vcc_hi sc0 sc1 3063; GFX940-NEXT: s_waitcnt vmcnt(0) 3064; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3065; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4004 3066; GFX940-NEXT: scratch_load_dword v0, v0, vcc_hi sc0 sc1 3067; GFX940-NEXT: s_waitcnt vmcnt(0) 3068; GFX940-NEXT: s_setpc_b64 s[30:31] 3069; 3070; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo: 3071; GFX10-PAL: ; %bb.0: ; %bb 3072; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3073; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3074; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0 3075; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 3076; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 3077; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo 3078; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 3079; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc 3080; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 3081; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo 3082; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off 3083; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3084; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc 3085; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 3086; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 3087; 3088; GFX11-PAL-LABEL: store_load_vindex_large_offset_foo: 3089; GFX11-PAL: ; %bb.0: ; %bb 3090; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3091; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3092; GFX11-PAL-NEXT: v_and_b32_e32 v1, 15, v0 3093; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3094; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 3095; 
GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004
; GFX11-PAL-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, vcc_lo dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004
; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, vcc_lo glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
;
; %padding is 4096 x i32 (16 KiB) and is addressed at s32 offset:4 in the
; checks above, so the base of %i is formed as s32 + 0x4004 before the
; variable index (%idx, then %idx & 15) is applied.
bb:
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
%i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 3134 ret void 3135} 3136 3137define amdgpu_kernel void @store_load_large_imm_offset_kernel() { 3138; GFX9-LABEL: store_load_large_imm_offset_kernel: 3139; GFX9: ; %bb.0: ; %bb 3140; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 3141; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 3142; GFX9-NEXT: v_mov_b32_e32 v0, 13 3143; GFX9-NEXT: s_mov_b32 vcc_hi, 0 3144; GFX9-NEXT: s_movk_i32 s0, 0x3000 3145; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 3146; GFX9-NEXT: s_waitcnt vmcnt(0) 3147; GFX9-NEXT: s_add_i32 s0, s0, 4 3148; GFX9-NEXT: v_mov_b32_e32 v0, 15 3149; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 3150; GFX9-NEXT: s_waitcnt vmcnt(0) 3151; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc 3152; GFX9-NEXT: s_waitcnt vmcnt(0) 3153; GFX9-NEXT: s_endpgm 3154; 3155; GFX10-LABEL: store_load_large_imm_offset_kernel: 3156; GFX10: ; %bb.0: ; %bb 3157; GFX10-NEXT: s_add_u32 s0, s0, s3 3158; GFX10-NEXT: s_addc_u32 s1, s1, 0 3159; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 3160; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 3161; GFX10-NEXT: v_mov_b32_e32 v0, 13 3162; GFX10-NEXT: v_mov_b32_e32 v1, 15 3163; GFX10-NEXT: s_movk_i32 s0, 0x3800 3164; GFX10-NEXT: s_add_i32 s0, s0, 4 3165; GFX10-NEXT: scratch_store_dword off, v0, off offset:4 3166; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3167; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 3168; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3169; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 3170; GFX10-NEXT: s_waitcnt vmcnt(0) 3171; GFX10-NEXT: s_endpgm 3172; 3173; GFX11-LABEL: store_load_large_imm_offset_kernel: 3174; GFX11: ; %bb.0: ; %bb 3175; GFX11-NEXT: v_mov_b32_e32 v0, 13 3176; GFX11-NEXT: v_mov_b32_e32 v1, 0x3000 3177; GFX11-NEXT: v_mov_b32_e32 v2, 15 3178; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc 3179; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3180; GFX11-NEXT: scratch_store_b32 v1, v2, off 
offset:3716 dlc 3181; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3182; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:3716 glc dlc 3183; GFX11-NEXT: s_waitcnt vmcnt(0) 3184; GFX11-NEXT: s_endpgm 3185; 3186; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel: 3187; GFX9-PAL: ; %bb.0: ; %bb 3188; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 3189; GFX9-PAL-NEXT: s_mov_b32 s2, s0 3190; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 3191; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 3192; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 3193; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 3194; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 3195; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 3196; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 3197; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 3198; GFX9-PAL-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 3199; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3200; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4 3201; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 3202; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 3203; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3204; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc 3205; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3206; GFX9-PAL-NEXT: s_endpgm 3207; 3208; GFX940-LABEL: store_load_large_imm_offset_kernel: 3209; GFX940: ; %bb.0: ; %bb 3210; GFX940-NEXT: v_mov_b32_e32 v0, 13 3211; GFX940-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1 3212; GFX940-NEXT: s_waitcnt vmcnt(0) 3213; GFX940-NEXT: v_mov_b32_e32 v0, 0x3000 3214; GFX940-NEXT: v_mov_b32_e32 v1, 15 3215; GFX940-NEXT: scratch_store_dword v0, v1, off offset:3716 sc0 sc1 3216; GFX940-NEXT: s_waitcnt vmcnt(0) 3217; GFX940-NEXT: scratch_load_dword v0, v0, off offset:3716 sc0 sc1 3218; GFX940-NEXT: s_waitcnt vmcnt(0) 3219; GFX940-NEXT: s_endpgm 3220; 3221; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel: 3222; GFX1010-PAL: ; %bb.0: ; %bb 3223; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 3224; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 3225; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 3226; 
GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 3227; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 3228; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 3229; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 3230; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 3231; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 3232; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 13 3233; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 15 3234; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3800 3235; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 3236; GFX1010-PAL-NEXT: s_add_i32 s0, s0, 4 3237; GFX1010-PAL-NEXT: scratch_store_dword off, v0, vcc_lo offset:4 3238; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3239; GFX1010-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 3240; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3241; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 3242; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 3243; GFX1010-PAL-NEXT: s_endpgm 3244; 3245; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel: 3246; GFX1030-PAL: ; %bb.0: ; %bb 3247; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 3248; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 3249; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 3250; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 3251; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 3252; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 3253; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 3254; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 3255; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 3256; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 13 3257; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 15 3258; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3800 3259; GFX1030-PAL-NEXT: s_add_i32 s0, s0, 4 3260; GFX1030-PAL-NEXT: scratch_store_dword off, v0, off offset:4 3261; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3262; GFX1030-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 3263; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3264; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 3265; GFX1030-PAL-NEXT: 
s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT: s_endpgm
;
; GFX11-PAL-LABEL: store_load_large_imm_offset_kernel:
; GFX11-PAL: ; %bb.0: ; %bb
; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 13
; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0x3000
; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX11-PAL-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_store_b32 v1, v2, off offset:3716 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, off offset:3716 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_endpgm
;
; %i is a 4096 x i32 scratch array. 13 is stored through an undef index, then
; element 4000 (16000 bytes into %i) is stored to and reloaded. In the checks
; that constant is split between a register and the offset: field, e.g.
; 0x3000 + offset:3716 = 16004 for GFX11-PAL, with the array itself at offset 4.
bb:
  %i = alloca [4096 x i32], align 4, addrspace(5)
  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
  ret void
}

define void @store_load_large_imm_offset_foo() {
; GFX9-LABEL: store_load_large_imm_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 13
; GFX9-NEXT: s_movk_i32 s0, 0x3000
; GFX9-NEXT: s_add_i32 vcc_hi, s32, 4
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_add_i32 s0, s0, vcc_hi
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_large_imm_offset_foo:
; GFX10: ;
%bb.0: ; %bb 3310; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3311; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3312; GFX10-NEXT: v_mov_b32_e32 v0, 13 3313; GFX10-NEXT: v_mov_b32_e32 v1, 15 3314; GFX10-NEXT: s_movk_i32 s0, 0x3800 3315; GFX10-NEXT: s_add_i32 vcc_lo, s32, 4 3316; GFX10-NEXT: s_add_i32 s0, s0, vcc_lo 3317; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4 3318; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3319; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 3320; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3321; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 3322; GFX10-NEXT: s_waitcnt vmcnt(0) 3323; GFX10-NEXT: s_setpc_b64 s[30:31] 3324; 3325; GFX11-LABEL: store_load_large_imm_offset_foo: 3326; GFX11: ; %bb.0: ; %bb 3327; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3328; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3329; GFX11-NEXT: v_mov_b32_e32 v0, 13 3330; GFX11-NEXT: v_mov_b32_e32 v1, 0x3000 3331; GFX11-NEXT: v_mov_b32_e32 v2, 15 3332; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc 3333; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3334; GFX11-NEXT: scratch_store_b32 v1, v2, s32 offset:3716 dlc 3335; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3336; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:3716 glc dlc 3337; GFX11-NEXT: s_waitcnt vmcnt(0) 3338; GFX11-NEXT: s_setpc_b64 s[30:31] 3339; 3340; GFX9-PAL-LABEL: store_load_large_imm_offset_foo: 3341; GFX9-PAL: ; %bb.0: ; %bb 3342; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3343; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 3344; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 3345; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 4 3346; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4 3347; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3348; GFX9-PAL-NEXT: s_add_i32 s0, s0, vcc_hi 3349; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 3350; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 3351; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3352; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc 3353; 
GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3354; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 3355; 3356; GFX940-LABEL: store_load_large_imm_offset_foo: 3357; GFX940: ; %bb.0: ; %bb 3358; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3359; GFX940-NEXT: v_mov_b32_e32 v0, 13 3360; GFX940-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1 3361; GFX940-NEXT: s_waitcnt vmcnt(0) 3362; GFX940-NEXT: v_mov_b32_e32 v0, 0x3000 3363; GFX940-NEXT: v_mov_b32_e32 v1, 15 3364; GFX940-NEXT: scratch_store_dword v0, v1, s32 offset:3716 sc0 sc1 3365; GFX940-NEXT: s_waitcnt vmcnt(0) 3366; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:3716 sc0 sc1 3367; GFX940-NEXT: s_waitcnt vmcnt(0) 3368; GFX940-NEXT: s_setpc_b64 s[30:31] 3369; 3370; GFX10-PAL-LABEL: store_load_large_imm_offset_foo: 3371; GFX10-PAL: ; %bb.0: ; %bb 3372; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3373; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3374; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13 3375; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 3376; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800 3377; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 4 3378; GFX10-PAL-NEXT: s_add_i32 s0, s0, vcc_lo 3379; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4 3380; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3381; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 3382; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3383; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 3384; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 3385; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 3386; 3387; GFX11-PAL-LABEL: store_load_large_imm_offset_foo: 3388; GFX11-PAL: ; %bb.0: ; %bb 3389; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3390; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3391; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 13 3392; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0x3000 3393; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 3394; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc 3395; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3396; GFX11-PAL-NEXT: 
scratch_store_b32 v1, v2, s32 offset:3716 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:3716 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
;
; %i is a 4096 x i32 scratch array; element 4000 is 16000 bytes into it. The
; checks above address the frame through s32 and split that constant between
; a register (0x3000 or 0x3800) and the instruction's offset: field.
bb:
  %i = alloca [4096 x i32], align 4, addrspace(5)
  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
  ret void
}

define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX9-LABEL: store_load_vidx_sidx_offset:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT: v_mov_b32_e32 v1, 4
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 15
; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT:
scratch_load_dword v0, v0, off offset:1024 glc 3439; GFX9-NEXT: s_waitcnt vmcnt(0) 3440; GFX9-NEXT: s_endpgm 3441; 3442; GFX10-LABEL: store_load_vidx_sidx_offset: 3443; GFX10: ; %bb.0: ; %bb 3444; GFX10-NEXT: s_add_u32 s2, s2, s5 3445; GFX10-NEXT: s_addc_u32 s3, s3, 0 3446; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 3447; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 3448; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 3449; GFX10-NEXT: v_mov_b32_e32 v1, 15 3450; GFX10-NEXT: s_waitcnt lgkmcnt(0) 3451; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 3452; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, 4 3453; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024 3454; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3455; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc 3456; GFX10-NEXT: s_waitcnt vmcnt(0) 3457; GFX10-NEXT: s_endpgm 3458; 3459; GFX11-LABEL: store_load_vidx_sidx_offset: 3460; GFX11: ; %bb.0: ; %bb 3461; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 3462; GFX11-NEXT: v_mov_b32_e32 v1, 15 3463; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3464; GFX11-NEXT: v_add_lshl_u32 v0, s0, v0, 2 3465; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:1028 dlc 3466; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3467; GFX11-NEXT: scratch_load_b32 v0, v0, off offset:1028 glc dlc 3468; GFX11-NEXT: s_waitcnt vmcnt(0) 3469; GFX11-NEXT: s_endpgm 3470; 3471; GFX9-PAL-LABEL: store_load_vidx_sidx_offset: 3472; GFX9-PAL: ; %bb.0: ; %bb 3473; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 3474; GFX9-PAL-NEXT: s_mov_b32 s4, s0 3475; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 3476; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4 3477; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 3478; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 3479; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 3480; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 3481; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0 3482; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 3483; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 3484; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 3485; 
GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 3486; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3487; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc 3488; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3489; GFX9-PAL-NEXT: s_endpgm 3490; 3491; GFX940-LABEL: store_load_vidx_sidx_offset: 3492; GFX940: ; %bb.0: ; %bb 3493; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 3494; GFX940-NEXT: v_mov_b32_e32 v1, 15 3495; GFX940-NEXT: s_waitcnt lgkmcnt(0) 3496; GFX940-NEXT: v_add_lshl_u32 v0, s0, v0, 2 3497; GFX940-NEXT: scratch_store_dword v0, v1, off offset:1028 sc0 sc1 3498; GFX940-NEXT: s_waitcnt vmcnt(0) 3499; GFX940-NEXT: scratch_load_dword v0, v0, off offset:1028 sc0 sc1 3500; GFX940-NEXT: s_waitcnt vmcnt(0) 3501; GFX940-NEXT: s_endpgm 3502; 3503; GFX10-PAL-LABEL: store_load_vidx_sidx_offset: 3504; GFX10-PAL: ; %bb.0: ; %bb 3505; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] 3506; GFX10-PAL-NEXT: s_mov_b32 s4, s0 3507; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 3508; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 3509; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff 3510; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 3511; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 3512; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 3513; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 3514; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 3515; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 3516; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 3517; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, s0, v0 3518; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 4 3519; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 3520; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3521; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc 3522; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 3523; GFX10-PAL-NEXT: s_endpgm 3524; 3525; GFX11-PAL-LABEL: store_load_vidx_sidx_offset: 3526; GFX11-PAL: ; %bb.0: ; %bb 3527; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 3528; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15 3529; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) 
; GFX11-PAL-NEXT: v_add_lshl_u32 v0, s0, v0, 2
; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1028 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1028 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_endpgm
;
; The element index is %sidx + workitem.id.x + 256. The checks above fold the
; constant 256 elements (1024 bytes) into the offset: field; targets showing
; offset:1028 also fold in the extra 4 that GFX9/GFX10 add via v_lshl_add_u32.
bb:
  %alloca = alloca [32 x i32], align 4, addrspace(5)
  %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
  %add1 = add nsw i32 %sidx, %vidx
  %add2 = add nsw i32 %add1, 256
  %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2
  store volatile i32 15, i32 addrspace(5)* %gep, align 4
  %load = load volatile i32, i32 addrspace(5)* %gep, align 4
  ret void
}

define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_aligned:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 15
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_aligned:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v1, 15
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT:
scratch_store_dwordx2 v0, v[1:2], off 3577; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3578; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc 3579; GFX10-NEXT: s_waitcnt vmcnt(0) 3580; GFX10-NEXT: s_setpc_b64 s[30:31] 3581; 3582; GFX11-LABEL: store_load_i64_aligned: 3583; GFX11: ; %bb.0: ; %bb 3584; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3585; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3586; GFX11-NEXT: v_mov_b32_e32 v1, 15 3587; GFX11-NEXT: v_mov_b32_e32 v2, 0 3588; GFX11-NEXT: scratch_store_b64 v0, v[1:2], off dlc 3589; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3590; GFX11-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc 3591; GFX11-NEXT: s_waitcnt vmcnt(0) 3592; GFX11-NEXT: s_setpc_b64 s[30:31] 3593; 3594; GFX9-PAL-LABEL: store_load_i64_aligned: 3595; GFX9-PAL: ; %bb.0: ; %bb 3596; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3597; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 3598; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 0 3599; GFX9-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off 3600; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3601; GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc 3602; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3603; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 3604; 3605; GFX940-LABEL: store_load_i64_aligned: 3606; GFX940: ; %bb.0: ; %bb 3607; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3608; GFX940-NEXT: v_mov_b32_e32 v2, 15 3609; GFX940-NEXT: v_mov_b32_e32 v3, 0 3610; GFX940-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 3611; GFX940-NEXT: s_waitcnt vmcnt(0) 3612; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 3613; GFX940-NEXT: s_waitcnt vmcnt(0) 3614; GFX940-NEXT: s_setpc_b64 s[30:31] 3615; 3616; GFX10-PAL-LABEL: store_load_i64_aligned: 3617; GFX10-PAL: ; %bb.0: ; %bb 3618; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3619; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3620; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 3621; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 0 3622; GFX10-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off 3623; 
GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_i64_aligned:
; GFX11-PAL: ; %bb.0: ; %bb
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-PAL-NEXT: scratch_store_b64 v0, v[1:2], off dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
;
; Volatile i64 store/load through %arg with align 8: the checks above use a
; single 64-bit scratch access (dwordx2 / b64) for each.
bb:
  store volatile i64 15, i64 addrspace(5)* %arg, align 8
  %load = load volatile i64, i64 addrspace(5)* %arg, align 8
  ret void
}

define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_unaligned:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 15
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_unaligned:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
3671; GFX10-NEXT: v_mov_b32_e32 v1, 15 3672; GFX10-NEXT: v_mov_b32_e32 v2, 0 3673; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off 3674; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3675; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc 3676; GFX10-NEXT: s_waitcnt vmcnt(0) 3677; GFX10-NEXT: s_setpc_b64 s[30:31] 3678; 3679; GFX11-LABEL: store_load_i64_unaligned: 3680; GFX11: ; %bb.0: ; %bb 3681; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3682; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3683; GFX11-NEXT: v_mov_b32_e32 v1, 15 3684; GFX11-NEXT: v_mov_b32_e32 v2, 0 3685; GFX11-NEXT: scratch_store_b64 v0, v[1:2], off dlc 3686; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3687; GFX11-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc 3688; GFX11-NEXT: s_waitcnt vmcnt(0) 3689; GFX11-NEXT: s_setpc_b64 s[30:31] 3690; 3691; GFX9-PAL-LABEL: store_load_i64_unaligned: 3692; GFX9-PAL: ; %bb.0: ; %bb 3693; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3694; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 3695; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 0 3696; GFX9-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off 3697; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3698; GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc 3699; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3700; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 3701; 3702; GFX940-LABEL: store_load_i64_unaligned: 3703; GFX940: ; %bb.0: ; %bb 3704; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3705; GFX940-NEXT: v_mov_b32_e32 v2, 15 3706; GFX940-NEXT: v_mov_b32_e32 v3, 0 3707; GFX940-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 3708; GFX940-NEXT: s_waitcnt vmcnt(0) 3709; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 3710; GFX940-NEXT: s_waitcnt vmcnt(0) 3711; GFX940-NEXT: s_setpc_b64 s[30:31] 3712; 3713; GFX10-PAL-LABEL: store_load_i64_unaligned: 3714; GFX10-PAL: ; %bb.0: ; %bb 3715; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3716; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3717; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 3718; 
GFX10-PAL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_i64_unaligned:
; GFX11-PAL: ; %bb.0: ; %bb
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-PAL-NEXT: scratch_store_b64 v0, v[1:2], off dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
;
; Volatile i64 store/load through %arg with align 1: the checks above still
; use a single 64-bit scratch access (dwordx2 / b64) for each.
bb:
  store volatile i64 15, i64 addrspace(5)* %arg, align 1
  %load = load volatile i64, i64 addrspace(5)* %arg, align 1
  ret void
}

define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v3i32_unaligned:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 1
; GFX9-NEXT: v_mov_b32_e32 v2, 2
; GFX9-NEXT: v_mov_b32_e32 v3, 3
; GFX9-NEXT: scratch_store_dwordx3 v0, v[1:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL:
store_load_v3i32_unaligned: 3766; GFX10: ; %bb.0: ; %bb 3767; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3768; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3769; GFX10-NEXT: v_mov_b32_e32 v1, 1 3770; GFX10-NEXT: v_mov_b32_e32 v2, 2 3771; GFX10-NEXT: v_mov_b32_e32 v3, 3 3772; GFX10-NEXT: scratch_store_dwordx3 v0, v[1:3], off 3773; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3774; GFX10-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc dlc 3775; GFX10-NEXT: s_waitcnt vmcnt(0) 3776; GFX10-NEXT: s_setpc_b64 s[30:31] 3777; 3778; GFX11-LABEL: store_load_v3i32_unaligned: 3779; GFX11: ; %bb.0: ; %bb 3780; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3781; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3782; GFX11-NEXT: v_mov_b32_e32 v1, 1 3783; GFX11-NEXT: v_mov_b32_e32 v2, 2 3784; GFX11-NEXT: v_mov_b32_e32 v3, 3 3785; GFX11-NEXT: scratch_store_b96 v0, v[1:3], off dlc 3786; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3787; GFX11-NEXT: scratch_load_b96 v[0:2], v0, off glc dlc 3788; GFX11-NEXT: s_waitcnt vmcnt(0) 3789; GFX11-NEXT: s_setpc_b64 s[30:31] 3790; 3791; GFX9-PAL-LABEL: store_load_v3i32_unaligned: 3792; GFX9-PAL: ; %bb.0: ; %bb 3793; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3794; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 3795; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 2 3796; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 3 3797; GFX9-PAL-NEXT: scratch_store_dwordx3 v0, v[1:3], off 3798; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3799; GFX9-PAL-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc 3800; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3801; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 3802; 3803; GFX940-LABEL: store_load_v3i32_unaligned: 3804; GFX940: ; %bb.0: ; %bb 3805; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3806; GFX940-NEXT: v_mov_b32_e32 v2, 1 3807; GFX940-NEXT: v_mov_b32_e32 v3, 2 3808; GFX940-NEXT: v_mov_b32_e32 v4, 3 3809; GFX940-NEXT: scratch_store_dwordx3 v0, v[2:4], off sc0 sc1 3810; GFX940-NEXT: s_waitcnt vmcnt(0) 3811; GFX940-NEXT: scratch_load_dwordx3 v[0:2], v0, off sc0 sc1 3812; 
GFX940-NEXT: s_waitcnt vmcnt(0) 3813; GFX940-NEXT: s_setpc_b64 s[30:31] 3814; 3815; GFX10-PAL-LABEL: store_load_v3i32_unaligned: 3816; GFX10-PAL: ; %bb.0: ; %bb 3817; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3818; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3819; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 1 3820; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 2 3821; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 3 3822; GFX10-PAL-NEXT: scratch_store_dwordx3 v0, v[1:3], off 3823; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3824; GFX10-PAL-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc dlc 3825; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 3826; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 3827; 3828; GFX11-PAL-LABEL: store_load_v3i32_unaligned: 3829; GFX11-PAL: ; %bb.0: ; %bb 3830; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3831; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3832; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 1 3833; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 2 3834; GFX11-PAL-NEXT: v_mov_b32_e32 v3, 3 3835; GFX11-PAL-NEXT: scratch_store_b96 v0, v[1:3], off dlc 3836; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3837; GFX11-PAL-NEXT: scratch_load_b96 v[0:2], v0, off glc dlc 3838; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 3839; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] 3840; GCN-LABEL: store_load_v3i32_unaligned: 3841; GCN: ; %bb.0: ; %bb 3842; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3843; GCN-NEXT: v_mov_b32_e32 v2, 1 3844; GCN-NEXT: v_mov_b32_e32 v3, 2 3845; GCN-NEXT: v_mov_b32_e32 v4, 3 3846; GCN-NEXT: scratch_store_dwordx3 v0, v[2:4], off sc0 sc1 3847; GCN-NEXT: s_waitcnt vmcnt(0) 3848; GCN-NEXT: scratch_load_dwordx3 v[0:2], v0, off sc0 sc1 3849; GCN-NEXT: s_waitcnt vmcnt(0) 3850; GCN-NEXT: s_setpc_b64 s[30:31] 3851bb: 3852 store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1 3853 %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1 3854 ret void 3855} 3856 3857define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) { 3858; 
GFX9-LABEL: store_load_v4i32_unaligned: 3859; GFX9: ; %bb.0: ; %bb 3860; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3861; GFX9-NEXT: v_mov_b32_e32 v1, 1 3862; GFX9-NEXT: v_mov_b32_e32 v2, 2 3863; GFX9-NEXT: v_mov_b32_e32 v3, 3 3864; GFX9-NEXT: v_mov_b32_e32 v4, 4 3865; GFX9-NEXT: scratch_store_dwordx4 v0, v[1:4], off 3866; GFX9-NEXT: s_waitcnt vmcnt(0) 3867; GFX9-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc 3868; GFX9-NEXT: s_waitcnt vmcnt(0) 3869; GFX9-NEXT: s_setpc_b64 s[30:31] 3870; 3871; GFX10-LABEL: store_load_v4i32_unaligned: 3872; GFX10: ; %bb.0: ; %bb 3873; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3874; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3875; GFX10-NEXT: v_mov_b32_e32 v1, 1 3876; GFX10-NEXT: v_mov_b32_e32 v2, 2 3877; GFX10-NEXT: v_mov_b32_e32 v3, 3 3878; GFX10-NEXT: v_mov_b32_e32 v4, 4 3879; GFX10-NEXT: scratch_store_dwordx4 v0, v[1:4], off 3880; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3881; GFX10-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc dlc 3882; GFX10-NEXT: s_waitcnt vmcnt(0) 3883; GFX10-NEXT: s_setpc_b64 s[30:31] 3884; 3885; GFX11-LABEL: store_load_v4i32_unaligned: 3886; GFX11: ; %bb.0: ; %bb 3887; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3888; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3889; GFX11-NEXT: v_mov_b32_e32 v1, 1 3890; GFX11-NEXT: v_mov_b32_e32 v2, 2 3891; GFX11-NEXT: v_mov_b32_e32 v3, 3 3892; GFX11-NEXT: v_mov_b32_e32 v4, 4 3893; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off dlc 3894; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3895; GFX11-NEXT: scratch_load_b128 v[0:3], v0, off glc dlc 3896; GFX11-NEXT: s_waitcnt vmcnt(0) 3897; GFX11-NEXT: s_setpc_b64 s[30:31] 3898; 3899; GFX9-PAL-LABEL: store_load_v4i32_unaligned: 3900; GFX9-PAL: ; %bb.0: ; %bb 3901; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3902; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 3903; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 2 3904; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 3 3905; GFX9-PAL-NEXT: v_mov_b32_e32 v4, 4 3906; GFX9-PAL-NEXT: scratch_store_dwordx4 v0, 
v[1:4], off 3907; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3908; GFX9-PAL-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc 3909; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3910; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 3911; 3912; GFX940-LABEL: store_load_v4i32_unaligned: 3913; GFX940: ; %bb.0: ; %bb 3914; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3915; GFX940-NEXT: v_mov_b32_e32 v2, 1 3916; GFX940-NEXT: v_mov_b32_e32 v3, 2 3917; GFX940-NEXT: v_mov_b32_e32 v4, 3 3918; GFX940-NEXT: v_mov_b32_e32 v5, 4 3919; GFX940-NEXT: scratch_store_dwordx4 v0, v[2:5], off sc0 sc1 3920; GFX940-NEXT: s_waitcnt vmcnt(0) 3921; GFX940-NEXT: scratch_load_dwordx4 v[0:3], v0, off sc0 sc1 3922; GFX940-NEXT: s_waitcnt vmcnt(0) 3923; GFX940-NEXT: s_setpc_b64 s[30:31] 3924; 3925; GFX10-PAL-LABEL: store_load_v4i32_unaligned: 3926; GFX10-PAL: ; %bb.0: ; %bb 3927; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3928; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3929; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 1 3930; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 2 3931; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 3 3932; GFX10-PAL-NEXT: v_mov_b32_e32 v4, 4 3933; GFX10-PAL-NEXT: scratch_store_dwordx4 v0, v[1:4], off 3934; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3935; GFX10-PAL-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc dlc 3936; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 3937; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 3938; 3939; GFX11-PAL-LABEL: store_load_v4i32_unaligned: 3940; GFX11-PAL: ; %bb.0: ; %bb 3941; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3942; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3943; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 1 3944; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 2 3945; GFX11-PAL-NEXT: v_mov_b32_e32 v3, 3 3946; GFX11-PAL-NEXT: v_mov_b32_e32 v4, 4 3947; GFX11-PAL-NEXT: scratch_store_b128 v0, v[1:4], off dlc 3948; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3949; GFX11-PAL-NEXT: scratch_load_b128 v[0:3], v0, off glc dlc 3950; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 3951; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] 3952; 
GCN-LABEL: store_load_v4i32_unaligned: 3953; GCN: ; %bb.0: ; %bb 3954; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3955; GCN-NEXT: v_mov_b32_e32 v2, 1 3956; GCN-NEXT: v_mov_b32_e32 v3, 2 3957; GCN-NEXT: v_mov_b32_e32 v4, 3 3958; GCN-NEXT: v_mov_b32_e32 v5, 4 3959; GCN-NEXT: scratch_store_dwordx4 v0, v[2:5], off sc0 sc1 3960; GCN-NEXT: s_waitcnt vmcnt(0) 3961; GCN-NEXT: scratch_load_dwordx4 v[0:3], v0, off sc0 sc1 3962; GCN-NEXT: s_waitcnt vmcnt(0) 3963; GCN-NEXT: s_setpc_b64 s[30:31] 3964bb: 3965 store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1 3966 %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1 3967 ret void 3968} 3969 3970define void @store_load_i32_negative_unaligned(i8 addrspace(5)* nocapture %arg) { 3971; GFX9-LABEL: store_load_i32_negative_unaligned: 3972; GFX9: ; %bb.0: ; %bb 3973; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3974; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 3975; GFX9-NEXT: v_mov_b32_e32 v1, 1 3976; GFX9-NEXT: scratch_store_byte v0, v1, off 3977; GFX9-NEXT: s_waitcnt vmcnt(0) 3978; GFX9-NEXT: scratch_load_ubyte v0, v0, off glc 3979; GFX9-NEXT: s_waitcnt vmcnt(0) 3980; GFX9-NEXT: s_setpc_b64 s[30:31] 3981; 3982; GFX10-LABEL: store_load_i32_negative_unaligned: 3983; GFX10: ; %bb.0: ; %bb 3984; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3985; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3986; GFX10-NEXT: v_mov_b32_e32 v1, 1 3987; GFX10-NEXT: scratch_store_byte v0, v1, off offset:-1 3988; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3989; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:-1 glc dlc 3990; GFX10-NEXT: s_waitcnt vmcnt(0) 3991; GFX10-NEXT: s_setpc_b64 s[30:31] 3992; 3993; GFX11-LABEL: store_load_i32_negative_unaligned: 3994; GFX11: ; %bb.0: ; %bb 3995; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3996; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3997; GFX11-NEXT: v_mov_b32_e32 v1, 1 3998; GFX11-NEXT: scratch_store_b8 v0, v1, off offset:-1 dlc 3999; GFX11-NEXT: 
s_waitcnt_vscnt null, 0x0 4000; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:-1 glc dlc 4001; GFX11-NEXT: s_waitcnt vmcnt(0) 4002; GFX11-NEXT: s_setpc_b64 s[30:31] 4003; 4004; GFX9-PAL-LABEL: store_load_i32_negative_unaligned: 4005; GFX9-PAL: ; %bb.0: ; %bb 4006; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4007; GFX9-PAL-NEXT: v_add_u32_e32 v0, -1, v0 4008; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 4009; GFX9-PAL-NEXT: scratch_store_byte v0, v1, off 4010; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4011; GFX9-PAL-NEXT: scratch_load_ubyte v0, v0, off glc 4012; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4013; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 4014; 4015; GFX940-LABEL: store_load_i32_negative_unaligned: 4016; GFX940: ; %bb.0: ; %bb 4017; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4018; GFX940-NEXT: v_add_u32_e32 v0, -1, v0 4019; GFX940-NEXT: v_mov_b32_e32 v1, 1 4020; GFX940-NEXT: scratch_store_byte v0, v1, off sc0 sc1 4021; GFX940-NEXT: s_waitcnt vmcnt(0) 4022; GFX940-NEXT: scratch_load_ubyte v0, v0, off sc0 sc1 4023; GFX940-NEXT: s_waitcnt vmcnt(0) 4024; GFX940-NEXT: s_setpc_b64 s[30:31] 4025; 4026; GFX1010-PAL-LABEL: store_load_i32_negative_unaligned: 4027; GFX1010-PAL: ; %bb.0: ; %bb 4028; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4029; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4030; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v0, -1, v0 4031; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 1 4032; GFX1010-PAL-NEXT: scratch_store_byte v0, v1, off 4033; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4034; GFX1010-PAL-NEXT: scratch_load_ubyte v0, v0, off glc dlc 4035; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 4036; GFX1010-PAL-NEXT: s_setpc_b64 s[30:31] 4037; 4038; GFX1030-PAL-LABEL: store_load_i32_negative_unaligned: 4039; GFX1030-PAL: ; %bb.0: ; %bb 4040; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4041; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4042; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 1 4043; GFX1030-PAL-NEXT: scratch_store_byte v0, v1, off offset:-1 4044; 
GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4045; GFX1030-PAL-NEXT: scratch_load_ubyte v0, v0, off offset:-1 glc dlc 4046; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 4047; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31] 4048; 4049; GFX11-PAL-LABEL: store_load_i32_negative_unaligned: 4050; GFX11-PAL: ; %bb.0: ; %bb 4051; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4052; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4053; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 1 4054; GFX11-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-1 dlc 4055; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4056; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-1 glc dlc 4057; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 4058; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] 4059bb: 4060 %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -1 4061 store volatile i8 1, i8 addrspace(5)* %ptr, align 1 4062 %load = load volatile i8, i8 addrspace(5)* %ptr, align 1 4063 ret void 4064} 4065 4066define void @store_load_i32_large_negative_unaligned(i8 addrspace(5)* nocapture %arg) { 4067; GFX9-LABEL: store_load_i32_large_negative_unaligned: 4068; GFX9: ; %bb.0: ; %bb 4069; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4070; GFX9-NEXT: v_add_u32_e32 v0, 0xffffef7f, v0 4071; GFX9-NEXT: v_mov_b32_e32 v1, 1 4072; GFX9-NEXT: scratch_store_byte v0, v1, off 4073; GFX9-NEXT: s_waitcnt vmcnt(0) 4074; GFX9-NEXT: scratch_load_ubyte v0, v0, off glc 4075; GFX9-NEXT: s_waitcnt vmcnt(0) 4076; GFX9-NEXT: s_setpc_b64 s[30:31] 4077; 4078; GFX10-LABEL: store_load_i32_large_negative_unaligned: 4079; GFX10: ; %bb.0: ; %bb 4080; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4081; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4082; GFX10-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 4083; GFX10-NEXT: v_mov_b32_e32 v1, 1 4084; GFX10-NEXT: scratch_store_byte v0, v1, off offset:-129 4085; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4086; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:-129 glc dlc 4087; GFX10-NEXT: s_waitcnt vmcnt(0) 4088; GFX10-NEXT: s_setpc_b64 
s[30:31] 4089; 4090; GFX11-LABEL: store_load_i32_large_negative_unaligned: 4091; GFX11: ; %bb.0: ; %bb 4092; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4093; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4094; GFX11-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 4095; GFX11-NEXT: v_mov_b32_e32 v1, 1 4096; GFX11-NEXT: scratch_store_b8 v0, v1, off offset:-129 dlc 4097; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4098; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:-129 glc dlc 4099; GFX11-NEXT: s_waitcnt vmcnt(0) 4100; GFX11-NEXT: s_setpc_b64 s[30:31] 4101; 4102; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned: 4103; GFX9-PAL: ; %bb.0: ; %bb 4104; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4105; GFX9-PAL-NEXT: v_add_u32_e32 v0, 0xffffef7f, v0 4106; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 4107; GFX9-PAL-NEXT: scratch_store_byte v0, v1, off 4108; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4109; GFX9-PAL-NEXT: scratch_load_ubyte v0, v0, off glc 4110; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4111; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 4112; 4113; GFX940-LABEL: store_load_i32_large_negative_unaligned: 4114; GFX940: ; %bb.0: ; %bb 4115; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4116; GFX940-NEXT: s_movk_i32 s0, 0xef7f 4117; GFX940-NEXT: v_mov_b32_e32 v1, 1 4118; GFX940-NEXT: scratch_store_byte v0, v1, s0 sc0 sc1 4119; GFX940-NEXT: s_waitcnt vmcnt(0) 4120; GFX940-NEXT: scratch_load_ubyte v0, v0, s0 sc0 sc1 4121; GFX940-NEXT: s_waitcnt vmcnt(0) 4122; GFX940-NEXT: s_setpc_b64 s[30:31] 4123; 4124; GFX1010-PAL-LABEL: store_load_i32_large_negative_unaligned: 4125; GFX1010-PAL: ; %bb.0: ; %bb 4126; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4127; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4128; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v0, 0xffffefff, v0 4129; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 1 4130; GFX1010-PAL-NEXT: scratch_store_byte v0, v1, off offset:-128 4131; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4132; GFX1010-PAL-NEXT: scratch_load_ubyte v0, v0, off 
offset:-128 glc dlc 4133; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 4134; GFX1010-PAL-NEXT: s_setpc_b64 s[30:31] 4135; 4136; GFX1030-PAL-LABEL: store_load_i32_large_negative_unaligned: 4137; GFX1030-PAL: ; %bb.0: ; %bb 4138; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4139; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4140; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 4141; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 1 4142; GFX1030-PAL-NEXT: scratch_store_byte v0, v1, off offset:-129 4143; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4144; GFX1030-PAL-NEXT: scratch_load_ubyte v0, v0, off offset:-129 glc dlc 4145; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 4146; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31] 4147; 4148; GFX11-PAL-LABEL: store_load_i32_large_negative_unaligned: 4149; GFX11-PAL: ; %bb.0: ; %bb 4150; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4151; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4152; GFX11-PAL-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 4153; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 1 4154; GFX11-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-129 dlc 4155; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4156; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-129 glc dlc 4157; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 4158; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] 4159bb: 4160 %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -4225 4161 store volatile i8 1, i8 addrspace(5)* %ptr, align 1 4162 %load = load volatile i8, i8 addrspace(5)* %ptr, align 1 4163 ret void 4164} 4165 4166define amdgpu_ps void @large_offset() { 4167; GFX9-LABEL: large_offset: 4168; GFX9: ; %bb.0: ; %bb 4169; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s2 4170; GFX9-NEXT: v_mov_b32_e32 v0, 0 4171; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 4172; GFX9-NEXT: v_mov_b32_e32 v1, v0 4173; GFX9-NEXT: v_mov_b32_e32 v2, v0 4174; GFX9-NEXT: v_mov_b32_e32 v3, v0 4175; GFX9-NEXT: s_mov_b32 vcc_hi, 0 4176; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:3024 4177; GFX9-NEXT: 
s_waitcnt vmcnt(0) 4178; GFX9-NEXT: s_mov_b32 vcc_hi, 0 4179; GFX9-NEXT: scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc 4180; GFX9-NEXT: s_waitcnt vmcnt(0) 4181; GFX9-NEXT: v_mov_b32_e32 v0, 16 4182; GFX9-NEXT: ;;#ASMSTART 4183; GFX9-NEXT: ; use v0 4184; GFX9-NEXT: ;;#ASMEND 4185; GFX9-NEXT: v_mov_b32_e32 v0, 0x810 4186; GFX9-NEXT: ;;#ASMSTART 4187; GFX9-NEXT: ; use v0 4188; GFX9-NEXT: ;;#ASMEND 4189; GFX9-NEXT: s_endpgm 4190; 4191; GFX10-LABEL: large_offset: 4192; GFX10: ; %bb.0: ; %bb 4193; GFX10-NEXT: s_add_u32 s0, s0, s2 4194; GFX10-NEXT: s_addc_u32 s1, s1, 0 4195; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 4196; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 4197; GFX10-NEXT: v_mov_b32_e32 v0, 0 4198; GFX10-NEXT: s_movk_i32 s0, 0x810 4199; GFX10-NEXT: s_addk_i32 s0, 0x3c0 4200; GFX10-NEXT: v_mov_b32_e32 v1, v0 4201; GFX10-NEXT: v_mov_b32_e32 v2, v0 4202; GFX10-NEXT: v_mov_b32_e32 v3, v0 4203; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0 4204; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4205; GFX10-NEXT: scratch_load_dwordx4 v[0:3], off, s0 glc dlc 4206; GFX10-NEXT: s_waitcnt vmcnt(0) 4207; GFX10-NEXT: v_mov_b32_e32 v0, 16 4208; GFX10-NEXT: v_mov_b32_e32 v1, 0x810 4209; GFX10-NEXT: ;;#ASMSTART 4210; GFX10-NEXT: ; use v0 4211; GFX10-NEXT: ;;#ASMEND 4212; GFX10-NEXT: ;;#ASMSTART 4213; GFX10-NEXT: ; use v1 4214; GFX10-NEXT: ;;#ASMEND 4215; GFX10-NEXT: s_endpgm 4216; 4217; GFX11-LABEL: large_offset: 4218; GFX11: ; %bb.0: ; %bb 4219; GFX11-NEXT: v_mov_b32_e32 v0, 0 4220; GFX11-NEXT: v_mov_b32_e32 v1, v0 4221; GFX11-NEXT: v_mov_b32_e32 v2, v0 4222; GFX11-NEXT: v_mov_b32_e32 v3, v0 4223; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:3024 dlc 4224; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4225; GFX11-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 glc dlc 4226; GFX11-NEXT: s_waitcnt vmcnt(0) 4227; GFX11-NEXT: v_mov_b32_e32 v0, 16 4228; GFX11-NEXT: v_mov_b32_e32 v1, 0x810 4229; GFX11-NEXT: ;;#ASMSTART 4230; GFX11-NEXT: ; use v0 
4231; GFX11-NEXT: ;;#ASMEND 4232; GFX11-NEXT: ;;#ASMSTART 4233; GFX11-NEXT: ; use v1 4234; GFX11-NEXT: ;;#ASMEND 4235; GFX11-NEXT: s_endpgm 4236; 4237; GFX9-PAL-LABEL: large_offset: 4238; GFX9-PAL: ; %bb.0: ; %bb 4239; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 4240; GFX9-PAL-NEXT: s_mov_b32 s2, s0 4241; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 4242; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0 4243; GFX9-PAL-NEXT: v_mov_b32_e32 v1, v0 4244; GFX9-PAL-NEXT: v_mov_b32_e32 v2, v0 4245; GFX9-PAL-NEXT: v_mov_b32_e32 v3, v0 4246; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 4247; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 4248; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0 4249; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 4250; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 4251; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:3024 4252; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4253; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 4254; GFX9-PAL-NEXT: scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc 4255; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4256; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 16 4257; GFX9-PAL-NEXT: ;;#ASMSTART 4258; GFX9-PAL-NEXT: ; use v0 4259; GFX9-PAL-NEXT: ;;#ASMEND 4260; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x810 4261; GFX9-PAL-NEXT: ;;#ASMSTART 4262; GFX9-PAL-NEXT: ; use v0 4263; GFX9-PAL-NEXT: ;;#ASMEND 4264; GFX9-PAL-NEXT: s_endpgm 4265; 4266; GFX940-LABEL: large_offset: 4267; GFX940: ; %bb.0: ; %bb 4268; GFX940-NEXT: v_mov_b32_e32 v0, 0 4269; GFX940-NEXT: v_mov_b32_e32 v1, v0 4270; GFX940-NEXT: v_mov_b32_e32 v2, v0 4271; GFX940-NEXT: v_mov_b32_e32 v3, v0 4272; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:3024 sc0 sc1 4273; GFX940-NEXT: s_waitcnt vmcnt(0) 4274; GFX940-NEXT: scratch_load_dwordx4 v[0:3], off, off offset:3024 sc0 sc1 4275; GFX940-NEXT: s_waitcnt vmcnt(0) 4276; GFX940-NEXT: v_mov_b32_e32 v0, 16 4277; GFX940-NEXT: ;;#ASMSTART 4278; GFX940-NEXT: ; use v0 4279; GFX940-NEXT: ;;#ASMEND 4280; GFX940-NEXT: v_mov_b32_e32 v0, 0x810 4281; GFX940-NEXT: ;;#ASMSTART 4282; 
GFX940-NEXT: ; use v0 4283; GFX940-NEXT: ;;#ASMEND 4284; GFX940-NEXT: s_endpgm 4285; 4286; GFX10-PAL-LABEL: large_offset: 4287; GFX10-PAL: ; %bb.0: ; %bb 4288; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 4289; GFX10-PAL-NEXT: s_mov_b32 s2, s0 4290; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 4291; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 4292; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 4293; GFX10-PAL-NEXT: s_add_u32 s2, s2, s0 4294; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 4295; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 4296; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 4297; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 0 4298; GFX10-PAL-NEXT: s_movk_i32 s0, 0x810 4299; GFX10-PAL-NEXT: s_addk_i32 s0, 0x3c0 4300; GFX10-PAL-NEXT: v_mov_b32_e32 v1, v0 4301; GFX10-PAL-NEXT: v_mov_b32_e32 v2, v0 4302; GFX10-PAL-NEXT: v_mov_b32_e32 v3, v0 4303; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 4304; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4305; GFX10-PAL-NEXT: scratch_load_dwordx4 v[0:3], off, s0 glc dlc 4306; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 4307; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 16 4308; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x810 4309; GFX10-PAL-NEXT: ;;#ASMSTART 4310; GFX10-PAL-NEXT: ; use v0 4311; GFX10-PAL-NEXT: ;;#ASMEND 4312; GFX10-PAL-NEXT: ;;#ASMSTART 4313; GFX10-PAL-NEXT: ; use v1 4314; GFX10-PAL-NEXT: ;;#ASMEND 4315; GFX10-PAL-NEXT: s_endpgm 4316; 4317; GFX11-PAL-LABEL: large_offset: 4318; GFX11-PAL: ; %bb.0: ; %bb 4319; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 0 4320; GFX11-PAL-NEXT: v_mov_b32_e32 v1, v0 4321; GFX11-PAL-NEXT: v_mov_b32_e32 v2, v0 4322; GFX11-PAL-NEXT: v_mov_b32_e32 v3, v0 4323; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:3024 dlc 4324; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4325; GFX11-PAL-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 glc dlc 4326; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 4327; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 16 4328; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0x810 4329; GFX11-PAL-NEXT: ;;#ASMSTART 4330; 
GFX11-PAL-NEXT: ; use v0 4331; GFX11-PAL-NEXT: ;;#ASMEND 4332; GFX11-PAL-NEXT: ;;#ASMSTART 4333; GFX11-PAL-NEXT: ; use v1 4334; GFX11-PAL-NEXT: ;;#ASMEND 4335; GFX11-PAL-NEXT: s_endpgm 4336bb: 4337 %alloca = alloca [128 x <4 x i32>], align 16, addrspace(5) 4338 %alloca2 = alloca [128 x <4 x i32>], align 16, addrspace(5) 4339 %gep = getelementptr inbounds [128 x <4 x i32>], [128 x <4 x i32>] addrspace(5)* %alloca2, i32 0, i32 60 4340 store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(5)* %gep, align 16 4341 %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %gep, align 16 4342 call void asm sideeffect "; use $0", "s"([128 x <4 x i32>] addrspace(5)* %alloca) #0 4343 call void asm sideeffect "; use $0", "s"([128 x <4 x i32>] addrspace(5)* %alloca2) #0 4344 ret void 4345} 4346 4347declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg) 4348declare i32 @llvm.amdgcn.workitem.id.x() 4349