; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9-PAL %s
; RUN: llc -march=amdgcn -mcpu=gfx940 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck --check-prefix=GFX940 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1010-PAL %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1030-PAL %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-PAL %s

; Code generation checks for accesses to private (addrspace(5)) allocas using
; scratch_* instructions. Every RUN line disables promote-alloca so the allocas
; stay in scratch; all lines except the gfx940 one also pass
; -mattr=+enable-flat-scratch. Only the check prefixes named on the RUN lines
; above are evaluated by FileCheck; regenerate the assertions with
; utils/update_llc_test_checks.py rather than editing them by hand.

; Zero-fills a 64-byte private alloca with llvm.memset inside an amdgpu_kernel.
define amdgpu_kernel void @zero_init_kernel() {
; GFX9-LABEL: zero_init_kernel:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: zero_init_kernel:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: zero_init_kernel:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_mov_b32 s1, s0
; GFX11-NEXT:    s_mov_b32 s2, s0
; GFX11-NEXT:    s_mov_b32 s3, s0
; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:64
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:48
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:32
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:16
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: zero_init_kernel:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: zero_init_kernel:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_mov_b32 s0, 0
; GFX940-NEXT:    s_mov_b32 s1, s0
; GFX940-NEXT:    s_mov_b32 s2, s0
; GFX940-NEXT:    s_mov_b32 s3, s0
; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: zero_init_kernel:
; GFX1010-PAL:       ; %bb.0:
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:64
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: zero_init_kernel:
; GFX1030-PAL:       ; %bb.0:
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
; GFX1030-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: zero_init_kernel:
; GFX11-PAL:       ; %bb.0:
; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-PAL-NEXT:    s_clause 0x3
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:64
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:48
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:32
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:16
; GFX11-PAL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PAL-NEXT:    s_endpgm
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

; Same 64-byte memset as zero_init_kernel, in a function with the default
; calling convention.
define void @zero_init_foo() {
; GFX9-LABEL: zero_init_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: zero_init_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: zero_init_foo:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_mov_b32 s1, s0
; GFX11-NEXT:    s_mov_b32 s2, s0
; GFX11-NEXT:    s_mov_b32 s3, s0
; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:48
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:32
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: zero_init_foo:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: zero_init_foo:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    s_mov_b32 s0, 0
; GFX940-NEXT:    s_mov_b32 s1, s0
; GFX940-NEXT:    s_mov_b32 s2, s0
; GFX940-NEXT:    s_mov_b32 s3, s0
; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: zero_init_foo:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: zero_init_foo:
; GFX11-PAL:       ; %bb.0:
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX11-PAL-NEXT:    s_clause 0x3
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:48
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:32
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

; Volatile store of 15 to element %idx of a [32 x float] private alloca,
; followed by a volatile load of element (%idx & 15). %idx is a kernel argument.
define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-NEXT:    s_and_b32 s0, s0, 15
; GFX9-NEXT:    s_add_i32 s1, s1, 4
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    scratch_store_dword off, v0, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_and_b32 s1, s0, 15
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-NEXT:    s_add_i32 s0, s0, 4
; GFX10-NEXT:    s_add_i32 s1, s1, 4
; GFX10-NEXT:    scratch_store_dword off, v0, s0
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_sindex_kernel:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_and_b32 s1, s0, 15
; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-NEXT:    s_add_i32 s0, s0, 4
; GFX11-NEXT:    s_add_i32 s1, s1, 4
; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_sindex_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT:    v_mov_b32_e32 v0, 15
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
; GFX940-NEXT:    s_and_b32 s0, s0, 15
; GFX940-NEXT:    s_add_i32 s1, s1, 4
; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_add_i32 s0, s0, 4
; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_sindex_kernel:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX10-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_sindex_kernel:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX11-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Same store/load pattern as store_load_sindex_kernel, with %idx passed as an
; inreg argument to an amdgpu_ps function.
define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX9-LABEL: store_load_sindex_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_and_b32 s0, s2, 15
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_and_b32 s0, s2, 15
; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_add_i32 s1, s1, 4
; GFX10-NEXT:    s_add_i32 s0, s0, 4
; GFX10-NEXT:    scratch_store_dword off, v0, s1
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_sindex_foo:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-NEXT:    s_and_b32 s1, s0, 15
; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-NEXT:    s_add_i32 s0, s0, 4
; GFX11-NEXT:    s_add_i32 s1, s1, 4
; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_sindex_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
; GFX940-NEXT:    s_and_b32 s0, s0, 15
; GFX940-NEXT:    s_add_i32 s1, s1, 4
; GFX940-NEXT:    v_mov_b32_e32 v0, 15
; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_add_i32 s0, s0, 4
; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_sindex_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX10-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_sindex_foo:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX11-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Indexes a [32 x float] private alloca with llvm.amdgcn.workitem.id.x: a
; volatile store of 15 to element id, then a volatile load of element (31 - id).
define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9-LABEL: store_load_vindex_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    v_add_u32_e32 v1, 4, v0
; GFX9-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-NEXT:    scratch_store_dword v1, v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_sub_u32_e32 v0, 4, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vindex_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    v_add_nc_u32_e32 v1, 4, v0
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 4, v0
; GFX10-NEXT:    scratch_store_dword v1, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_vindex_kernel:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 4, v0
; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:4 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_vindex_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 4, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 4, v0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_vindex_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_sub_u32_e32 v0, 4, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_vindex_kernel:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v1, 4, v0
; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, 4, v0
; GFX10-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_vindex_kernel:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 4, v0
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i3 = zext i32 %i2 to i64
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

define void @store_load_vindex_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s32
; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, s32
; GFX10-NEXT:    scratch_store_dword v0, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_vindex_foo:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-NEXT:    scratch_store_b32 v0, v2, s32 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s32
; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_vindex_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX940-NEXT:    scratch_store_dword v1, v2, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_vindex_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, s32
; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_vindex_foo:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, s32 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:
s_setpc_b64 s[30:31]
; Volatile store of 15 to element %idx of the 32-float alloca, followed by a
; volatile load of element (%idx & 15) of the same alloca.
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Non-kernel function taking an addrspace(5) pointer argument: stores the float
; constant 10.0 (bit pattern 0x41200000) one element past %arg, which every
; target below is expected to emit as a single scratch store at offset:4.
define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
; GFX9-LABEL: private_ptr_foo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000
; GFX9-NEXT: scratch_store_dword v0, v1, off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_ptr_foo:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v1, 0x41200000
; GFX10-NEXT: scratch_store_dword v0, v1, off offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: private_ptr_foo:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v1, 0x41200000
; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: private_ptr_foo:
; GFX9-PAL: ; %bb.0:
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000
; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:4
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: private_ptr_foo:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v1, 0x41200000
; GFX940-NEXT: scratch_store_dword v0, v1, off offset:4
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: private_ptr_foo:
; GFX10-PAL: ; %bb.0:
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000
; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:4
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: private_ptr_foo:
; GFX11-PAL: ; %bb.0:
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000
; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
  store float 1.000000e+01, float addrspace(5)* %gep, align 4
  ret void
}

; Kernel: volatile-loads one dword from a [64 x i32] padding alloca, then
; zero-fills a separate [32 x i16] alloca with a 64-byte memset.
define
amdgpu_kernel void @zero_init_small_offset_kernel() { 970; GFX9-LABEL: zero_init_small_offset_kernel: 971; GFX9: ; %bb.0: 972; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 973; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 974; GFX9-NEXT: s_mov_b32 vcc_hi, 0 975; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 976; GFX9-NEXT: s_waitcnt vmcnt(0) 977; GFX9-NEXT: s_mov_b32 s0, 0 978; GFX9-NEXT: s_mov_b32 s1, s0 979; GFX9-NEXT: s_mov_b32 s2, s0 980; GFX9-NEXT: s_mov_b32 s3, s0 981; GFX9-NEXT: v_mov_b32_e32 v0, s0 982; GFX9-NEXT: v_mov_b32_e32 v1, s1 983; GFX9-NEXT: v_mov_b32_e32 v2, s2 984; GFX9-NEXT: v_mov_b32_e32 v3, s3 985; GFX9-NEXT: s_mov_b32 vcc_hi, 0 986; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272 987; GFX9-NEXT: s_mov_b32 vcc_hi, 0 988; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288 989; GFX9-NEXT: s_mov_b32 vcc_hi, 0 990; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304 991; GFX9-NEXT: s_mov_b32 vcc_hi, 0 992; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320 993; GFX9-NEXT: s_endpgm 994; 995; GFX10-LABEL: zero_init_small_offset_kernel: 996; GFX10: ; %bb.0: 997; GFX10-NEXT: s_add_u32 s0, s0, s3 998; GFX10-NEXT: s_addc_u32 s1, s1, 0 999; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 1000; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1001; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1002; GFX10-NEXT: s_waitcnt vmcnt(0) 1003; GFX10-NEXT: s_mov_b32 s0, 0 1004; GFX10-NEXT: s_mov_b32 s1, s0 1005; GFX10-NEXT: s_mov_b32 s2, s0 1006; GFX10-NEXT: s_mov_b32 s3, s0 1007; GFX10-NEXT: v_mov_b32_e32 v0, s0 1008; GFX10-NEXT: v_mov_b32_e32 v1, s1 1009; GFX10-NEXT: v_mov_b32_e32 v2, s2 1010; GFX10-NEXT: v_mov_b32_e32 v3, s3 1011; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 1012; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 1013; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 1014; GFX10-NEXT: scratch_store_dwordx4 
off, v[0:3], off offset:320 1015; GFX10-NEXT: s_endpgm 1016; 1017; GFX11-LABEL: zero_init_small_offset_kernel: 1018; GFX11: ; %bb.0: 1019; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 1020; GFX11-NEXT: s_waitcnt vmcnt(0) 1021; GFX11-NEXT: s_mov_b32 s0, 0 1022; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1023; GFX11-NEXT: s_mov_b32 s1, s0 1024; GFX11-NEXT: s_mov_b32 s2, s0 1025; GFX11-NEXT: s_mov_b32 s3, s0 1026; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 1027; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 1028; GFX11-NEXT: s_clause 0x3 1029; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:272 1030; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:288 1031; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:304 1032; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:320 1033; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1034; GFX11-NEXT: s_endpgm 1035; 1036; GFX9-PAL-LABEL: zero_init_small_offset_kernel: 1037; GFX9-PAL: ; %bb.0: 1038; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1039; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1040; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1041; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1042; GFX9-PAL-NEXT: s_mov_b32 s0, 0 1043; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1044; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1045; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1046; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1047; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 1048; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1049; GFX9-PAL-NEXT: s_mov_b32 s1, s0 1050; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1051; GFX9-PAL-NEXT: s_mov_b32 s3, s0 1052; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 1053; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 1054; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 1055; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 1056; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1057; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272 1058; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1059; GFX9-PAL-NEXT: 
scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288 1060; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1061; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304 1062; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1063; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320 1064; GFX9-PAL-NEXT: s_endpgm 1065; 1066; GFX940-LABEL: zero_init_small_offset_kernel: 1067; GFX940: ; %bb.0: 1068; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 1069; GFX940-NEXT: s_waitcnt vmcnt(0) 1070; GFX940-NEXT: s_mov_b32 s0, 0 1071; GFX940-NEXT: s_mov_b32 s1, s0 1072; GFX940-NEXT: s_mov_b32 s2, s0 1073; GFX940-NEXT: s_mov_b32 s3, s0 1074; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 1075; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] 1076; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 1077; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 1078; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 1079; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320 1080; GFX940-NEXT: s_endpgm 1081; 1082; GFX1010-PAL-LABEL: zero_init_small_offset_kernel: 1083; GFX1010-PAL: ; %bb.0: 1084; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 1085; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1086; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1087; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1088; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1089; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 1090; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 1091; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1092; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1093; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1094; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 1095; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc 1096; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1097; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 1098; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1099; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 1100; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 1101; GFX1010-PAL-NEXT: 
v_mov_b32_e32 v1, s1 1102; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 1103; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 1104; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1105; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:272 1106; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 1107; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1108; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:288 1109; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 1110; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1111; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:304 1112; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 1113; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1114; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:320 1115; GFX1010-PAL-NEXT: s_endpgm 1116; 1117; GFX1030-PAL-LABEL: zero_init_small_offset_kernel: 1118; GFX1030-PAL: ; %bb.0: 1119; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 1120; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1121; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1122; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1123; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1124; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 1125; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 1126; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1127; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1128; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1129; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1130; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 1131; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 1132; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1133; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 1134; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 1135; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 1136; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 1137; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 1138; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 1139; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 1140; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off 
offset:304 1141; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320 1142; GFX1030-PAL-NEXT: s_endpgm 1143; 1144; GFX11-PAL-LABEL: zero_init_small_offset_kernel: 1145; GFX11-PAL: ; %bb.0: 1146; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 1147; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 1148; GFX11-PAL-NEXT: s_mov_b32 s0, 0 1149; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1150; GFX11-PAL-NEXT: s_mov_b32 s1, s0 1151; GFX11-PAL-NEXT: s_mov_b32 s2, s0 1152; GFX11-PAL-NEXT: s_mov_b32 s3, s0 1153; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 1154; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 1155; GFX11-PAL-NEXT: s_clause 0x3 1156; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:272 1157; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:288 1158; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:304 1159; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:320 1160; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1161; GFX11-PAL-NEXT: s_endpgm 1162 %padding = alloca [64 x i32], align 4, addrspace(5) 1163 %alloca = alloca [32 x i16], align 2, addrspace(5) 1164 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 1165 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1166 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 1167 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 1168 ret void 1169} 1170 1171define void @zero_init_small_offset_foo() { 1172; GFX9-LABEL: zero_init_small_offset_foo: 1173; GFX9: ; %bb.0: 1174; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1175; GFX9-NEXT: scratch_load_dword v0, off, s32 glc 1176; GFX9-NEXT: s_waitcnt vmcnt(0) 1177; GFX9-NEXT: s_mov_b32 s0, 0 1178; GFX9-NEXT: s_mov_b32 s1, s0 1179; GFX9-NEXT: s_mov_b32 s2, s0 1180; GFX9-NEXT: s_mov_b32 s3, s0 1181; GFX9-NEXT: v_mov_b32_e32 v0, s0 
1182; GFX9-NEXT: v_mov_b32_e32 v1, s1 1183; GFX9-NEXT: v_mov_b32_e32 v2, s2 1184; GFX9-NEXT: v_mov_b32_e32 v3, s3 1185; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 1186; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 1187; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 1188; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 1189; GFX9-NEXT: s_waitcnt vmcnt(0) 1190; GFX9-NEXT: s_setpc_b64 s[30:31] 1191; 1192; GFX10-LABEL: zero_init_small_offset_foo: 1193; GFX10: ; %bb.0: 1194; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1195; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1196; GFX10-NEXT: scratch_load_dword v0, off, s32 glc dlc 1197; GFX10-NEXT: s_waitcnt vmcnt(0) 1198; GFX10-NEXT: s_mov_b32 s0, 0 1199; GFX10-NEXT: s_mov_b32 s1, s0 1200; GFX10-NEXT: s_mov_b32 s2, s0 1201; GFX10-NEXT: s_mov_b32 s3, s0 1202; GFX10-NEXT: v_mov_b32_e32 v0, s0 1203; GFX10-NEXT: v_mov_b32_e32 v1, s1 1204; GFX10-NEXT: v_mov_b32_e32 v2, s2 1205; GFX10-NEXT: v_mov_b32_e32 v3, s3 1206; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 1207; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 1208; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 1209; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 1210; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1211; GFX10-NEXT: s_setpc_b64 s[30:31] 1212; 1213; GFX11-LABEL: zero_init_small_offset_foo: 1214; GFX11: ; %bb.0: 1215; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1216; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1217; GFX11-NEXT: scratch_load_b32 v0, off, s32 glc dlc 1218; GFX11-NEXT: s_waitcnt vmcnt(0) 1219; GFX11-NEXT: s_mov_b32 s0, 0 1220; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1221; GFX11-NEXT: s_mov_b32 s1, s0 1222; GFX11-NEXT: s_mov_b32 s2, s0 1223; GFX11-NEXT: s_mov_b32 s3, s0 1224; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 1225; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 1226; GFX11-NEXT: s_clause 0x3 
1227; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:256 1228; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272 1229; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:288 1230; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304 1231; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1232; GFX11-NEXT: s_setpc_b64 s[30:31] 1233; 1234; GFX9-PAL-LABEL: zero_init_small_offset_foo: 1235; GFX9-PAL: ; %bb.0: 1236; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1237; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 glc 1238; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1239; GFX9-PAL-NEXT: s_mov_b32 s0, 0 1240; GFX9-PAL-NEXT: s_mov_b32 s1, s0 1241; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1242; GFX9-PAL-NEXT: s_mov_b32 s3, s0 1243; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 1244; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 1245; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 1246; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 1247; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 1248; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 1249; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 1250; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 1251; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1252; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 1253; 1254; GFX940-LABEL: zero_init_small_offset_foo: 1255; GFX940: ; %bb.0: 1256; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1257; GFX940-NEXT: scratch_load_dword v0, off, s32 sc0 sc1 1258; GFX940-NEXT: s_waitcnt vmcnt(0) 1259; GFX940-NEXT: s_mov_b32 s0, 0 1260; GFX940-NEXT: s_mov_b32 s1, s0 1261; GFX940-NEXT: s_mov_b32 s2, s0 1262; GFX940-NEXT: s_mov_b32 s3, s0 1263; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 1264; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] 1265; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 1266; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 1267; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 1268; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], 
s32 offset:304 1269; GFX940-NEXT: s_waitcnt vmcnt(0) 1270; GFX940-NEXT: s_setpc_b64 s[30:31] 1271; 1272; GFX10-PAL-LABEL: zero_init_small_offset_foo: 1273; GFX10-PAL: ; %bb.0: 1274; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1275; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1276; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 glc dlc 1277; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1278; GFX10-PAL-NEXT: s_mov_b32 s0, 0 1279; GFX10-PAL-NEXT: s_mov_b32 s1, s0 1280; GFX10-PAL-NEXT: s_mov_b32 s2, s0 1281; GFX10-PAL-NEXT: s_mov_b32 s3, s0 1282; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 1283; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 1284; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 1285; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 1286; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 1287; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 1288; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 1289; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 1290; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1291; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 1292; 1293; GFX11-PAL-LABEL: zero_init_small_offset_foo: 1294; GFX11-PAL: ; %bb.0: 1295; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1296; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1297; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s32 glc dlc 1298; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 1299; GFX11-PAL-NEXT: s_mov_b32 s0, 0 1300; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1301; GFX11-PAL-NEXT: s_mov_b32 s1, s0 1302; GFX11-PAL-NEXT: s_mov_b32 s2, s0 1303; GFX11-PAL-NEXT: s_mov_b32 s3, s0 1304; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 1305; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 1306; GFX11-PAL-NEXT: s_clause 0x3 1307; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:256 1308; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272 1309; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:288 1310; 
GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
; %padding is 64 x i32 (256 bytes) and is read once with a volatile load at an
; undef index; the 64-byte zero fill of %alloca is what the assertions above
; expect as four 16-byte scratch stores at s32 offset:256 through offset:304.
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

; Kernel: after a volatile load from a [64 x i32] padding alloca, performs a
; volatile store of 15 to element %idx of a [32 x float] alloca and then a
; volatile load of element (%idx & 15) of that alloca.
define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_small_offset_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-NEXT: s_and_b32 s0, s0, 15
; GFX9-NEXT: v_mov_b32_e32 v0, 15
1351; GFX9-NEXT: s_addk_i32 s1, 0x104 1352; GFX9-NEXT: s_lshl_b32 s0, s0, 2 1353; GFX9-NEXT: scratch_store_dword off, v0, s1 1354; GFX9-NEXT: s_waitcnt vmcnt(0) 1355; GFX9-NEXT: s_addk_i32 s0, 0x104 1356; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 1357; GFX9-NEXT: s_waitcnt vmcnt(0) 1358; GFX9-NEXT: s_endpgm 1359; 1360; GFX10-LABEL: store_load_sindex_small_offset_kernel: 1361; GFX10: ; %bb.0: ; %bb 1362; GFX10-NEXT: s_add_u32 s2, s2, s5 1363; GFX10-NEXT: s_addc_u32 s3, s3, 0 1364; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1365; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1366; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 1367; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1368; GFX10-NEXT: s_waitcnt vmcnt(0) 1369; GFX10-NEXT: v_mov_b32_e32 v0, 15 1370; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1371; GFX10-NEXT: s_and_b32 s1, s0, 15 1372; GFX10-NEXT: s_lshl_b32 s0, s0, 2 1373; GFX10-NEXT: s_lshl_b32 s1, s1, 2 1374; GFX10-NEXT: s_addk_i32 s0, 0x104 1375; GFX10-NEXT: s_addk_i32 s1, 0x104 1376; GFX10-NEXT: scratch_store_dword off, v0, s0 1377; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1378; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc 1379; GFX10-NEXT: s_waitcnt vmcnt(0) 1380; GFX10-NEXT: s_endpgm 1381; 1382; GFX11-LABEL: store_load_sindex_small_offset_kernel: 1383; GFX11: ; %bb.0: ; %bb 1384; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 1385; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 1386; GFX11-NEXT: s_waitcnt vmcnt(0) 1387; GFX11-NEXT: v_mov_b32_e32 v0, 15 1388; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1389; GFX11-NEXT: s_and_b32 s1, s0, 15 1390; GFX11-NEXT: s_lshl_b32 s0, s0, 2 1391; GFX11-NEXT: s_lshl_b32 s1, s1, 2 1392; GFX11-NEXT: s_addk_i32 s0, 0x104 1393; GFX11-NEXT: s_addk_i32 s1, 0x104 1394; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc 1395; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1396; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc 1397; GFX11-NEXT: s_waitcnt vmcnt(0) 1398; GFX11-NEXT: s_endpgm 1399; 1400; 
GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel: 1401; GFX9-PAL: ; %bb.0: ; %bb 1402; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 1403; GFX9-PAL-NEXT: s_mov_b32 s4, s0 1404; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1405; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1406; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 1407; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1408; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 1409; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 1410; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 1411; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 1412; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1413; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 1414; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 1415; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 1416; GFX9-PAL-NEXT: s_addk_i32 s1, 0x104 1417; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 1418; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 1419; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1420; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104 1421; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 1422; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1423; GFX9-PAL-NEXT: s_endpgm 1424; 1425; GFX940-LABEL: store_load_sindex_small_offset_kernel: 1426; GFX940: ; %bb.0: ; %bb 1427; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 1428; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 1429; GFX940-NEXT: s_waitcnt vmcnt(0) 1430; GFX940-NEXT: v_mov_b32_e32 v0, 15 1431; GFX940-NEXT: s_waitcnt lgkmcnt(0) 1432; GFX940-NEXT: s_lshl_b32 s1, s0, 2 1433; GFX940-NEXT: s_and_b32 s0, s0, 15 1434; GFX940-NEXT: s_addk_i32 s1, 0x104 1435; GFX940-NEXT: s_lshl_b32 s0, s0, 2 1436; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 1437; GFX940-NEXT: s_waitcnt vmcnt(0) 1438; GFX940-NEXT: s_addk_i32 s0, 0x104 1439; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 1440; GFX940-NEXT: s_waitcnt vmcnt(0) 1441; GFX940-NEXT: s_endpgm 1442; 1443; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel: 1444; GFX1010-PAL: ; %bb.0: ; %bb 1445; GFX1010-PAL-NEXT: s_getpc_b64 s[4:5] 1446; 
GFX1010-PAL-NEXT: s_mov_b32 s4, s0 1447; GFX1010-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1448; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1449; GFX1010-PAL-NEXT: s_and_b32 s5, s5, 0xffff 1450; GFX1010-PAL-NEXT: s_add_u32 s4, s4, s3 1451; GFX1010-PAL-NEXT: s_addc_u32 s5, s5, 0 1452; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 1453; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 1454; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 1455; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1456; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc 1457; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1458; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 1459; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1460; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 1461; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 1462; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 1463; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x104 1464; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x104 1465; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 1466; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1467; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1468; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1469; GFX1010-PAL-NEXT: s_endpgm 1470; 1471; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel: 1472; GFX1030-PAL: ; %bb.0: ; %bb 1473; GFX1030-PAL-NEXT: s_getpc_b64 s[4:5] 1474; GFX1030-PAL-NEXT: s_mov_b32 s4, s0 1475; GFX1030-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1476; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1477; GFX1030-PAL-NEXT: s_and_b32 s5, s5, 0xffff 1478; GFX1030-PAL-NEXT: s_add_u32 s4, s4, s3 1479; GFX1030-PAL-NEXT: s_addc_u32 s5, s5, 0 1480; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 1481; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 1482; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 1483; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1484; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1485; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 1486; GFX1030-PAL-NEXT: 
s_waitcnt lgkmcnt(0) 1487; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 1488; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 1489; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 1490; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x104 1491; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x104 1492; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 1493; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1494; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1495; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1496; GFX1030-PAL-NEXT: s_endpgm 1497; 1498; GFX11-PAL-LABEL: store_load_sindex_small_offset_kernel: 1499; GFX11-PAL: ; %bb.0: ; %bb 1500; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 1501; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 1502; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 1503; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 1504; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) 1505; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 1506; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 1507; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 1508; GFX11-PAL-NEXT: s_addk_i32 s0, 0x104 1509; GFX11-PAL-NEXT: s_addk_i32 s1, 0x104 1510; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc 1511; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1512; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc 1513; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 1514; GFX11-PAL-NEXT: s_endpgm 1515bb: 1516 %padding = alloca [64 x i32], align 4, addrspace(5) 1517 %i = alloca [32 x float], align 4, addrspace(5) 1518 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 1519 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1520 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1521 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 1522 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1523 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1524 %i9 = and i32 %idx, 15 1525 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1526 %i11 
= bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1527 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1528 ret void 1529} 1530 1531define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { 1532; GFX9-LABEL: store_load_sindex_small_offset_foo: 1533; GFX9: ; %bb.0: ; %bb 1534; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 1535; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 1536; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1537; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 1538; GFX9-NEXT: s_waitcnt vmcnt(0) 1539; GFX9-NEXT: s_lshl_b32 s0, s2, 2 1540; GFX9-NEXT: s_addk_i32 s0, 0x104 1541; GFX9-NEXT: v_mov_b32_e32 v0, 15 1542; GFX9-NEXT: scratch_store_dword off, v0, s0 1543; GFX9-NEXT: s_waitcnt vmcnt(0) 1544; GFX9-NEXT: s_and_b32 s0, s2, 15 1545; GFX9-NEXT: s_lshl_b32 s0, s0, 2 1546; GFX9-NEXT: s_addk_i32 s0, 0x104 1547; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 1548; GFX9-NEXT: s_waitcnt vmcnt(0) 1549; GFX9-NEXT: s_endpgm 1550; 1551; GFX10-LABEL: store_load_sindex_small_offset_foo: 1552; GFX10: ; %bb.0: ; %bb 1553; GFX10-NEXT: s_add_u32 s0, s0, s3 1554; GFX10-NEXT: s_addc_u32 s1, s1, 0 1555; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 1556; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1557; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1558; GFX10-NEXT: s_waitcnt vmcnt(0) 1559; GFX10-NEXT: v_mov_b32_e32 v0, 15 1560; GFX10-NEXT: s_and_b32 s0, s2, 15 1561; GFX10-NEXT: s_lshl_b32 s1, s2, 2 1562; GFX10-NEXT: s_lshl_b32 s0, s0, 2 1563; GFX10-NEXT: s_addk_i32 s1, 0x104 1564; GFX10-NEXT: s_addk_i32 s0, 0x104 1565; GFX10-NEXT: scratch_store_dword off, v0, s1 1566; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1567; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc 1568; GFX10-NEXT: s_waitcnt vmcnt(0) 1569; GFX10-NEXT: s_endpgm 1570; 1571; GFX11-LABEL: store_load_sindex_small_offset_foo: 1572; GFX11: ; %bb.0: ; %bb 1573; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 1574; GFX11-NEXT: s_waitcnt 
vmcnt(0) 1575; GFX11-NEXT: v_mov_b32_e32 v0, 15 1576; GFX11-NEXT: s_and_b32 s1, s0, 15 1577; GFX11-NEXT: s_lshl_b32 s0, s0, 2 1578; GFX11-NEXT: s_lshl_b32 s1, s1, 2 1579; GFX11-NEXT: s_addk_i32 s0, 0x104 1580; GFX11-NEXT: s_addk_i32 s1, 0x104 1581; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc 1582; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1583; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc 1584; GFX11-NEXT: s_waitcnt vmcnt(0) 1585; GFX11-NEXT: s_endpgm 1586; 1587; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo: 1588; GFX9-PAL: ; %bb.0: ; %bb 1589; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1590; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1591; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1592; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1593; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1594; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1595; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1596; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1597; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 1598; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1599; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 1600; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 1601; GFX9-PAL-NEXT: s_addk_i32 s1, 0x104 1602; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 1603; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 1604; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 1605; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1606; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104 1607; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 1608; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1609; GFX9-PAL-NEXT: s_endpgm 1610; 1611; GFX940-LABEL: store_load_sindex_small_offset_foo: 1612; GFX940: ; %bb.0: ; %bb 1613; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 1614; GFX940-NEXT: s_waitcnt vmcnt(0) 1615; GFX940-NEXT: s_lshl_b32 s1, s0, 2 1616; GFX940-NEXT: s_and_b32 s0, s0, 15 1617; GFX940-NEXT: s_addk_i32 s1, 0x104 1618; GFX940-NEXT: v_mov_b32_e32 v0, 15 1619; GFX940-NEXT: s_lshl_b32 s0, s0, 2 1620; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 1621; GFX940-NEXT: s_waitcnt vmcnt(0) 
1622; GFX940-NEXT: s_addk_i32 s0, 0x104 1623; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 1624; GFX940-NEXT: s_waitcnt vmcnt(0) 1625; GFX940-NEXT: s_endpgm 1626; 1627; GFX1010-PAL-LABEL: store_load_sindex_small_offset_foo: 1628; GFX1010-PAL: ; %bb.0: ; %bb 1629; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 1630; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1631; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1632; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1633; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1634; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 1635; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 1636; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1637; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1638; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1639; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 1640; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc 1641; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1642; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 1643; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 1644; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 1645; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x104 1646; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x104 1647; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 1648; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1649; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1650; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1651; GFX1010-PAL-NEXT: s_endpgm 1652; 1653; GFX1030-PAL-LABEL: store_load_sindex_small_offset_foo: 1654; GFX1030-PAL: ; %bb.0: ; %bb 1655; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 1656; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1657; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1658; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1659; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1660; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 1661; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 1662; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1663; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1664; GFX1030-PAL-NEXT: 
scratch_load_dword v0, off, off offset:4 glc dlc 1665; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1666; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 1667; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 1668; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 1669; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 1670; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x104 1671; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x104 1672; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 1673; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1674; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1675; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1676; GFX1030-PAL-NEXT: s_endpgm 1677; 1678; GFX11-PAL-LABEL: store_load_sindex_small_offset_foo: 1679; GFX11-PAL: ; %bb.0: ; %bb 1680; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 1681; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 1682; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 1683; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 1684; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 1685; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 1686; GFX11-PAL-NEXT: s_addk_i32 s0, 0x104 1687; GFX11-PAL-NEXT: s_addk_i32 s1, 0x104 1688; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc 1689; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1690; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc 1691; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 1692; GFX11-PAL-NEXT: s_endpgm 1693bb: 1694 %padding = alloca [64 x i32], align 4, addrspace(5) 1695 %i = alloca [32 x float], align 4, addrspace(5) 1696 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 1697 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1698 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1699 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 1700 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1701 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1702 %i9 = and i32 %idx, 15 1703 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 
0, i32 %i9 1704 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1705 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1706 ret void 1707} 1708 1709define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { 1710; GFX9-LABEL: store_load_vindex_small_offset_kernel: 1711; GFX9: ; %bb.0: ; %bb 1712; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 1713; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 1714; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1715; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc 1716; GFX9-NEXT: s_waitcnt vmcnt(0) 1717; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1718; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v0 1719; GFX9-NEXT: v_mov_b32_e32 v2, 15 1720; GFX9-NEXT: scratch_store_dword v1, v2, off 1721; GFX9-NEXT: s_waitcnt vmcnt(0) 1722; GFX9-NEXT: v_sub_u32_e32 v0, 0x104, v0 1723; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc 1724; GFX9-NEXT: s_waitcnt vmcnt(0) 1725; GFX9-NEXT: s_endpgm 1726; 1727; GFX10-LABEL: store_load_vindex_small_offset_kernel: 1728; GFX10: ; %bb.0: ; %bb 1729; GFX10-NEXT: s_add_u32 s0, s0, s3 1730; GFX10-NEXT: s_addc_u32 s1, s1, 0 1731; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 1732; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1733; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1734; GFX10-NEXT: v_mov_b32_e32 v2, 15 1735; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc 1736; GFX10-NEXT: s_waitcnt vmcnt(0) 1737; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x104, v0 1738; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0 1739; GFX10-NEXT: scratch_store_dword v1, v2, off 1740; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1741; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 1742; GFX10-NEXT: s_waitcnt vmcnt(0) 1743; GFX10-NEXT: s_endpgm 1744; 1745; GFX11-LABEL: store_load_vindex_small_offset_kernel: 1746; GFX11: ; %bb.0: ; %bb 1747; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 1748; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc 1749; GFX11-NEXT: 
s_waitcnt vmcnt(0) 1750; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 1751; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:260 dlc 1752; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1753; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc 1754; GFX11-NEXT: s_waitcnt vmcnt(0) 1755; GFX11-NEXT: s_endpgm 1756; 1757; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel: 1758; GFX9-PAL: ; %bb.0: ; %bb 1759; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1760; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1761; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1762; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1763; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1764; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 1765; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1766; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1767; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1768; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1769; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc 1770; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1771; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x104, v0 1772; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off 1773; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1774; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x104, v0 1775; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc 1776; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1777; GFX9-PAL-NEXT: s_endpgm 1778; 1779; GFX940-LABEL: store_load_vindex_small_offset_kernel: 1780; GFX940: ; %bb.0: ; %bb 1781; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 1782; GFX940-NEXT: s_waitcnt vmcnt(0) 1783; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1784; GFX940-NEXT: v_mov_b32_e32 v1, 15 1785; GFX940-NEXT: scratch_store_dword v0, v1, off offset:260 sc0 sc1 1786; GFX940-NEXT: s_waitcnt vmcnt(0) 1787; GFX940-NEXT: v_sub_u32_e32 v0, 0x104, v0 1788; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 1789; GFX940-NEXT: s_waitcnt vmcnt(0) 1790; GFX940-NEXT: s_endpgm 1791; 1792; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel: 1793; GFX1010-PAL: ; %bb.0: ; %bb 1794; 
GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 1795; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1796; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1797; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1798; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1799; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 1800; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 1801; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1802; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1803; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1804; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15 1805; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1806; GFX1010-PAL-NEXT: scratch_load_dword v3, off, vcc_lo offset:4 glc dlc 1807; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1808; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x104, v0 1809; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0 1810; GFX1010-PAL-NEXT: scratch_store_dword v1, v2, off 1811; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1812; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 1813; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1814; GFX1010-PAL-NEXT: s_endpgm 1815; 1816; GFX1030-PAL-LABEL: store_load_vindex_small_offset_kernel: 1817; GFX1030-PAL: ; %bb.0: ; %bb 1818; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 1819; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1820; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1821; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1822; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1823; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 1824; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 1825; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1826; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1827; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1828; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15 1829; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc 1830; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1831; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x104, v0 1832; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0 1833; GFX1030-PAL-NEXT: 
scratch_store_dword v1, v2, off 1834; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1835; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 1836; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1837; GFX1030-PAL-NEXT: s_endpgm 1838; 1839; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel: 1840; GFX11-PAL: ; %bb.0: ; %bb 1841; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 1842; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc 1843; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 1844; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 1845; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:260 dlc 1846; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1847; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc 1848; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 1849; GFX11-PAL-NEXT: s_endpgm 1850bb: 1851 %padding = alloca [64 x i32], align 4, addrspace(5) 1852 %i = alloca [32 x float], align 4, addrspace(5) 1853 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 1854 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1855 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1856 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() 1857 %i3 = zext i32 %i2 to i64 1858 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 1859 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1860 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1861 %i9 = sub nsw i32 31, %i2 1862 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1863 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1864 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1865 ret void 1866} 1867 1868define void @store_load_vindex_small_offset_foo(i32 %idx) { 1869; GFX9-LABEL: store_load_vindex_small_offset_foo: 1870; GFX9: ; %bb.0: ; %bb 1871; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1872; GFX9-NEXT: 
scratch_load_dword v1, off, s32 glc 1873; GFX9-NEXT: s_waitcnt vmcnt(0) 1874; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x100 1875; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi 1876; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 1877; GFX9-NEXT: v_mov_b32_e32 v3, 15 1878; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 1879; GFX9-NEXT: scratch_store_dword v2, v3, off 1880; GFX9-NEXT: s_waitcnt vmcnt(0) 1881; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 1882; GFX9-NEXT: scratch_load_dword v0, v0, off glc 1883; GFX9-NEXT: s_waitcnt vmcnt(0) 1884; GFX9-NEXT: s_setpc_b64 s[30:31] 1885; 1886; GFX10-LABEL: store_load_vindex_small_offset_foo: 1887; GFX10: ; %bb.0: ; %bb 1888; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1889; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1890; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 1891; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 1892; GFX10-NEXT: v_mov_b32_e32 v2, 15 1893; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo 1894; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 1895; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc 1896; GFX10-NEXT: s_waitcnt vmcnt(0) 1897; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo 1898; GFX10-NEXT: scratch_store_dword v0, v2, off 1899; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1900; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc 1901; GFX10-NEXT: s_waitcnt vmcnt(0) 1902; GFX10-NEXT: s_setpc_b64 s[30:31] 1903; 1904; GFX11-LABEL: store_load_vindex_small_offset_foo: 1905; GFX11: ; %bb.0: ; %bb 1906; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1907; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1908; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 1909; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1910; GFX11-NEXT: scratch_load_b32 v3, off, s32 glc dlc 1911; GFX11-NEXT: s_waitcnt vmcnt(0) 1912; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 1913; GFX11-NEXT: scratch_store_b32 v0, v2, s32 offset:256 dlc 1914; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1915; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:256 glc dlc 1916; GFX11-NEXT: s_waitcnt vmcnt(0) 1917; 
GFX11-NEXT: s_setpc_b64 s[30:31] 1918; 1919; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo: 1920; GFX9-PAL: ; %bb.0: ; %bb 1921; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1922; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 glc 1923; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1924; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x100 1925; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi 1926; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 1927; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 1928; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0 1929; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 1930; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1931; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 1932; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc 1933; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1934; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 1935; 1936; GFX940-LABEL: store_load_vindex_small_offset_foo: 1937; GFX940: ; %bb.0: ; %bb 1938; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1939; GFX940-NEXT: scratch_load_dword v1, off, s32 sc0 sc1 1940; GFX940-NEXT: s_waitcnt vmcnt(0) 1941; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1942; GFX940-NEXT: v_mov_b32_e32 v2, 15 1943; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 1944; GFX940-NEXT: scratch_store_dword v1, v2, s32 offset:256 sc0 sc1 1945; GFX940-NEXT: s_waitcnt vmcnt(0) 1946; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1947; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:256 sc0 sc1 1948; GFX940-NEXT: s_waitcnt vmcnt(0) 1949; GFX940-NEXT: s_setpc_b64 s[30:31] 1950; 1951; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo: 1952; GFX10-PAL: ; %bb.0: ; %bb 1953; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1954; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1955; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0 1956; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 1957; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 1958; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo 1959; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 1960; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 glc dlc 
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_vindex_small_offset_foo:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, s32 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, s32 offset:256 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  ; %padding is 64 x i32 (256 bytes); the checks above address %i at s32 + 0x100
  ; (offset:256), i.e. just past it, while the volatile padding load reads at s32 + 0.
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)*
%pad_gep, align 4 2002 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 2003 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 2004 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 2005 store volatile i32 15, i32 addrspace(5)* %i8, align 4 2006 %i9 = and i32 %idx, 15 2007 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 2008 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 2009 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 2010 ret void 2011} 2012 2013define amdgpu_kernel void @zero_init_large_offset_kernel() { 2014; GFX9-LABEL: zero_init_large_offset_kernel: 2015; GFX9: ; %bb.0: 2016; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 2017; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 2018; GFX9-NEXT: s_mov_b32 vcc_hi, 0 2019; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:16 glc 2020; GFX9-NEXT: s_waitcnt vmcnt(0) 2021; GFX9-NEXT: s_mov_b32 s0, 0 2022; GFX9-NEXT: s_mov_b32 s1, s0 2023; GFX9-NEXT: s_mov_b32 s2, s0 2024; GFX9-NEXT: s_mov_b32 s3, s0 2025; GFX9-NEXT: v_mov_b32_e32 v0, s0 2026; GFX9-NEXT: v_mov_b32_e32 v1, s1 2027; GFX9-NEXT: v_mov_b32_e32 v2, s2 2028; GFX9-NEXT: v_mov_b32_e32 v3, s3 2029; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 2030; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 2031; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 2032; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 2033; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 2034; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 2035; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 2036; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 2037; GFX9-NEXT: s_endpgm 2038; 2039; GFX10-LABEL: zero_init_large_offset_kernel: 2040; GFX10: ; %bb.0: 2041; GFX10-NEXT: s_add_u32 s0, s0, s3 2042; GFX10-NEXT: s_addc_u32 s1, s1, 0 2043; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 2044; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 2045; 
GFX10-NEXT: scratch_load_dword v0, off, off offset:16 glc dlc 2046; GFX10-NEXT: s_waitcnt vmcnt(0) 2047; GFX10-NEXT: s_mov_b32 s0, 0 2048; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 2049; GFX10-NEXT: s_mov_b32 s1, s0 2050; GFX10-NEXT: s_mov_b32 s2, s0 2051; GFX10-NEXT: s_mov_b32 s3, s0 2052; GFX10-NEXT: v_mov_b32_e32 v0, s0 2053; GFX10-NEXT: v_mov_b32_e32 v1, s1 2054; GFX10-NEXT: v_mov_b32_e32 v2, s2 2055; GFX10-NEXT: v_mov_b32_e32 v3, s3 2056; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 2057; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 2058; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 2059; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 2060; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 2061; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 2062; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 2063; GFX10-NEXT: s_endpgm 2064; 2065; GFX11-LABEL: zero_init_large_offset_kernel: 2066; GFX11: ; %bb.0: 2067; GFX11-NEXT: scratch_load_b32 v0, off, off offset:16 glc dlc 2068; GFX11-NEXT: s_waitcnt vmcnt(0) 2069; GFX11-NEXT: s_mov_b32 s0, 0 2070; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4010 2071; GFX11-NEXT: s_mov_b32 s1, s0 2072; GFX11-NEXT: s_mov_b32 s2, s0 2073; GFX11-NEXT: s_mov_b32 s3, s0 2074; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 2075; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 2076; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo 2077; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4010 2078; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:16 2079; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4010 2080; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:32 2081; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4010 2082; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 2083; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2084; GFX11-NEXT: s_endpgm 2085; 2086; GFX9-PAL-LABEL: zero_init_large_offset_kernel: 2087; GFX9-PAL: ; %bb.0: 2088; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 2089; GFX9-PAL-NEXT: s_mov_b32 s2, s0 
2090; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2091; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 2092; GFX9-PAL-NEXT: s_mov_b32 s0, 0 2093; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 2094; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2095; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 2096; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 2097; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:16 glc 2098; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2099; GFX9-PAL-NEXT: s_mov_b32 s1, s0 2100; GFX9-PAL-NEXT: s_mov_b32 s2, s0 2101; GFX9-PAL-NEXT: s_mov_b32 s3, s0 2102; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 2103; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 2104; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 2105; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 2106; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 2107; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 2108; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 2109; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 2110; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 2111; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 2112; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 2113; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 2114; GFX9-PAL-NEXT: s_endpgm 2115; 2116; GFX940-LABEL: zero_init_large_offset_kernel: 2117; GFX940: ; %bb.0: 2118; GFX940-NEXT: scratch_load_dword v0, off, off offset:16 sc0 sc1 2119; GFX940-NEXT: s_waitcnt vmcnt(0) 2120; GFX940-NEXT: s_mov_b32 s0, 0 2121; GFX940-NEXT: s_mov_b32 s1, s0 2122; GFX940-NEXT: s_mov_b32 s2, s0 2123; GFX940-NEXT: s_mov_b32 s3, s0 2124; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2125; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] 2126; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4010 2127; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 2128; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4010 2129; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 2130; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4010 2131; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 2132; 
GFX940-NEXT: s_movk_i32 vcc_hi, 0x4010 2133; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 2134; GFX940-NEXT: s_endpgm 2135; 2136; GFX1010-PAL-LABEL: zero_init_large_offset_kernel: 2137; GFX1010-PAL: ; %bb.0: 2138; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 2139; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 2140; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2141; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 2142; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2143; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 2144; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 2145; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2146; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2147; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 2148; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 2149; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:16 glc dlc 2150; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2151; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 2152; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 2153; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 2154; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 2155; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 2156; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 2157; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 2158; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2159; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 2160; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 2161; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2162; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 2163; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 2164; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2165; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 2166; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 2167; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2168; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 2169; GFX1010-PAL-NEXT: s_endpgm 2170; 2171; GFX1030-PAL-LABEL: zero_init_large_offset_kernel: 2172; GFX1030-PAL: ; %bb.0: 2173; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 
2174; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 2175; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2176; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 2177; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2178; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 2179; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 2180; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2181; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2182; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:16 glc dlc 2183; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2184; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 2185; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2186; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 2187; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 2188; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 2189; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 2190; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 2191; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 2192; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 2193; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 2194; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2195; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 2196; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2197; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 2198; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2199; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 2200; GFX1030-PAL-NEXT: s_endpgm 2201; 2202; GFX11-PAL-LABEL: zero_init_large_offset_kernel: 2203; GFX11-PAL: ; %bb.0: 2204; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:16 glc dlc 2205; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2206; GFX11-PAL-NEXT: s_mov_b32 s0, 0 2207; GFX11-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2208; GFX11-PAL-NEXT: s_mov_b32 s1, s0 2209; GFX11-PAL-NEXT: s_mov_b32 s2, s0 2210; GFX11-PAL-NEXT: s_mov_b32 s3, s0 2211; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 2212; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 2213; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], 
vcc_lo 2214; GFX11-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2215; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:16 2216; GFX11-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2217; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:32 2218; GFX11-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2219; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 2220; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2221; GFX11-PAL-NEXT: s_endpgm 2222 %padding = alloca [4096 x i32], align 4, addrspace(5) 2223 %alloca = alloca [32 x i16], align 2, addrspace(5) 2224 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 2225 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 2226 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 2227 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 2228 ret void 2229} 2230 2231define void @zero_init_large_offset_foo() { 2232; GFX9-LABEL: zero_init_large_offset_foo: 2233; GFX9: ; %bb.0: 2234; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2235; GFX9-NEXT: scratch_load_dword v0, off, s32 offset:16 glc 2236; GFX9-NEXT: s_waitcnt vmcnt(0) 2237; GFX9-NEXT: s_mov_b32 s0, 0 2238; GFX9-NEXT: s_mov_b32 s1, s0 2239; GFX9-NEXT: s_mov_b32 s2, s0 2240; GFX9-NEXT: s_mov_b32 s3, s0 2241; GFX9-NEXT: v_mov_b32_e32 v0, s0 2242; GFX9-NEXT: v_mov_b32_e32 v1, s1 2243; GFX9-NEXT: v_mov_b32_e32 v2, s2 2244; GFX9-NEXT: v_mov_b32_e32 v3, s3 2245; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2246; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 2247; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2248; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 2249; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2250; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 2251; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2252; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 2253; GFX9-NEXT: s_waitcnt 
vmcnt(0) 2254; GFX9-NEXT: s_setpc_b64 s[30:31] 2255; 2256; GFX10-LABEL: zero_init_large_offset_foo: 2257; GFX10: ; %bb.0: 2258; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2259; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2260; GFX10-NEXT: scratch_load_dword v0, off, s32 offset:16 glc dlc 2261; GFX10-NEXT: s_waitcnt vmcnt(0) 2262; GFX10-NEXT: s_mov_b32 s0, 0 2263; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2264; GFX10-NEXT: s_mov_b32 s1, s0 2265; GFX10-NEXT: s_mov_b32 s2, s0 2266; GFX10-NEXT: s_mov_b32 s3, s0 2267; GFX10-NEXT: v_mov_b32_e32 v0, s0 2268; GFX10-NEXT: v_mov_b32_e32 v1, s1 2269; GFX10-NEXT: v_mov_b32_e32 v2, s2 2270; GFX10-NEXT: v_mov_b32_e32 v3, s3 2271; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 2272; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2273; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 2274; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2275; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 2276; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2277; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 2278; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2279; GFX10-NEXT: s_setpc_b64 s[30:31] 2280; 2281; GFX11-LABEL: zero_init_large_offset_foo: 2282; GFX11: ; %bb.0: 2283; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2284; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2285; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:16 glc dlc 2286; GFX11-NEXT: s_waitcnt vmcnt(0) 2287; GFX11-NEXT: s_mov_b32 s0, 0 2288; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2289; GFX11-NEXT: s_mov_b32 s1, s0 2290; GFX11-NEXT: s_mov_b32 s2, s0 2291; GFX11-NEXT: s_mov_b32 s3, s0 2292; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 2293; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 2294; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo 2295; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2296; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:16 2297; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2298; 
GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:32 2299; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2300; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 2301; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2302; GFX11-NEXT: s_setpc_b64 s[30:31] 2303; 2304; GFX9-PAL-LABEL: zero_init_large_offset_foo: 2305; GFX9-PAL: ; %bb.0: 2306; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2307; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 offset:16 glc 2308; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2309; GFX9-PAL-NEXT: s_mov_b32 s0, 0 2310; GFX9-PAL-NEXT: s_mov_b32 s1, s0 2311; GFX9-PAL-NEXT: s_mov_b32 s2, s0 2312; GFX9-PAL-NEXT: s_mov_b32 s3, s0 2313; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 2314; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 2315; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 2316; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 2317; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2318; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 2319; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2320; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 2321; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2322; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 2323; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2324; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 2325; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2326; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 2327; 2328; GFX940-LABEL: zero_init_large_offset_foo: 2329; GFX940: ; %bb.0: 2330; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2331; GFX940-NEXT: scratch_load_dword v0, off, s32 offset:16 sc0 sc1 2332; GFX940-NEXT: s_waitcnt vmcnt(0) 2333; GFX940-NEXT: s_mov_b32 s0, 0 2334; GFX940-NEXT: s_mov_b32 s1, s0 2335; GFX940-NEXT: s_mov_b32 s2, s0 2336; GFX940-NEXT: s_mov_b32 s3, s0 2337; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2338; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] 2339; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2340; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 2341; 
GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2342; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 2343; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2344; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 2345; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2346; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 2347; GFX940-NEXT: s_waitcnt vmcnt(0) 2348; GFX940-NEXT: s_setpc_b64 s[30:31] 2349; 2350; GFX1010-PAL-LABEL: zero_init_large_offset_foo: 2351; GFX1010-PAL: ; %bb.0: 2352; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2353; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2354; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s32 offset:16 glc dlc 2355; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2356; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 2357; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2358; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 2359; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 2360; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 2361; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 2362; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 2363; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 2364; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 2365; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 2366; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 2367; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2368; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 2369; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 2370; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2371; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 2372; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 2373; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2374; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 2375; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2376; GFX1010-PAL-NEXT: s_setpc_b64 s[30:31] 2377; 2378; GFX1030-PAL-LABEL: zero_init_large_offset_foo: 2379; GFX1030-PAL: ; %bb.0: 2380; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2381; 
GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2382; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s32 offset:16 glc dlc 2383; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2384; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 2385; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2386; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 2387; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 2388; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 2389; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 2390; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 2391; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 2392; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 2393; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 2394; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2395; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 2396; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2397; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 2398; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2399; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 2400; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2401; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31] 2402; 2403; GFX11-PAL-LABEL: zero_init_large_offset_foo: 2404; GFX11-PAL: ; %bb.0: 2405; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2406; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2407; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s32 offset:16 glc dlc 2408; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2409; GFX11-PAL-NEXT: s_mov_b32 s0, 0 2410; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2411; GFX11-PAL-NEXT: s_mov_b32 s1, s0 2412; GFX11-PAL-NEXT: s_mov_b32 s2, s0 2413; GFX11-PAL-NEXT: s_mov_b32 s3, s0 2414; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 2415; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 2416; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo 2417; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2418; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:16 2419; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2420; 
GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:32 2421; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2422; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 2423; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2424; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] 2425 %padding = alloca [4096 x i32], align 4, addrspace(5) 2426 %alloca = alloca [32 x i16], align 2, addrspace(5) 2427 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 2428 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 2429 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 2430 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 2431 ret void 2432} 2433 2434define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { 2435; GFX9-LABEL: store_load_sindex_large_offset_kernel: 2436; GFX9: ; %bb.0: ; %bb 2437; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 2438; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 2439; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 2440; GFX9-NEXT: s_mov_b32 vcc_hi, 0 2441; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 2442; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2443; GFX9-NEXT: s_lshl_b32 s1, s0, 2 2444; GFX9-NEXT: s_and_b32 s0, s0, 15 2445; GFX9-NEXT: v_mov_b32_e32 v0, 15 2446; GFX9-NEXT: s_addk_i32 s1, 0x4004 2447; GFX9-NEXT: s_lshl_b32 s0, s0, 2 2448; GFX9-NEXT: scratch_store_dword off, v0, s1 2449; GFX9-NEXT: s_waitcnt vmcnt(0) 2450; GFX9-NEXT: s_addk_i32 s0, 0x4004 2451; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 2452; GFX9-NEXT: s_waitcnt vmcnt(0) 2453; GFX9-NEXT: s_endpgm 2454; 2455; GFX10-LABEL: store_load_sindex_large_offset_kernel: 2456; GFX10: ; %bb.0: ; %bb 2457; GFX10-NEXT: s_add_u32 s2, s2, s5 2458; GFX10-NEXT: s_addc_u32 s3, s3, 0 2459; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2460; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2461; GFX10-NEXT: s_load_dword 
s0, s[0:1], 0x24 2462; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 2463; GFX10-NEXT: s_waitcnt vmcnt(0) 2464; GFX10-NEXT: v_mov_b32_e32 v0, 15 2465; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2466; GFX10-NEXT: s_and_b32 s1, s0, 15 2467; GFX10-NEXT: s_lshl_b32 s0, s0, 2 2468; GFX10-NEXT: s_lshl_b32 s1, s1, 2 2469; GFX10-NEXT: s_addk_i32 s0, 0x4004 2470; GFX10-NEXT: s_addk_i32 s1, 0x4004 2471; GFX10-NEXT: scratch_store_dword off, v0, s0 2472; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2473; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc 2474; GFX10-NEXT: s_waitcnt vmcnt(0) 2475; GFX10-NEXT: s_endpgm 2476; 2477; GFX11-LABEL: store_load_sindex_large_offset_kernel: 2478; GFX11: ; %bb.0: ; %bb 2479; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 2480; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 2481; GFX11-NEXT: s_waitcnt vmcnt(0) 2482; GFX11-NEXT: v_mov_b32_e32 v0, 15 2483; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2484; GFX11-NEXT: s_and_b32 s1, s0, 15 2485; GFX11-NEXT: s_lshl_b32 s0, s0, 2 2486; GFX11-NEXT: s_lshl_b32 s1, s1, 2 2487; GFX11-NEXT: s_addk_i32 s0, 0x4004 2488; GFX11-NEXT: s_addk_i32 s1, 0x4004 2489; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc 2490; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2491; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc 2492; GFX11-NEXT: s_waitcnt vmcnt(0) 2493; GFX11-NEXT: s_endpgm 2494; 2495; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel: 2496; GFX9-PAL: ; %bb.0: ; %bb 2497; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 2498; GFX9-PAL-NEXT: s_mov_b32 s4, s0 2499; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 2500; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 2501; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 2502; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 2503; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 2504; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 2505; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 2506; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 2507; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2508; GFX9-PAL-NEXT: 
s_lshl_b32 s1, s0, 2 2509; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 2510; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 2511; GFX9-PAL-NEXT: s_addk_i32 s1, 0x4004 2512; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 2513; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 2514; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2515; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004 2516; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 2517; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2518; GFX9-PAL-NEXT: s_endpgm 2519; 2520; GFX940-LABEL: store_load_sindex_large_offset_kernel: 2521; GFX940: ; %bb.0: ; %bb 2522; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 2523; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 2524; GFX940-NEXT: s_waitcnt vmcnt(0) 2525; GFX940-NEXT: v_mov_b32_e32 v0, 15 2526; GFX940-NEXT: s_waitcnt lgkmcnt(0) 2527; GFX940-NEXT: s_lshl_b32 s1, s0, 2 2528; GFX940-NEXT: s_and_b32 s0, s0, 15 2529; GFX940-NEXT: s_addk_i32 s1, 0x4004 2530; GFX940-NEXT: s_lshl_b32 s0, s0, 2 2531; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 2532; GFX940-NEXT: s_waitcnt vmcnt(0) 2533; GFX940-NEXT: s_addk_i32 s0, 0x4004 2534; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 2535; GFX940-NEXT: s_waitcnt vmcnt(0) 2536; GFX940-NEXT: s_endpgm 2537; 2538; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel: 2539; GFX1010-PAL: ; %bb.0: ; %bb 2540; GFX1010-PAL-NEXT: s_getpc_b64 s[4:5] 2541; GFX1010-PAL-NEXT: s_mov_b32 s4, s0 2542; GFX1010-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 2543; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 2544; GFX1010-PAL-NEXT: s_and_b32 s5, s5, 0xffff 2545; GFX1010-PAL-NEXT: s_add_u32 s4, s4, s3 2546; GFX1010-PAL-NEXT: s_addc_u32 s5, s5, 0 2547; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 2548; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 2549; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 2550; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 2551; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc 2552; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2553; 
GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 2554; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 2555; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 2556; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 2557; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 2558; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x4004 2559; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x4004 2560; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 2561; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2562; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 2563; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2564; GFX1010-PAL-NEXT: s_endpgm 2565; 2566; GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel: 2567; GFX1030-PAL: ; %bb.0: ; %bb 2568; GFX1030-PAL-NEXT: s_getpc_b64 s[4:5] 2569; GFX1030-PAL-NEXT: s_mov_b32 s4, s0 2570; GFX1030-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 2571; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 2572; GFX1030-PAL-NEXT: s_and_b32 s5, s5, 0xffff 2573; GFX1030-PAL-NEXT: s_add_u32 s4, s4, s3 2574; GFX1030-PAL-NEXT: s_addc_u32 s5, s5, 0 2575; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 2576; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 2577; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 2578; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 2579; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2580; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 2581; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 2582; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 2583; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 2584; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 2585; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x4004 2586; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x4004 2587; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 2588; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2589; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 2590; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2591; GFX1030-PAL-NEXT: s_endpgm 2592; 2593; GFX11-PAL-LABEL: store_load_sindex_large_offset_kernel: 2594; GFX11-PAL: ; %bb.0: ; %bb 2595; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 
0x0 2596; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 2597; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2598; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 2599; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) 2600; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 2601; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 2602; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 2603; GFX11-PAL-NEXT: s_addk_i32 s0, 0x4004 2604; GFX11-PAL-NEXT: s_addk_i32 s1, 0x4004 2605; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc 2606; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2607; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc 2608; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2609; GFX11-PAL-NEXT: s_endpgm 2610bb: 2611 %padding = alloca [4096 x i32], align 4, addrspace(5) 2612 %i = alloca [32 x float], align 4, addrspace(5) 2613 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 2614 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 2615 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 2616 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 2617 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 2618 store volatile i32 15, i32 addrspace(5)* %i8, align 4 2619 %i9 = and i32 %idx, 15 2620 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 2621 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 2622 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 2623 ret void 2624} 2625 2626define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { 2627; GFX9-LABEL: store_load_sindex_large_offset_foo: 2628; GFX9: ; %bb.0: ; %bb 2629; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 2630; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 2631; GFX9-NEXT: s_mov_b32 vcc_hi, 0 2632; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 2633; GFX9-NEXT: s_waitcnt vmcnt(0) 2634; GFX9-NEXT: s_lshl_b32 s0, s2, 2 2635; GFX9-NEXT: s_addk_i32 s0, 0x4004 2636; 
GFX9-NEXT: v_mov_b32_e32 v0, 15 2637; GFX9-NEXT: scratch_store_dword off, v0, s0 2638; GFX9-NEXT: s_waitcnt vmcnt(0) 2639; GFX9-NEXT: s_and_b32 s0, s2, 15 2640; GFX9-NEXT: s_lshl_b32 s0, s0, 2 2641; GFX9-NEXT: s_addk_i32 s0, 0x4004 2642; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 2643; GFX9-NEXT: s_waitcnt vmcnt(0) 2644; GFX9-NEXT: s_endpgm 2645; 2646; GFX10-LABEL: store_load_sindex_large_offset_foo: 2647; GFX10: ; %bb.0: ; %bb 2648; GFX10-NEXT: s_add_u32 s0, s0, s3 2649; GFX10-NEXT: s_addc_u32 s1, s1, 0 2650; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 2651; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 2652; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 2653; GFX10-NEXT: s_waitcnt vmcnt(0) 2654; GFX10-NEXT: v_mov_b32_e32 v0, 15 2655; GFX10-NEXT: s_and_b32 s0, s2, 15 2656; GFX10-NEXT: s_lshl_b32 s1, s2, 2 2657; GFX10-NEXT: s_lshl_b32 s0, s0, 2 2658; GFX10-NEXT: s_addk_i32 s1, 0x4004 2659; GFX10-NEXT: s_addk_i32 s0, 0x4004 2660; GFX10-NEXT: scratch_store_dword off, v0, s1 2661; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2662; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc 2663; GFX10-NEXT: s_waitcnt vmcnt(0) 2664; GFX10-NEXT: s_endpgm 2665; 2666; GFX11-LABEL: store_load_sindex_large_offset_foo: 2667; GFX11: ; %bb.0: ; %bb 2668; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 2669; GFX11-NEXT: s_waitcnt vmcnt(0) 2670; GFX11-NEXT: v_mov_b32_e32 v0, 15 2671; GFX11-NEXT: s_and_b32 s1, s0, 15 2672; GFX11-NEXT: s_lshl_b32 s0, s0, 2 2673; GFX11-NEXT: s_lshl_b32 s1, s1, 2 2674; GFX11-NEXT: s_addk_i32 s0, 0x4004 2675; GFX11-NEXT: s_addk_i32 s1, 0x4004 2676; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc 2677; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2678; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc 2679; GFX11-NEXT: s_waitcnt vmcnt(0) 2680; GFX11-NEXT: s_endpgm 2681; 2682; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo: 2683; GFX9-PAL: ; %bb.0: ; %bb 2684; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 2685; GFX9-PAL-NEXT: s_mov_b32 
s2, s0 2686; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2687; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 2688; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 2689; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2690; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 2691; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 2692; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 2693; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2694; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 2695; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 2696; GFX9-PAL-NEXT: s_addk_i32 s1, 0x4004 2697; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 2698; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 2699; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 2700; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2701; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004 2702; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 2703; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2704; GFX9-PAL-NEXT: s_endpgm 2705; 2706; GFX940-LABEL: store_load_sindex_large_offset_foo: 2707; GFX940: ; %bb.0: ; %bb 2708; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 2709; GFX940-NEXT: s_waitcnt vmcnt(0) 2710; GFX940-NEXT: s_lshl_b32 s1, s0, 2 2711; GFX940-NEXT: s_and_b32 s0, s0, 15 2712; GFX940-NEXT: s_addk_i32 s1, 0x4004 2713; GFX940-NEXT: v_mov_b32_e32 v0, 15 2714; GFX940-NEXT: s_lshl_b32 s0, s0, 2 2715; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 2716; GFX940-NEXT: s_waitcnt vmcnt(0) 2717; GFX940-NEXT: s_addk_i32 s0, 0x4004 2718; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 2719; GFX940-NEXT: s_waitcnt vmcnt(0) 2720; GFX940-NEXT: s_endpgm 2721; 2722; GFX1010-PAL-LABEL: store_load_sindex_large_offset_foo: 2723; GFX1010-PAL: ; %bb.0: ; %bb 2724; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 2725; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 2726; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2727; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 2728; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2729; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 2730; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 2731; GFX1010-PAL-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2732; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2733; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 2734; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 2735; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc 2736; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2737; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 2738; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 2739; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 2740; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x4004 2741; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x4004 2742; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 2743; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2744; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 2745; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2746; GFX1010-PAL-NEXT: s_endpgm 2747; 2748; GFX1030-PAL-LABEL: store_load_sindex_large_offset_foo: 2749; GFX1030-PAL: ; %bb.0: ; %bb 2750; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 2751; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 2752; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2753; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 2754; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2755; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 2756; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 2757; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2758; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2759; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 2760; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2761; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 2762; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 2763; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 2764; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 2765; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x4004 2766; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x4004 2767; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 2768; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2769; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 2770; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2771; GFX1030-PAL-NEXT: s_endpgm 2772; 2773; 
GFX11-PAL-LABEL: store_load_sindex_large_offset_foo: 2774; GFX11-PAL: ; %bb.0: ; %bb 2775; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 2776; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2777; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 2778; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 2779; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 2780; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 2781; GFX11-PAL-NEXT: s_addk_i32 s0, 0x4004 2782; GFX11-PAL-NEXT: s_addk_i32 s1, 0x4004 2783; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc 2784; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2785; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc 2786; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2787; GFX11-PAL-NEXT: s_endpgm 2788bb: 2789 %padding = alloca [4096 x i32], align 4, addrspace(5) 2790 %i = alloca [32 x float], align 4, addrspace(5) 2791 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 2792 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 2793 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 2794 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 2795 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 2796 store volatile i32 15, i32 addrspace(5)* %i8, align 4 2797 %i9 = and i32 %idx, 15 2798 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 2799 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 2800 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 2801 ret void 2802} 2803 2804define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { 2805; GFX9-LABEL: store_load_vindex_large_offset_kernel: 2806; GFX9: ; %bb.0: ; %bb 2807; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 2808; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 2809; GFX9-NEXT: s_mov_b32 vcc_hi, 0 2810; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc 2811; GFX9-NEXT: s_waitcnt vmcnt(0) 2812; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 
2813; GFX9-NEXT: v_add_u32_e32 v1, 0x4004, v0 2814; GFX9-NEXT: v_mov_b32_e32 v2, 15 2815; GFX9-NEXT: scratch_store_dword v1, v2, off 2816; GFX9-NEXT: s_waitcnt vmcnt(0) 2817; GFX9-NEXT: v_sub_u32_e32 v0, 0x4004, v0 2818; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc 2819; GFX9-NEXT: s_waitcnt vmcnt(0) 2820; GFX9-NEXT: s_endpgm 2821; 2822; GFX10-LABEL: store_load_vindex_large_offset_kernel: 2823; GFX10: ; %bb.0: ; %bb 2824; GFX10-NEXT: s_add_u32 s0, s0, s3 2825; GFX10-NEXT: s_addc_u32 s1, s1, 0 2826; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 2827; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 2828; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2829; GFX10-NEXT: v_mov_b32_e32 v2, 15 2830; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc 2831; GFX10-NEXT: s_waitcnt vmcnt(0) 2832; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 2833; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0 2834; GFX10-NEXT: scratch_store_dword v1, v2, off 2835; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2836; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 2837; GFX10-NEXT: s_waitcnt vmcnt(0) 2838; GFX10-NEXT: s_endpgm 2839; 2840; GFX11-LABEL: store_load_vindex_large_offset_kernel: 2841; GFX11: ; %bb.0: ; %bb 2842; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 2843; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4004 2844; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc 2845; GFX11-NEXT: s_waitcnt vmcnt(0) 2846; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 2847; GFX11-NEXT: scratch_store_b32 v0, v1, vcc_lo dlc 2848; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2849; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc 2850; GFX11-NEXT: s_waitcnt vmcnt(0) 2851; GFX11-NEXT: s_endpgm 2852; 2853; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel: 2854; GFX9-PAL: ; %bb.0: ; %bb 2855; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 2856; GFX9-PAL-NEXT: s_mov_b32 s2, s0 2857; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2858; GFX9-PAL-NEXT: s_mov_b32 
vcc_hi, 0 2859; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2860; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 2861; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 2862; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2863; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 2864; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 2865; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc 2866; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2867; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x4004, v0 2868; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off 2869; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2870; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x4004, v0 2871; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc 2872; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2873; GFX9-PAL-NEXT: s_endpgm 2874; 2875; GFX940-LABEL: store_load_vindex_large_offset_kernel: 2876; GFX940: ; %bb.0: ; %bb 2877; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 2878; GFX940-NEXT: s_waitcnt vmcnt(0) 2879; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2880; GFX940-NEXT: v_mov_b32_e32 v1, 15 2881; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4004 2882; GFX940-NEXT: scratch_store_dword v0, v1, vcc_hi sc0 sc1 2883; GFX940-NEXT: s_waitcnt vmcnt(0) 2884; GFX940-NEXT: v_sub_u32_e32 v0, 0x4004, v0 2885; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 2886; GFX940-NEXT: s_waitcnt vmcnt(0) 2887; GFX940-NEXT: s_endpgm 2888; 2889; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel: 2890; GFX1010-PAL: ; %bb.0: ; %bb 2891; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 2892; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 2893; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2894; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 2895; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2896; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 2897; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 2898; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2899; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2900; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2901; GFX1010-PAL-NEXT: 
v_mov_b32_e32 v2, 15 2902; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 2903; GFX1010-PAL-NEXT: scratch_load_dword v3, off, vcc_lo offset:4 glc dlc 2904; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2905; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 2906; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0 2907; GFX1010-PAL-NEXT: scratch_store_dword v1, v2, off 2908; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2909; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 2910; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2911; GFX1010-PAL-NEXT: s_endpgm 2912; 2913; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel: 2914; GFX1030-PAL: ; %bb.0: ; %bb 2915; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 2916; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 2917; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2918; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 2919; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2920; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 2921; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 2922; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2923; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2924; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2925; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15 2926; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc 2927; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2928; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 2929; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0 2930; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off 2931; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2932; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 2933; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2934; GFX1030-PAL-NEXT: s_endpgm 2935; 2936; GFX11-PAL-LABEL: store_load_vindex_large_offset_kernel: 2937; GFX11-PAL: ; %bb.0: ; %bb 2938; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 2939; GFX11-PAL-NEXT: s_movk_i32 vcc_lo, 0x4004 2940; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc 2941; 
GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2942; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 2943; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, vcc_lo dlc 2944; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2945; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc 2946; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2947; GFX11-PAL-NEXT: s_endpgm 2948bb: 2949 %padding = alloca [4096 x i32], align 4, addrspace(5) 2950 %i = alloca [32 x float], align 4, addrspace(5) 2951 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 2952 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 2953 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 2954 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() 2955 %i3 = zext i32 %i2 to i64 2956 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 2957 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 2958 store volatile i32 15, i32 addrspace(5)* %i8, align 4 2959 %i9 = sub nsw i32 31, %i2 2960 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 2961 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 2962 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 2963 ret void 2964} 2965 2966define void @store_load_vindex_large_offset_foo(i32 %idx) { 2967; GFX9-LABEL: store_load_vindex_large_offset_foo: 2968; GFX9: ; %bb.0: ; %bb 2969; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2970; GFX9-NEXT: scratch_load_dword v1, off, s32 offset:4 glc 2971; GFX9-NEXT: s_waitcnt vmcnt(0) 2972; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4004 2973; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi 2974; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 2975; GFX9-NEXT: v_mov_b32_e32 v3, 15 2976; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 2977; GFX9-NEXT: scratch_store_dword v2, v3, off 2978; GFX9-NEXT: s_waitcnt vmcnt(0) 2979; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 2980; GFX9-NEXT: scratch_load_dword v0, v0, off glc 2981; 
GFX9-NEXT: s_waitcnt vmcnt(0) 2982; GFX9-NEXT: s_setpc_b64 s[30:31] 2983; 2984; GFX10-LABEL: store_load_vindex_large_offset_foo: 2985; GFX10: ; %bb.0: ; %bb 2986; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2987; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2988; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 2989; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 2990; GFX10-NEXT: v_mov_b32_e32 v2, 15 2991; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo 2992; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 2993; GFX10-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc 2994; GFX10-NEXT: s_waitcnt vmcnt(0) 2995; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo 2996; GFX10-NEXT: scratch_store_dword v0, v2, off 2997; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2998; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc 2999; GFX10-NEXT: s_waitcnt vmcnt(0) 3000; GFX10-NEXT: s_setpc_b64 s[30:31] 3001; 3002; GFX11-LABEL: store_load_vindex_large_offset_foo: 3003; GFX11: ; %bb.0: ; %bb 3004; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3005; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3006; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 3007; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3008; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4004 3009; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc 3010; GFX11-NEXT: s_waitcnt vmcnt(0) 3011; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 3012; GFX11-NEXT: scratch_store_b32 v0, v2, vcc_lo dlc 3013; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3014; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4004 3015; GFX11-NEXT: scratch_load_b32 v0, v1, vcc_lo glc dlc 3016; GFX11-NEXT: s_waitcnt vmcnt(0) 3017; GFX11-NEXT: s_setpc_b64 s[30:31] 3018; 3019; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo: 3020; GFX9-PAL: ; %bb.0: ; %bb 3021; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3022; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 offset:4 glc 3023; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3024; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4004 3025; GFX9-PAL-NEXT: 
v_mov_b32_e32 v1, vcc_hi 3026; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 3027; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 3028; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0 3029; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 3030; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3031; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 3032; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc 3033; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3034; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 3035; 3036; GFX940-LABEL: store_load_vindex_large_offset_foo: 3037; GFX940: ; %bb.0: ; %bb 3038; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3039; GFX940-NEXT: scratch_load_dword v1, off, s32 offset:4 sc0 sc1 3040; GFX940-NEXT: s_waitcnt vmcnt(0) 3041; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 3042; GFX940-NEXT: v_mov_b32_e32 v2, 15 3043; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4004 3044; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 3045; GFX940-NEXT: scratch_store_dword v1, v2, vcc_hi sc0 sc1 3046; GFX940-NEXT: s_waitcnt vmcnt(0) 3047; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3048; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4004 3049; GFX940-NEXT: scratch_load_dword v0, v0, vcc_hi sc0 sc1 3050; GFX940-NEXT: s_waitcnt vmcnt(0) 3051; GFX940-NEXT: s_setpc_b64 s[30:31] 3052; 3053; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo: 3054; GFX10-PAL: ; %bb.0: ; %bb 3055; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3056; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3057; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0 3058; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 3059; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 3060; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo 3061; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 3062; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc 3063; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 3064; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo 3065; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off 3066; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3067; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc 
dlc 3068; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 3069; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 3070; 3071; GFX11-PAL-LABEL: store_load_vindex_large_offset_foo: 3072; GFX11-PAL: ; %bb.0: ; %bb 3073; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3074; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3075; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 3076; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3077; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 3078; GFX11-PAL-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc 3079; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 3080; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 3081; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, vcc_lo dlc 3082; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3083; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 3084; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, vcc_lo glc dlc 3085; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 3086; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
; IR under test: a volatile load from %padding (4096 x i32) at an undef index, then a volatile store of 15 to %i[%idx] and a volatile load of %i[%idx & 15], where %i is a 32 x float alloca; both allocas live in addrspace(5).
3103bb: 3104 %padding = alloca [4096 x i32], align 4, addrspace(5) 3105 %i = alloca [32 x float], align 4, addrspace(5) 3106 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 3107 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 3108 %i1 = bitcast [32 x float] addrspace(5)* %i to i8
addrspace(5)* 3109 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 3110 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 3111 store volatile i32 15, i32 addrspace(5)* %i8, align 4 3112 %i9 = and i32 %idx, 15 3113 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 3114 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 3115 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 3116 ret void 3117} 3118
; store_load_large_imm_offset_kernel: amdgpu_kernel that allocates a 4096 x i32
; array in addrspace(5), does a volatile store of 13 through an undef element
; index, then a volatile store of 15 and a volatile load at element 4000.
; The per-prefix CHECK blocks that follow pin the ISA expected from each RUN line.
3119define amdgpu_kernel void @store_load_large_imm_offset_kernel() { 3120; GFX9-LABEL: store_load_large_imm_offset_kernel: 3121; GFX9: ; %bb.0: ; %bb 3122; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 3123; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 3124; GFX9-NEXT: v_mov_b32_e32 v0, 13 3125; GFX9-NEXT: s_mov_b32 vcc_hi, 0 3126; GFX9-NEXT: s_movk_i32 s0, 0x3000 3127; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 3128; GFX9-NEXT: s_waitcnt vmcnt(0) 3129; GFX9-NEXT: s_add_i32 s0, s0, 4 3130; GFX9-NEXT: v_mov_b32_e32 v0, 15 3131; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 3132; GFX9-NEXT: s_waitcnt vmcnt(0) 3133; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc 3134; GFX9-NEXT: s_waitcnt vmcnt(0) 3135; GFX9-NEXT: s_endpgm 3136; 3137; GFX10-LABEL: store_load_large_imm_offset_kernel: 3138; GFX10: ; %bb.0: ; %bb 3139; GFX10-NEXT: s_add_u32 s0, s0, s3 3140; GFX10-NEXT: s_addc_u32 s1, s1, 0 3141; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 3142; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 3143; GFX10-NEXT: v_mov_b32_e32 v0, 13 3144; GFX10-NEXT: v_mov_b32_e32 v1, 15 3145; GFX10-NEXT: s_movk_i32 s0, 0x3800 3146; GFX10-NEXT: s_add_i32 s0, s0, 4 3147; GFX10-NEXT: scratch_store_dword off, v0, off offset:4 3148; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3149; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 3150; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3151; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc
3152; GFX10-NEXT: s_waitcnt vmcnt(0) 3153; GFX10-NEXT: s_endpgm 3154; 3155; GFX11-LABEL: store_load_large_imm_offset_kernel: 3156; GFX11: ; %bb.0: ; %bb 3157; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 3158; GFX11-NEXT: v_mov_b32_e32 v2, 15 3159; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc 3160; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3161; GFX11-NEXT: scratch_store_b32 v1, v2, off offset:3716 dlc 3162; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3163; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:3716 glc dlc 3164; GFX11-NEXT: s_waitcnt vmcnt(0) 3165; GFX11-NEXT: s_endpgm 3166; 3167; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel: 3168; GFX9-PAL: ; %bb.0: ; %bb 3169; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 3170; GFX9-PAL-NEXT: s_mov_b32 s2, s0 3171; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 3172; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 3173; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 3174; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 3175; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 3176; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 3177; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 3178; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 3179; GFX9-PAL-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 3180; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3181; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4 3182; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 3183; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 3184; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3185; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc 3186; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3187; GFX9-PAL-NEXT: s_endpgm 3188; 3189; GFX940-LABEL: store_load_large_imm_offset_kernel: 3190; GFX940: ; %bb.0: ; %bb 3191; GFX940-NEXT: v_mov_b32_e32 v0, 13 3192; GFX940-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1 3193; GFX940-NEXT: s_waitcnt vmcnt(0) 3194; GFX940-NEXT: v_mov_b32_e32 v0, 0x3000 3195; GFX940-NEXT: v_mov_b32_e32 v1, 15 3196; GFX940-NEXT: scratch_store_dword v0, v1, off offset:3716 sc0 sc1 3197; 
GFX940-NEXT: s_waitcnt vmcnt(0) 3198; GFX940-NEXT: scratch_load_dword v0, v0, off offset:3716 sc0 sc1 3199; GFX940-NEXT: s_waitcnt vmcnt(0) 3200; GFX940-NEXT: s_endpgm 3201; 3202; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel: 3203; GFX1010-PAL: ; %bb.0: ; %bb 3204; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 3205; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 3206; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 3207; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 3208; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 3209; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 3210; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 3211; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 3212; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 3213; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 13 3214; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 15 3215; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3800 3216; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 3217; GFX1010-PAL-NEXT: s_add_i32 s0, s0, 4 3218; GFX1010-PAL-NEXT: scratch_store_dword off, v0, vcc_lo offset:4 3219; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3220; GFX1010-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 3221; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3222; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 3223; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 3224; GFX1010-PAL-NEXT: s_endpgm 3225; 3226; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel: 3227; GFX1030-PAL: ; %bb.0: ; %bb 3228; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 3229; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 3230; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 3231; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 3232; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 3233; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 3234; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 3235; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 3236; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 3237; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 13 3238; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 15 3239; 
GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3800 3240; GFX1030-PAL-NEXT: s_add_i32 s0, s0, 4 3241; GFX1030-PAL-NEXT: scratch_store_dword off, v0, off offset:4 3242; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3243; GFX1030-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 3244; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3245; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 3246; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 3247; GFX1030-PAL-NEXT: s_endpgm 3248; 3249; GFX11-PAL-LABEL: store_load_large_imm_offset_kernel: 3250; GFX11-PAL: ; %bb.0: ; %bb 3251; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 3252; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 3253; GFX11-PAL-NEXT: scratch_store_b32 off, v0, off offset:4 dlc 3254; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3255; GFX11-PAL-NEXT: scratch_store_b32 v1, v2, off offset:3716 dlc 3256; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3257; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, off offset:3716 glc dlc 3258; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 3259; GFX11-PAL-NEXT: s_endpgm 3260bb: 3261 %i = alloca [4096 x i32], align 4, addrspace(5) 3262 %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef 3263 store volatile i32 13, i32 addrspace(5)* %i1, align 4 3264 %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 3265 store volatile i32 15, i32 addrspace(5)* %i7, align 4 3266 %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 3267 %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4 3268 ret void 3269} 3270 3271define void @store_load_large_imm_offset_foo() { 3272; GFX9-LABEL: store_load_large_imm_offset_foo: 3273; GFX9: ; %bb.0: ; %bb 3274; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3275; GFX9-NEXT: v_mov_b32_e32 v0, 13 3276; GFX9-NEXT: s_movk_i32 s0, 0x3000 3277; GFX9-NEXT: s_add_i32 vcc_hi, s32, 4 3278; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4 3279; GFX9-NEXT: 
s_waitcnt vmcnt(0) 3280; GFX9-NEXT: s_add_i32 s0, s0, vcc_hi 3281; GFX9-NEXT: v_mov_b32_e32 v0, 15 3282; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 3283; GFX9-NEXT: s_waitcnt vmcnt(0) 3284; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc 3285; GFX9-NEXT: s_waitcnt vmcnt(0) 3286; GFX9-NEXT: s_setpc_b64 s[30:31] 3287; 3288; GFX10-LABEL: store_load_large_imm_offset_foo: 3289; GFX10: ; %bb.0: ; %bb 3290; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3291; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3292; GFX10-NEXT: v_mov_b32_e32 v0, 13 3293; GFX10-NEXT: v_mov_b32_e32 v1, 15 3294; GFX10-NEXT: s_movk_i32 s0, 0x3800 3295; GFX10-NEXT: s_add_i32 vcc_lo, s32, 4 3296; GFX10-NEXT: s_add_i32 s0, s0, vcc_lo 3297; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4 3298; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3299; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 3300; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3301; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 3302; GFX10-NEXT: s_waitcnt vmcnt(0) 3303; GFX10-NEXT: s_setpc_b64 s[30:31] 3304; 3305; GFX11-LABEL: store_load_large_imm_offset_foo: 3306; GFX11: ; %bb.0: ; %bb 3307; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3308; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3309; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 3310; GFX11-NEXT: v_mov_b32_e32 v2, 15 3311; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc 3312; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3313; GFX11-NEXT: scratch_store_b32 v1, v2, s32 offset:3716 dlc 3314; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3315; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:3716 glc dlc 3316; GFX11-NEXT: s_waitcnt vmcnt(0) 3317; GFX11-NEXT: s_setpc_b64 s[30:31] 3318; 3319; GFX9-PAL-LABEL: store_load_large_imm_offset_foo: 3320; GFX9-PAL: ; %bb.0: ; %bb 3321; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3322; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 3323; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 3324; GFX9-PAL-NEXT: 
s_add_i32 vcc_hi, s32, 4 3325; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4 3326; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3327; GFX9-PAL-NEXT: s_add_i32 s0, s0, vcc_hi 3328; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 3329; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 3330; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3331; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc 3332; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3333; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 3334; 3335; GFX940-LABEL: store_load_large_imm_offset_foo: 3336; GFX940: ; %bb.0: ; %bb 3337; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3338; GFX940-NEXT: v_mov_b32_e32 v0, 13 3339; GFX940-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1 3340; GFX940-NEXT: s_waitcnt vmcnt(0) 3341; GFX940-NEXT: v_mov_b32_e32 v0, 0x3000 3342; GFX940-NEXT: v_mov_b32_e32 v1, 15 3343; GFX940-NEXT: scratch_store_dword v0, v1, s32 offset:3716 sc0 sc1 3344; GFX940-NEXT: s_waitcnt vmcnt(0) 3345; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:3716 sc0 sc1 3346; GFX940-NEXT: s_waitcnt vmcnt(0) 3347; GFX940-NEXT: s_setpc_b64 s[30:31] 3348; 3349; GFX10-PAL-LABEL: store_load_large_imm_offset_foo: 3350; GFX10-PAL: ; %bb.0: ; %bb 3351; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3352; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3353; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13 3354; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 3355; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800 3356; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 4 3357; GFX10-PAL-NEXT: s_add_i32 s0, s0, vcc_lo 3358; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4 3359; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3360; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 3361; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3362; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 3363; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 3364; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 3365; 3366; GFX11-PAL-LABEL: store_load_large_imm_offset_foo: 3367; GFX11-PAL: ; 
%bb.0: ; %bb 3368; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3369; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3370; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 3371; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 3372; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc 3373; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3374; GFX11-PAL-NEXT: scratch_store_b32 v1, v2, s32 offset:3716 dlc 3375; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3376; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:3716 glc dlc 3377; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 3378; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
; IR under test: a volatile store of 13 through an undef index of a 4096 x i32 addrspace(5) alloca, then a volatile store of 15 and a volatile load at element 4000.
3392bb: 3393 %i = alloca [4096 x i32], align 4, addrspace(5) 3394 %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef 3395 store volatile i32 13, i32 addrspace(5)* %i1, align 4 3396 %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 3397 store volatile i32 15, i32 addrspace(5)* %i7, align 4 3398 %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 3399 %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4 3400 ret void 3401} 3402 3403define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { 3404; GFX9-LABEL: store_load_vidx_sidx_offset: 3405; GFX9: ; %bb.0: ; %bb 3406; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 3407; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2,
s5 3408; GFX9-NEXT: v_mov_b32_e32 v1, 4 3409; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 3410; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3411; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 3412; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 3413; GFX9-NEXT: v_mov_b32_e32 v1, 15 3414; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024 3415; GFX9-NEXT: s_waitcnt vmcnt(0) 3416; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 glc 3417; GFX9-NEXT: s_waitcnt vmcnt(0) 3418; GFX9-NEXT: s_endpgm 3419; 3420; GFX10-LABEL: store_load_vidx_sidx_offset: 3421; GFX10: ; %bb.0: ; %bb 3422; GFX10-NEXT: s_add_u32 s2, s2, s5 3423; GFX10-NEXT: s_addc_u32 s3, s3, 0 3424; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 3425; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 3426; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 3427; GFX10-NEXT: v_mov_b32_e32 v1, 15 3428; GFX10-NEXT: s_waitcnt lgkmcnt(0) 3429; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 3430; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, 4 3431; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024 3432; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3433; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc 3434; GFX10-NEXT: s_waitcnt vmcnt(0) 3435; GFX10-NEXT: s_endpgm 3436; 3437; GFX11-LABEL: store_load_vidx_sidx_offset: 3438; GFX11: ; %bb.0: ; %bb 3439; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 3440; GFX11-NEXT: v_mov_b32_e32 v1, 15 3441; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3442; GFX11-NEXT: v_add_lshl_u32 v0, s0, v0, 2 3443; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:1028 dlc 3444; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3445; GFX11-NEXT: scratch_load_b32 v0, v0, off offset:1028 glc dlc 3446; GFX11-NEXT: s_waitcnt vmcnt(0) 3447; GFX11-NEXT: s_endpgm 3448; 3449; GFX9-PAL-LABEL: store_load_vidx_sidx_offset: 3450; GFX9-PAL: ; %bb.0: ; %bb 3451; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 3452; GFX9-PAL-NEXT: s_mov_b32 s4, s0 3453; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 3454; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4 3455; GFX9-PAL-NEXT: 
s_load_dword s0, s[0:1], 0x0 3456; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 3457; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 3458; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 3459; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0 3460; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 3461; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 3462; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 3463; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 3464; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3465; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc 3466; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3467; GFX9-PAL-NEXT: s_endpgm 3468; 3469; GFX940-LABEL: store_load_vidx_sidx_offset: 3470; GFX940: ; %bb.0: ; %bb 3471; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 3472; GFX940-NEXT: v_mov_b32_e32 v1, 15 3473; GFX940-NEXT: s_waitcnt lgkmcnt(0) 3474; GFX940-NEXT: v_add_lshl_u32 v0, s0, v0, 2 3475; GFX940-NEXT: scratch_store_dword v0, v1, off offset:1028 sc0 sc1 3476; GFX940-NEXT: s_waitcnt vmcnt(0) 3477; GFX940-NEXT: scratch_load_dword v0, v0, off offset:1028 sc0 sc1 3478; GFX940-NEXT: s_waitcnt vmcnt(0) 3479; GFX940-NEXT: s_endpgm 3480; 3481; GFX10-PAL-LABEL: store_load_vidx_sidx_offset: 3482; GFX10-PAL: ; %bb.0: ; %bb 3483; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] 3484; GFX10-PAL-NEXT: s_mov_b32 s4, s0 3485; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 3486; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 3487; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff 3488; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 3489; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 3490; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 3491; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 3492; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 3493; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 3494; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 3495; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, s0, v0 3496; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 4 3497; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 3498; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 
3499; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc 3500; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 3501; GFX10-PAL-NEXT: s_endpgm 3502; 3503; GFX11-PAL-LABEL: store_load_vidx_sidx_offset: 3504; GFX11-PAL: ; %bb.0: ; %bb 3505; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 3506; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15 3507; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) 3508; GFX11-PAL-NEXT: v_add_lshl_u32 v0, s0, v0, 2 3509; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1028 dlc 3510; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3511; GFX11-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1028 glc dlc 3512; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 3513; GFX11-PAL-NEXT: s_endpgm
; IR under test: indexes a 32 x i32 addrspace(5) alloca with %sidx + workitem.id.x + 256, then does a volatile store of 15 and a volatile load through that pointer.
3525bb: 3526 %alloca = alloca [32 x i32], align 4, addrspace(5) 3527 %vidx = tail call i32 @llvm.amdgcn.workitem.id.x() 3528 %add1 = add nsw i32 %sidx, %vidx 3529 %add2 = add nsw i32 %add1, 256 3530 %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2 3531 store volatile i32 15, i32 addrspace(5)* %gep, align 4 3532 %load = load volatile i32, i32 addrspace(5)* %gep, align 4 3533 ret void 3534} 3535 3536define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) { 3537; GFX9-LABEL: store_load_i64_aligned: 3538; GFX9: ; %bb.0: ; %bb 3539; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3540; GFX9-NEXT: v_mov_b32_e32 v1, 15 3541; GFX9-NEXT: v_mov_b32_e32 v2, 0 3542; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off 3543; GFX9-NEXT: s_waitcnt vmcnt(0) 3544; GFX9-NEXT: scratch_load_dwordx2
v[0:1], v0, off glc 3545; GFX9-NEXT: s_waitcnt vmcnt(0) 3546; GFX9-NEXT: s_setpc_b64 s[30:31] 3547; 3548; GFX10-LABEL: store_load_i64_aligned: 3549; GFX10: ; %bb.0: ; %bb 3550; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3551; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3552; GFX10-NEXT: v_mov_b32_e32 v1, 15 3553; GFX10-NEXT: v_mov_b32_e32 v2, 0 3554; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off 3555; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3556; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc 3557; GFX10-NEXT: s_waitcnt vmcnt(0) 3558; GFX10-NEXT: s_setpc_b64 s[30:31] 3559; 3560; GFX11-LABEL: store_load_i64_aligned: 3561; GFX11: ; %bb.0: ; %bb 3562; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3563; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3564; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 3565; GFX11-NEXT: scratch_store_b64 v0, v[1:2], off dlc 3566; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3567; GFX11-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc 3568; GFX11-NEXT: s_waitcnt vmcnt(0) 3569; GFX11-NEXT: s_setpc_b64 s[30:31] 3570; 3571; GFX9-PAL-LABEL: store_load_i64_aligned: 3572; GFX9-PAL: ; %bb.0: ; %bb 3573; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3574; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 3575; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 0 3576; GFX9-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off 3577; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3578; GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc 3579; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3580; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 3581; 3582; GFX940-LABEL: store_load_i64_aligned: 3583; GFX940: ; %bb.0: ; %bb 3584; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3585; GFX940-NEXT: v_mov_b32_e32 v2, 15 3586; GFX940-NEXT: v_mov_b32_e32 v3, 0 3587; GFX940-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 3588; GFX940-NEXT: s_waitcnt vmcnt(0) 3589; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 3590; GFX940-NEXT: s_waitcnt vmcnt(0) 3591; GFX940-NEXT: s_setpc_b64 s[30:31] 
3592; 3593; GFX10-PAL-LABEL: store_load_i64_aligned: 3594; GFX10-PAL: ; %bb.0: ; %bb 3595; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3596; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3597; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 3598; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 0 3599; GFX10-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off 3600; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3601; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc 3602; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 3603; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 3604; 3605; GFX11-PAL-LABEL: store_load_i64_aligned: 3606; GFX11-PAL: ; %bb.0: ; %bb 3607; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3608; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3609; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 3610; GFX11-PAL-NEXT: scratch_store_b64 v0, v[1:2], off dlc 3611; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3612; GFX11-PAL-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc 3613; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 3614; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
; IR under test: a volatile i64 store of 15 and a volatile i64 load through %arg, both with align 8.
3625bb: 3626 store volatile i64 15, i64 addrspace(5)* %arg, align 8 3627 %load = load volatile i64, i64 addrspace(5)* %arg, align 8 3628 ret void 3629} 3630 3631define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) { 3632; GFX9-LABEL: store_load_i64_unaligned: 3633; GFX9: ; %bb.0: ; %bb 3634; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3635; GFX9-NEXT: v_mov_b32_e32 v1, 15 3636; GFX9-NEXT: v_mov_b32_e32 v2, 0 3637; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off 3638; GFX9-NEXT:
s_waitcnt vmcnt(0) 3639; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc 3640; GFX9-NEXT: s_waitcnt vmcnt(0) 3641; GFX9-NEXT: s_setpc_b64 s[30:31] 3642; 3643; GFX10-LABEL: store_load_i64_unaligned: 3644; GFX10: ; %bb.0: ; %bb 3645; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3646; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3647; GFX10-NEXT: v_mov_b32_e32 v1, 15 3648; GFX10-NEXT: v_mov_b32_e32 v2, 0 3649; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off 3650; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3651; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc 3652; GFX10-NEXT: s_waitcnt vmcnt(0) 3653; GFX10-NEXT: s_setpc_b64 s[30:31] 3654; 3655; GFX11-LABEL: store_load_i64_unaligned: 3656; GFX11: ; %bb.0: ; %bb 3657; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3658; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3659; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 3660; GFX11-NEXT: scratch_store_b64 v0, v[1:2], off dlc 3661; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3662; GFX11-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc 3663; GFX11-NEXT: s_waitcnt vmcnt(0) 3664; GFX11-NEXT: s_setpc_b64 s[30:31] 3665; 3666; GFX9-PAL-LABEL: store_load_i64_unaligned: 3667; GFX9-PAL: ; %bb.0: ; %bb 3668; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3669; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 3670; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 0 3671; GFX9-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off 3672; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3673; GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc 3674; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3675; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 3676; 3677; GFX940-LABEL: store_load_i64_unaligned: 3678; GFX940: ; %bb.0: ; %bb 3679; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3680; GFX940-NEXT: v_mov_b32_e32 v2, 15 3681; GFX940-NEXT: v_mov_b32_e32 v3, 0 3682; GFX940-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 3683; GFX940-NEXT: s_waitcnt vmcnt(0) 3684; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 3685; 
GFX940-NEXT: s_waitcnt vmcnt(0) 3686; GFX940-NEXT: s_setpc_b64 s[30:31] 3687; 3688; GFX10-PAL-LABEL: store_load_i64_unaligned: 3689; GFX10-PAL: ; %bb.0: ; %bb 3690; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3691; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3692; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 3693; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 0 3694; GFX10-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off 3695; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3696; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc 3697; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 3698; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 3699; 3700; GFX11-PAL-LABEL: store_load_i64_unaligned: 3701; GFX11-PAL: ; %bb.0: ; %bb 3702; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3703; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3704; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 3705; GFX11-PAL-NEXT: scratch_store_b64 v0, v[1:2], off dlc 3706; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3707; GFX11-PAL-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc 3708; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 3709; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
; IR under test: a volatile i64 store of 15 and a volatile i64 load through %arg, both with align 1.
3720bb: 3721 store volatile i64 15, i64 addrspace(5)* %arg, align 1 3722 %load = load volatile i64, i64 addrspace(5)* %arg, align 1 3723 ret void 3724} 3725 3726define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) { 3727; GFX9-LABEL: store_load_v3i32_unaligned: 3728; GFX9: ; %bb.0: ; %bb 3729; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3730; GFX9-NEXT: v_mov_b32_e32 v1, 1 3731; GFX9-NEXT:
v_mov_b32_e32 v2, 2
; GFX9-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_v3i32_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_v3i32_unaligned:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
; GFX11-NEXT:    v_mov_b32_e32 v3, 3
; GFX11-NEXT:    scratch_store_b96 v0, v[1:3], off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b96 v[0:2], v0, off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_v3i32_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_v3i32_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v2, 1
; GFX940-NEXT:    v_mov_b32_e32 v3, 2
; GFX940-NEXT:    v_mov_b32_e32 v4, 3
; GFX940-NEXT:    scratch_store_dwordx3 v0, v[2:4], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx3 v[0:2], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_v3i32_unaligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_v3i32_unaligned:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX11-PAL-NEXT:    scratch_store_b96 v0, v[1:3], off dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b96 v[0:2], v0, off glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  ; Volatile store followed by a volatile load of a <3 x i32> through a
  ; byte-aligned (align 1) addrspace(5) pointer. Every checked target is
  ; expected to emit a single 96-bit scratch store and a single 96-bit scratch
  ; load (dwordx3 / b96) rather than splitting the access.
  store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1
  %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1
  ret void
}

define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v4i32_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-NEXT:    v_mov_b32_e32 v4, 4
; GFX9-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_v4i32_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-NEXT:    v_mov_b32_e32 v4, 4
; GFX10-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_v4i32_unaligned:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
; GFX11-NEXT:    v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b128 v[0:3], v0, off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_v4i32_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt
vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-PAL-NEXT:    v_mov_b32_e32 v4, 4
; GFX9-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_v4i32_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v2, 1
; GFX940-NEXT:    v_mov_b32_e32 v3, 2
; GFX940-NEXT:    v_mov_b32_e32 v4, 3
; GFX940-NEXT:    v_mov_b32_e32 v5, 4
; GFX940-NEXT:    scratch_store_dwordx4 v0, v[2:5], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx4 v[0:3], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_v4i32_unaligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-PAL-NEXT:    v_mov_b32_e32 v4, 4
; GFX10-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_v4i32_unaligned:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
; GFX11-PAL-NEXT:    v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
; GFX11-PAL-NEXT:    scratch_store_b128 v0, v[1:4], off dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b128 v[0:3], v0, off glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  ; Volatile store followed by a volatile load of a <4 x i32> through a
  ; byte-aligned (align 1) addrspace(5) pointer. Every checked target is
  ; expected to emit a single 128-bit scratch store and a single 128-bit
  ; scratch load (dwordx4 / b128) rather than splitting the access.
  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1
  %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1
  ret void
}

define void @store_load_i32_negative_unaligned(i8 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i32_negative_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    scratch_store_byte v0, v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_ubyte v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i32_negative_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-1
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_i32_negative_unaligned:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mov_b32_e32 v1, 1
; GFX11-NEXT:    scratch_store_b8 v0, v1, off offset:-1 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_u8 v0, v0, off offset:-1 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i32_negative_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_add_u32_e32 v0, -1, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    scratch_store_byte v0, v1, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_i32_negative_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_add_u32_e32 v0, -1, v0
; GFX940-NEXT:    v_mov_b32_e32 v1, 1
; GFX940-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_ubyte v0, v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1010-PAL-LABEL: store_load_i32_negative_unaligned:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, -1, v0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1030-PAL-LABEL: store_load_i32_negative_unaligned:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-1
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_i32_negative_unaligned:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-1 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-1 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  ; Address one byte below the incoming addrspace(5) pointer, then do a
  ; volatile byte store followed by a volatile byte load there. The accessed
  ; type is i8 even though the function name says i32. The checks expect the
  ; -1 either as an immediate offset (GFX10, GFX11, GFX1030-PAL, GFX11-PAL) or
  ; as an explicit add to the address register (GFX9, GFX9-PAL, GFX940,
  ; GFX1010-PAL).
  %addr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -1
  store volatile i8 1, i8 addrspace(5)* %addr, align 1
  %val = load volatile i8, i8 addrspace(5)* %addr, align 1
  ret void
}

define void @store_load_i32_large_negative_unaligned(i8 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i32_large_negative_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    scratch_store_byte v0, v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_ubyte v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i32_large_negative_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:
scratch_store_byte v0, v1, off offset:-129
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_i32_large_negative_unaligned:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xfffff000, v0
; GFX11-NEXT:    scratch_store_b8 v0, v1, off offset:-129 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_u8 v0, v0, off offset:-129 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    scratch_store_byte v0, v1, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_i32_large_negative_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    s_movk_i32 s0, 0xef7f
; GFX940-NEXT:    v_mov_b32_e32 v1, 1
; GFX940-NEXT:    scratch_store_byte v0, v1, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_ubyte v0, v0, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1010-PAL-LABEL: store_load_i32_large_negative_unaligned:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xffffefff, v0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-128
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-128 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1030-PAL-LABEL: store_load_i32_large_negative_unaligned:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-129
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_i32_large_negative_unaligned:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xfffff000, v0
; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-129 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-129 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  ; Volatile byte store followed by a volatile byte load at a displacement of
  ; -4225 (0xffffef7f) from the incoming addrspace(5) pointer; the accessed
  ; type is i8 even though the function name says i32. The checks show three
  ; ways the displacement is expected to be formed:
  ;   - the full constant added to the address register (GFX9, GFX9-PAL);
  ;   - 0xef7f loaded into s0 with s_movk_i32 and s0 used as the scalar
  ;     operand of the scratch access (GFX940);
  ;   - an add plus an immediate offset: 0xfffff000 with offset:-129
  ;     (GFX10, GFX11, GFX1030-PAL, GFX11-PAL) or 0xffffefff with offset:-128
  ;     (GFX1010-PAL).
  %addr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -4225
  store volatile i8 1, i8 addrspace(5)* %addr, align 1
  %val = load volatile i8, i8 addrspace(5)* %addr, align 1
  ret void
}

define amdgpu_ps void @large_offset() {
; GFX9-LABEL: large_offset:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s2
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
;
GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    v_mov_b32_e32 v3, v0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:3024
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 16
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use v0
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x810
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use v0
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: large_offset:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s2
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_movk_i32 s0, 0x810
; GFX10-NEXT:    s_addk_i32 s0, 0x3c0
; GFX10-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, v0
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 16
; GFX10-NEXT:    v_mov_b32_e32 v1, 0x810
; GFX10-NEXT:    ;;#ASMSTART
; GFX10-NEXT:    ; use v0
; GFX10-NEXT:    ;;#ASMEND
; GFX10-NEXT:    ;;#ASMSTART
; GFX10-NEXT:    ; use v1
; GFX10-NEXT:    ;;#ASMEND
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: large_offset:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_e32 v1, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, v0
; GFX11-NEXT:    v_mov_b32_e32 v3, v0
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:3024 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b128 v[0:3], off, off offset:3024 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810
; GFX11-NEXT:    ;;#ASMSTART
; GFX11-NEXT:    ; use v0
; GFX11-NEXT:    ;;#ASMEND
; GFX11-NEXT:    ;;#ASMSTART
; GFX11-NEXT:    ; use v1
; GFX11-NEXT:    ;;#ASMEND
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: large_offset:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, v0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s0
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:3024
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 16
; GFX9-PAL-NEXT:    ;;#ASMSTART
; GFX9-PAL-NEXT:    ; use v0
; GFX9-PAL-NEXT:    ;;#ASMEND
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 0x810
; GFX9-PAL-NEXT:    ;;#ASMSTART
; GFX9-PAL-NEXT:    ; use v0
; GFX9-PAL-NEXT:    ;;#ASMEND
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: large_offset:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    v_mov_b32_e32 v0, 0
; GFX940-NEXT:    v_mov_b32_e32 v1, v0
; GFX940-NEXT:    v_mov_b32_e32 v2, v0
; GFX940-NEXT:    v_mov_b32_e32 v3, v0
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:3024 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:3024 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v0, 16
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; use v0
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    v_mov_b32_e32 v0, 0x810
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; use v0
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: large_offset:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s0
; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x810
; GFX10-PAL-NEXT:    s_addk_i32 s0, 0x3c0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, v0
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 16
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x810
; GFX10-PAL-NEXT:    ;;#ASMSTART
; GFX10-PAL-NEXT:    ; use v0
; GFX10-PAL-NEXT:    ;;#ASMEND
; GFX10-PAL-NEXT:    ;;#ASMSTART
; GFX10-PAL-NEXT:    ; use v1
; GFX10-PAL-NEXT:    ;;#ASMEND
; GFX10-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: large_offset:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, v0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, v0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, v0
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:3024 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b128 v[0:3], off, off offset:3024 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810
; GFX11-PAL-NEXT:    ;;#ASMSTART
; GFX11-PAL-NEXT:    ; use v0
; GFX11-PAL-NEXT:    ;;#ASMEND
; GFX11-PAL-NEXT:    ;;#ASMSTART
; GFX11-PAL-NEXT:    ; use v1
; GFX11-PAL-NEXT:    ;;#ASMEND
; GFX11-PAL-NEXT:    s_endpgm
bb:
  ; Two 128-element <4 x i32> arrays in addrspace(5). The volatile store/load
  ; pair targets element 60 of the second array, and both array addresses are
  ; then handed to inline asm so neither alloca can be dropped.
  ; In the expected output the two array addresses appear as 16 and 0x810, and
  ; the accessed element is addressed at 0x810 + 0x3c0 = 3024 (either as a
  ; single immediate offset or, on GFX10 / GFX10-PAL, built in s0).
  %buf0 = alloca [128 x <4 x i32>], align 16, addrspace(5)
  %buf1 = alloca [128 x <4 x i32>], align 16, addrspace(5)
  %elt = getelementptr inbounds [128 x <4 x i32>], [128 x <4 x i32>] addrspace(5)* %buf1, i32 0, i32 60
  store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(5)* %elt, align 16
  %val = load volatile <4 x i32>, <4 x i32> addrspace(5)* %elt, align 16
  call void asm sideeffect "; use $0", "s"([128 x <4 x i32>] addrspace(5)* %buf0) #0
  call void asm sideeffect "; use $0", "s"([128 x <4 x i32>] addrspace(5)* %buf1) #0
  ret void
}

declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg)
declare i32 @llvm.amdgcn.workitem.id.x()