; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9-PAL %s
; RUN: llc -march=amdgcn -mcpu=gfx940 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1010-PAL %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1030-PAL %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-PAL %s

; Code generation checks for stores and loads on addrspace(5) allocas.
; Every run disables alloca promotion; every run except the gfx940 one also
; passes +enable-flat-scratch explicitly.

; Kernel entry point (amdgpu_kernel) that zero-fills a 64-byte, 2-byte-aligned
; [32 x i16] addrspace(5) alloca with a single memset call.
define amdgpu_kernel void @zero_init_kernel() {
; GFX9-LABEL: zero_init_kernel:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: zero_init_kernel:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: zero_init_kernel:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_mov_b32 s1, s0
; GFX11-NEXT:    s_mov_b32 s2, s0
; GFX11-NEXT:    s_mov_b32 s3, s0
; GFX11-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:64
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:48
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:32
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:16
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: zero_init_kernel:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: zero_init_kernel:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_mov_b32 s0, 0
; GFX940-NEXT:    s_mov_b32 s1, s0
; GFX940-NEXT:    s_mov_b32 s2, s0
; GFX940-NEXT:    s_mov_b32 s3, s0
; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: zero_init_kernel:
; GFX1010-PAL:       ; %bb.0:
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:64
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: zero_init_kernel:
; GFX1030-PAL:       ; %bb.0:
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
; GFX1030-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: zero_init_kernel:
; GFX11-PAL:       ; %bb.0:
; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-PAL-NEXT:    s_clause 0x3
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:64
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:48
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:32
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:16
; GFX11-PAL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PAL-NEXT:    s_endpgm
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

; Same 64-byte zero-fill as @zero_init_kernel, but in a function declared with
; the default calling convention instead of amdgpu_kernel.
define void @zero_init_foo() {
; GFX9-LABEL: zero_init_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: zero_init_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: zero_init_foo:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_mov_b32 s1, s0
; GFX11-NEXT:    s_mov_b32 s2, s0
; GFX11-NEXT:    s_mov_b32 s3, s0
; GFX11-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:48
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:32
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: zero_init_foo:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: zero_init_foo:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    s_mov_b32 s0, 0
; GFX940-NEXT:    s_mov_b32 s1, s0
; GFX940-NEXT:    s_mov_b32 s2, s0
; GFX940-NEXT:    s_mov_b32 s3, s0
; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: zero_init_foo:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: zero_init_foo:
; GFX11-PAL:       ; %bb.0:
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-PAL-NEXT:    s_clause 0x3
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:48
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:32
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

; Kernel taking %idx as an argument. It does a volatile store of 15 to element
; %idx of a [32 x float] addrspace(5) alloca, then a volatile load from element
; (%idx & 15) of the same alloca.
define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-NEXT:    s_and_b32 s0, s0, 15
; GFX9-NEXT:    s_add_i32 s1, s1, 4
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    scratch_store_dword off, v0, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_and_b32 s1, s0, 15
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-NEXT:    s_add_i32 s0, s0, 4
; GFX10-NEXT:    s_add_i32 s1, s1, 4
; GFX10-NEXT:    scratch_store_dword off, v0, s0
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_sindex_kernel:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_and_b32 s1, s0, 15
; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-NEXT:    s_add_i32 s0, s0, 4
; GFX11-NEXT:    s_add_i32 s1, s1, 4
; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_sindex_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT:    v_mov_b32_e32 v0, 15
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
; GFX940-NEXT:    s_and_b32 s0, s0, 15
; GFX940-NEXT:    s_add_i32 s1, s1, 4
; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_add_i32 s0, s0, 4
; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_sindex_kernel:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX10-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_sindex_kernel:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX11-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Same store/load pair as @store_load_sindex_kernel, but declared amdgpu_ps
; with %idx passed as an inreg argument.
define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX9-LABEL: store_load_sindex_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_and_b32 s0, s2, 15
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_and_b32 s0, s2, 15
; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_add_i32 s1, s1, 4
; GFX10-NEXT:    s_add_i32 s0, s0, 4
; GFX10-NEXT:    scratch_store_dword off, v0, s1
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_sindex_foo:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-NEXT:    s_and_b32 s1, s0, 15
; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-NEXT:    s_add_i32 s0, s0, 4
; GFX11-NEXT:    s_add_i32 s1, s1, 4
; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_sindex_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
; GFX940-NEXT:    s_and_b32 s0, s0, 15
; GFX940-NEXT:    s_add_i32 s1, s1, 4
; GFX940-NEXT:    v_mov_b32_e32 v0, 15
; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_add_i32 s0, s0, 4
; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_sindex_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX10-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_sindex_foo:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX11-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Kernel whose index comes from llvm.amdgcn.workitem.id.x. It does a volatile
; store of 15 to element id of a [32 x float] addrspace(5) alloca, then a
; volatile load from element (31 - id).
define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9-LABEL: store_load_vindex_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    v_add_u32_e32 v1, 4, v0
; GFX9-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-NEXT:    scratch_store_dword v1, v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_sub_u32_e32 v0, 4, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vindex_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    v_add_nc_u32_e32 v1, 4, v0
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 4, v0
; GFX10-NEXT:    scratch_store_dword v1, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_vindex_kernel:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    v_mov_b32_e32 v1, 15
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 4, v0
; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:4 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_vindex_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 4, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 4, v0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_vindex_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_sub_u32_e32 v0, 4, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_vindex_kernel:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v1, 4, v0
; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, 4, v0
; GFX10-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_vindex_kernel:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 4, v0
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i3 = zext i32 %i2 to i64
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

define
void @store_load_vindex_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s32
; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, s32
; GFX10-NEXT:    scratch_store_dword v0, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_vindex_foo:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, 15
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-NEXT:    scratch_store_b32 v0, v2, s32 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s32
; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_vindex_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX940-NEXT:    scratch_store_dword v1, v2, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_vindex_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, s32
; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_vindex_foo:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, s32 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; IR under test: %i is a [32 x float] alloca in addrspace(5). The function
; volatile-stores i32 15 to element %idx of %i, then volatile-loads element
; (%idx & 15) of the same alloca. The index arrives as an ordinary i32 argument.
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
; GFX9-LABEL: private_ptr_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_ptr_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
;
GFX11-LABEL: private_ptr_foo:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:4
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: private_ptr_foo:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: private_ptr_foo:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: private_ptr_foo:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: private_ptr_foo:
; GFX11-PAL:       ; %bb.0:
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; IR under test: store float 10.0 (the 0x41200000 immediate in the checks
; above) to the float one element past the addrspace(5) pointer %arg.
  %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
  store float 1.000000e+01, float addrspace(5)* %gep, align 4
  ret void
}

define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX9-LABEL: zero_init_small_offset_kernel:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: zero_init_small_offset_kernel:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
;
GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: zero_init_small_offset_kernel:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_mov_b32 s1, s0
; GFX11-NEXT:    s_mov_b32 s2, s0
; GFX11-NEXT:    s_mov_b32 s3, s0
; GFX11-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:272
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:288
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:304
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:320
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: zero_init_small_offset_kernel:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: zero_init_small_offset_kernel:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_mov_b32 s0, 0
; GFX940-NEXT:    s_mov_b32 s1, s0
; GFX940-NEXT:    s_mov_b32 s2, s0
; GFX940-NEXT:    s_mov_b32 s3, s0
; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: zero_init_small_offset_kernel:
; GFX1010-PAL:       ; %bb.0:
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:272
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:288
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:304
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:320
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: zero_init_small_offset_kernel:
; GFX1030-PAL:       ; %bb.0:
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
; GFX1030-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: zero_init_small_offset_kernel:
; GFX11-PAL:       ; %bb.0:
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-PAL-NEXT:    s_clause 0x3
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:272
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:288
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:304
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:320
; GFX11-PAL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PAL-NEXT:    s_endpgm
;
; IR under test: a [64 x i32] %padding alloca precedes the [32 x i16] %alloca
; (both addrspace(5)). The volatile load from %padding keeps that alloca live;
; the memset then zeroes all 64 bytes of %alloca.
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

define void @zero_init_small_offset_foo() {
; GFX9-LABEL: zero_init_small_offset_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0,
off, s32 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: zero_init_small_offset_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s32 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: zero_init_small_offset_foo:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, off, s32 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_mov_b32 s1, s0
; GFX11-NEXT:    s_mov_b32 s2, s0
; GFX11-NEXT:    s_mov_b32 s3, s0
; GFX11-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:256
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:272
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:288
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:304
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: zero_init_small_offset_foo:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: zero_init_small_offset_foo:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    scratch_load_dword v0, off, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_mov_b32 s0, 0
; GFX940-NEXT:    s_mov_b32 s1, s0
; GFX940-NEXT:    s_mov_b32 s2, s0
; GFX940-NEXT:    s_mov_b32 s3, s0
; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: zero_init_small_offset_foo:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s32 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: zero_init_small_offset_foo:
; GFX11-PAL:       ; %bb.0:
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s32 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX11-PAL-NEXT:    s_clause 0x3
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:256
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:272
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:288
; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:304
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; IR under test: a [64 x i32] %padding alloca precedes the [32 x i16] %alloca
; (both addrspace(5)). The volatile load from %padding keeps that alloca live;
; the memset then zeroes all 64 bytes of %alloca.
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_small_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    s_addc_u32
flat_scratch_hi, s3, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-NEXT:    s_and_b32 s0, s0, 15
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    s_addk_i32 s1, 0x104
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    scratch_store_dword off, v0, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_addk_i32 s0, 0x104
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_small_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_and_b32 s1, s0, 15
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-NEXT:    s_addk_i32 s0, 0x104
; GFX10-NEXT:    s_addk_i32 s1, 0x104
; GFX10-NEXT:    scratch_store_dword off, v0, s0
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_sindex_small_offset_kernel:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_and_b32 s1, s0, 15
; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-NEXT:    s_addk_i32 s0, 0x104
; GFX11-NEXT:    s_addk_i32 s1, 0x104
; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_sindex_small_offset_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v0, 15
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
; GFX940-NEXT:    s_and_b32 s0, s0, 15
; GFX940-NEXT:    s_addk_i32 s1, 0x104
; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_addk_i32 s0, 0x104
; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX1010-PAL-NEXT:    s_mov_b32 s4, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s4, s4, s3
; GFX1010-PAL-NEXT:    s_addc_u32 s5, s5, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX1030-PAL-NEXT:    s_mov_b32 s4, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s4, s4, s3
; GFX1030-PAL-NEXT:    s_addc_u32 s5, s5, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1030-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_endpgm
;
; GFX11-PAL-LABEL: store_load_sindex_small_offset_kernel:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX11-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX11-PAL-NEXT:    s_addk_i32 s0, 0x104
; GFX11-PAL-NEXT:    s_addk_i32 s1, 0x104
; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_endpgm
;
; IR under test: a [64 x i32] %padding alloca precedes the [32 x float] %i
; alloca (both addrspace(5)). After a volatile load from %padding, the kernel
; volatile-stores i32 15 to element %idx of %i and volatile-loads element
; (%idx & 15). %idx is a kernel argument.
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
; GFX9-LABEL: store_load_sindex_small_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
; GFX9-NEXT:    s_addk_i32 s0, 0x104
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_and_b32 s0, s2, 15
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    s_addk_i32 s0, 0x104
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_small_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_and_b32 s0, s2, 15
; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_addk_i32 s1, 0x104
; GFX10-NEXT:    s_addk_i32 s0, 0x104
; GFX10-NEXT:    scratch_store_dword off, v0, s1
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword
v0, off, s0 glc dlc 1588; GFX10-NEXT: s_waitcnt vmcnt(0) 1589; GFX10-NEXT: s_endpgm 1590; 1591; GFX11-LABEL: store_load_sindex_small_offset_foo: 1592; GFX11: ; %bb.0: ; %bb 1593; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 1594; GFX11-NEXT: s_waitcnt vmcnt(0) 1595; GFX11-NEXT: v_mov_b32_e32 v0, 15 1596; GFX11-NEXT: s_and_b32 s1, s0, 15 1597; GFX11-NEXT: s_lshl_b32 s0, s0, 2 1598; GFX11-NEXT: s_lshl_b32 s1, s1, 2 1599; GFX11-NEXT: s_addk_i32 s0, 0x104 1600; GFX11-NEXT: s_addk_i32 s1, 0x104 1601; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc 1602; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1603; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc 1604; GFX11-NEXT: s_waitcnt vmcnt(0) 1605; GFX11-NEXT: s_endpgm 1606; 1607; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo: 1608; GFX9-PAL: ; %bb.0: ; %bb 1609; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1610; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1611; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1612; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1613; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1614; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1615; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1616; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1617; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 1618; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1619; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 1620; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 1621; GFX9-PAL-NEXT: s_addk_i32 s1, 0x104 1622; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 1623; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 1624; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 1625; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1626; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104 1627; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 1628; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1629; GFX9-PAL-NEXT: s_endpgm 1630; 1631; GFX940-LABEL: store_load_sindex_small_offset_foo: 1632; GFX940: ; %bb.0: ; %bb 1633; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 1634; GFX940-NEXT: s_waitcnt vmcnt(0) 1635; GFX940-NEXT: s_lshl_b32 
s1, s0, 2 1636; GFX940-NEXT: s_and_b32 s0, s0, 15 1637; GFX940-NEXT: s_addk_i32 s1, 0x104 1638; GFX940-NEXT: v_mov_b32_e32 v0, 15 1639; GFX940-NEXT: s_lshl_b32 s0, s0, 2 1640; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 1641; GFX940-NEXT: s_waitcnt vmcnt(0) 1642; GFX940-NEXT: s_addk_i32 s0, 0x104 1643; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 1644; GFX940-NEXT: s_waitcnt vmcnt(0) 1645; GFX940-NEXT: s_endpgm 1646; 1647; GFX1010-PAL-LABEL: store_load_sindex_small_offset_foo: 1648; GFX1010-PAL: ; %bb.0: ; %bb 1649; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 1650; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1651; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1652; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1653; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1654; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 1655; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 1656; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1657; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1658; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1659; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 1660; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc 1661; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1662; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 1663; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 1664; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 1665; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x104 1666; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x104 1667; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 1668; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1669; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1670; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1671; GFX1010-PAL-NEXT: s_endpgm 1672; 1673; GFX1030-PAL-LABEL: store_load_sindex_small_offset_foo: 1674; GFX1030-PAL: ; %bb.0: ; %bb 1675; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 1676; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1677; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1678; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1679; GFX1030-PAL-NEXT: s_and_b32 s3, 
s3, 0xffff 1680; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 1681; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 1682; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1683; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1684; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1685; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1686; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 1687; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 1688; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 1689; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 1690; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x104 1691; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x104 1692; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 1693; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1694; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1695; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1696; GFX1030-PAL-NEXT: s_endpgm 1697; 1698; GFX11-PAL-LABEL: store_load_sindex_small_offset_foo: 1699; GFX11-PAL: ; %bb.0: ; %bb 1700; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 1701; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 1702; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 1703; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 1704; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 1705; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 1706; GFX11-PAL-NEXT: s_addk_i32 s0, 0x104 1707; GFX11-PAL-NEXT: s_addk_i32 s1, 0x104 1708; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc 1709; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1710; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc 1711; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 1712; GFX11-PAL-NEXT: s_endpgm 1713bb: 1714 %padding = alloca [64 x i32], align 4, addrspace(5) 1715 %i = alloca [32 x float], align 4, addrspace(5) 1716 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 1717 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1718 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1719 %i7 = getelementptr inbounds [32 x float], [32 x float] 
addrspace(5)* %i, i32 0, i32 %idx 1720 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1721 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1722 %i9 = and i32 %idx, 15 1723 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1724 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1725 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1726 ret void 1727} 1728 1729define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { 1730; GFX9-LABEL: store_load_vindex_small_offset_kernel: 1731; GFX9: ; %bb.0: ; %bb 1732; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 1733; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 1734; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1735; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc 1736; GFX9-NEXT: s_waitcnt vmcnt(0) 1737; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1738; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v0 1739; GFX9-NEXT: v_mov_b32_e32 v2, 15 1740; GFX9-NEXT: scratch_store_dword v1, v2, off 1741; GFX9-NEXT: s_waitcnt vmcnt(0) 1742; GFX9-NEXT: v_sub_u32_e32 v0, 0x104, v0 1743; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc 1744; GFX9-NEXT: s_waitcnt vmcnt(0) 1745; GFX9-NEXT: s_endpgm 1746; 1747; GFX10-LABEL: store_load_vindex_small_offset_kernel: 1748; GFX10: ; %bb.0: ; %bb 1749; GFX10-NEXT: s_add_u32 s0, s0, s3 1750; GFX10-NEXT: s_addc_u32 s1, s1, 0 1751; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 1752; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1753; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1754; GFX10-NEXT: v_mov_b32_e32 v2, 15 1755; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc 1756; GFX10-NEXT: s_waitcnt vmcnt(0) 1757; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x104, v0 1758; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0 1759; GFX10-NEXT: scratch_store_dword v1, v2, off 1760; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1761; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 1762; GFX10-NEXT: s_waitcnt vmcnt(0) 1763; 
GFX10-NEXT: s_endpgm 1764; 1765; GFX11-LABEL: store_load_vindex_small_offset_kernel: 1766; GFX11: ; %bb.0: ; %bb 1767; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1768; GFX11-NEXT: v_mov_b32_e32 v1, 15 1769; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc 1770; GFX11-NEXT: s_waitcnt vmcnt(0) 1771; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 1772; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:260 dlc 1773; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1774; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc 1775; GFX11-NEXT: s_waitcnt vmcnt(0) 1776; GFX11-NEXT: s_endpgm 1777; 1778; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel: 1779; GFX9-PAL: ; %bb.0: ; %bb 1780; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1781; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1782; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1783; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1784; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1785; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 1786; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1787; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1788; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1789; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1790; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc 1791; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1792; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x104, v0 1793; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off 1794; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1795; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x104, v0 1796; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc 1797; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1798; GFX9-PAL-NEXT: s_endpgm 1799; 1800; GFX940-LABEL: store_load_vindex_small_offset_kernel: 1801; GFX940: ; %bb.0: ; %bb 1802; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 1803; GFX940-NEXT: s_waitcnt vmcnt(0) 1804; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1805; GFX940-NEXT: v_mov_b32_e32 v1, 15 1806; GFX940-NEXT: scratch_store_dword v0, v1, off offset:260 sc0 sc1 1807; GFX940-NEXT: s_waitcnt vmcnt(0) 1808; GFX940-NEXT: 
v_sub_u32_e32 v0, 0x104, v0 1809; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 1810; GFX940-NEXT: s_waitcnt vmcnt(0) 1811; GFX940-NEXT: s_endpgm 1812; 1813; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel: 1814; GFX1010-PAL: ; %bb.0: ; %bb 1815; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 1816; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1817; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1818; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1819; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1820; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 1821; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 1822; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1823; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1824; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1825; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15 1826; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1827; GFX1010-PAL-NEXT: scratch_load_dword v3, off, vcc_lo offset:4 glc dlc 1828; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1829; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x104, v0 1830; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0 1831; GFX1010-PAL-NEXT: scratch_store_dword v1, v2, off 1832; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1833; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 1834; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1835; GFX1010-PAL-NEXT: s_endpgm 1836; 1837; GFX1030-PAL-LABEL: store_load_vindex_small_offset_kernel: 1838; GFX1030-PAL: ; %bb.0: ; %bb 1839; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 1840; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1841; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1842; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1843; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1844; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 1845; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 1846; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1847; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1848; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1849; GFX1030-PAL-NEXT: 
v_mov_b32_e32 v2, 15 1850; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc 1851; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1852; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x104, v0 1853; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0 1854; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off 1855; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1856; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 1857; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1858; GFX1030-PAL-NEXT: s_endpgm 1859; 1860; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel: 1861; GFX11-PAL: ; %bb.0: ; %bb 1862; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1863; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15 1864; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc 1865; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 1866; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 1867; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:260 dlc 1868; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1869; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc 1870; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 1871; GFX11-PAL-NEXT: s_endpgm 1872bb: 1873 %padding = alloca [64 x i32], align 4, addrspace(5) 1874 %i = alloca [32 x float], align 4, addrspace(5) 1875 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 1876 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1877 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1878 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() 1879 %i3 = zext i32 %i2 to i64 1880 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 1881 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1882 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1883 %i9 = sub nsw i32 31, %i2 1884 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1885 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1886 %i12 = load volatile 
i32, i32 addrspace(5)* %i11, align 4 1887 ret void 1888} 1889
; Non-kernel variant of the small-offset, VGPR-indexed scratch test.
; A [64 x i32] padding alloca is read once (volatile) ahead of the
; [32 x float] array, and the expected output addresses the array 0x100 bytes
; past s32. The body stores i32 15 to element %idx, then performs a volatile
; load of element (%idx & 15).
define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_small_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    scratch_load_dword v1, off, s32 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x100
; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_small_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
; GFX10-NEXT:    scratch_load_dword v3, off, s32 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
; GFX10-NEXT:    scratch_store_dword v0, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_vindex_small_offset_foo:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, 15
; GFX11-NEXT:    scratch_load_b32 v3, off, s32 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-NEXT:    scratch_store_b32 v0, v2, s32 offset:256 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x100
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_hi
; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_vindex_small_offset_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    scratch_load_dword v1, off, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX940-NEXT:    scratch_store_dword v1, v2, s32 offset:256 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:256 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x100
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x100
; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_vindex_small_offset_foo:
; GFX11-PAL:       ; %bb.0: ; %bb
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, s32 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, s32 offset:256 dlc
; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 glc dlc
; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}
2036 2037define amdgpu_kernel void @zero_init_large_offset_kernel() { 2038; GFX9-LABEL: zero_init_large_offset_kernel: 2039; GFX9: ; %bb.0: 2040; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 2041; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 2042; GFX9-NEXT: s_mov_b32 vcc_hi, 0 2043; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:16 glc 2044; GFX9-NEXT: s_waitcnt vmcnt(0) 2045; GFX9-NEXT: s_mov_b32 s0, 0 2046; GFX9-NEXT: s_mov_b32 s1, s0 2047; GFX9-NEXT: s_mov_b32 s2, s0 2048; GFX9-NEXT: s_mov_b32 s3, s0 2049; GFX9-NEXT: v_mov_b32_e32 v0, s0 2050; GFX9-NEXT: v_mov_b32_e32 v1, s1 2051; GFX9-NEXT: v_mov_b32_e32 v2, s2 2052; GFX9-NEXT: v_mov_b32_e32 v3, s3 2053; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 2054; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 2055; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 2056; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 2057; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 2058; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 2059; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 2060; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi
offset:48 2061; GFX9-NEXT: s_endpgm 2062; 2063; GFX10-LABEL: zero_init_large_offset_kernel: 2064; GFX10: ; %bb.0: 2065; GFX10-NEXT: s_add_u32 s0, s0, s3 2066; GFX10-NEXT: s_addc_u32 s1, s1, 0 2067; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 2068; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 2069; GFX10-NEXT: scratch_load_dword v0, off, off offset:16 glc dlc 2070; GFX10-NEXT: s_waitcnt vmcnt(0) 2071; GFX10-NEXT: s_mov_b32 s0, 0 2072; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 2073; GFX10-NEXT: s_mov_b32 s1, s0 2074; GFX10-NEXT: s_mov_b32 s2, s0 2075; GFX10-NEXT: s_mov_b32 s3, s0 2076; GFX10-NEXT: v_mov_b32_e32 v0, s0 2077; GFX10-NEXT: v_mov_b32_e32 v1, s1 2078; GFX10-NEXT: v_mov_b32_e32 v2, s2 2079; GFX10-NEXT: v_mov_b32_e32 v3, s3 2080; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 2081; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 2082; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 2083; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 2084; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 2085; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 2086; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 2087; GFX10-NEXT: s_endpgm 2088; 2089; GFX11-LABEL: zero_init_large_offset_kernel: 2090; GFX11: ; %bb.0: 2091; GFX11-NEXT: scratch_load_b32 v0, off, off offset:16 glc dlc 2092; GFX11-NEXT: s_waitcnt vmcnt(0) 2093; GFX11-NEXT: s_mov_b32 s0, 0 2094; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4010 2095; GFX11-NEXT: s_mov_b32 s1, s0 2096; GFX11-NEXT: s_mov_b32 s2, s0 2097; GFX11-NEXT: s_mov_b32 s3, s0 2098; GFX11-NEXT: v_mov_b32_e32 v0, s0 2099; GFX11-NEXT: v_mov_b32_e32 v1, s1 2100; GFX11-NEXT: v_mov_b32_e32 v2, s2 2101; GFX11-NEXT: v_mov_b32_e32 v3, s3 2102; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo 2103; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4010 2104; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:16 2105; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4010 2106; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:32 2107; 
GFX11-NEXT: s_movk_i32 vcc_lo, 0x4010 2108; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 2109; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2110; GFX11-NEXT: s_endpgm 2111; 2112; GFX9-PAL-LABEL: zero_init_large_offset_kernel: 2113; GFX9-PAL: ; %bb.0: 2114; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 2115; GFX9-PAL-NEXT: s_mov_b32 s2, s0 2116; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2117; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 2118; GFX9-PAL-NEXT: s_mov_b32 s0, 0 2119; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 2120; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2121; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 2122; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 2123; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:16 glc 2124; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2125; GFX9-PAL-NEXT: s_mov_b32 s1, s0 2126; GFX9-PAL-NEXT: s_mov_b32 s2, s0 2127; GFX9-PAL-NEXT: s_mov_b32 s3, s0 2128; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 2129; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 2130; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 2131; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 2132; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 2133; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 2134; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 2135; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 2136; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 2137; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 2138; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 2139; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 2140; GFX9-PAL-NEXT: s_endpgm 2141; 2142; GFX940-LABEL: zero_init_large_offset_kernel: 2143; GFX940: ; %bb.0: 2144; GFX940-NEXT: scratch_load_dword v0, off, off offset:16 sc0 sc1 2145; GFX940-NEXT: s_waitcnt vmcnt(0) 2146; GFX940-NEXT: s_mov_b32 s0, 0 2147; GFX940-NEXT: s_mov_b32 s1, s0 2148; GFX940-NEXT: s_mov_b32 s2, s0 2149; GFX940-NEXT: s_mov_b32 s3, s0 2150; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2151; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] 
2152; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4010 2153; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 2154; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4010 2155; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 2156; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4010 2157; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 2158; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4010 2159; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 2160; GFX940-NEXT: s_endpgm 2161; 2162; GFX1010-PAL-LABEL: zero_init_large_offset_kernel: 2163; GFX1010-PAL: ; %bb.0: 2164; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 2165; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 2166; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2167; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 2168; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2169; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 2170; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 2171; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2172; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2173; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 2174; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 2175; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:16 glc dlc 2176; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2177; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 2178; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 2179; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 2180; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 2181; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 2182; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 2183; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 2184; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2185; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 2186; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 2187; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2188; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 2189; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 2190; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2191; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 
offset:32 2192; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 2193; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2194; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 2195; GFX1010-PAL-NEXT: s_endpgm 2196; 2197; GFX1030-PAL-LABEL: zero_init_large_offset_kernel: 2198; GFX1030-PAL: ; %bb.0: 2199; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 2200; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 2201; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2202; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 2203; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2204; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 2205; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 2206; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2207; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2208; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:16 glc dlc 2209; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2210; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 2211; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2212; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 2213; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 2214; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 2215; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 2216; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 2217; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 2218; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 2219; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 2220; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2221; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 2222; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2223; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 2224; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2225; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 2226; GFX1030-PAL-NEXT: s_endpgm 2227; 2228; GFX11-PAL-LABEL: zero_init_large_offset_kernel: 2229; GFX11-PAL: ; %bb.0: 2230; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:16 glc dlc 2231; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2232; GFX11-PAL-NEXT: s_mov_b32 s0, 0 2233; 
GFX11-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2234; GFX11-PAL-NEXT: s_mov_b32 s1, s0 2235; GFX11-PAL-NEXT: s_mov_b32 s2, s0 2236; GFX11-PAL-NEXT: s_mov_b32 s3, s0 2237; GFX11-PAL-NEXT: v_mov_b32_e32 v0, s0 2238; GFX11-PAL-NEXT: v_mov_b32_e32 v1, s1 2239; GFX11-PAL-NEXT: v_mov_b32_e32 v2, s2 2240; GFX11-PAL-NEXT: v_mov_b32_e32 v3, s3 2241; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo 2242; GFX11-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2243; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:16 2244; GFX11-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2245; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:32 2246; GFX11-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 2247; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 2248; GFX11-PAL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2249; GFX11-PAL-NEXT: s_endpgm 2250 %padding = alloca [4096 x i32], align 4, addrspace(5) 2251 %alloca = alloca [32 x i16], align 2, addrspace(5) 2252 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 2253 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 2254 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 2255 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 2256 ret void 2257} 2258 2259define void @zero_init_large_offset_foo() { 2260; GFX9-LABEL: zero_init_large_offset_foo: 2261; GFX9: ; %bb.0: 2262; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2263; GFX9-NEXT: scratch_load_dword v0, off, s32 offset:16 glc 2264; GFX9-NEXT: s_waitcnt vmcnt(0) 2265; GFX9-NEXT: s_mov_b32 s0, 0 2266; GFX9-NEXT: s_mov_b32 s1, s0 2267; GFX9-NEXT: s_mov_b32 s2, s0 2268; GFX9-NEXT: s_mov_b32 s3, s0 2269; GFX9-NEXT: v_mov_b32_e32 v0, s0 2270; GFX9-NEXT: v_mov_b32_e32 v1, s1 2271; GFX9-NEXT: v_mov_b32_e32 v2, s2 2272; GFX9-NEXT: v_mov_b32_e32 v3, s3 2273; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2274; GFX9-NEXT: scratch_store_dwordx4 off, 
v[0:3], vcc_hi 2275; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2276; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 2277; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2278; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 2279; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2280; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 2281; GFX9-NEXT: s_waitcnt vmcnt(0) 2282; GFX9-NEXT: s_setpc_b64 s[30:31] 2283; 2284; GFX10-LABEL: zero_init_large_offset_foo: 2285; GFX10: ; %bb.0: 2286; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2287; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2288; GFX10-NEXT: scratch_load_dword v0, off, s32 offset:16 glc dlc 2289; GFX10-NEXT: s_waitcnt vmcnt(0) 2290; GFX10-NEXT: s_mov_b32 s0, 0 2291; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2292; GFX10-NEXT: s_mov_b32 s1, s0 2293; GFX10-NEXT: s_mov_b32 s2, s0 2294; GFX10-NEXT: s_mov_b32 s3, s0 2295; GFX10-NEXT: v_mov_b32_e32 v0, s0 2296; GFX10-NEXT: v_mov_b32_e32 v1, s1 2297; GFX10-NEXT: v_mov_b32_e32 v2, s2 2298; GFX10-NEXT: v_mov_b32_e32 v3, s3 2299; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 2300; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2301; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 2302; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2303; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 2304; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2305; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 2306; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2307; GFX10-NEXT: s_setpc_b64 s[30:31] 2308; 2309; GFX11-LABEL: zero_init_large_offset_foo: 2310; GFX11: ; %bb.0: 2311; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2312; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2313; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:16 glc dlc 2314; GFX11-NEXT: s_waitcnt vmcnt(0) 2315; GFX11-NEXT: s_mov_b32 s0, 0 2316; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2317; GFX11-NEXT: s_mov_b32 s1, s0 2318; GFX11-NEXT: s_mov_b32 s2, s0 2319; 
GFX11-NEXT: s_mov_b32 s3, s0 2320; GFX11-NEXT: v_mov_b32_e32 v0, s0 2321; GFX11-NEXT: v_mov_b32_e32 v1, s1 2322; GFX11-NEXT: v_mov_b32_e32 v2, s2 2323; GFX11-NEXT: v_mov_b32_e32 v3, s3 2324; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo 2325; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2326; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:16 2327; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2328; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:32 2329; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2330; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 2331; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2332; GFX11-NEXT: s_setpc_b64 s[30:31] 2333; 2334; GFX9-PAL-LABEL: zero_init_large_offset_foo: 2335; GFX9-PAL: ; %bb.0: 2336; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2337; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 offset:16 glc 2338; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2339; GFX9-PAL-NEXT: s_mov_b32 s0, 0 2340; GFX9-PAL-NEXT: s_mov_b32 s1, s0 2341; GFX9-PAL-NEXT: s_mov_b32 s2, s0 2342; GFX9-PAL-NEXT: s_mov_b32 s3, s0 2343; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 2344; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 2345; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 2346; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 2347; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2348; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 2349; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2350; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 2351; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2352; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 2353; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2354; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 2355; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2356; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 2357; 2358; GFX940-LABEL: zero_init_large_offset_foo: 2359; GFX940: ; %bb.0: 2360; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2361; GFX940-NEXT: scratch_load_dword v0, off, s32 
offset:16 sc0 sc1 2362; GFX940-NEXT: s_waitcnt vmcnt(0) 2363; GFX940-NEXT: s_mov_b32 s0, 0 2364; GFX940-NEXT: s_mov_b32 s1, s0 2365; GFX940-NEXT: s_mov_b32 s2, s0 2366; GFX940-NEXT: s_mov_b32 s3, s0 2367; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 2368; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] 2369; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2370; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 2371; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2372; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 2373; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2374; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 2375; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4010 2376; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 2377; GFX940-NEXT: s_waitcnt vmcnt(0) 2378; GFX940-NEXT: s_setpc_b64 s[30:31] 2379; 2380; GFX1010-PAL-LABEL: zero_init_large_offset_foo: 2381; GFX1010-PAL: ; %bb.0: 2382; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2383; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2384; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s32 offset:16 glc dlc 2385; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2386; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 2387; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2388; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 2389; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 2390; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 2391; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 2392; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 2393; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 2394; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 2395; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 2396; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 2397; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2398; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 2399; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 2400; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2401; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 2402; 
GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 2403; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2404; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 2405; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2406; GFX1010-PAL-NEXT: s_setpc_b64 s[30:31] 2407; 2408; GFX1030-PAL-LABEL: zero_init_large_offset_foo: 2409; GFX1030-PAL: ; %bb.0: 2410; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2411; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2412; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s32 offset:16 glc dlc 2413; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2414; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 2415; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2416; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 2417; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 2418; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 2419; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 2420; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 2421; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 2422; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 2423; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 2424; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2425; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 2426; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2427; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 2428; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2429; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 2430; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2431; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31] 2432; 2433; GFX11-PAL-LABEL: zero_init_large_offset_foo: 2434; GFX11-PAL: ; %bb.0: 2435; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2436; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2437; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s32 offset:16 glc dlc 2438; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2439; GFX11-PAL-NEXT: s_mov_b32 s0, 0 2440; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2441; GFX11-PAL-NEXT: s_mov_b32 s1, s0 2442; GFX11-PAL-NEXT: s_mov_b32 
s2, s0 2443; GFX11-PAL-NEXT: s_mov_b32 s3, s0 2444; GFX11-PAL-NEXT: v_mov_b32_e32 v0, s0 2445; GFX11-PAL-NEXT: v_mov_b32_e32 v1, s1 2446; GFX11-PAL-NEXT: v_mov_b32_e32 v2, s2 2447; GFX11-PAL-NEXT: v_mov_b32_e32 v3, s3 2448; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo 2449; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2450; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:16 2451; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2452; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:32 2453; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 2454; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:48 2455; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2456; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] 2457 %padding = alloca [4096 x i32], align 4, addrspace(5) 2458 %alloca = alloca [32 x i16], align 2, addrspace(5) 2459 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 2460 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 2461 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 2462 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 2463 ret void 2464} 2465 2466define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { 2467; GFX9-LABEL: store_load_sindex_large_offset_kernel: 2468; GFX9: ; %bb.0: ; %bb 2469; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 2470; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 2471; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 2472; GFX9-NEXT: s_mov_b32 vcc_hi, 0 2473; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 2474; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2475; GFX9-NEXT: s_lshl_b32 s1, s0, 2 2476; GFX9-NEXT: s_and_b32 s0, s0, 15 2477; GFX9-NEXT: v_mov_b32_e32 v0, 15 2478; GFX9-NEXT: s_addk_i32 s1, 0x4004 2479; GFX9-NEXT: s_lshl_b32 s0, s0, 2 2480; GFX9-NEXT: scratch_store_dword off, v0, s1 2481; GFX9-NEXT: s_waitcnt vmcnt(0) 2482; 
GFX9-NEXT: s_addk_i32 s0, 0x4004 2483; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 2484; GFX9-NEXT: s_waitcnt vmcnt(0) 2485; GFX9-NEXT: s_endpgm 2486; 2487; GFX10-LABEL: store_load_sindex_large_offset_kernel: 2488; GFX10: ; %bb.0: ; %bb 2489; GFX10-NEXT: s_add_u32 s2, s2, s5 2490; GFX10-NEXT: s_addc_u32 s3, s3, 0 2491; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2492; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2493; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 2494; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 2495; GFX10-NEXT: s_waitcnt vmcnt(0) 2496; GFX10-NEXT: v_mov_b32_e32 v0, 15 2497; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2498; GFX10-NEXT: s_and_b32 s1, s0, 15 2499; GFX10-NEXT: s_lshl_b32 s0, s0, 2 2500; GFX10-NEXT: s_lshl_b32 s1, s1, 2 2501; GFX10-NEXT: s_addk_i32 s0, 0x4004 2502; GFX10-NEXT: s_addk_i32 s1, 0x4004 2503; GFX10-NEXT: scratch_store_dword off, v0, s0 2504; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2505; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc 2506; GFX10-NEXT: s_waitcnt vmcnt(0) 2507; GFX10-NEXT: s_endpgm 2508; 2509; GFX11-LABEL: store_load_sindex_large_offset_kernel: 2510; GFX11: ; %bb.0: ; %bb 2511; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 2512; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 2513; GFX11-NEXT: s_waitcnt vmcnt(0) 2514; GFX11-NEXT: v_mov_b32_e32 v0, 15 2515; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2516; GFX11-NEXT: s_and_b32 s1, s0, 15 2517; GFX11-NEXT: s_lshl_b32 s0, s0, 2 2518; GFX11-NEXT: s_lshl_b32 s1, s1, 2 2519; GFX11-NEXT: s_addk_i32 s0, 0x4004 2520; GFX11-NEXT: s_addk_i32 s1, 0x4004 2521; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc 2522; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2523; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc 2524; GFX11-NEXT: s_waitcnt vmcnt(0) 2525; GFX11-NEXT: s_endpgm 2526; 2527; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel: 2528; GFX9-PAL: ; %bb.0: ; %bb 2529; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 2530; GFX9-PAL-NEXT: s_mov_b32 s4, s0 2531; 
GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 2532; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 2533; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 2534; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 2535; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 2536; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 2537; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 2538; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 2539; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2540; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 2541; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 2542; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 2543; GFX9-PAL-NEXT: s_addk_i32 s1, 0x4004 2544; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 2545; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 2546; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2547; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004 2548; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 2549; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2550; GFX9-PAL-NEXT: s_endpgm 2551; 2552; GFX940-LABEL: store_load_sindex_large_offset_kernel: 2553; GFX940: ; %bb.0: ; %bb 2554; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 2555; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 2556; GFX940-NEXT: s_waitcnt vmcnt(0) 2557; GFX940-NEXT: v_mov_b32_e32 v0, 15 2558; GFX940-NEXT: s_waitcnt lgkmcnt(0) 2559; GFX940-NEXT: s_lshl_b32 s1, s0, 2 2560; GFX940-NEXT: s_and_b32 s0, s0, 15 2561; GFX940-NEXT: s_addk_i32 s1, 0x4004 2562; GFX940-NEXT: s_lshl_b32 s0, s0, 2 2563; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 2564; GFX940-NEXT: s_waitcnt vmcnt(0) 2565; GFX940-NEXT: s_addk_i32 s0, 0x4004 2566; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 2567; GFX940-NEXT: s_waitcnt vmcnt(0) 2568; GFX940-NEXT: s_endpgm 2569; 2570; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel: 2571; GFX1010-PAL: ; %bb.0: ; %bb 2572; GFX1010-PAL-NEXT: s_getpc_b64 s[4:5] 2573; GFX1010-PAL-NEXT: s_mov_b32 s4, s0 2574; GFX1010-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 2575; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 2576; GFX1010-PAL-NEXT: s_and_b32 
s5, s5, 0xffff 2577; GFX1010-PAL-NEXT: s_add_u32 s4, s4, s3 2578; GFX1010-PAL-NEXT: s_addc_u32 s5, s5, 0 2579; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 2580; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 2581; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 2582; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 2583; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc 2584; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2585; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 2586; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 2587; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 2588; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 2589; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 2590; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x4004 2591; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x4004 2592; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 2593; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2594; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 2595; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2596; GFX1010-PAL-NEXT: s_endpgm 2597; 2598; GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel: 2599; GFX1030-PAL: ; %bb.0: ; %bb 2600; GFX1030-PAL-NEXT: s_getpc_b64 s[4:5] 2601; GFX1030-PAL-NEXT: s_mov_b32 s4, s0 2602; GFX1030-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 2603; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 2604; GFX1030-PAL-NEXT: s_and_b32 s5, s5, 0xffff 2605; GFX1030-PAL-NEXT: s_add_u32 s4, s4, s3 2606; GFX1030-PAL-NEXT: s_addc_u32 s5, s5, 0 2607; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 2608; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 2609; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 2610; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 2611; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2612; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 2613; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 2614; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 2615; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 2616; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 2617; GFX1030-PAL-NEXT: 
s_addk_i32 s0, 0x4004 2618; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x4004 2619; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 2620; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2621; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 2622; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2623; GFX1030-PAL-NEXT: s_endpgm 2624; 2625; GFX11-PAL-LABEL: store_load_sindex_large_offset_kernel: 2626; GFX11-PAL: ; %bb.0: ; %bb 2627; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0 2628; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 2629; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2630; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 2631; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) 2632; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 2633; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 2634; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 2635; GFX11-PAL-NEXT: s_addk_i32 s0, 0x4004 2636; GFX11-PAL-NEXT: s_addk_i32 s1, 0x4004 2637; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc 2638; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2639; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc 2640; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2641; GFX11-PAL-NEXT: s_endpgm 2642bb: 2643 %padding = alloca [4096 x i32], align 4, addrspace(5) 2644 %i = alloca [32 x float], align 4, addrspace(5) 2645 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 2646 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 2647 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 2648 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 2649 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 2650 store volatile i32 15, i32 addrspace(5)* %i8, align 4 2651 %i9 = and i32 %idx, 15 2652 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 2653 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 2654 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 2655 ret void 2656} 2657 2658define amdgpu_ps void 
@store_load_sindex_large_offset_foo(i32 inreg %idx) { 2659; GFX9-LABEL: store_load_sindex_large_offset_foo: 2660; GFX9: ; %bb.0: ; %bb 2661; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 2662; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 2663; GFX9-NEXT: s_mov_b32 vcc_hi, 0 2664; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 2665; GFX9-NEXT: s_waitcnt vmcnt(0) 2666; GFX9-NEXT: s_lshl_b32 s0, s2, 2 2667; GFX9-NEXT: s_addk_i32 s0, 0x4004 2668; GFX9-NEXT: v_mov_b32_e32 v0, 15 2669; GFX9-NEXT: scratch_store_dword off, v0, s0 2670; GFX9-NEXT: s_waitcnt vmcnt(0) 2671; GFX9-NEXT: s_and_b32 s0, s2, 15 2672; GFX9-NEXT: s_lshl_b32 s0, s0, 2 2673; GFX9-NEXT: s_addk_i32 s0, 0x4004 2674; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 2675; GFX9-NEXT: s_waitcnt vmcnt(0) 2676; GFX9-NEXT: s_endpgm 2677; 2678; GFX10-LABEL: store_load_sindex_large_offset_foo: 2679; GFX10: ; %bb.0: ; %bb 2680; GFX10-NEXT: s_add_u32 s0, s0, s3 2681; GFX10-NEXT: s_addc_u32 s1, s1, 0 2682; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 2683; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 2684; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 2685; GFX10-NEXT: s_waitcnt vmcnt(0) 2686; GFX10-NEXT: v_mov_b32_e32 v0, 15 2687; GFX10-NEXT: s_and_b32 s0, s2, 15 2688; GFX10-NEXT: s_lshl_b32 s1, s2, 2 2689; GFX10-NEXT: s_lshl_b32 s0, s0, 2 2690; GFX10-NEXT: s_addk_i32 s1, 0x4004 2691; GFX10-NEXT: s_addk_i32 s0, 0x4004 2692; GFX10-NEXT: scratch_store_dword off, v0, s1 2693; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2694; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc 2695; GFX10-NEXT: s_waitcnt vmcnt(0) 2696; GFX10-NEXT: s_endpgm 2697; 2698; GFX11-LABEL: store_load_sindex_large_offset_foo: 2699; GFX11: ; %bb.0: ; %bb 2700; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 2701; GFX11-NEXT: s_waitcnt vmcnt(0) 2702; GFX11-NEXT: v_mov_b32_e32 v0, 15 2703; GFX11-NEXT: s_and_b32 s1, s0, 15 2704; GFX11-NEXT: s_lshl_b32 s0, s0, 2 2705; GFX11-NEXT: s_lshl_b32 s1, s1, 2 
2706; GFX11-NEXT: s_addk_i32 s0, 0x4004 2707; GFX11-NEXT: s_addk_i32 s1, 0x4004 2708; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc 2709; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2710; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc 2711; GFX11-NEXT: s_waitcnt vmcnt(0) 2712; GFX11-NEXT: s_endpgm 2713; 2714; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo: 2715; GFX9-PAL: ; %bb.0: ; %bb 2716; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 2717; GFX9-PAL-NEXT: s_mov_b32 s2, s0 2718; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2719; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 2720; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 2721; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2722; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 2723; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 2724; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 2725; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2726; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 2727; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 2728; GFX9-PAL-NEXT: s_addk_i32 s1, 0x4004 2729; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 2730; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 2731; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 2732; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2733; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004 2734; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 2735; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2736; GFX9-PAL-NEXT: s_endpgm 2737; 2738; GFX940-LABEL: store_load_sindex_large_offset_foo: 2739; GFX940: ; %bb.0: ; %bb 2740; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 2741; GFX940-NEXT: s_waitcnt vmcnt(0) 2742; GFX940-NEXT: s_lshl_b32 s1, s0, 2 2743; GFX940-NEXT: s_and_b32 s0, s0, 15 2744; GFX940-NEXT: s_addk_i32 s1, 0x4004 2745; GFX940-NEXT: v_mov_b32_e32 v0, 15 2746; GFX940-NEXT: s_lshl_b32 s0, s0, 2 2747; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 2748; GFX940-NEXT: s_waitcnt vmcnt(0) 2749; GFX940-NEXT: s_addk_i32 s0, 0x4004 2750; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 2751; GFX940-NEXT: s_waitcnt vmcnt(0) 2752; GFX940-NEXT: 
s_endpgm 2753; 2754; GFX1010-PAL-LABEL: store_load_sindex_large_offset_foo: 2755; GFX1010-PAL: ; %bb.0: ; %bb 2756; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 2757; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 2758; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2759; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 2760; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2761; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 2762; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 2763; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2764; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2765; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 2766; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 2767; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc 2768; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2769; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 2770; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 2771; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 2772; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x4004 2773; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x4004 2774; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 2775; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2776; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 2777; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2778; GFX1010-PAL-NEXT: s_endpgm 2779; 2780; GFX1030-PAL-LABEL: store_load_sindex_large_offset_foo: 2781; GFX1030-PAL: ; %bb.0: ; %bb 2782; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 2783; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 2784; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2785; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 2786; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2787; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 2788; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 2789; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2790; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2791; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 2792; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2793; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 2794; GFX1030-PAL-NEXT: 
s_and_b32 s1, s0, 15 2795; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 2796; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 2797; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x4004 2798; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x4004 2799; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 2800; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2801; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 2802; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2803; GFX1030-PAL-NEXT: s_endpgm 2804; 2805; GFX11-PAL-LABEL: store_load_sindex_large_offset_foo: 2806; GFX11-PAL: ; %bb.0: ; %bb 2807; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc 2808; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2809; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15 2810; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15 2811; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2 2812; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2 2813; GFX11-PAL-NEXT: s_addk_i32 s0, 0x4004 2814; GFX11-PAL-NEXT: s_addk_i32 s1, 0x4004 2815; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc 2816; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2817; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc 2818; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2819; GFX11-PAL-NEXT: s_endpgm 2820bb: 2821 %padding = alloca [4096 x i32], align 4, addrspace(5) 2822 %i = alloca [32 x float], align 4, addrspace(5) 2823 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 2824 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 2825 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 2826 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 2827 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 2828 store volatile i32 15, i32 addrspace(5)* %i8, align 4 2829 %i9 = and i32 %idx, 15 2830 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 2831 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 2832 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 2833 ret 
void 2834} 2835 2836define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { 2837; GFX9-LABEL: store_load_vindex_large_offset_kernel: 2838; GFX9: ; %bb.0: ; %bb 2839; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 2840; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 2841; GFX9-NEXT: s_mov_b32 vcc_hi, 0 2842; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc 2843; GFX9-NEXT: s_waitcnt vmcnt(0) 2844; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2845; GFX9-NEXT: v_add_u32_e32 v1, 0x4004, v0 2846; GFX9-NEXT: v_mov_b32_e32 v2, 15 2847; GFX9-NEXT: scratch_store_dword v1, v2, off 2848; GFX9-NEXT: s_waitcnt vmcnt(0) 2849; GFX9-NEXT: v_sub_u32_e32 v0, 0x4004, v0 2850; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc 2851; GFX9-NEXT: s_waitcnt vmcnt(0) 2852; GFX9-NEXT: s_endpgm 2853; 2854; GFX10-LABEL: store_load_vindex_large_offset_kernel: 2855; GFX10: ; %bb.0: ; %bb 2856; GFX10-NEXT: s_add_u32 s0, s0, s3 2857; GFX10-NEXT: s_addc_u32 s1, s1, 0 2858; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 2859; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 2860; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2861; GFX10-NEXT: v_mov_b32_e32 v2, 15 2862; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc 2863; GFX10-NEXT: s_waitcnt vmcnt(0) 2864; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 2865; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0 2866; GFX10-NEXT: scratch_store_dword v1, v2, off 2867; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2868; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 2869; GFX10-NEXT: s_waitcnt vmcnt(0) 2870; GFX10-NEXT: s_endpgm 2871; 2872; GFX11-LABEL: store_load_vindex_large_offset_kernel: 2873; GFX11: ; %bb.0: ; %bb 2874; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2875; GFX11-NEXT: v_mov_b32_e32 v1, 15 2876; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4004 2877; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc 2878; GFX11-NEXT: s_waitcnt vmcnt(0) 2879; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 2880; GFX11-NEXT: 
scratch_store_b32 v0, v1, vcc_lo dlc 2881; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2882; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc 2883; GFX11-NEXT: s_waitcnt vmcnt(0) 2884; GFX11-NEXT: s_endpgm 2885; 2886; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel: 2887; GFX9-PAL: ; %bb.0: ; %bb 2888; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 2889; GFX9-PAL-NEXT: s_mov_b32 s2, s0 2890; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2891; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 2892; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2893; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 2894; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 2895; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2896; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 2897; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 2898; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc 2899; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2900; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x4004, v0 2901; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off 2902; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2903; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x4004, v0 2904; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc 2905; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2906; GFX9-PAL-NEXT: s_endpgm 2907; 2908; GFX940-LABEL: store_load_vindex_large_offset_kernel: 2909; GFX940: ; %bb.0: ; %bb 2910; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 2911; GFX940-NEXT: s_waitcnt vmcnt(0) 2912; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2913; GFX940-NEXT: v_mov_b32_e32 v1, 15 2914; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4004 2915; GFX940-NEXT: scratch_store_dword v0, v1, vcc_hi sc0 sc1 2916; GFX940-NEXT: s_waitcnt vmcnt(0) 2917; GFX940-NEXT: v_sub_u32_e32 v0, 0x4004, v0 2918; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 2919; GFX940-NEXT: s_waitcnt vmcnt(0) 2920; GFX940-NEXT: s_endpgm 2921; 2922; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel: 2923; GFX1010-PAL: ; %bb.0: ; %bb 2924; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 2925; GFX1010-PAL-NEXT: 
s_mov_b32 s2, s0 2926; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2927; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 2928; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2929; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 2930; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 2931; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2932; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2933; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2934; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15 2935; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 2936; GFX1010-PAL-NEXT: scratch_load_dword v3, off, vcc_lo offset:4 glc dlc 2937; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2938; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 2939; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0 2940; GFX1010-PAL-NEXT: scratch_store_dword v1, v2, off 2941; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2942; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 2943; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2944; GFX1010-PAL-NEXT: s_endpgm 2945; 2946; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel: 2947; GFX1030-PAL: ; %bb.0: ; %bb 2948; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 2949; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 2950; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2951; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 2952; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2953; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 2954; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 2955; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2956; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2957; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2958; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15 2959; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc 2960; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2961; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 2962; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0 2963; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off 2964; GFX1030-PAL-NEXT: 
s_waitcnt_vscnt null, 0x0 2965; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 2966; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2967; GFX1030-PAL-NEXT: s_endpgm 2968; 2969; GFX11-PAL-LABEL: store_load_vindex_large_offset_kernel: 2970; GFX11-PAL: ; %bb.0: ; %bb 2971; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2972; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15 2973; GFX11-PAL-NEXT: s_movk_i32 vcc_lo, 0x4004 2974; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc 2975; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2976; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 2977; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, vcc_lo dlc 2978; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2979; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc 2980; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 2981; GFX11-PAL-NEXT: s_endpgm 2982bb: 2983 %padding = alloca [4096 x i32], align 4, addrspace(5) 2984 %i = alloca [32 x float], align 4, addrspace(5) 2985 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 2986 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 2987 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 2988 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() 2989 %i3 = zext i32 %i2 to i64 2990 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 2991 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 2992 store volatile i32 15, i32 addrspace(5)* %i8, align 4 2993 %i9 = sub nsw i32 31, %i2 2994 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 2995 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 2996 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 2997 ret void 2998} 2999 3000define void @store_load_vindex_large_offset_foo(i32 %idx) { 3001; GFX9-LABEL: store_load_vindex_large_offset_foo: 3002; GFX9: ; %bb.0: ; %bb 3003; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3004; GFX9-NEXT: 
scratch_load_dword v1, off, s32 offset:4 glc 3005; GFX9-NEXT: s_waitcnt vmcnt(0) 3006; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4004 3007; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi 3008; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 3009; GFX9-NEXT: v_mov_b32_e32 v3, 15 3010; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 3011; GFX9-NEXT: scratch_store_dword v2, v3, off 3012; GFX9-NEXT: s_waitcnt vmcnt(0) 3013; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 3014; GFX9-NEXT: scratch_load_dword v0, v0, off glc 3015; GFX9-NEXT: s_waitcnt vmcnt(0) 3016; GFX9-NEXT: s_setpc_b64 s[30:31] 3017; 3018; GFX10-LABEL: store_load_vindex_large_offset_foo: 3019; GFX10: ; %bb.0: ; %bb 3020; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3021; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3022; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 3023; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 3024; GFX10-NEXT: v_mov_b32_e32 v2, 15 3025; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo 3026; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 3027; GFX10-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc 3028; GFX10-NEXT: s_waitcnt vmcnt(0) 3029; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo 3030; GFX10-NEXT: scratch_store_dword v0, v2, off 3031; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3032; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc 3033; GFX10-NEXT: s_waitcnt vmcnt(0) 3034; GFX10-NEXT: s_setpc_b64 s[30:31] 3035; 3036; GFX11-LABEL: store_load_vindex_large_offset_foo: 3037; GFX11: ; %bb.0: ; %bb 3038; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3039; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3040; GFX11-NEXT: v_and_b32_e32 v1, 15, v0 3041; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3042; GFX11-NEXT: v_mov_b32_e32 v2, 15 3043; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4004 3044; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc 3045; GFX11-NEXT: s_waitcnt vmcnt(0) 3046; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 3047; GFX11-NEXT: scratch_store_b32 v0, v2, vcc_lo dlc 3048; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3049; GFX11-NEXT: s_add_i32 
vcc_lo, s32, 0x4004 3050; GFX11-NEXT: scratch_load_b32 v0, v1, vcc_lo glc dlc 3051; GFX11-NEXT: s_waitcnt vmcnt(0) 3052; GFX11-NEXT: s_setpc_b64 s[30:31] 3053; 3054; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo: 3055; GFX9-PAL: ; %bb.0: ; %bb 3056; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3057; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 offset:4 glc 3058; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3059; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4004 3060; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi 3061; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 3062; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 3063; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0 3064; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 3065; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3066; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 3067; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc 3068; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3069; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 3070; 3071; GFX940-LABEL: store_load_vindex_large_offset_foo: 3072; GFX940: ; %bb.0: ; %bb 3073; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3074; GFX940-NEXT: scratch_load_dword v1, off, s32 offset:4 sc0 sc1 3075; GFX940-NEXT: s_waitcnt vmcnt(0) 3076; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 3077; GFX940-NEXT: v_mov_b32_e32 v2, 15 3078; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4004 3079; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 3080; GFX940-NEXT: scratch_store_dword v1, v2, vcc_hi sc0 sc1 3081; GFX940-NEXT: s_waitcnt vmcnt(0) 3082; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3083; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4004 3084; GFX940-NEXT: scratch_load_dword v0, v0, vcc_hi sc0 sc1 3085; GFX940-NEXT: s_waitcnt vmcnt(0) 3086; GFX940-NEXT: s_setpc_b64 s[30:31] 3087; 3088; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo: 3089; GFX10-PAL: ; %bb.0: ; %bb 3090; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3091; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3092; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0 3093; GFX10-PAL-NEXT: s_add_i32 vcc_lo, 
s32, 0x4004
; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo
; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004
; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo
; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_vindex_large_offset_foo:
; GFX11-PAL: ; %bb.0: ; %bb
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: v_and_b32_e32 v1, 15, v0
; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004
; GFX11-PAL-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, vcc_lo dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004
; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, vcc_lo glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
  ; %padding (4096 x i32 = 16 KiB) is allocated ahead of %i, so the accesses
  ; to %i in the expected output above are based at s32 + 0x4004.
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-LABEL: store_load_large_imm_offset_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: v_mov_b32_e32 v0, 13
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: s_movk_i32 s0, 0x3000
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_add_i32 s0, s0, 4
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_large_imm_offset_kernel:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32
hwreg(HW_REG_FLAT_SCR_LO), s0 3178; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 3179; GFX10-NEXT: v_mov_b32_e32 v0, 13 3180; GFX10-NEXT: v_mov_b32_e32 v1, 15 3181; GFX10-NEXT: s_movk_i32 s0, 0x3800 3182; GFX10-NEXT: s_add_i32 s0, s0, 4 3183; GFX10-NEXT: scratch_store_dword off, v0, off offset:4 3184; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3185; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 3186; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3187; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 3188; GFX10-NEXT: s_waitcnt vmcnt(0) 3189; GFX10-NEXT: s_endpgm 3190; 3191; GFX11-LABEL: store_load_large_imm_offset_kernel: 3192; GFX11: ; %bb.0: ; %bb 3193; GFX11-NEXT: v_mov_b32_e32 v0, 13 3194; GFX11-NEXT: v_mov_b32_e32 v1, 0x3000 3195; GFX11-NEXT: v_mov_b32_e32 v2, 15 3196; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc 3197; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3198; GFX11-NEXT: scratch_store_b32 v1, v2, off offset:3716 dlc 3199; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3200; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:3716 glc dlc 3201; GFX11-NEXT: s_waitcnt vmcnt(0) 3202; GFX11-NEXT: s_endpgm 3203; 3204; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel: 3205; GFX9-PAL: ; %bb.0: ; %bb 3206; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 3207; GFX9-PAL-NEXT: s_mov_b32 s2, s0 3208; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 3209; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 3210; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 3211; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 3212; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 3213; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 3214; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 3215; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 3216; GFX9-PAL-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 3217; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3218; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4 3219; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 3220; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 3221; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3222; 
GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc 3223; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3224; GFX9-PAL-NEXT: s_endpgm 3225; 3226; GFX940-LABEL: store_load_large_imm_offset_kernel: 3227; GFX940: ; %bb.0: ; %bb 3228; GFX940-NEXT: v_mov_b32_e32 v0, 13 3229; GFX940-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1 3230; GFX940-NEXT: s_waitcnt vmcnt(0) 3231; GFX940-NEXT: v_mov_b32_e32 v0, 0x3000 3232; GFX940-NEXT: v_mov_b32_e32 v1, 15 3233; GFX940-NEXT: scratch_store_dword v0, v1, off offset:3716 sc0 sc1 3234; GFX940-NEXT: s_waitcnt vmcnt(0) 3235; GFX940-NEXT: scratch_load_dword v0, v0, off offset:3716 sc0 sc1 3236; GFX940-NEXT: s_waitcnt vmcnt(0) 3237; GFX940-NEXT: s_endpgm 3238; 3239; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel: 3240; GFX1010-PAL: ; %bb.0: ; %bb 3241; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 3242; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 3243; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 3244; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 3245; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 3246; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 3247; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 3248; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 3249; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 3250; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 13 3251; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 15 3252; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3800 3253; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 3254; GFX1010-PAL-NEXT: s_add_i32 s0, s0, 4 3255; GFX1010-PAL-NEXT: scratch_store_dword off, v0, vcc_lo offset:4 3256; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3257; GFX1010-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 3258; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3259; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 3260; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 3261; GFX1010-PAL-NEXT: s_endpgm 3262; 3263; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel: 3264; GFX1030-PAL: ; %bb.0: ; %bb 3265; GFX1030-PAL-NEXT: 
s_getpc_b64 s[2:3] 3266; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 3267; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 3268; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 3269; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 3270; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 3271; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 3272; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 3273; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 3274; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 13 3275; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 15 3276; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3800 3277; GFX1030-PAL-NEXT: s_add_i32 s0, s0, 4 3278; GFX1030-PAL-NEXT: scratch_store_dword off, v0, off offset:4 3279; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3280; GFX1030-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 3281; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3282; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 3283; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 3284; GFX1030-PAL-NEXT: s_endpgm 3285; 3286; GFX11-PAL-LABEL: store_load_large_imm_offset_kernel: 3287; GFX11-PAL: ; %bb.0: ; %bb 3288; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 13 3289; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0x3000 3290; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 3291; GFX11-PAL-NEXT: scratch_store_b32 off, v0, off offset:4 dlc 3292; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3293; GFX11-PAL-NEXT: scratch_store_b32 v1, v2, off offset:3716 dlc 3294; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3295; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, off offset:3716 glc dlc 3296; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 3297; GFX11-PAL-NEXT: s_endpgm 3298bb: 3299 %i = alloca [4096 x i32], align 4, addrspace(5) 3300 %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef 3301 store volatile i32 13, i32 addrspace(5)* %i1, align 4 3302 %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 3303 store volatile i32 15, i32 addrspace(5)* %i7, align 4 3304 %i10 = 
getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 3305 %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4 3306 ret void 3307} 3308 3309define void @store_load_large_imm_offset_foo() { 3310; GFX9-LABEL: store_load_large_imm_offset_foo: 3311; GFX9: ; %bb.0: ; %bb 3312; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3313; GFX9-NEXT: v_mov_b32_e32 v0, 13 3314; GFX9-NEXT: s_movk_i32 s0, 0x3000 3315; GFX9-NEXT: s_add_i32 vcc_hi, s32, 4 3316; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4 3317; GFX9-NEXT: s_waitcnt vmcnt(0) 3318; GFX9-NEXT: s_add_i32 s0, s0, vcc_hi 3319; GFX9-NEXT: v_mov_b32_e32 v0, 15 3320; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 3321; GFX9-NEXT: s_waitcnt vmcnt(0) 3322; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc 3323; GFX9-NEXT: s_waitcnt vmcnt(0) 3324; GFX9-NEXT: s_setpc_b64 s[30:31] 3325; 3326; GFX10-LABEL: store_load_large_imm_offset_foo: 3327; GFX10: ; %bb.0: ; %bb 3328; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3329; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3330; GFX10-NEXT: v_mov_b32_e32 v0, 13 3331; GFX10-NEXT: v_mov_b32_e32 v1, 15 3332; GFX10-NEXT: s_movk_i32 s0, 0x3800 3333; GFX10-NEXT: s_add_i32 vcc_lo, s32, 4 3334; GFX10-NEXT: s_add_i32 s0, s0, vcc_lo 3335; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4 3336; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3337; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 3338; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3339; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 3340; GFX10-NEXT: s_waitcnt vmcnt(0) 3341; GFX10-NEXT: s_setpc_b64 s[30:31] 3342; 3343; GFX11-LABEL: store_load_large_imm_offset_foo: 3344; GFX11: ; %bb.0: ; %bb 3345; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3346; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3347; GFX11-NEXT: v_mov_b32_e32 v0, 13 3348; GFX11-NEXT: v_mov_b32_e32 v1, 0x3000 3349; GFX11-NEXT: v_mov_b32_e32 v2, 15 3350; GFX11-NEXT: scratch_store_b32 off, v0, s32 
offset:4 dlc 3351; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3352; GFX11-NEXT: scratch_store_b32 v1, v2, s32 offset:3716 dlc 3353; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3354; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:3716 glc dlc 3355; GFX11-NEXT: s_waitcnt vmcnt(0) 3356; GFX11-NEXT: s_setpc_b64 s[30:31] 3357; 3358; GFX9-PAL-LABEL: store_load_large_imm_offset_foo: 3359; GFX9-PAL: ; %bb.0: ; %bb 3360; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3361; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 3362; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 3363; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 4 3364; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4 3365; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3366; GFX9-PAL-NEXT: s_add_i32 s0, s0, vcc_hi 3367; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 3368; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 3369; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3370; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc 3371; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3372; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 3373; 3374; GFX940-LABEL: store_load_large_imm_offset_foo: 3375; GFX940: ; %bb.0: ; %bb 3376; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3377; GFX940-NEXT: v_mov_b32_e32 v0, 13 3378; GFX940-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1 3379; GFX940-NEXT: s_waitcnt vmcnt(0) 3380; GFX940-NEXT: v_mov_b32_e32 v0, 0x3000 3381; GFX940-NEXT: v_mov_b32_e32 v1, 15 3382; GFX940-NEXT: scratch_store_dword v0, v1, s32 offset:3716 sc0 sc1 3383; GFX940-NEXT: s_waitcnt vmcnt(0) 3384; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:3716 sc0 sc1 3385; GFX940-NEXT: s_waitcnt vmcnt(0) 3386; GFX940-NEXT: s_setpc_b64 s[30:31] 3387; 3388; GFX10-PAL-LABEL: store_load_large_imm_offset_foo: 3389; GFX10-PAL: ; %bb.0: ; %bb 3390; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3391; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3392; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13 3393; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 3394; GFX10-PAL-NEXT: s_movk_i32 s0, 
0x3800
; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 4
; GFX10-PAL-NEXT: s_add_i32 s0, s0, vcc_lo
; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_large_imm_offset_foo:
; GFX11-PAL: ; %bb.0: ; %bb
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 13
; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0x3000
; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_store_b32 v1, v2, s32 offset:3716 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:3716 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
  ; %i is a 4096 x i32 (16 KiB) stack array; the accesses to element 4000
  ; further down land 16000 bytes past its start.
  %i = alloca [4096 x i32], align 4, addrspace(5)
  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
  store volatile i32 13, i32 addrspace(5)* %i1,
align 4 3436 %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 3437 store volatile i32 15, i32 addrspace(5)* %i7, align 4 3438 %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 3439 %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4 3440 ret void 3441} 3442 3443define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { 3444; GFX9-LABEL: store_load_vidx_sidx_offset: 3445; GFX9: ; %bb.0: ; %bb 3446; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 3447; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 3448; GFX9-NEXT: v_mov_b32_e32 v1, 4 3449; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 3450; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3451; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 3452; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 3453; GFX9-NEXT: v_mov_b32_e32 v1, 15 3454; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024 3455; GFX9-NEXT: s_waitcnt vmcnt(0) 3456; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 glc 3457; GFX9-NEXT: s_waitcnt vmcnt(0) 3458; GFX9-NEXT: s_endpgm 3459; 3460; GFX10-LABEL: store_load_vidx_sidx_offset: 3461; GFX10: ; %bb.0: ; %bb 3462; GFX10-NEXT: s_add_u32 s2, s2, s5 3463; GFX10-NEXT: s_addc_u32 s3, s3, 0 3464; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 3465; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 3466; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 3467; GFX10-NEXT: v_mov_b32_e32 v1, 15 3468; GFX10-NEXT: s_waitcnt lgkmcnt(0) 3469; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 3470; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, 4 3471; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024 3472; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3473; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc 3474; GFX10-NEXT: s_waitcnt vmcnt(0) 3475; GFX10-NEXT: s_endpgm 3476; 3477; GFX11-LABEL: store_load_vidx_sidx_offset: 3478; GFX11: ; %bb.0: ; %bb 3479; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 3480; GFX11-NEXT: v_mov_b32_e32 v1, 15 3481; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) 3482; GFX11-NEXT: v_add_lshl_u32 v0, s0, v0, 2 3483; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:1028 dlc 3484; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3485; GFX11-NEXT: scratch_load_b32 v0, v0, off offset:1028 glc dlc 3486; GFX11-NEXT: s_waitcnt vmcnt(0) 3487; GFX11-NEXT: s_endpgm 3488; 3489; GFX9-PAL-LABEL: store_load_vidx_sidx_offset: 3490; GFX9-PAL: ; %bb.0: ; %bb 3491; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 3492; GFX9-PAL-NEXT: s_mov_b32 s4, s0 3493; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 3494; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4 3495; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 3496; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 3497; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 3498; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 3499; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0 3500; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 3501; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 3502; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 3503; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 3504; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3505; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc 3506; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3507; GFX9-PAL-NEXT: s_endpgm 3508; 3509; GFX940-LABEL: store_load_vidx_sidx_offset: 3510; GFX940: ; %bb.0: ; %bb 3511; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 3512; GFX940-NEXT: v_mov_b32_e32 v1, 15 3513; GFX940-NEXT: s_waitcnt lgkmcnt(0) 3514; GFX940-NEXT: v_add_lshl_u32 v0, s0, v0, 2 3515; GFX940-NEXT: scratch_store_dword v0, v1, off offset:1028 sc0 sc1 3516; GFX940-NEXT: s_waitcnt vmcnt(0) 3517; GFX940-NEXT: scratch_load_dword v0, v0, off offset:1028 sc0 sc1 3518; GFX940-NEXT: s_waitcnt vmcnt(0) 3519; GFX940-NEXT: s_endpgm 3520; 3521; GFX10-PAL-LABEL: store_load_vidx_sidx_offset: 3522; GFX10-PAL: ; %bb.0: ; %bb 3523; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] 3524; GFX10-PAL-NEXT: s_mov_b32 s4, s0 3525; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 3526; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 3527; GFX10-PAL-NEXT: s_and_b32 
s5, s5, 0xffff
; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3
; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0
; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 4
; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT: s_endpgm
;
; GFX11-PAL-LABEL: store_load_vidx_sidx_offset:
; GFX11-PAL: ; %bb.0: ; %bb
; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-PAL-NEXT: v_add_lshl_u32 v0, s0, v0, 2
; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:1028 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b32 v0, v0, off offset:1028 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_endpgm
bb:
  ; The index is %sidx + workitem-id-x + 256; the constant 256 elements
  ; (1024 bytes) are folded into the immediate offset (1024 or 1028) in the
  ; expected output above.
  %alloca = alloca [32 x i32], align 4, addrspace(5)
  %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
  %add1 = add nsw i32 %sidx, %vidx
  %add2 = add nsw i32 %add1, 256
  %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)*
%alloca, i32 0, i32 %add2 3571 store volatile i32 15, i32 addrspace(5)* %gep, align 4 3572 %load = load volatile i32, i32 addrspace(5)* %gep, align 4 3573 ret void 3574} 3575 3576define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) { 3577; GFX9-LABEL: store_load_i64_aligned: 3578; GFX9: ; %bb.0: ; %bb 3579; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3580; GFX9-NEXT: v_mov_b32_e32 v1, 15 3581; GFX9-NEXT: v_mov_b32_e32 v2, 0 3582; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off 3583; GFX9-NEXT: s_waitcnt vmcnt(0) 3584; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc 3585; GFX9-NEXT: s_waitcnt vmcnt(0) 3586; GFX9-NEXT: s_setpc_b64 s[30:31] 3587; 3588; GFX10-LABEL: store_load_i64_aligned: 3589; GFX10: ; %bb.0: ; %bb 3590; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3591; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3592; GFX10-NEXT: v_mov_b32_e32 v1, 15 3593; GFX10-NEXT: v_mov_b32_e32 v2, 0 3594; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off 3595; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3596; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc 3597; GFX10-NEXT: s_waitcnt vmcnt(0) 3598; GFX10-NEXT: s_setpc_b64 s[30:31] 3599; 3600; GFX11-LABEL: store_load_i64_aligned: 3601; GFX11: ; %bb.0: ; %bb 3602; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3603; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3604; GFX11-NEXT: v_mov_b32_e32 v1, 15 3605; GFX11-NEXT: v_mov_b32_e32 v2, 0 3606; GFX11-NEXT: scratch_store_b64 v0, v[1:2], off dlc 3607; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3608; GFX11-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc 3609; GFX11-NEXT: s_waitcnt vmcnt(0) 3610; GFX11-NEXT: s_setpc_b64 s[30:31] 3611; 3612; GFX9-PAL-LABEL: store_load_i64_aligned: 3613; GFX9-PAL: ; %bb.0: ; %bb 3614; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3615; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 3616; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 0 3617; GFX9-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off 3618; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3619; 
GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_i64_aligned:
; GFX940: ; %bb.0: ; %bb
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v2, 15
; GFX940-NEXT: v_mov_b32_e32 v3, 0
; GFX940-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_i64_aligned:
; GFX10-PAL: ; %bb.0: ; %bb
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_i64_aligned:
; GFX11-PAL: ; %bb.0: ; %bb
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-PAL-NEXT: scratch_store_b64 v0, v[1:2], off dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
  ; Volatile 8-byte-aligned i64 store followed by a volatile reload of the
  ; same address.
  store volatile i64 15, i64 addrspace(5)* %arg, align 8
  %load = load volatile i64, i64 addrspace(5)* %arg, align 8
  ret void
}

define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_unaligned:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 15
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_unaligned:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v1, 15
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_i64_unaligned:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v1, 15
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: scratch_store_b64 v0, v[1:2], off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i64_unaligned:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 0
; GFX9-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_i64_unaligned:
; GFX940: ; %bb.0: ; %bb
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v2, 15
; GFX940-NEXT: v_mov_b32_e32 v3, 0
; GFX940-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_i64_unaligned:
; GFX10-PAL: ; %bb.0: ; %bb
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-PAL-LABEL: store_load_i64_unaligned:
; GFX11-PAL: ; %bb.0: ; %bb
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15
; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-PAL-NEXT: scratch_store_b64 v0, v[1:2], off dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-PAL-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
  ; Same store/reload as store_load_i64_aligned, but with align 1; the
  ; expected output above still uses a single 64-bit scratch access each way.
  store volatile i64 15, i64 addrspace(5)* %arg, align 1
  %load = load volatile i64, i64 addrspace(5)* %arg, align 1
  ret void
}

define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v3i32_unaligned:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 1
; GFX9-NEXT: v_mov_b32_e32 v2, 2
; GFX9-NEXT: v_mov_b32_e32 v3, 3
; GFX9-NEXT: scratch_store_dwordx3 v0, v[1:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_v3i32_unaligned:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v1, 1
; GFX10-NEXT: v_mov_b32_e32 v2, 2
; GFX10-NEXT: v_mov_b32_e32 v3, 3
; GFX10-NEXT: scratch_store_dwordx3 v0, v[1:3], off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_v3i32_unaligned:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_mov_b32_e32 v1, 1
; GFX11-NEXT: v_mov_b32_e32 v2, 2
; GFX11-NEXT: v_mov_b32_e32 v3, 3
; GFX11-NEXT: scratch_store_b96 v0, v[1:3], off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_load_b96 v[0:2], v0, off glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
3809; GFX9-PAL-LABEL: store_load_v3i32_unaligned: 3810; GFX9-PAL: ; %bb.0: ; %bb 3811; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3812; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 3813; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 2 3814; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 3 3815; GFX9-PAL-NEXT: scratch_store_dwordx3 v0, v[1:3], off 3816; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3817; GFX9-PAL-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc 3818; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3819; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 3820; 3821; GFX940-LABEL: store_load_v3i32_unaligned: 3822; GFX940: ; %bb.0: ; %bb 3823; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3824; GFX940-NEXT: v_mov_b32_e32 v2, 1 3825; GFX940-NEXT: v_mov_b32_e32 v3, 2 3826; GFX940-NEXT: v_mov_b32_e32 v4, 3 3827; GFX940-NEXT: scratch_store_dwordx3 v0, v[2:4], off sc0 sc1 3828; GFX940-NEXT: s_waitcnt vmcnt(0) 3829; GFX940-NEXT: scratch_load_dwordx3 v[0:2], v0, off sc0 sc1 3830; GFX940-NEXT: s_waitcnt vmcnt(0) 3831; GFX940-NEXT: s_setpc_b64 s[30:31] 3832; 3833; GFX10-PAL-LABEL: store_load_v3i32_unaligned: 3834; GFX10-PAL: ; %bb.0: ; %bb 3835; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3836; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3837; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 1 3838; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 2 3839; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 3 3840; GFX10-PAL-NEXT: scratch_store_dwordx3 v0, v[1:3], off 3841; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3842; GFX10-PAL-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc dlc 3843; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 3844; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 3845; 3846; GFX11-PAL-LABEL: store_load_v3i32_unaligned: 3847; GFX11-PAL: ; %bb.0: ; %bb 3848; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3849; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3850; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 1 3851; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 2 3852; GFX11-PAL-NEXT: v_mov_b32_e32 v3, 3 3853; GFX11-PAL-NEXT: scratch_store_b96 v0, v[1:3], off dlc 3854; 
GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3855; GFX11-PAL-NEXT: scratch_load_b96 v[0:2], v0, off glc dlc 3856; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 3857; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] 3858; GCN-LABEL: store_load_v3i32_unaligned: 3859; GCN: ; %bb.0: ; %bb 3860; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3861; GCN-NEXT: v_mov_b32_e32 v2, 1 3862; GCN-NEXT: v_mov_b32_e32 v3, 2 3863; GCN-NEXT: v_mov_b32_e32 v4, 3 3864; GCN-NEXT: scratch_store_dwordx3 v0, v[2:4], off sc0 sc1 3865; GCN-NEXT: s_waitcnt vmcnt(0) 3866; GCN-NEXT: scratch_load_dwordx3 v[0:2], v0, off sc0 sc1 3867; GCN-NEXT: s_waitcnt vmcnt(0) 3868; GCN-NEXT: s_setpc_b64 s[30:31] 3869bb: 3870 store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1 3871 %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1 3872 ret void 3873} 3874 3875define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) { 3876; GFX9-LABEL: store_load_v4i32_unaligned: 3877; GFX9: ; %bb.0: ; %bb 3878; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3879; GFX9-NEXT: v_mov_b32_e32 v1, 1 3880; GFX9-NEXT: v_mov_b32_e32 v2, 2 3881; GFX9-NEXT: v_mov_b32_e32 v3, 3 3882; GFX9-NEXT: v_mov_b32_e32 v4, 4 3883; GFX9-NEXT: scratch_store_dwordx4 v0, v[1:4], off 3884; GFX9-NEXT: s_waitcnt vmcnt(0) 3885; GFX9-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc 3886; GFX9-NEXT: s_waitcnt vmcnt(0) 3887; GFX9-NEXT: s_setpc_b64 s[30:31] 3888; 3889; GFX10-LABEL: store_load_v4i32_unaligned: 3890; GFX10: ; %bb.0: ; %bb 3891; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3892; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3893; GFX10-NEXT: v_mov_b32_e32 v1, 1 3894; GFX10-NEXT: v_mov_b32_e32 v2, 2 3895; GFX10-NEXT: v_mov_b32_e32 v3, 3 3896; GFX10-NEXT: v_mov_b32_e32 v4, 4 3897; GFX10-NEXT: scratch_store_dwordx4 v0, v[1:4], off 3898; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3899; GFX10-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc dlc 3900; GFX10-NEXT: s_waitcnt vmcnt(0) 3901; GFX10-NEXT: 
s_setpc_b64 s[30:31] 3902; 3903; GFX11-LABEL: store_load_v4i32_unaligned: 3904; GFX11: ; %bb.0: ; %bb 3905; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3906; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3907; GFX11-NEXT: v_mov_b32_e32 v1, 1 3908; GFX11-NEXT: v_mov_b32_e32 v2, 2 3909; GFX11-NEXT: v_mov_b32_e32 v3, 3 3910; GFX11-NEXT: v_mov_b32_e32 v4, 4 3911; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off dlc 3912; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3913; GFX11-NEXT: scratch_load_b128 v[0:3], v0, off glc dlc 3914; GFX11-NEXT: s_waitcnt vmcnt(0) 3915; GFX11-NEXT: s_setpc_b64 s[30:31] 3916; 3917; GFX9-PAL-LABEL: store_load_v4i32_unaligned: 3918; GFX9-PAL: ; %bb.0: ; %bb 3919; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3920; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 3921; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 2 3922; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 3 3923; GFX9-PAL-NEXT: v_mov_b32_e32 v4, 4 3924; GFX9-PAL-NEXT: scratch_store_dwordx4 v0, v[1:4], off 3925; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3926; GFX9-PAL-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc 3927; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3928; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 3929; 3930; GFX940-LABEL: store_load_v4i32_unaligned: 3931; GFX940: ; %bb.0: ; %bb 3932; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3933; GFX940-NEXT: v_mov_b32_e32 v2, 1 3934; GFX940-NEXT: v_mov_b32_e32 v3, 2 3935; GFX940-NEXT: v_mov_b32_e32 v4, 3 3936; GFX940-NEXT: v_mov_b32_e32 v5, 4 3937; GFX940-NEXT: scratch_store_dwordx4 v0, v[2:5], off sc0 sc1 3938; GFX940-NEXT: s_waitcnt vmcnt(0) 3939; GFX940-NEXT: scratch_load_dwordx4 v[0:3], v0, off sc0 sc1 3940; GFX940-NEXT: s_waitcnt vmcnt(0) 3941; GFX940-NEXT: s_setpc_b64 s[30:31] 3942; 3943; GFX10-PAL-LABEL: store_load_v4i32_unaligned: 3944; GFX10-PAL: ; %bb.0: ; %bb 3945; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3946; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3947; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 1 3948; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 2 3949; GFX10-PAL-NEXT: 
v_mov_b32_e32 v3, 3 3950; GFX10-PAL-NEXT: v_mov_b32_e32 v4, 4 3951; GFX10-PAL-NEXT: scratch_store_dwordx4 v0, v[1:4], off 3952; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3953; GFX10-PAL-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc dlc 3954; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 3955; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 3956; 3957; GFX11-PAL-LABEL: store_load_v4i32_unaligned: 3958; GFX11-PAL: ; %bb.0: ; %bb 3959; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3960; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3961; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 1 3962; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 2 3963; GFX11-PAL-NEXT: v_mov_b32_e32 v3, 3 3964; GFX11-PAL-NEXT: v_mov_b32_e32 v4, 4 3965; GFX11-PAL-NEXT: scratch_store_b128 v0, v[1:4], off dlc 3966; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3967; GFX11-PAL-NEXT: scratch_load_b128 v[0:3], v0, off glc dlc 3968; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 3969; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] 3970; GCN-LABEL: store_load_v4i32_unaligned: 3971; GCN: ; %bb.0: ; %bb 3972; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3973; GCN-NEXT: v_mov_b32_e32 v2, 1 3974; GCN-NEXT: v_mov_b32_e32 v3, 2 3975; GCN-NEXT: v_mov_b32_e32 v4, 3 3976; GCN-NEXT: v_mov_b32_e32 v5, 4 3977; GCN-NEXT: scratch_store_dwordx4 v0, v[2:5], off sc0 sc1 3978; GCN-NEXT: s_waitcnt vmcnt(0) 3979; GCN-NEXT: scratch_load_dwordx4 v[0:3], v0, off sc0 sc1 3980; GCN-NEXT: s_waitcnt vmcnt(0) 3981; GCN-NEXT: s_setpc_b64 s[30:31] 3982bb: 3983 store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1 3984 %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1 3985 ret void 3986} 3987 3988define void @store_load_i32_negative_unaligned(i8 addrspace(5)* nocapture %arg) { 3989; GFX9-LABEL: store_load_i32_negative_unaligned: 3990; GFX9: ; %bb.0: ; %bb 3991; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3992; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 3993; GFX9-NEXT: v_mov_b32_e32 v1, 1 3994; GFX9-NEXT: scratch_store_byte v0, v1, 
off 3995; GFX9-NEXT: s_waitcnt vmcnt(0) 3996; GFX9-NEXT: scratch_load_ubyte v0, v0, off glc 3997; GFX9-NEXT: s_waitcnt vmcnt(0) 3998; GFX9-NEXT: s_setpc_b64 s[30:31] 3999; 4000; GFX10-LABEL: store_load_i32_negative_unaligned: 4001; GFX10: ; %bb.0: ; %bb 4002; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4003; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4004; GFX10-NEXT: v_mov_b32_e32 v1, 1 4005; GFX10-NEXT: scratch_store_byte v0, v1, off offset:-1 4006; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4007; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:-1 glc dlc 4008; GFX10-NEXT: s_waitcnt vmcnt(0) 4009; GFX10-NEXT: s_setpc_b64 s[30:31] 4010; 4011; GFX11-LABEL: store_load_i32_negative_unaligned: 4012; GFX11: ; %bb.0: ; %bb 4013; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4014; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4015; GFX11-NEXT: v_mov_b32_e32 v1, 1 4016; GFX11-NEXT: scratch_store_b8 v0, v1, off offset:-1 dlc 4017; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4018; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:-1 glc dlc 4019; GFX11-NEXT: s_waitcnt vmcnt(0) 4020; GFX11-NEXT: s_setpc_b64 s[30:31] 4021; 4022; GFX9-PAL-LABEL: store_load_i32_negative_unaligned: 4023; GFX9-PAL: ; %bb.0: ; %bb 4024; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4025; GFX9-PAL-NEXT: v_add_u32_e32 v0, -1, v0 4026; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 4027; GFX9-PAL-NEXT: scratch_store_byte v0, v1, off 4028; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4029; GFX9-PAL-NEXT: scratch_load_ubyte v0, v0, off glc 4030; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4031; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 4032; 4033; GFX940-LABEL: store_load_i32_negative_unaligned: 4034; GFX940: ; %bb.0: ; %bb 4035; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4036; GFX940-NEXT: v_add_u32_e32 v0, -1, v0 4037; GFX940-NEXT: v_mov_b32_e32 v1, 1 4038; GFX940-NEXT: scratch_store_byte v0, v1, off sc0 sc1 4039; GFX940-NEXT: s_waitcnt vmcnt(0) 4040; GFX940-NEXT: scratch_load_ubyte v0, v0, off sc0 sc1 4041; GFX940-NEXT: s_waitcnt 
vmcnt(0) 4042; GFX940-NEXT: s_setpc_b64 s[30:31] 4043; 4044; GFX1010-PAL-LABEL: store_load_i32_negative_unaligned: 4045; GFX1010-PAL: ; %bb.0: ; %bb 4046; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4047; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4048; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v0, -1, v0 4049; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 1 4050; GFX1010-PAL-NEXT: scratch_store_byte v0, v1, off 4051; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4052; GFX1010-PAL-NEXT: scratch_load_ubyte v0, v0, off glc dlc 4053; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 4054; GFX1010-PAL-NEXT: s_setpc_b64 s[30:31] 4055; 4056; GFX1030-PAL-LABEL: store_load_i32_negative_unaligned: 4057; GFX1030-PAL: ; %bb.0: ; %bb 4058; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4059; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4060; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 1 4061; GFX1030-PAL-NEXT: scratch_store_byte v0, v1, off offset:-1 4062; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4063; GFX1030-PAL-NEXT: scratch_load_ubyte v0, v0, off offset:-1 glc dlc 4064; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 4065; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31] 4066; 4067; GFX11-PAL-LABEL: store_load_i32_negative_unaligned: 4068; GFX11-PAL: ; %bb.0: ; %bb 4069; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4070; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4071; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 1 4072; GFX11-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-1 dlc 4073; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4074; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-1 glc dlc 4075; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 4076; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] 4077bb: 4078 %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -1 4079 store volatile i8 1, i8 addrspace(5)* %ptr, align 1 4080 %load = load volatile i8, i8 addrspace(5)* %ptr, align 1 4081 ret void 4082} 4083 4084define void @store_load_i32_large_negative_unaligned(i8 addrspace(5)* nocapture %arg) { 4085; GFX9-LABEL: 
store_load_i32_large_negative_unaligned: 4086; GFX9: ; %bb.0: ; %bb 4087; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4088; GFX9-NEXT: v_add_u32_e32 v0, 0xffffef7f, v0 4089; GFX9-NEXT: v_mov_b32_e32 v1, 1 4090; GFX9-NEXT: scratch_store_byte v0, v1, off 4091; GFX9-NEXT: s_waitcnt vmcnt(0) 4092; GFX9-NEXT: scratch_load_ubyte v0, v0, off glc 4093; GFX9-NEXT: s_waitcnt vmcnt(0) 4094; GFX9-NEXT: s_setpc_b64 s[30:31] 4095; 4096; GFX10-LABEL: store_load_i32_large_negative_unaligned: 4097; GFX10: ; %bb.0: ; %bb 4098; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4099; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4100; GFX10-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 4101; GFX10-NEXT: v_mov_b32_e32 v1, 1 4102; GFX10-NEXT: scratch_store_byte v0, v1, off offset:-129 4103; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4104; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:-129 glc dlc 4105; GFX10-NEXT: s_waitcnt vmcnt(0) 4106; GFX10-NEXT: s_setpc_b64 s[30:31] 4107; 4108; GFX11-LABEL: store_load_i32_large_negative_unaligned: 4109; GFX11: ; %bb.0: ; %bb 4110; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4111; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4112; GFX11-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 4113; GFX11-NEXT: v_mov_b32_e32 v1, 1 4114; GFX11-NEXT: scratch_store_b8 v0, v1, off offset:-129 dlc 4115; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4116; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:-129 glc dlc 4117; GFX11-NEXT: s_waitcnt vmcnt(0) 4118; GFX11-NEXT: s_setpc_b64 s[30:31] 4119; 4120; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned: 4121; GFX9-PAL: ; %bb.0: ; %bb 4122; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4123; GFX9-PAL-NEXT: v_add_u32_e32 v0, 0xffffef7f, v0 4124; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 4125; GFX9-PAL-NEXT: scratch_store_byte v0, v1, off 4126; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4127; GFX9-PAL-NEXT: scratch_load_ubyte v0, v0, off glc 4128; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4129; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 4130; 4131; 
GFX940-LABEL: store_load_i32_large_negative_unaligned: 4132; GFX940: ; %bb.0: ; %bb 4133; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4134; GFX940-NEXT: s_movk_i32 s0, 0xef7f 4135; GFX940-NEXT: v_mov_b32_e32 v1, 1 4136; GFX940-NEXT: scratch_store_byte v0, v1, s0 sc0 sc1 4137; GFX940-NEXT: s_waitcnt vmcnt(0) 4138; GFX940-NEXT: scratch_load_ubyte v0, v0, s0 sc0 sc1 4139; GFX940-NEXT: s_waitcnt vmcnt(0) 4140; GFX940-NEXT: s_setpc_b64 s[30:31] 4141; 4142; GFX1010-PAL-LABEL: store_load_i32_large_negative_unaligned: 4143; GFX1010-PAL: ; %bb.0: ; %bb 4144; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4145; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4146; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v0, 0xffffefff, v0 4147; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 1 4148; GFX1010-PAL-NEXT: scratch_store_byte v0, v1, off offset:-128 4149; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4150; GFX1010-PAL-NEXT: scratch_load_ubyte v0, v0, off offset:-128 glc dlc 4151; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 4152; GFX1010-PAL-NEXT: s_setpc_b64 s[30:31] 4153; 4154; GFX1030-PAL-LABEL: store_load_i32_large_negative_unaligned: 4155; GFX1030-PAL: ; %bb.0: ; %bb 4156; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4157; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4158; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 4159; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 1 4160; GFX1030-PAL-NEXT: scratch_store_byte v0, v1, off offset:-129 4161; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4162; GFX1030-PAL-NEXT: scratch_load_ubyte v0, v0, off offset:-129 glc dlc 4163; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 4164; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31] 4165; 4166; GFX11-PAL-LABEL: store_load_i32_large_negative_unaligned: 4167; GFX11-PAL: ; %bb.0: ; %bb 4168; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4169; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4170; GFX11-PAL-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 4171; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 1 4172; GFX11-PAL-NEXT: 
scratch_store_b8 v0, v1, off offset:-129 dlc 4173; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4174; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-129 glc dlc 4175; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 4176; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] 4177bb: 4178 %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -4225 4179 store volatile i8 1, i8 addrspace(5)* %ptr, align 1 4180 %load = load volatile i8, i8 addrspace(5)* %ptr, align 1 4181 ret void 4182} 4183 4184define amdgpu_ps void @large_offset() { 4185; GFX9-LABEL: large_offset: 4186; GFX9: ; %bb.0: ; %bb 4187; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s2 4188; GFX9-NEXT: v_mov_b32_e32 v0, 0 4189; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 4190; GFX9-NEXT: v_mov_b32_e32 v1, v0 4191; GFX9-NEXT: v_mov_b32_e32 v2, v0 4192; GFX9-NEXT: v_mov_b32_e32 v3, v0 4193; GFX9-NEXT: s_mov_b32 vcc_hi, 0 4194; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:3024 4195; GFX9-NEXT: s_waitcnt vmcnt(0) 4196; GFX9-NEXT: s_mov_b32 vcc_hi, 0 4197; GFX9-NEXT: scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc 4198; GFX9-NEXT: s_waitcnt vmcnt(0) 4199; GFX9-NEXT: v_mov_b32_e32 v0, 16 4200; GFX9-NEXT: ;;#ASMSTART 4201; GFX9-NEXT: ; use v0 4202; GFX9-NEXT: ;;#ASMEND 4203; GFX9-NEXT: v_mov_b32_e32 v0, 0x810 4204; GFX9-NEXT: ;;#ASMSTART 4205; GFX9-NEXT: ; use v0 4206; GFX9-NEXT: ;;#ASMEND 4207; GFX9-NEXT: s_endpgm 4208; 4209; GFX10-LABEL: large_offset: 4210; GFX10: ; %bb.0: ; %bb 4211; GFX10-NEXT: s_add_u32 s0, s0, s2 4212; GFX10-NEXT: s_addc_u32 s1, s1, 0 4213; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 4214; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 4215; GFX10-NEXT: v_mov_b32_e32 v0, 0 4216; GFX10-NEXT: s_movk_i32 s0, 0x810 4217; GFX10-NEXT: s_addk_i32 s0, 0x3c0 4218; GFX10-NEXT: v_mov_b32_e32 v1, v0 4219; GFX10-NEXT: v_mov_b32_e32 v2, v0 4220; GFX10-NEXT: v_mov_b32_e32 v3, v0 4221; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0 4222; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4223; 
GFX10-NEXT: scratch_load_dwordx4 v[0:3], off, s0 glc dlc 4224; GFX10-NEXT: s_waitcnt vmcnt(0) 4225; GFX10-NEXT: v_mov_b32_e32 v0, 16 4226; GFX10-NEXT: v_mov_b32_e32 v1, 0x810 4227; GFX10-NEXT: ;;#ASMSTART 4228; GFX10-NEXT: ; use v0 4229; GFX10-NEXT: ;;#ASMEND 4230; GFX10-NEXT: ;;#ASMSTART 4231; GFX10-NEXT: ; use v1 4232; GFX10-NEXT: ;;#ASMEND 4233; GFX10-NEXT: s_endpgm 4234; 4235; GFX11-LABEL: large_offset: 4236; GFX11: ; %bb.0: ; %bb 4237; GFX11-NEXT: v_mov_b32_e32 v0, 0 4238; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 4239; GFX11-NEXT: v_mov_b32_e32 v1, v0 4240; GFX11-NEXT: v_mov_b32_e32 v2, v0 4241; GFX11-NEXT: v_mov_b32_e32 v3, v0 4242; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:3024 dlc 4243; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4244; GFX11-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 glc dlc 4245; GFX11-NEXT: s_waitcnt vmcnt(0) 4246; GFX11-NEXT: v_mov_b32_e32 v0, 16 4247; GFX11-NEXT: v_mov_b32_e32 v1, 0x810 4248; GFX11-NEXT: ;;#ASMSTART 4249; GFX11-NEXT: ; use v0 4250; GFX11-NEXT: ;;#ASMEND 4251; GFX11-NEXT: ;;#ASMSTART 4252; GFX11-NEXT: ; use v1 4253; GFX11-NEXT: ;;#ASMEND 4254; GFX11-NEXT: s_endpgm 4255; 4256; GFX9-PAL-LABEL: large_offset: 4257; GFX9-PAL: ; %bb.0: ; %bb 4258; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 4259; GFX9-PAL-NEXT: s_mov_b32 s2, s0 4260; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 4261; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0 4262; GFX9-PAL-NEXT: v_mov_b32_e32 v1, v0 4263; GFX9-PAL-NEXT: v_mov_b32_e32 v2, v0 4264; GFX9-PAL-NEXT: v_mov_b32_e32 v3, v0 4265; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 4266; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 4267; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0 4268; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 4269; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 4270; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:3024 4271; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4272; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 4273; GFX9-PAL-NEXT: scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc 4274; 
GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 4275; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 16 4276; GFX9-PAL-NEXT: ;;#ASMSTART 4277; GFX9-PAL-NEXT: ; use v0 4278; GFX9-PAL-NEXT: ;;#ASMEND 4279; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x810 4280; GFX9-PAL-NEXT: ;;#ASMSTART 4281; GFX9-PAL-NEXT: ; use v0 4282; GFX9-PAL-NEXT: ;;#ASMEND 4283; GFX9-PAL-NEXT: s_endpgm 4284; 4285; GFX940-LABEL: large_offset: 4286; GFX940: ; %bb.0: ; %bb 4287; GFX940-NEXT: v_mov_b32_e32 v0, 0 4288; GFX940-NEXT: v_mov_b32_e32 v1, v0 4289; GFX940-NEXT: v_mov_b32_e32 v2, v0 4290; GFX940-NEXT: v_mov_b32_e32 v3, v0 4291; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:3024 sc0 sc1 4292; GFX940-NEXT: s_waitcnt vmcnt(0) 4293; GFX940-NEXT: scratch_load_dwordx4 v[0:3], off, off offset:3024 sc0 sc1 4294; GFX940-NEXT: s_waitcnt vmcnt(0) 4295; GFX940-NEXT: v_mov_b32_e32 v0, 16 4296; GFX940-NEXT: ;;#ASMSTART 4297; GFX940-NEXT: ; use v0 4298; GFX940-NEXT: ;;#ASMEND 4299; GFX940-NEXT: v_mov_b32_e32 v0, 0x810 4300; GFX940-NEXT: ;;#ASMSTART 4301; GFX940-NEXT: ; use v0 4302; GFX940-NEXT: ;;#ASMEND 4303; GFX940-NEXT: s_endpgm 4304; 4305; GFX10-PAL-LABEL: large_offset: 4306; GFX10-PAL: ; %bb.0: ; %bb 4307; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 4308; GFX10-PAL-NEXT: s_mov_b32 s2, s0 4309; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 4310; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 4311; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 4312; GFX10-PAL-NEXT: s_add_u32 s2, s2, s0 4313; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 4314; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 4315; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 4316; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 0 4317; GFX10-PAL-NEXT: s_movk_i32 s0, 0x810 4318; GFX10-PAL-NEXT: s_addk_i32 s0, 0x3c0 4319; GFX10-PAL-NEXT: v_mov_b32_e32 v1, v0 4320; GFX10-PAL-NEXT: v_mov_b32_e32 v2, v0 4321; GFX10-PAL-NEXT: v_mov_b32_e32 v3, v0 4322; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 4323; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4324; GFX10-PAL-NEXT: 
scratch_load_dwordx4 v[0:3], off, s0 glc dlc 4325; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 4326; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 16 4327; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x810 4328; GFX10-PAL-NEXT: ;;#ASMSTART 4329; GFX10-PAL-NEXT: ; use v0 4330; GFX10-PAL-NEXT: ;;#ASMEND 4331; GFX10-PAL-NEXT: ;;#ASMSTART 4332; GFX10-PAL-NEXT: ; use v1 4333; GFX10-PAL-NEXT: ;;#ASMEND 4334; GFX10-PAL-NEXT: s_endpgm 4335; 4336; GFX11-PAL-LABEL: large_offset: 4337; GFX11-PAL: ; %bb.0: ; %bb 4338; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 0 4339; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) 4340; GFX11-PAL-NEXT: v_mov_b32_e32 v1, v0 4341; GFX11-PAL-NEXT: v_mov_b32_e32 v2, v0 4342; GFX11-PAL-NEXT: v_mov_b32_e32 v3, v0 4343; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:3024 dlc 4344; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 4345; GFX11-PAL-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 glc dlc 4346; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 4347; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 16 4348; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0x810 4349; GFX11-PAL-NEXT: ;;#ASMSTART 4350; GFX11-PAL-NEXT: ; use v0 4351; GFX11-PAL-NEXT: ;;#ASMEND 4352; GFX11-PAL-NEXT: ;;#ASMSTART 4353; GFX11-PAL-NEXT: ; use v1 4354; GFX11-PAL-NEXT: ;;#ASMEND 4355; GFX11-PAL-NEXT: s_endpgm 4356bb: 4357 %alloca = alloca [128 x <4 x i32>], align 16, addrspace(5) 4358 %alloca2 = alloca [128 x <4 x i32>], align 16, addrspace(5) 4359 %gep = getelementptr inbounds [128 x <4 x i32>], [128 x <4 x i32>] addrspace(5)* %alloca2, i32 0, i32 60 4360 store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(5)* %gep, align 16 4361 %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %gep, align 16 4362 call void asm sideeffect "; use $0", "s"([128 x <4 x i32>] addrspace(5)* %alloca) #0 4363 call void asm sideeffect "; use $0", "s"([128 x <4 x i32>] addrspace(5)* %alloca2) #0 4364 ret void 4365} 4366 4367declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg) 4368declare i32 
@llvm.amdgcn.workitem.id.x() 4369