; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s

; NOTE(review): this file had lost its newlines; the layout below restores the
; canonical one-FileCheck-directive-per-line form with token content unchanged.
; Do not hand-edit the CHECK bodies — regenerate with update_llc_test_checks.py.

define amdgpu_kernel void @zero_init_kernel() {
; GFX9-LABEL: zero_init_kernel:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:76
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:72
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:68
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:64
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:60
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:56
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:52
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:48
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:44
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:40
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:36
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:32
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:28
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:24
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:20
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:16
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: zero_init_kernel:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:76
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:72
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:68
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:64
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:60
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:56
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:52
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:48
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:44
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:40
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:36
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:32
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:28
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:24
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:20
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:16
; GFX10-NEXT:    s_endpgm
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

define void @zero_init_foo() {
; GFX9-LABEL: zero_init_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:60
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:56
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:52
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:48
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:44
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:40
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:36
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:32
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:28
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:24
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:20
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:16
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:12
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:8
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:4
; GFX9-NEXT:    scratch_store_dword off, v0, s32
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: zero_init_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:60
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:56
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:52
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:48
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:44
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:40
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:36
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:32
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:28
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:24
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:20
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:16
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:12
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:8
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:4
; GFX10-NEXT:    scratch_store_dword off, v0, s32
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-NEXT:    s_and_b32 s0, s0, 15
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    s_add_u32 s1, 4, s1
; GFX9-NEXT:    scratch_store_dword off, v0, s1
; GFX9-NEXT:    s_add_u32 s0, 4, s0
; GFX9-NEXT:    scratch_load_dword v0, off, s0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_and_b32 s1, s0, 15
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-NEXT:    s_add_u32 s0, 4, s0
; GFX10-NEXT:    s_add_u32 s1, 4, s1
; GFX10-NEXT:    scratch_store_dword off, v0, s0
; GFX10-NEXT:    scratch_load_dword v0, off, s1
; GFX10-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX9-LABEL: store_load_sindex_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
; GFX9-NEXT:    s_add_u32 s0, 4, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0
; GFX9-NEXT:    s_and_b32 s0, s2, 15
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    s_add_u32 s0, 4, s0
; GFX9-NEXT:    scratch_load_dword v0, off, s0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    s_and_b32 s0, s2, 15
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_add_u32 s1, 4, s1
; GFX10-NEXT:    s_add_u32 s0, 4, s0
; GFX10-NEXT:    scratch_store_dword off, v0, s1
; GFX10-NEXT:    scratch_load_dword v0, off, s0
; GFX10-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9-LABEL: store_load_vindex_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 4
; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vindex_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v1, 4
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, 15
; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
; GFX10-NEXT:    scratch_store_dword v2, v3, off
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124
; GFX10-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i3 = zext i32 %i2 to i64
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

define void @store_load_vindex_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s32
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    scratch_load_dword v0, v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    v_mov_b32_e32 v2, s32
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_and_b32_e32 v3, v0, v1
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
; GFX10-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
; GFX10-NEXT:    scratch_store_dword v0, v1, off
; GFX10-NEXT:    scratch_load_dword v0, v2, off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
; GFX9-LABEL: private_ptr_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_ptr_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
  store float 1.000000e+01, float addrspace(5)* %gep, align 4
  ret void
}

define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX9-LABEL: zero_init_small_offset_kernel:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:284
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:280
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:276
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:272
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:300
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:296
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:292
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:288
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:316
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:312
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:308
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:304
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:332
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:328
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:324
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:320
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: zero_init_small_offset_kernel:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:284
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:280
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:276
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:272
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:300
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:296
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:292
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:288
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:316
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:312
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:308
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:304
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:332
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:328
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:324
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:320
; GFX10-NEXT:    s_endpgm
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

define void @zero_init_small_offset_foo() {
; GFX9-LABEL: zero_init_small_offset_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, off, s32
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:268
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:264
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:260
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:256
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:284
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:280
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:276
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:272
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:300
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:296
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:292
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:288
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:316
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:312
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:308
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:304
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: zero_init_small_offset_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s32
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:268
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:264
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:260
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:256
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:284
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:280
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:276
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:272
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:300
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:296
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:292
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:288
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:316
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:312
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:308
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:304
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_small_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-NEXT:    s_and_b32 s0, s0, 15
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    s_add_u32 s1, 0x104, s1
; GFX9-NEXT:    scratch_store_dword off, v0, s1
; GFX9-NEXT:    s_add_u32 s0, 0x104, s0
; GFX9-NEXT:    scratch_load_dword v0, off, s0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_small_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_and_b32 s1, s0, 15
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-NEXT:    s_add_u32 s0, 0x104, s0
; GFX10-NEXT:    s_add_u32 s1, 0x104, s1
; GFX10-NEXT:    scratch_store_dword off, v0, s0
; GFX10-NEXT:    scratch_load_dword v0, off, s1
; GFX10-NEXT:    s_endpgm
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
; GFX9-LABEL: store_load_sindex_small_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
; GFX9-NEXT:    s_add_u32 s0, 0x104, s0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0
; GFX9-NEXT:    s_and_b32 s0, s2, 15
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    s_add_u32 s0, 0x104, s0
; GFX9-NEXT:    scratch_load_dword v0, off, s0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_small_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4
; GFX10-NEXT:    s_and_b32 s0, s2, 15
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_add_u32 s1, 0x104, s1
; GFX10-NEXT:    s_add_u32 s0, 0x104, s0
; GFX10-NEXT:    scratch_store_dword off, v0, s1
; GFX10-NEXT:    scratch_load_dword v0, off, s0
; GFX10-NEXT:    s_endpgm
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX9-LABEL: store_load_vindex_small_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x104
; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vindex_small_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v1, 0x104
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, 15
; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
; GFX10-NEXT:    scratch_load_dword v1, off, off offset:4
; GFX10-NEXT:    scratch_store_dword v2, v3, off
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124
; GFX10-NEXT:    s_endpgm
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i3 = zext i32 %i2 to i64
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_small_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    scratch_load_dword v1, off, s32
; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x100
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    scratch_load_dword v0, v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_small_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x100
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_mov_b32_e32 v2, vcc_lo
; GFX10-NEXT:    v_and_b32_e32 v3, v0, v1
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
; GFX10-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
; GFX10-NEXT:    scratch_load_dword v3, off, s32
; GFX10-NEXT:    scratch_store_dword v0, v1, off
; GFX10-NEXT:    scratch_load_dword v0, v2, off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

define amdgpu_kernel void @zero_init_large_offset_kernel() {
; GFX9-LABEL: zero_init_large_offset_kernel:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:12
; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:8
; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi
; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:28
; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:24
; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:20
; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:16
; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:44
; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:40
; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:36
; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:32
; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:60
; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:56
; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:52
; GFX9-NEXT:    s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:48
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: zero_init_large_offset_kernel:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:12
; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:8
; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:4
; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo
; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:28
; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:24
; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:20
; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:16
; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:44
; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:40
; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:36
; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:32
; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:60
; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:56
; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:52
; GFX10-NEXT:    s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT:    scratch_store_dword off, v0, vcc_lo offset:48
; GFX10-NEXT:    s_endpgm
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

define void @zero_init_large_offset_foo() {
; GFX9-LABEL: zero_init_large_offset_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, off, s32
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:12
; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:8
; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi
; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:28
; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:24
; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:20
; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:16
; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:44
; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:40
; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:36
; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:32
; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:60
; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:56
; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:52
; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:48
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: zero_init_large_offset_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s32
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:
v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:12 ; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 ; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:8 ; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 ; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:4 ; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 ; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo ; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 ; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:28 ; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 ; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:24 ; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 ; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:20 ; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 ; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:16 ; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 ; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:44 ; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 ; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:40 ; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 ; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:36 ; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 ; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:32 ; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 ; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:60 ; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 ; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:56 ; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 ; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:52 ; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 ; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:48 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %padding = alloca [4096 x i32], align 4, addrspace(5) %alloca = alloca [32 x i16], align 2, addrspace(5) %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 
%pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) ret void } define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_large_offset_kernel: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_add_u32 s1, 0x4004, s1 ; GFX9-NEXT: scratch_store_dword off, v0, s1 ; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 ; GFX9-NEXT: scratch_load_dword v0, off, s0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_large_offset_kernel: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_add_u32 s2, s2, s5 ; GFX10-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s1, s0, 15 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-NEXT: s_lshl_b32 s1, s1, 2 ; GFX10-NEXT: s_add_u32 s0, 0x4004, s0 ; GFX10-NEXT: s_add_u32 s1, 0x4004, s1 ; GFX10-NEXT: scratch_store_dword off, v0, s0 ; GFX10-NEXT: scratch_load_dword v0, off, s1 ; GFX10-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 
%pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* store volatile i32 15, i32 addrspace(5)* %i8, align 4 %i9 = and i32 %idx, 15 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 ret void } define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { ; GFX9-LABEL: store_load_sindex_large_offset_foo: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 ; GFX9-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-NEXT: s_lshl_b32 s0, s2, 2 ; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 ; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: scratch_store_dword off, v0, s0 ; GFX9-NEXT: s_and_b32 s0, s2, 15 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 ; GFX9-NEXT: scratch_load_dword v0, off, s0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: store_load_sindex_large_offset_foo: ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_add_u32 s0, s0, s3 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 ; GFX10-NEXT: s_and_b32 s0, s2, 15 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-NEXT: s_lshl_b32 s1, s2, 2 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-NEXT: s_add_u32 s1, 0x4004, s1 ; GFX10-NEXT: s_add_u32 s0, 0x4004, s0 ; GFX10-NEXT: scratch_store_dword off, v0, s1 ; GFX10-NEXT: scratch_load_dword v0, off, s0 ; GFX10-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, 
addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Scratch store/load addressed by the per-lane workitem id (a VGPR index)
; into an alloca that sits past a 16 KiB padding alloca, so the 0x4004 base
; must be materialized in a VGPR and combined with the index.
define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX9-LABEL: store_load_vindex_large_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4004
; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vindex_large_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v1, 0x4004
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, 15
; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
; GFX10-NEXT:    scratch_load_dword v1, off, off offset:4
; GFX10-NEXT:    scratch_store_dword v2, v3, off
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124
; GFX10-NEXT:    s_endpgm
bb:
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i3 = zext i32 %i2 to i64
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; VGPR-indexed scratch access in a callable function: the frame base is
; s32 + 0x4000 and the index arrives as a plain VGPR argument.
define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_large_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    scratch_load_dword v1, off, s32
; GFX9-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    scratch_load_dword v0, v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_large_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_mov_b32_e32 v2, vcc_lo
; GFX10-NEXT:    v_and_b32_e32 v3, v0, v1
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
; GFX10-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
; GFX10-NEXT:    scratch_load_dword v3, off, s32
; GFX10-NEXT:    scratch_store_dword v0, v1, off
; GFX10-NEXT:    scratch_load_dword v0, v2, off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
bb:
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Store/load at a large constant element index (4000) within a 16 KiB
; alloca: part of the byte offset is folded into the immediate offset field
; and the rest is added via an SGPR.
define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-LABEL: store_load_large_imm_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_movk_i32 s0, 0x3000
; GFX9-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
; GFX9-NEXT:    s_add_u32 s0, 4, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_large_imm_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v0, 13
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_movk_i32 s0, 0x3800
; GFX10-NEXT:    s_add_u32 s0, 4, s0
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:4
; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664
; GFX10-NEXT:    s_endpgm
bb:
  %i = alloca [4096 x i32], align 4, addrspace(5)
  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
  ret void
}

; Same large constant element offset (index 4000), non-kernel version:
; the split base is added to the stack pointer s32.
define void @store_load_large_imm_offset_foo() {
; GFX9-LABEL: store_load_large_imm_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_movk_i32 s4, 0x3000
; GFX9-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-NEXT:    scratch_store_dword off, v0, s32
; GFX9-NEXT:    s_add_u32 s4, s32, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s4 offset:3712
; GFX9-NEXT:    scratch_load_dword v0, off, s4 offset:3712
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_large_imm_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v0, 13
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_movk_i32 s4, 0x3800
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_add_u32 s4, s32, s4
; GFX10-NEXT:    scratch_store_dword off, v0, s32
; GFX10-NEXT:    scratch_store_dword off, v1, s4 offset:1664
; GFX10-NEXT:    scratch_load_dword v0, off, s4 offset:1664
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
bb:
  %i = alloca [4096 x i32], align 4, addrspace(5)
  %i1 = getelementptr inbounds
[4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
  ret void
}

; Combined addressing: workitem id (VGPR) + SGPR kernel argument + constant
; 256, indexing a 32-dword alloca; the constant part lands in the immediate
; offset field (offset:1024).
define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX9-LABEL: store_load_vidx_sidx_offset:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    v_mov_b32_e32 v1, 4
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:1024
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vidx_sidx_offset:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:1024
; GFX10-NEXT:    s_endpgm
bb:
  %alloca = alloca [32 x i32], align 4, addrspace(5)
  %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
  %add1 = add nsw i32 %sidx, %vidx
  %add2 = add nsw i32 %add1, 256
  %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2
  store volatile i32 15, i32 addrspace(5)* %gep, align 4
  %load = load volatile i32, i32 addrspace(5)* %gep, align 4
  ret void
}

; FIXME: Multi-DWORD scratch shall be supported
define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_aligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    scratch_store_dword v0, v1, off
; GFX9-NEXT:    scratch_load_dword v1, v0, off offset:4
; GFX9-NEXT:    scratch_load_dword v0, v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_aligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX10-NEXT:    scratch_store_dword v0, v2, off
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    scratch_load_dword v1, v0, off offset:4
; GFX10-NEXT:    scratch_load_dword v0, v0, off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile i64 15, i64 addrspace(5)* %arg, align 8
  %load = load volatile i64, i64 addrspace(5)* %arg, align 8
  ret void
}

; FIXME: Multi-DWORD unaligned scratch shall be supported
define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    scratch_store_byte v0, v1, off offset:7
; GFX9-NEXT:    scratch_store_byte v0, v1, off offset:6
; GFX9-NEXT:    scratch_store_byte v0, v1, off offset:5
; GFX9-NEXT:    scratch_store_byte v0, v1, off offset:4
; GFX9-NEXT:    scratch_store_byte v0, v1, off offset:3
; GFX9-NEXT:    scratch_store_byte v0, v1, off offset:2
; GFX9-NEXT:    scratch_store_byte v0, v1, off offset:1
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    scratch_store_byte v0, v1, off
; GFX9-NEXT:    scratch_load_ubyte v1, v0, off offset:6
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_ubyte v1, v0, off offset:7
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_ubyte v1, v0, off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_ubyte v1, v0, off offset:5
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_ubyte v1, v0, off offset:2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_ubyte v1, v0, off offset:3
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_ubyte v1, v0, off
; GFX9-NEXT:    scratch_load_ubyte v0, v0, off offset:1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:7
; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:6
; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:5
; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:4
; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:3
; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:2
; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:1
; GFX10-NEXT:    scratch_store_byte v0, v2, off
; GFX10-NEXT:    scratch_load_ubyte v1, v0, off offset:6
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    scratch_load_ubyte v1, v0, off offset:7
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    scratch_load_ubyte v1, v0, off offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    scratch_load_ubyte v1, v0, off offset:5
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    scratch_load_ubyte v1, v0, off offset:2
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    scratch_load_ubyte v1, v0, off offset:3
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    scratch_load_ubyte v1, v0, off
; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile i64 15, i64 addrspace(5)* %arg, align 1
  %load = load volatile i64, i64 addrspace(5)* %arg, align 1
  ret void
}

declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg)
declare i32 @llvm.amdgcn.workitem.id.x()