; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s

; Codegen test for accesses to private (addrspace(5)) memory when llc is run
; with -amdgpu-enable-flat-scratch and with alloca promotion disabled
; (-mattr=-promote-alloca). The same IR is compiled for gfx900 and for gfx1030
; and each target has its own block of expected instructions per function.
; The shared GCN prefix is enabled on both run lines, but no assertion in this
; part of the file uses it.
; The assertions are autogenerated: after changing any IR below, regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.

; Kernel that zeroes a 64-byte private array ([32 x i16]) with llvm.memset.
; Expected code: sixteen scratch_store_dword instructions covering byte
; offsets 16 through 76.
define amdgpu_kernel void @zero_init_kernel() {
; GFX9-LABEL: zero_init_kernel:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:76
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:72
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:68
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:64
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:60
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:56
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:52
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:48
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:44
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:40
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:36
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:32
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:28
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:24
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:20
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:16
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: zero_init_kernel:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: scratch_store_dword off, v0, off offset:76
; GFX10-NEXT: scratch_store_dword off, v0, off offset:72
; GFX10-NEXT: scratch_store_dword off, v0, off offset:68
; GFX10-NEXT: scratch_store_dword off, v0, off offset:64
; GFX10-NEXT: scratch_store_dword off, v0, off offset:60
; GFX10-NEXT: scratch_store_dword off, v0, off offset:56
; GFX10-NEXT: scratch_store_dword off, v0, off offset:52
; GFX10-NEXT: scratch_store_dword off, v0, off offset:48
; GFX10-NEXT: scratch_store_dword off, v0, off offset:44
; GFX10-NEXT: scratch_store_dword off, v0, off offset:40
; GFX10-NEXT: scratch_store_dword off, v0, off offset:36
; GFX10-NEXT: scratch_store_dword off, v0, off offset:32
; GFX10-NEXT: scratch_store_dword off, v0, off offset:28
; GFX10-NEXT: scratch_store_dword off, v0, off offset:24
; GFX10-NEXT: scratch_store_dword off, v0, off offset:20
; GFX10-NEXT: scratch_store_dword off, v0, off offset:16
; GFX10-NEXT: s_endpgm
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

; Same 64-byte memset as @zero_init_kernel, but in a function with the default
; calling convention. The expected stores are addressed relative to s32, at
; byte offsets 0 through 60.
define void @zero_init_foo() {
; GFX9-LABEL: zero_init_foo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:60
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:56
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:52
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:48
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:44
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:40
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:36
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:32
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:28
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:24
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:20
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:16
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:12
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:8
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX9-NEXT: scratch_store_dword off, v0, s32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: zero_init_foo:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:60
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:56
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:52
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:48
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:44
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:40
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:36
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:32
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:28
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:24
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:20
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:16
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:12
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:8
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX10-NEXT: scratch_store_dword off, v0, s32
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

; Kernel whose element index is a kernel argument: volatile store of 15 to
; element %idx of a private [32 x float] array, then a volatile load from
; element (%idx & 15). In the expected code both addresses are computed in
; SGPRs (s_lshl_b32 / s_add_u32) and used as the scalar address operand of the
; scratch instructions.
; NOTE(review): %i1 is not used by any later instruction in this function.
define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-NEXT: s_and_b32 s0, s0, 15
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-NEXT: s_add_u32 s1, 4, s1
; GFX9-NEXT: scratch_store_dword off, v0, s1
; GFX9-NEXT: s_add_u32 s0, 4, s0
; GFX9-NEXT: scratch_load_dword v0, off, s0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_sindex_kernel:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s2, s2, s5
; GFX10-NEXT: s_addc_u32 s3, s3, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 15
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s1, s0, 15
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
; GFX10-NEXT: s_lshl_b32 s1, s1, 2
; GFX10-NEXT: s_add_u32 s0, 4, s0
; GFX10-NEXT: s_add_u32 s1, 4, s1
; GFX10-NEXT: scratch_store_dword off, v0, s0
; GFX10-NEXT: scratch_load_dword v0, off, s1
; GFX10-NEXT: s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Same store/load pattern as @store_load_sindex_kernel, in an amdgpu_ps
; function whose index is an inreg argument. The expected code reads the index
; from s2 and again forms both addresses in SGPRs.
define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX9-LABEL: store_load_sindex_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_lshl_b32 s0, s2, 2
; GFX9-NEXT: s_add_u32 s0, 4, s0
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: scratch_store_dword off, v0, s0
; GFX9-NEXT: s_and_b32 s0, s2, 15
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-NEXT: s_add_u32 s0, 4, s0
; GFX9-NEXT: scratch_load_dword v0, off, s0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_sindex_foo:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: s_and_b32 s0, s2, 15
; GFX10-NEXT: v_mov_b32_e32 v0, 15
; GFX10-NEXT: s_lshl_b32 s1, s2, 2
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
; GFX10-NEXT: s_add_u32 s1, 4, s1
; GFX10-NEXT: s_add_u32 s0, 4, s0
; GFX10-NEXT: scratch_store_dword off, v0, s1
; GFX10-NEXT: scratch_load_dword v0, off, s0
; GFX10-NEXT: s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Kernel that indexes the private array with the work-item id: volatile store
; of 15 to element id.x, then a volatile load from element (31 - id.x). The
; expected code forms both addresses in VGPRs; the load address is a v_sub
; result used with offset:124 (31 * 4 bytes).
; NOTE(review): %i1 and %i3 are not used by any later instruction here.
define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9-LABEL: store_load_vindex_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 4
; GFX9-NEXT: v_add_u32_e32 v2, v1, v0
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: v_mov_b32_e32 v3, 15
; GFX9-NEXT: scratch_store_dword v2, v3, off
; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_vindex_kernel:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v1, 4
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 15
; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GFX10-NEXT: scratch_store_dword v2, v3, off
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124
; GFX10-NEXT: s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i3 = zext i32 %i2 to i64
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Function with the default calling convention whose index is an ordinary
; (non-inreg) i32 argument: volatile store of 15 to element %idx, then a
; volatile load from element (%idx & 15). The expected code copies s32 into a
; VGPR and forms both addresses with v_lshl_add_u32.
define void @store_load_vindex_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s32
; GFX9-NEXT: v_mov_b32_e32 v3, 15
; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT: v_and_b32_e32 v0, v0, v3
; GFX9-NEXT: scratch_store_dword v2, v3, off
; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT: scratch_load_dword v0, v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_foo:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v1, 15
; GFX10-NEXT: v_mov_b32_e32 v2, s32
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_and_b32_e32 v3, v0, v1
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2
; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2
; GFX10-NEXT: scratch_store_dword v0, v1, off
; GFX10-NEXT: scratch_load_dword v0, v2, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Stores float 10.0 (bit pattern 0x41200000) to element 1 of a private pointer
; passed as an argument. The expected store takes its address from v0 with
; offset:4.
define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
; GFX9-LABEL: private_ptr_foo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000
; GFX9-NEXT: scratch_store_dword v0, v1, off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_ptr_foo:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v1, 0x41200000
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: scratch_store_dword v0, v1, off offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
  store float 1.000000e+01, float addrspace(5)* %gep, align 4
  ret void
}

; The *_small_offset_* functions repeat the tests above with a 256-byte
; [64 x i32] padding alloca declared before the tested array, so the tested
; array is expected at a non-zero offset. The only use of %padding is one
; volatile load at an undef index.
;
; Kernel memset variant: the expected stores cover byte offsets 272 through
; 332.
define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX9-LABEL: zero_init_small_offset_kernel:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:284
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:280
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:276
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:272
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:300
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:296
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:292
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:288
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:316
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:312
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:308
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:304
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:332
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:328
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:324
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:320
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: zero_init_small_offset_kernel:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_dword v0, off, off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: scratch_store_dword off, v0, off offset:284
; GFX10-NEXT: scratch_store_dword off, v0, off offset:280
; GFX10-NEXT: scratch_store_dword off, v0, off offset:276
; GFX10-NEXT: scratch_store_dword off, v0, off offset:272
; GFX10-NEXT: scratch_store_dword off, v0, off offset:300
; GFX10-NEXT: scratch_store_dword off, v0, off offset:296
; GFX10-NEXT: scratch_store_dword off, v0, off offset:292
; GFX10-NEXT: scratch_store_dword off, v0, off offset:288
; GFX10-NEXT: scratch_store_dword off, v0, off offset:316
; GFX10-NEXT: scratch_store_dword off, v0, off offset:312
; GFX10-NEXT: scratch_store_dword off, v0, off offset:308
; GFX10-NEXT: scratch_store_dword off, v0, off offset:304
; GFX10-NEXT: scratch_store_dword off, v0, off offset:332
; GFX10-NEXT: scratch_store_dword off, v0, off offset:328
; GFX10-NEXT: scratch_store_dword off, v0, off offset:324
; GFX10-NEXT: scratch_store_dword off, v0, off offset:320
; GFX10-NEXT: s_endpgm
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

; Memset with 256 bytes of padding in a function with the default calling
; convention: the expected stores are relative to s32 and cover byte offsets
; 256 through 316.
define void @zero_init_small_offset_foo() {
; GFX9-LABEL: zero_init_small_offset_foo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, off, s32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:268
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:264
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:260
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:256
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:284
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:280
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:276
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:272
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:300
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:296
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:292
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:288
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:316
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:312
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:308
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:304
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: zero_init_small_offset_foo:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s32
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:268
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:264
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:260
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:256
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:284
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:280
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:276
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:272
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:300
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:296
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:292
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:288
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:316
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:312
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:308
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:304
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

; Scalar-index store/load (index from a kernel argument) with 256 bytes of
; padding in front of the array. The expected code adds 0x104 (260) to the
; scaled index in SGPRs, where the unpadded kernel above adds 4.
define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_small_offset_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-NEXT: s_and_b32 s0, s0, 15
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: s_add_u32 s1, 0x104, s1
; GFX9-NEXT: scratch_store_dword off, v0, s1
; GFX9-NEXT: s_add_u32 s0, 0x104, s0
; GFX9-NEXT: scratch_load_dword v0, off, s0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_sindex_small_offset_kernel:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s2, s2, s5
; GFX10-NEXT: s_addc_u32 s3, s3, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT: scratch_load_dword v0, off, off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 15
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s1, s0, 15
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
; GFX10-NEXT: s_lshl_b32 s1, s1, 2
; GFX10-NEXT: s_add_u32 s0, 0x104, s0
; GFX10-NEXT: s_add_u32 s1, 0x104, s1
; GFX10-NEXT: scratch_store_dword off, v0, s0
; GFX10-NEXT: scratch_load_dword v0, off, s1
; GFX10-NEXT: s_endpgm
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Scalar-index store/load with 256 bytes of padding in an amdgpu_ps function
; whose index is an inreg argument (s2 in the expected code); the array base
; offset in the expected code is again 0x104.
define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
; GFX9-LABEL: store_load_sindex_small_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: s_lshl_b32 s0, s2, 2
; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4
; GFX9-NEXT: s_add_u32 s0, 0x104, s0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: scratch_store_dword off, v0, s0
; GFX9-NEXT: s_and_b32 s0, s2, 15
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-NEXT: s_add_u32 s0, 0x104, s0
; GFX9-NEXT: scratch_load_dword v0, off, s0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_sindex_small_offset_foo:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_dword v0, off, off offset:4
; GFX10-NEXT: s_and_b32 s0, s2, 15
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 15
; GFX10-NEXT: s_lshl_b32 s1, s2, 2
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
; GFX10-NEXT: s_add_u32 s1, 0x104, s1
; GFX10-NEXT: s_add_u32 s0, 0x104, s0
; GFX10-NEXT: scratch_store_dword off, v0, s1
; GFX10-NEXT: scratch_load_dword v0, off, s0
; GFX10-NEXT: s_endpgm
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Work-item-indexed store/load with 256 bytes of padding in front of the
; array. The expected code materialises the array base 0x104 in a VGPR and
; keeps the offset:124 (31 * 4 bytes) on the load.
define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX9-LABEL: store_load_vindex_small_offset_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 0x104
; GFX9-NEXT: v_add_u32_e32 v2, v1, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 15
; GFX9-NEXT: scratch_store_dword v2, v3, off
; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_vindex_small_offset_kernel:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v1, 0x104
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 15
; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GFX10-NEXT: scratch_load_dword v1, off, off offset:4
; GFX10-NEXT: scratch_store_dword v2, v3, off
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124
; GFX10-NEXT: s_endpgm
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i3 = zext i32 %i2 to i64
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; VGPR-index store/load (index is an ordinary i32 argument) with 256 bytes of
; padding. The expected code first computes s32 + 0x100 with s_add_u32 into
; vcc_hi (gfx900) or vcc_lo (gfx1030), then copies it to a VGPR as the array
; base.
define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_small_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: scratch_load_dword v1, off, s32
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x100
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi
; GFX9-NEXT: v_mov_b32_e32 v3, 15
; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT: v_and_b32_e32 v0, v0, v3
; GFX9-NEXT: scratch_store_dword v2, v3, off
; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT: scratch_load_dword v0, v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_small_offset_foo:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v1, 15
; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x100
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo
; GFX10-NEXT: v_and_b32_e32 v3, v0, v1
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2
; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2
; GFX10-NEXT: scratch_load_dword v3, off, s32
; GFX10-NEXT: scratch_store_dword v0, v1, off
; GFX10-NEXT: scratch_load_dword v0, v2, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Kernel memset with a 16384-byte [4096 x i32] padding alloca in front of the
; zeroed array. In the expected code every store is preceded by
; s_movk_i32 0x4010 into vcc_hi (gfx900) or vcc_lo (gfx1030), and the store
; itself carries only a small offset of 0 through 60.
define amdgpu_kernel void @zero_init_large_offset_kernel() {
; GFX9-LABEL: zero_init_large_offset_kernel:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:12
; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:8
; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4
; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi
; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:28
; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:24
; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:20
; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:16
; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:44
; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:40
; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:36
; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:32
; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:60
; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:56
; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:52
; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:48
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: zero_init_large_offset_kernel:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_dword v0, off, off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:12
; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:8
; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:4
; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo
; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:28
; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:24
; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:20
; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:16
; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:44
; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:40
; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:36
; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:32
; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:60
; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:56
; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:52
; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:48
; GFX10-NEXT: s_endpgm
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

define void @zero_init_large_offset_foo() {
; GFX9-LABEL: zero_init_large_offset_foo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, off, s32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:12
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:8
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:28
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:24
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:20
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:16
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:44
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:40
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:36
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:32
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:60
; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000
; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:56
; GFX9-NEXT: s_add_u32 vcc_hi, 
s32, 0x4000 805; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:52 806; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 807; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:48 808; GFX9-NEXT: s_waitcnt vmcnt(0) 809; GFX9-NEXT: s_setpc_b64 s[30:31] 810; 811; GFX10-LABEL: zero_init_large_offset_foo: 812; GFX10: ; %bb.0: 813; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 814; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 815; GFX10-NEXT: scratch_load_dword v0, off, s32 816; GFX10-NEXT: s_waitcnt vmcnt(0) 817; GFX10-NEXT: v_mov_b32_e32 v0, 0 818; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 819; GFX10-NEXT: ; implicit-def: $vcc_hi 820; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:12 821; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 822; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:8 823; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 824; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:4 825; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 826; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo 827; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 828; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:28 829; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 830; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:24 831; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 832; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:20 833; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 834; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:16 835; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 836; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:44 837; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 838; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:40 839; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 840; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:36 841; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 842; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:32 843; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 844; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo 
offset:60 845; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 846; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:56 847; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 848; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:52 849; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 850; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:48 851; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 852; GFX10-NEXT: s_setpc_b64 s[30:31] 853 %padding = alloca [4096 x i32], align 4, addrspace(5) 854 %alloca = alloca [32 x i16], align 2, addrspace(5) 855 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 856 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 857 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 858 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 859 ret void 860} 861 862define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { 863; GFX9-LABEL: store_load_sindex_large_offset_kernel: 864; GFX9: ; %bb.0: ; %bb 865; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 866; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 867; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 868; GFX9-NEXT: s_mov_b32 vcc_hi, 0 869; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 870; GFX9-NEXT: s_waitcnt lgkmcnt(0) 871; GFX9-NEXT: s_lshl_b32 s1, s0, 2 872; GFX9-NEXT: s_and_b32 s0, s0, 15 873; GFX9-NEXT: s_lshl_b32 s0, s0, 2 874; GFX9-NEXT: s_waitcnt vmcnt(0) 875; GFX9-NEXT: v_mov_b32_e32 v0, 15 876; GFX9-NEXT: s_add_u32 s1, 0x4004, s1 877; GFX9-NEXT: scratch_store_dword off, v0, s1 878; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 879; GFX9-NEXT: scratch_load_dword v0, off, s0 880; GFX9-NEXT: s_endpgm 881; 882; GFX10-LABEL: store_load_sindex_large_offset_kernel: 883; GFX10: ; %bb.0: ; %bb 884; GFX10-NEXT: s_add_u32 s2, s2, s5 885; GFX10-NEXT: s_addc_u32 s3, s3, 0 886; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 887; GFX10-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s3 888; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 889; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 890; GFX10-NEXT: s_waitcnt vmcnt(0) 891; GFX10-NEXT: v_mov_b32_e32 v0, 15 892; GFX10-NEXT: s_waitcnt lgkmcnt(0) 893; GFX10-NEXT: s_and_b32 s1, s0, 15 894; GFX10-NEXT: s_lshl_b32 s0, s0, 2 895; GFX10-NEXT: s_lshl_b32 s1, s1, 2 896; GFX10-NEXT: s_add_u32 s0, 0x4004, s0 897; GFX10-NEXT: s_add_u32 s1, 0x4004, s1 898; GFX10-NEXT: scratch_store_dword off, v0, s0 899; GFX10-NEXT: scratch_load_dword v0, off, s1 900; GFX10-NEXT: s_endpgm 901bb: 902 %padding = alloca [4096 x i32], align 4, addrspace(5) 903 %i = alloca [32 x float], align 4, addrspace(5) 904 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 905 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 906 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 907 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 908 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 909 store volatile i32 15, i32 addrspace(5)* %i8, align 4 910 %i9 = and i32 %idx, 15 911 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 912 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 913 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 914 ret void 915} 916 917define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { 918; GFX9-LABEL: store_load_sindex_large_offset_foo: 919; GFX9: ; %bb.0: ; %bb 920; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 921; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 922; GFX9-NEXT: s_mov_b32 vcc_hi, 0 923; GFX9-NEXT: s_lshl_b32 s0, s2, 2 924; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 925; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 926; GFX9-NEXT: s_waitcnt vmcnt(0) 927; GFX9-NEXT: v_mov_b32_e32 v0, 15 928; GFX9-NEXT: scratch_store_dword off, v0, s0 929; GFX9-NEXT: s_and_b32 s0, 
s2, 15 930; GFX9-NEXT: s_lshl_b32 s0, s0, 2 931; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 932; GFX9-NEXT: scratch_load_dword v0, off, s0 933; GFX9-NEXT: s_endpgm 934; 935; GFX10-LABEL: store_load_sindex_large_offset_foo: 936; GFX10: ; %bb.0: ; %bb 937; GFX10-NEXT: s_add_u32 s0, s0, s3 938; GFX10-NEXT: s_addc_u32 s1, s1, 0 939; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 940; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 941; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 942; GFX10-NEXT: s_and_b32 s0, s2, 15 943; GFX10-NEXT: s_waitcnt vmcnt(0) 944; GFX10-NEXT: v_mov_b32_e32 v0, 15 945; GFX10-NEXT: s_lshl_b32 s1, s2, 2 946; GFX10-NEXT: s_lshl_b32 s0, s0, 2 947; GFX10-NEXT: s_add_u32 s1, 0x4004, s1 948; GFX10-NEXT: s_add_u32 s0, 0x4004, s0 949; GFX10-NEXT: scratch_store_dword off, v0, s1 950; GFX10-NEXT: scratch_load_dword v0, off, s0 951; GFX10-NEXT: s_endpgm 952bb: 953 %padding = alloca [4096 x i32], align 4, addrspace(5) 954 %i = alloca [32 x float], align 4, addrspace(5) 955 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 956 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 957 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 958 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 959 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 960 store volatile i32 15, i32 addrspace(5)* %i8, align 4 961 %i9 = and i32 %idx, 15 962 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 963 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 964 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 965 ret void 966} 967 968define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { 969; GFX9-LABEL: store_load_vindex_large_offset_kernel: 970; GFX9: ; %bb.0: ; %bb 971; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 972; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 973; 
GFX9-NEXT: s_mov_b32 vcc_hi, 0 974; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 975; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 976; GFX9-NEXT: s_waitcnt vmcnt(0) 977; GFX9-NEXT: v_mov_b32_e32 v1, 0x4004 978; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 979; GFX9-NEXT: v_mov_b32_e32 v3, 15 980; GFX9-NEXT: scratch_store_dword v2, v3, off 981; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 982; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 983; GFX9-NEXT: s_endpgm 984; 985; GFX10-LABEL: store_load_vindex_large_offset_kernel: 986; GFX10: ; %bb.0: ; %bb 987; GFX10-NEXT: s_add_u32 s0, s0, s3 988; GFX10-NEXT: s_addc_u32 s1, s1, 0 989; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 990; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 991; GFX10-NEXT: v_mov_b32_e32 v1, 0x4004 992; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 993; GFX10-NEXT: v_mov_b32_e32 v3, 15 994; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 995; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 996; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 997; GFX10-NEXT: scratch_store_dword v2, v3, off 998; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 999; GFX10-NEXT: s_endpgm 1000bb: 1001 %padding = alloca [4096 x i32], align 4, addrspace(5) 1002 %i = alloca [32 x float], align 4, addrspace(5) 1003 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 1004 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1005 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1006 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() 1007 %i3 = zext i32 %i2 to i64 1008 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 1009 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1010 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1011 %i9 = sub nsw i32 31, %i2 1012 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1013 %i11 = bitcast float addrspace(5)* %i10 to 
i32 addrspace(5)* 1014 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1015 ret void 1016} 1017 1018define void @store_load_vindex_large_offset_foo(i32 %idx) { 1019; GFX9-LABEL: store_load_vindex_large_offset_foo: 1020; GFX9: ; %bb.0: ; %bb 1021; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1022; GFX9-NEXT: scratch_load_dword v1, off, s32 1023; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1024; GFX9-NEXT: s_waitcnt vmcnt(0) 1025; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi 1026; GFX9-NEXT: v_mov_b32_e32 v3, 15 1027; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 1028; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 1029; GFX9-NEXT: scratch_store_dword v2, v3, off 1030; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 1031; GFX9-NEXT: scratch_load_dword v0, v0, off 1032; GFX9-NEXT: s_waitcnt vmcnt(0) 1033; GFX9-NEXT: s_setpc_b64 s[30:31] 1034; 1035; GFX10-LABEL: store_load_vindex_large_offset_foo: 1036; GFX10: ; %bb.0: ; %bb 1037; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1038; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1039; GFX10-NEXT: v_mov_b32_e32 v1, 15 1040; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1041; GFX10-NEXT: ; implicit-def: $vcc_hi 1042; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo 1043; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 1044; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 1045; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 1046; GFX10-NEXT: scratch_load_dword v3, off, s32 1047; GFX10-NEXT: scratch_store_dword v0, v1, off 1048; GFX10-NEXT: scratch_load_dword v0, v2, off 1049; GFX10-NEXT: s_waitcnt vmcnt(0) 1050; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1051; GFX10-NEXT: s_setpc_b64 s[30:31] 1052bb: 1053 %padding = alloca [4096 x i32], align 4, addrspace(5) 1054 %i = alloca [32 x float], align 4, addrspace(5) 1055 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 1056 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1057 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1058 %i7 = getelementptr inbounds [32 
x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 1059 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1060 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1061 %i9 = and i32 %idx, 15 1062 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1063 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1064 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1065 ret void 1066} 1067 1068define amdgpu_kernel void @store_load_large_imm_offset_kernel() { 1069; GFX9-LABEL: store_load_large_imm_offset_kernel: 1070; GFX9: ; %bb.0: ; %bb 1071; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 1072; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 1073; GFX9-NEXT: s_movk_i32 s0, 0x3000 1074; GFX9-NEXT: v_mov_b32_e32 v0, 13 1075; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1076; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 1077; GFX9-NEXT: s_add_u32 s0, 4, s0 1078; GFX9-NEXT: v_mov_b32_e32 v0, 15 1079; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 1080; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 1081; GFX9-NEXT: s_endpgm 1082; 1083; GFX10-LABEL: store_load_large_imm_offset_kernel: 1084; GFX10: ; %bb.0: ; %bb 1085; GFX10-NEXT: s_add_u32 s0, s0, s3 1086; GFX10-NEXT: s_addc_u32 s1, s1, 0 1087; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 1088; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1089; GFX10-NEXT: v_mov_b32_e32 v0, 13 1090; GFX10-NEXT: v_mov_b32_e32 v1, 15 1091; GFX10-NEXT: s_movk_i32 s0, 0x3800 1092; GFX10-NEXT: s_add_u32 s0, 4, s0 1093; GFX10-NEXT: scratch_store_dword off, v0, off offset:4 1094; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 1095; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 1096; GFX10-NEXT: s_endpgm 1097bb: 1098 %i = alloca [4096 x i32], align 4, addrspace(5) 1099 %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef 1100 store volatile i32 13, i32 addrspace(5)* %i1, align 4 1101 %i7 = 
getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 1102 store volatile i32 15, i32 addrspace(5)* %i7, align 4 1103 %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 1104 %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4 1105 ret void 1106} 1107 1108define void @store_load_large_imm_offset_foo() { 1109; GFX9-LABEL: store_load_large_imm_offset_foo: 1110; GFX9: ; %bb.0: ; %bb 1111; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1112; GFX9-NEXT: s_movk_i32 s4, 0x3000 1113; GFX9-NEXT: v_mov_b32_e32 v0, 13 1114; GFX9-NEXT: scratch_store_dword off, v0, s32 1115; GFX9-NEXT: s_add_u32 s4, s32, s4 1116; GFX9-NEXT: v_mov_b32_e32 v0, 15 1117; GFX9-NEXT: scratch_store_dword off, v0, s4 offset:3712 1118; GFX9-NEXT: scratch_load_dword v0, off, s4 offset:3712 1119; GFX9-NEXT: s_waitcnt vmcnt(0) 1120; GFX9-NEXT: s_setpc_b64 s[30:31] 1121; 1122; GFX10-LABEL: store_load_large_imm_offset_foo: 1123; GFX10: ; %bb.0: ; %bb 1124; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1125; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1126; GFX10-NEXT: v_mov_b32_e32 v0, 13 1127; GFX10-NEXT: v_mov_b32_e32 v1, 15 1128; GFX10-NEXT: s_movk_i32 s4, 0x3800 1129; GFX10-NEXT: ; implicit-def: $vcc_hi 1130; GFX10-NEXT: s_add_u32 s4, s32, s4 1131; GFX10-NEXT: scratch_store_dword off, v0, s32 1132; GFX10-NEXT: scratch_store_dword off, v1, s4 offset:1664 1133; GFX10-NEXT: scratch_load_dword v0, off, s4 offset:1664 1134; GFX10-NEXT: s_waitcnt vmcnt(0) 1135; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1136; GFX10-NEXT: s_setpc_b64 s[30:31] 1137bb: 1138 %i = alloca [4096 x i32], align 4, addrspace(5) 1139 %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef 1140 store volatile i32 13, i32 addrspace(5)* %i1, align 4 1141 %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 1142 store volatile i32 15, i32 addrspace(5)* %i7, align 4 1143 %i10 = getelementptr 
inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 1144 %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4 1145 ret void 1146} 1147 1148define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { 1149; GFX9-LABEL: store_load_vidx_sidx_offset: 1150; GFX9: ; %bb.0: ; %bb 1151; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 1152; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 1153; GFX9-NEXT: v_mov_b32_e32 v1, 4 1154; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1155; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1156; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 1157; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 1158; GFX9-NEXT: v_mov_b32_e32 v1, 15 1159; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024 1160; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 1161; GFX9-NEXT: s_endpgm 1162; 1163; GFX10-LABEL: store_load_vidx_sidx_offset: 1164; GFX10: ; %bb.0: ; %bb 1165; GFX10-NEXT: s_add_u32 s2, s2, s5 1166; GFX10-NEXT: s_addc_u32 s3, s3, 0 1167; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1168; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1169; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 1170; GFX10-NEXT: v_mov_b32_e32 v1, 15 1171; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1172; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 1173; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, 4 1174; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024 1175; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 1176; GFX10-NEXT: s_endpgm 1177bb: 1178 %alloca = alloca [32 x i32], align 4, addrspace(5) 1179 %vidx = tail call i32 @llvm.amdgcn.workitem.id.x() 1180 %add1 = add nsw i32 %sidx, %vidx 1181 %add2 = add nsw i32 %add1, 256 1182 %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2 1183 store volatile i32 15, i32 addrspace(5)* %gep, align 4 1184 %load = load volatile i32, i32 addrspace(5)* %gep, align 4 1185 ret void 1186} 1187 1188; FIXME: Multi-DWORD scratch shall be supported 1189define void @store_load_i64_aligned(i64 
addrspace(5)* nocapture %arg) { 1190; GFX9-LABEL: store_load_i64_aligned: 1191; GFX9: ; %bb.0: ; %bb 1192; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1193; GFX9-NEXT: v_mov_b32_e32 v1, 0 1194; GFX9-NEXT: scratch_store_dword v0, v1, off offset:4 1195; GFX9-NEXT: v_mov_b32_e32 v1, 15 1196; GFX9-NEXT: scratch_store_dword v0, v1, off 1197; GFX9-NEXT: scratch_load_dword v1, v0, off offset:4 1198; GFX9-NEXT: scratch_load_dword v0, v0, off 1199; GFX9-NEXT: s_waitcnt vmcnt(0) 1200; GFX9-NEXT: s_setpc_b64 s[30:31] 1201; 1202; GFX10-LABEL: store_load_i64_aligned: 1203; GFX10: ; %bb.0: ; %bb 1204; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1205; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1206; GFX10-NEXT: v_mov_b32_e32 v1, 0 1207; GFX10-NEXT: v_mov_b32_e32 v2, 15 1208; GFX10-NEXT: ; implicit-def: $vcc_hi 1209; GFX10-NEXT: scratch_store_dword v0, v1, off offset:4 1210; GFX10-NEXT: scratch_store_dword v0, v2, off 1211; GFX10-NEXT: s_clause 0x1 1212; GFX10-NEXT: scratch_load_dword v1, v0, off offset:4 1213; GFX10-NEXT: scratch_load_dword v0, v0, off 1214; GFX10-NEXT: s_waitcnt vmcnt(0) 1215; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1216; GFX10-NEXT: s_setpc_b64 s[30:31] 1217bb: 1218 store volatile i64 15, i64 addrspace(5)* %arg, align 8 1219 %load = load volatile i64, i64 addrspace(5)* %arg, align 8 1220 ret void 1221} 1222 1223; FIXME: Multi-DWORD unaligned scratch shall be supported 1224define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) { 1225; GFX9-LABEL: store_load_i64_unaligned: 1226; GFX9: ; %bb.0: ; %bb 1227; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1228; GFX9-NEXT: v_mov_b32_e32 v1, 0 1229; GFX9-NEXT: scratch_store_byte v0, v1, off offset:7 1230; GFX9-NEXT: scratch_store_byte v0, v1, off offset:6 1231; GFX9-NEXT: scratch_store_byte v0, v1, off offset:5 1232; GFX9-NEXT: scratch_store_byte v0, v1, off offset:4 1233; GFX9-NEXT: scratch_store_byte v0, v1, off offset:3 1234; GFX9-NEXT: scratch_store_byte v0, v1, off offset:2 1235; 
GFX9-NEXT: scratch_store_byte v0, v1, off offset:1 1236; GFX9-NEXT: v_mov_b32_e32 v1, 15 1237; GFX9-NEXT: scratch_store_byte v0, v1, off 1238; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:6 1239; GFX9-NEXT: s_waitcnt vmcnt(0) 1240; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:7 1241; GFX9-NEXT: s_waitcnt vmcnt(0) 1242; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:4 1243; GFX9-NEXT: s_waitcnt vmcnt(0) 1244; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:5 1245; GFX9-NEXT: s_waitcnt vmcnt(0) 1246; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:2 1247; GFX9-NEXT: s_waitcnt vmcnt(0) 1248; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:3 1249; GFX9-NEXT: s_waitcnt vmcnt(0) 1250; GFX9-NEXT: scratch_load_ubyte v1, v0, off 1251; GFX9-NEXT: scratch_load_ubyte v0, v0, off offset:1 1252; GFX9-NEXT: s_waitcnt vmcnt(0) 1253; GFX9-NEXT: s_setpc_b64 s[30:31] 1254; 1255; GFX10-LABEL: store_load_i64_unaligned: 1256; GFX10: ; %bb.0: ; %bb 1257; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1258; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1259; GFX10-NEXT: v_mov_b32_e32 v1, 0 1260; GFX10-NEXT: v_mov_b32_e32 v2, 15 1261; GFX10-NEXT: ; implicit-def: $vcc_hi 1262; GFX10-NEXT: scratch_store_byte v0, v1, off offset:7 1263; GFX10-NEXT: scratch_store_byte v0, v1, off offset:6 1264; GFX10-NEXT: scratch_store_byte v0, v1, off offset:5 1265; GFX10-NEXT: scratch_store_byte v0, v1, off offset:4 1266; GFX10-NEXT: scratch_store_byte v0, v1, off offset:3 1267; GFX10-NEXT: scratch_store_byte v0, v1, off offset:2 1268; GFX10-NEXT: scratch_store_byte v0, v1, off offset:1 1269; GFX10-NEXT: scratch_store_byte v0, v2, off 1270; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:6 1271; GFX10-NEXT: s_waitcnt vmcnt(0) 1272; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:7 1273; GFX10-NEXT: s_waitcnt vmcnt(0) 1274; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:4 1275; GFX10-NEXT: s_waitcnt vmcnt(0) 1276; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:5 1277; GFX10-NEXT: s_waitcnt 
vmcnt(0) 1278; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:2 1279; GFX10-NEXT: s_waitcnt vmcnt(0) 1280; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:3 1281; GFX10-NEXT: s_waitcnt vmcnt(0) 1282; GFX10-NEXT: s_clause 0x1 1283; GFX10-NEXT: scratch_load_ubyte v1, v0, off 1284; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:1 1285; GFX10-NEXT: s_waitcnt vmcnt(0) 1286; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1287; GFX10-NEXT: s_setpc_b64 s[30:31] 1288bb: 1289 store volatile i64 15, i64 addrspace(5)* %arg, align 1 1290 %load = load volatile i64, i64 addrspace(5)* %arg, align 1 1291 ret void 1292} 1293 1294declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg) 1295declare i32 @llvm.amdgcn.workitem.id.x() 1296