1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s 3; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s 4; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9-PAL %s 5; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10-PAL %s 6 7define amdgpu_kernel void @zero_init_kernel() { 8; GFX9-LABEL: zero_init_kernel: 9; GFX9: ; %bb.0: 10; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 11; GFX9-NEXT: s_mov_b32 s0, 0 12; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 13; GFX9-NEXT: s_mov_b32 s1, s0 14; GFX9-NEXT: s_mov_b32 s2, s0 15; GFX9-NEXT: s_mov_b32 s3, s0 16; GFX9-NEXT: v_mov_b32_e32 v0, s0 17; GFX9-NEXT: v_mov_b32_e32 v1, s1 18; GFX9-NEXT: v_mov_b32_e32 v2, s2 19; GFX9-NEXT: v_mov_b32_e32 v3, s3 20; GFX9-NEXT: s_mov_b32 vcc_hi, 0 21; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64 22; GFX9-NEXT: s_mov_b32 vcc_hi, 0 23; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 24; GFX9-NEXT: s_mov_b32 vcc_hi, 0 25; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 26; GFX9-NEXT: s_mov_b32 vcc_hi, 0 27; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 28; GFX9-NEXT: s_endpgm 29; 30; GFX10-LABEL: zero_init_kernel: 31; GFX10: ; %bb.0: 32; GFX10-NEXT: s_add_u32 s0, s0, s3 33; GFX10-NEXT: s_addc_u32 s1, s1, 0 34; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 35; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 36; GFX10-NEXT: s_mov_b32 s0, 0 37; GFX10-NEXT: s_mov_b32 s1, s0 38; GFX10-NEXT: s_mov_b32 s2, s0 39; GFX10-NEXT: s_mov_b32 s3, s0 40; GFX10-NEXT: v_mov_b32_e32 v0, s0 41; GFX10-NEXT: v_mov_b32_e32 v1, s1 42; GFX10-NEXT: v_mov_b32_e32 v2, s2 43; GFX10-NEXT: v_mov_b32_e32 v3, s3 44; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:64 45; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48 46; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 47; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16 48; GFX10-NEXT: s_endpgm 49; 50; GFX9-PAL-LABEL: zero_init_kernel: 51; GFX9-PAL: ; %bb.0: 52; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 53; GFX9-PAL-NEXT: s_mov_b32 s2, s0 54; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 55; GFX9-PAL-NEXT: s_mov_b32 s0, 0 56; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 57; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 58; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 59; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 60; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 61; GFX9-PAL-NEXT: s_mov_b32 s1, s0 62; GFX9-PAL-NEXT: s_mov_b32 s2, s0 63; GFX9-PAL-NEXT: s_mov_b32 s3, s0 64; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 65; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 66; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 67; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 68; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64 69; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 70; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 71; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 72; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 73; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 74; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 75; GFX9-PAL-NEXT: s_endpgm 76; 77; GFX10-PAL-LABEL: zero_init_kernel: 78; GFX10-PAL: ; %bb.0: 79; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 80; GFX10-PAL-NEXT: s_mov_b32 s2, s0 81; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 82; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 83; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 84; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 85; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 86; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 87; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 88; GFX10-PAL-NEXT: s_mov_b32 s0, 0 89; GFX10-PAL-NEXT: s_mov_b32 s1, s0 90; GFX10-PAL-NEXT: s_mov_b32 s2, s0 91; GFX10-PAL-NEXT: s_mov_b32 s3, s0 92; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 93; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 94; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 95; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 96; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:64 97; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48 98; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 99; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16 100; GFX10-PAL-NEXT: s_endpgm 101 %alloca = alloca [32 x i16], align 2, addrspace(5) 102 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 103 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 104 ret void 105} 106 107define void @zero_init_foo() { 108; GFX9-LABEL: zero_init_foo: 109; GFX9: ; %bb.0: 110; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 111; GFX9-NEXT: s_mov_b32 s0, 0 112; GFX9-NEXT: s_mov_b32 s1, s0 113; GFX9-NEXT: s_mov_b32 s2, s0 114; GFX9-NEXT: s_mov_b32 s3, s0 115; GFX9-NEXT: v_mov_b32_e32 v0, s0 116; GFX9-NEXT: v_mov_b32_e32 v1, s1 117; GFX9-NEXT: v_mov_b32_e32 v2, s2 118; GFX9-NEXT: v_mov_b32_e32 v3, s3 119; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 120; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 121; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 122; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 123; GFX9-NEXT: s_waitcnt vmcnt(0) 124; GFX9-NEXT: s_setpc_b64 s[30:31] 125; 126; GFX10-LABEL: zero_init_foo: 127; GFX10: ; %bb.0: 128; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 129; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 130; GFX10-NEXT: s_mov_b32 s0, 0 131; GFX10-NEXT: s_mov_b32 s1, s0 132; GFX10-NEXT: s_mov_b32 s2, s0 133; GFX10-NEXT: s_mov_b32 s3, s0 134; GFX10-NEXT: v_mov_b32_e32 v0, s0 135; GFX10-NEXT: v_mov_b32_e32 v1, s1 136; GFX10-NEXT: v_mov_b32_e32 v2, s2 137; GFX10-NEXT: v_mov_b32_e32 v3, s3 138; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 139; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 140; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 141; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 142; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 143; GFX10-NEXT: s_setpc_b64 s[30:31] 144; 145; GFX9-PAL-LABEL: zero_init_foo: 146; GFX9-PAL: ; %bb.0: 147; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 148; GFX9-PAL-NEXT: s_mov_b32 s0, 0 149; GFX9-PAL-NEXT: s_mov_b32 s1, s0 150; GFX9-PAL-NEXT: s_mov_b32 s2, s0 151; GFX9-PAL-NEXT: s_mov_b32 s3, s0 152; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 153; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 154; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 155; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 156; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 157; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 158; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 159; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 160; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 161; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 162; 163; GFX10-PAL-LABEL: zero_init_foo: 164; GFX10-PAL: ; %bb.0: 165; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 166; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 167; GFX10-PAL-NEXT: s_mov_b32 s0, 0 168; GFX10-PAL-NEXT: s_mov_b32 s1, s0 169; GFX10-PAL-NEXT: s_mov_b32 s2, s0 170; GFX10-PAL-NEXT: s_mov_b32 s3, s0 171; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 172; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 173; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 174; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 175; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 176; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 177; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 178; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 179; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 180; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 181 %alloca = alloca [32 x i16], align 2, addrspace(5) 182 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 183 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 184 ret void 185} 186 187define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { 188; GFX9-LABEL: store_load_sindex_kernel: 189; GFX9: ; %bb.0: ; %bb 190; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 191; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 192; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 193; GFX9-NEXT: v_mov_b32_e32 v0, 15 194; GFX9-NEXT: s_waitcnt lgkmcnt(0) 195; GFX9-NEXT: s_lshl_b32 s1, s0, 2 196; GFX9-NEXT: s_and_b32 s0, s0, 15 197; GFX9-NEXT: s_lshl_b32 s0, s0, 2 198; GFX9-NEXT: s_add_u32 s1, 4, s1 199; GFX9-NEXT: scratch_store_dword off, v0, s1 200; GFX9-NEXT: s_waitcnt vmcnt(0) 201; GFX9-NEXT: s_add_u32 s0, 4, s0 202; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 203; GFX9-NEXT: s_waitcnt vmcnt(0) 204; GFX9-NEXT: s_endpgm 205; 206; GFX10-LABEL: store_load_sindex_kernel: 207; GFX10: ; %bb.0: ; %bb 208; GFX10-NEXT: s_add_u32 s2, s2, s5 209; GFX10-NEXT: s_addc_u32 s3, s3, 0 210; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 211; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 212; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 213; GFX10-NEXT: v_mov_b32_e32 v0, 15 214; GFX10-NEXT: s_waitcnt lgkmcnt(0) 215; GFX10-NEXT: s_and_b32 s1, s0, 15 216; GFX10-NEXT: s_lshl_b32 s0, s0, 2 217; GFX10-NEXT: s_lshl_b32 s1, s1, 2 218; GFX10-NEXT: s_add_u32 s0, 4, s0 219; GFX10-NEXT: s_add_u32 s1, 4, s1 220; GFX10-NEXT: scratch_store_dword off, v0, s0 221; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 222; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc 223; GFX10-NEXT: s_waitcnt vmcnt(0) 224; GFX10-NEXT: s_endpgm 225; 226; GFX9-PAL-LABEL: store_load_sindex_kernel: 227; GFX9-PAL: ; %bb.0: ; %bb 228; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 229; GFX9-PAL-NEXT: s_mov_b32 s4, s0 230; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 231; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 232; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 233; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 234; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 235; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 236; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 237; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 238; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 239; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 240; GFX9-PAL-NEXT: s_add_u32 s1, 4, s1 241; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 242; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 243; GFX9-PAL-NEXT: s_add_u32 s0, 4, s0 244; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 245; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 246; GFX9-PAL-NEXT: s_endpgm 247; 248; GFX10-PAL-LABEL: store_load_sindex_kernel: 249; GFX10-PAL: ; %bb.0: ; %bb 250; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] 251; GFX10-PAL-NEXT: s_mov_b32 s4, s0 252; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 253; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 254; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff 255; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 256; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 257; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 258; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 259; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 260; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 261; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 262; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 263; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 264; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 265; GFX10-PAL-NEXT: s_add_u32 s0, 4, s0 266; GFX10-PAL-NEXT: s_add_u32 s1, 4, s1 267; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 268; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 269; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 270; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 271; GFX10-PAL-NEXT: s_endpgm 272bb: 273 %i = alloca [32 x float], align 4, addrspace(5) 274 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 275 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 276 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 277 store volatile i32 15, i32 addrspace(5)* %i8, align 4 278 %i9 = and i32 %idx, 15 279 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 280 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 281 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 282 ret void 283} 284 285define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { 286; GFX9-LABEL: store_load_sindex_foo: 287; GFX9: ; %bb.0: ; %bb 288; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 289; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 290; GFX9-NEXT: s_lshl_b32 s0, s2, 2 291; GFX9-NEXT: s_add_u32 s0, 4, s0 292; GFX9-NEXT: v_mov_b32_e32 v0, 15 293; GFX9-NEXT: scratch_store_dword off, v0, s0 294; GFX9-NEXT: s_waitcnt vmcnt(0) 295; GFX9-NEXT: s_and_b32 s0, s2, 15 296; GFX9-NEXT: s_lshl_b32 s0, s0, 2 297; GFX9-NEXT: s_add_u32 s0, 4, s0 298; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 299; GFX9-NEXT: s_waitcnt vmcnt(0) 300; GFX9-NEXT: s_endpgm 301; 302; GFX10-LABEL: store_load_sindex_foo: 303; GFX10: ; %bb.0: ; %bb 304; GFX10-NEXT: s_add_u32 s0, s0, s3 305; GFX10-NEXT: s_addc_u32 s1, s1, 0 306; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 307; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 308; GFX10-NEXT: s_and_b32 s0, s2, 15 309; GFX10-NEXT: v_mov_b32_e32 v0, 15 310; GFX10-NEXT: s_lshl_b32 s1, s2, 2 311; GFX10-NEXT: s_lshl_b32 s0, s0, 2 312; GFX10-NEXT: s_add_u32 s1, 4, s1 313; GFX10-NEXT: s_add_u32 s0, 4, s0 314; GFX10-NEXT: scratch_store_dword off, v0, s1 315; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 316; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc 317; GFX10-NEXT: s_waitcnt vmcnt(0) 318; GFX10-NEXT: s_endpgm 319; 320; GFX9-PAL-LABEL: store_load_sindex_foo: 321; GFX9-PAL: ; %bb.0: ; %bb 322; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 323; GFX9-PAL-NEXT: s_mov_b32 s2, s0 324; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 325; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 326; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 327; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 328; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 329; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 330; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 331; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 332; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 333; GFX9-PAL-NEXT: s_add_u32 s1, 4, s1 334; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 335; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 336; GFX9-PAL-NEXT: s_add_u32 s0, 4, s0 337; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 338; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 339; GFX9-PAL-NEXT: s_endpgm 340; 341; GFX10-PAL-LABEL: store_load_sindex_foo: 342; GFX10-PAL: ; %bb.0: ; %bb 343; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 344; GFX10-PAL-NEXT: s_mov_b32 s2, s0 345; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 346; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 347; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 348; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 349; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 350; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 351; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 352; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 353; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 354; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 355; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 356; GFX10-PAL-NEXT: s_add_u32 s0, 4, s0 357; GFX10-PAL-NEXT: s_add_u32 s1, 4, s1 358; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 359; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 360; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 361; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 362; GFX10-PAL-NEXT: s_endpgm 363bb: 364 %i = alloca [32 x float], align 4, addrspace(5) 365 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 366 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 367 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 368 store volatile i32 15, i32 addrspace(5)* %i8, align 4 369 %i9 = and i32 %idx, 15 370 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 371 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 372 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 373 ret void 374} 375 376define amdgpu_kernel void @store_load_vindex_kernel() { 377; GFX9-LABEL: store_load_vindex_kernel: 378; GFX9: ; %bb.0: ; %bb 379; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 380; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 381; GFX9-NEXT: v_mov_b32_e32 v1, 4 382; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 383; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 384; GFX9-NEXT: v_mov_b32_e32 v3, 15 385; GFX9-NEXT: scratch_store_dword v2, v3, off 386; GFX9-NEXT: s_waitcnt vmcnt(0) 387; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 388; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc 389; GFX9-NEXT: s_waitcnt vmcnt(0) 390; GFX9-NEXT: s_endpgm 391; 392; GFX10-LABEL: store_load_vindex_kernel: 393; GFX10: ; %bb.0: ; %bb 394; GFX10-NEXT: s_add_u32 s0, s0, s3 395; GFX10-NEXT: s_addc_u32 s1, s1, 0 396; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 397; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 398; GFX10-NEXT: v_mov_b32_e32 v1, 4 399; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 400; GFX10-NEXT: v_mov_b32_e32 v3, 15 401; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 402; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 403; GFX10-NEXT: scratch_store_dword v2, v3, off 404; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 405; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 406; GFX10-NEXT: s_waitcnt vmcnt(0) 407; GFX10-NEXT: s_endpgm 408; 409; GFX9-PAL-LABEL: store_load_vindex_kernel: 410; GFX9-PAL: ; %bb.0: ; %bb 411; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 412; GFX9-PAL-NEXT: s_mov_b32 s2, s0 413; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 414; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 415; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4 416; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0 417; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 418; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 419; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 420; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 421; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 422; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 423; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 424; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0 425; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc 426; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 427; GFX9-PAL-NEXT: s_endpgm 428; 429; GFX10-PAL-LABEL: store_load_vindex_kernel: 430; GFX10-PAL: ; %bb.0: ; %bb 431; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 432; GFX10-PAL-NEXT: s_mov_b32 s2, s0 433; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 434; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 435; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 436; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 437; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 438; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 439; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 440; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 4 441; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 442; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 443; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 444; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 445; GFX10-PAL-NEXT: scratch_store_dword v2, v3, off 446; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 447; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 448; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 449; GFX10-PAL-NEXT: s_endpgm 450bb: 451 %i = alloca [32 x float], align 4, addrspace(5) 452 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 453 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() 454 %i3 = zext i32 %i2 to i64 455 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 456 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 457 store volatile i32 15, i32 addrspace(5)* %i8, align 4 458 %i9 = sub nsw i32 31, %i2 459 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 460 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 461 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 462 ret void 463} 464 465define void @store_load_vindex_foo(i32 %idx) { 466; GFX9-LABEL: store_load_vindex_foo: 467; GFX9: ; %bb.0: ; %bb 468; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 469; GFX9-NEXT: v_mov_b32_e32 v1, s32 470; GFX9-NEXT: v_mov_b32_e32 v3, 15 471; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 472; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 473; GFX9-NEXT: scratch_store_dword v2, v3, off 474; GFX9-NEXT: s_waitcnt vmcnt(0) 475; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 476; GFX9-NEXT: scratch_load_dword v0, v0, off glc 477; GFX9-NEXT: s_waitcnt vmcnt(0) 478; GFX9-NEXT: s_setpc_b64 s[30:31] 479; 480; GFX10-LABEL: store_load_vindex_foo: 481; GFX10: ; %bb.0: ; %bb 482; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 483; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 484; GFX10-NEXT: v_mov_b32_e32 v1, 15 485; GFX10-NEXT: v_mov_b32_e32 v2, s32 486; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 487; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 488; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 489; GFX10-NEXT: scratch_store_dword v0, v1, off 490; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 491; GFX10-NEXT: scratch_load_dword v0, v2, off glc dlc 492; GFX10-NEXT: s_waitcnt vmcnt(0) 493; GFX10-NEXT: s_setpc_b64 s[30:31] 494; 495; GFX9-PAL-LABEL: store_load_vindex_foo: 496; GFX9-PAL: ; %bb.0: ; %bb 497; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 498; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s32 499; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 500; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 501; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3 502; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 503; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 504; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 505; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc 506; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 507; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 508; 509; GFX10-PAL-LABEL: store_load_vindex_foo: 510; GFX10-PAL: ; %bb.0: ; %bb 511; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 512; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 513; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 514; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s32 515; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1 516; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2 517; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2 518; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off 519; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 520; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off glc dlc 521; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 522; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 523bb: 524 %i = alloca [32 x float], align 4, addrspace(5) 525 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 526 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 527 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 528 store volatile i32 15, i32 addrspace(5)* %i8, align 4 529 %i9 = and i32 %idx, 15 530 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 531 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 532 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 533 ret void 534} 535 536define void @private_ptr_foo(float addrspace(5)* nocapture %arg) { 537; GFX9-LABEL: private_ptr_foo: 538; GFX9: ; %bb.0: 539; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 540; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000 541; GFX9-NEXT: scratch_store_dword v0, v1, off offset:4 542; GFX9-NEXT: s_waitcnt vmcnt(0) 543; GFX9-NEXT: s_setpc_b64 s[30:31] 544; 545; GFX10-LABEL: private_ptr_foo: 546; GFX10: ; %bb.0: 547; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 548; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 549; GFX10-NEXT: v_mov_b32_e32 v1, 0x41200000 550; GFX10-NEXT: scratch_store_dword v0, v1, off offset:4 551; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 552; GFX10-NEXT: s_setpc_b64 s[30:31] 553; 554; GFX9-PAL-LABEL: private_ptr_foo: 555; GFX9-PAL: ; %bb.0: 556; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 557; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000 558; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:4 559; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 560; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 561; 562; GFX10-PAL-LABEL: private_ptr_foo: 563; GFX10-PAL: ; %bb.0: 564; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 565; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 566; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000 567; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:4 568; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 569; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 570 %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1 571 store float 1.000000e+01, float addrspace(5)* %gep, align 4 572 ret void 573} 574 575define amdgpu_kernel void @zero_init_small_offset_kernel() { 576; GFX9-LABEL: zero_init_small_offset_kernel: 577; GFX9: ; %bb.0: 578; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 579; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 580; GFX9-NEXT: s_mov_b32 vcc_hi, 0 581; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 582; GFX9-NEXT: s_waitcnt vmcnt(0) 583; GFX9-NEXT: s_mov_b32 s0, 0 584; GFX9-NEXT: s_mov_b32 s1, s0 585; GFX9-NEXT: s_mov_b32 s2, s0 586; GFX9-NEXT: s_mov_b32 s3, s0 587; GFX9-NEXT: v_mov_b32_e32 v0, s0 588; GFX9-NEXT: v_mov_b32_e32 v1, s1 589; GFX9-NEXT: v_mov_b32_e32 v2, s2 590; GFX9-NEXT: v_mov_b32_e32 v3, s3 591; GFX9-NEXT: s_mov_b32 vcc_hi, 0 592; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272 593; GFX9-NEXT: s_mov_b32 vcc_hi, 0 594; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288 595; GFX9-NEXT: s_mov_b32 vcc_hi, 0 596; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304 597; GFX9-NEXT: s_mov_b32 vcc_hi, 0 598; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320 599; GFX9-NEXT: s_endpgm 600; 601; GFX10-LABEL: zero_init_small_offset_kernel: 602; GFX10: ; %bb.0: 603; GFX10-NEXT: s_add_u32 s0, s0, s3 604; GFX10-NEXT: s_addc_u32 s1, s1, 0 605; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 606; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 607; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 608; GFX10-NEXT: s_waitcnt vmcnt(0) 609; GFX10-NEXT: s_mov_b32 s0, 0 610; GFX10-NEXT: s_mov_b32 s1, s0 611; GFX10-NEXT: s_mov_b32 s2, s0 612; GFX10-NEXT: s_mov_b32 s3, s0 613; GFX10-NEXT: v_mov_b32_e32 v0, s0 614; GFX10-NEXT: v_mov_b32_e32 v1, s1 615; GFX10-NEXT: v_mov_b32_e32 v2, s2 616; GFX10-NEXT: v_mov_b32_e32 v3, s3 617; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 618; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 619; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 620; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320 621; GFX10-NEXT: s_endpgm 622; 623; GFX9-PAL-LABEL: zero_init_small_offset_kernel: 624; GFX9-PAL: ; %bb.0: 625; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 626; GFX9-PAL-NEXT: s_mov_b32 s2, s0 627; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 628; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 629; GFX9-PAL-NEXT: s_mov_b32 s0, 0 630; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 631; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 632; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 633; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 634; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 635; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 636; GFX9-PAL-NEXT: s_mov_b32 s1, s0 637; GFX9-PAL-NEXT: s_mov_b32 s2, s0 638; GFX9-PAL-NEXT: s_mov_b32 s3, s0 639; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 640; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 641; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 642; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 643; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 644; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272 645; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 646; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288 647; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 648; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304 649; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 650; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320 651; GFX9-PAL-NEXT: s_endpgm 652; 653; GFX10-PAL-LABEL: zero_init_small_offset_kernel: 654; GFX10-PAL: ; %bb.0: 655; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 656; GFX10-PAL-NEXT: s_mov_b32 s2, s0 657; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 658; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 659; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 660; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 661; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 662; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 663; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 664; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 665; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 666; GFX10-PAL-NEXT: s_mov_b32 s0, 0 667; GFX10-PAL-NEXT: s_mov_b32 s1, s0 668; GFX10-PAL-NEXT: s_mov_b32 s2, s0 669; GFX10-PAL-NEXT: s_mov_b32 s3, s0 670; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 671; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 672; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 673; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 674; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 675; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 676; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 677; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320 678; GFX10-PAL-NEXT: s_endpgm 679 %padding = alloca [64 x i32], align 4, addrspace(5) 680 %alloca = alloca [32 x i16], align 2, addrspace(5) 681 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 682 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 683 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 684 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 685 ret void 686} 687 688define void @zero_init_small_offset_foo() { 689; GFX9-LABEL: zero_init_small_offset_foo: 690; GFX9: ; %bb.0: 691; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 692; GFX9-NEXT: scratch_load_dword v0, off, s32 glc 693; GFX9-NEXT: s_waitcnt vmcnt(0) 694; GFX9-NEXT: s_mov_b32 s0, 0 695; GFX9-NEXT: s_mov_b32 s1, s0 696; GFX9-NEXT: s_mov_b32 s2, s0 697; GFX9-NEXT: s_mov_b32 s3, s0 698; GFX9-NEXT: v_mov_b32_e32 v0, s0 699; GFX9-NEXT: v_mov_b32_e32 v1, s1 700; GFX9-NEXT: v_mov_b32_e32 v2, s2 701; GFX9-NEXT: v_mov_b32_e32 v3, s3 702; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 703; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 704; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 705; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 706; GFX9-NEXT: s_waitcnt vmcnt(0) 707; GFX9-NEXT: s_setpc_b64 s[30:31] 708; 709; GFX10-LABEL: zero_init_small_offset_foo: 710; GFX10: ; %bb.0: 711; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 712; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 713; GFX10-NEXT: scratch_load_dword v0, off, s32 glc dlc 714; GFX10-NEXT: s_waitcnt vmcnt(0) 715; GFX10-NEXT: s_mov_b32 s0, 0 716; GFX10-NEXT: s_mov_b32 s1, s0 717; GFX10-NEXT: s_mov_b32 s2, s0 718; GFX10-NEXT: s_mov_b32 s3, s0 719; GFX10-NEXT: v_mov_b32_e32 v0, s0 720; GFX10-NEXT: v_mov_b32_e32 v1, s1 721; GFX10-NEXT: v_mov_b32_e32 v2, s2 722; GFX10-NEXT: v_mov_b32_e32 v3, s3 723; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 724; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 725; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 726; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 727; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 728; GFX10-NEXT: s_setpc_b64 s[30:31] 729; 730; GFX9-PAL-LABEL: zero_init_small_offset_foo: 731; GFX9-PAL: ; %bb.0: 732; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 733; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 glc 734; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 735; GFX9-PAL-NEXT: s_mov_b32 s0, 0 736; GFX9-PAL-NEXT: s_mov_b32 s1, s0 737; GFX9-PAL-NEXT: s_mov_b32 s2, s0 738; GFX9-PAL-NEXT: s_mov_b32 s3, s0 739; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 740; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 741; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 742; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 743; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 744; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 745; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 746; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 747; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 748; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 749; 750; GFX10-PAL-LABEL: zero_init_small_offset_foo: 751; GFX10-PAL: ; %bb.0: 752; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 753; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 754; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 glc dlc 755; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 756; GFX10-PAL-NEXT: s_mov_b32 s0, 0 757; GFX10-PAL-NEXT: s_mov_b32 s1, s0 758; GFX10-PAL-NEXT: s_mov_b32 s2, s0 759; GFX10-PAL-NEXT: s_mov_b32 s3, s0 760; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 761; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 762; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 763; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 764; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 765; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 766; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 767; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 768; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 769; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 770 %padding = alloca [64 x i32], align 4, addrspace(5) 771 %alloca = alloca [32 x i16], align 2, addrspace(5) 772 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 773 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 774 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 775 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 776 ret void 777} 778 779define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { 780; GFX9-LABEL: store_load_sindex_small_offset_kernel: 781; GFX9: ; %bb.0: ; %bb 782; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 783; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 784; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 785; GFX9-NEXT: s_mov_b32 vcc_hi, 0 786; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 787; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 788; GFX9-NEXT: s_lshl_b32 s1, s0, 2 789; GFX9-NEXT: s_and_b32 s0, s0, 15 790; GFX9-NEXT: s_lshl_b32 s0, s0, 2 791; GFX9-NEXT: s_waitcnt vmcnt(0) 792; GFX9-NEXT: v_mov_b32_e32 v0, 15 793; GFX9-NEXT: s_add_u32 s1, 0x104, s1 794; GFX9-NEXT: scratch_store_dword off, v0, s1 795; GFX9-NEXT: s_waitcnt vmcnt(0) 796; GFX9-NEXT: s_add_u32 s0, 0x104, s0 797; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 798; GFX9-NEXT: s_waitcnt vmcnt(0) 799; GFX9-NEXT: s_endpgm 800; 801; GFX10-LABEL: store_load_sindex_small_offset_kernel: 802; GFX10: ; %bb.0: ; %bb 803; GFX10-NEXT: s_add_u32 s2, s2, s5 804; GFX10-NEXT: s_addc_u32 s3, s3, 0 805; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 806; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 807; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 808; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 809; GFX10-NEXT: s_waitcnt vmcnt(0) 810; GFX10-NEXT: v_mov_b32_e32 v0, 15 811; GFX10-NEXT: s_waitcnt lgkmcnt(0) 812; GFX10-NEXT: s_and_b32 s1, s0, 15 813; GFX10-NEXT: s_lshl_b32 s0, s0, 2 814; GFX10-NEXT: s_lshl_b32 s1, s1, 2 815; GFX10-NEXT: s_add_u32 s0, 0x104, s0 816; GFX10-NEXT: s_add_u32 s1, 0x104, s1 817; GFX10-NEXT: scratch_store_dword off, v0, s0 818; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 819; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc 820; GFX10-NEXT: s_waitcnt vmcnt(0) 821; GFX10-NEXT: s_endpgm 822; 823; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel: 824; GFX9-PAL: ; %bb.0: ; %bb 825; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 826; GFX9-PAL-NEXT: s_mov_b32 s4, s0 827; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 828; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 829; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 830; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 831; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 832; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 833; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 834; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 835; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 836; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 837; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 838; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 839; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 840; GFX9-PAL-NEXT: s_add_u32 s1, 0x104, s1 841; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 842; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 843; GFX9-PAL-NEXT: s_add_u32 s0, 0x104, s0 844; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 845; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 846; GFX9-PAL-NEXT: s_endpgm 847; 848; GFX10-PAL-LABEL: store_load_sindex_small_offset_kernel: 849; GFX10-PAL: ; %bb.0: ; %bb 850; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] 851; GFX10-PAL-NEXT: s_mov_b32 s4, s0 852; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 853; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 854; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff 855; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 856; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 857; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 858; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 859; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 860; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 861; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 862; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 863; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 864; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 865; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 866; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 867; GFX10-PAL-NEXT: s_add_u32 s0, 0x104, s0 868; GFX10-PAL-NEXT: s_add_u32 s1, 0x104, s1 869; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 870; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 871; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 872; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 873; GFX10-PAL-NEXT: s_endpgm 874bb: 875 %padding = alloca [64 x i32], align 4, addrspace(5) 876 %i = alloca [32 x float], align 4, addrspace(5) 877 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 878 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 879 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 880 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 881 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 882 store volatile i32 15, i32 addrspace(5)* %i8, align 4 883 %i9 = and i32 %idx, 15 884 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 885 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 886 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 887 ret void 888} 889 890define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { 891; GFX9-LABEL: store_load_sindex_small_offset_foo: 892; GFX9: ; %bb.0: ; %bb 893; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 894; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 895; GFX9-NEXT: s_mov_b32 vcc_hi, 0 896; GFX9-NEXT: s_lshl_b32 s0, s2, 2 897; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 898; GFX9-NEXT: s_waitcnt vmcnt(0) 899; GFX9-NEXT: s_add_u32 s0, 0x104, s0 900; GFX9-NEXT: v_mov_b32_e32 v0, 15 901; GFX9-NEXT: scratch_store_dword off, v0, s0 902; GFX9-NEXT: s_waitcnt vmcnt(0) 903; GFX9-NEXT: s_and_b32 s0, s2, 15 904; GFX9-NEXT: s_lshl_b32 s0, s0, 2 905; GFX9-NEXT: s_add_u32 s0, 0x104, s0 906; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 907; GFX9-NEXT: s_waitcnt vmcnt(0) 908; GFX9-NEXT: s_endpgm 909; 910; GFX10-LABEL: store_load_sindex_small_offset_foo: 911; GFX10: ; %bb.0: ; %bb 912; GFX10-NEXT: s_add_u32 s0, s0, s3 913; GFX10-NEXT: s_addc_u32 s1, s1, 0 914; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 915; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 916; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 917; GFX10-NEXT: s_waitcnt vmcnt(0) 918; GFX10-NEXT: s_and_b32 s0, s2, 15 919; GFX10-NEXT: v_mov_b32_e32 v0, 15 920; GFX10-NEXT: s_lshl_b32 s1, s2, 2 921; GFX10-NEXT: s_lshl_b32 s0, s0, 2 922; GFX10-NEXT: s_add_u32 s1, 0x104, s1 923; GFX10-NEXT: s_add_u32 s0, 0x104, s0 924; GFX10-NEXT: scratch_store_dword off, v0, s1 925; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 926; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc 927; GFX10-NEXT: s_waitcnt vmcnt(0) 928; GFX10-NEXT: s_endpgm 929; 930; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo: 931; GFX9-PAL: ; %bb.0: ; %bb 932; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 933; GFX9-PAL-NEXT: s_mov_b32 s2, s0 934; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 935; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 936; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 937; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 938; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 939; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 940; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 941; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 942; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 943; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 944; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 945; GFX9-PAL-NEXT: s_add_u32 s1, 0x104, s1 946; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 947; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 948; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 949; GFX9-PAL-NEXT: s_add_u32 s0, 0x104, s0 950; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 951; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 952; GFX9-PAL-NEXT: s_endpgm 953; 954; GFX10-PAL-LABEL: store_load_sindex_small_offset_foo: 955; GFX10-PAL: ; %bb.0: ; %bb 956; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 957; GFX10-PAL-NEXT: s_mov_b32 s2, s0 958; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 959; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 960; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 961; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 962; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 963; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 964; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 965; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 966; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 967; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 968; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 969; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 970; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 971; GFX10-PAL-NEXT: s_add_u32 s0, 0x104, s0 972; GFX10-PAL-NEXT: s_add_u32 s1, 0x104, s1 973; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 974; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 975; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 976; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 977; GFX10-PAL-NEXT: s_endpgm 978bb: 979 %padding = alloca [64 x i32], align 4, addrspace(5) 980 %i = alloca [32 x float], align 4, addrspace(5) 981 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 982 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 983 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 984 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 985 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 986 store volatile i32 15, i32 addrspace(5)* %i8, align 4 987 %i9 = and i32 %idx, 15 988 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 989 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 990 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 991 ret void 992} 993 994define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { 995; GFX9-LABEL: store_load_vindex_small_offset_kernel: 996; GFX9: ; %bb.0: ; %bb 997; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 998; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 999; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1000; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc 1001; GFX9-NEXT: s_waitcnt vmcnt(0) 1002; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1003; GFX9-NEXT: v_mov_b32_e32 v1, 0x104 1004; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 1005; GFX9-NEXT: v_mov_b32_e32 v3, 15 1006; GFX9-NEXT: scratch_store_dword v2, v3, off 1007; GFX9-NEXT: s_waitcnt vmcnt(0) 1008; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 1009; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc 1010; GFX9-NEXT: s_waitcnt vmcnt(0) 1011; GFX9-NEXT: s_endpgm 1012; 1013; GFX10-LABEL: store_load_vindex_small_offset_kernel: 1014; GFX10: ; %bb.0: ; %bb 1015; GFX10-NEXT: s_add_u32 s0, s0, s3 1016; GFX10-NEXT: s_addc_u32 s1, s1, 0 1017; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 1018; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1019; GFX10-NEXT: v_mov_b32_e32 v1, 0x104 1020; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1021; GFX10-NEXT: v_mov_b32_e32 v3, 15 1022; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 1023; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 1024; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc 1025; GFX10-NEXT: s_waitcnt vmcnt(0) 1026; GFX10-NEXT: scratch_store_dword v2, v3, off 1027; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1028; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 1029; GFX10-NEXT: s_waitcnt vmcnt(0) 1030; GFX10-NEXT: s_endpgm 1031; 1032; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel: 1033; GFX9-PAL: ; %bb.0: ; %bb 1034; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1035; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1036; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1037; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1038; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1039; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 1040; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1041; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1042; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1043; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1044; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc 1045; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1046; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x104 1047; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0 1048; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 1049; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1050; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0 1051; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc 1052; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1053; GFX9-PAL-NEXT: s_endpgm 1054; 1055; GFX10-PAL-LABEL: store_load_vindex_small_offset_kernel: 1056; GFX10-PAL: ; %bb.0: ; %bb 1057; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 1058; GFX10-PAL-NEXT: s_mov_b32 s2, s0 1059; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1060; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 1061; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1062; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 1063; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 1064; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1065; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1066; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x104 1067; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1068; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 1069; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 1070; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 1071; GFX10-PAL-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc 1072; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1073; GFX10-PAL-NEXT: scratch_store_dword v2, v3, off 1074; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1075; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 1076; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1077; GFX10-PAL-NEXT: s_endpgm 1078bb: 1079 %padding = alloca [64 x i32], align 4, addrspace(5) 1080 %i = alloca [32 x float], align 4, addrspace(5) 1081 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 1082 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1083 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1084 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() 1085 %i3 = zext i32 %i2 to i64 1086 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 1087 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1088 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1089 %i9 = sub nsw i32 31, %i2 1090 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1091 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1092 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1093 ret void 1094} 1095 1096define void @store_load_vindex_small_offset_foo(i32 %idx) { 1097; GFX9-LABEL: store_load_vindex_small_offset_foo: 1098; GFX9: ; %bb.0: ; %bb 1099; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1100; GFX9-NEXT: scratch_load_dword v1, off, s32 glc 1101; GFX9-NEXT: s_waitcnt vmcnt(0) 1102; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x100 1103; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi 1104; GFX9-NEXT: v_mov_b32_e32 v3, 15 1105; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 1106; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 1107; GFX9-NEXT: scratch_store_dword v2, v3, off 1108; GFX9-NEXT: s_waitcnt vmcnt(0) 1109; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 1110; GFX9-NEXT: scratch_load_dword v0, v0, off glc 1111; GFX9-NEXT: s_waitcnt vmcnt(0) 1112; GFX9-NEXT: s_setpc_b64 s[30:31] 1113; 1114; GFX10-LABEL: store_load_vindex_small_offset_foo: 1115; GFX10: ; %bb.0: ; %bb 1116; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1117; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1118; GFX10-NEXT: v_mov_b32_e32 v1, 15 1119; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x100 1120; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo 1121; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 1122; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 1123; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 1124; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc 1125; GFX10-NEXT: s_waitcnt vmcnt(0) 1126; GFX10-NEXT: scratch_store_dword v0, v1, off 1127; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1128; GFX10-NEXT: scratch_load_dword v0, v2, off glc dlc 1129; GFX10-NEXT: s_waitcnt vmcnt(0) 1130; GFX10-NEXT: s_setpc_b64 s[30:31] 1131; 1132; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo: 1133; GFX9-PAL: ; %bb.0: ; %bb 1134; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1135; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 glc 1136; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1137; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x100 1138; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi 1139; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 1140; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 1141; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3 1142; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 1143; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1144; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 1145; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc 1146; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1147; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 1148; 1149; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo: 1150; GFX10-PAL: ; %bb.0: ; %bb 1151; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1152; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1153; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 1154; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x100 1155; GFX10-PAL-NEXT: v_mov_b32_e32 v2, vcc_lo 1156; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1 1157; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2 1158; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2 1159; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 glc dlc 1160; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1161; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off 1162; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1163; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off glc dlc 1164; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1165; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 1166bb: 1167 %padding = alloca [64 x i32], align 4, addrspace(5) 1168 %i = alloca [32 x float], align 4, addrspace(5) 1169 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 1170 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1171 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1172 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 1173 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1174 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1175 %i9 = and i32 %idx, 15 1176 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1177 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1178 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1179 ret void 1180} 1181 1182define amdgpu_kernel void @zero_init_large_offset_kernel() { 1183; GFX9-LABEL: zero_init_large_offset_kernel: 1184; GFX9: ; %bb.0: 1185; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 1186; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 1187; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1188; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:16 glc 1189; GFX9-NEXT: s_waitcnt vmcnt(0) 1190; GFX9-NEXT: s_mov_b32 s0, 0 1191; GFX9-NEXT: s_mov_b32 s1, s0 1192; GFX9-NEXT: s_mov_b32 s2, s0 1193; GFX9-NEXT: s_mov_b32 s3, s0 1194; GFX9-NEXT: v_mov_b32_e32 v0, s0 1195; GFX9-NEXT: v_mov_b32_e32 v1, s1 1196; GFX9-NEXT: v_mov_b32_e32 v2, s2 1197; GFX9-NEXT: v_mov_b32_e32 v3, s3 1198; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 1199; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 1200; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 1201; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 1202; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 1203; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 1204; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 1205; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 1206; GFX9-NEXT: s_endpgm 1207; 1208; GFX10-LABEL: zero_init_large_offset_kernel: 1209; GFX10: ; %bb.0: 1210; GFX10-NEXT: s_add_u32 s0, s0, s3 1211; GFX10-NEXT: s_addc_u32 s1, s1, 0 1212; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 1213; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1214; GFX10-NEXT: scratch_load_dword v0, off, off offset:16 glc dlc 1215; GFX10-NEXT: s_waitcnt vmcnt(0) 1216; GFX10-NEXT: s_mov_b32 s0, 0 1217; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 1218; GFX10-NEXT: s_mov_b32 s1, s0 1219; GFX10-NEXT: s_mov_b32 s2, s0 1220; GFX10-NEXT: s_mov_b32 s3, s0 1221; GFX10-NEXT: v_mov_b32_e32 v0, s0 1222; GFX10-NEXT: v_mov_b32_e32 v1, s1 1223; GFX10-NEXT: v_mov_b32_e32 v2, s2 1224; GFX10-NEXT: v_mov_b32_e32 v3, s3 1225; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 1226; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 1227; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1228; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 1229; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1230; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 1231; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1232; GFX10-NEXT: s_endpgm 1233; 1234; GFX9-PAL-LABEL: zero_init_large_offset_kernel: 1235; GFX9-PAL: ; %bb.0: 1236; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1237; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1238; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1239; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1240; GFX9-PAL-NEXT: s_mov_b32 s0, 0 1241; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1242; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1243; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1244; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1245; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:16 glc 1246; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1247; GFX9-PAL-NEXT: s_mov_b32 s1, s0 1248; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1249; GFX9-PAL-NEXT: s_mov_b32 s3, s0 1250; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 1251; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 1252; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 1253; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 1254; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 1255; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 1256; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 1257; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 1258; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 1259; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 1260; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 1261; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 1262; GFX9-PAL-NEXT: s_endpgm 1263; 1264; GFX10-PAL-LABEL: zero_init_large_offset_kernel: 1265; GFX10-PAL: ; %bb.0: 1266; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 1267; GFX10-PAL-NEXT: s_mov_b32 s2, s0 1268; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1269; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 1270; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1271; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 1272; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 1273; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1274; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1275; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:16 glc dlc 1276; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1277; GFX10-PAL-NEXT: s_mov_b32 s0, 0 1278; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1279; GFX10-PAL-NEXT: s_mov_b32 s1, s0 1280; GFX10-PAL-NEXT: s_mov_b32 s2, s0 1281; GFX10-PAL-NEXT: s_mov_b32 s3, s0 1282; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 1283; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 1284; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 1285; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 1286; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 1287; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1288; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1289; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1290; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1291; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1292; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1293; GFX10-PAL-NEXT: s_endpgm 1294 %padding = alloca [4096 x i32], align 4, addrspace(5) 1295 %alloca = alloca [32 x i16], align 2, addrspace(5) 1296 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 1297 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1298 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 1299 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 1300 ret void 1301} 1302 1303define void @zero_init_large_offset_foo() { 1304; GFX9-LABEL: zero_init_large_offset_foo: 1305; GFX9: ; %bb.0: 1306; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1307; GFX9-NEXT: scratch_load_dword v0, off, s32 glc 1308; GFX9-NEXT: s_waitcnt vmcnt(0) 1309; GFX9-NEXT: s_mov_b32 s0, 0 1310; GFX9-NEXT: s_mov_b32 s1, s0 1311; GFX9-NEXT: s_mov_b32 s2, s0 1312; GFX9-NEXT: s_mov_b32 s3, s0 1313; GFX9-NEXT: v_mov_b32_e32 v0, s0 1314; GFX9-NEXT: v_mov_b32_e32 v1, s1 1315; GFX9-NEXT: v_mov_b32_e32 v2, s2 1316; GFX9-NEXT: v_mov_b32_e32 v3, s3 1317; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1318; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 1319; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1320; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 1321; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1322; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 1323; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1324; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 1325; GFX9-NEXT: s_waitcnt vmcnt(0) 1326; GFX9-NEXT: s_setpc_b64 s[30:31] 1327; 1328; GFX10-LABEL: zero_init_large_offset_foo: 1329; GFX10: ; %bb.0: 1330; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1331; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1332; GFX10-NEXT: scratch_load_dword v0, off, s32 glc dlc 1333; GFX10-NEXT: s_waitcnt vmcnt(0) 1334; GFX10-NEXT: s_mov_b32 s0, 0 1335; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1336; GFX10-NEXT: s_mov_b32 s1, s0 1337; GFX10-NEXT: s_mov_b32 s2, s0 1338; GFX10-NEXT: s_mov_b32 s3, s0 1339; GFX10-NEXT: v_mov_b32_e32 v0, s0 1340; GFX10-NEXT: v_mov_b32_e32 v1, s1 1341; GFX10-NEXT: v_mov_b32_e32 v2, s2 1342; GFX10-NEXT: v_mov_b32_e32 v3, s3 1343; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 1344; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1345; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1346; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1347; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1348; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1349; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1350; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1351; GFX10-NEXT: s_setpc_b64 s[30:31] 1352; 1353; GFX9-PAL-LABEL: zero_init_large_offset_foo: 1354; GFX9-PAL: ; %bb.0: 1355; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1356; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 glc 1357; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1358; GFX9-PAL-NEXT: s_mov_b32 s0, 0 1359; GFX9-PAL-NEXT: s_mov_b32 s1, s0 1360; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1361; GFX9-PAL-NEXT: s_mov_b32 s3, s0 1362; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 1363; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 1364; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 1365; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 1366; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1367; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 1368; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1369; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 1370; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1371; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 1372; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1373; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 1374; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1375; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 1376; 1377; GFX10-PAL-LABEL: zero_init_large_offset_foo: 1378; GFX10-PAL: ; %bb.0: 1379; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1380; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1381; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 glc dlc 1382; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1383; GFX10-PAL-NEXT: s_mov_b32 s0, 0 1384; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1385; GFX10-PAL-NEXT: s_mov_b32 s1, s0 1386; GFX10-PAL-NEXT: s_mov_b32 s2, s0 1387; GFX10-PAL-NEXT: s_mov_b32 s3, s0 1388; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 1389; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 1390; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 1391; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 1392; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 1393; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1394; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1395; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1396; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1397; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1398; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1399; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1400; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 1401 %padding = alloca [4096 x i32], align 4, addrspace(5) 1402 %alloca = alloca [32 x i16], align 2, addrspace(5) 1403 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 1404 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1405 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 1406 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 1407 ret void 1408} 1409 1410define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { 1411; GFX9-LABEL: store_load_sindex_large_offset_kernel: 1412; GFX9: ; %bb.0: ; %bb 1413; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 1414; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 1415; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1416; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1417; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 1418; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1419; GFX9-NEXT: s_lshl_b32 s1, s0, 2 1420; GFX9-NEXT: s_and_b32 s0, s0, 15 1421; GFX9-NEXT: s_lshl_b32 s0, s0, 2 1422; GFX9-NEXT: s_waitcnt vmcnt(0) 1423; GFX9-NEXT: v_mov_b32_e32 v0, 15 1424; GFX9-NEXT: s_add_u32 s1, 0x4004, s1 1425; GFX9-NEXT: scratch_store_dword off, v0, s1 1426; GFX9-NEXT: s_waitcnt vmcnt(0) 1427; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 1428; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 1429; GFX9-NEXT: s_waitcnt vmcnt(0) 1430; GFX9-NEXT: s_endpgm 1431; 1432; GFX10-LABEL: store_load_sindex_large_offset_kernel: 1433; GFX10: ; %bb.0: ; %bb 1434; GFX10-NEXT: s_add_u32 s2, s2, s5 1435; GFX10-NEXT: s_addc_u32 s3, s3, 0 1436; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1437; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1438; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 1439; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1440; GFX10-NEXT: s_waitcnt vmcnt(0) 1441; GFX10-NEXT: v_mov_b32_e32 v0, 15 1442; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1443; GFX10-NEXT: s_and_b32 s1, s0, 15 1444; GFX10-NEXT: s_lshl_b32 s0, s0, 2 1445; GFX10-NEXT: s_lshl_b32 s1, s1, 2 1446; GFX10-NEXT: s_add_u32 s0, 0x4004, s0 1447; GFX10-NEXT: s_add_u32 s1, 0x4004, s1 1448; GFX10-NEXT: scratch_store_dword off, v0, s0 1449; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1450; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc 1451; GFX10-NEXT: s_waitcnt vmcnt(0) 1452; GFX10-NEXT: s_endpgm 1453; 1454; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel: 1455; GFX9-PAL: ; %bb.0: ; %bb 1456; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 1457; GFX9-PAL-NEXT: s_mov_b32 s4, s0 1458; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1459; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1460; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 1461; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1462; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 1463; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 1464; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 1465; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 1466; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 1467; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 1468; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1469; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 1470; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 1471; GFX9-PAL-NEXT: s_add_u32 s1, 0x4004, s1 1472; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 1473; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1474; GFX9-PAL-NEXT: s_add_u32 s0, 0x4004, s0 1475; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 1476; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1477; GFX9-PAL-NEXT: s_endpgm 1478; 1479; GFX10-PAL-LABEL: store_load_sindex_large_offset_kernel: 1480; GFX10-PAL: ; %bb.0: ; %bb 1481; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] 1482; GFX10-PAL-NEXT: s_mov_b32 s4, s0 1483; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1484; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 1485; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff 1486; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 1487; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 1488; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 1489; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 1490; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 1491; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1492; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1493; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 1494; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 1495; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 1496; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 1497; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 1498; GFX10-PAL-NEXT: s_add_u32 s0, 0x4004, s0 1499; GFX10-PAL-NEXT: s_add_u32 s1, 0x4004, s1 1500; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 1501; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1502; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1503; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1504; GFX10-PAL-NEXT: s_endpgm 1505bb: 1506 %padding = alloca [4096 x i32], align 4, addrspace(5) 1507 %i = alloca [32 x float], align 4, addrspace(5) 1508 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 1509 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1510 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1511 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 1512 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1513 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1514 %i9 = and i32 %idx, 15 1515 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1516 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1517 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1518 ret void 1519} 1520 1521define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { 1522; GFX9-LABEL: store_load_sindex_large_offset_foo: 1523; GFX9: ; %bb.0: ; %bb 1524; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 1525; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 1526; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1527; GFX9-NEXT: s_lshl_b32 s0, s2, 2 1528; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 1529; GFX9-NEXT: s_waitcnt vmcnt(0) 1530; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 1531; GFX9-NEXT: v_mov_b32_e32 v0, 15 1532; GFX9-NEXT: scratch_store_dword off, v0, s0 1533; GFX9-NEXT: s_waitcnt vmcnt(0) 1534; GFX9-NEXT: s_and_b32 s0, s2, 15 1535; GFX9-NEXT: s_lshl_b32 s0, s0, 2 1536; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 1537; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 1538; GFX9-NEXT: s_waitcnt vmcnt(0) 1539; GFX9-NEXT: s_endpgm 1540; 1541; GFX10-LABEL: store_load_sindex_large_offset_foo: 1542; GFX10: ; %bb.0: ; %bb 1543; GFX10-NEXT: s_add_u32 s0, s0, s3 1544; GFX10-NEXT: s_addc_u32 s1, s1, 0 1545; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 1546; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1547; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1548; GFX10-NEXT: s_waitcnt vmcnt(0) 1549; GFX10-NEXT: s_and_b32 s0, s2, 15 1550; GFX10-NEXT: v_mov_b32_e32 v0, 15 1551; GFX10-NEXT: s_lshl_b32 s1, s2, 2 1552; GFX10-NEXT: s_lshl_b32 s0, s0, 2 1553; GFX10-NEXT: s_add_u32 s1, 0x4004, s1 1554; GFX10-NEXT: s_add_u32 s0, 0x4004, s0 1555; GFX10-NEXT: scratch_store_dword off, v0, s1 1556; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1557; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc 1558; GFX10-NEXT: s_waitcnt vmcnt(0) 1559; GFX10-NEXT: s_endpgm 1560; 1561; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo: 1562; GFX9-PAL: ; %bb.0: ; %bb 1563; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1564; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1565; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1566; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1567; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1568; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1569; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1570; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1571; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 1572; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 1573; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 1574; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1575; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 1576; GFX9-PAL-NEXT: s_add_u32 s1, 0x4004, s1 1577; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 1578; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 1579; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1580; GFX9-PAL-NEXT: s_add_u32 s0, 0x4004, s0 1581; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 1582; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1583; GFX9-PAL-NEXT: s_endpgm 1584; 1585; GFX10-PAL-LABEL: store_load_sindex_large_offset_foo: 1586; GFX10-PAL: ; %bb.0: ; %bb 1587; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 1588; GFX10-PAL-NEXT: s_mov_b32 s2, s0 1589; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1590; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 1591; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1592; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 1593; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 1594; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1595; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1596; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1597; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1598; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 1599; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 1600; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 1601; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 1602; GFX10-PAL-NEXT: s_add_u32 s0, 0x4004, s0 1603; GFX10-PAL-NEXT: s_add_u32 s1, 0x4004, s1 1604; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 1605; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1606; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1607; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1608; GFX10-PAL-NEXT: s_endpgm 1609bb: 1610 %padding = alloca [4096 x i32], align 4, addrspace(5) 1611 %i = alloca [32 x float], align 4, addrspace(5) 1612 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 1613 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1614 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1615 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 1616 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1617 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1618 %i9 = and i32 %idx, 15 1619 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1620 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1621 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1622 ret void 1623} 1624 1625define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { 1626; GFX9-LABEL: store_load_vindex_large_offset_kernel: 1627; GFX9: ; %bb.0: ; %bb 1628; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 1629; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 1630; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1631; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc 1632; GFX9-NEXT: s_waitcnt vmcnt(0) 1633; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1634; GFX9-NEXT: v_mov_b32_e32 v1, 0x4004 1635; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 1636; GFX9-NEXT: v_mov_b32_e32 v3, 15 1637; GFX9-NEXT: scratch_store_dword v2, v3, off 1638; GFX9-NEXT: s_waitcnt vmcnt(0) 1639; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 1640; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc 1641; GFX9-NEXT: s_waitcnt vmcnt(0) 1642; GFX9-NEXT: s_endpgm 1643; 1644; GFX10-LABEL: store_load_vindex_large_offset_kernel: 1645; GFX10: ; %bb.0: ; %bb 1646; GFX10-NEXT: s_add_u32 s0, s0, s3 1647; GFX10-NEXT: s_addc_u32 s1, s1, 0 1648; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 1649; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1650; GFX10-NEXT: v_mov_b32_e32 v1, 0x4004 1651; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1652; GFX10-NEXT: v_mov_b32_e32 v3, 15 1653; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 1654; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 1655; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc 1656; GFX10-NEXT: s_waitcnt vmcnt(0) 1657; GFX10-NEXT: scratch_store_dword v2, v3, off 1658; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1659; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 1660; GFX10-NEXT: s_waitcnt vmcnt(0) 1661; GFX10-NEXT: s_endpgm 1662; 1663; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel: 1664; GFX9-PAL: ; %bb.0: ; %bb 1665; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1666; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1667; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1668; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1669; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1670; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 1671; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1672; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1673; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1674; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1675; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc 1676; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1677; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 1678; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0 1679; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 1680; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1681; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0 1682; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc 1683; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1684; GFX9-PAL-NEXT: s_endpgm 1685; 1686; GFX10-PAL-LABEL: store_load_vindex_large_offset_kernel: 1687; GFX10-PAL: ; %bb.0: ; %bb 1688; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 1689; GFX10-PAL-NEXT: s_mov_b32 s2, s0 1690; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1691; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 1692; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1693; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 1694; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 1695; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1696; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1697; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 1698; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1699; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 1700; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 1701; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 1702; GFX10-PAL-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc 1703; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1704; GFX10-PAL-NEXT: scratch_store_dword v2, v3, off 1705; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1706; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 1707; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1708; GFX10-PAL-NEXT: s_endpgm 1709bb: 1710 %padding = alloca [4096 x i32], align 4, addrspace(5) 1711 %i = alloca [32 x float], align 4, addrspace(5) 1712 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 1713 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1714 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1715 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() 1716 %i3 = zext i32 %i2 to i64 1717 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 1718 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1719 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1720 %i9 = sub nsw i32 31, %i2 1721 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1722 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1723 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1724 ret void 1725} 1726 1727define void @store_load_vindex_large_offset_foo(i32 %idx) { 1728; GFX9-LABEL: store_load_vindex_large_offset_foo: 1729; GFX9: ; %bb.0: ; %bb 1730; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1731; GFX9-NEXT: scratch_load_dword v1, off, s32 glc 1732; GFX9-NEXT: s_waitcnt vmcnt(0) 1733; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1734; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi 1735; GFX9-NEXT: v_mov_b32_e32 v3, 15 1736; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 1737; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 1738; GFX9-NEXT: scratch_store_dword v2, v3, off 1739; GFX9-NEXT: s_waitcnt vmcnt(0) 1740; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 1741; GFX9-NEXT: scratch_load_dword v0, v0, off glc 1742; GFX9-NEXT: s_waitcnt vmcnt(0) 1743; GFX9-NEXT: s_setpc_b64 s[30:31] 1744; 1745; GFX10-LABEL: store_load_vindex_large_offset_foo: 1746; GFX10: ; %bb.0: ; %bb 1747; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1748; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1749; GFX10-NEXT: v_mov_b32_e32 v1, 15 1750; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1751; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo 1752; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 1753; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 1754; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 1755; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc 1756; GFX10-NEXT: s_waitcnt vmcnt(0) 1757; GFX10-NEXT: scratch_store_dword v0, v1, off 1758; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1759; GFX10-NEXT: scratch_load_dword v0, v2, off glc dlc 1760; GFX10-NEXT: s_waitcnt vmcnt(0) 1761; GFX10-NEXT: s_setpc_b64 s[30:31] 1762; 1763; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo: 1764; GFX9-PAL: ; %bb.0: ; %bb 1765; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1766; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 glc 1767; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1768; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1769; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi 1770; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 1771; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 1772; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3 1773; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 1774; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1775; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 1776; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc 1777; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1778; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 1779; 1780; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo: 1781; GFX10-PAL: ; %bb.0: ; %bb 1782; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1783; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1784; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 1785; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1786; GFX10-PAL-NEXT: v_mov_b32_e32 v2, vcc_lo 1787; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1 1788; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2 1789; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2 1790; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 glc dlc 1791; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1792; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off 1793; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1794; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off glc dlc 1795; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1796; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 1797bb: 1798 %padding = alloca [4096 x i32], align 4, addrspace(5) 1799 %i = alloca [32 x float], align 4, addrspace(5) 1800 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 1801 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1802 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1803 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 1804 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1805 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1806 %i9 = and i32 %idx, 15 1807 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1808 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1809 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1810 ret void 1811} 1812 1813define amdgpu_kernel void @store_load_large_imm_offset_kernel() { 1814; GFX9-LABEL: store_load_large_imm_offset_kernel: 1815; GFX9: ; %bb.0: ; %bb 1816; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 1817; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 1818; GFX9-NEXT: s_movk_i32 s0, 0x3000 1819; GFX9-NEXT: v_mov_b32_e32 v0, 13 1820; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1821; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 1822; GFX9-NEXT: s_waitcnt vmcnt(0) 1823; GFX9-NEXT: s_add_u32 s0, 4, s0 1824; GFX9-NEXT: v_mov_b32_e32 v0, 15 1825; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 1826; GFX9-NEXT: s_waitcnt vmcnt(0) 1827; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc 1828; GFX9-NEXT: s_waitcnt vmcnt(0) 1829; GFX9-NEXT: s_endpgm 1830; 1831; GFX10-LABEL: store_load_large_imm_offset_kernel: 1832; GFX10: ; %bb.0: ; %bb 1833; GFX10-NEXT: s_add_u32 s0, s0, s3 1834; GFX10-NEXT: s_addc_u32 s1, s1, 0 1835; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 1836; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1837; GFX10-NEXT: v_mov_b32_e32 v0, 13 1838; GFX10-NEXT: v_mov_b32_e32 v1, 15 1839; GFX10-NEXT: s_movk_i32 s0, 0x3800 1840; GFX10-NEXT: s_add_u32 s0, 4, s0 1841; GFX10-NEXT: scratch_store_dword off, v0, off offset:4 1842; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1843; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 1844; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1845; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 1846; GFX10-NEXT: s_waitcnt vmcnt(0) 1847; GFX10-NEXT: s_endpgm 1848; 1849; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel: 1850; GFX9-PAL: ; %bb.0: ; %bb 1851; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1852; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1853; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1854; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 1855; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1856; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 1857; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1858; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1859; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1860; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1861; GFX9-PAL-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 1862; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1863; GFX9-PAL-NEXT: s_add_u32 s0, 4, s0 1864; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 1865; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 1866; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1867; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc 1868; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1869; GFX9-PAL-NEXT: s_endpgm 1870; 1871; GFX10-PAL-LABEL: store_load_large_imm_offset_kernel: 1872; GFX10-PAL: ; %bb.0: ; %bb 1873; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 1874; GFX10-PAL-NEXT: s_mov_b32 s2, s0 1875; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1876; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 1877; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1878; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 1879; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 1880; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1881; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1882; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13 1883; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 1884; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800 1885; GFX10-PAL-NEXT: s_add_u32 s0, 4, s0 1886; GFX10-PAL-NEXT: scratch_store_dword off, v0, off offset:4 1887; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1888; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 1889; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1890; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 1891; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1892; GFX10-PAL-NEXT: s_endpgm 1893bb: 1894 %i = alloca [4096 x i32], align 4, addrspace(5) 1895 %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef 1896 store volatile i32 13, i32 addrspace(5)* %i1, align 4 1897 %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 1898 store volatile i32 15, i32 addrspace(5)* %i7, align 4 1899 %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 1900 %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4 1901 ret void 1902} 1903 1904define void @store_load_large_imm_offset_foo() { 1905; GFX9-LABEL: store_load_large_imm_offset_foo: 1906; GFX9: ; %bb.0: ; %bb 1907; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1908; GFX9-NEXT: s_movk_i32 s0, 0x3000 1909; GFX9-NEXT: v_mov_b32_e32 v0, 13 1910; GFX9-NEXT: scratch_store_dword off, v0, s32 1911; GFX9-NEXT: s_waitcnt vmcnt(0) 1912; GFX9-NEXT: s_add_u32 s0, s32, s0 1913; GFX9-NEXT: v_mov_b32_e32 v0, 15 1914; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 1915; GFX9-NEXT: s_waitcnt vmcnt(0) 1916; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc 1917; GFX9-NEXT: s_waitcnt vmcnt(0) 1918; GFX9-NEXT: s_setpc_b64 s[30:31] 1919; 1920; GFX10-LABEL: store_load_large_imm_offset_foo: 1921; GFX10: ; %bb.0: ; %bb 1922; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1923; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1924; GFX10-NEXT: v_mov_b32_e32 v0, 13 1925; GFX10-NEXT: v_mov_b32_e32 v1, 15 1926; GFX10-NEXT: s_movk_i32 s0, 0x3800 1927; GFX10-NEXT: s_add_u32 s0, s32, s0 1928; GFX10-NEXT: scratch_store_dword off, v0, s32 1929; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1930; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 1931; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1932; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 1933; GFX10-NEXT: s_waitcnt vmcnt(0) 1934; GFX10-NEXT: s_setpc_b64 s[30:31] 1935; 1936; GFX9-PAL-LABEL: store_load_large_imm_offset_foo: 1937; GFX9-PAL: ; %bb.0: ; %bb 1938; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1939; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 1940; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 1941; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 1942; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1943; GFX9-PAL-NEXT: s_add_u32 s0, s32, s0 1944; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 1945; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 1946; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1947; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc 1948; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1949; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 1950; 1951; GFX10-PAL-LABEL: store_load_large_imm_offset_foo: 1952; GFX10-PAL: ; %bb.0: ; %bb 1953; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1954; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1955; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13 1956; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 1957; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800 1958; GFX10-PAL-NEXT: s_add_u32 s0, s32, s0 1959; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32 1960; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1961; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 1962; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1963; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 1964; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1965; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 1966bb: 1967 %i = alloca [4096 x i32], align 4, addrspace(5) 1968 %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef 1969 store volatile i32 13, i32 addrspace(5)* %i1, align 4 1970 %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 1971 store volatile i32 15, i32 addrspace(5)* %i7, align 4 1972 %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 1973 %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4 1974 ret void 1975} 1976 1977define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { 1978; GFX9-LABEL: store_load_vidx_sidx_offset: 1979; GFX9: ; %bb.0: ; %bb 1980; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 1981; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 1982; GFX9-NEXT: v_mov_b32_e32 v1, 4 1983; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1984; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1985; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 1986; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 1987; GFX9-NEXT: v_mov_b32_e32 v1, 15 1988; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024 1989; GFX9-NEXT: s_waitcnt vmcnt(0) 1990; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 glc 1991; GFX9-NEXT: s_waitcnt vmcnt(0) 1992; GFX9-NEXT: s_endpgm 1993; 1994; GFX10-LABEL: store_load_vidx_sidx_offset: 1995; GFX10: ; %bb.0: ; %bb 1996; GFX10-NEXT: s_add_u32 s2, s2, s5 1997; GFX10-NEXT: s_addc_u32 s3, s3, 0 1998; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1999; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2000; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 2001; GFX10-NEXT: v_mov_b32_e32 v1, 15 2002; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2003; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 2004; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, 4 2005; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024 2006; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2007; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc 2008; GFX10-NEXT: s_waitcnt vmcnt(0) 2009; GFX10-NEXT: s_endpgm 2010; 2011; GFX9-PAL-LABEL: store_load_vidx_sidx_offset: 2012; GFX9-PAL: ; %bb.0: ; %bb 2013; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 2014; GFX9-PAL-NEXT: s_mov_b32 s4, s0 2015; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 2016; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4 2017; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 2018; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 2019; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 2020; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 2021; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0 2022; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 2023; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 2024; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 2025; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 2026; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2027; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc 2028; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2029; GFX9-PAL-NEXT: s_endpgm 2030; 2031; GFX10-PAL-LABEL: store_load_vidx_sidx_offset: 2032; GFX10-PAL: ; %bb.0: ; %bb 2033; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] 2034; GFX10-PAL-NEXT: s_mov_b32 s4, s0 2035; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 2036; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 2037; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff 2038; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 2039; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 2040; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 2041; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 2042; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 2043; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 2044; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 2045; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, s0, v0 2046; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 4 2047; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 2048; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2049; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc 2050; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 2051; GFX10-PAL-NEXT: s_endpgm 2052bb: 2053 %alloca = alloca [32 x i32], align 4, addrspace(5) 2054 %vidx = tail call i32 @llvm.amdgcn.workitem.id.x() 2055 %add1 = add nsw i32 %sidx, %vidx 2056 %add2 = add nsw i32 %add1, 256 2057 %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2 2058 store volatile i32 15, i32 addrspace(5)* %gep, align 4 2059 %load = load volatile i32, i32 addrspace(5)* %gep, align 4 2060 ret void 2061} 2062 2063define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) { 2064; GFX9-LABEL: store_load_i64_aligned: 2065; GFX9: ; %bb.0: ; %bb 2066; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2067; GFX9-NEXT: v_mov_b32_e32 v1, 15 2068; GFX9-NEXT: v_mov_b32_e32 v2, 0 2069; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off 2070; GFX9-NEXT: s_waitcnt vmcnt(0) 2071; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc 2072; GFX9-NEXT: s_waitcnt vmcnt(0) 2073; GFX9-NEXT: s_setpc_b64 s[30:31] 2074; 2075; GFX10-LABEL: store_load_i64_aligned: 2076; GFX10: ; %bb.0: ; %bb 2077; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2078; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2079; GFX10-NEXT: v_mov_b32_e32 v1, 15 2080; GFX10-NEXT: v_mov_b32_e32 v2, 0 2081; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off 2082; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2083; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc 2084; GFX10-NEXT: s_waitcnt vmcnt(0) 2085; GFX10-NEXT: s_setpc_b64 s[30:31] 2086; 2087; GFX9-PAL-LABEL: store_load_i64_aligned: 2088; GFX9-PAL: ; %bb.0: ; %bb 2089; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2090; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 2091; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 0 2092; GFX9-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off 2093; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2094; GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc 2095; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2096; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 2097; 2098; GFX10-PAL-LABEL: store_load_i64_aligned: 2099; GFX10-PAL: ; %bb.0: ; %bb 2100; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2101; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2102; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 2103; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 0 2104; GFX10-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off 2105; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2106; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc 2107; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 2108; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 2109bb: 2110 store volatile i64 15, i64 addrspace(5)* %arg, align 8 2111 %load = load volatile i64, i64 addrspace(5)* %arg, align 8 2112 ret void 2113} 2114 2115define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) { 2116; GFX9-LABEL: store_load_i64_unaligned: 2117; GFX9: ; %bb.0: ; %bb 2118; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2119; GFX9-NEXT: v_mov_b32_e32 v1, 15 2120; GFX9-NEXT: v_mov_b32_e32 v2, 0 2121; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off 2122; GFX9-NEXT: s_waitcnt vmcnt(0) 2123; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc 2124; GFX9-NEXT: s_waitcnt vmcnt(0) 2125; GFX9-NEXT: s_setpc_b64 s[30:31] 2126; 2127; GFX10-LABEL: store_load_i64_unaligned: 2128; GFX10: ; %bb.0: ; %bb 2129; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2130; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2131; GFX10-NEXT: v_mov_b32_e32 v1, 15 2132; GFX10-NEXT: v_mov_b32_e32 v2, 0 2133; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off 2134; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2135; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc 2136; GFX10-NEXT: s_waitcnt vmcnt(0) 2137; GFX10-NEXT: s_setpc_b64 s[30:31] 2138; 2139; GFX9-PAL-LABEL: store_load_i64_unaligned: 2140; GFX9-PAL: ; %bb.0: ; %bb 2141; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2142; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 2143; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 0 2144; GFX9-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off 2145; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2146; GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc 2147; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2148; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 2149; 2150; GFX10-PAL-LABEL: store_load_i64_unaligned: 2151; GFX10-PAL: ; %bb.0: ; %bb 2152; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2153; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2154; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 2155; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 0 2156; GFX10-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off 2157; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2158; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc 2159; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 2160; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 2161bb: 2162 store volatile i64 15, i64 addrspace(5)* %arg, align 1 2163 %load = load volatile i64, i64 addrspace(5)* %arg, align 1 2164 ret void 2165} 2166 2167define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) { 2168; GFX9-LABEL: store_load_v3i32_unaligned: 2169; GFX9: ; %bb.0: ; %bb 2170; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2171; GFX9-NEXT: v_mov_b32_e32 v1, 1 2172; GFX9-NEXT: v_mov_b32_e32 v2, 2 2173; GFX9-NEXT: v_mov_b32_e32 v3, 3 2174; GFX9-NEXT: scratch_store_dwordx3 v0, v[1:3], off 2175; GFX9-NEXT: s_waitcnt vmcnt(0) 2176; GFX9-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc 2177; GFX9-NEXT: s_waitcnt vmcnt(0) 2178; GFX9-NEXT: s_setpc_b64 s[30:31] 2179; 2180; GFX10-LABEL: store_load_v3i32_unaligned: 2181; GFX10: ; %bb.0: ; %bb 2182; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2183; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2184; GFX10-NEXT: v_mov_b32_e32 v1, 1 2185; GFX10-NEXT: v_mov_b32_e32 v2, 2 2186; GFX10-NEXT: v_mov_b32_e32 v3, 3 2187; GFX10-NEXT: scratch_store_dwordx3 v0, v[1:3], off 2188; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2189; GFX10-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc dlc 2190; GFX10-NEXT: s_waitcnt vmcnt(0) 2191; GFX10-NEXT: s_setpc_b64 s[30:31] 2192; 2193; GFX9-PAL-LABEL: store_load_v3i32_unaligned: 2194; GFX9-PAL: ; %bb.0: ; %bb 2195; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2196; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 2197; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 2 2198; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 3 2199; GFX9-PAL-NEXT: scratch_store_dwordx3 v0, v[1:3], off 2200; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2201; GFX9-PAL-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc 2202; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2203; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 2204; 2205; GFX10-PAL-LABEL: store_load_v3i32_unaligned: 2206; GFX10-PAL: ; %bb.0: ; %bb 2207; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2208; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2209; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 1 2210; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 2 2211; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 3 2212; GFX10-PAL-NEXT: scratch_store_dwordx3 v0, v[1:3], off 2213; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2214; GFX10-PAL-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc dlc 2215; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 2216; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 2217bb: 2218 store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1 2219 %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1 2220 ret void 2221} 2222 2223define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) { 2224; GFX9-LABEL: store_load_v4i32_unaligned: 2225; GFX9: ; %bb.0: ; %bb 2226; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2227; GFX9-NEXT: v_mov_b32_e32 v1, 1 2228; GFX9-NEXT: v_mov_b32_e32 v2, 2 2229; GFX9-NEXT: v_mov_b32_e32 v3, 3 2230; GFX9-NEXT: v_mov_b32_e32 v4, 4 2231; GFX9-NEXT: scratch_store_dwordx4 v0, v[1:4], off 2232; GFX9-NEXT: s_waitcnt vmcnt(0) 2233; GFX9-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc 2234; GFX9-NEXT: s_waitcnt vmcnt(0) 2235; GFX9-NEXT: s_setpc_b64 s[30:31] 2236; 2237; GFX10-LABEL: store_load_v4i32_unaligned: 2238; GFX10: ; %bb.0: ; %bb 2239; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2240; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2241; GFX10-NEXT: v_mov_b32_e32 v1, 1 2242; GFX10-NEXT: v_mov_b32_e32 v2, 2 2243; GFX10-NEXT: v_mov_b32_e32 v3, 3 2244; GFX10-NEXT: v_mov_b32_e32 v4, 4 2245; GFX10-NEXT: scratch_store_dwordx4 v0, v[1:4], off 2246; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2247; GFX10-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc dlc 2248; GFX10-NEXT: s_waitcnt vmcnt(0) 2249; GFX10-NEXT: s_setpc_b64 s[30:31] 2250; 2251; GFX9-PAL-LABEL: store_load_v4i32_unaligned: 2252; GFX9-PAL: ; %bb.0: ; %bb 2253; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2254; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 2255; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 2 2256; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 3 2257; GFX9-PAL-NEXT: v_mov_b32_e32 v4, 4 2258; GFX9-PAL-NEXT: scratch_store_dwordx4 v0, v[1:4], off 2259; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2260; GFX9-PAL-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc 2261; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2262; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 2263; 2264; GFX10-PAL-LABEL: store_load_v4i32_unaligned: 2265; GFX10-PAL: ; %bb.0: ; %bb 2266; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2267; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2268; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 1 2269; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 2 2270; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 3 2271; GFX10-PAL-NEXT: v_mov_b32_e32 v4, 4 2272; GFX10-PAL-NEXT: scratch_store_dwordx4 v0, v[1:4], off 2273; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2274; GFX10-PAL-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc dlc 2275; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 2276; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 2277bb: 2278 store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1 2279 %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1 2280 ret void 2281} 2282 2283declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg) 2284declare i32 @llvm.amdgcn.workitem.id.x() 2285