; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9-PAL %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10-PAL %s
;
; Overview: every function below touches private (addrspace(5)) memory and is
; compiled four times by the commands above: for gfx900 and gfx1030, once via
; -march=amdgcn and once via -mtriple=amdgcn--amdpal, always with
; -amdgpu-enable-flat-scratch and -mattr=-promote-alloca. The expected assembly
; for each of the four configurations follows the matching "define" line.
; The assertion lines are produced by utils/update_llc_test_checks.py; after
; changing any IR, regenerate them with that script rather than editing them.
; Keep free-form comments free of "<check prefix>" + colon spellings, since
; FileCheck treats any such text in this file as a directive.

; Kernel that zero-fills a 64-byte private array ([32 x i16]) with memset.
; The expected code is four scratch_store_dwordx4 instructions at offsets
; 16, 32, 48 and 64, preceded by the flat_scratch setup for the kernel.
define amdgpu_kernel void @zero_init_kernel() {
; GFX9-LABEL: zero_init_kernel:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: zero_init_kernel:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: zero_init_kernel:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: zero_init_kernel:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
; GFX10-PAL-NEXT:    ; implicit-def: $vcc_hi
; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
; GFX10-PAL-NEXT:    s_endpgm
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  ; 64 bytes of zero, non-volatile (last operand is i1 false).
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

; Same memset body as @zero_init_kernel, but in a function with the default
; calling convention. The expected stores are addressed relative to s32
; (offsets 0..48) and the function returns with s_setpc_b64.
define void @zero_init_foo() {
; GFX9-LABEL: zero_init_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: zero_init_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: zero_init_foo:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: zero_init_foo:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
; GFX10-PAL-NEXT:    ; implicit-def: $vcc_hi
; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  ; 64 bytes of zero, non-volatile (last operand is i1 false).
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

; Kernel with an i32 kernel argument used as the array index. It volatile-stores
; 15 to element %idx of a private [32 x float] array and volatile-loads element
; (%idx & 15). In the expected code both addresses are computed in SGPRs
; (s_lshl_b32 / s_add_u32) and passed as the scalar operand of the scratch
; instructions.
define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-NEXT:    s_and_b32 s0, s0, 15
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    s_add_u32 s1, 4, s1
; GFX9-NEXT:    scratch_store_dword off, v0, s1
; GFX9-NEXT:    s_add_u32 s0, 4, s0
; GFX9-NEXT:    scratch_load_dword v0, off, s0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_and_b32 s1, s0, 15
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-NEXT:    s_add_u32 s0, 4, s0
; GFX10-NEXT:    s_add_u32 s1, 4, s1
; GFX10-NEXT:    scratch_store_dword off, v0, s0
; GFX10-NEXT:    scratch_load_dword v0, off, s1
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    s_add_u32 s1, 4, s1
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_add_u32 s0, 4, s0
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_sindex_kernel:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-PAL-NEXT:    s_add_u32 s0, 4, s0
; GFX10-PAL-NEXT:    s_add_u32 s1, 4, s1
; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1
; GFX10-PAL-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  ; %i1 is not referenced again in this function.
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Same store/load pattern as @store_load_sindex_kernel, but as an amdgpu_ps
; function whose index is an "inreg" argument. The expected code again keeps
; the address arithmetic in SGPRs and ends with s_endpgm.
define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX9-LABEL: store_load_sindex_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
; GFX9-NEXT:    s_add_u32 s0, 4, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0
; GFX9-NEXT:    s_and_b32 s0, s2, 15
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    s_add_u32 s0, 4, s0
; GFX9-NEXT:    scratch_load_dword v0, off, s0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    s_and_b32 s0, s2, 15
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_add_u32 s1, 4, s1
; GFX10-NEXT:    s_add_u32 s0, 4, s0
; GFX10-NEXT:    scratch_store_dword off, v0, s1
; GFX10-NEXT:    scratch_load_dword v0, off, s0
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    s_add_u32 s1, 4, s1
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_add_u32 s0, 4, s0
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_sindex_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-PAL-NEXT:    s_add_u32 s0, 4, s0
; GFX10-PAL-NEXT:    s_add_u32 s1, 4, s1
; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1
; GFX10-PAL-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  ; %i1 is not referenced again in this function.
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Kernel whose index comes from llvm.amdgcn.workitem.id.x. It volatile-stores 15
; to element id of a private [32 x float] array and volatile-loads element
; (31 - id). In the expected code the addresses are formed in VGPRs and the
; load carries an immediate offset:124.
define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9-LABEL: store_load_vindex_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 4
; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vindex_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v1, 4
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, 15
; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
; GFX10-NEXT:    scratch_store_dword v2, v3, off
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_vindex_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 4
; GFX9-PAL-NEXT:    v_add_u32_e32 v2, v1, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, v1, v0
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_vindex_kernel:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 4
; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
; GFX10-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124
; GFX10-PAL-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  ; %i1 and %i3 are not referenced again in this function.
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i3 = zext i32 %i2 to i64
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Default-calling-convention function taking the index as a plain i32
; argument. It performs the same store to element %idx and load from element
; (%idx & 15) as the sindex tests. In the expected code the index arrives in v0
; and both addresses are built with v_lshl_add_u32 on top of s32.
define void @store_load_vindex_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s32
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT:    v_and_b32_e32 v0, v0, v3
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    scratch_load_dword v0, v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    v_mov_b32_e32 v2, s32
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_and_b32_e32 v3, v0, v1
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
; GFX10-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
; GFX10-NEXT:    scratch_store_dword v0, v1, off
; GFX10-NEXT:    scratch_load_dword v0, v2, off
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s32
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-PAL-NEXT:    v_and_b32_e32 v0, v0, v3
; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_vindex_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s32
; GFX10-PAL-NEXT:    ; implicit-def: $vcc_hi
; GFX10-PAL-NEXT:    v_and_b32_e32 v3, v0, v1
; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v2
; GFX10-PAL-NEXT:    v_lshl_add_u32 v2, v3, 2, v2
; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off
; GFX10-PAL-NEXT:    scratch_load_dword v0, v2, off
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  ; %i1 is not referenced again in this function.
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Stores the float constant 10.0 (0x41200000 in the expected code) to element 1
; of a private pointer passed as an argument. The expected code is a single
; scratch_store_dword through v0 with the element offset folded into offset:4.
define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
; GFX9-LABEL: private_ptr_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_ptr_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: private_ptr_foo:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: private_ptr_foo:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX10-PAL-NEXT:    ; implicit-def: $vcc_hi
; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
  store float 1.000000e+01, float addrspace(5)* %gep, align 4
  ret void
}

; Variant of @zero_init_kernel with a 256-byte [64 x i32] "padding" array
; allocated first and read once with a volatile load (at an undef index).
; The zero-filled array therefore sits at a larger frame offset: the expected
; stores use offsets 272..320 instead of 16..64.
define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX9-LABEL: zero_init_small_offset_kernel:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: zero_init_small_offset_kernel:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: zero_init_small_offset_kernel:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: zero_init_small_offset_kernel:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, off offset:4
; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
; GFX10-PAL-NEXT:    ; implicit-def: $vcc_hi
; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
; GFX10-PAL-NEXT:    s_endpgm
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile read of the padding array; the element index is undef.
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

; Variant of @zero_init_foo with the same 256-byte padding array placed before
; the zero-filled array. The expected stores are s32-relative at offsets
; 256..304 instead of 0..48, after a volatile scratch_load_dword from s32.
define void @zero_init_small_offset_foo() {
; GFX9-LABEL: zero_init_small_offset_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, off, s32
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: zero_init_small_offset_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s32
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: zero_init_small_offset_foo:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: zero_init_small_offset_foo:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s32
; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
; GFX10-PAL-NEXT:    ; implicit-def: $vcc_hi
; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile read of the padding array; the element index is undef.
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}
764 765define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { 766; GFX9-LABEL: store_load_sindex_small_offset_kernel: 767; GFX9: ; %bb.0: ; %bb 768; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 769; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 770; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 771; GFX9-NEXT: s_mov_b32 vcc_hi, 0 772; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 773; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) 774; GFX9-NEXT: s_lshl_b32 s1, s0, 2 775; GFX9-NEXT: s_and_b32 s0, s0, 15 776; GFX9-NEXT: s_lshl_b32 s0, s0, 2 777; GFX9-NEXT: s_waitcnt vmcnt(0) 778; GFX9-NEXT: v_mov_b32_e32 v0, 15 779; GFX9-NEXT: s_add_u32 s1, 0x104, s1 780; GFX9-NEXT: scratch_store_dword off, v0, s1 781; GFX9-NEXT: s_add_u32 s0, 0x104, s0 782; GFX9-NEXT: scratch_load_dword v0, off, s0 783; GFX9-NEXT: s_endpgm 784; 785; GFX10-LABEL: store_load_sindex_small_offset_kernel: 786; GFX10: ; %bb.0: ; %bb 787; GFX10-NEXT: s_add_u32 s2, s2, s5 788; GFX10-NEXT: s_addc_u32 s3, s3, 0 789; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 790; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 791; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 792; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 793; GFX10-NEXT: s_waitcnt vmcnt(0) 794; GFX10-NEXT: v_mov_b32_e32 v0, 15 795; GFX10-NEXT: s_waitcnt lgkmcnt(0) 796; GFX10-NEXT: s_and_b32 s1, s0, 15 797; GFX10-NEXT: s_lshl_b32 s0, s0, 2 798; GFX10-NEXT: s_lshl_b32 s1, s1, 2 799; GFX10-NEXT: s_add_u32 s0, 0x104, s0 800; GFX10-NEXT: s_add_u32 s1, 0x104, s1 801; GFX10-NEXT: scratch_store_dword off, v0, s0 802; GFX10-NEXT: scratch_load_dword v0, off, s1 803; GFX10-NEXT: s_endpgm 804; 805; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel: 806; GFX9-PAL: ; %bb.0: ; %bb 807; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 808; GFX9-PAL-NEXT: s_mov_b32 s4, s0 809; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 810; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 811; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 812; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 813; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 814; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 815; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 816; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 817; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 818; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 819; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 820; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 821; GFX9-PAL-NEXT: 
v_mov_b32_e32 v0, 15 822; GFX9-PAL-NEXT: s_add_u32 s1, 0x104, s1 823; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 824; GFX9-PAL-NEXT: s_add_u32 s0, 0x104, s0 825; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 826; GFX9-PAL-NEXT: s_endpgm 827; 828; GFX10-PAL-LABEL: store_load_sindex_small_offset_kernel: 829; GFX10-PAL: ; %bb.0: ; %bb 830; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] 831; GFX10-PAL-NEXT: s_mov_b32 s4, s0 832; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 833; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 834; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff 835; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 836; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 837; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 838; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 839; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 840; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 841; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 842; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 843; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 844; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 845; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 846; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 847; GFX10-PAL-NEXT: s_add_u32 s0, 0x104, s0 848; GFX10-PAL-NEXT: s_add_u32 s1, 0x104, s1 849; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 850; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 851; GFX10-PAL-NEXT: s_endpgm 852bb: 853 %padding = alloca [64 x i32], align 4, addrspace(5) 854 %i = alloca [32 x float], align 4, addrspace(5) 855 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 856 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 857 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 858 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 859 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 860 store volatile i32 15, i32 addrspace(5)* %i8, align 4 861 %i9 = and i32 %idx, 15 862 %i10 = getelementptr inbounds [32 x 
float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 863 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 864 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 865 ret void 866} 867
; amdgpu_ps test with a dynamic, scalar index and a small frame offset.
; %idx is passed inreg. A 64 x i32 padding alloca is declared ahead of the
; 32 x float array %i and is touched with one volatile load; the body then does
; a volatile store of 15 to %i[%idx] and a volatile load from %i[%idx & 15].
; The expected code shifts the index left by 2 in an SGPR, adds the constant
; 0x104, and uses that SGPR as the address of the scratch store / load.
868define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { 869; GFX9-LABEL: store_load_sindex_small_offset_foo: 870; GFX9: ; %bb.0: ; %bb 871; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 872; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 873; GFX9-NEXT: s_mov_b32 vcc_hi, 0 874; GFX9-NEXT: s_lshl_b32 s0, s2, 2 875; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 876; GFX9-NEXT: s_add_u32 s0, 0x104, s0 877; GFX9-NEXT: s_waitcnt vmcnt(0) 878; GFX9-NEXT: v_mov_b32_e32 v0, 15 879; GFX9-NEXT: scratch_store_dword off, v0, s0 880; GFX9-NEXT: s_and_b32 s0, s2, 15 881; GFX9-NEXT: s_lshl_b32 s0, s0, 2 882; GFX9-NEXT: s_add_u32 s0, 0x104, s0 883; GFX9-NEXT: scratch_load_dword v0, off, s0 884; GFX9-NEXT: s_endpgm 885; 886; GFX10-LABEL: store_load_sindex_small_offset_foo: 887; GFX10: ; %bb.0: ; %bb 888; GFX10-NEXT: s_add_u32 s0, s0, s3 889; GFX10-NEXT: s_addc_u32 s1, s1, 0 890; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 891; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 892; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 893; GFX10-NEXT: s_and_b32 s0, s2, 15 894; GFX10-NEXT: s_waitcnt vmcnt(0) 895; GFX10-NEXT: v_mov_b32_e32 v0, 15 896; GFX10-NEXT: s_lshl_b32 s1, s2, 2 897; GFX10-NEXT: s_lshl_b32 s0, s0, 2 898; GFX10-NEXT: s_add_u32 s1, 0x104, s1 899; GFX10-NEXT: s_add_u32 s0, 0x104, s0 900; GFX10-NEXT: scratch_store_dword off, v0, s1 901; GFX10-NEXT: scratch_load_dword v0, off, s0 902; GFX10-NEXT: s_endpgm 903; 904; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo: 905; GFX9-PAL: ; %bb.0: ; %bb 906; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 907; GFX9-PAL-NEXT: s_mov_b32 s2, s0 908; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 909; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 910; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 911; GFX9-PAL-NEXT: 
s_and_b32 s3, s3, 0xffff 912; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 913; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 914; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 915; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 916; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 917; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 918; GFX9-PAL-NEXT: s_add_u32 s1, 0x104, s1 919; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 920; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 921; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 922; GFX9-PAL-NEXT: s_add_u32 s0, 0x104, s0 923; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 924; GFX9-PAL-NEXT: s_endpgm 925; 926; GFX10-PAL-LABEL: store_load_sindex_small_offset_foo: 927; GFX10-PAL: ; %bb.0: ; %bb 928; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 929; GFX10-PAL-NEXT: s_mov_b32 s2, s0 930; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 931; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 932; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 933; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 934; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 935; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 936; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 937; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 938; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 939; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 940; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 941; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 942; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 943; GFX10-PAL-NEXT: s_add_u32 s0, 0x104, s0 944; GFX10-PAL-NEXT: s_add_u32 s1, 0x104, s1 945; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 946; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 947; GFX10-PAL-NEXT: s_endpgm 948bb: 949 %padding = alloca [64 x i32], align 4, addrspace(5) 950 %i = alloca [32 x float], align 4, addrspace(5) 951 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 952 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 953 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 
954 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 955 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 956 store volatile i32 15, i32 addrspace(5)* %i8, align 4 957 %i9 = and i32 %idx, 15 958 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 959 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 960 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 961 ret void 962} 963
; Kernel test with a per-lane (vector) index and a small frame offset.
; The index is the result of llvm.amdgcn.workitem.id.x. A 64 x i32 padding
; alloca is declared ahead of the 32 x float array %i and is touched with one
; volatile load; the body then does a volatile store of 15 to %i[id] and a
; volatile load from %i[31 - id]. The expected code materialises 0x104 in a
; VGPR, adds / subtracts the id shifted left by 2, and addresses scratch through
; the VGPR; the load carries the extra immediate offset:124.
; NOTE(review): %i1 and %i3 in the body are not used by any later instruction.
964define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { 965; GFX9-LABEL: store_load_vindex_small_offset_kernel: 966; GFX9: ; %bb.0: ; %bb 967; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 968; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 969; GFX9-NEXT: s_mov_b32 vcc_hi, 0 970; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 971; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 972; GFX9-NEXT: s_waitcnt vmcnt(0) 973; GFX9-NEXT: v_mov_b32_e32 v1, 0x104 974; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 975; GFX9-NEXT: v_mov_b32_e32 v3, 15 976; GFX9-NEXT: scratch_store_dword v2, v3, off 977; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 978; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 979; GFX9-NEXT: s_endpgm 980; 981; GFX10-LABEL: store_load_vindex_small_offset_kernel: 982; GFX10: ; %bb.0: ; %bb 983; GFX10-NEXT: s_add_u32 s0, s0, s3 984; GFX10-NEXT: s_addc_u32 s1, s1, 0 985; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 986; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 987; GFX10-NEXT: v_mov_b32_e32 v1, 0x104 988; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 989; GFX10-NEXT: v_mov_b32_e32 v3, 15 990; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 991; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 992; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 993; GFX10-NEXT: scratch_store_dword v2, v3, off 994; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 995; GFX10-NEXT: s_endpgm 996; 997; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel: 998; GFX9-PAL: ; %bb.0: ; %bb 999; 
GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1000; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1001; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1002; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1003; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1004; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 1005; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1006; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1007; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1008; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1009; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 1010; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1011; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x104 1012; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0 1013; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 1014; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0 1015; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 1016; GFX9-PAL-NEXT: s_endpgm 1017; 1018; GFX10-PAL-LABEL: store_load_vindex_small_offset_kernel: 1019; GFX10-PAL: ; %bb.0: ; %bb 1020; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 1021; GFX10-PAL-NEXT: s_mov_b32 s2, s0 1022; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1023; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 1024; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1025; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 1026; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 1027; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1028; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1029; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x104 1030; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1031; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 1032; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 1033; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 1034; GFX10-PAL-NEXT: scratch_load_dword v1, off, off offset:4 1035; GFX10-PAL-NEXT: scratch_store_dword v2, v3, off 1036; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 1037; GFX10-PAL-NEXT: s_endpgm 1038bb: 1039 %padding = alloca [64 x i32], align 4, addrspace(5) 1040 %i = alloca [32 x float], align 4, addrspace(5) 1041 %pad_gep = getelementptr inbounds 
[64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 1042 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1043 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1044 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() 1045 %i3 = zext i32 %i2 to i64 1046 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 1047 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1048 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1049 %i9 = sub nsw i32 31, %i2 1050 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1051 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1052 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1053 ret void 1054} 1055
; Callable-function test (no amdgpu_* calling convention on the define) with a
; vector index and a small frame offset.
; %idx is an ordinary i32 argument. A 64 x i32 padding alloca is declared ahead
; of the 32 x float array %i and is touched with one volatile load; the body
; then does a volatile store of 15 to %i[%idx] and a volatile load from
; %i[%idx & 15]. The expected code forms the array base as s32 + 0x100 in a
; scalar register, copies it to a VGPR, and combines it with the index using
; v_lshl_add_u32 before the scratch store / load; it returns via s_setpc_b64.
1056define void @store_load_vindex_small_offset_foo(i32 %idx) { 1057; GFX9-LABEL: store_load_vindex_small_offset_foo: 1058; GFX9: ; %bb.0: ; %bb 1059; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1060; GFX9-NEXT: scratch_load_dword v1, off, s32 1061; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x100 1062; GFX9-NEXT: s_waitcnt vmcnt(0) 1063; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi 1064; GFX9-NEXT: v_mov_b32_e32 v3, 15 1065; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 1066; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 1067; GFX9-NEXT: scratch_store_dword v2, v3, off 1068; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 1069; GFX9-NEXT: scratch_load_dword v0, v0, off 1070; GFX9-NEXT: s_waitcnt vmcnt(0) 1071; GFX9-NEXT: s_setpc_b64 s[30:31] 1072; 1073; GFX10-LABEL: store_load_vindex_small_offset_foo: 1074; GFX10: ; %bb.0: ; %bb 1075; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1076; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1077; GFX10-NEXT: v_mov_b32_e32 v1, 15 1078; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x100 1079; GFX10-NEXT: ; implicit-def: $vcc_hi 1080; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo 1081; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 1082; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 1083; GFX10-NEXT: v_lshl_add_u32 v2, v3, 
2, v2 1084; GFX10-NEXT: scratch_load_dword v3, off, s32 1085; GFX10-NEXT: scratch_store_dword v0, v1, off 1086; GFX10-NEXT: scratch_load_dword v0, v2, off 1087; GFX10-NEXT: s_waitcnt vmcnt(0) 1088; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1089; GFX10-NEXT: s_setpc_b64 s[30:31] 1090; 1091; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo: 1092; GFX9-PAL: ; %bb.0: ; %bb 1093; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1094; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 1095; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x100 1096; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1097; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi 1098; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 1099; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 1100; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3 1101; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 1102; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 1103; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off 1104; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1105; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 1106; 1107; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo: 1108; GFX10-PAL: ; %bb.0: ; %bb 1109; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1110; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1111; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 1112; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x100 1113; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi 1114; GFX10-PAL-NEXT: v_mov_b32_e32 v2, vcc_lo 1115; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1 1116; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2 1117; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2 1118; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 1119; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off 1120; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off 1121; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1122; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1123; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 1124bb: 1125 %padding = alloca [64 x i32], align 4, addrspace(5) 1126 %i = alloca [32 x float], align 4, addrspace(5) 1127 %pad_gep = getelementptr inbounds [64 x i32], 
[64 x i32] addrspace(5)* %padding, i32 0, i32 undef 1128 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1129 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1130 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 1131 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1132 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1133 %i9 = and i32 %idx, 15 1134 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1135 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1136 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1137 ret void 1138} 1139
; Kernel test that zero-fills a stack object placed behind a large padding
; object.
; A 4096 x i32 padding alloca is declared ahead of a 32 x i16 alloca and is
; touched with one volatile load; the body then calls llvm.memset to write 64
; zero bytes to the i16 array. The expected code emits four
; scratch_store_dwordx4 instructions with immediate offsets 0, 16, 32 and 48,
; each preceded by an s_movk_i32 that loads 0x4010 into the scalar register
; used as the store's base.
1140define amdgpu_kernel void @zero_init_large_offset_kernel() { 1141; GFX9-LABEL: zero_init_large_offset_kernel: 1142; GFX9: ; %bb.0: 1143; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 1144; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 1145; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1146; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 1147; GFX9-NEXT: s_mov_b32 s0, 0 1148; GFX9-NEXT: s_mov_b32 s1, s0 1149; GFX9-NEXT: s_mov_b32 s2, s0 1150; GFX9-NEXT: s_mov_b32 s3, s0 1151; GFX9-NEXT: s_waitcnt vmcnt(0) 1152; GFX9-NEXT: v_mov_b32_e32 v0, s0 1153; GFX9-NEXT: v_mov_b32_e32 v1, s1 1154; GFX9-NEXT: v_mov_b32_e32 v2, s2 1155; GFX9-NEXT: v_mov_b32_e32 v3, s3 1156; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 1157; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 1158; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 1159; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 1160; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 1161; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 1162; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 1163; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 1164; GFX9-NEXT: s_endpgm 1165; 1166; GFX10-LABEL: zero_init_large_offset_kernel: 1167; GFX10: ; %bb.0: 1168; GFX10-NEXT: s_add_u32 s0, s0, s3 1169; GFX10-NEXT: s_addc_u32 s1, s1, 0 1170; GFX10-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s0 1171; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1172; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 1173; GFX10-NEXT: s_mov_b32 s0, 0 1174; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 1175; GFX10-NEXT: s_mov_b32 s1, s0 1176; GFX10-NEXT: s_mov_b32 s2, s0 1177; GFX10-NEXT: s_mov_b32 s3, s0 1178; GFX10-NEXT: s_waitcnt vmcnt(0) 1179; GFX10-NEXT: v_mov_b32_e32 v0, s0 1180; GFX10-NEXT: v_mov_b32_e32 v1, s1 1181; GFX10-NEXT: v_mov_b32_e32 v2, s2 1182; GFX10-NEXT: v_mov_b32_e32 v3, s3 1183; GFX10-NEXT: ; implicit-def: $vcc_hi 1184; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 1185; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 1186; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1187; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 1188; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1189; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 1190; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1191; GFX10-NEXT: s_endpgm 1192; 1193; GFX9-PAL-LABEL: zero_init_large_offset_kernel: 1194; GFX9-PAL: ; %bb.0: 1195; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1196; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1197; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1198; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1199; GFX9-PAL-NEXT: s_mov_b32 s0, 0 1200; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1201; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1202; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1203; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1204; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 1205; GFX9-PAL-NEXT: s_mov_b32 s1, s0 1206; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1207; GFX9-PAL-NEXT: s_mov_b32 s3, s0 1208; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1209; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 1210; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 1211; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 1212; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 1213; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 1214; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 1215; 
GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 1216; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 1217; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 1218; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 1219; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 1220; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 1221; GFX9-PAL-NEXT: s_endpgm 1222; 1223; GFX10-PAL-LABEL: zero_init_large_offset_kernel: 1224; GFX10-PAL: ; %bb.0: 1225; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 1226; GFX10-PAL-NEXT: s_mov_b32 s2, s0 1227; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1228; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 1229; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1230; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 1231; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 1232; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1233; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1234; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 1235; GFX10-PAL-NEXT: s_mov_b32 s0, 0 1236; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1237; GFX10-PAL-NEXT: s_mov_b32 s1, s0 1238; GFX10-PAL-NEXT: s_mov_b32 s2, s0 1239; GFX10-PAL-NEXT: s_mov_b32 s3, s0 1240; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1241; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 1242; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 1243; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 1244; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 1245; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi 1246; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 1247; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1248; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1249; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1250; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1251; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1252; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1253; GFX10-PAL-NEXT: s_endpgm 1254 %padding = alloca [4096 x i32], align 4, addrspace(5) 1255 %alloca = alloca [32 x i16], align 
2, addrspace(5) 1256 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 1257 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1258 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 1259 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 1260 ret void 1261} 1262
; Callable-function test (no amdgpu_* calling convention on the define) that
; zero-fills a stack object placed behind a large padding object.
; A 4096 x i32 padding alloca is declared ahead of a 32 x i16 alloca and is
; touched with one volatile load; the body then calls llvm.memset to write 64
; zero bytes to the i16 array. The expected code emits four
; scratch_store_dwordx4 instructions with immediate offsets 0, 16, 32 and 48,
; each preceded by an s_add_u32 that computes s32 + 0x4000 into the scalar
; register used as the store's base, and returns via s_setpc_b64.
1263define void @zero_init_large_offset_foo() { 1264; GFX9-LABEL: zero_init_large_offset_foo: 1265; GFX9: ; %bb.0: 1266; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1267; GFX9-NEXT: scratch_load_dword v0, off, s32 1268; GFX9-NEXT: s_mov_b32 s0, 0 1269; GFX9-NEXT: s_mov_b32 s1, s0 1270; GFX9-NEXT: s_mov_b32 s2, s0 1271; GFX9-NEXT: s_mov_b32 s3, s0 1272; GFX9-NEXT: s_waitcnt vmcnt(0) 1273; GFX9-NEXT: v_mov_b32_e32 v0, s0 1274; GFX9-NEXT: v_mov_b32_e32 v1, s1 1275; GFX9-NEXT: v_mov_b32_e32 v2, s2 1276; GFX9-NEXT: v_mov_b32_e32 v3, s3 1277; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1278; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 1279; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1280; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 1281; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1282; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 1283; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1284; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 1285; GFX9-NEXT: s_waitcnt vmcnt(0) 1286; GFX9-NEXT: s_setpc_b64 s[30:31] 1287; 1288; GFX10-LABEL: zero_init_large_offset_foo: 1289; GFX10: ; %bb.0: 1290; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1291; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1292; GFX10-NEXT: scratch_load_dword v0, off, s32 1293; GFX10-NEXT: s_mov_b32 s0, 0 1294; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1295; GFX10-NEXT: s_mov_b32 s1, s0 1296; GFX10-NEXT: s_mov_b32 s2, s0 1297; GFX10-NEXT: s_mov_b32 s3, s0 1298; GFX10-NEXT: s_waitcnt vmcnt(0) 1299; GFX10-NEXT: v_mov_b32_e32 v0, s0 1300; GFX10-NEXT: 
v_mov_b32_e32 v1, s1 1301; GFX10-NEXT: v_mov_b32_e32 v2, s2 1302; GFX10-NEXT: v_mov_b32_e32 v3, s3 1303; GFX10-NEXT: ; implicit-def: $vcc_hi 1304; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 1305; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1306; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1307; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1308; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1309; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1310; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1311; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1312; GFX10-NEXT: s_setpc_b64 s[30:31] 1313; 1314; GFX9-PAL-LABEL: zero_init_large_offset_foo: 1315; GFX9-PAL: ; %bb.0: 1316; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1317; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 1318; GFX9-PAL-NEXT: s_mov_b32 s0, 0 1319; GFX9-PAL-NEXT: s_mov_b32 s1, s0 1320; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1321; GFX9-PAL-NEXT: s_mov_b32 s3, s0 1322; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1323; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 1324; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 1325; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 1326; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 1327; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1328; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 1329; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1330; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 1331; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1332; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 1333; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1334; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 1335; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1336; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 1337; 1338; GFX10-PAL-LABEL: zero_init_large_offset_foo: 1339; GFX10-PAL: ; %bb.0: 1340; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1341; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1342; GFX10-PAL-NEXT: scratch_load_dword v0, 
off, s32 1343; GFX10-PAL-NEXT: s_mov_b32 s0, 0 1344; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1345; GFX10-PAL-NEXT: s_mov_b32 s1, s0 1346; GFX10-PAL-NEXT: s_mov_b32 s2, s0 1347; GFX10-PAL-NEXT: s_mov_b32 s3, s0 1348; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1349; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0 1350; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1 1351; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2 1352; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3 1353; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi 1354; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 1355; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1356; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1357; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1358; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1359; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1360; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1361; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1362; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 1363 %padding = alloca [4096 x i32], align 4, addrspace(5) 1364 %alloca = alloca [32 x i16], align 2, addrspace(5) 1365 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 1366 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1367 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 1368 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 1369 ret void 1370} 1371
; Kernel test with a dynamic, scalar index and a large frame offset.
; A 4096 x i32 padding alloca is declared ahead of the 32 x float array %i and
; is touched with one volatile load. The body then does a volatile store of 15
; to %i[%idx] and a volatile load from %i[%idx & 15], where %idx is a kernel
; argument. The expected code scales each index in an SGPR, adds the constant
; 0x4004, and uses the result as the SGPR address of the scratch access.
1372define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { 1373; GFX9-LABEL: store_load_sindex_large_offset_kernel: 1374; GFX9: ; %bb.0: ; %bb 1375; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 1376; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 1377; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1378; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1379; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 1380; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1381; GFX9-NEXT: s_lshl_b32 s1, s0, 
2 1382; GFX9-NEXT: s_and_b32 s0, s0, 15 1383; GFX9-NEXT: s_lshl_b32 s0, s0, 2 1384; GFX9-NEXT: s_waitcnt vmcnt(0) 1385; GFX9-NEXT: v_mov_b32_e32 v0, 15 1386; GFX9-NEXT: s_add_u32 s1, 0x4004, s1 1387; GFX9-NEXT: scratch_store_dword off, v0, s1 1388; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 1389; GFX9-NEXT: scratch_load_dword v0, off, s0 1390; GFX9-NEXT: s_endpgm 1391; 1392; GFX10-LABEL: store_load_sindex_large_offset_kernel: 1393; GFX10: ; %bb.0: ; %bb 1394; GFX10-NEXT: s_add_u32 s2, s2, s5 1395; GFX10-NEXT: s_addc_u32 s3, s3, 0 1396; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1397; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1398; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 1399; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 1400; GFX10-NEXT: s_waitcnt vmcnt(0) 1401; GFX10-NEXT: v_mov_b32_e32 v0, 15 1402; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1403; GFX10-NEXT: s_and_b32 s1, s0, 15 1404; GFX10-NEXT: s_lshl_b32 s0, s0, 2 1405; GFX10-NEXT: s_lshl_b32 s1, s1, 2 1406; GFX10-NEXT: s_add_u32 s0, 0x4004, s0 1407; GFX10-NEXT: s_add_u32 s1, 0x4004, s1 1408; GFX10-NEXT: scratch_store_dword off, v0, s0 1409; GFX10-NEXT: scratch_load_dword v0, off, s1 1410; GFX10-NEXT: s_endpgm 1411; 1412; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel: 1413; GFX9-PAL: ; %bb.0: ; %bb 1414; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 1415; GFX9-PAL-NEXT: s_mov_b32 s4, s0 1416; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1417; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 1418; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1419; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1420; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 1421; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 1422; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 1423; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 1424; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 1425; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 1426; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 1427; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1428; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 
1429; GFX9-PAL-NEXT: s_add_u32 s1, 0x4004, s1 1430; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 1431; GFX9-PAL-NEXT: s_add_u32 s0, 0x4004, s0 1432; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 1433; GFX9-PAL-NEXT: s_endpgm 1434; 1435; GFX10-PAL-LABEL: store_load_sindex_large_offset_kernel: 1436; GFX10-PAL: ; %bb.0: ; %bb 1437; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] 1438; GFX10-PAL-NEXT: s_mov_b32 s4, s0 1439; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1440; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 1441; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff 1442; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 1443; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 1444; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 1445; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 1446; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 1447; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 1448; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1449; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 1450; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 1451; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 1452; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 1453; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 1454; GFX10-PAL-NEXT: s_add_u32 s0, 0x4004, s0 1455; GFX10-PAL-NEXT: s_add_u32 s1, 0x4004, s1 1456; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 1457; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 1458; GFX10-PAL-NEXT: s_endpgm 1459bb: 1460 %padding = alloca [4096 x i32], align 4, addrspace(5) 1461 %i = alloca [32 x float], align 4, addrspace(5) 1462 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 1463 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1464 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1465 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 1466 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1467 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1468 %i9 = and i32 %idx, 15 1469 %i10 = 
getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1470 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1471 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1472 ret void 1473} 1474
; amdgpu_ps test with a dynamic, scalar index and a large frame offset.
; %idx is passed inreg. A 4096 x i32 padding alloca is declared ahead of the
; 32 x float array %i and is touched with one volatile load; the body then does
; a volatile store of 15 to %i[%idx] and a volatile load from %i[%idx & 15].
; The expected code shifts the index left by 2 in an SGPR, adds the constant
; 0x4004, and uses that SGPR as the address of the scratch store / load.
1475define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { 1476; GFX9-LABEL: store_load_sindex_large_offset_foo: 1477; GFX9: ; %bb.0: ; %bb 1478; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 1479; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 1480; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1481; GFX9-NEXT: s_lshl_b32 s0, s2, 2 1482; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 1483; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 1484; GFX9-NEXT: s_waitcnt vmcnt(0) 1485; GFX9-NEXT: v_mov_b32_e32 v0, 15 1486; GFX9-NEXT: scratch_store_dword off, v0, s0 1487; GFX9-NEXT: s_and_b32 s0, s2, 15 1488; GFX9-NEXT: s_lshl_b32 s0, s0, 2 1489; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 1490; GFX9-NEXT: scratch_load_dword v0, off, s0 1491; GFX9-NEXT: s_endpgm 1492; 1493; GFX10-LABEL: store_load_sindex_large_offset_foo: 1494; GFX10: ; %bb.0: ; %bb 1495; GFX10-NEXT: s_add_u32 s0, s0, s3 1496; GFX10-NEXT: s_addc_u32 s1, s1, 0 1497; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 1498; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1499; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 1500; GFX10-NEXT: s_and_b32 s0, s2, 15 1501; GFX10-NEXT: s_waitcnt vmcnt(0) 1502; GFX10-NEXT: v_mov_b32_e32 v0, 15 1503; GFX10-NEXT: s_lshl_b32 s1, s2, 2 1504; GFX10-NEXT: s_lshl_b32 s0, s0, 2 1505; GFX10-NEXT: s_add_u32 s1, 0x4004, s1 1506; GFX10-NEXT: s_add_u32 s0, 0x4004, s0 1507; GFX10-NEXT: scratch_store_dword off, v0, s1 1508; GFX10-NEXT: scratch_load_dword v0, off, s0 1509; GFX10-NEXT: s_endpgm 1510; 1511; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo: 1512; GFX9-PAL: ; %bb.0: ; %bb 1513; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1514; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1515; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1516; GFX9-PAL-NEXT: 
s_mov_b32 vcc_hi, 0 1517; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1518; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1519; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1520; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1521; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 1522; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 1523; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 1524; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 1525; GFX9-PAL-NEXT: s_add_u32 s1, 0x4004, s1 1526; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1527; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 1528; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 1529; GFX9-PAL-NEXT: s_add_u32 s0, 0x4004, s0 1530; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 1531; GFX9-PAL-NEXT: s_endpgm 1532; 1533; GFX10-PAL-LABEL: store_load_sindex_large_offset_foo: 1534; GFX10-PAL: ; %bb.0: ; %bb 1535; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 1536; GFX10-PAL-NEXT: s_mov_b32 s2, s0 1537; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1538; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 1539; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1540; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 1541; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 1542; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1543; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1544; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4 1545; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15 1546; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1547; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15 1548; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2 1549; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2 1550; GFX10-PAL-NEXT: s_add_u32 s0, 0x4004, s0 1551; GFX10-PAL-NEXT: s_add_u32 s1, 0x4004, s1 1552; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0 1553; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 1554; GFX10-PAL-NEXT: s_endpgm 1555bb: 1556 %padding = alloca [4096 x i32], align 4, addrspace(5) 1557 %i = alloca [32 x float], align 4, addrspace(5) 1558 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 1559 
%pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1560 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1561 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 1562 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1563 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1564 %i9 = and i32 %idx, 15 1565 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1566 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1567 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1568 ret void 1569} 1570 1571define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { 1572; GFX9-LABEL: store_load_vindex_large_offset_kernel: 1573; GFX9: ; %bb.0: ; %bb 1574; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 1575; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 1576; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1577; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 1578; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1579; GFX9-NEXT: s_waitcnt vmcnt(0) 1580; GFX9-NEXT: v_mov_b32_e32 v1, 0x4004 1581; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 1582; GFX9-NEXT: v_mov_b32_e32 v3, 15 1583; GFX9-NEXT: scratch_store_dword v2, v3, off 1584; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 1585; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 1586; GFX9-NEXT: s_endpgm 1587; 1588; GFX10-LABEL: store_load_vindex_large_offset_kernel: 1589; GFX10: ; %bb.0: ; %bb 1590; GFX10-NEXT: s_add_u32 s0, s0, s3 1591; GFX10-NEXT: s_addc_u32 s1, s1, 0 1592; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 1593; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1594; GFX10-NEXT: v_mov_b32_e32 v1, 0x4004 1595; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1596; GFX10-NEXT: v_mov_b32_e32 v3, 15 1597; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 1598; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 1599; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 1600; GFX10-NEXT: scratch_store_dword v2, v3, off 1601; 
GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 1602; GFX10-NEXT: s_endpgm 1603; 1604; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel: 1605; GFX9-PAL: ; %bb.0: ; %bb 1606; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1607; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1608; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1609; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1610; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1611; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 1612; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1613; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1614; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1615; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1616; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 1617; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1618; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 1619; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0 1620; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 1621; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0 1622; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 1623; GFX9-PAL-NEXT: s_endpgm 1624; 1625; GFX10-PAL-LABEL: store_load_vindex_large_offset_kernel: 1626; GFX10-PAL: ; %bb.0: ; %bb 1627; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 1628; GFX10-PAL-NEXT: s_mov_b32 s2, s0 1629; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1630; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 1631; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1632; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 1633; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 1634; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1635; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1636; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 1637; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1638; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 1639; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 1640; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 1641; GFX10-PAL-NEXT: scratch_load_dword v1, off, off offset:4 1642; GFX10-PAL-NEXT: scratch_store_dword v2, v3, off 1643; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 
1644; GFX10-PAL-NEXT: s_endpgm 1645bb: 1646 %padding = alloca [4096 x i32], align 4, addrspace(5) 1647 %i = alloca [32 x float], align 4, addrspace(5) 1648 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 1649 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1650 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1651 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() 1652 %i3 = zext i32 %i2 to i64 1653 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 1654 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1655 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1656 %i9 = sub nsw i32 31, %i2 1657 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1658 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1659 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1660 ret void 1661} 1662 1663define void @store_load_vindex_large_offset_foo(i32 %idx) { 1664; GFX9-LABEL: store_load_vindex_large_offset_foo: 1665; GFX9: ; %bb.0: ; %bb 1666; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1667; GFX9-NEXT: scratch_load_dword v1, off, s32 1668; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1669; GFX9-NEXT: s_waitcnt vmcnt(0) 1670; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi 1671; GFX9-NEXT: v_mov_b32_e32 v3, 15 1672; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 1673; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 1674; GFX9-NEXT: scratch_store_dword v2, v3, off 1675; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 1676; GFX9-NEXT: scratch_load_dword v0, v0, off 1677; GFX9-NEXT: s_waitcnt vmcnt(0) 1678; GFX9-NEXT: s_setpc_b64 s[30:31] 1679; 1680; GFX10-LABEL: store_load_vindex_large_offset_foo: 1681; GFX10: ; %bb.0: ; %bb 1682; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1683; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1684; GFX10-NEXT: v_mov_b32_e32 v1, 15 1685; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1686; GFX10-NEXT: ; 
implicit-def: $vcc_hi 1687; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo 1688; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 1689; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 1690; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 1691; GFX10-NEXT: scratch_load_dword v3, off, s32 1692; GFX10-NEXT: scratch_store_dword v0, v1, off 1693; GFX10-NEXT: scratch_load_dword v0, v2, off 1694; GFX10-NEXT: s_waitcnt vmcnt(0) 1695; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1696; GFX10-NEXT: s_setpc_b64 s[30:31] 1697; 1698; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo: 1699; GFX9-PAL: ; %bb.0: ; %bb 1700; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1701; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 1702; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000 1703; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1704; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi 1705; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 1706; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 1707; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3 1708; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 1709; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 1710; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off 1711; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1712; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 1713; 1714; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo: 1715; GFX10-PAL: ; %bb.0: ; %bb 1716; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1717; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1718; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 1719; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000 1720; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi 1721; GFX10-PAL-NEXT: v_mov_b32_e32 v2, vcc_lo 1722; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1 1723; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2 1724; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2 1725; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 1726; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off 1727; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off 1728; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1729; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1730; 
GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 1731bb: 1732 %padding = alloca [4096 x i32], align 4, addrspace(5) 1733 %i = alloca [32 x float], align 4, addrspace(5) 1734 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 1735 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1736 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1737 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 1738 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1739 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1740 %i9 = and i32 %idx, 15 1741 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1742 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1743 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1744 ret void 1745} 1746 1747define amdgpu_kernel void @store_load_large_imm_offset_kernel() { 1748; GFX9-LABEL: store_load_large_imm_offset_kernel: 1749; GFX9: ; %bb.0: ; %bb 1750; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 1751; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 1752; GFX9-NEXT: s_movk_i32 s0, 0x3000 1753; GFX9-NEXT: v_mov_b32_e32 v0, 13 1754; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1755; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 1756; GFX9-NEXT: s_add_u32 s0, 4, s0 1757; GFX9-NEXT: v_mov_b32_e32 v0, 15 1758; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 1759; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 1760; GFX9-NEXT: s_endpgm 1761; 1762; GFX10-LABEL: store_load_large_imm_offset_kernel: 1763; GFX10: ; %bb.0: ; %bb 1764; GFX10-NEXT: s_add_u32 s0, s0, s3 1765; GFX10-NEXT: s_addc_u32 s1, s1, 0 1766; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 1767; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1768; GFX10-NEXT: v_mov_b32_e32 v0, 13 1769; GFX10-NEXT: v_mov_b32_e32 v1, 15 1770; GFX10-NEXT: s_movk_i32 s0, 0x3800 1771; GFX10-NEXT: s_add_u32 s0, 4, s0 
1772; GFX10-NEXT: scratch_store_dword off, v0, off offset:4 1773; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 1774; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 1775; GFX10-NEXT: s_endpgm 1776; 1777; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel: 1778; GFX9-PAL: ; %bb.0: ; %bb 1779; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1780; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1781; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1782; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 1783; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1784; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 1785; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1786; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1787; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1788; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1789; GFX9-PAL-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 1790; GFX9-PAL-NEXT: s_add_u32 s0, 4, s0 1791; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 1792; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 1793; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 1794; GFX9-PAL-NEXT: s_endpgm 1795; 1796; GFX10-PAL-LABEL: store_load_large_imm_offset_kernel: 1797; GFX10-PAL: ; %bb.0: ; %bb 1798; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 1799; GFX10-PAL-NEXT: s_mov_b32 s2, s0 1800; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1801; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 1802; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1803; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1 1804; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 1805; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1806; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1807; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13 1808; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 1809; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800 1810; GFX10-PAL-NEXT: s_add_u32 s0, 4, s0 1811; GFX10-PAL-NEXT: scratch_store_dword off, v0, off offset:4 1812; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 1813; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 1814; GFX10-PAL-NEXT: 
s_endpgm 1815bb: 1816 %i = alloca [4096 x i32], align 4, addrspace(5) 1817 %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef 1818 store volatile i32 13, i32 addrspace(5)* %i1, align 4 1819 %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 1820 store volatile i32 15, i32 addrspace(5)* %i7, align 4 1821 %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 1822 %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4 1823 ret void 1824} 1825 1826define void @store_load_large_imm_offset_foo() { 1827; GFX9-LABEL: store_load_large_imm_offset_foo: 1828; GFX9: ; %bb.0: ; %bb 1829; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1830; GFX9-NEXT: s_movk_i32 s0, 0x3000 1831; GFX9-NEXT: v_mov_b32_e32 v0, 13 1832; GFX9-NEXT: scratch_store_dword off, v0, s32 1833; GFX9-NEXT: s_add_u32 s0, s32, s0 1834; GFX9-NEXT: v_mov_b32_e32 v0, 15 1835; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 1836; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 1837; GFX9-NEXT: s_waitcnt vmcnt(0) 1838; GFX9-NEXT: s_setpc_b64 s[30:31] 1839; 1840; GFX10-LABEL: store_load_large_imm_offset_foo: 1841; GFX10: ; %bb.0: ; %bb 1842; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1843; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1844; GFX10-NEXT: v_mov_b32_e32 v0, 13 1845; GFX10-NEXT: v_mov_b32_e32 v1, 15 1846; GFX10-NEXT: s_movk_i32 s0, 0x3800 1847; GFX10-NEXT: ; implicit-def: $vcc_hi 1848; GFX10-NEXT: s_add_u32 s0, s32, s0 1849; GFX10-NEXT: scratch_store_dword off, v0, s32 1850; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 1851; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 1852; GFX10-NEXT: s_waitcnt vmcnt(0) 1853; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1854; GFX10-NEXT: s_setpc_b64 s[30:31] 1855; 1856; GFX9-PAL-LABEL: store_load_large_imm_offset_foo: 1857; GFX9-PAL: ; %bb.0: ; %bb 1858; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1859; 
GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 1860; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 1861; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 1862; GFX9-PAL-NEXT: s_add_u32 s0, s32, s0 1863; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 1864; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 1865; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 1866; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1867; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 1868; 1869; GFX10-PAL-LABEL: store_load_large_imm_offset_foo: 1870; GFX10-PAL: ; %bb.0: ; %bb 1871; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1872; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1873; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13 1874; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 1875; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800 1876; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi 1877; GFX10-PAL-NEXT: s_add_u32 s0, s32, s0 1878; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32 1879; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 1880; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 1881; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 1882; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1883; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 1884bb: 1885 %i = alloca [4096 x i32], align 4, addrspace(5) 1886 %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef 1887 store volatile i32 13, i32 addrspace(5)* %i1, align 4 1888 %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 1889 store volatile i32 15, i32 addrspace(5)* %i7, align 4 1890 %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 1891 %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4 1892 ret void 1893} 1894 1895define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { 1896; GFX9-LABEL: store_load_vidx_sidx_offset: 1897; GFX9: ; %bb.0: ; %bb 1898; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 1899; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 1900; GFX9-NEXT: v_mov_b32_e32 
v1, 4 1901; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1902; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1903; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 1904; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 1905; GFX9-NEXT: v_mov_b32_e32 v1, 15 1906; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024 1907; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 1908; GFX9-NEXT: s_endpgm 1909; 1910; GFX10-LABEL: store_load_vidx_sidx_offset: 1911; GFX10: ; %bb.0: ; %bb 1912; GFX10-NEXT: s_add_u32 s2, s2, s5 1913; GFX10-NEXT: s_addc_u32 s3, s3, 0 1914; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1915; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1916; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 1917; GFX10-NEXT: v_mov_b32_e32 v1, 15 1918; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1919; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 1920; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, 4 1921; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024 1922; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 1923; GFX10-NEXT: s_endpgm 1924; 1925; GFX9-PAL-LABEL: store_load_vidx_sidx_offset: 1926; GFX9-PAL: ; %bb.0: ; %bb 1927; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 1928; GFX9-PAL-NEXT: s_mov_b32 s4, s0 1929; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1930; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 1931; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4 1932; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1933; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 1934; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 1935; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0 1936; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 1937; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 1938; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 1939; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 1940; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 1941; GFX9-PAL-NEXT: s_endpgm 1942; 1943; GFX10-PAL-LABEL: store_load_vidx_sidx_offset: 1944; GFX10-PAL: ; %bb.0: ; %bb 1945; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] 1946; GFX10-PAL-NEXT: s_mov_b32 s4, s0 
1947; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1948; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 1949; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff 1950; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 1951; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 1952; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 1953; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 1954; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x24 1955; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 1956; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 1957; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, s0, v0 1958; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 4 1959; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 1960; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 1961; GFX10-PAL-NEXT: s_endpgm 1962bb: 1963 %alloca = alloca [32 x i32], align 4, addrspace(5) 1964 %vidx = tail call i32 @llvm.amdgcn.workitem.id.x() 1965 %add1 = add nsw i32 %sidx, %vidx 1966 %add2 = add nsw i32 %add1, 256 1967 %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2 1968 store volatile i32 15, i32 addrspace(5)* %gep, align 4 1969 %load = load volatile i32, i32 addrspace(5)* %gep, align 4 1970 ret void 1971} 1972 1973define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) { 1974; GFX9-LABEL: store_load_i64_aligned: 1975; GFX9: ; %bb.0: ; %bb 1976; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1977; GFX9-NEXT: v_mov_b32_e32 v1, 15 1978; GFX9-NEXT: v_mov_b32_e32 v2, 0 1979; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off 1980; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off 1981; GFX9-NEXT: s_waitcnt vmcnt(0) 1982; GFX9-NEXT: s_setpc_b64 s[30:31] 1983; 1984; GFX10-LABEL: store_load_i64_aligned: 1985; GFX10: ; %bb.0: ; %bb 1986; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1987; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1988; GFX10-NEXT: v_mov_b32_e32 v1, 15 1989; GFX10-NEXT: v_mov_b32_e32 v2, 0 1990; GFX10-NEXT: ; implicit-def: $vcc_hi 1991; GFX10-NEXT: 
scratch_store_dwordx2 v0, v[1:2], off 1992; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off 1993; GFX10-NEXT: s_waitcnt vmcnt(0) 1994; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1995; GFX10-NEXT: s_setpc_b64 s[30:31] 1996; 1997; GFX9-PAL-LABEL: store_load_i64_aligned: 1998; GFX9-PAL: ; %bb.0: ; %bb 1999; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2000; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 2001; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 0 2002; GFX9-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off 2003; GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off 2004; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2005; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 2006; 2007; GFX10-PAL-LABEL: store_load_i64_aligned: 2008; GFX10-PAL: ; %bb.0: ; %bb 2009; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2010; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2011; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 2012; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 0 2013; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi 2014; GFX10-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off 2015; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off 2016; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 2017; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2018; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 2019bb: 2020 store volatile i64 15, i64 addrspace(5)* %arg, align 8 2021 %load = load volatile i64, i64 addrspace(5)* %arg, align 8 2022 ret void 2023} 2024 2025define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) { 2026; GFX9-LABEL: store_load_i64_unaligned: 2027; GFX9: ; %bb.0: ; %bb 2028; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2029; GFX9-NEXT: v_mov_b32_e32 v1, 15 2030; GFX9-NEXT: v_mov_b32_e32 v2, 0 2031; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off 2032; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off 2033; GFX9-NEXT: s_waitcnt vmcnt(0) 2034; GFX9-NEXT: s_setpc_b64 s[30:31] 2035; 2036; GFX10-LABEL: store_load_i64_unaligned: 2037; GFX10: ; %bb.0: ; %bb 2038; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2039; GFX10-NEXT: 
s_waitcnt_vscnt null, 0x0 2040; GFX10-NEXT: v_mov_b32_e32 v1, 15 2041; GFX10-NEXT: v_mov_b32_e32 v2, 0 2042; GFX10-NEXT: ; implicit-def: $vcc_hi 2043; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off 2044; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off 2045; GFX10-NEXT: s_waitcnt vmcnt(0) 2046; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2047; GFX10-NEXT: s_setpc_b64 s[30:31] 2048; 2049; GFX9-PAL-LABEL: store_load_i64_unaligned: 2050; GFX9-PAL: ; %bb.0: ; %bb 2051; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2052; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 2053; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 0 2054; GFX9-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off 2055; GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off 2056; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2057; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 2058; 2059; GFX10-PAL-LABEL: store_load_i64_unaligned: 2060; GFX10-PAL: ; %bb.0: ; %bb 2061; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2062; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2063; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 2064; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 0 2065; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi 2066; GFX10-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off 2067; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off 2068; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 2069; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2070; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 2071bb: 2072 store volatile i64 15, i64 addrspace(5)* %arg, align 1 2073 %load = load volatile i64, i64 addrspace(5)* %arg, align 1 2074 ret void 2075} 2076 2077define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) { 2078; GFX9-LABEL: store_load_v3i32_unaligned: 2079; GFX9: ; %bb.0: ; %bb 2080; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2081; GFX9-NEXT: v_mov_b32_e32 v1, 1 2082; GFX9-NEXT: v_mov_b32_e32 v2, 2 2083; GFX9-NEXT: v_mov_b32_e32 v3, 3 2084; GFX9-NEXT: scratch_store_dwordx3 v0, v[1:3], off 2085; GFX9-NEXT: scratch_load_dwordx3 v[0:2], v0, off 2086; GFX9-NEXT: 
s_waitcnt vmcnt(0) 2087; GFX9-NEXT: s_setpc_b64 s[30:31] 2088; 2089; GFX10-LABEL: store_load_v3i32_unaligned: 2090; GFX10: ; %bb.0: ; %bb 2091; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2092; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2093; GFX10-NEXT: v_mov_b32_e32 v1, 1 2094; GFX10-NEXT: v_mov_b32_e32 v2, 2 2095; GFX10-NEXT: v_mov_b32_e32 v3, 3 2096; GFX10-NEXT: ; implicit-def: $vcc_hi 2097; GFX10-NEXT: scratch_store_dwordx3 v0, v[1:3], off 2098; GFX10-NEXT: scratch_load_dwordx3 v[0:2], v0, off 2099; GFX10-NEXT: s_waitcnt vmcnt(0) 2100; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2101; GFX10-NEXT: s_setpc_b64 s[30:31] 2102; 2103; GFX9-PAL-LABEL: store_load_v3i32_unaligned: 2104; GFX9-PAL: ; %bb.0: ; %bb 2105; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2106; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 2107; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 2 2108; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 3 2109; GFX9-PAL-NEXT: scratch_store_dwordx3 v0, v[1:3], off 2110; GFX9-PAL-NEXT: scratch_load_dwordx3 v[0:2], v0, off 2111; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2112; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 2113; 2114; GFX10-PAL-LABEL: store_load_v3i32_unaligned: 2115; GFX10-PAL: ; %bb.0: ; %bb 2116; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2117; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2118; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 1 2119; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 2 2120; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 3 2121; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi 2122; GFX10-PAL-NEXT: scratch_store_dwordx3 v0, v[1:3], off 2123; GFX10-PAL-NEXT: scratch_load_dwordx3 v[0:2], v0, off 2124; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 2125; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2126; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 2127bb: 2128 store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1 2129 %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1 2130 ret void 2131} 2132 2133define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture 
%arg) { 2134; GFX9-LABEL: store_load_v4i32_unaligned: 2135; GFX9: ; %bb.0: ; %bb 2136; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2137; GFX9-NEXT: v_mov_b32_e32 v1, 1 2138; GFX9-NEXT: v_mov_b32_e32 v2, 2 2139; GFX9-NEXT: v_mov_b32_e32 v3, 3 2140; GFX9-NEXT: v_mov_b32_e32 v4, 4 2141; GFX9-NEXT: scratch_store_dwordx4 v0, v[1:4], off 2142; GFX9-NEXT: scratch_load_dwordx4 v[0:3], v0, off 2143; GFX9-NEXT: s_waitcnt vmcnt(0) 2144; GFX9-NEXT: s_setpc_b64 s[30:31] 2145; 2146; GFX10-LABEL: store_load_v4i32_unaligned: 2147; GFX10: ; %bb.0: ; %bb 2148; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2149; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2150; GFX10-NEXT: v_mov_b32_e32 v1, 1 2151; GFX10-NEXT: v_mov_b32_e32 v2, 2 2152; GFX10-NEXT: v_mov_b32_e32 v3, 3 2153; GFX10-NEXT: v_mov_b32_e32 v4, 4 2154; GFX10-NEXT: ; implicit-def: $vcc_hi 2155; GFX10-NEXT: scratch_store_dwordx4 v0, v[1:4], off 2156; GFX10-NEXT: scratch_load_dwordx4 v[0:3], v0, off 2157; GFX10-NEXT: s_waitcnt vmcnt(0) 2158; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2159; GFX10-NEXT: s_setpc_b64 s[30:31] 2160; 2161; GFX9-PAL-LABEL: store_load_v4i32_unaligned: 2162; GFX9-PAL: ; %bb.0: ; %bb 2163; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2164; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 2165; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 2 2166; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 3 2167; GFX9-PAL-NEXT: v_mov_b32_e32 v4, 4 2168; GFX9-PAL-NEXT: scratch_store_dwordx4 v0, v[1:4], off 2169; GFX9-PAL-NEXT: scratch_load_dwordx4 v[0:3], v0, off 2170; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2171; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 2172; 2173; GFX10-PAL-LABEL: store_load_v4i32_unaligned: 2174; GFX10-PAL: ; %bb.0: ; %bb 2175; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2176; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2177; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 1 2178; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 2 2179; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 3 2180; GFX10-PAL-NEXT: v_mov_b32_e32 v4, 4 2181; GFX10-PAL-NEXT: ; implicit-def: 
$vcc_hi 2182; GFX10-PAL-NEXT: scratch_store_dwordx4 v0, v[1:4], off 2183; GFX10-PAL-NEXT: scratch_load_dwordx4 v[0:3], v0, off 2184; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 2185; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2186; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 2187bb: 2188 store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1 2189 %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1 2190 ret void 2191} 2192 2193declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg) 2194declare i32 @llvm.amdgcn.workitem.id.x() 2195