; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck --check-prefix=MUBUF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=FLATSCR %s

; Make sure the correct frame offset is used with the local
; frame area.
;
; %pin.low is allocated to offset 0.
;
; %local.area is assigned to the local frame offset by the
; LocalStackSlotAllocation pass at offset 4096.
;
; The %load1 access to %gep.large.offset initially used the stack
; pointer register and directly referenced the frame index. After
; LocalStackSlotAllocation, it would no longer refer to a frame index
; so eliminateFrameIndex would not adjust the access to use the
; correct FP offset.

define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out) {
; MUBUF-LABEL: local_stack_offset_uses_sp:
; MUBUF: ; %bb.0: ; %entry
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; MUBUF-NEXT: s_add_u32 s0, s0, s9
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x3000
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
; MUBUF-NEXT: v_add_u32_e32 v0, 64, v1
; MUBUF-NEXT: v_mov_b32_e32 v2, 0
; MUBUF-NEXT: v_mov_b32_e32 v3, 0x2000
; MUBUF-NEXT: s_mov_b32 s6, 0
; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: .LBB0_1: ; %loadstoreloop
; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1
; MUBUF-NEXT: v_add_u32_e32 v3, s6, v1
; MUBUF-NEXT: s_add_i32 s6, s6, 1
; MUBUF-NEXT: s_cmpk_lt_u32 s6, 0x2120
; MUBUF-NEXT: buffer_store_byte v2, v3, s[0:3], 0 offen
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_cbranch_scc1 .LBB0_1
; MUBUF-NEXT: ; %bb.2: ; %split
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x3000
; MUBUF-NEXT: v_add_u32_e32 v1, 0x20d0, v1
; MUBUF-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v2, v4
; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v5, vcc
; MUBUF-NEXT: v_mov_b32_e32 v2, 0
; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
; MUBUF-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_endpgm
;
; FLATSCR-LABEL: local_stack_offset_uses_sp:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: s_movk_i32 vcc_hi, 0x2000
; FLATSCR-NEXT: s_mov_b32 s2, 0
; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: .LBB0_1: ; %loadstoreloop
; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
; FLATSCR-NEXT: s_add_i32 s3, s2, 0x3000
; FLATSCR-NEXT: s_add_i32 s2, s2, 1
; FLATSCR-NEXT: s_cmpk_lt_u32 s2, 0x2120
; FLATSCR-NEXT: scratch_store_byte off, v0, s3
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_1
; FLATSCR-NEXT: ; %bb.2: ; %split
; FLATSCR-NEXT: s_movk_i32 s2, 0x2000
; FLATSCR-NEXT: s_addk_i32 s2, 0x3000
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s2 offset:208 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_movk_i32 s2, 0x3000
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s2 offset:64 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_endpgm
entry:
  %pin.low = alloca i32, align 8192, addrspace(5)
  %local.area = alloca [1060 x i64], align 4096, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %pin.low
  %local.area.cast = bitcast [1060 x i64] addrspace(5)* %local.area to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i32(i8 addrspace(5)* align 4 %local.area.cast, i8 0, i32 8480, i1 true)
  %gep.large.offset = getelementptr inbounds [1060 x i64], [1060 x i64] addrspace(5)* %local.area, i64 0, i64 1050
  %gep.small.offset = getelementptr inbounds [1060 x i64], [1060 x i64] addrspace(5)* %local.area, i64 0, i64 8
  %load0 = load volatile i64, i64 addrspace(5)* %gep.large.offset
  %load1 = load volatile i64, i64 addrspace(5)* %gep.small.offset
  %add0 = add i64 %load0, %load1
  store volatile i64 %add0, i64 addrspace(1)* %out
  ret void
}

; Same IR body as @local_stack_offset_uses_sp, but as a non-kernel
; (callable) function, so the accesses are relative to the incoming
; s32/s33 stack/frame pointers rather than offset 0 of scratch.
define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out) {
; MUBUF-LABEL: func_local_stack_offset_uses_sp:
; MUBUF: ; %bb.0: ; %entry
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT: s_mov_b32 s5, s33
; MUBUF-NEXT: s_add_i32 s33, s32, 0x7ffc0
; MUBUF-NEXT: s_and_b32 s33, s33, 0xfff80000
; MUBUF-NEXT: v_lshrrev_b32_e64 v3, 6, s33
; MUBUF-NEXT: v_add_u32_e32 v3, 0x3000, v3
; MUBUF-NEXT: v_add_u32_e32 v2, 64, v3
; MUBUF-NEXT: v_mov_b32_e32 v4, 0
; MUBUF-NEXT: v_mov_b32_e32 v5, 0x2000
; MUBUF-NEXT: s_mov_b32 s4, 0
; MUBUF-NEXT: s_add_i32 s32, s32, 0x200000
; MUBUF-NEXT: buffer_store_dword v4, v5, s[0:3], s33 offen
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: .LBB1_1: ; %loadstoreloop
; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1
; MUBUF-NEXT: v_add_u32_e32 v5, s4, v3
; MUBUF-NEXT: s_add_i32 s4, s4, 1
; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2120
; MUBUF-NEXT: buffer_store_byte v4, v5, s[0:3], 0 offen
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_cbranch_scc1 .LBB1_1
; MUBUF-NEXT: ; %bb.2: ; %split
; MUBUF-NEXT: v_lshrrev_b32_e64 v3, 6, s33
; MUBUF-NEXT: v_add_u32_e32 v3, 0x3000, v3
; MUBUF-NEXT: v_add_u32_e32 v3, 0x20d0, v3
; MUBUF-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_load_dword v5, v3, s[0:3], 0 offen offset:4 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:4 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_add_i32 s32, s32, 0xffe00000
; MUBUF-NEXT: s_mov_b32 s33, s5
; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v4, v6
; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v7, vcc
; MUBUF-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: func_local_stack_offset_uses_sp:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: s_mov_b32 s2, s33
; FLATSCR-NEXT: s_add_i32 s33, s32, 0x1fff
; FLATSCR-NEXT: s_and_b32 s33, s33, 0xffffe000
; FLATSCR-NEXT: s_add_i32 s32, s32, 0x8000
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
; FLATSCR-NEXT: s_add_i32 vcc_hi, s33, 0x2000
; FLATSCR-NEXT: s_mov_b32 s0, 0
; FLATSCR-NEXT: scratch_store_dword off, v2, vcc_hi
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: .LBB1_1: ; %loadstoreloop
; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
; FLATSCR-NEXT: s_add_i32 vcc_hi, s33, 0x3000
; FLATSCR-NEXT: s_add_i32 s1, s0, vcc_hi
; FLATSCR-NEXT: s_add_i32 s0, s0, 1
; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120
; FLATSCR-NEXT: scratch_store_byte off, v2, s1
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_cbranch_scc1 .LBB1_1
; FLATSCR-NEXT: ; %bb.2: ; %split
; FLATSCR-NEXT: s_movk_i32 s0, 0x2000
; FLATSCR-NEXT: s_add_i32 s1, s33, 0x3000
; FLATSCR-NEXT: s_add_i32 s0, s0, s1
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:208 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_add_i32 s0, s33, 0x3000
; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:64 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_addk_i32 s32, 0x8000
; FLATSCR-NEXT: s_mov_b32 s33, s2
; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
; FLATSCR-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
entry:
  %pin.low = alloca i32, align 8192, addrspace(5)
  %local.area = alloca [1060 x i64], align 4096, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %pin.low
  %local.area.cast = bitcast [1060 x i64] addrspace(5)* %local.area to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i32(i8 addrspace(5)* align 4 %local.area.cast, i8 0, i32 8480, i1 true)
  %gep.large.offset = getelementptr inbounds [1060 x i64], [1060 x i64] addrspace(5)* %local.area, i64 0, i64 1050
  %gep.small.offset = getelementptr inbounds [1060 x i64], [1060 x i64] addrspace(5)* %local.area, i64 0, i64 8
  %load0 = load volatile i64, i64 addrspace(5)* %gep.large.offset
  %load1 = load volatile i64, i64 addrspace(5)* %gep.small.offset
  %add0 = add i64 %load0, %load1
  store volatile i64 %add0, i64 addrspace(1)* %out
  ret void
}

; Variant with <3 x i64> (24-byte) elements: %gep.large.offset is
; element 150 and %gep.small.offset is element 0 of %local.area, so
; the accesses straddle multiple dword loads/stores per element.
define amdgpu_kernel void @local_stack_offset_uses_sp_flat(<3 x i64> addrspace(1)* %out) {
; MUBUF-LABEL: local_stack_offset_uses_sp_flat:
; MUBUF: ; %bb.0: ; %entry
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; MUBUF-NEXT: s_add_u32 s0, s0, s9
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
; MUBUF-NEXT: v_mov_b32_e32 v0, 0x4000
; MUBUF-NEXT: v_mov_b32_e32 v1, 0
; MUBUF-NEXT: v_mov_b32_e32 v2, 0x2000
; MUBUF-NEXT: s_mov_b32 s6, 0
; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: .LBB2_1: ; %loadstoreloop
; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1
; MUBUF-NEXT: v_add_u32_e32 v2, s6, v0
; MUBUF-NEXT: s_add_i32 s6, s6, 1
; MUBUF-NEXT: s_cmpk_lt_u32 s6, 0x2120
; MUBUF-NEXT: buffer_store_byte v1, v2, s[0:3], 0 offen
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_cbranch_scc1 .LBB2_1
; MUBUF-NEXT: ; %bb.2: ; %split
; MUBUF-NEXT: v_mov_b32_e32 v0, 0x4000
; MUBUF-NEXT: v_or_b32_e32 v2, 0x12d4, v0
; MUBUF-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_or_b32_e32 v2, 0x12d0, v0
; MUBUF-NEXT: v_or_b32_e32 v1, 0x12c0, v0
; MUBUF-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_or_b32_e32 v2, 0x12c4, v0
; MUBUF-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_or_b32_e32 v1, 0x12cc, v0
; MUBUF-NEXT: v_or_b32_e32 v0, 0x12c8, v0
; MUBUF-NEXT: v_mov_b32_e32 v13, 0x4000
; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_mov_b32_e32 v12, 0
; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_load_dword v8, v13, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_mov_b32_e32 v13, 0x4000
; MUBUF-NEXT: buffer_load_dword v9, v13, s[0:3], 0 offen offset:4 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_mov_b32_e32 v13, 0x4000
; MUBUF-NEXT: buffer_load_dword v2, v13, s[0:3], 0 offen offset:8 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_mov_b32_e32 v13, 0x4000
; MUBUF-NEXT: buffer_load_dword v3, v13, s[0:3], 0 offen offset:12 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_mov_b32_e32 v13, 0x4000
; MUBUF-NEXT: buffer_load_dword v10, v13, s[0:3], 0 offen offset:16 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_mov_b32_e32 v13, 0x4000
; MUBUF-NEXT: buffer_load_dword v11, v13, s[0:3], 0 offen offset:20 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2
; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v7, v8
; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v9, vcc
; MUBUF-NEXT: v_add_co_u32_e32 v4, vcc, v4, v10
; MUBUF-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v11, vcc
; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
; MUBUF-NEXT: global_store_dwordx2 v12, v[4:5], s[4:5] offset:16
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: global_store_dwordx4 v12, v[0:3], s[4:5]
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_endpgm
;
; FLATSCR-LABEL: local_stack_offset_uses_sp_flat:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
; FLATSCR-NEXT: s_mov_b32 s2, 0
; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi offset:1024
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: .LBB2_1: ; %loadstoreloop
; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
; FLATSCR-NEXT: s_add_i32 s3, s2, 0x2000
; FLATSCR-NEXT: s_add_i32 s2, s2, 1
; FLATSCR-NEXT: s_cmpk_lt_u32 s2, 0x2120
; FLATSCR-NEXT: scratch_store_byte off, v0, s3
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_cbranch_scc1 .LBB2_1
; FLATSCR-NEXT: ; %bb.2: ; %split
; FLATSCR-NEXT: s_movk_i32 s2, 0x1000
; FLATSCR-NEXT: s_addk_i32 s2, 0x2000
; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s2 offset:720 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 offset:704 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_movk_i32 s2, 0x2000
; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s2 offset:16 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_movk_i32 s2, 0x2000
; FLATSCR-NEXT: scratch_load_dwordx4 v[4:7], off, s2 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: v_mov_b32_e32 v12, 0
; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
; FLATSCR-NEXT: v_add_co_u32_e32 v4, vcc, v8, v10
; FLATSCR-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v11, vcc
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT: global_store_dwordx2 v12, v[4:5], s[0:1] offset:16
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_endpgm
entry:
  %pin.low = alloca i32, align 1024, addrspace(5)
  %local.area = alloca [160 x <3 x i64>], align 8192, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %pin.low
  %local.area.cast = bitcast [160 x <3 x i64>] addrspace(5)* %local.area to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i32(i8 addrspace(5)* align 4 %local.area.cast, i8 0, i32 8480, i1 true)
  %gep.large.offset = getelementptr inbounds [160 x <3 x i64>], [160 x <3 x i64>] addrspace(5)* %local.area, i64 0, i64 150
  %gep.small.offset = getelementptr inbounds [160 x <3 x i64>], [160 x <3 x i64>] addrspace(5)* %local.area, i64 0, i64 0
  %load0 = load volatile <3 x i64>, <3 x i64> addrspace(5)* %gep.large.offset
  %load1 = load volatile <3 x i64>, <3 x i64> addrspace(5)* %gep.small.offset
  %add0 = add <3 x i64> %load0, %load1
  store volatile <3 x i64> %add0, <3 x i64> addrspace(1)* %out
  ret void
}

declare void @llvm.memset.p5i8.i32(i8 addrspace(5)* nocapture writeonly, i8, i32, i1 immarg) #0

attributes #0 = { argmemonly nounwind willreturn writeonly }