; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=DEFAULTSIZE,MUBUF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=ASSUME1024,MUBUF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=DEFAULTSIZE,FLATSCR %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=ASSUME1024,FLATSCR %s

; FIXME: Generated test checks do not check metadata at the end of the
; function, so this also includes manually added checks.

; Test that we can select a statically sized alloca outside of the
; entry block.

; FIXME: FunctionLoweringInfo unhelpfully doesn't preserve an
; alignment less than the stack alignment.
define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) #1 {
; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
; MUBUF:       ; %bb.0: ; %entry
; MUBUF-NEXT:    s_add_u32 s0, s0, s9
; MUBUF-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
; MUBUF-NEXT:    s_movk_i32 s32, 0x400
; MUBUF-NEXT:    s_mov_b32 s33, 0
; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
; MUBUF-NEXT:    s_cmp_lg_u32 s8, 0
; MUBUF-NEXT:    s_cbranch_scc1 .LBB0_3
; MUBUF-NEXT:  ; %bb.1: ; %bb.0
; MUBUF-NEXT:    s_cmp_lg_u32 s9, 0
; MUBUF-NEXT:    s_cbranch_scc1 .LBB0_3
; MUBUF-NEXT:  ; %bb.2: ; %bb.1
; MUBUF-NEXT:    s_add_i32 s6, s32, 0x1000
; MUBUF-NEXT:    s_lshl_b32 s7, s10, 2
; MUBUF-NEXT:    s_mov_b32 s32, s6
; MUBUF-NEXT:    v_mov_b32_e32 v1, 0
; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
; MUBUF-NEXT:    v_mov_b32_e32 v3, 1
; MUBUF-NEXT:    s_add_i32 s6, s6, s7
; MUBUF-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
; MUBUF-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_add_u32_e32 v0, v2, v0
; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
; MUBUF-NEXT:    global_store_dword v1, v0, s[4:5]
; MUBUF-NEXT:  .LBB0_3: ; %bb.2
; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
; MUBUF-NEXT:    global_store_dword v[0:1], v0, off
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_endpgm
;
; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
; FLATSCR:       ; %bb.0: ; %entry
; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; FLATSCR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; FLATSCR-NEXT:    s_mov_b32 s32, 16
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT:    s_cmp_lg_u32 s4, 0
; FLATSCR-NEXT:    s_cbranch_scc1 .LBB0_3
; FLATSCR-NEXT:  ; %bb.1: ; %bb.0
; FLATSCR-NEXT:    s_cmp_lg_u32 s5, 0
; FLATSCR-NEXT:    s_cbranch_scc1 .LBB0_3
; FLATSCR-NEXT:  ; %bb.2: ; %bb.1
; FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
; FLATSCR-NEXT:    s_add_i32 s2, s32, 0x1000
; FLATSCR-NEXT:    v_mov_b32_e32 v2, 1
; FLATSCR-NEXT:    s_lshl_b32 s3, s6, 2
; FLATSCR-NEXT:    s_mov_b32 s32, s2
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s2
; FLATSCR-NEXT:    s_add_i32 s2, s2, s3
; FLATSCR-NEXT:    scratch_load_dword v2, off, s2
; FLATSCR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    v_add_u32_e32 v0, v2, v0
; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT:    global_store_dword v1, v0, s[0:1]
; FLATSCR-NEXT:  .LBB0_3: ; %bb.2
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
; FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_endpgm

entry:
  %cond0 = icmp eq i32 %arg.cond0, 0
  br i1 %cond0, label %bb.0, label %bb.2

bb.0:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
  %cond1 = icmp eq i32 %arg.cond1, 0
  br i1 %cond1, label %bb.1, label %bb.2

bb.1:
  ; Use the alloca outside of the defining block.
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
  %load = load i32, i32 addrspace(5)* %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, i32 addrspace(1)* %out
  br label %bb.2

bb.2:
  store volatile i32 0, i32 addrspace(1)* undef
  ret void
}
; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112
; DEFAULTSIZE: ; ScratchSize: 4112

; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
; ASSUME1024: ; ScratchSize: 1040

define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) {
; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
; MUBUF:       ; %bb.0: ; %entry
; MUBUF-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
; MUBUF-NEXT:    s_add_u32 s0, s0, s9
; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
; MUBUF-NEXT:    s_movk_i32 s32, 0x1000
; MUBUF-NEXT:    s_mov_b32 s33, 0
; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
; MUBUF-NEXT:    s_cmp_lg_u32 s6, 0
; MUBUF-NEXT:    s_cbranch_scc1 .LBB1_2
; MUBUF-NEXT:  ; %bb.1: ; %bb.0
; MUBUF-NEXT:    s_add_i32 s6, s32, 0x1000
; MUBUF-NEXT:    s_and_b32 s6, s6, 0xfffff000
; MUBUF-NEXT:    s_lshl_b32 s7, s7, 2
; MUBUF-NEXT:    s_mov_b32 s32, s6
; MUBUF-NEXT:    v_mov_b32_e32 v1, 0
; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
; MUBUF-NEXT:    v_mov_b32_e32 v3, 1
; MUBUF-NEXT:    s_add_i32 s6, s6, s7
; MUBUF-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
; MUBUF-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_add_u32_e32 v0, v2, v0
; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
; MUBUF-NEXT:    global_store_dword v1, v0, s[4:5]
; MUBUF-NEXT:  .LBB1_2: ; %bb.1
; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
; MUBUF-NEXT:    global_store_dword v[0:1], v0, off
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_endpgm
;
; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
; FLATSCR:       ; %bb.0: ; %entry
; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; FLATSCR-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
; FLATSCR-NEXT:    s_mov_b32 s32, 64
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT:    s_cmp_lg_u32 s2, 0
; FLATSCR-NEXT:    s_cbranch_scc1 .LBB1_2
; FLATSCR-NEXT:  ; %bb.1: ; %bb.0
; FLATSCR-NEXT:    s_add_i32 s2, s32, 0x1000
; FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
; FLATSCR-NEXT:    s_and_b32 s2, s2, 0xfffff000
; FLATSCR-NEXT:    v_mov_b32_e32 v2, 1
; FLATSCR-NEXT:    s_lshl_b32 s3, s3, 2
; FLATSCR-NEXT:    s_mov_b32 s32, s2
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s2
; FLATSCR-NEXT:    s_add_i32 s2, s2, s3
; FLATSCR-NEXT:    scratch_load_dword v2, off, s2
; FLATSCR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    v_add_u32_e32 v0, v2, v0
; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT:    global_store_dword v1, v0, s[0:1]
; FLATSCR-NEXT:  .LBB1_2: ; %bb.1
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
; FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_endpgm
entry:
  %cond = icmp eq i32 %arg.cond, 0
  br i1 %cond, label %bb.0, label %bb.1

bb.0:
  %alloca = alloca [16 x i32], align 64, addrspace(5)
  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
  %load = load i32, i32 addrspace(5)* %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, i32 addrspace(1)* %out
  br label %bb.1

bb.1:
  store volatile i32 0, i32 addrspace(1)* undef
  ret void
}

; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160
; DEFAULTSIZE: ; ScratchSize: 4160

; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
; ASSUME1024: ; ScratchSize: 1088


define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) {
; MUBUF-LABEL: func_non_entry_block_static_alloca_align4:
; MUBUF:       ; %bb.0: ; %entry
; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT:    s_mov_b32 s7, s33
; MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; MUBUF-NEXT:    s_mov_b32 s33, s32
; MUBUF-NEXT:    s_addk_i32 s32, 0x400
; MUBUF-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; MUBUF-NEXT:    s_cbranch_execz .LBB2_3
; MUBUF-NEXT:  ; %bb.1: ; %bb.0
; MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; MUBUF-NEXT:    s_and_b64 exec, exec, vcc
; MUBUF-NEXT:    s_cbranch_execz .LBB2_3
; MUBUF-NEXT:  ; %bb.2: ; %bb.1
; MUBUF-NEXT:    s_add_i32 s6, s32, 0x1000
; MUBUF-NEXT:    v_mov_b32_e32 v2, 0
; MUBUF-NEXT:    v_mov_b32_e32 v3, s6
; MUBUF-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
; MUBUF-NEXT:    v_mov_b32_e32 v2, 1
; MUBUF-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen offset:4
; MUBUF-NEXT:    v_lshl_add_u32 v2, v4, 2, s6
; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT:    v_and_b32_e32 v3, 0x3ff, v31
; MUBUF-NEXT:    s_mov_b32 s32, s6
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_add_u32_e32 v2, v2, v3
; MUBUF-NEXT:    global_store_dword v[0:1], v2, off
; MUBUF-NEXT:  .LBB2_3: ; %bb.2
; MUBUF-NEXT:    s_or_b64 exec, exec, s[4:5]
; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
; MUBUF-NEXT:    global_store_dword v[0:1], v0, off
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_addk_i32 s32, 0xfc00
; MUBUF-NEXT:    s_mov_b32 s33, s7
; MUBUF-NEXT:    s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: func_non_entry_block_static_alloca_align4:
; FLATSCR:       ; %bb.0: ; %entry
; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT:    s_mov_b32 s3, s33
; FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; FLATSCR-NEXT:    s_mov_b32 s33, s32
; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
; FLATSCR-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; FLATSCR-NEXT:    s_cbranch_execz .LBB2_3
; FLATSCR-NEXT:  ; %bb.1: ; %bb.0
; FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; FLATSCR-NEXT:    s_and_b64 exec, exec, vcc
; FLATSCR-NEXT:    s_cbranch_execz .LBB2_3
; FLATSCR-NEXT:  ; %bb.2: ; %bb.1
; FLATSCR-NEXT:    s_add_i32 s2, s32, 0x1000
; FLATSCR-NEXT:    v_mov_b32_e32 v2, 0
; FLATSCR-NEXT:    v_mov_b32_e32 v3, 1
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[2:3], s2
; FLATSCR-NEXT:    v_lshl_add_u32 v2, v4, 2, s2
; FLATSCR-NEXT:    scratch_load_dword v2, v2, off
; FLATSCR-NEXT:    v_and_b32_e32 v3, 0x3ff, v31
; FLATSCR-NEXT:    s_mov_b32 s32, s2
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
; FLATSCR-NEXT:  .LBB2_3: ; %bb.2
; FLATSCR-NEXT:    s_or_b64 exec, exec, s[0:1]
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
; FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_add_i32 s32, s32, -16
; FLATSCR-NEXT:    s_mov_b32 s33, s3
; FLATSCR-NEXT:    s_setpc_b64 s[30:31]

entry:
  %cond0 = icmp eq i32 %arg.cond0, 0
  br i1 %cond0, label %bb.0, label %bb.2

bb.0:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
  %cond1 = icmp eq i32 %arg.cond1, 0
  br i1 %cond1, label %bb.1, label %bb.2

bb.1:
  ; Use the alloca outside of the defining block.
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
  %load = load i32, i32 addrspace(5)* %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, i32 addrspace(1)* %out
  br label %bb.2

bb.2:
  store volatile i32 0, i32 addrspace(1)* undef
  ret void
}

define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) {
; MUBUF-LABEL: func_non_entry_block_static_alloca_align64:
; MUBUF:       ; %bb.0: ; %entry
; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT:    s_mov_b32 s7, s33
; MUBUF-NEXT:    s_add_i32 s33, s32, 0xfc0
; MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; MUBUF-NEXT:    s_and_b32 s33, s33, 0xfffff000
; MUBUF-NEXT:    s_addk_i32 s32, 0x2000
; MUBUF-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; MUBUF-NEXT:    s_cbranch_execz .LBB3_2
; MUBUF-NEXT:  ; %bb.1: ; %bb.0
; MUBUF-NEXT:    s_add_i32 s6, s32, 0x1000
; MUBUF-NEXT:    s_and_b32 s6, s6, 0xfffff000
; MUBUF-NEXT:    v_mov_b32_e32 v2, 0
; MUBUF-NEXT:    v_mov_b32_e32 v4, s6
; MUBUF-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
; MUBUF-NEXT:    v_mov_b32_e32 v2, 1
; MUBUF-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
; MUBUF-NEXT:    v_lshl_add_u32 v2, v3, 2, s6
; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT:    v_and_b32_e32 v3, 0x3ff, v31
; MUBUF-NEXT:    s_mov_b32 s32, s6
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_add_u32_e32 v2, v2, v3
; MUBUF-NEXT:    global_store_dword v[0:1], v2, off
; MUBUF-NEXT:  .LBB3_2: ; %bb.1
; MUBUF-NEXT:    s_or_b64 exec, exec, s[4:5]
; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
; MUBUF-NEXT:    global_store_dword v[0:1], v0, off
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_addk_i32 s32, 0xe000
; MUBUF-NEXT:    s_mov_b32 s33, s7
; MUBUF-NEXT:    s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: func_non_entry_block_static_alloca_align64:
; FLATSCR:       ; %bb.0: ; %entry
; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT:    s_mov_b32 s3, s33
; FLATSCR-NEXT:    s_add_i32 s33, s32, 63
; FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; FLATSCR-NEXT:    s_andn2_b32 s33, s33, 63
; FLATSCR-NEXT:    s_addk_i32 s32, 0x80
; FLATSCR-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; FLATSCR-NEXT:    s_cbranch_execz .LBB3_2
; FLATSCR-NEXT:  ; %bb.1: ; %bb.0
; FLATSCR-NEXT:    s_add_i32 s2, s32, 0x1000
; FLATSCR-NEXT:    s_and_b32 s2, s2, 0xfffff000
; FLATSCR-NEXT:    v_mov_b32_e32 v4, 0
; FLATSCR-NEXT:    v_mov_b32_e32 v5, 1
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[4:5], s2
; FLATSCR-NEXT:    v_lshl_add_u32 v2, v3, 2, s2
; FLATSCR-NEXT:    scratch_load_dword v2, v2, off
; FLATSCR-NEXT:    v_and_b32_e32 v3, 0x3ff, v31
; FLATSCR-NEXT:    s_mov_b32 s32, s2
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
; FLATSCR-NEXT:  .LBB3_2: ; %bb.1
; FLATSCR-NEXT:    s_or_b64 exec, exec, s[0:1]
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
; FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_addk_i32 s32, 0xff80
; FLATSCR-NEXT:    s_mov_b32 s33, s3
; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
entry:
  %cond = icmp eq i32 %arg.cond, 0
  br i1 %cond, label %bb.0, label %bb.1

bb.0:
  %alloca = alloca [16 x i32], align 64, addrspace(5)
  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
  %load = load i32, i32 addrspace(5)* %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, i32 addrspace(1)* %out
  br label %bb.1

bb.1:
  store volatile i32 0, i32 addrspace(1)* undef
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone speculatable }
attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }