; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s | FileCheck -check-prefix=GCN_DBG %s

define amdgpu_kernel void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind {
; GCN-LABEL: test_loop:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s2, s[0:1], 0xa
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s2, -1
; GCN-NEXT: s_cbranch_scc1 .LBB0_3
; GCN-NEXT: ; %bb.1: ; %for.body.preheader
; GCN-NEXT: s_load_dword s0, s[0:1], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_addk_i32 s0, 0x80
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_and_b64 s[0:1], exec, -1
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: .LBB0_2: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
; GCN-NEXT: ds_write_b32 v0, v1
; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0
; GCN-NEXT: s_mov_b64 vcc, s[0:1]
; GCN-NEXT: s_cbranch_vccnz .LBB0_2
; GCN-NEXT: .LBB0_3: ; %for.exit
; GCN-NEXT: s_endpgm
;
; GCN_DBG-LABEL: test_loop:
; GCN_DBG: ; %bb.0: ; %entry
; GCN_DBG-NEXT: s_load_dword s2, s[0:1], 0x9
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT: v_writelane_b32 v2, s2, 0
; GCN_DBG-NEXT: s_load_dword s1, s[0:1], 0xa
; GCN_DBG-NEXT: s_mov_b32 s0, 0
; GCN_DBG-NEXT: s_mov_b32 s2, -1
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT: s_cmp_lg_u32 s1, s2
; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT: s_cbranch_scc1 .LBB0_2
; GCN_DBG-NEXT: ; %bb.1: ; %for.exit
; GCN_DBG-NEXT: s_endpgm
; GCN_DBG-NEXT: .LBB0_2: ; %for.body
; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1
; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0
; GCN_DBG-NEXT: s_mov_b32 s1, 2
; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1
; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
; GCN_DBG-NEXT: s_mov_b32 s2, 0x80
; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
; GCN_DBG-NEXT: s_mov_b32 m0, -1
; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1
; GCN_DBG-NEXT: ds_read_b32 v0, v0
; GCN_DBG-NEXT: s_mov_b32 s2, 1.0
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2
; GCN_DBG-NEXT: s_mov_b32 m0, -1
; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1
; GCN_DBG-NEXT: ds_write_b32 v0, v1
; GCN_DBG-NEXT: s_mov_b32 s1, 1
; GCN_DBG-NEXT: s_add_i32 s0, s0, s1
; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1
; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT: s_cbranch_vccnz .LBB0_2
; GCN_DBG-NEXT: ; %bb.3: ; %DummyReturnBlock
; GCN_DBG-NEXT: s_endpgm
entry:
  %cmp = icmp eq i32 %n, -1
  br i1 %cmp, label %for.exit, label %for.body

for.exit:
  ret void

for.body:
  %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %tmp = add i32 %indvar, 32
  %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
  %vecload = load float, float addrspace(3)* %arrayidx, align 4
  %add = fadd float %vecload, 1.0
  store float %add, float addrspace(3)* %arrayidx, align 8
  %inc = add i32 %indvar, 1
  br label %for.body
}

define amdgpu_kernel void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind {
; GCN-LABEL: loop_const_true:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[0:1], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_addk_i32 s0, 0x80
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: .LBB1_1: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
; GCN-NEXT: ds_write_b32 v0, v1
; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0
; GCN-NEXT: s_branch .LBB1_1
;
; GCN_DBG-LABEL: loop_const_true:
; GCN_DBG: ; %bb.0: ; %entry
; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0
; GCN_DBG-NEXT: s_mov_b32 s0, 0
; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT: s_branch .LBB1_2
; GCN_DBG-NEXT: .LBB1_1: ; %for.exit
; GCN_DBG-NEXT: s_endpgm
; GCN_DBG-NEXT: .LBB1_2: ; %for.body
; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1
; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0
; GCN_DBG-NEXT: s_mov_b32 s1, 2
; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1
; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
; GCN_DBG-NEXT: s_mov_b32 s2, 0x80
; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
; GCN_DBG-NEXT: s_mov_b32 m0, -1
; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1
; GCN_DBG-NEXT: ds_read_b32 v0, v0
; GCN_DBG-NEXT: s_mov_b32 s2, 1.0
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2
; GCN_DBG-NEXT: s_mov_b32 m0, -1
; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1
; GCN_DBG-NEXT: ds_write_b32 v0, v1
; GCN_DBG-NEXT: s_mov_b32 s1, 1
; GCN_DBG-NEXT: s_add_i32 s0, s0, s1
; GCN_DBG-NEXT: s_mov_b64 s[2:3], 0
; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT: s_cbranch_vccnz .LBB1_1
; GCN_DBG-NEXT: s_branch .LBB1_2
entry:
  br label %for.body

for.exit:
  ret void

for.body:
  %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %tmp = add i32 %indvar, 32
  %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
  %vecload = load float, float addrspace(3)* %arrayidx, align 4
  %add = fadd float %vecload, 1.0
  store float %add, float addrspace(3)* %arrayidx, align 8
  %inc = add i32 %indvar, 1
  br i1 true, label %for.body, label %for.exit
}

define amdgpu_kernel void @loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind {
; GCN-LABEL: loop_const_false:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[0:1], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_read_b32 v1, v0 offset:128
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
; GCN-NEXT: ds_write_b32 v0, v1 offset:128
; GCN-NEXT: s_endpgm
;
; GCN_DBG-LABEL: loop_const_false:
; GCN_DBG: ; %bb.0: ; %entry
; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0
; GCN_DBG-NEXT: s_mov_b32 s0, 0
; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT: s_branch .LBB2_2
; GCN_DBG-NEXT: .LBB2_1: ; %for.exit
; GCN_DBG-NEXT: s_endpgm
; GCN_DBG-NEXT: .LBB2_2: ; %for.body
; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1
; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0
; GCN_DBG-NEXT: s_mov_b32 s1, 2
; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1
; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
; GCN_DBG-NEXT: s_mov_b32 s2, 0x80
; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
; GCN_DBG-NEXT: s_mov_b32 m0, -1
; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1
; GCN_DBG-NEXT: ds_read_b32 v0, v0
; GCN_DBG-NEXT: s_mov_b32 s2, 1.0
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2
; GCN_DBG-NEXT: s_mov_b32 m0, -1
; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1
; GCN_DBG-NEXT: ds_write_b32 v0, v1
; GCN_DBG-NEXT: s_mov_b32 s1, 1
; GCN_DBG-NEXT: s_add_i32 s0, s0, s1
; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1
; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT: s_cbranch_vccnz .LBB2_1
; GCN_DBG-NEXT: s_branch .LBB2_2
entry:
  br label %for.body

for.exit:
  ret void

; XXX - Should there be an S_ENDPGM?
for.body:
  %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %tmp = add i32 %indvar, 32
  %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
  %vecload = load float, float addrspace(3)* %arrayidx, align 4
  %add = fadd float %vecload, 1.0
  store float %add, float addrspace(3)* %arrayidx, align 8
  %inc = add i32 %indvar, 1
  br i1 false, label %for.body, label %for.exit
}

define amdgpu_kernel void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) nounwind {
; GCN-LABEL: loop_const_undef:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[0:1], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_read_b32 v1, v0 offset:128
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
; GCN-NEXT: ds_write_b32 v0, v1 offset:128
; GCN-NEXT: s_endpgm
;
; GCN_DBG-LABEL: loop_const_undef:
; GCN_DBG: ; %bb.0: ; %entry
; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0
; GCN_DBG-NEXT: s_mov_b32 s0, 0
; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT: s_branch .LBB3_2
; GCN_DBG-NEXT: .LBB3_1: ; %for.exit
; GCN_DBG-NEXT: s_endpgm
; GCN_DBG-NEXT: .LBB3_2: ; %for.body
; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1
; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0
; GCN_DBG-NEXT: s_mov_b32 s1, 2
; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1
; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
; GCN_DBG-NEXT: s_mov_b32 s2, 0x80
; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
; GCN_DBG-NEXT: s_mov_b32 m0, -1
; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1
; GCN_DBG-NEXT: ds_read_b32 v0, v0
; GCN_DBG-NEXT: s_mov_b32 s2, 1.0
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2
; GCN_DBG-NEXT: s_mov_b32 m0, -1
; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1
; GCN_DBG-NEXT: ds_write_b32 v0, v1
; GCN_DBG-NEXT: s_mov_b32 s1, 1
; GCN_DBG-NEXT: s_add_i32 s0, s0, s1
; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT: s_cbranch_scc1 .LBB3_1
; GCN_DBG-NEXT: s_branch .LBB3_2
entry:
  br label %for.body

for.exit:
  ret void

; XXX - Should there be an s_endpgm?
for.body:
  %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %tmp = add i32 %indvar, 32
  %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
  %vecload = load float, float addrspace(3)* %arrayidx, align 4
  %add = fadd float %vecload, 1.0
  store float %add, float addrspace(3)* %arrayidx, align 8
  %inc = add i32 %indvar, 1
  br i1 undef, label %for.body, label %for.exit
}

define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n) nounwind {
; GCN-LABEL: loop_arg_0:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_read_u8 v0, v0
; GCN-NEXT: s_load_dword s2, s[0:1], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: s_bitcmp1_b32 s0, 0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_addk_i32 s2, 0x80
; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], -1
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_and_b64 s[0:1], exec, s[0:1]
; GCN-NEXT: .LBB4_1: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
; GCN-NEXT: ds_write_b32 v0, v1
; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0
; GCN-NEXT: s_mov_b64 vcc, s[0:1]
; GCN-NEXT: s_cbranch_vccz .LBB4_1
; GCN-NEXT: ; %bb.2: ; %for.exit
; GCN-NEXT: s_endpgm
;
; GCN_DBG-LABEL: loop_arg_0:
; GCN_DBG: ; %bb.0: ; %entry
; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0
; GCN_DBG-NEXT: v_mov_b32_e32 v0, 0
; GCN_DBG-NEXT: s_mov_b32 m0, -1
; GCN_DBG-NEXT: ds_read_u8 v0, v0
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT: v_readfirstlane_b32 s0, v0
; GCN_DBG-NEXT: s_and_b32 s0, 1, s0
; GCN_DBG-NEXT: s_cmp_eq_u32 s0, 1
; GCN_DBG-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1
; GCN_DBG-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT: v_writelane_b32 v2, s1, 2
; GCN_DBG-NEXT: s_mov_b32 s0, 0
; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 3
; GCN_DBG-NEXT: s_branch .LBB4_2
; GCN_DBG-NEXT: .LBB4_1: ; %for.exit
; GCN_DBG-NEXT: s_endpgm
; GCN_DBG-NEXT: .LBB4_2: ; %for.body
; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 3
; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 1
; GCN_DBG-NEXT: v_readlane_b32 s3, v2, 2
; GCN_DBG-NEXT: v_readlane_b32 s4, v2, 0
; GCN_DBG-NEXT: s_mov_b32 s1, 2
; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1
; GCN_DBG-NEXT: s_add_i32 s1, s1, s4
; GCN_DBG-NEXT: s_mov_b32 s4, 0x80
; GCN_DBG-NEXT: s_add_i32 s1, s1, s4
; GCN_DBG-NEXT: s_mov_b32 m0, -1
; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1
; GCN_DBG-NEXT: ds_read_b32 v0, v0
; GCN_DBG-NEXT: s_mov_b32 s4, 1.0
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s4
; GCN_DBG-NEXT: s_mov_b32 m0, -1
; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1
; GCN_DBG-NEXT: ds_write_b32 v0, v1
; GCN_DBG-NEXT: s_mov_b32 s1, 1
; GCN_DBG-NEXT: s_add_i32 s0, s0, s1
; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 3
; GCN_DBG-NEXT: s_cbranch_vccnz .LBB4_1
; GCN_DBG-NEXT: s_branch .LBB4_2
entry:
  %cond = load volatile i1, i1 addrspace(3)* null
  br label %for.body

for.exit:
  ret void

for.body:
  %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %tmp = add i32 %indvar, 32
  %arrayidx = getelementptr float, float addrspace(3)* %ptr, i32 %tmp
  %vecload = load float, float addrspace(3)* %arrayidx, align 4
  %add = fadd float %vecload, 1.0
  store float %add, float addrspace(3)* %arrayidx, align 8
  %inc = add i32 %indvar, 1
  br i1 %cond, label %for.body, label %for.exit
}