; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GCN
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; Codegen tests for the llvm.amdgcn.init.exec and
; llvm.amdgcn.init.exec.from.input intrinsics in amdgpu_ps functions.
; The checks expect a 64-bit exec mask (s_mov_b64 / s_bfm_b64 / s_cmov_b64)
; and, in every function, expect the exec setup to be emitted before the
; first vector ALU instruction, regardless of where the intrinsic call
; appears in the IR.

; Constant all-ones mask: a single s_mov_b64 of -1 into exec, then the add.
; GCN-LABEL: {{^}}full_mask:
; GCN: s_mov_b64 exec, -1
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @full_mask(float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec(i64 -1)
  ret float %s
}

; Constant partial mask: 123456 is expected as the immediate 0x1e240.
; GCN-LABEL: {{^}}partial_mask:
; GCN: s_mov_b64 exec, 0x1e240
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @partial_mask(float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec(i64 123456)
  ret float %s
}

; Mask built from an input SGPR. The count is the fourth inreg argument (s3)
; and the bit offset argument is 8, which shows up in the low bits of the
; s_bfe_u32 immediate (0x70008). A count equal to 64 is special-cased with
; s_cmp_eq_u32 + s_cmov_b64 exec, -1.
; GCN-LABEL: {{^}}input_s3off8:
; GCN: s_bfe_u32 s0, s3, 0x70008
; GCN: s_bfm_b64 exec, s0, 0
; GCN: s_cmp_eq_u32 s0, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @input_s3off8(i32 inreg, i32 inreg, i32 inreg, i32 inreg %count, float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
  ret float %s
}

; Same as above with the count in s0 and a bit offset of 19 (0x13).
; GCN-LABEL: {{^}}input_s0off19:
; GCN: s_bfe_u32 s0, s0, 0x70013
; GCN: s_bfm_b64 exec, s0, 0
; GCN: s_cmp_eq_u32 s0, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @input_s0off19(i32 inreg %count, float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  ret float %s
}

; The count register is also used as a data operand after the intrinsic, so
; the extracted field must go to a different SGPR (s1) and s0 must survive
; until the add.
; GCN-LABEL: {{^}}reuse_input:
; GCN: s_bfe_u32 s1, s0, 0x70013
; GCN: s_bfm_b64 exec, s1, 0
; GCN: s_cmp_eq_u32 s1, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add{{(_nc)?}}_u32_e32 v0, s0, v0
define amdgpu_ps float @reuse_input(i32 inreg %count, i32 %a) {
main_body:
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  %s = add i32 %a, %count
  %f = sitofp i32 %s to float
  ret float %f
}

; Like reuse_input, but the add precedes the intrinsic call in the IR; the
; expected instruction order is unchanged.
; GCN-LABEL: {{^}}reuse_input2:
; GCN: s_bfe_u32 s1, s0, 0x70013
; GCN: s_bfm_b64 exec, s1, 0
; GCN: s_cmp_eq_u32 s1, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add{{(_nc)?}}_u32_e32 v0, s0, v0
define amdgpu_ps float @reuse_input2(i32 inreg %count, i32 %a) {
main_body:
  %s = add i32 %a, %count
  %f = sitofp i32 %s to float
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  ret float %f
}

; GCN-LABEL: {{^}}init_unreachable:
;
; This used to crash.
define amdgpu_ps void @init_unreachable() {
main_body:
  call void @llvm.amdgcn.init.exec(i64 -1)
  unreachable
}

; Two private (addrspace(5)) arrays force stack object addressing; the checks
; expect the exec setup to come before the first v_mov / v_add.
; NOTE(review): the negative pattern below is anchored at column 0, while llc
; normally indents instructions, so it may never match - confirm it is
; effective.
; GCN-LABEL: {{^}}init_exec_before_frame_materialize:
; GCN-NOT: {{^}}v_
; GCN: s_mov_b64 exec, -1
; GCN: v_mov
; GCN: v_add
define amdgpu_ps float @init_exec_before_frame_materialize(i32 inreg %a, i32 inreg %b) {
main_body:
  %array0 = alloca [1024 x i32], align 16, addrspace(5)
  %array1 = alloca [20 x i32], align 16, addrspace(5)
  call void @llvm.amdgcn.init.exec(i64 -1)

  %ptr0 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 1
  store i32 %a, ptr addrspace(5) %ptr0, align 4

  %ptr1 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 1
  store i32 %a, ptr addrspace(5) %ptr1, align 4

  %ptr2 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 2
  store i32 %b, ptr addrspace(5) %ptr2, align 4

  %ptr3 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 %b
  %v3 = load i32, ptr addrspace(5) %ptr3, align 4

  %ptr4 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 %b
  %v4 = load i32, ptr addrspace(5) %ptr4, align 4

  %v5 = add i32 %v3, %v4
  %v = bitcast i32 %v5 to float
  ret float %v
}

; Same stack-object scenario with the mask taken from an input SGPR; the four
; exec setup instructions are expected back to back.
; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize:
; GCN-NOT: {{^}}v_
; GCN: s_bfe_u32 s2, s2, 0x70008
; GCN-NEXT: s_bfm_b64 exec, s2, 0
; GCN-NEXT: s_cmp_eq_u32 s2, 64
; GCN-NEXT: s_cmov_b64 exec, -1
; GCN: v_mov
; GCN: v_add
define amdgpu_ps float @init_exec_input_before_frame_materialize(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
main_body:
  %array0 = alloca [1024 x i32], align 16, addrspace(5)
  %array1 = alloca [20 x i32], align 16, addrspace(5)
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)

  %ptr0 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 1
  store i32 %a, ptr addrspace(5) %ptr0, align 4

  %ptr1 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 1
  store i32 %a, ptr addrspace(5) %ptr1, align 4

  %ptr2 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 2
  store i32 %b, ptr addrspace(5) %ptr2, align 4

  %ptr3 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 %b
  %v3 = load i32, ptr addrspace(5) %ptr3, align 4

  %ptr4 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 %b
  %v4 = load i32, ptr addrspace(5) %ptr4, align 4

  %v5 = add i32 %v3, %v4
  %v = bitcast i32 %v5 to float
  ret float %v
}

; The intrinsic call sits in a non-entry block (%endif). The exec setup is
; expected after the %endif block label, and the count (s2) is still read
; afterwards by the final add, so the extracted field is matched with a
; captured SGPR name.
; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize_nonentry:
; GCN-NOT: {{^}}v_
; GCN: %endif
; GCN: s_bfe_u32 [[S:s[0-9]+]], s2, 0x70008
; GCN-NEXT: s_bfm_b64 exec, [[S]], 0
; GCN-NEXT: s_cmp_eq_u32 [[S]], 64
; GCN-NEXT: s_cmov_b64 exec, -1
; GCN: v_mov
; GCN: v_add
define amdgpu_ps float @init_exec_input_before_frame_materialize_nonentry(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
main_body:
  ; ideally these alloca would be in %endif, but this causes problems on Windows GlobalISel
  %array0 = alloca [1024 x i32], align 16, addrspace(5)
  %array1 = alloca [20 x i32], align 16, addrspace(5)

  %cc = icmp uge i32 %count, 32
  br i1 %cc, label %endif, label %if

if:
  ; Empty side-effecting asm keeps this block from being folded away.
  call void asm sideeffect "", ""()
  br label %endif

endif:
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)

  %ptr0 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 1
  store i32 %a, ptr addrspace(5) %ptr0, align 4

  %ptr1 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 1
  store i32 %a, ptr addrspace(5) %ptr1, align 4

  %ptr2 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 2
  store i32 %b, ptr addrspace(5) %ptr2, align 4

  %ptr3 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 %b
  %v3 = load i32, ptr addrspace(5) %ptr3, align 4

  %ptr4 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 %b
  %v4 = load i32, ptr addrspace(5) %ptr4, align 4

  %v5 = add i32 %v3, %v4
  %v6 = add i32 %v5, %count
  %v = bitcast i32 %v6 to float
  ret float %v
}

declare void @llvm.amdgcn.init.exec(i64) #1
declare void @llvm.amdgcn.init.exec.from.input(i32, i32) #1

attributes #1 = { convergent }