1; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s 2; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s 3; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s 4; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s 5; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032,GFX10DEFWAVE %s 6 7; GCN-LABEL: {{^}}test_vopc_i32: 8; GFX1032: v_cmp_lt_i32_e32 vcc_lo, 0, v{{[0-9]+}} 9; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, vcc_lo 10; GFX1064: v_cmp_lt_i32_e32 vcc, 0, v{{[0-9]+}} 11; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, vcc{{$}} 12define amdgpu_kernel void @test_vopc_i32(i32 addrspace(1)* %arg) { 13 %lid = tail call i32 @llvm.amdgcn.workitem.id.x() 14 %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %lid 15 %load = load i32, i32 addrspace(1)* %gep, align 4 16 %cmp = icmp sgt i32 %load, 0 17 %sel = select i1 %cmp, i32 1, i32 2 18 store i32 %sel, i32 addrspace(1)* %gep, align 4 19 ret void 20} 21 22; GCN-LABEL: {{^}}test_vopc_f32: 23; GFX1032: v_cmp_nge_f32_e32 vcc_lo, 0, v{{[0-9]+}} 24; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0, vcc_lo 25; GFX1064: v_cmp_nge_f32_e32 vcc, 0, v{{[0-9]+}} 26; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0, vcc{{$}} 27define amdgpu_kernel void @test_vopc_f32(float addrspace(1)* %arg) { 28 %lid = tail call i32 @llvm.amdgcn.workitem.id.x() 29 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %lid 30 %load = load float, float addrspace(1)* %gep, align 4 31 %cmp = fcmp ugt float %load, 0.0 32 %sel = select i1 %cmp, float 
1.0, float 2.0 33 store float %sel, float addrspace(1)* %gep, align 4 34 ret void 35} 36 37; GCN-LABEL: {{^}}test_vopc_vcmpx: 38; GFX1032: v_cmpx_le_f32_e32 0, v{{[0-9]+}} 39; GFX1064: v_cmpx_le_f32_e32 0, v{{[0-9]+}} 40define amdgpu_ps void @test_vopc_vcmpx(float %x) { 41 %cmp = fcmp oge float %x, 0.0 42 call void @llvm.amdgcn.kill(i1 %cmp) 43 ret void 44} 45 46; GCN-LABEL: {{^}}test_vopc_2xf16: 47; GFX1032: v_cmp_le_f16_sdwa [[SC:s[0-9]+]], {{[vs][0-9]+}}, v{{[0-9]+}} src0_sel:WORD_1 src1_sel:DWORD 48; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 0x3c003c00, v{{[0-9]+}}, [[SC]] 49; GFX1064: v_cmp_le_f16_sdwa [[SC:s\[[0-9:]+\]]], {{[vs][0-9]+}}, v{{[0-9]+}} src0_sel:WORD_1 src1_sel:DWORD 50; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 0x3c003c00, v{{[0-9]+}}, [[SC]] 51define amdgpu_kernel void @test_vopc_2xf16(<2 x half> addrspace(1)* %arg) { 52 %lid = tail call i32 @llvm.amdgcn.workitem.id.x() 53 %gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %arg, i32 %lid 54 %load = load <2 x half>, <2 x half> addrspace(1)* %gep, align 4 55 %elt = extractelement <2 x half> %load, i32 1 56 %cmp = fcmp ugt half %elt, 0.0 57 %sel = select i1 %cmp, <2 x half> <half 1.0, half 1.0>, <2 x half> %load 58 store <2 x half> %sel, <2 x half> addrspace(1)* %gep, align 4 59 ret void 60} 61 62; GCN-LABEL: {{^}}test_vopc_class: 63; GFX1032: v_cmp_class_f32_e64 [[C:vcc_lo|s[0-9:]+]], s{{[0-9]+}}, 0x204 64; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[C]] 65; GFX1064: v_cmp_class_f32_e64 [[C:vcc|s\[[0-9:]+\]]], s{{[0-9]+}}, 0x204 66; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[C]]{{$}} 67define amdgpu_kernel void @test_vopc_class(i32 addrspace(1)* %out, float %x) #0 { 68 %fabs = tail call float @llvm.fabs.f32(float %x) 69 %cmp = fcmp oeq float %fabs, 0x7FF0000000000000 70 %ext = zext i1 %cmp to i32 71 store i32 %ext, i32 addrspace(1)* %out, align 4 72 ret void 73} 74 75; GCN-LABEL: {{^}}test_vcmp_vcnd_f16: 76; GFX1032: v_cmp_neq_f16_e64 [[C:vcc_lo|s\[[0-9:]+\]]], 0x7c00, 
s{{[0-9]+}} 77; GFX1032: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3c00, v{{[0-9]+}}, [[C]] 78 79; GFX1064: v_cmp_neq_f16_e64 [[C:vcc|s\[[0-9:]+\]]], 0x7c00, s{{[0-9]+}} 80; GFX1064: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3c00, v{{[0-9]+}}, [[C]]{{$}} 81define amdgpu_kernel void @test_vcmp_vcnd_f16(half addrspace(1)* %out, half %x) #0 { 82 %cmp = fcmp oeq half %x, 0x7FF0000000000000 83 %sel = select i1 %cmp, half 1.0, half %x 84 store half %sel, half addrspace(1)* %out, align 2 85 ret void 86} 87 88; GCN-LABEL: {{^}}test_vop3_cmp_f32_sop_and: 89; GFX1032: v_cmp_nge_f32_e32 vcc_lo, 0, v{{[0-9]+}} 90; GFX1032: v_cmp_nle_f32_e64 [[C2:s[0-9]+]], 1.0, v{{[0-9]+}} 91; GFX1032: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[C2]] 92; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0, [[AND]] 93; GFX1064: v_cmp_nge_f32_e32 vcc, 0, v{{[0-9]+}} 94; GFX1064: v_cmp_nle_f32_e64 [[C2:s\[[0-9:]+\]]], 1.0, v{{[0-9]+}} 95; GFX1064: s_and_b64 [[AND:s\[[0-9:]+\]]], vcc, [[C2]] 96; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0, [[AND]] 97define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(float addrspace(1)* %arg) { 98 %lid = tail call i32 @llvm.amdgcn.workitem.id.x() 99 %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %lid 100 %load = load float, float addrspace(1)* %gep, align 4 101 %cmp = fcmp ugt float %load, 0.0 102 %cmp2 = fcmp ult float %load, 1.0 103 %and = and i1 %cmp, %cmp2 104 %sel = select i1 %and, float 1.0, float 2.0 105 store float %sel, float addrspace(1)* %gep, align 4 106 ret void 107} 108 109; GCN-LABEL: {{^}}test_vop3_cmp_i32_sop_xor: 110; GFX1032: v_cmp_lt_i32_e32 vcc_lo, 0, v{{[0-9]+}} 111; GFX1032: v_cmp_gt_i32_e64 [[C2:s[0-9]+]], 1, v{{[0-9]+}} 112; GFX1032: s_xor_b32 [[AND:s[0-9]+]], vcc_lo, [[C2]] 113; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, [[AND]] 114; GFX1064: v_cmp_lt_i32_e32 vcc, 0, v{{[0-9]+}} 115; GFX1064: v_cmp_gt_i32_e64 [[C2:s\[[0-9:]+\]]], 1, v{{[0-9]+}} 116; GFX1064: s_xor_b64 [[AND:s\[[0-9:]+\]]], vcc, [[C2]] 117; GFX1064: v_cndmask_b32_e64 
v{{[0-9]+}}, 2, 1, [[AND]] 118define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(i32 addrspace(1)* %arg) { 119 %lid = tail call i32 @llvm.amdgcn.workitem.id.x() 120 %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %lid 121 %load = load i32, i32 addrspace(1)* %gep, align 4 122 %cmp = icmp sgt i32 %load, 0 123 %cmp2 = icmp slt i32 %load, 1 124 %xor = xor i1 %cmp, %cmp2 125 %sel = select i1 %xor, i32 1, i32 2 126 store i32 %sel, i32 addrspace(1)* %gep, align 4 127 ret void 128} 129 130; GCN-LABEL: {{^}}test_vop3_cmp_u32_sop_or: 131; GFX1032: v_cmp_lt_u32_e32 vcc_lo, 3, v{{[0-9]+}} 132; GFX1032: v_cmp_gt_u32_e64 [[C2:s[0-9]+]], 2, v{{[0-9]+}} 133; GFX1032: s_or_b32 [[AND:s[0-9]+]], vcc_lo, [[C2]] 134; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, [[AND]] 135; GFX1064: v_cmp_lt_u32_e32 vcc, 3, v{{[0-9]+}} 136; GFX1064: v_cmp_gt_u32_e64 [[C2:s\[[0-9:]+\]]], 2, v{{[0-9]+}} 137; GFX1064: s_or_b64 [[AND:s\[[0-9:]+\]]], vcc, [[C2]] 138; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, [[AND]] 139define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(i32 addrspace(1)* %arg) { 140 %lid = tail call i32 @llvm.amdgcn.workitem.id.x() 141 %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %lid 142 %load = load i32, i32 addrspace(1)* %gep, align 4 143 %cmp = icmp ugt i32 %load, 3 144 %cmp2 = icmp ult i32 %load, 2 145 %or = or i1 %cmp, %cmp2 146 %sel = select i1 %or, i32 1, i32 2 147 store i32 %sel, i32 addrspace(1)* %gep, align 4 148 ret void 149} 150 151; GCN-LABEL: {{^}}test_mask_if: 152; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, vcc_lo 153; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc{{$}} 154; GCN: ; mask branch 155define amdgpu_kernel void @test_mask_if(i32 addrspace(1)* %arg) #0 { 156 %lid = tail call i32 @llvm.amdgcn.workitem.id.x() 157 %cmp = icmp ugt i32 %lid, 10 158 br i1 %cmp, label %if, label %endif 159 160if: 161 store i32 0, i32 addrspace(1)* %arg, align 4 162 br label %endif 163 164endif: 165 ret void 166} 167 168; GCN-LABEL: 
{{^}}test_loop_with_if: 169; GFX1032: s_or_b32 s{{[0-9]+}}, vcc_lo, s{{[0-9]+}} 170; GFX1032: s_andn2_b32 exec_lo, exec_lo, s{{[0-9]+}} 171; GFX1064: s_or_b64 s[{{[0-9:]+}}], vcc, s[{{[0-9:]+}}] 172; GFX1064: s_andn2_b64 exec, exec, s[{{[0-9:]+}}] 173; GCN: s_cbranch_execz 174; GCN: BB{{.*}}: 175; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, vcc_lo 176; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc{{$}} 177; GCN: s_cbranch_execz 178; GCN: BB{{.*}}: 179; GCN: BB{{.*}}: 180; GFX1032: s_xor_b32 s{{[0-9]+}}, exec_lo, s{{[0-9]+}} 181; GFX1064: s_xor_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}] 182; GCN: ; mask branch BB 183; GCN: BB{{.*}}: 184; GCN: BB{{.*}}: 185; GFX1032: s_or_b32 exec_lo, exec_lo, s{{[0-9]+}} 186; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, s{{[0-9]+}} 187; GFX1064: s_or_b64 exec, exec, s[{{[0-9:]+}}] 188; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} 189; GCN: ; mask branch BB 190; GCN: BB{{.*}}: 191; GCN: BB{{.*}}: 192; GCN: s_endpgm 193define amdgpu_kernel void @test_loop_with_if(i32 addrspace(1)* %arg) #0 { 194bb: 195 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() 196 br label %bb2 197 198bb1: 199 ret void 200 201bb2: 202 %tmp3 = phi i32 [ 0, %bb ], [ %tmp15, %bb13 ] 203 %tmp4 = icmp slt i32 %tmp3, %tmp 204 br i1 %tmp4, label %bb5, label %bb11 205 206bb5: 207 %tmp6 = sext i32 %tmp3 to i64 208 %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp6 209 %tmp8 = load i32, i32 addrspace(1)* %tmp7, align 4 210 %tmp9 = icmp sgt i32 %tmp8, 10 211 br i1 %tmp9, label %bb10, label %bb11 212 213bb10: 214 store i32 %tmp, i32 addrspace(1)* %tmp7, align 4 215 br label %bb13 216 217bb11: 218 %tmp12 = sdiv i32 %tmp3, 2 219 br label %bb13 220 221bb13: 222 %tmp14 = phi i32 [ %tmp3, %bb10 ], [ %tmp12, %bb11 ] 223 %tmp15 = add nsw i32 %tmp14, 1 224 %tmp16 = icmp slt i32 %tmp14, 255 225 br i1 %tmp16, label %bb2, label %bb1 226} 227 228; GCN-LABEL: {{^}}test_loop_with_if_else_break: 229; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, vcc_lo 230; 
GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc{{$}} 231; GCN: ; mask branch 232; GCN: s_cbranch_execz 233; GCN: BB{{.*}}: 234; GCN: BB{{.*}}: 235 236; GFX1032: s_or_b32 [[MASK0:s[0-9]+]], [[MASK0]], vcc_lo 237; GFX1064: s_or_b64 [[MASK0:s\[[0-9:]+\]]], [[MASK0]], vcc 238; GFX1032: s_andn2_b32 [[MASK1:s[0-9]+]], [[MASK1]], exec_lo 239; GFX1064: s_andn2_b64 [[MASK1:s\[[0-9:]+\]]], [[MASK1]], exec 240; GCN: global_store_dword 241; GFX1032: s_and_b32 [[MASK0]], [[MASK0]], exec_lo 242; GFX1064: s_and_b64 [[MASK0]], [[MASK0]], exec 243; GFX1032: s_or_b32 [[MASK1]], [[MASK1]], [[MASK0]] 244; GFX1064: s_or_b64 [[MASK1]], [[MASK1]], [[MASK0]] 245; GCN: BB{{.*}}: ; %Flow 246; GFX1032: s_and_b32 [[MASK0:s[0-9]+]], exec_lo, [[MASK1]] 247; GFX1064: s_and_b64 [[MASK0:s\[[0-9:]+\]]], exec, [[MASK1]] 248; GFX1032: s_or_b32 [[MASK0]], [[MASK0]], [[ACC:s[0-9]+]] 249; GFX1064: s_or_b64 [[MASK0]], [[MASK0]], [[ACC:s\[[0-9:]+\]]] 250; GFX1032: s_mov_b32 [[ACC]], [[MASK0]] 251; GFX1064: s_mov_b64 [[ACC]], [[MASK0]] 252; GFX1032: s_andn2_b32 exec_lo, exec_lo, [[MASK0]] 253; GFX1064: s_andn2_b64 exec, exec, [[MASK0]] 254; GCN: s_cbranch_execz 255; GCN: BB{{.*}}: 256; GCN: s_load_dword [[LOAD:s[0-9]+]] 257; GFX1032: s_or_b32 [[MASK1]], [[MASK1]], exec_lo 258; GFX1064: s_or_b64 [[MASK1]], [[MASK1]], exec 259; GCN: s_cmp_lt_i32 [[LOAD]], 11 260define amdgpu_kernel void @test_loop_with_if_else_break(i32 addrspace(1)* %arg) #0 { 261bb: 262 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() 263 %tmp1 = icmp eq i32 %tmp, 0 264 br i1 %tmp1, label %.loopexit, label %.preheader 265 266.preheader: 267 br label %bb2 268 269bb2: 270 %tmp3 = phi i32 [ %tmp9, %bb8 ], [ 0, %.preheader ] 271 %tmp4 = zext i32 %tmp3 to i64 272 %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4 273 %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4 274 %tmp7 = icmp sgt i32 %tmp6, 10 275 br i1 %tmp7, label %bb8, label %.loopexit 276 277bb8: 278 store i32 %tmp, i32 addrspace(1)* %tmp5, align 4 279 %tmp9 = 
add nuw nsw i32 %tmp3, 1 280 %tmp10 = icmp ult i32 %tmp9, 256 281 %tmp11 = icmp ult i32 %tmp9, %tmp 282 %tmp12 = and i1 %tmp10, %tmp11 283 br i1 %tmp12, label %bb2, label %.loopexit 284 285.loopexit: 286 ret void 287} 288 289; GCN-LABEL: {{^}}test_addc_vop2b: 290; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, s{{[0-9]+}} 291; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}, vcc_lo 292; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, s{{[0-9]+}} 293; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}, vcc{{$}} 294define amdgpu_kernel void @test_addc_vop2b(i64 addrspace(1)* %arg, i64 %arg1) #0 { 295bb: 296 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() 297 %tmp3 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp 298 %tmp4 = load i64, i64 addrspace(1)* %tmp3, align 8 299 %tmp5 = add nsw i64 %tmp4, %arg1 300 store i64 %tmp5, i64 addrspace(1)* %tmp3, align 8 301 ret void 302} 303 304; GCN-LABEL: {{^}}test_subbrev_vop2b: 305; GFX1032: v_sub_co_u32_e64 v{{[0-9]+}}, [[A0:s[0-9]+|vcc_lo]], v{{[0-9]+}}, s{{[0-9]+}}{{$}} 306; GFX1032: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[A0]]{{$}} 307; GFX1064: v_sub_co_u32_e64 v{{[0-9]+}}, [[A0:s\[[0-9:]+\]|vcc]], v{{[0-9]+}}, s{{[0-9]+}}{{$}} 308; GFX1064: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[A0]]{{$}} 309define amdgpu_kernel void @test_subbrev_vop2b(i64 addrspace(1)* %arg, i64 %arg1) #0 { 310bb: 311 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() 312 %tmp3 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp 313 %tmp4 = load i64, i64 addrspace(1)* %tmp3, align 8 314 %tmp5 = sub nsw i64 %tmp4, %arg1 315 store i64 %tmp5, i64 addrspace(1)* %tmp3, align 8 316 ret void 317} 318 319; GCN-LABEL: {{^}}test_subb_vop2b: 320; GFX1032: v_sub_co_u32_e64 v{{[0-9]+}}, [[A0:s[0-9]+|vcc_lo]], s{{[0-9]+}}, v{{[0-9]+}}{{$}} 321; GFX1032: v_sub_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, 
{{[vs][0-9]+}}, v{{[0-9]+}}, [[A0]]{{$}} 322; GFX1064: v_sub_co_u32_e64 v{{[0-9]+}}, [[A0:s\[[0-9:]+\]|vcc]], s{{[0-9]+}}, v{{[0-9]+}}{{$}} 323; GFX1064: v_sub_co_ci_u32_e32 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, v{{[0-9]+}}, [[A0]]{{$}} 324define amdgpu_kernel void @test_subb_vop2b(i64 addrspace(1)* %arg, i64 %arg1) #0 { 325bb: 326 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() 327 %tmp3 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp 328 %tmp4 = load i64, i64 addrspace(1)* %tmp3, align 8 329 %tmp5 = sub nsw i64 %arg1, %tmp4 330 store i64 %tmp5, i64 addrspace(1)* %tmp3, align 8 331 ret void 332} 333 334; GCN-LABEL: {{^}}test_udiv64: 335; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, [[SDST:s[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} 336; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, 0, v{{[0-9]+}}, vcc_lo 337; GFX1032: v_add_co_ci_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}, [[SDST]] 338; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} 339; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} 340; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} 341; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, 0, v{{[0-9]+}}, vcc_lo 342; GFX1032: v_sub_co_u32_e64 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} 343; GFX1032: v_sub_co_ci_u32_e64 v{{[0-9]+}}, s{{[0-9]+}}, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc_lo 344; GFX1032: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc_lo 345; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, [[SDST:s\[[0-9:]+\]]], v{{[0-9]+}}, v{{[0-9]+}} 346; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} 347; GFX1064: v_add_co_ci_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, [[SDST]] 348; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} 349; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} 350; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} 351; GFX1064: v_add_co_ci_u32_e32 
v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} 352; GFX1064: v_sub_co_u32_e64 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}} 353; GFX1064: v_sub_co_ci_u32_e64 v{{[0-9]+}}, s[{{[0-9:]+}}], {{[vs][0-9]+}}, v{{[0-9]+}}, vcc{{$}} 354; GFX1064: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc{{$}} 355define amdgpu_kernel void @test_udiv64(i64 addrspace(1)* %arg) #0 { 356bb: 357 %tmp = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 1 358 %tmp1 = load i64, i64 addrspace(1)* %tmp, align 8 359 %tmp2 = load i64, i64 addrspace(1)* %arg, align 8 360 %tmp3 = udiv i64 %tmp1, %tmp2 361 %tmp4 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 2 362 store i64 %tmp3, i64 addrspace(1)* %tmp4, align 8 363 ret void 364} 365 366; GCN-LABEL: {{^}}test_div_scale_f32: 367; GFX1032: v_div_scale_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} 368; GFX1064: v_div_scale_f32 v{{[0-9]+}}, s[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} 369define amdgpu_kernel void @test_div_scale_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { 370 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 371 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid 372 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 373 374 %a = load volatile float, float addrspace(1)* %gep.0, align 4 375 %b = load volatile float, float addrspace(1)* %gep.1, align 4 376 377 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone 378 %result0 = extractvalue { float, i1 } %result, 0 379 store float %result0, float addrspace(1)* %out, align 4 380 ret void 381} 382 383; GCN-LABEL: {{^}}test_div_scale_f64: 384; GFX1032: v_div_scale_f64 v[{{[0-9:]+}}], s{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] 385; GFX1064: v_div_scale_f64 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] 386define amdgpu_kernel void @test_div_scale_f64(double addrspace(1)* 
%out, double addrspace(1)* %aptr, double addrspace(1)* %in) #0 { 387 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 388 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid 389 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 390 391 %a = load volatile double, double addrspace(1)* %gep.0, align 8 392 %b = load volatile double, double addrspace(1)* %gep.1, align 8 393 394 %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone 395 %result0 = extractvalue { double, i1 } %result, 0 396 store double %result0, double addrspace(1)* %out, align 8 397 ret void 398} 399 400; GCN-LABEL: {{^}}test_mad_i64_i32: 401; GFX1032: v_mad_i64_i32 v[{{[0-9:]+}}], s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9:]+}}] 402; GFX1064: v_mad_i64_i32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9:]+}}] 403define i64 @test_mad_i64_i32(i32 %arg0, i32 %arg1, i64 %arg2) #0 { 404 %sext0 = sext i32 %arg0 to i64 405 %sext1 = sext i32 %arg1 to i64 406 %mul = mul i64 %sext0, %sext1 407 %mad = add i64 %mul, %arg2 408 ret i64 %mad 409} 410 411; GCN-LABEL: {{^}}test_mad_u64_u32: 412; GFX1032: v_mad_u64_u32 v[{{[0-9:]+}}], s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9:]+}}] 413; GFX1064: v_mad_u64_u32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9:]+}}] 414define i64 @test_mad_u64_u32(i32 %arg0, i32 %arg1, i64 %arg2) #0 { 415 %sext0 = zext i32 %arg0 to i64 416 %sext1 = zext i32 %arg1 to i64 417 %mul = mul i64 %sext0, %sext1 418 %mad = add i64 %mul, %arg2 419 ret i64 %mad 420} 421 422; GCN-LABEL: {{^}}test_div_fmas_f32: 423; GFX1032: v_cmp_eq_u32_e64 vcc_lo, 424; GFX1064: v_cmp_eq_u32_e64 vcc, 425; GCN: v_div_fmas_f32 v{{[0-9]+}}, {{[vs][0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} 426define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { 427 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, 
i1 %d) nounwind readnone 428 store float %result, float addrspace(1)* %out, align 4 429 ret void 430} 431 432; GCN-LABEL: {{^}}test_div_fmas_f64: 433; GFX1032: v_cmp_eq_u32_e64 vcc_lo, 434; GFX1064: v_cmp_eq_u32_e64 vcc, 435; GCN-DAG: v_div_fmas_f64 v[{{[0-9:]+}}], {{[vs]}}[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] 436define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind { 437 %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone 438 store double %result, double addrspace(1)* %out, align 8 439 ret void 440} 441 442; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc: 443; GFX1032: s_mov_b32 [[VCC:vcc_lo]], 0{{$}} 444; GFX1064: s_mov_b64 [[VCC:vcc]], 0{{$}} 445; GFX1032: s_and_saveexec_b32 [[SAVE:s[0-9]+]], s{{[0-9]+}}{{$}} 446; GFX1064: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], s[{{[0-9:]+}}]{{$}} 447 448; GCN: load_dword [[LOAD:v[0-9]+]] 449; GCN: v_cmp_ne_u32_e32 [[VCC]], 0, [[LOAD]] 450 451; GCN: BB{{[0-9_]+}}: 452; GFX1032: s_or_b32 exec_lo, exec_lo, [[SAVE]] 453; GFX1064: s_or_b64 exec, exec, [[SAVE]] 454; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} 455define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) #0 { 456entry: 457 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 458 %gep.out = getelementptr float, float addrspace(1)* %out, i32 2 459 %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid 460 %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1 461 %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2 462 463 %a = load float, float addrspace(1)* %gep.a 464 %b = load float, float addrspace(1)* %gep.b 465 %c = load float, float addrspace(1)* %gep.c 466 467 %cmp0 = icmp eq i32 %tid, 0 468 br i1 %cmp0, label %bb, label %exit 469 470bb: 471 %val = load volatile i32, i32 addrspace(1)* %dummy 
472 %cmp1 = icmp ne i32 %val, 0 473 br label %exit 474 475exit: 476 %cond = phi i1 [false, %entry], [%cmp1, %bb] 477 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cond) nounwind readnone 478 store float %result, float addrspace(1)* %gep.out, align 4 479 ret void 480} 481 482; GCN-LABEL: {{^}}fdiv_f32: 483; GFX1032: v_div_scale_f32 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} 484; GFX1064: v_div_scale_f32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} 485; GCN: v_rcp_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} 486; GCN-NOT: vcc 487; GCN: v_div_fmas_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} 488define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 { 489entry: 490 %fdiv = fdiv float %a, %b 491 store float %fdiv, float addrspace(1)* %out 492 ret void 493} 494 495; GCN-LABEL: {{^}}test_br_cc_f16: 496; GFX1032: v_cmp_nlt_f16_e32 vcc_lo, 497; GFX1032-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo 498; GFX1064: v_cmp_nlt_f16_e32 vcc, 499; GFX1064-NEXT: s_and_b64 vcc, exec, vcc{{$}} 500; GCN-NEXT: s_cbranch_vccnz 501define amdgpu_kernel void @test_br_cc_f16( 502 half addrspace(1)* %r, 503 half addrspace(1)* %a, 504 half addrspace(1)* %b) { 505entry: 506 %a.val = load half, half addrspace(1)* %a 507 %b.val = load half, half addrspace(1)* %b 508 %fcmp = fcmp olt half %a.val, %b.val 509 br i1 %fcmp, label %one, label %two 510 511one: 512 store half %a.val, half addrspace(1)* %r 513 ret void 514 515two: 516 store half %b.val, half addrspace(1)* %r 517 ret void 518} 519 520; GCN-LABEL: {{^}}test_brcc_i1: 521; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0 522; GCN-NEXT: s_cbranch_scc1 523define amdgpu_kernel void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 { 524 %cmp0 = icmp ne i1 %val, 0 525 br i1 %cmp0, label %store, label %end 526 527store: 528 store i32 222, i32 addrspace(1)* %out 529 ret void 530 531end: 532 ret void 533} 534 535; GCN-LABEL: 
{{^}}test_preserve_condition_undef_flag: 536; GFX1032: v_cmp_nlt_f32_e64 s{{[0-9]+}}, s{{[0-9]+}}, 1.0 537; GFX1032: v_cmp_ngt_f32_e64 s{{[0-9]+}}, s{{[0-9]+}}, 0 538; GFX1032: v_cmp_nlt_f32_e64 s{{[0-9]+}}, s{{[0-9]+}}, 1.0 539; GFX1032: s_or_b32 [[OR1:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}} 540; GFX1032: s_or_b32 [[OR2:s[0-9]+]], [[OR1]], s{{[0-9]+}} 541; GFX1032: s_and_b32 vcc_lo, exec_lo, [[OR2]] 542; GFX1064: v_cmp_nlt_f32_e64 s[{{[0-9:]+}}], s{{[0-9]+}}, 1.0 543; GFX1064: v_cmp_ngt_f32_e64 s[{{[0-9:]+}}], s{{[0-9]+}}, 0 544; GFX1064: v_cmp_nlt_f32_e64 s[{{[0-9:]+}}], s{{[0-9]+}}, 1.0 545; GFX1064: s_or_b64 [[OR1:s\[[0-9:]+\]]], s[{{[0-9:]+}}], s[{{[0-9:]+}}] 546; GFX1064: s_or_b64 [[OR2:s\[[0-9:]+\]]], [[OR1]], s[{{[0-9:]+}}] 547; GFX1064: s_and_b64 vcc, exec, [[OR2]] 548; GCN: s_cbranch_vccnz 549define amdgpu_kernel void @test_preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) #0 { 550bb0: 551 %tmp = icmp sgt i32 %arg1, 4 552 %undef = call i1 @llvm.amdgcn.class.f32(float undef, i32 undef) 553 %tmp4 = select i1 %undef, float %arg, float 1.000000e+00 554 %tmp5 = fcmp ogt float %arg2, 0.000000e+00 555 %tmp6 = fcmp olt float %arg2, 1.000000e+00 556 %tmp7 = fcmp olt float %arg, %tmp4 557 %tmp8 = and i1 %tmp5, %tmp6 558 %tmp9 = and i1 %tmp8, %tmp7 559 br i1 %tmp9, label %bb1, label %bb2 560 561bb1: 562 store volatile i32 0, i32 addrspace(1)* undef 563 br label %bb2 564 565bb2: 566 ret void 567} 568 569; GCN-LABEL: {{^}}test_invert_true_phi_cond_break_loop: 570; GFX1032: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, -1 571; GFX1032: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} 572; GFX1064: s_xor_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], -1 573; GFX1064: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] 574define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { 575bb: 576 %id = call i32 @llvm.amdgcn.workitem.id.x() 577 %tmp = sub i32 %id, %arg 578 br label %bb1 579 580bb1: ; preds = %Flow, %bb 581 %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, 
%Flow ] 582 %lsr.iv.next = add i32 %lsr.iv, 1 583 %cmp0 = icmp slt i32 %lsr.iv.next, 0 584 br i1 %cmp0, label %bb4, label %Flow 585 586bb4: ; preds = %bb1 587 %load = load volatile i32, i32 addrspace(1)* undef, align 4 588 %cmp1 = icmp sge i32 %tmp, %load 589 br label %Flow 590 591Flow: ; preds = %bb4, %bb1 592 %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] 593 %tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ] 594 br i1 %tmp3, label %bb1, label %bb9 595 596bb9: ; preds = %Flow 597 store volatile i32 7, i32 addrspace(3)* undef 598 ret void 599} 600 601; GCN-LABEL: {{^}}test_movrels_extract_neg_offset_vgpr: 602; GFX1032: v_cmp_eq_u32_e32 vcc_lo, 1, v{{[0-9]+}} 603; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc_lo 604; GFX1032: v_cmp_ne_u32_e32 vcc_lo, 2, v{{[0-9]+}} 605; GFX1032: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}, vcc_lo 606; GFX1032: v_cmp_ne_u32_e32 vcc_lo, 3, v{{[0-9]+}} 607; GFX1032: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc_lo 608; GFX1064: v_cmp_eq_u32_e32 vcc, 1, v{{[0-9]+}} 609; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc 610; GFX1064: v_cmp_ne_u32_e32 vcc, 2, v{{[0-9]+}} 611; GFX1064: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}, vcc 612; GFX1064: v_cmp_ne_u32_e32 vcc, 3, v{{[0-9]+}} 613; GFX1064: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc 614define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(i32 addrspace(1)* %out) #0 { 615entry: 616 %id = call i32 @llvm.amdgcn.workitem.id.x() #1 617 %index = add i32 %id, -512 618 %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index 619 store i32 %value, i32 addrspace(1)* %out 620 ret void 621} 622 623; GCN-LABEL: {{^}}test_set_inactive: 624; GFX1032: s_not_b32 exec_lo, exec_lo 625; GFX1032: v_mov_b32_e32 {{v[0-9]+}}, 42 626; GFX1032: s_not_b32 exec_lo, exec_lo 627; GFX1064: s_not_b64 exec, exec{{$}} 628; GFX1064: v_mov_b32_e32 {{v[0-9]+}}, 42 629; GFX1064: s_not_b64 exec, exec{{$}} 630define amdgpu_kernel void @test_set_inactive(i32 
addrspace(1)* %out, i32 %in) #0 { 631 %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) 632 store i32 %tmp, i32 addrspace(1)* %out 633 ret void 634} 635 636; GCN-LABEL: {{^}}test_set_inactive_64: 637; GFX1032: s_not_b32 exec_lo, exec_lo 638; GFX1032: v_mov_b32_e32 {{v[0-9]+}}, 0 639; GFX1032: v_mov_b32_e32 {{v[0-9]+}}, 0 640; GFX1032: s_not_b32 exec_lo, exec_lo 641; GFX1064: s_not_b64 exec, exec{{$}} 642; GFX1064: v_mov_b32_e32 {{v[0-9]+}}, 0 643; GFX1064: v_mov_b32_e32 {{v[0-9]+}}, 0 644; GFX1064: s_not_b64 exec, exec{{$}} 645define amdgpu_kernel void @test_set_inactive_64(i64 addrspace(1)* %out, i64 %in) #0 { 646 %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) 647 store i64 %tmp, i64 addrspace(1)* %out 648 ret void 649} 650 651; GCN-LABEL: {{^}}test_kill_i1_terminator_float: 652; GFX1032: s_mov_b32 exec_lo, 0 653; GFX1064: s_mov_b64 exec, 0 654define amdgpu_ps void @test_kill_i1_terminator_float() #0 { 655 call void @llvm.amdgcn.kill(i1 false) 656 ret void 657} 658 659; GCN-LABEL: {{^}}test_kill_i1_terminator_i1: 660; GFX1032: s_or_b32 [[OR:s[0-9]+]], 661; GFX1032: s_and_b32 exec_lo, exec_lo, [[OR]] 662; GFX1064: s_or_b64 [[OR:s\[[0-9:]+\]]], 663; GFX1064: s_and_b64 exec, exec, [[OR]] 664define amdgpu_gs void @test_kill_i1_terminator_i1(i32 %a, i32 %b, i32 %c, i32 %d) #0 { 665 %c1 = icmp slt i32 %a, %b 666 %c2 = icmp slt i32 %c, %d 667 %x = or i1 %c1, %c2 668 call void @llvm.amdgcn.kill(i1 %x) 669 ret void 670} 671 672; GCN-LABEL: {{^}}test_loop_vcc: 673; GFX1032: v_cmp_lt_f32_e32 vcc_lo, 674; GFX1064: v_cmp_lt_f32_e32 vcc, 675; GCN: s_cbranch_vccnz 676define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) #0 { 677entry: 678 br label %loop 679 680loop: 681 %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ] 682 %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ] 683 %cc = fcmp ogt float %ctr.iv, 7.0 684 br i1 %cc, label %break, label %body 685 686body: 687 %c.iv0 = extractelement <4 x float> %c.iv, i32 0 688 %c.next 
= call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) 689 %ctr.next = fadd float %ctr.iv, 2.0 690 br label %loop 691 692break: 693 ret <4 x float> %c.iv 694} 695 696; GCN-LABEL: {{^}}test_wwm1: 697; GFX1032: s_or_saveexec_b32 [[SAVE:s[0-9]+]], -1 698; GFX1032: s_mov_b32 exec_lo, [[SAVE]] 699; GFX1064: s_or_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], -1 700; GFX1064: s_mov_b64 exec, [[SAVE]] 701define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1, float %src0, float %src1) { 702main_body: 703 %out = fadd float %src0, %src1 704 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) 705 ret float %out.0 706} 707 708; GCN-LABEL: {{^}}test_wwm2: 709; GFX1032: v_cmp_gt_u32_e32 vcc_lo, 32, v{{[0-9]+}} 710; GFX1032: s_and_saveexec_b32 [[SAVE1:s[0-9]+]], vcc_lo 711; GFX1032: s_or_saveexec_b32 [[SAVE2:s[0-9]+]], -1 712; GFX1032: s_mov_b32 exec_lo, [[SAVE2]] 713; GFX1032: s_or_b32 exec_lo, exec_lo, [[SAVE1]] 714; GFX1064: v_cmp_gt_u32_e32 vcc, 32, v{{[0-9]+}} 715; GFX1064: s_and_saveexec_b64 [[SAVE1:s\[[0-9:]+\]]], vcc{{$}} 716; GFX1064: s_or_saveexec_b64 [[SAVE2:s\[[0-9:]+\]]], -1 717; GFX1064: s_mov_b64 exec, [[SAVE2]] 718; GFX1064: s_or_b64 exec, exec, [[SAVE1]] 719define amdgpu_ps float @test_wwm2(i32 inreg %idx) { 720main_body: 721 ; use mbcnt to make sure the branch is divergent 722 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) 723 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) 724 %cc = icmp uge i32 %hi, 32 725 br i1 %cc, label %endif, label %if 726 727if: 728 %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0) 729 %out = fadd float %src, %src 730 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) 731 %out.1 = fadd float %src, %out.0 732 br label %endif 733 734endif: 735 %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ] 736 ret float %out.2 737} 738 739; GCN-LABEL: {{^}}test_wqm1: 740; GFX1032: s_mov_b32 
[[ORIG:s[0-9]+]], exec_lo 741; GFX1032: s_wqm_b32 exec_lo, exec_lo 742; GFX1032: s_and_b32 exec_lo, exec_lo, [[ORIG]] 743; GFX1064: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec{{$}} 744; GFX1064: s_wqm_b64 exec, exec{{$}} 745; GFX1064: s_and_b64 exec, exec, [[ORIG]] 746define amdgpu_ps <4 x float> @test_wqm1(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #0 { 747main_body: 748 %inst23 = extractelement <2 x float> %pos, i32 0 749 %inst24 = extractelement <2 x float> %pos, i32 1 750 %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0) 751 %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0) 752 %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0) 753 %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0) 754 %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) 755 ret <4 x float> %tex 756} 757 758; GCN-LABEL: {{^}}test_wqm2: 759; GFX1032: s_wqm_b32 exec_lo, exec_lo 760; GFX1032: s_and_b32 exec_lo, exec_lo, s{{[0-9]+}} 761; GFX1064: s_wqm_b64 exec, exec{{$}} 762; GFX1064: s_and_b64 exec, exec, s[{{[0-9:]+}}] 763define amdgpu_ps float @test_wqm2(i32 inreg %idx0, i32 inreg %idx1) #0 { 764main_body: 765 %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) 766 %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0) 767 %out = fadd float %src0, %src1 768 %out.0 = bitcast float %out to i32 769 %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0) 770 %out.2 = bitcast i32 %out.1 to float 771 ret float %out.2 772} 773 774; GCN-LABEL: {{^}}test_intr_fcmp_i64: 775; GFX1032-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], 0{{$}} 776; GFX1032-DAG: v_cmp_eq_f32_e64 s[[C_LO:[0-9]+]], 
{{s[0-9]+}}, |{{[vs][0-9]+}}| 777; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]] 778; GFX1064: v_cmp_eq_f32_e64 s{{\[}}[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], {{s[0-9]+}}, |{{[vs][0-9]+}}| 779; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]] 780; GFX1064-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[C_HI]] 781; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[V_LO]]:[[V_HI]]], 782define amdgpu_kernel void @test_intr_fcmp_i64(i64 addrspace(1)* %out, float %src, float %a) { 783 %temp = call float @llvm.fabs.f32(float %a) 784 %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float %src, float %temp, i32 1) 785 store i64 %result, i64 addrspace(1)* %out 786 ret void 787} 788 789; GCN-LABEL: {{^}}test_intr_icmp_i64: 790; GFX1032-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], 0{{$}} 791; GFX1032-DAG: v_cmp_eq_u32_e64 [[C_LO:vcc_lo|s[0-9]+]], 0x64, {{s[0-9]+}} 792; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[C_LO]] 793; GFX1064: v_cmp_eq_u32_e64 s{{\[}}[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], 0x64, {{s[0-9]+}} 794; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]] 795; GFX1064-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[C_HI]] 796; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[V_LO]]:[[V_HI]]], 797define amdgpu_kernel void @test_intr_icmp_i64(i64 addrspace(1)* %out, i32 %src) { 798 %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %src, i32 100, i32 32) 799 store i64 %result, i64 addrspace(1)* %out 800 ret void 801} 802 803; GCN-LABEL: {{^}}test_intr_fcmp_i32: 804; GFX1032-DAG: v_cmp_eq_f32_e64 s[[C_LO:[0-9]+]], {{s[0-9]+}}, |{{[vs][0-9]+}}| 805; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]] 806; GFX1064: v_cmp_eq_f32_e64 s{{\[}}[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], {{s[0-9]+}}, |{{[vs][0-9]+}}| 807; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]] 808; GCN: store_dword v[{{[0-9:]+}}], v[[V_LO]], 809define amdgpu_kernel void @test_intr_fcmp_i32(i32 addrspace(1)* %out, float %src, float %a) { 810 %temp = call float @llvm.fabs.f32(float %a) 811 %result = call i32 
@llvm.amdgcn.fcmp.i32.f32(float %src, float %temp, i32 1) 812 store i32 %result, i32 addrspace(1)* %out 813 ret void 814} 815 816; GCN-LABEL: {{^}}test_intr_icmp_i32: 817; GFX1032-DAG: v_cmp_eq_u32_e64 s[[C_LO:[0-9]+]], 0x64, {{s[0-9]+}} 818; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]{{$}} 819; GFX1064: v_cmp_eq_u32_e64 s{{\[}}[[C_LO:[0-9]+]]:{{[0-9]+}}], 0x64, {{s[0-9]+}} 820; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]{{$}} 821; GCN: store_dword v[{{[0-9:]+}}], v[[V_LO]], 822define amdgpu_kernel void @test_intr_icmp_i32(i32 addrspace(1)* %out, i32 %src) { 823 %result = call i32 @llvm.amdgcn.icmp.i32.i32(i32 %src, i32 100, i32 32) 824 store i32 %result, i32 addrspace(1)* %out 825 ret void 826} 827 828; GCN-LABEL: {{^}}test_wqm_vote: 829; GFX1032: v_cmp_neq_f32_e32 vcc_lo, 0 830; GFX1032: s_wqm_b32 [[WQM:s[0-9]+]], vcc_lo 831; GFX1032: s_and_b32 exec_lo, exec_lo, [[WQM]] 832; GFX1064: v_cmp_neq_f32_e32 vcc, 0 833; GFX1064: s_wqm_b64 [[WQM:s\[[0-9:]+\]]], vcc{{$}} 834; GFX1064: s_and_b64 exec, exec, [[WQM]] 835define amdgpu_ps void @test_wqm_vote(float %a) { 836 %c1 = fcmp une float %a, 0.0 837 %c2 = call i1 @llvm.amdgcn.wqm.vote(i1 %c1) 838 call void @llvm.amdgcn.kill(i1 %c2) 839 ret void 840} 841 842; GCN-LABEL: {{^}}test_branch_true: 843; GFX1032: s_and_b32 vcc_lo, exec_lo, -1 844; GFX1064: s_and_b64 vcc, exec, -1 845define amdgpu_kernel void @test_branch_true() #2 { 846entry: 847 br i1 true, label %for.end, label %for.body.lr.ph 848 849for.body.lr.ph: ; preds = %entry 850 br label %for.body 851 852for.body: ; preds = %for.body, %for.body.lr.ph 853 br i1 undef, label %for.end, label %for.body 854 855for.end: ; preds = %for.body, %entry 856 ret void 857} 858 859; GCN-LABEL: {{^}}test_ps_live: 860; GFX1032: s_mov_b32 [[C:s[0-9]+]], exec_lo 861; GFX1064: s_mov_b64 [[C:s\[[0-9:]+\]]], exec{{$}} 862; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[C]] 863define amdgpu_ps float @test_ps_live() #0 { 864 %live = call i1 @llvm.amdgcn.ps.live() 865 
%live.32 = zext i1 %live to i32 866 %r = bitcast i32 %live.32 to float 867 ret float %r 868} 869 870; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle64: 871; GFX1032: v_cmp_neq_f64_e64 [[C:s[0-9]+]], s[{{[0-9:]+}}], 1.0 872; GFX1032: s_and_b32 vcc_lo, exec_lo, [[C]] 873; GFX1064: v_cmp_neq_f64_e64 [[C:s\[[0-9:]+\]]], s[{{[0-9:]+}}], 1.0 874; GFX1064: s_and_b64 vcc, exec, [[C]] 875define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { 876entry: 877 %v = load double, double addrspace(1)* %in 878 %cc = fcmp oeq double %v, 1.000000e+00 879 br i1 %cc, label %if, label %endif 880 881if: 882 %u = fadd double %v, %v 883 br label %endif 884 885endif: 886 %r = phi double [ %v, %entry ], [ %u, %if ] 887 store double %r, double addrspace(1)* %out 888 ret void 889} 890 891; GCN-LABEL: {{^}}test_vgprblocks_w32_attr: 892; Test that the wave size can be overridden in function attributes and that the block size is correct as a result 893; GFX10DEFWAVE: ; VGPRBlocks: 1 894define amdgpu_gs float @test_vgprblocks_w32_attr(float %a, float %b, float %c, float %d, float %e, 895 float %f, float %g, float %h, float %i, float %j, float %k, float %l) #3 { 896main_body: 897 %s = fadd float %a, %b 898 %s.1 = fadd float %s, %c 899 %s.2 = fadd float %s.1, %d 900 %s.3 = fadd float %s.2, %e 901 %s.4 = fadd float %s.3, %f 902 %s.5 = fadd float %s.4, %g 903 %s.6 = fadd float %s.5, %h 904 %s.7 = fadd float %s.6, %i 905 %s.8 = fadd float %s.7, %j 906 %s.9 = fadd float %s.8, %k 907 %s.10 = fadd float %s.9, %l 908 ret float %s.10 909} 910 911; GCN-LABEL: {{^}}test_vgprblocks_w64_attr: 912; Test that the wave size can be overridden in function attributes and that the block size is correct as a result 913; GFX10DEFWAVE: ; VGPRBlocks: 2 914define amdgpu_gs float @test_vgprblocks_w64_attr(float %a, float %b, float %c, float %d, float %e, 915 float %f, float %g, float %h, float %i, float %j, float %k, float %l) #4 { 916main_body: 917 %s = fadd float %a, %b 
918 %s.1 = fadd float %s, %c 919 %s.2 = fadd float %s.1, %d 920 %s.3 = fadd float %s.2, %e 921 %s.4 = fadd float %s.3, %f 922 %s.5 = fadd float %s.4, %g 923 %s.6 = fadd float %s.5, %h 924 %s.7 = fadd float %s.6, %i 925 %s.8 = fadd float %s.7, %j 926 %s.9 = fadd float %s.8, %k 927 %s.10 = fadd float %s.9, %l 928 ret float %s.10 929} 930 931; GCN-LABEL: {{^}}icmp64: 932; GFX1032: v_cmp_eq_u32_e32 vcc_lo, 0, v 933; GFX1064: v_cmp_eq_u32_e32 vcc, 0, v 934define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { 935entry: 936 %id = tail call i32 @llvm.amdgcn.workitem.id.x() 937 %mul4 = mul nsw i32 %s, %n 938 %cmp = icmp slt i32 0, %mul4 939 br label %if.end 940 941if.end: ; preds = %entry 942 %rem = urem i32 %id, %s 943 %icmp = tail call i64 @llvm.amdgcn.icmp.i64.i32(i32 %rem, i32 0, i32 32) 944 %shr = lshr i64 %icmp, 1 945 %notmask = shl nsw i64 -1, 0 946 %and = and i64 %notmask, %shr 947 %or = or i64 %and, -9223372036854775808 948 %cttz = tail call i64 @llvm.cttz.i64(i64 %or, i1 true) 949 %cast = trunc i64 %cttz to i32 950 %cmp3 = icmp ugt i32 10, %cast 951 %cmp6 = icmp ne i32 %rem, 0 952 %brmerge = or i1 %cmp6, %cmp3 953 br i1 %brmerge, label %if.end2, label %if.then 954 955if.then: ; preds = %if.end 956 unreachable 957 958if.end2: ; preds = %if.end 959 ret void 960} 961 962; GCN-LABEL: {{^}}fcmp64: 963; GFX1032: v_cmp_eq_f32_e32 vcc_lo, 0, v 964; GFX1064: v_cmp_eq_f32_e32 vcc, 0, v 965define amdgpu_kernel void @fcmp64(float %n, float %s) { 966entry: 967 %id = tail call i32 @llvm.amdgcn.workitem.id.x() 968 %id.f = uitofp i32 %id to float 969 %mul4 = fmul float %s, %n 970 %cmp = fcmp ult float 0.0, %mul4 971 br label %if.end 972 973if.end: ; preds = %entry 974 %rem.f = frem float %id.f, %s 975 %fcmp = tail call i64 @llvm.amdgcn.fcmp.i64.f32(float %rem.f, float 0.0, i32 1) 976 %shr = lshr i64 %fcmp, 1 977 %notmask = shl nsw i64 -1, 0 978 %and = and i64 %notmask, %shr 979 %or = or i64 %and, -9223372036854775808 980 %cttz = tail call i64 @llvm.cttz.i64(i64 %or, i1 true) 981 
%cast = trunc i64 %cttz to i32 982 %cmp3 = icmp ugt i32 10, %cast 983 %cmp6 = fcmp one float %rem.f, 0.0 984 %brmerge = or i1 %cmp6, %cmp3 985 br i1 %brmerge, label %if.end2, label %if.then 986 987if.then: ; preds = %if.end 988 unreachable 989 990if.end2: ; preds = %if.end 991 ret void 992} 993 994; GCN-LABEL: {{^}}icmp32: 995; GFX1032: v_cmp_eq_u32_e32 vcc_lo, 0, v 996; GFX1064: v_cmp_eq_u32_e32 vcc, 0, v 997define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { 998entry: 999 %id = tail call i32 @llvm.amdgcn.workitem.id.x() 1000 %mul4 = mul nsw i32 %s, %n 1001 %cmp = icmp slt i32 0, %mul4 1002 br label %if.end 1003 1004if.end: ; preds = %entry 1005 %rem = urem i32 %id, %s 1006 %icmp = tail call i32 @llvm.amdgcn.icmp.i32.i32(i32 %rem, i32 0, i32 32) 1007 %shr = lshr i32 %icmp, 1 1008 %notmask = shl nsw i32 -1, 0 1009 %and = and i32 %notmask, %shr 1010 %or = or i32 %and, 2147483648 1011 %cttz = tail call i32 @llvm.cttz.i32(i32 %or, i1 true) 1012 %cmp3 = icmp ugt i32 10, %cttz 1013 %cmp6 = icmp ne i32 %rem, 0 1014 %brmerge = or i1 %cmp6, %cmp3 1015 br i1 %brmerge, label %if.end2, label %if.then 1016 1017if.then: ; preds = %if.end 1018 unreachable 1019 1020if.end2: ; preds = %if.end 1021 ret void 1022} 1023 1024; GCN-LABEL: {{^}}fcmp32: 1025; GFX1032: v_cmp_eq_f32_e32 vcc_lo, 0, v 1026; GFX1064: v_cmp_eq_f32_e32 vcc, 0, v 1027define amdgpu_kernel void @fcmp32(float %n, float %s) { 1028entry: 1029 %id = tail call i32 @llvm.amdgcn.workitem.id.x() 1030 %id.f = uitofp i32 %id to float 1031 %mul4 = fmul float %s, %n 1032 %cmp = fcmp ult float 0.0, %mul4 1033 br label %if.end 1034 1035if.end: ; preds = %entry 1036 %rem.f = frem float %id.f, %s 1037 %fcmp = tail call i32 @llvm.amdgcn.fcmp.i32.f32(float %rem.f, float 0.0, i32 1) 1038 %shr = lshr i32 %fcmp, 1 1039 %notmask = shl nsw i32 -1, 0 1040 %and = and i32 %notmask, %shr 1041 %or = or i32 %and, 2147483648 1042 %cttz = tail call i32 @llvm.cttz.i32(i32 %or, i1 true) 1043 %cmp3 = icmp ugt i32 10, %cttz 1044 %cmp6 = fcmp one 
float %rem.f, 0.0 1045 %brmerge = or i1 %cmp6, %cmp3 1046 br i1 %brmerge, label %if.end2, label %if.then 1047 1048if.then: ; preds = %if.end 1049 unreachable 1050 1051if.end2: ; preds = %if.end 1052 ret void 1053} 1054 1055declare void @external_void_func_void() #1 1056 1057; Test save/restore of VGPR needed for SGPR spilling. 1058 1059; GCN-LABEL: {{^}}callee_no_stack_with_call: 1060; GCN: s_waitcnt 1061; GCN-NEXT: s_waitcnt_vscnt 1062 1063; GFX1064-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} 1064; GFX1032-NEXT: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]]], -1{{$}} 1065; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill 1066; GCN-NEXT: v_nop 1067; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] 1068; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC0]] 1069 1070; GCN-NEXT: v_writelane_b32 v32, s34, 2 1071; GCN: s_mov_b32 s34, s32 1072; GFX1064: s_add_u32 s32, s32, 0x400 1073; GFX1032: s_add_u32 s32, s32, 0x200 1074 1075 1076; GCN-DAG: v_writelane_b32 v32, s30, 0 1077; GCN-DAG: v_writelane_b32 v32, s31, 1 1078; GCN: s_swappc_b64 1079; GCN-DAG: v_readlane_b32 s4, v32, 0 1080; GCN-DAG: v_readlane_b32 s5, v32, 1 1081 1082 1083; GFX1064: s_sub_u32 s32, s32, 0x400 1084; GFX1032: s_sub_u32 s32, s32, 0x200 1085; GCN: v_readlane_b32 s34, v32, 2 1086; GFX1064: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} 1087; GFX1032: s_or_saveexec_b32 [[COPY_EXEC1:s[0-9]]], -1{{$}} 1088; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload 1089; GCN-NEXT: v_nop 1090; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] 1091; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC1]] 1092; GCN-NEXT: s_waitcnt vmcnt(0) 1093; GCN-NEXT: s_setpc_b64 1094define void @callee_no_stack_with_call() #1 { 1095 call void @external_void_func_void() 1096 ret void 1097} 1098 1099 1100declare i32 @llvm.amdgcn.workitem.id.x() 1101declare float @llvm.fabs.f32(float) 1102declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1) 1103declare { double, 
i1 } @llvm.amdgcn.div.scale.f64(double, double, i1) 1104declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1) 1105declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) 1106declare i1 @llvm.amdgcn.class.f32(float, i32) 1107declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) 1108declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) 1109declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) 1110declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) 1111declare float @llvm.amdgcn.wwm.f32(float) 1112declare i32 @llvm.amdgcn.wqm.i32(i32) 1113declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) 1114declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) 1115declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) 1116declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) 1117declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) 1118declare i64 @llvm.amdgcn.fcmp.i64.f32(float, float, i32) 1119declare i64 @llvm.amdgcn.icmp.i64.i32(i32, i32, i32) 1120declare i32 @llvm.amdgcn.fcmp.i32.f32(float, float, i32) 1121declare i32 @llvm.amdgcn.icmp.i32.i32(i32, i32, i32) 1122declare void @llvm.amdgcn.kill(i1) 1123declare i1 @llvm.amdgcn.wqm.vote(i1) 1124declare i1 @llvm.amdgcn.ps.live() 1125declare i64 @llvm.cttz.i64(i64, i1) 1126declare i32 @llvm.cttz.i32(i32, i1) 1127 1128attributes #0 = { nounwind readnone speculatable } 1129attributes #1 = { nounwind } 1130attributes #2 = { nounwind readnone optnone noinline } 1131attributes #3 = { "target-features"="+wavefrontsize32" } 1132attributes #4 = { "target-features"="+wavefrontsize64" } 1133