; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX1013 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX1030 %s
; RUN: not --crash llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERR %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s

; Run configurations above: gfx1013, gfx1030 and gfx1100 are expected to
; compile and have their assembly checked; gfx1012 is expected to make llc
; crash, and only the diagnostic text (the ERR prefix) is checked for it.

; C-style prototypes of the four intrinsic overloads exercised in this file
; (32-bit or 64-bit node pointer, float or half ray direction vectors):
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)

declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>)

; ERR: in function image_bvh_intersect_ray{{.*}}intrinsic not supported on subtarget
; Arguments are flattened to represent the actual VGPR_A layout, so we have no
; extra moves in the generated kernel.
; 32-bit node pointer with float ray vectors. The three <3 x float> operands
; are rebuilt from scalar arguments, and the checks expect the instruction to
; be emitted with no preceding register moves.
define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh_intersect_ray:
; GCN:       ; %bb.0: ; %main_body
; GCN-NEXT:    image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
main_body:
  %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0
  %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
  %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
  %ray_dir0 = insertelement <3 x float> undef, float %ray_dir_x, i32 0
  %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
  %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
  %ray_inv_dir0 = insertelement <3 x float> undef, float %ray_inv_dir_x, i32 0
  %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
  %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
}

; A16 form with a 32-bit node pointer: ray_dir and ray_inv_dir are <3 x half>.
; Every argument is inreg, so the checks expect the half components to be
; packed with s_pack_ll_b32_b16 and the operands copied from SGPRs into VGPRs
; before the instruction.
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, float inreg %ray_extent, <3 x float> inreg %ray_origin, <3 x half> inreg %ray_dir, <3 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GFX10-LABEL: image_bvh_intersect_ray_a16:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    s_mov_b32 s15, s12
; GFX10-NEXT:    s_mov_b32 s12, s9
; GFX10-NEXT:    s_lshr_b32 s9, s7, 16
; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s7
; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s9, s8
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    v_mov_b32_e32 v4, s4
; GFX10-NEXT:    v_mov_b32_e32 v5, s5
; GFX10-NEXT:    v_mov_b32_e32 v6, s6
; GFX10-NEXT:    v_mov_b32_e32 v7, s7
; GFX10-NEXT:    s_mov_b32 s14, s11
; GFX10-NEXT:    s_mov_b32 s13, s10
; GFX10-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[12:15] a16
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: image_bvh_intersect_ray_a16:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT:    s_lshr_b32 s2, s7, 16
; GFX11-NEXT:    s_lshr_b32 s3, s5, 16
; GFX11-NEXT:    v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1
; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s3, s2
; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s5, s7
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT:    s_pack_ll_b32_b16 s4, s6, s8
; GFX11-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s4
; GFX11-NEXT:    s_mov_b32 s15, s12
; GFX11-NEXT:    s_mov_b32 s14, s11
; GFX11-NEXT:    s_mov_b32 s13, s10
; GFX11-NEXT:    s_mov_b32 s12, s9
; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[12:15] a16
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
main_body:
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
}

; Arguments are flattened to represent the actual VGPR_A layout, so we have no
; extra moves in the generated kernel.
; 64-bit node pointer with float ray vectors. The pointer arrives as
; <2 x i32> and is bitcast to i64; the ray vectors are rebuilt from scalar
; arguments. The checks expect no register moves before the instruction.
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(<2 x i32> %node_ptr_vec, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh64_intersect_ray:
; GCN:       ; %bb.0: ; %main_body
; GCN-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
main_body:
  %node_ptr = bitcast <2 x i32> %node_ptr_vec to i64
  %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0
  %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
  %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
  %ray_dir0 = insertelement <3 x float> undef, float %ray_dir_x, i32 0
  %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
  %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
  %ray_inv_dir0 = insertelement <3 x float> undef, float %ray_inv_dir_x, i32 0
  %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
  %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
}

; A16 form with a 64-bit node pointer: ray_dir and ray_inv_dir are <3 x half>.
; Every argument is inreg, so the checks expect the half components to be
; packed with s_pack_ll_b32_b16 and the operands copied from SGPRs into VGPRs
; before the instruction.
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr, float inreg %ray_extent, <3 x float> inreg %ray_origin, <3 x half> inreg %ray_dir, <3 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GFX10-LABEL: image_bvh64_intersect_ray_a16:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    s_mov_b32 s14, s12
; GFX10-NEXT:    s_mov_b32 s12, s10
; GFX10-NEXT:    s_lshr_b32 s10, s8, 16
; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s10, s9
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    v_mov_b32_e32 v4, s4
; GFX10-NEXT:    v_mov_b32_e32 v5, s5
; GFX10-NEXT:    v_mov_b32_e32 v6, s6
; GFX10-NEXT:    v_mov_b32_e32 v7, s7
; GFX10-NEXT:    v_mov_b32_e32 v8, s8
; GFX10-NEXT:    s_mov_b32 s15, s13
; GFX10-NEXT:    s_mov_b32 s13, s11
; GFX10-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[12:15] a16
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: image_bvh64_intersect_ray_a16:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s4
; GFX11-NEXT:    v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v7, s1
; GFX11-NEXT:    s_lshr_b32 s3, s6, 16
; GFX11-NEXT:    s_pack_ll_b32_b16 s1, s6, s8
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v3, s1
; GFX11-NEXT:    s_lshr_b32 s0, s8, 16
; GFX11-NEXT:    v_mov_b32_e32 v8, s2
; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s3, s0
; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s7, s9
; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s3
; GFX11-NEXT:    s_mov_b32 s15, s13
; GFX11-NEXT:    s_mov_b32 s14, s12
; GFX11-NEXT:    s_mov_b32 s13, s11
; GFX11-NEXT:    s_mov_b32 s12, s10
; GFX11-NEXT:    image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[12:15] a16
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
main_body:
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
}

; TODO:
; NSA reassign is very limited and cannot work with VGPR tuples and subregs.

; Kernel form, 32-bit node pointer, float ray vectors. node_ptr and ray_extent
; are loaded per work-item (indexed by workitem.id.x); the ray vectors are the
; constants origin = (0, 1, 2), dir = (3, 4, 5), inv_dir = (6, 7, 8). The result
; is stored through an undef pointer.
define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) {
; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign:
; GFX1013:       ; %bb.0: ; %main_body
; GFX1013-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1013-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1013-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1013-NEXT:    v_mov_b32_e32 v6, 4.0
; GFX1013-NEXT:    v_mov_b32_e32 v7, 0x40a00000
; GFX1013-NEXT:    v_mov_b32_e32 v8, 0x40c00000
; GFX1013-NEXT:    v_mov_b32_e32 v9, 0x40e00000
; GFX1013-NEXT:    v_mov_b32_e32 v10, 0x41000000
; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1013-NEXT:    v_add_co_u32 v2, s4, s4, v0
; GFX1013-NEXT:    v_add_co_ci_u32_e64 v3, s4, s5, 0, s4
; GFX1013-NEXT:    v_add_co_u32 v4, s4, s6, v0
; GFX1013-NEXT:    v_add_co_ci_u32_e64 v5, s4, s7, 0, s4
; GFX1013-NEXT:    flat_load_dword v0, v[2:3]
; GFX1013-NEXT:    flat_load_dword v1, v[4:5]
; GFX1013-NEXT:    v_mov_b32_e32 v2, 0
; GFX1013-NEXT:    v_mov_b32_e32 v3, 1.0
; GFX1013-NEXT:    v_mov_b32_e32 v4, 2.0
; GFX1013-NEXT:    v_mov_b32_e32 v5, 0x40400000
; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
; GFX1013-NEXT:    s_waitcnt vmcnt(0)
; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT:    s_endpgm
;
; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign:
; GFX1030:       ; %bb.0: ; %main_body
; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1030-NEXT:    v_mov_b32_e32 v10, 0x41000000
; GFX1030-NEXT:    v_mov_b32_e32 v9, 0x40e00000
; GFX1030-NEXT:    v_mov_b32_e32 v8, 0x40c00000
; GFX1030-NEXT:    v_mov_b32_e32 v7, 0x40a00000
; GFX1030-NEXT:    v_mov_b32_e32 v6, 4.0
; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x40400000
; GFX1030-NEXT:    v_mov_b32_e32 v4, 2.0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    v_add_co_u32 v0, s4, s4, v2
; GFX1030-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX1030-NEXT:    v_add_co_u32 v2, s4, s6, v2
; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, s7, 0, s4
; GFX1030-NEXT:    flat_load_dword v0, v[0:1]
; GFX1030-NEXT:    flat_load_dword v1, v[2:3]
; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
; GFX1030-NEXT:    v_mov_b32_e32 v3, 1.0
; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT:    s_endpgm
;
; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x34
; GFX11-NEXT:    v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0
; GFX11-NEXT:    v_mov_b32_e32 v8, 2.0
; GFX11-NEXT:    v_dual_mov_b32 v4, 4.0 :: v_dual_mov_b32 v7, 1.0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_add_co_u32 v0, s4, s4, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX11-NEXT:    v_add_co_u32 v2, s4, s6, v2
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s7, 0, s4
; GFX11-NEXT:    flat_load_b32 v9, v[0:1]
; GFX11-NEXT:    flat_load_b32 v10, v[2:3]
; GFX11-NEXT:    v_mov_b32_e32 v0, 0x40c00000
; GFX11-NEXT:    v_mov_b32_e32 v1, 0x40e00000
; GFX11-NEXT:    v_mov_b32_e32 v2, 0x41000000
; GFX11-NEXT:    v_mov_b32_e32 v3, 0x40400000
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v9, v10, v[6:8], v[3:5], v[0:2]], s[0:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
main_body:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid
  %node_ptr = load i32, i32* %gep_node_ptr, align 4
  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
  %ray_extent = load float, float* %gep_ray, align 4
  %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
  %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
  %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
  %ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0
  %ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1
  %ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2
  %ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0
  %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1
  %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, <4 x i32>* undef
  ret void
}

; Kernel form, 32-bit node pointer, A16: same setup as the float kernel above,
; but dir = (3, 4, 5) and inv_dir = (6, 7, 8) are half constants, which the
; checks expect to appear as packed 32-bit immediates.
define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) {
; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
; GFX1013:       ; %bb.0: ; %main_body
; GFX1013-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1013-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1013-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1013-NEXT:    v_mov_b32_e32 v6, 0x46004500
; GFX1013-NEXT:    v_mov_b32_e32 v7, 0x48004700
; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1013-NEXT:    v_add_co_u32 v2, s4, s4, v0
; GFX1013-NEXT:    v_add_co_ci_u32_e64 v3, s4, s5, 0, s4
; GFX1013-NEXT:    v_add_co_u32 v4, s4, s6, v0
; GFX1013-NEXT:    v_add_co_ci_u32_e64 v5, s4, s7, 0, s4
; GFX1013-NEXT:    flat_load_dword v0, v[2:3]
; GFX1013-NEXT:    flat_load_dword v1, v[4:5]
; GFX1013-NEXT:    v_mov_b32_e32 v2, 0
; GFX1013-NEXT:    v_mov_b32_e32 v3, 1.0
; GFX1013-NEXT:    v_mov_b32_e32 v4, 2.0
; GFX1013-NEXT:    v_mov_b32_e32 v5, 0x44004200
; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16
; GFX1013-NEXT:    s_waitcnt vmcnt(0)
; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT:    s_endpgm
;
; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
; GFX1030:       ; %bb.0: ; %main_body
; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1030-NEXT:    v_mov_b32_e32 v4, 2.0
; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x44004200
; GFX1030-NEXT:    v_mov_b32_e32 v6, 0x46004500
; GFX1030-NEXT:    v_mov_b32_e32 v7, 0x48004700
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    v_add_co_u32 v0, s4, s4, v2
; GFX1030-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX1030-NEXT:    v_add_co_u32 v2, s4, s6, v2
; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, s7, 0, s4
; GFX1030-NEXT:    flat_load_dword v0, v[0:1]
; GFX1030-NEXT:    flat_load_dword v1, v[2:3]
; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
; GFX1030-NEXT:    v_mov_b32_e32 v3, 1.0
; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT:    s_endpgm
;
; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x34
; GFX11-NEXT:    v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v5, 2.0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v0, s4, s4, v2
; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX11-NEXT:    v_add_co_u32 v2, s4, s6, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s7, 0, s4
; GFX11-NEXT:    flat_load_b32 v6, v[0:1]
; GFX11-NEXT:    flat_load_b32 v7, v[2:3]
; GFX11-NEXT:    v_mov_b32_e32 v1, 0x47004400
; GFX11-NEXT:    v_dual_mov_b32 v0, 0x46004200 :: v_dual_mov_b32 v3, 0
; GFX11-NEXT:    v_mov_b32_e32 v2, 0x48004500
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[0:3] a16
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
main_body:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid
  %node_ptr = load i32, i32* %gep_node_ptr, align 4
  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
  %ray_extent = load float, float* %gep_ray, align 4
  %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
  %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
  %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
  %ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0
  %ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1
  %ray_dir = insertelement <3 x half> %ray_dir1, half 5.0, i32 2
  %ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0
  %ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1
  %ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, <4 x i32>* undef
  ret void
}

; Kernel form, 64-bit node pointer, float ray vectors. Only ray_extent is
; loaded per work-item; node_ptr is the constant 1111111111111, which the
; checks expect as the immediate pair 0x102 (high) and 0xb36211c7 (low).
define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray, <4 x i32> inreg %tdescr) {
; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX1013:       ; %bb.0: ; %main_body
; GFX1013-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1013-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1013-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1013-NEXT:    v_mov_b32_e32 v3, 0
; GFX1013-NEXT:    v_mov_b32_e32 v4, 1.0
; GFX1013-NEXT:    v_mov_b32_e32 v5, 2.0
; GFX1013-NEXT:    v_mov_b32_e32 v6, 0x40400000
; GFX1013-NEXT:    v_mov_b32_e32 v7, 4.0
; GFX1013-NEXT:    v_mov_b32_e32 v8, 0x40a00000
; GFX1013-NEXT:    v_mov_b32_e32 v9, 0x40c00000
; GFX1013-NEXT:    v_mov_b32_e32 v10, 0x40e00000
; GFX1013-NEXT:    v_mov_b32_e32 v11, 0x41000000
; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1013-NEXT:    v_add_co_u32 v0, s4, s4, v0
; GFX1013-NEXT:    v_add_co_ci_u32_e64 v1, s4, s5, 0, s4
; GFX1013-NEXT:    flat_load_dword v2, v[0:1]
; GFX1013-NEXT:    v_mov_b32_e32 v0, 0xb36211c7
; GFX1013-NEXT:    v_mov_b32_e32 v1, 0x102
; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
; GFX1013-NEXT:    s_waitcnt vmcnt(0)
; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT:    s_endpgm
;
; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX1030:       ; %bb.0: ; %main_body
; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
; GFX1030-NEXT:    v_mov_b32_e32 v11, 0x41000000
; GFX1030-NEXT:    v_mov_b32_e32 v10, 0x40e00000
; GFX1030-NEXT:    v_mov_b32_e32 v9, 0x40c00000
; GFX1030-NEXT:    v_mov_b32_e32 v8, 0x40a00000
; GFX1030-NEXT:    v_mov_b32_e32 v7, 4.0
; GFX1030-NEXT:    v_mov_b32_e32 v6, 0x40400000
; GFX1030-NEXT:    v_mov_b32_e32 v5, 2.0
; GFX1030-NEXT:    v_mov_b32_e32 v4, 1.0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    v_add_co_u32 v0, s4, s4, v0
; GFX1030-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX1030-NEXT:    flat_load_dword v2, v[0:1]
; GFX1030-NEXT:    v_mov_b32_e32 v1, 0x102
; GFX1030-NEXT:    v_mov_b32_e32 v0, 0xb36211c7
; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT:    s_endpgm
;
; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x34
; GFX11-NEXT:    v_mov_b32_e32 v2, 0x41000000
; GFX11-NEXT:    v_dual_mov_b32 v3, 0x40400000 :: v_dual_mov_b32 v4, 4.0
; GFX11-NEXT:    v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0
; GFX11-NEXT:    v_dual_mov_b32 v8, 2.0 :: v_dual_mov_b32 v9, 0xb36211c7
; GFX11-NEXT:    v_dual_mov_b32 v10, 0x102 :: v_dual_mov_b32 v7, 1.0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_add_co_u32 v0, s4, s4, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX11-NEXT:    flat_load_b32 v11, v[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v0, 0x40c00000
; GFX11-NEXT:    v_mov_b32_e32 v1, 0x40e00000
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[6:8], v[3:5], v[0:2]], s[0:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
main_body:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
  %ray_extent = load float, float* %gep_ray, align 4
  %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
  %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
  %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
  %ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0
  %ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1
  %ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2
  %ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0
  %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1
  %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, <4 x i32>* undef
  ret void
}

; Kernel form, 64-bit node pointer, A16. node_ptr is the constant
; 1111111111110 (immediate pair 0x102 / 0xb36211c6 in the checks); dir and
; inv_dir are half constants.
define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_ray, <4 x i32> inreg %tdescr) {
; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX1013:       ; %bb.0: ; %main_body
; GFX1013-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1013-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1013-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1013-NEXT:    v_mov_b32_e32 v3, 0
; GFX1013-NEXT:    v_mov_b32_e32 v4, 1.0
; GFX1013-NEXT:    v_mov_b32_e32 v5, 2.0
; GFX1013-NEXT:    v_mov_b32_e32 v6, 0x44004200
; GFX1013-NEXT:    v_mov_b32_e32 v7, 0x46004500
; GFX1013-NEXT:    v_mov_b32_e32 v8, 0x48004700
; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1013-NEXT:    v_add_co_u32 v0, s4, s4, v0
; GFX1013-NEXT:    v_add_co_ci_u32_e64 v1, s4, s5, 0, s4
; GFX1013-NEXT:    flat_load_dword v2, v[0:1]
; GFX1013-NEXT:    v_mov_b32_e32 v0, 0xb36211c6
; GFX1013-NEXT:    v_mov_b32_e32 v1, 0x102
; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
; GFX1013-NEXT:    s_waitcnt vmcnt(0)
; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT:    s_endpgm
;
; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX1030:       ; %bb.0: ; %main_body
; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
; GFX1030-NEXT:    v_mov_b32_e32 v5, 2.0
; GFX1030-NEXT:    v_mov_b32_e32 v4, 1.0
; GFX1030-NEXT:    v_mov_b32_e32 v6, 0x44004200
; GFX1030-NEXT:    v_mov_b32_e32 v7, 0x46004500
; GFX1030-NEXT:    v_mov_b32_e32 v8, 0x48004700
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    v_add_co_u32 v0, s4, s4, v0
; GFX1030-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX1030-NEXT:    flat_load_dword v2, v[0:1]
; GFX1030-NEXT:    v_mov_b32_e32 v1, 0x102
; GFX1030-NEXT:    v_mov_b32_e32 v0, 0xb36211c6
; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT:    s_endpgm
;
; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x34
; GFX11-NEXT:    v_dual_mov_b32 v2, 0x48004500 :: v_dual_mov_b32 v5, 2.0
; GFX11-NEXT:    v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v7, 0x102
; GFX11-NEXT:    v_dual_mov_b32 v6, 0xb36211c6 :: v_dual_mov_b32 v3, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_add_co_u32 v0, s4, s4, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX11-NEXT:    flat_load_b32 v8, v[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v0, 0x46004200
; GFX11-NEXT:    v_mov_b32_e32 v1, 0x47004400
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[3:5], v[0:2]], s[0:3] a16
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
main_body:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
  %ray_extent = load float, float* %gep_ray, align 4
  %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
  %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
  %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
  %ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0
  %ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1
  %ray_dir = insertelement <3 x half> %ray_dir1, half 5.0, i32 2
  %ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0
  %ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1
  %ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, <4 x i32>* undef
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()