; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX1013 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX1030 %s
; RUN: not --crash llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERR %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s

; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)

declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <3 x float>, <3 x float>, <3 x float>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <3 x float>, <3 x half>, <3 x half>, <4 x i32>)

; ERR: in function image_bvh_intersect_ray{{.*}}intrinsic not supported on subtarget
; Arguments are flattened to represent the actual VGPR_A layout, so we have no
; extra moves in the generated kernel.
; Pixel-shader test: 32-bit node pointer with f32 ray direction vectors.
; Every ray component is a separate scalar argument; the components are
; assembled into <3 x float> vectors with insertelement before the intrinsic
; call, and the <4 x i32> result is bitcast to <4 x float> for the return.
; Only the texture descriptor is passed inreg.
define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh_intersect_ray:
; GCN:       ; %bb.0: ; %main_body
; GCN-NEXT:    image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
main_body:
  %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0
  %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
  %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
  %ray_dir0 = insertelement <3 x float> undef, float %ray_dir_x, i32 0
  %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
  %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
  %ray_inv_dir0 = insertelement <3 x float> undef, float %ray_inv_dir_x, i32 0
  %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
  %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
}

; Pixel-shader test: 32-bit node pointer with half-precision (a16) ray
; direction vectors. All operands, including the ray data, are passed inreg,
; so the expected output contains the scalar packing of the half components
; and the copies into vector registers ahead of the image instruction.
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, float inreg %ray_extent, <3 x float> inreg %ray_origin, <3 x half> inreg %ray_dir, <3 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GFX10-LABEL: image_bvh_intersect_ray_a16:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    s_mov_b32 s15, s12
; GFX10-NEXT:    s_mov_b32 s12, s9
; GFX10-NEXT:    s_lshr_b32 s9, s7, 16
; GFX10-NEXT:    s_pack_ll_b32_b16 s6, s6, s7
; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s9, s8
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    v_mov_b32_e32 v4, s4
; GFX10-NEXT:    v_mov_b32_e32 v5, s5
; GFX10-NEXT:    v_mov_b32_e32 v6, s6
; GFX10-NEXT:    v_mov_b32_e32 v7, s7
; GFX10-NEXT:    s_mov_b32 s14, s11
; GFX10-NEXT:    s_mov_b32 s13, s10
; GFX10-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[12:15] a16
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: image_bvh_intersect_ray_a16:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT:    s_lshr_b32 s2, s7, 16
; GFX11-NEXT:    s_lshr_b32 s3, s5, 16
; GFX11-NEXT:    v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1
; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s3, s2
; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s5, s7
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT:    s_pack_ll_b32_b16 s4, s6, s8
; GFX11-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s4
; GFX11-NEXT:    s_mov_b32 s15, s12
; GFX11-NEXT:    s_mov_b32 s14, s11
; GFX11-NEXT:    s_mov_b32 s13, s10
; GFX11-NEXT:    s_mov_b32 s12, s9
; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[12:15] a16
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
main_body:
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
}

; Arguments are flattened to represent the actual VGPR_A layout, so we have no
; extra moves in the generated kernel.
; Pixel-shader test: 64-bit node pointer with f32 ray direction vectors.
; The node pointer arrives as <2 x i32> and is bitcast to i64; the ray
; components are separate scalar arguments assembled into <3 x float> vectors
; before the intrinsic call. Only the texture descriptor is passed inreg.
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(<2 x i32> %node_ptr_vec, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh64_intersect_ray:
; GCN:       ; %bb.0: ; %main_body
; GCN-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
main_body:
  %node_ptr = bitcast <2 x i32> %node_ptr_vec to i64
  %ray_origin0 = insertelement <3 x float> undef, float %ray_origin_x, i32 0
  %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1
  %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2
  %ray_dir0 = insertelement <3 x float> undef, float %ray_dir_x, i32 0
  %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1
  %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2
  %ray_inv_dir0 = insertelement <3 x float> undef, float %ray_inv_dir_x, i32 0
  %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
  %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
}

; Pixel-shader test: 64-bit node pointer with half-precision (a16) ray
; direction vectors. All operands are passed inreg, so the expected output
; contains the scalar packing of the half components and the copies into
; vector registers ahead of the image instruction.
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr, float inreg %ray_extent, <3 x float> inreg %ray_origin, <3 x half> inreg %ray_dir, <3 x half> inreg %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GFX10-LABEL: image_bvh64_intersect_ray_a16:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    s_mov_b32 s14, s12
; GFX10-NEXT:    s_mov_b32 s12, s10
; GFX10-NEXT:    s_lshr_b32 s10, s8, 16
; GFX10-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
; GFX10-NEXT:    s_pack_ll_b32_b16 s8, s10, s9
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    v_mov_b32_e32 v4, s4
; GFX10-NEXT:    v_mov_b32_e32 v5, s5
; GFX10-NEXT:    v_mov_b32_e32 v6, s6
; GFX10-NEXT:    v_mov_b32_e32 v7, s7
; GFX10-NEXT:    v_mov_b32_e32 v8, s8
; GFX10-NEXT:    s_mov_b32 s15, s13
; GFX10-NEXT:    s_mov_b32 s13, s11
; GFX10-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[12:15] a16
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: image_bvh64_intersect_ray_a16:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s4
; GFX11-NEXT:    v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v7, s1
; GFX11-NEXT:    s_lshr_b32 s3, s6, 16
; GFX11-NEXT:    s_pack_ll_b32_b16 s1, s6, s8
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v3, s1
; GFX11-NEXT:    s_lshr_b32 s0, s8, 16
; GFX11-NEXT:    v_mov_b32_e32 v8, s2
; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s3, s0
; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s7, s9
; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s3
; GFX11-NEXT:    s_mov_b32 s15, s13
; GFX11-NEXT:    s_mov_b32 s14, s12
; GFX11-NEXT:    s_mov_b32 s13, s11
; GFX11-NEXT:    s_mov_b32 s12, s10
; GFX11-NEXT:    image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[12:15] a16
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
main_body:
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
}

; TODO: NSA reassign is very limited and cannot work with VGPR tuples and subregs.

; Compute-kernel test: 32-bit node pointer with f32 ray direction vectors.
; node_ptr and ray_extent are loaded from memory at an index given by the
; workitem id; the ray origin, direction and inverse direction are the
; constants 0.0-2.0, 3.0-5.0 and 6.0-8.0 respectively. The intrinsic result is
; stored through an undef pointer (presumably only to keep the call live).
define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) {
; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign:
; GFX1013:       ; %bb.0: ; %main_body
; GFX1013-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1013-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1013-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1013-NEXT:    v_mov_b32_e32 v6, 4.0
; GFX1013-NEXT:    v_mov_b32_e32 v7, 0x40a00000
; GFX1013-NEXT:    v_mov_b32_e32 v8, 0x40c00000
; GFX1013-NEXT:    v_mov_b32_e32 v9, 0x40e00000
; GFX1013-NEXT:    v_mov_b32_e32 v10, 0x41000000
; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1013-NEXT:    v_add_co_u32 v2, s4, s4, v0
; GFX1013-NEXT:    v_add_co_ci_u32_e64 v3, s4, s5, 0, s4
; GFX1013-NEXT:    v_add_co_u32 v4, s4, s6, v0
; GFX1013-NEXT:    v_add_co_ci_u32_e64 v5, s4, s7, 0, s4
; GFX1013-NEXT:    flat_load_dword v0, v[2:3]
; GFX1013-NEXT:    flat_load_dword v1, v[4:5]
; GFX1013-NEXT:    v_mov_b32_e32 v2, 0
; GFX1013-NEXT:    v_mov_b32_e32 v3, 1.0
; GFX1013-NEXT:    v_mov_b32_e32 v4, 2.0
; GFX1013-NEXT:    v_mov_b32_e32 v5, 0x40400000
; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
; GFX1013-NEXT:    s_waitcnt vmcnt(0)
; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT:    s_endpgm
;
; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign:
; GFX1030:       ; %bb.0: ; %main_body
; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1030-NEXT:    v_mov_b32_e32 v10, 0x41000000
; GFX1030-NEXT:    v_mov_b32_e32 v9, 0x40e00000
; GFX1030-NEXT:    v_mov_b32_e32 v8, 0x40c00000
; GFX1030-NEXT:    v_mov_b32_e32 v7, 0x40a00000
; GFX1030-NEXT:    v_mov_b32_e32 v6, 4.0
; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x40400000
; GFX1030-NEXT:    v_mov_b32_e32 v4, 2.0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    v_add_co_u32 v0, s4, s4, v2
; GFX1030-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX1030-NEXT:    v_add_co_u32 v2, s4, s6, v2
; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, s7, 0, s4
; GFX1030-NEXT:    flat_load_dword v0, v[0:1]
; GFX1030-NEXT:    flat_load_dword v1, v[2:3]
; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
; GFX1030-NEXT:    v_mov_b32_e32 v3, 1.0
; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT:    s_endpgm
;
; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x34
; GFX11-NEXT:    v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0
; GFX11-NEXT:    v_mov_b32_e32 v8, 2.0
; GFX11-NEXT:    v_dual_mov_b32 v4, 4.0 :: v_dual_mov_b32 v7, 1.0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_add_co_u32 v0, s4, s4, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX11-NEXT:    v_add_co_u32 v2, s4, s6, v2
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s7, 0, s4
; GFX11-NEXT:    flat_load_b32 v9, v[0:1]
; GFX11-NEXT:    flat_load_b32 v10, v[2:3]
; GFX11-NEXT:    v_mov_b32_e32 v0, 0x40c00000
; GFX11-NEXT:    v_mov_b32_e32 v1, 0x40e00000
; GFX11-NEXT:    v_mov_b32_e32 v2, 0x41000000
; GFX11-NEXT:    v_mov_b32_e32 v3, 0x40400000
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v9, v10, v[6:8], v[3:5], v[0:2]], s[0:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
main_body:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid
  %node_ptr = load i32, i32* %gep_node_ptr, align 4
  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
  %ray_extent = load float, float* %gep_ray, align 4
  %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
  %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
  %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
  %ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0
  %ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1
  %ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2
  %ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0
  %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1
  %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, <4 x i32>* undef
  ret void
}

; Compute-kernel test: 32-bit node pointer with half-precision (a16) ray
; direction vectors. node_ptr and ray_extent are loaded from memory at an
; index given by the workitem id; the f32 origin is 0.0-2.0 and the half
; direction / inverse direction are 3.0-5.0 and 6.0-8.0. The intrinsic result
; is stored through an undef pointer (presumably only to keep the call live).
define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) {
; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
; GFX1013:       ; %bb.0: ; %main_body
; GFX1013-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1013-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1013-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1013-NEXT:    v_mov_b32_e32 v6, 0x46004500
; GFX1013-NEXT:    v_mov_b32_e32 v7, 0x48004700
; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1013-NEXT:    v_add_co_u32 v2, s4, s4, v0
; GFX1013-NEXT:    v_add_co_ci_u32_e64 v3, s4, s5, 0, s4
; GFX1013-NEXT:    v_add_co_u32 v4, s4, s6, v0
; GFX1013-NEXT:    v_add_co_ci_u32_e64 v5, s4, s7, 0, s4
; GFX1013-NEXT:    flat_load_dword v0, v[2:3]
; GFX1013-NEXT:    flat_load_dword v1, v[4:5]
; GFX1013-NEXT:    v_mov_b32_e32 v2, 0
; GFX1013-NEXT:    v_mov_b32_e32 v3, 1.0
; GFX1013-NEXT:    v_mov_b32_e32 v4, 2.0
; GFX1013-NEXT:    v_mov_b32_e32 v5, 0x44004200
; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16
; GFX1013-NEXT:    s_waitcnt vmcnt(0)
; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT:    s_endpgm
;
; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
; GFX1030:       ; %bb.0: ; %main_body
; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1030-NEXT:    v_mov_b32_e32 v4, 2.0
; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x44004200
; GFX1030-NEXT:    v_mov_b32_e32 v6, 0x46004500
; GFX1030-NEXT:    v_mov_b32_e32 v7, 0x48004700
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    v_add_co_u32 v0, s4, s4, v2
; GFX1030-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX1030-NEXT:    v_add_co_u32 v2, s4, s6, v2
; GFX1030-NEXT:    v_add_co_ci_u32_e64 v3, null, s7, 0, s4
; GFX1030-NEXT:    flat_load_dword v0, v[0:1]
; GFX1030-NEXT:    flat_load_dword v1, v[2:3]
; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
; GFX1030-NEXT:    v_mov_b32_e32 v3, 1.0
; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT:    s_endpgm
;
; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x34
; GFX11-NEXT:    v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v5, 2.0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_u32 v0, s4, s4, v2
; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX11-NEXT:    v_add_co_u32 v2, s4, s6, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, s7, 0, s4
; GFX11-NEXT:    flat_load_b32 v6, v[0:1]
; GFX11-NEXT:    flat_load_b32 v7, v[2:3]
; GFX11-NEXT:    v_mov_b32_e32 v1, 0x47004400
; GFX11-NEXT:    v_dual_mov_b32 v0, 0x46004200 :: v_dual_mov_b32 v3, 0
; GFX11-NEXT:    v_mov_b32_e32 v2, 0x48004500
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[0:3] a16
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
main_body:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid
  %node_ptr = load i32, i32* %gep_node_ptr, align 4
  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
  %ray_extent = load float, float* %gep_ray, align 4
  %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
  %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
  %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
  %ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0
  %ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1
  %ray_dir = insertelement <3 x half> %ray_dir1, half 5.0, i32 2
  %ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0
  %ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1
  %ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, <4 x i32>* undef
  ret void
}

; Compute-kernel test: 64-bit node pointer with f32 ray direction vectors.
; The node pointer is the constant 1111111111111 (0x102_B36211C7, which is why
; the expected output materialises 0xb36211c7 and 0x102); ray_extent is loaded
; from memory at an index given by the workitem id; the ray vectors are the
; constants 0.0-8.0. The intrinsic result is stored through an undef pointer
; (presumably only to keep the call live).
define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray, <4 x i32> inreg %tdescr) {
; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX1013:       ; %bb.0: ; %main_body
; GFX1013-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1013-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1013-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1013-NEXT:    v_mov_b32_e32 v3, 0
; GFX1013-NEXT:    v_mov_b32_e32 v4, 1.0
; GFX1013-NEXT:    v_mov_b32_e32 v5, 2.0
; GFX1013-NEXT:    v_mov_b32_e32 v6, 0x40400000
; GFX1013-NEXT:    v_mov_b32_e32 v7, 4.0
; GFX1013-NEXT:    v_mov_b32_e32 v8, 0x40a00000
; GFX1013-NEXT:    v_mov_b32_e32 v9, 0x40c00000
; GFX1013-NEXT:    v_mov_b32_e32 v10, 0x40e00000
; GFX1013-NEXT:    v_mov_b32_e32 v11, 0x41000000
; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1013-NEXT:    v_add_co_u32 v0, s4, s4, v0
; GFX1013-NEXT:    v_add_co_ci_u32_e64 v1, s4, s5, 0, s4
; GFX1013-NEXT:    flat_load_dword v2, v[0:1]
; GFX1013-NEXT:    v_mov_b32_e32 v0, 0xb36211c7
; GFX1013-NEXT:    v_mov_b32_e32 v1, 0x102
; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
; GFX1013-NEXT:    s_waitcnt vmcnt(0)
; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT:    s_endpgm
;
; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX1030:       ; %bb.0: ; %main_body
; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
; GFX1030-NEXT:    v_mov_b32_e32 v11, 0x41000000
; GFX1030-NEXT:    v_mov_b32_e32 v10, 0x40e00000
; GFX1030-NEXT:    v_mov_b32_e32 v9, 0x40c00000
; GFX1030-NEXT:    v_mov_b32_e32 v8, 0x40a00000
; GFX1030-NEXT:    v_mov_b32_e32 v7, 4.0
; GFX1030-NEXT:    v_mov_b32_e32 v6, 0x40400000
; GFX1030-NEXT:    v_mov_b32_e32 v5, 2.0
; GFX1030-NEXT:    v_mov_b32_e32 v4, 1.0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    v_add_co_u32 v0, s4, s4, v0
; GFX1030-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX1030-NEXT:    flat_load_dword v2, v[0:1]
; GFX1030-NEXT:    v_mov_b32_e32 v1, 0x102
; GFX1030-NEXT:    v_mov_b32_e32 v0, 0xb36211c7
; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT:    s_endpgm
;
; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x34
; GFX11-NEXT:    v_mov_b32_e32 v2, 0x41000000
; GFX11-NEXT:    v_dual_mov_b32 v3, 0x40400000 :: v_dual_mov_b32 v4, 4.0
; GFX11-NEXT:    v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0
; GFX11-NEXT:    v_dual_mov_b32 v8, 2.0 :: v_dual_mov_b32 v9, 0xb36211c7
; GFX11-NEXT:    v_dual_mov_b32 v10, 0x102 :: v_dual_mov_b32 v7, 1.0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_add_co_u32 v0, s4, s4, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX11-NEXT:    flat_load_b32 v11, v[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v0, 0x40c00000
; GFX11-NEXT:    v_mov_b32_e32 v1, 0x40e00000
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[6:8], v[3:5], v[0:2]], s[0:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
main_body:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
  %ray_extent = load float, float* %gep_ray, align 4
  %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
  %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
  %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
  %ray_dir0 = insertelement <3 x float> undef, float 3.0, i32 0
  %ray_dir1 = insertelement <3 x float> %ray_dir0, float 4.0, i32 1
  %ray_dir = insertelement <3 x float> %ray_dir1, float 5.0, i32 2
  %ray_inv_dir0 = insertelement <3 x float> undef, float 6.0, i32 0
  %ray_inv_dir1 = insertelement <3 x float> %ray_inv_dir0, float 7.0, i32 1
  %ray_inv_dir = insertelement <3 x float> %ray_inv_dir1, float 8.0, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <3 x float> %ray_origin, <3 x float> %ray_dir, <3 x float> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, <4 x i32>* undef
  ret void
}

; Compute-kernel test: 64-bit node pointer with half-precision (a16) ray
; direction vectors. The node pointer is the constant 1111111111110
; (0x102_B36211C6, which is why the expected output materialises 0xb36211c6
; and 0x102); ray_extent is loaded from memory at an index given by the
; workitem id; the f32 origin is 0.0-2.0 and the half direction / inverse
; direction are 3.0-5.0 and 6.0-8.0. The intrinsic result is stored through an
; undef pointer (presumably only to keep the call live).
define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_ray, <4 x i32> inreg %tdescr) {
; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX1013:       ; %bb.0: ; %main_body
; GFX1013-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1013-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1013-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1013-NEXT:    v_mov_b32_e32 v3, 0
; GFX1013-NEXT:    v_mov_b32_e32 v4, 1.0
; GFX1013-NEXT:    v_mov_b32_e32 v5, 2.0
; GFX1013-NEXT:    v_mov_b32_e32 v6, 0x44004200
; GFX1013-NEXT:    v_mov_b32_e32 v7, 0x46004500
; GFX1013-NEXT:    v_mov_b32_e32 v8, 0x48004700
; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1013-NEXT:    v_add_co_u32 v0, s4, s4, v0
; GFX1013-NEXT:    v_add_co_ci_u32_e64 v1, s4, s5, 0, s4
; GFX1013-NEXT:    flat_load_dword v2, v[0:1]
; GFX1013-NEXT:    v_mov_b32_e32 v0, 0xb36211c6
; GFX1013-NEXT:    v_mov_b32_e32 v1, 0x102
; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
; GFX1013-NEXT:    s_waitcnt vmcnt(0)
; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT:    s_endpgm
;
; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX1030:       ; %bb.0: ; %main_body
; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
; GFX1030-NEXT:    v_mov_b32_e32 v5, 2.0
; GFX1030-NEXT:    v_mov_b32_e32 v4, 1.0
; GFX1030-NEXT:    v_mov_b32_e32 v6, 0x44004200
; GFX1030-NEXT:    v_mov_b32_e32 v7, 0x46004500
; GFX1030-NEXT:    v_mov_b32_e32 v8, 0x48004700
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    v_add_co_u32 v0, s4, s4, v0
; GFX1030-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX1030-NEXT:    flat_load_dword v2, v[0:1]
; GFX1030-NEXT:    v_mov_b32_e32 v1, 0x102
; GFX1030-NEXT:    v_mov_b32_e32 v0, 0xb36211c6
; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT:    s_endpgm
;
; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x34
; GFX11-NEXT:    v_dual_mov_b32 v2, 0x48004500 :: v_dual_mov_b32 v5, 2.0
; GFX11-NEXT:    v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v7, 0x102
; GFX11-NEXT:    v_dual_mov_b32 v6, 0xb36211c6 :: v_dual_mov_b32 v3, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_add_co_u32 v0, s4, s4, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s4
; GFX11-NEXT:    flat_load_b32 v8, v[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v0, 0x46004200
; GFX11-NEXT:    v_mov_b32_e32 v1, 0x47004400
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[3:5], v[0:2]], s[0:3] a16
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
main_body:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
  %ray_extent = load float, float* %gep_ray, align 4
  %ray_origin0 = insertelement <3 x float> undef, float 0.0, i32 0
  %ray_origin1 = insertelement <3 x float> %ray_origin0, float 1.0, i32 1
  %ray_origin = insertelement <3 x float> %ray_origin1, float 2.0, i32 2
  %ray_dir0 = insertelement <3 x half> undef, half 3.0, i32 0
  %ray_dir1 = insertelement <3 x half> %ray_dir0, half 4.0, i32 1
  %ray_dir = insertelement <3 x half> %ray_dir1, half 5.0, i32 2
  %ray_inv_dir0 = insertelement <3 x half> undef, half 6.0, i32 0
  %ray_inv_dir1 = insertelement <3 x half> %ray_inv_dir0, half 7.0, i32 1
  %ray_inv_dir = insertelement <3 x half> %ray_inv_dir1, half 8.0, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, <4 x i32>* undef
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()
