1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
3; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GISEL %s
4
; Intrinsic returning an i32. Each kernel in this file that calls it
; sign-extends the result to i64 and uses it as the element index into its two
; input pointers.
5declare i32 @llvm.amdgcn.workitem.id.x() #1
6
; Packs two independently computed half values into one 32-bit register.
; Each input is loaded from its own pointer at the workitem-id index, has 2.0
; added, and is inserted into a <2 x half> that is bitcast to i32 and handed to
; inline asm. The assertions under the GCN prefix expect a single
; v_pack_b32_f16; the assertions under the GISEL prefix expect an SDWA add
; writing WORD_1 followed by v_and_or_b32.
7define amdgpu_kernel void @v_pack_b32_v2f16(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
8; GCN-LABEL: v_pack_b32_v2f16:
9; GCN:       ; %bb.0:
10; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
11; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
12; GCN-NEXT:    s_waitcnt lgkmcnt(0)
13; GCN-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
14; GCN-NEXT:    s_waitcnt vmcnt(0)
15; GCN-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
16; GCN-NEXT:    s_waitcnt vmcnt(0)
17; GCN-NEXT:    v_add_f16_e32 v0, 2.0, v1
18; GCN-NEXT:    v_add_f16_e32 v1, 2.0, v2
19; GCN-NEXT:    v_pack_b32_f16 v0, v0, v1
20; GCN-NEXT:    ;;#ASMSTART
21; GCN-NEXT:    ; use v0
22; GCN-NEXT:    ;;#ASMEND
23; GCN-NEXT:    s_endpgm
24;
25; GISEL-LABEL: v_pack_b32_v2f16:
26; GISEL:       ; %bb.0:
27; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
28; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
29; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
30; GISEL-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
31; GISEL-NEXT:    s_waitcnt vmcnt(0)
32; GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
33; GISEL-NEXT:    s_waitcnt vmcnt(0)
34; GISEL-NEXT:    s_waitcnt_depctr 0xffe3
35; GISEL-NEXT:    s_movk_i32 s0, 0x4000
36; GISEL-NEXT:    v_add_f16_e32 v0, 2.0, v1
37; GISEL-NEXT:    v_add_f16_sdwa v1, v2, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
38; GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
39; GISEL-NEXT:    ;;#ASMSTART
40; GISEL-NEXT:    ; use v0
41; GISEL-NEXT:    ;;#ASMEND
42; GISEL-NEXT:    s_endpgm
  ; Element index: the workitem id, sign-extended to the i64 GEP index type.
43  %tid = call i32 @llvm.amdgcn.workitem.id.x()
44  %tid.ext = sext i32 %tid to i64
45  %in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
46  %in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
  ; NOTE(review): both loads are volatile, presumably so each stays a separate
  ; 16-bit access in program order - confirm.
47  %v0 = load volatile half, half addrspace(1)* %in0.gep
48  %v1 = load volatile half, half addrspace(1)* %in1.gep
49  %v0.add = fadd half %v0, 2.0
50  %v1.add = fadd half %v1, 2.0
  ; Element 0 <- %v0.add, element 1 <- %v1.add; the bitcast exposes the pair as
  ; a single i32.
51  %vec.0 = insertelement <2 x half> undef, half %v0.add, i32 0
52  %vec.1 = insertelement <2 x half> %vec.0, half %v1.add, i32 1
53  %vec.i32 = bitcast <2 x half> %vec.1 to i32
  ; The side-effecting inline asm takes the packed i32 as a "v"-constrained
  ; input.
54  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
55  ret void
56}
57
; Variant of the two-half pack test in which element 0 is produced by
; `fsub half %v0, 2.0` while element 1 is still produced by an fadd of 2.0.
; The assertions under the GCN prefix expect v_subrev_f16 for element 0 and a
; v_pack_b32_f16; the assertions under the GISEL prefix expect the subtraction
; to appear as v_add_f16 with -2.0, then an SDWA add writing WORD_1 and a
; v_and_or_b32.
58define amdgpu_kernel void @v_pack_b32_v2f16_sub(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
59; GCN-LABEL: v_pack_b32_v2f16_sub:
60; GCN:       ; %bb.0:
61; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
62; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
63; GCN-NEXT:    s_waitcnt lgkmcnt(0)
64; GCN-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
65; GCN-NEXT:    s_waitcnt vmcnt(0)
66; GCN-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
67; GCN-NEXT:    s_waitcnt vmcnt(0)
68; GCN-NEXT:    v_subrev_f16_e32 v0, 2.0, v1
69; GCN-NEXT:    v_add_f16_e32 v1, 2.0, v2
70; GCN-NEXT:    v_pack_b32_f16 v0, v0, v1
71; GCN-NEXT:    ;;#ASMSTART
72; GCN-NEXT:    ; use v0
73; GCN-NEXT:    ;;#ASMEND
74; GCN-NEXT:    s_endpgm
75;
76; GISEL-LABEL: v_pack_b32_v2f16_sub:
77; GISEL:       ; %bb.0:
78; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
79; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
80; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
81; GISEL-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
82; GISEL-NEXT:    s_waitcnt vmcnt(0)
83; GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
84; GISEL-NEXT:    s_waitcnt vmcnt(0)
85; GISEL-NEXT:    v_mov_b32_e32 v0, 0x4000
86; GISEL-NEXT:    v_add_f16_e32 v1, -2.0, v1
87; GISEL-NEXT:    v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
88; GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v1, v0
89; GISEL-NEXT:    ;;#ASMSTART
90; GISEL-NEXT:    ; use v0
91; GISEL-NEXT:    ;;#ASMEND
92; GISEL-NEXT:    s_endpgm
  ; Element index: the workitem id, sign-extended to the i64 GEP index type.
93  %tid = call i32 @llvm.amdgcn.workitem.id.x()
94  %tid.ext = sext i32 %tid to i64
95  %in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
96  %in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
97  %v0 = load volatile half, half addrspace(1)* %in0.gep
98  %v1 = load volatile half, half addrspace(1)* %in1.gep
  ; Despite its name, %v0.add is the result of an fsub here; only %v1.add is an
  ; fadd.
99  %v0.add = fsub half %v0, 2.0
100  %v1.add = fadd half %v1, 2.0
  ; Element 0 <- %v0.add, element 1 <- %v1.add; the bitcast exposes the pair as
  ; a single i32 for the "v"-constrained inline asm input.
101  %vec.0 = insertelement <2 x half> undef, half %v0.add, i32 0
102  %vec.1 = insertelement <2 x half> %vec.0, half %v1.add, i32 1
103  %vec.i32 = bitcast <2 x half> %vec.1 to i32
104  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
105  ret void
106}
107
; Truncates a <2 x float> loaded from %a to <2 x half> and stores it to %r.
; Unlike the other kernels in this file, the definition carries no attribute
; group and the loads/stores are not volatile. The assertions under the GCN
; prefix expect two v_cvt_f16_f32 conversions combined by v_pack_b32_f16; the
; assertions under the GISEL prefix expect one plain and one SDWA (WORD_1)
; v_cvt_f16_f32 combined by v_and_or_b32.
108define amdgpu_kernel void @fptrunc(
109; GCN-LABEL: fptrunc:
110; GCN:       ; %bb.0:
111; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
112; GCN-NEXT:    s_mov_b32 s6, -1
113; GCN-NEXT:    s_mov_b32 s7, 0x31016000
114; GCN-NEXT:    s_mov_b32 s10, s6
115; GCN-NEXT:    s_mov_b32 s11, s7
116; GCN-NEXT:    s_waitcnt lgkmcnt(0)
117; GCN-NEXT:    s_mov_b32 s8, s2
118; GCN-NEXT:    s_mov_b32 s9, s3
119; GCN-NEXT:    s_mov_b32 s4, s0
120; GCN-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
121; GCN-NEXT:    s_mov_b32 s5, s1
122; GCN-NEXT:    s_waitcnt vmcnt(0)
123; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
124; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
125; GCN-NEXT:    v_pack_b32_f16 v0, v0, v1
126; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
127; GCN-NEXT:    s_endpgm
128;
129; GISEL-LABEL: fptrunc:
130; GISEL:       ; %bb.0:
131; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
132; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
133; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
134; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
135; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s2
136; GISEL-NEXT:    v_cvt_f16_f32_sdwa v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
137; GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
138; GISEL-NEXT:    v_mov_b32_e32 v1, 0
139; GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
140; GISEL-NEXT:    s_endpgm
141    <2 x half> addrspace(1)* %r,
142    <2 x float> addrspace(1)* %a) {
  ; Load both floats, narrow the whole vector with one fptrunc, and store the
  ; <2 x half> result through %r.
143  %a.val = load <2 x float>, <2 x float> addrspace(1)* %a
144  %r.val = fptrunc <2 x float> %a.val to <2 x half>
145  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
146  ret void
147}
148
; Two-half pack test in which each element passes through @llvm.fabs.f16
; after the fadd of 2.0 and before being inserted into the <2 x half>.
; The assertions under the GCN prefix expect the absolute values to appear as
; |v0|, |v1| operands of v_pack_b32_f16; the assertions under the GISEL prefix
; expect explicit v_and_b32 instructions with a 0x7fff mask, followed by
; v_and_or_b32.
149define amdgpu_kernel void @v_pack_b32.fabs(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
150; GCN-LABEL: v_pack_b32.fabs:
151; GCN:       ; %bb.0:
152; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
153; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
154; GCN-NEXT:    s_waitcnt lgkmcnt(0)
155; GCN-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
156; GCN-NEXT:    s_waitcnt vmcnt(0)
157; GCN-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
158; GCN-NEXT:    s_waitcnt vmcnt(0)
159; GCN-NEXT:    v_add_f16_e32 v0, 2.0, v1
160; GCN-NEXT:    v_add_f16_e32 v1, 2.0, v2
161; GCN-NEXT:    v_pack_b32_f16 v0, |v0|, |v1|
162; GCN-NEXT:    ;;#ASMSTART
163; GCN-NEXT:    ; use v0
164; GCN-NEXT:    ;;#ASMEND
165; GCN-NEXT:    s_endpgm
166;
167; GISEL-LABEL: v_pack_b32.fabs:
168; GISEL:       ; %bb.0:
169; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
170; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
171; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
172; GISEL-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
173; GISEL-NEXT:    s_waitcnt vmcnt(0)
174; GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
175; GISEL-NEXT:    s_waitcnt vmcnt(0)
176; GISEL-NEXT:    v_mov_b32_e32 v0, 0x7fff
177; GISEL-NEXT:    v_add_f16_e32 v1, 2.0, v1
178; GISEL-NEXT:    v_add_f16_e32 v2, 2.0, v2
179; GISEL-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
180; GISEL-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
181; GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v1, v0
182; GISEL-NEXT:    ;;#ASMSTART
183; GISEL-NEXT:    ; use v0
184; GISEL-NEXT:    ;;#ASMEND
185; GISEL-NEXT:    s_endpgm
  ; Element index: the workitem id, sign-extended to the i64 GEP index type.
186  %tid = call i32 @llvm.amdgcn.workitem.id.x()
187  %tid.ext = sext i32 %tid to i64
188  %in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
189  %in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
190  %v0 = load volatile half, half addrspace(1)* %in0.gep
191  %v1 = load volatile half, half addrspace(1)* %in1.gep
192  %v0.add = fadd half %v0, 2.0
193  %v1.add = fadd half %v1, 2.0
  ; fabs is applied to the fadd results, so it sits directly between the
  ; arithmetic and the vector build.
194  %v0.fabs = call half @llvm.fabs.f16(half %v0.add)
195  %v1.fabs = call half @llvm.fabs.f16(half %v1.add)
  ; Element 0 <- %v0.fabs, element 1 <- %v1.fabs; the bitcast exposes the pair
  ; as a single i32 for the "v"-constrained inline asm input.
196  %vec.0 = insertelement <2 x half> undef, half %v0.fabs, i32 0
197  %vec.1 = insertelement <2 x half> %vec.0, half %v1.fabs, i32 1
198  %vec.i32 = bitcast <2 x half> %vec.1 to i32
199  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
200  ret void
201}
202
; Two-half pack test in which each element is negated after the fadd of 2.0,
; with the negation written as `fsub half -0.0, x` rather than an fneg
; instruction. The assertions under the GCN prefix expect the negations to
; appear as -v0, -v1 operands of v_pack_b32_f16; the assertions under the GISEL
; prefix expect separate v_add_f16 instructions (one SDWA, writing WORD_1) that
; combine the constant 0x8000 with a negated source, followed by v_and_or_b32.
203define amdgpu_kernel void @v_pack_b32.fneg(half addrspace(1)* %in0, half addrspace(1)* %in1) #0 {
204; GCN-LABEL: v_pack_b32.fneg:
205; GCN:       ; %bb.0:
206; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
207; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
208; GCN-NEXT:    s_waitcnt lgkmcnt(0)
209; GCN-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
210; GCN-NEXT:    s_waitcnt vmcnt(0)
211; GCN-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
212; GCN-NEXT:    s_waitcnt vmcnt(0)
213; GCN-NEXT:    v_add_f16_e32 v0, 2.0, v1
214; GCN-NEXT:    v_add_f16_e32 v1, 2.0, v2
215; GCN-NEXT:    v_pack_b32_f16 v0, -v0, -v1
216; GCN-NEXT:    ;;#ASMSTART
217; GCN-NEXT:    ; use v0
218; GCN-NEXT:    ;;#ASMEND
219; GCN-NEXT:    s_endpgm
220;
221; GISEL-LABEL: v_pack_b32.fneg:
222; GISEL:       ; %bb.0:
223; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
224; GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
225; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
226; GISEL-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
227; GISEL-NEXT:    s_waitcnt vmcnt(0)
228; GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
229; GISEL-NEXT:    s_waitcnt vmcnt(0)
230; GISEL-NEXT:    s_waitcnt_depctr 0xffe3
231; GISEL-NEXT:    s_mov_b32 s0, 0x8000
232; GISEL-NEXT:    v_add_f16_e32 v0, 2.0, v1
233; GISEL-NEXT:    v_add_f16_e32 v1, 2.0, v2
234; GISEL-NEXT:    v_add_f16_e64 v0, 0x8000, -v0
235; GISEL-NEXT:    v_add_f16_sdwa v1, s0, -v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
236; GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
237; GISEL-NEXT:    ;;#ASMSTART
238; GISEL-NEXT:    ; use v0
239; GISEL-NEXT:    ;;#ASMEND
240; GISEL-NEXT:    s_endpgm
  ; Element index: the workitem id, sign-extended to the i64 GEP index type.
241  %tid = call i32 @llvm.amdgcn.workitem.id.x()
242  %tid.ext = sext i32 %tid to i64
243  %in0.gep = getelementptr inbounds half, half addrspace(1)* %in0, i64 %tid.ext
244  %in1.gep = getelementptr inbounds half, half addrspace(1)* %in1, i64 %tid.ext
245  %v0 = load volatile half, half addrspace(1)* %in0.gep
246  %v1 = load volatile half, half addrspace(1)* %in1.gep
247  %v0.add = fadd half %v0, 2.0
248  %v1.add = fadd half %v1, 2.0
  ; Negation spelled as a subtraction from -0.0.
249  %v0.fneg = fsub half -0.0, %v0.add
250  %v1.fneg = fsub half -0.0, %v1.add
  ; Element 0 <- %v0.fneg, element 1 <- %v1.fneg; the bitcast exposes the pair
  ; as a single i32 for the "v"-constrained inline asm input.
251  %vec.0 = insertelement <2 x half> undef, half %v0.fneg, i32 0
252  %vec.1 = insertelement <2 x half> %vec.0, half %v1.fneg, i32 1
253  %vec.i32 = bitcast <2 x half> %vec.1 to i32
254  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
255  ret void
256}
257
; Half-precision fabs intrinsic; its only caller in this file is
; @v_pack_b32.fabs.
258declare half @llvm.fabs.f16(half) #1
259
; Attribute group #0 is referenced by the kernel definitions that carry it and
; by the inline asm call sites; #1 is referenced by the two intrinsic
; declarations.
260attributes #0 = { nounwind }
261attributes #1 = { nounwind readnone }
262
263