1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-NODL %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
8
; idot4_acc32: signed 4 x i8 dot product accumulated into an i32.
; Each work-item (indexed by llvm.amdgcn.workitem.id.x) loads one <4 x i8> from
; %src1 and one from %src2, sign-extends every lane to i32, multiplies the
; vectors lane-wise, adds the four products to the i32 read from %dst, and
; stores the sum back to %dst.
; The gfx906 and gfx1011/gfx1012 runs (GFX9-DL / GFX10-DL checks) expect the
; whole computation to fold into one v_dot4_i32_i8; the other runs expect
; separate 24-bit multiplies / multiply-adds.
9define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1,
10; GFX7-LABEL: idot4_acc32:
11; GFX7:       ; %bb.0: ; %entry
12; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
13; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
14; GFX7-NEXT:    s_mov_b32 s3, 0xf000
15; GFX7-NEXT:    s_mov_b32 s10, 0
16; GFX7-NEXT:    s_mov_b32 s11, s3
17; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
18; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
19; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
20; GFX7-NEXT:    v_mov_b32_e32 v1, 0
21; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
22; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
23; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
24; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
25; GFX7-NEXT:    s_mov_b32 s2, -1
26; GFX7-NEXT:    s_waitcnt vmcnt(1)
27; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
28; GFX7-NEXT:    v_bfe_i32 v3, v2, 8, 8
29; GFX7-NEXT:    s_waitcnt vmcnt(0)
30; GFX7-NEXT:    v_bfe_i32 v5, v0, 0, 8
31; GFX7-NEXT:    v_bfe_i32 v6, v0, 8, 8
32; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
33; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v5, s4
34; GFX7-NEXT:    v_bfe_i32 v4, v2, 16, 8
35; GFX7-NEXT:    v_bfe_i32 v7, v0, 16, 8
36; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v6, v1
37; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
38; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
39; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v7, v1
40; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
41; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
42; GFX7-NEXT:    s_endpgm
43;
44; GFX8-LABEL: idot4_acc32:
45; GFX8:       ; %bb.0: ; %entry
46; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
47; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
48; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
49; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
50; GFX8-NEXT:    v_mov_b32_e32 v1, s5
51; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
52; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
53; GFX8-NEXT:    flat_load_dword v3, v[0:1]
54; GFX8-NEXT:    v_mov_b32_e32 v1, s7
55; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
56; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
57; GFX8-NEXT:    flat_load_dword v0, v[0:1]
58; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
59; GFX8-NEXT:    s_waitcnt vmcnt(1)
60; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
61; GFX8-NEXT:    v_bfe_i32 v4, v3, 8, 8
62; GFX8-NEXT:    v_bfe_i32 v6, v3, 16, 8
63; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 24, v3
64; GFX8-NEXT:    s_waitcnt vmcnt(0)
65; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 8
66; GFX8-NEXT:    v_bfe_i32 v5, v0, 8, 8
67; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, s2
69; GFX8-NEXT:    v_bfe_i32 v7, v0, 16, 8
70; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
71; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
72; GFX8-NEXT:    v_mad_i32_i24 v1, v6, v7, v1
73; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
74; GFX8-NEXT:    v_mov_b32_e32 v0, s0
75; GFX8-NEXT:    v_mov_b32_e32 v1, s1
76; GFX8-NEXT:    flat_store_dword v[0:1], v2
77; GFX8-NEXT:    s_endpgm
78;
79; GFX9-NODL-LABEL: idot4_acc32:
80; GFX9-NODL:       ; %bb.0: ; %entry
81; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
82; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
83; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
84; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
85; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
86; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
87; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
88; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
89; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
90; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
91; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
92; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
93; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
94; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
95; GFX9-NODL-NEXT:    v_add3_u32 v2, v3, s0, v4
96; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
97; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
98; GFX9-NODL-NEXT:    s_endpgm
99;
100; GFX9-DL-LABEL: idot4_acc32:
101; GFX9-DL:       ; %bb.0: ; %entry
102; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
103; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
104; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
105; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
106; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
107; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
108; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
109; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
110; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
111; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v1, v2, s0
112; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
113; GFX9-DL-NEXT:    s_endpgm
114;
115; GFX10-DL-LABEL: idot4_acc32:
116; GFX10-DL:       ; %bb.0: ; %entry
117; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
118; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
119; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
120; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
121; GFX10-DL-NEXT:    s_clause 0x1
122; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
123; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
124; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
125; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
126; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
127; GFX10-DL-NEXT:    v_dot4_i32_i8 v1, v1, v2, s2
128; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
129; GFX10-DL-NEXT:    s_endpgm
130                                       <4 x i8> addrspace(1)* %src2,
131                                       i32 addrspace(1)* nocapture %dst) {
132entry:
  ; Per-work-item inputs: element %idx of each <4 x i8> source array.
133  %idx = call i32 @llvm.amdgcn.workitem.id.x()
134  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
135  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
136  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
137  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
138
  ; Lane products: sign-extend lane k of both vectors to i32, then multiply.
  ; NOTE(review): the multiplies carry nuw although both operands are
  ; sign-extended - confirm this flag is intended.
139  %v1e0 = extractelement <4 x i8> %vec1, i64 0
140  %cv1e0 = sext i8 %v1e0 to i32
141  %v2e0 = extractelement <4 x i8> %vec2, i64 0
142  %cv2e0 = sext i8 %v2e0 to i32
143  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
144
145  %v1e1 = extractelement <4 x i8> %vec1, i64 1
146  %cv1e1 = sext i8 %v1e1 to i32
147  %v2e1 = extractelement <4 x i8> %vec2, i64 1
148  %cv2e1 = sext i8 %v2e1 to i32
149  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
150
151  %v1e2 = extractelement <4 x i8> %vec1, i64 2
152  %cv1e2 = sext i8 %v1e2 to i32
153  %v2e2 = extractelement <4 x i8> %vec2, i64 2
154  %cv2e2 = sext i8 %v2e2 to i32
155  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
156
157  %v1e3 = extractelement <4 x i8> %vec1, i64 3
158  %cv1e3 = sext i8 %v1e3 to i32
159  %v2e3 = extractelement <4 x i8> %vec2, i64 3
160  %cv2e3 = sext i8 %v2e3 to i32
161  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
162
  ; Accumulate: start from the i32 already stored at %dst, add the four
  ; products in lane order, and write the result back to %dst.
163  %acc = load i32, i32 addrspace(1)* %dst, align 4
164  %add1 = add i32 %mul1, %acc
165  %add2 = add i32 %add1, %mul2
166  %add3 = add i32 %add2, %mul3
167  %add4 = add i32 %add3, %mul4
168  store i32 %add4, i32 addrspace(1)* %dst, align 4
169  ret void
170}
171
172; TODO: Currently, vector elements {0 and 3} get zero-extended from i16 to i32 when they should
173; be sign-extended directly to i32; this prevents the pattern recognizer from recognizing this pattern.
; idot4_acc16: signed 4 x i8 dot product accumulated into an i16.
; Each work-item (indexed by llvm.amdgcn.workitem.id.x) loads one <4 x i8> from
; %src1 and one from %src2, sign-extends every lane to i16, multiplies the
; vectors lane-wise in i16, adds the four products to the i16 read from %dst,
; and stores the sum back to %dst.
; None of the check blocks below expects a dot4 instruction for this kernel:
; every run expects per-byte extraction followed by a chain of multiply-adds
; and a 16-bit store.
174define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
175; GFX7-LABEL: idot4_acc16:
176; GFX7:       ; %bb.0: ; %entry
177; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
178; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
179; GFX7-NEXT:    s_mov_b32 s3, 0xf000
180; GFX7-NEXT:    s_mov_b32 s10, 0
181; GFX7-NEXT:    s_mov_b32 s11, s3
182; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
183; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
184; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
185; GFX7-NEXT:    v_mov_b32_e32 v1, 0
186; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
187; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
188; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
189; GFX7-NEXT:    s_mov_b32 s2, -1
190; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
191; GFX7-NEXT:    s_mov_b32 s4, 0xffff
192; GFX7-NEXT:    s_waitcnt vmcnt(2)
193; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 8
194; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 8
195; GFX7-NEXT:    s_waitcnt vmcnt(1)
196; GFX7-NEXT:    v_bfe_i32 v6, v0, 0, 8
197; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
198; GFX7-NEXT:    v_bfe_i32 v7, v0, 8, 8
199; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
200; GFX7-NEXT:    v_bfe_i32 v5, v2, 16, 8
201; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
202; GFX7-NEXT:    v_bfe_i32 v8, v0, 16, 8
203; GFX7-NEXT:    v_and_b32_e32 v7, s4, v7
204; GFX7-NEXT:    s_waitcnt vmcnt(0)
205; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
206; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
207; GFX7-NEXT:    v_and_b32_e32 v5, s4, v5
208; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
209; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
210; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
211; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
212; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
213; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
214; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
215; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
216; GFX7-NEXT:    s_endpgm
217;
218; GFX8-LABEL: idot4_acc16:
219; GFX8:       ; %bb.0: ; %entry
220; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
221; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
222; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
223; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
224; GFX8-NEXT:    v_mov_b32_e32 v1, s5
225; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
226; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
227; GFX8-NEXT:    flat_load_dword v3, v[0:1]
228; GFX8-NEXT:    v_mov_b32_e32 v1, s7
229; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
230; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
231; GFX8-NEXT:    flat_load_dword v2, v[0:1]
232; GFX8-NEXT:    v_mov_b32_e32 v0, s0
233; GFX8-NEXT:    v_mov_b32_e32 v1, s1
234; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
235; GFX8-NEXT:    s_waitcnt vmcnt(2)
236; GFX8-NEXT:    v_bfe_i32 v7, v3, 0, 8
237; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
238; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
239; GFX8-NEXT:    v_bfe_i32 v9, v9, 0, 8
240; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
241; GFX8-NEXT:    v_bfe_i32 v5, v5, 0, 8
242; GFX8-NEXT:    s_waitcnt vmcnt(1)
243; GFX8-NEXT:    v_bfe_i32 v8, v2, 0, 8
244; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v2
245; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
246; GFX8-NEXT:    v_bfe_i32 v10, v10, 0, 8
247; GFX8-NEXT:    s_waitcnt vmcnt(0)
248; GFX8-NEXT:    v_mad_u16 v4, v7, v8, v4
249; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
250; GFX8-NEXT:    v_bfe_i32 v6, v6, 0, 8
251; GFX8-NEXT:    v_mad_u16 v4, v9, v10, v4
252; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 8
253; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 8
254; GFX8-NEXT:    v_mad_u16 v4, v5, v6, v4
255; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
256; GFX8-NEXT:    flat_store_short v[0:1], v2
257; GFX8-NEXT:    s_endpgm
258;
259; GFX9-NODL-LABEL: idot4_acc16:
260; GFX9-NODL:       ; %bb.0: ; %entry
261; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
262; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
263; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
264; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
265; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
266; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
267; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
268; GFX9-NODL-NEXT:    global_load_ushort v3, v0, s[2:3]
269; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
270; GFX9-NODL-NEXT:    v_bfe_i32 v6, v1, 0, 8
271; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
272; GFX9-NODL-NEXT:    v_bfe_i32 v7, v2, 0, 8
273; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
274; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
275; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
276; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
277; GFX9-NODL-NEXT:    v_bfe_i32 v8, v8, 0, 8
278; GFX9-NODL-NEXT:    v_bfe_i32 v9, v9, 0, 8
279; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
280; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v6, v7, v3
281; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
282; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
283; GFX9-NODL-NEXT:    v_bfe_i32 v4, v4, 0, 8
284; GFX9-NODL-NEXT:    v_bfe_i32 v5, v5, 0, 8
285; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v8, v9, v3
286; GFX9-NODL-NEXT:    v_bfe_i32 v1, v1, 0, 8
287; GFX9-NODL-NEXT:    v_bfe_i32 v2, v2, 0, 8
288; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
289; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
290; GFX9-NODL-NEXT:    global_store_short v0, v1, s[2:3]
291; GFX9-NODL-NEXT:    s_endpgm
292;
293; GFX9-DL-LABEL: idot4_acc16:
294; GFX9-DL:       ; %bb.0: ; %entry
295; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
296; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
297; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
298; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
299; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
300; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
301; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
302; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
303; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
304; GFX9-DL-NEXT:    v_bfe_i32 v6, v1, 0, 8
305; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
306; GFX9-DL-NEXT:    v_bfe_i32 v7, v2, 0, 8
307; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
308; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
309; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
310; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
311; GFX9-DL-NEXT:    v_bfe_i32 v8, v8, 0, 8
312; GFX9-DL-NEXT:    v_bfe_i32 v9, v9, 0, 8
313; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
314; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v6, v7, v3
315; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
316; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
317; GFX9-DL-NEXT:    v_bfe_i32 v4, v4, 0, 8
318; GFX9-DL-NEXT:    v_bfe_i32 v5, v5, 0, 8
319; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v8, v9, v3
320; GFX9-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
321; GFX9-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
322; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
323; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
324; GFX9-DL-NEXT:    global_store_short v0, v1, s[2:3]
325; GFX9-DL-NEXT:    s_endpgm
326;
327; GFX10-DL-LABEL: idot4_acc16:
328; GFX10-DL:       ; %bb.0: ; %entry
329; GFX10-DL-NEXT:    s_clause 0x1
330; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
331; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
332; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
333; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
334; GFX10-DL-NEXT:    s_clause 0x1
335; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
336; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
337; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
338; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
339; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
340; GFX10-DL-NEXT:    v_bfe_i32 v4, v1, 0, 8
341; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
342; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
343; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
344; GFX10-DL-NEXT:    v_bfe_i32 v7, v2, 0, 8
345; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
346; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
347; GFX10-DL-NEXT:    v_bfe_i32 v5, v5, 0, 8
348; GFX10-DL-NEXT:    v_bfe_i32 v6, v6, 0, 8
349; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
350; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
351; GFX10-DL-NEXT:    v_bfe_i32 v4, v8, 0, 8
352; GFX10-DL-NEXT:    v_bfe_i32 v7, v9, 0, 8
353; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
354; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
355; GFX10-DL-NEXT:    v_mad_u16 v3, v5, v6, v3
356; GFX10-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
357; GFX10-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
358; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
359; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
360; GFX10-DL-NEXT:    global_store_short v0, v1, s[2:3]
361; GFX10-DL-NEXT:    s_endpgm
362                                       <4 x i8> addrspace(1)* %src2,
363                                       i16 addrspace(1)* nocapture %dst) {
364entry:
  ; Per-work-item inputs: element %idx of each <4 x i8> source array.
365  %idx = call i32 @llvm.amdgcn.workitem.id.x()
366  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
367  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
368  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
369  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
370
  ; Lane products: sign-extend lane k of both vectors to i16 (not i32), then
  ; multiply in i16 with nsw.
371  %v1e0 = extractelement <4 x i8> %vec1, i64 0
372  %cv1e0 = sext i8 %v1e0 to i16
373  %v2e0 = extractelement <4 x i8> %vec2, i64 0
374  %cv2e0 = sext i8 %v2e0 to i16
375  %mul1 = mul nsw i16 %cv1e0, %cv2e0
376
377  %v1e1 = extractelement <4 x i8> %vec1, i64 1
378  %cv1e1 = sext i8 %v1e1 to i16
379  %v2e1 = extractelement <4 x i8> %vec2, i64 1
380  %cv2e1 = sext i8 %v2e1 to i16
381  %mul2 = mul nsw i16 %cv1e1, %cv2e1
382
383  %v1e2 = extractelement <4 x i8> %vec1, i64 2
384  %cv1e2 = sext i8 %v1e2 to i16
385  %v2e2 = extractelement <4 x i8> %vec2, i64 2
386  %cv2e2 = sext i8 %v2e2 to i16
387  %mul3 = mul nsw i16 %cv1e2, %cv2e2
388
389  %v1e3 = extractelement <4 x i8> %vec1, i64 3
390  %cv1e3 = sext i8 %v1e3 to i16
391  %v2e3 = extractelement <4 x i8> %vec2, i64 3
392  %cv2e3 = sext i8 %v2e3 to i16
393  %mul4 = mul nsw i16 %cv1e3, %cv2e3
394
  ; Accumulate: start from the i16 already stored at %dst, add the four
  ; products in lane order, and write the i16 result back to %dst.
395  %acc = load i16, i16 addrspace(1)* %dst, align 2
396  %add1 = add i16 %mul1, %acc
397  %add2 = add i16 %add1, %mul2
398  %add3 = add i16 %add2, %mul3
399  %add4 = add i16 %add3, %mul4
400  store i16 %add4, i16 addrspace(1)* %dst, align 2
401  ret void
402}
403
; idot4_acc8: 4 x i8 dot product computed and accumulated entirely in i8.
; Each work-item (indexed by llvm.amdgcn.workitem.id.x) loads one <4 x i8> from
; %src1 and one from %src2, multiplies the raw i8 lanes with no extension,
; adds the four i8 products to the i8 read from %dst, and stores the sum back
; to %dst.
; None of the check blocks below expects a dot4 instruction for this kernel:
; every run expects shifts / bit-field extracts feeding a chain of
; multiply-adds and a byte store.
404define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
405; GFX7-LABEL: idot4_acc8:
406; GFX7:       ; %bb.0: ; %entry
407; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
408; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
409; GFX7-NEXT:    s_mov_b32 s3, 0xf000
410; GFX7-NEXT:    s_mov_b32 s10, 0
411; GFX7-NEXT:    s_mov_b32 s11, s3
412; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
413; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
414; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
415; GFX7-NEXT:    v_mov_b32_e32 v1, 0
416; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
417; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
418; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
419; GFX7-NEXT:    s_mov_b32 s2, -1
420; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
421; GFX7-NEXT:    s_movk_i32 s4, 0xff
422; GFX7-NEXT:    s_waitcnt vmcnt(2)
423; GFX7-NEXT:    v_and_b32_e32 v3, s4, v2
424; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
425; GFX7-NEXT:    s_waitcnt vmcnt(1)
426; GFX7-NEXT:    v_and_b32_e32 v6, s4, v0
427; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
428; GFX7-NEXT:    s_waitcnt vmcnt(0)
429; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
430; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
431; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
432; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
433; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
434; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
435; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
436; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
437; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
438; GFX7-NEXT:    s_endpgm
439;
440; GFX8-LABEL: idot4_acc8:
441; GFX8:       ; %bb.0: ; %entry
442; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
443; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
444; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
445; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
446; GFX8-NEXT:    v_mov_b32_e32 v1, s5
447; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
448; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
449; GFX8-NEXT:    flat_load_dword v3, v[0:1]
450; GFX8-NEXT:    v_mov_b32_e32 v1, s7
451; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
452; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
453; GFX8-NEXT:    flat_load_dword v2, v[0:1]
454; GFX8-NEXT:    v_mov_b32_e32 v0, s0
455; GFX8-NEXT:    v_mov_b32_e32 v1, s1
456; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
457; GFX8-NEXT:    s_waitcnt vmcnt(2)
458; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
459; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
460; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
461; GFX8-NEXT:    s_waitcnt vmcnt(1)
462; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
463; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
464; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
465; GFX8-NEXT:    s_waitcnt vmcnt(0)
466; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
467; GFX8-NEXT:    v_mad_u16 v2, v7, v8, v2
468; GFX8-NEXT:    v_mad_u16 v2, v5, v6, v2
469; GFX8-NEXT:    v_mad_u16 v2, v9, v10, v2
470; GFX8-NEXT:    flat_store_byte v[0:1], v2
471; GFX8-NEXT:    s_endpgm
472;
473; GFX9-NODL-LABEL: idot4_acc8:
474; GFX9-NODL:       ; %bb.0: ; %entry
475; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
476; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
477; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
478; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
479; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
480; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
481; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
482; GFX9-NODL-NEXT:    global_load_ubyte v3, v0, s[2:3]
483; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
484; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
485; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
486; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
487; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
488; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
489; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
490; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
491; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
492; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v6, v7, v1
493; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
494; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
495; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v8, v9, v1
496; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[2:3]
497; GFX9-NODL-NEXT:    s_endpgm
498;
499; GFX9-DL-LABEL: idot4_acc8:
500; GFX9-DL:       ; %bb.0: ; %entry
501; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
502; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
503; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
504; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
505; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
506; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
507; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
508; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
509; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
510; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
511; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
512; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
513; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
514; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
515; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
516; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
517; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
518; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v6, v7, v1
519; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
520; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
521; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v9, v1
522; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
523; GFX9-DL-NEXT:    s_endpgm
524;
525; GFX10-DL-LABEL: idot4_acc8:
526; GFX10-DL:       ; %bb.0: ; %entry
527; GFX10-DL-NEXT:    s_clause 0x1
528; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
529; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
530; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
531; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
532; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
533; GFX10-DL-NEXT:    s_clause 0x1
534; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
535; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
536; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
537; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
538; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
539; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
540; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
541; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
542; GFX10-DL-NEXT:    v_mad_u16 v4, v2, v3, v4
543; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
544; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
545; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
546; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
547; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
548; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
549; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
550; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
551; GFX10-DL-NEXT:    s_endpgm
552                                      <4 x i8> addrspace(1)* %src2,
553                                      i8 addrspace(1)* nocapture %dst) {
554entry:
  ; Per-work-item inputs: element %idx of each <4 x i8> source array.
555  %idx = call i32 @llvm.amdgcn.workitem.id.x()
556  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
557  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
558  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
559  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
560
  ; Lane products: multiply lane k of both vectors directly in i8; there is no
  ; sext/zext here and the multiplies carry no wrap flags.
561  %v1e0 = extractelement <4 x i8> %vec1, i64 0
562  %v2e0 = extractelement <4 x i8> %vec2, i64 0
563  %mul1 = mul i8 %v1e0, %v2e0
564
565  %v1e1 = extractelement <4 x i8> %vec1, i64 1
566  %v2e1 = extractelement <4 x i8> %vec2, i64 1
567  %mul2 = mul i8 %v1e1, %v2e1
568
569  %v1e2 = extractelement <4 x i8> %vec1, i64 2
570  %v2e2 = extractelement <4 x i8> %vec2, i64 2
571  %mul3 = mul i8 %v1e2, %v2e2
572
573  %v1e3 = extractelement <4 x i8> %vec1, i64 3
574  %v2e3 = extractelement <4 x i8> %vec2, i64 3
575  %mul4 = mul i8 %v1e3, %v2e3
576
  ; Accumulate: start from the i8 already stored at %dst, add the four
  ; products in lane order, and write the i8 result back to %dst.
  ; NOTE(review): the i8 load/store use align 2, above the natural alignment
  ; of i8 - presumably carried over from the i16 variant; confirm.
  ; NOTE(review): only the final add carries nsw - confirm this is intended.
577  %acc = load i8, i8 addrspace(1)* %dst, align 2
578  %add1 = add i8 %mul1, %acc
579  %add2 = add i8 %add1, %mul2
580  %add3 = add i8 %add2, %mul3
581  %add4 = add nsw i8 %add3, %mul4
582  store i8 %add4, i8 addrspace(1)* %dst, align 2
583  ret void
584}
585
586define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
587; GFX7-LABEL: idot4_multiuse_mul1:
588; GFX7:       ; %bb.0: ; %entry
589; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
590; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
591; GFX7-NEXT:    s_mov_b32 s3, 0xf000
592; GFX7-NEXT:    s_mov_b32 s10, 0
593; GFX7-NEXT:    s_mov_b32 s11, s3
594; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
595; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
596; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
597; GFX7-NEXT:    v_mov_b32_e32 v1, 0
598; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
599; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
600; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
601; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
602; GFX7-NEXT:    s_mov_b32 s2, -1
603; GFX7-NEXT:    s_waitcnt vmcnt(1)
604; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
605; GFX7-NEXT:    v_bfe_i32 v3, v2, 8, 8
606; GFX7-NEXT:    s_waitcnt vmcnt(0)
607; GFX7-NEXT:    v_bfe_i32 v5, v0, 0, 8
608; GFX7-NEXT:    v_bfe_i32 v6, v0, 8, 8
609; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
610; GFX7-NEXT:    v_mad_i32_i24 v8, v1, v5, s4
611; GFX7-NEXT:    v_mad_i32_i24 v3, v3, v6, v8
612; GFX7-NEXT:    v_bfe_i32 v4, v2, 16, 8
613; GFX7-NEXT:    v_bfe_i32 v7, v0, 16, 8
614; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v5, v3
615; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
616; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
617; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v7, v1
618; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
619; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
620; GFX7-NEXT:    s_endpgm
621;
622; GFX8-LABEL: idot4_multiuse_mul1:
623; GFX8:       ; %bb.0: ; %entry
624; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
625; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
626; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
627; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
628; GFX8-NEXT:    v_mov_b32_e32 v1, s5
629; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
630; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
631; GFX8-NEXT:    flat_load_dword v3, v[0:1]
632; GFX8-NEXT:    v_mov_b32_e32 v1, s7
633; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
634; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
635; GFX8-NEXT:    flat_load_dword v0, v[0:1]
636; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
637; GFX8-NEXT:    s_waitcnt vmcnt(1)
638; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
639; GFX8-NEXT:    v_bfe_i32 v4, v3, 8, 8
640; GFX8-NEXT:    v_bfe_i32 v6, v3, 16, 8
641; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 24, v3
642; GFX8-NEXT:    s_waitcnt vmcnt(0)
643; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 8
644; GFX8-NEXT:    v_bfe_i32 v5, v0, 8, 8
645; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
646; GFX8-NEXT:    v_mad_i32_i24 v8, v1, v2, s2
647; GFX8-NEXT:    v_mad_i32_i24 v4, v4, v5, v8
648; GFX8-NEXT:    v_bfe_i32 v7, v0, 16, 8
649; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, v4
650; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
651; GFX8-NEXT:    v_mad_i32_i24 v1, v6, v7, v1
652; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
653; GFX8-NEXT:    v_mov_b32_e32 v0, s0
654; GFX8-NEXT:    v_mov_b32_e32 v1, s1
655; GFX8-NEXT:    flat_store_dword v[0:1], v2
656; GFX8-NEXT:    s_endpgm
657;
658; GFX9-NODL-LABEL: idot4_multiuse_mul1:
659; GFX9-NODL:       ; %bb.0: ; %entry
660; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
661; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
662; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
663; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
664; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
665; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
666; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
667; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
668; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
669; GFX9-NODL-NEXT:    v_bfe_i32 v3, v1, 0, 8
670; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
671; GFX9-NODL-NEXT:    v_bfe_i32 v4, v2, 0, 8
672; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
673; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
674; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
675; GFX9-NODL-NEXT:    v_mul_i32_i24_e32 v2, v3, v4
676; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
677; GFX9-NODL-NEXT:    v_mad_i32_i24 v3, v3, v4, s0
678; GFX9-NODL-NEXT:    v_add3_u32 v2, v5, v3, v2
679; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v6, v1
680; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
681; GFX9-NODL-NEXT:    s_endpgm
682;
683; GFX9-DL-LABEL: idot4_multiuse_mul1:
684; GFX9-DL:       ; %bb.0: ; %entry
685; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
686; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
687; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
688; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
689; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
690; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
691; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
692; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
693; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
694; GFX9-DL-NEXT:    v_bfe_i32 v3, v1, 0, 8
695; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
696; GFX9-DL-NEXT:    v_bfe_i32 v4, v2, 0, 8
697; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
698; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
699; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
700; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v2, v3, v4
701; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
702; GFX9-DL-NEXT:    v_mad_i32_i24 v3, v3, v4, s0
703; GFX9-DL-NEXT:    v_add3_u32 v2, v5, v3, v2
704; GFX9-DL-NEXT:    v_add3_u32 v1, v2, v6, v1
705; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
706; GFX9-DL-NEXT:    s_endpgm
707;
708; GFX10-DL-LABEL: idot4_multiuse_mul1:
709; GFX10-DL:       ; %bb.0: ; %entry
710; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
711; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
712; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
713; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
714; GFX10-DL-NEXT:    s_clause 0x1
715; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
716; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
717; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
718; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
719; GFX10-DL-NEXT:    v_bfe_i32 v0, v1, 0, 8
720; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
721; GFX10-DL-NEXT:    v_bfe_i32 v3, v2, 0, 8
722; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
723; GFX10-DL-NEXT:    v_mul_i32_i24_e32 v5, v0, v3
724; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
725; GFX10-DL-NEXT:    v_mad_i32_i24 v0, v0, v3, s2
726; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
727; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
728; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
729; GFX10-DL-NEXT:    v_add3_u32 v0, v4, v0, v5
730; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
731; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
732; GFX10-DL-NEXT:    s_endpgm
733                                               <4 x i8> addrspace(1)* %src2,
734                                               i32 addrspace(1)* nocapture %dst) {
735entry:
736  %idx = call i32 @llvm.amdgcn.workitem.id.x()
737  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
738  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
739  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
740  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
741
742  %v1e0 = extractelement <4 x i8> %vec1, i64 0
743  %cv1e0 = sext i8 %v1e0 to i32
744  %v2e0 = extractelement <4 x i8> %vec2, i64 0
745  %cv2e0 = sext i8 %v2e0 to i32
746  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
747
748  %v1e1 = extractelement <4 x i8> %vec1, i64 1
749  %cv1e1 = sext i8 %v1e1 to i32
750  %v2e1 = extractelement <4 x i8> %vec2, i64 1
751  %cv2e1 = sext i8 %v2e1 to i32
752  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
753
754  %v1e2 = extractelement <4 x i8> %vec1, i64 2
755  %cv1e2 = sext i8 %v1e2 to i32
756  %v2e2 = extractelement <4 x i8> %vec2, i64 2
757  %cv2e2 = sext i8 %v2e2 to i32
758  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
759
760  %v1e3 = extractelement <4 x i8> %vec1, i64 3
761  %cv1e3 = sext i8 %v1e3 to i32
762  %v2e3 = extractelement <4 x i8> %vec2, i64 3
763  %cv2e3 = sext i8 %v2e3 to i32
764  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
765
766  %acc = load i32, i32 addrspace(1)* %dst, align 4
767  %add = add i32 %mul1, %acc
768  %add1 = add i32 %mul2, %add
769  %add2 = add i32 %add1, %mul1
770  %add3 = add i32 %add2, %mul3
771  %add4 = add i32 %add3, %mul4
772
773  store i32 %add4, i32 addrspace(1)* %dst, align 4
774  ret void
775}
776
777; TODO: Support this pattern.
; idot4_acc32_vecMul: signed 4 x i8 dot product accumulated into the i32 at
; %dst, where the four products come from one <4 x i32> vector multiply whose
; lanes are extracted afterwards (rather than from four scalar multiplies).
; In the assertions below no target emits a dot instruction for this form:
; the gfx9 and gfx10 runs use per-byte v_mul_i32_i24_sdwa plus v_add3_u32,
; and the gfx7 and gfx8 runs use a chain of v_mad_i32_i24.
778define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
779; GFX7-LABEL: idot4_acc32_vecMul:
780; GFX7:       ; %bb.0: ; %entry
781; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
782; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
783; GFX7-NEXT:    s_mov_b32 s3, 0xf000
784; GFX7-NEXT:    s_mov_b32 s10, 0
785; GFX7-NEXT:    s_mov_b32 s11, s3
786; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
787; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
788; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
789; GFX7-NEXT:    v_mov_b32_e32 v1, 0
790; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
791; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
792; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
793; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
794; GFX7-NEXT:    s_mov_b32 s2, -1
795; GFX7-NEXT:    s_waitcnt vmcnt(1)
796; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 24, v2
797; GFX7-NEXT:    v_bfe_i32 v3, v2, 16, 8
798; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 8
799; GFX7-NEXT:    v_bfe_i32 v2, v2, 0, 8
800; GFX7-NEXT:    s_waitcnt vmcnt(0)
801; GFX7-NEXT:    v_ashrrev_i32_e32 v5, 24, v0
802; GFX7-NEXT:    v_bfe_i32 v6, v0, 16, 8
803; GFX7-NEXT:    v_bfe_i32 v7, v0, 8, 8
804; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 8
805; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
806; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, s4
807; GFX7-NEXT:    v_mad_i32_i24 v0, v4, v7, v0
808; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v6, v0
809; GFX7-NEXT:    v_mad_i32_i24 v0, v1, v5, v0
810; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
811; GFX7-NEXT:    s_endpgm
812;
813; GFX8-LABEL: idot4_acc32_vecMul:
814; GFX8:       ; %bb.0: ; %entry
815; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
816; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
817; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
818; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
819; GFX8-NEXT:    v_mov_b32_e32 v1, s5
820; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
821; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
822; GFX8-NEXT:    flat_load_dword v3, v[0:1]
823; GFX8-NEXT:    v_mov_b32_e32 v1, s7
824; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
825; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
826; GFX8-NEXT:    flat_load_dword v0, v[0:1]
827; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
828; GFX8-NEXT:    s_waitcnt vmcnt(1)
829; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 8, v3
830; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 24, v3
831; GFX8-NEXT:    v_bfe_i32 v5, v3, 16, 8
832; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 8
833; GFX8-NEXT:    v_bfe_i32 v1, v1, 0, 8
834; GFX8-NEXT:    s_waitcnt vmcnt(0)
835; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 8, v0
836; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 24, v0
837; GFX8-NEXT:    v_bfe_i32 v7, v0, 16, 8
838; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 8
839; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 8
840; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
841; GFX8-NEXT:    v_mad_i32_i24 v0, v3, v0, s2
842; GFX8-NEXT:    v_mad_i32_i24 v0, v1, v2, v0
843; GFX8-NEXT:    v_mad_i32_i24 v0, v5, v7, v0
844; GFX8-NEXT:    v_mad_i32_i24 v2, v4, v6, v0
845; GFX8-NEXT:    v_mov_b32_e32 v0, s0
846; GFX8-NEXT:    v_mov_b32_e32 v1, s1
847; GFX8-NEXT:    flat_store_dword v[0:1], v2
848; GFX8-NEXT:    s_endpgm
849;
850; GFX9-NODL-LABEL: idot4_acc32_vecMul:
851; GFX9-NODL:       ; %bb.0: ; %entry
852; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
853; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
854; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
855; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
856; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
857; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
858; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
859; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
860; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
861; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v3, 8, v1
862; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
863; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v4, 8, v2
864; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
865; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v3), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
866; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
867; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
868; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
869; GFX9-NODL-NEXT:    v_add3_u32 v2, v5, s0, v3
870; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v6, v1
871; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
872; GFX9-NODL-NEXT:    s_endpgm
873;
874; GFX9-DL-LABEL: idot4_acc32_vecMul:
875; GFX9-DL:       ; %bb.0: ; %entry
876; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
877; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
878; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
879; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
880; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
881; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
882; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
883; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
884; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
885; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v3, 8, v1
886; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
887; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v4, 8, v2
888; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
889; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v3), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
890; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
891; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
892; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
893; GFX9-DL-NEXT:    v_add3_u32 v2, v5, s0, v3
894; GFX9-DL-NEXT:    v_add3_u32 v1, v2, v6, v1
895; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
896; GFX9-DL-NEXT:    s_endpgm
897;
898; GFX10-DL-LABEL: idot4_acc32_vecMul:
899; GFX10-DL:       ; %bb.0: ; %entry
900; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
901; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
902; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
903; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
904; GFX10-DL-NEXT:    s_clause 0x1
905; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
906; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
907; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
908; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
909; GFX10-DL-NEXT:    v_lshrrev_b16 v0, 8, v1
910; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
911; GFX10-DL-NEXT:    v_lshrrev_b16 v3, 8, v2
912; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
913; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v0, sext(v0), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
914; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
915; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
916; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
917; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
918; GFX10-DL-NEXT:    v_add3_u32 v0, v4, s2, v0
919; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
920; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
921; GFX10-DL-NEXT:    s_endpgm
922                                              <4 x i8> addrspace(1)* %src2,
923                                              i32 addrspace(1)* nocapture %dst) {
924entry:
; Each workitem loads the <4 x i8> element of %src1 and %src2 selected by its
; workitem id.
925  %idx = call i32 @llvm.amdgcn.workitem.id.x()
926  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
927  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
928  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
929  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
930
; Sign-extend all four bytes of each operand in one vector operation.
931  %cvec1 = sext <4 x i8> %vec1 to <4 x i32>
932  %cvec2 = sext <4 x i8> %vec2 to <4 x i32>
933
; One vector multiply yields the four products; the lanes are then pulled out
; as scalars so they can be summed.
934  %mul = mul <4 x i32> %cvec1, %cvec2
935  %mul0 = extractelement <4 x i32> %mul, i64 0
936  %mul1 = extractelement <4 x i32> %mul, i64 1
937  %mul2 = extractelement <4 x i32> %mul, i64 2
938  %mul3 = extractelement <4 x i32> %mul, i64 3
939
; Start from the i32 currently stored at %dst and add the products in lane
; order 0, 1, 2, 3.
940  %acc = load i32, i32 addrspace(1)* %dst, align 4
941  %add1 = add i32 %mul0, %acc
942  %add2 = add i32 %add1, %mul1
943  %add3 = add i32 %add2, %mul2
944  %add4 = add i32 %add3, %mul3
945
; Write the accumulated sum back to %dst.
946  store i32 %add4, i32 addrspace(1)* %dst, align 4
947  ret void
948}
949
; idot4_acc16_vecMul: signed 4 x i8 dot product accumulated into the i16 at
; %dst. The bytes are sign-extended to i16, multiplied with one <4 x i16>
; vector multiply, and the extracted lanes are summed with plain i16 adds.
; In the assertions below no target emits a dot instruction for this form:
; the gfx9 and gfx10 runs use v_pk_mul_lo_u16 on packed 16-bit pairs, the gfx8
; run uses v_mad_u16, and the gfx7 run uses v_mad_u32_u24.
950define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
951; GFX7-LABEL: idot4_acc16_vecMul:
952; GFX7:       ; %bb.0: ; %entry
953; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
954; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
955; GFX7-NEXT:    s_mov_b32 s3, 0xf000
956; GFX7-NEXT:    s_mov_b32 s10, 0
957; GFX7-NEXT:    s_mov_b32 s11, s3
958; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
959; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
960; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
961; GFX7-NEXT:    v_mov_b32_e32 v1, 0
962; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
963; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
964; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
965; GFX7-NEXT:    s_mov_b32 s2, -1
966; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
967; GFX7-NEXT:    s_mov_b32 s4, 0xffff
968; GFX7-NEXT:    s_waitcnt vmcnt(2)
969; GFX7-NEXT:    v_bfe_i32 v3, v2, 8, 8
970; GFX7-NEXT:    v_bfe_i32 v4, v2, 0, 8
971; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
972; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
973; GFX7-NEXT:    s_waitcnt vmcnt(1)
974; GFX7-NEXT:    v_bfe_i32 v6, v0, 8, 8
975; GFX7-NEXT:    v_bfe_i32 v7, v0, 0, 8
976; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
977; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
978; GFX7-NEXT:    v_and_b32_e32 v6, s4, v7
979; GFX7-NEXT:    v_bfe_i32 v8, v0, 16, 8
980; GFX7-NEXT:    v_or_b32_e32 v4, v6, v4
981; GFX7-NEXT:    v_and_b32_e32 v7, s4, v8
982; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
983; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
984; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
985; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
986; GFX7-NEXT:    v_bfe_i32 v5, v2, 16, 8
987; GFX7-NEXT:    s_waitcnt vmcnt(0)
988; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
989; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
990; GFX7-NEXT:    v_and_b32_e32 v5, s4, v5
991; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
992; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v8, v1
993; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
994; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
995; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v7, v1
996; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
997; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
998; GFX7-NEXT:    s_endpgm
999;
1000; GFX8-LABEL: idot4_acc16_vecMul:
1001; GFX8:       ; %bb.0: ; %entry
1002; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1003; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1004; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1005; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1006; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1007; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1008; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1009; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1010; GFX8-NEXT:    v_mov_b32_e32 v1, s7
1011; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
1012; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1013; GFX8-NEXT:    flat_load_dword v2, v[0:1]
1014; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1015; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1016; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
1017; GFX8-NEXT:    s_waitcnt vmcnt(2)
1018; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
1019; GFX8-NEXT:    v_ashrrev_i16_e32 v7, 8, v3
1020; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 8
1021; GFX8-NEXT:    v_ashrrev_i16_e32 v9, 8, v5
1022; GFX8-NEXT:    v_bfe_i32 v5, v5, 0, 8
1023; GFX8-NEXT:    s_waitcnt vmcnt(1)
1024; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
1025; GFX8-NEXT:    v_ashrrev_i16_e32 v8, 8, v2
1026; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 8
1027; GFX8-NEXT:    s_waitcnt vmcnt(0)
1028; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
1029; GFX8-NEXT:    v_ashrrev_i16_e32 v10, 8, v6
1030; GFX8-NEXT:    v_bfe_i32 v6, v6, 0, 8
1031; GFX8-NEXT:    v_mad_u16 v2, v7, v8, v2
1032; GFX8-NEXT:    v_mad_u16 v2, v5, v6, v2
1033; GFX8-NEXT:    v_mad_u16 v2, v9, v10, v2
1034; GFX8-NEXT:    flat_store_short v[0:1], v2
1035; GFX8-NEXT:    s_endpgm
1036;
1037; GFX9-NODL-LABEL: idot4_acc16_vecMul:
1038; GFX9-NODL:       ; %bb.0: ; %entry
1039; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1040; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1041; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1042; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, 0xffff
1043; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1044; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
1045; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
1046; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1047; GFX9-NODL-NEXT:    global_load_ushort v3, v0, s[2:3]
1048; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
1049; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
1050; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
1051; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
1052; GFX9-NODL-NEXT:    v_ashrrev_i16_e32 v7, 8, v1
1053; GFX9-NODL-NEXT:    v_ashrrev_i16_e32 v8, 8, v2
1054; GFX9-NODL-NEXT:    v_and_b32_sdwa v2, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1055; GFX9-NODL-NEXT:    v_and_b32_sdwa v1, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1056; GFX9-NODL-NEXT:    v_lshl_or_b32 v2, v8, 16, v2
1057; GFX9-NODL-NEXT:    v_lshl_or_b32 v1, v7, 16, v1
1058; GFX9-NODL-NEXT:    v_ashrrev_i16_e32 v9, 8, v5
1059; GFX9-NODL-NEXT:    v_ashrrev_i16_e32 v10, 8, v6
1060; GFX9-NODL-NEXT:    v_and_b32_sdwa v6, v4, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1061; GFX9-NODL-NEXT:    v_and_b32_sdwa v4, v4, sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1062; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
1063; GFX9-NODL-NEXT:    v_lshl_or_b32 v5, v10, 16, v6
1064; GFX9-NODL-NEXT:    v_lshl_or_b32 v4, v9, 16, v4
1065; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1066; GFX9-NODL-NEXT:    v_add_u16_e32 v3, v1, v3
1067; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v2, v4, v5
1068; GFX9-NODL-NEXT:    v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1069; GFX9-NODL-NEXT:    v_add_u16_e32 v1, v1, v2
1070; GFX9-NODL-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1071; GFX9-NODL-NEXT:    global_store_short v0, v1, s[2:3]
1072; GFX9-NODL-NEXT:    s_endpgm
1073;
1074; GFX9-DL-LABEL: idot4_acc16_vecMul:
1075; GFX9-DL:       ; %bb.0: ; %entry
1076; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1077; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1078; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1079; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 0xffff
1080; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1081; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1082; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1083; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1084; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
1085; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
1086; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
1087; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1088; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
1089; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 8, v1
1090; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v8, 8, v2
1091; GFX9-DL-NEXT:    v_and_b32_sdwa v2, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1092; GFX9-DL-NEXT:    v_and_b32_sdwa v1, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1093; GFX9-DL-NEXT:    v_lshl_or_b32 v2, v8, 16, v2
1094; GFX9-DL-NEXT:    v_lshl_or_b32 v1, v7, 16, v1
1095; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v9, 8, v5
1096; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v10, 8, v6
1097; GFX9-DL-NEXT:    v_and_b32_sdwa v6, v4, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1098; GFX9-DL-NEXT:    v_and_b32_sdwa v4, v4, sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1099; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
1100; GFX9-DL-NEXT:    v_lshl_or_b32 v5, v10, 16, v6
1101; GFX9-DL-NEXT:    v_lshl_or_b32 v4, v9, 16, v4
1102; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1103; GFX9-DL-NEXT:    v_add_u16_e32 v3, v1, v3
1104; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v4, v5
1105; GFX9-DL-NEXT:    v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1106; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v2
1107; GFX9-DL-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1108; GFX9-DL-NEXT:    global_store_short v0, v1, s[2:3]
1109; GFX9-DL-NEXT:    s_endpgm
1110;
1111; GFX10-DL-LABEL: idot4_acc16_vecMul:
1112; GFX10-DL:       ; %bb.0: ; %entry
1113; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1114; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1115; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1116; GFX10-DL-NEXT:    v_mov_b32_e32 v4, 0xffff
1117; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1118; GFX10-DL-NEXT:    s_clause 0x1
1119; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1120; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1121; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
1122; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[0:1]
1123; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
1124; GFX10-DL-NEXT:    v_ashrrev_i16 v5, 8, v1
1125; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1126; GFX10-DL-NEXT:    v_ashrrev_i16 v6, 8, v2
1127; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1128; GFX10-DL-NEXT:    v_and_b32_sdwa v8, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1129; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1130; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1131; GFX10-DL-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
1132; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v5, 16, v8
1133; GFX10-DL-NEXT:    v_ashrrev_i16 v7, 8, v1
1134; GFX10-DL-NEXT:    v_ashrrev_i16 v8, 8, v2
1135; GFX10-DL-NEXT:    v_and_b32_sdwa v2, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1136; GFX10-DL-NEXT:    v_and_b32_sdwa v1, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1137; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v5, v6
1138; GFX10-DL-NEXT:    v_lshl_or_b32 v2, v8, 16, v2
1139; GFX10-DL-NEXT:    v_lshl_or_b32 v1, v7, 16, v1
1140; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
1141; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1142; GFX10-DL-NEXT:    v_add_nc_u16 v3, v4, v3
1143; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
1144; GFX10-DL-NEXT:    v_add_nc_u16 v2, v3, v5
1145; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
1146; GFX10-DL-NEXT:    v_add_nc_u16 v1, v2, v1
1147; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v3
1148; GFX10-DL-NEXT:    global_store_short v0, v1, s[0:1]
1149; GFX10-DL-NEXT:    s_endpgm
1150                                              <4 x i8> addrspace(1)* %src2,
1151                                              i16 addrspace(1)* nocapture %dst) {
1152entry:
; Each workitem loads the <4 x i8> element of %src1 and %src2 selected by its
; workitem id.
1153  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1154  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
1155  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
1156  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
1157  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
1158
; Sign-extend only to i16 here, so the multiply and all adds stay 16 bits wide.
1159  %cvec1 = sext <4 x i8> %vec1 to <4 x i16>
1160  %cvec2 = sext <4 x i8> %vec2 to <4 x i16>
1161
; One vector multiply yields the four products; the lanes are then pulled out
; as scalars so they can be summed.
1162  %mul = mul <4 x i16> %cvec1, %cvec2
1163  %mul0 = extractelement <4 x i16> %mul, i64 0
1164  %mul1 = extractelement <4 x i16> %mul, i64 1
1165  %mul2 = extractelement <4 x i16> %mul, i64 2
1166  %mul3 = extractelement <4 x i16> %mul, i64 3
1167
; Start from the i16 currently stored at %dst and add the products in lane
; order 0, 1, 2, 3. The adds carry no nsw/nuw flags, so overflow wraps.
1168  %acc = load i16, i16 addrspace(1)* %dst, align 4
1169  %add1 = add i16 %mul0, %acc
1170  %add2 = add i16 %add1, %mul1
1171  %add3 = add i16 %add2, %mul2
1172  %add4 = add i16 %add3, %mul3
1173
; Write the accumulated sum back to %dst.
1174  store i16 %add4, i16 addrspace(1)* %dst, align 4
1175  ret void
1176}
1177
; AMDGPU intrinsic returning the workitem id in the x dimension as an i32; the
; kernels above call it to compute %idx, the element index into %src1/%src2.
1178declare i32 @llvm.amdgcn.workitem.id.x()
1179