1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-NODL %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
8
; idot4_acc32: signed 4 x i8 dot product accumulated into an i32.
; Each work item loads one <4 x i8> from %src1 and one from %src2 (indexed by
; llvm.amdgcn.workitem.id.x), sign-extends every element to i32, multiplies
; element-wise, and adds the four products to the i32 read from %dst before
; storing the sum back to %dst.
; In the check lines below, the gfx906 and gfx1011/gfx1012 runs expect the whole
; chain to become a single v_dot4_i32_i8; the gfx700/gfx803 runs expect a chain of
; v_mad_i32_i24, and the gfx900 run expects SDWA multiplies plus v_add3_u32.
9define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1,
10; GFX7-LABEL: idot4_acc32:
11; GFX7:       ; %bb.0: ; %entry
12; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
13; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
14; GFX7-NEXT:    s_mov_b32 s3, 0xf000
15; GFX7-NEXT:    s_mov_b32 s10, 0
16; GFX7-NEXT:    s_mov_b32 s11, s3
17; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
18; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
19; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
20; GFX7-NEXT:    v_mov_b32_e32 v1, 0
21; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
22; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
23; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
24; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
25; GFX7-NEXT:    s_mov_b32 s2, -1
26; GFX7-NEXT:    s_waitcnt vmcnt(1)
27; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
28; GFX7-NEXT:    v_bfe_i32 v3, v2, 8, 8
29; GFX7-NEXT:    s_waitcnt vmcnt(0)
30; GFX7-NEXT:    v_bfe_i32 v5, v0, 0, 8
31; GFX7-NEXT:    v_bfe_i32 v6, v0, 8, 8
32; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
33; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v5, s4
34; GFX7-NEXT:    v_bfe_i32 v4, v2, 16, 8
35; GFX7-NEXT:    v_bfe_i32 v7, v0, 16, 8
36; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v6, v1
37; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
38; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
39; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v7, v1
40; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
41; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
42; GFX7-NEXT:    s_endpgm
43;
44; GFX8-LABEL: idot4_acc32:
45; GFX8:       ; %bb.0: ; %entry
46; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
47; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
48; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
49; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
50; GFX8-NEXT:    v_mov_b32_e32 v1, s5
51; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
52; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
53; GFX8-NEXT:    flat_load_dword v3, v[0:1]
54; GFX8-NEXT:    v_mov_b32_e32 v1, s7
55; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
56; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
57; GFX8-NEXT:    flat_load_dword v0, v[0:1]
58; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
59; GFX8-NEXT:    s_waitcnt vmcnt(1)
60; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
61; GFX8-NEXT:    v_bfe_i32 v4, v3, 8, 8
62; GFX8-NEXT:    v_bfe_i32 v6, v3, 16, 8
63; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 24, v3
64; GFX8-NEXT:    s_waitcnt vmcnt(0)
65; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 8
66; GFX8-NEXT:    v_bfe_i32 v5, v0, 8, 8
67; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, s2
69; GFX8-NEXT:    v_bfe_i32 v7, v0, 16, 8
70; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
71; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
72; GFX8-NEXT:    v_mad_i32_i24 v1, v6, v7, v1
73; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
74; GFX8-NEXT:    v_mov_b32_e32 v0, s0
75; GFX8-NEXT:    v_mov_b32_e32 v1, s1
76; GFX8-NEXT:    flat_store_dword v[0:1], v2
77; GFX8-NEXT:    s_endpgm
78;
79; GFX9-NODL-LABEL: idot4_acc32:
80; GFX9-NODL:       ; %bb.0: ; %entry
81; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
82; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
83; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
84; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
85; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
86; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
87; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
88; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
89; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
90; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
91; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
92; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
93; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
94; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
95; GFX9-NODL-NEXT:    v_add3_u32 v2, v3, s0, v4
96; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
97; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
98; GFX9-NODL-NEXT:    s_endpgm
99;
100; GFX9-DL-LABEL: idot4_acc32:
101; GFX9-DL:       ; %bb.0: ; %entry
102; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
103; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
104; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
105; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
106; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
107; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
108; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
109; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
110; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
111; GFX9-DL-NEXT:    v_dot4_i32_i8 v1, v1, v2, s0
112; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
113; GFX9-DL-NEXT:    s_endpgm
114;
115; GFX10-DL-LABEL: idot4_acc32:
116; GFX10-DL:       ; %bb.0: ; %entry
117; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
118; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
119; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
120; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
121; GFX10-DL-NEXT:    s_clause 0x1
122; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
123; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
124; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
125; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
126; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
127; GFX10-DL-NEXT:    v_dot4_i32_i8 v1, v1, v2, s2
128; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
129; GFX10-DL-NEXT:    s_endpgm
130                                       <4 x i8> addrspace(1)* %src2,
131                                       i32 addrspace(1)* nocapture %dst) {
132entry:
; The work-item id selects which <4 x i8> of each source array is loaded.
133  %idx = call i32 @llvm.amdgcn.workitem.id.x()
134  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
135  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
136  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
137  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
138
; Element 0: sign-extend both i8 values to i32, then multiply.
139  %v1e0 = extractelement <4 x i8> %vec1, i64 0
140  %cv1e0 = sext i8 %v1e0 to i32
141  %v2e0 = extractelement <4 x i8> %vec2, i64 0
142  %cv2e0 = sext i8 %v2e0 to i32
143  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
144
; Element 1: same sext + multiply.
145  %v1e1 = extractelement <4 x i8> %vec1, i64 1
146  %cv1e1 = sext i8 %v1e1 to i32
147  %v2e1 = extractelement <4 x i8> %vec2, i64 1
148  %cv2e1 = sext i8 %v2e1 to i32
149  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
150
; Element 2: same sext + multiply.
151  %v1e2 = extractelement <4 x i8> %vec1, i64 2
152  %cv1e2 = sext i8 %v1e2 to i32
153  %v2e2 = extractelement <4 x i8> %vec2, i64 2
154  %cv2e2 = sext i8 %v2e2 to i32
155  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
156
; Element 3: same sext + multiply.
157  %v1e3 = extractelement <4 x i8> %vec1, i64 3
158  %cv1e3 = sext i8 %v1e3 to i32
159  %v2e3 = extractelement <4 x i8> %vec2, i64 3
160  %cv2e3 = sext i8 %v2e3 to i32
161  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
162
; Accumulate: start from the i32 currently stored at %dst, add the four products
; in element order (each product has a single use), and write the sum back.
163  %acc = load i32, i32 addrspace(1)* %dst, align 4
164  %add1 = add i32 %mul1, %acc
165  %add2 = add i32 %add1, %mul2
166  %add3 = add i32 %add2, %mul3
167  %add4 = add i32 %add3, %mul4
168  store i32 %add4, i32 addrspace(1)* %dst, align 4
169  ret void
170}
171
172; TODO: Currently, vector elements 0 and 3 are zero-extended from i16 to i32 when they should
173; be sign-extended directly to i32; this prevents the pattern recognizer from matching this pattern.
; idot4_acc16: signed 4 x i8 dot product accumulated into an i16.
; Each work item loads one <4 x i8> from %src1 and one from %src2 (indexed by
; llvm.amdgcn.workitem.id.x), sign-extends every element to i16, multiplies
; element-wise in i16, and adds the four products to the i16 read from %dst
; before storing the sum back to %dst.
; None of the check lines below expect a dot instruction: the gfx700 run expects
; v_mad_u32_u24, the gfx803 and gfx1011/gfx1012 runs expect v_mad_u16, and the
; gfx900/gfx906 runs expect v_mad_legacy_u16.
174define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
175; GFX7-LABEL: idot4_acc16:
176; GFX7:       ; %bb.0: ; %entry
177; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
178; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
179; GFX7-NEXT:    s_mov_b32 s3, 0xf000
180; GFX7-NEXT:    s_mov_b32 s10, 0
181; GFX7-NEXT:    s_mov_b32 s11, s3
182; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
183; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
184; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
185; GFX7-NEXT:    v_mov_b32_e32 v1, 0
186; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
187; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
188; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
189; GFX7-NEXT:    s_mov_b32 s2, -1
190; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
191; GFX7-NEXT:    s_waitcnt vmcnt(2)
192; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 8
193; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 8
194; GFX7-NEXT:    s_waitcnt vmcnt(1)
195; GFX7-NEXT:    v_bfe_i32 v6, v0, 0, 8
196; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
197; GFX7-NEXT:    v_bfe_i32 v7, v0, 8, 8
198; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v6
199; GFX7-NEXT:    v_bfe_i32 v5, v2, 16, 8
200; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
201; GFX7-NEXT:    v_bfe_i32 v8, v0, 16, 8
202; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v7
203; GFX7-NEXT:    s_waitcnt vmcnt(0)
204; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
205; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
206; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v5
207; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
208; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff, v8
209; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
210; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
211; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
212; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
213; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
214; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
215; GFX7-NEXT:    s_endpgm
216;
217; GFX8-LABEL: idot4_acc16:
218; GFX8:       ; %bb.0: ; %entry
219; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
220; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
221; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
222; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
223; GFX8-NEXT:    v_mov_b32_e32 v1, s5
224; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
225; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
226; GFX8-NEXT:    flat_load_dword v3, v[0:1]
227; GFX8-NEXT:    v_mov_b32_e32 v1, s7
228; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
229; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
230; GFX8-NEXT:    flat_load_dword v2, v[0:1]
231; GFX8-NEXT:    v_mov_b32_e32 v0, s0
232; GFX8-NEXT:    v_mov_b32_e32 v1, s1
233; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
234; GFX8-NEXT:    s_waitcnt vmcnt(2)
235; GFX8-NEXT:    v_bfe_i32 v7, v3, 0, 8
236; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
237; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
238; GFX8-NEXT:    v_bfe_i32 v9, v9, 0, 8
239; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
240; GFX8-NEXT:    v_bfe_i32 v5, v5, 0, 8
241; GFX8-NEXT:    s_waitcnt vmcnt(1)
242; GFX8-NEXT:    v_bfe_i32 v8, v2, 0, 8
243; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v2
244; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
245; GFX8-NEXT:    v_bfe_i32 v10, v10, 0, 8
246; GFX8-NEXT:    s_waitcnt vmcnt(0)
247; GFX8-NEXT:    v_mad_u16 v4, v7, v8, v4
248; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
249; GFX8-NEXT:    v_bfe_i32 v6, v6, 0, 8
250; GFX8-NEXT:    v_mad_u16 v4, v9, v10, v4
251; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 8
252; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 8
253; GFX8-NEXT:    v_mad_u16 v4, v5, v6, v4
254; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
255; GFX8-NEXT:    flat_store_short v[0:1], v2
256; GFX8-NEXT:    s_endpgm
257;
258; GFX9-NODL-LABEL: idot4_acc16:
259; GFX9-NODL:       ; %bb.0: ; %entry
260; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
261; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
262; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
263; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
264; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
265; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
266; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
267; GFX9-NODL-NEXT:    global_load_ushort v3, v0, s[2:3]
268; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
269; GFX9-NODL-NEXT:    v_bfe_i32 v6, v1, 0, 8
270; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
271; GFX9-NODL-NEXT:    v_bfe_i32 v7, v2, 0, 8
272; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
273; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
274; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
275; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
276; GFX9-NODL-NEXT:    v_bfe_i32 v8, v8, 0, 8
277; GFX9-NODL-NEXT:    v_bfe_i32 v9, v9, 0, 8
278; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
279; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v6, v7, v3
280; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
281; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
282; GFX9-NODL-NEXT:    v_bfe_i32 v4, v4, 0, 8
283; GFX9-NODL-NEXT:    v_bfe_i32 v5, v5, 0, 8
284; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v8, v9, v3
285; GFX9-NODL-NEXT:    v_bfe_i32 v1, v1, 0, 8
286; GFX9-NODL-NEXT:    v_bfe_i32 v2, v2, 0, 8
287; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
288; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
289; GFX9-NODL-NEXT:    global_store_short v0, v1, s[2:3]
290; GFX9-NODL-NEXT:    s_endpgm
291;
292; GFX9-DL-LABEL: idot4_acc16:
293; GFX9-DL:       ; %bb.0: ; %entry
294; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
295; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
296; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
297; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
298; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
299; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
300; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
301; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
302; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
303; GFX9-DL-NEXT:    v_bfe_i32 v6, v1, 0, 8
304; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
305; GFX9-DL-NEXT:    v_bfe_i32 v7, v2, 0, 8
306; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
307; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
308; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
309; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
310; GFX9-DL-NEXT:    v_bfe_i32 v8, v8, 0, 8
311; GFX9-DL-NEXT:    v_bfe_i32 v9, v9, 0, 8
312; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
313; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v6, v7, v3
314; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
315; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
316; GFX9-DL-NEXT:    v_bfe_i32 v4, v4, 0, 8
317; GFX9-DL-NEXT:    v_bfe_i32 v5, v5, 0, 8
318; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v8, v9, v3
319; GFX9-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
320; GFX9-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
321; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
322; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
323; GFX9-DL-NEXT:    global_store_short v0, v1, s[2:3]
324; GFX9-DL-NEXT:    s_endpgm
325;
326; GFX10-DL-LABEL: idot4_acc16:
327; GFX10-DL:       ; %bb.0: ; %entry
328; GFX10-DL-NEXT:    s_clause 0x1
329; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
330; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
331; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
332; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
333; GFX10-DL-NEXT:    s_clause 0x1
334; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
335; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
336; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
337; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
338; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
339; GFX10-DL-NEXT:    v_bfe_i32 v4, v1, 0, 8
340; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
341; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
342; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
343; GFX10-DL-NEXT:    v_bfe_i32 v7, v2, 0, 8
344; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
345; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
346; GFX10-DL-NEXT:    v_bfe_i32 v5, v5, 0, 8
347; GFX10-DL-NEXT:    v_bfe_i32 v6, v6, 0, 8
348; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
349; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
350; GFX10-DL-NEXT:    v_bfe_i32 v4, v8, 0, 8
351; GFX10-DL-NEXT:    v_bfe_i32 v7, v9, 0, 8
352; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
353; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
354; GFX10-DL-NEXT:    v_mad_u16 v3, v5, v6, v3
355; GFX10-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
356; GFX10-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
357; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
358; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
359; GFX10-DL-NEXT:    global_store_short v0, v1, s[2:3]
360; GFX10-DL-NEXT:    s_endpgm
361                                       <4 x i8> addrspace(1)* %src2,
362                                       i16 addrspace(1)* nocapture %dst) {
363entry:
; The work-item id selects which <4 x i8> of each source array is loaded.
364  %idx = call i32 @llvm.amdgcn.workitem.id.x()
365  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
366  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
367  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
368  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
369
; Element 0: sign-extend both i8 values to i16 (not i32), then multiply in i16.
370  %v1e0 = extractelement <4 x i8> %vec1, i64 0
371  %cv1e0 = sext i8 %v1e0 to i16
372  %v2e0 = extractelement <4 x i8> %vec2, i64 0
373  %cv2e0 = sext i8 %v2e0 to i16
374  %mul1 = mul nsw i16 %cv1e0, %cv2e0
375
; Element 1: same sext-to-i16 + multiply.
376  %v1e1 = extractelement <4 x i8> %vec1, i64 1
377  %cv1e1 = sext i8 %v1e1 to i16
378  %v2e1 = extractelement <4 x i8> %vec2, i64 1
379  %cv2e1 = sext i8 %v2e1 to i16
380  %mul2 = mul nsw i16 %cv1e1, %cv2e1
381
; Element 2: same sext-to-i16 + multiply.
382  %v1e2 = extractelement <4 x i8> %vec1, i64 2
383  %cv1e2 = sext i8 %v1e2 to i16
384  %v2e2 = extractelement <4 x i8> %vec2, i64 2
385  %cv2e2 = sext i8 %v2e2 to i16
386  %mul3 = mul nsw i16 %cv1e2, %cv2e2
387
; Element 3: same sext-to-i16 + multiply.
388  %v1e3 = extractelement <4 x i8> %vec1, i64 3
389  %cv1e3 = sext i8 %v1e3 to i16
390  %v2e3 = extractelement <4 x i8> %vec2, i64 3
391  %cv2e3 = sext i8 %v2e3 to i16
392  %mul4 = mul nsw i16 %cv1e3, %cv2e3
393
; Accumulate in i16: start from the value currently stored at %dst, add the four
; products in element order, and write the 16-bit sum back.
394  %acc = load i16, i16 addrspace(1)* %dst, align 2
395  %add1 = add i16 %mul1, %acc
396  %add2 = add i16 %add1, %mul2
397  %add3 = add i16 %add2, %mul3
398  %add4 = add i16 %add3, %mul4
399  store i16 %add4, i16 addrspace(1)* %dst, align 2
400  ret void
401}
402
; idot4_acc8: 4 x i8 dot product accumulated into an i8.
; Each work item loads one <4 x i8> from %src1 and one from %src2 (indexed by
; llvm.amdgcn.workitem.id.x), multiplies the elements pairwise directly in i8
; (no sext/zext), and adds the four products to the i8 read from %dst before
; storing the sum back to %dst.
; None of the check lines below expect a dot instruction: the gfx700 run expects
; v_mad_u32_u24, the gfx803 and gfx1011/gfx1012 runs expect v_mad_u16, and the
; gfx900/gfx906 runs expect v_mad_legacy_u16.
403define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
404; GFX7-LABEL: idot4_acc8:
405; GFX7:       ; %bb.0: ; %entry
406; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
407; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
408; GFX7-NEXT:    s_mov_b32 s3, 0xf000
409; GFX7-NEXT:    s_mov_b32 s10, 0
410; GFX7-NEXT:    s_mov_b32 s11, s3
411; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
412; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
413; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
414; GFX7-NEXT:    v_mov_b32_e32 v1, 0
415; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
416; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
417; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
418; GFX7-NEXT:    s_mov_b32 s2, -1
419; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
420; GFX7-NEXT:    s_waitcnt vmcnt(2)
421; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
422; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
423; GFX7-NEXT:    s_waitcnt vmcnt(1)
424; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v0
425; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
426; GFX7-NEXT:    s_waitcnt vmcnt(0)
427; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
428; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
429; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
430; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
431; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
432; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
433; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
434; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
435; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
436; GFX7-NEXT:    s_endpgm
437;
438; GFX8-LABEL: idot4_acc8:
439; GFX8:       ; %bb.0: ; %entry
440; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
441; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
442; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
443; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
444; GFX8-NEXT:    v_mov_b32_e32 v1, s5
445; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
446; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
447; GFX8-NEXT:    flat_load_dword v3, v[0:1]
448; GFX8-NEXT:    v_mov_b32_e32 v1, s7
449; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
450; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
451; GFX8-NEXT:    flat_load_dword v2, v[0:1]
452; GFX8-NEXT:    v_mov_b32_e32 v0, s0
453; GFX8-NEXT:    v_mov_b32_e32 v1, s1
454; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
455; GFX8-NEXT:    s_waitcnt vmcnt(2)
456; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
457; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
458; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
459; GFX8-NEXT:    s_waitcnt vmcnt(1)
460; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
461; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
462; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
463; GFX8-NEXT:    s_waitcnt vmcnt(0)
464; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
465; GFX8-NEXT:    v_mad_u16 v2, v7, v8, v2
466; GFX8-NEXT:    v_mad_u16 v2, v5, v6, v2
467; GFX8-NEXT:    v_mad_u16 v2, v9, v10, v2
468; GFX8-NEXT:    flat_store_byte v[0:1], v2
469; GFX8-NEXT:    s_endpgm
470;
471; GFX9-NODL-LABEL: idot4_acc8:
472; GFX9-NODL:       ; %bb.0: ; %entry
473; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
474; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
475; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
476; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
477; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
478; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
479; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
480; GFX9-NODL-NEXT:    global_load_ubyte v3, v0, s[2:3]
481; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
482; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
483; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
484; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
485; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
486; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
487; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
488; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
489; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
490; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v6, v7, v1
491; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
492; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
493; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v8, v9, v1
494; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[2:3]
495; GFX9-NODL-NEXT:    s_endpgm
496;
497; GFX9-DL-LABEL: idot4_acc8:
498; GFX9-DL:       ; %bb.0: ; %entry
499; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
500; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
501; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
502; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
503; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
504; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
505; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
506; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
507; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
508; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
509; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
510; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
511; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
512; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
513; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
514; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
515; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
516; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v6, v7, v1
517; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
518; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
519; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v9, v1
520; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
521; GFX9-DL-NEXT:    s_endpgm
522;
523; GFX10-DL-LABEL: idot4_acc8:
524; GFX10-DL:       ; %bb.0: ; %entry
525; GFX10-DL-NEXT:    s_clause 0x1
526; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
527; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
528; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
529; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
530; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
531; GFX10-DL-NEXT:    s_clause 0x1
532; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
533; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
534; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
535; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
536; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
537; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
538; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
539; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
540; GFX10-DL-NEXT:    v_mad_u16 v4, v2, v3, v4
541; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
542; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
543; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
544; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
545; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
546; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
547; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
548; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
549; GFX10-DL-NEXT:    s_endpgm
550                                      <4 x i8> addrspace(1)* %src2,
551                                      i8 addrspace(1)* nocapture %dst) {
552entry:
; The work-item id selects which <4 x i8> of each source array is loaded.
553  %idx = call i32 @llvm.amdgcn.workitem.id.x()
554  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
555  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
556  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
557  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
558
; Element 0: multiply the raw i8 values; no extension and no nsw/nuw flags.
559  %v1e0 = extractelement <4 x i8> %vec1, i64 0
560  %v2e0 = extractelement <4 x i8> %vec2, i64 0
561  %mul1 = mul i8 %v1e0, %v2e0
562
; Element 1: same plain i8 multiply.
563  %v1e1 = extractelement <4 x i8> %vec1, i64 1
564  %v2e1 = extractelement <4 x i8> %vec2, i64 1
565  %mul2 = mul i8 %v1e1, %v2e1
566
; Element 2: same plain i8 multiply.
567  %v1e2 = extractelement <4 x i8> %vec1, i64 2
568  %v2e2 = extractelement <4 x i8> %vec2, i64 2
569  %mul3 = mul i8 %v1e2, %v2e2
570
; Element 3: same plain i8 multiply.
571  %v1e3 = extractelement <4 x i8> %vec1, i64 3
572  %v2e3 = extractelement <4 x i8> %vec2, i64 3
573  %mul4 = mul i8 %v1e3, %v2e3
574
; Accumulate in i8: start from the byte currently stored at %dst, add the four
; products in element order, and write the sum back. Only the final add carries
; nsw. The i8 load and store both declare align 2.
575  %acc = load i8, i8 addrspace(1)* %dst, align 2
576  %add1 = add i8 %mul1, %acc
577  %add2 = add i8 %add1, %mul2
578  %add3 = add i8 %add2, %mul3
579  %add4 = add nsw i8 %add3, %mul4
580  store i8 %add4, i8 addrspace(1)* %dst, align 2
581  ret void
582}
583
584define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
585; GFX7-LABEL: idot4_multiuse_mul1:
586; GFX7:       ; %bb.0: ; %entry
587; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
588; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
589; GFX7-NEXT:    s_mov_b32 s3, 0xf000
590; GFX7-NEXT:    s_mov_b32 s10, 0
591; GFX7-NEXT:    s_mov_b32 s11, s3
592; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
593; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
594; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
595; GFX7-NEXT:    v_mov_b32_e32 v1, 0
596; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
597; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
598; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
599; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
600; GFX7-NEXT:    s_mov_b32 s2, -1
601; GFX7-NEXT:    s_waitcnt vmcnt(1)
602; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
603; GFX7-NEXT:    v_bfe_i32 v3, v2, 8, 8
604; GFX7-NEXT:    s_waitcnt vmcnt(0)
605; GFX7-NEXT:    v_bfe_i32 v5, v0, 0, 8
606; GFX7-NEXT:    v_bfe_i32 v6, v0, 8, 8
607; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
608; GFX7-NEXT:    v_mad_i32_i24 v8, v1, v5, s4
609; GFX7-NEXT:    v_mad_i32_i24 v3, v3, v6, v8
610; GFX7-NEXT:    v_bfe_i32 v4, v2, 16, 8
611; GFX7-NEXT:    v_bfe_i32 v7, v0, 16, 8
612; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v5, v3
613; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
614; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
615; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v7, v1
616; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
617; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
618; GFX7-NEXT:    s_endpgm
619;
620; GFX8-LABEL: idot4_multiuse_mul1:
621; GFX8:       ; %bb.0: ; %entry
622; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
623; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
624; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
625; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
626; GFX8-NEXT:    v_mov_b32_e32 v1, s5
627; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
628; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
629; GFX8-NEXT:    flat_load_dword v3, v[0:1]
630; GFX8-NEXT:    v_mov_b32_e32 v1, s7
631; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
632; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
633; GFX8-NEXT:    flat_load_dword v0, v[0:1]
634; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
635; GFX8-NEXT:    s_waitcnt vmcnt(1)
636; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
637; GFX8-NEXT:    v_bfe_i32 v4, v3, 8, 8
638; GFX8-NEXT:    v_bfe_i32 v6, v3, 16, 8
639; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 24, v3
640; GFX8-NEXT:    s_waitcnt vmcnt(0)
641; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 8
642; GFX8-NEXT:    v_bfe_i32 v5, v0, 8, 8
643; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
644; GFX8-NEXT:    v_mad_i32_i24 v8, v1, v2, s2
645; GFX8-NEXT:    v_mad_i32_i24 v4, v4, v5, v8
646; GFX8-NEXT:    v_bfe_i32 v7, v0, 16, 8
647; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, v4
648; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
649; GFX8-NEXT:    v_mad_i32_i24 v1, v6, v7, v1
650; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
651; GFX8-NEXT:    v_mov_b32_e32 v0, s0
652; GFX8-NEXT:    v_mov_b32_e32 v1, s1
653; GFX8-NEXT:    flat_store_dword v[0:1], v2
654; GFX8-NEXT:    s_endpgm
655;
656; GFX9-NODL-LABEL: idot4_multiuse_mul1:
657; GFX9-NODL:       ; %bb.0: ; %entry
658; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
659; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
660; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
661; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
662; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
663; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
664; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
665; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
666; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
667; GFX9-NODL-NEXT:    v_bfe_i32 v3, v1, 0, 8
668; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
669; GFX9-NODL-NEXT:    v_bfe_i32 v4, v2, 0, 8
670; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
671; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
672; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
673; GFX9-NODL-NEXT:    v_mul_i32_i24_e32 v2, v3, v4
674; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
675; GFX9-NODL-NEXT:    v_mad_i32_i24 v3, v3, v4, s0
676; GFX9-NODL-NEXT:    v_add3_u32 v2, v5, v3, v2
677; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v6, v1
678; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
679; GFX9-NODL-NEXT:    s_endpgm
680;
681; GFX9-DL-LABEL: idot4_multiuse_mul1:
682; GFX9-DL:       ; %bb.0: ; %entry
683; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
684; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
685; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
686; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
687; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
688; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
689; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
690; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
691; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
692; GFX9-DL-NEXT:    v_bfe_i32 v3, v1, 0, 8
693; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
694; GFX9-DL-NEXT:    v_bfe_i32 v4, v2, 0, 8
695; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
696; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
697; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
698; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v2, v3, v4
699; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
700; GFX9-DL-NEXT:    v_mad_i32_i24 v3, v3, v4, s0
701; GFX9-DL-NEXT:    v_add3_u32 v2, v5, v3, v2
702; GFX9-DL-NEXT:    v_add3_u32 v1, v2, v6, v1
703; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
704; GFX9-DL-NEXT:    s_endpgm
705;
706; GFX10-DL-LABEL: idot4_multiuse_mul1:
707; GFX10-DL:       ; %bb.0: ; %entry
708; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
709; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
710; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
711; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
712; GFX10-DL-NEXT:    s_clause 0x1
713; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
714; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
715; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
716; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
717; GFX10-DL-NEXT:    v_bfe_i32 v0, v1, 0, 8
718; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
719; GFX10-DL-NEXT:    v_bfe_i32 v3, v2, 0, 8
720; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
721; GFX10-DL-NEXT:    v_mul_i32_i24_e32 v5, v0, v3
722; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
723; GFX10-DL-NEXT:    v_mad_i32_i24 v0, v0, v3, s2
724; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
725; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
726; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
727; GFX10-DL-NEXT:    v_add3_u32 v0, v4, v0, v5
728; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
729; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
730; GFX10-DL-NEXT:    s_endpgm
731                                               <4 x i8> addrspace(1)* %src2,
732                                               i32 addrspace(1)* nocapture %dst) {
733entry:
734  %idx = call i32 @llvm.amdgcn.workitem.id.x()
735  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
736  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
737  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
738  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
739
740  %v1e0 = extractelement <4 x i8> %vec1, i64 0
741  %cv1e0 = sext i8 %v1e0 to i32
742  %v2e0 = extractelement <4 x i8> %vec2, i64 0
743  %cv2e0 = sext i8 %v2e0 to i32
744  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
745
746  %v1e1 = extractelement <4 x i8> %vec1, i64 1
747  %cv1e1 = sext i8 %v1e1 to i32
748  %v2e1 = extractelement <4 x i8> %vec2, i64 1
749  %cv2e1 = sext i8 %v2e1 to i32
750  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
751
752  %v1e2 = extractelement <4 x i8> %vec1, i64 2
753  %cv1e2 = sext i8 %v1e2 to i32
754  %v2e2 = extractelement <4 x i8> %vec2, i64 2
755  %cv2e2 = sext i8 %v2e2 to i32
756  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
757
758  %v1e3 = extractelement <4 x i8> %vec1, i64 3
759  %cv1e3 = sext i8 %v1e3 to i32
760  %v2e3 = extractelement <4 x i8> %vec2, i64 3
761  %cv2e3 = sext i8 %v2e3 to i32
762  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
763
764  %acc = load i32, i32 addrspace(1)* %dst, align 4
765  %add = add i32 %mul1, %acc
766  %add1 = add i32 %mul2, %add
767  %add2 = add i32 %add1, %mul1
768  %add3 = add i32 %add2, %mul3
769  %add4 = add i32 %add3, %mul4
770
771  store i32 %add4, i32 addrspace(1)* %dst, align 4
772  ret void
773}
774
775; TODO: Support this pattern.
; idot4_acc32_vecMul: signed 4 x i8 dot product accumulated into an i32, with
; the multiply written as a single <4 x i32> vector mul whose lanes are then
; extracted and summed (rather than four scalar sext/mul pairs).
; The assertions below currently expect separate multiply/add (or mad)
; instructions for every RUN line, i.e. the pattern is not folded into a single
; dot-product instruction yet. NOTE(review): presumably the TODO above refers to
; selecting a dot4 instruction on the dot-capable subtargets -- confirm.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the script instead of editing them by hand.
776define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
777; GFX7-LABEL: idot4_acc32_vecMul:
778; GFX7:       ; %bb.0: ; %entry
779; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
780; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
781; GFX7-NEXT:    s_mov_b32 s3, 0xf000
782; GFX7-NEXT:    s_mov_b32 s10, 0
783; GFX7-NEXT:    s_mov_b32 s11, s3
784; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
785; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
786; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
787; GFX7-NEXT:    v_mov_b32_e32 v1, 0
788; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
789; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
790; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
791; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
792; GFX7-NEXT:    s_mov_b32 s2, -1
793; GFX7-NEXT:    s_waitcnt vmcnt(1)
794; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 24, v2
795; GFX7-NEXT:    v_bfe_i32 v3, v2, 16, 8
796; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 8
797; GFX7-NEXT:    v_bfe_i32 v2, v2, 0, 8
798; GFX7-NEXT:    s_waitcnt vmcnt(0)
799; GFX7-NEXT:    v_ashrrev_i32_e32 v5, 24, v0
800; GFX7-NEXT:    v_bfe_i32 v6, v0, 16, 8
801; GFX7-NEXT:    v_bfe_i32 v7, v0, 8, 8
802; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 8
803; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
804; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, s4
805; GFX7-NEXT:    v_mad_i32_i24 v0, v4, v7, v0
806; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v6, v0
807; GFX7-NEXT:    v_mad_i32_i24 v0, v1, v5, v0
808; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
809; GFX7-NEXT:    s_endpgm
810;
811; GFX8-LABEL: idot4_acc32_vecMul:
812; GFX8:       ; %bb.0: ; %entry
813; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
814; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
815; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
816; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
817; GFX8-NEXT:    v_mov_b32_e32 v1, s5
818; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
819; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
820; GFX8-NEXT:    flat_load_dword v3, v[0:1]
821; GFX8-NEXT:    v_mov_b32_e32 v1, s7
822; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
823; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
824; GFX8-NEXT:    flat_load_dword v0, v[0:1]
825; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
826; GFX8-NEXT:    s_waitcnt vmcnt(1)
827; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 8, v3
828; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 24, v3
829; GFX8-NEXT:    v_bfe_i32 v5, v3, 16, 8
830; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 8
831; GFX8-NEXT:    v_bfe_i32 v1, v1, 0, 8
832; GFX8-NEXT:    s_waitcnt vmcnt(0)
833; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 8, v0
834; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 24, v0
835; GFX8-NEXT:    v_bfe_i32 v7, v0, 16, 8
836; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 8
837; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 8
838; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
839; GFX8-NEXT:    v_mad_i32_i24 v0, v3, v0, s2
840; GFX8-NEXT:    v_mad_i32_i24 v0, v1, v2, v0
841; GFX8-NEXT:    v_mad_i32_i24 v0, v5, v7, v0
842; GFX8-NEXT:    v_mad_i32_i24 v2, v4, v6, v0
843; GFX8-NEXT:    v_mov_b32_e32 v0, s0
844; GFX8-NEXT:    v_mov_b32_e32 v1, s1
845; GFX8-NEXT:    flat_store_dword v[0:1], v2
846; GFX8-NEXT:    s_endpgm
847;
848; GFX9-NODL-LABEL: idot4_acc32_vecMul:
849; GFX9-NODL:       ; %bb.0: ; %entry
850; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
851; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
852; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
853; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
854; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
855; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
856; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
857; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
858; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
859; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v3, 8, v1
860; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
861; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v4, 8, v2
862; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
863; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v3), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
864; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
865; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
866; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
867; GFX9-NODL-NEXT:    v_add3_u32 v2, v5, s0, v3
868; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v6, v1
869; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
870; GFX9-NODL-NEXT:    s_endpgm
871;
872; GFX9-DL-LABEL: idot4_acc32_vecMul:
873; GFX9-DL:       ; %bb.0: ; %entry
874; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
875; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
876; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
877; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
878; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
879; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
880; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
881; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
882; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
883; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v3, 8, v1
884; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
885; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v4, 8, v2
886; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
887; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v3), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
888; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
889; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
890; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
891; GFX9-DL-NEXT:    v_add3_u32 v2, v5, s0, v3
892; GFX9-DL-NEXT:    v_add3_u32 v1, v2, v6, v1
893; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
894; GFX9-DL-NEXT:    s_endpgm
895;
896; GFX10-DL-LABEL: idot4_acc32_vecMul:
897; GFX10-DL:       ; %bb.0: ; %entry
898; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
899; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
900; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
901; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
902; GFX10-DL-NEXT:    s_clause 0x1
903; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
904; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
905; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
906; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
907; GFX10-DL-NEXT:    v_lshrrev_b16 v0, 8, v1
908; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
909; GFX10-DL-NEXT:    v_lshrrev_b16 v3, 8, v2
910; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
911; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v0, sext(v0), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
912; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
913; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
914; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
915; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
916; GFX10-DL-NEXT:    v_add3_u32 v0, v4, s2, v0
917; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
918; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
919; GFX10-DL-NEXT:    s_endpgm
920                                              <4 x i8> addrspace(1)* %src2,
921                                              i32 addrspace(1)* nocapture %dst) {
922entry:
  ; Each workitem reads one <4 x i8> element from %src1 and one from %src2,
  ; both indexed by its x workitem id.
923  %idx = call i32 @llvm.amdgcn.workitem.id.x()
924  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
925  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
926  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
927  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
928
  ; Sign-extend whole vectors first; this is what distinguishes the test from
  ; the per-element extract + sext form.
929  %cvec1 = sext <4 x i8> %vec1 to <4 x i32>
930  %cvec2 = sext <4 x i8> %vec2 to <4 x i32>
931
  ; One lane-wise vector multiply, then pull out the four i32 products.
932  %mul = mul <4 x i32> %cvec1, %cvec2
933  %mul0 = extractelement <4 x i32> %mul, i64 0
934  %mul1 = extractelement <4 x i32> %mul, i64 1
935  %mul2 = extractelement <4 x i32> %mul, i64 2
936  %mul3 = extractelement <4 x i32> %mul, i64 3
937
  ; The accumulator is read from %dst (the same address for every workitem),
  ; the four products are added in lane order, and the sum is written back.
938  %acc = load i32, i32 addrspace(1)* %dst, align 4
939  %add1 = add i32 %mul0, %acc
940  %add2 = add i32 %add1, %mul1
941  %add3 = add i32 %add2, %mul2
942  %add4 = add i32 %add3, %mul3
943
944  store i32 %add4, i32 addrspace(1)* %dst, align 4
945  ret void
946}
947
; idot4_acc16_vecMul: signed 4 x i8 dot product accumulated into an i16. The
; inputs are sign-extended to <4 x i16>, multiplied with one vector mul, and
; the extracted lanes are summed into an i16 accumulator loaded from / stored
; to %dst.
; The assertions below expect 16-bit arithmetic for every RUN line (mad chains
; on the two older subtargets, packed 16-bit multiplies followed by 16-bit adds
; on the newer ones); none of them expects a dot-product instruction.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the script instead of editing them by hand.
948define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
949; GFX7-LABEL: idot4_acc16_vecMul:
950; GFX7:       ; %bb.0: ; %entry
951; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
952; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
953; GFX7-NEXT:    s_mov_b32 s3, 0xf000
954; GFX7-NEXT:    s_mov_b32 s10, 0
955; GFX7-NEXT:    s_mov_b32 s11, s3
956; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
957; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
958; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
959; GFX7-NEXT:    v_mov_b32_e32 v1, 0
960; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
961; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
962; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
963; GFX7-NEXT:    s_mov_b32 s2, -1
964; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
965; GFX7-NEXT:    s_waitcnt vmcnt(2)
966; GFX7-NEXT:    v_bfe_i32 v3, v2, 8, 8
967; GFX7-NEXT:    v_bfe_i32 v4, v2, 0, 8
968; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
969; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
970; GFX7-NEXT:    s_waitcnt vmcnt(1)
971; GFX7-NEXT:    v_bfe_i32 v6, v0, 8, 8
972; GFX7-NEXT:    v_bfe_i32 v7, v0, 0, 8
973; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
974; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
975; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v7
976; GFX7-NEXT:    v_bfe_i32 v8, v0, 16, 8
977; GFX7-NEXT:    v_or_b32_e32 v4, v6, v4
978; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v8
979; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
980; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
981; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
982; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
983; GFX7-NEXT:    v_bfe_i32 v5, v2, 16, 8
984; GFX7-NEXT:    s_waitcnt vmcnt(0)
985; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
986; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 24, v2
987; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v5
988; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
989; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v8, v1
990; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
991; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
992; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v7, v1
993; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
994; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
995; GFX7-NEXT:    s_endpgm
996;
997; GFX8-LABEL: idot4_acc16_vecMul:
998; GFX8:       ; %bb.0: ; %entry
999; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1000; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1001; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1002; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1003; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1004; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1005; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1006; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1007; GFX8-NEXT:    v_mov_b32_e32 v1, s7
1008; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
1009; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1010; GFX8-NEXT:    flat_load_dword v2, v[0:1]
1011; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1012; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1013; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
1014; GFX8-NEXT:    s_waitcnt vmcnt(2)
1015; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
1016; GFX8-NEXT:    v_ashrrev_i16_e32 v7, 8, v3
1017; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 8
1018; GFX8-NEXT:    v_ashrrev_i16_e32 v9, 8, v5
1019; GFX8-NEXT:    v_bfe_i32 v5, v5, 0, 8
1020; GFX8-NEXT:    s_waitcnt vmcnt(1)
1021; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
1022; GFX8-NEXT:    v_ashrrev_i16_e32 v8, 8, v2
1023; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 8
1024; GFX8-NEXT:    s_waitcnt vmcnt(0)
1025; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
1026; GFX8-NEXT:    v_ashrrev_i16_e32 v10, 8, v6
1027; GFX8-NEXT:    v_bfe_i32 v6, v6, 0, 8
1028; GFX8-NEXT:    v_mad_u16 v2, v7, v8, v2
1029; GFX8-NEXT:    v_mad_u16 v2, v5, v6, v2
1030; GFX8-NEXT:    v_mad_u16 v2, v9, v10, v2
1031; GFX8-NEXT:    flat_store_short v[0:1], v2
1032; GFX8-NEXT:    s_endpgm
1033;
1034; GFX9-NODL-LABEL: idot4_acc16_vecMul:
1035; GFX9-NODL:       ; %bb.0: ; %entry
1036; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1037; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1038; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1039; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, 0xffff
1040; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1041; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
1042; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
1043; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1044; GFX9-NODL-NEXT:    global_load_ushort v3, v0, s[2:3]
1045; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
1046; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
1047; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
1048; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
1049; GFX9-NODL-NEXT:    v_ashrrev_i16_e32 v7, 8, v1
1050; GFX9-NODL-NEXT:    v_ashrrev_i16_e32 v8, 8, v2
1051; GFX9-NODL-NEXT:    v_and_b32_sdwa v2, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1052; GFX9-NODL-NEXT:    v_and_b32_sdwa v1, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1053; GFX9-NODL-NEXT:    v_lshl_or_b32 v2, v8, 16, v2
1054; GFX9-NODL-NEXT:    v_lshl_or_b32 v1, v7, 16, v1
1055; GFX9-NODL-NEXT:    v_ashrrev_i16_e32 v9, 8, v5
1056; GFX9-NODL-NEXT:    v_ashrrev_i16_e32 v10, 8, v6
1057; GFX9-NODL-NEXT:    v_and_b32_sdwa v6, v4, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1058; GFX9-NODL-NEXT:    v_and_b32_sdwa v4, v4, sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1059; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
1060; GFX9-NODL-NEXT:    v_lshl_or_b32 v5, v10, 16, v6
1061; GFX9-NODL-NEXT:    v_lshl_or_b32 v4, v9, 16, v4
1062; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1063; GFX9-NODL-NEXT:    v_add_u16_e32 v3, v1, v3
1064; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v2, v4, v5
1065; GFX9-NODL-NEXT:    v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1066; GFX9-NODL-NEXT:    v_add_u16_e32 v1, v1, v2
1067; GFX9-NODL-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1068; GFX9-NODL-NEXT:    global_store_short v0, v1, s[2:3]
1069; GFX9-NODL-NEXT:    s_endpgm
1070;
1071; GFX9-DL-LABEL: idot4_acc16_vecMul:
1072; GFX9-DL:       ; %bb.0: ; %entry
1073; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1074; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1075; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1076; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 0xffff
1077; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1078; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1079; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1080; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1081; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
1082; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
1083; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
1084; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1085; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
1086; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 8, v1
1087; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v8, 8, v2
1088; GFX9-DL-NEXT:    v_and_b32_sdwa v2, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1089; GFX9-DL-NEXT:    v_and_b32_sdwa v1, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1090; GFX9-DL-NEXT:    v_lshl_or_b32 v2, v8, 16, v2
1091; GFX9-DL-NEXT:    v_lshl_or_b32 v1, v7, 16, v1
1092; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v9, 8, v5
1093; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v10, 8, v6
1094; GFX9-DL-NEXT:    v_and_b32_sdwa v6, v4, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1095; GFX9-DL-NEXT:    v_and_b32_sdwa v4, v4, sext(v5) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1096; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
1097; GFX9-DL-NEXT:    v_lshl_or_b32 v5, v10, 16, v6
1098; GFX9-DL-NEXT:    v_lshl_or_b32 v4, v9, 16, v4
1099; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1100; GFX9-DL-NEXT:    v_add_u16_e32 v3, v1, v3
1101; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v4, v5
1102; GFX9-DL-NEXT:    v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1103; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v2
1104; GFX9-DL-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1105; GFX9-DL-NEXT:    global_store_short v0, v1, s[2:3]
1106; GFX9-DL-NEXT:    s_endpgm
1107;
1108; GFX10-DL-LABEL: idot4_acc16_vecMul:
1109; GFX10-DL:       ; %bb.0: ; %entry
1110; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1111; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1112; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1113; GFX10-DL-NEXT:    v_mov_b32_e32 v4, 0xffff
1114; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1115; GFX10-DL-NEXT:    s_clause 0x1
1116; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1117; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1118; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
1119; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[0:1]
1120; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
1121; GFX10-DL-NEXT:    v_ashrrev_i16 v5, 8, v1
1122; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1123; GFX10-DL-NEXT:    v_ashrrev_i16 v6, 8, v2
1124; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1125; GFX10-DL-NEXT:    v_and_b32_sdwa v8, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1126; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1127; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1128; GFX10-DL-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
1129; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v5, 16, v8
1130; GFX10-DL-NEXT:    v_ashrrev_i16 v7, 8, v1
1131; GFX10-DL-NEXT:    v_ashrrev_i16 v8, 8, v2
1132; GFX10-DL-NEXT:    v_and_b32_sdwa v2, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1133; GFX10-DL-NEXT:    v_and_b32_sdwa v1, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1134; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v5, v6
1135; GFX10-DL-NEXT:    v_lshl_or_b32 v2, v8, 16, v2
1136; GFX10-DL-NEXT:    v_lshl_or_b32 v1, v7, 16, v1
1137; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
1138; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1139; GFX10-DL-NEXT:    v_add_nc_u16 v3, v4, v3
1140; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
1141; GFX10-DL-NEXT:    v_add_nc_u16 v2, v3, v5
1142; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
1143; GFX10-DL-NEXT:    v_add_nc_u16 v1, v2, v1
1144; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v3
1145; GFX10-DL-NEXT:    global_store_short v0, v1, s[0:1]
1146; GFX10-DL-NEXT:    s_endpgm
1147                                              <4 x i8> addrspace(1)* %src2,
1148                                              i16 addrspace(1)* nocapture %dst) {
1149entry:
  ; Each workitem reads one <4 x i8> element from %src1 and one from %src2,
  ; both indexed by its x workitem id.
1150  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1151  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
1152  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
1153  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
1154  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
1155
  ; Sign-extend only to i16, so the multiply and the adds below are all done
  ; at 16-bit width.
1156  %cvec1 = sext <4 x i8> %vec1 to <4 x i16>
1157  %cvec2 = sext <4 x i8> %vec2 to <4 x i16>
1158
  ; One lane-wise vector multiply, then pull out the four i16 products.
1159  %mul = mul <4 x i16> %cvec1, %cvec2
1160  %mul0 = extractelement <4 x i16> %mul, i64 0
1161  %mul1 = extractelement <4 x i16> %mul, i64 1
1162  %mul2 = extractelement <4 x i16> %mul, i64 2
1163  %mul3 = extractelement <4 x i16> %mul, i64 3
1164
  ; i16 accumulator read from and written back to %dst. Note the access is
  ; declared with align 4 even though only two bytes are touched.
1165  %acc = load i16, i16 addrspace(1)* %dst, align 4
1166  %add1 = add i16 %mul0, %acc
1167  %add2 = add i16 %add1, %mul1
1168  %add3 = add i16 %add2, %mul2
1169  %add4 = add i16 %add3, %mul3
1170
1171  store i16 %add4, i16 addrspace(1)* %dst, align 4
1172  ret void
1173}
1174
; AMDGPU intrinsic called by the kernels in this file to obtain the per-workitem
; index (x dimension) used to address the %src1 / %src2 element each one loads.
1175declare i32 @llvm.amdgcn.workitem.id.x()
1176