1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
8
; udot8_acc32: each work-item loads one <8 x i4> from %src1 and one from %src2
; (both indexed by workitem.id.x), zero-extends the eight 4-bit lanes to i32,
; multiplies them pairwise and adds the eight products to the i32 loaded from
; %dst, storing the sum back to %dst.
; Per the assertions below, gfx906 and gfx1011/gfx1012 fold the whole chain into
; a single v_dot8_u32_u4, gfx700 and gfx803 emit eight v_mad_u32_u24, and gfx900
; emits v_mul_u32_u24 plus v_add3_u32.
9define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1,
10; GFX7-LABEL: udot8_acc32:
11; GFX7:       ; %bb.0: ; %entry
12; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
13; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
14; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
15; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
16; GFX7-NEXT:    s_mov_b32 s14, -1
17; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
18; GFX7-NEXT:    s_add_u32 s12, s12, s3
19; GFX7-NEXT:    s_mov_b32 s3, 0xf000
20; GFX7-NEXT:    s_mov_b32 s10, 0
21; GFX7-NEXT:    s_mov_b32 s11, s3
22; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
23; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
24; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
25; GFX7-NEXT:    v_mov_b32_e32 v1, 0
26; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
27; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
28; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
29; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
30; GFX7-NEXT:    s_mov_b32 s2, -1
31; GFX7-NEXT:    s_addc_u32 s13, s13, 0
32; GFX7-NEXT:    s_waitcnt vmcnt(1)
33; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 28, v2
34; GFX7-NEXT:    v_bfe_u32 v3, v2, 24, 4
35; GFX7-NEXT:    v_bfe_u32 v4, v2, 20, 4
36; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 4
37; GFX7-NEXT:    v_bfe_u32 v6, v2, 12, 4
38; GFX7-NEXT:    v_bfe_u32 v7, v2, 8, 4
39; GFX7-NEXT:    v_bfe_u32 v8, v2, 4, 4
40; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
41; GFX7-NEXT:    s_waitcnt vmcnt(0)
42; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
43; GFX7-NEXT:    v_bfe_u32 v10, v0, 24, 4
44; GFX7-NEXT:    v_bfe_u32 v11, v0, 20, 4
45; GFX7-NEXT:    v_bfe_u32 v12, v0, 16, 4
46; GFX7-NEXT:    v_bfe_u32 v13, v0, 12, 4
47; GFX7-NEXT:    v_bfe_u32 v14, v0, 8, 4
48; GFX7-NEXT:    v_bfe_u32 v15, v0, 4, 4
49; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
50; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
51; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, s4
52; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
53; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
54; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
55; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
56; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
57; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
58; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v9, v0
59; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
60; GFX7-NEXT:    s_endpgm
61;
62; GFX8-LABEL: udot8_acc32:
63; GFX8:       ; %bb.0: ; %entry
64; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
65; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
66; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
67; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
68; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
69; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
70; GFX8-NEXT:    v_mov_b32_e32 v1, s5
71; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
72; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
73; GFX8-NEXT:    flat_load_dword v3, v[0:1]
74; GFX8-NEXT:    v_mov_b32_e32 v1, s7
75; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
76; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
77; GFX8-NEXT:    flat_load_dword v0, v[0:1]
78; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
79; GFX8-NEXT:    s_mov_b32 s10, -1
80; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
81; GFX8-NEXT:    s_add_u32 s8, s8, s3
82; GFX8-NEXT:    s_addc_u32 s9, s9, 0
83; GFX8-NEXT:    s_waitcnt vmcnt(1)
84; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 28, v3
85; GFX8-NEXT:    v_bfe_u32 v2, v3, 24, 4
86; GFX8-NEXT:    v_bfe_u32 v4, v3, 20, 4
87; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 4
88; GFX8-NEXT:    v_bfe_u32 v6, v3, 12, 4
89; GFX8-NEXT:    v_bfe_u32 v7, v3, 8, 4
90; GFX8-NEXT:    v_bfe_u32 v8, v3, 4, 4
91; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
92; GFX8-NEXT:    s_waitcnt vmcnt(0)
93; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
94; GFX8-NEXT:    v_bfe_u32 v10, v0, 24, 4
95; GFX8-NEXT:    v_bfe_u32 v11, v0, 20, 4
96; GFX8-NEXT:    v_bfe_u32 v12, v0, 16, 4
97; GFX8-NEXT:    v_bfe_u32 v13, v0, 12, 4
98; GFX8-NEXT:    v_bfe_u32 v14, v0, 8, 4
99; GFX8-NEXT:    v_bfe_u32 v15, v0, 4, 4
100; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
101; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
102; GFX8-NEXT:    v_mad_u32_u24 v0, v3, v0, s2
103; GFX8-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
104; GFX8-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
105; GFX8-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
106; GFX8-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
107; GFX8-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
108; GFX8-NEXT:    v_mad_u32_u24 v0, v2, v10, v0
109; GFX8-NEXT:    v_mad_u32_u24 v2, v1, v9, v0
110; GFX8-NEXT:    v_mov_b32_e32 v0, s0
111; GFX8-NEXT:    v_mov_b32_e32 v1, s1
112; GFX8-NEXT:    flat_store_dword v[0:1], v2
113; GFX8-NEXT:    s_endpgm
114;
115; GFX9-LABEL: udot8_acc32:
116; GFX9:       ; %bb.0: ; %entry
117; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
118; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
119; GFX9-NEXT:    s_mov_b32 s10, -1
120; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
121; GFX9-NEXT:    s_add_u32 s8, s8, s3
122; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
123; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
124; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
125; GFX9-NEXT:    s_addc_u32 s9, s9, 0
126; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
127; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
128; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
129; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
130; GFX9-NEXT:    v_mov_b32_e32 v0, 0
131; GFX9-NEXT:    s_waitcnt vmcnt(1)
132; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 28, v1
133; GFX9-NEXT:    v_bfe_u32 v4, v1, 24, 4
134; GFX9-NEXT:    v_bfe_u32 v5, v1, 20, 4
135; GFX9-NEXT:    v_bfe_u32 v6, v1, 16, 4
136; GFX9-NEXT:    v_bfe_u32 v7, v1, 12, 4
137; GFX9-NEXT:    v_bfe_u32 v8, v1, 8, 4
138; GFX9-NEXT:    v_bfe_u32 v9, v1, 4, 4
139; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
140; GFX9-NEXT:    s_waitcnt vmcnt(0)
141; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
142; GFX9-NEXT:    v_bfe_u32 v11, v2, 24, 4
143; GFX9-NEXT:    v_bfe_u32 v12, v2, 20, 4
144; GFX9-NEXT:    v_bfe_u32 v13, v2, 16, 4
145; GFX9-NEXT:    v_bfe_u32 v14, v2, 12, 4
146; GFX9-NEXT:    v_bfe_u32 v15, v2, 8, 4
147; GFX9-NEXT:    v_bfe_u32 v16, v2, 4, 4
148; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
149; GFX9-NEXT:    v_mul_u32_u24_e32 v1, v1, v2
150; GFX9-NEXT:    v_mul_u32_u24_e32 v2, v9, v16
151; GFX9-NEXT:    v_mul_u32_u24_e32 v8, v8, v15
152; GFX9-NEXT:    v_mul_u32_u24_e32 v7, v7, v14
153; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
154; GFX9-NEXT:    v_add3_u32 v1, v1, s0, v2
155; GFX9-NEXT:    v_mul_u32_u24_e32 v6, v6, v13
156; GFX9-NEXT:    v_mul_u32_u24_e32 v5, v5, v12
157; GFX9-NEXT:    v_add3_u32 v1, v1, v8, v7
158; GFX9-NEXT:    v_mul_u32_u24_e32 v4, v4, v11
159; GFX9-NEXT:    v_mul_u32_u24_e32 v3, v3, v10
160; GFX9-NEXT:    v_add3_u32 v1, v1, v6, v5
161; GFX9-NEXT:    v_add3_u32 v1, v1, v4, v3
162; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
163; GFX9-NEXT:    s_endpgm
164;
165; GFX9-DL-LABEL: udot8_acc32:
166; GFX9-DL:       ; %bb.0: ; %entry
167; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
168; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
169; GFX9-DL-NEXT:    s_mov_b32 s10, -1
170; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
171; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
172; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
173; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
174; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
175; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
176; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
177; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
178; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
179; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
180; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
181; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
182; GFX9-DL-NEXT:    v_dot8_u32_u4 v1, v1, v2, s0
183; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
184; GFX9-DL-NEXT:    s_endpgm
185;
186; GFX10-DL-LABEL: udot8_acc32:
187; GFX10-DL:       ; %bb.0: ; %entry
188; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
189; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
190; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
191; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
192; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
193; GFX10-DL-NEXT:    s_mov_b32 s10, -1
194; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
195; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
196; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
197; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
198; GFX10-DL-NEXT:    s_clause 0x1
199; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
200; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
201; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
202; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
203; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
204; GFX10-DL-NEXT:    v_dot8_u32_u4 v1, v1, v2, s2
205; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
206; GFX10-DL-NEXT:    s_endpgm
207                                       <8 x i4> addrspace(1)* %src2,
208                                       i32 addrspace(1)* nocapture %dst) {
209entry:
210  %idx = call i32 @llvm.amdgcn.workitem.id.x()
211  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
212  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
213  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
214  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
215
; For each of the eight lanes: extract the i4 element from both vectors,
; zero-extend it to i32 and multiply the pair.
216  %v1e0 = extractelement <8 x i4> %vec1, i64 0
217  %cv1e0 = zext i4 %v1e0 to i32
218  %v2e0 = extractelement <8 x i4> %vec2, i64 0
219  %cv2e0 = zext i4 %v2e0 to i32
220  %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0
221
222  %v1e1 = extractelement <8 x i4> %vec1, i64 1
223  %cv1e1 = zext i4 %v1e1 to i32
224  %v2e1 = extractelement <8 x i4> %vec2, i64 1
225  %cv2e1 = zext i4 %v2e1 to i32
226  %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1
227
228  %v1e2 = extractelement <8 x i4> %vec1, i64 2
229  %cv1e2 = zext i4 %v1e2 to i32
230  %v2e2 = extractelement <8 x i4> %vec2, i64 2
231  %cv2e2 = zext i4 %v2e2 to i32
232  %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2
233
234  %v1e3 = extractelement <8 x i4> %vec1, i64 3
235  %cv1e3 = zext i4 %v1e3 to i32
236  %v2e3 = extractelement <8 x i4> %vec2, i64 3
237  %cv2e3 = zext i4 %v2e3 to i32
238  %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3
239
240  %v1e4 = extractelement <8 x i4> %vec1, i64 4
241  %cv1e4 = zext i4 %v1e4 to i32
242  %v2e4 = extractelement <8 x i4> %vec2, i64 4
243  %cv2e4 = zext i4 %v2e4 to i32
244  %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4
245
246  %v1e5 = extractelement <8 x i4> %vec1, i64 5
247  %cv1e5 = zext i4 %v1e5 to i32
248  %v2e5 = extractelement <8 x i4> %vec2, i64 5
249  %cv2e5 = zext i4 %v2e5 to i32
250  %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5
251
252  %v1e6 = extractelement <8 x i4> %vec1, i64 6
253  %cv1e6 = zext i4 %v1e6 to i32
254  %v2e6 = extractelement <8 x i4> %vec2, i64 6
255  %cv2e6 = zext i4 %v2e6 to i32
256  %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6
257
258  %v1e7 = extractelement <8 x i4> %vec1, i64 7
259  %cv1e7 = zext i4 %v1e7 to i32
260  %v2e7 = extractelement <8 x i4> %vec2, i64 7
261  %cv2e7 = zext i4 %v2e7 to i32
262  %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
263
; Chain the eight products onto the i32 accumulator loaded from %dst, lane 0
; first, then write the final sum back to %dst.
264  %acc = load i32, i32 addrspace(1)* %dst, align 4
265  %add1 = add i32 %mul0, %acc
266  %add2 = add i32 %add1, %mul1
267  %add3 = add i32 %add2, %mul2
268  %add4 = add i32 %add3, %mul3
269  %add5 = add i32 %add4, %mul4
270  %add6 = add i32 %add5, %mul5
271  %add7 = add i32 %add6, %mul6
272  %add8 = add i32 %add7, %mul7
273
274  store i32 %add8, i32 addrspace(1)* %dst, align 4
275  ret void
276}
277
278; TODO: Remove the unnecessary instruction (the one zero-extending the
279; 2nd MAD) so that the pattern recognizer can kick in.
; udot8_acc16: each work-item loads one <8 x i4> from %src1 and one from %src2
; (both indexed by workitem.id.x), zero-extends the eight 4-bit lanes to i16,
; multiplies them pairwise and adds the eight products to the i16 loaded from
; %dst, storing the sum back to %dst.
; Per the assertions below, no target selects v_dot8_u32_u4 for this i16 form;
; gfx700 emits v_mad_u32_u24, gfx803 and gfx1011/gfx1012 emit v_mad_u16, and
; gfx900 and gfx906 emit v_mad_legacy_u16.
280define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
281; GFX7-LABEL: udot8_acc16:
282; GFX7:       ; %bb.0: ; %entry
283; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
284; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
285; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
286; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
287; GFX7-NEXT:    s_mov_b32 s14, -1
288; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
289; GFX7-NEXT:    s_add_u32 s12, s12, s3
290; GFX7-NEXT:    s_mov_b32 s3, 0xf000
291; GFX7-NEXT:    s_mov_b32 s10, 0
292; GFX7-NEXT:    s_mov_b32 s11, s3
293; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
294; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
295; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
296; GFX7-NEXT:    v_mov_b32_e32 v1, 0
297; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
298; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
299; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
300; GFX7-NEXT:    s_mov_b32 s2, -1
301; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
302; GFX7-NEXT:    s_addc_u32 s13, s13, 0
303; GFX7-NEXT:    s_waitcnt vmcnt(2)
304; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 28, v2
305; GFX7-NEXT:    v_bfe_u32 v4, v2, 24, 4
306; GFX7-NEXT:    v_bfe_u32 v5, v2, 20, 4
307; GFX7-NEXT:    v_bfe_u32 v6, v2, 16, 4
308; GFX7-NEXT:    v_bfe_u32 v7, v2, 12, 4
309; GFX7-NEXT:    v_bfe_u32 v8, v2, 8, 4
310; GFX7-NEXT:    v_bfe_u32 v9, v2, 4, 4
311; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
312; GFX7-NEXT:    s_waitcnt vmcnt(1)
313; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 28, v0
314; GFX7-NEXT:    v_bfe_u32 v11, v0, 24, 4
315; GFX7-NEXT:    v_bfe_u32 v12, v0, 20, 4
316; GFX7-NEXT:    v_bfe_u32 v13, v0, 16, 4
317; GFX7-NEXT:    v_bfe_u32 v14, v0, 12, 4
318; GFX7-NEXT:    v_bfe_u32 v15, v0, 8, 4
319; GFX7-NEXT:    v_bfe_u32 v16, v0, 4, 4
320; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
321; GFX7-NEXT:    s_waitcnt vmcnt(0)
322; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
323; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v16, v0
324; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
325; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
326; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
327; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
328; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
329; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
330; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
331; GFX7-NEXT:    s_endpgm
332;
333; GFX8-LABEL: udot8_acc16:
334; GFX8:       ; %bb.0: ; %entry
335; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
336; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
337; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
338; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
339; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
340; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
341; GFX8-NEXT:    v_mov_b32_e32 v1, s5
342; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
343; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
344; GFX8-NEXT:    flat_load_dword v3, v[0:1]
345; GFX8-NEXT:    v_mov_b32_e32 v1, s7
346; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
347; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
348; GFX8-NEXT:    flat_load_dword v2, v[0:1]
349; GFX8-NEXT:    v_mov_b32_e32 v0, s0
350; GFX8-NEXT:    v_mov_b32_e32 v1, s1
351; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
352; GFX8-NEXT:    s_mov_b32 s10, -1
353; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
354; GFX8-NEXT:    s_add_u32 s8, s8, s3
355; GFX8-NEXT:    s_addc_u32 s9, s9, 0
356; GFX8-NEXT:    s_waitcnt vmcnt(2)
357; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 28, v3
358; GFX8-NEXT:    v_bfe_u32 v6, v3, 24, 4
359; GFX8-NEXT:    v_bfe_u32 v7, v3, 20, 4
360; GFX8-NEXT:    v_bfe_u32 v8, v3, 16, 4
361; GFX8-NEXT:    v_bfe_u32 v9, v3, 12, 4
362; GFX8-NEXT:    v_bfe_u32 v10, v3, 8, 4
363; GFX8-NEXT:    v_bfe_u32 v11, v3, 4, 4
364; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
365; GFX8-NEXT:    s_waitcnt vmcnt(1)
366; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
367; GFX8-NEXT:    v_bfe_u32 v13, v2, 24, 4
368; GFX8-NEXT:    v_bfe_u32 v14, v2, 20, 4
369; GFX8-NEXT:    v_bfe_u32 v15, v2, 16, 4
370; GFX8-NEXT:    v_bfe_u32 v16, v2, 12, 4
371; GFX8-NEXT:    v_bfe_u32 v17, v2, 8, 4
372; GFX8-NEXT:    v_bfe_u32 v18, v2, 4, 4
373; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
374; GFX8-NEXT:    s_waitcnt vmcnt(0)
375; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
376; GFX8-NEXT:    v_mad_u16 v2, v11, v18, v2
377; GFX8-NEXT:    v_mad_u16 v2, v10, v17, v2
378; GFX8-NEXT:    v_mad_u16 v2, v9, v16, v2
379; GFX8-NEXT:    v_mad_u16 v2, v8, v15, v2
380; GFX8-NEXT:    v_mad_u16 v2, v7, v14, v2
381; GFX8-NEXT:    v_mad_u16 v2, v6, v13, v2
382; GFX8-NEXT:    v_mad_u16 v2, v5, v12, v2
383; GFX8-NEXT:    flat_store_short v[0:1], v2
384; GFX8-NEXT:    s_endpgm
385;
386; GFX9-LABEL: udot8_acc16:
387; GFX9:       ; %bb.0: ; %entry
388; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
389; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
390; GFX9-NEXT:    s_mov_b32 s10, -1
391; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
392; GFX9-NEXT:    s_add_u32 s8, s8, s3
393; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
394; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
395; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
396; GFX9-NEXT:    s_addc_u32 s9, s9, 0
397; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
398; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
399; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
400; GFX9-NEXT:    v_mov_b32_e32 v0, 0
401; GFX9-NEXT:    global_load_ushort v3, v0, s[2:3]
402; GFX9-NEXT:    s_waitcnt vmcnt(2)
403; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
404; GFX9-NEXT:    v_bfe_u32 v5, v1, 24, 4
405; GFX9-NEXT:    v_bfe_u32 v6, v1, 20, 4
406; GFX9-NEXT:    v_bfe_u32 v7, v1, 16, 4
407; GFX9-NEXT:    v_bfe_u32 v8, v1, 12, 4
408; GFX9-NEXT:    v_bfe_u32 v9, v1, 8, 4
409; GFX9-NEXT:    v_bfe_u32 v10, v1, 4, 4
410; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
411; GFX9-NEXT:    s_waitcnt vmcnt(1)
412; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
413; GFX9-NEXT:    v_bfe_u32 v12, v2, 24, 4
414; GFX9-NEXT:    v_bfe_u32 v13, v2, 20, 4
415; GFX9-NEXT:    v_bfe_u32 v14, v2, 16, 4
416; GFX9-NEXT:    v_bfe_u32 v15, v2, 12, 4
417; GFX9-NEXT:    v_bfe_u32 v16, v2, 8, 4
418; GFX9-NEXT:    v_bfe_u32 v17, v2, 4, 4
419; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
420; GFX9-NEXT:    s_waitcnt vmcnt(0)
421; GFX9-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
422; GFX9-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
423; GFX9-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
424; GFX9-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
425; GFX9-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
426; GFX9-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
427; GFX9-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
428; GFX9-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
429; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
430; GFX9-NEXT:    s_endpgm
431;
432; GFX9-DL-LABEL: udot8_acc16:
433; GFX9-DL:       ; %bb.0: ; %entry
434; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
435; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
436; GFX9-DL-NEXT:    s_mov_b32 s10, -1
437; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
438; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
439; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
440; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
441; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
442; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
443; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
444; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
445; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
446; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
447; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
448; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
449; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
450; GFX9-DL-NEXT:    v_bfe_u32 v5, v1, 24, 4
451; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 20, 4
452; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 16, 4
453; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 12, 4
454; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 8, 4
455; GFX9-DL-NEXT:    v_bfe_u32 v10, v1, 4, 4
456; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
457; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
458; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
459; GFX9-DL-NEXT:    v_bfe_u32 v12, v2, 24, 4
460; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 20, 4
461; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 16, 4
462; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 12, 4
463; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 8, 4
464; GFX9-DL-NEXT:    v_bfe_u32 v17, v2, 4, 4
465; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
466; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
467; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
468; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
469; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
470; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
471; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
472; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
473; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
474; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
475; GFX9-DL-NEXT:    global_store_short v0, v1, s[2:3]
476; GFX9-DL-NEXT:    s_endpgm
477;
478; GFX10-DL-LABEL: udot8_acc16:
479; GFX10-DL:       ; %bb.0: ; %entry
480; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
481; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
482; GFX10-DL-NEXT:    s_mov_b32 s10, -1
483; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
484; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
485; GFX10-DL-NEXT:    s_clause 0x1
486; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
487; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
488; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
489; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
490; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
491; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
492; GFX10-DL-NEXT:    s_clause 0x1
493; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
494; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
495; GFX10-DL-NEXT:    global_load_ushort v4, v1, s[2:3]
496; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
497; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v2
498; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
499; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v3
500; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
501; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 4, 4
502; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
503; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
504; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 8, 4
505; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 8, 4
506; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
507; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
508; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 12, 4
509; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
510; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
511; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 16, 4
512; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
513; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
514; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 20, 4
515; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
516; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 24, 4
517; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 24, 4
518; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
519; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
520; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
521; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
522; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
523; GFX10-DL-NEXT:    global_store_short v1, v0, s[2:3]
524; GFX10-DL-NEXT:    s_endpgm
525                                       <8 x i4> addrspace(1)* %src2,
526                                       i16 addrspace(1)* nocapture %dst) {
527entry:
528  %idx = call i32 @llvm.amdgcn.workitem.id.x()
529  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
530  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
531  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
532  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
533
; For each of the eight lanes: extract the i4 element from both vectors,
; zero-extend it to i16 and multiply the pair.
534  %v1e0 = extractelement <8 x i4> %vec1, i64 0
535  %cv1e0 = zext i4 %v1e0 to i16
536  %v2e0 = extractelement <8 x i4> %vec2, i64 0
537  %cv2e0 = zext i4 %v2e0 to i16
538  %mul0 = mul nuw nsw i16 %cv1e0, %cv2e0
539
540  %v1e1 = extractelement <8 x i4> %vec1, i64 1
541  %cv1e1 = zext i4 %v1e1 to i16
542  %v2e1 = extractelement <8 x i4> %vec2, i64 1
543  %cv2e1 = zext i4 %v2e1 to i16
544  %mul1 = mul nuw nsw i16 %cv1e1, %cv2e1
545
546  %v1e2 = extractelement <8 x i4> %vec1, i64 2
547  %cv1e2 = zext i4 %v1e2 to i16
548  %v2e2 = extractelement <8 x i4> %vec2, i64 2
549  %cv2e2 = zext i4 %v2e2 to i16
550  %mul2 = mul nuw nsw i16 %cv1e2, %cv2e2
551
552  %v1e3 = extractelement <8 x i4> %vec1, i64 3
553  %cv1e3 = zext i4 %v1e3 to i16
554  %v2e3 = extractelement <8 x i4> %vec2, i64 3
555  %cv2e3 = zext i4 %v2e3 to i16
556  %mul3 = mul nuw nsw i16 %cv1e3, %cv2e3
557
558  %v1e4 = extractelement <8 x i4> %vec1, i64 4
559  %cv1e4 = zext i4 %v1e4 to i16
560  %v2e4 = extractelement <8 x i4> %vec2, i64 4
561  %cv2e4 = zext i4 %v2e4 to i16
562  %mul4 = mul nuw nsw i16 %cv1e4, %cv2e4
563
564  %v1e5 = extractelement <8 x i4> %vec1, i64 5
565  %cv1e5 = zext i4 %v1e5 to i16
566  %v2e5 = extractelement <8 x i4> %vec2, i64 5
567  %cv2e5 = zext i4 %v2e5 to i16
568  %mul5 = mul nuw nsw i16 %cv1e5, %cv2e5
569
570  %v1e6 = extractelement <8 x i4> %vec1, i64 6
571  %cv1e6 = zext i4 %v1e6 to i16
572  %v2e6 = extractelement <8 x i4> %vec2, i64 6
573  %cv2e6 = zext i4 %v2e6 to i16
574  %mul6 = mul nuw nsw i16 %cv1e6, %cv2e6
575
576  %v1e7 = extractelement <8 x i4> %vec1, i64 7
577  %cv1e7 = zext i4 %v1e7 to i16
578  %v2e7 = extractelement <8 x i4> %vec2, i64 7
579  %cv2e7 = zext i4 %v2e7 to i16
580  %mul7 = mul nuw nsw i16 %cv1e7, %cv2e7
581
; Chain the eight products onto the i16 accumulator loaded from %dst, lane 0
; first, then write the final sum back to %dst.
582  %acc = load i16, i16 addrspace(1)* %dst, align 4
583  %add1 = add i16 %mul0, %acc
584  %add2 = add i16 %add1, %mul1
585  %add3 = add i16 %add2, %mul2
586  %add4 = add i16 %add3, %mul3
587  %add5 = add i16 %add4, %mul4
588  %add6 = add i16 %add5, %mul5
589  %add7 = add i16 %add6, %mul6
590  %add8 = add i16 %add7, %mul7
591
592  store i16 %add8, i16 addrspace(1)* %dst, align 4
593  ret void
594}
595
596; TODO: Remove the unnecessary instruction (the one zero-extending the
597; 2nd MAD) so that the pattern recognizer can kick in.
598define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
599; GFX7-LABEL: udot8_acc8:
600; GFX7:       ; %bb.0: ; %entry
601; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
602; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
603; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
604; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
605; GFX7-NEXT:    s_mov_b32 s14, -1
606; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
607; GFX7-NEXT:    s_add_u32 s12, s12, s3
608; GFX7-NEXT:    s_mov_b32 s3, 0xf000
609; GFX7-NEXT:    s_mov_b32 s10, 0
610; GFX7-NEXT:    s_mov_b32 s11, s3
611; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
612; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
613; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
614; GFX7-NEXT:    v_mov_b32_e32 v1, 0
615; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
616; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
617; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
618; GFX7-NEXT:    s_mov_b32 s2, -1
619; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
620; GFX7-NEXT:    s_addc_u32 s13, s13, 0
621; GFX7-NEXT:    s_waitcnt vmcnt(2)
622; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 28, v2
623; GFX7-NEXT:    v_bfe_u32 v4, v2, 24, 4
624; GFX7-NEXT:    v_bfe_u32 v5, v2, 20, 4
625; GFX7-NEXT:    v_bfe_u32 v6, v2, 16, 4
626; GFX7-NEXT:    v_bfe_u32 v7, v2, 12, 4
627; GFX7-NEXT:    v_bfe_u32 v8, v2, 8, 4
628; GFX7-NEXT:    v_bfe_u32 v9, v2, 4, 4
629; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
630; GFX7-NEXT:    s_waitcnt vmcnt(1)
631; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 28, v0
632; GFX7-NEXT:    v_bfe_u32 v11, v0, 24, 4
633; GFX7-NEXT:    v_bfe_u32 v12, v0, 20, 4
634; GFX7-NEXT:    v_bfe_u32 v13, v0, 16, 4
635; GFX7-NEXT:    v_bfe_u32 v14, v0, 12, 4
636; GFX7-NEXT:    v_bfe_u32 v15, v0, 8, 4
637; GFX7-NEXT:    v_bfe_u32 v16, v0, 4, 4
638; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
639; GFX7-NEXT:    s_waitcnt vmcnt(0)
640; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
641; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v16, v0
642; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
643; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
644; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
645; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
646; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
647; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
648; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
649; GFX7-NEXT:    s_endpgm
650;
651; GFX8-LABEL: udot8_acc8:
652; GFX8:       ; %bb.0: ; %entry
653; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
654; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
655; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
656; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
657; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
658; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
659; GFX8-NEXT:    v_mov_b32_e32 v1, s5
660; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
661; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
662; GFX8-NEXT:    flat_load_dword v3, v[0:1]
663; GFX8-NEXT:    v_mov_b32_e32 v1, s7
664; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
665; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
666; GFX8-NEXT:    flat_load_dword v2, v[0:1]
667; GFX8-NEXT:    v_mov_b32_e32 v0, s0
668; GFX8-NEXT:    v_mov_b32_e32 v1, s1
669; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
670; GFX8-NEXT:    s_mov_b32 s10, -1
671; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
672; GFX8-NEXT:    s_add_u32 s8, s8, s3
673; GFX8-NEXT:    s_addc_u32 s9, s9, 0
674; GFX8-NEXT:    s_waitcnt vmcnt(2)
675; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 28, v3
676; GFX8-NEXT:    v_bfe_u32 v6, v3, 24, 4
677; GFX8-NEXT:    v_bfe_u32 v7, v3, 20, 4
678; GFX8-NEXT:    v_bfe_u32 v8, v3, 16, 4
679; GFX8-NEXT:    v_bfe_u32 v9, v3, 12, 4
680; GFX8-NEXT:    v_bfe_u32 v10, v3, 8, 4
681; GFX8-NEXT:    v_bfe_u32 v11, v3, 4, 4
682; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
683; GFX8-NEXT:    s_waitcnt vmcnt(1)
684; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
685; GFX8-NEXT:    v_bfe_u32 v13, v2, 24, 4
686; GFX8-NEXT:    v_bfe_u32 v14, v2, 20, 4
687; GFX8-NEXT:    v_bfe_u32 v15, v2, 16, 4
688; GFX8-NEXT:    v_bfe_u32 v16, v2, 12, 4
689; GFX8-NEXT:    v_bfe_u32 v17, v2, 8, 4
690; GFX8-NEXT:    v_bfe_u32 v18, v2, 4, 4
691; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
692; GFX8-NEXT:    s_waitcnt vmcnt(0)
693; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
694; GFX8-NEXT:    v_mad_u16 v2, v11, v18, v2
695; GFX8-NEXT:    v_mad_u16 v2, v10, v17, v2
696; GFX8-NEXT:    v_mad_u16 v2, v9, v16, v2
697; GFX8-NEXT:    v_mad_u16 v2, v8, v15, v2
698; GFX8-NEXT:    v_mad_u16 v2, v7, v14, v2
699; GFX8-NEXT:    v_mad_u16 v2, v6, v13, v2
700; GFX8-NEXT:    v_mad_u16 v2, v5, v12, v2
701; GFX8-NEXT:    flat_store_byte v[0:1], v2
702; GFX8-NEXT:    s_endpgm
703;
704; GFX9-LABEL: udot8_acc8:
705; GFX9:       ; %bb.0: ; %entry
706; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
707; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
708; GFX9-NEXT:    s_mov_b32 s10, -1
709; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
710; GFX9-NEXT:    s_add_u32 s8, s8, s3
711; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
712; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
713; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
714; GFX9-NEXT:    s_addc_u32 s9, s9, 0
715; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
716; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
717; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
718; GFX9-NEXT:    v_mov_b32_e32 v0, 0
719; GFX9-NEXT:    global_load_ubyte v3, v0, s[2:3]
720; GFX9-NEXT:    s_waitcnt vmcnt(2)
721; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
722; GFX9-NEXT:    v_bfe_u32 v5, v1, 24, 4
723; GFX9-NEXT:    v_bfe_u32 v6, v1, 20, 4
724; GFX9-NEXT:    v_bfe_u32 v7, v1, 16, 4
725; GFX9-NEXT:    v_bfe_u32 v8, v1, 12, 4
726; GFX9-NEXT:    v_bfe_u32 v9, v1, 8, 4
727; GFX9-NEXT:    v_bfe_u32 v10, v1, 4, 4
728; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
729; GFX9-NEXT:    s_waitcnt vmcnt(1)
730; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
731; GFX9-NEXT:    v_bfe_u32 v12, v2, 24, 4
732; GFX9-NEXT:    v_bfe_u32 v13, v2, 20, 4
733; GFX9-NEXT:    v_bfe_u32 v14, v2, 16, 4
734; GFX9-NEXT:    v_bfe_u32 v15, v2, 12, 4
735; GFX9-NEXT:    v_bfe_u32 v16, v2, 8, 4
736; GFX9-NEXT:    v_bfe_u32 v17, v2, 4, 4
737; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
738; GFX9-NEXT:    s_waitcnt vmcnt(0)
739; GFX9-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
740; GFX9-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
741; GFX9-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
742; GFX9-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
743; GFX9-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
744; GFX9-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
745; GFX9-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
746; GFX9-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
747; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
748; GFX9-NEXT:    s_endpgm
749;
750; GFX9-DL-LABEL: udot8_acc8:
751; GFX9-DL:       ; %bb.0: ; %entry
752; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
753; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
754; GFX9-DL-NEXT:    s_mov_b32 s10, -1
755; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
756; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
757; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
758; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
759; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
760; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
761; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
762; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
763; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
764; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
765; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
766; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
767; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
768; GFX9-DL-NEXT:    v_bfe_u32 v5, v1, 24, 4
769; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 20, 4
770; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 16, 4
771; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 12, 4
772; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 8, 4
773; GFX9-DL-NEXT:    v_bfe_u32 v10, v1, 4, 4
774; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
775; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
776; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
777; GFX9-DL-NEXT:    v_bfe_u32 v12, v2, 24, 4
778; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 20, 4
779; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 16, 4
780; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 12, 4
781; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 8, 4
782; GFX9-DL-NEXT:    v_bfe_u32 v17, v2, 4, 4
783; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
784; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
785; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
786; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
787; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
788; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
789; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
790; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
791; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
792; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
793; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
794; GFX9-DL-NEXT:    s_endpgm
795;
796; GFX10-DL-LABEL: udot8_acc8:
797; GFX10-DL:       ; %bb.0: ; %entry
798; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
799; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
800; GFX10-DL-NEXT:    s_mov_b32 s10, -1
801; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
802; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
803; GFX10-DL-NEXT:    s_clause 0x1
804; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
805; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
806; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
807; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
808; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
809; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
810; GFX10-DL-NEXT:    s_clause 0x1
811; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
812; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
813; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
814; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
815; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v2
816; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
817; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v3
818; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
819; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 4, 4
820; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
821; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
822; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 8, 4
823; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 8, 4
824; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
825; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
826; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 12, 4
827; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
828; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
829; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 16, 4
830; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
831; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
832; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 20, 4
833; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
834; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 24, 4
835; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 24, 4
836; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
837; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
838; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
839; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
840; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
841; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
842; GFX10-DL-NEXT:    s_endpgm
843                                      <8 x i4> addrspace(1)* %src2,
844                                      i8 addrspace(1)* nocapture %dst) {
845entry:
846  %idx = call i32 @llvm.amdgcn.workitem.id.x()
847  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
848  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
849  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
850  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
851
852  %v1e0 = extractelement <8 x i4> %vec1, i64 0
853  %cv1e0 = zext i4 %v1e0 to i8
854  %v2e0 = extractelement <8 x i4> %vec2, i64 0
855  %cv2e0 = zext i4 %v2e0 to i8
856  %mul0 = mul nuw nsw i8 %cv1e0, %cv2e0
857
858  %v1e1 = extractelement <8 x i4> %vec1, i64 1
859  %cv1e1 = zext i4 %v1e1 to i8
860  %v2e1 = extractelement <8 x i4> %vec2, i64 1
861  %cv2e1 = zext i4 %v2e1 to i8
862  %mul1 = mul nuw nsw i8 %cv1e1, %cv2e1
863
864  %v1e2 = extractelement <8 x i4> %vec1, i64 2
865  %cv1e2 = zext i4 %v1e2 to i8
866  %v2e2 = extractelement <8 x i4> %vec2, i64 2
867  %cv2e2 = zext i4 %v2e2 to i8
868  %mul2 = mul nuw nsw i8 %cv1e2, %cv2e2
869
870  %v1e3 = extractelement <8 x i4> %vec1, i64 3
871  %cv1e3 = zext i4 %v1e3 to i8
872  %v2e3 = extractelement <8 x i4> %vec2, i64 3
873  %cv2e3 = zext i4 %v2e3 to i8
874  %mul3 = mul nuw nsw i8 %cv1e3, %cv2e3
875
876  %v1e4 = extractelement <8 x i4> %vec1, i64 4
877  %cv1e4 = zext i4 %v1e4 to i8
878  %v2e4 = extractelement <8 x i4> %vec2, i64 4
879  %cv2e4 = zext i4 %v2e4 to i8
880  %mul4 = mul nuw nsw i8 %cv1e4, %cv2e4
881
882  %v1e5 = extractelement <8 x i4> %vec1, i64 5
883  %cv1e5 = zext i4 %v1e5 to i8
884  %v2e5 = extractelement <8 x i4> %vec2, i64 5
885  %cv2e5 = zext i4 %v2e5 to i8
886  %mul5 = mul nuw nsw i8 %cv1e5, %cv2e5
887
888  %v1e6 = extractelement <8 x i4> %vec1, i64 6
889  %cv1e6 = zext i4 %v1e6 to i8
890  %v2e6 = extractelement <8 x i4> %vec2, i64 6
891  %cv2e6 = zext i4 %v2e6 to i8
892  %mul6 = mul nuw nsw i8 %cv1e6, %cv2e6
893
894  %v1e7 = extractelement <8 x i4> %vec1, i64 7
895  %cv1e7 = zext i4 %v1e7 to i8
896  %v2e7 = extractelement <8 x i4> %vec2, i64 7
897  %cv2e7 = zext i4 %v2e7 to i8
898  %mul7 = mul nuw nsw i8 %cv1e7, %cv2e7
899
900  %acc = load i8, i8 addrspace(1)* %dst, align 4
901  %add1 = add i8 %mul0, %acc
902  %add2 = add i8 %add1, %mul1
903  %add3 = add i8 %add2, %mul2
904  %add4 = add i8 %add3, %mul3
905  %add5 = add i8 %add4, %mul4
906  %add6 = add i8 %add5, %mul5
907  %add7 = add i8 %add6, %mul6
908  %add8 = add i8 %add7, %mul7
909
910  store i8 %add8, i8 addrspace(1)* %dst, align 4
911  ret void
912}
913
914; TODO: Remove the two unnecessary instructions (and + add after the 2nd MAD)
915; so that the pattern recognizer can kick in.
; udot8_acc4: unsigned 8-element dot product with a 4-bit accumulator.
;   %src1, %src2 - global (addrspace 1) arrays of <8 x i4>; each workitem reads
;                  the element selected by its workitem id in x.
;   %dst         - global pointer to the i4 accumulator, which is both read
;                  and written.
; The kernel multiplies the two vectors lane by lane in i4, adds the eight
; products to the value loaded from %dst, and stores the i4 sum back to %dst.
; The prefixed check lines interleaved with the signature are the expected llc
; output per run line; per the NOTE at the top of the file they are
; autogenerated by utils/update_llc_test_checks.py. In every expected sequence
; below the final value is masked with 15 right before the byte store.
916define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
917; GFX7-LABEL: udot8_acc4:
918; GFX7:       ; %bb.0: ; %entry
919; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
920; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
921; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
922; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
923; GFX7-NEXT:    s_mov_b32 s14, -1
924; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
925; GFX7-NEXT:    s_add_u32 s12, s12, s3
926; GFX7-NEXT:    s_mov_b32 s3, 0xf000
927; GFX7-NEXT:    s_mov_b32 s10, 0
928; GFX7-NEXT:    s_mov_b32 s11, s3
929; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
930; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
931; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
932; GFX7-NEXT:    v_mov_b32_e32 v1, 0
933; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
934; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
935; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
936; GFX7-NEXT:    s_mov_b32 s2, -1
937; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
938; GFX7-NEXT:    s_addc_u32 s13, s13, 0
939; GFX7-NEXT:    s_waitcnt vmcnt(2)
940; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 28, v2
941; GFX7-NEXT:    v_bfe_u32 v4, v2, 24, 4
942; GFX7-NEXT:    v_bfe_u32 v5, v2, 20, 4
943; GFX7-NEXT:    v_bfe_u32 v6, v2, 16, 4
944; GFX7-NEXT:    v_bfe_u32 v7, v2, 12, 4
945; GFX7-NEXT:    v_bfe_u32 v8, v2, 8, 4
946; GFX7-NEXT:    v_bfe_u32 v9, v2, 4, 4
947; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
948; GFX7-NEXT:    s_waitcnt vmcnt(1)
949; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 28, v0
950; GFX7-NEXT:    v_bfe_u32 v11, v0, 24, 4
951; GFX7-NEXT:    v_bfe_u32 v12, v0, 20, 4
952; GFX7-NEXT:    v_bfe_u32 v13, v0, 16, 4
953; GFX7-NEXT:    v_bfe_u32 v14, v0, 12, 4
954; GFX7-NEXT:    v_bfe_u32 v15, v0, 8, 4
955; GFX7-NEXT:    v_bfe_u32 v16, v0, 4, 4
956; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
957; GFX7-NEXT:    s_waitcnt vmcnt(0)
958; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
959; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v16, v0
960; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
961; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
962; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
963; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
964; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
965; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
966; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
967; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
968; GFX7-NEXT:    s_endpgm
969;
970; GFX8-LABEL: udot8_acc4:
971; GFX8:       ; %bb.0: ; %entry
972; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
973; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
974; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
975; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
976; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
977; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
978; GFX8-NEXT:    v_mov_b32_e32 v1, s5
979; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
980; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
981; GFX8-NEXT:    flat_load_dword v3, v[0:1]
982; GFX8-NEXT:    v_mov_b32_e32 v1, s7
983; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
984; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
985; GFX8-NEXT:    flat_load_dword v2, v[0:1]
986; GFX8-NEXT:    v_mov_b32_e32 v0, s0
987; GFX8-NEXT:    v_mov_b32_e32 v1, s1
988; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
989; GFX8-NEXT:    s_mov_b32 s10, -1
990; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
991; GFX8-NEXT:    s_add_u32 s8, s8, s3
992; GFX8-NEXT:    s_addc_u32 s9, s9, 0
993; GFX8-NEXT:    s_waitcnt vmcnt(2)
994; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 28, v3
995; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
996; GFX8-NEXT:    v_bfe_u32 v7, v3, 20, 4
997; GFX8-NEXT:    v_bfe_u32 v8, v3, 16, 4
998; GFX8-NEXT:    v_bfe_u32 v9, v3, 12, 4
999; GFX8-NEXT:    v_bfe_u32 v10, v3, 8, 4
1000; GFX8-NEXT:    v_bfe_u32 v11, v3, 4, 4
1001; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
1002; GFX8-NEXT:    s_waitcnt vmcnt(1)
1003; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
1004; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v2
1005; GFX8-NEXT:    v_bfe_u32 v14, v2, 20, 4
1006; GFX8-NEXT:    v_bfe_u32 v15, v2, 16, 4
1007; GFX8-NEXT:    v_bfe_u32 v16, v2, 12, 4
1008; GFX8-NEXT:    v_bfe_u32 v17, v2, 8, 4
1009; GFX8-NEXT:    v_bfe_u32 v18, v2, 4, 4
1010; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
1011; GFX8-NEXT:    s_waitcnt vmcnt(0)
1012; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
1013; GFX8-NEXT:    v_mad_u16 v2, v11, v18, v2
1014; GFX8-NEXT:    v_mad_u16 v2, v10, v17, v2
1015; GFX8-NEXT:    v_mad_u16 v2, v9, v16, v2
1016; GFX8-NEXT:    v_mad_u16 v2, v8, v15, v2
1017; GFX8-NEXT:    v_mad_u16 v2, v7, v14, v2
1018; GFX8-NEXT:    v_mad_u16 v2, v6, v13, v2
1019; GFX8-NEXT:    v_mad_u16 v2, v5, v12, v2
1020; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
1021; GFX8-NEXT:    flat_store_byte v[0:1], v2
1022; GFX8-NEXT:    s_endpgm
1023;
1024; GFX9-LABEL: udot8_acc4:
1025; GFX9:       ; %bb.0: ; %entry
1026; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1027; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1028; GFX9-NEXT:    s_mov_b32 s10, -1
1029; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
1030; GFX9-NEXT:    s_add_u32 s8, s8, s3
1031; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1032; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1033; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1034; GFX9-NEXT:    s_addc_u32 s9, s9, 0
1035; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1036; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
1037; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
1038; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1039; GFX9-NEXT:    global_load_ubyte v3, v0, s[2:3]
1040; GFX9-NEXT:    s_waitcnt vmcnt(2)
1041; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
1042; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
1043; GFX9-NEXT:    v_bfe_u32 v6, v1, 20, 4
1044; GFX9-NEXT:    v_bfe_u32 v7, v1, 16, 4
1045; GFX9-NEXT:    v_bfe_u32 v8, v1, 12, 4
1046; GFX9-NEXT:    v_bfe_u32 v9, v1, 8, 4
1047; GFX9-NEXT:    v_bfe_u32 v10, v1, 4, 4
1048; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
1049; GFX9-NEXT:    s_waitcnt vmcnt(1)
1050; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
1051; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
1052; GFX9-NEXT:    v_bfe_u32 v13, v2, 20, 4
1053; GFX9-NEXT:    v_bfe_u32 v14, v2, 16, 4
1054; GFX9-NEXT:    v_bfe_u32 v15, v2, 12, 4
1055; GFX9-NEXT:    v_bfe_u32 v16, v2, 8, 4
1056; GFX9-NEXT:    v_bfe_u32 v17, v2, 4, 4
1057; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
1058; GFX9-NEXT:    s_waitcnt vmcnt(0)
1059; GFX9-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
1060; GFX9-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
1061; GFX9-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
1062; GFX9-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
1063; GFX9-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
1064; GFX9-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
1065; GFX9-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
1066; GFX9-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
1067; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
1068; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
1069; GFX9-NEXT:    s_endpgm
1070;
1071; GFX9-DL-LABEL: udot8_acc4:
1072; GFX9-DL:       ; %bb.0: ; %entry
1073; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1074; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1075; GFX9-DL-NEXT:    s_mov_b32 s10, -1
1076; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
1077; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
1078; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1079; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1080; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1081; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
1082; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1083; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1084; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1085; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1086; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
1087; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
1088; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
1089; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
1090; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 20, 4
1091; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 16, 4
1092; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 12, 4
1093; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 8, 4
1094; GFX9-DL-NEXT:    v_bfe_u32 v10, v1, 4, 4
1095; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
1096; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1097; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
1098; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
1099; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 20, 4
1100; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 16, 4
1101; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 12, 4
1102; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 8, 4
1103; GFX9-DL-NEXT:    v_bfe_u32 v17, v2, 4, 4
1104; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
1105; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1106; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
1107; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
1108; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
1109; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
1110; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
1111; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
1112; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
1113; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
1114; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
1115; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
1116; GFX9-DL-NEXT:    s_endpgm
1117;
1118; GFX10-DL-LABEL: udot8_acc4:
1119; GFX10-DL:       ; %bb.0: ; %entry
1120; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1121; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1122; GFX10-DL-NEXT:    s_mov_b32 s10, -1
1123; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
1124; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
1125; GFX10-DL-NEXT:    s_clause 0x1
1126; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1127; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1128; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1129; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
1130; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
1131; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1132; GFX10-DL-NEXT:    s_clause 0x1
1133; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
1134; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
1135; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
1136; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
1137; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v2
1138; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1139; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v3
1140; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
1141; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 4, 4
1142; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1143; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
1144; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 8, 4
1145; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 8, 4
1146; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
1147; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
1148; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 12, 4
1149; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
1150; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
1151; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 16, 4
1152; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
1153; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
1154; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 20, 4
1155; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
1156; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
1157; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v3
1158; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
1159; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
1160; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
1161; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
1162; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
1163; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v0
1164; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
1165; GFX10-DL-NEXT:    s_endpgm
1166                                      <8 x i4> addrspace(1)* %src2,
1167                                      i4 addrspace(1)* nocapture %dst) {
1168entry:
  ; Address and load this workitem's <8 x i4> element from each source array.
1169  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1170  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
1171  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
1172  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
1173  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
1174
  ; Lanes 0-7: extract the matching i4 lane of each vector and multiply them
  ; directly in i4 (no widening before the multiply).
  ; NOTE(review): every multiply carries nuw and nsw; in i4 most lane products
  ; would not fit, which presumably makes them poison per the LangRef - confirm
  ; this is intended for the pattern under test.
1175  %v1e0 = extractelement <8 x i4> %vec1, i64 0
1176  %v2e0 = extractelement <8 x i4> %vec2, i64 0
1177  %mul0 = mul nuw nsw i4 %v1e0, %v2e0
1178
1179  %v1e1 = extractelement <8 x i4> %vec1, i64 1
1180  %v2e1 = extractelement <8 x i4> %vec2, i64 1
1181  %mul1 = mul nuw nsw i4 %v1e1, %v2e1
1182
1183  %v1e2 = extractelement <8 x i4> %vec1, i64 2
1184  %v2e2 = extractelement <8 x i4> %vec2, i64 2
1185  %mul2 = mul nuw nsw i4 %v1e2, %v2e2
1186
1187  %v1e3 = extractelement <8 x i4> %vec1, i64 3
1188  %v2e3 = extractelement <8 x i4> %vec2, i64 3
1189  %mul3 = mul nuw nsw i4 %v1e3, %v2e3
1190
1191  %v1e4 = extractelement <8 x i4> %vec1, i64 4
1192  %v2e4 = extractelement <8 x i4> %vec2, i64 4
1193  %mul4 = mul nuw nsw i4 %v1e4, %v2e4
1194
1195  %v1e5 = extractelement <8 x i4> %vec1, i64 5
1196  %v2e5 = extractelement <8 x i4> %vec2, i64 5
1197  %mul5 = mul nuw nsw i4 %v1e5, %v2e5
1198
1199  %v1e6 = extractelement <8 x i4> %vec1, i64 6
1200  %v2e6 = extractelement <8 x i4> %vec2, i64 6
1201  %mul6 = mul nuw nsw i4 %v1e6, %v2e6
1202
1203  %v1e7 = extractelement <8 x i4> %vec1, i64 7
1204  %v2e7 = extractelement <8 x i4> %vec2, i64 7
1205  %mul7 = mul nuw nsw i4 %v1e7, %v2e7
1206
  ; Accumulate: start from the i4 value currently at %dst, then add the eight
  ; products one at a time in lane order (plain i4 adds, no wrap flags).
1207  %acc = load i4, i4 addrspace(1)* %dst, align 4
1208  %add1 = add i4 %mul0, %acc
1209  %add2 = add i4 %add1, %mul1
1210  %add3 = add i4 %add2, %mul2
1211  %add4 = add i4 %add3, %mul3
1212  %add5 = add i4 %add4, %mul4
1213  %add6 = add i4 %add5, %mul5
1214  %add7 = add i4 %add6, %mul6
1215  %add8 = add i4 %add7, %mul7
1216
  ; Write the final i4 sum back over the accumulator.
1217  store i4 %add8, i4 addrspace(1)* %dst, align 4
1218  ret void
1219}
1220
1221; TODO: Currently, permutation of udot8 is turned off due to a huge increase
1222; in the compile time.
1223define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %src1,
1224; GFX7-LABEL: udot8_CommutationInsideMAD:
1225; GFX7:       ; %bb.0: ; %entry
1226; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1227; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1228; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1229; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1230; GFX7-NEXT:    s_mov_b32 s14, -1
1231; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
1232; GFX7-NEXT:    s_add_u32 s12, s12, s3
1233; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1234; GFX7-NEXT:    s_mov_b32 s10, 0
1235; GFX7-NEXT:    s_mov_b32 s11, s3
1236; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1237; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1238; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1239; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1240; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1241; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1242; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1243; GFX7-NEXT:    s_mov_b32 s2, -1
1244; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
1245; GFX7-NEXT:    s_addc_u32 s13, s13, 0
1246; GFX7-NEXT:    s_waitcnt vmcnt(2)
1247; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 28, v2
1248; GFX7-NEXT:    v_bfe_u32 v4, v2, 24, 4
1249; GFX7-NEXT:    v_bfe_u32 v5, v2, 20, 4
1250; GFX7-NEXT:    v_bfe_u32 v6, v2, 16, 4
1251; GFX7-NEXT:    v_bfe_u32 v7, v2, 12, 4
1252; GFX7-NEXT:    v_bfe_u32 v8, v2, 8, 4
1253; GFX7-NEXT:    v_bfe_u32 v9, v2, 4, 4
1254; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
1255; GFX7-NEXT:    s_waitcnt vmcnt(1)
1256; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 28, v0
1257; GFX7-NEXT:    v_bfe_u32 v11, v0, 24, 4
1258; GFX7-NEXT:    v_bfe_u32 v12, v0, 20, 4
1259; GFX7-NEXT:    v_bfe_u32 v13, v0, 16, 4
1260; GFX7-NEXT:    v_bfe_u32 v14, v0, 12, 4
1261; GFX7-NEXT:    v_bfe_u32 v15, v0, 8, 4
1262; GFX7-NEXT:    v_bfe_u32 v16, v0, 4, 4
1263; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
1264; GFX7-NEXT:    s_waitcnt vmcnt(0)
1265; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
1266; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v16, v0
1267; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
1268; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
1269; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
1270; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
1271; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
1272; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
1273; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
1274; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1275; GFX7-NEXT:    s_endpgm
1276;
1277; GFX8-LABEL: udot8_CommutationInsideMAD:
1278; GFX8:       ; %bb.0: ; %entry
1279; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1280; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1281; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1282; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1283; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1284; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1285; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1286; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1287; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1288; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1289; GFX8-NEXT:    v_mov_b32_e32 v1, s7
1290; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
1291; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1292; GFX8-NEXT:    flat_load_dword v2, v[0:1]
1293; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1294; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1295; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
1296; GFX8-NEXT:    s_mov_b32 s10, -1
1297; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
1298; GFX8-NEXT:    s_add_u32 s8, s8, s3
1299; GFX8-NEXT:    s_addc_u32 s9, s9, 0
1300; GFX8-NEXT:    s_waitcnt vmcnt(2)
1301; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 28, v3
1302; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
1303; GFX8-NEXT:    v_bfe_u32 v7, v3, 20, 4
1304; GFX8-NEXT:    v_bfe_u32 v8, v3, 16, 4
1305; GFX8-NEXT:    v_bfe_u32 v9, v3, 12, 4
1306; GFX8-NEXT:    v_bfe_u32 v10, v3, 8, 4
1307; GFX8-NEXT:    v_bfe_u32 v11, v3, 4, 4
1308; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
1309; GFX8-NEXT:    s_waitcnt vmcnt(1)
1310; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
1311; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v2
1312; GFX8-NEXT:    v_bfe_u32 v14, v2, 20, 4
1313; GFX8-NEXT:    v_bfe_u32 v15, v2, 16, 4
1314; GFX8-NEXT:    v_bfe_u32 v16, v2, 12, 4
1315; GFX8-NEXT:    v_bfe_u32 v17, v2, 8, 4
1316; GFX8-NEXT:    v_bfe_u32 v18, v2, 4, 4
1317; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
1318; GFX8-NEXT:    s_waitcnt vmcnt(0)
1319; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
1320; GFX8-NEXT:    v_mad_u16 v2, v11, v18, v2
1321; GFX8-NEXT:    v_mad_u16 v2, v10, v17, v2
1322; GFX8-NEXT:    v_mad_u16 v2, v9, v16, v2
1323; GFX8-NEXT:    v_mad_u16 v2, v8, v15, v2
1324; GFX8-NEXT:    v_mad_u16 v2, v7, v14, v2
1325; GFX8-NEXT:    v_mad_u16 v2, v6, v13, v2
1326; GFX8-NEXT:    v_mad_u16 v2, v5, v12, v2
1327; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
1328; GFX8-NEXT:    flat_store_byte v[0:1], v2
1329; GFX8-NEXT:    s_endpgm
1330;
1331; GFX9-LABEL: udot8_CommutationInsideMAD:
1332; GFX9:       ; %bb.0: ; %entry
1333; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1334; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1335; GFX9-NEXT:    s_mov_b32 s10, -1
1336; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
1337; GFX9-NEXT:    s_add_u32 s8, s8, s3
1338; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1339; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1340; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1341; GFX9-NEXT:    s_addc_u32 s9, s9, 0
1342; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1343; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
1344; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
1345; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1346; GFX9-NEXT:    global_load_ubyte v3, v0, s[2:3]
1347; GFX9-NEXT:    s_waitcnt vmcnt(2)
1348; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
1349; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
1350; GFX9-NEXT:    v_bfe_u32 v6, v1, 20, 4
1351; GFX9-NEXT:    v_bfe_u32 v7, v1, 16, 4
1352; GFX9-NEXT:    v_bfe_u32 v8, v1, 12, 4
1353; GFX9-NEXT:    v_bfe_u32 v9, v1, 8, 4
1354; GFX9-NEXT:    v_bfe_u32 v10, v1, 4, 4
1355; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
1356; GFX9-NEXT:    s_waitcnt vmcnt(1)
1357; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
1358; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
1359; GFX9-NEXT:    v_bfe_u32 v13, v2, 20, 4
1360; GFX9-NEXT:    v_bfe_u32 v14, v2, 16, 4
1361; GFX9-NEXT:    v_bfe_u32 v15, v2, 12, 4
1362; GFX9-NEXT:    v_bfe_u32 v16, v2, 8, 4
1363; GFX9-NEXT:    v_bfe_u32 v17, v2, 4, 4
1364; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
1365; GFX9-NEXT:    s_waitcnt vmcnt(0)
1366; GFX9-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
1367; GFX9-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
1368; GFX9-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
1369; GFX9-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
1370; GFX9-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
1371; GFX9-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
1372; GFX9-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
1373; GFX9-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
1374; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
1375; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
1376; GFX9-NEXT:    s_endpgm
1377;
1378; GFX9-DL-LABEL: udot8_CommutationInsideMAD:
1379; GFX9-DL:       ; %bb.0: ; %entry
1380; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1381; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1382; GFX9-DL-NEXT:    s_mov_b32 s10, -1
1383; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
1384; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
1385; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1386; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1387; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1388; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
1389; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1390; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1391; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1392; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1393; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
1394; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
1395; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
1396; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
1397; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 20, 4
1398; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 16, 4
1399; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 12, 4
1400; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 8, 4
1401; GFX9-DL-NEXT:    v_bfe_u32 v10, v1, 4, 4
1402; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
1403; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1404; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
1405; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
1406; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 20, 4
1407; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 16, 4
1408; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 12, 4
1409; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 8, 4
1410; GFX9-DL-NEXT:    v_bfe_u32 v17, v2, 4, 4
1411; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
1412; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1413; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
1414; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
1415; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
1416; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
1417; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
1418; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
1419; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
1420; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
1421; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
1422; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
1423; GFX9-DL-NEXT:    s_endpgm
1424;
1425; GFX10-DL-LABEL: udot8_CommutationInsideMAD:
1426; GFX10-DL:       ; %bb.0: ; %entry
1427; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1428; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1429; GFX10-DL-NEXT:    s_mov_b32 s10, -1
1430; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
1431; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
1432; GFX10-DL-NEXT:    s_clause 0x1
1433; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1434; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1435; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1436; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
1437; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
1438; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1439; GFX10-DL-NEXT:    s_clause 0x1
1440; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
1441; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
1442; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
1443; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
1444; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v2
1445; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1446; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v3
1447; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
1448; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 4, 4
1449; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1450; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
1451; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 8, 4
1452; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 8, 4
1453; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
1454; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
1455; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 12, 4
1456; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
1457; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
1458; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 16, 4
1459; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
1460; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
1461; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 20, 4
1462; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
1463; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
1464; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v3
1465; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
1466; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
1467; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
1468; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
1469; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
1470; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v0
1471; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
1472; GFX10-DL-NEXT:    s_endpgm
1473                                                      <8 x i4> addrspace(1)* %src2,
1474                                                      i4 addrspace(1)* nocapture %dst) {
1475entry:
1476  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1477  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
1478  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
1479  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
1480  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
1481
1482  %v1e0 = extractelement <8 x i4> %vec1, i64 0
1483  %v2e0 = extractelement <8 x i4> %vec2, i64 0
1484  %mul0 = mul nuw nsw i4 %v1e0, %v2e0
1485
1486  %v1e1 = extractelement <8 x i4> %vec1, i64 1
1487  %v2e1 = extractelement <8 x i4> %vec2, i64 1
1488  %mul1 = mul nuw nsw i4 %v1e1, %v2e1
1489
1490  %v1e2 = extractelement <8 x i4> %vec1, i64 2
1491  %v2e2 = extractelement <8 x i4> %vec2, i64 2
1492  %mul2 = mul nuw nsw i4 %v1e2, %v2e2
1493
1494  %v1e3 = extractelement <8 x i4> %vec1, i64 3
1495  %v2e3 = extractelement <8 x i4> %vec2, i64 3
1496  %mul3 = mul nuw nsw i4 %v1e3, %v2e3
1497
1498  %v1e4 = extractelement <8 x i4> %vec1, i64 4
1499  %v2e4 = extractelement <8 x i4> %vec2, i64 4
1500  %mul4 = mul nuw nsw i4 %v1e4, %v2e4
1501
1502  %v1e5 = extractelement <8 x i4> %vec1, i64 5
1503  %v2e5 = extractelement <8 x i4> %vec2, i64 5
1504  %mul5 = mul nuw nsw i4 %v1e5, %v2e5
1505
1506  %v1e6 = extractelement <8 x i4> %vec1, i64 6
1507  %v2e6 = extractelement <8 x i4> %vec2, i64 6
1508  %mul6 = mul nuw nsw i4 %v1e6, %v2e6
1509
1510  %v1e7 = extractelement <8 x i4> %vec1, i64 7
1511  %v2e7 = extractelement <8 x i4> %vec2, i64 7
1512  %mul7 = mul nuw nsw i4 %v1e7, %v2e7
1513
1514  %acc = load i4, i4 addrspace(1)* %dst, align 4
1515  %add1 = add i4 %mul0, %acc
1516  %add2 = add i4 %mul1, %add1
1517  %add3 = add i4 %mul2, %add2
1518  %add4 = add i4 %mul3, %add3
1519  %add5 = add i4 %mul4, %add4
1520  %add6 = add i4 %mul5, %add5
1521  %add7 = add i4 %mul6, %add6
1522  %add8 = add i4 %mul7, %add7
1523
1524  store i4 %add8, i4 addrspace(1)* %dst, align 4
1525  ret void
1526}
1527
; Unsigned 8 x i4 dot product with an i32 accumulator in which the first
; product (%mul0) has more than one use: it feeds %add1, the head of the
; accumulation chain, and also %add, which is summed with the end of the chain
; (%add8) to form the stored result. None of the assertion blocks below
; (including GFX9-DL and GFX10-DL) contain v_dot8_u32_u4; the products are
; formed with v_mad_u32_u24 / v_mul_u32_u24 instead.
1528define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
1529; GFX7-LABEL: udot8_multiuses_mul1:
1530; GFX7:       ; %bb.0: ; %entry
1531; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1532; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1533; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1534; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1535; GFX7-NEXT:    s_mov_b32 s14, -1
1536; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
1537; GFX7-NEXT:    s_add_u32 s12, s12, s3
1538; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1539; GFX7-NEXT:    s_mov_b32 s10, 0
1540; GFX7-NEXT:    s_mov_b32 s11, s3
1541; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1542; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1543; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1544; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1545; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1546; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1547; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1548; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
1549; GFX7-NEXT:    s_mov_b32 s2, -1
1550; GFX7-NEXT:    s_addc_u32 s13, s13, 0
1551; GFX7-NEXT:    s_waitcnt vmcnt(1)
1552; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 28, v2
1553; GFX7-NEXT:    v_bfe_u32 v3, v2, 24, 4
1554; GFX7-NEXT:    v_bfe_u32 v4, v2, 20, 4
1555; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 4
1556; GFX7-NEXT:    v_bfe_u32 v6, v2, 12, 4
1557; GFX7-NEXT:    v_bfe_u32 v7, v2, 8, 4
1558; GFX7-NEXT:    v_bfe_u32 v8, v2, 4, 4
1559; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
1560; GFX7-NEXT:    s_waitcnt vmcnt(0)
1561; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
1562; GFX7-NEXT:    v_bfe_u32 v10, v0, 24, 4
1563; GFX7-NEXT:    v_bfe_u32 v11, v0, 20, 4
1564; GFX7-NEXT:    v_bfe_u32 v12, v0, 16, 4
1565; GFX7-NEXT:    v_bfe_u32 v13, v0, 12, 4
1566; GFX7-NEXT:    v_bfe_u32 v14, v0, 8, 4
1567; GFX7-NEXT:    v_bfe_u32 v15, v0, 4, 4
1568; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
1569; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1570; GFX7-NEXT:    v_mad_u32_u24 v16, v2, v0, s4
1571; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v16
1572; GFX7-NEXT:    v_mad_u32_u24 v2, v8, v15, v16
1573; GFX7-NEXT:    v_mad_u32_u24 v2, v7, v14, v2
1574; GFX7-NEXT:    v_mad_u32_u24 v2, v6, v13, v2
1575; GFX7-NEXT:    v_mad_u32_u24 v2, v5, v12, v2
1576; GFX7-NEXT:    v_mad_u32_u24 v2, v4, v11, v2
1577; GFX7-NEXT:    v_mad_u32_u24 v2, v3, v10, v2
1578; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v9, v2
1579; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1580; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1581; GFX7-NEXT:    s_endpgm
1582;
1583; GFX8-LABEL: udot8_multiuses_mul1:
1584; GFX8:       ; %bb.0: ; %entry
1585; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1586; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1587; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1588; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1589; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1590; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1591; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1592; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1593; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1594; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1595; GFX8-NEXT:    v_mov_b32_e32 v1, s7
1596; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
1597; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1598; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1599; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
1600; GFX8-NEXT:    s_mov_b32 s10, -1
1601; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
1602; GFX8-NEXT:    s_add_u32 s8, s8, s3
1603; GFX8-NEXT:    s_addc_u32 s9, s9, 0
1604; GFX8-NEXT:    s_waitcnt vmcnt(1)
1605; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 28, v3
1606; GFX8-NEXT:    v_bfe_u32 v2, v3, 24, 4
1607; GFX8-NEXT:    v_bfe_u32 v4, v3, 20, 4
1608; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 4
1609; GFX8-NEXT:    v_bfe_u32 v6, v3, 12, 4
1610; GFX8-NEXT:    v_bfe_u32 v7, v3, 8, 4
1611; GFX8-NEXT:    v_bfe_u32 v8, v3, 4, 4
1612; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
1613; GFX8-NEXT:    s_waitcnt vmcnt(0)
1614; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
1615; GFX8-NEXT:    v_bfe_u32 v10, v0, 24, 4
1616; GFX8-NEXT:    v_bfe_u32 v11, v0, 20, 4
1617; GFX8-NEXT:    v_bfe_u32 v12, v0, 16, 4
1618; GFX8-NEXT:    v_bfe_u32 v13, v0, 12, 4
1619; GFX8-NEXT:    v_bfe_u32 v14, v0, 8, 4
1620; GFX8-NEXT:    v_bfe_u32 v15, v0, 4, 4
1621; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
1622; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1623; GFX8-NEXT:    v_mad_u32_u24 v16, v3, v0, s2
1624; GFX8-NEXT:    v_mad_u32_u24 v0, v3, v0, v16
1625; GFX8-NEXT:    v_mad_u32_u24 v3, v8, v15, v16
1626; GFX8-NEXT:    v_mad_u32_u24 v3, v7, v14, v3
1627; GFX8-NEXT:    v_mad_u32_u24 v3, v6, v13, v3
1628; GFX8-NEXT:    v_mad_u32_u24 v3, v5, v12, v3
1629; GFX8-NEXT:    v_mad_u32_u24 v3, v4, v11, v3
1630; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v10, v3
1631; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v9, v2
1632; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v0, v1
1633; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1634; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1635; GFX8-NEXT:    flat_store_dword v[0:1], v2
1636; GFX8-NEXT:    s_endpgm
1637;
1638; GFX9-LABEL: udot8_multiuses_mul1:
1639; GFX9:       ; %bb.0: ; %entry
1640; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1641; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1642; GFX9-NEXT:    s_mov_b32 s10, -1
1643; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
1644; GFX9-NEXT:    s_add_u32 s8, s8, s3
1645; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1646; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1647; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1648; GFX9-NEXT:    s_addc_u32 s9, s9, 0
1649; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1650; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
1651; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
1652; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
1653; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1654; GFX9-NEXT:    s_waitcnt vmcnt(1)
1655; GFX9-NEXT:    v_bfe_u32 v3, v1, 4, 4
1656; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
1657; GFX9-NEXT:    v_bfe_u32 v5, v1, 24, 4
1658; GFX9-NEXT:    v_bfe_u32 v6, v1, 20, 4
1659; GFX9-NEXT:    v_bfe_u32 v7, v1, 16, 4
1660; GFX9-NEXT:    v_bfe_u32 v8, v1, 12, 4
1661; GFX9-NEXT:    v_bfe_u32 v9, v1, 8, 4
1662; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
1663; GFX9-NEXT:    s_waitcnt vmcnt(0)
1664; GFX9-NEXT:    v_bfe_u32 v10, v2, 4, 4
1665; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
1666; GFX9-NEXT:    v_bfe_u32 v12, v2, 24, 4
1667; GFX9-NEXT:    v_bfe_u32 v13, v2, 20, 4
1668; GFX9-NEXT:    v_bfe_u32 v14, v2, 16, 4
1669; GFX9-NEXT:    v_bfe_u32 v15, v2, 12, 4
1670; GFX9-NEXT:    v_bfe_u32 v16, v2, 8, 4
1671; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
1672; GFX9-NEXT:    v_mul_u32_u24_e32 v17, v1, v2
1673; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1674; GFX9-NEXT:    v_mad_u32_u24 v1, v1, v2, s0
1675; GFX9-NEXT:    v_mul_u32_u24_e32 v9, v9, v16
1676; GFX9-NEXT:    v_mul_u32_u24_e32 v8, v8, v15
1677; GFX9-NEXT:    v_mad_u32_u24 v2, v3, v10, v1
1678; GFX9-NEXT:    v_mul_u32_u24_e32 v7, v7, v14
1679; GFX9-NEXT:    v_mul_u32_u24_e32 v6, v6, v13
1680; GFX9-NEXT:    v_add3_u32 v2, v2, v9, v8
1681; GFX9-NEXT:    v_mul_u32_u24_e32 v5, v5, v12
1682; GFX9-NEXT:    v_mul_u32_u24_e32 v4, v4, v11
1683; GFX9-NEXT:    v_add3_u32 v2, v2, v7, v6
1684; GFX9-NEXT:    v_add3_u32 v2, v2, v5, v4
1685; GFX9-NEXT:    v_add3_u32 v1, v17, v1, v2
1686; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
1687; GFX9-NEXT:    s_endpgm
1688;
1689; GFX9-DL-LABEL: udot8_multiuses_mul1:
1690; GFX9-DL:       ; %bb.0: ; %entry
1691; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1692; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1693; GFX9-DL-NEXT:    s_mov_b32 s10, -1
1694; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
1695; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
1696; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1697; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1698; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1699; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
1700; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1701; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1702; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1703; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
1704; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1705; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1706; GFX9-DL-NEXT:    v_bfe_u32 v3, v1, 4, 4
1707; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
1708; GFX9-DL-NEXT:    v_bfe_u32 v5, v1, 24, 4
1709; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 20, 4
1710; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 16, 4
1711; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 12, 4
1712; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 8, 4
1713; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
1714; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1715; GFX9-DL-NEXT:    v_bfe_u32 v10, v2, 4, 4
1716; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
1717; GFX9-DL-NEXT:    v_bfe_u32 v12, v2, 24, 4
1718; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 20, 4
1719; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 16, 4
1720; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 12, 4
1721; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 8, 4
1722; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
1723; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v17, v1, v2
1724; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1725; GFX9-DL-NEXT:    v_mad_u32_u24 v1, v1, v2, s0
1726; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v9, v9, v16
1727; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v8, v8, v15
1728; GFX9-DL-NEXT:    v_mad_u32_u24 v2, v3, v10, v1
1729; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v7, v7, v14
1730; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v6, v6, v13
1731; GFX9-DL-NEXT:    v_add3_u32 v2, v2, v9, v8
1732; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v5, v5, v12
1733; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v4, v4, v11
1734; GFX9-DL-NEXT:    v_add3_u32 v2, v2, v7, v6
1735; GFX9-DL-NEXT:    v_add3_u32 v2, v2, v5, v4
1736; GFX9-DL-NEXT:    v_add3_u32 v1, v17, v1, v2
1737; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
1738; GFX9-DL-NEXT:    s_endpgm
1739;
1740; GFX10-DL-LABEL: udot8_multiuses_mul1:
1741; GFX10-DL:       ; %bb.0: ; %entry
1742; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1743; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1744; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1745; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1746; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1747; GFX10-DL-NEXT:    s_mov_b32 s10, -1
1748; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
1749; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
1750; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
1751; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1752; GFX10-DL-NEXT:    s_clause 0x1
1753; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1754; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1755; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
1756; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1757; GFX10-DL-NEXT:    v_and_b32_e32 v8, 15, v1
1758; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1759; GFX10-DL-NEXT:    v_and_b32_e32 v9, 15, v2
1760; GFX10-DL-NEXT:    v_bfe_u32 v0, v1, 4, 4
1761; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v1
1762; GFX10-DL-NEXT:    v_bfe_u32 v4, v1, 24, 4
1763; GFX10-DL-NEXT:    v_bfe_u32 v5, v1, 20, 4
1764; GFX10-DL-NEXT:    v_bfe_u32 v6, v1, 16, 4
1765; GFX10-DL-NEXT:    v_bfe_u32 v7, v1, 12, 4
1766; GFX10-DL-NEXT:    v_bfe_u32 v1, v1, 8, 4
1767; GFX10-DL-NEXT:    v_bfe_u32 v10, v2, 4, 4
1768; GFX10-DL-NEXT:    v_bfe_u32 v11, v2, 8, 4
1769; GFX10-DL-NEXT:    v_bfe_u32 v12, v2, 12, 4
1770; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1771; GFX10-DL-NEXT:    v_mad_u32_u24 v13, v8, v9, s2
1772; GFX10-DL-NEXT:    v_bfe_u32 v14, v2, 20, 4
1773; GFX10-DL-NEXT:    v_bfe_u32 v15, v2, 16, 4
1774; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v1, v1, v11
1775; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v7, v7, v12
1776; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v0, v10, v13
1777; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
1778; GFX10-DL-NEXT:    v_bfe_u32 v2, v2, 24, 4
1779; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v6, v6, v15
1780; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v5, v5, v14
1781; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v1, v7
1782; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v1, v4, v2
1783; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v2, v3, v10
1784; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v3, v8, v9
1785; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v6, v5
1786; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v1, v2
1787; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
1788; GFX10-DL-NEXT:    v_add3_u32 v0, v3, v13, v0
1789; GFX10-DL-NEXT:    global_store_dword v1, v0, s[0:1]
1790; GFX10-DL-NEXT:    s_endpgm
1791                                                <8 x i4> addrspace(1)* %src2,
1792                                                i32 addrspace(1)* nocapture %dst) {
1793entry:
  ; Each work item loads the <8 x i4> element of %src1 and %src2 selected by
  ; its workitem id.
1794  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1795  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
1796  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
1797  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
1798  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
1799
  ; Lane-by-lane products: extract lane k of each vector, zero-extend it to
  ; i32, and multiply (%mul0 .. %mul7).
1800  %v1e0 = extractelement <8 x i4> %vec1, i64 0
1801  %cv1e0 = zext i4 %v1e0 to i32
1802  %v2e0 = extractelement <8 x i4> %vec2, i64 0
1803  %cv2e0 = zext i4 %v2e0 to i32
1804  %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0
1805
1806  %v1e1 = extractelement <8 x i4> %vec1, i64 1
1807  %cv1e1 = zext i4 %v1e1 to i32
1808  %v2e1 = extractelement <8 x i4> %vec2, i64 1
1809  %cv2e1 = zext i4 %v2e1 to i32
1810  %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1
1811
1812  %v1e2 = extractelement <8 x i4> %vec1, i64 2
1813  %cv1e2 = zext i4 %v1e2 to i32
1814  %v2e2 = extractelement <8 x i4> %vec2, i64 2
1815  %cv2e2 = zext i4 %v2e2 to i32
1816  %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2
1817
1818  %v1e3 = extractelement <8 x i4> %vec1, i64 3
1819  %cv1e3 = zext i4 %v1e3 to i32
1820  %v2e3 = extractelement <8 x i4> %vec2, i64 3
1821  %cv2e3 = zext i4 %v2e3 to i32
1822  %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3
1823
1824  %v1e4 = extractelement <8 x i4> %vec1, i64 4
1825  %cv1e4 = zext i4 %v1e4 to i32
1826  %v2e4 = extractelement <8 x i4> %vec2, i64 4
1827  %cv2e4 = zext i4 %v2e4 to i32
1828  %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4
1829
1830  %v1e5 = extractelement <8 x i4> %vec1, i64 5
1831  %cv1e5 = zext i4 %v1e5 to i32
1832  %v2e5 = extractelement <8 x i4> %vec2, i64 5
1833  %cv2e5 = zext i4 %v2e5 to i32
1834  %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5
1835
1836  %v1e6 = extractelement <8 x i4> %vec1, i64 6
1837  %cv1e6 = zext i4 %v1e6 to i32
1838  %v2e6 = extractelement <8 x i4> %vec2, i64 6
1839  %cv2e6 = zext i4 %v2e6 to i32
1840  %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6
1841
1842  %v1e7 = extractelement <8 x i4> %vec1, i64 7
1843  %cv1e7 = zext i4 %v1e7 to i32
1844  %v2e7 = extractelement <8 x i4> %vec2, i64 7
1845  %cv2e7 = zext i4 %v2e7 to i32
1846  %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
1847
  ; Accumulation chain %add1 .. %add8 starts from the i32 loaded from %dst.
  ; %add is the extra use of %mul0: it is computed from %add1 but is not part
  ; of the chain that continues through %add2.
1848  %acc = load i32, i32 addrspace(1)* %dst, align 4
1849  %add1 = add i32 %mul0, %acc
1850  %add = add i32  %mul0, %add1
1851  %add2 = add i32 %add1, %mul1
1852  %add3 = add i32 %add2, %mul2
1853  %add4 = add i32 %add3, %mul3
1854  %add5 = add i32 %add4, %mul4
1855  %add6 = add i32 %add5, %mul5
1856  %add7 = add i32 %add6, %mul6
1857  %add8 = add i32 %add7, %mul7
1858
  ; The stored value sums the side use (%add) with the end of the chain (%add8).
1859  %res = add i32 %add, %add8
1860  store i32 %res, i32 addrspace(1)* %dst, align 4
1861  ret void
1862}
1863
; Unsigned 8 x i4 dot product with an i32 accumulator where the multiply is
; written as one <8 x i32> vector mul of the zero-extended operands; the eight
; products are then extracted and added one by one onto the value loaded from
; %dst. The GFX9-DL and GFX10-DL assertions expect a single v_dot8_u32_u4;
; GFX7 and GFX8 expect a v_mad_u32_u24 chain, and GFX9 expects
; v_mul_u32_u24 followed by v_add3_u32.
1864define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
1865; GFX7-LABEL: udot8_acc32_vecMul:
1866; GFX7:       ; %bb.0: ; %entry
1867; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1868; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1869; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1870; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1871; GFX7-NEXT:    s_mov_b32 s14, -1
1872; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
1873; GFX7-NEXT:    s_add_u32 s12, s12, s3
1874; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1875; GFX7-NEXT:    s_mov_b32 s10, 0
1876; GFX7-NEXT:    s_mov_b32 s11, s3
1877; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1878; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1879; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1880; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1881; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1882; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1883; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1884; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
1885; GFX7-NEXT:    s_mov_b32 s2, -1
1886; GFX7-NEXT:    s_addc_u32 s13, s13, 0
1887; GFX7-NEXT:    s_waitcnt vmcnt(1)
1888; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 28, v2
1889; GFX7-NEXT:    v_bfe_u32 v3, v2, 24, 4
1890; GFX7-NEXT:    v_bfe_u32 v4, v2, 20, 4
1891; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 4
1892; GFX7-NEXT:    v_bfe_u32 v6, v2, 12, 4
1893; GFX7-NEXT:    v_bfe_u32 v7, v2, 8, 4
1894; GFX7-NEXT:    v_bfe_u32 v8, v2, 4, 4
1895; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
1896; GFX7-NEXT:    s_waitcnt vmcnt(0)
1897; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
1898; GFX7-NEXT:    v_bfe_u32 v10, v0, 24, 4
1899; GFX7-NEXT:    v_bfe_u32 v11, v0, 20, 4
1900; GFX7-NEXT:    v_bfe_u32 v12, v0, 16, 4
1901; GFX7-NEXT:    v_bfe_u32 v13, v0, 12, 4
1902; GFX7-NEXT:    v_bfe_u32 v14, v0, 8, 4
1903; GFX7-NEXT:    v_bfe_u32 v15, v0, 4, 4
1904; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
1905; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1906; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, s4
1907; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
1908; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
1909; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
1910; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
1911; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
1912; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
1913; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v9, v0
1914; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1915; GFX7-NEXT:    s_endpgm
1916;
1917; GFX8-LABEL: udot8_acc32_vecMul:
1918; GFX8:       ; %bb.0: ; %entry
1919; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1920; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1921; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1922; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1923; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1924; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1925; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1926; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1927; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1928; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1929; GFX8-NEXT:    v_mov_b32_e32 v1, s7
1930; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
1931; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1932; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1933; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
1934; GFX8-NEXT:    s_mov_b32 s10, -1
1935; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
1936; GFX8-NEXT:    s_add_u32 s8, s8, s3
1937; GFX8-NEXT:    s_addc_u32 s9, s9, 0
1938; GFX8-NEXT:    s_waitcnt vmcnt(1)
1939; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 28, v3
1940; GFX8-NEXT:    v_bfe_u32 v2, v3, 24, 4
1941; GFX8-NEXT:    v_bfe_u32 v4, v3, 20, 4
1942; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 4
1943; GFX8-NEXT:    v_bfe_u32 v6, v3, 12, 4
1944; GFX8-NEXT:    v_bfe_u32 v7, v3, 8, 4
1945; GFX8-NEXT:    v_bfe_u32 v8, v3, 4, 4
1946; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
1947; GFX8-NEXT:    s_waitcnt vmcnt(0)
1948; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
1949; GFX8-NEXT:    v_bfe_u32 v10, v0, 24, 4
1950; GFX8-NEXT:    v_bfe_u32 v11, v0, 20, 4
1951; GFX8-NEXT:    v_bfe_u32 v12, v0, 16, 4
1952; GFX8-NEXT:    v_bfe_u32 v13, v0, 12, 4
1953; GFX8-NEXT:    v_bfe_u32 v14, v0, 8, 4
1954; GFX8-NEXT:    v_bfe_u32 v15, v0, 4, 4
1955; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
1956; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1957; GFX8-NEXT:    v_mad_u32_u24 v0, v3, v0, s2
1958; GFX8-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
1959; GFX8-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
1960; GFX8-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
1961; GFX8-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
1962; GFX8-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
1963; GFX8-NEXT:    v_mad_u32_u24 v0, v2, v10, v0
1964; GFX8-NEXT:    v_mad_u32_u24 v2, v1, v9, v0
1965; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1966; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1967; GFX8-NEXT:    flat_store_dword v[0:1], v2
1968; GFX8-NEXT:    s_endpgm
1969;
1970; GFX9-LABEL: udot8_acc32_vecMul:
1971; GFX9:       ; %bb.0: ; %entry
1972; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1973; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1974; GFX9-NEXT:    s_mov_b32 s10, -1
1975; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
1976; GFX9-NEXT:    s_add_u32 s8, s8, s3
1977; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1978; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1979; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1980; GFX9-NEXT:    s_addc_u32 s9, s9, 0
1981; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1982; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
1983; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
1984; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
1985; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1986; GFX9-NEXT:    s_waitcnt vmcnt(1)
1987; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 28, v1
1988; GFX9-NEXT:    v_bfe_u32 v4, v1, 24, 4
1989; GFX9-NEXT:    v_bfe_u32 v5, v1, 20, 4
1990; GFX9-NEXT:    v_bfe_u32 v6, v1, 16, 4
1991; GFX9-NEXT:    v_bfe_u32 v7, v1, 12, 4
1992; GFX9-NEXT:    v_bfe_u32 v8, v1, 8, 4
1993; GFX9-NEXT:    v_bfe_u32 v9, v1, 4, 4
1994; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
1995; GFX9-NEXT:    s_waitcnt vmcnt(0)
1996; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
1997; GFX9-NEXT:    v_bfe_u32 v11, v2, 24, 4
1998; GFX9-NEXT:    v_bfe_u32 v12, v2, 20, 4
1999; GFX9-NEXT:    v_bfe_u32 v13, v2, 16, 4
2000; GFX9-NEXT:    v_bfe_u32 v14, v2, 12, 4
2001; GFX9-NEXT:    v_bfe_u32 v15, v2, 8, 4
2002; GFX9-NEXT:    v_bfe_u32 v16, v2, 4, 4
2003; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
2004; GFX9-NEXT:    v_mul_u32_u24_e32 v1, v1, v2
2005; GFX9-NEXT:    v_mul_u32_u24_e32 v2, v9, v16
2006; GFX9-NEXT:    v_mul_u32_u24_e32 v8, v8, v15
2007; GFX9-NEXT:    v_mul_u32_u24_e32 v7, v7, v14
2008; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2009; GFX9-NEXT:    v_add3_u32 v1, v1, s0, v2
2010; GFX9-NEXT:    v_mul_u32_u24_e32 v6, v6, v13
2011; GFX9-NEXT:    v_mul_u32_u24_e32 v5, v5, v12
2012; GFX9-NEXT:    v_add3_u32 v1, v1, v8, v7
2013; GFX9-NEXT:    v_mul_u32_u24_e32 v4, v4, v11
2014; GFX9-NEXT:    v_mul_u32_u24_e32 v3, v3, v10
2015; GFX9-NEXT:    v_add3_u32 v1, v1, v6, v5
2016; GFX9-NEXT:    v_add3_u32 v1, v1, v4, v3
2017; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
2018; GFX9-NEXT:    s_endpgm
2019;
2020; GFX9-DL-LABEL: udot8_acc32_vecMul:
2021; GFX9-DL:       ; %bb.0: ; %entry
2022; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2023; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2024; GFX9-DL-NEXT:    s_mov_b32 s10, -1
2025; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
2026; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
2027; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2028; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2029; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2030; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
2031; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2032; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2033; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2034; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
2035; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2036; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2037; GFX9-DL-NEXT:    v_dot8_u32_u4 v1, v1, v2, s0
2038; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
2039; GFX9-DL-NEXT:    s_endpgm
2040;
2041; GFX10-DL-LABEL: udot8_acc32_vecMul:
2042; GFX10-DL:       ; %bb.0: ; %entry
2043; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2044; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2045; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2046; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2047; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2048; GFX10-DL-NEXT:    s_mov_b32 s10, -1
2049; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
2050; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
2051; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
2052; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2053; GFX10-DL-NEXT:    s_clause 0x1
2054; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2055; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2056; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
2057; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
2058; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2059; GFX10-DL-NEXT:    v_dot8_u32_u4 v1, v1, v2, s2
2060; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
2061; GFX10-DL-NEXT:    s_endpgm
2062                                              <8 x i4> addrspace(1)* %src2,
2063                                              i32 addrspace(1)* nocapture %dst) {
2064entry:
  ; Each work item loads the <8 x i4> element of %src1 and %src2 selected by
  ; its workitem id.
2065  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2066  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
2067  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
2068  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
2069  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
2070
  ; Whole-vector zero-extension to <8 x i32>.
2071  %cvec1 = zext <8 x i4> %vec1 to <8 x i32>
2072  %cvec2 = zext <8 x i4> %vec2 to <8 x i32>
2073
  ; One vector multiply (no nuw/nsw flags), then the eight products are
  ; extracted as scalars.
2074  %mul = mul <8 x i32> %cvec1, %cvec2
2075  %mul0 = extractelement <8 x i32> %mul, i64 0
2076  %mul1 = extractelement <8 x i32> %mul, i64 1
2077  %mul2 = extractelement <8 x i32> %mul, i64 2
2078  %mul3 = extractelement <8 x i32> %mul, i64 3
2079  %mul4 = extractelement <8 x i32> %mul, i64 4
2080  %mul5 = extractelement <8 x i32> %mul, i64 5
2081  %mul6 = extractelement <8 x i32> %mul, i64 6
2082  %mul7 = extractelement <8 x i32> %mul, i64 7
2083
  ; Linear accumulation chain seeded with the i32 loaded from %dst; the final
  ; sum %add8 is stored back to %dst.
2084  %acc = load i32, i32 addrspace(1)* %dst, align 4
2085  %add1 = add i32 %mul0, %acc
2086  %add2 = add i32 %add1, %mul1
2087  %add3 = add i32 %add2, %mul2
2088  %add4 = add i32 %add3, %mul3
2089  %add5 = add i32 %add4, %mul4
2090  %add6 = add i32 %add5, %mul5
2091  %add7 = add i32 %add6, %mul6
2092  %add8 = add i32 %add7, %mul7
2093
2094  store i32 %add8, i32 addrspace(1)* %dst, align 4
2095  ret void
2096}
2097
2098; TODO: Clean up the code (by default pk_mad_I16 should be generated), then
2099; support the pattern.
2100define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
2101; GFX7-LABEL: udot8_acc16_vecMul:
2102; GFX7:       ; %bb.0: ; %entry
2103; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2104; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2105; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2106; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2107; GFX7-NEXT:    s_mov_b32 s14, -1
2108; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
2109; GFX7-NEXT:    s_add_u32 s12, s12, s3
2110; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2111; GFX7-NEXT:    s_mov_b32 s10, 0
2112; GFX7-NEXT:    s_mov_b32 s11, s3
2113; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2114; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
2115; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2116; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2117; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2118; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
2119; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2120; GFX7-NEXT:    s_mov_b32 s2, -1
2121; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
2122; GFX7-NEXT:    s_addc_u32 s13, s13, 0
2123; GFX7-NEXT:    s_waitcnt vmcnt(2)
2124; GFX7-NEXT:    v_bfe_u32 v8, v2, 20, 4
2125; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 12, v2
2126; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 28, v2
2127; GFX7-NEXT:    v_bfe_u32 v4, v2, 24, 4
2128; GFX7-NEXT:    v_bfe_u32 v5, v2, 12, 4
2129; GFX7-NEXT:    v_bfe_u32 v6, v2, 8, 4
2130; GFX7-NEXT:    v_and_b32_e32 v7, 15, v2
2131; GFX7-NEXT:    v_alignbit_b32 v2, v8, v2, 16
2132; GFX7-NEXT:    v_and_b32_e32 v8, 0xf0000, v9
2133; GFX7-NEXT:    s_waitcnt vmcnt(1)
2134; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 12, v0
2135; GFX7-NEXT:    v_and_b32_e32 v14, 15, v0
2136; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
2137; GFX7-NEXT:    v_and_b32_e32 v8, 0xf0000, v9
2138; GFX7-NEXT:    v_or_b32_e32 v8, v14, v8
2139; GFX7-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
2140; GFX7-NEXT:    v_and_b32_e32 v7, 15, v7
2141; GFX7-NEXT:    v_lshrrev_b32_e32 v14, 16, v8
2142; GFX7-NEXT:    v_and_b32_e32 v8, 15, v8
2143; GFX7-NEXT:    s_waitcnt vmcnt(0)
2144; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v8, v1
2145; GFX7-NEXT:    v_bfe_u32 v13, v0, 8, 4
2146; GFX7-NEXT:    v_bfe_u32 v15, v0, 20, 4
2147; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v14, v1
2148; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 28, v0
2149; GFX7-NEXT:    v_bfe_u32 v11, v0, 24, 4
2150; GFX7-NEXT:    v_bfe_u32 v12, v0, 12, 4
2151; GFX7-NEXT:    v_alignbit_b32 v0, v15, v0, 16
2152; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v13, v1
2153; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
2154; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
2155; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v0
2156; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
2157; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v12, v1
2158; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
2159; GFX7-NEXT:    v_mad_u32_u24 v0, v16, v15, v0
2160; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
2161; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
2162; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
2163; GFX7-NEXT:    s_endpgm
2164;
2165; GFX8-LABEL: udot8_acc16_vecMul:
2166; GFX8:       ; %bb.0: ; %entry
2167; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2168; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2169; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2170; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2171; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2172; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2173; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2174; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2175; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2176; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2177; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2178; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
2179; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2180; GFX8-NEXT:    flat_load_dword v2, v[0:1]
2181; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2182; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2183; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
2184; GFX8-NEXT:    s_mov_b32 s10, -1
2185; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
2186; GFX8-NEXT:    s_add_u32 s8, s8, s3
2187; GFX8-NEXT:    s_addc_u32 s9, s9, 0
2188; GFX8-NEXT:    s_waitcnt vmcnt(2)
2189; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 28, v3
2190; GFX8-NEXT:    v_bfe_u32 v6, v3, 24, 4
2191; GFX8-NEXT:    v_bfe_u32 v7, v3, 20, 4
2192; GFX8-NEXT:    v_bfe_u32 v8, v3, 16, 4
2193; GFX8-NEXT:    v_bfe_u32 v9, v3, 12, 4
2194; GFX8-NEXT:    v_bfe_u32 v10, v3, 8, 4
2195; GFX8-NEXT:    v_bfe_u32 v11, v3, 4, 4
2196; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
2197; GFX8-NEXT:    s_waitcnt vmcnt(1)
2198; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
2199; GFX8-NEXT:    v_bfe_u32 v13, v2, 24, 4
2200; GFX8-NEXT:    v_bfe_u32 v14, v2, 20, 4
2201; GFX8-NEXT:    v_bfe_u32 v15, v2, 16, 4
2202; GFX8-NEXT:    v_bfe_u32 v16, v2, 12, 4
2203; GFX8-NEXT:    v_bfe_u32 v17, v2, 8, 4
2204; GFX8-NEXT:    v_bfe_u32 v18, v2, 4, 4
2205; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
2206; GFX8-NEXT:    s_waitcnt vmcnt(0)
2207; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
2208; GFX8-NEXT:    v_mad_u16 v2, v11, v18, v2
2209; GFX8-NEXT:    v_mad_u16 v2, v10, v17, v2
2210; GFX8-NEXT:    v_mad_u16 v2, v9, v16, v2
2211; GFX8-NEXT:    v_mad_u16 v2, v8, v15, v2
2212; GFX8-NEXT:    v_mad_u16 v2, v7, v14, v2
2213; GFX8-NEXT:    v_mad_u16 v2, v6, v13, v2
2214; GFX8-NEXT:    v_mad_u16 v2, v5, v12, v2
2215; GFX8-NEXT:    flat_store_short v[0:1], v2
2216; GFX8-NEXT:    s_endpgm
2217;
2218; GFX9-LABEL: udot8_acc16_vecMul:
2219; GFX9:       ; %bb.0: ; %entry
2220; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2221; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2222; GFX9-NEXT:    s_mov_b32 s10, -1
2223; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
2224; GFX9-NEXT:    s_add_u32 s8, s8, s3
2225; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2226; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2227; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2228; GFX9-NEXT:    s_addc_u32 s9, s9, 0
2229; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2230; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
2231; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
2232; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2233; GFX9-NEXT:    global_load_ushort v3, v0, s[2:3]
2234; GFX9-NEXT:    s_waitcnt vmcnt(2)
2235; GFX9-NEXT:    v_and_b32_e32 v5, 15, v1
2236; GFX9-NEXT:    v_bfe_u32 v7, v1, 8, 4
2237; GFX9-NEXT:    s_waitcnt vmcnt(1)
2238; GFX9-NEXT:    v_and_b32_e32 v12, 15, v2
2239; GFX9-NEXT:    v_bfe_u32 v4, v1, 4, 4
2240; GFX9-NEXT:    v_bfe_u32 v6, v1, 12, 4
2241; GFX9-NEXT:    v_bfe_u32 v11, v2, 4, 4
2242; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff, v7
2243; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff, v12
2244; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff, v5
2245; GFX9-NEXT:    v_bfe_u32 v9, v1, 16, 4
2246; GFX9-NEXT:    v_bfe_u32 v14, v2, 8, 4
2247; GFX9-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
2248; GFX9-NEXT:    v_lshl_or_b32 v7, v11, 16, v12
2249; GFX9-NEXT:    v_lshl_or_b32 v4, v4, 16, v5
2250; GFX9-NEXT:    v_bfe_u32 v8, v1, 20, 4
2251; GFX9-NEXT:    v_bfe_u32 v13, v2, 12, 4
2252; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff, v9
2253; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff, v14
2254; GFX9-NEXT:    v_pk_mul_lo_u16 v4, v4, v7
2255; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v1
2256; GFX9-NEXT:    v_bfe_u32 v1, v1, 24, 4
2257; GFX9-NEXT:    v_bfe_u32 v15, v2, 20, 4
2258; GFX9-NEXT:    v_bfe_u32 v16, v2, 16, 4
2259; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 28, v2
2260; GFX9-NEXT:    v_bfe_u32 v2, v2, 24, 4
2261; GFX9-NEXT:    v_lshl_or_b32 v8, v8, 16, v9
2262; GFX9-NEXT:    v_lshl_or_b32 v9, v13, 16, v14
2263; GFX9-NEXT:    s_waitcnt vmcnt(0)
2264; GFX9-NEXT:    v_add_u16_e32 v3, v4, v3
2265; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2266; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2267; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff, v16
2268; GFX9-NEXT:    v_pk_mul_lo_u16 v5, v6, v9
2269; GFX9-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2270; GFX9-NEXT:    v_lshl_or_b32 v2, v17, 16, v2
2271; GFX9-NEXT:    v_lshl_or_b32 v1, v10, 16, v1
2272; GFX9-NEXT:    v_lshl_or_b32 v10, v15, 16, v16
2273; GFX9-NEXT:    v_add_u16_e32 v3, v3, v5
2274; GFX9-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
2275; GFX9-NEXT:    v_pk_mul_lo_u16 v2, v8, v10
2276; GFX9-NEXT:    v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2277; GFX9-NEXT:    v_add_u16_e32 v3, v3, v2
2278; GFX9-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2279; GFX9-NEXT:    v_add_u16_e32 v2, v2, v1
2280; GFX9-NEXT:    v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2281; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
2282; GFX9-NEXT:    s_endpgm
2283;
2284; GFX9-DL-LABEL: udot8_acc16_vecMul:
2285; GFX9-DL:       ; %bb.0: ; %entry
2286; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2287; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2288; GFX9-DL-NEXT:    s_mov_b32 s10, -1
2289; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
2290; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
2291; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2292; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2293; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2294; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
2295; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2296; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2297; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2298; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2299; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
2300; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
2301; GFX9-DL-NEXT:    v_and_b32_e32 v5, 15, v1
2302; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 8, 4
2303; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2304; GFX9-DL-NEXT:    v_and_b32_e32 v12, 15, v2
2305; GFX9-DL-NEXT:    v_bfe_u32 v4, v1, 4, 4
2306; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 12, 4
2307; GFX9-DL-NEXT:    v_bfe_u32 v11, v2, 4, 4
2308; GFX9-DL-NEXT:    v_and_b32_e32 v7, 0xffff, v7
2309; GFX9-DL-NEXT:    v_and_b32_e32 v12, 0xffff, v12
2310; GFX9-DL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
2311; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 16, 4
2312; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 8, 4
2313; GFX9-DL-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
2314; GFX9-DL-NEXT:    v_lshl_or_b32 v7, v11, 16, v12
2315; GFX9-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v5
2316; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 20, 4
2317; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 12, 4
2318; GFX9-DL-NEXT:    v_and_b32_e32 v9, 0xffff, v9
2319; GFX9-DL-NEXT:    v_and_b32_e32 v14, 0xffff, v14
2320; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v7
2321; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 28, v1
2322; GFX9-DL-NEXT:    v_bfe_u32 v1, v1, 24, 4
2323; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 20, 4
2324; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 16, 4
2325; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v17, 28, v2
2326; GFX9-DL-NEXT:    v_bfe_u32 v2, v2, 24, 4
2327; GFX9-DL-NEXT:    v_lshl_or_b32 v8, v8, 16, v9
2328; GFX9-DL-NEXT:    v_lshl_or_b32 v9, v13, 16, v14
2329; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2330; GFX9-DL-NEXT:    v_add_u16_e32 v3, v4, v3
2331; GFX9-DL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2332; GFX9-DL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2333; GFX9-DL-NEXT:    v_and_b32_e32 v16, 0xffff, v16
2334; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, v6, v9
2335; GFX9-DL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2336; GFX9-DL-NEXT:    v_lshl_or_b32 v2, v17, 16, v2
2337; GFX9-DL-NEXT:    v_lshl_or_b32 v1, v10, 16, v1
2338; GFX9-DL-NEXT:    v_lshl_or_b32 v10, v15, 16, v16
2339; GFX9-DL-NEXT:    v_add_u16_e32 v3, v3, v5
2340; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
2341; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v8, v10
2342; GFX9-DL-NEXT:    v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2343; GFX9-DL-NEXT:    v_add_u16_e32 v3, v3, v2
2344; GFX9-DL-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2345; GFX9-DL-NEXT:    v_add_u16_e32 v2, v2, v1
2346; GFX9-DL-NEXT:    v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2347; GFX9-DL-NEXT:    global_store_short v0, v1, s[2:3]
2348; GFX9-DL-NEXT:    s_endpgm
2349;
2350; GFX10-DL-LABEL: udot8_acc16_vecMul:
2351; GFX10-DL:       ; %bb.0: ; %entry
2352; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2353; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2354; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2355; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2356; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2357; GFX10-DL-NEXT:    s_mov_b32 s10, -1
2358; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
2359; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
2360; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
2361; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2362; GFX10-DL-NEXT:    s_clause 0x1
2363; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2364; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2365; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
2366; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[0:1]
2367; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
2368; GFX10-DL-NEXT:    v_and_b32_e32 v6, 15, v1
2369; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2370; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v2
2371; GFX10-DL-NEXT:    v_bfe_u32 v4, v1, 4, 4
2372; GFX10-DL-NEXT:    v_bfe_u32 v9, v2, 4, 4
2373; GFX10-DL-NEXT:    v_bfe_u32 v8, v1, 8, 4
2374; GFX10-DL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
2375; GFX10-DL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
2376; GFX10-DL-NEXT:    v_bfe_u32 v12, v2, 8, 4
2377; GFX10-DL-NEXT:    v_bfe_u32 v7, v1, 12, 4
2378; GFX10-DL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
2379; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v6
2380; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v9, 16, v5
2381; GFX10-DL-NEXT:    v_bfe_u32 v9, v2, 12, 4
2382; GFX10-DL-NEXT:    v_and_b32_e32 v12, 0xffff, v12
2383; GFX10-DL-NEXT:    v_bfe_u32 v11, v1, 16, 4
2384; GFX10-DL-NEXT:    v_lshl_or_b32 v7, v7, 16, v8
2385; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
2386; GFX10-DL-NEXT:    v_bfe_u32 v5, v2, 16, 4
2387; GFX10-DL-NEXT:    v_lshl_or_b32 v9, v9, 16, v12
2388; GFX10-DL-NEXT:    v_bfe_u32 v10, v1, 20, 4
2389; GFX10-DL-NEXT:    v_and_b32_e32 v11, 0xffff, v11
2390; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
2391; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2392; GFX10-DL-NEXT:    v_add_nc_u16 v3, v4, v3
2393; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 20, 4
2394; GFX10-DL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
2395; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v7, v7, v9
2396; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
2397; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v8
2398; GFX10-DL-NEXT:    v_bfe_u32 v1, v1, 24, 4
2399; GFX10-DL-NEXT:    v_bfe_u32 v8, v2, 24, 4
2400; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v5
2401; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v10, 16, v11
2402; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
2403; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v7
2404; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
2405; GFX10-DL-NEXT:    v_and_b32_e32 v7, 0xffff, v8
2406; GFX10-DL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2407; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v5, v4
2408; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v9
2409; GFX10-DL-NEXT:    v_lshl_or_b32 v2, v2, 16, v7
2410; GFX10-DL-NEXT:    v_lshl_or_b32 v1, v6, 16, v1
2411; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
2412; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v4
2413; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
2414; GFX10-DL-NEXT:    v_add_nc_u16 v2, v3, v5
2415; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2416; GFX10-DL-NEXT:    v_add_nc_u16 v1, v2, v1
2417; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v3
2418; GFX10-DL-NEXT:    global_store_short v0, v1, s[0:1]
2419; GFX10-DL-NEXT:    s_endpgm
2420                                              <8 x i4> addrspace(1)* %src2,
2421                                              i16 addrspace(1)* nocapture %dst) {
2422entry:
2423  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2424  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
2425  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
2426  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
2427  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
2428
2429  %cvec1 = zext <8 x i4> %vec1 to <8 x i16>
2430  %cvec2 = zext <8 x i4> %vec2 to <8 x i16>
2431
2432  %mul = mul <8 x i16> %cvec1, %cvec2
2433  %mul0 = extractelement <8 x i16> %mul, i64 0
2434  %mul1 = extractelement <8 x i16> %mul, i64 1
2435  %mul2 = extractelement <8 x i16> %mul, i64 2
2436  %mul3 = extractelement <8 x i16> %mul, i64 3
2437  %mul4 = extractelement <8 x i16> %mul, i64 4
2438  %mul5 = extractelement <8 x i16> %mul, i64 5
2439  %mul6 = extractelement <8 x i16> %mul, i64 6
2440  %mul7 = extractelement <8 x i16> %mul, i64 7
2441
2442  %acc = load i16, i16 addrspace(1)* %dst, align 4
2443  %add1 = add i16 %mul0, %acc
2444  %add2 = add i16 %add1, %mul1
2445  %add3 = add i16 %add2, %mul2
2446  %add4 = add i16 %add3, %mul3
2447  %add5 = add i16 %add4, %mul4
2448  %add6 = add i16 %add5, %mul5
2449  %add7 = add i16 %add6, %mul6
2450  %add8 = add i16 %add7, %mul7
2451
2452  store i16 %add8, i16 addrspace(1)* %dst, align 4
2453  ret void
2454}
2455
2456; TODO: Clean up the code to generate MAD; the pattern should be recognized then.
2457define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
2458; GFX7-LABEL: udot8_acc8_vecMul:
2459; GFX7:       ; %bb.0: ; %entry
2460; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2461; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2462; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2463; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2464; GFX7-NEXT:    s_mov_b32 s14, -1
2465; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
2466; GFX7-NEXT:    s_add_u32 s12, s12, s3
2467; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2468; GFX7-NEXT:    s_mov_b32 s10, 0
2469; GFX7-NEXT:    s_mov_b32 s11, s3
2470; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2471; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
2472; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2473; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2474; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2475; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
2476; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2477; GFX7-NEXT:    s_mov_b32 s2, -1
2478; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
2479; GFX7-NEXT:    s_addc_u32 s13, s13, 0
2480; GFX7-NEXT:    s_waitcnt vmcnt(2)
2481; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 4, v2
2482; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 12, v2
2483; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 4
2484; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 28, v2
2485; GFX7-NEXT:    v_bfe_u32 v7, v2, 16, 4
2486; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 4, v2
2487; GFX7-NEXT:    s_waitcnt vmcnt(1)
2488; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 4, v0
2489; GFX7-NEXT:    v_lshrrev_b32_e32 v13, 28, v0
2490; GFX7-NEXT:    v_and_b32_e32 v8, 0xf00, v8
2491; GFX7-NEXT:    v_and_b32_e32 v4, 0xf00, v4
2492; GFX7-NEXT:    v_and_b32_e32 v5, 15, v2
2493; GFX7-NEXT:    v_bfe_u32 v10, v0, 8, 4
2494; GFX7-NEXT:    v_and_b32_e32 v12, 15, v0
2495; GFX7-NEXT:    v_bfe_u32 v14, v0, 16, 4
2496; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
2497; GFX7-NEXT:    v_alignbit_b32 v2, v6, v2, 24
2498; GFX7-NEXT:    v_and_b32_e32 v6, 0xf00, v9
2499; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 4, v0
2500; GFX7-NEXT:    v_or_b32_e32 v7, v7, v8
2501; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
2502; GFX7-NEXT:    v_alignbit_b32 v0, v13, v0, 24
2503; GFX7-NEXT:    v_and_b32_e32 v8, 0xf00, v11
2504; GFX7-NEXT:    v_or_b32_e32 v5, v5, v6
2505; GFX7-NEXT:    v_and_b32_e32 v4, 0xf00, v15
2506; GFX7-NEXT:    v_and_b32_e32 v6, 0xf00, v9
2507; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2508; GFX7-NEXT:    v_and_b32_e32 v0, 0xf0f, v0
2509; GFX7-NEXT:    v_or_b32_e32 v8, v10, v8
2510; GFX7-NEXT:    v_and_b32_e32 v2, 0xf0f, v2
2511; GFX7-NEXT:    v_or_b32_e32 v4, v14, v4
2512; GFX7-NEXT:    v_or_b32_e32 v6, v12, v6
2513; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
2514; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2515; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v8
2516; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2517; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
2518; GFX7-NEXT:    v_or_b32_e32 v4, v6, v5
2519; GFX7-NEXT:    v_or_b32_e32 v2, v7, v2
2520; GFX7-NEXT:    v_and_b32_e32 v7, 15, v3
2521; GFX7-NEXT:    v_and_b32_e32 v13, 15, v4
2522; GFX7-NEXT:    v_bfe_u32 v8, v3, 8, 4
2523; GFX7-NEXT:    v_bfe_u32 v14, v4, 8, 4
2524; GFX7-NEXT:    s_waitcnt vmcnt(0)
2525; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v13, v1
2526; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v3
2527; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 4
2528; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 24, v4
2529; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 4
2530; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v14, v1
2531; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
2532; GFX7-NEXT:    v_and_b32_e32 v9, 15, v2
2533; GFX7-NEXT:    v_and_b32_e32 v15, 15, v0
2534; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v11, v1
2535; GFX7-NEXT:    v_bfe_u32 v10, v2, 8, 4
2536; GFX7-NEXT:    v_bfe_u32 v16, v0, 8, 4
2537; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v15, v1
2538; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
2539; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 4
2540; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 24, v0
2541; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 4
2542; GFX7-NEXT:    v_mad_u32_u24 v1, v10, v16, v1
2543; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
2544; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v12, v0
2545; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2546; GFX7-NEXT:    s_endpgm
2547;
2548; GFX8-LABEL: udot8_acc8_vecMul:
2549; GFX8:       ; %bb.0: ; %entry
2550; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2551; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2552; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2553; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2554; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2555; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2556; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2557; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2558; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2559; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2560; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2561; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
2562; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2563; GFX8-NEXT:    flat_load_dword v2, v[0:1]
2564; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2565; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2566; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
2567; GFX8-NEXT:    s_mov_b32 s10, -1
2568; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
2569; GFX8-NEXT:    s_add_u32 s8, s8, s3
2570; GFX8-NEXT:    s_addc_u32 s9, s9, 0
2571; GFX8-NEXT:    s_waitcnt vmcnt(2)
2572; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 28, v3
2573; GFX8-NEXT:    v_bfe_u32 v10, v3, 24, 4
2574; GFX8-NEXT:    v_bfe_u32 v11, v3, 20, 4
2575; GFX8-NEXT:    v_bfe_u32 v7, v3, 12, 4
2576; GFX8-NEXT:    v_bfe_u32 v8, v3, 8, 4
2577; GFX8-NEXT:    v_bfe_u32 v12, v3, 16, 4
2578; GFX8-NEXT:    s_waitcnt vmcnt(1)
2579; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 28, v2
2580; GFX8-NEXT:    v_bfe_u32 v17, v2, 24, 4
2581; GFX8-NEXT:    v_bfe_u32 v18, v2, 20, 4
2582; GFX8-NEXT:    v_bfe_u32 v14, v2, 12, 4
2583; GFX8-NEXT:    v_bfe_u32 v15, v2, 8, 4
2584; GFX8-NEXT:    v_bfe_u32 v19, v2, 16, 4
2585; GFX8-NEXT:    v_mul_lo_u16_sdwa v11, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2586; GFX8-NEXT:    v_mul_lo_u16_e32 v18, v10, v17
2587; GFX8-NEXT:    v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2588; GFX8-NEXT:    v_bfe_u32 v5, v3, 4, 4
2589; GFX8-NEXT:    v_and_b32_e32 v6, 15, v3
2590; GFX8-NEXT:    v_bfe_u32 v3, v2, 4, 4
2591; GFX8-NEXT:    v_and_b32_e32 v13, 15, v2
2592; GFX8-NEXT:    v_mul_lo_u16_e32 v2, v12, v19
2593; GFX8-NEXT:    v_mul_lo_u16_e32 v8, v8, v15
2594; GFX8-NEXT:    v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2595; GFX8-NEXT:    v_or_b32_e32 v9, v18, v9
2596; GFX8-NEXT:    v_mul_lo_u16_sdwa v5, v5, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2597; GFX8-NEXT:    v_or_b32_e32 v3, v2, v11
2598; GFX8-NEXT:    v_or_b32_e32 v7, v8, v7
2599; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
2600; GFX8-NEXT:    v_mul_lo_u16_e32 v6, v6, v13
2601; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
2602; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2603; GFX8-NEXT:    v_or_b32_e32 v6, v6, v5
2604; GFX8-NEXT:    v_or_b32_e32 v5, v5, v2
2605; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
2606; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 24, v[2:3]
2607; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
2608; GFX8-NEXT:    s_waitcnt vmcnt(0)
2609; GFX8-NEXT:    v_add_u16_e32 v3, v6, v4
2610; GFX8-NEXT:    v_add_u16_e32 v3, v3, v5
2611; GFX8-NEXT:    v_add_u16_e32 v3, v3, v7
2612; GFX8-NEXT:    v_add_u16_e32 v2, v3, v2
2613; GFX8-NEXT:    v_mad_u16 v2, v12, v19, v2
2614; GFX8-NEXT:    v_add_u16_e32 v2, v2, v8
2615; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v9
2616; GFX8-NEXT:    v_mad_u16 v2, v10, v17, v2
2617; GFX8-NEXT:    v_add_u16_e32 v2, v2, v9
2618; GFX8-NEXT:    flat_store_byte v[0:1], v2
2619; GFX8-NEXT:    s_endpgm
2620;
2621; GFX9-LABEL: udot8_acc8_vecMul:
2622; GFX9:       ; %bb.0: ; %entry
2623; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2624; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2625; GFX9-NEXT:    s_mov_b32 s10, -1
2626; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
2627; GFX9-NEXT:    s_add_u32 s8, s8, s3
2628; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2629; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2630; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2631; GFX9-NEXT:    v_mov_b32_e32 v3, 0
2632; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2633; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
2634; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
2635; GFX9-NEXT:    global_load_ubyte v4, v3, s[2:3]
2636; GFX9-NEXT:    s_addc_u32 s9, s9, 0
2637; GFX9-NEXT:    s_waitcnt vmcnt(2)
2638; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 28, v1
2639; GFX9-NEXT:    v_bfe_u32 v9, v1, 24, 4
2640; GFX9-NEXT:    v_bfe_u32 v10, v1, 20, 4
2641; GFX9-NEXT:    s_waitcnt vmcnt(1)
2642; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 28, v2
2643; GFX9-NEXT:    v_bfe_u32 v16, v2, 24, 4
2644; GFX9-NEXT:    v_bfe_u32 v17, v2, 20, 4
2645; GFX9-NEXT:    v_bfe_u32 v0, v1, 4, 4
2646; GFX9-NEXT:    v_and_b32_e32 v5, 15, v1
2647; GFX9-NEXT:    v_bfe_u32 v6, v1, 12, 4
2648; GFX9-NEXT:    v_bfe_u32 v7, v1, 8, 4
2649; GFX9-NEXT:    v_bfe_u32 v11, v1, 16, 4
2650; GFX9-NEXT:    v_bfe_u32 v1, v2, 4, 4
2651; GFX9-NEXT:    v_and_b32_e32 v12, 15, v2
2652; GFX9-NEXT:    v_bfe_u32 v13, v2, 12, 4
2653; GFX9-NEXT:    v_bfe_u32 v14, v2, 8, 4
2654; GFX9-NEXT:    v_bfe_u32 v2, v2, 16, 4
2655; GFX9-NEXT:    v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2656; GFX9-NEXT:    v_mul_lo_u16_e32 v17, v9, v16
2657; GFX9-NEXT:    v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2658; GFX9-NEXT:    v_mul_lo_u16_e32 v18, v11, v2
2659; GFX9-NEXT:    v_mul_lo_u16_e32 v7, v7, v14
2660; GFX9-NEXT:    v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2661; GFX9-NEXT:    v_or_b32_e32 v8, v17, v8
2662; GFX9-NEXT:    v_mul_lo_u16_e32 v5, v5, v12
2663; GFX9-NEXT:    v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2664; GFX9-NEXT:    v_or_b32_e32 v1, v18, v10
2665; GFX9-NEXT:    v_or_b32_e32 v6, v7, v6
2666; GFX9-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
2667; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
2668; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2669; GFX9-NEXT:    v_or_b32_e32 v5, v5, v12
2670; GFX9-NEXT:    v_or_b32_e32 v7, v12, v0
2671; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v1
2672; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]
2673; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v7
2674; GFX9-NEXT:    s_waitcnt vmcnt(0)
2675; GFX9-NEXT:    v_add_u16_e32 v1, v5, v4
2676; GFX9-NEXT:    v_add_u16_e32 v1, v1, v7
2677; GFX9-NEXT:    v_add_u16_e32 v1, v1, v6
2678; GFX9-NEXT:    v_add_u16_e32 v0, v1, v0
2679; GFX9-NEXT:    v_mad_legacy_u16 v0, v11, v2, v0
2680; GFX9-NEXT:    v_add_u16_e32 v0, v0, v10
2681; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v8
2682; GFX9-NEXT:    v_mad_legacy_u16 v0, v9, v16, v0
2683; GFX9-NEXT:    v_add_u16_e32 v0, v0, v8
2684; GFX9-NEXT:    global_store_byte v3, v0, s[2:3]
2685; GFX9-NEXT:    s_endpgm
2686;
2687; GFX9-DL-LABEL: udot8_acc8_vecMul:
2688; GFX9-DL:       ; %bb.0: ; %entry
2689; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2690; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2691; GFX9-DL-NEXT:    s_mov_b32 s10, -1
2692; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
2693; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
2694; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2695; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2696; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2697; GFX9-DL-NEXT:    v_mov_b32_e32 v3, 0
2698; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2699; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2700; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2701; GFX9-DL-NEXT:    global_load_ubyte v4, v3, s[2:3]
2702; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
2703; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
2704; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 28, v1
2705; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 24, 4
2706; GFX9-DL-NEXT:    v_bfe_u32 v10, v1, 20, 4
2707; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2708; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v15, 28, v2
2709; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 24, 4
2710; GFX9-DL-NEXT:    v_bfe_u32 v17, v2, 20, 4
2711; GFX9-DL-NEXT:    v_bfe_u32 v0, v1, 4, 4
2712; GFX9-DL-NEXT:    v_and_b32_e32 v5, 15, v1
2713; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 12, 4
2714; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 8, 4
2715; GFX9-DL-NEXT:    v_bfe_u32 v11, v1, 16, 4
2716; GFX9-DL-NEXT:    v_bfe_u32 v1, v2, 4, 4
2717; GFX9-DL-NEXT:    v_and_b32_e32 v12, 15, v2
2718; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 12, 4
2719; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 8, 4
2720; GFX9-DL-NEXT:    v_bfe_u32 v2, v2, 16, 4
2721; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2722; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v17, v9, v16
2723; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2724; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v18, v11, v2
2725; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v7, v7, v14
2726; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2727; GFX9-DL-NEXT:    v_or_b32_e32 v8, v17, v8
2728; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v5, v5, v12
2729; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2730; GFX9-DL-NEXT:    v_or_b32_e32 v1, v18, v10
2731; GFX9-DL-NEXT:    v_or_b32_e32 v6, v7, v6
2732; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
2733; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
2734; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2735; GFX9-DL-NEXT:    v_or_b32_e32 v5, v5, v12
2736; GFX9-DL-NEXT:    v_or_b32_e32 v7, v12, v0
2737; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 8, v1
2738; GFX9-DL-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]
2739; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v7
2740; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2741; GFX9-DL-NEXT:    v_add_u16_e32 v1, v5, v4
2742; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v7
2743; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v6
2744; GFX9-DL-NEXT:    v_add_u16_e32 v0, v1, v0
2745; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v11, v2, v0
2746; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v10
2747; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v8
2748; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v9, v16, v0
2749; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v8
2750; GFX9-DL-NEXT:    global_store_byte v3, v0, s[2:3]
2751; GFX9-DL-NEXT:    s_endpgm
2752;
2753; GFX10-DL-LABEL: udot8_acc8_vecMul:
2754; GFX10-DL:       ; %bb.0: ; %entry
2755; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2756; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2757; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2758; GFX10-DL-NEXT:    v_mov_b32_e32 v4, 0
2759; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2760; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2761; GFX10-DL-NEXT:    s_mov_b32 s10, -1
2762; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
2763; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
2764; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
2765; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2766; GFX10-DL-NEXT:    s_clause 0x1
2767; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2768; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2769; GFX10-DL-NEXT:    global_load_ubyte v3, v4, s[0:1]
2770; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
2771; GFX10-DL-NEXT:    v_bfe_u32 v6, v1, 12, 4
2772; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2773; GFX10-DL-NEXT:    v_bfe_u32 v10, v2, 12, 4
2774; GFX10-DL-NEXT:    v_bfe_u32 v7, v1, 8, 4
2775; GFX10-DL-NEXT:    v_bfe_u32 v13, v2, 8, 4
2776; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 28, v1
2777; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v14, 28, v2
2778; GFX10-DL-NEXT:    v_mul_lo_u16 v6, v6, v10
2779; GFX10-DL-NEXT:    v_bfe_u32 v0, v1, 4, 4
2780; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v1
2781; GFX10-DL-NEXT:    v_bfe_u32 v9, v1, 24, 4
2782; GFX10-DL-NEXT:    v_bfe_u32 v11, v1, 20, 4
2783; GFX10-DL-NEXT:    v_bfe_u32 v12, v1, 16, 4
2784; GFX10-DL-NEXT:    v_bfe_u32 v1, v2, 4, 4
2785; GFX10-DL-NEXT:    v_mul_lo_u16 v7, v7, v13
2786; GFX10-DL-NEXT:    v_lshlrev_b16 v6, 8, v6
2787; GFX10-DL-NEXT:    v_and_b32_e32 v10, 15, v2
2788; GFX10-DL-NEXT:    v_bfe_u32 v15, v2, 24, 4
2789; GFX10-DL-NEXT:    v_bfe_u32 v13, v2, 20, 4
2790; GFX10-DL-NEXT:    v_bfe_u32 v16, v2, 16, 4
2791; GFX10-DL-NEXT:    v_mul_lo_u16 v2, v8, v14
2792; GFX10-DL-NEXT:    v_mul_lo_u16 v0, v0, v1
2793; GFX10-DL-NEXT:    v_or_b32_e32 v6, v7, v6
2794; GFX10-DL-NEXT:    v_mul_lo_u16 v1, v11, v13
2795; GFX10-DL-NEXT:    v_mul_lo_u16 v7, v9, v15
2796; GFX10-DL-NEXT:    v_lshlrev_b16 v2, 8, v2
2797; GFX10-DL-NEXT:    v_lshlrev_b16 v8, 8, v0
2798; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
2799; GFX10-DL-NEXT:    v_mul_lo_u16 v5, v5, v10
2800; GFX10-DL-NEXT:    v_mul_lo_u16 v10, v12, v16
2801; GFX10-DL-NEXT:    v_lshlrev_b16 v1, 8, v1
2802; GFX10-DL-NEXT:    v_or_b32_e32 v7, v7, v2
2803; GFX10-DL-NEXT:    v_or_b32_sdwa v2, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2804; GFX10-DL-NEXT:    v_or_b32_e32 v5, v5, v8
2805; GFX10-DL-NEXT:    v_or_b32_e32 v1, v10, v1
2806; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
2807; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
2808; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2809; GFX10-DL-NEXT:    v_add_nc_u16 v3, v5, v3
2810; GFX10-DL-NEXT:    v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2811; GFX10-DL-NEXT:    v_add_nc_u16 v5, v3, v2
2812; GFX10-DL-NEXT:    v_lshrrev_b64 v[2:3], 24, v[0:1]
2813; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
2814; GFX10-DL-NEXT:    v_add_nc_u16 v0, v5, v6
2815; GFX10-DL-NEXT:    v_add_nc_u16 v0, v0, v2
2816; GFX10-DL-NEXT:    v_mad_u16 v0, v12, v16, v0
2817; GFX10-DL-NEXT:    v_add_nc_u16 v0, v0, v1
2818; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v7
2819; GFX10-DL-NEXT:    v_mad_u16 v0, v9, v15, v0
2820; GFX10-DL-NEXT:    v_add_nc_u16 v0, v0, v1
2821; GFX10-DL-NEXT:    global_store_byte v4, v0, s[0:1]
2822; GFX10-DL-NEXT:    s_endpgm
2823                                             <8 x i4> addrspace(1)* %src2,
2824                                             i8 addrspace(1)* nocapture %dst) {
2825entry:
2826  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2827  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
2828  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
2829  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
2830  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
2831
2832  %cvec1 = zext <8 x i4> %vec1 to <8 x i8>
2833  %cvec2 = zext <8 x i4> %vec2 to <8 x i8>
2834
2835  %mul = mul <8 x i8> %cvec1, %cvec2
2836  %mul0 = extractelement <8 x i8> %mul, i64 0
2837  %mul1 = extractelement <8 x i8> %mul, i64 1
2838  %mul2 = extractelement <8 x i8> %mul, i64 2
2839  %mul3 = extractelement <8 x i8> %mul, i64 3
2840  %mul4 = extractelement <8 x i8> %mul, i64 4
2841  %mul5 = extractelement <8 x i8> %mul, i64 5
2842  %mul6 = extractelement <8 x i8> %mul, i64 6
2843  %mul7 = extractelement <8 x i8> %mul, i64 7
2844
2845  %acc = load i8, i8 addrspace(1)* %dst, align 4
2846  %add1 = add i8 %mul0, %acc
2847  %add2 = add i8 %add1, %mul1
2848  %add3 = add i8 %add2, %mul2
2849  %add4 = add i8 %add3, %mul3
2850  %add5 = add i8 %add4, %mul4
2851  %add6 = add i8 %add5, %mul5
2852  %add7 = add i8 %add6, %mul6
2853  %add8 = add i8 %add7, %mul7
2854
2855  store i8 %add8, i8 addrspace(1)* %dst, align 4
2856  ret void
2857}
2858
2859; TODO: Once the additional "and+add" are removed, the pattern will be recognized.
; udot8_acc4_vecMul: the two <8 x i4> inputs are multiplied lane-by-lane as i4
; (no widening), and the eight products are added one after another to an i4
; accumulator loaded from %dst; the sum is stored back to %dst.
; In the assertions below, the sequences for the two DL check prefixes use
; packed 16-bit multiplies followed by adds and contain no v_dot8_u32_u4.
2860define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
2861; GFX7-LABEL: udot8_acc4_vecMul:
2862; GFX7:       ; %bb.0: ; %entry
2863; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2864; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2865; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2866; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2867; GFX7-NEXT:    s_mov_b32 s14, -1
2868; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
2869; GFX7-NEXT:    s_add_u32 s12, s12, s3
2870; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2871; GFX7-NEXT:    s_mov_b32 s10, 0
2872; GFX7-NEXT:    s_mov_b32 s11, s3
2873; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2874; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
2875; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2876; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2877; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2878; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
2879; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2880; GFX7-NEXT:    s_mov_b32 s2, -1
2881; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
2882; GFX7-NEXT:    s_addc_u32 s13, s13, 0
2883; GFX7-NEXT:    s_waitcnt vmcnt(2)
2884; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 28, v2
2885; GFX7-NEXT:    v_bfe_u32 v4, v2, 24, 4
2886; GFX7-NEXT:    v_bfe_u32 v5, v2, 20, 4
2887; GFX7-NEXT:    v_bfe_u32 v6, v2, 16, 4
2888; GFX7-NEXT:    v_bfe_u32 v7, v2, 12, 4
2889; GFX7-NEXT:    v_bfe_u32 v8, v2, 8, 4
2890; GFX7-NEXT:    v_bfe_u32 v9, v2, 4, 4
2891; GFX7-NEXT:    v_and_b32_e32 v2, 15, v2
2892; GFX7-NEXT:    s_waitcnt vmcnt(1)
2893; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 28, v0
2894; GFX7-NEXT:    v_bfe_u32 v11, v0, 24, 4
2895; GFX7-NEXT:    v_bfe_u32 v12, v0, 20, 4
2896; GFX7-NEXT:    v_bfe_u32 v13, v0, 16, 4
2897; GFX7-NEXT:    v_bfe_u32 v14, v0, 12, 4
2898; GFX7-NEXT:    v_bfe_u32 v15, v0, 8, 4
2899; GFX7-NEXT:    v_bfe_u32 v16, v0, 4, 4
2900; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
2901; GFX7-NEXT:    s_waitcnt vmcnt(0)
2902; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
2903; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v16, v0
2904; GFX7-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
2905; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
2906; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
2907; GFX7-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
2908; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
2909; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v10, v0
2910; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
2911; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2912; GFX7-NEXT:    s_endpgm
2913;
2914; GFX8-LABEL: udot8_acc4_vecMul:
2915; GFX8:       ; %bb.0: ; %entry
2916; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2917; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2918; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2919; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2920; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2921; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2922; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2923; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2924; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2925; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2926; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2927; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
2928; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2929; GFX8-NEXT:    flat_load_dword v2, v[0:1]
2930; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2931; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2932; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
2933; GFX8-NEXT:    s_mov_b32 s10, -1
2934; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
2935; GFX8-NEXT:    s_add_u32 s8, s8, s3
2936; GFX8-NEXT:    s_addc_u32 s9, s9, 0
2937; GFX8-NEXT:    s_waitcnt vmcnt(2)
2938; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 28, v3
2939; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
2940; GFX8-NEXT:    v_bfe_u32 v7, v3, 20, 4
2941; GFX8-NEXT:    v_bfe_u32 v8, v3, 16, 4
2942; GFX8-NEXT:    v_bfe_u32 v9, v3, 12, 4
2943; GFX8-NEXT:    v_bfe_u32 v10, v3, 8, 4
2944; GFX8-NEXT:    v_bfe_u32 v11, v3, 4, 4
2945; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
2946; GFX8-NEXT:    s_waitcnt vmcnt(1)
2947; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
2948; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v2
2949; GFX8-NEXT:    v_bfe_u32 v14, v2, 20, 4
2950; GFX8-NEXT:    v_bfe_u32 v15, v2, 16, 4
2951; GFX8-NEXT:    v_bfe_u32 v16, v2, 12, 4
2952; GFX8-NEXT:    v_bfe_u32 v17, v2, 8, 4
2953; GFX8-NEXT:    v_bfe_u32 v18, v2, 4, 4
2954; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
2955; GFX8-NEXT:    s_waitcnt vmcnt(0)
2956; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
2957; GFX8-NEXT:    v_mad_u16 v2, v11, v18, v2
2958; GFX8-NEXT:    v_mad_u16 v2, v10, v17, v2
2959; GFX8-NEXT:    v_mad_u16 v2, v9, v16, v2
2960; GFX8-NEXT:    v_mad_u16 v2, v8, v15, v2
2961; GFX8-NEXT:    v_mad_u16 v2, v7, v14, v2
2962; GFX8-NEXT:    v_mad_u16 v2, v6, v13, v2
2963; GFX8-NEXT:    v_mad_u16 v2, v5, v12, v2
2964; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
2965; GFX8-NEXT:    flat_store_byte v[0:1], v2
2966; GFX8-NEXT:    s_endpgm
2967;
2968; GFX9-LABEL: udot8_acc4_vecMul:
2969; GFX9:       ; %bb.0: ; %entry
2970; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2971; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2972; GFX9-NEXT:    s_mov_b32 s10, -1
2973; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
2974; GFX9-NEXT:    s_add_u32 s8, s8, s3
2975; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2976; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2977; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2978; GFX9-NEXT:    s_addc_u32 s9, s9, 0
2979; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2980; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
2981; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
2982; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2983; GFX9-NEXT:    global_load_ubyte v3, v0, s[2:3]
2984; GFX9-NEXT:    s_waitcnt vmcnt(2)
2985; GFX9-NEXT:    v_and_b32_e32 v5, 15, v1
2986; GFX9-NEXT:    v_bfe_u32 v7, v1, 8, 4
2987; GFX9-NEXT:    s_waitcnt vmcnt(1)
2988; GFX9-NEXT:    v_and_b32_e32 v12, 15, v2
2989; GFX9-NEXT:    v_bfe_u32 v4, v1, 4, 4
2990; GFX9-NEXT:    v_bfe_u32 v6, v1, 12, 4
2991; GFX9-NEXT:    v_bfe_u32 v11, v2, 4, 4
2992; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff, v7
2993; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff, v12
2994; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff, v5
2995; GFX9-NEXT:    v_bfe_u32 v9, v1, 16, 4
2996; GFX9-NEXT:    v_bfe_u32 v14, v2, 8, 4
2997; GFX9-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
2998; GFX9-NEXT:    v_lshl_or_b32 v7, v11, 16, v12
2999; GFX9-NEXT:    v_lshl_or_b32 v4, v4, 16, v5
3000; GFX9-NEXT:    v_bfe_u32 v8, v1, 20, 4
3001; GFX9-NEXT:    v_bfe_u32 v13, v2, 12, 4
3002; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff, v9
3003; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff, v14
3004; GFX9-NEXT:    v_pk_mul_lo_u16 v4, v4, v7
3005; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v1
3006; GFX9-NEXT:    v_bfe_u32 v1, v1, 24, 4
3007; GFX9-NEXT:    v_bfe_u32 v15, v2, 20, 4
3008; GFX9-NEXT:    v_bfe_u32 v16, v2, 16, 4
3009; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 28, v2
3010; GFX9-NEXT:    v_bfe_u32 v2, v2, 24, 4
3011; GFX9-NEXT:    v_lshl_or_b32 v8, v8, 16, v9
3012; GFX9-NEXT:    v_lshl_or_b32 v9, v13, 16, v14
3013; GFX9-NEXT:    s_waitcnt vmcnt(0)
3014; GFX9-NEXT:    v_add_u16_e32 v3, v4, v3
3015; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
3016; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3017; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff, v16
3018; GFX9-NEXT:    v_pk_mul_lo_u16 v5, v6, v9
3019; GFX9-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3020; GFX9-NEXT:    v_lshl_or_b32 v2, v17, 16, v2
3021; GFX9-NEXT:    v_lshl_or_b32 v1, v10, 16, v1
3022; GFX9-NEXT:    v_lshl_or_b32 v10, v15, 16, v16
3023; GFX9-NEXT:    v_add_u16_e32 v3, v3, v5
3024; GFX9-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
3025; GFX9-NEXT:    v_pk_mul_lo_u16 v2, v8, v10
3026; GFX9-NEXT:    v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3027; GFX9-NEXT:    v_add_u16_e32 v3, v3, v2
3028; GFX9-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3029; GFX9-NEXT:    v_add_u16_e32 v2, v2, v1
3030; GFX9-NEXT:    v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3031; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
3032; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
3033; GFX9-NEXT:    s_endpgm
3034;
3035; GFX9-DL-LABEL: udot8_acc4_vecMul:
3036; GFX9-DL:       ; %bb.0: ; %entry
3037; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
3038; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
3039; GFX9-DL-NEXT:    s_mov_b32 s10, -1
3040; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
3041; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
3042; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
3043; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
3044; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3045; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
3046; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
3047; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
3048; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
3049; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
3050; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
3051; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
3052; GFX9-DL-NEXT:    v_and_b32_e32 v5, 15, v1
3053; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 8, 4
3054; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
3055; GFX9-DL-NEXT:    v_and_b32_e32 v12, 15, v2
3056; GFX9-DL-NEXT:    v_bfe_u32 v4, v1, 4, 4
3057; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 12, 4
3058; GFX9-DL-NEXT:    v_bfe_u32 v11, v2, 4, 4
3059; GFX9-DL-NEXT:    v_and_b32_e32 v7, 0xffff, v7
3060; GFX9-DL-NEXT:    v_and_b32_e32 v12, 0xffff, v12
3061; GFX9-DL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
3062; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 16, 4
3063; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 8, 4
3064; GFX9-DL-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
3065; GFX9-DL-NEXT:    v_lshl_or_b32 v7, v11, 16, v12
3066; GFX9-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v5
3067; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 20, 4
3068; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 12, 4
3069; GFX9-DL-NEXT:    v_and_b32_e32 v9, 0xffff, v9
3070; GFX9-DL-NEXT:    v_and_b32_e32 v14, 0xffff, v14
3071; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v7
3072; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 28, v1
3073; GFX9-DL-NEXT:    v_bfe_u32 v1, v1, 24, 4
3074; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 20, 4
3075; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 16, 4
3076; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v17, 28, v2
3077; GFX9-DL-NEXT:    v_bfe_u32 v2, v2, 24, 4
3078; GFX9-DL-NEXT:    v_lshl_or_b32 v8, v8, 16, v9
3079; GFX9-DL-NEXT:    v_lshl_or_b32 v9, v13, 16, v14
3080; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
3081; GFX9-DL-NEXT:    v_add_u16_e32 v3, v4, v3
3082; GFX9-DL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
3083; GFX9-DL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3084; GFX9-DL-NEXT:    v_and_b32_e32 v16, 0xffff, v16
3085; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, v6, v9
3086; GFX9-DL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3087; GFX9-DL-NEXT:    v_lshl_or_b32 v2, v17, 16, v2
3088; GFX9-DL-NEXT:    v_lshl_or_b32 v1, v10, 16, v1
3089; GFX9-DL-NEXT:    v_lshl_or_b32 v10, v15, 16, v16
3090; GFX9-DL-NEXT:    v_add_u16_e32 v3, v3, v5
3091; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
3092; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v8, v10
3093; GFX9-DL-NEXT:    v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3094; GFX9-DL-NEXT:    v_add_u16_e32 v3, v3, v2
3095; GFX9-DL-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3096; GFX9-DL-NEXT:    v_add_u16_e32 v2, v2, v1
3097; GFX9-DL-NEXT:    v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3098; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
3099; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
3100; GFX9-DL-NEXT:    s_endpgm
3101;
3102; GFX10-DL-LABEL: udot8_acc4_vecMul:
3103; GFX10-DL:       ; %bb.0: ; %entry
3104; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
3105; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3106; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
3107; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
3108; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
3109; GFX10-DL-NEXT:    s_mov_b32 s10, -1
3110; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
3111; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
3112; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
3113; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
3114; GFX10-DL-NEXT:    s_clause 0x1
3115; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
3116; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
3117; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
3118; GFX10-DL-NEXT:    global_load_ubyte v3, v0, s[0:1]
3119; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
3120; GFX10-DL-NEXT:    v_and_b32_e32 v6, 15, v1
3121; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
3122; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v2
3123; GFX10-DL-NEXT:    v_bfe_u32 v4, v1, 4, 4
3124; GFX10-DL-NEXT:    v_bfe_u32 v9, v2, 4, 4
3125; GFX10-DL-NEXT:    v_bfe_u32 v8, v1, 8, 4
3126; GFX10-DL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
3127; GFX10-DL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
3128; GFX10-DL-NEXT:    v_bfe_u32 v12, v2, 8, 4
3129; GFX10-DL-NEXT:    v_bfe_u32 v7, v1, 12, 4
3130; GFX10-DL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
3131; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v6
3132; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v9, 16, v5
3133; GFX10-DL-NEXT:    v_bfe_u32 v9, v2, 12, 4
3134; GFX10-DL-NEXT:    v_and_b32_e32 v12, 0xffff, v12
3135; GFX10-DL-NEXT:    v_bfe_u32 v11, v1, 16, 4
3136; GFX10-DL-NEXT:    v_lshl_or_b32 v7, v7, 16, v8
3137; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
3138; GFX10-DL-NEXT:    v_bfe_u32 v5, v2, 16, 4
3139; GFX10-DL-NEXT:    v_lshl_or_b32 v9, v9, 16, v12
3140; GFX10-DL-NEXT:    v_bfe_u32 v10, v1, 20, 4
3141; GFX10-DL-NEXT:    v_and_b32_e32 v11, 0xffff, v11
3142; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
3143; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
3144; GFX10-DL-NEXT:    v_add_nc_u16 v3, v4, v3
3145; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 20, 4
3146; GFX10-DL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
3147; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v7, v7, v9
3148; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
3149; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v8
3150; GFX10-DL-NEXT:    v_bfe_u32 v1, v1, 24, 4
3151; GFX10-DL-NEXT:    v_bfe_u32 v8, v2, 24, 4
3152; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v5
3153; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v10, 16, v11
3154; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
3155; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v7
3156; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
3157; GFX10-DL-NEXT:    v_and_b32_e32 v7, 0xffff, v8
3158; GFX10-DL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3159; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v5, v4
3160; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v9
3161; GFX10-DL-NEXT:    v_lshl_or_b32 v2, v2, 16, v7
3162; GFX10-DL-NEXT:    v_lshl_or_b32 v1, v6, 16, v1
3163; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
3164; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v4
3165; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
3166; GFX10-DL-NEXT:    v_add_nc_u16 v2, v3, v5
3167; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
3168; GFX10-DL-NEXT:    v_add_nc_u16 v1, v2, v1
3169; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v3
3170; GFX10-DL-NEXT:    v_and_b32_e32 v1, 15, v1
3171; GFX10-DL-NEXT:    global_store_byte v0, v1, s[0:1]
3172; GFX10-DL-NEXT:    s_endpgm
3173                                             <8 x i4> addrspace(1)* %src2,
3174                                             i4 addrspace(1)* nocapture %dst) {
3175entry:
  ; Both source arrays are indexed by the value returned from workitem.id.x.
3176  %idx = call i32 @llvm.amdgcn.workitem.id.x()
3177  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
3178  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
3179  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
3180  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
3181
  ; The multiply is done on the i4 lanes directly, so each product is
  ; truncated to 4 bits before any accumulation happens.
3182  %mul = mul <8 x i4> %vec1, %vec2
3183  %mul0 = extractelement <8 x i4> %mul, i64 0
3184  %mul1 = extractelement <8 x i4> %mul, i64 1
3185  %mul2 = extractelement <8 x i4> %mul, i64 2
3186  %mul3 = extractelement <8 x i4> %mul, i64 3
3187  %mul4 = extractelement <8 x i4> %mul, i64 4
3188  %mul5 = extractelement <8 x i4> %mul, i64 5
3189  %mul6 = extractelement <8 x i4> %mul, i64 6
3190  %mul7 = extractelement <8 x i4> %mul, i64 7
3191
  ; Linear add chain in lane order: acc + mul0, then mul1 .. mul7, all in i4.
3192  %acc = load i4, i4 addrspace(1)* %dst, align 4
3193  %add1 = add i4 %mul0, %acc
3194  %add2 = add i4 %add1, %mul1
3195  %add3 = add i4 %add2, %mul2
3196  %add4 = add i4 %add3, %mul3
3197  %add5 = add i4 %add4, %mul4
3198  %add6 = add i4 %add5, %mul5
3199  %add7 = add i4 %add6, %mul6
3200  %add8 = add i4 %add7, %mul7
3201
  ; The result overwrites the accumulator location it was read from.
3202  store i4 %add8, i4 addrspace(1)* %dst, align 4
3203  ret void
3204}
3205
; udot8_variant1: each i32 operand is split into eight 4-bit fields with
; lshr/and, matching fields are multiplied as i32, and the products are added
; to an i32 accumulator loaded from %dst. The products are summed in the order
; mul1, mul8, mul2 .. mul7, i.e. the top-nibble product is added second rather
; than last. The assertions for the two DL check prefixes expect a single
; v_dot8_u32_u4; the other prefixes expect a chain of 24-bit multiplies/adds.
3206define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr,
3207; GFX7-LABEL: udot8_variant1:
3208; GFX7:       ; %bb.0: ; %entry
3209; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
3210; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3211; GFX7-NEXT:    s_mov_b32 s3, 0xf000
3212; GFX7-NEXT:    s_mov_b32 s10, 0
3213; GFX7-NEXT:    s_mov_b32 s11, s3
3214; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3215; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
3216; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3217; GFX7-NEXT:    v_mov_b32_e32 v1, 0
3218; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
3219; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
3220; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
3221; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
3222; GFX7-NEXT:    s_mov_b32 s2, -1
3223; GFX7-NEXT:    s_waitcnt vmcnt(1)
3224; GFX7-NEXT:    v_and_b32_e32 v1, 15, v2
3225; GFX7-NEXT:    v_bfe_u32 v3, v2, 4, 4
3226; GFX7-NEXT:    s_waitcnt vmcnt(0)
3227; GFX7-NEXT:    v_and_b32_e32 v9, 15, v0
3228; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 4
3229; GFX7-NEXT:    v_bfe_u32 v5, v2, 12, 4
3230; GFX7-NEXT:    v_bfe_u32 v6, v2, 16, 4
3231; GFX7-NEXT:    v_bfe_u32 v7, v2, 20, 4
3232; GFX7-NEXT:    v_bfe_u32 v8, v2, 24, 4
3233; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
3234; GFX7-NEXT:    v_bfe_u32 v10, v0, 4, 4
3235; GFX7-NEXT:    v_bfe_u32 v11, v0, 8, 4
3236; GFX7-NEXT:    v_bfe_u32 v12, v0, 12, 4
3237; GFX7-NEXT:    v_bfe_u32 v13, v0, 16, 4
3238; GFX7-NEXT:    v_bfe_u32 v14, v0, 20, 4
3239; GFX7-NEXT:    v_bfe_u32 v15, v0, 24, 4
3240; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 28, v0
3241; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3242; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v1, s4
3243; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
3244; GFX7-NEXT:    v_mad_u32_u24 v0, v10, v3, v0
3245; GFX7-NEXT:    v_mad_u32_u24 v0, v11, v4, v0
3246; GFX7-NEXT:    v_mad_u32_u24 v0, v12, v5, v0
3247; GFX7-NEXT:    v_mad_u32_u24 v0, v13, v6, v0
3248; GFX7-NEXT:    v_mad_u32_u24 v0, v14, v7, v0
3249; GFX7-NEXT:    v_mad_u32_u24 v0, v15, v8, v0
3250; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
3251; GFX7-NEXT:    s_endpgm
3252;
3253; GFX8-LABEL: udot8_variant1:
3254; GFX8:       ; %bb.0: ; %entry
3255; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
3256; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
3257; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
3258; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3259; GFX8-NEXT:    v_mov_b32_e32 v1, s5
3260; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
3261; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3262; GFX8-NEXT:    flat_load_dword v3, v[0:1]
3263; GFX8-NEXT:    v_mov_b32_e32 v1, s7
3264; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
3265; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3266; GFX8-NEXT:    flat_load_dword v0, v[0:1]
3267; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
3268; GFX8-NEXT:    s_waitcnt vmcnt(1)
3269; GFX8-NEXT:    v_and_b32_e32 v1, 15, v3
3270; GFX8-NEXT:    v_bfe_u32 v4, v3, 4, 4
3271; GFX8-NEXT:    v_bfe_u32 v6, v3, 8, 4
3272; GFX8-NEXT:    v_bfe_u32 v8, v3, 12, 4
3273; GFX8-NEXT:    v_bfe_u32 v10, v3, 16, 4
3274; GFX8-NEXT:    v_bfe_u32 v12, v3, 20, 4
3275; GFX8-NEXT:    s_waitcnt vmcnt(0)
3276; GFX8-NEXT:    v_and_b32_e32 v2, 15, v0
3277; GFX8-NEXT:    v_bfe_u32 v5, v0, 4, 4
3278; GFX8-NEXT:    v_bfe_u32 v7, v0, 8, 4
3279; GFX8-NEXT:    v_bfe_u32 v9, v0, 12, 4
3280; GFX8-NEXT:    v_bfe_u32 v11, v0, 16, 4
3281; GFX8-NEXT:    v_bfe_u32 v13, v0, 20, 4
3282; GFX8-NEXT:    v_bfe_u32 v14, v3, 24, 4
3283; GFX8-NEXT:    v_bfe_u32 v15, v0, 24, 4
3284; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
3285; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 28, v0
3286; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3287; GFX8-NEXT:    v_mad_u32_u24 v1, v2, v1, s2
3288; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, v1
3289; GFX8-NEXT:    v_mad_u32_u24 v0, v5, v4, v0
3290; GFX8-NEXT:    v_mad_u32_u24 v0, v7, v6, v0
3291; GFX8-NEXT:    v_mad_u32_u24 v0, v9, v8, v0
3292; GFX8-NEXT:    v_mad_u32_u24 v0, v11, v10, v0
3293; GFX8-NEXT:    v_mad_u32_u24 v0, v13, v12, v0
3294; GFX8-NEXT:    v_mad_u32_u24 v2, v15, v14, v0
3295; GFX8-NEXT:    v_mov_b32_e32 v0, s0
3296; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3297; GFX8-NEXT:    flat_store_dword v[0:1], v2
3298; GFX8-NEXT:    s_endpgm
3299;
3300; GFX9-LABEL: udot8_variant1:
3301; GFX9:       ; %bb.0: ; %entry
3302; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
3303; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
3304; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3305; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3306; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
3307; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
3308; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
3309; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3310; GFX9-NEXT:    s_waitcnt vmcnt(1)
3311; GFX9-NEXT:    v_and_b32_e32 v3, 15, v1
3312; GFX9-NEXT:    s_waitcnt vmcnt(0)
3313; GFX9-NEXT:    v_and_b32_e32 v4, 15, v2
3314; GFX9-NEXT:    v_bfe_u32 v5, v1, 4, 4
3315; GFX9-NEXT:    v_bfe_u32 v6, v2, 4, 4
3316; GFX9-NEXT:    v_bfe_u32 v7, v1, 8, 4
3317; GFX9-NEXT:    v_bfe_u32 v8, v2, 8, 4
3318; GFX9-NEXT:    v_bfe_u32 v9, v1, 12, 4
3319; GFX9-NEXT:    v_bfe_u32 v10, v2, 12, 4
3320; GFX9-NEXT:    v_bfe_u32 v11, v1, 16, 4
3321; GFX9-NEXT:    v_bfe_u32 v12, v2, 16, 4
3322; GFX9-NEXT:    v_bfe_u32 v13, v1, 20, 4
3323; GFX9-NEXT:    v_bfe_u32 v14, v2, 20, 4
3324; GFX9-NEXT:    v_bfe_u32 v15, v1, 24, 4
3325; GFX9-NEXT:    v_bfe_u32 v16, v2, 24, 4
3326; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
3327; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
3328; GFX9-NEXT:    v_mul_u32_u24_e32 v3, v4, v3
3329; GFX9-NEXT:    v_mul_u32_u24_e32 v1, v2, v1
3330; GFX9-NEXT:    v_mul_u32_u24_e32 v4, v6, v5
3331; GFX9-NEXT:    v_mul_u32_u24_e32 v5, v8, v7
3332; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3333; GFX9-NEXT:    v_add3_u32 v1, v3, s0, v1
3334; GFX9-NEXT:    v_mul_u32_u24_e32 v6, v10, v9
3335; GFX9-NEXT:    v_mul_u32_u24_e32 v7, v12, v11
3336; GFX9-NEXT:    v_add3_u32 v1, v1, v4, v5
3337; GFX9-NEXT:    v_mul_u32_u24_e32 v8, v14, v13
3338; GFX9-NEXT:    v_mul_u32_u24_e32 v9, v16, v15
3339; GFX9-NEXT:    v_add3_u32 v1, v1, v6, v7
3340; GFX9-NEXT:    v_add3_u32 v1, v1, v8, v9
3341; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
3342; GFX9-NEXT:    s_endpgm
3343;
3344; GFX9-DL-LABEL: udot8_variant1:
3345; GFX9-DL:       ; %bb.0: ; %entry
3346; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
3347; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
3348; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3349; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
3350; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
3351; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
3352; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
3353; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
3354; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3355; GFX9-DL-NEXT:    v_dot8_u32_u4 v1, v2, v1, s0
3356; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
3357; GFX9-DL-NEXT:    s_endpgm
3358;
3359; GFX10-DL-LABEL: udot8_variant1:
3360; GFX10-DL:       ; %bb.0: ; %entry
3361; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
3362; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3363; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
3364; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
3365; GFX10-DL-NEXT:    s_clause 0x1
3366; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
3367; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
3368; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
3369; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
3370; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3371; GFX10-DL-NEXT:    v_dot8_u32_u4 v1, v2, v1, s2
3372; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
3373; GFX10-DL-NEXT:    s_endpgm
3374                                          i32 addrspace(1)* %v2addr,
3375                                          i32 addrspace(1)* %dst) {
3376entry:
  ; Load one packed i32 from each source array at the work-item's index.
3377  %idx = call i32 @llvm.amdgcn.workitem.id.x()
3378  %gep1 = getelementptr i32, i32 addrspace(1)* %v1addr, i32 %idx
3379  %v1 = load i32, i32 addrspace(1)* %gep1, align 4
3380  %gep2 = getelementptr i32, i32 addrspace(1)* %v2addr, i32 %idx
3381  %v2 = load i32, i32 addrspace(1)* %gep2, align 4
  ; Field at bits 3:0. Every multiply below takes the %v2 field as its first operand.
3382  %and = and i32 %v1, 15
3383  %and1 = and i32 %v2, 15
3384  %mul1 = mul nuw nsw i32 %and1, %and
3385
  ; Field at bits 7:4.
3386  %shr = lshr i32 %v1, 4
3387  %and2 = and i32 %shr, 15
3388  %shr3 = lshr i32 %v2, 4
3389  %and4 = and i32 %shr3, 15
3390  %mul2 = mul nuw nsw i32 %and4, %and2
3391
  ; Field at bits 11:8.
3392  %shr6 = lshr i32 %v1, 8
3393  %and7 = and i32 %shr6, 15
3394  %shr8 = lshr i32 %v2, 8
3395  %and9 = and i32 %shr8, 15
3396  %mul3 = mul nuw nsw i32 %and9, %and7
3397
  ; Field at bits 15:12.
3398  %shr12 = lshr i32 %v1, 12
3399  %and13 = and i32 %shr12, 15
3400  %shr14 = lshr i32 %v2, 12
3401  %and15 = and i32 %shr14, 15
3402  %mul4 = mul nuw nsw i32 %and15, %and13
3403
  ; Field at bits 19:16.
3404  %shr18 = lshr i32 %v1, 16
3405  %and19 = and i32 %shr18, 15
3406  %shr20 = lshr i32 %v2, 16
3407  %and21 = and i32 %shr20, 15
3408  %mul5 = mul nuw nsw i32 %and21, %and19
3409
  ; Field at bits 23:20.
3410  %shr24 = lshr i32 %v1, 20
3411  %and25 = and i32 %shr24, 15
3412  %shr26 = lshr i32 %v2, 20
3413  %and27 = and i32 %shr26, 15
3414  %mul6 = mul nuw nsw i32 %and27, %and25
3415
  ; Field at bits 27:24.
3416  %shr30 = lshr i32 %v1, 24
3417  %and31 = and i32 %shr30, 15
3418  %shr32 = lshr i32 %v2, 24
3419  %and33 = and i32 %shr32, 15
3420  %mul7 = mul nuw nsw i32 %and33, %and31
3421
  ; Field at bits 31:28: the logical shift by 28 already leaves only four
  ; bits, so no mask is applied.
3422  %shr36 = lshr i32 %v1, 28
3423  %shr37 = lshr i32 %v2, 28
3424  %mul8 = mul nuw nsw i32 %shr37, %shr36
3425  %acc = load i32, i32 addrspace(1)* %dst, align 4
3426
  ; Accumulate out of field order: %mul8 (top field) is added right after
  ; %mul1, followed by %mul2 .. %mul7.
3427  %add1 = add i32 %mul1, %acc
3428  %add2 = add i32 %add1, %mul8
3429  %add3 = add i32 %add2, %mul2
3430  %add4 = add i32 %add3, %mul3
3431  %add5 = add i32 %add4, %mul4
3432  %add6 = add i32 %add5, %mul5
3433  %add7 = add i32 %add6, %mul6
3434  %add8 = add i32 %add7, %mul7
3435  store i32 %add8, i32 addrspace(1)* %dst, align 4
3436  ret void
3437}
3438
; Declaration of the intrinsic the kernels in this file call to obtain %idx,
; the i32 used to index their source arrays.
3439declare i32 @llvm.amdgcn.workitem.id.x()
3440