1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-NODL %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
8
; udot4_acc32 - unsigned 4 x i8 dot product accumulated into an i32 read from and written back to %dst.
; Per the expected output below, the gfx906, gfx1011 and gfx1012 runs select a single v_dot4_u32_u8,
; gfx700 and gfx803 use a chain of four v_mad_u32_u24, and gfx900 uses four SDWA v_mul_u32_u24
; followed by two v_add3_u32.
9define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
10; GFX7-LABEL: udot4_acc32:
11; GFX7:       ; %bb.0: ; %entry
12; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
13; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
14; GFX7-NEXT:    s_mov_b32 s3, 0xf000
15; GFX7-NEXT:    s_mov_b32 s10, 0
16; GFX7-NEXT:    s_mov_b32 s11, s3
17; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
18; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
19; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
20; GFX7-NEXT:    v_mov_b32_e32 v1, 0
21; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
22; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
23; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
24; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
25; GFX7-NEXT:    s_movk_i32 s4, 0xff
26; GFX7-NEXT:    s_mov_b32 s2, -1
27; GFX7-NEXT:    s_waitcnt vmcnt(1)
28; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
29; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
30; GFX7-NEXT:    s_waitcnt vmcnt(0)
31; GFX7-NEXT:    v_and_b32_e32 v5, s4, v0
32; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
33; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
34; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, s5
35; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
36; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
37; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
38; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
39; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
40; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
41; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
42; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
43; GFX7-NEXT:    s_endpgm
44;
45; GFX8-LABEL: udot4_acc32:
46; GFX8:       ; %bb.0: ; %entry
47; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
48; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
49; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
50; GFX8-NEXT:    s_movk_i32 s2, 0xff
51; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
52; GFX8-NEXT:    v_mov_b32_e32 v1, s5
53; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
54; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
55; GFX8-NEXT:    flat_load_dword v3, v[0:1]
56; GFX8-NEXT:    v_mov_b32_e32 v1, s7
57; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
58; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
59; GFX8-NEXT:    flat_load_dword v0, v[0:1]
60; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
61; GFX8-NEXT:    s_waitcnt vmcnt(1)
62; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
63; GFX8-NEXT:    v_bfe_u32 v4, v3, 8, 8
64; GFX8-NEXT:    v_bfe_u32 v6, v3, 16, 8
65; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
66; GFX8-NEXT:    s_waitcnt vmcnt(0)
67; GFX8-NEXT:    v_and_b32_e32 v2, s2, v0
68; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
69; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
70; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, s3
71; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 8
72; GFX8-NEXT:    v_mad_u32_u24 v1, v4, v5, v1
73; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
74; GFX8-NEXT:    v_mad_u32_u24 v1, v6, v7, v1
75; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
76; GFX8-NEXT:    v_mov_b32_e32 v0, s0
77; GFX8-NEXT:    v_mov_b32_e32 v1, s1
78; GFX8-NEXT:    flat_store_dword v[0:1], v2
79; GFX8-NEXT:    s_endpgm
80;
81; GFX9-NODL-LABEL: udot4_acc32:
82; GFX9-NODL:       ; %bb.0: ; %entry
83; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
84; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
85; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
86; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
87; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
88; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
89; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
90; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
91; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
92; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
93; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
94; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
95; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
96; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
97; GFX9-NODL-NEXT:    v_add3_u32 v2, v3, s0, v4
98; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
99; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
100; GFX9-NODL-NEXT:    s_endpgm
101;
102; GFX9-DL-LABEL: udot4_acc32:
103; GFX9-DL:       ; %bb.0: ; %entry
104; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
105; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
106; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
107; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
108; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
109; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
110; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
111; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
112; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
113; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, s0
114; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
115; GFX9-DL-NEXT:    s_endpgm
116;
117; GFX10-DL-LABEL: udot4_acc32:
118; GFX10-DL:       ; %bb.0: ; %entry
119; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
120; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
121; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
122; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
123; GFX10-DL-NEXT:    s_clause 0x1
124; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
125; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
126; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
127; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
128; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
129; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, s2
130; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
131; GFX10-DL-NEXT:    s_endpgm
132                                       <4 x i8> addrspace(1)* %src2,
133                                       i32 addrspace(1)* nocapture %dst) {
134entry:
  ; Each work-item indexes both source arrays by its workitem.id.x and loads one <4 x i8> from each.
135  %idx = call i32 @llvm.amdgcn.workitem.id.x()
136  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
137  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
138  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
139  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
140
  ; Four per-element products. Both operands are zero-extended from i8 to i32 before each mul.
141  %v1e0 = extractelement <4 x i8> %vec1, i64 0
142  %cv1e0 = zext i8 %v1e0 to i32
143  %v2e0 = extractelement <4 x i8> %vec2, i64 0
144  %cv2e0 = zext i8 %v2e0 to i32
145  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
146
147  %v1e1 = extractelement <4 x i8> %vec1, i64 1
148  %cv1e1 = zext i8 %v1e1 to i32
149  %v2e1 = extractelement <4 x i8> %vec2, i64 1
150  %cv2e1 = zext i8 %v2e1 to i32
151  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
152
153  %v1e2 = extractelement <4 x i8> %vec1, i64 2
154  %cv1e2 = zext i8 %v1e2 to i32
155  %v2e2 = extractelement <4 x i8> %vec2, i64 2
156  %cv2e2 = zext i8 %v2e2 to i32
157  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
158
159  %v1e3 = extractelement <4 x i8> %vec1, i64 3
160  %cv1e3 = zext i8 %v1e3 to i32
161  %v2e3 = extractelement <4 x i8> %vec2, i64 3
162  %cv2e3 = zext i8 %v2e3 to i32
163  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
164
  ; Add the four products, in element order, onto the i32 loaded from %dst, then store the sum back.
165  %acc = load i32, i32 addrspace(1)* %dst, align 4
166  %mad1 = add i32 %mul1, %acc
167  %mad2 = add i32 %mad1, %mul2
168  %mad3 = add i32 %mad2, %mul3
169  %mad4 = add i32 %mad3, %mul4
170
171  store i32 %mad4, i32 addrspace(1)* %dst, align 4
172  ret void
173}
174
; udot4_acc16 - same 4 x i8 unsigned dot product shape as udot4_acc32, but the elements are
; zero-extended to i16 and the accumulator loaded from and stored to %dst is an i16.
; None of the expected outputs below contains a dot4 instruction. gfx700 uses v_mad_u32_u24,
; gfx803 and gfx1011/gfx1012 use v_mad_u16, and gfx900/gfx906 use v_mad_legacy_u16.
175define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
176; GFX7-LABEL: udot4_acc16:
177; GFX7:       ; %bb.0: ; %entry
178; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
179; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
180; GFX7-NEXT:    s_mov_b32 s3, 0xf000
181; GFX7-NEXT:    s_mov_b32 s10, 0
182; GFX7-NEXT:    s_mov_b32 s11, s3
183; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
184; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
185; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
186; GFX7-NEXT:    v_mov_b32_e32 v1, 0
187; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
188; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
189; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
190; GFX7-NEXT:    s_mov_b32 s2, -1
191; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
192; GFX7-NEXT:    s_movk_i32 s4, 0xff
193; GFX7-NEXT:    s_waitcnt vmcnt(2)
194; GFX7-NEXT:    v_and_b32_e32 v3, s4, v2
195; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
196; GFX7-NEXT:    s_waitcnt vmcnt(1)
197; GFX7-NEXT:    v_and_b32_e32 v6, s4, v0
198; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
199; GFX7-NEXT:    s_waitcnt vmcnt(0)
200; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
201; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
202; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
203; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
204; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
205; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
206; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
207; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
208; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
209; GFX7-NEXT:    s_endpgm
210;
211; GFX8-LABEL: udot4_acc16:
212; GFX8:       ; %bb.0: ; %entry
213; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
214; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
215; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
216; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
217; GFX8-NEXT:    v_mov_b32_e32 v1, s5
218; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
219; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
220; GFX8-NEXT:    flat_load_dword v3, v[0:1]
221; GFX8-NEXT:    v_mov_b32_e32 v1, s7
222; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
223; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
224; GFX8-NEXT:    flat_load_dword v2, v[0:1]
225; GFX8-NEXT:    v_mov_b32_e32 v0, s0
226; GFX8-NEXT:    v_mov_b32_e32 v1, s1
227; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
228; GFX8-NEXT:    s_movk_i32 s0, 0xff
229; GFX8-NEXT:    v_mov_b32_e32 v5, s0
230; GFX8-NEXT:    s_waitcnt vmcnt(2)
231; GFX8-NEXT:    v_and_b32_e32 v6, s0, v3
232; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
233; GFX8-NEXT:    v_and_b32_e32 v8, s0, v8
234; GFX8-NEXT:    v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
235; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
236; GFX8-NEXT:    s_waitcnt vmcnt(1)
237; GFX8-NEXT:    v_and_b32_e32 v7, s0, v2
238; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
239; GFX8-NEXT:    v_and_b32_e32 v9, s0, v9
240; GFX8-NEXT:    s_waitcnt vmcnt(0)
241; GFX8-NEXT:    v_mad_u16 v4, v6, v7, v4
242; GFX8-NEXT:    v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
243; GFX8-NEXT:    v_mad_u16 v4, v8, v9, v4
244; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
245; GFX8-NEXT:    v_mad_u16 v4, v10, v5, v4
246; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
247; GFX8-NEXT:    flat_store_short v[0:1], v2
248; GFX8-NEXT:    s_endpgm
249;
250; GFX9-NODL-LABEL: udot4_acc16:
251; GFX9-NODL:       ; %bb.0: ; %entry
252; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
253; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
254; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
255; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
256; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
257; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
258; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
259; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
260; GFX9-NODL-NEXT:    global_load_ushort v3, v0, s[2:3]
261; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
262; GFX9-NODL-NEXT:    v_and_b32_e32 v4, s0, v1
263; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
264; GFX9-NODL-NEXT:    v_and_b32_e32 v5, s0, v2
265; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
266; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
267; GFX9-NODL-NEXT:    v_and_b32_e32 v6, s0, v6
268; GFX9-NODL-NEXT:    v_and_b32_e32 v7, s0, v7
269; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
270; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
271; GFX9-NODL-NEXT:    v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
272; GFX9-NODL-NEXT:    v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
273; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v6, v7, v3
274; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
275; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
276; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v8, v9, v3
277; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
278; GFX9-NODL-NEXT:    global_store_short v0, v1, s[2:3]
279; GFX9-NODL-NEXT:    s_endpgm
280;
281; GFX9-DL-LABEL: udot4_acc16:
282; GFX9-DL:       ; %bb.0: ; %entry
283; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
284; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
285; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
286; GFX9-DL-NEXT:    s_movk_i32 s0, 0xff
287; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
288; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
289; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
290; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
291; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
292; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
293; GFX9-DL-NEXT:    v_and_b32_e32 v4, s0, v1
294; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
295; GFX9-DL-NEXT:    v_and_b32_e32 v5, s0, v2
296; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
297; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
298; GFX9-DL-NEXT:    v_and_b32_e32 v6, s0, v6
299; GFX9-DL-NEXT:    v_and_b32_e32 v7, s0, v7
300; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
301; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
302; GFX9-DL-NEXT:    v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
303; GFX9-DL-NEXT:    v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
304; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v6, v7, v3
305; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
306; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
307; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v8, v9, v3
308; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
309; GFX9-DL-NEXT:    global_store_short v0, v1, s[2:3]
310; GFX9-DL-NEXT:    s_endpgm
311;
312; GFX10-DL-LABEL: udot4_acc16:
313; GFX10-DL:       ; %bb.0: ; %entry
314; GFX10-DL-NEXT:    s_clause 0x1
315; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
316; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
317; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
318; GFX10-DL-NEXT:    s_movk_i32 s0, 0xff
319; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
320; GFX10-DL-NEXT:    s_clause 0x1
321; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
322; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
323; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
324; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
325; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
326; GFX10-DL-NEXT:    v_and_b32_e32 v4, s0, v1
327; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
328; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
329; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
330; GFX10-DL-NEXT:    v_and_b32_e32 v7, s0, v2
331; GFX10-DL-NEXT:    v_and_b32_e32 v5, s0, v5
332; GFX10-DL-NEXT:    v_and_b32_e32 v6, s0, v6
333; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
334; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
335; GFX10-DL-NEXT:    v_and_b32_sdwa v4, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
336; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
337; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
338; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
339; GFX10-DL-NEXT:    v_mad_u16 v3, v5, v6, v3
340; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
341; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
342; GFX10-DL-NEXT:    global_store_short v0, v1, s[2:3]
343; GFX10-DL-NEXT:    s_endpgm
344                                       <4 x i8> addrspace(1)* %src2,
345                                       i16 addrspace(1)* nocapture %dst) {
346entry:
  ; Each work-item indexes both source arrays by its workitem.id.x and loads one <4 x i8> from each.
347  %idx = call i32 @llvm.amdgcn.workitem.id.x()
348  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
349  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
350  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
351  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
352
  ; Four per-element products. Both operands are zero-extended from i8 to i16 before each mul.
353  %v1e0 = extractelement <4 x i8> %vec1, i64 0
354  %cv1e0 = zext i8 %v1e0 to i16
355  %v2e0 = extractelement <4 x i8> %vec2, i64 0
356  %cv2e0 = zext i8 %v2e0 to i16
357  %mul1 = mul nuw nsw i16 %cv1e0, %cv2e0
358
359  %v1e1 = extractelement <4 x i8> %vec1, i64 1
360  %cv1e1 = zext i8 %v1e1 to i16
361  %v2e1 = extractelement <4 x i8> %vec2, i64 1
362  %cv2e1 = zext i8 %v2e1 to i16
363  %mul2 = mul nuw nsw i16 %cv1e1, %cv2e1
364
365  %v1e2 = extractelement <4 x i8> %vec1, i64 2
366  %cv1e2 = zext i8 %v1e2 to i16
367  %v2e2 = extractelement <4 x i8> %vec2, i64 2
368  %cv2e2 = zext i8 %v2e2 to i16
369  %mul3 = mul nuw nsw i16 %cv1e2, %cv2e2
370
371  %v1e3 = extractelement <4 x i8> %vec1, i64 3
372  %cv1e3 = zext i8 %v1e3 to i16
373  %v2e3 = extractelement <4 x i8> %vec2, i64 3
374  %cv2e3 = zext i8 %v2e3 to i16
375  %mul4 = mul nuw nsw i16 %cv1e3, %cv2e3
376
  ; Add the four products, in element order, onto the i16 loaded from %dst, then store the sum back.
377  %acc = load i16, i16 addrspace(1)* %dst, align 2
378  %mad1 = add i16 %mul1, %acc
379  %mad2 = add i16 %mad1, %mul2
380  %mad3 = add i16 %mad2, %mul3
381  %mad4 = add i16 %mad3, %mul4
382
383  store i16 %mad4, i16 addrspace(1)* %dst, align 2
384  ret void
385}
386
; udot4_acc8 - 4 x i8 dot product where the products and the accumulator stay in i8.
; The elements are multiplied without any zext, and the i8 at %dst is both the accumulator input
; and the destination. None of the expected outputs below contains a dot4 instruction. gfx700 uses
; v_mad_u32_u24, gfx803 and gfx1011/gfx1012 use v_mad_u16, and gfx900/gfx906 use v_mad_legacy_u16.
387define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
388; GFX7-LABEL: udot4_acc8:
389; GFX7:       ; %bb.0: ; %entry
390; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
391; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
392; GFX7-NEXT:    s_mov_b32 s3, 0xf000
393; GFX7-NEXT:    s_mov_b32 s10, 0
394; GFX7-NEXT:    s_mov_b32 s11, s3
395; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
396; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
397; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
398; GFX7-NEXT:    v_mov_b32_e32 v1, 0
399; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
400; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
401; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
402; GFX7-NEXT:    s_mov_b32 s2, -1
403; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
404; GFX7-NEXT:    s_movk_i32 s4, 0xff
405; GFX7-NEXT:    s_waitcnt vmcnt(2)
406; GFX7-NEXT:    v_and_b32_e32 v3, s4, v2
407; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
408; GFX7-NEXT:    s_waitcnt vmcnt(1)
409; GFX7-NEXT:    v_and_b32_e32 v6, s4, v0
410; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
411; GFX7-NEXT:    s_waitcnt vmcnt(0)
412; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
413; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
414; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
415; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
416; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
417; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
418; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
419; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
420; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
421; GFX7-NEXT:    s_endpgm
422;
423; GFX8-LABEL: udot4_acc8:
424; GFX8:       ; %bb.0: ; %entry
425; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
426; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
427; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
428; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
429; GFX8-NEXT:    v_mov_b32_e32 v1, s5
430; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
431; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
432; GFX8-NEXT:    flat_load_dword v3, v[0:1]
433; GFX8-NEXT:    v_mov_b32_e32 v1, s7
434; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
435; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
436; GFX8-NEXT:    flat_load_dword v2, v[0:1]
437; GFX8-NEXT:    v_mov_b32_e32 v0, s0
438; GFX8-NEXT:    v_mov_b32_e32 v1, s1
439; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
440; GFX8-NEXT:    s_waitcnt vmcnt(2)
441; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
442; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
443; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
444; GFX8-NEXT:    s_waitcnt vmcnt(1)
445; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
446; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
447; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
448; GFX8-NEXT:    s_waitcnt vmcnt(0)
449; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
450; GFX8-NEXT:    v_mad_u16 v2, v7, v8, v2
451; GFX8-NEXT:    v_mad_u16 v2, v5, v6, v2
452; GFX8-NEXT:    v_mad_u16 v2, v9, v10, v2
453; GFX8-NEXT:    flat_store_byte v[0:1], v2
454; GFX8-NEXT:    s_endpgm
455;
456; GFX9-NODL-LABEL: udot4_acc8:
457; GFX9-NODL:       ; %bb.0: ; %entry
458; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
459; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
460; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
461; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
462; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
463; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
464; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
465; GFX9-NODL-NEXT:    global_load_ubyte v3, v0, s[2:3]
466; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
467; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
468; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
469; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
470; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
471; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
472; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
473; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
474; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
475; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v6, v7, v1
476; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
477; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
478; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v8, v9, v1
479; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[2:3]
480; GFX9-NODL-NEXT:    s_endpgm
481;
482; GFX9-DL-LABEL: udot4_acc8:
483; GFX9-DL:       ; %bb.0: ; %entry
484; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
485; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
486; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
487; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
488; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
489; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
490; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
491; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
492; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
493; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
494; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
495; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
496; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
497; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
498; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
499; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
500; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
501; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v6, v7, v1
502; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
503; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
504; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v9, v1
505; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
506; GFX9-DL-NEXT:    s_endpgm
507;
508; GFX10-DL-LABEL: udot4_acc8:
509; GFX10-DL:       ; %bb.0: ; %entry
510; GFX10-DL-NEXT:    s_clause 0x1
511; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
512; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
513; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
514; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
515; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
516; GFX10-DL-NEXT:    s_clause 0x1
517; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
518; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
519; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
520; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
521; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
522; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
523; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
524; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
525; GFX10-DL-NEXT:    v_mad_u16 v4, v2, v3, v4
526; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
527; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
528; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
529; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
530; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
531; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
532; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
533; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
534; GFX10-DL-NEXT:    s_endpgm
535                                      <4 x i8> addrspace(1)* %src2,
536                                      i8 addrspace(1)* nocapture %dst) {
537entry:
  ; Each work-item indexes both source arrays by its workitem.id.x and loads one <4 x i8> from each.
538  %idx = call i32 @llvm.amdgcn.workitem.id.x()
539  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
540  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
541  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
542  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
543
  ; Four per-element products computed directly in i8 (no extension), each mul flagged nuw nsw.
544  %v1e0 = extractelement <4 x i8> %vec1, i64 0
545  %v2e0 = extractelement <4 x i8> %vec2, i64 0
546  %mul1 = mul nuw nsw i8 %v1e0, %v2e0
547
548  %v1e1 = extractelement <4 x i8> %vec1, i64 1
549  %v2e1 = extractelement <4 x i8> %vec2, i64 1
550  %mul2 = mul nuw nsw i8 %v1e1, %v2e1
551
552  %v1e2 = extractelement <4 x i8> %vec1, i64 2
553  %v2e2 = extractelement <4 x i8> %vec2, i64 2
554  %mul3 = mul nuw nsw i8 %v1e2, %v2e2
555
556  %v1e3 = extractelement <4 x i8> %vec1, i64 3
557  %v2e3 = extractelement <4 x i8> %vec2, i64 3
558  %mul4 = mul nuw nsw i8 %v1e3, %v2e3
559
  ; Add the four products, in element order, onto the i8 loaded from %dst, then store the sum back.
  ; NOTE(review) - the i8 load and store use align 2, matching the i16 variant; confirm this is intentional.
560  %acc = load i8, i8 addrspace(1)* %dst, align 2
561  %mad1 = add i8 %mul1, %acc
562  %mad2 = add i8 %mad1, %mul2
563  %mad3 = add i8 %mad2, %mul3
564  %mad4 = add i8 %mad3, %mul4
565
566  store i8 %mad4, i8 addrspace(1)* %dst, align 2
567  ret void
568}
569
570; TODO: Generate udot4?
; udot2_8 - two-term variant. Only elements 0 and 1 of each <4 x i8> are multiplied (in i8, no
; extension) and added onto the i8 loaded from %dst; elements 2 and 3 are loaded but unused.
; Each expected output below is a two-instruction mad chain. gfx700 uses v_mad_u32_u24, gfx803 and
; gfx1011/gfx1012 use v_mad_u16, and gfx900/gfx906 use v_mad_legacy_u16.
571define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1,
572; GFX7-LABEL: udot2_8:
573; GFX7:       ; %bb.0: ; %entry
574; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
575; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
576; GFX7-NEXT:    s_mov_b32 s3, 0xf000
577; GFX7-NEXT:    s_mov_b32 s10, 0
578; GFX7-NEXT:    s_mov_b32 s11, s3
579; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
580; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
581; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
582; GFX7-NEXT:    v_mov_b32_e32 v1, 0
583; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
584; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
585; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
586; GFX7-NEXT:    s_mov_b32 s2, -1
587; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
588; GFX7-NEXT:    s_movk_i32 s4, 0xff
589; GFX7-NEXT:    s_waitcnt vmcnt(2)
590; GFX7-NEXT:    v_and_b32_e32 v3, s4, v2
591; GFX7-NEXT:    v_bfe_u32 v2, v2, 8, 8
592; GFX7-NEXT:    s_waitcnt vmcnt(1)
593; GFX7-NEXT:    v_and_b32_e32 v4, s4, v0
594; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
595; GFX7-NEXT:    s_waitcnt vmcnt(0)
596; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
597; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
598; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
599; GFX7-NEXT:    s_endpgm
600;
601; GFX8-LABEL: udot2_8:
602; GFX8:       ; %bb.0: ; %entry
603; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
604; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
605; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
606; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
607; GFX8-NEXT:    v_mov_b32_e32 v1, s5
608; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
609; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
610; GFX8-NEXT:    flat_load_dword v3, v[0:1]
611; GFX8-NEXT:    v_mov_b32_e32 v1, s7
612; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
613; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
614; GFX8-NEXT:    flat_load_dword v2, v[0:1]
615; GFX8-NEXT:    v_mov_b32_e32 v0, s0
616; GFX8-NEXT:    v_mov_b32_e32 v1, s1
617; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
618; GFX8-NEXT:    s_waitcnt vmcnt(2)
619; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
620; GFX8-NEXT:    s_waitcnt vmcnt(1)
621; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
622; GFX8-NEXT:    s_waitcnt vmcnt(0)
623; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
624; GFX8-NEXT:    v_mad_u16 v2, v5, v6, v2
625; GFX8-NEXT:    flat_store_byte v[0:1], v2
626; GFX8-NEXT:    s_endpgm
627;
628; GFX9-NODL-LABEL: udot2_8:
629; GFX9-NODL:       ; %bb.0: ; %entry
630; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
631; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
632; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
633; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, 0
634; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
635; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[4:5]
636; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[6:7]
637; GFX9-NODL-NEXT:    global_load_ubyte v4, v1, s[2:3]
638; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
639; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
640; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
641; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
642; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
643; GFX9-NODL-NEXT:    v_mad_legacy_u16 v2, v2, v3, v4
644; GFX9-NODL-NEXT:    v_mad_legacy_u16 v0, v0, v5, v2
645; GFX9-NODL-NEXT:    global_store_byte v1, v0, s[2:3]
646; GFX9-NODL-NEXT:    s_endpgm
647;
648; GFX9-DL-LABEL: udot2_8:
649; GFX9-DL:       ; %bb.0: ; %entry
650; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
651; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
652; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
653; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
654; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
655; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
656; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
657; GFX9-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
658; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
659; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
660; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
661; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
662; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
663; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v2, v3, v4
664; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v0, v5, v2
665; GFX9-DL-NEXT:    global_store_byte v1, v0, s[2:3]
666; GFX9-DL-NEXT:    s_endpgm
667;
668; GFX10-DL-LABEL: udot2_8:
669; GFX10-DL:       ; %bb.0: ; %entry
670; GFX10-DL-NEXT:    s_clause 0x1
671; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
672; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
673; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
674; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
675; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
676; GFX10-DL-NEXT:    s_clause 0x1
677; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
678; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
679; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
680; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
681; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
682; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
683; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
684; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
685; GFX10-DL-NEXT:    v_mad_u16 v2, v2, v3, v4
686; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v2
687; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
688; GFX10-DL-NEXT:    s_endpgm
689                                   <4 x i8> addrspace(1)* %src2,
690                                   i8 addrspace(1)* nocapture %dst) {
691entry:
  ; Each work-item indexes both source arrays by its workitem.id.x and loads one <4 x i8> from each.
692  %idx = call i32 @llvm.amdgcn.workitem.id.x()
693  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
694  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
695  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
696  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
697
  ; Products of elements 0 and 1 only, computed directly in i8 with nuw nsw on each mul.
698  %v1e0 = extractelement <4 x i8> %vec1, i64 0
699  %v2e0 = extractelement <4 x i8> %vec2, i64 0
700  %mul1 = mul nuw nsw i8 %v1e0, %v2e0
701
702  %v1e1 = extractelement <4 x i8> %vec1, i64 1
703  %v2e1 = extractelement <4 x i8> %vec2, i64 1
704  %mul2 = mul nuw nsw i8 %v1e1, %v2e1
705
  ; Add both products onto the i8 loaded from %dst, then store the sum back.
  ; NOTE(review) - the i8 load and store use align 2; confirm this is intentional.
706  %acc = load i8, i8 addrspace(1)* %dst, align 2
707  %mad1 = add i8 %mul1, %acc
708  %mad2 = add i8 %mad1, %mul2
709  store i8 %mad2, i8 addrspace(1)* %dst, align 2
710  ret void
711}
712
; udot4_CommutationInsideMAD: 4 x i8 dot-product pattern with an i8 accumulator
; in which the operands inside each multiply-add are commuted: every mul takes
; the %vec2 element as its left operand, and the last three adds take the
; product as their left operand. The autogenerated assertions below expect a
; chain of four mad instructions on every tested subtarget.
713define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %src1,
714; GFX7-LABEL: udot4_CommutationInsideMAD:
715; GFX7:       ; %bb.0: ; %entry
716; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
717; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
718; GFX7-NEXT:    s_mov_b32 s3, 0xf000
719; GFX7-NEXT:    s_mov_b32 s10, 0
720; GFX7-NEXT:    s_mov_b32 s11, s3
721; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
722; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
723; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
724; GFX7-NEXT:    v_mov_b32_e32 v1, 0
725; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
726; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
727; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
728; GFX7-NEXT:    s_mov_b32 s2, -1
729; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
730; GFX7-NEXT:    s_movk_i32 s4, 0xff
731; GFX7-NEXT:    s_waitcnt vmcnt(2)
732; GFX7-NEXT:    v_and_b32_e32 v3, s4, v2
733; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
734; GFX7-NEXT:    s_waitcnt vmcnt(1)
735; GFX7-NEXT:    v_and_b32_e32 v6, s4, v0
736; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
737; GFX7-NEXT:    s_waitcnt vmcnt(0)
738; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v3, v1
739; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
740; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
741; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v4, v1
742; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
743; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
744; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v5, v1
745; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
746; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
747; GFX7-NEXT:    s_endpgm
748;
749; GFX8-LABEL: udot4_CommutationInsideMAD:
750; GFX8:       ; %bb.0: ; %entry
751; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
752; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
753; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
754; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
755; GFX8-NEXT:    v_mov_b32_e32 v1, s5
756; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
757; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
758; GFX8-NEXT:    flat_load_dword v3, v[0:1]
759; GFX8-NEXT:    v_mov_b32_e32 v1, s7
760; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
761; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
762; GFX8-NEXT:    flat_load_dword v2, v[0:1]
763; GFX8-NEXT:    v_mov_b32_e32 v0, s0
764; GFX8-NEXT:    v_mov_b32_e32 v1, s1
765; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
766; GFX8-NEXT:    s_waitcnt vmcnt(2)
767; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
768; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
769; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
770; GFX8-NEXT:    s_waitcnt vmcnt(1)
771; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
772; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
773; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
774; GFX8-NEXT:    s_waitcnt vmcnt(0)
775; GFX8-NEXT:    v_mad_u16 v2, v2, v3, v4
776; GFX8-NEXT:    v_mad_u16 v2, v8, v7, v2
777; GFX8-NEXT:    v_mad_u16 v2, v6, v5, v2
778; GFX8-NEXT:    v_mad_u16 v2, v10, v9, v2
779; GFX8-NEXT:    flat_store_byte v[0:1], v2
780; GFX8-NEXT:    s_endpgm
781;
782; GFX9-NODL-LABEL: udot4_CommutationInsideMAD:
783; GFX9-NODL:       ; %bb.0: ; %entry
784; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
785; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
786; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
787; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
788; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
789; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
790; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
791; GFX9-NODL-NEXT:    global_load_ubyte v3, v0, s[2:3]
792; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
793; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
794; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
795; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
796; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
797; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
798; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
799; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v2, v1, v3
800; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
801; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v7, v6, v1
802; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
803; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v5, v4, v1
804; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v9, v8, v1
805; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[2:3]
806; GFX9-NODL-NEXT:    s_endpgm
807;
808; GFX9-DL-LABEL: udot4_CommutationInsideMAD:
809; GFX9-DL:       ; %bb.0: ; %entry
810; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
811; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
812; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
813; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
814; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
815; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
816; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
817; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
818; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
819; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
820; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
821; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
822; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
823; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
824; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
825; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v2, v1, v3
826; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
827; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v7, v6, v1
828; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
829; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v4, v1
830; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v9, v8, v1
831; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
832; GFX9-DL-NEXT:    s_endpgm
833;
834; GFX10-DL-LABEL: udot4_CommutationInsideMAD:
835; GFX10-DL:       ; %bb.0: ; %entry
836; GFX10-DL-NEXT:    s_clause 0x1
837; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
838; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
839; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
840; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
841; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
842; GFX10-DL-NEXT:    s_clause 0x1
843; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
844; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
845; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
846; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
847; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
848; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
849; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
850; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
851; GFX10-DL-NEXT:    v_mad_u16 v4, v3, v2, v4
852; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
853; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
854; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
855; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
856; GFX10-DL-NEXT:    v_mad_u16 v0, v5, v0, v4
857; GFX10-DL-NEXT:    v_mad_u16 v0, v7, v6, v0
858; GFX10-DL-NEXT:    v_mad_u16 v0, v3, v2, v0
859; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
860; GFX10-DL-NEXT:    s_endpgm
861                                                      <4 x i8> addrspace(1)* %src2,
862                                                      i8 addrspace(1)* nocapture %dst) {
863entry:
  ; Each work-item loads the <4 x i8> at index %idx from both source buffers.
864  %idx = call i32 @llvm.amdgcn.workitem.id.x()
865  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
866  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
867  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
868  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
869
  ; Element-wise i8 products; the %vec2 element is the left operand of each mul.
870  %v1e0 = extractelement <4 x i8> %vec1, i64 0
871  %v2e0 = extractelement <4 x i8> %vec2, i64 0
872  %mul1 = mul nuw nsw i8 %v2e0, %v1e0
873
874  %v1e1 = extractelement <4 x i8> %vec1, i64 1
875  %v2e1 = extractelement <4 x i8> %vec2, i64 1
876  %mul2 = mul nuw nsw i8 %v2e1, %v1e1
877
878  %v1e2 = extractelement <4 x i8> %vec1, i64 2
879  %v2e2 = extractelement <4 x i8> %vec2, i64 2
880  %mul3 = mul nuw nsw i8 %v2e2, %v1e2
881
882  %v1e3 = extractelement <4 x i8> %vec1, i64 3
883  %v2e3 = extractelement <4 x i8> %vec2, i64 3
884  %mul4 = mul nuw nsw i8 %v2e3, %v1e3
885
  ; Accumulate onto the i8 loaded from %dst. The first add has the accumulator
  ; on the left; the remaining adds have the product on the left.
886  %acc = load i8, i8 addrspace(1)* %dst, align 2
887  %mad1 = add i8 %acc, %mul1
888  %mad2 = add i8 %mul2, %mad1
889  %mad3 = add i8 %mul3, %mad2
890  %mad4 = add i8 %mul4, %mad3
891
892  store i8 %mad4, i8 addrspace(1)* %dst, align 2
893  ret void
894}
895
896; TODO: Support commutation across the adds.
; udot4_CommutationAccrossMADs: 4 x i8 dot-product pattern with an i8
; accumulator in which the products are accumulated out of element order: the
; first add consumes %mul2 (element 1) and the second consumes %mul1
; (element 0). The autogenerated assertions below expect a chain of four mad
; instructions on every tested subtarget, the first one using the byte-1
; operands.
897define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %src1,
898; GFX7-LABEL: udot4_CommutationAccrossMADs:
899; GFX7:       ; %bb.0: ; %entry
900; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
901; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
902; GFX7-NEXT:    s_mov_b32 s3, 0xf000
903; GFX7-NEXT:    s_mov_b32 s10, 0
904; GFX7-NEXT:    s_mov_b32 s11, s3
905; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
906; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
907; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
908; GFX7-NEXT:    v_mov_b32_e32 v1, 0
909; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
910; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
911; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
912; GFX7-NEXT:    s_mov_b32 s2, -1
913; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
914; GFX7-NEXT:    s_movk_i32 s4, 0xff
915; GFX7-NEXT:    s_waitcnt vmcnt(2)
916; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
917; GFX7-NEXT:    v_and_b32_e32 v3, s4, v2
918; GFX7-NEXT:    s_waitcnt vmcnt(1)
919; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
920; GFX7-NEXT:    v_and_b32_e32 v6, s4, v0
921; GFX7-NEXT:    s_waitcnt vmcnt(0)
922; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v4, v1
923; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
924; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
925; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v3, v1
926; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
927; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
928; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v5, v1
929; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
930; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
931; GFX7-NEXT:    s_endpgm
932;
933; GFX8-LABEL: udot4_CommutationAccrossMADs:
934; GFX8:       ; %bb.0: ; %entry
935; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
936; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
937; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
938; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
939; GFX8-NEXT:    v_mov_b32_e32 v1, s5
940; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
941; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
942; GFX8-NEXT:    flat_load_dword v3, v[0:1]
943; GFX8-NEXT:    v_mov_b32_e32 v1, s7
944; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
945; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
946; GFX8-NEXT:    flat_load_dword v2, v[0:1]
947; GFX8-NEXT:    v_mov_b32_e32 v0, s0
948; GFX8-NEXT:    v_mov_b32_e32 v1, s1
949; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
950; GFX8-NEXT:    s_waitcnt vmcnt(2)
951; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
952; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
953; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
954; GFX8-NEXT:    s_waitcnt vmcnt(1)
955; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
956; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
957; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
958; GFX8-NEXT:    s_waitcnt vmcnt(0)
959; GFX8-NEXT:    v_mad_u16 v4, v8, v7, v4
960; GFX8-NEXT:    v_mad_u16 v2, v2, v3, v4
961; GFX8-NEXT:    v_mad_u16 v2, v6, v5, v2
962; GFX8-NEXT:    v_mad_u16 v2, v10, v9, v2
963; GFX8-NEXT:    flat_store_byte v[0:1], v2
964; GFX8-NEXT:    s_endpgm
965;
966; GFX9-NODL-LABEL: udot4_CommutationAccrossMADs:
967; GFX9-NODL:       ; %bb.0: ; %entry
968; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
969; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
970; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
971; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
972; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
973; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
974; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
975; GFX9-NODL-NEXT:    global_load_ubyte v3, v0, s[2:3]
976; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
977; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
978; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
979; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
980; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
981; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
982; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v7, v6, v3
983; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
984; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
985; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v2, v1, v3
986; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
987; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v5, v4, v1
988; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v9, v8, v1
989; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[2:3]
990; GFX9-NODL-NEXT:    s_endpgm
991;
992; GFX9-DL-LABEL: udot4_CommutationAccrossMADs:
993; GFX9-DL:       ; %bb.0: ; %entry
994; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
995; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
996; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
997; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
998; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
999; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1000; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1001; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
1002; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
1003; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
1004; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1005; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
1006; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
1007; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1008; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v7, v6, v3
1009; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
1010; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
1011; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v2, v1, v3
1012; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
1013; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v4, v1
1014; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v9, v8, v1
1015; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
1016; GFX9-DL-NEXT:    s_endpgm
1017;
1018; GFX10-DL-LABEL: udot4_CommutationAccrossMADs:
1019; GFX10-DL:       ; %bb.0: ; %entry
1020; GFX10-DL-NEXT:    s_clause 0x1
1021; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1022; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1023; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1024; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
1025; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1026; GFX10-DL-NEXT:    s_clause 0x1
1027; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
1028; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
1029; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
1030; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
1031; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
1032; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1033; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
1034; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1035; GFX10-DL-NEXT:    v_mad_u16 v0, v5, v0, v4
1036; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
1037; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
1038; GFX10-DL-NEXT:    v_mad_u16 v0, v3, v2, v0
1039; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1040; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
1041; GFX10-DL-NEXT:    v_mad_u16 v0, v5, v4, v0
1042; GFX10-DL-NEXT:    v_mad_u16 v0, v3, v2, v0
1043; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
1044; GFX10-DL-NEXT:    s_endpgm
1045                                                        <4 x i8> addrspace(1)* %src2,
1046                                                        i8 addrspace(1)* nocapture %dst) {
1047entry:
  ; Each work-item loads the <4 x i8> at index %idx from both source buffers.
1048  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1049  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
1050  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
1051  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
1052  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
1053
  ; Element-wise i8 products; the %vec2 element is the left operand of each mul.
1054  %v1e0 = extractelement <4 x i8> %vec1, i64 0
1055  %v2e0 = extractelement <4 x i8> %vec2, i64 0
1056  %mul1 = mul nuw nsw i8 %v2e0, %v1e0
1057
1058  %v1e1 = extractelement <4 x i8> %vec1, i64 1
1059  %v2e1 = extractelement <4 x i8> %vec2, i64 1
1060  %mul2 = mul nuw nsw i8 %v2e1, %v1e1
1061
1062  %v1e2 = extractelement <4 x i8> %vec1, i64 2
1063  %v2e2 = extractelement <4 x i8> %vec2, i64 2
1064  %mul3 = mul nuw nsw i8 %v2e2, %v1e2
1065
1066  %v1e3 = extractelement <4 x i8> %vec1, i64 3
1067  %v2e3 = extractelement <4 x i8> %vec2, i64 3
1068  %mul4 = mul nuw nsw i8 %v2e3, %v1e3
1069
  ; Accumulate onto the i8 loaded from %dst, adding %mul2 before %mul1 so the
  ; add chain does not follow element order.
1070  %acc = load i8, i8 addrspace(1)* %dst, align 2
1071  %mad1 = add i8 %acc, %mul2
1072  %mad2 = add i8 %mad1, %mul1
1073  %mad3 = add i8 %mad2, %mul3
1074  %mad4 = add i8 %mad3, %mul4
1075
1076  store i8 %mad4, i8 addrspace(1)* %dst, align 2
1077  ret void
1078}
1079
; udot4_multiuse_mul1: 4 x i8 dot-product pattern with the elements
; zero-extended to i32 and an i32 accumulator, in which %mul1 has two users
; (%add and %add2), so the element-0 product is added twice. The autogenerated
; assertions below expect separate mul/mad/add instructions on every tested
; subtarget.
1080define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
1081; GFX7-LABEL: udot4_multiuse_mul1:
1082; GFX7:       ; %bb.0: ; %entry
1083; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1084; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1085; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1086; GFX7-NEXT:    s_mov_b32 s10, 0
1087; GFX7-NEXT:    s_mov_b32 s11, s3
1088; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1089; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1090; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1091; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1092; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1093; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1094; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1095; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
1096; GFX7-NEXT:    s_movk_i32 s4, 0xff
1097; GFX7-NEXT:    s_mov_b32 s2, -1
1098; GFX7-NEXT:    s_waitcnt vmcnt(1)
1099; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
1100; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
1101; GFX7-NEXT:    s_waitcnt vmcnt(0)
1102; GFX7-NEXT:    v_and_b32_e32 v5, s4, v0
1103; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
1104; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1105; GFX7-NEXT:    v_mad_u32_u24 v8, v1, v5, s5
1106; GFX7-NEXT:    v_mad_u32_u24 v3, v3, v6, v8
1107; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
1108; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
1109; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, v3
1110; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1111; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
1112; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
1113; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
1114; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1115; GFX7-NEXT:    s_endpgm
1116;
1117; GFX8-LABEL: udot4_multiuse_mul1:
1118; GFX8:       ; %bb.0: ; %entry
1119; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1120; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1121; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1122; GFX8-NEXT:    s_movk_i32 s2, 0xff
1123; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1124; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1125; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1126; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1127; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1128; GFX8-NEXT:    v_mov_b32_e32 v1, s7
1129; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
1130; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1131; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1132; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
1133; GFX8-NEXT:    s_waitcnt vmcnt(1)
1134; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
1135; GFX8-NEXT:    v_bfe_u32 v4, v3, 8, 8
1136; GFX8-NEXT:    v_bfe_u32 v6, v3, 16, 8
1137; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
1138; GFX8-NEXT:    s_waitcnt vmcnt(0)
1139; GFX8-NEXT:    v_and_b32_e32 v2, s2, v0
1140; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
1141; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1142; GFX8-NEXT:    v_mad_u32_u24 v8, v1, v2, s3
1143; GFX8-NEXT:    v_mad_u32_u24 v4, v4, v5, v8
1144; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 8
1145; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, v4
1146; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
1147; GFX8-NEXT:    v_mad_u32_u24 v1, v6, v7, v1
1148; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
1149; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1150; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1151; GFX8-NEXT:    flat_store_dword v[0:1], v2
1152; GFX8-NEXT:    s_endpgm
1153;
1154; GFX9-NODL-LABEL: udot4_multiuse_mul1:
1155; GFX9-NODL:       ; %bb.0: ; %entry
1156; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1157; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1158; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1159; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
1160; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1161; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
1162; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
1163; GFX9-NODL-NEXT:    s_load_dword s1, s[2:3], 0x0
1164; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1165; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
1166; GFX9-NODL-NEXT:    v_and_b32_e32 v3, s0, v1
1167; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1168; GFX9-NODL-NEXT:    v_and_b32_e32 v4, s0, v2
1169; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1170; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1171; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1172; GFX9-NODL-NEXT:    v_mul_u32_u24_e32 v2, v3, v4
1173; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1174; GFX9-NODL-NEXT:    v_mad_u32_u24 v3, v3, v4, s1
1175; GFX9-NODL-NEXT:    v_add3_u32 v2, v5, v3, v2
1176; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v6, v1
1177; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
1178; GFX9-NODL-NEXT:    s_endpgm
1179;
1180; GFX9-DL-LABEL: udot4_multiuse_mul1:
1181; GFX9-DL:       ; %bb.0: ; %entry
1182; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1183; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1184; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1185; GFX9-DL-NEXT:    s_movk_i32 s0, 0xff
1186; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1187; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1188; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1189; GFX9-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
1190; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1191; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1192; GFX9-DL-NEXT:    v_and_b32_e32 v3, s0, v1
1193; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1194; GFX9-DL-NEXT:    v_and_b32_e32 v4, s0, v2
1195; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1196; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1197; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1198; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v2, v3, v4
1199; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1200; GFX9-DL-NEXT:    v_mad_u32_u24 v3, v3, v4, s1
1201; GFX9-DL-NEXT:    v_add3_u32 v2, v5, v3, v2
1202; GFX9-DL-NEXT:    v_add3_u32 v1, v2, v6, v1
1203; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
1204; GFX9-DL-NEXT:    s_endpgm
1205;
1206; GFX10-DL-LABEL: udot4_multiuse_mul1:
1207; GFX10-DL:       ; %bb.0: ; %entry
1208; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1209; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1210; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1211; GFX10-DL-NEXT:    s_movk_i32 s3, 0xff
1212; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1213; GFX10-DL-NEXT:    s_clause 0x1
1214; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1215; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1216; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
1217; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1218; GFX10-DL-NEXT:    v_and_b32_e32 v0, s3, v1
1219; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1220; GFX10-DL-NEXT:    v_and_b32_e32 v3, s3, v2
1221; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1222; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v5, v0, v3
1223; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1224; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v0, v3, s2
1225; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1226; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1227; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
1228; GFX10-DL-NEXT:    v_add3_u32 v0, v4, v0, v5
1229; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
1230; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
1231; GFX10-DL-NEXT:    s_endpgm
1232                                               <4 x i8> addrspace(1)* %src2,
1233                                               i32 addrspace(1)* nocapture %dst) {
1234entry:
  ; Each work-item loads the <4 x i8> at index %idx from both source buffers.
1235  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1236  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
1237  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
1238  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
1239  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
1240
  ; Zero-extend each i8 element to i32 and form the four element-wise products.
1241  %v1e0 = extractelement <4 x i8> %vec1, i64 0
1242  %cv1e0 = zext i8 %v1e0 to i32
1243  %v2e0 = extractelement <4 x i8> %vec2, i64 0
1244  %cv2e0 = zext i8 %v2e0 to i32
1245  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
1246
1247  %v1e1 = extractelement <4 x i8> %vec1, i64 1
1248  %cv1e1 = zext i8 %v1e1 to i32
1249  %v2e1 = extractelement <4 x i8> %vec2, i64 1
1250  %cv2e1 = zext i8 %v2e1 to i32
1251  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
1252
1253  %v1e2 = extractelement <4 x i8> %vec1, i64 2
1254  %cv1e2 = zext i8 %v1e2 to i32
1255  %v2e2 = extractelement <4 x i8> %vec2, i64 2
1256  %cv2e2 = zext i8 %v2e2 to i32
1257  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
1258
1259  %v1e3 = extractelement <4 x i8> %vec1, i64 3
1260  %cv1e3 = zext i8 %v1e3 to i32
1261  %v2e3 = extractelement <4 x i8> %vec2, i64 3
1262  %cv2e3 = zext i8 %v2e3 to i32
1263  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
1264
  ; Accumulate onto the i32 loaded from %dst. %mul1 is used by both %add and
  ; %add2, which is the extra use this test is named for.
1265  %acc = load i32, i32 addrspace(1)* %dst, align 4
1266  %add = add i32 %mul1, %acc
1267  %add1 = add i32 %mul2, %add
1268  %add2 = add i32 %add1, %mul1
1269  %add3 = add i32 %add2, %mul3
1270  %add4 = add i32 %add3, %mul4
1271
1272  store i32 %add4, i32 addrspace(1)* %dst, align 4
1273  ret void
1274}
1275
1276define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
1277; GFX7-LABEL: udot4_multiuse_add1:
1278; GFX7:       ; %bb.0: ; %entry
1279; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1280; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1281; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1282; GFX7-NEXT:    s_mov_b32 s10, 0
1283; GFX7-NEXT:    s_mov_b32 s11, s3
1284; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1285; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1286; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1287; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1288; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1289; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1290; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1291; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
1292; GFX7-NEXT:    s_movk_i32 s4, 0xff
1293; GFX7-NEXT:    s_mov_b32 s2, -1
1294; GFX7-NEXT:    s_waitcnt vmcnt(1)
1295; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
1296; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
1297; GFX7-NEXT:    s_waitcnt vmcnt(0)
1298; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
1299; GFX7-NEXT:    v_and_b32_e32 v5, s4, v0
1300; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1301; GFX7-NEXT:    v_mad_u32_u24 v3, v3, v6, s5
1302; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
1303; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
1304; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, v3
1305; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1306; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
1307; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
1308; GFX7-NEXT:    v_add_i32_e32 v6, vcc, s5, v3
1309; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
1310; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
1311; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1312; GFX7-NEXT:    s_endpgm
1313;
1314; GFX8-LABEL: udot4_multiuse_add1:
1315; GFX8:       ; %bb.0: ; %entry
1316; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1317; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1318; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1319; GFX8-NEXT:    s_movk_i32 s2, 0xff
1320; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1321; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1322; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1323; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1324; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1325; GFX8-NEXT:    v_mov_b32_e32 v1, s7
1326; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
1327; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1328; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1329; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
1330; GFX8-NEXT:    s_waitcnt vmcnt(1)
1331; GFX8-NEXT:    v_bfe_u32 v4, v3, 8, 8
1332; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
1333; GFX8-NEXT:    v_bfe_u32 v6, v3, 16, 8
1334; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
1335; GFX8-NEXT:    s_waitcnt vmcnt(0)
1336; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
1337; GFX8-NEXT:    v_and_b32_e32 v2, s2, v0
1338; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1339; GFX8-NEXT:    v_mad_u32_u24 v4, v4, v5, s3
1340; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 8
1341; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, v4
1342; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
1343; GFX8-NEXT:    v_mad_u32_u24 v1, v6, v7, v1
1344; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s3, v4
1345; GFX8-NEXT:    v_mad_u32_u24 v0, v3, v0, v1
1346; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v0, v5
1347; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1348; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1349; GFX8-NEXT:    flat_store_dword v[0:1], v2
1350; GFX8-NEXT:    s_endpgm
1351;
1352; GFX9-NODL-LABEL: udot4_multiuse_add1:
1353; GFX9-NODL:       ; %bb.0: ; %entry
1354; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1355; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1356; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1357; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1358; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
1359; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
1360; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
1361; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1362; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
1363; GFX9-NODL-NEXT:    v_bfe_u32 v4, v1, 8, 8
1364; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1365; GFX9-NODL-NEXT:    v_bfe_u32 v5, v2, 8, 8
1366; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1367; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1368; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1369; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1370; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v4, v5, s0
1371; GFX9-NODL-NEXT:    v_add_u32_e32 v4, s0, v2
1372; GFX9-NODL-NEXT:    v_add3_u32 v2, v2, v3, v6
1373; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v1, v4
1374; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
1375; GFX9-NODL-NEXT:    s_endpgm
1376;
1377; GFX9-DL-LABEL: udot4_multiuse_add1:
1378; GFX9-DL:       ; %bb.0: ; %entry
1379; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1380; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1381; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1382; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1383; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1384; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1385; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
1386; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1387; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1388; GFX9-DL-NEXT:    v_bfe_u32 v4, v1, 8, 8
1389; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1390; GFX9-DL-NEXT:    v_bfe_u32 v5, v2, 8, 8
1391; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1392; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1393; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1394; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1395; GFX9-DL-NEXT:    v_mad_u32_u24 v2, v4, v5, s0
1396; GFX9-DL-NEXT:    v_add_u32_e32 v4, s0, v2
1397; GFX9-DL-NEXT:    v_add3_u32 v2, v2, v3, v6
1398; GFX9-DL-NEXT:    v_add3_u32 v1, v2, v1, v4
1399; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
1400; GFX9-DL-NEXT:    s_endpgm
1401;
1402; GFX10-DL-LABEL: udot4_multiuse_add1:
1403; GFX10-DL:       ; %bb.0: ; %entry
1404; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1405; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1406; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1407; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1408; GFX10-DL-NEXT:    s_clause 0x1
1409; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1410; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1411; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
1412; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1413; GFX10-DL-NEXT:    v_bfe_u32 v0, v1, 8, 8
1414; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1415; GFX10-DL-NEXT:    v_bfe_u32 v3, v2, 8, 8
1416; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1417; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1418; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v0, v3, s2
1419; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1420; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1421; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, s2, v0
1422; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v4, v3
1423; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
1424; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v1, v2
1425; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
1426; GFX10-DL-NEXT:    s_endpgm
1427                                               <4 x i8> addrspace(1)* %src2,
1428                                               i32 addrspace(1)* nocapture %dst) {
1429entry:
1430  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1431  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
1432  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
1433  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
1434  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
1435
1436  %v1e0 = extractelement <4 x i8> %vec1, i64 0
1437  %cv1e0 = zext i8 %v1e0 to i32
1438  %v2e0 = extractelement <4 x i8> %vec2, i64 0
1439  %cv2e0 = zext i8 %v2e0 to i32
1440  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
1441
1442  %v1e1 = extractelement <4 x i8> %vec1, i64 1
1443  %cv1e1 = zext i8 %v1e1 to i32
1444  %v2e1 = extractelement <4 x i8> %vec2, i64 1
1445  %cv2e1 = zext i8 %v2e1 to i32
1446  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
1447
1448  %v1e2 = extractelement <4 x i8> %vec1, i64 2
1449  %cv1e2 = zext i8 %v1e2 to i32
1450  %v2e2 = extractelement <4 x i8> %vec2, i64 2
1451  %cv2e2 = zext i8 %v2e2 to i32
1452  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
1453
1454  %v1e3 = extractelement <4 x i8> %vec1, i64 3
1455  %cv1e3 = zext i8 %v1e3 to i32
1456  %v2e3 = extractelement <4 x i8> %vec2, i64 3
1457  %cv2e3 = zext i8 %v2e3 to i32
1458  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
1459
1460  %acc = load i32, i32 addrspace(1)* %dst, align 4
1461  %add1 = add i32 %mul2, %acc
1462  %add = add i32 %add1, %acc
1463  %add2 = add i32 %add1, %mul1
1464  %add3 = add i32 %add2, %mul3
1465  %add4 = add i32 %add3, %mul4
1466  %res = add i32 %add4, %add
1467  store i32 %res, i32 addrspace(1)* %dst, align 4
1468  ret void
1469}
1470
; Four-lane i8 multiply-accumulate into an i16 accumulator in which lane 0 of
; each input vector is sign-extended while lanes 1-3 are zero-extended.
; For every run line the expected output below is a chain of multiply-add
; instructions (v_mad_u32_u24, v_mad_u16 or v_mad_legacy_u16); no dot4
; instruction appears in any of the check blocks.
; NOTE(review): presumably the mixed sext/zext widening is what has to keep
; this from being folded into a dot4 - confirm against the combine under test.
1471define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
1472; GFX7-LABEL: notdot4_mixedtypes:
1473; GFX7:       ; %bb.0: ; %entry
1474; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1475; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1476; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1477; GFX7-NEXT:    s_mov_b32 s10, 0
1478; GFX7-NEXT:    s_mov_b32 s11, s3
1479; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1480; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1481; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1482; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1483; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1484; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1485; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1486; GFX7-NEXT:    s_mov_b32 s2, -1
1487; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
1488; GFX7-NEXT:    s_mov_b32 s4, 0xffff
1489; GFX7-NEXT:    s_waitcnt vmcnt(2)
1490; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 8
1491; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
1492; GFX7-NEXT:    s_waitcnt vmcnt(1)
1493; GFX7-NEXT:    v_bfe_i32 v6, v0, 0, 8
1494; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
1495; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
1496; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
1497; GFX7-NEXT:    s_waitcnt vmcnt(0)
1498; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
1499; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
1500; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
1501; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
1502; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1503; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
1504; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
1505; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
1506; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
1507; GFX7-NEXT:    s_endpgm
1508;
1509; GFX8-LABEL: notdot4_mixedtypes:
1510; GFX8:       ; %bb.0: ; %entry
1511; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1512; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1513; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1514; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1515; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1516; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1517; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1518; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1519; GFX8-NEXT:    v_mov_b32_e32 v1, s7
1520; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
1521; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1522; GFX8-NEXT:    flat_load_dword v2, v[0:1]
1523; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1524; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1525; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
1526; GFX8-NEXT:    s_movk_i32 s0, 0xff
1527; GFX8-NEXT:    v_mov_b32_e32 v5, s0
1528; GFX8-NEXT:    s_waitcnt vmcnt(2)
1529; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
1530; GFX8-NEXT:    v_and_b32_e32 v8, s0, v8
1531; GFX8-NEXT:    v_bfe_i32 v6, v3, 0, 8
1532; GFX8-NEXT:    v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1533; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
1534; GFX8-NEXT:    s_waitcnt vmcnt(1)
1535; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
1536; GFX8-NEXT:    v_and_b32_e32 v9, s0, v9
1537; GFX8-NEXT:    v_bfe_i32 v7, v2, 0, 8
1538; GFX8-NEXT:    s_waitcnt vmcnt(0)
1539; GFX8-NEXT:    v_mad_u16 v4, v8, v9, v4
1540; GFX8-NEXT:    v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1541; GFX8-NEXT:    v_mad_u16 v4, v6, v7, v4
1542; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1543; GFX8-NEXT:    v_mad_u16 v4, v10, v5, v4
1544; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
1545; GFX8-NEXT:    flat_store_short v[0:1], v2
1546; GFX8-NEXT:    s_endpgm
1547;
1548; GFX9-NODL-LABEL: notdot4_mixedtypes:
1549; GFX9-NODL:       ; %bb.0: ; %entry
1550; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1551; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1552; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1553; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
1554; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1555; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
1556; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
1557; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1558; GFX9-NODL-NEXT:    global_load_ushort v3, v0, s[2:3]
1559; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
1560; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
1561; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
1562; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
1563; GFX9-NODL-NEXT:    v_and_b32_e32 v6, s0, v6
1564; GFX9-NODL-NEXT:    v_and_b32_e32 v7, s0, v7
1565; GFX9-NODL-NEXT:    v_bfe_i32 v4, v1, 0, 8
1566; GFX9-NODL-NEXT:    v_bfe_i32 v5, v2, 0, 8
1567; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1568; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v6, v7, v3
1569; GFX9-NODL-NEXT:    v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1570; GFX9-NODL-NEXT:    v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1571; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
1572; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
1573; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1574; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v8, v9, v3
1575; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
1576; GFX9-NODL-NEXT:    global_store_short v0, v1, s[2:3]
1577; GFX9-NODL-NEXT:    s_endpgm
1578;
1579; GFX9-DL-LABEL: notdot4_mixedtypes:
1580; GFX9-DL:       ; %bb.0: ; %entry
1581; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1582; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1583; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1584; GFX9-DL-NEXT:    s_movk_i32 s0, 0xff
1585; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1586; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1587; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1588; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1589; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
1590; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
1591; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
1592; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1593; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
1594; GFX9-DL-NEXT:    v_and_b32_e32 v6, s0, v6
1595; GFX9-DL-NEXT:    v_and_b32_e32 v7, s0, v7
1596; GFX9-DL-NEXT:    v_bfe_i32 v4, v1, 0, 8
1597; GFX9-DL-NEXT:    v_bfe_i32 v5, v2, 0, 8
1598; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1599; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v6, v7, v3
1600; GFX9-DL-NEXT:    v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1601; GFX9-DL-NEXT:    v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1602; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
1603; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
1604; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1605; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v8, v9, v3
1606; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
1607; GFX9-DL-NEXT:    global_store_short v0, v1, s[2:3]
1608; GFX9-DL-NEXT:    s_endpgm
1609;
1610; GFX10-DL-LABEL: notdot4_mixedtypes:
1611; GFX10-DL:       ; %bb.0: ; %entry
1612; GFX10-DL-NEXT:    s_clause 0x1
1613; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1614; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1615; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1616; GFX10-DL-NEXT:    s_movk_i32 s0, 0xff
1617; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1618; GFX10-DL-NEXT:    s_clause 0x1
1619; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1620; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1621; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
1622; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
1623; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
1624; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
1625; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1626; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
1627; GFX10-DL-NEXT:    v_bfe_i32 v6, v1, 0, 8
1628; GFX10-DL-NEXT:    v_bfe_i32 v7, v2, 0, 8
1629; GFX10-DL-NEXT:    v_and_b32_e32 v4, s0, v4
1630; GFX10-DL-NEXT:    v_and_b32_e32 v5, s0, v5
1631; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1632; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
1633; GFX10-DL-NEXT:    v_and_b32_sdwa v4, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1634; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1635; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
1636; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1637; GFX10-DL-NEXT:    v_mad_u16 v3, v6, v7, v3
1638; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
1639; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
1640; GFX10-DL-NEXT:    global_store_short v0, v1, s[2:3]
1641; GFX10-DL-NEXT:    s_endpgm
1642                                              <4 x i8> addrspace(1)* %src2,
1643                                              i16 addrspace(1)* nocapture %dst) {
1644entry:
  ; Each work-item loads one <4 x i8> from %src1 and one from %src2, both
  ; indexed by its workitem id in x.
1645  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1646  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
1647  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
1648  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
1649  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
1650
  ; Lane 0 is the only lane whose operands are widened with sext.
1651  %v1e0 = extractelement <4 x i8> %vec1, i64 0
1652  %cv1e0 = sext i8 %v1e0 to i16
1653  %v2e0 = extractelement <4 x i8> %vec2, i64 0
1654  %cv2e0 = sext i8 %v2e0 to i16
1655  %mul1 = mul nuw nsw i16 %cv1e0, %cv2e0
1656
  ; Lanes 1, 2 and 3 are all widened with zext.
1657  %v1e1 = extractelement <4 x i8> %vec1, i64 1
1658  %cv1e1 = zext i8 %v1e1 to i16
1659  %v2e1 = extractelement <4 x i8> %vec2, i64 1
1660  %cv2e1 = zext i8 %v2e1 to i16
1661  %mul2 = mul nuw nsw i16 %cv1e1, %cv2e1
1662
1663  %v1e2 = extractelement <4 x i8> %vec1, i64 2
1664  %cv1e2 = zext i8 %v1e2 to i16
1665  %v2e2 = extractelement <4 x i8> %vec2, i64 2
1666  %cv2e2 = zext i8 %v2e2 to i16
1667  %mul3 = mul nuw nsw i16 %cv1e2, %cv2e2
1668
1669  %v1e3 = extractelement <4 x i8> %vec1, i64 3
1670  %cv1e3 = zext i8 %v1e3 to i16
1671  %v2e3 = extractelement <4 x i8> %vec2, i64 3
1672  %cv2e3 = zext i8 %v2e3 to i16
1673  %mul4 = mul nuw nsw i16 %cv1e3, %cv2e3
1674
  ; Accumulate on top of the i16 loaded from %dst. The lane-1 product (%mul2)
  ; is added first, followed by lane 0 (%mul1), lane 2 (%mul3) and lane 3
  ; (%mul4); the sum is stored back to %dst.
1675  %acc = load i16, i16 addrspace(1)* %dst, align 2
1676  %add1 = add i16 %mul2, %acc
1677  %add2 = add i16 %add1, %mul1
1678  %add3 = add i16 %add2, %mul3
1679  %add4 = add i16 %add3, %mul4
1680
1681  store i16 %add4, i16 addrspace(1)* %dst, align 2
1682  ret void
1683}
1684
1685; TODO: cleanup s_lshr_b32 and support this pattern.
; Vector form of the unsigned four-lane i8 multiply-accumulate: both <4 x i8>
; inputs are zero-extended to <4 x i32>, multiplied with a single vector mul,
; and the four product lanes are extracted and summed into the i32 accumulator
; loaded from %dst.
; The check blocks below expect four separate 24-bit multiplies: v_mad_u32_u24
; chains on the first two run lines, and v_mul_u32_u24 followed by v_add3_u32
; on the remaining ones. No dot4 instruction appears in any of them.
1686define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
1687; GFX7-LABEL: udot4_acc32_vecMul:
1688; GFX7:       ; %bb.0: ; %entry
1689; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1690; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1691; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1692; GFX7-NEXT:    s_mov_b32 s10, 0
1693; GFX7-NEXT:    s_mov_b32 s11, s3
1694; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1695; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1696; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1697; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1698; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1699; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1700; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1701; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
1702; GFX7-NEXT:    s_movk_i32 s4, 0xff
1703; GFX7-NEXT:    s_mov_b32 s2, -1
1704; GFX7-NEXT:    s_waitcnt vmcnt(1)
1705; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v2
1706; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
1707; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
1708; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
1709; GFX7-NEXT:    s_waitcnt vmcnt(0)
1710; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
1711; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
1712; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
1713; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
1714; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1715; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, s5
1716; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v6, v0
1717; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v7, v0
1718; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v5, v0
1719; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1720; GFX7-NEXT:    s_endpgm
1721;
1722; GFX8-LABEL: udot4_acc32_vecMul:
1723; GFX8:       ; %bb.0: ; %entry
1724; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1725; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1726; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1727; GFX8-NEXT:    s_movk_i32 s2, 0xff
1728; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1729; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1730; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1731; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1732; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1733; GFX8-NEXT:    v_mov_b32_e32 v1, s7
1734; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
1735; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1736; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1737; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
1738; GFX8-NEXT:    s_waitcnt vmcnt(1)
1739; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 24, v3
1740; GFX8-NEXT:    v_bfe_u32 v4, v3, 16, 8
1741; GFX8-NEXT:    v_lshrrev_b16_e32 v5, 8, v3
1742; GFX8-NEXT:    v_and_b32_e32 v3, s2, v3
1743; GFX8-NEXT:    s_waitcnt vmcnt(0)
1744; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
1745; GFX8-NEXT:    v_bfe_u32 v6, v0, 16, 8
1746; GFX8-NEXT:    v_lshrrev_b16_e32 v7, 8, v0
1747; GFX8-NEXT:    v_and_b32_e32 v0, s2, v0
1748; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1749; GFX8-NEXT:    v_mad_u32_u24 v0, v3, v0, s3
1750; GFX8-NEXT:    v_mad_u32_u24 v0, v5, v7, v0
1751; GFX8-NEXT:    v_mad_u32_u24 v0, v4, v6, v0
1752; GFX8-NEXT:    v_mad_u32_u24 v2, v1, v2, v0
1753; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1754; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1755; GFX8-NEXT:    flat_store_dword v[0:1], v2
1756; GFX8-NEXT:    s_endpgm
1757;
1758; GFX9-NODL-LABEL: udot4_acc32_vecMul:
1759; GFX9-NODL:       ; %bb.0: ; %entry
1760; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1761; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1762; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1763; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1764; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
1765; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
1766; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
1767; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1768; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1769; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1770; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1771; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1772; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1773; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1774; GFX9-NODL-NEXT:    v_add3_u32 v2, v3, s0, v4
1775; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
1776; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
1777; GFX9-NODL-NEXT:    s_endpgm
1778;
1779; GFX9-DL-LABEL: udot4_acc32_vecMul:
1780; GFX9-DL:       ; %bb.0: ; %entry
1781; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1782; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1783; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1784; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1785; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1786; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1787; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
1788; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1789; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1790; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1791; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1792; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1793; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1794; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1795; GFX9-DL-NEXT:    v_add3_u32 v2, v3, s0, v4
1796; GFX9-DL-NEXT:    v_add3_u32 v1, v2, v5, v1
1797; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
1798; GFX9-DL-NEXT:    s_endpgm
1799;
1800; GFX10-DL-LABEL: udot4_acc32_vecMul:
1801; GFX10-DL:       ; %bb.0: ; %entry
1802; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1803; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1804; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1805; GFX10-DL-NEXT:    s_mov_b32 s3, 0xffff
1806; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1807; GFX10-DL-NEXT:    s_clause 0x1
1808; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1809; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1810; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
1811; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1812; GFX10-DL-NEXT:    v_and_b32_sdwa v0, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
1813; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1814; GFX10-DL-NEXT:    v_and_b32_sdwa v3, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
1815; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1816; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v0, v0, v3
1817; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1818; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1819; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
1820; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1821; GFX10-DL-NEXT:    v_add3_u32 v0, v4, s2, v0
1822; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
1823; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
1824; GFX10-DL-NEXT:    s_endpgm
1825                                              <4 x i8> addrspace(1)* %src2,
1826                                              i32 addrspace(1)* nocapture %dst) {
1827entry:
  ; Each work-item loads one <4 x i8> from %src1 and one from %src2, both
  ; indexed by its workitem id in x.
1828  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1829  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
1830  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
1831  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
1832  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
1833
  ; Widen each whole vector with a single zext instead of per-element casts.
1834  %cvec1 = zext <4 x i8> %vec1 to <4 x i32>
1835  %cvec2 = zext <4 x i8> %vec2 to <4 x i32>
1836
  ; One lane-wise vector multiply; the scalar products are then extracted.
1837  %mul = mul <4 x i32> %cvec1, %cvec2
1838  %mul0 = extractelement <4 x i32> %mul, i64 0
1839  %mul1 = extractelement <4 x i32> %mul, i64 1
1840  %mul2 = extractelement <4 x i32> %mul, i64 2
1841  %mul3 = extractelement <4 x i32> %mul, i64 3
1842
  ; Add the products for lanes 0..3, in that order, on top of the i32
  ; accumulator loaded from %dst, then store the sum back to %dst.
1843  %acc = load i32, i32 addrspace(1)* %dst, align 4
1844  %add1 = add i32 %mul0, %acc
1845  %add2 = add i32 %add1, %mul1
1846  %add3 = add i32 %add2, %mul2
1847  %add4 = add i32 %add3, %mul3
1848
1849  store i32 %add4, i32 addrspace(1)* %dst, align 4
1850  ret void
1851}
1852
1853; TODO: This pattern should be recognized.
1854define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
1855; GFX7-LABEL: udot4_acc16_vecMul:
1856; GFX7:       ; %bb.0: ; %entry
1857; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1858; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1859; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1860; GFX7-NEXT:    s_mov_b32 s10, 0
1861; GFX7-NEXT:    s_mov_b32 s11, s3
1862; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1863; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1864; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1865; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1866; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1867; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1868; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1869; GFX7-NEXT:    s_mov_b32 s2, -1
1870; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
1871; GFX7-NEXT:    s_mov_b32 s4, 0xff00
1872; GFX7-NEXT:    s_movk_i32 s5, 0xff
1873; GFX7-NEXT:    s_waitcnt vmcnt(2)
1874; GFX7-NEXT:    v_and_b32_e32 v3, s4, v2
1875; GFX7-NEXT:    v_and_b32_e32 v4, s5, v2
1876; GFX7-NEXT:    s_waitcnt vmcnt(1)
1877; GFX7-NEXT:    v_and_b32_e32 v6, s4, v0
1878; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
1879; GFX7-NEXT:    v_and_b32_e32 v7, s5, v0
1880; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
1881; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
1882; GFX7-NEXT:    v_or_b32_e32 v4, v7, v4
1883; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
1884; GFX7-NEXT:    v_and_b32_e32 v3, s5, v3
1885; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
1886; GFX7-NEXT:    v_and_b32_e32 v4, s5, v4
1887; GFX7-NEXT:    s_waitcnt vmcnt(0)
1888; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
1889; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
1890; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
1891; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v7, v1
1892; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1893; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
1894; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
1895; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
1896; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
1897; GFX7-NEXT:    s_endpgm
1898;
1899; GFX8-LABEL: udot4_acc16_vecMul:
1900; GFX8:       ; %bb.0: ; %entry
1901; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1902; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1903; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1904; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1905; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1906; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1907; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1908; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1909; GFX8-NEXT:    v_mov_b32_e32 v1, s7
1910; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
1911; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1912; GFX8-NEXT:    flat_load_dword v2, v[0:1]
1913; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1914; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1915; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
1916; GFX8-NEXT:    s_movk_i32 s0, 0xff
1917; GFX8-NEXT:    v_mov_b32_e32 v5, s0
1918; GFX8-NEXT:    s_waitcnt vmcnt(2)
1919; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
1920; GFX8-NEXT:    v_lshrrev_b16_e32 v7, 8, v3
1921; GFX8-NEXT:    v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1922; GFX8-NEXT:    v_and_b32_e32 v3, s0, v3
1923; GFX8-NEXT:    s_waitcnt vmcnt(1)
1924; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
1925; GFX8-NEXT:    v_lshrrev_b16_e32 v9, 8, v2
1926; GFX8-NEXT:    v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1927; GFX8-NEXT:    v_and_b32_e32 v2, s0, v2
1928; GFX8-NEXT:    s_waitcnt vmcnt(0)
1929; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
1930; GFX8-NEXT:    v_mad_u16 v2, v7, v9, v2
1931; GFX8-NEXT:    v_mad_u16 v2, v10, v5, v2
1932; GFX8-NEXT:    v_mad_u16 v2, v6, v8, v2
1933; GFX8-NEXT:    flat_store_short v[0:1], v2
1934; GFX8-NEXT:    s_endpgm
1935;
1936; GFX9-NODL-LABEL: udot4_acc16_vecMul:
1937; GFX9-NODL:       ; %bb.0: ; %entry
1938; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1939; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1940; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1941; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
1942; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, 0xffff
1943; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1944; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
1945; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
1946; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1947; GFX9-NODL-NEXT:    global_load_ushort v3, v0, s[2:3]
1948; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
1949; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v5, 8, v1
1950; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
1951; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
1952; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v7, 8, v2
1953; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
1954; GFX9-NODL-NEXT:    v_and_b32_sdwa v9, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1955; GFX9-NODL-NEXT:    v_and_b32_sdwa v10, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1956; GFX9-NODL-NEXT:    v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1957; GFX9-NODL-NEXT:    v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1958; GFX9-NODL-NEXT:    v_lshl_or_b32 v2, v7, 16, v2
1959; GFX9-NODL-NEXT:    v_lshl_or_b32 v1, v5, 16, v1
1960; GFX9-NODL-NEXT:    v_and_b32_e32 v10, v4, v10
1961; GFX9-NODL-NEXT:    v_and_b32_e32 v4, v4, v9
1962; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
1963; GFX9-NODL-NEXT:    v_lshl_or_b32 v5, v8, 16, v10
1964; GFX9-NODL-NEXT:    v_lshl_or_b32 v4, v6, 16, v4
1965; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1966; GFX9-NODL-NEXT:    v_add_u16_e32 v3, v1, v3
1967; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v2, v4, v5
1968; GFX9-NODL-NEXT:    v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1969; GFX9-NODL-NEXT:    v_add_u16_e32 v1, v1, v2
1970; GFX9-NODL-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1971; GFX9-NODL-NEXT:    global_store_short v0, v1, s[2:3]
1972; GFX9-NODL-NEXT:    s_endpgm
1973;
1974; GFX9-DL-LABEL: udot4_acc16_vecMul:
1975; GFX9-DL:       ; %bb.0: ; %entry
1976; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1977; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1978; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1979; GFX9-DL-NEXT:    s_movk_i32 s0, 0xff
1980; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 0xffff
1981; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1982; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1983; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1984; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1985; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
1986; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
1987; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v5, 8, v1
1988; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
1989; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1990; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v7, 8, v2
1991; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
1992; GFX9-DL-NEXT:    v_and_b32_sdwa v9, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1993; GFX9-DL-NEXT:    v_and_b32_sdwa v10, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1994; GFX9-DL-NEXT:    v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1995; GFX9-DL-NEXT:    v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1996; GFX9-DL-NEXT:    v_lshl_or_b32 v2, v7, 16, v2
1997; GFX9-DL-NEXT:    v_lshl_or_b32 v1, v5, 16, v1
1998; GFX9-DL-NEXT:    v_and_b32_e32 v10, v4, v10
1999; GFX9-DL-NEXT:    v_and_b32_e32 v4, v4, v9
2000; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
2001; GFX9-DL-NEXT:    v_lshl_or_b32 v5, v8, 16, v10
2002; GFX9-DL-NEXT:    v_lshl_or_b32 v4, v6, 16, v4
2003; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2004; GFX9-DL-NEXT:    v_add_u16_e32 v3, v1, v3
2005; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v4, v5
2006; GFX9-DL-NEXT:    v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2007; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v2
2008; GFX9-DL-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2009; GFX9-DL-NEXT:    global_store_short v0, v1, s[2:3]
2010; GFX9-DL-NEXT:    s_endpgm
2011;
2012; GFX10-DL-LABEL: udot4_acc16_vecMul:
2013; GFX10-DL:       ; %bb.0: ; %entry
2014; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2015; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2016; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2017; GFX10-DL-NEXT:    v_mov_b32_e32 v4, 0xffff
2018; GFX10-DL-NEXT:    s_movk_i32 s2, 0xff
2019; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2020; GFX10-DL-NEXT:    s_clause 0x1
2021; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2022; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2023; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
2024; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[0:1]
2025; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
2026; GFX10-DL-NEXT:    v_lshrrev_b16 v5, 8, v1
2027; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2028; GFX10-DL-NEXT:    v_lshrrev_b16 v6, 8, v2
2029; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
2030; GFX10-DL-NEXT:    v_and_b32_sdwa v8, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
2031; GFX10-DL-NEXT:    v_and_b32_sdwa v9, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2032; GFX10-DL-NEXT:    v_and_b32_sdwa v10, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2033; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
2034; GFX10-DL-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
2035; GFX10-DL-NEXT:    v_lshl_or_b32 v5, v5, 16, v8
2036; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
2037; GFX10-DL-NEXT:    v_and_b32_e32 v7, v4, v10
2038; GFX10-DL-NEXT:    v_and_b32_e32 v4, v4, v9
2039; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v5, v5, v6
2040; GFX10-DL-NEXT:    v_lshl_or_b32 v2, v2, 16, v7
2041; GFX10-DL-NEXT:    v_lshl_or_b32 v1, v1, 16, v4
2042; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v5
2043; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2044; GFX10-DL-NEXT:    v_add_nc_u16 v3, v5, v3
2045; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
2046; GFX10-DL-NEXT:    v_add_nc_u16 v2, v3, v4
2047; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2048; GFX10-DL-NEXT:    v_add_nc_u16 v1, v2, v1
2049; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v3
2050; GFX10-DL-NEXT:    global_store_short v0, v1, s[0:1]
2051; GFX10-DL-NEXT:    s_endpgm
2052                                              <4 x i8> addrspace(1)* %src2,
2053                                              i16 addrspace(1)* nocapture %dst) {
2054entry:
2055  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2056  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
2057  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
2058  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
2059  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
2060
2061  %cvec1 = zext <4 x i8> %vec1 to <4 x i16>
2062  %cvec2 = zext <4 x i8> %vec2 to <4 x i16>
2063
2064  %mul = mul <4 x i16> %cvec1, %cvec2
2065  %mul0 = extractelement <4 x i16> %mul, i64 0
2066  %mul1 = extractelement <4 x i16> %mul, i64 1
2067  %mul2 = extractelement <4 x i16> %mul, i64 2
2068  %mul3 = extractelement <4 x i16> %mul, i64 3
2069
2070  %acc = load i16, i16 addrspace(1)* %dst, align 4
2071  %add1 = add i16 %mul0, %acc
2072  %add2 = add i16 %add1, %mul1
2073  %add3 = add i16 %add2, %mul2
2074  %add4 = add i16 %add3, %mul3
2075
2076  store i16 %add4, i16 addrspace(1)* %dst, align 4
2077  ret void
2078}
2079
2080; TODO: Support this pattern.
; Kernel under test: each workitem loads one <4 x i8> from %src1 and one from
; %src2, multiplies them as a single <4 x i8> vector operation, and adds the
; four i8 products to the i8 value loaded from %dst, storing the i8 sum back to
; %dst. In the autogenerated checks below every checked target lowers this to
; separate 16-bit multiplies/mads and adds; no dot-product instruction appears,
; which is presumably what the TODO above refers to.
2081define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
2082; GFX7-LABEL: udot4_acc8_vecMul:
2083; GFX7:       ; %bb.0: ; %entry
2084; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2085; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2086; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2087; GFX7-NEXT:    s_mov_b32 s10, 0
2088; GFX7-NEXT:    s_mov_b32 s11, s3
2089; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2090; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
2091; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2092; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2093; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2094; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
2095; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2096; GFX7-NEXT:    s_mov_b32 s2, -1
2097; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
2098; GFX7-NEXT:    s_movk_i32 s4, 0xff
2099; GFX7-NEXT:    s_waitcnt vmcnt(2)
2100; GFX7-NEXT:    v_and_b32_e32 v4, s4, v2
2101; GFX7-NEXT:    v_bfe_u32 v5, v2, 8, 8
2102; GFX7-NEXT:    s_waitcnt vmcnt(1)
2103; GFX7-NEXT:    v_and_b32_e32 v7, s4, v0
2104; GFX7-NEXT:    v_bfe_u32 v8, v0, 8, 8
2105; GFX7-NEXT:    s_waitcnt vmcnt(0)
2106; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
2107; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
2108; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
2109; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
2110; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
2111; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
2112; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
2113; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v6, v0
2114; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2115; GFX7-NEXT:    s_endpgm
2116;
2117; GFX8-LABEL: udot4_acc8_vecMul:
2118; GFX8:       ; %bb.0: ; %entry
2119; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2120; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2121; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2122; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2123; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2124; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2125; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2126; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2127; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2128; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
2129; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2130; GFX8-NEXT:    flat_load_dword v2, v[0:1]
2131; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2132; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2133; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
2134; GFX8-NEXT:    s_waitcnt vmcnt(2)
2135; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
2136; GFX8-NEXT:    s_waitcnt vmcnt(1)
2137; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
2138; GFX8-NEXT:    v_mul_lo_u16_sdwa v7, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
2139; GFX8-NEXT:    v_mul_lo_u16_e32 v9, v5, v6
2140; GFX8-NEXT:    v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2141; GFX8-NEXT:    v_mul_lo_u16_sdwa v8, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2142; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
2143; GFX8-NEXT:    v_or_b32_e32 v8, v8, v9
2144; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v8
2145; GFX8-NEXT:    s_waitcnt vmcnt(0)
2146; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
2147; GFX8-NEXT:    v_add_u16_e32 v2, v2, v8
2148; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v7
2149; GFX8-NEXT:    v_mad_u16 v2, v5, v6, v2
2150; GFX8-NEXT:    v_add_u16_e32 v2, v2, v7
2151; GFX8-NEXT:    flat_store_byte v[0:1], v2
2152; GFX8-NEXT:    s_endpgm
2153;
2154; GFX9-NODL-LABEL: udot4_acc8_vecMul:
2155; GFX9-NODL:       ; %bb.0: ; %entry
2156; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2157; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2158; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2159; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2160; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
2161; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
2162; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2163; GFX9-NODL-NEXT:    global_load_ubyte v3, v0, s[2:3]
2164; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
2165; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
2166; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
2167; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
2168; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
2169; GFX9-NODL-NEXT:    v_mul_lo_u16_e32 v8, v4, v5
2170; GFX9-NODL-NEXT:    v_or_b32_sdwa v6, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2171; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2172; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
2173; GFX9-NODL-NEXT:    v_or_b32_e32 v7, v7, v8
2174; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v7
2175; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2176; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
2177; GFX9-NODL-NEXT:    v_add_u16_e32 v1, v1, v7
2178; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v6
2179; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
2180; GFX9-NODL-NEXT:    v_add_u16_e32 v1, v1, v6
2181; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[2:3]
2182; GFX9-NODL-NEXT:    s_endpgm
2183;
2184; GFX9-DL-LABEL: udot4_acc8_vecMul:
2185; GFX9-DL:       ; %bb.0: ; %entry
2186; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2187; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2188; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2189; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2190; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2191; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2192; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2193; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
2194; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
2195; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
2196; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2197; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
2198; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
2199; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v8, v4, v5
2200; GFX9-DL-NEXT:    v_or_b32_sdwa v6, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2201; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2202; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
2203; GFX9-DL-NEXT:    v_or_b32_e32 v7, v7, v8
2204; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v7
2205; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2206; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
2207; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v7
2208; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v6
2209; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
2210; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v6
2211; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
2212; GFX9-DL-NEXT:    s_endpgm
2213;
2214; GFX10-DL-LABEL: udot4_acc8_vecMul:
2215; GFX10-DL:       ; %bb.0: ; %entry
2216; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2217; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2218; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2219; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2220; GFX10-DL-NEXT:    s_clause 0x1
2221; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2222; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2223; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
2224; GFX10-DL-NEXT:    global_load_ubyte v3, v0, s[0:1]
2225; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
2226; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
2227; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2228; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v2
2229; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
2230; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
2231; GFX10-DL-NEXT:    v_lshrrev_b16 v8, 8, v2
2232; GFX10-DL-NEXT:    v_mul_lo_u16 v4, v4, v5
2233; GFX10-DL-NEXT:    v_lshrrev_b16 v5, 8, v1
2234; GFX10-DL-NEXT:    v_mul_lo_u16 v9, v6, v7
2235; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2236; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
2237; GFX10-DL-NEXT:    v_lshlrev_b16 v4, 8, v4
2238; GFX10-DL-NEXT:    v_mul_lo_u16 v5, v5, v8
2239; GFX10-DL-NEXT:    v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2240; GFX10-DL-NEXT:    v_lshlrev_b16 v5, 8, v5
2241; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
2242; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 8, v4
2243; GFX10-DL-NEXT:    v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2244; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
2245; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v5
2246; GFX10-DL-NEXT:    v_mad_u16 v1, v6, v7, v1
2247; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v2
2248; GFX10-DL-NEXT:    global_store_byte v0, v1, s[0:1]
2249; GFX10-DL-NEXT:    s_endpgm
2250                                             <4 x i8> addrspace(1)* %src2,
2251                                             i8 addrspace(1)* nocapture %dst) {
2252entry:
; The workitem id selects which <4 x i8> element of each source array is loaded.
2253  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2254  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
2255  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
2256  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
2257  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
2258
; The multiply is done directly on <4 x i8> (no extension to a wider type), so
; each product is an i8; the four lanes are then extracted individually.
2259  %mul = mul <4 x i8> %vec1, %vec2
2260  %mul0 = extractelement <4 x i8> %mul, i64 0
2261  %mul1 = extractelement <4 x i8> %mul, i64 1
2262  %mul2 = extractelement <4 x i8> %mul, i64 2
2263  %mul3 = extractelement <4 x i8> %mul, i64 3
2264
; Chain of i8 adds: accumulator first, then the products in lane order 0..3.
2265  %acc = load i8, i8 addrspace(1)* %dst, align 4
2266  %add1 = add i8 %mul0, %acc
2267  %add2 = add i8 %add1, %mul1
2268  %add3 = add i8 %add2, %mul2
2269  %add4 = add i8 %add3, %mul3
2270
; The result overwrites the accumulator location it was loaded from.
2271  store i8 %add4, i8 addrspace(1)* %dst, align 4
2272  ret void
2273}
2274
; Declaration of the intrinsic the kernels in this file call to obtain the i32
; index (%idx) used for their per-workitem loads; presumably the workitem id
; along the X dimension, going by the intrinsic name.
2275declare i32 @llvm.amdgcn.workitem.id.x()
2276