1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-NODL %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
8
; udot4_acc32 - unsigned 4 x i8 dot product accumulated into an i32.
; Each workitem loads one <4 x i8> element from %src1 and one from %src2
; (indexed by workitem id x), zero-extends every byte to i32, multiplies the
; matching lanes and adds the four products to the i32 loaded from %dst; the
; sum is stored back to %dst.
; In the assertions below only the GFX9-DL and GFX10-DL outputs contain
; v_dot4_u32_u8; the other prefixes show a chain of v_mad_u32_u24 or SDWA
; multiplies followed by v_add3_u32.
9define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
10; GFX7-LABEL: udot4_acc32:
11; GFX7:       ; %bb.0: ; %entry
12; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
13; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
14; GFX7-NEXT:    s_mov_b32 s3, 0xf000
15; GFX7-NEXT:    s_mov_b32 s10, 0
16; GFX7-NEXT:    s_mov_b32 s11, s3
17; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
18; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
19; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
20; GFX7-NEXT:    v_mov_b32_e32 v1, 0
21; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
22; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
23; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
24; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
25; GFX7-NEXT:    s_mov_b32 s2, -1
26; GFX7-NEXT:    s_waitcnt vmcnt(1)
27; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
28; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
29; GFX7-NEXT:    s_waitcnt vmcnt(0)
30; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v0
31; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
32; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
33; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, s4
34; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
35; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
36; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
37; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
38; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
39; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
40; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
41; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
42; GFX7-NEXT:    s_endpgm
43;
44; GFX8-LABEL: udot4_acc32:
45; GFX8:       ; %bb.0: ; %entry
46; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
47; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
48; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
49; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
50; GFX8-NEXT:    v_mov_b32_e32 v1, s5
51; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
52; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
53; GFX8-NEXT:    flat_load_dword v3, v[0:1]
54; GFX8-NEXT:    v_mov_b32_e32 v1, s7
55; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
56; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
57; GFX8-NEXT:    flat_load_dword v0, v[0:1]
58; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
59; GFX8-NEXT:    s_waitcnt vmcnt(1)
60; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
61; GFX8-NEXT:    v_bfe_u32 v4, v3, 8, 8
62; GFX8-NEXT:    v_bfe_u32 v6, v3, 16, 8
63; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
64; GFX8-NEXT:    s_waitcnt vmcnt(0)
65; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v0
66; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
67; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, s2
69; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 8
70; GFX8-NEXT:    v_mad_u32_u24 v1, v4, v5, v1
71; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
72; GFX8-NEXT:    v_mad_u32_u24 v1, v6, v7, v1
73; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
74; GFX8-NEXT:    v_mov_b32_e32 v0, s0
75; GFX8-NEXT:    v_mov_b32_e32 v1, s1
76; GFX8-NEXT:    flat_store_dword v[0:1], v2
77; GFX8-NEXT:    s_endpgm
78;
79; GFX9-NODL-LABEL: udot4_acc32:
80; GFX9-NODL:       ; %bb.0: ; %entry
81; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
82; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
83; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
84; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
85; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
86; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
87; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
88; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
89; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
90; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
91; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
92; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
93; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
94; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
95; GFX9-NODL-NEXT:    v_add3_u32 v2, v3, s0, v4
96; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
97; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
98; GFX9-NODL-NEXT:    s_endpgm
99;
100; GFX9-DL-LABEL: udot4_acc32:
101; GFX9-DL:       ; %bb.0: ; %entry
102; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
103; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
104; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
105; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
106; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
107; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
108; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
109; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
110; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
111; GFX9-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, s0
112; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
113; GFX9-DL-NEXT:    s_endpgm
114;
115; GFX10-DL-LABEL: udot4_acc32:
116; GFX10-DL:       ; %bb.0: ; %entry
117; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
118; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
119; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
120; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
121; GFX10-DL-NEXT:    s_clause 0x1
122; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
123; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
124; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
125; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
126; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
127; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, s2
128; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
129; GFX10-DL-NEXT:    s_endpgm
130                                       <4 x i8> addrspace(1)* %src2,
131                                       i32 addrspace(1)* nocapture %dst) {
132entry:
  ; The workitem id x selects the <4 x i8> element read from each source array.
133  %idx = call i32 @llvm.amdgcn.workitem.id.x()
134  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
135  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
136  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
137  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
138
  ; Lane 0 - zero-extend one byte from each vector to i32 and multiply.
139  %v1e0 = extractelement <4 x i8> %vec1, i64 0
140  %cv1e0 = zext i8 %v1e0 to i32
141  %v2e0 = extractelement <4 x i8> %vec2, i64 0
142  %cv2e0 = zext i8 %v2e0 to i32
143  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
144
  ; Lane 1 - same pattern.
145  %v1e1 = extractelement <4 x i8> %vec1, i64 1
146  %cv1e1 = zext i8 %v1e1 to i32
147  %v2e1 = extractelement <4 x i8> %vec2, i64 1
148  %cv2e1 = zext i8 %v2e1 to i32
149  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
150
  ; Lane 2 - same pattern.
151  %v1e2 = extractelement <4 x i8> %vec1, i64 2
152  %cv1e2 = zext i8 %v1e2 to i32
153  %v2e2 = extractelement <4 x i8> %vec2, i64 2
154  %cv2e2 = zext i8 %v2e2 to i32
155  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
156
  ; Lane 3 - same pattern.
157  %v1e3 = extractelement <4 x i8> %vec1, i64 3
158  %cv1e3 = zext i8 %v1e3 to i32
159  %v2e3 = extractelement <4 x i8> %vec2, i64 3
160  %cv2e3 = zext i8 %v2e3 to i32
161  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
162
  ; Add the four products, in lane order, to the i32 currently stored at %dst.
163  %acc = load i32, i32 addrspace(1)* %dst, align 4
164  %mad1 = add i32 %mul1, %acc
165  %mad2 = add i32 %mad1, %mul2
166  %mad3 = add i32 %mad2, %mul3
167  %mad4 = add i32 %mad3, %mul4
168
169  store i32 %mad4, i32 addrspace(1)* %dst, align 4
170  ret void
171}
172
; udot4_acc16 - unsigned 4 x i8 dot product accumulated into an i16.
; Same shape as the i32 variant, but every byte is zero-extended to i16, the
; products and the running sum are i16, and %dst points to an i16.
; None of the assertions below contain a v_dot4 instruction; each prefix shows
; per-byte extraction followed by a chain of v_mad_u32_u24, v_mad_u16 or
; v_mad_legacy_u16.
173define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
174; GFX7-LABEL: udot4_acc16:
175; GFX7:       ; %bb.0: ; %entry
176; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
177; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
178; GFX7-NEXT:    s_mov_b32 s3, 0xf000
179; GFX7-NEXT:    s_mov_b32 s10, 0
180; GFX7-NEXT:    s_mov_b32 s11, s3
181; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
182; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
183; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
184; GFX7-NEXT:    v_mov_b32_e32 v1, 0
185; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
186; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
187; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
188; GFX7-NEXT:    s_mov_b32 s2, -1
189; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
190; GFX7-NEXT:    s_waitcnt vmcnt(2)
191; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
192; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
193; GFX7-NEXT:    s_waitcnt vmcnt(1)
194; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v0
195; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
196; GFX7-NEXT:    s_waitcnt vmcnt(0)
197; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
198; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
199; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
200; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
201; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
202; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
203; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
204; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
205; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
206; GFX7-NEXT:    s_endpgm
207;
208; GFX8-LABEL: udot4_acc16:
209; GFX8:       ; %bb.0: ; %entry
210; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
211; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
212; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
213; GFX8-NEXT:    v_mov_b32_e32 v5, 0xff
214; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
215; GFX8-NEXT:    v_mov_b32_e32 v1, s5
216; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
217; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
218; GFX8-NEXT:    flat_load_dword v3, v[0:1]
219; GFX8-NEXT:    v_mov_b32_e32 v1, s7
220; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
221; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
222; GFX8-NEXT:    flat_load_dword v2, v[0:1]
223; GFX8-NEXT:    v_mov_b32_e32 v0, s0
224; GFX8-NEXT:    v_mov_b32_e32 v1, s1
225; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
226; GFX8-NEXT:    s_waitcnt vmcnt(2)
227; GFX8-NEXT:    v_and_b32_e32 v6, 0xff, v3
228; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
229; GFX8-NEXT:    v_and_b32_e32 v8, 0xff, v8
230; GFX8-NEXT:    v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
231; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
232; GFX8-NEXT:    s_waitcnt vmcnt(1)
233; GFX8-NEXT:    v_and_b32_e32 v7, 0xff, v2
234; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
235; GFX8-NEXT:    v_and_b32_e32 v9, 0xff, v9
236; GFX8-NEXT:    s_waitcnt vmcnt(0)
237; GFX8-NEXT:    v_mad_u16 v4, v6, v7, v4
238; GFX8-NEXT:    v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
239; GFX8-NEXT:    v_mad_u16 v4, v8, v9, v4
240; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
241; GFX8-NEXT:    v_mad_u16 v4, v10, v5, v4
242; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
243; GFX8-NEXT:    flat_store_short v[0:1], v2
244; GFX8-NEXT:    s_endpgm
245;
246; GFX9-NODL-LABEL: udot4_acc16:
247; GFX9-NODL:       ; %bb.0: ; %entry
248; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
249; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
250; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
251; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
252; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
253; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
254; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
255; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
256; GFX9-NODL-NEXT:    global_load_ushort v3, v0, s[2:3]
257; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
258; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v1
259; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
260; GFX9-NODL-NEXT:    v_and_b32_e32 v5, 0xff, v2
261; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
262; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
263; GFX9-NODL-NEXT:    v_and_b32_e32 v6, 0xff, v6
264; GFX9-NODL-NEXT:    v_and_b32_e32 v7, 0xff, v7
265; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
266; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
267; GFX9-NODL-NEXT:    v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
268; GFX9-NODL-NEXT:    v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
269; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v6, v7, v3
270; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
271; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
272; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v8, v9, v3
273; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
274; GFX9-NODL-NEXT:    global_store_short v0, v1, s[2:3]
275; GFX9-NODL-NEXT:    s_endpgm
276;
277; GFX9-DL-LABEL: udot4_acc16:
278; GFX9-DL:       ; %bb.0: ; %entry
279; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
280; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
281; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
282; GFX9-DL-NEXT:    s_movk_i32 s0, 0xff
283; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
284; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
285; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
286; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
287; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
288; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
289; GFX9-DL-NEXT:    v_and_b32_e32 v4, 0xff, v1
290; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
291; GFX9-DL-NEXT:    v_and_b32_e32 v5, 0xff, v2
292; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
293; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
294; GFX9-DL-NEXT:    v_and_b32_e32 v6, 0xff, v6
295; GFX9-DL-NEXT:    v_and_b32_e32 v7, 0xff, v7
296; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
297; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
298; GFX9-DL-NEXT:    v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
299; GFX9-DL-NEXT:    v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
300; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v6, v7, v3
301; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
302; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
303; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v8, v9, v3
304; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
305; GFX9-DL-NEXT:    global_store_short v0, v1, s[2:3]
306; GFX9-DL-NEXT:    s_endpgm
307;
308; GFX10-DL-LABEL: udot4_acc16:
309; GFX10-DL:       ; %bb.0: ; %entry
310; GFX10-DL-NEXT:    s_clause 0x1
311; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
312; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
313; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
314; GFX10-DL-NEXT:    v_mov_b32_e32 v8, 0xff
315; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
316; GFX10-DL-NEXT:    s_clause 0x1
317; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
318; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
319; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
320; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
321; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
322; GFX10-DL-NEXT:    v_and_b32_e32 v4, 0xff, v1
323; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
324; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
325; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
326; GFX10-DL-NEXT:    v_and_b32_e32 v7, 0xff, v2
327; GFX10-DL-NEXT:    v_and_b32_e32 v5, 0xff, v5
328; GFX10-DL-NEXT:    v_and_b32_e32 v6, 0xff, v6
329; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
330; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
331; GFX10-DL-NEXT:    v_and_b32_sdwa v4, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
332; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
333; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
334; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
335; GFX10-DL-NEXT:    v_mad_u16 v3, v5, v6, v3
336; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
337; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
338; GFX10-DL-NEXT:    global_store_short v0, v1, s[2:3]
339; GFX10-DL-NEXT:    s_endpgm
340                                       <4 x i8> addrspace(1)* %src2,
341                                       i16 addrspace(1)* nocapture %dst) {
342entry:
  ; The workitem id x selects the <4 x i8> element read from each source array.
343  %idx = call i32 @llvm.amdgcn.workitem.id.x()
344  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
345  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
346  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
347  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
348
  ; Lane 0 - zero-extend one byte from each vector to i16 and multiply.
349  %v1e0 = extractelement <4 x i8> %vec1, i64 0
350  %cv1e0 = zext i8 %v1e0 to i16
351  %v2e0 = extractelement <4 x i8> %vec2, i64 0
352  %cv2e0 = zext i8 %v2e0 to i16
353  %mul1 = mul nuw nsw i16 %cv1e0, %cv2e0
354
  ; Lane 1 - same pattern.
355  %v1e1 = extractelement <4 x i8> %vec1, i64 1
356  %cv1e1 = zext i8 %v1e1 to i16
357  %v2e1 = extractelement <4 x i8> %vec2, i64 1
358  %cv2e1 = zext i8 %v2e1 to i16
359  %mul2 = mul nuw nsw i16 %cv1e1, %cv2e1
360
  ; Lane 2 - same pattern.
361  %v1e2 = extractelement <4 x i8> %vec1, i64 2
362  %cv1e2 = zext i8 %v1e2 to i16
363  %v2e2 = extractelement <4 x i8> %vec2, i64 2
364  %cv2e2 = zext i8 %v2e2 to i16
365  %mul3 = mul nuw nsw i16 %cv1e2, %cv2e2
366
  ; Lane 3 - same pattern.
367  %v1e3 = extractelement <4 x i8> %vec1, i64 3
368  %cv1e3 = zext i8 %v1e3 to i16
369  %v2e3 = extractelement <4 x i8> %vec2, i64 3
370  %cv2e3 = zext i8 %v2e3 to i16
371  %mul4 = mul nuw nsw i16 %cv1e3, %cv2e3
372
  ; Add the four products, in lane order, to the i16 currently stored at %dst.
373  %acc = load i16, i16 addrspace(1)* %dst, align 2
374  %mad1 = add i16 %mul1, %acc
375  %mad2 = add i16 %mad1, %mul2
376  %mad3 = add i16 %mad2, %mul3
377  %mad4 = add i16 %mad3, %mul4
378
379  store i16 %mad4, i16 addrspace(1)* %dst, align 2
380  ret void
381}
382
; udot4_acc8 - 4 x i8 dot product accumulated into an i8.
; The bytes are multiplied directly as i8 (no zext), and the products and the
; running sum stay i8; %dst points to an i8.
; None of the assertions below contain a v_dot4 instruction; each prefix shows
; a chain of v_mad_u32_u24, v_mad_u16 or v_mad_legacy_u16 ending in a byte
; store.
383define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
384; GFX7-LABEL: udot4_acc8:
385; GFX7:       ; %bb.0: ; %entry
386; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
387; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
388; GFX7-NEXT:    s_mov_b32 s3, 0xf000
389; GFX7-NEXT:    s_mov_b32 s10, 0
390; GFX7-NEXT:    s_mov_b32 s11, s3
391; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
392; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
393; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
394; GFX7-NEXT:    v_mov_b32_e32 v1, 0
395; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
396; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
397; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
398; GFX7-NEXT:    s_mov_b32 s2, -1
399; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
400; GFX7-NEXT:    s_waitcnt vmcnt(2)
401; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
402; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
403; GFX7-NEXT:    s_waitcnt vmcnt(1)
404; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v0
405; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
406; GFX7-NEXT:    s_waitcnt vmcnt(0)
407; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
408; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
409; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
410; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
411; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
412; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
413; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
414; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
415; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
416; GFX7-NEXT:    s_endpgm
417;
418; GFX8-LABEL: udot4_acc8:
419; GFX8:       ; %bb.0: ; %entry
420; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
421; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
422; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
423; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
424; GFX8-NEXT:    v_mov_b32_e32 v1, s5
425; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
426; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
427; GFX8-NEXT:    flat_load_dword v3, v[0:1]
428; GFX8-NEXT:    v_mov_b32_e32 v1, s7
429; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
430; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
431; GFX8-NEXT:    flat_load_dword v2, v[0:1]
432; GFX8-NEXT:    v_mov_b32_e32 v0, s0
433; GFX8-NEXT:    v_mov_b32_e32 v1, s1
434; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
435; GFX8-NEXT:    s_waitcnt vmcnt(2)
436; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
437; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
438; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
439; GFX8-NEXT:    s_waitcnt vmcnt(1)
440; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
441; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
442; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
443; GFX8-NEXT:    s_waitcnt vmcnt(0)
444; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
445; GFX8-NEXT:    v_mad_u16 v2, v7, v8, v2
446; GFX8-NEXT:    v_mad_u16 v2, v5, v6, v2
447; GFX8-NEXT:    v_mad_u16 v2, v9, v10, v2
448; GFX8-NEXT:    flat_store_byte v[0:1], v2
449; GFX8-NEXT:    s_endpgm
450;
451; GFX9-NODL-LABEL: udot4_acc8:
452; GFX9-NODL:       ; %bb.0: ; %entry
453; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
454; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
455; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
456; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
457; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
458; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
459; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
460; GFX9-NODL-NEXT:    global_load_ubyte v3, v0, s[2:3]
461; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
462; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
463; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
464; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
465; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
466; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
467; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
468; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
469; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
470; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v6, v7, v1
471; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
472; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
473; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v8, v9, v1
474; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[2:3]
475; GFX9-NODL-NEXT:    s_endpgm
476;
477; GFX9-DL-LABEL: udot4_acc8:
478; GFX9-DL:       ; %bb.0: ; %entry
479; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
480; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
481; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
482; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
483; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
484; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
485; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
486; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
487; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
488; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
489; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
490; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
491; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
492; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
493; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
494; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
495; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
496; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v6, v7, v1
497; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
498; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
499; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v9, v1
500; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
501; GFX9-DL-NEXT:    s_endpgm
502;
503; GFX10-DL-LABEL: udot4_acc8:
504; GFX10-DL:       ; %bb.0: ; %entry
505; GFX10-DL-NEXT:    s_clause 0x1
506; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
507; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
508; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
509; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
510; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
511; GFX10-DL-NEXT:    s_clause 0x1
512; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
513; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
514; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
515; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
516; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
517; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
518; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
519; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
520; GFX10-DL-NEXT:    v_mad_u16 v4, v2, v3, v4
521; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
522; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
523; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
524; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
525; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
526; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
527; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
528; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
529; GFX10-DL-NEXT:    s_endpgm
530                                      <4 x i8> addrspace(1)* %src2,
531                                      i8 addrspace(1)* nocapture %dst) {
532entry:
  ; The workitem id x selects the <4 x i8> element read from each source array.
533  %idx = call i32 @llvm.amdgcn.workitem.id.x()
534  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
535  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
536  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
537  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
538
  ; Lane 0 - multiply the two bytes directly as i8 (no widening).
539  %v1e0 = extractelement <4 x i8> %vec1, i64 0
540  %v2e0 = extractelement <4 x i8> %vec2, i64 0
541  %mul1 = mul nuw nsw i8 %v1e0, %v2e0
542
  ; Lane 1 - same pattern.
543  %v1e1 = extractelement <4 x i8> %vec1, i64 1
544  %v2e1 = extractelement <4 x i8> %vec2, i64 1
545  %mul2 = mul nuw nsw i8 %v1e1, %v2e1
546
  ; Lane 2 - same pattern.
547  %v1e2 = extractelement <4 x i8> %vec1, i64 2
548  %v2e2 = extractelement <4 x i8> %vec2, i64 2
549  %mul3 = mul nuw nsw i8 %v1e2, %v2e2
550
  ; Lane 3 - same pattern.
551  %v1e3 = extractelement <4 x i8> %vec1, i64 3
552  %v2e3 = extractelement <4 x i8> %vec2, i64 3
553  %mul4 = mul nuw nsw i8 %v1e3, %v2e3
554
  ; Add the four products, in lane order, to the i8 currently stored at %dst.
  ; NOTE(review): align 2 on an i8 load/store looks carried over from the i16
  ; variant - confirm it is intentional before changing it.
555  %acc = load i8, i8 addrspace(1)* %dst, align 2
556  %mad1 = add i8 %mul1, %acc
557  %mad2 = add i8 %mad1, %mul2
558  %mad3 = add i8 %mad2, %mul3
559  %mad4 = add i8 %mad3, %mul4
560
561  store i8 %mad4, i8 addrspace(1)* %dst, align 2
562  ret void
563}
564
565; TODO: Only byte lanes 0 and 1 are multiplied here; could udot4 still be generated?
; udot2_8 - two-lane i8 dot product accumulated into an i8.
; Only elements 0 and 1 of each loaded <4 x i8> are used; they are multiplied
; as i8 and both products are added to the i8 loaded from %dst, which is then
; stored back.
; None of the assertions below contain a dot instruction; each prefix shows
; two multiply-add instructions followed by a byte store.
566define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1,
567; GFX7-LABEL: udot2_8:
568; GFX7:       ; %bb.0: ; %entry
569; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
570; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
571; GFX7-NEXT:    s_mov_b32 s3, 0xf000
572; GFX7-NEXT:    s_mov_b32 s10, 0
573; GFX7-NEXT:    s_mov_b32 s11, s3
574; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
575; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
576; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
577; GFX7-NEXT:    v_mov_b32_e32 v1, 0
578; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
579; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
580; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
581; GFX7-NEXT:    s_mov_b32 s2, -1
582; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
583; GFX7-NEXT:    s_waitcnt vmcnt(2)
584; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
585; GFX7-NEXT:    v_bfe_u32 v2, v2, 8, 8
586; GFX7-NEXT:    s_waitcnt vmcnt(1)
587; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v0
588; GFX7-NEXT:    v_bfe_u32 v0, v0, 8, 8
589; GFX7-NEXT:    s_waitcnt vmcnt(0)
590; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
591; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
592; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
593; GFX7-NEXT:    s_endpgm
594;
595; GFX8-LABEL: udot2_8:
596; GFX8:       ; %bb.0: ; %entry
597; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
598; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
599; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
600; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
601; GFX8-NEXT:    v_mov_b32_e32 v1, s5
602; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
603; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
604; GFX8-NEXT:    flat_load_dword v3, v[0:1]
605; GFX8-NEXT:    v_mov_b32_e32 v1, s7
606; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
607; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
608; GFX8-NEXT:    flat_load_dword v2, v[0:1]
609; GFX8-NEXT:    v_mov_b32_e32 v0, s0
610; GFX8-NEXT:    v_mov_b32_e32 v1, s1
611; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
612; GFX8-NEXT:    s_waitcnt vmcnt(2)
613; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
614; GFX8-NEXT:    s_waitcnt vmcnt(1)
615; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v2
616; GFX8-NEXT:    s_waitcnt vmcnt(0)
617; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
618; GFX8-NEXT:    v_mad_u16 v2, v5, v6, v2
619; GFX8-NEXT:    flat_store_byte v[0:1], v2
620; GFX8-NEXT:    s_endpgm
621;
622; GFX9-NODL-LABEL: udot2_8:
623; GFX9-NODL:       ; %bb.0: ; %entry
624; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
625; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
626; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
627; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, 0
628; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
629; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[4:5]
630; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[6:7]
631; GFX9-NODL-NEXT:    global_load_ubyte v4, v1, s[2:3]
632; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
633; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
634; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
635; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
636; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
637; GFX9-NODL-NEXT:    v_mad_legacy_u16 v2, v2, v3, v4
638; GFX9-NODL-NEXT:    v_mad_legacy_u16 v0, v0, v5, v2
639; GFX9-NODL-NEXT:    global_store_byte v1, v0, s[2:3]
640; GFX9-NODL-NEXT:    s_endpgm
641;
642; GFX9-DL-LABEL: udot2_8:
643; GFX9-DL:       ; %bb.0: ; %entry
644; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
645; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
646; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
647; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
648; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
649; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
650; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
651; GFX9-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
652; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
653; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
654; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
655; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
656; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
657; GFX9-DL-NEXT:    v_mad_legacy_u16 v2, v2, v3, v4
658; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v0, v5, v2
659; GFX9-DL-NEXT:    global_store_byte v1, v0, s[2:3]
660; GFX9-DL-NEXT:    s_endpgm
661;
662; GFX10-DL-LABEL: udot2_8:
663; GFX10-DL:       ; %bb.0: ; %entry
664; GFX10-DL-NEXT:    s_clause 0x1
665; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
666; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
667; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
668; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
669; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
670; GFX10-DL-NEXT:    s_clause 0x1
671; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
672; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
673; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
674; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
675; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
676; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
677; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
678; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
679; GFX10-DL-NEXT:    v_mad_u16 v2, v2, v3, v4
680; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v2
681; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
682; GFX10-DL-NEXT:    s_endpgm
683                                   <4 x i8> addrspace(1)* %src2,
684                                   i8 addrspace(1)* nocapture %dst) {
685entry:
  ; The workitem id x selects the <4 x i8> element read from each source array.
686  %idx = call i32 @llvm.amdgcn.workitem.id.x()
687  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
688  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
689  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
690  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
691
  ; Lane 0 - multiply the two bytes directly as i8 (no widening).
692  %v1e0 = extractelement <4 x i8> %vec1, i64 0
693  %v2e0 = extractelement <4 x i8> %vec2, i64 0
694  %mul1 = mul nuw nsw i8 %v1e0, %v2e0
695
  ; Lane 1 - same pattern; elements 2 and 3 of both vectors are never read.
696  %v1e1 = extractelement <4 x i8> %vec1, i64 1
697  %v2e1 = extractelement <4 x i8> %vec2, i64 1
698  %mul2 = mul nuw nsw i8 %v1e1, %v2e1
699
  ; Add both products to the i8 currently stored at %dst and write it back.
  ; NOTE(review): align 2 on an i8 load/store looks carried over from an i16
  ; variant - confirm it is intentional before changing it.
700  %acc = load i8, i8 addrspace(1)* %dst, align 2
701  %mad1 = add i8 %mul1, %acc
702  %mad2 = add i8 %mad1, %mul2
703  store i8 %mad2, i8 addrspace(1)* %dst, align 2
704  ret void
705}
706
; udot4_CommutationInsideMAD: 4-element i8 multiply-accumulate pattern in which
; every multiply names its %vec2 element first and its %vec1 element second,
; and the accumulating adds mix operand order (%acc + %mul1 first, then
; %mulN + previous sum). The i8 accumulator is loaded from %dst and the final
; sum is stored back to %dst.
; In the autogenerated checks below, every prefix expects four multiply-add
; instructions (v_mad_u32_u24 for the GFX7 prefix; v_mad_u16 or
; v_mad_legacy_u16 for the other prefixes) and no dot instruction.
707define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %src1,
708; GFX7-LABEL: udot4_CommutationInsideMAD:
709; GFX7:       ; %bb.0: ; %entry
710; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
711; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
712; GFX7-NEXT:    s_mov_b32 s3, 0xf000
713; GFX7-NEXT:    s_mov_b32 s10, 0
714; GFX7-NEXT:    s_mov_b32 s11, s3
715; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
716; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
717; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
718; GFX7-NEXT:    v_mov_b32_e32 v1, 0
719; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
720; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
721; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
722; GFX7-NEXT:    s_mov_b32 s2, -1
723; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
724; GFX7-NEXT:    s_waitcnt vmcnt(2)
725; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
726; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
727; GFX7-NEXT:    s_waitcnt vmcnt(1)
728; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v0
729; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
730; GFX7-NEXT:    s_waitcnt vmcnt(0)
731; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v3, v1
732; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
733; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
734; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v4, v1
735; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
736; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
737; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v5, v1
738; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
739; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
740; GFX7-NEXT:    s_endpgm
741;
742; GFX8-LABEL: udot4_CommutationInsideMAD:
743; GFX8:       ; %bb.0: ; %entry
744; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
745; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
746; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
747; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
748; GFX8-NEXT:    v_mov_b32_e32 v1, s5
749; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
750; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
751; GFX8-NEXT:    flat_load_dword v3, v[0:1]
752; GFX8-NEXT:    v_mov_b32_e32 v1, s7
753; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
754; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
755; GFX8-NEXT:    flat_load_dword v2, v[0:1]
756; GFX8-NEXT:    v_mov_b32_e32 v0, s0
757; GFX8-NEXT:    v_mov_b32_e32 v1, s1
758; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
759; GFX8-NEXT:    s_waitcnt vmcnt(2)
760; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
761; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
762; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
763; GFX8-NEXT:    s_waitcnt vmcnt(1)
764; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
765; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
766; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
767; GFX8-NEXT:    s_waitcnt vmcnt(0)
768; GFX8-NEXT:    v_mad_u16 v2, v2, v3, v4
769; GFX8-NEXT:    v_mad_u16 v2, v8, v7, v2
770; GFX8-NEXT:    v_mad_u16 v2, v6, v5, v2
771; GFX8-NEXT:    v_mad_u16 v2, v10, v9, v2
772; GFX8-NEXT:    flat_store_byte v[0:1], v2
773; GFX8-NEXT:    s_endpgm
774;
775; GFX9-NODL-LABEL: udot4_CommutationInsideMAD:
776; GFX9-NODL:       ; %bb.0: ; %entry
777; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
778; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
779; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
780; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
781; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
782; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
783; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
784; GFX9-NODL-NEXT:    global_load_ubyte v3, v0, s[2:3]
785; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
786; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
787; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
788; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
789; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
790; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
791; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
792; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v2, v1, v3
793; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
794; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v7, v6, v1
795; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
796; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v5, v4, v1
797; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v9, v8, v1
798; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[2:3]
799; GFX9-NODL-NEXT:    s_endpgm
800;
801; GFX9-DL-LABEL: udot4_CommutationInsideMAD:
802; GFX9-DL:       ; %bb.0: ; %entry
803; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
804; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
805; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
806; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
807; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
808; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
809; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
810; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
811; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
812; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
813; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
814; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
815; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
816; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
817; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
818; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v2, v1, v3
819; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
820; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v7, v6, v1
821; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
822; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v4, v1
823; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v9, v8, v1
824; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
825; GFX9-DL-NEXT:    s_endpgm
826;
827; GFX10-DL-LABEL: udot4_CommutationInsideMAD:
828; GFX10-DL:       ; %bb.0: ; %entry
829; GFX10-DL-NEXT:    s_clause 0x1
830; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
831; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
832; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
833; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
834; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
835; GFX10-DL-NEXT:    s_clause 0x1
836; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
837; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
838; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
839; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
840; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
841; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
842; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
843; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
844; GFX10-DL-NEXT:    v_mad_u16 v4, v3, v2, v4
845; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
846; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
847; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
848; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
849; GFX10-DL-NEXT:    v_mad_u16 v0, v5, v0, v4
850; GFX10-DL-NEXT:    v_mad_u16 v0, v7, v6, v0
851; GFX10-DL-NEXT:    v_mad_u16 v0, v3, v2, v0
852; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
853; GFX10-DL-NEXT:    s_endpgm
854                                                      <4 x i8> addrspace(1)* %src2,
855                                                      i8 addrspace(1)* nocapture %dst) {
856entry:
  ; Index both source arrays by the workitem's x id and load one <4 x i8> from each.
857  %idx = call i32 @llvm.amdgcn.workitem.id.x()
858  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
859  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
860  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
861  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
862
  ; Element-wise i8 products; each mul takes the %vec2 element as its first operand.
863  %v1e0 = extractelement <4 x i8> %vec1, i64 0
864  %v2e0 = extractelement <4 x i8> %vec2, i64 0
865  %mul1 = mul nuw nsw i8 %v2e0, %v1e0
866
867  %v1e1 = extractelement <4 x i8> %vec1, i64 1
868  %v2e1 = extractelement <4 x i8> %vec2, i64 1
869  %mul2 = mul nuw nsw i8 %v2e1, %v1e1
870
871  %v1e2 = extractelement <4 x i8> %vec1, i64 2
872  %v2e2 = extractelement <4 x i8> %vec2, i64 2
873  %mul3 = mul nuw nsw i8 %v2e2, %v1e2
874
875  %v1e3 = extractelement <4 x i8> %vec1, i64 3
876  %v2e3 = extractelement <4 x i8> %vec2, i64 3
877  %mul4 = mul nuw nsw i8 %v2e3, %v1e3
878
  ; Accumulate onto the i8 currently stored at %dst. The first add has %acc on
  ; the left; the remaining adds have the product on the left.
879  %acc = load i8, i8 addrspace(1)* %dst, align 2
880  %mad1 = add i8 %acc, %mul1
881  %mad2 = add i8 %mul2, %mad1
882  %mad3 = add i8 %mul3, %mad2
883  %mad4 = add i8 %mul4, %mad3
884
885  store i8 %mad4, i8 addrspace(1)* %dst, align 2
886  ret void
887}
888
889; TODO: Support commutation across the adds.
; udot4_CommutationAccrossMADs: 4-element i8 multiply-accumulate pattern whose
; accumulation chain starts with the element-1 product (%mul2) and only then
; adds the element-0 product (%mul1), followed by %mul3 and %mul4. The i8
; accumulator is loaded from %dst and the final sum is stored back to %dst.
; In the autogenerated checks below, every prefix expects four multiply-add
; instructions and no dot instruction; the first multiply-add of each sequence
; consumes the byte-1 operands (bit-field extract at offset 8, or shift right
; by 8).
890define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %src1,
891; GFX7-LABEL: udot4_CommutationAccrossMADs:
892; GFX7:       ; %bb.0: ; %entry
893; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
894; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
895; GFX7-NEXT:    s_mov_b32 s3, 0xf000
896; GFX7-NEXT:    s_mov_b32 s10, 0
897; GFX7-NEXT:    s_mov_b32 s11, s3
898; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
899; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
900; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
901; GFX7-NEXT:    v_mov_b32_e32 v1, 0
902; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
903; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
904; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
905; GFX7-NEXT:    s_mov_b32 s2, -1
906; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
907; GFX7-NEXT:    s_waitcnt vmcnt(2)
908; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
909; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v2
910; GFX7-NEXT:    s_waitcnt vmcnt(1)
911; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
912; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v0
913; GFX7-NEXT:    s_waitcnt vmcnt(0)
914; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v4, v1
915; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
916; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
917; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v3, v1
918; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
919; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
920; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v5, v1
921; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
922; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
923; GFX7-NEXT:    s_endpgm
924;
925; GFX8-LABEL: udot4_CommutationAccrossMADs:
926; GFX8:       ; %bb.0: ; %entry
927; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
928; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
929; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
930; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
931; GFX8-NEXT:    v_mov_b32_e32 v1, s5
932; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
933; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
934; GFX8-NEXT:    flat_load_dword v3, v[0:1]
935; GFX8-NEXT:    v_mov_b32_e32 v1, s7
936; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
937; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
938; GFX8-NEXT:    flat_load_dword v2, v[0:1]
939; GFX8-NEXT:    v_mov_b32_e32 v0, s0
940; GFX8-NEXT:    v_mov_b32_e32 v1, s1
941; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
942; GFX8-NEXT:    s_waitcnt vmcnt(2)
943; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
944; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
945; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 24, v3
946; GFX8-NEXT:    s_waitcnt vmcnt(1)
947; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
948; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
949; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 24, v2
950; GFX8-NEXT:    s_waitcnt vmcnt(0)
951; GFX8-NEXT:    v_mad_u16 v4, v8, v7, v4
952; GFX8-NEXT:    v_mad_u16 v2, v2, v3, v4
953; GFX8-NEXT:    v_mad_u16 v2, v6, v5, v2
954; GFX8-NEXT:    v_mad_u16 v2, v10, v9, v2
955; GFX8-NEXT:    flat_store_byte v[0:1], v2
956; GFX8-NEXT:    s_endpgm
957;
958; GFX9-NODL-LABEL: udot4_CommutationAccrossMADs:
959; GFX9-NODL:       ; %bb.0: ; %entry
960; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
961; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
962; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
963; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
964; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
965; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
966; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
967; GFX9-NODL-NEXT:    global_load_ubyte v3, v0, s[2:3]
968; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
969; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
970; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
971; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
972; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
973; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
974; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v7, v6, v3
975; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
976; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
977; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v2, v1, v3
978; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
979; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v5, v4, v1
980; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v9, v8, v1
981; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[2:3]
982; GFX9-NODL-NEXT:    s_endpgm
983;
984; GFX9-DL-LABEL: udot4_CommutationAccrossMADs:
985; GFX9-DL:       ; %bb.0: ; %entry
986; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
987; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
988; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
989; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
990; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
991; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
992; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
993; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
994; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
995; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
996; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
997; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
998; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
999; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1000; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v7, v6, v3
1001; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
1002; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
1003; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v2, v1, v3
1004; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 24, v2
1005; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v4, v1
1006; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v9, v8, v1
1007; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
1008; GFX9-DL-NEXT:    s_endpgm
1009;
1010; GFX10-DL-LABEL: udot4_CommutationAccrossMADs:
1011; GFX10-DL:       ; %bb.0: ; %entry
1012; GFX10-DL-NEXT:    s_clause 0x1
1013; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1014; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1015; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1016; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
1017; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1018; GFX10-DL-NEXT:    s_clause 0x1
1019; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
1020; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
1021; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[2:3]
1022; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
1023; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
1024; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1025; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
1026; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1027; GFX10-DL-NEXT:    v_mad_u16 v0, v5, v0, v4
1028; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
1029; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
1030; GFX10-DL-NEXT:    v_mad_u16 v0, v3, v2, v0
1031; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1032; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
1033; GFX10-DL-NEXT:    v_mad_u16 v0, v5, v4, v0
1034; GFX10-DL-NEXT:    v_mad_u16 v0, v3, v2, v0
1035; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
1036; GFX10-DL-NEXT:    s_endpgm
1037                                                        <4 x i8> addrspace(1)* %src2,
1038                                                        i8 addrspace(1)* nocapture %dst) {
1039entry:
  ; Index both source arrays by the workitem's x id and load one <4 x i8> from each.
1040  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1041  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
1042  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
1043  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
1044  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
1045
  ; Element-wise i8 products; each mul takes the %vec2 element as its first operand.
1046  %v1e0 = extractelement <4 x i8> %vec1, i64 0
1047  %v2e0 = extractelement <4 x i8> %vec2, i64 0
1048  %mul1 = mul nuw nsw i8 %v2e0, %v1e0
1049
1050  %v1e1 = extractelement <4 x i8> %vec1, i64 1
1051  %v2e1 = extractelement <4 x i8> %vec2, i64 1
1052  %mul2 = mul nuw nsw i8 %v2e1, %v1e1
1053
1054  %v1e2 = extractelement <4 x i8> %vec1, i64 2
1055  %v2e2 = extractelement <4 x i8> %vec2, i64 2
1056  %mul3 = mul nuw nsw i8 %v2e2, %v1e2
1057
1058  %v1e3 = extractelement <4 x i8> %vec1, i64 3
1059  %v2e3 = extractelement <4 x i8> %vec2, i64 3
1060  %mul4 = mul nuw nsw i8 %v2e3, %v1e3
1061
  ; Accumulate onto the i8 currently stored at %dst. The products are added
  ; out of element order: %mul2 first, then %mul1, %mul3, %mul4.
1062  %acc = load i8, i8 addrspace(1)* %dst, align 2
1063  %mad1 = add i8 %acc, %mul2
1064  %mad2 = add i8 %mad1, %mul1
1065  %mad3 = add i8 %mad2, %mul3
1066  %mad4 = add i8 %mad3, %mul4
1067
1068  store i8 %mad4, i8 addrspace(1)* %dst, align 2
1069  ret void
1070}
1071
; udot4_multiuse_mul1: 4-element multiply-accumulate pattern with i8 elements
; zero-extended to i32 and an i32 accumulator at %dst, in which the element-0
; product %mul1 has two uses (%add and %add2) and therefore enters the final
; sum twice.
; In the autogenerated checks below, no prefix expects a dot instruction: the
; GFX7 and GFX8 prefixes expect five v_mad_u32_u24, while the GFX9 and GFX10
; prefixes expect v_mul_u32_u24 (plain and SDWA forms), one v_mad_u32_u24 and
; two v_add3_u32.
1072define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
1073; GFX7-LABEL: udot4_multiuse_mul1:
1074; GFX7:       ; %bb.0: ; %entry
1075; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1076; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1077; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1078; GFX7-NEXT:    s_mov_b32 s10, 0
1079; GFX7-NEXT:    s_mov_b32 s11, s3
1080; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1081; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1082; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1083; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1084; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1085; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1086; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1087; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
1088; GFX7-NEXT:    s_mov_b32 s2, -1
1089; GFX7-NEXT:    s_waitcnt vmcnt(1)
1090; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
1091; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
1092; GFX7-NEXT:    s_waitcnt vmcnt(0)
1093; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v0
1094; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
1095; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1096; GFX7-NEXT:    v_mad_u32_u24 v8, v1, v5, s4
1097; GFX7-NEXT:    v_mad_u32_u24 v3, v3, v6, v8
1098; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
1099; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
1100; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, v3
1101; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1102; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
1103; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
1104; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
1105; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1106; GFX7-NEXT:    s_endpgm
1107;
1108; GFX8-LABEL: udot4_multiuse_mul1:
1109; GFX8:       ; %bb.0: ; %entry
1110; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1111; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1112; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1113; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1114; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1115; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1116; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1117; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1118; GFX8-NEXT:    v_mov_b32_e32 v1, s7
1119; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
1120; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1121; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1122; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
1123; GFX8-NEXT:    s_waitcnt vmcnt(1)
1124; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
1125; GFX8-NEXT:    v_bfe_u32 v4, v3, 8, 8
1126; GFX8-NEXT:    v_bfe_u32 v6, v3, 16, 8
1127; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
1128; GFX8-NEXT:    s_waitcnt vmcnt(0)
1129; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v0
1130; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
1131; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1132; GFX8-NEXT:    v_mad_u32_u24 v8, v1, v2, s2
1133; GFX8-NEXT:    v_mad_u32_u24 v4, v4, v5, v8
1134; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 8
1135; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, v4
1136; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
1137; GFX8-NEXT:    v_mad_u32_u24 v1, v6, v7, v1
1138; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v0, v1
1139; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1140; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1141; GFX8-NEXT:    flat_store_dword v[0:1], v2
1142; GFX8-NEXT:    s_endpgm
1143;
1144; GFX9-NODL-LABEL: udot4_multiuse_mul1:
1145; GFX9-NODL:       ; %bb.0: ; %entry
1146; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1147; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1148; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1149; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1150; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
1151; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
1152; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
1153; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1154; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
1155; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xff, v1
1156; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1157; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xff, v2
1158; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1159; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1160; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1161; GFX9-NODL-NEXT:    v_mul_u32_u24_e32 v2, v3, v4
1162; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1163; GFX9-NODL-NEXT:    v_mad_u32_u24 v3, v3, v4, s0
1164; GFX9-NODL-NEXT:    v_add3_u32 v2, v5, v3, v2
1165; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v6, v1
1166; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
1167; GFX9-NODL-NEXT:    s_endpgm
1168;
1169; GFX9-DL-LABEL: udot4_multiuse_mul1:
1170; GFX9-DL:       ; %bb.0: ; %entry
1171; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1172; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1173; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1174; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1175; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1176; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1177; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
1178; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1179; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1180; GFX9-DL-NEXT:    v_and_b32_e32 v3, 0xff, v1
1181; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1182; GFX9-DL-NEXT:    v_and_b32_e32 v4, 0xff, v2
1183; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1184; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1185; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1186; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v2, v3, v4
1187; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1188; GFX9-DL-NEXT:    v_mad_u32_u24 v3, v3, v4, s0
1189; GFX9-DL-NEXT:    v_add3_u32 v2, v5, v3, v2
1190; GFX9-DL-NEXT:    v_add3_u32 v1, v2, v6, v1
1191; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
1192; GFX9-DL-NEXT:    s_endpgm
1193;
1194; GFX10-DL-LABEL: udot4_multiuse_mul1:
1195; GFX10-DL:       ; %bb.0: ; %entry
1196; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1197; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1198; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1199; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1200; GFX10-DL-NEXT:    s_clause 0x1
1201; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1202; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1203; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
1204; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1205; GFX10-DL-NEXT:    v_and_b32_e32 v0, 0xff, v1
1206; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1207; GFX10-DL-NEXT:    v_and_b32_e32 v3, 0xff, v2
1208; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1209; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v5, v0, v3
1210; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1211; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v0, v3, s2
1212; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1213; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1214; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
1215; GFX10-DL-NEXT:    v_add3_u32 v0, v4, v0, v5
1216; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
1217; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
1218; GFX10-DL-NEXT:    s_endpgm
1219                                               <4 x i8> addrspace(1)* %src2,
1220                                               i32 addrspace(1)* nocapture %dst) {
1221entry:
  ; Index both source arrays by the workitem's x id and load one <4 x i8> from each.
1222  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1223  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
1224  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
1225  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
1226  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
1227
  ; For each of the four lanes: zero-extend both i8 elements to i32 and multiply.
1228  %v1e0 = extractelement <4 x i8> %vec1, i64 0
1229  %cv1e0 = zext i8 %v1e0 to i32
1230  %v2e0 = extractelement <4 x i8> %vec2, i64 0
1231  %cv2e0 = zext i8 %v2e0 to i32
1232  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
1233
1234  %v1e1 = extractelement <4 x i8> %vec1, i64 1
1235  %cv1e1 = zext i8 %v1e1 to i32
1236  %v2e1 = extractelement <4 x i8> %vec2, i64 1
1237  %cv2e1 = zext i8 %v2e1 to i32
1238  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
1239
1240  %v1e2 = extractelement <4 x i8> %vec1, i64 2
1241  %cv1e2 = zext i8 %v1e2 to i32
1242  %v2e2 = extractelement <4 x i8> %vec2, i64 2
1243  %cv2e2 = zext i8 %v2e2 to i32
1244  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
1245
1246  %v1e3 = extractelement <4 x i8> %vec1, i64 3
1247  %cv1e3 = zext i8 %v1e3 to i32
1248  %v2e3 = extractelement <4 x i8> %vec2, i64 3
1249  %cv2e3 = zext i8 %v2e3 to i32
1250  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
1251
  ; Accumulate onto the i32 currently stored at %dst. %mul1 is used by both
  ; %add and %add2, which is the multi-use this test is named for.
1252  %acc = load i32, i32 addrspace(1)* %dst, align 4
1253  %add = add i32 %mul1, %acc
1254  %add1 = add i32 %mul2, %add
1255  %add2 = add i32 %add1, %mul1
1256  %add3 = add i32 %add2, %mul3
1257  %add4 = add i32 %add3, %mul4
1258
1259  store i32 %add4, i32 addrspace(1)* %dst, align 4
1260  ret void
1261}
1262
1263define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
1264; GFX7-LABEL: udot4_multiuse_add1:
1265; GFX7:       ; %bb.0: ; %entry
1266; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1267; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1268; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1269; GFX7-NEXT:    s_mov_b32 s10, 0
1270; GFX7-NEXT:    s_mov_b32 s11, s3
1271; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1272; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1273; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1274; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1275; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1276; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1277; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1278; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
1279; GFX7-NEXT:    s_mov_b32 s2, -1
1280; GFX7-NEXT:    s_waitcnt vmcnt(1)
1281; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
1282; GFX7-NEXT:    v_and_b32_e32 v1, 0xff, v2
1283; GFX7-NEXT:    s_waitcnt vmcnt(0)
1284; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
1285; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v0
1286; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1287; GFX7-NEXT:    v_mad_u32_u24 v3, v3, v6, s4
1288; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
1289; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
1290; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v5, v3
1291; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1292; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
1293; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
1294; GFX7-NEXT:    v_add_i32_e32 v6, vcc, s4, v3
1295; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
1296; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
1297; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1298; GFX7-NEXT:    s_endpgm
1299;
1300; GFX8-LABEL: udot4_multiuse_add1:
1301; GFX8:       ; %bb.0: ; %entry
1302; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1303; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1304; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1305; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1306; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1307; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1308; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1309; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1310; GFX8-NEXT:    v_mov_b32_e32 v1, s7
1311; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
1312; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1313; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1314; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
1315; GFX8-NEXT:    s_waitcnt vmcnt(1)
1316; GFX8-NEXT:    v_bfe_u32 v4, v3, 8, 8
1317; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v3
1318; GFX8-NEXT:    v_bfe_u32 v6, v3, 16, 8
1319; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
1320; GFX8-NEXT:    s_waitcnt vmcnt(0)
1321; GFX8-NEXT:    v_bfe_u32 v5, v0, 8, 8
1322; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v0
1323; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1324; GFX8-NEXT:    v_mad_u32_u24 v4, v4, v5, s2
1325; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 8
1326; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v2, v4
1327; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
1328; GFX8-NEXT:    v_mad_u32_u24 v1, v6, v7, v1
1329; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s2, v4
1330; GFX8-NEXT:    v_mad_u32_u24 v0, v3, v0, v1
1331; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v0, v5
1332; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1333; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1334; GFX8-NEXT:    flat_store_dword v[0:1], v2
1335; GFX8-NEXT:    s_endpgm
1336;
1337; GFX9-NODL-LABEL: udot4_multiuse_add1:
1338; GFX9-NODL:       ; %bb.0: ; %entry
1339; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1340; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1341; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1342; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1343; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
1344; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
1345; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
1346; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1347; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
1348; GFX9-NODL-NEXT:    v_bfe_u32 v4, v1, 8, 8
1349; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1350; GFX9-NODL-NEXT:    v_bfe_u32 v5, v2, 8, 8
1351; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1352; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1353; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1354; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1355; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, v4, v5, s0
1356; GFX9-NODL-NEXT:    v_add_u32_e32 v4, s0, v2
1357; GFX9-NODL-NEXT:    v_add3_u32 v2, v2, v3, v6
1358; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v1, v4
1359; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
1360; GFX9-NODL-NEXT:    s_endpgm
1361;
1362; GFX9-DL-LABEL: udot4_multiuse_add1:
1363; GFX9-DL:       ; %bb.0: ; %entry
1364; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1365; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1366; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1367; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1368; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1369; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1370; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
1371; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1372; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1373; GFX9-DL-NEXT:    v_bfe_u32 v4, v1, 8, 8
1374; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1375; GFX9-DL-NEXT:    v_bfe_u32 v5, v2, 8, 8
1376; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1377; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1378; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1379; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1380; GFX9-DL-NEXT:    v_mad_u32_u24 v2, v4, v5, s0
1381; GFX9-DL-NEXT:    v_add_u32_e32 v4, s0, v2
1382; GFX9-DL-NEXT:    v_add3_u32 v2, v2, v3, v6
1383; GFX9-DL-NEXT:    v_add3_u32 v1, v2, v1, v4
1384; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
1385; GFX9-DL-NEXT:    s_endpgm
1386;
1387; GFX10-DL-LABEL: udot4_multiuse_add1:
1388; GFX10-DL:       ; %bb.0: ; %entry
1389; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1390; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1391; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1392; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1393; GFX10-DL-NEXT:    s_clause 0x1
1394; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1395; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1396; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
1397; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1398; GFX10-DL-NEXT:    v_bfe_u32 v0, v1, 8, 8
1399; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1400; GFX10-DL-NEXT:    v_bfe_u32 v3, v2, 8, 8
1401; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1402; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1403; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v0, v3, s2
1404; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1405; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1406; GFX10-DL-NEXT:    v_add_nc_u32_e32 v2, s2, v0
1407; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v4, v3
1408; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
1409; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v1, v2
1410; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
1411; GFX10-DL-NEXT:    s_endpgm
1412                                               <4 x i8> addrspace(1)* %src2,
1413                                               i32 addrspace(1)* nocapture %dst) {
1414entry:
1415  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1416  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
1417  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
1418  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
1419  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
1420
1421  %v1e0 = extractelement <4 x i8> %vec1, i64 0
1422  %cv1e0 = zext i8 %v1e0 to i32
1423  %v2e0 = extractelement <4 x i8> %vec2, i64 0
1424  %cv2e0 = zext i8 %v2e0 to i32
1425  %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
1426
1427  %v1e1 = extractelement <4 x i8> %vec1, i64 1
1428  %cv1e1 = zext i8 %v1e1 to i32
1429  %v2e1 = extractelement <4 x i8> %vec2, i64 1
1430  %cv2e1 = zext i8 %v2e1 to i32
1431  %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
1432
1433  %v1e2 = extractelement <4 x i8> %vec1, i64 2
1434  %cv1e2 = zext i8 %v1e2 to i32
1435  %v2e2 = extractelement <4 x i8> %vec2, i64 2
1436  %cv2e2 = zext i8 %v2e2 to i32
1437  %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
1438
1439  %v1e3 = extractelement <4 x i8> %vec1, i64 3
1440  %cv1e3 = zext i8 %v1e3 to i32
1441  %v2e3 = extractelement <4 x i8> %vec2, i64 3
1442  %cv2e3 = zext i8 %v2e3 to i32
1443  %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
1444
1445  %acc = load i32, i32 addrspace(1)* %dst, align 4
1446  %add1 = add i32 %mul2, %acc
1447  %add = add i32 %add1, %acc
1448  %add2 = add i32 %add1, %mul1
1449  %add3 = add i32 %add2, %mul3
1450  %add4 = add i32 %add3, %mul4
1451  %res = add i32 %add4, %add
1452  store i32 %res, i32 addrspace(1)* %dst, align 4
1453  ret void
1454}
1455
; Mixed-extension case: lane 0 of each <4 x i8> input is sign-extended to i16
; while lanes 1-3 are zero-extended. The four per-lane products are added to an
; i16 accumulator that is loaded from and stored back to %dst. None of the
; expected outputs below contain a dot4 instruction.
1456define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
1457; GFX7-LABEL: notdot4_mixedtypes:
1458; GFX7:       ; %bb.0: ; %entry
1459; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1460; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1461; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1462; GFX7-NEXT:    s_mov_b32 s10, 0
1463; GFX7-NEXT:    s_mov_b32 s11, s3
1464; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1465; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1466; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1467; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1468; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1469; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1470; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1471; GFX7-NEXT:    s_mov_b32 s2, -1
1472; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
1473; GFX7-NEXT:    s_waitcnt vmcnt(2)
1474; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 8
1475; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
1476; GFX7-NEXT:    s_waitcnt vmcnt(1)
1477; GFX7-NEXT:    v_bfe_i32 v6, v0, 0, 8
1478; GFX7-NEXT:    v_bfe_u32 v7, v0, 8, 8
1479; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
1480; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v6
1481; GFX7-NEXT:    s_waitcnt vmcnt(0)
1482; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
1483; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
1484; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
1485; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v6, v1
1486; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1487; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
1488; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
1489; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
1490; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
1491; GFX7-NEXT:    s_endpgm
1492;
1493; GFX8-LABEL: notdot4_mixedtypes:
1494; GFX8:       ; %bb.0: ; %entry
1495; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1496; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1497; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1498; GFX8-NEXT:    v_mov_b32_e32 v5, 0xff
1499; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1500; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1501; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1502; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1503; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1504; GFX8-NEXT:    v_mov_b32_e32 v1, s7
1505; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
1506; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1507; GFX8-NEXT:    flat_load_dword v2, v[0:1]
1508; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1509; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1510; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
1511; GFX8-NEXT:    s_waitcnt vmcnt(2)
1512; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
1513; GFX8-NEXT:    v_and_b32_e32 v8, 0xff, v8
1514; GFX8-NEXT:    v_bfe_i32 v6, v3, 0, 8
1515; GFX8-NEXT:    v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1516; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
1517; GFX8-NEXT:    s_waitcnt vmcnt(1)
1518; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
1519; GFX8-NEXT:    v_and_b32_e32 v9, 0xff, v9
1520; GFX8-NEXT:    v_bfe_i32 v7, v2, 0, 8
1521; GFX8-NEXT:    s_waitcnt vmcnt(0)
1522; GFX8-NEXT:    v_mad_u16 v4, v8, v9, v4
1523; GFX8-NEXT:    v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1524; GFX8-NEXT:    v_mad_u16 v4, v6, v7, v4
1525; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1526; GFX8-NEXT:    v_mad_u16 v4, v10, v5, v4
1527; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
1528; GFX8-NEXT:    flat_store_short v[0:1], v2
1529; GFX8-NEXT:    s_endpgm
1530;
1531; GFX9-NODL-LABEL: notdot4_mixedtypes:
1532; GFX9-NODL:       ; %bb.0: ; %entry
1533; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1534; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1535; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1536; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
1537; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1538; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
1539; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
1540; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1541; GFX9-NODL-NEXT:    global_load_ushort v3, v0, s[2:3]
1542; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
1543; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
1544; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
1545; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
1546; GFX9-NODL-NEXT:    v_and_b32_e32 v6, 0xff, v6
1547; GFX9-NODL-NEXT:    v_and_b32_e32 v7, 0xff, v7
1548; GFX9-NODL-NEXT:    v_bfe_i32 v4, v1, 0, 8
1549; GFX9-NODL-NEXT:    v_bfe_i32 v5, v2, 0, 8
1550; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1551; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v6, v7, v3
1552; GFX9-NODL-NEXT:    v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1553; GFX9-NODL-NEXT:    v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1554; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
1555; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
1556; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1557; GFX9-NODL-NEXT:    v_mad_legacy_u16 v3, v8, v9, v3
1558; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
1559; GFX9-NODL-NEXT:    global_store_short v0, v1, s[2:3]
1560; GFX9-NODL-NEXT:    s_endpgm
1561;
1562; GFX9-DL-LABEL: notdot4_mixedtypes:
1563; GFX9-DL:       ; %bb.0: ; %entry
1564; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1565; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1566; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1567; GFX9-DL-NEXT:    s_movk_i32 s0, 0xff
1568; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1569; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1570; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1571; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1572; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
1573; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
1574; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
1575; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1576; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
1577; GFX9-DL-NEXT:    v_and_b32_e32 v6, 0xff, v6
1578; GFX9-DL-NEXT:    v_and_b32_e32 v7, 0xff, v7
1579; GFX9-DL-NEXT:    v_bfe_i32 v4, v1, 0, 8
1580; GFX9-DL-NEXT:    v_bfe_i32 v5, v2, 0, 8
1581; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1582; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v6, v7, v3
1583; GFX9-DL-NEXT:    v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1584; GFX9-DL-NEXT:    v_and_b32_sdwa v9, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1585; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v4, v5, v3
1586; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
1587; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1588; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v8, v9, v3
1589; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
1590; GFX9-DL-NEXT:    global_store_short v0, v1, s[2:3]
1591; GFX9-DL-NEXT:    s_endpgm
1592;
1593; GFX10-DL-LABEL: notdot4_mixedtypes:
1594; GFX10-DL:       ; %bb.0: ; %entry
1595; GFX10-DL-NEXT:    s_clause 0x1
1596; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1597; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1598; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1599; GFX10-DL-NEXT:    v_mov_b32_e32 v7, 0xff
1600; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1601; GFX10-DL-NEXT:    s_clause 0x1
1602; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1603; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1604; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
1605; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
1606; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
1607; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
1608; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1609; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
1610; GFX10-DL-NEXT:    v_bfe_i32 v6, v1, 0, 8
1611; GFX10-DL-NEXT:    v_bfe_i32 v8, v2, 0, 8
1612; GFX10-DL-NEXT:    v_and_b32_e32 v4, 0xff, v4
1613; GFX10-DL-NEXT:    v_and_b32_e32 v5, 0xff, v5
1614; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1615; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
1616; GFX10-DL-NEXT:    v_and_b32_sdwa v4, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1617; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1618; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
1619; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1620; GFX10-DL-NEXT:    v_mad_u16 v3, v6, v8, v3
1621; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
1622; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
1623; GFX10-DL-NEXT:    global_store_short v0, v1, s[2:3]
1624; GFX10-DL-NEXT:    s_endpgm
1625                                              <4 x i8> addrspace(1)* %src2,
1626                                              i16 addrspace(1)* nocapture %dst) {
1627entry:
  ; Each work-item indexes %src1 and %src2 by its workitem id and loads one
  ; <4 x i8> vector from each.
1628  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1629  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
1630  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
1631  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
1632  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
1633
  ; Lane 0: both operands are sign-extended (the only sext lane in this test).
1634  %v1e0 = extractelement <4 x i8> %vec1, i64 0
1635  %cv1e0 = sext i8 %v1e0 to i16
1636  %v2e0 = extractelement <4 x i8> %vec2, i64 0
1637  %cv2e0 = sext i8 %v2e0 to i16
1638  %mul1 = mul nuw nsw i16 %cv1e0, %cv2e0
1639
  ; Lanes 1-3: both operands are zero-extended.
1640  %v1e1 = extractelement <4 x i8> %vec1, i64 1
1641  %cv1e1 = zext i8 %v1e1 to i16
1642  %v2e1 = extractelement <4 x i8> %vec2, i64 1
1643  %cv2e1 = zext i8 %v2e1 to i16
1644  %mul2 = mul nuw nsw i16 %cv1e1, %cv2e1
1645
1646  %v1e2 = extractelement <4 x i8> %vec1, i64 2
1647  %cv1e2 = zext i8 %v1e2 to i16
1648  %v2e2 = extractelement <4 x i8> %vec2, i64 2
1649  %cv2e2 = zext i8 %v2e2 to i16
1650  %mul3 = mul nuw nsw i16 %cv1e2, %cv2e2
1651
1652  %v1e3 = extractelement <4 x i8> %vec1, i64 3
1653  %cv1e3 = zext i8 %v1e3 to i16
1654  %v2e3 = extractelement <4 x i8> %vec2, i64 3
1655  %cv2e3 = zext i8 %v2e3 to i16
1656  %mul4 = mul nuw nsw i16 %cv1e3, %cv2e3
1657
  ; Accumulate into the i16 loaded from %dst; the lane-1 product is added
  ; first, then lane 0, lane 2 and lane 3.
1658  %acc = load i16, i16 addrspace(1)* %dst, align 2
1659  %add1 = add i16 %mul2, %acc
1660  %add2 = add i16 %add1, %mul1
1661  %add3 = add i16 %add2, %mul3
1662  %add4 = add i16 %add3, %mul4
1663
1664  store i16 %add4, i16 addrspace(1)* %dst, align 2
1665  ret void
1666}
1667
1668; TODO: Clean up s_lshr_b32 and support this pattern.
; Dot product written with whole-vector operations: both <4 x i8> inputs are
; zero-extended to <4 x i32> and multiplied as vectors, then the four lanes are
; extracted and added to an i32 accumulator loaded from and stored back to
; %dst. The expected outputs below use per-byte multiplies / multiply-adds on
; every target; none contains a dot4 instruction.
1669define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
1670; GFX7-LABEL: udot4_acc32_vecMul:
1671; GFX7:       ; %bb.0: ; %entry
1672; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1673; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1674; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1675; GFX7-NEXT:    s_mov_b32 s10, 0
1676; GFX7-NEXT:    s_mov_b32 s11, s3
1677; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1678; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1679; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1680; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1681; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1682; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1683; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1684; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
1685; GFX7-NEXT:    s_mov_b32 s2, -1
1686; GFX7-NEXT:    s_waitcnt vmcnt(1)
1687; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v2
1688; GFX7-NEXT:    v_bfe_u32 v3, v2, 8, 8
1689; GFX7-NEXT:    v_bfe_u32 v4, v2, 16, 8
1690; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v2
1691; GFX7-NEXT:    s_waitcnt vmcnt(0)
1692; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
1693; GFX7-NEXT:    v_bfe_u32 v6, v0, 8, 8
1694; GFX7-NEXT:    v_bfe_u32 v7, v0, 16, 8
1695; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
1696; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1697; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, s4
1698; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v6, v0
1699; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v7, v0
1700; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v5, v0
1701; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1702; GFX7-NEXT:    s_endpgm
1703;
1704; GFX8-LABEL: udot4_acc32_vecMul:
1705; GFX8:       ; %bb.0: ; %entry
1706; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1707; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1708; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1709; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1710; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1711; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1712; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1713; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1714; GFX8-NEXT:    v_mov_b32_e32 v1, s7
1715; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
1716; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1717; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1718; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
1719; GFX8-NEXT:    s_waitcnt vmcnt(1)
1720; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 24, v3
1721; GFX8-NEXT:    v_bfe_u32 v4, v3, 16, 8
1722; GFX8-NEXT:    v_lshrrev_b16_e32 v5, 8, v3
1723; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v3
1724; GFX8-NEXT:    s_waitcnt vmcnt(0)
1725; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 24, v0
1726; GFX8-NEXT:    v_bfe_u32 v6, v0, 16, 8
1727; GFX8-NEXT:    v_lshrrev_b16_e32 v7, 8, v0
1728; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
1729; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1730; GFX8-NEXT:    v_mad_u32_u24 v0, v3, v0, s2
1731; GFX8-NEXT:    v_mad_u32_u24 v0, v5, v7, v0
1732; GFX8-NEXT:    v_mad_u32_u24 v0, v4, v6, v0
1733; GFX8-NEXT:    v_mad_u32_u24 v2, v1, v2, v0
1734; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1735; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1736; GFX8-NEXT:    flat_store_dword v[0:1], v2
1737; GFX8-NEXT:    s_endpgm
1738;
1739; GFX9-NODL-LABEL: udot4_acc32_vecMul:
1740; GFX9-NODL:       ; %bb.0: ; %entry
1741; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1742; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1743; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1744; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1745; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
1746; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
1747; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
1748; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1749; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1750; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1751; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1752; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1753; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1754; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1755; GFX9-NODL-NEXT:    v_add3_u32 v2, v3, s0, v4
1756; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, v5, v1
1757; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
1758; GFX9-NODL-NEXT:    s_endpgm
1759;
1760; GFX9-DL-LABEL: udot4_acc32_vecMul:
1761; GFX9-DL:       ; %bb.0: ; %entry
1762; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1763; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1764; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1765; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1766; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1767; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1768; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
1769; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1770; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1771; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1772; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
1773; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1774; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1775; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1776; GFX9-DL-NEXT:    v_add3_u32 v2, v3, s0, v4
1777; GFX9-DL-NEXT:    v_add3_u32 v1, v2, v5, v1
1778; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
1779; GFX9-DL-NEXT:    s_endpgm
1780;
1781; GFX10-DL-LABEL: udot4_acc32_vecMul:
1782; GFX10-DL:       ; %bb.0: ; %entry
1783; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1784; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1785; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1786; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1787; GFX10-DL-NEXT:    s_clause 0x1
1788; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1789; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1790; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0xffff
1791; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
1792; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1793; GFX10-DL-NEXT:    v_and_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
1794; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1795; GFX10-DL-NEXT:    v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
1796; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1797; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v0, v3, v0
1798; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
1799; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
1800; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
1801; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1802; GFX10-DL-NEXT:    v_add3_u32 v0, v4, s2, v0
1803; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
1804; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
1805; GFX10-DL-NEXT:    s_endpgm
1806                                              <4 x i8> addrspace(1)* %src2,
1807                                              i32 addrspace(1)* nocapture %dst) {
1808entry:
  ; Each work-item indexes %src1 and %src2 by its workitem id and loads one
  ; <4 x i8> vector from each.
1809  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1810  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
1811  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
1812  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
1813  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
1814
  ; Widen the whole vectors first (unsigned), then multiply once as vectors.
1815  %cvec1 = zext <4 x i8> %vec1 to <4 x i32>
1816  %cvec2 = zext <4 x i8> %vec2 to <4 x i32>
1817
1818  %mul = mul <4 x i32> %cvec1, %cvec2
1819  %mul0 = extractelement <4 x i32> %mul, i64 0
1820  %mul1 = extractelement <4 x i32> %mul, i64 1
1821  %mul2 = extractelement <4 x i32> %mul, i64 2
1822  %mul3 = extractelement <4 x i32> %mul, i64 3
1823
  ; Reduce in lane order on top of the i32 accumulator loaded from %dst.
1824  %acc = load i32, i32 addrspace(1)* %dst, align 4
1825  %add1 = add i32 %mul0, %acc
1826  %add2 = add i32 %add1, %mul1
1827  %add3 = add i32 %add2, %mul2
1828  %add4 = add i32 %add3, %mul3
1829
1830  store i32 %add4, i32 addrspace(1)* %dst, align 4
1831  ret void
1832}
1833
1834; TODO: This pattern should be recognized.
; 16-bit variant of the whole-vector dot product: both <4 x i8> inputs are
; zero-extended to <4 x i16> and multiplied as vectors, then the four lanes are
; extracted and added to an i16 accumulator loaded from and stored back to
; %dst. None of the expected outputs below contain a dot4 instruction.
1835define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
1836; GFX7-LABEL: udot4_acc16_vecMul:
1837; GFX7:       ; %bb.0: ; %entry
1838; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1839; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1840; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1841; GFX7-NEXT:    s_mov_b32 s10, 0
1842; GFX7-NEXT:    s_mov_b32 s11, s3
1843; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1844; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1845; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1846; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1847; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1848; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1849; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1850; GFX7-NEXT:    s_mov_b32 s2, -1
1851; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
1852; GFX7-NEXT:    s_waitcnt vmcnt(2)
1853; GFX7-NEXT:    v_and_b32_e32 v3, 0xff00, v2
1854; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v2
1855; GFX7-NEXT:    s_waitcnt vmcnt(1)
1856; GFX7-NEXT:    v_and_b32_e32 v6, 0xff00, v0
1857; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
1858; GFX7-NEXT:    v_and_b32_e32 v7, 0xff, v0
1859; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
1860; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
1861; GFX7-NEXT:    v_or_b32_e32 v4, v7, v4
1862; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
1863; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v3
1864; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
1865; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v4
1866; GFX7-NEXT:    s_waitcnt vmcnt(0)
1867; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
1868; GFX7-NEXT:    v_bfe_u32 v5, v2, 16, 8
1869; GFX7-NEXT:    v_bfe_u32 v8, v0, 16, 8
1870; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v7, v1
1871; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
1872; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
1873; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
1874; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
1875; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
1876; GFX7-NEXT:    s_endpgm
1877;
1878; GFX8-LABEL: udot4_acc16_vecMul:
1879; GFX8:       ; %bb.0: ; %entry
1880; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1881; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1882; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1883; GFX8-NEXT:    v_mov_b32_e32 v5, 0xff
1884; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1885; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1886; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1887; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1888; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1889; GFX8-NEXT:    v_mov_b32_e32 v1, s7
1890; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
1891; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1892; GFX8-NEXT:    flat_load_dword v2, v[0:1]
1893; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1894; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1895; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
1896; GFX8-NEXT:    s_waitcnt vmcnt(2)
1897; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
1898; GFX8-NEXT:    v_lshrrev_b16_e32 v7, 8, v3
1899; GFX8-NEXT:    v_and_b32_sdwa v10, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1900; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v3
1901; GFX8-NEXT:    s_waitcnt vmcnt(1)
1902; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
1903; GFX8-NEXT:    v_lshrrev_b16_e32 v9, 8, v2
1904; GFX8-NEXT:    v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1905; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v2
1906; GFX8-NEXT:    s_waitcnt vmcnt(0)
1907; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
1908; GFX8-NEXT:    v_mad_u16 v2, v7, v9, v2
1909; GFX8-NEXT:    v_mad_u16 v2, v10, v5, v2
1910; GFX8-NEXT:    v_mad_u16 v2, v6, v8, v2
1911; GFX8-NEXT:    flat_store_short v[0:1], v2
1912; GFX8-NEXT:    s_endpgm
1913;
1914; GFX9-NODL-LABEL: udot4_acc16_vecMul:
1915; GFX9-NODL:       ; %bb.0: ; %entry
1916; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1917; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1918; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1919; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
1920; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, 0xffff
1921; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1922; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
1923; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
1924; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1925; GFX9-NODL-NEXT:    global_load_ushort v3, v0, s[2:3]
1926; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
1927; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v5, 8, v1
1928; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
1929; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
1930; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v7, 8, v2
1931; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
1932; GFX9-NODL-NEXT:    v_and_b32_sdwa v9, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1933; GFX9-NODL-NEXT:    v_and_b32_sdwa v10, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1934; GFX9-NODL-NEXT:    v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1935; GFX9-NODL-NEXT:    v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1936; GFX9-NODL-NEXT:    v_lshl_or_b32 v2, v7, 16, v2
1937; GFX9-NODL-NEXT:    v_lshl_or_b32 v1, v5, 16, v1
1938; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xffff, v10
1939; GFX9-NODL-NEXT:    v_and_b32_e32 v9, 0xffff, v9
1940; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
1941; GFX9-NODL-NEXT:    v_lshl_or_b32 v4, v8, 16, v4
1942; GFX9-NODL-NEXT:    v_lshl_or_b32 v5, v6, 16, v9
1943; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1944; GFX9-NODL-NEXT:    v_add_u16_e32 v3, v1, v3
1945; GFX9-NODL-NEXT:    v_pk_mul_lo_u16 v2, v5, v4
1946; GFX9-NODL-NEXT:    v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1947; GFX9-NODL-NEXT:    v_add_u16_e32 v1, v1, v2
1948; GFX9-NODL-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1949; GFX9-NODL-NEXT:    global_store_short v0, v1, s[2:3]
1950; GFX9-NODL-NEXT:    s_endpgm
1951;
1952; GFX9-DL-LABEL: udot4_acc16_vecMul:
1953; GFX9-DL:       ; %bb.0: ; %entry
1954; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1955; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1956; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1957; GFX9-DL-NEXT:    s_movk_i32 s0, 0xff
1958; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 0xffff
1959; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1960; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1961; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1962; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1963; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
1964; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
1965; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v5, 8, v1
1966; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
1967; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1968; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v7, 8, v2
1969; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 24, v2
1970; GFX9-DL-NEXT:    v_and_b32_sdwa v9, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1971; GFX9-DL-NEXT:    v_and_b32_sdwa v10, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1972; GFX9-DL-NEXT:    v_and_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1973; GFX9-DL-NEXT:    v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1974; GFX9-DL-NEXT:    v_lshl_or_b32 v2, v7, 16, v2
1975; GFX9-DL-NEXT:    v_lshl_or_b32 v1, v5, 16, v1
1976; GFX9-DL-NEXT:    v_and_b32_e32 v4, 0xffff, v10
1977; GFX9-DL-NEXT:    v_and_b32_e32 v9, 0xffff, v9
1978; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
1979; GFX9-DL-NEXT:    v_lshl_or_b32 v4, v8, 16, v4
1980; GFX9-DL-NEXT:    v_lshl_or_b32 v5, v6, 16, v9
1981; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1982; GFX9-DL-NEXT:    v_add_u16_e32 v3, v1, v3
1983; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v5, v4
1984; GFX9-DL-NEXT:    v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1985; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v2
1986; GFX9-DL-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1987; GFX9-DL-NEXT:    global_store_short v0, v1, s[2:3]
1988; GFX9-DL-NEXT:    s_endpgm
1989;
1990; GFX10-DL-LABEL: udot4_acc16_vecMul:
1991; GFX10-DL:       ; %bb.0: ; %entry
1992; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1993; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1994; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1995; GFX10-DL-NEXT:    v_mov_b32_e32 v4, 0xffff
1996; GFX10-DL-NEXT:    v_mov_b32_e32 v5, 0xff
1997; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1998; GFX10-DL-NEXT:    s_clause 0x1
1999; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2000; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2001; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
2002; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[0:1]
2003; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
2004; GFX10-DL-NEXT:    v_lshrrev_b16 v6, 8, v1
2005; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2006; GFX10-DL-NEXT:    v_lshrrev_b16 v7, 8, v2
2007; GFX10-DL-NEXT:    v_and_b32_sdwa v8, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
2008; GFX10-DL-NEXT:    v_and_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
2009; GFX10-DL-NEXT:    v_and_b32_sdwa v9, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2010; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2011; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
2012; GFX10-DL-NEXT:    v_lshl_or_b32 v7, v7, 16, v8
2013; GFX10-DL-NEXT:    v_lshl_or_b32 v4, v6, 16, v4
2014; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
2015; GFX10-DL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
2016; GFX10-DL-NEXT:    v_and_b32_e32 v6, 0xffff, v9
2017; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v7
2018; GFX10-DL-NEXT:    v_lshl_or_b32 v2, v2, 16, v5
2019; GFX10-DL-NEXT:    v_lshl_or_b32 v1, v1, 16, v6
2020; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
2021; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2022; GFX10-DL-NEXT:    v_add_nc_u16 v3, v4, v3
2023; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
2024; GFX10-DL-NEXT:    v_add_nc_u16 v2, v3, v5
2025; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2026; GFX10-DL-NEXT:    v_add_nc_u16 v1, v2, v1
2027; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v3
2028; GFX10-DL-NEXT:    global_store_short v0, v1, s[0:1]
2029; GFX10-DL-NEXT:    s_endpgm
2030                                              <4 x i8> addrspace(1)* %src2,
2031                                              i16 addrspace(1)* nocapture %dst) {
2032entry:
  ; Each work-item indexes %src1 and %src2 by its workitem id and loads one
  ; <4 x i8> vector from each.
2033  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2034  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
2035  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
2036  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
2037  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
2038
  ; Widen the whole vectors to i16 lanes (unsigned), then multiply once as
  ; vectors.
2039  %cvec1 = zext <4 x i8> %vec1 to <4 x i16>
2040  %cvec2 = zext <4 x i8> %vec2 to <4 x i16>
2041
2042  %mul = mul <4 x i16> %cvec1, %cvec2
2043  %mul0 = extractelement <4 x i16> %mul, i64 0
2044  %mul1 = extractelement <4 x i16> %mul, i64 1
2045  %mul2 = extractelement <4 x i16> %mul, i64 2
2046  %mul3 = extractelement <4 x i16> %mul, i64 3
2047
  ; Reduce in lane order on top of the i16 accumulator loaded from %dst.
  ; NOTE(review): the i16 load/store here use align 4, whereas
  ; notdot4_mixedtypes uses align 2 for its i16 accumulator - confirm this
  ; is intentional before regenerating the checks.
2048  %acc = load i16, i16 addrspace(1)* %dst, align 4
2049  %add1 = add i16 %mul0, %acc
2050  %add2 = add i16 %add1, %mul1
2051  %add3 = add i16 %add2, %mul2
2052  %add4 = add i16 %add3, %mul3
2053
2054  store i16 %add4, i16 addrspace(1)* %dst, align 4
2055  ret void
2056}
2057
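2058; TODO: Support this pattern (a <4 x i8> vector multiply summed into an i8 accumulator); the gfx906 and gfx10 output below still uses separate multiply and add instructions rather than a dot instruction.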
; Kernel under test: udot4_acc8_vecMul(%src1, %src2, %dst).
;   %src1, %src2 - global (addrspace 1) arrays of <4 x i8>, indexed by the
;                  value returned from llvm.amdgcn.workitem.id.x.
;   %dst         - global pointer to the i8 accumulator; it is read once and
;                  overwritten with the result.
; The four byte lanes are multiplied with a single <4 x i8> vector multiply and
; the lane products are then added, one at a time, to the loaded accumulator.
; The assertion lines interleaved with the signature are autogenerated (see the
; NOTE at the top of the file); regenerate them instead of editing by hand.
2059define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
2060; GFX7-LABEL: udot4_acc8_vecMul:
2061; GFX7:       ; %bb.0: ; %entry
2062; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2063; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2064; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2065; GFX7-NEXT:    s_mov_b32 s10, 0
2066; GFX7-NEXT:    s_mov_b32 s11, s3
2067; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2068; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
2069; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2070; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2071; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2072; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
2073; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2074; GFX7-NEXT:    s_mov_b32 s2, -1
2075; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
2076; GFX7-NEXT:    s_waitcnt vmcnt(2)
2077; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v2
2078; GFX7-NEXT:    v_bfe_u32 v5, v2, 8, 8
2079; GFX7-NEXT:    s_waitcnt vmcnt(1)
2080; GFX7-NEXT:    v_and_b32_e32 v7, 0xff, v0
2081; GFX7-NEXT:    v_bfe_u32 v8, v0, 8, 8
2082; GFX7-NEXT:    s_waitcnt vmcnt(0)
2083; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v7, v1
2084; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
2085; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
2086; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
2087; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
2088; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v8, v1
2089; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
2090; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v6, v0
2091; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2092; GFX7-NEXT:    s_endpgm
2093;
2094; GFX8-LABEL: udot4_acc8_vecMul:
2095; GFX8:       ; %bb.0: ; %entry
2096; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2097; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2098; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2099; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2100; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2101; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2102; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2103; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2104; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2105; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
2106; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2107; GFX8-NEXT:    flat_load_dword v2, v[0:1]
2108; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2109; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2110; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
2111; GFX8-NEXT:    s_waitcnt vmcnt(2)
2112; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
2113; GFX8-NEXT:    s_waitcnt vmcnt(1)
2114; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
2115; GFX8-NEXT:    v_mul_lo_u16_sdwa v7, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
2116; GFX8-NEXT:    v_mul_lo_u16_e32 v9, v5, v6
2117; GFX8-NEXT:    v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2118; GFX8-NEXT:    v_mul_lo_u16_sdwa v8, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2119; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
2120; GFX8-NEXT:    v_or_b32_e32 v8, v8, v9
2121; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v8
2122; GFX8-NEXT:    s_waitcnt vmcnt(0)
2123; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
2124; GFX8-NEXT:    v_add_u16_e32 v2, v2, v8
2125; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v7
2126; GFX8-NEXT:    v_mad_u16 v2, v5, v6, v2
2127; GFX8-NEXT:    v_add_u16_e32 v2, v2, v7
2128; GFX8-NEXT:    flat_store_byte v[0:1], v2
2129; GFX8-NEXT:    s_endpgm
2130;
2131; GFX9-NODL-LABEL: udot4_acc8_vecMul:
2132; GFX9-NODL:       ; %bb.0: ; %entry
2133; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2134; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2135; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2136; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2137; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
2138; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
2139; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2140; GFX9-NODL-NEXT:    global_load_ubyte v3, v0, s[2:3]
2141; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
2142; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
2143; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
2144; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
2145; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
2146; GFX9-NODL-NEXT:    v_mul_lo_u16_e32 v8, v4, v5
2147; GFX9-NODL-NEXT:    v_or_b32_sdwa v6, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2148; GFX9-NODL-NEXT:    v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2149; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
2150; GFX9-NODL-NEXT:    v_or_b32_e32 v7, v7, v8
2151; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v7, 8, v7
2152; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2153; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
2154; GFX9-NODL-NEXT:    v_add_u16_e32 v1, v1, v7
2155; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v6, 8, v6
2156; GFX9-NODL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
2157; GFX9-NODL-NEXT:    v_add_u16_e32 v1, v1, v6
2158; GFX9-NODL-NEXT:    global_store_byte v0, v1, s[2:3]
2159; GFX9-NODL-NEXT:    s_endpgm
2160;
2161; GFX9-DL-LABEL: udot4_acc8_vecMul:
2162; GFX9-DL:       ; %bb.0: ; %entry
2163; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2164; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2165; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2166; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2167; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2168; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2169; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2170; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
2171; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
2172; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
2173; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2174; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
2175; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
2176; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v8, v4, v5
2177; GFX9-DL-NEXT:    v_or_b32_sdwa v6, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2178; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v7, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2179; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
2180; GFX9-DL-NEXT:    v_or_b32_e32 v7, v7, v8
2181; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v7
2182; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2183; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
2184; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v7
2185; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v6
2186; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v5, v1
2187; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v6
2188; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
2189; GFX9-DL-NEXT:    s_endpgm
2190;
2191; GFX10-DL-LABEL: udot4_acc8_vecMul:
2192; GFX10-DL:       ; %bb.0: ; %entry
2193; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2194; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2195; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2196; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2197; GFX10-DL-NEXT:    s_clause 0x1
2198; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2199; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2200; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
2201; GFX10-DL-NEXT:    global_load_ubyte v3, v0, s[0:1]
2202; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
2203; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
2204; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2205; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v2
2206; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
2207; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
2208; GFX10-DL-NEXT:    v_lshrrev_b16 v8, 8, v2
2209; GFX10-DL-NEXT:    v_mul_lo_u16 v4, v4, v5
2210; GFX10-DL-NEXT:    v_lshrrev_b16 v5, 8, v1
2211; GFX10-DL-NEXT:    v_mul_lo_u16 v9, v6, v7
2212; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2213; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
2214; GFX10-DL-NEXT:    v_lshlrev_b16 v4, 8, v4
2215; GFX10-DL-NEXT:    v_mul_lo_u16 v5, v5, v8
2216; GFX10-DL-NEXT:    v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2217; GFX10-DL-NEXT:    v_lshlrev_b16 v5, 8, v5
2218; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
2219; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 8, v4
2220; GFX10-DL-NEXT:    v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2221; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
2222; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v5
2223; GFX10-DL-NEXT:    v_mad_u16 v1, v6, v7, v1
2224; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v2
2225; GFX10-DL-NEXT:    global_store_byte v0, v1, s[0:1]
2226; GFX10-DL-NEXT:    s_endpgm
2227                                             <4 x i8> addrspace(1)* %src2,
2228                                             i8 addrspace(1)* nocapture %dst) {
2229entry:
  ; Both source arrays are indexed with the same workitem.id.x value, so each
  ; work item loads one <4 x i8> element from %src1 and one from %src2.
2230  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2231  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src1, i32 %idx
2232  %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1
2233  %gep2 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %src2, i32 %idx
2234  %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %gep2
2235
  ; The multiply is done directly on <4 x i8> with no zext/sext beforehand, so
  ; every lane product is an i8. The lanes are then pulled out individually.
2236  %mul = mul <4 x i8> %vec1, %vec2
2237  %mul0 = extractelement <4 x i8> %mul, i64 0
2238  %mul1 = extractelement <4 x i8> %mul, i64 1
2239  %mul2 = extractelement <4 x i8> %mul, i64 2
2240  %mul3 = extractelement <4 x i8> %mul, i64 3
2241
  ; Scalar i8 reduction chain: ((((mul0 + acc) + mul1) + mul2) + mul3).
  ; Every work item reads and writes the same %dst location (no per-item GEP).
2242  %acc = load i8, i8 addrspace(1)* %dst, align 4
2243  %add1 = add i8 %mul0, %acc
2244  %add2 = add i8 %add1, %mul1
2245  %add3 = add i8 %add2, %mul2
2246  %add4 = add i8 %add3, %mul3
2247
2248  store i8 %add4, i8 addrspace(1)* %dst, align 4
2249  ret void
2250}
2251
; Declaration of the intrinsic the kernels call to obtain %idx, the i32 index
; used in their getelementptr instructions on %src1 and %src2.
2252declare i32 @llvm.amdgcn.workitem.id.x()
2253