1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s
8; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s
9; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s
10
; idot8_acc32: signed dot product of two <8 x i4> vectors with an i32
; accumulator.
;
; Each work-item loads one <8 x i4> from %src1 and one from %src2 (indexed by
; workitem.id.x), sign-extends every element to i32, multiplies the elements
; pairwise, adds the eight products to the i32 loaded from %dst and stores the
; sum back to %dst.
;
; Expected selection, as pinned by the checks below:
;   GFX7 / GFX8        - v_bfe_i32 extraction feeding a v_mad_i32_i24 chain.
;   GFX9               - v_bfe_i32 extraction, v_mul_i32_i24 and v_add3_u32.
;   GFX9-DL, GFX10-DL-* - a single v_dot8_i32_i4.
;
; Only prefixes enabled by a RUN line are kept here; check lines under any
; other prefix are never matched by FileCheck and only go stale.
;
; NOTE(review): the multiplies carry `nuw` even though both operands are
; sign-extended; presumably copied from the unsigned variant of this test.
; Confirm before changing, since the flag may influence pattern matching.
11define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
12; GFX7-LABEL: idot8_acc32:
13; GFX7:       ; %bb.0: ; %entry
14; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
15; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
16; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
17; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
18; GFX7-NEXT:    s_mov_b32 s14, -1
19; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
20; GFX7-NEXT:    s_add_u32 s12, s12, s3
21; GFX7-NEXT:    s_mov_b32 s3, 0xf000
22; GFX7-NEXT:    s_mov_b32 s10, 0
23; GFX7-NEXT:    s_mov_b32 s11, s3
24; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
25; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
26; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
27; GFX7-NEXT:    v_mov_b32_e32 v1, 0
28; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
29; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
30; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
31; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
32; GFX7-NEXT:    s_mov_b32 s2, -1
33; GFX7-NEXT:    s_addc_u32 s13, s13, 0
34; GFX7-NEXT:    s_waitcnt vmcnt(1)
35; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 4
36; GFX7-NEXT:    v_bfe_i32 v3, v2, 4, 4
37; GFX7-NEXT:    s_waitcnt vmcnt(0)
38; GFX7-NEXT:    v_bfe_i32 v9, v0, 0, 4
39; GFX7-NEXT:    v_bfe_i32 v10, v0, 4, 4
40; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
41; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v9, s4
42; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 4
43; GFX7-NEXT:    v_bfe_i32 v11, v0, 8, 4
44; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v10, v1
45; GFX7-NEXT:    v_bfe_i32 v5, v2, 12, 4
46; GFX7-NEXT:    v_bfe_i32 v12, v0, 12, 4
47; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v11, v1
48; GFX7-NEXT:    v_bfe_i32 v6, v2, 16, 4
49; GFX7-NEXT:    v_bfe_i32 v13, v0, 16, 4
50; GFX7-NEXT:    v_mad_i32_i24 v1, v5, v12, v1
51; GFX7-NEXT:    v_bfe_i32 v7, v2, 20, 4
52; GFX7-NEXT:    v_bfe_i32 v14, v0, 20, 4
53; GFX7-NEXT:    v_mad_i32_i24 v1, v6, v13, v1
54; GFX7-NEXT:    v_bfe_i32 v8, v2, 24, 4
55; GFX7-NEXT:    v_bfe_i32 v15, v0, 24, 4
56; GFX7-NEXT:    v_mad_i32_i24 v1, v7, v14, v1
57; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
58; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
59; GFX7-NEXT:    v_mad_i32_i24 v1, v8, v15, v1
60; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
61; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
62; GFX7-NEXT:    s_endpgm
63;
64; GFX8-LABEL: idot8_acc32:
65; GFX8:       ; %bb.0: ; %entry
66; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
67; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
68; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
69; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
70; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
71; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
72; GFX8-NEXT:    v_mov_b32_e32 v1, s5
73; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
74; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
75; GFX8-NEXT:    flat_load_dword v3, v[0:1]
76; GFX8-NEXT:    v_mov_b32_e32 v1, s7
77; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
78; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
79; GFX8-NEXT:    flat_load_dword v0, v[0:1]
80; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
81; GFX8-NEXT:    s_mov_b32 s10, -1
82; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
83; GFX8-NEXT:    s_add_u32 s8, s8, s3
84; GFX8-NEXT:    s_addc_u32 s9, s9, 0
85; GFX8-NEXT:    s_waitcnt vmcnt(1)
86; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 4
87; GFX8-NEXT:    v_bfe_i32 v4, v3, 4, 4
88; GFX8-NEXT:    v_bfe_i32 v6, v3, 8, 4
89; GFX8-NEXT:    v_bfe_i32 v8, v3, 12, 4
90; GFX8-NEXT:    v_bfe_i32 v10, v3, 16, 4
91; GFX8-NEXT:    v_bfe_i32 v12, v3, 20, 4
92; GFX8-NEXT:    s_waitcnt vmcnt(0)
93; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 4
94; GFX8-NEXT:    v_bfe_i32 v5, v0, 4, 4
95; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
96; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, s2
97; GFX8-NEXT:    v_bfe_i32 v7, v0, 8, 4
98; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
99; GFX8-NEXT:    v_bfe_i32 v9, v0, 12, 4
100; GFX8-NEXT:    v_mad_i32_i24 v1, v6, v7, v1
101; GFX8-NEXT:    v_bfe_i32 v11, v0, 16, 4
102; GFX8-NEXT:    v_mad_i32_i24 v1, v8, v9, v1
103; GFX8-NEXT:    v_bfe_i32 v13, v0, 20, 4
104; GFX8-NEXT:    v_mad_i32_i24 v1, v10, v11, v1
105; GFX8-NEXT:    v_bfe_i32 v14, v3, 24, 4
106; GFX8-NEXT:    v_bfe_i32 v15, v0, 24, 4
107; GFX8-NEXT:    v_mad_i32_i24 v1, v12, v13, v1
108; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 28, v3
109; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
110; GFX8-NEXT:    v_mad_i32_i24 v1, v14, v15, v1
111; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
112; GFX8-NEXT:    v_mov_b32_e32 v0, s0
113; GFX8-NEXT:    v_mov_b32_e32 v1, s1
114; GFX8-NEXT:    flat_store_dword v[0:1], v2
115; GFX8-NEXT:    s_endpgm
116;
117; GFX9-LABEL: idot8_acc32:
118; GFX9:       ; %bb.0: ; %entry
119; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
120; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
121; GFX9-NEXT:    s_mov_b32 s10, -1
122; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
123; GFX9-NEXT:    s_add_u32 s8, s8, s3
124; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
125; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
126; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
127; GFX9-NEXT:    s_addc_u32 s9, s9, 0
128; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
129; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
130; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
131; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
132; GFX9-NEXT:    v_mov_b32_e32 v0, 0
133; GFX9-NEXT:    s_waitcnt vmcnt(1)
134; GFX9-NEXT:    v_bfe_i32 v3, v1, 0, 4
135; GFX9-NEXT:    s_waitcnt vmcnt(0)
136; GFX9-NEXT:    v_bfe_i32 v4, v2, 0, 4
137; GFX9-NEXT:    v_bfe_i32 v5, v1, 4, 4
138; GFX9-NEXT:    v_bfe_i32 v6, v2, 4, 4
139; GFX9-NEXT:    v_bfe_i32 v7, v1, 8, 4
140; GFX9-NEXT:    v_bfe_i32 v8, v2, 8, 4
141; GFX9-NEXT:    v_bfe_i32 v9, v1, 12, 4
142; GFX9-NEXT:    v_bfe_i32 v10, v2, 12, 4
143; GFX9-NEXT:    v_bfe_i32 v11, v1, 16, 4
144; GFX9-NEXT:    v_bfe_i32 v12, v2, 16, 4
145; GFX9-NEXT:    v_bfe_i32 v13, v1, 20, 4
146; GFX9-NEXT:    v_bfe_i32 v14, v2, 20, 4
147; GFX9-NEXT:    v_bfe_i32 v15, v1, 24, 4
148; GFX9-NEXT:    v_bfe_i32 v16, v2, 24, 4
149; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
150; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
151; GFX9-NEXT:    v_mul_i32_i24_e32 v3, v3, v4
152; GFX9-NEXT:    v_mul_i32_i24_e32 v4, v5, v6
153; GFX9-NEXT:    v_mul_i32_i24_e32 v5, v7, v8
154; GFX9-NEXT:    v_mul_i32_i24_e32 v6, v9, v10
155; GFX9-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
156; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
157; GFX9-NEXT:    v_add3_u32 v2, v3, s0, v4
158; GFX9-NEXT:    v_mul_i32_i24_e32 v7, v11, v12
159; GFX9-NEXT:    v_mul_i32_i24_e32 v8, v13, v14
160; GFX9-NEXT:    v_add3_u32 v2, v2, v5, v6
161; GFX9-NEXT:    v_mul_i32_i24_e32 v9, v15, v16
162; GFX9-NEXT:    v_add3_u32 v2, v2, v7, v8
163; GFX9-NEXT:    v_add3_u32 v1, v2, v9, v1
164; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
165; GFX9-NEXT:    s_endpgm
166;
167; GFX9-DL-LABEL: idot8_acc32:
168; GFX9-DL:       ; %bb.0: ; %entry
169; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
170; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
171; GFX9-DL-NEXT:    s_mov_b32 s10, -1
172; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
173; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
174; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
175; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
176; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
177; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
178; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
179; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
180; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
181; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
182; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
183; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
184; GFX9-DL-NEXT:    v_dot8_i32_i4 v1, v1, v2, s0
185; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
186; GFX9-DL-NEXT:    s_endpgm
187;
188; GFX10-DL-XNACK-LABEL: idot8_acc32:
189; GFX10-DL-XNACK:       ; %bb.0: ; %entry
190; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
191; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
192; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
193; GFX10-DL-XNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
194; GFX10-DL-XNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
195; GFX10-DL-XNACK-NEXT:    s_mov_b32 s10, -1
196; GFX10-DL-XNACK-NEXT:    s_mov_b32 s11, 0x31c16000
197; GFX10-DL-XNACK-NEXT:    s_add_u32 s8, s8, s3
198; GFX10-DL-XNACK-NEXT:    s_addc_u32 s9, s9, 0
199; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
200; GFX10-DL-XNACK-NEXT:    s_clause 0x1
201; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[4:5]
202; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[6:7]
203; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v0, 0
204; GFX10-DL-XNACK-NEXT:    s_load_dword s2, s[0:1], 0x0
205; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
206; GFX10-DL-XNACK-NEXT:    v_dot8_i32_i4 v1, v1, v2, s2
207; GFX10-DL-XNACK-NEXT:    global_store_dword v0, v1, s[0:1]
208; GFX10-DL-XNACK-NEXT:    s_endpgm
209;
210; GFX10-DL-NOXNACK-LABEL: idot8_acc32:
211; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
212; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
213; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
214; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
215; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
216; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
217; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
218; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s10, -1
219; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s11, 0x31c16000
220; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s8, s8, s3
221; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s9, s9, 0
222; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
223; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
224; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[4:5]
225; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[6:7]
226; GFX10-DL-NOXNACK-NEXT:    s_load_dword s2, s[0:1], 0x0
227; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
228; GFX10-DL-NOXNACK-NEXT:    v_dot8_i32_i4 v0, v1, v0, s2
229; GFX10-DL-NOXNACK-NEXT:    global_store_dword v2, v0, s[0:1]
230; GFX10-DL-NOXNACK-NEXT:    s_endpgm
252                                       <8 x i4> addrspace(1)* %src2,
253                                       i32 addrspace(1)* nocapture %dst) {
254entry:
255  %idx = call i32 @llvm.amdgcn.workitem.id.x()
256  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
257  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
258  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
259  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
260
261  %v1e0 = extractelement <8 x i4> %vec1, i64 0
262  %cv1e0 = sext i4 %v1e0 to i32
263  %v2e0 = extractelement <8 x i4> %vec2, i64 0
264  %cv2e0 = sext i4 %v2e0 to i32
265  %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0
266
267  %v1e1 = extractelement <8 x i4> %vec1, i64 1
268  %cv1e1 = sext i4 %v1e1 to i32
269  %v2e1 = extractelement <8 x i4> %vec2, i64 1
270  %cv2e1 = sext i4 %v2e1 to i32
271  %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1
272
273  %v1e2 = extractelement <8 x i4> %vec1, i64 2
274  %cv1e2 = sext i4 %v1e2 to i32
275  %v2e2 = extractelement <8 x i4> %vec2, i64 2
276  %cv2e2 = sext i4 %v2e2 to i32
277  %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2
278
279  %v1e3 = extractelement <8 x i4> %vec1, i64 3
280  %cv1e3 = sext i4 %v1e3 to i32
281  %v2e3 = extractelement <8 x i4> %vec2, i64 3
282  %cv2e3 = sext i4 %v2e3 to i32
283  %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3
284
285  %v1e4 = extractelement <8 x i4> %vec1, i64 4
286  %cv1e4 = sext i4 %v1e4 to i32
287  %v2e4 = extractelement <8 x i4> %vec2, i64 4
288  %cv2e4 = sext i4 %v2e4 to i32
289  %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4
290
291  %v1e5 = extractelement <8 x i4> %vec1, i64 5
292  %cv1e5 = sext i4 %v1e5 to i32
293  %v2e5 = extractelement <8 x i4> %vec2, i64 5
294  %cv2e5 = sext i4 %v2e5 to i32
295  %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5
296
297  %v1e6 = extractelement <8 x i4> %vec1, i64 6
298  %cv1e6 = sext i4 %v1e6 to i32
299  %v2e6 = extractelement <8 x i4> %vec2, i64 6
300  %cv2e6 = sext i4 %v2e6 to i32
301  %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6
302
303  %v1e7 = extractelement <8 x i4> %vec1, i64 7
304  %cv1e7 = sext i4 %v1e7 to i32
305  %v2e7 = extractelement <8 x i4> %vec2, i64 7
306  %cv2e7 = sext i4 %v2e7 to i32
307  %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7
308
309  %acc = load i32, i32 addrspace(1)* %dst, align 4
310  %add1 = add i32 %mul0, %acc
311  %add2 = add i32 %add1, %mul1
312  %add3 = add i32 %add2, %mul2
313  %add4 = add i32 %add3, %mul3
314  %add5 = add i32 %add4, %mul4
315  %add6 = add i32 %add5, %mul5
316  %add7 = add i32 %add6, %mul6
317  %add8 = add i32 %add7, %mul7
318
319  store i32 %add8, i32 addrspace(1)* %dst, align 4
320  ret void
321}
322
323; TODO: Once the unnecessary zero extensions of the elements are removed,
324; the pattern recognizer will kick in.
325define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
326; GFX7-LABEL: idot8_acc16:
327; GFX7:       ; %bb.0: ; %entry
328; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
329; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
330; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
331; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
332; GFX7-NEXT:    s_mov_b32 s14, -1
333; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
334; GFX7-NEXT:    s_add_u32 s12, s12, s3
335; GFX7-NEXT:    s_mov_b32 s3, 0xf000
336; GFX7-NEXT:    s_mov_b32 s10, 0
337; GFX7-NEXT:    s_mov_b32 s11, s3
338; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
339; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
340; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
341; GFX7-NEXT:    v_mov_b32_e32 v1, 0
342; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
343; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
344; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
345; GFX7-NEXT:    s_mov_b32 s2, -1
346; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
347; GFX7-NEXT:    s_addc_u32 s13, s13, 0
348; GFX7-NEXT:    s_waitcnt vmcnt(2)
349; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 4
350; GFX7-NEXT:    v_bfe_i32 v4, v2, 4, 4
351; GFX7-NEXT:    s_waitcnt vmcnt(1)
352; GFX7-NEXT:    v_bfe_i32 v10, v0, 0, 4
353; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
354; GFX7-NEXT:    v_bfe_i32 v11, v0, 4, 4
355; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff, v10
356; GFX7-NEXT:    v_bfe_i32 v5, v2, 8, 4
357; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
358; GFX7-NEXT:    v_bfe_i32 v12, v0, 8, 4
359; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff, v11
360; GFX7-NEXT:    s_waitcnt vmcnt(0)
361; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v10, v1
362; GFX7-NEXT:    v_bfe_i32 v6, v2, 12, 4
363; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v5
364; GFX7-NEXT:    v_bfe_i32 v13, v0, 12, 4
365; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff, v12
366; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v11, v1
367; GFX7-NEXT:    v_bfe_i32 v7, v2, 16, 4
368; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v6
369; GFX7-NEXT:    v_bfe_i32 v14, v0, 16, 4
370; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff, v13
371; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v12, v1
372; GFX7-NEXT:    v_bfe_i32 v8, v2, 20, 4
373; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v7
374; GFX7-NEXT:    v_bfe_i32 v15, v0, 20, 4
375; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff, v14
376; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v13, v1
377; GFX7-NEXT:    v_bfe_i32 v9, v2, 24, 4
378; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff, v8
379; GFX7-NEXT:    v_bfe_i32 v16, v0, 24, 4
380; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff, v15
381; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v14, v1
382; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
383; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff, v9
384; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
385; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff, v16
386; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v15, v1
387; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
388; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
389; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v16, v1
390; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
391; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
392; GFX7-NEXT:    s_endpgm
393;
394; GFX8-LABEL: idot8_acc16:
395; GFX8:       ; %bb.0: ; %entry
396; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
397; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
398; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
399; GFX8-NEXT:    v_mov_b32_e32 v5, 12
400; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
401; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
402; GFX8-NEXT:    v_mov_b32_e32 v1, s5
403; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
404; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
405; GFX8-NEXT:    flat_load_dword v3, v[0:1]
406; GFX8-NEXT:    v_mov_b32_e32 v1, s7
407; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
408; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
409; GFX8-NEXT:    flat_load_dword v2, v[0:1]
410; GFX8-NEXT:    v_mov_b32_e32 v0, s0
411; GFX8-NEXT:    v_mov_b32_e32 v1, s1
412; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
413; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
414; GFX8-NEXT:    s_mov_b32 s10, -1
415; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
416; GFX8-NEXT:    s_add_u32 s8, s8, s3
417; GFX8-NEXT:    s_addc_u32 s9, s9, 0
418; GFX8-NEXT:    s_waitcnt vmcnt(2)
419; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 4, v3
420; GFX8-NEXT:    v_lshlrev_b16_e32 v16, 12, v3
421; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 28, v3
422; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 20, v3
423; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 12, v3
424; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
425; GFX8-NEXT:    s_waitcnt vmcnt(1)
426; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 4, v2
427; GFX8-NEXT:    v_lshlrev_b16_e32 v17, 12, v2
428; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
429; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 20, v2
430; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 12, v2
431; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
432; GFX8-NEXT:    v_lshlrev_b16_sdwa v18, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
433; GFX8-NEXT:    v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
434; GFX8-NEXT:    v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
435; GFX8-NEXT:    v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
436; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v16
437; GFX8-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
438; GFX8-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
439; GFX8-NEXT:    v_lshlrev_b16_e32 v15, 12, v15
440; GFX8-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
441; GFX8-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
442; GFX8-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
443; GFX8-NEXT:    v_ashrrev_i16_e32 v15, 12, v15
444; GFX8-NEXT:    s_waitcnt vmcnt(0)
445; GFX8-NEXT:    v_mad_u16 v4, v5, v16, v4
446; GFX8-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
447; GFX8-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
448; GFX8-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
449; GFX8-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
450; GFX8-NEXT:    v_mad_u16 v4, v10, v15, v4
451; GFX8-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
452; GFX8-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
453; GFX8-NEXT:    v_mad_u16 v4, v9, v14, v4
454; GFX8-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
455; GFX8-NEXT:    v_ashrrev_i16_e32 v18, 12, v19
456; GFX8-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
457; GFX8-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
458; GFX8-NEXT:    v_mad_u16 v4, v8, v13, v4
459; GFX8-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
460; GFX8-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
461; GFX8-NEXT:    v_mad_u16 v4, v17, v18, v4
462; GFX8-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
463; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
464; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
465; GFX8-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
466; GFX8-NEXT:    v_mad_u16 v4, v7, v12, v4
467; GFX8-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
468; GFX8-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
469; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
470; GFX8-NEXT:    v_mad_u16 v2, v6, v11, v2
471; GFX8-NEXT:    flat_store_short v[0:1], v2
472; GFX8-NEXT:    s_endpgm
473;
474; GFX9-LABEL: idot8_acc16:
475; GFX9:       ; %bb.0: ; %entry
476; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
477; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
478; GFX9-NEXT:    s_mov_b32 s10, -1
479; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
480; GFX9-NEXT:    s_add_u32 s8, s8, s3
481; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
482; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
483; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
484; GFX9-NEXT:    v_mov_b32_e32 v4, 12
485; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
486; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
487; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
488; GFX9-NEXT:    v_mov_b32_e32 v0, 0
489; GFX9-NEXT:    global_load_ushort v3, v0, s[2:3]
490; GFX9-NEXT:    s_addc_u32 s9, s9, 0
491; GFX9-NEXT:    s_waitcnt vmcnt(2)
492; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
493; GFX9-NEXT:    s_waitcnt vmcnt(1)
494; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
495; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
496; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 12, v2
497; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 28, v1
498; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
499; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
500; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
501; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
502; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
503; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
504; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
505; GFX9-NEXT:    v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
506; GFX9-NEXT:    v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
507; GFX9-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
508; GFX9-NEXT:    v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
509; GFX9-NEXT:    v_ashrrev_i16_e32 v4, 12, v15
510; GFX9-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
511; GFX9-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
512; GFX9-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
513; GFX9-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
514; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
515; GFX9-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
516; GFX9-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
517; GFX9-NEXT:    s_waitcnt vmcnt(0)
518; GFX9-NEXT:    v_mad_legacy_u16 v3, v4, v15, v3
519; GFX9-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
520; GFX9-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
521; GFX9-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
522; GFX9-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
523; GFX9-NEXT:    v_mad_legacy_u16 v3, v9, v14, v3
524; GFX9-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
525; GFX9-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
526; GFX9-NEXT:    v_mad_legacy_u16 v3, v8, v13, v3
527; GFX9-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
528; GFX9-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
529; GFX9-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
530; GFX9-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
531; GFX9-NEXT:    v_mad_legacy_u16 v3, v7, v12, v3
532; GFX9-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
533; GFX9-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
534; GFX9-NEXT:    v_mad_legacy_u16 v3, v16, v17, v3
535; GFX9-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
536; GFX9-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
537; GFX9-NEXT:    v_lshlrev_b16_e32 v5, 12, v5
538; GFX9-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
539; GFX9-NEXT:    v_mad_legacy_u16 v3, v6, v11, v3
540; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
541; GFX9-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
542; GFX9-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
543; GFX9-NEXT:    v_mad_legacy_u16 v1, v5, v10, v1
544; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
545; GFX9-NEXT:    s_endpgm
546;
547; GFX9-DL-LABEL: idot8_acc16:
548; GFX9-DL:       ; %bb.0: ; %entry
549; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
550; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
551; GFX9-DL-NEXT:    s_mov_b32 s10, -1
552; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
553; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
554; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
555; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
556; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
557; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 12
558; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
559; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
560; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
561; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
562; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
563; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
564; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
565; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
566; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
567; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
568; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
569; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v16, 12, v2
570; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 28, v1
571; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
572; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
573; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
574; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
575; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
576; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
577; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
578; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
579; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
580; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
581; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
582; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v4, 12, v15
583; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
584; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
585; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
586; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
587; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
588; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
589; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
590; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
591; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v4, v15, v3
592; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
593; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
594; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
595; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
596; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v9, v14, v3
597; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
598; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
599; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v8, v13, v3
600; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
601; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
602; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
603; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
604; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v7, v12, v3
605; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
606; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
607; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v16, v17, v3
608; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
609; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
610; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v5, 12, v5
611; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
612; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v6, v11, v3
613; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
614; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
615; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
616; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v10, v1
617; GFX9-DL-NEXT:    global_store_short v0, v1, s[2:3]
618; GFX9-DL-NEXT:    s_endpgm
619;
620; GFX10-DL-XNACK-LABEL: idot8_acc16:
621; GFX10-DL-XNACK:       ; %bb.0: ; %entry
622; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
623; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
624; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
625; GFX10-DL-XNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
626; GFX10-DL-XNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
627; GFX10-DL-XNACK-NEXT:    s_mov_b32 s10, -1
628; GFX10-DL-XNACK-NEXT:    s_mov_b32 s11, 0x31c16000
629; GFX10-DL-XNACK-NEXT:    s_add_u32 s8, s8, s3
630; GFX10-DL-XNACK-NEXT:    s_addc_u32 s9, s9, 0
631; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
632; GFX10-DL-XNACK-NEXT:    s_clause 0x1
633; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[4:5]
634; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[6:7]
635; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v0, 0
636; GFX10-DL-XNACK-NEXT:    global_load_ushort v3, v0, s[0:1]
637; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(2)
638; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
639; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
640; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
641; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
642; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
643; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
644; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
645; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
646; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(1)
647; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v16, 4, v2
648; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v17, 12, v2
649; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
650; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
651; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v13, 20, v2
652; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
653; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
654; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
655; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
656; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
657; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
658; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v17, 12, v17
659; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
660; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v2, 12, v2
661; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
662; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
663; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
664; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v1, v17, v3
665; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v9
666; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v2
667; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
668; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v15
669; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v10, v16, v1
670; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
671; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v14
672; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
673; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
674; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v3, v2, v1
675; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v7
676; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v10
677; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
678; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v13
679; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v8, v9, v1
680; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
681; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v12
682; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
683; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
684; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v2, v3, v1
685; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v5
686; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v8
687; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
688; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v11
689; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v6, v7, v1
690; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
691; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
692; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v2, v3, v1
693; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v4, v5, v1
694; GFX10-DL-XNACK-NEXT:    global_store_short v0, v1, s[0:1]
695; GFX10-DL-XNACK-NEXT:    s_endpgm
696;
697; GFX10-DL-NOXNACK-LABEL: idot8_acc16:
698; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
699; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
700; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
701; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
702; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
703; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
704; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
705; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
706; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s10, -1
707; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s11, 0x31c16000
708; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s8, s8, s3
709; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s9, s9, 0
710; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
711; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
712; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[4:5]
713; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[6:7]
714; GFX10-DL-NOXNACK-NEXT:    global_load_ushort v3, v2, s[0:1]
715; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(2)
716; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
717; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
718; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
719; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
720; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
721; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
722; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
723; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
724; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(1)
725; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v16, 4, v0
726; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v17, 12, v0
727; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v11, 28, v0
728; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v12, 24, v0
729; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v13, 20, v0
730; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
731; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
732; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
733; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
734; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
735; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
736; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v17, 12, v17
737; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
738; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
739; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
740; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
741; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
742; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v1, v1, v17, v3
743; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v9
744; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
745; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
746; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v15
747; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v1, v10, v16, v1
748; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
749; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v14
750; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
751; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
752; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v3, v0, v1
753; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v7
754; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v10
755; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
756; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v13
757; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v8, v9, v0
758; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
759; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v12
760; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
761; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
762; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v1, v3, v0
763; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v5
764; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v8
765; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
766; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v11
767; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v6, v7, v0
768; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
769; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
770; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v1, v3, v0
771; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v4, v5, v0
772; GFX10-DL-NOXNACK-NEXT:    global_store_short v2, v0, s[0:1]
773; GFX10-DL-NOXNACK-NEXT:    s_endpgm
774; GFX10-DL-LABEL: idot8_acc16:
775; GFX10-DL:       ; %bb.0: ; %entry
776; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
777; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
778; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
779; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
780; GFX10-DL-NEXT:    s_mov_b32 s14, -1
781; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
782; GFX10-DL-NEXT:    s_add_u32 s12, s12, s3
783; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
784; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
785; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
786; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
787; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
788; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
789; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
790; GFX10-DL-NEXT:    s_lshr_b32 s2, s0, 12
791; GFX10-DL-NEXT:    s_lshr_b32 s3, s1, 12
792; GFX10-DL-NEXT:    s_bfe_i32 s6, s0, 0x40000
793; GFX10-DL-NEXT:    s_bfe_i32 s7, s1, 0x40000
794; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v2, 12, s2
795; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s3
796; GFX10-DL-NEXT:    s_bfe_i32 s8, s0, 0x40004
797; GFX10-DL-NEXT:    s_bfe_i32 s9, s0, 0x40008
798; GFX10-DL-NEXT:    s_bfe_i32 s10, s1, 0x40008
799; GFX10-DL-NEXT:    s_bfe_i32 s2, s1, 0x40004
800; GFX10-DL-NEXT:    v_mul_i32_i24_e64 v4, s9, s10
801; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v2, 12, v2
802; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v3, 12, v3
803; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40010
804; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
805; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s6, s7, v1
806; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s8, s2, v1
807; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
808; GFX10-DL-NEXT:    v_and_b32_e32 v2, s2, v2
809; GFX10-DL-NEXT:    v_and_b32_e32 v3, s2, v3
810; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
811; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40010
812; GFX10-DL-NEXT:    v_mad_u32_u24 v1, v2, v3, v1
813; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
814; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40014
815; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40014
816; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
817; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40018
818; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40018
819; GFX10-DL-NEXT:    s_ashr_i32 s0, s0, 28
820; GFX10-DL-NEXT:    s_ashr_i32 s1, s1, 28
821; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
822; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s0, s1, v1
823; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
824; GFX10-DL-NEXT:    s_endpgm
825                                       <8 x i4> addrspace(1)* %src2,
826                                       i16 addrspace(1)* nocapture %dst) {
827entry:
828  %idx = call i32 @llvm.amdgcn.workitem.id.x()
829  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
830  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
831  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
832  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
833
834  %v1e0 = extractelement <8 x i4> %vec1, i64 0
835  %cv1e0 = sext i4 %v1e0 to i16
836  %v2e0 = extractelement <8 x i4> %vec2, i64 0
837  %cv2e0 = sext i4 %v2e0 to i16
838  %mul0 = mul nuw nsw i16 %cv1e0, %cv2e0
839
840  %v1e1 = extractelement <8 x i4> %vec1, i64 1
841  %cv1e1 = sext i4 %v1e1 to i16
842  %v2e1 = extractelement <8 x i4> %vec2, i64 1
843  %cv2e1 = sext i4 %v2e1 to i16
844  %mul1 = mul nuw nsw i16 %cv1e1, %cv2e1
845
846  %v1e2 = extractelement <8 x i4> %vec1, i64 2
847  %cv1e2 = sext i4 %v1e2 to i16
848  %v2e2 = extractelement <8 x i4> %vec2, i64 2
849  %cv2e2 = sext i4 %v2e2 to i16
850  %mul2 = mul nuw nsw i16 %cv1e2, %cv2e2
851
852  %v1e3 = extractelement <8 x i4> %vec1, i64 3
853  %cv1e3 = sext i4 %v1e3 to i16
854  %v2e3 = extractelement <8 x i4> %vec2, i64 3
855  %cv2e3 = sext i4 %v2e3 to i16
856  %mul3 = mul nuw nsw i16 %cv1e3, %cv2e3
857
858  %v1e4 = extractelement <8 x i4> %vec1, i64 4
859  %cv1e4 = sext i4 %v1e4 to i16
860  %v2e4 = extractelement <8 x i4> %vec2, i64 4
861  %cv2e4 = sext i4 %v2e4 to i16
862  %mul4 = mul nuw nsw i16 %cv1e4, %cv2e4
863
864  %v1e5 = extractelement <8 x i4> %vec1, i64 5
865  %cv1e5 = sext i4 %v1e5 to i16
866  %v2e5 = extractelement <8 x i4> %vec2, i64 5
867  %cv2e5 = sext i4 %v2e5 to i16
868  %mul5 = mul nuw nsw i16 %cv1e5, %cv2e5
869
870  %v1e6 = extractelement <8 x i4> %vec1, i64 6
871  %cv1e6 = sext i4 %v1e6 to i16
872  %v2e6 = extractelement <8 x i4> %vec2, i64 6
873  %cv2e6 = sext i4 %v2e6 to i16
874  %mul6 = mul nuw nsw i16 %cv1e6, %cv2e6
875
876  %v1e7 = extractelement <8 x i4> %vec1, i64 7
877  %cv1e7 = sext i4 %v1e7 to i16
878  %v2e7 = extractelement <8 x i4> %vec2, i64 7
879  %cv2e7 = sext i4 %v2e7 to i16
880  %mul7 = mul nuw nsw i16 %cv1e7, %cv2e7
881
882  %acc = load i16, i16 addrspace(1)* %dst, align 4
883  %add1 = add i16 %mul0, %acc
884  %add2 = add i16 %add1, %mul1
885  %add3 = add i16 %add2, %mul2
886  %add4 = add i16 %add3, %mul3
887  %add5 = add i16 %add4, %mul4
888  %add6 = add i16 %add5, %mul5
889  %add7 = add i16 %add6, %mul6
890  %add8 = add i16 %add7, %mul7
891
892  store i16 %add8, i16 addrspace(1)* %dst, align 4
893  ret void
894}
895
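896; TODO: Support this pattern (8-bit accumulator): the GFX9-DL and GFX10-DL checks below still expect a v_mad sequence rather than a dot instruction.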
897define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
898; GFX7-LABEL: idot8_acc8:
899; GFX7:       ; %bb.0: ; %entry
900; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
901; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
902; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
903; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
904; GFX7-NEXT:    s_mov_b32 s14, -1
905; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
906; GFX7-NEXT:    s_add_u32 s12, s12, s3
907; GFX7-NEXT:    s_mov_b32 s3, 0xf000
908; GFX7-NEXT:    s_mov_b32 s10, 0
909; GFX7-NEXT:    s_mov_b32 s11, s3
910; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
911; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
912; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
913; GFX7-NEXT:    v_mov_b32_e32 v1, 0
914; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
915; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
916; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
917; GFX7-NEXT:    s_mov_b32 s2, -1
918; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
919; GFX7-NEXT:    s_addc_u32 s13, s13, 0
920; GFX7-NEXT:    s_waitcnt vmcnt(2)
921; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 4
922; GFX7-NEXT:    v_bfe_i32 v4, v2, 4, 4
923; GFX7-NEXT:    s_waitcnt vmcnt(1)
924; GFX7-NEXT:    v_bfe_i32 v10, v0, 0, 4
925; GFX7-NEXT:    v_and_b32_e32 v3, 0xff, v3
926; GFX7-NEXT:    v_bfe_i32 v11, v0, 4, 4
927; GFX7-NEXT:    v_and_b32_e32 v10, 0xff, v10
928; GFX7-NEXT:    v_bfe_i32 v5, v2, 8, 4
929; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v4
930; GFX7-NEXT:    v_bfe_i32 v12, v0, 8, 4
931; GFX7-NEXT:    v_and_b32_e32 v11, 0xff, v11
932; GFX7-NEXT:    s_waitcnt vmcnt(0)
933; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v10, v1
934; GFX7-NEXT:    v_bfe_i32 v6, v2, 12, 4
935; GFX7-NEXT:    v_and_b32_e32 v5, 0xff, v5
936; GFX7-NEXT:    v_bfe_i32 v13, v0, 12, 4
937; GFX7-NEXT:    v_and_b32_e32 v12, 0xff, v12
938; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v11, v1
939; GFX7-NEXT:    v_bfe_i32 v7, v2, 16, 4
940; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v6
941; GFX7-NEXT:    v_bfe_i32 v14, v0, 16, 4
942; GFX7-NEXT:    v_and_b32_e32 v13, 0xff, v13
943; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v12, v1
944; GFX7-NEXT:    v_bfe_i32 v8, v2, 20, 4
945; GFX7-NEXT:    v_and_b32_e32 v7, 0xff, v7
946; GFX7-NEXT:    v_bfe_i32 v15, v0, 20, 4
947; GFX7-NEXT:    v_and_b32_e32 v14, 0xff, v14
948; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v13, v1
949; GFX7-NEXT:    v_bfe_i32 v9, v2, 24, 4
950; GFX7-NEXT:    v_and_b32_e32 v8, 0xff, v8
951; GFX7-NEXT:    v_bfe_i32 v16, v0, 24, 4
952; GFX7-NEXT:    v_and_b32_e32 v15, 0xff, v15
953; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v14, v1
954; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
955; GFX7-NEXT:    v_and_b32_e32 v9, 0xff, v9
956; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
957; GFX7-NEXT:    v_and_b32_e32 v16, 0xff, v16
958; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v15, v1
959; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v2
960; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
961; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v16, v1
962; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
963; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
964; GFX7-NEXT:    s_endpgm
965;
966; GFX8-LABEL: idot8_acc8:
967; GFX8:       ; %bb.0: ; %entry
968; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
969; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
970; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
971; GFX8-NEXT:    v_mov_b32_e32 v5, 12
972; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
973; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
974; GFX8-NEXT:    v_mov_b32_e32 v1, s5
975; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
976; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
977; GFX8-NEXT:    flat_load_dword v3, v[0:1]
978; GFX8-NEXT:    v_mov_b32_e32 v1, s7
979; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
980; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
981; GFX8-NEXT:    flat_load_dword v2, v[0:1]
982; GFX8-NEXT:    v_mov_b32_e32 v0, s0
983; GFX8-NEXT:    v_mov_b32_e32 v1, s1
984; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
985; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
986; GFX8-NEXT:    s_mov_b32 s10, -1
987; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
988; GFX8-NEXT:    s_add_u32 s8, s8, s3
989; GFX8-NEXT:    s_addc_u32 s9, s9, 0
990; GFX8-NEXT:    s_waitcnt vmcnt(2)
991; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 4, v3
992; GFX8-NEXT:    v_lshlrev_b16_e32 v16, 12, v3
993; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 28, v3
994; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 20, v3
995; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 12, v3
996; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
997; GFX8-NEXT:    s_waitcnt vmcnt(1)
998; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 4, v2
999; GFX8-NEXT:    v_lshlrev_b16_e32 v17, 12, v2
1000; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
1001; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 20, v2
1002; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 12, v2
1003; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
1004; GFX8-NEXT:    v_lshlrev_b16_sdwa v18, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1005; GFX8-NEXT:    v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1006; GFX8-NEXT:    v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1007; GFX8-NEXT:    v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1008; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v16
1009; GFX8-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
1010; GFX8-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
1011; GFX8-NEXT:    v_lshlrev_b16_e32 v15, 12, v15
1012; GFX8-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
1013; GFX8-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
1014; GFX8-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
1015; GFX8-NEXT:    v_ashrrev_i16_e32 v15, 12, v15
1016; GFX8-NEXT:    s_waitcnt vmcnt(0)
1017; GFX8-NEXT:    v_mad_u16 v4, v5, v16, v4
1018; GFX8-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
1019; GFX8-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
1020; GFX8-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
1021; GFX8-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
1022; GFX8-NEXT:    v_mad_u16 v4, v10, v15, v4
1023; GFX8-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
1024; GFX8-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
1025; GFX8-NEXT:    v_mad_u16 v4, v9, v14, v4
1026; GFX8-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
1027; GFX8-NEXT:    v_ashrrev_i16_e32 v18, 12, v19
1028; GFX8-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
1029; GFX8-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
1030; GFX8-NEXT:    v_mad_u16 v4, v8, v13, v4
1031; GFX8-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
1032; GFX8-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
1033; GFX8-NEXT:    v_mad_u16 v4, v17, v18, v4
1034; GFX8-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
1035; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
1036; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
1037; GFX8-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
1038; GFX8-NEXT:    v_mad_u16 v4, v7, v12, v4
1039; GFX8-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
1040; GFX8-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
1041; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
1042; GFX8-NEXT:    v_mad_u16 v2, v6, v11, v2
1043; GFX8-NEXT:    flat_store_byte v[0:1], v2
1044; GFX8-NEXT:    s_endpgm
1045;
1046; GFX9-LABEL: idot8_acc8:
1047; GFX9:       ; %bb.0: ; %entry
1048; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1049; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1050; GFX9-NEXT:    s_mov_b32 s10, -1
1051; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
1052; GFX9-NEXT:    s_add_u32 s8, s8, s3
1053; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1054; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1055; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1056; GFX9-NEXT:    v_mov_b32_e32 v4, 12
1057; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1058; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
1059; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
1060; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1061; GFX9-NEXT:    global_load_ubyte v3, v0, s[2:3]
1062; GFX9-NEXT:    s_addc_u32 s9, s9, 0
1063; GFX9-NEXT:    s_waitcnt vmcnt(2)
1064; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
1065; GFX9-NEXT:    s_waitcnt vmcnt(1)
1066; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
1067; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
1068; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 12, v2
1069; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 28, v1
1070; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
1071; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
1072; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
1073; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
1074; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
1075; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
1076; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
1077; GFX9-NEXT:    v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1078; GFX9-NEXT:    v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1079; GFX9-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1080; GFX9-NEXT:    v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1081; GFX9-NEXT:    v_ashrrev_i16_e32 v4, 12, v15
1082; GFX9-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
1083; GFX9-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
1084; GFX9-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
1085; GFX9-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
1086; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
1087; GFX9-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
1088; GFX9-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
1089; GFX9-NEXT:    s_waitcnt vmcnt(0)
1090; GFX9-NEXT:    v_mad_legacy_u16 v3, v4, v15, v3
1091; GFX9-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
1092; GFX9-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
1093; GFX9-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
1094; GFX9-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
1095; GFX9-NEXT:    v_mad_legacy_u16 v3, v9, v14, v3
1096; GFX9-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
1097; GFX9-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
1098; GFX9-NEXT:    v_mad_legacy_u16 v3, v8, v13, v3
1099; GFX9-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
1100; GFX9-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
1101; GFX9-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
1102; GFX9-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
1103; GFX9-NEXT:    v_mad_legacy_u16 v3, v7, v12, v3
1104; GFX9-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
1105; GFX9-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
1106; GFX9-NEXT:    v_mad_legacy_u16 v3, v16, v17, v3
1107; GFX9-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
1108; GFX9-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
1109; GFX9-NEXT:    v_lshlrev_b16_e32 v5, 12, v5
1110; GFX9-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
1111; GFX9-NEXT:    v_mad_legacy_u16 v3, v6, v11, v3
1112; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
1113; GFX9-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
1114; GFX9-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
1115; GFX9-NEXT:    v_mad_legacy_u16 v1, v5, v10, v1
1116; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
1117; GFX9-NEXT:    s_endpgm
1118;
1119; GFX9-DL-LABEL: idot8_acc8:
1120; GFX9-DL:       ; %bb.0: ; %entry
1121; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1122; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1123; GFX9-DL-NEXT:    s_mov_b32 s10, -1
1124; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
1125; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
1126; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1127; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1128; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1129; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 12
1130; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1131; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1132; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1133; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1134; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
1135; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
1136; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
1137; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
1138; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1139; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
1140; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
1141; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v16, 12, v2
1142; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 28, v1
1143; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
1144; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
1145; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
1146; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
1147; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
1148; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
1149; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
1150; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1151; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1152; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1153; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1154; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v4, 12, v15
1155; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
1156; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
1157; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
1158; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
1159; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
1160; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
1161; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
1162; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1163; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v4, v15, v3
1164; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
1165; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
1166; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
1167; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
1168; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v9, v14, v3
1169; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
1170; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
1171; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v8, v13, v3
1172; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
1173; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
1174; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
1175; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
1176; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v7, v12, v3
1177; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
1178; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
1179; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v16, v17, v3
1180; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
1181; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
1182; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v5, 12, v5
1183; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
1184; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v6, v11, v3
1185; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
1186; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
1187; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
1188; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v10, v1
1189; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
1190; GFX9-DL-NEXT:    s_endpgm
1191;
1192; GFX10-DL-XNACK-LABEL: idot8_acc8:
1193; GFX10-DL-XNACK:       ; %bb.0: ; %entry
1194; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1195; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1196; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1197; GFX10-DL-XNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1198; GFX10-DL-XNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1199; GFX10-DL-XNACK-NEXT:    s_mov_b32 s10, -1
1200; GFX10-DL-XNACK-NEXT:    s_mov_b32 s11, 0x31c16000
1201; GFX10-DL-XNACK-NEXT:    s_add_u32 s8, s8, s3
1202; GFX10-DL-XNACK-NEXT:    s_addc_u32 s9, s9, 0
1203; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
1204; GFX10-DL-XNACK-NEXT:    s_clause 0x1
1205; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[4:5]
1206; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[6:7]
1207; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v0, 0
1208; GFX10-DL-XNACK-NEXT:    global_load_ubyte v3, v0, s[0:1]
1209; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(2)
1210; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
1211; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
1212; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
1213; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
1214; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
1215; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
1216; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
1217; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
1218; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(1)
1219; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v16, 4, v2
1220; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v17, 12, v2
1221; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
1222; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
1223; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v13, 20, v2
1224; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
1225; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
1226; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
1227; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
1228; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
1229; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
1230; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v17, 12, v17
1231; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
1232; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v2, 12, v2
1233; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
1234; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
1235; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
1236; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v1, v17, v3
1237; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v9
1238; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v2
1239; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
1240; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v15
1241; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v10, v16, v1
1242; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
1243; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v14
1244; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
1245; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
1246; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v3, v2, v1
1247; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v7
1248; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v10
1249; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
1250; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v13
1251; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v8, v9, v1
1252; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
1253; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v12
1254; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
1255; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
1256; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v2, v3, v1
1257; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v5
1258; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v8
1259; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
1260; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v11
1261; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v6, v7, v1
1262; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
1263; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
1264; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v2, v3, v1
1265; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v4, v5, v1
1266; GFX10-DL-XNACK-NEXT:    global_store_byte v0, v1, s[0:1]
1267; GFX10-DL-XNACK-NEXT:    s_endpgm
1268;
1269; GFX10-DL-NOXNACK-LABEL: idot8_acc8:
1270; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
1271; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
1272; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1273; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1274; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1275; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
1276; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1277; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1278; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s10, -1
1279; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s11, 0x31c16000
1280; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s8, s8, s3
1281; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s9, s9, 0
1282; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
1283; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
1284; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[4:5]
1285; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[6:7]
1286; GFX10-DL-NOXNACK-NEXT:    global_load_ubyte v3, v2, s[0:1]
1287; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(2)
1288; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
1289; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
1290; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
1291; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
1292; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
1293; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
1294; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
1295; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
1296; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(1)
1297; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v16, 4, v0
1298; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v17, 12, v0
1299; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v11, 28, v0
1300; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v12, 24, v0
1301; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v13, 20, v0
1302; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
1303; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
1304; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
1305; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
1306; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
1307; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
1308; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v17, 12, v17
1309; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
1310; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
1311; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
1312; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
1313; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
1314; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v1, v1, v17, v3
1315; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v9
1316; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
1317; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
1318; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v15
1319; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v1, v10, v16, v1
1320; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
1321; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v14
1322; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
1323; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
1324; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v3, v0, v1
1325; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v7
1326; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v10
1327; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
1328; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v13
1329; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v8, v9, v0
1330; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
1331; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v12
1332; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
1333; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
1334; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v1, v3, v0
1335; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v5
1336; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v8
1337; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
1338; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v11
1339; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v6, v7, v0
1340; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
1341; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
1342; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v1, v3, v0
1343; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v4, v5, v0
1344; GFX10-DL-NOXNACK-NEXT:    global_store_byte v2, v0, s[0:1]
1345; GFX10-DL-NOXNACK-NEXT:    s_endpgm
1346; GFX10-DL-LABEL: idot8_acc8:
1347; GFX10-DL:       ; %bb.0: ; %entry
1348; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
1349; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
1350; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1351; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1352; GFX10-DL-NEXT:    s_mov_b32 s14, -1
1353; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
1354; GFX10-DL-NEXT:    s_add_u32 s12, s12, s3
1355; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1356; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
1357; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1358; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
1359; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
1360; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
1361; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1362; GFX10-DL-NEXT:    s_lshr_b32 s2, s0, 12
1363; GFX10-DL-NEXT:    s_lshr_b32 s3, s1, 12
1364; GFX10-DL-NEXT:    s_bfe_i32 s6, s0, 0x40000
1365; GFX10-DL-NEXT:    s_bfe_i32 s7, s1, 0x40000
1366; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v2, 12, s2
1367; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s3
1368; GFX10-DL-NEXT:    s_bfe_i32 s8, s0, 0x40004
1369; GFX10-DL-NEXT:    s_bfe_i32 s9, s0, 0x40008
1370; GFX10-DL-NEXT:    s_bfe_i32 s10, s1, 0x40008
1371; GFX10-DL-NEXT:    s_bfe_i32 s2, s1, 0x40004
1372; GFX10-DL-NEXT:    v_mul_i32_i24_e64 v4, s9, s10
1373; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v2, 12, v2
1374; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v3, 12, v3
1375; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40010
1376; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1377; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s6, s7, v1
1378; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s8, s2, v1
1379; GFX10-DL-NEXT:    s_movk_i32 s2, 0xff
1380; GFX10-DL-NEXT:    v_and_b32_e32 v2, s2, v2
1381; GFX10-DL-NEXT:    v_and_b32_e32 v3, s2, v3
1382; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1383; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40010
1384; GFX10-DL-NEXT:    v_mad_u32_u24 v1, v2, v3, v1
1385; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
1386; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40014
1387; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40014
1388; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
1389; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40018
1390; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40018
1391; GFX10-DL-NEXT:    s_ashr_i32 s0, s0, 28
1392; GFX10-DL-NEXT:    s_ashr_i32 s1, s1, 28
1393; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
1394; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s0, s1, v1
1395; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
1396; GFX10-DL-NEXT:    s_endpgm
1397                                       <8 x i4> addrspace(1)* %src2,
1398                                       i8 addrspace(1)* nocapture %dst) {
1399entry:
1400  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1401  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
1402  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
1403  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
1404  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
1405
1406  %v1e0 = extractelement <8 x i4> %vec1, i64 0
1407  %cv1e0 = sext i4 %v1e0 to i8
1408  %v2e0 = extractelement <8 x i4> %vec2, i64 0
1409  %cv2e0 = sext i4 %v2e0 to i8
1410  %mul0 = mul nuw nsw i8 %cv1e0, %cv2e0
1411
1412  %v1e1 = extractelement <8 x i4> %vec1, i64 1
1413  %cv1e1 = sext i4 %v1e1 to i8
1414  %v2e1 = extractelement <8 x i4> %vec2, i64 1
1415  %cv2e1 = sext i4 %v2e1 to i8
1416  %mul1 = mul nuw nsw i8 %cv1e1, %cv2e1
1417
1418  %v1e2 = extractelement <8 x i4> %vec1, i64 2
1419  %cv1e2 = sext i4 %v1e2 to i8
1420  %v2e2 = extractelement <8 x i4> %vec2, i64 2
1421  %cv2e2 = sext i4 %v2e2 to i8
1422  %mul2 = mul nuw nsw i8 %cv1e2, %cv2e2
1423
1424  %v1e3 = extractelement <8 x i4> %vec1, i64 3
1425  %cv1e3 = sext i4 %v1e3 to i8
1426  %v2e3 = extractelement <8 x i4> %vec2, i64 3
1427  %cv2e3 = sext i4 %v2e3 to i8
1428  %mul3 = mul nuw nsw i8 %cv1e3, %cv2e3
1429
1430  %v1e4 = extractelement <8 x i4> %vec1, i64 4
1431  %cv1e4 = sext i4 %v1e4 to i8
1432  %v2e4 = extractelement <8 x i4> %vec2, i64 4
1433  %cv2e4 = sext i4 %v2e4 to i8
1434  %mul4 = mul nuw nsw i8 %cv1e4, %cv2e4
1435
1436  %v1e5 = extractelement <8 x i4> %vec1, i64 5
1437  %cv1e5 = sext i4 %v1e5 to i8
1438  %v2e5 = extractelement <8 x i4> %vec2, i64 5
1439  %cv2e5 = sext i4 %v2e5 to i8
1440  %mul5 = mul nuw nsw i8 %cv1e5, %cv2e5
1441
1442  %v1e6 = extractelement <8 x i4> %vec1, i64 6
1443  %cv1e6 = sext i4 %v1e6 to i8
1444  %v2e6 = extractelement <8 x i4> %vec2, i64 6
1445  %cv2e6 = sext i4 %v2e6 to i8
1446  %mul6 = mul nuw nsw i8 %cv1e6, %cv2e6
1447
1448  %v1e7 = extractelement <8 x i4> %vec1, i64 7
1449  %cv1e7 = sext i4 %v1e7 to i8
1450  %v2e7 = extractelement <8 x i4> %vec2, i64 7
1451  %cv2e7 = sext i4 %v2e7 to i8
1452  %mul7 = mul nuw nsw i8 %cv1e7, %cv2e7
1453
1454  %acc = load i8, i8 addrspace(1)* %dst, align 4
1455  %add1 = add i8 %mul0, %acc
1456  %add2 = add i8 %add1, %mul1
1457  %add3 = add i8 %add2, %mul2
1458  %add4 = add i8 %add3, %mul3
1459  %add5 = add i8 %add4, %mul4
1460  %add6 = add i8 %add5, %mul5
1461  %add7 = add i8 %add6, %mul6
1462  %add8 = add i8 %add7, %mul7
1463
1464  store i8 %add8, i8 addrspace(1)* %dst, align 4
1465  ret void
1466}
1467
; Make sure the idot8 pattern is not recognized if there are multiple uses of
; the intermediate multiplications. In this kernel %mul0 feeds both %add and
; %add1, and %add is used again by the final %res, so the expected output for
; every target below is the expanded per-element multiply/add sequence.
define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
; GFX7-LABEL: idot8_multiuses_mul1:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s14, -1
; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
; GFX7-NEXT:    s_add_u32 s12, s12, s3
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s10, 0
; GFX7-NEXT:    s_mov_b32 s11, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_addc_u32 s13, s13, 0
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 4
; GFX7-NEXT:    v_bfe_i32 v3, v2, 4, 4
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_bfe_i32 v9, v0, 0, 4
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mad_i32_i24 v16, v1, v9, s4
; GFX7-NEXT:    v_bfe_i32 v10, v0, 4, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v9, v16
; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 4
; GFX7-NEXT:    v_bfe_i32 v11, v0, 8, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v10, v1
; GFX7-NEXT:    v_bfe_i32 v5, v2, 12, 4
; GFX7-NEXT:    v_bfe_i32 v12, v0, 12, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v11, v1
; GFX7-NEXT:    v_bfe_i32 v6, v2, 16, 4
; GFX7-NEXT:    v_bfe_i32 v13, v0, 16, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v5, v12, v1
; GFX7-NEXT:    v_bfe_i32 v7, v2, 20, 4
; GFX7-NEXT:    v_bfe_i32 v14, v0, 20, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v6, v13, v1
; GFX7-NEXT:    v_bfe_i32 v8, v2, 24, 4
; GFX7-NEXT:    v_bfe_i32 v15, v0, 24, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v7, v14, v1
; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
; GFX7-NEXT:    v_mad_i32_i24 v1, v8, v15, v1
; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: idot8_multiuses_mul1:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX8-NEXT:    s_mov_b32 s10, -1
; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
; GFX8-NEXT:    s_add_u32 s8, s8, s3
; GFX8-NEXT:    s_addc_u32 s9, s9, 0
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 4
; GFX8-NEXT:    v_bfe_i32 v4, v3, 4, 4
; GFX8-NEXT:    v_bfe_i32 v6, v3, 8, 4
; GFX8-NEXT:    v_bfe_i32 v8, v3, 12, 4
; GFX8-NEXT:    v_bfe_i32 v10, v3, 16, 4
; GFX8-NEXT:    v_bfe_i32 v12, v3, 20, 4
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 4
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mad_i32_i24 v16, v1, v2, s2
; GFX8-NEXT:    v_bfe_i32 v5, v0, 4, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, v16
; GFX8-NEXT:    v_bfe_i32 v7, v0, 8, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT:    v_bfe_i32 v9, v0, 12, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v6, v7, v1
; GFX8-NEXT:    v_bfe_i32 v11, v0, 16, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v8, v9, v1
; GFX8-NEXT:    v_bfe_i32 v13, v0, 20, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v10, v11, v1
; GFX8-NEXT:    v_bfe_i32 v14, v3, 24, 4
; GFX8-NEXT:    v_bfe_i32 v15, v0, 24, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v12, v13, v1
; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 28, v3
; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
; GFX8-NEXT:    v_mad_i32_i24 v1, v14, v15, v1
; GFX8-NEXT:    v_mad_i32_i24 v0, v3, v0, v1
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v16, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: idot8_multiuses_mul1:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s10, -1
; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
; GFX9-NEXT:    s_add_u32 s8, s8, s3
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_addc_u32 s9, s9, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_bfe_i32 v3, v1, 0, 4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_bfe_i32 v4, v2, 0, 4
; GFX9-NEXT:    v_bfe_i32 v5, v1, 4, 4
; GFX9-NEXT:    v_bfe_i32 v6, v2, 4, 4
; GFX9-NEXT:    v_bfe_i32 v7, v1, 8, 4
; GFX9-NEXT:    v_bfe_i32 v8, v2, 8, 4
; GFX9-NEXT:    v_bfe_i32 v9, v1, 12, 4
; GFX9-NEXT:    v_bfe_i32 v10, v2, 12, 4
; GFX9-NEXT:    v_bfe_i32 v11, v1, 16, 4
; GFX9-NEXT:    v_bfe_i32 v12, v2, 16, 4
; GFX9-NEXT:    v_bfe_i32 v13, v1, 20, 4
; GFX9-NEXT:    v_bfe_i32 v14, v2, 20, 4
; GFX9-NEXT:    v_bfe_i32 v15, v1, 24, 4
; GFX9-NEXT:    v_bfe_i32 v16, v2, 24, 4
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
; GFX9-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mad_i32_i24 v2, v3, v4, s0
; GFX9-NEXT:    v_mul_i32_i24_e32 v5, v5, v6
; GFX9-NEXT:    v_mul_i32_i24_e32 v6, v7, v8
; GFX9-NEXT:    v_mad_i32_i24 v3, v3, v4, v2
; GFX9-NEXT:    v_mul_i32_i24_e32 v7, v9, v10
; GFX9-NEXT:    v_mul_i32_i24_e32 v8, v11, v12
; GFX9-NEXT:    v_add3_u32 v3, v3, v5, v6
; GFX9-NEXT:    v_mul_i32_i24_e32 v9, v13, v14
; GFX9-NEXT:    v_mul_i32_i24_e32 v10, v15, v16
; GFX9-NEXT:    v_add3_u32 v3, v3, v7, v8
; GFX9-NEXT:    v_add3_u32 v3, v3, v9, v10
; GFX9-NEXT:    v_add3_u32 v1, v3, v1, v2
; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: idot8_multiuses_mul1:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT:    s_mov_b32 s10, -1
; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
; GFX9-DL-NEXT:    v_bfe_i32 v3, v1, 0, 4
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_bfe_i32 v4, v2, 0, 4
; GFX9-DL-NEXT:    v_bfe_i32 v5, v1, 4, 4
; GFX9-DL-NEXT:    v_bfe_i32 v6, v2, 4, 4
; GFX9-DL-NEXT:    v_bfe_i32 v7, v1, 8, 4
; GFX9-DL-NEXT:    v_bfe_i32 v8, v2, 8, 4
; GFX9-DL-NEXT:    v_bfe_i32 v9, v1, 12, 4
; GFX9-DL-NEXT:    v_bfe_i32 v10, v2, 12, 4
; GFX9-DL-NEXT:    v_bfe_i32 v11, v1, 16, 4
; GFX9-DL-NEXT:    v_bfe_i32 v12, v2, 16, 4
; GFX9-DL-NEXT:    v_bfe_i32 v13, v1, 20, 4
; GFX9-DL-NEXT:    v_bfe_i32 v14, v2, 20, 4
; GFX9-DL-NEXT:    v_bfe_i32 v15, v1, 24, 4
; GFX9-DL-NEXT:    v_bfe_i32 v16, v2, 24, 4
; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    v_mad_i32_i24 v2, v3, v4, s0
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v5, v5, v6
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v6, v7, v8
; GFX9-DL-NEXT:    v_mad_i32_i24 v3, v3, v4, v2
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v7, v9, v10
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v8, v11, v12
; GFX9-DL-NEXT:    v_add3_u32 v3, v3, v5, v6
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v9, v13, v14
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v10, v15, v16
; GFX9-DL-NEXT:    v_add3_u32 v3, v3, v7, v8
; GFX9-DL-NEXT:    v_add3_u32 v3, v3, v9, v10
; GFX9-DL-NEXT:    v_add3_u32 v1, v3, v1, v2
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-XNACK-LABEL: idot8_multiuses_mul1:
; GFX10-DL-XNACK:       ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT:    s_add_u32 s8, s8, s3
; GFX10-DL-XNACK-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT:    s_clause 0x1
; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(1)
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v0, v1, 0, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v3, v1, 4, 4
; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v4, v2, 4, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v5, v1, 8, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v6, v2, 8, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v7, v2, 0, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v8, v1, 12, 4
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v3, v3, v4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v9, v2, 12, 4
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v4, v5, v6
; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT:    v_mad_i32_i24 v5, v0, v7, s2
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v6, v1, 16, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v10, v2, 16, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v11, v1, 20, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v12, v2, 20, 4
; GFX10-DL-XNACK-NEXT:    v_mad_i32_i24 v0, v0, v7, v5
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v7, v1, 24, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v13, v2, 24, 4
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v8, v8, v9
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v6, v6, v10
; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v0, v3, v4
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v3, v11, v12
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v4, v7, v13
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v0, v8, v6
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v0, v3, v4
; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v0, v1, v5
; GFX10-DL-XNACK-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX10-DL-XNACK-NEXT:    s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_multiuses_mul1:
; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s8, s8, s3
; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[6:7]
; GFX10-DL-NOXNACK-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(1)
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v2, v1, 0, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v3, v1, 4, 4
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v4, v0, 4, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v5, v1, 8, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v6, v0, 8, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v7, v0, 0, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v8, v1, 12, 4
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v3, v3, v4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v9, v0, 12, 4
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v4, v5, v6
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    v_mad_i32_i24 v5, v2, v7, s2
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v6, v1, 16, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v10, v0, 16, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v11, v1, 20, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v12, v0, 20, 4
; GFX10-DL-NOXNACK-NEXT:    v_mad_i32_i24 v2, v2, v7, v5
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v7, v1, 24, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v13, v0, 24, 4
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v8, v8, v9
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v6, v6, v10
; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v2, v2, v3, v4
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v3, v11, v12
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v4, v7, v13
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v2, v2, v8, v6
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v0, v1, v0
; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v1, v2, v3, v4
; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v0, v1, v0, v5
; GFX10-DL-NOXNACK-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX10-DL-NOXNACK-NEXT:    s_endpgm
                                                <8 x i4> addrspace(1)* %src2,
                                                i32 addrspace(1)* nocapture %dst) {
entry:
  ; Each work-item indexes both source arrays by its workitem id and loads one
  ; <8 x i4> vector from each.
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2

  ; For every lane: extract the i4 element from each vector, sign-extend it to
  ; i32 and multiply the two extended values.
  %v1e0 = extractelement <8 x i4> %vec1, i64 0
  %cv1e0 = sext i4 %v1e0 to i32
  %v2e0 = extractelement <8 x i4> %vec2, i64 0
  %cv2e0 = sext i4 %v2e0 to i32
  %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0

  %v1e1 = extractelement <8 x i4> %vec1, i64 1
  %cv1e1 = sext i4 %v1e1 to i32
  %v2e1 = extractelement <8 x i4> %vec2, i64 1
  %cv2e1 = sext i4 %v2e1 to i32
  %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1

  %v1e2 = extractelement <8 x i4> %vec1, i64 2
  %cv1e2 = sext i4 %v1e2 to i32
  %v2e2 = extractelement <8 x i4> %vec2, i64 2
  %cv2e2 = sext i4 %v2e2 to i32
  %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2

  %v1e3 = extractelement <8 x i4> %vec1, i64 3
  %cv1e3 = sext i4 %v1e3 to i32
  %v2e3 = extractelement <8 x i4> %vec2, i64 3
  %cv2e3 = sext i4 %v2e3 to i32
  %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3

  %v1e4 = extractelement <8 x i4> %vec1, i64 4
  %cv1e4 = sext i4 %v1e4 to i32
  %v2e4 = extractelement <8 x i4> %vec2, i64 4
  %cv2e4 = sext i4 %v2e4 to i32
  %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4

  %v1e5 = extractelement <8 x i4> %vec1, i64 5
  %cv1e5 = sext i4 %v1e5 to i32
  %v2e5 = extractelement <8 x i4> %vec2, i64 5
  %cv2e5 = sext i4 %v2e5 to i32
  %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5

  %v1e6 = extractelement <8 x i4> %vec1, i64 6
  %cv1e6 = sext i4 %v1e6 to i32
  %v2e6 = extractelement <8 x i4> %vec2, i64 6
  %cv2e6 = sext i4 %v2e6 to i32
  %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6

  %v1e7 = extractelement <8 x i4> %vec1, i64 7
  %cv1e7 = sext i4 %v1e7 to i32
  %v2e7 = extractelement <8 x i4> %vec2, i64 7
  %cv2e7 = sext i4 %v2e7 to i32
  %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7

  ; The multiple uses under test: %mul0 is added twice (into %add and again
  ; into %add1), and %add is consumed both by the chain and by %res below.
  %acc = load i32, i32 addrspace(1)* %dst, align 4
  %add = add i32 %mul0, %acc
  %add1 = add i32 %mul0, %add
  %add2 = add i32 %add1, %mul1
  %add3 = add i32 %add2, %mul2
  %add4 = add i32 %add3, %mul3
  %add5 = add i32 %add4, %mul4
  %add6 = add i32 %add5, %mul5
  %add7 = add i32 %add6, %mul6
  %add8 = add i32 %add7, %mul7

  %res = add i32 %add, %add8
  store i32 %res, i32 addrspace(1)* %dst, align 4
  ret void
}
1903
1904; TODO: Support this pattern.
1905define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
1906; GFX7-LABEL: idot8_acc32_vecMul:
1907; GFX7:       ; %bb.0: ; %entry
1908; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1909; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1910; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1911; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1912; GFX7-NEXT:    s_mov_b32 s14, -1
1913; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
1914; GFX7-NEXT:    s_add_u32 s12, s12, s3
1915; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1916; GFX7-NEXT:    s_mov_b32 s10, 0
1917; GFX7-NEXT:    s_mov_b32 s11, s3
1918; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1919; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1920; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1921; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1922; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1923; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1924; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1925; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
1926; GFX7-NEXT:    s_mov_b32 s2, -1
1927; GFX7-NEXT:    s_addc_u32 s13, s13, 0
1928; GFX7-NEXT:    s_waitcnt vmcnt(1)
1929; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 28, v2
1930; GFX7-NEXT:    v_bfe_i32 v3, v2, 24, 4
1931; GFX7-NEXT:    v_bfe_i32 v4, v2, 20, 4
1932; GFX7-NEXT:    v_bfe_i32 v5, v2, 16, 4
1933; GFX7-NEXT:    v_bfe_i32 v6, v2, 12, 4
1934; GFX7-NEXT:    v_bfe_i32 v7, v2, 8, 4
1935; GFX7-NEXT:    v_bfe_i32 v8, v2, 4, 4
1936; GFX7-NEXT:    v_bfe_i32 v2, v2, 0, 4
1937; GFX7-NEXT:    s_waitcnt vmcnt(0)
1938; GFX7-NEXT:    v_ashrrev_i32_e32 v9, 28, v0
1939; GFX7-NEXT:    v_bfe_i32 v10, v0, 24, 4
1940; GFX7-NEXT:    v_bfe_i32 v11, v0, 20, 4
1941; GFX7-NEXT:    v_bfe_i32 v12, v0, 16, 4
1942; GFX7-NEXT:    v_bfe_i32 v13, v0, 12, 4
1943; GFX7-NEXT:    v_bfe_i32 v14, v0, 8, 4
1944; GFX7-NEXT:    v_bfe_i32 v15, v0, 4, 4
1945; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 4
1946; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1947; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, s4
1948; GFX7-NEXT:    v_mad_i32_i24 v0, v8, v15, v0
1949; GFX7-NEXT:    v_mad_i32_i24 v0, v7, v14, v0
1950; GFX7-NEXT:    v_mad_i32_i24 v0, v6, v13, v0
1951; GFX7-NEXT:    v_mad_i32_i24 v0, v5, v12, v0
1952; GFX7-NEXT:    v_mad_i32_i24 v0, v4, v11, v0
1953; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v10, v0
1954; GFX7-NEXT:    v_mad_i32_i24 v0, v1, v9, v0
1955; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1956; GFX7-NEXT:    s_endpgm
1957;
1958; GFX8-LABEL: idot8_acc32_vecMul:
1959; GFX8:       ; %bb.0: ; %entry
1960; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1961; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1962; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1963; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1964; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1965; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1966; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1967; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1968; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1969; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1970; GFX8-NEXT:    v_mov_b32_e32 v1, s7
1971; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
1972; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1973; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1974; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
1975; GFX8-NEXT:    s_mov_b32 s10, -1
1976; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
1977; GFX8-NEXT:    s_add_u32 s8, s8, s3
1978; GFX8-NEXT:    s_addc_u32 s9, s9, 0
1979; GFX8-NEXT:    s_waitcnt vmcnt(1)
1980; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 28, v3
1981; GFX8-NEXT:    v_bfe_i32 v2, v3, 24, 4
1982; GFX8-NEXT:    v_bfe_i32 v4, v3, 20, 4
1983; GFX8-NEXT:    v_bfe_i32 v5, v3, 16, 4
1984; GFX8-NEXT:    v_bfe_i32 v6, v3, 12, 4
1985; GFX8-NEXT:    v_bfe_i32 v7, v3, 8, 4
1986; GFX8-NEXT:    v_bfe_i32 v8, v3, 4, 4
1987; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 4
1988; GFX8-NEXT:    s_waitcnt vmcnt(0)
1989; GFX8-NEXT:    v_ashrrev_i32_e32 v9, 28, v0
1990; GFX8-NEXT:    v_bfe_i32 v10, v0, 24, 4
1991; GFX8-NEXT:    v_bfe_i32 v11, v0, 20, 4
1992; GFX8-NEXT:    v_bfe_i32 v12, v0, 16, 4
1993; GFX8-NEXT:    v_bfe_i32 v13, v0, 12, 4
1994; GFX8-NEXT:    v_bfe_i32 v14, v0, 8, 4
1995; GFX8-NEXT:    v_bfe_i32 v15, v0, 4, 4
1996; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 4
1997; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1998; GFX8-NEXT:    v_mad_i32_i24 v0, v3, v0, s2
1999; GFX8-NEXT:    v_mad_i32_i24 v0, v8, v15, v0
2000; GFX8-NEXT:    v_mad_i32_i24 v0, v7, v14, v0
2001; GFX8-NEXT:    v_mad_i32_i24 v0, v6, v13, v0
2002; GFX8-NEXT:    v_mad_i32_i24 v0, v5, v12, v0
2003; GFX8-NEXT:    v_mad_i32_i24 v0, v4, v11, v0
2004; GFX8-NEXT:    v_mad_i32_i24 v0, v2, v10, v0
2005; GFX8-NEXT:    v_mad_i32_i24 v2, v1, v9, v0
2006; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2007; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2008; GFX8-NEXT:    flat_store_dword v[0:1], v2
2009; GFX8-NEXT:    s_endpgm
2010;
2011; GFX9-LABEL: idot8_acc32_vecMul:
2012; GFX9:       ; %bb.0: ; %entry
2013; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2014; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2015; GFX9-NEXT:    s_mov_b32 s10, -1
2016; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
2017; GFX9-NEXT:    s_add_u32 s8, s8, s3
2018; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2019; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2020; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2021; GFX9-NEXT:    s_addc_u32 s9, s9, 0
2022; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2023; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
2024; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
2025; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
2026; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2027; GFX9-NEXT:    s_waitcnt vmcnt(1)
2028; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 28, v1
2029; GFX9-NEXT:    v_bfe_i32 v4, v1, 24, 4
2030; GFX9-NEXT:    v_bfe_i32 v5, v1, 20, 4
2031; GFX9-NEXT:    v_bfe_i32 v6, v1, 16, 4
2032; GFX9-NEXT:    v_bfe_i32 v7, v1, 12, 4
2033; GFX9-NEXT:    v_bfe_i32 v8, v1, 8, 4
2034; GFX9-NEXT:    v_bfe_i32 v9, v1, 4, 4
2035; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 4
2036; GFX9-NEXT:    s_waitcnt vmcnt(0)
2037; GFX9-NEXT:    v_ashrrev_i32_e32 v10, 28, v2
2038; GFX9-NEXT:    v_bfe_i32 v11, v2, 24, 4
2039; GFX9-NEXT:    v_bfe_i32 v12, v2, 20, 4
2040; GFX9-NEXT:    v_bfe_i32 v13, v2, 16, 4
2041; GFX9-NEXT:    v_bfe_i32 v14, v2, 12, 4
2042; GFX9-NEXT:    v_bfe_i32 v15, v2, 8, 4
2043; GFX9-NEXT:    v_bfe_i32 v16, v2, 4, 4
2044; GFX9-NEXT:    v_bfe_i32 v2, v2, 0, 4
2045; GFX9-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
2046; GFX9-NEXT:    v_mul_i32_i24_e32 v2, v9, v16
2047; GFX9-NEXT:    v_mul_i32_i24_e32 v8, v8, v15
2048; GFX9-NEXT:    v_mul_i32_i24_e32 v7, v7, v14
2049; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2050; GFX9-NEXT:    v_add3_u32 v1, v1, s0, v2
2051; GFX9-NEXT:    v_mul_i32_i24_e32 v6, v6, v13
2052; GFX9-NEXT:    v_mul_i32_i24_e32 v5, v5, v12
2053; GFX9-NEXT:    v_add3_u32 v1, v1, v8, v7
2054; GFX9-NEXT:    v_mul_i32_i24_e32 v4, v4, v11
2055; GFX9-NEXT:    v_mul_i32_i24_e32 v3, v3, v10
2056; GFX9-NEXT:    v_add3_u32 v1, v1, v6, v5
2057; GFX9-NEXT:    v_add3_u32 v1, v1, v4, v3
2058; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
2059; GFX9-NEXT:    s_endpgm
2060;
2061; GFX9-DL-LABEL: idot8_acc32_vecMul:
2062; GFX9-DL:       ; %bb.0: ; %entry
2063; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2064; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2065; GFX9-DL-NEXT:    s_mov_b32 s10, -1
2066; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
2067; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
2068; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2069; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2070; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2071; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
2072; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2073; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2074; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2075; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
2076; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2077; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2078; GFX9-DL-NEXT:    v_dot8_i32_i4 v1, v1, v2, s0
2079; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
2080; GFX9-DL-NEXT:    s_endpgm
2081;
2082; GFX10-DL-XNACK-LABEL: idot8_acc32_vecMul:
2083; GFX10-DL-XNACK:       ; %bb.0: ; %entry
2084; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2085; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2086; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2087; GFX10-DL-XNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2088; GFX10-DL-XNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2089; GFX10-DL-XNACK-NEXT:    s_mov_b32 s10, -1
2090; GFX10-DL-XNACK-NEXT:    s_mov_b32 s11, 0x31c16000
2091; GFX10-DL-XNACK-NEXT:    s_add_u32 s8, s8, s3
2092; GFX10-DL-XNACK-NEXT:    s_addc_u32 s9, s9, 0
2093; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
2094; GFX10-DL-XNACK-NEXT:    s_clause 0x1
2095; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[4:5]
2096; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[6:7]
2097; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v0, 0
2098; GFX10-DL-XNACK-NEXT:    s_load_dword s2, s[0:1], 0x0
2099; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2100; GFX10-DL-XNACK-NEXT:    v_dot8_i32_i4 v1, v1, v2, s2
2101; GFX10-DL-XNACK-NEXT:    global_store_dword v0, v1, s[0:1]
2102; GFX10-DL-XNACK-NEXT:    s_endpgm
2103;
2104; GFX10-DL-NOXNACK-LABEL: idot8_acc32_vecMul:
2105; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
2106; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2107; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2108; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2109; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2110; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
2111; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2112; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s10, -1
2113; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s11, 0x31c16000
2114; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s8, s8, s3
2115; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s9, s9, 0
2116; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
2117; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
2118; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[4:5]
2119; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[6:7]
2120; GFX10-DL-NOXNACK-NEXT:    s_load_dword s2, s[0:1], 0x0
2121; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2122; GFX10-DL-NOXNACK-NEXT:    v_dot8_i32_i4 v0, v1, v0, s2
2123; GFX10-DL-NOXNACK-NEXT:    global_store_dword v2, v0, s[0:1]
2124; GFX10-DL-NOXNACK-NEXT:    s_endpgm
2125; GFX10-DL-LABEL: idot8_acc32_vecMul:
2126; GFX10-DL:       ; %bb.0: ; %entry
2127; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2128; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2129; GFX10-DL-NEXT:    s_mov_b32 s10, -1
2130; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
2131; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
2132; GFX10-DL-NEXT:    s_clause 0x1
2133; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
2134; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2135; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
2136; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
2137; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2138; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
2139; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
2140; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
2141; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2142; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
2143; GFX10-DL-NEXT:    v_dot8_i32_i4 v0, s0, s1, v0
2144; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
2145; GFX10-DL-NEXT:    s_endpgm
2146                                              <8 x i4> addrspace(1)* %src2,
2147                                              i32 addrspace(1)* nocapture %dst) {
2148entry:
2149  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2150  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
2151  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
2152  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
2153  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
2154
2155  %cvec1 = sext <8 x i4> %vec1 to <8 x i32>
2156  %cvec2 = sext <8 x i4> %vec2 to <8 x i32>
2157
2158  %mul = mul <8 x i32> %cvec1, %cvec2
2159  %mul0 = extractelement <8 x i32> %mul, i64 0
2160  %mul1 = extractelement <8 x i32> %mul, i64 1
2161  %mul2 = extractelement <8 x i32> %mul, i64 2
2162  %mul3 = extractelement <8 x i32> %mul, i64 3
2163  %mul4 = extractelement <8 x i32> %mul, i64 4
2164  %mul5 = extractelement <8 x i32> %mul, i64 5
2165  %mul6 = extractelement <8 x i32> %mul, i64 6
2166  %mul7 = extractelement <8 x i32> %mul, i64 7
2167
2168  %acc = load i32, i32 addrspace(1)* %dst, align 4
2169  %add1 = add i32 %mul0, %acc
2170  %add2 = add i32 %add1, %mul1
2171  %add3 = add i32 %add2, %mul2
2172  %add4 = add i32 %add3, %mul3
2173  %add5 = add i32 %add4, %mul4
2174  %add6 = add i32 %add5, %mul5
2175  %add7 = add i32 %add6, %mul6
2176  %add8 = add i32 %add7, %mul7
2177
2178  store i32 %add8, i32 addrspace(1)* %dst, align 4
2179  ret void
2180}
2181
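2182; TODO: Support this pattern; the DL targets checked below still expand it to packed 16-bit multiplies and 16-bit adds instead of selecting a dot instruction.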
2183define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
2184; GFX7-LABEL: idot8_acc16_vecMul:
2185; GFX7:       ; %bb.0: ; %entry
2186; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2187; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2188; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2189; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2190; GFX7-NEXT:    s_mov_b32 s14, -1
2191; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
2192; GFX7-NEXT:    s_add_u32 s12, s12, s3
2193; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2194; GFX7-NEXT:    s_mov_b32 s10, 0
2195; GFX7-NEXT:    s_mov_b32 s11, s3
2196; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2197; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
2198; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2199; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2200; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2201; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
2202; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2203; GFX7-NEXT:    s_mov_b32 s2, -1
2204; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
2205; GFX7-NEXT:    s_addc_u32 s13, s13, 0
2206; GFX7-NEXT:    s_waitcnt vmcnt(2)
2207; GFX7-NEXT:    v_bfe_i32 v3, v2, 20, 4
2208; GFX7-NEXT:    v_bfe_i32 v4, v2, 16, 4
2209; GFX7-NEXT:    v_bfe_i32 v5, v2, 4, 4
2210; GFX7-NEXT:    v_bfe_i32 v6, v2, 0, 4
2211; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2212; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
2213; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
2214; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v6
2215; GFX7-NEXT:    s_waitcnt vmcnt(1)
2216; GFX7-NEXT:    v_bfe_i32 v10, v0, 20, 4
2217; GFX7-NEXT:    v_bfe_i32 v11, v0, 16, 4
2218; GFX7-NEXT:    v_bfe_i32 v12, v0, 4, 4
2219; GFX7-NEXT:    v_bfe_i32 v13, v0, 0, 4
2220; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
2221; GFX7-NEXT:    v_or_b32_e32 v4, v6, v5
2222; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
2223; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v11
2224; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
2225; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff, v13
2226; GFX7-NEXT:    v_bfe_i32 v14, v0, 24, 4
2227; GFX7-NEXT:    v_ashrrev_i32_e32 v16, 28, v0
2228; GFX7-NEXT:    v_or_b32_e32 v5, v6, v5
2229; GFX7-NEXT:    v_or_b32_e32 v6, v11, v10
2230; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff, v14
2231; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff, v16
2232; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
2233; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
2234; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
2235; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff, v6
2236; GFX7-NEXT:    v_bfe_i32 v8, v2, 8, 4
2237; GFX7-NEXT:    v_bfe_i32 v15, v0, 8, 4
2238; GFX7-NEXT:    s_waitcnt vmcnt(0)
2239; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v6, v1
2240; GFX7-NEXT:    v_bfe_i32 v7, v2, 24, 4
2241; GFX7-NEXT:    v_ashrrev_i32_e32 v9, 28, v2
2242; GFX7-NEXT:    v_bfe_i32 v2, v2, 12, 4
2243; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff, v8
2244; GFX7-NEXT:    v_bfe_i32 v0, v0, 12, 4
2245; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff, v15
2246; GFX7-NEXT:    v_mad_u32_u24 v1, v16, v11, v1
2247; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2248; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2249; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v13, v1
2250; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
2251; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
2252; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
2253; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff, v5
2254; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
2255; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v5, v0
2256; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v7
2257; GFX7-NEXT:    v_mad_u32_u24 v0, v15, v10, v0
2258; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff, v9
2259; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v12, v0
2260; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v14, v0
2261; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
2262; GFX7-NEXT:    s_endpgm
2263;
2264; GFX8-LABEL: idot8_acc16_vecMul:
2265; GFX8:       ; %bb.0: ; %entry
2266; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2267; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2268; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2269; GFX8-NEXT:    v_mov_b32_e32 v5, 12
2270; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2271; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2272; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2273; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2274; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2275; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2276; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2277; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
2278; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2279; GFX8-NEXT:    flat_load_dword v2, v[0:1]
2280; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2281; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2282; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
2283; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2284; GFX8-NEXT:    s_mov_b32 s10, -1
2285; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
2286; GFX8-NEXT:    s_add_u32 s8, s8, s3
2287; GFX8-NEXT:    s_addc_u32 s9, s9, 0
2288; GFX8-NEXT:    s_waitcnt vmcnt(2)
2289; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 28, v3
2290; GFX8-NEXT:    v_lshlrev_b16_sdwa v7, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2291; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 20, v3
2292; GFX8-NEXT:    v_lshlrev_b16_sdwa v9, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2293; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 12, v3
2294; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
2295; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 4, v3
2296; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 12, v3
2297; GFX8-NEXT:    s_waitcnt vmcnt(1)
2298; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 28, v2
2299; GFX8-NEXT:    v_lshlrev_b16_sdwa v14, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2300; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 20, v2
2301; GFX8-NEXT:    v_lshlrev_b16_sdwa v5, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2302; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 12, v2
2303; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
2304; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 4, v2
2305; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 12, v2
2306; GFX8-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
2307; GFX8-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
2308; GFX8-NEXT:    v_lshlrev_b16_e32 v18, 12, v18
2309; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
2310; GFX8-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
2311; GFX8-NEXT:    v_lshlrev_b16_e32 v17, 12, v17
2312; GFX8-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
2313; GFX8-NEXT:    v_ashrrev_i16_e32 v18, 12, v18
2314; GFX8-NEXT:    s_waitcnt vmcnt(0)
2315; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
2316; GFX8-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
2317; GFX8-NEXT:    v_lshlrev_b16_e32 v16, 12, v16
2318; GFX8-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
2319; GFX8-NEXT:    v_ashrrev_i16_e32 v17, 12, v17
2320; GFX8-NEXT:    v_mad_u16 v2, v12, v18, v2
2321; GFX8-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
2322; GFX8-NEXT:    v_ashrrev_i16_e32 v16, 12, v16
2323; GFX8-NEXT:    v_mad_u16 v2, v11, v17, v2
2324; GFX8-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
2325; GFX8-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
2326; GFX8-NEXT:    v_lshlrev_b16_e32 v15, 12, v15
2327; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
2328; GFX8-NEXT:    v_mad_u16 v2, v10, v16, v2
2329; GFX8-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
2330; GFX8-NEXT:    v_ashrrev_i16_e32 v15, 12, v15
2331; GFX8-NEXT:    v_mad_u16 v2, v9, v5, v2
2332; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
2333; GFX8-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
2334; GFX8-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
2335; GFX8-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
2336; GFX8-NEXT:    v_mad_u16 v2, v8, v15, v2
2337; GFX8-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
2338; GFX8-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
2339; GFX8-NEXT:    v_mad_u16 v2, v7, v14, v2
2340; GFX8-NEXT:    v_mad_u16 v2, v6, v13, v2
2341; GFX8-NEXT:    flat_store_short v[0:1], v2
2342; GFX8-NEXT:    s_endpgm
2343;
2344; GFX9-LABEL: idot8_acc16_vecMul:
2345; GFX9:       ; %bb.0: ; %entry
2346; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2347; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2348; GFX9-NEXT:    s_mov_b32 s10, -1
2349; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
2350; GFX9-NEXT:    s_add_u32 s8, s8, s3
2351; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2352; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2353; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2354; GFX9-NEXT:    v_mov_b32_e32 v4, 12
2355; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2356; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
2357; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
2358; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2359; GFX9-NEXT:    global_load_ushort v3, v0, s[2:3]
2360; GFX9-NEXT:    s_addc_u32 s9, s9, 0
2361; GFX9-NEXT:    s_waitcnt vmcnt(2)
2362; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 4, v1
2363; GFX9-NEXT:    v_lshlrev_b16_e32 v6, 12, v1
2364; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
2365; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
2366; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 20, v1
2367; GFX9-NEXT:    v_lshlrev_b16_sdwa v10, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2368; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v1
2369; GFX9-NEXT:    s_waitcnt vmcnt(1)
2370; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 4, v2
2371; GFX9-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2372; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 12, v2
2373; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 12, v2
2374; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v2
2375; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 20, v2
2376; GFX9-NEXT:    v_lshlrev_b16_sdwa v17, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2377; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 28, v2
2378; GFX9-NEXT:    v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2379; GFX9-NEXT:    v_lshlrev_b16_e32 v4, 12, v5
2380; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v6
2381; GFX9-NEXT:    v_lshlrev_b16_e32 v6, 12, v7
2382; GFX9-NEXT:    v_lshlrev_b16_e32 v7, 12, v8
2383; GFX9-NEXT:    v_lshlrev_b16_e32 v8, 12, v9
2384; GFX9-NEXT:    v_ashrrev_i16_e32 v9, 12, v10
2385; GFX9-NEXT:    v_lshlrev_b16_e32 v10, 12, v11
2386; GFX9-NEXT:    v_lshlrev_b16_e32 v11, 12, v12
2387; GFX9-NEXT:    v_ashrrev_i16_e32 v12, 12, v13
2388; GFX9-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
2389; GFX9-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
2390; GFX9-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
2391; GFX9-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
2392; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 12, v14
2393; GFX9-NEXT:    v_lshlrev_b16_e32 v14, 12, v15
2394; GFX9-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
2395; GFX9-NEXT:    v_lshl_or_b32 v7, v11, 16, v12
2396; GFX9-NEXT:    v_lshl_or_b32 v4, v4, 16, v5
2397; GFX9-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
2398; GFX9-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
2399; GFX9-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
2400; GFX9-NEXT:    v_pk_mul_lo_u16 v4, v4, v7
2401; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 12, v16
2402; GFX9-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
2403; GFX9-NEXT:    v_lshlrev_b16_e32 v17, 12, v18
2404; GFX9-NEXT:    v_lshl_or_b32 v8, v8, 16, v9
2405; GFX9-NEXT:    v_lshl_or_b32 v9, v13, 16, v14
2406; GFX9-NEXT:    s_waitcnt vmcnt(0)
2407; GFX9-NEXT:    v_add_u16_e32 v3, v4, v3
2408; GFX9-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
2409; GFX9-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
2410; GFX9-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
2411; GFX9-NEXT:    v_ashrrev_i16_e32 v15, 12, v15
2412; GFX9-NEXT:    v_ashrrev_i16_e32 v17, 12, v17
2413; GFX9-NEXT:    v_pk_mul_lo_u16 v5, v6, v9
2414; GFX9-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2415; GFX9-NEXT:    v_lshl_or_b32 v2, v17, 16, v2
2416; GFX9-NEXT:    v_lshl_or_b32 v1, v10, 16, v1
2417; GFX9-NEXT:    v_lshl_or_b32 v10, v15, 16, v16
2418; GFX9-NEXT:    v_add_u16_e32 v3, v3, v5
2419; GFX9-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
2420; GFX9-NEXT:    v_pk_mul_lo_u16 v2, v8, v10
2421; GFX9-NEXT:    v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2422; GFX9-NEXT:    v_add_u16_e32 v3, v3, v2
2423; GFX9-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2424; GFX9-NEXT:    v_add_u16_e32 v2, v2, v1
2425; GFX9-NEXT:    v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2426; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
2427; GFX9-NEXT:    s_endpgm
2428;
2429; GFX9-DL-LABEL: idot8_acc16_vecMul:
2430; GFX9-DL:       ; %bb.0: ; %entry
2431; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2432; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2433; GFX9-DL-NEXT:    s_mov_b32 s10, -1
2434; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
2435; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
2436; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2437; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2438; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2439; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 12
2440; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2441; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2442; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2443; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2444; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
2445; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
2446; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
2447; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 4, v1
2448; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v6, 12, v1
2449; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
2450; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
2451; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 20, v1
2452; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v10, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2453; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v1
2454; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2455; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 4, v2
2456; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2457; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v13, 12, v2
2458; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v14, 12, v2
2459; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v15, 8, v2
2460; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v16, 20, v2
2461; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v17, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2462; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v18, 28, v2
2463; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2464; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v4, 12, v5
2465; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v6
2466; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v6, 12, v7
2467; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v7, 12, v8
2468; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v8, 12, v9
2469; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v9, 12, v10
2470; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v10, 12, v11
2471; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v11, 12, v12
2472; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v12, 12, v13
2473; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
2474; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
2475; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
2476; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
2477; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v13, 12, v14
2478; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v14, 12, v15
2479; GFX9-DL-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
2480; GFX9-DL-NEXT:    v_lshl_or_b32 v7, v11, 16, v12
2481; GFX9-DL-NEXT:    v_lshl_or_b32 v4, v4, 16, v5
2482; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
2483; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
2484; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
2485; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v7
2486; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v15, 12, v16
2487; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
2488; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v17, 12, v18
2489; GFX9-DL-NEXT:    v_lshl_or_b32 v8, v8, 16, v9
2490; GFX9-DL-NEXT:    v_lshl_or_b32 v9, v13, 16, v14
2491; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2492; GFX9-DL-NEXT:    v_add_u16_e32 v3, v4, v3
2493; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
2494; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
2495; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
2496; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v15, 12, v15
2497; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v17, 12, v17
2498; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, v6, v9
2499; GFX9-DL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2500; GFX9-DL-NEXT:    v_lshl_or_b32 v2, v17, 16, v2
2501; GFX9-DL-NEXT:    v_lshl_or_b32 v1, v10, 16, v1
2502; GFX9-DL-NEXT:    v_lshl_or_b32 v10, v15, 16, v16
2503; GFX9-DL-NEXT:    v_add_u16_e32 v3, v3, v5
2504; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
2505; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v8, v10
2506; GFX9-DL-NEXT:    v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2507; GFX9-DL-NEXT:    v_add_u16_e32 v3, v3, v2
2508; GFX9-DL-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2509; GFX9-DL-NEXT:    v_add_u16_e32 v2, v2, v1
2510; GFX9-DL-NEXT:    v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2511; GFX9-DL-NEXT:    global_store_short v0, v1, s[2:3]
2512; GFX9-DL-NEXT:    s_endpgm
2513;
2514; GFX10-DL-XNACK-LABEL: idot8_acc16_vecMul:
2515; GFX10-DL-XNACK:       ; %bb.0: ; %entry
2516; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2517; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2518; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2519; GFX10-DL-XNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2520; GFX10-DL-XNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2521; GFX10-DL-XNACK-NEXT:    s_mov_b32 s10, -1
2522; GFX10-DL-XNACK-NEXT:    s_mov_b32 s11, 0x31c16000
2523; GFX10-DL-XNACK-NEXT:    s_add_u32 s8, s8, s3
2524; GFX10-DL-XNACK-NEXT:    s_addc_u32 s9, s9, 0
2525; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
2526; GFX10-DL-XNACK-NEXT:    s_clause 0x1
2527; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[4:5]
2528; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[6:7]
2529; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v0, 0
2530; GFX10-DL-XNACK-NEXT:    global_load_ushort v3, v0, s[0:1]
2531; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(2)
2532; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v4, 4, v1
2533; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v1
2534; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(1)
2535; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v11, 4, v2
2536; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v12, 12, v2
2537; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
2538; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
2539; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
2540; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
2541; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v11, 12, v11
2542; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v12, 12, v12
2543; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v6, 12, v1
2544; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v13, 12, v2
2545; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
2546; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v14, 12, v14
2547; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
2548; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v11, 12, v11
2549; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v12, 0xffff, v12
2550; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v5, 0xffff, v5
2551; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
2552; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
2553; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
2554; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v13, 12, v13
2555; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
2556; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v14, 12, v14
2557; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v11, v11, 16, v12
2558; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v4, v4, 16, v5
2559; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v8, 20, v1
2560; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v15, 20, v2
2561; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
2562; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
2563; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
2564; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v13
2565; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v12, 0xffff, v14
2566; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v7, 0xffff, v7
2567; GFX10-DL-XNACK-NEXT:    v_pk_mul_lo_u16 v4, v4, v11
2568; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 28, v1
2569; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
2570; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v17, 28, v2
2571; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
2572; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
2573; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v15, 12, v15
2574; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
2575; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v13, 12, v16
2576; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v5, v5, 16, v12
2577; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
2578; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
2579; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
2580; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v4, v3
2581; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
2582; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
2583; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v11, 12, v15
2584; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v2, 12, v2
2585; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v4, 0xffff, v13
2586; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v9, 0xffff, v9
2587; GFX10-DL-XNACK-NEXT:    v_pk_mul_lo_u16 v5, v6, v5
2588; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v3, v7
2589; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
2590; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v17, 12, v17
2591; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
2592; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v2
2593; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v4, v11, 16, v4
2594; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v6, v8, 16, v9
2595; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
2596; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v3, v5
2597; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
2598; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v17
2599; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2600; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2601; GFX10-DL-XNACK-NEXT:    v_pk_mul_lo_u16 v4, v6, v4
2602; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v3, v7
2603; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v2, v5, 16, v2
2604; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v1, v10, 16, v1
2605; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
2606; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v3, v4
2607; GFX10-DL-XNACK-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
2608; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v2, v3, v5
2609; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2610; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v1, v2, v1
2611; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v1, v1, v3
2612; GFX10-DL-XNACK-NEXT:    global_store_short v0, v1, s[0:1]
2613; GFX10-DL-XNACK-NEXT:    s_endpgm
2614;
2615; GFX10-DL-NOXNACK-LABEL: idot8_acc16_vecMul:
2616; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
2617; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2618; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2619; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2620; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
2621; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2622; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2623; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s10, -1
2624; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s11, 0x31c16000
2625; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s8, s8, s3
2626; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s9, s9, 0
2627; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
2628; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
2629; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[4:5]
2630; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[6:7]
2631; GFX10-DL-NOXNACK-NEXT:    global_load_ushort v3, v2, s[0:1]
2632; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(2)
2633; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v4, 4, v1
2634; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v1
2635; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(1)
2636; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v11, 4, v0
2637; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v12, 12, v0
2638; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
2639; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v14, 8, v0
2640; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
2641; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
2642; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v11, 12, v11
2643; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v12, 12, v12
2644; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v6, 12, v1
2645; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v13, 12, v0
2646; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
2647; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v14, 12, v14
2648; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
2649; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v11, 12, v11
2650; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v12, 0xffff, v12
2651; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v5, 0xffff, v5
2652; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
2653; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v16, 16, v0
2654; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
2655; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v13, 12, v13
2656; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
2657; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v14, 12, v14
2658; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v11, v11, 16, v12
2659; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v4, v4, 16, v5
2660; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v8, 20, v1
2661; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v15, 20, v0
2662; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
2663; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
2664; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
2665; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v13
2666; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v12, 0xffff, v14
2667; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v7, 0xffff, v7
2668; GFX10-DL-NOXNACK-NEXT:    v_pk_mul_lo_u16 v4, v4, v11
2669; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v10, 28, v1
2670; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
2671; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v17, 28, v0
2672; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
2673; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
2674; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v15, 12, v15
2675; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
2676; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v13, 12, v16
2677; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v5, v5, 16, v12
2678; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
2679; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
2680; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
2681; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v3, v4, v3
2682; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
2683; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
2684; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v11, 12, v15
2685; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
2686; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v4, 0xffff, v13
2687; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v9, 0xffff, v9
2688; GFX10-DL-NOXNACK-NEXT:    v_pk_mul_lo_u16 v5, v6, v5
2689; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v3, v3, v7
2690; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
2691; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v17, 12, v17
2692; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
2693; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
2694; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v4, v11, 16, v4
2695; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v6, v8, 16, v9
2696; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
2697; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v3, v3, v5
2698; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
2699; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v17
2700; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2701; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2702; GFX10-DL-NOXNACK-NEXT:    v_pk_mul_lo_u16 v4, v6, v4
2703; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v3, v3, v7
2704; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
2705; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v1, v10, 16, v1
2706; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
2707; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v3, v3, v4
2708; GFX10-DL-NOXNACK-NEXT:    v_pk_mul_lo_u16 v0, v1, v0
2709; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v1, v3, v5
2710; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
2711; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v1, v0
2712; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v3
2713; GFX10-DL-NOXNACK-NEXT:    global_store_short v2, v0, s[0:1]
2714; GFX10-DL-NOXNACK-NEXT:    s_endpgm
2715; GFX10-DL-LABEL: idot8_acc16_vecMul:
2716; GFX10-DL:       ; %bb.0: ; %entry
2717; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
2718; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
2719; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2720; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2721; GFX10-DL-NEXT:    s_mov_b32 s14, -1
2722; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
2723; GFX10-DL-NEXT:    s_add_u32 s12, s12, s3
2724; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2725; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
2726; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2727; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
2728; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
2729; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
2730; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2731; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
2732; GFX10-DL-NEXT:    s_lshr_b32 s3, s0, 28
2733; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40010
2734; GFX10-DL-NEXT:    s_bfe_u32 s7, s0, 0x40014
2735; GFX10-DL-NEXT:    s_bfe_u32 s8, s0, 0x40008
2736; GFX10-DL-NEXT:    s_bfe_u32 s9, s0, 0x4000c
2737; GFX10-DL-NEXT:    s_and_b32 s10, s0, 15
2738; GFX10-DL-NEXT:    s_bfe_u32 s0, s0, 0x40004
2739; GFX10-DL-NEXT:    s_and_b32 s11, s1, 15
2740; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s10, s0
2741; GFX10-DL-NEXT:    s_bfe_u32 s10, s1, 0x40004
2742; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v2, 12, s0 op_sel_hi:[0,1]
2743; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s11, s10
2744; GFX10-DL-NEXT:    s_bfe_u32 s11, s1, 0x4000c
2745; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1]
2746; GFX10-DL-NEXT:    s_bfe_u32 s0, s1, 0x40008
2747; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
2748; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s8, s8, s9
2749; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s0, s11
2750; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
2751; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v4, 12, s8 op_sel_hi:[0,1]
2752; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1]
2753; GFX10-DL-NEXT:    s_bfe_u32 s8, s1, 0x40010
2754; GFX10-DL-NEXT:    s_bfe_u32 s0, s1, 0x40014
2755; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v3
2756; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v3, 12, v4 op_sel_hi:[0,1]
2757; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1]
2758; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s6, s6, s7
2759; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s8, s0
2760; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v5, 12, s6 op_sel_hi:[0,1]
2761; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1]
2762; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v4
2763; GFX10-DL-NEXT:    s_bfe_u32 s10, s1, 0x40018
2764; GFX10-DL-NEXT:    s_lshr_b32 s0, s1, 28
2765; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s1, s2, s3
2766; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1]
2767; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s10, s0
2768; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1]
2769; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2770; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v2, v1
2771; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2772; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v2, 12, v5 op_sel_hi:[0,1]
2773; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v5, 12, s1 op_sel_hi:[0,1]
2774; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2775; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v4
2776; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1]
2777; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2778; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v3, 12, v5 op_sel_hi:[0,1]
2779; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2780; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v4
2781; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2782; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v3
2783; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2784; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
2785; GFX10-DL-NEXT:    s_endpgm
2786                                              <8 x i4> addrspace(1)* %src2,
2787                                              i16 addrspace(1)* nocapture %dst) {
2788entry:
2789  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2790  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
2791  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
2792  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
2793  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
2794
2795  %cvec1 = sext <8 x i4> %vec1 to <8 x i16>
2796  %cvec2 = sext <8 x i4> %vec2 to <8 x i16>
2797
2798  %mul = mul <8 x i16> %cvec1, %cvec2
2799  %mul0 = extractelement <8 x i16> %mul, i64 0
2800  %mul1 = extractelement <8 x i16> %mul, i64 1
2801  %mul2 = extractelement <8 x i16> %mul, i64 2
2802  %mul3 = extractelement <8 x i16> %mul, i64 3
2803  %mul4 = extractelement <8 x i16> %mul, i64 4
2804  %mul5 = extractelement <8 x i16> %mul, i64 5
2805  %mul6 = extractelement <8 x i16> %mul, i64 6
2806  %mul7 = extractelement <8 x i16> %mul, i64 7
2807
2808  %acc = load i16, i16 addrspace(1)* %dst, align 4
2809  %add1 = add i16 %mul0, %acc
2810  %add2 = add i16 %add1, %mul1
2811  %add3 = add i16 %add2, %mul2
2812  %add4 = add i16 %add3, %mul3
2813  %add5 = add i16 %add4, %mul4
2814  %add6 = add i16 %add5, %mul5
2815  %add7 = add i16 %add6, %mul6
2816  %add8 = add i16 %add7, %mul7
2817
2818  store i16 %add8, i16 addrspace(1)* %dst, align 4
2819  ret void
2820}
2821
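2822; TODO: Support this pattern (<8 x i4> vector multiply with an 8-bit accumulator); even the GFX9-DL checks below contain no dot instruction.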
2823define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
2824; GFX7-LABEL: idot8_acc8_vecMul:
2825; GFX7:       ; %bb.0: ; %entry
2826; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2827; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2828; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2829; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2830; GFX7-NEXT:    s_mov_b32 s14, -1
2831; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
2832; GFX7-NEXT:    s_add_u32 s12, s12, s3
2833; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2834; GFX7-NEXT:    s_mov_b32 s10, 0
2835; GFX7-NEXT:    s_mov_b32 s11, s3
2836; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2837; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
2838; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2839; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2840; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2841; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
2842; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2843; GFX7-NEXT:    s_mov_b32 s2, -1
2844; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
2845; GFX7-NEXT:    s_addc_u32 s13, s13, 0
2846; GFX7-NEXT:    s_waitcnt vmcnt(2)
2847; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 28, v2
2848; GFX7-NEXT:    v_bfe_i32 v4, v2, 24, 4
2849; GFX7-NEXT:    v_bfe_i32 v5, v2, 20, 4
2850; GFX7-NEXT:    v_bfe_i32 v6, v2, 16, 4
2851; GFX7-NEXT:    v_bfe_i32 v7, v2, 12, 4
2852; GFX7-NEXT:    v_bfe_i32 v8, v2, 8, 4
2853; GFX7-NEXT:    v_bfe_i32 v9, v2, 4, 4
2854; GFX7-NEXT:    v_bfe_i32 v2, v2, 0, 4
2855; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
2856; GFX7-NEXT:    v_and_b32_e32 v4, 0xff, v4
2857; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
2858; GFX7-NEXT:    v_and_b32_e32 v6, 0xff, v6
2859; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
2860; GFX7-NEXT:    v_and_b32_e32 v8, 0xff, v8
2861; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
2862; GFX7-NEXT:    v_and_b32_e32 v2, 0xff, v2
2863; GFX7-NEXT:    s_waitcnt vmcnt(1)
2864; GFX7-NEXT:    v_ashrrev_i32_e32 v10, 28, v0
2865; GFX7-NEXT:    v_bfe_i32 v11, v0, 24, 4
2866; GFX7-NEXT:    v_bfe_i32 v12, v0, 20, 4
2867; GFX7-NEXT:    v_bfe_i32 v13, v0, 16, 4
2868; GFX7-NEXT:    v_bfe_i32 v14, v0, 12, 4
2869; GFX7-NEXT:    v_bfe_i32 v15, v0, 8, 4
2870; GFX7-NEXT:    v_bfe_i32 v16, v0, 4, 4
2871; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 4
2872; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
2873; GFX7-NEXT:    v_or_b32_e32 v4, v6, v5
2874; GFX7-NEXT:    v_or_b32_e32 v5, v8, v7
2875; GFX7-NEXT:    v_or_b32_e32 v2, v2, v9
2876; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v10
2877; GFX7-NEXT:    v_and_b32_e32 v7, 0xff, v11
2878; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v12
2879; GFX7-NEXT:    v_and_b32_e32 v9, 0xff, v13
2880; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v14
2881; GFX7-NEXT:    v_and_b32_e32 v11, 0xff, v15
2882; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v16
2883; GFX7-NEXT:    v_and_b32_e32 v0, 0xff, v0
2884; GFX7-NEXT:    v_or_b32_e32 v6, v7, v6
2885; GFX7-NEXT:    v_or_b32_e32 v7, v9, v8
2886; GFX7-NEXT:    v_or_b32_e32 v8, v11, v10
2887; GFX7-NEXT:    v_or_b32_e32 v0, v0, v12
2888; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
2889; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2890; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
2891; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2892; GFX7-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
2893; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff, v4
2894; GFX7-NEXT:    v_or_b32_e32 v2, v2, v5
2895; GFX7-NEXT:    v_or_b32_e32 v0, v0, v8
2896; GFX7-NEXT:    v_or_b32_e32 v4, v4, v13
2897; GFX7-NEXT:    v_and_b32_e32 v8, 0xff, v2
2898; GFX7-NEXT:    v_and_b32_e32 v13, 0xff, v0
2899; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
2900; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff, v7
2901; GFX7-NEXT:    v_bfe_u32 v9, v2, 8, 8
2902; GFX7-NEXT:    v_bfe_u32 v14, v0, 8, 8
2903; GFX7-NEXT:    s_waitcnt vmcnt(0)
2904; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v13, v1
2905; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
2906; GFX7-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
2907; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
2908; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 24, v0
2909; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
2910; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v14, v1
2911; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
2912; GFX7-NEXT:    v_and_b32_e32 v10, 0xff, v4
2913; GFX7-NEXT:    v_and_b32_e32 v15, 0xff, v5
2914; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v12, v0
2915; GFX7-NEXT:    v_bfe_u32 v11, v4, 8, 8
2916; GFX7-NEXT:    v_bfe_u32 v16, v5, 8, 8
2917; GFX7-NEXT:    v_mad_u32_u24 v0, v10, v15, v0
2918; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
2919; GFX7-NEXT:    v_bfe_u32 v5, v5, 16, 8
2920; GFX7-NEXT:    v_mad_u32_u24 v0, v11, v16, v0
2921; GFX7-NEXT:    v_bfe_u32 v3, v3, 8, 8
2922; GFX7-NEXT:    v_bfe_u32 v6, v6, 8, 8
2923; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v5, v0
2924; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v6, v0
2925; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2926; GFX7-NEXT:    s_endpgm
2927;
2928; GFX8-LABEL: idot8_acc8_vecMul:
2929; GFX8:       ; %bb.0: ; %entry
2930; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2931; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2932; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2933; GFX8-NEXT:    v_mov_b32_e32 v5, 12
2934; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2935; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2936; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2937; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2938; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2939; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2940; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2941; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
2942; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2943; GFX8-NEXT:    flat_load_dword v2, v[0:1]
2944; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2945; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2946; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
2947; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2948; GFX8-NEXT:    s_mov_b32 s10, -1
2949; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
2950; GFX8-NEXT:    s_add_u32 s8, s8, s3
2951; GFX8-NEXT:    s_addc_u32 s9, s9, 0
2952; GFX8-NEXT:    s_waitcnt vmcnt(2)
2953; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 20, v3
2954; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 28, v3
2955; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 12, v3
2956; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
2957; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 4, v3
2958; GFX8-NEXT:    v_lshlrev_b16_e32 v16, 12, v3
2959; GFX8-NEXT:    s_waitcnt vmcnt(1)
2960; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
2961; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
2962; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 12, v2
2963; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
2964; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 4, v2
2965; GFX8-NEXT:    v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2966; GFX8-NEXT:    v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2967; GFX8-NEXT:    v_lshlrev_b16_e32 v18, 12, v2
2968; GFX8-NEXT:    v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2969; GFX8-NEXT:    v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2970; GFX8-NEXT:    v_lshlrev_b16_e32 v5, 12, v10
2971; GFX8-NEXT:    v_ashrrev_i16_e32 v10, 12, v16
2972; GFX8-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
2973; GFX8-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
2974; GFX8-NEXT:    v_ashrrev_i16_e32 v17, 12, v3
2975; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 12, v6
2976; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 12, v15
2977; GFX8-NEXT:    v_ashrrev_i16_e32 v15, 12, v18
2978; GFX8-NEXT:    v_ashrrev_i16_e32 v18, 12, v19
2979; GFX8-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
2980; GFX8-NEXT:    v_ashrrev_i16_e32 v19, 12, v2
2981; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 12, v11
2982; GFX8-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
2983; GFX8-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
2984; GFX8-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
2985; GFX8-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
2986; GFX8-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
2987; GFX8-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
2988; GFX8-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
2989; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
2990; GFX8-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
2991; GFX8-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
2992; GFX8-NEXT:    v_ashrrev_i16_e32 v11, 12, v14
2993; GFX8-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
2994; GFX8-NEXT:    v_mul_lo_u16_e32 v10, v10, v15
2995; GFX8-NEXT:    v_mul_lo_u16_e32 v15, v16, v18
2996; GFX8-NEXT:    v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2997; GFX8-NEXT:    v_mul_lo_u16_sdwa v3, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2998; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
2999; GFX8-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
3000; GFX8-NEXT:    v_mul_lo_u16_e32 v14, v17, v19
3001; GFX8-NEXT:    v_mul_lo_u16_sdwa v7, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3002; GFX8-NEXT:    v_mul_lo_u16_e32 v8, v9, v11
3003; GFX8-NEXT:    v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3004; GFX8-NEXT:    v_mul_lo_u16_sdwa v5, v5, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3005; GFX8-NEXT:    v_or_b32_sdwa v6, v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3006; GFX8-NEXT:    v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3007; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
3008; GFX8-NEXT:    v_or_b32_sdwa v8, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3009; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
3010; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v3
3011; GFX8-NEXT:    v_or_b32_sdwa v3, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3012; GFX8-NEXT:    v_or_b32_e32 v5, v5, v2
3013; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
3014; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 24, v[2:3]
3015; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
3016; GFX8-NEXT:    s_waitcnt vmcnt(0)
3017; GFX8-NEXT:    v_add_u16_e32 v3, v8, v4
3018; GFX8-NEXT:    v_add_u16_e32 v3, v3, v5
3019; GFX8-NEXT:    v_add_u16_e32 v3, v3, v7
3020; GFX8-NEXT:    v_add_u16_e32 v2, v3, v2
3021; GFX8-NEXT:    v_mad_u16 v2, v17, v19, v2
3022; GFX8-NEXT:    v_add_u16_e32 v2, v2, v6
3023; GFX8-NEXT:    v_mad_u16 v2, v16, v18, v2
3024; GFX8-NEXT:    v_add_u16_e32 v2, v2, v10
3025; GFX8-NEXT:    flat_store_byte v[0:1], v2
3026; GFX8-NEXT:    s_endpgm
3027;
3028; GFX9-LABEL: idot8_acc8_vecMul:
3029; GFX9:       ; %bb.0: ; %entry
3030; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
3031; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
3032; GFX9-NEXT:    s_mov_b32 s10, -1
3033; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
3034; GFX9-NEXT:    s_add_u32 s8, s8, s3
3035; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
3036; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
3037; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3038; GFX9-NEXT:    v_mov_b32_e32 v3, 0
3039; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3040; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
3041; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
3042; GFX9-NEXT:    global_load_ubyte v4, v3, s[2:3]
3043; GFX9-NEXT:    v_mov_b32_e32 v0, 12
3044; GFX9-NEXT:    s_addc_u32 s9, s9, 0
3045; GFX9-NEXT:    s_waitcnt vmcnt(2)
3046; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 20, v1
3047; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
3048; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
3049; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
3050; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
3051; GFX9-NEXT:    s_waitcnt vmcnt(1)
3052; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 20, v2
3053; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
3054; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
3055; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
3056; GFX9-NEXT:    v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3057; GFX9-NEXT:    v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3058; GFX9-NEXT:    v_lshlrev_b16_e32 v17, 12, v2
3059; GFX9-NEXT:    v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3060; GFX9-NEXT:    v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3061; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
3062; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
3063; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 12, v9
3064; GFX9-NEXT:    v_ashrrev_i16_e32 v9, 12, v15
3065; GFX9-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
3066; GFX9-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
3067; GFX9-NEXT:    v_ashrrev_i16_e32 v16, 12, v1
3068; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 12, v5
3069; GFX9-NEXT:    v_lshlrev_b16_e32 v5, 12, v14
3070; GFX9-NEXT:    v_ashrrev_i16_e32 v14, 12, v17
3071; GFX9-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
3072; GFX9-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
3073; GFX9-NEXT:    v_ashrrev_i16_e32 v18, 12, v0
3074; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 12, v10
3075; GFX9-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
3076; GFX9-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
3077; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
3078; GFX9-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
3079; GFX9-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
3080; GFX9-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
3081; GFX9-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
3082; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 12, v0
3083; GFX9-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
3084; GFX9-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
3085; GFX9-NEXT:    v_ashrrev_i16_e32 v10, 12, v13
3086; GFX9-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
3087; GFX9-NEXT:    v_mul_lo_u16_e32 v19, v15, v17
3088; GFX9-NEXT:    v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3089; GFX9-NEXT:    v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3090; GFX9-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
3091; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
3092; GFX9-NEXT:    v_mul_lo_u16_e32 v13, v16, v18
3093; GFX9-NEXT:    v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3094; GFX9-NEXT:    v_mul_lo_u16_e32 v7, v8, v10
3095; GFX9-NEXT:    v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3096; GFX9-NEXT:    v_mul_lo_u16_e32 v9, v9, v14
3097; GFX9-NEXT:    v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3098; GFX9-NEXT:    v_or_b32_sdwa v5, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3099; GFX9-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3100; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
3101; GFX9-NEXT:    v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3102; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
3103; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
3104; GFX9-NEXT:    v_or_b32_sdwa v1, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3105; GFX9-NEXT:    v_or_b32_e32 v2, v2, v0
3106; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
3107; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]
3108; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
3109; GFX9-NEXT:    s_waitcnt vmcnt(0)
3110; GFX9-NEXT:    v_add_u16_e32 v1, v7, v4
3111; GFX9-NEXT:    v_add_u16_e32 v1, v1, v2
3112; GFX9-NEXT:    v_add_u16_e32 v1, v1, v6
3113; GFX9-NEXT:    v_add_u16_e32 v0, v1, v0
3114; GFX9-NEXT:    v_mad_legacy_u16 v0, v16, v18, v0
3115; GFX9-NEXT:    v_add_u16_e32 v0, v0, v5
3116; GFX9-NEXT:    v_mad_legacy_u16 v0, v15, v17, v0
3117; GFX9-NEXT:    v_add_u16_e32 v0, v0, v9
3118; GFX9-NEXT:    global_store_byte v3, v0, s[2:3]
3119; GFX9-NEXT:    s_endpgm
3120;
3121; GFX9-DL-LABEL: idot8_acc8_vecMul:
3122; GFX9-DL:       ; %bb.0: ; %entry
3123; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
3124; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
3125; GFX9-DL-NEXT:    s_mov_b32 s10, -1
3126; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
3127; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
3128; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
3129; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
3130; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3131; GFX9-DL-NEXT:    v_mov_b32_e32 v3, 0
3132; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
3133; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
3134; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
3135; GFX9-DL-NEXT:    global_load_ubyte v4, v3, s[2:3]
3136; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 12
3137; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
3138; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
3139; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 20, v1
3140; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
3141; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
3142; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
3143; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
3144; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
3145; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 20, v2
3146; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
3147; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
3148; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
3149; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3150; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3151; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v17, 12, v2
3152; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3153; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3154; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
3155; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
3156; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v2, 12, v9
3157; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v9, 12, v15
3158; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
3159; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
3160; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v16, 12, v1
3161; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v1, 12, v5
3162; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v5, 12, v14
3163; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v14, 12, v17
3164; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
3165; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
3166; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v18, 12, v0
3167; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v0, 12, v10
3168; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
3169; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
3170; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
3171; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
3172; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
3173; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
3174; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
3175; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v0, 12, v0
3176; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
3177; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
3178; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v10, 12, v13
3179; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
3180; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v19, v15, v17
3181; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3182; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3183; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
3184; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
3185; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v13, v16, v18
3186; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3187; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v7, v8, v10
3188; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3189; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v9, v9, v14
3190; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3191; GFX9-DL-NEXT:    v_or_b32_sdwa v5, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3192; GFX9-DL-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3193; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
3194; GFX9-DL-NEXT:    v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3195; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
3196; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
3197; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3198; GFX9-DL-NEXT:    v_or_b32_e32 v2, v2, v0
3199; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
3200; GFX9-DL-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]
3201; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
3202; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
3203; GFX9-DL-NEXT:    v_add_u16_e32 v1, v7, v4
3204; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v2
3205; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v6
3206; GFX9-DL-NEXT:    v_add_u16_e32 v0, v1, v0
3207; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v16, v18, v0
3208; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v5
3209; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v15, v17, v0
3210; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v9
3211; GFX9-DL-NEXT:    global_store_byte v3, v0, s[2:3]
3212; GFX9-DL-NEXT:    s_endpgm
3213;
3214; GFX10-DL-XNACK-LABEL: idot8_acc8_vecMul:
3215; GFX10-DL-XNACK:       ; %bb.0: ; %entry
3216; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
3217; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3218; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
3219; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v4, 0
3220; GFX10-DL-XNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
3221; GFX10-DL-XNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
3222; GFX10-DL-XNACK-NEXT:    s_mov_b32 s10, -1
3223; GFX10-DL-XNACK-NEXT:    s_mov_b32 s11, 0x31c16000
3224; GFX10-DL-XNACK-NEXT:    s_add_u32 s8, s8, s3
3225; GFX10-DL-XNACK-NEXT:    s_addc_u32 s9, s9, 0
3226; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
3227; GFX10-DL-XNACK-NEXT:    s_clause 0x1
3228; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[4:5]
3229; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[6:7]
3230; GFX10-DL-XNACK-NEXT:    global_load_ubyte v3, v4, s[0:1]
3231; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(2)
3232; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
3233; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(1)
3234; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
3235; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
3236; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v16, 8, v2
3237; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
3238; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
3239; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v15, 12, v15
3240; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v13, 28, v2
3241; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
3242; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v17, 4, v2
3243; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
3244; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
3245; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
3246; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v15, 12, v15
3247; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v0, 20, v1
3248; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
3249; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
3250; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v14, 24, v2
3251; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
3252; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v13, 12, v13
3253; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
3254; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v17, 12, v17
3255; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
3256; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
3257; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v8, v8, v15
3258; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
3259; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
3260; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
3261; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
3262; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v14, 12, v14
3263; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v11, 12, v11
3264; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
3265; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v13, 12, v13
3266; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
3267; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v2, 12, v2
3268; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
3269; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v15, 12, v17
3270; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v9, v9, v16
3271; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 8, v8
3272; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
3273; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v12, 12, v12
3274; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
3275; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
3276; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v14, 12, v14
3277; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v11, 12, v11
3278; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v6, v6, v13
3279; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
3280; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v2
3281; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v10, v10, v15
3282; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3283; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
3284; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v12, 12, v12
3285; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v9, v0, v11
3286; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v11, v7, v14
3287; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 8, v6
3288; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 8, v10
3289; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
3290; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v1, v1, v2
3291; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v2, v5, v12
3292; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 8, v9
3293; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v6, v11, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3294; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v11, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3295; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3296; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3297; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
3298; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 8, v11
3299; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
3300; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v1, v3
3301; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v1, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3302; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v9, v3, v10
3303; GFX10-DL-XNACK-NEXT:    v_lshrrev_b64 v[2:3], 24, v[0:1]
3304; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
3305; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v0, v9, v8
3306; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v0, v0, v2
3307; GFX10-DL-XNACK-NEXT:    v_mad_u16 v0, v5, v12, v0
3308; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v0, v0, v1
3309; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v1, 8, v6
3310; GFX10-DL-XNACK-NEXT:    v_mad_u16 v0, v7, v14, v0
3311; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v0, v0, v1
3312; GFX10-DL-XNACK-NEXT:    global_store_byte v4, v0, s[0:1]
3313; GFX10-DL-XNACK-NEXT:    s_endpgm
3314;
3315; GFX10-DL-NOXNACK-LABEL: idot8_acc8_vecMul:
3316; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
3317; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
3318; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3319; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
3320; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v4, 0
3321; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
3322; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
3323; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s10, -1
3324; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s11, 0x31c16000
3325; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s8, s8, s3
3326; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s9, s9, 0
3327; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
3328; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
3329; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[4:5]
3330; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[6:7]
3331; GFX10-DL-NOXNACK-NEXT:    global_load_ubyte v2, v4, s[0:1]
3332; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(2)
3333; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
3334; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(1)
3335; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
3336; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
3337; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v11, 20, v0
3338; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
3339; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v13, 28, v0
3340; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v14, 24, v0
3341; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
3342; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v17, 4, v0
3343; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
3344; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
3345; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v15, 12, v15
3346; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
3347; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
3348; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
3349; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v18, 12, v0
3350; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v16
3351; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
3352; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v15, 12, v15
3353; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v3, 20, v1
3354; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
3355; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
3356; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v13, 12, v13
3357; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
3358; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v17, 12, v17
3359; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
3360; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
3361; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v8, v8, v15
3362; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
3363; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
3364; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v3, 12, v3
3365; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v14, 12, v14
3366; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v11, 12, v11
3367; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
3368; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v13, 12, v13
3369; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
3370; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
3371; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v15, 12, v17
3372; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v0, v9, v0
3373; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 8, v8
3374; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
3375; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v12, 12, v12
3376; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
3377; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v3
3378; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v14, 12, v14
3379; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v11
3380; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v6, v6, v13
3381; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
3382; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v10, v10, v15
3383; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3384; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
3385; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v11, 12, v12
3386; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v3, v3, v9
3387; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v9, v7, v14
3388; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 8, v6
3389; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 8, v10
3390; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
3391; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v1, v1, v18
3392; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v12, v5, v11
3393; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v3, 8, v3
3394; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3395; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v9, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3396; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3397; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3398; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
3399; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v9
3400; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
3401; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v2, v1, v2
3402; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v1, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3403; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v9, v2, v9
3404; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b64 v[2:3], 24, v[0:1]
3405; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
3406; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v9, v8
3407; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v2
3408; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v5, v11, v0
3409; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v1
3410; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v1, 8, v6
3411; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v7, v14, v0
3412; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v1
3413; GFX10-DL-NOXNACK-NEXT:    global_store_byte v4, v0, s[0:1]
3414; GFX10-DL-NOXNACK-NEXT:    s_endpgm
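;
; NOTE(review): the GFX10-DL assertions that follow look stale. No RUN line in
; this file passes a bare GFX10-DL check prefix (only the -XNACK and -NOXNACK
; variants are used), so FileCheck never verifies this block. Confirm and drop
; it the next time the assertions are regenerated.
3415; GFX10-DL-LABEL: idot8_acc8_vecMul: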
3416; GFX10-DL:       ; %bb.0: ; %entry
3417; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
3418; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
3419; GFX10-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
3420; GFX10-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
3421; GFX10-DL-NEXT:    s_mov_b32 s22, -1
3422; GFX10-DL-NEXT:    s_mov_b32 s23, 0x31c16000
3423; GFX10-DL-NEXT:    s_add_u32 s20, s20, s3
3424; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3425; GFX10-DL-NEXT:    s_addc_u32 s21, s21, 0
3426; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
3427; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
3428; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
3429; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
3430; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
3431; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
3432; GFX10-DL-NEXT:    s_lshr_b32 s9, s0, 4
3433; GFX10-DL-NEXT:    s_lshr_b32 s16, s1, 4
3434; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v6, 12, s9
3435; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v12, 12, s16
3436; GFX10-DL-NEXT:    s_lshr_b32 s10, s0, 12
3437; GFX10-DL-NEXT:    s_lshr_b32 s17, s1, 12
3438; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v2, 12, s0
3439; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s1
3440; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v13, 12, s17
3441; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s10
3442; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v6, 12, v6
3443; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v12, 12, v12
3444; GFX10-DL-NEXT:    s_lshr_b32 s11, s0, 8
3445; GFX10-DL-NEXT:    s_lshr_b32 s18, s1, 8
3446; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s11
3447; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v11, 12, s18
3448; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v2, 12, v2
3449; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v3, 12, v3
3450; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v19, 12, v5
3451; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v6, v6, v12
3452; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v13, 12, v13
3453; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 12, v4
3454; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v11, 12, v11
3455; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v2, v2, v3
3456; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 8, v6
3457; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v3, v19, v13
3458; GFX10-DL-NEXT:    s_lshr_b32 s3, s0, 20
3459; GFX10-DL-NEXT:    s_lshr_b32 s6, s0, 16
3460; GFX10-DL-NEXT:    s_lshr_b32 s7, s0, 28
3461; GFX10-DL-NEXT:    s_lshr_b32 s8, s0, 24
3462; GFX10-DL-NEXT:    s_lshr_b32 s12, s1, 20
3463; GFX10-DL-NEXT:    v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3464; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v7, 12, s8
3465; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v8, 12, s7
3466; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v9, 12, s6
3467; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v10, 12, s3
3468; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v12, 12, s12
3469; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, v4, v11
3470; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 8, v3
3471; GFX10-DL-NEXT:    s_lshr_b32 s13, s1, 16
3472; GFX10-DL-NEXT:    s_lshr_b32 s14, s1, 28
3473; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v6, 12, s13
3474; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v5, 12, v7
3475; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v7, 12, v8
3476; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v8, 12, v9
3477; GFX10-DL-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3478; GFX10-DL-NEXT:    v_and_b32_e32 v2, s2, v2
3479; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v15, 12, s14
3480; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 12, v10
3481; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v9, 12, v12
3482; GFX10-DL-NEXT:    s_lshr_b32 s15, s1, 24
3483; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v6, 12, v6
3484; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v10, 12, v15
3485; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v14, 12, s15
3486; GFX10-DL-NEXT:    v_or_b32_e32 v3, v2, v3
3487; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, v4, v9
3488; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v15, v8, v6
3489; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v7, v7, v10
3490; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v11, 12, v14
3491; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
3492; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
3493; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v2, v1
3494; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v2, 8, v4
3495; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, v5, v11
3496; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 8, v7
3497; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v8
3498; GFX10-DL-NEXT:    v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3499; GFX10-DL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3500; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
3501; GFX10-DL-NEXT:    v_and_b32_e32 v2, s2, v2
3502; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3503; GFX10-DL-NEXT:    v_or_b32_e32 v3, v2, v4
3504; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v2
3505; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
3506; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v2
3507; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3508; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3509; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
3510; GFX10-DL-NEXT:    s_endpgm
3511                                             <8 x i4> addrspace(1)* %src2,
3512                                             i8 addrspace(1)* nocapture %dst) {
3513entry:
3514  %idx = call i32 @llvm.amdgcn.workitem.id.x()
3515  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
3516  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
3517  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
3518  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
3519
3520  %cvec1 = sext <8 x i4> %vec1 to <8 x i8>
3521  %cvec2 = sext <8 x i4> %vec2 to <8 x i8>
3522
3523  %mul = mul <8 x i8> %cvec1, %cvec2
3524  %mul0 = extractelement <8 x i8> %mul, i64 0
3525  %mul1 = extractelement <8 x i8> %mul, i64 1
3526  %mul2 = extractelement <8 x i8> %mul, i64 2
3527  %mul3 = extractelement <8 x i8> %mul, i64 3
3528  %mul4 = extractelement <8 x i8> %mul, i64 4
3529  %mul5 = extractelement <8 x i8> %mul, i64 5
3530  %mul6 = extractelement <8 x i8> %mul, i64 6
3531  %mul7 = extractelement <8 x i8> %mul, i64 7
3532
3533  %acc = load i8, i8 addrspace(1)* %dst, align 4
3534  %add1 = add i8 %mul0, %acc
3535  %add2 = add i8 %add1, %mul1
3536  %add3 = add i8 %add2, %mul2
3537  %add4 = add i8 %add3, %mul3
3538  %add5 = add i8 %add4, %mul4
3539  %add6 = add i8 %add5, %mul5
3540  %add7 = add i8 %add6, %mul6
3541  %add8 = add i8 %add7, %mul7
3542
3543  store i8 %add8, i8 addrspace(1)* %dst, align 4
3544  ret void
3545}
3546
; Declaration of the intrinsic the kernel above calls to obtain %idx, the i32
; index applied to %src1 and %src2 (via getelementptr) before the <8 x i4>
; loads. It takes no arguments and returns an i32.
3547declare i32 @llvm.amdgcn.workitem.id.x()
3548