1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s
8; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s
9; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s
10
; Test idot8_acc32 - signed dot product of two <8 x i4> vectors accumulated
; into an i32.
;
; Each work-item indexes %src1 and %src2 by its workitem id, sign-extends all
; eight i4 lanes of both vectors to i32, multiplies them lane by lane, and adds
; the eight products to the i32 loaded from %dst before storing the sum back to
; %dst. The gfx906 and gfx10 check blocks below expect the whole chain to be
; selected as a single v_dot8_i32_i4; the gfx7/gfx8 blocks expect
; v_bfe_i32 + v_mad_i32_i24 and the gfx900 block expects
; v_mul_i32_i24 + v_add3_u32.
;
; The check lines inside the signature are produced by
; utils/update_llc_test_checks.py; regenerate them instead of editing by hand.
define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
; GFX7-LABEL: idot8_acc32:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s14, -1
; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
; GFX7-NEXT:    s_add_u32 s12, s12, s3
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s10, 0
; GFX7-NEXT:    s_mov_b32 s11, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_addc_u32 s13, s13, 0
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 4
; GFX7-NEXT:    v_bfe_i32 v3, v2, 4, 4
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_bfe_i32 v9, v0, 0, 4
; GFX7-NEXT:    v_bfe_i32 v10, v0, 4, 4
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v9, s4
; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 4
; GFX7-NEXT:    v_bfe_i32 v11, v0, 8, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v10, v1
; GFX7-NEXT:    v_bfe_i32 v5, v2, 12, 4
; GFX7-NEXT:    v_bfe_i32 v12, v0, 12, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v11, v1
; GFX7-NEXT:    v_bfe_i32 v6, v2, 16, 4
; GFX7-NEXT:    v_bfe_i32 v13, v0, 16, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v5, v12, v1
; GFX7-NEXT:    v_bfe_i32 v7, v2, 20, 4
; GFX7-NEXT:    v_bfe_i32 v14, v0, 20, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v6, v13, v1
; GFX7-NEXT:    v_bfe_i32 v8, v2, 24, 4
; GFX7-NEXT:    v_bfe_i32 v15, v0, 24, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v7, v14, v1
; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
; GFX7-NEXT:    v_mad_i32_i24 v1, v8, v15, v1
; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: idot8_acc32:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX8-NEXT:    s_mov_b32 s10, -1
; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
; GFX8-NEXT:    s_add_u32 s8, s8, s3
; GFX8-NEXT:    s_addc_u32 s9, s9, 0
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 4
; GFX8-NEXT:    v_bfe_i32 v4, v3, 4, 4
; GFX8-NEXT:    v_bfe_i32 v6, v3, 8, 4
; GFX8-NEXT:    v_bfe_i32 v8, v3, 12, 4
; GFX8-NEXT:    v_bfe_i32 v10, v3, 16, 4
; GFX8-NEXT:    v_bfe_i32 v12, v3, 20, 4
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 4
; GFX8-NEXT:    v_bfe_i32 v5, v0, 4, 4
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, s2
; GFX8-NEXT:    v_bfe_i32 v7, v0, 8, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT:    v_bfe_i32 v9, v0, 12, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v6, v7, v1
; GFX8-NEXT:    v_bfe_i32 v11, v0, 16, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v8, v9, v1
; GFX8-NEXT:    v_bfe_i32 v13, v0, 20, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v10, v11, v1
; GFX8-NEXT:    v_bfe_i32 v14, v3, 24, 4
; GFX8-NEXT:    v_bfe_i32 v15, v0, 24, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v12, v13, v1
; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 28, v3
; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
; GFX8-NEXT:    v_mad_i32_i24 v1, v14, v15, v1
; GFX8-NEXT:    v_mad_i32_i24 v2, v3, v0, v1
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: idot8_acc32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s10, -1
; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
; GFX9-NEXT:    s_add_u32 s8, s8, s3
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_addc_u32 s9, s9, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_bfe_i32 v3, v1, 0, 4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_bfe_i32 v4, v2, 0, 4
; GFX9-NEXT:    v_bfe_i32 v5, v1, 4, 4
; GFX9-NEXT:    v_bfe_i32 v6, v2, 4, 4
; GFX9-NEXT:    v_bfe_i32 v7, v1, 8, 4
; GFX9-NEXT:    v_bfe_i32 v8, v2, 8, 4
; GFX9-NEXT:    v_bfe_i32 v9, v1, 12, 4
; GFX9-NEXT:    v_bfe_i32 v10, v2, 12, 4
; GFX9-NEXT:    v_bfe_i32 v11, v1, 16, 4
; GFX9-NEXT:    v_bfe_i32 v12, v2, 16, 4
; GFX9-NEXT:    v_bfe_i32 v13, v1, 20, 4
; GFX9-NEXT:    v_bfe_i32 v14, v2, 20, 4
; GFX9-NEXT:    v_bfe_i32 v15, v1, 24, 4
; GFX9-NEXT:    v_bfe_i32 v16, v2, 24, 4
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
; GFX9-NEXT:    v_mul_i32_i24_e32 v3, v3, v4
; GFX9-NEXT:    v_mul_i32_i24_e32 v4, v5, v6
; GFX9-NEXT:    v_mul_i32_i24_e32 v5, v7, v8
; GFX9-NEXT:    v_mul_i32_i24_e32 v6, v9, v10
; GFX9-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_add3_u32 v2, v3, s0, v4
; GFX9-NEXT:    v_mul_i32_i24_e32 v7, v11, v12
; GFX9-NEXT:    v_mul_i32_i24_e32 v8, v13, v14
; GFX9-NEXT:    v_add3_u32 v2, v2, v5, v6
; GFX9-NEXT:    v_mul_i32_i24_e32 v9, v15, v16
; GFX9-NEXT:    v_add3_u32 v2, v2, v7, v8
; GFX9-NEXT:    v_add3_u32 v1, v2, v9, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: idot8_acc32:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT:    s_mov_b32 s10, -1
; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-DL-NEXT:    v_dot8_i32_i4 v1, v1, v2, s0
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-XNACK-LABEL: idot8_acc32:
; GFX10-DL-XNACK:       ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT:    s_add_u32 s8, s8, s3
; GFX10-DL-XNACK-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT:    s_clause 0x1
; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-DL-XNACK-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-DL-XNACK-NEXT:    v_dot8_i32_i4 v1, v1, v2, s2
; GFX10-DL-XNACK-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-DL-XNACK-NEXT:    s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc32:
; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s8, s8, s3
; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[6:7]
; GFX10-DL-NOXNACK-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    v_dot8_i32_i4 v0, v1, v0, s2
; GFX10-DL-NOXNACK-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX10-DL-NOXNACK-NEXT:    s_endpgm
                                       <8 x i4> addrspace(1)* %src2,
                                       i32 addrspace(1)* nocapture %dst) {
entry:
  ; Per-work-item addressing: both source vectors are indexed by workitem id x.
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2

  ; Lanes 0..7: extract the i4 element from each vector, sign-extend both to
  ; i32, and multiply. The same five-instruction pattern repeats per lane.
  %v1e0 = extractelement <8 x i4> %vec1, i64 0
  %cv1e0 = sext i4 %v1e0 to i32
  %v2e0 = extractelement <8 x i4> %vec2, i64 0
  %cv2e0 = sext i4 %v2e0 to i32
  %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0

  %v1e1 = extractelement <8 x i4> %vec1, i64 1
  %cv1e1 = sext i4 %v1e1 to i32
  %v2e1 = extractelement <8 x i4> %vec2, i64 1
  %cv2e1 = sext i4 %v2e1 to i32
  %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1

  %v1e2 = extractelement <8 x i4> %vec1, i64 2
  %cv1e2 = sext i4 %v1e2 to i32
  %v2e2 = extractelement <8 x i4> %vec2, i64 2
  %cv2e2 = sext i4 %v2e2 to i32
  %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2

  %v1e3 = extractelement <8 x i4> %vec1, i64 3
  %cv1e3 = sext i4 %v1e3 to i32
  %v2e3 = extractelement <8 x i4> %vec2, i64 3
  %cv2e3 = sext i4 %v2e3 to i32
  %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3

  %v1e4 = extractelement <8 x i4> %vec1, i64 4
  %cv1e4 = sext i4 %v1e4 to i32
  %v2e4 = extractelement <8 x i4> %vec2, i64 4
  %cv2e4 = sext i4 %v2e4 to i32
  %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4

  %v1e5 = extractelement <8 x i4> %vec1, i64 5
  %cv1e5 = sext i4 %v1e5 to i32
  %v2e5 = extractelement <8 x i4> %vec2, i64 5
  %cv2e5 = sext i4 %v2e5 to i32
  %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5

  %v1e6 = extractelement <8 x i4> %vec1, i64 6
  %cv1e6 = sext i4 %v1e6 to i32
  %v2e6 = extractelement <8 x i4> %vec2, i64 6
  %cv2e6 = sext i4 %v2e6 to i32
  %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6

  %v1e7 = extractelement <8 x i4> %vec1, i64 7
  %cv1e7 = sext i4 %v1e7 to i32
  %v2e7 = extractelement <8 x i4> %vec2, i64 7
  %cv2e7 = sext i4 %v2e7 to i32
  %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7

  ; Accumulate: start from the i32 currently stored at %dst and add the eight
  ; products in lane order.
  %acc = load i32, i32 addrspace(1)* %dst, align 4
  %add1 = add i32 %mul0, %acc
  %add2 = add i32 %add1, %mul1
  %add3 = add i32 %add2, %mul2
  %add4 = add i32 %add3, %mul3
  %add5 = add i32 %add4, %mul4
  %add6 = add i32 %add5, %mul5
  %add7 = add i32 %add6, %mul6
  %add8 = add i32 %add7, %mul7

  store i32 %add8, i32 addrspace(1)* %dst, align 4
  ret void
}
322
; TODO: Once the unnecessary zero extensions of the elements are removed,
; the pattern recognizer will kick in.
325define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
326; GFX7-LABEL: idot8_acc16:
327; GFX7:       ; %bb.0: ; %entry
328; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
329; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
330; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
331; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
332; GFX7-NEXT:    s_mov_b32 s14, -1
333; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
334; GFX7-NEXT:    s_add_u32 s12, s12, s3
335; GFX7-NEXT:    s_mov_b32 s3, 0xf000
336; GFX7-NEXT:    s_mov_b32 s10, 0
337; GFX7-NEXT:    s_mov_b32 s11, s3
338; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
339; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
340; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
341; GFX7-NEXT:    v_mov_b32_e32 v1, 0
342; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
343; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
344; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
345; GFX7-NEXT:    s_mov_b32 s2, -1
346; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
347; GFX7-NEXT:    s_mov_b32 s4, 0xffff
348; GFX7-NEXT:    s_addc_u32 s13, s13, 0
349; GFX7-NEXT:    s_waitcnt vmcnt(2)
350; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 4
351; GFX7-NEXT:    v_bfe_i32 v4, v2, 4, 4
352; GFX7-NEXT:    s_waitcnt vmcnt(1)
353; GFX7-NEXT:    v_bfe_i32 v10, v0, 0, 4
354; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
355; GFX7-NEXT:    v_bfe_i32 v11, v0, 4, 4
356; GFX7-NEXT:    v_and_b32_e32 v10, s4, v10
357; GFX7-NEXT:    v_bfe_i32 v5, v2, 8, 4
358; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
359; GFX7-NEXT:    v_bfe_i32 v12, v0, 8, 4
360; GFX7-NEXT:    v_and_b32_e32 v11, s4, v11
361; GFX7-NEXT:    s_waitcnt vmcnt(0)
362; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v10, v1
363; GFX7-NEXT:    v_bfe_i32 v6, v2, 12, 4
364; GFX7-NEXT:    v_and_b32_e32 v5, s4, v5
365; GFX7-NEXT:    v_bfe_i32 v13, v0, 12, 4
366; GFX7-NEXT:    v_and_b32_e32 v12, s4, v12
367; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v11, v1
368; GFX7-NEXT:    v_bfe_i32 v7, v2, 16, 4
369; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
370; GFX7-NEXT:    v_bfe_i32 v14, v0, 16, 4
371; GFX7-NEXT:    v_and_b32_e32 v13, s4, v13
372; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v12, v1
373; GFX7-NEXT:    v_bfe_i32 v8, v2, 20, 4
374; GFX7-NEXT:    v_and_b32_e32 v7, s4, v7
375; GFX7-NEXT:    v_bfe_i32 v15, v0, 20, 4
376; GFX7-NEXT:    v_and_b32_e32 v14, s4, v14
377; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v13, v1
378; GFX7-NEXT:    v_bfe_i32 v9, v2, 24, 4
379; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
380; GFX7-NEXT:    v_bfe_i32 v16, v0, 24, 4
381; GFX7-NEXT:    v_and_b32_e32 v15, s4, v15
382; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v14, v1
383; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
384; GFX7-NEXT:    v_and_b32_e32 v9, s4, v9
385; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
386; GFX7-NEXT:    v_and_b32_e32 v16, s4, v16
387; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v15, v1
388; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
389; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
390; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v16, v1
391; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
392; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
393; GFX7-NEXT:    s_endpgm
394;
395; GFX8-LABEL: idot8_acc16:
396; GFX8:       ; %bb.0: ; %entry
397; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
398; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
399; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
400; GFX8-NEXT:    v_mov_b32_e32 v5, 12
401; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
402; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
403; GFX8-NEXT:    v_mov_b32_e32 v1, s5
404; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
405; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
406; GFX8-NEXT:    flat_load_dword v3, v[0:1]
407; GFX8-NEXT:    v_mov_b32_e32 v1, s7
408; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
409; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
410; GFX8-NEXT:    flat_load_dword v2, v[0:1]
411; GFX8-NEXT:    v_mov_b32_e32 v0, s0
412; GFX8-NEXT:    v_mov_b32_e32 v1, s1
413; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
414; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
415; GFX8-NEXT:    s_mov_b32 s10, -1
416; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
417; GFX8-NEXT:    s_add_u32 s8, s8, s3
418; GFX8-NEXT:    s_addc_u32 s9, s9, 0
419; GFX8-NEXT:    s_waitcnt vmcnt(2)
420; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 4, v3
421; GFX8-NEXT:    v_lshlrev_b16_e32 v16, 12, v3
422; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 28, v3
423; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 20, v3
424; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 12, v3
425; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
426; GFX8-NEXT:    s_waitcnt vmcnt(1)
427; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 4, v2
428; GFX8-NEXT:    v_lshlrev_b16_e32 v17, 12, v2
429; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
430; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 20, v2
431; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 12, v2
432; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
433; GFX8-NEXT:    v_lshlrev_b16_sdwa v18, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
434; GFX8-NEXT:    v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
435; GFX8-NEXT:    v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
436; GFX8-NEXT:    v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
437; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v16
438; GFX8-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
439; GFX8-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
440; GFX8-NEXT:    v_lshlrev_b16_e32 v15, 12, v15
441; GFX8-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
442; GFX8-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
443; GFX8-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
444; GFX8-NEXT:    v_ashrrev_i16_e32 v15, 12, v15
445; GFX8-NEXT:    s_waitcnt vmcnt(0)
446; GFX8-NEXT:    v_mad_u16 v4, v5, v16, v4
447; GFX8-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
448; GFX8-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
449; GFX8-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
450; GFX8-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
451; GFX8-NEXT:    v_mad_u16 v4, v10, v15, v4
452; GFX8-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
453; GFX8-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
454; GFX8-NEXT:    v_mad_u16 v4, v9, v14, v4
455; GFX8-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
456; GFX8-NEXT:    v_ashrrev_i16_e32 v18, 12, v19
457; GFX8-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
458; GFX8-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
459; GFX8-NEXT:    v_mad_u16 v4, v8, v13, v4
460; GFX8-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
461; GFX8-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
462; GFX8-NEXT:    v_mad_u16 v4, v17, v18, v4
463; GFX8-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
464; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
465; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
466; GFX8-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
467; GFX8-NEXT:    v_mad_u16 v4, v7, v12, v4
468; GFX8-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
469; GFX8-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
470; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
471; GFX8-NEXT:    v_mad_u16 v2, v6, v11, v2
472; GFX8-NEXT:    flat_store_short v[0:1], v2
473; GFX8-NEXT:    s_endpgm
474;
475; GFX9-LABEL: idot8_acc16:
476; GFX9:       ; %bb.0: ; %entry
477; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
478; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
479; GFX9-NEXT:    s_mov_b32 s10, -1
480; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
481; GFX9-NEXT:    s_add_u32 s8, s8, s3
482; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
483; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
484; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
485; GFX9-NEXT:    v_mov_b32_e32 v4, 12
486; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
487; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
488; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
489; GFX9-NEXT:    v_mov_b32_e32 v0, 0
490; GFX9-NEXT:    global_load_ushort v3, v0, s[2:3]
491; GFX9-NEXT:    s_addc_u32 s9, s9, 0
492; GFX9-NEXT:    s_waitcnt vmcnt(2)
493; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
494; GFX9-NEXT:    s_waitcnt vmcnt(1)
495; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
496; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
497; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 12, v2
498; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 28, v1
499; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
500; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
501; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
502; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
503; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
504; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
505; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
506; GFX9-NEXT:    v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
507; GFX9-NEXT:    v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
508; GFX9-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
509; GFX9-NEXT:    v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
510; GFX9-NEXT:    v_ashrrev_i16_e32 v4, 12, v15
511; GFX9-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
512; GFX9-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
513; GFX9-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
514; GFX9-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
515; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
516; GFX9-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
517; GFX9-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
518; GFX9-NEXT:    s_waitcnt vmcnt(0)
519; GFX9-NEXT:    v_mad_legacy_u16 v3, v4, v15, v3
520; GFX9-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
521; GFX9-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
522; GFX9-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
523; GFX9-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
524; GFX9-NEXT:    v_mad_legacy_u16 v3, v9, v14, v3
525; GFX9-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
526; GFX9-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
527; GFX9-NEXT:    v_mad_legacy_u16 v3, v8, v13, v3
528; GFX9-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
529; GFX9-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
530; GFX9-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
531; GFX9-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
532; GFX9-NEXT:    v_mad_legacy_u16 v3, v7, v12, v3
533; GFX9-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
534; GFX9-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
535; GFX9-NEXT:    v_mad_legacy_u16 v3, v16, v17, v3
536; GFX9-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
537; GFX9-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
538; GFX9-NEXT:    v_lshlrev_b16_e32 v5, 12, v5
539; GFX9-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
540; GFX9-NEXT:    v_mad_legacy_u16 v3, v6, v11, v3
541; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
542; GFX9-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
543; GFX9-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
544; GFX9-NEXT:    v_mad_legacy_u16 v1, v5, v10, v1
545; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
546; GFX9-NEXT:    s_endpgm
547;
548; GFX9-DL-LABEL: idot8_acc16:
549; GFX9-DL:       ; %bb.0: ; %entry
550; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
551; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
552; GFX9-DL-NEXT:    s_mov_b32 s10, -1
553; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
554; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
555; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
556; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
557; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
558; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 12
559; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
560; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
561; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
562; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
563; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
564; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
565; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
566; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
567; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
568; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
569; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
570; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v16, 12, v2
571; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 28, v1
572; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
573; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
574; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
575; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
576; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
577; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
578; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
579; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
580; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
581; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
582; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
583; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v4, 12, v15
584; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
585; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
586; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
587; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
588; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
589; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
590; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
591; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
592; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v4, v15, v3
593; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
594; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
595; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
596; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
597; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v9, v14, v3
598; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
599; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
600; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v8, v13, v3
601; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
602; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
603; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
604; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
605; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v7, v12, v3
606; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
607; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
608; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v16, v17, v3
609; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
610; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
611; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v5, 12, v5
612; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
613; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v6, v11, v3
614; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
615; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
616; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
617; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v10, v1
618; GFX9-DL-NEXT:    global_store_short v0, v1, s[2:3]
619; GFX9-DL-NEXT:    s_endpgm
620;
621; GFX10-DL-XNACK-LABEL: idot8_acc16:
622; GFX10-DL-XNACK:       ; %bb.0: ; %entry
623; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
624; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
625; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
626; GFX10-DL-XNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
627; GFX10-DL-XNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
628; GFX10-DL-XNACK-NEXT:    s_mov_b32 s10, -1
629; GFX10-DL-XNACK-NEXT:    s_mov_b32 s11, 0x31c16000
630; GFX10-DL-XNACK-NEXT:    s_add_u32 s8, s8, s3
631; GFX10-DL-XNACK-NEXT:    s_addc_u32 s9, s9, 0
632; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
633; GFX10-DL-XNACK-NEXT:    s_clause 0x1
634; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[4:5]
635; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[6:7]
636; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v0, 0
637; GFX10-DL-XNACK-NEXT:    global_load_ushort v3, v0, s[0:1]
638; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(2)
639; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
640; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
641; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
642; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
643; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
644; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
645; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
646; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
647; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(1)
648; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v16, 4, v2
649; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v17, 12, v2
650; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
651; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
652; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v13, 20, v2
653; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
654; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
655; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
656; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
657; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
658; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
659; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v17, 12, v17
660; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
661; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v2, 12, v2
662; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
663; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
664; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
665; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v1, v17, v3
666; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v9
667; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v2
668; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
669; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v15
670; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v10, v16, v1
671; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
672; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v14
673; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
674; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
675; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v3, v2, v1
676; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v7
677; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v10
678; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
679; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v13
680; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v8, v9, v1
681; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
682; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v12
683; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
684; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
685; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v2, v3, v1
686; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v5
687; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v8
688; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
689; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v11
690; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v6, v7, v1
691; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
692; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
693; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v2, v3, v1
694; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v4, v5, v1
695; GFX10-DL-XNACK-NEXT:    global_store_short v0, v1, s[0:1]
696; GFX10-DL-XNACK-NEXT:    s_endpgm
697;
698; GFX10-DL-NOXNACK-LABEL: idot8_acc16:
699; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
700; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
701; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
702; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
703; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
704; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
705; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
706; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
707; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s10, -1
708; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s11, 0x31c16000
709; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s8, s8, s3
710; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s9, s9, 0
711; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
712; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
713; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[4:5]
714; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[6:7]
715; GFX10-DL-NOXNACK-NEXT:    global_load_ushort v3, v2, s[0:1]
716; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(2)
717; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
718; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
719; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
720; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
721; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
722; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
723; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
724; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
725; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(1)
726; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v16, 4, v0
727; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v17, 12, v0
728; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v11, 28, v0
729; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v12, 24, v0
730; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v13, 20, v0
731; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
732; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
733; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
734; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
735; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
736; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
737; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v17, 12, v17
738; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
739; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
740; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
741; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
742; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
743; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v1, v1, v17, v3
744; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v9
745; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
746; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
747; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v15
748; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v1, v10, v16, v1
749; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
750; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v14
751; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
752; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
753; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v3, v0, v1
754; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v7
755; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v10
756; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
757; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v13
758; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v8, v9, v0
759; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
760; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v12
761; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
762; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
763; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v1, v3, v0
764; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v5
765; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v8
766; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
767; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v11
768; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v6, v7, v0
769; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
770; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
771; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v1, v3, v0
772; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v4, v5, v0
773; GFX10-DL-NOXNACK-NEXT:    global_store_short v2, v0, s[0:1]
774; GFX10-DL-NOXNACK-NEXT:    s_endpgm
775; GFX10-DL-LABEL: idot8_acc16:
776; GFX10-DL:       ; %bb.0: ; %entry
777; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
778; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
779; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
780; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
781; GFX10-DL-NEXT:    s_mov_b32 s14, -1
782; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
783; GFX10-DL-NEXT:    s_add_u32 s12, s12, s3
784; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
785; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
786; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
787; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
788; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
789; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
790; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
791; GFX10-DL-NEXT:    s_lshr_b32 s2, s0, 12
792; GFX10-DL-NEXT:    s_lshr_b32 s3, s1, 12
793; GFX10-DL-NEXT:    s_bfe_i32 s6, s0, 0x40000
794; GFX10-DL-NEXT:    s_bfe_i32 s7, s1, 0x40000
795; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v2, 12, s2
796; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s3
797; GFX10-DL-NEXT:    s_bfe_i32 s8, s0, 0x40004
798; GFX10-DL-NEXT:    s_bfe_i32 s9, s0, 0x40008
799; GFX10-DL-NEXT:    s_bfe_i32 s10, s1, 0x40008
800; GFX10-DL-NEXT:    s_bfe_i32 s2, s1, 0x40004
801; GFX10-DL-NEXT:    v_mul_i32_i24_e64 v4, s9, s10
802; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v2, 12, v2
803; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v3, 12, v3
804; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40010
805; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
806; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s6, s7, v1
807; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s8, s2, v1
808; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
809; GFX10-DL-NEXT:    v_and_b32_e32 v2, s2, v2
810; GFX10-DL-NEXT:    v_and_b32_e32 v3, s2, v3
811; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
812; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40010
813; GFX10-DL-NEXT:    v_mad_u32_u24 v1, v2, v3, v1
814; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
815; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40014
816; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40014
817; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
818; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40018
819; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40018
820; GFX10-DL-NEXT:    s_ashr_i32 s0, s0, 28
821; GFX10-DL-NEXT:    s_ashr_i32 s1, s1, 28
822; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
823; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s0, s1, v1
824; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
825; GFX10-DL-NEXT:    s_endpgm
826                                       <8 x i4> addrspace(1)* %src2,
827                                       i16 addrspace(1)* nocapture %dst) {
828entry:
829  %idx = call i32 @llvm.amdgcn.workitem.id.x()
830  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
831  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
832  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
833  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
834
835  %v1e0 = extractelement <8 x i4> %vec1, i64 0
836  %cv1e0 = sext i4 %v1e0 to i16
837  %v2e0 = extractelement <8 x i4> %vec2, i64 0
838  %cv2e0 = sext i4 %v2e0 to i16
839  %mul0 = mul nuw nsw i16 %cv1e0, %cv2e0
840
841  %v1e1 = extractelement <8 x i4> %vec1, i64 1
842  %cv1e1 = sext i4 %v1e1 to i16
843  %v2e1 = extractelement <8 x i4> %vec2, i64 1
844  %cv2e1 = sext i4 %v2e1 to i16
845  %mul1 = mul nuw nsw i16 %cv1e1, %cv2e1
846
847  %v1e2 = extractelement <8 x i4> %vec1, i64 2
848  %cv1e2 = sext i4 %v1e2 to i16
849  %v2e2 = extractelement <8 x i4> %vec2, i64 2
850  %cv2e2 = sext i4 %v2e2 to i16
851  %mul2 = mul nuw nsw i16 %cv1e2, %cv2e2
852
853  %v1e3 = extractelement <8 x i4> %vec1, i64 3
854  %cv1e3 = sext i4 %v1e3 to i16
855  %v2e3 = extractelement <8 x i4> %vec2, i64 3
856  %cv2e3 = sext i4 %v2e3 to i16
857  %mul3 = mul nuw nsw i16 %cv1e3, %cv2e3
858
859  %v1e4 = extractelement <8 x i4> %vec1, i64 4
860  %cv1e4 = sext i4 %v1e4 to i16
861  %v2e4 = extractelement <8 x i4> %vec2, i64 4
862  %cv2e4 = sext i4 %v2e4 to i16
863  %mul4 = mul nuw nsw i16 %cv1e4, %cv2e4
864
865  %v1e5 = extractelement <8 x i4> %vec1, i64 5
866  %cv1e5 = sext i4 %v1e5 to i16
867  %v2e5 = extractelement <8 x i4> %vec2, i64 5
868  %cv2e5 = sext i4 %v2e5 to i16
869  %mul5 = mul nuw nsw i16 %cv1e5, %cv2e5
870
871  %v1e6 = extractelement <8 x i4> %vec1, i64 6
872  %cv1e6 = sext i4 %v1e6 to i16
873  %v2e6 = extractelement <8 x i4> %vec2, i64 6
874  %cv2e6 = sext i4 %v2e6 to i16
875  %mul6 = mul nuw nsw i16 %cv1e6, %cv2e6
876
877  %v1e7 = extractelement <8 x i4> %vec1, i64 7
878  %cv1e7 = sext i4 %v1e7 to i16
879  %v2e7 = extractelement <8 x i4> %vec2, i64 7
880  %cv2e7 = sext i4 %v2e7 to i16
881  %mul7 = mul nuw nsw i16 %cv1e7, %cv2e7
882
883  %acc = load i16, i16 addrspace(1)* %dst, align 4
884  %add1 = add i16 %mul0, %acc
885  %add2 = add i16 %add1, %mul1
886  %add3 = add i16 %add2, %mul2
887  %add4 = add i16 %add3, %mul3
888  %add5 = add i16 %add4, %mul4
889  %add6 = add i16 %add5, %mul5
890  %add7 = add i16 %add6, %mul6
891  %add8 = add i16 %add7, %mul7
892
893  store i16 %add8, i16 addrspace(1)* %dst, align 4
894  ret void
895}
896
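897; TODO: Support this pattern (8-bit accumulator). The GFX9-DL and GFX10-DL-* checks below still expect a chain of 16-bit v_mad instructions for it.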
898define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
899; GFX7-LABEL: idot8_acc8:
900; GFX7:       ; %bb.0: ; %entry
901; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
902; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
903; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
904; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
905; GFX7-NEXT:    s_mov_b32 s14, -1
906; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
907; GFX7-NEXT:    s_add_u32 s12, s12, s3
908; GFX7-NEXT:    s_mov_b32 s3, 0xf000
909; GFX7-NEXT:    s_mov_b32 s10, 0
910; GFX7-NEXT:    s_mov_b32 s11, s3
911; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
912; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
913; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
914; GFX7-NEXT:    v_mov_b32_e32 v1, 0
915; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
916; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
917; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
918; GFX7-NEXT:    s_mov_b32 s2, -1
919; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
920; GFX7-NEXT:    s_movk_i32 s4, 0xff
921; GFX7-NEXT:    s_addc_u32 s13, s13, 0
922; GFX7-NEXT:    s_waitcnt vmcnt(2)
923; GFX7-NEXT:    v_bfe_i32 v3, v2, 0, 4
924; GFX7-NEXT:    v_bfe_i32 v4, v2, 4, 4
925; GFX7-NEXT:    s_waitcnt vmcnt(1)
926; GFX7-NEXT:    v_bfe_i32 v10, v0, 0, 4
927; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
928; GFX7-NEXT:    v_bfe_i32 v11, v0, 4, 4
929; GFX7-NEXT:    v_and_b32_e32 v10, s4, v10
930; GFX7-NEXT:    v_bfe_i32 v5, v2, 8, 4
931; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
932; GFX7-NEXT:    v_bfe_i32 v12, v0, 8, 4
933; GFX7-NEXT:    v_and_b32_e32 v11, s4, v11
934; GFX7-NEXT:    s_waitcnt vmcnt(0)
935; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v10, v1
936; GFX7-NEXT:    v_bfe_i32 v6, v2, 12, 4
937; GFX7-NEXT:    v_and_b32_e32 v5, s4, v5
938; GFX7-NEXT:    v_bfe_i32 v13, v0, 12, 4
939; GFX7-NEXT:    v_and_b32_e32 v12, s4, v12
940; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v11, v1
941; GFX7-NEXT:    v_bfe_i32 v7, v2, 16, 4
942; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
943; GFX7-NEXT:    v_bfe_i32 v14, v0, 16, 4
944; GFX7-NEXT:    v_and_b32_e32 v13, s4, v13
945; GFX7-NEXT:    v_mad_u32_u24 v1, v5, v12, v1
946; GFX7-NEXT:    v_bfe_i32 v8, v2, 20, 4
947; GFX7-NEXT:    v_and_b32_e32 v7, s4, v7
948; GFX7-NEXT:    v_bfe_i32 v15, v0, 20, 4
949; GFX7-NEXT:    v_and_b32_e32 v14, s4, v14
950; GFX7-NEXT:    v_mad_u32_u24 v1, v6, v13, v1
951; GFX7-NEXT:    v_bfe_i32 v9, v2, 24, 4
952; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
953; GFX7-NEXT:    v_bfe_i32 v16, v0, 24, 4
954; GFX7-NEXT:    v_and_b32_e32 v15, s4, v15
955; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v14, v1
956; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
957; GFX7-NEXT:    v_and_b32_e32 v9, s4, v9
958; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
959; GFX7-NEXT:    v_and_b32_e32 v16, s4, v16
960; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v15, v1
961; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
962; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
963; GFX7-NEXT:    v_mad_u32_u24 v1, v9, v16, v1
964; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
965; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
966; GFX7-NEXT:    s_endpgm
967;
968; GFX8-LABEL: idot8_acc8:
969; GFX8:       ; %bb.0: ; %entry
970; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
971; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
972; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
973; GFX8-NEXT:    v_mov_b32_e32 v5, 12
974; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
975; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
976; GFX8-NEXT:    v_mov_b32_e32 v1, s5
977; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
978; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
979; GFX8-NEXT:    flat_load_dword v3, v[0:1]
980; GFX8-NEXT:    v_mov_b32_e32 v1, s7
981; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
982; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
983; GFX8-NEXT:    flat_load_dword v2, v[0:1]
984; GFX8-NEXT:    v_mov_b32_e32 v0, s0
985; GFX8-NEXT:    v_mov_b32_e32 v1, s1
986; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
987; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
988; GFX8-NEXT:    s_mov_b32 s10, -1
989; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
990; GFX8-NEXT:    s_add_u32 s8, s8, s3
991; GFX8-NEXT:    s_addc_u32 s9, s9, 0
992; GFX8-NEXT:    s_waitcnt vmcnt(2)
993; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 4, v3
994; GFX8-NEXT:    v_lshlrev_b16_e32 v16, 12, v3
995; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 28, v3
996; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 20, v3
997; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 12, v3
998; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
999; GFX8-NEXT:    s_waitcnt vmcnt(1)
1000; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 4, v2
1001; GFX8-NEXT:    v_lshlrev_b16_e32 v17, 12, v2
1002; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
1003; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 20, v2
1004; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 12, v2
1005; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
1006; GFX8-NEXT:    v_lshlrev_b16_sdwa v18, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1007; GFX8-NEXT:    v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1008; GFX8-NEXT:    v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1009; GFX8-NEXT:    v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1010; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v16
1011; GFX8-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
1012; GFX8-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
1013; GFX8-NEXT:    v_lshlrev_b16_e32 v15, 12, v15
1014; GFX8-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
1015; GFX8-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
1016; GFX8-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
1017; GFX8-NEXT:    v_ashrrev_i16_e32 v15, 12, v15
1018; GFX8-NEXT:    s_waitcnt vmcnt(0)
1019; GFX8-NEXT:    v_mad_u16 v4, v5, v16, v4
1020; GFX8-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
1021; GFX8-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
1022; GFX8-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
1023; GFX8-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
1024; GFX8-NEXT:    v_mad_u16 v4, v10, v15, v4
1025; GFX8-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
1026; GFX8-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
1027; GFX8-NEXT:    v_mad_u16 v4, v9, v14, v4
1028; GFX8-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
1029; GFX8-NEXT:    v_ashrrev_i16_e32 v18, 12, v19
1030; GFX8-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
1031; GFX8-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
1032; GFX8-NEXT:    v_mad_u16 v4, v8, v13, v4
1033; GFX8-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
1034; GFX8-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
1035; GFX8-NEXT:    v_mad_u16 v4, v17, v18, v4
1036; GFX8-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
1037; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
1038; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
1039; GFX8-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
1040; GFX8-NEXT:    v_mad_u16 v4, v7, v12, v4
1041; GFX8-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
1042; GFX8-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
1043; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
1044; GFX8-NEXT:    v_mad_u16 v2, v6, v11, v2
1045; GFX8-NEXT:    flat_store_byte v[0:1], v2
1046; GFX8-NEXT:    s_endpgm
1047;
1048; GFX9-LABEL: idot8_acc8:
1049; GFX9:       ; %bb.0: ; %entry
1050; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1051; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1052; GFX9-NEXT:    s_mov_b32 s10, -1
1053; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
1054; GFX9-NEXT:    s_add_u32 s8, s8, s3
1055; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1056; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1057; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1058; GFX9-NEXT:    v_mov_b32_e32 v4, 12
1059; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1060; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
1061; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
1062; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1063; GFX9-NEXT:    global_load_ubyte v3, v0, s[2:3]
1064; GFX9-NEXT:    s_addc_u32 s9, s9, 0
1065; GFX9-NEXT:    s_waitcnt vmcnt(2)
1066; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
1067; GFX9-NEXT:    s_waitcnt vmcnt(1)
1068; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
1069; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
1070; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 12, v2
1071; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 28, v1
1072; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
1073; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
1074; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
1075; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
1076; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
1077; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
1078; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
1079; GFX9-NEXT:    v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1080; GFX9-NEXT:    v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1081; GFX9-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1082; GFX9-NEXT:    v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1083; GFX9-NEXT:    v_ashrrev_i16_e32 v4, 12, v15
1084; GFX9-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
1085; GFX9-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
1086; GFX9-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
1087; GFX9-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
1088; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
1089; GFX9-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
1090; GFX9-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
1091; GFX9-NEXT:    s_waitcnt vmcnt(0)
1092; GFX9-NEXT:    v_mad_legacy_u16 v3, v4, v15, v3
1093; GFX9-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
1094; GFX9-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
1095; GFX9-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
1096; GFX9-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
1097; GFX9-NEXT:    v_mad_legacy_u16 v3, v9, v14, v3
1098; GFX9-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
1099; GFX9-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
1100; GFX9-NEXT:    v_mad_legacy_u16 v3, v8, v13, v3
1101; GFX9-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
1102; GFX9-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
1103; GFX9-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
1104; GFX9-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
1105; GFX9-NEXT:    v_mad_legacy_u16 v3, v7, v12, v3
1106; GFX9-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
1107; GFX9-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
1108; GFX9-NEXT:    v_mad_legacy_u16 v3, v16, v17, v3
1109; GFX9-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
1110; GFX9-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
1111; GFX9-NEXT:    v_lshlrev_b16_e32 v5, 12, v5
1112; GFX9-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
1113; GFX9-NEXT:    v_mad_legacy_u16 v3, v6, v11, v3
1114; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
1115; GFX9-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
1116; GFX9-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
1117; GFX9-NEXT:    v_mad_legacy_u16 v1, v5, v10, v1
1118; GFX9-NEXT:    global_store_byte v0, v1, s[2:3]
1119; GFX9-NEXT:    s_endpgm
1120;
1121; GFX9-DL-LABEL: idot8_acc8:
1122; GFX9-DL:       ; %bb.0: ; %entry
1123; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1124; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1125; GFX9-DL-NEXT:    s_mov_b32 s10, -1
1126; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
1127; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
1128; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1129; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1130; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1131; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 12
1132; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1133; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1134; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1135; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1136; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[2:3]
1137; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
1138; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
1139; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
1140; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
1141; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
1142; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
1143; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v16, 12, v2
1144; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 28, v1
1145; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
1146; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
1147; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
1148; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
1149; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
1150; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
1151; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
1152; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v17, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1153; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v18, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1154; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1155; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
1156; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v4, 12, v15
1157; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
1158; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
1159; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
1160; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
1161; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
1162; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
1163; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
1164; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1165; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v4, v15, v3
1166; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
1167; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
1168; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
1169; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
1170; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v9, v14, v3
1171; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
1172; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
1173; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v8, v13, v3
1174; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
1175; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
1176; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
1177; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
1178; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v7, v12, v3
1179; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
1180; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
1181; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v16, v17, v3
1182; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
1183; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
1184; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v5, 12, v5
1185; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
1186; GFX9-DL-NEXT:    v_mad_legacy_u16 v3, v6, v11, v3
1187; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
1188; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
1189; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
1190; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v10, v1
1191; GFX9-DL-NEXT:    global_store_byte v0, v1, s[2:3]
1192; GFX9-DL-NEXT:    s_endpgm
1193;
1194; GFX10-DL-XNACK-LABEL: idot8_acc8:
1195; GFX10-DL-XNACK:       ; %bb.0: ; %entry
1196; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1197; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1198; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1199; GFX10-DL-XNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1200; GFX10-DL-XNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1201; GFX10-DL-XNACK-NEXT:    s_mov_b32 s10, -1
1202; GFX10-DL-XNACK-NEXT:    s_mov_b32 s11, 0x31c16000
1203; GFX10-DL-XNACK-NEXT:    s_add_u32 s8, s8, s3
1204; GFX10-DL-XNACK-NEXT:    s_addc_u32 s9, s9, 0
1205; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
1206; GFX10-DL-XNACK-NEXT:    s_clause 0x1
1207; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[4:5]
1208; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[6:7]
1209; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v0, 0
1210; GFX10-DL-XNACK-NEXT:    global_load_ubyte v3, v0, s[0:1]
1211; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(2)
1212; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
1213; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
1214; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
1215; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
1216; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
1217; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
1218; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
1219; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
1220; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(1)
1221; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v16, 4, v2
1222; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v17, 12, v2
1223; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
1224; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
1225; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v13, 20, v2
1226; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
1227; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
1228; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
1229; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
1230; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
1231; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
1232; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v17, 12, v17
1233; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
1234; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v2, 12, v2
1235; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
1236; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
1237; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
1238; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v1, v17, v3
1239; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v9
1240; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v2
1241; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
1242; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v15
1243; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v10, v16, v1
1244; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
1245; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v14
1246; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
1247; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
1248; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v3, v2, v1
1249; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v7
1250; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v10
1251; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
1252; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v13
1253; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v8, v9, v1
1254; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
1255; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v12
1256; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
1257; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
1258; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v2, v3, v1
1259; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v5
1260; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v3, 12, v8
1261; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
1262; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v11
1263; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v6, v7, v1
1264; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
1265; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
1266; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v2, v3, v1
1267; GFX10-DL-XNACK-NEXT:    v_mad_u16 v1, v4, v5, v1
1268; GFX10-DL-XNACK-NEXT:    global_store_byte v0, v1, s[0:1]
1269; GFX10-DL-XNACK-NEXT:    s_endpgm
1270;
1271; GFX10-DL-NOXNACK-LABEL: idot8_acc8:
1272; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
1273; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
1274; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1275; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1276; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1277; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
1278; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1279; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1280; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s10, -1
1281; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s11, 0x31c16000
1282; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s8, s8, s3
1283; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s9, s9, 0
1284; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
1285; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
1286; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[4:5]
1287; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[6:7]
1288; GFX10-DL-NOXNACK-NEXT:    global_load_ubyte v3, v2, s[0:1]
1289; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(2)
1290; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
1291; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
1292; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
1293; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
1294; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
1295; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
1296; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
1297; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
1298; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(1)
1299; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v16, 4, v0
1300; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v17, 12, v0
1301; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v11, 28, v0
1302; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v12, 24, v0
1303; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v13, 20, v0
1304; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
1305; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
1306; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
1307; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
1308; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
1309; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
1310; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v17, 12, v17
1311; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
1312; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
1313; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
1314; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
1315; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
1316; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v1, v1, v17, v3
1317; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v9
1318; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
1319; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
1320; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v15
1321; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v1, v10, v16, v1
1322; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
1323; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v14
1324; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
1325; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
1326; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v3, v0, v1
1327; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v7
1328; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v10
1329; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
1330; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v13
1331; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v8, v9, v0
1332; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
1333; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v12
1334; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
1335; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
1336; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v1, v3, v0
1337; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v5
1338; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v8
1339; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v4, 12, v4
1340; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v11
1341; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v6, v7, v0
1342; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v4, 12, v4
1343; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
1344; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v1, v3, v0
1345; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v4, v5, v0
1346; GFX10-DL-NOXNACK-NEXT:    global_store_byte v2, v0, s[0:1]
1347; GFX10-DL-NOXNACK-NEXT:    s_endpgm
1348; GFX10-DL-LABEL: idot8_acc8:
1349; GFX10-DL:       ; %bb.0: ; %entry
1350; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
1351; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
1352; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1353; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1354; GFX10-DL-NEXT:    s_mov_b32 s14, -1
1355; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
1356; GFX10-DL-NEXT:    s_add_u32 s12, s12, s3
1357; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1358; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
1359; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1360; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
1361; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
1362; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
1363; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1364; GFX10-DL-NEXT:    s_lshr_b32 s2, s0, 12
1365; GFX10-DL-NEXT:    s_lshr_b32 s3, s1, 12
1366; GFX10-DL-NEXT:    s_bfe_i32 s6, s0, 0x40000
1367; GFX10-DL-NEXT:    s_bfe_i32 s7, s1, 0x40000
1368; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v2, 12, s2
1369; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s3
1370; GFX10-DL-NEXT:    s_bfe_i32 s8, s0, 0x40004
1371; GFX10-DL-NEXT:    s_bfe_i32 s9, s0, 0x40008
1372; GFX10-DL-NEXT:    s_bfe_i32 s10, s1, 0x40008
1373; GFX10-DL-NEXT:    s_bfe_i32 s2, s1, 0x40004
1374; GFX10-DL-NEXT:    v_mul_i32_i24_e64 v4, s9, s10
1375; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v2, 12, v2
1376; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v3, 12, v3
1377; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40010
1378; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1379; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s6, s7, v1
1380; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s8, s2, v1
1381; GFX10-DL-NEXT:    s_movk_i32 s2, 0xff
1382; GFX10-DL-NEXT:    v_and_b32_e32 v2, s2, v2
1383; GFX10-DL-NEXT:    v_and_b32_e32 v3, s2, v3
1384; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
1385; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40010
1386; GFX10-DL-NEXT:    v_mad_u32_u24 v1, v2, v3, v1
1387; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
1388; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40014
1389; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40014
1390; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
1391; GFX10-DL-NEXT:    s_bfe_i32 s2, s0, 0x40018
1392; GFX10-DL-NEXT:    s_bfe_i32 s3, s1, 0x40018
1393; GFX10-DL-NEXT:    s_ashr_i32 s0, s0, 28
1394; GFX10-DL-NEXT:    s_ashr_i32 s1, s1, 28
1395; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s2, s3, v1
1396; GFX10-DL-NEXT:    v_mad_i32_i24 v1, s0, s1, v1
1397; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
1398; GFX10-DL-NEXT:    s_endpgm
1399                                       <8 x i4> addrspace(1)* %src2,
1400                                       i8 addrspace(1)* nocapture %dst) {
1401entry:
1402  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1403  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
1404  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
1405  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
1406  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
1407
1408  %v1e0 = extractelement <8 x i4> %vec1, i64 0
1409  %cv1e0 = sext i4 %v1e0 to i8
1410  %v2e0 = extractelement <8 x i4> %vec2, i64 0
1411  %cv2e0 = sext i4 %v2e0 to i8
1412  %mul0 = mul nuw nsw i8 %cv1e0, %cv2e0
1413
1414  %v1e1 = extractelement <8 x i4> %vec1, i64 1
1415  %cv1e1 = sext i4 %v1e1 to i8
1416  %v2e1 = extractelement <8 x i4> %vec2, i64 1
1417  %cv2e1 = sext i4 %v2e1 to i8
1418  %mul1 = mul nuw nsw i8 %cv1e1, %cv2e1
1419
1420  %v1e2 = extractelement <8 x i4> %vec1, i64 2
1421  %cv1e2 = sext i4 %v1e2 to i8
1422  %v2e2 = extractelement <8 x i4> %vec2, i64 2
1423  %cv2e2 = sext i4 %v2e2 to i8
1424  %mul2 = mul nuw nsw i8 %cv1e2, %cv2e2
1425
1426  %v1e3 = extractelement <8 x i4> %vec1, i64 3
1427  %cv1e3 = sext i4 %v1e3 to i8
1428  %v2e3 = extractelement <8 x i4> %vec2, i64 3
1429  %cv2e3 = sext i4 %v2e3 to i8
1430  %mul3 = mul nuw nsw i8 %cv1e3, %cv2e3
1431
1432  %v1e4 = extractelement <8 x i4> %vec1, i64 4
1433  %cv1e4 = sext i4 %v1e4 to i8
1434  %v2e4 = extractelement <8 x i4> %vec2, i64 4
1435  %cv2e4 = sext i4 %v2e4 to i8
1436  %mul4 = mul nuw nsw i8 %cv1e4, %cv2e4
1437
1438  %v1e5 = extractelement <8 x i4> %vec1, i64 5
1439  %cv1e5 = sext i4 %v1e5 to i8
1440  %v2e5 = extractelement <8 x i4> %vec2, i64 5
1441  %cv2e5 = sext i4 %v2e5 to i8
1442  %mul5 = mul nuw nsw i8 %cv1e5, %cv2e5
1443
1444  %v1e6 = extractelement <8 x i4> %vec1, i64 6
1445  %cv1e6 = sext i4 %v1e6 to i8
1446  %v2e6 = extractelement <8 x i4> %vec2, i64 6
1447  %cv2e6 = sext i4 %v2e6 to i8
1448  %mul6 = mul nuw nsw i8 %cv1e6, %cv2e6
1449
1450  %v1e7 = extractelement <8 x i4> %vec1, i64 7
1451  %cv1e7 = sext i4 %v1e7 to i8
1452  %v2e7 = extractelement <8 x i4> %vec2, i64 7
1453  %cv2e7 = sext i4 %v2e7 to i8
1454  %mul7 = mul nuw nsw i8 %cv1e7, %cv2e7
1455
1456  %acc = load i8, i8 addrspace(1)* %dst, align 4
1457  %add1 = add i8 %mul0, %acc
1458  %add2 = add i8 %add1, %mul1
1459  %add3 = add i8 %add2, %mul2
1460  %add4 = add i8 %add3, %mul3
1461  %add5 = add i8 %add4, %mul4
1462  %add6 = add i8 %add5, %mul5
1463  %add7 = add i8 %add6, %mul6
1464  %add8 = add i8 %add7, %mul7
1465
1466  store i8 %add8, i8 addrspace(1)* %dst, align 4
1467  ret void
1468}
1469
1470; Make sure the pattern is not recognized if there are multiple uses of the
1471; intermediate multiplications.
define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1,
; GFX7-LABEL: idot8_multiuses_mul1:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s14, -1
; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
; GFX7-NEXT:    s_add_u32 s12, s12, s3
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s10, 0
; GFX7-NEXT:    s_mov_b32 s11, s3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_addc_u32 s13, s13, 0
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 4
; GFX7-NEXT:    v_bfe_i32 v3, v2, 4, 4
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_bfe_i32 v9, v0, 0, 4
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mad_i32_i24 v16, v1, v9, s4
; GFX7-NEXT:    v_bfe_i32 v10, v0, 4, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v1, v9, v16
; GFX7-NEXT:    v_bfe_i32 v4, v2, 8, 4
; GFX7-NEXT:    v_bfe_i32 v11, v0, 8, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v10, v1
; GFX7-NEXT:    v_bfe_i32 v5, v2, 12, 4
; GFX7-NEXT:    v_bfe_i32 v12, v0, 12, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v4, v11, v1
; GFX7-NEXT:    v_bfe_i32 v6, v2, 16, 4
; GFX7-NEXT:    v_bfe_i32 v13, v0, 16, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v5, v12, v1
; GFX7-NEXT:    v_bfe_i32 v7, v2, 20, 4
; GFX7-NEXT:    v_bfe_i32 v14, v0, 20, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v6, v13, v1
; GFX7-NEXT:    v_bfe_i32 v8, v2, 24, 4
; GFX7-NEXT:    v_bfe_i32 v15, v0, 24, 4
; GFX7-NEXT:    v_mad_i32_i24 v1, v7, v14, v1
; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
; GFX7-NEXT:    v_mad_i32_i24 v1, v8, v15, v1
; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, v1
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: idot8_multiuses_mul1:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v3, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s7
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX8-NEXT:    s_mov_b32 s10, -1
; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
; GFX8-NEXT:    s_add_u32 s8, s8, s3
; GFX8-NEXT:    s_addc_u32 s9, s9, 0
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 4
; GFX8-NEXT:    v_bfe_i32 v4, v3, 4, 4
; GFX8-NEXT:    v_bfe_i32 v6, v3, 8, 4
; GFX8-NEXT:    v_bfe_i32 v8, v3, 12, 4
; GFX8-NEXT:    v_bfe_i32 v10, v3, 16, 4
; GFX8-NEXT:    v_bfe_i32 v12, v3, 20, 4
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 4
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mad_i32_i24 v16, v1, v2, s2
; GFX8-NEXT:    v_bfe_i32 v5, v0, 4, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v1, v2, v16
; GFX8-NEXT:    v_bfe_i32 v7, v0, 8, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT:    v_bfe_i32 v9, v0, 12, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v6, v7, v1
; GFX8-NEXT:    v_bfe_i32 v11, v0, 16, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v8, v9, v1
; GFX8-NEXT:    v_bfe_i32 v13, v0, 20, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v10, v11, v1
; GFX8-NEXT:    v_bfe_i32 v14, v3, 24, 4
; GFX8-NEXT:    v_bfe_i32 v15, v0, 24, 4
; GFX8-NEXT:    v_mad_i32_i24 v1, v12, v13, v1
; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 28, v3
; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
; GFX8-NEXT:    v_mad_i32_i24 v1, v14, v15, v1
; GFX8-NEXT:    v_mad_i32_i24 v0, v3, v0, v1
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v16, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: idot8_multiuses_mul1:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s10, -1
; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
; GFX9-NEXT:    s_add_u32 s8, s8, s3
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_addc_u32 s9, s9, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_bfe_i32 v3, v1, 0, 4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_bfe_i32 v4, v2, 0, 4
; GFX9-NEXT:    v_bfe_i32 v5, v1, 4, 4
; GFX9-NEXT:    v_bfe_i32 v6, v2, 4, 4
; GFX9-NEXT:    v_bfe_i32 v7, v1, 8, 4
; GFX9-NEXT:    v_bfe_i32 v8, v2, 8, 4
; GFX9-NEXT:    v_bfe_i32 v9, v1, 12, 4
; GFX9-NEXT:    v_bfe_i32 v10, v2, 12, 4
; GFX9-NEXT:    v_bfe_i32 v11, v1, 16, 4
; GFX9-NEXT:    v_bfe_i32 v12, v2, 16, 4
; GFX9-NEXT:    v_bfe_i32 v13, v1, 20, 4
; GFX9-NEXT:    v_bfe_i32 v14, v2, 20, 4
; GFX9-NEXT:    v_bfe_i32 v15, v1, 24, 4
; GFX9-NEXT:    v_bfe_i32 v16, v2, 24, 4
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
; GFX9-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mad_i32_i24 v2, v3, v4, s0
; GFX9-NEXT:    v_mul_i32_i24_e32 v5, v5, v6
; GFX9-NEXT:    v_mul_i32_i24_e32 v6, v7, v8
; GFX9-NEXT:    v_mad_i32_i24 v3, v3, v4, v2
; GFX9-NEXT:    v_mul_i32_i24_e32 v7, v9, v10
; GFX9-NEXT:    v_mul_i32_i24_e32 v8, v11, v12
; GFX9-NEXT:    v_add3_u32 v3, v3, v5, v6
; GFX9-NEXT:    v_mul_i32_i24_e32 v9, v13, v14
; GFX9-NEXT:    v_mul_i32_i24_e32 v10, v15, v16
; GFX9-NEXT:    v_add3_u32 v3, v3, v7, v8
; GFX9-NEXT:    v_add3_u32 v3, v3, v9, v10
; GFX9-NEXT:    v_add3_u32 v1, v3, v1, v2
; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX9-DL-LABEL: idot8_multiuses_mul1:
; GFX9-DL:       ; %bb.0: ; %entry
; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-DL-NEXT:    s_mov_b32 s10, -1
; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
; GFX9-DL-NEXT:    v_bfe_i32 v3, v1, 0, 4
; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-DL-NEXT:    v_bfe_i32 v4, v2, 0, 4
; GFX9-DL-NEXT:    v_bfe_i32 v5, v1, 4, 4
; GFX9-DL-NEXT:    v_bfe_i32 v6, v2, 4, 4
; GFX9-DL-NEXT:    v_bfe_i32 v7, v1, 8, 4
; GFX9-DL-NEXT:    v_bfe_i32 v8, v2, 8, 4
; GFX9-DL-NEXT:    v_bfe_i32 v9, v1, 12, 4
; GFX9-DL-NEXT:    v_bfe_i32 v10, v2, 12, 4
; GFX9-DL-NEXT:    v_bfe_i32 v11, v1, 16, 4
; GFX9-DL-NEXT:    v_bfe_i32 v12, v2, 16, 4
; GFX9-DL-NEXT:    v_bfe_i32 v13, v1, 20, 4
; GFX9-DL-NEXT:    v_bfe_i32 v14, v2, 20, 4
; GFX9-DL-NEXT:    v_bfe_i32 v15, v1, 24, 4
; GFX9-DL-NEXT:    v_bfe_i32 v16, v2, 24, 4
; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT:    v_mad_i32_i24 v2, v3, v4, s0
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v5, v5, v6
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v6, v7, v8
; GFX9-DL-NEXT:    v_mad_i32_i24 v3, v3, v4, v2
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v7, v9, v10
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v8, v11, v12
; GFX9-DL-NEXT:    v_add3_u32 v3, v3, v5, v6
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v9, v13, v14
; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v10, v15, v16
; GFX9-DL-NEXT:    v_add3_u32 v3, v3, v7, v8
; GFX9-DL-NEXT:    v_add3_u32 v3, v3, v9, v10
; GFX9-DL-NEXT:    v_add3_u32 v1, v3, v1, v2
; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT:    s_endpgm
;
; GFX10-DL-XNACK-LABEL: idot8_multiuses_mul1:
; GFX10-DL-XNACK:       ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT:    s_add_u32 s8, s8, s3
; GFX10-DL-XNACK-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT:    s_clause 0x1
; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(1)
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v0, v1, 0, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v3, v1, 4, 4
; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v4, v2, 4, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v5, v1, 8, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v6, v2, 8, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v7, v2, 0, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v8, v1, 12, 4
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v3, v3, v4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v9, v2, 12, 4
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v4, v5, v6
; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT:    v_mad_i32_i24 v5, v0, v7, s2
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v6, v1, 16, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v10, v2, 16, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v11, v1, 20, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v12, v2, 20, 4
; GFX10-DL-XNACK-NEXT:    v_mad_i32_i24 v0, v0, v7, v5
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v7, v1, 24, 4
; GFX10-DL-XNACK-NEXT:    v_bfe_i32 v13, v2, 24, 4
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v8, v8, v9
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v6, v6, v10
; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v0, v3, v4
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v3, v11, v12
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v4, v7, v13
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
; GFX10-DL-XNACK-NEXT:    v_ashrrev_i32_e32 v2, 28, v2
; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v0, v8, v6
; GFX10-DL-XNACK-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v0, v3, v4
; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-DL-XNACK-NEXT:    v_add3_u32 v0, v0, v1, v5
; GFX10-DL-XNACK-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX10-DL-XNACK-NEXT:    s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_multiuses_mul1:
; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s8, s8, s3
; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[6:7]
; GFX10-DL-NOXNACK-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(1)
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v2, v1, 0, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v3, v1, 4, 4
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v4, v0, 4, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v5, v1, 8, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v6, v0, 8, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v7, v0, 0, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v8, v1, 12, 4
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v3, v3, v4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v9, v0, 12, 4
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v4, v5, v6
; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT:    v_mad_i32_i24 v5, v2, v7, s2
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v6, v1, 16, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v10, v0, 16, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v11, v1, 20, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v12, v0, 20, 4
; GFX10-DL-NOXNACK-NEXT:    v_mad_i32_i24 v2, v2, v7, v5
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v7, v1, 24, 4
; GFX10-DL-NOXNACK-NEXT:    v_bfe_i32 v13, v0, 24, 4
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v8, v8, v9
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v6, v6, v10
; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v2, v2, v3, v4
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v3, v11, v12
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v4, v7, v13
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v2, v2, v8, v6
; GFX10-DL-NOXNACK-NEXT:    v_mul_i32_i24_e32 v0, v1, v0
; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v1, v2, v3, v4
; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v0, v1, v0, v5
; GFX10-DL-NOXNACK-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX10-DL-NOXNACK-NEXT:    s_endpgm
                                                <8 x i4> addrspace(1)* %src2,
                                                i32 addrspace(1)* nocapture %dst) {
entry:
  ; Each work-item selects its own pair of <8 x i4> vectors through the
  ; workitem id and loads both of them.
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2

  ; For every lane N in 0..7: sign-extend the i4 element of both vectors to
  ; i32 and multiply the two extended values into %mulN.
  %v1e0 = extractelement <8 x i4> %vec1, i64 0
  %cv1e0 = sext i4 %v1e0 to i32
  %v2e0 = extractelement <8 x i4> %vec2, i64 0
  %cv2e0 = sext i4 %v2e0 to i32
  %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0

  %v1e1 = extractelement <8 x i4> %vec1, i64 1
  %cv1e1 = sext i4 %v1e1 to i32
  %v2e1 = extractelement <8 x i4> %vec2, i64 1
  %cv2e1 = sext i4 %v2e1 to i32
  %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1

  %v1e2 = extractelement <8 x i4> %vec1, i64 2
  %cv1e2 = sext i4 %v1e2 to i32
  %v2e2 = extractelement <8 x i4> %vec2, i64 2
  %cv2e2 = sext i4 %v2e2 to i32
  %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2

  %v1e3 = extractelement <8 x i4> %vec1, i64 3
  %cv1e3 = sext i4 %v1e3 to i32
  %v2e3 = extractelement <8 x i4> %vec2, i64 3
  %cv2e3 = sext i4 %v2e3 to i32
  %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3

  %v1e4 = extractelement <8 x i4> %vec1, i64 4
  %cv1e4 = sext i4 %v1e4 to i32
  %v2e4 = extractelement <8 x i4> %vec2, i64 4
  %cv2e4 = sext i4 %v2e4 to i32
  %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4

  %v1e5 = extractelement <8 x i4> %vec1, i64 5
  %cv1e5 = sext i4 %v1e5 to i32
  %v2e5 = extractelement <8 x i4> %vec2, i64 5
  %cv2e5 = sext i4 %v2e5 to i32
  %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5

  %v1e6 = extractelement <8 x i4> %vec1, i64 6
  %cv1e6 = sext i4 %v1e6 to i32
  %v2e6 = extractelement <8 x i4> %vec2, i64 6
  %cv2e6 = sext i4 %v2e6 to i32
  %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6

  %v1e7 = extractelement <8 x i4> %vec1, i64 7
  %cv1e7 = sext i4 %v1e7 to i32
  %v2e7 = extractelement <8 x i4> %vec2, i64 7
  %cv2e7 = sext i4 %v2e7 to i32
  %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7

  ; The multiple uses this test is about: %mul0 feeds both %add and %add1,
  ; and %add is used again by %res after the accumulation chain.
  %acc = load i32, i32 addrspace(1)* %dst, align 4
  %add = add i32 %mul0, %acc
  %add1 = add i32 %mul0, %add
  %add2 = add i32 %add1, %mul1
  %add3 = add i32 %add2, %mul2
  %add4 = add i32 %add3, %mul3
  %add5 = add i32 %add4, %mul4
  %add6 = add i32 %add5, %mul5
  %add7 = add i32 %add6, %mul6
  %add8 = add i32 %add7, %mul7

  %res = add i32 %add, %add8
  store i32 %res, i32 addrspace(1)* %dst, align 4
  ret void
}
1905
1906; TODO: Support this pattern.
1907define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1,
1908; GFX7-LABEL: idot8_acc32_vecMul:
1909; GFX7:       ; %bb.0: ; %entry
1910; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1911; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1912; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1913; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1914; GFX7-NEXT:    s_mov_b32 s14, -1
1915; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
1916; GFX7-NEXT:    s_add_u32 s12, s12, s3
1917; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1918; GFX7-NEXT:    s_mov_b32 s10, 0
1919; GFX7-NEXT:    s_mov_b32 s11, s3
1920; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1921; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1922; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1923; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1924; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1925; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1926; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1927; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
1928; GFX7-NEXT:    s_mov_b32 s2, -1
1929; GFX7-NEXT:    s_addc_u32 s13, s13, 0
1930; GFX7-NEXT:    s_waitcnt vmcnt(1)
1931; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 28, v2
1932; GFX7-NEXT:    v_bfe_i32 v3, v2, 24, 4
1933; GFX7-NEXT:    v_bfe_i32 v4, v2, 20, 4
1934; GFX7-NEXT:    v_bfe_i32 v5, v2, 16, 4
1935; GFX7-NEXT:    v_bfe_i32 v6, v2, 12, 4
1936; GFX7-NEXT:    v_bfe_i32 v7, v2, 8, 4
1937; GFX7-NEXT:    v_bfe_i32 v8, v2, 4, 4
1938; GFX7-NEXT:    v_bfe_i32 v2, v2, 0, 4
1939; GFX7-NEXT:    s_waitcnt vmcnt(0)
1940; GFX7-NEXT:    v_ashrrev_i32_e32 v9, 28, v0
1941; GFX7-NEXT:    v_bfe_i32 v10, v0, 24, 4
1942; GFX7-NEXT:    v_bfe_i32 v11, v0, 20, 4
1943; GFX7-NEXT:    v_bfe_i32 v12, v0, 16, 4
1944; GFX7-NEXT:    v_bfe_i32 v13, v0, 12, 4
1945; GFX7-NEXT:    v_bfe_i32 v14, v0, 8, 4
1946; GFX7-NEXT:    v_bfe_i32 v15, v0, 4, 4
1947; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 4
1948; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1949; GFX7-NEXT:    v_mad_i32_i24 v0, v2, v0, s4
1950; GFX7-NEXT:    v_mad_i32_i24 v0, v8, v15, v0
1951; GFX7-NEXT:    v_mad_i32_i24 v0, v7, v14, v0
1952; GFX7-NEXT:    v_mad_i32_i24 v0, v6, v13, v0
1953; GFX7-NEXT:    v_mad_i32_i24 v0, v5, v12, v0
1954; GFX7-NEXT:    v_mad_i32_i24 v0, v4, v11, v0
1955; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v10, v0
1956; GFX7-NEXT:    v_mad_i32_i24 v0, v1, v9, v0
1957; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1958; GFX7-NEXT:    s_endpgm
1959;
1960; GFX8-LABEL: idot8_acc32_vecMul:
1961; GFX8:       ; %bb.0: ; %entry
1962; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1963; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1964; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1965; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1966; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1967; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1968; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1969; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1970; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1971; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1972; GFX8-NEXT:    v_mov_b32_e32 v1, s7
1973; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
1974; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1975; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1976; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
1977; GFX8-NEXT:    s_mov_b32 s10, -1
1978; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
1979; GFX8-NEXT:    s_add_u32 s8, s8, s3
1980; GFX8-NEXT:    s_addc_u32 s9, s9, 0
1981; GFX8-NEXT:    s_waitcnt vmcnt(1)
1982; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 28, v3
1983; GFX8-NEXT:    v_bfe_i32 v2, v3, 24, 4
1984; GFX8-NEXT:    v_bfe_i32 v4, v3, 20, 4
1985; GFX8-NEXT:    v_bfe_i32 v5, v3, 16, 4
1986; GFX8-NEXT:    v_bfe_i32 v6, v3, 12, 4
1987; GFX8-NEXT:    v_bfe_i32 v7, v3, 8, 4
1988; GFX8-NEXT:    v_bfe_i32 v8, v3, 4, 4
1989; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 4
1990; GFX8-NEXT:    s_waitcnt vmcnt(0)
1991; GFX8-NEXT:    v_ashrrev_i32_e32 v9, 28, v0
1992; GFX8-NEXT:    v_bfe_i32 v10, v0, 24, 4
1993; GFX8-NEXT:    v_bfe_i32 v11, v0, 20, 4
1994; GFX8-NEXT:    v_bfe_i32 v12, v0, 16, 4
1995; GFX8-NEXT:    v_bfe_i32 v13, v0, 12, 4
1996; GFX8-NEXT:    v_bfe_i32 v14, v0, 8, 4
1997; GFX8-NEXT:    v_bfe_i32 v15, v0, 4, 4
1998; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 4
1999; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2000; GFX8-NEXT:    v_mad_i32_i24 v0, v3, v0, s2
2001; GFX8-NEXT:    v_mad_i32_i24 v0, v8, v15, v0
2002; GFX8-NEXT:    v_mad_i32_i24 v0, v7, v14, v0
2003; GFX8-NEXT:    v_mad_i32_i24 v0, v6, v13, v0
2004; GFX8-NEXT:    v_mad_i32_i24 v0, v5, v12, v0
2005; GFX8-NEXT:    v_mad_i32_i24 v0, v4, v11, v0
2006; GFX8-NEXT:    v_mad_i32_i24 v0, v2, v10, v0
2007; GFX8-NEXT:    v_mad_i32_i24 v2, v1, v9, v0
2008; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2009; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2010; GFX8-NEXT:    flat_store_dword v[0:1], v2
2011; GFX8-NEXT:    s_endpgm
2012;
2013; GFX9-LABEL: idot8_acc32_vecMul:
2014; GFX9:       ; %bb.0: ; %entry
2015; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2016; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2017; GFX9-NEXT:    s_mov_b32 s10, -1
2018; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
2019; GFX9-NEXT:    s_add_u32 s8, s8, s3
2020; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2021; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2022; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2023; GFX9-NEXT:    s_addc_u32 s9, s9, 0
2024; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2025; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
2026; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
2027; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
2028; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2029; GFX9-NEXT:    s_waitcnt vmcnt(1)
2030; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 28, v1
2031; GFX9-NEXT:    v_bfe_i32 v4, v1, 24, 4
2032; GFX9-NEXT:    v_bfe_i32 v5, v1, 20, 4
2033; GFX9-NEXT:    v_bfe_i32 v6, v1, 16, 4
2034; GFX9-NEXT:    v_bfe_i32 v7, v1, 12, 4
2035; GFX9-NEXT:    v_bfe_i32 v8, v1, 8, 4
2036; GFX9-NEXT:    v_bfe_i32 v9, v1, 4, 4
2037; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 4
2038; GFX9-NEXT:    s_waitcnt vmcnt(0)
2039; GFX9-NEXT:    v_ashrrev_i32_e32 v10, 28, v2
2040; GFX9-NEXT:    v_bfe_i32 v11, v2, 24, 4
2041; GFX9-NEXT:    v_bfe_i32 v12, v2, 20, 4
2042; GFX9-NEXT:    v_bfe_i32 v13, v2, 16, 4
2043; GFX9-NEXT:    v_bfe_i32 v14, v2, 12, 4
2044; GFX9-NEXT:    v_bfe_i32 v15, v2, 8, 4
2045; GFX9-NEXT:    v_bfe_i32 v16, v2, 4, 4
2046; GFX9-NEXT:    v_bfe_i32 v2, v2, 0, 4
2047; GFX9-NEXT:    v_mul_i32_i24_e32 v1, v1, v2
2048; GFX9-NEXT:    v_mul_i32_i24_e32 v2, v9, v16
2049; GFX9-NEXT:    v_mul_i32_i24_e32 v8, v8, v15
2050; GFX9-NEXT:    v_mul_i32_i24_e32 v7, v7, v14
2051; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2052; GFX9-NEXT:    v_add3_u32 v1, v1, s0, v2
2053; GFX9-NEXT:    v_mul_i32_i24_e32 v6, v6, v13
2054; GFX9-NEXT:    v_mul_i32_i24_e32 v5, v5, v12
2055; GFX9-NEXT:    v_add3_u32 v1, v1, v8, v7
2056; GFX9-NEXT:    v_mul_i32_i24_e32 v4, v4, v11
2057; GFX9-NEXT:    v_mul_i32_i24_e32 v3, v3, v10
2058; GFX9-NEXT:    v_add3_u32 v1, v1, v6, v5
2059; GFX9-NEXT:    v_add3_u32 v1, v1, v4, v3
2060; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
2061; GFX9-NEXT:    s_endpgm
2062;
2063; GFX9-DL-LABEL: idot8_acc32_vecMul:
2064; GFX9-DL:       ; %bb.0: ; %entry
2065; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2066; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2067; GFX9-DL-NEXT:    s_mov_b32 s10, -1
2068; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
2069; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
2070; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2071; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2072; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2073; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
2074; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2075; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2076; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2077; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
2078; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2079; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2080; GFX9-DL-NEXT:    v_dot8_i32_i4 v1, v1, v2, s0
2081; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
2082; GFX9-DL-NEXT:    s_endpgm
2083;
2084; GFX10-DL-XNACK-LABEL: idot8_acc32_vecMul:
2085; GFX10-DL-XNACK:       ; %bb.0: ; %entry
2086; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2087; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2088; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2089; GFX10-DL-XNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2090; GFX10-DL-XNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2091; GFX10-DL-XNACK-NEXT:    s_mov_b32 s10, -1
2092; GFX10-DL-XNACK-NEXT:    s_mov_b32 s11, 0x31c16000
2093; GFX10-DL-XNACK-NEXT:    s_add_u32 s8, s8, s3
2094; GFX10-DL-XNACK-NEXT:    s_addc_u32 s9, s9, 0
2095; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
2096; GFX10-DL-XNACK-NEXT:    s_clause 0x1
2097; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[4:5]
2098; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[6:7]
2099; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v0, 0
2100; GFX10-DL-XNACK-NEXT:    s_load_dword s2, s[0:1], 0x0
2101; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2102; GFX10-DL-XNACK-NEXT:    v_dot8_i32_i4 v1, v1, v2, s2
2103; GFX10-DL-XNACK-NEXT:    global_store_dword v0, v1, s[0:1]
2104; GFX10-DL-XNACK-NEXT:    s_endpgm
2105;
2106; GFX10-DL-NOXNACK-LABEL: idot8_acc32_vecMul:
2107; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
2108; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2109; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2110; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2111; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2112; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
2113; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2114; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s10, -1
2115; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s11, 0x31c16000
2116; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s8, s8, s3
2117; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s9, s9, 0
2118; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
2119; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
2120; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[4:5]
2121; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[6:7]
2122; GFX10-DL-NOXNACK-NEXT:    s_load_dword s2, s[0:1], 0x0
2123; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2124; GFX10-DL-NOXNACK-NEXT:    v_dot8_i32_i4 v0, v1, v0, s2
2125; GFX10-DL-NOXNACK-NEXT:    global_store_dword v2, v0, s[0:1]
2126; GFX10-DL-NOXNACK-NEXT:    s_endpgm
2127; GFX10-DL-LABEL: idot8_acc32_vecMul:
2128; GFX10-DL:       ; %bb.0: ; %entry
2129; GFX10-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2130; GFX10-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2131; GFX10-DL-NEXT:    s_mov_b32 s10, -1
2132; GFX10-DL-NEXT:    s_mov_b32 s11, 0x31c16000
2133; GFX10-DL-NEXT:    s_add_u32 s8, s8, s3
2134; GFX10-DL-NEXT:    s_clause 0x1
2135; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
2136; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2137; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
2138; GFX10-DL-NEXT:    s_addc_u32 s9, s9, 0
2139; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2140; GFX10-DL-NEXT:    s_load_dword s6, s[4:5], 0x0
2141; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
2142; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
2143; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2144; GFX10-DL-NEXT:    v_mov_b32_e32 v0, s6
2145; GFX10-DL-NEXT:    v_dot8_i32_i4 v0, s0, s1, v0
2146; GFX10-DL-NEXT:    global_store_dword v1, v0, s[4:5]
2147; GFX10-DL-NEXT:    s_endpgm
2148                                              <8 x i4> addrspace(1)* %src2,
2149                                              i32 addrspace(1)* nocapture %dst) {
2150entry:
2151  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2152  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
2153  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
2154  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
2155  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
2156
2157  %cvec1 = sext <8 x i4> %vec1 to <8 x i32>
2158  %cvec2 = sext <8 x i4> %vec2 to <8 x i32>
2159
2160  %mul = mul <8 x i32> %cvec1, %cvec2
2161  %mul0 = extractelement <8 x i32> %mul, i64 0
2162  %mul1 = extractelement <8 x i32> %mul, i64 1
2163  %mul2 = extractelement <8 x i32> %mul, i64 2
2164  %mul3 = extractelement <8 x i32> %mul, i64 3
2165  %mul4 = extractelement <8 x i32> %mul, i64 4
2166  %mul5 = extractelement <8 x i32> %mul, i64 5
2167  %mul6 = extractelement <8 x i32> %mul, i64 6
2168  %mul7 = extractelement <8 x i32> %mul, i64 7
2169
2170  %acc = load i32, i32 addrspace(1)* %dst, align 4
2171  %add1 = add i32 %mul0, %acc
2172  %add2 = add i32 %add1, %mul1
2173  %add3 = add i32 %add2, %mul2
2174  %add4 = add i32 %add3, %mul3
2175  %add5 = add i32 %add4, %mul4
2176  %add6 = add i32 %add5, %mul5
2177  %add7 = add i32 %add6, %mul6
2178  %add8 = add i32 %add7, %mul7
2179
2180  store i32 %add8, i32 addrspace(1)* %dst, align 4
2181  ret void
2182}
2183
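; TODO: Support this pattern. With a 16-bit accumulator the vector multiply is
; still expanded into packed 16-bit multiplies and adds instead of being
; selected as v_dot8_i32_i4 (see the gfx906 and gfx1011/gfx1012 checks below).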
2185define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
2186; GFX7-LABEL: idot8_acc16_vecMul:
2187; GFX7:       ; %bb.0: ; %entry
2188; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2189; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2190; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2191; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2192; GFX7-NEXT:    s_mov_b32 s14, -1
2193; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
2194; GFX7-NEXT:    s_add_u32 s12, s12, s3
2195; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2196; GFX7-NEXT:    s_mov_b32 s10, 0
2197; GFX7-NEXT:    s_mov_b32 s11, s3
2198; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2199; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
2200; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2201; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2202; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2203; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
2204; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2205; GFX7-NEXT:    s_mov_b32 s2, -1
2206; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
2207; GFX7-NEXT:    s_mov_b32 s4, 0xffff
2208; GFX7-NEXT:    s_addc_u32 s13, s13, 0
2209; GFX7-NEXT:    s_waitcnt vmcnt(2)
2210; GFX7-NEXT:    v_bfe_i32 v3, v2, 20, 4
2211; GFX7-NEXT:    v_bfe_i32 v4, v2, 16, 4
2212; GFX7-NEXT:    v_bfe_i32 v5, v2, 4, 4
2213; GFX7-NEXT:    v_bfe_i32 v6, v2, 0, 4
2214; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2215; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
2216; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
2217; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
2218; GFX7-NEXT:    s_waitcnt vmcnt(1)
2219; GFX7-NEXT:    v_bfe_i32 v10, v0, 20, 4
2220; GFX7-NEXT:    v_bfe_i32 v11, v0, 16, 4
2221; GFX7-NEXT:    v_bfe_i32 v12, v0, 4, 4
2222; GFX7-NEXT:    v_bfe_i32 v13, v0, 0, 4
2223; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
2224; GFX7-NEXT:    v_or_b32_e32 v4, v6, v5
2225; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
2226; GFX7-NEXT:    v_and_b32_e32 v6, s4, v11
2227; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
2228; GFX7-NEXT:    v_and_b32_e32 v11, s4, v13
2229; GFX7-NEXT:    v_bfe_i32 v14, v0, 24, 4
2230; GFX7-NEXT:    v_ashrrev_i32_e32 v16, 28, v0
2231; GFX7-NEXT:    v_or_b32_e32 v5, v6, v5
2232; GFX7-NEXT:    v_or_b32_e32 v6, v11, v10
2233; GFX7-NEXT:    v_and_b32_e32 v12, s4, v14
2234; GFX7-NEXT:    v_and_b32_e32 v14, s4, v16
2235; GFX7-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
2236; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
2237; GFX7-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
2238; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
2239; GFX7-NEXT:    v_bfe_i32 v8, v2, 8, 4
2240; GFX7-NEXT:    v_bfe_i32 v15, v0, 8, 4
2241; GFX7-NEXT:    s_waitcnt vmcnt(0)
2242; GFX7-NEXT:    v_mad_u32_u24 v1, v4, v6, v1
2243; GFX7-NEXT:    v_bfe_i32 v7, v2, 24, 4
2244; GFX7-NEXT:    v_ashrrev_i32_e32 v9, 28, v2
2245; GFX7-NEXT:    v_bfe_i32 v2, v2, 12, 4
2246; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
2247; GFX7-NEXT:    v_bfe_i32 v0, v0, 12, 4
2248; GFX7-NEXT:    v_and_b32_e32 v13, s4, v15
2249; GFX7-NEXT:    v_mad_u32_u24 v1, v16, v11, v1
2250; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
2251; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
2252; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v13, v1
2253; GFX7-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
2254; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
2255; GFX7-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
2256; GFX7-NEXT:    v_and_b32_e32 v5, s4, v5
2257; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
2258; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v5, v0
2259; GFX7-NEXT:    v_and_b32_e32 v7, s4, v7
2260; GFX7-NEXT:    v_mad_u32_u24 v0, v15, v10, v0
2261; GFX7-NEXT:    v_and_b32_e32 v9, s4, v9
2262; GFX7-NEXT:    v_mad_u32_u24 v0, v7, v12, v0
2263; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v14, v0
2264; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
2265; GFX7-NEXT:    s_endpgm
2266;
2267; GFX8-LABEL: idot8_acc16_vecMul:
2268; GFX8:       ; %bb.0: ; %entry
2269; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2270; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2271; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2272; GFX8-NEXT:    v_mov_b32_e32 v5, 12
2273; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2274; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2275; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2276; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2277; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2278; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2279; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2280; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
2281; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2282; GFX8-NEXT:    flat_load_dword v2, v[0:1]
2283; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2284; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2285; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
2286; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2287; GFX8-NEXT:    s_mov_b32 s10, -1
2288; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
2289; GFX8-NEXT:    s_add_u32 s8, s8, s3
2290; GFX8-NEXT:    s_addc_u32 s9, s9, 0
2291; GFX8-NEXT:    s_waitcnt vmcnt(2)
2292; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 4, v3
2293; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
2294; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 12, v3
2295; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 20, v3
2296; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 28, v3
2297; GFX8-NEXT:    v_lshlrev_b16_sdwa v16, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2298; GFX8-NEXT:    s_waitcnt vmcnt(1)
2299; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 4, v2
2300; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 8, v2
2301; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 12, v2
2302; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 20, v2
2303; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 28, v2
2304; GFX8-NEXT:    v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2305; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 12, v3
2306; GFX8-NEXT:    v_lshlrev_b16_sdwa v18, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2307; GFX8-NEXT:    v_lshlrev_b16_sdwa v5, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2308; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 12, v2
2309; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
2310; GFX8-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
2311; GFX8-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
2312; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
2313; GFX8-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
2314; GFX8-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
2315; GFX8-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
2316; GFX8-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
2317; GFX8-NEXT:    s_waitcnt vmcnt(0)
2318; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
2319; GFX8-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
2320; GFX8-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
2321; GFX8-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
2322; GFX8-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
2323; GFX8-NEXT:    v_mad_u16 v2, v6, v11, v2
2324; GFX8-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
2325; GFX8-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
2326; GFX8-NEXT:    v_mad_u16 v2, v7, v12, v2
2327; GFX8-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
2328; GFX8-NEXT:    v_ashrrev_i16_e32 v17, 12, v17
2329; GFX8-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
2330; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
2331; GFX8-NEXT:    v_mad_u16 v2, v8, v13, v2
2332; GFX8-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
2333; GFX8-NEXT:    v_ashrrev_i16_e32 v14, 12, v14
2334; GFX8-NEXT:    v_mad_u16 v2, v17, v5, v2
2335; GFX8-NEXT:    v_lshlrev_b16_e32 v10, 12, v10
2336; GFX8-NEXT:    v_ashrrev_i16_e32 v16, 12, v16
2337; GFX8-NEXT:    v_lshlrev_b16_e32 v15, 12, v15
2338; GFX8-NEXT:    v_ashrrev_i16_e32 v18, 12, v18
2339; GFX8-NEXT:    v_mad_u16 v2, v9, v14, v2
2340; GFX8-NEXT:    v_ashrrev_i16_e32 v10, 12, v10
2341; GFX8-NEXT:    v_ashrrev_i16_e32 v15, 12, v15
2342; GFX8-NEXT:    v_mad_u16 v2, v16, v18, v2
2343; GFX8-NEXT:    v_mad_u16 v2, v10, v15, v2
2344; GFX8-NEXT:    flat_store_short v[0:1], v2
2345; GFX8-NEXT:    s_endpgm
2346;
2347; GFX9-LABEL: idot8_acc16_vecMul:
2348; GFX9:       ; %bb.0: ; %entry
2349; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2350; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2351; GFX9-NEXT:    s_mov_b32 s10, -1
2352; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
2353; GFX9-NEXT:    s_add_u32 s8, s8, s3
2354; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2355; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2356; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2357; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
2358; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2359; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
2360; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
2361; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2362; GFX9-NEXT:    global_load_ushort v3, v0, s[2:3]
2363; GFX9-NEXT:    s_addc_u32 s9, s9, 0
2364; GFX9-NEXT:    s_waitcnt vmcnt(2)
2365; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 28, v1
2366; GFX9-NEXT:    v_bfe_u32 v6, v1, 24, 4
2367; GFX9-NEXT:    v_bfe_u32 v7, v1, 20, 4
2368; GFX9-NEXT:    v_bfe_u32 v8, v1, 16, 4
2369; GFX9-NEXT:    v_bfe_u32 v9, v1, 12, 4
2370; GFX9-NEXT:    v_bfe_u32 v10, v1, 8, 4
2371; GFX9-NEXT:    v_bfe_u32 v11, v1, 4, 4
2372; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
2373; GFX9-NEXT:    s_waitcnt vmcnt(1)
2374; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
2375; GFX9-NEXT:    v_bfe_u32 v13, v2, 24, 4
2376; GFX9-NEXT:    v_bfe_u32 v14, v2, 20, 4
2377; GFX9-NEXT:    v_bfe_u32 v15, v2, 16, 4
2378; GFX9-NEXT:    v_bfe_u32 v16, v2, 12, 4
2379; GFX9-NEXT:    v_bfe_u32 v17, v2, 8, 4
2380; GFX9-NEXT:    v_bfe_u32 v18, v2, 4, 4
2381; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
2382; GFX9-NEXT:    v_and_b32_e32 v1, v4, v1
2383; GFX9-NEXT:    v_and_b32_e32 v2, v4, v2
2384; GFX9-NEXT:    v_lshl_or_b32 v1, v11, 16, v1
2385; GFX9-NEXT:    v_lshl_or_b32 v2, v18, 16, v2
2386; GFX9-NEXT:    v_and_b32_e32 v10, v4, v10
2387; GFX9-NEXT:    v_and_b32_e32 v6, v4, v6
2388; GFX9-NEXT:    v_and_b32_e32 v17, v4, v17
2389; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]
2390; GFX9-NEXT:    v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1]
2391; GFX9-NEXT:    v_lshl_or_b32 v9, v9, 16, v10
2392; GFX9-NEXT:    v_lshl_or_b32 v5, v5, 16, v6
2393; GFX9-NEXT:    v_lshl_or_b32 v6, v16, 16, v17
2394; GFX9-NEXT:    v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
2395; GFX9-NEXT:    v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
2396; GFX9-NEXT:    v_and_b32_e32 v8, v4, v8
2397; GFX9-NEXT:    v_and_b32_e32 v15, v4, v15
2398; GFX9-NEXT:    v_and_b32_e32 v4, v4, v13
2399; GFX9-NEXT:    v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1]
2400; GFX9-NEXT:    v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1]
2401; GFX9-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
2402; GFX9-NEXT:    v_lshl_or_b32 v7, v7, 16, v8
2403; GFX9-NEXT:    v_lshl_or_b32 v8, v14, 16, v15
2404; GFX9-NEXT:    v_lshl_or_b32 v4, v12, 16, v4
2405; GFX9-NEXT:    v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
2406; GFX9-NEXT:    v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
2407; GFX9-NEXT:    s_waitcnt vmcnt(0)
2408; GFX9-NEXT:    v_add_u16_e32 v2, v1, v3
2409; GFX9-NEXT:    v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1]
2410; GFX9-NEXT:    v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1]
2411; GFX9-NEXT:    v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1]
2412; GFX9-NEXT:    v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1]
2413; GFX9-NEXT:    v_pk_mul_lo_u16 v6, v9, v6
2414; GFX9-NEXT:    v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2415; GFX9-NEXT:    v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
2416; GFX9-NEXT:    v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
2417; GFX9-NEXT:    v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1]
2418; GFX9-NEXT:    v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
2419; GFX9-NEXT:    v_add_u16_e32 v1, v1, v6
2420; GFX9-NEXT:    v_pk_mul_lo_u16 v4, v5, v4
2421; GFX9-NEXT:    v_pk_mul_lo_u16 v5, v7, v8
2422; GFX9-NEXT:    v_add_u16_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2423; GFX9-NEXT:    v_add_u16_e32 v1, v1, v5
2424; GFX9-NEXT:    v_add_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2425; GFX9-NEXT:    v_add_u16_e32 v1, v1, v4
2426; GFX9-NEXT:    v_add_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2427; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
2428; GFX9-NEXT:    s_endpgm
2429;
2430; GFX9-DL-LABEL: idot8_acc16_vecMul:
2431; GFX9-DL:       ; %bb.0: ; %entry
2432; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2433; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2434; GFX9-DL-NEXT:    s_mov_b32 s10, -1
2435; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
2436; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
2437; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2438; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2439; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2440; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 0xffff
2441; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2442; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2443; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2444; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2445; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[2:3]
2446; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
2447; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
2448; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 28, v1
2449; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 24, 4
2450; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 20, 4
2451; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 16, 4
2452; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 12, 4
2453; GFX9-DL-NEXT:    v_bfe_u32 v10, v1, 8, 4
2454; GFX9-DL-NEXT:    v_bfe_u32 v11, v1, 4, 4
2455; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
2456; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2457; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
2458; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 24, 4
2459; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 20, 4
2460; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 16, 4
2461; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 12, 4
2462; GFX9-DL-NEXT:    v_bfe_u32 v17, v2, 8, 4
2463; GFX9-DL-NEXT:    v_bfe_u32 v18, v2, 4, 4
2464; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
2465; GFX9-DL-NEXT:    v_and_b32_e32 v1, v4, v1
2466; GFX9-DL-NEXT:    v_and_b32_e32 v2, v4, v2
2467; GFX9-DL-NEXT:    v_lshl_or_b32 v1, v11, 16, v1
2468; GFX9-DL-NEXT:    v_lshl_or_b32 v2, v18, 16, v2
2469; GFX9-DL-NEXT:    v_and_b32_e32 v10, v4, v10
2470; GFX9-DL-NEXT:    v_and_b32_e32 v6, v4, v6
2471; GFX9-DL-NEXT:    v_and_b32_e32 v17, v4, v17
2472; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]
2473; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1]
2474; GFX9-DL-NEXT:    v_lshl_or_b32 v9, v9, 16, v10
2475; GFX9-DL-NEXT:    v_lshl_or_b32 v5, v5, 16, v6
2476; GFX9-DL-NEXT:    v_lshl_or_b32 v6, v16, 16, v17
2477; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
2478; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
2479; GFX9-DL-NEXT:    v_and_b32_e32 v8, v4, v8
2480; GFX9-DL-NEXT:    v_and_b32_e32 v15, v4, v15
2481; GFX9-DL-NEXT:    v_and_b32_e32 v4, v4, v13
2482; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1]
2483; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, v6 op_sel_hi:[0,1]
2484; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
2485; GFX9-DL-NEXT:    v_lshl_or_b32 v7, v7, 16, v8
2486; GFX9-DL-NEXT:    v_lshl_or_b32 v8, v14, 16, v15
2487; GFX9-DL-NEXT:    v_lshl_or_b32 v4, v12, 16, v4
2488; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
2489; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1]
2490; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2491; GFX9-DL-NEXT:    v_add_u16_e32 v2, v1, v3
2492; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1]
2493; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v5, 12, v5 op_sel_hi:[0,1]
2494; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v8, 12, v8 op_sel_hi:[0,1]
2495; GFX9-DL-NEXT:    v_pk_lshlrev_b16 v4, 12, v4 op_sel_hi:[0,1]
2496; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v6, v9, v6
2497; GFX9-DL-NEXT:    v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2498; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
2499; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
2500; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1]
2501; GFX9-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
2502; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v6
2503; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, v5, v4
2504; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, v7, v8
2505; GFX9-DL-NEXT:    v_add_u16_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2506; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v5
2507; GFX9-DL-NEXT:    v_add_u16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2508; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v4
2509; GFX9-DL-NEXT:    v_add_u16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2510; GFX9-DL-NEXT:    global_store_short v0, v1, s[2:3]
2511; GFX9-DL-NEXT:    s_endpgm
2512;
2513; GFX10-DL-XNACK-LABEL: idot8_acc16_vecMul:
2514; GFX10-DL-XNACK:       ; %bb.0: ; %entry
2515; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2516; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2517; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2518; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v4, 0xffff
2519; GFX10-DL-XNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2520; GFX10-DL-XNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2521; GFX10-DL-XNACK-NEXT:    s_mov_b32 s10, -1
2522; GFX10-DL-XNACK-NEXT:    s_mov_b32 s11, 0x31c16000
2523; GFX10-DL-XNACK-NEXT:    s_add_u32 s8, s8, s3
2524; GFX10-DL-XNACK-NEXT:    s_addc_u32 s9, s9, 0
2525; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
2526; GFX10-DL-XNACK-NEXT:    s_clause 0x1
2527; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[4:5]
2528; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[6:7]
2529; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v0, 0
2530; GFX10-DL-XNACK-NEXT:    global_load_ushort v3, v0, s[0:1]
2531; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(2)
2532; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v5, 28, v1
2533; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v6, v1, 24, 4
2534; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v7, v1, 20, 4
2535; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v8, v1, 16, 4
2536; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v9, v1, 12, 4
2537; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v10, v1, 8, 4
2538; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v11, v1, 4, 4
2539; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v1, 15, v1
2540; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(1)
2541; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v13, 15, v2
2542; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v16, v2, 4, 4
2543; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
2544; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v14, v2, 24, 4
2545; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v1, v4, v1
2546; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v13, v4, v13
2547; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v15, v2, 20, 4
2548; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v17, v2, 16, 4
2549; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v18, v2, 12, 4
2550; GFX10-DL-XNACK-NEXT:    v_bfe_u32 v2, v2, 8, 4
2551; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v1, v11, 16, v1
2552; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v11, v16, 16, v13
2553; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v10, v4, v10
2554; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v8, v4, v8
2555; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v2, v4, v2
2556; GFX10-DL-XNACK-NEXT:    v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]
2557; GFX10-DL-XNACK-NEXT:    v_pk_lshlrev_b16 v11, 12, v11 op_sel_hi:[0,1]
2558; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v9, v9, 16, v10
2559; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v7, v7, 16, v8
2560; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v2, v18, 16, v2
2561; GFX10-DL-XNACK-NEXT:    v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
2562; GFX10-DL-XNACK-NEXT:    v_pk_ashrrev_i16 v8, 12, v11 op_sel_hi:[0,1]
2563; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v10, v4, v17
2564; GFX10-DL-XNACK-NEXT:    v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1]
2565; GFX10-DL-XNACK-NEXT:    v_pk_lshlrev_b16 v2, 12, v2 op_sel_hi:[0,1]
2566; GFX10-DL-XNACK-NEXT:    v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1]
2567; GFX10-DL-XNACK-NEXT:    v_pk_mul_lo_u16 v1, v1, v8
2568; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v8, v15, 16, v10
2569; GFX10-DL-XNACK-NEXT:    v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
2570; GFX10-DL-XNACK-NEXT:    v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
2571; GFX10-DL-XNACK-NEXT:    v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
2572; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
2573; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
2574; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v1, v1, v3
2575; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v3, v4, v6
2576; GFX10-DL-XNACK-NEXT:    v_pk_lshlrev_b16 v6, 12, v8 op_sel_hi:[0,1]
2577; GFX10-DL-XNACK-NEXT:    v_and_b32_e32 v4, v4, v14
2578; GFX10-DL-XNACK-NEXT:    v_pk_mul_lo_u16 v2, v9, v2
2579; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v1, v1, v10
2580; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v3, v5, 16, v3
2581; GFX10-DL-XNACK-NEXT:    v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1]
2582; GFX10-DL-XNACK-NEXT:    v_lshl_or_b32 v4, v12, 16, v4
2583; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
2584; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v1, v1, v2
2585; GFX10-DL-XNACK-NEXT:    v_pk_lshlrev_b16 v2, 12, v3 op_sel_hi:[0,1]
2586; GFX10-DL-XNACK-NEXT:    v_pk_lshlrev_b16 v3, 12, v4 op_sel_hi:[0,1]
2587; GFX10-DL-XNACK-NEXT:    v_pk_mul_lo_u16 v4, v7, v5
2588; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v1, v1, v6
2589; GFX10-DL-XNACK-NEXT:    v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
2590; GFX10-DL-XNACK-NEXT:    v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
2591; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
2592; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v1, v1, v4
2593; GFX10-DL-XNACK-NEXT:    v_pk_mul_lo_u16 v2, v2, v3
2594; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v1, v1, v5
2595; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
2596; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v1, v1, v2
2597; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v1, v1, v3
2598; GFX10-DL-XNACK-NEXT:    global_store_short v0, v1, s[0:1]
2599; GFX10-DL-XNACK-NEXT:    s_endpgm
2600;
2601; GFX10-DL-NOXNACK-LABEL: idot8_acc16_vecMul:
2602; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
2603; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2604; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2605; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2606; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v2, 0
2607; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v4, 0xffff
2608; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2609; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2610; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s10, -1
2611; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s11, 0x31c16000
2612; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s8, s8, s3
2613; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s9, s9, 0
2614; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
2615; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
2616; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[4:5]
2617; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[6:7]
2618; GFX10-DL-NOXNACK-NEXT:    global_load_ushort v3, v2, s[0:1]
2619; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(2)
2620; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v5, 28, v1
2621; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v6, v1, 24, 4
2622; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v7, v1, 20, 4
2623; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v8, v1, 16, 4
2624; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v9, v1, 12, 4
2625; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v10, v1, 8, 4
2626; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v11, v1, 4, 4
2627; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v1, 15, v1
2628; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(1)
2629; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v13, 15, v0
2630; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v16, v0, 4, 4
2631; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v12, 28, v0
2632; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v14, v0, 24, 4
2633; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v1, v4, v1
2634; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v13, v4, v13
2635; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v15, v0, 20, 4
2636; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v17, v0, 16, 4
2637; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v18, v0, 12, 4
2638; GFX10-DL-NOXNACK-NEXT:    v_bfe_u32 v0, v0, 8, 4
2639; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v1, v11, 16, v1
2640; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v11, v16, 16, v13
2641; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v10, v4, v10
2642; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v8, v4, v8
2643; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v0, v4, v0
2644; GFX10-DL-NOXNACK-NEXT:    v_pk_lshlrev_b16 v1, 12, v1 op_sel_hi:[0,1]
2645; GFX10-DL-NOXNACK-NEXT:    v_pk_lshlrev_b16 v11, 12, v11 op_sel_hi:[0,1]
2646; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v9, v9, 16, v10
2647; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v7, v7, 16, v8
2648; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v0, v18, 16, v0
2649; GFX10-DL-NOXNACK-NEXT:    v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
2650; GFX10-DL-NOXNACK-NEXT:    v_pk_ashrrev_i16 v8, 12, v11 op_sel_hi:[0,1]
2651; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v10, v4, v17
2652; GFX10-DL-NOXNACK-NEXT:    v_pk_lshlrev_b16 v9, 12, v9 op_sel_hi:[0,1]
2653; GFX10-DL-NOXNACK-NEXT:    v_pk_lshlrev_b16 v0, 12, v0 op_sel_hi:[0,1]
2654; GFX10-DL-NOXNACK-NEXT:    v_pk_lshlrev_b16 v7, 12, v7 op_sel_hi:[0,1]
2655; GFX10-DL-NOXNACK-NEXT:    v_pk_mul_lo_u16 v1, v1, v8
2656; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v8, v15, 16, v10
2657; GFX10-DL-NOXNACK-NEXT:    v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1]
2658; GFX10-DL-NOXNACK-NEXT:    v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1]
2659; GFX10-DL-NOXNACK-NEXT:    v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1]
2660; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
2661; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
2662; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v1, v1, v3
2663; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v3, v4, v6
2664; GFX10-DL-NOXNACK-NEXT:    v_pk_lshlrev_b16 v6, 12, v8 op_sel_hi:[0,1]
2665; GFX10-DL-NOXNACK-NEXT:    v_and_b32_e32 v4, v4, v14
2666; GFX10-DL-NOXNACK-NEXT:    v_pk_mul_lo_u16 v0, v9, v0
2667; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v1, v1, v10
2668; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v3, v5, 16, v3
2669; GFX10-DL-NOXNACK-NEXT:    v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1]
2670; GFX10-DL-NOXNACK-NEXT:    v_lshl_or_b32 v4, v12, 16, v4
2671; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
2672; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v1, v0
2673; GFX10-DL-NOXNACK-NEXT:    v_pk_lshlrev_b16 v1, 12, v3 op_sel_hi:[0,1]
2674; GFX10-DL-NOXNACK-NEXT:    v_pk_lshlrev_b16 v3, 12, v4 op_sel_hi:[0,1]
2675; GFX10-DL-NOXNACK-NEXT:    v_pk_mul_lo_u16 v4, v7, v5
2676; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v6
2677; GFX10-DL-NOXNACK-NEXT:    v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
2678; GFX10-DL-NOXNACK-NEXT:    v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
2679; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
2680; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v4
2681; GFX10-DL-NOXNACK-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
2682; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v5
2683; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2684; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v1
2685; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v3
2686; GFX10-DL-NOXNACK-NEXT:    global_store_short v2, v0, s[0:1]
2687; GFX10-DL-NOXNACK-NEXT:    s_endpgm
2688; GFX10-DL-LABEL: idot8_acc16_vecMul:
2689; GFX10-DL:       ; %bb.0: ; %entry
2690; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
2691; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
2692; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2693; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2694; GFX10-DL-NEXT:    s_mov_b32 s14, -1
2695; GFX10-DL-NEXT:    s_mov_b32 s15, 0x31c16000
2696; GFX10-DL-NEXT:    s_add_u32 s12, s12, s3
2697; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2698; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
2699; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2700; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
2701; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
2702; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
2703; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2704; GFX10-DL-NEXT:    s_bfe_u32 s2, s0, 0x40018
2705; GFX10-DL-NEXT:    s_lshr_b32 s3, s0, 28
2706; GFX10-DL-NEXT:    s_bfe_u32 s6, s0, 0x40010
2707; GFX10-DL-NEXT:    s_bfe_u32 s7, s0, 0x40014
2708; GFX10-DL-NEXT:    s_bfe_u32 s8, s0, 0x40008
2709; GFX10-DL-NEXT:    s_bfe_u32 s9, s0, 0x4000c
2710; GFX10-DL-NEXT:    s_and_b32 s10, s0, 15
2711; GFX10-DL-NEXT:    s_bfe_u32 s0, s0, 0x40004
2712; GFX10-DL-NEXT:    s_and_b32 s11, s1, 15
2713; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s10, s0
2714; GFX10-DL-NEXT:    s_bfe_u32 s10, s1, 0x40004
2715; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v2, 12, s0 op_sel_hi:[0,1]
2716; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s11, s10
2717; GFX10-DL-NEXT:    s_bfe_u32 s11, s1, 0x4000c
2718; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1]
2719; GFX10-DL-NEXT:    s_bfe_u32 s0, s1, 0x40008
2720; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
2721; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s8, s8, s9
2722; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s0, s11
2723; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
2724; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v4, 12, s8 op_sel_hi:[0,1]
2725; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1]
2726; GFX10-DL-NEXT:    s_bfe_u32 s8, s1, 0x40010
2727; GFX10-DL-NEXT:    s_bfe_u32 s0, s1, 0x40014
2728; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v3
2729; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v3, 12, v4 op_sel_hi:[0,1]
2730; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1]
2731; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s6, s6, s7
2732; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s8, s0
2733; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v5, 12, s6 op_sel_hi:[0,1]
2734; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1]
2735; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v4
2736; GFX10-DL-NEXT:    s_bfe_u32 s10, s1, 0x40018
2737; GFX10-DL-NEXT:    s_lshr_b32 s0, s1, 28
2738; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s1, s2, s3
2739; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1]
2740; GFX10-DL-NEXT:    s_pack_ll_b32_b16 s0, s10, s0
2741; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1]
2742; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2743; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v2, v1
2744; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2745; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v2, 12, v5 op_sel_hi:[0,1]
2746; GFX10-DL-NEXT:    v_pk_lshlrev_b16 v5, 12, s1 op_sel_hi:[0,1]
2747; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2748; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v2, v2, v4
2749; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1]
2750; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2751; GFX10-DL-NEXT:    v_pk_ashrrev_i16 v3, 12, v5 op_sel_hi:[0,1]
2752; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2753; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v3, v3, v4
2754; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2755; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v3
2756; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2757; GFX10-DL-NEXT:    global_store_short v0, v1, s[4:5]
2758; GFX10-DL-NEXT:    s_endpgm
2759                                              <8 x i4> addrspace(1)* %src2,
2760                                              i16 addrspace(1)* nocapture %dst) {
2761entry:
2762  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2763  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
2764  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
2765  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
2766  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
2767
2768  %cvec1 = sext <8 x i4> %vec1 to <8 x i16>
2769  %cvec2 = sext <8 x i4> %vec2 to <8 x i16>
2770
2771  %mul = mul <8 x i16> %cvec1, %cvec2
2772  %mul0 = extractelement <8 x i16> %mul, i64 0
2773  %mul1 = extractelement <8 x i16> %mul, i64 1
2774  %mul2 = extractelement <8 x i16> %mul, i64 2
2775  %mul3 = extractelement <8 x i16> %mul, i64 3
2776  %mul4 = extractelement <8 x i16> %mul, i64 4
2777  %mul5 = extractelement <8 x i16> %mul, i64 5
2778  %mul6 = extractelement <8 x i16> %mul, i64 6
2779  %mul7 = extractelement <8 x i16> %mul, i64 7
2780
2781  %acc = load i16, i16 addrspace(1)* %dst, align 4
2782  %add1 = add i16 %mul0, %acc
2783  %add2 = add i16 %add1, %mul1
2784  %add3 = add i16 %add2, %mul2
2785  %add4 = add i16 %add3, %mul3
2786  %add5 = add i16 %add4, %mul4
2787  %add6 = add i16 %add5, %mul5
2788  %add7 = add i16 %add6, %mul6
2789  %add8 = add i16 %add7, %mul7
2790
2791  store i16 %add8, i16 addrspace(1)* %dst, align 4
2792  ret void
2793}
2794
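2795; TODO: Support this pattern (vector multiply with an 8-bit accumulator); it is currently expanded into separate 16-bit multiplies and adds, even on GFX9-DL.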
2796define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
2797; GFX7-LABEL: idot8_acc8_vecMul:
2798; GFX7:       ; %bb.0: ; %entry
2799; GFX7-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2800; GFX7-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2801; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2802; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2803; GFX7-NEXT:    s_mov_b32 s14, -1
2804; GFX7-NEXT:    s_mov_b32 s15, 0xe8f000
2805; GFX7-NEXT:    s_add_u32 s12, s12, s3
2806; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2807; GFX7-NEXT:    s_mov_b32 s10, 0
2808; GFX7-NEXT:    s_mov_b32 s11, s3
2809; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2810; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
2811; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2812; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2813; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2814; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
2815; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2816; GFX7-NEXT:    s_mov_b32 s2, -1
2817; GFX7-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0
2818; GFX7-NEXT:    s_movk_i32 s4, 0xff
2819; GFX7-NEXT:    s_mov_b32 s5, 0xffff
2820; GFX7-NEXT:    s_addc_u32 s13, s13, 0
2821; GFX7-NEXT:    s_waitcnt vmcnt(2)
2822; GFX7-NEXT:    v_ashrrev_i32_e32 v3, 28, v2
2823; GFX7-NEXT:    v_bfe_i32 v4, v2, 24, 4
2824; GFX7-NEXT:    v_bfe_i32 v5, v2, 20, 4
2825; GFX7-NEXT:    v_bfe_i32 v6, v2, 16, 4
2826; GFX7-NEXT:    v_bfe_i32 v7, v2, 12, 4
2827; GFX7-NEXT:    v_bfe_i32 v8, v2, 8, 4
2828; GFX7-NEXT:    v_bfe_i32 v9, v2, 4, 4
2829; GFX7-NEXT:    v_bfe_i32 v2, v2, 0, 4
2830; GFX7-NEXT:    v_lshlrev_b32_e32 v10, 8, v3
2831; GFX7-NEXT:    v_and_b32_e32 v4, s4, v4
2832; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
2833; GFX7-NEXT:    v_and_b32_e32 v6, s4, v6
2834; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
2835; GFX7-NEXT:    v_and_b32_e32 v8, s4, v8
2836; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
2837; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
2838; GFX7-NEXT:    s_waitcnt vmcnt(1)
2839; GFX7-NEXT:    v_ashrrev_i32_e32 v11, 28, v0
2840; GFX7-NEXT:    v_bfe_i32 v12, v0, 24, 4
2841; GFX7-NEXT:    v_bfe_i32 v13, v0, 20, 4
2842; GFX7-NEXT:    v_bfe_i32 v14, v0, 16, 4
2843; GFX7-NEXT:    v_bfe_i32 v15, v0, 12, 4
2844; GFX7-NEXT:    v_bfe_i32 v16, v0, 8, 4
2845; GFX7-NEXT:    v_bfe_i32 v17, v0, 4, 4
2846; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 4
2847; GFX7-NEXT:    v_or_b32_e32 v4, v4, v10
2848; GFX7-NEXT:    v_or_b32_e32 v5, v6, v5
2849; GFX7-NEXT:    v_or_b32_e32 v6, v8, v7
2850; GFX7-NEXT:    v_or_b32_e32 v2, v2, v9
2851; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 8, v11
2852; GFX7-NEXT:    v_and_b32_e32 v8, s4, v12
2853; GFX7-NEXT:    v_lshlrev_b32_e32 v9, 8, v13
2854; GFX7-NEXT:    v_and_b32_e32 v10, s4, v14
2855; GFX7-NEXT:    v_lshlrev_b32_e32 v12, 8, v15
2856; GFX7-NEXT:    v_and_b32_e32 v13, s4, v16
2857; GFX7-NEXT:    v_lshlrev_b32_e32 v14, 8, v17
2858; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
2859; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
2860; GFX7-NEXT:    v_and_b32_e32 v5, s5, v5
2861; GFX7-NEXT:    v_or_b32_e32 v7, v8, v7
2862; GFX7-NEXT:    v_or_b32_e32 v8, v10, v9
2863; GFX7-NEXT:    v_or_b32_e32 v9, v13, v12
2864; GFX7-NEXT:    v_or_b32_e32 v0, v0, v14
2865; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
2866; GFX7-NEXT:    v_and_b32_e32 v2, s5, v2
2867; GFX7-NEXT:    v_or_b32_e32 v4, v5, v4
2868; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
2869; GFX7-NEXT:    v_lshlrev_b32_e32 v7, 16, v9
2870; GFX7-NEXT:    v_and_b32_e32 v0, s5, v0
2871; GFX7-NEXT:    v_or_b32_e32 v2, v2, v6
2872; GFX7-NEXT:    v_or_b32_e32 v0, v0, v7
2873; GFX7-NEXT:    v_and_b32_e32 v7, s4, v2
2874; GFX7-NEXT:    v_and_b32_e32 v13, s4, v0
2875; GFX7-NEXT:    v_and_b32_e32 v6, s5, v8
2876; GFX7-NEXT:    v_bfe_u32 v8, v2, 8, 8
2877; GFX7-NEXT:    v_bfe_u32 v14, v0, 8, 8
2878; GFX7-NEXT:    s_waitcnt vmcnt(0)
2879; GFX7-NEXT:    v_mad_u32_u24 v1, v7, v13, v1
2880; GFX7-NEXT:    v_or_b32_e32 v5, v6, v5
2881; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
2882; GFX7-NEXT:    v_bfe_u32 v2, v2, 16, 8
2883; GFX7-NEXT:    v_lshrrev_b32_e32 v12, 24, v0
2884; GFX7-NEXT:    v_bfe_u32 v0, v0, 16, 8
2885; GFX7-NEXT:    v_mad_u32_u24 v1, v8, v14, v1
2886; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
2887; GFX7-NEXT:    v_and_b32_e32 v9, s4, v4
2888; GFX7-NEXT:    v_and_b32_e32 v15, s4, v5
2889; GFX7-NEXT:    v_mad_u32_u24 v0, v6, v12, v0
2890; GFX7-NEXT:    v_bfe_u32 v10, v4, 8, 8
2891; GFX7-NEXT:    v_bfe_u32 v16, v5, 8, 8
2892; GFX7-NEXT:    v_mad_u32_u24 v0, v9, v15, v0
2893; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 8
2894; GFX7-NEXT:    v_bfe_u32 v5, v5, 16, 8
2895; GFX7-NEXT:    v_mad_u32_u24 v0, v10, v16, v0
2896; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
2897; GFX7-NEXT:    v_and_b32_e32 v11, s4, v11
2898; GFX7-NEXT:    v_mad_u32_u24 v0, v4, v5, v0
2899; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v11, v0
2900; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2901; GFX7-NEXT:    s_endpgm
2902;
2903; GFX8-LABEL: idot8_acc8_vecMul:
2904; GFX8:       ; %bb.0: ; %entry
2905; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2906; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2907; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2908; GFX8-NEXT:    v_mov_b32_e32 v5, 12
2909; GFX8-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2910; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2911; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2912; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2913; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2914; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2915; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2916; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
2917; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2918; GFX8-NEXT:    flat_load_dword v2, v[0:1]
2919; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2920; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2921; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
2922; GFX8-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2923; GFX8-NEXT:    s_mov_b32 s10, -1
2924; GFX8-NEXT:    s_mov_b32 s11, 0xe80000
2925; GFX8-NEXT:    s_add_u32 s8, s8, s3
2926; GFX8-NEXT:    s_addc_u32 s9, s9, 0
2927; GFX8-NEXT:    s_waitcnt vmcnt(2)
2928; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 20, v3
2929; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 28, v3
2930; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 12, v3
2931; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
2932; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 4, v3
2933; GFX8-NEXT:    v_lshlrev_b16_e32 v16, 12, v3
2934; GFX8-NEXT:    s_waitcnt vmcnt(1)
2935; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
2936; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
2937; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 12, v2
2938; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
2939; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 4, v2
2940; GFX8-NEXT:    v_lshlrev_b16_sdwa v17, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2941; GFX8-NEXT:    v_lshlrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2942; GFX8-NEXT:    v_lshlrev_b16_e32 v18, 12, v2
2943; GFX8-NEXT:    v_lshlrev_b16_sdwa v19, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2944; GFX8-NEXT:    v_lshlrev_b16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2945; GFX8-NEXT:    v_lshlrev_b16_e32 v5, 12, v10
2946; GFX8-NEXT:    v_ashrrev_i16_e32 v10, 12, v16
2947; GFX8-NEXT:    v_ashrrev_i16_e32 v16, 12, v17
2948; GFX8-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
2949; GFX8-NEXT:    v_ashrrev_i16_e32 v17, 12, v3
2950; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 12, v6
2951; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 12, v15
2952; GFX8-NEXT:    v_ashrrev_i16_e32 v15, 12, v18
2953; GFX8-NEXT:    v_ashrrev_i16_e32 v18, 12, v19
2954; GFX8-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
2955; GFX8-NEXT:    v_ashrrev_i16_e32 v19, 12, v2
2956; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 12, v11
2957; GFX8-NEXT:    v_lshlrev_b16_e32 v9, 12, v9
2958; GFX8-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
2959; GFX8-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
2960; GFX8-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
2961; GFX8-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
2962; GFX8-NEXT:    v_ashrrev_i16_e32 v3, 12, v3
2963; GFX8-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
2964; GFX8-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
2965; GFX8-NEXT:    v_ashrrev_i16_e32 v9, 12, v9
2966; GFX8-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
2967; GFX8-NEXT:    v_ashrrev_i16_e32 v11, 12, v14
2968; GFX8-NEXT:    v_ashrrev_i16_e32 v13, 12, v13
2969; GFX8-NEXT:    v_mul_lo_u16_e32 v10, v10, v15
2970; GFX8-NEXT:    v_mul_lo_u16_e32 v15, v16, v18
2971; GFX8-NEXT:    v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2972; GFX8-NEXT:    v_mul_lo_u16_sdwa v3, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2973; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
2974; GFX8-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
2975; GFX8-NEXT:    v_mul_lo_u16_e32 v14, v17, v19
2976; GFX8-NEXT:    v_mul_lo_u16_sdwa v7, v8, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2977; GFX8-NEXT:    v_mul_lo_u16_e32 v8, v9, v11
2978; GFX8-NEXT:    v_or_b32_sdwa v3, v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2979; GFX8-NEXT:    v_mul_lo_u16_sdwa v5, v5, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2980; GFX8-NEXT:    v_or_b32_sdwa v6, v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2981; GFX8-NEXT:    v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2982; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
2983; GFX8-NEXT:    v_or_b32_sdwa v8, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2984; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
2985; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v3
2986; GFX8-NEXT:    v_or_b32_sdwa v3, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2987; GFX8-NEXT:    v_or_b32_e32 v5, v5, v2
2988; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
2989; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 24, v[2:3]
2990; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
2991; GFX8-NEXT:    s_waitcnt vmcnt(0)
2992; GFX8-NEXT:    v_add_u16_e32 v3, v8, v4
2993; GFX8-NEXT:    v_add_u16_e32 v3, v3, v5
2994; GFX8-NEXT:    v_add_u16_e32 v3, v3, v7
2995; GFX8-NEXT:    v_add_u16_e32 v2, v3, v2
2996; GFX8-NEXT:    v_mad_u16 v2, v17, v19, v2
2997; GFX8-NEXT:    v_add_u16_e32 v2, v2, v6
2998; GFX8-NEXT:    v_mad_u16 v2, v16, v18, v2
2999; GFX8-NEXT:    v_add_u16_e32 v2, v2, v10
3000; GFX8-NEXT:    flat_store_byte v[0:1], v2
3001; GFX8-NEXT:    s_endpgm
3002;
3003; GFX9-LABEL: idot8_acc8_vecMul:
3004; GFX9:       ; %bb.0: ; %entry
3005; GFX9-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
3006; GFX9-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
3007; GFX9-NEXT:    s_mov_b32 s10, -1
3008; GFX9-NEXT:    s_mov_b32 s11, 0xe00000
3009; GFX9-NEXT:    s_add_u32 s8, s8, s3
3010; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
3011; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
3012; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3013; GFX9-NEXT:    v_mov_b32_e32 v3, 0
3014; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3015; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
3016; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
3017; GFX9-NEXT:    global_load_ubyte v4, v3, s[2:3]
3018; GFX9-NEXT:    v_mov_b32_e32 v0, 12
3019; GFX9-NEXT:    s_addc_u32 s9, s9, 0
3020; GFX9-NEXT:    s_waitcnt vmcnt(2)
3021; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 20, v1
3022; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
3023; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
3024; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
3025; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
3026; GFX9-NEXT:    s_waitcnt vmcnt(1)
3027; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 20, v2
3028; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
3029; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
3030; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
3031; GFX9-NEXT:    v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3032; GFX9-NEXT:    v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3033; GFX9-NEXT:    v_lshlrev_b16_e32 v17, 12, v2
3034; GFX9-NEXT:    v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3035; GFX9-NEXT:    v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3036; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
3037; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
3038; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 12, v9
3039; GFX9-NEXT:    v_ashrrev_i16_e32 v9, 12, v15
3040; GFX9-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
3041; GFX9-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
3042; GFX9-NEXT:    v_ashrrev_i16_e32 v16, 12, v1
3043; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 12, v5
3044; GFX9-NEXT:    v_lshlrev_b16_e32 v5, 12, v14
3045; GFX9-NEXT:    v_ashrrev_i16_e32 v14, 12, v17
3046; GFX9-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
3047; GFX9-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
3048; GFX9-NEXT:    v_ashrrev_i16_e32 v18, 12, v0
3049; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 12, v10
3050; GFX9-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
3051; GFX9-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
3052; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
3053; GFX9-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
3054; GFX9-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
3055; GFX9-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
3056; GFX9-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
3057; GFX9-NEXT:    v_ashrrev_i16_e32 v0, 12, v0
3058; GFX9-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
3059; GFX9-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
3060; GFX9-NEXT:    v_ashrrev_i16_e32 v10, 12, v13
3061; GFX9-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
3062; GFX9-NEXT:    v_mul_lo_u16_e32 v19, v15, v17
3063; GFX9-NEXT:    v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3064; GFX9-NEXT:    v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3065; GFX9-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
3066; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
3067; GFX9-NEXT:    v_mul_lo_u16_e32 v13, v16, v18
3068; GFX9-NEXT:    v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3069; GFX9-NEXT:    v_mul_lo_u16_e32 v7, v8, v10
3070; GFX9-NEXT:    v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3071; GFX9-NEXT:    v_mul_lo_u16_e32 v9, v9, v14
3072; GFX9-NEXT:    v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3073; GFX9-NEXT:    v_or_b32_sdwa v5, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3074; GFX9-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3075; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
3076; GFX9-NEXT:    v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3077; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
3078; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
3079; GFX9-NEXT:    v_or_b32_sdwa v1, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3080; GFX9-NEXT:    v_or_b32_e32 v2, v2, v0
3081; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
3082; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]
3083; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
3084; GFX9-NEXT:    s_waitcnt vmcnt(0)
3085; GFX9-NEXT:    v_add_u16_e32 v1, v7, v4
3086; GFX9-NEXT:    v_add_u16_e32 v1, v1, v2
3087; GFX9-NEXT:    v_add_u16_e32 v1, v1, v6
3088; GFX9-NEXT:    v_add_u16_e32 v0, v1, v0
3089; GFX9-NEXT:    v_mad_legacy_u16 v0, v16, v18, v0
3090; GFX9-NEXT:    v_add_u16_e32 v0, v0, v5
3091; GFX9-NEXT:    v_mad_legacy_u16 v0, v15, v17, v0
3092; GFX9-NEXT:    v_add_u16_e32 v0, v0, v9
3093; GFX9-NEXT:    global_store_byte v3, v0, s[2:3]
3094; GFX9-NEXT:    s_endpgm
3095;
3096; GFX9-DL-LABEL: idot8_acc8_vecMul:
3097; GFX9-DL:       ; %bb.0: ; %entry
3098; GFX9-DL-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
3099; GFX9-DL-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
3100; GFX9-DL-NEXT:    s_mov_b32 s10, -1
3101; GFX9-DL-NEXT:    s_mov_b32 s11, 0xe00000
3102; GFX9-DL-NEXT:    s_add_u32 s8, s8, s3
3103; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
3104; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
3105; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3106; GFX9-DL-NEXT:    v_mov_b32_e32 v3, 0
3107; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
3108; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
3109; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
3110; GFX9-DL-NEXT:    global_load_ubyte v4, v3, s[2:3]
3111; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 12
3112; GFX9-DL-NEXT:    s_addc_u32 s9, s9, 0
3113; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
3114; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 20, v1
3115; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
3116; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
3117; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
3118; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 4, v1
3119; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
3120; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 20, v2
3121; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
3122; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v14, 4, v2
3123; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v15, 12, v1
3124; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3125; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3126; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v17, 12, v2
3127; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v18, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3128; GFX9-DL-NEXT:    v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3129; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 12, v2
3130; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v13, 8, v2
3131; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v2, 12, v9
3132; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v9, 12, v15
3133; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v15, 12, v16
3134; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v6, 12, v6
3135; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v16, 12, v1
3136; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v1, 12, v5
3137; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v5, 12, v14
3138; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v14, 12, v17
3139; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v17, 12, v18
3140; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v11, 12, v11
3141; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v18, 12, v0
3142; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v0, 12, v10
3143; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v8, 12, v8
3144; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v7, 12, v7
3145; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v13, 12, v13
3146; GFX9-DL-NEXT:    v_lshlrev_b16_e32 v12, 12, v12
3147; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v6, 12, v6
3148; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v1, 12, v1
3149; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v11, 12, v11
3150; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v0, 12, v0
3151; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v8, 12, v8
3152; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v7, 12, v7
3153; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v10, 12, v13
3154; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v12, 12, v12
3155; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v19, v15, v17
3156; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3157; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v1, v6, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3158; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v2, 12, v2
3159; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
3160; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v13, v16, v18
3161; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, v7, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3162; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v7, v8, v10
3163; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3164; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v9, v9, v14
3165; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v2, v2, v5 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3166; GFX9-DL-NEXT:    v_or_b32_sdwa v5, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3167; GFX9-DL-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3168; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
3169; GFX9-DL-NEXT:    v_or_b32_sdwa v7, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3170; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
3171; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
3172; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3173; GFX9-DL-NEXT:    v_or_b32_e32 v2, v2, v0
3174; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
3175; GFX9-DL-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]
3176; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
3177; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
3178; GFX9-DL-NEXT:    v_add_u16_e32 v1, v7, v4
3179; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v2
3180; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v6
3181; GFX9-DL-NEXT:    v_add_u16_e32 v0, v1, v0
3182; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v16, v18, v0
3183; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v5
3184; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v15, v17, v0
3185; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v9
3186; GFX9-DL-NEXT:    global_store_byte v3, v0, s[2:3]
3187; GFX9-DL-NEXT:    s_endpgm
3188;
3189; GFX10-DL-XNACK-LABEL: idot8_acc8_vecMul:
3190; GFX10-DL-XNACK:       ; %bb.0: ; %entry
3191; GFX10-DL-XNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
3192; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3193; GFX10-DL-XNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
3194; GFX10-DL-XNACK-NEXT:    v_mov_b32_e32 v4, 0
3195; GFX10-DL-XNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
3196; GFX10-DL-XNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
3197; GFX10-DL-XNACK-NEXT:    s_mov_b32 s10, -1
3198; GFX10-DL-XNACK-NEXT:    s_mov_b32 s11, 0x31c16000
3199; GFX10-DL-XNACK-NEXT:    s_add_u32 s8, s8, s3
3200; GFX10-DL-XNACK-NEXT:    s_addc_u32 s9, s9, 0
3201; GFX10-DL-XNACK-NEXT:    s_waitcnt lgkmcnt(0)
3202; GFX10-DL-XNACK-NEXT:    s_clause 0x1
3203; GFX10-DL-XNACK-NEXT:    global_load_dword v1, v0, s[4:5]
3204; GFX10-DL-XNACK-NEXT:    global_load_dword v2, v0, s[6:7]
3205; GFX10-DL-XNACK-NEXT:    global_load_ubyte v3, v4, s[0:1]
3206; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(2)
3207; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
3208; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(1)
3209; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
3210; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
3211; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v16, 8, v2
3212; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
3213; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
3214; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v15, 12, v15
3215; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v13, 28, v2
3216; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
3217; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v17, 4, v2
3218; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
3219; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v16, 12, v16
3220; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
3221; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v15, 12, v15
3222; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v0, 20, v1
3223; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
3224; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v11, 20, v2
3225; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v14, 24, v2
3226; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
3227; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v13, 12, v13
3228; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
3229; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v17, 12, v17
3230; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
3231; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v16, 12, v16
3232; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v8, v8, v15
3233; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
3234; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
3235; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
3236; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
3237; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v14, 12, v14
3238; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v11, 12, v11
3239; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
3240; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v13, 12, v13
3241; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
3242; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v2, 12, v2
3243; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
3244; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v15, 12, v17
3245; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v9, v9, v16
3246; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v8, 8, v8
3247; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
3248; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v12, 12, v12
3249; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
3250; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
3251; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v14, 12, v14
3252; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v11, 12, v11
3253; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v6, v6, v13
3254; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
3255; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v2, 12, v2
3256; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v10, v10, v15
3257; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3258; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
3259; GFX10-DL-XNACK-NEXT:    v_ashrrev_i16 v12, 12, v12
3260; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v9, v0, v11
3261; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v11, v7, v14
3262; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v6, 8, v6
3263; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v10, 8, v10
3264; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
3265; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v1, v1, v2
3266; GFX10-DL-XNACK-NEXT:    v_mul_lo_u16 v2, v5, v12
3267; GFX10-DL-XNACK-NEXT:    v_lshlrev_b16 v9, 8, v9
3268; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v6, v11, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3269; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v11, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3270; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3271; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3272; GFX10-DL-XNACK-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
3273; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v10, 8, v11
3274; GFX10-DL-XNACK-NEXT:    s_waitcnt vmcnt(0)
3275; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v3, v1, v3
3276; GFX10-DL-XNACK-NEXT:    v_or_b32_sdwa v1, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3277; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v9, v3, v10
3278; GFX10-DL-XNACK-NEXT:    v_lshrrev_b64 v[2:3], 24, v[0:1]
3279; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
3280; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v0, v9, v8
3281; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v0, v0, v2
3282; GFX10-DL-XNACK-NEXT:    v_mad_u16 v0, v5, v12, v0
3283; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v0, v0, v1
3284; GFX10-DL-XNACK-NEXT:    v_lshrrev_b32_e32 v1, 8, v6
3285; GFX10-DL-XNACK-NEXT:    v_mad_u16 v0, v7, v14, v0
3286; GFX10-DL-XNACK-NEXT:    v_add_nc_u16 v0, v0, v1
3287; GFX10-DL-XNACK-NEXT:    global_store_byte v4, v0, s[0:1]
3288; GFX10-DL-XNACK-NEXT:    s_endpgm
3289;
3290; GFX10-DL-NOXNACK-LABEL: idot8_acc8_vecMul:
3291; GFX10-DL-NOXNACK:       ; %bb.0: ; %entry
3292; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
3293; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3294; GFX10-DL-NOXNACK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
3295; GFX10-DL-NOXNACK-NEXT:    v_mov_b32_e32 v4, 0
3296; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
3297; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
3298; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s10, -1
3299; GFX10-DL-NOXNACK-NEXT:    s_mov_b32 s11, 0x31c16000
3300; GFX10-DL-NOXNACK-NEXT:    s_add_u32 s8, s8, s3
3301; GFX10-DL-NOXNACK-NEXT:    s_addc_u32 s9, s9, 0
3302; GFX10-DL-NOXNACK-NEXT:    s_waitcnt lgkmcnt(0)
3303; GFX10-DL-NOXNACK-NEXT:    s_clause 0x1
3304; GFX10-DL-NOXNACK-NEXT:    global_load_dword v1, v0, s[4:5]
3305; GFX10-DL-NOXNACK-NEXT:    global_load_dword v0, v0, s[6:7]
3306; GFX10-DL-NOXNACK-NEXT:    global_load_ubyte v2, v4, s[0:1]
3307; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(2)
3308; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
3309; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(1)
3310; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
3311; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
3312; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v11, 20, v0
3313; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
3314; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v13, 28, v0
3315; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v14, 24, v0
3316; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
3317; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v17, 4, v0
3318; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v0
3319; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 12, v8
3320; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v15, 12, v15
3321; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
3322; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
3323; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v9, 12, v9
3324; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v18, 12, v0
3325; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v0, 12, v16
3326; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v8, 12, v8
3327; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v15, 12, v15
3328; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v3, 20, v1
3329; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
3330; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 12, v6
3331; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v13, 12, v13
3332; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 12, v10
3333; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v17, 12, v17
3334; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v9
3335; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v0, 12, v0
3336; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v8, v8, v15
3337; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
3338; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v7, 12, v7
3339; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v3, 12, v3
3340; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v14, 12, v14
3341; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v11, 12, v11
3342; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v6, 12, v6
3343; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v13, 12, v13
3344; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v1, 12, v1
3345; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v10, 12, v10
3346; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v15, 12, v17
3347; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v0, v9, v0
3348; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v8, 8, v8
3349; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v5, 12, v5
3350; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v12, 12, v12
3351; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v7, 12, v7
3352; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v3, 12, v3
3353; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v14, 12, v14
3354; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v9, 12, v11
3355; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v6, v6, v13
3356; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v1, 12, v1
3357; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v10, v10, v15
3358; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3359; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v5, 12, v5
3360; GFX10-DL-NOXNACK-NEXT:    v_ashrrev_i16 v11, 12, v12
3361; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v3, v3, v9
3362; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v9, v7, v14
3363; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v6, 8, v6
3364; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v10, 8, v10
3365; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
3366; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v1, v1, v18
3367; GFX10-DL-NOXNACK-NEXT:    v_mul_lo_u16 v12, v5, v11
3368; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b16 v3, 8, v3
3369; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3370; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v9, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3371; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3372; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3373; GFX10-DL-NOXNACK-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
3374; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v9, 8, v9
3375; GFX10-DL-NOXNACK-NEXT:    s_waitcnt vmcnt(0)
3376; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v2, v1, v2
3377; GFX10-DL-NOXNACK-NEXT:    v_or_b32_sdwa v1, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3378; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v9, v2, v9
3379; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b64 v[2:3], 24, v[0:1]
3380; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
3381; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v9, v8
3382; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v2
3383; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v5, v11, v0
3384; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v1
3385; GFX10-DL-NOXNACK-NEXT:    v_lshrrev_b32_e32 v1, 8, v6
3386; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v7, v14, v0
3387; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v1
3388; GFX10-DL-NOXNACK-NEXT:    global_store_byte v4, v0, s[0:1]
3389; GFX10-DL-NOXNACK-NEXT:    s_endpgm
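;
; NOTE(review): the GFX10-DL check group below appears to be stale. None of
; the RUN lines at the top of this file enables a plain GFX10-DL prefix (only
; the -XNACK and -NOXNACK variants are used), so FileCheck does not evaluate
; these lines. Confirm, then drop them and regenerate the assertions.
; GFX10-DL-LABEL: idot8_acc8_vecMul: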
3391; GFX10-DL:       ; %bb.0: ; %entry
3392; GFX10-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
3393; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
3394; GFX10-DL-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
3395; GFX10-DL-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
3396; GFX10-DL-NEXT:    s_mov_b32 s22, -1
3397; GFX10-DL-NEXT:    s_mov_b32 s23, 0x31c16000
3398; GFX10-DL-NEXT:    s_add_u32 s20, s20, s3
3399; GFX10-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3400; GFX10-DL-NEXT:    s_addc_u32 s21, s21, 0
3401; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
3402; GFX10-DL-NEXT:    global_load_ubyte v1, v0, s[4:5]
3403; GFX10-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
3404; GFX10-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
3405; GFX10-DL-NEXT:    s_mov_b32 s2, 0xffff
3406; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
3407; GFX10-DL-NEXT:    s_lshr_b32 s9, s0, 4
3408; GFX10-DL-NEXT:    s_lshr_b32 s16, s1, 4
3409; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v6, 12, s9
3410; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v12, 12, s16
3411; GFX10-DL-NEXT:    s_lshr_b32 s10, s0, 12
3412; GFX10-DL-NEXT:    s_lshr_b32 s17, s1, 12
3413; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v2, 12, s0
3414; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 12, s1
3415; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v13, 12, s17
3416; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s10
3417; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v6, 12, v6
3418; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v12, 12, v12
3419; GFX10-DL-NEXT:    s_lshr_b32 s11, s0, 8
3420; GFX10-DL-NEXT:    s_lshr_b32 s18, s1, 8
3421; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s11
3422; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v11, 12, s18
3423; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v2, 12, v2
3424; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v3, 12, v3
3425; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v19, 12, v5
3426; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v6, v6, v12
3427; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v13, 12, v13
3428; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 12, v4
3429; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v11, 12, v11
3430; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v2, v2, v3
3431; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 8, v6
3432; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v3, v19, v13
3433; GFX10-DL-NEXT:    s_lshr_b32 s3, s0, 20
3434; GFX10-DL-NEXT:    s_lshr_b32 s6, s0, 16
3435; GFX10-DL-NEXT:    s_lshr_b32 s7, s0, 28
3436; GFX10-DL-NEXT:    s_lshr_b32 s8, s0, 24
3437; GFX10-DL-NEXT:    s_lshr_b32 s12, s1, 20
3438; GFX10-DL-NEXT:    v_or_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3439; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v7, 12, s8
3440; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v8, 12, s7
3441; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v9, 12, s6
3442; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v10, 12, s3
3443; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v12, 12, s12
3444; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, v4, v11
3445; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v3, 8, v3
3446; GFX10-DL-NEXT:    s_lshr_b32 s13, s1, 16
3447; GFX10-DL-NEXT:    s_lshr_b32 s14, s1, 28
3448; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v6, 12, s13
3449; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v5, 12, v7
3450; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v7, 12, v8
3451; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v8, 12, v9
3452; GFX10-DL-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3453; GFX10-DL-NEXT:    v_and_b32_e32 v2, s2, v2
3454; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v15, 12, s14
3455; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v4, 12, v10
3456; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v9, 12, v12
3457; GFX10-DL-NEXT:    s_lshr_b32 s15, s1, 24
3458; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v6, 12, v6
3459; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v10, 12, v15
3460; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v14, 12, s15
3461; GFX10-DL-NEXT:    v_or_b32_e32 v3, v2, v3
3462; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, v4, v9
3463; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v15, v8, v6
3464; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v7, v7, v10
3465; GFX10-DL-NEXT:    v_ashrrev_i16_e64 v11, 12, v14
3466; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
3467; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
3468; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v2, v1
3469; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v2, 8, v4
3470; GFX10-DL-NEXT:    v_mul_lo_u16_e64 v4, v5, v11
3471; GFX10-DL-NEXT:    v_lshlrev_b16_e64 v5, 8, v7
3472; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v8
3473; GFX10-DL-NEXT:    v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3474; GFX10-DL-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
3475; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
3476; GFX10-DL-NEXT:    v_and_b32_e32 v2, s2, v2
3477; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3478; GFX10-DL-NEXT:    v_or_b32_e32 v3, v2, v4
3479; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v2
3480; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
3481; GFX10-DL-NEXT:    v_add_nc_u32_e32 v1, v1, v2
3482; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3483; GFX10-DL-NEXT:    v_add_nc_u32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
3484; GFX10-DL-NEXT:    global_store_byte v0, v1, s[4:5]
3485; GFX10-DL-NEXT:    s_endpgm
3486                                             <8 x i4> addrspace(1)* %src2,
3487                                             i8 addrspace(1)* nocapture %dst) {
3488entry:
3489  %idx = call i32 @llvm.amdgcn.workitem.id.x()
3490  %gep1 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src1, i32 %idx
3491  %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %gep1
3492  %gep2 = getelementptr <8 x i4>, <8 x i4> addrspace(1)* %src2, i32 %idx
3493  %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %gep2
3494
3495  %cvec1 = sext <8 x i4> %vec1 to <8 x i8>
3496  %cvec2 = sext <8 x i4> %vec2 to <8 x i8>
3497
3498  %mul = mul <8 x i8> %cvec1, %cvec2
3499  %mul0 = extractelement <8 x i8> %mul, i64 0
3500  %mul1 = extractelement <8 x i8> %mul, i64 1
3501  %mul2 = extractelement <8 x i8> %mul, i64 2
3502  %mul3 = extractelement <8 x i8> %mul, i64 3
3503  %mul4 = extractelement <8 x i8> %mul, i64 4
3504  %mul5 = extractelement <8 x i8> %mul, i64 5
3505  %mul6 = extractelement <8 x i8> %mul, i64 6
3506  %mul7 = extractelement <8 x i8> %mul, i64 7
3507
3508  %acc = load i8, i8 addrspace(1)* %dst, align 4
3509  %add1 = add i8 %mul0, %acc
3510  %add2 = add i8 %add1, %mul1
3511  %add3 = add i8 %add2, %mul2
3512  %add4 = add i8 %add3, %mul3
3513  %add5 = add i8 %add4, %mul4
3514  %add6 = add i8 %add5, %mul5
3515  %add7 = add i8 %add6, %mul6
3516  %add8 = add i8 %add7, %mul7
3517
3518  store i8 %add8, i8 addrspace(1)* %dst, align 4
3519  ret void
3520}
3521
; Declaration of the AMDGPU intrinsic that the kernel above calls to obtain
; %idx, the i32 index it feeds to the getelementptr on %src1 and %src2.
declare i32 @llvm.amdgcn.workitem.id.x()
3523