1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NODL %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-DL %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-DL %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-DL %s
8
9; add(mul(S0.x, S1.x),
10;     add (mul (S0.y, S1.y), S3)) -> v_dot2_{I|U}32_{I|U}16(S0, S1, S3)
11
; udot2: unsigned two-lane dot product with accumulate.
; Both i16 lanes of %src1[idx] and %src2[idx] are zero-extended to i32,
; multiplied lane-wise, and both products are added to the i32 loaded from
; %dst; the sum is stored back to %dst.
; Per the assertions below, GFX9-DL and GFX10-DL select v_dot2_u32_u16,
; GFX7 and GFX8 use a pair of v_mad_u32_u24, and GFX9-NODL uses two SDWA
; multiplies followed by v_add3_u32.
12define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1,
13; GFX7-LABEL: udot2:
14; GFX7:       ; %bb.0: ; %entry
15; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
16; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
17; GFX7-NEXT:    s_mov_b32 s3, 0xf000
18; GFX7-NEXT:    s_mov_b32 s10, 0
19; GFX7-NEXT:    s_mov_b32 s11, s3
20; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
21; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
22; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
23; GFX7-NEXT:    v_mov_b32_e32 v1, 0
24; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
25; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
26; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
27; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
28; GFX7-NEXT:    s_mov_b32 s2, -1
29; GFX7-NEXT:    s_waitcnt vmcnt(1)
30; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
31; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
32; GFX7-NEXT:    s_waitcnt vmcnt(0)
33; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
34; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
35; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
36; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, s4
37; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
38; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
39; GFX7-NEXT:    s_endpgm
40;
41; GFX8-LABEL: udot2:
42; GFX8:       ; %bb.0: ; %entry
43; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
44; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
45; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
46; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
47; GFX8-NEXT:    v_mov_b32_e32 v1, s5
48; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
49; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
50; GFX8-NEXT:    flat_load_dword v3, v[0:1]
51; GFX8-NEXT:    v_mov_b32_e32 v1, s7
52; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
53; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
54; GFX8-NEXT:    flat_load_dword v0, v[0:1]
55; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
56; GFX8-NEXT:    s_waitcnt vmcnt(1)
57; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v3
58; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
59; GFX8-NEXT:    s_waitcnt vmcnt(0)
60; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v0
61; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
62; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
63; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, s2
64; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v1, v0
65; GFX8-NEXT:    v_mov_b32_e32 v0, s0
66; GFX8-NEXT:    v_mov_b32_e32 v1, s1
67; GFX8-NEXT:    flat_store_dword v[0:1], v2
68; GFX8-NEXT:    s_endpgm
69;
70; GFX9-NODL-LABEL: udot2:
71; GFX9-NODL:       ; %bb.0: ; %entry
72; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
73; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
74; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
75; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
76; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
77; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
78; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
79; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
80; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
81; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
82; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
83; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
84; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
85; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
86; GFX9-NODL-NEXT:    s_endpgm
87;
88; GFX9-DL-LABEL: udot2:
89; GFX9-DL:       ; %bb.0: ; %entry
90; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
91; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
92; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
93; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
94; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
95; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
96; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
97; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
98; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
99; GFX9-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s0
100; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
101; GFX9-DL-NEXT:    s_endpgm
102;
103; GFX10-DL-LABEL: udot2:
104; GFX10-DL:       ; %bb.0: ; %entry
105; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
106; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
107; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
108; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
109; GFX10-DL-NEXT:    s_clause 0x1
110; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
111; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
112; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
113; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
114; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
115; GFX10-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s2
116; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
117; GFX10-DL-NEXT:    s_endpgm
118                                 <2 x i16> addrspace(1)* %src2,
119                                 i32 addrspace(1)* nocapture %dst) {
120entry:
  ; Load one <2 x i16> from each source array, indexed by the value returned
  ; from llvm.amdgcn.workitem.id.x.
121  %idx = call i32 @llvm.amdgcn.workitem.id.x()
122  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
123  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
124  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
125  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
126
  ; Lane 0 product, both operands zero-extended.
127  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
128  %conv = zext i16 %s1.elt1 to i32
129  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
130  %conv2 = zext i16 %s2.elt1 to i32
131  %mul1 = mul nuw i32 %conv2, %conv
132
  ; Lane 1 product, both operands zero-extended.
133  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
134  %conv3 = zext i16 %s1.elt2 to i32
135  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
136  %conv4 = zext i16 %s2.elt2 to i32
137  %mul2 = mul nuw i32 %conv4, %conv3
138
  ; Accumulate as (mul2 + s3) + mul1 and write the result back to %dst.
139  %s3 = load i32, i32 addrspace(1)* %dst, align 4
140  %add = add i32 %mul2, %s3
141  %add6 = add i32 %add, %mul1
142  store i32 %add6, i32 addrspace(1)* %dst, align 4
143  ret void
144}
145
146; TODO: Support this pattern
147;      add(S3,
148;          add (mul (S0.x, S1.x), mul (S0.y, S1.y))) -> v_dot2_{I|U}32_{I|U}16(S0, S1, S3)
; udot2_MulMul: same zero-extended lane products as an unsigned dot2, but the
; two products are added together first and the accumulator loaded from %dst
; is added last, i.e. (mul2 + mul1) + s3.
; Per the assertions below, no target selects a v_dot2 instruction for this
; association; GFX9-DL and GFX10-DL emit two SDWA multiplies and v_add3_u32.
149define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1,
150; GFX7-LABEL: udot2_MulMul:
151; GFX7:       ; %bb.0: ; %entry
152; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
153; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
154; GFX7-NEXT:    s_mov_b32 s3, 0xf000
155; GFX7-NEXT:    s_mov_b32 s10, 0
156; GFX7-NEXT:    s_mov_b32 s11, s3
157; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
158; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
159; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
160; GFX7-NEXT:    v_mov_b32_e32 v1, 0
161; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
162; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
163; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
164; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
165; GFX7-NEXT:    s_mov_b32 s2, -1
166; GFX7-NEXT:    s_waitcnt vmcnt(1)
167; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
168; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
169; GFX7-NEXT:    s_waitcnt vmcnt(0)
170; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
171; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
172; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
173; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v1, v0
174; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
175; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
176; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
177; GFX7-NEXT:    s_endpgm
178;
179; GFX8-LABEL: udot2_MulMul:
180; GFX8:       ; %bb.0: ; %entry
181; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
182; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
183; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
184; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
185; GFX8-NEXT:    v_mov_b32_e32 v1, s5
186; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
187; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
188; GFX8-NEXT:    flat_load_dword v3, v[0:1]
189; GFX8-NEXT:    v_mov_b32_e32 v1, s7
190; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
191; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
192; GFX8-NEXT:    flat_load_dword v0, v[0:1]
193; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
194; GFX8-NEXT:    s_waitcnt vmcnt(1)
195; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
196; GFX8-NEXT:    s_waitcnt vmcnt(0)
197; GFX8-NEXT:    v_mul_u32_u24_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
198; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
199; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
200; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
201; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v0
202; GFX8-NEXT:    v_mov_b32_e32 v0, s0
203; GFX8-NEXT:    v_mov_b32_e32 v1, s1
204; GFX8-NEXT:    flat_store_dword v[0:1], v2
205; GFX8-NEXT:    s_endpgm
206;
207; GFX9-NODL-LABEL: udot2_MulMul:
208; GFX9-NODL:       ; %bb.0: ; %entry
209; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
210; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
211; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
212; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
213; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
214; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
215; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
216; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
217; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
218; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
219; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
220; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
221; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v3, s0
222; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
223; GFX9-NODL-NEXT:    s_endpgm
224;
225; GFX9-DL-LABEL: udot2_MulMul:
226; GFX9-DL:       ; %bb.0: ; %entry
227; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
228; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
229; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
230; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
231; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
232; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
233; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
234; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
235; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
236; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
237; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
238; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
239; GFX9-DL-NEXT:    v_add3_u32 v1, v1, v3, s0
240; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
241; GFX9-DL-NEXT:    s_endpgm
242;
243; GFX10-DL-LABEL: udot2_MulMul:
244; GFX10-DL:       ; %bb.0: ; %entry
245; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
246; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
247; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
248; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
249; GFX10-DL-NEXT:    s_clause 0x1
250; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
251; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
252; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
253; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
254; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
255; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
256; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
257; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
258; GFX10-DL-NEXT:    v_add3_u32 v0, v1, v0, s2
259; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
260; GFX10-DL-NEXT:    s_endpgm
261                                        <2 x i16> addrspace(1)* %src2,
262                                        i32 addrspace(1)* nocapture %dst) {
263entry:
  ; Load one <2 x i16> from each source array, indexed by the value returned
  ; from llvm.amdgcn.workitem.id.x.
264  %idx = call i32 @llvm.amdgcn.workitem.id.x()
265  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
266  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
267  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
268  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
269
  ; Lane 0 product, both operands zero-extended.
270  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
271  %conv = zext i16 %s1.elt1 to i32
272  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
273  %conv2 = zext i16 %s2.elt1 to i32
274  %mul1 = mul nuw i32 %conv2, %conv
275
  ; Lane 1 product, both operands zero-extended.
276  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
277  %conv3 = zext i16 %s1.elt2 to i32
278  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
279  %conv4 = zext i16 %s2.elt2 to i32
280  %mul2 = mul nuw i32 %conv4, %conv3
  ; The two products are summed first; the accumulator is added afterwards.
281  %s3 = load i32, i32 addrspace(1)* %dst, align 4
282  %add = add i32 %mul2, %mul1
283  %add6 = add i32 %add, %s3
284  store i32 %add6, i32 addrspace(1)* %dst, align 4
285  ret void
286}
287
; idot2: signed two-lane dot product with accumulate.
; Both i16 lanes of %src1[idx] and %src2[idx] are sign-extended to i32,
; multiplied lane-wise, and both products are added to the i32 loaded from
; %dst; the sum is stored back to %dst.
; Per the assertions below, GFX9-DL and GFX10-DL select v_dot2_i32_i16,
; GFX7 and GFX8 use a pair of v_mad_i32_i24, and GFX9-NODL uses two SDWA
; multiplies followed by v_add3_u32.
288define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1,
289; GFX7-LABEL: idot2:
290; GFX7:       ; %bb.0: ; %entry
291; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
292; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
293; GFX7-NEXT:    s_mov_b32 s3, 0xf000
294; GFX7-NEXT:    s_mov_b32 s10, 0
295; GFX7-NEXT:    s_mov_b32 s11, s3
296; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
297; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
298; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
299; GFX7-NEXT:    v_mov_b32_e32 v1, 0
300; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
301; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
302; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
303; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
304; GFX7-NEXT:    s_mov_b32 s2, -1
305; GFX7-NEXT:    s_waitcnt vmcnt(1)
306; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 16
307; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
308; GFX7-NEXT:    s_waitcnt vmcnt(0)
309; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 16
310; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
311; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
312; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, s4
313; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v1, v0
314; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
315; GFX7-NEXT:    s_endpgm
316;
317; GFX8-LABEL: idot2:
318; GFX8:       ; %bb.0: ; %entry
319; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
320; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
321; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
322; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
323; GFX8-NEXT:    v_mov_b32_e32 v1, s5
324; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
325; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
326; GFX8-NEXT:    flat_load_dword v3, v[0:1]
327; GFX8-NEXT:    v_mov_b32_e32 v1, s7
328; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
329; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
330; GFX8-NEXT:    flat_load_dword v0, v[0:1]
331; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
332; GFX8-NEXT:    s_waitcnt vmcnt(1)
333; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
334; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
335; GFX8-NEXT:    s_waitcnt vmcnt(0)
336; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 16
337; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
338; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
339; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v3, s2
340; GFX8-NEXT:    v_mad_i32_i24 v2, v2, v1, v0
341; GFX8-NEXT:    v_mov_b32_e32 v0, s0
342; GFX8-NEXT:    v_mov_b32_e32 v1, s1
343; GFX8-NEXT:    flat_store_dword v[0:1], v2
344; GFX8-NEXT:    s_endpgm
345;
346; GFX9-NODL-LABEL: idot2:
347; GFX9-NODL:       ; %bb.0: ; %entry
348; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
349; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
350; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
351; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
352; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
353; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
354; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
355; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
356; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
357; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
358; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
359; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
360; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
361; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
362; GFX9-NODL-NEXT:    s_endpgm
363;
364; GFX9-DL-LABEL: idot2:
365; GFX9-DL:       ; %bb.0: ; %entry
366; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
367; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
368; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
369; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
370; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
371; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
372; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
373; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
374; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
375; GFX9-DL-NEXT:    v_dot2_i32_i16 v1, v2, v1, s0
376; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
377; GFX9-DL-NEXT:    s_endpgm
378;
379; GFX10-DL-LABEL: idot2:
380; GFX10-DL:       ; %bb.0: ; %entry
381; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
382; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
383; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
384; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
385; GFX10-DL-NEXT:    s_clause 0x1
386; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
387; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
388; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
389; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
390; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
391; GFX10-DL-NEXT:    v_dot2_i32_i16 v1, v2, v1, s2
392; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
393; GFX10-DL-NEXT:    s_endpgm
394                                 <2 x i16> addrspace(1)* %src2,
395                                 i32 addrspace(1)* nocapture %dst) {
396entry:
  ; Load one <2 x i16> from each source array, indexed by the value returned
  ; from llvm.amdgcn.workitem.id.x.
397  %idx = call i32 @llvm.amdgcn.workitem.id.x()
398  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
399  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
400  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
401  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
402
  ; Lane 0 product, both operands sign-extended.
403  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
404  %conv = sext i16 %s1.elt1 to i32
405  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
406  %conv2 = sext i16 %s2.elt1 to i32
407  %mul1 = mul nuw i32 %conv2, %conv
408
  ; Lane 1 product, both operands sign-extended.
409  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
410  %conv3 = sext i16 %s1.elt2 to i32
411  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
412  %conv4 = sext i16 %s2.elt2 to i32
413  %mul2 = mul nuw i32 %conv4, %conv3
414
  ; Accumulate as (mul2 + s3) + mul1 and write the result back to %dst.
415  %s3 = load i32, i32 addrspace(1)* %dst, align 4
416  %add = add i32 %mul2, %s3
417  %add6 = add i32 %add, %mul1
418  store i32 %add6, i32 addrspace(1)* %dst, align 4
419  ret void
420}
421
; idot2_MixedTypedMul: dot2-shaped computation whose two lane products use
; different extensions: lane 0 operands are sign-extended, lane 1 operands are
; zero-extended. Both products are added to the i32 loaded from %dst and the
; sum is stored back.
; Per the assertions below, no target selects a v_dot2 instruction; each
; target emits one signed and one unsigned 24-bit multiply (or mad).
422define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1,
423; GFX7-LABEL: idot2_MixedTypedMul:
424; GFX7:       ; %bb.0: ; %entry
425; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
426; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
427; GFX7-NEXT:    s_mov_b32 s3, 0xf000
428; GFX7-NEXT:    s_mov_b32 s10, 0
429; GFX7-NEXT:    s_mov_b32 s11, s3
430; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
431; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
432; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
433; GFX7-NEXT:    v_mov_b32_e32 v1, 0
434; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
435; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
436; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
437; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
438; GFX7-NEXT:    s_mov_b32 s2, -1
439; GFX7-NEXT:    s_waitcnt vmcnt(1)
440; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
441; GFX7-NEXT:    v_bfe_i32 v2, v2, 0, 16
442; GFX7-NEXT:    s_waitcnt vmcnt(0)
443; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
444; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
445; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
446; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, s4
447; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, v1
448; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
449; GFX7-NEXT:    s_endpgm
450;
451; GFX8-LABEL: idot2_MixedTypedMul:
452; GFX8:       ; %bb.0: ; %entry
453; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
454; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
455; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
456; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
457; GFX8-NEXT:    v_mov_b32_e32 v1, s5
458; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
459; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
460; GFX8-NEXT:    flat_load_dword v3, v[0:1]
461; GFX8-NEXT:    v_mov_b32_e32 v1, s7
462; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
463; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
464; GFX8-NEXT:    flat_load_dword v0, v[0:1]
465; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
466; GFX8-NEXT:    s_waitcnt vmcnt(1)
467; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
468; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
469; GFX8-NEXT:    s_waitcnt vmcnt(0)
470; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 16
471; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
472; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
473; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, s2
474; GFX8-NEXT:    v_mad_i32_i24 v2, v2, v1, v0
475; GFX8-NEXT:    v_mov_b32_e32 v0, s0
476; GFX8-NEXT:    v_mov_b32_e32 v1, s1
477; GFX8-NEXT:    flat_store_dword v[0:1], v2
478; GFX8-NEXT:    s_endpgm
479;
480; GFX9-NODL-LABEL: idot2_MixedTypedMul:
481; GFX9-NODL:       ; %bb.0: ; %entry
482; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
483; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
484; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
485; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
486; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
487; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
488; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
489; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
490; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
491; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
492; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
493; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
494; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
495; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
496; GFX9-NODL-NEXT:    s_endpgm
497;
498; GFX9-DL-LABEL: idot2_MixedTypedMul:
499; GFX9-DL:       ; %bb.0: ; %entry
500; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
501; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
502; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
503; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
504; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
505; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
506; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
507; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
508; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
509; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
510; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
511; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
512; GFX9-DL-NEXT:    v_add3_u32 v1, v1, s0, v3
513; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
514; GFX9-DL-NEXT:    s_endpgm
515;
516; GFX10-DL-LABEL: idot2_MixedTypedMul:
517; GFX10-DL:       ; %bb.0: ; %entry
518; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
519; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
520; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
521; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
522; GFX10-DL-NEXT:    s_clause 0x1
523; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
524; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
525; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
526; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
527; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v0, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
528; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
529; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
530; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
531; GFX10-DL-NEXT:    v_add3_u32 v0, v1, s2, v0
532; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
533; GFX10-DL-NEXT:    s_endpgm
534                                               <2 x i16> addrspace(1)* %src2,
535                                               i32 addrspace(1)* nocapture %dst) {
536entry:
  ; Load one <2 x i16> from each source array, indexed by the value returned
  ; from llvm.amdgcn.workitem.id.x.
537  %idx = call i32 @llvm.amdgcn.workitem.id.x()
538  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
539  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
540  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
541  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
542
  ; Lane 0 product, both operands sign-extended.
543  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
544  %conv = sext i16 %s1.elt1 to i32
545  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
546  %conv2 = sext i16 %s2.elt1 to i32
547  %mul1 = mul nuw i32 %conv2, %conv
548
  ; Lane 1 product, both operands zero-extended (differs from lane 0).
549  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
550  %conv3 = zext i16 %s1.elt2 to i32
551  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
552  %conv4 = zext i16 %s2.elt2 to i32
553  %mul2 = mul nuw i32 %conv4, %conv3
554
  ; Accumulate as (mul2 + s3) + mul1 and write the result back to %dst.
555  %s3 = load i32, i32 addrspace(1)* %dst, align 4
556  %add = add i32 %mul2, %s3
557  %add6 = add i32 %add, %mul1
558  store i32 %add6, i32 addrspace(1)* %dst, align 4
559  ret void
560}
561
; udot2_alt_AddOperands: unsigned two-lane dot product with accumulate, with
; the operands of both adds written in the opposite order from @udot2
; (%s3 + %mul2, then %mul1 + %add).
; Per the assertions below, GFX9-DL and GFX10-DL still select v_dot2_u32_u16;
; GFX7, GFX8 and GFX9-NODL use a pair of v_mad_u32_u24.
562define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
563; GFX7-LABEL: udot2_alt_AddOperands:
564; GFX7:       ; %bb.0: ; %entry
565; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
566; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
567; GFX7-NEXT:    s_mov_b32 s3, 0xf000
568; GFX7-NEXT:    s_mov_b32 s10, 0
569; GFX7-NEXT:    s_mov_b32 s11, s3
570; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
571; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
572; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
573; GFX7-NEXT:    v_mov_b32_e32 v1, 0
574; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
575; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
576; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
577; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
578; GFX7-NEXT:    s_mov_b32 s2, -1
579; GFX7-NEXT:    s_waitcnt vmcnt(1)
580; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
581; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
582; GFX7-NEXT:    s_waitcnt vmcnt(0)
583; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
584; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
585; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
586; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, s4
587; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
588; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
589; GFX7-NEXT:    s_endpgm
590;
591; GFX8-LABEL: udot2_alt_AddOperands:
592; GFX8:       ; %bb.0: ; %entry
593; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
594; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
595; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
596; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
597; GFX8-NEXT:    v_mov_b32_e32 v1, s5
598; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
599; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
600; GFX8-NEXT:    flat_load_dword v3, v[0:1]
601; GFX8-NEXT:    v_mov_b32_e32 v1, s7
602; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
603; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
604; GFX8-NEXT:    flat_load_dword v0, v[0:1]
605; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
606; GFX8-NEXT:    s_waitcnt vmcnt(1)
607; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v3
608; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
609; GFX8-NEXT:    s_waitcnt vmcnt(0)
610; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v0
611; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
612; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
613; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, s2
614; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v1, v0
615; GFX8-NEXT:    v_mov_b32_e32 v0, s0
616; GFX8-NEXT:    v_mov_b32_e32 v1, s1
617; GFX8-NEXT:    flat_store_dword v[0:1], v2
618; GFX8-NEXT:    s_endpgm
619;
620; GFX9-NODL-LABEL: udot2_alt_AddOperands:
621; GFX9-NODL:       ; %bb.0: ; %entry
622; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
623; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
624; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
625; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
626; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
627; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
628; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
629; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
630; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
631; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xffff, v1
632; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
633; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xffff, v2
634; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
635; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
636; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
637; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, v2, v1, s0
638; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, v4, v3, v1
639; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
640; GFX9-NODL-NEXT:    s_endpgm
641;
642; GFX9-DL-LABEL: udot2_alt_AddOperands:
643; GFX9-DL:       ; %bb.0: ; %entry
644; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
645; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
646; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
647; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
648; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
649; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
650; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
651; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
652; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
653; GFX9-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s0
654; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
655; GFX9-DL-NEXT:    s_endpgm
656;
657; GFX10-DL-LABEL: udot2_alt_AddOperands:
658; GFX10-DL:       ; %bb.0: ; %entry
659; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
660; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
661; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
662; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
663; GFX10-DL-NEXT:    s_clause 0x1
664; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
665; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
666; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
667; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
668; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
669; GFX10-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s2
670; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
671; GFX10-DL-NEXT:    s_endpgm
672                                                 <2 x i16> addrspace(1)* %src2,
673                                                 i32 addrspace(1)* nocapture %dst) {
674entry:
  ; Load one <2 x i16> from each source array, indexed by the value returned
  ; from llvm.amdgcn.workitem.id.x.
675  %idx = call i32 @llvm.amdgcn.workitem.id.x()
676  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
677  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
678  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
679  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
680
  ; Lane 0 product, both operands zero-extended.
681  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
682  %conv = zext i16 %s1.elt1 to i32
683  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
684  %conv2 = zext i16 %s2.elt1 to i32
685  %mul1 = mul nuw i32 %conv2, %conv
686
  ; Lane 1 product, both operands zero-extended.
687  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
688  %conv3 = zext i16 %s1.elt2 to i32
689  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
690  %conv4 = zext i16 %s2.elt2 to i32
691  %mul2 = mul nuw i32 %conv4, %conv3
692
  ; Accumulate as mul1 + (s3 + mul2): the add operands are commuted on purpose.
693  %s3 = load i32, i32 addrspace(1)* %dst, align 4
694  %add = add i32 %s3, %mul2
695  %add6 = add i32 %mul1, %add
696  store i32 %add6, i32 addrspace(1)* %dst, align 4
697  ret void
698}
699
; Negative test: the two products do not share one extension kind. Lane 0
; multiplies sext(%vec1[0]) by zext(%vec2[0]); lane 1 multiplies two
; sign-extended elements. The checks below expect separate multiplies followed
; by an add on every target, i.e. no v_dot2 instruction is selected.
700define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1,
701; GFX7-LABEL: idot2_MixedExt:
702; GFX7:       ; %bb.0: ; %entry
703; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
704; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
705; GFX7-NEXT:    s_mov_b32 s3, 0xf000
706; GFX7-NEXT:    s_mov_b32 s10, 0
707; GFX7-NEXT:    s_mov_b32 s11, s3
708; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
709; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
710; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
711; GFX7-NEXT:    v_mov_b32_e32 v1, 0
712; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
713; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
714; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
715; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
716; GFX7-NEXT:    s_mov_b32 s2, -1
717; GFX7-NEXT:    s_waitcnt vmcnt(1)
718; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 16
719; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
720; GFX7-NEXT:    s_waitcnt vmcnt(0)
721; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v0
722; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
723; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
724; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, s4
725; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v1, v0
726; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
727; GFX7-NEXT:    s_endpgm
728;
729; GFX8-LABEL: idot2_MixedExt:
730; GFX8:       ; %bb.0: ; %entry
731; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
732; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
733; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
734; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
735; GFX8-NEXT:    v_mov_b32_e32 v1, s5
736; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
737; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
738; GFX8-NEXT:    flat_load_dword v3, v[0:1]
739; GFX8-NEXT:    v_mov_b32_e32 v1, s7
740; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
741; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
742; GFX8-NEXT:    flat_load_dword v0, v[0:1]
743; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
744; GFX8-NEXT:    s_waitcnt vmcnt(1)
745; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
746; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
747; GFX8-NEXT:    s_waitcnt vmcnt(0)
748; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v0
749; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
750; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
751; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v3, s2
752; GFX8-NEXT:    v_mad_i32_i24 v2, v2, v1, v0
753; GFX8-NEXT:    v_mov_b32_e32 v0, s0
754; GFX8-NEXT:    v_mov_b32_e32 v1, s1
755; GFX8-NEXT:    flat_store_dword v[0:1], v2
756; GFX8-NEXT:    s_endpgm
757;
758; GFX9-NODL-LABEL: idot2_MixedExt:
759; GFX9-NODL:       ; %bb.0: ; %entry
760; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
761; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
762; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
763; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
764; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
765; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
766; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
767; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
768; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
769; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
770; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
771; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
772; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
773; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
774; GFX9-NODL-NEXT:    s_endpgm
775;
776; GFX9-DL-LABEL: idot2_MixedExt:
777; GFX9-DL:       ; %bb.0: ; %entry
778; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
779; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
780; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
781; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
782; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
783; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
784; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
785; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
786; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
787; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
788; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
789; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
790; GFX9-DL-NEXT:    v_add3_u32 v1, v1, s0, v3
791; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
792; GFX9-DL-NEXT:    s_endpgm
793;
794; GFX10-DL-LABEL: idot2_MixedExt:
795; GFX10-DL:       ; %bb.0: ; %entry
796; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
797; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
798; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
799; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
800; GFX10-DL-NEXT:    s_clause 0x1
801; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
802; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
803; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
804; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
805; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v0, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
806; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
807; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
808; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
809; GFX10-DL-NEXT:    v_add3_u32 v0, v1, s2, v0
810; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
811; GFX10-DL-NEXT:    s_endpgm
812                                          <2 x i16> addrspace(1)* %src2,
813                                          i32 addrspace(1)* nocapture %dst) {
814entry:
  ; Each work-item loads its own <2 x i16> from %src1 and %src2.
815  %idx = call i32 @llvm.amdgcn.workitem.id.x()
816  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
817  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
818  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
819  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
820
  ; Lane 0: sign-extended element of %vec1 times zero-extended element of %vec2.
821  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
822  %conv = sext i16 %s1.elt1 to i32
823  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
824  %conv2 = zext i16 %s2.elt1 to i32
825  %mul1 = mul nuw i32 %conv2, %conv
826
  ; Lane 1: both elements are sign-extended.
827  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
828  %conv3 = sext i16 %s1.elt2 to i32
829  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
830  %conv4 = sext i16 %s2.elt2 to i32
831  %mul2 = mul nuw i32 %conv4, %conv3
832
  ; Add both products to the i32 currently at %dst and store the sum back.
833  %s3 = load i32, i32 addrspace(1)* %dst, align 4
834  %add = add i32 %mul2, %s3
835  %add6 = add i32 %add, %mul1
836  store i32 %add6, i32 addrspace(1)* %dst, align 4
837  ret void
838}
839
; Negative test: each product squares one element of a single vector instead
; of pairing %vec1 with %vec2 (%mul1 = %vec1[0] * %vec1[0] and
; %mul2 = %vec2[1] * %vec2[1]). The checks below expect two multiplies plus an
; add on every target and no v_dot2 instruction.
840define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1,
841; GFX7-LABEL: notudot2_SameVec:
842; GFX7:       ; %bb.0: ; %entry
843; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
844; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
845; GFX7-NEXT:    s_mov_b32 s3, 0xf000
846; GFX7-NEXT:    s_mov_b32 s10, 0
847; GFX7-NEXT:    s_mov_b32 s11, s3
848; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
849; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
850; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
851; GFX7-NEXT:    v_mov_b32_e32 v1, 0
852; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
853; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
854; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
855; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
856; GFX7-NEXT:    s_mov_b32 s2, -1
857; GFX7-NEXT:    s_waitcnt vmcnt(1)
858; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v2
859; GFX7-NEXT:    s_waitcnt vmcnt(0)
860; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
861; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
862; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v0, s4
863; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v1, v0
864; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
865; GFX7-NEXT:    s_endpgm
866;
867; GFX8-LABEL: notudot2_SameVec:
868; GFX8:       ; %bb.0: ; %entry
869; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
870; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
871; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
872; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
873; GFX8-NEXT:    v_mov_b32_e32 v1, s5
874; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
875; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
876; GFX8-NEXT:    flat_load_dword v3, v[0:1]
877; GFX8-NEXT:    v_mov_b32_e32 v1, s7
878; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
879; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
880; GFX8-NEXT:    flat_load_dword v0, v[0:1]
881; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
882; GFX8-NEXT:    s_waitcnt vmcnt(1)
883; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v3
884; GFX8-NEXT:    s_waitcnt vmcnt(0)
885; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
886; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
887; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v0, s2
888; GFX8-NEXT:    v_mad_u32_u24 v2, v1, v1, v0
889; GFX8-NEXT:    v_mov_b32_e32 v0, s0
890; GFX8-NEXT:    v_mov_b32_e32 v1, s1
891; GFX8-NEXT:    flat_store_dword v[0:1], v2
892; GFX8-NEXT:    s_endpgm
893;
894; GFX9-NODL-LABEL: notudot2_SameVec:
895; GFX9-NODL:       ; %bb.0: ; %entry
896; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
897; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
898; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
899; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
900; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
901; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
902; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
903; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
904; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
905; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
906; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
907; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
908; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
909; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, s0, v1
910; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
911; GFX9-NODL-NEXT:    s_endpgm
912;
913; GFX9-DL-LABEL: notudot2_SameVec:
914; GFX9-DL:       ; %bb.0: ; %entry
915; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
916; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
917; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
918; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
919; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
920; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
921; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
922; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
923; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
924; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
925; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
926; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
927; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
928; GFX9-DL-NEXT:    v_add3_u32 v1, v2, s0, v1
929; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
930; GFX9-DL-NEXT:    s_endpgm
931;
932; GFX10-DL-LABEL: notudot2_SameVec:
933; GFX10-DL:       ; %bb.0: ; %entry
934; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
935; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
936; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
937; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
938; GFX10-DL-NEXT:    s_clause 0x1
939; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
940; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
941; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
942; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
943; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
944; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
945; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
946; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
947; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
948; GFX10-DL-NEXT:    v_add3_u32 v0, v1, s2, v0
949; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
950; GFX10-DL-NEXT:    s_endpgm
951                                            <2 x i16> addrspace(1)* %src2,
952                                            i32 addrspace(1)* nocapture %dst) {
953entry:
  ; Each work-item loads its own <2 x i16> from %src1 and %src2.
954  %idx = call i32 @llvm.amdgcn.workitem.id.x()
955  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
956  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
957  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
958  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
959
  ; Both factors of the first product are element 0 of %vec1.
960  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
961  %conv = zext i16 %s1.elt1 to i32
962  %s2.elt1 = extractelement <2 x i16> %vec1, i64 0
963  %conv2 = zext i16 %s2.elt1 to i32
964  %mul1 = mul i32 %conv2, %conv
965
  ; Both factors of the second product are element 1 of %vec2.
966  %s1.elt2 = extractelement <2 x i16> %vec2, i64 1
967  %conv3 = zext i16 %s1.elt2 to i32
968  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
969  %conv4 = zext i16 %s2.elt2 to i32
970  %mul2 = mul i32 %conv4, %conv3
971
  ; Add both products to the i32 currently at %dst and store the sum back.
972  %s3 = load i32, i32 addrspace(1)* %dst, align 4
973  %add = add i32 %mul2, %s3
974  %add6 = add i32 %add, %mul1
975  store i32 %add6, i32 addrspace(1)* %dst, align 4
976  ret void
977}
978
; Unsigned dot product of elements 0 and 1 of two <4 x i16> vectors, added to
; the i32 loaded from %dst. The checks for the two -DL prefixes expect one
; dword load per vector feeding v_dot2_u32_u16; the remaining prefixes expect
; multiply/add sequences instead.
979define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1,
980; GFX7-LABEL: udot2_v4i16:
981; GFX7:       ; %bb.0: ; %entry
982; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
983; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
984; GFX7-NEXT:    s_mov_b32 s3, 0xf000
985; GFX7-NEXT:    s_mov_b32 s10, 0
986; GFX7-NEXT:    s_mov_b32 s11, s3
987; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
988; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
989; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
990; GFX7-NEXT:    v_mov_b32_e32 v1, 0
991; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
992; GFX7-NEXT:    s_mov_b64 s[6:7], s[10:11]
993; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
994; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
995; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
996; GFX7-NEXT:    s_mov_b32 s2, -1
997; GFX7-NEXT:    s_waitcnt vmcnt(1)
998; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v2
999; GFX7-NEXT:    s_waitcnt vmcnt(0)
1000; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v0
1001; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1002; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1003; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1004; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, s4
1005; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v1, v0
1006; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1007; GFX7-NEXT:    s_endpgm
1008;
1009; GFX8-LABEL: udot2_v4i16:
1010; GFX8:       ; %bb.0: ; %entry
1011; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1012; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1013; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1014; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1015; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1016; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1017; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1018; GFX8-NEXT:    v_mov_b32_e32 v3, s7
1019; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s6, v2
1020; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1021; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1022; GFX8-NEXT:    flat_load_dword v1, v[2:3]
1023; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
1024; GFX8-NEXT:    s_waitcnt vmcnt(1)
1025; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v0
1026; GFX8-NEXT:    s_waitcnt vmcnt(0)
1027; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff, v1
1028; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1029; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1030; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1031; GFX8-NEXT:    v_mad_u32_u24 v0, v1, v0, s2
1032; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v2, v0
1033; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1034; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1035; GFX8-NEXT:    flat_store_dword v[0:1], v2
1036; GFX8-NEXT:    s_endpgm
1037;
1038; GFX9-NODL-LABEL: udot2_v4i16:
1039; GFX9-NODL:       ; %bb.0: ; %entry
1040; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1041; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1042; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1043; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1044; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
1045; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
1046; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
1047; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1048; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1049; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1050; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1051; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1052; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
1053; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
1054; GFX9-NODL-NEXT:    s_endpgm
1055;
1056; GFX9-DL-LABEL: udot2_v4i16:
1057; GFX9-DL:       ; %bb.0: ; %entry
1058; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1059; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1060; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1061; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1062; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1063; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1064; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
1065; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1066; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1067; GFX9-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s0
1068; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
1069; GFX9-DL-NEXT:    s_endpgm
1070;
1071; GFX10-DL-LABEL: udot2_v4i16:
1072; GFX10-DL:       ; %bb.0: ; %entry
1073; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1074; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1075; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1076; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1077; GFX10-DL-NEXT:    s_clause 0x1
1078; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1079; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1080; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
1081; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
1082; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1083; GFX10-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s2
1084; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
1085; GFX10-DL-NEXT:    s_endpgm
1086                                       <4 x i16> addrspace(1)* %src2,
1087                                       i32 addrspace(1)* nocapture %dst) {
1088entry:
  ; Each work-item loads its own <4 x i16> from %src1 and %src2.
1089  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1090  %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src1, i32 %idx
1091  %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %gep1
1092  %gep2 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src2, i32 %idx
1093  %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %gep2
1094
  ; First product: zero-extended element 0 of each vector.
1095  %s1.elt1 = extractelement <4 x i16> %vec1, i64 0
1096  %conv = zext i16 %s1.elt1 to i32
1097  %s2.elt1 = extractelement <4 x i16> %vec2, i64 0
1098  %conv2 = zext i16 %s2.elt1 to i32
1099  %mul1 = mul i32 %conv2, %conv
1100
  ; Second product: zero-extended element 1 of each vector.
1101  %s1.elt2 = extractelement <4 x i16> %vec1, i64 1
1102  %conv3 = zext i16 %s1.elt2 to i32
1103  %s2.elt2 = extractelement <4 x i16> %vec2, i64 1
1104  %conv4 = zext i16 %s2.elt2 to i32
1105  %mul2 = mul i32 %conv4, %conv3
1106
  ; Add both products to the i32 currently at %dst and store the sum back.
1107  %s3 = load i32, i32 addrspace(1)* %dst, align 4
1108  %add = add i32 %mul2, %s3
1109  %add6 = add i32 %add, %mul1
1110  store i32 %add6, i32 addrspace(1)* %dst, align 4
1111  ret void
1112}
1113
; Same shape as an unsigned 2-element dot product, but using elements 2 and 3
; of the <4 x i16> vectors. The checks for the two -DL prefixes expect dword
; loads at offset:4 feeding v_dot2_u32_u16; the remaining prefixes expect
; multiply/add sequences instead.
1114define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1,
1115; GFX7-LABEL: udot2_v4i16_Hi:
1116; GFX7:       ; %bb.0: ; %entry
1117; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1118; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1119; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1120; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1121; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1122; GFX7-NEXT:    s_mov_b32 s10, 0
1123; GFX7-NEXT:    s_mov_b32 s11, s3
1124; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1125; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1126; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
1127; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1128; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4
1129; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
1130; GFX7-NEXT:    s_mov_b32 s2, -1
1131; GFX7-NEXT:    s_waitcnt vmcnt(1)
1132; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v2
1133; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1134; GFX7-NEXT:    s_waitcnt vmcnt(0)
1135; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v0
1136; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1137; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1138; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, s4
1139; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v1, v0
1140; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1141; GFX7-NEXT:    s_endpgm
1142;
1143; GFX8-LABEL: udot2_v4i16_Hi:
1144; GFX8:       ; %bb.0: ; %entry
1145; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1146; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1147; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1148; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1149; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1150; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s4, v0
1151; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1152; GFX8-NEXT:    v_mov_b32_e32 v3, s7
1153; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s6, v0
1154; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1155; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v2
1156; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1157; GFX8-NEXT:    flat_load_dword v2, v[0:1]
1158; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v4
1159; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
1160; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1161; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
1162; GFX8-NEXT:    s_waitcnt vmcnt(1)
1163; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v2
1164; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1165; GFX8-NEXT:    s_waitcnt vmcnt(0)
1166; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff, v0
1167; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1168; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1169; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v2, s2
1170; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v1, v0
1171; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1172; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1173; GFX8-NEXT:    flat_store_dword v[0:1], v2
1174; GFX8-NEXT:    s_endpgm
1175;
1176; GFX9-NODL-LABEL: udot2_v4i16_Hi:
1177; GFX9-NODL:       ; %bb.0: ; %entry
1178; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1179; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1180; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1181; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1182; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5] offset:4
1183; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7] offset:4
1184; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
1185; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1186; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1187; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1188; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1189; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1190; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
1191; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
1192; GFX9-NODL-NEXT:    s_endpgm
1193;
1194; GFX9-DL-LABEL: udot2_v4i16_Hi:
1195; GFX9-DL:       ; %bb.0: ; %entry
1196; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1197; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1198; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1199; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1200; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5] offset:4
1201; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7] offset:4
1202; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
1203; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1204; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1205; GFX9-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s0
1206; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
1207; GFX9-DL-NEXT:    s_endpgm
1208;
1209; GFX10-DL-LABEL: udot2_v4i16_Hi:
1210; GFX10-DL:       ; %bb.0: ; %entry
1211; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1212; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1213; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1214; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1215; GFX10-DL-NEXT:    s_clause 0x1
1216; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5] offset:4
1217; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7] offset:4
1218; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
1219; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
1220; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1221; GFX10-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s2
1222; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
1223; GFX10-DL-NEXT:    s_endpgm
1224                                          <4 x i16> addrspace(1)* %src2,
1225                                          i32 addrspace(1)* nocapture %dst) {
1226entry:
  ; Each work-item loads its own <4 x i16> from %src1 and %src2.
1227  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1228  %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src1, i32 %idx
1229  %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %gep1
1230  %gep2 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src2, i32 %idx
1231  %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %gep2
1232
  ; First product: zero-extended element 2 of each vector.
1233  %s1.elt1 = extractelement <4 x i16> %vec1, i64 2
1234  %conv = zext i16 %s1.elt1 to i32
1235  %s2.elt1 = extractelement <4 x i16> %vec2, i64 2
1236  %conv2 = zext i16 %s2.elt1 to i32
1237  %mul1 = mul i32 %conv2, %conv
1238
  ; Second product: zero-extended element 3 of each vector.
1239  %s1.elt2 = extractelement <4 x i16> %vec1, i64 3
1240  %conv3 = zext i16 %s1.elt2 to i32
1241  %s2.elt2 = extractelement <4 x i16> %vec2, i64 3
1242  %conv4 = zext i16 %s2.elt2 to i32
1243  %mul2 = mul i32 %conv4, %conv3
1244
  ; Add both products to the i32 currently at %dst and store the sum back.
1245  %s3 = load i32, i32 addrspace(1)* %dst, align 4
1246  %add = add i32 %mul2, %s3
1247  %add6 = add i32 %add, %mul1
1248  store i32 %add6, i32 addrspace(1)* %dst, align 4
1249  ret void
1250}
1251
; Negative test: the products use elements 0 and 2 of the <4 x i16> vectors,
; which are not adjacent. The checks below expect dwordx2 loads with separate
; multiplies and an add on every target, and no v_dot2 instruction.
1252define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1,
1253; GFX7-LABEL: notudot2_v4i16_Even:
1254; GFX7:       ; %bb.0: ; %entry
1255; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1256; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1257; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1258; GFX7-NEXT:    s_mov_b32 s10, 0
1259; GFX7-NEXT:    s_mov_b32 s11, s3
1260; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1261; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1262; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1263; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1264; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
1265; GFX7-NEXT:    s_mov_b64 s[6:7], s[10:11]
1266; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
1267; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
1268; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
1269; GFX7-NEXT:    s_mov_b32 s2, -1
1270; GFX7-NEXT:    s_waitcnt vmcnt(1)
1271; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
1272; GFX7-NEXT:    s_waitcnt vmcnt(0)
1273; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1274; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1275; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1276; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1277; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v3, s4
1278; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
1279; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1280; GFX7-NEXT:    s_endpgm
1281;
1282; GFX8-LABEL: notudot2_v4i16_Even:
1283; GFX8:       ; %bb.0: ; %entry
1284; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1285; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1286; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1287; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1288; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1289; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1290; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1291; GFX8-NEXT:    v_mov_b32_e32 v3, s7
1292; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s6, v2
1293; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1294; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1295; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
1296; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
1297; GFX8-NEXT:    s_waitcnt vmcnt(1)
1298; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1299; GFX8-NEXT:    s_waitcnt vmcnt(0)
1300; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff, v3
1301; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1302; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1303; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1304; GFX8-NEXT:    v_mad_u32_u24 v1, v3, v1, s2
1305; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v0, v1
1306; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1307; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1308; GFX8-NEXT:    flat_store_dword v[0:1], v2
1309; GFX8-NEXT:    s_endpgm
1310;
1311; GFX9-NODL-LABEL: notudot2_v4i16_Even:
1312; GFX9-NODL:       ; %bb.0: ; %entry
1313; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1314; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1315; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1316; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1317; GFX9-NODL-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
1318; GFX9-NODL-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
1319; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
1320; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, 0
1321; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1322; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1323; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1324; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1325; GFX9-NODL-NEXT:    v_add3_u32 v0, v1, s0, v0
1326; GFX9-NODL-NEXT:    global_store_dword v4, v0, s[2:3]
1327; GFX9-NODL-NEXT:    s_endpgm
1328;
1329; GFX9-DL-LABEL: notudot2_v4i16_Even:
1330; GFX9-DL:       ; %bb.0: ; %entry
1331; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1332; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1333; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1334; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1335; GFX9-DL-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
1336; GFX9-DL-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
1337; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
1338; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 0
1339; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1340; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1341; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1342; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1343; GFX9-DL-NEXT:    v_add3_u32 v0, v1, s0, v0
1344; GFX9-DL-NEXT:    global_store_dword v4, v0, s[2:3]
1345; GFX9-DL-NEXT:    s_endpgm
1346;
1347; GFX10-DL-LABEL: notudot2_v4i16_Even:
1348; GFX10-DL:       ; %bb.0: ; %entry
1349; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1350; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1351; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1352; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1353; GFX10-DL-NEXT:    s_clause 0x1
1354; GFX10-DL-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
1355; GFX10-DL-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
1356; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
1357; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1358; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1359; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1360; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
1361; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1362; GFX10-DL-NEXT:    v_add3_u32 v0, v1, s2, v0
1363; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
1364; GFX10-DL-NEXT:    s_endpgm
1365                                               <4 x i16> addrspace(1)* %src2,
1366                                               i32 addrspace(1)* nocapture %dst) {
1367entry:
  ; Each work-item loads its own <4 x i16> from %src1 and %src2.
1368  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1369  %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src1, i32 %idx
1370  %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %gep1
1371  %gep2 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src2, i32 %idx
1372  %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %gep2
1373
  ; First product: zero-extended element 0 of each vector.
1374  %s1.elt1 = extractelement <4 x i16> %vec1, i64 0
1375  %conv = zext i16 %s1.elt1 to i32
1376  %s2.elt1 = extractelement <4 x i16> %vec2, i64 0
1377  %conv2 = zext i16 %s2.elt1 to i32
1378  %mul1 = mul i32 %conv2, %conv
1379
  ; Second product: zero-extended element 2 of each vector (element 1 is skipped).
1380  %s1.elt2 = extractelement <4 x i16> %vec1, i64 2
1381  %conv3 = zext i16 %s1.elt2 to i32
1382  %s2.elt2 = extractelement <4 x i16> %vec2, i64 2
1383  %conv4 = zext i16 %s2.elt2 to i32
1384  %mul2 = mul i32 %conv4, %conv3
1385
  ; Add both products to the i32 currently at %dst and store the sum back.
1386  %s3 = load i32, i32 addrspace(1)* %dst, align 4
1387  %add = add i32 %mul2, %s3
1388  %add6 = add i32 %add, %mul1
1389  store i32 %add6, i32 addrspace(1)* %dst, align 4
1390  ret void
1391}
1392
; Negative test for the 2-element unsigned dot-product pattern on <4 x i16>
; inputs. Each work-item loads one <4 x i16> from %src1 and one from %src2
; (indexed by the workitem id), multiplies the zero-extended lanes 1 and 2
; pairwise, adds both products to the i32 loaded from %dst and stores the sum
; back to %dst. In the expectations below the lane-1 product reads WORD_1 of
; the first loaded dword and the lane-2 product reads WORD_0 of the second
; dword, so the two lanes do not share a 32-bit register. The expected output
; for every check prefix, including the -DL ones, is plain mul/mad/add3 code
; with no v_dot2 instruction.
1393define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1,
1394; GFX7-LABEL: notudot2_v4i16_Middle:
1395; GFX7:       ; %bb.0: ; %entry
1396; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1397; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1398; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1399; GFX7-NEXT:    s_mov_b32 s10, 0
1400; GFX7-NEXT:    s_mov_b32 s11, s3
1401; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1402; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1403; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1404; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1405; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
1406; GFX7-NEXT:    s_mov_b64 s[6:7], s[10:11]
1407; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
1408; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
1409; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
1410; GFX7-NEXT:    s_mov_b32 s2, -1
1411; GFX7-NEXT:    s_waitcnt vmcnt(1)
1412; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v3
1413; GFX7-NEXT:    s_waitcnt vmcnt(0)
1414; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1415; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1416; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1417; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1418; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v3, s4
1419; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
1420; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1421; GFX7-NEXT:    s_endpgm
1422;
1423; GFX8-LABEL: notudot2_v4i16_Middle:
1424; GFX8:       ; %bb.0: ; %entry
1425; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1426; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1427; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1428; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1429; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1430; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1431; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1432; GFX8-NEXT:    v_mov_b32_e32 v3, s7
1433; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s6, v2
1434; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1435; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1436; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
1437; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
1438; GFX8-NEXT:    s_waitcnt vmcnt(1)
1439; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1440; GFX8-NEXT:    s_waitcnt vmcnt(0)
1441; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff, v3
1442; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1443; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1444; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1445; GFX8-NEXT:    v_mad_u32_u24 v1, v3, v1, s2
1446; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v0, v1
1447; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1448; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1449; GFX8-NEXT:    flat_store_dword v[0:1], v2
1450; GFX8-NEXT:    s_endpgm
1451;
1452; GFX9-NODL-LABEL: notudot2_v4i16_Middle:
1453; GFX9-NODL:       ; %bb.0: ; %entry
1454; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1455; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1456; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1457; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1458; GFX9-NODL-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
1459; GFX9-NODL-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
1460; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
1461; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, 0
1462; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1463; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1464; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1465; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1466; GFX9-NODL-NEXT:    v_add3_u32 v0, v1, s0, v0
1467; GFX9-NODL-NEXT:    global_store_dword v4, v0, s[2:3]
1468; GFX9-NODL-NEXT:    s_endpgm
1469;
1470; GFX9-DL-LABEL: notudot2_v4i16_Middle:
1471; GFX9-DL:       ; %bb.0: ; %entry
1472; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1473; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1474; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1475; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1476; GFX9-DL-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
1477; GFX9-DL-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
1478; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
1479; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 0
1480; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1481; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1482; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1483; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1484; GFX9-DL-NEXT:    v_add3_u32 v0, v1, s0, v0
1485; GFX9-DL-NEXT:    global_store_dword v4, v0, s[2:3]
1486; GFX9-DL-NEXT:    s_endpgm
1487;
1488; GFX10-DL-LABEL: notudot2_v4i16_Middle:
1489; GFX10-DL:       ; %bb.0: ; %entry
1490; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1491; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1492; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1493; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1494; GFX10-DL-NEXT:    s_clause 0x1
1495; GFX10-DL-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
1496; GFX10-DL-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
1497; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
1498; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1499; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1500; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1501; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
1502; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1503; GFX10-DL-NEXT:    v_add3_u32 v0, v1, s2, v0
1504; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
1505; GFX10-DL-NEXT:    s_endpgm
1506                                                 <4 x i16> addrspace(1)* %src2,
1507                                                 i32 addrspace(1)* nocapture %dst) {
1508entry:
  ; Per-work-item loads of one <4 x i16> from each source buffer.
1509  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1510  %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src1, i32 %idx
1511  %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %gep1
1512  %gep2 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src2, i32 %idx
1513  %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %gep2
1514
  ; First product, lane 1 of both vectors, zero-extended to i32.
1515  %s1.elt1 = extractelement <4 x i16> %vec1, i64 1
1516  %conv = zext i16 %s1.elt1 to i32
1517  %s2.elt1 = extractelement <4 x i16> %vec2, i64 1
1518  %conv2 = zext i16 %s2.elt1 to i32
1519  %mul1 = mul i32 %conv2, %conv
1520
  ; Second product, lane 2 of both vectors, zero-extended to i32.
1521  %s1.elt2 = extractelement <4 x i16> %vec1, i64 2
1522  %conv3 = zext i16 %s1.elt2 to i32
1523  %s2.elt2 = extractelement <4 x i16> %vec2, i64 2
1524  %conv4 = zext i16 %s2.elt2 to i32
1525  %mul2 = mul i32 %conv4, %conv3
1526
  ; Accumulate both products onto the value read from %dst and write it back.
1527  %s3 = load i32, i32 addrspace(1)* %dst, align 4
1528  %add = add i32 %mul2, %s3
1529  %add6 = add i32 %add, %mul1
1530  store i32 %add6, i32 addrspace(1)* %dst, align 4
1531  ret void
1532}
1533
; Negative test for the 2-element unsigned dot-product pattern with crossed
; lane indices. Each work-item loads one <2 x i16> from %src1 and one from
; %src2, then multiplies lane 0 of %vec1 by lane 1 of %vec2 and lane 1 of
; %vec1 by lane 0 of %vec2 (all zero-extended to i32). Both products are added
; to the i32 loaded from %dst and the sum is stored back. Because the paired
; lanes differ, the expected output for every check prefix, including the -DL
; ones, is mul/mad/add3 code with mixed WORD_0/WORD_1 operand selects and no
; v_dot2 instruction.
1534define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1,
1535; GFX7-LABEL: notudot2_DiffIndex:
1536; GFX7:       ; %bb.0: ; %entry
1537; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1538; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1539; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1540; GFX7-NEXT:    s_mov_b32 s10, 0
1541; GFX7-NEXT:    s_mov_b32 s11, s3
1542; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1543; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1544; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1545; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1546; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1547; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1548; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1549; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
1550; GFX7-NEXT:    s_mov_b32 s2, -1
1551; GFX7-NEXT:    s_waitcnt vmcnt(1)
1552; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1553; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1554; GFX7-NEXT:    s_waitcnt vmcnt(0)
1555; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
1556; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1557; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1558; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v1, s4
1559; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v2, v0
1560; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1561; GFX7-NEXT:    s_endpgm
1562;
1563; GFX8-LABEL: notudot2_DiffIndex:
1564; GFX8:       ; %bb.0: ; %entry
1565; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1566; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1567; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1568; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1569; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1570; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1571; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1572; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1573; GFX8-NEXT:    v_mov_b32_e32 v1, s7
1574; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
1575; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1576; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1577; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
1578; GFX8-NEXT:    s_waitcnt vmcnt(1)
1579; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v3
1580; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1581; GFX8-NEXT:    s_waitcnt vmcnt(0)
1582; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
1583; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1584; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1585; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, s2
1586; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v1, v0
1587; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1588; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1589; GFX8-NEXT:    flat_store_dword v[0:1], v2
1590; GFX8-NEXT:    s_endpgm
1591;
1592; GFX9-NODL-LABEL: notudot2_DiffIndex:
1593; GFX9-NODL:       ; %bb.0: ; %entry
1594; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1595; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1596; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1597; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1598; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
1599; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
1600; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
1601; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1602; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1603; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
1604; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
1605; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1606; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
1607; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
1608; GFX9-NODL-NEXT:    s_endpgm
1609;
1610; GFX9-DL-LABEL: notudot2_DiffIndex:
1611; GFX9-DL:       ; %bb.0: ; %entry
1612; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1613; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1614; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1615; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1616; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1617; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1618; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
1619; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1620; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1621; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
1622; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
1623; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1624; GFX9-DL-NEXT:    v_add3_u32 v1, v1, s0, v3
1625; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
1626; GFX9-DL-NEXT:    s_endpgm
1627;
1628; GFX10-DL-LABEL: notudot2_DiffIndex:
1629; GFX10-DL:       ; %bb.0: ; %entry
1630; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1631; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1632; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1633; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1634; GFX10-DL-NEXT:    s_clause 0x1
1635; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1636; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1637; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
1638; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1639; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
1640; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
1641; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
1642; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1643; GFX10-DL-NEXT:    v_add3_u32 v0, v1, s2, v0
1644; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
1645; GFX10-DL-NEXT:    s_endpgm
1646                                              <2 x i16> addrspace(1)* %src2,
1647                                              i32 addrspace(1)* nocapture %dst) {
1648entry:
  ; Per-work-item loads of one <2 x i16> from each source buffer.
1649  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1650  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
1651  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
1652  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
1653  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
1654
  ; First product pairs lane 0 of %vec1 with lane 1 of %vec2.
1655  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1656  %conv = zext i16 %s1.elt1 to i32
1657  %s2.elt1 = extractelement <2 x i16> %vec2, i64 1
1658  %conv2 = zext i16 %s2.elt1 to i32
1659  %mul1 = mul i32 %conv2, %conv
1660
  ; Second product pairs lane 1 of %vec1 with lane 0 of %vec2.
1661  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1662  %conv3 = zext i16 %s1.elt2 to i32
1663  %s2.elt2 = extractelement <2 x i16> %vec2, i64 0
1664  %conv4 = zext i16 %s2.elt2 to i32
1665  %mul2 = mul i32 %conv4, %conv3
1666
  ; Accumulate both products onto the value read from %dst and write it back.
1667  %s3 = load i32, i32 addrspace(1)* %dst, align 4
1668  %add = add i32 %mul2, %s3
1669  %add6 = add i32 %add, %mul1
1670  store i32 %add6, i32 addrspace(1)* %dst, align 4
1671  ret void
1672}
1673
; Unsigned 2-element dot-product shape in which the inner add has a second
; user. Each work-item loads one <2 x i16> from %src1 and one from %src2,
; multiplies the zero-extended lanes pairwise (0 with 0, 1 with 1) and
; accumulates onto the i32 loaded from %dst. The intermediate %add1
; (lane-1 product plus accumulator) feeds both %add2 and the final %res, so
; the value stored to %dst is %add2 + %add1. The expected output for every
; check prefix, including the -DL ones, keeps a separate mad for %add1
; followed by an add/add3, and contains no v_dot2 instruction.
1674define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1,
1675; GFX7-LABEL: udot2_MultipleUses_add1:
1676; GFX7:       ; %bb.0: ; %entry
1677; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1678; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1679; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1680; GFX7-NEXT:    s_mov_b32 s10, 0
1681; GFX7-NEXT:    s_mov_b32 s11, s3
1682; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1683; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1684; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1685; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1686; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1687; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1688; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1689; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
1690; GFX7-NEXT:    s_mov_b32 s2, -1
1691; GFX7-NEXT:    s_waitcnt vmcnt(1)
1692; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1693; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1694; GFX7-NEXT:    s_waitcnt vmcnt(0)
1695; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
1696; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1697; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1698; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, s4
1699; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
1700; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1701; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1702; GFX7-NEXT:    s_endpgm
1703;
1704; GFX8-LABEL: udot2_MultipleUses_add1:
1705; GFX8:       ; %bb.0: ; %entry
1706; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1707; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1708; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1709; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1710; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1711; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1712; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1713; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1714; GFX8-NEXT:    v_mov_b32_e32 v1, s7
1715; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
1716; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1717; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1718; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
1719; GFX8-NEXT:    s_waitcnt vmcnt(1)
1720; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v3
1721; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1722; GFX8-NEXT:    s_waitcnt vmcnt(0)
1723; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v0
1724; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1725; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1726; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, s2
1727; GFX8-NEXT:    v_mad_u32_u24 v1, v2, v1, v0
1728; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v1, v0
1729; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1730; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1731; GFX8-NEXT:    flat_store_dword v[0:1], v2
1732; GFX8-NEXT:    s_endpgm
1733;
1734; GFX9-NODL-LABEL: udot2_MultipleUses_add1:
1735; GFX9-NODL:       ; %bb.0: ; %entry
1736; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1737; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1738; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1739; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1740; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
1741; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
1742; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
1743; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1744; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1745; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1746; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1747; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1748; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1749; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, v2, v1, s0
1750; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v3, v1
1751; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
1752; GFX9-NODL-NEXT:    s_endpgm
1753;
1754; GFX9-DL-LABEL: udot2_MultipleUses_add1:
1755; GFX9-DL:       ; %bb.0: ; %entry
1756; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1757; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1758; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1759; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1760; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1761; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1762; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
1763; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1764; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1765; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1766; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1767; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1768; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1769; GFX9-DL-NEXT:    v_mad_u32_u24 v1, v2, v1, s0
1770; GFX9-DL-NEXT:    v_add3_u32 v1, v1, v3, v1
1771; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
1772; GFX9-DL-NEXT:    s_endpgm
1773;
1774; GFX10-DL-LABEL: udot2_MultipleUses_add1:
1775; GFX10-DL:       ; %bb.0: ; %entry
1776; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1777; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1778; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1779; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1780; GFX10-DL-NEXT:    s_clause 0x1
1781; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1782; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1783; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
1784; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1785; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
1786; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1787; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
1788; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1789; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
1790; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1791; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v3, v0, s2
1792; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v1, v0
1793; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
1794; GFX10-DL-NEXT:    s_endpgm
1795                                                   <2 x i16> addrspace(1)* %src2,
1796                                                   i32 addrspace(1)* nocapture %dst) {
1797entry:
  ; Per-work-item loads of one <2 x i16> from each source buffer.
1798  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1799  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
1800  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
1801  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
1802  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
1803
  ; Lane-0 product, zero-extended operands.
1804  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1805  %conv = zext i16 %s1.elt1 to i32
1806  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
1807  %conv2 = zext i16 %s2.elt1 to i32
1808  %mul1 = mul i32 %conv2, %conv
1809
  ; Lane-1 product, zero-extended operands.
1810  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1811  %conv3 = zext i16 %s1.elt2 to i32
1812  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
1813  %conv4 = zext i16 %s2.elt2 to i32
1814  %mul2 = mul i32 %conv4, %conv3
1815
  ; Dot-product style accumulation onto the value read from %dst.
1816  %s3 = load i32, i32 addrspace(1)* %dst, align 4
1817  %add1 = add i32 %mul2, %s3
1818  %add2 = add i32 %add1, %mul1
1819
  ; Second use of %add1, outside the mul/add chain that forms %add2.
1820  %res = add i32 %add2, %add1
1821  store i32 %res, i32 addrspace(1)* %dst, align 4
1822  ret void
1823}
1824
; Signed 2-element dot-product shape in which the inner add has a second
; user. Each work-item loads one <2 x i16> from %src1 and one from %src2,
; multiplies the sign-extended lanes pairwise (0 with 0, 1 with 1) and
; accumulates onto the i32 loaded from %dst. The intermediate %add1
; (lane-1 product plus accumulator) feeds both %add2 and the final %res, so
; the value stored to %dst is %add2 + %add1. The expected output for every
; check prefix, including the -DL ones, keeps a separate signed mad for %add1
; followed by an add/add3, and contains no v_dot2 instruction.
1825define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1,
1826; GFX7-LABEL: idot2_MultipleUses_add1:
1827; GFX7:       ; %bb.0: ; %entry
1828; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1829; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1830; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1831; GFX7-NEXT:    s_mov_b32 s10, 0
1832; GFX7-NEXT:    s_mov_b32 s11, s3
1833; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1834; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1835; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1836; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1837; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1838; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1839; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1840; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
1841; GFX7-NEXT:    s_mov_b32 s2, -1
1842; GFX7-NEXT:    s_waitcnt vmcnt(1)
1843; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 16
1844; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
1845; GFX7-NEXT:    s_waitcnt vmcnt(0)
1846; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 16
1847; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
1848; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1849; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, s4
1850; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v1, v0
1851; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
1852; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1853; GFX7-NEXT:    s_endpgm
1854;
1855; GFX8-LABEL: idot2_MultipleUses_add1:
1856; GFX8:       ; %bb.0: ; %entry
1857; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1858; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1859; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1860; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1861; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1862; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1863; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1864; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1865; GFX8-NEXT:    v_mov_b32_e32 v1, s7
1866; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
1867; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1868; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1869; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
1870; GFX8-NEXT:    s_waitcnt vmcnt(1)
1871; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
1872; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
1873; GFX8-NEXT:    s_waitcnt vmcnt(0)
1874; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 16
1875; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
1876; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1877; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v3, s2
1878; GFX8-NEXT:    v_mad_i32_i24 v1, v2, v1, v0
1879; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v1, v0
1880; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1881; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1882; GFX8-NEXT:    flat_store_dword v[0:1], v2
1883; GFX8-NEXT:    s_endpgm
1884;
1885; GFX9-NODL-LABEL: idot2_MultipleUses_add1:
1886; GFX9-NODL:       ; %bb.0: ; %entry
1887; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1888; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1889; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1890; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1891; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
1892; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
1893; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
1894; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1895; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1896; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1897; GFX9-NODL-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
1898; GFX9-NODL-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
1899; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1900; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, v2, v1, s0
1901; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v3, v1
1902; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
1903; GFX9-NODL-NEXT:    s_endpgm
1904;
1905; GFX9-DL-LABEL: idot2_MultipleUses_add1:
1906; GFX9-DL:       ; %bb.0: ; %entry
1907; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1908; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1909; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1910; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1911; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1912; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1913; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
1914; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1915; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1916; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1917; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
1918; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
1919; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1920; GFX9-DL-NEXT:    v_mad_i32_i24 v1, v2, v1, s0
1921; GFX9-DL-NEXT:    v_add3_u32 v1, v1, v3, v1
1922; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
1923; GFX9-DL-NEXT:    s_endpgm
1924;
1925; GFX10-DL-LABEL: idot2_MultipleUses_add1:
1926; GFX10-DL:       ; %bb.0: ; %entry
1927; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1928; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1929; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1930; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1931; GFX10-DL-NEXT:    s_clause 0x1
1932; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1933; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1934; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
1935; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1936; GFX10-DL-NEXT:    v_ashrrev_i32_e32 v0, 16, v1
1937; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1938; GFX10-DL-NEXT:    v_ashrrev_i32_e32 v3, 16, v2
1939; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1940; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
1941; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1942; GFX10-DL-NEXT:    v_mad_i32_i24 v0, v3, v0, s2
1943; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v1, v0
1944; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
1945; GFX10-DL-NEXT:    s_endpgm
1946                                                   <2 x i16> addrspace(1)* %src2,
1947                                                   i32 addrspace(1)* nocapture %dst) {
1948entry:
  ; Per-work-item loads of one <2 x i16> from each source buffer.
1949  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1950  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
1951  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
1952  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
1953  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
1954
  ; Lane-0 product, sign-extended operands.
1955  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1956  %conv = sext i16 %s1.elt1 to i32
1957  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
1958  %conv2 = sext i16 %s2.elt1 to i32
1959  %mul1 = mul i32 %conv2, %conv
1960
  ; Lane-1 product, sign-extended operands.
1961  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1962  %conv3 = sext i16 %s1.elt2 to i32
1963  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
1964  %conv4 = sext i16 %s2.elt2 to i32
1965  %mul2 = mul i32 %conv4, %conv3
1966
  ; Dot-product style accumulation onto the value read from %dst.
1967  %s3 = load i32, i32 addrspace(1)* %dst, align 4
1968  %add1 = add i32 %mul2, %s3
1969  %add2 = add i32 %add1, %mul1
1970
  ; Second use of %add1, outside the mul/add chain that forms %add2.
1971  %res = add i32 %add2, %add1
1972  store i32 %res, i32 addrspace(1)* %dst, align 4
1973  ret void
1974}
1975
1976define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1,
1977; GFX7-LABEL: udot2_MultipleUses_mul1:
1978; GFX7:       ; %bb.0: ; %entry
1979; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1980; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1981; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1982; GFX7-NEXT:    s_mov_b32 s10, 0
1983; GFX7-NEXT:    s_mov_b32 s11, s3
1984; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1985; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1986; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1987; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1988; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1989; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1990; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1991; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
1992; GFX7-NEXT:    s_mov_b32 s2, -1
1993; GFX7-NEXT:    s_waitcnt vmcnt(1)
1994; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1995; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1996; GFX7-NEXT:    s_waitcnt vmcnt(0)
1997; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
1998; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1999; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2000; GFX7-NEXT:    v_mad_u32_u24 v4, v0, v2, s4
2001; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, v4
2002; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
2003; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2004; GFX7-NEXT:    s_endpgm
2005;
2006; GFX8-LABEL: udot2_MultipleUses_mul1:
2007; GFX8:       ; %bb.0: ; %entry
2008; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2009; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2010; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2011; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2012; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2013; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2014; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2015; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2016; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2017; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
2018; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2019; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2020; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
2021; GFX8-NEXT:    s_waitcnt vmcnt(1)
2022; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v3
2023; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
2024; GFX8-NEXT:    s_waitcnt vmcnt(0)
2025; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v0
2026; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2027; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2028; GFX8-NEXT:    v_mad_u32_u24 v4, v2, v1, s2
2029; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, v4
2030; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v1, v0
2031; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2032; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2033; GFX8-NEXT:    flat_store_dword v[0:1], v2
2034; GFX8-NEXT:    s_endpgm
2035;
2036; GFX9-NODL-LABEL: udot2_MultipleUses_mul1:
2037; GFX9-NODL:       ; %bb.0: ; %entry
2038; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2039; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2040; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2041; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2042; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
2043; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
2044; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
2045; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2046; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
2047; GFX9-NODL-NEXT:    v_and_b32_e32 v3, 0xffff, v1
2048; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2049; GFX9-NODL-NEXT:    v_and_b32_e32 v4, 0xffff, v2
2050; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2051; GFX9-NODL-NEXT:    v_mul_u32_u24_e32 v2, v4, v3
2052; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2053; GFX9-NODL-NEXT:    v_mad_u32_u24 v3, v4, v3, s0
2054; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v3, v2
2055; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
2056; GFX9-NODL-NEXT:    s_endpgm
2057;
2058; GFX9-DL-LABEL: udot2_MultipleUses_mul1:
2059; GFX9-DL:       ; %bb.0: ; %entry
2060; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2061; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2062; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2063; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2064; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2065; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2066; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
2067; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2068; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2069; GFX9-DL-NEXT:    v_and_b32_e32 v3, 0xffff, v1
2070; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2071; GFX9-DL-NEXT:    v_and_b32_e32 v4, 0xffff, v2
2072; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2073; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v2, v4, v3
2074; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2075; GFX9-DL-NEXT:    v_mad_u32_u24 v3, v4, v3, s0
2076; GFX9-DL-NEXT:    v_add3_u32 v1, v1, v3, v2
2077; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
2078; GFX9-DL-NEXT:    s_endpgm
2079;
2080; GFX10-DL-LABEL: udot2_MultipleUses_mul1:
2081; GFX10-DL:       ; %bb.0: ; %entry
2082; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2083; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2084; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2085; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2086; GFX10-DL-NEXT:    s_clause 0x1
2087; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2088; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2089; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
2090; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2091; GFX10-DL-NEXT:    v_and_b32_e32 v0, 0xffff, v1
2092; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2093; GFX10-DL-NEXT:    v_and_b32_e32 v3, 0xffff, v2
2094; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2095; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v2, v3, v0
2096; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2097; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v3, v0, s2
2098; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
2099; GFX10-DL-NEXT:    v_add3_u32 v0, v1, v0, v2
2100; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
2101; GFX10-DL-NEXT:    s_endpgm
2102                                                   <2 x i16> addrspace(1)* %src2,
2103                                                   i32 addrspace(1)* nocapture %dst) {
2104entry:
2105  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2106  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
2107  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
2108  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
2109  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
2110
2111  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2112  %conv = zext i16 %s1.elt1 to i32
2113  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2114  %conv2 = zext i16 %s2.elt1 to i32
2115  %mul1 = mul i32 %conv2, %conv
2116
2117  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2118  %conv3 = zext i16 %s1.elt2 to i32
2119  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2120  %conv4 = zext i16 %s2.elt2 to i32
2121  %mul2 = mul i32 %conv4, %conv3
2122
2123  %s3 = load i32, i32 addrspace(1)* %dst, align 4
2124  %add0 = add i32 %mul1, %s3
2125
2126  %add1 = add i32 %mul2, %add0
2127  %add2 = add i32 %add1, %mul1
2128
2129  store i32 %add2, i32 addrspace(1)* %dst, align 4
2130  ret void
2131}
2132
; idot2_MultipleUses_mul1: signed 2 x i16 multiply-accumulate into an i32 in
; which the lane-0 product (%mul1) has two uses - it feeds %add0 and is added
; again in %add2. The expected output for every run line is a chain of separate
; v_mad_i32_i24 / v_mul_i32_i24 instructions (plus v_add3_u32 on the gfx9 and
; gfx10 targets); no v_dot2 instruction appears in any of the checks.
2133define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1,
2134; GFX7-LABEL: idot2_MultipleUses_mul1:
2135; GFX7:       ; %bb.0: ; %entry
2136; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2137; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2138; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2139; GFX7-NEXT:    s_mov_b32 s10, 0
2140; GFX7-NEXT:    s_mov_b32 s11, s3
2141; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2142; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
2143; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2144; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2145; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2146; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
2147; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2148; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
2149; GFX7-NEXT:    s_mov_b32 s2, -1
2150; GFX7-NEXT:    s_waitcnt vmcnt(1)
2151; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 16
2152; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
2153; GFX7-NEXT:    s_waitcnt vmcnt(0)
2154; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 16
2155; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2156; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2157; GFX7-NEXT:    v_mad_i32_i24 v4, v3, v1, s4
2158; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, v4
2159; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v1, v0
2160; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2161; GFX7-NEXT:    s_endpgm
2162;
2163; GFX8-LABEL: idot2_MultipleUses_mul1:
2164; GFX8:       ; %bb.0: ; %entry
2165; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2166; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2167; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2168; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2169; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2170; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2171; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2172; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2173; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2174; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
2175; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2176; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2177; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
2178; GFX8-NEXT:    s_waitcnt vmcnt(1)
2179; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
2180; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
2181; GFX8-NEXT:    s_waitcnt vmcnt(0)
2182; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 16
2183; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2184; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2185; GFX8-NEXT:    v_mad_i32_i24 v4, v2, v1, s2
2186; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v3, v4
2187; GFX8-NEXT:    v_mad_i32_i24 v2, v2, v1, v0
2188; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2189; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2190; GFX8-NEXT:    flat_store_dword v[0:1], v2
2191; GFX8-NEXT:    s_endpgm
2192;
2193; GFX9-NODL-LABEL: idot2_MultipleUses_mul1:
2194; GFX9-NODL:       ; %bb.0: ; %entry
2195; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2196; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2197; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2198; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2199; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
2200; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
2201; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
2202; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2203; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
2204; GFX9-NODL-NEXT:    v_bfe_i32 v3, v1, 0, 16
2205; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2206; GFX9-NODL-NEXT:    v_bfe_i32 v4, v2, 0, 16
2207; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2208; GFX9-NODL-NEXT:    v_mul_i32_i24_e32 v2, v4, v3
2209; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2210; GFX9-NODL-NEXT:    v_mad_i32_i24 v3, v4, v3, s0
2211; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v3, v2
2212; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
2213; GFX9-NODL-NEXT:    s_endpgm
2214;
2215; GFX9-DL-LABEL: idot2_MultipleUses_mul1:
2216; GFX9-DL:       ; %bb.0: ; %entry
2217; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2218; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2219; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2220; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2221; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2222; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2223; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
2224; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2225; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2226; GFX9-DL-NEXT:    v_bfe_i32 v3, v1, 0, 16
2227; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2228; GFX9-DL-NEXT:    v_bfe_i32 v4, v2, 0, 16
2229; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2230; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v2, v4, v3
2231; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2232; GFX9-DL-NEXT:    v_mad_i32_i24 v3, v4, v3, s0
2233; GFX9-DL-NEXT:    v_add3_u32 v1, v1, v3, v2
2234; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
2235; GFX9-DL-NEXT:    s_endpgm
2236;
2237; GFX10-DL-LABEL: idot2_MultipleUses_mul1:
2238; GFX10-DL:       ; %bb.0: ; %entry
2239; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2240; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2241; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2242; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2243; GFX10-DL-NEXT:    s_clause 0x1
2244; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2245; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2246; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
2247; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2248; GFX10-DL-NEXT:    v_bfe_i32 v0, v1, 0, 16
2249; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2250; GFX10-DL-NEXT:    v_bfe_i32 v3, v2, 0, 16
2251; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2252; GFX10-DL-NEXT:    v_mul_i32_i24_e32 v2, v3, v0
2253; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2254; GFX10-DL-NEXT:    v_mad_i32_i24 v0, v3, v0, s2
2255; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
2256; GFX10-DL-NEXT:    v_add3_u32 v0, v1, v0, v2
2257; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
2258; GFX10-DL-NEXT:    s_endpgm
2259                                                   <2 x i16> addrspace(1)* %src2,
2260                                                   i32 addrspace(1)* nocapture %dst) {
2261entry:
  ; Each work-item loads one <2 x i16> element from %src1 and one from %src2,
  ; both indexed by its workitem id.
2262  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2263  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
2264  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
2265  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
2266  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
2267
  ; Lane 0: sign-extend both operands to i32 and multiply.
2268  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2269  %conv = sext i16 %s1.elt1 to i32
2270  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2271  %conv2 = sext i16 %s2.elt1 to i32
2272  %mul1 = mul i32 %conv2, %conv
2273
  ; Lane 1: sign-extend both operands to i32 and multiply.
2274  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2275  %conv3 = sext i16 %s1.elt2 to i32
2276  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2277  %conv4 = sext i16 %s2.elt2 to i32
2278  %mul2 = mul i32 %conv4, %conv3
2279
  ; The i32 accumulator is read from %dst; this is the first use of %mul1.
2280  %s3 = load i32, i32 addrspace(1)* %dst, align 4
2281  %add0 = add i32 %mul1, %s3
2282
  ; %add2 adds %mul1 a second time, giving %mul1 two uses.
  ; NOTE(review): presumably this extra use is what keeps the dot2 pattern
  ; from being formed - confirm against the backend combine.
2283  %add1 = add i32 %mul2, %add0
2284  %add2 = add i32 %add1, %mul1
2285
  ; The result overwrites the accumulator in %dst.
2286  store i32 %add2, i32 addrspace(1)* %dst, align 4
2287  ret void
2288}
2289
; udot2_MultipleUses_mul2: unsigned 2 x i16 multiply-accumulate into an i32 in
; which the lane-1 product (%mul2) has two uses - it feeds both %add0 and
; %add1. The expected output for every run line is a chain of separate
; v_mad_u32_u24 / v_mul_u32_u24 instructions (plus v_add3_u32 on the gfx9 and
; gfx10 targets); no v_dot2 instruction appears in any of the checks.
2290define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1,
2291; GFX7-LABEL: udot2_MultipleUses_mul2:
2292; GFX7:       ; %bb.0: ; %entry
2293; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2294; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2295; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2296; GFX7-NEXT:    s_mov_b32 s10, 0
2297; GFX7-NEXT:    s_mov_b32 s11, s3
2298; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2299; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
2300; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2301; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2302; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2303; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
2304; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2305; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
2306; GFX7-NEXT:    s_mov_b32 s2, -1
2307; GFX7-NEXT:    s_waitcnt vmcnt(1)
2308; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
2309; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2310; GFX7-NEXT:    s_waitcnt vmcnt(0)
2311; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
2312; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2313; GFX7-NEXT:    v_mad_u32_u24 v4, v3, v1, s4
2314; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2315; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, v4
2316; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
2317; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2318; GFX7-NEXT:    s_endpgm
2319;
2320; GFX8-LABEL: udot2_MultipleUses_mul2:
2321; GFX8:       ; %bb.0: ; %entry
2322; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2323; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2324; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2325; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2326; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2327; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2328; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2329; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2330; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2331; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
2332; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2333; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2334; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
2335; GFX8-NEXT:    s_waitcnt vmcnt(1)
2336; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v3
2337; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
2338; GFX8-NEXT:    s_waitcnt vmcnt(0)
2339; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v0
2340; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2341; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2342; GFX8-NEXT:    v_mad_u32_u24 v4, v0, v3, s2
2343; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, v4
2344; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v1, v0
2345; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2346; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2347; GFX8-NEXT:    flat_store_dword v[0:1], v2
2348; GFX8-NEXT:    s_endpgm
2349;
2350; GFX9-NODL-LABEL: udot2_MultipleUses_mul2:
2351; GFX9-NODL:       ; %bb.0: ; %entry
2352; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2353; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2354; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2355; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2356; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
2357; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
2358; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
2359; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2360; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2361; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2362; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2363; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
2364; GFX9-NODL-NEXT:    v_mul_u32_u24_e32 v4, v2, v1
2365; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2366; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, v2, v1, s0
2367; GFX9-NODL-NEXT:    v_add3_u32 v1, v4, v1, v3
2368; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
2369; GFX9-NODL-NEXT:    s_endpgm
2370;
2371; GFX9-DL-LABEL: udot2_MultipleUses_mul2:
2372; GFX9-DL:       ; %bb.0: ; %entry
2373; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2374; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2375; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2376; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2377; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2378; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2379; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
2380; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2381; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2382; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2383; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2384; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
2385; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v4, v2, v1
2386; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2387; GFX9-DL-NEXT:    v_mad_u32_u24 v1, v2, v1, s0
2388; GFX9-DL-NEXT:    v_add3_u32 v1, v4, v1, v3
2389; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
2390; GFX9-DL-NEXT:    s_endpgm
2391;
2392; GFX10-DL-LABEL: udot2_MultipleUses_mul2:
2393; GFX10-DL:       ; %bb.0: ; %entry
2394; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2395; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2396; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2397; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2398; GFX10-DL-NEXT:    s_clause 0x1
2399; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2400; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2401; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
2402; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2403; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
2404; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2405; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
2406; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2407; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v2, v3, v0
2408; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2409; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v3, v0, s2
2410; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
2411; GFX10-DL-NEXT:    v_add3_u32 v0, v2, v0, v1
2412; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
2413; GFX10-DL-NEXT:    s_endpgm
2414                                                   <2 x i16> addrspace(1)* %src2,
2415                                                   i32 addrspace(1)* nocapture %dst) {
2416entry:
  ; Each work-item loads one <2 x i16> element from %src1 and one from %src2,
  ; both indexed by its workitem id.
2417  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2418  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
2419  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
2420  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
2421  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
2422
  ; Lane 0: zero-extend both operands to i32 and multiply.
2423  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2424  %conv = zext i16 %s1.elt1 to i32
2425  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2426  %conv2 = zext i16 %s2.elt1 to i32
2427  %mul1 = mul i32 %conv2, %conv
2428
  ; Lane 1: zero-extend both operands to i32 and multiply.
2429  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2430  %conv3 = zext i16 %s1.elt2 to i32
2431  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2432  %conv4 = zext i16 %s2.elt2 to i32
2433  %mul2 = mul i32 %conv4, %conv3
2434
  ; The i32 accumulator is read from %dst; this is the first use of %mul2.
2435  %s3 = load i32, i32 addrspace(1)* %dst, align 4
2436  %add0 = add i32 %mul2, %s3
2437
  ; %add1 adds %mul2 a second time, giving %mul2 two uses; %mul1 is used once.
  ; NOTE(review): presumably this extra use is what keeps the dot2 pattern
  ; from being formed - confirm against the backend combine.
2438  %add1 = add i32 %mul2, %add0
2439  %add2 = add i32 %add1, %mul1
2440
  ; The result overwrites the accumulator in %dst.
2441  store i32 %add2, i32 addrspace(1)* %dst, align 4
2442  ret void
2443}
2444
; idot2_MultipleUses_mul2: signed 2 x i16 multiply-accumulate into an i32 in
; which the lane-1 product (%mul2) has two uses - it feeds both %add0 and
; %add1. The expected output for every run line is a chain of separate
; v_mad_i32_i24 / v_mul_i32_i24 instructions (plus v_add3_u32 on the gfx9 and
; gfx10 targets); no v_dot2 instruction appears in any of the checks.
2445define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1,
2446; GFX7-LABEL: idot2_MultipleUses_mul2:
2447; GFX7:       ; %bb.0: ; %entry
2448; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2449; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2450; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2451; GFX7-NEXT:    s_mov_b32 s10, 0
2452; GFX7-NEXT:    s_mov_b32 s11, s3
2453; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2454; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
2455; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2456; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2457; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2458; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
2459; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2460; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
2461; GFX7-NEXT:    s_mov_b32 s2, -1
2462; GFX7-NEXT:    s_waitcnt vmcnt(1)
2463; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 16
2464; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
2465; GFX7-NEXT:    s_waitcnt vmcnt(0)
2466; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 16
2467; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2468; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2469; GFX7-NEXT:    v_mad_i32_i24 v4, v0, v2, s4
2470; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, v4
2471; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v1, v0
2472; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2473; GFX7-NEXT:    s_endpgm
2474;
2475; GFX8-LABEL: idot2_MultipleUses_mul2:
2476; GFX8:       ; %bb.0: ; %entry
2477; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2478; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2479; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2480; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2481; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2482; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2483; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2484; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2485; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2486; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
2487; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2488; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2489; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
2490; GFX8-NEXT:    s_waitcnt vmcnt(1)
2491; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
2492; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
2493; GFX8-NEXT:    s_waitcnt vmcnt(0)
2494; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 16
2495; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2496; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2497; GFX8-NEXT:    v_mad_i32_i24 v4, v0, v3, s2
2498; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v3, v4
2499; GFX8-NEXT:    v_mad_i32_i24 v2, v2, v1, v0
2500; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2501; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2502; GFX8-NEXT:    flat_store_dword v[0:1], v2
2503; GFX8-NEXT:    s_endpgm
2504;
2505; GFX9-NODL-LABEL: idot2_MultipleUses_mul2:
2506; GFX9-NODL:       ; %bb.0: ; %entry
2507; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2508; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2509; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2510; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2511; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
2512; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
2513; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
2514; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2515; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2516; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2517; GFX9-NODL-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
2518; GFX9-NODL-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
2519; GFX9-NODL-NEXT:    v_mul_i32_i24_e32 v4, v2, v1
2520; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2521; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, v2, v1, s0
2522; GFX9-NODL-NEXT:    v_add3_u32 v1, v4, v1, v3
2523; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
2524; GFX9-NODL-NEXT:    s_endpgm
2525;
2526; GFX9-DL-LABEL: idot2_MultipleUses_mul2:
2527; GFX9-DL:       ; %bb.0: ; %entry
2528; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2529; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2530; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2531; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2532; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2533; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2534; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
2535; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2536; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2537; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2538; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
2539; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
2540; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v4, v2, v1
2541; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2542; GFX9-DL-NEXT:    v_mad_i32_i24 v1, v2, v1, s0
2543; GFX9-DL-NEXT:    v_add3_u32 v1, v4, v1, v3
2544; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
2545; GFX9-DL-NEXT:    s_endpgm
2546;
2547; GFX10-DL-LABEL: idot2_MultipleUses_mul2:
2548; GFX10-DL:       ; %bb.0: ; %entry
2549; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2550; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2551; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2552; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2553; GFX10-DL-NEXT:    s_clause 0x1
2554; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2555; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2556; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
2557; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2558; GFX10-DL-NEXT:    v_ashrrev_i32_e32 v0, 16, v1
2559; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2560; GFX10-DL-NEXT:    v_ashrrev_i32_e32 v3, 16, v2
2561; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2562; GFX10-DL-NEXT:    v_mul_i32_i24_e32 v2, v3, v0
2563; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2564; GFX10-DL-NEXT:    v_mad_i32_i24 v0, v3, v0, s2
2565; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
2566; GFX10-DL-NEXT:    v_add3_u32 v0, v2, v0, v1
2567; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
2568; GFX10-DL-NEXT:    s_endpgm
2569                                                   <2 x i16> addrspace(1)* %src2,
2570                                                   i32 addrspace(1)* nocapture %dst) {
2571entry:
  ; Each work-item loads one <2 x i16> element from %src1 and one from %src2,
  ; both indexed by its workitem id.
2572  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2573  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
2574  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
2575  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
2576  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
2577
  ; Lane 0: sign-extend both operands to i32 and multiply.
2578  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2579  %conv = sext i16 %s1.elt1 to i32
2580  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2581  %conv2 = sext i16 %s2.elt1 to i32
2582  %mul1 = mul i32 %conv2, %conv
2583
  ; Lane 1: sign-extend both operands to i32 and multiply.
2584  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2585  %conv3 = sext i16 %s1.elt2 to i32
2586  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2587  %conv4 = sext i16 %s2.elt2 to i32
2588  %mul2 = mul i32 %conv4, %conv3
2589
  ; The i32 accumulator is read from %dst; this is the first use of %mul2.
2590  %s3 = load i32, i32 addrspace(1)* %dst, align 4
2591  %add0 = add i32 %mul2, %s3
2592
  ; %add1 adds %mul2 a second time, giving %mul2 two uses; %mul1 is used once.
  ; NOTE(review): presumably this extra use is what keeps the dot2 pattern
  ; from being formed - confirm against the backend combine.
2593  %add1 = add i32 %mul2, %add0
2594  %add2 = add i32 %add1, %mul1
2595
  ; The result overwrites the accumulator in %dst.
2596  store i32 %add2, i32 addrspace(1)* %dst, align 4
2597  ret void
2598}
2599
; udot2_acc16: 2 x i16 multiply-accumulate where the products and the
; accumulator all stay i16 (no extension to i32 in the IR). The expected output
; uses v_mad_u32_u24 with a 16-bit store on gfx7, v_mad_u16 on gfx8 and gfx10,
; and v_mad_legacy_u16 on both gfx9 targets; no v_dot2 instruction appears in
; any of the checks.
2600define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
2601; GFX7-LABEL: udot2_acc16:
2602; GFX7:       ; %bb.0: ; %entry
2603; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2604; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2605; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2606; GFX7-NEXT:    s_mov_b32 s10, 0
2607; GFX7-NEXT:    s_mov_b32 s11, s3
2608; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2609; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
2610; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2611; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2612; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2613; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
2614; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2615; GFX7-NEXT:    s_mov_b32 s2, -1
2616; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
2617; GFX7-NEXT:    s_waitcnt vmcnt(2)
2618; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
2619; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
2620; GFX7-NEXT:    s_waitcnt vmcnt(1)
2621; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
2622; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2623; GFX7-NEXT:    s_waitcnt vmcnt(0)
2624; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
2625; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
2626; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
2627; GFX7-NEXT:    s_endpgm
2628;
2629; GFX8-LABEL: udot2_acc16:
2630; GFX8:       ; %bb.0: ; %entry
2631; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2632; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2633; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2634; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2635; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2636; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2637; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2638; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2639; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2640; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
2641; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2642; GFX8-NEXT:    flat_load_dword v2, v[0:1]
2643; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2644; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2645; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
2646; GFX8-NEXT:    s_waitcnt vmcnt(2)
2647; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
2648; GFX8-NEXT:    s_waitcnt vmcnt(1)
2649; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
2650; GFX8-NEXT:    s_waitcnt vmcnt(0)
2651; GFX8-NEXT:    v_mad_u16 v4, v5, v6, v4
2652; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
2653; GFX8-NEXT:    flat_store_short v[0:1], v2
2654; GFX8-NEXT:    s_endpgm
2655;
2656; GFX9-NODL-LABEL: udot2_acc16:
2657; GFX9-NODL:       ; %bb.0: ; %entry
2658; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2659; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2660; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2661; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, 0
2662; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2663; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[4:5]
2664; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[6:7]
2665; GFX9-NODL-NEXT:    global_load_ushort v4, v1, s[2:3]
2666; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
2667; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
2668; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
2669; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
2670; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2671; GFX9-NODL-NEXT:    v_mad_legacy_u16 v0, v0, v5, v4
2672; GFX9-NODL-NEXT:    v_mad_legacy_u16 v0, v2, v3, v0
2673; GFX9-NODL-NEXT:    global_store_short v1, v0, s[2:3]
2674; GFX9-NODL-NEXT:    s_endpgm
2675;
2676; GFX9-DL-LABEL: udot2_acc16:
2677; GFX9-DL:       ; %bb.0: ; %entry
2678; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2679; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2680; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2681; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
2682; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2683; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
2684; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
2685; GFX9-DL-NEXT:    global_load_ushort v4, v1, s[2:3]
2686; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
2687; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
2688; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2689; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
2690; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2691; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v0, v5, v4
2692; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v2, v3, v0
2693; GFX9-DL-NEXT:    global_store_short v1, v0, s[2:3]
2694; GFX9-DL-NEXT:    s_endpgm
2695;
2696; GFX10-DL-LABEL: udot2_acc16:
2697; GFX10-DL:       ; %bb.0: ; %entry
2698; GFX10-DL-NEXT:    s_clause 0x1
2699; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2700; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2701; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2702; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
2703; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2704; GFX10-DL-NEXT:    s_clause 0x1
2705; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
2706; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
2707; GFX10-DL-NEXT:    global_load_ushort v4, v1, s[2:3]
2708; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
2709; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
2710; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2711; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
2712; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2713; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
2714; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
2715; GFX10-DL-NEXT:    global_store_short v1, v0, s[2:3]
2716; GFX10-DL-NEXT:    s_endpgm
2717                                       <2 x i16> addrspace(1)* %src2,
2718                                       i16 addrspace(1)* nocapture %dst) {
2719entry:
  ; Each work-item loads one <2 x i16> element from %src1 and one from %src2,
  ; both indexed by its workitem id.
2720  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2721  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
2722  %v1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
2723  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
2724  %v2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
2725
  ; Lane 0 product, computed directly in i16 (no sext/zext).
2726  %v1e1 = extractelement <2 x i16> %v1, i64 0
2727  %v2e1 = extractelement <2 x i16> %v2, i64 0
2728  %mul1 = mul i16 %v1e1, %v2e1
2729
  ; Lane 1 product, computed directly in i16 (no sext/zext).
2730  %v1e2 = extractelement <2 x i16> %v1, i64 1
2731  %v2e2 = extractelement <2 x i16> %v2, i64 1
2732  %mul2 = mul i16 %v1e2, %v2e2
2733
  ; The i16 accumulator is read from %dst, both products are added to it,
  ; and the i16 sum is stored back to %dst.
2734  %s2 = load i16, i16 addrspace(1)* %dst, align 2
2735  %add1 = add i16 %mul2, %s2
2736  %add2 = add i16 %add1, %mul1
2737  store i16 %add2, i16 addrspace(1)* %dst, align 2
2738  ret void
2739}
2740
; Negative test for dot2 selection with 8-bit sources.
;
; The kernel computes  dst = (src1[idx][1] * src2[idx][1] + dst) + src1[idx][0] * src2[idx][0]
; where the vector elements are i8 values sign-extended to i32 before the
; multiplies. None of the expected outputs recorded below contains a v_dot2
; instruction; each one uses 24-bit multiplies instead (v_mad_i32_i24, or
; v_mul_i32_i24_sdwa followed by v_add3_u32).
;
; Arguments:
;   %src1, %src2 - addrspace(1) pointers to <2 x i8> arrays, indexed by %idx.
;   %dst         - addrspace(1) pointer to the i32 accumulator; it is loaded
;                  as the initial sum and then overwritten with the result.
2741define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
2742; GFX7-LABEL: notsdot2_sext8:
2743; GFX7:       ; %bb.0: ; %entry
2744; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2745; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2746; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2747; GFX7-NEXT:    s_mov_b32 s10, 0
2748; GFX7-NEXT:    s_mov_b32 s11, s3
2749; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2750; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
2751; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2752; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2753; GFX7-NEXT:    buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64
2754; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
2755; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
2756; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
2757; GFX7-NEXT:    s_mov_b32 s2, -1
2758; GFX7-NEXT:    s_waitcnt vmcnt(1)
2759; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
2760; GFX7-NEXT:    v_bfe_i32 v2, v2, 8, 8
2761; GFX7-NEXT:    s_waitcnt vmcnt(0)
2762; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 8
2763; GFX7-NEXT:    v_bfe_i32 v0, v0, 8, 8
2764; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2765; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, s4
2766; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v1, v0
2767; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2768; GFX7-NEXT:    s_endpgm
2769;
2770; GFX8-LABEL: notsdot2_sext8:
2771; GFX8:       ; %bb.0: ; %entry
2772; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2773; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2774; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
2775; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2776; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2777; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2778; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2779; GFX8-NEXT:    flat_load_ushort v3, v[0:1]
2780; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2781; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
2782; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2783; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
2784; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
2785; GFX8-NEXT:    s_waitcnt vmcnt(1)
2786; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
2787; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 8, v3
2788; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 8
2789; GFX8-NEXT:    s_waitcnt vmcnt(0)
2790; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 8
2791; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
2792; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 8
2793; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2794; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v3, s2
2795; GFX8-NEXT:    v_mad_i32_i24 v2, v2, v1, v0
2796; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2797; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2798; GFX8-NEXT:    flat_store_dword v[0:1], v2
2799; GFX8-NEXT:    s_endpgm
2800;
2801; GFX9-NODL-LABEL: notsdot2_sext8:
2802; GFX9-NODL:       ; %bb.0: ; %entry
2803; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2804; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2805; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2806; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2807; GFX9-NODL-NEXT:    global_load_ushort v1, v0, s[4:5]
2808; GFX9-NODL-NEXT:    global_load_ushort v2, v0, s[6:7]
2809; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
2810; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2811; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2812; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2813; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
2814; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v2, 8, v2
2815; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2816; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2817; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
2818; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
2819; GFX9-NODL-NEXT:    s_endpgm
2820;
2821; GFX9-DL-LABEL: notsdot2_sext8:
2822; GFX9-DL:       ; %bb.0: ; %entry
2823; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2824; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2825; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2826; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2827; GFX9-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
2828; GFX9-DL-NEXT:    global_load_ushort v2, v0, s[6:7]
2829; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
2830; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2831; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2832; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2833; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
2834; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v2, 8, v2
2835; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2836; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2837; GFX9-DL-NEXT:    v_add3_u32 v1, v1, s0, v3
2838; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
2839; GFX9-DL-NEXT:    s_endpgm
2840;
2841; GFX10-DL-LABEL: notsdot2_sext8:
2842; GFX10-DL:       ; %bb.0: ; %entry
2843; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2844; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2845; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2846; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2847; GFX10-DL-NEXT:    s_clause 0x1
2848; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
2849; GFX10-DL-NEXT:    global_load_ushort v2, v0, s[6:7]
2850; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
2851; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2852; GFX10-DL-NEXT:    v_lshrrev_b16 v0, 8, v1
2853; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2854; GFX10-DL-NEXT:    v_lshrrev_b16 v3, 8, v2
2855; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2856; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
2857; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v0, sext(v3), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2858; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2859; GFX10-DL-NEXT:    v_add3_u32 v0, v0, s2, v1
2860; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
2861; GFX10-DL-NEXT:    s_endpgm
2862                                          <2 x i8> addrspace(1)* %src2,
2863                                          i32 addrspace(1)* nocapture %dst) {
2864entry:
  ; %idx selects which <2 x i8> element of each source array is loaded.
2865  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2866  %gep1 = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %src1, i32 %idx
2867  %vec1 = load <2 x i8>, <2 x i8> addrspace(1)* %gep1
2868  %gep2 = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %src2, i32 %idx
2869  %vec2 = load <2 x i8>, <2 x i8> addrspace(1)* %gep2
2870
  ; Product of element 0 of both vectors, each sign-extended from i8 to i32.
  ; NOTE(review): the multiply carries 'nuw' although its operands are
  ; sign-extended; the intent is not visible here. Confirm before changing,
  ; since the expected output above was generated from this exact IR.
2871  %s1.elt1 = extractelement <2 x i8> %vec1, i64 0
2872  %conv = sext i8 %s1.elt1 to i32
2873  %s2.elt1 = extractelement <2 x i8> %vec2, i64 0
2874  %conv2 = sext i8 %s2.elt1 to i32
2875  %mul1 = mul nuw i32 %conv2, %conv
2876
  ; Product of element 1 of both vectors, formed the same way as %mul1.
2877  %s1.elt2 = extractelement <2 x i8> %vec1, i64 1
2878  %conv3 = sext i8 %s1.elt2 to i32
2879  %s2.elt2 = extractelement <2 x i8> %vec2, i64 1
2880  %conv4 = sext i8 %s2.elt2 to i32
2881  %mul2 = mul nuw i32 %conv4, %conv3
2882
  ; Accumulate: the value currently in %dst is added to %mul2 first, then
  ; %mul1 is added, and the sum is stored back to %dst.
2883  %s3 = load i32, i32 addrspace(1)* %dst, align 4
2884  %add = add i32 %mul2, %s3
2885  %add6 = add i32 %add, %mul1
2886  store i32 %add6, i32 addrspace(1)* %dst, align 4
2887  ret void
2888}
2889
; Intrinsic called at the top of the kernels above; its i32 result is used as
; the index into the source arrays. Presumably the x component of the
; work-item id, going by the intrinsic's name - confirm against the AMDGPU
; backend documentation.
2890declare i32 @llvm.amdgcn.workitem.id.x()
2891