1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NODL %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-DL %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-DL %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-DL %s
8
9; add(mul(S1.x, S2.x),
10;     add (mul (S1.y, S2.y), S3)) -> v_dot2_{I|U}32_{I|U}16(S1, S2, S3)
11
; udot2 - unsigned two-lane i16 dot product with accumulate.
; Each work-item loads one <2 x i16> from %src1 and one from %src2 (indexed by
; workitem.id.x), zero-extends every lane to i32, multiplies lane-wise and adds
; both products to the i32 loaded from %dst, storing the sum back to %dst.
; The adds are ordered (mul2 + s3) + mul1. The gfx906 and gfx1011/gfx1012
; check blocks below expect a single v_dot2_u32_u16; the remaining check
; blocks expect separate multiply/mad/add instructions.
12define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1,
13; GFX7-LABEL: udot2:
14; GFX7:       ; %bb.0: ; %entry
15; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
16; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
17; GFX7-NEXT:    s_mov_b32 s3, 0xf000
18; GFX7-NEXT:    s_mov_b32 s10, 0
19; GFX7-NEXT:    s_mov_b32 s11, s3
20; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
21; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
22; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
23; GFX7-NEXT:    v_mov_b32_e32 v1, 0
24; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
25; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
26; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
27; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
28; GFX7-NEXT:    s_mov_b32 s4, 0xffff
29; GFX7-NEXT:    s_mov_b32 s2, -1
30; GFX7-NEXT:    s_waitcnt vmcnt(1)
31; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
32; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
33; GFX7-NEXT:    s_waitcnt vmcnt(0)
34; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
35; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
36; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
37; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, s5
38; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
39; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
40; GFX7-NEXT:    s_endpgm
41;
42; GFX8-LABEL: udot2:
43; GFX8:       ; %bb.0: ; %entry
44; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
45; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
46; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
47; GFX8-NEXT:    s_mov_b32 s2, 0xffff
48; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
49; GFX8-NEXT:    v_mov_b32_e32 v1, s5
50; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
51; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
52; GFX8-NEXT:    flat_load_dword v3, v[0:1]
53; GFX8-NEXT:    v_mov_b32_e32 v1, s7
54; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
55; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
56; GFX8-NEXT:    flat_load_dword v0, v[0:1]
57; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
58; GFX8-NEXT:    s_waitcnt vmcnt(1)
59; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
60; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
61; GFX8-NEXT:    s_waitcnt vmcnt(0)
62; GFX8-NEXT:    v_and_b32_e32 v2, s2, v0
63; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
64; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
65; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, s3
66; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v1, v0
67; GFX8-NEXT:    v_mov_b32_e32 v0, s0
68; GFX8-NEXT:    v_mov_b32_e32 v1, s1
69; GFX8-NEXT:    flat_store_dword v[0:1], v2
70; GFX8-NEXT:    s_endpgm
71;
72; GFX9-NODL-LABEL: udot2:
73; GFX9-NODL:       ; %bb.0: ; %entry
74; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
75; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
76; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
77; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
78; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
79; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
80; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
81; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
82; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
83; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
84; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
85; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
86; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
87; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
88; GFX9-NODL-NEXT:    s_endpgm
89;
90; GFX9-DL-LABEL: udot2:
91; GFX9-DL:       ; %bb.0: ; %entry
92; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
93; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
94; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
95; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
96; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
97; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
98; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
99; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
100; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
101; GFX9-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s0
102; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
103; GFX9-DL-NEXT:    s_endpgm
104;
105; GFX10-DL-LABEL: udot2:
106; GFX10-DL:       ; %bb.0: ; %entry
107; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
108; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
109; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
110; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
111; GFX10-DL-NEXT:    s_clause 0x1
112; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
113; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
114; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
115; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
116; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
117; GFX10-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s2
118; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
119; GFX10-DL-NEXT:    s_endpgm
120                                 <2 x i16> addrspace(1)* %src2,
121                                 i32 addrspace(1)* nocapture %dst) {
122entry:
  ; The work-item id selects which <2 x i16> element of each source is read.
123  %idx = call i32 @llvm.amdgcn.workitem.id.x()
124  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
125  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
126  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
127  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
128
  ; Lane 0 product: zero-extend element 0 of both vectors, then multiply.
129  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
130  %conv = zext i16 %s1.elt1 to i32
131  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
132  %conv2 = zext i16 %s2.elt1 to i32
133  %mul1 = mul nuw i32 %conv2, %conv
134
  ; Lane 1 product: zero-extend element 1 of both vectors, then multiply.
135  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
136  %conv3 = zext i16 %s1.elt2 to i32
137  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
138  %conv4 = zext i16 %s2.elt2 to i32
139  %mul2 = mul nuw i32 %conv4, %conv3
140
  ; Accumulate as (mul2 + *dst) + mul1 and write the result back to *dst.
141  %s3 = load i32, i32 addrspace(1)* %dst, align 4
142  %add = add i32 %mul2, %s3
143  %add6 = add i32 %add, %mul1
144  store i32 %add6, i32 addrspace(1)* %dst, align 4
145  ret void
146}
147
148; TODO: Support this pattern
149;      add(S3,
150;          add (mul (S1.x, S2.x), mul (S1.y, S2.y))) -> v_dot2_{I|U}32_{I|U}16(S1, S2, S3)
; udot2_MulMul - unsigned two-lane i16 dot product where the two products are
; added to each other first and the accumulator is added last:
; (mul2 + mul1) + s3. Loads, zero-extensions and the store to %dst are
; otherwise the same shape as a plain unsigned dot product.
; None of the check blocks below expects a v_dot2 instruction for this
; ordering; they all expect separate multiply and add instructions.
151define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1,
152; GFX7-LABEL: udot2_MulMul:
153; GFX7:       ; %bb.0: ; %entry
154; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
155; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
156; GFX7-NEXT:    s_mov_b32 s3, 0xf000
157; GFX7-NEXT:    s_mov_b32 s10, 0
158; GFX7-NEXT:    s_mov_b32 s11, s3
159; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
160; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
161; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
162; GFX7-NEXT:    v_mov_b32_e32 v1, 0
163; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
164; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
165; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
166; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
167; GFX7-NEXT:    s_mov_b32 s4, 0xffff
168; GFX7-NEXT:    s_mov_b32 s2, -1
169; GFX7-NEXT:    s_waitcnt vmcnt(1)
170; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
171; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
172; GFX7-NEXT:    s_waitcnt vmcnt(0)
173; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
174; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
175; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
176; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v1, v0
177; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
178; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s5, v0
179; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
180; GFX7-NEXT:    s_endpgm
181;
182; GFX8-LABEL: udot2_MulMul:
183; GFX8:       ; %bb.0: ; %entry
184; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
185; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
186; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
187; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
188; GFX8-NEXT:    v_mov_b32_e32 v1, s5
189; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
190; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
191; GFX8-NEXT:    flat_load_dword v3, v[0:1]
192; GFX8-NEXT:    v_mov_b32_e32 v1, s7
193; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
194; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
195; GFX8-NEXT:    flat_load_dword v0, v[0:1]
196; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
197; GFX8-NEXT:    s_waitcnt vmcnt(1)
198; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
199; GFX8-NEXT:    s_waitcnt vmcnt(0)
200; GFX8-NEXT:    v_mul_u32_u24_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
201; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
202; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
203; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
204; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v0
205; GFX8-NEXT:    v_mov_b32_e32 v0, s0
206; GFX8-NEXT:    v_mov_b32_e32 v1, s1
207; GFX8-NEXT:    flat_store_dword v[0:1], v2
208; GFX8-NEXT:    s_endpgm
209;
210; GFX9-NODL-LABEL: udot2_MulMul:
211; GFX9-NODL:       ; %bb.0: ; %entry
212; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
213; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
214; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
215; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
216; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
217; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
218; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
219; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
220; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
221; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
222; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
223; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
224; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v3, s0
225; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
226; GFX9-NODL-NEXT:    s_endpgm
227;
228; GFX9-DL-LABEL: udot2_MulMul:
229; GFX9-DL:       ; %bb.0: ; %entry
230; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
231; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
232; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
233; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
234; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
235; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
236; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
237; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
238; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
239; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
240; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
241; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
242; GFX9-DL-NEXT:    v_add3_u32 v1, v1, v3, s0
243; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
244; GFX9-DL-NEXT:    s_endpgm
245;
246; GFX10-DL-LABEL: udot2_MulMul:
247; GFX10-DL:       ; %bb.0: ; %entry
248; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
249; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
250; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
251; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
252; GFX10-DL-NEXT:    s_clause 0x1
253; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
254; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
255; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
256; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
257; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
258; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
259; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
260; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
261; GFX10-DL-NEXT:    v_add3_u32 v0, v1, v0, s2
262; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
263; GFX10-DL-NEXT:    s_endpgm
264                                        <2 x i16> addrspace(1)* %src2,
265                                        i32 addrspace(1)* nocapture %dst) {
266entry:
  ; The work-item id selects which <2 x i16> element of each source is read.
267  %idx = call i32 @llvm.amdgcn.workitem.id.x()
268  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
269  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
270  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
271  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
272
  ; Lane 0 product: zero-extend element 0 of both vectors, then multiply.
273  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
274  %conv = zext i16 %s1.elt1 to i32
275  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
276  %conv2 = zext i16 %s2.elt1 to i32
277  %mul1 = mul nuw i32 %conv2, %conv
278
  ; Lane 1 product: zero-extend element 1 of both vectors, then multiply.
279  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
280  %conv3 = zext i16 %s1.elt2 to i32
281  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
282  %conv4 = zext i16 %s2.elt2 to i32
283  %mul2 = mul nuw i32 %conv4, %conv3
  ; Accumulate as (mul2 + mul1) + *dst: the products are summed with each
  ; other before the loaded accumulator is added.
284  %s3 = load i32, i32 addrspace(1)* %dst, align 4
285  %add = add i32 %mul2, %mul1
286  %add6 = add i32 %add, %s3
287  store i32 %add6, i32 addrspace(1)* %dst, align 4
288  ret void
289}
290
; idot2 - signed two-lane i16 dot product with accumulate.
; Each work-item loads one <2 x i16> from %src1 and one from %src2 (indexed by
; workitem.id.x), sign-extends every lane to i32, multiplies lane-wise and adds
; both products to the i32 loaded from %dst, storing the sum back to %dst.
; The adds are ordered (mul2 + s3) + mul1. The gfx906 and gfx1011/gfx1012
; check blocks below expect a single v_dot2_i32_i16; the remaining check
; blocks expect separate signed multiply/mad/add instructions.
; NOTE(review): both multiplies carry the nuw flag although their operands are
; sign-extended - confirm whether nsw was intended.
291define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1,
292; GFX7-LABEL: idot2:
293; GFX7:       ; %bb.0: ; %entry
294; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
295; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
296; GFX7-NEXT:    s_mov_b32 s3, 0xf000
297; GFX7-NEXT:    s_mov_b32 s10, 0
298; GFX7-NEXT:    s_mov_b32 s11, s3
299; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
300; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
301; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
302; GFX7-NEXT:    v_mov_b32_e32 v1, 0
303; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
304; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
305; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
306; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
307; GFX7-NEXT:    s_mov_b32 s2, -1
308; GFX7-NEXT:    s_waitcnt vmcnt(1)
309; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 16
310; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
311; GFX7-NEXT:    s_waitcnt vmcnt(0)
312; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 16
313; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
314; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
315; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, s4
316; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v1, v0
317; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
318; GFX7-NEXT:    s_endpgm
319;
320; GFX8-LABEL: idot2:
321; GFX8:       ; %bb.0: ; %entry
322; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
323; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
324; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
325; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
326; GFX8-NEXT:    v_mov_b32_e32 v1, s5
327; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
328; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
329; GFX8-NEXT:    flat_load_dword v3, v[0:1]
330; GFX8-NEXT:    v_mov_b32_e32 v1, s7
331; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
332; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
333; GFX8-NEXT:    flat_load_dword v0, v[0:1]
334; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
335; GFX8-NEXT:    s_waitcnt vmcnt(1)
336; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
337; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
338; GFX8-NEXT:    s_waitcnt vmcnt(0)
339; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 16
340; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
341; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
342; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v3, s2
343; GFX8-NEXT:    v_mad_i32_i24 v2, v2, v1, v0
344; GFX8-NEXT:    v_mov_b32_e32 v0, s0
345; GFX8-NEXT:    v_mov_b32_e32 v1, s1
346; GFX8-NEXT:    flat_store_dword v[0:1], v2
347; GFX8-NEXT:    s_endpgm
348;
349; GFX9-NODL-LABEL: idot2:
350; GFX9-NODL:       ; %bb.0: ; %entry
351; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
352; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
353; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
354; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
355; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
356; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
357; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
358; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
359; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
360; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
361; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
362; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
363; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
364; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
365; GFX9-NODL-NEXT:    s_endpgm
366;
367; GFX9-DL-LABEL: idot2:
368; GFX9-DL:       ; %bb.0: ; %entry
369; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
370; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
371; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
372; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
373; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
374; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
375; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
376; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
377; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
378; GFX9-DL-NEXT:    v_dot2_i32_i16 v1, v2, v1, s0
379; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
380; GFX9-DL-NEXT:    s_endpgm
381;
382; GFX10-DL-LABEL: idot2:
383; GFX10-DL:       ; %bb.0: ; %entry
384; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
385; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
386; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
387; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
388; GFX10-DL-NEXT:    s_clause 0x1
389; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
390; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
391; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
392; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
393; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
394; GFX10-DL-NEXT:    v_dot2_i32_i16 v1, v2, v1, s2
395; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
396; GFX10-DL-NEXT:    s_endpgm
397                                 <2 x i16> addrspace(1)* %src2,
398                                 i32 addrspace(1)* nocapture %dst) {
399entry:
  ; The work-item id selects which <2 x i16> element of each source is read.
400  %idx = call i32 @llvm.amdgcn.workitem.id.x()
401  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
402  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
403  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
404  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
405
  ; Lane 0 product: sign-extend element 0 of both vectors, then multiply.
406  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
407  %conv = sext i16 %s1.elt1 to i32
408  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
409  %conv2 = sext i16 %s2.elt1 to i32
410  %mul1 = mul nuw i32 %conv2, %conv
411
  ; Lane 1 product: sign-extend element 1 of both vectors, then multiply.
412  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
413  %conv3 = sext i16 %s1.elt2 to i32
414  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
415  %conv4 = sext i16 %s2.elt2 to i32
416  %mul2 = mul nuw i32 %conv4, %conv3
417
  ; Accumulate as (mul2 + *dst) + mul1 and write the result back to *dst.
418  %s3 = load i32, i32 addrspace(1)* %dst, align 4
419  %add = add i32 %mul2, %s3
420  %add6 = add i32 %add, %mul1
421  store i32 %add6, i32 addrspace(1)* %dst, align 4
422  ret void
423}
424
; idot2_MixedTypedMul - two-lane i16 multiply-accumulate whose lanes use
; different extensions: lane 0 operands are sign-extended, lane 1 operands are
; zero-extended. The adds are ordered (mul2 + s3) + mul1 and the result is
; stored back to %dst.
; None of the check blocks below expects a v_dot2 instruction; they expect one
; signed and one unsigned 24-bit multiply (or mad) followed by an add.
425define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1,
426; GFX7-LABEL: idot2_MixedTypedMul:
427; GFX7:       ; %bb.0: ; %entry
428; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
429; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
430; GFX7-NEXT:    s_mov_b32 s3, 0xf000
431; GFX7-NEXT:    s_mov_b32 s10, 0
432; GFX7-NEXT:    s_mov_b32 s11, s3
433; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
434; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
435; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
436; GFX7-NEXT:    v_mov_b32_e32 v1, 0
437; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
438; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
439; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
440; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
441; GFX7-NEXT:    s_mov_b32 s2, -1
442; GFX7-NEXT:    s_waitcnt vmcnt(1)
443; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
444; GFX7-NEXT:    v_bfe_i32 v2, v2, 0, 16
445; GFX7-NEXT:    s_waitcnt vmcnt(0)
446; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
447; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
448; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
449; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, s4
450; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, v1
451; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
452; GFX7-NEXT:    s_endpgm
453;
454; GFX8-LABEL: idot2_MixedTypedMul:
455; GFX8:       ; %bb.0: ; %entry
456; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
457; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
458; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
459; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
460; GFX8-NEXT:    v_mov_b32_e32 v1, s5
461; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
462; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
463; GFX8-NEXT:    flat_load_dword v3, v[0:1]
464; GFX8-NEXT:    v_mov_b32_e32 v1, s7
465; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
466; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
467; GFX8-NEXT:    flat_load_dword v0, v[0:1]
468; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
469; GFX8-NEXT:    s_waitcnt vmcnt(1)
470; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
471; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
472; GFX8-NEXT:    s_waitcnt vmcnt(0)
473; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 16
474; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
475; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
476; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, s2
477; GFX8-NEXT:    v_mad_i32_i24 v2, v2, v1, v0
478; GFX8-NEXT:    v_mov_b32_e32 v0, s0
479; GFX8-NEXT:    v_mov_b32_e32 v1, s1
480; GFX8-NEXT:    flat_store_dword v[0:1], v2
481; GFX8-NEXT:    s_endpgm
482;
483; GFX9-NODL-LABEL: idot2_MixedTypedMul:
484; GFX9-NODL:       ; %bb.0: ; %entry
485; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
486; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
487; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
488; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
489; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
490; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
491; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
492; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
493; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
494; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
495; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
496; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
497; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
498; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
499; GFX9-NODL-NEXT:    s_endpgm
500;
501; GFX9-DL-LABEL: idot2_MixedTypedMul:
502; GFX9-DL:       ; %bb.0: ; %entry
503; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
504; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
505; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
506; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
507; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
508; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
509; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
510; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
511; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
512; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
513; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
514; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
515; GFX9-DL-NEXT:    v_add3_u32 v1, v1, s0, v3
516; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
517; GFX9-DL-NEXT:    s_endpgm
518;
519; GFX10-DL-LABEL: idot2_MixedTypedMul:
520; GFX10-DL:       ; %bb.0: ; %entry
521; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
522; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
523; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
524; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
525; GFX10-DL-NEXT:    s_clause 0x1
526; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
527; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
528; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
529; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
530; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v0, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
531; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
532; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
533; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
534; GFX10-DL-NEXT:    v_add3_u32 v0, v1, s2, v0
535; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
536; GFX10-DL-NEXT:    s_endpgm
537                                               <2 x i16> addrspace(1)* %src2,
538                                               i32 addrspace(1)* nocapture %dst) {
539entry:
  ; The work-item id selects which <2 x i16> element of each source is read.
540  %idx = call i32 @llvm.amdgcn.workitem.id.x()
541  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
542  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
543  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
544  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
545
  ; Lane 0 product: both operands are sign-extended.
546  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
547  %conv = sext i16 %s1.elt1 to i32
548  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
549  %conv2 = sext i16 %s2.elt1 to i32
550  %mul1 = mul nuw i32 %conv2, %conv
551
  ; Lane 1 product: both operands are zero-extended, unlike lane 0.
552  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
553  %conv3 = zext i16 %s1.elt2 to i32
554  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
555  %conv4 = zext i16 %s2.elt2 to i32
556  %mul2 = mul nuw i32 %conv4, %conv3
557
  ; Accumulate as (mul2 + *dst) + mul1 and write the result back to *dst.
558  %s3 = load i32, i32 addrspace(1)* %dst, align 4
559  %add = add i32 %mul2, %s3
560  %add6 = add i32 %add, %mul1
561  store i32 %add6, i32 addrspace(1)* %dst, align 4
562  ret void
563}
564
; udot2_alt_AddOperands - unsigned two-lane i16 dot product with accumulate,
; written with the operands of both adds in the opposite order:
; mul1 + (s3 + mul2). Loads, zero-extensions, multiplies and the store to %dst
; are otherwise a plain unsigned dot product.
; The gfx906 and gfx1011/gfx1012 check blocks below expect a single
; v_dot2_u32_u16; the remaining check blocks expect and/shift plus
; v_mad_u32_u24 sequences.
565define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
566; GFX7-LABEL: udot2_alt_AddOperands:
567; GFX7:       ; %bb.0: ; %entry
568; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
569; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
570; GFX7-NEXT:    s_mov_b32 s3, 0xf000
571; GFX7-NEXT:    s_mov_b32 s10, 0
572; GFX7-NEXT:    s_mov_b32 s11, s3
573; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
574; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
575; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
576; GFX7-NEXT:    v_mov_b32_e32 v1, 0
577; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
578; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
579; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
580; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
581; GFX7-NEXT:    s_mov_b32 s4, 0xffff
582; GFX7-NEXT:    s_mov_b32 s2, -1
583; GFX7-NEXT:    s_waitcnt vmcnt(1)
584; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
585; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
586; GFX7-NEXT:    s_waitcnt vmcnt(0)
587; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
588; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
589; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
590; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, s5
591; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
592; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
593; GFX7-NEXT:    s_endpgm
594;
595; GFX8-LABEL: udot2_alt_AddOperands:
596; GFX8:       ; %bb.0: ; %entry
597; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
598; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
599; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
600; GFX8-NEXT:    s_mov_b32 s2, 0xffff
601; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
602; GFX8-NEXT:    v_mov_b32_e32 v1, s5
603; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
604; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
605; GFX8-NEXT:    flat_load_dword v3, v[0:1]
606; GFX8-NEXT:    v_mov_b32_e32 v1, s7
607; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
608; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
609; GFX8-NEXT:    flat_load_dword v0, v[0:1]
610; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
611; GFX8-NEXT:    s_waitcnt vmcnt(1)
612; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
613; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
614; GFX8-NEXT:    s_waitcnt vmcnt(0)
615; GFX8-NEXT:    v_and_b32_e32 v2, s2, v0
616; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
617; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
618; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, s3
619; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v1, v0
620; GFX8-NEXT:    v_mov_b32_e32 v0, s0
621; GFX8-NEXT:    v_mov_b32_e32 v1, s1
622; GFX8-NEXT:    flat_store_dword v[0:1], v2
623; GFX8-NEXT:    s_endpgm
624;
625; GFX9-NODL-LABEL: udot2_alt_AddOperands:
626; GFX9-NODL:       ; %bb.0: ; %entry
627; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
628; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
629; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
630; GFX9-NODL-NEXT:    s_mov_b32 s0, 0xffff
631; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
632; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
633; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
634; GFX9-NODL-NEXT:    s_load_dword s1, s[2:3], 0x0
635; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
636; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
637; GFX9-NODL-NEXT:    v_and_b32_e32 v3, s0, v1
638; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
639; GFX9-NODL-NEXT:    v_and_b32_e32 v4, s0, v2
640; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
641; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
642; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
643; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, v2, v1, s1
644; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, v4, v3, v1
645; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
646; GFX9-NODL-NEXT:    s_endpgm
647;
648; GFX9-DL-LABEL: udot2_alt_AddOperands:
649; GFX9-DL:       ; %bb.0: ; %entry
650; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
651; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
652; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
653; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
654; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
655; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
656; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
657; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
658; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
659; GFX9-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s0
660; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
661; GFX9-DL-NEXT:    s_endpgm
662;
663; GFX10-DL-LABEL: udot2_alt_AddOperands:
664; GFX10-DL:       ; %bb.0: ; %entry
665; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
666; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
667; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
668; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
669; GFX10-DL-NEXT:    s_clause 0x1
670; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
671; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
672; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
673; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
674; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
675; GFX10-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s2
676; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
677; GFX10-DL-NEXT:    s_endpgm
678                                                 <2 x i16> addrspace(1)* %src2,
679                                                 i32 addrspace(1)* nocapture %dst) {
680entry:
  ; The work-item id selects which <2 x i16> element of each source is read.
681  %idx = call i32 @llvm.amdgcn.workitem.id.x()
682  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
683  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
684  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
685  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
686
  ; Lane 0 product: zero-extend element 0 of both vectors, then multiply.
687  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
688  %conv = zext i16 %s1.elt1 to i32
689  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
690  %conv2 = zext i16 %s2.elt1 to i32
691  %mul1 = mul nuw i32 %conv2, %conv
692
  ; Lane 1 product: zero-extend element 1 of both vectors, then multiply.
693  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
694  %conv3 = zext i16 %s1.elt2 to i32
695  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
696  %conv4 = zext i16 %s2.elt2 to i32
697  %mul2 = mul nuw i32 %conv4, %conv3
698
  ; Accumulate as mul1 + (*dst + mul2): in each add the multiply is the
  ; operand on the opposite side compared with the (mul2 + s3) + mul1 form.
699  %s3 = load i32, i32 addrspace(1)* %dst, align 4
700  %add = add i32 %s3, %mul2
701  %add6 = add i32 %mul1, %add
702  store i32 %add6, i32 addrspace(1)* %dst, align 4
703  ret void
704}
705
; idot2_MixedExt: negative test for the 16-bit dot2 pattern with mixed extensions.
; Element 0 of %vec1 is sign-extended while element 0 of %vec2 is zero-extended;
; both element-1 operands are sign-extended. None of the check prefixes below
; expects a v_dot2 instruction: the GFX9 and GFX10 sequences use two
; v_mul_i32_i24_sdwa plus v_add3_u32, and GFX7/GFX8 use two v_mad_i32_i24.
706define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1,
707; GFX7-LABEL: idot2_MixedExt:
708; GFX7:       ; %bb.0: ; %entry
709; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
710; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
711; GFX7-NEXT:    s_mov_b32 s3, 0xf000
712; GFX7-NEXT:    s_mov_b32 s10, 0
713; GFX7-NEXT:    s_mov_b32 s11, s3
714; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
715; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
716; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
717; GFX7-NEXT:    v_mov_b32_e32 v1, 0
718; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
719; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
720; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
721; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
722; GFX7-NEXT:    s_mov_b32 s2, -1
723; GFX7-NEXT:    s_waitcnt vmcnt(1)
724; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 16
725; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
726; GFX7-NEXT:    s_waitcnt vmcnt(0)
727; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff, v0
728; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
729; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
730; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, s4
731; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v1, v0
732; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
733; GFX7-NEXT:    s_endpgm
734;
735; GFX8-LABEL: idot2_MixedExt:
736; GFX8:       ; %bb.0: ; %entry
737; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
738; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
739; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
740; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
741; GFX8-NEXT:    v_mov_b32_e32 v1, s5
742; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
743; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
744; GFX8-NEXT:    flat_load_dword v3, v[0:1]
745; GFX8-NEXT:    v_mov_b32_e32 v1, s7
746; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
747; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
748; GFX8-NEXT:    flat_load_dword v0, v[0:1]
749; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
750; GFX8-NEXT:    s_waitcnt vmcnt(1)
751; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
752; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
753; GFX8-NEXT:    s_waitcnt vmcnt(0)
754; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v0
755; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
756; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
757; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v3, s2
758; GFX8-NEXT:    v_mad_i32_i24 v2, v2, v1, v0
759; GFX8-NEXT:    v_mov_b32_e32 v0, s0
760; GFX8-NEXT:    v_mov_b32_e32 v1, s1
761; GFX8-NEXT:    flat_store_dword v[0:1], v2
762; GFX8-NEXT:    s_endpgm
763;
764; GFX9-NODL-LABEL: idot2_MixedExt:
765; GFX9-NODL:       ; %bb.0: ; %entry
766; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
767; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
768; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
769; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
770; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
771; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
772; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
773; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
774; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
775; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
776; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
777; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
778; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
779; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
780; GFX9-NODL-NEXT:    s_endpgm
781;
782; GFX9-DL-LABEL: idot2_MixedExt:
783; GFX9-DL:       ; %bb.0: ; %entry
784; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
785; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
786; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
787; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
788; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
789; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
790; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
791; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
792; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
793; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
794; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
795; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
796; GFX9-DL-NEXT:    v_add3_u32 v1, v1, s0, v3
797; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
798; GFX9-DL-NEXT:    s_endpgm
799;
800; GFX10-DL-LABEL: idot2_MixedExt:
801; GFX10-DL:       ; %bb.0: ; %entry
802; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
803; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
804; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
805; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
806; GFX10-DL-NEXT:    s_clause 0x1
807; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
808; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
809; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
810; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
811; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v0, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
812; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
813; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
814; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
815; GFX10-DL-NEXT:    v_add3_u32 v0, v1, s2, v0
816; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
817; GFX10-DL-NEXT:    s_endpgm
818                                          <2 x i16> addrspace(1)* %src2,
819                                          i32 addrspace(1)* nocapture %dst) {
820entry:
  ; Each work-item indexes both source vectors by its workitem.id.x.
821  %idx = call i32 @llvm.amdgcn.workitem.id.x()
822  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
823  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
824  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
825  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
826
  ; Element 0 product: sext(%vec1[0]) * zext(%vec2[0]) -- the mixed-extension pair.
827  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
828  %conv = sext i16 %s1.elt1 to i32
829  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
830  %conv2 = zext i16 %s2.elt1 to i32
831  %mul1 = mul nuw i32 %conv2, %conv
832
  ; Element 1 product: both operands are sign-extended.
833  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
834  %conv3 = sext i16 %s1.elt2 to i32
835  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
836  %conv4 = sext i16 %s2.elt2 to i32
837  %mul2 = mul nuw i32 %conv4, %conv3
838
  ; Add both products to the i32 loaded from %dst and store the sum back to %dst.
839  %s3 = load i32, i32 addrspace(1)* %dst, align 4
840  %add = add i32 %mul2, %s3
841  %add6 = add i32 %add, %mul1
842  store i32 %add6, i32 addrspace(1)* %dst, align 4
843  ret void
844}
845
; notudot2_SameVec: negative test in which each multiply squares a single lane.
; %mul1 multiplies element 0 of %vec1 by itself and %mul2 multiplies element 1
; of %vec2 by itself, so neither product pairs a lane of %vec1 with a lane of
; %vec2. No check prefix below expects a v_dot2 instruction.
846define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1,
847; GFX7-LABEL: notudot2_SameVec:
848; GFX7:       ; %bb.0: ; %entry
849; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
850; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
851; GFX7-NEXT:    s_mov_b32 s3, 0xf000
852; GFX7-NEXT:    s_mov_b32 s10, 0
853; GFX7-NEXT:    s_mov_b32 s11, s3
854; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
855; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
856; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
857; GFX7-NEXT:    v_mov_b32_e32 v1, 0
858; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
859; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
860; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
861; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
862; GFX7-NEXT:    s_mov_b32 s2, -1
863; GFX7-NEXT:    s_waitcnt vmcnt(1)
864; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v2
865; GFX7-NEXT:    s_waitcnt vmcnt(0)
866; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
867; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
868; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v0, s4
869; GFX7-NEXT:    v_mad_u32_u24 v0, v1, v1, v0
870; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
871; GFX7-NEXT:    s_endpgm
872;
873; GFX8-LABEL: notudot2_SameVec:
874; GFX8:       ; %bb.0: ; %entry
875; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
876; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
877; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
878; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
879; GFX8-NEXT:    v_mov_b32_e32 v1, s5
880; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
881; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
882; GFX8-NEXT:    flat_load_dword v3, v[0:1]
883; GFX8-NEXT:    v_mov_b32_e32 v1, s7
884; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
885; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
886; GFX8-NEXT:    flat_load_dword v0, v[0:1]
887; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
888; GFX8-NEXT:    s_waitcnt vmcnt(1)
889; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v3
890; GFX8-NEXT:    s_waitcnt vmcnt(0)
891; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
892; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
893; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v0, s2
894; GFX8-NEXT:    v_mad_u32_u24 v2, v1, v1, v0
895; GFX8-NEXT:    v_mov_b32_e32 v0, s0
896; GFX8-NEXT:    v_mov_b32_e32 v1, s1
897; GFX8-NEXT:    flat_store_dword v[0:1], v2
898; GFX8-NEXT:    s_endpgm
899;
900; GFX9-NODL-LABEL: notudot2_SameVec:
901; GFX9-NODL:       ; %bb.0: ; %entry
902; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
903; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
904; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
905; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
906; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
907; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
908; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
909; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
910; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
911; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
912; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
913; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
914; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
915; GFX9-NODL-NEXT:    v_add3_u32 v1, v2, s0, v1
916; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
917; GFX9-NODL-NEXT:    s_endpgm
918;
919; GFX9-DL-LABEL: notudot2_SameVec:
920; GFX9-DL:       ; %bb.0: ; %entry
921; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
922; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
923; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
924; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
925; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
926; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
927; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
928; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
929; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
930; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
931; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
932; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
933; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
934; GFX9-DL-NEXT:    v_add3_u32 v1, v2, s0, v1
935; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
936; GFX9-DL-NEXT:    s_endpgm
937;
938; GFX10-DL-LABEL: notudot2_SameVec:
939; GFX10-DL:       ; %bb.0: ; %entry
940; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
941; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
942; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
943; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
944; GFX10-DL-NEXT:    s_clause 0x1
945; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
946; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
947; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
948; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
949; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
950; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
951; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
952; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
953; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
954; GFX10-DL-NEXT:    v_add3_u32 v0, v1, s2, v0
955; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
956; GFX10-DL-NEXT:    s_endpgm
957                                            <2 x i16> addrspace(1)* %src2,
958                                            i32 addrspace(1)* nocapture %dst) {
959entry:
  ; Each work-item indexes both source vectors by its workitem.id.x.
960  %idx = call i32 @llvm.amdgcn.workitem.id.x()
961  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
962  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
963  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
964  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
965
  ; Both operands of %mul1 are element 0 of %vec1 (despite the %s2.elt1 name).
966  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
967  %conv = zext i16 %s1.elt1 to i32
968  %s2.elt1 = extractelement <2 x i16> %vec1, i64 0
969  %conv2 = zext i16 %s2.elt1 to i32
970  %mul1 = mul i32 %conv2, %conv
971
  ; Both operands of %mul2 are element 1 of %vec2 (despite the %s1.elt2 name).
972  %s1.elt2 = extractelement <2 x i16> %vec2, i64 1
973  %conv3 = zext i16 %s1.elt2 to i32
974  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
975  %conv4 = zext i16 %s2.elt2 to i32
976  %mul2 = mul i32 %conv4, %conv3
977
  ; Add both products to the i32 loaded from %dst and store the sum back to %dst.
978  %s3 = load i32, i32 addrspace(1)* %dst, align 4
979  %add = add i32 %mul2, %s3
980  %add6 = add i32 %add, %mul1
981  store i32 %add6, i32 addrspace(1)* %dst, align 4
982  ret void
983}
984
; udot2_v4i16: unsigned dot2 pattern on elements 0 and 1 of <4 x i16> vectors.
; Both products use zero-extended operands and are added to the i32 loaded from
; %dst. The GFX9-DL and GFX10-DL checks expect a single v_dot2_u32_u16; the
; remaining prefixes expect 24-bit multiply/mad sequences. Every expected
; sequence loads only one dword from each source vector.
985define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1,
986; GFX7-LABEL: udot2_v4i16:
987; GFX7:       ; %bb.0: ; %entry
988; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
989; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
990; GFX7-NEXT:    s_mov_b32 s3, 0xf000
991; GFX7-NEXT:    s_mov_b32 s10, 0
992; GFX7-NEXT:    s_mov_b32 s11, s3
993; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
994; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
995; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
996; GFX7-NEXT:    v_mov_b32_e32 v1, 0
997; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
998; GFX7-NEXT:    s_mov_b64 s[6:7], s[10:11]
999; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1000; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1001; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
1002; GFX7-NEXT:    s_mov_b32 s4, 0xffff
1003; GFX7-NEXT:    s_mov_b32 s2, -1
1004; GFX7-NEXT:    s_waitcnt vmcnt(1)
1005; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
1006; GFX7-NEXT:    s_waitcnt vmcnt(0)
1007; GFX7-NEXT:    v_and_b32_e32 v3, s4, v0
1008; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1009; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1010; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1011; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, s5
1012; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v1, v0
1013; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1014; GFX7-NEXT:    s_endpgm
1015;
1016; GFX8-LABEL: udot2_v4i16:
1017; GFX8:       ; %bb.0: ; %entry
1018; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1019; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1020; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1021; GFX8-NEXT:    s_mov_b32 s2, 0xffff
1022; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1023; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1024; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1025; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1026; GFX8-NEXT:    v_mov_b32_e32 v3, s7
1027; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s6, v2
1028; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1029; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1030; GFX8-NEXT:    flat_load_dword v1, v[2:3]
1031; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
1032; GFX8-NEXT:    s_waitcnt vmcnt(1)
1033; GFX8-NEXT:    v_and_b32_e32 v2, s2, v0
1034; GFX8-NEXT:    s_waitcnt vmcnt(0)
1035; GFX8-NEXT:    v_and_b32_e32 v3, s2, v1
1036; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1037; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1038; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1039; GFX8-NEXT:    v_mad_u32_u24 v0, v1, v0, s3
1040; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v2, v0
1041; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1042; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1043; GFX8-NEXT:    flat_store_dword v[0:1], v2
1044; GFX8-NEXT:    s_endpgm
1045;
1046; GFX9-NODL-LABEL: udot2_v4i16:
1047; GFX9-NODL:       ; %bb.0: ; %entry
1048; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1049; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1050; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1051; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1052; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
1053; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
1054; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
1055; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1056; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1057; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1058; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1059; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1060; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
1061; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
1062; GFX9-NODL-NEXT:    s_endpgm
1063;
1064; GFX9-DL-LABEL: udot2_v4i16:
1065; GFX9-DL:       ; %bb.0: ; %entry
1066; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1067; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1068; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1069; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1070; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1071; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1072; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
1073; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1074; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1075; GFX9-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s0
1076; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
1077; GFX9-DL-NEXT:    s_endpgm
1078;
1079; GFX10-DL-LABEL: udot2_v4i16:
1080; GFX10-DL:       ; %bb.0: ; %entry
1081; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1082; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1083; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1084; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1085; GFX10-DL-NEXT:    s_clause 0x1
1086; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1087; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1088; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
1089; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
1090; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1091; GFX10-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s2
1092; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
1093; GFX10-DL-NEXT:    s_endpgm
1094                                       <4 x i16> addrspace(1)* %src2,
1095                                       i32 addrspace(1)* nocapture %dst) {
1096entry:
  ; Each work-item indexes both <4 x i16> source vectors by its workitem.id.x.
1097  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1098  %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src1, i32 %idx
1099  %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %gep1
1100  %gep2 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src2, i32 %idx
1101  %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %gep2
1102
  ; Element 0 product: zext(%vec2[0]) * zext(%vec1[0]).
1103  %s1.elt1 = extractelement <4 x i16> %vec1, i64 0
1104  %conv = zext i16 %s1.elt1 to i32
1105  %s2.elt1 = extractelement <4 x i16> %vec2, i64 0
1106  %conv2 = zext i16 %s2.elt1 to i32
1107  %mul1 = mul i32 %conv2, %conv
1108
  ; Element 1 product: zext(%vec2[1]) * zext(%vec1[1]).
1109  %s1.elt2 = extractelement <4 x i16> %vec1, i64 1
1110  %conv3 = zext i16 %s1.elt2 to i32
1111  %s2.elt2 = extractelement <4 x i16> %vec2, i64 1
1112  %conv4 = zext i16 %s2.elt2 to i32
1113  %mul2 = mul i32 %conv4, %conv3
1114
  ; Add both products to the i32 loaded from %dst and store the sum back to %dst.
1115  %s3 = load i32, i32 addrspace(1)* %dst, align 4
1116  %add = add i32 %mul2, %s3
1117  %add6 = add i32 %add, %mul1
1118  store i32 %add6, i32 addrspace(1)* %dst, align 4
1119  ret void
1120}
1121
; udot2_v4i16_Hi: unsigned dot2 pattern on elements 2 and 3 (the upper pair) of
; <4 x i16> vectors. Both products use zero-extended operands and are added to
; the i32 loaded from %dst. The expected loads read one dword at byte offset 4
; from each vector (the GFX8 sequence adds 4 to the address instead). The
; GFX9-DL and GFX10-DL checks expect a single v_dot2_u32_u16; the remaining
; prefixes expect 24-bit multiply/mad sequences.
1122define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1,
1123; GFX7-LABEL: udot2_v4i16_Hi:
1124; GFX7:       ; %bb.0: ; %entry
1125; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1126; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1127; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1128; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1129; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1130; GFX7-NEXT:    s_mov_b32 s10, 0
1131; GFX7-NEXT:    s_mov_b32 s11, s3
1132; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1133; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1134; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
1135; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1136; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4
1137; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
1138; GFX7-NEXT:    s_mov_b32 s4, 0xffff
1139; GFX7-NEXT:    s_mov_b32 s2, -1
1140; GFX7-NEXT:    s_waitcnt vmcnt(1)
1141; GFX7-NEXT:    v_and_b32_e32 v1, s4, v2
1142; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1143; GFX7-NEXT:    s_waitcnt vmcnt(0)
1144; GFX7-NEXT:    v_and_b32_e32 v3, s4, v0
1145; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1146; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1147; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, s5
1148; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v1, v0
1149; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1150; GFX7-NEXT:    s_endpgm
1151;
1152; GFX8-LABEL: udot2_v4i16_Hi:
1153; GFX8:       ; %bb.0: ; %entry
1154; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1155; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1156; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1157; GFX8-NEXT:    s_mov_b32 s2, 0xffff
1158; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1159; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1160; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s4, v0
1161; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1162; GFX8-NEXT:    v_mov_b32_e32 v3, s7
1163; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s6, v0
1164; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1165; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v2
1166; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1167; GFX8-NEXT:    flat_load_dword v2, v[0:1]
1168; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v4
1169; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
1170; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1171; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
1172; GFX8-NEXT:    s_waitcnt vmcnt(1)
1173; GFX8-NEXT:    v_and_b32_e32 v1, s2, v2
1174; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1175; GFX8-NEXT:    s_waitcnt vmcnt(0)
1176; GFX8-NEXT:    v_and_b32_e32 v3, s2, v0
1177; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1178; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1179; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v2, s3
1180; GFX8-NEXT:    v_mad_u32_u24 v2, v3, v1, v0
1181; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1182; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1183; GFX8-NEXT:    flat_store_dword v[0:1], v2
1184; GFX8-NEXT:    s_endpgm
1185;
1186; GFX9-NODL-LABEL: udot2_v4i16_Hi:
1187; GFX9-NODL:       ; %bb.0: ; %entry
1188; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1189; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1190; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1191; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1192; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5] offset:4
1193; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7] offset:4
1194; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
1195; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1196; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1197; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1198; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1199; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1200; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
1201; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
1202; GFX9-NODL-NEXT:    s_endpgm
1203;
1204; GFX9-DL-LABEL: udot2_v4i16_Hi:
1205; GFX9-DL:       ; %bb.0: ; %entry
1206; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1207; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1208; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1209; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1210; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5] offset:4
1211; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7] offset:4
1212; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
1213; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1214; GFX9-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1215; GFX9-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s0
1216; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
1217; GFX9-DL-NEXT:    s_endpgm
1218;
1219; GFX10-DL-LABEL: udot2_v4i16_Hi:
1220; GFX10-DL:       ; %bb.0: ; %entry
1221; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1222; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1223; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1224; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1225; GFX10-DL-NEXT:    s_clause 0x1
1226; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5] offset:4
1227; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7] offset:4
1228; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
1229; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
1230; GFX10-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1231; GFX10-DL-NEXT:    v_dot2_u32_u16 v1, v2, v1, s2
1232; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
1233; GFX10-DL-NEXT:    s_endpgm
1234                                          <4 x i16> addrspace(1)* %src2,
1235                                          i32 addrspace(1)* nocapture %dst) {
1236entry:
  ; Each work-item indexes both <4 x i16> source vectors by its workitem.id.x.
1237  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1238  %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src1, i32 %idx
1239  %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %gep1
1240  %gep2 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src2, i32 %idx
1241  %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %gep2
1242
  ; Element 2 product: zext(%vec2[2]) * zext(%vec1[2]).
1243  %s1.elt1 = extractelement <4 x i16> %vec1, i64 2
1244  %conv = zext i16 %s1.elt1 to i32
1245  %s2.elt1 = extractelement <4 x i16> %vec2, i64 2
1246  %conv2 = zext i16 %s2.elt1 to i32
1247  %mul1 = mul i32 %conv2, %conv
1248
  ; Element 3 product: zext(%vec2[3]) * zext(%vec1[3]).
1249  %s1.elt2 = extractelement <4 x i16> %vec1, i64 3
1250  %conv3 = zext i16 %s1.elt2 to i32
1251  %s2.elt2 = extractelement <4 x i16> %vec2, i64 3
1252  %conv4 = zext i16 %s2.elt2 to i32
1253  %mul2 = mul i32 %conv4, %conv3
1254
  ; Add both products to the i32 loaded from %dst and store the sum back to %dst.
1255  %s3 = load i32, i32 addrspace(1)* %dst, align 4
1256  %add = add i32 %mul2, %s3
1257  %add6 = add i32 %add, %mul1
1258  store i32 %add6, i32 addrspace(1)* %dst, align 4
1259  ret void
1260}
1261
; notudot2_v4i16_Even: negative test using elements 0 and 2 (the even lanes) of
; <4 x i16> vectors. The two multiplied lanes are not adjacent, so every
; expected sequence loads both dwords of each vector (dwordx2) and multiplies
; the low 16 bits of each dword. No check prefix below expects a v_dot2
; instruction.
1262define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1,
1263; GFX7-LABEL: notudot2_v4i16_Even:
1264; GFX7:       ; %bb.0: ; %entry
1265; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1266; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1267; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1268; GFX7-NEXT:    s_mov_b32 s10, 0
1269; GFX7-NEXT:    s_mov_b32 s11, s3
1270; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1271; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1272; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1273; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1274; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
1275; GFX7-NEXT:    s_mov_b64 s[6:7], s[10:11]
1276; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
1277; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
1278; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
1279; GFX7-NEXT:    s_mov_b32 s4, 0xffff
1280; GFX7-NEXT:    s_mov_b32 s2, -1
1281; GFX7-NEXT:    s_waitcnt vmcnt(1)
1282; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
1283; GFX7-NEXT:    s_waitcnt vmcnt(0)
1284; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
1285; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
1286; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
1287; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1288; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v3, s5
1289; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
1290; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1291; GFX7-NEXT:    s_endpgm
1292;
1293; GFX8-LABEL: notudot2_v4i16_Even:
1294; GFX8:       ; %bb.0: ; %entry
1295; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1296; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1297; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1298; GFX8-NEXT:    s_mov_b32 s2, 0xffff
1299; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1300; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1301; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1302; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1303; GFX8-NEXT:    v_mov_b32_e32 v3, s7
1304; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s6, v2
1305; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1306; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1307; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
1308; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
1309; GFX8-NEXT:    s_waitcnt vmcnt(1)
1310; GFX8-NEXT:    v_and_b32_e32 v1, s2, v1
1311; GFX8-NEXT:    s_waitcnt vmcnt(0)
1312; GFX8-NEXT:    v_and_b32_e32 v3, s2, v3
1313; GFX8-NEXT:    v_and_b32_e32 v0, s2, v0
1314; GFX8-NEXT:    v_and_b32_e32 v2, s2, v2
1315; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1316; GFX8-NEXT:    v_mad_u32_u24 v1, v3, v1, s3
1317; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v0, v1
1318; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1319; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1320; GFX8-NEXT:    flat_store_dword v[0:1], v2
1321; GFX8-NEXT:    s_endpgm
1322;
1323; GFX9-NODL-LABEL: notudot2_v4i16_Even:
1324; GFX9-NODL:       ; %bb.0: ; %entry
1325; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1326; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1327; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1328; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1329; GFX9-NODL-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
1330; GFX9-NODL-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
1331; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
1332; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, 0
1333; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1334; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1335; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1336; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1337; GFX9-NODL-NEXT:    v_add3_u32 v0, v1, s0, v0
1338; GFX9-NODL-NEXT:    global_store_dword v4, v0, s[2:3]
1339; GFX9-NODL-NEXT:    s_endpgm
1340;
1341; GFX9-DL-LABEL: notudot2_v4i16_Even:
1342; GFX9-DL:       ; %bb.0: ; %entry
1343; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1344; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1345; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1346; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1347; GFX9-DL-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
1348; GFX9-DL-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
1349; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
1350; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 0
1351; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1352; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1353; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1354; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1355; GFX9-DL-NEXT:    v_add3_u32 v0, v1, s0, v0
1356; GFX9-DL-NEXT:    global_store_dword v4, v0, s[2:3]
1357; GFX9-DL-NEXT:    s_endpgm
1358;
1359; GFX10-DL-LABEL: notudot2_v4i16_Even:
1360; GFX10-DL:       ; %bb.0: ; %entry
1361; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1362; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1363; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1364; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1365; GFX10-DL-NEXT:    s_clause 0x1
1366; GFX10-DL-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
1367; GFX10-DL-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
1368; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
1369; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1370; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1371; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1372; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
1373; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1374; GFX10-DL-NEXT:    v_add3_u32 v0, v1, s2, v0
1375; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
1376; GFX10-DL-NEXT:    s_endpgm
1377                                               <4 x i16> addrspace(1)* %src2,
1378                                               i32 addrspace(1)* nocapture %dst) {
1379entry:
  ; Each work-item indexes both <4 x i16> source vectors by its workitem.id.x.
1380  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1381  %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src1, i32 %idx
1382  %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %gep1
1383  %gep2 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src2, i32 %idx
1384  %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %gep2
1385
  ; Element 0 product: zext(%vec2[0]) * zext(%vec1[0]).
1386  %s1.elt1 = extractelement <4 x i16> %vec1, i64 0
1387  %conv = zext i16 %s1.elt1 to i32
1388  %s2.elt1 = extractelement <4 x i16> %vec2, i64 0
1389  %conv2 = zext i16 %s2.elt1 to i32
1390  %mul1 = mul i32 %conv2, %conv
1391
  ; Element 2 product: zext(%vec2[2]) * zext(%vec1[2]) -- skips element 1.
1392  %s1.elt2 = extractelement <4 x i16> %vec1, i64 2
1393  %conv3 = zext i16 %s1.elt2 to i32
1394  %s2.elt2 = extractelement <4 x i16> %vec2, i64 2
1395  %conv4 = zext i16 %s2.elt2 to i32
1396  %mul2 = mul i32 %conv4, %conv3
1397
  ; Add both products to the i32 loaded from %dst and store the sum back to %dst.
1398  %s3 = load i32, i32 addrspace(1)* %dst, align 4
1399  %add = add i32 %mul2, %s3
1400  %add6 = add i32 %add, %mul1
1401  store i32 %add6, i32 addrspace(1)* %dst, align 4
1402  ret void
1403}
1404
; notudot2_v4i16_Middle: both inputs are <4 x i16> vectors indexed by the
; workitem id. The kernel multiplies the zero-extended elements at index 1
; and at index 2 of the two vectors and adds both products to the i32 loaded
; from %dst, storing the sum back to %dst.
; In the expected code, index 1 is the high half of the first loaded dword and
; index 2 is the low half of the second one; none of the expected outputs
; contains a v_dot2 instruction (the "not" in the name presumably marks this
; as a negative test for dot2 formation).
1405define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1,
1406; GFX7-LABEL: notudot2_v4i16_Middle:
1407; GFX7:       ; %bb.0: ; %entry
1408; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1409; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1410; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1411; GFX7-NEXT:    s_mov_b32 s10, 0
1412; GFX7-NEXT:    s_mov_b32 s11, s3
1413; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1414; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1415; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1416; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1417; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
1418; GFX7-NEXT:    s_mov_b64 s[6:7], s[10:11]
1419; GFX7-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
1420; GFX7-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
1421; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
1422; GFX7-NEXT:    s_mov_b32 s4, 0xffff
1423; GFX7-NEXT:    s_mov_b32 s2, -1
1424; GFX7-NEXT:    s_waitcnt vmcnt(1)
1425; GFX7-NEXT:    v_and_b32_e32 v3, s4, v3
1426; GFX7-NEXT:    s_waitcnt vmcnt(0)
1427; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
1428; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1429; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1430; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1431; GFX7-NEXT:    v_mad_u32_u24 v1, v1, v3, s5
1432; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
1433; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1434; GFX7-NEXT:    s_endpgm
1435;
1436; GFX8-LABEL: notudot2_v4i16_Middle:
1437; GFX8:       ; %bb.0: ; %entry
1438; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1439; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1440; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1441; GFX8-NEXT:    s_mov_b32 s2, 0xffff
1442; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1443; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1444; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1445; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1446; GFX8-NEXT:    v_mov_b32_e32 v3, s7
1447; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s6, v2
1448; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1449; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1450; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
1451; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
1452; GFX8-NEXT:    s_waitcnt vmcnt(1)
1453; GFX8-NEXT:    v_and_b32_e32 v1, s2, v1
1454; GFX8-NEXT:    s_waitcnt vmcnt(0)
1455; GFX8-NEXT:    v_and_b32_e32 v3, s2, v3
1456; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1457; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1458; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1459; GFX8-NEXT:    v_mad_u32_u24 v1, v3, v1, s3
1460; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v0, v1
1461; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1462; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1463; GFX8-NEXT:    flat_store_dword v[0:1], v2
1464; GFX8-NEXT:    s_endpgm
1465;
1466; GFX9-NODL-LABEL: notudot2_v4i16_Middle:
1467; GFX9-NODL:       ; %bb.0: ; %entry
1468; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1469; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1470; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1471; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1472; GFX9-NODL-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
1473; GFX9-NODL-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
1474; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
1475; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, 0
1476; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1477; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1478; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1479; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1480; GFX9-NODL-NEXT:    v_add3_u32 v0, v1, s0, v0
1481; GFX9-NODL-NEXT:    global_store_dword v4, v0, s[2:3]
1482; GFX9-NODL-NEXT:    s_endpgm
1483;
1484; GFX9-DL-LABEL: notudot2_v4i16_Middle:
1485; GFX9-DL:       ; %bb.0: ; %entry
1486; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1487; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1488; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1489; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1490; GFX9-DL-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
1491; GFX9-DL-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
1492; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
1493; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 0
1494; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1495; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1496; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1497; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1498; GFX9-DL-NEXT:    v_add3_u32 v0, v1, s0, v0
1499; GFX9-DL-NEXT:    global_store_dword v4, v0, s[2:3]
1500; GFX9-DL-NEXT:    s_endpgm
1501;
1502; GFX10-DL-LABEL: notudot2_v4i16_Middle:
1503; GFX10-DL:       ; %bb.0: ; %entry
1504; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1505; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1506; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1507; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1508; GFX10-DL-NEXT:    s_clause 0x1
1509; GFX10-DL-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
1510; GFX10-DL-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
1511; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
1512; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1513; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1514; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1515; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
1516; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1517; GFX10-DL-NEXT:    v_add3_u32 v0, v1, s2, v0
1518; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
1519; GFX10-DL-NEXT:    s_endpgm
1520                                                 <4 x i16> addrspace(1)* %src2,
1521                                                 i32 addrspace(1)* nocapture %dst) {
1522entry:
  ; Each workitem loads its own <4 x i16> element from %src1 and %src2.
1523  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1524  %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src1, i32 %idx
1525  %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %gep1
1526  %gep2 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %src2, i32 %idx
1527  %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %gep2
1528
  ; mul1 = zext(vec2[1]) * zext(vec1[1])
1529  %s1.elt1 = extractelement <4 x i16> %vec1, i64 1
1530  %conv = zext i16 %s1.elt1 to i32
1531  %s2.elt1 = extractelement <4 x i16> %vec2, i64 1
1532  %conv2 = zext i16 %s2.elt1 to i32
1533  %mul1 = mul i32 %conv2, %conv
1534
  ; mul2 = zext(vec2[2]) * zext(vec1[2])
1535  %s1.elt2 = extractelement <4 x i16> %vec1, i64 2
1536  %conv3 = zext i16 %s1.elt2 to i32
1537  %s2.elt2 = extractelement <4 x i16> %vec2, i64 2
1538  %conv4 = zext i16 %s2.elt2 to i32
1539  %mul2 = mul i32 %conv4, %conv3
1540
  ; Accumulate into the value already stored at %dst: (mul2 + *dst) + mul1.
1541  %s3 = load i32, i32 addrspace(1)* %dst, align 4
1542  %add = add i32 %mul2, %s3
1543  %add6 = add i32 %add, %mul1
1544  store i32 %add6, i32 addrspace(1)* %dst, align 4
1545  ret void
1546}
1547
; notudot2_DiffIndex: both inputs are <2 x i16> vectors indexed by the
; workitem id, but the products are cross-indexed: element 0 of %vec1 is
; multiplied with element 1 of %vec2, and element 1 of %vec1 with element 0
; of %vec2. Both zero-extended products are added to the i32 loaded from %dst
; and the sum is stored back.
; None of the expected outputs contains a v_dot2 instruction (the "not" in
; the name presumably marks this as a negative test for dot2 formation).
1548define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1,
1549; GFX7-LABEL: notudot2_DiffIndex:
1550; GFX7:       ; %bb.0: ; %entry
1551; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1552; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1553; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1554; GFX7-NEXT:    s_mov_b32 s10, 0
1555; GFX7-NEXT:    s_mov_b32 s11, s3
1556; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1557; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1558; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1559; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1560; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1561; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1562; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1563; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
1564; GFX7-NEXT:    s_mov_b32 s4, 0xffff
1565; GFX7-NEXT:    s_mov_b32 s2, -1
1566; GFX7-NEXT:    s_waitcnt vmcnt(1)
1567; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1568; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
1569; GFX7-NEXT:    s_waitcnt vmcnt(0)
1570; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
1571; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
1572; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1573; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v1, s5
1574; GFX7-NEXT:    v_mad_u32_u24 v0, v3, v2, v0
1575; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1576; GFX7-NEXT:    s_endpgm
1577;
1578; GFX8-LABEL: notudot2_DiffIndex:
1579; GFX8:       ; %bb.0: ; %entry
1580; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1581; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1582; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1583; GFX8-NEXT:    s_mov_b32 s2, 0xffff
1584; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1585; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1586; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1587; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1588; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1589; GFX8-NEXT:    v_mov_b32_e32 v1, s7
1590; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
1591; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1592; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1593; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
1594; GFX8-NEXT:    s_waitcnt vmcnt(1)
1595; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
1596; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1597; GFX8-NEXT:    s_waitcnt vmcnt(0)
1598; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
1599; GFX8-NEXT:    v_and_b32_e32 v0, s2, v0
1600; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1601; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, s3
1602; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v1, v0
1603; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1604; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1605; GFX8-NEXT:    flat_store_dword v[0:1], v2
1606; GFX8-NEXT:    s_endpgm
1607;
1608; GFX9-NODL-LABEL: notudot2_DiffIndex:
1609; GFX9-NODL:       ; %bb.0: ; %entry
1610; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1611; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1612; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1613; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1614; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
1615; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
1616; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
1617; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1618; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1619; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
1620; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
1621; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1622; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
1623; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
1624; GFX9-NODL-NEXT:    s_endpgm
1625;
1626; GFX9-DL-LABEL: notudot2_DiffIndex:
1627; GFX9-DL:       ; %bb.0: ; %entry
1628; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1629; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1630; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1631; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1632; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1633; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1634; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
1635; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1636; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1637; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
1638; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
1639; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1640; GFX9-DL-NEXT:    v_add3_u32 v1, v1, s0, v3
1641; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
1642; GFX9-DL-NEXT:    s_endpgm
1643;
1644; GFX10-DL-LABEL: notudot2_DiffIndex:
1645; GFX10-DL:       ; %bb.0: ; %entry
1646; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1647; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1648; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1649; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1650; GFX10-DL-NEXT:    s_clause 0x1
1651; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1652; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1653; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
1654; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1655; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
1656; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
1657; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
1658; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1659; GFX10-DL-NEXT:    v_add3_u32 v0, v1, s2, v0
1660; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
1661; GFX10-DL-NEXT:    s_endpgm
1662                                              <2 x i16> addrspace(1)* %src2,
1663                                              i32 addrspace(1)* nocapture %dst) {
1664entry:
  ; Each workitem loads its own <2 x i16> element from %src1 and %src2.
1665  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1666  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
1667  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
1668  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
1669  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
1670
  ; mul1 = zext(vec2[1]) * zext(vec1[0])  -- indices differ between operands.
1671  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1672  %conv = zext i16 %s1.elt1 to i32
1673  %s2.elt1 = extractelement <2 x i16> %vec2, i64 1
1674  %conv2 = zext i16 %s2.elt1 to i32
1675  %mul1 = mul i32 %conv2, %conv
1676
  ; mul2 = zext(vec2[0]) * zext(vec1[1])  -- the opposite cross pairing.
1677  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1678  %conv3 = zext i16 %s1.elt2 to i32
1679  %s2.elt2 = extractelement <2 x i16> %vec2, i64 0
1680  %conv4 = zext i16 %s2.elt2 to i32
1681  %mul2 = mul i32 %conv4, %conv3
1682
  ; Accumulate into the value already stored at %dst: (mul2 + *dst) + mul1.
1683  %s3 = load i32, i32 addrspace(1)* %dst, align 4
1684  %add = add i32 %mul2, %s3
1685  %add6 = add i32 %add, %mul1
1686  store i32 %add6, i32 addrspace(1)* %dst, align 4
1687  ret void
1688}
1689
; udot2_MultipleUses_add1: unsigned (zext) two-element dot-product pattern on
; <2 x i16> inputs in which the inner add has a second use. %add1 (mul2 plus
; the i32 loaded from %dst) feeds %add2 and is also added again to form %res,
; which is what gets stored.
; None of the expected outputs contains a v_dot2 instruction; the GFX9 and
; GFX10 expectations compute %add1 with v_mad_u32_u24 and pass that register
; twice to v_add3_u32.
1690define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1,
1691; GFX7-LABEL: udot2_MultipleUses_add1:
1692; GFX7:       ; %bb.0: ; %entry
1693; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1694; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1695; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1696; GFX7-NEXT:    s_mov_b32 s10, 0
1697; GFX7-NEXT:    s_mov_b32 s11, s3
1698; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1699; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1700; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1701; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1702; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1703; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1704; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1705; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
1706; GFX7-NEXT:    s_mov_b32 s4, 0xffff
1707; GFX7-NEXT:    s_mov_b32 s2, -1
1708; GFX7-NEXT:    s_waitcnt vmcnt(1)
1709; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
1710; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
1711; GFX7-NEXT:    s_waitcnt vmcnt(0)
1712; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
1713; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
1714; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1715; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, s5
1716; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
1717; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
1718; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1719; GFX7-NEXT:    s_endpgm
1720;
1721; GFX8-LABEL: udot2_MultipleUses_add1:
1722; GFX8:       ; %bb.0: ; %entry
1723; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1724; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1725; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1726; GFX8-NEXT:    s_mov_b32 s2, 0xffff
1727; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1728; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1729; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1730; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1731; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1732; GFX8-NEXT:    v_mov_b32_e32 v1, s7
1733; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
1734; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1735; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1736; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
1737; GFX8-NEXT:    s_waitcnt vmcnt(1)
1738; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
1739; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1740; GFX8-NEXT:    s_waitcnt vmcnt(0)
1741; GFX8-NEXT:    v_and_b32_e32 v2, s2, v0
1742; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1743; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1744; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, s3
1745; GFX8-NEXT:    v_mad_u32_u24 v1, v2, v1, v0
1746; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v1, v0
1747; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1748; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1749; GFX8-NEXT:    flat_store_dword v[0:1], v2
1750; GFX8-NEXT:    s_endpgm
1751;
1752; GFX9-NODL-LABEL: udot2_MultipleUses_add1:
1753; GFX9-NODL:       ; %bb.0: ; %entry
1754; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1755; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1756; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1757; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1758; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
1759; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
1760; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
1761; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1762; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1763; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1764; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1765; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1766; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1767; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, v2, v1, s0
1768; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v3, v1
1769; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
1770; GFX9-NODL-NEXT:    s_endpgm
1771;
1772; GFX9-DL-LABEL: udot2_MultipleUses_add1:
1773; GFX9-DL:       ; %bb.0: ; %entry
1774; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1775; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1776; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1777; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1778; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1779; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1780; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
1781; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1782; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1783; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1784; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1785; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1786; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1787; GFX9-DL-NEXT:    v_mad_u32_u24 v1, v2, v1, s0
1788; GFX9-DL-NEXT:    v_add3_u32 v1, v1, v3, v1
1789; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
1790; GFX9-DL-NEXT:    s_endpgm
1791;
1792; GFX10-DL-LABEL: udot2_MultipleUses_add1:
1793; GFX10-DL:       ; %bb.0: ; %entry
1794; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1795; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1796; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1797; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1798; GFX10-DL-NEXT:    s_clause 0x1
1799; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1800; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1801; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
1802; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1803; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
1804; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1805; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
1806; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1807; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
1808; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1809; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v3, v0, s2
1810; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v1, v0
1811; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
1812; GFX10-DL-NEXT:    s_endpgm
1813                                                   <2 x i16> addrspace(1)* %src2,
1814                                                   i32 addrspace(1)* nocapture %dst) {
1815entry:
  ; Each workitem loads its own <2 x i16> element from %src1 and %src2.
1816  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1817  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
1818  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
1819  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
1820  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
1821
  ; mul1 = zext(vec2[0]) * zext(vec1[0])
1822  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1823  %conv = zext i16 %s1.elt1 to i32
1824  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
1825  %conv2 = zext i16 %s2.elt1 to i32
1826  %mul1 = mul i32 %conv2, %conv
1827
  ; mul2 = zext(vec2[1]) * zext(vec1[1])
1828  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1829  %conv3 = zext i16 %s1.elt2 to i32
1830  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
1831  %conv4 = zext i16 %s2.elt2 to i32
1832  %mul2 = mul i32 %conv4, %conv3
1833
  ; add1 = mul2 + *dst; add2 = add1 + mul1.
1834  %s3 = load i32, i32 addrspace(1)* %dst, align 4
1835  %add1 = add i32 %mul2, %s3
1836  %add2 = add i32 %add1, %mul1
1837
  ; Second use of %add1: the stored result is add2 + add1.
1838  %res = add i32 %add2, %add1
1839  store i32 %res, i32 addrspace(1)* %dst, align 4
1840  ret void
1841}
1842
; idot2_MultipleUses_add1: signed (sext) two-element dot-product pattern on
; <2 x i16> inputs in which the inner add has a second use. %add1 (mul2 plus
; the i32 loaded from %dst) feeds %add2 and is also added again to form %res,
; which is what gets stored.
; None of the expected outputs contains a v_dot2 instruction; the GFX9 and
; GFX10 expectations compute %add1 with v_mad_i32_i24 and pass that register
; twice to v_add3_u32.
1843define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1,
1844; GFX7-LABEL: idot2_MultipleUses_add1:
1845; GFX7:       ; %bb.0: ; %entry
1846; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1847; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1848; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1849; GFX7-NEXT:    s_mov_b32 s10, 0
1850; GFX7-NEXT:    s_mov_b32 s11, s3
1851; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1852; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
1853; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1854; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1855; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1856; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
1857; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1858; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
1859; GFX7-NEXT:    s_mov_b32 s2, -1
1860; GFX7-NEXT:    s_waitcnt vmcnt(1)
1861; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 16
1862; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
1863; GFX7-NEXT:    s_waitcnt vmcnt(0)
1864; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 16
1865; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
1866; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1867; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, s4
1868; GFX7-NEXT:    v_mad_i32_i24 v1, v3, v1, v0
1869; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
1870; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1871; GFX7-NEXT:    s_endpgm
1872;
1873; GFX8-LABEL: idot2_MultipleUses_add1:
1874; GFX8:       ; %bb.0: ; %entry
1875; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1876; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1877; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1878; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1879; GFX8-NEXT:    v_mov_b32_e32 v1, s5
1880; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1881; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1882; GFX8-NEXT:    flat_load_dword v3, v[0:1]
1883; GFX8-NEXT:    v_mov_b32_e32 v1, s7
1884; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
1885; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1886; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1887; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
1888; GFX8-NEXT:    s_waitcnt vmcnt(1)
1889; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
1890; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
1891; GFX8-NEXT:    s_waitcnt vmcnt(0)
1892; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 16
1893; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
1894; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1895; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v3, s2
1896; GFX8-NEXT:    v_mad_i32_i24 v1, v2, v1, v0
1897; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v1, v0
1898; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1899; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1900; GFX8-NEXT:    flat_store_dword v[0:1], v2
1901; GFX8-NEXT:    s_endpgm
1902;
1903; GFX9-NODL-LABEL: idot2_MultipleUses_add1:
1904; GFX9-NODL:       ; %bb.0: ; %entry
1905; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1906; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1907; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1908; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1909; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
1910; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
1911; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
1912; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
1913; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
1914; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1915; GFX9-NODL-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
1916; GFX9-NODL-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
1917; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
1918; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, v2, v1, s0
1919; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v3, v1
1920; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
1921; GFX9-NODL-NEXT:    s_endpgm
1922;
1923; GFX9-DL-LABEL: idot2_MultipleUses_add1:
1924; GFX9-DL:       ; %bb.0: ; %entry
1925; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1926; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1927; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1928; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1929; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1930; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1931; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
1932; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
1933; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
1934; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1935; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
1936; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
1937; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
1938; GFX9-DL-NEXT:    v_mad_i32_i24 v1, v2, v1, s0
1939; GFX9-DL-NEXT:    v_add3_u32 v1, v1, v3, v1
1940; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
1941; GFX9-DL-NEXT:    s_endpgm
1942;
1943; GFX10-DL-LABEL: idot2_MultipleUses_add1:
1944; GFX10-DL:       ; %bb.0: ; %entry
1945; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1946; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1947; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1948; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1949; GFX10-DL-NEXT:    s_clause 0x1
1950; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
1951; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
1952; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
1953; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
1954; GFX10-DL-NEXT:    v_ashrrev_i32_e32 v0, 16, v1
1955; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
1956; GFX10-DL-NEXT:    v_ashrrev_i32_e32 v3, 16, v2
1957; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
1958; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
1959; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
1960; GFX10-DL-NEXT:    v_mad_i32_i24 v0, v3, v0, s2
1961; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v1, v0
1962; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
1963; GFX10-DL-NEXT:    s_endpgm
1964                                                   <2 x i16> addrspace(1)* %src2,
1965                                                   i32 addrspace(1)* nocapture %dst) {
1966entry:
  ; Each workitem loads its own <2 x i16> element from %src1 and %src2.
1967  %idx = call i32 @llvm.amdgcn.workitem.id.x()
1968  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
1969  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
1970  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
1971  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
1972
  ; mul1 = sext(vec2[0]) * sext(vec1[0])
1973  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1974  %conv = sext i16 %s1.elt1 to i32
1975  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
1976  %conv2 = sext i16 %s2.elt1 to i32
1977  %mul1 = mul i32 %conv2, %conv
1978
  ; mul2 = sext(vec2[1]) * sext(vec1[1])
1979  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1980  %conv3 = sext i16 %s1.elt2 to i32
1981  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
1982  %conv4 = sext i16 %s2.elt2 to i32
1983  %mul2 = mul i32 %conv4, %conv3
1984
  ; add1 = mul2 + *dst; add2 = add1 + mul1.
1985  %s3 = load i32, i32 addrspace(1)* %dst, align 4
1986  %add1 = add i32 %mul2, %s3
1987  %add2 = add i32 %add1, %mul1
1988
  ; Second use of %add1: the stored result is add2 + add1.
1989  %res = add i32 %add2, %add1
1990  store i32 %res, i32 addrspace(1)* %dst, align 4
1991  ret void
1992}
1993
1994define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1,
1995; GFX7-LABEL: udot2_MultipleUses_mul1:
1996; GFX7:       ; %bb.0: ; %entry
1997; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1998; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1999; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2000; GFX7-NEXT:    s_mov_b32 s10, 0
2001; GFX7-NEXT:    s_mov_b32 s11, s3
2002; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2003; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
2004; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2005; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2006; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2007; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
2008; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2009; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
2010; GFX7-NEXT:    s_mov_b32 s4, 0xffff
2011; GFX7-NEXT:    s_mov_b32 s2, -1
2012; GFX7-NEXT:    s_waitcnt vmcnt(1)
2013; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
2014; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
2015; GFX7-NEXT:    s_waitcnt vmcnt(0)
2016; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
2017; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
2018; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2019; GFX7-NEXT:    v_mad_u32_u24 v4, v0, v2, s5
2020; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, v4
2021; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
2022; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2023; GFX7-NEXT:    s_endpgm
2024;
2025; GFX8-LABEL: udot2_MultipleUses_mul1:
2026; GFX8:       ; %bb.0: ; %entry
2027; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2028; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2029; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2030; GFX8-NEXT:    s_mov_b32 s2, 0xffff
2031; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2032; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2033; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2034; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2035; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2036; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2037; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
2038; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2039; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2040; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
2041; GFX8-NEXT:    s_waitcnt vmcnt(1)
2042; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
2043; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
2044; GFX8-NEXT:    s_waitcnt vmcnt(0)
2045; GFX8-NEXT:    v_and_b32_e32 v2, s2, v0
2046; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2047; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2048; GFX8-NEXT:    v_mad_u32_u24 v4, v2, v1, s3
2049; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, v4
2050; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v1, v0
2051; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2052; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2053; GFX8-NEXT:    flat_store_dword v[0:1], v2
2054; GFX8-NEXT:    s_endpgm
2055;
2056; GFX9-NODL-LABEL: udot2_MultipleUses_mul1:
2057; GFX9-NODL:       ; %bb.0: ; %entry
2058; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2059; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2060; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2061; GFX9-NODL-NEXT:    s_mov_b32 s0, 0xffff
2062; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2063; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
2064; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
2065; GFX9-NODL-NEXT:    s_load_dword s1, s[2:3], 0x0
2066; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2067; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
2068; GFX9-NODL-NEXT:    v_and_b32_e32 v3, s0, v1
2069; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2070; GFX9-NODL-NEXT:    v_and_b32_e32 v4, s0, v2
2071; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2072; GFX9-NODL-NEXT:    v_mul_u32_u24_e32 v2, v4, v3
2073; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2074; GFX9-NODL-NEXT:    v_mad_u32_u24 v3, v4, v3, s1
2075; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v3, v2
2076; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
2077; GFX9-NODL-NEXT:    s_endpgm
2078;
2079; GFX9-DL-LABEL: udot2_MultipleUses_mul1:
2080; GFX9-DL:       ; %bb.0: ; %entry
2081; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2082; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2083; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2084; GFX9-DL-NEXT:    s_mov_b32 s0, 0xffff
2085; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2086; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2087; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2088; GFX9-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
2089; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2090; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2091; GFX9-DL-NEXT:    v_and_b32_e32 v3, s0, v1
2092; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2093; GFX9-DL-NEXT:    v_and_b32_e32 v4, s0, v2
2094; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2095; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v2, v4, v3
2096; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2097; GFX9-DL-NEXT:    v_mad_u32_u24 v3, v4, v3, s1
2098; GFX9-DL-NEXT:    v_add3_u32 v1, v1, v3, v2
2099; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
2100; GFX9-DL-NEXT:    s_endpgm
2101;
2102; GFX10-DL-LABEL: udot2_MultipleUses_mul1:
2103; GFX10-DL:       ; %bb.0: ; %entry
2104; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2105; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2106; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2107; GFX10-DL-NEXT:    s_mov_b32 s3, 0xffff
2108; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2109; GFX10-DL-NEXT:    s_clause 0x1
2110; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2111; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2112; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
2113; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2114; GFX10-DL-NEXT:    v_and_b32_e32 v0, s3, v1
2115; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2116; GFX10-DL-NEXT:    v_and_b32_e32 v3, s3, v2
2117; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2118; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v2, v3, v0
2119; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2120; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v3, v0, s2
2121; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
2122; GFX10-DL-NEXT:    v_add3_u32 v0, v1, v0, v2
2123; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
2124; GFX10-DL-NEXT:    s_endpgm
2125                                                   <2 x i16> addrspace(1)* %src2,
2126                                                   i32 addrspace(1)* nocapture %dst) {
2127entry:
2128  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2129  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
2130  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
2131  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
2132  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
2133
2134  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2135  %conv = zext i16 %s1.elt1 to i32
2136  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2137  %conv2 = zext i16 %s2.elt1 to i32
2138  %mul1 = mul i32 %conv2, %conv
2139
2140  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2141  %conv3 = zext i16 %s1.elt2 to i32
2142  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2143  %conv4 = zext i16 %s2.elt2 to i32
2144  %mul2 = mul i32 %conv4, %conv3
2145
2146  %s3 = load i32, i32 addrspace(1)* %dst, align 4
2147  %add0 = add i32 %mul1, %s3
2148
2149  %add1 = add i32 %mul2, %add0
2150  %add2 = add i32 %add1, %mul1
2151
2152  store i32 %add2, i32 addrspace(1)* %dst, align 4
2153  ret void
2154}
2155
; Signed 2 x i16 multiply-accumulate in which %mul1 (the lane-0 product) has
; two users: it feeds %add0 and is added again in %add2, so the value stored
; to %dst is %mul2 + 2 * %mul1 + %s3.
; None of the check blocks below, including the ones with a -DL prefix,
; contain a v_dot2 instruction; they expect v_mad_i32_i24 sequences (with
; v_mul_i32_i24 and v_add3_u32 on the GFX9 and GFX10 prefixes) instead.
; NOTE(review): presumably the second use of %mul1 is what is meant to block
; the dot2 fold described at the top of the file -- confirm against the
; backend combine.
2156define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1,
2157; GFX7-LABEL: idot2_MultipleUses_mul1:
2158; GFX7:       ; %bb.0: ; %entry
2159; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2160; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2161; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2162; GFX7-NEXT:    s_mov_b32 s10, 0
2163; GFX7-NEXT:    s_mov_b32 s11, s3
2164; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2165; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
2166; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2167; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2168; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2169; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
2170; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2171; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
2172; GFX7-NEXT:    s_mov_b32 s2, -1
2173; GFX7-NEXT:    s_waitcnt vmcnt(1)
2174; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 16
2175; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
2176; GFX7-NEXT:    s_waitcnt vmcnt(0)
2177; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 16
2178; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2179; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2180; GFX7-NEXT:    v_mad_i32_i24 v4, v3, v1, s4
2181; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, v4
2182; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v1, v0
2183; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2184; GFX7-NEXT:    s_endpgm
2185;
2186; GFX8-LABEL: idot2_MultipleUses_mul1:
2187; GFX8:       ; %bb.0: ; %entry
2188; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2189; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2190; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2191; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2192; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2193; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2194; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2195; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2196; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2197; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
2198; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2199; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2200; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
2201; GFX8-NEXT:    s_waitcnt vmcnt(1)
2202; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
2203; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
2204; GFX8-NEXT:    s_waitcnt vmcnt(0)
2205; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 16
2206; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2207; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2208; GFX8-NEXT:    v_mad_i32_i24 v4, v2, v1, s2
2209; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v3, v4
2210; GFX8-NEXT:    v_mad_i32_i24 v2, v2, v1, v0
2211; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2212; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2213; GFX8-NEXT:    flat_store_dword v[0:1], v2
2214; GFX8-NEXT:    s_endpgm
2215;
2216; GFX9-NODL-LABEL: idot2_MultipleUses_mul1:
2217; GFX9-NODL:       ; %bb.0: ; %entry
2218; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2219; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2220; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2221; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2222; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
2223; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
2224; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
2225; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2226; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
2227; GFX9-NODL-NEXT:    v_bfe_i32 v3, v1, 0, 16
2228; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2229; GFX9-NODL-NEXT:    v_bfe_i32 v4, v2, 0, 16
2230; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2231; GFX9-NODL-NEXT:    v_mul_i32_i24_e32 v2, v4, v3
2232; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2233; GFX9-NODL-NEXT:    v_mad_i32_i24 v3, v4, v3, s0
2234; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, v3, v2
2235; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
2236; GFX9-NODL-NEXT:    s_endpgm
2237;
2238; GFX9-DL-LABEL: idot2_MultipleUses_mul1:
2239; GFX9-DL:       ; %bb.0: ; %entry
2240; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2241; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2242; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2243; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2244; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2245; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2246; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
2247; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2248; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2249; GFX9-DL-NEXT:    v_bfe_i32 v3, v1, 0, 16
2250; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2251; GFX9-DL-NEXT:    v_bfe_i32 v4, v2, 0, 16
2252; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2253; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v2, v4, v3
2254; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2255; GFX9-DL-NEXT:    v_mad_i32_i24 v3, v4, v3, s0
2256; GFX9-DL-NEXT:    v_add3_u32 v1, v1, v3, v2
2257; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
2258; GFX9-DL-NEXT:    s_endpgm
2259;
2260; GFX10-DL-LABEL: idot2_MultipleUses_mul1:
2261; GFX10-DL:       ; %bb.0: ; %entry
2262; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2263; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2264; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2265; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2266; GFX10-DL-NEXT:    s_clause 0x1
2267; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2268; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2269; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
2270; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2271; GFX10-DL-NEXT:    v_bfe_i32 v0, v1, 0, 16
2272; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2273; GFX10-DL-NEXT:    v_bfe_i32 v3, v2, 0, 16
2274; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2275; GFX10-DL-NEXT:    v_mul_i32_i24_e32 v2, v3, v0
2276; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2277; GFX10-DL-NEXT:    v_mad_i32_i24 v0, v3, v0, s2
2278; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
2279; GFX10-DL-NEXT:    v_add3_u32 v0, v1, v0, v2
2280; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
2281; GFX10-DL-NEXT:    s_endpgm
2282                                                   <2 x i16> addrspace(1)* %src2,
2283                                                   i32 addrspace(1)* nocapture %dst) {
2284entry:
  ; Each work item loads the <2 x i16> at its own index from both sources.
2285  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2286  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
2287  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
2288  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
2289  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
2290
  ; Lane 0 of both vectors, sign-extended to i32 and multiplied.
2291  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2292  %conv = sext i16 %s1.elt1 to i32
2293  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2294  %conv2 = sext i16 %s2.elt1 to i32
2295  %mul1 = mul i32 %conv2, %conv
2296
  ; Lane 1 of both vectors, sign-extended to i32 and multiplied.
2297  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2298  %conv3 = sext i16 %s1.elt2 to i32
2299  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2300  %conv4 = sext i16 %s2.elt2 to i32
2301  %mul2 = mul i32 %conv4, %conv3
2302
  ; The accumulator is read from %dst; this add is the first user of %mul1.
2303  %s3 = load i32, i32 addrspace(1)* %dst, align 4
2304  %add0 = add i32 %mul1, %s3
2305
2306  %add1 = add i32 %mul2, %add0
  ; Second user of %mul1.
2307  %add2 = add i32 %add1, %mul1
2308
2309  store i32 %add2, i32 addrspace(1)* %dst, align 4
2310  ret void
2311}
2312
; Unsigned 2 x i16 multiply-accumulate in which %mul2 (the lane-1 product) has
; two users: it feeds both %add0 and %add1, so the value stored to %dst is
; %mul1 + 2 * %mul2 + %s3.
; None of the check blocks below, including the ones with a -DL prefix,
; contain a v_dot2 instruction; they expect v_mad_u32_u24 sequences (with
; v_mul_u32_u24 and v_add3_u32 on the GFX9 and GFX10 prefixes) instead.
; NOTE(review): presumably the second use of %mul2 is what is meant to block
; the dot2 fold described at the top of the file -- confirm against the
; backend combine.
2313define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1,
2314; GFX7-LABEL: udot2_MultipleUses_mul2:
2315; GFX7:       ; %bb.0: ; %entry
2316; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2317; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2318; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2319; GFX7-NEXT:    s_mov_b32 s10, 0
2320; GFX7-NEXT:    s_mov_b32 s11, s3
2321; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2322; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
2323; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2324; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2325; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2326; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
2327; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2328; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x0
2329; GFX7-NEXT:    s_mov_b32 s4, 0xffff
2330; GFX7-NEXT:    s_mov_b32 s2, -1
2331; GFX7-NEXT:    s_waitcnt vmcnt(1)
2332; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
2333; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
2334; GFX7-NEXT:    s_waitcnt vmcnt(0)
2335; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
2336; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2337; GFX7-NEXT:    v_mad_u32_u24 v4, v3, v1, s5
2338; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
2339; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v1, v4
2340; GFX7-NEXT:    v_mad_u32_u24 v0, v0, v2, v1
2341; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2342; GFX7-NEXT:    s_endpgm
2343;
2344; GFX8-LABEL: udot2_MultipleUses_mul2:
2345; GFX8:       ; %bb.0: ; %entry
2346; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2347; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2348; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2349; GFX8-NEXT:    s_mov_b32 s2, 0xffff
2350; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2351; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2352; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2353; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2354; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2355; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2356; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
2357; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2358; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2359; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x0
2360; GFX8-NEXT:    s_waitcnt vmcnt(1)
2361; GFX8-NEXT:    v_and_b32_e32 v1, s2, v3
2362; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
2363; GFX8-NEXT:    s_waitcnt vmcnt(0)
2364; GFX8-NEXT:    v_and_b32_e32 v2, s2, v0
2365; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2366; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2367; GFX8-NEXT:    v_mad_u32_u24 v4, v0, v3, s3
2368; GFX8-NEXT:    v_mad_u32_u24 v0, v0, v3, v4
2369; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v1, v0
2370; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2371; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2372; GFX8-NEXT:    flat_store_dword v[0:1], v2
2373; GFX8-NEXT:    s_endpgm
2374;
2375; GFX9-NODL-LABEL: udot2_MultipleUses_mul2:
2376; GFX9-NODL:       ; %bb.0: ; %entry
2377; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2378; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2379; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2380; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2381; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
2382; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
2383; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
2384; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2385; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2386; GFX9-NODL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2387; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2388; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
2389; GFX9-NODL-NEXT:    v_mul_u32_u24_e32 v4, v2, v1
2390; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2391; GFX9-NODL-NEXT:    v_mad_u32_u24 v1, v2, v1, s0
2392; GFX9-NODL-NEXT:    v_add3_u32 v1, v4, v1, v3
2393; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
2394; GFX9-NODL-NEXT:    s_endpgm
2395;
2396; GFX9-DL-LABEL: udot2_MultipleUses_mul2:
2397; GFX9-DL:       ; %bb.0: ; %entry
2398; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2399; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2400; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2401; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2402; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2403; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2404; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
2405; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2406; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2407; GFX9-DL-NEXT:    v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2408; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2409; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
2410; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v4, v2, v1
2411; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2412; GFX9-DL-NEXT:    v_mad_u32_u24 v1, v2, v1, s0
2413; GFX9-DL-NEXT:    v_add3_u32 v1, v4, v1, v3
2414; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
2415; GFX9-DL-NEXT:    s_endpgm
2416;
2417; GFX10-DL-LABEL: udot2_MultipleUses_mul2:
2418; GFX10-DL:       ; %bb.0: ; %entry
2419; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2420; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2421; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2422; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2423; GFX10-DL-NEXT:    s_clause 0x1
2424; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2425; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2426; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
2427; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2428; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
2429; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2430; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
2431; GFX10-DL-NEXT:    v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2432; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v2, v3, v0
2433; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2434; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v3, v0, s2
2435; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
2436; GFX10-DL-NEXT:    v_add3_u32 v0, v2, v0, v1
2437; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
2438; GFX10-DL-NEXT:    s_endpgm
2439                                                   <2 x i16> addrspace(1)* %src2,
2440                                                   i32 addrspace(1)* nocapture %dst) {
2441entry:
  ; Each work item loads the <2 x i16> at its own index from both sources.
2442  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2443  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
2444  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
2445  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
2446  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
2447
  ; Lane 0 of both vectors, zero-extended to i32 and multiplied.
2448  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2449  %conv = zext i16 %s1.elt1 to i32
2450  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2451  %conv2 = zext i16 %s2.elt1 to i32
2452  %mul1 = mul i32 %conv2, %conv
2453
  ; Lane 1 of both vectors, zero-extended to i32 and multiplied.
2454  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2455  %conv3 = zext i16 %s1.elt2 to i32
2456  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2457  %conv4 = zext i16 %s2.elt2 to i32
2458  %mul2 = mul i32 %conv4, %conv3
2459
  ; The accumulator is read from %dst; this add is the first user of %mul2.
2460  %s3 = load i32, i32 addrspace(1)* %dst, align 4
2461  %add0 = add i32 %mul2, %s3
2462
  ; Second user of %mul2.
2463  %add1 = add i32 %mul2, %add0
2464  %add2 = add i32 %add1, %mul1
2465
2466  store i32 %add2, i32 addrspace(1)* %dst, align 4
2467  ret void
2468}
2469
; Signed 2 x i16 multiply-accumulate in which %mul2 (the lane-1 product) has
; two users: it feeds both %add0 and %add1, so the value stored to %dst is
; %mul1 + 2 * %mul2 + %s3.
; None of the check blocks below, including the ones with a -DL prefix,
; contain a v_dot2 instruction; they expect v_mad_i32_i24 sequences (with
; v_mul_i32_i24 and v_add3_u32 on the GFX9 and GFX10 prefixes) instead.
; NOTE(review): presumably the second use of %mul2 is what is meant to block
; the dot2 fold described at the top of the file -- confirm against the
; backend combine.
2470define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1,
2471; GFX7-LABEL: idot2_MultipleUses_mul2:
2472; GFX7:       ; %bb.0: ; %entry
2473; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2474; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2475; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2476; GFX7-NEXT:    s_mov_b32 s10, 0
2477; GFX7-NEXT:    s_mov_b32 s11, s3
2478; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2479; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
2480; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2481; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2482; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2483; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
2484; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2485; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
2486; GFX7-NEXT:    s_mov_b32 s2, -1
2487; GFX7-NEXT:    s_waitcnt vmcnt(1)
2488; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 16
2489; GFX7-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
2490; GFX7-NEXT:    s_waitcnt vmcnt(0)
2491; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 16
2492; GFX7-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2493; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2494; GFX7-NEXT:    v_mad_i32_i24 v4, v0, v2, s4
2495; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, v4
2496; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v1, v0
2497; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2498; GFX7-NEXT:    s_endpgm
2499;
2500; GFX8-LABEL: idot2_MultipleUses_mul2:
2501; GFX8:       ; %bb.0: ; %entry
2502; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2503; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2504; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2505; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2506; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2507; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2508; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2509; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2510; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2511; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
2512; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2513; GFX8-NEXT:    flat_load_dword v0, v[0:1]
2514; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
2515; GFX8-NEXT:    s_waitcnt vmcnt(1)
2516; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 16
2517; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
2518; GFX8-NEXT:    s_waitcnt vmcnt(0)
2519; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 16
2520; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
2521; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2522; GFX8-NEXT:    v_mad_i32_i24 v4, v0, v3, s2
2523; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v3, v4
2524; GFX8-NEXT:    v_mad_i32_i24 v2, v2, v1, v0
2525; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2526; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2527; GFX8-NEXT:    flat_store_dword v[0:1], v2
2528; GFX8-NEXT:    s_endpgm
2529;
2530; GFX9-NODL-LABEL: idot2_MultipleUses_mul2:
2531; GFX9-NODL:       ; %bb.0: ; %entry
2532; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2533; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2534; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2535; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2536; GFX9-NODL-NEXT:    global_load_dword v1, v0, s[4:5]
2537; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[6:7]
2538; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
2539; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2540; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2541; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2542; GFX9-NODL-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
2543; GFX9-NODL-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
2544; GFX9-NODL-NEXT:    v_mul_i32_i24_e32 v4, v2, v1
2545; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2546; GFX9-NODL-NEXT:    v_mad_i32_i24 v1, v2, v1, s0
2547; GFX9-NODL-NEXT:    v_add3_u32 v1, v4, v1, v3
2548; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
2549; GFX9-NODL-NEXT:    s_endpgm
2550;
2551; GFX9-DL-LABEL: idot2_MultipleUses_mul2:
2552; GFX9-DL:       ; %bb.0: ; %entry
2553; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2554; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2555; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2556; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2557; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2558; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2559; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
2560; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2561; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2562; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2563; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
2564; GFX9-DL-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
2565; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v4, v2, v1
2566; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2567; GFX9-DL-NEXT:    v_mad_i32_i24 v1, v2, v1, s0
2568; GFX9-DL-NEXT:    v_add3_u32 v1, v4, v1, v3
2569; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
2570; GFX9-DL-NEXT:    s_endpgm
2571;
2572; GFX10-DL-LABEL: idot2_MultipleUses_mul2:
2573; GFX10-DL:       ; %bb.0: ; %entry
2574; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2575; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2576; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2577; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2578; GFX10-DL-NEXT:    s_clause 0x1
2579; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
2580; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
2581; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
2582; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2583; GFX10-DL-NEXT:    v_ashrrev_i32_e32 v0, 16, v1
2584; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2585; GFX10-DL-NEXT:    v_ashrrev_i32_e32 v3, 16, v2
2586; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
2587; GFX10-DL-NEXT:    v_mul_i32_i24_e32 v2, v3, v0
2588; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2589; GFX10-DL-NEXT:    v_mad_i32_i24 v0, v3, v0, s2
2590; GFX10-DL-NEXT:    v_mov_b32_e32 v3, 0
2591; GFX10-DL-NEXT:    v_add3_u32 v0, v2, v0, v1
2592; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
2593; GFX10-DL-NEXT:    s_endpgm
2594                                                   <2 x i16> addrspace(1)* %src2,
2595                                                   i32 addrspace(1)* nocapture %dst) {
2596entry:
  ; Each work item loads the <2 x i16> at its own index from both sources.
2597  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2598  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
2599  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
2600  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
2601  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
2602
  ; Lane 0 of both vectors, sign-extended to i32 and multiplied.
2603  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2604  %conv = sext i16 %s1.elt1 to i32
2605  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2606  %conv2 = sext i16 %s2.elt1 to i32
2607  %mul1 = mul i32 %conv2, %conv
2608
  ; Lane 1 of both vectors, sign-extended to i32 and multiplied.
2609  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2610  %conv3 = sext i16 %s1.elt2 to i32
2611  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2612  %conv4 = sext i16 %s2.elt2 to i32
2613  %mul2 = mul i32 %conv4, %conv3
2614
  ; The accumulator is read from %dst; this add is the first user of %mul2.
2615  %s3 = load i32, i32 addrspace(1)* %dst, align 4
2616  %add0 = add i32 %mul2, %s3
2617
  ; Second user of %mul2.
2618  %add1 = add i32 %mul2, %add0
2619  %add2 = add i32 %add1, %mul1
2620
2621  store i32 %add2, i32 addrspace(1)* %dst, align 4
2622  ret void
2623}
2624
; Two-lane i16 multiply-accumulate carried out entirely in i16: the lanes are
; multiplied without being extended, and the accumulator is an i16 that is
; loaded from and stored back to %dst.
; None of the check blocks below contain a v_dot2 instruction: the GFX7 block
; expects v_mad_u32_u24, the GFX8 and GFX10-DL blocks v_mad_u16, and the two
; GFX9 blocks v_mad_legacy_u16.
; NOTE(review): presumably the 16-bit accumulator is what keeps this from
; matching the dot2 pattern described at the top of the file -- confirm
; against the backend combine.
2625define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
2626; GFX7-LABEL: udot2_acc16:
2627; GFX7:       ; %bb.0: ; %entry
2628; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2629; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2630; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2631; GFX7-NEXT:    s_mov_b32 s10, 0
2632; GFX7-NEXT:    s_mov_b32 s11, s3
2633; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2634; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
2635; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2636; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2637; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
2638; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
2639; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2640; GFX7-NEXT:    s_mov_b32 s2, -1
2641; GFX7-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
2642; GFX7-NEXT:    s_mov_b32 s4, 0xffff
2643; GFX7-NEXT:    s_waitcnt vmcnt(2)
2644; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
2645; GFX7-NEXT:    v_and_b32_e32 v2, s4, v2
2646; GFX7-NEXT:    s_waitcnt vmcnt(1)
2647; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
2648; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
2649; GFX7-NEXT:    s_waitcnt vmcnt(0)
2650; GFX7-NEXT:    v_mad_u32_u24 v1, v3, v4, v1
2651; GFX7-NEXT:    v_mad_u32_u24 v0, v2, v0, v1
2652; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
2653; GFX7-NEXT:    s_endpgm
2654;
2655; GFX8-LABEL: udot2_acc16:
2656; GFX8:       ; %bb.0: ; %entry
2657; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2658; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2659; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
2660; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2661; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2662; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2663; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2664; GFX8-NEXT:    flat_load_dword v3, v[0:1]
2665; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2666; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
2667; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2668; GFX8-NEXT:    flat_load_dword v2, v[0:1]
2669; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2670; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2671; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
2672; GFX8-NEXT:    s_waitcnt vmcnt(2)
2673; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
2674; GFX8-NEXT:    s_waitcnt vmcnt(1)
2675; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
2676; GFX8-NEXT:    s_waitcnt vmcnt(0)
2677; GFX8-NEXT:    v_mad_u16 v4, v5, v6, v4
2678; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
2679; GFX8-NEXT:    flat_store_short v[0:1], v2
2680; GFX8-NEXT:    s_endpgm
2681;
2682; GFX9-NODL-LABEL: udot2_acc16:
2683; GFX9-NODL:       ; %bb.0: ; %entry
2684; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2685; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2686; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2687; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, 0
2688; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2689; GFX9-NODL-NEXT:    global_load_dword v2, v0, s[4:5]
2690; GFX9-NODL-NEXT:    global_load_dword v3, v0, s[6:7]
2691; GFX9-NODL-NEXT:    global_load_ushort v4, v1, s[2:3]
2692; GFX9-NODL-NEXT:    s_waitcnt vmcnt(2)
2693; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
2694; GFX9-NODL-NEXT:    s_waitcnt vmcnt(1)
2695; GFX9-NODL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
2696; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2697; GFX9-NODL-NEXT:    v_mad_legacy_u16 v0, v0, v5, v4
2698; GFX9-NODL-NEXT:    v_mad_legacy_u16 v0, v2, v3, v0
2699; GFX9-NODL-NEXT:    global_store_short v1, v0, s[2:3]
2700; GFX9-NODL-NEXT:    s_endpgm
2701;
2702; GFX9-DL-LABEL: udot2_acc16:
2703; GFX9-DL:       ; %bb.0: ; %entry
2704; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2705; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2706; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2707; GFX9-DL-NEXT:    v_mov_b32_e32 v1, 0
2708; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2709; GFX9-DL-NEXT:    global_load_dword v2, v0, s[4:5]
2710; GFX9-DL-NEXT:    global_load_dword v3, v0, s[6:7]
2711; GFX9-DL-NEXT:    global_load_ushort v4, v1, s[2:3]
2712; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
2713; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
2714; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
2715; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
2716; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2717; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v0, v5, v4
2718; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v2, v3, v0
2719; GFX9-DL-NEXT:    global_store_short v1, v0, s[2:3]
2720; GFX9-DL-NEXT:    s_endpgm
2721;
2722; GFX10-DL-LABEL: udot2_acc16:
2723; GFX10-DL:       ; %bb.0: ; %entry
2724; GFX10-DL-NEXT:    s_clause 0x1
2725; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2726; GFX10-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2727; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2728; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
2729; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2730; GFX10-DL-NEXT:    s_clause 0x1
2731; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
2732; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
2733; GFX10-DL-NEXT:    global_load_ushort v4, v1, s[2:3]
2734; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
2735; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
2736; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2737; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
2738; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2739; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
2740; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
2741; GFX10-DL-NEXT:    global_store_short v1, v0, s[2:3]
2742; GFX10-DL-NEXT:    s_endpgm
2743                                       <2 x i16> addrspace(1)* %src2,
2744                                       i16 addrspace(1)* nocapture %dst) {
2745entry:
  ; Each work item loads the <2 x i16> at its own index from both sources.
2746  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2747  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src1, i32 %idx
2748  %v1 = load <2 x i16>, <2 x i16> addrspace(1)* %gep1
2749  %gep2 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %src2, i32 %idx
2750  %v2 = load <2 x i16>, <2 x i16> addrspace(1)* %gep2
2751
  ; Lane-0 product, computed directly in i16.
2752  %v1e1 = extractelement <2 x i16> %v1, i64 0
2753  %v2e1 = extractelement <2 x i16> %v2, i64 0
2754  %mul1 = mul i16 %v1e1, %v2e1
2755
  ; Lane-1 product, computed directly in i16.
2756  %v1e2 = extractelement <2 x i16> %v1, i64 1
2757  %v2e2 = extractelement <2 x i16> %v2, i64 1
2758  %mul2 = mul i16 %v1e2, %v2e2
2759
  ; i16 accumulator: %dst is read, both products are added, and the sum is
  ; written back to the same address.
2760  %s2 = load i16, i16 addrspace(1)* %dst, align 2
2761  %add1 = add i16 %mul2, %s2
2762  %add2 = add i16 %add1, %mul1
2763  store i16 %add2, i16 addrspace(1)* %dst, align 2
2764  ret void
2765}
2766
; notsdot2_sext8: two-element multiply-accumulate over <2 x i8> inputs.
; Each element of %src1[idx] and %src2[idx] is sign-extended from i8 to i32,
; the elements are multiplied pairwise, and both products are added to the
; i32 loaded from %dst; the sum is then stored back to %dst.
; The autogenerated assertions for every run configuration in this function
; expect separate multiply/add (or mad) instructions and no v_dot2 instruction.
; NOTE(review): presumably the "not" in the name reflects that the sources are
; sign-extended i8 rather than the i16 operands of v_dot2_i32_i16 -- confirm.
2767define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
2768; GFX7-LABEL: notsdot2_sext8:
2769; GFX7:       ; %bb.0: ; %entry
2770; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2771; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2772; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2773; GFX7-NEXT:    s_mov_b32 s10, 0
2774; GFX7-NEXT:    s_mov_b32 s11, s3
2775; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2776; GFX7-NEXT:    s_mov_b64 s[8:9], s[4:5]
2777; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2778; GFX7-NEXT:    v_mov_b32_e32 v1, 0
2779; GFX7-NEXT:    buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64
2780; GFX7-NEXT:    s_mov_b64 s[8:9], s[6:7]
2781; GFX7-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
2782; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
2783; GFX7-NEXT:    s_mov_b32 s2, -1
2784; GFX7-NEXT:    s_waitcnt vmcnt(1)
2785; GFX7-NEXT:    v_bfe_i32 v1, v2, 0, 8
2786; GFX7-NEXT:    v_bfe_i32 v2, v2, 8, 8
2787; GFX7-NEXT:    s_waitcnt vmcnt(0)
2788; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 8
2789; GFX7-NEXT:    v_bfe_i32 v0, v0, 8, 8
2790; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2791; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v2, s4
2792; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v1, v0
2793; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2794; GFX7-NEXT:    s_endpgm
2795;
2796; GFX8-LABEL: notsdot2_sext8:
2797; GFX8:       ; %bb.0: ; %entry
2798; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2799; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2800; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
2801; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2802; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2803; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
2804; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2805; GFX8-NEXT:    flat_load_ushort v3, v[0:1]
2806; GFX8-NEXT:    v_mov_b32_e32 v1, s7
2807; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
2808; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2809; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
2810; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x0
2811; GFX8-NEXT:    s_waitcnt vmcnt(1)
2812; GFX8-NEXT:    v_bfe_i32 v1, v3, 0, 8
2813; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 8, v3
2814; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 8
2815; GFX8-NEXT:    s_waitcnt vmcnt(0)
2816; GFX8-NEXT:    v_bfe_i32 v2, v0, 0, 8
2817; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
2818; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 8
2819; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2820; GFX8-NEXT:    v_mad_i32_i24 v0, v0, v3, s2
2821; GFX8-NEXT:    v_mad_i32_i24 v2, v2, v1, v0
2822; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2823; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2824; GFX8-NEXT:    flat_store_dword v[0:1], v2
2825; GFX8-NEXT:    s_endpgm
2826;
2827; GFX9-NODL-LABEL: notsdot2_sext8:
2828; GFX9-NODL:       ; %bb.0: ; %entry
2829; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2830; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2831; GFX9-NODL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2832; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2833; GFX9-NODL-NEXT:    global_load_ushort v1, v0, s[4:5]
2834; GFX9-NODL-NEXT:    global_load_ushort v2, v0, s[6:7]
2835; GFX9-NODL-NEXT:    s_load_dword s0, s[2:3], 0x0
2836; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, 0
2837; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
2838; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2839; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
2840; GFX9-NODL-NEXT:    v_lshrrev_b16_e32 v2, 8, v2
2841; GFX9-NODL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2842; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
2843; GFX9-NODL-NEXT:    v_add3_u32 v1, v1, s0, v3
2844; GFX9-NODL-NEXT:    global_store_dword v0, v1, s[2:3]
2845; GFX9-NODL-NEXT:    s_endpgm
2846;
2847; GFX9-DL-LABEL: notsdot2_sext8:
2848; GFX9-DL:       ; %bb.0: ; %entry
2849; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2850; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2851; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2852; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2853; GFX9-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
2854; GFX9-DL-NEXT:    global_load_ushort v2, v0, s[6:7]
2855; GFX9-DL-NEXT:    s_load_dword s0, s[2:3], 0x0
2856; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
2857; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
2858; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2859; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v1, 8, v1
2860; GFX9-DL-NEXT:    v_lshrrev_b16_e32 v2, 8, v2
2861; GFX9-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2862; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
2863; GFX9-DL-NEXT:    v_add3_u32 v1, v1, s0, v3
2864; GFX9-DL-NEXT:    global_store_dword v0, v1, s[2:3]
2865; GFX9-DL-NEXT:    s_endpgm
2866;
2867; GFX10-DL-LABEL: notsdot2_sext8:
2868; GFX10-DL:       ; %bb.0: ; %entry
2869; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2870; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2871; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2872; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2873; GFX10-DL-NEXT:    s_clause 0x1
2874; GFX10-DL-NEXT:    global_load_ushort v1, v0, s[4:5]
2875; GFX10-DL-NEXT:    global_load_ushort v2, v0, s[6:7]
2876; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
2877; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
2878; GFX10-DL-NEXT:    v_lshrrev_b16 v0, 8, v1
2879; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
2880; GFX10-DL-NEXT:    v_lshrrev_b16 v3, 8, v2
2881; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2882; GFX10-DL-NEXT:    v_mov_b32_e32 v2, 0
2883; GFX10-DL-NEXT:    v_mul_i32_i24_sdwa v0, sext(v3), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
2884; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
2885; GFX10-DL-NEXT:    v_add3_u32 v0, v0, s2, v1
2886; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
2887; GFX10-DL-NEXT:    s_endpgm
2888                                          <2 x i8> addrspace(1)* %src2,
2889                                          i32 addrspace(1)* nocapture %dst) {
2890entry:
  ; The work-item x id selects which <2 x i8> element of %src1 and %src2 is read.
2891  %idx = call i32 @llvm.amdgcn.workitem.id.x()
2892  %gep1 = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %src1, i32 %idx
2893  %vec1 = load <2 x i8>, <2 x i8> addrspace(1)* %gep1
2894  %gep2 = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %src2, i32 %idx
2895  %vec2 = load <2 x i8>, <2 x i8> addrspace(1)* %gep2
2896
  ; Product of element 0 of both vectors, each sign-extended to i32 first.
2897  %s1.elt1 = extractelement <2 x i8> %vec1, i64 0
2898  %conv = sext i8 %s1.elt1 to i32
2899  %s2.elt1 = extractelement <2 x i8> %vec2, i64 0
2900  %conv2 = sext i8 %s2.elt1 to i32
2901  %mul1 = mul nuw i32 %conv2, %conv
2902
  ; Product of element 1 of both vectors, each sign-extended to i32 first.
2903  %s1.elt2 = extractelement <2 x i8> %vec1, i64 1
2904  %conv3 = sext i8 %s1.elt2 to i32
2905  %s2.elt2 = extractelement <2 x i8> %vec2, i64 1
2906  %conv4 = sext i8 %s2.elt2 to i32
2907  %mul2 = mul nuw i32 %conv4, %conv3
2908
  ; Accumulate: %dst = (%mul2 + old %dst) + %mul1.
2909  %s3 = load i32, i32 addrspace(1)* %dst, align 4
2910  %add = add i32 %mul2, %s3
2911  %add6 = add i32 %add, %mul1
2912  store i32 %add6, i32 addrspace(1)* %dst, align 4
2913  ret void
2914}
2915
; Intrinsic called at the top of the kernels above; its i32 result is used as
; the getelementptr index into the %src1 and %src2 vector arrays.
2916declare i32 @llvm.amdgcn.workitem.id.x()
2917