1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
3; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
4; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 | FileCheck %s --check-prefix=SSE2
5
; Global byte buffers (1024 bytes each, zero-initialized, 16-byte aligned) used
; as tile memory by @test_api: @buf is read on the %if.then path and is the
; target of the final tilestored; @buf2 is read on the %if.else path.
6@buf = dso_local global [1024 x i8] zeroinitializer, align 16
7@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
8
9; Function Attrs: nounwind uwtable
; test_api: -O0 codegen test for x86_amx tile values that flow through phi
; nodes. Depending on %cond, three tiles are loaded from @buf (%if.then) or
; from @buf2 (%if.else); %if.end merges them with x86_amx phis, passes them to
; tdpbssd and stores the result to @buf.
;
; The expected assembly below has the same structure for the AVX512, AVX2 and
; SSE2 check prefixes; the runs differ in the zero stores that precede each
; ldtilecfg (one zmm store, two ymm stores, or four xmm stores), presumably
; clearing the tile configuration memory. In every run a ldtilecfg is expected
; before each group of tile instructions, and each tile loaded in %if.then /
; %if.else is written out with tilestored and read back with tileloadd in
; %if.end before tdpbssd.
;
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
10define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) local_unnamed_addr {
11; AVX512-LABEL: test_api:
12; AVX512:       # %bb.0: # %entry
13; AVX512-NEXT:    pushq %rbp
14; AVX512-NEXT:    .cfi_def_cfa_offset 16
15; AVX512-NEXT:    .cfi_offset %rbp, -16
16; AVX512-NEXT:    movq %rsp, %rbp
17; AVX512-NEXT:    .cfi_def_cfa_register %rbp
18; AVX512-NEXT:    andq $-1024, %rsp # imm = 0xFC00
19; AVX512-NEXT:    subq $6144, %rsp # imm = 0x1800
20; AVX512-NEXT:    movw %dx, %ax
21; AVX512-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
22; AVX512-NEXT:    movw %si, %ax
23; AVX512-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
24; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
25; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
26; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
27; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
28; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
29; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
30; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
31; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
32; AVX512-NEXT:    cmpl $0, %edi
33; AVX512-NEXT:    je .LBB0_2
34; AVX512-NEXT:  # %bb.1: # %if.then
35; AVX512-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
36; AVX512-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
37; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
38; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
39; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
40; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
41; AVX512-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
42; AVX512-NEXT:    movb $1, {{[0-9]+}}(%rsp)
43; AVX512-NEXT:    movb %al, %sil
44; AVX512-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
45; AVX512-NEXT:    movw $8, {{[0-9]+}}(%rsp)
46; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
47; AVX512-NEXT:    movl $buf, %r9d
48; AVX512-NEXT:    movl $32, %r10d
49; AVX512-NEXT:    movw $8, %si
50; AVX512-NEXT:    tileloadd (%r9,%r10), %tmm0
51; AVX512-NEXT:    movl $64, %r8d
52; AVX512-NEXT:    tilestored %tmm0, (%r11,%r8)
53; AVX512-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
54; AVX512-NEXT:    movb $1, {{[0-9]+}}(%rsp)
55; AVX512-NEXT:    movb $8, {{[0-9]+}}(%rsp)
56; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
57; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
58; AVX512-NEXT:    tileloadd (%r9,%r10), %tmm0
59; AVX512-NEXT:    tilestored %tmm0, (%rdi,%r8)
60; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
61; AVX512-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
62; AVX512-NEXT:    movb $1, {{[0-9]+}}(%rsp)
63; AVX512-NEXT:    movb %al, %dil
64; AVX512-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
65; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
66; AVX512-NEXT:    ldtilecfg (%rsi)
67; AVX512-NEXT:    movl $buf, %esi
68; AVX512-NEXT:    movl $32, %edi
69; AVX512-NEXT:    tileloadd (%rsi,%rdi), %tmm0
70; AVX512-NEXT:    movl $64, %esi
71; AVX512-NEXT:    tilestored %tmm0, (%rdx,%rsi)
72; AVX512-NEXT:    jmp .LBB0_3
73; AVX512-NEXT:  .LBB0_2: # %if.else
74; AVX512-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
75; AVX512-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
76; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
77; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
78; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
79; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
80; AVX512-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
81; AVX512-NEXT:    movb $1, {{[0-9]+}}(%rsp)
82; AVX512-NEXT:    movb %al, %sil
83; AVX512-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
84; AVX512-NEXT:    movw $8, {{[0-9]+}}(%rsp)
85; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
86; AVX512-NEXT:    movl $buf2, %r9d
87; AVX512-NEXT:    movl $32, %r10d
88; AVX512-NEXT:    movw $8, %si
89; AVX512-NEXT:    tileloadd (%r9,%r10), %tmm0
90; AVX512-NEXT:    movl $64, %r8d
91; AVX512-NEXT:    tilestored %tmm0, (%r11,%r8)
92; AVX512-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
93; AVX512-NEXT:    movb $1, {{[0-9]+}}(%rsp)
94; AVX512-NEXT:    movb $8, {{[0-9]+}}(%rsp)
95; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
96; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
97; AVX512-NEXT:    tileloadd (%r9,%r10), %tmm0
98; AVX512-NEXT:    tilestored %tmm0, (%rdi,%r8)
99; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
100; AVX512-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
101; AVX512-NEXT:    movb $1, {{[0-9]+}}(%rsp)
102; AVX512-NEXT:    movb %al, %dil
103; AVX512-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
104; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
105; AVX512-NEXT:    ldtilecfg (%rsi)
106; AVX512-NEXT:    movl $buf2, %esi
107; AVX512-NEXT:    movl $32, %edi
108; AVX512-NEXT:    tileloadd (%rsi,%rdi), %tmm0
109; AVX512-NEXT:    movl $64, %esi
110; AVX512-NEXT:    tilestored %tmm0, (%rdx,%rsi)
111; AVX512-NEXT:  .LBB0_3: # %if.end
112; AVX512-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
113; AVX512-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
114; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
115; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
116; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
117; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
118; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
119; AVX512-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
120; AVX512-NEXT:    movb $1, {{[0-9]+}}(%rsp)
121; AVX512-NEXT:    movb %al, %sil
122; AVX512-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
123; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
124; AVX512-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
125; AVX512-NEXT:    movw $8, {{[0-9]+}}(%rsp)
126; AVX512-NEXT:    movb $8, {{[0-9]+}}(%rsp)
127; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
128; AVX512-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
129; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
130; AVX512-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
131; AVX512-NEXT:    movl $64, %esi
132; AVX512-NEXT:    movw $8, %di
133; AVX512-NEXT:    tileloadd (%r10,%rsi), %tmm1
134; AVX512-NEXT:    tileloadd (%r9,%rsi), %tmm2
135; AVX512-NEXT:    tileloadd (%r8,%rsi), %tmm0
136; AVX512-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
137; AVX512-NEXT:    tilestored %tmm0, (%rdx,%rsi)
138; AVX512-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
139; AVX512-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
140; AVX512-NEXT:    movb $1, {{[0-9]+}}(%rsp)
141; AVX512-NEXT:    movb %al, %dil
142; AVX512-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
143; AVX512-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
144; AVX512-NEXT:    ldtilecfg (%rsi)
145; AVX512-NEXT:    movl $64, %esi
146; AVX512-NEXT:    tileloadd (%rdx,%rsi), %tmm0
147; AVX512-NEXT:    movl $buf, %edx
148; AVX512-NEXT:    movl $32, %esi
149; AVX512-NEXT:    tilestored %tmm0, (%rdx,%rsi)
150; AVX512-NEXT:    movq %rbp, %rsp
151; AVX512-NEXT:    popq %rbp
152; AVX512-NEXT:    .cfi_def_cfa %rsp, 8
153; AVX512-NEXT:    tilerelease
154; AVX512-NEXT:    vzeroupper
155; AVX512-NEXT:    retq
156;
157; AVX2-LABEL: test_api:
158; AVX2:       # %bb.0: # %entry
159; AVX2-NEXT:    pushq %rbp
160; AVX2-NEXT:    .cfi_def_cfa_offset 16
161; AVX2-NEXT:    .cfi_offset %rbp, -16
162; AVX2-NEXT:    movq %rsp, %rbp
163; AVX2-NEXT:    .cfi_def_cfa_register %rbp
164; AVX2-NEXT:    andq $-1024, %rsp # imm = 0xFC00
165; AVX2-NEXT:    subq $6144, %rsp # imm = 0x1800
166; AVX2-NEXT:    movw %dx, %ax
167; AVX2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
168; AVX2-NEXT:    movw %si, %ax
169; AVX2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
170; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
171; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
172; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
173; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
174; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
175; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
176; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
177; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
178; AVX2-NEXT:    cmpl $0, %edi
179; AVX2-NEXT:    je .LBB0_2
180; AVX2-NEXT:  # %bb.1: # %if.then
181; AVX2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
182; AVX2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
183; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
184; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
185; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
186; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
187; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
188; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
189; AVX2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
190; AVX2-NEXT:    movb %al, %sil
191; AVX2-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
192; AVX2-NEXT:    movw $8, {{[0-9]+}}(%rsp)
193; AVX2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
194; AVX2-NEXT:    movl $buf, %r9d
195; AVX2-NEXT:    movl $32, %r10d
196; AVX2-NEXT:    movw $8, %si
197; AVX2-NEXT:    tileloadd (%r9,%r10), %tmm0
198; AVX2-NEXT:    movl $64, %r8d
199; AVX2-NEXT:    tilestored %tmm0, (%r11,%r8)
200; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
201; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
202; AVX2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
203; AVX2-NEXT:    movb $8, {{[0-9]+}}(%rsp)
204; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
205; AVX2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
206; AVX2-NEXT:    tileloadd (%r9,%r10), %tmm0
207; AVX2-NEXT:    tilestored %tmm0, (%rdi,%r8)
208; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
209; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
210; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
211; AVX2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
212; AVX2-NEXT:    movb %al, %dil
213; AVX2-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
214; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
215; AVX2-NEXT:    ldtilecfg (%rsi)
216; AVX2-NEXT:    movl $buf, %esi
217; AVX2-NEXT:    movl $32, %edi
218; AVX2-NEXT:    tileloadd (%rsi,%rdi), %tmm0
219; AVX2-NEXT:    movl $64, %esi
220; AVX2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
221; AVX2-NEXT:    jmp .LBB0_3
222; AVX2-NEXT:  .LBB0_2: # %if.else
223; AVX2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
224; AVX2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
225; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
226; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
227; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
228; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
229; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
230; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
231; AVX2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
232; AVX2-NEXT:    movb %al, %sil
233; AVX2-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
234; AVX2-NEXT:    movw $8, {{[0-9]+}}(%rsp)
235; AVX2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
236; AVX2-NEXT:    movl $buf2, %r9d
237; AVX2-NEXT:    movl $32, %r10d
238; AVX2-NEXT:    movw $8, %si
239; AVX2-NEXT:    tileloadd (%r9,%r10), %tmm0
240; AVX2-NEXT:    movl $64, %r8d
241; AVX2-NEXT:    tilestored %tmm0, (%r11,%r8)
242; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
243; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
244; AVX2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
245; AVX2-NEXT:    movb $8, {{[0-9]+}}(%rsp)
246; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
247; AVX2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
248; AVX2-NEXT:    tileloadd (%r9,%r10), %tmm0
249; AVX2-NEXT:    tilestored %tmm0, (%rdi,%r8)
250; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
251; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
252; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
253; AVX2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
254; AVX2-NEXT:    movb %al, %dil
255; AVX2-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
256; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
257; AVX2-NEXT:    ldtilecfg (%rsi)
258; AVX2-NEXT:    movl $buf2, %esi
259; AVX2-NEXT:    movl $32, %edi
260; AVX2-NEXT:    tileloadd (%rsi,%rdi), %tmm0
261; AVX2-NEXT:    movl $64, %esi
262; AVX2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
263; AVX2-NEXT:  .LBB0_3: # %if.end
264; AVX2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
265; AVX2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
266; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
267; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
268; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
269; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
270; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
271; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
272; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
273; AVX2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
274; AVX2-NEXT:    movb %al, %sil
275; AVX2-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
276; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
277; AVX2-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
278; AVX2-NEXT:    movw $8, {{[0-9]+}}(%rsp)
279; AVX2-NEXT:    movb $8, {{[0-9]+}}(%rsp)
280; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
281; AVX2-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
282; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
283; AVX2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
284; AVX2-NEXT:    movl $64, %esi
285; AVX2-NEXT:    movw $8, %di
286; AVX2-NEXT:    tileloadd (%r10,%rsi), %tmm1
287; AVX2-NEXT:    tileloadd (%r9,%rsi), %tmm2
288; AVX2-NEXT:    tileloadd (%r8,%rsi), %tmm0
289; AVX2-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
290; AVX2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
291; AVX2-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
292; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
293; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
294; AVX2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
295; AVX2-NEXT:    movb %al, %dil
296; AVX2-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
297; AVX2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
298; AVX2-NEXT:    ldtilecfg (%rsi)
299; AVX2-NEXT:    movl $64, %esi
300; AVX2-NEXT:    tileloadd (%rdx,%rsi), %tmm0
301; AVX2-NEXT:    movl $buf, %edx
302; AVX2-NEXT:    movl $32, %esi
303; AVX2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
304; AVX2-NEXT:    movq %rbp, %rsp
305; AVX2-NEXT:    popq %rbp
306; AVX2-NEXT:    .cfi_def_cfa %rsp, 8
307; AVX2-NEXT:    tilerelease
308; AVX2-NEXT:    vzeroupper
309; AVX2-NEXT:    retq
310;
311; SSE2-LABEL: test_api:
312; SSE2:       # %bb.0: # %entry
313; SSE2-NEXT:    pushq %rbp
314; SSE2-NEXT:    .cfi_def_cfa_offset 16
315; SSE2-NEXT:    .cfi_offset %rbp, -16
316; SSE2-NEXT:    movq %rsp, %rbp
317; SSE2-NEXT:    .cfi_def_cfa_register %rbp
318; SSE2-NEXT:    andq $-1024, %rsp # imm = 0xFC00
319; SSE2-NEXT:    subq $6144, %rsp # imm = 0x1800
320; SSE2-NEXT:    movw %dx, %ax
321; SSE2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
322; SSE2-NEXT:    movw %si, %ax
323; SSE2-NEXT:    movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
324; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
325; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
326; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
327; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
328; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
329; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
330; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
331; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
332; SSE2-NEXT:    cmpl $0, %edi
333; SSE2-NEXT:    je .LBB0_2
334; SSE2-NEXT:  # %bb.1: # %if.then
335; SSE2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
336; SSE2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
337; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
338; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
339; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
340; SSE2-NEXT:    xorps %xmm0, %xmm0
341; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
342; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
343; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
344; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
345; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
346; SSE2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
347; SSE2-NEXT:    movb %al, %sil
348; SSE2-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
349; SSE2-NEXT:    movw $8, {{[0-9]+}}(%rsp)
350; SSE2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
351; SSE2-NEXT:    movl $buf, %r9d
352; SSE2-NEXT:    movl $32, %r10d
353; SSE2-NEXT:    movw $8, %si
354; SSE2-NEXT:    tileloadd (%r9,%r10), %tmm0
355; SSE2-NEXT:    movl $64, %r8d
356; SSE2-NEXT:    tilestored %tmm0, (%r11,%r8)
357; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
358; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
359; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
360; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
361; SSE2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
362; SSE2-NEXT:    movb $8, {{[0-9]+}}(%rsp)
363; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
364; SSE2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
365; SSE2-NEXT:    tileloadd (%r9,%r10), %tmm0
366; SSE2-NEXT:    tilestored %tmm0, (%rdi,%r8)
367; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
368; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
369; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
370; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
371; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
372; SSE2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
373; SSE2-NEXT:    movb %al, %dil
374; SSE2-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
375; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
376; SSE2-NEXT:    ldtilecfg (%rsi)
377; SSE2-NEXT:    movl $buf, %esi
378; SSE2-NEXT:    movl $32, %edi
379; SSE2-NEXT:    tileloadd (%rsi,%rdi), %tmm0
380; SSE2-NEXT:    movl $64, %esi
381; SSE2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
382; SSE2-NEXT:    jmp .LBB0_3
383; SSE2-NEXT:  .LBB0_2: # %if.else
384; SSE2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
385; SSE2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
386; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
387; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
388; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
389; SSE2-NEXT:    xorps %xmm0, %xmm0
390; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
391; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
392; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
393; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
394; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
395; SSE2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
396; SSE2-NEXT:    movb %al, %sil
397; SSE2-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
398; SSE2-NEXT:    movw $8, {{[0-9]+}}(%rsp)
399; SSE2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
400; SSE2-NEXT:    movl $buf2, %r9d
401; SSE2-NEXT:    movl $32, %r10d
402; SSE2-NEXT:    movw $8, %si
403; SSE2-NEXT:    tileloadd (%r9,%r10), %tmm0
404; SSE2-NEXT:    movl $64, %r8d
405; SSE2-NEXT:    tilestored %tmm0, (%r11,%r8)
406; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
407; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
408; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
409; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
410; SSE2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
411; SSE2-NEXT:    movb $8, {{[0-9]+}}(%rsp)
412; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
413; SSE2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
414; SSE2-NEXT:    tileloadd (%r9,%r10), %tmm0
415; SSE2-NEXT:    tilestored %tmm0, (%rdi,%r8)
416; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
417; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
418; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
419; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
420; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
421; SSE2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
422; SSE2-NEXT:    movb %al, %dil
423; SSE2-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
424; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
425; SSE2-NEXT:    ldtilecfg (%rsi)
426; SSE2-NEXT:    movl $buf2, %esi
427; SSE2-NEXT:    movl $32, %edi
428; SSE2-NEXT:    tileloadd (%rsi,%rdi), %tmm0
429; SSE2-NEXT:    movl $64, %esi
430; SSE2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
431; SSE2-NEXT:  .LBB0_3: # %if.end
432; SSE2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
433; SSE2-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
434; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
435; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
436; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
437; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
438; SSE2-NEXT:    xorps %xmm0, %xmm0
439; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
440; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
441; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
442; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
443; SSE2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
444; SSE2-NEXT:    movb %al, %sil
445; SSE2-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
446; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
447; SSE2-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
448; SSE2-NEXT:    movw $8, {{[0-9]+}}(%rsp)
449; SSE2-NEXT:    movb $8, {{[0-9]+}}(%rsp)
450; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
451; SSE2-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
452; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
453; SSE2-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
454; SSE2-NEXT:    movl $64, %esi
455; SSE2-NEXT:    movw $8, %di
456; SSE2-NEXT:    tileloadd (%r10,%rsi), %tmm1
457; SSE2-NEXT:    tileloadd (%r9,%rsi), %tmm2
458; SSE2-NEXT:    tileloadd (%r8,%rsi), %tmm0
459; SSE2-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
460; SSE2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
461; SSE2-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
462; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
463; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
464; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
465; SSE2-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)
466; SSE2-NEXT:    movb $1, {{[0-9]+}}(%rsp)
467; SSE2-NEXT:    movb %al, %dil
468; SSE2-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
469; SSE2-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
470; SSE2-NEXT:    ldtilecfg (%rsi)
471; SSE2-NEXT:    movl $64, %esi
472; SSE2-NEXT:    tileloadd (%rdx,%rsi), %tmm0
473; SSE2-NEXT:    movl $buf, %edx
474; SSE2-NEXT:    movl $32, %esi
475; SSE2-NEXT:    tilestored %tmm0, (%rdx,%rsi)
476; SSE2-NEXT:    movq %rbp, %rsp
477; SSE2-NEXT:    popq %rbp
478; SSE2-NEXT:    .cfi_def_cfa %rsp, 8
479; SSE2-NEXT:    tilerelease
480; SSE2-NEXT:    retq
481entry:
; Pick the tile source: %cond == 0 branches to %if.else, anything else to
; %if.then.
482  %tobool.not = icmp eq i32 %cond, 0
483  br i1 %tobool.not, label %if.else, label %if.then
484
; Load three tiles from @buf with i16 operand pairs (%row, 8), (8, %col) and
; (%row, %col). The trailing i64 32 shows up as the index register of
; tileloadd in the expected assembly.
485if.then:                                          ; preds = %entry
486  %0 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
487  %1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
488  %2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
489  br label %if.end
490
; Same three loads as %if.then, but reading from @buf2.
491if.else:                                          ; preds = %entry
492  %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
493  %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
494  %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
495  br label %if.end
496
; Merge the per-path tiles with x86_amx phis (the construct under test), pass
; them to tdpbssd in the order c, a, b after the three i16 operands, and store
; the result tile to @buf.
497if.end:                                           ; preds = %if.else, %if.then
498  %a.sroa.1094.0.in = phi x86_amx [ %3, %if.else ], [ %0, %if.then ]
499  %b.sroa.1069.0.in = phi x86_amx [ %4, %if.else ], [ %1, %if.then ]
500  %c.sroa.1044.0.in = phi x86_amx [ %5, %if.else ], [ %2, %if.then ]
501  %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %c.sroa.1044.0.in, x86_amx %a.sroa.1094.0.in, x86_amx %b.sroa.1069.0.in)
502  tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %6)
503  ret void
504}
505
; Declarations of the AMX "internal" intrinsics called by @test_api. The
; operand descriptions below reflect how this file calls them; the two leading
; i16 operands are presumably the tile's row/column shape (confirm against the
; LLVM AMX documentation).

506; Function Attrs: nounwind
; Tile load: (i16, i16, base pointer, i64 stride) -> x86_amx tile.
507declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
508
509; Function Attrs: nounwind
; Tile dot-product: three i16 operands followed by three x86_amx tiles;
; returns the resulting x86_amx tile.
510declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
511
512; Function Attrs: nounwind
; Tile store: (i16, i16, base pointer, i64 stride, x86_amx tile to store).
513declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
514