; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 | FileCheck %s --check-prefix=SSE2

; Overview: codegen test for the AMX "internal" tile intrinsics when llc runs
; at -O0. The same IR is compiled three times (amx-int8 alone, with avx2, and
; with avx512f). The expected assembly differs mainly in how 64 bytes of stack
; (presumably the tile configuration block -- confirm) are zeroed before each
; ldtilecfg: one zmm store, two ymm stores, or four xmm stores.
; The expected output contains one ldtilecfg per tile intrinsic call in the IR.
; Do not hand-edit the assertion lines; regenerate them with the script named
; on the first line of this file.

; Two 1 KiB zero-initialised byte buffers. @buf is the tile source on the
; if.then path and the final store destination; @buf2 is the tile source on
; the if.else path.
@buf = dso_local global [1024 x i8] zeroinitializer, align 16
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16

; test_api(cond, row, col): loads three tiles from @buf (cond != 0) or from
; @buf2 (cond == 0), merges them through phi nodes, feeds them to tdpbssd and
; stores the result to @buf.
; NOTE(review): the "Function Attrs" line below looks like leftover printer
; output; no attribute group is attached to this definition in the visible IR.
; Function Attrs: nounwind uwtable
define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) local_unnamed_addr {
; AVX512-LABEL: test_api:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: .cfi_offset %rbp, -16
; AVX512-NEXT: movq %rsp, %rbp
; AVX512-NEXT: .cfi_def_cfa_register %rbp
; AVX512-NEXT: andq $-1024, %rsp # imm = 0xFC00
; AVX512-NEXT: subq $6144, %rsp # imm = 0x1800
; AVX512-NEXT: movw %dx, %ax
; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT: movw %si, %ax
; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: cmpl $0, %edi
; AVX512-NEXT: je .LBB0_2
; AVX512-NEXT: # %bb.1: # %if.then
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movb %al, %sil
; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: movl $buf, %r9d
; AVX512-NEXT: movl $32, %r10d
; AVX512-NEXT: movw $8, %si
; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0
; AVX512-NEXT: movl $64, %r8d
; AVX512-NEXT: tilestored %tmm0, (%r11,%r8)
; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movb $8, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0
; AVX512-NEXT: tilestored %tmm0, (%rdi,%r8)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movb %al, %dil
; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg (%rsi)
; AVX512-NEXT: movl $buf, %esi
; AVX512-NEXT: movl $32, %edi
; AVX512-NEXT: tileloadd (%rsi,%rdi), %tmm0
; AVX512-NEXT: movl $64, %esi
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT: jmp .LBB0_3
; AVX512-NEXT: .LBB0_2: # %if.else
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movb %al, %sil
; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: movl $buf2, %r9d
; AVX512-NEXT: movl $32, %r10d
; AVX512-NEXT: movw $8, %si
; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0
; AVX512-NEXT: movl $64, %r8d
; AVX512-NEXT: tilestored %tmm0, (%r11,%r8)
; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movb $8, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0
; AVX512-NEXT: tilestored %tmm0, (%rdi,%r8)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movb %al, %dil
; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg (%rsi)
; AVX512-NEXT: movl $buf2, %esi
; AVX512-NEXT: movl $32, %edi
; AVX512-NEXT: tileloadd (%rsi,%rdi), %tmm0
; AVX512-NEXT: movl $64, %esi
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT: .LBB0_3: # %if.end
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movb %al, %sil
; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw $8, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movb $8, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: movl $64, %esi
; AVX512-NEXT: movw $8, %di
; AVX512-NEXT: tileloadd (%r10,%rsi), %tmm1
; AVX512-NEXT: tileloadd (%r9,%rsi), %tmm2
; AVX512-NEXT: tileloadd (%r8,%rsi), %tmm0
; AVX512-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; AVX512-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movb %al, %dil
; AVX512-NEXT: movb %dil, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg (%rsi)
; AVX512-NEXT: movl $64, %esi
; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT: movl $buf, %edx
; AVX512-NEXT: movl $32, %esi
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT: movq %rbp, %rsp
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: .cfi_def_cfa %rsp, 8
; AVX512-NEXT: tilerelease
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX2-LABEL: test_api:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: .cfi_offset %rbp, -16
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: .cfi_def_cfa_register %rbp
; AVX2-NEXT: andq $-1024, %rsp # imm = 0xFC00
; AVX2-NEXT: subq $6144, %rsp # imm = 0x1800
; AVX2-NEXT: movw %dx, %ax
; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX2-NEXT: movw %si, %ax
; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: cmpl $0, %edi
; AVX2-NEXT: je .LBB0_2
; AVX2-NEXT: # %bb.1: # %if.then
; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movb %al, %sil
; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp)
; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX2-NEXT: movl $buf, %r9d
; AVX2-NEXT: movl $32, %r10d
; AVX2-NEXT: movw $8, %si
; AVX2-NEXT: tileloadd (%r9,%r10), %tmm0
; AVX2-NEXT: movl $64, %r8d
; AVX2-NEXT: tilestored %tmm0, (%r11,%r8)
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movb $8, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX2-NEXT: tileloadd (%r9,%r10), %tmm0
; AVX2-NEXT: tilestored %tmm0, (%rdi,%r8)
; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movb %al, %dil
; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX2-NEXT: ldtilecfg (%rsi)
; AVX2-NEXT: movl $buf, %esi
; AVX2-NEXT: movl $32, %edi
; AVX2-NEXT: tileloadd (%rsi,%rdi), %tmm0
; AVX2-NEXT: movl $64, %esi
; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX2-NEXT: jmp .LBB0_3
; AVX2-NEXT: .LBB0_2: # %if.else
; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movb %al, %sil
; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp)
; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX2-NEXT: movl $buf2, %r9d
; AVX2-NEXT: movl $32, %r10d
; AVX2-NEXT: movw $8, %si
; AVX2-NEXT: tileloadd (%r9,%r10), %tmm0
; AVX2-NEXT: movl $64, %r8d
; AVX2-NEXT: tilestored %tmm0, (%r11,%r8)
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movb $8, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX2-NEXT: tileloadd (%r9,%r10), %tmm0
; AVX2-NEXT: tilestored %tmm0, (%rdi,%r8)
; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movb %al, %dil
; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX2-NEXT: ldtilecfg (%rsi)
; AVX2-NEXT: movl $buf2, %esi
; AVX2-NEXT: movl $32, %edi
; AVX2-NEXT: tileloadd (%rsi,%rdi), %tmm0
; AVX2-NEXT: movl $64, %esi
; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX2-NEXT: .LBB0_3: # %if.end
; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
; AVX2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movb %al, %sil
; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movw $8, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movb $8, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX2-NEXT: movl $64, %esi
; AVX2-NEXT: movw $8, %di
; AVX2-NEXT: tileloadd (%r10,%rsi), %tmm1
; AVX2-NEXT: tileloadd (%r9,%rsi), %tmm2
; AVX2-NEXT: tileloadd (%r8,%rsi), %tmm0
; AVX2-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movb $1, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movb %al, %dil
; AVX2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
; AVX2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX2-NEXT: ldtilecfg (%rsi)
; AVX2-NEXT: movl $64, %esi
; AVX2-NEXT: tileloadd (%rdx,%rsi), %tmm0
; AVX2-NEXT: movl $buf, %edx
; AVX2-NEXT: movl $32, %esi
; AVX2-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: .cfi_def_cfa %rsp, 8
; AVX2-NEXT: tilerelease
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; SSE2-LABEL: test_api:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pushq %rbp
; SSE2-NEXT: .cfi_def_cfa_offset 16
; SSE2-NEXT: .cfi_offset %rbp, -16
; SSE2-NEXT: movq %rsp, %rbp
; SSE2-NEXT: .cfi_def_cfa_register %rbp
; SSE2-NEXT: andq $-1024, %rsp # imm = 0xFC00
; SSE2-NEXT: subq $6144, %rsp # imm = 0x1800
; SSE2-NEXT: movw %dx, %ax
; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; SSE2-NEXT: movw %si, %ax
; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT: cmpl $0, %edi
; SSE2-NEXT: je .LBB0_2
; SSE2-NEXT: # %bb.1: # %if.then
; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movb %al, %sil
; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp)
; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; SSE2-NEXT: movl $buf, %r9d
; SSE2-NEXT: movl $32, %r10d
; SSE2-NEXT: movw $8, %si
; SSE2-NEXT: tileloadd (%r9,%r10), %tmm0
; SSE2-NEXT: movl $64, %r8d
; SSE2-NEXT: tilestored %tmm0, (%r11,%r8)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movb $8, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; SSE2-NEXT: tileloadd (%r9,%r10), %tmm0
; SSE2-NEXT: tilestored %tmm0, (%rdi,%r8)
; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movb %al, %dil
; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; SSE2-NEXT: ldtilecfg (%rsi)
; SSE2-NEXT: movl $buf, %esi
; SSE2-NEXT: movl $32, %edi
; SSE2-NEXT: tileloadd (%rsi,%rdi), %tmm0
; SSE2-NEXT: movl $64, %esi
; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
; SSE2-NEXT: jmp .LBB0_3
; SSE2-NEXT: .LBB0_2: # %if.else
; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movb %al, %sil
; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp)
; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; SSE2-NEXT: movl $buf2, %r9d
; SSE2-NEXT: movl $32, %r10d
; SSE2-NEXT: movw $8, %si
; SSE2-NEXT: tileloadd (%r9,%r10), %tmm0
; SSE2-NEXT: movl $64, %r8d
; SSE2-NEXT: tilestored %tmm0, (%r11,%r8)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movb $8, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; SSE2-NEXT: tileloadd (%r9,%r10), %tmm0
; SSE2-NEXT: tilestored %tmm0, (%rdi,%r8)
; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movb %al, %dil
; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; SSE2-NEXT: ldtilecfg (%rsi)
; SSE2-NEXT: movl $buf2, %esi
; SSE2-NEXT: movl $32, %edi
; SSE2-NEXT: tileloadd (%rsi,%rdi), %tmm0
; SSE2-NEXT: movl $64, %esi
; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
; SSE2-NEXT: .LBB0_3: # %if.end
; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
; SSE2-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movb %al, %sil
; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movw $8, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movb $8, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movb %sil, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; SSE2-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; SSE2-NEXT: movl $64, %esi
; SSE2-NEXT: movw $8, %di
; SSE2-NEXT: tileloadd (%r10,%rsi), %tmm1
; SSE2-NEXT: tileloadd (%r9,%rsi), %tmm2
; SSE2-NEXT: tileloadd (%r8,%rsi), %tmm0
; SSE2-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movb $1, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movb %al, %dil
; SSE2-NEXT: movb %dil, {{[0-9]+}}(%rsp)
; SSE2-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; SSE2-NEXT: ldtilecfg (%rsi)
; SSE2-NEXT: movl $64, %esi
; SSE2-NEXT: tileloadd (%rdx,%rsi), %tmm0
; SSE2-NEXT: movl $buf, %edx
; SSE2-NEXT: movl $32, %esi
; SSE2-NEXT: tilestored %tmm0, (%rdx,%rsi)
; SSE2-NEXT: movq %rbp, %rsp
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: .cfi_def_cfa %rsp, 8
; SSE2-NEXT: tilerelease
; SSE2-NEXT: retq
entry:
  ; Branch on %cond: zero goes to if.else (tiles from @buf2), non-zero goes to
  ; if.then (tiles from @buf).
  %tobool.not = icmp eq i32 %cond, 0
  br i1 %tobool.not, label %if.else, label %if.then

if.then: ; preds = %entry
  ; Three tile loads from the start of @buf, all with a trailing i64 operand
  ; of 32. The leading i16 pairs are (%row, 8), (8, %col) and (%row, %col);
  ; presumably the tile shape -- confirm against the intrinsic documentation.
  %0 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
  %1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
  %2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
  br label %if.end

if.else: ; preds = %entry
  ; Same three loads as if.then, but reading from @buf2.
  %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
  %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
  %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
  br label %if.end

if.end: ; preds = %if.else, %if.then
  ; x86_amx values flow through phi nodes here, so the backend has to carry
  ; tiles across the control-flow join; this is the case the test exercises.
  %a.sroa.1094.0.in = phi x86_amx [ %3, %if.else ], [ %0, %if.then ]
  %b.sroa.1069.0.in = phi x86_amx [ %4, %if.else ], [ %1, %if.then ]
  %c.sroa.1044.0.in = phi x86_amx [ %5, %if.else ], [ %2, %if.then ]
  ; The (%row, %col) tile is passed first among the tile operands, followed by
  ; the (%row, 8) and (8, %col) tiles; the result is stored to @buf.
  %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %c.sroa.1044.0.in, x86_amx %a.sroa.1094.0.in, x86_amx %b.sroa.1069.0.in)
  tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %6)
  ret void
}

; Tile load intrinsic: two i16 operands, a base pointer and an i64 operand;
; returns an x86_amx value.
; Function Attrs: nounwind
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)

; Tile dot-product intrinsic: three i16 operands followed by three x86_amx
; operands; returns an x86_amx value.
; Function Attrs: nounwind
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)

; Tile store intrinsic: two i16 operands, a base pointer, an i64 operand and
; the x86_amx value to store; returns nothing.
; Function Attrs: nounwind
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)