; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+bmi,+bmi2,+cmov | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefix=X64

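; Check that the 32-bit bzhi intrinsic selects the register form of BZHI.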
define i32 @bzhi32(i32 %x, i32 %y) {
; X86-LABEL: bzhi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    addl %ecx, %ecx
; X86-NEXT:    bzhil %eax, %ecx, %eax
; X86-NEXT:    retl
;
; X64-LABEL: bzhi32:
; X64:       # %bb.0:
; X64-NEXT:    addl %edi, %edi
; X64-NEXT:    bzhil %esi, %edi, %eax
; X64-NEXT:    retq
  %x1 = add i32 %x, %x
  %tmp = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %x1, i32 %y)
  ret i32 %tmp
}

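; Check that the argument load is folded into BZHI's memory operand.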
define i32 @bzhi32_load(ptr %x, i32 %y) {
; X86-LABEL: bzhi32_load:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    bzhil %eax, (%ecx), %eax
; X86-NEXT:    retl
;
; X64-LABEL: bzhi32_load:
; X64:       # %bb.0:
; X64-NEXT:    bzhil %esi, (%rdi), %eax
; X64-NEXT:    retq
  %x1 = load i32, ptr %x
  %tmp = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %x1, i32 %y)
  ret i32 %tmp
}

; PR48768 - 'bzhi' clears the overflow flag, so we don't need a separate 'test'.
define i1 @bzhi32_overflow(i32 %x, i32 %y) {
; X86-LABEL: bzhi32_overflow:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    bzhil %eax, {{[0-9]+}}(%esp), %eax
; X86-NEXT:    setle %al
; X86-NEXT:    retl
;
; X64-LABEL: bzhi32_overflow:
; X64:       # %bb.0:
; X64-NEXT:    bzhil %esi, %edi, %eax
; X64-NEXT:    setle %al
; X64-NEXT:    retq
  %tmp = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %x, i32 %y)
  %cmp = icmp slt i32 %tmp, 1
  ret i1 %cmp
}

declare i32 @llvm.x86.bmi.bzhi.32(i32, i32)

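; Check that the 32-bit pdep intrinsic selects the register form of PDEP.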
define i32 @pdep32(i32 %x, i32 %y) {
; X86-LABEL: pdep32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    addl %ecx, %ecx
; X86-NEXT:    pdepl %ecx, %eax, %eax
; X86-NEXT:    retl
;
; X64-LABEL: pdep32:
; X64:       # %bb.0:
; X64-NEXT:    addl %esi, %esi
; X64-NEXT:    pdepl %esi, %edi, %eax
; X64-NEXT:    retq
  %y1 = add i32 %y, %y
  %tmp = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 %y1)
  ret i32 %tmp
}

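; Check that the mask load is folded into PDEP's memory operand.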
define i32 @pdep32_load(i32 %x, ptr %y) {
; X86-LABEL: pdep32_load:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    pdepl (%eax), %ecx, %eax
; X86-NEXT:    retl
;
; X64-LABEL: pdep32_load:
; X64:       # %bb.0:
; X64-NEXT:    pdepl (%rsi), %edi, %eax
; X64-NEXT:    retq
  %y1 = load i32, ptr %y
  %tmp = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 %y1)
  ret i32 %tmp
}

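; The 0xAAAAAAAA mask has 16 set bits, so PDEP reads at most the low 16 bits of
; the source; the i16 sign extension can be treated as an any-extend (no movsx
; needed on X64).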
define i32 @pdep32_anyext(i16 %x) {
; X86-LABEL: pdep32_anyext:
; X86:       # %bb.0:
; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl $-1431655766, %ecx # imm = 0xAAAAAAAA
; X86-NEXT:    pdepl %ecx, %eax, %eax
; X86-NEXT:    retl
;
; X64-LABEL: pdep32_anyext:
; X64:       # %bb.0:
; X64-NEXT:    movl $-1431655766, %eax # imm = 0xAAAAAAAA
; X64-NEXT:    pdepl %eax, %edi, %eax
; X64-NEXT:    retq
  %x1 = sext i16 %x to i32
  %tmp = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x1, i32 -1431655766)
  ret i32 %tmp
}

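; PDEP never sets result bits outside its mask, so the trailing 'and' with the
; same constant should fold away.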
define i32 @pdep32_demandedbits(i32 %x) {
; X86-LABEL: pdep32_demandedbits:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl $1431655765, %ecx # imm = 0x55555555
; X86-NEXT:    pdepl %ecx, %eax, %eax
; X86-NEXT:    retl
;
; X64-LABEL: pdep32_demandedbits:
; X64:       # %bb.0:
; X64-NEXT:    movl $1431655765, %eax # imm = 0x55555555
; X64-NEXT:    pdepl %eax, %edi, %eax
; X64-NEXT:    retq
  %tmp = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 1431655765)
  %tmp2 = and i32 %tmp, 1431655765
  ret i32 %tmp2
}

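; Only result bit 7 is demanded, which depends on at most the low 8 source
; bits, so the 'and' with 255 is redundant on X64.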
define i32 @pdep32_demandedbits2(i32 %x, i32 %y) {
; X86-LABEL: pdep32_demandedbits2:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    pdepl {{[0-9]+}}(%esp), %eax, %eax
; X86-NEXT:    andl $128, %eax
; X86-NEXT:    retl
;
; X64-LABEL: pdep32_demandedbits2:
; X64:       # %bb.0:
; X64-NEXT:    pdepl %esi, %edi, %eax
; X64-NEXT:    andl $128, %eax
; X64-NEXT:    retq
  %tmp = and i32 %x, 255
  %tmp2 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %tmp, i32 %y)
  %tmp3 = and i32 %tmp2, 128
  ret i32 %tmp3
}

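; Only result bit 15 is demanded, and it depends only on the low 16 mask bits,
; so the mask's sign extension should not be needed on X64.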
define i32 @pdep32_demandedbits_mask(i32 %x, i16 %y) {
; X86-LABEL: pdep32_demandedbits_mask:
; X86:       # %bb.0:
; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    pdepl %eax, %ecx, %eax
; X86-NEXT:    andl $32768, %eax # imm = 0x8000
; X86-NEXT:    retl
;
; X64-LABEL: pdep32_demandedbits_mask:
; X64:       # %bb.0:
; X64-NEXT:    pdepl %esi, %edi, %eax
; X64-NEXT:    andl $32768, %eax # imm = 0x8000
; X64-NEXT:    retq
  %tmp = sext i16 %y to i32
  %tmp2 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 %tmp)
  %tmp3 = and i32 %tmp2, 32768
  ret i32 %tmp3
}

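; Only the low 16 result bits are demanded, which depend only on the low 16
; mask bits, so the sign extension is again unnecessary on X64.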
define i32 @pdep32_demandedbits_mask2(i32 %x, i16 %y) {
; X86-LABEL: pdep32_demandedbits_mask2:
; X86:       # %bb.0:
; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    pdepl %eax, %ecx, %eax
; X86-NEXT:    movzwl %ax, %eax
; X86-NEXT:    retl
;
; X64-LABEL: pdep32_demandedbits_mask2:
; X64:       # %bb.0:
; X64-NEXT:    pdepl %esi, %edi, %eax
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    retq
  %tmp = sext i16 %y to i32
  %tmp2 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 %tmp)
  %tmp3 = and i32 %tmp2, 65535
  ret i32 %tmp3
}

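; Result bits outside the deposit mask are known zero, so the 'and' is a no-op
; and the multiply becomes a square (imull %eax, %eax).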
define i32 @pdep32_knownbits(i32 %x) {
; X86-LABEL: pdep32_knownbits:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl $1431655765, %ecx # imm = 0x55555555
; X86-NEXT:    pdepl %ecx, %eax, %eax
; X86-NEXT:    imull %eax, %eax
; X86-NEXT:    retl
;
; X64-LABEL: pdep32_knownbits:
; X64:       # %bb.0:
; X64-NEXT:    movl $1431655765, %eax # imm = 0x55555555
; X64-NEXT:    pdepl %eax, %edi, %eax
; X64-NEXT:    imull %eax, %eax
; X64-NEXT:    retq
  %tmp = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 1431655765)
  %tmp2 = and i32 %tmp, 1431655765
  %tmp3 = mul i32 %tmp, %tmp2
  ret i32 %tmp3
}

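; The low 8 source bits are known zero, so the low 8 result bits are too and
; the second 'and' with -256 folds away.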
define i32 @pdep32_knownbits2(i32 %x, i32 %y) {
; X86-LABEL: pdep32_knownbits2:
; X86:       # %bb.0:
; X86-NEXT:    movl $-256, %eax
; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    pdepl {{[0-9]+}}(%esp), %eax, %eax
; X86-NEXT:    imull %eax, %eax
; X86-NEXT:    retl
;
; X64-LABEL: pdep32_knownbits2:
; X64:       # %bb.0:
; X64-NEXT:    andl $-256, %edi
; X64-NEXT:    pdepl %esi, %edi, %eax
; X64-NEXT:    imull %eax, %eax
; X64-NEXT:    retq
  %tmp = and i32 %x, -256
  %tmp2 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %tmp, i32 %y)
  %tmp3 = and i32 %tmp2, -256
  %tmp4 = mul i32 %tmp2, %tmp3
  ret i32 %tmp4
}

declare i32 @llvm.x86.bmi.pdep.32(i32, i32)

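; Check that the 32-bit pext intrinsic selects the register form of PEXT.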
define i32 @pext32(i32 %x, i32 %y) {
; X86-LABEL: pext32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    addl %ecx, %ecx
; X86-NEXT:    pextl %ecx, %eax, %eax
; X86-NEXT:    retl
;
; X64-LABEL: pext32:
; X64:       # %bb.0:
; X64-NEXT:    addl %esi, %esi
; X64-NEXT:    pextl %esi, %edi, %eax
; X64-NEXT:    retq
  %y1 = add i32 %y, %y
  %tmp = tail call i32 @llvm.x86.bmi.pext.32(i32 %x, i32 %y1)
  ret i32 %tmp
}

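; Check that the mask load is folded into PEXT's memory operand.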
define i32 @pext32_load(i32 %x, ptr %y) {
; X86-LABEL: pext32_load:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    pextl (%eax), %ecx, %eax
; X86-NEXT:    retl
;
; X64-LABEL: pext32_load:
; X64:       # %bb.0:
; X64-NEXT:    pextl (%rsi), %edi, %eax
; X64-NEXT:    retq
  %y1 = load i32, ptr %y
  %tmp = tail call i32 @llvm.x86.bmi.pext.32(i32 %x, i32 %y1)
  ret i32 %tmp
}

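; PEXT with a 16-set-bit mask produces at most 16 result bits, so the 'and'
; with 65535 is redundant.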
define i32 @pext32_knownbits(i32 %x) {
; X86-LABEL: pext32_knownbits:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl $1431655765, %ecx # imm = 0x55555555
; X86-NEXT:    pextl %ecx, %eax, %eax
; X86-NEXT:    retl
;
; X64-LABEL: pext32_knownbits:
; X64:       # %bb.0:
; X64-NEXT:    movl $1431655765, %eax # imm = 0x55555555
; X64-NEXT:    pextl %eax, %edi, %eax
; X64-NEXT:    retq
  %tmp = tail call i32 @llvm.x86.bmi.pext.32(i32 %x, i32 1431655765)
  %tmp2 = and i32 %tmp, 65535
  ret i32 %tmp2
}

declare i32 @llvm.x86.bmi.pext.32(i32, i32)

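; Widening 32x32->64 multiply split into high and low halves: X86 selects MULX,
; while X64 uses a 64-bit imul plus a shift instead.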
define i32 @mulx32(i32 %x, i32 %y, ptr %p) {
; X86-LABEL: mulx32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    addl %edx, %edx
; X86-NEXT:    addl %eax, %eax
; X86-NEXT:    mulxl %eax, %eax, %edx
; X86-NEXT:    movl %edx, (%ecx)
; X86-NEXT:    retl
;
; X64-LABEL: mulx32:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $esi killed $esi def $rsi
; X64-NEXT:    # kill: def $edi killed $edi def $rdi
; X64-NEXT:    addl %edi, %edi
; X64-NEXT:    leal (%rsi,%rsi), %eax
; X64-NEXT:    imulq %rdi, %rax
; X64-NEXT:    movq %rax, %rcx
; X64-NEXT:    shrq $32, %rcx
; X64-NEXT:    movl %ecx, (%rdx)
; X64-NEXT:    # kill: def $eax killed $eax killed $rax
; X64-NEXT:    retq
  %x1 = add i32 %x, %x
  %y1 = add i32 %y, %y
  %x2 = zext i32 %x1 to i64
  %y2 = zext i32 %y1 to i64
  %r1 = mul i64 %x2, %y2
  %h1 = lshr i64 %r1, 32
  %h  = trunc i64 %h1 to i32
  %l  = trunc i64 %r1 to i32
  store i32 %h, ptr %p
  ret i32 %l
}

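; Same as above with a loaded operand; X86 folds the load into MULX.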
define i32 @mulx32_load(i32 %x, ptr %y, ptr %p) {
; X86-LABEL: mulx32_load:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    addl %edx, %edx
; X86-NEXT:    mulxl (%eax), %eax, %edx
; X86-NEXT:    movl %edx, (%ecx)
; X86-NEXT:    retl
;
; X64-LABEL: mulx32_load:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $edi killed $edi def $rdi
; X64-NEXT:    leal (%rdi,%rdi), %eax
; X64-NEXT:    movl (%rsi), %ecx
; X64-NEXT:    imulq %rcx, %rax
; X64-NEXT:    movq %rax, %rcx
; X64-NEXT:    shrq $32, %rcx
; X64-NEXT:    movl %ecx, (%rdx)
; X64-NEXT:    # kill: def $eax killed $eax killed $rax
; X64-NEXT:    retq
  %x1 = add i32 %x, %x
  %y1 = load i32, ptr %y
  %x2 = zext i32 %x1 to i64
  %y2 = zext i32 %y1 to i64
  %r1 = mul i64 %x2, %y2
  %h1 = lshr i64 %r1, 32
  %h  = trunc i64 %h1 to i32
  %l  = trunc i64 %r1 to i32
  store i32 %h, ptr %p
  ret i32 %l
}