1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -O0 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake -x86-experimental-unordered-atomic-isel=0 | FileCheck --check-prefixes=CHECK,CHECK-O0,CHECK-O0-CUR %s
3; RUN: llc -O3 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake  -x86-experimental-unordered-atomic-isel=0 | FileCheck --check-prefixes=CHECK,CHECK-O3,CHECK-O3-CUR %s
4; RUN: llc -O0 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake -x86-experimental-unordered-atomic-isel=1 | FileCheck --check-prefixes=CHECK,CHECK-O0,CHECK-O0-EX %s
5; RUN: llc -O3 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake -x86-experimental-unordered-atomic-isel=1 | FileCheck --check-prefixes=CHECK,CHECK-O3,CHECK-O3-EX %s
6
7define i8 @load_i8(i8* %ptr) {
8; CHECK-LABEL: load_i8:
9; CHECK:       # %bb.0:
10; CHECK-NEXT:    movb (%rdi), %al
11; CHECK-NEXT:    retq
12  %v = load atomic i8, i8* %ptr unordered, align 1
13  ret i8 %v
14}
15
16define void @store_i8(i8* %ptr, i8 %v) {
17; CHECK-O0-LABEL: store_i8:
18; CHECK-O0:       # %bb.0:
19; CHECK-O0-NEXT:    movb %sil, %al
20; CHECK-O0-NEXT:    movb %al, (%rdi)
21; CHECK-O0-NEXT:    retq
22;
23; CHECK-O3-LABEL: store_i8:
24; CHECK-O3:       # %bb.0:
25; CHECK-O3-NEXT:    movb %sil, (%rdi)
26; CHECK-O3-NEXT:    retq
27  store atomic i8 %v, i8* %ptr unordered, align 1
28  ret void
29}
30
31define i16 @load_i16(i16* %ptr) {
32; CHECK-O0-LABEL: load_i16:
33; CHECK-O0:       # %bb.0:
34; CHECK-O0-NEXT:    movw (%rdi), %ax
35; CHECK-O0-NEXT:    retq
36;
37; CHECK-O3-LABEL: load_i16:
38; CHECK-O3:       # %bb.0:
39; CHECK-O3-NEXT:    movzwl (%rdi), %eax
40; CHECK-O3-NEXT:    retq
41  %v = load atomic i16, i16* %ptr unordered, align 2
42  ret i16 %v
43}
44
45
46define void @store_i16(i16* %ptr, i16 %v) {
47; CHECK-O0-LABEL: store_i16:
48; CHECK-O0:       # %bb.0:
49; CHECK-O0-NEXT:    movw %si, %ax
50; CHECK-O0-NEXT:    movw %ax, (%rdi)
51; CHECK-O0-NEXT:    retq
52;
53; CHECK-O3-LABEL: store_i16:
54; CHECK-O3:       # %bb.0:
55; CHECK-O3-NEXT:    movw %si, (%rdi)
56; CHECK-O3-NEXT:    retq
57  store atomic i16 %v, i16* %ptr unordered, align 2
58  ret void
59}
60
61define i32 @load_i32(i32* %ptr) {
62; CHECK-LABEL: load_i32:
63; CHECK:       # %bb.0:
64; CHECK-NEXT:    movl (%rdi), %eax
65; CHECK-NEXT:    retq
66  %v = load atomic i32, i32* %ptr unordered, align 4
67  ret i32 %v
68}
69
70define void @store_i32(i32* %ptr, i32 %v) {
71; CHECK-LABEL: store_i32:
72; CHECK:       # %bb.0:
73; CHECK-NEXT:    movl %esi, (%rdi)
74; CHECK-NEXT:    retq
75  store atomic i32 %v, i32* %ptr unordered, align 4
76  ret void
77}
78
79define i64 @load_i64(i64* %ptr) {
80; CHECK-LABEL: load_i64:
81; CHECK:       # %bb.0:
82; CHECK-NEXT:    movq (%rdi), %rax
83; CHECK-NEXT:    retq
84  %v = load atomic i64, i64* %ptr unordered, align 8
85  ret i64 %v
86}
87
88define void @store_i64(i64* %ptr, i64 %v) {
89; CHECK-LABEL: store_i64:
90; CHECK:       # %bb.0:
91; CHECK-NEXT:    movq %rsi, (%rdi)
92; CHECK-NEXT:    retq
93  store atomic i64 %v, i64* %ptr unordered, align 8
94  ret void
95}
96
97;; The tests in the rest of this file are intended to show transforms which we
98;; either *can't* do for legality, or don't currently implement.  The later
99;; are noted carefully where relevant.
100
101;; Start w/some clearly illegal ones.
102
103; Must use a full width op, not a byte op
104define void @narrow_writeback_or(i64* %ptr) {
105; CHECK-O0-LABEL: narrow_writeback_or:
106; CHECK-O0:       # %bb.0:
107; CHECK-O0-NEXT:    movq (%rdi), %rax
108; CHECK-O0-NEXT:    orq $7, %rax
109; CHECK-O0-NEXT:    movq %rax, (%rdi)
110; CHECK-O0-NEXT:    retq
111;
112; CHECK-O3-LABEL: narrow_writeback_or:
113; CHECK-O3:       # %bb.0:
114; CHECK-O3-NEXT:    orq $7, (%rdi)
115; CHECK-O3-NEXT:    retq
116  %v = load atomic i64, i64* %ptr unordered, align 8
117  %v.new = or i64 %v, 7
118  store atomic i64 %v.new, i64* %ptr unordered, align 8
119  ret void
120}
121
122; Must use a full width op, not a byte op
123define void @narrow_writeback_and(i64* %ptr) {
124; CHECK-O0-LABEL: narrow_writeback_and:
125; CHECK-O0:       # %bb.0:
126; CHECK-O0-NEXT:    movq (%rdi), %rax
127; CHECK-O0-NEXT:    # kill: def $eax killed $eax killed $rax
128; CHECK-O0-NEXT:    andl $-256, %eax
129; CHECK-O0-NEXT:    # kill: def $rax killed $eax
130; CHECK-O0-NEXT:    movq %rax, (%rdi)
131; CHECK-O0-NEXT:    retq
132;
133; CHECK-O3-LABEL: narrow_writeback_and:
134; CHECK-O3:       # %bb.0:
135; CHECK-O3-NEXT:    movl $4294967040, %eax # imm = 0xFFFFFF00
136; CHECK-O3-NEXT:    andq %rax, (%rdi)
137; CHECK-O3-NEXT:    retq
138  %v = load atomic i64, i64* %ptr unordered, align 8
139  %v.new = and i64 %v, 4294967040 ;; 0xFFFF_FF00
140  store atomic i64 %v.new, i64* %ptr unordered, align 8
141  ret void
142}
143
144; Must use a full width op, not a byte op
145define void @narrow_writeback_xor(i64* %ptr) {
146; CHECK-O0-LABEL: narrow_writeback_xor:
147; CHECK-O0:       # %bb.0:
148; CHECK-O0-NEXT:    movq (%rdi), %rax
149; CHECK-O0-NEXT:    xorq $7, %rax
150; CHECK-O0-NEXT:    movq %rax, (%rdi)
151; CHECK-O0-NEXT:    retq
152;
153; CHECK-O3-LABEL: narrow_writeback_xor:
154; CHECK-O3:       # %bb.0:
155; CHECK-O3-NEXT:    xorq $7, (%rdi)
156; CHECK-O3-NEXT:    retq
157  %v = load atomic i64, i64* %ptr unordered, align 8
158  %v.new = xor i64 %v, 7
159  store atomic i64 %v.new, i64* %ptr unordered, align 8
160  ret void
161}
162
163;; Next batch of tests are exercising cases where store widening would
164;; improve codegeneration.  Note that widening is only legal if the
165;; resulting type would be atomic.  Each tests has a well aligned, and
166;; unaligned variant to ensure we get correct codegen here.
167;; Note: It's not a legality issue, but there's a gotcha here to be aware
168;; of.  Once we widen a pair of atomic stores, we loose the information
169;; that the original atomicity requirement was half the width.  Given that,
170;; we can't then split the load again.  This challenges our usual iterative
171;; approach to incremental improvement.
172
173; Legal if wider type is also atomic (TODO)
174define void @widen_store(i32* %p0, i32 %v1, i32 %v2) {
175; CHECK-LABEL: widen_store:
176; CHECK:       # %bb.0:
177; CHECK-NEXT:    movl %esi, (%rdi)
178; CHECK-NEXT:    movl %edx, 4(%rdi)
179; CHECK-NEXT:    retq
180  %p1 = getelementptr i32, i32* %p0, i64 1
181  store atomic i32 %v1, i32* %p0 unordered, align 8
182  store atomic i32 %v2, i32* %p1 unordered, align 4
183  ret void
184}
185
186; This one is *NOT* legal to widen.  With weaker alignment,
187; the wider type might cross a cache line and violate the
188; atomicity requirement.
189define void @widen_store_unaligned(i32* %p0, i32 %v1, i32 %v2) {
190; CHECK-LABEL: widen_store_unaligned:
191; CHECK:       # %bb.0:
192; CHECK-NEXT:    movl %esi, (%rdi)
193; CHECK-NEXT:    movl %edx, 4(%rdi)
194; CHECK-NEXT:    retq
195  %p1 = getelementptr i32, i32* %p0, i64 1
196  store atomic i32 %v1, i32* %p0 unordered, align 4
197  store atomic i32 %v2, i32* %p1 unordered, align 4
198  ret void
199}
200
201; Legal if wider type is also atomic (TODO)
202define void @widen_broadcast(i32* %p0, i32 %v) {
203; CHECK-LABEL: widen_broadcast:
204; CHECK:       # %bb.0:
205; CHECK-NEXT:    movl %esi, (%rdi)
206; CHECK-NEXT:    movl %esi, 4(%rdi)
207; CHECK-NEXT:    retq
208  %p1 = getelementptr i32, i32* %p0, i64 1
209  store atomic i32 %v, i32* %p0 unordered, align 8
210  store atomic i32 %v, i32* %p1 unordered, align 4
211  ret void
212}
213
214; Not legal to widen due to alignment restriction
215define void @widen_broadcast_unaligned(i32* %p0, i32 %v) {
216; CHECK-LABEL: widen_broadcast_unaligned:
217; CHECK:       # %bb.0:
218; CHECK-NEXT:    movl %esi, (%rdi)
219; CHECK-NEXT:    movl %esi, 4(%rdi)
220; CHECK-NEXT:    retq
221  %p1 = getelementptr i32, i32* %p0, i64 1
222  store atomic i32 %v, i32* %p0 unordered, align 4
223  store atomic i32 %v, i32* %p1 unordered, align 4
224  ret void
225}
226
227define i128 @load_i128(i128* %ptr) {
228; CHECK-O0-LABEL: load_i128:
229; CHECK-O0:       # %bb.0:
230; CHECK-O0-NEXT:    pushq %rbx
231; CHECK-O0-NEXT:    .cfi_def_cfa_offset 16
232; CHECK-O0-NEXT:    .cfi_offset %rbx, -16
233; CHECK-O0-NEXT:    xorl %eax, %eax
234; CHECK-O0-NEXT:    movl %eax, %ebx
235; CHECK-O0-NEXT:    movq %rbx, %rax
236; CHECK-O0-NEXT:    movq %rbx, %rdx
237; CHECK-O0-NEXT:    movq %rbx, %rcx
238; CHECK-O0-NEXT:    lock cmpxchg16b (%rdi)
239; CHECK-O0-NEXT:    popq %rbx
240; CHECK-O0-NEXT:    .cfi_def_cfa_offset 8
241; CHECK-O0-NEXT:    retq
242;
243; CHECK-O3-LABEL: load_i128:
244; CHECK-O3:       # %bb.0:
245; CHECK-O3-NEXT:    pushq %rbx
246; CHECK-O3-NEXT:    .cfi_def_cfa_offset 16
247; CHECK-O3-NEXT:    .cfi_offset %rbx, -16
248; CHECK-O3-NEXT:    xorl %eax, %eax
249; CHECK-O3-NEXT:    xorl %edx, %edx
250; CHECK-O3-NEXT:    xorl %ecx, %ecx
251; CHECK-O3-NEXT:    xorl %ebx, %ebx
252; CHECK-O3-NEXT:    lock cmpxchg16b (%rdi)
253; CHECK-O3-NEXT:    popq %rbx
254; CHECK-O3-NEXT:    .cfi_def_cfa_offset 8
255; CHECK-O3-NEXT:    retq
256  %v = load atomic i128, i128* %ptr unordered, align 16
257  ret i128 %v
258}
259
260define void @store_i128(i128* %ptr, i128 %v) {
261; CHECK-O0-LABEL: store_i128:
262; CHECK-O0:       # %bb.0:
263; CHECK-O0-NEXT:    pushq %rbx
264; CHECK-O0-NEXT:    .cfi_def_cfa_offset 16
265; CHECK-O0-NEXT:    .cfi_offset %rbx, -16
266; CHECK-O0-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
267; CHECK-O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
268; CHECK-O0-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
269; CHECK-O0-NEXT:    movq (%rdi), %rax
270; CHECK-O0-NEXT:    movq 8(%rdi), %rdx
271; CHECK-O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
272; CHECK-O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
273; CHECK-O0-NEXT:    jmp .LBB16_1
274; CHECK-O0-NEXT:  .LBB16_1: # %atomicrmw.start
275; CHECK-O0-NEXT:    # =>This Inner Loop Header: Depth=1
276; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
277; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
278; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
279; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
280; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
281; CHECK-O0-NEXT:    lock cmpxchg16b (%rsi)
282; CHECK-O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
283; CHECK-O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
284; CHECK-O0-NEXT:    jne .LBB16_1
285; CHECK-O0-NEXT:    jmp .LBB16_2
286; CHECK-O0-NEXT:  .LBB16_2: # %atomicrmw.end
287; CHECK-O0-NEXT:    popq %rbx
288; CHECK-O0-NEXT:    .cfi_def_cfa_offset 8
289; CHECK-O0-NEXT:    retq
290;
291; CHECK-O3-LABEL: store_i128:
292; CHECK-O3:       # %bb.0:
293; CHECK-O3-NEXT:    pushq %rbx
294; CHECK-O3-NEXT:    .cfi_def_cfa_offset 16
295; CHECK-O3-NEXT:    .cfi_offset %rbx, -16
296; CHECK-O3-NEXT:    movq %rdx, %rcx
297; CHECK-O3-NEXT:    movq %rsi, %rbx
298; CHECK-O3-NEXT:    movq (%rdi), %rax
299; CHECK-O3-NEXT:    movq 8(%rdi), %rdx
300; CHECK-O3-NEXT:    .p2align 4, 0x90
301; CHECK-O3-NEXT:  .LBB16_1: # %atomicrmw.start
302; CHECK-O3-NEXT:    # =>This Inner Loop Header: Depth=1
303; CHECK-O3-NEXT:    lock cmpxchg16b (%rdi)
304; CHECK-O3-NEXT:    jne .LBB16_1
305; CHECK-O3-NEXT:  # %bb.2: # %atomicrmw.end
306; CHECK-O3-NEXT:    popq %rbx
307; CHECK-O3-NEXT:    .cfi_def_cfa_offset 8
308; CHECK-O3-NEXT:    retq
309  store atomic i128 %v, i128* %ptr unordered, align 16
310  ret void
311}
312
313define i256 @load_i256(i256* %ptr) {
314; CHECK-O0-LABEL: load_i256:
315; CHECK-O0:       # %bb.0:
316; CHECK-O0-NEXT:    subq $56, %rsp
317; CHECK-O0-NEXT:    .cfi_def_cfa_offset 64
318; CHECK-O0-NEXT:    movq %rdi, %rax
319; CHECK-O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
320; CHECK-O0-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
321; CHECK-O0-NEXT:    movl $32, %edi
322; CHECK-O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
323; CHECK-O0-NEXT:    xorl %ecx, %ecx
324; CHECK-O0-NEXT:    callq __atomic_load@PLT
325; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
326; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
327; CHECK-O0-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
328; CHECK-O0-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
329; CHECK-O0-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
330; CHECK-O0-NEXT:    movq {{[0-9]+}}(%rsp), %r8
331; CHECK-O0-NEXT:    movq %r8, 24(%rdi)
332; CHECK-O0-NEXT:    movq %rsi, 16(%rdi)
333; CHECK-O0-NEXT:    movq %rdx, 8(%rdi)
334; CHECK-O0-NEXT:    movq %rcx, (%rdi)
335; CHECK-O0-NEXT:    addq $56, %rsp
336; CHECK-O0-NEXT:    .cfi_def_cfa_offset 8
337; CHECK-O0-NEXT:    retq
338;
339; CHECK-O3-LABEL: load_i256:
340; CHECK-O3:       # %bb.0:
341; CHECK-O3-NEXT:    pushq %rbx
342; CHECK-O3-NEXT:    .cfi_def_cfa_offset 16
343; CHECK-O3-NEXT:    subq $32, %rsp
344; CHECK-O3-NEXT:    .cfi_def_cfa_offset 48
345; CHECK-O3-NEXT:    .cfi_offset %rbx, -16
346; CHECK-O3-NEXT:    movq %rdi, %rbx
347; CHECK-O3-NEXT:    movq %rsp, %rdx
348; CHECK-O3-NEXT:    movl $32, %edi
349; CHECK-O3-NEXT:    xorl %ecx, %ecx
350; CHECK-O3-NEXT:    callq __atomic_load@PLT
351; CHECK-O3-NEXT:    vmovups (%rsp), %ymm0
352; CHECK-O3-NEXT:    vmovups %ymm0, (%rbx)
353; CHECK-O3-NEXT:    movq %rbx, %rax
354; CHECK-O3-NEXT:    addq $32, %rsp
355; CHECK-O3-NEXT:    .cfi_def_cfa_offset 16
356; CHECK-O3-NEXT:    popq %rbx
357; CHECK-O3-NEXT:    .cfi_def_cfa_offset 8
358; CHECK-O3-NEXT:    vzeroupper
359; CHECK-O3-NEXT:    retq
360  %v = load atomic i256, i256* %ptr unordered, align 16
361  ret i256 %v
362}
363
364define void @store_i256(i256* %ptr, i256 %v) {
365; CHECK-O0-LABEL: store_i256:
366; CHECK-O0:       # %bb.0:
367; CHECK-O0-NEXT:    subq $40, %rsp
368; CHECK-O0-NEXT:    .cfi_def_cfa_offset 48
369; CHECK-O0-NEXT:    movq %rdx, %rax
370; CHECK-O0-NEXT:    movq %rsi, (%rsp) # 8-byte Spill
371; CHECK-O0-NEXT:    movq %rdi, %rsi
372; CHECK-O0-NEXT:    movq (%rsp), %rdi # 8-byte Reload
373; CHECK-O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
374; CHECK-O0-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
375; CHECK-O0-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
376; CHECK-O0-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
377; CHECK-O0-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
378; CHECK-O0-NEXT:    movl $32, %edi
379; CHECK-O0-NEXT:    xorl %ecx, %ecx
380; CHECK-O0-NEXT:    callq __atomic_store@PLT
381; CHECK-O0-NEXT:    addq $40, %rsp
382; CHECK-O0-NEXT:    .cfi_def_cfa_offset 8
383; CHECK-O0-NEXT:    retq
384;
385; CHECK-O3-LABEL: store_i256:
386; CHECK-O3:       # %bb.0:
387; CHECK-O3-NEXT:    subq $40, %rsp
388; CHECK-O3-NEXT:    .cfi_def_cfa_offset 48
389; CHECK-O3-NEXT:    movq %rdi, %rax
390; CHECK-O3-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
391; CHECK-O3-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
392; CHECK-O3-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
393; CHECK-O3-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
394; CHECK-O3-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
395; CHECK-O3-NEXT:    movl $32, %edi
396; CHECK-O3-NEXT:    movq %rax, %rsi
397; CHECK-O3-NEXT:    xorl %ecx, %ecx
398; CHECK-O3-NEXT:    callq __atomic_store@PLT
399; CHECK-O3-NEXT:    addq $40, %rsp
400; CHECK-O3-NEXT:    .cfi_def_cfa_offset 8
401; CHECK-O3-NEXT:    retq
402  store atomic i256 %v, i256* %ptr unordered, align 16
403  ret void
404}
405
406; Legal if wider type is also atomic (TODO)
407define void @vec_store(i32* %p0, <2 x i32> %vec) {
408; CHECK-O0-CUR-LABEL: vec_store:
409; CHECK-O0-CUR:       # %bb.0:
410; CHECK-O0-CUR-NEXT:    vmovd %xmm0, %ecx
411; CHECK-O0-CUR-NEXT:    vpextrd $1, %xmm0, %eax
412; CHECK-O0-CUR-NEXT:    movl %ecx, (%rdi)
413; CHECK-O0-CUR-NEXT:    movl %eax, 4(%rdi)
414; CHECK-O0-CUR-NEXT:    retq
415;
416; CHECK-O3-CUR-LABEL: vec_store:
417; CHECK-O3-CUR:       # %bb.0:
418; CHECK-O3-CUR-NEXT:    vmovd %xmm0, %eax
419; CHECK-O3-CUR-NEXT:    vpextrd $1, %xmm0, %ecx
420; CHECK-O3-CUR-NEXT:    movl %eax, (%rdi)
421; CHECK-O3-CUR-NEXT:    movl %ecx, 4(%rdi)
422; CHECK-O3-CUR-NEXT:    retq
423;
424; CHECK-O0-EX-LABEL: vec_store:
425; CHECK-O0-EX:       # %bb.0:
426; CHECK-O0-EX-NEXT:    vmovd %xmm0, (%rdi)
427; CHECK-O0-EX-NEXT:    vpextrd $1, %xmm0, 4(%rdi)
428; CHECK-O0-EX-NEXT:    retq
429;
430; CHECK-O3-EX-LABEL: vec_store:
431; CHECK-O3-EX:       # %bb.0:
432; CHECK-O3-EX-NEXT:    vmovss %xmm0, (%rdi)
433; CHECK-O3-EX-NEXT:    vextractps $1, %xmm0, 4(%rdi)
434; CHECK-O3-EX-NEXT:    retq
435  %v1 = extractelement <2 x i32> %vec, i32 0
436  %v2 = extractelement <2 x i32> %vec, i32 1
437  %p1 = getelementptr i32, i32* %p0, i64 1
438  store atomic i32 %v1, i32* %p0 unordered, align 8
439  store atomic i32 %v2, i32* %p1 unordered, align 4
440  ret void
441}
442
443; Not legal to widen due to alignment restriction
444define void @vec_store_unaligned(i32* %p0, <2 x i32> %vec) {
445; CHECK-O0-CUR-LABEL: vec_store_unaligned:
446; CHECK-O0-CUR:       # %bb.0:
447; CHECK-O0-CUR-NEXT:    vmovd %xmm0, %ecx
448; CHECK-O0-CUR-NEXT:    vpextrd $1, %xmm0, %eax
449; CHECK-O0-CUR-NEXT:    movl %ecx, (%rdi)
450; CHECK-O0-CUR-NEXT:    movl %eax, 4(%rdi)
451; CHECK-O0-CUR-NEXT:    retq
452;
453; CHECK-O3-CUR-LABEL: vec_store_unaligned:
454; CHECK-O3-CUR:       # %bb.0:
455; CHECK-O3-CUR-NEXT:    vmovd %xmm0, %eax
456; CHECK-O3-CUR-NEXT:    vpextrd $1, %xmm0, %ecx
457; CHECK-O3-CUR-NEXT:    movl %eax, (%rdi)
458; CHECK-O3-CUR-NEXT:    movl %ecx, 4(%rdi)
459; CHECK-O3-CUR-NEXT:    retq
460;
461; CHECK-O0-EX-LABEL: vec_store_unaligned:
462; CHECK-O0-EX:       # %bb.0:
463; CHECK-O0-EX-NEXT:    vmovd %xmm0, (%rdi)
464; CHECK-O0-EX-NEXT:    vpextrd $1, %xmm0, 4(%rdi)
465; CHECK-O0-EX-NEXT:    retq
466;
467; CHECK-O3-EX-LABEL: vec_store_unaligned:
468; CHECK-O3-EX:       # %bb.0:
469; CHECK-O3-EX-NEXT:    vmovss %xmm0, (%rdi)
470; CHECK-O3-EX-NEXT:    vextractps $1, %xmm0, 4(%rdi)
471; CHECK-O3-EX-NEXT:    retq
472  %v1 = extractelement <2 x i32> %vec, i32 0
473  %v2 = extractelement <2 x i32> %vec, i32 1
474  %p1 = getelementptr i32, i32* %p0, i64 1
475  store atomic i32 %v1, i32* %p0 unordered, align 4
476  store atomic i32 %v2, i32* %p1 unordered, align 4
477  ret void
478}
479
480
481
482; Legal if wider type is also atomic (TODO)
483; Also, can avoid register move from xmm to eax (TODO)
484define void @widen_broadcast2(i32* %p0, <2 x i32> %vec) {
485; CHECK-O0-CUR-LABEL: widen_broadcast2:
486; CHECK-O0-CUR:       # %bb.0:
487; CHECK-O0-CUR-NEXT:    vmovd %xmm0, %eax
488; CHECK-O0-CUR-NEXT:    movl %eax, (%rdi)
489; CHECK-O0-CUR-NEXT:    movl %eax, 4(%rdi)
490; CHECK-O0-CUR-NEXT:    retq
491;
492; CHECK-O3-CUR-LABEL: widen_broadcast2:
493; CHECK-O3-CUR:       # %bb.0:
494; CHECK-O3-CUR-NEXT:    vmovd %xmm0, %eax
495; CHECK-O3-CUR-NEXT:    movl %eax, (%rdi)
496; CHECK-O3-CUR-NEXT:    movl %eax, 4(%rdi)
497; CHECK-O3-CUR-NEXT:    retq
498;
499; CHECK-O0-EX-LABEL: widen_broadcast2:
500; CHECK-O0-EX:       # %bb.0:
501; CHECK-O0-EX-NEXT:    vmovd %xmm0, (%rdi)
502; CHECK-O0-EX-NEXT:    vmovd %xmm0, 4(%rdi)
503; CHECK-O0-EX-NEXT:    retq
504;
505; CHECK-O3-EX-LABEL: widen_broadcast2:
506; CHECK-O3-EX:       # %bb.0:
507; CHECK-O3-EX-NEXT:    vmovss %xmm0, (%rdi)
508; CHECK-O3-EX-NEXT:    vmovss %xmm0, 4(%rdi)
509; CHECK-O3-EX-NEXT:    retq
510  %v1 = extractelement <2 x i32> %vec, i32 0
511  %p1 = getelementptr i32, i32* %p0, i64 1
512  store atomic i32 %v1, i32* %p0 unordered, align 8
513  store atomic i32 %v1, i32* %p1 unordered, align 4
514  ret void
515}
516
517; Not legal to widen due to alignment restriction
518define void @widen_broadcast2_unaligned(i32* %p0, <2 x i32> %vec) {
519; CHECK-O0-CUR-LABEL: widen_broadcast2_unaligned:
520; CHECK-O0-CUR:       # %bb.0:
521; CHECK-O0-CUR-NEXT:    vmovd %xmm0, %eax
522; CHECK-O0-CUR-NEXT:    movl %eax, (%rdi)
523; CHECK-O0-CUR-NEXT:    movl %eax, 4(%rdi)
524; CHECK-O0-CUR-NEXT:    retq
525;
526; CHECK-O3-CUR-LABEL: widen_broadcast2_unaligned:
527; CHECK-O3-CUR:       # %bb.0:
528; CHECK-O3-CUR-NEXT:    vmovd %xmm0, %eax
529; CHECK-O3-CUR-NEXT:    movl %eax, (%rdi)
530; CHECK-O3-CUR-NEXT:    movl %eax, 4(%rdi)
531; CHECK-O3-CUR-NEXT:    retq
532;
533; CHECK-O0-EX-LABEL: widen_broadcast2_unaligned:
534; CHECK-O0-EX:       # %bb.0:
535; CHECK-O0-EX-NEXT:    vmovd %xmm0, (%rdi)
536; CHECK-O0-EX-NEXT:    vmovd %xmm0, 4(%rdi)
537; CHECK-O0-EX-NEXT:    retq
538;
539; CHECK-O3-EX-LABEL: widen_broadcast2_unaligned:
540; CHECK-O3-EX:       # %bb.0:
541; CHECK-O3-EX-NEXT:    vmovss %xmm0, (%rdi)
542; CHECK-O3-EX-NEXT:    vmovss %xmm0, 4(%rdi)
543; CHECK-O3-EX-NEXT:    retq
544  %v1 = extractelement <2 x i32> %vec, i32 0
545  %p1 = getelementptr i32, i32* %p0, i64 1
546  store atomic i32 %v1, i32* %p0 unordered, align 4
547  store atomic i32 %v1, i32* %p1 unordered, align 4
548  ret void
549}
550
551; Legal if wider type is also atomic (TODO)
552define void @widen_zero_init(i32* %p0, i32 %v1, i32 %v2) {
553; CHECK-LABEL: widen_zero_init:
554; CHECK:       # %bb.0:
555; CHECK-NEXT:    movl $0, (%rdi)
556; CHECK-NEXT:    movl $0, 4(%rdi)
557; CHECK-NEXT:    retq
558  %p1 = getelementptr i32, i32* %p0, i64 1
559  store atomic i32 0, i32* %p0 unordered, align 8
560  store atomic i32 0, i32* %p1 unordered, align 4
561  ret void
562}
563
564; Not legal to widen due to alignment restriction
565define void @widen_zero_init_unaligned(i32* %p0, i32 %v1, i32 %v2) {
566; CHECK-LABEL: widen_zero_init_unaligned:
567; CHECK:       # %bb.0:
568; CHECK-NEXT:    movl $0, (%rdi)
569; CHECK-NEXT:    movl $0, 4(%rdi)
570; CHECK-NEXT:    retq
571  %p1 = getelementptr i32, i32* %p0, i64 1
572  store atomic i32 0, i32* %p0 unordered, align 4
573  store atomic i32 0, i32* %p1 unordered, align 4
574  ret void
575}
576
577;; The next batch of tests are stressing load folding. Folding is legal
578;; on x86, so these are simply checking optimization quality.
579
580; Legal, as expected
581define i64 @load_fold_add1(i64* %p) {
582; CHECK-LABEL: load_fold_add1:
583; CHECK:       # %bb.0:
584; CHECK-NEXT:    movq (%rdi), %rax
585; CHECK-NEXT:    addq $15, %rax
586; CHECK-NEXT:    retq
587  %v = load atomic i64, i64* %p unordered, align 8
588  %ret = add i64 %v, 15
589  ret i64 %ret
590}
591
592define i64 @load_fold_add2(i64* %p, i64 %v2) {
593; CHECK-LABEL: load_fold_add2:
594; CHECK:       # %bb.0:
595; CHECK-NEXT:    movq %rsi, %rax
596; CHECK-NEXT:    addq (%rdi), %rax
597; CHECK-NEXT:    retq
598  %v = load atomic i64, i64* %p unordered, align 8
599  %ret = add i64 %v, %v2
600  ret i64 %ret
601}
602
603define i64 @load_fold_add3(i64* %p1, i64* %p2) {
604; CHECK-O0-LABEL: load_fold_add3:
605; CHECK-O0:       # %bb.0:
606; CHECK-O0-NEXT:    movq (%rdi), %rax
607; CHECK-O0-NEXT:    addq (%rsi), %rax
608; CHECK-O0-NEXT:    retq
609;
610; CHECK-O3-CUR-LABEL: load_fold_add3:
611; CHECK-O3-CUR:       # %bb.0:
612; CHECK-O3-CUR-NEXT:    movq (%rsi), %rax
613; CHECK-O3-CUR-NEXT:    addq (%rdi), %rax
614; CHECK-O3-CUR-NEXT:    retq
615;
616; CHECK-O3-EX-LABEL: load_fold_add3:
617; CHECK-O3-EX:       # %bb.0:
618; CHECK-O3-EX-NEXT:    movq (%rdi), %rax
619; CHECK-O3-EX-NEXT:    addq (%rsi), %rax
620; CHECK-O3-EX-NEXT:    retq
621  %v = load atomic i64, i64* %p1 unordered, align 8
622  %v2 = load atomic i64, i64* %p2 unordered, align 8
623  %ret = add i64 %v, %v2
624  ret i64 %ret
625}
626
627; Legal, as expected
628define i64 @load_fold_sub1(i64* %p) {
629; CHECK-O0-LABEL: load_fold_sub1:
630; CHECK-O0:       # %bb.0:
631; CHECK-O0-NEXT:    movq (%rdi), %rax
632; CHECK-O0-NEXT:    subq $15, %rax
633; CHECK-O0-NEXT:    retq
634;
635; CHECK-O3-LABEL: load_fold_sub1:
636; CHECK-O3:       # %bb.0:
637; CHECK-O3-NEXT:    movq (%rdi), %rax
638; CHECK-O3-NEXT:    addq $-15, %rax
639; CHECK-O3-NEXT:    retq
640  %v = load atomic i64, i64* %p unordered, align 8
641  %ret = sub i64 %v, 15
642  ret i64 %ret
643}
644
645define i64 @load_fold_sub2(i64* %p, i64 %v2) {
646; CHECK-LABEL: load_fold_sub2:
647; CHECK:       # %bb.0:
648; CHECK-NEXT:    movq (%rdi), %rax
649; CHECK-NEXT:    subq %rsi, %rax
650; CHECK-NEXT:    retq
651  %v = load atomic i64, i64* %p unordered, align 8
652  %ret = sub i64 %v, %v2
653  ret i64 %ret
654}
655
656define i64 @load_fold_sub3(i64* %p1, i64* %p2) {
657; CHECK-LABEL: load_fold_sub3:
658; CHECK:       # %bb.0:
659; CHECK-NEXT:    movq (%rdi), %rax
660; CHECK-NEXT:    subq (%rsi), %rax
661; CHECK-NEXT:    retq
662  %v = load atomic i64, i64* %p1 unordered, align 8
663  %v2 = load atomic i64, i64* %p2 unordered, align 8
664  %ret = sub i64 %v, %v2
665  ret i64 %ret
666}
667
668; Legal, as expected
669define i64 @load_fold_mul1(i64* %p) {
670; CHECK-O0-LABEL: load_fold_mul1:
671; CHECK-O0:       # %bb.0:
672; CHECK-O0-NEXT:    imulq $15, (%rdi), %rax
673; CHECK-O0-NEXT:    retq
674;
675; CHECK-O3-LABEL: load_fold_mul1:
676; CHECK-O3:       # %bb.0:
677; CHECK-O3-NEXT:    movq (%rdi), %rax
678; CHECK-O3-NEXT:    leaq (%rax,%rax,4), %rax
679; CHECK-O3-NEXT:    leaq (%rax,%rax,2), %rax
680; CHECK-O3-NEXT:    retq
681  %v = load atomic i64, i64* %p unordered, align 8
682  %ret = mul i64 %v, 15
683  ret i64 %ret
684}
685
686define i64 @load_fold_mul2(i64* %p, i64 %v2) {
687; CHECK-LABEL: load_fold_mul2:
688; CHECK:       # %bb.0:
689; CHECK-NEXT:    movq %rsi, %rax
690; CHECK-NEXT:    imulq (%rdi), %rax
691; CHECK-NEXT:    retq
692  %v = load atomic i64, i64* %p unordered, align 8
693  %ret = mul i64 %v, %v2
694  ret i64 %ret
695}
696
697define i64 @load_fold_mul3(i64* %p1, i64* %p2) {
698; CHECK-O0-LABEL: load_fold_mul3:
699; CHECK-O0:       # %bb.0:
700; CHECK-O0-NEXT:    movq (%rdi), %rax
701; CHECK-O0-NEXT:    imulq (%rsi), %rax
702; CHECK-O0-NEXT:    retq
703;
704; CHECK-O3-CUR-LABEL: load_fold_mul3:
705; CHECK-O3-CUR:       # %bb.0:
706; CHECK-O3-CUR-NEXT:    movq (%rsi), %rax
707; CHECK-O3-CUR-NEXT:    imulq (%rdi), %rax
708; CHECK-O3-CUR-NEXT:    retq
709;
710; CHECK-O3-EX-LABEL: load_fold_mul3:
711; CHECK-O3-EX:       # %bb.0:
712; CHECK-O3-EX-NEXT:    movq (%rdi), %rax
713; CHECK-O3-EX-NEXT:    imulq (%rsi), %rax
714; CHECK-O3-EX-NEXT:    retq
715  %v = load atomic i64, i64* %p1 unordered, align 8
716  %v2 = load atomic i64, i64* %p2 unordered, align 8
717  %ret = mul i64 %v, %v2
718  ret i64 %ret
719}
720
721; Legal to fold (TODO)
722define i64 @load_fold_sdiv1(i64* %p) {
723; CHECK-O0-LABEL: load_fold_sdiv1:
724; CHECK-O0:       # %bb.0:
725; CHECK-O0-NEXT:    movq (%rdi), %rax
726; CHECK-O0-NEXT:    movl $15, %ecx
727; CHECK-O0-NEXT:    cqto
728; CHECK-O0-NEXT:    idivq %rcx
729; CHECK-O0-NEXT:    retq
730;
731; CHECK-O3-LABEL: load_fold_sdiv1:
732; CHECK-O3:       # %bb.0:
733; CHECK-O3-NEXT:    movq (%rdi), %rcx
734; CHECK-O3-NEXT:    movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
735; CHECK-O3-NEXT:    movq %rcx, %rax
736; CHECK-O3-NEXT:    imulq %rdx
737; CHECK-O3-NEXT:    addq %rdx, %rcx
738; CHECK-O3-NEXT:    movq %rcx, %rax
739; CHECK-O3-NEXT:    shrq $63, %rax
740; CHECK-O3-NEXT:    sarq $3, %rcx
741; CHECK-O3-NEXT:    addq %rax, %rcx
742; CHECK-O3-NEXT:    movq %rcx, %rax
743; CHECK-O3-NEXT:    retq
744  %v = load atomic i64, i64* %p unordered, align 8
745  %ret = sdiv i64 %v, 15
746  ret i64 %ret
747}
748
749; Legal to fold (TODO)
750define i64 @load_fold_sdiv2(i64* %p, i64 %v2) {
751; CHECK-O0-LABEL: load_fold_sdiv2:
752; CHECK-O0:       # %bb.0:
753; CHECK-O0-NEXT:    movq (%rdi), %rax
754; CHECK-O0-NEXT:    cqto
755; CHECK-O0-NEXT:    idivq %rsi
756; CHECK-O0-NEXT:    retq
757;
758; CHECK-O3-LABEL: load_fold_sdiv2:
759; CHECK-O3:       # %bb.0:
760; CHECK-O3-NEXT:    movq (%rdi), %rax
761; CHECK-O3-NEXT:    movq %rax, %rcx
762; CHECK-O3-NEXT:    orq %rsi, %rcx
763; CHECK-O3-NEXT:    shrq $32, %rcx
764; CHECK-O3-NEXT:    je .LBB35_1
765; CHECK-O3-NEXT:  # %bb.2:
766; CHECK-O3-NEXT:    cqto
767; CHECK-O3-NEXT:    idivq %rsi
768; CHECK-O3-NEXT:    retq
769; CHECK-O3-NEXT:  .LBB35_1:
770; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
771; CHECK-O3-NEXT:    xorl %edx, %edx
772; CHECK-O3-NEXT:    divl %esi
773; CHECK-O3-NEXT:    # kill: def $eax killed $eax def $rax
774; CHECK-O3-NEXT:    retq
775  %v = load atomic i64, i64* %p unordered, align 8
776  %ret = sdiv i64 %v, %v2
777  ret i64 %ret
778}
779
780define i64 @load_fold_sdiv3(i64* %p1, i64* %p2) {
781; CHECK-O0-LABEL: load_fold_sdiv3:
782; CHECK-O0:       # %bb.0:
783; CHECK-O0-NEXT:    movq (%rdi), %rax
784; CHECK-O0-NEXT:    cqto
785; CHECK-O0-NEXT:    idivq (%rsi)
786; CHECK-O0-NEXT:    retq
787;
788; CHECK-O3-LABEL: load_fold_sdiv3:
789; CHECK-O3:       # %bb.0:
790; CHECK-O3-NEXT:    movq (%rdi), %rax
791; CHECK-O3-NEXT:    movq (%rsi), %rcx
792; CHECK-O3-NEXT:    movq %rax, %rdx
793; CHECK-O3-NEXT:    orq %rcx, %rdx
794; CHECK-O3-NEXT:    shrq $32, %rdx
795; CHECK-O3-NEXT:    je .LBB36_1
796; CHECK-O3-NEXT:  # %bb.2:
797; CHECK-O3-NEXT:    cqto
798; CHECK-O3-NEXT:    idivq %rcx
799; CHECK-O3-NEXT:    retq
800; CHECK-O3-NEXT:  .LBB36_1:
801; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
802; CHECK-O3-NEXT:    xorl %edx, %edx
803; CHECK-O3-NEXT:    divl %ecx
804; CHECK-O3-NEXT:    # kill: def $eax killed $eax def $rax
805; CHECK-O3-NEXT:    retq
806  %v = load atomic i64, i64* %p1 unordered, align 8
807  %v2 = load atomic i64, i64* %p2 unordered, align 8
808  %ret = sdiv i64 %v, %v2
809  ret i64 %ret
810}
811
812; Legal to fold (TODO)
813define i64 @load_fold_udiv1(i64* %p) {
814; CHECK-O0-LABEL: load_fold_udiv1:
815; CHECK-O0:       # %bb.0:
816; CHECK-O0-NEXT:    movq (%rdi), %rax
817; CHECK-O0-NEXT:    movl $15, %ecx
818; CHECK-O0-NEXT:    xorl %edx, %edx
819; CHECK-O0-NEXT:    # kill: def $rdx killed $edx
820; CHECK-O0-NEXT:    divq %rcx
821; CHECK-O0-NEXT:    retq
822;
823; CHECK-O3-CUR-LABEL: load_fold_udiv1:
824; CHECK-O3-CUR:       # %bb.0:
825; CHECK-O3-CUR-NEXT:    movq (%rdi), %rdx
826; CHECK-O3-CUR-NEXT:    movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
827; CHECK-O3-CUR-NEXT:    mulxq %rax, %rax, %rax
828; CHECK-O3-CUR-NEXT:    shrq $3, %rax
829; CHECK-O3-CUR-NEXT:    retq
830;
831; CHECK-O3-EX-LABEL: load_fold_udiv1:
832; CHECK-O3-EX:       # %bb.0:
833; CHECK-O3-EX-NEXT:    movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
834; CHECK-O3-EX-NEXT:    mulxq (%rdi), %rax, %rax
835; CHECK-O3-EX-NEXT:    shrq $3, %rax
836; CHECK-O3-EX-NEXT:    retq
837  %v = load atomic i64, i64* %p unordered, align 8
838  %ret = udiv i64 %v, 15
839  ret i64 %ret
840}
841
842define i64 @load_fold_udiv2(i64* %p, i64 %v2) {
843; CHECK-O0-LABEL: load_fold_udiv2:
844; CHECK-O0:       # %bb.0:
845; CHECK-O0-NEXT:    movq (%rdi), %rax
846; CHECK-O0-NEXT:    xorl %ecx, %ecx
847; CHECK-O0-NEXT:    movl %ecx, %edx
848; CHECK-O0-NEXT:    divq %rsi
849; CHECK-O0-NEXT:    retq
850;
851; CHECK-O3-LABEL: load_fold_udiv2:
852; CHECK-O3:       # %bb.0:
853; CHECK-O3-NEXT:    movq (%rdi), %rax
854; CHECK-O3-NEXT:    movq %rax, %rcx
855; CHECK-O3-NEXT:    orq %rsi, %rcx
856; CHECK-O3-NEXT:    shrq $32, %rcx
857; CHECK-O3-NEXT:    je .LBB38_1
858; CHECK-O3-NEXT:  # %bb.2:
859; CHECK-O3-NEXT:    xorl %edx, %edx
860; CHECK-O3-NEXT:    divq %rsi
861; CHECK-O3-NEXT:    retq
862; CHECK-O3-NEXT:  .LBB38_1:
863; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
864; CHECK-O3-NEXT:    xorl %edx, %edx
865; CHECK-O3-NEXT:    divl %esi
866; CHECK-O3-NEXT:    # kill: def $eax killed $eax def $rax
867; CHECK-O3-NEXT:    retq
868  %v = load atomic i64, i64* %p unordered, align 8
869  %ret = udiv i64 %v, %v2
870  ret i64 %ret
871}
872
873define i64 @load_fold_udiv3(i64* %p1, i64* %p2) {
874; CHECK-O0-LABEL: load_fold_udiv3:
875; CHECK-O0:       # %bb.0:
876; CHECK-O0-NEXT:    movq (%rdi), %rax
877; CHECK-O0-NEXT:    xorl %ecx, %ecx
878; CHECK-O0-NEXT:    movl %ecx, %edx
879; CHECK-O0-NEXT:    divq (%rsi)
880; CHECK-O0-NEXT:    retq
881;
882; CHECK-O3-LABEL: load_fold_udiv3:
883; CHECK-O3:       # %bb.0:
884; CHECK-O3-NEXT:    movq (%rdi), %rax
885; CHECK-O3-NEXT:    movq (%rsi), %rcx
886; CHECK-O3-NEXT:    movq %rax, %rdx
887; CHECK-O3-NEXT:    orq %rcx, %rdx
888; CHECK-O3-NEXT:    shrq $32, %rdx
889; CHECK-O3-NEXT:    je .LBB39_1
890; CHECK-O3-NEXT:  # %bb.2:
891; CHECK-O3-NEXT:    xorl %edx, %edx
892; CHECK-O3-NEXT:    divq %rcx
893; CHECK-O3-NEXT:    retq
894; CHECK-O3-NEXT:  .LBB39_1:
895; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
896; CHECK-O3-NEXT:    xorl %edx, %edx
897; CHECK-O3-NEXT:    divl %ecx
898; CHECK-O3-NEXT:    # kill: def $eax killed $eax def $rax
899; CHECK-O3-NEXT:    retq
900  %v = load atomic i64, i64* %p1 unordered, align 8
901  %v2 = load atomic i64, i64* %p2 unordered, align 8
902  %ret = udiv i64 %v, %v2
903  ret i64 %ret
904}
905
906; Legal to fold (TODO)
907define i64 @load_fold_srem1(i64* %p) {
908; CHECK-O0-LABEL: load_fold_srem1:
909; CHECK-O0:       # %bb.0:
910; CHECK-O0-NEXT:    movq (%rdi), %rax
911; CHECK-O0-NEXT:    movl $15, %ecx
912; CHECK-O0-NEXT:    cqto
913; CHECK-O0-NEXT:    idivq %rcx
914; CHECK-O0-NEXT:    movq %rdx, %rax
915; CHECK-O0-NEXT:    retq
916;
917; CHECK-O3-LABEL: load_fold_srem1:
918; CHECK-O3:       # %bb.0:
919; CHECK-O3-NEXT:    movq (%rdi), %rcx
920; CHECK-O3-NEXT:    movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
921; CHECK-O3-NEXT:    movq %rcx, %rax
922; CHECK-O3-NEXT:    imulq %rdx
923; CHECK-O3-NEXT:    addq %rcx, %rdx
924; CHECK-O3-NEXT:    movq %rdx, %rax
925; CHECK-O3-NEXT:    shrq $63, %rax
926; CHECK-O3-NEXT:    sarq $3, %rdx
927; CHECK-O3-NEXT:    addq %rax, %rdx
928; CHECK-O3-NEXT:    leaq (%rdx,%rdx,4), %rax
929; CHECK-O3-NEXT:    leaq (%rax,%rax,2), %rax
930; CHECK-O3-NEXT:    subq %rax, %rcx
931; CHECK-O3-NEXT:    movq %rcx, %rax
932; CHECK-O3-NEXT:    retq
933  %v = load atomic i64, i64* %p unordered, align 8
934  %ret = srem i64 %v, 15
935  ret i64 %ret
936}
937
938; Legal, as expected
939define i64 @load_fold_srem2(i64* %p, i64 %v2) {
940; CHECK-O0-LABEL: load_fold_srem2:
941; CHECK-O0:       # %bb.0:
942; CHECK-O0-NEXT:    movq (%rdi), %rax
943; CHECK-O0-NEXT:    cqto
944; CHECK-O0-NEXT:    idivq %rsi
945; CHECK-O0-NEXT:    movq %rdx, %rax
946; CHECK-O0-NEXT:    retq
947;
948; CHECK-O3-LABEL: load_fold_srem2:
949; CHECK-O3:       # %bb.0:
950; CHECK-O3-NEXT:    movq (%rdi), %rax
951; CHECK-O3-NEXT:    movq %rax, %rcx
952; CHECK-O3-NEXT:    orq %rsi, %rcx
953; CHECK-O3-NEXT:    shrq $32, %rcx
954; CHECK-O3-NEXT:    je .LBB41_1
955; CHECK-O3-NEXT:  # %bb.2:
956; CHECK-O3-NEXT:    cqto
957; CHECK-O3-NEXT:    idivq %rsi
958; CHECK-O3-NEXT:    movq %rdx, %rax
959; CHECK-O3-NEXT:    retq
960; CHECK-O3-NEXT:  .LBB41_1:
961; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
962; CHECK-O3-NEXT:    xorl %edx, %edx
963; CHECK-O3-NEXT:    divl %esi
964; CHECK-O3-NEXT:    movl %edx, %eax
965; CHECK-O3-NEXT:    retq
966  %v = load atomic i64, i64* %p unordered, align 8
967  %ret = srem i64 %v, %v2
968  ret i64 %ret
969}
970
971define i64 @load_fold_srem3(i64* %p1, i64* %p2) {
972; CHECK-O0-LABEL: load_fold_srem3:
973; CHECK-O0:       # %bb.0:
974; CHECK-O0-NEXT:    movq (%rdi), %rax
975; CHECK-O0-NEXT:    cqto
976; CHECK-O0-NEXT:    idivq (%rsi)
977; CHECK-O0-NEXT:    movq %rdx, %rax
978; CHECK-O0-NEXT:    retq
979;
980; CHECK-O3-LABEL: load_fold_srem3:
981; CHECK-O3:       # %bb.0:
982; CHECK-O3-NEXT:    movq (%rdi), %rax
983; CHECK-O3-NEXT:    movq (%rsi), %rcx
984; CHECK-O3-NEXT:    movq %rax, %rdx
985; CHECK-O3-NEXT:    orq %rcx, %rdx
986; CHECK-O3-NEXT:    shrq $32, %rdx
987; CHECK-O3-NEXT:    je .LBB42_1
988; CHECK-O3-NEXT:  # %bb.2:
989; CHECK-O3-NEXT:    cqto
990; CHECK-O3-NEXT:    idivq %rcx
991; CHECK-O3-NEXT:    movq %rdx, %rax
992; CHECK-O3-NEXT:    retq
993; CHECK-O3-NEXT:  .LBB42_1:
994; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
995; CHECK-O3-NEXT:    xorl %edx, %edx
996; CHECK-O3-NEXT:    divl %ecx
997; CHECK-O3-NEXT:    movl %edx, %eax
998; CHECK-O3-NEXT:    retq
999  %v = load atomic i64, i64* %p1 unordered, align 8
1000  %v2 = load atomic i64, i64* %p2 unordered, align 8
1001  %ret = srem i64 %v, %v2
1002  ret i64 %ret
1003}
1004
1005; Legal to fold (TODO)
1006define i64 @load_fold_urem1(i64* %p) {
1007; CHECK-O0-LABEL: load_fold_urem1:
1008; CHECK-O0:       # %bb.0:
1009; CHECK-O0-NEXT:    movq (%rdi), %rax
1010; CHECK-O0-NEXT:    movl $15, %ecx
1011; CHECK-O0-NEXT:    xorl %edx, %edx
1012; CHECK-O0-NEXT:    # kill: def $rdx killed $edx
1013; CHECK-O0-NEXT:    divq %rcx
1014; CHECK-O0-NEXT:    movq %rdx, %rax
1015; CHECK-O0-NEXT:    retq
1016;
1017; CHECK-O3-LABEL: load_fold_urem1:
1018; CHECK-O3:       # %bb.0:
1019; CHECK-O3-NEXT:    movq (%rdi), %rax
1020; CHECK-O3-NEXT:    movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
1021; CHECK-O3-NEXT:    movq %rax, %rdx
1022; CHECK-O3-NEXT:    mulxq %rcx, %rcx, %rcx
1023; CHECK-O3-NEXT:    shrq $3, %rcx
1024; CHECK-O3-NEXT:    leaq (%rcx,%rcx,4), %rcx
1025; CHECK-O3-NEXT:    leaq (%rcx,%rcx,2), %rcx
1026; CHECK-O3-NEXT:    subq %rcx, %rax
1027; CHECK-O3-NEXT:    retq
1028  %v = load atomic i64, i64* %p unordered, align 8
1029  %ret = urem i64 %v, 15
1030  ret i64 %ret
1031}
1032
1033; Legal, as expected
1034define i64 @load_fold_urem2(i64* %p, i64 %v2) {
1035; CHECK-O0-LABEL: load_fold_urem2:
1036; CHECK-O0:       # %bb.0:
1037; CHECK-O0-NEXT:    movq (%rdi), %rax
1038; CHECK-O0-NEXT:    xorl %ecx, %ecx
1039; CHECK-O0-NEXT:    movl %ecx, %edx
1040; CHECK-O0-NEXT:    divq %rsi
1041; CHECK-O0-NEXT:    movq %rdx, %rax
1042; CHECK-O0-NEXT:    retq
1043;
1044; CHECK-O3-LABEL: load_fold_urem2:
1045; CHECK-O3:       # %bb.0:
1046; CHECK-O3-NEXT:    movq (%rdi), %rax
1047; CHECK-O3-NEXT:    movq %rax, %rcx
1048; CHECK-O3-NEXT:    orq %rsi, %rcx
1049; CHECK-O3-NEXT:    shrq $32, %rcx
1050; CHECK-O3-NEXT:    je .LBB44_1
1051; CHECK-O3-NEXT:  # %bb.2:
1052; CHECK-O3-NEXT:    xorl %edx, %edx
1053; CHECK-O3-NEXT:    divq %rsi
1054; CHECK-O3-NEXT:    movq %rdx, %rax
1055; CHECK-O3-NEXT:    retq
1056; CHECK-O3-NEXT:  .LBB44_1:
1057; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
1058; CHECK-O3-NEXT:    xorl %edx, %edx
1059; CHECK-O3-NEXT:    divl %esi
1060; CHECK-O3-NEXT:    movl %edx, %eax
1061; CHECK-O3-NEXT:    retq
1062  %v = load atomic i64, i64* %p unordered, align 8
1063  %ret = urem i64 %v, %v2
1064  ret i64 %ret
1065}
1066
1067define i64 @load_fold_urem3(i64* %p1, i64* %p2) {
1068; CHECK-O0-LABEL: load_fold_urem3:
1069; CHECK-O0:       # %bb.0:
1070; CHECK-O0-NEXT:    movq (%rdi), %rax
1071; CHECK-O0-NEXT:    xorl %ecx, %ecx
1072; CHECK-O0-NEXT:    movl %ecx, %edx
1073; CHECK-O0-NEXT:    divq (%rsi)
1074; CHECK-O0-NEXT:    movq %rdx, %rax
1075; CHECK-O0-NEXT:    retq
1076;
1077; CHECK-O3-LABEL: load_fold_urem3:
1078; CHECK-O3:       # %bb.0:
1079; CHECK-O3-NEXT:    movq (%rdi), %rax
1080; CHECK-O3-NEXT:    movq (%rsi), %rcx
1081; CHECK-O3-NEXT:    movq %rax, %rdx
1082; CHECK-O3-NEXT:    orq %rcx, %rdx
1083; CHECK-O3-NEXT:    shrq $32, %rdx
1084; CHECK-O3-NEXT:    je .LBB45_1
1085; CHECK-O3-NEXT:  # %bb.2:
1086; CHECK-O3-NEXT:    xorl %edx, %edx
1087; CHECK-O3-NEXT:    divq %rcx
1088; CHECK-O3-NEXT:    movq %rdx, %rax
1089; CHECK-O3-NEXT:    retq
1090; CHECK-O3-NEXT:  .LBB45_1:
1091; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
1092; CHECK-O3-NEXT:    xorl %edx, %edx
1093; CHECK-O3-NEXT:    divl %ecx
1094; CHECK-O3-NEXT:    movl %edx, %eax
1095; CHECK-O3-NEXT:    retq
1096  %v = load atomic i64, i64* %p1 unordered, align 8
1097  %v2 = load atomic i64, i64* %p2 unordered, align 8
1098  %ret = urem i64 %v, %v2
1099  ret i64 %ret
1100}
1101
1102; Legal, as expected
1103define i64 @load_fold_shl1(i64* %p) {
1104; CHECK-LABEL: load_fold_shl1:
1105; CHECK:       # %bb.0:
1106; CHECK-NEXT:    movq (%rdi), %rax
1107; CHECK-NEXT:    shlq $15, %rax
1108; CHECK-NEXT:    retq
1109  %v = load atomic i64, i64* %p unordered, align 8
1110  %ret = shl i64 %v, 15
1111  ret i64 %ret
1112}
1113
1114define i64 @load_fold_shl2(i64* %p, i64 %v2) {
1115; CHECK-O0-LABEL: load_fold_shl2:
1116; CHECK-O0:       # %bb.0:
1117; CHECK-O0-NEXT:    movq %rsi, %rcx
1118; CHECK-O0-NEXT:    movq (%rdi), %rax
1119; CHECK-O0-NEXT:    # kill: def $cl killed $rcx
1120; CHECK-O0-NEXT:    shlq %cl, %rax
1121; CHECK-O0-NEXT:    retq
1122;
1123; CHECK-O3-LABEL: load_fold_shl2:
1124; CHECK-O3:       # %bb.0:
1125; CHECK-O3-NEXT:    shlxq %rsi, (%rdi), %rax
1126; CHECK-O3-NEXT:    retq
1127  %v = load atomic i64, i64* %p unordered, align 8
1128  %ret = shl i64 %v, %v2
1129  ret i64 %ret
1130}
1131
1132define i64 @load_fold_shl3(i64* %p1, i64* %p2) {
1133; CHECK-O0-LABEL: load_fold_shl3:
1134; CHECK-O0:       # %bb.0:
1135; CHECK-O0-NEXT:    movq (%rdi), %rax
1136; CHECK-O0-NEXT:    movq (%rsi), %rcx
1137; CHECK-O0-NEXT:    # kill: def $cl killed $rcx
1138; CHECK-O0-NEXT:    shlq %cl, %rax
1139; CHECK-O0-NEXT:    retq
1140;
1141; CHECK-O3-LABEL: load_fold_shl3:
1142; CHECK-O3:       # %bb.0:
1143; CHECK-O3-NEXT:    movq (%rsi), %rax
1144; CHECK-O3-NEXT:    shlxq %rax, (%rdi), %rax
1145; CHECK-O3-NEXT:    retq
1146  %v = load atomic i64, i64* %p1 unordered, align 8
1147  %v2 = load atomic i64, i64* %p2 unordered, align 8
1148  %ret = shl i64 %v, %v2
1149  ret i64 %ret
1150}
1151
1152; Legal, as expected
1153define i64 @load_fold_lshr1(i64* %p) {
1154; CHECK-LABEL: load_fold_lshr1:
1155; CHECK:       # %bb.0:
1156; CHECK-NEXT:    movq (%rdi), %rax
1157; CHECK-NEXT:    shrq $15, %rax
1158; CHECK-NEXT:    retq
1159  %v = load atomic i64, i64* %p unordered, align 8
1160  %ret = lshr i64 %v, 15
1161  ret i64 %ret
1162}
1163
1164define i64 @load_fold_lshr2(i64* %p, i64 %v2) {
1165; CHECK-O0-LABEL: load_fold_lshr2:
1166; CHECK-O0:       # %bb.0:
1167; CHECK-O0-NEXT:    movq %rsi, %rcx
1168; CHECK-O0-NEXT:    movq (%rdi), %rax
1169; CHECK-O0-NEXT:    # kill: def $cl killed $rcx
1170; CHECK-O0-NEXT:    shrq %cl, %rax
1171; CHECK-O0-NEXT:    retq
1172;
1173; CHECK-O3-LABEL: load_fold_lshr2:
1174; CHECK-O3:       # %bb.0:
1175; CHECK-O3-NEXT:    shrxq %rsi, (%rdi), %rax
1176; CHECK-O3-NEXT:    retq
1177  %v = load atomic i64, i64* %p unordered, align 8
1178  %ret = lshr i64 %v, %v2
1179  ret i64 %ret
1180}
1181
1182define i64 @load_fold_lshr3(i64* %p1, i64* %p2) {
1183; CHECK-O0-LABEL: load_fold_lshr3:
1184; CHECK-O0:       # %bb.0:
1185; CHECK-O0-NEXT:    movq (%rdi), %rax
1186; CHECK-O0-NEXT:    movq (%rsi), %rcx
1187; CHECK-O0-NEXT:    # kill: def $cl killed $rcx
1188; CHECK-O0-NEXT:    shrq %cl, %rax
1189; CHECK-O0-NEXT:    retq
1190;
1191; CHECK-O3-LABEL: load_fold_lshr3:
1192; CHECK-O3:       # %bb.0:
1193; CHECK-O3-NEXT:    movq (%rsi), %rax
1194; CHECK-O3-NEXT:    shrxq %rax, (%rdi), %rax
1195; CHECK-O3-NEXT:    retq
1196  %v = load atomic i64, i64* %p1 unordered, align 8
1197  %v2 = load atomic i64, i64* %p2 unordered, align 8
1198  %ret = lshr i64 %v, %v2
1199  ret i64 %ret
1200}
1201
1202; Legal, as expected
1203define i64 @load_fold_ashr1(i64* %p) {
1204; CHECK-LABEL: load_fold_ashr1:
1205; CHECK:       # %bb.0:
1206; CHECK-NEXT:    movq (%rdi), %rax
1207; CHECK-NEXT:    sarq $15, %rax
1208; CHECK-NEXT:    retq
1209  %v = load atomic i64, i64* %p unordered, align 8
1210  %ret = ashr i64 %v, 15
1211  ret i64 %ret
1212}
1213
1214define i64 @load_fold_ashr2(i64* %p, i64 %v2) {
1215; CHECK-O0-LABEL: load_fold_ashr2:
1216; CHECK-O0:       # %bb.0:
1217; CHECK-O0-NEXT:    movq %rsi, %rcx
1218; CHECK-O0-NEXT:    movq (%rdi), %rax
1219; CHECK-O0-NEXT:    # kill: def $cl killed $rcx
1220; CHECK-O0-NEXT:    sarq %cl, %rax
1221; CHECK-O0-NEXT:    retq
1222;
1223; CHECK-O3-LABEL: load_fold_ashr2:
1224; CHECK-O3:       # %bb.0:
1225; CHECK-O3-NEXT:    sarxq %rsi, (%rdi), %rax
1226; CHECK-O3-NEXT:    retq
1227  %v = load atomic i64, i64* %p unordered, align 8
1228  %ret = ashr i64 %v, %v2
1229  ret i64 %ret
1230}
1231
1232define i64 @load_fold_ashr3(i64* %p1, i64* %p2) {
1233; CHECK-O0-LABEL: load_fold_ashr3:
1234; CHECK-O0:       # %bb.0:
1235; CHECK-O0-NEXT:    movq (%rdi), %rax
1236; CHECK-O0-NEXT:    movq (%rsi), %rcx
1237; CHECK-O0-NEXT:    # kill: def $cl killed $rcx
1238; CHECK-O0-NEXT:    sarq %cl, %rax
1239; CHECK-O0-NEXT:    retq
1240;
1241; CHECK-O3-LABEL: load_fold_ashr3:
1242; CHECK-O3:       # %bb.0:
1243; CHECK-O3-NEXT:    movq (%rsi), %rax
1244; CHECK-O3-NEXT:    sarxq %rax, (%rdi), %rax
1245; CHECK-O3-NEXT:    retq
1246  %v = load atomic i64, i64* %p1 unordered, align 8
1247  %v2 = load atomic i64, i64* %p2 unordered, align 8
1248  %ret = ashr i64 %v, %v2
1249  ret i64 %ret
1250}
1251
1252; Legal, as expected
1253define i64 @load_fold_and1(i64* %p) {
1254; CHECK-O0-LABEL: load_fold_and1:
1255; CHECK-O0:       # %bb.0:
1256; CHECK-O0-NEXT:    movq (%rdi), %rax
1257; CHECK-O0-NEXT:    andq $15, %rax
1258; CHECK-O0-NEXT:    retq
1259;
1260; CHECK-O3-LABEL: load_fold_and1:
1261; CHECK-O3:       # %bb.0:
1262; CHECK-O3-NEXT:    movq (%rdi), %rax
1263; CHECK-O3-NEXT:    andl $15, %eax
1264; CHECK-O3-NEXT:    retq
1265  %v = load atomic i64, i64* %p unordered, align 8
1266  %ret = and i64 %v, 15
1267  ret i64 %ret
1268}
1269
1270define i64 @load_fold_and2(i64* %p, i64 %v2) {
1271; CHECK-LABEL: load_fold_and2:
1272; CHECK:       # %bb.0:
1273; CHECK-NEXT:    movq %rsi, %rax
1274; CHECK-NEXT:    andq (%rdi), %rax
1275; CHECK-NEXT:    retq
1276  %v = load atomic i64, i64* %p unordered, align 8
1277  %ret = and i64 %v, %v2
1278  ret i64 %ret
1279}
1280
1281define i64 @load_fold_and3(i64* %p1, i64* %p2) {
1282; CHECK-O0-LABEL: load_fold_and3:
1283; CHECK-O0:       # %bb.0:
1284; CHECK-O0-NEXT:    movq (%rdi), %rax
1285; CHECK-O0-NEXT:    andq (%rsi), %rax
1286; CHECK-O0-NEXT:    retq
1287;
1288; CHECK-O3-CUR-LABEL: load_fold_and3:
1289; CHECK-O3-CUR:       # %bb.0:
1290; CHECK-O3-CUR-NEXT:    movq (%rsi), %rax
1291; CHECK-O3-CUR-NEXT:    andq (%rdi), %rax
1292; CHECK-O3-CUR-NEXT:    retq
1293;
1294; CHECK-O3-EX-LABEL: load_fold_and3:
1295; CHECK-O3-EX:       # %bb.0:
1296; CHECK-O3-EX-NEXT:    movq (%rdi), %rax
1297; CHECK-O3-EX-NEXT:    andq (%rsi), %rax
1298; CHECK-O3-EX-NEXT:    retq
1299  %v = load atomic i64, i64* %p1 unordered, align 8
1300  %v2 = load atomic i64, i64* %p2 unordered, align 8
1301  %ret = and i64 %v, %v2
1302  ret i64 %ret
1303}
1304
1305; Legal, as expected
1306define i64 @load_fold_or1(i64* %p) {
1307; CHECK-LABEL: load_fold_or1:
1308; CHECK:       # %bb.0:
1309; CHECK-NEXT:    movq (%rdi), %rax
1310; CHECK-NEXT:    orq $15, %rax
1311; CHECK-NEXT:    retq
1312  %v = load atomic i64, i64* %p unordered, align 8
1313  %ret = or i64 %v, 15
1314  ret i64 %ret
1315}
1316
1317define i64 @load_fold_or2(i64* %p, i64 %v2) {
1318; CHECK-LABEL: load_fold_or2:
1319; CHECK:       # %bb.0:
1320; CHECK-NEXT:    movq %rsi, %rax
1321; CHECK-NEXT:    orq (%rdi), %rax
1322; CHECK-NEXT:    retq
1323  %v = load atomic i64, i64* %p unordered, align 8
1324  %ret = or i64 %v, %v2
1325  ret i64 %ret
1326}
1327
1328define i64 @load_fold_or3(i64* %p1, i64* %p2) {
1329; CHECK-O0-LABEL: load_fold_or3:
1330; CHECK-O0:       # %bb.0:
1331; CHECK-O0-NEXT:    movq (%rdi), %rax
1332; CHECK-O0-NEXT:    orq (%rsi), %rax
1333; CHECK-O0-NEXT:    retq
1334;
1335; CHECK-O3-CUR-LABEL: load_fold_or3:
1336; CHECK-O3-CUR:       # %bb.0:
1337; CHECK-O3-CUR-NEXT:    movq (%rsi), %rax
1338; CHECK-O3-CUR-NEXT:    orq (%rdi), %rax
1339; CHECK-O3-CUR-NEXT:    retq
1340;
1341; CHECK-O3-EX-LABEL: load_fold_or3:
1342; CHECK-O3-EX:       # %bb.0:
1343; CHECK-O3-EX-NEXT:    movq (%rdi), %rax
1344; CHECK-O3-EX-NEXT:    orq (%rsi), %rax
1345; CHECK-O3-EX-NEXT:    retq
1346  %v = load atomic i64, i64* %p1 unordered, align 8
1347  %v2 = load atomic i64, i64* %p2 unordered, align 8
1348  %ret = or i64 %v, %v2
1349  ret i64 %ret
1350}
1351
1352; Legal, as expected
1353define i64 @load_fold_xor1(i64* %p) {
1354; CHECK-LABEL: load_fold_xor1:
1355; CHECK:       # %bb.0:
1356; CHECK-NEXT:    movq (%rdi), %rax
1357; CHECK-NEXT:    xorq $15, %rax
1358; CHECK-NEXT:    retq
1359  %v = load atomic i64, i64* %p unordered, align 8
1360  %ret = xor i64 %v, 15
1361  ret i64 %ret
1362}
1363
1364define i64 @load_fold_xor2(i64* %p, i64 %v2) {
1365; CHECK-LABEL: load_fold_xor2:
1366; CHECK:       # %bb.0:
1367; CHECK-NEXT:    movq %rsi, %rax
1368; CHECK-NEXT:    xorq (%rdi), %rax
1369; CHECK-NEXT:    retq
1370  %v = load atomic i64, i64* %p unordered, align 8
1371  %ret = xor i64 %v, %v2
1372  ret i64 %ret
1373}
1374
1375define i64 @load_fold_xor3(i64* %p1, i64* %p2) {
1376; CHECK-O0-LABEL: load_fold_xor3:
1377; CHECK-O0:       # %bb.0:
1378; CHECK-O0-NEXT:    movq (%rdi), %rax
1379; CHECK-O0-NEXT:    xorq (%rsi), %rax
1380; CHECK-O0-NEXT:    retq
1381;
1382; CHECK-O3-CUR-LABEL: load_fold_xor3:
1383; CHECK-O3-CUR:       # %bb.0:
1384; CHECK-O3-CUR-NEXT:    movq (%rsi), %rax
1385; CHECK-O3-CUR-NEXT:    xorq (%rdi), %rax
1386; CHECK-O3-CUR-NEXT:    retq
1387;
1388; CHECK-O3-EX-LABEL: load_fold_xor3:
1389; CHECK-O3-EX:       # %bb.0:
1390; CHECK-O3-EX-NEXT:    movq (%rdi), %rax
1391; CHECK-O3-EX-NEXT:    xorq (%rsi), %rax
1392; CHECK-O3-EX-NEXT:    retq
1393  %v = load atomic i64, i64* %p1 unordered, align 8
1394  %v2 = load atomic i64, i64* %p2 unordered, align 8
1395  %ret = xor i64 %v, %v2
1396  ret i64 %ret
1397}
1398
1399define i1 @load_fold_icmp1(i64* %p) {
1400; CHECK-O0-LABEL: load_fold_icmp1:
1401; CHECK-O0:       # %bb.0:
1402; CHECK-O0-NEXT:    movq (%rdi), %rax
1403; CHECK-O0-NEXT:    subq $15, %rax
1404; CHECK-O0-NEXT:    sete %al
1405; CHECK-O0-NEXT:    retq
1406;
1407; CHECK-O3-LABEL: load_fold_icmp1:
1408; CHECK-O3:       # %bb.0:
1409; CHECK-O3-NEXT:    cmpq $15, (%rdi)
1410; CHECK-O3-NEXT:    sete %al
1411; CHECK-O3-NEXT:    retq
1412  %v = load atomic i64, i64* %p unordered, align 8
1413  %ret = icmp eq i64 %v, 15
1414  ret i1 %ret
1415}
1416
1417define i1 @load_fold_icmp2(i64* %p, i64 %v2) {
1418; CHECK-O0-LABEL: load_fold_icmp2:
1419; CHECK-O0:       # %bb.0:
1420; CHECK-O0-NEXT:    movq (%rdi), %rax
1421; CHECK-O0-NEXT:    subq %rsi, %rax
1422; CHECK-O0-NEXT:    sete %al
1423; CHECK-O0-NEXT:    retq
1424;
1425; CHECK-O3-LABEL: load_fold_icmp2:
1426; CHECK-O3:       # %bb.0:
1427; CHECK-O3-NEXT:    cmpq %rsi, (%rdi)
1428; CHECK-O3-NEXT:    sete %al
1429; CHECK-O3-NEXT:    retq
1430  %v = load atomic i64, i64* %p unordered, align 8
1431  %ret = icmp eq i64 %v, %v2
1432  ret i1 %ret
1433}
1434
1435define i1 @load_fold_icmp3(i64* %p1, i64* %p2) {
1436; CHECK-O0-LABEL: load_fold_icmp3:
1437; CHECK-O0:       # %bb.0:
1438; CHECK-O0-NEXT:    movq (%rdi), %rax
1439; CHECK-O0-NEXT:    movq (%rsi), %rcx
1440; CHECK-O0-NEXT:    subq %rcx, %rax
1441; CHECK-O0-NEXT:    sete %al
1442; CHECK-O0-NEXT:    retq
1443;
1444; CHECK-O3-CUR-LABEL: load_fold_icmp3:
1445; CHECK-O3-CUR:       # %bb.0:
1446; CHECK-O3-CUR-NEXT:    movq (%rsi), %rax
1447; CHECK-O3-CUR-NEXT:    cmpq %rax, (%rdi)
1448; CHECK-O3-CUR-NEXT:    sete %al
1449; CHECK-O3-CUR-NEXT:    retq
1450;
1451; CHECK-O3-EX-LABEL: load_fold_icmp3:
1452; CHECK-O3-EX:       # %bb.0:
1453; CHECK-O3-EX-NEXT:    movq (%rdi), %rax
1454; CHECK-O3-EX-NEXT:    cmpq (%rsi), %rax
1455; CHECK-O3-EX-NEXT:    sete %al
1456; CHECK-O3-EX-NEXT:    retq
1457  %v = load atomic i64, i64* %p1 unordered, align 8
1458  %v2 = load atomic i64, i64* %p2 unordered, align 8
1459  %ret = icmp eq i64 %v, %v2
1460  ret i1 %ret
1461}
1462
1463
1464;; The next batch of tests check for read-modify-write patterns
1465;; Legally, it's okay to use a memory operand here as long as the operand
1466;; is well aligned (i.e. doesn't cross a cache line boundary).  We are
1467;; required not to narrow the store though!
1468
1469; Legal, as expected
1470define void @rmw_fold_add1(i64* %p, i64 %v) {
1471; CHECK-O0-LABEL: rmw_fold_add1:
1472; CHECK-O0:       # %bb.0:
1473; CHECK-O0-NEXT:    movq (%rdi), %rax
1474; CHECK-O0-NEXT:    addq $15, %rax
1475; CHECK-O0-NEXT:    movq %rax, (%rdi)
1476; CHECK-O0-NEXT:    retq
1477;
1478; CHECK-O3-LABEL: rmw_fold_add1:
1479; CHECK-O3:       # %bb.0:
1480; CHECK-O3-NEXT:    addq $15, (%rdi)
1481; CHECK-O3-NEXT:    retq
1482  %prev = load atomic i64, i64* %p unordered, align 8
1483  %val = add i64 %prev, 15
1484  store atomic i64 %val, i64* %p unordered, align 8
1485  ret void
1486}
1487
1488; Legal, as expected
1489define void @rmw_fold_add2(i64* %p, i64 %v) {
1490; CHECK-O0-LABEL: rmw_fold_add2:
1491; CHECK-O0:       # %bb.0:
1492; CHECK-O0-NEXT:    movq (%rdi), %rax
1493; CHECK-O0-NEXT:    addq %rsi, %rax
1494; CHECK-O0-NEXT:    movq %rax, (%rdi)
1495; CHECK-O0-NEXT:    retq
1496;
1497; CHECK-O3-LABEL: rmw_fold_add2:
1498; CHECK-O3:       # %bb.0:
1499; CHECK-O3-NEXT:    addq %rsi, (%rdi)
1500; CHECK-O3-NEXT:    retq
1501  %prev = load atomic i64, i64* %p unordered, align 8
1502  %val = add i64 %prev, %v
1503  store atomic i64 %val, i64* %p unordered, align 8
1504  ret void
1505}
1506
1507; Legal, as expected
1508define void @rmw_fold_sub1(i64* %p, i64 %v) {
1509; CHECK-O0-LABEL: rmw_fold_sub1:
1510; CHECK-O0:       # %bb.0:
1511; CHECK-O0-NEXT:    movq (%rdi), %rax
1512; CHECK-O0-NEXT:    addq $-15, %rax
1513; CHECK-O0-NEXT:    movq %rax, (%rdi)
1514; CHECK-O0-NEXT:    retq
1515;
1516; CHECK-O3-LABEL: rmw_fold_sub1:
1517; CHECK-O3:       # %bb.0:
1518; CHECK-O3-NEXT:    addq $-15, (%rdi)
1519; CHECK-O3-NEXT:    retq
1520  %prev = load atomic i64, i64* %p unordered, align 8
1521  %val = sub i64 %prev, 15
1522  store atomic i64 %val, i64* %p unordered, align 8
1523  ret void
1524}
1525
1526; Legal, as expected
1527define void @rmw_fold_sub2(i64* %p, i64 %v) {
1528; CHECK-O0-LABEL: rmw_fold_sub2:
1529; CHECK-O0:       # %bb.0:
1530; CHECK-O0-NEXT:    movq (%rdi), %rax
1531; CHECK-O0-NEXT:    subq %rsi, %rax
1532; CHECK-O0-NEXT:    movq %rax, (%rdi)
1533; CHECK-O0-NEXT:    retq
1534;
1535; CHECK-O3-LABEL: rmw_fold_sub2:
1536; CHECK-O3:       # %bb.0:
1537; CHECK-O3-NEXT:    subq %rsi, (%rdi)
1538; CHECK-O3-NEXT:    retq
1539  %prev = load atomic i64, i64* %p unordered, align 8
1540  %val = sub i64 %prev, %v
1541  store atomic i64 %val, i64* %p unordered, align 8
1542  ret void
1543}
1544
1545; Legal, as expected
1546define void @rmw_fold_mul1(i64* %p, i64 %v) {
1547; CHECK-LABEL: rmw_fold_mul1:
1548; CHECK:       # %bb.0:
1549; CHECK-NEXT:    movq (%rdi), %rax
1550; CHECK-NEXT:    leaq (%rax,%rax,4), %rax
1551; CHECK-NEXT:    leaq (%rax,%rax,2), %rax
1552; CHECK-NEXT:    movq %rax, (%rdi)
1553; CHECK-NEXT:    retq
1554  %prev = load atomic i64, i64* %p unordered, align 8
1555  %val = mul i64 %prev, 15
1556  store atomic i64 %val, i64* %p unordered, align 8
1557  ret void
1558}
1559
1560; Legal to fold (TODO)
1561define void @rmw_fold_mul2(i64* %p, i64 %v) {
1562; CHECK-O0-LABEL: rmw_fold_mul2:
1563; CHECK-O0:       # %bb.0:
1564; CHECK-O0-NEXT:    movq (%rdi), %rax
1565; CHECK-O0-NEXT:    imulq %rsi, %rax
1566; CHECK-O0-NEXT:    movq %rax, (%rdi)
1567; CHECK-O0-NEXT:    retq
1568;
1569; CHECK-O3-LABEL: rmw_fold_mul2:
1570; CHECK-O3:       # %bb.0:
1571; CHECK-O3-NEXT:    imulq (%rdi), %rsi
1572; CHECK-O3-NEXT:    movq %rsi, (%rdi)
1573; CHECK-O3-NEXT:    retq
1574  %prev = load atomic i64, i64* %p unordered, align 8
1575  %val = mul i64 %prev, %v
1576  store atomic i64 %val, i64* %p unordered, align 8
1577  ret void
1578}
1579
1580; Legal, as expected
1581define void @rmw_fold_sdiv1(i64* %p, i64 %v) {
1582; CHECK-O0-LABEL: rmw_fold_sdiv1:
1583; CHECK-O0:       # %bb.0:
1584; CHECK-O0-NEXT:    movq (%rdi), %rcx
1585; CHECK-O0-NEXT:    movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
1586; CHECK-O0-NEXT:    movq %rcx, %rax
1587; CHECK-O0-NEXT:    imulq %rdx
1588; CHECK-O0-NEXT:    movq %rdx, %rax
1589; CHECK-O0-NEXT:    addq %rcx, %rax
1590; CHECK-O0-NEXT:    movq %rax, %rcx
1591; CHECK-O0-NEXT:    shrq $63, %rcx
1592; CHECK-O0-NEXT:    sarq $3, %rax
1593; CHECK-O0-NEXT:    addq %rcx, %rax
1594; CHECK-O0-NEXT:    movq %rax, (%rdi)
1595; CHECK-O0-NEXT:    retq
1596;
1597; CHECK-O3-LABEL: rmw_fold_sdiv1:
1598; CHECK-O3:       # %bb.0:
1599; CHECK-O3-NEXT:    movq (%rdi), %rcx
1600; CHECK-O3-NEXT:    movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
1601; CHECK-O3-NEXT:    movq %rcx, %rax
1602; CHECK-O3-NEXT:    imulq %rdx
1603; CHECK-O3-NEXT:    addq %rcx, %rdx
1604; CHECK-O3-NEXT:    movq %rdx, %rax
1605; CHECK-O3-NEXT:    shrq $63, %rax
1606; CHECK-O3-NEXT:    sarq $3, %rdx
1607; CHECK-O3-NEXT:    addq %rax, %rdx
1608; CHECK-O3-NEXT:    movq %rdx, (%rdi)
1609; CHECK-O3-NEXT:    retq
1610  %prev = load atomic i64, i64* %p unordered, align 8
1611  %val = sdiv i64 %prev, 15
1612  store atomic i64 %val, i64* %p unordered, align 8
1613  ret void
1614}
1615
1616; Legal, as expected
1617define void @rmw_fold_sdiv2(i64* %p, i64 %v) {
1618; CHECK-O0-LABEL: rmw_fold_sdiv2:
1619; CHECK-O0:       # %bb.0:
1620; CHECK-O0-NEXT:    movq (%rdi), %rax
1621; CHECK-O0-NEXT:    cqto
1622; CHECK-O0-NEXT:    idivq %rsi
1623; CHECK-O0-NEXT:    movq %rax, (%rdi)
1624; CHECK-O0-NEXT:    retq
1625;
1626; CHECK-O3-LABEL: rmw_fold_sdiv2:
1627; CHECK-O3:       # %bb.0:
1628; CHECK-O3-NEXT:    movq (%rdi), %rax
1629; CHECK-O3-NEXT:    movq %rax, %rcx
1630; CHECK-O3-NEXT:    orq %rsi, %rcx
1631; CHECK-O3-NEXT:    shrq $32, %rcx
1632; CHECK-O3-NEXT:    je .LBB74_1
1633; CHECK-O3-NEXT:  # %bb.2:
1634; CHECK-O3-NEXT:    cqto
1635; CHECK-O3-NEXT:    idivq %rsi
1636; CHECK-O3-NEXT:    movq %rax, (%rdi)
1637; CHECK-O3-NEXT:    retq
1638; CHECK-O3-NEXT:  .LBB74_1:
1639; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
1640; CHECK-O3-NEXT:    xorl %edx, %edx
1641; CHECK-O3-NEXT:    divl %esi
1642; CHECK-O3-NEXT:    # kill: def $eax killed $eax def $rax
1643; CHECK-O3-NEXT:    movq %rax, (%rdi)
1644; CHECK-O3-NEXT:    retq
1645  %prev = load atomic i64, i64* %p unordered, align 8
1646  %val = sdiv i64 %prev, %v
1647  store atomic i64 %val, i64* %p unordered, align 8
1648  ret void
1649}
1650
1651; Legal, as expected
1652define void @rmw_fold_udiv1(i64* %p, i64 %v) {
1653; CHECK-O0-LABEL: rmw_fold_udiv1:
1654; CHECK-O0:       # %bb.0:
1655; CHECK-O0-NEXT:    movq (%rdi), %rdx
1656; CHECK-O0-NEXT:    movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
1657; CHECK-O0-NEXT:    mulxq %rax, %rax, %rax
1658; CHECK-O0-NEXT:    shrq $3, %rax
1659; CHECK-O0-NEXT:    movq %rax, (%rdi)
1660; CHECK-O0-NEXT:    retq
1661;
1662; CHECK-O3-CUR-LABEL: rmw_fold_udiv1:
1663; CHECK-O3-CUR:       # %bb.0:
1664; CHECK-O3-CUR-NEXT:    movq (%rdi), %rdx
1665; CHECK-O3-CUR-NEXT:    movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
1666; CHECK-O3-CUR-NEXT:    mulxq %rax, %rax, %rax
1667; CHECK-O3-CUR-NEXT:    shrq $3, %rax
1668; CHECK-O3-CUR-NEXT:    movq %rax, (%rdi)
1669; CHECK-O3-CUR-NEXT:    retq
1670;
1671; CHECK-O3-EX-LABEL: rmw_fold_udiv1:
1672; CHECK-O3-EX:       # %bb.0:
1673; CHECK-O3-EX-NEXT:    movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
1674; CHECK-O3-EX-NEXT:    mulxq (%rdi), %rax, %rax
1675; CHECK-O3-EX-NEXT:    shrq $3, %rax
1676; CHECK-O3-EX-NEXT:    movq %rax, (%rdi)
1677; CHECK-O3-EX-NEXT:    retq
1678  %prev = load atomic i64, i64* %p unordered, align 8
1679  %val = udiv i64 %prev, 15
1680  store atomic i64 %val, i64* %p unordered, align 8
1681  ret void
1682}
1683
1684; Legal, as expected
1685define void @rmw_fold_udiv2(i64* %p, i64 %v) {
1686; CHECK-O0-LABEL: rmw_fold_udiv2:
1687; CHECK-O0:       # %bb.0:
1688; CHECK-O0-NEXT:    movq (%rdi), %rax
1689; CHECK-O0-NEXT:    xorl %ecx, %ecx
1690; CHECK-O0-NEXT:    movl %ecx, %edx
1691; CHECK-O0-NEXT:    divq %rsi
1692; CHECK-O0-NEXT:    movq %rax, (%rdi)
1693; CHECK-O0-NEXT:    retq
1694;
1695; CHECK-O3-LABEL: rmw_fold_udiv2:
1696; CHECK-O3:       # %bb.0:
1697; CHECK-O3-NEXT:    movq (%rdi), %rax
1698; CHECK-O3-NEXT:    movq %rax, %rcx
1699; CHECK-O3-NEXT:    orq %rsi, %rcx
1700; CHECK-O3-NEXT:    shrq $32, %rcx
1701; CHECK-O3-NEXT:    je .LBB76_1
1702; CHECK-O3-NEXT:  # %bb.2:
1703; CHECK-O3-NEXT:    xorl %edx, %edx
1704; CHECK-O3-NEXT:    divq %rsi
1705; CHECK-O3-NEXT:    movq %rax, (%rdi)
1706; CHECK-O3-NEXT:    retq
1707; CHECK-O3-NEXT:  .LBB76_1:
1708; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
1709; CHECK-O3-NEXT:    xorl %edx, %edx
1710; CHECK-O3-NEXT:    divl %esi
1711; CHECK-O3-NEXT:    # kill: def $eax killed $eax def $rax
1712; CHECK-O3-NEXT:    movq %rax, (%rdi)
1713; CHECK-O3-NEXT:    retq
1714  %prev = load atomic i64, i64* %p unordered, align 8
1715  %val = udiv i64 %prev, %v
1716  store atomic i64 %val, i64* %p unordered, align 8
1717  ret void
1718}
1719
1720; Legal, as expected
1721define void @rmw_fold_srem1(i64* %p, i64 %v) {
1722; CHECK-O0-LABEL: rmw_fold_srem1:
1723; CHECK-O0:       # %bb.0:
1724; CHECK-O0-NEXT:    movq (%rdi), %rax
1725; CHECK-O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1726; CHECK-O0-NEXT:    movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
1727; CHECK-O0-NEXT:    imulq %rcx
1728; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
1729; CHECK-O0-NEXT:    movq %rdx, %rcx
1730; CHECK-O0-NEXT:    addq %rax, %rcx
1731; CHECK-O0-NEXT:    movq %rcx, %rdx
1732; CHECK-O0-NEXT:    shrq $63, %rdx
1733; CHECK-O0-NEXT:    sarq $3, %rcx
1734; CHECK-O0-NEXT:    addq %rdx, %rcx
1735; CHECK-O0-NEXT:    leaq (%rcx,%rcx,4), %rcx
1736; CHECK-O0-NEXT:    leaq (%rcx,%rcx,2), %rcx
1737; CHECK-O0-NEXT:    subq %rcx, %rax
1738; CHECK-O0-NEXT:    movq %rax, (%rdi)
1739; CHECK-O0-NEXT:    retq
1740;
1741; CHECK-O3-LABEL: rmw_fold_srem1:
1742; CHECK-O3:       # %bb.0:
1743; CHECK-O3-NEXT:    movq (%rdi), %rcx
1744; CHECK-O3-NEXT:    movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
1745; CHECK-O3-NEXT:    movq %rcx, %rax
1746; CHECK-O3-NEXT:    imulq %rdx
1747; CHECK-O3-NEXT:    addq %rcx, %rdx
1748; CHECK-O3-NEXT:    movq %rdx, %rax
1749; CHECK-O3-NEXT:    shrq $63, %rax
1750; CHECK-O3-NEXT:    sarq $3, %rdx
1751; CHECK-O3-NEXT:    addq %rax, %rdx
1752; CHECK-O3-NEXT:    leaq (%rdx,%rdx,4), %rax
1753; CHECK-O3-NEXT:    leaq (%rax,%rax,2), %rax
1754; CHECK-O3-NEXT:    subq %rax, %rcx
1755; CHECK-O3-NEXT:    movq %rcx, (%rdi)
1756; CHECK-O3-NEXT:    retq
1757  %prev = load atomic i64, i64* %p unordered, align 8
1758  %val = srem i64 %prev, 15
1759  store atomic i64 %val, i64* %p unordered, align 8
1760  ret void
1761}
1762
1763; Legal, as expected
1764define void @rmw_fold_srem2(i64* %p, i64 %v) {
1765; CHECK-O0-LABEL: rmw_fold_srem2:
1766; CHECK-O0:       # %bb.0:
1767; CHECK-O0-NEXT:    movq (%rdi), %rax
1768; CHECK-O0-NEXT:    cqto
1769; CHECK-O0-NEXT:    idivq %rsi
1770; CHECK-O0-NEXT:    movq %rdx, (%rdi)
1771; CHECK-O0-NEXT:    retq
1772;
1773; CHECK-O3-LABEL: rmw_fold_srem2:
1774; CHECK-O3:       # %bb.0:
1775; CHECK-O3-NEXT:    movq (%rdi), %rax
1776; CHECK-O3-NEXT:    movq %rax, %rcx
1777; CHECK-O3-NEXT:    orq %rsi, %rcx
1778; CHECK-O3-NEXT:    shrq $32, %rcx
1779; CHECK-O3-NEXT:    je .LBB78_1
1780; CHECK-O3-NEXT:  # %bb.2:
1781; CHECK-O3-NEXT:    cqto
1782; CHECK-O3-NEXT:    idivq %rsi
1783; CHECK-O3-NEXT:    movq %rdx, (%rdi)
1784; CHECK-O3-NEXT:    retq
1785; CHECK-O3-NEXT:  .LBB78_1:
1786; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
1787; CHECK-O3-NEXT:    xorl %edx, %edx
1788; CHECK-O3-NEXT:    divl %esi
1789; CHECK-O3-NEXT:    # kill: def $edx killed $edx def $rdx
1790; CHECK-O3-NEXT:    movq %rdx, (%rdi)
1791; CHECK-O3-NEXT:    retq
1792  %prev = load atomic i64, i64* %p unordered, align 8
1793  %val = srem i64 %prev, %v
1794  store atomic i64 %val, i64* %p unordered, align 8
1795  ret void
1796}
1797
1798; Legal, as expected
1799define void @rmw_fold_urem1(i64* %p, i64 %v) {
1800; CHECK-O0-LABEL: rmw_fold_urem1:
1801; CHECK-O0:       # %bb.0:
1802; CHECK-O0-NEXT:    movq (%rdi), %rax
1803; CHECK-O0-NEXT:    movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
1804; CHECK-O0-NEXT:    movq %rax, %rdx
1805; CHECK-O0-NEXT:    mulxq %rcx, %rcx, %rcx
1806; CHECK-O0-NEXT:    shrq $3, %rcx
1807; CHECK-O0-NEXT:    leaq (%rcx,%rcx,4), %rcx
1808; CHECK-O0-NEXT:    leaq (%rcx,%rcx,2), %rcx
1809; CHECK-O0-NEXT:    subq %rcx, %rax
1810; CHECK-O0-NEXT:    movq %rax, (%rdi)
1811; CHECK-O0-NEXT:    retq
1812;
1813; CHECK-O3-LABEL: rmw_fold_urem1:
1814; CHECK-O3:       # %bb.0:
1815; CHECK-O3-NEXT:    movq (%rdi), %rdx
1816; CHECK-O3-NEXT:    movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
1817; CHECK-O3-NEXT:    mulxq %rax, %rax, %rax
1818; CHECK-O3-NEXT:    shrq $3, %rax
1819; CHECK-O3-NEXT:    leaq (%rax,%rax,4), %rax
1820; CHECK-O3-NEXT:    leaq (%rax,%rax,2), %rax
1821; CHECK-O3-NEXT:    subq %rax, %rdx
1822; CHECK-O3-NEXT:    movq %rdx, (%rdi)
1823; CHECK-O3-NEXT:    retq
1824  %prev = load atomic i64, i64* %p unordered, align 8
1825  %val = urem i64 %prev, 15
1826  store atomic i64 %val, i64* %p unordered, align 8
1827  ret void
1828}
1829
1830; Legal, as expected
1831define void @rmw_fold_urem2(i64* %p, i64 %v) {
1832; CHECK-O0-LABEL: rmw_fold_urem2:
1833; CHECK-O0:       # %bb.0:
1834; CHECK-O0-NEXT:    movq (%rdi), %rax
1835; CHECK-O0-NEXT:    xorl %ecx, %ecx
1836; CHECK-O0-NEXT:    movl %ecx, %edx
1837; CHECK-O0-NEXT:    divq %rsi
1838; CHECK-O0-NEXT:    movq %rdx, (%rdi)
1839; CHECK-O0-NEXT:    retq
1840;
1841; CHECK-O3-LABEL: rmw_fold_urem2:
1842; CHECK-O3:       # %bb.0:
1843; CHECK-O3-NEXT:    movq (%rdi), %rax
1844; CHECK-O3-NEXT:    movq %rax, %rcx
1845; CHECK-O3-NEXT:    orq %rsi, %rcx
1846; CHECK-O3-NEXT:    shrq $32, %rcx
1847; CHECK-O3-NEXT:    je .LBB80_1
1848; CHECK-O3-NEXT:  # %bb.2:
1849; CHECK-O3-NEXT:    xorl %edx, %edx
1850; CHECK-O3-NEXT:    divq %rsi
1851; CHECK-O3-NEXT:    movq %rdx, (%rdi)
1852; CHECK-O3-NEXT:    retq
1853; CHECK-O3-NEXT:  .LBB80_1:
1854; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
1855; CHECK-O3-NEXT:    xorl %edx, %edx
1856; CHECK-O3-NEXT:    divl %esi
1857; CHECK-O3-NEXT:    # kill: def $edx killed $edx def $rdx
1858; CHECK-O3-NEXT:    movq %rdx, (%rdi)
1859; CHECK-O3-NEXT:    retq
1860  %prev = load atomic i64, i64* %p unordered, align 8
1861  %val = urem i64 %prev, %v
1862  store atomic i64 %val, i64* %p unordered, align 8
1863  ret void
1864}
1865
1866; Legal to fold (TODO)
1867define void @rmw_fold_shl1(i64* %p, i64 %v) {
1868; CHECK-O0-LABEL: rmw_fold_shl1:
1869; CHECK-O0:       # %bb.0:
1870; CHECK-O0-NEXT:    movq (%rdi), %rax
1871; CHECK-O0-NEXT:    shlq $15, %rax
1872; CHECK-O0-NEXT:    movq %rax, (%rdi)
1873; CHECK-O0-NEXT:    retq
1874;
1875; CHECK-O3-CUR-LABEL: rmw_fold_shl1:
1876; CHECK-O3-CUR:       # %bb.0:
1877; CHECK-O3-CUR-NEXT:    movq (%rdi), %rax
1878; CHECK-O3-CUR-NEXT:    shlq $15, %rax
1879; CHECK-O3-CUR-NEXT:    movq %rax, (%rdi)
1880; CHECK-O3-CUR-NEXT:    retq
1881;
1882; CHECK-O3-EX-LABEL: rmw_fold_shl1:
1883; CHECK-O3-EX:       # %bb.0:
1884; CHECK-O3-EX-NEXT:    shlq $15, (%rdi)
1885; CHECK-O3-EX-NEXT:    retq
1886  %prev = load atomic i64, i64* %p unordered, align 8
1887  %val = shl i64 %prev, 15
1888  store atomic i64 %val, i64* %p unordered, align 8
1889  ret void
1890}
1891
1892; Legal to fold (TODO)
1893define void @rmw_fold_shl2(i64* %p, i64 %v) {
1894; CHECK-O0-LABEL: rmw_fold_shl2:
1895; CHECK-O0:       # %bb.0:
1896; CHECK-O0-NEXT:    movq (%rdi), %rax
1897; CHECK-O0-NEXT:    movb %sil, %dl
1898; CHECK-O0-NEXT:    # implicit-def: $rcx
1899; CHECK-O0-NEXT:    movb %dl, %cl
1900; CHECK-O0-NEXT:    shlxq %rcx, %rax, %rax
1901; CHECK-O0-NEXT:    movq %rax, (%rdi)
1902; CHECK-O0-NEXT:    retq
1903;
1904; CHECK-O3-CUR-LABEL: rmw_fold_shl2:
1905; CHECK-O3-CUR:       # %bb.0:
1906; CHECK-O3-CUR-NEXT:    shlxq %rsi, (%rdi), %rax
1907; CHECK-O3-CUR-NEXT:    movq %rax, (%rdi)
1908; CHECK-O3-CUR-NEXT:    retq
1909;
1910; CHECK-O3-EX-LABEL: rmw_fold_shl2:
1911; CHECK-O3-EX:       # %bb.0:
1912; CHECK-O3-EX-NEXT:    movq %rsi, %rcx
1913; CHECK-O3-EX-NEXT:    # kill: def $cl killed $cl killed $rcx
1914; CHECK-O3-EX-NEXT:    shlq %cl, (%rdi)
1915; CHECK-O3-EX-NEXT:    retq
1916  %prev = load atomic i64, i64* %p unordered, align 8
1917  %val = shl i64 %prev, %v
1918  store atomic i64 %val, i64* %p unordered, align 8
1919  ret void
1920}
1921
1922; Legal to fold (TODO)
1923define void @rmw_fold_lshr1(i64* %p, i64 %v) {
1924; CHECK-O0-LABEL: rmw_fold_lshr1:
1925; CHECK-O0:       # %bb.0:
1926; CHECK-O0-NEXT:    movq (%rdi), %rax
1927; CHECK-O0-NEXT:    shrq $15, %rax
1928; CHECK-O0-NEXT:    movq %rax, (%rdi)
1929; CHECK-O0-NEXT:    retq
1930;
1931; CHECK-O3-CUR-LABEL: rmw_fold_lshr1:
1932; CHECK-O3-CUR:       # %bb.0:
1933; CHECK-O3-CUR-NEXT:    movq (%rdi), %rax
1934; CHECK-O3-CUR-NEXT:    shrq $15, %rax
1935; CHECK-O3-CUR-NEXT:    movq %rax, (%rdi)
1936; CHECK-O3-CUR-NEXT:    retq
1937;
1938; CHECK-O3-EX-LABEL: rmw_fold_lshr1:
1939; CHECK-O3-EX:       # %bb.0:
1940; CHECK-O3-EX-NEXT:    shrq $15, (%rdi)
1941; CHECK-O3-EX-NEXT:    retq
1942  %prev = load atomic i64, i64* %p unordered, align 8
1943  %val = lshr i64 %prev, 15
1944  store atomic i64 %val, i64* %p unordered, align 8
1945  ret void
1946}
1947
1948; Legal to fold (TODO)
1949define void @rmw_fold_lshr2(i64* %p, i64 %v) {
1950; CHECK-O0-LABEL: rmw_fold_lshr2:
1951; CHECK-O0:       # %bb.0:
1952; CHECK-O0-NEXT:    movq (%rdi), %rax
1953; CHECK-O0-NEXT:    movb %sil, %dl
1954; CHECK-O0-NEXT:    # implicit-def: $rcx
1955; CHECK-O0-NEXT:    movb %dl, %cl
1956; CHECK-O0-NEXT:    shrxq %rcx, %rax, %rax
1957; CHECK-O0-NEXT:    movq %rax, (%rdi)
1958; CHECK-O0-NEXT:    retq
1959;
1960; CHECK-O3-CUR-LABEL: rmw_fold_lshr2:
1961; CHECK-O3-CUR:       # %bb.0:
1962; CHECK-O3-CUR-NEXT:    shrxq %rsi, (%rdi), %rax
1963; CHECK-O3-CUR-NEXT:    movq %rax, (%rdi)
1964; CHECK-O3-CUR-NEXT:    retq
1965;
1966; CHECK-O3-EX-LABEL: rmw_fold_lshr2:
1967; CHECK-O3-EX:       # %bb.0:
1968; CHECK-O3-EX-NEXT:    movq %rsi, %rcx
1969; CHECK-O3-EX-NEXT:    # kill: def $cl killed $cl killed $rcx
1970; CHECK-O3-EX-NEXT:    shrq %cl, (%rdi)
1971; CHECK-O3-EX-NEXT:    retq
1972  %prev = load atomic i64, i64* %p unordered, align 8
1973  %val = lshr i64 %prev, %v
1974  store atomic i64 %val, i64* %p unordered, align 8
1975  ret void
1976}
1977
1978; Legal to fold (TODO)
1979define void @rmw_fold_ashr1(i64* %p, i64 %v) {
1980; CHECK-O0-LABEL: rmw_fold_ashr1:
1981; CHECK-O0:       # %bb.0:
1982; CHECK-O0-NEXT:    movq (%rdi), %rax
1983; CHECK-O0-NEXT:    sarq $15, %rax
1984; CHECK-O0-NEXT:    movq %rax, (%rdi)
1985; CHECK-O0-NEXT:    retq
1986;
1987; CHECK-O3-CUR-LABEL: rmw_fold_ashr1:
1988; CHECK-O3-CUR:       # %bb.0:
1989; CHECK-O3-CUR-NEXT:    movq (%rdi), %rax
1990; CHECK-O3-CUR-NEXT:    sarq $15, %rax
1991; CHECK-O3-CUR-NEXT:    movq %rax, (%rdi)
1992; CHECK-O3-CUR-NEXT:    retq
1993;
1994; CHECK-O3-EX-LABEL: rmw_fold_ashr1:
1995; CHECK-O3-EX:       # %bb.0:
1996; CHECK-O3-EX-NEXT:    sarq $15, (%rdi)
1997; CHECK-O3-EX-NEXT:    retq
1998  %prev = load atomic i64, i64* %p unordered, align 8
1999  %val = ashr i64 %prev, 15
2000  store atomic i64 %val, i64* %p unordered, align 8
2001  ret void
2002}
2003
2004; Legal to fold (TODO)
2005define void @rmw_fold_ashr2(i64* %p, i64 %v) {
2006; CHECK-O0-LABEL: rmw_fold_ashr2:
2007; CHECK-O0:       # %bb.0:
2008; CHECK-O0-NEXT:    movq (%rdi), %rax
2009; CHECK-O0-NEXT:    movb %sil, %dl
2010; CHECK-O0-NEXT:    # implicit-def: $rcx
2011; CHECK-O0-NEXT:    movb %dl, %cl
2012; CHECK-O0-NEXT:    sarxq %rcx, %rax, %rax
2013; CHECK-O0-NEXT:    movq %rax, (%rdi)
2014; CHECK-O0-NEXT:    retq
2015;
2016; CHECK-O3-CUR-LABEL: rmw_fold_ashr2:
2017; CHECK-O3-CUR:       # %bb.0:
2018; CHECK-O3-CUR-NEXT:    sarxq %rsi, (%rdi), %rax
2019; CHECK-O3-CUR-NEXT:    movq %rax, (%rdi)
2020; CHECK-O3-CUR-NEXT:    retq
2021;
2022; CHECK-O3-EX-LABEL: rmw_fold_ashr2:
2023; CHECK-O3-EX:       # %bb.0:
2024; CHECK-O3-EX-NEXT:    movq %rsi, %rcx
2025; CHECK-O3-EX-NEXT:    # kill: def $cl killed $cl killed $rcx
2026; CHECK-O3-EX-NEXT:    sarq %cl, (%rdi)
2027; CHECK-O3-EX-NEXT:    retq
2028  %prev = load atomic i64, i64* %p unordered, align 8
2029  %val = ashr i64 %prev, %v
2030  store atomic i64 %val, i64* %p unordered, align 8
2031  ret void
2032}
2033
2034; Legal, as expected
2035define void @rmw_fold_and1(i64* %p, i64 %v) {
2036; CHECK-O0-LABEL: rmw_fold_and1:
2037; CHECK-O0:       # %bb.0:
2038; CHECK-O0-NEXT:    movq (%rdi), %rax
2039; CHECK-O0-NEXT:    # kill: def $eax killed $eax killed $rax
2040; CHECK-O0-NEXT:    andl $15, %eax
2041; CHECK-O0-NEXT:    # kill: def $rax killed $eax
2042; CHECK-O0-NEXT:    movq %rax, (%rdi)
2043; CHECK-O0-NEXT:    retq
2044;
2045; CHECK-O3-LABEL: rmw_fold_and1:
2046; CHECK-O3:       # %bb.0:
2047; CHECK-O3-NEXT:    andq $15, (%rdi)
2048; CHECK-O3-NEXT:    retq
2049  %prev = load atomic i64, i64* %p unordered, align 8
2050  %val = and i64 %prev, 15
2051  store atomic i64 %val, i64* %p unordered, align 8
2052  ret void
2053}
2054
2055; Legal, as expected
2056define void @rmw_fold_and2(i64* %p, i64 %v) {
2057; CHECK-O0-LABEL: rmw_fold_and2:
2058; CHECK-O0:       # %bb.0:
2059; CHECK-O0-NEXT:    movq (%rdi), %rax
2060; CHECK-O0-NEXT:    andq %rsi, %rax
2061; CHECK-O0-NEXT:    movq %rax, (%rdi)
2062; CHECK-O0-NEXT:    retq
2063;
2064; CHECK-O3-LABEL: rmw_fold_and2:
2065; CHECK-O3:       # %bb.0:
2066; CHECK-O3-NEXT:    andq %rsi, (%rdi)
2067; CHECK-O3-NEXT:    retq
2068  %prev = load atomic i64, i64* %p unordered, align 8
2069  %val = and i64 %prev, %v
2070  store atomic i64 %val, i64* %p unordered, align 8
2071  ret void
2072}
2073
2074; Legal, as expected
2075define void @rmw_fold_or1(i64* %p, i64 %v) {
2076; CHECK-O0-LABEL: rmw_fold_or1:
2077; CHECK-O0:       # %bb.0:
2078; CHECK-O0-NEXT:    movq (%rdi), %rax
2079; CHECK-O0-NEXT:    orq $15, %rax
2080; CHECK-O0-NEXT:    movq %rax, (%rdi)
2081; CHECK-O0-NEXT:    retq
2082;
2083; CHECK-O3-LABEL: rmw_fold_or1:
2084; CHECK-O3:       # %bb.0:
2085; CHECK-O3-NEXT:    orq $15, (%rdi)
2086; CHECK-O3-NEXT:    retq
2087  %prev = load atomic i64, i64* %p unordered, align 8
2088  %val = or i64 %prev, 15
2089  store atomic i64 %val, i64* %p unordered, align 8
2090  ret void
2091}
2092
2093; Legal, as expected
2094define void @rmw_fold_or2(i64* %p, i64 %v) {
2095; CHECK-O0-LABEL: rmw_fold_or2:
2096; CHECK-O0:       # %bb.0:
2097; CHECK-O0-NEXT:    movq (%rdi), %rax
2098; CHECK-O0-NEXT:    orq %rsi, %rax
2099; CHECK-O0-NEXT:    movq %rax, (%rdi)
2100; CHECK-O0-NEXT:    retq
2101;
2102; CHECK-O3-LABEL: rmw_fold_or2:
2103; CHECK-O3:       # %bb.0:
2104; CHECK-O3-NEXT:    orq %rsi, (%rdi)
2105; CHECK-O3-NEXT:    retq
2106  %prev = load atomic i64, i64* %p unordered, align 8
2107  %val = or i64 %prev, %v
2108  store atomic i64 %val, i64* %p unordered, align 8
2109  ret void
2110}
2111
2112; Legal, as expected
2113define void @rmw_fold_xor1(i64* %p, i64 %v) {
2114; CHECK-O0-LABEL: rmw_fold_xor1:
2115; CHECK-O0:       # %bb.0:
2116; CHECK-O0-NEXT:    movq (%rdi), %rax
2117; CHECK-O0-NEXT:    xorq $15, %rax
2118; CHECK-O0-NEXT:    movq %rax, (%rdi)
2119; CHECK-O0-NEXT:    retq
2120;
2121; CHECK-O3-LABEL: rmw_fold_xor1:
2122; CHECK-O3:       # %bb.0:
2123; CHECK-O3-NEXT:    xorq $15, (%rdi)
2124; CHECK-O3-NEXT:    retq
2125  %prev = load atomic i64, i64* %p unordered, align 8
2126  %val = xor i64 %prev, 15
2127  store atomic i64 %val, i64* %p unordered, align 8
2128  ret void
2129}
2130
2131; Legal, as expected
2132define void @rmw_fold_xor2(i64* %p, i64 %v) {
2133; CHECK-O0-LABEL: rmw_fold_xor2:
2134; CHECK-O0:       # %bb.0:
2135; CHECK-O0-NEXT:    movq (%rdi), %rax
2136; CHECK-O0-NEXT:    xorq %rsi, %rax
2137; CHECK-O0-NEXT:    movq %rax, (%rdi)
2138; CHECK-O0-NEXT:    retq
2139;
2140; CHECK-O3-LABEL: rmw_fold_xor2:
2141; CHECK-O3:       # %bb.0:
2142; CHECK-O3-NEXT:    xorq %rsi, (%rdi)
2143; CHECK-O3-NEXT:    retq
2144  %prev = load atomic i64, i64* %p unordered, align 8
2145  %val = xor i64 %prev, %v
2146  store atomic i64 %val, i64* %p unordered, align 8
2147  ret void
2148}
2149
2150;; The next batch test truncations, in combination w/operations which could
2151;; be folded against the memory operation.
2152
2153; Legal to reduce the load width (TODO)
2154define i32 @fold_trunc(i64* %p) {
2155; CHECK-LABEL: fold_trunc:
2156; CHECK:       # %bb.0:
2157; CHECK-NEXT:    movq (%rdi), %rax
2158; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
2159; CHECK-NEXT:    retq
2160  %v = load atomic i64, i64* %p unordered, align 8
2161  %ret = trunc i64 %v to i32
2162  ret i32 %ret
2163}
2164
2165; Legal to reduce the load width and fold the load (TODO)
2166define i32 @fold_trunc_add(i64* %p, i32 %v2) {
2167; CHECK-O0-LABEL: fold_trunc_add:
2168; CHECK-O0:       # %bb.0:
2169; CHECK-O0-NEXT:    movq (%rdi), %rax
2170; CHECK-O0-NEXT:    # kill: def $eax killed $eax killed $rax
2171; CHECK-O0-NEXT:    addl %esi, %eax
2172; CHECK-O0-NEXT:    retq
2173;
2174; CHECK-O3-LABEL: fold_trunc_add:
2175; CHECK-O3:       # %bb.0:
2176; CHECK-O3-NEXT:    movq (%rdi), %rax
2177; CHECK-O3-NEXT:    addl %esi, %eax
2178; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
2179; CHECK-O3-NEXT:    retq
2180  %v = load atomic i64, i64* %p unordered, align 8
2181  %trunc = trunc i64 %v to i32
2182  %ret = add i32 %trunc, %v2
2183  ret i32 %ret
2184}
2185
2186; Legal to reduce the load width and fold the load (TODO)
2187define i32 @fold_trunc_and(i64* %p, i32 %v2) {
2188; CHECK-O0-LABEL: fold_trunc_and:
2189; CHECK-O0:       # %bb.0:
2190; CHECK-O0-NEXT:    movq (%rdi), %rax
2191; CHECK-O0-NEXT:    # kill: def $eax killed $eax killed $rax
2192; CHECK-O0-NEXT:    andl %esi, %eax
2193; CHECK-O0-NEXT:    retq
2194;
2195; CHECK-O3-LABEL: fold_trunc_and:
2196; CHECK-O3:       # %bb.0:
2197; CHECK-O3-NEXT:    movq (%rdi), %rax
2198; CHECK-O3-NEXT:    andl %esi, %eax
2199; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
2200; CHECK-O3-NEXT:    retq
2201  %v = load atomic i64, i64* %p unordered, align 8
2202  %trunc = trunc i64 %v to i32
2203  %ret = and i32 %trunc, %v2
2204  ret i32 %ret
2205}
2206
2207; Legal to reduce the load width and fold the load (TODO)
2208define i32 @fold_trunc_or(i64* %p, i32 %v2) {
2209; CHECK-O0-LABEL: fold_trunc_or:
2210; CHECK-O0:       # %bb.0:
2211; CHECK-O0-NEXT:    movq (%rdi), %rax
2212; CHECK-O0-NEXT:    # kill: def $eax killed $eax killed $rax
2213; CHECK-O0-NEXT:    orl %esi, %eax
2214; CHECK-O0-NEXT:    retq
2215;
2216; CHECK-O3-LABEL: fold_trunc_or:
2217; CHECK-O3:       # %bb.0:
2218; CHECK-O3-NEXT:    movq (%rdi), %rax
2219; CHECK-O3-NEXT:    orl %esi, %eax
2220; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
2221; CHECK-O3-NEXT:    retq
2222  %v = load atomic i64, i64* %p unordered, align 8
2223  %trunc = trunc i64 %v to i32
2224  %ret = or i32 %trunc, %v2
2225  ret i32 %ret
2226}
2227
2228; It's tempting to split the wide load into two smaller byte loads
2229; to reduce memory traffic, but this would be illegal for a atomic load
2230define i32 @split_load(i64* %p) {
2231; CHECK-O0-LABEL: split_load:
2232; CHECK-O0:       # %bb.0:
2233; CHECK-O0-NEXT:    movq (%rdi), %rcx
2234; CHECK-O0-NEXT:    movb %cl, %al
2235; CHECK-O0-NEXT:    shrq $32, %rcx
2236; CHECK-O0-NEXT:    # kill: def $cl killed $cl killed $rcx
2237; CHECK-O0-NEXT:    orb %cl, %al
2238; CHECK-O0-NEXT:    movzbl %al, %eax
2239; CHECK-O0-NEXT:    retq
2240;
2241; CHECK-O3-LABEL: split_load:
2242; CHECK-O3:       # %bb.0:
2243; CHECK-O3-NEXT:    movq (%rdi), %rax
2244; CHECK-O3-NEXT:    movq %rax, %rcx
2245; CHECK-O3-NEXT:    shrq $32, %rcx
2246; CHECK-O3-NEXT:    orl %eax, %ecx
2247; CHECK-O3-NEXT:    movzbl %cl, %eax
2248; CHECK-O3-NEXT:    retq
2249  %v = load atomic i64, i64* %p unordered, align 8
2250  %b1 = trunc i64 %v to i8
2251  %v.shift = lshr i64 %v, 32
2252  %b2 = trunc i64 %v.shift to i8
2253  %or = or i8 %b1, %b2
2254  %ret = zext i8 %or to i32
2255  ret i32 %ret
2256}
2257
2258;; A collection of simple memory forwarding tests.  Nothing particular
2259;; interesting semantic wise, just demonstrating obvious missed transforms.
2260
2261@Zero = constant i64 0
2262
2263; TODO: should return constant
2264define i64 @constant_folding(i64* %p) {
2265; CHECK-LABEL: constant_folding:
2266; CHECK:       # %bb.0:
2267; CHECK-NEXT:    movq (%rdi), %rax
2268; CHECK-NEXT:    retq
2269  %v = load atomic i64, i64* %p unordered, align 8
2270  ret i64 %v
2271}
2272
2273; Legal to forward and fold (TODO)
2274define i64 @load_forwarding(i64* %p) {
2275; CHECK-LABEL: load_forwarding:
2276; CHECK:       # %bb.0:
2277; CHECK-NEXT:    movq (%rdi), %rax
2278; CHECK-NEXT:    orq (%rdi), %rax
2279; CHECK-NEXT:    retq
2280  %v = load atomic i64, i64* %p unordered, align 8
2281  %v2 = load atomic i64, i64* %p unordered, align 8
2282  %ret = or i64 %v, %v2
2283  ret i64 %ret
2284}
2285
2286; Legal to forward (TODO)
2287define i64 @store_forward(i64* %p, i64 %v) {
2288; CHECK-LABEL: store_forward:
2289; CHECK:       # %bb.0:
2290; CHECK-NEXT:    movq %rsi, (%rdi)
2291; CHECK-NEXT:    movq (%rdi), %rax
2292; CHECK-NEXT:    retq
2293  store atomic i64 %v, i64* %p unordered, align 8
2294  %ret = load atomic i64, i64* %p unordered, align 8
2295  ret i64 %ret
2296}
2297
2298; Legal to kill (TODO)
2299define void @dead_writeback(i64* %p) {
2300; CHECK-LABEL: dead_writeback:
2301; CHECK:       # %bb.0:
2302; CHECK-NEXT:    movq (%rdi), %rax
2303; CHECK-NEXT:    movq %rax, (%rdi)
2304; CHECK-NEXT:    retq
2305  %v = load atomic i64, i64* %p unordered, align 8
2306  store atomic i64 %v, i64* %p unordered, align 8
2307  ret void
2308}
2309
2310; Legal to kill (TODO)
2311define void @dead_store(i64* %p, i64 %v) {
2312; CHECK-LABEL: dead_store:
2313; CHECK:       # %bb.0:
2314; CHECK-NEXT:    movq $0, (%rdi)
2315; CHECK-NEXT:    movq %rsi, (%rdi)
2316; CHECK-NEXT:    retq
2317  store atomic i64 0, i64* %p unordered, align 8
2318  store atomic i64 %v, i64* %p unordered, align 8
2319  ret void
2320}
2321
2322;; The next batch of tests ensure that we don't try to fold a load into a
2323;; use where the code motion implied for the load is prevented by a fence.
2324;; Note: We're checking that the load doesn't get moved below the fence as
2325;; part of folding, but is technically legal to lift the add above the fence.
2326;; If that were to happen, please rewrite the test to ensure load movement
2327;; isn't violated.
2328
2329define i64 @nofold_fence(i64* %p) {
2330; CHECK-LABEL: nofold_fence:
2331; CHECK:       # %bb.0:
2332; CHECK-NEXT:    movq (%rdi), %rax
2333; CHECK-NEXT:    mfence
2334; CHECK-NEXT:    addq $15, %rax
2335; CHECK-NEXT:    retq
2336  %v = load atomic i64, i64* %p unordered, align 8
2337  fence seq_cst
2338  %ret = add i64 %v, 15
2339  ret i64 %ret
2340}
2341
2342define i64 @nofold_fence_acquire(i64* %p) {
2343; CHECK-LABEL: nofold_fence_acquire:
2344; CHECK:       # %bb.0:
2345; CHECK-NEXT:    movq (%rdi), %rax
2346; CHECK-NEXT:    #MEMBARRIER
2347; CHECK-NEXT:    addq $15, %rax
2348; CHECK-NEXT:    retq
2349  %v = load atomic i64, i64* %p unordered, align 8
2350  fence acquire
2351  %ret = add i64 %v, 15
2352  ret i64 %ret
2353}
2354
2355
2356define i64 @nofold_stfence(i64* %p) {
2357; CHECK-LABEL: nofold_stfence:
2358; CHECK:       # %bb.0:
2359; CHECK-NEXT:    movq (%rdi), %rax
2360; CHECK-NEXT:    #MEMBARRIER
2361; CHECK-NEXT:    addq $15, %rax
2362; CHECK-NEXT:    retq
2363  %v = load atomic i64, i64* %p unordered, align 8
2364  fence syncscope("singlethread") seq_cst
2365  %ret = add i64 %v, 15
2366  ret i64 %ret
2367}
2368
2369;; Next, test how well we can fold invariant loads.
2370
2371@Constant = external dso_local constant i64
2372
2373define i64 @fold_constant(i64 %arg) {
2374; CHECK-O0-LABEL: fold_constant:
2375; CHECK-O0:       # %bb.0:
2376; CHECK-O0-NEXT:    movq %rdi, %rax
2377; CHECK-O0-NEXT:    addq Constant, %rax
2378; CHECK-O0-NEXT:    retq
2379;
2380; CHECK-O3-LABEL: fold_constant:
2381; CHECK-O3:       # %bb.0:
2382; CHECK-O3-NEXT:    movq %rdi, %rax
2383; CHECK-O3-NEXT:    addq Constant(%rip), %rax
2384; CHECK-O3-NEXT:    retq
2385  %v = load atomic i64, i64* @Constant unordered, align 8
2386  %ret = add i64 %v, %arg
2387  ret i64 %ret
2388}
2389
2390define i64 @fold_constant_clobber(i64* %p, i64 %arg) {
2391; CHECK-O0-LABEL: fold_constant_clobber:
2392; CHECK-O0:       # %bb.0:
2393; CHECK-O0-NEXT:    movq Constant(%rip), %rax
2394; CHECK-O0-NEXT:    movq $5, (%rdi)
2395; CHECK-O0-NEXT:    addq %rsi, %rax
2396; CHECK-O0-NEXT:    retq
2397;
2398; CHECK-O3-CUR-LABEL: fold_constant_clobber:
2399; CHECK-O3-CUR:       # %bb.0:
2400; CHECK-O3-CUR-NEXT:    movq Constant(%rip), %rax
2401; CHECK-O3-CUR-NEXT:    movq $5, (%rdi)
2402; CHECK-O3-CUR-NEXT:    addq %rsi, %rax
2403; CHECK-O3-CUR-NEXT:    retq
2404;
2405; CHECK-O3-EX-LABEL: fold_constant_clobber:
2406; CHECK-O3-EX:       # %bb.0:
2407; CHECK-O3-EX-NEXT:    movq %rsi, %rax
2408; CHECK-O3-EX-NEXT:    addq Constant(%rip), %rax
2409; CHECK-O3-EX-NEXT:    movq $5, (%rdi)
2410; CHECK-O3-EX-NEXT:    retq
2411  %v = load atomic i64, i64* @Constant unordered, align 8
2412  store i64 5, i64* %p
2413  %ret = add i64 %v, %arg
2414  ret i64 %ret
2415}
2416
2417define i64 @fold_constant_fence(i64 %arg) {
2418; CHECK-O0-LABEL: fold_constant_fence:
2419; CHECK-O0:       # %bb.0:
2420; CHECK-O0-NEXT:    movq Constant(%rip), %rax
2421; CHECK-O0-NEXT:    mfence
2422; CHECK-O0-NEXT:    addq %rdi, %rax
2423; CHECK-O0-NEXT:    retq
2424;
2425; CHECK-O3-CUR-LABEL: fold_constant_fence:
2426; CHECK-O3-CUR:       # %bb.0:
2427; CHECK-O3-CUR-NEXT:    movq Constant(%rip), %rax
2428; CHECK-O3-CUR-NEXT:    mfence
2429; CHECK-O3-CUR-NEXT:    addq %rdi, %rax
2430; CHECK-O3-CUR-NEXT:    retq
2431;
2432; CHECK-O3-EX-LABEL: fold_constant_fence:
2433; CHECK-O3-EX:       # %bb.0:
2434; CHECK-O3-EX-NEXT:    movq %rdi, %rax
2435; CHECK-O3-EX-NEXT:    addq Constant(%rip), %rax
2436; CHECK-O3-EX-NEXT:    mfence
2437; CHECK-O3-EX-NEXT:    retq
2438  %v = load atomic i64, i64* @Constant unordered, align 8
2439  fence seq_cst
2440  %ret = add i64 %v, %arg
2441  ret i64 %ret
2442}
2443
2444define i64 @fold_invariant_clobber(i64* dereferenceable(8) %p, i64 %arg) {
2445; CHECK-O0-LABEL: fold_invariant_clobber:
2446; CHECK-O0:       # %bb.0:
2447; CHECK-O0-NEXT:    movq (%rdi), %rax
2448; CHECK-O0-NEXT:    movq $5, (%rdi)
2449; CHECK-O0-NEXT:    addq %rsi, %rax
2450; CHECK-O0-NEXT:    retq
2451;
2452; CHECK-O3-CUR-LABEL: fold_invariant_clobber:
2453; CHECK-O3-CUR:       # %bb.0:
2454; CHECK-O3-CUR-NEXT:    movq (%rdi), %rax
2455; CHECK-O3-CUR-NEXT:    movq $5, (%rdi)
2456; CHECK-O3-CUR-NEXT:    addq %rsi, %rax
2457; CHECK-O3-CUR-NEXT:    retq
2458;
2459; CHECK-O3-EX-LABEL: fold_invariant_clobber:
2460; CHECK-O3-EX:       # %bb.0:
2461; CHECK-O3-EX-NEXT:    movq %rsi, %rax
2462; CHECK-O3-EX-NEXT:    addq (%rdi), %rax
2463; CHECK-O3-EX-NEXT:    movq $5, (%rdi)
2464; CHECK-O3-EX-NEXT:    retq
2465  %v = load atomic i64, i64* %p unordered, align 8, !invariant.load !{}
2466  store i64 5, i64* %p
2467  %ret = add i64 %v, %arg
2468  ret i64 %ret
2469}
2470
2471
2472define i64 @fold_invariant_fence(i64* dereferenceable(8) %p, i64 %arg) {
2473; CHECK-O0-LABEL: fold_invariant_fence:
2474; CHECK-O0:       # %bb.0:
2475; CHECK-O0-NEXT:    movq (%rdi), %rax
2476; CHECK-O0-NEXT:    mfence
2477; CHECK-O0-NEXT:    addq %rsi, %rax
2478; CHECK-O0-NEXT:    retq
2479;
2480; CHECK-O3-CUR-LABEL: fold_invariant_fence:
2481; CHECK-O3-CUR:       # %bb.0:
2482; CHECK-O3-CUR-NEXT:    movq (%rdi), %rax
2483; CHECK-O3-CUR-NEXT:    mfence
2484; CHECK-O3-CUR-NEXT:    addq %rsi, %rax
2485; CHECK-O3-CUR-NEXT:    retq
2486;
2487; CHECK-O3-EX-LABEL: fold_invariant_fence:
2488; CHECK-O3-EX:       # %bb.0:
2489; CHECK-O3-EX-NEXT:    movq %rsi, %rax
2490; CHECK-O3-EX-NEXT:    addq (%rdi), %rax
2491; CHECK-O3-EX-NEXT:    mfence
2492; CHECK-O3-EX-NEXT:    retq
2493  %v = load atomic i64, i64* %p unordered, align 8, !invariant.load !{}
2494  fence seq_cst
2495  %ret = add i64 %v, %arg
2496  ret i64 %ret
2497}
2498
2499
2500; Exercise a few cases involving any extend idioms
2501
2502define i16 @load_i8_anyext_i16(i8* %ptr) {
2503; CHECK-O0-CUR-LABEL: load_i8_anyext_i16:
2504; CHECK-O0-CUR:       # %bb.0:
2505; CHECK-O0-CUR-NEXT:    movb (%rdi), %al
2506; CHECK-O0-CUR-NEXT:    movzbl %al, %eax
2507; CHECK-O0-CUR-NEXT:    # kill: def $ax killed $ax killed $eax
2508; CHECK-O0-CUR-NEXT:    retq
2509;
2510; CHECK-O3-CUR-LABEL: load_i8_anyext_i16:
2511; CHECK-O3-CUR:       # %bb.0:
2512; CHECK-O3-CUR-NEXT:    movzbl (%rdi), %eax
2513; CHECK-O3-CUR-NEXT:    # kill: def $ax killed $ax killed $eax
2514; CHECK-O3-CUR-NEXT:    retq
2515;
2516; CHECK-O0-EX-LABEL: load_i8_anyext_i16:
2517; CHECK-O0-EX:       # %bb.0:
2518; CHECK-O0-EX-NEXT:    vpbroadcastb (%rdi), %xmm0
2519; CHECK-O0-EX-NEXT:    vmovd %xmm0, %eax
2520; CHECK-O0-EX-NEXT:    # kill: def $ax killed $ax killed $eax
2521; CHECK-O0-EX-NEXT:    retq
2522;
2523; CHECK-O3-EX-LABEL: load_i8_anyext_i16:
2524; CHECK-O3-EX:       # %bb.0:
2525; CHECK-O3-EX-NEXT:    vpbroadcastb (%rdi), %xmm0
2526; CHECK-O3-EX-NEXT:    vmovd %xmm0, %eax
2527; CHECK-O3-EX-NEXT:    # kill: def $ax killed $ax killed $eax
2528; CHECK-O3-EX-NEXT:    retq
2529  %v = load atomic i8, i8* %ptr unordered, align 2
2530  %vec = insertelement <2 x i8> undef, i8 %v, i32 0
2531  %res = bitcast <2 x i8> %vec to i16
2532  ret i16 %res
2533}
2534
2535define i32 @load_i8_anyext_i32(i8* %ptr) {
2536; CHECK-O0-CUR-LABEL: load_i8_anyext_i32:
2537; CHECK-O0-CUR:       # %bb.0:
2538; CHECK-O0-CUR-NEXT:    movb (%rdi), %al
2539; CHECK-O0-CUR-NEXT:    movzbl %al, %eax
2540; CHECK-O0-CUR-NEXT:    retq
2541;
2542; CHECK-O3-CUR-LABEL: load_i8_anyext_i32:
2543; CHECK-O3-CUR:       # %bb.0:
2544; CHECK-O3-CUR-NEXT:    movzbl (%rdi), %eax
2545; CHECK-O3-CUR-NEXT:    retq
2546;
2547; CHECK-O0-EX-LABEL: load_i8_anyext_i32:
2548; CHECK-O0-EX:       # %bb.0:
2549; CHECK-O0-EX-NEXT:    vpbroadcastb (%rdi), %xmm0
2550; CHECK-O0-EX-NEXT:    vmovd %xmm0, %eax
2551; CHECK-O0-EX-NEXT:    retq
2552;
2553; CHECK-O3-EX-LABEL: load_i8_anyext_i32:
2554; CHECK-O3-EX:       # %bb.0:
2555; CHECK-O3-EX-NEXT:    vpbroadcastb (%rdi), %xmm0
2556; CHECK-O3-EX-NEXT:    vmovd %xmm0, %eax
2557; CHECK-O3-EX-NEXT:    retq
2558  %v = load atomic i8, i8* %ptr unordered, align 4
2559  %vec = insertelement <4 x i8> undef, i8 %v, i32 0
2560  %res = bitcast <4 x i8> %vec to i32
2561  ret i32 %res
2562}
2563
2564define i32 @load_i16_anyext_i32(i16* %ptr) {
2565; CHECK-O0-CUR-LABEL: load_i16_anyext_i32:
2566; CHECK-O0-CUR:       # %bb.0:
2567; CHECK-O0-CUR-NEXT:    movw (%rdi), %cx
2568; CHECK-O0-CUR-NEXT:    # implicit-def: $eax
2569; CHECK-O0-CUR-NEXT:    movw %cx, %ax
2570; CHECK-O0-CUR-NEXT:    retq
2571;
2572; CHECK-O3-CUR-LABEL: load_i16_anyext_i32:
2573; CHECK-O3-CUR:       # %bb.0:
2574; CHECK-O3-CUR-NEXT:    movzwl (%rdi), %eax
2575; CHECK-O3-CUR-NEXT:    retq
2576;
2577; CHECK-O0-EX-LABEL: load_i16_anyext_i32:
2578; CHECK-O0-EX:       # %bb.0:
2579; CHECK-O0-EX-NEXT:    vpbroadcastw (%rdi), %xmm0
2580; CHECK-O0-EX-NEXT:    vmovd %xmm0, %eax
2581; CHECK-O0-EX-NEXT:    retq
2582;
2583; CHECK-O3-EX-LABEL: load_i16_anyext_i32:
2584; CHECK-O3-EX:       # %bb.0:
2585; CHECK-O3-EX-NEXT:    vpbroadcastw (%rdi), %xmm0
2586; CHECK-O3-EX-NEXT:    vmovd %xmm0, %eax
2587; CHECK-O3-EX-NEXT:    retq
2588  %v = load atomic i16, i16* %ptr unordered, align 4
2589  %vec = insertelement <2 x i16> undef, i16 %v, i64 0
2590  %res = bitcast <2 x i16> %vec to i32
2591  ret i32 %res
2592}
2593
2594define i64 @load_i16_anyext_i64(i16* %ptr) {
2595; CHECK-O0-CUR-LABEL: load_i16_anyext_i64:
2596; CHECK-O0-CUR:       # %bb.0:
2597; CHECK-O0-CUR-NEXT:    movw (%rdi), %cx
2598; CHECK-O0-CUR-NEXT:    # implicit-def: $eax
2599; CHECK-O0-CUR-NEXT:    movw %cx, %ax
2600; CHECK-O0-CUR-NEXT:    vmovd %eax, %xmm0
2601; CHECK-O0-CUR-NEXT:    vmovq %xmm0, %rax
2602; CHECK-O0-CUR-NEXT:    retq
2603;
2604; CHECK-O3-CUR-LABEL: load_i16_anyext_i64:
2605; CHECK-O3-CUR:       # %bb.0:
2606; CHECK-O3-CUR-NEXT:    movzwl (%rdi), %eax
2607; CHECK-O3-CUR-NEXT:    vmovd %eax, %xmm0
2608; CHECK-O3-CUR-NEXT:    vmovq %xmm0, %rax
2609; CHECK-O3-CUR-NEXT:    retq
2610;
2611; CHECK-O0-EX-LABEL: load_i16_anyext_i64:
2612; CHECK-O0-EX:       # %bb.0:
2613; CHECK-O0-EX-NEXT:    vpbroadcastw (%rdi), %xmm0
2614; CHECK-O0-EX-NEXT:    vmovq %xmm0, %rax
2615; CHECK-O0-EX-NEXT:    retq
2616;
2617; CHECK-O3-EX-LABEL: load_i16_anyext_i64:
2618; CHECK-O3-EX:       # %bb.0:
2619; CHECK-O3-EX-NEXT:    vpbroadcastw (%rdi), %xmm0
2620; CHECK-O3-EX-NEXT:    vmovq %xmm0, %rax
2621; CHECK-O3-EX-NEXT:    retq
2622  %v = load atomic i16, i16* %ptr unordered, align 8
2623  %vec = insertelement <4 x i16> undef, i16 %v, i64 0
2624  %res = bitcast <4 x i16> %vec to i64
2625  ret i64 %res
2626}
2627
2628; TODO: Would be legal to combine for legal atomic wider types
2629define i16 @load_combine(i8* %p) {
2630; CHECK-O0-LABEL: load_combine:
2631; CHECK-O0:       # %bb.0:
2632; CHECK-O0-NEXT:    movb (%rdi), %al
2633; CHECK-O0-NEXT:    movb 1(%rdi), %cl
2634; CHECK-O0-NEXT:    movzbl %al, %eax
2635; CHECK-O0-NEXT:    # kill: def $ax killed $ax killed $eax
2636; CHECK-O0-NEXT:    movzbl %cl, %ecx
2637; CHECK-O0-NEXT:    # kill: def $cx killed $cx killed $ecx
2638; CHECK-O0-NEXT:    shlw $8, %cx
2639; CHECK-O0-NEXT:    orw %cx, %ax
2640; CHECK-O0-NEXT:    retq
2641;
2642; CHECK-O3-LABEL: load_combine:
2643; CHECK-O3:       # %bb.0:
2644; CHECK-O3-NEXT:    movzbl (%rdi), %ecx
2645; CHECK-O3-NEXT:    movzbl 1(%rdi), %eax
2646; CHECK-O3-NEXT:    shll $8, %eax
2647; CHECK-O3-NEXT:    orl %ecx, %eax
2648; CHECK-O3-NEXT:    # kill: def $ax killed $ax killed $eax
2649; CHECK-O3-NEXT:    retq
2650  %v1 = load atomic i8, i8* %p unordered, align 2
2651  %p2 = getelementptr i8, i8* %p, i64 1
2652  %v2 = load atomic i8, i8* %p2 unordered, align 1
2653  %v1.ext = zext i8 %v1 to i16
2654  %v2.ext = zext i8 %v2 to i16
2655  %v2.sht = shl i16 %v2.ext, 8
2656  %res = or i16 %v1.ext, %v2.sht
2657  ret i16 %res
2658}
2659
2660define i1 @fold_cmp_over_fence(i32* %p, i32 %v1) {
2661; CHECK-O0-LABEL: fold_cmp_over_fence:
2662; CHECK-O0:       # %bb.0:
2663; CHECK-O0-NEXT:    movl (%rdi), %eax
2664; CHECK-O0-NEXT:    mfence
2665; CHECK-O0-NEXT:    cmpl %eax, %esi
2666; CHECK-O0-NEXT:    jne .LBB116_2
2667; CHECK-O0-NEXT:  # %bb.1: # %taken
2668; CHECK-O0-NEXT:    movb $1, %al
2669; CHECK-O0-NEXT:    retq
2670; CHECK-O0-NEXT:  .LBB116_2: # %untaken
2671; CHECK-O0-NEXT:    xorl %eax, %eax
2672; CHECK-O0-NEXT:    # kill: def $al killed $al killed $eax
2673; CHECK-O0-NEXT:    retq
2674;
2675; CHECK-O3-CUR-LABEL: fold_cmp_over_fence:
2676; CHECK-O3-CUR:       # %bb.0:
2677; CHECK-O3-CUR-NEXT:    movl (%rdi), %eax
2678; CHECK-O3-CUR-NEXT:    mfence
2679; CHECK-O3-CUR-NEXT:    cmpl %eax, %esi
2680; CHECK-O3-CUR-NEXT:    jne .LBB116_2
2681; CHECK-O3-CUR-NEXT:  # %bb.1: # %taken
2682; CHECK-O3-CUR-NEXT:    movb $1, %al
2683; CHECK-O3-CUR-NEXT:    retq
2684; CHECK-O3-CUR-NEXT:  .LBB116_2: # %untaken
2685; CHECK-O3-CUR-NEXT:    xorl %eax, %eax
2686; CHECK-O3-CUR-NEXT:    retq
2687;
2688; CHECK-O3-EX-LABEL: fold_cmp_over_fence:
2689; CHECK-O3-EX:       # %bb.0:
2690; CHECK-O3-EX-NEXT:    cmpl (%rdi), %esi
2691; CHECK-O3-EX-NEXT:    mfence
2692; CHECK-O3-EX-NEXT:    jne .LBB116_2
2693; CHECK-O3-EX-NEXT:  # %bb.1: # %taken
2694; CHECK-O3-EX-NEXT:    movb $1, %al
2695; CHECK-O3-EX-NEXT:    retq
2696; CHECK-O3-EX-NEXT:  .LBB116_2: # %untaken
2697; CHECK-O3-EX-NEXT:    xorl %eax, %eax
2698; CHECK-O3-EX-NEXT:    retq
2699  %v2 = load atomic i32, i32* %p unordered, align 4
2700  fence seq_cst
2701  %cmp = icmp eq i32 %v1, %v2
2702  br i1 %cmp, label %taken, label %untaken
2703taken:
2704  ret i1 true
2705untaken:
2706  ret i1 false
2707}
2708