1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -O0 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake -x86-experimental-unordered-atomic-isel=0 | FileCheck --check-prefixes=CHECK,CHECK-O0,CHECK-O0-CUR %s
3; RUN: llc -O3 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake  -x86-experimental-unordered-atomic-isel=0 | FileCheck --check-prefixes=CHECK,CHECK-O3,CHECK-O3-CUR %s
4; RUN: llc -O0 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake -x86-experimental-unordered-atomic-isel=1 | FileCheck --check-prefixes=CHECK,CHECK-O0,CHECK-O0-EX %s
5; RUN: llc -O3 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake -x86-experimental-unordered-atomic-isel=1 | FileCheck --check-prefixes=CHECK,CHECK-O3,CHECK-O3-EX %s
6
7define i8 @load_i8(i8* %ptr) {
8; CHECK-O0-LABEL: load_i8:
9; CHECK-O0:       # %bb.0:
10; CHECK-O0-NEXT:    movb (%rdi), %al
11; CHECK-O0-NEXT:    retq
12;
13; CHECK-O3-LABEL: load_i8:
14; CHECK-O3:       # %bb.0:
15; CHECK-O3-NEXT:    movzbl (%rdi), %eax
16; CHECK-O3-NEXT:    retq
17  %v = load atomic i8, i8* %ptr unordered, align 1
18  ret i8 %v
19}
20
21define void @store_i8(i8* %ptr, i8 %v) {
22; CHECK-O0-LABEL: store_i8:
23; CHECK-O0:       # %bb.0:
24; CHECK-O0-NEXT:    movb %sil, %al
25; CHECK-O0-NEXT:    movb %al, (%rdi)
26; CHECK-O0-NEXT:    retq
27;
28; CHECK-O3-LABEL: store_i8:
29; CHECK-O3:       # %bb.0:
30; CHECK-O3-NEXT:    movb %sil, (%rdi)
31; CHECK-O3-NEXT:    retq
32  store atomic i8 %v, i8* %ptr unordered, align 1
33  ret void
34}
35
36define i16 @load_i16(i16* %ptr) {
37; CHECK-O0-LABEL: load_i16:
38; CHECK-O0:       # %bb.0:
39; CHECK-O0-NEXT:    movw (%rdi), %ax
40; CHECK-O0-NEXT:    retq
41;
42; CHECK-O3-LABEL: load_i16:
43; CHECK-O3:       # %bb.0:
44; CHECK-O3-NEXT:    movzwl (%rdi), %eax
45; CHECK-O3-NEXT:    retq
46  %v = load atomic i16, i16* %ptr unordered, align 2
47  ret i16 %v
48}
49
50
51define void @store_i16(i16* %ptr, i16 %v) {
52; CHECK-O0-LABEL: store_i16:
53; CHECK-O0:       # %bb.0:
54; CHECK-O0-NEXT:    movw %si, %ax
55; CHECK-O0-NEXT:    movw %ax, (%rdi)
56; CHECK-O0-NEXT:    retq
57;
58; CHECK-O3-LABEL: store_i16:
59; CHECK-O3:       # %bb.0:
60; CHECK-O3-NEXT:    movw %si, (%rdi)
61; CHECK-O3-NEXT:    retq
62  store atomic i16 %v, i16* %ptr unordered, align 2
63  ret void
64}
65
66define i32 @load_i32(i32* %ptr) {
67; CHECK-LABEL: load_i32:
68; CHECK:       # %bb.0:
69; CHECK-NEXT:    movl (%rdi), %eax
70; CHECK-NEXT:    retq
71  %v = load atomic i32, i32* %ptr unordered, align 4
72  ret i32 %v
73}
74
75define void @store_i32(i32* %ptr, i32 %v) {
76; CHECK-LABEL: store_i32:
77; CHECK:       # %bb.0:
78; CHECK-NEXT:    movl %esi, (%rdi)
79; CHECK-NEXT:    retq
80  store atomic i32 %v, i32* %ptr unordered, align 4
81  ret void
82}
83
84define i64 @load_i64(i64* %ptr) {
85; CHECK-LABEL: load_i64:
86; CHECK:       # %bb.0:
87; CHECK-NEXT:    movq (%rdi), %rax
88; CHECK-NEXT:    retq
89  %v = load atomic i64, i64* %ptr unordered, align 8
90  ret i64 %v
91}
92
93define void @store_i64(i64* %ptr, i64 %v) {
94; CHECK-LABEL: store_i64:
95; CHECK:       # %bb.0:
96; CHECK-NEXT:    movq %rsi, (%rdi)
97; CHECK-NEXT:    retq
98  store atomic i64 %v, i64* %ptr unordered, align 8
99  ret void
100}
101
102;; The tests in the rest of this file are intended to show transforms which we
103;; either *can't* do for legality, or don't currently implement.  The later
104;; are noted carefully where relevant.
105
106;; Start w/some clearly illegal ones.
107
108; Must use a full width op, not a byte op
109define void @narrow_writeback_or(i64* %ptr) {
110; CHECK-O0-LABEL: narrow_writeback_or:
111; CHECK-O0:       # %bb.0:
112; CHECK-O0-NEXT:    movq (%rdi), %rax
113; CHECK-O0-NEXT:    orq $7, %rax
114; CHECK-O0-NEXT:    movq %rax, (%rdi)
115; CHECK-O0-NEXT:    retq
116;
117; CHECK-O3-LABEL: narrow_writeback_or:
118; CHECK-O3:       # %bb.0:
119; CHECK-O3-NEXT:    orq $7, (%rdi)
120; CHECK-O3-NEXT:    retq
121  %v = load atomic i64, i64* %ptr unordered, align 8
122  %v.new = or i64 %v, 7
123  store atomic i64 %v.new, i64* %ptr unordered, align 8
124  ret void
125}
126
127; Must use a full width op, not a byte op
128define void @narrow_writeback_and(i64* %ptr) {
129; CHECK-O0-LABEL: narrow_writeback_and:
130; CHECK-O0:       # %bb.0:
131; CHECK-O0-NEXT:    movq (%rdi), %rax
132; CHECK-O0-NEXT:    # kill: def $eax killed $eax killed $rax
133; CHECK-O0-NEXT:    andl $-256, %eax
134; CHECK-O0-NEXT:    # kill: def $rax killed $eax
135; CHECK-O0-NEXT:    movq %rax, (%rdi)
136; CHECK-O0-NEXT:    retq
137;
138; CHECK-O3-LABEL: narrow_writeback_and:
139; CHECK-O3:       # %bb.0:
140; CHECK-O3-NEXT:    movl $4294967040, %eax # imm = 0xFFFFFF00
141; CHECK-O3-NEXT:    andq %rax, (%rdi)
142; CHECK-O3-NEXT:    retq
143  %v = load atomic i64, i64* %ptr unordered, align 8
144  %v.new = and i64 %v, 4294967040 ;; 0xFFFF_FF00
145  store atomic i64 %v.new, i64* %ptr unordered, align 8
146  ret void
147}
148
149; Must use a full width op, not a byte op
150define void @narrow_writeback_xor(i64* %ptr) {
151; CHECK-O0-LABEL: narrow_writeback_xor:
152; CHECK-O0:       # %bb.0:
153; CHECK-O0-NEXT:    movq (%rdi), %rax
154; CHECK-O0-NEXT:    xorq $7, %rax
155; CHECK-O0-NEXT:    movq %rax, (%rdi)
156; CHECK-O0-NEXT:    retq
157;
158; CHECK-O3-LABEL: narrow_writeback_xor:
159; CHECK-O3:       # %bb.0:
160; CHECK-O3-NEXT:    xorq $7, (%rdi)
161; CHECK-O3-NEXT:    retq
162  %v = load atomic i64, i64* %ptr unordered, align 8
163  %v.new = xor i64 %v, 7
164  store atomic i64 %v.new, i64* %ptr unordered, align 8
165  ret void
166}
167
168;; Next batch of tests are exercising cases where store widening would
169;; improve codegeneration.  Note that widening is only legal if the
170;; resulting type would be atomic.  Each tests has a well aligned, and
171;; unaligned variant to ensure we get correct codegen here.
172;; Note: It's not a legality issue, but there's a gotcha here to be aware
173;; of.  Once we widen a pair of atomic stores, we loose the information
174;; that the original atomicity requirement was half the width.  Given that,
175;; we can't then split the load again.  This challenges our usual iterative
176;; approach to incremental improvement.
177
178; Legal if wider type is also atomic (TODO)
179define void @widen_store(i32* %p0, i32 %v1, i32 %v2) {
180; CHECK-LABEL: widen_store:
181; CHECK:       # %bb.0:
182; CHECK-NEXT:    movl %esi, (%rdi)
183; CHECK-NEXT:    movl %edx, 4(%rdi)
184; CHECK-NEXT:    retq
185  %p1 = getelementptr i32, i32* %p0, i64 1
186  store atomic i32 %v1, i32* %p0 unordered, align 8
187  store atomic i32 %v2, i32* %p1 unordered, align 4
188  ret void
189}
190
191; This one is *NOT* legal to widen.  With weaker alignment,
192; the wider type might cross a cache line and violate the
193; atomicity requirement.
194define void @widen_store_unaligned(i32* %p0, i32 %v1, i32 %v2) {
195; CHECK-LABEL: widen_store_unaligned:
196; CHECK:       # %bb.0:
197; CHECK-NEXT:    movl %esi, (%rdi)
198; CHECK-NEXT:    movl %edx, 4(%rdi)
199; CHECK-NEXT:    retq
200  %p1 = getelementptr i32, i32* %p0, i64 1
201  store atomic i32 %v1, i32* %p0 unordered, align 4
202  store atomic i32 %v2, i32* %p1 unordered, align 4
203  ret void
204}
205
206; Legal if wider type is also atomic (TODO)
207define void @widen_broadcast(i32* %p0, i32 %v) {
208; CHECK-LABEL: widen_broadcast:
209; CHECK:       # %bb.0:
210; CHECK-NEXT:    movl %esi, (%rdi)
211; CHECK-NEXT:    movl %esi, 4(%rdi)
212; CHECK-NEXT:    retq
213  %p1 = getelementptr i32, i32* %p0, i64 1
214  store atomic i32 %v, i32* %p0 unordered, align 8
215  store atomic i32 %v, i32* %p1 unordered, align 4
216  ret void
217}
218
219; Not legal to widen due to alignment restriction
220define void @widen_broadcast_unaligned(i32* %p0, i32 %v) {
221; CHECK-LABEL: widen_broadcast_unaligned:
222; CHECK:       # %bb.0:
223; CHECK-NEXT:    movl %esi, (%rdi)
224; CHECK-NEXT:    movl %esi, 4(%rdi)
225; CHECK-NEXT:    retq
226  %p1 = getelementptr i32, i32* %p0, i64 1
227  store atomic i32 %v, i32* %p0 unordered, align 4
228  store atomic i32 %v, i32* %p1 unordered, align 4
229  ret void
230}
231
232define i128 @load_i128(i128* %ptr) {
233; CHECK-O0-LABEL: load_i128:
234; CHECK-O0:       # %bb.0:
235; CHECK-O0-NEXT:    pushq %rbx
236; CHECK-O0-NEXT:    .cfi_def_cfa_offset 16
237; CHECK-O0-NEXT:    .cfi_offset %rbx, -16
238; CHECK-O0-NEXT:    xorl %eax, %eax
239; CHECK-O0-NEXT:    movl %eax, %ebx
240; CHECK-O0-NEXT:    movq %rbx, %rax
241; CHECK-O0-NEXT:    movq %rbx, %rdx
242; CHECK-O0-NEXT:    movq %rbx, %rcx
243; CHECK-O0-NEXT:    lock cmpxchg16b (%rdi)
244; CHECK-O0-NEXT:    popq %rbx
245; CHECK-O0-NEXT:    .cfi_def_cfa_offset 8
246; CHECK-O0-NEXT:    retq
247;
248; CHECK-O3-LABEL: load_i128:
249; CHECK-O3:       # %bb.0:
250; CHECK-O3-NEXT:    pushq %rbx
251; CHECK-O3-NEXT:    .cfi_def_cfa_offset 16
252; CHECK-O3-NEXT:    .cfi_offset %rbx, -16
253; CHECK-O3-NEXT:    xorl %eax, %eax
254; CHECK-O3-NEXT:    xorl %edx, %edx
255; CHECK-O3-NEXT:    xorl %ecx, %ecx
256; CHECK-O3-NEXT:    xorl %ebx, %ebx
257; CHECK-O3-NEXT:    lock cmpxchg16b (%rdi)
258; CHECK-O3-NEXT:    popq %rbx
259; CHECK-O3-NEXT:    .cfi_def_cfa_offset 8
260; CHECK-O3-NEXT:    retq
261  %v = load atomic i128, i128* %ptr unordered, align 16
262  ret i128 %v
263}
264
265define void @store_i128(i128* %ptr, i128 %v) {
266; CHECK-O0-LABEL: store_i128:
267; CHECK-O0:       # %bb.0:
268; CHECK-O0-NEXT:    pushq %rbx
269; CHECK-O0-NEXT:    .cfi_def_cfa_offset 16
270; CHECK-O0-NEXT:    .cfi_offset %rbx, -16
271; CHECK-O0-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
272; CHECK-O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
273; CHECK-O0-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
274; CHECK-O0-NEXT:    movq (%rdi), %rax
275; CHECK-O0-NEXT:    movq 8(%rdi), %rdx
276; CHECK-O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
277; CHECK-O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
278; CHECK-O0-NEXT:    jmp .LBB16_1
279; CHECK-O0-NEXT:  .LBB16_1: # %atomicrmw.start
280; CHECK-O0-NEXT:    # =>This Inner Loop Header: Depth=1
281; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
282; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
283; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
284; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
285; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
286; CHECK-O0-NEXT:    lock cmpxchg16b (%rsi)
287; CHECK-O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
288; CHECK-O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
289; CHECK-O0-NEXT:    jne .LBB16_1
290; CHECK-O0-NEXT:    jmp .LBB16_2
291; CHECK-O0-NEXT:  .LBB16_2: # %atomicrmw.end
292; CHECK-O0-NEXT:    popq %rbx
293; CHECK-O0-NEXT:    .cfi_def_cfa_offset 8
294; CHECK-O0-NEXT:    retq
295;
296; CHECK-O3-LABEL: store_i128:
297; CHECK-O3:       # %bb.0:
298; CHECK-O3-NEXT:    pushq %rbx
299; CHECK-O3-NEXT:    .cfi_def_cfa_offset 16
300; CHECK-O3-NEXT:    .cfi_offset %rbx, -16
301; CHECK-O3-NEXT:    movq %rdx, %rcx
302; CHECK-O3-NEXT:    movq %rsi, %rbx
303; CHECK-O3-NEXT:    movq (%rdi), %rax
304; CHECK-O3-NEXT:    movq 8(%rdi), %rdx
305; CHECK-O3-NEXT:    .p2align 4, 0x90
306; CHECK-O3-NEXT:  .LBB16_1: # %atomicrmw.start
307; CHECK-O3-NEXT:    # =>This Inner Loop Header: Depth=1
308; CHECK-O3-NEXT:    lock cmpxchg16b (%rdi)
309; CHECK-O3-NEXT:    jne .LBB16_1
310; CHECK-O3-NEXT:  # %bb.2: # %atomicrmw.end
311; CHECK-O3-NEXT:    popq %rbx
312; CHECK-O3-NEXT:    .cfi_def_cfa_offset 8
313; CHECK-O3-NEXT:    retq
314  store atomic i128 %v, i128* %ptr unordered, align 16
315  ret void
316}
317
318define i256 @load_i256(i256* %ptr) {
319; CHECK-O0-LABEL: load_i256:
320; CHECK-O0:       # %bb.0:
321; CHECK-O0-NEXT:    subq $56, %rsp
322; CHECK-O0-NEXT:    .cfi_def_cfa_offset 64
323; CHECK-O0-NEXT:    movq %rdi, %rax
324; CHECK-O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
325; CHECK-O0-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
326; CHECK-O0-NEXT:    movl $32, %edi
327; CHECK-O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
328; CHECK-O0-NEXT:    xorl %ecx, %ecx
329; CHECK-O0-NEXT:    callq __atomic_load@PLT
330; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
331; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
332; CHECK-O0-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
333; CHECK-O0-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
334; CHECK-O0-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
335; CHECK-O0-NEXT:    movq {{[0-9]+}}(%rsp), %r8
336; CHECK-O0-NEXT:    movq %r8, 24(%rdi)
337; CHECK-O0-NEXT:    movq %rsi, 16(%rdi)
338; CHECK-O0-NEXT:    movq %rdx, 8(%rdi)
339; CHECK-O0-NEXT:    movq %rcx, (%rdi)
340; CHECK-O0-NEXT:    addq $56, %rsp
341; CHECK-O0-NEXT:    .cfi_def_cfa_offset 8
342; CHECK-O0-NEXT:    retq
343;
344; CHECK-O3-LABEL: load_i256:
345; CHECK-O3:       # %bb.0:
346; CHECK-O3-NEXT:    pushq %rbx
347; CHECK-O3-NEXT:    .cfi_def_cfa_offset 16
348; CHECK-O3-NEXT:    subq $32, %rsp
349; CHECK-O3-NEXT:    .cfi_def_cfa_offset 48
350; CHECK-O3-NEXT:    .cfi_offset %rbx, -16
351; CHECK-O3-NEXT:    movq %rdi, %rbx
352; CHECK-O3-NEXT:    movq %rsp, %rdx
353; CHECK-O3-NEXT:    movl $32, %edi
354; CHECK-O3-NEXT:    xorl %ecx, %ecx
355; CHECK-O3-NEXT:    callq __atomic_load@PLT
356; CHECK-O3-NEXT:    vmovups (%rsp), %ymm0
357; CHECK-O3-NEXT:    vmovups %ymm0, (%rbx)
358; CHECK-O3-NEXT:    movq %rbx, %rax
359; CHECK-O3-NEXT:    addq $32, %rsp
360; CHECK-O3-NEXT:    .cfi_def_cfa_offset 16
361; CHECK-O3-NEXT:    popq %rbx
362; CHECK-O3-NEXT:    .cfi_def_cfa_offset 8
363; CHECK-O3-NEXT:    vzeroupper
364; CHECK-O3-NEXT:    retq
365  %v = load atomic i256, i256* %ptr unordered, align 16
366  ret i256 %v
367}
368
369define void @store_i256(i256* %ptr, i256 %v) {
370; CHECK-O0-LABEL: store_i256:
371; CHECK-O0:       # %bb.0:
372; CHECK-O0-NEXT:    subq $40, %rsp
373; CHECK-O0-NEXT:    .cfi_def_cfa_offset 48
374; CHECK-O0-NEXT:    movq %rdx, %rax
375; CHECK-O0-NEXT:    movq %rsi, (%rsp) # 8-byte Spill
376; CHECK-O0-NEXT:    movq %rdi, %rsi
377; CHECK-O0-NEXT:    movq (%rsp), %rdi # 8-byte Reload
378; CHECK-O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
379; CHECK-O0-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
380; CHECK-O0-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
381; CHECK-O0-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
382; CHECK-O0-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
383; CHECK-O0-NEXT:    movl $32, %edi
384; CHECK-O0-NEXT:    xorl %ecx, %ecx
385; CHECK-O0-NEXT:    callq __atomic_store@PLT
386; CHECK-O0-NEXT:    addq $40, %rsp
387; CHECK-O0-NEXT:    .cfi_def_cfa_offset 8
388; CHECK-O0-NEXT:    retq
389;
390; CHECK-O3-LABEL: store_i256:
391; CHECK-O3:       # %bb.0:
392; CHECK-O3-NEXT:    subq $40, %rsp
393; CHECK-O3-NEXT:    .cfi_def_cfa_offset 48
394; CHECK-O3-NEXT:    movq %rdi, %rax
395; CHECK-O3-NEXT:    movq %r8, {{[0-9]+}}(%rsp)
396; CHECK-O3-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
397; CHECK-O3-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
398; CHECK-O3-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
399; CHECK-O3-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
400; CHECK-O3-NEXT:    movl $32, %edi
401; CHECK-O3-NEXT:    movq %rax, %rsi
402; CHECK-O3-NEXT:    xorl %ecx, %ecx
403; CHECK-O3-NEXT:    callq __atomic_store@PLT
404; CHECK-O3-NEXT:    addq $40, %rsp
405; CHECK-O3-NEXT:    .cfi_def_cfa_offset 8
406; CHECK-O3-NEXT:    retq
407  store atomic i256 %v, i256* %ptr unordered, align 16
408  ret void
409}
410
411; Legal if wider type is also atomic (TODO)
412define void @vec_store(i32* %p0, <2 x i32> %vec) {
413; CHECK-O0-CUR-LABEL: vec_store:
414; CHECK-O0-CUR:       # %bb.0:
415; CHECK-O0-CUR-NEXT:    vmovd %xmm0, %ecx
416; CHECK-O0-CUR-NEXT:    vpextrd $1, %xmm0, %eax
417; CHECK-O0-CUR-NEXT:    movl %ecx, (%rdi)
418; CHECK-O0-CUR-NEXT:    movl %eax, 4(%rdi)
419; CHECK-O0-CUR-NEXT:    retq
420;
421; CHECK-O3-CUR-LABEL: vec_store:
422; CHECK-O3-CUR:       # %bb.0:
423; CHECK-O3-CUR-NEXT:    vmovd %xmm0, %eax
424; CHECK-O3-CUR-NEXT:    vpextrd $1, %xmm0, %ecx
425; CHECK-O3-CUR-NEXT:    movl %eax, (%rdi)
426; CHECK-O3-CUR-NEXT:    movl %ecx, 4(%rdi)
427; CHECK-O3-CUR-NEXT:    retq
428;
429; CHECK-O0-EX-LABEL: vec_store:
430; CHECK-O0-EX:       # %bb.0:
431; CHECK-O0-EX-NEXT:    vmovd %xmm0, (%rdi)
432; CHECK-O0-EX-NEXT:    vpextrd $1, %xmm0, 4(%rdi)
433; CHECK-O0-EX-NEXT:    retq
434;
435; CHECK-O3-EX-LABEL: vec_store:
436; CHECK-O3-EX:       # %bb.0:
437; CHECK-O3-EX-NEXT:    vmovss %xmm0, (%rdi)
438; CHECK-O3-EX-NEXT:    vextractps $1, %xmm0, 4(%rdi)
439; CHECK-O3-EX-NEXT:    retq
440  %v1 = extractelement <2 x i32> %vec, i32 0
441  %v2 = extractelement <2 x i32> %vec, i32 1
442  %p1 = getelementptr i32, i32* %p0, i64 1
443  store atomic i32 %v1, i32* %p0 unordered, align 8
444  store atomic i32 %v2, i32* %p1 unordered, align 4
445  ret void
446}
447
448; Not legal to widen due to alignment restriction
449define void @vec_store_unaligned(i32* %p0, <2 x i32> %vec) {
450; CHECK-O0-CUR-LABEL: vec_store_unaligned:
451; CHECK-O0-CUR:       # %bb.0:
452; CHECK-O0-CUR-NEXT:    vmovd %xmm0, %ecx
453; CHECK-O0-CUR-NEXT:    vpextrd $1, %xmm0, %eax
454; CHECK-O0-CUR-NEXT:    movl %ecx, (%rdi)
455; CHECK-O0-CUR-NEXT:    movl %eax, 4(%rdi)
456; CHECK-O0-CUR-NEXT:    retq
457;
458; CHECK-O3-CUR-LABEL: vec_store_unaligned:
459; CHECK-O3-CUR:       # %bb.0:
460; CHECK-O3-CUR-NEXT:    vmovd %xmm0, %eax
461; CHECK-O3-CUR-NEXT:    vpextrd $1, %xmm0, %ecx
462; CHECK-O3-CUR-NEXT:    movl %eax, (%rdi)
463; CHECK-O3-CUR-NEXT:    movl %ecx, 4(%rdi)
464; CHECK-O3-CUR-NEXT:    retq
465;
466; CHECK-O0-EX-LABEL: vec_store_unaligned:
467; CHECK-O0-EX:       # %bb.0:
468; CHECK-O0-EX-NEXT:    vmovd %xmm0, (%rdi)
469; CHECK-O0-EX-NEXT:    vpextrd $1, %xmm0, 4(%rdi)
470; CHECK-O0-EX-NEXT:    retq
471;
472; CHECK-O3-EX-LABEL: vec_store_unaligned:
473; CHECK-O3-EX:       # %bb.0:
474; CHECK-O3-EX-NEXT:    vmovss %xmm0, (%rdi)
475; CHECK-O3-EX-NEXT:    vextractps $1, %xmm0, 4(%rdi)
476; CHECK-O3-EX-NEXT:    retq
477  %v1 = extractelement <2 x i32> %vec, i32 0
478  %v2 = extractelement <2 x i32> %vec, i32 1
479  %p1 = getelementptr i32, i32* %p0, i64 1
480  store atomic i32 %v1, i32* %p0 unordered, align 4
481  store atomic i32 %v2, i32* %p1 unordered, align 4
482  ret void
483}
484
485
486
487; Legal if wider type is also atomic (TODO)
488; Also, can avoid register move from xmm to eax (TODO)
489define void @widen_broadcast2(i32* %p0, <2 x i32> %vec) {
490; CHECK-O0-CUR-LABEL: widen_broadcast2:
491; CHECK-O0-CUR:       # %bb.0:
492; CHECK-O0-CUR-NEXT:    vmovd %xmm0, %eax
493; CHECK-O0-CUR-NEXT:    movl %eax, (%rdi)
494; CHECK-O0-CUR-NEXT:    movl %eax, 4(%rdi)
495; CHECK-O0-CUR-NEXT:    retq
496;
497; CHECK-O3-CUR-LABEL: widen_broadcast2:
498; CHECK-O3-CUR:       # %bb.0:
499; CHECK-O3-CUR-NEXT:    vmovd %xmm0, %eax
500; CHECK-O3-CUR-NEXT:    movl %eax, (%rdi)
501; CHECK-O3-CUR-NEXT:    movl %eax, 4(%rdi)
502; CHECK-O3-CUR-NEXT:    retq
503;
504; CHECK-O0-EX-LABEL: widen_broadcast2:
505; CHECK-O0-EX:       # %bb.0:
506; CHECK-O0-EX-NEXT:    vmovd %xmm0, (%rdi)
507; CHECK-O0-EX-NEXT:    vmovd %xmm0, 4(%rdi)
508; CHECK-O0-EX-NEXT:    retq
509;
510; CHECK-O3-EX-LABEL: widen_broadcast2:
511; CHECK-O3-EX:       # %bb.0:
512; CHECK-O3-EX-NEXT:    vmovss %xmm0, (%rdi)
513; CHECK-O3-EX-NEXT:    vmovss %xmm0, 4(%rdi)
514; CHECK-O3-EX-NEXT:    retq
515  %v1 = extractelement <2 x i32> %vec, i32 0
516  %p1 = getelementptr i32, i32* %p0, i64 1
517  store atomic i32 %v1, i32* %p0 unordered, align 8
518  store atomic i32 %v1, i32* %p1 unordered, align 4
519  ret void
520}
521
522; Not legal to widen due to alignment restriction
523define void @widen_broadcast2_unaligned(i32* %p0, <2 x i32> %vec) {
524; CHECK-O0-CUR-LABEL: widen_broadcast2_unaligned:
525; CHECK-O0-CUR:       # %bb.0:
526; CHECK-O0-CUR-NEXT:    vmovd %xmm0, %eax
527; CHECK-O0-CUR-NEXT:    movl %eax, (%rdi)
528; CHECK-O0-CUR-NEXT:    movl %eax, 4(%rdi)
529; CHECK-O0-CUR-NEXT:    retq
530;
531; CHECK-O3-CUR-LABEL: widen_broadcast2_unaligned:
532; CHECK-O3-CUR:       # %bb.0:
533; CHECK-O3-CUR-NEXT:    vmovd %xmm0, %eax
534; CHECK-O3-CUR-NEXT:    movl %eax, (%rdi)
535; CHECK-O3-CUR-NEXT:    movl %eax, 4(%rdi)
536; CHECK-O3-CUR-NEXT:    retq
537;
538; CHECK-O0-EX-LABEL: widen_broadcast2_unaligned:
539; CHECK-O0-EX:       # %bb.0:
540; CHECK-O0-EX-NEXT:    vmovd %xmm0, (%rdi)
541; CHECK-O0-EX-NEXT:    vmovd %xmm0, 4(%rdi)
542; CHECK-O0-EX-NEXT:    retq
543;
544; CHECK-O3-EX-LABEL: widen_broadcast2_unaligned:
545; CHECK-O3-EX:       # %bb.0:
546; CHECK-O3-EX-NEXT:    vmovss %xmm0, (%rdi)
547; CHECK-O3-EX-NEXT:    vmovss %xmm0, 4(%rdi)
548; CHECK-O3-EX-NEXT:    retq
549  %v1 = extractelement <2 x i32> %vec, i32 0
550  %p1 = getelementptr i32, i32* %p0, i64 1
551  store atomic i32 %v1, i32* %p0 unordered, align 4
552  store atomic i32 %v1, i32* %p1 unordered, align 4
553  ret void
554}
555
556; Legal if wider type is also atomic (TODO)
557define void @widen_zero_init(i32* %p0, i32 %v1, i32 %v2) {
558; CHECK-LABEL: widen_zero_init:
559; CHECK:       # %bb.0:
560; CHECK-NEXT:    movl $0, (%rdi)
561; CHECK-NEXT:    movl $0, 4(%rdi)
562; CHECK-NEXT:    retq
563  %p1 = getelementptr i32, i32* %p0, i64 1
564  store atomic i32 0, i32* %p0 unordered, align 8
565  store atomic i32 0, i32* %p1 unordered, align 4
566  ret void
567}
568
569; Not legal to widen due to alignment restriction
570define void @widen_zero_init_unaligned(i32* %p0, i32 %v1, i32 %v2) {
571; CHECK-LABEL: widen_zero_init_unaligned:
572; CHECK:       # %bb.0:
573; CHECK-NEXT:    movl $0, (%rdi)
574; CHECK-NEXT:    movl $0, 4(%rdi)
575; CHECK-NEXT:    retq
576  %p1 = getelementptr i32, i32* %p0, i64 1
577  store atomic i32 0, i32* %p0 unordered, align 4
578  store atomic i32 0, i32* %p1 unordered, align 4
579  ret void
580}
581
582;; The next batch of tests are stressing load folding. Folding is legal
583;; on x86, so these are simply checking optimization quality.
584
585; Legal, as expected
586define i64 @load_fold_add1(i64* %p) {
587; CHECK-LABEL: load_fold_add1:
588; CHECK:       # %bb.0:
589; CHECK-NEXT:    movq (%rdi), %rax
590; CHECK-NEXT:    addq $15, %rax
591; CHECK-NEXT:    retq
592  %v = load atomic i64, i64* %p unordered, align 8
593  %ret = add i64 %v, 15
594  ret i64 %ret
595}
596
597define i64 @load_fold_add2(i64* %p, i64 %v2) {
598; CHECK-LABEL: load_fold_add2:
599; CHECK:       # %bb.0:
600; CHECK-NEXT:    movq %rsi, %rax
601; CHECK-NEXT:    addq (%rdi), %rax
602; CHECK-NEXT:    retq
603  %v = load atomic i64, i64* %p unordered, align 8
604  %ret = add i64 %v, %v2
605  ret i64 %ret
606}
607
608define i64 @load_fold_add3(i64* %p1, i64* %p2) {
609; CHECK-O0-LABEL: load_fold_add3:
610; CHECK-O0:       # %bb.0:
611; CHECK-O0-NEXT:    movq (%rdi), %rax
612; CHECK-O0-NEXT:    addq (%rsi), %rax
613; CHECK-O0-NEXT:    retq
614;
615; CHECK-O3-CUR-LABEL: load_fold_add3:
616; CHECK-O3-CUR:       # %bb.0:
617; CHECK-O3-CUR-NEXT:    movq (%rsi), %rax
618; CHECK-O3-CUR-NEXT:    addq (%rdi), %rax
619; CHECK-O3-CUR-NEXT:    retq
620;
621; CHECK-O3-EX-LABEL: load_fold_add3:
622; CHECK-O3-EX:       # %bb.0:
623; CHECK-O3-EX-NEXT:    movq (%rdi), %rax
624; CHECK-O3-EX-NEXT:    addq (%rsi), %rax
625; CHECK-O3-EX-NEXT:    retq
626  %v = load atomic i64, i64* %p1 unordered, align 8
627  %v2 = load atomic i64, i64* %p2 unordered, align 8
628  %ret = add i64 %v, %v2
629  ret i64 %ret
630}
631
632; Legal, as expected
633define i64 @load_fold_sub1(i64* %p) {
634; CHECK-O0-LABEL: load_fold_sub1:
635; CHECK-O0:       # %bb.0:
636; CHECK-O0-NEXT:    movq (%rdi), %rax
637; CHECK-O0-NEXT:    subq $15, %rax
638; CHECK-O0-NEXT:    retq
639;
640; CHECK-O3-LABEL: load_fold_sub1:
641; CHECK-O3:       # %bb.0:
642; CHECK-O3-NEXT:    movq (%rdi), %rax
643; CHECK-O3-NEXT:    addq $-15, %rax
644; CHECK-O3-NEXT:    retq
645  %v = load atomic i64, i64* %p unordered, align 8
646  %ret = sub i64 %v, 15
647  ret i64 %ret
648}
649
650define i64 @load_fold_sub2(i64* %p, i64 %v2) {
651; CHECK-LABEL: load_fold_sub2:
652; CHECK:       # %bb.0:
653; CHECK-NEXT:    movq (%rdi), %rax
654; CHECK-NEXT:    subq %rsi, %rax
655; CHECK-NEXT:    retq
656  %v = load atomic i64, i64* %p unordered, align 8
657  %ret = sub i64 %v, %v2
658  ret i64 %ret
659}
660
661define i64 @load_fold_sub3(i64* %p1, i64* %p2) {
662; CHECK-LABEL: load_fold_sub3:
663; CHECK:       # %bb.0:
664; CHECK-NEXT:    movq (%rdi), %rax
665; CHECK-NEXT:    subq (%rsi), %rax
666; CHECK-NEXT:    retq
667  %v = load atomic i64, i64* %p1 unordered, align 8
668  %v2 = load atomic i64, i64* %p2 unordered, align 8
669  %ret = sub i64 %v, %v2
670  ret i64 %ret
671}
672
673; Legal, as expected
674define i64 @load_fold_mul1(i64* %p) {
675; CHECK-O0-LABEL: load_fold_mul1:
676; CHECK-O0:       # %bb.0:
677; CHECK-O0-NEXT:    imulq $15, (%rdi), %rax
678; CHECK-O0-NEXT:    retq
679;
680; CHECK-O3-LABEL: load_fold_mul1:
681; CHECK-O3:       # %bb.0:
682; CHECK-O3-NEXT:    movq (%rdi), %rax
683; CHECK-O3-NEXT:    leaq (%rax,%rax,4), %rax
684; CHECK-O3-NEXT:    leaq (%rax,%rax,2), %rax
685; CHECK-O3-NEXT:    retq
686  %v = load atomic i64, i64* %p unordered, align 8
687  %ret = mul i64 %v, 15
688  ret i64 %ret
689}
690
691define i64 @load_fold_mul2(i64* %p, i64 %v2) {
692; CHECK-LABEL: load_fold_mul2:
693; CHECK:       # %bb.0:
694; CHECK-NEXT:    movq %rsi, %rax
695; CHECK-NEXT:    imulq (%rdi), %rax
696; CHECK-NEXT:    retq
697  %v = load atomic i64, i64* %p unordered, align 8
698  %ret = mul i64 %v, %v2
699  ret i64 %ret
700}
701
702define i64 @load_fold_mul3(i64* %p1, i64* %p2) {
703; CHECK-O0-LABEL: load_fold_mul3:
704; CHECK-O0:       # %bb.0:
705; CHECK-O0-NEXT:    movq (%rdi), %rax
706; CHECK-O0-NEXT:    imulq (%rsi), %rax
707; CHECK-O0-NEXT:    retq
708;
709; CHECK-O3-CUR-LABEL: load_fold_mul3:
710; CHECK-O3-CUR:       # %bb.0:
711; CHECK-O3-CUR-NEXT:    movq (%rsi), %rax
712; CHECK-O3-CUR-NEXT:    imulq (%rdi), %rax
713; CHECK-O3-CUR-NEXT:    retq
714;
715; CHECK-O3-EX-LABEL: load_fold_mul3:
716; CHECK-O3-EX:       # %bb.0:
717; CHECK-O3-EX-NEXT:    movq (%rdi), %rax
718; CHECK-O3-EX-NEXT:    imulq (%rsi), %rax
719; CHECK-O3-EX-NEXT:    retq
720  %v = load atomic i64, i64* %p1 unordered, align 8
721  %v2 = load atomic i64, i64* %p2 unordered, align 8
722  %ret = mul i64 %v, %v2
723  ret i64 %ret
724}
725
726; Legal to fold (TODO)
727define i64 @load_fold_sdiv1(i64* %p) {
728; CHECK-O0-LABEL: load_fold_sdiv1:
729; CHECK-O0:       # %bb.0:
730; CHECK-O0-NEXT:    movq (%rdi), %rax
731; CHECK-O0-NEXT:    movl $15, %ecx
732; CHECK-O0-NEXT:    cqto
733; CHECK-O0-NEXT:    idivq %rcx
734; CHECK-O0-NEXT:    retq
735;
736; CHECK-O3-LABEL: load_fold_sdiv1:
737; CHECK-O3:       # %bb.0:
738; CHECK-O3-NEXT:    movq (%rdi), %rcx
739; CHECK-O3-NEXT:    movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
740; CHECK-O3-NEXT:    movq %rcx, %rax
741; CHECK-O3-NEXT:    imulq %rdx
742; CHECK-O3-NEXT:    addq %rdx, %rcx
743; CHECK-O3-NEXT:    movq %rcx, %rax
744; CHECK-O3-NEXT:    shrq $63, %rax
745; CHECK-O3-NEXT:    sarq $3, %rcx
746; CHECK-O3-NEXT:    addq %rax, %rcx
747; CHECK-O3-NEXT:    movq %rcx, %rax
748; CHECK-O3-NEXT:    retq
749  %v = load atomic i64, i64* %p unordered, align 8
750  %ret = sdiv i64 %v, 15
751  ret i64 %ret
752}
753
754; Legal to fold (TODO)
755define i64 @load_fold_sdiv2(i64* %p, i64 %v2) {
756; CHECK-O0-LABEL: load_fold_sdiv2:
757; CHECK-O0:       # %bb.0:
758; CHECK-O0-NEXT:    movq (%rdi), %rax
759; CHECK-O0-NEXT:    cqto
760; CHECK-O0-NEXT:    idivq %rsi
761; CHECK-O0-NEXT:    retq
762;
763; CHECK-O3-LABEL: load_fold_sdiv2:
764; CHECK-O3:       # %bb.0:
765; CHECK-O3-NEXT:    movq (%rdi), %rax
766; CHECK-O3-NEXT:    movq %rax, %rcx
767; CHECK-O3-NEXT:    orq %rsi, %rcx
768; CHECK-O3-NEXT:    shrq $32, %rcx
769; CHECK-O3-NEXT:    je .LBB35_1
770; CHECK-O3-NEXT:  # %bb.2:
771; CHECK-O3-NEXT:    cqto
772; CHECK-O3-NEXT:    idivq %rsi
773; CHECK-O3-NEXT:    retq
774; CHECK-O3-NEXT:  .LBB35_1:
775; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
776; CHECK-O3-NEXT:    xorl %edx, %edx
777; CHECK-O3-NEXT:    divl %esi
778; CHECK-O3-NEXT:    # kill: def $eax killed $eax def $rax
779; CHECK-O3-NEXT:    retq
780  %v = load atomic i64, i64* %p unordered, align 8
781  %ret = sdiv i64 %v, %v2
782  ret i64 %ret
783}
784
785define i64 @load_fold_sdiv3(i64* %p1, i64* %p2) {
786; CHECK-O0-LABEL: load_fold_sdiv3:
787; CHECK-O0:       # %bb.0:
788; CHECK-O0-NEXT:    movq (%rdi), %rax
789; CHECK-O0-NEXT:    cqto
790; CHECK-O0-NEXT:    idivq (%rsi)
791; CHECK-O0-NEXT:    retq
792;
793; CHECK-O3-LABEL: load_fold_sdiv3:
794; CHECK-O3:       # %bb.0:
795; CHECK-O3-NEXT:    movq (%rdi), %rax
796; CHECK-O3-NEXT:    movq (%rsi), %rcx
797; CHECK-O3-NEXT:    movq %rax, %rdx
798; CHECK-O3-NEXT:    orq %rcx, %rdx
799; CHECK-O3-NEXT:    shrq $32, %rdx
800; CHECK-O3-NEXT:    je .LBB36_1
801; CHECK-O3-NEXT:  # %bb.2:
802; CHECK-O3-NEXT:    cqto
803; CHECK-O3-NEXT:    idivq %rcx
804; CHECK-O3-NEXT:    retq
805; CHECK-O3-NEXT:  .LBB36_1:
806; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
807; CHECK-O3-NEXT:    xorl %edx, %edx
808; CHECK-O3-NEXT:    divl %ecx
809; CHECK-O3-NEXT:    # kill: def $eax killed $eax def $rax
810; CHECK-O3-NEXT:    retq
811  %v = load atomic i64, i64* %p1 unordered, align 8
812  %v2 = load atomic i64, i64* %p2 unordered, align 8
813  %ret = sdiv i64 %v, %v2
814  ret i64 %ret
815}
816
817; Legal to fold (TODO)
818define i64 @load_fold_udiv1(i64* %p) {
819; CHECK-O0-LABEL: load_fold_udiv1:
820; CHECK-O0:       # %bb.0:
821; CHECK-O0-NEXT:    movq (%rdi), %rax
822; CHECK-O0-NEXT:    movl $15, %ecx
823; CHECK-O0-NEXT:    xorl %edx, %edx
824; CHECK-O0-NEXT:    # kill: def $rdx killed $edx
825; CHECK-O0-NEXT:    divq %rcx
826; CHECK-O0-NEXT:    retq
827;
828; CHECK-O3-CUR-LABEL: load_fold_udiv1:
829; CHECK-O3-CUR:       # %bb.0:
830; CHECK-O3-CUR-NEXT:    movq (%rdi), %rdx
831; CHECK-O3-CUR-NEXT:    movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
832; CHECK-O3-CUR-NEXT:    mulxq %rax, %rax, %rax
833; CHECK-O3-CUR-NEXT:    shrq $3, %rax
834; CHECK-O3-CUR-NEXT:    retq
835;
836; CHECK-O3-EX-LABEL: load_fold_udiv1:
837; CHECK-O3-EX:       # %bb.0:
838; CHECK-O3-EX-NEXT:    movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
839; CHECK-O3-EX-NEXT:    mulxq (%rdi), %rax, %rax
840; CHECK-O3-EX-NEXT:    shrq $3, %rax
841; CHECK-O3-EX-NEXT:    retq
842  %v = load atomic i64, i64* %p unordered, align 8
843  %ret = udiv i64 %v, 15
844  ret i64 %ret
845}
846
847define i64 @load_fold_udiv2(i64* %p, i64 %v2) {
848; CHECK-O0-LABEL: load_fold_udiv2:
849; CHECK-O0:       # %bb.0:
850; CHECK-O0-NEXT:    movq (%rdi), %rax
851; CHECK-O0-NEXT:    xorl %ecx, %ecx
852; CHECK-O0-NEXT:    movl %ecx, %edx
853; CHECK-O0-NEXT:    divq %rsi
854; CHECK-O0-NEXT:    retq
855;
856; CHECK-O3-LABEL: load_fold_udiv2:
857; CHECK-O3:       # %bb.0:
858; CHECK-O3-NEXT:    movq (%rdi), %rax
859; CHECK-O3-NEXT:    movq %rax, %rcx
860; CHECK-O3-NEXT:    orq %rsi, %rcx
861; CHECK-O3-NEXT:    shrq $32, %rcx
862; CHECK-O3-NEXT:    je .LBB38_1
863; CHECK-O3-NEXT:  # %bb.2:
864; CHECK-O3-NEXT:    xorl %edx, %edx
865; CHECK-O3-NEXT:    divq %rsi
866; CHECK-O3-NEXT:    retq
867; CHECK-O3-NEXT:  .LBB38_1:
868; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
869; CHECK-O3-NEXT:    xorl %edx, %edx
870; CHECK-O3-NEXT:    divl %esi
871; CHECK-O3-NEXT:    # kill: def $eax killed $eax def $rax
872; CHECK-O3-NEXT:    retq
873  %v = load atomic i64, i64* %p unordered, align 8
874  %ret = udiv i64 %v, %v2
875  ret i64 %ret
876}
877
878define i64 @load_fold_udiv3(i64* %p1, i64* %p2) {
879; CHECK-O0-LABEL: load_fold_udiv3:
880; CHECK-O0:       # %bb.0:
881; CHECK-O0-NEXT:    movq (%rdi), %rax
882; CHECK-O0-NEXT:    xorl %ecx, %ecx
883; CHECK-O0-NEXT:    movl %ecx, %edx
884; CHECK-O0-NEXT:    divq (%rsi)
885; CHECK-O0-NEXT:    retq
886;
887; CHECK-O3-LABEL: load_fold_udiv3:
888; CHECK-O3:       # %bb.0:
889; CHECK-O3-NEXT:    movq (%rdi), %rax
890; CHECK-O3-NEXT:    movq (%rsi), %rcx
891; CHECK-O3-NEXT:    movq %rax, %rdx
892; CHECK-O3-NEXT:    orq %rcx, %rdx
893; CHECK-O3-NEXT:    shrq $32, %rdx
894; CHECK-O3-NEXT:    je .LBB39_1
895; CHECK-O3-NEXT:  # %bb.2:
896; CHECK-O3-NEXT:    xorl %edx, %edx
897; CHECK-O3-NEXT:    divq %rcx
898; CHECK-O3-NEXT:    retq
899; CHECK-O3-NEXT:  .LBB39_1:
900; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
901; CHECK-O3-NEXT:    xorl %edx, %edx
902; CHECK-O3-NEXT:    divl %ecx
903; CHECK-O3-NEXT:    # kill: def $eax killed $eax def $rax
904; CHECK-O3-NEXT:    retq
905  %v = load atomic i64, i64* %p1 unordered, align 8
906  %v2 = load atomic i64, i64* %p2 unordered, align 8
907  %ret = udiv i64 %v, %v2
908  ret i64 %ret
909}
910
911; Legal to fold (TODO)
912define i64 @load_fold_srem1(i64* %p) {
913; CHECK-O0-LABEL: load_fold_srem1:
914; CHECK-O0:       # %bb.0:
915; CHECK-O0-NEXT:    movq (%rdi), %rax
916; CHECK-O0-NEXT:    movl $15, %ecx
917; CHECK-O0-NEXT:    cqto
918; CHECK-O0-NEXT:    idivq %rcx
919; CHECK-O0-NEXT:    movq %rdx, %rax
920; CHECK-O0-NEXT:    retq
921;
922; CHECK-O3-LABEL: load_fold_srem1:
923; CHECK-O3:       # %bb.0:
924; CHECK-O3-NEXT:    movq (%rdi), %rcx
925; CHECK-O3-NEXT:    movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
926; CHECK-O3-NEXT:    movq %rcx, %rax
927; CHECK-O3-NEXT:    imulq %rdx
928; CHECK-O3-NEXT:    addq %rcx, %rdx
929; CHECK-O3-NEXT:    movq %rdx, %rax
930; CHECK-O3-NEXT:    shrq $63, %rax
931; CHECK-O3-NEXT:    sarq $3, %rdx
932; CHECK-O3-NEXT:    addq %rax, %rdx
933; CHECK-O3-NEXT:    leaq (%rdx,%rdx,4), %rax
934; CHECK-O3-NEXT:    leaq (%rax,%rax,2), %rax
935; CHECK-O3-NEXT:    subq %rax, %rcx
936; CHECK-O3-NEXT:    movq %rcx, %rax
937; CHECK-O3-NEXT:    retq
938  %v = load atomic i64, i64* %p unordered, align 8
939  %ret = srem i64 %v, 15
940  ret i64 %ret
941}
942
943; Legal, as expected
944define i64 @load_fold_srem2(i64* %p, i64 %v2) {
945; CHECK-O0-LABEL: load_fold_srem2:
946; CHECK-O0:       # %bb.0:
947; CHECK-O0-NEXT:    movq (%rdi), %rax
948; CHECK-O0-NEXT:    cqto
949; CHECK-O0-NEXT:    idivq %rsi
950; CHECK-O0-NEXT:    movq %rdx, %rax
951; CHECK-O0-NEXT:    retq
952;
953; CHECK-O3-LABEL: load_fold_srem2:
954; CHECK-O3:       # %bb.0:
955; CHECK-O3-NEXT:    movq (%rdi), %rax
956; CHECK-O3-NEXT:    movq %rax, %rcx
957; CHECK-O3-NEXT:    orq %rsi, %rcx
958; CHECK-O3-NEXT:    shrq $32, %rcx
959; CHECK-O3-NEXT:    je .LBB41_1
960; CHECK-O3-NEXT:  # %bb.2:
961; CHECK-O3-NEXT:    cqto
962; CHECK-O3-NEXT:    idivq %rsi
963; CHECK-O3-NEXT:    movq %rdx, %rax
964; CHECK-O3-NEXT:    retq
965; CHECK-O3-NEXT:  .LBB41_1:
966; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
967; CHECK-O3-NEXT:    xorl %edx, %edx
968; CHECK-O3-NEXT:    divl %esi
969; CHECK-O3-NEXT:    movl %edx, %eax
970; CHECK-O3-NEXT:    retq
971  %v = load atomic i64, i64* %p unordered, align 8
972  %ret = srem i64 %v, %v2
973  ret i64 %ret
974}
975
976define i64 @load_fold_srem3(i64* %p1, i64* %p2) {
977; CHECK-O0-LABEL: load_fold_srem3:
978; CHECK-O0:       # %bb.0:
979; CHECK-O0-NEXT:    movq (%rdi), %rax
980; CHECK-O0-NEXT:    cqto
981; CHECK-O0-NEXT:    idivq (%rsi)
982; CHECK-O0-NEXT:    movq %rdx, %rax
983; CHECK-O0-NEXT:    retq
984;
985; CHECK-O3-LABEL: load_fold_srem3:
986; CHECK-O3:       # %bb.0:
987; CHECK-O3-NEXT:    movq (%rdi), %rax
988; CHECK-O3-NEXT:    movq (%rsi), %rcx
989; CHECK-O3-NEXT:    movq %rax, %rdx
990; CHECK-O3-NEXT:    orq %rcx, %rdx
991; CHECK-O3-NEXT:    shrq $32, %rdx
992; CHECK-O3-NEXT:    je .LBB42_1
993; CHECK-O3-NEXT:  # %bb.2:
994; CHECK-O3-NEXT:    cqto
995; CHECK-O3-NEXT:    idivq %rcx
996; CHECK-O3-NEXT:    movq %rdx, %rax
997; CHECK-O3-NEXT:    retq
998; CHECK-O3-NEXT:  .LBB42_1:
999; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
1000; CHECK-O3-NEXT:    xorl %edx, %edx
1001; CHECK-O3-NEXT:    divl %ecx
1002; CHECK-O3-NEXT:    movl %edx, %eax
1003; CHECK-O3-NEXT:    retq
1004  %v = load atomic i64, i64* %p1 unordered, align 8
1005  %v2 = load atomic i64, i64* %p2 unordered, align 8
1006  %ret = srem i64 %v, %v2
1007  ret i64 %ret
1008}
1009
1010; Legal to fold (TODO)
1011define i64 @load_fold_urem1(i64* %p) {
1012; CHECK-O0-LABEL: load_fold_urem1:
1013; CHECK-O0:       # %bb.0:
1014; CHECK-O0-NEXT:    movq (%rdi), %rax
1015; CHECK-O0-NEXT:    movl $15, %ecx
1016; CHECK-O0-NEXT:    xorl %edx, %edx
1017; CHECK-O0-NEXT:    # kill: def $rdx killed $edx
1018; CHECK-O0-NEXT:    divq %rcx
1019; CHECK-O0-NEXT:    movq %rdx, %rax
1020; CHECK-O0-NEXT:    retq
1021;
1022; CHECK-O3-LABEL: load_fold_urem1:
1023; CHECK-O3:       # %bb.0:
1024; CHECK-O3-NEXT:    movq (%rdi), %rax
1025; CHECK-O3-NEXT:    movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
1026; CHECK-O3-NEXT:    movq %rax, %rdx
1027; CHECK-O3-NEXT:    mulxq %rcx, %rcx, %rcx
1028; CHECK-O3-NEXT:    shrq $3, %rcx
1029; CHECK-O3-NEXT:    leaq (%rcx,%rcx,4), %rcx
1030; CHECK-O3-NEXT:    leaq (%rcx,%rcx,2), %rcx
1031; CHECK-O3-NEXT:    subq %rcx, %rax
1032; CHECK-O3-NEXT:    retq
1033  %v = load atomic i64, i64* %p unordered, align 8
1034  %ret = urem i64 %v, 15
1035  ret i64 %ret
1036}
1037
1038; Legal, as expected
1039define i64 @load_fold_urem2(i64* %p, i64 %v2) {
1040; CHECK-O0-LABEL: load_fold_urem2:
1041; CHECK-O0:       # %bb.0:
1042; CHECK-O0-NEXT:    movq (%rdi), %rax
1043; CHECK-O0-NEXT:    xorl %ecx, %ecx
1044; CHECK-O0-NEXT:    movl %ecx, %edx
1045; CHECK-O0-NEXT:    divq %rsi
1046; CHECK-O0-NEXT:    movq %rdx, %rax
1047; CHECK-O0-NEXT:    retq
1048;
1049; CHECK-O3-LABEL: load_fold_urem2:
1050; CHECK-O3:       # %bb.0:
1051; CHECK-O3-NEXT:    movq (%rdi), %rax
1052; CHECK-O3-NEXT:    movq %rax, %rcx
1053; CHECK-O3-NEXT:    orq %rsi, %rcx
1054; CHECK-O3-NEXT:    shrq $32, %rcx
1055; CHECK-O3-NEXT:    je .LBB44_1
1056; CHECK-O3-NEXT:  # %bb.2:
1057; CHECK-O3-NEXT:    xorl %edx, %edx
1058; CHECK-O3-NEXT:    divq %rsi
1059; CHECK-O3-NEXT:    movq %rdx, %rax
1060; CHECK-O3-NEXT:    retq
1061; CHECK-O3-NEXT:  .LBB44_1:
1062; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
1063; CHECK-O3-NEXT:    xorl %edx, %edx
1064; CHECK-O3-NEXT:    divl %esi
1065; CHECK-O3-NEXT:    movl %edx, %eax
1066; CHECK-O3-NEXT:    retq
1067  %v = load atomic i64, i64* %p unordered, align 8
1068  %ret = urem i64 %v, %v2
1069  ret i64 %ret
1070}
1071
1072define i64 @load_fold_urem3(i64* %p1, i64* %p2) {
1073; CHECK-O0-LABEL: load_fold_urem3:
1074; CHECK-O0:       # %bb.0:
1075; CHECK-O0-NEXT:    movq (%rdi), %rax
1076; CHECK-O0-NEXT:    xorl %ecx, %ecx
1077; CHECK-O0-NEXT:    movl %ecx, %edx
1078; CHECK-O0-NEXT:    divq (%rsi)
1079; CHECK-O0-NEXT:    movq %rdx, %rax
1080; CHECK-O0-NEXT:    retq
1081;
1082; CHECK-O3-LABEL: load_fold_urem3:
1083; CHECK-O3:       # %bb.0:
1084; CHECK-O3-NEXT:    movq (%rdi), %rax
1085; CHECK-O3-NEXT:    movq (%rsi), %rcx
1086; CHECK-O3-NEXT:    movq %rax, %rdx
1087; CHECK-O3-NEXT:    orq %rcx, %rdx
1088; CHECK-O3-NEXT:    shrq $32, %rdx
1089; CHECK-O3-NEXT:    je .LBB45_1
1090; CHECK-O3-NEXT:  # %bb.2:
1091; CHECK-O3-NEXT:    xorl %edx, %edx
1092; CHECK-O3-NEXT:    divq %rcx
1093; CHECK-O3-NEXT:    movq %rdx, %rax
1094; CHECK-O3-NEXT:    retq
1095; CHECK-O3-NEXT:  .LBB45_1:
1096; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
1097; CHECK-O3-NEXT:    xorl %edx, %edx
1098; CHECK-O3-NEXT:    divl %ecx
1099; CHECK-O3-NEXT:    movl %edx, %eax
1100; CHECK-O3-NEXT:    retq
1101  %v = load atomic i64, i64* %p1 unordered, align 8
1102  %v2 = load atomic i64, i64* %p2 unordered, align 8
1103  %ret = urem i64 %v, %v2
1104  ret i64 %ret
1105}
1106
1107; Legal, as expected
1108define i64 @load_fold_shl1(i64* %p) {
1109; CHECK-LABEL: load_fold_shl1:
1110; CHECK:       # %bb.0:
1111; CHECK-NEXT:    movq (%rdi), %rax
1112; CHECK-NEXT:    shlq $15, %rax
1113; CHECK-NEXT:    retq
1114  %v = load atomic i64, i64* %p unordered, align 8
1115  %ret = shl i64 %v, 15
1116  ret i64 %ret
1117}
1118
1119define i64 @load_fold_shl2(i64* %p, i64 %v2) {
1120; CHECK-O0-LABEL: load_fold_shl2:
1121; CHECK-O0:       # %bb.0:
1122; CHECK-O0-NEXT:    movq %rsi, %rcx
1123; CHECK-O0-NEXT:    movq (%rdi), %rax
1124; CHECK-O0-NEXT:    # kill: def $cl killed $rcx
1125; CHECK-O0-NEXT:    shlq %cl, %rax
1126; CHECK-O0-NEXT:    retq
1127;
1128; CHECK-O3-LABEL: load_fold_shl2:
1129; CHECK-O3:       # %bb.0:
1130; CHECK-O3-NEXT:    shlxq %rsi, (%rdi), %rax
1131; CHECK-O3-NEXT:    retq
1132  %v = load atomic i64, i64* %p unordered, align 8
1133  %ret = shl i64 %v, %v2
1134  ret i64 %ret
1135}
1136
1137define i64 @load_fold_shl3(i64* %p1, i64* %p2) {
1138; CHECK-O0-LABEL: load_fold_shl3:
1139; CHECK-O0:       # %bb.0:
1140; CHECK-O0-NEXT:    movq (%rdi), %rax
1141; CHECK-O0-NEXT:    movq (%rsi), %rcx
1142; CHECK-O0-NEXT:    # kill: def $cl killed $rcx
1143; CHECK-O0-NEXT:    shlq %cl, %rax
1144; CHECK-O0-NEXT:    retq
1145;
1146; CHECK-O3-LABEL: load_fold_shl3:
1147; CHECK-O3:       # %bb.0:
1148; CHECK-O3-NEXT:    movq (%rsi), %rax
1149; CHECK-O3-NEXT:    shlxq %rax, (%rdi), %rax
1150; CHECK-O3-NEXT:    retq
1151  %v = load atomic i64, i64* %p1 unordered, align 8
1152  %v2 = load atomic i64, i64* %p2 unordered, align 8
1153  %ret = shl i64 %v, %v2
1154  ret i64 %ret
1155}
1156
1157; Legal, as expected
1158define i64 @load_fold_lshr1(i64* %p) {
1159; CHECK-LABEL: load_fold_lshr1:
1160; CHECK:       # %bb.0:
1161; CHECK-NEXT:    movq (%rdi), %rax
1162; CHECK-NEXT:    shrq $15, %rax
1163; CHECK-NEXT:    retq
1164  %v = load atomic i64, i64* %p unordered, align 8
1165  %ret = lshr i64 %v, 15
1166  ret i64 %ret
1167}
1168
1169define i64 @load_fold_lshr2(i64* %p, i64 %v2) {
1170; CHECK-O0-LABEL: load_fold_lshr2:
1171; CHECK-O0:       # %bb.0:
1172; CHECK-O0-NEXT:    movq %rsi, %rcx
1173; CHECK-O0-NEXT:    movq (%rdi), %rax
1174; CHECK-O0-NEXT:    # kill: def $cl killed $rcx
1175; CHECK-O0-NEXT:    shrq %cl, %rax
1176; CHECK-O0-NEXT:    retq
1177;
1178; CHECK-O3-LABEL: load_fold_lshr2:
1179; CHECK-O3:       # %bb.0:
1180; CHECK-O3-NEXT:    shrxq %rsi, (%rdi), %rax
1181; CHECK-O3-NEXT:    retq
1182  %v = load atomic i64, i64* %p unordered, align 8
1183  %ret = lshr i64 %v, %v2
1184  ret i64 %ret
1185}
1186
1187define i64 @load_fold_lshr3(i64* %p1, i64* %p2) {
1188; CHECK-O0-LABEL: load_fold_lshr3:
1189; CHECK-O0:       # %bb.0:
1190; CHECK-O0-NEXT:    movq (%rdi), %rax
1191; CHECK-O0-NEXT:    movq (%rsi), %rcx
1192; CHECK-O0-NEXT:    # kill: def $cl killed $rcx
1193; CHECK-O0-NEXT:    shrq %cl, %rax
1194; CHECK-O0-NEXT:    retq
1195;
1196; CHECK-O3-LABEL: load_fold_lshr3:
1197; CHECK-O3:       # %bb.0:
1198; CHECK-O3-NEXT:    movq (%rsi), %rax
1199; CHECK-O3-NEXT:    shrxq %rax, (%rdi), %rax
1200; CHECK-O3-NEXT:    retq
1201  %v = load atomic i64, i64* %p1 unordered, align 8
1202  %v2 = load atomic i64, i64* %p2 unordered, align 8
1203  %ret = lshr i64 %v, %v2
1204  ret i64 %ret
1205}
1206
1207; Legal, as expected
1208define i64 @load_fold_ashr1(i64* %p) {
1209; CHECK-LABEL: load_fold_ashr1:
1210; CHECK:       # %bb.0:
1211; CHECK-NEXT:    movq (%rdi), %rax
1212; CHECK-NEXT:    sarq $15, %rax
1213; CHECK-NEXT:    retq
1214  %v = load atomic i64, i64* %p unordered, align 8
1215  %ret = ashr i64 %v, 15
1216  ret i64 %ret
1217}
1218
1219define i64 @load_fold_ashr2(i64* %p, i64 %v2) {
1220; CHECK-O0-LABEL: load_fold_ashr2:
1221; CHECK-O0:       # %bb.0:
1222; CHECK-O0-NEXT:    movq %rsi, %rcx
1223; CHECK-O0-NEXT:    movq (%rdi), %rax
1224; CHECK-O0-NEXT:    # kill: def $cl killed $rcx
1225; CHECK-O0-NEXT:    sarq %cl, %rax
1226; CHECK-O0-NEXT:    retq
1227;
1228; CHECK-O3-LABEL: load_fold_ashr2:
1229; CHECK-O3:       # %bb.0:
1230; CHECK-O3-NEXT:    sarxq %rsi, (%rdi), %rax
1231; CHECK-O3-NEXT:    retq
1232  %v = load atomic i64, i64* %p unordered, align 8
1233  %ret = ashr i64 %v, %v2
1234  ret i64 %ret
1235}
1236
1237define i64 @load_fold_ashr3(i64* %p1, i64* %p2) {
1238; CHECK-O0-LABEL: load_fold_ashr3:
1239; CHECK-O0:       # %bb.0:
1240; CHECK-O0-NEXT:    movq (%rdi), %rax
1241; CHECK-O0-NEXT:    movq (%rsi), %rcx
1242; CHECK-O0-NEXT:    # kill: def $cl killed $rcx
1243; CHECK-O0-NEXT:    sarq %cl, %rax
1244; CHECK-O0-NEXT:    retq
1245;
1246; CHECK-O3-LABEL: load_fold_ashr3:
1247; CHECK-O3:       # %bb.0:
1248; CHECK-O3-NEXT:    movq (%rsi), %rax
1249; CHECK-O3-NEXT:    sarxq %rax, (%rdi), %rax
1250; CHECK-O3-NEXT:    retq
1251  %v = load atomic i64, i64* %p1 unordered, align 8
1252  %v2 = load atomic i64, i64* %p2 unordered, align 8
1253  %ret = ashr i64 %v, %v2
1254  ret i64 %ret
1255}
1256
1257; Legal, as expected
1258define i64 @load_fold_and1(i64* %p) {
1259; CHECK-O0-LABEL: load_fold_and1:
1260; CHECK-O0:       # %bb.0:
1261; CHECK-O0-NEXT:    movq (%rdi), %rax
1262; CHECK-O0-NEXT:    andq $15, %rax
1263; CHECK-O0-NEXT:    retq
1264;
1265; CHECK-O3-LABEL: load_fold_and1:
1266; CHECK-O3:       # %bb.0:
1267; CHECK-O3-NEXT:    movq (%rdi), %rax
1268; CHECK-O3-NEXT:    andl $15, %eax
1269; CHECK-O3-NEXT:    retq
1270  %v = load atomic i64, i64* %p unordered, align 8
1271  %ret = and i64 %v, 15
1272  ret i64 %ret
1273}
1274
1275define i64 @load_fold_and2(i64* %p, i64 %v2) {
1276; CHECK-LABEL: load_fold_and2:
1277; CHECK:       # %bb.0:
1278; CHECK-NEXT:    movq %rsi, %rax
1279; CHECK-NEXT:    andq (%rdi), %rax
1280; CHECK-NEXT:    retq
1281  %v = load atomic i64, i64* %p unordered, align 8
1282  %ret = and i64 %v, %v2
1283  ret i64 %ret
1284}
1285
1286define i64 @load_fold_and3(i64* %p1, i64* %p2) {
1287; CHECK-O0-LABEL: load_fold_and3:
1288; CHECK-O0:       # %bb.0:
1289; CHECK-O0-NEXT:    movq (%rdi), %rax
1290; CHECK-O0-NEXT:    andq (%rsi), %rax
1291; CHECK-O0-NEXT:    retq
1292;
1293; CHECK-O3-CUR-LABEL: load_fold_and3:
1294; CHECK-O3-CUR:       # %bb.0:
1295; CHECK-O3-CUR-NEXT:    movq (%rsi), %rax
1296; CHECK-O3-CUR-NEXT:    andq (%rdi), %rax
1297; CHECK-O3-CUR-NEXT:    retq
1298;
1299; CHECK-O3-EX-LABEL: load_fold_and3:
1300; CHECK-O3-EX:       # %bb.0:
1301; CHECK-O3-EX-NEXT:    movq (%rdi), %rax
1302; CHECK-O3-EX-NEXT:    andq (%rsi), %rax
1303; CHECK-O3-EX-NEXT:    retq
1304  %v = load atomic i64, i64* %p1 unordered, align 8
1305  %v2 = load atomic i64, i64* %p2 unordered, align 8
1306  %ret = and i64 %v, %v2
1307  ret i64 %ret
1308}
1309
1310; Legal, as expected
1311define i64 @load_fold_or1(i64* %p) {
1312; CHECK-LABEL: load_fold_or1:
1313; CHECK:       # %bb.0:
1314; CHECK-NEXT:    movq (%rdi), %rax
1315; CHECK-NEXT:    orq $15, %rax
1316; CHECK-NEXT:    retq
1317  %v = load atomic i64, i64* %p unordered, align 8
1318  %ret = or i64 %v, 15
1319  ret i64 %ret
1320}
1321
1322define i64 @load_fold_or2(i64* %p, i64 %v2) {
1323; CHECK-LABEL: load_fold_or2:
1324; CHECK:       # %bb.0:
1325; CHECK-NEXT:    movq %rsi, %rax
1326; CHECK-NEXT:    orq (%rdi), %rax
1327; CHECK-NEXT:    retq
1328  %v = load atomic i64, i64* %p unordered, align 8
1329  %ret = or i64 %v, %v2
1330  ret i64 %ret
1331}
1332
1333define i64 @load_fold_or3(i64* %p1, i64* %p2) {
1334; CHECK-O0-LABEL: load_fold_or3:
1335; CHECK-O0:       # %bb.0:
1336; CHECK-O0-NEXT:    movq (%rdi), %rax
1337; CHECK-O0-NEXT:    orq (%rsi), %rax
1338; CHECK-O0-NEXT:    retq
1339;
1340; CHECK-O3-CUR-LABEL: load_fold_or3:
1341; CHECK-O3-CUR:       # %bb.0:
1342; CHECK-O3-CUR-NEXT:    movq (%rsi), %rax
1343; CHECK-O3-CUR-NEXT:    orq (%rdi), %rax
1344; CHECK-O3-CUR-NEXT:    retq
1345;
1346; CHECK-O3-EX-LABEL: load_fold_or3:
1347; CHECK-O3-EX:       # %bb.0:
1348; CHECK-O3-EX-NEXT:    movq (%rdi), %rax
1349; CHECK-O3-EX-NEXT:    orq (%rsi), %rax
1350; CHECK-O3-EX-NEXT:    retq
1351  %v = load atomic i64, i64* %p1 unordered, align 8
1352  %v2 = load atomic i64, i64* %p2 unordered, align 8
1353  %ret = or i64 %v, %v2
1354  ret i64 %ret
1355}
1356
1357; Legal, as expected
1358define i64 @load_fold_xor1(i64* %p) {
1359; CHECK-LABEL: load_fold_xor1:
1360; CHECK:       # %bb.0:
1361; CHECK-NEXT:    movq (%rdi), %rax
1362; CHECK-NEXT:    xorq $15, %rax
1363; CHECK-NEXT:    retq
1364  %v = load atomic i64, i64* %p unordered, align 8
1365  %ret = xor i64 %v, 15
1366  ret i64 %ret
1367}
1368
1369define i64 @load_fold_xor2(i64* %p, i64 %v2) {
1370; CHECK-LABEL: load_fold_xor2:
1371; CHECK:       # %bb.0:
1372; CHECK-NEXT:    movq %rsi, %rax
1373; CHECK-NEXT:    xorq (%rdi), %rax
1374; CHECK-NEXT:    retq
1375  %v = load atomic i64, i64* %p unordered, align 8
1376  %ret = xor i64 %v, %v2
1377  ret i64 %ret
1378}
1379
1380define i64 @load_fold_xor3(i64* %p1, i64* %p2) {
1381; CHECK-O0-LABEL: load_fold_xor3:
1382; CHECK-O0:       # %bb.0:
1383; CHECK-O0-NEXT:    movq (%rdi), %rax
1384; CHECK-O0-NEXT:    xorq (%rsi), %rax
1385; CHECK-O0-NEXT:    retq
1386;
1387; CHECK-O3-CUR-LABEL: load_fold_xor3:
1388; CHECK-O3-CUR:       # %bb.0:
1389; CHECK-O3-CUR-NEXT:    movq (%rsi), %rax
1390; CHECK-O3-CUR-NEXT:    xorq (%rdi), %rax
1391; CHECK-O3-CUR-NEXT:    retq
1392;
1393; CHECK-O3-EX-LABEL: load_fold_xor3:
1394; CHECK-O3-EX:       # %bb.0:
1395; CHECK-O3-EX-NEXT:    movq (%rdi), %rax
1396; CHECK-O3-EX-NEXT:    xorq (%rsi), %rax
1397; CHECK-O3-EX-NEXT:    retq
1398  %v = load atomic i64, i64* %p1 unordered, align 8
1399  %v2 = load atomic i64, i64* %p2 unordered, align 8
1400  %ret = xor i64 %v, %v2
1401  ret i64 %ret
1402}
1403
1404define i1 @load_fold_icmp1(i64* %p) {
1405; CHECK-O0-LABEL: load_fold_icmp1:
1406; CHECK-O0:       # %bb.0:
1407; CHECK-O0-NEXT:    movq (%rdi), %rax
1408; CHECK-O0-NEXT:    subq $15, %rax
1409; CHECK-O0-NEXT:    sete %al
1410; CHECK-O0-NEXT:    retq
1411;
1412; CHECK-O3-LABEL: load_fold_icmp1:
1413; CHECK-O3:       # %bb.0:
1414; CHECK-O3-NEXT:    cmpq $15, (%rdi)
1415; CHECK-O3-NEXT:    sete %al
1416; CHECK-O3-NEXT:    retq
1417  %v = load atomic i64, i64* %p unordered, align 8
1418  %ret = icmp eq i64 %v, 15
1419  ret i1 %ret
1420}
1421
1422define i1 @load_fold_icmp2(i64* %p, i64 %v2) {
1423; CHECK-O0-LABEL: load_fold_icmp2:
1424; CHECK-O0:       # %bb.0:
1425; CHECK-O0-NEXT:    movq (%rdi), %rax
1426; CHECK-O0-NEXT:    subq %rsi, %rax
1427; CHECK-O0-NEXT:    sete %al
1428; CHECK-O0-NEXT:    retq
1429;
1430; CHECK-O3-LABEL: load_fold_icmp2:
1431; CHECK-O3:       # %bb.0:
1432; CHECK-O3-NEXT:    cmpq %rsi, (%rdi)
1433; CHECK-O3-NEXT:    sete %al
1434; CHECK-O3-NEXT:    retq
1435  %v = load atomic i64, i64* %p unordered, align 8
1436  %ret = icmp eq i64 %v, %v2
1437  ret i1 %ret
1438}
1439
1440define i1 @load_fold_icmp3(i64* %p1, i64* %p2) {
1441; CHECK-O0-LABEL: load_fold_icmp3:
1442; CHECK-O0:       # %bb.0:
1443; CHECK-O0-NEXT:    movq (%rdi), %rax
1444; CHECK-O0-NEXT:    movq (%rsi), %rcx
1445; CHECK-O0-NEXT:    subq %rcx, %rax
1446; CHECK-O0-NEXT:    sete %al
1447; CHECK-O0-NEXT:    retq
1448;
1449; CHECK-O3-CUR-LABEL: load_fold_icmp3:
1450; CHECK-O3-CUR:       # %bb.0:
1451; CHECK-O3-CUR-NEXT:    movq (%rsi), %rax
1452; CHECK-O3-CUR-NEXT:    cmpq %rax, (%rdi)
1453; CHECK-O3-CUR-NEXT:    sete %al
1454; CHECK-O3-CUR-NEXT:    retq
1455;
1456; CHECK-O3-EX-LABEL: load_fold_icmp3:
1457; CHECK-O3-EX:       # %bb.0:
1458; CHECK-O3-EX-NEXT:    movq (%rdi), %rax
1459; CHECK-O3-EX-NEXT:    cmpq (%rsi), %rax
1460; CHECK-O3-EX-NEXT:    sete %al
1461; CHECK-O3-EX-NEXT:    retq
1462  %v = load atomic i64, i64* %p1 unordered, align 8
1463  %v2 = load atomic i64, i64* %p2 unordered, align 8
1464  %ret = icmp eq i64 %v, %v2
1465  ret i1 %ret
1466}
1467
1468
1469;; The next batch of tests check for read-modify-write patterns
1470;; Legally, it's okay to use a memory operand here as long as the operand
1471;; is well aligned (i.e. doesn't cross a cache line boundary).  We are
1472;; required not to narrow the store though!
1473
1474; Legal, as expected
1475define void @rmw_fold_add1(i64* %p, i64 %v) {
1476; CHECK-O0-LABEL: rmw_fold_add1:
1477; CHECK-O0:       # %bb.0:
1478; CHECK-O0-NEXT:    movq (%rdi), %rax
1479; CHECK-O0-NEXT:    addq $15, %rax
1480; CHECK-O0-NEXT:    movq %rax, (%rdi)
1481; CHECK-O0-NEXT:    retq
1482;
1483; CHECK-O3-LABEL: rmw_fold_add1:
1484; CHECK-O3:       # %bb.0:
1485; CHECK-O3-NEXT:    addq $15, (%rdi)
1486; CHECK-O3-NEXT:    retq
1487  %prev = load atomic i64, i64* %p unordered, align 8
1488  %val = add i64 %prev, 15
1489  store atomic i64 %val, i64* %p unordered, align 8
1490  ret void
1491}
1492
1493; Legal, as expected
1494define void @rmw_fold_add2(i64* %p, i64 %v) {
1495; CHECK-O0-LABEL: rmw_fold_add2:
1496; CHECK-O0:       # %bb.0:
1497; CHECK-O0-NEXT:    movq (%rdi), %rax
1498; CHECK-O0-NEXT:    addq %rsi, %rax
1499; CHECK-O0-NEXT:    movq %rax, (%rdi)
1500; CHECK-O0-NEXT:    retq
1501;
1502; CHECK-O3-LABEL: rmw_fold_add2:
1503; CHECK-O3:       # %bb.0:
1504; CHECK-O3-NEXT:    addq %rsi, (%rdi)
1505; CHECK-O3-NEXT:    retq
1506  %prev = load atomic i64, i64* %p unordered, align 8
1507  %val = add i64 %prev, %v
1508  store atomic i64 %val, i64* %p unordered, align 8
1509  ret void
1510}
1511
1512; Legal, as expected
1513define void @rmw_fold_sub1(i64* %p, i64 %v) {
1514; CHECK-O0-LABEL: rmw_fold_sub1:
1515; CHECK-O0:       # %bb.0:
1516; CHECK-O0-NEXT:    movq (%rdi), %rax
1517; CHECK-O0-NEXT:    addq $-15, %rax
1518; CHECK-O0-NEXT:    movq %rax, (%rdi)
1519; CHECK-O0-NEXT:    retq
1520;
1521; CHECK-O3-LABEL: rmw_fold_sub1:
1522; CHECK-O3:       # %bb.0:
1523; CHECK-O3-NEXT:    addq $-15, (%rdi)
1524; CHECK-O3-NEXT:    retq
1525  %prev = load atomic i64, i64* %p unordered, align 8
1526  %val = sub i64 %prev, 15
1527  store atomic i64 %val, i64* %p unordered, align 8
1528  ret void
1529}
1530
1531; Legal, as expected
1532define void @rmw_fold_sub2(i64* %p, i64 %v) {
1533; CHECK-O0-LABEL: rmw_fold_sub2:
1534; CHECK-O0:       # %bb.0:
1535; CHECK-O0-NEXT:    movq (%rdi), %rax
1536; CHECK-O0-NEXT:    subq %rsi, %rax
1537; CHECK-O0-NEXT:    movq %rax, (%rdi)
1538; CHECK-O0-NEXT:    retq
1539;
1540; CHECK-O3-LABEL: rmw_fold_sub2:
1541; CHECK-O3:       # %bb.0:
1542; CHECK-O3-NEXT:    subq %rsi, (%rdi)
1543; CHECK-O3-NEXT:    retq
1544  %prev = load atomic i64, i64* %p unordered, align 8
1545  %val = sub i64 %prev, %v
1546  store atomic i64 %val, i64* %p unordered, align 8
1547  ret void
1548}
1549
1550; Legal, as expected
1551define void @rmw_fold_mul1(i64* %p, i64 %v) {
1552; CHECK-LABEL: rmw_fold_mul1:
1553; CHECK:       # %bb.0:
1554; CHECK-NEXT:    movq (%rdi), %rax
1555; CHECK-NEXT:    leaq (%rax,%rax,4), %rax
1556; CHECK-NEXT:    leaq (%rax,%rax,2), %rax
1557; CHECK-NEXT:    movq %rax, (%rdi)
1558; CHECK-NEXT:    retq
1559  %prev = load atomic i64, i64* %p unordered, align 8
1560  %val = mul i64 %prev, 15
1561  store atomic i64 %val, i64* %p unordered, align 8
1562  ret void
1563}
1564
1565; Legal to fold (TODO)
1566define void @rmw_fold_mul2(i64* %p, i64 %v) {
1567; CHECK-O0-LABEL: rmw_fold_mul2:
1568; CHECK-O0:       # %bb.0:
1569; CHECK-O0-NEXT:    movq (%rdi), %rax
1570; CHECK-O0-NEXT:    imulq %rsi, %rax
1571; CHECK-O0-NEXT:    movq %rax, (%rdi)
1572; CHECK-O0-NEXT:    retq
1573;
1574; CHECK-O3-LABEL: rmw_fold_mul2:
1575; CHECK-O3:       # %bb.0:
1576; CHECK-O3-NEXT:    imulq (%rdi), %rsi
1577; CHECK-O3-NEXT:    movq %rsi, (%rdi)
1578; CHECK-O3-NEXT:    retq
1579  %prev = load atomic i64, i64* %p unordered, align 8
1580  %val = mul i64 %prev, %v
1581  store atomic i64 %val, i64* %p unordered, align 8
1582  ret void
1583}
1584
1585; Legal, as expected
1586define void @rmw_fold_sdiv1(i64* %p, i64 %v) {
1587; CHECK-O0-LABEL: rmw_fold_sdiv1:
1588; CHECK-O0:       # %bb.0:
1589; CHECK-O0-NEXT:    movq (%rdi), %rcx
1590; CHECK-O0-NEXT:    movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
1591; CHECK-O0-NEXT:    movq %rcx, %rax
1592; CHECK-O0-NEXT:    imulq %rdx
1593; CHECK-O0-NEXT:    movq %rdx, %rax
1594; CHECK-O0-NEXT:    addq %rcx, %rax
1595; CHECK-O0-NEXT:    movq %rax, %rcx
1596; CHECK-O0-NEXT:    shrq $63, %rcx
1597; CHECK-O0-NEXT:    sarq $3, %rax
1598; CHECK-O0-NEXT:    addq %rcx, %rax
1599; CHECK-O0-NEXT:    movq %rax, (%rdi)
1600; CHECK-O0-NEXT:    retq
1601;
1602; CHECK-O3-LABEL: rmw_fold_sdiv1:
1603; CHECK-O3:       # %bb.0:
1604; CHECK-O3-NEXT:    movq (%rdi), %rcx
1605; CHECK-O3-NEXT:    movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
1606; CHECK-O3-NEXT:    movq %rcx, %rax
1607; CHECK-O3-NEXT:    imulq %rdx
1608; CHECK-O3-NEXT:    addq %rcx, %rdx
1609; CHECK-O3-NEXT:    movq %rdx, %rax
1610; CHECK-O3-NEXT:    shrq $63, %rax
1611; CHECK-O3-NEXT:    sarq $3, %rdx
1612; CHECK-O3-NEXT:    addq %rax, %rdx
1613; CHECK-O3-NEXT:    movq %rdx, (%rdi)
1614; CHECK-O3-NEXT:    retq
1615  %prev = load atomic i64, i64* %p unordered, align 8
1616  %val = sdiv i64 %prev, 15
1617  store atomic i64 %val, i64* %p unordered, align 8
1618  ret void
1619}
1620
1621; Legal, as expected
1622define void @rmw_fold_sdiv2(i64* %p, i64 %v) {
1623; CHECK-O0-LABEL: rmw_fold_sdiv2:
1624; CHECK-O0:       # %bb.0:
1625; CHECK-O0-NEXT:    movq (%rdi), %rax
1626; CHECK-O0-NEXT:    cqto
1627; CHECK-O0-NEXT:    idivq %rsi
1628; CHECK-O0-NEXT:    movq %rax, (%rdi)
1629; CHECK-O0-NEXT:    retq
1630;
1631; CHECK-O3-LABEL: rmw_fold_sdiv2:
1632; CHECK-O3:       # %bb.0:
1633; CHECK-O3-NEXT:    movq (%rdi), %rax
1634; CHECK-O3-NEXT:    movq %rax, %rcx
1635; CHECK-O3-NEXT:    orq %rsi, %rcx
1636; CHECK-O3-NEXT:    shrq $32, %rcx
1637; CHECK-O3-NEXT:    je .LBB74_1
1638; CHECK-O3-NEXT:  # %bb.2:
1639; CHECK-O3-NEXT:    cqto
1640; CHECK-O3-NEXT:    idivq %rsi
1641; CHECK-O3-NEXT:    movq %rax, (%rdi)
1642; CHECK-O3-NEXT:    retq
1643; CHECK-O3-NEXT:  .LBB74_1:
1644; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
1645; CHECK-O3-NEXT:    xorl %edx, %edx
1646; CHECK-O3-NEXT:    divl %esi
1647; CHECK-O3-NEXT:    # kill: def $eax killed $eax def $rax
1648; CHECK-O3-NEXT:    movq %rax, (%rdi)
1649; CHECK-O3-NEXT:    retq
1650  %prev = load atomic i64, i64* %p unordered, align 8
1651  %val = sdiv i64 %prev, %v
1652  store atomic i64 %val, i64* %p unordered, align 8
1653  ret void
1654}
1655
1656; Legal, as expected
1657define void @rmw_fold_udiv1(i64* %p, i64 %v) {
1658; CHECK-O0-LABEL: rmw_fold_udiv1:
1659; CHECK-O0:       # %bb.0:
1660; CHECK-O0-NEXT:    movq (%rdi), %rdx
1661; CHECK-O0-NEXT:    movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
1662; CHECK-O0-NEXT:    mulxq %rax, %rax, %rax
1663; CHECK-O0-NEXT:    shrq $3, %rax
1664; CHECK-O0-NEXT:    movq %rax, (%rdi)
1665; CHECK-O0-NEXT:    retq
1666;
1667; CHECK-O3-CUR-LABEL: rmw_fold_udiv1:
1668; CHECK-O3-CUR:       # %bb.0:
1669; CHECK-O3-CUR-NEXT:    movq (%rdi), %rdx
1670; CHECK-O3-CUR-NEXT:    movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
1671; CHECK-O3-CUR-NEXT:    mulxq %rax, %rax, %rax
1672; CHECK-O3-CUR-NEXT:    shrq $3, %rax
1673; CHECK-O3-CUR-NEXT:    movq %rax, (%rdi)
1674; CHECK-O3-CUR-NEXT:    retq
1675;
1676; CHECK-O3-EX-LABEL: rmw_fold_udiv1:
1677; CHECK-O3-EX:       # %bb.0:
1678; CHECK-O3-EX-NEXT:    movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
1679; CHECK-O3-EX-NEXT:    mulxq (%rdi), %rax, %rax
1680; CHECK-O3-EX-NEXT:    shrq $3, %rax
1681; CHECK-O3-EX-NEXT:    movq %rax, (%rdi)
1682; CHECK-O3-EX-NEXT:    retq
1683  %prev = load atomic i64, i64* %p unordered, align 8
1684  %val = udiv i64 %prev, 15
1685  store atomic i64 %val, i64* %p unordered, align 8
1686  ret void
1687}
1688
1689; Legal, as expected
1690define void @rmw_fold_udiv2(i64* %p, i64 %v) {
1691; CHECK-O0-LABEL: rmw_fold_udiv2:
1692; CHECK-O0:       # %bb.0:
1693; CHECK-O0-NEXT:    movq (%rdi), %rax
1694; CHECK-O0-NEXT:    xorl %ecx, %ecx
1695; CHECK-O0-NEXT:    movl %ecx, %edx
1696; CHECK-O0-NEXT:    divq %rsi
1697; CHECK-O0-NEXT:    movq %rax, (%rdi)
1698; CHECK-O0-NEXT:    retq
1699;
1700; CHECK-O3-LABEL: rmw_fold_udiv2:
1701; CHECK-O3:       # %bb.0:
1702; CHECK-O3-NEXT:    movq (%rdi), %rax
1703; CHECK-O3-NEXT:    movq %rax, %rcx
1704; CHECK-O3-NEXT:    orq %rsi, %rcx
1705; CHECK-O3-NEXT:    shrq $32, %rcx
1706; CHECK-O3-NEXT:    je .LBB76_1
1707; CHECK-O3-NEXT:  # %bb.2:
1708; CHECK-O3-NEXT:    xorl %edx, %edx
1709; CHECK-O3-NEXT:    divq %rsi
1710; CHECK-O3-NEXT:    movq %rax, (%rdi)
1711; CHECK-O3-NEXT:    retq
1712; CHECK-O3-NEXT:  .LBB76_1:
1713; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
1714; CHECK-O3-NEXT:    xorl %edx, %edx
1715; CHECK-O3-NEXT:    divl %esi
1716; CHECK-O3-NEXT:    # kill: def $eax killed $eax def $rax
1717; CHECK-O3-NEXT:    movq %rax, (%rdi)
1718; CHECK-O3-NEXT:    retq
1719  %prev = load atomic i64, i64* %p unordered, align 8
1720  %val = udiv i64 %prev, %v
1721  store atomic i64 %val, i64* %p unordered, align 8
1722  ret void
1723}
1724
1725; Legal, as expected
1726define void @rmw_fold_srem1(i64* %p, i64 %v) {
1727; CHECK-O0-LABEL: rmw_fold_srem1:
1728; CHECK-O0:       # %bb.0:
1729; CHECK-O0-NEXT:    movq (%rdi), %rax
1730; CHECK-O0-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1731; CHECK-O0-NEXT:    movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
1732; CHECK-O0-NEXT:    imulq %rcx
1733; CHECK-O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
1734; CHECK-O0-NEXT:    movq %rdx, %rcx
1735; CHECK-O0-NEXT:    addq %rax, %rcx
1736; CHECK-O0-NEXT:    movq %rcx, %rdx
1737; CHECK-O0-NEXT:    shrq $63, %rdx
1738; CHECK-O0-NEXT:    sarq $3, %rcx
1739; CHECK-O0-NEXT:    addq %rdx, %rcx
1740; CHECK-O0-NEXT:    leaq (%rcx,%rcx,4), %rcx
1741; CHECK-O0-NEXT:    leaq (%rcx,%rcx,2), %rcx
1742; CHECK-O0-NEXT:    subq %rcx, %rax
1743; CHECK-O0-NEXT:    movq %rax, (%rdi)
1744; CHECK-O0-NEXT:    retq
1745;
1746; CHECK-O3-LABEL: rmw_fold_srem1:
1747; CHECK-O3:       # %bb.0:
1748; CHECK-O3-NEXT:    movq (%rdi), %rcx
1749; CHECK-O3-NEXT:    movabsq $-8608480567731124087, %rdx # imm = 0x8888888888888889
1750; CHECK-O3-NEXT:    movq %rcx, %rax
1751; CHECK-O3-NEXT:    imulq %rdx
1752; CHECK-O3-NEXT:    addq %rcx, %rdx
1753; CHECK-O3-NEXT:    movq %rdx, %rax
1754; CHECK-O3-NEXT:    shrq $63, %rax
1755; CHECK-O3-NEXT:    sarq $3, %rdx
1756; CHECK-O3-NEXT:    addq %rax, %rdx
1757; CHECK-O3-NEXT:    leaq (%rdx,%rdx,4), %rax
1758; CHECK-O3-NEXT:    leaq (%rax,%rax,2), %rax
1759; CHECK-O3-NEXT:    subq %rax, %rcx
1760; CHECK-O3-NEXT:    movq %rcx, (%rdi)
1761; CHECK-O3-NEXT:    retq
1762  %prev = load atomic i64, i64* %p unordered, align 8
1763  %val = srem i64 %prev, 15
1764  store atomic i64 %val, i64* %p unordered, align 8
1765  ret void
1766}
1767
1768; Legal, as expected
1769define void @rmw_fold_srem2(i64* %p, i64 %v) {
1770; CHECK-O0-LABEL: rmw_fold_srem2:
1771; CHECK-O0:       # %bb.0:
1772; CHECK-O0-NEXT:    movq (%rdi), %rax
1773; CHECK-O0-NEXT:    cqto
1774; CHECK-O0-NEXT:    idivq %rsi
1775; CHECK-O0-NEXT:    movq %rdx, (%rdi)
1776; CHECK-O0-NEXT:    retq
1777;
1778; CHECK-O3-LABEL: rmw_fold_srem2:
1779; CHECK-O3:       # %bb.0:
1780; CHECK-O3-NEXT:    movq (%rdi), %rax
1781; CHECK-O3-NEXT:    movq %rax, %rcx
1782; CHECK-O3-NEXT:    orq %rsi, %rcx
1783; CHECK-O3-NEXT:    shrq $32, %rcx
1784; CHECK-O3-NEXT:    je .LBB78_1
1785; CHECK-O3-NEXT:  # %bb.2:
1786; CHECK-O3-NEXT:    cqto
1787; CHECK-O3-NEXT:    idivq %rsi
1788; CHECK-O3-NEXT:    movq %rdx, (%rdi)
1789; CHECK-O3-NEXT:    retq
1790; CHECK-O3-NEXT:  .LBB78_1:
1791; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
1792; CHECK-O3-NEXT:    xorl %edx, %edx
1793; CHECK-O3-NEXT:    divl %esi
1794; CHECK-O3-NEXT:    # kill: def $edx killed $edx def $rdx
1795; CHECK-O3-NEXT:    movq %rdx, (%rdi)
1796; CHECK-O3-NEXT:    retq
1797  %prev = load atomic i64, i64* %p unordered, align 8
1798  %val = srem i64 %prev, %v
1799  store atomic i64 %val, i64* %p unordered, align 8
1800  ret void
1801}
1802
1803; Legal, as expected
1804define void @rmw_fold_urem1(i64* %p, i64 %v) {
1805; CHECK-O0-LABEL: rmw_fold_urem1:
1806; CHECK-O0:       # %bb.0:
1807; CHECK-O0-NEXT:    movq (%rdi), %rax
1808; CHECK-O0-NEXT:    movabsq $-8608480567731124087, %rcx # imm = 0x8888888888888889
1809; CHECK-O0-NEXT:    movq %rax, %rdx
1810; CHECK-O0-NEXT:    mulxq %rcx, %rcx, %rcx
1811; CHECK-O0-NEXT:    shrq $3, %rcx
1812; CHECK-O0-NEXT:    leaq (%rcx,%rcx,4), %rcx
1813; CHECK-O0-NEXT:    leaq (%rcx,%rcx,2), %rcx
1814; CHECK-O0-NEXT:    subq %rcx, %rax
1815; CHECK-O0-NEXT:    movq %rax, (%rdi)
1816; CHECK-O0-NEXT:    retq
1817;
1818; CHECK-O3-LABEL: rmw_fold_urem1:
1819; CHECK-O3:       # %bb.0:
1820; CHECK-O3-NEXT:    movq (%rdi), %rdx
1821; CHECK-O3-NEXT:    movabsq $-8608480567731124087, %rax # imm = 0x8888888888888889
1822; CHECK-O3-NEXT:    mulxq %rax, %rax, %rax
1823; CHECK-O3-NEXT:    shrq $3, %rax
1824; CHECK-O3-NEXT:    leaq (%rax,%rax,4), %rax
1825; CHECK-O3-NEXT:    leaq (%rax,%rax,2), %rax
1826; CHECK-O3-NEXT:    subq %rax, %rdx
1827; CHECK-O3-NEXT:    movq %rdx, (%rdi)
1828; CHECK-O3-NEXT:    retq
1829  %prev = load atomic i64, i64* %p unordered, align 8
1830  %val = urem i64 %prev, 15
1831  store atomic i64 %val, i64* %p unordered, align 8
1832  ret void
1833}
1834
1835; Legal, as expected
1836define void @rmw_fold_urem2(i64* %p, i64 %v) {
1837; CHECK-O0-LABEL: rmw_fold_urem2:
1838; CHECK-O0:       # %bb.0:
1839; CHECK-O0-NEXT:    movq (%rdi), %rax
1840; CHECK-O0-NEXT:    xorl %ecx, %ecx
1841; CHECK-O0-NEXT:    movl %ecx, %edx
1842; CHECK-O0-NEXT:    divq %rsi
1843; CHECK-O0-NEXT:    movq %rdx, (%rdi)
1844; CHECK-O0-NEXT:    retq
1845;
1846; CHECK-O3-LABEL: rmw_fold_urem2:
1847; CHECK-O3:       # %bb.0:
1848; CHECK-O3-NEXT:    movq (%rdi), %rax
1849; CHECK-O3-NEXT:    movq %rax, %rcx
1850; CHECK-O3-NEXT:    orq %rsi, %rcx
1851; CHECK-O3-NEXT:    shrq $32, %rcx
1852; CHECK-O3-NEXT:    je .LBB80_1
1853; CHECK-O3-NEXT:  # %bb.2:
1854; CHECK-O3-NEXT:    xorl %edx, %edx
1855; CHECK-O3-NEXT:    divq %rsi
1856; CHECK-O3-NEXT:    movq %rdx, (%rdi)
1857; CHECK-O3-NEXT:    retq
1858; CHECK-O3-NEXT:  .LBB80_1:
1859; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
1860; CHECK-O3-NEXT:    xorl %edx, %edx
1861; CHECK-O3-NEXT:    divl %esi
1862; CHECK-O3-NEXT:    # kill: def $edx killed $edx def $rdx
1863; CHECK-O3-NEXT:    movq %rdx, (%rdi)
1864; CHECK-O3-NEXT:    retq
1865  %prev = load atomic i64, i64* %p unordered, align 8
1866  %val = urem i64 %prev, %v
1867  store atomic i64 %val, i64* %p unordered, align 8
1868  ret void
1869}
1870
1871; Legal to fold (TODO)
1872define void @rmw_fold_shl1(i64* %p, i64 %v) {
1873; CHECK-O0-LABEL: rmw_fold_shl1:
1874; CHECK-O0:       # %bb.0:
1875; CHECK-O0-NEXT:    movq (%rdi), %rax
1876; CHECK-O0-NEXT:    shlq $15, %rax
1877; CHECK-O0-NEXT:    movq %rax, (%rdi)
1878; CHECK-O0-NEXT:    retq
1879;
1880; CHECK-O3-CUR-LABEL: rmw_fold_shl1:
1881; CHECK-O3-CUR:       # %bb.0:
1882; CHECK-O3-CUR-NEXT:    movq (%rdi), %rax
1883; CHECK-O3-CUR-NEXT:    shlq $15, %rax
1884; CHECK-O3-CUR-NEXT:    movq %rax, (%rdi)
1885; CHECK-O3-CUR-NEXT:    retq
1886;
1887; CHECK-O3-EX-LABEL: rmw_fold_shl1:
1888; CHECK-O3-EX:       # %bb.0:
1889; CHECK-O3-EX-NEXT:    shlq $15, (%rdi)
1890; CHECK-O3-EX-NEXT:    retq
1891  %prev = load atomic i64, i64* %p unordered, align 8
1892  %val = shl i64 %prev, 15
1893  store atomic i64 %val, i64* %p unordered, align 8
1894  ret void
1895}
1896
1897; Legal to fold (TODO)
1898define void @rmw_fold_shl2(i64* %p, i64 %v) {
1899; CHECK-O0-LABEL: rmw_fold_shl2:
1900; CHECK-O0:       # %bb.0:
1901; CHECK-O0-NEXT:    movq (%rdi), %rax
1902; CHECK-O0-NEXT:    movb %sil, %dl
1903; CHECK-O0-NEXT:    # implicit-def: $rcx
1904; CHECK-O0-NEXT:    movb %dl, %cl
1905; CHECK-O0-NEXT:    shlxq %rcx, %rax, %rax
1906; CHECK-O0-NEXT:    movq %rax, (%rdi)
1907; CHECK-O0-NEXT:    retq
1908;
1909; CHECK-O3-CUR-LABEL: rmw_fold_shl2:
1910; CHECK-O3-CUR:       # %bb.0:
1911; CHECK-O3-CUR-NEXT:    shlxq %rsi, (%rdi), %rax
1912; CHECK-O3-CUR-NEXT:    movq %rax, (%rdi)
1913; CHECK-O3-CUR-NEXT:    retq
1914;
1915; CHECK-O3-EX-LABEL: rmw_fold_shl2:
1916; CHECK-O3-EX:       # %bb.0:
1917; CHECK-O3-EX-NEXT:    movq %rsi, %rcx
1918; CHECK-O3-EX-NEXT:    # kill: def $cl killed $cl killed $rcx
1919; CHECK-O3-EX-NEXT:    shlq %cl, (%rdi)
1920; CHECK-O3-EX-NEXT:    retq
1921  %prev = load atomic i64, i64* %p unordered, align 8
1922  %val = shl i64 %prev, %v
1923  store atomic i64 %val, i64* %p unordered, align 8
1924  ret void
1925}
1926
1927; Legal to fold (TODO)
1928define void @rmw_fold_lshr1(i64* %p, i64 %v) {
1929; CHECK-O0-LABEL: rmw_fold_lshr1:
1930; CHECK-O0:       # %bb.0:
1931; CHECK-O0-NEXT:    movq (%rdi), %rax
1932; CHECK-O0-NEXT:    shrq $15, %rax
1933; CHECK-O0-NEXT:    movq %rax, (%rdi)
1934; CHECK-O0-NEXT:    retq
1935;
1936; CHECK-O3-CUR-LABEL: rmw_fold_lshr1:
1937; CHECK-O3-CUR:       # %bb.0:
1938; CHECK-O3-CUR-NEXT:    movq (%rdi), %rax
1939; CHECK-O3-CUR-NEXT:    shrq $15, %rax
1940; CHECK-O3-CUR-NEXT:    movq %rax, (%rdi)
1941; CHECK-O3-CUR-NEXT:    retq
1942;
1943; CHECK-O3-EX-LABEL: rmw_fold_lshr1:
1944; CHECK-O3-EX:       # %bb.0:
1945; CHECK-O3-EX-NEXT:    shrq $15, (%rdi)
1946; CHECK-O3-EX-NEXT:    retq
1947  %prev = load atomic i64, i64* %p unordered, align 8
1948  %val = lshr i64 %prev, 15
1949  store atomic i64 %val, i64* %p unordered, align 8
1950  ret void
1951}
1952
1953; Legal to fold (TODO)
1954define void @rmw_fold_lshr2(i64* %p, i64 %v) {
1955; CHECK-O0-LABEL: rmw_fold_lshr2:
1956; CHECK-O0:       # %bb.0:
1957; CHECK-O0-NEXT:    movq (%rdi), %rax
1958; CHECK-O0-NEXT:    movb %sil, %dl
1959; CHECK-O0-NEXT:    # implicit-def: $rcx
1960; CHECK-O0-NEXT:    movb %dl, %cl
1961; CHECK-O0-NEXT:    shrxq %rcx, %rax, %rax
1962; CHECK-O0-NEXT:    movq %rax, (%rdi)
1963; CHECK-O0-NEXT:    retq
1964;
1965; CHECK-O3-CUR-LABEL: rmw_fold_lshr2:
1966; CHECK-O3-CUR:       # %bb.0:
1967; CHECK-O3-CUR-NEXT:    shrxq %rsi, (%rdi), %rax
1968; CHECK-O3-CUR-NEXT:    movq %rax, (%rdi)
1969; CHECK-O3-CUR-NEXT:    retq
1970;
1971; CHECK-O3-EX-LABEL: rmw_fold_lshr2:
1972; CHECK-O3-EX:       # %bb.0:
1973; CHECK-O3-EX-NEXT:    movq %rsi, %rcx
1974; CHECK-O3-EX-NEXT:    # kill: def $cl killed $cl killed $rcx
1975; CHECK-O3-EX-NEXT:    shrq %cl, (%rdi)
1976; CHECK-O3-EX-NEXT:    retq
1977  %prev = load atomic i64, i64* %p unordered, align 8
1978  %val = lshr i64 %prev, %v
1979  store atomic i64 %val, i64* %p unordered, align 8
1980  ret void
1981}
1982
1983; Legal to fold (TODO)
1984define void @rmw_fold_ashr1(i64* %p, i64 %v) {
1985; CHECK-O0-LABEL: rmw_fold_ashr1:
1986; CHECK-O0:       # %bb.0:
1987; CHECK-O0-NEXT:    movq (%rdi), %rax
1988; CHECK-O0-NEXT:    sarq $15, %rax
1989; CHECK-O0-NEXT:    movq %rax, (%rdi)
1990; CHECK-O0-NEXT:    retq
1991;
1992; CHECK-O3-CUR-LABEL: rmw_fold_ashr1:
1993; CHECK-O3-CUR:       # %bb.0:
1994; CHECK-O3-CUR-NEXT:    movq (%rdi), %rax
1995; CHECK-O3-CUR-NEXT:    sarq $15, %rax
1996; CHECK-O3-CUR-NEXT:    movq %rax, (%rdi)
1997; CHECK-O3-CUR-NEXT:    retq
1998;
1999; CHECK-O3-EX-LABEL: rmw_fold_ashr1:
2000; CHECK-O3-EX:       # %bb.0:
2001; CHECK-O3-EX-NEXT:    sarq $15, (%rdi)
2002; CHECK-O3-EX-NEXT:    retq
2003  %prev = load atomic i64, i64* %p unordered, align 8
2004  %val = ashr i64 %prev, 15
2005  store atomic i64 %val, i64* %p unordered, align 8
2006  ret void
2007}
2008
2009; Legal to fold (TODO)
2010define void @rmw_fold_ashr2(i64* %p, i64 %v) {
2011; CHECK-O0-LABEL: rmw_fold_ashr2:
2012; CHECK-O0:       # %bb.0:
2013; CHECK-O0-NEXT:    movq (%rdi), %rax
2014; CHECK-O0-NEXT:    movb %sil, %dl
2015; CHECK-O0-NEXT:    # implicit-def: $rcx
2016; CHECK-O0-NEXT:    movb %dl, %cl
2017; CHECK-O0-NEXT:    sarxq %rcx, %rax, %rax
2018; CHECK-O0-NEXT:    movq %rax, (%rdi)
2019; CHECK-O0-NEXT:    retq
2020;
2021; CHECK-O3-CUR-LABEL: rmw_fold_ashr2:
2022; CHECK-O3-CUR:       # %bb.0:
2023; CHECK-O3-CUR-NEXT:    sarxq %rsi, (%rdi), %rax
2024; CHECK-O3-CUR-NEXT:    movq %rax, (%rdi)
2025; CHECK-O3-CUR-NEXT:    retq
2026;
2027; CHECK-O3-EX-LABEL: rmw_fold_ashr2:
2028; CHECK-O3-EX:       # %bb.0:
2029; CHECK-O3-EX-NEXT:    movq %rsi, %rcx
2030; CHECK-O3-EX-NEXT:    # kill: def $cl killed $cl killed $rcx
2031; CHECK-O3-EX-NEXT:    sarq %cl, (%rdi)
2032; CHECK-O3-EX-NEXT:    retq
2033  %prev = load atomic i64, i64* %p unordered, align 8
2034  %val = ashr i64 %prev, %v
2035  store atomic i64 %val, i64* %p unordered, align 8
2036  ret void
2037}
2038
2039; Legal, as expected
2040define void @rmw_fold_and1(i64* %p, i64 %v) {
2041; CHECK-O0-LABEL: rmw_fold_and1:
2042; CHECK-O0:       # %bb.0:
2043; CHECK-O0-NEXT:    movq (%rdi), %rax
2044; CHECK-O0-NEXT:    # kill: def $eax killed $eax killed $rax
2045; CHECK-O0-NEXT:    andl $15, %eax
2046; CHECK-O0-NEXT:    # kill: def $rax killed $eax
2047; CHECK-O0-NEXT:    movq %rax, (%rdi)
2048; CHECK-O0-NEXT:    retq
2049;
2050; CHECK-O3-LABEL: rmw_fold_and1:
2051; CHECK-O3:       # %bb.0:
2052; CHECK-O3-NEXT:    andq $15, (%rdi)
2053; CHECK-O3-NEXT:    retq
2054  %prev = load atomic i64, i64* %p unordered, align 8
2055  %val = and i64 %prev, 15
2056  store atomic i64 %val, i64* %p unordered, align 8
2057  ret void
2058}
2059
2060; Legal, as expected
2061define void @rmw_fold_and2(i64* %p, i64 %v) {
2062; CHECK-O0-LABEL: rmw_fold_and2:
2063; CHECK-O0:       # %bb.0:
2064; CHECK-O0-NEXT:    movq (%rdi), %rax
2065; CHECK-O0-NEXT:    andq %rsi, %rax
2066; CHECK-O0-NEXT:    movq %rax, (%rdi)
2067; CHECK-O0-NEXT:    retq
2068;
2069; CHECK-O3-LABEL: rmw_fold_and2:
2070; CHECK-O3:       # %bb.0:
2071; CHECK-O3-NEXT:    andq %rsi, (%rdi)
2072; CHECK-O3-NEXT:    retq
2073  %prev = load atomic i64, i64* %p unordered, align 8
2074  %val = and i64 %prev, %v
2075  store atomic i64 %val, i64* %p unordered, align 8
2076  ret void
2077}
2078
2079; Legal, as expected
2080define void @rmw_fold_or1(i64* %p, i64 %v) {
2081; CHECK-O0-LABEL: rmw_fold_or1:
2082; CHECK-O0:       # %bb.0:
2083; CHECK-O0-NEXT:    movq (%rdi), %rax
2084; CHECK-O0-NEXT:    orq $15, %rax
2085; CHECK-O0-NEXT:    movq %rax, (%rdi)
2086; CHECK-O0-NEXT:    retq
2087;
2088; CHECK-O3-LABEL: rmw_fold_or1:
2089; CHECK-O3:       # %bb.0:
2090; CHECK-O3-NEXT:    orq $15, (%rdi)
2091; CHECK-O3-NEXT:    retq
2092  %prev = load atomic i64, i64* %p unordered, align 8
2093  %val = or i64 %prev, 15
2094  store atomic i64 %val, i64* %p unordered, align 8
2095  ret void
2096}
2097
2098; Legal, as expected
2099define void @rmw_fold_or2(i64* %p, i64 %v) {
2100; CHECK-O0-LABEL: rmw_fold_or2:
2101; CHECK-O0:       # %bb.0:
2102; CHECK-O0-NEXT:    movq (%rdi), %rax
2103; CHECK-O0-NEXT:    orq %rsi, %rax
2104; CHECK-O0-NEXT:    movq %rax, (%rdi)
2105; CHECK-O0-NEXT:    retq
2106;
2107; CHECK-O3-LABEL: rmw_fold_or2:
2108; CHECK-O3:       # %bb.0:
2109; CHECK-O3-NEXT:    orq %rsi, (%rdi)
2110; CHECK-O3-NEXT:    retq
2111  %prev = load atomic i64, i64* %p unordered, align 8
2112  %val = or i64 %prev, %v
2113  store atomic i64 %val, i64* %p unordered, align 8
2114  ret void
2115}
2116
2117; Legal, as expected
2118define void @rmw_fold_xor1(i64* %p, i64 %v) {
2119; CHECK-O0-LABEL: rmw_fold_xor1:
2120; CHECK-O0:       # %bb.0:
2121; CHECK-O0-NEXT:    movq (%rdi), %rax
2122; CHECK-O0-NEXT:    xorq $15, %rax
2123; CHECK-O0-NEXT:    movq %rax, (%rdi)
2124; CHECK-O0-NEXT:    retq
2125;
2126; CHECK-O3-LABEL: rmw_fold_xor1:
2127; CHECK-O3:       # %bb.0:
2128; CHECK-O3-NEXT:    xorq $15, (%rdi)
2129; CHECK-O3-NEXT:    retq
2130  %prev = load atomic i64, i64* %p unordered, align 8
2131  %val = xor i64 %prev, 15
2132  store atomic i64 %val, i64* %p unordered, align 8
2133  ret void
2134}
2135
2136; Legal, as expected
2137define void @rmw_fold_xor2(i64* %p, i64 %v) {
2138; CHECK-O0-LABEL: rmw_fold_xor2:
2139; CHECK-O0:       # %bb.0:
2140; CHECK-O0-NEXT:    movq (%rdi), %rax
2141; CHECK-O0-NEXT:    xorq %rsi, %rax
2142; CHECK-O0-NEXT:    movq %rax, (%rdi)
2143; CHECK-O0-NEXT:    retq
2144;
2145; CHECK-O3-LABEL: rmw_fold_xor2:
2146; CHECK-O3:       # %bb.0:
2147; CHECK-O3-NEXT:    xorq %rsi, (%rdi)
2148; CHECK-O3-NEXT:    retq
2149  %prev = load atomic i64, i64* %p unordered, align 8
2150  %val = xor i64 %prev, %v
2151  store atomic i64 %val, i64* %p unordered, align 8
2152  ret void
2153}
2154
2155;; The next batch test truncations, in combination w/operations which could
2156;; be folded against the memory operation.
2157
2158; Legal to reduce the load width (TODO)
2159define i32 @fold_trunc(i64* %p) {
2160; CHECK-LABEL: fold_trunc:
2161; CHECK:       # %bb.0:
2162; CHECK-NEXT:    movq (%rdi), %rax
2163; CHECK-NEXT:    # kill: def $eax killed $eax killed $rax
2164; CHECK-NEXT:    retq
2165  %v = load atomic i64, i64* %p unordered, align 8
2166  %ret = trunc i64 %v to i32
2167  ret i32 %ret
2168}
2169
2170; Legal to reduce the load width and fold the load (TODO)
2171define i32 @fold_trunc_add(i64* %p, i32 %v2) {
2172; CHECK-O0-LABEL: fold_trunc_add:
2173; CHECK-O0:       # %bb.0:
2174; CHECK-O0-NEXT:    movq (%rdi), %rax
2175; CHECK-O0-NEXT:    # kill: def $eax killed $eax killed $rax
2176; CHECK-O0-NEXT:    addl %esi, %eax
2177; CHECK-O0-NEXT:    retq
2178;
2179; CHECK-O3-LABEL: fold_trunc_add:
2180; CHECK-O3:       # %bb.0:
2181; CHECK-O3-NEXT:    movq (%rdi), %rax
2182; CHECK-O3-NEXT:    addl %esi, %eax
2183; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
2184; CHECK-O3-NEXT:    retq
2185  %v = load atomic i64, i64* %p unordered, align 8
2186  %trunc = trunc i64 %v to i32
2187  %ret = add i32 %trunc, %v2
2188  ret i32 %ret
2189}
2190
2191; Legal to reduce the load width and fold the load (TODO)
2192define i32 @fold_trunc_and(i64* %p, i32 %v2) {
2193; CHECK-O0-LABEL: fold_trunc_and:
2194; CHECK-O0:       # %bb.0:
2195; CHECK-O0-NEXT:    movq (%rdi), %rax
2196; CHECK-O0-NEXT:    # kill: def $eax killed $eax killed $rax
2197; CHECK-O0-NEXT:    andl %esi, %eax
2198; CHECK-O0-NEXT:    retq
2199;
2200; CHECK-O3-LABEL: fold_trunc_and:
2201; CHECK-O3:       # %bb.0:
2202; CHECK-O3-NEXT:    movq (%rdi), %rax
2203; CHECK-O3-NEXT:    andl %esi, %eax
2204; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
2205; CHECK-O3-NEXT:    retq
2206  %v = load atomic i64, i64* %p unordered, align 8
2207  %trunc = trunc i64 %v to i32
2208  %ret = and i32 %trunc, %v2
2209  ret i32 %ret
2210}
2211
2212; Legal to reduce the load width and fold the load (TODO)
2213define i32 @fold_trunc_or(i64* %p, i32 %v2) {
2214; CHECK-O0-LABEL: fold_trunc_or:
2215; CHECK-O0:       # %bb.0:
2216; CHECK-O0-NEXT:    movq (%rdi), %rax
2217; CHECK-O0-NEXT:    # kill: def $eax killed $eax killed $rax
2218; CHECK-O0-NEXT:    orl %esi, %eax
2219; CHECK-O0-NEXT:    retq
2220;
2221; CHECK-O3-LABEL: fold_trunc_or:
2222; CHECK-O3:       # %bb.0:
2223; CHECK-O3-NEXT:    movq (%rdi), %rax
2224; CHECK-O3-NEXT:    orl %esi, %eax
2225; CHECK-O3-NEXT:    # kill: def $eax killed $eax killed $rax
2226; CHECK-O3-NEXT:    retq
2227  %v = load atomic i64, i64* %p unordered, align 8
2228  %trunc = trunc i64 %v to i32
2229  %ret = or i32 %trunc, %v2
2230  ret i32 %ret
2231}
2232
2233; It's tempting to split the wide load into two smaller byte loads
2234; to reduce memory traffic, but this would be illegal for a atomic load
2235define i32 @split_load(i64* %p) {
2236; CHECK-O0-LABEL: split_load:
2237; CHECK-O0:       # %bb.0:
2238; CHECK-O0-NEXT:    movq (%rdi), %rcx
2239; CHECK-O0-NEXT:    movb %cl, %al
2240; CHECK-O0-NEXT:    shrq $32, %rcx
2241; CHECK-O0-NEXT:    # kill: def $cl killed $cl killed $rcx
2242; CHECK-O0-NEXT:    orb %cl, %al
2243; CHECK-O0-NEXT:    movzbl %al, %eax
2244; CHECK-O0-NEXT:    retq
2245;
2246; CHECK-O3-LABEL: split_load:
2247; CHECK-O3:       # %bb.0:
2248; CHECK-O3-NEXT:    movq (%rdi), %rax
2249; CHECK-O3-NEXT:    movq %rax, %rcx
2250; CHECK-O3-NEXT:    shrq $32, %rcx
2251; CHECK-O3-NEXT:    orl %eax, %ecx
2252; CHECK-O3-NEXT:    movzbl %cl, %eax
2253; CHECK-O3-NEXT:    retq
2254  %v = load atomic i64, i64* %p unordered, align 8
2255  %b1 = trunc i64 %v to i8
2256  %v.shift = lshr i64 %v, 32
2257  %b2 = trunc i64 %v.shift to i8
2258  %or = or i8 %b1, %b2
2259  %ret = zext i8 %or to i32
2260  ret i32 %ret
2261}
2262
2263;; A collection of simple memory forwarding tests.  Nothing particular
2264;; interesting semantic wise, just demonstrating obvious missed transforms.
2265
2266@Zero = constant i64 0
2267
2268; TODO: should return constant
2269define i64 @constant_folding(i64* %p) {
2270; CHECK-LABEL: constant_folding:
2271; CHECK:       # %bb.0:
2272; CHECK-NEXT:    movq (%rdi), %rax
2273; CHECK-NEXT:    retq
2274  %v = load atomic i64, i64* %p unordered, align 8
2275  ret i64 %v
2276}
2277
2278; Legal to forward and fold (TODO)
2279define i64 @load_forwarding(i64* %p) {
2280; CHECK-LABEL: load_forwarding:
2281; CHECK:       # %bb.0:
2282; CHECK-NEXT:    movq (%rdi), %rax
2283; CHECK-NEXT:    orq (%rdi), %rax
2284; CHECK-NEXT:    retq
2285  %v = load atomic i64, i64* %p unordered, align 8
2286  %v2 = load atomic i64, i64* %p unordered, align 8
2287  %ret = or i64 %v, %v2
2288  ret i64 %ret
2289}
2290
2291; Legal to forward (TODO)
2292define i64 @store_forward(i64* %p, i64 %v) {
2293; CHECK-LABEL: store_forward:
2294; CHECK:       # %bb.0:
2295; CHECK-NEXT:    movq %rsi, (%rdi)
2296; CHECK-NEXT:    movq (%rdi), %rax
2297; CHECK-NEXT:    retq
2298  store atomic i64 %v, i64* %p unordered, align 8
2299  %ret = load atomic i64, i64* %p unordered, align 8
2300  ret i64 %ret
2301}
2302
2303; Legal to kill (TODO)
2304define void @dead_writeback(i64* %p) {
2305; CHECK-LABEL: dead_writeback:
2306; CHECK:       # %bb.0:
2307; CHECK-NEXT:    movq (%rdi), %rax
2308; CHECK-NEXT:    movq %rax, (%rdi)
2309; CHECK-NEXT:    retq
2310  %v = load atomic i64, i64* %p unordered, align 8
2311  store atomic i64 %v, i64* %p unordered, align 8
2312  ret void
2313}
2314
2315; Legal to kill (TODO)
2316define void @dead_store(i64* %p, i64 %v) {
2317; CHECK-LABEL: dead_store:
2318; CHECK:       # %bb.0:
2319; CHECK-NEXT:    movq $0, (%rdi)
2320; CHECK-NEXT:    movq %rsi, (%rdi)
2321; CHECK-NEXT:    retq
2322  store atomic i64 0, i64* %p unordered, align 8
2323  store atomic i64 %v, i64* %p unordered, align 8
2324  ret void
2325}
2326
2327;; The next batch of tests ensure that we don't try to fold a load into a
2328;; use where the code motion implied for the load is prevented by a fence.
2329;; Note: We're checking that the load doesn't get moved below the fence as
2330;; part of folding, but is technically legal to lift the add above the fence.
2331;; If that were to happen, please rewrite the test to ensure load movement
2332;; isn't violated.
2333
2334define i64 @nofold_fence(i64* %p) {
2335; CHECK-LABEL: nofold_fence:
2336; CHECK:       # %bb.0:
2337; CHECK-NEXT:    movq (%rdi), %rax
2338; CHECK-NEXT:    mfence
2339; CHECK-NEXT:    addq $15, %rax
2340; CHECK-NEXT:    retq
2341  %v = load atomic i64, i64* %p unordered, align 8
2342  fence seq_cst
2343  %ret = add i64 %v, 15
2344  ret i64 %ret
2345}
2346
2347define i64 @nofold_fence_acquire(i64* %p) {
2348; CHECK-LABEL: nofold_fence_acquire:
2349; CHECK:       # %bb.0:
2350; CHECK-NEXT:    movq (%rdi), %rax
2351; CHECK-NEXT:    #MEMBARRIER
2352; CHECK-NEXT:    addq $15, %rax
2353; CHECK-NEXT:    retq
2354  %v = load atomic i64, i64* %p unordered, align 8
2355  fence acquire
2356  %ret = add i64 %v, 15
2357  ret i64 %ret
2358}
2359
2360
2361define i64 @nofold_stfence(i64* %p) {
2362; CHECK-LABEL: nofold_stfence:
2363; CHECK:       # %bb.0:
2364; CHECK-NEXT:    movq (%rdi), %rax
2365; CHECK-NEXT:    #MEMBARRIER
2366; CHECK-NEXT:    addq $15, %rax
2367; CHECK-NEXT:    retq
2368  %v = load atomic i64, i64* %p unordered, align 8
2369  fence syncscope("singlethread") seq_cst
2370  %ret = add i64 %v, 15
2371  ret i64 %ret
2372}
2373
2374;; Next, test how well we can fold invariant loads.
2375
2376@Constant = external dso_local constant i64
2377
2378define i64 @fold_constant(i64 %arg) {
2379; CHECK-O0-LABEL: fold_constant:
2380; CHECK-O0:       # %bb.0:
2381; CHECK-O0-NEXT:    movq %rdi, %rax
2382; CHECK-O0-NEXT:    addq Constant, %rax
2383; CHECK-O0-NEXT:    retq
2384;
2385; CHECK-O3-LABEL: fold_constant:
2386; CHECK-O3:       # %bb.0:
2387; CHECK-O3-NEXT:    movq %rdi, %rax
2388; CHECK-O3-NEXT:    addq Constant(%rip), %rax
2389; CHECK-O3-NEXT:    retq
2390  %v = load atomic i64, i64* @Constant unordered, align 8
2391  %ret = add i64 %v, %arg
2392  ret i64 %ret
2393}
2394
2395define i64 @fold_constant_clobber(i64* %p, i64 %arg) {
2396; CHECK-O0-LABEL: fold_constant_clobber:
2397; CHECK-O0:       # %bb.0:
2398; CHECK-O0-NEXT:    movq Constant(%rip), %rax
2399; CHECK-O0-NEXT:    movq $5, (%rdi)
2400; CHECK-O0-NEXT:    addq %rsi, %rax
2401; CHECK-O0-NEXT:    retq
2402;
2403; CHECK-O3-CUR-LABEL: fold_constant_clobber:
2404; CHECK-O3-CUR:       # %bb.0:
2405; CHECK-O3-CUR-NEXT:    movq Constant(%rip), %rax
2406; CHECK-O3-CUR-NEXT:    movq $5, (%rdi)
2407; CHECK-O3-CUR-NEXT:    addq %rsi, %rax
2408; CHECK-O3-CUR-NEXT:    retq
2409;
2410; CHECK-O3-EX-LABEL: fold_constant_clobber:
2411; CHECK-O3-EX:       # %bb.0:
2412; CHECK-O3-EX-NEXT:    movq %rsi, %rax
2413; CHECK-O3-EX-NEXT:    addq Constant(%rip), %rax
2414; CHECK-O3-EX-NEXT:    movq $5, (%rdi)
2415; CHECK-O3-EX-NEXT:    retq
2416  %v = load atomic i64, i64* @Constant unordered, align 8
2417  store i64 5, i64* %p
2418  %ret = add i64 %v, %arg
2419  ret i64 %ret
2420}
2421
2422define i64 @fold_constant_fence(i64 %arg) {
2423; CHECK-O0-LABEL: fold_constant_fence:
2424; CHECK-O0:       # %bb.0:
2425; CHECK-O0-NEXT:    movq Constant(%rip), %rax
2426; CHECK-O0-NEXT:    mfence
2427; CHECK-O0-NEXT:    addq %rdi, %rax
2428; CHECK-O0-NEXT:    retq
2429;
2430; CHECK-O3-CUR-LABEL: fold_constant_fence:
2431; CHECK-O3-CUR:       # %bb.0:
2432; CHECK-O3-CUR-NEXT:    movq Constant(%rip), %rax
2433; CHECK-O3-CUR-NEXT:    mfence
2434; CHECK-O3-CUR-NEXT:    addq %rdi, %rax
2435; CHECK-O3-CUR-NEXT:    retq
2436;
2437; CHECK-O3-EX-LABEL: fold_constant_fence:
2438; CHECK-O3-EX:       # %bb.0:
2439; CHECK-O3-EX-NEXT:    movq %rdi, %rax
2440; CHECK-O3-EX-NEXT:    addq Constant(%rip), %rax
2441; CHECK-O3-EX-NEXT:    mfence
2442; CHECK-O3-EX-NEXT:    retq
2443  %v = load atomic i64, i64* @Constant unordered, align 8
2444  fence seq_cst
2445  %ret = add i64 %v, %arg
2446  ret i64 %ret
2447}
2448
2449define i64 @fold_invariant_clobber(i64* dereferenceable(8) %p, i64 %arg) {
2450; CHECK-O0-LABEL: fold_invariant_clobber:
2451; CHECK-O0:       # %bb.0:
2452; CHECK-O0-NEXT:    movq (%rdi), %rax
2453; CHECK-O0-NEXT:    movq $5, (%rdi)
2454; CHECK-O0-NEXT:    addq %rsi, %rax
2455; CHECK-O0-NEXT:    retq
2456;
2457; CHECK-O3-CUR-LABEL: fold_invariant_clobber:
2458; CHECK-O3-CUR:       # %bb.0:
2459; CHECK-O3-CUR-NEXT:    movq (%rdi), %rax
2460; CHECK-O3-CUR-NEXT:    movq $5, (%rdi)
2461; CHECK-O3-CUR-NEXT:    addq %rsi, %rax
2462; CHECK-O3-CUR-NEXT:    retq
2463;
2464; CHECK-O3-EX-LABEL: fold_invariant_clobber:
2465; CHECK-O3-EX:       # %bb.0:
2466; CHECK-O3-EX-NEXT:    movq %rsi, %rax
2467; CHECK-O3-EX-NEXT:    addq (%rdi), %rax
2468; CHECK-O3-EX-NEXT:    movq $5, (%rdi)
2469; CHECK-O3-EX-NEXT:    retq
2470  %v = load atomic i64, i64* %p unordered, align 8, !invariant.load !{}
2471  store i64 5, i64* %p
2472  %ret = add i64 %v, %arg
2473  ret i64 %ret
2474}
2475
2476
2477define i64 @fold_invariant_fence(i64* dereferenceable(8) %p, i64 %arg) {
2478; CHECK-O0-LABEL: fold_invariant_fence:
2479; CHECK-O0:       # %bb.0:
2480; CHECK-O0-NEXT:    movq (%rdi), %rax
2481; CHECK-O0-NEXT:    mfence
2482; CHECK-O0-NEXT:    addq %rsi, %rax
2483; CHECK-O0-NEXT:    retq
2484;
2485; CHECK-O3-CUR-LABEL: fold_invariant_fence:
2486; CHECK-O3-CUR:       # %bb.0:
2487; CHECK-O3-CUR-NEXT:    movq (%rdi), %rax
2488; CHECK-O3-CUR-NEXT:    mfence
2489; CHECK-O3-CUR-NEXT:    addq %rsi, %rax
2490; CHECK-O3-CUR-NEXT:    retq
2491;
2492; CHECK-O3-EX-LABEL: fold_invariant_fence:
2493; CHECK-O3-EX:       # %bb.0:
2494; CHECK-O3-EX-NEXT:    movq %rsi, %rax
2495; CHECK-O3-EX-NEXT:    addq (%rdi), %rax
2496; CHECK-O3-EX-NEXT:    mfence
2497; CHECK-O3-EX-NEXT:    retq
2498  %v = load atomic i64, i64* %p unordered, align 8, !invariant.load !{}
2499  fence seq_cst
2500  %ret = add i64 %v, %arg
2501  ret i64 %ret
2502}
2503
2504
2505; Exercise a few cases involving any extend idioms
2506
2507define i16 @load_i8_anyext_i16(i8* %ptr) {
2508; CHECK-O0-CUR-LABEL: load_i8_anyext_i16:
2509; CHECK-O0-CUR:       # %bb.0:
2510; CHECK-O0-CUR-NEXT:    movb (%rdi), %al
2511; CHECK-O0-CUR-NEXT:    movzbl %al, %eax
2512; CHECK-O0-CUR-NEXT:    # kill: def $ax killed $ax killed $eax
2513; CHECK-O0-CUR-NEXT:    retq
2514;
2515; CHECK-O3-CUR-LABEL: load_i8_anyext_i16:
2516; CHECK-O3-CUR:       # %bb.0:
2517; CHECK-O3-CUR-NEXT:    movzbl (%rdi), %eax
2518; CHECK-O3-CUR-NEXT:    # kill: def $ax killed $ax killed $eax
2519; CHECK-O3-CUR-NEXT:    retq
2520;
2521; CHECK-O0-EX-LABEL: load_i8_anyext_i16:
2522; CHECK-O0-EX:       # %bb.0:
2523; CHECK-O0-EX-NEXT:    vpbroadcastb (%rdi), %xmm0
2524; CHECK-O0-EX-NEXT:    vmovd %xmm0, %eax
2525; CHECK-O0-EX-NEXT:    # kill: def $ax killed $ax killed $eax
2526; CHECK-O0-EX-NEXT:    retq
2527;
2528; CHECK-O3-EX-LABEL: load_i8_anyext_i16:
2529; CHECK-O3-EX:       # %bb.0:
2530; CHECK-O3-EX-NEXT:    vpbroadcastb (%rdi), %xmm0
2531; CHECK-O3-EX-NEXT:    vmovd %xmm0, %eax
2532; CHECK-O3-EX-NEXT:    # kill: def $ax killed $ax killed $eax
2533; CHECK-O3-EX-NEXT:    retq
2534  %v = load atomic i8, i8* %ptr unordered, align 2
2535  %vec = insertelement <2 x i8> undef, i8 %v, i32 0
2536  %res = bitcast <2 x i8> %vec to i16
2537  ret i16 %res
2538}
2539
2540define i32 @load_i8_anyext_i32(i8* %ptr) {
2541; CHECK-O0-CUR-LABEL: load_i8_anyext_i32:
2542; CHECK-O0-CUR:       # %bb.0:
2543; CHECK-O0-CUR-NEXT:    movb (%rdi), %al
2544; CHECK-O0-CUR-NEXT:    movzbl %al, %eax
2545; CHECK-O0-CUR-NEXT:    retq
2546;
2547; CHECK-O3-CUR-LABEL: load_i8_anyext_i32:
2548; CHECK-O3-CUR:       # %bb.0:
2549; CHECK-O3-CUR-NEXT:    movzbl (%rdi), %eax
2550; CHECK-O3-CUR-NEXT:    retq
2551;
2552; CHECK-O0-EX-LABEL: load_i8_anyext_i32:
2553; CHECK-O0-EX:       # %bb.0:
2554; CHECK-O0-EX-NEXT:    vpbroadcastb (%rdi), %xmm0
2555; CHECK-O0-EX-NEXT:    vmovd %xmm0, %eax
2556; CHECK-O0-EX-NEXT:    retq
2557;
2558; CHECK-O3-EX-LABEL: load_i8_anyext_i32:
2559; CHECK-O3-EX:       # %bb.0:
2560; CHECK-O3-EX-NEXT:    vpbroadcastb (%rdi), %xmm0
2561; CHECK-O3-EX-NEXT:    vmovd %xmm0, %eax
2562; CHECK-O3-EX-NEXT:    retq
2563  %v = load atomic i8, i8* %ptr unordered, align 4
2564  %vec = insertelement <4 x i8> undef, i8 %v, i32 0
2565  %res = bitcast <4 x i8> %vec to i32
2566  ret i32 %res
2567}
2568
2569define i32 @load_i16_anyext_i32(i16* %ptr) {
2570; CHECK-O0-CUR-LABEL: load_i16_anyext_i32:
2571; CHECK-O0-CUR:       # %bb.0:
2572; CHECK-O0-CUR-NEXT:    movw (%rdi), %cx
2573; CHECK-O0-CUR-NEXT:    # implicit-def: $eax
2574; CHECK-O0-CUR-NEXT:    movw %cx, %ax
2575; CHECK-O0-CUR-NEXT:    retq
2576;
2577; CHECK-O3-CUR-LABEL: load_i16_anyext_i32:
2578; CHECK-O3-CUR:       # %bb.0:
2579; CHECK-O3-CUR-NEXT:    movzwl (%rdi), %eax
2580; CHECK-O3-CUR-NEXT:    retq
2581;
2582; CHECK-O0-EX-LABEL: load_i16_anyext_i32:
2583; CHECK-O0-EX:       # %bb.0:
2584; CHECK-O0-EX-NEXT:    vpbroadcastw (%rdi), %xmm0
2585; CHECK-O0-EX-NEXT:    vmovd %xmm0, %eax
2586; CHECK-O0-EX-NEXT:    retq
2587;
2588; CHECK-O3-EX-LABEL: load_i16_anyext_i32:
2589; CHECK-O3-EX:       # %bb.0:
2590; CHECK-O3-EX-NEXT:    vpbroadcastw (%rdi), %xmm0
2591; CHECK-O3-EX-NEXT:    vmovd %xmm0, %eax
2592; CHECK-O3-EX-NEXT:    retq
2593  %v = load atomic i16, i16* %ptr unordered, align 4
2594  %vec = insertelement <2 x i16> undef, i16 %v, i64 0
2595  %res = bitcast <2 x i16> %vec to i32
2596  ret i32 %res
2597}
2598
2599define i64 @load_i16_anyext_i64(i16* %ptr) {
2600; CHECK-O0-CUR-LABEL: load_i16_anyext_i64:
2601; CHECK-O0-CUR:       # %bb.0:
2602; CHECK-O0-CUR-NEXT:    movw (%rdi), %cx
2603; CHECK-O0-CUR-NEXT:    # implicit-def: $eax
2604; CHECK-O0-CUR-NEXT:    movw %cx, %ax
2605; CHECK-O0-CUR-NEXT:    vmovd %eax, %xmm0
2606; CHECK-O0-CUR-NEXT:    vmovq %xmm0, %rax
2607; CHECK-O0-CUR-NEXT:    retq
2608;
2609; CHECK-O3-CUR-LABEL: load_i16_anyext_i64:
2610; CHECK-O3-CUR:       # %bb.0:
2611; CHECK-O3-CUR-NEXT:    movzwl (%rdi), %eax
2612; CHECK-O3-CUR-NEXT:    vmovd %eax, %xmm0
2613; CHECK-O3-CUR-NEXT:    vmovq %xmm0, %rax
2614; CHECK-O3-CUR-NEXT:    retq
2615;
2616; CHECK-O0-EX-LABEL: load_i16_anyext_i64:
2617; CHECK-O0-EX:       # %bb.0:
2618; CHECK-O0-EX-NEXT:    vpbroadcastw (%rdi), %xmm0
2619; CHECK-O0-EX-NEXT:    vmovq %xmm0, %rax
2620; CHECK-O0-EX-NEXT:    retq
2621;
2622; CHECK-O3-EX-LABEL: load_i16_anyext_i64:
2623; CHECK-O3-EX:       # %bb.0:
2624; CHECK-O3-EX-NEXT:    vpbroadcastw (%rdi), %xmm0
2625; CHECK-O3-EX-NEXT:    vmovq %xmm0, %rax
2626; CHECK-O3-EX-NEXT:    retq
2627  %v = load atomic i16, i16* %ptr unordered, align 8
2628  %vec = insertelement <4 x i16> undef, i16 %v, i64 0
2629  %res = bitcast <4 x i16> %vec to i64
2630  ret i64 %res
2631}
2632
2633; TODO: Would be legal to combine for legal atomic wider types
2634define i16 @load_combine(i8* %p) {
2635; CHECK-O0-LABEL: load_combine:
2636; CHECK-O0:       # %bb.0:
2637; CHECK-O0-NEXT:    movb (%rdi), %al
2638; CHECK-O0-NEXT:    movb 1(%rdi), %cl
2639; CHECK-O0-NEXT:    movzbl %al, %eax
2640; CHECK-O0-NEXT:    # kill: def $ax killed $ax killed $eax
2641; CHECK-O0-NEXT:    movzbl %cl, %ecx
2642; CHECK-O0-NEXT:    # kill: def $cx killed $cx killed $ecx
2643; CHECK-O0-NEXT:    shlw $8, %cx
2644; CHECK-O0-NEXT:    orw %cx, %ax
2645; CHECK-O0-NEXT:    retq
2646;
2647; CHECK-O3-LABEL: load_combine:
2648; CHECK-O3:       # %bb.0:
2649; CHECK-O3-NEXT:    movzbl (%rdi), %ecx
2650; CHECK-O3-NEXT:    movzbl 1(%rdi), %eax
2651; CHECK-O3-NEXT:    shll $8, %eax
2652; CHECK-O3-NEXT:    orl %ecx, %eax
2653; CHECK-O3-NEXT:    # kill: def $ax killed $ax killed $eax
2654; CHECK-O3-NEXT:    retq
2655  %v1 = load atomic i8, i8* %p unordered, align 2
2656  %p2 = getelementptr i8, i8* %p, i64 1
2657  %v2 = load atomic i8, i8* %p2 unordered, align 1
2658  %v1.ext = zext i8 %v1 to i16
2659  %v2.ext = zext i8 %v2 to i16
2660  %v2.sht = shl i16 %v2.ext, 8
2661  %res = or i16 %v1.ext, %v2.sht
2662  ret i16 %res
2663}
2664
2665define i1 @fold_cmp_over_fence(i32* %p, i32 %v1) {
2666; CHECK-O0-LABEL: fold_cmp_over_fence:
2667; CHECK-O0:       # %bb.0:
2668; CHECK-O0-NEXT:    movl (%rdi), %eax
2669; CHECK-O0-NEXT:    mfence
2670; CHECK-O0-NEXT:    cmpl %eax, %esi
2671; CHECK-O0-NEXT:    jne .LBB116_2
2672; CHECK-O0-NEXT:  # %bb.1: # %taken
2673; CHECK-O0-NEXT:    movb $1, %al
2674; CHECK-O0-NEXT:    retq
2675; CHECK-O0-NEXT:  .LBB116_2: # %untaken
2676; CHECK-O0-NEXT:    xorl %eax, %eax
2677; CHECK-O0-NEXT:    # kill: def $al killed $al killed $eax
2678; CHECK-O0-NEXT:    retq
2679;
2680; CHECK-O3-CUR-LABEL: fold_cmp_over_fence:
2681; CHECK-O3-CUR:       # %bb.0:
2682; CHECK-O3-CUR-NEXT:    movl (%rdi), %eax
2683; CHECK-O3-CUR-NEXT:    mfence
2684; CHECK-O3-CUR-NEXT:    cmpl %eax, %esi
2685; CHECK-O3-CUR-NEXT:    jne .LBB116_2
2686; CHECK-O3-CUR-NEXT:  # %bb.1: # %taken
2687; CHECK-O3-CUR-NEXT:    movb $1, %al
2688; CHECK-O3-CUR-NEXT:    retq
2689; CHECK-O3-CUR-NEXT:  .LBB116_2: # %untaken
2690; CHECK-O3-CUR-NEXT:    xorl %eax, %eax
2691; CHECK-O3-CUR-NEXT:    retq
2692;
2693; CHECK-O3-EX-LABEL: fold_cmp_over_fence:
2694; CHECK-O3-EX:       # %bb.0:
2695; CHECK-O3-EX-NEXT:    cmpl (%rdi), %esi
2696; CHECK-O3-EX-NEXT:    mfence
2697; CHECK-O3-EX-NEXT:    jne .LBB116_2
2698; CHECK-O3-EX-NEXT:  # %bb.1: # %taken
2699; CHECK-O3-EX-NEXT:    movb $1, %al
2700; CHECK-O3-EX-NEXT:    retq
2701; CHECK-O3-EX-NEXT:  .LBB116_2: # %untaken
2702; CHECK-O3-EX-NEXT:    xorl %eax, %eax
2703; CHECK-O3-EX-NEXT:    retq
2704  %v2 = load atomic i32, i32* %p unordered, align 4
2705  fence seq_cst
2706  %cmp = icmp eq i32 %v1, %v2
2707  br i1 %cmp, label %taken, label %untaken
2708taken:
2709  ret i1 true
2710untaken:
2711  ret i1 false
2712}
2713