; Codegen test for the llvm.amdgcn.buffer.atomic.* intrinsics: checks MUBUF
; buffer_atomic_* instruction selection (addressing modes, glc for used
; results, and waitcnt insertion) on SI (verde) and VI (tonga).
;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s

; Swap with every addressing mode: plain, idxen, offen, idxen+offen, an
; immediate offset folded from an add, a large offset split into an SGPR
; soffset plus offset:1, and finally a no-return variant (no glc).
;CHECK-LABEL: {{^}}test1:
;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc
;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_swap v0, v[1:2], s[0:3], 0 idxen offen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen offset:42 glc
;CHECK-DAG: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:1 glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_swap v0, off, s[0:3], 0{{$}}
define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) {
main_body:
  %o1 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
  %o2 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %o3 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o2, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0)
  %o4 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o3, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0)
  %ofs.5 = add i32 %voffset, 42
  %o5 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o4, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0)
  %o6 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o5, <4 x i32> %rsrc, i32 0, i32 8192, i1 0)
  %unused = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o6, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
  %out = bitcast i32 %o6 to float
  ret float %out
}

; One of each remaining single-operand atomic opcode, all with idxen.
;CHECK-LABEL: {{^}}test2:
;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_sub v0, v1, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_smin v0, v1, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_umin v0, v1, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_smax v0, v1, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_umax v0, v1, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_and v0, v1, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_or v0, v1, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_xor v0, v1, s[0:3], 0 idxen glc
define amdgpu_ps float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) {
main_body:
  %t1 = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %t2 = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 %t1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %t3 = call i32 @llvm.amdgcn.buffer.atomic.smin(i32 %t2, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %t4 = call i32 @llvm.amdgcn.buffer.atomic.umin(i32 %t3, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %t5 = call i32 @llvm.amdgcn.buffer.atomic.smax(i32 %t4, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %t6 = call i32 @llvm.amdgcn.buffer.atomic.umax(i32 %t5, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %t7 = call i32 @llvm.amdgcn.buffer.atomic.and(i32 %t6, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %t8 = call i32 @llvm.amdgcn.buffer.atomic.or(i32 %t7, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %t9 = call i32 @llvm.amdgcn.buffer.atomic.xor(i32 %t8, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %out = bitcast i32 %t9 to float
  ret float %out
}

; Ideally, we would teach tablegen & friends that cmpswap only modifies the
; first vgpr. Since we don't do that yet, the register allocator will have to
; create copies which we don't bother to track here.
;
;CHECK-LABEL: {{^}}test3:
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v[2:3], s[0:3], 0 idxen offen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen offset:42 glc
;CHECK-DAG: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:1 glc
define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) {
main_body:
  %o1 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
  %o2 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %o3 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o2, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0)
  %o4 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o3, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0)
  %ofs.5 = add i32 %voffset, 42
  %o5 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o4, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0)
  %o6 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o5, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 8192, i1 0)

; Detecting the no-return variant doesn't work right now because of how the
; intrinsic is replaced by an instruction that feeds into an EXTRACT_SUBREG.
; Since there probably isn't a reasonable use-case of cmpswap that discards
; the return value, that seems okay.
;
; %unused = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o6, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
  %out = bitcast i32 %o6 to float
  ret float %out
}

; An atomic on an undef rsrc must still select to a buffer_atomic_add and
; not crash or be dropped.
;CHECK-LABEL: {{^}}test4:
;CHECK: buffer_atomic_add v0,
define amdgpu_ps float @test4() {
main_body:
  %v = call i32 @llvm.amdgcn.buffer.atomic.add(i32 1, <4 x i32> undef, i32 0, i32 4, i1 false)
  %v.float = bitcast i32 %v to float
  ret float %v.float
}

declare i32 @llvm.amdgcn.buffer.atomic.swap(i32, <4 x i32>, i32, i32, i1) #0
declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1) #0
declare i32 @llvm.amdgcn.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i1) #0
declare i32 @llvm.amdgcn.buffer.atomic.smin(i32, <4 x i32>, i32, i32, i1) #0
declare i32 @llvm.amdgcn.buffer.atomic.umin(i32, <4 x i32>, i32, i32, i1) #0
declare i32 @llvm.amdgcn.buffer.atomic.smax(i32, <4 x i32>, i32, i32, i1) #0
declare i32 @llvm.amdgcn.buffer.atomic.umax(i32, <4 x i32>, i32, i32, i1) #0
declare i32 @llvm.amdgcn.buffer.atomic.and(i32, <4 x i32>, i32, i32, i1) #0
declare i32 @llvm.amdgcn.buffer.atomic.or(i32, <4 x i32>, i32, i32, i1) #0
declare i32 @llvm.amdgcn.buffer.atomic.xor(i32, <4 x i32>, i32, i32, i1) #0
declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #0

attributes #0 = { nounwind }