;RUN: llc < %s -march=amdgcn -mcpu=verde -amdgpu-atomic-optimizations=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
;RUN: llc < %s -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI

;CHECK-LABEL: {{^}}test1:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc
;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_swap v0, v[1:2], s[0:3], 0 idxen offen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen offset:42 glc
;CHECK-DAG: s_waitcnt vmcnt(0)
;SICI: buffer_atomic_swap v0, v1, s[0:3], 0 offen glc
;VI: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:4 glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_swap v0, off, s[0:3], 0{{$}}
define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) {
main_body:
  %o1 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
  %o2 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %o3 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o2, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0)
  %o4 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o3, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0)
  %ofs.5 = add i32 %voffset, 42
  %o5 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o4, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0)
  %o6 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o5, <4 x i32> %rsrc, i32 0, i32 8192, i1 0)
  %unused = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o6, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
  %out = bitcast i32 %o6 to float
  ret float %out
}

;CHECK-LABEL: {{^}}test11:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_atomic_swap_x2 v[3:4], off, s[0:3], 0 glc
;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_swap_x2 v[3:4], v1, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_swap_x2 v[3:4], v2, s[0:3], 0 offen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_swap_x2 v[3:4], v[1:2], s[0:3], 0 idxen offen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_swap_x2 v[3:4], v2, s[0:3], 0 offen offset:42 glc
;CHECK-DAG: s_waitcnt vmcnt(0)
;SICI: buffer_atomic_swap_x2 v[3:4], v0, s[0:3], 0 offen glc
;VI: buffer_atomic_swap_x2 v[3:4], off, s[0:3], [[SOFS]] offset:4 glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_swap_x2 v[3:4], off, s[0:3], 0{{$}}
define amdgpu_ps float @test11(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) {
main_body:
  %o0 = sext i32 %data to i64
  %o1 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o0, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
  %o2 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %o3 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o2, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0)
  %o4 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o3, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0)
  %ofs.5 = add i32 %voffset, 42
  %o5 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o4, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0)
  %o6 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o5, <4 x i32> %rsrc, i32 0, i32 8192, i1 0)
  %unused = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o6, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
  %o7 = trunc i64 %o6 to i32
  %out = bitcast i32 %o7 to float
  ret float %out
}

;CHECK-LABEL: {{^}}test2:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_sub v0, v1, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_smin v0, v1, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_umin v0, v1, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_smax v0, v1, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_umax v0, v1, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_and v0, v1, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_or v0, v1, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_xor v0, v1, s[0:3], 0 idxen glc
define amdgpu_ps float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) {
main_body:
  %t1 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %t2 = call i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32 %t1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %t3 = call i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32 %t2, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %t4 = call i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32 %t3, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %t5 = call i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32 %t4, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %t6 = call i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32 %t5, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %t7 = call i32 @llvm.amdgcn.buffer.atomic.and.i32(i32 %t6, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %t8 = call i32 @llvm.amdgcn.buffer.atomic.or.i32(i32 %t7, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %t9 = call i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32 %t8, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %out = bitcast i32 %t9 to float
  ret float %out
}

;CHECK-LABEL: {{^}}test3:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_atomic_add_x2 v[0:1], v2, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_sub_x2 v[0:1], v2, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_smin_x2 v[0:1], v2, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_umin_x2 v[0:1], v2, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_smax_x2 v[0:1], v2, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_umax_x2 v[0:1], v2, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_and_x2 v[0:1], v2, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_or_x2 v[0:1], v2, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_xor_x2 v[0:1], v2, s[0:3], 0 idxen glc
define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) {
main_body:
  %t0 = sext i32 %data to i64
  %t1 = call i64 @llvm.amdgcn.buffer.atomic.add.i64(i64 %t0, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %t2 = call i64 @llvm.amdgcn.buffer.atomic.sub.i64(i64 %t1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %t3 = call i64 @llvm.amdgcn.buffer.atomic.smin.i64(i64 %t2, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %t4 = call i64 @llvm.amdgcn.buffer.atomic.umin.i64(i64 %t3, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %t5 = call i64 @llvm.amdgcn.buffer.atomic.smax.i64(i64 %t4, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %t6 = call i64 @llvm.amdgcn.buffer.atomic.umax.i64(i64 %t5, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %t7 = call i64 @llvm.amdgcn.buffer.atomic.and.i64(i64 %t6, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %t8 = call i64 @llvm.amdgcn.buffer.atomic.or.i64(i64 %t7, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %t9 = call i64 @llvm.amdgcn.buffer.atomic.xor.i64(i64 %t8, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %t10 = trunc i64 %t9 to i32
  %out = bitcast i32 %t10 to float
  ret float %out
}

; Ideally, we would teach tablegen & friends that cmpswap only modifies the
; first vgpr. Since we don't do that yet, the register allocator will have to
; create copies which we don't bother to track here.
;
;CHECK-LABEL: {{^}}test4:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 glc
;CHECK: s_waitcnt vmcnt(0)
;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v[2:3], s[0:3], 0 idxen offen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen offset:44 glc
;CHECK-DAG: s_waitcnt vmcnt(0)
;SICI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen glc
;VI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:4 glc
define amdgpu_ps float @test4(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) {
main_body:
  %o1 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
  %o2 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
  %o3 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o2, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0)
  %o4 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o3, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0)
  %ofs.5 = add i32 %voffset, 44
  %o5 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o4, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0)
  %o6 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o5, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 8192, i1 0)

; Detecting the no-return variant doesn't work right now because of how the
; intrinsic is replaced by an instruction that feeds into an EXTRACT_SUBREG.
; Since there probably isn't a reasonable use-case of cmpswap that discards
; the return value, that seems okay.
;
; %unused = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o6, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
  %out = bitcast i32 %o6 to float
  ret float %out
}

;CHECK-LABEL: {{^}}test7:
;CHECK: buffer_atomic_add v0,
define amdgpu_ps float @test7() {
main_body:
  %v = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 1, <4 x i32> undef, i32 0, i32 4, i1 false)
  %v.float = bitcast i32 %v to float
  ret float %v.float
}

declare i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i1) #0
declare i32 @llvm.amdgcn.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i1) #0
declare i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i1) #0
declare i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32, <4 x i32>, i32, i32, i1) #0
declare i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32, <4 x i32>, i32, i32, i1) #0
declare i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32, <4 x i32>, i32, i32, i1) #0
declare i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i1) #0
declare i32 @llvm.amdgcn.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i1) #0
declare i32 @llvm.amdgcn.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i1) #0
declare i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i1) #0
declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #0
declare i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64, <4 x i32>, i32, i32, i1) #0
declare i64 @llvm.amdgcn.buffer.atomic.add.i64(i64, <4 x i32>, i32, i32, i1) #0
declare i64 @llvm.amdgcn.buffer.atomic.sub.i64(i64, <4 x i32>, i32, i32, i1) #0
declare i64 @llvm.amdgcn.buffer.atomic.smin.i64(i64, <4 x i32>, i32, i32, i1) #0
declare i64 @llvm.amdgcn.buffer.atomic.umin.i64(i64, <4 x i32>, i32, i32, i1) #0
declare i64 @llvm.amdgcn.buffer.atomic.smax.i64(i64, <4 x i32>, i32, i32, i1) #0
declare i64 @llvm.amdgcn.buffer.atomic.umax.i64(i64, <4 x i32>, i32, i32, i1) #0
declare i64 @llvm.amdgcn.buffer.atomic.and.i64(i64, <4 x i32>, i32, i32, i1) #0
declare i64 @llvm.amdgcn.buffer.atomic.or.i64(i64, <4 x i32>, i32, i32, i1) #0
declare i64 @llvm.amdgcn.buffer.atomic.xor.i64(i64, <4 x i32>, i32, i32, i1) #0

attributes #0 = { nounwind }