; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9,SI-SDAG,GCN-SDAG %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-SDAG,GCN-SDAG %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-SDAG,GCN-SDAG %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9,CIPLUS-SDAG,GCN-SDAG %s

; Codegen tests for the llvm.amdgcn.ds.append intrinsic on addrspace(3) (the
; "lds" tests) and addrspace(2) (the "gds" tests) pointers. Each test checks
; that the pointer ends up in m0, how a constant offset is materialized, and
; that no buffer_wbinvl1 is emitted around the operation.
;
; NOTE(review): the SI-GISEL, CIPLUS-GISEL and GCN-GISEL check lines below are
; not enabled by any of the invocations above, so FileCheck ignores them. They
; only take effect once a -global-isel invocation with those prefixes is added.

; Basic case: the pointer is a kernel argument and no offset is applied.
; GCN-LABEL: {{^}}ds_append_lds:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_append_lds(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
  %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i1 false)
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; 16383 i32 elements = 65532 bytes, which is expected to be folded into the
; offset field of the instruction.
; GCN-LABEL: {{^}}ds_append_lds_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_append [[RESULT:v[0-9]+]] offset:65532{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_append_lds_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16383
  %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i1 false)
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; The addrspace(3) pointer is loaded from memory here instead of being passed
; directly as a kernel argument. SI is expected to add the 16-byte offset to
; the pointer itself, while the other targets fold it into the offset field.
; GCN-LABEL: {{^}}ds_append_no_fold_offset_si:
; GCN: s_load_dword [[PTR:s[0-9]+]]

; SI: s_add_i32 [[PTR]], [[PTR]], 16
; SI: s_mov_b32 m0, [[PTR]]
; SI: ds_append [[RESULT:v[0-9]+]]{{$}}

; CIPLUS: s_mov_b32 m0, [[PTR]]
; CIPLUS: ds_append [[RESULT:v[0-9]+]] offset:16{{$}}

; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_append_no_fold_offset_si(i32 addrspace(3)* addrspace(4)* %lds.ptr, i32 addrspace(1)* %out) #0 {
  %lds = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* %lds.ptr, align 4
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 4
  %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i1 false)
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; 16384 i32 elements = 65536 bytes (0x10000), one element past the largest
; offset checked above, so the offset is expected to be applied to the pointer
; and the instruction is expected to carry no offset.
; GCN-LABEL: {{^}}ds_append_lds_over_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]

; SI-SDAG: s_bitset1_b32 [[PTR]], 16
; CIPLUS-SDAG: s_add_i32 [[PTR]], [[PTR]], 0x10000
; GCN-SDAG: s_mov_b32 m0, [[PTR]]

; SI-GISEL: s_bitset1_b32 m0, 16
; CIPLUS-GISEL: s_add_u32 m0, [[PTR]], 0x10000

; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_append_lds_over_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16384
  %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i1 false)
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; Not a kernel: the pointer is checked in v0 and is expected to reach m0
; through v_readfirstlane_b32.
; GCN-LABEL: {{^}}ds_append_lds_vgpr_addr:
; GCN-SDAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
; GCN-SDAG: s_mov_b32 m0, [[READLANE]]

; GCN-GISEL: v_readfirstlane_b32 m0, v0

; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define void @ds_append_lds_vgpr_addr(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
  %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i1 false)
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; Same as the basic case but with an addrspace(2) pointer; the instruction is
; expected to carry the gds modifier.
; GCN-LABEL: {{^}}ds_append_gds:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_append [[RESULT:v[0-9]+]] gds{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_append_gds(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
  %val = call i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* %gds, i1 false)
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; 65532-byte offset on an addrspace(2) pointer: expected to be folded into the
; offset field together with the gds modifier.
; GCN-LABEL: {{^}}ds_append_gds_max_offset:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_append [[RESULT:v[0-9]+]] offset:65532 gds{{$}}
; GCN-NOT: buffer_wbinvl1
; GCN: {{.*}}store{{.*}} [[RESULT]]
define amdgpu_kernel void @ds_append_gds_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
  %gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16383
  %val = call i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* %gep, i1 false)
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; 65536-byte offset on an addrspace(2) pointer. Only the absence of
; buffer_wbinvl1 is checked; the address computation is not pinned down.
; GCN-LABEL: {{^}}ds_append_gds_over_max_offset:
; GCN-NOT: buffer_wbinvl1
define amdgpu_kernel void @ds_append_gds_over_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
  %gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16384
  %val = call i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* %gep, i1 false)
  store i32 %val, i32 addrspace(1)* %out
  ret void
}

; The append is followed by a volatile addrspace(3) load. Targets before gfx9
; are expected to write -1 to m0 after the append; gfx9 is expected not to
; mention m0 between the append and the store.
; GCN-LABEL: {{^}}ds_append_lds_m0_restore:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
; GCN-NOT: buffer_wbinvl1
; NOTGFX9: s_mov_b32 m0, -1
; GFX9-NOT: m0
; GCN: _store_dword
; GCN: ds_read_b32
define amdgpu_kernel void @ds_append_lds_m0_restore(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
  %val0 = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i1 false)
  store i32 %val0, i32 addrspace(1)* %out
  %val1 = load volatile i32, i32 addrspace(3)* %lds
  ret void
}

; Make sure this selects successfully when the result is unused. The result
; register still needs to be constrained.
; GCN-LABEL: {{^}}ds_append_lds_no_use:
; GCN: s_load_dword [[PTR:s[0-9]+]]
; GCN: s_mov_b32 m0, [[PTR]]
; GCN: ds_append [[RESULT:v[0-9]+]] offset:65532{{$}}
define amdgpu_kernel void @ds_append_lds_no_use(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16383
  %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i1 false)
  ret void
}

declare i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* nocapture, i1 immarg) #1
declare i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* nocapture, i1 immarg) #1

attributes #0 = { nounwind }
attributes #1 = { argmemonly convergent nounwind }