; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire < %s | FileCheck -check-prefixes=CHECK,CIVI %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,CIVI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,CIVI,CIVI-HSA %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,GFX10 %s

; Check selection of flat (generic) memory instructions for loads and
; stores through pointers produced by addrspacecast from global and
; private (scratch) pointers.

; CHECK-LABEL: {{^}}store_flat_i32:
; CHECK-DAG: s_load_dwordx2 s[[[LO_SREG:[0-9]+]]:[[HI_SREG:[0-9]+]]],
; CHECK-DAG: s_load_dword s[[SDATA:[0-9]+]],
; CHECK: s_waitcnt lgkmcnt(0)
; CHECK-DAG: v_mov_b32_e32 v[[DATA:[0-9]+]], s[[SDATA]]
; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
; CHECK: flat_store_dword v[[[LO_VREG]]:[[HI_VREG]]], v[[DATA]]
define amdgpu_kernel void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
  %fptr = addrspacecast i32 addrspace(1)* %gptr to i32*
  store volatile i32 %x, i32* %fptr, align 4
  ret void
}

; CHECK-LABEL: {{^}}store_flat_i64:
; CHECK: flat_store_dwordx2
define amdgpu_kernel void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 {
  %fptr = addrspacecast i64 addrspace(1)* %gptr to i64*
  store volatile i64 %x, i64* %fptr, align 8
  ret void
}

; CHECK-LABEL: {{^}}store_flat_v4i32:
; CHECK: flat_store_dwordx4
define amdgpu_kernel void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 {
  %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32>*
  store volatile <4 x i32> %x, <4 x i32>* %fptr, align 16
  ret void
}

; CHECK-LABEL: {{^}}store_flat_trunc_i16:
; CHECK: flat_store_short
define amdgpu_kernel void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 {
  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16*
  %y = trunc i32 %x to i16
  store volatile i16 %y, i16* %fptr, align 2
  ret void
}

; CHECK-LABEL: {{^}}store_flat_trunc_i8:
; CHECK: flat_store_byte
define amdgpu_kernel void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8*
  %y = trunc i32 %x to i8
  store volatile i8 %y, i8* %fptr, align 2
  ret void
}

; CHECK-LABEL: load_flat_i32:
; CHECK: flat_load_dword
define amdgpu_kernel void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast i32 addrspace(1)* %gptr to i32*
  %fload = load volatile i32, i32* %fptr, align 4
  store i32 %fload, i32 addrspace(1)* %out, align 4
  ret void
}

; CHECK-LABEL: load_flat_i64:
; CHECK: flat_load_dwordx2
define amdgpu_kernel void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast i64 addrspace(1)* %gptr to i64*
  %fload = load volatile i64, i64* %fptr, align 8
  store i64 %fload, i64 addrspace(1)* %out, align 8
  ret void
}

; CHECK-LABEL: load_flat_v4i32:
; CHECK: flat_load_dwordx4
define amdgpu_kernel void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32>*
  %fload = load volatile <4 x i32>, <4 x i32>* %fptr, align 32
  store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8
  ret void
}
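
; Sub-dword loads through flat pointers must select the extending
; flat_load variants (sbyte/ubyte for i8, sshort/ushort for i16).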

; CHECK-LABEL: sextload_flat_i8:
; CHECK: flat_load_sbyte
define amdgpu_kernel void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8*
  %fload = load volatile i8, i8* %fptr, align 4
  %ext = sext i8 %fload to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; CHECK-LABEL: zextload_flat_i8:
; CHECK: flat_load_ubyte
define amdgpu_kernel void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8*
  %fload = load volatile i8, i8* %fptr, align 4
  %ext = zext i8 %fload to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; CHECK-LABEL: sextload_flat_i16:
; CHECK: flat_load_sshort
define amdgpu_kernel void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16*
  %fload = load volatile i16, i16* %fptr, align 4
  %ext = sext i16 %fload to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; CHECK-LABEL: zextload_flat_i16:
; CHECK: flat_load_ushort
define amdgpu_kernel void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16*
  %fload = load volatile i16, i16* %fptr, align 4
  %ext = zext i16 %fload to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; CHECK-LABEL: flat_scratch_unaligned_load:
; CHECK: flat_load_ubyte
; CHECK: flat_load_ubyte
; CHECK: flat_load_ubyte
; CHECK: flat_load_ubyte
define amdgpu_kernel void @flat_scratch_unaligned_load() {
  %scratch = alloca i32, addrspace(5)
  %fptr = addrspacecast i32 addrspace(5)* %scratch to i32*
  %ld = load volatile i32, i32* %fptr, align 1
  ret void
}

; CHECK-LABEL: flat_scratch_unaligned_store:
; CHECK: flat_store_byte
; CHECK: flat_store_byte
; CHECK: flat_store_byte
; CHECK: flat_store_byte
define amdgpu_kernel void @flat_scratch_unaligned_store() {
  %scratch = alloca i32, addrspace(5)
  %fptr = addrspacecast i32 addrspace(5)* %scratch to i32*
  store volatile i32 0, i32* %fptr, align 1
  ret void
}

; FIXME: The two multidword tests below are broken for os = mesa3d,
; because it doesn't initialize flat_scr.

; CHECK-LABEL: flat_scratch_multidword_load:
; CIVI-HSA: flat_load_dword v
; CIVI-HSA: flat_load_dword v
; GFX9: flat_load_dwordx2
; GFX10: flat_load_dwordx2
define amdgpu_kernel void @flat_scratch_multidword_load() {
  %scratch = alloca <2 x i32>, addrspace(5)
  %fptr = addrspacecast <2 x i32> addrspace(5)* %scratch to <2 x i32>*
  %ld = load volatile <2 x i32>, <2 x i32>* %fptr
  ret void
}

; CHECK-LABEL: flat_scratch_multidword_store:
; CIVI-HSA: flat_store_dword v
; CIVI-HSA: flat_store_dword v
; GFX9: flat_store_dwordx2
; GFX10: flat_store_dwordx2
define amdgpu_kernel void @flat_scratch_multidword_store() {
  %scratch = alloca <2 x i32>, addrspace(5)
  %fptr = addrspacecast <2 x i32> addrspace(5)* %scratch to <2 x i32>*
  store volatile <2 x i32> zeroinitializer, <2 x i32>* %fptr
  ret void
}
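
; Immediate offset folding into flat instructions: CI/VI flat
; instructions have no offset field, so only the GFX9 checks expect a
; folded offset:4095 below. 4096 and negative offsets are out of range
; for GFX9 and must be added to the pointer first, and the GFX10 checks
; expect no folded offset.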

; CHECK-LABEL: {{^}}store_flat_i8_max_offset:
; CIVI: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
; GFX9: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:4095{{$}}
define amdgpu_kernel void @store_flat_i8_max_offset(i8* %fptr, i8 %x) #0 {
  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4095
  store volatile i8 %x, i8* %fptr.offset
  ret void
}

; CHECK-LABEL: {{^}}store_flat_i8_max_offset_p1:
; CHECK: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @store_flat_i8_max_offset_p1(i8* %fptr, i8 %x) #0 {
  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4096
  store volatile i8 %x, i8* %fptr.offset
  ret void
}

; CHECK-LABEL: {{^}}store_flat_i8_neg_offset:
; CIVI: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}

; GFX9: v_add_co_u32_e64 v{{[0-9]+}}, vcc, -2, s
; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1,
; GFX9: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @store_flat_i8_neg_offset(i8* %fptr, i8 %x) #0 {
  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 -2
  store volatile i8 %x, i8* %fptr.offset
  ret void
}

; CHECK-LABEL: {{^}}load_flat_i8_max_offset:
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc{{$}}
; GFX10: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
define amdgpu_kernel void @load_flat_i8_max_offset(i8* %fptr) #0 {
  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4095
  %val = load volatile i8, i8* %fptr.offset
  ret void
}

; CHECK-LABEL: {{^}}load_flat_i8_max_offset_p1:
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX10: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
define amdgpu_kernel void @load_flat_i8_max_offset_p1(i8* %fptr) #0 {
  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4096
  %val = load volatile i8, i8* %fptr.offset
  ret void
}

; CHECK-LABEL: {{^}}load_flat_i8_neg_offset:
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}

; GFX9: v_add_co_u32_e64 v{{[0-9]+}}, vcc, -2, s
; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1,
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
define amdgpu_kernel void @load_flat_i8_neg_offset(i8* %fptr) #0 {
  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 -2
  %val = load volatile i8, i8* %fptr.offset
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind convergent }