; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,HSA %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MESA %s

; Codegen test for the llvm.amdgcn.implicitarg.ptr intrinsic, used on its own
; and together with llvm.amdgcn.kernarg.segment.ptr, in kernels and in
; callable functions.
;
; Two llc invocations are checked: amdgcn-amd-amdhsa on kaveri with code
; object version 2, and amdgcn-mesa-mesa3d on tahiti. Each has its own check
; prefix plus one prefix shared by both.
;
; Attribute sets (defined at the bottom of the file):
;   #0  nounwind noinline
;   #1  #0 plus "amdgpu-implicitarg-num-bytes"="48"; used by the
;       opencl_kernel_* kernels and kernel_implicitarg_no_struct_align_padding
;   #3  #0 plus "amdgpu-implicitarg-num-bytes"="0"
;
; For the kernels that take no arguments or a single [112 x i8] argument, the
; expected kernarg_segment_byte_size is the explicit argument size plus
; 56 (#0), 48 (#1) or 0 (#3) in the amdhsa invocation, and plus 16 in the
; mesa3d invocation regardless of the attribute set.
; NOTE(review): 56 and 16 look like per-target defaults - confirm.

; No explicit arguments: the amdhsa load through the implicit-argument
; pointer is expected at offset 0 from s[4:5].
; GCN-LABEL: {{^}}kernel_implicitarg_ptr_empty:
; HSA: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 56
; HSA: kernarg_segment_alignment = 4

; MESA: enable_sgpr_kernarg_segment_ptr = 1
; MESA: kernarg_segment_byte_size = 16
; MESA: kernarg_segment_alignment = 4

; HSA: s_load_dword s0, s[4:5], 0x0
define amdgpu_kernel void @kernel_implicitarg_ptr_empty() #0 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Zero implicit bytes requested (#3): the amdhsa invocation is expected to
; disable the kernarg segment pointer, report a size of 0 and load through a
; register pair set to 0; the mesa3d invocation still loads through s[4:5].
; GCN-LABEL: {{^}}kernel_implicitarg_ptr_empty_0implicit:
; HSA: enable_sgpr_kernarg_segment_ptr = 0
; HSA: kernarg_segment_byte_size = 0
; HSA: kernarg_segment_alignment = 4

; MESA: enable_sgpr_kernarg_segment_ptr = 1
; MESA: kernarg_segment_byte_size = 16
; MESA: kernarg_segment_alignment = 4

; HSA: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0{{$}}
; HSA: s_load_dword s0, [[NULL]], 0x0

; MESA: s_load_dword s0, s[4:5], 0x0
define amdgpu_kernel void @kernel_implicitarg_ptr_empty_0implicit() #3 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Same body as kernel_implicitarg_ptr_empty but with 48 implicit bytes
; requested (#1); only the expected amdhsa segment size differs (48).
; GCN-LABEL: {{^}}opencl_kernel_implicitarg_ptr_empty:
; GCN: enable_sgpr_kernarg_segment_ptr = 1

; HSA: kernarg_segment_byte_size = 48
; HSA: kernarg_segment_alignment = 4

; MESA: kernarg_segment_byte_size = 16
; MESA: kernarg_segment_alignment = 4

; HSA: s_load_dword s0, s[4:5], 0x0
define amdgpu_kernel void @opencl_kernel_implicitarg_ptr_empty() #1 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; 112 bytes of explicit argument: expected sizes are 168 = 112 + 56 and
; 128 = 112 + 16. The expected load offset 0x1c is 28 = 112 / 4.
; NOTE(review): presumably the offset is dword-scaled on kaveri - confirm.
; GCN-LABEL: {{^}}kernel_implicitarg_ptr:
; GCN: enable_sgpr_kernarg_segment_ptr = 1

; HSA: kernarg_segment_byte_size = 168
; HSA: kernarg_segment_alignment = 4

; MESA: kernarg_segment_byte_size = 128
; MESA: kernarg_segment_alignment = 4

; HSA: s_load_dword s0, s[4:5], 0x1c
define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; As kernel_implicitarg_ptr, with 48 implicit bytes requested (#1):
; expected amdhsa size is 160 = 112 + 48.
; GCN-LABEL: {{^}}opencl_kernel_implicitarg_ptr:
; GCN: enable_sgpr_kernarg_segment_ptr = 1

; HSA: kernarg_segment_byte_size = 160
; HSA: kernarg_segment_alignment = 4

; MESA: kernarg_segment_byte_size = 128
; MESA: kernarg_segment_alignment = 4

; HSA: s_load_dword s0, s[4:5], 0x1c
define amdgpu_kernel void @opencl_kernel_implicitarg_ptr([112 x i8]) #1 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Callable (non-kernel) function: the load is expected to go through s[8:9],
; followed directly by a wait and the return.
; GCN-LABEL: {{^}}func_implicitarg_ptr:
; GCN: s_waitcnt
; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @func_implicitarg_ptr() #0 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Same body and same expectations as func_implicitarg_ptr (this function
; also uses attribute set #0).
; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr:
; GCN: s_waitcnt
; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @opencl_func_implicitarg_ptr() #0 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Kernel with no explicit arguments calling func_implicitarg_ptr: s[4:5] is
; expected to be copied into s[8:9] before the call.
; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func_empty:
; HSA: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 56
; HSA: kernarg_segment_alignment = 4

; MESA: enable_sgpr_kernarg_segment_ptr = 1
; MESA: kernarg_segment_byte_size = 16
; MESA: kernarg_segment_alignment = 4

; GCN: s_mov_b64 s[8:9], s[4:5]
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_implicitarg_ptr_func_empty() #0 {
  call void @func_implicitarg_ptr()
  ret void
}

; Zero implicit bytes requested (#3): the amdhsa invocation is expected to
; pass 0 in s[8:9]; the mesa3d invocation still copies s[4:5].
; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func_empty_implicit0:
; HSA: enable_sgpr_kernarg_segment_ptr = 0
; HSA: kernarg_segment_byte_size = 0
; HSA: kernarg_segment_alignment = 4

; MESA: enable_sgpr_kernarg_segment_ptr = 1
; MESA: kernarg_segment_byte_size = 16
; MESA: kernarg_segment_alignment = 4

; HSA: s_mov_b64 s[8:9], 0{{$}}
; MESA: s_mov_b64 s[8:9], s[4:5]{{$}}
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_implicitarg_ptr_func_empty_implicit0() #3 {
  call void @func_implicitarg_ptr()
  ret void
}

; As kernel_call_implicitarg_ptr_func_empty with 48 implicit bytes requested
; (#1); additionally s4 and s5 must not appear between the copy and the call.
; GCN-LABEL: {{^}}opencl_kernel_call_implicitarg_ptr_func_empty:
; GCN: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 48
; HSA: kernarg_segment_alignment = 4
; MESA: kernarg_segment_byte_size = 16
; GCN: s_mov_b64 s[8:9], s[4:5]
; GCN-NOT: s4
; GCN-NOT: s5
; GCN: s_swappc_b64
define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func_empty() #1 {
  call void @func_implicitarg_ptr()
  ret void
}

; 112 bytes of explicit argument: s[8:9] is expected to be s[4:5] + 0x70
; (0x70 = 112), computed with an add / add-with-carry pair before the call.
; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func:
; GCN: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 168
; HSA: kernarg_segment_alignment = 4

; MESA: kernarg_segment_byte_size = 128
; MESA: kernarg_segment_alignment = 4

; HSA: s_add_u32 s8, s4, 0x70
; MESA: s_add_u32 s8, s4, 0x70

; GCN: s_addc_u32 s9, s5, 0{{$}}
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 {
  call void @func_implicitarg_ptr()
  ret void
}

; As kernel_call_implicitarg_ptr_func with 48 implicit bytes requested (#1):
; expected amdhsa size is 160 = 112 + 48; the pointer adjustment is the same.
; GCN-LABEL: {{^}}opencl_kernel_call_implicitarg_ptr_func:
; GCN: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 160
; HSA: kernarg_segment_alignment = 4
; MESA: kernarg_segment_byte_size = 128
; MESA: kernarg_segment_alignment = 4

; GCN: s_add_u32 s8, s4, 0x70
; GCN: s_addc_u32 s9, s5, 0{{$}}
; GCN: s_swappc_b64
define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func([112 x i8]) #1 {
  call void @func_implicitarg_ptr()
  ret void
}

; Function calling func_implicitarg_ptr: s8, s9 and s[8:9] must not appear in
; the output after the label.
; NOTE(review): presumably the incoming pair is forwarded untouched - confirm.
; GCN-LABEL: {{^}}func_call_implicitarg_ptr_func:
; GCN-NOT: s8
; GCN-NOT: s9
; GCN-NOT: s[8:9]
define void @func_call_implicitarg_ptr_func() #0 {
  call void @func_implicitarg_ptr()
  ret void
}

; Same body and same expectations as func_call_implicitarg_ptr_func.
; GCN-LABEL: {{^}}opencl_func_call_implicitarg_ptr_func:
; GCN-NOT: s8
; GCN-NOT: s9
; GCN-NOT: s[8:9]
define void @opencl_func_call_implicitarg_ptr_func() #0 {
  call void @func_implicitarg_ptr()
  ret void
}

; Function reading both pointers: a load through a register pair set to 0 is
; expected alongside the load through s[8:9]. In the IR the first volatile
; load uses the kernarg segment pointer and the second the implicit-argument
; pointer.
; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr:
; GCN: s_waitcnt
; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; GCN: s_waitcnt lgkmcnt(0)
define void @func_kernarg_implicitarg_ptr() #0 {
  %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
  %cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr
  %load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg
  ret void
}

; Same body and same expectations as func_kernarg_implicitarg_ptr.
; GCN-LABEL: {{^}}opencl_func_kernarg_implicitarg_ptr:
; GCN: s_waitcnt
; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; GCN: s_waitcnt lgkmcnt(0)
define void @opencl_func_kernarg_implicitarg_ptr() #0 {
  %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
  %cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr
  %load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg
  ret void
}

; Kernel with 112 bytes of explicit argument calling the function that reads
; both pointers: s[8:9] is expected to be s[4:5] + 0x70 (0x70 = 112).
; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func:
; GCN: s_add_u32 s8, s4, 0x70
; GCN: s_addc_u32 s9, s5, 0
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 {
  call void @func_kernarg_implicitarg_ptr()
  ret void
}

; Explicit arguments are <16 x i32> followed by i32 (64 + 4 = 68 bytes) with
; 48 implicit bytes requested (#1). Expected sizes are 120 for amdhsa and
; 84 = 68 + 16 for mesa3d; expected alignment field is 6 in both.
; NOTE(review): 120 looks like 68 rounded up to 72, plus 48 - confirm.
; GCN-LABEL: {{^}}kernel_implicitarg_no_struct_align_padding:
; HSA: kernarg_segment_byte_size = 120
; MESA: kernarg_segment_byte_size = 84
; GCN: kernarg_segment_alignment = 6
define amdgpu_kernel void @kernel_implicitarg_no_struct_align_padding(<16 x i32>, i32) #1 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}


; Checks on the kernel metadata emitted by the amdhsa invocation: one
; KernargSegmentSize / KernargSegmentAlign pair per kernel, listed in the
; order the kernels are defined above.
; HSA-LABEL: Kernels:
; HSA-LABEL: - Name: kernel_implicitarg_ptr_empty
; HSA: CodeProps:
; HSA: KernargSegmentSize: 56
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: kernel_implicitarg_ptr_empty_0implicit
; HSA: KernargSegmentSize: 0
; HSA: KernargSegmentAlign: 4

; HSA-LABEL: - Name: opencl_kernel_implicitarg_ptr_empty
; HSA: KernargSegmentSize: 48
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: kernel_implicitarg_ptr
; HSA: KernargSegmentSize: 168
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: opencl_kernel_implicitarg_ptr
; HSA: KernargSegmentSize: 160
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: kernel_call_implicitarg_ptr_func_empty
; HSA: KernargSegmentSize: 56
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: kernel_call_implicitarg_ptr_func_empty_implicit0
; HSA: KernargSegmentSize: 0
; HSA: KernargSegmentAlign: 4

; HSA-LABEL: - Name: opencl_kernel_call_implicitarg_ptr_func_empty
; HSA: KernargSegmentSize: 48
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: kernel_call_implicitarg_ptr_func
; HSA: KernargSegmentSize: 168
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: opencl_kernel_call_implicitarg_ptr_func
; HSA: KernargSegmentSize: 160
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: kernel_call_kernarg_implicitarg_ptr_func
; HSA: KernargSegmentSize: 168
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: kernel_implicitarg_no_struct_align_padding
; HSA: KernargSegmentSize: 120
; HSA: KernargSegmentAlign: 64

declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #2
declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #2

attributes #0 = { nounwind noinline }
attributes #1 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="48" }
attributes #2 = { nounwind readnone speculatable }
attributes #3 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="0" }