; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,HSA %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=5 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,COV5 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MESA %s

; Exercises the llvm.amdgcn.implicitarg.ptr intrinsic in kernels and in
; callable functions, and checks the kernarg segment size and alignment that
; are reported for every kernel.
;
; Three configurations are run: amdhsa with code object v2 (HSA checks),
; amdhsa with code object v5 (COV5 checks) and mesa3d (MESA checks). Checks
; shared by all three use the GCN prefix.
;
; Attribute sets used below:
;   #0 - no "amdgpu-implicitarg-num-bytes" attribute
;   #1 - "amdgpu-implicitarg-num-bytes"="48"
;   #3 - "amdgpu-implicitarg-num-bytes"="0"

; ---------------------------------------------------------------------------
; Kernels that load through the implicit argument pointer directly.
; ---------------------------------------------------------------------------

; GCN-LABEL: {{^}}kernel_implicitarg_ptr_empty:
; HSA: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 56
; HSA: kernarg_segment_alignment = 4

; MESA: enable_sgpr_kernarg_segment_ptr = 1
; MESA: kernarg_segment_byte_size = 16
; MESA: kernarg_segment_alignment = 4

; HSA: s_load_dword s0, s[4:5], 0x0

; COV5: .amdhsa_kernarg_size 256
define amdgpu_kernel void @kernel_implicitarg_ptr_empty() #0 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; With zero implicit argument bytes requested the HSA run expects no kernarg
; segment pointer and a load through a zeroed register pair.
; GCN-LABEL: {{^}}kernel_implicitarg_ptr_empty_0implicit:
; HSA: enable_sgpr_kernarg_segment_ptr = 0
; HSA: kernarg_segment_byte_size = 0
; HSA: kernarg_segment_alignment = 4

; MESA: enable_sgpr_kernarg_segment_ptr = 1
; MESA: kernarg_segment_byte_size = 16
; MESA: kernarg_segment_alignment = 4

; HSA: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0{{$}}
; HSA: s_load_dword s0, [[NULL]], 0x0

; MESA: s_load_dword s0, s[4:5], 0x0

; COV5: .amdhsa_kernarg_size 0
define amdgpu_kernel void @kernel_implicitarg_ptr_empty_0implicit() #3 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; GCN-LABEL: {{^}}opencl_kernel_implicitarg_ptr_empty:

; HSA: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 48
; HSA: kernarg_segment_alignment = 4

; MESA: enable_sgpr_kernarg_segment_ptr = 1
; MESA: kernarg_segment_byte_size = 16
; MESA: kernarg_segment_alignment = 4

; HSA: s_load_dword s0, s[4:5], 0x0

; COV5: .amdhsa_kernarg_size 48
define amdgpu_kernel void @opencl_kernel_implicitarg_ptr_empty() #1 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Same as above, but with 112 bytes of explicit kernel arguments in front of
; the implicit arguments.
; GCN-LABEL: {{^}}kernel_implicitarg_ptr:

; HSA: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 168
; HSA: kernarg_segment_alignment = 4

; MESA: enable_sgpr_kernarg_segment_ptr = 1
; MESA: kernarg_segment_byte_size = 128
; MESA: kernarg_segment_alignment = 4

; HSA: s_load_dword s0, s[4:5], 0x1c

; COV5: .amdhsa_kernarg_size 368
define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; GCN-LABEL: {{^}}opencl_kernel_implicitarg_ptr:

; HSA: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 160
; HSA: kernarg_segment_alignment = 4

; MESA: enable_sgpr_kernarg_segment_ptr = 1
; MESA: kernarg_segment_byte_size = 128
; MESA: kernarg_segment_alignment = 4

; HSA: s_load_dword s0, s[4:5], 0x1c

; COV5: .amdhsa_kernarg_size 160
define amdgpu_kernel void @opencl_kernel_implicitarg_ptr([112 x i8]) #1 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; ---------------------------------------------------------------------------
; Callable (non-kernel) functions: the checks expect the implicit argument
; pointer to be read from s[8:9].
; ---------------------------------------------------------------------------

; GCN-LABEL: {{^}}func_implicitarg_ptr:
; GCN: s_waitcnt
; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @func_implicitarg_ptr() #0 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr:
; GCN: s_waitcnt
; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @opencl_func_implicitarg_ptr() #0 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; ---------------------------------------------------------------------------
; Kernels that call a function which uses the implicit argument pointer: the
; checks expect s[8:9] to be set up before the call.
; ---------------------------------------------------------------------------

; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func_empty:
; HSA: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 56
; HSA: kernarg_segment_alignment = 4

; MESA: enable_sgpr_kernarg_segment_ptr = 1
; MESA: kernarg_segment_byte_size = 16
; MESA: kernarg_segment_alignment = 4

; GCN: s_mov_b64 s[8:9], s[4:5]
; GCN: s_swappc_b64

; COV5: .amdhsa_kernarg_size 256
define amdgpu_kernel void @kernel_call_implicitarg_ptr_func_empty() #0 {
  call void @func_implicitarg_ptr()
  ret void
}

; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func_empty_implicit0:
; HSA: enable_sgpr_kernarg_segment_ptr = 0
; HSA: kernarg_segment_byte_size = 0
; HSA: kernarg_segment_alignment = 4

; MESA: enable_sgpr_kernarg_segment_ptr = 1
; MESA: kernarg_segment_byte_size = 16
; MESA: kernarg_segment_alignment = 4

; HSA: s_mov_b64 s[8:9], 0{{$}}
; MESA: s_mov_b64 s[8:9], s[4:5]{{$}}
; GCN: s_swappc_b64

; COV5: .amdhsa_kernarg_size 0
define amdgpu_kernel void @kernel_call_implicitarg_ptr_func_empty_implicit0() #3 {
  call void @func_implicitarg_ptr()
  ret void
}

; GCN-LABEL: {{^}}opencl_kernel_call_implicitarg_ptr_func_empty:
; HSA: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 48
; HSA: kernarg_segment_alignment = 4
; MESA: enable_sgpr_kernarg_segment_ptr = 1
; MESA: kernarg_segment_byte_size = 16
; MESA: kernarg_segment_alignment = 4
; GCN: s_mov_b64 s[8:9], s[4:5]
; GCN-NOT: s4
; GCN-NOT: s5
; GCN: s_swappc_b64

; COV5: .amdhsa_kernarg_size 48
define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func_empty() #1 {
  call void @func_implicitarg_ptr()
  ret void
}

; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func:
; HSA: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 168
; HSA: kernarg_segment_alignment = 4

; MESA: enable_sgpr_kernarg_segment_ptr = 1
; MESA: kernarg_segment_byte_size = 128
; MESA: kernarg_segment_alignment = 4

; HSA: s_add_u32 s8, s4, 0x70
; MESA: s_add_u32 s8, s4, 0x70

; GCN: s_addc_u32 s9, s5, 0{{$}}
; GCN: s_swappc_b64

; COV5: .amdhsa_kernarg_size 368
define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 {
  call void @func_implicitarg_ptr()
  ret void
}

; GCN-LABEL: {{^}}opencl_kernel_call_implicitarg_ptr_func:
; HSA: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 160
; HSA: kernarg_segment_alignment = 4
; MESA: enable_sgpr_kernarg_segment_ptr = 1
; MESA: kernarg_segment_byte_size = 128
; MESA: kernarg_segment_alignment = 4

; GCN: s_add_u32 s8, s4, 0x70
; GCN: s_addc_u32 s9, s5, 0{{$}}
; GCN: s_swappc_b64

; COV5: .amdhsa_kernarg_size 160
define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func([112 x i8]) #1 {
  call void @func_implicitarg_ptr()
  ret void
}

; ---------------------------------------------------------------------------
; Functions calling functions: the checks require that s8, s9 and s[8:9] are
; not mentioned before the call.
; ---------------------------------------------------------------------------

; GCN-LABEL: {{^}}func_call_implicitarg_ptr_func:
; GCN-NOT: s8
; GCN-NOT: s9
; GCN-NOT: s[8:9]
; GCN: s_swappc_b64
; GCN: s_setpc_b64 s[30:31]
define void @func_call_implicitarg_ptr_func() #0 {
  call void @func_implicitarg_ptr()
  ret void
}

; GCN-LABEL: {{^}}opencl_func_call_implicitarg_ptr_func:
; GCN-NOT: s8
; GCN-NOT: s9
; GCN-NOT: s[8:9]
; GCN: s_swappc_b64
; GCN: s_setpc_b64 s[30:31]
define void @opencl_func_call_implicitarg_ptr_func() #0 {
  call void @func_implicitarg_ptr()
  ret void
}

; ---------------------------------------------------------------------------
; Functions that use both the kernarg segment pointer and the implicit
; argument pointer: the checks expect one load through a zeroed register pair
; and one load through s[8:9].
; ---------------------------------------------------------------------------

; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr:
; GCN: s_waitcnt
; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; GCN: s_waitcnt lgkmcnt(0)
define void @func_kernarg_implicitarg_ptr() #0 {
  %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
  %cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr
  %load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg
  ret void
}

; GCN-LABEL: {{^}}opencl_func_kernarg_implicitarg_ptr:
; GCN: s_waitcnt
; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; GCN: s_waitcnt lgkmcnt(0)
define void @opencl_func_kernarg_implicitarg_ptr() #0 {
  %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
  %cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr
  %load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg
  ret void
}

; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func:
; GCN: s_add_u32 s8, s4, 0x70
; GCN: s_addc_u32 s9, s5, 0
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 {
  call void @func_kernarg_implicitarg_ptr()
  ret void
}

; Explicit arguments are a <16 x i32> followed by an i32; the checks pin the
; resulting segment size and alignment for each configuration.
; GCN-LABEL: {{^}}kernel_implicitarg_no_struct_align_padding:
; HSA: kernarg_segment_byte_size = 120
; HSA: kernarg_segment_alignment = 6
; MESA: kernarg_segment_byte_size = 84
; MESA: kernarg_segment_alignment = 6

; COV5: .amdhsa_kernarg_size 120
define amdgpu_kernel void @kernel_implicitarg_no_struct_align_padding(<16 x i32>, i32) #1 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; ---------------------------------------------------------------------------
; Per-kernel metadata checked in the code object v2 run.
; ---------------------------------------------------------------------------

; HSA-LABEL: Kernels:
; HSA-LABEL: - Name: kernel_implicitarg_ptr_empty
; HSA: CodeProps:
; HSA: KernargSegmentSize: 56
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: kernel_implicitarg_ptr_empty_0implicit
; HSA: KernargSegmentSize: 0
; HSA: KernargSegmentAlign: 4

; HSA-LABEL: - Name: opencl_kernel_implicitarg_ptr_empty
; HSA: KernargSegmentSize: 48
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: kernel_implicitarg_ptr
; HSA: KernargSegmentSize: 168
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: opencl_kernel_implicitarg_ptr
; HSA: KernargSegmentSize: 160
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: kernel_call_implicitarg_ptr_func_empty
; HSA: KernargSegmentSize: 56
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: kernel_call_implicitarg_ptr_func_empty_implicit0
; HSA: KernargSegmentSize: 0
; HSA: KernargSegmentAlign: 4

; HSA-LABEL: - Name: opencl_kernel_call_implicitarg_ptr_func_empty
; HSA: KernargSegmentSize: 48
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: kernel_call_implicitarg_ptr_func
; HSA: KernargSegmentSize: 168
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: opencl_kernel_call_implicitarg_ptr_func
; HSA: KernargSegmentSize: 160
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: kernel_call_kernarg_implicitarg_ptr_func
; HSA: KernargSegmentSize: 168
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name: kernel_implicitarg_no_struct_align_padding
; HSA: KernargSegmentSize: 120
; HSA: KernargSegmentAlign: 64

; ---------------------------------------------------------------------------
; Per-kernel metadata checked in the code object v5 run. Here the align/size
; checks come before the matching .name label, so each label closes its group.
; ---------------------------------------------------------------------------

; COV5-LABEL: amdhsa.kernels:
; COV5: .kernarg_segment_align: 8
; COV5-NEXT: .kernarg_segment_size: 256
; COV5-LABEL: .name: kernel_implicitarg_ptr_empty

; COV5: .kernarg_segment_align: 4
; COV5-NEXT: .kernarg_segment_size: 0
; COV5-LABEL: .name: kernel_implicitarg_ptr_empty_0implicit

; COV5: .kernarg_segment_align: 8
; COV5-NEXT: .kernarg_segment_size: 48
; COV5-LABEL: .name: opencl_kernel_implicitarg_ptr_empty

; COV5: .kernarg_segment_align: 8
; COV5-NEXT: .kernarg_segment_size: 368
; COV5-LABEL: .name: kernel_implicitarg_ptr

; COV5: .kernarg_segment_align: 8
; COV5-NEXT: .kernarg_segment_size: 160
; COV5-LABEL: .name: opencl_kernel_implicitarg_ptr

; COV5: .kernarg_segment_align: 8
; COV5-NEXT: .kernarg_segment_size: 256
; COV5-LABEL: .name: kernel_call_implicitarg_ptr_func_empty

; COV5: .kernarg_segment_align: 4
; COV5-NEXT: .kernarg_segment_size: 0
; COV5-LABEL: .name: kernel_call_implicitarg_ptr_func_empty_implicit0

; COV5: .kernarg_segment_align: 8
; COV5-NEXT: .kernarg_segment_size: 48
; COV5-LABEL: .name: opencl_kernel_call_implicitarg_ptr_func_empty

; COV5: .kernarg_segment_align: 8
; COV5-NEXT: .kernarg_segment_size: 368
; COV5-LABEL: .name: kernel_call_implicitarg_ptr_func

; COV5: .kernarg_segment_align: 8
; COV5-NEXT: .kernarg_segment_size: 160
; COV5-LABEL: .name: opencl_kernel_call_implicitarg_ptr_func

; COV5: .kernarg_segment_align: 8
; COV5-NEXT: .kernarg_segment_size: 368
; COV5-LABEL: .name: kernel_call_kernarg_implicitarg_ptr_func

; COV5: .kernarg_segment_align: 64
; COV5-NEXT: .kernarg_segment_size: 120
; COV5-LABEL: .name: kernel_implicitarg_no_struct_align_padding

declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #2
declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #2

attributes #0 = { nounwind noinline }
attributes #1 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="48" }
attributes #2 = { nounwind readnone speculatable }
attributes #3 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="0" }