; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefix=EG --check-prefix=FUNC %s
; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefix=EG --check-prefix=FUNC %s

; Tests lowering of kernel arguments (scalar, vector, struct, array) for
; AMDGPU: kernarg segment sizes, scalar-load offsets, and extension code.

; FUNC-LABEL: {{^}}i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
  %ext = zext i8 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i8_zext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
  %ext = zext i8 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i8_sext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb

; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]]
; HSA-VI: flat_store_dword
define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
  %ext = sext i8 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb

; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; The zext of a 16-bit arg masks with 0xffff (0xff would only under-match).
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
  %ext = zext i16 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_zext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
  %ext = zext i16 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_sext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]]
; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
  %ext = sext i16 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i32_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8
define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
entry:
  store i32 %in, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}f32_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
entry:
  store float %in, float addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v2i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: VTX_READ_8
; EG: VTX_READ_8

; GCN: s_load_dword s
; GCN-NOT: {{buffer|flat|global}}_load_
define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
entry:
  store <2 x i8> %in, <2 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v2i16_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: VTX_READ_16
; EG: VTX_READ_16

; SI: s_load_dword s{{[0-9]+}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
entry:
  store <2 x i16> %in, <2 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v2i32_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
entry:
  store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v2f32_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8
define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
entry:
  store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42

; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb

; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
entry:
  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i16_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48

; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb

; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
entry:
  store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
entry:
  store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3f32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
entry:
  store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v4i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8

; GCN-DAG: s_load_dwordx2 s
; GCN-DAG: s_load_dword s
define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
entry:
  store <4 x i8> %in, <4 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v4i16_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16

; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb
; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9

; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24
; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x2c

; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
entry:
  store <4 x i16> %in, <4 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v4i32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X

; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v4f32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
entry:
  store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
  ret void
}

; FIXME: Lots of unpack and re-pack junk on VI
; FUNC-LABEL: {{^}}v8i8_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8

; SI-NOT: {{buffer|flat|global}}_load
; SI: s_load_dwordx2 s
; SI-NEXT: s_load_dwordx2 s
; SI-NOT: {{buffer|flat|global}}_load

; VI: s_load_dwordx2 s
; VI-NEXT: s_load_dwordx2 s
; VI-NOT: lshl
; VI-NOT: _or
; VI-NOT: _sdwa
define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
entry:
  store <8 x i8> %in, <8 x i8> addrspace(1)* %out
  ret void
}
; FUNC-LABEL: {{^}}v8i16_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16

; SI: s_load_dwordx4
; SI-NEXT: s_load_dwordx2
; SI-NOT: {{buffer|flat|global}}_load

; MESA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34

; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
entry:
  store <8 x i16> %in, <8 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v8i32_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X

; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
entry:
  store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v8f32_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
entry:
  store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
  ret void
}

; FIXME: Pack/repack on VI

; FUNC-LABEL: {{^}}v16i8_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8

; SI: s_load_dwordx4 s
; SI-NEXT: s_load_dwordx2 s
; SI-NOT: {{buffer|flat|global}}_load

; VI: s_load_dwordx4 s
; VI-NOT: shr
; VI-NOT: shl
; VI-NOT: _sdwa
; VI-NOT: _or_
define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
entry:
  store <16 x i8> %in, <16 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v16i16_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16

; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16

; SI: s_load_dwordx8 s
; SI-NEXT: s_load_dwordx2 s
; SI-NOT: {{buffer|flat|global}}_load

; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44

; HSA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
entry:
  store <16 x i16> %in, <16 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v16i32_arg:
; HSA-VI: kernarg_segment_byte_size = 128
; HSA-VI: kernarg_segment_alignment = 6
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
entry:
  store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v16f32_arg:
; HSA-VI: kernarg_segment_byte_size = 128
; HSA-VI: kernarg_segment_alignment = 6
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
entry:
  store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}kernel_arg_i64:
; MESA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x24
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0

; MESA-GCN: buffer_store_dwordx2
define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
  store i64 %a, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}f64_kernel_arg:
; SI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
; MESA-VI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
; MESA-GCN: buffer_store_dwordx2

; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) {
entry:
  store double %in, double addrspace(1)* %out
  ret void
}

; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
; XGCN: s_load_dwordx2
; XGCN: s_load_dwordx2
; XGCN: buffer_store_dwordx2
; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
;   ret void
; }

; FUNC-LABEL: {{^}}i65_arg:
; HSA-VI: kernarg_segment_byte_size = 24
; HSA-VI: kernarg_segment_alignment = 4
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind {
entry:
  store i65 %in, i65 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword s
; GCN: s_and_b32
; GCN: {{buffer|flat}}_store_byte
define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
  store i1 %x, i1 addrspace(1)* %out, align 1
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_zext_i32:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword
; GCN: buffer_store_dword
define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_zext_i64:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword s
; GCN: {{buffer|flat}}_store_dwordx2
define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_sext_i32:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword
; GCN: {{buffer|flat}}_store_dword
define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_sext_i64:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword
; GCN: s_bfe_i64
; GCN: {{buffer|flat}}_store_dwordx2
define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}empty_struct_arg:
; HSA-VI: kernarg_segment_byte_size = 0
define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
  ret void
}

; The correct load offsets for these:
; load 4 from 0,
; load 8 from 8
; load 4 from 24
; load 8 from 32

; With the SelectionDAG argument lowering, the alignments for the
; struct members are not properly considered, making these wrong.

; FIXME: Total argument size is computed wrong
; FUNC-LABEL: {{^}}struct_argument_alignment:
; HSA-VI: kernarg_segment_byte_size = 40
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
  %val0 = extractvalue {i32, i64} %arg0, 0
  %val1 = extractvalue {i32, i64} %arg0, 1
  %val2 = extractvalue {i32, i64} %arg1, 0
  %val3 = extractvalue {i32, i64} %arg1, 1
  store volatile i32 %val0, i32 addrspace(1)* null
  store volatile i64 %val1, i64 addrspace(1)* null
  store volatile i32 %val2, i32 addrspace(1)* null
  store volatile i64 %val3, i64 addrspace(1)* null
  ret void
}

; No padding between i8 and next struct, but round up at end to 4 byte
; multiple.
; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
; HSA-VI: kernarg_segment_byte_size = 28
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
  %val0 = extractvalue <{i32, i64}> %arg0, 0
  %val1 = extractvalue <{i32, i64}> %arg0, 1
  %val2 = extractvalue <{i32, i64}> %arg1, 0
  %val3 = extractvalue <{i32, i64}> %arg1, 1
  store volatile i32 %val0, i32 addrspace(1)* null
  store volatile i64 %val1, i64 addrspace(1)* null
  store volatile i32 %val2, i32 addrspace(1)* null
  store volatile i64 %val3, i64 addrspace(1)* null
  ret void
}

; GCN-LABEL: {{^}}struct_argument_alignment_after:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30
define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
  %val0 = extractvalue {i32, i64} %arg0, 0
  %val1 = extractvalue {i32, i64} %arg0, 1
  %val2 = extractvalue {i32, i64} %arg2, 0
  %val3 = extractvalue {i32, i64} %arg2, 1
  store volatile i32 %val0, i32 addrspace(1)* null
  store volatile i64 %val1, i64 addrspace(1)* null
  store volatile i32 %val2, i32 addrspace(1)* null
  store volatile i64 %val3, i64 addrspace(1)* null
  store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null
  ret void
}

; GCN-LABEL: {{^}}array_3xi32:
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
  store volatile i16 %arg0, i16 addrspace(1)* undef
  store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef
  ret void
}

; FIXME: Why not all scalar loads?
; GCN-LABEL: {{^}}array_3xi16:
; HSA-VI: s_add_u32 s{{[0-9]+}}, s4, 2
; HSA-VI: s_addc_u32 s{{[0-9]+}}, s5, 0
; HSA-VI: flat_load_ushort
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
  store volatile i8 %arg0, i8 addrspace(1)* undef
  store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
  ret void
}