; Kernel-argument lowering test for the AMDGPU backend.
; Verifies, for scalar/vector args of i1/i8/i16/i32/i64/f32/f64 and vectors up
; to <16 x i32>, that:
;   - HSA (amdhsa) kernels report the expected kernarg segment size/alignment,
;   - args are fetched with s_load_dword* at the correct kernarg offsets
;     (SI mesa 0x9+, VI mesa 0x24+, HSA 0x0+ from s[4:5]),
;   - sub-dword args are masked (s_and_b32) or sign-extended (s_sext_i32_*),
;   - no buffer/flat/global loads are emitted for args that fit in SGPRs.
; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=EG --check-prefix=FUNC %s
; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -enable-var-scope --check-prefix=EG --check-prefix=FUNC %s

; FUNC-LABEL: {{^}}i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff


define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
entry:
  %0 = zext i8 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i8_zext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
entry:
  %0 = zext i8 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i8_sext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb

; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]]
; HSA-VI: flat_store_dword
define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
entry:
  %0 = sext i8 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb

; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
entry:
  %0 = zext i16 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_zext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
entry:
  %0 = zext i16 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_sext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c


; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]]
; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
entry:
  %0 = sext i16 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i32_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8
define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
entry:
  store i32 %in, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}f32_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
entry:
  store float %in, float addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v2i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: VTX_READ_8
; EG: VTX_READ_8

; GCN: s_load_dword s
; GCN-NOT: {{buffer|flat|global}}_load_
define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
entry:
  store <2 x i8> %in, <2 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v2i16_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: VTX_READ_16
; EG: VTX_READ_16

; SI: s_load_dword s{{[0-9]+}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
entry:
  store <2 x i16> %in, <2 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v2i32_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
entry:
  store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v2f32_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8
define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
entry:
  store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42

; GCN: s_load_dword s
; GCN-NOT: {{buffer|flat|global}}_load_
define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
entry:
  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i16_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48

; SI: s_load_dword s
; SI: s_load_dword s

; NOTE(review): "VI-HSA"/"VI-MESA" are not declared in any RUN line above
; (the declared prefixes are HSA-VI/MESA-VI), so the two checks below are
; dead — confirm intent before renaming them, as that would tighten the test.
; VI-HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
; VI-MESA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
entry:
  store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
  ret void
}
; FUNC-LABEL: {{^}}v3i32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
entry:
  store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3f32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
entry:
  store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v4i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8

; GCN: s_load_dword s
; GCN-NOT: {{buffer|flat|global}}_load_
define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
entry:
  store <4 x i8> %in, <4 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v4i16_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16

; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xc
; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9

; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
entry:
  store <4 x i16> %in, <4 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v4i32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X

; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v4f32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
entry:
  store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
  ret void
}

; FIXME: Lots of unpack and re-pack junk on VI
; FUNC-LABEL: {{^}}v8i8_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8


; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dwordx2 s
; SI-NOT: {{buffer|flat|global}}_load

; VI: s_load_dword s
; VI: s_load_dword s

; VI: v_lshlrev_b16
; VI: v_or_b32_e32
; VI: v_or_b32_sdwa
; VI: v_or_b32_sdwa
; VI: v_lshlrev_b16
; VI: s_lshr_b32
; VI: v_or_b32_sdwa
; VI: v_or_b32_sdwa
define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
entry:
  store <8 x i8> %in, <8 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v8i16_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16

; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dwordx2
; SI-NOT: {{buffer|flat|global}}_load


; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x3c

; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x18
define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
entry:
  store <8 x i16> %in, <8 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v8i32_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
entry:
  store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v8f32_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
entry:
  store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
  ret void
}

; FIXME: Pack/repack on VI

; FUNC-LABEL: {{^}}v16i8_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8

; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dwordx2
; SI-NOT: {{buffer|flat|global}}_load


; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s

; VI: s_lshr_b32
; VI: v_lshlrev_b16
; VI: s_lshr_b32
; VI: s_lshr_b32
; VI: v_or_b32_sdwa
; VI: v_or_b32_sdwa
; VI: v_lshlrev_b16
; VI: v_lshlrev_b16
; VI: v_or_b32_sdwa
; VI: v_or_b32_sdwa
; VI: v_lshlrev_b16
; VI: v_lshlrev_b16
; VI: v_or_b32_sdwa
; VI: v_or_b32_sdwa
define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
entry:
  store <16 x i8> %in, <16 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v16i16_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16

; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16

; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s

; SI-NOT: {{buffer|flat|global}}_load


; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x4c
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x54
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x5c

; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x28
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x38
define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
entry:
  store <16 x i16> %in, <16 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v16i32_arg:
; HSA-VI: kernarg_segment_byte_size = 128
; HSA-VI: kernarg_segment_alignment = 6
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
entry:
  store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v16f32_arg:
; HSA-VI: kernarg_segment_byte_size = 128
; HSA-VI: kernarg_segment_alignment = 6
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
entry:
  store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}kernel_arg_i64:
; MESA-GCN: s_load_dwordx2
; MESA-GCN: s_load_dwordx2
; MESA-GCN: buffer_store_dwordx2
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
  store i64 %a, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}f64_kernel_arg:
; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb
; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c
; MESA-GCN: buffer_store_dwordx2
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) {
entry:
  store double %in, double addrspace(1)* %out
  ret void
}

; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
; XGCN: s_load_dwordx2
; XGCN: s_load_dwordx2
; XGCN: buffer_store_dwordx2
; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
;   ret void
; }

; FUNC-LABEL: {{^}}i1_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword s
; GCN: s_and_b32
; GCN: {{buffer|flat}}_store_byte
define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
  store i1 %x, i1 addrspace(1)* %out, align 1
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_zext_i32:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword
; NOTE(review): "SGCN" is not declared in any RUN line (likely a typo for
; "GCN"), so the check below is dead — confirm before renaming it.
; SGCN: buffer_store_dword
define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_zext_i64:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword s
; GCN: {{buffer|flat}}_store_dwordx2
define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_sext_i32:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword
; GCN: {{buffer|flat}}_store_dword
define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i32
  ; Fixed: was "i32addrspace(1)*" (missing space), which is invalid IR.
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_sext_i64:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword
; GCN: s_bfe_i64
; GCN: {{buffer|flat}}_store_dwordx2
define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}