; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=EG --check-prefix=FUNC %s
; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -enable-var-scope --check-prefix=EG --check-prefix=FUNC %s

; Kernel argument tests. Each kernel stores (or extends and then stores) its
; second argument to %out. The checks pin how that argument is fetched and,
; for the amdhsa run, the reported kernarg segment size and alignment.
; Only the prefixes named on the RUN lines above are live; any other spelling
; is silently ignored by FileCheck.

; FUNC-LABEL: {{^}}i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff


define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
entry:
  %0 = zext i8 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i8_zext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
entry:
  %0 = zext i8 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i8_sext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb

; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]]
; HSA-VI: flat_store_dword
define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
entry:
  %0 = sext i8 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; The zero-extension of a 16-bit argument must mask with the full 0xffff on
; every GCN run, so the mask is anchored at end of line for both ABIs.
; FUNC-LABEL: {{^}}i16_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb

; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
entry:
  %0 = zext i16 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_zext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
entry:
  %0 = zext i16 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_sext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c


; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]]
; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
entry:
  %0 = sext i16 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i32_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8
define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
entry:
  store i32 %in, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}f32_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
entry:
  store float %in, float addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v2i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: VTX_READ_8
; EG: VTX_READ_8

; GCN: s_load_dword s
; GCN-NOT: {{buffer|flat|global}}_load_
define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
entry:
  store <2 x i8> %in, <2 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v2i16_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: VTX_READ_16
; EG: VTX_READ_16

; SI: s_load_dword s{{[0-9]+}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
entry:
  store <2 x i16> %in, <2 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v2i32_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
entry:
  store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v2f32_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8
define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
entry:
  store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42

; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb

; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
entry:
  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i16_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48

; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb

; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
entry:
  store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
entry:
  store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3f32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
entry:
  store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v4i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8

; GCN-DAG: s_load_dwordx2 s
; GCN-DAG: s_load_dword s
define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
entry:
  store <4 x i8> %in, <4 x i8> addrspace(1)* %out
  ret void
}

; Each DAG group lists one directive per expected load (pointer and value);
; repeating a directive would demand an additional matching instruction.
; FUNC-LABEL: {{^}}v4i16_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16

; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb
; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9

; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24
; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x2c

; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
entry:
  store <4 x i16> %in, <4 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v4i32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X

; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v4f32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
entry:
  store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
  ret void
}

; FIXME: Lots of unpack and re-pack junk on VI
; FUNC-LABEL: {{^}}v8i8_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8

; SI-NOT: {{buffer|flat|global}}_load
; SI: s_load_dwordx2 s
; SI-NEXT: s_load_dwordx2 s
; SI-NOT: {{buffer|flat|global}}_load

; VI: s_load_dwordx2 s
; VI-NEXT: s_load_dwordx2 s
; VI-NOT: lshl
; VI-NOT: _or
; VI-NOT: _sdwa
define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
entry:
  store <8 x i8> %in, <8 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v8i16_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16

; SI: s_load_dwordx4
; SI-NEXT: s_load_dwordx2
; SI-NOT: {{buffer|flat|global}}_load


; MESA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34

; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
entry:
  store <8 x i16> %in, <8 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v8i32_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X

; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
entry:
  store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
  ret void
}

; Float twin of v8i32_arg; checked with the same three load patterns so the
; VI runs are covered as well.
; FUNC-LABEL: {{^}}v8f32_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
entry:
  store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
  ret void
}

; FIXME: Pack/repack on VI

; FUNC-LABEL: {{^}}v16i8_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8

; SI: s_load_dwordx4 s
; SI-NEXT: s_load_dwordx2 s
; SI-NOT: {{buffer|flat|global}}_load


; VI: s_load_dwordx4 s
; VI-NOT: shr
; VI-NOT: shl
; VI-NOT: _sdwa
; VI-NOT: _or_
define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
entry:
  store <16 x i8> %in, <16 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v16i16_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16

; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16

; SI: s_load_dwordx8 s
; SI-NEXT: s_load_dwordx2 s
; SI-NOT: {{buffer|flat|global}}_load


; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44

; HSA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
entry:
  store <16 x i16> %in, <16 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v16i32_arg:
; HSA-VI: kernarg_segment_byte_size = 128
; HSA-VI: kernarg_segment_alignment = 6
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
entry:
  store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v16f32_arg:
; HSA-VI: kernarg_segment_byte_size = 128
; HSA-VI: kernarg_segment_alignment = 6
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
entry:
  store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}kernel_arg_i64:
; MESA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x24
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0

; MESA-GCN: buffer_store_dwordx2
define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
  store i64 %a, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}f64_kernel_arg:
; SI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
; MESA-VI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
; MESA-GCN: buffer_store_dwordx2

; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) {
entry:
  store double %in, double addrspace(1)* %out
  ret void
}

; Disabled case: the X-prefixed directives below match no prefix on the RUN
; lines and the kernel itself is commented out.
; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
; XGCN: s_load_dwordx2
; XGCN: s_load_dwordx2
; XGCN: buffer_store_dwordx2
; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
;   ret void
; }

; FUNC-LABEL: {{^}}i1_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword s
; GCN: s_and_b32
; GCN: {{buffer|flat}}_store_byte
define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
  store i1 %x, i1 addrspace(1)* %out, align 1
  ret void
}

; The store is matched as either buffer_ or flat_ because the GCN prefix is
; shared by the mesa and amdhsa runs.
; FUNC-LABEL: {{^}}i1_arg_zext_i32:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword
; GCN: {{buffer|flat}}_store_dword
define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_zext_i64:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword s
; GCN: {{buffer|flat}}_store_dwordx2
define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_sext_i32:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword
; GCN: {{buffer|flat}}_store_dword
define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i32
  ; Pointer type written as "i32 addrspace(1)*", like every other store here.
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_sext_i64:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword
; GCN: s_bfe_i64
; GCN: {{buffer|flat}}_store_dwordx2
define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}