; Test lowering of AMDGPU kernel arguments (scalar, vector, i1, i64, f64)
; across SI, VI (Mesa and HSA ABIs), and R600 (Evergreen/Cayman).
; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=EG --check-prefix=FUNC %s
; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -enable-var-scope --check-prefix=EG --check-prefix=FUNC %s

; FUNC-LABEL: {{^}}i8_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}

define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
entry:
  %0 = zext i8 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i8_zext_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}

define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
entry:
  %0 = zext i8 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i8_sext_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_sbyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}

define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
entry:
  %0 = sext i8 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; The zext of an i16 argument masks with 0xffff (the old 0xff check only
; matched as a substring of 0xffff).
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff
; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}

define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
entry:
  %0 = zext i16 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_zext_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}

define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
entry:
  %0 = zext i16 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_sext_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_sshort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}

define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
entry:
  %0 = sext i16 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i32_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8
define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
entry:
  store i32 %in, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}f32_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
entry:
  store float %in, float addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v2i8_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
entry:
  store <2 x i8> %in, <2 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v2i16_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_16
; EG: VTX_READ_16

; SI: buffer_load_ushort
; SI: buffer_load_ushort

; VI: s_load_dword s
define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
entry:
  store <2 x i16> %in, <2 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v2i32_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
entry:
  store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v2f32_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8
define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
entry:
  store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i8_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
entry:
  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i16_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
entry:
  store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i32_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
entry:
  store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3f32_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
entry:
  store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v4i8_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
entry:
  store <4 x i8> %in, <4 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v4i16_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16

; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort

; VI: s_load_dword s
; VI: s_load_dword s
define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
entry:
  store <4 x i16> %in, <4 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v4i32_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X

; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v4f32_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
entry:
  store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v8i8_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; The old HSA-GCN prefix was never enabled by any RUN line, and
; "float_load_ubyte" is not a real instruction; check flat_load_ubyte
; under HSA-VI, consistent with the other vector-of-i8 tests.
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
entry:
  store <8 x i8> %in, <8 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v8i16_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16

; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort

; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
entry:
  store <8 x i16> %in, <8 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v8i32_arg:
; HSA-VI: kernarg_segment_alignment = 5
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
entry:
  store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v8f32_arg:
; HSA-VI: kernarg_segment_alignment = 5
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
entry:
  store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v16i8_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
entry:
  store <16 x i8> %in, <16 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v16i16_arg:
; HSA-VI: kernarg_segment_alignment = 5
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16

; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort

; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
entry:
  store <16 x i16> %in, <16 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v16i32_arg:
; HSA-VI: kernarg_segment_alignment = 6
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
entry:
  store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v16f32_arg:
; HSA-VI: kernarg_segment_alignment = 6
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
entry:
  store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}kernel_arg_i64:
; MESA-GCN: s_load_dwordx2
; MESA-GCN: s_load_dwordx2
; MESA-GCN: buffer_store_dwordx2
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
  store i64 %a, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}f64_kernel_arg:
; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb
; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c
; MESA-GCN: buffer_store_dwordx2
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) {
entry:
  store double %in, double addrspace(1)* %out
  ret void
}

; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
; XGCN: s_load_dwordx2
; XGCN: s_load_dwordx2
; XGCN: buffer_store_dwordx2
; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
;   ret void
; }

; FUNC-LABEL: {{^}}i1_arg:
; SI: buffer_load_ubyte
; SI: v_and_b32_e32
; SI: buffer_store_byte
; SI: s_endpgm
define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
  store i1 %x, i1 addrspace(1)* %out, align 1
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_zext_i32:
; SI: buffer_load_ubyte
; SI: buffer_store_dword
; SI: s_endpgm
define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_zext_i64:
; SI: buffer_load_ubyte
; SI: buffer_store_dwordx2
; SI: s_endpgm
define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_sext_i32:
; SI: buffer_load_ubyte
; SI: buffer_store_dword
; SI: s_endpgm
define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_sext_i64:
; SI: buffer_load_ubyte
; SI: v_bfe_i32
; SI: v_ashrrev_i32
; SI: buffer_store_dwordx2
; SI: s_endpgm
define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}