; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=EG --check-prefix=FUNC %s
; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -enable-var-scope --check-prefix=EG --check-prefix=FUNC %s
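
; Summary of what the checks below exercise: kernel arguments of scalar and
; vector types are read from the KC0 constant buffer (or via VTX_READ) on
; r600, loaded from the kernarg segment with s_load/buffer_load on Mesa GCN,
; and, for sub-dword types on amdhsa VI, still loaded through flat
; instructions (see the FIXME lines below).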

; FUNC-LABEL: {{^}}i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}

define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
entry:
  %0 = zext i8 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i8_zext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}

define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
entry:
  %0 = zext i8 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i8_sext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_sbyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}

define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
entry:
  %0 = sext i8 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff
; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}

define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
entry:
  %0 = zext i16 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_zext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}

define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
entry:
  %0 = zext i16 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_sext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_sshort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}

define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
entry:
  %0 = sext i16 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i32_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8
define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
entry:
  store i32 %in, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}f32_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
entry:
  store float %in, float addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v2i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: VTX_READ_8
; EG: VTX_READ_8

; SI: buffer_load_ubyte
; SI: buffer_load_ubyte

; HSA-VI: flat_load_ushort
define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
entry:
  store <2 x i8> %in, <2 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v2i16_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: VTX_READ_16
; EG: VTX_READ_16

; SI: s_load_dword s{{[0-9]+}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
entry:
  store <2 x i16> %in, <2 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v2i32_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
entry:
  store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v2f32_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8
define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
entry:
  store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte

; MESA-VI: buffer_load_ushort
; MESA-VI: buffer_load_ubyte

; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ubyte
define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
entry:
  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i16_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48

; GCN-DAG: s_load_dword s
; GCN-DAG: {{buffer|flat}}_load_ushort
define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
entry:
  store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
entry:
  store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3f32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
entry:
  store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v4i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8

; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte

; VI: s_load_dword s
define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
entry:
  store <4 x i8> %in, <4 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v4i16_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16

; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb
; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9

; MESA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; MESA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x30

; HSA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x8
; HSA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xc
define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
entry:
  store <4 x i16> %in, <4 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v4i32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X

; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v4f32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
entry:
  store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v8i8_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8

; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte

; VI: s_load_dwordx2
; VI: s_load_dwordx2
define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
entry:
  store <8 x i8> %in, <8 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v8i16_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16

; SI: s_load_dwordx2
; SI: s_load_dwordx2
; SI: s_load_dwordx2

; VI: s_load_dwordx2
; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
entry:
  store <8 x i16> %in, <8 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v8i32_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
entry:
  store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v8f32_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
entry:
  store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v16i8_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8

; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte

; VI: s_load_dwordx2
; VI: s_load_dwordx2
; VI: s_load_dwordx2
define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
entry:
  store <16 x i8> %in, <16 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v16i16_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16

; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dwordx2
; SI: s_load_dwordx2
; SI: s_load_dwordx2

; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
entry:
  store <16 x i16> %in, <16 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v16i32_arg:
; HSA-VI: kernarg_segment_byte_size = 128
; HSA-VI: kernarg_segment_alignment = 6
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
entry:
  store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v16f32_arg:
; HSA-VI: kernarg_segment_byte_size = 128
; HSA-VI: kernarg_segment_alignment = 6
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
entry:
  store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}kernel_arg_i64:
; MESA-GCN: s_load_dwordx2
; MESA-GCN: s_load_dwordx2
; MESA-GCN: buffer_store_dwordx2
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
  store i64 %a, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}f64_kernel_arg:
; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb
; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c
; MESA-GCN: buffer_store_dwordx2
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) {
entry:
  store double %in, double addrspace(1)* %out
  ret void
}

; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
; XGCN: s_load_dwordx2
; XGCN: s_load_dwordx2
; XGCN: buffer_store_dwordx2
; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
;   ret void
; }

; FUNC-LABEL: {{^}}i1_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; SI: buffer_load_ubyte
; SI: v_and_b32_e32
; SI: buffer_store_byte
; SI: s_endpgm
define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
  store i1 %x, i1 addrspace(1)* %out, align 1
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_zext_i32:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; SI: buffer_load_ubyte
; SI: buffer_store_dword
; SI: s_endpgm
define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_zext_i64:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; SI: buffer_load_ubyte
; SI: buffer_store_dwordx2
; SI: s_endpgm
define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_sext_i32:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; SI: buffer_load_ubyte
; SI: buffer_store_dword
; SI: s_endpgm
define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_sext_i64:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; SI: buffer_load_ubyte
; SI: v_bfe_i32
; SI: v_ashrrev_i32
; SI: buffer_store_dwordx2
; SI: s_endpgm
define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}