; Test lowering of scalar and vector kernel arguments (kernarg segment loads)
; for AMDGPU (SI/VI, Mesa and HSA ABIs) and R600 (Evergreen/Cayman).
; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=EG --check-prefix=FUNC %s
; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -enable-var-scope --check-prefix=EG --check-prefix=FUNC %s

; FUNC-LABEL: {{^}}i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}

define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
entry:
  %0 = zext i8 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i8_zext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}

define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
entry:
  %0 = zext i8 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i8_sext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_sbyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}

define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
entry:
  %0 = sext i8 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; i16 zero-extension masks with 0xffff (0xff would only match a substring).
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff
; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}

define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
entry:
  %0 = zext i16 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_zext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}

define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
entry:
  %0 = zext i16 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_sext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_sshort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}

define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
entry:
  %0 = sext i16 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i32_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8
define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
entry:
  store i32 %in, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}f32_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
entry:
  store float %in, float addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v2i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: VTX_READ_8
; EG: VTX_READ_8
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
entry:
  store <2 x i8> %in, <2 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v2i16_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: VTX_READ_16
; EG: VTX_READ_16

; SI: buffer_load_ushort
; SI: buffer_load_ushort

; VI: s_load_dword s
define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
entry:
  store <2 x i16> %in, <2 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v2i32_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
entry:
  store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v2f32_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8
define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
entry:
  store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
entry:
  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i16_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
entry:
  store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
entry:
  store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3f32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
entry:
  store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v4i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
entry:
  store <4 x i8> %in, <4 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v4i16_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16

; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort

; VI: s_load_dword s
; VI: s_load_dword s
define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
entry:
  store <4 x i16> %in, <4 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v4i32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X

; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v4f32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
entry:
  store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v8i8_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; These were "HSA-GCN: float_load_ubyte", a dead prefix (not in any RUN line)
; and a typo'd mnemonic; use HSA-VI/flat_load_ubyte like the other vNi8 tests.
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
entry:
  store <8 x i8> %in, <8 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v8i16_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16

; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort

; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
entry:
  store <8 x i16> %in, <8 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v8i32_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
entry:
  store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v8f32_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
entry:
  store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v16i8_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
entry:
  store <16 x i8> %in, <16 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v16i16_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16

; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort
; SI: buffer_load_ushort

; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
entry:
  store <16 x i16> %in, <16 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v16i32_arg:
; HSA-VI: kernarg_segment_byte_size = 128
; HSA-VI: kernarg_segment_alignment = 6
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
entry:
  store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v16f32_arg:
; HSA-VI: kernarg_segment_byte_size = 128
; HSA-VI: kernarg_segment_alignment = 6
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
entry:
  store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}kernel_arg_i64:
; MESA-GCN: s_load_dwordx2
; MESA-GCN: s_load_dwordx2
; MESA-GCN: buffer_store_dwordx2
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
  store i64 %a, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}f64_kernel_arg:
; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb
; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c
; MESA-GCN: buffer_store_dwordx2
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) {
entry:
  store double %in, double addrspace(1)* %out
  ret void
}

; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
; XGCN: s_load_dwordx2
; XGCN: s_load_dwordx2
; XGCN: buffer_store_dwordx2
; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
;   ret void
; }

; FUNC-LABEL: {{^}}i1_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; SI: buffer_load_ubyte
; SI: v_and_b32_e32
; SI: buffer_store_byte
; SI: s_endpgm
define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
  store i1 %x, i1 addrspace(1)* %out, align 1
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_zext_i32:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; SI: buffer_load_ubyte
; SI: buffer_store_dword
; SI: s_endpgm
define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_zext_i64:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; SI: buffer_load_ubyte
; SI: buffer_store_dwordx2
; SI: s_endpgm
define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_sext_i32:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; SI: buffer_load_ubyte
; SI: buffer_store_dword
; SI: s_endpgm
define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_sext_i64:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; SI: buffer_load_ubyte
; SI: v_bfe_i32
; SI: v_ashrrev_i32
; SI: buffer_store_dwordx2
; SI: s_endpgm
define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}