1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -check-prefixes=SI %s 3; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=VI %s 4; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GFX9 %s 5; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -check-prefixes=EGCM,EG %s 6; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -check-prefixes=EGCM,CM %s 7 8define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { 9; SI-LABEL: i8_arg: 10; SI: ; %bb.0: 11; SI-NEXT: s_load_dword s2, s[0:1], 0xb 12; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 13; SI-NEXT: s_mov_b32 s3, 0xf000 14; SI-NEXT: s_waitcnt lgkmcnt(0) 15; SI-NEXT: s_and_b32 s4, s2, 0xff 16; SI-NEXT: s_mov_b32 s2, -1 17; SI-NEXT: v_mov_b32_e32 v0, s4 18; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 19; SI-NEXT: s_endpgm 20; 21; VI-LABEL: i8_arg: 22; VI: ; %bb.0: 23; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 24; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 25; VI-NEXT: s_waitcnt lgkmcnt(0) 26; VI-NEXT: v_mov_b32_e32 v0, s2 27; VI-NEXT: s_and_b32 s0, s0, 0xff 28; VI-NEXT: v_mov_b32_e32 v1, s3 29; VI-NEXT: v_mov_b32_e32 v2, s0 30; VI-NEXT: flat_store_dword v[0:1], v2 31; VI-NEXT: s_endpgm 32; 33; GFX9-LABEL: i8_arg: 34; GFX9: ; %bb.0: 35; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 36; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 37; GFX9-NEXT: v_mov_b32_e32 v0, 0 38; GFX9-NEXT: s_waitcnt lgkmcnt(0) 39; GFX9-NEXT: s_and_b32 s2, s2, 0xff 40; GFX9-NEXT: v_mov_b32_e32 v1, s2 41; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 42; GFX9-NEXT: s_endpgm 43; 44; EG-LABEL: i8_arg: 45; EG: ; %bb.0: 46; EG-NEXT: ALU 0, @8, KC0[], KC1[] 47; EG-NEXT: TEX 0 @6 48; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[] 49; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 50; EG-NEXT: CF_END 51; 
EG-NEXT: PAD 52; EG-NEXT: Fetch clause starting at 6: 53; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 54; EG-NEXT: ALU clause starting at 8: 55; EG-NEXT: MOV * T0.X, 0.0, 56; EG-NEXT: ALU clause starting at 9: 57; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 58; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 59; 60; CM-LABEL: i8_arg: 61; CM: ; %bb.0: 62; CM-NEXT: ALU 0, @8, KC0[], KC1[] 63; CM-NEXT: TEX 0 @6 64; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[] 65; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 66; CM-NEXT: CF_END 67; CM-NEXT: PAD 68; CM-NEXT: Fetch clause starting at 6: 69; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 70; CM-NEXT: ALU clause starting at 8: 71; CM-NEXT: MOV * T0.X, 0.0, 72; CM-NEXT: ALU clause starting at 9: 73; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 74; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 75 %ext = zext i8 %in to i32 76 store i32 %ext, i32 addrspace(1)* %out, align 4 77 ret void 78} 79 80define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind { 81; SI-LABEL: i8_zext_arg: 82; SI: ; %bb.0: 83; SI-NEXT: s_load_dword s2, s[0:1], 0xb 84; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 85; SI-NEXT: s_mov_b32 s3, 0xf000 86; SI-NEXT: s_waitcnt lgkmcnt(0) 87; SI-NEXT: s_and_b32 s4, s2, 0xff 88; SI-NEXT: s_mov_b32 s2, -1 89; SI-NEXT: v_mov_b32_e32 v0, s4 90; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 91; SI-NEXT: s_endpgm 92; 93; VI-LABEL: i8_zext_arg: 94; VI: ; %bb.0: 95; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 96; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 97; VI-NEXT: s_waitcnt lgkmcnt(0) 98; VI-NEXT: v_mov_b32_e32 v0, s2 99; VI-NEXT: s_and_b32 s0, s0, 0xff 100; VI-NEXT: v_mov_b32_e32 v1, s3 101; VI-NEXT: v_mov_b32_e32 v2, s0 102; VI-NEXT: flat_store_dword v[0:1], v2 103; VI-NEXT: s_endpgm 104; 105; GFX9-LABEL: i8_zext_arg: 106; GFX9: ; %bb.0: 107; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 108; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 109; GFX9-NEXT: v_mov_b32_e32 v0, 0 110; GFX9-NEXT: s_waitcnt lgkmcnt(0) 
111; GFX9-NEXT: s_and_b32 s2, s2, 0xff 112; GFX9-NEXT: v_mov_b32_e32 v1, s2 113; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 114; GFX9-NEXT: s_endpgm 115; 116; EG-LABEL: i8_zext_arg: 117; EG: ; %bb.0: 118; EG-NEXT: ALU 0, @8, KC0[], KC1[] 119; EG-NEXT: TEX 0 @6 120; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 121; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 122; EG-NEXT: CF_END 123; EG-NEXT: PAD 124; EG-NEXT: Fetch clause starting at 6: 125; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 126; EG-NEXT: ALU clause starting at 8: 127; EG-NEXT: MOV * T0.X, 0.0, 128; EG-NEXT: ALU clause starting at 9: 129; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x, 130; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 131; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) 132; 133; CM-LABEL: i8_zext_arg: 134; CM: ; %bb.0: 135; CM-NEXT: ALU 0, @8, KC0[], KC1[] 136; CM-NEXT: TEX 0 @6 137; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 138; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 139; CM-NEXT: CF_END 140; CM-NEXT: PAD 141; CM-NEXT: Fetch clause starting at 6: 142; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 143; CM-NEXT: ALU clause starting at 8: 144; CM-NEXT: MOV * T0.X, 0.0, 145; CM-NEXT: ALU clause starting at 9: 146; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x, 147; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 148; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 149; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 150 %ext = zext i8 %in to i32 151 store i32 %ext, i32 addrspace(1)* %out, align 4 152 ret void 153} 154 155define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind { 156; SI-LABEL: i8_sext_arg: 157; SI: ; %bb.0: 158; SI-NEXT: s_load_dword s2, s[0:1], 0xb 159; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 160; SI-NEXT: s_mov_b32 s3, 0xf000 161; SI-NEXT: s_waitcnt lgkmcnt(0) 162; SI-NEXT: s_sext_i32_i8 s4, s2 163; SI-NEXT: s_mov_b32 s2, -1 164; SI-NEXT: v_mov_b32_e32 v0, s4 165; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 166; SI-NEXT: s_endpgm 167; 168; VI-LABEL: 
i8_sext_arg: 169; VI: ; %bb.0: 170; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 171; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 172; VI-NEXT: s_waitcnt lgkmcnt(0) 173; VI-NEXT: v_mov_b32_e32 v0, s2 174; VI-NEXT: s_sext_i32_i8 s0, s0 175; VI-NEXT: v_mov_b32_e32 v1, s3 176; VI-NEXT: v_mov_b32_e32 v2, s0 177; VI-NEXT: flat_store_dword v[0:1], v2 178; VI-NEXT: s_endpgm 179; 180; GFX9-LABEL: i8_sext_arg: 181; GFX9: ; %bb.0: 182; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 183; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 184; GFX9-NEXT: v_mov_b32_e32 v0, 0 185; GFX9-NEXT: s_waitcnt lgkmcnt(0) 186; GFX9-NEXT: s_sext_i32_i8 s2, s2 187; GFX9-NEXT: v_mov_b32_e32 v1, s2 188; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 189; GFX9-NEXT: s_endpgm 190; 191; EG-LABEL: i8_sext_arg: 192; EG: ; %bb.0: 193; EG-NEXT: ALU 0, @8, KC0[], KC1[] 194; EG-NEXT: TEX 0 @6 195; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 196; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 197; EG-NEXT: CF_END 198; EG-NEXT: PAD 199; EG-NEXT: Fetch clause starting at 6: 200; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 201; EG-NEXT: ALU clause starting at 8: 202; EG-NEXT: MOV * T0.X, 0.0, 203; EG-NEXT: ALU clause starting at 9: 204; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x, 205; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 206; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) 207; 208; CM-LABEL: i8_sext_arg: 209; CM: ; %bb.0: 210; CM-NEXT: ALU 0, @8, KC0[], KC1[] 211; CM-NEXT: TEX 0 @6 212; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 213; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 214; CM-NEXT: CF_END 215; CM-NEXT: PAD 216; CM-NEXT: Fetch clause starting at 6: 217; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 218; CM-NEXT: ALU clause starting at 8: 219; CM-NEXT: MOV * T0.X, 0.0, 220; CM-NEXT: ALU clause starting at 9: 221; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x, 222; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 223; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 224; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 225 %ext = sext i8 %in 
to i32 226 store i32 %ext, i32 addrspace(1)* %out, align 4 227 ret void 228} 229 230define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { 231; SI-LABEL: i16_arg: 232; SI: ; %bb.0: 233; SI-NEXT: s_load_dword s2, s[0:1], 0xb 234; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 235; SI-NEXT: s_mov_b32 s3, 0xf000 236; SI-NEXT: s_waitcnt lgkmcnt(0) 237; SI-NEXT: s_and_b32 s4, s2, 0xffff 238; SI-NEXT: s_mov_b32 s2, -1 239; SI-NEXT: v_mov_b32_e32 v0, s4 240; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 241; SI-NEXT: s_endpgm 242; 243; VI-LABEL: i16_arg: 244; VI: ; %bb.0: 245; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 246; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 247; VI-NEXT: s_waitcnt lgkmcnt(0) 248; VI-NEXT: v_mov_b32_e32 v0, s2 249; VI-NEXT: s_and_b32 s0, s0, 0xffff 250; VI-NEXT: v_mov_b32_e32 v1, s3 251; VI-NEXT: v_mov_b32_e32 v2, s0 252; VI-NEXT: flat_store_dword v[0:1], v2 253; VI-NEXT: s_endpgm 254; 255; GFX9-LABEL: i16_arg: 256; GFX9: ; %bb.0: 257; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 258; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 259; GFX9-NEXT: v_mov_b32_e32 v0, 0 260; GFX9-NEXT: s_waitcnt lgkmcnt(0) 261; GFX9-NEXT: s_and_b32 s2, s2, 0xffff 262; GFX9-NEXT: v_mov_b32_e32 v1, s2 263; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 264; GFX9-NEXT: s_endpgm 265; 266; EG-LABEL: i16_arg: 267; EG: ; %bb.0: 268; EG-NEXT: ALU 0, @8, KC0[], KC1[] 269; EG-NEXT: TEX 0 @6 270; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[] 271; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 272; EG-NEXT: CF_END 273; EG-NEXT: PAD 274; EG-NEXT: Fetch clause starting at 6: 275; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 276; EG-NEXT: ALU clause starting at 8: 277; EG-NEXT: MOV * T0.X, 0.0, 278; EG-NEXT: ALU clause starting at 9: 279; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 280; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 281; 282; CM-LABEL: i16_arg: 283; CM: ; %bb.0: 284; CM-NEXT: ALU 0, @8, KC0[], KC1[] 285; CM-NEXT: TEX 0 @6 286; CM-NEXT: ALU 1, @9, 
KC0[CB0:0-32], KC1[] 287; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 288; CM-NEXT: CF_END 289; CM-NEXT: PAD 290; CM-NEXT: Fetch clause starting at 6: 291; CM-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 292; CM-NEXT: ALU clause starting at 8: 293; CM-NEXT: MOV * T0.X, 0.0, 294; CM-NEXT: ALU clause starting at 9: 295; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 296; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 297 %ext = zext i16 %in to i32 298 store i32 %ext, i32 addrspace(1)* %out, align 4 299 ret void 300} 301 302define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind { 303; SI-LABEL: i16_zext_arg: 304; SI: ; %bb.0: 305; SI-NEXT: s_load_dword s2, s[0:1], 0xb 306; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 307; SI-NEXT: s_mov_b32 s3, 0xf000 308; SI-NEXT: s_waitcnt lgkmcnt(0) 309; SI-NEXT: s_and_b32 s4, s2, 0xffff 310; SI-NEXT: s_mov_b32 s2, -1 311; SI-NEXT: v_mov_b32_e32 v0, s4 312; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 313; SI-NEXT: s_endpgm 314; 315; VI-LABEL: i16_zext_arg: 316; VI: ; %bb.0: 317; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 318; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 319; VI-NEXT: s_waitcnt lgkmcnt(0) 320; VI-NEXT: v_mov_b32_e32 v0, s2 321; VI-NEXT: s_and_b32 s0, s0, 0xffff 322; VI-NEXT: v_mov_b32_e32 v1, s3 323; VI-NEXT: v_mov_b32_e32 v2, s0 324; VI-NEXT: flat_store_dword v[0:1], v2 325; VI-NEXT: s_endpgm 326; 327; GFX9-LABEL: i16_zext_arg: 328; GFX9: ; %bb.0: 329; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 330; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 331; GFX9-NEXT: v_mov_b32_e32 v0, 0 332; GFX9-NEXT: s_waitcnt lgkmcnt(0) 333; GFX9-NEXT: s_and_b32 s2, s2, 0xffff 334; GFX9-NEXT: v_mov_b32_e32 v1, s2 335; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 336; GFX9-NEXT: s_endpgm 337; 338; EG-LABEL: i16_zext_arg: 339; EG: ; %bb.0: 340; EG-NEXT: ALU 0, @8, KC0[], KC1[] 341; EG-NEXT: TEX 0 @6 342; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 343; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 344; 
EG-NEXT: CF_END 345; EG-NEXT: PAD 346; EG-NEXT: Fetch clause starting at 6: 347; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 348; EG-NEXT: ALU clause starting at 8: 349; EG-NEXT: MOV * T0.X, 0.0, 350; EG-NEXT: ALU clause starting at 9: 351; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x, 352; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 353; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) 354; 355; CM-LABEL: i16_zext_arg: 356; CM: ; %bb.0: 357; CM-NEXT: ALU 0, @8, KC0[], KC1[] 358; CM-NEXT: TEX 0 @6 359; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 360; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 361; CM-NEXT: CF_END 362; CM-NEXT: PAD 363; CM-NEXT: Fetch clause starting at 6: 364; CM-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 365; CM-NEXT: ALU clause starting at 8: 366; CM-NEXT: MOV * T0.X, 0.0, 367; CM-NEXT: ALU clause starting at 9: 368; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x, 369; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 370; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 371; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 372 %ext = zext i16 %in to i32 373 store i32 %ext, i32 addrspace(1)* %out, align 4 374 ret void 375} 376 377define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind { 378; SI-LABEL: i16_sext_arg: 379; SI: ; %bb.0: 380; SI-NEXT: s_load_dword s2, s[0:1], 0xb 381; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 382; SI-NEXT: s_mov_b32 s3, 0xf000 383; SI-NEXT: s_waitcnt lgkmcnt(0) 384; SI-NEXT: s_sext_i32_i16 s4, s2 385; SI-NEXT: s_mov_b32 s2, -1 386; SI-NEXT: v_mov_b32_e32 v0, s4 387; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 388; SI-NEXT: s_endpgm 389; 390; VI-LABEL: i16_sext_arg: 391; VI: ; %bb.0: 392; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 393; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 394; VI-NEXT: s_waitcnt lgkmcnt(0) 395; VI-NEXT: v_mov_b32_e32 v0, s2 396; VI-NEXT: s_sext_i32_i16 s0, s0 397; VI-NEXT: v_mov_b32_e32 v1, s3 398; VI-NEXT: v_mov_b32_e32 v2, s0 399; VI-NEXT: flat_store_dword v[0:1], v2 400; VI-NEXT: 
s_endpgm 401; 402; GFX9-LABEL: i16_sext_arg: 403; GFX9: ; %bb.0: 404; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 405; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 406; GFX9-NEXT: v_mov_b32_e32 v0, 0 407; GFX9-NEXT: s_waitcnt lgkmcnt(0) 408; GFX9-NEXT: s_sext_i32_i16 s2, s2 409; GFX9-NEXT: v_mov_b32_e32 v1, s2 410; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 411; GFX9-NEXT: s_endpgm 412; 413; EG-LABEL: i16_sext_arg: 414; EG: ; %bb.0: 415; EG-NEXT: ALU 0, @8, KC0[], KC1[] 416; EG-NEXT: TEX 0 @6 417; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 418; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 419; EG-NEXT: CF_END 420; EG-NEXT: PAD 421; EG-NEXT: Fetch clause starting at 6: 422; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 423; EG-NEXT: ALU clause starting at 8: 424; EG-NEXT: MOV * T0.X, 0.0, 425; EG-NEXT: ALU clause starting at 9: 426; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x, 427; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 428; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) 429; 430; CM-LABEL: i16_sext_arg: 431; CM: ; %bb.0: 432; CM-NEXT: ALU 0, @8, KC0[], KC1[] 433; CM-NEXT: TEX 0 @6 434; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 435; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 436; CM-NEXT: CF_END 437; CM-NEXT: PAD 438; CM-NEXT: Fetch clause starting at 6: 439; CM-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 440; CM-NEXT: ALU clause starting at 8: 441; CM-NEXT: MOV * T0.X, 0.0, 442; CM-NEXT: ALU clause starting at 9: 443; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x, 444; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 445; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 446; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 447 %ext = sext i16 %in to i32 448 store i32 %ext, i32 addrspace(1)* %out, align 4 449 ret void 450} 451 452define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind { 453; SI-LABEL: i32_arg: 454; SI: ; %bb.0: ; %entry 455; SI-NEXT: s_load_dword s4, s[0:1], 0xb 456; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 457; SI-NEXT: s_mov_b32 s3, 
0xf000 458; SI-NEXT: s_mov_b32 s2, -1 459; SI-NEXT: s_waitcnt lgkmcnt(0) 460; SI-NEXT: v_mov_b32_e32 v0, s4 461; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 462; SI-NEXT: s_endpgm 463; 464; VI-LABEL: i32_arg: 465; VI: ; %bb.0: ; %entry 466; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 467; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 468; VI-NEXT: s_waitcnt lgkmcnt(0) 469; VI-NEXT: v_mov_b32_e32 v0, s2 470; VI-NEXT: v_mov_b32_e32 v1, s3 471; VI-NEXT: v_mov_b32_e32 v2, s0 472; VI-NEXT: flat_store_dword v[0:1], v2 473; VI-NEXT: s_endpgm 474; 475; GFX9-LABEL: i32_arg: 476; GFX9: ; %bb.0: ; %entry 477; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 478; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 479; GFX9-NEXT: v_mov_b32_e32 v0, 0 480; GFX9-NEXT: s_waitcnt lgkmcnt(0) 481; GFX9-NEXT: v_mov_b32_e32 v1, s2 482; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 483; GFX9-NEXT: s_endpgm 484; 485; EG-LABEL: i32_arg: 486; EG: ; %bb.0: ; %entry 487; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 488; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 489; EG-NEXT: CF_END 490; EG-NEXT: PAD 491; EG-NEXT: ALU clause starting at 4: 492; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, 493; EG-NEXT: MOV * T1.X, KC0[2].Z, 494; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 495; 496; CM-LABEL: i32_arg: 497; CM: ; %bb.0: ; %entry 498; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 499; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X 500; CM-NEXT: CF_END 501; CM-NEXT: PAD 502; CM-NEXT: ALU clause starting at 4: 503; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 504; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 505; CM-NEXT: MOV * T1.X, KC0[2].Z, 506entry: 507 store i32 %in, i32 addrspace(1)* %out, align 4 508 ret void 509} 510 511define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind { 512; SI-LABEL: f32_arg: 513; SI: ; %bb.0: ; %entry 514; SI-NEXT: s_load_dword s4, s[0:1], 0xb 515; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 516; SI-NEXT: s_mov_b32 s3, 0xf000 517; SI-NEXT: s_mov_b32 s2, -1 
518; SI-NEXT: s_waitcnt lgkmcnt(0) 519; SI-NEXT: v_mov_b32_e32 v0, s4 520; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 521; SI-NEXT: s_endpgm 522; 523; VI-LABEL: f32_arg: 524; VI: ; %bb.0: ; %entry 525; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 526; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 527; VI-NEXT: s_waitcnt lgkmcnt(0) 528; VI-NEXT: v_mov_b32_e32 v0, s2 529; VI-NEXT: v_mov_b32_e32 v1, s3 530; VI-NEXT: v_mov_b32_e32 v2, s0 531; VI-NEXT: flat_store_dword v[0:1], v2 532; VI-NEXT: s_endpgm 533; 534; GFX9-LABEL: f32_arg: 535; GFX9: ; %bb.0: ; %entry 536; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 537; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 538; GFX9-NEXT: v_mov_b32_e32 v0, 0 539; GFX9-NEXT: s_waitcnt lgkmcnt(0) 540; GFX9-NEXT: v_mov_b32_e32 v1, s2 541; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 542; GFX9-NEXT: s_endpgm 543; 544; EG-LABEL: f32_arg: 545; EG: ; %bb.0: ; %entry 546; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 547; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 548; EG-NEXT: CF_END 549; EG-NEXT: PAD 550; EG-NEXT: ALU clause starting at 4: 551; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, 552; EG-NEXT: MOV * T1.X, KC0[2].Z, 553; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 554; 555; CM-LABEL: f32_arg: 556; CM: ; %bb.0: ; %entry 557; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 558; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X 559; CM-NEXT: CF_END 560; CM-NEXT: PAD 561; CM-NEXT: ALU clause starting at 4: 562; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 563; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 564; CM-NEXT: MOV * T1.X, KC0[2].Z, 565entry: 566 store float %in, float addrspace(1)* %out, align 4 567 ret void 568} 569 570define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) { 571; SI-LABEL: v2i8_arg: 572; SI: ; %bb.0: ; %entry 573; SI-NEXT: s_load_dword s4, s[0:1], 0xb 574; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 575; SI-NEXT: s_mov_b32 s3, 0xf000 576; SI-NEXT: s_mov_b32 s2, -1 577; SI-NEXT: s_waitcnt lgkmcnt(0) 578; 
SI-NEXT: v_mov_b32_e32 v0, s4 579; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 580; SI-NEXT: s_endpgm 581; 582; VI-LABEL: v2i8_arg: 583; VI: ; %bb.0: ; %entry 584; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 585; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 586; VI-NEXT: s_waitcnt lgkmcnt(0) 587; VI-NEXT: v_mov_b32_e32 v0, s2 588; VI-NEXT: v_mov_b32_e32 v1, s3 589; VI-NEXT: v_mov_b32_e32 v2, s0 590; VI-NEXT: flat_store_short v[0:1], v2 591; VI-NEXT: s_endpgm 592; 593; GFX9-LABEL: v2i8_arg: 594; GFX9: ; %bb.0: ; %entry 595; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 596; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 597; GFX9-NEXT: v_mov_b32_e32 v0, 0 598; GFX9-NEXT: s_waitcnt lgkmcnt(0) 599; GFX9-NEXT: v_mov_b32_e32 v1, s2 600; GFX9-NEXT: global_store_short v0, v1, s[0:1] 601; GFX9-NEXT: s_endpgm 602; 603; EG-LABEL: v2i8_arg: 604; EG: ; %bb.0: ; %entry 605; EG-NEXT: ALU 0, @10, KC0[], KC1[] 606; EG-NEXT: TEX 1 @6 607; EG-NEXT: ALU 15, @11, KC0[CB0:0-32], KC1[] 608; EG-NEXT: MEM_RAT MSKOR T4.XW, T5.X 609; EG-NEXT: CF_END 610; EG-NEXT: PAD 611; EG-NEXT: Fetch clause starting at 6: 612; EG-NEXT: VTX_READ_8 T5.X, T4.X, 41, #3 613; EG-NEXT: VTX_READ_8 T4.X, T4.X, 40, #3 614; EG-NEXT: ALU clause starting at 10: 615; EG-NEXT: MOV * T4.X, 0.0, 616; EG-NEXT: ALU clause starting at 11: 617; EG-NEXT: LSHL T0.W, T5.X, literal.x, 618; EG-NEXT: AND_INT * T1.W, T4.X, literal.y, 619; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43) 620; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x, 621; EG-NEXT: OR_INT * T0.W, PV.W, PS, 622; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 623; EG-NEXT: AND_INT T0.W, PS, literal.x, 624; EG-NEXT: LSHL * T1.W, PV.W, literal.y, 625; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 626; EG-NEXT: LSHL T4.X, PV.W, PS, 627; EG-NEXT: LSHL * T4.W, literal.x, PS, 628; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 629; EG-NEXT: MOV T4.Y, 0.0, 630; EG-NEXT: MOV * T4.Z, 0.0, 631; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 632; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 633; 634; 
CM-LABEL: v2i8_arg: 635; CM: ; %bb.0: ; %entry 636; CM-NEXT: ALU 0, @10, KC0[], KC1[] 637; CM-NEXT: TEX 1 @6 638; CM-NEXT: ALU 15, @11, KC0[CB0:0-32], KC1[] 639; CM-NEXT: MEM_RAT MSKOR T4.XW, T5.X 640; CM-NEXT: CF_END 641; CM-NEXT: PAD 642; CM-NEXT: Fetch clause starting at 6: 643; CM-NEXT: VTX_READ_8 T5.X, T4.X, 41, #3 644; CM-NEXT: VTX_READ_8 T4.X, T4.X, 40, #3 645; CM-NEXT: ALU clause starting at 10: 646; CM-NEXT: MOV * T4.X, 0.0, 647; CM-NEXT: ALU clause starting at 11: 648; CM-NEXT: LSHL T0.Z, T5.X, literal.x, 649; CM-NEXT: AND_INT * T0.W, T4.X, literal.y, BS:VEC_120/SCL_212 650; CM-NEXT: 8(1.121039e-44), 255(3.573311e-43) 651; CM-NEXT: AND_INT T1.Z, KC0[2].Y, literal.x, 652; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 653; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 654; CM-NEXT: AND_INT T0.Z, PV.W, literal.x, 655; CM-NEXT: LSHL * T0.W, PV.Z, literal.y, 656; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 657; CM-NEXT: LSHL T4.X, PV.Z, PV.W, 658; CM-NEXT: LSHL * T4.W, literal.x, PV.W, 659; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 660; CM-NEXT: MOV T4.Y, 0.0, 661; CM-NEXT: MOV * T4.Z, 0.0, 662; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 663; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 664entry: 665 store <2 x i8> %in, <2 x i8> addrspace(1)* %out 666 ret void 667} 668 669define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) { 670; SI-LABEL: v2i16_arg: 671; SI: ; %bb.0: ; %entry 672; SI-NEXT: s_load_dword s4, s[0:1], 0xb 673; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 674; SI-NEXT: s_mov_b32 s3, 0xf000 675; SI-NEXT: s_mov_b32 s2, -1 676; SI-NEXT: s_waitcnt lgkmcnt(0) 677; SI-NEXT: v_mov_b32_e32 v0, s4 678; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 679; SI-NEXT: s_endpgm 680; 681; VI-LABEL: v2i16_arg: 682; VI: ; %bb.0: ; %entry 683; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 684; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 685; VI-NEXT: s_waitcnt lgkmcnt(0) 686; VI-NEXT: v_mov_b32_e32 v0, s2 687; VI-NEXT: v_mov_b32_e32 v1, s3 688; VI-NEXT: 
v_mov_b32_e32 v2, s0 689; VI-NEXT: flat_store_dword v[0:1], v2 690; VI-NEXT: s_endpgm 691; 692; GFX9-LABEL: v2i16_arg: 693; GFX9: ; %bb.0: ; %entry 694; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 695; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 696; GFX9-NEXT: v_mov_b32_e32 v0, 0 697; GFX9-NEXT: s_waitcnt lgkmcnt(0) 698; GFX9-NEXT: v_mov_b32_e32 v1, s2 699; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 700; GFX9-NEXT: s_endpgm 701; 702; EG-LABEL: v2i16_arg: 703; EG: ; %bb.0: ; %entry 704; EG-NEXT: ALU 0, @10, KC0[], KC1[] 705; EG-NEXT: TEX 1 @6 706; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] 707; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1 708; EG-NEXT: CF_END 709; EG-NEXT: PAD 710; EG-NEXT: Fetch clause starting at 6: 711; EG-NEXT: VTX_READ_16 T5.X, T4.X, 42, #3 712; EG-NEXT: VTX_READ_16 T4.X, T4.X, 40, #3 713; EG-NEXT: ALU clause starting at 10: 714; EG-NEXT: MOV * T4.X, 0.0, 715; EG-NEXT: ALU clause starting at 11: 716; EG-NEXT: LSHL T0.W, T5.X, literal.x, 717; EG-NEXT: AND_INT * T1.W, T4.X, literal.y, 718; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 719; EG-NEXT: OR_INT T4.X, PV.W, PS, 720; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 721; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 722; 723; CM-LABEL: v2i16_arg: 724; CM: ; %bb.0: ; %entry 725; CM-NEXT: ALU 0, @10, KC0[], KC1[] 726; CM-NEXT: TEX 1 @6 727; CM-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] 728; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T5.X 729; CM-NEXT: CF_END 730; CM-NEXT: PAD 731; CM-NEXT: Fetch clause starting at 6: 732; CM-NEXT: VTX_READ_16 T5.X, T4.X, 42, #3 733; CM-NEXT: VTX_READ_16 T4.X, T4.X, 40, #3 734; CM-NEXT: ALU clause starting at 10: 735; CM-NEXT: MOV * T4.X, 0.0, 736; CM-NEXT: ALU clause starting at 11: 737; CM-NEXT: LSHL T0.Z, T5.X, literal.x, 738; CM-NEXT: AND_INT * T0.W, T4.X, literal.y, BS:VEC_120/SCL_212 739; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 740; CM-NEXT: OR_INT * T4.X, PV.Z, PV.W, 741; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 742; CM-NEXT: 2(2.802597e-45), 
0(0.000000e+00) 743entry: 744 store <2 x i16> %in, <2 x i16> addrspace(1)* %out 745 ret void 746} 747 748define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind { 749; SI-LABEL: v2i32_arg: 750; SI: ; %bb.0: ; %entry 751; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 752; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 753; SI-NEXT: s_mov_b32 s3, 0xf000 754; SI-NEXT: s_mov_b32 s2, -1 755; SI-NEXT: s_waitcnt lgkmcnt(0) 756; SI-NEXT: v_mov_b32_e32 v0, s4 757; SI-NEXT: v_mov_b32_e32 v1, s5 758; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 759; SI-NEXT: s_endpgm 760; 761; VI-LABEL: v2i32_arg: 762; VI: ; %bb.0: ; %entry 763; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 764; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 765; VI-NEXT: s_waitcnt lgkmcnt(0) 766; VI-NEXT: v_mov_b32_e32 v0, s2 767; VI-NEXT: v_mov_b32_e32 v3, s1 768; VI-NEXT: v_mov_b32_e32 v1, s3 769; VI-NEXT: v_mov_b32_e32 v2, s0 770; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 771; VI-NEXT: s_endpgm 772; 773; GFX9-LABEL: v2i32_arg: 774; GFX9: ; %bb.0: ; %entry 775; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 776; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 777; GFX9-NEXT: v_mov_b32_e32 v2, 0 778; GFX9-NEXT: s_waitcnt lgkmcnt(0) 779; GFX9-NEXT: v_mov_b32_e32 v0, s2 780; GFX9-NEXT: v_mov_b32_e32 v1, s3 781; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 782; GFX9-NEXT: s_endpgm 783; 784; EG-LABEL: v2i32_arg: 785; EG: ; %bb.0: ; %entry 786; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 787; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 788; EG-NEXT: CF_END 789; EG-NEXT: PAD 790; EG-NEXT: ALU clause starting at 4: 791; EG-NEXT: MOV * T0.Y, KC0[3].X, 792; EG-NEXT: MOV T0.X, KC0[2].W, 793; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 794; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 795; 796; CM-LABEL: v2i32_arg: 797; CM: ; %bb.0: ; %entry 798; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 799; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 800; CM-NEXT: CF_END 801; 
CM-NEXT: PAD 802; CM-NEXT: ALU clause starting at 4: 803; CM-NEXT: MOV * T0.Y, KC0[3].X, 804; CM-NEXT: MOV * T0.X, KC0[2].W, 805; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 806; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 807entry: 808 store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4 809 ret void 810} 811 812define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind { 813; SI-LABEL: v2f32_arg: 814; SI: ; %bb.0: ; %entry 815; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 816; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 817; SI-NEXT: s_mov_b32 s3, 0xf000 818; SI-NEXT: s_mov_b32 s2, -1 819; SI-NEXT: s_waitcnt lgkmcnt(0) 820; SI-NEXT: v_mov_b32_e32 v0, s4 821; SI-NEXT: v_mov_b32_e32 v1, s5 822; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 823; SI-NEXT: s_endpgm 824; 825; VI-LABEL: v2f32_arg: 826; VI: ; %bb.0: ; %entry 827; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 828; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 829; VI-NEXT: s_waitcnt lgkmcnt(0) 830; VI-NEXT: v_mov_b32_e32 v0, s2 831; VI-NEXT: v_mov_b32_e32 v3, s1 832; VI-NEXT: v_mov_b32_e32 v1, s3 833; VI-NEXT: v_mov_b32_e32 v2, s0 834; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 835; VI-NEXT: s_endpgm 836; 837; GFX9-LABEL: v2f32_arg: 838; GFX9: ; %bb.0: ; %entry 839; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 840; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 841; GFX9-NEXT: v_mov_b32_e32 v2, 0 842; GFX9-NEXT: s_waitcnt lgkmcnt(0) 843; GFX9-NEXT: v_mov_b32_e32 v0, s2 844; GFX9-NEXT: v_mov_b32_e32 v1, s3 845; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 846; GFX9-NEXT: s_endpgm 847; 848; EG-LABEL: v2f32_arg: 849; EG: ; %bb.0: ; %entry 850; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 851; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 852; EG-NEXT: CF_END 853; EG-NEXT: PAD 854; EG-NEXT: ALU clause starting at 4: 855; EG-NEXT: MOV * T0.Y, KC0[3].X, 856; EG-NEXT: MOV T0.X, KC0[2].W, 857; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 858; EG-NEXT: 
2(2.802597e-45), 0(0.000000e+00) 859; 860; CM-LABEL: v2f32_arg: 861; CM: ; %bb.0: ; %entry 862; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 863; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 864; CM-NEXT: CF_END 865; CM-NEXT: PAD 866; CM-NEXT: ALU clause starting at 4: 867; CM-NEXT: MOV * T0.Y, KC0[3].X, 868; CM-NEXT: MOV * T0.X, KC0[2].W, 869; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 870; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 871entry: 872 store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4 873 ret void 874} 875 876define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind { 877; SI-LABEL: v3i8_arg: 878; SI: ; %bb.0: ; %entry 879; SI-NEXT: s_load_dword s4, s[0:1], 0xb 880; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 881; SI-NEXT: s_mov_b32 s3, 0xf000 882; SI-NEXT: s_waitcnt lgkmcnt(0) 883; SI-NEXT: s_lshr_b32 s5, s4, 16 884; SI-NEXT: s_mov_b32 s2, -1 885; SI-NEXT: v_mov_b32_e32 v0, s4 886; SI-NEXT: v_mov_b32_e32 v1, s5 887; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2 888; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 889; SI-NEXT: s_endpgm 890; 891; VI-LABEL: v3i8_arg: 892; VI: ; %bb.0: ; %entry 893; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 894; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 895; VI-NEXT: s_waitcnt lgkmcnt(0) 896; VI-NEXT: v_mov_b32_e32 v0, s2 897; VI-NEXT: s_lshr_b32 s1, s0, 16 898; VI-NEXT: v_mov_b32_e32 v4, s0 899; VI-NEXT: s_add_u32 s0, s2, 2 900; VI-NEXT: v_mov_b32_e32 v5, s1 901; VI-NEXT: s_addc_u32 s1, s3, 0 902; VI-NEXT: v_mov_b32_e32 v3, s1 903; VI-NEXT: v_mov_b32_e32 v2, s0 904; VI-NEXT: v_mov_b32_e32 v1, s3 905; VI-NEXT: flat_store_byte v[2:3], v5 906; VI-NEXT: flat_store_short v[0:1], v4 907; VI-NEXT: s_endpgm 908; 909; GFX9-LABEL: v3i8_arg: 910; GFX9: ; %bb.0: ; %entry 911; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 912; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 913; GFX9-NEXT: v_mov_b32_e32 v0, 0 914; GFX9-NEXT: s_waitcnt lgkmcnt(0) 915; GFX9-NEXT: v_mov_b32_e32 v1, s2 
916; GFX9-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:2 917; GFX9-NEXT: global_store_short v0, v1, s[0:1] 918; GFX9-NEXT: s_endpgm 919; 920; EG-LABEL: v3i8_arg: 921; EG: ; %bb.0: ; %entry 922; EG-NEXT: ALU 0, @12, KC0[], KC1[] 923; EG-NEXT: TEX 2 @6 924; EG-NEXT: ALU 28, @13, KC0[CB0:0-32], KC1[] 925; EG-NEXT: MEM_RAT MSKOR T4.XW, T7.X 926; EG-NEXT: MEM_RAT MSKOR T5.XW, T6.X 927; EG-NEXT: CF_END 928; EG-NEXT: Fetch clause starting at 6: 929; EG-NEXT: VTX_READ_8 T5.X, T4.X, 41, #3 930; EG-NEXT: VTX_READ_8 T6.X, T4.X, 42, #3 931; EG-NEXT: VTX_READ_8 T4.X, T4.X, 40, #3 932; EG-NEXT: ALU clause starting at 12: 933; EG-NEXT: MOV * T4.X, 0.0, 934; EG-NEXT: ALU clause starting at 13: 935; EG-NEXT: LSHL T0.W, T5.X, literal.x, 936; EG-NEXT: AND_INT * T1.W, T4.X, literal.y, 937; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43) 938; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x, 939; EG-NEXT: OR_INT * T0.W, PV.W, PS, 940; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 941; EG-NEXT: AND_INT T0.W, PS, literal.x, 942; EG-NEXT: LSHL * T1.W, PV.W, literal.y, 943; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 944; EG-NEXT: LSHL T4.X, PV.W, PS, 945; EG-NEXT: LSHL * T4.W, literal.x, PS, 946; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 947; EG-NEXT: MOV T4.Y, 0.0, 948; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 949; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 950; EG-NEXT: AND_INT T1.W, PV.W, literal.x, 951; EG-NEXT: AND_INT * T2.W, T6.X, literal.y, 952; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43) 953; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 954; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 955; EG-NEXT: LSHL T5.X, T2.W, PV.W, 956; EG-NEXT: LSHL * T5.W, literal.x, PV.W, 957; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 958; EG-NEXT: MOV T5.Y, 0.0, 959; EG-NEXT: MOV T4.Z, 0.0, 960; EG-NEXT: MOV * T5.Z, 0.0, 961; EG-NEXT: LSHR T6.X, T0.W, literal.x, 962; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 963; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 964; 965; CM-LABEL: v3i8_arg: 966; CM: ; %bb.0: ; 
%entry 967; CM-NEXT: ALU 0, @12, KC0[], KC1[] 968; CM-NEXT: TEX 2 @6 969; CM-NEXT: ALU 29, @13, KC0[CB0:0-32], KC1[] 970; CM-NEXT: MEM_RAT MSKOR T4.XW, T7.X 971; CM-NEXT: MEM_RAT MSKOR T5.XW, T6.X 972; CM-NEXT: CF_END 973; CM-NEXT: Fetch clause starting at 6: 974; CM-NEXT: VTX_READ_8 T5.X, T4.X, 41, #3 975; CM-NEXT: VTX_READ_8 T6.X, T4.X, 42, #3 976; CM-NEXT: VTX_READ_8 T4.X, T4.X, 40, #3 977; CM-NEXT: ALU clause starting at 12: 978; CM-NEXT: MOV * T4.X, 0.0, 979; CM-NEXT: ALU clause starting at 13: 980; CM-NEXT: LSHL T0.Z, T5.X, literal.x, 981; CM-NEXT: AND_INT * T0.W, T4.X, literal.y, BS:VEC_120/SCL_212 982; CM-NEXT: 8(1.121039e-44), 255(3.573311e-43) 983; CM-NEXT: AND_INT T1.Z, KC0[2].Y, literal.x, 984; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 985; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 986; CM-NEXT: AND_INT T0.Z, PV.W, literal.x, 987; CM-NEXT: LSHL * T0.W, PV.Z, literal.y, 988; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 989; CM-NEXT: LSHL T4.X, PV.Z, PV.W, 990; CM-NEXT: LSHL * T4.W, literal.x, PV.W, 991; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 992; CM-NEXT: MOV T4.Y, 0.0, 993; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 994; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 995; CM-NEXT: AND_INT * T1.W, PV.W, literal.x, 996; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 997; CM-NEXT: AND_INT T0.Z, T6.X, literal.x, 998; CM-NEXT: LSHL * T1.W, PV.W, literal.y, 999; CM-NEXT: 255(3.573311e-43), 3(4.203895e-45) 1000; CM-NEXT: LSHL T5.X, PV.Z, PV.W, 1001; CM-NEXT: LSHL * T5.W, literal.x, PV.W, 1002; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1003; CM-NEXT: MOV T5.Y, 0.0, 1004; CM-NEXT: MOV * T4.Z, 0.0, 1005; CM-NEXT: MOV * T5.Z, 0.0, 1006; CM-NEXT: LSHR * T6.X, T0.W, literal.x, 1007; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1008; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 1009; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1010entry: 1011 store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4 1012 ret void 1013} 1014 1015define amdgpu_kernel void @v3i16_arg(<3 x i16> 
addrspace(1)* nocapture %out, <3 x i16> %in) nounwind { 1016; SI-LABEL: v3i16_arg: 1017; SI: ; %bb.0: ; %entry 1018; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1019; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1020; SI-NEXT: s_mov_b32 s3, 0xf000 1021; SI-NEXT: s_mov_b32 s2, -1 1022; SI-NEXT: s_waitcnt lgkmcnt(0) 1023; SI-NEXT: v_mov_b32_e32 v0, s5 1024; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 1025; SI-NEXT: s_waitcnt expcnt(0) 1026; SI-NEXT: v_mov_b32_e32 v0, s4 1027; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1028; SI-NEXT: s_endpgm 1029; 1030; VI-LABEL: v3i16_arg: 1031; VI: ; %bb.0: ; %entry 1032; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1033; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 1034; VI-NEXT: s_waitcnt lgkmcnt(0) 1035; VI-NEXT: s_add_u32 s4, s2, 4 1036; VI-NEXT: s_addc_u32 s5, s3, 0 1037; VI-NEXT: v_mov_b32_e32 v2, s4 1038; VI-NEXT: v_mov_b32_e32 v4, s1 1039; VI-NEXT: v_mov_b32_e32 v0, s2 1040; VI-NEXT: v_mov_b32_e32 v3, s5 1041; VI-NEXT: v_mov_b32_e32 v1, s3 1042; VI-NEXT: v_mov_b32_e32 v5, s0 1043; VI-NEXT: flat_store_short v[2:3], v4 1044; VI-NEXT: flat_store_dword v[0:1], v5 1045; VI-NEXT: s_endpgm 1046; 1047; GFX9-LABEL: v3i16_arg: 1048; GFX9: ; %bb.0: ; %entry 1049; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1050; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1051; GFX9-NEXT: v_mov_b32_e32 v0, 0 1052; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1053; GFX9-NEXT: v_mov_b32_e32 v1, s3 1054; GFX9-NEXT: v_mov_b32_e32 v2, s2 1055; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:4 1056; GFX9-NEXT: global_store_dword v0, v2, s[0:1] 1057; GFX9-NEXT: s_endpgm 1058; 1059; EG-LABEL: v3i16_arg: 1060; EG: ; %bb.0: ; %entry 1061; EG-NEXT: ALU 0, @12, KC0[], KC1[] 1062; EG-NEXT: TEX 2 @6 1063; EG-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[] 1064; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 0 1065; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X 1066; EG-NEXT: CF_END 1067; EG-NEXT: Fetch clause starting at 6: 1068; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3 
1069; EG-NEXT: VTX_READ_16 T7.X, T5.X, 46, #3 1070; EG-NEXT: VTX_READ_16 T5.X, T5.X, 48, #3 1071; EG-NEXT: ALU clause starting at 12: 1072; EG-NEXT: MOV * T5.X, 0.0, 1073; EG-NEXT: ALU clause starting at 13: 1074; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1075; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) 1076; EG-NEXT: AND_INT T1.W, PV.W, literal.x, 1077; EG-NEXT: AND_INT * T2.W, T5.X, literal.y, 1078; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) 1079; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 1080; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1081; EG-NEXT: LSHL T5.X, T2.W, PV.W, 1082; EG-NEXT: LSHL * T5.W, literal.x, PV.W, 1083; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1084; EG-NEXT: MOV T5.Y, 0.0, 1085; EG-NEXT: MOV * T5.Z, 0.0, 1086; EG-NEXT: LSHR T8.X, T0.W, literal.x, 1087; EG-NEXT: LSHL T0.W, T7.X, literal.y, 1088; EG-NEXT: AND_INT * T1.W, T6.X, literal.z, 1089; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) 1090; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1091; EG-NEXT: OR_INT T6.X, PV.W, PS, 1092; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 1093; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1094; 1095; CM-LABEL: v3i16_arg: 1096; CM: ; %bb.0: ; %entry 1097; CM-NEXT: ALU 0, @12, KC0[], KC1[] 1098; CM-NEXT: TEX 2 @6 1099; CM-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[] 1100; CM-NEXT: MEM_RAT MSKOR T5.XW, T8.X 1101; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T7.X 1102; CM-NEXT: CF_END 1103; CM-NEXT: Fetch clause starting at 6: 1104; CM-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3 1105; CM-NEXT: VTX_READ_16 T7.X, T5.X, 46, #3 1106; CM-NEXT: VTX_READ_16 T5.X, T5.X, 48, #3 1107; CM-NEXT: ALU clause starting at 12: 1108; CM-NEXT: MOV * T5.X, 0.0, 1109; CM-NEXT: ALU clause starting at 13: 1110; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1111; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00) 1112; CM-NEXT: AND_INT * T1.W, PV.W, literal.x, 1113; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1114; CM-NEXT: AND_INT T0.Z, T5.X, literal.x, 1115; CM-NEXT: LSHL * T1.W, PV.W, literal.y, 1116; 
CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 1117; CM-NEXT: LSHL T5.X, PV.Z, PV.W, 1118; CM-NEXT: LSHL * T5.W, literal.x, PV.W, 1119; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1120; CM-NEXT: MOV T5.Y, 0.0, 1121; CM-NEXT: MOV * T5.Z, 0.0, 1122; CM-NEXT: LSHL T0.Z, T7.X, literal.x, 1123; CM-NEXT: AND_INT * T1.W, T6.X, literal.y, BS:VEC_120/SCL_212 1124; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 1125; CM-NEXT: OR_INT * T6.X, PV.Z, PV.W, 1126; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 1127; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1128; CM-NEXT: LSHR * T8.X, T0.W, literal.x, 1129; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1130entry: 1131 store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4 1132 ret void 1133} 1134 1135define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind { 1136; SI-LABEL: v3i32_arg: 1137; SI: ; %bb.0: ; %entry 1138; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1139; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1140; SI-NEXT: s_mov_b32 s3, 0xf000 1141; SI-NEXT: s_mov_b32 s2, -1 1142; SI-NEXT: s_waitcnt lgkmcnt(0) 1143; SI-NEXT: v_mov_b32_e32 v0, s6 1144; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 1145; SI-NEXT: s_waitcnt expcnt(0) 1146; SI-NEXT: v_mov_b32_e32 v0, s4 1147; SI-NEXT: v_mov_b32_e32 v1, s5 1148; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1149; SI-NEXT: s_endpgm 1150; 1151; VI-LABEL: v3i32_arg: 1152; VI: ; %bb.0: ; %entry 1153; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1154; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 1155; VI-NEXT: s_waitcnt lgkmcnt(0) 1156; VI-NEXT: v_mov_b32_e32 v3, s4 1157; VI-NEXT: v_mov_b32_e32 v0, s0 1158; VI-NEXT: v_mov_b32_e32 v1, s1 1159; VI-NEXT: v_mov_b32_e32 v2, s2 1160; VI-NEXT: v_mov_b32_e32 v4, s5 1161; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] 1162; VI-NEXT: s_endpgm 1163; 1164; GFX9-LABEL: v3i32_arg: 1165; GFX9: ; %bb.0: ; %entry 1166; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 1167; GFX9-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x10 1168; GFX9-NEXT: v_mov_b32_e32 v3, 0 1169; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1170; GFX9-NEXT: v_mov_b32_e32 v0, s0 1171; GFX9-NEXT: v_mov_b32_e32 v1, s1 1172; GFX9-NEXT: v_mov_b32_e32 v2, s2 1173; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] 1174; GFX9-NEXT: s_endpgm 1175; 1176; EG-LABEL: v3i32_arg: 1177; EG: ; %bb.0: ; %entry 1178; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] 1179; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 1180; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1181; EG-NEXT: CF_END 1182; EG-NEXT: ALU clause starting at 4: 1183; EG-NEXT: MOV * T0.Y, KC0[3].Z, 1184; EG-NEXT: MOV T0.X, KC0[3].Y, 1185; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1186; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1187; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1188; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1189; EG-NEXT: LSHR T2.X, PV.W, literal.x, 1190; EG-NEXT: MOV * T3.X, KC0[3].W, 1191; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1192; 1193; CM-LABEL: v3i32_arg: 1194; CM: ; %bb.0: ; %entry 1195; CM-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] 1196; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T3.X 1197; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X 1198; CM-NEXT: CF_END 1199; CM-NEXT: ALU clause starting at 4: 1200; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1201; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1202; CM-NEXT: LSHR * T0.X, PV.W, literal.x, 1203; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1204; CM-NEXT: MOV T1.X, KC0[3].W, 1205; CM-NEXT: MOV * T2.Y, KC0[3].Z, 1206; CM-NEXT: MOV * T2.X, KC0[3].Y, 1207; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 1208; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1209entry: 1210 store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4 1211 ret void 1212} 1213 1214define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind { 1215; SI-LABEL: v3f32_arg: 1216; SI: ; %bb.0: ; %entry 1217; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1218; SI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 1219; SI-NEXT: s_mov_b32 s3, 0xf000 1220; SI-NEXT: s_mov_b32 s2, -1 1221; SI-NEXT: s_waitcnt lgkmcnt(0) 1222; SI-NEXT: v_mov_b32_e32 v0, s6 1223; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 1224; SI-NEXT: s_waitcnt expcnt(0) 1225; SI-NEXT: v_mov_b32_e32 v0, s4 1226; SI-NEXT: v_mov_b32_e32 v1, s5 1227; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1228; SI-NEXT: s_endpgm 1229; 1230; VI-LABEL: v3f32_arg: 1231; VI: ; %bb.0: ; %entry 1232; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1233; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 1234; VI-NEXT: s_waitcnt lgkmcnt(0) 1235; VI-NEXT: v_mov_b32_e32 v3, s4 1236; VI-NEXT: v_mov_b32_e32 v0, s0 1237; VI-NEXT: v_mov_b32_e32 v1, s1 1238; VI-NEXT: v_mov_b32_e32 v2, s2 1239; VI-NEXT: v_mov_b32_e32 v4, s5 1240; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] 1241; VI-NEXT: s_endpgm 1242; 1243; GFX9-LABEL: v3f32_arg: 1244; GFX9: ; %bb.0: ; %entry 1245; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 1246; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 1247; GFX9-NEXT: v_mov_b32_e32 v3, 0 1248; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1249; GFX9-NEXT: v_mov_b32_e32 v0, s0 1250; GFX9-NEXT: v_mov_b32_e32 v1, s1 1251; GFX9-NEXT: v_mov_b32_e32 v2, s2 1252; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] 1253; GFX9-NEXT: s_endpgm 1254; 1255; EG-LABEL: v3f32_arg: 1256; EG: ; %bb.0: ; %entry 1257; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] 1258; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 1259; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1260; EG-NEXT: CF_END 1261; EG-NEXT: ALU clause starting at 4: 1262; EG-NEXT: MOV * T0.Y, KC0[3].Z, 1263; EG-NEXT: MOV T0.X, KC0[3].Y, 1264; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1265; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1266; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1267; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1268; EG-NEXT: LSHR T2.X, PV.W, literal.x, 1269; EG-NEXT: MOV * T3.X, KC0[3].W, 1270; EG-NEXT: 2(2.802597e-45), 
0(0.000000e+00) 1271; 1272; CM-LABEL: v3f32_arg: 1273; CM: ; %bb.0: ; %entry 1274; CM-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] 1275; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T3.X 1276; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X 1277; CM-NEXT: CF_END 1278; CM-NEXT: ALU clause starting at 4: 1279; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1280; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1281; CM-NEXT: LSHR * T0.X, PV.W, literal.x, 1282; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1283; CM-NEXT: MOV T1.X, KC0[3].W, 1284; CM-NEXT: MOV * T2.Y, KC0[3].Z, 1285; CM-NEXT: MOV * T2.X, KC0[3].Y, 1286; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 1287; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1288entry: 1289 store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4 1290 ret void 1291} 1292 1293define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) { 1294; SI-LABEL: v4i8_arg: 1295; SI: ; %bb.0: ; %entry 1296; SI-NEXT: s_load_dword s4, s[0:1], 0xb 1297; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1298; SI-NEXT: s_mov_b32 s3, 0xf000 1299; SI-NEXT: s_mov_b32 s2, -1 1300; SI-NEXT: s_waitcnt lgkmcnt(0) 1301; SI-NEXT: v_mov_b32_e32 v0, s4 1302; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1303; SI-NEXT: s_endpgm 1304; 1305; VI-LABEL: v4i8_arg: 1306; VI: ; %bb.0: ; %entry 1307; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1308; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 1309; VI-NEXT: s_waitcnt lgkmcnt(0) 1310; VI-NEXT: v_mov_b32_e32 v0, s2 1311; VI-NEXT: v_mov_b32_e32 v1, s3 1312; VI-NEXT: v_mov_b32_e32 v2, s0 1313; VI-NEXT: flat_store_dword v[0:1], v2 1314; VI-NEXT: s_endpgm 1315; 1316; GFX9-LABEL: v4i8_arg: 1317; GFX9: ; %bb.0: ; %entry 1318; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1319; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 1320; GFX9-NEXT: v_mov_b32_e32 v0, 0 1321; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1322; GFX9-NEXT: v_mov_b32_e32 v1, s2 1323; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1324; GFX9-NEXT: s_endpgm 1325; 1326; EG-LABEL: 
v4i8_arg: 1327; EG: ; %bb.0: ; %entry 1328; EG-NEXT: ALU 0, @14, KC0[], KC1[] 1329; EG-NEXT: TEX 3 @6 1330; EG-NEXT: ALU 15, @15, KC0[CB0:0-32], KC1[] 1331; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1 1332; EG-NEXT: CF_END 1333; EG-NEXT: PAD 1334; EG-NEXT: Fetch clause starting at 6: 1335; EG-NEXT: VTX_READ_8 T5.X, T4.X, 42, #3 1336; EG-NEXT: VTX_READ_8 T6.X, T4.X, 40, #3 1337; EG-NEXT: VTX_READ_8 T7.X, T4.X, 43, #3 1338; EG-NEXT: VTX_READ_8 T4.X, T4.X, 41, #3 1339; EG-NEXT: ALU clause starting at 14: 1340; EG-NEXT: MOV * T4.X, 0.0, 1341; EG-NEXT: ALU clause starting at 15: 1342; EG-NEXT: AND_INT * T0.W, T5.X, literal.x, 1343; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1344; EG-NEXT: AND_INT T0.Z, T4.X, literal.x, 1345; EG-NEXT: LSHL T0.W, PV.W, literal.y, 1346; EG-NEXT: LSHL * T1.W, T7.X, literal.z, 1347; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44) 1348; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) 1349; EG-NEXT: OR_INT T0.W, PS, PV.W, 1350; EG-NEXT: LSHL * T1.W, PV.Z, literal.x, 1351; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1352; EG-NEXT: OR_INT T0.W, PV.W, PS, 1353; EG-NEXT: AND_INT * T1.W, T6.X, literal.x, 1354; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1355; EG-NEXT: OR_INT T4.X, PV.W, PS, 1356; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 1357; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1358; 1359; CM-LABEL: v4i8_arg: 1360; CM: ; %bb.0: ; %entry 1361; CM-NEXT: ALU 0, @14, KC0[], KC1[] 1362; CM-NEXT: TEX 3 @6 1363; CM-NEXT: ALU 15, @15, KC0[CB0:0-32], KC1[] 1364; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T5.X 1365; CM-NEXT: CF_END 1366; CM-NEXT: PAD 1367; CM-NEXT: Fetch clause starting at 6: 1368; CM-NEXT: VTX_READ_8 T5.X, T4.X, 42, #3 1369; CM-NEXT: VTX_READ_8 T6.X, T4.X, 40, #3 1370; CM-NEXT: VTX_READ_8 T7.X, T4.X, 43, #3 1371; CM-NEXT: VTX_READ_8 T4.X, T4.X, 41, #3 1372; CM-NEXT: ALU clause starting at 14: 1373; CM-NEXT: MOV * T4.X, 0.0, 1374; CM-NEXT: ALU clause starting at 15: 1375; CM-NEXT: AND_INT * T0.W, T5.X, literal.x, 1376; CM-NEXT: 
255(3.573311e-43), 0(0.000000e+00) 1377; CM-NEXT: AND_INT T0.Y, T4.X, literal.x, 1378; CM-NEXT: LSHL T0.Z, PV.W, literal.y, 1379; CM-NEXT: LSHL * T0.W, T7.X, literal.z, BS:VEC_120/SCL_212 1380; CM-NEXT: 255(3.573311e-43), 16(2.242078e-44) 1381; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00) 1382; CM-NEXT: OR_INT T0.Z, PV.W, PV.Z, 1383; CM-NEXT: LSHL * T0.W, PV.Y, literal.x, 1384; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1385; CM-NEXT: OR_INT T0.Z, PV.Z, PV.W, 1386; CM-NEXT: AND_INT * T0.W, T6.X, literal.x, 1387; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1388; CM-NEXT: OR_INT * T4.X, PV.Z, PV.W, 1389; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 1390; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1391entry: 1392 store <4 x i8> %in, <4 x i8> addrspace(1)* %out 1393 ret void 1394} 1395 1396define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) { 1397; SI-LABEL: v4i16_arg: 1398; SI: ; %bb.0: ; %entry 1399; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1400; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1401; SI-NEXT: s_mov_b32 s3, 0xf000 1402; SI-NEXT: s_mov_b32 s2, -1 1403; SI-NEXT: s_waitcnt lgkmcnt(0) 1404; SI-NEXT: v_mov_b32_e32 v0, s4 1405; SI-NEXT: v_mov_b32_e32 v1, s5 1406; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1407; SI-NEXT: s_endpgm 1408; 1409; VI-LABEL: v4i16_arg: 1410; VI: ; %bb.0: ; %entry 1411; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1412; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 1413; VI-NEXT: s_waitcnt lgkmcnt(0) 1414; VI-NEXT: v_mov_b32_e32 v0, s2 1415; VI-NEXT: v_mov_b32_e32 v3, s1 1416; VI-NEXT: v_mov_b32_e32 v1, s3 1417; VI-NEXT: v_mov_b32_e32 v2, s0 1418; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1419; VI-NEXT: s_endpgm 1420; 1421; GFX9-LABEL: v4i16_arg: 1422; GFX9: ; %bb.0: ; %entry 1423; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1424; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1425; GFX9-NEXT: v_mov_b32_e32 v2, 0 1426; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1427; GFX9-NEXT: v_mov_b32_e32 v0, s2 1428; 
GFX9-NEXT: v_mov_b32_e32 v1, s3 1429; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1430; GFX9-NEXT: s_endpgm 1431; 1432; EG-LABEL: v4i16_arg: 1433; EG: ; %bb.0: ; %entry 1434; EG-NEXT: ALU 1, @20, KC0[], KC1[] 1435; EG-NEXT: TEX 0 @12 1436; EG-NEXT: ALU 5, @22, KC0[], KC1[] 1437; EG-NEXT: TEX 0 @14 1438; EG-NEXT: ALU 5, @28, KC0[], KC1[] 1439; EG-NEXT: TEX 0 @16 1440; EG-NEXT: ALU 5, @34, KC0[], KC1[] 1441; EG-NEXT: TEX 0 @18 1442; EG-NEXT: ALU 7, @40, KC0[CB0:0-32], KC1[] 1443; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1 1444; EG-NEXT: CF_END 1445; EG-NEXT: PAD 1446; EG-NEXT: Fetch clause starting at 12: 1447; EG-NEXT: VTX_READ_16 T6.X, T5.X, 50, #3 1448; EG-NEXT: Fetch clause starting at 14: 1449; EG-NEXT: VTX_READ_16 T6.X, T5.X, 48, #3 1450; EG-NEXT: Fetch clause starting at 16: 1451; EG-NEXT: VTX_READ_16 T6.X, T5.X, 46, #3 1452; EG-NEXT: Fetch clause starting at 18: 1453; EG-NEXT: VTX_READ_16 T5.X, T5.X, 44, #3 1454; EG-NEXT: ALU clause starting at 20: 1455; EG-NEXT: MOV * T0.Y, T3.X, 1456; EG-NEXT: MOV * T5.X, 0.0, 1457; EG-NEXT: ALU clause starting at 22: 1458; EG-NEXT: LSHL T0.W, T6.X, literal.x, 1459; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 1460; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 1461; EG-NEXT: OR_INT * T0.W, PS, PV.W, 1462; EG-NEXT: MOV * T3.X, PV.W, 1463; EG-NEXT: MOV * T0.Y, PV.X, 1464; EG-NEXT: ALU clause starting at 28: 1465; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 1466; EG-NEXT: AND_INT * T1.W, T6.X, literal.y, 1467; EG-NEXT: -65536(nan), 65535(9.183409e-41) 1468; EG-NEXT: OR_INT * T0.W, PV.W, PS, 1469; EG-NEXT: MOV T3.X, PV.W, 1470; EG-NEXT: MOV * T0.Y, T2.X, 1471; EG-NEXT: ALU clause starting at 34: 1472; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 1473; EG-NEXT: LSHL * T1.W, T6.X, literal.y, 1474; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 1475; EG-NEXT: OR_INT * T0.W, PV.W, PS, 1476; EG-NEXT: MOV * T2.X, PV.W, 1477; EG-NEXT: MOV * T0.Y, PV.X, 1478; EG-NEXT: ALU clause starting at 40: 1479; EG-NEXT: LSHR T6.X, KC0[2].Y, 
literal.x, 1480; EG-NEXT: AND_INT T0.W, T0.Y, literal.y, 1481; EG-NEXT: AND_INT * T1.W, T5.X, literal.z, 1482; EG-NEXT: 2(2.802597e-45), -65536(nan) 1483; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1484; EG-NEXT: OR_INT * T5.X, PV.W, PS, 1485; EG-NEXT: MOV T2.X, PV.X, 1486; EG-NEXT: MOV * T5.Y, T3.X, 1487; 1488; CM-LABEL: v4i16_arg: 1489; CM: ; %bb.0: ; %entry 1490; CM-NEXT: ALU 1, @20, KC0[], KC1[] 1491; CM-NEXT: TEX 0 @12 1492; CM-NEXT: ALU 5, @22, KC0[], KC1[] 1493; CM-NEXT: TEX 0 @14 1494; CM-NEXT: ALU 5, @28, KC0[], KC1[] 1495; CM-NEXT: TEX 0 @16 1496; CM-NEXT: ALU 5, @34, KC0[], KC1[] 1497; CM-NEXT: TEX 0 @18 1498; CM-NEXT: ALU 7, @40, KC0[CB0:0-32], KC1[] 1499; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X 1500; CM-NEXT: CF_END 1501; CM-NEXT: PAD 1502; CM-NEXT: Fetch clause starting at 12: 1503; CM-NEXT: VTX_READ_16 T6.X, T5.X, 50, #3 1504; CM-NEXT: Fetch clause starting at 14: 1505; CM-NEXT: VTX_READ_16 T6.X, T5.X, 48, #3 1506; CM-NEXT: Fetch clause starting at 16: 1507; CM-NEXT: VTX_READ_16 T6.X, T5.X, 46, #3 1508; CM-NEXT: Fetch clause starting at 18: 1509; CM-NEXT: VTX_READ_16 T5.X, T5.X, 44, #3 1510; CM-NEXT: ALU clause starting at 20: 1511; CM-NEXT: MOV * T0.Y, T3.X, 1512; CM-NEXT: MOV * T5.X, 0.0, 1513; CM-NEXT: ALU clause starting at 22: 1514; CM-NEXT: LSHL T0.Z, T6.X, literal.x, 1515; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, 1516; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 1517; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, 1518; CM-NEXT: MOV * T3.X, PV.W, 1519; CM-NEXT: MOV * T0.Y, PV.X, 1520; CM-NEXT: ALU clause starting at 28: 1521; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 1522; CM-NEXT: AND_INT * T0.W, T6.X, literal.y, 1523; CM-NEXT: -65536(nan), 65535(9.183409e-41) 1524; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 1525; CM-NEXT: MOV T3.X, PV.W, 1526; CM-NEXT: MOV * T0.Y, T2.X, 1527; CM-NEXT: ALU clause starting at 34: 1528; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 1529; CM-NEXT: LSHL * T0.W, T6.X, literal.y, 1530; CM-NEXT: 65535(9.183409e-41), 
16(2.242078e-44) 1531; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 1532; CM-NEXT: MOV * T2.X, PV.W, 1533; CM-NEXT: MOV * T0.Y, PV.X, 1534; CM-NEXT: ALU clause starting at 40: 1535; CM-NEXT: LSHR T6.X, KC0[2].Y, literal.x, 1536; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y, 1537; CM-NEXT: AND_INT * T0.W, T5.X, literal.z, 1538; CM-NEXT: 2(2.802597e-45), -65536(nan) 1539; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1540; CM-NEXT: OR_INT * T5.X, PV.Z, PV.W, 1541; CM-NEXT: MOV T2.X, PV.X, 1542; CM-NEXT: MOV * T5.Y, T3.X, 1543entry: 1544 store <4 x i16> %in, <4 x i16> addrspace(1)* %out 1545 ret void 1546} 1547 1548define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind { 1549; SI-LABEL: v4i32_arg: 1550; SI: ; %bb.0: ; %entry 1551; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1552; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1553; SI-NEXT: s_mov_b32 s3, 0xf000 1554; SI-NEXT: s_mov_b32 s2, -1 1555; SI-NEXT: s_waitcnt lgkmcnt(0) 1556; SI-NEXT: v_mov_b32_e32 v0, s4 1557; SI-NEXT: v_mov_b32_e32 v1, s5 1558; SI-NEXT: v_mov_b32_e32 v2, s6 1559; SI-NEXT: v_mov_b32_e32 v3, s7 1560; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1561; SI-NEXT: s_endpgm 1562; 1563; VI-LABEL: v4i32_arg: 1564; VI: ; %bb.0: ; %entry 1565; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1566; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 1567; VI-NEXT: s_waitcnt lgkmcnt(0) 1568; VI-NEXT: v_mov_b32_e32 v4, s4 1569; VI-NEXT: v_mov_b32_e32 v0, s0 1570; VI-NEXT: v_mov_b32_e32 v5, s5 1571; VI-NEXT: v_mov_b32_e32 v1, s1 1572; VI-NEXT: v_mov_b32_e32 v2, s2 1573; VI-NEXT: v_mov_b32_e32 v3, s3 1574; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1575; VI-NEXT: s_endpgm 1576; 1577; GFX9-LABEL: v4i32_arg: 1578; GFX9: ; %bb.0: ; %entry 1579; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 1580; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 1581; GFX9-NEXT: v_mov_b32_e32 v4, 0 1582; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1583; GFX9-NEXT: v_mov_b32_e32 v0, s0 1584; GFX9-NEXT: v_mov_b32_e32 v1, 
s1 1585; GFX9-NEXT: v_mov_b32_e32 v2, s2 1586; GFX9-NEXT: v_mov_b32_e32 v3, s3 1587; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 1588; GFX9-NEXT: s_endpgm 1589; 1590; EG-LABEL: v4i32_arg: 1591; EG: ; %bb.0: ; %entry 1592; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 1593; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 1594; EG-NEXT: CF_END 1595; EG-NEXT: PAD 1596; EG-NEXT: ALU clause starting at 4: 1597; EG-NEXT: MOV * T0.W, KC0[4].X, 1598; EG-NEXT: MOV * T0.Z, KC0[3].W, 1599; EG-NEXT: MOV * T0.Y, KC0[3].Z, 1600; EG-NEXT: MOV T0.X, KC0[3].Y, 1601; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1602; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1603; 1604; CM-LABEL: v4i32_arg: 1605; CM: ; %bb.0: ; %entry 1606; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 1607; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 1608; CM-NEXT: CF_END 1609; CM-NEXT: PAD 1610; CM-NEXT: ALU clause starting at 4: 1611; CM-NEXT: MOV * T0.W, KC0[4].X, 1612; CM-NEXT: MOV * T0.Z, KC0[3].W, 1613; CM-NEXT: MOV * T0.Y, KC0[3].Z, 1614; CM-NEXT: MOV * T0.X, KC0[3].Y, 1615; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1616; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1617entry: 1618 store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 1619 ret void 1620} 1621 1622define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind { 1623; SI-LABEL: v4f32_arg: 1624; SI: ; %bb.0: ; %entry 1625; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1626; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1627; SI-NEXT: s_mov_b32 s3, 0xf000 1628; SI-NEXT: s_mov_b32 s2, -1 1629; SI-NEXT: s_waitcnt lgkmcnt(0) 1630; SI-NEXT: v_mov_b32_e32 v0, s4 1631; SI-NEXT: v_mov_b32_e32 v1, s5 1632; SI-NEXT: v_mov_b32_e32 v2, s6 1633; SI-NEXT: v_mov_b32_e32 v3, s7 1634; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1635; SI-NEXT: s_endpgm 1636; 1637; VI-LABEL: v4f32_arg: 1638; VI: ; %bb.0: ; %entry 1639; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1640; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x34 1641; VI-NEXT: s_waitcnt lgkmcnt(0) 1642; VI-NEXT: v_mov_b32_e32 v4, s4 1643; VI-NEXT: v_mov_b32_e32 v0, s0 1644; VI-NEXT: v_mov_b32_e32 v5, s5 1645; VI-NEXT: v_mov_b32_e32 v1, s1 1646; VI-NEXT: v_mov_b32_e32 v2, s2 1647; VI-NEXT: v_mov_b32_e32 v3, s3 1648; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1649; VI-NEXT: s_endpgm 1650; 1651; GFX9-LABEL: v4f32_arg: 1652; GFX9: ; %bb.0: ; %entry 1653; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 1654; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 1655; GFX9-NEXT: v_mov_b32_e32 v4, 0 1656; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1657; GFX9-NEXT: v_mov_b32_e32 v0, s0 1658; GFX9-NEXT: v_mov_b32_e32 v1, s1 1659; GFX9-NEXT: v_mov_b32_e32 v2, s2 1660; GFX9-NEXT: v_mov_b32_e32 v3, s3 1661; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 1662; GFX9-NEXT: s_endpgm 1663; 1664; EG-LABEL: v4f32_arg: 1665; EG: ; %bb.0: ; %entry 1666; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 1667; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 1668; EG-NEXT: CF_END 1669; EG-NEXT: PAD 1670; EG-NEXT: ALU clause starting at 4: 1671; EG-NEXT: MOV * T0.W, KC0[4].X, 1672; EG-NEXT: MOV * T0.Z, KC0[3].W, 1673; EG-NEXT: MOV * T0.Y, KC0[3].Z, 1674; EG-NEXT: MOV T0.X, KC0[3].Y, 1675; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1676; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1677; 1678; CM-LABEL: v4f32_arg: 1679; CM: ; %bb.0: ; %entry 1680; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 1681; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 1682; CM-NEXT: CF_END 1683; CM-NEXT: PAD 1684; CM-NEXT: ALU clause starting at 4: 1685; CM-NEXT: MOV * T0.W, KC0[4].X, 1686; CM-NEXT: MOV * T0.Z, KC0[3].W, 1687; CM-NEXT: MOV * T0.Y, KC0[3].Z, 1688; CM-NEXT: MOV * T0.X, KC0[3].Y, 1689; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1690; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1691entry: 1692 store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4 1693 ret void 1694} 1695 1696define amdgpu_kernel void @v5i8_arg(<5 x i8> addrspace(1)* nocapture %out, <5 x i8> %in) nounwind { 
1697; SI-LABEL: v5i8_arg: 1698; SI: ; %bb.0: ; %entry 1699; SI-NEXT: s_load_dword s2, s[0:1], 0xc 1700; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1701; SI-NEXT: s_load_dword s0, s[0:1], 0xb 1702; SI-NEXT: s_mov_b32 s7, 0xf000 1703; SI-NEXT: s_mov_b32 s6, -1 1704; SI-NEXT: s_waitcnt lgkmcnt(0) 1705; SI-NEXT: v_mov_b32_e32 v0, s2 1706; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:4 1707; SI-NEXT: s_waitcnt expcnt(0) 1708; SI-NEXT: v_mov_b32_e32 v0, s0 1709; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1710; SI-NEXT: s_endpgm 1711; 1712; VI-LABEL: v5i8_arg: 1713; VI: ; %bb.0: ; %entry 1714; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1715; VI-NEXT: s_load_dword s4, s[0:1], 0x2c 1716; VI-NEXT: s_load_dword s1, s[0:1], 0x30 1717; VI-NEXT: s_waitcnt lgkmcnt(0) 1718; VI-NEXT: s_add_u32 s0, s2, 4 1719; VI-NEXT: v_mov_b32_e32 v0, s2 1720; VI-NEXT: v_mov_b32_e32 v4, s1 1721; VI-NEXT: s_addc_u32 s1, s3, 0 1722; VI-NEXT: v_mov_b32_e32 v3, s1 1723; VI-NEXT: v_mov_b32_e32 v2, s0 1724; VI-NEXT: v_mov_b32_e32 v1, s3 1725; VI-NEXT: flat_store_byte v[2:3], v4 1726; VI-NEXT: v_mov_b32_e32 v2, s4 1727; VI-NEXT: flat_store_dword v[0:1], v2 1728; VI-NEXT: s_endpgm 1729; 1730; GFX9-LABEL: v5i8_arg: 1731; GFX9: ; %bb.0: ; %entry 1732; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1733; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 1734; GFX9-NEXT: v_mov_b32_e32 v0, 0 1735; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1736; GFX9-NEXT: v_mov_b32_e32 v1, s3 1737; GFX9-NEXT: v_mov_b32_e32 v2, s2 1738; GFX9-NEXT: global_store_byte v0, v1, s[0:1] offset:4 1739; GFX9-NEXT: global_store_dword v0, v2, s[0:1] 1740; GFX9-NEXT: s_endpgm 1741; 1742; EG-LABEL: v5i8_arg: 1743; EG: ; %bb.0: ; %entry 1744; EG-NEXT: ALU 0, @16, KC0[], KC1[] 1745; EG-NEXT: TEX 4 @6 1746; EG-NEXT: ALU 28, @17, KC0[CB0:0-32], KC1[] 1747; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X 1748; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 1 1749; EG-NEXT: CF_END 1750; EG-NEXT: Fetch clause starting at 6: 1751; EG-NEXT: VTX_READ_8 T6.X, 
T5.X, 44, #3 1752; EG-NEXT: VTX_READ_8 T7.X, T5.X, 47, #3 1753; EG-NEXT: VTX_READ_8 T8.X, T5.X, 45, #3 1754; EG-NEXT: VTX_READ_8 T9.X, T5.X, 46, #3 1755; EG-NEXT: VTX_READ_8 T5.X, T5.X, 48, #3 1756; EG-NEXT: ALU clause starting at 16: 1757; EG-NEXT: MOV * T5.X, 0.0, 1758; EG-NEXT: ALU clause starting at 17: 1759; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1760; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) 1761; EG-NEXT: AND_INT T1.W, PV.W, literal.x, 1762; EG-NEXT: AND_INT * T2.W, T5.X, literal.y, 1763; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43) 1764; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 1765; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1766; EG-NEXT: LSHL T5.X, T2.W, PV.W, 1767; EG-NEXT: LSHL * T5.W, literal.x, PV.W, 1768; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1769; EG-NEXT: MOV T5.Y, 0.0, 1770; EG-NEXT: MOV T5.Z, 0.0, 1771; EG-NEXT: AND_INT T1.W, T9.X, literal.x, 1772; EG-NEXT: AND_INT * T0.Z, T8.X, literal.x, 1773; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1774; EG-NEXT: LSHL T1.W, PV.W, literal.x, 1775; EG-NEXT: LSHL * T2.W, T7.X, literal.y, 1776; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) 1777; EG-NEXT: OR_INT T1.W, PS, PV.W, 1778; EG-NEXT: LSHL * T2.W, T0.Z, literal.x, 1779; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1780; EG-NEXT: OR_INT T1.W, PV.W, PS, 1781; EG-NEXT: AND_INT * T2.W, T6.X, literal.x, 1782; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1783; EG-NEXT: OR_INT T6.X, PV.W, PS, 1784; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 1785; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1786; EG-NEXT: LSHR * T8.X, T0.W, literal.x, 1787; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1788; 1789; CM-LABEL: v5i8_arg: 1790; CM: ; %bb.0: ; %entry 1791; CM-NEXT: ALU 0, @16, KC0[], KC1[] 1792; CM-NEXT: TEX 4 @6 1793; CM-NEXT: ALU 28, @17, KC0[CB0:0-32], KC1[] 1794; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T8.X 1795; CM-NEXT: MEM_RAT MSKOR T5.XW, T7.X 1796; CM-NEXT: CF_END 1797; CM-NEXT: Fetch clause starting at 6: 1798; CM-NEXT: VTX_READ_8 T6.X, T5.X, 44, #3 
1799; CM-NEXT: VTX_READ_8 T7.X, T5.X, 47, #3 1800; CM-NEXT: VTX_READ_8 T8.X, T5.X, 45, #3 1801; CM-NEXT: VTX_READ_8 T9.X, T5.X, 46, #3 1802; CM-NEXT: VTX_READ_8 T5.X, T5.X, 48, #3 1803; CM-NEXT: ALU clause starting at 16: 1804; CM-NEXT: MOV * T5.X, 0.0, 1805; CM-NEXT: ALU clause starting at 17: 1806; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1807; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00) 1808; CM-NEXT: AND_INT * T1.W, PV.W, literal.x, 1809; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1810; CM-NEXT: AND_INT T0.Z, T5.X, literal.x, 1811; CM-NEXT: LSHL * T1.W, PV.W, literal.y, 1812; CM-NEXT: 255(3.573311e-43), 3(4.203895e-45) 1813; CM-NEXT: LSHL T5.X, PV.Z, PV.W, 1814; CM-NEXT: LSHL * T5.W, literal.x, PV.W, 1815; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1816; CM-NEXT: MOV T5.Y, 0.0, 1817; CM-NEXT: MOV T5.Z, 0.0, 1818; CM-NEXT: AND_INT * T1.W, T9.X, literal.x, 1819; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1820; CM-NEXT: AND_INT T0.Y, T8.X, literal.x, 1821; CM-NEXT: LSHL T0.Z, PV.W, literal.y, 1822; CM-NEXT: LSHL * T1.W, T7.X, literal.z, BS:VEC_120/SCL_212 1823; CM-NEXT: 255(3.573311e-43), 16(2.242078e-44) 1824; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00) 1825; CM-NEXT: OR_INT T0.Z, PV.W, PV.Z, 1826; CM-NEXT: LSHL * T1.W, PV.Y, literal.x, 1827; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1828; CM-NEXT: LSHR T7.X, T0.W, literal.x, 1829; CM-NEXT: OR_INT T0.Z, PV.Z, PV.W, 1830; CM-NEXT: AND_INT * T0.W, T6.X, literal.y, 1831; CM-NEXT: 2(2.802597e-45), 255(3.573311e-43) 1832; CM-NEXT: OR_INT * T6.X, PV.Z, PV.W, 1833; CM-NEXT: LSHR * T8.X, KC0[2].Y, literal.x, 1834; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1835entry: 1836 store <5 x i8> %in, <5 x i8> addrspace(1)* %out, align 4 1837 ret void 1838} 1839 1840define amdgpu_kernel void @v5i16_arg(<5 x i16> addrspace(1)* nocapture %out, <5 x i16> %in) nounwind { 1841; SI-LABEL: v5i16_arg: 1842; SI: ; %bb.0: ; %entry 1843; SI-NEXT: s_load_dword s2, s[0:1], 0xf 1844; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1845; SI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0xd 1846; SI-NEXT: s_mov_b32 s7, 0xf000 1847; SI-NEXT: s_mov_b32 s6, -1 1848; SI-NEXT: s_waitcnt lgkmcnt(0) 1849; SI-NEXT: v_mov_b32_e32 v0, s2 1850; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:8 1851; SI-NEXT: s_waitcnt expcnt(0) 1852; SI-NEXT: v_mov_b32_e32 v0, s0 1853; SI-NEXT: v_mov_b32_e32 v1, s1 1854; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1855; SI-NEXT: s_endpgm 1856; 1857; VI-LABEL: v5i16_arg: 1858; VI: ; %bb.0: ; %entry 1859; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1860; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 1861; VI-NEXT: s_load_dword s1, s[0:1], 0x3c 1862; VI-NEXT: s_waitcnt lgkmcnt(0) 1863; VI-NEXT: s_add_u32 s0, s2, 8 1864; VI-NEXT: v_mov_b32_e32 v0, s2 1865; VI-NEXT: v_mov_b32_e32 v4, s1 1866; VI-NEXT: s_addc_u32 s1, s3, 0 1867; VI-NEXT: v_mov_b32_e32 v3, s1 1868; VI-NEXT: v_mov_b32_e32 v2, s0 1869; VI-NEXT: flat_store_short v[2:3], v4 1870; VI-NEXT: v_mov_b32_e32 v2, s4 1871; VI-NEXT: v_mov_b32_e32 v1, s3 1872; VI-NEXT: v_mov_b32_e32 v3, s5 1873; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1874; VI-NEXT: s_endpgm 1875; 1876; GFX9-LABEL: v5i16_arg: 1877; GFX9: ; %bb.0: ; %entry 1878; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1879; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 1880; GFX9-NEXT: s_load_dword s6, s[4:5], 0x18 1881; GFX9-NEXT: v_mov_b32_e32 v2, 0 1882; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1883; GFX9-NEXT: v_mov_b32_e32 v0, s2 1884; GFX9-NEXT: v_mov_b32_e32 v3, s6 1885; GFX9-NEXT: v_mov_b32_e32 v1, s3 1886; GFX9-NEXT: global_store_short v2, v3, s[0:1] offset:8 1887; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1888; GFX9-NEXT: s_endpgm 1889; 1890; EG-LABEL: v5i16_arg: 1891; EG: ; %bb.0: ; %entry 1892; EG-NEXT: ALU 0, @20, KC0[], KC1[] 1893; EG-NEXT: TEX 4 @10 1894; EG-NEXT: ALU 65, @21, KC0[CB0:0-32], KC1[] 1895; EG-NEXT: MEM_RAT MSKOR T5.XW, T9.X 1896; EG-NEXT: MEM_RAT MSKOR T4.XW, T7.X 1897; EG-NEXT: MEM_RAT MSKOR T3.XW, T2.X 1898; EG-NEXT: MEM_RAT MSKOR T6.XW, T1.X 
1899; EG-NEXT: MEM_RAT MSKOR T8.XW, T0.X 1900; EG-NEXT: CF_END 1901; EG-NEXT: PAD 1902; EG-NEXT: Fetch clause starting at 10: 1903; EG-NEXT: VTX_READ_16 T1.X, T0.X, 58, #3 1904; EG-NEXT: VTX_READ_16 T2.X, T0.X, 56, #3 1905; EG-NEXT: VTX_READ_16 T3.X, T0.X, 54, #3 1906; EG-NEXT: VTX_READ_16 T4.X, T0.X, 52, #3 1907; EG-NEXT: VTX_READ_16 T0.X, T0.X, 60, #3 1908; EG-NEXT: ALU clause starting at 20: 1909; EG-NEXT: MOV * T0.X, 0.0, 1910; EG-NEXT: ALU clause starting at 21: 1911; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1912; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1913; EG-NEXT: AND_INT T1.W, PV.W, literal.x, 1914; EG-NEXT: AND_INT * T2.W, T0.X, literal.y, 1915; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) 1916; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 1917; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1918; EG-NEXT: LSHL T5.X, T2.W, PV.W, 1919; EG-NEXT: LSHL * T5.W, literal.x, PV.W, 1920; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1921; EG-NEXT: MOV T5.Y, 0.0, 1922; EG-NEXT: AND_INT T1.W, KC0[2].Y, literal.x, 1923; EG-NEXT: AND_INT * T2.W, T4.X, literal.y, 1924; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) 1925; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 1926; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1927; EG-NEXT: LSHL T4.X, T2.W, PV.W, 1928; EG-NEXT: LSHL * T4.W, literal.x, PV.W, 1929; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1930; EG-NEXT: MOV T4.Y, 0.0, 1931; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, 1932; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1933; EG-NEXT: AND_INT T2.W, PV.W, literal.x, 1934; EG-NEXT: AND_INT * T3.W, T3.X, literal.y, 1935; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) 1936; EG-NEXT: LSHL * T2.W, PV.W, literal.x, 1937; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1938; EG-NEXT: LSHL T3.X, T3.W, PV.W, 1939; EG-NEXT: LSHL * T3.W, literal.x, PV.W, 1940; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1941; EG-NEXT: MOV T3.Y, 0.0, 1942; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 1943; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) 1944; 
EG-NEXT: AND_INT T6.W, PV.W, literal.x, 1945; EG-NEXT: AND_INT * T7.W, T2.X, literal.y, 1946; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) 1947; EG-NEXT: LSHL * T6.W, PV.W, literal.x, 1948; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1949; EG-NEXT: LSHL T6.X, T7.W, PV.W, 1950; EG-NEXT: LSHL * T6.W, literal.x, PV.W, 1951; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1952; EG-NEXT: MOV T6.Y, 0.0, 1953; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.x, 1954; EG-NEXT: 6(8.407791e-45), 0(0.000000e+00) 1955; EG-NEXT: AND_INT T8.W, PV.W, literal.x, 1956; EG-NEXT: AND_INT * T9.W, T1.X, literal.y, 1957; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) 1958; EG-NEXT: LSHL * T8.W, PV.W, literal.x, 1959; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1960; EG-NEXT: LSHL T8.X, T9.W, PV.W, 1961; EG-NEXT: LSHL * T8.W, literal.x, PV.W, 1962; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1963; EG-NEXT: MOV T8.Y, 0.0, 1964; EG-NEXT: MOV T5.Z, 0.0, 1965; EG-NEXT: MOV * T4.Z, 0.0, 1966; EG-NEXT: MOV T3.Z, 0.0, 1967; EG-NEXT: MOV * T6.Z, 0.0, 1968; EG-NEXT: MOV * T8.Z, 0.0, 1969; EG-NEXT: LSHR T0.X, T7.W, literal.x, 1970; EG-NEXT: LSHR * T1.X, T2.W, literal.x, 1971; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1972; EG-NEXT: LSHR T2.X, T1.W, literal.x, 1973; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 1974; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1975; EG-NEXT: LSHR * T9.X, T0.W, literal.x, 1976; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1977; 1978; CM-LABEL: v5i16_arg: 1979; CM: ; %bb.0: ; %entry 1980; CM-NEXT: ALU 0, @20, KC0[], KC1[] 1981; CM-NEXT: TEX 4 @10 1982; CM-NEXT: ALU 67, @21, KC0[CB0:0-32], KC1[] 1983; CM-NEXT: MEM_RAT MSKOR T5.XW, T9.X 1984; CM-NEXT: MEM_RAT MSKOR T4.XW, T7.X 1985; CM-NEXT: MEM_RAT MSKOR T3.XW, T2.X 1986; CM-NEXT: MEM_RAT MSKOR T6.XW, T1.X 1987; CM-NEXT: MEM_RAT MSKOR T8.XW, T0.X 1988; CM-NEXT: CF_END 1989; CM-NEXT: PAD 1990; CM-NEXT: Fetch clause starting at 10: 1991; CM-NEXT: VTX_READ_16 T1.X, T0.X, 58, #3 1992; CM-NEXT: VTX_READ_16 T2.X, T0.X, 56, #3 1993; CM-NEXT: 
VTX_READ_16 T3.X, T0.X, 54, #3 1994; CM-NEXT: VTX_READ_16 T4.X, T0.X, 52, #3 1995; CM-NEXT: VTX_READ_16 T0.X, T0.X, 60, #3 1996; CM-NEXT: ALU clause starting at 20: 1997; CM-NEXT: MOV * T0.X, 0.0, 1998; CM-NEXT: ALU clause starting at 21: 1999; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 2000; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 2001; CM-NEXT: AND_INT * T1.W, PV.W, literal.x, 2002; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 2003; CM-NEXT: AND_INT T0.Z, T0.X, literal.x, 2004; CM-NEXT: LSHL * T1.W, PV.W, literal.y, 2005; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 2006; CM-NEXT: LSHL T5.X, PV.Z, PV.W, 2007; CM-NEXT: LSHL * T5.W, literal.x, PV.W, 2008; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 2009; CM-NEXT: MOV T5.Y, 0.0, 2010; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 2011; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 2012; CM-NEXT: AND_INT T0.Z, T4.X, literal.x, 2013; CM-NEXT: LSHL * T1.W, PV.W, literal.y, 2014; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 2015; CM-NEXT: LSHL T4.X, PV.Z, PV.W, 2016; CM-NEXT: LSHL * T4.W, literal.x, PV.W, 2017; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 2018; CM-NEXT: MOV T4.Y, 0.0, 2019; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, 2020; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2021; CM-NEXT: AND_INT * T2.W, PV.W, literal.x, 2022; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 2023; CM-NEXT: AND_INT T0.Z, T3.X, literal.x, 2024; CM-NEXT: LSHL * T2.W, PV.W, literal.y, 2025; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 2026; CM-NEXT: LSHL T3.X, PV.Z, PV.W, 2027; CM-NEXT: LSHL * T3.W, literal.x, PV.W, 2028; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 2029; CM-NEXT: MOV T3.Y, 0.0, 2030; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 2031; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00) 2032; CM-NEXT: AND_INT * T6.W, PV.W, literal.x, 2033; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 2034; CM-NEXT: AND_INT T0.Z, T2.X, literal.x, 2035; CM-NEXT: LSHL * T6.W, PV.W, literal.y, 2036; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 2037; 
CM-NEXT: LSHL T6.X, PV.Z, PV.W, 2038; CM-NEXT: LSHL * T6.W, literal.x, PV.W, 2039; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 2040; CM-NEXT: MOV T6.Y, 0.0, 2041; CM-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.x, 2042; CM-NEXT: 6(8.407791e-45), 0(0.000000e+00) 2043; CM-NEXT: AND_INT * T8.W, PV.W, literal.x, 2044; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 2045; CM-NEXT: AND_INT T0.Z, T1.X, literal.x, 2046; CM-NEXT: LSHL * T8.W, PV.W, literal.y, 2047; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 2048; CM-NEXT: LSHL T8.X, PV.Z, PV.W, 2049; CM-NEXT: LSHL * T8.W, literal.x, PV.W, 2050; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 2051; CM-NEXT: MOV T8.Y, 0.0, 2052; CM-NEXT: MOV * T5.Z, 0.0, 2053; CM-NEXT: MOV * T4.Z, 0.0, 2054; CM-NEXT: MOV * T3.Z, 0.0, 2055; CM-NEXT: MOV * T6.Z, 0.0, 2056; CM-NEXT: MOV * T8.Z, 0.0, 2057; CM-NEXT: LSHR * T0.X, T7.W, literal.x, 2058; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2059; CM-NEXT: LSHR * T1.X, T2.W, literal.x, 2060; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2061; CM-NEXT: LSHR * T2.X, T1.W, literal.x, 2062; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2063; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 2064; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2065; CM-NEXT: LSHR * T9.X, T0.W, literal.x, 2066; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2067entry: 2068 store <5 x i16> %in, <5 x i16> addrspace(1)* %out, align 4 2069 ret void 2070} 2071 ; Kernel taking a <5 x i32> by-value argument and storing it unchanged through the addrspace(1) pointer %out (align 4). The current amdgcn checks expect a dwordx4 store at offset 0 plus a dword store at byte offset 16. 2072define amdgpu_kernel void @v5i32_arg(<5 x i32> addrspace(1)* nocapture %out, <5 x i32> %in) nounwind { 2073; SI-LABEL: v5i32_arg: 2074; SI: ; %bb.0: ; %entry 2075; SI-NEXT: s_load_dword s8, s[0:1], 0x15 2076; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2077; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 2078; SI-NEXT: s_mov_b32 s7, 0xf000 2079; SI-NEXT: s_mov_b32 s6, -1 2080; SI-NEXT: s_waitcnt lgkmcnt(0) 2081; SI-NEXT: v_mov_b32_e32 v0, s8 2082; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16 2083; SI-NEXT: s_waitcnt expcnt(0) 2084; SI-NEXT: v_mov_b32_e32 v0, s0 2085; SI-NEXT:
v_mov_b32_e32 v1, s1 2086; SI-NEXT: v_mov_b32_e32 v2, s2 2087; SI-NEXT: v_mov_b32_e32 v3, s3 2088; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 2089; SI-NEXT: s_endpgm 2090; 2091; VI-LABEL: v5i32_arg: 2092; VI: ; %bb.0: ; %entry 2093; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 2094; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 2095; VI-NEXT: s_load_dword s1, s[0:1], 0x54 2096; VI-NEXT: s_waitcnt lgkmcnt(0) 2097; VI-NEXT: s_add_u32 s0, s2, 16 2098; VI-NEXT: v_mov_b32_e32 v5, s3 2099; VI-NEXT: v_mov_b32_e32 v2, s1 2100; VI-NEXT: s_addc_u32 s1, s3, 0 2101; VI-NEXT: v_mov_b32_e32 v0, s0 2102; VI-NEXT: v_mov_b32_e32 v1, s1 2103; VI-NEXT: flat_store_dword v[0:1], v2 2104; VI-NEXT: v_mov_b32_e32 v0, s4 2105; VI-NEXT: v_mov_b32_e32 v4, s2 2106; VI-NEXT: v_mov_b32_e32 v1, s5 2107; VI-NEXT: v_mov_b32_e32 v2, s6 2108; VI-NEXT: v_mov_b32_e32 v3, s7 2109; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2110; VI-NEXT: s_endpgm 2111; 2112; GFX9-LABEL: v5i32_arg: 2113; GFX9: ; %bb.0: ; %entry 2114; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 2115; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 2116; GFX9-NEXT: s_load_dword s8, s[4:5], 0x30 2117; GFX9-NEXT: v_mov_b32_e32 v4, 0 2118; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2119; GFX9-NEXT: v_mov_b32_e32 v0, s0 2120; GFX9-NEXT: v_mov_b32_e32 v5, s8 2121; GFX9-NEXT: v_mov_b32_e32 v1, s1 2122; GFX9-NEXT: v_mov_b32_e32 v2, s2 2123; GFX9-NEXT: v_mov_b32_e32 v3, s3 2124; GFX9-NEXT: global_store_dword v4, v5, s[6:7] offset:16 2125; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 2126; GFX9-NEXT: s_endpgm 2127; 2128; EG-LABEL: v5i32_arg: 2129; EG: ; %bb.0: ; %entry 2130; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[] 2131; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 2132; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 2133; EG-NEXT: CF_END 2134; EG-NEXT: ALU clause starting at 4: 2135; EG-NEXT: MOV * T0.W, KC0[5].X, 2136; EG-NEXT: MOV * T0.Z, KC0[4].W, 2137; EG-NEXT: MOV * T0.Y, KC0[4].Z, 2138; EG-NEXT: MOV T0.X, KC0[4].Y, 2139; 
EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 2140; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2141; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, 2142; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2143; EG-NEXT: LSHR T2.X, PV.W, literal.x, 2144; EG-NEXT: MOV * T3.X, KC0[5].Y, 2145; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2146; 2147; CM-LABEL: v5i32_arg: 2148; CM: ; %bb.0: ; %entry 2149; CM-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[] 2150; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X 2151; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X 2152; CM-NEXT: CF_END 2153; CM-NEXT: ALU clause starting at 4: 2154; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x, 2155; CM-NEXT: MOV * T0.W, KC0[5].X, 2156; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2157; CM-NEXT: LSHR T1.X, PV.Z, literal.x, 2158; CM-NEXT: MOV * T0.Z, KC0[4].W, 2159; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2160; CM-NEXT: MOV T2.X, KC0[5].Y, 2161; CM-NEXT: MOV * T0.Y, KC0[4].Z, 2162; CM-NEXT: MOV * T0.X, KC0[4].Y, 2163; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 2164; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2165entry: 2166 store <5 x i32> %in, <5 x i32> addrspace(1)* %out, align 4 2167 ret void 2168} 2169 ; Kernel taking a <5 x float> by-value argument and storing it unchanged through the addrspace(1) pointer %out (align 4). The current amdgcn checks expect a dwordx4 store at offset 0 plus a dword store at byte offset 16. 2170define amdgpu_kernel void @v5f32_arg(<5 x float> addrspace(1)* nocapture %out, <5 x float> %in) nounwind { 2171; SI-LABEL: v5f32_arg: 2172; SI: ; %bb.0: ; %entry 2173; SI-NEXT: s_load_dword s8, s[0:1], 0x15 2174; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2175; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 2176; SI-NEXT: s_mov_b32 s7, 0xf000 2177; SI-NEXT: s_mov_b32 s6, -1 2178; SI-NEXT: s_waitcnt lgkmcnt(0) 2179; SI-NEXT: v_mov_b32_e32 v0, s8 2180; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16 2181; SI-NEXT: s_waitcnt expcnt(0) 2182; SI-NEXT: v_mov_b32_e32 v0, s0 2183; SI-NEXT: v_mov_b32_e32 v1, s1 2184; SI-NEXT: v_mov_b32_e32 v2, s2 2185; SI-NEXT: v_mov_b32_e32 v3, s3 2186; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 2187; SI-NEXT: s_endpgm 2188; 2189; VI-LABEL: v5f32_arg: 2190; VI: ;
%bb.0: ; %entry 2191; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 2192; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 2193; VI-NEXT: s_load_dword s1, s[0:1], 0x54 2194; VI-NEXT: s_waitcnt lgkmcnt(0) 2195; VI-NEXT: s_add_u32 s0, s2, 16 2196; VI-NEXT: v_mov_b32_e32 v5, s3 2197; VI-NEXT: v_mov_b32_e32 v3, s1 2198; VI-NEXT: s_addc_u32 s1, s3, 0 2199; VI-NEXT: v_mov_b32_e32 v2, s1 2200; VI-NEXT: v_mov_b32_e32 v1, s0 2201; VI-NEXT: v_mov_b32_e32 v0, s4 2202; VI-NEXT: flat_store_dword v[1:2], v3 2203; VI-NEXT: v_mov_b32_e32 v1, s5 2204; VI-NEXT: v_mov_b32_e32 v2, s6 2205; VI-NEXT: v_mov_b32_e32 v3, s7 2206; VI-NEXT: v_mov_b32_e32 v4, s2 2207; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2208; VI-NEXT: s_endpgm 2209; 2210; GFX9-LABEL: v5f32_arg: 2211; GFX9: ; %bb.0: ; %entry 2212; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 2213; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 2214; GFX9-NEXT: s_load_dword s8, s[4:5], 0x30 2215; GFX9-NEXT: v_mov_b32_e32 v4, 0 2216; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2217; GFX9-NEXT: v_mov_b32_e32 v0, s0 2218; GFX9-NEXT: v_mov_b32_e32 v1, s1 2219; GFX9-NEXT: v_mov_b32_e32 v2, s2 2220; GFX9-NEXT: v_mov_b32_e32 v3, s3 2221; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 2222; GFX9-NEXT: s_nop 0 2223; GFX9-NEXT: v_mov_b32_e32 v0, s8 2224; GFX9-NEXT: global_store_dword v4, v0, s[6:7] offset:16 2225; GFX9-NEXT: s_endpgm 2226; 2227; EG-LABEL: v5f32_arg: 2228; EG: ; %bb.0: ; %entry 2229; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[] 2230; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 2231; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 2232; EG-NEXT: CF_END 2233; EG-NEXT: ALU clause starting at 4: 2234; EG-NEXT: MOV * T0.W, KC0[5].X, 2235; EG-NEXT: MOV * T0.Z, KC0[4].W, 2236; EG-NEXT: MOV * T0.Y, KC0[4].Z, 2237; EG-NEXT: MOV T0.X, KC0[4].Y, 2238; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 2239; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2240; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, 2241; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 
2242; EG-NEXT: LSHR T2.X, PV.W, literal.x, 2243; EG-NEXT: MOV * T3.X, KC0[5].Y, 2244; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2245; 2246; CM-LABEL: v5f32_arg: 2247; CM: ; %bb.0: ; %entry 2248; CM-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[] 2249; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X 2250; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X 2251; CM-NEXT: CF_END 2252; CM-NEXT: ALU clause starting at 4: 2253; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x, 2254; CM-NEXT: MOV * T0.W, KC0[5].X, 2255; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2256; CM-NEXT: LSHR T1.X, PV.Z, literal.x, 2257; CM-NEXT: MOV * T0.Z, KC0[4].W, 2258; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2259; CM-NEXT: MOV T2.X, KC0[5].Y, 2260; CM-NEXT: MOV * T0.Y, KC0[4].Z, 2261; CM-NEXT: MOV * T0.X, KC0[4].Y, 2262; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 2263; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2264entry: 2265 store <5 x float> %in, <5 x float> addrspace(1)* %out, align 4 2266 ret void 2267} 2268 ; Kernel taking a <5 x i64> by-value argument and storing it unchanged through the addrspace(1) pointer %out (align 8). The current amdgcn checks expect dwordx4 stores at byte offsets 0 and 16 plus a dwordx2 store at byte offset 32. 2269define amdgpu_kernel void @v5i64_arg(<5 x i64> addrspace(1)* nocapture %out, <5 x i64> %in) nounwind { 2270; SI-LABEL: v5i64_arg: 2271; SI: ; %bb.0: ; %entry 2272; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x19 2273; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 2274; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x21 2275; SI-NEXT: s_mov_b32 s15, 0xf000 2276; SI-NEXT: s_mov_b32 s14, -1 2277; SI-NEXT: s_waitcnt lgkmcnt(0) 2278; SI-NEXT: v_mov_b32_e32 v0, s8 2279; SI-NEXT: v_mov_b32_e32 v1, s9 2280; SI-NEXT: v_mov_b32_e32 v2, s10 2281; SI-NEXT: v_mov_b32_e32 v3, s11 2282; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 2283; SI-NEXT: s_waitcnt expcnt(0) 2284; SI-NEXT: v_mov_b32_e32 v0, s4 2285; SI-NEXT: v_mov_b32_e32 v1, s5 2286; SI-NEXT: v_mov_b32_e32 v2, s6 2287; SI-NEXT: v_mov_b32_e32 v3, s7 2288; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 2289; SI-NEXT: s_waitcnt expcnt(0) 2290; SI-NEXT: v_mov_b32_e32 v0, s0 2291; SI-NEXT: v_mov_b32_e32 v1, s1 2292; SI-NEXT:
buffer_store_dwordx2 v[0:1], off, s[12:15], 0 offset:32 2293; SI-NEXT: s_endpgm 2294; 2295; VI-LABEL: v5i64_arg: 2296; VI: ; %bb.0: ; %entry 2297; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 2298; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64 2299; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x84 2300; VI-NEXT: s_waitcnt lgkmcnt(0) 2301; VI-NEXT: v_mov_b32_e32 v0, s8 2302; VI-NEXT: s_add_u32 s8, s2, 16 2303; VI-NEXT: v_mov_b32_e32 v1, s9 2304; VI-NEXT: s_addc_u32 s9, s3, 0 2305; VI-NEXT: v_mov_b32_e32 v4, s8 2306; VI-NEXT: v_mov_b32_e32 v2, s10 2307; VI-NEXT: v_mov_b32_e32 v3, s11 2308; VI-NEXT: v_mov_b32_e32 v5, s9 2309; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2310; VI-NEXT: v_mov_b32_e32 v5, s3 2311; VI-NEXT: v_mov_b32_e32 v0, s4 2312; VI-NEXT: v_mov_b32_e32 v1, s5 2313; VI-NEXT: v_mov_b32_e32 v2, s6 2314; VI-NEXT: v_mov_b32_e32 v3, s7 2315; VI-NEXT: v_mov_b32_e32 v4, s2 2316; VI-NEXT: s_add_u32 s2, s2, 32 2317; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2318; VI-NEXT: s_addc_u32 s3, s3, 0 2319; VI-NEXT: v_mov_b32_e32 v2, s2 2320; VI-NEXT: v_mov_b32_e32 v0, s0 2321; VI-NEXT: v_mov_b32_e32 v1, s1 2322; VI-NEXT: v_mov_b32_e32 v3, s3 2323; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 2324; VI-NEXT: s_endpgm 2325; 2326; GFX9-LABEL: v5i64_arg: 2327; GFX9: ; %bb.0: ; %entry 2328; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 2329; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 2330; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x60 2331; GFX9-NEXT: v_mov_b32_e32 v4, 0 2332; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2333; GFX9-NEXT: v_mov_b32_e32 v0, s12 2334; GFX9-NEXT: v_mov_b32_e32 v1, s13 2335; GFX9-NEXT: v_mov_b32_e32 v2, s14 2336; GFX9-NEXT: v_mov_b32_e32 v3, s15 2337; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 2338; GFX9-NEXT: s_nop 0 2339; GFX9-NEXT: v_mov_b32_e32 v0, s8 2340; GFX9-NEXT: v_mov_b32_e32 v1, s9 2341; GFX9-NEXT: v_mov_b32_e32 v2, s10 2342; GFX9-NEXT: v_mov_b32_e32 v3, s11 2343; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 2344; GFX9-NEXT: 
s_nop 0 2345; GFX9-NEXT: v_mov_b32_e32 v0, s0 2346; GFX9-NEXT: v_mov_b32_e32 v1, s1 2347; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] offset:32 2348; GFX9-NEXT: s_endpgm 2349; 2350; EG-LABEL: v5i64_arg: 2351; EG: ; %bb.0: ; %entry 2352; EG-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[] 2353; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 0 2354; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0 2355; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 2356; EG-NEXT: CF_END 2357; EG-NEXT: PAD 2358; EG-NEXT: ALU clause starting at 6: 2359; EG-NEXT: MOV * T0.W, KC0[7].X, 2360; EG-NEXT: MOV * T0.Z, KC0[6].W, 2361; EG-NEXT: MOV T0.Y, KC0[6].Z, 2362; EG-NEXT: MOV * T1.W, KC0[8].X, 2363; EG-NEXT: MOV T0.X, KC0[6].Y, 2364; EG-NEXT: MOV * T1.Z, KC0[7].W, 2365; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 2366; EG-NEXT: MOV * T1.Y, KC0[7].Z, 2367; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2368; EG-NEXT: MOV T1.X, KC0[7].Y, 2369; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 2370; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2371; EG-NEXT: LSHR T3.X, PV.W, literal.x, 2372; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, 2373; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44) 2374; EG-NEXT: LSHR T4.X, PV.W, literal.x, 2375; EG-NEXT: MOV T5.Y, KC0[8].Z, 2376; EG-NEXT: MOV * T5.X, KC0[8].Y, 2377; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2378; 2379; CM-LABEL: v5i64_arg: 2380; CM: ; %bb.0: ; %entry 2381; CM-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[] 2382; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X 2383; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T4.X 2384; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X 2385; CM-NEXT: CF_END 2386; CM-NEXT: PAD 2387; CM-NEXT: ALU clause starting at 6: 2388; CM-NEXT: MOV * T0.W, KC0[8].X, 2389; CM-NEXT: MOV T1.Y, KC0[8].Z, 2390; CM-NEXT: MOV * T0.Z, KC0[7].W, 2391; CM-NEXT: MOV T1.X, KC0[8].Y, 2392; CM-NEXT: MOV * T0.Y, KC0[7].Z, 2393; CM-NEXT: MOV T0.X, KC0[7].Y, 2394; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x, 2395; CM-NEXT: MOV * T2.W, KC0[7].X, 
2396; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00) 2397; CM-NEXT: LSHR T3.X, PV.Z, literal.x, 2398; CM-NEXT: MOV T2.Z, KC0[6].W, 2399; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, 2400; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) 2401; CM-NEXT: LSHR T4.X, PV.W, literal.x, 2402; CM-NEXT: MOV * T2.Y, KC0[6].Z, 2403; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2404; CM-NEXT: MOV * T2.X, KC0[6].Y, 2405; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 2406; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2407entry: 2408 store <5 x i64> %in, <5 x i64> addrspace(1)* %out, align 8 2409 ret void 2410} 2411 ; Kernel taking a <5 x double> by-value argument and storing it unchanged through the addrspace(1) pointer %out (align 8). The current amdgcn checks expect dwordx4 stores at byte offsets 0 and 16 plus a dwordx2 store at byte offset 32. 2412define amdgpu_kernel void @v5f64_arg(<5 x double> addrspace(1)* nocapture %out, <5 x double> %in) nounwind { 2413; SI-LABEL: v5f64_arg: 2414; SI: ; %bb.0: ; %entry 2415; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x19 2416; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 2417; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x21 2418; SI-NEXT: s_mov_b32 s15, 0xf000 2419; SI-NEXT: s_mov_b32 s14, -1 2420; SI-NEXT: s_waitcnt lgkmcnt(0) 2421; SI-NEXT: v_mov_b32_e32 v0, s8 2422; SI-NEXT: v_mov_b32_e32 v1, s9 2423; SI-NEXT: v_mov_b32_e32 v2, s10 2424; SI-NEXT: v_mov_b32_e32 v3, s11 2425; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 2426; SI-NEXT: s_waitcnt expcnt(0) 2427; SI-NEXT: v_mov_b32_e32 v0, s4 2428; SI-NEXT: v_mov_b32_e32 v1, s5 2429; SI-NEXT: v_mov_b32_e32 v2, s6 2430; SI-NEXT: v_mov_b32_e32 v3, s7 2431; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 2432; SI-NEXT: s_waitcnt expcnt(0) 2433; SI-NEXT: v_mov_b32_e32 v0, s0 2434; SI-NEXT: v_mov_b32_e32 v1, s1 2435; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 offset:32 2436; SI-NEXT: s_endpgm 2437; 2438; VI-LABEL: v5f64_arg: 2439; VI: ; %bb.0: ; %entry 2440; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 2441; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64 2442; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x84 2443; VI-NEXT: s_waitcnt lgkmcnt(0) 2444; VI-NEXT: v_mov_b32_e32 v0, s8 2445; VI-NEXT: s_add_u32 s8, s2,
16 2446; VI-NEXT: v_mov_b32_e32 v1, s9 2447; VI-NEXT: s_addc_u32 s9, s3, 0 2448; VI-NEXT: v_mov_b32_e32 v4, s8 2449; VI-NEXT: v_mov_b32_e32 v2, s10 2450; VI-NEXT: v_mov_b32_e32 v3, s11 2451; VI-NEXT: v_mov_b32_e32 v5, s9 2452; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2453; VI-NEXT: v_mov_b32_e32 v5, s3 2454; VI-NEXT: v_mov_b32_e32 v0, s4 2455; VI-NEXT: v_mov_b32_e32 v1, s5 2456; VI-NEXT: v_mov_b32_e32 v2, s6 2457; VI-NEXT: v_mov_b32_e32 v3, s7 2458; VI-NEXT: v_mov_b32_e32 v4, s2 2459; VI-NEXT: s_add_u32 s2, s2, 32 2460; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2461; VI-NEXT: s_addc_u32 s3, s3, 0 2462; VI-NEXT: v_mov_b32_e32 v2, s2 2463; VI-NEXT: v_mov_b32_e32 v0, s0 2464; VI-NEXT: v_mov_b32_e32 v1, s1 2465; VI-NEXT: v_mov_b32_e32 v3, s3 2466; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 2467; VI-NEXT: s_endpgm 2468; 2469; GFX9-LABEL: v5f64_arg: 2470; GFX9: ; %bb.0: ; %entry 2471; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 2472; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 2473; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x60 2474; GFX9-NEXT: v_mov_b32_e32 v4, 0 2475; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2476; GFX9-NEXT: v_mov_b32_e32 v0, s12 2477; GFX9-NEXT: v_mov_b32_e32 v1, s13 2478; GFX9-NEXT: v_mov_b32_e32 v2, s14 2479; GFX9-NEXT: v_mov_b32_e32 v3, s15 2480; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 2481; GFX9-NEXT: s_nop 0 2482; GFX9-NEXT: v_mov_b32_e32 v0, s8 2483; GFX9-NEXT: v_mov_b32_e32 v1, s9 2484; GFX9-NEXT: v_mov_b32_e32 v2, s10 2485; GFX9-NEXT: v_mov_b32_e32 v3, s11 2486; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 2487; GFX9-NEXT: s_nop 0 2488; GFX9-NEXT: v_mov_b32_e32 v0, s0 2489; GFX9-NEXT: v_mov_b32_e32 v1, s1 2490; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] offset:32 2491; GFX9-NEXT: s_endpgm 2492; 2493; EG-LABEL: v5f64_arg: 2494; EG: ; %bb.0: ; %entry 2495; EG-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[] 2496; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 0 2497; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, 
T3.X, 0 2498; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 2499; EG-NEXT: CF_END 2500; EG-NEXT: PAD 2501; EG-NEXT: ALU clause starting at 6: 2502; EG-NEXT: MOV * T0.W, KC0[7].X, 2503; EG-NEXT: MOV * T0.Z, KC0[6].W, 2504; EG-NEXT: MOV T0.Y, KC0[6].Z, 2505; EG-NEXT: MOV * T1.W, KC0[8].X, 2506; EG-NEXT: MOV T0.X, KC0[6].Y, 2507; EG-NEXT: MOV * T1.Z, KC0[7].W, 2508; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 2509; EG-NEXT: MOV * T1.Y, KC0[7].Z, 2510; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2511; EG-NEXT: MOV T1.X, KC0[7].Y, 2512; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 2513; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2514; EG-NEXT: LSHR T3.X, PV.W, literal.x, 2515; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, 2516; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44) 2517; EG-NEXT: LSHR T4.X, PV.W, literal.x, 2518; EG-NEXT: MOV T5.Y, KC0[8].Z, 2519; EG-NEXT: MOV * T5.X, KC0[8].Y, 2520; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2521; 2522; CM-LABEL: v5f64_arg: 2523; CM: ; %bb.0: ; %entry 2524; CM-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[] 2525; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X 2526; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T4.X 2527; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X 2528; CM-NEXT: CF_END 2529; CM-NEXT: PAD 2530; CM-NEXT: ALU clause starting at 6: 2531; CM-NEXT: MOV * T0.W, KC0[8].X, 2532; CM-NEXT: MOV T1.Y, KC0[8].Z, 2533; CM-NEXT: MOV * T0.Z, KC0[7].W, 2534; CM-NEXT: MOV T1.X, KC0[8].Y, 2535; CM-NEXT: MOV * T0.Y, KC0[7].Z, 2536; CM-NEXT: MOV T0.X, KC0[7].Y, 2537; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x, 2538; CM-NEXT: MOV * T2.W, KC0[7].X, 2539; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00) 2540; CM-NEXT: LSHR T3.X, PV.Z, literal.x, 2541; CM-NEXT: MOV T2.Z, KC0[6].W, 2542; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, 2543; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) 2544; CM-NEXT: LSHR T4.X, PV.W, literal.x, 2545; CM-NEXT: MOV * T2.Y, KC0[6].Z, 2546; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2547; CM-NEXT: MOV * T2.X, KC0[6].Y, 2548; 
CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 2549; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2550entry: 2551 store <5 x double> %in, <5 x double> addrspace(1)* %out, align 8 2552 ret void 2553} 2554 2555; FIXME: Lots of unpack and re-pack junk on EG and CM (r600), which read the argument one byte at a time and rebuild it with AND/LSHL/OR. The VI output this note originally referred to is now a plain dwordx2 load and store. 2556define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) { 2557; SI-LABEL: v8i8_arg: 2558; SI: ; %bb.0: ; %entry 2559; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 2560; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2561; SI-NEXT: s_mov_b32 s3, 0xf000 2562; SI-NEXT: s_mov_b32 s2, -1 2563; SI-NEXT: s_waitcnt lgkmcnt(0) 2564; SI-NEXT: v_mov_b32_e32 v0, s4 2565; SI-NEXT: v_mov_b32_e32 v1, s5 2566; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2567; SI-NEXT: s_endpgm 2568; 2569; VI-LABEL: v8i8_arg: 2570; VI: ; %bb.0: ; %entry 2571; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 2572; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 2573; VI-NEXT: s_waitcnt lgkmcnt(0) 2574; VI-NEXT: v_mov_b32_e32 v0, s2 2575; VI-NEXT: v_mov_b32_e32 v3, s1 2576; VI-NEXT: v_mov_b32_e32 v1, s3 2577; VI-NEXT: v_mov_b32_e32 v2, s0 2578; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 2579; VI-NEXT: s_endpgm 2580; 2581; GFX9-LABEL: v8i8_arg: 2582; GFX9: ; %bb.0: ; %entry 2583; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 2584; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 2585; GFX9-NEXT: v_mov_b32_e32 v2, 0 2586; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2587; GFX9-NEXT: v_mov_b32_e32 v0, s2 2588; GFX9-NEXT: v_mov_b32_e32 v1, s3 2589; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 2590; GFX9-NEXT: s_endpgm 2591; 2592; EG-LABEL: v8i8_arg: 2593; EG: ; %bb.0: ; %entry 2594; EG-NEXT: ALU 1, @36, KC0[], KC1[] 2595; EG-NEXT: TEX 0 @20 2596; EG-NEXT: ALU 5, @38, KC0[], KC1[] 2597; EG-NEXT: TEX 0 @22 2598; EG-NEXT: ALU 5, @44, KC0[], KC1[] 2599; EG-NEXT: TEX 0 @24 2600; EG-NEXT: ALU 7, @50, KC0[], KC1[] 2601; EG-NEXT: TEX 0 @26 2602; EG-NEXT: ALU 7, @58, KC0[], KC1[] 2603; EG-NEXT: TEX 0 @28 2604; EG-NEXT: ALU 7, @66, KC0[], KC1[] 2605;
EG-NEXT: TEX 0 @30 2606; EG-NEXT: ALU 7, @74, KC0[], KC1[] 2607; EG-NEXT: TEX 0 @32 2608; EG-NEXT: ALU 5, @82, KC0[], KC1[] 2609; EG-NEXT: TEX 0 @34 2610; EG-NEXT: ALU 5, @88, KC0[CB0:0-32], KC1[] 2611; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1 2612; EG-NEXT: CF_END 2613; EG-NEXT: PAD 2614; EG-NEXT: Fetch clause starting at 20: 2615; EG-NEXT: VTX_READ_8 T6.X, T5.X, 51, #3 2616; EG-NEXT: Fetch clause starting at 22: 2617; EG-NEXT: VTX_READ_8 T6.X, T5.X, 47, #3 2618; EG-NEXT: Fetch clause starting at 24: 2619; EG-NEXT: VTX_READ_8 T6.X, T5.X, 50, #3 2620; EG-NEXT: Fetch clause starting at 26: 2621; EG-NEXT: VTX_READ_8 T6.X, T5.X, 46, #3 2622; EG-NEXT: Fetch clause starting at 28: 2623; EG-NEXT: VTX_READ_8 T6.X, T5.X, 49, #3 2624; EG-NEXT: Fetch clause starting at 30: 2625; EG-NEXT: VTX_READ_8 T6.X, T5.X, 45, #3 2626; EG-NEXT: Fetch clause starting at 32: 2627; EG-NEXT: VTX_READ_8 T6.X, T5.X, 48, #3 2628; EG-NEXT: Fetch clause starting at 34: 2629; EG-NEXT: VTX_READ_8 T5.X, T5.X, 44, #3 2630; EG-NEXT: ALU clause starting at 36: 2631; EG-NEXT: MOV * T0.Y, T2.X, 2632; EG-NEXT: MOV * T5.X, 0.0, 2633; EG-NEXT: ALU clause starting at 38: 2634; EG-NEXT: LSHL T0.W, T6.X, literal.x, 2635; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 2636; EG-NEXT: 24(3.363116e-44), 16777215(2.350989e-38) 2637; EG-NEXT: OR_INT * T0.W, PS, PV.W, 2638; EG-NEXT: MOV T2.X, PV.W, 2639; EG-NEXT: MOV * T0.Y, T3.X, 2640; EG-NEXT: ALU clause starting at 44: 2641; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2642; EG-NEXT: LSHL * T1.W, T6.X, literal.y, 2643; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 2644; EG-NEXT: OR_INT * T0.W, PV.W, PS, 2645; EG-NEXT: MOV T3.X, PV.W, 2646; EG-NEXT: MOV * T0.Y, T2.X, 2647; EG-NEXT: ALU clause starting at 50: 2648; EG-NEXT: AND_INT T0.W, T6.X, literal.x, 2649; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 2650; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) 2651; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 2652; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2653; 
EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 2654; EG-NEXT: MOV T2.X, PV.W, 2655; EG-NEXT: MOV * T0.Y, T3.X, 2656; EG-NEXT: ALU clause starting at 58: 2657; EG-NEXT: AND_INT T0.W, T6.X, literal.x, 2658; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 2659; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) 2660; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 2661; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2662; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 2663; EG-NEXT: MOV T3.X, PV.W, 2664; EG-NEXT: MOV * T0.Y, T2.X, 2665; EG-NEXT: ALU clause starting at 66: 2666; EG-NEXT: AND_INT T0.W, T6.X, literal.x, 2667; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 2668; EG-NEXT: 255(3.573311e-43), -65281(nan) 2669; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 2670; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 2671; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 2672; EG-NEXT: MOV T2.X, PV.W, 2673; EG-NEXT: MOV * T0.Y, T3.X, 2674; EG-NEXT: ALU clause starting at 74: 2675; EG-NEXT: AND_INT T0.W, T6.X, literal.x, 2676; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 2677; EG-NEXT: 255(3.573311e-43), -65281(nan) 2678; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 2679; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 2680; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 2681; EG-NEXT: MOV T3.X, PV.W, 2682; EG-NEXT: MOV * T0.Y, T2.X, 2683; EG-NEXT: ALU clause starting at 82: 2684; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2685; EG-NEXT: AND_INT * T1.W, T6.X, literal.y, 2686; EG-NEXT: -256(nan), 255(3.573311e-43) 2687; EG-NEXT: OR_INT * T5.Y, PV.W, PS, 2688; EG-NEXT: MOV T2.X, PV.Y, 2689; EG-NEXT: MOV * T0.Y, T3.X, 2690; EG-NEXT: ALU clause starting at 88: 2691; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2692; EG-NEXT: AND_INT * T1.W, T5.X, literal.y, 2693; EG-NEXT: -256(nan), 255(3.573311e-43) 2694; EG-NEXT: OR_INT T5.X, PV.W, PS, 2695; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x, 2696; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2697; 2698; CM-LABEL: v8i8_arg: 2699; CM: ; %bb.0: ; %entry 2700; CM-NEXT: ALU 1, @36, KC0[], KC1[] 2701; CM-NEXT: TEX 0 @20 2702; CM-NEXT: ALU 
5, @38, KC0[], KC1[] 2703; CM-NEXT: TEX 0 @22 2704; CM-NEXT: ALU 5, @44, KC0[], KC1[] 2705; CM-NEXT: TEX 0 @24 2706; CM-NEXT: ALU 7, @50, KC0[], KC1[] 2707; CM-NEXT: TEX 0 @26 2708; CM-NEXT: ALU 7, @58, KC0[], KC1[] 2709; CM-NEXT: TEX 0 @28 2710; CM-NEXT: ALU 7, @66, KC0[], KC1[] 2711; CM-NEXT: TEX 0 @30 2712; CM-NEXT: ALU 7, @74, KC0[], KC1[] 2713; CM-NEXT: TEX 0 @32 2714; CM-NEXT: ALU 5, @82, KC0[], KC1[] 2715; CM-NEXT: TEX 0 @34 2716; CM-NEXT: ALU 5, @88, KC0[CB0:0-32], KC1[] 2717; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X 2718; CM-NEXT: CF_END 2719; CM-NEXT: PAD 2720; CM-NEXT: Fetch clause starting at 20: 2721; CM-NEXT: VTX_READ_8 T6.X, T5.X, 51, #3 2722; CM-NEXT: Fetch clause starting at 22: 2723; CM-NEXT: VTX_READ_8 T6.X, T5.X, 47, #3 2724; CM-NEXT: Fetch clause starting at 24: 2725; CM-NEXT: VTX_READ_8 T6.X, T5.X, 50, #3 2726; CM-NEXT: Fetch clause starting at 26: 2727; CM-NEXT: VTX_READ_8 T6.X, T5.X, 46, #3 2728; CM-NEXT: Fetch clause starting at 28: 2729; CM-NEXT: VTX_READ_8 T6.X, T5.X, 49, #3 2730; CM-NEXT: Fetch clause starting at 30: 2731; CM-NEXT: VTX_READ_8 T6.X, T5.X, 45, #3 2732; CM-NEXT: Fetch clause starting at 32: 2733; CM-NEXT: VTX_READ_8 T6.X, T5.X, 48, #3 2734; CM-NEXT: Fetch clause starting at 34: 2735; CM-NEXT: VTX_READ_8 T5.X, T5.X, 44, #3 2736; CM-NEXT: ALU clause starting at 36: 2737; CM-NEXT: MOV * T0.Y, T2.X, 2738; CM-NEXT: MOV * T5.X, 0.0, 2739; CM-NEXT: ALU clause starting at 38: 2740; CM-NEXT: LSHL T0.Z, T6.X, literal.x, 2741; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, 2742; CM-NEXT: 24(3.363116e-44), 16777215(2.350989e-38) 2743; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, 2744; CM-NEXT: MOV T2.X, PV.W, 2745; CM-NEXT: MOV * T0.Y, T3.X, 2746; CM-NEXT: ALU clause starting at 44: 2747; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2748; CM-NEXT: LSHL * T0.W, T6.X, literal.y, 2749; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 2750; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 2751; CM-NEXT: MOV T3.X, PV.W, 2752; CM-NEXT: MOV * T0.Y, T2.X, 2753; 
CM-NEXT: ALU clause starting at 50: 2754; CM-NEXT: AND_INT * T0.W, T6.X, literal.x, 2755; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 2756; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2757; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 2758; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) 2759; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 2760; CM-NEXT: MOV T2.X, PV.W, 2761; CM-NEXT: MOV * T0.Y, T3.X, 2762; CM-NEXT: ALU clause starting at 58: 2763; CM-NEXT: AND_INT * T0.W, T6.X, literal.x, 2764; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 2765; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2766; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 2767; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) 2768; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 2769; CM-NEXT: MOV T3.X, PV.W, 2770; CM-NEXT: MOV * T0.Y, T2.X, 2771; CM-NEXT: ALU clause starting at 66: 2772; CM-NEXT: AND_INT * T0.W, T6.X, literal.x, 2773; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 2774; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2775; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 2776; CM-NEXT: -65281(nan), 8(1.121039e-44) 2777; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 2778; CM-NEXT: MOV T2.X, PV.W, 2779; CM-NEXT: MOV * T0.Y, T3.X, 2780; CM-NEXT: ALU clause starting at 74: 2781; CM-NEXT: AND_INT * T0.W, T6.X, literal.x, 2782; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 2783; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2784; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 2785; CM-NEXT: -65281(nan), 8(1.121039e-44) 2786; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 2787; CM-NEXT: MOV T3.X, PV.W, 2788; CM-NEXT: MOV * T0.Y, T2.X, 2789; CM-NEXT: ALU clause starting at 82: 2790; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2791; CM-NEXT: AND_INT * T0.W, T6.X, literal.y, 2792; CM-NEXT: -256(nan), 255(3.573311e-43) 2793; CM-NEXT: OR_INT * T5.Y, PV.Z, PV.W, 2794; CM-NEXT: MOV T2.X, PV.Y, 2795; CM-NEXT: MOV * T0.Y, T3.X, 2796; CM-NEXT: ALU clause starting at 88: 2797; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2798; CM-NEXT: AND_INT * T0.W, T5.X, literal.y, 2799; CM-NEXT: -256(nan), 
255(3.573311e-43) 2800; CM-NEXT: OR_INT * T5.X, PV.Z, PV.W, 2801; CM-NEXT: LSHR * T6.X, KC0[2].Y, literal.x, 2802; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2803entry: 2804 store <8 x i8> %in, <8 x i8> addrspace(1)* %out 2805 ret void 2806} 2807 2808define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) { 2809; SI-LABEL: v8i16_arg: 2810; SI: ; %bb.0: ; %entry 2811; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 2812; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2813; SI-NEXT: s_mov_b32 s3, 0xf000 2814; SI-NEXT: s_mov_b32 s2, -1 2815; SI-NEXT: s_waitcnt lgkmcnt(0) 2816; SI-NEXT: v_mov_b32_e32 v0, s4 2817; SI-NEXT: v_mov_b32_e32 v1, s5 2818; SI-NEXT: v_mov_b32_e32 v2, s6 2819; SI-NEXT: v_mov_b32_e32 v3, s7 2820; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2821; SI-NEXT: s_endpgm 2822; 2823; VI-LABEL: v8i16_arg: 2824; VI: ; %bb.0: ; %entry 2825; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 2826; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 2827; VI-NEXT: s_waitcnt lgkmcnt(0) 2828; VI-NEXT: v_mov_b32_e32 v4, s4 2829; VI-NEXT: v_mov_b32_e32 v0, s0 2830; VI-NEXT: v_mov_b32_e32 v5, s5 2831; VI-NEXT: v_mov_b32_e32 v1, s1 2832; VI-NEXT: v_mov_b32_e32 v2, s2 2833; VI-NEXT: v_mov_b32_e32 v3, s3 2834; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2835; VI-NEXT: s_endpgm 2836; 2837; GFX9-LABEL: v8i16_arg: 2838; GFX9: ; %bb.0: ; %entry 2839; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 2840; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 2841; GFX9-NEXT: v_mov_b32_e32 v4, 0 2842; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2843; GFX9-NEXT: v_mov_b32_e32 v0, s0 2844; GFX9-NEXT: v_mov_b32_e32 v1, s1 2845; GFX9-NEXT: v_mov_b32_e32 v2, s2 2846; GFX9-NEXT: v_mov_b32_e32 v3, s3 2847; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 2848; GFX9-NEXT: s_endpgm 2849; 2850; EG-LABEL: v8i16_arg: 2851; EG: ; %bb.0: ; %entry 2852; EG-NEXT: ALU 1, @36, KC0[], KC1[] 2853; EG-NEXT: TEX 0 @20 2854; EG-NEXT: ALU 5, @38, KC0[], KC1[] 2855; EG-NEXT: TEX 0 @22 2856; EG-NEXT: ALU 
5, @44, KC0[], KC1[] 2857; EG-NEXT: TEX 0 @24 2858; EG-NEXT: ALU 5, @50, KC0[], KC1[] 2859; EG-NEXT: TEX 0 @26 2860; EG-NEXT: ALU 5, @56, KC0[], KC1[] 2861; EG-NEXT: TEX 0 @28 2862; EG-NEXT: ALU 5, @62, KC0[], KC1[] 2863; EG-NEXT: TEX 0 @30 2864; EG-NEXT: ALU 5, @68, KC0[], KC1[] 2865; EG-NEXT: TEX 0 @32 2866; EG-NEXT: ALU 5, @74, KC0[], KC1[] 2867; EG-NEXT: TEX 0 @34 2868; EG-NEXT: ALU 8, @80, KC0[CB0:0-32], KC1[] 2869; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1 2870; EG-NEXT: CF_END 2871; EG-NEXT: PAD 2872; EG-NEXT: Fetch clause starting at 20: 2873; EG-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3 2874; EG-NEXT: Fetch clause starting at 22: 2875; EG-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3 2876; EG-NEXT: Fetch clause starting at 24: 2877; EG-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3 2878; EG-NEXT: Fetch clause starting at 26: 2879; EG-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3 2880; EG-NEXT: Fetch clause starting at 28: 2881; EG-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3 2882; EG-NEXT: Fetch clause starting at 30: 2883; EG-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3 2884; EG-NEXT: Fetch clause starting at 32: 2885; EG-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3 2886; EG-NEXT: Fetch clause starting at 34: 2887; EG-NEXT: VTX_READ_16 T7.X, T7.X, 52, #3 2888; EG-NEXT: ALU clause starting at 36: 2889; EG-NEXT: MOV * T0.Y, T3.X, 2890; EG-NEXT: MOV * T7.X, 0.0, 2891; EG-NEXT: ALU clause starting at 38: 2892; EG-NEXT: LSHL T0.W, T8.X, literal.x, 2893; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 2894; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 2895; EG-NEXT: OR_INT * T0.W, PS, PV.W, 2896; EG-NEXT: MOV T3.X, PV.W, 2897; EG-NEXT: MOV * T0.Y, T5.X, 2898; EG-NEXT: ALU clause starting at 44: 2899; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2900; EG-NEXT: LSHL * T1.W, T8.X, literal.y, 2901; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 2902; EG-NEXT: OR_INT * T0.W, PV.W, PS, 2903; EG-NEXT: MOV T5.X, PV.W, 2904; EG-NEXT: MOV * T0.Y, T3.X, 2905; EG-NEXT: ALU clause starting at 50: 2906; EG-NEXT: AND_INT T0.W, 
T0.Y, literal.x, 2907; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, 2908; EG-NEXT: -65536(nan), 65535(9.183409e-41) 2909; EG-NEXT: OR_INT * T0.W, PV.W, PS, 2910; EG-NEXT: MOV T3.X, PV.W, 2911; EG-NEXT: MOV * T0.Y, T5.X, 2912; EG-NEXT: ALU clause starting at 56: 2913; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2914; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, 2915; EG-NEXT: -65536(nan), 65535(9.183409e-41) 2916; EG-NEXT: OR_INT * T0.W, PV.W, PS, 2917; EG-NEXT: MOV T5.X, PV.W, 2918; EG-NEXT: MOV * T0.Y, T2.X, 2919; EG-NEXT: ALU clause starting at 62: 2920; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2921; EG-NEXT: LSHL * T1.W, T8.X, literal.y, 2922; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 2923; EG-NEXT: OR_INT * T0.W, PV.W, PS, 2924; EG-NEXT: MOV T2.X, PV.W, 2925; EG-NEXT: MOV * T0.Y, T4.X, 2926; EG-NEXT: ALU clause starting at 68: 2927; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2928; EG-NEXT: LSHL * T1.W, T8.X, literal.y, 2929; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 2930; EG-NEXT: OR_INT * T0.W, PV.W, PS, 2931; EG-NEXT: MOV T4.X, PV.W, 2932; EG-NEXT: MOV * T0.Y, T2.X, 2933; EG-NEXT: ALU clause starting at 74: 2934; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2935; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, 2936; EG-NEXT: -65536(nan), 65535(9.183409e-41) 2937; EG-NEXT: OR_INT * T7.Z, PV.W, PS, 2938; EG-NEXT: MOV T2.X, PV.Z, 2939; EG-NEXT: MOV * T0.Y, T4.X, 2940; EG-NEXT: ALU clause starting at 80: 2941; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x, 2942; EG-NEXT: AND_INT T0.W, T0.Y, literal.y, 2943; EG-NEXT: AND_INT * T1.W, T7.X, literal.z, 2944; EG-NEXT: 2(2.802597e-45), -65536(nan) 2945; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 2946; EG-NEXT: OR_INT * T7.X, PV.W, PS, 2947; EG-NEXT: MOV T4.X, PV.X, 2948; EG-NEXT: MOV * T7.W, T3.X, 2949; EG-NEXT: MOV * T7.Y, T5.X, 2950; 2951; CM-LABEL: v8i16_arg: 2952; CM: ; %bb.0: ; %entry 2953; CM-NEXT: ALU 1, @36, KC0[], KC1[] 2954; CM-NEXT: TEX 0 @20 2955; CM-NEXT: ALU 5, @38, KC0[], KC1[] 2956; CM-NEXT: TEX 0 @22 2957; CM-NEXT: ALU 
5, @44, KC0[], KC1[] 2958; CM-NEXT: TEX 0 @24 2959; CM-NEXT: ALU 5, @50, KC0[], KC1[] 2960; CM-NEXT: TEX 0 @26 2961; CM-NEXT: ALU 5, @56, KC0[], KC1[] 2962; CM-NEXT: TEX 0 @28 2963; CM-NEXT: ALU 5, @62, KC0[], KC1[] 2964; CM-NEXT: TEX 0 @30 2965; CM-NEXT: ALU 5, @68, KC0[], KC1[] 2966; CM-NEXT: TEX 0 @32 2967; CM-NEXT: ALU 5, @74, KC0[], KC1[] 2968; CM-NEXT: TEX 0 @34 2969; CM-NEXT: ALU 8, @80, KC0[CB0:0-32], KC1[] 2970; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X 2971; CM-NEXT: CF_END 2972; CM-NEXT: PAD 2973; CM-NEXT: Fetch clause starting at 20: 2974; CM-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3 2975; CM-NEXT: Fetch clause starting at 22: 2976; CM-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3 2977; CM-NEXT: Fetch clause starting at 24: 2978; CM-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3 2979; CM-NEXT: Fetch clause starting at 26: 2980; CM-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3 2981; CM-NEXT: Fetch clause starting at 28: 2982; CM-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3 2983; CM-NEXT: Fetch clause starting at 30: 2984; CM-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3 2985; CM-NEXT: Fetch clause starting at 32: 2986; CM-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3 2987; CM-NEXT: Fetch clause starting at 34: 2988; CM-NEXT: VTX_READ_16 T7.X, T7.X, 52, #3 2989; CM-NEXT: ALU clause starting at 36: 2990; CM-NEXT: MOV * T0.Y, T3.X, 2991; CM-NEXT: MOV * T7.X, 0.0, 2992; CM-NEXT: ALU clause starting at 38: 2993; CM-NEXT: LSHL T0.Z, T8.X, literal.x, 2994; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, 2995; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 2996; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, 2997; CM-NEXT: MOV T3.X, PV.W, 2998; CM-NEXT: MOV * T0.Y, T5.X, 2999; CM-NEXT: ALU clause starting at 44: 3000; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3001; CM-NEXT: LSHL * T0.W, T8.X, literal.y, 3002; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3003; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3004; CM-NEXT: MOV T5.X, PV.W, 3005; CM-NEXT: MOV * T0.Y, T3.X, 3006; CM-NEXT: ALU clause starting at 50: 3007; CM-NEXT: AND_INT T0.Z, 
T0.Y, literal.x, 3008; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, 3009; CM-NEXT: -65536(nan), 65535(9.183409e-41) 3010; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3011; CM-NEXT: MOV T3.X, PV.W, 3012; CM-NEXT: MOV * T0.Y, T5.X, 3013; CM-NEXT: ALU clause starting at 56: 3014; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3015; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, 3016; CM-NEXT: -65536(nan), 65535(9.183409e-41) 3017; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3018; CM-NEXT: MOV T5.X, PV.W, 3019; CM-NEXT: MOV * T0.Y, T2.X, 3020; CM-NEXT: ALU clause starting at 62: 3021; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3022; CM-NEXT: LSHL * T0.W, T8.X, literal.y, 3023; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3024; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3025; CM-NEXT: MOV T2.X, PV.W, 3026; CM-NEXT: MOV * T0.Y, T4.X, 3027; CM-NEXT: ALU clause starting at 68: 3028; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3029; CM-NEXT: LSHL * T0.W, T8.X, literal.y, 3030; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3031; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3032; CM-NEXT: MOV T4.X, PV.W, 3033; CM-NEXT: MOV * T0.Y, T2.X, 3034; CM-NEXT: ALU clause starting at 74: 3035; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3036; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, 3037; CM-NEXT: -65536(nan), 65535(9.183409e-41) 3038; CM-NEXT: OR_INT * T7.Z, PV.Z, PV.W, 3039; CM-NEXT: MOV T2.X, PV.Z, 3040; CM-NEXT: MOV * T0.Y, T4.X, 3041; CM-NEXT: ALU clause starting at 80: 3042; CM-NEXT: LSHR T8.X, KC0[2].Y, literal.x, 3043; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y, 3044; CM-NEXT: AND_INT * T0.W, T7.X, literal.z, 3045; CM-NEXT: 2(2.802597e-45), -65536(nan) 3046; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 3047; CM-NEXT: OR_INT * T7.X, PV.Z, PV.W, 3048; CM-NEXT: MOV T4.X, PV.X, 3049; CM-NEXT: MOV * T7.W, T3.X, 3050; CM-NEXT: MOV * T7.Y, T5.X, 3051entry: 3052 store <8 x i16> %in, <8 x i16> addrspace(1)* %out 3053 ret void 3054} 3055 3056define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind { 
3057; SI-LABEL: v8i32_arg: 3058; SI: ; %bb.0: ; %entry 3059; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 3060; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3061; SI-NEXT: s_mov_b32 s3, 0xf000 3062; SI-NEXT: s_mov_b32 s2, -1 3063; SI-NEXT: s_waitcnt lgkmcnt(0) 3064; SI-NEXT: v_mov_b32_e32 v0, s8 3065; SI-NEXT: v_mov_b32_e32 v1, s9 3066; SI-NEXT: v_mov_b32_e32 v2, s10 3067; SI-NEXT: v_mov_b32_e32 v3, s11 3068; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 3069; SI-NEXT: s_waitcnt expcnt(0) 3070; SI-NEXT: v_mov_b32_e32 v0, s4 3071; SI-NEXT: v_mov_b32_e32 v1, s5 3072; SI-NEXT: v_mov_b32_e32 v2, s6 3073; SI-NEXT: v_mov_b32_e32 v3, s7 3074; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 3075; SI-NEXT: s_endpgm 3076; 3077; VI-LABEL: v8i32_arg: 3078; VI: ; %bb.0: ; %entry 3079; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 3080; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x44 3081; VI-NEXT: s_waitcnt lgkmcnt(0) 3082; VI-NEXT: v_mov_b32_e32 v0, s4 3083; VI-NEXT: s_add_u32 s4, s8, 16 3084; VI-NEXT: v_mov_b32_e32 v1, s5 3085; VI-NEXT: s_addc_u32 s5, s9, 0 3086; VI-NEXT: v_mov_b32_e32 v4, s4 3087; VI-NEXT: v_mov_b32_e32 v2, s6 3088; VI-NEXT: v_mov_b32_e32 v3, s7 3089; VI-NEXT: v_mov_b32_e32 v5, s5 3090; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3091; VI-NEXT: v_mov_b32_e32 v4, s8 3092; VI-NEXT: v_mov_b32_e32 v0, s0 3093; VI-NEXT: v_mov_b32_e32 v1, s1 3094; VI-NEXT: v_mov_b32_e32 v2, s2 3095; VI-NEXT: v_mov_b32_e32 v3, s3 3096; VI-NEXT: v_mov_b32_e32 v5, s9 3097; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3098; VI-NEXT: s_endpgm 3099; 3100; GFX9-LABEL: v8i32_arg: 3101; GFX9: ; %bb.0: ; %entry 3102; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3103; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 3104; GFX9-NEXT: v_mov_b32_e32 v4, 0 3105; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3106; GFX9-NEXT: v_mov_b32_e32 v0, s12 3107; GFX9-NEXT: v_mov_b32_e32 v1, s13 3108; GFX9-NEXT: v_mov_b32_e32 v2, s14 3109; GFX9-NEXT: v_mov_b32_e32 v3, s15 3110; GFX9-NEXT: 
global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 3111; GFX9-NEXT: s_nop 0 3112; GFX9-NEXT: v_mov_b32_e32 v0, s8 3113; GFX9-NEXT: v_mov_b32_e32 v1, s9 3114; GFX9-NEXT: v_mov_b32_e32 v2, s10 3115; GFX9-NEXT: v_mov_b32_e32 v3, s11 3116; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 3117; GFX9-NEXT: s_endpgm 3118; 3119; EG-LABEL: v8i32_arg: 3120; EG: ; %bb.0: ; %entry 3121; EG-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] 3122; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0 3123; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 3124; EG-NEXT: CF_END 3125; EG-NEXT: ALU clause starting at 4: 3126; EG-NEXT: MOV * T0.W, KC0[5].X, 3127; EG-NEXT: MOV * T0.Z, KC0[4].W, 3128; EG-NEXT: MOV T0.Y, KC0[4].Z, 3129; EG-NEXT: MOV * T1.W, KC0[6].X, 3130; EG-NEXT: MOV T0.X, KC0[4].Y, 3131; EG-NEXT: MOV * T1.Z, KC0[5].W, 3132; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 3133; EG-NEXT: MOV * T1.Y, KC0[5].Z, 3134; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3135; EG-NEXT: MOV T1.X, KC0[5].Y, 3136; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 3137; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3138; EG-NEXT: LSHR * T3.X, PV.W, literal.x, 3139; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3140; 3141; CM-LABEL: v8i32_arg: 3142; CM: ; %bb.0: ; %entry 3143; CM-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] 3144; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X 3145; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T2.X 3146; CM-NEXT: CF_END 3147; CM-NEXT: ALU clause starting at 4: 3148; CM-NEXT: MOV * T0.W, KC0[6].X, 3149; CM-NEXT: MOV * T0.Z, KC0[5].W, 3150; CM-NEXT: MOV * T0.Y, KC0[5].Z, 3151; CM-NEXT: MOV T0.X, KC0[5].Y, 3152; CM-NEXT: MOV * T1.W, KC0[5].X, 3153; CM-NEXT: MOV T1.Z, KC0[4].W, 3154; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 3155; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3156; CM-NEXT: LSHR T2.X, PV.W, literal.x, 3157; CM-NEXT: MOV * T1.Y, KC0[4].Z, 3158; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3159; CM-NEXT: MOV * T1.X, KC0[4].Y, 3160; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 
3161; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3162entry: 3163 store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4 3164 ret void 3165} 3166 3167define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind { 3168; SI-LABEL: v8f32_arg: 3169; SI: ; %bb.0: ; %entry 3170; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 3171; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3172; SI-NEXT: s_mov_b32 s3, 0xf000 3173; SI-NEXT: s_mov_b32 s2, -1 3174; SI-NEXT: s_waitcnt lgkmcnt(0) 3175; SI-NEXT: v_mov_b32_e32 v0, s8 3176; SI-NEXT: v_mov_b32_e32 v1, s9 3177; SI-NEXT: v_mov_b32_e32 v2, s10 3178; SI-NEXT: v_mov_b32_e32 v3, s11 3179; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 3180; SI-NEXT: s_waitcnt expcnt(0) 3181; SI-NEXT: v_mov_b32_e32 v0, s4 3182; SI-NEXT: v_mov_b32_e32 v1, s5 3183; SI-NEXT: v_mov_b32_e32 v2, s6 3184; SI-NEXT: v_mov_b32_e32 v3, s7 3185; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 3186; SI-NEXT: s_endpgm 3187; 3188; VI-LABEL: v8f32_arg: 3189; VI: ; %bb.0: ; %entry 3190; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 3191; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x44 3192; VI-NEXT: s_waitcnt lgkmcnt(0) 3193; VI-NEXT: v_mov_b32_e32 v0, s4 3194; VI-NEXT: s_add_u32 s4, s8, 16 3195; VI-NEXT: v_mov_b32_e32 v1, s5 3196; VI-NEXT: s_addc_u32 s5, s9, 0 3197; VI-NEXT: v_mov_b32_e32 v4, s4 3198; VI-NEXT: v_mov_b32_e32 v2, s6 3199; VI-NEXT: v_mov_b32_e32 v3, s7 3200; VI-NEXT: v_mov_b32_e32 v5, s5 3201; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3202; VI-NEXT: v_mov_b32_e32 v4, s8 3203; VI-NEXT: v_mov_b32_e32 v0, s0 3204; VI-NEXT: v_mov_b32_e32 v1, s1 3205; VI-NEXT: v_mov_b32_e32 v2, s2 3206; VI-NEXT: v_mov_b32_e32 v3, s3 3207; VI-NEXT: v_mov_b32_e32 v5, s9 3208; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3209; VI-NEXT: s_endpgm 3210; 3211; GFX9-LABEL: v8f32_arg: 3212; GFX9: ; %bb.0: ; %entry 3213; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3214; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 3215; 
GFX9-NEXT: v_mov_b32_e32 v4, 0 3216; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3217; GFX9-NEXT: v_mov_b32_e32 v0, s12 3218; GFX9-NEXT: v_mov_b32_e32 v1, s13 3219; GFX9-NEXT: v_mov_b32_e32 v2, s14 3220; GFX9-NEXT: v_mov_b32_e32 v3, s15 3221; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 3222; GFX9-NEXT: s_nop 0 3223; GFX9-NEXT: v_mov_b32_e32 v0, s8 3224; GFX9-NEXT: v_mov_b32_e32 v1, s9 3225; GFX9-NEXT: v_mov_b32_e32 v2, s10 3226; GFX9-NEXT: v_mov_b32_e32 v3, s11 3227; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 3228; GFX9-NEXT: s_endpgm 3229; 3230; EG-LABEL: v8f32_arg: 3231; EG: ; %bb.0: ; %entry 3232; EG-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] 3233; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0 3234; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 3235; EG-NEXT: CF_END 3236; EG-NEXT: ALU clause starting at 4: 3237; EG-NEXT: MOV * T0.W, KC0[5].X, 3238; EG-NEXT: MOV * T0.Z, KC0[4].W, 3239; EG-NEXT: MOV T0.Y, KC0[4].Z, 3240; EG-NEXT: MOV * T1.W, KC0[6].X, 3241; EG-NEXT: MOV T0.X, KC0[4].Y, 3242; EG-NEXT: MOV * T1.Z, KC0[5].W, 3243; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 3244; EG-NEXT: MOV * T1.Y, KC0[5].Z, 3245; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3246; EG-NEXT: MOV T1.X, KC0[5].Y, 3247; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 3248; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3249; EG-NEXT: LSHR * T3.X, PV.W, literal.x, 3250; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3251; 3252; CM-LABEL: v8f32_arg: 3253; CM: ; %bb.0: ; %entry 3254; CM-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] 3255; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X 3256; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T2.X 3257; CM-NEXT: CF_END 3258; CM-NEXT: ALU clause starting at 4: 3259; CM-NEXT: MOV * T0.W, KC0[6].X, 3260; CM-NEXT: MOV * T0.Z, KC0[5].W, 3261; CM-NEXT: MOV * T0.Y, KC0[5].Z, 3262; CM-NEXT: MOV T0.X, KC0[5].Y, 3263; CM-NEXT: MOV * T1.W, KC0[5].X, 3264; CM-NEXT: MOV T1.Z, KC0[4].W, 3265; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 3266; CM-NEXT: 
16(2.242078e-44), 0(0.000000e+00) 3267; CM-NEXT: LSHR T2.X, PV.W, literal.x, 3268; CM-NEXT: MOV * T1.Y, KC0[4].Z, 3269; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3270; CM-NEXT: MOV * T1.X, KC0[4].Y, 3271; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 3272; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3273entry: 3274 store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4 3275 ret void 3276} 3277 3278; FIXME: Pack/repack on VI 3279define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) { 3280; SI-LABEL: v16i8_arg: 3281; SI: ; %bb.0: ; %entry 3282; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 3283; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3284; SI-NEXT: s_mov_b32 s3, 0xf000 3285; SI-NEXT: s_mov_b32 s2, -1 3286; SI-NEXT: s_waitcnt lgkmcnt(0) 3287; SI-NEXT: v_mov_b32_e32 v0, s4 3288; SI-NEXT: v_mov_b32_e32 v1, s5 3289; SI-NEXT: v_mov_b32_e32 v2, s6 3290; SI-NEXT: v_mov_b32_e32 v3, s7 3291; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 3292; SI-NEXT: s_endpgm 3293; 3294; VI-LABEL: v16i8_arg: 3295; VI: ; %bb.0: ; %entry 3296; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 3297; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 3298; VI-NEXT: s_waitcnt lgkmcnt(0) 3299; VI-NEXT: v_mov_b32_e32 v4, s4 3300; VI-NEXT: v_mov_b32_e32 v0, s0 3301; VI-NEXT: v_mov_b32_e32 v5, s5 3302; VI-NEXT: v_mov_b32_e32 v1, s1 3303; VI-NEXT: v_mov_b32_e32 v2, s2 3304; VI-NEXT: v_mov_b32_e32 v3, s3 3305; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3306; VI-NEXT: s_endpgm 3307; 3308; GFX9-LABEL: v16i8_arg: 3309; GFX9: ; %bb.0: ; %entry 3310; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 3311; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 3312; GFX9-NEXT: v_mov_b32_e32 v4, 0 3313; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3314; GFX9-NEXT: v_mov_b32_e32 v0, s0 3315; GFX9-NEXT: v_mov_b32_e32 v1, s1 3316; GFX9-NEXT: v_mov_b32_e32 v2, s2 3317; GFX9-NEXT: v_mov_b32_e32 v3, s3 3318; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 3319; GFX9-NEXT: s_endpgm 3320; 3321; EG-LABEL: 
v16i8_arg: 3322; EG: ; %bb.0: ; %entry 3323; EG-NEXT: ALU 1, @68, KC0[], KC1[] 3324; EG-NEXT: TEX 0 @36 3325; EG-NEXT: ALU 5, @70, KC0[], KC1[] 3326; EG-NEXT: TEX 0 @38 3327; EG-NEXT: ALU 5, @76, KC0[], KC1[] 3328; EG-NEXT: TEX 0 @40 3329; EG-NEXT: ALU 5, @82, KC0[], KC1[] 3330; EG-NEXT: TEX 0 @42 3331; EG-NEXT: ALU 5, @88, KC0[], KC1[] 3332; EG-NEXT: TEX 0 @44 3333; EG-NEXT: ALU 7, @94, KC0[], KC1[] 3334; EG-NEXT: TEX 0 @46 3335; EG-NEXT: ALU 7, @102, KC0[], KC1[] 3336; EG-NEXT: TEX 0 @48 3337; EG-NEXT: ALU 7, @110, KC0[], KC1[] 3338; EG-NEXT: TEX 0 @50 3339; EG-NEXT: ALU 7, @118, KC0[], KC1[] 3340; EG-NEXT: TEX 0 @52 3341; EG-NEXT: ALU 7, @126, KC0[], KC1[] 3342; EG-NEXT: TEX 0 @54 3343; EG-NEXT: ALU 7, @134, KC0[], KC1[] 3344; EG-NEXT: TEX 0 @56 3345; EG-NEXT: ALU 7, @142, KC0[], KC1[] 3346; EG-NEXT: TEX 0 @58 3347; EG-NEXT: ALU 7, @150, KC0[], KC1[] 3348; EG-NEXT: TEX 0 @60 3349; EG-NEXT: ALU 5, @158, KC0[], KC1[] 3350; EG-NEXT: TEX 0 @62 3351; EG-NEXT: ALU 5, @164, KC0[], KC1[] 3352; EG-NEXT: TEX 0 @64 3353; EG-NEXT: ALU 5, @170, KC0[], KC1[] 3354; EG-NEXT: TEX 0 @66 3355; EG-NEXT: ALU 5, @176, KC0[CB0:0-32], KC1[] 3356; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1 3357; EG-NEXT: CF_END 3358; EG-NEXT: PAD 3359; EG-NEXT: Fetch clause starting at 36: 3360; EG-NEXT: VTX_READ_8 T8.X, T7.X, 67, #3 3361; EG-NEXT: Fetch clause starting at 38: 3362; EG-NEXT: VTX_READ_8 T8.X, T7.X, 63, #3 3363; EG-NEXT: Fetch clause starting at 40: 3364; EG-NEXT: VTX_READ_8 T8.X, T7.X, 59, #3 3365; EG-NEXT: Fetch clause starting at 42: 3366; EG-NEXT: VTX_READ_8 T8.X, T7.X, 55, #3 3367; EG-NEXT: Fetch clause starting at 44: 3368; EG-NEXT: VTX_READ_8 T8.X, T7.X, 66, #3 3369; EG-NEXT: Fetch clause starting at 46: 3370; EG-NEXT: VTX_READ_8 T8.X, T7.X, 62, #3 3371; EG-NEXT: Fetch clause starting at 48: 3372; EG-NEXT: VTX_READ_8 T8.X, T7.X, 58, #3 3373; EG-NEXT: Fetch clause starting at 50: 3374; EG-NEXT: VTX_READ_8 T8.X, T7.X, 54, #3 3375; EG-NEXT: Fetch clause starting at 52: 
3376; EG-NEXT: VTX_READ_8 T8.X, T7.X, 65, #3 3377; EG-NEXT: Fetch clause starting at 54: 3378; EG-NEXT: VTX_READ_8 T8.X, T7.X, 61, #3 3379; EG-NEXT: Fetch clause starting at 56: 3380; EG-NEXT: VTX_READ_8 T8.X, T7.X, 57, #3 3381; EG-NEXT: Fetch clause starting at 58: 3382; EG-NEXT: VTX_READ_8 T8.X, T7.X, 53, #3 3383; EG-NEXT: Fetch clause starting at 60: 3384; EG-NEXT: VTX_READ_8 T8.X, T7.X, 64, #3 3385; EG-NEXT: Fetch clause starting at 62: 3386; EG-NEXT: VTX_READ_8 T8.X, T7.X, 60, #3 3387; EG-NEXT: Fetch clause starting at 64: 3388; EG-NEXT: VTX_READ_8 T8.X, T7.X, 56, #3 3389; EG-NEXT: Fetch clause starting at 66: 3390; EG-NEXT: VTX_READ_8 T7.X, T7.X, 52, #3 3391; EG-NEXT: ALU clause starting at 68: 3392; EG-NEXT: MOV * T0.Y, T2.X, 3393; EG-NEXT: MOV * T7.X, 0.0, 3394; EG-NEXT: ALU clause starting at 70: 3395; EG-NEXT: LSHL T0.W, T8.X, literal.x, 3396; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3397; EG-NEXT: 24(3.363116e-44), 16777215(2.350989e-38) 3398; EG-NEXT: OR_INT * T0.W, PS, PV.W, 3399; EG-NEXT: MOV T2.X, PV.W, 3400; EG-NEXT: MOV * T0.Y, T3.X, 3401; EG-NEXT: ALU clause starting at 76: 3402; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3403; EG-NEXT: LSHL * T1.W, T8.X, literal.y, 3404; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 3405; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3406; EG-NEXT: MOV T3.X, PV.W, 3407; EG-NEXT: MOV * T0.Y, T4.X, 3408; EG-NEXT: ALU clause starting at 82: 3409; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3410; EG-NEXT: LSHL * T1.W, T8.X, literal.y, 3411; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 3412; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3413; EG-NEXT: MOV T4.X, PV.W, 3414; EG-NEXT: MOV * T0.Y, T5.X, 3415; EG-NEXT: ALU clause starting at 88: 3416; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3417; EG-NEXT: LSHL * T1.W, T8.X, literal.y, 3418; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 3419; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3420; EG-NEXT: MOV T5.X, PV.W, 3421; EG-NEXT: MOV * T0.Y, T2.X, 3422; EG-NEXT: ALU clause starting at 94: 3423; 
EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3424; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3425; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) 3426; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3427; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3428; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3429; EG-NEXT: MOV T2.X, PV.W, 3430; EG-NEXT: MOV * T0.Y, T3.X, 3431; EG-NEXT: ALU clause starting at 102: 3432; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3433; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3434; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) 3435; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3436; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3437; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3438; EG-NEXT: MOV T3.X, PV.W, 3439; EG-NEXT: MOV * T0.Y, T4.X, 3440; EG-NEXT: ALU clause starting at 110: 3441; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3442; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3443; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) 3444; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3445; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3446; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3447; EG-NEXT: MOV T4.X, PV.W, 3448; EG-NEXT: MOV * T0.Y, T5.X, 3449; EG-NEXT: ALU clause starting at 118: 3450; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3451; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3452; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) 3453; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3454; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3455; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3456; EG-NEXT: MOV T5.X, PV.W, 3457; EG-NEXT: MOV * T0.Y, T2.X, 3458; EG-NEXT: ALU clause starting at 126: 3459; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3460; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3461; EG-NEXT: 255(3.573311e-43), -65281(nan) 3462; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3463; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 3464; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3465; EG-NEXT: MOV T2.X, PV.W, 3466; EG-NEXT: MOV * T0.Y, T3.X, 3467; EG-NEXT: ALU clause starting at 134: 3468; EG-NEXT: AND_INT T0.W, T8.X, 
literal.x, 3469; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3470; EG-NEXT: 255(3.573311e-43), -65281(nan) 3471; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3472; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 3473; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3474; EG-NEXT: MOV T3.X, PV.W, 3475; EG-NEXT: MOV * T0.Y, T4.X, 3476; EG-NEXT: ALU clause starting at 142: 3477; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3478; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3479; EG-NEXT: 255(3.573311e-43), -65281(nan) 3480; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3481; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 3482; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3483; EG-NEXT: MOV T4.X, PV.W, 3484; EG-NEXT: MOV * T0.Y, T5.X, 3485; EG-NEXT: ALU clause starting at 150: 3486; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3487; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3488; EG-NEXT: 255(3.573311e-43), -65281(nan) 3489; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3490; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 3491; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3492; EG-NEXT: MOV T5.X, PV.W, 3493; EG-NEXT: MOV * T0.Y, T2.X, 3494; EG-NEXT: ALU clause starting at 158: 3495; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3496; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, 3497; EG-NEXT: -256(nan), 255(3.573311e-43) 3498; EG-NEXT: OR_INT * T7.W, PV.W, PS, 3499; EG-NEXT: MOV T2.X, PV.W, 3500; EG-NEXT: MOV * T0.Y, T3.X, 3501; EG-NEXT: ALU clause starting at 164: 3502; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3503; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, 3504; EG-NEXT: -256(nan), 255(3.573311e-43) 3505; EG-NEXT: OR_INT * T7.Z, PV.W, PS, 3506; EG-NEXT: MOV T3.X, PV.Z, 3507; EG-NEXT: MOV * T0.Y, T4.X, 3508; EG-NEXT: ALU clause starting at 170: 3509; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3510; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, 3511; EG-NEXT: -256(nan), 255(3.573311e-43) 3512; EG-NEXT: OR_INT * T7.Y, PV.W, PS, 3513; EG-NEXT: MOV T4.X, PV.Y, 3514; EG-NEXT: MOV * T0.Y, T5.X, 3515; EG-NEXT: ALU clause starting at 176: 3516; EG-NEXT: AND_INT T0.W, 
T0.Y, literal.x, 3517; EG-NEXT: AND_INT * T1.W, T7.X, literal.y, 3518; EG-NEXT: -256(nan), 255(3.573311e-43) 3519; EG-NEXT: OR_INT T7.X, PV.W, PS, 3520; EG-NEXT: LSHR * T8.X, KC0[2].Y, literal.x, 3521; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3522; 3523; CM-LABEL: v16i8_arg: 3524; CM: ; %bb.0: ; %entry 3525; CM-NEXT: ALU 1, @68, KC0[], KC1[] 3526; CM-NEXT: TEX 0 @36 3527; CM-NEXT: ALU 5, @70, KC0[], KC1[] 3528; CM-NEXT: TEX 0 @38 3529; CM-NEXT: ALU 5, @76, KC0[], KC1[] 3530; CM-NEXT: TEX 0 @40 3531; CM-NEXT: ALU 5, @82, KC0[], KC1[] 3532; CM-NEXT: TEX 0 @42 3533; CM-NEXT: ALU 5, @88, KC0[], KC1[] 3534; CM-NEXT: TEX 0 @44 3535; CM-NEXT: ALU 7, @94, KC0[], KC1[] 3536; CM-NEXT: TEX 0 @46 3537; CM-NEXT: ALU 7, @102, KC0[], KC1[] 3538; CM-NEXT: TEX 0 @48 3539; CM-NEXT: ALU 7, @110, KC0[], KC1[] 3540; CM-NEXT: TEX 0 @50 3541; CM-NEXT: ALU 7, @118, KC0[], KC1[] 3542; CM-NEXT: TEX 0 @52 3543; CM-NEXT: ALU 7, @126, KC0[], KC1[] 3544; CM-NEXT: TEX 0 @54 3545; CM-NEXT: ALU 7, @134, KC0[], KC1[] 3546; CM-NEXT: TEX 0 @56 3547; CM-NEXT: ALU 7, @142, KC0[], KC1[] 3548; CM-NEXT: TEX 0 @58 3549; CM-NEXT: ALU 7, @150, KC0[], KC1[] 3550; CM-NEXT: TEX 0 @60 3551; CM-NEXT: ALU 5, @158, KC0[], KC1[] 3552; CM-NEXT: TEX 0 @62 3553; CM-NEXT: ALU 5, @164, KC0[], KC1[] 3554; CM-NEXT: TEX 0 @64 3555; CM-NEXT: ALU 5, @170, KC0[], KC1[] 3556; CM-NEXT: TEX 0 @66 3557; CM-NEXT: ALU 5, @176, KC0[CB0:0-32], KC1[] 3558; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X 3559; CM-NEXT: CF_END 3560; CM-NEXT: PAD 3561; CM-NEXT: Fetch clause starting at 36: 3562; CM-NEXT: VTX_READ_8 T8.X, T7.X, 67, #3 3563; CM-NEXT: Fetch clause starting at 38: 3564; CM-NEXT: VTX_READ_8 T8.X, T7.X, 63, #3 3565; CM-NEXT: Fetch clause starting at 40: 3566; CM-NEXT: VTX_READ_8 T8.X, T7.X, 59, #3 3567; CM-NEXT: Fetch clause starting at 42: 3568; CM-NEXT: VTX_READ_8 T8.X, T7.X, 55, #3 3569; CM-NEXT: Fetch clause starting at 44: 3570; CM-NEXT: VTX_READ_8 T8.X, T7.X, 66, #3 3571; CM-NEXT: Fetch clause starting at 46: 3572; 
CM-NEXT: VTX_READ_8 T8.X, T7.X, 62, #3 3573; CM-NEXT: Fetch clause starting at 48: 3574; CM-NEXT: VTX_READ_8 T8.X, T7.X, 58, #3 3575; CM-NEXT: Fetch clause starting at 50: 3576; CM-NEXT: VTX_READ_8 T8.X, T7.X, 54, #3 3577; CM-NEXT: Fetch clause starting at 52: 3578; CM-NEXT: VTX_READ_8 T8.X, T7.X, 65, #3 3579; CM-NEXT: Fetch clause starting at 54: 3580; CM-NEXT: VTX_READ_8 T8.X, T7.X, 61, #3 3581; CM-NEXT: Fetch clause starting at 56: 3582; CM-NEXT: VTX_READ_8 T8.X, T7.X, 57, #3 3583; CM-NEXT: Fetch clause starting at 58: 3584; CM-NEXT: VTX_READ_8 T8.X, T7.X, 53, #3 3585; CM-NEXT: Fetch clause starting at 60: 3586; CM-NEXT: VTX_READ_8 T8.X, T7.X, 64, #3 3587; CM-NEXT: Fetch clause starting at 62: 3588; CM-NEXT: VTX_READ_8 T8.X, T7.X, 60, #3 3589; CM-NEXT: Fetch clause starting at 64: 3590; CM-NEXT: VTX_READ_8 T8.X, T7.X, 56, #3 3591; CM-NEXT: Fetch clause starting at 66: 3592; CM-NEXT: VTX_READ_8 T7.X, T7.X, 52, #3 3593; CM-NEXT: ALU clause starting at 68: 3594; CM-NEXT: MOV * T0.Y, T2.X, 3595; CM-NEXT: MOV * T7.X, 0.0, 3596; CM-NEXT: ALU clause starting at 70: 3597; CM-NEXT: LSHL T0.Z, T8.X, literal.x, 3598; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, 3599; CM-NEXT: 24(3.363116e-44), 16777215(2.350989e-38) 3600; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, 3601; CM-NEXT: MOV T2.X, PV.W, 3602; CM-NEXT: MOV * T0.Y, T3.X, 3603; CM-NEXT: ALU clause starting at 76: 3604; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3605; CM-NEXT: LSHL * T0.W, T8.X, literal.y, 3606; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 3607; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3608; CM-NEXT: MOV T3.X, PV.W, 3609; CM-NEXT: MOV * T0.Y, T4.X, 3610; CM-NEXT: ALU clause starting at 82: 3611; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3612; CM-NEXT: LSHL * T0.W, T8.X, literal.y, 3613; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 3614; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3615; CM-NEXT: MOV T4.X, PV.W, 3616; CM-NEXT: MOV * T0.Y, T5.X, 3617; CM-NEXT: ALU clause starting at 88: 3618; CM-NEXT: AND_INT T0.Z, T0.Y, 
literal.x, 3619; CM-NEXT: LSHL * T0.W, T8.X, literal.y, 3620; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 3621; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3622; CM-NEXT: MOV T5.X, PV.W, 3623; CM-NEXT: MOV * T0.Y, T2.X, 3624; CM-NEXT: ALU clause starting at 94: 3625; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3626; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3627; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3628; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3629; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) 3630; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3631; CM-NEXT: MOV T2.X, PV.W, 3632; CM-NEXT: MOV * T0.Y, T3.X, 3633; CM-NEXT: ALU clause starting at 102: 3634; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3635; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3636; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3637; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3638; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) 3639; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3640; CM-NEXT: MOV T3.X, PV.W, 3641; CM-NEXT: MOV * T0.Y, T4.X, 3642; CM-NEXT: ALU clause starting at 110: 3643; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3644; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3645; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3646; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3647; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) 3648; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3649; CM-NEXT: MOV T4.X, PV.W, 3650; CM-NEXT: MOV * T0.Y, T5.X, 3651; CM-NEXT: ALU clause starting at 118: 3652; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3653; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3654; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3655; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3656; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) 3657; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3658; CM-NEXT: MOV T5.X, PV.W, 3659; CM-NEXT: MOV * T0.Y, T2.X, 3660; CM-NEXT: ALU clause starting at 126: 3661; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3662; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3663; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3664; 
CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3665; CM-NEXT: -65281(nan), 8(1.121039e-44) 3666; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3667; CM-NEXT: MOV T2.X, PV.W, 3668; CM-NEXT: MOV * T0.Y, T3.X, 3669; CM-NEXT: ALU clause starting at 134: 3670; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3671; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3672; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3673; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3674; CM-NEXT: -65281(nan), 8(1.121039e-44) 3675; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3676; CM-NEXT: MOV T3.X, PV.W, 3677; CM-NEXT: MOV * T0.Y, T4.X, 3678; CM-NEXT: ALU clause starting at 142: 3679; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3680; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3681; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3682; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3683; CM-NEXT: -65281(nan), 8(1.121039e-44) 3684; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3685; CM-NEXT: MOV T4.X, PV.W, 3686; CM-NEXT: MOV * T0.Y, T5.X, 3687; CM-NEXT: ALU clause starting at 150: 3688; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3689; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3690; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3691; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3692; CM-NEXT: -65281(nan), 8(1.121039e-44) 3693; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3694; CM-NEXT: MOV T5.X, PV.W, 3695; CM-NEXT: MOV * T0.Y, T2.X, 3696; CM-NEXT: ALU clause starting at 158: 3697; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3698; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, 3699; CM-NEXT: -256(nan), 255(3.573311e-43) 3700; CM-NEXT: OR_INT * T7.W, PV.Z, PV.W, 3701; CM-NEXT: MOV T2.X, PV.W, 3702; CM-NEXT: MOV * T0.Y, T3.X, 3703; CM-NEXT: ALU clause starting at 164: 3704; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3705; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, 3706; CM-NEXT: -256(nan), 255(3.573311e-43) 3707; CM-NEXT: OR_INT * T7.Z, PV.Z, PV.W, 3708; CM-NEXT: MOV T3.X, PV.Z, 3709; CM-NEXT: MOV * T0.Y, T4.X, 3710; CM-NEXT: ALU clause starting at 170: 3711; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 
3712; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, 3713; CM-NEXT: -256(nan), 255(3.573311e-43) 3714; CM-NEXT: OR_INT * T7.Y, PV.Z, PV.W, 3715; CM-NEXT: MOV T4.X, PV.Y, 3716; CM-NEXT: MOV * T0.Y, T5.X, 3717; CM-NEXT: ALU clause starting at 176: 3718; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3719; CM-NEXT: AND_INT * T0.W, T7.X, literal.y, 3720; CM-NEXT: -256(nan), 255(3.573311e-43) 3721; CM-NEXT: OR_INT * T7.X, PV.Z, PV.W, 3722; CM-NEXT: LSHR * T8.X, KC0[2].Y, literal.x, 3723; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3724entry: 3725 store <16 x i8> %in, <16 x i8> addrspace(1)* %out 3726 ret void 3727} 3728 3729define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) { 3730; SI-LABEL: v16i16_arg: 3731; SI: ; %bb.0: ; %entry 3732; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 3733; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3734; SI-NEXT: s_mov_b32 s3, 0xf000 3735; SI-NEXT: s_mov_b32 s2, -1 3736; SI-NEXT: s_waitcnt lgkmcnt(0) 3737; SI-NEXT: v_mov_b32_e32 v0, s8 3738; SI-NEXT: v_mov_b32_e32 v1, s9 3739; SI-NEXT: v_mov_b32_e32 v2, s10 3740; SI-NEXT: v_mov_b32_e32 v3, s11 3741; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 3742; SI-NEXT: s_waitcnt expcnt(0) 3743; SI-NEXT: v_mov_b32_e32 v0, s4 3744; SI-NEXT: v_mov_b32_e32 v1, s5 3745; SI-NEXT: v_mov_b32_e32 v2, s6 3746; SI-NEXT: v_mov_b32_e32 v3, s7 3747; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 3748; SI-NEXT: s_endpgm 3749; 3750; VI-LABEL: v16i16_arg: 3751; VI: ; %bb.0: ; %entry 3752; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 3753; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x44 3754; VI-NEXT: s_waitcnt lgkmcnt(0) 3755; VI-NEXT: v_mov_b32_e32 v0, s4 3756; VI-NEXT: s_add_u32 s4, s8, 16 3757; VI-NEXT: v_mov_b32_e32 v1, s5 3758; VI-NEXT: s_addc_u32 s5, s9, 0 3759; VI-NEXT: v_mov_b32_e32 v4, s4 3760; VI-NEXT: v_mov_b32_e32 v2, s6 3761; VI-NEXT: v_mov_b32_e32 v3, s7 3762; VI-NEXT: v_mov_b32_e32 v5, s5 3763; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3764; VI-NEXT: 
v_mov_b32_e32 v4, s8 3765; VI-NEXT: v_mov_b32_e32 v0, s0 3766; VI-NEXT: v_mov_b32_e32 v1, s1 3767; VI-NEXT: v_mov_b32_e32 v2, s2 3768; VI-NEXT: v_mov_b32_e32 v3, s3 3769; VI-NEXT: v_mov_b32_e32 v5, s9 3770; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3771; VI-NEXT: s_endpgm 3772; 3773; GFX9-LABEL: v16i16_arg: 3774; GFX9: ; %bb.0: ; %entry 3775; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3776; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 3777; GFX9-NEXT: v_mov_b32_e32 v4, 0 3778; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3779; GFX9-NEXT: v_mov_b32_e32 v0, s12 3780; GFX9-NEXT: v_mov_b32_e32 v1, s13 3781; GFX9-NEXT: v_mov_b32_e32 v2, s14 3782; GFX9-NEXT: v_mov_b32_e32 v3, s15 3783; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 3784; GFX9-NEXT: s_nop 0 3785; GFX9-NEXT: v_mov_b32_e32 v0, s8 3786; GFX9-NEXT: v_mov_b32_e32 v1, s9 3787; GFX9-NEXT: v_mov_b32_e32 v2, s10 3788; GFX9-NEXT: v_mov_b32_e32 v3, s11 3789; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 3790; GFX9-NEXT: s_endpgm 3791; 3792; EG-LABEL: v16i16_arg: 3793; EG: ; %bb.0: ; %entry 3794; EG-NEXT: ALU 1, @68, KC0[], KC1[] 3795; EG-NEXT: TEX 0 @36 3796; EG-NEXT: ALU 5, @70, KC0[], KC1[] 3797; EG-NEXT: TEX 0 @38 3798; EG-NEXT: ALU 5, @76, KC0[], KC1[] 3799; EG-NEXT: TEX 0 @40 3800; EG-NEXT: ALU 5, @82, KC0[], KC1[] 3801; EG-NEXT: TEX 0 @42 3802; EG-NEXT: ALU 5, @88, KC0[], KC1[] 3803; EG-NEXT: TEX 0 @44 3804; EG-NEXT: ALU 5, @94, KC0[], KC1[] 3805; EG-NEXT: TEX 0 @46 3806; EG-NEXT: ALU 5, @100, KC0[], KC1[] 3807; EG-NEXT: TEX 0 @48 3808; EG-NEXT: ALU 5, @106, KC0[], KC1[] 3809; EG-NEXT: TEX 0 @50 3810; EG-NEXT: ALU 5, @112, KC0[], KC1[] 3811; EG-NEXT: TEX 0 @52 3812; EG-NEXT: ALU 5, @118, KC0[], KC1[] 3813; EG-NEXT: TEX 0 @54 3814; EG-NEXT: ALU 5, @124, KC0[], KC1[] 3815; EG-NEXT: TEX 0 @56 3816; EG-NEXT: ALU 5, @130, KC0[], KC1[] 3817; EG-NEXT: TEX 0 @58 3818; EG-NEXT: ALU 5, @136, KC0[], KC1[] 3819; EG-NEXT: TEX 0 @60 3820; EG-NEXT: ALU 5, @142, KC0[], KC1[] 3821; EG-NEXT: TEX 0 @62 3822; 
EG-NEXT: ALU 5, @148, KC0[], KC1[] 3823; EG-NEXT: TEX 0 @64 3824; EG-NEXT: ALU 5, @154, KC0[], KC1[] 3825; EG-NEXT: TEX 0 @66 3826; EG-NEXT: ALU 13, @160, KC0[CB0:0-32], KC1[] 3827; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0 3828; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 1 3829; EG-NEXT: CF_END 3830; EG-NEXT: Fetch clause starting at 36: 3831; EG-NEXT: VTX_READ_16 T12.X, T11.X, 98, #3 3832; EG-NEXT: Fetch clause starting at 38: 3833; EG-NEXT: VTX_READ_16 T12.X, T11.X, 90, #3 3834; EG-NEXT: Fetch clause starting at 40: 3835; EG-NEXT: VTX_READ_16 T12.X, T11.X, 82, #3 3836; EG-NEXT: Fetch clause starting at 42: 3837; EG-NEXT: VTX_READ_16 T12.X, T11.X, 74, #3 3838; EG-NEXT: Fetch clause starting at 44: 3839; EG-NEXT: VTX_READ_16 T12.X, T11.X, 96, #3 3840; EG-NEXT: Fetch clause starting at 46: 3841; EG-NEXT: VTX_READ_16 T12.X, T11.X, 88, #3 3842; EG-NEXT: Fetch clause starting at 48: 3843; EG-NEXT: VTX_READ_16 T12.X, T11.X, 80, #3 3844; EG-NEXT: Fetch clause starting at 50: 3845; EG-NEXT: VTX_READ_16 T12.X, T11.X, 72, #3 3846; EG-NEXT: Fetch clause starting at 52: 3847; EG-NEXT: VTX_READ_16 T12.X, T11.X, 94, #3 3848; EG-NEXT: Fetch clause starting at 54: 3849; EG-NEXT: VTX_READ_16 T12.X, T11.X, 86, #3 3850; EG-NEXT: Fetch clause starting at 56: 3851; EG-NEXT: VTX_READ_16 T12.X, T11.X, 78, #3 3852; EG-NEXT: Fetch clause starting at 58: 3853; EG-NEXT: VTX_READ_16 T12.X, T11.X, 70, #3 3854; EG-NEXT: Fetch clause starting at 60: 3855; EG-NEXT: VTX_READ_16 T12.X, T11.X, 92, #3 3856; EG-NEXT: Fetch clause starting at 62: 3857; EG-NEXT: VTX_READ_16 T12.X, T11.X, 84, #3 3858; EG-NEXT: Fetch clause starting at 64: 3859; EG-NEXT: VTX_READ_16 T13.X, T11.X, 76, #3 3860; EG-NEXT: Fetch clause starting at 66: 3861; EG-NEXT: VTX_READ_16 T11.X, T11.X, 68, #3 3862; EG-NEXT: ALU clause starting at 68: 3863; EG-NEXT: MOV * T0.Y, T3.X, 3864; EG-NEXT: MOV * T11.X, 0.0, 3865; EG-NEXT: ALU clause starting at 70: 3866; EG-NEXT: LSHL T0.W, T12.X, literal.x, 3867; 
EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3868; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 3869; EG-NEXT: OR_INT * T0.W, PS, PV.W, 3870; EG-NEXT: MOV T3.X, PV.W, 3871; EG-NEXT: MOV * T0.Y, T5.X, 3872; EG-NEXT: ALU clause starting at 76: 3873; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3874; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3875; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3876; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3877; EG-NEXT: MOV T5.X, PV.W, 3878; EG-NEXT: MOV * T0.Y, T7.X, 3879; EG-NEXT: ALU clause starting at 82: 3880; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3881; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3882; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3883; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3884; EG-NEXT: MOV T7.X, PV.W, 3885; EG-NEXT: MOV * T0.Y, T9.X, 3886; EG-NEXT: ALU clause starting at 88: 3887; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3888; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3889; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3890; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3891; EG-NEXT: MOV T9.X, PV.W, 3892; EG-NEXT: MOV * T0.Y, T3.X, 3893; EG-NEXT: ALU clause starting at 94: 3894; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3895; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, 3896; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3897; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3898; EG-NEXT: MOV T3.X, PV.W, 3899; EG-NEXT: MOV * T0.Y, T5.X, 3900; EG-NEXT: ALU clause starting at 100: 3901; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3902; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, 3903; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3904; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3905; EG-NEXT: MOV T5.X, PV.W, 3906; EG-NEXT: MOV * T0.Y, T7.X, 3907; EG-NEXT: ALU clause starting at 106: 3908; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3909; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, 3910; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3911; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3912; EG-NEXT: MOV T7.X, PV.W, 3913; EG-NEXT: MOV * T0.Y, T9.X, 3914; EG-NEXT: ALU clause starting at 112: 3915; EG-NEXT: 
AND_INT T0.W, T0.Y, literal.x, 3916; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, 3917; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3918; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3919; EG-NEXT: MOV T9.X, PV.W, 3920; EG-NEXT: MOV * T0.Y, T2.X, 3921; EG-NEXT: ALU clause starting at 118: 3922; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3923; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3924; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3925; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3926; EG-NEXT: MOV T2.X, PV.W, 3927; EG-NEXT: MOV * T0.Y, T4.X, 3928; EG-NEXT: ALU clause starting at 124: 3929; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3930; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3931; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3932; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3933; EG-NEXT: MOV T4.X, PV.W, 3934; EG-NEXT: MOV * T0.Y, T6.X, 3935; EG-NEXT: ALU clause starting at 130: 3936; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3937; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3938; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3939; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3940; EG-NEXT: MOV T6.X, PV.W, 3941; EG-NEXT: MOV * T0.Y, T8.X, 3942; EG-NEXT: ALU clause starting at 136: 3943; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3944; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3945; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3946; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3947; EG-NEXT: MOV T8.X, PV.W, 3948; EG-NEXT: MOV * T0.Y, T2.X, 3949; EG-NEXT: ALU clause starting at 142: 3950; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3951; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, 3952; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3953; EG-NEXT: OR_INT * T12.Z, PV.W, PS, 3954; EG-NEXT: MOV T2.X, PV.Z, 3955; EG-NEXT: MOV * T0.Y, T4.X, 3956; EG-NEXT: ALU clause starting at 148: 3957; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3958; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, 3959; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3960; EG-NEXT: OR_INT * T12.X, PV.W, PS, 3961; EG-NEXT: MOV T4.X, PV.X, 3962; EG-NEXT: MOV * T0.Y, T6.X, 3963; EG-NEXT: 
ALU clause starting at 154: 3964; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3965; EG-NEXT: AND_INT * T1.W, T13.X, literal.y, 3966; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3967; EG-NEXT: OR_INT * T11.Z, PV.W, PS, 3968; EG-NEXT: MOV T6.X, PV.Z, 3969; EG-NEXT: MOV * T0.Y, T8.X, 3970; EG-NEXT: ALU clause starting at 160: 3971; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x, 3972; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 3973; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) 3974; EG-NEXT: LSHR T14.X, PV.W, literal.x, 3975; EG-NEXT: AND_INT T0.W, T0.Y, literal.y, 3976; EG-NEXT: AND_INT * T1.W, T11.X, literal.z, 3977; EG-NEXT: 2(2.802597e-45), -65536(nan) 3978; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 3979; EG-NEXT: OR_INT * T11.X, PV.W, PS, 3980; EG-NEXT: MOV T8.X, PV.X, 3981; EG-NEXT: MOV * T12.W, T3.X, 3982; EG-NEXT: MOV T12.Y, T5.X, 3983; EG-NEXT: MOV T11.W, T7.X, BS:VEC_120/SCL_212 3984; EG-NEXT: MOV * T11.Y, T9.X, 3985; 3986; CM-LABEL: v16i16_arg: 3987; CM: ; %bb.0: ; %entry 3988; CM-NEXT: ALU 1, @68, KC0[], KC1[] 3989; CM-NEXT: TEX 0 @36 3990; CM-NEXT: ALU 5, @70, KC0[], KC1[] 3991; CM-NEXT: TEX 0 @38 3992; CM-NEXT: ALU 5, @76, KC0[], KC1[] 3993; CM-NEXT: TEX 0 @40 3994; CM-NEXT: ALU 5, @82, KC0[], KC1[] 3995; CM-NEXT: TEX 0 @42 3996; CM-NEXT: ALU 5, @88, KC0[], KC1[] 3997; CM-NEXT: TEX 0 @44 3998; CM-NEXT: ALU 5, @94, KC0[], KC1[] 3999; CM-NEXT: TEX 0 @46 4000; CM-NEXT: ALU 5, @100, KC0[], KC1[] 4001; CM-NEXT: TEX 0 @48 4002; CM-NEXT: ALU 5, @106, KC0[], KC1[] 4003; CM-NEXT: TEX 0 @50 4004; CM-NEXT: ALU 5, @112, KC0[], KC1[] 4005; CM-NEXT: TEX 0 @52 4006; CM-NEXT: ALU 5, @118, KC0[], KC1[] 4007; CM-NEXT: TEX 0 @54 4008; CM-NEXT: ALU 5, @124, KC0[], KC1[] 4009; CM-NEXT: TEX 0 @56 4010; CM-NEXT: ALU 5, @130, KC0[], KC1[] 4011; CM-NEXT: TEX 0 @58 4012; CM-NEXT: ALU 5, @136, KC0[], KC1[] 4013; CM-NEXT: TEX 0 @60 4014; CM-NEXT: ALU 5, @142, KC0[], KC1[] 4015; CM-NEXT: TEX 0 @62 4016; CM-NEXT: ALU 5, @148, KC0[], KC1[] 4017; CM-NEXT: TEX 0 @64 4018; CM-NEXT: ALU 5, @154, 
KC0[], KC1[] 4019; CM-NEXT: TEX 0 @66 4020; CM-NEXT: ALU 14, @160, KC0[CB0:0-32], KC1[] 4021; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T14.X 4022; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T13.X 4023; CM-NEXT: CF_END 4024; CM-NEXT: Fetch clause starting at 36: 4025; CM-NEXT: VTX_READ_16 T12.X, T11.X, 98, #3 4026; CM-NEXT: Fetch clause starting at 38: 4027; CM-NEXT: VTX_READ_16 T12.X, T11.X, 90, #3 4028; CM-NEXT: Fetch clause starting at 40: 4029; CM-NEXT: VTX_READ_16 T12.X, T11.X, 82, #3 4030; CM-NEXT: Fetch clause starting at 42: 4031; CM-NEXT: VTX_READ_16 T12.X, T11.X, 74, #3 4032; CM-NEXT: Fetch clause starting at 44: 4033; CM-NEXT: VTX_READ_16 T12.X, T11.X, 96, #3 4034; CM-NEXT: Fetch clause starting at 46: 4035; CM-NEXT: VTX_READ_16 T12.X, T11.X, 88, #3 4036; CM-NEXT: Fetch clause starting at 48: 4037; CM-NEXT: VTX_READ_16 T12.X, T11.X, 80, #3 4038; CM-NEXT: Fetch clause starting at 50: 4039; CM-NEXT: VTX_READ_16 T12.X, T11.X, 72, #3 4040; CM-NEXT: Fetch clause starting at 52: 4041; CM-NEXT: VTX_READ_16 T12.X, T11.X, 94, #3 4042; CM-NEXT: Fetch clause starting at 54: 4043; CM-NEXT: VTX_READ_16 T12.X, T11.X, 86, #3 4044; CM-NEXT: Fetch clause starting at 56: 4045; CM-NEXT: VTX_READ_16 T12.X, T11.X, 78, #3 4046; CM-NEXT: Fetch clause starting at 58: 4047; CM-NEXT: VTX_READ_16 T12.X, T11.X, 70, #3 4048; CM-NEXT: Fetch clause starting at 60: 4049; CM-NEXT: VTX_READ_16 T12.X, T11.X, 92, #3 4050; CM-NEXT: Fetch clause starting at 62: 4051; CM-NEXT: VTX_READ_16 T12.X, T11.X, 84, #3 4052; CM-NEXT: Fetch clause starting at 64: 4053; CM-NEXT: VTX_READ_16 T13.X, T11.X, 76, #3 4054; CM-NEXT: Fetch clause starting at 66: 4055; CM-NEXT: VTX_READ_16 T11.X, T11.X, 68, #3 4056; CM-NEXT: ALU clause starting at 68: 4057; CM-NEXT: MOV * T0.Y, T3.X, 4058; CM-NEXT: MOV * T11.X, 0.0, 4059; CM-NEXT: ALU clause starting at 70: 4060; CM-NEXT: LSHL T0.Z, T12.X, literal.x, 4061; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, 4062; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 4063; 
CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, 4064; CM-NEXT: MOV T3.X, PV.W, 4065; CM-NEXT: MOV * T0.Y, T5.X, 4066; CM-NEXT: ALU clause starting at 76: 4067; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4068; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 4069; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 4070; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4071; CM-NEXT: MOV T5.X, PV.W, 4072; CM-NEXT: MOV * T0.Y, T7.X, 4073; CM-NEXT: ALU clause starting at 82: 4074; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4075; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 4076; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 4077; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4078; CM-NEXT: MOV T7.X, PV.W, 4079; CM-NEXT: MOV * T0.Y, T9.X, 4080; CM-NEXT: ALU clause starting at 88: 4081; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4082; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 4083; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 4084; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4085; CM-NEXT: MOV T9.X, PV.W, 4086; CM-NEXT: MOV * T0.Y, T3.X, 4087; CM-NEXT: ALU clause starting at 94: 4088; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4089; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, 4090; CM-NEXT: -65536(nan), 65535(9.183409e-41) 4091; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4092; CM-NEXT: MOV T3.X, PV.W, 4093; CM-NEXT: MOV * T0.Y, T5.X, 4094; CM-NEXT: ALU clause starting at 100: 4095; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4096; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, 4097; CM-NEXT: -65536(nan), 65535(9.183409e-41) 4098; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4099; CM-NEXT: MOV T5.X, PV.W, 4100; CM-NEXT: MOV * T0.Y, T7.X, 4101; CM-NEXT: ALU clause starting at 106: 4102; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4103; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, 4104; CM-NEXT: -65536(nan), 65535(9.183409e-41) 4105; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4106; CM-NEXT: MOV T7.X, PV.W, 4107; CM-NEXT: MOV * T0.Y, T9.X, 4108; CM-NEXT: ALU clause starting at 112: 4109; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4110; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, 4111; 
CM-NEXT: -65536(nan), 65535(9.183409e-41) 4112; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4113; CM-NEXT: MOV T9.X, PV.W, 4114; CM-NEXT: MOV * T0.Y, T2.X, 4115; CM-NEXT: ALU clause starting at 118: 4116; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4117; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 4118; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 4119; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4120; CM-NEXT: MOV T2.X, PV.W, 4121; CM-NEXT: MOV * T0.Y, T4.X, 4122; CM-NEXT: ALU clause starting at 124: 4123; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4124; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 4125; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 4126; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4127; CM-NEXT: MOV T4.X, PV.W, 4128; CM-NEXT: MOV * T0.Y, T6.X, 4129; CM-NEXT: ALU clause starting at 130: 4130; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4131; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 4132; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 4133; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4134; CM-NEXT: MOV T6.X, PV.W, 4135; CM-NEXT: MOV * T0.Y, T8.X, 4136; CM-NEXT: ALU clause starting at 136: 4137; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4138; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 4139; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 4140; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4141; CM-NEXT: MOV T8.X, PV.W, 4142; CM-NEXT: MOV * T0.Y, T2.X, 4143; CM-NEXT: ALU clause starting at 142: 4144; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4145; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, 4146; CM-NEXT: -65536(nan), 65535(9.183409e-41) 4147; CM-NEXT: OR_INT * T12.Z, PV.Z, PV.W, 4148; CM-NEXT: MOV T2.X, PV.Z, 4149; CM-NEXT: MOV * T0.Y, T4.X, 4150; CM-NEXT: ALU clause starting at 148: 4151; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4152; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, 4153; CM-NEXT: -65536(nan), 65535(9.183409e-41) 4154; CM-NEXT: OR_INT * T12.X, PV.Z, PV.W, 4155; CM-NEXT: MOV T4.X, PV.X, 4156; CM-NEXT: MOV * T0.Y, T6.X, 4157; CM-NEXT: ALU clause starting at 154: 4158; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 
4159; CM-NEXT: AND_INT * T0.W, T13.X, literal.y, 4160; CM-NEXT: -65536(nan), 65535(9.183409e-41) 4161; CM-NEXT: OR_INT * T11.Z, PV.Z, PV.W, 4162; CM-NEXT: MOV T6.X, PV.Z, 4163; CM-NEXT: MOV * T0.Y, T8.X, 4164; CM-NEXT: ALU clause starting at 160: 4165; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 4166; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 4167; CM-NEXT: LSHR * T13.X, PV.W, literal.x, 4168; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4169; CM-NEXT: LSHR T14.X, KC0[2].Y, literal.x, 4170; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y, 4171; CM-NEXT: AND_INT * T0.W, T11.X, literal.z, 4172; CM-NEXT: 2(2.802597e-45), -65536(nan) 4173; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 4174; CM-NEXT: OR_INT * T11.X, PV.Z, PV.W, 4175; CM-NEXT: MOV T8.X, PV.X, 4176; CM-NEXT: MOV * T12.W, T3.X, 4177; CM-NEXT: MOV T12.Y, T5.X, 4178; CM-NEXT: MOV * T11.W, T7.X, BS:VEC_120/SCL_212 4179; CM-NEXT: MOV * T11.Y, T9.X, 4180entry: 4181 store <16 x i16> %in, <16 x i16> addrspace(1)* %out 4182 ret void 4183} 4184 4185define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind { 4186; SI-LABEL: v16i32_arg: 4187; SI: ; %bb.0: ; %entry 4188; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 4189; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4190; SI-NEXT: s_mov_b32 s3, 0xf000 4191; SI-NEXT: s_mov_b32 s2, -1 4192; SI-NEXT: s_waitcnt lgkmcnt(0) 4193; SI-NEXT: v_mov_b32_e32 v0, s16 4194; SI-NEXT: v_mov_b32_e32 v1, s17 4195; SI-NEXT: v_mov_b32_e32 v2, s18 4196; SI-NEXT: v_mov_b32_e32 v3, s19 4197; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 4198; SI-NEXT: s_waitcnt expcnt(0) 4199; SI-NEXT: v_mov_b32_e32 v0, s12 4200; SI-NEXT: v_mov_b32_e32 v1, s13 4201; SI-NEXT: v_mov_b32_e32 v2, s14 4202; SI-NEXT: v_mov_b32_e32 v3, s15 4203; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 4204; SI-NEXT: s_waitcnt expcnt(0) 4205; SI-NEXT: v_mov_b32_e32 v0, s8 4206; SI-NEXT: v_mov_b32_e32 v1, s9 4207; SI-NEXT: v_mov_b32_e32 v2, s10 4208; 
SI-NEXT: v_mov_b32_e32 v3, s11 4209; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 4210; SI-NEXT: s_waitcnt expcnt(0) 4211; SI-NEXT: v_mov_b32_e32 v0, s4 4212; SI-NEXT: v_mov_b32_e32 v1, s5 4213; SI-NEXT: v_mov_b32_e32 v2, s6 4214; SI-NEXT: v_mov_b32_e32 v3, s7 4215; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 4216; SI-NEXT: s_endpgm 4217; 4218; VI-LABEL: v16i32_arg: 4219; VI: ; %bb.0: ; %entry 4220; VI-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 4221; VI-NEXT: s_load_dwordx16 s[0:15], s[0:1], 0x64 4222; VI-NEXT: s_waitcnt lgkmcnt(0) 4223; VI-NEXT: v_mov_b32_e32 v0, s12 4224; VI-NEXT: s_add_u32 s12, s16, 48 4225; VI-NEXT: v_mov_b32_e32 v1, s13 4226; VI-NEXT: s_addc_u32 s13, s17, 0 4227; VI-NEXT: v_mov_b32_e32 v4, s12 4228; VI-NEXT: v_mov_b32_e32 v2, s14 4229; VI-NEXT: v_mov_b32_e32 v3, s15 4230; VI-NEXT: v_mov_b32_e32 v5, s13 4231; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4232; VI-NEXT: s_nop 0 4233; VI-NEXT: v_mov_b32_e32 v0, s8 4234; VI-NEXT: s_add_u32 s8, s16, 32 4235; VI-NEXT: v_mov_b32_e32 v1, s9 4236; VI-NEXT: s_addc_u32 s9, s17, 0 4237; VI-NEXT: v_mov_b32_e32 v4, s8 4238; VI-NEXT: v_mov_b32_e32 v2, s10 4239; VI-NEXT: v_mov_b32_e32 v3, s11 4240; VI-NEXT: v_mov_b32_e32 v5, s9 4241; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4242; VI-NEXT: s_nop 0 4243; VI-NEXT: v_mov_b32_e32 v0, s4 4244; VI-NEXT: s_add_u32 s4, s16, 16 4245; VI-NEXT: v_mov_b32_e32 v1, s5 4246; VI-NEXT: s_addc_u32 s5, s17, 0 4247; VI-NEXT: v_mov_b32_e32 v4, s4 4248; VI-NEXT: v_mov_b32_e32 v2, s6 4249; VI-NEXT: v_mov_b32_e32 v3, s7 4250; VI-NEXT: v_mov_b32_e32 v5, s5 4251; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4252; VI-NEXT: v_mov_b32_e32 v4, s16 4253; VI-NEXT: v_mov_b32_e32 v0, s0 4254; VI-NEXT: v_mov_b32_e32 v1, s1 4255; VI-NEXT: v_mov_b32_e32 v2, s2 4256; VI-NEXT: v_mov_b32_e32 v3, s3 4257; VI-NEXT: v_mov_b32_e32 v5, s17 4258; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4259; VI-NEXT: s_endpgm 4260; 4261; GFX9-LABEL: v16i32_arg: 4262; GFX9: ; %bb.0: ; %entry 
4263; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 4264; GFX9-NEXT: v_mov_b32_e32 v4, 0 4265; GFX9-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x40 4266; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4267; GFX9-NEXT: v_mov_b32_e32 v0, s12 4268; GFX9-NEXT: v_mov_b32_e32 v1, s13 4269; GFX9-NEXT: v_mov_b32_e32 v2, s14 4270; GFX9-NEXT: v_mov_b32_e32 v3, s15 4271; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:48 4272; GFX9-NEXT: s_nop 0 4273; GFX9-NEXT: v_mov_b32_e32 v0, s8 4274; GFX9-NEXT: v_mov_b32_e32 v1, s9 4275; GFX9-NEXT: v_mov_b32_e32 v2, s10 4276; GFX9-NEXT: v_mov_b32_e32 v3, s11 4277; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:32 4278; GFX9-NEXT: s_nop 0 4279; GFX9-NEXT: v_mov_b32_e32 v0, s4 4280; GFX9-NEXT: v_mov_b32_e32 v1, s5 4281; GFX9-NEXT: v_mov_b32_e32 v2, s6 4282; GFX9-NEXT: v_mov_b32_e32 v3, s7 4283; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 4284; GFX9-NEXT: s_nop 0 4285; GFX9-NEXT: v_mov_b32_e32 v0, s0 4286; GFX9-NEXT: v_mov_b32_e32 v1, s1 4287; GFX9-NEXT: v_mov_b32_e32 v2, s2 4288; GFX9-NEXT: v_mov_b32_e32 v3, s3 4289; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] 4290; GFX9-NEXT: s_endpgm 4291; 4292; EG-LABEL: v16i32_arg: 4293; EG: ; %bb.0: ; %entry 4294; EG-NEXT: ALU 29, @6, KC0[CB0:0-32], KC1[] 4295; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0 4296; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0 4297; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T4.X, 0 4298; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 4299; EG-NEXT: CF_END 4300; EG-NEXT: ALU clause starting at 6: 4301; EG-NEXT: MOV * T0.W, KC0[7].X, 4302; EG-NEXT: MOV * T0.Z, KC0[6].W, 4303; EG-NEXT: MOV T0.Y, KC0[6].Z, 4304; EG-NEXT: MOV * T1.W, KC0[8].X, 4305; EG-NEXT: MOV T0.X, KC0[6].Y, 4306; EG-NEXT: MOV * T1.Z, KC0[7].W, 4307; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 4308; EG-NEXT: MOV * T1.Y, KC0[7].Z, 4309; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4310; EG-NEXT: MOV * T3.W, KC0[9].X, 4311; EG-NEXT: MOV T1.X, KC0[7].Y, 
4312; EG-NEXT: MOV * T3.Z, KC0[8].W, 4313; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4314; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 4315; EG-NEXT: LSHR T4.X, PV.W, literal.x, 4316; EG-NEXT: MOV T3.Y, KC0[8].Z, 4317; EG-NEXT: MOV * T5.W, KC0[10].X, 4318; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4319; EG-NEXT: MOV T3.X, KC0[8].Y, 4320; EG-NEXT: MOV * T5.Z, KC0[9].W, 4321; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4322; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 4323; EG-NEXT: LSHR T6.X, PV.W, literal.x, 4324; EG-NEXT: MOV T5.Y, KC0[9].Z, 4325; EG-NEXT: MOV * T5.X, KC0[9].Y, 4326; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4327; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4328; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) 4329; EG-NEXT: LSHR * T7.X, PV.W, literal.x, 4330; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4331; 4332; CM-LABEL: v16i32_arg: 4333; CM: ; %bb.0: ; %entry 4334; CM-NEXT: ALU 28, @6, KC0[CB0:0-32], KC1[] 4335; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T7.X 4336; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T6.X 4337; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X 4338; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X 4339; CM-NEXT: CF_END 4340; CM-NEXT: ALU clause starting at 6: 4341; CM-NEXT: MOV * T0.W, KC0[10].X, 4342; CM-NEXT: MOV * T0.Z, KC0[9].W, 4343; CM-NEXT: MOV * T0.Y, KC0[9].Z, 4344; CM-NEXT: MOV T0.X, KC0[9].Y, 4345; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x, 4346; CM-NEXT: MOV * T2.W, KC0[9].X, 4347; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00) 4348; CM-NEXT: MOV T2.Z, KC0[8].W, 4349; CM-NEXT: MOV * T1.W, KC0[8].X, 4350; CM-NEXT: LSHR T3.X, T1.Z, literal.x, 4351; CM-NEXT: MOV T2.Y, KC0[8].Z, 4352; CM-NEXT: MOV * T1.Z, KC0[7].W, 4353; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4354; CM-NEXT: MOV T2.X, KC0[8].Y, 4355; CM-NEXT: MOV * T1.Y, KC0[7].Z, 4356; CM-NEXT: MOV T1.X, KC0[7].Y, 4357; CM-NEXT: ADD_INT T3.Z, KC0[2].Y, literal.x, 4358; CM-NEXT: MOV * T4.W, KC0[7].X, 4359; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00) 4360; CM-NEXT: 
LSHR T5.X, PV.Z, literal.x, 4361; CM-NEXT: MOV T4.Z, KC0[6].W, 4362; CM-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, 4363; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) 4364; CM-NEXT: LSHR T6.X, PV.W, literal.x, 4365; CM-NEXT: MOV * T4.Y, KC0[6].Z, 4366; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4367; CM-NEXT: MOV * T4.X, KC0[6].Y, 4368; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 4369; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4370entry: 4371 store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4 4372 ret void 4373} 4374 4375define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind { 4376; SI-LABEL: v16f32_arg: 4377; SI: ; %bb.0: ; %entry 4378; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 4379; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4380; SI-NEXT: s_mov_b32 s3, 0xf000 4381; SI-NEXT: s_mov_b32 s2, -1 4382; SI-NEXT: s_waitcnt lgkmcnt(0) 4383; SI-NEXT: v_mov_b32_e32 v0, s16 4384; SI-NEXT: v_mov_b32_e32 v1, s17 4385; SI-NEXT: v_mov_b32_e32 v2, s18 4386; SI-NEXT: v_mov_b32_e32 v3, s19 4387; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 4388; SI-NEXT: s_waitcnt expcnt(0) 4389; SI-NEXT: v_mov_b32_e32 v0, s12 4390; SI-NEXT: v_mov_b32_e32 v1, s13 4391; SI-NEXT: v_mov_b32_e32 v2, s14 4392; SI-NEXT: v_mov_b32_e32 v3, s15 4393; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 4394; SI-NEXT: s_waitcnt expcnt(0) 4395; SI-NEXT: v_mov_b32_e32 v0, s8 4396; SI-NEXT: v_mov_b32_e32 v1, s9 4397; SI-NEXT: v_mov_b32_e32 v2, s10 4398; SI-NEXT: v_mov_b32_e32 v3, s11 4399; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 4400; SI-NEXT: s_waitcnt expcnt(0) 4401; SI-NEXT: v_mov_b32_e32 v0, s4 4402; SI-NEXT: v_mov_b32_e32 v1, s5 4403; SI-NEXT: v_mov_b32_e32 v2, s6 4404; SI-NEXT: v_mov_b32_e32 v3, s7 4405; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 4406; SI-NEXT: s_endpgm 4407; 4408; VI-LABEL: v16f32_arg: 4409; VI: ; %bb.0: ; %entry 4410; VI-NEXT: s_load_dwordx2 s[16:17], s[0:1], 
0x24 4411; VI-NEXT: s_load_dwordx16 s[0:15], s[0:1], 0x64 4412; VI-NEXT: s_waitcnt lgkmcnt(0) 4413; VI-NEXT: v_mov_b32_e32 v0, s12 4414; VI-NEXT: s_add_u32 s12, s16, 48 4415; VI-NEXT: v_mov_b32_e32 v1, s13 4416; VI-NEXT: s_addc_u32 s13, s17, 0 4417; VI-NEXT: v_mov_b32_e32 v4, s12 4418; VI-NEXT: v_mov_b32_e32 v2, s14 4419; VI-NEXT: v_mov_b32_e32 v3, s15 4420; VI-NEXT: v_mov_b32_e32 v5, s13 4421; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4422; VI-NEXT: s_nop 0 4423; VI-NEXT: v_mov_b32_e32 v0, s8 4424; VI-NEXT: s_add_u32 s8, s16, 32 4425; VI-NEXT: v_mov_b32_e32 v1, s9 4426; VI-NEXT: s_addc_u32 s9, s17, 0 4427; VI-NEXT: v_mov_b32_e32 v4, s8 4428; VI-NEXT: v_mov_b32_e32 v2, s10 4429; VI-NEXT: v_mov_b32_e32 v3, s11 4430; VI-NEXT: v_mov_b32_e32 v5, s9 4431; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4432; VI-NEXT: s_nop 0 4433; VI-NEXT: v_mov_b32_e32 v0, s4 4434; VI-NEXT: s_add_u32 s4, s16, 16 4435; VI-NEXT: v_mov_b32_e32 v1, s5 4436; VI-NEXT: s_addc_u32 s5, s17, 0 4437; VI-NEXT: v_mov_b32_e32 v4, s4 4438; VI-NEXT: v_mov_b32_e32 v2, s6 4439; VI-NEXT: v_mov_b32_e32 v3, s7 4440; VI-NEXT: v_mov_b32_e32 v5, s5 4441; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4442; VI-NEXT: v_mov_b32_e32 v4, s16 4443; VI-NEXT: v_mov_b32_e32 v0, s0 4444; VI-NEXT: v_mov_b32_e32 v1, s1 4445; VI-NEXT: v_mov_b32_e32 v2, s2 4446; VI-NEXT: v_mov_b32_e32 v3, s3 4447; VI-NEXT: v_mov_b32_e32 v5, s17 4448; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4449; VI-NEXT: s_endpgm 4450; 4451; GFX9-LABEL: v16f32_arg: 4452; GFX9: ; %bb.0: ; %entry 4453; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 4454; GFX9-NEXT: v_mov_b32_e32 v4, 0 4455; GFX9-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x40 4456; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4457; GFX9-NEXT: v_mov_b32_e32 v0, s12 4458; GFX9-NEXT: v_mov_b32_e32 v1, s13 4459; GFX9-NEXT: v_mov_b32_e32 v2, s14 4460; GFX9-NEXT: v_mov_b32_e32 v3, s15 4461; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:48 4462; GFX9-NEXT: s_nop 0 4463; GFX9-NEXT: v_mov_b32_e32 v0, s8 
4464; GFX9-NEXT: v_mov_b32_e32 v1, s9 4465; GFX9-NEXT: v_mov_b32_e32 v2, s10 4466; GFX9-NEXT: v_mov_b32_e32 v3, s11 4467; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:32 4468; GFX9-NEXT: s_nop 0 4469; GFX9-NEXT: v_mov_b32_e32 v0, s4 4470; GFX9-NEXT: v_mov_b32_e32 v1, s5 4471; GFX9-NEXT: v_mov_b32_e32 v2, s6 4472; GFX9-NEXT: v_mov_b32_e32 v3, s7 4473; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 4474; GFX9-NEXT: s_nop 0 4475; GFX9-NEXT: v_mov_b32_e32 v0, s0 4476; GFX9-NEXT: v_mov_b32_e32 v1, s1 4477; GFX9-NEXT: v_mov_b32_e32 v2, s2 4478; GFX9-NEXT: v_mov_b32_e32 v3, s3 4479; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] 4480; GFX9-NEXT: s_endpgm 4481; 4482; EG-LABEL: v16f32_arg: 4483; EG: ; %bb.0: ; %entry 4484; EG-NEXT: ALU 29, @6, KC0[CB0:0-32], KC1[] 4485; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0 4486; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0 4487; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T4.X, 0 4488; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 4489; EG-NEXT: CF_END 4490; EG-NEXT: ALU clause starting at 6: 4491; EG-NEXT: MOV * T0.W, KC0[7].X, 4492; EG-NEXT: MOV * T0.Z, KC0[6].W, 4493; EG-NEXT: MOV T0.Y, KC0[6].Z, 4494; EG-NEXT: MOV * T1.W, KC0[8].X, 4495; EG-NEXT: MOV T0.X, KC0[6].Y, 4496; EG-NEXT: MOV * T1.Z, KC0[7].W, 4497; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 4498; EG-NEXT: MOV * T1.Y, KC0[7].Z, 4499; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4500; EG-NEXT: MOV * T3.W, KC0[9].X, 4501; EG-NEXT: MOV T1.X, KC0[7].Y, 4502; EG-NEXT: MOV * T3.Z, KC0[8].W, 4503; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4504; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 4505; EG-NEXT: LSHR T4.X, PV.W, literal.x, 4506; EG-NEXT: MOV T3.Y, KC0[8].Z, 4507; EG-NEXT: MOV * T5.W, KC0[10].X, 4508; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4509; EG-NEXT: MOV T3.X, KC0[8].Y, 4510; EG-NEXT: MOV * T5.Z, KC0[9].W, 4511; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4512; EG-NEXT: 32(4.484155e-44), 
0(0.000000e+00) 4513; EG-NEXT: LSHR T6.X, PV.W, literal.x, 4514; EG-NEXT: MOV T5.Y, KC0[9].Z, 4515; EG-NEXT: MOV * T5.X, KC0[9].Y, 4516; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4517; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4518; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) 4519; EG-NEXT: LSHR * T7.X, PV.W, literal.x, 4520; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4521; 4522; CM-LABEL: v16f32_arg: 4523; CM: ; %bb.0: ; %entry 4524; CM-NEXT: ALU 28, @6, KC0[CB0:0-32], KC1[] 4525; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T7.X 4526; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T6.X 4527; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X 4528; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X 4529; CM-NEXT: CF_END 4530; CM-NEXT: ALU clause starting at 6: 4531; CM-NEXT: MOV * T0.W, KC0[10].X, 4532; CM-NEXT: MOV * T0.Z, KC0[9].W, 4533; CM-NEXT: MOV * T0.Y, KC0[9].Z, 4534; CM-NEXT: MOV T0.X, KC0[9].Y, 4535; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x, 4536; CM-NEXT: MOV * T2.W, KC0[9].X, 4537; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00) 4538; CM-NEXT: MOV T2.Z, KC0[8].W, 4539; CM-NEXT: MOV * T1.W, KC0[8].X, 4540; CM-NEXT: LSHR T3.X, T1.Z, literal.x, 4541; CM-NEXT: MOV T2.Y, KC0[8].Z, 4542; CM-NEXT: MOV * T1.Z, KC0[7].W, 4543; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4544; CM-NEXT: MOV T2.X, KC0[8].Y, 4545; CM-NEXT: MOV * T1.Y, KC0[7].Z, 4546; CM-NEXT: MOV T1.X, KC0[7].Y, 4547; CM-NEXT: ADD_INT T3.Z, KC0[2].Y, literal.x, 4548; CM-NEXT: MOV * T4.W, KC0[7].X, 4549; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00) 4550; CM-NEXT: LSHR T5.X, PV.Z, literal.x, 4551; CM-NEXT: MOV T4.Z, KC0[6].W, 4552; CM-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, 4553; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) 4554; CM-NEXT: LSHR T6.X, PV.W, literal.x, 4555; CM-NEXT: MOV * T4.Y, KC0[6].Z, 4556; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4557; CM-NEXT: MOV * T4.X, KC0[6].Y, 4558; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 4559; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4560entry: 4561 store <16 x float> 
%in, <16 x float> addrspace(1)* %out, align 4 4562 ret void 4563} 4564 4565define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { 4566; SI-LABEL: kernel_arg_i64: 4567; SI: ; %bb.0: 4568; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 4569; SI-NEXT: s_mov_b32 s7, 0xf000 4570; SI-NEXT: s_mov_b32 s6, -1 4571; SI-NEXT: s_waitcnt lgkmcnt(0) 4572; SI-NEXT: s_mov_b32 s4, s0 4573; SI-NEXT: s_mov_b32 s5, s1 4574; SI-NEXT: v_mov_b32_e32 v0, s2 4575; SI-NEXT: v_mov_b32_e32 v1, s3 4576; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4577; SI-NEXT: s_endpgm 4578; 4579; VI-LABEL: kernel_arg_i64: 4580; VI: ; %bb.0: 4581; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 4582; VI-NEXT: s_waitcnt lgkmcnt(0) 4583; VI-NEXT: v_mov_b32_e32 v0, s0 4584; VI-NEXT: v_mov_b32_e32 v1, s1 4585; VI-NEXT: v_mov_b32_e32 v2, s2 4586; VI-NEXT: v_mov_b32_e32 v3, s3 4587; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 4588; VI-NEXT: s_endpgm 4589; 4590; GFX9-LABEL: kernel_arg_i64: 4591; GFX9: ; %bb.0: 4592; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4593; GFX9-NEXT: v_mov_b32_e32 v2, 0 4594; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4595; GFX9-NEXT: v_mov_b32_e32 v0, s2 4596; GFX9-NEXT: v_mov_b32_e32 v1, s3 4597; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 4598; GFX9-NEXT: s_endpgm 4599; 4600; EG-LABEL: kernel_arg_i64: 4601; EG: ; %bb.0: 4602; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 4603; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 4604; EG-NEXT: CF_END 4605; EG-NEXT: PAD 4606; EG-NEXT: ALU clause starting at 4: 4607; EG-NEXT: MOV * T0.Y, KC0[3].X, 4608; EG-NEXT: MOV T0.X, KC0[2].W, 4609; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4610; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4611; 4612; CM-LABEL: kernel_arg_i64: 4613; CM: ; %bb.0: 4614; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 4615; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 4616; CM-NEXT: CF_END 4617; CM-NEXT: PAD 4618; CM-NEXT: ALU clause starting at 4: 4619; CM-NEXT: MOV * T0.Y, KC0[3].X, 4620; CM-NEXT: 
MOV * T0.X, KC0[2].W, 4621; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4622; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4623 store i64 %a, i64 addrspace(1)* %out, align 8 4624 ret void 4625} 4626 4627define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) { 4628; SI-LABEL: f64_kernel_arg: 4629; SI: ; %bb.0: ; %entry 4630; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 4631; SI-NEXT: s_mov_b32 s7, 0xf000 4632; SI-NEXT: s_mov_b32 s6, -1 4633; SI-NEXT: s_waitcnt lgkmcnt(0) 4634; SI-NEXT: s_mov_b32 s4, s0 4635; SI-NEXT: s_mov_b32 s5, s1 4636; SI-NEXT: v_mov_b32_e32 v0, s2 4637; SI-NEXT: v_mov_b32_e32 v1, s3 4638; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4639; SI-NEXT: s_endpgm 4640; 4641; VI-LABEL: f64_kernel_arg: 4642; VI: ; %bb.0: ; %entry 4643; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 4644; VI-NEXT: s_waitcnt lgkmcnt(0) 4645; VI-NEXT: v_mov_b32_e32 v0, s0 4646; VI-NEXT: v_mov_b32_e32 v1, s1 4647; VI-NEXT: v_mov_b32_e32 v2, s2 4648; VI-NEXT: v_mov_b32_e32 v3, s3 4649; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 4650; VI-NEXT: s_endpgm 4651; 4652; GFX9-LABEL: f64_kernel_arg: 4653; GFX9: ; %bb.0: ; %entry 4654; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4655; GFX9-NEXT: v_mov_b32_e32 v2, 0 4656; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4657; GFX9-NEXT: v_mov_b32_e32 v0, s2 4658; GFX9-NEXT: v_mov_b32_e32 v1, s3 4659; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 4660; GFX9-NEXT: s_endpgm 4661; 4662; EG-LABEL: f64_kernel_arg: 4663; EG: ; %bb.0: ; %entry 4664; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 4665; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 4666; EG-NEXT: CF_END 4667; EG-NEXT: PAD 4668; EG-NEXT: ALU clause starting at 4: 4669; EG-NEXT: MOV * T0.Y, KC0[3].X, 4670; EG-NEXT: MOV T0.X, KC0[2].W, 4671; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4672; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4673; 4674; CM-LABEL: f64_kernel_arg: 4675; CM: ; %bb.0: ; %entry 4676; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 4677; CM-NEXT: 
MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 4678; CM-NEXT: CF_END 4679; CM-NEXT: PAD 4680; CM-NEXT: ALU clause starting at 4: 4681; CM-NEXT: MOV * T0.Y, KC0[3].X, 4682; CM-NEXT: MOV * T0.X, KC0[2].W, 4683; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4684; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4685entry: 4686 store double %in, double addrspace(1)* %out 4687 ret void 4688} 4689 4690; XFUNC-LABEL: {{^}}kernel_arg_v1i64: 4691; XGCN: s_load_dwordx2 4692; XGCN: s_load_dwordx2 4693; XGCN: buffer_store_dwordx2 4694; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { 4695; store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8 4696; ret void 4697; } 4698 4699define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind { 4700; SI-LABEL: i65_arg: 4701; SI: ; %bb.0: ; %entry 4702; SI-NEXT: s_load_dword s2, s[0:1], 0xd 4703; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 4704; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4705; SI-NEXT: s_mov_b32 s3, 0xf000 4706; SI-NEXT: s_waitcnt lgkmcnt(0) 4707; SI-NEXT: s_and_b32 s6, s2, 1 4708; SI-NEXT: s_mov_b32 s2, -1 4709; SI-NEXT: v_mov_b32_e32 v0, s4 4710; SI-NEXT: v_mov_b32_e32 v1, s5 4711; SI-NEXT: v_mov_b32_e32 v2, s6 4712; SI-NEXT: buffer_store_byte v2, off, s[0:3], 0 offset:8 4713; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4714; SI-NEXT: s_endpgm 4715; 4716; VI-LABEL: i65_arg: 4717; VI: ; %bb.0: ; %entry 4718; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4719; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 4720; VI-NEXT: s_load_dword s0, s[0:1], 0x34 4721; VI-NEXT: s_waitcnt lgkmcnt(0) 4722; VI-NEXT: v_mov_b32_e32 v0, s2 4723; VI-NEXT: v_mov_b32_e32 v1, s3 4724; VI-NEXT: s_and_b32 s1, s0, 1 4725; VI-NEXT: s_add_u32 s0, s2, 8 4726; VI-NEXT: v_mov_b32_e32 v4, s1 4727; VI-NEXT: s_addc_u32 s1, s3, 0 4728; VI-NEXT: v_mov_b32_e32 v3, s1 4729; VI-NEXT: v_mov_b32_e32 v2, s0 4730; VI-NEXT: flat_store_byte v[2:3], v4 4731; VI-NEXT: v_mov_b32_e32 v2, s4 4732; 
VI-NEXT: v_mov_b32_e32 v3, s5 4733; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 4734; VI-NEXT: s_endpgm 4735; 4736; GFX9-LABEL: i65_arg: 4737; GFX9: ; %bb.0: ; %entry 4738; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4739; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 4740; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 4741; GFX9-NEXT: v_mov_b32_e32 v2, 0 4742; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4743; GFX9-NEXT: v_mov_b32_e32 v0, s2 4744; GFX9-NEXT: s_and_b32 s4, s6, 1 4745; GFX9-NEXT: v_mov_b32_e32 v3, s4 4746; GFX9-NEXT: v_mov_b32_e32 v1, s3 4747; GFX9-NEXT: global_store_byte v2, v3, s[0:1] offset:8 4748; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 4749; GFX9-NEXT: s_endpgm 4750; 4751; EG-LABEL: i65_arg: 4752; EG: ; %bb.0: ; %entry 4753; EG-NEXT: ALU 20, @6, KC0[CB0:0-32], KC1[] 4754; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0 4755; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 4756; EG-NEXT: MEM_RAT MSKOR T1.XW, T0.X 4757; EG-NEXT: CF_END 4758; EG-NEXT: PAD 4759; EG-NEXT: ALU clause starting at 6: 4760; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 4761; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 4762; EG-NEXT: AND_INT * T1.W, PV.W, literal.x, 4763; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4764; EG-NEXT: LSHL T1.W, PV.W, literal.x, 4765; EG-NEXT: AND_INT * T2.W, KC0[3].Y, 1, 4766; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4767; EG-NEXT: LSHL T1.X, PS, PV.W, 4768; EG-NEXT: LSHL * T1.W, literal.x, PV.W, 4769; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 4770; EG-NEXT: MOV T1.Y, 0.0, 4771; EG-NEXT: MOV * T1.Z, 0.0, 4772; EG-NEXT: LSHR T0.X, T0.W, literal.x, 4773; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 4774; EG-NEXT: 2(2.802597e-45), 4(5.605194e-45) 4775; EG-NEXT: LSHR T2.X, PV.W, literal.x, 4776; EG-NEXT: MOV * T3.X, KC0[3].X, 4777; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4778; EG-NEXT: LSHR T4.X, KC0[2].Y, literal.x, 4779; EG-NEXT: MOV * T5.X, KC0[2].W, 4780; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4781; 4782; CM-LABEL: i65_arg: 4783; CM: ; 
%bb.0: ; %entry 4784; CM-NEXT: ALU 21, @6, KC0[CB0:0-32], KC1[] 4785; CM-NEXT: MEM_RAT MSKOR T1.XW, T5.X 4786; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X 4787; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T0.X 4788; CM-NEXT: CF_END 4789; CM-NEXT: PAD 4790; CM-NEXT: ALU clause starting at 6: 4791; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 4792; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 4793; CM-NEXT: AND_INT * T1.W, PV.W, literal.x, 4794; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4795; CM-NEXT: LSHL T0.Z, PV.W, literal.x, 4796; CM-NEXT: AND_INT * T1.W, KC0[3].Y, 1, 4797; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4798; CM-NEXT: LSHL T1.X, PV.W, PV.Z, 4799; CM-NEXT: LSHL * T1.W, literal.x, PV.Z, 4800; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 4801; CM-NEXT: MOV T1.Y, 0.0, 4802; CM-NEXT: MOV * T1.Z, 0.0, 4803; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 4804; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4805; CM-NEXT: MOV T2.X, KC0[2].W, 4806; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4807; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00) 4808; CM-NEXT: LSHR * T3.X, PV.W, literal.x, 4809; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4810; CM-NEXT: MOV * T4.X, KC0[3].X, 4811; CM-NEXT: LSHR * T5.X, T0.W, literal.x, 4812; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4813entry: 4814 store i65 %in, i65 addrspace(1)* %out, align 4 4815 ret void 4816} 4817 4818define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind { 4819; SI-LABEL: i1_arg: 4820; SI: ; %bb.0: 4821; SI-NEXT: s_load_dword s2, s[0:1], 0xb 4822; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4823; SI-NEXT: s_mov_b32 s3, 0xf000 4824; SI-NEXT: s_waitcnt lgkmcnt(0) 4825; SI-NEXT: s_and_b32 s4, s2, 1 4826; SI-NEXT: s_mov_b32 s2, -1 4827; SI-NEXT: v_mov_b32_e32 v0, s4 4828; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 4829; SI-NEXT: s_endpgm 4830; 4831; VI-LABEL: i1_arg: 4832; VI: ; %bb.0: 4833; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4834; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 4835; VI-NEXT: 
s_waitcnt lgkmcnt(0) 4836; VI-NEXT: v_mov_b32_e32 v0, s2 4837; VI-NEXT: s_and_b32 s0, s0, 1 4838; VI-NEXT: v_mov_b32_e32 v1, s3 4839; VI-NEXT: v_mov_b32_e32 v2, s0 4840; VI-NEXT: flat_store_byte v[0:1], v2 4841; VI-NEXT: s_endpgm 4842; 4843; GFX9-LABEL: i1_arg: 4844; GFX9: ; %bb.0: 4845; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4846; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 4847; GFX9-NEXT: v_mov_b32_e32 v0, 0 4848; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4849; GFX9-NEXT: s_and_b32 s2, s2, 1 4850; GFX9-NEXT: v_mov_b32_e32 v1, s2 4851; GFX9-NEXT: global_store_byte v0, v1, s[0:1] 4852; GFX9-NEXT: s_endpgm 4853; 4854; EG-LABEL: i1_arg: 4855; EG: ; %bb.0: 4856; EG-NEXT: ALU 0, @8, KC0[], KC1[] 4857; EG-NEXT: TEX 0 @6 4858; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[] 4859; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 4860; EG-NEXT: CF_END 4861; EG-NEXT: PAD 4862; EG-NEXT: Fetch clause starting at 6: 4863; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 4864; EG-NEXT: ALU clause starting at 8: 4865; EG-NEXT: MOV * T0.X, 0.0, 4866; EG-NEXT: ALU clause starting at 9: 4867; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, 4868; EG-NEXT: AND_INT * T1.W, T0.X, 1, 4869; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4870; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 4871; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4872; EG-NEXT: LSHL T0.X, T1.W, PV.W, 4873; EG-NEXT: LSHL * T0.W, literal.x, PV.W, 4874; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 4875; EG-NEXT: MOV T0.Y, 0.0, 4876; EG-NEXT: MOV * T0.Z, 0.0, 4877; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4878; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4879; 4880; CM-LABEL: i1_arg: 4881; CM: ; %bb.0: 4882; CM-NEXT: ALU 0, @8, KC0[], KC1[] 4883; CM-NEXT: TEX 0 @6 4884; CM-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[] 4885; CM-NEXT: MEM_RAT MSKOR T0.XW, T1.X 4886; CM-NEXT: CF_END 4887; CM-NEXT: PAD 4888; CM-NEXT: Fetch clause starting at 6: 4889; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 4890; CM-NEXT: ALU clause starting at 8: 4891; CM-NEXT: MOV * T0.X, 0.0, 4892; CM-NEXT: ALU 
clause starting at 9: 4893; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x, 4894; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4895; CM-NEXT: AND_INT T0.Z, T0.X, 1, 4896; CM-NEXT: LSHL * T0.W, PV.W, literal.x, 4897; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4898; CM-NEXT: LSHL T0.X, PV.Z, PV.W, 4899; CM-NEXT: LSHL * T0.W, literal.x, PV.W, 4900; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 4901; CM-NEXT: MOV T0.Y, 0.0, 4902; CM-NEXT: MOV * T0.Z, 0.0, 4903; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4904; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4905 store i1 %x, i1 addrspace(1)* %out, align 1 4906 ret void 4907} 4908 4909define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { 4910; SI-LABEL: i1_arg_zext_i32: 4911; SI: ; %bb.0: 4912; SI-NEXT: s_load_dword s2, s[0:1], 0xb 4913; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4914; SI-NEXT: s_mov_b32 s3, 0xf000 4915; SI-NEXT: s_waitcnt lgkmcnt(0) 4916; SI-NEXT: s_and_b32 s4, s2, 1 4917; SI-NEXT: s_mov_b32 s2, -1 4918; SI-NEXT: v_mov_b32_e32 v0, s4 4919; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 4920; SI-NEXT: s_endpgm 4921; 4922; VI-LABEL: i1_arg_zext_i32: 4923; VI: ; %bb.0: 4924; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4925; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 4926; VI-NEXT: s_waitcnt lgkmcnt(0) 4927; VI-NEXT: v_mov_b32_e32 v0, s2 4928; VI-NEXT: s_and_b32 s0, s0, 1 4929; VI-NEXT: v_mov_b32_e32 v1, s3 4930; VI-NEXT: v_mov_b32_e32 v2, s0 4931; VI-NEXT: flat_store_dword v[0:1], v2 4932; VI-NEXT: s_endpgm 4933; 4934; GFX9-LABEL: i1_arg_zext_i32: 4935; GFX9: ; %bb.0: 4936; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4937; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 4938; GFX9-NEXT: v_mov_b32_e32 v0, 0 4939; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4940; GFX9-NEXT: s_and_b32 s2, s2, 1 4941; GFX9-NEXT: v_mov_b32_e32 v1, s2 4942; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 4943; GFX9-NEXT: s_endpgm 4944; 4945; EG-LABEL: i1_arg_zext_i32: 4946; EG: ; %bb.0: 4947; EG-NEXT: ALU 0, @8, KC0[], KC1[] 4948; 
EG-NEXT: TEX 0 @6 4949; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[] 4950; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 4951; EG-NEXT: CF_END 4952; EG-NEXT: PAD 4953; EG-NEXT: Fetch clause starting at 6: 4954; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 4955; EG-NEXT: ALU clause starting at 8: 4956; EG-NEXT: MOV * T0.X, 0.0, 4957; EG-NEXT: ALU clause starting at 9: 4958; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4959; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4960; 4961; CM-LABEL: i1_arg_zext_i32: 4962; CM: ; %bb.0: 4963; CM-NEXT: ALU 0, @8, KC0[], KC1[] 4964; CM-NEXT: TEX 0 @6 4965; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[] 4966; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 4967; CM-NEXT: CF_END 4968; CM-NEXT: PAD 4969; CM-NEXT: Fetch clause starting at 6: 4970; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 4971; CM-NEXT: ALU clause starting at 8: 4972; CM-NEXT: MOV * T0.X, 0.0, 4973; CM-NEXT: ALU clause starting at 9: 4974; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4975; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4976 %ext = zext i1 %x to i32 4977 store i32 %ext, i32 addrspace(1)* %out, align 4 4978 ret void 4979} 4980 4981define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { 4982; SI-LABEL: i1_arg_zext_i64: 4983; SI: ; %bb.0: 4984; SI-NEXT: s_load_dword s4, s[0:1], 0xb 4985; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4986; SI-NEXT: s_mov_b32 s3, 0xf000 4987; SI-NEXT: s_mov_b32 s2, -1 4988; SI-NEXT: s_waitcnt lgkmcnt(0) 4989; SI-NEXT: s_and_b32 s4, s4, 1 4990; SI-NEXT: v_mov_b32_e32 v1, 0 4991; SI-NEXT: v_mov_b32_e32 v0, s4 4992; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4993; SI-NEXT: s_endpgm 4994; 4995; VI-LABEL: i1_arg_zext_i64: 4996; VI: ; %bb.0: 4997; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4998; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 4999; VI-NEXT: v_mov_b32_e32 v1, 0 5000; VI-NEXT: s_waitcnt lgkmcnt(0) 5001; VI-NEXT: v_mov_b32_e32 v2, s2 5002; VI-NEXT: s_and_b32 s0, s0, 1 5003; VI-NEXT: v_mov_b32_e32 v0, s0 5004; 
VI-NEXT: v_mov_b32_e32 v3, s3 5005; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 5006; VI-NEXT: s_endpgm 5007; 5008; GFX9-LABEL: i1_arg_zext_i64: 5009; GFX9: ; %bb.0: 5010; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5011; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 5012; GFX9-NEXT: v_mov_b32_e32 v1, 0 5013; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5014; GFX9-NEXT: s_and_b32 s2, s2, 1 5015; GFX9-NEXT: v_mov_b32_e32 v0, s2 5016; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] 5017; GFX9-NEXT: s_endpgm 5018; 5019; EG-LABEL: i1_arg_zext_i64: 5020; EG: ; %bb.0: 5021; EG-NEXT: ALU 0, @8, KC0[], KC1[] 5022; EG-NEXT: TEX 0 @6 5023; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 5024; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 5025; EG-NEXT: CF_END 5026; EG-NEXT: PAD 5027; EG-NEXT: Fetch clause starting at 6: 5028; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 5029; EG-NEXT: ALU clause starting at 8: 5030; EG-NEXT: MOV * T0.X, 0.0, 5031; EG-NEXT: ALU clause starting at 9: 5032; EG-NEXT: MOV * T0.Y, 0.0, 5033; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 5034; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 5035; 5036; CM-LABEL: i1_arg_zext_i64: 5037; CM: ; %bb.0: 5038; CM-NEXT: ALU 0, @8, KC0[], KC1[] 5039; CM-NEXT: TEX 0 @6 5040; CM-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 5041; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 5042; CM-NEXT: CF_END 5043; CM-NEXT: PAD 5044; CM-NEXT: Fetch clause starting at 6: 5045; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 5046; CM-NEXT: ALU clause starting at 8: 5047; CM-NEXT: MOV * T0.X, 0.0, 5048; CM-NEXT: ALU clause starting at 9: 5049; CM-NEXT: MOV * T0.Y, 0.0, 5050; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 5051; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 5052 %ext = zext i1 %x to i64 5053 store i64 %ext, i64 addrspace(1)* %out, align 8 5054 ret void 5055} 5056 5057define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { 5058; SI-LABEL: i1_arg_sext_i32: 5059; SI: ; %bb.0: 5060; SI-NEXT: s_load_dword s2, s[0:1], 0xb 5061; SI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 5062; SI-NEXT: s_mov_b32 s3, 0xf000 5063; SI-NEXT: s_waitcnt lgkmcnt(0) 5064; SI-NEXT: s_bfe_i32 s4, s2, 0x10000 5065; SI-NEXT: s_mov_b32 s2, -1 5066; SI-NEXT: v_mov_b32_e32 v0, s4 5067; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 5068; SI-NEXT: s_endpgm 5069; 5070; VI-LABEL: i1_arg_sext_i32: 5071; VI: ; %bb.0: 5072; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5073; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 5074; VI-NEXT: s_waitcnt lgkmcnt(0) 5075; VI-NEXT: v_mov_b32_e32 v0, s2 5076; VI-NEXT: s_bfe_i32 s0, s0, 0x10000 5077; VI-NEXT: v_mov_b32_e32 v1, s3 5078; VI-NEXT: v_mov_b32_e32 v2, s0 5079; VI-NEXT: flat_store_dword v[0:1], v2 5080; VI-NEXT: s_endpgm 5081; 5082; GFX9-LABEL: i1_arg_sext_i32: 5083; GFX9: ; %bb.0: 5084; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5085; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 5086; GFX9-NEXT: v_mov_b32_e32 v0, 0 5087; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5088; GFX9-NEXT: s_bfe_i32 s2, s2, 0x10000 5089; GFX9-NEXT: v_mov_b32_e32 v1, s2 5090; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 5091; GFX9-NEXT: s_endpgm 5092; 5093; EG-LABEL: i1_arg_sext_i32: 5094; EG: ; %bb.0: 5095; EG-NEXT: ALU 0, @8, KC0[], KC1[] 5096; EG-NEXT: TEX 0 @6 5097; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 5098; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 5099; EG-NEXT: CF_END 5100; EG-NEXT: PAD 5101; EG-NEXT: Fetch clause starting at 6: 5102; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 5103; EG-NEXT: ALU clause starting at 8: 5104; EG-NEXT: MOV * T0.X, 0.0, 5105; EG-NEXT: ALU clause starting at 9: 5106; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, 1, 5107; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 5108; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 5109; 5110; CM-LABEL: i1_arg_sext_i32: 5111; CM: ; %bb.0: 5112; CM-NEXT: ALU 0, @8, KC0[], KC1[] 5113; CM-NEXT: TEX 0 @6 5114; CM-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 5115; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 5116; CM-NEXT: CF_END 5117; CM-NEXT: PAD 5118; CM-NEXT: Fetch clause 
starting at 6: 5119; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 5120; CM-NEXT: ALU clause starting at 8: 5121; CM-NEXT: MOV * T0.X, 0.0, 5122; CM-NEXT: ALU clause starting at 9: 5123; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, 1, 5124; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 5125; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 5126 %ext = sext i1 %x to i32 5127 store i32 %ext, i32 addrspace(1)* %out, align 4 5128 ret void 5129} 5130 5131define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { 5132; SI-LABEL: i1_arg_sext_i64: 5133; SI: ; %bb.0: 5134; SI-NEXT: s_load_dword s2, s[0:1], 0xb 5135; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5136; SI-NEXT: s_mov_b32 s3, 0xf000 5137; SI-NEXT: s_waitcnt lgkmcnt(0) 5138; SI-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 5139; SI-NEXT: s_mov_b32 s2, -1 5140; SI-NEXT: v_mov_b32_e32 v0, s4 5141; SI-NEXT: v_mov_b32_e32 v1, s5 5142; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5143; SI-NEXT: s_endpgm 5144; 5145; VI-LABEL: i1_arg_sext_i64: 5146; VI: ; %bb.0: 5147; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5148; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 5149; VI-NEXT: s_waitcnt lgkmcnt(0) 5150; VI-NEXT: v_mov_b32_e32 v0, s2 5151; VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 5152; VI-NEXT: v_mov_b32_e32 v3, s1 5153; VI-NEXT: v_mov_b32_e32 v1, s3 5154; VI-NEXT: v_mov_b32_e32 v2, s0 5155; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 5156; VI-NEXT: s_endpgm 5157; 5158; GFX9-LABEL: i1_arg_sext_i64: 5159; GFX9: ; %bb.0: 5160; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5161; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 5162; GFX9-NEXT: v_mov_b32_e32 v2, 0 5163; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5164; GFX9-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 5165; GFX9-NEXT: v_mov_b32_e32 v0, s2 5166; GFX9-NEXT: v_mov_b32_e32 v1, s3 5167; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 5168; GFX9-NEXT: s_endpgm 5169; 5170; EG-LABEL: i1_arg_sext_i64: 5171; EG: ; %bb.0: 5172; EG-NEXT: ALU 0, @8, KC0[], KC1[] 5173; EG-NEXT: TEX 0 @6 5174; 
EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 5175; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 5176; EG-NEXT: CF_END 5177; EG-NEXT: PAD 5178; EG-NEXT: Fetch clause starting at 6: 5179; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 5180; EG-NEXT: ALU clause starting at 8: 5181; EG-NEXT: MOV * T0.X, 0.0, 5182; EG-NEXT: ALU clause starting at 9: 5183; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, 1, 5184; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 5185; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 5186; EG-NEXT: MOV * T0.Y, PV.X, 5187; 5188; CM-LABEL: i1_arg_sext_i64: 5189; CM: ; %bb.0: 5190; CM-NEXT: ALU 0, @8, KC0[], KC1[] 5191; CM-NEXT: TEX 0 @6 5192; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 5193; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 5194; CM-NEXT: CF_END 5195; CM-NEXT: PAD 5196; CM-NEXT: Fetch clause starting at 6: 5197; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 5198; CM-NEXT: ALU clause starting at 8: 5199; CM-NEXT: MOV * T0.X, 0.0, 5200; CM-NEXT: ALU clause starting at 9: 5201; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, 1, 5202; CM-NEXT: LSHR T1.X, KC0[2].Y, literal.x, 5203; CM-NEXT: MOV * T0.Y, PV.X, 5204; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 5205 %ext = sext i1 %x to i64 5206 store i64 %ext, i64 addrspace(1)* %out, align 8 5207 ret void 5208} 5209 5210define amdgpu_kernel void @empty_struct_arg({} %in) nounwind { 5211; SI-LABEL: empty_struct_arg: 5212; SI: ; %bb.0: 5213; SI-NEXT: s_endpgm 5214; 5215; VI-LABEL: empty_struct_arg: 5216; VI: ; %bb.0: 5217; VI-NEXT: s_endpgm 5218; 5219; GFX9-LABEL: empty_struct_arg: 5220; GFX9: ; %bb.0: 5221; GFX9-NEXT: s_endpgm 5222; 5223; EGCM-LABEL: empty_struct_arg: 5224; EGCM: ; %bb.0: 5225; EGCM-NEXT: CF_END 5226; EGCM-NEXT: PAD 5227 ret void 5228} 5229 5230; The correct load offsets for these: 5231; load 4 from 0 5232; load 8 from 8 5233; load 4 from 24 5234; load 8 from 32 5235 5236; With the SelectionDAG argument lowering, the alignments for the 5237; struct members are not properly considered, making these wrong. 
; Kernarg layout test for two naturally aligned {i32, i64} struct arguments
; separated by an unnamed i8. All four struct members are extracted and
; written with volatile stores to a null addrspace(1) pointer.
; In the gfx900 checks below the members are loaded from kernarg offsets 0x0,
; 0x8, 0x18 and 0x20, so %arg1 starts on an 8-byte boundary after the i8
; rather than immediately behind it.
; FIXME: Total argument size is computed wrong
define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
; SI-LABEL: struct_argument_alignment:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s8, s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dword s9, s[0:1], 0xf
; SI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x11
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: struct_argument_alignment:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: s_load_dword s5, s[0:1], 0x3c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: struct_argument_alignment:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX9-NEXT: s_load_dword s7, s[4:5], 0x18
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x20
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: struct_argument_alignment:
; EG: ; %bb.0:
; EG-NEXT: ALU 9, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.X, T6.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T6.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T6.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T4.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV T0.X, KC0[4].Y,
; EG-NEXT: MOV * T1.X, KC0[4].Z,
; EG-NEXT: MOV T2.X, KC0[3].W,
; EG-NEXT: MOV * T3.X, KC0[2].W,
; EG-NEXT: MOV T4.X, literal.x,
; EG-NEXT: MOV * T5.X, KC0[3].X,
; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)
; EG-NEXT: MOV T6.X, literal.x,
; EG-NEXT: MOV * T7.X, KC0[2].Y,
; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
;
; CM-LABEL: struct_argument_alignment:
; CM: ; %bb.0:
; CM-NEXT: ALU 9, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7.X, T6.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5.X, T4.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T3.X, T6.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T6.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T4.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T6.X
; CM-NEXT: CF_END
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, KC0[4].Y,
; CM-NEXT: MOV * T1.X, KC0[4].Z,
; CM-NEXT: MOV * T2.X, KC0[3].W,
; CM-NEXT: MOV * T3.X, KC0[2].W,
; CM-NEXT: MOV * T4.X, literal.x,
; CM-NEXT: 1(1.401298e-45), 0(0.000000e+00)
; CM-NEXT: MOV * T5.X, KC0[3].X,
; CM-NEXT: MOV * T6.X, literal.x,
; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
; CM-NEXT: MOV * T7.X, KC0[2].Y,
  ; Split both struct arguments into their i32 and i64 members.
  %val0 = extractvalue {i32, i64} %arg0, 0
  %val1 = extractvalue {i32, i64} %arg0, 1
  %val2 = extractvalue {i32, i64} %arg1, 0
  %val3 = extractvalue {i32, i64} %arg1, 1
  ; One volatile store to the null global pointer per extracted member.
  store volatile i32 %val0, i32 addrspace(1)* null
  store volatile i64 %val1, i64 addrspace(1)* null
  store volatile i32 %val2, i32 addrspace(1)* null
  store volatile i64 %val3, i64 addrspace(1)* null
  ret void
}

; No padding between i8 and next struct, but round up at end to 4 byte
; multiple.
5374define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { 5375; SI-LABEL: packed_struct_argument_alignment: 5376; SI: ; %bb.0: 5377; SI-NEXT: s_mov_b32 s3, 0xf000 5378; SI-NEXT: s_mov_b32 s2, -1 5379; SI-NEXT: s_load_dword s6, s[0:1], 0x9 5380; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xa 5381; SI-NEXT: buffer_load_ubyte v4, off, s[0:3], 0 offset:49 5382; SI-NEXT: buffer_load_ubyte v5, off, s[0:3], 0 offset:50 5383; SI-NEXT: buffer_load_ubyte v6, off, s[0:3], 0 offset:51 5384; SI-NEXT: buffer_load_ubyte v7, off, s[0:3], 0 offset:52 5385; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 offset:53 5386; SI-NEXT: s_mov_b32 s0, 0 5387; SI-NEXT: s_mov_b32 s1, s0 5388; SI-NEXT: s_waitcnt lgkmcnt(0) 5389; SI-NEXT: v_mov_b32_e32 v2, s6 5390; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 5391; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5392; SI-NEXT: v_mov_b32_e32 v2, s4 5393; SI-NEXT: v_mov_b32_e32 v3, s5 5394; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 5395; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5396; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 5397; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 5398; SI-NEXT: v_or_b32_e32 v2, v2, v4 5399; SI-NEXT: v_or_b32_e32 v3, v3, v6 5400; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 5401; SI-NEXT: v_or_b32_e32 v2, v3, v2 5402; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 5403; SI-NEXT: s_waitcnt vmcnt(0) 5404; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5405; SI-NEXT: s_waitcnt vmcnt(0) 5406; SI-NEXT: s_endpgm 5407; 5408; VI-LABEL: packed_struct_argument_alignment: 5409; VI: ; %bb.0: 5410; VI-NEXT: s_add_u32 s2, s0, 49 5411; VI-NEXT: s_addc_u32 s3, s1, 0 5412; VI-NEXT: s_add_u32 s4, s0, 50 5413; VI-NEXT: s_addc_u32 s5, s1, 0 5414; VI-NEXT: v_mov_b32_e32 v2, s2 5415; VI-NEXT: v_mov_b32_e32 v3, s3 5416; VI-NEXT: s_add_u32 s2, s2, 3 5417; VI-NEXT: s_addc_u32 s3, s3, 0 5418; VI-NEXT: v_mov_b32_e32 v5, s3 5419; VI-NEXT: v_mov_b32_e32 v4, s2 5420; VI-NEXT: s_add_u32 s2, s0, 51 5421; VI-NEXT: 
s_addc_u32 s3, s1, 0 5422; VI-NEXT: v_mov_b32_e32 v0, s4 5423; VI-NEXT: v_mov_b32_e32 v7, s3 5424; VI-NEXT: v_mov_b32_e32 v1, s5 5425; VI-NEXT: v_mov_b32_e32 v6, s2 5426; VI-NEXT: s_load_dword s4, s[0:1], 0x24 5427; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x28 5428; VI-NEXT: flat_load_ubyte v8, v[0:1] 5429; VI-NEXT: flat_load_ubyte v9, v[2:3] 5430; VI-NEXT: flat_load_ubyte v10, v[4:5] 5431; VI-NEXT: flat_load_ubyte v6, v[6:7] 5432; VI-NEXT: s_add_u32 s0, s0, 53 5433; VI-NEXT: s_addc_u32 s1, s1, 0 5434; VI-NEXT: v_mov_b32_e32 v0, s0 5435; VI-NEXT: v_mov_b32_e32 v1, s1 5436; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 5437; VI-NEXT: v_mov_b32_e32 v2, 0 5438; VI-NEXT: s_waitcnt lgkmcnt(0) 5439; VI-NEXT: v_mov_b32_e32 v5, s3 5440; VI-NEXT: v_mov_b32_e32 v3, 0 5441; VI-NEXT: v_mov_b32_e32 v7, s4 5442; VI-NEXT: v_mov_b32_e32 v4, s2 5443; VI-NEXT: flat_store_dword v[2:3], v7 5444; VI-NEXT: s_waitcnt vmcnt(0) 5445; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5] 5446; VI-NEXT: s_waitcnt vmcnt(0) 5447; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 5448; VI-NEXT: v_or_b32_e32 v4, v4, v9 5449; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10 5450; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 5451; VI-NEXT: v_or_b32_e32 v4, v5, v4 5452; VI-NEXT: flat_store_dword v[2:3], v4 5453; VI-NEXT: s_waitcnt vmcnt(0) 5454; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 5455; VI-NEXT: s_waitcnt vmcnt(0) 5456; VI-NEXT: s_endpgm 5457; 5458; GFX9-LABEL: packed_struct_argument_alignment: 5459; GFX9: ; %bb.0: 5460; GFX9-NEXT: v_mov_b32_e32 v2, 0 5461; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 5462; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4 5463; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:17 5464; GFX9-NEXT: global_load_dword v6, v2, s[4:5] offset:13 5465; GFX9-NEXT: v_mov_b32_e32 v2, 0 5466; GFX9-NEXT: v_mov_b32_e32 v3, 0 5467; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5468; GFX9-NEXT: v_mov_b32_e32 v7, s2 5469; GFX9-NEXT: v_mov_b32_e32 v5, s1 5470; GFX9-NEXT: 
v_mov_b32_e32 v4, s0 5471; GFX9-NEXT: global_store_dword v[2:3], v7, off 5472; GFX9-NEXT: s_waitcnt vmcnt(0) 5473; GFX9-NEXT: global_store_dwordx2 v[2:3], v[4:5], off 5474; GFX9-NEXT: s_waitcnt vmcnt(0) 5475; GFX9-NEXT: global_store_dword v[2:3], v6, off 5476; GFX9-NEXT: s_waitcnt vmcnt(0) 5477; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 5478; GFX9-NEXT: s_waitcnt vmcnt(0) 5479; GFX9-NEXT: s_endpgm 5480; 5481; EG-LABEL: packed_struct_argument_alignment: 5482; EG: ; %bb.0: 5483; EG-NEXT: ALU 6, @18, KC0[CB0:0-32], KC1[] 5484; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 0 5485; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T1.X, 0 5486; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0 5487; EG-NEXT: ALU 2, @25, KC0[], KC1[] 5488; EG-NEXT: TEX 0 @12 5489; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0 5490; EG-NEXT: TEX 0 @14 5491; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T1.X, 0 5492; EG-NEXT: TEX 0 @16 5493; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 1 5494; EG-NEXT: CF_END 5495; EG-NEXT: Fetch clause starting at 12: 5496; EG-NEXT: VTX_READ_32 T0.X, T0.X, 49, #3 5497; EG-NEXT: Fetch clause starting at 14: 5498; EG-NEXT: VTX_READ_32 T2.X, T2.X, 57, #3 5499; EG-NEXT: Fetch clause starting at 16: 5500; EG-NEXT: VTX_READ_32 T4.X, T4.X, 53, #3 5501; EG-NEXT: ALU clause starting at 18: 5502; EG-NEXT: MOV T0.X, KC0[2].Z, 5503; EG-NEXT: MOV * T1.X, literal.x, 5504; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) 5505; EG-NEXT: MOV T2.X, KC0[2].W, 5506; EG-NEXT: MOV * T3.X, literal.x, 5507; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5508; EG-NEXT: MOV * T4.X, KC0[2].Y, 5509; EG-NEXT: ALU clause starting at 25: 5510; EG-NEXT: MOV T0.X, 0.0, 5511; EG-NEXT: MOV * T2.X, 0.0, 5512; EG-NEXT: MOV * T4.X, 0.0, 5513; 5514; CM-LABEL: packed_struct_argument_alignment: 5515; CM: ; %bb.0: 5516; CM-NEXT: ALU 6, @18, KC0[CB0:0-32], KC1[] 5517; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X 5518; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X 5519; CM-NEXT: 
MEM_RAT_CACHELESS STORE_DWORD T0.X, T3.X 5520; CM-NEXT: ALU 2, @25, KC0[], KC1[] 5521; CM-NEXT: TEX 0 @12 5522; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T3.X 5523; CM-NEXT: TEX 0 @14 5524; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X 5525; CM-NEXT: TEX 0 @16 5526; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X 5527; CM-NEXT: CF_END 5528; CM-NEXT: Fetch clause starting at 12: 5529; CM-NEXT: VTX_READ_32 T0.X, T0.X, 49, #3 5530; CM-NEXT: Fetch clause starting at 14: 5531; CM-NEXT: VTX_READ_32 T2.X, T2.X, 57, #3 5532; CM-NEXT: Fetch clause starting at 16: 5533; CM-NEXT: VTX_READ_32 T4.X, T4.X, 53, #3 5534; CM-NEXT: ALU clause starting at 18: 5535; CM-NEXT: MOV * T0.X, KC0[2].Z, 5536; CM-NEXT: MOV * T1.X, literal.x, 5537; CM-NEXT: 1(1.401298e-45), 0(0.000000e+00) 5538; CM-NEXT: MOV * T2.X, KC0[2].W, 5539; CM-NEXT: MOV * T3.X, literal.x, 5540; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5541; CM-NEXT: MOV * T4.X, KC0[2].Y, 5542; CM-NEXT: ALU clause starting at 25: 5543; CM-NEXT: MOV * T0.X, 0.0, 5544; CM-NEXT: MOV * T2.X, 0.0, 5545; CM-NEXT: MOV * T4.X, 0.0, 5546 %val0 = extractvalue <{i32, i64}> %arg0, 0 5547 %val1 = extractvalue <{i32, i64}> %arg0, 1 5548 %val2 = extractvalue <{i32, i64}> %arg1, 0 5549 %val3 = extractvalue <{i32, i64}> %arg1, 1 5550 store volatile i32 %val0, i32 addrspace(1)* null 5551 store volatile i64 %val1, i64 addrspace(1)* null 5552 store volatile i32 %val2, i32 addrspace(1)* null 5553 store volatile i64 %val3, i64 addrspace(1)* null 5554 ret void 5555} 5556 5557define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) { 5558; SI-LABEL: struct_argument_alignment_after: 5559; SI: ; %bb.0: 5560; SI-NEXT: s_load_dword s12, s[0:1], 0x9 5561; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb 5562; SI-NEXT: s_load_dword s13, s[0:1], 0xf 5563; SI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x11 5564; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x15 5565; SI-NEXT: s_mov_b32 s4, 0 5566; SI-NEXT: 
s_mov_b32 s7, 0xf000 5567; SI-NEXT: s_mov_b32 s6, -1 5568; SI-NEXT: s_mov_b32 s5, s4 5569; SI-NEXT: s_waitcnt lgkmcnt(0) 5570; SI-NEXT: v_mov_b32_e32 v0, s12 5571; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 5572; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5573; SI-NEXT: v_mov_b32_e32 v0, s8 5574; SI-NEXT: v_mov_b32_e32 v1, s9 5575; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5576; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5577; SI-NEXT: v_mov_b32_e32 v0, s13 5578; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 5579; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5580; SI-NEXT: v_mov_b32_e32 v0, s10 5581; SI-NEXT: v_mov_b32_e32 v1, s11 5582; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5583; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5584; SI-NEXT: v_mov_b32_e32 v0, s0 5585; SI-NEXT: v_mov_b32_e32 v1, s1 5586; SI-NEXT: v_mov_b32_e32 v2, s2 5587; SI-NEXT: v_mov_b32_e32 v3, s3 5588; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5589; SI-NEXT: s_waitcnt vmcnt(0) 5590; SI-NEXT: s_endpgm 5591; 5592; VI-LABEL: struct_argument_alignment_after: 5593; VI: ; %bb.0: 5594; VI-NEXT: s_load_dword s8, s[0:1], 0x24 5595; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5596; VI-NEXT: s_load_dword s9, s[0:1], 0x3c 5597; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x44 5598; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 5599; VI-NEXT: v_mov_b32_e32 v4, 0 5600; VI-NEXT: v_mov_b32_e32 v5, 0 5601; VI-NEXT: s_waitcnt lgkmcnt(0) 5602; VI-NEXT: v_mov_b32_e32 v0, s8 5603; VI-NEXT: flat_store_dword v[4:5], v0 5604; VI-NEXT: s_waitcnt vmcnt(0) 5605; VI-NEXT: v_mov_b32_e32 v0, s4 5606; VI-NEXT: v_mov_b32_e32 v1, s5 5607; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] 5608; VI-NEXT: s_waitcnt vmcnt(0) 5609; VI-NEXT: v_mov_b32_e32 v0, s9 5610; VI-NEXT: flat_store_dword v[4:5], v0 5611; VI-NEXT: s_waitcnt vmcnt(0) 5612; VI-NEXT: v_mov_b32_e32 v0, s6 5613; VI-NEXT: v_mov_b32_e32 v1, s7 5614; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] 5615; VI-NEXT: s_waitcnt vmcnt(0) 5616; VI-NEXT: v_mov_b32_e32 v0, s0 
5617; VI-NEXT: v_mov_b32_e32 v1, s1 5618; VI-NEXT: v_mov_b32_e32 v2, s2 5619; VI-NEXT: v_mov_b32_e32 v3, s3 5620; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 5621; VI-NEXT: s_waitcnt vmcnt(0) 5622; VI-NEXT: s_endpgm 5623; 5624; GFX9-LABEL: struct_argument_alignment_after: 5625; GFX9: ; %bb.0: 5626; GFX9-NEXT: s_load_dword s10, s[4:5], 0x0 5627; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 5628; GFX9-NEXT: s_load_dword s11, s[4:5], 0x18 5629; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 5630; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x30 5631; GFX9-NEXT: v_mov_b32_e32 v4, 0 5632; GFX9-NEXT: v_mov_b32_e32 v5, 0 5633; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5634; GFX9-NEXT: v_mov_b32_e32 v0, s10 5635; GFX9-NEXT: global_store_dword v[4:5], v0, off 5636; GFX9-NEXT: s_waitcnt vmcnt(0) 5637; GFX9-NEXT: v_mov_b32_e32 v0, s6 5638; GFX9-NEXT: v_mov_b32_e32 v1, s7 5639; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off 5640; GFX9-NEXT: s_waitcnt vmcnt(0) 5641; GFX9-NEXT: v_mov_b32_e32 v0, s11 5642; GFX9-NEXT: global_store_dword v[4:5], v0, off 5643; GFX9-NEXT: s_waitcnt vmcnt(0) 5644; GFX9-NEXT: v_mov_b32_e32 v0, s8 5645; GFX9-NEXT: v_mov_b32_e32 v1, s9 5646; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off 5647; GFX9-NEXT: s_waitcnt vmcnt(0) 5648; GFX9-NEXT: v_mov_b32_e32 v0, s0 5649; GFX9-NEXT: v_mov_b32_e32 v1, s1 5650; GFX9-NEXT: v_mov_b32_e32 v2, s2 5651; GFX9-NEXT: v_mov_b32_e32 v3, s3 5652; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 5653; GFX9-NEXT: s_waitcnt vmcnt(0) 5654; GFX9-NEXT: s_endpgm 5655; 5656; EG-LABEL: struct_argument_alignment_after: 5657; EG: ; %bb.0: 5658; EG-NEXT: ALU 13, @10, KC0[CB0:0-32], KC1[] 5659; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.X, T7.X, 0 5660; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T5.X, 0 5661; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T7.X, 0 5662; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T7.X, 0 5663; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T5.X, 0 5664; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T7.X, 0 5665; 
EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T7.X, 1 5666; EG-NEXT: CF_END 5667; EG-NEXT: PAD 5668; EG-NEXT: ALU clause starting at 10: 5669; EG-NEXT: MOV * T0.W, KC0[6].X, 5670; EG-NEXT: MOV * T0.Z, KC0[5].W, 5671; EG-NEXT: MOV * T0.Y, KC0[5].Z, 5672; EG-NEXT: MOV T0.X, KC0[5].Y, 5673; EG-NEXT: MOV * T1.X, KC0[4].Y, 5674; EG-NEXT: MOV T2.X, KC0[4].Z, 5675; EG-NEXT: MOV * T3.X, KC0[3].W, 5676; EG-NEXT: MOV T4.X, KC0[2].W, 5677; EG-NEXT: MOV * T5.X, literal.x, 5678; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) 5679; EG-NEXT: MOV T6.X, KC0[3].X, 5680; EG-NEXT: MOV * T7.X, literal.x, 5681; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5682; EG-NEXT: MOV * T8.X, KC0[2].Y, 5683; 5684; CM-LABEL: struct_argument_alignment_after: 5685; CM: ; %bb.0: 5686; CM-NEXT: ALU 13, @10, KC0[CB0:0-32], KC1[] 5687; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T8.X, T7.X 5688; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T5.X 5689; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T7.X 5690; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T3.X, T7.X 5691; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T5.X 5692; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T7.X 5693; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T7.X 5694; CM-NEXT: CF_END 5695; CM-NEXT: PAD 5696; CM-NEXT: ALU clause starting at 10: 5697; CM-NEXT: MOV * T0.W, KC0[6].X, 5698; CM-NEXT: MOV * T0.Z, KC0[5].W, 5699; CM-NEXT: MOV * T0.Y, KC0[5].Z, 5700; CM-NEXT: MOV * T0.X, KC0[5].Y, 5701; CM-NEXT: MOV * T1.X, KC0[4].Y, 5702; CM-NEXT: MOV * T2.X, KC0[4].Z, 5703; CM-NEXT: MOV * T3.X, KC0[3].W, 5704; CM-NEXT: MOV * T4.X, KC0[2].W, 5705; CM-NEXT: MOV * T5.X, literal.x, 5706; CM-NEXT: 1(1.401298e-45), 0(0.000000e+00) 5707; CM-NEXT: MOV * T6.X, KC0[3].X, 5708; CM-NEXT: MOV * T7.X, literal.x, 5709; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5710; CM-NEXT: MOV * T8.X, KC0[2].Y, 5711 %val0 = extractvalue {i32, i64} %arg0, 0 5712 %val1 = extractvalue {i32, i64} %arg0, 1 5713 %val2 = extractvalue {i32, i64} %arg2, 0 5714 %val3 = extractvalue {i32, i64} %arg2, 1 5715 
store volatile i32 %val0, i32 addrspace(1)* null 5716 store volatile i64 %val1, i64 addrspace(1)* null 5717 store volatile i32 %val2, i32 addrspace(1)* null 5718 store volatile i64 %val3, i64 addrspace(1)* null 5719 store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null 5720 ret void 5721} 5722 5723define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { 5724; SI-LABEL: array_3xi32: 5725; SI: ; %bb.0: 5726; SI-NEXT: s_load_dword s4, s[0:1], 0xc 5727; SI-NEXT: s_load_dword s5, s[0:1], 0x9 5728; SI-NEXT: s_load_dword s6, s[0:1], 0xa 5729; SI-NEXT: s_load_dword s0, s[0:1], 0xb 5730; SI-NEXT: s_mov_b32 s3, 0xf000 5731; SI-NEXT: s_mov_b32 s2, -1 5732; SI-NEXT: s_waitcnt lgkmcnt(0) 5733; SI-NEXT: v_mov_b32_e32 v0, s5 5734; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 5735; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5736; SI-NEXT: v_mov_b32_e32 v0, s4 5737; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 5738; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5739; SI-NEXT: v_mov_b32_e32 v0, s0 5740; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 5741; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5742; SI-NEXT: v_mov_b32_e32 v0, s6 5743; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 5744; SI-NEXT: s_waitcnt vmcnt(0) 5745; SI-NEXT: s_endpgm 5746; 5747; VI-LABEL: array_3xi32: 5748; VI: ; %bb.0: 5749; VI-NEXT: s_load_dword s2, s[0:1], 0x24 5750; VI-NEXT: s_load_dword s3, s[0:1], 0x28 5751; VI-NEXT: s_load_dword s4, s[0:1], 0x2c 5752; VI-NEXT: s_load_dword s0, s[0:1], 0x30 5753; VI-NEXT: s_waitcnt lgkmcnt(0) 5754; VI-NEXT: v_mov_b32_e32 v0, s2 5755; VI-NEXT: v_mov_b32_e32 v1, s0 5756; VI-NEXT: flat_store_short v[0:1], v0 5757; VI-NEXT: s_waitcnt vmcnt(0) 5758; VI-NEXT: flat_store_dword v[0:1], v1 5759; VI-NEXT: s_waitcnt vmcnt(0) 5760; VI-NEXT: v_mov_b32_e32 v0, s4 5761; VI-NEXT: flat_store_dword v[0:1], v0 5762; VI-NEXT: s_waitcnt vmcnt(0) 5763; VI-NEXT: v_mov_b32_e32 v0, s3 5764; VI-NEXT: flat_store_dword v[0:1], v0 5765; VI-NEXT: s_waitcnt vmcnt(0) 5766; VI-NEXT: s_endpgm 
5767; 5768; GFX9-LABEL: array_3xi32: 5769; GFX9: ; %bb.0: 5770; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 5771; GFX9-NEXT: s_load_dword s1, s[4:5], 0x4 5772; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 5773; GFX9-NEXT: s_load_dword s3, s[4:5], 0xc 5774; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5775; GFX9-NEXT: v_mov_b32_e32 v0, s0 5776; GFX9-NEXT: v_mov_b32_e32 v1, s3 5777; GFX9-NEXT: global_store_short v[0:1], v0, off 5778; GFX9-NEXT: s_waitcnt vmcnt(0) 5779; GFX9-NEXT: global_store_dword v[0:1], v1, off 5780; GFX9-NEXT: s_waitcnt vmcnt(0) 5781; GFX9-NEXT: v_mov_b32_e32 v0, s2 5782; GFX9-NEXT: global_store_dword v[0:1], v0, off 5783; GFX9-NEXT: s_waitcnt vmcnt(0) 5784; GFX9-NEXT: v_mov_b32_e32 v0, s1 5785; GFX9-NEXT: global_store_dword v[0:1], v0, off 5786; GFX9-NEXT: s_waitcnt vmcnt(0) 5787; GFX9-NEXT: s_endpgm 5788; 5789; EG-LABEL: array_3xi32: 5790; EG: ; %bb.0: 5791; EG-NEXT: ALU 0, @10, KC0[], KC1[] 5792; EG-NEXT: TEX 0 @8 5793; EG-NEXT: ALU 9, @11, KC0[CB0:0-32], KC1[] 5794; EG-NEXT: MEM_RAT MSKOR T0.XW, T4.X 5795; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T4.X, 0 5796; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T4.X, 0 5797; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T4.X, 1 5798; EG-NEXT: CF_END 5799; EG-NEXT: Fetch clause starting at 8: 5800; EG-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3 5801; EG-NEXT: ALU clause starting at 10: 5802; EG-NEXT: MOV * T0.X, 0.0, 5803; EG-NEXT: ALU clause starting at 11: 5804; EG-NEXT: AND_INT T0.X, T0.X, literal.x, 5805; EG-NEXT: MOV * T0.W, literal.x, 5806; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 5807; EG-NEXT: MOV T0.Y, 0.0, 5808; EG-NEXT: MOV * T0.Z, 0.0, 5809; EG-NEXT: MOV T1.X, KC0[2].Z, 5810; EG-NEXT: MOV * T2.X, KC0[2].W, 5811; EG-NEXT: MOV T3.X, KC0[3].X, 5812; EG-NEXT: MOV * T4.X, literal.x, 5813; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5814; 5815; CM-LABEL: array_3xi32: 5816; CM: ; %bb.0: 5817; CM-NEXT: ALU 0, @10, KC0[], KC1[] 5818; CM-NEXT: TEX 0 @8 5819; CM-NEXT: ALU 9, @11, KC0[CB0:0-32], KC1[] 5820; CM-NEXT: MEM_RAT 
MSKOR T0.XW, T4.X 5821; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T3.X, T4.X 5822; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T4.X 5823; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T4.X 5824; CM-NEXT: CF_END 5825; CM-NEXT: Fetch clause starting at 8: 5826; CM-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3 5827; CM-NEXT: ALU clause starting at 10: 5828; CM-NEXT: MOV * T0.X, 0.0, 5829; CM-NEXT: ALU clause starting at 11: 5830; CM-NEXT: AND_INT T0.X, T0.X, literal.x, 5831; CM-NEXT: MOV * T0.W, literal.x, 5832; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 5833; CM-NEXT: MOV T0.Y, 0.0, 5834; CM-NEXT: MOV * T0.Z, 0.0, 5835; CM-NEXT: MOV * T1.X, KC0[2].Z, 5836; CM-NEXT: MOV * T2.X, KC0[2].W, 5837; CM-NEXT: MOV * T3.X, KC0[3].X, 5838; CM-NEXT: MOV * T4.X, literal.x, 5839; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5840 store volatile i16 %arg0, i16 addrspace(1)* undef 5841 store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef 5842 ret void 5843} 5844 5845; FIXME: Why not all scalar loads? 5846define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { 5847; SI-LABEL: array_3xi16: 5848; SI: ; %bb.0: 5849; SI-NEXT: s_load_dword s4, s[0:1], 0x9 5850; SI-NEXT: s_mov_b32 s3, 0xf000 5851; SI-NEXT: s_mov_b32 s2, -1 5852; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:42 5853; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:40 5854; SI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:38 5855; SI-NEXT: s_waitcnt lgkmcnt(0) 5856; SI-NEXT: v_mov_b32_e32 v3, s4 5857; SI-NEXT: buffer_store_byte v3, off, s[0:3], 0 5858; SI-NEXT: s_waitcnt vmcnt(0) 5859; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 5860; SI-NEXT: s_waitcnt vmcnt(0) 5861; SI-NEXT: buffer_store_short v1, off, s[0:3], 0 5862; SI-NEXT: s_waitcnt vmcnt(0) 5863; SI-NEXT: buffer_store_short v2, off, s[0:3], 0 5864; SI-NEXT: s_waitcnt vmcnt(0) 5865; SI-NEXT: s_endpgm 5866; 5867; VI-LABEL: array_3xi16: 5868; VI: ; %bb.0: 5869; VI-NEXT: s_add_u32 s2, s0, 38 5870; VI-NEXT: s_addc_u32 s3, s1, 0 5871; VI-NEXT: 
s_add_u32 s4, s2, 2 5872; VI-NEXT: s_addc_u32 s5, s3, 0 5873; VI-NEXT: v_mov_b32_e32 v2, s2 5874; VI-NEXT: v_mov_b32_e32 v3, s3 5875; VI-NEXT: s_add_u32 s2, s0, 42 5876; VI-NEXT: v_mov_b32_e32 v0, s4 5877; VI-NEXT: s_addc_u32 s3, s1, 0 5878; VI-NEXT: v_mov_b32_e32 v1, s5 5879; VI-NEXT: v_mov_b32_e32 v5, s3 5880; VI-NEXT: v_mov_b32_e32 v4, s2 5881; VI-NEXT: flat_load_ushort v0, v[0:1] 5882; VI-NEXT: flat_load_ushort v1, v[2:3] 5883; VI-NEXT: flat_load_ushort v2, v[4:5] 5884; VI-NEXT: s_load_dword s0, s[0:1], 0x24 5885; VI-NEXT: s_waitcnt lgkmcnt(0) 5886; VI-NEXT: v_mov_b32_e32 v3, s0 5887; VI-NEXT: s_waitcnt vmcnt(1) 5888; VI-NEXT: flat_store_byte v[0:1], v3 5889; VI-NEXT: s_waitcnt vmcnt(0) 5890; VI-NEXT: flat_store_short v[0:1], v2 5891; VI-NEXT: s_waitcnt vmcnt(0) 5892; VI-NEXT: flat_store_short v[0:1], v1 5893; VI-NEXT: s_waitcnt vmcnt(0) 5894; VI-NEXT: flat_store_short v[0:1], v0 5895; VI-NEXT: s_waitcnt vmcnt(0) 5896; VI-NEXT: s_endpgm 5897; 5898; GFX9-LABEL: array_3xi16: 5899; GFX9: ; %bb.0: 5900; GFX9-NEXT: v_mov_b32_e32 v0, 0 5901; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:2 5902; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] offset:4 5903; GFX9-NEXT: global_load_ushort v3, v0, s[4:5] offset:6 5904; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 5905; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5906; GFX9-NEXT: v_mov_b32_e32 v0, s0 5907; GFX9-NEXT: s_waitcnt vmcnt(2) 5908; GFX9-NEXT: global_store_byte v[0:1], v0, off 5909; GFX9-NEXT: s_waitcnt vmcnt(0) 5910; GFX9-NEXT: global_store_short v[0:1], v3, off 5911; GFX9-NEXT: s_waitcnt vmcnt(0) 5912; GFX9-NEXT: global_store_short v[0:1], v2, off 5913; GFX9-NEXT: s_waitcnt vmcnt(0) 5914; GFX9-NEXT: global_store_short v[0:1], v1, off 5915; GFX9-NEXT: s_waitcnt vmcnt(0) 5916; GFX9-NEXT: s_endpgm 5917; 5918; EG-LABEL: array_3xi16: 5919; EG: ; %bb.0: 5920; EG-NEXT: ALU 0, @20, KC0[], KC1[] 5921; EG-NEXT: TEX 1 @12 5922; EG-NEXT: ALU 11, @21, KC0[], KC1[] 5923; EG-NEXT: MEM_RAT MSKOR T1.XW, T3.X 5924; EG-NEXT: MEM_RAT MSKOR 
T2.XW, T3.X 5925; EG-NEXT: TEX 0 @16 5926; EG-NEXT: ALU 3, @33, KC0[], KC1[] 5927; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X 5928; EG-NEXT: TEX 0 @18 5929; EG-NEXT: ALU 3, @37, KC0[], KC1[] 5930; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X 5931; EG-NEXT: CF_END 5932; EG-NEXT: Fetch clause starting at 12: 5933; EG-NEXT: VTX_READ_8 T1.X, T0.X, 36, #3 5934; EG-NEXT: VTX_READ_16 T2.X, T0.X, 42, #3 5935; EG-NEXT: Fetch clause starting at 16: 5936; EG-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3 5937; EG-NEXT: Fetch clause starting at 18: 5938; EG-NEXT: VTX_READ_16 T0.X, T0.X, 38, #3 5939; EG-NEXT: ALU clause starting at 20: 5940; EG-NEXT: MOV * T0.X, 0.0, 5941; EG-NEXT: ALU clause starting at 21: 5942; EG-NEXT: AND_INT T1.X, T1.X, literal.x, 5943; EG-NEXT: MOV * T1.W, literal.x, 5944; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 5945; EG-NEXT: MOV * T1.Y, 0.0, 5946; EG-NEXT: AND_INT T2.X, T2.X, literal.x, 5947; EG-NEXT: MOV * T2.W, literal.x, 5948; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 5949; EG-NEXT: MOV T2.Y, 0.0, 5950; EG-NEXT: MOV T1.Z, 0.0, 5951; EG-NEXT: MOV * T2.Z, 0.0, 5952; EG-NEXT: MOV * T3.X, literal.x, 5953; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5954; EG-NEXT: ALU clause starting at 33: 5955; EG-NEXT: AND_INT T2.X, T1.X, literal.x, 5956; EG-NEXT: MOV T2.Y, 0.0, 5957; EG-NEXT: MOV * T2.Z, 0.0, 5958; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 5959; EG-NEXT: ALU clause starting at 37: 5960; EG-NEXT: AND_INT T2.X, T0.X, literal.x, 5961; EG-NEXT: MOV T2.Y, 0.0, 5962; EG-NEXT: MOV * T2.Z, 0.0, 5963; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 5964; 5965; CM-LABEL: array_3xi16: 5966; CM: ; %bb.0: 5967; CM-NEXT: ALU 0, @20, KC0[], KC1[] 5968; CM-NEXT: TEX 1 @12 5969; CM-NEXT: ALU 11, @21, KC0[], KC1[] 5970; CM-NEXT: MEM_RAT MSKOR T1.XW, T3.X 5971; CM-NEXT: MEM_RAT MSKOR T2.XW, T3.X 5972; CM-NEXT: TEX 0 @16 5973; CM-NEXT: ALU 3, @33, KC0[], KC1[] 5974; CM-NEXT: MEM_RAT MSKOR T2.XW, T3.X 5975; CM-NEXT: TEX 0 @18 5976; CM-NEXT: ALU 3, @37, KC0[], KC1[] 5977; CM-NEXT: MEM_RAT 
MSKOR T2.XW, T3.X 5978; CM-NEXT: CF_END 5979; CM-NEXT: Fetch clause starting at 12: 5980; CM-NEXT: VTX_READ_8 T1.X, T0.X, 36, #3 5981; CM-NEXT: VTX_READ_16 T2.X, T0.X, 42, #3 5982; CM-NEXT: Fetch clause starting at 16: 5983; CM-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3 5984; CM-NEXT: Fetch clause starting at 18: 5985; CM-NEXT: VTX_READ_16 T0.X, T0.X, 38, #3 5986; CM-NEXT: ALU clause starting at 20: 5987; CM-NEXT: MOV * T0.X, 0.0, 5988; CM-NEXT: ALU clause starting at 21: 5989; CM-NEXT: AND_INT T1.X, T1.X, literal.x, 5990; CM-NEXT: MOV * T1.W, literal.x, 5991; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 5992; CM-NEXT: MOV * T1.Y, 0.0, 5993; CM-NEXT: AND_INT T2.X, T2.X, literal.x, 5994; CM-NEXT: MOV * T2.W, literal.x, 5995; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 5996; CM-NEXT: MOV T2.Y, 0.0, 5997; CM-NEXT: MOV * T1.Z, 0.0, 5998; CM-NEXT: MOV * T2.Z, 0.0, 5999; CM-NEXT: MOV * T3.X, literal.x, 6000; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00) 6001; CM-NEXT: ALU clause starting at 33: 6002; CM-NEXT: AND_INT T2.X, T1.X, literal.x, 6003; CM-NEXT: MOV T2.Y, 0.0, 6004; CM-NEXT: MOV * T2.Z, 0.0, 6005; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 6006; CM-NEXT: ALU clause starting at 37: 6007; CM-NEXT: AND_INT T2.X, T0.X, literal.x, 6008; CM-NEXT: MOV T2.Y, 0.0, 6009; CM-NEXT: MOV * T2.Z, 0.0, 6010; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 6011 store volatile i8 %arg0, i8 addrspace(1)* undef 6012 store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef 6013 ret void 6014} 6015 6016define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { 6017; SI-LABEL: small_array_round_down_offset: 6018; SI: ; %bb.0: 6019; SI-NEXT: s_mov_b32 s3, 0xf000 6020; SI-NEXT: s_mov_b32 s2, -1 6021; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:37 6022; SI-NEXT: s_waitcnt vmcnt(0) 6023; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 6024; SI-NEXT: s_waitcnt vmcnt(0) 6025; SI-NEXT: s_endpgm 6026; 6027; VI-LABEL: small_array_round_down_offset: 6028; VI: ; %bb.0: 
6029; VI-NEXT: s_add_u32 s0, s0, 37 6030; VI-NEXT: s_addc_u32 s1, s1, 0 6031; VI-NEXT: v_mov_b32_e32 v0, s0 6032; VI-NEXT: v_mov_b32_e32 v1, s1 6033; VI-NEXT: flat_load_ubyte v0, v[0:1] 6034; VI-NEXT: s_waitcnt vmcnt(0) 6035; VI-NEXT: flat_store_byte v[0:1], v0 6036; VI-NEXT: s_waitcnt vmcnt(0) 6037; VI-NEXT: s_endpgm 6038; 6039; GFX9-LABEL: small_array_round_down_offset: 6040; GFX9: ; %bb.0: 6041; GFX9-NEXT: v_mov_b32_e32 v0, 0 6042; GFX9-NEXT: global_load_ubyte v0, v0, s[4:5] offset:1 6043; GFX9-NEXT: s_waitcnt vmcnt(0) 6044; GFX9-NEXT: global_store_byte v[0:1], v0, off 6045; GFX9-NEXT: s_waitcnt vmcnt(0) 6046; GFX9-NEXT: s_endpgm 6047; 6048; EGCM-LABEL: small_array_round_down_offset: 6049; EGCM: ; %bb.0: 6050; EGCM-NEXT: ALU 0, @8, KC0[], KC1[] 6051; EGCM-NEXT: TEX 0 @6 6052; EGCM-NEXT: ALU 6, @9, KC0[], KC1[] 6053; EGCM-NEXT: MEM_RAT MSKOR T0.XW, T1.X 6054; EGCM-NEXT: CF_END 6055; EGCM-NEXT: PAD 6056; EGCM-NEXT: Fetch clause starting at 6: 6057; EGCM-NEXT: VTX_READ_8 T0.X, T0.X, 37, #3 6058; EGCM-NEXT: ALU clause starting at 8: 6059; EGCM-NEXT: MOV * T0.X, 0.0, 6060; EGCM-NEXT: ALU clause starting at 9: 6061; EGCM-NEXT: AND_INT T0.X, T0.X, literal.x, 6062; EGCM-NEXT: MOV * T0.W, literal.x, 6063; EGCM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 6064; EGCM-NEXT: MOV T0.Y, 0.0, 6065; EGCM-NEXT: MOV * T0.Z, 0.0, 6066; EGCM-NEXT: MOV * T1.X, literal.x, 6067; EGCM-NEXT: 0(0.000000e+00), 0(0.000000e+00) 6068 %val = extractvalue [1 x i8] %arg, 0 6069 store volatile i8 %val, i8 addrspace(1)* undef 6070 ret void 6071} 6072 6073define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) { 6074; SI-LABEL: byref_align_constant_i32_arg: 6075; SI: ; %bb.0: 6076; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x49 6077; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6078; SI-NEXT: s_mov_b32 s3, 0xf000 6079; SI-NEXT: s_mov_b32 s2, -1 6080; SI-NEXT: s_waitcnt lgkmcnt(0) 6081; SI-NEXT: 
v_mov_b32_e32 v0, s4 6082; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 6083; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 6084; SI-NEXT: v_mov_b32_e32 v0, s5 6085; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 6086; SI-NEXT: s_waitcnt vmcnt(0) 6087; SI-NEXT: s_endpgm 6088; 6089; VI-LABEL: byref_align_constant_i32_arg: 6090; VI: ; %bb.0: 6091; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6092; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x124 6093; VI-NEXT: s_waitcnt lgkmcnt(0) 6094; VI-NEXT: v_mov_b32_e32 v0, s2 6095; VI-NEXT: v_mov_b32_e32 v1, s3 6096; VI-NEXT: v_mov_b32_e32 v2, s0 6097; VI-NEXT: v_mov_b32_e32 v3, s1 6098; VI-NEXT: flat_store_dword v[0:1], v2 6099; VI-NEXT: s_waitcnt vmcnt(0) 6100; VI-NEXT: flat_store_dword v[0:1], v3 6101; VI-NEXT: s_waitcnt vmcnt(0) 6102; VI-NEXT: s_endpgm 6103; 6104; GFX9-LABEL: byref_align_constant_i32_arg: 6105; GFX9: ; %bb.0: 6106; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6107; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x100 6108; GFX9-NEXT: v_mov_b32_e32 v0, 0 6109; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6110; GFX9-NEXT: v_mov_b32_e32 v1, s2 6111; GFX9-NEXT: v_mov_b32_e32 v2, s3 6112; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 6113; GFX9-NEXT: s_waitcnt vmcnt(0) 6114; GFX9-NEXT: global_store_dword v0, v2, s[0:1] 6115; GFX9-NEXT: s_waitcnt vmcnt(0) 6116; GFX9-NEXT: s_endpgm 6117; 6118; EG-LABEL: byref_align_constant_i32_arg: 6119; EG: ; %bb.0: 6120; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 6121; EG-NEXT: TEX 0 @6 6122; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 6123; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 0 6124; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 1 6125; EG-NEXT: CF_END 6126; EG-NEXT: Fetch clause starting at 6: 6127; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 6128; EG-NEXT: ALU clause starting at 8: 6129; EG-NEXT: MOV * T0.X, KC0[18].Y, 6130; EG-NEXT: ALU clause starting at 9: 6131; EG-NEXT: MOV T1.X, KC0[18].Z, 6132; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x, 6133; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 
;
; CM-LABEL: byref_align_constant_i32_arg:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T2.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, KC0[18].Y,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: MOV * T1.X, KC0[18].Z,
; CM-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %in = load i32, i32 addrspace(4)* %in.byref
  store volatile i32 %in, i32 addrspace(1)* %out, align 4
  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
  ret void
}

; Kernel with a byref(<16 x i32>) argument that carries no explicit align
; attribute, placed after a pointer and an unnamed i8 and followed by an i32.
; It loads the vector through %in.byref, stores it to %out, then stores
; %after.offset to the same address.
; In the GFX9 expectations below, relative to the base in s[4:5], %out is read
; from 0x0, the sixteen dwords from 0x40 and %after.offset from 0x80: the
; 64-byte vector sits at the first 64-byte multiple past the leading arguments
; and the trailing i32 follows it directly.
; NOTE(review): the name suggests the point is natural (size-derived) alignment
; of the byref pointee - confirm against the change that added this test.
define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace(1)* nocapture %out, i8, <16 x i32> addrspace(4)* byref(<16 x i32>) %in.byref, i32 %after.offset) {
; SI-LABEL: byref_natural_align_constant_v16i32_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19
; SI-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x9
; SI-NEXT: s_load_dword s0, s[0:1], 0x29
; SI-NEXT: s_mov_b32 s23, 0xf000
; SI-NEXT: s_mov_b32 s22, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s16
; SI-NEXT: v_mov_b32_e32 v1, s17
; SI-NEXT: v_mov_b32_e32 v2, s18
; SI-NEXT: v_mov_b32_e32 v3, s19
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:48
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:32
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: v_mov_b32_e32 v3, s11
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:16
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[20:23], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: byref_natural_align_constant_v16i32_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
; VI-NEXT: s_load_dword s18, s[0:1], 0xa4
; VI-NEXT: s_load_dwordx16 s[0:15], s[0:1], 0x64
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: s_add_u32 s12, s16, 48
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: s_addc_u32 s13, s17, 0
; VI-NEXT: v_mov_b32_e32 v4, s12
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v5, s13
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: s_add_u32 s8, s16, 32
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: s_addc_u32 s9, s17, 0
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_add_u32 s4, s16, 16
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_addc_u32 s5, s17, 0
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s18
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: byref_natural_align_constant_v16i32_arg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x80
; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s20
; GFX9-NEXT: v_mov_b32_e32 v1, s21
; GFX9-NEXT: v_mov_b32_e32 v2, s22
; GFX9-NEXT: v_mov_b32_e32 v3, s23
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s16
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: v_mov_b32_e32 v3, s19
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s12
; GFX9-NEXT: v_mov_b32_e32 v1, s13
; GFX9-NEXT: v_mov_b32_e32 v2, s14
; GFX9-NEXT: v_mov_b32_e32 v3, s15
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: v_mov_b32_e32 v2, s10
; GFX9-NEXT: v_mov_b32_e32 v3, s11
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v4, v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: byref_natural_align_constant_v16i32_arg:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @24, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @16
; EG-NEXT: ALU 3, @25, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
; EG-NEXT: ALU 3, @29, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @18
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
; EG-NEXT: ALU 3, @33, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @20
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
; EG-NEXT: ALU 2, @37, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @22
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 16:
; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1
; EG-NEXT: Fetch clause starting at 18:
; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 32, #1
; EG-NEXT: Fetch clause starting at 20:
; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 16, #1
; EG-NEXT: Fetch clause starting at 22:
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 24:
; EG-NEXT: MOV * T0.X, KC0[6].Y,
; EG-NEXT: ALU clause starting at 25:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 29:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 33:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 37:
; EG-NEXT: MOV T1.X, KC0[10].Y,
; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: byref_natural_align_constant_v16i32_arg:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @24, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @16
; CM-NEXT: ALU 3, @25, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
; CM-NEXT: ALU 3, @29, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @18
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T1.X
; CM-NEXT: ALU 3, @33, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @20
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T1.X
; CM-NEXT: ALU 2, @37, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @22
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 16:
; CM-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1
; CM-NEXT: Fetch clause starting at 18:
; CM-NEXT: VTX_READ_128 T2.XYZW, T0.X, 32, #1
; CM-NEXT: Fetch clause starting at 20:
; CM-NEXT: VTX_READ_128 T2.XYZW, T0.X, 16, #1
; CM-NEXT: Fetch clause starting at 22:
; CM-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 24:
; CM-NEXT: MOV * T0.X, KC0[6].Y,
; CM-NEXT: ALU clause starting at 25:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T2.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: ALU clause starting at 29:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: ALU clause starting at 33:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: ALU clause starting at 37:
; CM-NEXT: MOV * T1.X, KC0[10].Y,
; CM-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  ; Read the whole 16-element vector through the byref pointer.
  %in = load <16 x i32>, <16 x i32> addrspace(4)* %in.byref
  ; Reinterpret %out so the vector can be stored through it.
  %cast.out = bitcast i32 addrspace(1)* %out to <16 x i32> addrspace(1)*
  ; Both stores below target the address in %out.
  ; NOTE(review): presumably volatile so that neither store is dropped - confirm.
  store volatile <16 x i32> %in, <16 x i32> addrspace(1)* %cast.out, align 4
  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
  ret void
}