1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -check-prefixes=SI %s 3; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=VI %s 4; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GFX9 %s 5; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -check-prefixes=EGCM,EG %s 6; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -check-prefixes=EGCM,CM %s 7 8define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { 9; SI-LABEL: i8_arg: 10; SI: ; %bb.0: 11; SI-NEXT: s_load_dword s2, s[0:1], 0xb 12; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 13; SI-NEXT: s_mov_b32 s3, 0xf000 14; SI-NEXT: s_waitcnt lgkmcnt(0) 15; SI-NEXT: s_and_b32 s4, s2, 0xff 16; SI-NEXT: s_mov_b32 s2, -1 17; SI-NEXT: v_mov_b32_e32 v0, s4 18; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 19; SI-NEXT: s_endpgm 20; 21; VI-LABEL: i8_arg: 22; VI: ; %bb.0: 23; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 24; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 25; VI-NEXT: s_waitcnt lgkmcnt(0) 26; VI-NEXT: s_and_b32 s2, s2, 0xff 27; VI-NEXT: v_mov_b32_e32 v0, s0 28; VI-NEXT: v_mov_b32_e32 v1, s1 29; VI-NEXT: v_mov_b32_e32 v2, s2 30; VI-NEXT: flat_store_dword v[0:1], v2 31; VI-NEXT: s_endpgm 32; 33; GFX9-LABEL: i8_arg: 34; GFX9: ; %bb.0: 35; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 36; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 37; GFX9-NEXT: v_mov_b32_e32 v0, 0 38; GFX9-NEXT: s_waitcnt lgkmcnt(0) 39; GFX9-NEXT: s_and_b32 s2, s2, 0xff 40; GFX9-NEXT: v_mov_b32_e32 v1, s2 41; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 42; GFX9-NEXT: s_endpgm 43; 44; EG-LABEL: i8_arg: 45; EG: ; %bb.0: 46; EG-NEXT: ALU 0, @8, KC0[], KC1[] 47; EG-NEXT: TEX 0 @6 48; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[] 49; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 50; EG-NEXT: CF_END 51; 
EG-NEXT: PAD 52; EG-NEXT: Fetch clause starting at 6: 53; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 54; EG-NEXT: ALU clause starting at 8: 55; EG-NEXT: MOV * T0.X, 0.0, 56; EG-NEXT: ALU clause starting at 9: 57; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 58; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 59; 60; CM-LABEL: i8_arg: 61; CM: ; %bb.0: 62; CM-NEXT: ALU 0, @8, KC0[], KC1[] 63; CM-NEXT: TEX 0 @6 64; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[] 65; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 66; CM-NEXT: CF_END 67; CM-NEXT: PAD 68; CM-NEXT: Fetch clause starting at 6: 69; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 70; CM-NEXT: ALU clause starting at 8: 71; CM-NEXT: MOV * T0.X, 0.0, 72; CM-NEXT: ALU clause starting at 9: 73; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 74; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 75 %ext = zext i8 %in to i32 76 store i32 %ext, i32 addrspace(1)* %out, align 4 77 ret void 78} 79 80define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind { 81; SI-LABEL: i8_zext_arg: 82; SI: ; %bb.0: 83; SI-NEXT: s_load_dword s2, s[0:1], 0xb 84; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 85; SI-NEXT: s_mov_b32 s3, 0xf000 86; SI-NEXT: s_waitcnt lgkmcnt(0) 87; SI-NEXT: s_and_b32 s4, s2, 0xff 88; SI-NEXT: s_mov_b32 s2, -1 89; SI-NEXT: v_mov_b32_e32 v0, s4 90; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 91; SI-NEXT: s_endpgm 92; 93; VI-LABEL: i8_zext_arg: 94; VI: ; %bb.0: 95; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 96; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 97; VI-NEXT: s_waitcnt lgkmcnt(0) 98; VI-NEXT: s_and_b32 s2, s2, 0xff 99; VI-NEXT: v_mov_b32_e32 v0, s0 100; VI-NEXT: v_mov_b32_e32 v1, s1 101; VI-NEXT: v_mov_b32_e32 v2, s2 102; VI-NEXT: flat_store_dword v[0:1], v2 103; VI-NEXT: s_endpgm 104; 105; GFX9-LABEL: i8_zext_arg: 106; GFX9: ; %bb.0: 107; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 108; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 109; GFX9-NEXT: v_mov_b32_e32 v0, 0 110; GFX9-NEXT: s_waitcnt lgkmcnt(0) 
111; GFX9-NEXT: s_and_b32 s2, s2, 0xff 112; GFX9-NEXT: v_mov_b32_e32 v1, s2 113; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 114; GFX9-NEXT: s_endpgm 115; 116; EG-LABEL: i8_zext_arg: 117; EG: ; %bb.0: 118; EG-NEXT: ALU 0, @8, KC0[], KC1[] 119; EG-NEXT: TEX 0 @6 120; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 121; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 122; EG-NEXT: CF_END 123; EG-NEXT: PAD 124; EG-NEXT: Fetch clause starting at 6: 125; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 126; EG-NEXT: ALU clause starting at 8: 127; EG-NEXT: MOV * T0.X, 0.0, 128; EG-NEXT: ALU clause starting at 9: 129; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x, 130; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 131; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) 132; 133; CM-LABEL: i8_zext_arg: 134; CM: ; %bb.0: 135; CM-NEXT: ALU 0, @8, KC0[], KC1[] 136; CM-NEXT: TEX 0 @6 137; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 138; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 139; CM-NEXT: CF_END 140; CM-NEXT: PAD 141; CM-NEXT: Fetch clause starting at 6: 142; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 143; CM-NEXT: ALU clause starting at 8: 144; CM-NEXT: MOV * T0.X, 0.0, 145; CM-NEXT: ALU clause starting at 9: 146; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x, 147; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 148; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 149; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 150 %ext = zext i8 %in to i32 151 store i32 %ext, i32 addrspace(1)* %out, align 4 152 ret void 153} 154 155define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind { 156; SI-LABEL: i8_sext_arg: 157; SI: ; %bb.0: 158; SI-NEXT: s_load_dword s2, s[0:1], 0xb 159; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 160; SI-NEXT: s_mov_b32 s3, 0xf000 161; SI-NEXT: s_waitcnt lgkmcnt(0) 162; SI-NEXT: s_sext_i32_i8 s4, s2 163; SI-NEXT: s_mov_b32 s2, -1 164; SI-NEXT: v_mov_b32_e32 v0, s4 165; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 166; SI-NEXT: s_endpgm 167; 168; VI-LABEL: 
i8_sext_arg: 169; VI: ; %bb.0: 170; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 171; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 172; VI-NEXT: s_waitcnt lgkmcnt(0) 173; VI-NEXT: s_sext_i32_i8 s2, s2 174; VI-NEXT: v_mov_b32_e32 v0, s0 175; VI-NEXT: v_mov_b32_e32 v1, s1 176; VI-NEXT: v_mov_b32_e32 v2, s2 177; VI-NEXT: flat_store_dword v[0:1], v2 178; VI-NEXT: s_endpgm 179; 180; GFX9-LABEL: i8_sext_arg: 181; GFX9: ; %bb.0: 182; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 183; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 184; GFX9-NEXT: v_mov_b32_e32 v0, 0 185; GFX9-NEXT: s_waitcnt lgkmcnt(0) 186; GFX9-NEXT: s_sext_i32_i8 s2, s2 187; GFX9-NEXT: v_mov_b32_e32 v1, s2 188; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 189; GFX9-NEXT: s_endpgm 190; 191; EG-LABEL: i8_sext_arg: 192; EG: ; %bb.0: 193; EG-NEXT: ALU 0, @8, KC0[], KC1[] 194; EG-NEXT: TEX 0 @6 195; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 196; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 197; EG-NEXT: CF_END 198; EG-NEXT: PAD 199; EG-NEXT: Fetch clause starting at 6: 200; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 201; EG-NEXT: ALU clause starting at 8: 202; EG-NEXT: MOV * T0.X, 0.0, 203; EG-NEXT: ALU clause starting at 9: 204; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x, 205; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 206; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) 207; 208; CM-LABEL: i8_sext_arg: 209; CM: ; %bb.0: 210; CM-NEXT: ALU 0, @8, KC0[], KC1[] 211; CM-NEXT: TEX 0 @6 212; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 213; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 214; CM-NEXT: CF_END 215; CM-NEXT: PAD 216; CM-NEXT: Fetch clause starting at 6: 217; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 218; CM-NEXT: ALU clause starting at 8: 219; CM-NEXT: MOV * T0.X, 0.0, 220; CM-NEXT: ALU clause starting at 9: 221; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x, 222; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 223; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 224; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 225 %ext = sext i8 %in 
to i32 226 store i32 %ext, i32 addrspace(1)* %out, align 4 227 ret void 228} 229 230define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { 231; SI-LABEL: i16_arg: 232; SI: ; %bb.0: 233; SI-NEXT: s_load_dword s2, s[0:1], 0xb 234; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 235; SI-NEXT: s_mov_b32 s3, 0xf000 236; SI-NEXT: s_waitcnt lgkmcnt(0) 237; SI-NEXT: s_and_b32 s4, s2, 0xffff 238; SI-NEXT: s_mov_b32 s2, -1 239; SI-NEXT: v_mov_b32_e32 v0, s4 240; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 241; SI-NEXT: s_endpgm 242; 243; VI-LABEL: i16_arg: 244; VI: ; %bb.0: 245; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 246; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 247; VI-NEXT: s_waitcnt lgkmcnt(0) 248; VI-NEXT: s_and_b32 s2, s2, 0xffff 249; VI-NEXT: v_mov_b32_e32 v0, s0 250; VI-NEXT: v_mov_b32_e32 v1, s1 251; VI-NEXT: v_mov_b32_e32 v2, s2 252; VI-NEXT: flat_store_dword v[0:1], v2 253; VI-NEXT: s_endpgm 254; 255; GFX9-LABEL: i16_arg: 256; GFX9: ; %bb.0: 257; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 258; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 259; GFX9-NEXT: v_mov_b32_e32 v0, 0 260; GFX9-NEXT: s_waitcnt lgkmcnt(0) 261; GFX9-NEXT: s_and_b32 s2, s2, 0xffff 262; GFX9-NEXT: v_mov_b32_e32 v1, s2 263; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 264; GFX9-NEXT: s_endpgm 265; 266; EG-LABEL: i16_arg: 267; EG: ; %bb.0: 268; EG-NEXT: ALU 0, @8, KC0[], KC1[] 269; EG-NEXT: TEX 0 @6 270; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[] 271; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 272; EG-NEXT: CF_END 273; EG-NEXT: PAD 274; EG-NEXT: Fetch clause starting at 6: 275; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 276; EG-NEXT: ALU clause starting at 8: 277; EG-NEXT: MOV * T0.X, 0.0, 278; EG-NEXT: ALU clause starting at 9: 279; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 280; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 281; 282; CM-LABEL: i16_arg: 283; CM: ; %bb.0: 284; CM-NEXT: ALU 0, @8, KC0[], KC1[] 285; CM-NEXT: TEX 0 @6 286; CM-NEXT: ALU 1, @9, 
KC0[CB0:0-32], KC1[] 287; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 288; CM-NEXT: CF_END 289; CM-NEXT: PAD 290; CM-NEXT: Fetch clause starting at 6: 291; CM-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 292; CM-NEXT: ALU clause starting at 8: 293; CM-NEXT: MOV * T0.X, 0.0, 294; CM-NEXT: ALU clause starting at 9: 295; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 296; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 297 %ext = zext i16 %in to i32 298 store i32 %ext, i32 addrspace(1)* %out, align 4 299 ret void 300} 301 302define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind { 303; SI-LABEL: i16_zext_arg: 304; SI: ; %bb.0: 305; SI-NEXT: s_load_dword s2, s[0:1], 0xb 306; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 307; SI-NEXT: s_mov_b32 s3, 0xf000 308; SI-NEXT: s_waitcnt lgkmcnt(0) 309; SI-NEXT: s_and_b32 s4, s2, 0xffff 310; SI-NEXT: s_mov_b32 s2, -1 311; SI-NEXT: v_mov_b32_e32 v0, s4 312; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 313; SI-NEXT: s_endpgm 314; 315; VI-LABEL: i16_zext_arg: 316; VI: ; %bb.0: 317; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 318; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 319; VI-NEXT: s_waitcnt lgkmcnt(0) 320; VI-NEXT: s_and_b32 s2, s2, 0xffff 321; VI-NEXT: v_mov_b32_e32 v0, s0 322; VI-NEXT: v_mov_b32_e32 v1, s1 323; VI-NEXT: v_mov_b32_e32 v2, s2 324; VI-NEXT: flat_store_dword v[0:1], v2 325; VI-NEXT: s_endpgm 326; 327; GFX9-LABEL: i16_zext_arg: 328; GFX9: ; %bb.0: 329; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 330; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 331; GFX9-NEXT: v_mov_b32_e32 v0, 0 332; GFX9-NEXT: s_waitcnt lgkmcnt(0) 333; GFX9-NEXT: s_and_b32 s2, s2, 0xffff 334; GFX9-NEXT: v_mov_b32_e32 v1, s2 335; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 336; GFX9-NEXT: s_endpgm 337; 338; EG-LABEL: i16_zext_arg: 339; EG: ; %bb.0: 340; EG-NEXT: ALU 0, @8, KC0[], KC1[] 341; EG-NEXT: TEX 0 @6 342; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 343; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 344; 
EG-NEXT: CF_END 345; EG-NEXT: PAD 346; EG-NEXT: Fetch clause starting at 6: 347; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 348; EG-NEXT: ALU clause starting at 8: 349; EG-NEXT: MOV * T0.X, 0.0, 350; EG-NEXT: ALU clause starting at 9: 351; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x, 352; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 353; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) 354; 355; CM-LABEL: i16_zext_arg: 356; CM: ; %bb.0: 357; CM-NEXT: ALU 0, @8, KC0[], KC1[] 358; CM-NEXT: TEX 0 @6 359; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 360; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 361; CM-NEXT: CF_END 362; CM-NEXT: PAD 363; CM-NEXT: Fetch clause starting at 6: 364; CM-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 365; CM-NEXT: ALU clause starting at 8: 366; CM-NEXT: MOV * T0.X, 0.0, 367; CM-NEXT: ALU clause starting at 9: 368; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x, 369; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 370; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 371; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 372 %ext = zext i16 %in to i32 373 store i32 %ext, i32 addrspace(1)* %out, align 4 374 ret void 375} 376 377define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind { 378; SI-LABEL: i16_sext_arg: 379; SI: ; %bb.0: 380; SI-NEXT: s_load_dword s2, s[0:1], 0xb 381; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 382; SI-NEXT: s_mov_b32 s3, 0xf000 383; SI-NEXT: s_waitcnt lgkmcnt(0) 384; SI-NEXT: s_sext_i32_i16 s4, s2 385; SI-NEXT: s_mov_b32 s2, -1 386; SI-NEXT: v_mov_b32_e32 v0, s4 387; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 388; SI-NEXT: s_endpgm 389; 390; VI-LABEL: i16_sext_arg: 391; VI: ; %bb.0: 392; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 393; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 394; VI-NEXT: s_waitcnt lgkmcnt(0) 395; VI-NEXT: s_sext_i32_i16 s2, s2 396; VI-NEXT: v_mov_b32_e32 v0, s0 397; VI-NEXT: v_mov_b32_e32 v1, s1 398; VI-NEXT: v_mov_b32_e32 v2, s2 399; VI-NEXT: flat_store_dword v[0:1], v2 400; VI-NEXT: 
s_endpgm 401; 402; GFX9-LABEL: i16_sext_arg: 403; GFX9: ; %bb.0: 404; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 405; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 406; GFX9-NEXT: v_mov_b32_e32 v0, 0 407; GFX9-NEXT: s_waitcnt lgkmcnt(0) 408; GFX9-NEXT: s_sext_i32_i16 s2, s2 409; GFX9-NEXT: v_mov_b32_e32 v1, s2 410; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 411; GFX9-NEXT: s_endpgm 412; 413; EG-LABEL: i16_sext_arg: 414; EG: ; %bb.0: 415; EG-NEXT: ALU 0, @8, KC0[], KC1[] 416; EG-NEXT: TEX 0 @6 417; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 418; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 419; EG-NEXT: CF_END 420; EG-NEXT: PAD 421; EG-NEXT: Fetch clause starting at 6: 422; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 423; EG-NEXT: ALU clause starting at 8: 424; EG-NEXT: MOV * T0.X, 0.0, 425; EG-NEXT: ALU clause starting at 9: 426; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x, 427; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 428; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) 429; 430; CM-LABEL: i16_sext_arg: 431; CM: ; %bb.0: 432; CM-NEXT: ALU 0, @8, KC0[], KC1[] 433; CM-NEXT: TEX 0 @6 434; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 435; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 436; CM-NEXT: CF_END 437; CM-NEXT: PAD 438; CM-NEXT: Fetch clause starting at 6: 439; CM-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 440; CM-NEXT: ALU clause starting at 8: 441; CM-NEXT: MOV * T0.X, 0.0, 442; CM-NEXT: ALU clause starting at 9: 443; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x, 444; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 445; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 446; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 447 %ext = sext i16 %in to i32 448 store i32 %ext, i32 addrspace(1)* %out, align 4 449 ret void 450} 451 452define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind { 453; SI-LABEL: i32_arg: 454; SI: ; %bb.0: ; %entry 455; SI-NEXT: s_load_dword s4, s[0:1], 0xb 456; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 457; SI-NEXT: s_mov_b32 s3, 
0xf000 458; SI-NEXT: s_mov_b32 s2, -1 459; SI-NEXT: s_waitcnt lgkmcnt(0) 460; SI-NEXT: v_mov_b32_e32 v0, s4 461; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 462; SI-NEXT: s_endpgm 463; 464; VI-LABEL: i32_arg: 465; VI: ; %bb.0: ; %entry 466; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 467; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 468; VI-NEXT: s_waitcnt lgkmcnt(0) 469; VI-NEXT: v_mov_b32_e32 v0, s2 470; VI-NEXT: v_mov_b32_e32 v1, s3 471; VI-NEXT: v_mov_b32_e32 v2, s0 472; VI-NEXT: flat_store_dword v[0:1], v2 473; VI-NEXT: s_endpgm 474; 475; GFX9-LABEL: i32_arg: 476; GFX9: ; %bb.0: ; %entry 477; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 478; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 479; GFX9-NEXT: v_mov_b32_e32 v0, 0 480; GFX9-NEXT: s_waitcnt lgkmcnt(0) 481; GFX9-NEXT: v_mov_b32_e32 v1, s2 482; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 483; GFX9-NEXT: s_endpgm 484; 485; EG-LABEL: i32_arg: 486; EG: ; %bb.0: ; %entry 487; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 488; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 489; EG-NEXT: CF_END 490; EG-NEXT: PAD 491; EG-NEXT: ALU clause starting at 4: 492; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, 493; EG-NEXT: MOV * T1.X, KC0[2].Z, 494; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 495; 496; CM-LABEL: i32_arg: 497; CM: ; %bb.0: ; %entry 498; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 499; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X 500; CM-NEXT: CF_END 501; CM-NEXT: PAD 502; CM-NEXT: ALU clause starting at 4: 503; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 504; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 505; CM-NEXT: MOV * T1.X, KC0[2].Z, 506entry: 507 store i32 %in, i32 addrspace(1)* %out, align 4 508 ret void 509} 510 511define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind { 512; SI-LABEL: f32_arg: 513; SI: ; %bb.0: ; %entry 514; SI-NEXT: s_load_dword s4, s[0:1], 0xb 515; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 516; SI-NEXT: s_mov_b32 s3, 0xf000 517; SI-NEXT: s_mov_b32 s2, -1 
518; SI-NEXT: s_waitcnt lgkmcnt(0) 519; SI-NEXT: v_mov_b32_e32 v0, s4 520; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 521; SI-NEXT: s_endpgm 522; 523; VI-LABEL: f32_arg: 524; VI: ; %bb.0: ; %entry 525; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 526; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 527; VI-NEXT: s_waitcnt lgkmcnt(0) 528; VI-NEXT: v_mov_b32_e32 v0, s2 529; VI-NEXT: v_mov_b32_e32 v1, s3 530; VI-NEXT: v_mov_b32_e32 v2, s0 531; VI-NEXT: flat_store_dword v[0:1], v2 532; VI-NEXT: s_endpgm 533; 534; GFX9-LABEL: f32_arg: 535; GFX9: ; %bb.0: ; %entry 536; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 537; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 538; GFX9-NEXT: v_mov_b32_e32 v0, 0 539; GFX9-NEXT: s_waitcnt lgkmcnt(0) 540; GFX9-NEXT: v_mov_b32_e32 v1, s2 541; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 542; GFX9-NEXT: s_endpgm 543; 544; EG-LABEL: f32_arg: 545; EG: ; %bb.0: ; %entry 546; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 547; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 548; EG-NEXT: CF_END 549; EG-NEXT: PAD 550; EG-NEXT: ALU clause starting at 4: 551; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, 552; EG-NEXT: MOV * T1.X, KC0[2].Z, 553; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 554; 555; CM-LABEL: f32_arg: 556; CM: ; %bb.0: ; %entry 557; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 558; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X 559; CM-NEXT: CF_END 560; CM-NEXT: PAD 561; CM-NEXT: ALU clause starting at 4: 562; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 563; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 564; CM-NEXT: MOV * T1.X, KC0[2].Z, 565entry: 566 store float %in, float addrspace(1)* %out, align 4 567 ret void 568} 569 570define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) { 571; SI-LABEL: v2i8_arg: 572; SI: ; %bb.0: ; %entry 573; SI-NEXT: s_load_dword s4, s[0:1], 0xb 574; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 575; SI-NEXT: s_mov_b32 s3, 0xf000 576; SI-NEXT: s_mov_b32 s2, -1 577; SI-NEXT: s_waitcnt lgkmcnt(0) 578; 
SI-NEXT: v_mov_b32_e32 v0, s4 579; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 580; SI-NEXT: s_endpgm 581; 582; VI-LABEL: v2i8_arg: 583; VI: ; %bb.0: ; %entry 584; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 585; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 586; VI-NEXT: s_waitcnt lgkmcnt(0) 587; VI-NEXT: v_mov_b32_e32 v0, s2 588; VI-NEXT: v_mov_b32_e32 v1, s3 589; VI-NEXT: v_mov_b32_e32 v2, s0 590; VI-NEXT: flat_store_short v[0:1], v2 591; VI-NEXT: s_endpgm 592; 593; GFX9-LABEL: v2i8_arg: 594; GFX9: ; %bb.0: ; %entry 595; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 596; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 597; GFX9-NEXT: v_mov_b32_e32 v0, 0 598; GFX9-NEXT: s_waitcnt lgkmcnt(0) 599; GFX9-NEXT: v_mov_b32_e32 v1, s2 600; GFX9-NEXT: global_store_short v0, v1, s[0:1] 601; GFX9-NEXT: s_endpgm 602; 603; EG-LABEL: v2i8_arg: 604; EG: ; %bb.0: ; %entry 605; EG-NEXT: ALU 0, @10, KC0[], KC1[] 606; EG-NEXT: TEX 1 @6 607; EG-NEXT: ALU 15, @11, KC0[CB0:0-32], KC1[] 608; EG-NEXT: MEM_RAT MSKOR T4.XW, T5.X 609; EG-NEXT: CF_END 610; EG-NEXT: PAD 611; EG-NEXT: Fetch clause starting at 6: 612; EG-NEXT: VTX_READ_8 T5.X, T4.X, 41, #3 613; EG-NEXT: VTX_READ_8 T4.X, T4.X, 40, #3 614; EG-NEXT: ALU clause starting at 10: 615; EG-NEXT: MOV * T4.X, 0.0, 616; EG-NEXT: ALU clause starting at 11: 617; EG-NEXT: LSHL T0.W, T5.X, literal.x, 618; EG-NEXT: AND_INT * T1.W, T4.X, literal.y, 619; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43) 620; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x, 621; EG-NEXT: OR_INT * T0.W, PV.W, PS, 622; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 623; EG-NEXT: AND_INT T0.W, PS, literal.x, 624; EG-NEXT: LSHL * T1.W, PV.W, literal.y, 625; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 626; EG-NEXT: LSHL T4.X, PV.W, PS, 627; EG-NEXT: LSHL * T4.W, literal.x, PS, 628; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 629; EG-NEXT: MOV T4.Y, 0.0, 630; EG-NEXT: MOV * T4.Z, 0.0, 631; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 632; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 633; 634; 
CM-LABEL: v2i8_arg: 635; CM: ; %bb.0: ; %entry 636; CM-NEXT: ALU 0, @10, KC0[], KC1[] 637; CM-NEXT: TEX 1 @6 638; CM-NEXT: ALU 15, @11, KC0[CB0:0-32], KC1[] 639; CM-NEXT: MEM_RAT MSKOR T4.XW, T5.X 640; CM-NEXT: CF_END 641; CM-NEXT: PAD 642; CM-NEXT: Fetch clause starting at 6: 643; CM-NEXT: VTX_READ_8 T5.X, T4.X, 41, #3 644; CM-NEXT: VTX_READ_8 T4.X, T4.X, 40, #3 645; CM-NEXT: ALU clause starting at 10: 646; CM-NEXT: MOV * T4.X, 0.0, 647; CM-NEXT: ALU clause starting at 11: 648; CM-NEXT: LSHL T0.Z, T5.X, literal.x, 649; CM-NEXT: AND_INT * T0.W, T4.X, literal.y, BS:VEC_120/SCL_212 650; CM-NEXT: 8(1.121039e-44), 255(3.573311e-43) 651; CM-NEXT: AND_INT T1.Z, KC0[2].Y, literal.x, 652; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 653; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 654; CM-NEXT: AND_INT T0.Z, PV.W, literal.x, 655; CM-NEXT: LSHL * T0.W, PV.Z, literal.y, 656; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 657; CM-NEXT: LSHL T4.X, PV.Z, PV.W, 658; CM-NEXT: LSHL * T4.W, literal.x, PV.W, 659; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 660; CM-NEXT: MOV T4.Y, 0.0, 661; CM-NEXT: MOV * T4.Z, 0.0, 662; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 663; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 664entry: 665 store <2 x i8> %in, <2 x i8> addrspace(1)* %out 666 ret void 667} 668 669define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) { 670; SI-LABEL: v2i16_arg: 671; SI: ; %bb.0: ; %entry 672; SI-NEXT: s_load_dword s4, s[0:1], 0xb 673; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 674; SI-NEXT: s_mov_b32 s3, 0xf000 675; SI-NEXT: s_mov_b32 s2, -1 676; SI-NEXT: s_waitcnt lgkmcnt(0) 677; SI-NEXT: v_mov_b32_e32 v0, s4 678; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 679; SI-NEXT: s_endpgm 680; 681; VI-LABEL: v2i16_arg: 682; VI: ; %bb.0: ; %entry 683; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 684; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 685; VI-NEXT: s_waitcnt lgkmcnt(0) 686; VI-NEXT: v_mov_b32_e32 v0, s2 687; VI-NEXT: v_mov_b32_e32 v1, s3 688; VI-NEXT: 
v_mov_b32_e32 v2, s0 689; VI-NEXT: flat_store_dword v[0:1], v2 690; VI-NEXT: s_endpgm 691; 692; GFX9-LABEL: v2i16_arg: 693; GFX9: ; %bb.0: ; %entry 694; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 695; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 696; GFX9-NEXT: v_mov_b32_e32 v0, 0 697; GFX9-NEXT: s_waitcnt lgkmcnt(0) 698; GFX9-NEXT: v_mov_b32_e32 v1, s2 699; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 700; GFX9-NEXT: s_endpgm 701; 702; EG-LABEL: v2i16_arg: 703; EG: ; %bb.0: ; %entry 704; EG-NEXT: ALU 0, @10, KC0[], KC1[] 705; EG-NEXT: TEX 1 @6 706; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] 707; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1 708; EG-NEXT: CF_END 709; EG-NEXT: PAD 710; EG-NEXT: Fetch clause starting at 6: 711; EG-NEXT: VTX_READ_16 T5.X, T4.X, 42, #3 712; EG-NEXT: VTX_READ_16 T4.X, T4.X, 40, #3 713; EG-NEXT: ALU clause starting at 10: 714; EG-NEXT: MOV * T4.X, 0.0, 715; EG-NEXT: ALU clause starting at 11: 716; EG-NEXT: LSHL T0.W, T5.X, literal.x, 717; EG-NEXT: AND_INT * T1.W, T4.X, literal.y, 718; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 719; EG-NEXT: OR_INT T4.X, PV.W, PS, 720; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 721; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 722; 723; CM-LABEL: v2i16_arg: 724; CM: ; %bb.0: ; %entry 725; CM-NEXT: ALU 0, @10, KC0[], KC1[] 726; CM-NEXT: TEX 1 @6 727; CM-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] 728; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T5.X 729; CM-NEXT: CF_END 730; CM-NEXT: PAD 731; CM-NEXT: Fetch clause starting at 6: 732; CM-NEXT: VTX_READ_16 T5.X, T4.X, 42, #3 733; CM-NEXT: VTX_READ_16 T4.X, T4.X, 40, #3 734; CM-NEXT: ALU clause starting at 10: 735; CM-NEXT: MOV * T4.X, 0.0, 736; CM-NEXT: ALU clause starting at 11: 737; CM-NEXT: LSHL T0.Z, T5.X, literal.x, 738; CM-NEXT: AND_INT * T0.W, T4.X, literal.y, BS:VEC_120/SCL_212 739; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 740; CM-NEXT: OR_INT * T4.X, PV.Z, PV.W, 741; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 742; CM-NEXT: 2(2.802597e-45), 
0(0.000000e+00) 743entry: 744 store <2 x i16> %in, <2 x i16> addrspace(1)* %out 745 ret void 746} 747 748define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind { 749; SI-LABEL: v2i32_arg: 750; SI: ; %bb.0: ; %entry 751; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 752; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 753; SI-NEXT: s_mov_b32 s3, 0xf000 754; SI-NEXT: s_mov_b32 s2, -1 755; SI-NEXT: s_waitcnt lgkmcnt(0) 756; SI-NEXT: v_mov_b32_e32 v0, s4 757; SI-NEXT: v_mov_b32_e32 v1, s5 758; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 759; SI-NEXT: s_endpgm 760; 761; VI-LABEL: v2i32_arg: 762; VI: ; %bb.0: ; %entry 763; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 764; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 765; VI-NEXT: s_waitcnt lgkmcnt(0) 766; VI-NEXT: v_mov_b32_e32 v0, s2 767; VI-NEXT: v_mov_b32_e32 v3, s1 768; VI-NEXT: v_mov_b32_e32 v1, s3 769; VI-NEXT: v_mov_b32_e32 v2, s0 770; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 771; VI-NEXT: s_endpgm 772; 773; GFX9-LABEL: v2i32_arg: 774; GFX9: ; %bb.0: ; %entry 775; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 776; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 777; GFX9-NEXT: v_mov_b32_e32 v2, 0 778; GFX9-NEXT: s_waitcnt lgkmcnt(0) 779; GFX9-NEXT: v_mov_b32_e32 v0, s0 780; GFX9-NEXT: v_mov_b32_e32 v1, s1 781; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 782; GFX9-NEXT: s_endpgm 783; 784; EG-LABEL: v2i32_arg: 785; EG: ; %bb.0: ; %entry 786; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 787; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 788; EG-NEXT: CF_END 789; EG-NEXT: PAD 790; EG-NEXT: ALU clause starting at 4: 791; EG-NEXT: MOV * T0.Y, KC0[3].X, 792; EG-NEXT: MOV T0.X, KC0[2].W, 793; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 794; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 795; 796; CM-LABEL: v2i32_arg: 797; CM: ; %bb.0: ; %entry 798; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 799; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 800; CM-NEXT: CF_END 801; 
CM-NEXT: PAD 802; CM-NEXT: ALU clause starting at 4: 803; CM-NEXT: MOV * T0.Y, KC0[3].X, 804; CM-NEXT: MOV * T0.X, KC0[2].W, 805; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 806; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 807entry: 808 store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4 809 ret void 810} 811 812define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind { 813; SI-LABEL: v2f32_arg: 814; SI: ; %bb.0: ; %entry 815; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 816; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 817; SI-NEXT: s_mov_b32 s3, 0xf000 818; SI-NEXT: s_mov_b32 s2, -1 819; SI-NEXT: s_waitcnt lgkmcnt(0) 820; SI-NEXT: v_mov_b32_e32 v0, s4 821; SI-NEXT: v_mov_b32_e32 v1, s5 822; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 823; SI-NEXT: s_endpgm 824; 825; VI-LABEL: v2f32_arg: 826; VI: ; %bb.0: ; %entry 827; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 828; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 829; VI-NEXT: s_waitcnt lgkmcnt(0) 830; VI-NEXT: v_mov_b32_e32 v0, s2 831; VI-NEXT: v_mov_b32_e32 v3, s1 832; VI-NEXT: v_mov_b32_e32 v1, s3 833; VI-NEXT: v_mov_b32_e32 v2, s0 834; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 835; VI-NEXT: s_endpgm 836; 837; GFX9-LABEL: v2f32_arg: 838; GFX9: ; %bb.0: ; %entry 839; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 840; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 841; GFX9-NEXT: v_mov_b32_e32 v2, 0 842; GFX9-NEXT: s_waitcnt lgkmcnt(0) 843; GFX9-NEXT: v_mov_b32_e32 v0, s0 844; GFX9-NEXT: v_mov_b32_e32 v1, s1 845; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 846; GFX9-NEXT: s_endpgm 847; 848; EG-LABEL: v2f32_arg: 849; EG: ; %bb.0: ; %entry 850; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 851; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 852; EG-NEXT: CF_END 853; EG-NEXT: PAD 854; EG-NEXT: ALU clause starting at 4: 855; EG-NEXT: MOV * T0.Y, KC0[3].X, 856; EG-NEXT: MOV T0.X, KC0[2].W, 857; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 858; EG-NEXT: 
2(2.802597e-45), 0(0.000000e+00) 859; 860; CM-LABEL: v2f32_arg: 861; CM: ; %bb.0: ; %entry 862; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 863; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 864; CM-NEXT: CF_END 865; CM-NEXT: PAD 866; CM-NEXT: ALU clause starting at 4: 867; CM-NEXT: MOV * T0.Y, KC0[3].X, 868; CM-NEXT: MOV * T0.X, KC0[2].W, 869; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 870; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 871entry: 872 store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4 873 ret void 874} 875 876define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind { 877; SI-LABEL: v3i8_arg: 878; SI: ; %bb.0: ; %entry 879; SI-NEXT: s_load_dword s4, s[0:1], 0xb 880; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 881; SI-NEXT: s_mov_b32 s3, 0xf000 882; SI-NEXT: s_waitcnt lgkmcnt(0) 883; SI-NEXT: s_lshr_b32 s5, s4, 16 884; SI-NEXT: s_mov_b32 s2, -1 885; SI-NEXT: v_mov_b32_e32 v0, s4 886; SI-NEXT: v_mov_b32_e32 v1, s5 887; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2 888; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 889; SI-NEXT: s_endpgm 890; 891; VI-LABEL: v3i8_arg: 892; VI: ; %bb.0: ; %entry 893; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 894; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 895; VI-NEXT: s_waitcnt lgkmcnt(0) 896; VI-NEXT: s_lshr_b32 s3, s2, 16 897; VI-NEXT: v_mov_b32_e32 v0, s0 898; VI-NEXT: v_mov_b32_e32 v1, s1 899; VI-NEXT: s_add_u32 s0, s0, 2 900; VI-NEXT: s_addc_u32 s1, s1, 0 901; VI-NEXT: v_mov_b32_e32 v3, s1 902; VI-NEXT: v_mov_b32_e32 v5, s3 903; VI-NEXT: v_mov_b32_e32 v2, s0 904; VI-NEXT: v_mov_b32_e32 v4, s2 905; VI-NEXT: flat_store_byte v[2:3], v5 906; VI-NEXT: flat_store_short v[0:1], v4 907; VI-NEXT: s_endpgm 908; 909; GFX9-LABEL: v3i8_arg: 910; GFX9: ; %bb.0: ; %entry 911; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 912; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 913; GFX9-NEXT: v_mov_b32_e32 v0, 0 914; GFX9-NEXT: s_waitcnt lgkmcnt(0) 915; GFX9-NEXT: v_mov_b32_e32 v1, s2 
916; GFX9-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:2 917; GFX9-NEXT: global_store_short v0, v1, s[0:1] 918; GFX9-NEXT: s_endpgm 919; 920; EG-LABEL: v3i8_arg: 921; EG: ; %bb.0: ; %entry 922; EG-NEXT: ALU 0, @12, KC0[], KC1[] 923; EG-NEXT: TEX 2 @6 924; EG-NEXT: ALU 28, @13, KC0[CB0:0-32], KC1[] 925; EG-NEXT: MEM_RAT MSKOR T4.XW, T7.X 926; EG-NEXT: MEM_RAT MSKOR T5.XW, T6.X 927; EG-NEXT: CF_END 928; EG-NEXT: Fetch clause starting at 6: 929; EG-NEXT: VTX_READ_8 T5.X, T4.X, 41, #3 930; EG-NEXT: VTX_READ_8 T6.X, T4.X, 42, #3 931; EG-NEXT: VTX_READ_8 T4.X, T4.X, 40, #3 932; EG-NEXT: ALU clause starting at 12: 933; EG-NEXT: MOV * T4.X, 0.0, 934; EG-NEXT: ALU clause starting at 13: 935; EG-NEXT: LSHL T0.W, T5.X, literal.x, 936; EG-NEXT: AND_INT * T1.W, T4.X, literal.y, 937; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43) 938; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x, 939; EG-NEXT: OR_INT * T0.W, PV.W, PS, 940; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 941; EG-NEXT: AND_INT T0.W, PS, literal.x, 942; EG-NEXT: LSHL * T1.W, PV.W, literal.y, 943; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 944; EG-NEXT: LSHL T4.X, PV.W, PS, 945; EG-NEXT: LSHL * T4.W, literal.x, PS, 946; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 947; EG-NEXT: MOV T4.Y, 0.0, 948; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 949; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 950; EG-NEXT: AND_INT T1.W, PV.W, literal.x, 951; EG-NEXT: AND_INT * T2.W, T6.X, literal.y, 952; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43) 953; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 954; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 955; EG-NEXT: LSHL T5.X, T2.W, PV.W, 956; EG-NEXT: LSHL * T5.W, literal.x, PV.W, 957; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 958; EG-NEXT: MOV T5.Y, 0.0, 959; EG-NEXT: MOV T4.Z, 0.0, 960; EG-NEXT: MOV * T5.Z, 0.0, 961; EG-NEXT: LSHR T6.X, T0.W, literal.x, 962; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 963; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 964; 965; CM-LABEL: v3i8_arg: 966; CM: ; %bb.0: ; 
%entry 967; CM-NEXT: ALU 0, @12, KC0[], KC1[] 968; CM-NEXT: TEX 2 @6 969; CM-NEXT: ALU 29, @13, KC0[CB0:0-32], KC1[] 970; CM-NEXT: MEM_RAT MSKOR T4.XW, T7.X 971; CM-NEXT: MEM_RAT MSKOR T5.XW, T6.X 972; CM-NEXT: CF_END 973; CM-NEXT: Fetch clause starting at 6: 974; CM-NEXT: VTX_READ_8 T5.X, T4.X, 41, #3 975; CM-NEXT: VTX_READ_8 T6.X, T4.X, 42, #3 976; CM-NEXT: VTX_READ_8 T4.X, T4.X, 40, #3 977; CM-NEXT: ALU clause starting at 12: 978; CM-NEXT: MOV * T4.X, 0.0, 979; CM-NEXT: ALU clause starting at 13: 980; CM-NEXT: LSHL T0.Z, T5.X, literal.x, 981; CM-NEXT: AND_INT * T0.W, T4.X, literal.y, BS:VEC_120/SCL_212 982; CM-NEXT: 8(1.121039e-44), 255(3.573311e-43) 983; CM-NEXT: AND_INT T1.Z, KC0[2].Y, literal.x, 984; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 985; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 986; CM-NEXT: AND_INT T0.Z, PV.W, literal.x, 987; CM-NEXT: LSHL * T0.W, PV.Z, literal.y, 988; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 989; CM-NEXT: LSHL T4.X, PV.Z, PV.W, 990; CM-NEXT: LSHL * T4.W, literal.x, PV.W, 991; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 992; CM-NEXT: MOV T4.Y, 0.0, 993; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 994; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 995; CM-NEXT: AND_INT * T1.W, PV.W, literal.x, 996; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 997; CM-NEXT: AND_INT T0.Z, T6.X, literal.x, 998; CM-NEXT: LSHL * T1.W, PV.W, literal.y, 999; CM-NEXT: 255(3.573311e-43), 3(4.203895e-45) 1000; CM-NEXT: LSHL T5.X, PV.Z, PV.W, 1001; CM-NEXT: LSHL * T5.W, literal.x, PV.W, 1002; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1003; CM-NEXT: MOV T5.Y, 0.0, 1004; CM-NEXT: MOV * T4.Z, 0.0, 1005; CM-NEXT: MOV * T5.Z, 0.0, 1006; CM-NEXT: LSHR * T6.X, T0.W, literal.x, 1007; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1008; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 1009; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1010entry: 1011 store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4 1012 ret void 1013} 1014 1015define amdgpu_kernel void @v3i16_arg(<3 x i16> 
addrspace(1)* nocapture %out, <3 x i16> %in) nounwind { 1016; SI-LABEL: v3i16_arg: 1017; SI: ; %bb.0: ; %entry 1018; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1019; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1020; SI-NEXT: s_mov_b32 s3, 0xf000 1021; SI-NEXT: s_mov_b32 s2, -1 1022; SI-NEXT: s_waitcnt lgkmcnt(0) 1023; SI-NEXT: v_mov_b32_e32 v0, s5 1024; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 1025; SI-NEXT: s_waitcnt expcnt(0) 1026; SI-NEXT: v_mov_b32_e32 v0, s4 1027; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1028; SI-NEXT: s_endpgm 1029; 1030; VI-LABEL: v3i16_arg: 1031; VI: ; %bb.0: ; %entry 1032; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1033; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 1034; VI-NEXT: s_waitcnt lgkmcnt(0) 1035; VI-NEXT: s_add_u32 s4, s2, 4 1036; VI-NEXT: s_addc_u32 s5, s3, 0 1037; VI-NEXT: v_mov_b32_e32 v2, s4 1038; VI-NEXT: v_mov_b32_e32 v4, s1 1039; VI-NEXT: v_mov_b32_e32 v0, s2 1040; VI-NEXT: v_mov_b32_e32 v3, s5 1041; VI-NEXT: v_mov_b32_e32 v1, s3 1042; VI-NEXT: v_mov_b32_e32 v5, s0 1043; VI-NEXT: flat_store_short v[2:3], v4 1044; VI-NEXT: flat_store_dword v[0:1], v5 1045; VI-NEXT: s_endpgm 1046; 1047; GFX9-LABEL: v3i16_arg: 1048; GFX9: ; %bb.0: ; %entry 1049; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1050; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1051; GFX9-NEXT: v_mov_b32_e32 v0, 0 1052; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1053; GFX9-NEXT: v_mov_b32_e32 v1, s1 1054; GFX9-NEXT: v_mov_b32_e32 v2, s0 1055; GFX9-NEXT: global_store_short v0, v1, s[2:3] offset:4 1056; GFX9-NEXT: global_store_dword v0, v2, s[2:3] 1057; GFX9-NEXT: s_endpgm 1058; 1059; EG-LABEL: v3i16_arg: 1060; EG: ; %bb.0: ; %entry 1061; EG-NEXT: ALU 0, @12, KC0[], KC1[] 1062; EG-NEXT: TEX 2 @6 1063; EG-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[] 1064; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 0 1065; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X 1066; EG-NEXT: CF_END 1067; EG-NEXT: Fetch clause starting at 6: 1068; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3 
1069; EG-NEXT: VTX_READ_16 T7.X, T5.X, 46, #3 1070; EG-NEXT: VTX_READ_16 T5.X, T5.X, 48, #3 1071; EG-NEXT: ALU clause starting at 12: 1072; EG-NEXT: MOV * T5.X, 0.0, 1073; EG-NEXT: ALU clause starting at 13: 1074; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1075; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) 1076; EG-NEXT: AND_INT T1.W, PV.W, literal.x, 1077; EG-NEXT: AND_INT * T2.W, T5.X, literal.y, 1078; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) 1079; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 1080; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1081; EG-NEXT: LSHL T5.X, T2.W, PV.W, 1082; EG-NEXT: LSHL * T5.W, literal.x, PV.W, 1083; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1084; EG-NEXT: MOV T5.Y, 0.0, 1085; EG-NEXT: MOV * T5.Z, 0.0, 1086; EG-NEXT: LSHR T8.X, T0.W, literal.x, 1087; EG-NEXT: LSHL T0.W, T7.X, literal.y, 1088; EG-NEXT: AND_INT * T1.W, T6.X, literal.z, 1089; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) 1090; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1091; EG-NEXT: OR_INT T6.X, PV.W, PS, 1092; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 1093; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1094; 1095; CM-LABEL: v3i16_arg: 1096; CM: ; %bb.0: ; %entry 1097; CM-NEXT: ALU 0, @12, KC0[], KC1[] 1098; CM-NEXT: TEX 2 @6 1099; CM-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[] 1100; CM-NEXT: MEM_RAT MSKOR T5.XW, T8.X 1101; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T7.X 1102; CM-NEXT: CF_END 1103; CM-NEXT: Fetch clause starting at 6: 1104; CM-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3 1105; CM-NEXT: VTX_READ_16 T7.X, T5.X, 46, #3 1106; CM-NEXT: VTX_READ_16 T5.X, T5.X, 48, #3 1107; CM-NEXT: ALU clause starting at 12: 1108; CM-NEXT: MOV * T5.X, 0.0, 1109; CM-NEXT: ALU clause starting at 13: 1110; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1111; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00) 1112; CM-NEXT: AND_INT * T1.W, PV.W, literal.x, 1113; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1114; CM-NEXT: AND_INT T0.Z, T5.X, literal.x, 1115; CM-NEXT: LSHL * T1.W, PV.W, literal.y, 1116; 
CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 1117; CM-NEXT: LSHL T5.X, PV.Z, PV.W, 1118; CM-NEXT: LSHL * T5.W, literal.x, PV.W, 1119; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1120; CM-NEXT: MOV T5.Y, 0.0, 1121; CM-NEXT: MOV * T5.Z, 0.0, 1122; CM-NEXT: LSHL T0.Z, T7.X, literal.x, 1123; CM-NEXT: AND_INT * T1.W, T6.X, literal.y, BS:VEC_120/SCL_212 1124; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 1125; CM-NEXT: OR_INT * T6.X, PV.Z, PV.W, 1126; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 1127; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1128; CM-NEXT: LSHR * T8.X, T0.W, literal.x, 1129; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1130entry: 1131 store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4 1132 ret void 1133} 1134 1135define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind { 1136; SI-LABEL: v3i32_arg: 1137; SI: ; %bb.0: ; %entry 1138; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1139; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1140; SI-NEXT: s_mov_b32 s3, 0xf000 1141; SI-NEXT: s_mov_b32 s2, -1 1142; SI-NEXT: s_waitcnt lgkmcnt(0) 1143; SI-NEXT: v_mov_b32_e32 v0, s6 1144; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 1145; SI-NEXT: s_waitcnt expcnt(0) 1146; SI-NEXT: v_mov_b32_e32 v0, s4 1147; SI-NEXT: v_mov_b32_e32 v1, s5 1148; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1149; SI-NEXT: s_endpgm 1150; 1151; VI-LABEL: v3i32_arg: 1152; VI: ; %bb.0: ; %entry 1153; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1154; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1155; VI-NEXT: s_waitcnt lgkmcnt(0) 1156; VI-NEXT: v_mov_b32_e32 v0, s4 1157; VI-NEXT: v_mov_b32_e32 v4, s1 1158; VI-NEXT: v_mov_b32_e32 v1, s5 1159; VI-NEXT: v_mov_b32_e32 v2, s6 1160; VI-NEXT: v_mov_b32_e32 v3, s0 1161; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] 1162; VI-NEXT: s_endpgm 1163; 1164; GFX9-LABEL: v3i32_arg: 1165; GFX9: ; %bb.0: ; %entry 1166; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 1167; GFX9-NEXT: s_load_dwordx2 
s[6:7], s[4:5], 0x0 1168; GFX9-NEXT: v_mov_b32_e32 v3, 0 1169; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1170; GFX9-NEXT: v_mov_b32_e32 v0, s0 1171; GFX9-NEXT: v_mov_b32_e32 v1, s1 1172; GFX9-NEXT: v_mov_b32_e32 v2, s2 1173; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] 1174; GFX9-NEXT: s_endpgm 1175; 1176; EG-LABEL: v3i32_arg: 1177; EG: ; %bb.0: ; %entry 1178; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] 1179; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 1180; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1181; EG-NEXT: CF_END 1182; EG-NEXT: ALU clause starting at 4: 1183; EG-NEXT: MOV * T0.Y, KC0[3].Z, 1184; EG-NEXT: MOV T0.X, KC0[3].Y, 1185; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1186; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1187; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1188; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1189; EG-NEXT: LSHR T2.X, PV.W, literal.x, 1190; EG-NEXT: MOV * T3.X, KC0[3].W, 1191; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1192; 1193; CM-LABEL: v3i32_arg: 1194; CM: ; %bb.0: ; %entry 1195; CM-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] 1196; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T3.X 1197; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X 1198; CM-NEXT: CF_END 1199; CM-NEXT: ALU clause starting at 4: 1200; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1201; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1202; CM-NEXT: LSHR * T0.X, PV.W, literal.x, 1203; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1204; CM-NEXT: MOV T1.X, KC0[3].W, 1205; CM-NEXT: MOV * T2.Y, KC0[3].Z, 1206; CM-NEXT: MOV * T2.X, KC0[3].Y, 1207; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 1208; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1209entry: 1210 store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4 1211 ret void 1212} 1213 1214define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind { 1215; SI-LABEL: v3f32_arg: 1216; SI: ; %bb.0: ; %entry 1217; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1218; SI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 1219; SI-NEXT: s_mov_b32 s3, 0xf000 1220; SI-NEXT: s_mov_b32 s2, -1 1221; SI-NEXT: s_waitcnt lgkmcnt(0) 1222; SI-NEXT: v_mov_b32_e32 v0, s6 1223; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 1224; SI-NEXT: s_waitcnt expcnt(0) 1225; SI-NEXT: v_mov_b32_e32 v0, s4 1226; SI-NEXT: v_mov_b32_e32 v1, s5 1227; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1228; SI-NEXT: s_endpgm 1229; 1230; VI-LABEL: v3f32_arg: 1231; VI: ; %bb.0: ; %entry 1232; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1233; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1234; VI-NEXT: s_waitcnt lgkmcnt(0) 1235; VI-NEXT: v_mov_b32_e32 v0, s4 1236; VI-NEXT: v_mov_b32_e32 v4, s1 1237; VI-NEXT: v_mov_b32_e32 v1, s5 1238; VI-NEXT: v_mov_b32_e32 v2, s6 1239; VI-NEXT: v_mov_b32_e32 v3, s0 1240; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] 1241; VI-NEXT: s_endpgm 1242; 1243; GFX9-LABEL: v3f32_arg: 1244; GFX9: ; %bb.0: ; %entry 1245; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 1246; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 1247; GFX9-NEXT: v_mov_b32_e32 v3, 0 1248; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1249; GFX9-NEXT: v_mov_b32_e32 v0, s0 1250; GFX9-NEXT: v_mov_b32_e32 v1, s1 1251; GFX9-NEXT: v_mov_b32_e32 v2, s2 1252; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] 1253; GFX9-NEXT: s_endpgm 1254; 1255; EG-LABEL: v3f32_arg: 1256; EG: ; %bb.0: ; %entry 1257; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] 1258; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 1259; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1260; EG-NEXT: CF_END 1261; EG-NEXT: ALU clause starting at 4: 1262; EG-NEXT: MOV * T0.Y, KC0[3].Z, 1263; EG-NEXT: MOV T0.X, KC0[3].Y, 1264; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1265; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1266; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1267; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1268; EG-NEXT: LSHR T2.X, PV.W, literal.x, 1269; EG-NEXT: MOV * T3.X, KC0[3].W, 1270; EG-NEXT: 2(2.802597e-45), 
0(0.000000e+00) 1271; 1272; CM-LABEL: v3f32_arg: 1273; CM: ; %bb.0: ; %entry 1274; CM-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] 1275; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T3.X 1276; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X 1277; CM-NEXT: CF_END 1278; CM-NEXT: ALU clause starting at 4: 1279; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1280; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1281; CM-NEXT: LSHR * T0.X, PV.W, literal.x, 1282; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1283; CM-NEXT: MOV T1.X, KC0[3].W, 1284; CM-NEXT: MOV * T2.Y, KC0[3].Z, 1285; CM-NEXT: MOV * T2.X, KC0[3].Y, 1286; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 1287; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1288entry: 1289 store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4 1290 ret void 1291} 1292 1293define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) { 1294; SI-LABEL: v4i8_arg: 1295; SI: ; %bb.0: ; %entry 1296; SI-NEXT: s_load_dword s4, s[0:1], 0xb 1297; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1298; SI-NEXT: s_mov_b32 s3, 0xf000 1299; SI-NEXT: s_mov_b32 s2, -1 1300; SI-NEXT: s_waitcnt lgkmcnt(0) 1301; SI-NEXT: v_mov_b32_e32 v0, s4 1302; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1303; SI-NEXT: s_endpgm 1304; 1305; VI-LABEL: v4i8_arg: 1306; VI: ; %bb.0: ; %entry 1307; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1308; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 1309; VI-NEXT: s_waitcnt lgkmcnt(0) 1310; VI-NEXT: v_mov_b32_e32 v0, s2 1311; VI-NEXT: v_mov_b32_e32 v1, s3 1312; VI-NEXT: v_mov_b32_e32 v2, s0 1313; VI-NEXT: flat_store_dword v[0:1], v2 1314; VI-NEXT: s_endpgm 1315; 1316; GFX9-LABEL: v4i8_arg: 1317; GFX9: ; %bb.0: ; %entry 1318; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 1319; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1320; GFX9-NEXT: v_mov_b32_e32 v0, 0 1321; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1322; GFX9-NEXT: v_mov_b32_e32 v1, s2 1323; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1324; GFX9-NEXT: s_endpgm 1325; 1326; EG-LABEL: 
v4i8_arg: 1327; EG: ; %bb.0: ; %entry 1328; EG-NEXT: ALU 0, @14, KC0[], KC1[] 1329; EG-NEXT: TEX 3 @6 1330; EG-NEXT: ALU 15, @15, KC0[CB0:0-32], KC1[] 1331; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1 1332; EG-NEXT: CF_END 1333; EG-NEXT: PAD 1334; EG-NEXT: Fetch clause starting at 6: 1335; EG-NEXT: VTX_READ_8 T5.X, T4.X, 42, #3 1336; EG-NEXT: VTX_READ_8 T6.X, T4.X, 40, #3 1337; EG-NEXT: VTX_READ_8 T7.X, T4.X, 43, #3 1338; EG-NEXT: VTX_READ_8 T4.X, T4.X, 41, #3 1339; EG-NEXT: ALU clause starting at 14: 1340; EG-NEXT: MOV * T4.X, 0.0, 1341; EG-NEXT: ALU clause starting at 15: 1342; EG-NEXT: AND_INT * T0.W, T5.X, literal.x, 1343; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1344; EG-NEXT: AND_INT T0.Z, T4.X, literal.x, 1345; EG-NEXT: LSHL T0.W, PV.W, literal.y, 1346; EG-NEXT: LSHL * T1.W, T7.X, literal.z, 1347; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44) 1348; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) 1349; EG-NEXT: OR_INT T0.W, PS, PV.W, 1350; EG-NEXT: LSHL * T1.W, PV.Z, literal.x, 1351; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1352; EG-NEXT: OR_INT T0.W, PV.W, PS, 1353; EG-NEXT: AND_INT * T1.W, T6.X, literal.x, 1354; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1355; EG-NEXT: OR_INT T4.X, PV.W, PS, 1356; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 1357; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1358; 1359; CM-LABEL: v4i8_arg: 1360; CM: ; %bb.0: ; %entry 1361; CM-NEXT: ALU 0, @14, KC0[], KC1[] 1362; CM-NEXT: TEX 3 @6 1363; CM-NEXT: ALU 15, @15, KC0[CB0:0-32], KC1[] 1364; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T5.X 1365; CM-NEXT: CF_END 1366; CM-NEXT: PAD 1367; CM-NEXT: Fetch clause starting at 6: 1368; CM-NEXT: VTX_READ_8 T5.X, T4.X, 42, #3 1369; CM-NEXT: VTX_READ_8 T6.X, T4.X, 40, #3 1370; CM-NEXT: VTX_READ_8 T7.X, T4.X, 43, #3 1371; CM-NEXT: VTX_READ_8 T4.X, T4.X, 41, #3 1372; CM-NEXT: ALU clause starting at 14: 1373; CM-NEXT: MOV * T4.X, 0.0, 1374; CM-NEXT: ALU clause starting at 15: 1375; CM-NEXT: AND_INT * T0.W, T5.X, literal.x, 1376; CM-NEXT: 
255(3.573311e-43), 0(0.000000e+00) 1377; CM-NEXT: AND_INT T0.Y, T4.X, literal.x, 1378; CM-NEXT: LSHL T0.Z, PV.W, literal.y, 1379; CM-NEXT: LSHL * T0.W, T7.X, literal.z, BS:VEC_120/SCL_212 1380; CM-NEXT: 255(3.573311e-43), 16(2.242078e-44) 1381; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00) 1382; CM-NEXT: OR_INT T0.Z, PV.W, PV.Z, 1383; CM-NEXT: LSHL * T0.W, PV.Y, literal.x, 1384; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1385; CM-NEXT: OR_INT T0.Z, PV.Z, PV.W, 1386; CM-NEXT: AND_INT * T0.W, T6.X, literal.x, 1387; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1388; CM-NEXT: OR_INT * T4.X, PV.Z, PV.W, 1389; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 1390; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1391entry: 1392 store <4 x i8> %in, <4 x i8> addrspace(1)* %out 1393 ret void 1394} 1395 1396define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) { 1397; SI-LABEL: v4i16_arg: 1398; SI: ; %bb.0: ; %entry 1399; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1400; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1401; SI-NEXT: s_mov_b32 s3, 0xf000 1402; SI-NEXT: s_mov_b32 s2, -1 1403; SI-NEXT: s_waitcnt lgkmcnt(0) 1404; SI-NEXT: v_mov_b32_e32 v0, s4 1405; SI-NEXT: v_mov_b32_e32 v1, s5 1406; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1407; SI-NEXT: s_endpgm 1408; 1409; VI-LABEL: v4i16_arg: 1410; VI: ; %bb.0: ; %entry 1411; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1412; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 1413; VI-NEXT: s_waitcnt lgkmcnt(0) 1414; VI-NEXT: v_mov_b32_e32 v0, s2 1415; VI-NEXT: v_mov_b32_e32 v3, s1 1416; VI-NEXT: v_mov_b32_e32 v1, s3 1417; VI-NEXT: v_mov_b32_e32 v2, s0 1418; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1419; VI-NEXT: s_endpgm 1420; 1421; GFX9-LABEL: v4i16_arg: 1422; GFX9: ; %bb.0: ; %entry 1423; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1424; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1425; GFX9-NEXT: v_mov_b32_e32 v2, 0 1426; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1427; GFX9-NEXT: v_mov_b32_e32 v0, s0 1428; 
GFX9-NEXT: v_mov_b32_e32 v1, s1 1429; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 1430; GFX9-NEXT: s_endpgm 1431; 1432; EG-LABEL: v4i16_arg: 1433; EG: ; %bb.0: ; %entry 1434; EG-NEXT: ALU 1, @20, KC0[], KC1[] 1435; EG-NEXT: TEX 0 @12 1436; EG-NEXT: ALU 5, @22, KC0[], KC1[] 1437; EG-NEXT: TEX 0 @14 1438; EG-NEXT: ALU 5, @28, KC0[], KC1[] 1439; EG-NEXT: TEX 0 @16 1440; EG-NEXT: ALU 5, @34, KC0[], KC1[] 1441; EG-NEXT: TEX 0 @18 1442; EG-NEXT: ALU 7, @40, KC0[CB0:0-32], KC1[] 1443; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1 1444; EG-NEXT: CF_END 1445; EG-NEXT: PAD 1446; EG-NEXT: Fetch clause starting at 12: 1447; EG-NEXT: VTX_READ_16 T6.X, T5.X, 50, #3 1448; EG-NEXT: Fetch clause starting at 14: 1449; EG-NEXT: VTX_READ_16 T6.X, T5.X, 48, #3 1450; EG-NEXT: Fetch clause starting at 16: 1451; EG-NEXT: VTX_READ_16 T6.X, T5.X, 46, #3 1452; EG-NEXT: Fetch clause starting at 18: 1453; EG-NEXT: VTX_READ_16 T5.X, T5.X, 44, #3 1454; EG-NEXT: ALU clause starting at 20: 1455; EG-NEXT: MOV * T0.Y, T3.X, 1456; EG-NEXT: MOV * T5.X, 0.0, 1457; EG-NEXT: ALU clause starting at 22: 1458; EG-NEXT: LSHL T0.W, T6.X, literal.x, 1459; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 1460; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 1461; EG-NEXT: OR_INT * T0.W, PS, PV.W, 1462; EG-NEXT: MOV * T3.X, PV.W, 1463; EG-NEXT: MOV * T0.Y, PV.X, 1464; EG-NEXT: ALU clause starting at 28: 1465; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 1466; EG-NEXT: AND_INT * T1.W, T6.X, literal.y, 1467; EG-NEXT: -65536(nan), 65535(9.183409e-41) 1468; EG-NEXT: OR_INT * T0.W, PV.W, PS, 1469; EG-NEXT: MOV T3.X, PV.W, 1470; EG-NEXT: MOV * T0.Y, T2.X, 1471; EG-NEXT: ALU clause starting at 34: 1472; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 1473; EG-NEXT: LSHL * T1.W, T6.X, literal.y, 1474; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 1475; EG-NEXT: OR_INT * T0.W, PV.W, PS, 1476; EG-NEXT: MOV * T2.X, PV.W, 1477; EG-NEXT: MOV * T0.Y, PV.X, 1478; EG-NEXT: ALU clause starting at 40: 1479; EG-NEXT: LSHR T6.X, KC0[2].Y, 
literal.x, 1480; EG-NEXT: AND_INT T0.W, T0.Y, literal.y, 1481; EG-NEXT: AND_INT * T1.W, T5.X, literal.z, 1482; EG-NEXT: 2(2.802597e-45), -65536(nan) 1483; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1484; EG-NEXT: OR_INT * T5.X, PV.W, PS, 1485; EG-NEXT: MOV T2.X, PV.X, 1486; EG-NEXT: MOV * T5.Y, T3.X, 1487; 1488; CM-LABEL: v4i16_arg: 1489; CM: ; %bb.0: ; %entry 1490; CM-NEXT: ALU 1, @20, KC0[], KC1[] 1491; CM-NEXT: TEX 0 @12 1492; CM-NEXT: ALU 5, @22, KC0[], KC1[] 1493; CM-NEXT: TEX 0 @14 1494; CM-NEXT: ALU 5, @28, KC0[], KC1[] 1495; CM-NEXT: TEX 0 @16 1496; CM-NEXT: ALU 5, @34, KC0[], KC1[] 1497; CM-NEXT: TEX 0 @18 1498; CM-NEXT: ALU 7, @40, KC0[CB0:0-32], KC1[] 1499; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X 1500; CM-NEXT: CF_END 1501; CM-NEXT: PAD 1502; CM-NEXT: Fetch clause starting at 12: 1503; CM-NEXT: VTX_READ_16 T6.X, T5.X, 50, #3 1504; CM-NEXT: Fetch clause starting at 14: 1505; CM-NEXT: VTX_READ_16 T6.X, T5.X, 48, #3 1506; CM-NEXT: Fetch clause starting at 16: 1507; CM-NEXT: VTX_READ_16 T6.X, T5.X, 46, #3 1508; CM-NEXT: Fetch clause starting at 18: 1509; CM-NEXT: VTX_READ_16 T5.X, T5.X, 44, #3 1510; CM-NEXT: ALU clause starting at 20: 1511; CM-NEXT: MOV * T0.Y, T3.X, 1512; CM-NEXT: MOV * T5.X, 0.0, 1513; CM-NEXT: ALU clause starting at 22: 1514; CM-NEXT: LSHL T0.Z, T6.X, literal.x, 1515; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, 1516; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 1517; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, 1518; CM-NEXT: MOV * T3.X, PV.W, 1519; CM-NEXT: MOV * T0.Y, PV.X, 1520; CM-NEXT: ALU clause starting at 28: 1521; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 1522; CM-NEXT: AND_INT * T0.W, T6.X, literal.y, 1523; CM-NEXT: -65536(nan), 65535(9.183409e-41) 1524; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 1525; CM-NEXT: MOV T3.X, PV.W, 1526; CM-NEXT: MOV * T0.Y, T2.X, 1527; CM-NEXT: ALU clause starting at 34: 1528; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 1529; CM-NEXT: LSHL * T0.W, T6.X, literal.y, 1530; CM-NEXT: 65535(9.183409e-41), 
16(2.242078e-44) 1531; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 1532; CM-NEXT: MOV * T2.X, PV.W, 1533; CM-NEXT: MOV * T0.Y, PV.X, 1534; CM-NEXT: ALU clause starting at 40: 1535; CM-NEXT: LSHR T6.X, KC0[2].Y, literal.x, 1536; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y, 1537; CM-NEXT: AND_INT * T0.W, T5.X, literal.z, 1538; CM-NEXT: 2(2.802597e-45), -65536(nan) 1539; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1540; CM-NEXT: OR_INT * T5.X, PV.Z, PV.W, 1541; CM-NEXT: MOV T2.X, PV.X, 1542; CM-NEXT: MOV * T5.Y, T3.X, 1543entry: 1544 store <4 x i16> %in, <4 x i16> addrspace(1)* %out 1545 ret void 1546} 1547 1548define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind { 1549; SI-LABEL: v4i32_arg: 1550; SI: ; %bb.0: ; %entry 1551; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1552; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1553; SI-NEXT: s_mov_b32 s3, 0xf000 1554; SI-NEXT: s_mov_b32 s2, -1 1555; SI-NEXT: s_waitcnt lgkmcnt(0) 1556; SI-NEXT: v_mov_b32_e32 v0, s4 1557; SI-NEXT: v_mov_b32_e32 v1, s5 1558; SI-NEXT: v_mov_b32_e32 v2, s6 1559; SI-NEXT: v_mov_b32_e32 v3, s7 1560; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1561; SI-NEXT: s_endpgm 1562; 1563; VI-LABEL: v4i32_arg: 1564; VI: ; %bb.0: ; %entry 1565; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1566; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 1567; VI-NEXT: s_waitcnt lgkmcnt(0) 1568; VI-NEXT: v_mov_b32_e32 v4, s4 1569; VI-NEXT: v_mov_b32_e32 v0, s0 1570; VI-NEXT: v_mov_b32_e32 v5, s5 1571; VI-NEXT: v_mov_b32_e32 v1, s1 1572; VI-NEXT: v_mov_b32_e32 v2, s2 1573; VI-NEXT: v_mov_b32_e32 v3, s3 1574; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1575; VI-NEXT: s_endpgm 1576; 1577; GFX9-LABEL: v4i32_arg: 1578; GFX9: ; %bb.0: ; %entry 1579; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 1580; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 1581; GFX9-NEXT: v_mov_b32_e32 v4, 0 1582; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1583; GFX9-NEXT: v_mov_b32_e32 v0, s0 1584; GFX9-NEXT: v_mov_b32_e32 v1, 
s1 1585; GFX9-NEXT: v_mov_b32_e32 v2, s2 1586; GFX9-NEXT: v_mov_b32_e32 v3, s3 1587; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 1588; GFX9-NEXT: s_endpgm 1589; 1590; EG-LABEL: v4i32_arg: 1591; EG: ; %bb.0: ; %entry 1592; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 1593; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 1594; EG-NEXT: CF_END 1595; EG-NEXT: PAD 1596; EG-NEXT: ALU clause starting at 4: 1597; EG-NEXT: MOV * T0.W, KC0[4].X, 1598; EG-NEXT: MOV * T0.Z, KC0[3].W, 1599; EG-NEXT: MOV * T0.Y, KC0[3].Z, 1600; EG-NEXT: MOV T0.X, KC0[3].Y, 1601; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1602; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1603; 1604; CM-LABEL: v4i32_arg: 1605; CM: ; %bb.0: ; %entry 1606; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 1607; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 1608; CM-NEXT: CF_END 1609; CM-NEXT: PAD 1610; CM-NEXT: ALU clause starting at 4: 1611; CM-NEXT: MOV * T0.W, KC0[4].X, 1612; CM-NEXT: MOV * T0.Z, KC0[3].W, 1613; CM-NEXT: MOV * T0.Y, KC0[3].Z, 1614; CM-NEXT: MOV * T0.X, KC0[3].Y, 1615; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1616; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1617entry: 1618 store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 1619 ret void 1620} 1621 1622define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind { 1623; SI-LABEL: v4f32_arg: 1624; SI: ; %bb.0: ; %entry 1625; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1626; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1627; SI-NEXT: s_mov_b32 s3, 0xf000 1628; SI-NEXT: s_mov_b32 s2, -1 1629; SI-NEXT: s_waitcnt lgkmcnt(0) 1630; SI-NEXT: v_mov_b32_e32 v0, s4 1631; SI-NEXT: v_mov_b32_e32 v1, s5 1632; SI-NEXT: v_mov_b32_e32 v2, s6 1633; SI-NEXT: v_mov_b32_e32 v3, s7 1634; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1635; SI-NEXT: s_endpgm 1636; 1637; VI-LABEL: v4f32_arg: 1638; VI: ; %bb.0: ; %entry 1639; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1640; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x34 1641; VI-NEXT: s_waitcnt lgkmcnt(0) 1642; VI-NEXT: v_mov_b32_e32 v4, s4 1643; VI-NEXT: v_mov_b32_e32 v0, s0 1644; VI-NEXT: v_mov_b32_e32 v5, s5 1645; VI-NEXT: v_mov_b32_e32 v1, s1 1646; VI-NEXT: v_mov_b32_e32 v2, s2 1647; VI-NEXT: v_mov_b32_e32 v3, s3 1648; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1649; VI-NEXT: s_endpgm 1650; 1651; GFX9-LABEL: v4f32_arg: 1652; GFX9: ; %bb.0: ; %entry 1653; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 1654; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 1655; GFX9-NEXT: v_mov_b32_e32 v4, 0 1656; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1657; GFX9-NEXT: v_mov_b32_e32 v0, s0 1658; GFX9-NEXT: v_mov_b32_e32 v1, s1 1659; GFX9-NEXT: v_mov_b32_e32 v2, s2 1660; GFX9-NEXT: v_mov_b32_e32 v3, s3 1661; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 1662; GFX9-NEXT: s_endpgm 1663; 1664; EG-LABEL: v4f32_arg: 1665; EG: ; %bb.0: ; %entry 1666; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 1667; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 1668; EG-NEXT: CF_END 1669; EG-NEXT: PAD 1670; EG-NEXT: ALU clause starting at 4: 1671; EG-NEXT: MOV * T0.W, KC0[4].X, 1672; EG-NEXT: MOV * T0.Z, KC0[3].W, 1673; EG-NEXT: MOV * T0.Y, KC0[3].Z, 1674; EG-NEXT: MOV T0.X, KC0[3].Y, 1675; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1676; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1677; 1678; CM-LABEL: v4f32_arg: 1679; CM: ; %bb.0: ; %entry 1680; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 1681; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 1682; CM-NEXT: CF_END 1683; CM-NEXT: PAD 1684; CM-NEXT: ALU clause starting at 4: 1685; CM-NEXT: MOV * T0.W, KC0[4].X, 1686; CM-NEXT: MOV * T0.Z, KC0[3].W, 1687; CM-NEXT: MOV * T0.Y, KC0[3].Z, 1688; CM-NEXT: MOV * T0.X, KC0[3].Y, 1689; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1690; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1691entry: 1692 store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4 1693 ret void 1694} 1695 1696define amdgpu_kernel void @v5i8_arg(<5 x i8> addrspace(1)* nocapture %out, <5 x i8> %in) nounwind { 
1697; SI-LABEL: v5i8_arg: 1698; SI: ; %bb.0: ; %entry 1699; SI-NEXT: s_load_dword s2, s[0:1], 0xc 1700; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1701; SI-NEXT: s_load_dword s0, s[0:1], 0xb 1702; SI-NEXT: s_mov_b32 s7, 0xf000 1703; SI-NEXT: s_mov_b32 s6, -1 1704; SI-NEXT: s_waitcnt lgkmcnt(0) 1705; SI-NEXT: v_mov_b32_e32 v0, s2 1706; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:4 1707; SI-NEXT: s_waitcnt expcnt(0) 1708; SI-NEXT: v_mov_b32_e32 v0, s0 1709; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1710; SI-NEXT: s_endpgm 1711; 1712; VI-LABEL: v5i8_arg: 1713; VI: ; %bb.0: ; %entry 1714; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1715; VI-NEXT: s_load_dword s4, s[0:1], 0x30 1716; VI-NEXT: s_load_dword s5, s[0:1], 0x2c 1717; VI-NEXT: s_waitcnt lgkmcnt(0) 1718; VI-NEXT: s_add_u32 s0, s2, 4 1719; VI-NEXT: s_addc_u32 s1, s3, 0 1720; VI-NEXT: v_mov_b32_e32 v3, s1 1721; VI-NEXT: v_mov_b32_e32 v4, s4 1722; VI-NEXT: v_mov_b32_e32 v0, s2 1723; VI-NEXT: v_mov_b32_e32 v2, s0 1724; VI-NEXT: v_mov_b32_e32 v1, s3 1725; VI-NEXT: flat_store_byte v[2:3], v4 1726; VI-NEXT: v_mov_b32_e32 v2, s5 1727; VI-NEXT: flat_store_dword v[0:1], v2 1728; VI-NEXT: s_endpgm 1729; 1730; GFX9-LABEL: v5i8_arg: 1731; GFX9: ; %bb.0: ; %entry 1732; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1733; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1734; GFX9-NEXT: v_mov_b32_e32 v0, 0 1735; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1736; GFX9-NEXT: v_mov_b32_e32 v1, s1 1737; GFX9-NEXT: v_mov_b32_e32 v2, s0 1738; GFX9-NEXT: global_store_byte v0, v1, s[2:3] offset:4 1739; GFX9-NEXT: global_store_dword v0, v2, s[2:3] 1740; GFX9-NEXT: s_endpgm 1741; 1742; EG-LABEL: v5i8_arg: 1743; EG: ; %bb.0: ; %entry 1744; EG-NEXT: ALU 0, @16, KC0[], KC1[] 1745; EG-NEXT: TEX 4 @6 1746; EG-NEXT: ALU 28, @17, KC0[CB0:0-32], KC1[] 1747; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X 1748; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 1 1749; EG-NEXT: CF_END 1750; EG-NEXT: Fetch clause starting at 6: 1751; EG-NEXT: VTX_READ_8 T6.X, 
T5.X, 44, #3 1752; EG-NEXT: VTX_READ_8 T7.X, T5.X, 47, #3 1753; EG-NEXT: VTX_READ_8 T8.X, T5.X, 45, #3 1754; EG-NEXT: VTX_READ_8 T9.X, T5.X, 46, #3 1755; EG-NEXT: VTX_READ_8 T5.X, T5.X, 48, #3 1756; EG-NEXT: ALU clause starting at 16: 1757; EG-NEXT: MOV * T5.X, 0.0, 1758; EG-NEXT: ALU clause starting at 17: 1759; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1760; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) 1761; EG-NEXT: AND_INT T1.W, PV.W, literal.x, 1762; EG-NEXT: AND_INT * T2.W, T5.X, literal.y, 1763; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43) 1764; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 1765; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1766; EG-NEXT: LSHL T5.X, T2.W, PV.W, 1767; EG-NEXT: LSHL * T5.W, literal.x, PV.W, 1768; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1769; EG-NEXT: MOV T5.Y, 0.0, 1770; EG-NEXT: MOV T5.Z, 0.0, 1771; EG-NEXT: AND_INT T1.W, T9.X, literal.x, 1772; EG-NEXT: AND_INT * T0.Z, T8.X, literal.x, 1773; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1774; EG-NEXT: LSHL T1.W, PV.W, literal.x, 1775; EG-NEXT: LSHL * T2.W, T7.X, literal.y, 1776; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) 1777; EG-NEXT: OR_INT T1.W, PS, PV.W, 1778; EG-NEXT: LSHL * T2.W, T0.Z, literal.x, 1779; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1780; EG-NEXT: OR_INT T1.W, PV.W, PS, 1781; EG-NEXT: AND_INT * T2.W, T6.X, literal.x, 1782; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1783; EG-NEXT: OR_INT T6.X, PV.W, PS, 1784; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 1785; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1786; EG-NEXT: LSHR * T8.X, T0.W, literal.x, 1787; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1788; 1789; CM-LABEL: v5i8_arg: 1790; CM: ; %bb.0: ; %entry 1791; CM-NEXT: ALU 0, @16, KC0[], KC1[] 1792; CM-NEXT: TEX 4 @6 1793; CM-NEXT: ALU 28, @17, KC0[CB0:0-32], KC1[] 1794; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T8.X 1795; CM-NEXT: MEM_RAT MSKOR T5.XW, T7.X 1796; CM-NEXT: CF_END 1797; CM-NEXT: Fetch clause starting at 6: 1798; CM-NEXT: VTX_READ_8 T6.X, T5.X, 44, #3 
1799; CM-NEXT: VTX_READ_8 T7.X, T5.X, 47, #3 1800; CM-NEXT: VTX_READ_8 T8.X, T5.X, 45, #3 1801; CM-NEXT: VTX_READ_8 T9.X, T5.X, 46, #3 1802; CM-NEXT: VTX_READ_8 T5.X, T5.X, 48, #3 1803; CM-NEXT: ALU clause starting at 16: 1804; CM-NEXT: MOV * T5.X, 0.0, 1805; CM-NEXT: ALU clause starting at 17: 1806; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1807; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00) 1808; CM-NEXT: AND_INT * T1.W, PV.W, literal.x, 1809; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1810; CM-NEXT: AND_INT T0.Z, T5.X, literal.x, 1811; CM-NEXT: LSHL * T1.W, PV.W, literal.y, 1812; CM-NEXT: 255(3.573311e-43), 3(4.203895e-45) 1813; CM-NEXT: LSHL T5.X, PV.Z, PV.W, 1814; CM-NEXT: LSHL * T5.W, literal.x, PV.W, 1815; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1816; CM-NEXT: MOV T5.Y, 0.0, 1817; CM-NEXT: MOV T5.Z, 0.0, 1818; CM-NEXT: AND_INT * T1.W, T9.X, literal.x, 1819; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1820; CM-NEXT: AND_INT T0.Y, T8.X, literal.x, 1821; CM-NEXT: LSHL T0.Z, PV.W, literal.y, 1822; CM-NEXT: LSHL * T1.W, T7.X, literal.z, BS:VEC_120/SCL_212 1823; CM-NEXT: 255(3.573311e-43), 16(2.242078e-44) 1824; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00) 1825; CM-NEXT: OR_INT T0.Z, PV.W, PV.Z, 1826; CM-NEXT: LSHL * T1.W, PV.Y, literal.x, 1827; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1828; CM-NEXT: LSHR T7.X, T0.W, literal.x, 1829; CM-NEXT: OR_INT T0.Z, PV.Z, PV.W, 1830; CM-NEXT: AND_INT * T0.W, T6.X, literal.y, 1831; CM-NEXT: 2(2.802597e-45), 255(3.573311e-43) 1832; CM-NEXT: OR_INT * T6.X, PV.Z, PV.W, 1833; CM-NEXT: LSHR * T8.X, KC0[2].Y, literal.x, 1834; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1835entry: 1836 store <5 x i8> %in, <5 x i8> addrspace(1)* %out, align 4 1837 ret void 1838} 1839 1840define amdgpu_kernel void @v5i16_arg(<5 x i16> addrspace(1)* nocapture %out, <5 x i16> %in) nounwind { 1841; SI-LABEL: v5i16_arg: 1842; SI: ; %bb.0: ; %entry 1843; SI-NEXT: s_load_dword s2, s[0:1], 0xf 1844; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1845; SI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0xd 1846; SI-NEXT: s_mov_b32 s7, 0xf000 1847; SI-NEXT: s_mov_b32 s6, -1 1848; SI-NEXT: s_waitcnt lgkmcnt(0) 1849; SI-NEXT: v_mov_b32_e32 v0, s2 1850; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:8 1851; SI-NEXT: s_waitcnt expcnt(0) 1852; SI-NEXT: v_mov_b32_e32 v0, s0 1853; SI-NEXT: v_mov_b32_e32 v1, s1 1854; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1855; SI-NEXT: s_endpgm 1856; 1857; VI-LABEL: v5i16_arg: 1858; VI: ; %bb.0: ; %entry 1859; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1860; VI-NEXT: s_load_dword s5, s[0:1], 0x3c 1861; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1862; VI-NEXT: s_waitcnt lgkmcnt(0) 1863; VI-NEXT: s_add_u32 s4, s2, 8 1864; VI-NEXT: v_mov_b32_e32 v4, s5 1865; VI-NEXT: s_addc_u32 s5, s3, 0 1866; VI-NEXT: v_mov_b32_e32 v2, s4 1867; VI-NEXT: v_mov_b32_e32 v3, s5 1868; VI-NEXT: v_mov_b32_e32 v0, s2 1869; VI-NEXT: flat_store_short v[2:3], v4 1870; VI-NEXT: v_mov_b32_e32 v3, s1 1871; VI-NEXT: v_mov_b32_e32 v1, s3 1872; VI-NEXT: v_mov_b32_e32 v2, s0 1873; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1874; VI-NEXT: s_endpgm 1875; 1876; GFX9-LABEL: v5i16_arg: 1877; GFX9: ; %bb.0: ; %entry 1878; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 1879; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 1880; GFX9-NEXT: v_mov_b32_e32 v2, 0 1881; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1882; GFX9-NEXT: v_mov_b32_e32 v3, s2 1883; GFX9-NEXT: v_mov_b32_e32 v0, s0 1884; GFX9-NEXT: v_mov_b32_e32 v1, s1 1885; GFX9-NEXT: global_store_short v2, v3, s[6:7] offset:8 1886; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] 1887; GFX9-NEXT: s_endpgm 1888; 1889; EG-LABEL: v5i16_arg: 1890; EG: ; %bb.0: ; %entry 1891; EG-NEXT: ALU 0, @20, KC0[], KC1[] 1892; EG-NEXT: TEX 4 @10 1893; EG-NEXT: ALU 65, @21, KC0[CB0:0-32], KC1[] 1894; EG-NEXT: MEM_RAT MSKOR T5.XW, T9.X 1895; EG-NEXT: MEM_RAT MSKOR T4.XW, T7.X 1896; EG-NEXT: MEM_RAT MSKOR T3.XW, T2.X 1897; EG-NEXT: MEM_RAT MSKOR T6.XW, T1.X 1898; EG-NEXT: MEM_RAT MSKOR T8.XW, T0.X 1899; 
EG-NEXT: CF_END 1900; EG-NEXT: PAD 1901; EG-NEXT: Fetch clause starting at 10: 1902; EG-NEXT: VTX_READ_16 T1.X, T0.X, 58, #3 1903; EG-NEXT: VTX_READ_16 T2.X, T0.X, 56, #3 1904; EG-NEXT: VTX_READ_16 T3.X, T0.X, 54, #3 1905; EG-NEXT: VTX_READ_16 T4.X, T0.X, 52, #3 1906; EG-NEXT: VTX_READ_16 T0.X, T0.X, 60, #3 1907; EG-NEXT: ALU clause starting at 20: 1908; EG-NEXT: MOV * T0.X, 0.0, 1909; EG-NEXT: ALU clause starting at 21: 1910; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1911; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1912; EG-NEXT: AND_INT T1.W, PV.W, literal.x, 1913; EG-NEXT: AND_INT * T2.W, T0.X, literal.y, 1914; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) 1915; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 1916; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1917; EG-NEXT: LSHL T5.X, T2.W, PV.W, 1918; EG-NEXT: LSHL * T5.W, literal.x, PV.W, 1919; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1920; EG-NEXT: MOV T5.Y, 0.0, 1921; EG-NEXT: AND_INT T1.W, KC0[2].Y, literal.x, 1922; EG-NEXT: AND_INT * T2.W, T4.X, literal.y, 1923; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) 1924; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 1925; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1926; EG-NEXT: LSHL T4.X, T2.W, PV.W, 1927; EG-NEXT: LSHL * T4.W, literal.x, PV.W, 1928; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1929; EG-NEXT: MOV T4.Y, 0.0, 1930; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, 1931; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1932; EG-NEXT: AND_INT T2.W, PV.W, literal.x, 1933; EG-NEXT: AND_INT * T3.W, T3.X, literal.y, 1934; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) 1935; EG-NEXT: LSHL * T2.W, PV.W, literal.x, 1936; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1937; EG-NEXT: LSHL T3.X, T3.W, PV.W, 1938; EG-NEXT: LSHL * T3.W, literal.x, PV.W, 1939; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1940; EG-NEXT: MOV T3.Y, 0.0, 1941; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 1942; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) 1943; EG-NEXT: AND_INT T6.W, PV.W, literal.x, 1944; 
EG-NEXT: AND_INT * T7.W, T2.X, literal.y, 1945; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) 1946; EG-NEXT: LSHL * T6.W, PV.W, literal.x, 1947; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1948; EG-NEXT: LSHL T6.X, T7.W, PV.W, 1949; EG-NEXT: LSHL * T6.W, literal.x, PV.W, 1950; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1951; EG-NEXT: MOV T6.Y, 0.0, 1952; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.x, 1953; EG-NEXT: 6(8.407791e-45), 0(0.000000e+00) 1954; EG-NEXT: AND_INT T8.W, PV.W, literal.x, 1955; EG-NEXT: AND_INT * T9.W, T1.X, literal.y, 1956; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) 1957; EG-NEXT: LSHL * T8.W, PV.W, literal.x, 1958; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1959; EG-NEXT: LSHL T8.X, T9.W, PV.W, 1960; EG-NEXT: LSHL * T8.W, literal.x, PV.W, 1961; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1962; EG-NEXT: MOV T8.Y, 0.0, 1963; EG-NEXT: MOV T5.Z, 0.0, 1964; EG-NEXT: MOV * T4.Z, 0.0, 1965; EG-NEXT: MOV T3.Z, 0.0, 1966; EG-NEXT: MOV * T6.Z, 0.0, 1967; EG-NEXT: MOV * T8.Z, 0.0, 1968; EG-NEXT: LSHR T0.X, T7.W, literal.x, 1969; EG-NEXT: LSHR * T1.X, T2.W, literal.x, 1970; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1971; EG-NEXT: LSHR T2.X, T1.W, literal.x, 1972; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 1973; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1974; EG-NEXT: LSHR * T9.X, T0.W, literal.x, 1975; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1976; 1977; CM-LABEL: v5i16_arg: 1978; CM: ; %bb.0: ; %entry 1979; CM-NEXT: ALU 0, @20, KC0[], KC1[] 1980; CM-NEXT: TEX 4 @10 1981; CM-NEXT: ALU 67, @21, KC0[CB0:0-32], KC1[] 1982; CM-NEXT: MEM_RAT MSKOR T5.XW, T9.X 1983; CM-NEXT: MEM_RAT MSKOR T4.XW, T7.X 1984; CM-NEXT: MEM_RAT MSKOR T3.XW, T2.X 1985; CM-NEXT: MEM_RAT MSKOR T6.XW, T1.X 1986; CM-NEXT: MEM_RAT MSKOR T8.XW, T0.X 1987; CM-NEXT: CF_END 1988; CM-NEXT: PAD 1989; CM-NEXT: Fetch clause starting at 10: 1990; CM-NEXT: VTX_READ_16 T1.X, T0.X, 58, #3 1991; CM-NEXT: VTX_READ_16 T2.X, T0.X, 56, #3 1992; CM-NEXT: VTX_READ_16 T3.X, T0.X, 54, #3 1993; CM-NEXT: 
VTX_READ_16 T4.X, T0.X, 52, #3 1994; CM-NEXT: VTX_READ_16 T0.X, T0.X, 60, #3 1995; CM-NEXT: ALU clause starting at 20: 1996; CM-NEXT: MOV * T0.X, 0.0, 1997; CM-NEXT: ALU clause starting at 21: 1998; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1999; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 2000; CM-NEXT: AND_INT * T1.W, PV.W, literal.x, 2001; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 2002; CM-NEXT: AND_INT T0.Z, T0.X, literal.x, 2003; CM-NEXT: LSHL * T1.W, PV.W, literal.y, 2004; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 2005; CM-NEXT: LSHL T5.X, PV.Z, PV.W, 2006; CM-NEXT: LSHL * T5.W, literal.x, PV.W, 2007; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 2008; CM-NEXT: MOV T5.Y, 0.0, 2009; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 2010; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 2011; CM-NEXT: AND_INT T0.Z, T4.X, literal.x, 2012; CM-NEXT: LSHL * T1.W, PV.W, literal.y, 2013; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 2014; CM-NEXT: LSHL T4.X, PV.Z, PV.W, 2015; CM-NEXT: LSHL * T4.W, literal.x, PV.W, 2016; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 2017; CM-NEXT: MOV T4.Y, 0.0, 2018; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, 2019; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2020; CM-NEXT: AND_INT * T2.W, PV.W, literal.x, 2021; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 2022; CM-NEXT: AND_INT T0.Z, T3.X, literal.x, 2023; CM-NEXT: LSHL * T2.W, PV.W, literal.y, 2024; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 2025; CM-NEXT: LSHL T3.X, PV.Z, PV.W, 2026; CM-NEXT: LSHL * T3.W, literal.x, PV.W, 2027; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 2028; CM-NEXT: MOV T3.Y, 0.0, 2029; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 2030; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00) 2031; CM-NEXT: AND_INT * T6.W, PV.W, literal.x, 2032; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 2033; CM-NEXT: AND_INT T0.Z, T2.X, literal.x, 2034; CM-NEXT: LSHL * T6.W, PV.W, literal.y, 2035; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 2036; CM-NEXT: LSHL T6.X, PV.Z, PV.W, 2037; 
CM-NEXT: LSHL * T6.W, literal.x, PV.W, 2038; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 2039; CM-NEXT: MOV T6.Y, 0.0, 2040; CM-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.x, 2041; CM-NEXT: 6(8.407791e-45), 0(0.000000e+00) 2042; CM-NEXT: AND_INT * T8.W, PV.W, literal.x, 2043; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 2044; CM-NEXT: AND_INT T0.Z, T1.X, literal.x, 2045; CM-NEXT: LSHL * T8.W, PV.W, literal.y, 2046; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 2047; CM-NEXT: LSHL T8.X, PV.Z, PV.W, 2048; CM-NEXT: LSHL * T8.W, literal.x, PV.W, 2049; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 2050; CM-NEXT: MOV T8.Y, 0.0, 2051; CM-NEXT: MOV * T5.Z, 0.0, 2052; CM-NEXT: MOV * T4.Z, 0.0, 2053; CM-NEXT: MOV * T3.Z, 0.0, 2054; CM-NEXT: MOV * T6.Z, 0.0, 2055; CM-NEXT: MOV * T8.Z, 0.0, 2056; CM-NEXT: LSHR * T0.X, T7.W, literal.x, 2057; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2058; CM-NEXT: LSHR * T1.X, T2.W, literal.x, 2059; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2060; CM-NEXT: LSHR * T2.X, T1.W, literal.x, 2061; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2062; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 2063; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2064; CM-NEXT: LSHR * T9.X, T0.W, literal.x, 2065; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2066entry: 2067 store <5 x i16> %in, <5 x i16> addrspace(1)* %out, align 4 2068 ret void 2069} 2070 2071define amdgpu_kernel void @v5i32_arg(<5 x i32> addrspace(1)* nocapture %out, <5 x i32> %in) nounwind { 2072; SI-LABEL: v5i32_arg: 2073; SI: ; %bb.0: ; %entry 2074; SI-NEXT: s_load_dword s8, s[0:1], 0x15 2075; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2076; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 2077; SI-NEXT: s_mov_b32 s7, 0xf000 2078; SI-NEXT: s_mov_b32 s6, -1 2079; SI-NEXT: s_waitcnt lgkmcnt(0) 2080; SI-NEXT: v_mov_b32_e32 v0, s8 2081; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16 2082; SI-NEXT: s_waitcnt expcnt(0) 2083; SI-NEXT: v_mov_b32_e32 v0, s0 2084; SI-NEXT: v_mov_b32_e32 v1, s1 2085; SI-NEXT: 
v_mov_b32_e32 v2, s2 2086; SI-NEXT: v_mov_b32_e32 v3, s3 2087; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 2088; SI-NEXT: s_endpgm 2089; 2090; VI-LABEL: v5i32_arg: 2091; VI: ; %bb.0: ; %entry 2092; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 2093; VI-NEXT: s_load_dword s7, s[0:1], 0x54 2094; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 2095; VI-NEXT: s_waitcnt lgkmcnt(0) 2096; VI-NEXT: s_add_u32 s6, s4, 16 2097; VI-NEXT: v_mov_b32_e32 v2, s7 2098; VI-NEXT: s_addc_u32 s7, s5, 0 2099; VI-NEXT: v_mov_b32_e32 v0, s6 2100; VI-NEXT: v_mov_b32_e32 v1, s7 2101; VI-NEXT: v_mov_b32_e32 v4, s4 2102; VI-NEXT: flat_store_dword v[0:1], v2 2103; VI-NEXT: v_mov_b32_e32 v0, s0 2104; VI-NEXT: v_mov_b32_e32 v5, s5 2105; VI-NEXT: v_mov_b32_e32 v1, s1 2106; VI-NEXT: v_mov_b32_e32 v2, s2 2107; VI-NEXT: v_mov_b32_e32 v3, s3 2108; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2109; VI-NEXT: s_endpgm 2110; 2111; GFX9-LABEL: v5i32_arg: 2112; GFX9: ; %bb.0: ; %entry 2113; GFX9-NEXT: s_load_dword s8, s[4:5], 0x30 2114; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 2115; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 2116; GFX9-NEXT: v_mov_b32_e32 v4, 0 2117; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2118; GFX9-NEXT: v_mov_b32_e32 v5, s8 2119; GFX9-NEXT: v_mov_b32_e32 v0, s0 2120; GFX9-NEXT: v_mov_b32_e32 v1, s1 2121; GFX9-NEXT: v_mov_b32_e32 v2, s2 2122; GFX9-NEXT: v_mov_b32_e32 v3, s3 2123; GFX9-NEXT: global_store_dword v4, v5, s[6:7] offset:16 2124; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 2125; GFX9-NEXT: s_endpgm 2126; 2127; EG-LABEL: v5i32_arg: 2128; EG: ; %bb.0: ; %entry 2129; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[] 2130; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 2131; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 2132; EG-NEXT: CF_END 2133; EG-NEXT: ALU clause starting at 4: 2134; EG-NEXT: MOV * T0.W, KC0[5].X, 2135; EG-NEXT: MOV * T0.Z, KC0[4].W, 2136; EG-NEXT: MOV * T0.Y, KC0[4].Z, 2137; EG-NEXT: MOV T0.X, KC0[4].Y, 2138; EG-NEXT: LSHR * T1.X, KC0[2].Y, 
literal.x, 2139; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2140; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, 2141; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2142; EG-NEXT: LSHR T2.X, PV.W, literal.x, 2143; EG-NEXT: MOV * T3.X, KC0[5].Y, 2144; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2145; 2146; CM-LABEL: v5i32_arg: 2147; CM: ; %bb.0: ; %entry 2148; CM-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[] 2149; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X 2150; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X 2151; CM-NEXT: CF_END 2152; CM-NEXT: ALU clause starting at 4: 2153; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x, 2154; CM-NEXT: MOV * T0.W, KC0[5].X, 2155; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2156; CM-NEXT: LSHR T1.X, PV.Z, literal.x, 2157; CM-NEXT: MOV * T0.Z, KC0[4].W, 2158; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2159; CM-NEXT: MOV T2.X, KC0[5].Y, 2160; CM-NEXT: MOV * T0.Y, KC0[4].Z, 2161; CM-NEXT: MOV * T0.X, KC0[4].Y, 2162; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 2163; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2164entry: 2165 store <5 x i32> %in, <5 x i32> addrspace(1)* %out, align 4 2166 ret void 2167} 2168 2169define amdgpu_kernel void @v5f32_arg(<5 x float> addrspace(1)* nocapture %out, <5 x float> %in) nounwind { 2170; SI-LABEL: v5f32_arg: 2171; SI: ; %bb.0: ; %entry 2172; SI-NEXT: s_load_dword s8, s[0:1], 0x15 2173; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2174; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 2175; SI-NEXT: s_mov_b32 s7, 0xf000 2176; SI-NEXT: s_mov_b32 s6, -1 2177; SI-NEXT: s_waitcnt lgkmcnt(0) 2178; SI-NEXT: v_mov_b32_e32 v0, s8 2179; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16 2180; SI-NEXT: s_waitcnt expcnt(0) 2181; SI-NEXT: v_mov_b32_e32 v0, s0 2182; SI-NEXT: v_mov_b32_e32 v1, s1 2183; SI-NEXT: v_mov_b32_e32 v2, s2 2184; SI-NEXT: v_mov_b32_e32 v3, s3 2185; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 2186; SI-NEXT: s_endpgm 2187; 2188; VI-LABEL: v5f32_arg: 2189; VI: ; %bb.0: ; %entry 2190; VI-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x24 2191; VI-NEXT: s_load_dword s7, s[0:1], 0x54 2192; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 2193; VI-NEXT: s_waitcnt lgkmcnt(0) 2194; VI-NEXT: s_add_u32 s6, s4, 16 2195; VI-NEXT: v_mov_b32_e32 v3, s7 2196; VI-NEXT: s_addc_u32 s7, s5, 0 2197; VI-NEXT: v_mov_b32_e32 v1, s6 2198; VI-NEXT: v_mov_b32_e32 v2, s7 2199; VI-NEXT: v_mov_b32_e32 v4, s4 2200; VI-NEXT: v_mov_b32_e32 v0, s0 2201; VI-NEXT: flat_store_dword v[1:2], v3 2202; VI-NEXT: v_mov_b32_e32 v1, s1 2203; VI-NEXT: v_mov_b32_e32 v2, s2 2204; VI-NEXT: v_mov_b32_e32 v3, s3 2205; VI-NEXT: v_mov_b32_e32 v5, s5 2206; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2207; VI-NEXT: s_endpgm 2208; 2209; GFX9-LABEL: v5f32_arg: 2210; GFX9: ; %bb.0: ; %entry 2211; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 2212; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 2213; GFX9-NEXT: v_mov_b32_e32 v4, 0 2214; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 2215; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2216; GFX9-NEXT: v_mov_b32_e32 v0, s0 2217; GFX9-NEXT: v_mov_b32_e32 v1, s1 2218; GFX9-NEXT: v_mov_b32_e32 v2, s2 2219; GFX9-NEXT: v_mov_b32_e32 v3, s3 2220; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 2221; GFX9-NEXT: s_nop 0 2222; GFX9-NEXT: v_mov_b32_e32 v0, s4 2223; GFX9-NEXT: global_store_dword v4, v0, s[6:7] offset:16 2224; GFX9-NEXT: s_endpgm 2225; 2226; EG-LABEL: v5f32_arg: 2227; EG: ; %bb.0: ; %entry 2228; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[] 2229; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 2230; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 2231; EG-NEXT: CF_END 2232; EG-NEXT: ALU clause starting at 4: 2233; EG-NEXT: MOV * T0.W, KC0[5].X, 2234; EG-NEXT: MOV * T0.Z, KC0[4].W, 2235; EG-NEXT: MOV * T0.Y, KC0[4].Z, 2236; EG-NEXT: MOV T0.X, KC0[4].Y, 2237; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 2238; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2239; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, 2240; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2241; EG-NEXT: LSHR T2.X, PV.W, 
literal.x, 2242; EG-NEXT: MOV * T3.X, KC0[5].Y, 2243; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2244; 2245; CM-LABEL: v5f32_arg: 2246; CM: ; %bb.0: ; %entry 2247; CM-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[] 2248; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X 2249; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X 2250; CM-NEXT: CF_END 2251; CM-NEXT: ALU clause starting at 4: 2252; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x, 2253; CM-NEXT: MOV * T0.W, KC0[5].X, 2254; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2255; CM-NEXT: LSHR T1.X, PV.Z, literal.x, 2256; CM-NEXT: MOV * T0.Z, KC0[4].W, 2257; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2258; CM-NEXT: MOV T2.X, KC0[5].Y, 2259; CM-NEXT: MOV * T0.Y, KC0[4].Z, 2260; CM-NEXT: MOV * T0.X, KC0[4].Y, 2261; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 2262; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2263entry: 2264 store <5 x float> %in, <5 x float> addrspace(1)* %out, align 4 2265 ret void 2266} 2267 2268define amdgpu_kernel void @v5i64_arg(<5 x i64> addrspace(1)* nocapture %out, <5 x i64> %in) nounwind { 2269; SI-LABEL: v5i64_arg: 2270; SI: ; %bb.0: ; %entry 2271; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x19 2272; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 2273; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x21 2274; SI-NEXT: s_mov_b32 s15, 0xf000 2275; SI-NEXT: s_mov_b32 s14, -1 2276; SI-NEXT: s_waitcnt lgkmcnt(0) 2277; SI-NEXT: v_mov_b32_e32 v0, s8 2278; SI-NEXT: v_mov_b32_e32 v1, s9 2279; SI-NEXT: v_mov_b32_e32 v2, s10 2280; SI-NEXT: v_mov_b32_e32 v3, s11 2281; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 2282; SI-NEXT: s_waitcnt expcnt(0) 2283; SI-NEXT: v_mov_b32_e32 v0, s4 2284; SI-NEXT: v_mov_b32_e32 v1, s5 2285; SI-NEXT: v_mov_b32_e32 v2, s6 2286; SI-NEXT: v_mov_b32_e32 v3, s7 2287; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 2288; SI-NEXT: s_waitcnt expcnt(0) 2289; SI-NEXT: v_mov_b32_e32 v0, s0 2290; SI-NEXT: v_mov_b32_e32 v1, s1 2291; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 
offset:32 2292; SI-NEXT: s_endpgm 2293; 2294; VI-LABEL: v5i64_arg: 2295; VI: ; %bb.0: ; %entry 2296; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64 2297; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 2298; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x84 2299; VI-NEXT: s_waitcnt lgkmcnt(0) 2300; VI-NEXT: v_mov_b32_e32 v0, s8 2301; VI-NEXT: s_add_u32 s8, s2, 16 2302; VI-NEXT: v_mov_b32_e32 v1, s9 2303; VI-NEXT: s_addc_u32 s9, s3, 0 2304; VI-NEXT: v_mov_b32_e32 v4, s8 2305; VI-NEXT: v_mov_b32_e32 v2, s10 2306; VI-NEXT: v_mov_b32_e32 v3, s11 2307; VI-NEXT: v_mov_b32_e32 v5, s9 2308; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2309; VI-NEXT: v_mov_b32_e32 v5, s3 2310; VI-NEXT: v_mov_b32_e32 v0, s4 2311; VI-NEXT: v_mov_b32_e32 v1, s5 2312; VI-NEXT: v_mov_b32_e32 v2, s6 2313; VI-NEXT: v_mov_b32_e32 v3, s7 2314; VI-NEXT: v_mov_b32_e32 v4, s2 2315; VI-NEXT: s_add_u32 s2, s2, 32 2316; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2317; VI-NEXT: s_addc_u32 s3, s3, 0 2318; VI-NEXT: v_mov_b32_e32 v2, s2 2319; VI-NEXT: v_mov_b32_e32 v0, s0 2320; VI-NEXT: v_mov_b32_e32 v1, s1 2321; VI-NEXT: v_mov_b32_e32 v3, s3 2322; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 2323; VI-NEXT: s_endpgm 2324; 2325; GFX9-LABEL: v5i64_arg: 2326; GFX9: ; %bb.0: ; %entry 2327; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 2328; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 2329; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x60 2330; GFX9-NEXT: v_mov_b32_e32 v4, 0 2331; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2332; GFX9-NEXT: v_mov_b32_e32 v0, s12 2333; GFX9-NEXT: v_mov_b32_e32 v1, s13 2334; GFX9-NEXT: v_mov_b32_e32 v2, s14 2335; GFX9-NEXT: v_mov_b32_e32 v3, s15 2336; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 2337; GFX9-NEXT: s_nop 0 2338; GFX9-NEXT: v_mov_b32_e32 v0, s8 2339; GFX9-NEXT: v_mov_b32_e32 v1, s9 2340; GFX9-NEXT: v_mov_b32_e32 v2, s10 2341; GFX9-NEXT: v_mov_b32_e32 v3, s11 2342; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 2343; GFX9-NEXT: s_nop 0 2344; GFX9-NEXT: v_mov_b32_e32 v0, s0 
2345; GFX9-NEXT: v_mov_b32_e32 v1, s1 2346; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] offset:32 2347; GFX9-NEXT: s_endpgm 2348; 2349; EG-LABEL: v5i64_arg: 2350; EG: ; %bb.0: ; %entry 2351; EG-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[] 2352; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 0 2353; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0 2354; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 2355; EG-NEXT: CF_END 2356; EG-NEXT: PAD 2357; EG-NEXT: ALU clause starting at 6: 2358; EG-NEXT: MOV * T0.W, KC0[7].X, 2359; EG-NEXT: MOV * T0.Z, KC0[6].W, 2360; EG-NEXT: MOV T0.Y, KC0[6].Z, 2361; EG-NEXT: MOV * T1.W, KC0[8].X, 2362; EG-NEXT: MOV T0.X, KC0[6].Y, 2363; EG-NEXT: MOV * T1.Z, KC0[7].W, 2364; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 2365; EG-NEXT: MOV * T1.Y, KC0[7].Z, 2366; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2367; EG-NEXT: MOV T1.X, KC0[7].Y, 2368; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 2369; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2370; EG-NEXT: LSHR T3.X, PV.W, literal.x, 2371; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, 2372; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44) 2373; EG-NEXT: LSHR T4.X, PV.W, literal.x, 2374; EG-NEXT: MOV T5.Y, KC0[8].Z, 2375; EG-NEXT: MOV * T5.X, KC0[8].Y, 2376; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2377; 2378; CM-LABEL: v5i64_arg: 2379; CM: ; %bb.0: ; %entry 2380; CM-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[] 2381; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X 2382; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T4.X 2383; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X 2384; CM-NEXT: CF_END 2385; CM-NEXT: PAD 2386; CM-NEXT: ALU clause starting at 6: 2387; CM-NEXT: MOV * T0.W, KC0[8].X, 2388; CM-NEXT: MOV T1.Y, KC0[8].Z, 2389; CM-NEXT: MOV * T0.Z, KC0[7].W, 2390; CM-NEXT: MOV T1.X, KC0[8].Y, 2391; CM-NEXT: MOV * T0.Y, KC0[7].Z, 2392; CM-NEXT: MOV T0.X, KC0[7].Y, 2393; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x, 2394; CM-NEXT: MOV * T2.W, KC0[7].X, 2395; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00) 
2396; CM-NEXT: LSHR T3.X, PV.Z, literal.x, 2397; CM-NEXT: MOV T2.Z, KC0[6].W, 2398; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, 2399; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) 2400; CM-NEXT: LSHR T4.X, PV.W, literal.x, 2401; CM-NEXT: MOV * T2.Y, KC0[6].Z, 2402; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2403; CM-NEXT: MOV * T2.X, KC0[6].Y, 2404; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 2405; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2406entry: 2407 store <5 x i64> %in, <5 x i64> addrspace(1)* %out, align 8 2408 ret void 2409} 2410 2411define amdgpu_kernel void @v5f64_arg(<5 x double> addrspace(1)* nocapture %out, <5 x double> %in) nounwind { 2412; SI-LABEL: v5f64_arg: 2413; SI: ; %bb.0: ; %entry 2414; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x19 2415; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 2416; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x21 2417; SI-NEXT: s_mov_b32 s15, 0xf000 2418; SI-NEXT: s_mov_b32 s14, -1 2419; SI-NEXT: s_waitcnt lgkmcnt(0) 2420; SI-NEXT: v_mov_b32_e32 v0, s8 2421; SI-NEXT: v_mov_b32_e32 v1, s9 2422; SI-NEXT: v_mov_b32_e32 v2, s10 2423; SI-NEXT: v_mov_b32_e32 v3, s11 2424; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 2425; SI-NEXT: s_waitcnt expcnt(0) 2426; SI-NEXT: v_mov_b32_e32 v0, s4 2427; SI-NEXT: v_mov_b32_e32 v1, s5 2428; SI-NEXT: v_mov_b32_e32 v2, s6 2429; SI-NEXT: v_mov_b32_e32 v3, s7 2430; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 2431; SI-NEXT: s_waitcnt expcnt(0) 2432; SI-NEXT: v_mov_b32_e32 v0, s0 2433; SI-NEXT: v_mov_b32_e32 v1, s1 2434; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 offset:32 2435; SI-NEXT: s_endpgm 2436; 2437; VI-LABEL: v5f64_arg: 2438; VI: ; %bb.0: ; %entry 2439; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64 2440; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 2441; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x84 2442; VI-NEXT: s_waitcnt lgkmcnt(0) 2443; VI-NEXT: v_mov_b32_e32 v0, s8 2444; VI-NEXT: s_add_u32 s8, s2, 16 2445; VI-NEXT: v_mov_b32_e32 v1, s9 2446; 
VI-NEXT: s_addc_u32 s9, s3, 0 2447; VI-NEXT: v_mov_b32_e32 v4, s8 2448; VI-NEXT: v_mov_b32_e32 v2, s10 2449; VI-NEXT: v_mov_b32_e32 v3, s11 2450; VI-NEXT: v_mov_b32_e32 v5, s9 2451; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2452; VI-NEXT: v_mov_b32_e32 v5, s3 2453; VI-NEXT: v_mov_b32_e32 v0, s4 2454; VI-NEXT: v_mov_b32_e32 v1, s5 2455; VI-NEXT: v_mov_b32_e32 v2, s6 2456; VI-NEXT: v_mov_b32_e32 v3, s7 2457; VI-NEXT: v_mov_b32_e32 v4, s2 2458; VI-NEXT: s_add_u32 s2, s2, 32 2459; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2460; VI-NEXT: s_addc_u32 s3, s3, 0 2461; VI-NEXT: v_mov_b32_e32 v2, s2 2462; VI-NEXT: v_mov_b32_e32 v0, s0 2463; VI-NEXT: v_mov_b32_e32 v1, s1 2464; VI-NEXT: v_mov_b32_e32 v3, s3 2465; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 2466; VI-NEXT: s_endpgm 2467; 2468; GFX9-LABEL: v5f64_arg: 2469; GFX9: ; %bb.0: ; %entry 2470; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 2471; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 2472; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x60 2473; GFX9-NEXT: v_mov_b32_e32 v4, 0 2474; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2475; GFX9-NEXT: v_mov_b32_e32 v0, s12 2476; GFX9-NEXT: v_mov_b32_e32 v1, s13 2477; GFX9-NEXT: v_mov_b32_e32 v2, s14 2478; GFX9-NEXT: v_mov_b32_e32 v3, s15 2479; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 2480; GFX9-NEXT: s_nop 0 2481; GFX9-NEXT: v_mov_b32_e32 v0, s8 2482; GFX9-NEXT: v_mov_b32_e32 v1, s9 2483; GFX9-NEXT: v_mov_b32_e32 v2, s10 2484; GFX9-NEXT: v_mov_b32_e32 v3, s11 2485; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 2486; GFX9-NEXT: s_nop 0 2487; GFX9-NEXT: v_mov_b32_e32 v0, s0 2488; GFX9-NEXT: v_mov_b32_e32 v1, s1 2489; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] offset:32 2490; GFX9-NEXT: s_endpgm 2491; 2492; EG-LABEL: v5f64_arg: 2493; EG: ; %bb.0: ; %entry 2494; EG-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[] 2495; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 0 2496; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0 2497; EG-NEXT: MEM_RAT_CACHELESS 
STORE_RAW T0.XYZW, T2.X, 1 2498; EG-NEXT: CF_END 2499; EG-NEXT: PAD 2500; EG-NEXT: ALU clause starting at 6: 2501; EG-NEXT: MOV * T0.W, KC0[7].X, 2502; EG-NEXT: MOV * T0.Z, KC0[6].W, 2503; EG-NEXT: MOV T0.Y, KC0[6].Z, 2504; EG-NEXT: MOV * T1.W, KC0[8].X, 2505; EG-NEXT: MOV T0.X, KC0[6].Y, 2506; EG-NEXT: MOV * T1.Z, KC0[7].W, 2507; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 2508; EG-NEXT: MOV * T1.Y, KC0[7].Z, 2509; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2510; EG-NEXT: MOV T1.X, KC0[7].Y, 2511; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 2512; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2513; EG-NEXT: LSHR T3.X, PV.W, literal.x, 2514; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, 2515; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44) 2516; EG-NEXT: LSHR T4.X, PV.W, literal.x, 2517; EG-NEXT: MOV T5.Y, KC0[8].Z, 2518; EG-NEXT: MOV * T5.X, KC0[8].Y, 2519; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2520; 2521; CM-LABEL: v5f64_arg: 2522; CM: ; %bb.0: ; %entry 2523; CM-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[] 2524; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X 2525; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T4.X 2526; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X 2527; CM-NEXT: CF_END 2528; CM-NEXT: PAD 2529; CM-NEXT: ALU clause starting at 6: 2530; CM-NEXT: MOV * T0.W, KC0[8].X, 2531; CM-NEXT: MOV T1.Y, KC0[8].Z, 2532; CM-NEXT: MOV * T0.Z, KC0[7].W, 2533; CM-NEXT: MOV T1.X, KC0[8].Y, 2534; CM-NEXT: MOV * T0.Y, KC0[7].Z, 2535; CM-NEXT: MOV T0.X, KC0[7].Y, 2536; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x, 2537; CM-NEXT: MOV * T2.W, KC0[7].X, 2538; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00) 2539; CM-NEXT: LSHR T3.X, PV.Z, literal.x, 2540; CM-NEXT: MOV T2.Z, KC0[6].W, 2541; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, 2542; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) 2543; CM-NEXT: LSHR T4.X, PV.W, literal.x, 2544; CM-NEXT: MOV * T2.Y, KC0[6].Z, 2545; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2546; CM-NEXT: MOV * T2.X, KC0[6].Y, 2547; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 
2548; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2549entry: 2550 store <5 x double> %in, <5 x double> addrspace(1)* %out, align 8 2551 ret void 2552} 2553 2554; FIXME: Lots of unpack and re-pack junk on EG/CM (the VI checks below are a plain dwordx2 load and store) 2555define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) { 2556; SI-LABEL: v8i8_arg: 2557; SI: ; %bb.0: ; %entry 2558; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 2559; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2560; SI-NEXT: s_mov_b32 s3, 0xf000 2561; SI-NEXT: s_mov_b32 s2, -1 2562; SI-NEXT: s_waitcnt lgkmcnt(0) 2563; SI-NEXT: v_mov_b32_e32 v0, s4 2564; SI-NEXT: v_mov_b32_e32 v1, s5 2565; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2566; SI-NEXT: s_endpgm 2567; 2568; VI-LABEL: v8i8_arg: 2569; VI: ; %bb.0: ; %entry 2570; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 2571; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 2572; VI-NEXT: s_waitcnt lgkmcnt(0) 2573; VI-NEXT: v_mov_b32_e32 v0, s2 2574; VI-NEXT: v_mov_b32_e32 v3, s1 2575; VI-NEXT: v_mov_b32_e32 v1, s3 2576; VI-NEXT: v_mov_b32_e32 v2, s0 2577; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 2578; VI-NEXT: s_endpgm 2579; 2580; GFX9-LABEL: v8i8_arg: 2581; GFX9: ; %bb.0: ; %entry 2582; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 2583; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 2584; GFX9-NEXT: v_mov_b32_e32 v2, 0 2585; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2586; GFX9-NEXT: v_mov_b32_e32 v0, s0 2587; GFX9-NEXT: v_mov_b32_e32 v1, s1 2588; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 2589; GFX9-NEXT: s_endpgm 2590; 2591; EG-LABEL: v8i8_arg: 2592; EG: ; %bb.0: ; %entry 2593; EG-NEXT: ALU 1, @36, KC0[], KC1[] 2594; EG-NEXT: TEX 0 @20 2595; EG-NEXT: ALU 5, @38, KC0[], KC1[] 2596; EG-NEXT: TEX 0 @22 2597; EG-NEXT: ALU 5, @44, KC0[], KC1[] 2598; EG-NEXT: TEX 0 @24 2599; EG-NEXT: ALU 7, @50, KC0[], KC1[] 2600; EG-NEXT: TEX 0 @26 2601; EG-NEXT: ALU 7, @58, KC0[], KC1[] 2602; EG-NEXT: TEX 0 @28 2603; EG-NEXT: ALU 7, @66, KC0[], KC1[] 2604; EG-NEXT: TEX 0 @30 2605; EG-NEXT: ALU 7, @74,
KC0[], KC1[] 2606; EG-NEXT: TEX 0 @32 2607; EG-NEXT: ALU 5, @82, KC0[], KC1[] 2608; EG-NEXT: TEX 0 @34 2609; EG-NEXT: ALU 5, @88, KC0[CB0:0-32], KC1[] 2610; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1 2611; EG-NEXT: CF_END 2612; EG-NEXT: PAD 2613; EG-NEXT: Fetch clause starting at 20: 2614; EG-NEXT: VTX_READ_8 T6.X, T5.X, 51, #3 2615; EG-NEXT: Fetch clause starting at 22: 2616; EG-NEXT: VTX_READ_8 T6.X, T5.X, 47, #3 2617; EG-NEXT: Fetch clause starting at 24: 2618; EG-NEXT: VTX_READ_8 T6.X, T5.X, 50, #3 2619; EG-NEXT: Fetch clause starting at 26: 2620; EG-NEXT: VTX_READ_8 T6.X, T5.X, 46, #3 2621; EG-NEXT: Fetch clause starting at 28: 2622; EG-NEXT: VTX_READ_8 T6.X, T5.X, 49, #3 2623; EG-NEXT: Fetch clause starting at 30: 2624; EG-NEXT: VTX_READ_8 T6.X, T5.X, 45, #3 2625; EG-NEXT: Fetch clause starting at 32: 2626; EG-NEXT: VTX_READ_8 T6.X, T5.X, 48, #3 2627; EG-NEXT: Fetch clause starting at 34: 2628; EG-NEXT: VTX_READ_8 T5.X, T5.X, 44, #3 2629; EG-NEXT: ALU clause starting at 36: 2630; EG-NEXT: MOV * T0.Y, T2.X, 2631; EG-NEXT: MOV * T5.X, 0.0, 2632; EG-NEXT: ALU clause starting at 38: 2633; EG-NEXT: LSHL T0.W, T6.X, literal.x, 2634; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 2635; EG-NEXT: 24(3.363116e-44), 16777215(2.350989e-38) 2636; EG-NEXT: OR_INT * T0.W, PS, PV.W, 2637; EG-NEXT: MOV T2.X, PV.W, 2638; EG-NEXT: MOV * T0.Y, T3.X, 2639; EG-NEXT: ALU clause starting at 44: 2640; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2641; EG-NEXT: LSHL * T1.W, T6.X, literal.y, 2642; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 2643; EG-NEXT: OR_INT * T0.W, PV.W, PS, 2644; EG-NEXT: MOV T3.X, PV.W, 2645; EG-NEXT: MOV * T0.Y, T2.X, 2646; EG-NEXT: ALU clause starting at 50: 2647; EG-NEXT: AND_INT T0.W, T6.X, literal.x, 2648; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 2649; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) 2650; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 2651; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2652; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 2653; 
EG-NEXT: MOV T2.X, PV.W, 2654; EG-NEXT: MOV * T0.Y, T3.X, 2655; EG-NEXT: ALU clause starting at 58: 2656; EG-NEXT: AND_INT T0.W, T6.X, literal.x, 2657; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 2658; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) 2659; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 2660; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2661; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 2662; EG-NEXT: MOV T3.X, PV.W, 2663; EG-NEXT: MOV * T0.Y, T2.X, 2664; EG-NEXT: ALU clause starting at 66: 2665; EG-NEXT: AND_INT T0.W, T6.X, literal.x, 2666; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 2667; EG-NEXT: 255(3.573311e-43), -65281(nan) 2668; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 2669; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 2670; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 2671; EG-NEXT: MOV T2.X, PV.W, 2672; EG-NEXT: MOV * T0.Y, T3.X, 2673; EG-NEXT: ALU clause starting at 74: 2674; EG-NEXT: AND_INT T0.W, T6.X, literal.x, 2675; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 2676; EG-NEXT: 255(3.573311e-43), -65281(nan) 2677; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 2678; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 2679; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 2680; EG-NEXT: MOV T3.X, PV.W, 2681; EG-NEXT: MOV * T0.Y, T2.X, 2682; EG-NEXT: ALU clause starting at 82: 2683; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2684; EG-NEXT: AND_INT * T1.W, T6.X, literal.y, 2685; EG-NEXT: -256(nan), 255(3.573311e-43) 2686; EG-NEXT: OR_INT * T5.Y, PV.W, PS, 2687; EG-NEXT: MOV T2.X, PV.Y, 2688; EG-NEXT: MOV * T0.Y, T3.X, 2689; EG-NEXT: ALU clause starting at 88: 2690; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2691; EG-NEXT: AND_INT * T1.W, T5.X, literal.y, 2692; EG-NEXT: -256(nan), 255(3.573311e-43) 2693; EG-NEXT: OR_INT T5.X, PV.W, PS, 2694; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x, 2695; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2696; 2697; CM-LABEL: v8i8_arg: 2698; CM: ; %bb.0: ; %entry 2699; CM-NEXT: ALU 1, @36, KC0[], KC1[] 2700; CM-NEXT: TEX 0 @20 2701; CM-NEXT: ALU 5, @38, KC0[], KC1[] 2702; CM-NEXT: TEX 0 
@22 2703; CM-NEXT: ALU 5, @44, KC0[], KC1[] 2704; CM-NEXT: TEX 0 @24 2705; CM-NEXT: ALU 7, @50, KC0[], KC1[] 2706; CM-NEXT: TEX 0 @26 2707; CM-NEXT: ALU 7, @58, KC0[], KC1[] 2708; CM-NEXT: TEX 0 @28 2709; CM-NEXT: ALU 7, @66, KC0[], KC1[] 2710; CM-NEXT: TEX 0 @30 2711; CM-NEXT: ALU 7, @74, KC0[], KC1[] 2712; CM-NEXT: TEX 0 @32 2713; CM-NEXT: ALU 5, @82, KC0[], KC1[] 2714; CM-NEXT: TEX 0 @34 2715; CM-NEXT: ALU 5, @88, KC0[CB0:0-32], KC1[] 2716; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X 2717; CM-NEXT: CF_END 2718; CM-NEXT: PAD 2719; CM-NEXT: Fetch clause starting at 20: 2720; CM-NEXT: VTX_READ_8 T6.X, T5.X, 51, #3 2721; CM-NEXT: Fetch clause starting at 22: 2722; CM-NEXT: VTX_READ_8 T6.X, T5.X, 47, #3 2723; CM-NEXT: Fetch clause starting at 24: 2724; CM-NEXT: VTX_READ_8 T6.X, T5.X, 50, #3 2725; CM-NEXT: Fetch clause starting at 26: 2726; CM-NEXT: VTX_READ_8 T6.X, T5.X, 46, #3 2727; CM-NEXT: Fetch clause starting at 28: 2728; CM-NEXT: VTX_READ_8 T6.X, T5.X, 49, #3 2729; CM-NEXT: Fetch clause starting at 30: 2730; CM-NEXT: VTX_READ_8 T6.X, T5.X, 45, #3 2731; CM-NEXT: Fetch clause starting at 32: 2732; CM-NEXT: VTX_READ_8 T6.X, T5.X, 48, #3 2733; CM-NEXT: Fetch clause starting at 34: 2734; CM-NEXT: VTX_READ_8 T5.X, T5.X, 44, #3 2735; CM-NEXT: ALU clause starting at 36: 2736; CM-NEXT: MOV * T0.Y, T2.X, 2737; CM-NEXT: MOV * T5.X, 0.0, 2738; CM-NEXT: ALU clause starting at 38: 2739; CM-NEXT: LSHL T0.Z, T6.X, literal.x, 2740; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, 2741; CM-NEXT: 24(3.363116e-44), 16777215(2.350989e-38) 2742; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, 2743; CM-NEXT: MOV T2.X, PV.W, 2744; CM-NEXT: MOV * T0.Y, T3.X, 2745; CM-NEXT: ALU clause starting at 44: 2746; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2747; CM-NEXT: LSHL * T0.W, T6.X, literal.y, 2748; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 2749; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 2750; CM-NEXT: MOV T3.X, PV.W, 2751; CM-NEXT: MOV * T0.Y, T2.X, 2752; CM-NEXT: ALU clause starting at 50: 2753; 
CM-NEXT: AND_INT * T0.W, T6.X, literal.x, 2754; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 2755; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2756; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 2757; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) 2758; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 2759; CM-NEXT: MOV T2.X, PV.W, 2760; CM-NEXT: MOV * T0.Y, T3.X, 2761; CM-NEXT: ALU clause starting at 58: 2762; CM-NEXT: AND_INT * T0.W, T6.X, literal.x, 2763; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 2764; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2765; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 2766; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) 2767; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 2768; CM-NEXT: MOV T3.X, PV.W, 2769; CM-NEXT: MOV * T0.Y, T2.X, 2770; CM-NEXT: ALU clause starting at 66: 2771; CM-NEXT: AND_INT * T0.W, T6.X, literal.x, 2772; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 2773; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2774; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 2775; CM-NEXT: -65281(nan), 8(1.121039e-44) 2776; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 2777; CM-NEXT: MOV T2.X, PV.W, 2778; CM-NEXT: MOV * T0.Y, T3.X, 2779; CM-NEXT: ALU clause starting at 74: 2780; CM-NEXT: AND_INT * T0.W, T6.X, literal.x, 2781; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 2782; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2783; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 2784; CM-NEXT: -65281(nan), 8(1.121039e-44) 2785; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 2786; CM-NEXT: MOV T3.X, PV.W, 2787; CM-NEXT: MOV * T0.Y, T2.X, 2788; CM-NEXT: ALU clause starting at 82: 2789; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2790; CM-NEXT: AND_INT * T0.W, T6.X, literal.y, 2791; CM-NEXT: -256(nan), 255(3.573311e-43) 2792; CM-NEXT: OR_INT * T5.Y, PV.Z, PV.W, 2793; CM-NEXT: MOV T2.X, PV.Y, 2794; CM-NEXT: MOV * T0.Y, T3.X, 2795; CM-NEXT: ALU clause starting at 88: 2796; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2797; CM-NEXT: AND_INT * T0.W, T5.X, literal.y, 2798; CM-NEXT: -256(nan), 255(3.573311e-43) 2799; CM-NEXT: OR_INT * T5.X, 
; Documentation for @v8i16_arg, whose definition begins on the following line
; (after the tail of the preceding function's checks).
; The kernel receives an <8 x i16> vector as a kernel argument and stores it,
; unmodified, through the addrspace(1) pointer %out.
; Expected output, grouped by check prefix:
;   - SI / VI / GFX9 checks: the vector is read with one s_load_dwordx4 and
;     written with a single dwordx4 store (buffer_, flat_ or global_ form).
;   - EG / CM checks: each of the eight elements is fetched with its own
;     VTX_READ_16 and merged into the result using AND_INT / LSHL / OR_INT.
; The file header states these assertions are autogenerated by
; utils/update_llc_test_checks.py; regenerate them rather than hand-editing.
PV.Z, PV.W, 2800; CM-NEXT: LSHR * T6.X, KC0[2].Y, literal.x, 2801; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2802entry: 2803 store <8 x i8> %in, <8 x i8> addrspace(1)* %out 2804 ret void 2805} 2806 2807define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) { 2808; SI-LABEL: v8i16_arg: 2809; SI: ; %bb.0: ; %entry 2810; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 2811; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2812; SI-NEXT: s_mov_b32 s3, 0xf000 2813; SI-NEXT: s_mov_b32 s2, -1 2814; SI-NEXT: s_waitcnt lgkmcnt(0) 2815; SI-NEXT: v_mov_b32_e32 v0, s4 2816; SI-NEXT: v_mov_b32_e32 v1, s5 2817; SI-NEXT: v_mov_b32_e32 v2, s6 2818; SI-NEXT: v_mov_b32_e32 v3, s7 2819; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2820; SI-NEXT: s_endpgm 2821; 2822; VI-LABEL: v8i16_arg: 2823; VI: ; %bb.0: ; %entry 2824; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 2825; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 2826; VI-NEXT: s_waitcnt lgkmcnt(0) 2827; VI-NEXT: v_mov_b32_e32 v4, s4 2828; VI-NEXT: v_mov_b32_e32 v0, s0 2829; VI-NEXT: v_mov_b32_e32 v5, s5 2830; VI-NEXT: v_mov_b32_e32 v1, s1 2831; VI-NEXT: v_mov_b32_e32 v2, s2 2832; VI-NEXT: v_mov_b32_e32 v3, s3 2833; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2834; VI-NEXT: s_endpgm 2835; 2836; GFX9-LABEL: v8i16_arg: 2837; GFX9: ; %bb.0: ; %entry 2838; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 2839; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 2840; GFX9-NEXT: v_mov_b32_e32 v4, 0 2841; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2842; GFX9-NEXT: v_mov_b32_e32 v0, s0 2843; GFX9-NEXT: v_mov_b32_e32 v1, s1 2844; GFX9-NEXT: v_mov_b32_e32 v2, s2 2845; GFX9-NEXT: v_mov_b32_e32 v3, s3 2846; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 2847; GFX9-NEXT: s_endpgm 2848; 2849; EG-LABEL: v8i16_arg: 2850; EG: ; %bb.0: ; %entry 2851; EG-NEXT: ALU 1, @36, KC0[], KC1[] 2852; EG-NEXT: TEX 0 @20 2853; EG-NEXT: ALU 5, @38, KC0[], KC1[] 2854; EG-NEXT: TEX 0 @22 2855; EG-NEXT: ALU 5, @44, KC0[], KC1[] 2856; EG-NEXT: TEX 0 @24
2857; EG-NEXT: ALU 5, @50, KC0[], KC1[] 2858; EG-NEXT: TEX 0 @26 2859; EG-NEXT: ALU 5, @56, KC0[], KC1[] 2860; EG-NEXT: TEX 0 @28 2861; EG-NEXT: ALU 5, @62, KC0[], KC1[] 2862; EG-NEXT: TEX 0 @30 2863; EG-NEXT: ALU 5, @68, KC0[], KC1[] 2864; EG-NEXT: TEX 0 @32 2865; EG-NEXT: ALU 5, @74, KC0[], KC1[] 2866; EG-NEXT: TEX 0 @34 2867; EG-NEXT: ALU 8, @80, KC0[CB0:0-32], KC1[] 2868; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1 2869; EG-NEXT: CF_END 2870; EG-NEXT: PAD 2871; EG-NEXT: Fetch clause starting at 20: 2872; EG-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3 2873; EG-NEXT: Fetch clause starting at 22: 2874; EG-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3 2875; EG-NEXT: Fetch clause starting at 24: 2876; EG-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3 2877; EG-NEXT: Fetch clause starting at 26: 2878; EG-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3 2879; EG-NEXT: Fetch clause starting at 28: 2880; EG-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3 2881; EG-NEXT: Fetch clause starting at 30: 2882; EG-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3 2883; EG-NEXT: Fetch clause starting at 32: 2884; EG-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3 2885; EG-NEXT: Fetch clause starting at 34: 2886; EG-NEXT: VTX_READ_16 T7.X, T7.X, 52, #3 2887; EG-NEXT: ALU clause starting at 36: 2888; EG-NEXT: MOV * T0.Y, T3.X, 2889; EG-NEXT: MOV * T7.X, 0.0, 2890; EG-NEXT: ALU clause starting at 38: 2891; EG-NEXT: LSHL T0.W, T8.X, literal.x, 2892; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 2893; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 2894; EG-NEXT: OR_INT * T0.W, PS, PV.W, 2895; EG-NEXT: MOV T3.X, PV.W, 2896; EG-NEXT: MOV * T0.Y, T5.X, 2897; EG-NEXT: ALU clause starting at 44: 2898; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2899; EG-NEXT: LSHL * T1.W, T8.X, literal.y, 2900; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 2901; EG-NEXT: OR_INT * T0.W, PV.W, PS, 2902; EG-NEXT: MOV T5.X, PV.W, 2903; EG-NEXT: MOV * T0.Y, T3.X, 2904; EG-NEXT: ALU clause starting at 50: 2905; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2906; EG-NEXT: AND_INT * T1.W,
T8.X, literal.y, 2907; EG-NEXT: -65536(nan), 65535(9.183409e-41) 2908; EG-NEXT: OR_INT * T0.W, PV.W, PS, 2909; EG-NEXT: MOV T3.X, PV.W, 2910; EG-NEXT: MOV * T0.Y, T5.X, 2911; EG-NEXT: ALU clause starting at 56: 2912; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2913; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, 2914; EG-NEXT: -65536(nan), 65535(9.183409e-41) 2915; EG-NEXT: OR_INT * T0.W, PV.W, PS, 2916; EG-NEXT: MOV T5.X, PV.W, 2917; EG-NEXT: MOV * T0.Y, T2.X, 2918; EG-NEXT: ALU clause starting at 62: 2919; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2920; EG-NEXT: LSHL * T1.W, T8.X, literal.y, 2921; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 2922; EG-NEXT: OR_INT * T0.W, PV.W, PS, 2923; EG-NEXT: MOV T2.X, PV.W, 2924; EG-NEXT: MOV * T0.Y, T4.X, 2925; EG-NEXT: ALU clause starting at 68: 2926; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2927; EG-NEXT: LSHL * T1.W, T8.X, literal.y, 2928; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 2929; EG-NEXT: OR_INT * T0.W, PV.W, PS, 2930; EG-NEXT: MOV T4.X, PV.W, 2931; EG-NEXT: MOV * T0.Y, T2.X, 2932; EG-NEXT: ALU clause starting at 74: 2933; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2934; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, 2935; EG-NEXT: -65536(nan), 65535(9.183409e-41) 2936; EG-NEXT: OR_INT * T7.Z, PV.W, PS, 2937; EG-NEXT: MOV T2.X, PV.Z, 2938; EG-NEXT: MOV * T0.Y, T4.X, 2939; EG-NEXT: ALU clause starting at 80: 2940; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x, 2941; EG-NEXT: AND_INT T0.W, T0.Y, literal.y, 2942; EG-NEXT: AND_INT * T1.W, T7.X, literal.z, 2943; EG-NEXT: 2(2.802597e-45), -65536(nan) 2944; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 2945; EG-NEXT: OR_INT * T7.X, PV.W, PS, 2946; EG-NEXT: MOV T4.X, PV.X, 2947; EG-NEXT: MOV * T7.W, T3.X, 2948; EG-NEXT: MOV * T7.Y, T5.X, 2949; 2950; CM-LABEL: v8i16_arg: 2951; CM: ; %bb.0: ; %entry 2952; CM-NEXT: ALU 1, @36, KC0[], KC1[] 2953; CM-NEXT: TEX 0 @20 2954; CM-NEXT: ALU 5, @38, KC0[], KC1[] 2955; CM-NEXT: TEX 0 @22 2956; CM-NEXT: ALU 5, @44, KC0[], KC1[] 2957; CM-NEXT: TEX 0 @24
2958; CM-NEXT: ALU 5, @50, KC0[], KC1[] 2959; CM-NEXT: TEX 0 @26 2960; CM-NEXT: ALU 5, @56, KC0[], KC1[] 2961; CM-NEXT: TEX 0 @28 2962; CM-NEXT: ALU 5, @62, KC0[], KC1[] 2963; CM-NEXT: TEX 0 @30 2964; CM-NEXT: ALU 5, @68, KC0[], KC1[] 2965; CM-NEXT: TEX 0 @32 2966; CM-NEXT: ALU 5, @74, KC0[], KC1[] 2967; CM-NEXT: TEX 0 @34 2968; CM-NEXT: ALU 8, @80, KC0[CB0:0-32], KC1[] 2969; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X 2970; CM-NEXT: CF_END 2971; CM-NEXT: PAD 2972; CM-NEXT: Fetch clause starting at 20: 2973; CM-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3 2974; CM-NEXT: Fetch clause starting at 22: 2975; CM-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3 2976; CM-NEXT: Fetch clause starting at 24: 2977; CM-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3 2978; CM-NEXT: Fetch clause starting at 26: 2979; CM-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3 2980; CM-NEXT: Fetch clause starting at 28: 2981; CM-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3 2982; CM-NEXT: Fetch clause starting at 30: 2983; CM-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3 2984; CM-NEXT: Fetch clause starting at 32: 2985; CM-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3 2986; CM-NEXT: Fetch clause starting at 34: 2987; CM-NEXT: VTX_READ_16 T7.X, T7.X, 52, #3 2988; CM-NEXT: ALU clause starting at 36: 2989; CM-NEXT: MOV * T0.Y, T3.X, 2990; CM-NEXT: MOV * T7.X, 0.0, 2991; CM-NEXT: ALU clause starting at 38: 2992; CM-NEXT: LSHL T0.Z, T8.X, literal.x, 2993; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, 2994; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 2995; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, 2996; CM-NEXT: MOV T3.X, PV.W, 2997; CM-NEXT: MOV * T0.Y, T5.X, 2998; CM-NEXT: ALU clause starting at 44: 2999; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3000; CM-NEXT: LSHL * T0.W, T8.X, literal.y, 3001; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3002; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3003; CM-NEXT: MOV T5.X, PV.W, 3004; CM-NEXT: MOV * T0.Y, T3.X, 3005; CM-NEXT: ALU clause starting at 50: 3006; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3007; CM-NEXT: AND_INT * T0.W,
; Documentation for @v8i32_arg, whose definition starts near the end of the
; following line (after the tail of the preceding function's checks).
; The kernel receives an <8 x i32> vector as a kernel argument and stores it,
; unmodified, through the addrspace(1) pointer %out (the store is align 4).
; Expected output, grouped by check prefix:
;   - SI / VI / GFX9 checks: one s_load_dwordx8 reads the vector, then two
;     dwordx4 stores write it, the store to byte offset 16 being issued first.
;   - EG / CM checks: no fetch clause is used; the elements are copied from
;     KC0[...] operands with MOV and written by two MEM_RAT_CACHELESS stores.
; The file header states these assertions are autogenerated by
; utils/update_llc_test_checks.py; regenerate them rather than hand-editing.
T8.X, literal.y, 3008; CM-NEXT: -65536(nan), 65535(9.183409e-41) 3009; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3010; CM-NEXT: MOV T3.X, PV.W, 3011; CM-NEXT: MOV * T0.Y, T5.X, 3012; CM-NEXT: ALU clause starting at 56: 3013; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3014; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, 3015; CM-NEXT: -65536(nan), 65535(9.183409e-41) 3016; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3017; CM-NEXT: MOV T5.X, PV.W, 3018; CM-NEXT: MOV * T0.Y, T2.X, 3019; CM-NEXT: ALU clause starting at 62: 3020; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3021; CM-NEXT: LSHL * T0.W, T8.X, literal.y, 3022; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3023; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3024; CM-NEXT: MOV T2.X, PV.W, 3025; CM-NEXT: MOV * T0.Y, T4.X, 3026; CM-NEXT: ALU clause starting at 68: 3027; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3028; CM-NEXT: LSHL * T0.W, T8.X, literal.y, 3029; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3030; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3031; CM-NEXT: MOV T4.X, PV.W, 3032; CM-NEXT: MOV * T0.Y, T2.X, 3033; CM-NEXT: ALU clause starting at 74: 3034; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3035; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, 3036; CM-NEXT: -65536(nan), 65535(9.183409e-41) 3037; CM-NEXT: OR_INT * T7.Z, PV.Z, PV.W, 3038; CM-NEXT: MOV T2.X, PV.Z, 3039; CM-NEXT: MOV * T0.Y, T4.X, 3040; CM-NEXT: ALU clause starting at 80: 3041; CM-NEXT: LSHR T8.X, KC0[2].Y, literal.x, 3042; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y, 3043; CM-NEXT: AND_INT * T0.W, T7.X, literal.z, 3044; CM-NEXT: 2(2.802597e-45), -65536(nan) 3045; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 3046; CM-NEXT: OR_INT * T7.X, PV.Z, PV.W, 3047; CM-NEXT: MOV T4.X, PV.X, 3048; CM-NEXT: MOV * T7.W, T3.X, 3049; CM-NEXT: MOV * T7.Y, T5.X, 3050entry: 3051 store <8 x i16> %in, <8 x i16> addrspace(1)* %out 3052 ret void 3053} 3054 3055define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind { 3056; SI-LABEL: v8i32_arg: 3057; SI: ; %bb.0: ;
%entry 3058; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 3059; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3060; SI-NEXT: s_mov_b32 s3, 0xf000 3061; SI-NEXT: s_mov_b32 s2, -1 3062; SI-NEXT: s_waitcnt lgkmcnt(0) 3063; SI-NEXT: v_mov_b32_e32 v0, s8 3064; SI-NEXT: v_mov_b32_e32 v1, s9 3065; SI-NEXT: v_mov_b32_e32 v2, s10 3066; SI-NEXT: v_mov_b32_e32 v3, s11 3067; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 3068; SI-NEXT: s_waitcnt expcnt(0) 3069; SI-NEXT: v_mov_b32_e32 v0, s4 3070; SI-NEXT: v_mov_b32_e32 v1, s5 3071; SI-NEXT: v_mov_b32_e32 v2, s6 3072; SI-NEXT: v_mov_b32_e32 v3, s7 3073; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 3074; SI-NEXT: s_endpgm 3075; 3076; VI-LABEL: v8i32_arg: 3077; VI: ; %bb.0: ; %entry 3078; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 3079; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3080; VI-NEXT: s_waitcnt lgkmcnt(0) 3081; VI-NEXT: v_mov_b32_e32 v0, s8 3082; VI-NEXT: s_add_u32 s2, s0, 16 3083; VI-NEXT: s_addc_u32 s3, s1, 0 3084; VI-NEXT: v_mov_b32_e32 v5, s3 3085; VI-NEXT: v_mov_b32_e32 v1, s9 3086; VI-NEXT: v_mov_b32_e32 v2, s10 3087; VI-NEXT: v_mov_b32_e32 v3, s11 3088; VI-NEXT: v_mov_b32_e32 v4, s2 3089; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3090; VI-NEXT: v_mov_b32_e32 v5, s1 3091; VI-NEXT: v_mov_b32_e32 v0, s4 3092; VI-NEXT: v_mov_b32_e32 v1, s5 3093; VI-NEXT: v_mov_b32_e32 v2, s6 3094; VI-NEXT: v_mov_b32_e32 v3, s7 3095; VI-NEXT: v_mov_b32_e32 v4, s0 3096; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3097; VI-NEXT: s_endpgm 3098; 3099; GFX9-LABEL: v8i32_arg: 3100; GFX9: ; %bb.0: ; %entry 3101; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 3102; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3103; GFX9-NEXT: v_mov_b32_e32 v4, 0 3104; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3105; GFX9-NEXT: v_mov_b32_e32 v0, s12 3106; GFX9-NEXT: v_mov_b32_e32 v1, s13 3107; GFX9-NEXT: v_mov_b32_e32 v2, s14 3108; GFX9-NEXT: v_mov_b32_e32 v3, s15 3109; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 3110;
GFX9-NEXT: s_nop 0 3111; GFX9-NEXT: v_mov_b32_e32 v0, s8 3112; GFX9-NEXT: v_mov_b32_e32 v1, s9 3113; GFX9-NEXT: v_mov_b32_e32 v2, s10 3114; GFX9-NEXT: v_mov_b32_e32 v3, s11 3115; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 3116; GFX9-NEXT: s_endpgm 3117; 3118; EG-LABEL: v8i32_arg: 3119; EG: ; %bb.0: ; %entry 3120; EG-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] 3121; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0 3122; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 3123; EG-NEXT: CF_END 3124; EG-NEXT: ALU clause starting at 4: 3125; EG-NEXT: MOV * T0.W, KC0[5].X, 3126; EG-NEXT: MOV * T0.Z, KC0[4].W, 3127; EG-NEXT: MOV T0.Y, KC0[4].Z, 3128; EG-NEXT: MOV * T1.W, KC0[6].X, 3129; EG-NEXT: MOV T0.X, KC0[4].Y, 3130; EG-NEXT: MOV * T1.Z, KC0[5].W, 3131; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 3132; EG-NEXT: MOV * T1.Y, KC0[5].Z, 3133; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3134; EG-NEXT: MOV T1.X, KC0[5].Y, 3135; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 3136; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3137; EG-NEXT: LSHR * T3.X, PV.W, literal.x, 3138; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3139; 3140; CM-LABEL: v8i32_arg: 3141; CM: ; %bb.0: ; %entry 3142; CM-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] 3143; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X 3144; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T2.X 3145; CM-NEXT: CF_END 3146; CM-NEXT: ALU clause starting at 4: 3147; CM-NEXT: MOV * T0.W, KC0[6].X, 3148; CM-NEXT: MOV * T0.Z, KC0[5].W, 3149; CM-NEXT: MOV * T0.Y, KC0[5].Z, 3150; CM-NEXT: MOV T0.X, KC0[5].Y, 3151; CM-NEXT: MOV * T1.W, KC0[5].X, 3152; CM-NEXT: MOV T1.Z, KC0[4].W, 3153; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 3154; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3155; CM-NEXT: LSHR T2.X, PV.W, literal.x, 3156; CM-NEXT: MOV * T1.Y, KC0[4].Z, 3157; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3158; CM-NEXT: MOV * T1.X, KC0[4].Y, 3159; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 3160; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3161entry:
; Documentation for @v8f32_arg, whose definition starts on the following line
; (after the closing lines of the preceding function).
; The kernel receives an <8 x float> vector as a kernel argument and stores
; it, unmodified, through the addrspace(1) pointer %out (the store is align 4).
; Expected output, grouped by check prefix:
;   - SI / VI / GFX9 checks: one s_load_dwordx8 reads the vector, then two
;     dwordx4 stores write it, the store to byte offset 16 being issued first.
;   - EG / CM checks: no fetch clause is used; the elements are copied from
;     KC0[...] operands with MOV and written by two MEM_RAT_CACHELESS stores.
; The file header states these assertions are autogenerated by
; utils/update_llc_test_checks.py; regenerate them rather than hand-editing.
3162 store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4 3163 ret void 3164} 3165 3166define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind { 3167; SI-LABEL: v8f32_arg: 3168; SI: ; %bb.0: ; %entry 3169; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 3170; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3171; SI-NEXT: s_mov_b32 s3, 0xf000 3172; SI-NEXT: s_mov_b32 s2, -1 3173; SI-NEXT: s_waitcnt lgkmcnt(0) 3174; SI-NEXT: v_mov_b32_e32 v0, s8 3175; SI-NEXT: v_mov_b32_e32 v1, s9 3176; SI-NEXT: v_mov_b32_e32 v2, s10 3177; SI-NEXT: v_mov_b32_e32 v3, s11 3178; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 3179; SI-NEXT: s_waitcnt expcnt(0) 3180; SI-NEXT: v_mov_b32_e32 v0, s4 3181; SI-NEXT: v_mov_b32_e32 v1, s5 3182; SI-NEXT: v_mov_b32_e32 v2, s6 3183; SI-NEXT: v_mov_b32_e32 v3, s7 3184; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 3185; SI-NEXT: s_endpgm 3186; 3187; VI-LABEL: v8f32_arg: 3188; VI: ; %bb.0: ; %entry 3189; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 3190; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3191; VI-NEXT: s_waitcnt lgkmcnt(0) 3192; VI-NEXT: v_mov_b32_e32 v0, s8 3193; VI-NEXT: s_add_u32 s2, s0, 16 3194; VI-NEXT: s_addc_u32 s3, s1, 0 3195; VI-NEXT: v_mov_b32_e32 v5, s3 3196; VI-NEXT: v_mov_b32_e32 v1, s9 3197; VI-NEXT: v_mov_b32_e32 v2, s10 3198; VI-NEXT: v_mov_b32_e32 v3, s11 3199; VI-NEXT: v_mov_b32_e32 v4, s2 3200; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3201; VI-NEXT: v_mov_b32_e32 v5, s1 3202; VI-NEXT: v_mov_b32_e32 v0, s4 3203; VI-NEXT: v_mov_b32_e32 v1, s5 3204; VI-NEXT: v_mov_b32_e32 v2, s6 3205; VI-NEXT: v_mov_b32_e32 v3, s7 3206; VI-NEXT: v_mov_b32_e32 v4, s0 3207; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3208; VI-NEXT: s_endpgm 3209; 3210; GFX9-LABEL: v8f32_arg: 3211; GFX9: ; %bb.0: ; %entry 3212; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 3213; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3214; GFX9-NEXT: v_mov_b32_e32 v4, 0 3215; GFX9-NEXT:
s_waitcnt lgkmcnt(0) 3216; GFX9-NEXT: v_mov_b32_e32 v0, s12 3217; GFX9-NEXT: v_mov_b32_e32 v1, s13 3218; GFX9-NEXT: v_mov_b32_e32 v2, s14 3219; GFX9-NEXT: v_mov_b32_e32 v3, s15 3220; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 3221; GFX9-NEXT: s_nop 0 3222; GFX9-NEXT: v_mov_b32_e32 v0, s8 3223; GFX9-NEXT: v_mov_b32_e32 v1, s9 3224; GFX9-NEXT: v_mov_b32_e32 v2, s10 3225; GFX9-NEXT: v_mov_b32_e32 v3, s11 3226; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 3227; GFX9-NEXT: s_endpgm 3228; 3229; EG-LABEL: v8f32_arg: 3230; EG: ; %bb.0: ; %entry 3231; EG-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] 3232; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0 3233; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 3234; EG-NEXT: CF_END 3235; EG-NEXT: ALU clause starting at 4: 3236; EG-NEXT: MOV * T0.W, KC0[5].X, 3237; EG-NEXT: MOV * T0.Z, KC0[4].W, 3238; EG-NEXT: MOV T0.Y, KC0[4].Z, 3239; EG-NEXT: MOV * T1.W, KC0[6].X, 3240; EG-NEXT: MOV T0.X, KC0[4].Y, 3241; EG-NEXT: MOV * T1.Z, KC0[5].W, 3242; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 3243; EG-NEXT: MOV * T1.Y, KC0[5].Z, 3244; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3245; EG-NEXT: MOV T1.X, KC0[5].Y, 3246; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 3247; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3248; EG-NEXT: LSHR * T3.X, PV.W, literal.x, 3249; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3250; 3251; CM-LABEL: v8f32_arg: 3252; CM: ; %bb.0: ; %entry 3253; CM-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] 3254; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X 3255; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T2.X 3256; CM-NEXT: CF_END 3257; CM-NEXT: ALU clause starting at 4: 3258; CM-NEXT: MOV * T0.W, KC0[6].X, 3259; CM-NEXT: MOV * T0.Z, KC0[5].W, 3260; CM-NEXT: MOV * T0.Y, KC0[5].Z, 3261; CM-NEXT: MOV T0.X, KC0[5].Y, 3262; CM-NEXT: MOV * T1.W, KC0[5].X, 3263; CM-NEXT: MOV T1.Z, KC0[4].W, 3264; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 3265; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3266; CM-NEXT: LSHR
T2.X, PV.W, literal.x, 3267; CM-NEXT: MOV * T1.Y, KC0[4].Z, 3268; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3269; CM-NEXT: MOV * T1.X, KC0[4].Y, 3270; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 3271; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3272entry: 3273 store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4 3274 ret void 3275} 3276 3277; FIXME: Pack/repack on VI 3278define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) { 3279; SI-LABEL: v16i8_arg: 3280; SI: ; %bb.0: ; %entry 3281; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 3282; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3283; SI-NEXT: s_mov_b32 s3, 0xf000 3284; SI-NEXT: s_mov_b32 s2, -1 3285; SI-NEXT: s_waitcnt lgkmcnt(0) 3286; SI-NEXT: v_mov_b32_e32 v0, s4 3287; SI-NEXT: v_mov_b32_e32 v1, s5 3288; SI-NEXT: v_mov_b32_e32 v2, s6 3289; SI-NEXT: v_mov_b32_e32 v3, s7 3290; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 3291; SI-NEXT: s_endpgm 3292; 3293; VI-LABEL: v16i8_arg: 3294; VI: ; %bb.0: ; %entry 3295; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 3296; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 3297; VI-NEXT: s_waitcnt lgkmcnt(0) 3298; VI-NEXT: v_mov_b32_e32 v4, s4 3299; VI-NEXT: v_mov_b32_e32 v0, s0 3300; VI-NEXT: v_mov_b32_e32 v5, s5 3301; VI-NEXT: v_mov_b32_e32 v1, s1 3302; VI-NEXT: v_mov_b32_e32 v2, s2 3303; VI-NEXT: v_mov_b32_e32 v3, s3 3304; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3305; VI-NEXT: s_endpgm 3306; 3307; GFX9-LABEL: v16i8_arg: 3308; GFX9: ; %bb.0: ; %entry 3309; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 3310; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 3311; GFX9-NEXT: v_mov_b32_e32 v4, 0 3312; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3313; GFX9-NEXT: v_mov_b32_e32 v0, s0 3314; GFX9-NEXT: v_mov_b32_e32 v1, s1 3315; GFX9-NEXT: v_mov_b32_e32 v2, s2 3316; GFX9-NEXT: v_mov_b32_e32 v3, s3 3317; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 3318; GFX9-NEXT: s_endpgm 3319; 3320; EG-LABEL: v16i8_arg: 3321; EG: ; %bb.0: ; %entry 3322; EG-NEXT: ALU
1, @68, KC0[], KC1[] 3323; EG-NEXT: TEX 0 @36 3324; EG-NEXT: ALU 5, @70, KC0[], KC1[] 3325; EG-NEXT: TEX 0 @38 3326; EG-NEXT: ALU 5, @76, KC0[], KC1[] 3327; EG-NEXT: TEX 0 @40 3328; EG-NEXT: ALU 5, @82, KC0[], KC1[] 3329; EG-NEXT: TEX 0 @42 3330; EG-NEXT: ALU 5, @88, KC0[], KC1[] 3331; EG-NEXT: TEX 0 @44 3332; EG-NEXT: ALU 7, @94, KC0[], KC1[] 3333; EG-NEXT: TEX 0 @46 3334; EG-NEXT: ALU 7, @102, KC0[], KC1[] 3335; EG-NEXT: TEX 0 @48 3336; EG-NEXT: ALU 7, @110, KC0[], KC1[] 3337; EG-NEXT: TEX 0 @50 3338; EG-NEXT: ALU 7, @118, KC0[], KC1[] 3339; EG-NEXT: TEX 0 @52 3340; EG-NEXT: ALU 7, @126, KC0[], KC1[] 3341; EG-NEXT: TEX 0 @54 3342; EG-NEXT: ALU 7, @134, KC0[], KC1[] 3343; EG-NEXT: TEX 0 @56 3344; EG-NEXT: ALU 7, @142, KC0[], KC1[] 3345; EG-NEXT: TEX 0 @58 3346; EG-NEXT: ALU 7, @150, KC0[], KC1[] 3347; EG-NEXT: TEX 0 @60 3348; EG-NEXT: ALU 5, @158, KC0[], KC1[] 3349; EG-NEXT: TEX 0 @62 3350; EG-NEXT: ALU 5, @164, KC0[], KC1[] 3351; EG-NEXT: TEX 0 @64 3352; EG-NEXT: ALU 5, @170, KC0[], KC1[] 3353; EG-NEXT: TEX 0 @66 3354; EG-NEXT: ALU 5, @176, KC0[CB0:0-32], KC1[] 3355; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1 3356; EG-NEXT: CF_END 3357; EG-NEXT: PAD 3358; EG-NEXT: Fetch clause starting at 36: 3359; EG-NEXT: VTX_READ_8 T8.X, T7.X, 67, #3 3360; EG-NEXT: Fetch clause starting at 38: 3361; EG-NEXT: VTX_READ_8 T8.X, T7.X, 63, #3 3362; EG-NEXT: Fetch clause starting at 40: 3363; EG-NEXT: VTX_READ_8 T8.X, T7.X, 59, #3 3364; EG-NEXT: Fetch clause starting at 42: 3365; EG-NEXT: VTX_READ_8 T8.X, T7.X, 55, #3 3366; EG-NEXT: Fetch clause starting at 44: 3367; EG-NEXT: VTX_READ_8 T8.X, T7.X, 66, #3 3368; EG-NEXT: Fetch clause starting at 46: 3369; EG-NEXT: VTX_READ_8 T8.X, T7.X, 62, #3 3370; EG-NEXT: Fetch clause starting at 48: 3371; EG-NEXT: VTX_READ_8 T8.X, T7.X, 58, #3 3372; EG-NEXT: Fetch clause starting at 50: 3373; EG-NEXT: VTX_READ_8 T8.X, T7.X, 54, #3 3374; EG-NEXT: Fetch clause starting at 52: 3375; EG-NEXT: VTX_READ_8 T8.X, T7.X, 65, #3 3376; EG-NEXT: 
Fetch clause starting at 54: 3377; EG-NEXT: VTX_READ_8 T8.X, T7.X, 61, #3 3378; EG-NEXT: Fetch clause starting at 56: 3379; EG-NEXT: VTX_READ_8 T8.X, T7.X, 57, #3 3380; EG-NEXT: Fetch clause starting at 58: 3381; EG-NEXT: VTX_READ_8 T8.X, T7.X, 53, #3 3382; EG-NEXT: Fetch clause starting at 60: 3383; EG-NEXT: VTX_READ_8 T8.X, T7.X, 64, #3 3384; EG-NEXT: Fetch clause starting at 62: 3385; EG-NEXT: VTX_READ_8 T8.X, T7.X, 60, #3 3386; EG-NEXT: Fetch clause starting at 64: 3387; EG-NEXT: VTX_READ_8 T8.X, T7.X, 56, #3 3388; EG-NEXT: Fetch clause starting at 66: 3389; EG-NEXT: VTX_READ_8 T7.X, T7.X, 52, #3 3390; EG-NEXT: ALU clause starting at 68: 3391; EG-NEXT: MOV * T0.Y, T2.X, 3392; EG-NEXT: MOV * T7.X, 0.0, 3393; EG-NEXT: ALU clause starting at 70: 3394; EG-NEXT: LSHL T0.W, T8.X, literal.x, 3395; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3396; EG-NEXT: 24(3.363116e-44), 16777215(2.350989e-38) 3397; EG-NEXT: OR_INT * T0.W, PS, PV.W, 3398; EG-NEXT: MOV T2.X, PV.W, 3399; EG-NEXT: MOV * T0.Y, T3.X, 3400; EG-NEXT: ALU clause starting at 76: 3401; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3402; EG-NEXT: LSHL * T1.W, T8.X, literal.y, 3403; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 3404; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3405; EG-NEXT: MOV T3.X, PV.W, 3406; EG-NEXT: MOV * T0.Y, T4.X, 3407; EG-NEXT: ALU clause starting at 82: 3408; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3409; EG-NEXT: LSHL * T1.W, T8.X, literal.y, 3410; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 3411; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3412; EG-NEXT: MOV T4.X, PV.W, 3413; EG-NEXT: MOV * T0.Y, T5.X, 3414; EG-NEXT: ALU clause starting at 88: 3415; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3416; EG-NEXT: LSHL * T1.W, T8.X, literal.y, 3417; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 3418; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3419; EG-NEXT: MOV T5.X, PV.W, 3420; EG-NEXT: MOV * T0.Y, T2.X, 3421; EG-NEXT: ALU clause starting at 94: 3422; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3423; EG-NEXT: AND_INT * 
T1.W, T0.Y, literal.y, 3424; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) 3425; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3426; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3427; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3428; EG-NEXT: MOV T2.X, PV.W, 3429; EG-NEXT: MOV * T0.Y, T3.X, 3430; EG-NEXT: ALU clause starting at 102: 3431; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3432; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3433; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) 3434; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3435; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3436; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3437; EG-NEXT: MOV T3.X, PV.W, 3438; EG-NEXT: MOV * T0.Y, T4.X, 3439; EG-NEXT: ALU clause starting at 110: 3440; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3441; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3442; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) 3443; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3444; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3445; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3446; EG-NEXT: MOV T4.X, PV.W, 3447; EG-NEXT: MOV * T0.Y, T5.X, 3448; EG-NEXT: ALU clause starting at 118: 3449; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3450; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3451; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) 3452; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3453; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3454; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3455; EG-NEXT: MOV T5.X, PV.W, 3456; EG-NEXT: MOV * T0.Y, T2.X, 3457; EG-NEXT: ALU clause starting at 126: 3458; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3459; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3460; EG-NEXT: 255(3.573311e-43), -65281(nan) 3461; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3462; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 3463; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3464; EG-NEXT: MOV T2.X, PV.W, 3465; EG-NEXT: MOV * T0.Y, T3.X, 3466; EG-NEXT: ALU clause starting at 134: 3467; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3468; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3469; 
EG-NEXT: 255(3.573311e-43), -65281(nan) 3470; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3471; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 3472; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3473; EG-NEXT: MOV T3.X, PV.W, 3474; EG-NEXT: MOV * T0.Y, T4.X, 3475; EG-NEXT: ALU clause starting at 142: 3476; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3477; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3478; EG-NEXT: 255(3.573311e-43), -65281(nan) 3479; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3480; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 3481; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3482; EG-NEXT: MOV T4.X, PV.W, 3483; EG-NEXT: MOV * T0.Y, T5.X, 3484; EG-NEXT: ALU clause starting at 150: 3485; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3486; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3487; EG-NEXT: 255(3.573311e-43), -65281(nan) 3488; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3489; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 3490; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3491; EG-NEXT: MOV T5.X, PV.W, 3492; EG-NEXT: MOV * T0.Y, T2.X, 3493; EG-NEXT: ALU clause starting at 158: 3494; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3495; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, 3496; EG-NEXT: -256(nan), 255(3.573311e-43) 3497; EG-NEXT: OR_INT * T7.W, PV.W, PS, 3498; EG-NEXT: MOV T2.X, PV.W, 3499; EG-NEXT: MOV * T0.Y, T3.X, 3500; EG-NEXT: ALU clause starting at 164: 3501; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3502; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, 3503; EG-NEXT: -256(nan), 255(3.573311e-43) 3504; EG-NEXT: OR_INT * T7.Z, PV.W, PS, 3505; EG-NEXT: MOV T3.X, PV.Z, 3506; EG-NEXT: MOV * T0.Y, T4.X, 3507; EG-NEXT: ALU clause starting at 170: 3508; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3509; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, 3510; EG-NEXT: -256(nan), 255(3.573311e-43) 3511; EG-NEXT: OR_INT * T7.Y, PV.W, PS, 3512; EG-NEXT: MOV T4.X, PV.Y, 3513; EG-NEXT: MOV * T0.Y, T5.X, 3514; EG-NEXT: ALU clause starting at 176: 3515; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3516; EG-NEXT: AND_INT * T1.W, T7.X, literal.y, 
3517; EG-NEXT: -256(nan), 255(3.573311e-43) 3518; EG-NEXT: OR_INT T7.X, PV.W, PS, 3519; EG-NEXT: LSHR * T8.X, KC0[2].Y, literal.x, 3520; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3521; 3522; CM-LABEL: v16i8_arg: 3523; CM: ; %bb.0: ; %entry 3524; CM-NEXT: ALU 1, @68, KC0[], KC1[] 3525; CM-NEXT: TEX 0 @36 3526; CM-NEXT: ALU 5, @70, KC0[], KC1[] 3527; CM-NEXT: TEX 0 @38 3528; CM-NEXT: ALU 5, @76, KC0[], KC1[] 3529; CM-NEXT: TEX 0 @40 3530; CM-NEXT: ALU 5, @82, KC0[], KC1[] 3531; CM-NEXT: TEX 0 @42 3532; CM-NEXT: ALU 5, @88, KC0[], KC1[] 3533; CM-NEXT: TEX 0 @44 3534; CM-NEXT: ALU 7, @94, KC0[], KC1[] 3535; CM-NEXT: TEX 0 @46 3536; CM-NEXT: ALU 7, @102, KC0[], KC1[] 3537; CM-NEXT: TEX 0 @48 3538; CM-NEXT: ALU 7, @110, KC0[], KC1[] 3539; CM-NEXT: TEX 0 @50 3540; CM-NEXT: ALU 7, @118, KC0[], KC1[] 3541; CM-NEXT: TEX 0 @52 3542; CM-NEXT: ALU 7, @126, KC0[], KC1[] 3543; CM-NEXT: TEX 0 @54 3544; CM-NEXT: ALU 7, @134, KC0[], KC1[] 3545; CM-NEXT: TEX 0 @56 3546; CM-NEXT: ALU 7, @142, KC0[], KC1[] 3547; CM-NEXT: TEX 0 @58 3548; CM-NEXT: ALU 7, @150, KC0[], KC1[] 3549; CM-NEXT: TEX 0 @60 3550; CM-NEXT: ALU 5, @158, KC0[], KC1[] 3551; CM-NEXT: TEX 0 @62 3552; CM-NEXT: ALU 5, @164, KC0[], KC1[] 3553; CM-NEXT: TEX 0 @64 3554; CM-NEXT: ALU 5, @170, KC0[], KC1[] 3555; CM-NEXT: TEX 0 @66 3556; CM-NEXT: ALU 5, @176, KC0[CB0:0-32], KC1[] 3557; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X 3558; CM-NEXT: CF_END 3559; CM-NEXT: PAD 3560; CM-NEXT: Fetch clause starting at 36: 3561; CM-NEXT: VTX_READ_8 T8.X, T7.X, 67, #3 3562; CM-NEXT: Fetch clause starting at 38: 3563; CM-NEXT: VTX_READ_8 T8.X, T7.X, 63, #3 3564; CM-NEXT: Fetch clause starting at 40: 3565; CM-NEXT: VTX_READ_8 T8.X, T7.X, 59, #3 3566; CM-NEXT: Fetch clause starting at 42: 3567; CM-NEXT: VTX_READ_8 T8.X, T7.X, 55, #3 3568; CM-NEXT: Fetch clause starting at 44: 3569; CM-NEXT: VTX_READ_8 T8.X, T7.X, 66, #3 3570; CM-NEXT: Fetch clause starting at 46: 3571; CM-NEXT: VTX_READ_8 T8.X, T7.X, 62, #3 3572; CM-NEXT: Fetch clause 
starting at 48: 3573; CM-NEXT: VTX_READ_8 T8.X, T7.X, 58, #3 3574; CM-NEXT: Fetch clause starting at 50: 3575; CM-NEXT: VTX_READ_8 T8.X, T7.X, 54, #3 3576; CM-NEXT: Fetch clause starting at 52: 3577; CM-NEXT: VTX_READ_8 T8.X, T7.X, 65, #3 3578; CM-NEXT: Fetch clause starting at 54: 3579; CM-NEXT: VTX_READ_8 T8.X, T7.X, 61, #3 3580; CM-NEXT: Fetch clause starting at 56: 3581; CM-NEXT: VTX_READ_8 T8.X, T7.X, 57, #3 3582; CM-NEXT: Fetch clause starting at 58: 3583; CM-NEXT: VTX_READ_8 T8.X, T7.X, 53, #3 3584; CM-NEXT: Fetch clause starting at 60: 3585; CM-NEXT: VTX_READ_8 T8.X, T7.X, 64, #3 3586; CM-NEXT: Fetch clause starting at 62: 3587; CM-NEXT: VTX_READ_8 T8.X, T7.X, 60, #3 3588; CM-NEXT: Fetch clause starting at 64: 3589; CM-NEXT: VTX_READ_8 T8.X, T7.X, 56, #3 3590; CM-NEXT: Fetch clause starting at 66: 3591; CM-NEXT: VTX_READ_8 T7.X, T7.X, 52, #3 3592; CM-NEXT: ALU clause starting at 68: 3593; CM-NEXT: MOV * T0.Y, T2.X, 3594; CM-NEXT: MOV * T7.X, 0.0, 3595; CM-NEXT: ALU clause starting at 70: 3596; CM-NEXT: LSHL T0.Z, T8.X, literal.x, 3597; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, 3598; CM-NEXT: 24(3.363116e-44), 16777215(2.350989e-38) 3599; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, 3600; CM-NEXT: MOV T2.X, PV.W, 3601; CM-NEXT: MOV * T0.Y, T3.X, 3602; CM-NEXT: ALU clause starting at 76: 3603; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3604; CM-NEXT: LSHL * T0.W, T8.X, literal.y, 3605; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 3606; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3607; CM-NEXT: MOV T3.X, PV.W, 3608; CM-NEXT: MOV * T0.Y, T4.X, 3609; CM-NEXT: ALU clause starting at 82: 3610; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3611; CM-NEXT: LSHL * T0.W, T8.X, literal.y, 3612; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 3613; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3614; CM-NEXT: MOV T4.X, PV.W, 3615; CM-NEXT: MOV * T0.Y, T5.X, 3616; CM-NEXT: ALU clause starting at 88: 3617; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3618; CM-NEXT: LSHL * T0.W, T8.X, literal.y, 3619; 
CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 3620; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3621; CM-NEXT: MOV T5.X, PV.W, 3622; CM-NEXT: MOV * T0.Y, T2.X, 3623; CM-NEXT: ALU clause starting at 94: 3624; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3625; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3626; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3627; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3628; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) 3629; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3630; CM-NEXT: MOV T2.X, PV.W, 3631; CM-NEXT: MOV * T0.Y, T3.X, 3632; CM-NEXT: ALU clause starting at 102: 3633; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3634; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3635; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3636; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3637; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) 3638; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3639; CM-NEXT: MOV T3.X, PV.W, 3640; CM-NEXT: MOV * T0.Y, T4.X, 3641; CM-NEXT: ALU clause starting at 110: 3642; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3643; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3644; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3645; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3646; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) 3647; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3648; CM-NEXT: MOV T4.X, PV.W, 3649; CM-NEXT: MOV * T0.Y, T5.X, 3650; CM-NEXT: ALU clause starting at 118: 3651; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3652; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3653; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3654; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3655; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) 3656; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3657; CM-NEXT: MOV T5.X, PV.W, 3658; CM-NEXT: MOV * T0.Y, T2.X, 3659; CM-NEXT: ALU clause starting at 126: 3660; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3661; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3662; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3663; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3664; CM-NEXT: -65281(nan), 
8(1.121039e-44) 3665; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3666; CM-NEXT: MOV T2.X, PV.W, 3667; CM-NEXT: MOV * T0.Y, T3.X, 3668; CM-NEXT: ALU clause starting at 134: 3669; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3670; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3671; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3672; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3673; CM-NEXT: -65281(nan), 8(1.121039e-44) 3674; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3675; CM-NEXT: MOV T3.X, PV.W, 3676; CM-NEXT: MOV * T0.Y, T4.X, 3677; CM-NEXT: ALU clause starting at 142: 3678; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3679; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3680; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3681; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3682; CM-NEXT: -65281(nan), 8(1.121039e-44) 3683; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3684; CM-NEXT: MOV T4.X, PV.W, 3685; CM-NEXT: MOV * T0.Y, T5.X, 3686; CM-NEXT: ALU clause starting at 150: 3687; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3688; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3689; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3690; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3691; CM-NEXT: -65281(nan), 8(1.121039e-44) 3692; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3693; CM-NEXT: MOV T5.X, PV.W, 3694; CM-NEXT: MOV * T0.Y, T2.X, 3695; CM-NEXT: ALU clause starting at 158: 3696; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3697; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, 3698; CM-NEXT: -256(nan), 255(3.573311e-43) 3699; CM-NEXT: OR_INT * T7.W, PV.Z, PV.W, 3700; CM-NEXT: MOV T2.X, PV.W, 3701; CM-NEXT: MOV * T0.Y, T3.X, 3702; CM-NEXT: ALU clause starting at 164: 3703; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3704; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, 3705; CM-NEXT: -256(nan), 255(3.573311e-43) 3706; CM-NEXT: OR_INT * T7.Z, PV.Z, PV.W, 3707; CM-NEXT: MOV T3.X, PV.Z, 3708; CM-NEXT: MOV * T0.Y, T4.X, 3709; CM-NEXT: ALU clause starting at 170: 3710; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3711; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, 3712; CM-NEXT: 
-256(nan), 255(3.573311e-43) 3713; CM-NEXT: OR_INT * T7.Y, PV.Z, PV.W, 3714; CM-NEXT: MOV T4.X, PV.Y, 3715; CM-NEXT: MOV * T0.Y, T5.X, 3716; CM-NEXT: ALU clause starting at 176: 3717; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3718; CM-NEXT: AND_INT * T0.W, T7.X, literal.y, 3719; CM-NEXT: -256(nan), 255(3.573311e-43) 3720; CM-NEXT: OR_INT * T7.X, PV.Z, PV.W, 3721; CM-NEXT: LSHR * T8.X, KC0[2].Y, literal.x, 3722; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3723entry: 3724 store <16 x i8> %in, <16 x i8> addrspace(1)* %out 3725 ret void 3726} 3727 3728define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) { 3729; SI-LABEL: v16i16_arg: 3730; SI: ; %bb.0: ; %entry 3731; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 3732; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3733; SI-NEXT: s_mov_b32 s3, 0xf000 3734; SI-NEXT: s_mov_b32 s2, -1 3735; SI-NEXT: s_waitcnt lgkmcnt(0) 3736; SI-NEXT: v_mov_b32_e32 v0, s8 3737; SI-NEXT: v_mov_b32_e32 v1, s9 3738; SI-NEXT: v_mov_b32_e32 v2, s10 3739; SI-NEXT: v_mov_b32_e32 v3, s11 3740; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 3741; SI-NEXT: s_waitcnt expcnt(0) 3742; SI-NEXT: v_mov_b32_e32 v0, s4 3743; SI-NEXT: v_mov_b32_e32 v1, s5 3744; SI-NEXT: v_mov_b32_e32 v2, s6 3745; SI-NEXT: v_mov_b32_e32 v3, s7 3746; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 3747; SI-NEXT: s_endpgm 3748; 3749; VI-LABEL: v16i16_arg: 3750; VI: ; %bb.0: ; %entry 3751; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 3752; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3753; VI-NEXT: s_waitcnt lgkmcnt(0) 3754; VI-NEXT: v_mov_b32_e32 v0, s8 3755; VI-NEXT: s_add_u32 s2, s0, 16 3756; VI-NEXT: s_addc_u32 s3, s1, 0 3757; VI-NEXT: v_mov_b32_e32 v5, s3 3758; VI-NEXT: v_mov_b32_e32 v1, s9 3759; VI-NEXT: v_mov_b32_e32 v2, s10 3760; VI-NEXT: v_mov_b32_e32 v3, s11 3761; VI-NEXT: v_mov_b32_e32 v4, s2 3762; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3763; VI-NEXT: v_mov_b32_e32 v5, s1 3764; VI-NEXT: v_mov_b32_e32 v0, s4 3765; 
VI-NEXT: v_mov_b32_e32 v1, s5 3766; VI-NEXT: v_mov_b32_e32 v2, s6 3767; VI-NEXT: v_mov_b32_e32 v3, s7 3768; VI-NEXT: v_mov_b32_e32 v4, s0 3769; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3770; VI-NEXT: s_endpgm 3771; 3772; GFX9-LABEL: v16i16_arg: 3773; GFX9: ; %bb.0: ; %entry 3774; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 3775; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3776; GFX9-NEXT: v_mov_b32_e32 v4, 0 3777; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3778; GFX9-NEXT: v_mov_b32_e32 v0, s12 3779; GFX9-NEXT: v_mov_b32_e32 v1, s13 3780; GFX9-NEXT: v_mov_b32_e32 v2, s14 3781; GFX9-NEXT: v_mov_b32_e32 v3, s15 3782; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 3783; GFX9-NEXT: s_nop 0 3784; GFX9-NEXT: v_mov_b32_e32 v0, s8 3785; GFX9-NEXT: v_mov_b32_e32 v1, s9 3786; GFX9-NEXT: v_mov_b32_e32 v2, s10 3787; GFX9-NEXT: v_mov_b32_e32 v3, s11 3788; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 3789; GFX9-NEXT: s_endpgm 3790; 3791; EG-LABEL: v16i16_arg: 3792; EG: ; %bb.0: ; %entry 3793; EG-NEXT: ALU 1, @68, KC0[], KC1[] 3794; EG-NEXT: TEX 0 @36 3795; EG-NEXT: ALU 5, @70, KC0[], KC1[] 3796; EG-NEXT: TEX 0 @38 3797; EG-NEXT: ALU 5, @76, KC0[], KC1[] 3798; EG-NEXT: TEX 0 @40 3799; EG-NEXT: ALU 5, @82, KC0[], KC1[] 3800; EG-NEXT: TEX 0 @42 3801; EG-NEXT: ALU 5, @88, KC0[], KC1[] 3802; EG-NEXT: TEX 0 @44 3803; EG-NEXT: ALU 5, @94, KC0[], KC1[] 3804; EG-NEXT: TEX 0 @46 3805; EG-NEXT: ALU 5, @100, KC0[], KC1[] 3806; EG-NEXT: TEX 0 @48 3807; EG-NEXT: ALU 5, @106, KC0[], KC1[] 3808; EG-NEXT: TEX 0 @50 3809; EG-NEXT: ALU 5, @112, KC0[], KC1[] 3810; EG-NEXT: TEX 0 @52 3811; EG-NEXT: ALU 5, @118, KC0[], KC1[] 3812; EG-NEXT: TEX 0 @54 3813; EG-NEXT: ALU 5, @124, KC0[], KC1[] 3814; EG-NEXT: TEX 0 @56 3815; EG-NEXT: ALU 5, @130, KC0[], KC1[] 3816; EG-NEXT: TEX 0 @58 3817; EG-NEXT: ALU 5, @136, KC0[], KC1[] 3818; EG-NEXT: TEX 0 @60 3819; EG-NEXT: ALU 5, @142, KC0[], KC1[] 3820; EG-NEXT: TEX 0 @62 3821; EG-NEXT: ALU 5, @148, KC0[], KC1[] 3822; EG-NEXT: TEX 0 @64 3823; 
EG-NEXT: ALU 5, @154, KC0[], KC1[] 3824; EG-NEXT: TEX 0 @66 3825; EG-NEXT: ALU 13, @160, KC0[CB0:0-32], KC1[] 3826; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0 3827; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 1 3828; EG-NEXT: CF_END 3829; EG-NEXT: Fetch clause starting at 36: 3830; EG-NEXT: VTX_READ_16 T12.X, T11.X, 98, #3 3831; EG-NEXT: Fetch clause starting at 38: 3832; EG-NEXT: VTX_READ_16 T12.X, T11.X, 90, #3 3833; EG-NEXT: Fetch clause starting at 40: 3834; EG-NEXT: VTX_READ_16 T12.X, T11.X, 82, #3 3835; EG-NEXT: Fetch clause starting at 42: 3836; EG-NEXT: VTX_READ_16 T12.X, T11.X, 74, #3 3837; EG-NEXT: Fetch clause starting at 44: 3838; EG-NEXT: VTX_READ_16 T12.X, T11.X, 96, #3 3839; EG-NEXT: Fetch clause starting at 46: 3840; EG-NEXT: VTX_READ_16 T12.X, T11.X, 88, #3 3841; EG-NEXT: Fetch clause starting at 48: 3842; EG-NEXT: VTX_READ_16 T12.X, T11.X, 80, #3 3843; EG-NEXT: Fetch clause starting at 50: 3844; EG-NEXT: VTX_READ_16 T12.X, T11.X, 72, #3 3845; EG-NEXT: Fetch clause starting at 52: 3846; EG-NEXT: VTX_READ_16 T12.X, T11.X, 94, #3 3847; EG-NEXT: Fetch clause starting at 54: 3848; EG-NEXT: VTX_READ_16 T12.X, T11.X, 86, #3 3849; EG-NEXT: Fetch clause starting at 56: 3850; EG-NEXT: VTX_READ_16 T12.X, T11.X, 78, #3 3851; EG-NEXT: Fetch clause starting at 58: 3852; EG-NEXT: VTX_READ_16 T12.X, T11.X, 70, #3 3853; EG-NEXT: Fetch clause starting at 60: 3854; EG-NEXT: VTX_READ_16 T12.X, T11.X, 92, #3 3855; EG-NEXT: Fetch clause starting at 62: 3856; EG-NEXT: VTX_READ_16 T12.X, T11.X, 84, #3 3857; EG-NEXT: Fetch clause starting at 64: 3858; EG-NEXT: VTX_READ_16 T13.X, T11.X, 76, #3 3859; EG-NEXT: Fetch clause starting at 66: 3860; EG-NEXT: VTX_READ_16 T11.X, T11.X, 68, #3 3861; EG-NEXT: ALU clause starting at 68: 3862; EG-NEXT: MOV * T0.Y, T3.X, 3863; EG-NEXT: MOV * T11.X, 0.0, 3864; EG-NEXT: ALU clause starting at 70: 3865; EG-NEXT: LSHL T0.W, T12.X, literal.x, 3866; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3867; EG-NEXT: 
16(2.242078e-44), 65535(9.183409e-41) 3868; EG-NEXT: OR_INT * T0.W, PS, PV.W, 3869; EG-NEXT: MOV T3.X, PV.W, 3870; EG-NEXT: MOV * T0.Y, T5.X, 3871; EG-NEXT: ALU clause starting at 76: 3872; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3873; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3874; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3875; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3876; EG-NEXT: MOV T5.X, PV.W, 3877; EG-NEXT: MOV * T0.Y, T7.X, 3878; EG-NEXT: ALU clause starting at 82: 3879; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3880; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3881; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3882; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3883; EG-NEXT: MOV T7.X, PV.W, 3884; EG-NEXT: MOV * T0.Y, T9.X, 3885; EG-NEXT: ALU clause starting at 88: 3886; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3887; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3888; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3889; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3890; EG-NEXT: MOV T9.X, PV.W, 3891; EG-NEXT: MOV * T0.Y, T3.X, 3892; EG-NEXT: ALU clause starting at 94: 3893; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3894; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, 3895; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3896; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3897; EG-NEXT: MOV T3.X, PV.W, 3898; EG-NEXT: MOV * T0.Y, T5.X, 3899; EG-NEXT: ALU clause starting at 100: 3900; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3901; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, 3902; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3903; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3904; EG-NEXT: MOV T5.X, PV.W, 3905; EG-NEXT: MOV * T0.Y, T7.X, 3906; EG-NEXT: ALU clause starting at 106: 3907; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3908; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, 3909; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3910; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3911; EG-NEXT: MOV T7.X, PV.W, 3912; EG-NEXT: MOV * T0.Y, T9.X, 3913; EG-NEXT: ALU clause starting at 112: 3914; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3915; EG-NEXT: AND_INT * 
T1.W, T12.X, literal.y, 3916; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3917; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3918; EG-NEXT: MOV T9.X, PV.W, 3919; EG-NEXT: MOV * T0.Y, T2.X, 3920; EG-NEXT: ALU clause starting at 118: 3921; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3922; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3923; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3924; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3925; EG-NEXT: MOV T2.X, PV.W, 3926; EG-NEXT: MOV * T0.Y, T4.X, 3927; EG-NEXT: ALU clause starting at 124: 3928; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3929; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3930; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3931; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3932; EG-NEXT: MOV T4.X, PV.W, 3933; EG-NEXT: MOV * T0.Y, T6.X, 3934; EG-NEXT: ALU clause starting at 130: 3935; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3936; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3937; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3938; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3939; EG-NEXT: MOV T6.X, PV.W, 3940; EG-NEXT: MOV * T0.Y, T8.X, 3941; EG-NEXT: ALU clause starting at 136: 3942; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3943; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3944; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3945; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3946; EG-NEXT: MOV T8.X, PV.W, 3947; EG-NEXT: MOV * T0.Y, T2.X, 3948; EG-NEXT: ALU clause starting at 142: 3949; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3950; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, 3951; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3952; EG-NEXT: OR_INT * T12.Z, PV.W, PS, 3953; EG-NEXT: MOV T2.X, PV.Z, 3954; EG-NEXT: MOV * T0.Y, T4.X, 3955; EG-NEXT: ALU clause starting at 148: 3956; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3957; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, 3958; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3959; EG-NEXT: OR_INT * T12.X, PV.W, PS, 3960; EG-NEXT: MOV T4.X, PV.X, 3961; EG-NEXT: MOV * T0.Y, T6.X, 3962; EG-NEXT: ALU clause starting at 154: 3963; EG-NEXT: AND_INT T0.W, 
T0.Y, literal.x, 3964; EG-NEXT: AND_INT * T1.W, T13.X, literal.y, 3965; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3966; EG-NEXT: OR_INT * T11.Z, PV.W, PS, 3967; EG-NEXT: MOV T6.X, PV.Z, 3968; EG-NEXT: MOV * T0.Y, T8.X, 3969; EG-NEXT: ALU clause starting at 160: 3970; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x, 3971; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 3972; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) 3973; EG-NEXT: LSHR T14.X, PV.W, literal.x, 3974; EG-NEXT: AND_INT T0.W, T0.Y, literal.y, 3975; EG-NEXT: AND_INT * T1.W, T11.X, literal.z, 3976; EG-NEXT: 2(2.802597e-45), -65536(nan) 3977; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 3978; EG-NEXT: OR_INT * T11.X, PV.W, PS, 3979; EG-NEXT: MOV T8.X, PV.X, 3980; EG-NEXT: MOV * T12.W, T3.X, 3981; EG-NEXT: MOV T12.Y, T5.X, 3982; EG-NEXT: MOV T11.W, T7.X, BS:VEC_120/SCL_212 3983; EG-NEXT: MOV * T11.Y, T9.X, 3984; 3985; CM-LABEL: v16i16_arg: 3986; CM: ; %bb.0: ; %entry 3987; CM-NEXT: ALU 1, @68, KC0[], KC1[] 3988; CM-NEXT: TEX 0 @36 3989; CM-NEXT: ALU 5, @70, KC0[], KC1[] 3990; CM-NEXT: TEX 0 @38 3991; CM-NEXT: ALU 5, @76, KC0[], KC1[] 3992; CM-NEXT: TEX 0 @40 3993; CM-NEXT: ALU 5, @82, KC0[], KC1[] 3994; CM-NEXT: TEX 0 @42 3995; CM-NEXT: ALU 5, @88, KC0[], KC1[] 3996; CM-NEXT: TEX 0 @44 3997; CM-NEXT: ALU 5, @94, KC0[], KC1[] 3998; CM-NEXT: TEX 0 @46 3999; CM-NEXT: ALU 5, @100, KC0[], KC1[] 4000; CM-NEXT: TEX 0 @48 4001; CM-NEXT: ALU 5, @106, KC0[], KC1[] 4002; CM-NEXT: TEX 0 @50 4003; CM-NEXT: ALU 5, @112, KC0[], KC1[] 4004; CM-NEXT: TEX 0 @52 4005; CM-NEXT: ALU 5, @118, KC0[], KC1[] 4006; CM-NEXT: TEX 0 @54 4007; CM-NEXT: ALU 5, @124, KC0[], KC1[] 4008; CM-NEXT: TEX 0 @56 4009; CM-NEXT: ALU 5, @130, KC0[], KC1[] 4010; CM-NEXT: TEX 0 @58 4011; CM-NEXT: ALU 5, @136, KC0[], KC1[] 4012; CM-NEXT: TEX 0 @60 4013; CM-NEXT: ALU 5, @142, KC0[], KC1[] 4014; CM-NEXT: TEX 0 @62 4015; CM-NEXT: ALU 5, @148, KC0[], KC1[] 4016; CM-NEXT: TEX 0 @64 4017; CM-NEXT: ALU 5, @154, KC0[], KC1[] 4018; CM-NEXT: TEX 0 @66 4019; CM-NEXT: ALU 
14, @160, KC0[CB0:0-32], KC1[] 4020; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T14.X 4021; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T13.X 4022; CM-NEXT: CF_END 4023; CM-NEXT: Fetch clause starting at 36: 4024; CM-NEXT: VTX_READ_16 T12.X, T11.X, 98, #3 4025; CM-NEXT: Fetch clause starting at 38: 4026; CM-NEXT: VTX_READ_16 T12.X, T11.X, 90, #3 4027; CM-NEXT: Fetch clause starting at 40: 4028; CM-NEXT: VTX_READ_16 T12.X, T11.X, 82, #3 4029; CM-NEXT: Fetch clause starting at 42: 4030; CM-NEXT: VTX_READ_16 T12.X, T11.X, 74, #3 4031; CM-NEXT: Fetch clause starting at 44: 4032; CM-NEXT: VTX_READ_16 T12.X, T11.X, 96, #3 4033; CM-NEXT: Fetch clause starting at 46: 4034; CM-NEXT: VTX_READ_16 T12.X, T11.X, 88, #3 4035; CM-NEXT: Fetch clause starting at 48: 4036; CM-NEXT: VTX_READ_16 T12.X, T11.X, 80, #3 4037; CM-NEXT: Fetch clause starting at 50: 4038; CM-NEXT: VTX_READ_16 T12.X, T11.X, 72, #3 4039; CM-NEXT: Fetch clause starting at 52: 4040; CM-NEXT: VTX_READ_16 T12.X, T11.X, 94, #3 4041; CM-NEXT: Fetch clause starting at 54: 4042; CM-NEXT: VTX_READ_16 T12.X, T11.X, 86, #3 4043; CM-NEXT: Fetch clause starting at 56: 4044; CM-NEXT: VTX_READ_16 T12.X, T11.X, 78, #3 4045; CM-NEXT: Fetch clause starting at 58: 4046; CM-NEXT: VTX_READ_16 T12.X, T11.X, 70, #3 4047; CM-NEXT: Fetch clause starting at 60: 4048; CM-NEXT: VTX_READ_16 T12.X, T11.X, 92, #3 4049; CM-NEXT: Fetch clause starting at 62: 4050; CM-NEXT: VTX_READ_16 T12.X, T11.X, 84, #3 4051; CM-NEXT: Fetch clause starting at 64: 4052; CM-NEXT: VTX_READ_16 T13.X, T11.X, 76, #3 4053; CM-NEXT: Fetch clause starting at 66: 4054; CM-NEXT: VTX_READ_16 T11.X, T11.X, 68, #3 4055; CM-NEXT: ALU clause starting at 68: 4056; CM-NEXT: MOV * T0.Y, T3.X, 4057; CM-NEXT: MOV * T11.X, 0.0, 4058; CM-NEXT: ALU clause starting at 70: 4059; CM-NEXT: LSHL T0.Z, T12.X, literal.x, 4060; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, 4061; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 4062; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, 4063; CM-NEXT: MOV T3.X, 
PV.W, 4064; CM-NEXT: MOV * T0.Y, T5.X, 4065; CM-NEXT: ALU clause starting at 76: 4066; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4067; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 4068; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 4069; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4070; CM-NEXT: MOV T5.X, PV.W, 4071; CM-NEXT: MOV * T0.Y, T7.X, 4072; CM-NEXT: ALU clause starting at 82: 4073; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4074; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 4075; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 4076; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4077; CM-NEXT: MOV T7.X, PV.W, 4078; CM-NEXT: MOV * T0.Y, T9.X, 4079; CM-NEXT: ALU clause starting at 88: 4080; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4081; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 4082; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 4083; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4084; CM-NEXT: MOV T9.X, PV.W, 4085; CM-NEXT: MOV * T0.Y, T3.X, 4086; CM-NEXT: ALU clause starting at 94: 4087; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4088; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, 4089; CM-NEXT: -65536(nan), 65535(9.183409e-41) 4090; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4091; CM-NEXT: MOV T3.X, PV.W, 4092; CM-NEXT: MOV * T0.Y, T5.X, 4093; CM-NEXT: ALU clause starting at 100: 4094; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4095; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, 4096; CM-NEXT: -65536(nan), 65535(9.183409e-41) 4097; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4098; CM-NEXT: MOV T5.X, PV.W, 4099; CM-NEXT: MOV * T0.Y, T7.X, 4100; CM-NEXT: ALU clause starting at 106: 4101; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4102; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, 4103; CM-NEXT: -65536(nan), 65535(9.183409e-41) 4104; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4105; CM-NEXT: MOV T7.X, PV.W, 4106; CM-NEXT: MOV * T0.Y, T9.X, 4107; CM-NEXT: ALU clause starting at 112: 4108; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4109; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, 4110; CM-NEXT: -65536(nan), 65535(9.183409e-41) 4111; CM-NEXT: 
OR_INT * T0.W, PV.Z, PV.W, 4112; CM-NEXT: MOV T9.X, PV.W, 4113; CM-NEXT: MOV * T0.Y, T2.X, 4114; CM-NEXT: ALU clause starting at 118: 4115; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4116; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 4117; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 4118; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4119; CM-NEXT: MOV T2.X, PV.W, 4120; CM-NEXT: MOV * T0.Y, T4.X, 4121; CM-NEXT: ALU clause starting at 124: 4122; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4123; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 4124; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 4125; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4126; CM-NEXT: MOV T4.X, PV.W, 4127; CM-NEXT: MOV * T0.Y, T6.X, 4128; CM-NEXT: ALU clause starting at 130: 4129; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4130; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 4131; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 4132; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4133; CM-NEXT: MOV T6.X, PV.W, 4134; CM-NEXT: MOV * T0.Y, T8.X, 4135; CM-NEXT: ALU clause starting at 136: 4136; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4137; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 4138; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 4139; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4140; CM-NEXT: MOV T8.X, PV.W, 4141; CM-NEXT: MOV * T0.Y, T2.X, 4142; CM-NEXT: ALU clause starting at 142: 4143; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4144; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, 4145; CM-NEXT: -65536(nan), 65535(9.183409e-41) 4146; CM-NEXT: OR_INT * T12.Z, PV.Z, PV.W, 4147; CM-NEXT: MOV T2.X, PV.Z, 4148; CM-NEXT: MOV * T0.Y, T4.X, 4149; CM-NEXT: ALU clause starting at 148: 4150; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4151; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, 4152; CM-NEXT: -65536(nan), 65535(9.183409e-41) 4153; CM-NEXT: OR_INT * T12.X, PV.Z, PV.W, 4154; CM-NEXT: MOV T4.X, PV.X, 4155; CM-NEXT: MOV * T0.Y, T6.X, 4156; CM-NEXT: ALU clause starting at 154: 4157; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4158; CM-NEXT: AND_INT * T0.W, T13.X, literal.y, 4159; 
CM-NEXT: -65536(nan), 65535(9.183409e-41) 4160; CM-NEXT: OR_INT * T11.Z, PV.Z, PV.W, 4161; CM-NEXT: MOV T6.X, PV.Z, 4162; CM-NEXT: MOV * T0.Y, T8.X, 4163; CM-NEXT: ALU clause starting at 160: 4164; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 4165; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 4166; CM-NEXT: LSHR * T13.X, PV.W, literal.x, 4167; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4168; CM-NEXT: LSHR T14.X, KC0[2].Y, literal.x, 4169; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y, 4170; CM-NEXT: AND_INT * T0.W, T11.X, literal.z, 4171; CM-NEXT: 2(2.802597e-45), -65536(nan) 4172; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 4173; CM-NEXT: OR_INT * T11.X, PV.Z, PV.W, 4174; CM-NEXT: MOV T8.X, PV.X, 4175; CM-NEXT: MOV * T12.W, T3.X, 4176; CM-NEXT: MOV T12.Y, T5.X, 4177; CM-NEXT: MOV * T11.W, T7.X, BS:VEC_120/SCL_212 4178; CM-NEXT: MOV * T11.Y, T9.X, 4179entry: 4180 store <16 x i16> %in, <16 x i16> addrspace(1)* %out 4181 ret void 4182} 4183 4184define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind { 4185; SI-LABEL: v16i32_arg: 4186; SI: ; %bb.0: ; %entry 4187; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 4188; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4189; SI-NEXT: s_mov_b32 s3, 0xf000 4190; SI-NEXT: s_mov_b32 s2, -1 4191; SI-NEXT: s_waitcnt lgkmcnt(0) 4192; SI-NEXT: v_mov_b32_e32 v0, s16 4193; SI-NEXT: v_mov_b32_e32 v1, s17 4194; SI-NEXT: v_mov_b32_e32 v2, s18 4195; SI-NEXT: v_mov_b32_e32 v3, s19 4196; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 4197; SI-NEXT: s_waitcnt expcnt(0) 4198; SI-NEXT: v_mov_b32_e32 v0, s12 4199; SI-NEXT: v_mov_b32_e32 v1, s13 4200; SI-NEXT: v_mov_b32_e32 v2, s14 4201; SI-NEXT: v_mov_b32_e32 v3, s15 4202; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 4203; SI-NEXT: s_waitcnt expcnt(0) 4204; SI-NEXT: v_mov_b32_e32 v0, s8 4205; SI-NEXT: v_mov_b32_e32 v1, s9 4206; SI-NEXT: v_mov_b32_e32 v2, s10 4207; SI-NEXT: v_mov_b32_e32 v3, s11 4208; SI-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 4209; SI-NEXT: s_waitcnt expcnt(0) 4210; SI-NEXT: v_mov_b32_e32 v0, s4 4211; SI-NEXT: v_mov_b32_e32 v1, s5 4212; SI-NEXT: v_mov_b32_e32 v2, s6 4213; SI-NEXT: v_mov_b32_e32 v3, s7 4214; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 4215; SI-NEXT: s_endpgm 4216; 4217; VI-LABEL: v16i32_arg: 4218; VI: ; %bb.0: ; %entry 4219; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 4220; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4221; VI-NEXT: s_waitcnt lgkmcnt(0) 4222; VI-NEXT: v_mov_b32_e32 v0, s16 4223; VI-NEXT: s_add_u32 s2, s0, 48 4224; VI-NEXT: s_addc_u32 s3, s1, 0 4225; VI-NEXT: v_mov_b32_e32 v5, s3 4226; VI-NEXT: v_mov_b32_e32 v4, s2 4227; VI-NEXT: s_add_u32 s2, s0, 32 4228; VI-NEXT: v_mov_b32_e32 v1, s17 4229; VI-NEXT: v_mov_b32_e32 v2, s18 4230; VI-NEXT: v_mov_b32_e32 v3, s19 4231; VI-NEXT: s_addc_u32 s3, s1, 0 4232; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4233; VI-NEXT: v_mov_b32_e32 v5, s3 4234; VI-NEXT: v_mov_b32_e32 v4, s2 4235; VI-NEXT: s_add_u32 s2, s0, 16 4236; VI-NEXT: v_mov_b32_e32 v0, s12 4237; VI-NEXT: v_mov_b32_e32 v1, s13 4238; VI-NEXT: v_mov_b32_e32 v2, s14 4239; VI-NEXT: v_mov_b32_e32 v3, s15 4240; VI-NEXT: s_addc_u32 s3, s1, 0 4241; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4242; VI-NEXT: v_mov_b32_e32 v5, s3 4243; VI-NEXT: v_mov_b32_e32 v0, s8 4244; VI-NEXT: v_mov_b32_e32 v1, s9 4245; VI-NEXT: v_mov_b32_e32 v2, s10 4246; VI-NEXT: v_mov_b32_e32 v3, s11 4247; VI-NEXT: v_mov_b32_e32 v4, s2 4248; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4249; VI-NEXT: v_mov_b32_e32 v5, s1 4250; VI-NEXT: v_mov_b32_e32 v0, s4 4251; VI-NEXT: v_mov_b32_e32 v1, s5 4252; VI-NEXT: v_mov_b32_e32 v2, s6 4253; VI-NEXT: v_mov_b32_e32 v3, s7 4254; VI-NEXT: v_mov_b32_e32 v4, s0 4255; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4256; VI-NEXT: s_endpgm 4257; 4258; GFX9-LABEL: v16i32_arg: 4259; GFX9: ; %bb.0: ; %entry 4260; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 4261; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x0 4262; GFX9-NEXT: v_mov_b32_e32 v4, 0 4263; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4264; GFX9-NEXT: v_mov_b32_e32 v0, s20 4265; GFX9-NEXT: v_mov_b32_e32 v1, s21 4266; GFX9-NEXT: v_mov_b32_e32 v2, s22 4267; GFX9-NEXT: v_mov_b32_e32 v3, s23 4268; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 4269; GFX9-NEXT: s_nop 0 4270; GFX9-NEXT: v_mov_b32_e32 v0, s16 4271; GFX9-NEXT: v_mov_b32_e32 v1, s17 4272; GFX9-NEXT: v_mov_b32_e32 v2, s18 4273; GFX9-NEXT: v_mov_b32_e32 v3, s19 4274; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 4275; GFX9-NEXT: s_nop 0 4276; GFX9-NEXT: v_mov_b32_e32 v0, s12 4277; GFX9-NEXT: v_mov_b32_e32 v1, s13 4278; GFX9-NEXT: v_mov_b32_e32 v2, s14 4279; GFX9-NEXT: v_mov_b32_e32 v3, s15 4280; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 4281; GFX9-NEXT: s_nop 0 4282; GFX9-NEXT: v_mov_b32_e32 v0, s8 4283; GFX9-NEXT: v_mov_b32_e32 v1, s9 4284; GFX9-NEXT: v_mov_b32_e32 v2, s10 4285; GFX9-NEXT: v_mov_b32_e32 v3, s11 4286; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 4287; GFX9-NEXT: s_endpgm 4288; 4289; EG-LABEL: v16i32_arg: 4290; EG: ; %bb.0: ; %entry 4291; EG-NEXT: ALU 29, @6, KC0[CB0:0-32], KC1[] 4292; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0 4293; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0 4294; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T4.X, 0 4295; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 4296; EG-NEXT: CF_END 4297; EG-NEXT: ALU clause starting at 6: 4298; EG-NEXT: MOV * T0.W, KC0[7].X, 4299; EG-NEXT: MOV * T0.Z, KC0[6].W, 4300; EG-NEXT: MOV T0.Y, KC0[6].Z, 4301; EG-NEXT: MOV * T1.W, KC0[8].X, 4302; EG-NEXT: MOV T0.X, KC0[6].Y, 4303; EG-NEXT: MOV * T1.Z, KC0[7].W, 4304; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 4305; EG-NEXT: MOV * T1.Y, KC0[7].Z, 4306; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4307; EG-NEXT: MOV * T3.W, KC0[9].X, 4308; EG-NEXT: MOV T1.X, KC0[7].Y, 4309; EG-NEXT: MOV * T3.Z, KC0[8].W, 4310; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4311; EG-NEXT: 
16(2.242078e-44), 0(0.000000e+00) 4312; EG-NEXT: LSHR T4.X, PV.W, literal.x, 4313; EG-NEXT: MOV T3.Y, KC0[8].Z, 4314; EG-NEXT: MOV * T5.W, KC0[10].X, 4315; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4316; EG-NEXT: MOV T3.X, KC0[8].Y, 4317; EG-NEXT: MOV * T5.Z, KC0[9].W, 4318; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4319; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 4320; EG-NEXT: LSHR T6.X, PV.W, literal.x, 4321; EG-NEXT: MOV T5.Y, KC0[9].Z, 4322; EG-NEXT: MOV * T5.X, KC0[9].Y, 4323; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4324; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4325; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) 4326; EG-NEXT: LSHR * T7.X, PV.W, literal.x, 4327; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4328; 4329; CM-LABEL: v16i32_arg: 4330; CM: ; %bb.0: ; %entry 4331; CM-NEXT: ALU 28, @6, KC0[CB0:0-32], KC1[] 4332; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T7.X 4333; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T6.X 4334; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X 4335; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X 4336; CM-NEXT: CF_END 4337; CM-NEXT: ALU clause starting at 6: 4338; CM-NEXT: MOV * T0.W, KC0[10].X, 4339; CM-NEXT: MOV * T0.Z, KC0[9].W, 4340; CM-NEXT: MOV * T0.Y, KC0[9].Z, 4341; CM-NEXT: MOV T0.X, KC0[9].Y, 4342; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x, 4343; CM-NEXT: MOV * T2.W, KC0[9].X, 4344; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00) 4345; CM-NEXT: MOV T2.Z, KC0[8].W, 4346; CM-NEXT: MOV * T1.W, KC0[8].X, 4347; CM-NEXT: LSHR T3.X, T1.Z, literal.x, 4348; CM-NEXT: MOV T2.Y, KC0[8].Z, 4349; CM-NEXT: MOV * T1.Z, KC0[7].W, 4350; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4351; CM-NEXT: MOV T2.X, KC0[8].Y, 4352; CM-NEXT: MOV * T1.Y, KC0[7].Z, 4353; CM-NEXT: MOV T1.X, KC0[7].Y, 4354; CM-NEXT: ADD_INT T3.Z, KC0[2].Y, literal.x, 4355; CM-NEXT: MOV * T4.W, KC0[7].X, 4356; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00) 4357; CM-NEXT: LSHR T5.X, PV.Z, literal.x, 4358; CM-NEXT: MOV T4.Z, KC0[6].W, 4359; CM-NEXT: ADD_INT * T3.W, KC0[2].Y, 
literal.y, 4360; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) 4361; CM-NEXT: LSHR T6.X, PV.W, literal.x, 4362; CM-NEXT: MOV * T4.Y, KC0[6].Z, 4363; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4364; CM-NEXT: MOV * T4.X, KC0[6].Y, 4365; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 4366; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4367entry: 4368 store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4 4369 ret void 4370} 4371 4372define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind { 4373; SI-LABEL: v16f32_arg: 4374; SI: ; %bb.0: ; %entry 4375; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 4376; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4377; SI-NEXT: s_mov_b32 s3, 0xf000 4378; SI-NEXT: s_mov_b32 s2, -1 4379; SI-NEXT: s_waitcnt lgkmcnt(0) 4380; SI-NEXT: v_mov_b32_e32 v0, s16 4381; SI-NEXT: v_mov_b32_e32 v1, s17 4382; SI-NEXT: v_mov_b32_e32 v2, s18 4383; SI-NEXT: v_mov_b32_e32 v3, s19 4384; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 4385; SI-NEXT: s_waitcnt expcnt(0) 4386; SI-NEXT: v_mov_b32_e32 v0, s12 4387; SI-NEXT: v_mov_b32_e32 v1, s13 4388; SI-NEXT: v_mov_b32_e32 v2, s14 4389; SI-NEXT: v_mov_b32_e32 v3, s15 4390; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 4391; SI-NEXT: s_waitcnt expcnt(0) 4392; SI-NEXT: v_mov_b32_e32 v0, s8 4393; SI-NEXT: v_mov_b32_e32 v1, s9 4394; SI-NEXT: v_mov_b32_e32 v2, s10 4395; SI-NEXT: v_mov_b32_e32 v3, s11 4396; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 4397; SI-NEXT: s_waitcnt expcnt(0) 4398; SI-NEXT: v_mov_b32_e32 v0, s4 4399; SI-NEXT: v_mov_b32_e32 v1, s5 4400; SI-NEXT: v_mov_b32_e32 v2, s6 4401; SI-NEXT: v_mov_b32_e32 v3, s7 4402; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 4403; SI-NEXT: s_endpgm 4404; 4405; VI-LABEL: v16f32_arg: 4406; VI: ; %bb.0: ; %entry 4407; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 4408; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4409; VI-NEXT: s_waitcnt lgkmcnt(0) 4410; VI-NEXT: 
v_mov_b32_e32 v0, s16 4411; VI-NEXT: s_add_u32 s2, s0, 48 4412; VI-NEXT: s_addc_u32 s3, s1, 0 4413; VI-NEXT: v_mov_b32_e32 v5, s3 4414; VI-NEXT: v_mov_b32_e32 v4, s2 4415; VI-NEXT: s_add_u32 s2, s0, 32 4416; VI-NEXT: v_mov_b32_e32 v1, s17 4417; VI-NEXT: v_mov_b32_e32 v2, s18 4418; VI-NEXT: v_mov_b32_e32 v3, s19 4419; VI-NEXT: s_addc_u32 s3, s1, 0 4420; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4421; VI-NEXT: v_mov_b32_e32 v5, s3 4422; VI-NEXT: v_mov_b32_e32 v4, s2 4423; VI-NEXT: s_add_u32 s2, s0, 16 4424; VI-NEXT: v_mov_b32_e32 v0, s12 4425; VI-NEXT: v_mov_b32_e32 v1, s13 4426; VI-NEXT: v_mov_b32_e32 v2, s14 4427; VI-NEXT: v_mov_b32_e32 v3, s15 4428; VI-NEXT: s_addc_u32 s3, s1, 0 4429; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4430; VI-NEXT: v_mov_b32_e32 v5, s3 4431; VI-NEXT: v_mov_b32_e32 v0, s8 4432; VI-NEXT: v_mov_b32_e32 v1, s9 4433; VI-NEXT: v_mov_b32_e32 v2, s10 4434; VI-NEXT: v_mov_b32_e32 v3, s11 4435; VI-NEXT: v_mov_b32_e32 v4, s2 4436; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4437; VI-NEXT: v_mov_b32_e32 v5, s1 4438; VI-NEXT: v_mov_b32_e32 v0, s4 4439; VI-NEXT: v_mov_b32_e32 v1, s5 4440; VI-NEXT: v_mov_b32_e32 v2, s6 4441; VI-NEXT: v_mov_b32_e32 v3, s7 4442; VI-NEXT: v_mov_b32_e32 v4, s0 4443; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4444; VI-NEXT: s_endpgm 4445; 4446; GFX9-LABEL: v16f32_arg: 4447; GFX9: ; %bb.0: ; %entry 4448; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 4449; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4450; GFX9-NEXT: v_mov_b32_e32 v4, 0 4451; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4452; GFX9-NEXT: v_mov_b32_e32 v0, s20 4453; GFX9-NEXT: v_mov_b32_e32 v1, s21 4454; GFX9-NEXT: v_mov_b32_e32 v2, s22 4455; GFX9-NEXT: v_mov_b32_e32 v3, s23 4456; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 4457; GFX9-NEXT: s_nop 0 4458; GFX9-NEXT: v_mov_b32_e32 v0, s16 4459; GFX9-NEXT: v_mov_b32_e32 v1, s17 4460; GFX9-NEXT: v_mov_b32_e32 v2, s18 4461; GFX9-NEXT: v_mov_b32_e32 v3, s19 4462; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], 
s[0:1] offset:32 4463; GFX9-NEXT: s_nop 0 4464; GFX9-NEXT: v_mov_b32_e32 v0, s12 4465; GFX9-NEXT: v_mov_b32_e32 v1, s13 4466; GFX9-NEXT: v_mov_b32_e32 v2, s14 4467; GFX9-NEXT: v_mov_b32_e32 v3, s15 4468; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 4469; GFX9-NEXT: s_nop 0 4470; GFX9-NEXT: v_mov_b32_e32 v0, s8 4471; GFX9-NEXT: v_mov_b32_e32 v1, s9 4472; GFX9-NEXT: v_mov_b32_e32 v2, s10 4473; GFX9-NEXT: v_mov_b32_e32 v3, s11 4474; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 4475; GFX9-NEXT: s_endpgm 4476; 4477; EG-LABEL: v16f32_arg: 4478; EG: ; %bb.0: ; %entry 4479; EG-NEXT: ALU 29, @6, KC0[CB0:0-32], KC1[] 4480; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0 4481; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0 4482; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T4.X, 0 4483; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 4484; EG-NEXT: CF_END 4485; EG-NEXT: ALU clause starting at 6: 4486; EG-NEXT: MOV * T0.W, KC0[7].X, 4487; EG-NEXT: MOV * T0.Z, KC0[6].W, 4488; EG-NEXT: MOV T0.Y, KC0[6].Z, 4489; EG-NEXT: MOV * T1.W, KC0[8].X, 4490; EG-NEXT: MOV T0.X, KC0[6].Y, 4491; EG-NEXT: MOV * T1.Z, KC0[7].W, 4492; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 4493; EG-NEXT: MOV * T1.Y, KC0[7].Z, 4494; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4495; EG-NEXT: MOV * T3.W, KC0[9].X, 4496; EG-NEXT: MOV T1.X, KC0[7].Y, 4497; EG-NEXT: MOV * T3.Z, KC0[8].W, 4498; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4499; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 4500; EG-NEXT: LSHR T4.X, PV.W, literal.x, 4501; EG-NEXT: MOV T3.Y, KC0[8].Z, 4502; EG-NEXT: MOV * T5.W, KC0[10].X, 4503; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4504; EG-NEXT: MOV T3.X, KC0[8].Y, 4505; EG-NEXT: MOV * T5.Z, KC0[9].W, 4506; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4507; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 4508; EG-NEXT: LSHR T6.X, PV.W, literal.x, 4509; EG-NEXT: MOV T5.Y, KC0[9].Z, 4510; EG-NEXT: MOV * T5.X, KC0[9].Y, 4511; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 
4512; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4513; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) 4514; EG-NEXT: LSHR * T7.X, PV.W, literal.x, 4515; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4516; 4517; CM-LABEL: v16f32_arg: 4518; CM: ; %bb.0: ; %entry 4519; CM-NEXT: ALU 28, @6, KC0[CB0:0-32], KC1[] 4520; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T7.X 4521; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T6.X 4522; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X 4523; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X 4524; CM-NEXT: CF_END 4525; CM-NEXT: ALU clause starting at 6: 4526; CM-NEXT: MOV * T0.W, KC0[10].X, 4527; CM-NEXT: MOV * T0.Z, KC0[9].W, 4528; CM-NEXT: MOV * T0.Y, KC0[9].Z, 4529; CM-NEXT: MOV T0.X, KC0[9].Y, 4530; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x, 4531; CM-NEXT: MOV * T2.W, KC0[9].X, 4532; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00) 4533; CM-NEXT: MOV T2.Z, KC0[8].W, 4534; CM-NEXT: MOV * T1.W, KC0[8].X, 4535; CM-NEXT: LSHR T3.X, T1.Z, literal.x, 4536; CM-NEXT: MOV T2.Y, KC0[8].Z, 4537; CM-NEXT: MOV * T1.Z, KC0[7].W, 4538; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4539; CM-NEXT: MOV T2.X, KC0[8].Y, 4540; CM-NEXT: MOV * T1.Y, KC0[7].Z, 4541; CM-NEXT: MOV T1.X, KC0[7].Y, 4542; CM-NEXT: ADD_INT T3.Z, KC0[2].Y, literal.x, 4543; CM-NEXT: MOV * T4.W, KC0[7].X, 4544; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00) 4545; CM-NEXT: LSHR T5.X, PV.Z, literal.x, 4546; CM-NEXT: MOV T4.Z, KC0[6].W, 4547; CM-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, 4548; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) 4549; CM-NEXT: LSHR T6.X, PV.W, literal.x, 4550; CM-NEXT: MOV * T4.Y, KC0[6].Z, 4551; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4552; CM-NEXT: MOV * T4.X, KC0[6].Y, 4553; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 4554; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4555entry: 4556 store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4 4557 ret void 4558} 4559 4560define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { 4561; SI-LABEL: 
kernel_arg_i64: 4562; SI: ; %bb.0: 4563; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 4564; SI-NEXT: s_mov_b32 s7, 0xf000 4565; SI-NEXT: s_mov_b32 s6, -1 4566; SI-NEXT: s_waitcnt lgkmcnt(0) 4567; SI-NEXT: s_mov_b32 s4, s0 4568; SI-NEXT: s_mov_b32 s5, s1 4569; SI-NEXT: v_mov_b32_e32 v0, s2 4570; SI-NEXT: v_mov_b32_e32 v1, s3 4571; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4572; SI-NEXT: s_endpgm 4573; 4574; VI-LABEL: kernel_arg_i64: 4575; VI: ; %bb.0: 4576; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 4577; VI-NEXT: s_waitcnt lgkmcnt(0) 4578; VI-NEXT: v_mov_b32_e32 v0, s0 4579; VI-NEXT: v_mov_b32_e32 v1, s1 4580; VI-NEXT: v_mov_b32_e32 v2, s2 4581; VI-NEXT: v_mov_b32_e32 v3, s3 4582; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 4583; VI-NEXT: s_endpgm 4584; 4585; GFX9-LABEL: kernel_arg_i64: 4586; GFX9: ; %bb.0: 4587; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4588; GFX9-NEXT: v_mov_b32_e32 v2, 0 4589; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4590; GFX9-NEXT: v_mov_b32_e32 v0, s2 4591; GFX9-NEXT: v_mov_b32_e32 v1, s3 4592; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 4593; GFX9-NEXT: s_endpgm 4594; 4595; EG-LABEL: kernel_arg_i64: 4596; EG: ; %bb.0: 4597; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 4598; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 4599; EG-NEXT: CF_END 4600; EG-NEXT: PAD 4601; EG-NEXT: ALU clause starting at 4: 4602; EG-NEXT: MOV * T0.Y, KC0[3].X, 4603; EG-NEXT: MOV T0.X, KC0[2].W, 4604; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4605; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4606; 4607; CM-LABEL: kernel_arg_i64: 4608; CM: ; %bb.0: 4609; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 4610; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 4611; CM-NEXT: CF_END 4612; CM-NEXT: PAD 4613; CM-NEXT: ALU clause starting at 4: 4614; CM-NEXT: MOV * T0.Y, KC0[3].X, 4615; CM-NEXT: MOV * T0.X, KC0[2].W, 4616; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4617; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4618 store i64 %a, i64 addrspace(1)* %out, align 8 4619 ret 
void 4620} 4621 4622define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) { 4623; SI-LABEL: f64_kernel_arg: 4624; SI: ; %bb.0: ; %entry 4625; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 4626; SI-NEXT: s_mov_b32 s7, 0xf000 4627; SI-NEXT: s_mov_b32 s6, -1 4628; SI-NEXT: s_waitcnt lgkmcnt(0) 4629; SI-NEXT: s_mov_b32 s4, s0 4630; SI-NEXT: s_mov_b32 s5, s1 4631; SI-NEXT: v_mov_b32_e32 v0, s2 4632; SI-NEXT: v_mov_b32_e32 v1, s3 4633; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4634; SI-NEXT: s_endpgm 4635; 4636; VI-LABEL: f64_kernel_arg: 4637; VI: ; %bb.0: ; %entry 4638; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 4639; VI-NEXT: s_waitcnt lgkmcnt(0) 4640; VI-NEXT: v_mov_b32_e32 v0, s0 4641; VI-NEXT: v_mov_b32_e32 v1, s1 4642; VI-NEXT: v_mov_b32_e32 v2, s2 4643; VI-NEXT: v_mov_b32_e32 v3, s3 4644; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 4645; VI-NEXT: s_endpgm 4646; 4647; GFX9-LABEL: f64_kernel_arg: 4648; GFX9: ; %bb.0: ; %entry 4649; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4650; GFX9-NEXT: v_mov_b32_e32 v2, 0 4651; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4652; GFX9-NEXT: v_mov_b32_e32 v0, s2 4653; GFX9-NEXT: v_mov_b32_e32 v1, s3 4654; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 4655; GFX9-NEXT: s_endpgm 4656; 4657; EG-LABEL: f64_kernel_arg: 4658; EG: ; %bb.0: ; %entry 4659; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 4660; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 4661; EG-NEXT: CF_END 4662; EG-NEXT: PAD 4663; EG-NEXT: ALU clause starting at 4: 4664; EG-NEXT: MOV * T0.Y, KC0[3].X, 4665; EG-NEXT: MOV T0.X, KC0[2].W, 4666; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4667; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4668; 4669; CM-LABEL: f64_kernel_arg: 4670; CM: ; %bb.0: ; %entry 4671; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 4672; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 4673; CM-NEXT: CF_END 4674; CM-NEXT: PAD 4675; CM-NEXT: ALU clause starting at 4: 4676; CM-NEXT: MOV * T0.Y, KC0[3].X, 4677; CM-NEXT: MOV * T0.X, 
KC0[2].W, 4678; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4679; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4680entry: 4681 store double %in, double addrspace(1)* %out 4682 ret void 4683} 4684 4685; XFUNC-LABEL: {{^}}kernel_arg_v1i64: 4686; XGCN: s_load_dwordx2 4687; XGCN: s_load_dwordx2 4688; XGCN: buffer_store_dwordx2 4689; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { 4690; store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8 4691; ret void 4692; } 4693 4694define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind { 4695; SI-LABEL: i65_arg: 4696; SI: ; %bb.0: ; %entry 4697; SI-NEXT: s_load_dword s2, s[0:1], 0xd 4698; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 4699; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4700; SI-NEXT: s_mov_b32 s3, 0xf000 4701; SI-NEXT: s_waitcnt lgkmcnt(0) 4702; SI-NEXT: s_and_b32 s6, s2, 1 4703; SI-NEXT: s_mov_b32 s2, -1 4704; SI-NEXT: v_mov_b32_e32 v0, s4 4705; SI-NEXT: v_mov_b32_e32 v1, s5 4706; SI-NEXT: v_mov_b32_e32 v2, s6 4707; SI-NEXT: buffer_store_byte v2, off, s[0:3], 0 offset:8 4708; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4709; SI-NEXT: s_endpgm 4710; 4711; VI-LABEL: i65_arg: 4712; VI: ; %bb.0: ; %entry 4713; VI-NEXT: s_load_dword s4, s[0:1], 0x34 4714; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4715; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 4716; VI-NEXT: s_waitcnt lgkmcnt(0) 4717; VI-NEXT: s_and_b32 s4, s4, 1 4718; VI-NEXT: v_mov_b32_e32 v0, s2 4719; VI-NEXT: v_mov_b32_e32 v1, s3 4720; VI-NEXT: s_add_u32 s2, s2, 8 4721; VI-NEXT: s_addc_u32 s3, s3, 0 4722; VI-NEXT: v_mov_b32_e32 v2, s2 4723; VI-NEXT: v_mov_b32_e32 v4, s4 4724; VI-NEXT: v_mov_b32_e32 v3, s3 4725; VI-NEXT: flat_store_byte v[2:3], v4 4726; VI-NEXT: v_mov_b32_e32 v3, s1 4727; VI-NEXT: v_mov_b32_e32 v2, s0 4728; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 4729; VI-NEXT: s_endpgm 4730; 4731; GFX9-LABEL: i65_arg: 4732; GFX9: ; %bb.0: ; %entry 4733; GFX9-NEXT: 
s_load_dword s6, s[4:5], 0x10 4734; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4735; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 4736; GFX9-NEXT: v_mov_b32_e32 v2, 0 4737; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4738; GFX9-NEXT: s_and_b32 s4, s6, 1 4739; GFX9-NEXT: v_mov_b32_e32 v0, s0 4740; GFX9-NEXT: v_mov_b32_e32 v3, s4 4741; GFX9-NEXT: v_mov_b32_e32 v1, s1 4742; GFX9-NEXT: global_store_byte v2, v3, s[2:3] offset:8 4743; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 4744; GFX9-NEXT: s_endpgm 4745; 4746; EG-LABEL: i65_arg: 4747; EG: ; %bb.0: ; %entry 4748; EG-NEXT: ALU 20, @6, KC0[CB0:0-32], KC1[] 4749; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0 4750; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 4751; EG-NEXT: MEM_RAT MSKOR T1.XW, T0.X 4752; EG-NEXT: CF_END 4753; EG-NEXT: PAD 4754; EG-NEXT: ALU clause starting at 6: 4755; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 4756; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 4757; EG-NEXT: AND_INT * T1.W, PV.W, literal.x, 4758; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4759; EG-NEXT: LSHL T1.W, PV.W, literal.x, 4760; EG-NEXT: AND_INT * T2.W, KC0[3].Y, 1, 4761; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4762; EG-NEXT: LSHL T1.X, PS, PV.W, 4763; EG-NEXT: LSHL * T1.W, literal.x, PV.W, 4764; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 4765; EG-NEXT: MOV T1.Y, 0.0, 4766; EG-NEXT: MOV * T1.Z, 0.0, 4767; EG-NEXT: LSHR T0.X, T0.W, literal.x, 4768; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 4769; EG-NEXT: 2(2.802597e-45), 4(5.605194e-45) 4770; EG-NEXT: LSHR T2.X, PV.W, literal.x, 4771; EG-NEXT: MOV * T3.X, KC0[3].X, 4772; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4773; EG-NEXT: LSHR T4.X, KC0[2].Y, literal.x, 4774; EG-NEXT: MOV * T5.X, KC0[2].W, 4775; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4776; 4777; CM-LABEL: i65_arg: 4778; CM: ; %bb.0: ; %entry 4779; CM-NEXT: ALU 21, @6, KC0[CB0:0-32], KC1[] 4780; CM-NEXT: MEM_RAT MSKOR T1.XW, T5.X 4781; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X 4782; CM-NEXT: 
MEM_RAT_CACHELESS STORE_DWORD T2.X, T0.X 4783; CM-NEXT: CF_END 4784; CM-NEXT: PAD 4785; CM-NEXT: ALU clause starting at 6: 4786; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 4787; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 4788; CM-NEXT: AND_INT * T1.W, PV.W, literal.x, 4789; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4790; CM-NEXT: LSHL T0.Z, PV.W, literal.x, 4791; CM-NEXT: AND_INT * T1.W, KC0[3].Y, 1, 4792; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4793; CM-NEXT: LSHL T1.X, PV.W, PV.Z, 4794; CM-NEXT: LSHL * T1.W, literal.x, PV.Z, 4795; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 4796; CM-NEXT: MOV T1.Y, 0.0, 4797; CM-NEXT: MOV * T1.Z, 0.0, 4798; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 4799; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4800; CM-NEXT: MOV T2.X, KC0[2].W, 4801; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4802; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00) 4803; CM-NEXT: LSHR * T3.X, PV.W, literal.x, 4804; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4805; CM-NEXT: MOV * T4.X, KC0[3].X, 4806; CM-NEXT: LSHR * T5.X, T0.W, literal.x, 4807; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4808entry: 4809 store i65 %in, i65 addrspace(1)* %out, align 4 4810 ret void 4811} 4812 4813define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind { 4814; SI-LABEL: i1_arg: 4815; SI: ; %bb.0: 4816; SI-NEXT: s_load_dword s2, s[0:1], 0xb 4817; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4818; SI-NEXT: s_mov_b32 s3, 0xf000 4819; SI-NEXT: s_waitcnt lgkmcnt(0) 4820; SI-NEXT: s_and_b32 s4, s2, 1 4821; SI-NEXT: s_mov_b32 s2, -1 4822; SI-NEXT: v_mov_b32_e32 v0, s4 4823; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 4824; SI-NEXT: s_endpgm 4825; 4826; VI-LABEL: i1_arg: 4827; VI: ; %bb.0: 4828; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 4829; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4830; VI-NEXT: s_waitcnt lgkmcnt(0) 4831; VI-NEXT: s_and_b32 s2, s2, 1 4832; VI-NEXT: v_mov_b32_e32 v0, s0 4833; VI-NEXT: v_mov_b32_e32 v1, s1 4834; VI-NEXT: v_mov_b32_e32 v2, s2 4835; VI-NEXT: 
flat_store_byte v[0:1], v2 4836; VI-NEXT: s_endpgm 4837; 4838; GFX9-LABEL: i1_arg: 4839; GFX9: ; %bb.0: 4840; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 4841; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4842; GFX9-NEXT: v_mov_b32_e32 v0, 0 4843; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4844; GFX9-NEXT: s_and_b32 s2, s2, 1 4845; GFX9-NEXT: v_mov_b32_e32 v1, s2 4846; GFX9-NEXT: global_store_byte v0, v1, s[0:1] 4847; GFX9-NEXT: s_endpgm 4848; 4849; EG-LABEL: i1_arg: 4850; EG: ; %bb.0: 4851; EG-NEXT: ALU 0, @8, KC0[], KC1[] 4852; EG-NEXT: TEX 0 @6 4853; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[] 4854; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 4855; EG-NEXT: CF_END 4856; EG-NEXT: PAD 4857; EG-NEXT: Fetch clause starting at 6: 4858; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 4859; EG-NEXT: ALU clause starting at 8: 4860; EG-NEXT: MOV * T0.X, 0.0, 4861; EG-NEXT: ALU clause starting at 9: 4862; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, 4863; EG-NEXT: AND_INT * T1.W, T0.X, 1, 4864; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4865; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 4866; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4867; EG-NEXT: LSHL T0.X, T1.W, PV.W, 4868; EG-NEXT: LSHL * T0.W, literal.x, PV.W, 4869; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 4870; EG-NEXT: MOV T0.Y, 0.0, 4871; EG-NEXT: MOV * T0.Z, 0.0, 4872; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4873; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4874; 4875; CM-LABEL: i1_arg: 4876; CM: ; %bb.0: 4877; CM-NEXT: ALU 0, @8, KC0[], KC1[] 4878; CM-NEXT: TEX 0 @6 4879; CM-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[] 4880; CM-NEXT: MEM_RAT MSKOR T0.XW, T1.X 4881; CM-NEXT: CF_END 4882; CM-NEXT: PAD 4883; CM-NEXT: Fetch clause starting at 6: 4884; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 4885; CM-NEXT: ALU clause starting at 8: 4886; CM-NEXT: MOV * T0.X, 0.0, 4887; CM-NEXT: ALU clause starting at 9: 4888; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x, 4889; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4890; CM-NEXT: AND_INT T0.Z, T0.X, 1, 4891; CM-NEXT: LSHL * 
T0.W, PV.W, literal.x, 4892; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4893; CM-NEXT: LSHL T0.X, PV.Z, PV.W, 4894; CM-NEXT: LSHL * T0.W, literal.x, PV.W, 4895; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 4896; CM-NEXT: MOV T0.Y, 0.0, 4897; CM-NEXT: MOV * T0.Z, 0.0, 4898; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4899; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4900 store i1 %x, i1 addrspace(1)* %out, align 1 4901 ret void 4902} 4903 4904define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { 4905; SI-LABEL: i1_arg_zext_i32: 4906; SI: ; %bb.0: 4907; SI-NEXT: s_load_dword s2, s[0:1], 0xb 4908; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4909; SI-NEXT: s_mov_b32 s3, 0xf000 4910; SI-NEXT: s_waitcnt lgkmcnt(0) 4911; SI-NEXT: s_and_b32 s4, s2, 1 4912; SI-NEXT: s_mov_b32 s2, -1 4913; SI-NEXT: v_mov_b32_e32 v0, s4 4914; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 4915; SI-NEXT: s_endpgm 4916; 4917; VI-LABEL: i1_arg_zext_i32: 4918; VI: ; %bb.0: 4919; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 4920; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4921; VI-NEXT: s_waitcnt lgkmcnt(0) 4922; VI-NEXT: s_and_b32 s2, s2, 1 4923; VI-NEXT: v_mov_b32_e32 v0, s0 4924; VI-NEXT: v_mov_b32_e32 v1, s1 4925; VI-NEXT: v_mov_b32_e32 v2, s2 4926; VI-NEXT: flat_store_dword v[0:1], v2 4927; VI-NEXT: s_endpgm 4928; 4929; GFX9-LABEL: i1_arg_zext_i32: 4930; GFX9: ; %bb.0: 4931; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 4932; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4933; GFX9-NEXT: v_mov_b32_e32 v0, 0 4934; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4935; GFX9-NEXT: s_and_b32 s2, s2, 1 4936; GFX9-NEXT: v_mov_b32_e32 v1, s2 4937; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 4938; GFX9-NEXT: s_endpgm 4939; 4940; EG-LABEL: i1_arg_zext_i32: 4941; EG: ; %bb.0: 4942; EG-NEXT: ALU 0, @8, KC0[], KC1[] 4943; EG-NEXT: TEX 0 @6 4944; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[] 4945; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 4946; EG-NEXT: CF_END 4947; EG-NEXT: PAD 4948; EG-NEXT: Fetch 
clause starting at 6: 4949; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 4950; EG-NEXT: ALU clause starting at 8: 4951; EG-NEXT: MOV * T0.X, 0.0, 4952; EG-NEXT: ALU clause starting at 9: 4953; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4954; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4955; 4956; CM-LABEL: i1_arg_zext_i32: 4957; CM: ; %bb.0: 4958; CM-NEXT: ALU 0, @8, KC0[], KC1[] 4959; CM-NEXT: TEX 0 @6 4960; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[] 4961; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 4962; CM-NEXT: CF_END 4963; CM-NEXT: PAD 4964; CM-NEXT: Fetch clause starting at 6: 4965; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 4966; CM-NEXT: ALU clause starting at 8: 4967; CM-NEXT: MOV * T0.X, 0.0, 4968; CM-NEXT: ALU clause starting at 9: 4969; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4970; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4971 %ext = zext i1 %x to i32 4972 store i32 %ext, i32 addrspace(1)* %out, align 4 4973 ret void 4974} 4975 4976define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { 4977; SI-LABEL: i1_arg_zext_i64: 4978; SI: ; %bb.0: 4979; SI-NEXT: s_load_dword s4, s[0:1], 0xb 4980; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4981; SI-NEXT: s_mov_b32 s3, 0xf000 4982; SI-NEXT: s_mov_b32 s2, -1 4983; SI-NEXT: s_waitcnt lgkmcnt(0) 4984; SI-NEXT: s_and_b32 s4, s4, 1 4985; SI-NEXT: v_mov_b32_e32 v1, 0 4986; SI-NEXT: v_mov_b32_e32 v0, s4 4987; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4988; SI-NEXT: s_endpgm 4989; 4990; VI-LABEL: i1_arg_zext_i64: 4991; VI: ; %bb.0: 4992; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 4993; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4994; VI-NEXT: v_mov_b32_e32 v1, 0 4995; VI-NEXT: s_waitcnt lgkmcnt(0) 4996; VI-NEXT: s_and_b32 s2, s2, 1 4997; VI-NEXT: v_mov_b32_e32 v3, s1 4998; VI-NEXT: v_mov_b32_e32 v0, s2 4999; VI-NEXT: v_mov_b32_e32 v2, s0 5000; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 5001; VI-NEXT: s_endpgm 5002; 5003; GFX9-LABEL: i1_arg_zext_i64: 5004; GFX9: ; %bb.0: 5005; GFX9-NEXT: 
s_load_dword s2, s[4:5], 0x8 5006; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5007; GFX9-NEXT: v_mov_b32_e32 v1, 0 5008; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5009; GFX9-NEXT: s_and_b32 s2, s2, 1 5010; GFX9-NEXT: v_mov_b32_e32 v0, s2 5011; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] 5012; GFX9-NEXT: s_endpgm 5013; 5014; EG-LABEL: i1_arg_zext_i64: 5015; EG: ; %bb.0: 5016; EG-NEXT: ALU 0, @8, KC0[], KC1[] 5017; EG-NEXT: TEX 0 @6 5018; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 5019; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 5020; EG-NEXT: CF_END 5021; EG-NEXT: PAD 5022; EG-NEXT: Fetch clause starting at 6: 5023; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 5024; EG-NEXT: ALU clause starting at 8: 5025; EG-NEXT: MOV * T0.X, 0.0, 5026; EG-NEXT: ALU clause starting at 9: 5027; EG-NEXT: MOV * T0.Y, 0.0, 5028; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 5029; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 5030; 5031; CM-LABEL: i1_arg_zext_i64: 5032; CM: ; %bb.0: 5033; CM-NEXT: ALU 0, @8, KC0[], KC1[] 5034; CM-NEXT: TEX 0 @6 5035; CM-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 5036; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 5037; CM-NEXT: CF_END 5038; CM-NEXT: PAD 5039; CM-NEXT: Fetch clause starting at 6: 5040; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 5041; CM-NEXT: ALU clause starting at 8: 5042; CM-NEXT: MOV * T0.X, 0.0, 5043; CM-NEXT: ALU clause starting at 9: 5044; CM-NEXT: MOV * T0.Y, 0.0, 5045; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 5046; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 5047 %ext = zext i1 %x to i64 5048 store i64 %ext, i64 addrspace(1)* %out, align 8 5049 ret void 5050} 5051 5052define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { 5053; SI-LABEL: i1_arg_sext_i32: 5054; SI: ; %bb.0: 5055; SI-NEXT: s_load_dword s2, s[0:1], 0xb 5056; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5057; SI-NEXT: s_mov_b32 s3, 0xf000 5058; SI-NEXT: s_waitcnt lgkmcnt(0) 5059; SI-NEXT: s_bfe_i32 s4, s2, 0x10000 5060; SI-NEXT: s_mov_b32 s2, -1 5061; 
SI-NEXT: v_mov_b32_e32 v0, s4 5062; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 5063; SI-NEXT: s_endpgm 5064; 5065; VI-LABEL: i1_arg_sext_i32: 5066; VI: ; %bb.0: 5067; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 5068; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5069; VI-NEXT: s_waitcnt lgkmcnt(0) 5070; VI-NEXT: s_bfe_i32 s2, s2, 0x10000 5071; VI-NEXT: v_mov_b32_e32 v0, s0 5072; VI-NEXT: v_mov_b32_e32 v1, s1 5073; VI-NEXT: v_mov_b32_e32 v2, s2 5074; VI-NEXT: flat_store_dword v[0:1], v2 5075; VI-NEXT: s_endpgm 5076; 5077; GFX9-LABEL: i1_arg_sext_i32: 5078; GFX9: ; %bb.0: 5079; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 5080; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5081; GFX9-NEXT: v_mov_b32_e32 v0, 0 5082; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5083; GFX9-NEXT: s_bfe_i32 s2, s2, 0x10000 5084; GFX9-NEXT: v_mov_b32_e32 v1, s2 5085; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 5086; GFX9-NEXT: s_endpgm 5087; 5088; EG-LABEL: i1_arg_sext_i32: 5089; EG: ; %bb.0: 5090; EG-NEXT: ALU 0, @8, KC0[], KC1[] 5091; EG-NEXT: TEX 0 @6 5092; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 5093; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 5094; EG-NEXT: CF_END 5095; EG-NEXT: PAD 5096; EG-NEXT: Fetch clause starting at 6: 5097; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 5098; EG-NEXT: ALU clause starting at 8: 5099; EG-NEXT: MOV * T0.X, 0.0, 5100; EG-NEXT: ALU clause starting at 9: 5101; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, 1, 5102; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 5103; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 5104; 5105; CM-LABEL: i1_arg_sext_i32: 5106; CM: ; %bb.0: 5107; CM-NEXT: ALU 0, @8, KC0[], KC1[] 5108; CM-NEXT: TEX 0 @6 5109; CM-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 5110; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 5111; CM-NEXT: CF_END 5112; CM-NEXT: PAD 5113; CM-NEXT: Fetch clause starting at 6: 5114; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 5115; CM-NEXT: ALU clause starting at 8: 5116; CM-NEXT: MOV * T0.X, 0.0, 5117; CM-NEXT: ALU clause starting at 9: 5118; CM-NEXT: 
BFE_INT * T0.X, T0.X, 0.0, 1, 5119; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 5120; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 5121 %ext = sext i1 %x to i32 5122 store i32 %ext, i32 addrspace(1)* %out, align 4 5123 ret void 5124} 5125 5126define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { 5127; SI-LABEL: i1_arg_sext_i64: 5128; SI: ; %bb.0: 5129; SI-NEXT: s_load_dword s2, s[0:1], 0xb 5130; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5131; SI-NEXT: s_mov_b32 s3, 0xf000 5132; SI-NEXT: s_waitcnt lgkmcnt(0) 5133; SI-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 5134; SI-NEXT: s_mov_b32 s2, -1 5135; SI-NEXT: v_mov_b32_e32 v0, s4 5136; SI-NEXT: v_mov_b32_e32 v1, s5 5137; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5138; SI-NEXT: s_endpgm 5139; 5140; VI-LABEL: i1_arg_sext_i64: 5141; VI: ; %bb.0: 5142; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 5143; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5144; VI-NEXT: s_waitcnt lgkmcnt(0) 5145; VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 5146; VI-NEXT: v_mov_b32_e32 v0, s0 5147; VI-NEXT: v_mov_b32_e32 v2, s2 5148; VI-NEXT: v_mov_b32_e32 v1, s1 5149; VI-NEXT: v_mov_b32_e32 v3, s3 5150; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 5151; VI-NEXT: s_endpgm 5152; 5153; GFX9-LABEL: i1_arg_sext_i64: 5154; GFX9: ; %bb.0: 5155; GFX9-NEXT: s_load_dword s0, s[4:5], 0x8 5156; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 5157; GFX9-NEXT: v_mov_b32_e32 v2, 0 5158; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5159; GFX9-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 5160; GFX9-NEXT: v_mov_b32_e32 v0, s0 5161; GFX9-NEXT: v_mov_b32_e32 v1, s1 5162; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 5163; GFX9-NEXT: s_endpgm 5164; 5165; EG-LABEL: i1_arg_sext_i64: 5166; EG: ; %bb.0: 5167; EG-NEXT: ALU 0, @8, KC0[], KC1[] 5168; EG-NEXT: TEX 0 @6 5169; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 5170; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 5171; EG-NEXT: CF_END 5172; EG-NEXT: PAD 5173; EG-NEXT: Fetch clause starting at 6: 5174;
EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 5175; EG-NEXT: ALU clause starting at 8: 5176; EG-NEXT: MOV * T0.X, 0.0, 5177; EG-NEXT: ALU clause starting at 9: 5178; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, 1, 5179; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 5180; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 5181; EG-NEXT: MOV * T0.Y, PV.X, 5182; 5183; CM-LABEL: i1_arg_sext_i64: 5184; CM: ; %bb.0: 5185; CM-NEXT: ALU 0, @8, KC0[], KC1[] 5186; CM-NEXT: TEX 0 @6 5187; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 5188; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 5189; CM-NEXT: CF_END 5190; CM-NEXT: PAD 5191; CM-NEXT: Fetch clause starting at 6: 5192; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 5193; CM-NEXT: ALU clause starting at 8: 5194; CM-NEXT: MOV * T0.X, 0.0, 5195; CM-NEXT: ALU clause starting at 9: 5196; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, 1, 5197; CM-NEXT: LSHR T1.X, KC0[2].Y, literal.x, 5198; CM-NEXT: MOV * T0.Y, PV.X, 5199; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 5200 %ext = sext i1 %x to i64 5201 store i64 %ext, i64 addrspace(1)* %out, align 8 5202 ret void 5203} 5204 5205define amdgpu_kernel void @empty_struct_arg({} %in) nounwind { 5206; SI-LABEL: empty_struct_arg: 5207; SI: ; %bb.0: 5208; SI-NEXT: s_endpgm 5209; 5210; VI-LABEL: empty_struct_arg: 5211; VI: ; %bb.0: 5212; VI-NEXT: s_endpgm 5213; 5214; GFX9-LABEL: empty_struct_arg: 5215; GFX9: ; %bb.0: 5216; GFX9-NEXT: s_endpgm 5217; 5218; EGCM-LABEL: empty_struct_arg: 5219; EGCM: ; %bb.0: 5220; EGCM-NEXT: CF_END 5221; EGCM-NEXT: PAD 5222 ret void 5223} 5224 5225; The correct load offsets for these: 5226; load 4 from 0, 5227; load 8 from 8 5228; load 4 from 24 5229; load 8 from 32 5230 5231; With the SelectionDAG argument lowering, the alignments for the 5232; struct members are not properly considered, making these wrong.
; Kernel-argument layout test: an unnamed i8 sits between two naturally
; aligned {i32, i64} structs. Every struct field is extracted and written with
; a volatile store to a null addrspace(1) pointer, so the argument loads in
; the checked assembly expose the byte offset chosen for each field. In the
; gfx900 checks the four fields are read from offsets 0x0, 0x8, 0x18 and 0x20,
; i.e. the second struct starts at byte 24 instead of directly after the i8
; (which presumably occupies byte 16 - not loaded here, so not verified).
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
5233 5234; FIXME: Total argument size is computed wrong 5235define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) { 5236; SI-LABEL: struct_argument_alignment: 5237; SI: ; %bb.0: 5238; SI-NEXT: s_load_dword s8, s[0:1], 0x9 5239; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 5240; SI-NEXT: s_load_dword s9, s[0:1], 0xf 5241; SI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x11 5242; SI-NEXT: s_mov_b32 s0, 0 5243; SI-NEXT: s_mov_b32 s3, 0xf000 5244; SI-NEXT: s_mov_b32 s2, -1 5245; SI-NEXT: s_mov_b32 s1, s0 5246; SI-NEXT: s_waitcnt lgkmcnt(0) 5247; SI-NEXT: v_mov_b32_e32 v0, s8 5248; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 5249; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5250; SI-NEXT: v_mov_b32_e32 v0, s4 5251; SI-NEXT: v_mov_b32_e32 v1, s5 5252; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5253; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5254; SI-NEXT: v_mov_b32_e32 v0, s9 5255; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 5256; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5257; SI-NEXT: v_mov_b32_e32 v0, s6 5258; SI-NEXT: v_mov_b32_e32 v1, s7 5259; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5260; SI-NEXT: s_waitcnt vmcnt(0) 5261; SI-NEXT: s_endpgm 5262; 5263; VI-LABEL: struct_argument_alignment: 5264; VI: ; %bb.0: 5265; VI-NEXT: s_load_dword s4, s[0:1], 0x24 5266; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 5267; VI-NEXT: s_load_dword s5, s[0:1], 0x3c 5268; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44 5269; VI-NEXT: v_mov_b32_e32 v0, 0 5270; VI-NEXT: v_mov_b32_e32 v1, 0 5271; VI-NEXT: s_waitcnt lgkmcnt(0) 5272; VI-NEXT: v_mov_b32_e32 v2, s4 5273; VI-NEXT: flat_store_dword v[0:1], v2 5274; VI-NEXT: s_waitcnt vmcnt(0) 5275; VI-NEXT: v_mov_b32_e32 v2, s2 5276; VI-NEXT: v_mov_b32_e32 v3, s3 5277; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 5278; VI-NEXT: s_waitcnt vmcnt(0) 5279; VI-NEXT: v_mov_b32_e32 v2, s5 5280; VI-NEXT: flat_store_dword v[0:1], v2 5281; VI-NEXT: s_waitcnt vmcnt(0) 5282; VI-NEXT: v_mov_b32_e32 v3, s1 5283; 
VI-NEXT: v_mov_b32_e32 v2, s0 5284; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 5285; VI-NEXT: s_waitcnt vmcnt(0) 5286; VI-NEXT: s_endpgm 5287; 5288; GFX9-LABEL: struct_argument_alignment: 5289; GFX9: ; %bb.0: 5290; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 5291; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 5292; GFX9-NEXT: s_load_dword s7, s[4:5], 0x18 5293; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x20 5294; GFX9-NEXT: v_mov_b32_e32 v0, 0 5295; GFX9-NEXT: v_mov_b32_e32 v1, 0 5296; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5297; GFX9-NEXT: v_mov_b32_e32 v2, s6 5298; GFX9-NEXT: global_store_dword v[0:1], v2, off 5299; GFX9-NEXT: s_waitcnt vmcnt(0) 5300; GFX9-NEXT: v_mov_b32_e32 v3, s1 5301; GFX9-NEXT: v_mov_b32_e32 v2, s0 5302; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off 5303; GFX9-NEXT: s_waitcnt vmcnt(0) 5304; GFX9-NEXT: v_mov_b32_e32 v2, s7 5305; GFX9-NEXT: global_store_dword v[0:1], v2, off 5306; GFX9-NEXT: s_waitcnt vmcnt(0) 5307; GFX9-NEXT: v_mov_b32_e32 v2, s2 5308; GFX9-NEXT: v_mov_b32_e32 v3, s3 5309; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off 5310; GFX9-NEXT: s_waitcnt vmcnt(0) 5311; GFX9-NEXT: s_endpgm 5312; 5313; EG-LABEL: struct_argument_alignment: 5314; EG: ; %bb.0: 5315; EG-NEXT: ALU 9, @8, KC0[CB0:0-32], KC1[] 5316; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.X, T6.X, 0 5317; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0 5318; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T6.X, 0 5319; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T6.X, 0 5320; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T4.X, 0 5321; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T6.X, 1 5322; EG-NEXT: CF_END 5323; EG-NEXT: ALU clause starting at 8: 5324; EG-NEXT: MOV T0.X, KC0[4].Y, 5325; EG-NEXT: MOV * T1.X, KC0[4].Z, 5326; EG-NEXT: MOV T2.X, KC0[3].W, 5327; EG-NEXT: MOV * T3.X, KC0[2].W, 5328; EG-NEXT: MOV T4.X, literal.x, 5329; EG-NEXT: MOV * T5.X, KC0[3].X, 5330; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) 5331; EG-NEXT: MOV T6.X, literal.x, 5332; EG-NEXT: MOV * T7.X, KC0[2].Y, 
5333; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5334; 5335; CM-LABEL: struct_argument_alignment: 5336; CM: ; %bb.0: 5337; CM-NEXT: ALU 9, @8, KC0[CB0:0-32], KC1[] 5338; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7.X, T6.X 5339; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5.X, T4.X 5340; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T3.X, T6.X 5341; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T6.X 5342; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T4.X 5343; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T6.X 5344; CM-NEXT: CF_END 5345; CM-NEXT: ALU clause starting at 8: 5346; CM-NEXT: MOV * T0.X, KC0[4].Y, 5347; CM-NEXT: MOV * T1.X, KC0[4].Z, 5348; CM-NEXT: MOV * T2.X, KC0[3].W, 5349; CM-NEXT: MOV * T3.X, KC0[2].W, 5350; CM-NEXT: MOV * T4.X, literal.x, 5351; CM-NEXT: 1(1.401298e-45), 0(0.000000e+00) 5352; CM-NEXT: MOV * T5.X, KC0[3].X, 5353; CM-NEXT: MOV * T6.X, literal.x, 5354; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5355; CM-NEXT: MOV * T7.X, KC0[2].Y, 5356 %val0 = extractvalue {i32, i64} %arg0, 0 5357 %val1 = extractvalue {i32, i64} %arg0, 1 5358 %val2 = extractvalue {i32, i64} %arg1, 0 5359 %val3 = extractvalue {i32, i64} %arg1, 1 5360 store volatile i32 %val0, i32 addrspace(1)* null 5361 store volatile i64 %val1, i64 addrspace(1)* null 5362 store volatile i32 %val2, i32 addrspace(1)* null 5363 store volatile i64 %val3, i64 addrspace(1)* null 5364 ret void 5365} 5366 5367; No padding between i8 and next struct, but round up at end to 4 byte 5368; multiple. 
; The kernels on the following lines continue the kernel-argument layout
; checks. Each one writes its arguments back out with volatile stores, so the
; argument loads (and the offsets they use) stay visible in the checked
; assembly:
;   @packed_struct_argument_alignment - packed <{i32, i64}> structs around an
;       unnamed i8; the gfx900 checks read the second struct's two fields
;       from byte offsets 13 and 17.
;   @struct_argument_alignment_after  - two {i32, i64} structs, each followed
;       by an unnamed i8, then a <4 x i32>; the gfx900 checks load the vector
;       from offset 0x30.
;   @array_3xi32                      - an i16 followed by a [3 x i32].
;   @array_3xi16                      - an i8 followed by a [3 x i16].
;   @small_array_round_down_offset    - an unnamed i8 followed by a [1 x i8];
;       the gfx900 checks read the array element from byte offset 1.
;   @byref_align_constant_i32_arg     - a byref(i32) align(256) constant
;       address space argument followed by an i32; the gfx900 checks fetch
;       both values with one two-dword load at offset 0x100.
; All check lines are autogenerated by utils/update_llc_test_checks.py and
; should be regenerated with that script rather than edited by hand.
5369define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { 5370; SI-LABEL: packed_struct_argument_alignment: 5371; SI: ; %bb.0: 5372; SI-NEXT: s_mov_b32 s3, 0xf000 5373; SI-NEXT: s_mov_b32 s2, -1 5374; SI-NEXT: s_load_dword s6, s[0:1], 0x9 5375; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xa 5376; SI-NEXT: buffer_load_ubyte v4, off, s[0:3], 0 offset:49 5377; SI-NEXT: buffer_load_ubyte v5, off, s[0:3], 0 offset:50 5378; SI-NEXT: buffer_load_ubyte v6, off, s[0:3], 0 offset:51 5379; SI-NEXT: buffer_load_ubyte v7, off, s[0:3], 0 offset:52 5380; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 offset:53 5381; SI-NEXT: s_mov_b32 s0, 0 5382; SI-NEXT: s_mov_b32 s1, s0 5383; SI-NEXT: s_waitcnt lgkmcnt(0) 5384; SI-NEXT: v_mov_b32_e32 v2, s6 5385; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 5386; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5387; SI-NEXT: v_mov_b32_e32 v2, s4 5388; SI-NEXT: v_mov_b32_e32 v3, s5 5389; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 5390; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5391; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 5392; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 5393; SI-NEXT: v_or_b32_e32 v2, v2, v4 5394; SI-NEXT: v_or_b32_e32 v3, v3, v6 5395; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 5396; SI-NEXT: v_or_b32_e32 v2, v3, v2 5397; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 5398; SI-NEXT: s_waitcnt vmcnt(0) 5399; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5400; SI-NEXT: s_waitcnt vmcnt(0) 5401; SI-NEXT: s_endpgm 5402; 5403; VI-LABEL: packed_struct_argument_alignment: 5404; VI: ; %bb.0: 5405; VI-NEXT: s_add_u32 s2, s0, 49 5406; VI-NEXT: s_addc_u32 s3, s1, 0 5407; VI-NEXT: s_add_u32 s4, s0, 50 5408; VI-NEXT: s_addc_u32 s5, s1, 0 5409; VI-NEXT: v_mov_b32_e32 v2, s2 5410; VI-NEXT: v_mov_b32_e32 v3, s3 5411; VI-NEXT: s_add_u32 s2, s2, 3 5412; VI-NEXT: s_addc_u32 s3, s3, 0 5413; VI-NEXT: v_mov_b32_e32 v5, s3 5414; VI-NEXT: v_mov_b32_e32 v4, s2 5415; VI-NEXT: s_add_u32 s2, s0, 51 5416; VI-NEXT: 
s_addc_u32 s3, s1, 0 5417; VI-NEXT: v_mov_b32_e32 v0, s4 5418; VI-NEXT: v_mov_b32_e32 v7, s3 5419; VI-NEXT: v_mov_b32_e32 v1, s5 5420; VI-NEXT: v_mov_b32_e32 v6, s2 5421; VI-NEXT: flat_load_ubyte v8, v[0:1] 5422; VI-NEXT: flat_load_ubyte v9, v[2:3] 5423; VI-NEXT: flat_load_ubyte v10, v[4:5] 5424; VI-NEXT: flat_load_ubyte v6, v[6:7] 5425; VI-NEXT: s_add_u32 s2, s0, 53 5426; VI-NEXT: s_addc_u32 s3, s1, 0 5427; VI-NEXT: v_mov_b32_e32 v0, s2 5428; VI-NEXT: v_mov_b32_e32 v1, s3 5429; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 5430; VI-NEXT: s_load_dword s2, s[0:1], 0x24 5431; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x28 5432; VI-NEXT: v_mov_b32_e32 v2, 0 5433; VI-NEXT: v_mov_b32_e32 v3, 0 5434; VI-NEXT: s_waitcnt lgkmcnt(0) 5435; VI-NEXT: v_mov_b32_e32 v7, s2 5436; VI-NEXT: v_mov_b32_e32 v5, s1 5437; VI-NEXT: v_mov_b32_e32 v4, s0 5438; VI-NEXT: flat_store_dword v[2:3], v7 5439; VI-NEXT: s_waitcnt vmcnt(0) 5440; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5] 5441; VI-NEXT: s_waitcnt vmcnt(0) 5442; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 5443; VI-NEXT: v_or_b32_e32 v4, v4, v9 5444; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10 5445; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 5446; VI-NEXT: v_or_b32_e32 v4, v5, v4 5447; VI-NEXT: flat_store_dword v[2:3], v4 5448; VI-NEXT: s_waitcnt vmcnt(0) 5449; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 5450; VI-NEXT: s_waitcnt vmcnt(0) 5451; VI-NEXT: s_endpgm 5452; 5453; GFX9-LABEL: packed_struct_argument_alignment: 5454; GFX9: ; %bb.0: 5455; GFX9-NEXT: v_mov_b32_e32 v2, 0 5456; GFX9-NEXT: global_load_dword v6, v2, s[4:5] offset:13 5457; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:17 5458; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 5459; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4 5460; GFX9-NEXT: v_mov_b32_e32 v2, 0 5461; GFX9-NEXT: v_mov_b32_e32 v3, 0 5462; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5463; GFX9-NEXT: v_mov_b32_e32 v7, s2 5464; GFX9-NEXT: v_mov_b32_e32 v5, s1 5465; GFX9-NEXT: 
v_mov_b32_e32 v4, s0 5466; GFX9-NEXT: global_store_dword v[2:3], v7, off 5467; GFX9-NEXT: s_waitcnt vmcnt(0) 5468; GFX9-NEXT: global_store_dwordx2 v[2:3], v[4:5], off 5469; GFX9-NEXT: s_waitcnt vmcnt(0) 5470; GFX9-NEXT: global_store_dword v[2:3], v6, off 5471; GFX9-NEXT: s_waitcnt vmcnt(0) 5472; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 5473; GFX9-NEXT: s_waitcnt vmcnt(0) 5474; GFX9-NEXT: s_endpgm 5475; 5476; EG-LABEL: packed_struct_argument_alignment: 5477; EG: ; %bb.0: 5478; EG-NEXT: ALU 6, @18, KC0[CB0:0-32], KC1[] 5479; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 0 5480; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T1.X, 0 5481; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0 5482; EG-NEXT: ALU 2, @25, KC0[], KC1[] 5483; EG-NEXT: TEX 0 @12 5484; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0 5485; EG-NEXT: TEX 0 @14 5486; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T1.X, 0 5487; EG-NEXT: TEX 0 @16 5488; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 1 5489; EG-NEXT: CF_END 5490; EG-NEXT: Fetch clause starting at 12: 5491; EG-NEXT: VTX_READ_32 T0.X, T0.X, 49, #3 5492; EG-NEXT: Fetch clause starting at 14: 5493; EG-NEXT: VTX_READ_32 T2.X, T2.X, 57, #3 5494; EG-NEXT: Fetch clause starting at 16: 5495; EG-NEXT: VTX_READ_32 T4.X, T4.X, 53, #3 5496; EG-NEXT: ALU clause starting at 18: 5497; EG-NEXT: MOV T0.X, KC0[2].Z, 5498; EG-NEXT: MOV * T1.X, literal.x, 5499; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) 5500; EG-NEXT: MOV T2.X, KC0[2].W, 5501; EG-NEXT: MOV * T3.X, literal.x, 5502; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5503; EG-NEXT: MOV * T4.X, KC0[2].Y, 5504; EG-NEXT: ALU clause starting at 25: 5505; EG-NEXT: MOV T0.X, 0.0, 5506; EG-NEXT: MOV * T2.X, 0.0, 5507; EG-NEXT: MOV * T4.X, 0.0, 5508; 5509; CM-LABEL: packed_struct_argument_alignment: 5510; CM: ; %bb.0: 5511; CM-NEXT: ALU 6, @18, KC0[CB0:0-32], KC1[] 5512; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X 5513; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X 5514; CM-NEXT: 
MEM_RAT_CACHELESS STORE_DWORD T0.X, T3.X 5515; CM-NEXT: ALU 2, @25, KC0[], KC1[] 5516; CM-NEXT: TEX 0 @12 5517; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T3.X 5518; CM-NEXT: TEX 0 @14 5519; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X 5520; CM-NEXT: TEX 0 @16 5521; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X 5522; CM-NEXT: CF_END 5523; CM-NEXT: Fetch clause starting at 12: 5524; CM-NEXT: VTX_READ_32 T0.X, T0.X, 49, #3 5525; CM-NEXT: Fetch clause starting at 14: 5526; CM-NEXT: VTX_READ_32 T2.X, T2.X, 57, #3 5527; CM-NEXT: Fetch clause starting at 16: 5528; CM-NEXT: VTX_READ_32 T4.X, T4.X, 53, #3 5529; CM-NEXT: ALU clause starting at 18: 5530; CM-NEXT: MOV * T0.X, KC0[2].Z, 5531; CM-NEXT: MOV * T1.X, literal.x, 5532; CM-NEXT: 1(1.401298e-45), 0(0.000000e+00) 5533; CM-NEXT: MOV * T2.X, KC0[2].W, 5534; CM-NEXT: MOV * T3.X, literal.x, 5535; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5536; CM-NEXT: MOV * T4.X, KC0[2].Y, 5537; CM-NEXT: ALU clause starting at 25: 5538; CM-NEXT: MOV * T0.X, 0.0, 5539; CM-NEXT: MOV * T2.X, 0.0, 5540; CM-NEXT: MOV * T4.X, 0.0, 5541 %val0 = extractvalue <{i32, i64}> %arg0, 0 5542 %val1 = extractvalue <{i32, i64}> %arg0, 1 5543 %val2 = extractvalue <{i32, i64}> %arg1, 0 5544 %val3 = extractvalue <{i32, i64}> %arg1, 1 5545 store volatile i32 %val0, i32 addrspace(1)* null 5546 store volatile i64 %val1, i64 addrspace(1)* null 5547 store volatile i32 %val2, i32 addrspace(1)* null 5548 store volatile i64 %val3, i64 addrspace(1)* null 5549 ret void 5550} 5551 5552define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) { 5553; SI-LABEL: struct_argument_alignment_after: 5554; SI: ; %bb.0: 5555; SI-NEXT: s_load_dword s12, s[0:1], 0x9 5556; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb 5557; SI-NEXT: s_load_dword s13, s[0:1], 0xf 5558; SI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x11 5559; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x15 5560; SI-NEXT: s_mov_b32 s4, 0 5561; SI-NEXT: 
s_mov_b32 s7, 0xf000 5562; SI-NEXT: s_mov_b32 s6, -1 5563; SI-NEXT: s_mov_b32 s5, s4 5564; SI-NEXT: s_waitcnt lgkmcnt(0) 5565; SI-NEXT: v_mov_b32_e32 v0, s12 5566; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 5567; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5568; SI-NEXT: v_mov_b32_e32 v0, s8 5569; SI-NEXT: v_mov_b32_e32 v1, s9 5570; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5571; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5572; SI-NEXT: v_mov_b32_e32 v0, s13 5573; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 5574; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5575; SI-NEXT: v_mov_b32_e32 v0, s10 5576; SI-NEXT: v_mov_b32_e32 v1, s11 5577; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5578; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5579; SI-NEXT: v_mov_b32_e32 v0, s0 5580; SI-NEXT: v_mov_b32_e32 v1, s1 5581; SI-NEXT: v_mov_b32_e32 v2, s2 5582; SI-NEXT: v_mov_b32_e32 v3, s3 5583; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5584; SI-NEXT: s_waitcnt vmcnt(0) 5585; SI-NEXT: s_endpgm 5586; 5587; VI-LABEL: struct_argument_alignment_after: 5588; VI: ; %bb.0: 5589; VI-NEXT: s_load_dword s8, s[0:1], 0x24 5590; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5591; VI-NEXT: s_load_dword s9, s[0:1], 0x3c 5592; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x44 5593; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 5594; VI-NEXT: v_mov_b32_e32 v4, 0 5595; VI-NEXT: v_mov_b32_e32 v5, 0 5596; VI-NEXT: s_waitcnt lgkmcnt(0) 5597; VI-NEXT: v_mov_b32_e32 v0, s8 5598; VI-NEXT: flat_store_dword v[4:5], v0 5599; VI-NEXT: s_waitcnt vmcnt(0) 5600; VI-NEXT: v_mov_b32_e32 v0, s4 5601; VI-NEXT: v_mov_b32_e32 v1, s5 5602; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] 5603; VI-NEXT: s_waitcnt vmcnt(0) 5604; VI-NEXT: v_mov_b32_e32 v0, s9 5605; VI-NEXT: flat_store_dword v[4:5], v0 5606; VI-NEXT: s_waitcnt vmcnt(0) 5607; VI-NEXT: v_mov_b32_e32 v0, s6 5608; VI-NEXT: v_mov_b32_e32 v1, s7 5609; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] 5610; VI-NEXT: s_waitcnt vmcnt(0) 5611; VI-NEXT: v_mov_b32_e32 v0, s0 
5612; VI-NEXT: v_mov_b32_e32 v1, s1 5613; VI-NEXT: v_mov_b32_e32 v2, s2 5614; VI-NEXT: v_mov_b32_e32 v3, s3 5615; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 5616; VI-NEXT: s_waitcnt vmcnt(0) 5617; VI-NEXT: s_endpgm 5618; 5619; GFX9-LABEL: struct_argument_alignment_after: 5620; GFX9: ; %bb.0: 5621; GFX9-NEXT: s_load_dword s10, s[4:5], 0x0 5622; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 5623; GFX9-NEXT: s_load_dword s11, s[4:5], 0x18 5624; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 5625; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x30 5626; GFX9-NEXT: v_mov_b32_e32 v4, 0 5627; GFX9-NEXT: v_mov_b32_e32 v5, 0 5628; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5629; GFX9-NEXT: v_mov_b32_e32 v0, s10 5630; GFX9-NEXT: global_store_dword v[4:5], v0, off 5631; GFX9-NEXT: s_waitcnt vmcnt(0) 5632; GFX9-NEXT: v_mov_b32_e32 v0, s6 5633; GFX9-NEXT: v_mov_b32_e32 v1, s7 5634; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off 5635; GFX9-NEXT: s_waitcnt vmcnt(0) 5636; GFX9-NEXT: v_mov_b32_e32 v0, s11 5637; GFX9-NEXT: global_store_dword v[4:5], v0, off 5638; GFX9-NEXT: s_waitcnt vmcnt(0) 5639; GFX9-NEXT: v_mov_b32_e32 v0, s8 5640; GFX9-NEXT: v_mov_b32_e32 v1, s9 5641; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off 5642; GFX9-NEXT: s_waitcnt vmcnt(0) 5643; GFX9-NEXT: v_mov_b32_e32 v0, s0 5644; GFX9-NEXT: v_mov_b32_e32 v1, s1 5645; GFX9-NEXT: v_mov_b32_e32 v2, s2 5646; GFX9-NEXT: v_mov_b32_e32 v3, s3 5647; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 5648; GFX9-NEXT: s_waitcnt vmcnt(0) 5649; GFX9-NEXT: s_endpgm 5650; 5651; EG-LABEL: struct_argument_alignment_after: 5652; EG: ; %bb.0: 5653; EG-NEXT: ALU 13, @10, KC0[CB0:0-32], KC1[] 5654; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.X, T7.X, 0 5655; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T5.X, 0 5656; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T7.X, 0 5657; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T7.X, 0 5658; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T5.X, 0 5659; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T7.X, 0 5660; 
EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T7.X, 1 5661; EG-NEXT: CF_END 5662; EG-NEXT: PAD 5663; EG-NEXT: ALU clause starting at 10: 5664; EG-NEXT: MOV * T0.W, KC0[6].X, 5665; EG-NEXT: MOV * T0.Z, KC0[5].W, 5666; EG-NEXT: MOV * T0.Y, KC0[5].Z, 5667; EG-NEXT: MOV T0.X, KC0[5].Y, 5668; EG-NEXT: MOV * T1.X, KC0[4].Y, 5669; EG-NEXT: MOV T2.X, KC0[4].Z, 5670; EG-NEXT: MOV * T3.X, KC0[3].W, 5671; EG-NEXT: MOV T4.X, KC0[2].W, 5672; EG-NEXT: MOV * T5.X, literal.x, 5673; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) 5674; EG-NEXT: MOV T6.X, KC0[3].X, 5675; EG-NEXT: MOV * T7.X, literal.x, 5676; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5677; EG-NEXT: MOV * T8.X, KC0[2].Y, 5678; 5679; CM-LABEL: struct_argument_alignment_after: 5680; CM: ; %bb.0: 5681; CM-NEXT: ALU 13, @10, KC0[CB0:0-32], KC1[] 5682; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T8.X, T7.X 5683; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T5.X 5684; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T7.X 5685; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T3.X, T7.X 5686; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T5.X 5687; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T7.X 5688; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T7.X 5689; CM-NEXT: CF_END 5690; CM-NEXT: PAD 5691; CM-NEXT: ALU clause starting at 10: 5692; CM-NEXT: MOV * T0.W, KC0[6].X, 5693; CM-NEXT: MOV * T0.Z, KC0[5].W, 5694; CM-NEXT: MOV * T0.Y, KC0[5].Z, 5695; CM-NEXT: MOV * T0.X, KC0[5].Y, 5696; CM-NEXT: MOV * T1.X, KC0[4].Y, 5697; CM-NEXT: MOV * T2.X, KC0[4].Z, 5698; CM-NEXT: MOV * T3.X, KC0[3].W, 5699; CM-NEXT: MOV * T4.X, KC0[2].W, 5700; CM-NEXT: MOV * T5.X, literal.x, 5701; CM-NEXT: 1(1.401298e-45), 0(0.000000e+00) 5702; CM-NEXT: MOV * T6.X, KC0[3].X, 5703; CM-NEXT: MOV * T7.X, literal.x, 5704; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5705; CM-NEXT: MOV * T8.X, KC0[2].Y, 5706 %val0 = extractvalue {i32, i64} %arg0, 0 5707 %val1 = extractvalue {i32, i64} %arg0, 1 5708 %val2 = extractvalue {i32, i64} %arg2, 0 5709 %val3 = extractvalue {i32, i64} %arg2, 1 5710 
store volatile i32 %val0, i32 addrspace(1)* null 5711 store volatile i64 %val1, i64 addrspace(1)* null 5712 store volatile i32 %val2, i32 addrspace(1)* null 5713 store volatile i64 %val3, i64 addrspace(1)* null 5714 store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null 5715 ret void 5716} 5717 5718define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { 5719; SI-LABEL: array_3xi32: 5720; SI: ; %bb.0: 5721; SI-NEXT: s_load_dword s4, s[0:1], 0xc 5722; SI-NEXT: s_load_dword s5, s[0:1], 0x9 5723; SI-NEXT: s_load_dword s6, s[0:1], 0xa 5724; SI-NEXT: s_load_dword s0, s[0:1], 0xb 5725; SI-NEXT: s_mov_b32 s3, 0xf000 5726; SI-NEXT: s_mov_b32 s2, -1 5727; SI-NEXT: s_waitcnt lgkmcnt(0) 5728; SI-NEXT: v_mov_b32_e32 v0, s5 5729; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 5730; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5731; SI-NEXT: v_mov_b32_e32 v0, s4 5732; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 5733; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5734; SI-NEXT: v_mov_b32_e32 v0, s0 5735; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 5736; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5737; SI-NEXT: v_mov_b32_e32 v0, s6 5738; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 5739; SI-NEXT: s_waitcnt vmcnt(0) 5740; SI-NEXT: s_endpgm 5741; 5742; VI-LABEL: array_3xi32: 5743; VI: ; %bb.0: 5744; VI-NEXT: s_load_dword s2, s[0:1], 0x24 5745; VI-NEXT: s_load_dword s3, s[0:1], 0x30 5746; VI-NEXT: s_load_dword s4, s[0:1], 0x28 5747; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 5748; VI-NEXT: s_waitcnt lgkmcnt(0) 5749; VI-NEXT: v_mov_b32_e32 v0, s2 5750; VI-NEXT: v_mov_b32_e32 v1, s3 5751; VI-NEXT: flat_store_short v[0:1], v0 5752; VI-NEXT: s_waitcnt vmcnt(0) 5753; VI-NEXT: flat_store_dword v[0:1], v1 5754; VI-NEXT: s_waitcnt vmcnt(0) 5755; VI-NEXT: v_mov_b32_e32 v0, s0 5756; VI-NEXT: flat_store_dword v[0:1], v0 5757; VI-NEXT: s_waitcnt vmcnt(0) 5758; VI-NEXT: v_mov_b32_e32 v0, s4 5759; VI-NEXT: flat_store_dword v[0:1], v0 5760; VI-NEXT: s_waitcnt vmcnt(0) 5761; VI-NEXT: s_endpgm 
5762; 5763; GFX9-LABEL: array_3xi32: 5764; GFX9: ; %bb.0: 5765; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 5766; GFX9-NEXT: s_load_dword s1, s[4:5], 0xc 5767; GFX9-NEXT: s_load_dword s2, s[4:5], 0x4 5768; GFX9-NEXT: s_load_dword s3, s[4:5], 0x8 5769; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5770; GFX9-NEXT: v_mov_b32_e32 v0, s0 5771; GFX9-NEXT: v_mov_b32_e32 v1, s1 5772; GFX9-NEXT: global_store_short v[0:1], v0, off 5773; GFX9-NEXT: s_waitcnt vmcnt(0) 5774; GFX9-NEXT: global_store_dword v[0:1], v1, off 5775; GFX9-NEXT: s_waitcnt vmcnt(0) 5776; GFX9-NEXT: v_mov_b32_e32 v0, s3 5777; GFX9-NEXT: global_store_dword v[0:1], v0, off 5778; GFX9-NEXT: s_waitcnt vmcnt(0) 5779; GFX9-NEXT: v_mov_b32_e32 v0, s2 5780; GFX9-NEXT: global_store_dword v[0:1], v0, off 5781; GFX9-NEXT: s_waitcnt vmcnt(0) 5782; GFX9-NEXT: s_endpgm 5783; 5784; EG-LABEL: array_3xi32: 5785; EG: ; %bb.0: 5786; EG-NEXT: ALU 0, @10, KC0[], KC1[] 5787; EG-NEXT: TEX 0 @8 5788; EG-NEXT: ALU 9, @11, KC0[CB0:0-32], KC1[] 5789; EG-NEXT: MEM_RAT MSKOR T0.XW, T4.X 5790; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T4.X, 0 5791; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T4.X, 0 5792; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T4.X, 1 5793; EG-NEXT: CF_END 5794; EG-NEXT: Fetch clause starting at 8: 5795; EG-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3 5796; EG-NEXT: ALU clause starting at 10: 5797; EG-NEXT: MOV * T0.X, 0.0, 5798; EG-NEXT: ALU clause starting at 11: 5799; EG-NEXT: AND_INT T0.X, T0.X, literal.x, 5800; EG-NEXT: MOV * T0.W, literal.x, 5801; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 5802; EG-NEXT: MOV T0.Y, 0.0, 5803; EG-NEXT: MOV * T0.Z, 0.0, 5804; EG-NEXT: MOV T1.X, KC0[2].Z, 5805; EG-NEXT: MOV * T2.X, KC0[2].W, 5806; EG-NEXT: MOV T3.X, KC0[3].X, 5807; EG-NEXT: MOV * T4.X, literal.x, 5808; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5809; 5810; CM-LABEL: array_3xi32: 5811; CM: ; %bb.0: 5812; CM-NEXT: ALU 0, @10, KC0[], KC1[] 5813; CM-NEXT: TEX 0 @8 5814; CM-NEXT: ALU 9, @11, KC0[CB0:0-32], KC1[] 5815; CM-NEXT: MEM_RAT 
MSKOR T0.XW, T4.X 5816; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T3.X, T4.X 5817; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T4.X 5818; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T4.X 5819; CM-NEXT: CF_END 5820; CM-NEXT: Fetch clause starting at 8: 5821; CM-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3 5822; CM-NEXT: ALU clause starting at 10: 5823; CM-NEXT: MOV * T0.X, 0.0, 5824; CM-NEXT: ALU clause starting at 11: 5825; CM-NEXT: AND_INT T0.X, T0.X, literal.x, 5826; CM-NEXT: MOV * T0.W, literal.x, 5827; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 5828; CM-NEXT: MOV T0.Y, 0.0, 5829; CM-NEXT: MOV * T0.Z, 0.0, 5830; CM-NEXT: MOV * T1.X, KC0[2].Z, 5831; CM-NEXT: MOV * T2.X, KC0[2].W, 5832; CM-NEXT: MOV * T3.X, KC0[3].X, 5833; CM-NEXT: MOV * T4.X, literal.x, 5834; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5835 store volatile i16 %arg0, i16 addrspace(1)* undef 5836 store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef 5837 ret void 5838} 5839 5840; FIXME: Why not all scalar loads? 5841define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { 5842; SI-LABEL: array_3xi16: 5843; SI: ; %bb.0: 5844; SI-NEXT: s_load_dword s4, s[0:1], 0x9 5845; SI-NEXT: s_mov_b32 s3, 0xf000 5846; SI-NEXT: s_mov_b32 s2, -1 5847; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:42 5848; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:40 5849; SI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:38 5850; SI-NEXT: s_waitcnt lgkmcnt(0) 5851; SI-NEXT: v_mov_b32_e32 v3, s4 5852; SI-NEXT: buffer_store_byte v3, off, s[0:3], 0 5853; SI-NEXT: s_waitcnt vmcnt(0) 5854; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 5855; SI-NEXT: s_waitcnt vmcnt(0) 5856; SI-NEXT: buffer_store_short v1, off, s[0:3], 0 5857; SI-NEXT: s_waitcnt vmcnt(0) 5858; SI-NEXT: buffer_store_short v2, off, s[0:3], 0 5859; SI-NEXT: s_waitcnt vmcnt(0) 5860; SI-NEXT: s_endpgm 5861; 5862; VI-LABEL: array_3xi16: 5863; VI: ; %bb.0: 5864; VI-NEXT: s_add_u32 s2, s0, 38 5865; VI-NEXT: s_addc_u32 s3, s1, 0 5866; VI-NEXT: 
s_add_u32 s4, s2, 2 5867; VI-NEXT: s_addc_u32 s5, s3, 0 5868; VI-NEXT: v_mov_b32_e32 v0, s2 5869; VI-NEXT: v_mov_b32_e32 v1, s3 5870; VI-NEXT: s_add_u32 s2, s0, 42 5871; VI-NEXT: s_addc_u32 s3, s1, 0 5872; VI-NEXT: v_mov_b32_e32 v2, s2 5873; VI-NEXT: v_mov_b32_e32 v3, s3 5874; VI-NEXT: flat_load_ushort v4, v[0:1] 5875; VI-NEXT: flat_load_ushort v2, v[2:3] 5876; VI-NEXT: v_mov_b32_e32 v0, s4 5877; VI-NEXT: v_mov_b32_e32 v1, s5 5878; VI-NEXT: flat_load_ushort v0, v[0:1] 5879; VI-NEXT: s_load_dword s0, s[0:1], 0x24 5880; VI-NEXT: s_waitcnt lgkmcnt(0) 5881; VI-NEXT: v_mov_b32_e32 v1, s0 5882; VI-NEXT: s_waitcnt vmcnt(0) 5883; VI-NEXT: flat_store_byte v[0:1], v1 5884; VI-NEXT: s_waitcnt vmcnt(0) 5885; VI-NEXT: flat_store_short v[0:1], v2 5886; VI-NEXT: s_waitcnt vmcnt(0) 5887; VI-NEXT: flat_store_short v[0:1], v4 5888; VI-NEXT: s_waitcnt vmcnt(0) 5889; VI-NEXT: flat_store_short v[0:1], v0 5890; VI-NEXT: s_waitcnt vmcnt(0) 5891; VI-NEXT: s_endpgm 5892; 5893; GFX9-LABEL: array_3xi16: 5894; GFX9: ; %bb.0: 5895; GFX9-NEXT: v_mov_b32_e32 v0, 0 5896; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:6 5897; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] offset:4 5898; GFX9-NEXT: global_load_ushort v3, v0, s[4:5] offset:2 5899; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 5900; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5901; GFX9-NEXT: v_mov_b32_e32 v0, s0 5902; GFX9-NEXT: s_waitcnt vmcnt(2) 5903; GFX9-NEXT: global_store_byte v[0:1], v0, off 5904; GFX9-NEXT: s_waitcnt vmcnt(0) 5905; GFX9-NEXT: global_store_short v[0:1], v1, off 5906; GFX9-NEXT: s_waitcnt vmcnt(0) 5907; GFX9-NEXT: global_store_short v[0:1], v2, off 5908; GFX9-NEXT: s_waitcnt vmcnt(0) 5909; GFX9-NEXT: global_store_short v[0:1], v3, off 5910; GFX9-NEXT: s_waitcnt vmcnt(0) 5911; GFX9-NEXT: s_endpgm 5912; 5913; EG-LABEL: array_3xi16: 5914; EG: ; %bb.0: 5915; EG-NEXT: ALU 0, @20, KC0[], KC1[] 5916; EG-NEXT: TEX 1 @12 5917; EG-NEXT: ALU 11, @21, KC0[], KC1[] 5918; EG-NEXT: MEM_RAT MSKOR T1.XW, T3.X 5919; EG-NEXT: MEM_RAT MSKOR 
T2.XW, T3.X 5920; EG-NEXT: TEX 0 @16 5921; EG-NEXT: ALU 3, @33, KC0[], KC1[] 5922; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X 5923; EG-NEXT: TEX 0 @18 5924; EG-NEXT: ALU 3, @37, KC0[], KC1[] 5925; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X 5926; EG-NEXT: CF_END 5927; EG-NEXT: Fetch clause starting at 12: 5928; EG-NEXT: VTX_READ_8 T1.X, T0.X, 36, #3 5929; EG-NEXT: VTX_READ_16 T2.X, T0.X, 42, #3 5930; EG-NEXT: Fetch clause starting at 16: 5931; EG-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3 5932; EG-NEXT: Fetch clause starting at 18: 5933; EG-NEXT: VTX_READ_16 T0.X, T0.X, 38, #3 5934; EG-NEXT: ALU clause starting at 20: 5935; EG-NEXT: MOV * T0.X, 0.0, 5936; EG-NEXT: ALU clause starting at 21: 5937; EG-NEXT: AND_INT T1.X, T1.X, literal.x, 5938; EG-NEXT: MOV * T1.W, literal.x, 5939; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 5940; EG-NEXT: MOV * T1.Y, 0.0, 5941; EG-NEXT: AND_INT T2.X, T2.X, literal.x, 5942; EG-NEXT: MOV * T2.W, literal.x, 5943; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 5944; EG-NEXT: MOV T2.Y, 0.0, 5945; EG-NEXT: MOV T1.Z, 0.0, 5946; EG-NEXT: MOV * T2.Z, 0.0, 5947; EG-NEXT: MOV * T3.X, literal.x, 5948; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5949; EG-NEXT: ALU clause starting at 33: 5950; EG-NEXT: AND_INT T2.X, T1.X, literal.x, 5951; EG-NEXT: MOV T2.Y, 0.0, 5952; EG-NEXT: MOV * T2.Z, 0.0, 5953; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 5954; EG-NEXT: ALU clause starting at 37: 5955; EG-NEXT: AND_INT T2.X, T0.X, literal.x, 5956; EG-NEXT: MOV T2.Y, 0.0, 5957; EG-NEXT: MOV * T2.Z, 0.0, 5958; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 5959; 5960; CM-LABEL: array_3xi16: 5961; CM: ; %bb.0: 5962; CM-NEXT: ALU 0, @20, KC0[], KC1[] 5963; CM-NEXT: TEX 1 @12 5964; CM-NEXT: ALU 11, @21, KC0[], KC1[] 5965; CM-NEXT: MEM_RAT MSKOR T1.XW, T3.X 5966; CM-NEXT: MEM_RAT MSKOR T2.XW, T3.X 5967; CM-NEXT: TEX 0 @16 5968; CM-NEXT: ALU 3, @33, KC0[], KC1[] 5969; CM-NEXT: MEM_RAT MSKOR T2.XW, T3.X 5970; CM-NEXT: TEX 0 @18 5971; CM-NEXT: ALU 3, @37, KC0[], KC1[] 5972; CM-NEXT: MEM_RAT 
MSKOR T2.XW, T3.X 5973; CM-NEXT: CF_END 5974; CM-NEXT: Fetch clause starting at 12: 5975; CM-NEXT: VTX_READ_8 T1.X, T0.X, 36, #3 5976; CM-NEXT: VTX_READ_16 T2.X, T0.X, 42, #3 5977; CM-NEXT: Fetch clause starting at 16: 5978; CM-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3 5979; CM-NEXT: Fetch clause starting at 18: 5980; CM-NEXT: VTX_READ_16 T0.X, T0.X, 38, #3 5981; CM-NEXT: ALU clause starting at 20: 5982; CM-NEXT: MOV * T0.X, 0.0, 5983; CM-NEXT: ALU clause starting at 21: 5984; CM-NEXT: AND_INT T1.X, T1.X, literal.x, 5985; CM-NEXT: MOV * T1.W, literal.x, 5986; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 5987; CM-NEXT: MOV * T1.Y, 0.0, 5988; CM-NEXT: AND_INT T2.X, T2.X, literal.x, 5989; CM-NEXT: MOV * T2.W, literal.x, 5990; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 5991; CM-NEXT: MOV T2.Y, 0.0, 5992; CM-NEXT: MOV * T1.Z, 0.0, 5993; CM-NEXT: MOV * T2.Z, 0.0, 5994; CM-NEXT: MOV * T3.X, literal.x, 5995; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5996; CM-NEXT: ALU clause starting at 33: 5997; CM-NEXT: AND_INT T2.X, T1.X, literal.x, 5998; CM-NEXT: MOV T2.Y, 0.0, 5999; CM-NEXT: MOV * T2.Z, 0.0, 6000; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 6001; CM-NEXT: ALU clause starting at 37: 6002; CM-NEXT: AND_INT T2.X, T0.X, literal.x, 6003; CM-NEXT: MOV T2.Y, 0.0, 6004; CM-NEXT: MOV * T2.Z, 0.0, 6005; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 6006 store volatile i8 %arg0, i8 addrspace(1)* undef 6007 store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef 6008 ret void 6009} 6010 6011define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { 6012; SI-LABEL: small_array_round_down_offset: 6013; SI: ; %bb.0: 6014; SI-NEXT: s_mov_b32 s3, 0xf000 6015; SI-NEXT: s_mov_b32 s2, -1 6016; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:37 6017; SI-NEXT: s_waitcnt vmcnt(0) 6018; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 6019; SI-NEXT: s_waitcnt vmcnt(0) 6020; SI-NEXT: s_endpgm 6021; 6022; VI-LABEL: small_array_round_down_offset: 6023; VI: ; %bb.0: 
6024; VI-NEXT: s_add_u32 s0, s0, 37 6025; VI-NEXT: s_addc_u32 s1, s1, 0 6026; VI-NEXT: v_mov_b32_e32 v0, s0 6027; VI-NEXT: v_mov_b32_e32 v1, s1 6028; VI-NEXT: flat_load_ubyte v0, v[0:1] 6029; VI-NEXT: s_waitcnt vmcnt(0) 6030; VI-NEXT: flat_store_byte v[0:1], v0 6031; VI-NEXT: s_waitcnt vmcnt(0) 6032; VI-NEXT: s_endpgm 6033; 6034; GFX9-LABEL: small_array_round_down_offset: 6035; GFX9: ; %bb.0: 6036; GFX9-NEXT: v_mov_b32_e32 v0, 0 6037; GFX9-NEXT: global_load_ubyte v0, v0, s[4:5] offset:1 6038; GFX9-NEXT: s_waitcnt vmcnt(0) 6039; GFX9-NEXT: global_store_byte v[0:1], v0, off 6040; GFX9-NEXT: s_waitcnt vmcnt(0) 6041; GFX9-NEXT: s_endpgm 6042; 6043; EGCM-LABEL: small_array_round_down_offset: 6044; EGCM: ; %bb.0: 6045; EGCM-NEXT: ALU 0, @8, KC0[], KC1[] 6046; EGCM-NEXT: TEX 0 @6 6047; EGCM-NEXT: ALU 6, @9, KC0[], KC1[] 6048; EGCM-NEXT: MEM_RAT MSKOR T0.XW, T1.X 6049; EGCM-NEXT: CF_END 6050; EGCM-NEXT: PAD 6051; EGCM-NEXT: Fetch clause starting at 6: 6052; EGCM-NEXT: VTX_READ_8 T0.X, T0.X, 37, #3 6053; EGCM-NEXT: ALU clause starting at 8: 6054; EGCM-NEXT: MOV * T0.X, 0.0, 6055; EGCM-NEXT: ALU clause starting at 9: 6056; EGCM-NEXT: AND_INT T0.X, T0.X, literal.x, 6057; EGCM-NEXT: MOV * T0.W, literal.x, 6058; EGCM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 6059; EGCM-NEXT: MOV T0.Y, 0.0, 6060; EGCM-NEXT: MOV * T0.Z, 0.0, 6061; EGCM-NEXT: MOV * T1.X, literal.x, 6062; EGCM-NEXT: 0(0.000000e+00), 0(0.000000e+00) 6063 %val = extractvalue [1 x i8] %arg, 0 6064 store volatile i8 %val, i8 addrspace(1)* undef 6065 ret void 6066} 6067 6068define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) { 6069; SI-LABEL: byref_align_constant_i32_arg: 6070; SI: ; %bb.0: 6071; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x49 6072; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6073; SI-NEXT: s_mov_b32 s3, 0xf000 6074; SI-NEXT: s_mov_b32 s2, -1 6075; SI-NEXT: s_waitcnt lgkmcnt(0) 6076; SI-NEXT: 
v_mov_b32_e32 v0, s4 6077; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 6078; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 6079; SI-NEXT: v_mov_b32_e32 v0, s5 6080; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 6081; SI-NEXT: s_waitcnt vmcnt(0) 6082; SI-NEXT: s_endpgm 6083; 6084; VI-LABEL: byref_align_constant_i32_arg: 6085; VI: ; %bb.0: 6086; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6087; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x124 6088; VI-NEXT: s_waitcnt lgkmcnt(0) 6089; VI-NEXT: v_mov_b32_e32 v0, s2 6090; VI-NEXT: v_mov_b32_e32 v1, s3 6091; VI-NEXT: v_mov_b32_e32 v2, s0 6092; VI-NEXT: v_mov_b32_e32 v3, s1 6093; VI-NEXT: flat_store_dword v[0:1], v2 6094; VI-NEXT: s_waitcnt vmcnt(0) 6095; VI-NEXT: flat_store_dword v[0:1], v3 6096; VI-NEXT: s_waitcnt vmcnt(0) 6097; VI-NEXT: s_endpgm 6098; 6099; GFX9-LABEL: byref_align_constant_i32_arg: 6100; GFX9: ; %bb.0: 6101; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 6102; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 6103; GFX9-NEXT: v_mov_b32_e32 v0, 0 6104; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6105; GFX9-NEXT: v_mov_b32_e32 v1, s0 6106; GFX9-NEXT: v_mov_b32_e32 v2, s1 6107; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 6108; GFX9-NEXT: s_waitcnt vmcnt(0) 6109; GFX9-NEXT: global_store_dword v0, v2, s[2:3] 6110; GFX9-NEXT: s_waitcnt vmcnt(0) 6111; GFX9-NEXT: s_endpgm 6112; 6113; EG-LABEL: byref_align_constant_i32_arg: 6114; EG: ; %bb.0: 6115; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 6116; EG-NEXT: TEX 0 @6 6117; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 6118; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 0 6119; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 1 6120; EG-NEXT: CF_END 6121; EG-NEXT: Fetch clause starting at 6: 6122; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 6123; EG-NEXT: ALU clause starting at 8: 6124; EG-NEXT: MOV * T0.X, KC0[18].Y, 6125; EG-NEXT: ALU clause starting at 9: 6126; EG-NEXT: MOV T1.X, KC0[18].Z, 6127; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x, 6128; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 
6129; 6130; CM-LABEL: byref_align_constant_i32_arg: 6131; CM: ; %bb.0: 6132; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 6133; CM-NEXT: TEX 0 @6 6134; CM-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 6135; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T2.X 6136; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X 6137; CM-NEXT: CF_END 6138; CM-NEXT: Fetch clause starting at 6: 6139; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 6140; CM-NEXT: ALU clause starting at 8: 6141; CM-NEXT: MOV * T0.X, KC0[18].Y, 6142; CM-NEXT: ALU clause starting at 9: 6143; CM-NEXT: MOV * T1.X, KC0[18].Z, 6144; CM-NEXT: LSHR * T2.X, KC0[2].Y, literal.x, 6145; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 6146 %in = load i32, i32 addrspace(4)* %in.byref 6147 store volatile i32 %in, i32 addrspace(1)* %out, align 4 6148 store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 6149 ret void 6150} 6151 6152define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace(1)* nocapture %out, i8, <16 x i32> addrspace(4)* byref(<16 x i32>) %in.byref, i32 %after.offset) { 6153; SI-LABEL: byref_natural_align_constant_v16i32_arg: 6154; SI: ; %bb.0: 6155; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 6156; SI-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x9 6157; SI-NEXT: s_load_dword s0, s[0:1], 0x29 6158; SI-NEXT: s_mov_b32 s23, 0xf000 6159; SI-NEXT: s_mov_b32 s22, -1 6160; SI-NEXT: s_waitcnt lgkmcnt(0) 6161; SI-NEXT: v_mov_b32_e32 v0, s16 6162; SI-NEXT: v_mov_b32_e32 v1, s17 6163; SI-NEXT: v_mov_b32_e32 v2, s18 6164; SI-NEXT: v_mov_b32_e32 v3, s19 6165; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:48 6166; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 6167; SI-NEXT: v_mov_b32_e32 v0, s12 6168; SI-NEXT: v_mov_b32_e32 v1, s13 6169; SI-NEXT: v_mov_b32_e32 v2, s14 6170; SI-NEXT: v_mov_b32_e32 v3, s15 6171; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:32 6172; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 6173; SI-NEXT: v_mov_b32_e32 v0, s8 6174; SI-NEXT: v_mov_b32_e32 v1, s9 6175; 
SI-NEXT: v_mov_b32_e32 v2, s10 6176; SI-NEXT: v_mov_b32_e32 v3, s11 6177; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:16 6178; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 6179; SI-NEXT: v_mov_b32_e32 v0, s4 6180; SI-NEXT: v_mov_b32_e32 v1, s5 6181; SI-NEXT: v_mov_b32_e32 v2, s6 6182; SI-NEXT: v_mov_b32_e32 v3, s7 6183; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 6184; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 6185; SI-NEXT: v_mov_b32_e32 v0, s0 6186; SI-NEXT: buffer_store_dword v0, off, s[20:23], 0 6187; SI-NEXT: s_waitcnt vmcnt(0) 6188; SI-NEXT: s_endpgm 6189; 6190; VI-LABEL: byref_natural_align_constant_v16i32_arg: 6191; VI: ; %bb.0: 6192; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 6193; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6194; VI-NEXT: s_load_dword s20, s[0:1], 0xa4 6195; VI-NEXT: s_waitcnt lgkmcnt(0) 6196; VI-NEXT: v_mov_b32_e32 v0, s16 6197; VI-NEXT: s_add_u32 s0, s2, 48 6198; VI-NEXT: s_addc_u32 s1, s3, 0 6199; VI-NEXT: v_mov_b32_e32 v5, s1 6200; VI-NEXT: v_mov_b32_e32 v4, s0 6201; VI-NEXT: s_add_u32 s0, s2, 32 6202; VI-NEXT: v_mov_b32_e32 v1, s17 6203; VI-NEXT: v_mov_b32_e32 v2, s18 6204; VI-NEXT: v_mov_b32_e32 v3, s19 6205; VI-NEXT: s_addc_u32 s1, s3, 0 6206; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 6207; VI-NEXT: s_waitcnt vmcnt(0) 6208; VI-NEXT: v_mov_b32_e32 v5, s1 6209; VI-NEXT: v_mov_b32_e32 v4, s0 6210; VI-NEXT: s_add_u32 s0, s2, 16 6211; VI-NEXT: v_mov_b32_e32 v0, s12 6212; VI-NEXT: v_mov_b32_e32 v1, s13 6213; VI-NEXT: v_mov_b32_e32 v2, s14 6214; VI-NEXT: v_mov_b32_e32 v3, s15 6215; VI-NEXT: s_addc_u32 s1, s3, 0 6216; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 6217; VI-NEXT: s_waitcnt vmcnt(0) 6218; VI-NEXT: v_mov_b32_e32 v5, s1 6219; VI-NEXT: v_mov_b32_e32 v0, s8 6220; VI-NEXT: v_mov_b32_e32 v1, s9 6221; VI-NEXT: v_mov_b32_e32 v2, s10 6222; VI-NEXT: v_mov_b32_e32 v3, s11 6223; VI-NEXT: v_mov_b32_e32 v4, s0 6224; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 6225; VI-NEXT: s_waitcnt vmcnt(0) 6226; VI-NEXT: 
v_mov_b32_e32 v5, s3 6227; VI-NEXT: v_mov_b32_e32 v0, s4 6228; VI-NEXT: v_mov_b32_e32 v1, s5 6229; VI-NEXT: v_mov_b32_e32 v2, s6 6230; VI-NEXT: v_mov_b32_e32 v3, s7 6231; VI-NEXT: v_mov_b32_e32 v4, s2 6232; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 6233; VI-NEXT: s_waitcnt vmcnt(0) 6234; VI-NEXT: v_mov_b32_e32 v0, s20 6235; VI-NEXT: flat_store_dword v[4:5], v0 6236; VI-NEXT: s_waitcnt vmcnt(0) 6237; VI-NEXT: s_endpgm 6238; 6239; GFX9-LABEL: byref_natural_align_constant_v16i32_arg: 6240; GFX9: ; %bb.0: 6241; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 6242; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 6243; GFX9-NEXT: s_load_dword s2, s[4:5], 0x80 6244; GFX9-NEXT: v_mov_b32_e32 v4, 0 6245; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6246; GFX9-NEXT: v_mov_b32_e32 v0, s20 6247; GFX9-NEXT: v_mov_b32_e32 v1, s21 6248; GFX9-NEXT: v_mov_b32_e32 v2, s22 6249; GFX9-NEXT: v_mov_b32_e32 v3, s23 6250; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 6251; GFX9-NEXT: s_waitcnt vmcnt(0) 6252; GFX9-NEXT: v_mov_b32_e32 v0, s16 6253; GFX9-NEXT: v_mov_b32_e32 v1, s17 6254; GFX9-NEXT: v_mov_b32_e32 v2, s18 6255; GFX9-NEXT: v_mov_b32_e32 v3, s19 6256; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 6257; GFX9-NEXT: s_waitcnt vmcnt(0) 6258; GFX9-NEXT: v_mov_b32_e32 v0, s12 6259; GFX9-NEXT: v_mov_b32_e32 v1, s13 6260; GFX9-NEXT: v_mov_b32_e32 v2, s14 6261; GFX9-NEXT: v_mov_b32_e32 v3, s15 6262; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 6263; GFX9-NEXT: s_waitcnt vmcnt(0) 6264; GFX9-NEXT: v_mov_b32_e32 v0, s8 6265; GFX9-NEXT: v_mov_b32_e32 v1, s9 6266; GFX9-NEXT: v_mov_b32_e32 v2, s10 6267; GFX9-NEXT: v_mov_b32_e32 v3, s11 6268; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 6269; GFX9-NEXT: s_waitcnt vmcnt(0) 6270; GFX9-NEXT: v_mov_b32_e32 v0, s2 6271; GFX9-NEXT: global_store_dword v4, v0, s[0:1] 6272; GFX9-NEXT: s_waitcnt vmcnt(0) 6273; GFX9-NEXT: s_endpgm 6274; 6275; EG-LABEL: byref_natural_align_constant_v16i32_arg: 6276; EG: ; %bb.0: 6277; 
EG-NEXT: ALU 0, @24, KC0[CB0:0-32], KC1[] 6278; EG-NEXT: TEX 0 @16 6279; EG-NEXT: ALU 3, @25, KC0[CB0:0-32], KC1[] 6280; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0 6281; EG-NEXT: ALU 3, @29, KC0[CB0:0-32], KC1[] 6282; EG-NEXT: TEX 0 @18 6283; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0 6284; EG-NEXT: ALU 3, @33, KC0[CB0:0-32], KC1[] 6285; EG-NEXT: TEX 0 @20 6286; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0 6287; EG-NEXT: ALU 2, @37, KC0[CB0:0-32], KC1[] 6288; EG-NEXT: TEX 0 @22 6289; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 0 6290; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 1 6291; EG-NEXT: CF_END 6292; EG-NEXT: PAD 6293; EG-NEXT: Fetch clause starting at 16: 6294; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1 6295; EG-NEXT: Fetch clause starting at 18: 6296; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 32, #1 6297; EG-NEXT: Fetch clause starting at 20: 6298; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 16, #1 6299; EG-NEXT: Fetch clause starting at 22: 6300; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 6301; EG-NEXT: ALU clause starting at 24: 6302; EG-NEXT: MOV * T0.X, KC0[6].Y, 6303; EG-NEXT: ALU clause starting at 25: 6304; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 6305; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) 6306; EG-NEXT: LSHR * T2.X, PV.W, literal.x, 6307; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 6308; EG-NEXT: ALU clause starting at 29: 6309; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 6310; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 6311; EG-NEXT: LSHR * T1.X, PV.W, literal.x, 6312; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 6313; EG-NEXT: ALU clause starting at 33: 6314; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 6315; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 6316; EG-NEXT: LSHR * T1.X, PV.W, literal.x, 6317; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 6318; EG-NEXT: ALU clause starting at 37: 6319; EG-NEXT: MOV T1.X, KC0[10].Y, 6320; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x, 6321; EG-NEXT: 2(2.802597e-45), 
0(0.000000e+00) 6322; 6323; CM-LABEL: byref_natural_align_constant_v16i32_arg: 6324; CM: ; %bb.0: 6325; CM-NEXT: ALU 0, @24, KC0[CB0:0-32], KC1[] 6326; CM-NEXT: TEX 0 @16 6327; CM-NEXT: ALU 3, @25, KC0[CB0:0-32], KC1[] 6328; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T2.X 6329; CM-NEXT: ALU 3, @29, KC0[CB0:0-32], KC1[] 6330; CM-NEXT: TEX 0 @18 6331; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T1.X 6332; CM-NEXT: ALU 3, @33, KC0[CB0:0-32], KC1[] 6333; CM-NEXT: TEX 0 @20 6334; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T1.X 6335; CM-NEXT: ALU 2, @37, KC0[CB0:0-32], KC1[] 6336; CM-NEXT: TEX 0 @22 6337; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T2.X 6338; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X 6339; CM-NEXT: CF_END 6340; CM-NEXT: PAD 6341; CM-NEXT: Fetch clause starting at 16: 6342; CM-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1 6343; CM-NEXT: Fetch clause starting at 18: 6344; CM-NEXT: VTX_READ_128 T2.XYZW, T0.X, 32, #1 6345; CM-NEXT: Fetch clause starting at 20: 6346; CM-NEXT: VTX_READ_128 T2.XYZW, T0.X, 16, #1 6347; CM-NEXT: Fetch clause starting at 22: 6348; CM-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 6349; CM-NEXT: ALU clause starting at 24: 6350; CM-NEXT: MOV * T0.X, KC0[6].Y, 6351; CM-NEXT: ALU clause starting at 25: 6352; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 6353; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00) 6354; CM-NEXT: LSHR * T2.X, PV.W, literal.x, 6355; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 6356; CM-NEXT: ALU clause starting at 29: 6357; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 6358; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00) 6359; CM-NEXT: LSHR * T1.X, PV.W, literal.x, 6360; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 6361; CM-NEXT: ALU clause starting at 33: 6362; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 6363; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 6364; CM-NEXT: LSHR * T1.X, PV.W, literal.x, 6365; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 6366; CM-NEXT: ALU clause starting at 37: 6367; CM-NEXT: MOV * T1.X, KC0[10].Y, 6368; 
CM-NEXT: LSHR * T2.X, KC0[2].Y, literal.x, 6369; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 6370 %in = load <16 x i32>, <16 x i32> addrspace(4)* %in.byref 6371 %cast.out = bitcast i32 addrspace(1)* %out to <16 x i32> addrspace(1)* 6372 store volatile <16 x i32> %in, <16 x i32> addrspace(1)* %cast.out, align 4 6373 store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4 6374 ret void 6375} 6376