1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -check-prefixes=SI %s 3; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=VI %s 4; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GFX9 %s 5; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -check-prefixes=EGCM,EG %s 6; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -check-prefixes=EGCM,CM %s 7 8define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { 9; SI-LABEL: i8_arg: 10; SI: ; %bb.0: 11; SI-NEXT: s_load_dword s2, s[0:1], 0xb 12; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 13; SI-NEXT: s_mov_b32 s3, 0xf000 14; SI-NEXT: s_waitcnt lgkmcnt(0) 15; SI-NEXT: s_and_b32 s4, s2, 0xff 16; SI-NEXT: s_mov_b32 s2, -1 17; SI-NEXT: v_mov_b32_e32 v0, s4 18; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 19; SI-NEXT: s_endpgm 20; 21; VI-LABEL: i8_arg: 22; VI: ; %bb.0: 23; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 24; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 25; VI-NEXT: s_waitcnt lgkmcnt(0) 26; VI-NEXT: s_and_b32 s2, s2, 0xff 27; VI-NEXT: v_mov_b32_e32 v0, s0 28; VI-NEXT: v_mov_b32_e32 v1, s1 29; VI-NEXT: v_mov_b32_e32 v2, s2 30; VI-NEXT: flat_store_dword v[0:1], v2 31; VI-NEXT: s_endpgm 32; 33; GFX9-LABEL: i8_arg: 34; GFX9: ; %bb.0: 35; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 36; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 37; GFX9-NEXT: v_mov_b32_e32 v0, 0 38; GFX9-NEXT: s_waitcnt lgkmcnt(0) 39; GFX9-NEXT: s_and_b32 s2, s2, 0xff 40; GFX9-NEXT: v_mov_b32_e32 v1, s2 41; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 42; GFX9-NEXT: s_endpgm 43; 44; EG-LABEL: i8_arg: 45; EG: ; %bb.0: 46; EG-NEXT: ALU 0, @8, KC0[], KC1[] 47; EG-NEXT: TEX 0 @6 48; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[] 49; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 50; EG-NEXT: CF_END 51; 
EG-NEXT: PAD 52; EG-NEXT: Fetch clause starting at 6: 53; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 54; EG-NEXT: ALU clause starting at 8: 55; EG-NEXT: MOV * T0.X, 0.0, 56; EG-NEXT: ALU clause starting at 9: 57; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 58; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 59; 60; CM-LABEL: i8_arg: 61; CM: ; %bb.0: 62; CM-NEXT: ALU 0, @8, KC0[], KC1[] 63; CM-NEXT: TEX 0 @6 64; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[] 65; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 66; CM-NEXT: CF_END 67; CM-NEXT: PAD 68; CM-NEXT: Fetch clause starting at 6: 69; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 70; CM-NEXT: ALU clause starting at 8: 71; CM-NEXT: MOV * T0.X, 0.0, 72; CM-NEXT: ALU clause starting at 9: 73; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 74; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 75 %ext = zext i8 %in to i32 76 store i32 %ext, i32 addrspace(1)* %out, align 4 77 ret void 78} 79 80define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind { 81; SI-LABEL: i8_zext_arg: 82; SI: ; %bb.0: 83; SI-NEXT: s_load_dword s2, s[0:1], 0xb 84; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 85; SI-NEXT: s_mov_b32 s3, 0xf000 86; SI-NEXT: s_waitcnt lgkmcnt(0) 87; SI-NEXT: s_and_b32 s4, s2, 0xff 88; SI-NEXT: s_mov_b32 s2, -1 89; SI-NEXT: v_mov_b32_e32 v0, s4 90; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 91; SI-NEXT: s_endpgm 92; 93; VI-LABEL: i8_zext_arg: 94; VI: ; %bb.0: 95; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 96; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 97; VI-NEXT: s_waitcnt lgkmcnt(0) 98; VI-NEXT: s_and_b32 s2, s2, 0xff 99; VI-NEXT: v_mov_b32_e32 v0, s0 100; VI-NEXT: v_mov_b32_e32 v1, s1 101; VI-NEXT: v_mov_b32_e32 v2, s2 102; VI-NEXT: flat_store_dword v[0:1], v2 103; VI-NEXT: s_endpgm 104; 105; GFX9-LABEL: i8_zext_arg: 106; GFX9: ; %bb.0: 107; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 108; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 109; GFX9-NEXT: v_mov_b32_e32 v0, 0 110; GFX9-NEXT: s_waitcnt lgkmcnt(0) 
111; GFX9-NEXT: s_and_b32 s2, s2, 0xff 112; GFX9-NEXT: v_mov_b32_e32 v1, s2 113; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 114; GFX9-NEXT: s_endpgm 115; 116; EG-LABEL: i8_zext_arg: 117; EG: ; %bb.0: 118; EG-NEXT: ALU 0, @8, KC0[], KC1[] 119; EG-NEXT: TEX 0 @6 120; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 121; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 122; EG-NEXT: CF_END 123; EG-NEXT: PAD 124; EG-NEXT: Fetch clause starting at 6: 125; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 126; EG-NEXT: ALU clause starting at 8: 127; EG-NEXT: MOV * T0.X, 0.0, 128; EG-NEXT: ALU clause starting at 9: 129; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x, 130; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 131; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) 132; 133; CM-LABEL: i8_zext_arg: 134; CM: ; %bb.0: 135; CM-NEXT: ALU 0, @8, KC0[], KC1[] 136; CM-NEXT: TEX 0 @6 137; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 138; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 139; CM-NEXT: CF_END 140; CM-NEXT: PAD 141; CM-NEXT: Fetch clause starting at 6: 142; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 143; CM-NEXT: ALU clause starting at 8: 144; CM-NEXT: MOV * T0.X, 0.0, 145; CM-NEXT: ALU clause starting at 9: 146; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x, 147; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 148; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 149; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 150 %ext = zext i8 %in to i32 151 store i32 %ext, i32 addrspace(1)* %out, align 4 152 ret void 153} 154 155define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind { 156; SI-LABEL: i8_sext_arg: 157; SI: ; %bb.0: 158; SI-NEXT: s_load_dword s2, s[0:1], 0xb 159; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 160; SI-NEXT: s_mov_b32 s3, 0xf000 161; SI-NEXT: s_waitcnt lgkmcnt(0) 162; SI-NEXT: s_sext_i32_i8 s4, s2 163; SI-NEXT: s_mov_b32 s2, -1 164; SI-NEXT: v_mov_b32_e32 v0, s4 165; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 166; SI-NEXT: s_endpgm 167; 168; VI-LABEL: 
i8_sext_arg: 169; VI: ; %bb.0: 170; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 171; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 172; VI-NEXT: s_waitcnt lgkmcnt(0) 173; VI-NEXT: s_sext_i32_i8 s2, s2 174; VI-NEXT: v_mov_b32_e32 v0, s0 175; VI-NEXT: v_mov_b32_e32 v1, s1 176; VI-NEXT: v_mov_b32_e32 v2, s2 177; VI-NEXT: flat_store_dword v[0:1], v2 178; VI-NEXT: s_endpgm 179; 180; GFX9-LABEL: i8_sext_arg: 181; GFX9: ; %bb.0: 182; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 183; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 184; GFX9-NEXT: v_mov_b32_e32 v0, 0 185; GFX9-NEXT: s_waitcnt lgkmcnt(0) 186; GFX9-NEXT: s_sext_i32_i8 s2, s2 187; GFX9-NEXT: v_mov_b32_e32 v1, s2 188; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 189; GFX9-NEXT: s_endpgm 190; 191; EG-LABEL: i8_sext_arg: 192; EG: ; %bb.0: 193; EG-NEXT: ALU 0, @8, KC0[], KC1[] 194; EG-NEXT: TEX 0 @6 195; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 196; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 197; EG-NEXT: CF_END 198; EG-NEXT: PAD 199; EG-NEXT: Fetch clause starting at 6: 200; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 201; EG-NEXT: ALU clause starting at 8: 202; EG-NEXT: MOV * T0.X, 0.0, 203; EG-NEXT: ALU clause starting at 9: 204; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x, 205; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 206; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) 207; 208; CM-LABEL: i8_sext_arg: 209; CM: ; %bb.0: 210; CM-NEXT: ALU 0, @8, KC0[], KC1[] 211; CM-NEXT: TEX 0 @6 212; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 213; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 214; CM-NEXT: CF_END 215; CM-NEXT: PAD 216; CM-NEXT: Fetch clause starting at 6: 217; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 218; CM-NEXT: ALU clause starting at 8: 219; CM-NEXT: MOV * T0.X, 0.0, 220; CM-NEXT: ALU clause starting at 9: 221; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x, 222; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 223; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 224; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 225 %ext = sext i8 %in 
to i32 226 store i32 %ext, i32 addrspace(1)* %out, align 4 227 ret void 228} 229 230define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { 231; SI-LABEL: i16_arg: 232; SI: ; %bb.0: 233; SI-NEXT: s_load_dword s2, s[0:1], 0xb 234; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 235; SI-NEXT: s_mov_b32 s3, 0xf000 236; SI-NEXT: s_waitcnt lgkmcnt(0) 237; SI-NEXT: s_and_b32 s4, s2, 0xffff 238; SI-NEXT: s_mov_b32 s2, -1 239; SI-NEXT: v_mov_b32_e32 v0, s4 240; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 241; SI-NEXT: s_endpgm 242; 243; VI-LABEL: i16_arg: 244; VI: ; %bb.0: 245; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 246; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 247; VI-NEXT: s_waitcnt lgkmcnt(0) 248; VI-NEXT: s_and_b32 s2, s2, 0xffff 249; VI-NEXT: v_mov_b32_e32 v0, s0 250; VI-NEXT: v_mov_b32_e32 v1, s1 251; VI-NEXT: v_mov_b32_e32 v2, s2 252; VI-NEXT: flat_store_dword v[0:1], v2 253; VI-NEXT: s_endpgm 254; 255; GFX9-LABEL: i16_arg: 256; GFX9: ; %bb.0: 257; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 258; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 259; GFX9-NEXT: v_mov_b32_e32 v0, 0 260; GFX9-NEXT: s_waitcnt lgkmcnt(0) 261; GFX9-NEXT: s_and_b32 s2, s2, 0xffff 262; GFX9-NEXT: v_mov_b32_e32 v1, s2 263; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 264; GFX9-NEXT: s_endpgm 265; 266; EG-LABEL: i16_arg: 267; EG: ; %bb.0: 268; EG-NEXT: ALU 0, @8, KC0[], KC1[] 269; EG-NEXT: TEX 0 @6 270; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[] 271; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 272; EG-NEXT: CF_END 273; EG-NEXT: PAD 274; EG-NEXT: Fetch clause starting at 6: 275; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 276; EG-NEXT: ALU clause starting at 8: 277; EG-NEXT: MOV * T0.X, 0.0, 278; EG-NEXT: ALU clause starting at 9: 279; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 280; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 281; 282; CM-LABEL: i16_arg: 283; CM: ; %bb.0: 284; CM-NEXT: ALU 0, @8, KC0[], KC1[] 285; CM-NEXT: TEX 0 @6 286; CM-NEXT: ALU 1, @9, 
KC0[CB0:0-32], KC1[] 287; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 288; CM-NEXT: CF_END 289; CM-NEXT: PAD 290; CM-NEXT: Fetch clause starting at 6: 291; CM-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 292; CM-NEXT: ALU clause starting at 8: 293; CM-NEXT: MOV * T0.X, 0.0, 294; CM-NEXT: ALU clause starting at 9: 295; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 296; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 297 %ext = zext i16 %in to i32 298 store i32 %ext, i32 addrspace(1)* %out, align 4 299 ret void 300} 301 302define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind { 303; SI-LABEL: i16_zext_arg: 304; SI: ; %bb.0: 305; SI-NEXT: s_load_dword s2, s[0:1], 0xb 306; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 307; SI-NEXT: s_mov_b32 s3, 0xf000 308; SI-NEXT: s_waitcnt lgkmcnt(0) 309; SI-NEXT: s_and_b32 s4, s2, 0xffff 310; SI-NEXT: s_mov_b32 s2, -1 311; SI-NEXT: v_mov_b32_e32 v0, s4 312; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 313; SI-NEXT: s_endpgm 314; 315; VI-LABEL: i16_zext_arg: 316; VI: ; %bb.0: 317; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 318; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 319; VI-NEXT: s_waitcnt lgkmcnt(0) 320; VI-NEXT: s_and_b32 s2, s2, 0xffff 321; VI-NEXT: v_mov_b32_e32 v0, s0 322; VI-NEXT: v_mov_b32_e32 v1, s1 323; VI-NEXT: v_mov_b32_e32 v2, s2 324; VI-NEXT: flat_store_dword v[0:1], v2 325; VI-NEXT: s_endpgm 326; 327; GFX9-LABEL: i16_zext_arg: 328; GFX9: ; %bb.0: 329; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 330; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 331; GFX9-NEXT: v_mov_b32_e32 v0, 0 332; GFX9-NEXT: s_waitcnt lgkmcnt(0) 333; GFX9-NEXT: s_and_b32 s2, s2, 0xffff 334; GFX9-NEXT: v_mov_b32_e32 v1, s2 335; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 336; GFX9-NEXT: s_endpgm 337; 338; EG-LABEL: i16_zext_arg: 339; EG: ; %bb.0: 340; EG-NEXT: ALU 0, @8, KC0[], KC1[] 341; EG-NEXT: TEX 0 @6 342; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 343; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 344; 
EG-NEXT: CF_END 345; EG-NEXT: PAD 346; EG-NEXT: Fetch clause starting at 6: 347; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 348; EG-NEXT: ALU clause starting at 8: 349; EG-NEXT: MOV * T0.X, 0.0, 350; EG-NEXT: ALU clause starting at 9: 351; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x, 352; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 353; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) 354; 355; CM-LABEL: i16_zext_arg: 356; CM: ; %bb.0: 357; CM-NEXT: ALU 0, @8, KC0[], KC1[] 358; CM-NEXT: TEX 0 @6 359; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 360; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 361; CM-NEXT: CF_END 362; CM-NEXT: PAD 363; CM-NEXT: Fetch clause starting at 6: 364; CM-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 365; CM-NEXT: ALU clause starting at 8: 366; CM-NEXT: MOV * T0.X, 0.0, 367; CM-NEXT: ALU clause starting at 9: 368; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x, 369; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 370; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 371; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 372 %ext = zext i16 %in to i32 373 store i32 %ext, i32 addrspace(1)* %out, align 4 374 ret void 375} 376 377define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind { 378; SI-LABEL: i16_sext_arg: 379; SI: ; %bb.0: 380; SI-NEXT: s_load_dword s2, s[0:1], 0xb 381; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 382; SI-NEXT: s_mov_b32 s3, 0xf000 383; SI-NEXT: s_waitcnt lgkmcnt(0) 384; SI-NEXT: s_sext_i32_i16 s4, s2 385; SI-NEXT: s_mov_b32 s2, -1 386; SI-NEXT: v_mov_b32_e32 v0, s4 387; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 388; SI-NEXT: s_endpgm 389; 390; VI-LABEL: i16_sext_arg: 391; VI: ; %bb.0: 392; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 393; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 394; VI-NEXT: s_waitcnt lgkmcnt(0) 395; VI-NEXT: s_sext_i32_i16 s2, s2 396; VI-NEXT: v_mov_b32_e32 v0, s0 397; VI-NEXT: v_mov_b32_e32 v1, s1 398; VI-NEXT: v_mov_b32_e32 v2, s2 399; VI-NEXT: flat_store_dword v[0:1], v2 400; VI-NEXT: 
s_endpgm 401; 402; GFX9-LABEL: i16_sext_arg: 403; GFX9: ; %bb.0: 404; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 405; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 406; GFX9-NEXT: v_mov_b32_e32 v0, 0 407; GFX9-NEXT: s_waitcnt lgkmcnt(0) 408; GFX9-NEXT: s_sext_i32_i16 s2, s2 409; GFX9-NEXT: v_mov_b32_e32 v1, s2 410; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 411; GFX9-NEXT: s_endpgm 412; 413; EG-LABEL: i16_sext_arg: 414; EG: ; %bb.0: 415; EG-NEXT: ALU 0, @8, KC0[], KC1[] 416; EG-NEXT: TEX 0 @6 417; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 418; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 419; EG-NEXT: CF_END 420; EG-NEXT: PAD 421; EG-NEXT: Fetch clause starting at 6: 422; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 423; EG-NEXT: ALU clause starting at 8: 424; EG-NEXT: MOV * T0.X, 0.0, 425; EG-NEXT: ALU clause starting at 9: 426; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x, 427; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 428; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) 429; 430; CM-LABEL: i16_sext_arg: 431; CM: ; %bb.0: 432; CM-NEXT: ALU 0, @8, KC0[], KC1[] 433; CM-NEXT: TEX 0 @6 434; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 435; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 436; CM-NEXT: CF_END 437; CM-NEXT: PAD 438; CM-NEXT: Fetch clause starting at 6: 439; CM-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 440; CM-NEXT: ALU clause starting at 8: 441; CM-NEXT: MOV * T0.X, 0.0, 442; CM-NEXT: ALU clause starting at 9: 443; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x, 444; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 445; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 446; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 447 %ext = sext i16 %in to i32 448 store i32 %ext, i32 addrspace(1)* %out, align 4 449 ret void 450} 451 452define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind { 453; SI-LABEL: i32_arg: 454; SI: ; %bb.0: ; %entry 455; SI-NEXT: s_load_dword s4, s[0:1], 0xb 456; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 457; SI-NEXT: s_mov_b32 s3, 
0xf000 458; SI-NEXT: s_mov_b32 s2, -1 459; SI-NEXT: s_waitcnt lgkmcnt(0) 460; SI-NEXT: v_mov_b32_e32 v0, s4 461; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 462; SI-NEXT: s_endpgm 463; 464; VI-LABEL: i32_arg: 465; VI: ; %bb.0: ; %entry 466; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 467; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 468; VI-NEXT: s_waitcnt lgkmcnt(0) 469; VI-NEXT: v_mov_b32_e32 v0, s2 470; VI-NEXT: v_mov_b32_e32 v1, s3 471; VI-NEXT: v_mov_b32_e32 v2, s0 472; VI-NEXT: flat_store_dword v[0:1], v2 473; VI-NEXT: s_endpgm 474; 475; GFX9-LABEL: i32_arg: 476; GFX9: ; %bb.0: ; %entry 477; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 478; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 479; GFX9-NEXT: v_mov_b32_e32 v0, 0 480; GFX9-NEXT: s_waitcnt lgkmcnt(0) 481; GFX9-NEXT: v_mov_b32_e32 v1, s2 482; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 483; GFX9-NEXT: s_endpgm 484; 485; EG-LABEL: i32_arg: 486; EG: ; %bb.0: ; %entry 487; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 488; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 489; EG-NEXT: CF_END 490; EG-NEXT: PAD 491; EG-NEXT: ALU clause starting at 4: 492; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, 493; EG-NEXT: MOV * T1.X, KC0[2].Z, 494; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 495; 496; CM-LABEL: i32_arg: 497; CM: ; %bb.0: ; %entry 498; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 499; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X 500; CM-NEXT: CF_END 501; CM-NEXT: PAD 502; CM-NEXT: ALU clause starting at 4: 503; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 504; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 505; CM-NEXT: MOV * T1.X, KC0[2].Z, 506entry: 507 store i32 %in, i32 addrspace(1)* %out, align 4 508 ret void 509} 510 511define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind { 512; SI-LABEL: f32_arg: 513; SI: ; %bb.0: ; %entry 514; SI-NEXT: s_load_dword s4, s[0:1], 0xb 515; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 516; SI-NEXT: s_mov_b32 s3, 0xf000 517; SI-NEXT: s_mov_b32 s2, -1 
518; SI-NEXT: s_waitcnt lgkmcnt(0) 519; SI-NEXT: v_mov_b32_e32 v0, s4 520; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 521; SI-NEXT: s_endpgm 522; 523; VI-LABEL: f32_arg: 524; VI: ; %bb.0: ; %entry 525; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 526; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 527; VI-NEXT: s_waitcnt lgkmcnt(0) 528; VI-NEXT: v_mov_b32_e32 v0, s2 529; VI-NEXT: v_mov_b32_e32 v1, s3 530; VI-NEXT: v_mov_b32_e32 v2, s0 531; VI-NEXT: flat_store_dword v[0:1], v2 532; VI-NEXT: s_endpgm 533; 534; GFX9-LABEL: f32_arg: 535; GFX9: ; %bb.0: ; %entry 536; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 537; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 538; GFX9-NEXT: v_mov_b32_e32 v0, 0 539; GFX9-NEXT: s_waitcnt lgkmcnt(0) 540; GFX9-NEXT: v_mov_b32_e32 v1, s2 541; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 542; GFX9-NEXT: s_endpgm 543; 544; EG-LABEL: f32_arg: 545; EG: ; %bb.0: ; %entry 546; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 547; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 548; EG-NEXT: CF_END 549; EG-NEXT: PAD 550; EG-NEXT: ALU clause starting at 4: 551; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, 552; EG-NEXT: MOV * T1.X, KC0[2].Z, 553; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 554; 555; CM-LABEL: f32_arg: 556; CM: ; %bb.0: ; %entry 557; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 558; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X 559; CM-NEXT: CF_END 560; CM-NEXT: PAD 561; CM-NEXT: ALU clause starting at 4: 562; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 563; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 564; CM-NEXT: MOV * T1.X, KC0[2].Z, 565entry: 566 store float %in, float addrspace(1)* %out, align 4 567 ret void 568} 569 570define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) { 571; SI-LABEL: v2i8_arg: 572; SI: ; %bb.0: ; %entry 573; SI-NEXT: s_load_dword s4, s[0:1], 0xb 574; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 575; SI-NEXT: s_mov_b32 s3, 0xf000 576; SI-NEXT: s_mov_b32 s2, -1 577; SI-NEXT: s_waitcnt lgkmcnt(0) 578; 
SI-NEXT: v_mov_b32_e32 v0, s4 579; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 580; SI-NEXT: s_endpgm 581; 582; VI-LABEL: v2i8_arg: 583; VI: ; %bb.0: ; %entry 584; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 585; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 586; VI-NEXT: s_waitcnt lgkmcnt(0) 587; VI-NEXT: v_mov_b32_e32 v0, s2 588; VI-NEXT: v_mov_b32_e32 v1, s3 589; VI-NEXT: v_mov_b32_e32 v2, s0 590; VI-NEXT: flat_store_short v[0:1], v2 591; VI-NEXT: s_endpgm 592; 593; GFX9-LABEL: v2i8_arg: 594; GFX9: ; %bb.0: ; %entry 595; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 596; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 597; GFX9-NEXT: v_mov_b32_e32 v0, 0 598; GFX9-NEXT: s_waitcnt lgkmcnt(0) 599; GFX9-NEXT: v_mov_b32_e32 v1, s2 600; GFX9-NEXT: global_store_short v0, v1, s[0:1] 601; GFX9-NEXT: s_endpgm 602; 603; EG-LABEL: v2i8_arg: 604; EG: ; %bb.0: ; %entry 605; EG-NEXT: ALU 0, @10, KC0[], KC1[] 606; EG-NEXT: TEX 1 @6 607; EG-NEXT: ALU 15, @11, KC0[CB0:0-32], KC1[] 608; EG-NEXT: MEM_RAT MSKOR T4.XW, T5.X 609; EG-NEXT: CF_END 610; EG-NEXT: PAD 611; EG-NEXT: Fetch clause starting at 6: 612; EG-NEXT: VTX_READ_8 T5.X, T4.X, 41, #3 613; EG-NEXT: VTX_READ_8 T4.X, T4.X, 40, #3 614; EG-NEXT: ALU clause starting at 10: 615; EG-NEXT: MOV * T4.X, 0.0, 616; EG-NEXT: ALU clause starting at 11: 617; EG-NEXT: LSHL T0.W, T5.X, literal.x, 618; EG-NEXT: AND_INT * T1.W, T4.X, literal.y, 619; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43) 620; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x, 621; EG-NEXT: OR_INT * T0.W, PV.W, PS, 622; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 623; EG-NEXT: AND_INT T0.W, PS, literal.x, 624; EG-NEXT: LSHL * T1.W, PV.W, literal.y, 625; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 626; EG-NEXT: LSHL T4.X, PV.W, PS, 627; EG-NEXT: LSHL * T4.W, literal.x, PS, 628; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 629; EG-NEXT: MOV T4.Y, 0.0, 630; EG-NEXT: MOV * T4.Z, 0.0, 631; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 632; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 633; 634; 
CM-LABEL: v2i8_arg: 635; CM: ; %bb.0: ; %entry 636; CM-NEXT: ALU 0, @10, KC0[], KC1[] 637; CM-NEXT: TEX 1 @6 638; CM-NEXT: ALU 15, @11, KC0[CB0:0-32], KC1[] 639; CM-NEXT: MEM_RAT MSKOR T4.XW, T5.X 640; CM-NEXT: CF_END 641; CM-NEXT: PAD 642; CM-NEXT: Fetch clause starting at 6: 643; CM-NEXT: VTX_READ_8 T5.X, T4.X, 41, #3 644; CM-NEXT: VTX_READ_8 T4.X, T4.X, 40, #3 645; CM-NEXT: ALU clause starting at 10: 646; CM-NEXT: MOV * T4.X, 0.0, 647; CM-NEXT: ALU clause starting at 11: 648; CM-NEXT: LSHL T0.Z, T5.X, literal.x, 649; CM-NEXT: AND_INT * T0.W, T4.X, literal.y, BS:VEC_120/SCL_212 650; CM-NEXT: 8(1.121039e-44), 255(3.573311e-43) 651; CM-NEXT: AND_INT T1.Z, KC0[2].Y, literal.x, 652; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 653; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 654; CM-NEXT: AND_INT T0.Z, PV.W, literal.x, 655; CM-NEXT: LSHL * T0.W, PV.Z, literal.y, 656; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 657; CM-NEXT: LSHL T4.X, PV.Z, PV.W, 658; CM-NEXT: LSHL * T4.W, literal.x, PV.W, 659; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 660; CM-NEXT: MOV T4.Y, 0.0, 661; CM-NEXT: MOV * T4.Z, 0.0, 662; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 663; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 664entry: 665 store <2 x i8> %in, <2 x i8> addrspace(1)* %out 666 ret void 667} 668 669define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) { 670; SI-LABEL: v2i16_arg: 671; SI: ; %bb.0: ; %entry 672; SI-NEXT: s_load_dword s4, s[0:1], 0xb 673; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 674; SI-NEXT: s_mov_b32 s3, 0xf000 675; SI-NEXT: s_mov_b32 s2, -1 676; SI-NEXT: s_waitcnt lgkmcnt(0) 677; SI-NEXT: v_mov_b32_e32 v0, s4 678; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 679; SI-NEXT: s_endpgm 680; 681; VI-LABEL: v2i16_arg: 682; VI: ; %bb.0: ; %entry 683; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 684; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 685; VI-NEXT: s_waitcnt lgkmcnt(0) 686; VI-NEXT: v_mov_b32_e32 v0, s2 687; VI-NEXT: v_mov_b32_e32 v1, s3 688; VI-NEXT: 
v_mov_b32_e32 v2, s0 689; VI-NEXT: flat_store_dword v[0:1], v2 690; VI-NEXT: s_endpgm 691; 692; GFX9-LABEL: v2i16_arg: 693; GFX9: ; %bb.0: ; %entry 694; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 695; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 696; GFX9-NEXT: v_mov_b32_e32 v0, 0 697; GFX9-NEXT: s_waitcnt lgkmcnt(0) 698; GFX9-NEXT: v_mov_b32_e32 v1, s2 699; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 700; GFX9-NEXT: s_endpgm 701; 702; EG-LABEL: v2i16_arg: 703; EG: ; %bb.0: ; %entry 704; EG-NEXT: ALU 0, @10, KC0[], KC1[] 705; EG-NEXT: TEX 1 @6 706; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] 707; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1 708; EG-NEXT: CF_END 709; EG-NEXT: PAD 710; EG-NEXT: Fetch clause starting at 6: 711; EG-NEXT: VTX_READ_16 T5.X, T4.X, 42, #3 712; EG-NEXT: VTX_READ_16 T4.X, T4.X, 40, #3 713; EG-NEXT: ALU clause starting at 10: 714; EG-NEXT: MOV * T4.X, 0.0, 715; EG-NEXT: ALU clause starting at 11: 716; EG-NEXT: LSHL T0.W, T5.X, literal.x, 717; EG-NEXT: AND_INT * T1.W, T4.X, literal.y, 718; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 719; EG-NEXT: OR_INT T4.X, PV.W, PS, 720; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 721; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 722; 723; CM-LABEL: v2i16_arg: 724; CM: ; %bb.0: ; %entry 725; CM-NEXT: ALU 0, @10, KC0[], KC1[] 726; CM-NEXT: TEX 1 @6 727; CM-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] 728; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T5.X 729; CM-NEXT: CF_END 730; CM-NEXT: PAD 731; CM-NEXT: Fetch clause starting at 6: 732; CM-NEXT: VTX_READ_16 T5.X, T4.X, 42, #3 733; CM-NEXT: VTX_READ_16 T4.X, T4.X, 40, #3 734; CM-NEXT: ALU clause starting at 10: 735; CM-NEXT: MOV * T4.X, 0.0, 736; CM-NEXT: ALU clause starting at 11: 737; CM-NEXT: LSHL T0.Z, T5.X, literal.x, 738; CM-NEXT: AND_INT * T0.W, T4.X, literal.y, BS:VEC_120/SCL_212 739; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 740; CM-NEXT: OR_INT * T4.X, PV.Z, PV.W, 741; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 742; CM-NEXT: 2(2.802597e-45), 
0(0.000000e+00) 743entry: 744 store <2 x i16> %in, <2 x i16> addrspace(1)* %out 745 ret void 746} 747 748define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind { 749; SI-LABEL: v2i32_arg: 750; SI: ; %bb.0: ; %entry 751; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 752; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 753; SI-NEXT: s_mov_b32 s3, 0xf000 754; SI-NEXT: s_mov_b32 s2, -1 755; SI-NEXT: s_waitcnt lgkmcnt(0) 756; SI-NEXT: v_mov_b32_e32 v0, s4 757; SI-NEXT: v_mov_b32_e32 v1, s5 758; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 759; SI-NEXT: s_endpgm 760; 761; VI-LABEL: v2i32_arg: 762; VI: ; %bb.0: ; %entry 763; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 764; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 765; VI-NEXT: s_waitcnt lgkmcnt(0) 766; VI-NEXT: v_mov_b32_e32 v0, s2 767; VI-NEXT: v_mov_b32_e32 v3, s1 768; VI-NEXT: v_mov_b32_e32 v1, s3 769; VI-NEXT: v_mov_b32_e32 v2, s0 770; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 771; VI-NEXT: s_endpgm 772; 773; GFX9-LABEL: v2i32_arg: 774; GFX9: ; %bb.0: ; %entry 775; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 776; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 777; GFX9-NEXT: v_mov_b32_e32 v2, 0 778; GFX9-NEXT: s_waitcnt lgkmcnt(0) 779; GFX9-NEXT: v_mov_b32_e32 v0, s0 780; GFX9-NEXT: v_mov_b32_e32 v1, s1 781; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 782; GFX9-NEXT: s_endpgm 783; 784; EG-LABEL: v2i32_arg: 785; EG: ; %bb.0: ; %entry 786; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 787; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 788; EG-NEXT: CF_END 789; EG-NEXT: PAD 790; EG-NEXT: ALU clause starting at 4: 791; EG-NEXT: MOV * T0.Y, KC0[3].X, 792; EG-NEXT: MOV T0.X, KC0[2].W, 793; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 794; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 795; 796; CM-LABEL: v2i32_arg: 797; CM: ; %bb.0: ; %entry 798; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 799; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 800; CM-NEXT: CF_END 801; 
CM-NEXT: PAD 802; CM-NEXT: ALU clause starting at 4: 803; CM-NEXT: MOV * T0.Y, KC0[3].X, 804; CM-NEXT: MOV * T0.X, KC0[2].W, 805; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 806; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 807entry: 808 store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4 809 ret void 810} 811 812define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind { 813; SI-LABEL: v2f32_arg: 814; SI: ; %bb.0: ; %entry 815; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 816; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 817; SI-NEXT: s_mov_b32 s3, 0xf000 818; SI-NEXT: s_mov_b32 s2, -1 819; SI-NEXT: s_waitcnt lgkmcnt(0) 820; SI-NEXT: v_mov_b32_e32 v0, s4 821; SI-NEXT: v_mov_b32_e32 v1, s5 822; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 823; SI-NEXT: s_endpgm 824; 825; VI-LABEL: v2f32_arg: 826; VI: ; %bb.0: ; %entry 827; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 828; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 829; VI-NEXT: s_waitcnt lgkmcnt(0) 830; VI-NEXT: v_mov_b32_e32 v0, s2 831; VI-NEXT: v_mov_b32_e32 v3, s1 832; VI-NEXT: v_mov_b32_e32 v1, s3 833; VI-NEXT: v_mov_b32_e32 v2, s0 834; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 835; VI-NEXT: s_endpgm 836; 837; GFX9-LABEL: v2f32_arg: 838; GFX9: ; %bb.0: ; %entry 839; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 840; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 841; GFX9-NEXT: v_mov_b32_e32 v2, 0 842; GFX9-NEXT: s_waitcnt lgkmcnt(0) 843; GFX9-NEXT: v_mov_b32_e32 v0, s0 844; GFX9-NEXT: v_mov_b32_e32 v1, s1 845; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 846; GFX9-NEXT: s_endpgm 847; 848; EG-LABEL: v2f32_arg: 849; EG: ; %bb.0: ; %entry 850; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 851; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 852; EG-NEXT: CF_END 853; EG-NEXT: PAD 854; EG-NEXT: ALU clause starting at 4: 855; EG-NEXT: MOV * T0.Y, KC0[3].X, 856; EG-NEXT: MOV T0.X, KC0[2].W, 857; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 858; EG-NEXT: 
2(2.802597e-45), 0(0.000000e+00) 859; 860; CM-LABEL: v2f32_arg: 861; CM: ; %bb.0: ; %entry 862; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 863; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 864; CM-NEXT: CF_END 865; CM-NEXT: PAD 866; CM-NEXT: ALU clause starting at 4: 867; CM-NEXT: MOV * T0.Y, KC0[3].X, 868; CM-NEXT: MOV * T0.X, KC0[2].W, 869; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 870; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 871entry: 872 store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4 873 ret void 874} 875 876define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind { 877; SI-LABEL: v3i8_arg: 878; SI: ; %bb.0: ; %entry 879; SI-NEXT: s_load_dword s4, s[0:1], 0xb 880; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 881; SI-NEXT: s_mov_b32 s3, 0xf000 882; SI-NEXT: s_waitcnt lgkmcnt(0) 883; SI-NEXT: s_lshr_b32 s5, s4, 16 884; SI-NEXT: s_mov_b32 s2, -1 885; SI-NEXT: v_mov_b32_e32 v0, s4 886; SI-NEXT: v_mov_b32_e32 v1, s5 887; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2 888; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 889; SI-NEXT: s_endpgm 890; 891; VI-LABEL: v3i8_arg: 892; VI: ; %bb.0: ; %entry 893; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 894; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 895; VI-NEXT: s_waitcnt lgkmcnt(0) 896; VI-NEXT: s_lshr_b32 s3, s2, 16 897; VI-NEXT: v_mov_b32_e32 v0, s0 898; VI-NEXT: v_mov_b32_e32 v1, s1 899; VI-NEXT: s_add_u32 s0, s0, 2 900; VI-NEXT: s_addc_u32 s1, s1, 0 901; VI-NEXT: v_mov_b32_e32 v3, s1 902; VI-NEXT: v_mov_b32_e32 v5, s3 903; VI-NEXT: v_mov_b32_e32 v2, s0 904; VI-NEXT: v_mov_b32_e32 v4, s2 905; VI-NEXT: flat_store_byte v[2:3], v5 906; VI-NEXT: flat_store_short v[0:1], v4 907; VI-NEXT: s_endpgm 908; 909; GFX9-LABEL: v3i8_arg: 910; GFX9: ; %bb.0: ; %entry 911; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 912; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 913; GFX9-NEXT: v_mov_b32_e32 v0, 0 914; GFX9-NEXT: s_waitcnt lgkmcnt(0) 915; GFX9-NEXT: v_mov_b32_e32 v1, s2 
916; GFX9-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:2 917; GFX9-NEXT: global_store_short v0, v1, s[0:1] 918; GFX9-NEXT: s_endpgm 919; 920; EG-LABEL: v3i8_arg: 921; EG: ; %bb.0: ; %entry 922; EG-NEXT: ALU 0, @12, KC0[], KC1[] 923; EG-NEXT: TEX 2 @6 924; EG-NEXT: ALU 28, @13, KC0[CB0:0-32], KC1[] 925; EG-NEXT: MEM_RAT MSKOR T4.XW, T7.X 926; EG-NEXT: MEM_RAT MSKOR T5.XW, T6.X 927; EG-NEXT: CF_END 928; EG-NEXT: Fetch clause starting at 6: 929; EG-NEXT: VTX_READ_8 T5.X, T4.X, 41, #3 930; EG-NEXT: VTX_READ_8 T6.X, T4.X, 42, #3 931; EG-NEXT: VTX_READ_8 T4.X, T4.X, 40, #3 932; EG-NEXT: ALU clause starting at 12: 933; EG-NEXT: MOV * T4.X, 0.0, 934; EG-NEXT: ALU clause starting at 13: 935; EG-NEXT: LSHL T0.W, T5.X, literal.x, 936; EG-NEXT: AND_INT * T1.W, T4.X, literal.y, 937; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43) 938; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x, 939; EG-NEXT: OR_INT * T0.W, PV.W, PS, 940; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 941; EG-NEXT: AND_INT T0.W, PS, literal.x, 942; EG-NEXT: LSHL * T1.W, PV.W, literal.y, 943; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 944; EG-NEXT: LSHL T4.X, PV.W, PS, 945; EG-NEXT: LSHL * T4.W, literal.x, PS, 946; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 947; EG-NEXT: MOV T4.Y, 0.0, 948; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 949; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 950; EG-NEXT: AND_INT T1.W, PV.W, literal.x, 951; EG-NEXT: AND_INT * T2.W, T6.X, literal.y, 952; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43) 953; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 954; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 955; EG-NEXT: LSHL T5.X, T2.W, PV.W, 956; EG-NEXT: LSHL * T5.W, literal.x, PV.W, 957; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 958; EG-NEXT: MOV T5.Y, 0.0, 959; EG-NEXT: MOV T4.Z, 0.0, 960; EG-NEXT: MOV * T5.Z, 0.0, 961; EG-NEXT: LSHR T6.X, T0.W, literal.x, 962; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 963; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 964; 965; CM-LABEL: v3i8_arg: 966; CM: ; %bb.0: ; 
%entry 967; CM-NEXT: ALU 0, @12, KC0[], KC1[] 968; CM-NEXT: TEX 2 @6 969; CM-NEXT: ALU 29, @13, KC0[CB0:0-32], KC1[] 970; CM-NEXT: MEM_RAT MSKOR T4.XW, T7.X 971; CM-NEXT: MEM_RAT MSKOR T5.XW, T6.X 972; CM-NEXT: CF_END 973; CM-NEXT: Fetch clause starting at 6: 974; CM-NEXT: VTX_READ_8 T5.X, T4.X, 41, #3 975; CM-NEXT: VTX_READ_8 T6.X, T4.X, 42, #3 976; CM-NEXT: VTX_READ_8 T4.X, T4.X, 40, #3 977; CM-NEXT: ALU clause starting at 12: 978; CM-NEXT: MOV * T4.X, 0.0, 979; CM-NEXT: ALU clause starting at 13: 980; CM-NEXT: LSHL T0.Z, T5.X, literal.x, 981; CM-NEXT: AND_INT * T0.W, T4.X, literal.y, BS:VEC_120/SCL_212 982; CM-NEXT: 8(1.121039e-44), 255(3.573311e-43) 983; CM-NEXT: AND_INT T1.Z, KC0[2].Y, literal.x, 984; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 985; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 986; CM-NEXT: AND_INT T0.Z, PV.W, literal.x, 987; CM-NEXT: LSHL * T0.W, PV.Z, literal.y, 988; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 989; CM-NEXT: LSHL T4.X, PV.Z, PV.W, 990; CM-NEXT: LSHL * T4.W, literal.x, PV.W, 991; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 992; CM-NEXT: MOV T4.Y, 0.0, 993; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 994; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 995; CM-NEXT: AND_INT * T1.W, PV.W, literal.x, 996; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 997; CM-NEXT: AND_INT T0.Z, T6.X, literal.x, 998; CM-NEXT: LSHL * T1.W, PV.W, literal.y, 999; CM-NEXT: 255(3.573311e-43), 3(4.203895e-45) 1000; CM-NEXT: LSHL T5.X, PV.Z, PV.W, 1001; CM-NEXT: LSHL * T5.W, literal.x, PV.W, 1002; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1003; CM-NEXT: MOV T5.Y, 0.0, 1004; CM-NEXT: MOV * T4.Z, 0.0, 1005; CM-NEXT: MOV * T5.Z, 0.0, 1006; CM-NEXT: LSHR * T6.X, T0.W, literal.x, 1007; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1008; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 1009; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1010entry: 1011 store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4 1012 ret void 1013} 1014 1015define amdgpu_kernel void @v3i16_arg(<3 x i16> 
addrspace(1)* nocapture %out, <3 x i16> %in) nounwind { 1016; SI-LABEL: v3i16_arg: 1017; SI: ; %bb.0: ; %entry 1018; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1019; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1020; SI-NEXT: s_mov_b32 s3, 0xf000 1021; SI-NEXT: s_mov_b32 s2, -1 1022; SI-NEXT: s_waitcnt lgkmcnt(0) 1023; SI-NEXT: v_mov_b32_e32 v0, s5 1024; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 1025; SI-NEXT: s_waitcnt expcnt(0) 1026; SI-NEXT: v_mov_b32_e32 v0, s4 1027; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1028; SI-NEXT: s_endpgm 1029; 1030; VI-LABEL: v3i16_arg: 1031; VI: ; %bb.0: ; %entry 1032; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1033; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 1034; VI-NEXT: s_waitcnt lgkmcnt(0) 1035; VI-NEXT: s_add_u32 s4, s2, 4 1036; VI-NEXT: s_addc_u32 s5, s3, 0 1037; VI-NEXT: v_mov_b32_e32 v2, s4 1038; VI-NEXT: v_mov_b32_e32 v4, s1 1039; VI-NEXT: v_mov_b32_e32 v0, s2 1040; VI-NEXT: v_mov_b32_e32 v3, s5 1041; VI-NEXT: v_mov_b32_e32 v1, s3 1042; VI-NEXT: v_mov_b32_e32 v5, s0 1043; VI-NEXT: flat_store_short v[2:3], v4 1044; VI-NEXT: flat_store_dword v[0:1], v5 1045; VI-NEXT: s_endpgm 1046; 1047; GFX9-LABEL: v3i16_arg: 1048; GFX9: ; %bb.0: ; %entry 1049; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1050; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1051; GFX9-NEXT: v_mov_b32_e32 v0, 0 1052; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1053; GFX9-NEXT: v_mov_b32_e32 v1, s1 1054; GFX9-NEXT: v_mov_b32_e32 v2, s0 1055; GFX9-NEXT: global_store_short v0, v1, s[2:3] offset:4 1056; GFX9-NEXT: global_store_dword v0, v2, s[2:3] 1057; GFX9-NEXT: s_endpgm 1058; 1059; EG-LABEL: v3i16_arg: 1060; EG: ; %bb.0: ; %entry 1061; EG-NEXT: ALU 0, @12, KC0[], KC1[] 1062; EG-NEXT: TEX 2 @6 1063; EG-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[] 1064; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 0 1065; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X 1066; EG-NEXT: CF_END 1067; EG-NEXT: Fetch clause starting at 6: 1068; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3 
1069; EG-NEXT: VTX_READ_16 T7.X, T5.X, 46, #3 1070; EG-NEXT: VTX_READ_16 T5.X, T5.X, 48, #3 1071; EG-NEXT: ALU clause starting at 12: 1072; EG-NEXT: MOV * T5.X, 0.0, 1073; EG-NEXT: ALU clause starting at 13: 1074; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1075; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) 1076; EG-NEXT: AND_INT T1.W, PV.W, literal.x, 1077; EG-NEXT: AND_INT * T2.W, T5.X, literal.y, 1078; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) 1079; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 1080; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1081; EG-NEXT: LSHL T5.X, T2.W, PV.W, 1082; EG-NEXT: LSHL * T5.W, literal.x, PV.W, 1083; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1084; EG-NEXT: MOV T5.Y, 0.0, 1085; EG-NEXT: MOV * T5.Z, 0.0, 1086; EG-NEXT: LSHR T8.X, T0.W, literal.x, 1087; EG-NEXT: LSHL T0.W, T7.X, literal.y, 1088; EG-NEXT: AND_INT * T1.W, T6.X, literal.z, 1089; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) 1090; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1091; EG-NEXT: OR_INT T6.X, PV.W, PS, 1092; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 1093; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1094; 1095; CM-LABEL: v3i16_arg: 1096; CM: ; %bb.0: ; %entry 1097; CM-NEXT: ALU 0, @12, KC0[], KC1[] 1098; CM-NEXT: TEX 2 @6 1099; CM-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[] 1100; CM-NEXT: MEM_RAT MSKOR T5.XW, T8.X 1101; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T7.X 1102; CM-NEXT: CF_END 1103; CM-NEXT: Fetch clause starting at 6: 1104; CM-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3 1105; CM-NEXT: VTX_READ_16 T7.X, T5.X, 46, #3 1106; CM-NEXT: VTX_READ_16 T5.X, T5.X, 48, #3 1107; CM-NEXT: ALU clause starting at 12: 1108; CM-NEXT: MOV * T5.X, 0.0, 1109; CM-NEXT: ALU clause starting at 13: 1110; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1111; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00) 1112; CM-NEXT: AND_INT * T1.W, PV.W, literal.x, 1113; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1114; CM-NEXT: AND_INT T0.Z, T5.X, literal.x, 1115; CM-NEXT: LSHL * T1.W, PV.W, literal.y, 1116; 
CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 1117; CM-NEXT: LSHL T5.X, PV.Z, PV.W, 1118; CM-NEXT: LSHL * T5.W, literal.x, PV.W, 1119; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1120; CM-NEXT: MOV T5.Y, 0.0, 1121; CM-NEXT: MOV * T5.Z, 0.0, 1122; CM-NEXT: LSHL T0.Z, T7.X, literal.x, 1123; CM-NEXT: AND_INT * T1.W, T6.X, literal.y, BS:VEC_120/SCL_212 1124; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 1125; CM-NEXT: OR_INT * T6.X, PV.Z, PV.W, 1126; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 1127; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1128; CM-NEXT: LSHR * T8.X, T0.W, literal.x, 1129; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1130entry: 1131 store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4 1132 ret void 1133} 1134 1135define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind { 1136; SI-LABEL: v3i32_arg: 1137; SI: ; %bb.0: ; %entry 1138; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1139; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1140; SI-NEXT: s_mov_b32 s3, 0xf000 1141; SI-NEXT: s_mov_b32 s2, -1 1142; SI-NEXT: s_waitcnt lgkmcnt(0) 1143; SI-NEXT: v_mov_b32_e32 v0, s6 1144; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 1145; SI-NEXT: s_waitcnt expcnt(0) 1146; SI-NEXT: v_mov_b32_e32 v0, s4 1147; SI-NEXT: v_mov_b32_e32 v1, s5 1148; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1149; SI-NEXT: s_endpgm 1150; 1151; VI-LABEL: v3i32_arg: 1152; VI: ; %bb.0: ; %entry 1153; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1154; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1155; VI-NEXT: s_waitcnt lgkmcnt(0) 1156; VI-NEXT: v_mov_b32_e32 v0, s4 1157; VI-NEXT: v_mov_b32_e32 v4, s1 1158; VI-NEXT: v_mov_b32_e32 v1, s5 1159; VI-NEXT: v_mov_b32_e32 v2, s6 1160; VI-NEXT: v_mov_b32_e32 v3, s0 1161; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] 1162; VI-NEXT: s_endpgm 1163; 1164; GFX9-LABEL: v3i32_arg: 1165; GFX9: ; %bb.0: ; %entry 1166; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 1167; GFX9-NEXT: s_load_dwordx2 
s[6:7], s[4:5], 0x0 1168; GFX9-NEXT: v_mov_b32_e32 v3, 0 1169; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1170; GFX9-NEXT: v_mov_b32_e32 v0, s0 1171; GFX9-NEXT: v_mov_b32_e32 v1, s1 1172; GFX9-NEXT: v_mov_b32_e32 v2, s2 1173; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] 1174; GFX9-NEXT: s_endpgm 1175; 1176; EG-LABEL: v3i32_arg: 1177; EG: ; %bb.0: ; %entry 1178; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] 1179; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 1180; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1181; EG-NEXT: CF_END 1182; EG-NEXT: ALU clause starting at 4: 1183; EG-NEXT: MOV * T0.Y, KC0[3].Z, 1184; EG-NEXT: MOV T0.X, KC0[3].Y, 1185; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1186; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1187; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1188; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1189; EG-NEXT: LSHR T2.X, PV.W, literal.x, 1190; EG-NEXT: MOV * T3.X, KC0[3].W, 1191; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1192; 1193; CM-LABEL: v3i32_arg: 1194; CM: ; %bb.0: ; %entry 1195; CM-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] 1196; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T3.X 1197; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X 1198; CM-NEXT: CF_END 1199; CM-NEXT: ALU clause starting at 4: 1200; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1201; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1202; CM-NEXT: LSHR * T0.X, PV.W, literal.x, 1203; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1204; CM-NEXT: MOV T1.X, KC0[3].W, 1205; CM-NEXT: MOV * T2.Y, KC0[3].Z, 1206; CM-NEXT: MOV * T2.X, KC0[3].Y, 1207; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 1208; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1209entry: 1210 store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4 1211 ret void 1212} 1213 1214define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind { 1215; SI-LABEL: v3f32_arg: 1216; SI: ; %bb.0: ; %entry 1217; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1218; SI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 1219; SI-NEXT: s_mov_b32 s3, 0xf000 1220; SI-NEXT: s_mov_b32 s2, -1 1221; SI-NEXT: s_waitcnt lgkmcnt(0) 1222; SI-NEXT: v_mov_b32_e32 v0, s6 1223; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 1224; SI-NEXT: s_waitcnt expcnt(0) 1225; SI-NEXT: v_mov_b32_e32 v0, s4 1226; SI-NEXT: v_mov_b32_e32 v1, s5 1227; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1228; SI-NEXT: s_endpgm 1229; 1230; VI-LABEL: v3f32_arg: 1231; VI: ; %bb.0: ; %entry 1232; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 1233; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1234; VI-NEXT: s_waitcnt lgkmcnt(0) 1235; VI-NEXT: v_mov_b32_e32 v0, s4 1236; VI-NEXT: v_mov_b32_e32 v4, s1 1237; VI-NEXT: v_mov_b32_e32 v1, s5 1238; VI-NEXT: v_mov_b32_e32 v2, s6 1239; VI-NEXT: v_mov_b32_e32 v3, s0 1240; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] 1241; VI-NEXT: s_endpgm 1242; 1243; GFX9-LABEL: v3f32_arg: 1244; GFX9: ; %bb.0: ; %entry 1245; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 1246; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 1247; GFX9-NEXT: v_mov_b32_e32 v3, 0 1248; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1249; GFX9-NEXT: v_mov_b32_e32 v0, s0 1250; GFX9-NEXT: v_mov_b32_e32 v1, s1 1251; GFX9-NEXT: v_mov_b32_e32 v2, s2 1252; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] 1253; GFX9-NEXT: s_endpgm 1254; 1255; EG-LABEL: v3f32_arg: 1256; EG: ; %bb.0: ; %entry 1257; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] 1258; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 1259; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1260; EG-NEXT: CF_END 1261; EG-NEXT: ALU clause starting at 4: 1262; EG-NEXT: MOV * T0.Y, KC0[3].Z, 1263; EG-NEXT: MOV T0.X, KC0[3].Y, 1264; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1265; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1266; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1267; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1268; EG-NEXT: LSHR T2.X, PV.W, literal.x, 1269; EG-NEXT: MOV * T3.X, KC0[3].W, 1270; EG-NEXT: 2(2.802597e-45), 
0(0.000000e+00) 1271; 1272; CM-LABEL: v3f32_arg: 1273; CM: ; %bb.0: ; %entry 1274; CM-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] 1275; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T3.X 1276; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X 1277; CM-NEXT: CF_END 1278; CM-NEXT: ALU clause starting at 4: 1279; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1280; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1281; CM-NEXT: LSHR * T0.X, PV.W, literal.x, 1282; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1283; CM-NEXT: MOV T1.X, KC0[3].W, 1284; CM-NEXT: MOV * T2.Y, KC0[3].Z, 1285; CM-NEXT: MOV * T2.X, KC0[3].Y, 1286; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 1287; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1288entry: 1289 store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4 1290 ret void 1291} 1292 1293define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) { 1294; SI-LABEL: v4i8_arg: 1295; SI: ; %bb.0: ; %entry 1296; SI-NEXT: s_load_dword s4, s[0:1], 0xb 1297; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1298; SI-NEXT: s_mov_b32 s3, 0xf000 1299; SI-NEXT: s_mov_b32 s2, -1 1300; SI-NEXT: s_waitcnt lgkmcnt(0) 1301; SI-NEXT: v_mov_b32_e32 v0, s4 1302; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1303; SI-NEXT: s_endpgm 1304; 1305; VI-LABEL: v4i8_arg: 1306; VI: ; %bb.0: ; %entry 1307; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1308; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 1309; VI-NEXT: s_waitcnt lgkmcnt(0) 1310; VI-NEXT: v_mov_b32_e32 v0, s2 1311; VI-NEXT: v_mov_b32_e32 v1, s3 1312; VI-NEXT: v_mov_b32_e32 v2, s0 1313; VI-NEXT: flat_store_dword v[0:1], v2 1314; VI-NEXT: s_endpgm 1315; 1316; GFX9-LABEL: v4i8_arg: 1317; GFX9: ; %bb.0: ; %entry 1318; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 1319; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1320; GFX9-NEXT: v_mov_b32_e32 v0, 0 1321; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1322; GFX9-NEXT: v_mov_b32_e32 v1, s2 1323; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1324; GFX9-NEXT: s_endpgm 1325; 1326; EG-LABEL: 
v4i8_arg: 1327; EG: ; %bb.0: ; %entry 1328; EG-NEXT: ALU 0, @14, KC0[], KC1[] 1329; EG-NEXT: TEX 3 @6 1330; EG-NEXT: ALU 15, @15, KC0[CB0:0-32], KC1[] 1331; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1 1332; EG-NEXT: CF_END 1333; EG-NEXT: PAD 1334; EG-NEXT: Fetch clause starting at 6: 1335; EG-NEXT: VTX_READ_8 T5.X, T4.X, 42, #3 1336; EG-NEXT: VTX_READ_8 T6.X, T4.X, 40, #3 1337; EG-NEXT: VTX_READ_8 T7.X, T4.X, 43, #3 1338; EG-NEXT: VTX_READ_8 T4.X, T4.X, 41, #3 1339; EG-NEXT: ALU clause starting at 14: 1340; EG-NEXT: MOV * T4.X, 0.0, 1341; EG-NEXT: ALU clause starting at 15: 1342; EG-NEXT: AND_INT * T0.W, T5.X, literal.x, 1343; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1344; EG-NEXT: AND_INT T0.Z, T4.X, literal.x, 1345; EG-NEXT: LSHL T0.W, PV.W, literal.y, 1346; EG-NEXT: LSHL * T1.W, T7.X, literal.z, 1347; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44) 1348; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) 1349; EG-NEXT: OR_INT T0.W, PS, PV.W, 1350; EG-NEXT: LSHL * T1.W, PV.Z, literal.x, 1351; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1352; EG-NEXT: OR_INT T0.W, PV.W, PS, 1353; EG-NEXT: AND_INT * T1.W, T6.X, literal.x, 1354; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1355; EG-NEXT: OR_INT T4.X, PV.W, PS, 1356; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 1357; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1358; 1359; CM-LABEL: v4i8_arg: 1360; CM: ; %bb.0: ; %entry 1361; CM-NEXT: ALU 0, @14, KC0[], KC1[] 1362; CM-NEXT: TEX 3 @6 1363; CM-NEXT: ALU 15, @15, KC0[CB0:0-32], KC1[] 1364; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T5.X 1365; CM-NEXT: CF_END 1366; CM-NEXT: PAD 1367; CM-NEXT: Fetch clause starting at 6: 1368; CM-NEXT: VTX_READ_8 T5.X, T4.X, 42, #3 1369; CM-NEXT: VTX_READ_8 T6.X, T4.X, 40, #3 1370; CM-NEXT: VTX_READ_8 T7.X, T4.X, 43, #3 1371; CM-NEXT: VTX_READ_8 T4.X, T4.X, 41, #3 1372; CM-NEXT: ALU clause starting at 14: 1373; CM-NEXT: MOV * T4.X, 0.0, 1374; CM-NEXT: ALU clause starting at 15: 1375; CM-NEXT: AND_INT * T0.W, T5.X, literal.x, 1376; CM-NEXT: 
255(3.573311e-43), 0(0.000000e+00) 1377; CM-NEXT: AND_INT T0.Y, T4.X, literal.x, 1378; CM-NEXT: LSHL T0.Z, PV.W, literal.y, 1379; CM-NEXT: LSHL * T0.W, T7.X, literal.z, BS:VEC_120/SCL_212 1380; CM-NEXT: 255(3.573311e-43), 16(2.242078e-44) 1381; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00) 1382; CM-NEXT: OR_INT T0.Z, PV.W, PV.Z, 1383; CM-NEXT: LSHL * T0.W, PV.Y, literal.x, 1384; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1385; CM-NEXT: OR_INT T0.Z, PV.Z, PV.W, 1386; CM-NEXT: AND_INT * T0.W, T6.X, literal.x, 1387; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1388; CM-NEXT: OR_INT * T4.X, PV.Z, PV.W, 1389; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 1390; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1391entry: 1392 store <4 x i8> %in, <4 x i8> addrspace(1)* %out 1393 ret void 1394} 1395 1396define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) { 1397; SI-LABEL: v4i16_arg: 1398; SI: ; %bb.0: ; %entry 1399; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 1400; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1401; SI-NEXT: s_mov_b32 s3, 0xf000 1402; SI-NEXT: s_mov_b32 s2, -1 1403; SI-NEXT: s_waitcnt lgkmcnt(0) 1404; SI-NEXT: v_mov_b32_e32 v0, s4 1405; SI-NEXT: v_mov_b32_e32 v1, s5 1406; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1407; SI-NEXT: s_endpgm 1408; 1409; VI-LABEL: v4i16_arg: 1410; VI: ; %bb.0: ; %entry 1411; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1412; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 1413; VI-NEXT: s_waitcnt lgkmcnt(0) 1414; VI-NEXT: v_mov_b32_e32 v0, s2 1415; VI-NEXT: v_mov_b32_e32 v3, s1 1416; VI-NEXT: v_mov_b32_e32 v1, s3 1417; VI-NEXT: v_mov_b32_e32 v2, s0 1418; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1419; VI-NEXT: s_endpgm 1420; 1421; GFX9-LABEL: v4i16_arg: 1422; GFX9: ; %bb.0: ; %entry 1423; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1424; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1425; GFX9-NEXT: v_mov_b32_e32 v2, 0 1426; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1427; GFX9-NEXT: v_mov_b32_e32 v0, s0 1428; 
GFX9-NEXT: v_mov_b32_e32 v1, s1 1429; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 1430; GFX9-NEXT: s_endpgm 1431; 1432; EG-LABEL: v4i16_arg: 1433; EG: ; %bb.0: ; %entry 1434; EG-NEXT: ALU 1, @20, KC0[], KC1[] 1435; EG-NEXT: TEX 0 @12 1436; EG-NEXT: ALU 5, @22, KC0[], KC1[] 1437; EG-NEXT: TEX 0 @14 1438; EG-NEXT: ALU 5, @28, KC0[], KC1[] 1439; EG-NEXT: TEX 0 @16 1440; EG-NEXT: ALU 5, @34, KC0[], KC1[] 1441; EG-NEXT: TEX 0 @18 1442; EG-NEXT: ALU 7, @40, KC0[CB0:0-32], KC1[] 1443; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1 1444; EG-NEXT: CF_END 1445; EG-NEXT: PAD 1446; EG-NEXT: Fetch clause starting at 12: 1447; EG-NEXT: VTX_READ_16 T6.X, T5.X, 50, #3 1448; EG-NEXT: Fetch clause starting at 14: 1449; EG-NEXT: VTX_READ_16 T6.X, T5.X, 48, #3 1450; EG-NEXT: Fetch clause starting at 16: 1451; EG-NEXT: VTX_READ_16 T6.X, T5.X, 46, #3 1452; EG-NEXT: Fetch clause starting at 18: 1453; EG-NEXT: VTX_READ_16 T5.X, T5.X, 44, #3 1454; EG-NEXT: ALU clause starting at 20: 1455; EG-NEXT: MOV * T0.Y, T3.X, 1456; EG-NEXT: MOV * T5.X, 0.0, 1457; EG-NEXT: ALU clause starting at 22: 1458; EG-NEXT: LSHL T0.W, T6.X, literal.x, 1459; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 1460; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 1461; EG-NEXT: OR_INT * T0.W, PS, PV.W, 1462; EG-NEXT: MOV * T3.X, PV.W, 1463; EG-NEXT: MOV * T0.Y, PV.X, 1464; EG-NEXT: ALU clause starting at 28: 1465; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 1466; EG-NEXT: AND_INT * T1.W, T6.X, literal.y, 1467; EG-NEXT: -65536(nan), 65535(9.183409e-41) 1468; EG-NEXT: OR_INT * T0.W, PV.W, PS, 1469; EG-NEXT: MOV T3.X, PV.W, 1470; EG-NEXT: MOV * T0.Y, T2.X, 1471; EG-NEXT: ALU clause starting at 34: 1472; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 1473; EG-NEXT: LSHL * T1.W, T6.X, literal.y, 1474; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 1475; EG-NEXT: OR_INT * T0.W, PV.W, PS, 1476; EG-NEXT: MOV * T2.X, PV.W, 1477; EG-NEXT: MOV * T0.Y, PV.X, 1478; EG-NEXT: ALU clause starting at 40: 1479; EG-NEXT: LSHR T6.X, KC0[2].Y, 
literal.x, 1480; EG-NEXT: AND_INT T0.W, T0.Y, literal.y, 1481; EG-NEXT: AND_INT * T1.W, T5.X, literal.z, 1482; EG-NEXT: 2(2.802597e-45), -65536(nan) 1483; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1484; EG-NEXT: OR_INT * T5.X, PV.W, PS, 1485; EG-NEXT: MOV T2.X, PV.X, 1486; EG-NEXT: MOV * T5.Y, T3.X, 1487; 1488; CM-LABEL: v4i16_arg: 1489; CM: ; %bb.0: ; %entry 1490; CM-NEXT: ALU 1, @20, KC0[], KC1[] 1491; CM-NEXT: TEX 0 @12 1492; CM-NEXT: ALU 5, @22, KC0[], KC1[] 1493; CM-NEXT: TEX 0 @14 1494; CM-NEXT: ALU 5, @28, KC0[], KC1[] 1495; CM-NEXT: TEX 0 @16 1496; CM-NEXT: ALU 5, @34, KC0[], KC1[] 1497; CM-NEXT: TEX 0 @18 1498; CM-NEXT: ALU 7, @40, KC0[CB0:0-32], KC1[] 1499; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X 1500; CM-NEXT: CF_END 1501; CM-NEXT: PAD 1502; CM-NEXT: Fetch clause starting at 12: 1503; CM-NEXT: VTX_READ_16 T6.X, T5.X, 50, #3 1504; CM-NEXT: Fetch clause starting at 14: 1505; CM-NEXT: VTX_READ_16 T6.X, T5.X, 48, #3 1506; CM-NEXT: Fetch clause starting at 16: 1507; CM-NEXT: VTX_READ_16 T6.X, T5.X, 46, #3 1508; CM-NEXT: Fetch clause starting at 18: 1509; CM-NEXT: VTX_READ_16 T5.X, T5.X, 44, #3 1510; CM-NEXT: ALU clause starting at 20: 1511; CM-NEXT: MOV * T0.Y, T3.X, 1512; CM-NEXT: MOV * T5.X, 0.0, 1513; CM-NEXT: ALU clause starting at 22: 1514; CM-NEXT: LSHL T0.Z, T6.X, literal.x, 1515; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, 1516; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 1517; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, 1518; CM-NEXT: MOV * T3.X, PV.W, 1519; CM-NEXT: MOV * T0.Y, PV.X, 1520; CM-NEXT: ALU clause starting at 28: 1521; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 1522; CM-NEXT: AND_INT * T0.W, T6.X, literal.y, 1523; CM-NEXT: -65536(nan), 65535(9.183409e-41) 1524; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 1525; CM-NEXT: MOV T3.X, PV.W, 1526; CM-NEXT: MOV * T0.Y, T2.X, 1527; CM-NEXT: ALU clause starting at 34: 1528; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 1529; CM-NEXT: LSHL * T0.W, T6.X, literal.y, 1530; CM-NEXT: 65535(9.183409e-41), 
16(2.242078e-44) 1531; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 1532; CM-NEXT: MOV * T2.X, PV.W, 1533; CM-NEXT: MOV * T0.Y, PV.X, 1534; CM-NEXT: ALU clause starting at 40: 1535; CM-NEXT: LSHR T6.X, KC0[2].Y, literal.x, 1536; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y, 1537; CM-NEXT: AND_INT * T0.W, T5.X, literal.z, 1538; CM-NEXT: 2(2.802597e-45), -65536(nan) 1539; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1540; CM-NEXT: OR_INT * T5.X, PV.Z, PV.W, 1541; CM-NEXT: MOV T2.X, PV.X, 1542; CM-NEXT: MOV * T5.Y, T3.X, 1543entry: 1544 store <4 x i16> %in, <4 x i16> addrspace(1)* %out 1545 ret void 1546} 1547 1548define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind { 1549; SI-LABEL: v4i32_arg: 1550; SI: ; %bb.0: ; %entry 1551; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1552; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1553; SI-NEXT: s_mov_b32 s3, 0xf000 1554; SI-NEXT: s_mov_b32 s2, -1 1555; SI-NEXT: s_waitcnt lgkmcnt(0) 1556; SI-NEXT: v_mov_b32_e32 v0, s4 1557; SI-NEXT: v_mov_b32_e32 v1, s5 1558; SI-NEXT: v_mov_b32_e32 v2, s6 1559; SI-NEXT: v_mov_b32_e32 v3, s7 1560; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1561; SI-NEXT: s_endpgm 1562; 1563; VI-LABEL: v4i32_arg: 1564; VI: ; %bb.0: ; %entry 1565; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1566; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 1567; VI-NEXT: s_waitcnt lgkmcnt(0) 1568; VI-NEXT: v_mov_b32_e32 v4, s4 1569; VI-NEXT: v_mov_b32_e32 v0, s0 1570; VI-NEXT: v_mov_b32_e32 v5, s5 1571; VI-NEXT: v_mov_b32_e32 v1, s1 1572; VI-NEXT: v_mov_b32_e32 v2, s2 1573; VI-NEXT: v_mov_b32_e32 v3, s3 1574; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1575; VI-NEXT: s_endpgm 1576; 1577; GFX9-LABEL: v4i32_arg: 1578; GFX9: ; %bb.0: ; %entry 1579; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 1580; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 1581; GFX9-NEXT: v_mov_b32_e32 v4, 0 1582; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1583; GFX9-NEXT: v_mov_b32_e32 v0, s0 1584; GFX9-NEXT: v_mov_b32_e32 v1, 
s1 1585; GFX9-NEXT: v_mov_b32_e32 v2, s2 1586; GFX9-NEXT: v_mov_b32_e32 v3, s3 1587; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 1588; GFX9-NEXT: s_endpgm 1589; 1590; EG-LABEL: v4i32_arg: 1591; EG: ; %bb.0: ; %entry 1592; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 1593; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 1594; EG-NEXT: CF_END 1595; EG-NEXT: PAD 1596; EG-NEXT: ALU clause starting at 4: 1597; EG-NEXT: MOV * T0.W, KC0[4].X, 1598; EG-NEXT: MOV * T0.Z, KC0[3].W, 1599; EG-NEXT: MOV * T0.Y, KC0[3].Z, 1600; EG-NEXT: MOV T0.X, KC0[3].Y, 1601; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1602; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1603; 1604; CM-LABEL: v4i32_arg: 1605; CM: ; %bb.0: ; %entry 1606; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 1607; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 1608; CM-NEXT: CF_END 1609; CM-NEXT: PAD 1610; CM-NEXT: ALU clause starting at 4: 1611; CM-NEXT: MOV * T0.W, KC0[4].X, 1612; CM-NEXT: MOV * T0.Z, KC0[3].W, 1613; CM-NEXT: MOV * T0.Y, KC0[3].Z, 1614; CM-NEXT: MOV * T0.X, KC0[3].Y, 1615; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1616; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1617entry: 1618 store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 1619 ret void 1620} 1621 1622define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind { 1623; SI-LABEL: v4f32_arg: 1624; SI: ; %bb.0: ; %entry 1625; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 1626; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1627; SI-NEXT: s_mov_b32 s3, 0xf000 1628; SI-NEXT: s_mov_b32 s2, -1 1629; SI-NEXT: s_waitcnt lgkmcnt(0) 1630; SI-NEXT: v_mov_b32_e32 v0, s4 1631; SI-NEXT: v_mov_b32_e32 v1, s5 1632; SI-NEXT: v_mov_b32_e32 v2, s6 1633; SI-NEXT: v_mov_b32_e32 v3, s7 1634; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1635; SI-NEXT: s_endpgm 1636; 1637; VI-LABEL: v4f32_arg: 1638; VI: ; %bb.0: ; %entry 1639; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1640; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x34 1641; VI-NEXT: s_waitcnt lgkmcnt(0) 1642; VI-NEXT: v_mov_b32_e32 v4, s4 1643; VI-NEXT: v_mov_b32_e32 v0, s0 1644; VI-NEXT: v_mov_b32_e32 v5, s5 1645; VI-NEXT: v_mov_b32_e32 v1, s1 1646; VI-NEXT: v_mov_b32_e32 v2, s2 1647; VI-NEXT: v_mov_b32_e32 v3, s3 1648; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1649; VI-NEXT: s_endpgm 1650; 1651; GFX9-LABEL: v4f32_arg: 1652; GFX9: ; %bb.0: ; %entry 1653; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 1654; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 1655; GFX9-NEXT: v_mov_b32_e32 v4, 0 1656; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1657; GFX9-NEXT: v_mov_b32_e32 v0, s0 1658; GFX9-NEXT: v_mov_b32_e32 v1, s1 1659; GFX9-NEXT: v_mov_b32_e32 v2, s2 1660; GFX9-NEXT: v_mov_b32_e32 v3, s3 1661; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 1662; GFX9-NEXT: s_endpgm 1663; 1664; EG-LABEL: v4f32_arg: 1665; EG: ; %bb.0: ; %entry 1666; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 1667; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 1668; EG-NEXT: CF_END 1669; EG-NEXT: PAD 1670; EG-NEXT: ALU clause starting at 4: 1671; EG-NEXT: MOV * T0.W, KC0[4].X, 1672; EG-NEXT: MOV * T0.Z, KC0[3].W, 1673; EG-NEXT: MOV * T0.Y, KC0[3].Z, 1674; EG-NEXT: MOV T0.X, KC0[3].Y, 1675; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1676; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1677; 1678; CM-LABEL: v4f32_arg: 1679; CM: ; %bb.0: ; %entry 1680; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 1681; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 1682; CM-NEXT: CF_END 1683; CM-NEXT: PAD 1684; CM-NEXT: ALU clause starting at 4: 1685; CM-NEXT: MOV * T0.W, KC0[4].X, 1686; CM-NEXT: MOV * T0.Z, KC0[3].W, 1687; CM-NEXT: MOV * T0.Y, KC0[3].Z, 1688; CM-NEXT: MOV * T0.X, KC0[3].Y, 1689; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1690; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1691entry: 1692 store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4 1693 ret void 1694} 1695 1696define amdgpu_kernel void @v5i8_arg(<5 x i8> addrspace(1)* nocapture %out, <5 x i8> %in) nounwind { 
1697; SI-LABEL: v5i8_arg: 1698; SI: ; %bb.0: ; %entry 1699; SI-NEXT: s_load_dword s2, s[0:1], 0xc 1700; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1701; SI-NEXT: s_load_dword s0, s[0:1], 0xb 1702; SI-NEXT: s_mov_b32 s7, 0xf000 1703; SI-NEXT: s_mov_b32 s6, -1 1704; SI-NEXT: s_waitcnt lgkmcnt(0) 1705; SI-NEXT: v_mov_b32_e32 v0, s2 1706; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:4 1707; SI-NEXT: s_waitcnt expcnt(0) 1708; SI-NEXT: v_mov_b32_e32 v0, s0 1709; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1710; SI-NEXT: s_endpgm 1711; 1712; VI-LABEL: v5i8_arg: 1713; VI: ; %bb.0: ; %entry 1714; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1715; VI-NEXT: s_load_dword s4, s[0:1], 0x30 1716; VI-NEXT: s_load_dword s5, s[0:1], 0x2c 1717; VI-NEXT: s_waitcnt lgkmcnt(0) 1718; VI-NEXT: s_add_u32 s0, s2, 4 1719; VI-NEXT: s_addc_u32 s1, s3, 0 1720; VI-NEXT: v_mov_b32_e32 v3, s1 1721; VI-NEXT: v_mov_b32_e32 v4, s4 1722; VI-NEXT: v_mov_b32_e32 v0, s2 1723; VI-NEXT: v_mov_b32_e32 v2, s0 1724; VI-NEXT: v_mov_b32_e32 v1, s3 1725; VI-NEXT: flat_store_byte v[2:3], v4 1726; VI-NEXT: v_mov_b32_e32 v2, s5 1727; VI-NEXT: flat_store_dword v[0:1], v2 1728; VI-NEXT: s_endpgm 1729; 1730; GFX9-LABEL: v5i8_arg: 1731; GFX9: ; %bb.0: ; %entry 1732; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 1733; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1734; GFX9-NEXT: v_mov_b32_e32 v0, 0 1735; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1736; GFX9-NEXT: v_mov_b32_e32 v1, s1 1737; GFX9-NEXT: v_mov_b32_e32 v2, s0 1738; GFX9-NEXT: global_store_byte v0, v1, s[2:3] offset:4 1739; GFX9-NEXT: global_store_dword v0, v2, s[2:3] 1740; GFX9-NEXT: s_endpgm 1741; 1742; EG-LABEL: v5i8_arg: 1743; EG: ; %bb.0: ; %entry 1744; EG-NEXT: ALU 0, @16, KC0[], KC1[] 1745; EG-NEXT: TEX 4 @6 1746; EG-NEXT: ALU 28, @17, KC0[CB0:0-32], KC1[] 1747; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X 1748; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 1 1749; EG-NEXT: CF_END 1750; EG-NEXT: Fetch clause starting at 6: 1751; EG-NEXT: VTX_READ_8 T6.X, 
T5.X, 44, #3 1752; EG-NEXT: VTX_READ_8 T7.X, T5.X, 47, #3 1753; EG-NEXT: VTX_READ_8 T8.X, T5.X, 45, #3 1754; EG-NEXT: VTX_READ_8 T9.X, T5.X, 46, #3 1755; EG-NEXT: VTX_READ_8 T5.X, T5.X, 48, #3 1756; EG-NEXT: ALU clause starting at 16: 1757; EG-NEXT: MOV * T5.X, 0.0, 1758; EG-NEXT: ALU clause starting at 17: 1759; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1760; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) 1761; EG-NEXT: AND_INT T1.W, PV.W, literal.x, 1762; EG-NEXT: AND_INT * T2.W, T5.X, literal.y, 1763; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43) 1764; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 1765; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1766; EG-NEXT: LSHL T5.X, T2.W, PV.W, 1767; EG-NEXT: LSHL * T5.W, literal.x, PV.W, 1768; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1769; EG-NEXT: MOV T5.Y, 0.0, 1770; EG-NEXT: MOV T5.Z, 0.0, 1771; EG-NEXT: AND_INT T1.W, T9.X, literal.x, 1772; EG-NEXT: AND_INT * T0.Z, T8.X, literal.x, 1773; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1774; EG-NEXT: LSHL T1.W, PV.W, literal.x, 1775; EG-NEXT: LSHL * T2.W, T7.X, literal.y, 1776; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) 1777; EG-NEXT: OR_INT T1.W, PS, PV.W, 1778; EG-NEXT: LSHL * T2.W, T0.Z, literal.x, 1779; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1780; EG-NEXT: OR_INT T1.W, PV.W, PS, 1781; EG-NEXT: AND_INT * T2.W, T6.X, literal.x, 1782; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1783; EG-NEXT: OR_INT T6.X, PV.W, PS, 1784; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 1785; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1786; EG-NEXT: LSHR * T8.X, T0.W, literal.x, 1787; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1788; 1789; CM-LABEL: v5i8_arg: 1790; CM: ; %bb.0: ; %entry 1791; CM-NEXT: ALU 0, @16, KC0[], KC1[] 1792; CM-NEXT: TEX 4 @6 1793; CM-NEXT: ALU 28, @17, KC0[CB0:0-32], KC1[] 1794; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T8.X 1795; CM-NEXT: MEM_RAT MSKOR T5.XW, T7.X 1796; CM-NEXT: CF_END 1797; CM-NEXT: Fetch clause starting at 6: 1798; CM-NEXT: VTX_READ_8 T6.X, T5.X, 44, #3 
1799; CM-NEXT: VTX_READ_8 T7.X, T5.X, 47, #3 1800; CM-NEXT: VTX_READ_8 T8.X, T5.X, 45, #3 1801; CM-NEXT: VTX_READ_8 T9.X, T5.X, 46, #3 1802; CM-NEXT: VTX_READ_8 T5.X, T5.X, 48, #3 1803; CM-NEXT: ALU clause starting at 16: 1804; CM-NEXT: MOV * T5.X, 0.0, 1805; CM-NEXT: ALU clause starting at 17: 1806; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1807; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00) 1808; CM-NEXT: AND_INT * T1.W, PV.W, literal.x, 1809; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1810; CM-NEXT: AND_INT T0.Z, T5.X, literal.x, 1811; CM-NEXT: LSHL * T1.W, PV.W, literal.y, 1812; CM-NEXT: 255(3.573311e-43), 3(4.203895e-45) 1813; CM-NEXT: LSHL T5.X, PV.Z, PV.W, 1814; CM-NEXT: LSHL * T5.W, literal.x, PV.W, 1815; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1816; CM-NEXT: MOV T5.Y, 0.0, 1817; CM-NEXT: MOV T5.Z, 0.0, 1818; CM-NEXT: AND_INT * T1.W, T9.X, literal.x, 1819; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1820; CM-NEXT: AND_INT T0.Y, T8.X, literal.x, 1821; CM-NEXT: LSHL T0.Z, PV.W, literal.y, 1822; CM-NEXT: LSHL * T1.W, T7.X, literal.z, BS:VEC_120/SCL_212 1823; CM-NEXT: 255(3.573311e-43), 16(2.242078e-44) 1824; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00) 1825; CM-NEXT: OR_INT T0.Z, PV.W, PV.Z, 1826; CM-NEXT: LSHL * T1.W, PV.Y, literal.x, 1827; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1828; CM-NEXT: LSHR T7.X, T0.W, literal.x, 1829; CM-NEXT: OR_INT T0.Z, PV.Z, PV.W, 1830; CM-NEXT: AND_INT * T0.W, T6.X, literal.y, 1831; CM-NEXT: 2(2.802597e-45), 255(3.573311e-43) 1832; CM-NEXT: OR_INT * T6.X, PV.Z, PV.W, 1833; CM-NEXT: LSHR * T8.X, KC0[2].Y, literal.x, 1834; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1835entry: 1836 store <5 x i8> %in, <5 x i8> addrspace(1)* %out, align 4 1837 ret void 1838} 1839 1840define amdgpu_kernel void @v5i16_arg(<5 x i16> addrspace(1)* nocapture %out, <5 x i16> %in) nounwind { 1841; SI-LABEL: v5i16_arg: 1842; SI: ; %bb.0: ; %entry 1843; SI-NEXT: s_load_dword s2, s[0:1], 0xf 1844; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1845; SI-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0xd 1846; SI-NEXT: s_mov_b32 s7, 0xf000 1847; SI-NEXT: s_mov_b32 s6, -1 1848; SI-NEXT: s_waitcnt lgkmcnt(0) 1849; SI-NEXT: v_mov_b32_e32 v0, s2 1850; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:8 1851; SI-NEXT: s_waitcnt expcnt(0) 1852; SI-NEXT: v_mov_b32_e32 v0, s0 1853; SI-NEXT: v_mov_b32_e32 v1, s1 1854; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1855; SI-NEXT: s_endpgm 1856; 1857; VI-LABEL: v5i16_arg: 1858; VI: ; %bb.0: ; %entry 1859; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1860; VI-NEXT: s_load_dword s5, s[0:1], 0x3c 1861; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 1862; VI-NEXT: s_waitcnt lgkmcnt(0) 1863; VI-NEXT: s_add_u32 s4, s2, 8 1864; VI-NEXT: v_mov_b32_e32 v4, s5 1865; VI-NEXT: s_addc_u32 s5, s3, 0 1866; VI-NEXT: v_mov_b32_e32 v2, s4 1867; VI-NEXT: v_mov_b32_e32 v3, s5 1868; VI-NEXT: v_mov_b32_e32 v0, s2 1869; VI-NEXT: flat_store_short v[2:3], v4 1870; VI-NEXT: v_mov_b32_e32 v3, s1 1871; VI-NEXT: v_mov_b32_e32 v1, s3 1872; VI-NEXT: v_mov_b32_e32 v2, s0 1873; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1874; VI-NEXT: s_endpgm 1875; 1876; GFX9-LABEL: v5i16_arg: 1877; GFX9: ; %bb.0: ; %entry 1878; GFX9-NEXT: s_load_dword s6, s[4:5], 0x18 1879; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 1880; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 1881; GFX9-NEXT: v_mov_b32_e32 v2, 0 1882; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1883; GFX9-NEXT: v_mov_b32_e32 v3, s6 1884; GFX9-NEXT: v_mov_b32_e32 v0, s0 1885; GFX9-NEXT: v_mov_b32_e32 v1, s1 1886; GFX9-NEXT: global_store_short v2, v3, s[2:3] offset:8 1887; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 1888; GFX9-NEXT: s_endpgm 1889; 1890; EG-LABEL: v5i16_arg: 1891; EG: ; %bb.0: ; %entry 1892; EG-NEXT: ALU 0, @20, KC0[], KC1[] 1893; EG-NEXT: TEX 4 @10 1894; EG-NEXT: ALU 65, @21, KC0[CB0:0-32], KC1[] 1895; EG-NEXT: MEM_RAT MSKOR T5.XW, T9.X 1896; EG-NEXT: MEM_RAT MSKOR T4.XW, T7.X 1897; EG-NEXT: MEM_RAT MSKOR T3.XW, T2.X 1898; EG-NEXT: MEM_RAT MSKOR T6.XW, T1.X 
1899; EG-NEXT: MEM_RAT MSKOR T8.XW, T0.X 1900; EG-NEXT: CF_END 1901; EG-NEXT: PAD 1902; EG-NEXT: Fetch clause starting at 10: 1903; EG-NEXT: VTX_READ_16 T1.X, T0.X, 58, #3 1904; EG-NEXT: VTX_READ_16 T2.X, T0.X, 56, #3 1905; EG-NEXT: VTX_READ_16 T3.X, T0.X, 54, #3 1906; EG-NEXT: VTX_READ_16 T4.X, T0.X, 52, #3 1907; EG-NEXT: VTX_READ_16 T0.X, T0.X, 60, #3 1908; EG-NEXT: ALU clause starting at 20: 1909; EG-NEXT: MOV * T0.X, 0.0, 1910; EG-NEXT: ALU clause starting at 21: 1911; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1912; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1913; EG-NEXT: AND_INT T1.W, PV.W, literal.x, 1914; EG-NEXT: AND_INT * T2.W, T0.X, literal.y, 1915; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) 1916; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 1917; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1918; EG-NEXT: LSHL T5.X, T2.W, PV.W, 1919; EG-NEXT: LSHL * T5.W, literal.x, PV.W, 1920; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1921; EG-NEXT: MOV T5.Y, 0.0, 1922; EG-NEXT: AND_INT T1.W, KC0[2].Y, literal.x, 1923; EG-NEXT: AND_INT * T2.W, T4.X, literal.y, 1924; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) 1925; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 1926; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1927; EG-NEXT: LSHL T4.X, T2.W, PV.W, 1928; EG-NEXT: LSHL * T4.W, literal.x, PV.W, 1929; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1930; EG-NEXT: MOV T4.Y, 0.0, 1931; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, 1932; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1933; EG-NEXT: AND_INT T2.W, PV.W, literal.x, 1934; EG-NEXT: AND_INT * T3.W, T3.X, literal.y, 1935; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) 1936; EG-NEXT: LSHL * T2.W, PV.W, literal.x, 1937; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1938; EG-NEXT: LSHL T3.X, T3.W, PV.W, 1939; EG-NEXT: LSHL * T3.W, literal.x, PV.W, 1940; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1941; EG-NEXT: MOV T3.Y, 0.0, 1942; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 1943; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) 1944; 
EG-NEXT: AND_INT T6.W, PV.W, literal.x, 1945; EG-NEXT: AND_INT * T7.W, T2.X, literal.y, 1946; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) 1947; EG-NEXT: LSHL * T6.W, PV.W, literal.x, 1948; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1949; EG-NEXT: LSHL T6.X, T7.W, PV.W, 1950; EG-NEXT: LSHL * T6.W, literal.x, PV.W, 1951; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1952; EG-NEXT: MOV T6.Y, 0.0, 1953; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.x, 1954; EG-NEXT: 6(8.407791e-45), 0(0.000000e+00) 1955; EG-NEXT: AND_INT T8.W, PV.W, literal.x, 1956; EG-NEXT: AND_INT * T9.W, T1.X, literal.y, 1957; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) 1958; EG-NEXT: LSHL * T8.W, PV.W, literal.x, 1959; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1960; EG-NEXT: LSHL T8.X, T9.W, PV.W, 1961; EG-NEXT: LSHL * T8.W, literal.x, PV.W, 1962; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1963; EG-NEXT: MOV T8.Y, 0.0, 1964; EG-NEXT: MOV T5.Z, 0.0, 1965; EG-NEXT: MOV * T4.Z, 0.0, 1966; EG-NEXT: MOV T3.Z, 0.0, 1967; EG-NEXT: MOV * T6.Z, 0.0, 1968; EG-NEXT: MOV * T8.Z, 0.0, 1969; EG-NEXT: LSHR T0.X, T7.W, literal.x, 1970; EG-NEXT: LSHR * T1.X, T2.W, literal.x, 1971; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1972; EG-NEXT: LSHR T2.X, T1.W, literal.x, 1973; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 1974; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1975; EG-NEXT: LSHR * T9.X, T0.W, literal.x, 1976; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1977; 1978; CM-LABEL: v5i16_arg: 1979; CM: ; %bb.0: ; %entry 1980; CM-NEXT: ALU 0, @20, KC0[], KC1[] 1981; CM-NEXT: TEX 4 @10 1982; CM-NEXT: ALU 67, @21, KC0[CB0:0-32], KC1[] 1983; CM-NEXT: MEM_RAT MSKOR T5.XW, T9.X 1984; CM-NEXT: MEM_RAT MSKOR T4.XW, T7.X 1985; CM-NEXT: MEM_RAT MSKOR T3.XW, T2.X 1986; CM-NEXT: MEM_RAT MSKOR T6.XW, T1.X 1987; CM-NEXT: MEM_RAT MSKOR T8.XW, T0.X 1988; CM-NEXT: CF_END 1989; CM-NEXT: PAD 1990; CM-NEXT: Fetch clause starting at 10: 1991; CM-NEXT: VTX_READ_16 T1.X, T0.X, 58, #3 1992; CM-NEXT: VTX_READ_16 T2.X, T0.X, 56, #3 1993; CM-NEXT: 
VTX_READ_16 T3.X, T0.X, 54, #3 1994; CM-NEXT: VTX_READ_16 T4.X, T0.X, 52, #3 1995; CM-NEXT: VTX_READ_16 T0.X, T0.X, 60, #3 1996; CM-NEXT: ALU clause starting at 20: 1997; CM-NEXT: MOV * T0.X, 0.0, 1998; CM-NEXT: ALU clause starting at 21: 1999; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 2000; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 2001; CM-NEXT: AND_INT * T1.W, PV.W, literal.x, 2002; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 2003; CM-NEXT: AND_INT T0.Z, T0.X, literal.x, 2004; CM-NEXT: LSHL * T1.W, PV.W, literal.y, 2005; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 2006; CM-NEXT: LSHL T5.X, PV.Z, PV.W, 2007; CM-NEXT: LSHL * T5.W, literal.x, PV.W, 2008; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 2009; CM-NEXT: MOV T5.Y, 0.0, 2010; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 2011; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 2012; CM-NEXT: AND_INT T0.Z, T4.X, literal.x, 2013; CM-NEXT: LSHL * T1.W, PV.W, literal.y, 2014; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 2015; CM-NEXT: LSHL T4.X, PV.Z, PV.W, 2016; CM-NEXT: LSHL * T4.W, literal.x, PV.W, 2017; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 2018; CM-NEXT: MOV T4.Y, 0.0, 2019; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, 2020; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2021; CM-NEXT: AND_INT * T2.W, PV.W, literal.x, 2022; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 2023; CM-NEXT: AND_INT T0.Z, T3.X, literal.x, 2024; CM-NEXT: LSHL * T2.W, PV.W, literal.y, 2025; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 2026; CM-NEXT: LSHL T3.X, PV.Z, PV.W, 2027; CM-NEXT: LSHL * T3.W, literal.x, PV.W, 2028; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 2029; CM-NEXT: MOV T3.Y, 0.0, 2030; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 2031; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00) 2032; CM-NEXT: AND_INT * T6.W, PV.W, literal.x, 2033; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 2034; CM-NEXT: AND_INT T0.Z, T2.X, literal.x, 2035; CM-NEXT: LSHL * T6.W, PV.W, literal.y, 2036; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 2037; 
CM-NEXT: LSHL T6.X, PV.Z, PV.W, 2038; CM-NEXT: LSHL * T6.W, literal.x, PV.W, 2039; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 2040; CM-NEXT: MOV T6.Y, 0.0, 2041; CM-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.x, 2042; CM-NEXT: 6(8.407791e-45), 0(0.000000e+00) 2043; CM-NEXT: AND_INT * T8.W, PV.W, literal.x, 2044; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 2045; CM-NEXT: AND_INT T0.Z, T1.X, literal.x, 2046; CM-NEXT: LSHL * T8.W, PV.W, literal.y, 2047; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 2048; CM-NEXT: LSHL T8.X, PV.Z, PV.W, 2049; CM-NEXT: LSHL * T8.W, literal.x, PV.W, 2050; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 2051; CM-NEXT: MOV T8.Y, 0.0, 2052; CM-NEXT: MOV * T5.Z, 0.0, 2053; CM-NEXT: MOV * T4.Z, 0.0, 2054; CM-NEXT: MOV * T3.Z, 0.0, 2055; CM-NEXT: MOV * T6.Z, 0.0, 2056; CM-NEXT: MOV * T8.Z, 0.0, 2057; CM-NEXT: LSHR * T0.X, T7.W, literal.x, 2058; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2059; CM-NEXT: LSHR * T1.X, T2.W, literal.x, 2060; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2061; CM-NEXT: LSHR * T2.X, T1.W, literal.x, 2062; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2063; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 2064; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2065; CM-NEXT: LSHR * T9.X, T0.W, literal.x, 2066; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2067entry: 2068 store <5 x i16> %in, <5 x i16> addrspace(1)* %out, align 4 2069 ret void 2070} 2071 2072define amdgpu_kernel void @v5i32_arg(<5 x i32> addrspace(1)* nocapture %out, <5 x i32> %in) nounwind { 2073; SI-LABEL: v5i32_arg: 2074; SI: ; %bb.0: ; %entry 2075; SI-NEXT: s_load_dword s8, s[0:1], 0x15 2076; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2077; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 2078; SI-NEXT: s_mov_b32 s7, 0xf000 2079; SI-NEXT: s_mov_b32 s6, -1 2080; SI-NEXT: s_waitcnt lgkmcnt(0) 2081; SI-NEXT: v_mov_b32_e32 v0, s8 2082; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16 2083; SI-NEXT: s_waitcnt expcnt(0) 2084; SI-NEXT: v_mov_b32_e32 v0, s0 2085; SI-NEXT: 
v_mov_b32_e32 v1, s1 2086; SI-NEXT: v_mov_b32_e32 v2, s2 2087; SI-NEXT: v_mov_b32_e32 v3, s3 2088; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 2089; SI-NEXT: s_endpgm 2090; 2091; VI-LABEL: v5i32_arg: 2092; VI: ; %bb.0: ; %entry 2093; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 2094; VI-NEXT: s_load_dword s7, s[0:1], 0x54 2095; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 2096; VI-NEXT: s_waitcnt lgkmcnt(0) 2097; VI-NEXT: s_add_u32 s6, s4, 16 2098; VI-NEXT: v_mov_b32_e32 v2, s7 2099; VI-NEXT: s_addc_u32 s7, s5, 0 2100; VI-NEXT: v_mov_b32_e32 v0, s6 2101; VI-NEXT: v_mov_b32_e32 v1, s7 2102; VI-NEXT: v_mov_b32_e32 v4, s4 2103; VI-NEXT: flat_store_dword v[0:1], v2 2104; VI-NEXT: v_mov_b32_e32 v0, s0 2105; VI-NEXT: v_mov_b32_e32 v5, s5 2106; VI-NEXT: v_mov_b32_e32 v1, s1 2107; VI-NEXT: v_mov_b32_e32 v2, s2 2108; VI-NEXT: v_mov_b32_e32 v3, s3 2109; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2110; VI-NEXT: s_endpgm 2111; 2112; GFX9-LABEL: v5i32_arg: 2113; GFX9: ; %bb.0: ; %entry 2114; GFX9-NEXT: s_load_dword s8, s[4:5], 0x30 2115; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 2116; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 2117; GFX9-NEXT: v_mov_b32_e32 v4, 0 2118; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2119; GFX9-NEXT: v_mov_b32_e32 v5, s8 2120; GFX9-NEXT: v_mov_b32_e32 v0, s0 2121; GFX9-NEXT: v_mov_b32_e32 v1, s1 2122; GFX9-NEXT: v_mov_b32_e32 v2, s2 2123; GFX9-NEXT: v_mov_b32_e32 v3, s3 2124; GFX9-NEXT: global_store_dword v4, v5, s[6:7] offset:16 2125; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 2126; GFX9-NEXT: s_endpgm 2127; 2128; EG-LABEL: v5i32_arg: 2129; EG: ; %bb.0: ; %entry 2130; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[] 2131; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 2132; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 2133; EG-NEXT: CF_END 2134; EG-NEXT: ALU clause starting at 4: 2135; EG-NEXT: MOV * T0.W, KC0[5].X, 2136; EG-NEXT: MOV * T0.Z, KC0[4].W, 2137; EG-NEXT: MOV * T0.Y, KC0[4].Z, 2138; EG-NEXT: MOV T0.X, KC0[4].Y, 2139; 
EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 2140; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2141; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, 2142; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2143; EG-NEXT: LSHR T2.X, PV.W, literal.x, 2144; EG-NEXT: MOV * T3.X, KC0[5].Y, 2145; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2146; 2147; CM-LABEL: v5i32_arg: 2148; CM: ; %bb.0: ; %entry 2149; CM-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[] 2150; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X 2151; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X 2152; CM-NEXT: CF_END 2153; CM-NEXT: ALU clause starting at 4: 2154; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x, 2155; CM-NEXT: MOV * T0.W, KC0[5].X, 2156; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2157; CM-NEXT: LSHR T1.X, PV.Z, literal.x, 2158; CM-NEXT: MOV * T0.Z, KC0[4].W, 2159; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2160; CM-NEXT: MOV T2.X, KC0[5].Y, 2161; CM-NEXT: MOV * T0.Y, KC0[4].Z, 2162; CM-NEXT: MOV * T0.X, KC0[4].Y, 2163; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 2164; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2165entry: 2166 store <5 x i32> %in, <5 x i32> addrspace(1)* %out, align 4 2167 ret void 2168} 2169 2170define amdgpu_kernel void @v5f32_arg(<5 x float> addrspace(1)* nocapture %out, <5 x float> %in) nounwind { 2171; SI-LABEL: v5f32_arg: 2172; SI: ; %bb.0: ; %entry 2173; SI-NEXT: s_load_dword s8, s[0:1], 0x15 2174; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2175; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 2176; SI-NEXT: s_mov_b32 s7, 0xf000 2177; SI-NEXT: s_mov_b32 s6, -1 2178; SI-NEXT: s_waitcnt lgkmcnt(0) 2179; SI-NEXT: v_mov_b32_e32 v0, s8 2180; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16 2181; SI-NEXT: s_waitcnt expcnt(0) 2182; SI-NEXT: v_mov_b32_e32 v0, s0 2183; SI-NEXT: v_mov_b32_e32 v1, s1 2184; SI-NEXT: v_mov_b32_e32 v2, s2 2185; SI-NEXT: v_mov_b32_e32 v3, s3 2186; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 2187; SI-NEXT: s_endpgm 2188; 2189; VI-LABEL: v5f32_arg: 2190; VI: ; 
%bb.0: ; %entry 2191; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 2192; VI-NEXT: s_load_dword s7, s[0:1], 0x54 2193; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44 2194; VI-NEXT: s_waitcnt lgkmcnt(0) 2195; VI-NEXT: s_add_u32 s6, s4, 16 2196; VI-NEXT: v_mov_b32_e32 v3, s7 2197; VI-NEXT: s_addc_u32 s7, s5, 0 2198; VI-NEXT: v_mov_b32_e32 v1, s6 2199; VI-NEXT: v_mov_b32_e32 v2, s7 2200; VI-NEXT: v_mov_b32_e32 v4, s4 2201; VI-NEXT: v_mov_b32_e32 v0, s0 2202; VI-NEXT: flat_store_dword v[1:2], v3 2203; VI-NEXT: v_mov_b32_e32 v1, s1 2204; VI-NEXT: v_mov_b32_e32 v2, s2 2205; VI-NEXT: v_mov_b32_e32 v3, s3 2206; VI-NEXT: v_mov_b32_e32 v5, s5 2207; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2208; VI-NEXT: s_endpgm 2209; 2210; GFX9-LABEL: v5f32_arg: 2211; GFX9: ; %bb.0: ; %entry 2212; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 2213; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 2214; GFX9-NEXT: v_mov_b32_e32 v4, 0 2215; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 2216; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2217; GFX9-NEXT: v_mov_b32_e32 v0, s0 2218; GFX9-NEXT: v_mov_b32_e32 v1, s1 2219; GFX9-NEXT: v_mov_b32_e32 v2, s2 2220; GFX9-NEXT: v_mov_b32_e32 v3, s3 2221; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 2222; GFX9-NEXT: s_nop 0 2223; GFX9-NEXT: v_mov_b32_e32 v0, s4 2224; GFX9-NEXT: global_store_dword v4, v0, s[6:7] offset:16 2225; GFX9-NEXT: s_endpgm 2226; 2227; EG-LABEL: v5f32_arg: 2228; EG: ; %bb.0: ; %entry 2229; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[] 2230; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 2231; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 2232; EG-NEXT: CF_END 2233; EG-NEXT: ALU clause starting at 4: 2234; EG-NEXT: MOV * T0.W, KC0[5].X, 2235; EG-NEXT: MOV * T0.Z, KC0[4].W, 2236; EG-NEXT: MOV * T0.Y, KC0[4].Z, 2237; EG-NEXT: MOV T0.X, KC0[4].Y, 2238; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 2239; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2240; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, 2241; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 
2242; EG-NEXT: LSHR T2.X, PV.W, literal.x, 2243; EG-NEXT: MOV * T3.X, KC0[5].Y, 2244; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2245; 2246; CM-LABEL: v5f32_arg: 2247; CM: ; %bb.0: ; %entry 2248; CM-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[] 2249; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X 2250; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X 2251; CM-NEXT: CF_END 2252; CM-NEXT: ALU clause starting at 4: 2253; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x, 2254; CM-NEXT: MOV * T0.W, KC0[5].X, 2255; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2256; CM-NEXT: LSHR T1.X, PV.Z, literal.x, 2257; CM-NEXT: MOV * T0.Z, KC0[4].W, 2258; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2259; CM-NEXT: MOV T2.X, KC0[5].Y, 2260; CM-NEXT: MOV * T0.Y, KC0[4].Z, 2261; CM-NEXT: MOV * T0.X, KC0[4].Y, 2262; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 2263; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2264entry: 2265 store <5 x float> %in, <5 x float> addrspace(1)* %out, align 4 2266 ret void 2267} 2268 2269define amdgpu_kernel void @v5i64_arg(<5 x i64> addrspace(1)* nocapture %out, <5 x i64> %in) nounwind { 2270; SI-LABEL: v5i64_arg: 2271; SI: ; %bb.0: ; %entry 2272; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x19 2273; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 2274; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x21 2275; SI-NEXT: s_mov_b32 s15, 0xf000 2276; SI-NEXT: s_mov_b32 s14, -1 2277; SI-NEXT: s_waitcnt lgkmcnt(0) 2278; SI-NEXT: v_mov_b32_e32 v0, s8 2279; SI-NEXT: v_mov_b32_e32 v1, s9 2280; SI-NEXT: v_mov_b32_e32 v2, s10 2281; SI-NEXT: v_mov_b32_e32 v3, s11 2282; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 2283; SI-NEXT: s_waitcnt expcnt(0) 2284; SI-NEXT: v_mov_b32_e32 v0, s4 2285; SI-NEXT: v_mov_b32_e32 v1, s5 2286; SI-NEXT: v_mov_b32_e32 v2, s6 2287; SI-NEXT: v_mov_b32_e32 v3, s7 2288; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 2289; SI-NEXT: s_waitcnt expcnt(0) 2290; SI-NEXT: v_mov_b32_e32 v0, s0 2291; SI-NEXT: v_mov_b32_e32 v1, s1 2292; SI-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[12:15], 0 offset:32 2293; SI-NEXT: s_endpgm 2294; 2295; VI-LABEL: v5i64_arg: 2296; VI: ; %bb.0: ; %entry 2297; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64 2298; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 2299; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x84 2300; VI-NEXT: s_waitcnt lgkmcnt(0) 2301; VI-NEXT: v_mov_b32_e32 v0, s8 2302; VI-NEXT: s_add_u32 s8, s2, 16 2303; VI-NEXT: v_mov_b32_e32 v1, s9 2304; VI-NEXT: s_addc_u32 s9, s3, 0 2305; VI-NEXT: v_mov_b32_e32 v4, s8 2306; VI-NEXT: v_mov_b32_e32 v2, s10 2307; VI-NEXT: v_mov_b32_e32 v3, s11 2308; VI-NEXT: v_mov_b32_e32 v5, s9 2309; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2310; VI-NEXT: v_mov_b32_e32 v5, s3 2311; VI-NEXT: v_mov_b32_e32 v0, s4 2312; VI-NEXT: v_mov_b32_e32 v1, s5 2313; VI-NEXT: v_mov_b32_e32 v2, s6 2314; VI-NEXT: v_mov_b32_e32 v3, s7 2315; VI-NEXT: v_mov_b32_e32 v4, s2 2316; VI-NEXT: s_add_u32 s2, s2, 32 2317; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2318; VI-NEXT: s_addc_u32 s3, s3, 0 2319; VI-NEXT: v_mov_b32_e32 v2, s2 2320; VI-NEXT: v_mov_b32_e32 v0, s0 2321; VI-NEXT: v_mov_b32_e32 v1, s1 2322; VI-NEXT: v_mov_b32_e32 v3, s3 2323; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 2324; VI-NEXT: s_endpgm 2325; 2326; GFX9-LABEL: v5i64_arg: 2327; GFX9: ; %bb.0: ; %entry 2328; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 2329; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 2330; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x60 2331; GFX9-NEXT: v_mov_b32_e32 v4, 0 2332; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2333; GFX9-NEXT: v_mov_b32_e32 v0, s12 2334; GFX9-NEXT: v_mov_b32_e32 v1, s13 2335; GFX9-NEXT: v_mov_b32_e32 v2, s14 2336; GFX9-NEXT: v_mov_b32_e32 v3, s15 2337; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 2338; GFX9-NEXT: s_nop 0 2339; GFX9-NEXT: v_mov_b32_e32 v0, s8 2340; GFX9-NEXT: v_mov_b32_e32 v1, s9 2341; GFX9-NEXT: v_mov_b32_e32 v2, s10 2342; GFX9-NEXT: v_mov_b32_e32 v3, s11 2343; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 2344; GFX9-NEXT: 
s_nop 0 2345; GFX9-NEXT: v_mov_b32_e32 v0, s0 2346; GFX9-NEXT: v_mov_b32_e32 v1, s1 2347; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] offset:32 2348; GFX9-NEXT: s_endpgm 2349; 2350; EG-LABEL: v5i64_arg: 2351; EG: ; %bb.0: ; %entry 2352; EG-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[] 2353; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 0 2354; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0 2355; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 2356; EG-NEXT: CF_END 2357; EG-NEXT: PAD 2358; EG-NEXT: ALU clause starting at 6: 2359; EG-NEXT: MOV * T0.W, KC0[7].X, 2360; EG-NEXT: MOV * T0.Z, KC0[6].W, 2361; EG-NEXT: MOV T0.Y, KC0[6].Z, 2362; EG-NEXT: MOV * T1.W, KC0[8].X, 2363; EG-NEXT: MOV T0.X, KC0[6].Y, 2364; EG-NEXT: MOV * T1.Z, KC0[7].W, 2365; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 2366; EG-NEXT: MOV * T1.Y, KC0[7].Z, 2367; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2368; EG-NEXT: MOV T1.X, KC0[7].Y, 2369; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 2370; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2371; EG-NEXT: LSHR T3.X, PV.W, literal.x, 2372; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, 2373; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44) 2374; EG-NEXT: LSHR T4.X, PV.W, literal.x, 2375; EG-NEXT: MOV T5.Y, KC0[8].Z, 2376; EG-NEXT: MOV * T5.X, KC0[8].Y, 2377; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2378; 2379; CM-LABEL: v5i64_arg: 2380; CM: ; %bb.0: ; %entry 2381; CM-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[] 2382; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X 2383; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T4.X 2384; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X 2385; CM-NEXT: CF_END 2386; CM-NEXT: PAD 2387; CM-NEXT: ALU clause starting at 6: 2388; CM-NEXT: MOV * T0.W, KC0[8].X, 2389; CM-NEXT: MOV T1.Y, KC0[8].Z, 2390; CM-NEXT: MOV * T0.Z, KC0[7].W, 2391; CM-NEXT: MOV T1.X, KC0[8].Y, 2392; CM-NEXT: MOV * T0.Y, KC0[7].Z, 2393; CM-NEXT: MOV T0.X, KC0[7].Y, 2394; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x, 2395; CM-NEXT: MOV * T2.W, KC0[7].X, 
2396; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00) 2397; CM-NEXT: LSHR T3.X, PV.Z, literal.x, 2398; CM-NEXT: MOV T2.Z, KC0[6].W, 2399; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, 2400; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) 2401; CM-NEXT: LSHR T4.X, PV.W, literal.x, 2402; CM-NEXT: MOV * T2.Y, KC0[6].Z, 2403; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2404; CM-NEXT: MOV * T2.X, KC0[6].Y, 2405; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 2406; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2407entry: 2408 store <5 x i64> %in, <5 x i64> addrspace(1)* %out, align 8 2409 ret void 2410} 2411 2412define amdgpu_kernel void @v5f64_arg(<5 x double> addrspace(1)* nocapture %out, <5 x double> %in) nounwind { 2413; SI-LABEL: v5f64_arg: 2414; SI: ; %bb.0: ; %entry 2415; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x19 2416; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 2417; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x21 2418; SI-NEXT: s_mov_b32 s15, 0xf000 2419; SI-NEXT: s_mov_b32 s14, -1 2420; SI-NEXT: s_waitcnt lgkmcnt(0) 2421; SI-NEXT: v_mov_b32_e32 v0, s8 2422; SI-NEXT: v_mov_b32_e32 v1, s9 2423; SI-NEXT: v_mov_b32_e32 v2, s10 2424; SI-NEXT: v_mov_b32_e32 v3, s11 2425; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 2426; SI-NEXT: s_waitcnt expcnt(0) 2427; SI-NEXT: v_mov_b32_e32 v0, s4 2428; SI-NEXT: v_mov_b32_e32 v1, s5 2429; SI-NEXT: v_mov_b32_e32 v2, s6 2430; SI-NEXT: v_mov_b32_e32 v3, s7 2431; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 2432; SI-NEXT: s_waitcnt expcnt(0) 2433; SI-NEXT: v_mov_b32_e32 v0, s0 2434; SI-NEXT: v_mov_b32_e32 v1, s1 2435; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 offset:32 2436; SI-NEXT: s_endpgm 2437; 2438; VI-LABEL: v5f64_arg: 2439; VI: ; %bb.0: ; %entry 2440; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64 2441; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 2442; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x84 2443; VI-NEXT: s_waitcnt lgkmcnt(0) 2444; VI-NEXT: v_mov_b32_e32 v0, s8 2445; VI-NEXT: s_add_u32 s8, s2, 
16 2446; VI-NEXT: v_mov_b32_e32 v1, s9 2447; VI-NEXT: s_addc_u32 s9, s3, 0 2448; VI-NEXT: v_mov_b32_e32 v4, s8 2449; VI-NEXT: v_mov_b32_e32 v2, s10 2450; VI-NEXT: v_mov_b32_e32 v3, s11 2451; VI-NEXT: v_mov_b32_e32 v5, s9 2452; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2453; VI-NEXT: v_mov_b32_e32 v5, s3 2454; VI-NEXT: v_mov_b32_e32 v0, s4 2455; VI-NEXT: v_mov_b32_e32 v1, s5 2456; VI-NEXT: v_mov_b32_e32 v2, s6 2457; VI-NEXT: v_mov_b32_e32 v3, s7 2458; VI-NEXT: v_mov_b32_e32 v4, s2 2459; VI-NEXT: s_add_u32 s2, s2, 32 2460; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2461; VI-NEXT: s_addc_u32 s3, s3, 0 2462; VI-NEXT: v_mov_b32_e32 v2, s2 2463; VI-NEXT: v_mov_b32_e32 v0, s0 2464; VI-NEXT: v_mov_b32_e32 v1, s1 2465; VI-NEXT: v_mov_b32_e32 v3, s3 2466; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 2467; VI-NEXT: s_endpgm 2468; 2469; GFX9-LABEL: v5f64_arg: 2470; GFX9: ; %bb.0: ; %entry 2471; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 2472; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 2473; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x60 2474; GFX9-NEXT: v_mov_b32_e32 v4, 0 2475; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2476; GFX9-NEXT: v_mov_b32_e32 v0, s12 2477; GFX9-NEXT: v_mov_b32_e32 v1, s13 2478; GFX9-NEXT: v_mov_b32_e32 v2, s14 2479; GFX9-NEXT: v_mov_b32_e32 v3, s15 2480; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 2481; GFX9-NEXT: s_nop 0 2482; GFX9-NEXT: v_mov_b32_e32 v0, s8 2483; GFX9-NEXT: v_mov_b32_e32 v1, s9 2484; GFX9-NEXT: v_mov_b32_e32 v2, s10 2485; GFX9-NEXT: v_mov_b32_e32 v3, s11 2486; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 2487; GFX9-NEXT: s_nop 0 2488; GFX9-NEXT: v_mov_b32_e32 v0, s0 2489; GFX9-NEXT: v_mov_b32_e32 v1, s1 2490; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7] offset:32 2491; GFX9-NEXT: s_endpgm 2492; 2493; EG-LABEL: v5f64_arg: 2494; EG: ; %bb.0: ; %entry 2495; EG-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[] 2496; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 0 2497; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, 
T3.X, 0 2498; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 2499; EG-NEXT: CF_END 2500; EG-NEXT: PAD 2501; EG-NEXT: ALU clause starting at 6: 2502; EG-NEXT: MOV * T0.W, KC0[7].X, 2503; EG-NEXT: MOV * T0.Z, KC0[6].W, 2504; EG-NEXT: MOV T0.Y, KC0[6].Z, 2505; EG-NEXT: MOV * T1.W, KC0[8].X, 2506; EG-NEXT: MOV T0.X, KC0[6].Y, 2507; EG-NEXT: MOV * T1.Z, KC0[7].W, 2508; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 2509; EG-NEXT: MOV * T1.Y, KC0[7].Z, 2510; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2511; EG-NEXT: MOV T1.X, KC0[7].Y, 2512; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 2513; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2514; EG-NEXT: LSHR T3.X, PV.W, literal.x, 2515; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, 2516; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44) 2517; EG-NEXT: LSHR T4.X, PV.W, literal.x, 2518; EG-NEXT: MOV T5.Y, KC0[8].Z, 2519; EG-NEXT: MOV * T5.X, KC0[8].Y, 2520; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2521; 2522; CM-LABEL: v5f64_arg: 2523; CM: ; %bb.0: ; %entry 2524; CM-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[] 2525; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X 2526; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T4.X 2527; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X 2528; CM-NEXT: CF_END 2529; CM-NEXT: PAD 2530; CM-NEXT: ALU clause starting at 6: 2531; CM-NEXT: MOV * T0.W, KC0[8].X, 2532; CM-NEXT: MOV T1.Y, KC0[8].Z, 2533; CM-NEXT: MOV * T0.Z, KC0[7].W, 2534; CM-NEXT: MOV T1.X, KC0[8].Y, 2535; CM-NEXT: MOV * T0.Y, KC0[7].Z, 2536; CM-NEXT: MOV T0.X, KC0[7].Y, 2537; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x, 2538; CM-NEXT: MOV * T2.W, KC0[7].X, 2539; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00) 2540; CM-NEXT: LSHR T3.X, PV.Z, literal.x, 2541; CM-NEXT: MOV T2.Z, KC0[6].W, 2542; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, 2543; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) 2544; CM-NEXT: LSHR T4.X, PV.W, literal.x, 2545; CM-NEXT: MOV * T2.Y, KC0[6].Z, 2546; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2547; CM-NEXT: MOV * T2.X, KC0[6].Y, 2548; 
CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 2549; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2550entry: 2551 store <5 x double> %in, <5 x double> addrspace(1)* %out, align 8 2552 ret void 2553} 2554 2555; FIXME: Lots of unpack and re-pack junk on EG/CM (originally noted for VI, whose checks below no longer show it) 2556define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) { 2557; SI-LABEL: v8i8_arg: 2558; SI: ; %bb.0: ; %entry 2559; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 2560; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2561; SI-NEXT: s_mov_b32 s3, 0xf000 2562; SI-NEXT: s_mov_b32 s2, -1 2563; SI-NEXT: s_waitcnt lgkmcnt(0) 2564; SI-NEXT: v_mov_b32_e32 v0, s4 2565; SI-NEXT: v_mov_b32_e32 v1, s5 2566; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2567; SI-NEXT: s_endpgm 2568; 2569; VI-LABEL: v8i8_arg: 2570; VI: ; %bb.0: ; %entry 2571; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 2572; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 2573; VI-NEXT: s_waitcnt lgkmcnt(0) 2574; VI-NEXT: v_mov_b32_e32 v0, s2 2575; VI-NEXT: v_mov_b32_e32 v3, s1 2576; VI-NEXT: v_mov_b32_e32 v1, s3 2577; VI-NEXT: v_mov_b32_e32 v2, s0 2578; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 2579; VI-NEXT: s_endpgm 2580; 2581; GFX9-LABEL: v8i8_arg: 2582; GFX9: ; %bb.0: ; %entry 2583; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 2584; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 2585; GFX9-NEXT: v_mov_b32_e32 v2, 0 2586; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2587; GFX9-NEXT: v_mov_b32_e32 v0, s0 2588; GFX9-NEXT: v_mov_b32_e32 v1, s1 2589; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 2590; GFX9-NEXT: s_endpgm 2591; 2592; EG-LABEL: v8i8_arg: 2593; EG: ; %bb.0: ; %entry 2594; EG-NEXT: ALU 1, @36, KC0[], KC1[] 2595; EG-NEXT: TEX 0 @20 2596; EG-NEXT: ALU 5, @38, KC0[], KC1[] 2597; EG-NEXT: TEX 0 @22 2598; EG-NEXT: ALU 5, @44, KC0[], KC1[] 2599; EG-NEXT: TEX 0 @24 2600; EG-NEXT: ALU 7, @50, KC0[], KC1[] 2601; EG-NEXT: TEX 0 @26 2602; EG-NEXT: ALU 7, @58, KC0[], KC1[] 2603; EG-NEXT: TEX 0 @28 2604; EG-NEXT: ALU 7, @66, KC0[], KC1[] 2605;
EG-NEXT: TEX 0 @30 2606; EG-NEXT: ALU 7, @74, KC0[], KC1[] 2607; EG-NEXT: TEX 0 @32 2608; EG-NEXT: ALU 5, @82, KC0[], KC1[] 2609; EG-NEXT: TEX 0 @34 2610; EG-NEXT: ALU 5, @88, KC0[CB0:0-32], KC1[] 2611; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1 2612; EG-NEXT: CF_END 2613; EG-NEXT: PAD 2614; EG-NEXT: Fetch clause starting at 20: 2615; EG-NEXT: VTX_READ_8 T6.X, T5.X, 51, #3 2616; EG-NEXT: Fetch clause starting at 22: 2617; EG-NEXT: VTX_READ_8 T6.X, T5.X, 47, #3 2618; EG-NEXT: Fetch clause starting at 24: 2619; EG-NEXT: VTX_READ_8 T6.X, T5.X, 50, #3 2620; EG-NEXT: Fetch clause starting at 26: 2621; EG-NEXT: VTX_READ_8 T6.X, T5.X, 46, #3 2622; EG-NEXT: Fetch clause starting at 28: 2623; EG-NEXT: VTX_READ_8 T6.X, T5.X, 49, #3 2624; EG-NEXT: Fetch clause starting at 30: 2625; EG-NEXT: VTX_READ_8 T6.X, T5.X, 45, #3 2626; EG-NEXT: Fetch clause starting at 32: 2627; EG-NEXT: VTX_READ_8 T6.X, T5.X, 48, #3 2628; EG-NEXT: Fetch clause starting at 34: 2629; EG-NEXT: VTX_READ_8 T5.X, T5.X, 44, #3 2630; EG-NEXT: ALU clause starting at 36: 2631; EG-NEXT: MOV * T0.Y, T2.X, 2632; EG-NEXT: MOV * T5.X, 0.0, 2633; EG-NEXT: ALU clause starting at 38: 2634; EG-NEXT: LSHL T0.W, T6.X, literal.x, 2635; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 2636; EG-NEXT: 24(3.363116e-44), 16777215(2.350989e-38) 2637; EG-NEXT: OR_INT * T0.W, PS, PV.W, 2638; EG-NEXT: MOV T2.X, PV.W, 2639; EG-NEXT: MOV * T0.Y, T3.X, 2640; EG-NEXT: ALU clause starting at 44: 2641; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2642; EG-NEXT: LSHL * T1.W, T6.X, literal.y, 2643; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 2644; EG-NEXT: OR_INT * T0.W, PV.W, PS, 2645; EG-NEXT: MOV T3.X, PV.W, 2646; EG-NEXT: MOV * T0.Y, T2.X, 2647; EG-NEXT: ALU clause starting at 50: 2648; EG-NEXT: AND_INT T0.W, T6.X, literal.x, 2649; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 2650; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) 2651; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 2652; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2653; 
EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 2654; EG-NEXT: MOV T2.X, PV.W, 2655; EG-NEXT: MOV * T0.Y, T3.X, 2656; EG-NEXT: ALU clause starting at 58: 2657; EG-NEXT: AND_INT T0.W, T6.X, literal.x, 2658; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 2659; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) 2660; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 2661; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2662; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 2663; EG-NEXT: MOV T3.X, PV.W, 2664; EG-NEXT: MOV * T0.Y, T2.X, 2665; EG-NEXT: ALU clause starting at 66: 2666; EG-NEXT: AND_INT T0.W, T6.X, literal.x, 2667; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 2668; EG-NEXT: 255(3.573311e-43), -65281(nan) 2669; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 2670; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 2671; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 2672; EG-NEXT: MOV T2.X, PV.W, 2673; EG-NEXT: MOV * T0.Y, T3.X, 2674; EG-NEXT: ALU clause starting at 74: 2675; EG-NEXT: AND_INT T0.W, T6.X, literal.x, 2676; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 2677; EG-NEXT: 255(3.573311e-43), -65281(nan) 2678; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 2679; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 2680; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 2681; EG-NEXT: MOV T3.X, PV.W, 2682; EG-NEXT: MOV * T0.Y, T2.X, 2683; EG-NEXT: ALU clause starting at 82: 2684; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2685; EG-NEXT: AND_INT * T1.W, T6.X, literal.y, 2686; EG-NEXT: -256(nan), 255(3.573311e-43) 2687; EG-NEXT: OR_INT * T5.Y, PV.W, PS, 2688; EG-NEXT: MOV T2.X, PV.Y, 2689; EG-NEXT: MOV * T0.Y, T3.X, 2690; EG-NEXT: ALU clause starting at 88: 2691; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2692; EG-NEXT: AND_INT * T1.W, T5.X, literal.y, 2693; EG-NEXT: -256(nan), 255(3.573311e-43) 2694; EG-NEXT: OR_INT T5.X, PV.W, PS, 2695; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x, 2696; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2697; 2698; CM-LABEL: v8i8_arg: 2699; CM: ; %bb.0: ; %entry 2700; CM-NEXT: ALU 1, @36, KC0[], KC1[] 2701; CM-NEXT: TEX 0 @20 2702; CM-NEXT: ALU 
5, @38, KC0[], KC1[] 2703; CM-NEXT: TEX 0 @22 2704; CM-NEXT: ALU 5, @44, KC0[], KC1[] 2705; CM-NEXT: TEX 0 @24 2706; CM-NEXT: ALU 7, @50, KC0[], KC1[] 2707; CM-NEXT: TEX 0 @26 2708; CM-NEXT: ALU 7, @58, KC0[], KC1[] 2709; CM-NEXT: TEX 0 @28 2710; CM-NEXT: ALU 7, @66, KC0[], KC1[] 2711; CM-NEXT: TEX 0 @30 2712; CM-NEXT: ALU 7, @74, KC0[], KC1[] 2713; CM-NEXT: TEX 0 @32 2714; CM-NEXT: ALU 5, @82, KC0[], KC1[] 2715; CM-NEXT: TEX 0 @34 2716; CM-NEXT: ALU 5, @88, KC0[CB0:0-32], KC1[] 2717; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X 2718; CM-NEXT: CF_END 2719; CM-NEXT: PAD 2720; CM-NEXT: Fetch clause starting at 20: 2721; CM-NEXT: VTX_READ_8 T6.X, T5.X, 51, #3 2722; CM-NEXT: Fetch clause starting at 22: 2723; CM-NEXT: VTX_READ_8 T6.X, T5.X, 47, #3 2724; CM-NEXT: Fetch clause starting at 24: 2725; CM-NEXT: VTX_READ_8 T6.X, T5.X, 50, #3 2726; CM-NEXT: Fetch clause starting at 26: 2727; CM-NEXT: VTX_READ_8 T6.X, T5.X, 46, #3 2728; CM-NEXT: Fetch clause starting at 28: 2729; CM-NEXT: VTX_READ_8 T6.X, T5.X, 49, #3 2730; CM-NEXT: Fetch clause starting at 30: 2731; CM-NEXT: VTX_READ_8 T6.X, T5.X, 45, #3 2732; CM-NEXT: Fetch clause starting at 32: 2733; CM-NEXT: VTX_READ_8 T6.X, T5.X, 48, #3 2734; CM-NEXT: Fetch clause starting at 34: 2735; CM-NEXT: VTX_READ_8 T5.X, T5.X, 44, #3 2736; CM-NEXT: ALU clause starting at 36: 2737; CM-NEXT: MOV * T0.Y, T2.X, 2738; CM-NEXT: MOV * T5.X, 0.0, 2739; CM-NEXT: ALU clause starting at 38: 2740; CM-NEXT: LSHL T0.Z, T6.X, literal.x, 2741; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, 2742; CM-NEXT: 24(3.363116e-44), 16777215(2.350989e-38) 2743; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, 2744; CM-NEXT: MOV T2.X, PV.W, 2745; CM-NEXT: MOV * T0.Y, T3.X, 2746; CM-NEXT: ALU clause starting at 44: 2747; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2748; CM-NEXT: LSHL * T0.W, T6.X, literal.y, 2749; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 2750; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 2751; CM-NEXT: MOV T3.X, PV.W, 2752; CM-NEXT: MOV * T0.Y, T2.X, 2753; 
CM-NEXT: ALU clause starting at 50: 2754; CM-NEXT: AND_INT * T0.W, T6.X, literal.x, 2755; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 2756; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2757; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 2758; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) 2759; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 2760; CM-NEXT: MOV T2.X, PV.W, 2761; CM-NEXT: MOV * T0.Y, T3.X, 2762; CM-NEXT: ALU clause starting at 58: 2763; CM-NEXT: AND_INT * T0.W, T6.X, literal.x, 2764; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 2765; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2766; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 2767; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) 2768; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 2769; CM-NEXT: MOV T3.X, PV.W, 2770; CM-NEXT: MOV * T0.Y, T2.X, 2771; CM-NEXT: ALU clause starting at 66: 2772; CM-NEXT: AND_INT * T0.W, T6.X, literal.x, 2773; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 2774; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2775; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 2776; CM-NEXT: -65281(nan), 8(1.121039e-44) 2777; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 2778; CM-NEXT: MOV T2.X, PV.W, 2779; CM-NEXT: MOV * T0.Y, T3.X, 2780; CM-NEXT: ALU clause starting at 74: 2781; CM-NEXT: AND_INT * T0.W, T6.X, literal.x, 2782; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 2783; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2784; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 2785; CM-NEXT: -65281(nan), 8(1.121039e-44) 2786; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 2787; CM-NEXT: MOV T3.X, PV.W, 2788; CM-NEXT: MOV * T0.Y, T2.X, 2789; CM-NEXT: ALU clause starting at 82: 2790; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2791; CM-NEXT: AND_INT * T0.W, T6.X, literal.y, 2792; CM-NEXT: -256(nan), 255(3.573311e-43) 2793; CM-NEXT: OR_INT * T5.Y, PV.Z, PV.W, 2794; CM-NEXT: MOV T2.X, PV.Y, 2795; CM-NEXT: MOV * T0.Y, T3.X, 2796; CM-NEXT: ALU clause starting at 88: 2797; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2798; CM-NEXT: AND_INT * T0.W, T5.X, literal.y, 2799; CM-NEXT: -256(nan), 
255(3.573311e-43) 2800; CM-NEXT: OR_INT * T5.X, PV.Z, PV.W, 2801; CM-NEXT: LSHR * T6.X, KC0[2].Y, literal.x, 2802; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2803entry: 2804 store <8 x i8> %in, <8 x i8> addrspace(1)* %out 2805 ret void 2806} 2807 2808define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) { 2809; SI-LABEL: v8i16_arg: 2810; SI: ; %bb.0: ; %entry 2811; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 2812; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2813; SI-NEXT: s_mov_b32 s3, 0xf000 2814; SI-NEXT: s_mov_b32 s2, -1 2815; SI-NEXT: s_waitcnt lgkmcnt(0) 2816; SI-NEXT: v_mov_b32_e32 v0, s4 2817; SI-NEXT: v_mov_b32_e32 v1, s5 2818; SI-NEXT: v_mov_b32_e32 v2, s6 2819; SI-NEXT: v_mov_b32_e32 v3, s7 2820; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2821; SI-NEXT: s_endpgm 2822; 2823; VI-LABEL: v8i16_arg: 2824; VI: ; %bb.0: ; %entry 2825; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 2826; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 2827; VI-NEXT: s_waitcnt lgkmcnt(0) 2828; VI-NEXT: v_mov_b32_e32 v4, s4 2829; VI-NEXT: v_mov_b32_e32 v0, s0 2830; VI-NEXT: v_mov_b32_e32 v5, s5 2831; VI-NEXT: v_mov_b32_e32 v1, s1 2832; VI-NEXT: v_mov_b32_e32 v2, s2 2833; VI-NEXT: v_mov_b32_e32 v3, s3 2834; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2835; VI-NEXT: s_endpgm 2836; 2837; GFX9-LABEL: v8i16_arg: 2838; GFX9: ; %bb.0: ; %entry 2839; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 2840; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 2841; GFX9-NEXT: v_mov_b32_e32 v4, 0 2842; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2843; GFX9-NEXT: v_mov_b32_e32 v0, s0 2844; GFX9-NEXT: v_mov_b32_e32 v1, s1 2845; GFX9-NEXT: v_mov_b32_e32 v2, s2 2846; GFX9-NEXT: v_mov_b32_e32 v3, s3 2847; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 2848; GFX9-NEXT: s_endpgm 2849; 2850; EG-LABEL: v8i16_arg: 2851; EG: ; %bb.0: ; %entry 2852; EG-NEXT: ALU 1, @36, KC0[], KC1[] 2853; EG-NEXT: TEX 0 @20 2854; EG-NEXT: ALU 5, @38, KC0[], KC1[] 2855; EG-NEXT: TEX 0 @22 2856; EG-NEXT: ALU 
5, @44, KC0[], KC1[] 2857; EG-NEXT: TEX 0 @24 2858; EG-NEXT: ALU 5, @50, KC0[], KC1[] 2859; EG-NEXT: TEX 0 @26 2860; EG-NEXT: ALU 5, @56, KC0[], KC1[] 2861; EG-NEXT: TEX 0 @28 2862; EG-NEXT: ALU 5, @62, KC0[], KC1[] 2863; EG-NEXT: TEX 0 @30 2864; EG-NEXT: ALU 5, @68, KC0[], KC1[] 2865; EG-NEXT: TEX 0 @32 2866; EG-NEXT: ALU 5, @74, KC0[], KC1[] 2867; EG-NEXT: TEX 0 @34 2868; EG-NEXT: ALU 8, @80, KC0[CB0:0-32], KC1[] 2869; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1 2870; EG-NEXT: CF_END 2871; EG-NEXT: PAD 2872; EG-NEXT: Fetch clause starting at 20: 2873; EG-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3 2874; EG-NEXT: Fetch clause starting at 22: 2875; EG-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3 2876; EG-NEXT: Fetch clause starting at 24: 2877; EG-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3 2878; EG-NEXT: Fetch clause starting at 26: 2879; EG-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3 2880; EG-NEXT: Fetch clause starting at 28: 2881; EG-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3 2882; EG-NEXT: Fetch clause starting at 30: 2883; EG-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3 2884; EG-NEXT: Fetch clause starting at 32: 2885; EG-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3 2886; EG-NEXT: Fetch clause starting at 34: 2887; EG-NEXT: VTX_READ_16 T7.X, T7.X, 52, #3 2888; EG-NEXT: ALU clause starting at 36: 2889; EG-NEXT: MOV * T0.Y, T3.X, 2890; EG-NEXT: MOV * T7.X, 0.0, 2891; EG-NEXT: ALU clause starting at 38: 2892; EG-NEXT: LSHL T0.W, T8.X, literal.x, 2893; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 2894; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 2895; EG-NEXT: OR_INT * T0.W, PS, PV.W, 2896; EG-NEXT: MOV T3.X, PV.W, 2897; EG-NEXT: MOV * T0.Y, T5.X, 2898; EG-NEXT: ALU clause starting at 44: 2899; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2900; EG-NEXT: LSHL * T1.W, T8.X, literal.y, 2901; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 2902; EG-NEXT: OR_INT * T0.W, PV.W, PS, 2903; EG-NEXT: MOV T5.X, PV.W, 2904; EG-NEXT: MOV * T0.Y, T3.X, 2905; EG-NEXT: ALU clause starting at 50: 2906; EG-NEXT: AND_INT T0.W, 
T0.Y, literal.x, 2907; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, 2908; EG-NEXT: -65536(nan), 65535(9.183409e-41) 2909; EG-NEXT: OR_INT * T0.W, PV.W, PS, 2910; EG-NEXT: MOV T3.X, PV.W, 2911; EG-NEXT: MOV * T0.Y, T5.X, 2912; EG-NEXT: ALU clause starting at 56: 2913; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2914; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, 2915; EG-NEXT: -65536(nan), 65535(9.183409e-41) 2916; EG-NEXT: OR_INT * T0.W, PV.W, PS, 2917; EG-NEXT: MOV T5.X, PV.W, 2918; EG-NEXT: MOV * T0.Y, T2.X, 2919; EG-NEXT: ALU clause starting at 62: 2920; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2921; EG-NEXT: LSHL * T1.W, T8.X, literal.y, 2922; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 2923; EG-NEXT: OR_INT * T0.W, PV.W, PS, 2924; EG-NEXT: MOV T2.X, PV.W, 2925; EG-NEXT: MOV * T0.Y, T4.X, 2926; EG-NEXT: ALU clause starting at 68: 2927; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2928; EG-NEXT: LSHL * T1.W, T8.X, literal.y, 2929; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 2930; EG-NEXT: OR_INT * T0.W, PV.W, PS, 2931; EG-NEXT: MOV T4.X, PV.W, 2932; EG-NEXT: MOV * T0.Y, T2.X, 2933; EG-NEXT: ALU clause starting at 74: 2934; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2935; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, 2936; EG-NEXT: -65536(nan), 65535(9.183409e-41) 2937; EG-NEXT: OR_INT * T7.Z, PV.W, PS, 2938; EG-NEXT: MOV T2.X, PV.Z, 2939; EG-NEXT: MOV * T0.Y, T4.X, 2940; EG-NEXT: ALU clause starting at 80: 2941; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x, 2942; EG-NEXT: AND_INT T0.W, T0.Y, literal.y, 2943; EG-NEXT: AND_INT * T1.W, T7.X, literal.z, 2944; EG-NEXT: 2(2.802597e-45), -65536(nan) 2945; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 2946; EG-NEXT: OR_INT * T7.X, PV.W, PS, 2947; EG-NEXT: MOV T4.X, PV.X, 2948; EG-NEXT: MOV * T7.W, T3.X, 2949; EG-NEXT: MOV * T7.Y, T5.X, 2950; 2951; CM-LABEL: v8i16_arg: 2952; CM: ; %bb.0: ; %entry 2953; CM-NEXT: ALU 1, @36, KC0[], KC1[] 2954; CM-NEXT: TEX 0 @20 2955; CM-NEXT: ALU 5, @38, KC0[], KC1[] 2956; CM-NEXT: TEX 0 @22 2957; CM-NEXT: ALU 
5, @44, KC0[], KC1[] 2958; CM-NEXT: TEX 0 @24 2959; CM-NEXT: ALU 5, @50, KC0[], KC1[] 2960; CM-NEXT: TEX 0 @26 2961; CM-NEXT: ALU 5, @56, KC0[], KC1[] 2962; CM-NEXT: TEX 0 @28 2963; CM-NEXT: ALU 5, @62, KC0[], KC1[] 2964; CM-NEXT: TEX 0 @30 2965; CM-NEXT: ALU 5, @68, KC0[], KC1[] 2966; CM-NEXT: TEX 0 @32 2967; CM-NEXT: ALU 5, @74, KC0[], KC1[] 2968; CM-NEXT: TEX 0 @34 2969; CM-NEXT: ALU 8, @80, KC0[CB0:0-32], KC1[] 2970; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X 2971; CM-NEXT: CF_END 2972; CM-NEXT: PAD 2973; CM-NEXT: Fetch clause starting at 20: 2974; CM-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3 2975; CM-NEXT: Fetch clause starting at 22: 2976; CM-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3 2977; CM-NEXT: Fetch clause starting at 24: 2978; CM-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3 2979; CM-NEXT: Fetch clause starting at 26: 2980; CM-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3 2981; CM-NEXT: Fetch clause starting at 28: 2982; CM-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3 2983; CM-NEXT: Fetch clause starting at 30: 2984; CM-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3 2985; CM-NEXT: Fetch clause starting at 32: 2986; CM-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3 2987; CM-NEXT: Fetch clause starting at 34: 2988; CM-NEXT: VTX_READ_16 T7.X, T7.X, 52, #3 2989; CM-NEXT: ALU clause starting at 36: 2990; CM-NEXT: MOV * T0.Y, T3.X, 2991; CM-NEXT: MOV * T7.X, 0.0, 2992; CM-NEXT: ALU clause starting at 38: 2993; CM-NEXT: LSHL T0.Z, T8.X, literal.x, 2994; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, 2995; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 2996; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, 2997; CM-NEXT: MOV T3.X, PV.W, 2998; CM-NEXT: MOV * T0.Y, T5.X, 2999; CM-NEXT: ALU clause starting at 44: 3000; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3001; CM-NEXT: LSHL * T0.W, T8.X, literal.y, 3002; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3003; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3004; CM-NEXT: MOV T5.X, PV.W, 3005; CM-NEXT: MOV * T0.Y, T3.X, 3006; CM-NEXT: ALU clause starting at 50: 3007; CM-NEXT: AND_INT T0.Z, 
T0.Y, literal.x, 3008; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, 3009; CM-NEXT: -65536(nan), 65535(9.183409e-41) 3010; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3011; CM-NEXT: MOV T3.X, PV.W, 3012; CM-NEXT: MOV * T0.Y, T5.X, 3013; CM-NEXT: ALU clause starting at 56: 3014; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3015; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, 3016; CM-NEXT: -65536(nan), 65535(9.183409e-41) 3017; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3018; CM-NEXT: MOV T5.X, PV.W, 3019; CM-NEXT: MOV * T0.Y, T2.X, 3020; CM-NEXT: ALU clause starting at 62: 3021; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3022; CM-NEXT: LSHL * T0.W, T8.X, literal.y, 3023; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3024; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3025; CM-NEXT: MOV T2.X, PV.W, 3026; CM-NEXT: MOV * T0.Y, T4.X, 3027; CM-NEXT: ALU clause starting at 68: 3028; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3029; CM-NEXT: LSHL * T0.W, T8.X, literal.y, 3030; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3031; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3032; CM-NEXT: MOV T4.X, PV.W, 3033; CM-NEXT: MOV * T0.Y, T2.X, 3034; CM-NEXT: ALU clause starting at 74: 3035; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3036; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, 3037; CM-NEXT: -65536(nan), 65535(9.183409e-41) 3038; CM-NEXT: OR_INT * T7.Z, PV.Z, PV.W, 3039; CM-NEXT: MOV T2.X, PV.Z, 3040; CM-NEXT: MOV * T0.Y, T4.X, 3041; CM-NEXT: ALU clause starting at 80: 3042; CM-NEXT: LSHR T8.X, KC0[2].Y, literal.x, 3043; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y, 3044; CM-NEXT: AND_INT * T0.W, T7.X, literal.z, 3045; CM-NEXT: 2(2.802597e-45), -65536(nan) 3046; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 3047; CM-NEXT: OR_INT * T7.X, PV.Z, PV.W, 3048; CM-NEXT: MOV T4.X, PV.X, 3049; CM-NEXT: MOV * T7.W, T3.X, 3050; CM-NEXT: MOV * T7.Y, T5.X, 3051entry: 3052 store <8 x i16> %in, <8 x i16> addrspace(1)* %out 3053 ret void 3054} 3055 3056define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind { 
3057; SI-LABEL: v8i32_arg: 3058; SI: ; %bb.0: ; %entry 3059; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 3060; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3061; SI-NEXT: s_mov_b32 s3, 0xf000 3062; SI-NEXT: s_mov_b32 s2, -1 3063; SI-NEXT: s_waitcnt lgkmcnt(0) 3064; SI-NEXT: v_mov_b32_e32 v0, s8 3065; SI-NEXT: v_mov_b32_e32 v1, s9 3066; SI-NEXT: v_mov_b32_e32 v2, s10 3067; SI-NEXT: v_mov_b32_e32 v3, s11 3068; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 3069; SI-NEXT: s_waitcnt expcnt(0) 3070; SI-NEXT: v_mov_b32_e32 v0, s4 3071; SI-NEXT: v_mov_b32_e32 v1, s5 3072; SI-NEXT: v_mov_b32_e32 v2, s6 3073; SI-NEXT: v_mov_b32_e32 v3, s7 3074; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 3075; SI-NEXT: s_endpgm 3076; 3077; VI-LABEL: v8i32_arg: 3078; VI: ; %bb.0: ; %entry 3079; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 3080; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3081; VI-NEXT: s_waitcnt lgkmcnt(0) 3082; VI-NEXT: v_mov_b32_e32 v0, s8 3083; VI-NEXT: s_add_u32 s2, s0, 16 3084; VI-NEXT: s_addc_u32 s3, s1, 0 3085; VI-NEXT: v_mov_b32_e32 v5, s3 3086; VI-NEXT: v_mov_b32_e32 v1, s9 3087; VI-NEXT: v_mov_b32_e32 v2, s10 3088; VI-NEXT: v_mov_b32_e32 v3, s11 3089; VI-NEXT: v_mov_b32_e32 v4, s2 3090; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3091; VI-NEXT: v_mov_b32_e32 v5, s1 3092; VI-NEXT: v_mov_b32_e32 v0, s4 3093; VI-NEXT: v_mov_b32_e32 v1, s5 3094; VI-NEXT: v_mov_b32_e32 v2, s6 3095; VI-NEXT: v_mov_b32_e32 v3, s7 3096; VI-NEXT: v_mov_b32_e32 v4, s0 3097; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3098; VI-NEXT: s_endpgm 3099; 3100; GFX9-LABEL: v8i32_arg: 3101; GFX9: ; %bb.0: ; %entry 3102; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 3103; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3104; GFX9-NEXT: v_mov_b32_e32 v4, 0 3105; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3106; GFX9-NEXT: v_mov_b32_e32 v0, s12 3107; GFX9-NEXT: v_mov_b32_e32 v1, s13 3108; GFX9-NEXT: v_mov_b32_e32 v2, s14 3109; GFX9-NEXT: v_mov_b32_e32 v3, s15 3110; GFX9-NEXT: 
global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 3111; GFX9-NEXT: s_nop 0 3112; GFX9-NEXT: v_mov_b32_e32 v0, s8 3113; GFX9-NEXT: v_mov_b32_e32 v1, s9 3114; GFX9-NEXT: v_mov_b32_e32 v2, s10 3115; GFX9-NEXT: v_mov_b32_e32 v3, s11 3116; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 3117; GFX9-NEXT: s_endpgm 3118; 3119; EG-LABEL: v8i32_arg: 3120; EG: ; %bb.0: ; %entry 3121; EG-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] 3122; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0 3123; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 3124; EG-NEXT: CF_END 3125; EG-NEXT: ALU clause starting at 4: 3126; EG-NEXT: MOV * T0.W, KC0[5].X, 3127; EG-NEXT: MOV * T0.Z, KC0[4].W, 3128; EG-NEXT: MOV T0.Y, KC0[4].Z, 3129; EG-NEXT: MOV * T1.W, KC0[6].X, 3130; EG-NEXT: MOV T0.X, KC0[4].Y, 3131; EG-NEXT: MOV * T1.Z, KC0[5].W, 3132; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 3133; EG-NEXT: MOV * T1.Y, KC0[5].Z, 3134; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3135; EG-NEXT: MOV T1.X, KC0[5].Y, 3136; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 3137; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3138; EG-NEXT: LSHR * T3.X, PV.W, literal.x, 3139; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3140; 3141; CM-LABEL: v8i32_arg: 3142; CM: ; %bb.0: ; %entry 3143; CM-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] 3144; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X 3145; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T2.X 3146; CM-NEXT: CF_END 3147; CM-NEXT: ALU clause starting at 4: 3148; CM-NEXT: MOV * T0.W, KC0[6].X, 3149; CM-NEXT: MOV * T0.Z, KC0[5].W, 3150; CM-NEXT: MOV * T0.Y, KC0[5].Z, 3151; CM-NEXT: MOV T0.X, KC0[5].Y, 3152; CM-NEXT: MOV * T1.W, KC0[5].X, 3153; CM-NEXT: MOV T1.Z, KC0[4].W, 3154; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 3155; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3156; CM-NEXT: LSHR T2.X, PV.W, literal.x, 3157; CM-NEXT: MOV * T1.Y, KC0[4].Z, 3158; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3159; CM-NEXT: MOV * T1.X, KC0[4].Y, 3160; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 
3161; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3162entry: 3163 store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4 3164 ret void 3165} 3166 3167define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind { 3168; SI-LABEL: v8f32_arg: 3169; SI: ; %bb.0: ; %entry 3170; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 3171; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3172; SI-NEXT: s_mov_b32 s3, 0xf000 3173; SI-NEXT: s_mov_b32 s2, -1 3174; SI-NEXT: s_waitcnt lgkmcnt(0) 3175; SI-NEXT: v_mov_b32_e32 v0, s8 3176; SI-NEXT: v_mov_b32_e32 v1, s9 3177; SI-NEXT: v_mov_b32_e32 v2, s10 3178; SI-NEXT: v_mov_b32_e32 v3, s11 3179; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 3180; SI-NEXT: s_waitcnt expcnt(0) 3181; SI-NEXT: v_mov_b32_e32 v0, s4 3182; SI-NEXT: v_mov_b32_e32 v1, s5 3183; SI-NEXT: v_mov_b32_e32 v2, s6 3184; SI-NEXT: v_mov_b32_e32 v3, s7 3185; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 3186; SI-NEXT: s_endpgm 3187; 3188; VI-LABEL: v8f32_arg: 3189; VI: ; %bb.0: ; %entry 3190; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 3191; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3192; VI-NEXT: s_waitcnt lgkmcnt(0) 3193; VI-NEXT: v_mov_b32_e32 v0, s8 3194; VI-NEXT: s_add_u32 s2, s0, 16 3195; VI-NEXT: s_addc_u32 s3, s1, 0 3196; VI-NEXT: v_mov_b32_e32 v5, s3 3197; VI-NEXT: v_mov_b32_e32 v1, s9 3198; VI-NEXT: v_mov_b32_e32 v2, s10 3199; VI-NEXT: v_mov_b32_e32 v3, s11 3200; VI-NEXT: v_mov_b32_e32 v4, s2 3201; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3202; VI-NEXT: v_mov_b32_e32 v5, s1 3203; VI-NEXT: v_mov_b32_e32 v0, s4 3204; VI-NEXT: v_mov_b32_e32 v1, s5 3205; VI-NEXT: v_mov_b32_e32 v2, s6 3206; VI-NEXT: v_mov_b32_e32 v3, s7 3207; VI-NEXT: v_mov_b32_e32 v4, s0 3208; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3209; VI-NEXT: s_endpgm 3210; 3211; GFX9-LABEL: v8f32_arg: 3212; GFX9: ; %bb.0: ; %entry 3213; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 3214; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 
3215; GFX9-NEXT: v_mov_b32_e32 v4, 0 3216; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3217; GFX9-NEXT: v_mov_b32_e32 v0, s12 3218; GFX9-NEXT: v_mov_b32_e32 v1, s13 3219; GFX9-NEXT: v_mov_b32_e32 v2, s14 3220; GFX9-NEXT: v_mov_b32_e32 v3, s15 3221; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 3222; GFX9-NEXT: s_nop 0 3223; GFX9-NEXT: v_mov_b32_e32 v0, s8 3224; GFX9-NEXT: v_mov_b32_e32 v1, s9 3225; GFX9-NEXT: v_mov_b32_e32 v2, s10 3226; GFX9-NEXT: v_mov_b32_e32 v3, s11 3227; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 3228; GFX9-NEXT: s_endpgm 3229; 3230; EG-LABEL: v8f32_arg: 3231; EG: ; %bb.0: ; %entry 3232; EG-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] 3233; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0 3234; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 3235; EG-NEXT: CF_END 3236; EG-NEXT: ALU clause starting at 4: 3237; EG-NEXT: MOV * T0.W, KC0[5].X, 3238; EG-NEXT: MOV * T0.Z, KC0[4].W, 3239; EG-NEXT: MOV T0.Y, KC0[4].Z, 3240; EG-NEXT: MOV * T1.W, KC0[6].X, 3241; EG-NEXT: MOV T0.X, KC0[4].Y, 3242; EG-NEXT: MOV * T1.Z, KC0[5].W, 3243; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 3244; EG-NEXT: MOV * T1.Y, KC0[5].Z, 3245; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3246; EG-NEXT: MOV T1.X, KC0[5].Y, 3247; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 3248; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3249; EG-NEXT: LSHR * T3.X, PV.W, literal.x, 3250; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3251; 3252; CM-LABEL: v8f32_arg: 3253; CM: ; %bb.0: ; %entry 3254; CM-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] 3255; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X 3256; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T2.X 3257; CM-NEXT: CF_END 3258; CM-NEXT: ALU clause starting at 4: 3259; CM-NEXT: MOV * T0.W, KC0[6].X, 3260; CM-NEXT: MOV * T0.Z, KC0[5].W, 3261; CM-NEXT: MOV * T0.Y, KC0[5].Z, 3262; CM-NEXT: MOV T0.X, KC0[5].Y, 3263; CM-NEXT: MOV * T1.W, KC0[5].X, 3264; CM-NEXT: MOV T1.Z, KC0[4].W, 3265; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 3266; CM-NEXT: 
16(2.242078e-44), 0(0.000000e+00) 3267; CM-NEXT: LSHR T2.X, PV.W, literal.x, 3268; CM-NEXT: MOV * T1.Y, KC0[4].Z, 3269; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3270; CM-NEXT: MOV * T1.X, KC0[4].Y, 3271; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 3272; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3273entry: 3274 store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4 3275 ret void 3276} 3277 3278; FIXME: Pack/repack on VI 3279define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) { 3280; SI-LABEL: v16i8_arg: 3281; SI: ; %bb.0: ; %entry 3282; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 3283; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3284; SI-NEXT: s_mov_b32 s3, 0xf000 3285; SI-NEXT: s_mov_b32 s2, -1 3286; SI-NEXT: s_waitcnt lgkmcnt(0) 3287; SI-NEXT: v_mov_b32_e32 v0, s4 3288; SI-NEXT: v_mov_b32_e32 v1, s5 3289; SI-NEXT: v_mov_b32_e32 v2, s6 3290; SI-NEXT: v_mov_b32_e32 v3, s7 3291; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 3292; SI-NEXT: s_endpgm 3293; 3294; VI-LABEL: v16i8_arg: 3295; VI: ; %bb.0: ; %entry 3296; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 3297; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 3298; VI-NEXT: s_waitcnt lgkmcnt(0) 3299; VI-NEXT: v_mov_b32_e32 v4, s4 3300; VI-NEXT: v_mov_b32_e32 v0, s0 3301; VI-NEXT: v_mov_b32_e32 v5, s5 3302; VI-NEXT: v_mov_b32_e32 v1, s1 3303; VI-NEXT: v_mov_b32_e32 v2, s2 3304; VI-NEXT: v_mov_b32_e32 v3, s3 3305; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3306; VI-NEXT: s_endpgm 3307; 3308; GFX9-LABEL: v16i8_arg: 3309; GFX9: ; %bb.0: ; %entry 3310; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 3311; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 3312; GFX9-NEXT: v_mov_b32_e32 v4, 0 3313; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3314; GFX9-NEXT: v_mov_b32_e32 v0, s0 3315; GFX9-NEXT: v_mov_b32_e32 v1, s1 3316; GFX9-NEXT: v_mov_b32_e32 v2, s2 3317; GFX9-NEXT: v_mov_b32_e32 v3, s3 3318; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 3319; GFX9-NEXT: s_endpgm 3320; 3321; EG-LABEL: 
v16i8_arg: 3322; EG: ; %bb.0: ; %entry 3323; EG-NEXT: ALU 1, @68, KC0[], KC1[] 3324; EG-NEXT: TEX 0 @36 3325; EG-NEXT: ALU 5, @70, KC0[], KC1[] 3326; EG-NEXT: TEX 0 @38 3327; EG-NEXT: ALU 5, @76, KC0[], KC1[] 3328; EG-NEXT: TEX 0 @40 3329; EG-NEXT: ALU 5, @82, KC0[], KC1[] 3330; EG-NEXT: TEX 0 @42 3331; EG-NEXT: ALU 5, @88, KC0[], KC1[] 3332; EG-NEXT: TEX 0 @44 3333; EG-NEXT: ALU 7, @94, KC0[], KC1[] 3334; EG-NEXT: TEX 0 @46 3335; EG-NEXT: ALU 7, @102, KC0[], KC1[] 3336; EG-NEXT: TEX 0 @48 3337; EG-NEXT: ALU 7, @110, KC0[], KC1[] 3338; EG-NEXT: TEX 0 @50 3339; EG-NEXT: ALU 7, @118, KC0[], KC1[] 3340; EG-NEXT: TEX 0 @52 3341; EG-NEXT: ALU 7, @126, KC0[], KC1[] 3342; EG-NEXT: TEX 0 @54 3343; EG-NEXT: ALU 7, @134, KC0[], KC1[] 3344; EG-NEXT: TEX 0 @56 3345; EG-NEXT: ALU 7, @142, KC0[], KC1[] 3346; EG-NEXT: TEX 0 @58 3347; EG-NEXT: ALU 7, @150, KC0[], KC1[] 3348; EG-NEXT: TEX 0 @60 3349; EG-NEXT: ALU 5, @158, KC0[], KC1[] 3350; EG-NEXT: TEX 0 @62 3351; EG-NEXT: ALU 5, @164, KC0[], KC1[] 3352; EG-NEXT: TEX 0 @64 3353; EG-NEXT: ALU 5, @170, KC0[], KC1[] 3354; EG-NEXT: TEX 0 @66 3355; EG-NEXT: ALU 5, @176, KC0[CB0:0-32], KC1[] 3356; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1 3357; EG-NEXT: CF_END 3358; EG-NEXT: PAD 3359; EG-NEXT: Fetch clause starting at 36: 3360; EG-NEXT: VTX_READ_8 T8.X, T7.X, 67, #3 3361; EG-NEXT: Fetch clause starting at 38: 3362; EG-NEXT: VTX_READ_8 T8.X, T7.X, 63, #3 3363; EG-NEXT: Fetch clause starting at 40: 3364; EG-NEXT: VTX_READ_8 T8.X, T7.X, 59, #3 3365; EG-NEXT: Fetch clause starting at 42: 3366; EG-NEXT: VTX_READ_8 T8.X, T7.X, 55, #3 3367; EG-NEXT: Fetch clause starting at 44: 3368; EG-NEXT: VTX_READ_8 T8.X, T7.X, 66, #3 3369; EG-NEXT: Fetch clause starting at 46: 3370; EG-NEXT: VTX_READ_8 T8.X, T7.X, 62, #3 3371; EG-NEXT: Fetch clause starting at 48: 3372; EG-NEXT: VTX_READ_8 T8.X, T7.X, 58, #3 3373; EG-NEXT: Fetch clause starting at 50: 3374; EG-NEXT: VTX_READ_8 T8.X, T7.X, 54, #3 3375; EG-NEXT: Fetch clause starting at 52: 
3376; EG-NEXT: VTX_READ_8 T8.X, T7.X, 65, #3 3377; EG-NEXT: Fetch clause starting at 54: 3378; EG-NEXT: VTX_READ_8 T8.X, T7.X, 61, #3 3379; EG-NEXT: Fetch clause starting at 56: 3380; EG-NEXT: VTX_READ_8 T8.X, T7.X, 57, #3 3381; EG-NEXT: Fetch clause starting at 58: 3382; EG-NEXT: VTX_READ_8 T8.X, T7.X, 53, #3 3383; EG-NEXT: Fetch clause starting at 60: 3384; EG-NEXT: VTX_READ_8 T8.X, T7.X, 64, #3 3385; EG-NEXT: Fetch clause starting at 62: 3386; EG-NEXT: VTX_READ_8 T8.X, T7.X, 60, #3 3387; EG-NEXT: Fetch clause starting at 64: 3388; EG-NEXT: VTX_READ_8 T8.X, T7.X, 56, #3 3389; EG-NEXT: Fetch clause starting at 66: 3390; EG-NEXT: VTX_READ_8 T7.X, T7.X, 52, #3 3391; EG-NEXT: ALU clause starting at 68: 3392; EG-NEXT: MOV * T0.Y, T2.X, 3393; EG-NEXT: MOV * T7.X, 0.0, 3394; EG-NEXT: ALU clause starting at 70: 3395; EG-NEXT: LSHL T0.W, T8.X, literal.x, 3396; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3397; EG-NEXT: 24(3.363116e-44), 16777215(2.350989e-38) 3398; EG-NEXT: OR_INT * T0.W, PS, PV.W, 3399; EG-NEXT: MOV T2.X, PV.W, 3400; EG-NEXT: MOV * T0.Y, T3.X, 3401; EG-NEXT: ALU clause starting at 76: 3402; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3403; EG-NEXT: LSHL * T1.W, T8.X, literal.y, 3404; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 3405; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3406; EG-NEXT: MOV T3.X, PV.W, 3407; EG-NEXT: MOV * T0.Y, T4.X, 3408; EG-NEXT: ALU clause starting at 82: 3409; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3410; EG-NEXT: LSHL * T1.W, T8.X, literal.y, 3411; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 3412; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3413; EG-NEXT: MOV T4.X, PV.W, 3414; EG-NEXT: MOV * T0.Y, T5.X, 3415; EG-NEXT: ALU clause starting at 88: 3416; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3417; EG-NEXT: LSHL * T1.W, T8.X, literal.y, 3418; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 3419; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3420; EG-NEXT: MOV T5.X, PV.W, 3421; EG-NEXT: MOV * T0.Y, T2.X, 3422; EG-NEXT: ALU clause starting at 94: 3423; 
EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3424; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3425; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) 3426; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3427; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3428; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3429; EG-NEXT: MOV T2.X, PV.W, 3430; EG-NEXT: MOV * T0.Y, T3.X, 3431; EG-NEXT: ALU clause starting at 102: 3432; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3433; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3434; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) 3435; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3436; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3437; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3438; EG-NEXT: MOV T3.X, PV.W, 3439; EG-NEXT: MOV * T0.Y, T4.X, 3440; EG-NEXT: ALU clause starting at 110: 3441; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3442; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3443; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) 3444; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3445; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3446; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3447; EG-NEXT: MOV T4.X, PV.W, 3448; EG-NEXT: MOV * T0.Y, T5.X, 3449; EG-NEXT: ALU clause starting at 118: 3450; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3451; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3452; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) 3453; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3454; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3455; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3456; EG-NEXT: MOV T5.X, PV.W, 3457; EG-NEXT: MOV * T0.Y, T2.X, 3458; EG-NEXT: ALU clause starting at 126: 3459; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3460; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3461; EG-NEXT: 255(3.573311e-43), -65281(nan) 3462; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3463; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 3464; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3465; EG-NEXT: MOV T2.X, PV.W, 3466; EG-NEXT: MOV * T0.Y, T3.X, 3467; EG-NEXT: ALU clause starting at 134: 3468; EG-NEXT: AND_INT T0.W, T8.X, 
literal.x, 3469; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3470; EG-NEXT: 255(3.573311e-43), -65281(nan) 3471; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3472; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 3473; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3474; EG-NEXT: MOV T3.X, PV.W, 3475; EG-NEXT: MOV * T0.Y, T4.X, 3476; EG-NEXT: ALU clause starting at 142: 3477; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3478; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3479; EG-NEXT: 255(3.573311e-43), -65281(nan) 3480; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3481; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 3482; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3483; EG-NEXT: MOV T4.X, PV.W, 3484; EG-NEXT: MOV * T0.Y, T5.X, 3485; EG-NEXT: ALU clause starting at 150: 3486; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3487; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3488; EG-NEXT: 255(3.573311e-43), -65281(nan) 3489; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3490; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 3491; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3492; EG-NEXT: MOV T5.X, PV.W, 3493; EG-NEXT: MOV * T0.Y, T2.X, 3494; EG-NEXT: ALU clause starting at 158: 3495; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3496; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, 3497; EG-NEXT: -256(nan), 255(3.573311e-43) 3498; EG-NEXT: OR_INT * T7.W, PV.W, PS, 3499; EG-NEXT: MOV T2.X, PV.W, 3500; EG-NEXT: MOV * T0.Y, T3.X, 3501; EG-NEXT: ALU clause starting at 164: 3502; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3503; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, 3504; EG-NEXT: -256(nan), 255(3.573311e-43) 3505; EG-NEXT: OR_INT * T7.Z, PV.W, PS, 3506; EG-NEXT: MOV T3.X, PV.Z, 3507; EG-NEXT: MOV * T0.Y, T4.X, 3508; EG-NEXT: ALU clause starting at 170: 3509; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3510; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, 3511; EG-NEXT: -256(nan), 255(3.573311e-43) 3512; EG-NEXT: OR_INT * T7.Y, PV.W, PS, 3513; EG-NEXT: MOV T4.X, PV.Y, 3514; EG-NEXT: MOV * T0.Y, T5.X, 3515; EG-NEXT: ALU clause starting at 176: 3516; EG-NEXT: AND_INT T0.W, 
T0.Y, literal.x, 3517; EG-NEXT: AND_INT * T1.W, T7.X, literal.y, 3518; EG-NEXT: -256(nan), 255(3.573311e-43) 3519; EG-NEXT: OR_INT T7.X, PV.W, PS, 3520; EG-NEXT: LSHR * T8.X, KC0[2].Y, literal.x, 3521; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3522; 3523; CM-LABEL: v16i8_arg: 3524; CM: ; %bb.0: ; %entry 3525; CM-NEXT: ALU 1, @68, KC0[], KC1[] 3526; CM-NEXT: TEX 0 @36 3527; CM-NEXT: ALU 5, @70, KC0[], KC1[] 3528; CM-NEXT: TEX 0 @38 3529; CM-NEXT: ALU 5, @76, KC0[], KC1[] 3530; CM-NEXT: TEX 0 @40 3531; CM-NEXT: ALU 5, @82, KC0[], KC1[] 3532; CM-NEXT: TEX 0 @42 3533; CM-NEXT: ALU 5, @88, KC0[], KC1[] 3534; CM-NEXT: TEX 0 @44 3535; CM-NEXT: ALU 7, @94, KC0[], KC1[] 3536; CM-NEXT: TEX 0 @46 3537; CM-NEXT: ALU 7, @102, KC0[], KC1[] 3538; CM-NEXT: TEX 0 @48 3539; CM-NEXT: ALU 7, @110, KC0[], KC1[] 3540; CM-NEXT: TEX 0 @50 3541; CM-NEXT: ALU 7, @118, KC0[], KC1[] 3542; CM-NEXT: TEX 0 @52 3543; CM-NEXT: ALU 7, @126, KC0[], KC1[] 3544; CM-NEXT: TEX 0 @54 3545; CM-NEXT: ALU 7, @134, KC0[], KC1[] 3546; CM-NEXT: TEX 0 @56 3547; CM-NEXT: ALU 7, @142, KC0[], KC1[] 3548; CM-NEXT: TEX 0 @58 3549; CM-NEXT: ALU 7, @150, KC0[], KC1[] 3550; CM-NEXT: TEX 0 @60 3551; CM-NEXT: ALU 5, @158, KC0[], KC1[] 3552; CM-NEXT: TEX 0 @62 3553; CM-NEXT: ALU 5, @164, KC0[], KC1[] 3554; CM-NEXT: TEX 0 @64 3555; CM-NEXT: ALU 5, @170, KC0[], KC1[] 3556; CM-NEXT: TEX 0 @66 3557; CM-NEXT: ALU 5, @176, KC0[CB0:0-32], KC1[] 3558; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X 3559; CM-NEXT: CF_END 3560; CM-NEXT: PAD 3561; CM-NEXT: Fetch clause starting at 36: 3562; CM-NEXT: VTX_READ_8 T8.X, T7.X, 67, #3 3563; CM-NEXT: Fetch clause starting at 38: 3564; CM-NEXT: VTX_READ_8 T8.X, T7.X, 63, #3 3565; CM-NEXT: Fetch clause starting at 40: 3566; CM-NEXT: VTX_READ_8 T8.X, T7.X, 59, #3 3567; CM-NEXT: Fetch clause starting at 42: 3568; CM-NEXT: VTX_READ_8 T8.X, T7.X, 55, #3 3569; CM-NEXT: Fetch clause starting at 44: 3570; CM-NEXT: VTX_READ_8 T8.X, T7.X, 66, #3 3571; CM-NEXT: Fetch clause starting at 46: 3572; 
CM-NEXT: VTX_READ_8 T8.X, T7.X, 62, #3 3573; CM-NEXT: Fetch clause starting at 48: 3574; CM-NEXT: VTX_READ_8 T8.X, T7.X, 58, #3 3575; CM-NEXT: Fetch clause starting at 50: 3576; CM-NEXT: VTX_READ_8 T8.X, T7.X, 54, #3 3577; CM-NEXT: Fetch clause starting at 52: 3578; CM-NEXT: VTX_READ_8 T8.X, T7.X, 65, #3 3579; CM-NEXT: Fetch clause starting at 54: 3580; CM-NEXT: VTX_READ_8 T8.X, T7.X, 61, #3 3581; CM-NEXT: Fetch clause starting at 56: 3582; CM-NEXT: VTX_READ_8 T8.X, T7.X, 57, #3 3583; CM-NEXT: Fetch clause starting at 58: 3584; CM-NEXT: VTX_READ_8 T8.X, T7.X, 53, #3 3585; CM-NEXT: Fetch clause starting at 60: 3586; CM-NEXT: VTX_READ_8 T8.X, T7.X, 64, #3 3587; CM-NEXT: Fetch clause starting at 62: 3588; CM-NEXT: VTX_READ_8 T8.X, T7.X, 60, #3 3589; CM-NEXT: Fetch clause starting at 64: 3590; CM-NEXT: VTX_READ_8 T8.X, T7.X, 56, #3 3591; CM-NEXT: Fetch clause starting at 66: 3592; CM-NEXT: VTX_READ_8 T7.X, T7.X, 52, #3 3593; CM-NEXT: ALU clause starting at 68: 3594; CM-NEXT: MOV * T0.Y, T2.X, 3595; CM-NEXT: MOV * T7.X, 0.0, 3596; CM-NEXT: ALU clause starting at 70: 3597; CM-NEXT: LSHL T0.Z, T8.X, literal.x, 3598; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, 3599; CM-NEXT: 24(3.363116e-44), 16777215(2.350989e-38) 3600; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, 3601; CM-NEXT: MOV T2.X, PV.W, 3602; CM-NEXT: MOV * T0.Y, T3.X, 3603; CM-NEXT: ALU clause starting at 76: 3604; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3605; CM-NEXT: LSHL * T0.W, T8.X, literal.y, 3606; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 3607; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3608; CM-NEXT: MOV T3.X, PV.W, 3609; CM-NEXT: MOV * T0.Y, T4.X, 3610; CM-NEXT: ALU clause starting at 82: 3611; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3612; CM-NEXT: LSHL * T0.W, T8.X, literal.y, 3613; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 3614; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3615; CM-NEXT: MOV T4.X, PV.W, 3616; CM-NEXT: MOV * T0.Y, T5.X, 3617; CM-NEXT: ALU clause starting at 88: 3618; CM-NEXT: AND_INT T0.Z, T0.Y, 
literal.x, 3619; CM-NEXT: LSHL * T0.W, T8.X, literal.y, 3620; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 3621; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3622; CM-NEXT: MOV T5.X, PV.W, 3623; CM-NEXT: MOV * T0.Y, T2.X, 3624; CM-NEXT: ALU clause starting at 94: 3625; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3626; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3627; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3628; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3629; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) 3630; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3631; CM-NEXT: MOV T2.X, PV.W, 3632; CM-NEXT: MOV * T0.Y, T3.X, 3633; CM-NEXT: ALU clause starting at 102: 3634; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3635; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3636; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3637; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3638; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) 3639; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3640; CM-NEXT: MOV T3.X, PV.W, 3641; CM-NEXT: MOV * T0.Y, T4.X, 3642; CM-NEXT: ALU clause starting at 110: 3643; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3644; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3645; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3646; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3647; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) 3648; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3649; CM-NEXT: MOV T4.X, PV.W, 3650; CM-NEXT: MOV * T0.Y, T5.X, 3651; CM-NEXT: ALU clause starting at 118: 3652; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3653; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3654; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3655; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3656; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) 3657; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3658; CM-NEXT: MOV T5.X, PV.W, 3659; CM-NEXT: MOV * T0.Y, T2.X, 3660; CM-NEXT: ALU clause starting at 126: 3661; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3662; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3663; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3664; 
CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3665; CM-NEXT: -65281(nan), 8(1.121039e-44) 3666; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3667; CM-NEXT: MOV T2.X, PV.W, 3668; CM-NEXT: MOV * T0.Y, T3.X, 3669; CM-NEXT: ALU clause starting at 134: 3670; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3671; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3672; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3673; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3674; CM-NEXT: -65281(nan), 8(1.121039e-44) 3675; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3676; CM-NEXT: MOV T3.X, PV.W, 3677; CM-NEXT: MOV * T0.Y, T4.X, 3678; CM-NEXT: ALU clause starting at 142: 3679; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3680; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3681; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3682; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3683; CM-NEXT: -65281(nan), 8(1.121039e-44) 3684; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3685; CM-NEXT: MOV T4.X, PV.W, 3686; CM-NEXT: MOV * T0.Y, T5.X, 3687; CM-NEXT: ALU clause starting at 150: 3688; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3689; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3690; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3691; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3692; CM-NEXT: -65281(nan), 8(1.121039e-44) 3693; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3694; CM-NEXT: MOV T5.X, PV.W, 3695; CM-NEXT: MOV * T0.Y, T2.X, 3696; CM-NEXT: ALU clause starting at 158: 3697; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3698; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, 3699; CM-NEXT: -256(nan), 255(3.573311e-43) 3700; CM-NEXT: OR_INT * T7.W, PV.Z, PV.W, 3701; CM-NEXT: MOV T2.X, PV.W, 3702; CM-NEXT: MOV * T0.Y, T3.X, 3703; CM-NEXT: ALU clause starting at 164: 3704; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3705; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, 3706; CM-NEXT: -256(nan), 255(3.573311e-43) 3707; CM-NEXT: OR_INT * T7.Z, PV.Z, PV.W, 3708; CM-NEXT: MOV T3.X, PV.Z, 3709; CM-NEXT: MOV * T0.Y, T4.X, 3710; CM-NEXT: ALU clause starting at 170: 3711; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 
3712; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, 3713; CM-NEXT: -256(nan), 255(3.573311e-43) 3714; CM-NEXT: OR_INT * T7.Y, PV.Z, PV.W, 3715; CM-NEXT: MOV T4.X, PV.Y, 3716; CM-NEXT: MOV * T0.Y, T5.X, 3717; CM-NEXT: ALU clause starting at 176: 3718; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3719; CM-NEXT: AND_INT * T0.W, T7.X, literal.y, 3720; CM-NEXT: -256(nan), 255(3.573311e-43) 3721; CM-NEXT: OR_INT * T7.X, PV.Z, PV.W, 3722; CM-NEXT: LSHR * T8.X, KC0[2].Y, literal.x, 3723; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3724entry: 3725 store <16 x i8> %in, <16 x i8> addrspace(1)* %out 3726 ret void 3727} 3728 3729define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) { 3730; SI-LABEL: v16i16_arg: 3731; SI: ; %bb.0: ; %entry 3732; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 3733; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3734; SI-NEXT: s_mov_b32 s3, 0xf000 3735; SI-NEXT: s_mov_b32 s2, -1 3736; SI-NEXT: s_waitcnt lgkmcnt(0) 3737; SI-NEXT: v_mov_b32_e32 v0, s8 3738; SI-NEXT: v_mov_b32_e32 v1, s9 3739; SI-NEXT: v_mov_b32_e32 v2, s10 3740; SI-NEXT: v_mov_b32_e32 v3, s11 3741; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 3742; SI-NEXT: s_waitcnt expcnt(0) 3743; SI-NEXT: v_mov_b32_e32 v0, s4 3744; SI-NEXT: v_mov_b32_e32 v1, s5 3745; SI-NEXT: v_mov_b32_e32 v2, s6 3746; SI-NEXT: v_mov_b32_e32 v3, s7 3747; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 3748; SI-NEXT: s_endpgm 3749; 3750; VI-LABEL: v16i16_arg: 3751; VI: ; %bb.0: ; %entry 3752; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 3753; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3754; VI-NEXT: s_waitcnt lgkmcnt(0) 3755; VI-NEXT: v_mov_b32_e32 v0, s8 3756; VI-NEXT: s_add_u32 s2, s0, 16 3757; VI-NEXT: s_addc_u32 s3, s1, 0 3758; VI-NEXT: v_mov_b32_e32 v5, s3 3759; VI-NEXT: v_mov_b32_e32 v1, s9 3760; VI-NEXT: v_mov_b32_e32 v2, s10 3761; VI-NEXT: v_mov_b32_e32 v3, s11 3762; VI-NEXT: v_mov_b32_e32 v4, s2 3763; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3764; VI-NEXT: 
v_mov_b32_e32 v5, s1 3765; VI-NEXT: v_mov_b32_e32 v0, s4 3766; VI-NEXT: v_mov_b32_e32 v1, s5 3767; VI-NEXT: v_mov_b32_e32 v2, s6 3768; VI-NEXT: v_mov_b32_e32 v3, s7 3769; VI-NEXT: v_mov_b32_e32 v4, s0 3770; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3771; VI-NEXT: s_endpgm 3772; 3773; GFX9-LABEL: v16i16_arg: 3774; GFX9: ; %bb.0: ; %entry 3775; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 3776; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 3777; GFX9-NEXT: v_mov_b32_e32 v4, 0 3778; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3779; GFX9-NEXT: v_mov_b32_e32 v0, s12 3780; GFX9-NEXT: v_mov_b32_e32 v1, s13 3781; GFX9-NEXT: v_mov_b32_e32 v2, s14 3782; GFX9-NEXT: v_mov_b32_e32 v3, s15 3783; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 3784; GFX9-NEXT: s_nop 0 3785; GFX9-NEXT: v_mov_b32_e32 v0, s8 3786; GFX9-NEXT: v_mov_b32_e32 v1, s9 3787; GFX9-NEXT: v_mov_b32_e32 v2, s10 3788; GFX9-NEXT: v_mov_b32_e32 v3, s11 3789; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 3790; GFX9-NEXT: s_endpgm 3791; 3792; EG-LABEL: v16i16_arg: 3793; EG: ; %bb.0: ; %entry 3794; EG-NEXT: ALU 1, @68, KC0[], KC1[] 3795; EG-NEXT: TEX 0 @36 3796; EG-NEXT: ALU 5, @70, KC0[], KC1[] 3797; EG-NEXT: TEX 0 @38 3798; EG-NEXT: ALU 5, @76, KC0[], KC1[] 3799; EG-NEXT: TEX 0 @40 3800; EG-NEXT: ALU 5, @82, KC0[], KC1[] 3801; EG-NEXT: TEX 0 @42 3802; EG-NEXT: ALU 5, @88, KC0[], KC1[] 3803; EG-NEXT: TEX 0 @44 3804; EG-NEXT: ALU 5, @94, KC0[], KC1[] 3805; EG-NEXT: TEX 0 @46 3806; EG-NEXT: ALU 5, @100, KC0[], KC1[] 3807; EG-NEXT: TEX 0 @48 3808; EG-NEXT: ALU 5, @106, KC0[], KC1[] 3809; EG-NEXT: TEX 0 @50 3810; EG-NEXT: ALU 5, @112, KC0[], KC1[] 3811; EG-NEXT: TEX 0 @52 3812; EG-NEXT: ALU 5, @118, KC0[], KC1[] 3813; EG-NEXT: TEX 0 @54 3814; EG-NEXT: ALU 5, @124, KC0[], KC1[] 3815; EG-NEXT: TEX 0 @56 3816; EG-NEXT: ALU 5, @130, KC0[], KC1[] 3817; EG-NEXT: TEX 0 @58 3818; EG-NEXT: ALU 5, @136, KC0[], KC1[] 3819; EG-NEXT: TEX 0 @60 3820; EG-NEXT: ALU 5, @142, KC0[], KC1[] 3821; EG-NEXT: TEX 0 @62 3822; 
EG-NEXT: ALU 5, @148, KC0[], KC1[] 3823; EG-NEXT: TEX 0 @64 3824; EG-NEXT: ALU 5, @154, KC0[], KC1[] 3825; EG-NEXT: TEX 0 @66 3826; EG-NEXT: ALU 13, @160, KC0[CB0:0-32], KC1[] 3827; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0 3828; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 1 3829; EG-NEXT: CF_END 3830; EG-NEXT: Fetch clause starting at 36: 3831; EG-NEXT: VTX_READ_16 T12.X, T11.X, 98, #3 3832; EG-NEXT: Fetch clause starting at 38: 3833; EG-NEXT: VTX_READ_16 T12.X, T11.X, 90, #3 3834; EG-NEXT: Fetch clause starting at 40: 3835; EG-NEXT: VTX_READ_16 T12.X, T11.X, 82, #3 3836; EG-NEXT: Fetch clause starting at 42: 3837; EG-NEXT: VTX_READ_16 T12.X, T11.X, 74, #3 3838; EG-NEXT: Fetch clause starting at 44: 3839; EG-NEXT: VTX_READ_16 T12.X, T11.X, 96, #3 3840; EG-NEXT: Fetch clause starting at 46: 3841; EG-NEXT: VTX_READ_16 T12.X, T11.X, 88, #3 3842; EG-NEXT: Fetch clause starting at 48: 3843; EG-NEXT: VTX_READ_16 T12.X, T11.X, 80, #3 3844; EG-NEXT: Fetch clause starting at 50: 3845; EG-NEXT: VTX_READ_16 T12.X, T11.X, 72, #3 3846; EG-NEXT: Fetch clause starting at 52: 3847; EG-NEXT: VTX_READ_16 T12.X, T11.X, 94, #3 3848; EG-NEXT: Fetch clause starting at 54: 3849; EG-NEXT: VTX_READ_16 T12.X, T11.X, 86, #3 3850; EG-NEXT: Fetch clause starting at 56: 3851; EG-NEXT: VTX_READ_16 T12.X, T11.X, 78, #3 3852; EG-NEXT: Fetch clause starting at 58: 3853; EG-NEXT: VTX_READ_16 T12.X, T11.X, 70, #3 3854; EG-NEXT: Fetch clause starting at 60: 3855; EG-NEXT: VTX_READ_16 T12.X, T11.X, 92, #3 3856; EG-NEXT: Fetch clause starting at 62: 3857; EG-NEXT: VTX_READ_16 T12.X, T11.X, 84, #3 3858; EG-NEXT: Fetch clause starting at 64: 3859; EG-NEXT: VTX_READ_16 T13.X, T11.X, 76, #3 3860; EG-NEXT: Fetch clause starting at 66: 3861; EG-NEXT: VTX_READ_16 T11.X, T11.X, 68, #3 3862; EG-NEXT: ALU clause starting at 68: 3863; EG-NEXT: MOV * T0.Y, T3.X, 3864; EG-NEXT: MOV * T11.X, 0.0, 3865; EG-NEXT: ALU clause starting at 70: 3866; EG-NEXT: LSHL T0.W, T12.X, literal.x, 3867; 
EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3868; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 3869; EG-NEXT: OR_INT * T0.W, PS, PV.W, 3870; EG-NEXT: MOV T3.X, PV.W, 3871; EG-NEXT: MOV * T0.Y, T5.X, 3872; EG-NEXT: ALU clause starting at 76: 3873; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3874; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3875; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3876; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3877; EG-NEXT: MOV T5.X, PV.W, 3878; EG-NEXT: MOV * T0.Y, T7.X, 3879; EG-NEXT: ALU clause starting at 82: 3880; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3881; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3882; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3883; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3884; EG-NEXT: MOV T7.X, PV.W, 3885; EG-NEXT: MOV * T0.Y, T9.X, 3886; EG-NEXT: ALU clause starting at 88: 3887; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3888; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3889; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3890; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3891; EG-NEXT: MOV T9.X, PV.W, 3892; EG-NEXT: MOV * T0.Y, T3.X, 3893; EG-NEXT: ALU clause starting at 94: 3894; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3895; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, 3896; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3897; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3898; EG-NEXT: MOV T3.X, PV.W, 3899; EG-NEXT: MOV * T0.Y, T5.X, 3900; EG-NEXT: ALU clause starting at 100: 3901; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3902; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, 3903; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3904; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3905; EG-NEXT: MOV T5.X, PV.W, 3906; EG-NEXT: MOV * T0.Y, T7.X, 3907; EG-NEXT: ALU clause starting at 106: 3908; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3909; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, 3910; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3911; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3912; EG-NEXT: MOV T7.X, PV.W, 3913; EG-NEXT: MOV * T0.Y, T9.X, 3914; EG-NEXT: ALU clause starting at 112: 3915; EG-NEXT: 
AND_INT T0.W, T0.Y, literal.x, 3916; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, 3917; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3918; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3919; EG-NEXT: MOV T9.X, PV.W, 3920; EG-NEXT: MOV * T0.Y, T2.X, 3921; EG-NEXT: ALU clause starting at 118: 3922; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3923; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3924; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3925; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3926; EG-NEXT: MOV T2.X, PV.W, 3927; EG-NEXT: MOV * T0.Y, T4.X, 3928; EG-NEXT: ALU clause starting at 124: 3929; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3930; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3931; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3932; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3933; EG-NEXT: MOV T4.X, PV.W, 3934; EG-NEXT: MOV * T0.Y, T6.X, 3935; EG-NEXT: ALU clause starting at 130: 3936; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3937; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3938; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3939; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3940; EG-NEXT: MOV T6.X, PV.W, 3941; EG-NEXT: MOV * T0.Y, T8.X, 3942; EG-NEXT: ALU clause starting at 136: 3943; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3944; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3945; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3946; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3947; EG-NEXT: MOV T8.X, PV.W, 3948; EG-NEXT: MOV * T0.Y, T2.X, 3949; EG-NEXT: ALU clause starting at 142: 3950; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3951; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, 3952; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3953; EG-NEXT: OR_INT * T12.Z, PV.W, PS, 3954; EG-NEXT: MOV T2.X, PV.Z, 3955; EG-NEXT: MOV * T0.Y, T4.X, 3956; EG-NEXT: ALU clause starting at 148: 3957; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3958; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, 3959; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3960; EG-NEXT: OR_INT * T12.X, PV.W, PS, 3961; EG-NEXT: MOV T4.X, PV.X, 3962; EG-NEXT: MOV * T0.Y, T6.X, 3963; EG-NEXT: 
ALU clause starting at 154: 3964; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3965; EG-NEXT: AND_INT * T1.W, T13.X, literal.y, 3966; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3967; EG-NEXT: OR_INT * T11.Z, PV.W, PS, 3968; EG-NEXT: MOV T6.X, PV.Z, 3969; EG-NEXT: MOV * T0.Y, T8.X, 3970; EG-NEXT: ALU clause starting at 160: 3971; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x, 3972; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 3973; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) 3974; EG-NEXT: LSHR T14.X, PV.W, literal.x, 3975; EG-NEXT: AND_INT T0.W, T0.Y, literal.y, 3976; EG-NEXT: AND_INT * T1.W, T11.X, literal.z, 3977; EG-NEXT: 2(2.802597e-45), -65536(nan) 3978; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 3979; EG-NEXT: OR_INT * T11.X, PV.W, PS, 3980; EG-NEXT: MOV T8.X, PV.X, 3981; EG-NEXT: MOV * T12.W, T3.X, 3982; EG-NEXT: MOV T12.Y, T5.X, 3983; EG-NEXT: MOV T11.W, T7.X, BS:VEC_120/SCL_212 3984; EG-NEXT: MOV * T11.Y, T9.X, 3985; 3986; CM-LABEL: v16i16_arg: 3987; CM: ; %bb.0: ; %entry 3988; CM-NEXT: ALU 1, @68, KC0[], KC1[] 3989; CM-NEXT: TEX 0 @36 3990; CM-NEXT: ALU 5, @70, KC0[], KC1[] 3991; CM-NEXT: TEX 0 @38 3992; CM-NEXT: ALU 5, @76, KC0[], KC1[] 3993; CM-NEXT: TEX 0 @40 3994; CM-NEXT: ALU 5, @82, KC0[], KC1[] 3995; CM-NEXT: TEX 0 @42 3996; CM-NEXT: ALU 5, @88, KC0[], KC1[] 3997; CM-NEXT: TEX 0 @44 3998; CM-NEXT: ALU 5, @94, KC0[], KC1[] 3999; CM-NEXT: TEX 0 @46 4000; CM-NEXT: ALU 5, @100, KC0[], KC1[] 4001; CM-NEXT: TEX 0 @48 4002; CM-NEXT: ALU 5, @106, KC0[], KC1[] 4003; CM-NEXT: TEX 0 @50 4004; CM-NEXT: ALU 5, @112, KC0[], KC1[] 4005; CM-NEXT: TEX 0 @52 4006; CM-NEXT: ALU 5, @118, KC0[], KC1[] 4007; CM-NEXT: TEX 0 @54 4008; CM-NEXT: ALU 5, @124, KC0[], KC1[] 4009; CM-NEXT: TEX 0 @56 4010; CM-NEXT: ALU 5, @130, KC0[], KC1[] 4011; CM-NEXT: TEX 0 @58 4012; CM-NEXT: ALU 5, @136, KC0[], KC1[] 4013; CM-NEXT: TEX 0 @60 4014; CM-NEXT: ALU 5, @142, KC0[], KC1[] 4015; CM-NEXT: TEX 0 @62 4016; CM-NEXT: ALU 5, @148, KC0[], KC1[] 4017; CM-NEXT: TEX 0 @64 4018; CM-NEXT: ALU 5, @154, 
KC0[], KC1[] 4019; CM-NEXT: TEX 0 @66 4020; CM-NEXT: ALU 14, @160, KC0[CB0:0-32], KC1[] 4021; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T14.X 4022; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T13.X 4023; CM-NEXT: CF_END 4024; CM-NEXT: Fetch clause starting at 36: 4025; CM-NEXT: VTX_READ_16 T12.X, T11.X, 98, #3 4026; CM-NEXT: Fetch clause starting at 38: 4027; CM-NEXT: VTX_READ_16 T12.X, T11.X, 90, #3 4028; CM-NEXT: Fetch clause starting at 40: 4029; CM-NEXT: VTX_READ_16 T12.X, T11.X, 82, #3 4030; CM-NEXT: Fetch clause starting at 42: 4031; CM-NEXT: VTX_READ_16 T12.X, T11.X, 74, #3 4032; CM-NEXT: Fetch clause starting at 44: 4033; CM-NEXT: VTX_READ_16 T12.X, T11.X, 96, #3 4034; CM-NEXT: Fetch clause starting at 46: 4035; CM-NEXT: VTX_READ_16 T12.X, T11.X, 88, #3 4036; CM-NEXT: Fetch clause starting at 48: 4037; CM-NEXT: VTX_READ_16 T12.X, T11.X, 80, #3 4038; CM-NEXT: Fetch clause starting at 50: 4039; CM-NEXT: VTX_READ_16 T12.X, T11.X, 72, #3 4040; CM-NEXT: Fetch clause starting at 52: 4041; CM-NEXT: VTX_READ_16 T12.X, T11.X, 94, #3 4042; CM-NEXT: Fetch clause starting at 54: 4043; CM-NEXT: VTX_READ_16 T12.X, T11.X, 86, #3 4044; CM-NEXT: Fetch clause starting at 56: 4045; CM-NEXT: VTX_READ_16 T12.X, T11.X, 78, #3 4046; CM-NEXT: Fetch clause starting at 58: 4047; CM-NEXT: VTX_READ_16 T12.X, T11.X, 70, #3 4048; CM-NEXT: Fetch clause starting at 60: 4049; CM-NEXT: VTX_READ_16 T12.X, T11.X, 92, #3 4050; CM-NEXT: Fetch clause starting at 62: 4051; CM-NEXT: VTX_READ_16 T12.X, T11.X, 84, #3 4052; CM-NEXT: Fetch clause starting at 64: 4053; CM-NEXT: VTX_READ_16 T13.X, T11.X, 76, #3 4054; CM-NEXT: Fetch clause starting at 66: 4055; CM-NEXT: VTX_READ_16 T11.X, T11.X, 68, #3 4056; CM-NEXT: ALU clause starting at 68: 4057; CM-NEXT: MOV * T0.Y, T3.X, 4058; CM-NEXT: MOV * T11.X, 0.0, 4059; CM-NEXT: ALU clause starting at 70: 4060; CM-NEXT: LSHL T0.Z, T12.X, literal.x, 4061; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, 4062; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 4063; 
CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, 4064; CM-NEXT: MOV T3.X, PV.W, 4065; CM-NEXT: MOV * T0.Y, T5.X, 4066; CM-NEXT: ALU clause starting at 76: 4067; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4068; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 4069; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 4070; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4071; CM-NEXT: MOV T5.X, PV.W, 4072; CM-NEXT: MOV * T0.Y, T7.X, 4073; CM-NEXT: ALU clause starting at 82: 4074; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4075; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 4076; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 4077; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4078; CM-NEXT: MOV T7.X, PV.W, 4079; CM-NEXT: MOV * T0.Y, T9.X, 4080; CM-NEXT: ALU clause starting at 88: 4081; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4082; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 4083; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 4084; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4085; CM-NEXT: MOV T9.X, PV.W, 4086; CM-NEXT: MOV * T0.Y, T3.X, 4087; CM-NEXT: ALU clause starting at 94: 4088; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4089; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, 4090; CM-NEXT: -65536(nan), 65535(9.183409e-41) 4091; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4092; CM-NEXT: MOV T3.X, PV.W, 4093; CM-NEXT: MOV * T0.Y, T5.X, 4094; CM-NEXT: ALU clause starting at 100: 4095; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4096; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, 4097; CM-NEXT: -65536(nan), 65535(9.183409e-41) 4098; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4099; CM-NEXT: MOV T5.X, PV.W, 4100; CM-NEXT: MOV * T0.Y, T7.X, 4101; CM-NEXT: ALU clause starting at 106: 4102; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4103; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, 4104; CM-NEXT: -65536(nan), 65535(9.183409e-41) 4105; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4106; CM-NEXT: MOV T7.X, PV.W, 4107; CM-NEXT: MOV * T0.Y, T9.X, 4108; CM-NEXT: ALU clause starting at 112: 4109; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4110; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, 4111; 
CM-NEXT: -65536(nan), 65535(9.183409e-41) 4112; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4113; CM-NEXT: MOV T9.X, PV.W, 4114; CM-NEXT: MOV * T0.Y, T2.X, 4115; CM-NEXT: ALU clause starting at 118: 4116; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4117; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 4118; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 4119; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4120; CM-NEXT: MOV T2.X, PV.W, 4121; CM-NEXT: MOV * T0.Y, T4.X, 4122; CM-NEXT: ALU clause starting at 124: 4123; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4124; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 4125; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 4126; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4127; CM-NEXT: MOV T4.X, PV.W, 4128; CM-NEXT: MOV * T0.Y, T6.X, 4129; CM-NEXT: ALU clause starting at 130: 4130; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4131; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 4132; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 4133; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4134; CM-NEXT: MOV T6.X, PV.W, 4135; CM-NEXT: MOV * T0.Y, T8.X, 4136; CM-NEXT: ALU clause starting at 136: 4137; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4138; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 4139; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 4140; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 4141; CM-NEXT: MOV T8.X, PV.W, 4142; CM-NEXT: MOV * T0.Y, T2.X, 4143; CM-NEXT: ALU clause starting at 142: 4144; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4145; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, 4146; CM-NEXT: -65536(nan), 65535(9.183409e-41) 4147; CM-NEXT: OR_INT * T12.Z, PV.Z, PV.W, 4148; CM-NEXT: MOV T2.X, PV.Z, 4149; CM-NEXT: MOV * T0.Y, T4.X, 4150; CM-NEXT: ALU clause starting at 148: 4151; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 4152; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, 4153; CM-NEXT: -65536(nan), 65535(9.183409e-41) 4154; CM-NEXT: OR_INT * T12.X, PV.Z, PV.W, 4155; CM-NEXT: MOV T4.X, PV.X, 4156; CM-NEXT: MOV * T0.Y, T6.X, 4157; CM-NEXT: ALU clause starting at 154: 4158; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 
4159; CM-NEXT: AND_INT * T0.W, T13.X, literal.y, 4160; CM-NEXT: -65536(nan), 65535(9.183409e-41) 4161; CM-NEXT: OR_INT * T11.Z, PV.Z, PV.W, 4162; CM-NEXT: MOV T6.X, PV.Z, 4163; CM-NEXT: MOV * T0.Y, T8.X, 4164; CM-NEXT: ALU clause starting at 160: 4165; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 4166; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 4167; CM-NEXT: LSHR * T13.X, PV.W, literal.x, 4168; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4169; CM-NEXT: LSHR T14.X, KC0[2].Y, literal.x, 4170; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y, 4171; CM-NEXT: AND_INT * T0.W, T11.X, literal.z, 4172; CM-NEXT: 2(2.802597e-45), -65536(nan) 4173; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 4174; CM-NEXT: OR_INT * T11.X, PV.Z, PV.W, 4175; CM-NEXT: MOV T8.X, PV.X, 4176; CM-NEXT: MOV * T12.W, T3.X, 4177; CM-NEXT: MOV T12.Y, T5.X, 4178; CM-NEXT: MOV * T11.W, T7.X, BS:VEC_120/SCL_212 4179; CM-NEXT: MOV * T11.Y, T9.X, 4180entry: 4181 store <16 x i16> %in, <16 x i16> addrspace(1)* %out 4182 ret void 4183} 4184 4185define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind { 4186; SI-LABEL: v16i32_arg: 4187; SI: ; %bb.0: ; %entry 4188; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 4189; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4190; SI-NEXT: s_mov_b32 s3, 0xf000 4191; SI-NEXT: s_mov_b32 s2, -1 4192; SI-NEXT: s_waitcnt lgkmcnt(0) 4193; SI-NEXT: v_mov_b32_e32 v0, s16 4194; SI-NEXT: v_mov_b32_e32 v1, s17 4195; SI-NEXT: v_mov_b32_e32 v2, s18 4196; SI-NEXT: v_mov_b32_e32 v3, s19 4197; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 4198; SI-NEXT: s_waitcnt expcnt(0) 4199; SI-NEXT: v_mov_b32_e32 v0, s12 4200; SI-NEXT: v_mov_b32_e32 v1, s13 4201; SI-NEXT: v_mov_b32_e32 v2, s14 4202; SI-NEXT: v_mov_b32_e32 v3, s15 4203; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 4204; SI-NEXT: s_waitcnt expcnt(0) 4205; SI-NEXT: v_mov_b32_e32 v0, s8 4206; SI-NEXT: v_mov_b32_e32 v1, s9 4207; SI-NEXT: v_mov_b32_e32 v2, s10 4208; 
SI-NEXT: v_mov_b32_e32 v3, s11 4209; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 4210; SI-NEXT: s_waitcnt expcnt(0) 4211; SI-NEXT: v_mov_b32_e32 v0, s4 4212; SI-NEXT: v_mov_b32_e32 v1, s5 4213; SI-NEXT: v_mov_b32_e32 v2, s6 4214; SI-NEXT: v_mov_b32_e32 v3, s7 4215; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 4216; SI-NEXT: s_endpgm 4217; 4218; VI-LABEL: v16i32_arg: 4219; VI: ; %bb.0: ; %entry 4220; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 4221; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4222; VI-NEXT: s_waitcnt lgkmcnt(0) 4223; VI-NEXT: v_mov_b32_e32 v0, s16 4224; VI-NEXT: s_add_u32 s2, s0, 48 4225; VI-NEXT: s_addc_u32 s3, s1, 0 4226; VI-NEXT: v_mov_b32_e32 v5, s3 4227; VI-NEXT: v_mov_b32_e32 v4, s2 4228; VI-NEXT: s_add_u32 s2, s0, 32 4229; VI-NEXT: v_mov_b32_e32 v1, s17 4230; VI-NEXT: v_mov_b32_e32 v2, s18 4231; VI-NEXT: v_mov_b32_e32 v3, s19 4232; VI-NEXT: s_addc_u32 s3, s1, 0 4233; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4234; VI-NEXT: v_mov_b32_e32 v5, s3 4235; VI-NEXT: v_mov_b32_e32 v4, s2 4236; VI-NEXT: s_add_u32 s2, s0, 16 4237; VI-NEXT: v_mov_b32_e32 v0, s12 4238; VI-NEXT: v_mov_b32_e32 v1, s13 4239; VI-NEXT: v_mov_b32_e32 v2, s14 4240; VI-NEXT: v_mov_b32_e32 v3, s15 4241; VI-NEXT: s_addc_u32 s3, s1, 0 4242; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4243; VI-NEXT: v_mov_b32_e32 v5, s3 4244; VI-NEXT: v_mov_b32_e32 v0, s8 4245; VI-NEXT: v_mov_b32_e32 v1, s9 4246; VI-NEXT: v_mov_b32_e32 v2, s10 4247; VI-NEXT: v_mov_b32_e32 v3, s11 4248; VI-NEXT: v_mov_b32_e32 v4, s2 4249; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4250; VI-NEXT: v_mov_b32_e32 v5, s1 4251; VI-NEXT: v_mov_b32_e32 v0, s4 4252; VI-NEXT: v_mov_b32_e32 v1, s5 4253; VI-NEXT: v_mov_b32_e32 v2, s6 4254; VI-NEXT: v_mov_b32_e32 v3, s7 4255; VI-NEXT: v_mov_b32_e32 v4, s0 4256; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4257; VI-NEXT: s_endpgm 4258; 4259; GFX9-LABEL: v16i32_arg: 4260; GFX9: ; %bb.0: ; %entry 4261; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 
4262; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4263; GFX9-NEXT: v_mov_b32_e32 v4, 0 4264; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4265; GFX9-NEXT: v_mov_b32_e32 v0, s20 4266; GFX9-NEXT: v_mov_b32_e32 v1, s21 4267; GFX9-NEXT: v_mov_b32_e32 v2, s22 4268; GFX9-NEXT: v_mov_b32_e32 v3, s23 4269; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 4270; GFX9-NEXT: s_nop 0 4271; GFX9-NEXT: v_mov_b32_e32 v0, s16 4272; GFX9-NEXT: v_mov_b32_e32 v1, s17 4273; GFX9-NEXT: v_mov_b32_e32 v2, s18 4274; GFX9-NEXT: v_mov_b32_e32 v3, s19 4275; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 4276; GFX9-NEXT: s_nop 0 4277; GFX9-NEXT: v_mov_b32_e32 v0, s12 4278; GFX9-NEXT: v_mov_b32_e32 v1, s13 4279; GFX9-NEXT: v_mov_b32_e32 v2, s14 4280; GFX9-NEXT: v_mov_b32_e32 v3, s15 4281; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 4282; GFX9-NEXT: s_nop 0 4283; GFX9-NEXT: v_mov_b32_e32 v0, s8 4284; GFX9-NEXT: v_mov_b32_e32 v1, s9 4285; GFX9-NEXT: v_mov_b32_e32 v2, s10 4286; GFX9-NEXT: v_mov_b32_e32 v3, s11 4287; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 4288; GFX9-NEXT: s_endpgm 4289; 4290; EG-LABEL: v16i32_arg: 4291; EG: ; %bb.0: ; %entry 4292; EG-NEXT: ALU 29, @6, KC0[CB0:0-32], KC1[] 4293; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0 4294; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0 4295; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T4.X, 0 4296; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 4297; EG-NEXT: CF_END 4298; EG-NEXT: ALU clause starting at 6: 4299; EG-NEXT: MOV * T0.W, KC0[7].X, 4300; EG-NEXT: MOV * T0.Z, KC0[6].W, 4301; EG-NEXT: MOV T0.Y, KC0[6].Z, 4302; EG-NEXT: MOV * T1.W, KC0[8].X, 4303; EG-NEXT: MOV T0.X, KC0[6].Y, 4304; EG-NEXT: MOV * T1.Z, KC0[7].W, 4305; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 4306; EG-NEXT: MOV * T1.Y, KC0[7].Z, 4307; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4308; EG-NEXT: MOV * T3.W, KC0[9].X, 4309; EG-NEXT: MOV T1.X, KC0[7].Y, 4310; EG-NEXT: MOV * T3.Z, KC0[8].W, 4311; EG-NEXT: 
ADD_INT * T2.W, KC0[2].Y, literal.x, 4312; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 4313; EG-NEXT: LSHR T4.X, PV.W, literal.x, 4314; EG-NEXT: MOV T3.Y, KC0[8].Z, 4315; EG-NEXT: MOV * T5.W, KC0[10].X, 4316; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4317; EG-NEXT: MOV T3.X, KC0[8].Y, 4318; EG-NEXT: MOV * T5.Z, KC0[9].W, 4319; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4320; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 4321; EG-NEXT: LSHR T6.X, PV.W, literal.x, 4322; EG-NEXT: MOV T5.Y, KC0[9].Z, 4323; EG-NEXT: MOV * T5.X, KC0[9].Y, 4324; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4325; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4326; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) 4327; EG-NEXT: LSHR * T7.X, PV.W, literal.x, 4328; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4329; 4330; CM-LABEL: v16i32_arg: 4331; CM: ; %bb.0: ; %entry 4332; CM-NEXT: ALU 28, @6, KC0[CB0:0-32], KC1[] 4333; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T7.X 4334; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T6.X 4335; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X 4336; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X 4337; CM-NEXT: CF_END 4338; CM-NEXT: ALU clause starting at 6: 4339; CM-NEXT: MOV * T0.W, KC0[10].X, 4340; CM-NEXT: MOV * T0.Z, KC0[9].W, 4341; CM-NEXT: MOV * T0.Y, KC0[9].Z, 4342; CM-NEXT: MOV T0.X, KC0[9].Y, 4343; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x, 4344; CM-NEXT: MOV * T2.W, KC0[9].X, 4345; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00) 4346; CM-NEXT: MOV T2.Z, KC0[8].W, 4347; CM-NEXT: MOV * T1.W, KC0[8].X, 4348; CM-NEXT: LSHR T3.X, T1.Z, literal.x, 4349; CM-NEXT: MOV T2.Y, KC0[8].Z, 4350; CM-NEXT: MOV * T1.Z, KC0[7].W, 4351; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4352; CM-NEXT: MOV T2.X, KC0[8].Y, 4353; CM-NEXT: MOV * T1.Y, KC0[7].Z, 4354; CM-NEXT: MOV T1.X, KC0[7].Y, 4355; CM-NEXT: ADD_INT T3.Z, KC0[2].Y, literal.x, 4356; CM-NEXT: MOV * T4.W, KC0[7].X, 4357; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00) 4358; CM-NEXT: LSHR T5.X, PV.Z, literal.x, 4359; CM-NEXT: MOV T4.Z, 
KC0[6].W, 4360; CM-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, 4361; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) 4362; CM-NEXT: LSHR T6.X, PV.W, literal.x, 4363; CM-NEXT: MOV * T4.Y, KC0[6].Z, 4364; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4365; CM-NEXT: MOV * T4.X, KC0[6].Y, 4366; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 4367; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4368entry: 4369 store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4 4370 ret void 4371} 4372 4373define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind { 4374; SI-LABEL: v16f32_arg: 4375; SI: ; %bb.0: ; %entry 4376; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 4377; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4378; SI-NEXT: s_mov_b32 s3, 0xf000 4379; SI-NEXT: s_mov_b32 s2, -1 4380; SI-NEXT: s_waitcnt lgkmcnt(0) 4381; SI-NEXT: v_mov_b32_e32 v0, s16 4382; SI-NEXT: v_mov_b32_e32 v1, s17 4383; SI-NEXT: v_mov_b32_e32 v2, s18 4384; SI-NEXT: v_mov_b32_e32 v3, s19 4385; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 4386; SI-NEXT: s_waitcnt expcnt(0) 4387; SI-NEXT: v_mov_b32_e32 v0, s12 4388; SI-NEXT: v_mov_b32_e32 v1, s13 4389; SI-NEXT: v_mov_b32_e32 v2, s14 4390; SI-NEXT: v_mov_b32_e32 v3, s15 4391; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 4392; SI-NEXT: s_waitcnt expcnt(0) 4393; SI-NEXT: v_mov_b32_e32 v0, s8 4394; SI-NEXT: v_mov_b32_e32 v1, s9 4395; SI-NEXT: v_mov_b32_e32 v2, s10 4396; SI-NEXT: v_mov_b32_e32 v3, s11 4397; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 4398; SI-NEXT: s_waitcnt expcnt(0) 4399; SI-NEXT: v_mov_b32_e32 v0, s4 4400; SI-NEXT: v_mov_b32_e32 v1, s5 4401; SI-NEXT: v_mov_b32_e32 v2, s6 4402; SI-NEXT: v_mov_b32_e32 v3, s7 4403; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 4404; SI-NEXT: s_endpgm 4405; 4406; VI-LABEL: v16f32_arg: 4407; VI: ; %bb.0: ; %entry 4408; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 4409; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 
4410; VI-NEXT: s_waitcnt lgkmcnt(0) 4411; VI-NEXT: v_mov_b32_e32 v0, s16 4412; VI-NEXT: s_add_u32 s2, s0, 48 4413; VI-NEXT: s_addc_u32 s3, s1, 0 4414; VI-NEXT: v_mov_b32_e32 v5, s3 4415; VI-NEXT: v_mov_b32_e32 v4, s2 4416; VI-NEXT: s_add_u32 s2, s0, 32 4417; VI-NEXT: v_mov_b32_e32 v1, s17 4418; VI-NEXT: v_mov_b32_e32 v2, s18 4419; VI-NEXT: v_mov_b32_e32 v3, s19 4420; VI-NEXT: s_addc_u32 s3, s1, 0 4421; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4422; VI-NEXT: v_mov_b32_e32 v5, s3 4423; VI-NEXT: v_mov_b32_e32 v4, s2 4424; VI-NEXT: s_add_u32 s2, s0, 16 4425; VI-NEXT: v_mov_b32_e32 v0, s12 4426; VI-NEXT: v_mov_b32_e32 v1, s13 4427; VI-NEXT: v_mov_b32_e32 v2, s14 4428; VI-NEXT: v_mov_b32_e32 v3, s15 4429; VI-NEXT: s_addc_u32 s3, s1, 0 4430; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4431; VI-NEXT: v_mov_b32_e32 v5, s3 4432; VI-NEXT: v_mov_b32_e32 v0, s8 4433; VI-NEXT: v_mov_b32_e32 v1, s9 4434; VI-NEXT: v_mov_b32_e32 v2, s10 4435; VI-NEXT: v_mov_b32_e32 v3, s11 4436; VI-NEXT: v_mov_b32_e32 v4, s2 4437; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4438; VI-NEXT: v_mov_b32_e32 v5, s1 4439; VI-NEXT: v_mov_b32_e32 v0, s4 4440; VI-NEXT: v_mov_b32_e32 v1, s5 4441; VI-NEXT: v_mov_b32_e32 v2, s6 4442; VI-NEXT: v_mov_b32_e32 v3, s7 4443; VI-NEXT: v_mov_b32_e32 v4, s0 4444; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4445; VI-NEXT: s_endpgm 4446; 4447; GFX9-LABEL: v16f32_arg: 4448; GFX9: ; %bb.0: ; %entry 4449; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 4450; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4451; GFX9-NEXT: v_mov_b32_e32 v4, 0 4452; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4453; GFX9-NEXT: v_mov_b32_e32 v0, s20 4454; GFX9-NEXT: v_mov_b32_e32 v1, s21 4455; GFX9-NEXT: v_mov_b32_e32 v2, s22 4456; GFX9-NEXT: v_mov_b32_e32 v3, s23 4457; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 4458; GFX9-NEXT: s_nop 0 4459; GFX9-NEXT: v_mov_b32_e32 v0, s16 4460; GFX9-NEXT: v_mov_b32_e32 v1, s17 4461; GFX9-NEXT: v_mov_b32_e32 v2, s18 4462; GFX9-NEXT: v_mov_b32_e32 v3, s19 
4463; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 4464; GFX9-NEXT: s_nop 0 4465; GFX9-NEXT: v_mov_b32_e32 v0, s12 4466; GFX9-NEXT: v_mov_b32_e32 v1, s13 4467; GFX9-NEXT: v_mov_b32_e32 v2, s14 4468; GFX9-NEXT: v_mov_b32_e32 v3, s15 4469; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 4470; GFX9-NEXT: s_nop 0 4471; GFX9-NEXT: v_mov_b32_e32 v0, s8 4472; GFX9-NEXT: v_mov_b32_e32 v1, s9 4473; GFX9-NEXT: v_mov_b32_e32 v2, s10 4474; GFX9-NEXT: v_mov_b32_e32 v3, s11 4475; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 4476; GFX9-NEXT: s_endpgm 4477; 4478; EG-LABEL: v16f32_arg: 4479; EG: ; %bb.0: ; %entry 4480; EG-NEXT: ALU 29, @6, KC0[CB0:0-32], KC1[] 4481; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0 4482; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0 4483; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T4.X, 0 4484; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 4485; EG-NEXT: CF_END 4486; EG-NEXT: ALU clause starting at 6: 4487; EG-NEXT: MOV * T0.W, KC0[7].X, 4488; EG-NEXT: MOV * T0.Z, KC0[6].W, 4489; EG-NEXT: MOV T0.Y, KC0[6].Z, 4490; EG-NEXT: MOV * T1.W, KC0[8].X, 4491; EG-NEXT: MOV T0.X, KC0[6].Y, 4492; EG-NEXT: MOV * T1.Z, KC0[7].W, 4493; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 4494; EG-NEXT: MOV * T1.Y, KC0[7].Z, 4495; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4496; EG-NEXT: MOV * T3.W, KC0[9].X, 4497; EG-NEXT: MOV T1.X, KC0[7].Y, 4498; EG-NEXT: MOV * T3.Z, KC0[8].W, 4499; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4500; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 4501; EG-NEXT: LSHR T4.X, PV.W, literal.x, 4502; EG-NEXT: MOV T3.Y, KC0[8].Z, 4503; EG-NEXT: MOV * T5.W, KC0[10].X, 4504; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4505; EG-NEXT: MOV T3.X, KC0[8].Y, 4506; EG-NEXT: MOV * T5.Z, KC0[9].W, 4507; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4508; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 4509; EG-NEXT: LSHR T6.X, PV.W, literal.x, 4510; EG-NEXT: MOV T5.Y, KC0[9].Z, 4511; EG-NEXT: MOV * T5.X, 
KC0[9].Y, 4512; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4513; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4514; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) 4515; EG-NEXT: LSHR * T7.X, PV.W, literal.x, 4516; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4517; 4518; CM-LABEL: v16f32_arg: 4519; CM: ; %bb.0: ; %entry 4520; CM-NEXT: ALU 28, @6, KC0[CB0:0-32], KC1[] 4521; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T7.X 4522; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T6.X 4523; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X 4524; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X 4525; CM-NEXT: CF_END 4526; CM-NEXT: ALU clause starting at 6: 4527; CM-NEXT: MOV * T0.W, KC0[10].X, 4528; CM-NEXT: MOV * T0.Z, KC0[9].W, 4529; CM-NEXT: MOV * T0.Y, KC0[9].Z, 4530; CM-NEXT: MOV T0.X, KC0[9].Y, 4531; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x, 4532; CM-NEXT: MOV * T2.W, KC0[9].X, 4533; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00) 4534; CM-NEXT: MOV T2.Z, KC0[8].W, 4535; CM-NEXT: MOV * T1.W, KC0[8].X, 4536; CM-NEXT: LSHR T3.X, T1.Z, literal.x, 4537; CM-NEXT: MOV T2.Y, KC0[8].Z, 4538; CM-NEXT: MOV * T1.Z, KC0[7].W, 4539; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4540; CM-NEXT: MOV T2.X, KC0[8].Y, 4541; CM-NEXT: MOV * T1.Y, KC0[7].Z, 4542; CM-NEXT: MOV T1.X, KC0[7].Y, 4543; CM-NEXT: ADD_INT T3.Z, KC0[2].Y, literal.x, 4544; CM-NEXT: MOV * T4.W, KC0[7].X, 4545; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00) 4546; CM-NEXT: LSHR T5.X, PV.Z, literal.x, 4547; CM-NEXT: MOV T4.Z, KC0[6].W, 4548; CM-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, 4549; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) 4550; CM-NEXT: LSHR T6.X, PV.W, literal.x, 4551; CM-NEXT: MOV * T4.Y, KC0[6].Z, 4552; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4553; CM-NEXT: MOV * T4.X, KC0[6].Y, 4554; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 4555; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4556entry: 4557 store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4 4558 ret void 4559} 4560 4561define amdgpu_kernel void @kernel_arg_i64(i64 
addrspace(1)* %out, i64 %a) nounwind { 4562; SI-LABEL: kernel_arg_i64: 4563; SI: ; %bb.0: 4564; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 4565; SI-NEXT: s_mov_b32 s7, 0xf000 4566; SI-NEXT: s_mov_b32 s6, -1 4567; SI-NEXT: s_waitcnt lgkmcnt(0) 4568; SI-NEXT: s_mov_b32 s4, s0 4569; SI-NEXT: s_mov_b32 s5, s1 4570; SI-NEXT: v_mov_b32_e32 v0, s2 4571; SI-NEXT: v_mov_b32_e32 v1, s3 4572; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4573; SI-NEXT: s_endpgm 4574; 4575; VI-LABEL: kernel_arg_i64: 4576; VI: ; %bb.0: 4577; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 4578; VI-NEXT: s_waitcnt lgkmcnt(0) 4579; VI-NEXT: v_mov_b32_e32 v0, s0 4580; VI-NEXT: v_mov_b32_e32 v1, s1 4581; VI-NEXT: v_mov_b32_e32 v2, s2 4582; VI-NEXT: v_mov_b32_e32 v3, s3 4583; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 4584; VI-NEXT: s_endpgm 4585; 4586; GFX9-LABEL: kernel_arg_i64: 4587; GFX9: ; %bb.0: 4588; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4589; GFX9-NEXT: v_mov_b32_e32 v2, 0 4590; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4591; GFX9-NEXT: v_mov_b32_e32 v0, s2 4592; GFX9-NEXT: v_mov_b32_e32 v1, s3 4593; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 4594; GFX9-NEXT: s_endpgm 4595; 4596; EG-LABEL: kernel_arg_i64: 4597; EG: ; %bb.0: 4598; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 4599; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 4600; EG-NEXT: CF_END 4601; EG-NEXT: PAD 4602; EG-NEXT: ALU clause starting at 4: 4603; EG-NEXT: MOV * T0.Y, KC0[3].X, 4604; EG-NEXT: MOV T0.X, KC0[2].W, 4605; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4606; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4607; 4608; CM-LABEL: kernel_arg_i64: 4609; CM: ; %bb.0: 4610; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 4611; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 4612; CM-NEXT: CF_END 4613; CM-NEXT: PAD 4614; CM-NEXT: ALU clause starting at 4: 4615; CM-NEXT: MOV * T0.Y, KC0[3].X, 4616; CM-NEXT: MOV * T0.X, KC0[2].W, 4617; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4618; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4619 
store i64 %a, i64 addrspace(1)* %out, align 8 4620 ret void 4621} 4622 4623define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) { 4624; SI-LABEL: f64_kernel_arg: 4625; SI: ; %bb.0: ; %entry 4626; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 4627; SI-NEXT: s_mov_b32 s7, 0xf000 4628; SI-NEXT: s_mov_b32 s6, -1 4629; SI-NEXT: s_waitcnt lgkmcnt(0) 4630; SI-NEXT: s_mov_b32 s4, s0 4631; SI-NEXT: s_mov_b32 s5, s1 4632; SI-NEXT: v_mov_b32_e32 v0, s2 4633; SI-NEXT: v_mov_b32_e32 v1, s3 4634; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4635; SI-NEXT: s_endpgm 4636; 4637; VI-LABEL: f64_kernel_arg: 4638; VI: ; %bb.0: ; %entry 4639; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 4640; VI-NEXT: s_waitcnt lgkmcnt(0) 4641; VI-NEXT: v_mov_b32_e32 v0, s0 4642; VI-NEXT: v_mov_b32_e32 v1, s1 4643; VI-NEXT: v_mov_b32_e32 v2, s2 4644; VI-NEXT: v_mov_b32_e32 v3, s3 4645; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 4646; VI-NEXT: s_endpgm 4647; 4648; GFX9-LABEL: f64_kernel_arg: 4649; GFX9: ; %bb.0: ; %entry 4650; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 4651; GFX9-NEXT: v_mov_b32_e32 v2, 0 4652; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4653; GFX9-NEXT: v_mov_b32_e32 v0, s2 4654; GFX9-NEXT: v_mov_b32_e32 v1, s3 4655; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 4656; GFX9-NEXT: s_endpgm 4657; 4658; EG-LABEL: f64_kernel_arg: 4659; EG: ; %bb.0: ; %entry 4660; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 4661; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 4662; EG-NEXT: CF_END 4663; EG-NEXT: PAD 4664; EG-NEXT: ALU clause starting at 4: 4665; EG-NEXT: MOV * T0.Y, KC0[3].X, 4666; EG-NEXT: MOV T0.X, KC0[2].W, 4667; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4668; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4669; 4670; CM-LABEL: f64_kernel_arg: 4671; CM: ; %bb.0: ; %entry 4672; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 4673; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 4674; CM-NEXT: CF_END 4675; CM-NEXT: PAD 4676; CM-NEXT: ALU clause starting at 4: 4677; 
CM-NEXT: MOV * T0.Y, KC0[3].X, 4678; CM-NEXT: MOV * T0.X, KC0[2].W, 4679; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4680; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4681entry: 4682 store double %in, double addrspace(1)* %out 4683 ret void 4684} 4685 4686; XFUNC-LABEL: {{^}}kernel_arg_v1i64: 4687; XGCN: s_load_dwordx2 4688; XGCN: s_load_dwordx2 4689; XGCN: buffer_store_dwordx2 4690; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { 4691; store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8 4692; ret void 4693; } 4694 4695define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind { 4696; SI-LABEL: i65_arg: 4697; SI: ; %bb.0: ; %entry 4698; SI-NEXT: s_load_dword s2, s[0:1], 0xd 4699; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 4700; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4701; SI-NEXT: s_mov_b32 s3, 0xf000 4702; SI-NEXT: s_waitcnt lgkmcnt(0) 4703; SI-NEXT: s_and_b32 s6, s2, 1 4704; SI-NEXT: s_mov_b32 s2, -1 4705; SI-NEXT: v_mov_b32_e32 v0, s4 4706; SI-NEXT: v_mov_b32_e32 v1, s5 4707; SI-NEXT: v_mov_b32_e32 v2, s6 4708; SI-NEXT: buffer_store_byte v2, off, s[0:3], 0 offset:8 4709; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4710; SI-NEXT: s_endpgm 4711; 4712; VI-LABEL: i65_arg: 4713; VI: ; %bb.0: ; %entry 4714; VI-NEXT: s_load_dword s4, s[0:1], 0x34 4715; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4716; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c 4717; VI-NEXT: s_waitcnt lgkmcnt(0) 4718; VI-NEXT: s_and_b32 s4, s4, 1 4719; VI-NEXT: v_mov_b32_e32 v0, s2 4720; VI-NEXT: v_mov_b32_e32 v1, s3 4721; VI-NEXT: s_add_u32 s2, s2, 8 4722; VI-NEXT: s_addc_u32 s3, s3, 0 4723; VI-NEXT: v_mov_b32_e32 v2, s2 4724; VI-NEXT: v_mov_b32_e32 v4, s4 4725; VI-NEXT: v_mov_b32_e32 v3, s3 4726; VI-NEXT: flat_store_byte v[2:3], v4 4727; VI-NEXT: v_mov_b32_e32 v3, s1 4728; VI-NEXT: v_mov_b32_e32 v2, s0 4729; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 4730; VI-NEXT: s_endpgm 4731; 4732; GFX9-LABEL: 
i65_arg: 4733; GFX9: ; %bb.0: ; %entry 4734; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 4735; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 4736; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 4737; GFX9-NEXT: v_mov_b32_e32 v2, 0 4738; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4739; GFX9-NEXT: s_and_b32 s4, s6, 1 4740; GFX9-NEXT: v_mov_b32_e32 v0, s0 4741; GFX9-NEXT: v_mov_b32_e32 v3, s4 4742; GFX9-NEXT: v_mov_b32_e32 v1, s1 4743; GFX9-NEXT: global_store_byte v2, v3, s[2:3] offset:8 4744; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 4745; GFX9-NEXT: s_endpgm 4746; 4747; EG-LABEL: i65_arg: 4748; EG: ; %bb.0: ; %entry 4749; EG-NEXT: ALU 20, @6, KC0[CB0:0-32], KC1[] 4750; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0 4751; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 4752; EG-NEXT: MEM_RAT MSKOR T1.XW, T0.X 4753; EG-NEXT: CF_END 4754; EG-NEXT: PAD 4755; EG-NEXT: ALU clause starting at 6: 4756; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 4757; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 4758; EG-NEXT: AND_INT * T1.W, PV.W, literal.x, 4759; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4760; EG-NEXT: LSHL T1.W, PV.W, literal.x, 4761; EG-NEXT: AND_INT * T2.W, KC0[3].Y, 1, 4762; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4763; EG-NEXT: LSHL T1.X, PS, PV.W, 4764; EG-NEXT: LSHL * T1.W, literal.x, PV.W, 4765; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 4766; EG-NEXT: MOV T1.Y, 0.0, 4767; EG-NEXT: MOV * T1.Z, 0.0, 4768; EG-NEXT: LSHR T0.X, T0.W, literal.x, 4769; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 4770; EG-NEXT: 2(2.802597e-45), 4(5.605194e-45) 4771; EG-NEXT: LSHR T2.X, PV.W, literal.x, 4772; EG-NEXT: MOV * T3.X, KC0[3].X, 4773; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4774; EG-NEXT: LSHR T4.X, KC0[2].Y, literal.x, 4775; EG-NEXT: MOV * T5.X, KC0[2].W, 4776; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4777; 4778; CM-LABEL: i65_arg: 4779; CM: ; %bb.0: ; %entry 4780; CM-NEXT: ALU 21, @6, KC0[CB0:0-32], KC1[] 4781; CM-NEXT: MEM_RAT MSKOR T1.XW, T5.X 4782; CM-NEXT: 
MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X 4783; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T0.X 4784; CM-NEXT: CF_END 4785; CM-NEXT: PAD 4786; CM-NEXT: ALU clause starting at 6: 4787; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 4788; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 4789; CM-NEXT: AND_INT * T1.W, PV.W, literal.x, 4790; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4791; CM-NEXT: LSHL T0.Z, PV.W, literal.x, 4792; CM-NEXT: AND_INT * T1.W, KC0[3].Y, 1, 4793; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4794; CM-NEXT: LSHL T1.X, PV.W, PV.Z, 4795; CM-NEXT: LSHL * T1.W, literal.x, PV.Z, 4796; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 4797; CM-NEXT: MOV T1.Y, 0.0, 4798; CM-NEXT: MOV * T1.Z, 0.0, 4799; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 4800; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4801; CM-NEXT: MOV T2.X, KC0[2].W, 4802; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4803; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00) 4804; CM-NEXT: LSHR * T3.X, PV.W, literal.x, 4805; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4806; CM-NEXT: MOV * T4.X, KC0[3].X, 4807; CM-NEXT: LSHR * T5.X, T0.W, literal.x, 4808; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4809entry: 4810 store i65 %in, i65 addrspace(1)* %out, align 4 4811 ret void 4812} 4813 4814define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind { 4815; SI-LABEL: i1_arg: 4816; SI: ; %bb.0: 4817; SI-NEXT: s_load_dword s2, s[0:1], 0xb 4818; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4819; SI-NEXT: s_mov_b32 s3, 0xf000 4820; SI-NEXT: s_waitcnt lgkmcnt(0) 4821; SI-NEXT: s_and_b32 s4, s2, 1 4822; SI-NEXT: s_mov_b32 s2, -1 4823; SI-NEXT: v_mov_b32_e32 v0, s4 4824; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 4825; SI-NEXT: s_endpgm 4826; 4827; VI-LABEL: i1_arg: 4828; VI: ; %bb.0: 4829; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 4830; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4831; VI-NEXT: s_waitcnt lgkmcnt(0) 4832; VI-NEXT: s_and_b32 s2, s2, 1 4833; VI-NEXT: v_mov_b32_e32 v0, s0 4834; VI-NEXT: v_mov_b32_e32 v1, 
s1 4835; VI-NEXT: v_mov_b32_e32 v2, s2 4836; VI-NEXT: flat_store_byte v[0:1], v2 4837; VI-NEXT: s_endpgm 4838; 4839; GFX9-LABEL: i1_arg: 4840; GFX9: ; %bb.0: 4841; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 4842; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4843; GFX9-NEXT: v_mov_b32_e32 v0, 0 4844; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4845; GFX9-NEXT: s_and_b32 s2, s2, 1 4846; GFX9-NEXT: v_mov_b32_e32 v1, s2 4847; GFX9-NEXT: global_store_byte v0, v1, s[0:1] 4848; GFX9-NEXT: s_endpgm 4849; 4850; EG-LABEL: i1_arg: 4851; EG: ; %bb.0: 4852; EG-NEXT: ALU 0, @8, KC0[], KC1[] 4853; EG-NEXT: TEX 0 @6 4854; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[] 4855; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 4856; EG-NEXT: CF_END 4857; EG-NEXT: PAD 4858; EG-NEXT: Fetch clause starting at 6: 4859; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 4860; EG-NEXT: ALU clause starting at 8: 4861; EG-NEXT: MOV * T0.X, 0.0, 4862; EG-NEXT: ALU clause starting at 9: 4863; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, 4864; EG-NEXT: AND_INT * T1.W, T0.X, 1, 4865; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4866; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 4867; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4868; EG-NEXT: LSHL T0.X, T1.W, PV.W, 4869; EG-NEXT: LSHL * T0.W, literal.x, PV.W, 4870; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 4871; EG-NEXT: MOV T0.Y, 0.0, 4872; EG-NEXT: MOV * T0.Z, 0.0, 4873; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4874; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4875; 4876; CM-LABEL: i1_arg: 4877; CM: ; %bb.0: 4878; CM-NEXT: ALU 0, @8, KC0[], KC1[] 4879; CM-NEXT: TEX 0 @6 4880; CM-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[] 4881; CM-NEXT: MEM_RAT MSKOR T0.XW, T1.X 4882; CM-NEXT: CF_END 4883; CM-NEXT: PAD 4884; CM-NEXT: Fetch clause starting at 6: 4885; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 4886; CM-NEXT: ALU clause starting at 8: 4887; CM-NEXT: MOV * T0.X, 0.0, 4888; CM-NEXT: ALU clause starting at 9: 4889; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x, 4890; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4891; 
CM-NEXT: AND_INT T0.Z, T0.X, 1, 4892; CM-NEXT: LSHL * T0.W, PV.W, literal.x, 4893; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4894; CM-NEXT: LSHL T0.X, PV.Z, PV.W, 4895; CM-NEXT: LSHL * T0.W, literal.x, PV.W, 4896; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 4897; CM-NEXT: MOV T0.Y, 0.0, 4898; CM-NEXT: MOV * T0.Z, 0.0, 4899; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4900; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4901 store i1 %x, i1 addrspace(1)* %out, align 1 4902 ret void 4903} 4904 4905define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { 4906; SI-LABEL: i1_arg_zext_i32: 4907; SI: ; %bb.0: 4908; SI-NEXT: s_load_dword s2, s[0:1], 0xb 4909; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4910; SI-NEXT: s_mov_b32 s3, 0xf000 4911; SI-NEXT: s_waitcnt lgkmcnt(0) 4912; SI-NEXT: s_and_b32 s4, s2, 1 4913; SI-NEXT: s_mov_b32 s2, -1 4914; SI-NEXT: v_mov_b32_e32 v0, s4 4915; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 4916; SI-NEXT: s_endpgm 4917; 4918; VI-LABEL: i1_arg_zext_i32: 4919; VI: ; %bb.0: 4920; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 4921; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4922; VI-NEXT: s_waitcnt lgkmcnt(0) 4923; VI-NEXT: s_and_b32 s2, s2, 1 4924; VI-NEXT: v_mov_b32_e32 v0, s0 4925; VI-NEXT: v_mov_b32_e32 v1, s1 4926; VI-NEXT: v_mov_b32_e32 v2, s2 4927; VI-NEXT: flat_store_dword v[0:1], v2 4928; VI-NEXT: s_endpgm 4929; 4930; GFX9-LABEL: i1_arg_zext_i32: 4931; GFX9: ; %bb.0: 4932; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 4933; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 4934; GFX9-NEXT: v_mov_b32_e32 v0, 0 4935; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4936; GFX9-NEXT: s_and_b32 s2, s2, 1 4937; GFX9-NEXT: v_mov_b32_e32 v1, s2 4938; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 4939; GFX9-NEXT: s_endpgm 4940; 4941; EG-LABEL: i1_arg_zext_i32: 4942; EG: ; %bb.0: 4943; EG-NEXT: ALU 0, @8, KC0[], KC1[] 4944; EG-NEXT: TEX 0 @6 4945; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[] 4946; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 4947; 
EG-NEXT: CF_END 4948; EG-NEXT: PAD 4949; EG-NEXT: Fetch clause starting at 6: 4950; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 4951; EG-NEXT: ALU clause starting at 8: 4952; EG-NEXT: MOV * T0.X, 0.0, 4953; EG-NEXT: ALU clause starting at 9: 4954; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4955; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4956; 4957; CM-LABEL: i1_arg_zext_i32: 4958; CM: ; %bb.0: 4959; CM-NEXT: ALU 0, @8, KC0[], KC1[] 4960; CM-NEXT: TEX 0 @6 4961; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[] 4962; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 4963; CM-NEXT: CF_END 4964; CM-NEXT: PAD 4965; CM-NEXT: Fetch clause starting at 6: 4966; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 4967; CM-NEXT: ALU clause starting at 8: 4968; CM-NEXT: MOV * T0.X, 0.0, 4969; CM-NEXT: ALU clause starting at 9: 4970; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4971; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4972 %ext = zext i1 %x to i32 4973 store i32 %ext, i32 addrspace(1)* %out, align 4 4974 ret void 4975} 4976 4977define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { 4978; SI-LABEL: i1_arg_zext_i64: 4979; SI: ; %bb.0: 4980; SI-NEXT: s_load_dword s4, s[0:1], 0xb 4981; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4982; SI-NEXT: s_mov_b32 s3, 0xf000 4983; SI-NEXT: s_mov_b32 s2, -1 4984; SI-NEXT: s_waitcnt lgkmcnt(0) 4985; SI-NEXT: s_and_b32 s4, s4, 1 4986; SI-NEXT: v_mov_b32_e32 v1, 0 4987; SI-NEXT: v_mov_b32_e32 v0, s4 4988; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4989; SI-NEXT: s_endpgm 4990; 4991; VI-LABEL: i1_arg_zext_i64: 4992; VI: ; %bb.0: 4993; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 4994; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4995; VI-NEXT: v_mov_b32_e32 v1, 0 4996; VI-NEXT: s_waitcnt lgkmcnt(0) 4997; VI-NEXT: s_and_b32 s2, s2, 1 4998; VI-NEXT: v_mov_b32_e32 v3, s1 4999; VI-NEXT: v_mov_b32_e32 v0, s2 5000; VI-NEXT: v_mov_b32_e32 v2, s0 5001; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 5002; VI-NEXT: s_endpgm 5003; 5004; GFX9-LABEL: 
i1_arg_zext_i64: 5005; GFX9: ; %bb.0: 5006; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 5007; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5008; GFX9-NEXT: v_mov_b32_e32 v1, 0 5009; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5010; GFX9-NEXT: s_and_b32 s2, s2, 1 5011; GFX9-NEXT: v_mov_b32_e32 v0, s2 5012; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] 5013; GFX9-NEXT: s_endpgm 5014; 5015; EG-LABEL: i1_arg_zext_i64: 5016; EG: ; %bb.0: 5017; EG-NEXT: ALU 0, @8, KC0[], KC1[] 5018; EG-NEXT: TEX 0 @6 5019; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 5020; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 5021; EG-NEXT: CF_END 5022; EG-NEXT: PAD 5023; EG-NEXT: Fetch clause starting at 6: 5024; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 5025; EG-NEXT: ALU clause starting at 8: 5026; EG-NEXT: MOV * T0.X, 0.0, 5027; EG-NEXT: ALU clause starting at 9: 5028; EG-NEXT: MOV * T0.Y, 0.0, 5029; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 5030; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 5031; 5032; CM-LABEL: i1_arg_zext_i64: 5033; CM: ; %bb.0: 5034; CM-NEXT: ALU 0, @8, KC0[], KC1[] 5035; CM-NEXT: TEX 0 @6 5036; CM-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 5037; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 5038; CM-NEXT: CF_END 5039; CM-NEXT: PAD 5040; CM-NEXT: Fetch clause starting at 6: 5041; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 5042; CM-NEXT: ALU clause starting at 8: 5043; CM-NEXT: MOV * T0.X, 0.0, 5044; CM-NEXT: ALU clause starting at 9: 5045; CM-NEXT: MOV * T0.Y, 0.0, 5046; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 5047; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 5048 %ext = zext i1 %x to i64 5049 store i64 %ext, i64 addrspace(1)* %out, align 8 5050 ret void 5051} 5052 5053define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { 5054; SI-LABEL: i1_arg_sext_i32: 5055; SI: ; %bb.0: 5056; SI-NEXT: s_load_dword s2, s[0:1], 0xb 5057; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5058; SI-NEXT: s_mov_b32 s3, 0xf000 5059; SI-NEXT: s_waitcnt lgkmcnt(0) 5060; SI-NEXT: 
s_bfe_i32 s4, s2, 0x10000 5061; SI-NEXT: s_mov_b32 s2, -1 5062; SI-NEXT: v_mov_b32_e32 v0, s4 5063; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 5064; SI-NEXT: s_endpgm 5065; 5066; VI-LABEL: i1_arg_sext_i32: 5067; VI: ; %bb.0: 5068; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 5069; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5070; VI-NEXT: s_waitcnt lgkmcnt(0) 5071; VI-NEXT: s_bfe_i32 s2, s2, 0x10000 5072; VI-NEXT: v_mov_b32_e32 v0, s0 5073; VI-NEXT: v_mov_b32_e32 v1, s1 5074; VI-NEXT: v_mov_b32_e32 v2, s2 5075; VI-NEXT: flat_store_dword v[0:1], v2 5076; VI-NEXT: s_endpgm 5077; 5078; GFX9-LABEL: i1_arg_sext_i32: 5079; GFX9: ; %bb.0: 5080; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 5081; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 5082; GFX9-NEXT: v_mov_b32_e32 v0, 0 5083; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5084; GFX9-NEXT: s_bfe_i32 s2, s2, 0x10000 5085; GFX9-NEXT: v_mov_b32_e32 v1, s2 5086; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 5087; GFX9-NEXT: s_endpgm 5088; 5089; EG-LABEL: i1_arg_sext_i32: 5090; EG: ; %bb.0: 5091; EG-NEXT: ALU 0, @8, KC0[], KC1[] 5092; EG-NEXT: TEX 0 @6 5093; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 5094; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 5095; EG-NEXT: CF_END 5096; EG-NEXT: PAD 5097; EG-NEXT: Fetch clause starting at 6: 5098; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 5099; EG-NEXT: ALU clause starting at 8: 5100; EG-NEXT: MOV * T0.X, 0.0, 5101; EG-NEXT: ALU clause starting at 9: 5102; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, 1, 5103; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 5104; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 5105; 5106; CM-LABEL: i1_arg_sext_i32: 5107; CM: ; %bb.0: 5108; CM-NEXT: ALU 0, @8, KC0[], KC1[] 5109; CM-NEXT: TEX 0 @6 5110; CM-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 5111; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 5112; CM-NEXT: CF_END 5113; CM-NEXT: PAD 5114; CM-NEXT: Fetch clause starting at 6: 5115; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 5116; CM-NEXT: ALU clause starting at 8: 5117; CM-NEXT: MOV * T0.X, 
0.0, 5118; CM-NEXT: ALU clause starting at 9: 5119; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, 1, 5120; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 5121; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 5122 %ext = sext i1 %x to i32 5123 store i32 %ext, i32 addrspace(1)* %out, align 4 5124 ret void 5125} 5126 5127define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { 5128; SI-LABEL: i1_arg_sext_i64: 5129; SI: ; %bb.0: 5130; SI-NEXT: s_load_dword s2, s[0:1], 0xb 5131; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5132; SI-NEXT: s_mov_b32 s3, 0xf000 5133; SI-NEXT: s_waitcnt lgkmcnt(0) 5134; SI-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 5135; SI-NEXT: s_mov_b32 s2, -1 5136; SI-NEXT: v_mov_b32_e32 v0, s4 5137; SI-NEXT: v_mov_b32_e32 v1, s5 5138; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5139; SI-NEXT: s_endpgm 5140; 5141; VI-LABEL: i1_arg_sext_i64: 5142; VI: ; %bb.0: 5143; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 5144; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5145; VI-NEXT: s_waitcnt lgkmcnt(0) 5146; VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 5147; VI-NEXT: v_mov_b32_e32 v0, s0 5148; VI-NEXT: v_mov_b32_e32 v2, s2 5149; VI-NEXT: v_mov_b32_e32 v1, s1 5150; VI-NEXT: v_mov_b32_e32 v3, s3 5151; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 5152; VI-NEXT: s_endpgm 5153; 5154; GFX9-LABEL: i1_arg_sext_i64: 5155; GFX9: ; %bb.0: 5156; GFX9-NEXT: s_load_dword s0, s[4:5], 0x8 5157; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 5158; GFX9-NEXT: v_mov_b32_e32 v2, 0 5159; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5160; GFX9-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 5161; GFX9-NEXT: v_mov_b32_e32 v0, s0 5162; GFX9-NEXT: v_mov_b32_e32 v1, s1 5163; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 5164; GFX9-NEXT: s_endpgm 5165; 5166; EG-LABEL: i1_arg_sext_i64: 5167; EG: ; %bb.0: 5168; EG-NEXT: ALU 0, @8, KC0[], KC1[] 5169; EG-NEXT: TEX 0 @6 5170; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 5171; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 5172; EG-NEXT: CF_END 5173;
EG-NEXT: PAD 5174; EG-NEXT: Fetch clause starting at 6: 5175; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 5176; EG-NEXT: ALU clause starting at 8: 5177; EG-NEXT: MOV * T0.X, 0.0, 5178; EG-NEXT: ALU clause starting at 9: 5179; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, 1, 5180; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 5181; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 5182; EG-NEXT: MOV * T0.Y, PV.X, 5183; 5184; CM-LABEL: i1_arg_sext_i64: 5185; CM: ; %bb.0: 5186; CM-NEXT: ALU 0, @8, KC0[], KC1[] 5187; CM-NEXT: TEX 0 @6 5188; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 5189; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 5190; CM-NEXT: CF_END 5191; CM-NEXT: PAD 5192; CM-NEXT: Fetch clause starting at 6: 5193; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 5194; CM-NEXT: ALU clause starting at 8: 5195; CM-NEXT: MOV * T0.X, 0.0, 5196; CM-NEXT: ALU clause starting at 9: 5197; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, 1, 5198; CM-NEXT: LSHR T1.X, KC0[2].Y, literal.x, 5199; CM-NEXT: MOV * T0.Y, PV.X, 5200; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 5201 %ext = sext i1 %x to i64 5202 store i64 %ext, i64 addrspace(1)* %out, align 8 5203 ret void 5204} 5205 5206define amdgpu_kernel void @empty_struct_arg({} %in) nounwind { 5207; SI-LABEL: empty_struct_arg: 5208; SI: ; %bb.0: 5209; SI-NEXT: s_endpgm 5210; 5211; VI-LABEL: empty_struct_arg: 5212; VI: ; %bb.0: 5213; VI-NEXT: s_endpgm 5214; 5215; GFX9-LABEL: empty_struct_arg: 5216; GFX9: ; %bb.0: 5217; GFX9-NEXT: s_endpgm 5218; 5219; EGCM-LABEL: empty_struct_arg: 5220; EGCM: ; %bb.0: 5221; EGCM-NEXT: CF_END 5222; EGCM-NEXT: PAD 5223 ret void 5224} 5225 5226; The correct load offsets for these: 5227; load 4 from 0, 5228; load 8 from 8 5229; load 4 from 24 5230; load 8 from 32 5231 5232; With the SelectionDAG argument lowering, the alignments for the 5233; struct members are not properly considered, making these wrong.
; Kernel taking two unpacked {i32, i64} struct arguments separated by an
; unnamed i8. Both fields of each struct are extracted and written out with
; volatile stores, so the assertions below record the kernarg offset that each
; field is loaded from on every tested target.
; In the gfx900 assertions the four fields are loaded from offsets 0x0, 0x8,
; 0x18 and 0x20, i.e. the second struct begins on an 8-byte boundary after the
; i8 (presumably the i8 sits at offset 0x10; it is unused and no load of it
; appears in the expected output).
; FIXME: Total argument size is computed wrong
define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
; SI-LABEL: struct_argument_alignment:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s8, s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_load_dword s9, s[0:1], 0xf
; SI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x11
; SI-NEXT:    s_mov_b32 s0, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s1, s0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: struct_argument_alignment:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s4, s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    s_load_dword s5, s[0:1], 0x3c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s4
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s5
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: struct_argument_alignment:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX9-NEXT:    s_load_dword s7, s[4:5], 0x18
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x20
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v2, s6
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v3, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s0
; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v2, s7
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; EG-LABEL: struct_argument_alignment:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 9, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.X, T6.X, 0
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T6.X, 0
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T6.X, 0
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T4.X, 0
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T6.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    MOV T0.X, KC0[4].Y,
; EG-NEXT:    MOV * T1.X, KC0[4].Z,
; EG-NEXT:    MOV T2.X, KC0[3].W,
; EG-NEXT:    MOV * T3.X, KC0[2].W,
; EG-NEXT:    MOV T4.X, literal.x,
; EG-NEXT:    MOV * T5.X, KC0[3].X,
; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
; EG-NEXT:    MOV T6.X, literal.x,
; EG-NEXT:    MOV * T7.X, KC0[2].Y,
; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
;
; CM-LABEL: struct_argument_alignment:
; CM:       ; %bb.0:
; CM-NEXT:    ALU 9, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7.X, T6.X
; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5.X, T4.X
; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3.X, T6.X
; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T6.X
; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T4.X
; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T6.X
; CM-NEXT:    CF_END
; CM-NEXT:    ALU clause starting at 8:
; CM-NEXT:    MOV * T0.X, KC0[4].Y,
; CM-NEXT:    MOV * T1.X, KC0[4].Z,
; CM-NEXT:    MOV * T2.X, KC0[3].W,
; CM-NEXT:    MOV * T3.X, KC0[2].W,
; CM-NEXT:    MOV * T4.X, literal.x,
; CM-NEXT:    1(1.401298e-45), 0(0.000000e+00)
; CM-NEXT:    MOV * T5.X, KC0[3].X,
; CM-NEXT:    MOV * T6.X, literal.x,
; CM-NEXT:    0(0.000000e+00), 0(0.000000e+00)
; CM-NEXT:    MOV * T7.X, KC0[2].Y,
  ; Split each struct argument into its i32 (index 0) and i64 (index 1) field.
  %val0 = extractvalue {i32, i64} %arg0, 0
  %val1 = extractvalue {i32, i64} %arg0, 1
  %val2 = extractvalue {i32, i64} %arg1, 0
  %val3 = extractvalue {i32, i64} %arg1, 1
  ; One volatile store per field to a null global (addrspace 1) pointer; the
  ; expected output above contains a matching store for each of the four
  ; values, in this order. The destination is presumably just a dummy sink.
  store volatile i32 %val0, i32 addrspace(1)* null
  store volatile i64 %val1, i64 addrspace(1)* null
  store volatile i32 %val2, i32 addrspace(1)* null
  store volatile i64 %val3, i64 addrspace(1)* null
  ret void
}

; No padding between i8 and next struct, but round up at end to 4 byte
; multiple.
5370define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { 5371; SI-LABEL: packed_struct_argument_alignment: 5372; SI: ; %bb.0: 5373; SI-NEXT: s_mov_b32 s3, 0xf000 5374; SI-NEXT: s_mov_b32 s2, -1 5375; SI-NEXT: s_load_dword s6, s[0:1], 0x9 5376; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xa 5377; SI-NEXT: buffer_load_ubyte v4, off, s[0:3], 0 offset:49 5378; SI-NEXT: buffer_load_ubyte v5, off, s[0:3], 0 offset:50 5379; SI-NEXT: buffer_load_ubyte v6, off, s[0:3], 0 offset:51 5380; SI-NEXT: buffer_load_ubyte v7, off, s[0:3], 0 offset:52 5381; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 offset:53 5382; SI-NEXT: s_mov_b32 s0, 0 5383; SI-NEXT: s_mov_b32 s1, s0 5384; SI-NEXT: s_waitcnt lgkmcnt(0) 5385; SI-NEXT: v_mov_b32_e32 v2, s6 5386; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 5387; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5388; SI-NEXT: v_mov_b32_e32 v2, s4 5389; SI-NEXT: v_mov_b32_e32 v3, s5 5390; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 5391; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5392; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 5393; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 5394; SI-NEXT: v_or_b32_e32 v2, v2, v4 5395; SI-NEXT: v_or_b32_e32 v3, v3, v6 5396; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 5397; SI-NEXT: v_or_b32_e32 v2, v3, v2 5398; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 5399; SI-NEXT: s_waitcnt vmcnt(0) 5400; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5401; SI-NEXT: s_waitcnt vmcnt(0) 5402; SI-NEXT: s_endpgm 5403; 5404; VI-LABEL: packed_struct_argument_alignment: 5405; VI: ; %bb.0: 5406; VI-NEXT: s_add_u32 s2, s0, 49 5407; VI-NEXT: s_addc_u32 s3, s1, 0 5408; VI-NEXT: s_add_u32 s4, s0, 50 5409; VI-NEXT: s_addc_u32 s5, s1, 0 5410; VI-NEXT: v_mov_b32_e32 v2, s2 5411; VI-NEXT: v_mov_b32_e32 v3, s3 5412; VI-NEXT: s_add_u32 s2, s2, 3 5413; VI-NEXT: s_addc_u32 s3, s3, 0 5414; VI-NEXT: v_mov_b32_e32 v5, s3 5415; VI-NEXT: v_mov_b32_e32 v4, s2 5416; VI-NEXT: s_add_u32 s2, s0, 51 5417; VI-NEXT: 
s_addc_u32 s3, s1, 0 5418; VI-NEXT: v_mov_b32_e32 v0, s4 5419; VI-NEXT: v_mov_b32_e32 v7, s3 5420; VI-NEXT: v_mov_b32_e32 v1, s5 5421; VI-NEXT: v_mov_b32_e32 v6, s2 5422; VI-NEXT: flat_load_ubyte v8, v[0:1] 5423; VI-NEXT: flat_load_ubyte v9, v[2:3] 5424; VI-NEXT: flat_load_ubyte v10, v[4:5] 5425; VI-NEXT: flat_load_ubyte v6, v[6:7] 5426; VI-NEXT: s_add_u32 s2, s0, 53 5427; VI-NEXT: s_addc_u32 s3, s1, 0 5428; VI-NEXT: v_mov_b32_e32 v0, s2 5429; VI-NEXT: v_mov_b32_e32 v1, s3 5430; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 5431; VI-NEXT: s_load_dword s2, s[0:1], 0x24 5432; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x28 5433; VI-NEXT: v_mov_b32_e32 v2, 0 5434; VI-NEXT: v_mov_b32_e32 v3, 0 5435; VI-NEXT: s_waitcnt lgkmcnt(0) 5436; VI-NEXT: v_mov_b32_e32 v7, s2 5437; VI-NEXT: v_mov_b32_e32 v5, s1 5438; VI-NEXT: v_mov_b32_e32 v4, s0 5439; VI-NEXT: flat_store_dword v[2:3], v7 5440; VI-NEXT: s_waitcnt vmcnt(0) 5441; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5] 5442; VI-NEXT: s_waitcnt vmcnt(0) 5443; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 5444; VI-NEXT: v_or_b32_e32 v4, v4, v9 5445; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10 5446; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 5447; VI-NEXT: v_or_b32_e32 v4, v5, v4 5448; VI-NEXT: flat_store_dword v[2:3], v4 5449; VI-NEXT: s_waitcnt vmcnt(0) 5450; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 5451; VI-NEXT: s_waitcnt vmcnt(0) 5452; VI-NEXT: s_endpgm 5453; 5454; GFX9-LABEL: packed_struct_argument_alignment: 5455; GFX9: ; %bb.0: 5456; GFX9-NEXT: v_mov_b32_e32 v2, 0 5457; GFX9-NEXT: global_load_dword v6, v2, s[4:5] offset:13 5458; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:17 5459; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 5460; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4 5461; GFX9-NEXT: v_mov_b32_e32 v2, 0 5462; GFX9-NEXT: v_mov_b32_e32 v3, 0 5463; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5464; GFX9-NEXT: v_mov_b32_e32 v7, s2 5465; GFX9-NEXT: v_mov_b32_e32 v5, s1 5466; GFX9-NEXT: 
v_mov_b32_e32 v4, s0 5467; GFX9-NEXT: global_store_dword v[2:3], v7, off 5468; GFX9-NEXT: s_waitcnt vmcnt(0) 5469; GFX9-NEXT: global_store_dwordx2 v[2:3], v[4:5], off 5470; GFX9-NEXT: s_waitcnt vmcnt(0) 5471; GFX9-NEXT: global_store_dword v[2:3], v6, off 5472; GFX9-NEXT: s_waitcnt vmcnt(0) 5473; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off 5474; GFX9-NEXT: s_waitcnt vmcnt(0) 5475; GFX9-NEXT: s_endpgm 5476; 5477; EG-LABEL: packed_struct_argument_alignment: 5478; EG: ; %bb.0: 5479; EG-NEXT: ALU 6, @18, KC0[CB0:0-32], KC1[] 5480; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 0 5481; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T1.X, 0 5482; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0 5483; EG-NEXT: ALU 2, @25, KC0[], KC1[] 5484; EG-NEXT: TEX 0 @12 5485; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0 5486; EG-NEXT: TEX 0 @14 5487; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T1.X, 0 5488; EG-NEXT: TEX 0 @16 5489; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 1 5490; EG-NEXT: CF_END 5491; EG-NEXT: Fetch clause starting at 12: 5492; EG-NEXT: VTX_READ_32 T0.X, T0.X, 49, #3 5493; EG-NEXT: Fetch clause starting at 14: 5494; EG-NEXT: VTX_READ_32 T2.X, T2.X, 57, #3 5495; EG-NEXT: Fetch clause starting at 16: 5496; EG-NEXT: VTX_READ_32 T4.X, T4.X, 53, #3 5497; EG-NEXT: ALU clause starting at 18: 5498; EG-NEXT: MOV T0.X, KC0[2].Z, 5499; EG-NEXT: MOV * T1.X, literal.x, 5500; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) 5501; EG-NEXT: MOV T2.X, KC0[2].W, 5502; EG-NEXT: MOV * T3.X, literal.x, 5503; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5504; EG-NEXT: MOV * T4.X, KC0[2].Y, 5505; EG-NEXT: ALU clause starting at 25: 5506; EG-NEXT: MOV T0.X, 0.0, 5507; EG-NEXT: MOV * T2.X, 0.0, 5508; EG-NEXT: MOV * T4.X, 0.0, 5509; 5510; CM-LABEL: packed_struct_argument_alignment: 5511; CM: ; %bb.0: 5512; CM-NEXT: ALU 6, @18, KC0[CB0:0-32], KC1[] 5513; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X 5514; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X 5515; CM-NEXT: 
MEM_RAT_CACHELESS STORE_DWORD T0.X, T3.X 5516; CM-NEXT: ALU 2, @25, KC0[], KC1[] 5517; CM-NEXT: TEX 0 @12 5518; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T3.X 5519; CM-NEXT: TEX 0 @14 5520; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X 5521; CM-NEXT: TEX 0 @16 5522; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X 5523; CM-NEXT: CF_END 5524; CM-NEXT: Fetch clause starting at 12: 5525; CM-NEXT: VTX_READ_32 T0.X, T0.X, 49, #3 5526; CM-NEXT: Fetch clause starting at 14: 5527; CM-NEXT: VTX_READ_32 T2.X, T2.X, 57, #3 5528; CM-NEXT: Fetch clause starting at 16: 5529; CM-NEXT: VTX_READ_32 T4.X, T4.X, 53, #3 5530; CM-NEXT: ALU clause starting at 18: 5531; CM-NEXT: MOV * T0.X, KC0[2].Z, 5532; CM-NEXT: MOV * T1.X, literal.x, 5533; CM-NEXT: 1(1.401298e-45), 0(0.000000e+00) 5534; CM-NEXT: MOV * T2.X, KC0[2].W, 5535; CM-NEXT: MOV * T3.X, literal.x, 5536; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5537; CM-NEXT: MOV * T4.X, KC0[2].Y, 5538; CM-NEXT: ALU clause starting at 25: 5539; CM-NEXT: MOV * T0.X, 0.0, 5540; CM-NEXT: MOV * T2.X, 0.0, 5541; CM-NEXT: MOV * T4.X, 0.0, 5542 %val0 = extractvalue <{i32, i64}> %arg0, 0 5543 %val1 = extractvalue <{i32, i64}> %arg0, 1 5544 %val2 = extractvalue <{i32, i64}> %arg1, 0 5545 %val3 = extractvalue <{i32, i64}> %arg1, 1 5546 store volatile i32 %val0, i32 addrspace(1)* null 5547 store volatile i64 %val1, i64 addrspace(1)* null 5548 store volatile i32 %val2, i32 addrspace(1)* null 5549 store volatile i64 %val3, i64 addrspace(1)* null 5550 ret void 5551} 5552 5553define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) { 5554; SI-LABEL: struct_argument_alignment_after: 5555; SI: ; %bb.0: 5556; SI-NEXT: s_load_dword s12, s[0:1], 0x9 5557; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb 5558; SI-NEXT: s_load_dword s13, s[0:1], 0xf 5559; SI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x11 5560; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x15 5561; SI-NEXT: s_mov_b32 s4, 0 5562; SI-NEXT: 
s_mov_b32 s7, 0xf000 5563; SI-NEXT: s_mov_b32 s6, -1 5564; SI-NEXT: s_mov_b32 s5, s4 5565; SI-NEXT: s_waitcnt lgkmcnt(0) 5566; SI-NEXT: v_mov_b32_e32 v0, s12 5567; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 5568; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5569; SI-NEXT: v_mov_b32_e32 v0, s8 5570; SI-NEXT: v_mov_b32_e32 v1, s9 5571; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5572; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5573; SI-NEXT: v_mov_b32_e32 v0, s13 5574; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 5575; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5576; SI-NEXT: v_mov_b32_e32 v0, s10 5577; SI-NEXT: v_mov_b32_e32 v1, s11 5578; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5579; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5580; SI-NEXT: v_mov_b32_e32 v0, s0 5581; SI-NEXT: v_mov_b32_e32 v1, s1 5582; SI-NEXT: v_mov_b32_e32 v2, s2 5583; SI-NEXT: v_mov_b32_e32 v3, s3 5584; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5585; SI-NEXT: s_waitcnt vmcnt(0) 5586; SI-NEXT: s_endpgm 5587; 5588; VI-LABEL: struct_argument_alignment_after: 5589; VI: ; %bb.0: 5590; VI-NEXT: s_load_dword s8, s[0:1], 0x24 5591; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5592; VI-NEXT: s_load_dword s9, s[0:1], 0x3c 5593; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x44 5594; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 5595; VI-NEXT: v_mov_b32_e32 v4, 0 5596; VI-NEXT: v_mov_b32_e32 v5, 0 5597; VI-NEXT: s_waitcnt lgkmcnt(0) 5598; VI-NEXT: v_mov_b32_e32 v0, s8 5599; VI-NEXT: flat_store_dword v[4:5], v0 5600; VI-NEXT: s_waitcnt vmcnt(0) 5601; VI-NEXT: v_mov_b32_e32 v0, s4 5602; VI-NEXT: v_mov_b32_e32 v1, s5 5603; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] 5604; VI-NEXT: s_waitcnt vmcnt(0) 5605; VI-NEXT: v_mov_b32_e32 v0, s9 5606; VI-NEXT: flat_store_dword v[4:5], v0 5607; VI-NEXT: s_waitcnt vmcnt(0) 5608; VI-NEXT: v_mov_b32_e32 v0, s6 5609; VI-NEXT: v_mov_b32_e32 v1, s7 5610; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] 5611; VI-NEXT: s_waitcnt vmcnt(0) 5612; VI-NEXT: v_mov_b32_e32 v0, s0 
5613; VI-NEXT: v_mov_b32_e32 v1, s1 5614; VI-NEXT: v_mov_b32_e32 v2, s2 5615; VI-NEXT: v_mov_b32_e32 v3, s3 5616; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 5617; VI-NEXT: s_waitcnt vmcnt(0) 5618; VI-NEXT: s_endpgm 5619; 5620; GFX9-LABEL: struct_argument_alignment_after: 5621; GFX9: ; %bb.0: 5622; GFX9-NEXT: s_load_dword s10, s[4:5], 0x0 5623; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 5624; GFX9-NEXT: s_load_dword s11, s[4:5], 0x18 5625; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20 5626; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x30 5627; GFX9-NEXT: v_mov_b32_e32 v4, 0 5628; GFX9-NEXT: v_mov_b32_e32 v5, 0 5629; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5630; GFX9-NEXT: v_mov_b32_e32 v0, s10 5631; GFX9-NEXT: global_store_dword v[4:5], v0, off 5632; GFX9-NEXT: s_waitcnt vmcnt(0) 5633; GFX9-NEXT: v_mov_b32_e32 v0, s6 5634; GFX9-NEXT: v_mov_b32_e32 v1, s7 5635; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off 5636; GFX9-NEXT: s_waitcnt vmcnt(0) 5637; GFX9-NEXT: v_mov_b32_e32 v0, s11 5638; GFX9-NEXT: global_store_dword v[4:5], v0, off 5639; GFX9-NEXT: s_waitcnt vmcnt(0) 5640; GFX9-NEXT: v_mov_b32_e32 v0, s8 5641; GFX9-NEXT: v_mov_b32_e32 v1, s9 5642; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off 5643; GFX9-NEXT: s_waitcnt vmcnt(0) 5644; GFX9-NEXT: v_mov_b32_e32 v0, s0 5645; GFX9-NEXT: v_mov_b32_e32 v1, s1 5646; GFX9-NEXT: v_mov_b32_e32 v2, s2 5647; GFX9-NEXT: v_mov_b32_e32 v3, s3 5648; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off 5649; GFX9-NEXT: s_waitcnt vmcnt(0) 5650; GFX9-NEXT: s_endpgm 5651; 5652; EG-LABEL: struct_argument_alignment_after: 5653; EG: ; %bb.0: 5654; EG-NEXT: ALU 13, @10, KC0[CB0:0-32], KC1[] 5655; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.X, T7.X, 0 5656; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T5.X, 0 5657; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T7.X, 0 5658; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T7.X, 0 5659; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T5.X, 0 5660; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T7.X, 0 5661; 
EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T7.X, 1 5662; EG-NEXT: CF_END 5663; EG-NEXT: PAD 5664; EG-NEXT: ALU clause starting at 10: 5665; EG-NEXT: MOV * T0.W, KC0[6].X, 5666; EG-NEXT: MOV * T0.Z, KC0[5].W, 5667; EG-NEXT: MOV * T0.Y, KC0[5].Z, 5668; EG-NEXT: MOV T0.X, KC0[5].Y, 5669; EG-NEXT: MOV * T1.X, KC0[4].Y, 5670; EG-NEXT: MOV T2.X, KC0[4].Z, 5671; EG-NEXT: MOV * T3.X, KC0[3].W, 5672; EG-NEXT: MOV T4.X, KC0[2].W, 5673; EG-NEXT: MOV * T5.X, literal.x, 5674; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) 5675; EG-NEXT: MOV T6.X, KC0[3].X, 5676; EG-NEXT: MOV * T7.X, literal.x, 5677; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5678; EG-NEXT: MOV * T8.X, KC0[2].Y, 5679; 5680; CM-LABEL: struct_argument_alignment_after: 5681; CM: ; %bb.0: 5682; CM-NEXT: ALU 13, @10, KC0[CB0:0-32], KC1[] 5683; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T8.X, T7.X 5684; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T5.X 5685; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T7.X 5686; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T3.X, T7.X 5687; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T5.X 5688; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T7.X 5689; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T7.X 5690; CM-NEXT: CF_END 5691; CM-NEXT: PAD 5692; CM-NEXT: ALU clause starting at 10: 5693; CM-NEXT: MOV * T0.W, KC0[6].X, 5694; CM-NEXT: MOV * T0.Z, KC0[5].W, 5695; CM-NEXT: MOV * T0.Y, KC0[5].Z, 5696; CM-NEXT: MOV * T0.X, KC0[5].Y, 5697; CM-NEXT: MOV * T1.X, KC0[4].Y, 5698; CM-NEXT: MOV * T2.X, KC0[4].Z, 5699; CM-NEXT: MOV * T3.X, KC0[3].W, 5700; CM-NEXT: MOV * T4.X, KC0[2].W, 5701; CM-NEXT: MOV * T5.X, literal.x, 5702; CM-NEXT: 1(1.401298e-45), 0(0.000000e+00) 5703; CM-NEXT: MOV * T6.X, KC0[3].X, 5704; CM-NEXT: MOV * T7.X, literal.x, 5705; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5706; CM-NEXT: MOV * T8.X, KC0[2].Y, 5707 %val0 = extractvalue {i32, i64} %arg0, 0 5708 %val1 = extractvalue {i32, i64} %arg0, 1 5709 %val2 = extractvalue {i32, i64} %arg2, 0 5710 %val3 = extractvalue {i32, i64} %arg2, 1 5711 
store volatile i32 %val0, i32 addrspace(1)* null 5712 store volatile i64 %val1, i64 addrspace(1)* null 5713 store volatile i32 %val2, i32 addrspace(1)* null 5714 store volatile i64 %val3, i64 addrspace(1)* null 5715 store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null 5716 ret void 5717} 5718 5719define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { 5720; SI-LABEL: array_3xi32: 5721; SI: ; %bb.0: 5722; SI-NEXT: s_load_dword s4, s[0:1], 0xc 5723; SI-NEXT: s_load_dword s5, s[0:1], 0x9 5724; SI-NEXT: s_load_dword s6, s[0:1], 0xa 5725; SI-NEXT: s_load_dword s0, s[0:1], 0xb 5726; SI-NEXT: s_mov_b32 s3, 0xf000 5727; SI-NEXT: s_mov_b32 s2, -1 5728; SI-NEXT: s_waitcnt lgkmcnt(0) 5729; SI-NEXT: v_mov_b32_e32 v0, s5 5730; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 5731; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5732; SI-NEXT: v_mov_b32_e32 v0, s4 5733; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 5734; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5735; SI-NEXT: v_mov_b32_e32 v0, s0 5736; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 5737; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5738; SI-NEXT: v_mov_b32_e32 v0, s6 5739; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 5740; SI-NEXT: s_waitcnt vmcnt(0) 5741; SI-NEXT: s_endpgm 5742; 5743; VI-LABEL: array_3xi32: 5744; VI: ; %bb.0: 5745; VI-NEXT: s_load_dword s2, s[0:1], 0x24 5746; VI-NEXT: s_load_dword s3, s[0:1], 0x30 5747; VI-NEXT: s_load_dword s4, s[0:1], 0x28 5748; VI-NEXT: s_load_dword s0, s[0:1], 0x2c 5749; VI-NEXT: s_waitcnt lgkmcnt(0) 5750; VI-NEXT: v_mov_b32_e32 v0, s2 5751; VI-NEXT: v_mov_b32_e32 v1, s3 5752; VI-NEXT: flat_store_short v[0:1], v0 5753; VI-NEXT: s_waitcnt vmcnt(0) 5754; VI-NEXT: flat_store_dword v[0:1], v1 5755; VI-NEXT: s_waitcnt vmcnt(0) 5756; VI-NEXT: v_mov_b32_e32 v0, s0 5757; VI-NEXT: flat_store_dword v[0:1], v0 5758; VI-NEXT: s_waitcnt vmcnt(0) 5759; VI-NEXT: v_mov_b32_e32 v0, s4 5760; VI-NEXT: flat_store_dword v[0:1], v0 5761; VI-NEXT: s_waitcnt vmcnt(0) 5762; VI-NEXT: s_endpgm 
5763; 5764; GFX9-LABEL: array_3xi32: 5765; GFX9: ; %bb.0: 5766; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 5767; GFX9-NEXT: s_load_dword s1, s[4:5], 0xc 5768; GFX9-NEXT: s_load_dword s2, s[4:5], 0x4 5769; GFX9-NEXT: s_load_dword s3, s[4:5], 0x8 5770; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5771; GFX9-NEXT: v_mov_b32_e32 v0, s0 5772; GFX9-NEXT: v_mov_b32_e32 v1, s1 5773; GFX9-NEXT: global_store_short v[0:1], v0, off 5774; GFX9-NEXT: s_waitcnt vmcnt(0) 5775; GFX9-NEXT: global_store_dword v[0:1], v1, off 5776; GFX9-NEXT: s_waitcnt vmcnt(0) 5777; GFX9-NEXT: v_mov_b32_e32 v0, s3 5778; GFX9-NEXT: global_store_dword v[0:1], v0, off 5779; GFX9-NEXT: s_waitcnt vmcnt(0) 5780; GFX9-NEXT: v_mov_b32_e32 v0, s2 5781; GFX9-NEXT: global_store_dword v[0:1], v0, off 5782; GFX9-NEXT: s_waitcnt vmcnt(0) 5783; GFX9-NEXT: s_endpgm 5784; 5785; EG-LABEL: array_3xi32: 5786; EG: ; %bb.0: 5787; EG-NEXT: ALU 0, @10, KC0[], KC1[] 5788; EG-NEXT: TEX 0 @8 5789; EG-NEXT: ALU 9, @11, KC0[CB0:0-32], KC1[] 5790; EG-NEXT: MEM_RAT MSKOR T0.XW, T4.X 5791; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T4.X, 0 5792; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T4.X, 0 5793; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T4.X, 1 5794; EG-NEXT: CF_END 5795; EG-NEXT: Fetch clause starting at 8: 5796; EG-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3 5797; EG-NEXT: ALU clause starting at 10: 5798; EG-NEXT: MOV * T0.X, 0.0, 5799; EG-NEXT: ALU clause starting at 11: 5800; EG-NEXT: AND_INT T0.X, T0.X, literal.x, 5801; EG-NEXT: MOV * T0.W, literal.x, 5802; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 5803; EG-NEXT: MOV T0.Y, 0.0, 5804; EG-NEXT: MOV * T0.Z, 0.0, 5805; EG-NEXT: MOV T1.X, KC0[2].Z, 5806; EG-NEXT: MOV * T2.X, KC0[2].W, 5807; EG-NEXT: MOV T3.X, KC0[3].X, 5808; EG-NEXT: MOV * T4.X, literal.x, 5809; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5810; 5811; CM-LABEL: array_3xi32: 5812; CM: ; %bb.0: 5813; CM-NEXT: ALU 0, @10, KC0[], KC1[] 5814; CM-NEXT: TEX 0 @8 5815; CM-NEXT: ALU 9, @11, KC0[CB0:0-32], KC1[] 5816; CM-NEXT: MEM_RAT 
MSKOR T0.XW, T4.X 5817; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T3.X, T4.X 5818; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T4.X 5819; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T4.X 5820; CM-NEXT: CF_END 5821; CM-NEXT: Fetch clause starting at 8: 5822; CM-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3 5823; CM-NEXT: ALU clause starting at 10: 5824; CM-NEXT: MOV * T0.X, 0.0, 5825; CM-NEXT: ALU clause starting at 11: 5826; CM-NEXT: AND_INT T0.X, T0.X, literal.x, 5827; CM-NEXT: MOV * T0.W, literal.x, 5828; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 5829; CM-NEXT: MOV T0.Y, 0.0, 5830; CM-NEXT: MOV * T0.Z, 0.0, 5831; CM-NEXT: MOV * T1.X, KC0[2].Z, 5832; CM-NEXT: MOV * T2.X, KC0[2].W, 5833; CM-NEXT: MOV * T3.X, KC0[3].X, 5834; CM-NEXT: MOV * T4.X, literal.x, 5835; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5836 store volatile i16 %arg0, i16 addrspace(1)* undef 5837 store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef 5838 ret void 5839} 5840 5841; FIXME: Why not all scalar loads? 5842define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { 5843; SI-LABEL: array_3xi16: 5844; SI: ; %bb.0: 5845; SI-NEXT: s_load_dword s4, s[0:1], 0x9 5846; SI-NEXT: s_mov_b32 s3, 0xf000 5847; SI-NEXT: s_mov_b32 s2, -1 5848; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:42 5849; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:40 5850; SI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:38 5851; SI-NEXT: s_waitcnt lgkmcnt(0) 5852; SI-NEXT: v_mov_b32_e32 v3, s4 5853; SI-NEXT: buffer_store_byte v3, off, s[0:3], 0 5854; SI-NEXT: s_waitcnt vmcnt(0) 5855; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 5856; SI-NEXT: s_waitcnt vmcnt(0) 5857; SI-NEXT: buffer_store_short v1, off, s[0:3], 0 5858; SI-NEXT: s_waitcnt vmcnt(0) 5859; SI-NEXT: buffer_store_short v2, off, s[0:3], 0 5860; SI-NEXT: s_waitcnt vmcnt(0) 5861; SI-NEXT: s_endpgm 5862; 5863; VI-LABEL: array_3xi16: 5864; VI: ; %bb.0: 5865; VI-NEXT: s_add_u32 s2, s0, 38 5866; VI-NEXT: s_addc_u32 s3, s1, 0 5867; VI-NEXT: 
s_add_u32 s4, s2, 2 5868; VI-NEXT: s_addc_u32 s5, s3, 0 5869; VI-NEXT: v_mov_b32_e32 v0, s2 5870; VI-NEXT: v_mov_b32_e32 v1, s3 5871; VI-NEXT: s_add_u32 s2, s0, 42 5872; VI-NEXT: s_addc_u32 s3, s1, 0 5873; VI-NEXT: v_mov_b32_e32 v2, s2 5874; VI-NEXT: v_mov_b32_e32 v3, s3 5875; VI-NEXT: flat_load_ushort v4, v[0:1] 5876; VI-NEXT: flat_load_ushort v2, v[2:3] 5877; VI-NEXT: v_mov_b32_e32 v0, s4 5878; VI-NEXT: v_mov_b32_e32 v1, s5 5879; VI-NEXT: flat_load_ushort v0, v[0:1] 5880; VI-NEXT: s_load_dword s0, s[0:1], 0x24 5881; VI-NEXT: s_waitcnt lgkmcnt(0) 5882; VI-NEXT: v_mov_b32_e32 v1, s0 5883; VI-NEXT: s_waitcnt vmcnt(0) 5884; VI-NEXT: flat_store_byte v[0:1], v1 5885; VI-NEXT: s_waitcnt vmcnt(0) 5886; VI-NEXT: flat_store_short v[0:1], v2 5887; VI-NEXT: s_waitcnt vmcnt(0) 5888; VI-NEXT: flat_store_short v[0:1], v4 5889; VI-NEXT: s_waitcnt vmcnt(0) 5890; VI-NEXT: flat_store_short v[0:1], v0 5891; VI-NEXT: s_waitcnt vmcnt(0) 5892; VI-NEXT: s_endpgm 5893; 5894; GFX9-LABEL: array_3xi16: 5895; GFX9: ; %bb.0: 5896; GFX9-NEXT: v_mov_b32_e32 v0, 0 5897; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:6 5898; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] offset:4 5899; GFX9-NEXT: global_load_ushort v3, v0, s[4:5] offset:2 5900; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 5901; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5902; GFX9-NEXT: v_mov_b32_e32 v0, s0 5903; GFX9-NEXT: s_waitcnt vmcnt(2) 5904; GFX9-NEXT: global_store_byte v[0:1], v0, off 5905; GFX9-NEXT: s_waitcnt vmcnt(0) 5906; GFX9-NEXT: global_store_short v[0:1], v1, off 5907; GFX9-NEXT: s_waitcnt vmcnt(0) 5908; GFX9-NEXT: global_store_short v[0:1], v2, off 5909; GFX9-NEXT: s_waitcnt vmcnt(0) 5910; GFX9-NEXT: global_store_short v[0:1], v3, off 5911; GFX9-NEXT: s_waitcnt vmcnt(0) 5912; GFX9-NEXT: s_endpgm 5913; 5914; EG-LABEL: array_3xi16: 5915; EG: ; %bb.0: 5916; EG-NEXT: ALU 0, @20, KC0[], KC1[] 5917; EG-NEXT: TEX 1 @12 5918; EG-NEXT: ALU 11, @21, KC0[], KC1[] 5919; EG-NEXT: MEM_RAT MSKOR T1.XW, T3.X 5920; EG-NEXT: MEM_RAT MSKOR 
T2.XW, T3.X 5921; EG-NEXT: TEX 0 @16 5922; EG-NEXT: ALU 3, @33, KC0[], KC1[] 5923; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X 5924; EG-NEXT: TEX 0 @18 5925; EG-NEXT: ALU 3, @37, KC0[], KC1[] 5926; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X 5927; EG-NEXT: CF_END 5928; EG-NEXT: Fetch clause starting at 12: 5929; EG-NEXT: VTX_READ_8 T1.X, T0.X, 36, #3 5930; EG-NEXT: VTX_READ_16 T2.X, T0.X, 42, #3 5931; EG-NEXT: Fetch clause starting at 16: 5932; EG-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3 5933; EG-NEXT: Fetch clause starting at 18: 5934; EG-NEXT: VTX_READ_16 T0.X, T0.X, 38, #3 5935; EG-NEXT: ALU clause starting at 20: 5936; EG-NEXT: MOV * T0.X, 0.0, 5937; EG-NEXT: ALU clause starting at 21: 5938; EG-NEXT: AND_INT T1.X, T1.X, literal.x, 5939; EG-NEXT: MOV * T1.W, literal.x, 5940; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 5941; EG-NEXT: MOV * T1.Y, 0.0, 5942; EG-NEXT: AND_INT T2.X, T2.X, literal.x, 5943; EG-NEXT: MOV * T2.W, literal.x, 5944; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 5945; EG-NEXT: MOV T2.Y, 0.0, 5946; EG-NEXT: MOV T1.Z, 0.0, 5947; EG-NEXT: MOV * T2.Z, 0.0, 5948; EG-NEXT: MOV * T3.X, literal.x, 5949; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5950; EG-NEXT: ALU clause starting at 33: 5951; EG-NEXT: AND_INT T2.X, T1.X, literal.x, 5952; EG-NEXT: MOV T2.Y, 0.0, 5953; EG-NEXT: MOV * T2.Z, 0.0, 5954; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 5955; EG-NEXT: ALU clause starting at 37: 5956; EG-NEXT: AND_INT T2.X, T0.X, literal.x, 5957; EG-NEXT: MOV T2.Y, 0.0, 5958; EG-NEXT: MOV * T2.Z, 0.0, 5959; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 5960; 5961; CM-LABEL: array_3xi16: 5962; CM: ; %bb.0: 5963; CM-NEXT: ALU 0, @20, KC0[], KC1[] 5964; CM-NEXT: TEX 1 @12 5965; CM-NEXT: ALU 11, @21, KC0[], KC1[] 5966; CM-NEXT: MEM_RAT MSKOR T1.XW, T3.X 5967; CM-NEXT: MEM_RAT MSKOR T2.XW, T3.X 5968; CM-NEXT: TEX 0 @16 5969; CM-NEXT: ALU 3, @33, KC0[], KC1[] 5970; CM-NEXT: MEM_RAT MSKOR T2.XW, T3.X 5971; CM-NEXT: TEX 0 @18 5972; CM-NEXT: ALU 3, @37, KC0[], KC1[] 5973; CM-NEXT: MEM_RAT 
MSKOR T2.XW, T3.X 5974; CM-NEXT: CF_END 5975; CM-NEXT: Fetch clause starting at 12: 5976; CM-NEXT: VTX_READ_8 T1.X, T0.X, 36, #3 5977; CM-NEXT: VTX_READ_16 T2.X, T0.X, 42, #3 5978; CM-NEXT: Fetch clause starting at 16: 5979; CM-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3 5980; CM-NEXT: Fetch clause starting at 18: 5981; CM-NEXT: VTX_READ_16 T0.X, T0.X, 38, #3 5982; CM-NEXT: ALU clause starting at 20: 5983; CM-NEXT: MOV * T0.X, 0.0, 5984; CM-NEXT: ALU clause starting at 21: 5985; CM-NEXT: AND_INT T1.X, T1.X, literal.x, 5986; CM-NEXT: MOV * T1.W, literal.x, 5987; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 5988; CM-NEXT: MOV * T1.Y, 0.0, 5989; CM-NEXT: AND_INT T2.X, T2.X, literal.x, 5990; CM-NEXT: MOV * T2.W, literal.x, 5991; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 5992; CM-NEXT: MOV T2.Y, 0.0, 5993; CM-NEXT: MOV * T1.Z, 0.0, 5994; CM-NEXT: MOV * T2.Z, 0.0, 5995; CM-NEXT: MOV * T3.X, literal.x, 5996; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5997; CM-NEXT: ALU clause starting at 33: 5998; CM-NEXT: AND_INT T2.X, T1.X, literal.x, 5999; CM-NEXT: MOV T2.Y, 0.0, 6000; CM-NEXT: MOV * T2.Z, 0.0, 6001; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 6002; CM-NEXT: ALU clause starting at 37: 6003; CM-NEXT: AND_INT T2.X, T0.X, literal.x, 6004; CM-NEXT: MOV T2.Y, 0.0, 6005; CM-NEXT: MOV * T2.Z, 0.0, 6006; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 6007 store volatile i8 %arg0, i8 addrspace(1)* undef 6008 store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef 6009 ret void 6010} 6011 6012define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) { 6013; SI-LABEL: small_array_round_down_offset: 6014; SI: ; %bb.0: 6015; SI-NEXT: s_mov_b32 s3, 0xf000 6016; SI-NEXT: s_mov_b32 s2, -1 6017; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:37 6018; SI-NEXT: s_waitcnt vmcnt(0) 6019; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 6020; SI-NEXT: s_waitcnt vmcnt(0) 6021; SI-NEXT: s_endpgm 6022; 6023; VI-LABEL: small_array_round_down_offset: 6024; VI: ; %bb.0: 
6025; VI-NEXT: s_add_u32 s0, s0, 37 6026; VI-NEXT: s_addc_u32 s1, s1, 0 6027; VI-NEXT: v_mov_b32_e32 v0, s0 6028; VI-NEXT: v_mov_b32_e32 v1, s1 6029; VI-NEXT: flat_load_ubyte v0, v[0:1] 6030; VI-NEXT: s_waitcnt vmcnt(0) 6031; VI-NEXT: flat_store_byte v[0:1], v0 6032; VI-NEXT: s_waitcnt vmcnt(0) 6033; VI-NEXT: s_endpgm 6034; 6035; GFX9-LABEL: small_array_round_down_offset: 6036; GFX9: ; %bb.0: 6037; GFX9-NEXT: v_mov_b32_e32 v0, 0 6038; GFX9-NEXT: global_load_ubyte v0, v0, s[4:5] offset:1 6039; GFX9-NEXT: s_waitcnt vmcnt(0) 6040; GFX9-NEXT: global_store_byte v[0:1], v0, off 6041; GFX9-NEXT: s_waitcnt vmcnt(0) 6042; GFX9-NEXT: s_endpgm 6043; 6044; EGCM-LABEL: small_array_round_down_offset: 6045; EGCM: ; %bb.0: 6046; EGCM-NEXT: ALU 0, @8, KC0[], KC1[] 6047; EGCM-NEXT: TEX 0 @6 6048; EGCM-NEXT: ALU 6, @9, KC0[], KC1[] 6049; EGCM-NEXT: MEM_RAT MSKOR T0.XW, T1.X 6050; EGCM-NEXT: CF_END 6051; EGCM-NEXT: PAD 6052; EGCM-NEXT: Fetch clause starting at 6: 6053; EGCM-NEXT: VTX_READ_8 T0.X, T0.X, 37, #3 6054; EGCM-NEXT: ALU clause starting at 8: 6055; EGCM-NEXT: MOV * T0.X, 0.0, 6056; EGCM-NEXT: ALU clause starting at 9: 6057; EGCM-NEXT: AND_INT T0.X, T0.X, literal.x, 6058; EGCM-NEXT: MOV * T0.W, literal.x, 6059; EGCM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 6060; EGCM-NEXT: MOV T0.Y, 0.0, 6061; EGCM-NEXT: MOV * T0.Z, 0.0, 6062; EGCM-NEXT: MOV * T1.X, literal.x, 6063; EGCM-NEXT: 0(0.000000e+00), 0(0.000000e+00) 6064 %val = extractvalue [1 x i8] %arg, 0 6065 store volatile i8 %val, i8 addrspace(1)* undef 6066 ret void 6067} 6068 6069define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) { 6070; SI-LABEL: byref_align_constant_i32_arg: 6071; SI: ; %bb.0: 6072; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x49 6073; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6074; SI-NEXT: s_mov_b32 s3, 0xf000 6075; SI-NEXT: s_mov_b32 s2, -1 6076; SI-NEXT: s_waitcnt lgkmcnt(0) 6077; SI-NEXT: 
v_mov_b32_e32 v0, s4 6078; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 6079; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 6080; SI-NEXT: v_mov_b32_e32 v0, s5 6081; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 6082; SI-NEXT: s_waitcnt vmcnt(0) 6083; SI-NEXT: s_endpgm 6084; 6085; VI-LABEL: byref_align_constant_i32_arg: 6086; VI: ; %bb.0: 6087; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6088; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x124 6089; VI-NEXT: s_waitcnt lgkmcnt(0) 6090; VI-NEXT: v_mov_b32_e32 v0, s2 6091; VI-NEXT: v_mov_b32_e32 v1, s3 6092; VI-NEXT: v_mov_b32_e32 v2, s0 6093; VI-NEXT: v_mov_b32_e32 v3, s1 6094; VI-NEXT: flat_store_dword v[0:1], v2 6095; VI-NEXT: s_waitcnt vmcnt(0) 6096; VI-NEXT: flat_store_dword v[0:1], v3 6097; VI-NEXT: s_waitcnt vmcnt(0) 6098; VI-NEXT: s_endpgm 6099; 6100; GFX9-LABEL: byref_align_constant_i32_arg: 6101; GFX9: ; %bb.0: 6102; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 6103; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 6104; GFX9-NEXT: v_mov_b32_e32 v0, 0 6105; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6106; GFX9-NEXT: v_mov_b32_e32 v1, s0 6107; GFX9-NEXT: v_mov_b32_e32 v2, s1 6108; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 6109; GFX9-NEXT: s_waitcnt vmcnt(0) 6110; GFX9-NEXT: global_store_dword v0, v2, s[2:3] 6111; GFX9-NEXT: s_waitcnt vmcnt(0) 6112; GFX9-NEXT: s_endpgm 6113; 6114; EG-LABEL: byref_align_constant_i32_arg: 6115; EG: ; %bb.0: 6116; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 6117; EG-NEXT: TEX 0 @6 6118; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 6119; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 0 6120; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 1 6121; EG-NEXT: CF_END 6122; EG-NEXT: Fetch clause starting at 6: 6123; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 6124; EG-NEXT: ALU clause starting at 8: 6125; EG-NEXT: MOV * T0.X, KC0[18].Y, 6126; EG-NEXT: ALU clause starting at 9: 6127; EG-NEXT: MOV T1.X, KC0[18].Z, 6128; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x, 6129; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 
;
; CM-LABEL: byref_align_constant_i32_arg:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T2.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, KC0[18].Y,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: MOV * T1.X, KC0[18].Z,
; CM-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %in = load i32, i32 addrspace(4)* %in.byref
  store volatile i32 %in, i32 addrspace(1)* %out, align 4
  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
  ret void
}

; Kernel-argument lowering test: a <16 x i32> passed byref through an
; addrspace(4) pointer with no explicit align attribute. It is preceded by the
; %out pointer and an unnamed i8, and followed by the i32 %after.offset.
;
; The body loads the whole vector through %in.byref and stores it to %out, then
; stores %after.offset to the same address. The per-target assertions below pin
; where each argument is read from; for example, the gfx900 run expects %out at
; 0x0, the vector at 0x40 and %after.offset at 0x80 off one base register pair.
; NOTE(review): the 0x40 placement presumably reflects the vector's natural
; 64-byte alignment (hence the test name) -- confirm against the data layout.
;
; The assertion lines inside this function are produced by
; utils/update_llc_test_checks.py (see the note at the top of the file);
; regenerate them with that script instead of editing them by hand.
define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace(1)* nocapture %out, i8, <16 x i32> addrspace(4)* byref(<16 x i32>) %in.byref, i32 %after.offset) {
; SI-LABEL: byref_natural_align_constant_v16i32_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19
; SI-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x9
; SI-NEXT: s_load_dword s0, s[0:1], 0x29
; SI-NEXT: s_mov_b32 s23, 0xf000
; SI-NEXT: s_mov_b32 s22, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s16
; SI-NEXT: v_mov_b32_e32 v1, s17
; SI-NEXT: v_mov_b32_e32 v2, s18
; SI-NEXT: v_mov_b32_e32 v3, s19
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:48
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:32
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: v_mov_b32_e32 v3, s11
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:16
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[20:23], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: byref_natural_align_constant_v16i32_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dword s20, s[0:1], 0xa4
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s16
; VI-NEXT: s_add_u32 s0, s2, 48
; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_add_u32 s0, s2, 32
; VI-NEXT: v_mov_b32_e32 v1, s17
; VI-NEXT: v_mov_b32_e32 v2, s18
; VI-NEXT: v_mov_b32_e32 v3, s19
; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_add_u32 s0, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s20
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: byref_natural_align_constant_v16i32_arg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x80
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s20
; GFX9-NEXT: v_mov_b32_e32 v1, s21
; GFX9-NEXT: v_mov_b32_e32 v2, s22
; GFX9-NEXT: v_mov_b32_e32 v3, s23
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s16
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: v_mov_b32_e32 v3, s19
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s12
; GFX9-NEXT: v_mov_b32_e32 v1, s13
; GFX9-NEXT: v_mov_b32_e32 v2, s14
; GFX9-NEXT: v_mov_b32_e32 v3, s15
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: v_mov_b32_e32 v2, s10
; GFX9-NEXT: v_mov_b32_e32 v3, s11
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v4, v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: byref_natural_align_constant_v16i32_arg:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @24, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @16
; EG-NEXT: ALU 3, @25, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
; EG-NEXT: ALU 3, @29, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @18
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
; EG-NEXT: ALU 3, @33, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @20
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
; EG-NEXT: ALU 2, @37, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @22
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 16:
; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1
; EG-NEXT: Fetch clause starting at 18:
; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 32, #1
; EG-NEXT: Fetch clause starting at 20:
; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 16, #1
; EG-NEXT: Fetch clause starting at 22:
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 24:
; EG-NEXT: MOV * T0.X, KC0[6].Y,
; EG-NEXT: ALU clause starting at 25:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 29:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 33:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 37:
; EG-NEXT: MOV T1.X, KC0[10].Y,
; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: byref_natural_align_constant_v16i32_arg:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @24, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @16
; CM-NEXT: ALU 3, @25, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
; CM-NEXT: ALU 3, @29, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @18
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T1.X
; CM-NEXT: ALU 3, @33, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @20
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T1.X
; CM-NEXT: ALU 2, @37, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @22
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 16:
; CM-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1
; CM-NEXT: Fetch clause starting at 18:
; CM-NEXT: VTX_READ_128 T2.XYZW, T0.X, 32, #1
; CM-NEXT: Fetch clause starting at 20:
; CM-NEXT: VTX_READ_128 T2.XYZW, T0.X, 16, #1
; CM-NEXT: Fetch clause starting at 22:
; CM-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 24:
; CM-NEXT: MOV * T0.X, KC0[6].Y,
; CM-NEXT: ALU clause starting at 25:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T2.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: ALU clause starting at 29:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: ALU clause starting at 33:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: ALU clause starting at 37:
; CM-NEXT: MOV * T1.X, KC0[10].Y,
; CM-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  ; Read the full vector through the addrspace(4) byref pointer.
  %in = load <16 x i32>, <16 x i32> addrspace(4)* %in.byref
  ; View %out as a <16 x i32> pointer so the whole vector can be stored to it.
  %cast.out = bitcast i32 addrspace(1)* %out to <16 x i32> addrspace(1)*
  ; Both stores are volatile and target %out; the second writes %after.offset
  ; over the first element's slot.
  store volatile <16 x i32> %in, <16 x i32> addrspace(1)* %cast.out, align 4
  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
  ret void
}
