; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GFX9
; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=R600
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefix=GFX10
; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s --check-prefix=GFX11

; Code generation coverage for the 32-bit funnel-shift-left intrinsic in
; scalar, 2-lane and 4-lane form, each with a variable and a constant shift
; amount. Kernel pointer arguments use the opaque `ptr` spelling; the element
; type stored is given by the value operand of each store.

declare i32 @llvm.fshl.i32(i32, i32, i32) nounwind readnone
declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone

; Scalar funnel shift with a variable amount %z. The expected sequences below
; pre-shift by one (an align-bit by 1 plus a logical shift right by 1), invert
; the amount, and finish with a second align-bit that takes the inverted value.
define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z) {
; SI-LABEL: fshl_i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dword s0, s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    v_alignbit_b32 v0, s2, v0, 1
; SI-NEXT:    s_not_b32 s0, s0
; SI-NEXT:    s_lshr_b32 s1, s2, 1
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    v_alignbit_b32 v0, s1, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshl_i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    s_not_b32 s4, s4
; VI-NEXT:    s_lshr_b32 s3, s2, 1
; VI-NEXT:    v_alignbit_b32 v0, s2, v0, 1
; VI-NEXT:    v_mov_b32_e32 v1, s4
; VI-NEXT:    v_alignbit_b32 v2, s3, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshl_i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    s_lshr_b32 s0, s2, 1
; GFX9-NEXT:    s_not_b32 s1, s6
; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, 1
; GFX9-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-NEXT:    v_alignbit_b32 v1, s0, v1, v2
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshl_i32:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     LSHR T0.Z, KC0[2].Z, 1,
; R600-NEXT:     BIT_ALIGN_INT T0.W, KC0[2].Z, KC0[2].W, 1,
; R600-NEXT:     NOT_INT * T1.W, KC0[3].X,
; R600-NEXT:     BIT_ALIGN_INT T0.X, PV.Z, PV.W, PS,
; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshl_i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    s_load_dword s6, s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_alignbit_b32 v0, s2, s3, 1
; GFX10-NEXT:    s_lshr_b32 s0, s2, 1
; GFX10-NEXT:    s_not_b32 s1, s6
; GFX10-NEXT:    v_alignbit_b32 v0, s0, v0, s1
; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: fshl_i32:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x2c
; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x34
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_alignbit_b32 v0, s2, s3, 1
; GFX11-NEXT:    s_lshr_b32 s2, s2, 1
; GFX11-NEXT:    s_not_b32 s3, s4
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    v_alignbit_b32 v0, s2, v0, s3
; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
entry:
  %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 7)
  store i32 %0, ptr addrspace(1) %in
  ret void
}

; Scalar funnel shift by the constant 7. Every target is expected to emit one
; align-bit instruction with amount 25 (32 - 7) and no pre-shift sequence.
define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
; SI-LABEL: fshl_i32_imm:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    v_alignbit_b32 v0, s4, v0, 25
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshl_i32_imm:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_alignbit_b32 v2, s2, v0, 25
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshl_i32_imm:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, 25
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshl_i32_imm:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
; R600-NEXT:    25(3.503246e-44), 0(0.000000e+00)
;
; GFX10-LABEL: fshl_i32_imm:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_alignbit_b32 v1, s2, s3, 25
; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: fshl_i32_imm:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_alignbit_b32 v1, s2, s3, 25
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
entry:
  %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 7)
  store i32 %0, ptr addrspace(1) %in
  ret void
}

; Two-lane funnel shift with a variable amount per lane. The expected output
; repeats the scalar variable-amount sequence once for each lane.
define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
; SI-LABEL: fshl_v2i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xf
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    v_alignbit_b32 v0, s5, v0, 1
; SI-NEXT:    s_not_b32 s1, s1
; SI-NEXT:    s_lshr_b32 s2, s5, 1
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_alignbit_b32 v1, s2, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    s_not_b32 s0, s0
; SI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
; SI-NEXT:    s_lshr_b32 s1, s4, 1
; SI-NEXT:    v_mov_b32_e32 v2, s0
; SI-NEXT:    v_alignbit_b32 v0, s1, v0, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshl_v2i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s7
; VI-NEXT:    s_not_b32 s3, s3
; VI-NEXT:    s_lshr_b32 s7, s5, 1
; VI-NEXT:    v_alignbit_b32 v0, s5, v0, 1
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_alignbit_b32 v1, s7, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    s_not_b32 s2, s2
; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
; VI-NEXT:    s_lshr_b32 s3, s4, 1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_alignbit_b32 v0, s3, v0, v2
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshl_v2i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x3c
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s7
; GFX9-NEXT:    s_lshr_b32 s0, s5, 1
; GFX9-NEXT:    s_not_b32 s1, s9
; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 1
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_alignbit_b32 v1, s0, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    s_not_b32 s1, s8
; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
; GFX9-NEXT:    s_lshr_b32 s0, s4, 1
; GFX9-NEXT:    v_mov_b32_e32 v3, s1
; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, v3
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshl_v2i32:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     LSHR T0.Z, KC0[3].X, 1,
; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[3].X, KC0[3].Z, 1,
; R600-NEXT:     NOT_INT * T1.W, KC0[4].X,
; R600-NEXT:     BIT_ALIGN_INT T0.Y, T0.Z, T0.W, PV.W,
; R600-NEXT:     LSHR T0.Z, KC0[2].W, 1,
; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[2].W, KC0[3].Y, 1,
; R600-NEXT:     NOT_INT * T1.W, KC0[3].W,
; R600-NEXT:     BIT_ALIGN_INT T0.X, T0.Z, T0.W, PV.W,
; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshl_v2i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_alignbit_b32 v0, s5, s7, 1
; GFX10-NEXT:    v_alignbit_b32 v3, s4, s6, 1
; GFX10-NEXT:    s_lshr_b32 s0, s5, 1
; GFX10-NEXT:    s_not_b32 s1, s3
; GFX10-NEXT:    s_lshr_b32 s3, s4, 1
; GFX10-NEXT:    s_not_b32 s2, s2
; GFX10-NEXT:    v_alignbit_b32 v1, s0, v0, s1
; GFX10-NEXT:    v_alignbit_b32 v0, s3, v3, s2
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: fshl_v2i32:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x2c
; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x3c
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_alignbit_b32 v0, s5, s7, 1
; GFX11-NEXT:    v_alignbit_b32 v3, s4, s6, 1
; GFX11-NEXT:    s_lshr_b32 s5, s5, 1
; GFX11-NEXT:    s_not_b32 s3, s3
; GFX11-NEXT:    s_lshr_b32 s4, s4, 1
; GFX11-NEXT:    s_not_b32 s2, s2
; GFX11-NEXT:    v_alignbit_b32 v1, s5, v0, s3
; GFX11-NEXT:    v_alignbit_b32 v0, s4, v3, s2
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
entry:
  %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
  store <2 x i32> %0, ptr addrspace(1) %in
  ret void
}

; Two-lane funnel shift by the constants <7, 9>. Each lane is expected to be a
; single align-bit with amount 25 and 23 respectively (32 minus the constant).
define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) {
; SI-LABEL: fshl_v2i32_imm:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_alignbit_b32 v1, s5, v0, 23
; SI-NEXT:    v_alignbit_b32 v0, s4, v2, 25
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshl_v2i32_imm:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s7
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 23
; VI-NEXT:    v_alignbit_b32 v0, s4, v2, 25
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshl_v2i32_imm:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s7
; GFX9-NEXT:    v_mov_b32_e32 v3, s6
; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 23
; GFX9-NEXT:    v_alignbit_b32 v0, s4, v3, 25
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshl_v2i32_imm:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
; R600-NEXT:    23(3.222986e-44), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
; R600-NEXT:    25(3.503246e-44), 0(0.000000e+00)
; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshl_v2i32_imm:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_alignbit_b32 v1, s5, s7, 23
; GFX10-NEXT:    v_alignbit_b32 v0, s4, s6, 25
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: fshl_v2i32_imm:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_alignbit_b32 v1, s5, s7, 23
; GFX11-NEXT:    v_alignbit_b32 v0, s4, s6, 25
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
entry:
  %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
  store <2 x i32> %0, ptr addrspace(1) %in
  ret void
}

; Four-lane funnel shift with a variable amount per lane. The expected output
; repeats the scalar variable-amount sequence once for each lane.
define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; SI-LABEL: fshl_v4i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
; SI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x15
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    s_not_b32 s11, s15
; SI-NEXT:    v_alignbit_b32 v0, s7, v0, 1
; SI-NEXT:    s_lshr_b32 s7, s7, 1
; SI-NEXT:    v_mov_b32_e32 v1, s11
; SI-NEXT:    v_alignbit_b32 v3, s7, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    s_not_b32 s7, s14
; SI-NEXT:    v_alignbit_b32 v0, s6, v0, 1
; SI-NEXT:    s_lshr_b32 s6, s6, 1
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    v_alignbit_b32 v2, s6, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    s_not_b32 s6, s13
; SI-NEXT:    v_alignbit_b32 v0, s5, v0, 1
; SI-NEXT:    s_lshr_b32 s5, s5, 1
; SI-NEXT:    v_mov_b32_e32 v1, s6
; SI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    s_not_b32 s5, s12
; SI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
; SI-NEXT:    s_lshr_b32 s4, s4, 1
; SI-NEXT:    v_mov_b32_e32 v4, s5
; SI-NEXT:    v_alignbit_b32 v0, s4, v0, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshl_v4i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    s_not_b32 s3, s15
; VI-NEXT:    s_lshr_b32 s2, s7, 1
; VI-NEXT:    v_alignbit_b32 v0, s7, v0, 1
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_alignbit_b32 v3, s2, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    s_not_b32 s3, s14
; VI-NEXT:    v_alignbit_b32 v0, s6, v0, 1
; VI-NEXT:    s_lshr_b32 s2, s6, 1
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_alignbit_b32 v2, s2, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    s_not_b32 s3, s13
; VI-NEXT:    v_alignbit_b32 v0, s5, v0, 1
; VI-NEXT:    s_lshr_b32 s2, s5, 1
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_alignbit_b32 v1, s2, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    s_not_b32 s3, s12
; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
; VI-NEXT:    s_lshr_b32 s2, s4, 1
; VI-NEXT:    v_mov_b32_e32 v4, s3
; VI-NEXT:    v_alignbit_b32 v0, s2, v0, v4
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshl_v4i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_not_b32 s1, s15
; GFX9-NEXT:    v_mov_b32_e32 v0, s11
; GFX9-NEXT:    s_lshr_b32 s0, s7, 1
; GFX9-NEXT:    v_alignbit_b32 v0, s7, v0, 1
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_alignbit_b32 v3, s0, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, s10
; GFX9-NEXT:    s_not_b32 s1, s14
; GFX9-NEXT:    v_alignbit_b32 v0, s6, v0, 1
; GFX9-NEXT:    s_lshr_b32 s0, s6, 1
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_alignbit_b32 v2, s0, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, s9
; GFX9-NEXT:    s_not_b32 s1, s13
; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 1
; GFX9-NEXT:    s_lshr_b32 s0, s5, 1
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_alignbit_b32 v1, s0, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, s8
; GFX9-NEXT:    s_not_b32 s1, s12
; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
; GFX9-NEXT:    s_lshr_b32 s0, s4, 1
; GFX9-NEXT:    v_mov_b32_e32 v5, s1
; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, v5
; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshl_v4i32:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 17, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     LSHR T0.Z, KC0[4].X, 1,
; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1,
; R600-NEXT:     NOT_INT * T1.W, KC0[6].X,
; R600-NEXT:     LSHR T0.Y, KC0[3].W, 1,
; R600-NEXT:     BIT_ALIGN_INT T1.Z, KC0[3].W, KC0[4].W, 1,
; R600-NEXT:     BIT_ALIGN_INT * T0.W, T0.Z, T0.W, PV.W,
; R600-NEXT:     NOT_INT * T1.W, KC0[5].W,
; R600-NEXT:     LSHR T1.Y, KC0[3].Z, 1,
; R600-NEXT:     BIT_ALIGN_INT T0.Z, T0.Y, T1.Z, PV.W,
; R600-NEXT:     BIT_ALIGN_INT * T1.W, KC0[3].Z, KC0[4].Z, 1,
; R600-NEXT:     NOT_INT * T2.W, KC0[5].Z,
; R600-NEXT:     BIT_ALIGN_INT T0.Y, T1.Y, T1.W, PV.W,
; R600-NEXT:     LSHR T1.Z, KC0[3].Y, 1,
; R600-NEXT:     BIT_ALIGN_INT * T1.W, KC0[3].Y, KC0[4].Y, 1,
; R600-NEXT:     NOT_INT * T2.W, KC0[5].Y,
; R600-NEXT:     BIT_ALIGN_INT T0.X, T1.Z, T1.W, PV.W,
; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshl_v4i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
; GFX10-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_alignbit_b32 v0, s7, s11, 1
; GFX10-NEXT:    v_alignbit_b32 v1, s6, s10, 1
; GFX10-NEXT:    v_alignbit_b32 v5, s5, s9, 1
; GFX10-NEXT:    v_alignbit_b32 v6, s4, s8, 1
; GFX10-NEXT:    s_lshr_b32 s2, s7, 1
; GFX10-NEXT:    s_not_b32 s3, s15
; GFX10-NEXT:    s_lshr_b32 s6, s6, 1
; GFX10-NEXT:    s_not_b32 s7, s14
; GFX10-NEXT:    s_lshr_b32 s5, s5, 1
; GFX10-NEXT:    s_not_b32 s9, s13
; GFX10-NEXT:    s_lshr_b32 s4, s4, 1
; GFX10-NEXT:    s_not_b32 s8, s12
; GFX10-NEXT:    v_alignbit_b32 v3, s2, v0, s3
; GFX10-NEXT:    v_alignbit_b32 v2, s6, v1, s7
; GFX10-NEXT:    v_alignbit_b32 v1, s5, v5, s9
; GFX10-NEXT:    v_alignbit_b32 v0, s4, v6, s8
; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: fshl_v4i32:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    s_load_b256 s[4:11], s[0:1], 0x34
; GFX11-NEXT:    s_load_b128 s[12:15], s[0:1], 0x54
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v4, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_alignbit_b32 v0, s7, s11, 1
; GFX11-NEXT:    v_alignbit_b32 v1, s6, s10, 1
; GFX11-NEXT:    v_alignbit_b32 v5, s5, s9, 1
; GFX11-NEXT:    v_alignbit_b32 v6, s4, s8, 1
; GFX11-NEXT:    s_lshr_b32 s2, s7, 1
; GFX11-NEXT:    s_not_b32 s3, s15
; GFX11-NEXT:    s_lshr_b32 s6, s6, 1
; GFX11-NEXT:    s_not_b32 s7, s14
; GFX11-NEXT:    s_lshr_b32 s5, s5, 1
; GFX11-NEXT:    s_not_b32 s9, s13
; GFX11-NEXT:    s_lshr_b32 s4, s4, 1
; GFX11-NEXT:    s_not_b32 s8, s12
; GFX11-NEXT:    v_alignbit_b32 v3, s2, v0, s3
; GFX11-NEXT:    v_alignbit_b32 v2, s6, v1, s7
; GFX11-NEXT:    v_alignbit_b32 v1, s5, v5, s9
; GFX11-NEXT:    v_alignbit_b32 v0, s4, v6, s8
; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
entry:
  %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
  store <4 x i32> %0, ptr addrspace(1) %in
  ret void
}

; Four-lane funnel shift by the constants <1, 7, 9, 33>. The last amount is
; larger than the element width; the expected output gives that lane the same
; align-bit amount (31) as the lane shifted by 1, with 25 and 23 in between.
define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) {
; SI-LABEL: fshl_v4i32_imm:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_mov_b32_e32 v1, s10
; SI-NEXT:    v_alignbit_b32 v3, s7, v0, 31
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_alignbit_b32 v2, s6, v1, 23
; SI-NEXT:    v_alignbit_b32 v1, s5, v0, 25
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_alignbit_b32 v0, s4, v0, 31
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshl_v4i32_imm:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    v_mov_b32_e32 v1, s10
; VI-NEXT:    v_mov_b32_e32 v4, s9
; VI-NEXT:    v_alignbit_b32 v3, s7, v0, 31
; VI-NEXT:    v_alignbit_b32 v2, s6, v1, 23
; VI-NEXT:    v_alignbit_b32 v1, s5, v4, 25
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 31
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshl_v4i32_imm:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s11
; GFX9-NEXT:    v_mov_b32_e32 v1, s10
; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, 31
; GFX9-NEXT:    v_mov_b32_e32 v0, s9
; GFX9-NEXT:    v_alignbit_b32 v2, s6, v1, 23
; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 25
; GFX9-NEXT:    v_mov_b32_e32 v0, s8
; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 31
; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshl_v4i32_imm:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, literal.x,
; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
; R600-NEXT:    23(3.222986e-44), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
; R600-NEXT:    25(3.503246e-44), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, literal.x,
; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshl_v4i32_imm:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_alignbit_b32 v3, s7, s11, 31
; GFX10-NEXT:    v_alignbit_b32 v2, s6, s10, 23
; GFX10-NEXT:    v_alignbit_b32 v1, s5, s9, 25
; GFX10-NEXT:    v_alignbit_b32 v0, s4, s8, 31
; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: fshl_v4i32_imm:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b256 s[4:11], s[0:1], 0x34
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v4, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_alignbit_b32 v3, s7, s11, 31
; GFX11-NEXT:    v_alignbit_b32 v2, s6, s10, 23
; GFX11-NEXT:    v_alignbit_b32 v1, s5, s9, 25
; GFX11-NEXT:    v_alignbit_b32 v0, s4, s8, 31
; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
entry:
  %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
  store <4 x i32> %0, ptr addrspace(1) %in
  ret void
}