; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GFX9
; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=R600
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefix=GFX10

declare i32 @llvm.fshl.i32(i32, i32, i32) nounwind readnone
declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone

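; Hand-written note (not part of the autogenerated checks): v_alignbit_b32 /
; BIT_ALIGN_INT is a funnel shift *right*, so the checks below expect a
; variable-amount fshl(x, y, z) to be lowered as a right funnel shift by
; 32 - (z & 31), split so every individual shift amount stays in 0..31:
;   lo  = alignbit(x, y, 1)        ; low 32 bits of ({x:y} >> 1)
;   res = alignbit(x >> 1, lo, ~z) ; only the low 5 bits of ~z are used,
;                                  ; i.e. a further shift by 31 - (z & 31)
; A constant amount is expected to fold to a single alignbit with the
; complementary amount 32 - (z % 32).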
define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) {
; SI-LABEL: fshl_i32:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s0, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: v_alignbit_b32 v0, s2, v0, 1
; SI-NEXT: s_not_b32 s0, s0
; SI-NEXT: s_lshr_b32 s1, s2, 1
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: v_alignbit_b32 v0, s1, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fshl_i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: s_load_dword s4, s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: s_not_b32 s4, s4
; VI-NEXT: s_lshr_b32 s3, s2, 1
; VI-NEXT: v_alignbit_b32 v0, s2, v0, 1
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_alignbit_b32 v2, s3, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshl_i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s6, s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: s_lshr_b32 s0, s2, 1
; GFX9-NEXT: s_not_b32 s1, s6
; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 1
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_i32:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHR T0.Z, KC0[2].Z, 1,
; R600-NEXT: BIT_ALIGN_INT T0.W, KC0[2].Z, KC0[2].W, 1,
; R600-NEXT: NOT_INT * T1.W, KC0[3].X,
; R600-NEXT: BIT_ALIGN_INT T0.X, PV.Z, PV.W, PS,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshl_i32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: s_load_dword s6, s[0:1], 0x34
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_alignbit_b32 v0, s2, s3, 1
; GFX10-NEXT: s_lshr_b32 s0, s2, 1
; GFX10-NEXT: s_not_b32 s1, s6
; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
entry:
  %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
  store i32 %0, i32 addrspace(1)* %in
  ret void
}

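; A constant shift amount of 7 is expected to fold to a single
; alignbit/BIT_ALIGN_INT with the complementary amount 32 - 7 = 25.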
define amdgpu_kernel void @fshl_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
; SI-LABEL: fshl_i32_imm:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: v_alignbit_b32 v0, s4, v0, 25
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fshl_i32_imm:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_alignbit_b32 v2, s2, v0, 25
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshl_i32_imm:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 25
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_i32_imm:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; R600-NEXT: BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
; R600-NEXT: 25(3.503246e-44), 0(0.000000e+00)
;
; GFX10-LABEL: fshl_i32_imm:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 25
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
entry:
  %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 7)
  store i32 %0, i32 addrspace(1)* %in
  ret void
}

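; The <2 x i32> case is expected to use the same per-element expansion as the
; scalar variable-amount case above.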
define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
; SI-LABEL: fshl_v2i32:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_alignbit_b32 v0, s9, v0, 1
; SI-NEXT: s_not_b32 s1, s1
; SI-NEXT: s_lshr_b32 s3, s9, 1
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_alignbit_b32 v1, s3, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: s_not_b32 s0, s0
; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1
; SI-NEXT: s_lshr_b32 s1, s8, 1
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: v_alignbit_b32 v0, s1, v0, v2
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fshl_v2i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x3c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: s_lshr_b32 s3, s5, 1
; VI-NEXT: v_alignbit_b32 v0, s5, v0, 1
; VI-NEXT: s_not_b32 s5, s7
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_alignbit_b32 v1, s3, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_not_b32 s3, s6
; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1
; VI-NEXT: s_lshr_b32 s2, s4, 1
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_alignbit_b32 v0, s2, v0, v2
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshl_v2i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 1
; GFX9-NEXT: s_lshr_b32 s0, s5, 1
; GFX9-NEXT: s_not_b32 s1, s9
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: s_not_b32 s1, s8
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1
; GFX9-NEXT: s_lshr_b32 s0, s4, 1
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v2i32:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHR T0.Z, KC0[3].X, 1,
; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[3].X, KC0[3].Z, 1,
; R600-NEXT: NOT_INT * T1.W, KC0[4].X,
; R600-NEXT: BIT_ALIGN_INT T0.Y, T0.Z, T0.W, PV.W,
; R600-NEXT: LSHR T0.Z, KC0[2].W, 1,
; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[2].W, KC0[3].Y, 1,
; R600-NEXT: NOT_INT * T1.W, KC0[3].W,
; R600-NEXT: BIT_ALIGN_INT T0.X, T0.Z, T0.W, PV.W,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshl_v2i32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x3
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x3c
; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_lshr_b32 s0, s3, 1
; GFX10-NEXT: v_alignbit_b32 v0, s3, s5, 1
; GFX10-NEXT: v_alignbit_b32 v3, s2, s4, 1
; GFX10-NEXT: s_not_b32 s1, s7
; GFX10-NEXT: s_lshr_b32 s2, s2, 1
; GFX10-NEXT: s_not_b32 s3, s6
; GFX10-NEXT: v_alignbit_b32 v1, s0, v0, s1
; GFX10-NEXT: v_alignbit_b32 v0, s2, v3, s3
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX10-NEXT: s_endpgm
entry:
  %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
  ret void
}

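; Constant per-element amounts <7, 9> are expected to fold to single
; alignbits with amounts 25 and 23.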
define amdgpu_kernel void @fshl_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
; SI-LABEL: fshl_v2i32_imm:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: v_alignbit_b32 v1, s7, v0, 23
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_alignbit_b32 v0, s6, v0, 25
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fshl_v2i32_imm:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_alignbit_b32 v1, s5, v0, 23
; VI-NEXT: v_alignbit_b32 v0, s4, v2, 25
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshl_v2i32_imm:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 23
; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 25
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v2i32_imm:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
; R600-NEXT: 23(3.222986e-44), 0(0.000000e+00)
; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
; R600-NEXT: 25(3.503246e-44), 0(0.000000e+00)
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshl_v2i32_imm:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_alignbit_b32 v1, s3, s5, 23
; GFX10-NEXT: v_alignbit_b32 v0, s2, s4, 25
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
entry:
  %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
  ret void
}

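; Same per-element expansion as the variable-amount cases above, now across
; four elements.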
define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; SI-LABEL: fshl_v4i32:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0xd
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x15
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: v_alignbit_b32 v0, s15, v0, 1
; SI-NEXT: s_not_b32 s3, s3
; SI-NEXT: s_lshr_b32 s7, s15, 1
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_alignbit_b32 v3, s7, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: s_not_b32 s2, s2
; SI-NEXT: v_alignbit_b32 v0, s14, v0, 1
; SI-NEXT: s_lshr_b32 s3, s14, 1
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_alignbit_b32 v2, s3, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: s_not_b32 s1, s1
; SI-NEXT: v_alignbit_b32 v0, s13, v0, 1
; SI-NEXT: s_lshr_b32 s2, s13, 1
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_alignbit_b32 v1, s2, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: s_not_b32 s0, s0
; SI-NEXT: v_alignbit_b32 v0, s12, v0, 1
; SI-NEXT: s_lshr_b32 s1, s12, 1
; SI-NEXT: v_mov_b32_e32 v4, s0
; SI-NEXT: v_alignbit_b32 v0, s1, v0, v4
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fshl_v4i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44
; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: s_lshr_b32 s2, s11, 1
; VI-NEXT: s_not_b32 s3, s15
; VI-NEXT: v_alignbit_b32 v0, s11, v0, 1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_alignbit_b32 v3, s2, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: s_not_b32 s3, s14
; VI-NEXT: v_alignbit_b32 v0, s10, v0, 1
; VI-NEXT: s_lshr_b32 s2, s10, 1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_alignbit_b32 v2, s2, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: s_not_b32 s3, s13
; VI-NEXT: v_alignbit_b32 v0, s9, v0, 1
; VI-NEXT: s_lshr_b32 s2, s9, 1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_alignbit_b32 v1, s2, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_not_b32 s3, s12
; VI-NEXT: v_alignbit_b32 v0, s8, v0, 1
; VI-NEXT: s_lshr_b32 s2, s8, 1
; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_alignbit_b32 v0, s2, v0, v4
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshl_v4i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s7
; GFX9-NEXT: v_alignbit_b32 v0, s11, v0, 1
; GFX9-NEXT: s_lshr_b32 s0, s11, 1
; GFX9-NEXT: s_not_b32 s1, s15
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_alignbit_b32 v3, s0, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: s_not_b32 s1, s14
; GFX9-NEXT: v_alignbit_b32 v0, s10, v0, 1
; GFX9-NEXT: s_lshr_b32 s0, s10, 1
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_alignbit_b32 v2, s0, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NEXT: s_not_b32 s1, s13
; GFX9-NEXT: v_alignbit_b32 v0, s9, v0, 1
; GFX9-NEXT: s_lshr_b32 s0, s9, 1
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: s_not_b32 s1, s12
; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 1
; GFX9-NEXT: s_lshr_b32 s0, s8, 1
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v5
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v4i32:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 17, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: LSHR T0.Z, KC0[4].X, 1,
; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1,
; R600-NEXT: NOT_INT * T1.W, KC0[6].X,
; R600-NEXT: LSHR T0.Y, KC0[3].W, 1,
; R600-NEXT: BIT_ALIGN_INT T1.Z, KC0[3].W, KC0[4].W, 1,
; R600-NEXT: BIT_ALIGN_INT * T0.W, T0.Z, T0.W, PV.W,
; R600-NEXT: NOT_INT * T1.W, KC0[5].W,
; R600-NEXT: LSHR T1.Y, KC0[3].Z, 1,
; R600-NEXT: BIT_ALIGN_INT T0.Z, T0.Y, T1.Z, PV.W,
; R600-NEXT: BIT_ALIGN_INT * T1.W, KC0[3].Z, KC0[4].Z, 1,
; R600-NEXT: NOT_INT * T2.W, KC0[5].Z,
; R600-NEXT: BIT_ALIGN_INT T0.Y, T1.Y, T1.W, PV.W,
; R600-NEXT: LSHR T1.Z, KC0[3].Y, 1,
; R600-NEXT: BIT_ALIGN_INT * T1.W, KC0[3].Y, KC0[4].Y, 1,
; R600-NEXT: NOT_INT * T2.W, KC0[5].Y,
; R600-NEXT: BIT_ALIGN_INT T0.X, T1.Z, T1.W, PV.W,
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshl_v4i32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x3
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_lshr_b32 s0, s7, 1
; GFX10-NEXT: v_alignbit_b32 v0, s7, s11, 1
; GFX10-NEXT: v_alignbit_b32 v1, s6, s10, 1
; GFX10-NEXT: v_alignbit_b32 v5, s5, s9, 1
; GFX10-NEXT: v_alignbit_b32 v6, s4, s8, 1
; GFX10-NEXT: s_not_b32 s1, s15
; GFX10-NEXT: s_lshr_b32 s6, s6, 1
; GFX10-NEXT: s_not_b32 s7, s14
; GFX10-NEXT: s_lshr_b32 s5, s5, 1
; GFX10-NEXT: s_not_b32 s9, s13
; GFX10-NEXT: s_lshr_b32 s4, s4, 1
; GFX10-NEXT: s_not_b32 s8, s12
; GFX10-NEXT: v_alignbit_b32 v3, s0, v0, s1
; GFX10-NEXT: v_alignbit_b32 v2, s6, v1, s7
; GFX10-NEXT: v_alignbit_b32 v1, s5, v5, s9
; GFX10-NEXT: v_alignbit_b32 v0, s4, v6, s8
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX10-NEXT: s_endpgm
entry:
  %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
  ret void
}

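; Constant amounts <1, 7, 9, 33> are expected to fold to alignbit amounts
; 31, 25, 23 and 31 (the 33 is taken modulo 32).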
define amdgpu_kernel void @fshl_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
; SI-LABEL: fshl_v4i32_imm:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11
; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: v_alignbit_b32 v3, s11, v0, 31
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_alignbit_b32 v2, s10, v0, 23
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: v_alignbit_b32 v1, s9, v0, 25
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_alignbit_b32 v0, s8, v0, 31
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fshl_v4i32_imm:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44
; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: v_alignbit_b32 v3, s11, v0, 31
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_alignbit_b32 v2, s10, v1, 23
; VI-NEXT: v_alignbit_b32 v1, s9, v0, 25
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_alignbit_b32 v0, s8, v0, 31
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshl_v4i32_imm:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, 31
; GFX9-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NEXT: v_alignbit_b32 v2, s10, v1, 23
; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, 25
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 31
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v4i32_imm:
; R600: ; %bb.0: ; %entry
; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 4:
; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, literal.x,
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
; R600-NEXT: 23(3.222986e-44), 0(0.000000e+00)
; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
; R600-NEXT: 25(3.503246e-44), 0(0.000000e+00)
; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, literal.x,
; R600-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshl_v4i32_imm:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, 31
; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, 23
; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, 25
; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, 31
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX10-NEXT: s_endpgm
entry:
  %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
  ret void
}