; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GFX9
; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=R600
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefix=GFX10

; Code generation tests for the llvm.fshl (funnel shift left) intrinsic on
; i32, <2 x i32> and <4 x i32> kernel arguments. Each element width is tested
; once with a variable shift amount and once with constant shift amounts.
; The check lines below are produced by the update script named in the NOTE
; line; regenerate them with that script instead of editing them by hand.

declare i32 @llvm.fshl.i32(i32, i32, i32) nounwind readnone
declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone

; Scalar i32 funnel shift left of %x and %y by the variable amount %z; the
; result is stored through %in. The GCN checks expect an alignbit by 1
; followed by an alignbit by the inverted shift amount.
define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) {
; SI-LABEL: fshl_i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dword s0, s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    v_alignbit_b32 v0, s2, v0, 1
; SI-NEXT:    s_not_b32 s0, s0
; SI-NEXT:    s_lshr_b32 s1, s2, 1
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    v_alignbit_b32 v0, s1, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshl_i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    s_not_b32 s4, s4
; VI-NEXT:    s_lshr_b32 s3, s2, 1
; VI-NEXT:    v_alignbit_b32 v0, s2, v0, 1
; VI-NEXT:    v_mov_b32_e32 v1, s4
; VI-NEXT:    v_alignbit_b32 v2, s3, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshl_i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    s_lshr_b32 s0, s2, 1
; GFX9-NEXT:    s_not_b32 s1, s6
; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, 1
; GFX9-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-NEXT:    v_alignbit_b32 v1, s0, v1, v2
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshl_i32:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     LSHR T0.Z, KC0[2].Z, 1,
; R600-NEXT:     BIT_ALIGN_INT T0.W, KC0[2].Z, KC0[2].W, 1,
; R600-NEXT:     NOT_INT * T1.W, KC0[3].X,
; R600-NEXT:     BIT_ALIGN_INT T0.X, PV.Z, PV.W, PS,
; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshl_i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    s_load_dword s6, s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_alignbit_b32 v0, s2, s3, 1
; GFX10-NEXT:    s_lshr_b32 s0, s2, 1
; GFX10-NEXT:    s_not_b32 s1, s6
; GFX10-NEXT:    v_alignbit_b32 v0, s0, v0, s1
; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
; GFX10-NEXT:    s_endpgm
entry:
  %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
  store i32 %0, i32 addrspace(1)* %in
  ret void
}

; Scalar i32 funnel shift left by the constant 7. Every target is expected to
; emit a single bit-align instruction with an immediate of 25.
define amdgpu_kernel void @fshl_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
; SI-LABEL: fshl_i32_imm:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    v_alignbit_b32 v0, s4, v0, 25
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshl_i32_imm:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_alignbit_b32 v2, s2, v0, 25
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshl_i32_imm:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, 25
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshl_i32_imm:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
; R600-NEXT:    25(3.503246e-44), 0(0.000000e+00)
;
; GFX10-LABEL: fshl_i32_imm:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_alignbit_b32 v1, s2, s3, 25
; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-NEXT:    s_endpgm
entry:
  %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 7)
  store i32 %0, i32 addrspace(1)* %in
  ret void
}

; <2 x i32> funnel shift left by the variable per-element amounts in %z. The
; checks expect the scalar variable-amount sequence once per element.
define amdgpu_kernel void @fshl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
; SI-LABEL: fshl_v2i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xf
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    v_alignbit_b32 v0, s5, v0, 1
; SI-NEXT:    s_not_b32 s1, s1
; SI-NEXT:    s_lshr_b32 s2, s5, 1
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_alignbit_b32 v1, s2, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    s_not_b32 s0, s0
; SI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
; SI-NEXT:    s_lshr_b32 s1, s4, 1
; SI-NEXT:    v_mov_b32_e32 v2, s0
; SI-NEXT:    v_alignbit_b32 v0, s1, v0, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshl_v2i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s7
; VI-NEXT:    s_not_b32 s3, s3
; VI-NEXT:    s_lshr_b32 s7, s5, 1
; VI-NEXT:    v_alignbit_b32 v0, s5, v0, 1
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_alignbit_b32 v1, s7, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    s_not_b32 s2, s2
; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
; VI-NEXT:    s_lshr_b32 s3, s4, 1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_alignbit_b32 v0, s3, v0, v2
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshl_v2i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x3c
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s7
; GFX9-NEXT:    s_lshr_b32 s0, s5, 1
; GFX9-NEXT:    s_not_b32 s1, s9
; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 1
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_alignbit_b32 v1, s0, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    s_not_b32 s1, s8
; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
; GFX9-NEXT:    s_lshr_b32 s0, s4, 1
; GFX9-NEXT:    v_mov_b32_e32 v3, s1
; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, v3
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshl_v2i32:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     LSHR T0.Z, KC0[3].X, 1,
; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[3].X, KC0[3].Z, 1,
; R600-NEXT:     NOT_INT * T1.W, KC0[4].X,
; R600-NEXT:     BIT_ALIGN_INT T0.Y, T0.Z, T0.W, PV.W,
; R600-NEXT:     LSHR T0.Z, KC0[2].W, 1,
; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[2].W, KC0[3].Y, 1,
; R600-NEXT:     NOT_INT * T1.W, KC0[3].W,
; R600-NEXT:     BIT_ALIGN_INT T0.X, T0.Z, T0.W, PV.W,
; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshl_v2i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_alignbit_b32 v0, s5, s7, 1
; GFX10-NEXT:    v_alignbit_b32 v3, s4, s6, 1
; GFX10-NEXT:    s_lshr_b32 s0, s5, 1
; GFX10-NEXT:    s_not_b32 s1, s3
; GFX10-NEXT:    s_lshr_b32 s3, s4, 1
; GFX10-NEXT:    s_not_b32 s2, s2
; GFX10-NEXT:    v_alignbit_b32 v1, s0, v0, s1
; GFX10-NEXT:    v_alignbit_b32 v0, s3, v3, s2
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX10-NEXT:    s_endpgm
entry:
  %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
  ret void
}

; <2 x i32> funnel shift left by the constant amounts <7, 9>. The checks
; expect one bit-align per element, with immediates 25 and 23 respectively.
define amdgpu_kernel void @fshl_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
; SI-LABEL: fshl_v2i32_imm:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_alignbit_b32 v1, s5, v0, 23
; SI-NEXT:    v_alignbit_b32 v0, s4, v2, 25
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshl_v2i32_imm:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s7
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 23
; VI-NEXT:    v_alignbit_b32 v0, s4, v2, 25
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshl_v2i32_imm:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s7
; GFX9-NEXT:    v_mov_b32_e32 v3, s6
; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 23
; GFX9-NEXT:    v_alignbit_b32 v0, s4, v3, 25
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshl_v2i32_imm:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
; R600-NEXT:    23(3.222986e-44), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
; R600-NEXT:    25(3.503246e-44), 0(0.000000e+00)
; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshl_v2i32_imm:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_alignbit_b32 v1, s5, s7, 23
; GFX10-NEXT:    v_alignbit_b32 v0, s4, s6, 25
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT:    s_endpgm
entry:
  %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
  ret void
}

; <4 x i32> funnel shift left by the variable per-element amounts in %z. The
; checks expect the scalar variable-amount sequence once per element.
define amdgpu_kernel void @fshl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; SI-LABEL: fshl_v4i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
; SI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x15
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    s_not_b32 s11, s15
; SI-NEXT:    v_alignbit_b32 v0, s7, v0, 1
; SI-NEXT:    s_lshr_b32 s7, s7, 1
; SI-NEXT:    v_mov_b32_e32 v1, s11
; SI-NEXT:    v_alignbit_b32 v3, s7, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    s_not_b32 s7, s14
; SI-NEXT:    v_alignbit_b32 v0, s6, v0, 1
; SI-NEXT:    s_lshr_b32 s6, s6, 1
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    v_alignbit_b32 v2, s6, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    s_not_b32 s6, s13
; SI-NEXT:    v_alignbit_b32 v0, s5, v0, 1
; SI-NEXT:    s_lshr_b32 s5, s5, 1
; SI-NEXT:    v_mov_b32_e32 v1, s6
; SI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    s_not_b32 s5, s12
; SI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
; SI-NEXT:    s_lshr_b32 s4, s4, 1
; SI-NEXT:    v_mov_b32_e32 v4, s5
; SI-NEXT:    v_alignbit_b32 v0, s4, v0, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshl_v4i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    s_not_b32 s3, s15
; VI-NEXT:    s_lshr_b32 s2, s7, 1
; VI-NEXT:    v_alignbit_b32 v0, s7, v0, 1
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_alignbit_b32 v3, s2, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    s_not_b32 s3, s14
; VI-NEXT:    v_alignbit_b32 v0, s6, v0, 1
; VI-NEXT:    s_lshr_b32 s2, s6, 1
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_alignbit_b32 v2, s2, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    s_not_b32 s3, s13
; VI-NEXT:    v_alignbit_b32 v0, s5, v0, 1
; VI-NEXT:    s_lshr_b32 s2, s5, 1
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_alignbit_b32 v1, s2, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    s_not_b32 s3, s12
; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
; VI-NEXT:    s_lshr_b32 s2, s4, 1
; VI-NEXT:    v_mov_b32_e32 v4, s3
; VI-NEXT:    v_alignbit_b32 v0, s2, v0, v4
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshl_v4i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_not_b32 s1, s15
; GFX9-NEXT:    v_mov_b32_e32 v0, s11
; GFX9-NEXT:    s_lshr_b32 s0, s7, 1
; GFX9-NEXT:    v_alignbit_b32 v0, s7, v0, 1
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_alignbit_b32 v3, s0, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, s10
; GFX9-NEXT:    s_not_b32 s1, s14
; GFX9-NEXT:    v_alignbit_b32 v0, s6, v0, 1
; GFX9-NEXT:    s_lshr_b32 s0, s6, 1
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_alignbit_b32 v2, s0, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, s9
; GFX9-NEXT:    s_not_b32 s1, s13
; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 1
; GFX9-NEXT:    s_lshr_b32 s0, s5, 1
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_alignbit_b32 v1, s0, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, s8
; GFX9-NEXT:    s_not_b32 s1, s12
; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
; GFX9-NEXT:    s_lshr_b32 s0, s4, 1
; GFX9-NEXT:    v_mov_b32_e32 v5, s1
; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, v5
; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshl_v4i32:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 17, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     LSHR T0.Z, KC0[4].X, 1,
; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1,
; R600-NEXT:     NOT_INT * T1.W, KC0[6].X,
; R600-NEXT:     LSHR T0.Y, KC0[3].W, 1,
; R600-NEXT:     BIT_ALIGN_INT T1.Z, KC0[3].W, KC0[4].W, 1,
; R600-NEXT:     BIT_ALIGN_INT * T0.W, T0.Z, T0.W, PV.W,
; R600-NEXT:     NOT_INT * T1.W, KC0[5].W,
; R600-NEXT:     LSHR T1.Y, KC0[3].Z, 1,
; R600-NEXT:     BIT_ALIGN_INT T0.Z, T0.Y, T1.Z, PV.W,
; R600-NEXT:     BIT_ALIGN_INT * T1.W, KC0[3].Z, KC0[4].Z, 1,
; R600-NEXT:     NOT_INT * T2.W, KC0[5].Z,
; R600-NEXT:     BIT_ALIGN_INT T0.Y, T1.Y, T1.W, PV.W,
; R600-NEXT:     LSHR T1.Z, KC0[3].Y, 1,
; R600-NEXT:     BIT_ALIGN_INT * T1.W, KC0[3].Y, KC0[4].Y, 1,
; R600-NEXT:     NOT_INT * T2.W, KC0[5].Y,
; R600-NEXT:     BIT_ALIGN_INT T0.X, T1.Z, T1.W, PV.W,
; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshl_v4i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
; GFX10-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_alignbit_b32 v0, s7, s11, 1
; GFX10-NEXT:    v_alignbit_b32 v1, s6, s10, 1
; GFX10-NEXT:    v_alignbit_b32 v5, s5, s9, 1
; GFX10-NEXT:    v_alignbit_b32 v6, s4, s8, 1
; GFX10-NEXT:    s_lshr_b32 s2, s7, 1
; GFX10-NEXT:    s_not_b32 s3, s15
; GFX10-NEXT:    s_lshr_b32 s6, s6, 1
; GFX10-NEXT:    s_not_b32 s7, s14
; GFX10-NEXT:    s_lshr_b32 s5, s5, 1
; GFX10-NEXT:    s_not_b32 s9, s13
; GFX10-NEXT:    s_lshr_b32 s4, s4, 1
; GFX10-NEXT:    s_not_b32 s8, s12
; GFX10-NEXT:    v_alignbit_b32 v3, s2, v0, s3
; GFX10-NEXT:    v_alignbit_b32 v2, s6, v1, s7
; GFX10-NEXT:    v_alignbit_b32 v1, s5, v5, s9
; GFX10-NEXT:    v_alignbit_b32 v0, s4, v6, s8
; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX10-NEXT:    s_endpgm
entry:
  %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
  ret void
}

; <4 x i32> funnel shift left by the constant amounts <1, 7, 9, 33>. The last
; amount exceeds the element width; the checks expect it to produce the same
; bit-align immediate (31) as the amount 1 in the first element.
define amdgpu_kernel void @fshl_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
; SI-LABEL: fshl_v4i32_imm:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s11
; SI-NEXT:    v_mov_b32_e32 v1, s10
; SI-NEXT:    v_alignbit_b32 v3, s7, v0, 31
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    v_alignbit_b32 v2, s6, v1, 23
; SI-NEXT:    v_alignbit_b32 v1, s5, v0, 25
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_alignbit_b32 v0, s4, v0, 31
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshl_v4i32_imm:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    v_mov_b32_e32 v1, s10
; VI-NEXT:    v_mov_b32_e32 v4, s9
; VI-NEXT:    v_alignbit_b32 v3, s7, v0, 31
; VI-NEXT:    v_alignbit_b32 v2, s6, v1, 23
; VI-NEXT:    v_alignbit_b32 v1, s5, v4, 25
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 31
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshl_v4i32_imm:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s11
; GFX9-NEXT:    v_mov_b32_e32 v1, s10
; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, 31
; GFX9-NEXT:    v_mov_b32_e32 v0, s9
; GFX9-NEXT:    v_alignbit_b32 v2, s6, v1, 23
; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 25
; GFX9-NEXT:    v_mov_b32_e32 v0, s8
; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 31
; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshl_v4i32_imm:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, literal.x,
; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
; R600-NEXT:    23(3.222986e-44), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
; R600-NEXT:    25(3.503246e-44), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, literal.x,
; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshl_v4i32_imm:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_alignbit_b32 v3, s7, s11, 31
; GFX10-NEXT:    v_alignbit_b32 v2, s6, s10, 23
; GFX10-NEXT:    v_alignbit_b32 v1, s5, s9, 25
; GFX10-NEXT:    v_alignbit_b32 v0, s4, s8, 31
; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX10-NEXT:    s_endpgm
entry:
  %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
  ret void
}