; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,SI
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,VI
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,GFX9
; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=R600
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10

; Lowering tests for the llvm.fshr.* (funnel shift right) intrinsics on
; AMDGPU targets. The 32-bit cases lower to v_alignbit_b32 (BIT_ALIGN_INT
; on R600); the 16-bit cases are expanded to shift/xor/or sequences.

declare i32 @llvm.fshr.i32(i32, i32, i32)
declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
declare <3 x i32> @llvm.fshr.v3i32(<3 x i32>, <3 x i32>, <3 x i32>)
declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
declare i16 @llvm.fshr.i16(i16, i16, i16)
declare <2 x i16> @llvm.fshr.v2i16(<2 x i16>, <2 x i16>, <2 x i16>)
declare <3 x i16> @llvm.fshr.v3i16(<3 x i16>, <3 x i16>, <3 x i16>)
declare <4 x i16> @llvm.fshr.v4i16(<4 x i16>, <4 x i16>, <4 x i16>)
declare i64 @llvm.fshr.i64(i64, i64, i64)
declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
declare i24 @llvm.fshr.i24(i24, i24, i24)
declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>)

define amdgpu_kernel void @fshr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) {
; SI-LABEL: fshr_i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_load_dword s6, s[0:1], 0xd
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    v_mov_b32_e32 v1, s6
; SI-NEXT:    v_alignbit_b32 v0, s4, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshr_i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_mov_b32_e32 v1, s4
; VI-NEXT:    v_alignbit_b32 v2, s2, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshr_i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_mov_b32_e32 v2, s6
; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, v2
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshr_i32:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; R600-NEXT:    BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, KC0[3].X,
;
; GFX10-LABEL: fshr_i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dword s6, s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s6
; GFX10-NEXT:    v_alignbit_b32 v0, s2, s3, v0
; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
; GFX10-NEXT:    s_endpgm
entry:
  %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
  store i32 %0, i32 addrspace(1)* %in
  ret void
}

define amdgpu_kernel void @fshr_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
; SI-LABEL: fshr_i32_imm:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    v_alignbit_b32 v0, s4, v0, 7
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshr_i32_imm:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_alignbit_b32 v2, s2, v0, 7
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshr_i32_imm:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, 7
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshr_i32_imm:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; R600-NEXT:    BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshr_i32_imm:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_alignbit_b32 v1, s2, s3, 7
; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-NEXT:    s_endpgm
entry:
  %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7)
  store i32 %0, i32 addrspace(1)* %in
  ret void
}

define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
; SI-LABEL: fshr_v2i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xf
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    v_mov_b32_e32 v1, s9
; SI-NEXT:    v_mov_b32_e32 v2, s8
; SI-NEXT:    v_alignbit_b32 v1, s1, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_alignbit_b32 v0, s0, v0, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshr_v2i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x3c
; VI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_alignbit_b32 v1, s7, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v2, s4
; VI-NEXT:    v_alignbit_b32 v0, s6, v0, v2
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshr_v2i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x3c
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s3
; GFX9-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NEXT:    v_alignbit_b32 v1, s7, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s4
; GFX9-NEXT:    v_alignbit_b32 v0, s6, v0, v3
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshr_v2i32:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    MOV * T0.W, KC0[4].X,
; R600-NEXT:    BIT_ALIGN_INT T0.Y, KC0[3].X, KC0[3].Z, PV.W,
; R600-NEXT:    MOV * T0.W, KC0[3].W,
; R600-NEXT:    BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, PV.W,
; R600-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshr_v2i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x3
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v3, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s3
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_alignbit_b32 v1, s5, s7, v0
; GFX10-NEXT:    v_alignbit_b32 v0, s4, s6, v2
; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[8:9]
; GFX10-NEXT:    s_endpgm
entry:
  %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
  ret void
}

define amdgpu_kernel void @fshr_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
; SI-LABEL: fshr_v2i32_imm:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    v_alignbit_b32 v1, s7, v0, 9
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_alignbit_b32 v0, s6, v0, 7
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshr_v2i32_imm:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 9
; VI-NEXT:    v_alignbit_b32 v0, s4, v2, 7
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshr_v2i32_imm:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s3
; GFX9-NEXT:    v_mov_b32_e32 v3, s2
; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 9
; GFX9-NEXT:    v_alignbit_b32 v0, s4, v3, 7
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshr_v2i32_imm:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
; R600-NEXT:    9(1.261169e-44), 0(0.000000e+00)
; R600-NEXT:    BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
; R600-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshr_v2i32_imm:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_alignbit_b32 v1, s3, s5, 9
; GFX10-NEXT:    v_alignbit_b32 v0, s2, s4, 7
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX10-NEXT:    s_endpgm
entry:
  %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
  ret void
}

define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; SI-LABEL: fshr_v4i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x15
; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s15, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    v_mov_b32_e32 v1, s11
; SI-NEXT:    v_mov_b32_e32 v4, s8
; SI-NEXT:    v_alignbit_b32 v3, s3, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    v_mov_b32_e32 v1, s10
; SI-NEXT:    v_alignbit_b32 v2, s2, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    v_mov_b32_e32 v1, s9
; SI-NEXT:    v_alignbit_b32 v1, s1, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    s_mov_b32 s14, -1
; SI-NEXT:    v_alignbit_b32 v0, s0, v0, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshr_v4i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x54
; VI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s7
; VI-NEXT:    v_mov_b32_e32 v1, s11
; VI-NEXT:    v_alignbit_b32 v3, s15, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    v_mov_b32_e32 v1, s10
; VI-NEXT:    v_alignbit_b32 v2, s14, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    v_mov_b32_e32 v1, s9
; VI-NEXT:    v_alignbit_b32 v1, s13, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v4, s8
; VI-NEXT:    v_alignbit_b32 v0, s12, v0, v4
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshr_v4i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x54
; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s7
; GFX9-NEXT:    v_mov_b32_e32 v1, s11
; GFX9-NEXT:    v_alignbit_b32 v3, s15, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    v_mov_b32_e32 v1, s10
; GFX9-NEXT:    v_alignbit_b32 v2, s14, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, s5
; GFX9-NEXT:    v_mov_b32_e32 v1, s9
; GFX9-NEXT:    v_alignbit_b32 v1, s13, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    v_mov_b32_e32 v5, s8
; GFX9-NEXT:    v_alignbit_b32 v0, s12, v0, v5
; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshr_v4i32:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    MOV * T0.W, KC0[6].X,
; R600-NEXT:    BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, PV.W,
; R600-NEXT:    MOV * T1.W, KC0[5].W,
; R600-NEXT:    BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, PV.W,
; R600-NEXT:    MOV * T1.W, KC0[5].Z,
; R600-NEXT:    BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, PV.W,
; R600-NEXT:    MOV * T1.W, KC0[5].Y,
; R600-NEXT:    BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, PV.W,
; R600-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshr_v4i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x3
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x54
; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
; GFX10-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v6, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s7
; GFX10-NEXT:    v_mov_b32_e32 v1, s6
; GFX10-NEXT:    v_mov_b32_e32 v4, s5
; GFX10-NEXT:    v_mov_b32_e32 v5, s4
; GFX10-NEXT:    v_alignbit_b32 v3, s15, s11, v0
; GFX10-NEXT:    v_alignbit_b32 v2, s14, s10, v1
; GFX10-NEXT:    v_alignbit_b32 v1, s13, s9, v4
; GFX10-NEXT:    v_alignbit_b32 v0, s12, s8, v5
; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[2:3]
; GFX10-NEXT:    s_endpgm
entry:
  %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
  ret void
}

define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
; SI-LABEL: fshr_v4i32_imm:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    v_alignbit_b32 v3, s11, v0, 1
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    v_alignbit_b32 v2, s10, v0, 9
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    v_alignbit_b32 v1, s9, v0, 7
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_alignbit_b32 v0, s8, v0, 1
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshr_v4i32_imm:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s7
; VI-NEXT:    v_mov_b32_e32 v1, s6
; VI-NEXT:    v_alignbit_b32 v3, s11, v0, 1
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    v_alignbit_b32 v2, s10, v1, 9
; VI-NEXT:    v_alignbit_b32 v1, s9, v0, 7
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_alignbit_b32 v0, s8, v0, 1
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshr_v4i32_imm:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s7
; GFX9-NEXT:    v_mov_b32_e32 v1, s6
; GFX9-NEXT:    v_alignbit_b32 v3, s11, v0, 1
; GFX9-NEXT:    v_mov_b32_e32 v0, s5
; GFX9-NEXT:    v_alignbit_b32 v2, s10, v1, 9
; GFX9-NEXT:    v_alignbit_b32 v1, s9, v0, 7
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    v_alignbit_b32 v0, s8, v0, 1
; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshr_v4i32_imm:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1,
; R600-NEXT:    BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
; R600-NEXT:    9(1.261169e-44), 0(0.000000e+00)
; R600-NEXT:    BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
; R600-NEXT:    BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, 1,
; R600-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: fshr_v4i32_imm:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_alignbit_b32 v3, s7, s11, 1
; GFX10-NEXT:    v_alignbit_b32 v2, s6, s10, 9
; GFX10-NEXT:    v_alignbit_b32 v1, s5, s9, 7
; GFX10-NEXT:    v_alignbit_b32 v0, s4, s8, 1
; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX10-NEXT:    s_endpgm
entry:
  ; NOTE: the shift amount 33 wraps to 1 modulo the 32-bit element width,
  ; which is why the last lane also checks for an alignbit amount of 1.
  %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
  ret void
}

define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) {
; GFX89-LABEL: v_fshr_i32:
; GFX89:       ; %bb.0:
; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT:    v_alignbit_b32 v0, v0, v1, v2
; GFX89-NEXT:    s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_i32:
; R600:       ; %bb.0:
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
;
; GFX10-LABEL: v_fshr_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_alignbit_b32 v0, v0, v1, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %ret = call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2)
  ret i32 %ret
}

define <2 x i32> @v_fshr_v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) {
; GFX89-LABEL: v_fshr_v2i32:
; GFX89:       ; %bb.0:
; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT:    v_alignbit_b32 v0, v0, v2, v4
; GFX89-NEXT:    v_alignbit_b32 v1, v1, v3, v5
; GFX89-NEXT:    s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v2i32:
; R600:       ; %bb.0:
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
;
; GFX10-LABEL: v_fshr_v2i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_alignbit_b32 v0, v0, v2, v4
; GFX10-NEXT:    v_alignbit_b32 v1, v1, v3, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %ret = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2)
  ret <2 x i32> %ret
}

define <3 x i32> @v_fshr_v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) {
; GFX89-LABEL: v_fshr_v3i32:
; GFX89:       ; %bb.0:
; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT:    v_alignbit_b32 v0, v0, v3, v6
; GFX89-NEXT:    v_alignbit_b32 v1, v1, v4, v7
; GFX89-NEXT:    v_alignbit_b32 v2, v2, v5, v8
; GFX89-NEXT:    s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v3i32:
; R600:       ; %bb.0:
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
;
; GFX10-LABEL: v_fshr_v3i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_alignbit_b32 v0, v0, v3, v6
; GFX10-NEXT:    v_alignbit_b32 v1, v1, v4, v7
; GFX10-NEXT:    v_alignbit_b32 v2, v2, v5, v8
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %ret = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2)
  ret <3 x i32> %ret
}

define <4 x i32> @v_fshr_v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) {
; GFX89-LABEL: v_fshr_v4i32:
; GFX89:       ; %bb.0:
; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT:    v_alignbit_b32 v0, v0, v4, v8
; GFX89-NEXT:    v_alignbit_b32 v1, v1, v5, v9
; GFX89-NEXT:    v_alignbit_b32 v2, v2, v6, v10
; GFX89-NEXT:    v_alignbit_b32 v3, v3, v7, v11
; GFX89-NEXT:    s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v4i32:
; R600:       ; %bb.0:
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
;
; GFX10-LABEL: v_fshr_v4i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_alignbit_b32 v0, v0, v4, v8
; GFX10-NEXT:    v_alignbit_b32 v1, v1, v5, v9
; GFX10-NEXT:    v_alignbit_b32 v2, v2, v6, v10
; GFX10-NEXT:    v_alignbit_b32 v3, v3, v7, v11
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %ret = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2)
  ret <4 x i32> %ret
}

define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
; SI-LABEL: v_fshr_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_or_b32_e32 v2, 16, v2
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
; VI-NEXT:    v_xor_b32_e32 v3, -1, v2
; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
; VI-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v2
; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_i16:
; R600:       ; %bb.0:
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
;
; GFX10-LABEL: v_fshr_i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
; GFX10-NEXT:    v_lshrrev_b16 v1, v2, v1
; GFX10-NEXT:    v_lshlrev_b16 v0, v3, v0
; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %ret = call i16 @llvm.fshr.i16(i16 %src0, i16 %src1, i16 %src2)
  ret i16 %ret
}

define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) {
; SI-LABEL: v_fshr_v2i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_or_b32_e32 v5, 16, v5
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT:    v_alignbit_b32 v1, v1, v3, v5
; SI-NEXT:    v_or_b32_e32 v3, 16, v4
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_alignbit_b32 v0, v0, v2, v3
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; VI-NEXT:    v_mov_b32_e32 v5, 1
; VI-NEXT:    v_lshrrev_b16_sdwa v4, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_xor_b32_e32 v3, -1, v3
; VI-NEXT:    v_lshlrev_b16_e32 v3, v3, v5
; VI-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
; VI-NEXT:    v_xor_b32_e32 v4, -1, v2
; VI-NEXT:    v_lshlrev_b16_e32 v0, v4, v0
; VI-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v2
; GFX9-NEXT:    s_mov_b32 s4, 0xf000f
; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
; GFX9-NEXT:    v_and_b32_e32 v3, s4, v3
; GFX9-NEXT:    v_and_b32_e32 v2, s4, v2
; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v2i16:
; R600:       ; %bb.0:
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
;
; GFX10-LABEL: v_fshr_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
; GFX10-NEXT:    s_mov_b32 s4, 0xf000f
; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
; GFX10-NEXT:    v_and_b32_e32 v2, s4, v2
; GFX10-NEXT:    v_and_b32_e32 v3, s4, v3
; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %ret = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2)
  ret <2 x i16> %ret
}

define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) {
; SI-LABEL: v_fshr_v3i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_or_b32_e32 v7, 16, v7
; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT:    v_alignbit_b32 v1, v1, v4, v7
; SI-NEXT:    v_or_b32_e32 v4, 16, v6
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT:    v_alignbit_b32 v0, v0, v3, v4
; SI-NEXT:    s_mov_b32 s4, 0xffff
; SI-NEXT:    v_or_b32_e32 v3, 16, v8
; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_and_b32_e32 v0, s4, v0
; SI-NEXT:    v_alignbit_b32 v3, v2, v4, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_and_b32_e32 v2, s4, v3
; SI-NEXT:    v_alignbit_b32 v1, v3, v1, 16
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v3i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
; VI-NEXT:    v_mov_b32_e32 v8, 1
; VI-NEXT:    v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_xor_b32_e32 v6, -1, v6
; VI-NEXT:    v_lshlrev_b16_e32 v6, v6, v8
; VI-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
; VI-NEXT:    v_xor_b32_e32 v7, -1, v5
; VI-NEXT:    v_lshlrev_b16_e32 v1, v7, v1
; VI-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
; VI-NEXT:    v_or_b32_e32 v1, v1, v3
; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
; VI-NEXT:    v_xor_b32_e32 v3, -1, v4
; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
; VI-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
; VI-NEXT:    v_or_b32_e32 v0, v0, v2
; VI-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_v3i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
; GFX9-NEXT:    v_mov_b32_e32 v8, 1
; GFX9-NEXT:    v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT:    v_xor_b32_e32 v6, -1, v6
; GFX9-NEXT:    v_lshlrev_b16_e32 v6, v6, v8
; GFX9-NEXT:    v_or_b32_e32 v6, v6, v7
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
; GFX9-NEXT:    v_xor_b32_e32 v7, -1, v5
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, v7, v1
; GFX9-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v4
; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    v_lshl_or_b32 v0, v6, 16, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v3i16:
; R600:       ; %bb.0:
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
;
; GFX10-LABEL: v_fshr_v3i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v4
; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
; GFX10-NEXT:    v_xor_b32_e32 v10, -1, v6
; GFX10-NEXT:    v_lshlrev_b16 v7, 1, v7
; GFX10-NEXT:    v_lshrrev_b16 v2, v4, v2
; GFX10-NEXT:    v_lshlrev_b16 v0, v8, v0
; GFX10-NEXT:    v_lshrrev_b16 v4, v6, v9
; GFX10-NEXT:    v_lshlrev_b16 v1, 1, v1
; GFX10-NEXT:    v_lshlrev_b16 v6, v10, v7
; GFX10-NEXT:    v_lshrrev_b16 v3, v5, v3
; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v5
; GFX10-NEXT:    v_or_b32_e32 v4, v6, v4
; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT:    v_lshlrev_b16 v1, v2, v1
; GFX10-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2)
  ret <3 x i16> %ret
}

define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) {
; SI-LABEL: v_fshr_v4i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_or_b32_e32 v9, 16, v9
; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT:    v_alignbit_b32 v1, v1, v5, v9
; SI-NEXT:    v_or_b32_e32 v5, 16, v8
; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT:    v_alignbit_b32 v0, v0, v4, v5
; SI-NEXT:    v_or_b32_e32 v4, 16, v11
; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
; SI-NEXT:    v_alignbit_b32 v3, v3, v5, v4
; SI-NEXT:    v_or_b32_e32 v4, 16, v10
; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
; SI-NEXT:    s_mov_b32 s4, 0xffff
; SI-NEXT:    v_alignbit_b32 v2, v2, v5, v4
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT:    v_and_b32_e32 v2, s4, v2
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_and_b32_e32 v0, s4, v0
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v4i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
; VI-NEXT:    v_mov_b32_e32 v8, 1
; VI-NEXT:    v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_xor_b32_e32 v6, -1, v6
; VI-NEXT:    v_lshlrev_b16_e32 v6, v6, v9
; VI-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
; VI-NEXT:    v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_xor_b32_e32 v7, -1, v7
; VI-NEXT:    v_lshlrev_b16_e32 v7, v7, v8
; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
; VI-NEXT:    v_xor_b32_e32 v8, -1, v5
; VI-NEXT:    v_lshlrev_b16_e32 v1, v8, v1
; VI-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
; VI-NEXT:    v_or_b32_e32 v1, v1, v3
; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
; VI-NEXT:    v_xor_b32_e32 v3, -1, v4
; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
; VI-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
; VI-NEXT:    v_or_b32_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v0, v2
; VI-NEXT:    v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v1, v1,
v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 904; VI-NEXT: s_setpc_b64 s[30:31] 905; 906; GFX9-LABEL: v_fshr_v4i16: 907; GFX9: ; %bb.0: 908; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 909; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v5 910; GFX9-NEXT: v_mov_b32_e32 v8, 1 911; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 912; GFX9-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 913; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6 914; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v9 915; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 916; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 917; GFX9-NEXT: v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 918; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 919; GFX9-NEXT: v_xor_b32_e32 v7, -1, v7 920; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v8 921; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1 922; GFX9-NEXT: v_xor_b32_e32 v8, -1, v5 923; GFX9-NEXT: v_lshlrev_b16_e32 v1, v8, v1 924; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3 925; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 926; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 927; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4 928; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 929; GFX9-NEXT: v_lshrrev_b16_e32 v2, v4, v2 930; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 931; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff 932; GFX9-NEXT: v_or_b32_e32 v7, v7, v9 933; GFX9-NEXT: v_and_b32_e32 v0, v2, v0 934; GFX9-NEXT: v_and_b32_e32 v1, v2, v1 935; GFX9-NEXT: v_lshl_or_b32 v0, v7, 16, v0 936; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v1 937; GFX9-NEXT: s_setpc_b64 s[30:31] 938; 939; R600-LABEL: v_fshr_v4i16: 940; R600: ; %bb.0: 941; R600-NEXT: CF_END 942; R600-NEXT: PAD 943; 944; GFX10-LABEL: v_fshr_v4i16: 945; GFX10: ; %bb.0: 946; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 947; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 948; 
GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v1 949; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5 950; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v3 951; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v4 952; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 953; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6 954; GFX10-NEXT: v_xor_b32_e32 v9, -1, v7 955; GFX10-NEXT: v_lshrrev_b16 v7, v7, v8 956; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v0 957; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 958; GFX10-NEXT: v_xor_b32_e32 v12, -1, v5 959; GFX10-NEXT: v_lshlrev_b16 v6, v9, v6 960; GFX10-NEXT: v_xor_b32_e32 v9, -1, v4 961; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 962; GFX10-NEXT: v_lshlrev_b16 v8, 1, v8 963; GFX10-NEXT: v_xor_b32_e32 v13, -1, v11 964; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2 965; GFX10-NEXT: v_lshlrev_b16 v0, v9, v0 966; GFX10-NEXT: v_lshlrev_b16 v1, v12, v1 967; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3 968; GFX10-NEXT: v_lshrrev_b16 v4, v11, v10 969; GFX10-NEXT: v_lshlrev_b16 v5, v13, v8 970; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 971; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff 972; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 973; GFX10-NEXT: v_or_b32_e32 v3, v6, v7 974; GFX10-NEXT: v_or_b32_e32 v4, v5, v4 975; GFX10-NEXT: v_and_b32_e32 v0, v2, v0 976; GFX10-NEXT: v_and_b32_e32 v1, v2, v1 977; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0 978; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 979; GFX10-NEXT: s_setpc_b64 s[30:31] 980 %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) 981 ret <4 x i16> %ret 982} 983 984define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) { 985; SI-LABEL: v_fshr_i64: 986; SI: ; %bb.0: 987; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 988; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 989; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v4 990; SI-NEXT: v_not_b32_e32 v4, v4 991; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 992; SI-NEXT: v_or_b32_e32 v1, v1, v3 993; SI-NEXT: v_or_b32_e32 v0, v0, v2 994; SI-NEXT: s_setpc_b64 s[30:31] 995; 996; VI-LABEL: v_fshr_i64: 997; VI: ; %bb.0: 998; VI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 999; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] 1000; VI-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] 1001; VI-NEXT: v_not_b32_e32 v4, v4 1002; VI-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] 1003; VI-NEXT: v_or_b32_e32 v1, v1, v3 1004; VI-NEXT: v_or_b32_e32 v0, v0, v2 1005; VI-NEXT: s_setpc_b64 s[30:31] 1006; 1007; GFX9-LABEL: v_fshr_i64: 1008; GFX9: ; %bb.0: 1009; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1010; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] 1011; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] 1012; GFX9-NEXT: v_not_b32_e32 v4, v4 1013; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] 1014; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 1015; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 1016; GFX9-NEXT: s_setpc_b64 s[30:31] 1017; 1018; R600-LABEL: v_fshr_i64: 1019; R600: ; %bb.0: 1020; R600-NEXT: CF_END 1021; R600-NEXT: PAD 1022; 1023; GFX10-LABEL: v_fshr_i64: 1024; GFX10: ; %bb.0: 1025; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1026; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1027; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] 1028; GFX10-NEXT: v_not_b32_e32 v5, v4 1029; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] 1030; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] 1031; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 1032; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 1033; GFX10-NEXT: s_setpc_b64 s[30:31] 1034 %ret = call i64 @llvm.fshr.i64(i64 %src0, i64 %src1, i64 %src2) 1035 ret i64 %ret 1036} 1037 1038define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) { 1039; SI-LABEL: v_fshr_v2i64: 1040; SI: ; %bb.0: 1041; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1042; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 1043; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], v8 1044; SI-NEXT: v_not_b32_e32 v8, v8 1045; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v8 1046; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 1047; SI-NEXT: v_or_b32_e32 v1, v1, v5 1048; SI-NEXT: v_lshr_b64 v[5:6], v[6:7], v10 1049; SI-NEXT: v_not_b32_e32 v7, v10 1050; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], 
v7 1051; SI-NEXT: v_or_b32_e32 v0, v0, v4 1052; SI-NEXT: v_or_b32_e32 v3, v3, v6 1053; SI-NEXT: v_or_b32_e32 v2, v2, v5 1054; SI-NEXT: s_setpc_b64 s[30:31] 1055; 1056; VI-LABEL: v_fshr_v2i64: 1057; VI: ; %bb.0: 1058; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1059; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] 1060; VI-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] 1061; VI-NEXT: v_not_b32_e32 v8, v8 1062; VI-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] 1063; VI-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] 1064; VI-NEXT: v_or_b32_e32 v1, v1, v5 1065; VI-NEXT: v_lshrrev_b64 v[5:6], v10, v[6:7] 1066; VI-NEXT: v_not_b32_e32 v7, v10 1067; VI-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3] 1068; VI-NEXT: v_or_b32_e32 v0, v0, v4 1069; VI-NEXT: v_or_b32_e32 v3, v3, v6 1070; VI-NEXT: v_or_b32_e32 v2, v2, v5 1071; VI-NEXT: s_setpc_b64 s[30:31] 1072; 1073; GFX9-LABEL: v_fshr_v2i64: 1074; GFX9: ; %bb.0: 1075; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1076; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] 1077; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] 1078; GFX9-NEXT: v_not_b32_e32 v8, v8 1079; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] 1080; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] 1081; GFX9-NEXT: v_or_b32_e32 v1, v1, v5 1082; GFX9-NEXT: v_lshrrev_b64 v[5:6], v10, v[6:7] 1083; GFX9-NEXT: v_not_b32_e32 v7, v10 1084; GFX9-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3] 1085; GFX9-NEXT: v_or_b32_e32 v0, v0, v4 1086; GFX9-NEXT: v_or_b32_e32 v3, v3, v6 1087; GFX9-NEXT: v_or_b32_e32 v2, v2, v5 1088; GFX9-NEXT: s_setpc_b64 s[30:31] 1089; 1090; R600-LABEL: v_fshr_v2i64: 1091; R600: ; %bb.0: 1092; R600-NEXT: CF_END 1093; R600-NEXT: PAD 1094; 1095; GFX10-LABEL: v_fshr_v2i64: 1096; GFX10: ; %bb.0: 1097; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1098; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1099; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] 1100; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] 1101; GFX10-NEXT: v_not_b32_e32 v9, v8 1102; GFX10-NEXT: v_not_b32_e32 v11, v10 1103; GFX10-NEXT: v_lshrrev_b64 v[4:5], 
v8, v[4:5] 1104; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7] 1105; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] 1106; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] 1107; GFX10-NEXT: v_or_b32_e32 v0, v0, v4 1108; GFX10-NEXT: v_or_b32_e32 v1, v1, v5 1109; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 1110; GFX10-NEXT: v_or_b32_e32 v3, v3, v7 1111; GFX10-NEXT: s_setpc_b64 s[30:31] 1112 %ret = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) 1113 ret <2 x i64> %ret 1114} 1115 1116define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) { 1117; SI-LABEL: v_fshr_i24: 1118; SI: ; %bb.0: 1119; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1120; SI-NEXT: s_mov_b32 s4, 0xaaaaaaab 1121; SI-NEXT: v_mul_hi_u32 v3, v2, s4 1122; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1123; SI-NEXT: v_lshrrev_b32_e32 v3, 4, v3 1124; SI-NEXT: v_mul_lo_u32 v3, v3, 24 1125; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 1126; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v2 1127; SI-NEXT: v_alignbit_b32 v0, v0, v1, v2 1128; SI-NEXT: s_setpc_b64 s[30:31] 1129; 1130; VI-LABEL: v_fshr_i24: 1131; VI: ; %bb.0: 1132; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1133; VI-NEXT: s_mov_b32 s4, 0xaaaaaaab 1134; VI-NEXT: v_mul_hi_u32 v3, v2, s4 1135; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1136; VI-NEXT: v_lshrrev_b32_e32 v3, 4, v3 1137; VI-NEXT: v_mul_lo_u32 v3, v3, 24 1138; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 1139; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v2 1140; VI-NEXT: v_alignbit_b32 v0, v0, v1, v2 1141; VI-NEXT: s_setpc_b64 s[30:31] 1142; 1143; GFX9-LABEL: v_fshr_i24: 1144; GFX9: ; %bb.0: 1145; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1146; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaaab 1147; GFX9-NEXT: v_mul_hi_u32 v3, v2, s4 1148; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1149; GFX9-NEXT: v_lshrrev_b32_e32 v3, 4, v3 1150; GFX9-NEXT: v_mul_lo_u32 v3, v3, 24 1151; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 1152; GFX9-NEXT: v_add_u32_e32 v2, 8, v2 1153; GFX9-NEXT: v_alignbit_b32 v0, v0, v1, v2 1154; GFX9-NEXT: 
s_setpc_b64 s[30:31] 1155; 1156; R600-LABEL: v_fshr_i24: 1157; R600: ; %bb.0: 1158; R600-NEXT: CF_END 1159; R600-NEXT: PAD 1160; 1161; GFX10-LABEL: v_fshr_i24: 1162; GFX10: ; %bb.0: 1163; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1164; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1165; GFX10-NEXT: v_mul_hi_u32 v3, 0xaaaaaaab, v2 1166; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1167; GFX10-NEXT: v_lshrrev_b32_e32 v3, 4, v3 1168; GFX10-NEXT: v_mul_lo_u32 v3, v3, 24 1169; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3 1170; GFX10-NEXT: v_add_nc_u32_e32 v2, 8, v2 1171; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, v2 1172; GFX10-NEXT: s_setpc_b64 s[30:31] 1173 %ret = call i24 @llvm.fshr.i24(i24 %src0, i24 %src1, i24 %src2) 1174 ret i24 %ret 1175} 1176 1177define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) { 1178; SI-LABEL: v_fshr_v2i24: 1179; SI: ; %bb.0: 1180; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1181; SI-NEXT: s_mov_b32 s4, 0xaaaaaaab 1182; SI-NEXT: v_mul_hi_u32 v6, v4, s4 1183; SI-NEXT: v_mul_hi_u32 v7, v5, s4 1184; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 1185; SI-NEXT: v_lshrrev_b32_e32 v6, 4, v6 1186; SI-NEXT: v_mul_lo_u32 v6, v6, 24 1187; SI-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 1188; SI-NEXT: v_lshrrev_b32_e32 v6, 4, v7 1189; SI-NEXT: v_mul_lo_u32 v6, v6, 24 1190; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v4 1191; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4 1192; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 1193; SI-NEXT: v_sub_i32_e32 v3, vcc, v5, v6 1194; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v3 1195; SI-NEXT: v_alignbit_b32 v1, v1, v2, v3 1196; SI-NEXT: s_setpc_b64 s[30:31] 1197; 1198; VI-LABEL: v_fshr_v2i24: 1199; VI: ; %bb.0: 1200; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1201; VI-NEXT: s_mov_b32 s4, 0xaaaaaaab 1202; VI-NEXT: v_mul_hi_u32 v6, v4, s4 1203; VI-NEXT: v_mul_hi_u32 v7, v5, s4 1204; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 1205; VI-NEXT: v_lshrrev_b32_e32 v6, 4, v6 1206; VI-NEXT: v_mul_lo_u32 v6, v6, 24 1207; VI-NEXT: v_sub_u32_e32 v4, vcc, 
v4, v6 1208; VI-NEXT: v_lshrrev_b32_e32 v6, 4, v7 1209; VI-NEXT: v_mul_lo_u32 v6, v6, 24 1210; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v4 1211; VI-NEXT: v_alignbit_b32 v0, v0, v2, v4 1212; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 1213; VI-NEXT: v_sub_u32_e32 v3, vcc, v5, v6 1214; VI-NEXT: v_add_u32_e32 v3, vcc, 8, v3 1215; VI-NEXT: v_alignbit_b32 v1, v1, v2, v3 1216; VI-NEXT: s_setpc_b64 s[30:31] 1217; 1218; GFX9-LABEL: v_fshr_v2i24: 1219; GFX9: ; %bb.0: 1220; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1221; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaaab 1222; GFX9-NEXT: v_mul_hi_u32 v6, v4, s4 1223; GFX9-NEXT: v_mul_hi_u32 v7, v5, s4 1224; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 1225; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v6 1226; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 1227; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6 1228; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v7 1229; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 1230; GFX9-NEXT: v_add_u32_e32 v4, 8, v4 1231; GFX9-NEXT: v_alignbit_b32 v0, v0, v2, v4 1232; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v3 1233; GFX9-NEXT: v_sub_u32_e32 v3, v5, v6 1234; GFX9-NEXT: v_add_u32_e32 v3, 8, v3 1235; GFX9-NEXT: v_alignbit_b32 v1, v1, v2, v3 1236; GFX9-NEXT: s_setpc_b64 s[30:31] 1237; 1238; R600-LABEL: v_fshr_v2i24: 1239; R600: ; %bb.0: 1240; R600-NEXT: CF_END 1241; R600-NEXT: PAD 1242; 1243; GFX10-LABEL: v_fshr_v2i24: 1244; GFX10: ; %bb.0: 1245; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1246; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1247; GFX10-NEXT: s_mov_b32 s4, 0xaaaaaaab 1248; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2 1249; GFX10-NEXT: v_mul_hi_u32 v6, v4, s4 1250; GFX10-NEXT: v_mul_hi_u32 v7, v5, s4 1251; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3 1252; GFX10-NEXT: v_lshrrev_b32_e32 v6, 4, v6 1253; GFX10-NEXT: v_lshrrev_b32_e32 v7, 4, v7 1254; GFX10-NEXT: v_mul_lo_u32 v6, v6, 24 1255; GFX10-NEXT: v_mul_lo_u32 v7, v7, 24 1256; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v6 1257; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v7 1258; GFX10-NEXT: v_add_nc_u32_e32 v4, 8, v4 1259; 
GFX10-NEXT: v_add_nc_u32_e32 v5, 8, v5 1260; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4 1261; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5 1262; GFX10-NEXT: s_setpc_b64 s[30:31] 1263 %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) 1264 ret <2 x i24> %ret 1265} 1266