1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,SI 3; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,VI 4; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,GFX9 5; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=R600 6; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10 7; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX11 8 9declare i32 @llvm.fshr.i32(i32, i32, i32) 10declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) 11declare <3 x i32> @llvm.fshr.v3i32(<3 x i32>, <3 x i32>, <3 x i32>) 12declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) 13declare i16 @llvm.fshr.i16(i16, i16, i16) 14declare <2 x i16> @llvm.fshr.v2i16(<2 x i16>, <2 x i16>, <2 x i16>) 15declare <3 x i16> @llvm.fshr.v3i16(<3 x i16>, <3 x i16>, <3 x i16>) 16declare <4 x i16> @llvm.fshr.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) 17declare i64 @llvm.fshr.i64(i64, i64, i64) 18declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) 19declare i24 @llvm.fshr.i24(i24, i24, i24) 20declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>) 21 22define amdgpu_kernel void @fshr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) { 23; SI-LABEL: fshr_i32: 24; SI: ; %bb.0: ; %entry 25; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 26; SI-NEXT: s_load_dword s6, s[0:1], 0xd 27; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 28; SI-NEXT: s_mov_b32 s3, 0xf000 29; SI-NEXT: s_mov_b32 s2, -1 30; SI-NEXT: s_waitcnt lgkmcnt(0) 31; SI-NEXT: v_mov_b32_e32 v0, s5 32; SI-NEXT: v_mov_b32_e32 v1, s6 33; SI-NEXT: v_alignbit_b32 v0, s4, v0, v1 34; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 35; SI-NEXT: 
s_endpgm 36; 37; VI-LABEL: fshr_i32: 38; VI: ; %bb.0: ; %entry 39; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 40; VI-NEXT: s_load_dword s4, s[0:1], 0x34 41; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 42; VI-NEXT: s_waitcnt lgkmcnt(0) 43; VI-NEXT: v_mov_b32_e32 v0, s3 44; VI-NEXT: v_mov_b32_e32 v1, s4 45; VI-NEXT: v_alignbit_b32 v2, s2, v0, v1 46; VI-NEXT: v_mov_b32_e32 v0, s0 47; VI-NEXT: v_mov_b32_e32 v1, s1 48; VI-NEXT: flat_store_dword v[0:1], v2 49; VI-NEXT: s_endpgm 50; 51; GFX9-LABEL: fshr_i32: 52; GFX9: ; %bb.0: ; %entry 53; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 54; GFX9-NEXT: s_load_dword s6, s[0:1], 0x34 55; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 56; GFX9-NEXT: v_mov_b32_e32 v0, 0 57; GFX9-NEXT: s_waitcnt lgkmcnt(0) 58; GFX9-NEXT: v_mov_b32_e32 v1, s3 59; GFX9-NEXT: v_mov_b32_e32 v2, s6 60; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, v2 61; GFX9-NEXT: global_store_dword v0, v1, s[4:5] 62; GFX9-NEXT: s_endpgm 63; 64; R600-LABEL: fshr_i32: 65; R600: ; %bb.0: ; %entry 66; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 67; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 68; R600-NEXT: CF_END 69; R600-NEXT: PAD 70; R600-NEXT: ALU clause starting at 4: 71; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 72; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 73; R600-NEXT: BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, KC0[3].X, 74; 75; GFX10-LABEL: fshr_i32: 76; GFX10: ; %bb.0: ; %entry 77; GFX10-NEXT: s_clause 0x2 78; GFX10-NEXT: s_load_dword s6, s[0:1], 0x34 79; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 80; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 81; GFX10-NEXT: v_mov_b32_e32 v1, 0 82; GFX10-NEXT: s_waitcnt lgkmcnt(0) 83; GFX10-NEXT: v_mov_b32_e32 v0, s6 84; GFX10-NEXT: v_alignbit_b32 v0, s2, s3, v0 85; GFX10-NEXT: global_store_dword v1, v0, s[4:5] 86; GFX10-NEXT: s_endpgm 87; 88; GFX11-LABEL: fshr_i32: 89; GFX11: ; %bb.0: ; %entry 90; GFX11-NEXT: s_clause 0x2 91; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x34 92; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 
0x2c 93; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 94; GFX11-NEXT: s_waitcnt lgkmcnt(0) 95; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 96; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 97; GFX11-NEXT: v_alignbit_b32 v0, s2, s3, v0 98; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 99; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 100; GFX11-NEXT: s_endpgm 101entry: 102 %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z) 103 store i32 %0, i32 addrspace(1)* %in 104 ret void 105} 106 107define amdgpu_kernel void @fshr_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) { 108; SI-LABEL: fshr_i32_imm: 109; SI: ; %bb.0: ; %entry 110; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 111; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 112; SI-NEXT: s_mov_b32 s3, 0xf000 113; SI-NEXT: s_mov_b32 s2, -1 114; SI-NEXT: s_waitcnt lgkmcnt(0) 115; SI-NEXT: v_mov_b32_e32 v0, s5 116; SI-NEXT: v_alignbit_b32 v0, s4, v0, 7 117; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 118; SI-NEXT: s_endpgm 119; 120; VI-LABEL: fshr_i32_imm: 121; VI: ; %bb.0: ; %entry 122; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 123; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 124; VI-NEXT: s_waitcnt lgkmcnt(0) 125; VI-NEXT: v_mov_b32_e32 v0, s3 126; VI-NEXT: v_alignbit_b32 v2, s2, v0, 7 127; VI-NEXT: v_mov_b32_e32 v0, s0 128; VI-NEXT: v_mov_b32_e32 v1, s1 129; VI-NEXT: flat_store_dword v[0:1], v2 130; VI-NEXT: s_endpgm 131; 132; GFX9-LABEL: fshr_i32_imm: 133; GFX9: ; %bb.0: ; %entry 134; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 135; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 136; GFX9-NEXT: v_mov_b32_e32 v0, 0 137; GFX9-NEXT: s_waitcnt lgkmcnt(0) 138; GFX9-NEXT: v_mov_b32_e32 v1, s3 139; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 7 140; GFX9-NEXT: global_store_dword v0, v1, s[4:5] 141; GFX9-NEXT: s_endpgm 142; 143; R600-LABEL: fshr_i32_imm: 144; R600: ; %bb.0: ; %entry 145; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 146; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 147; R600-NEXT: CF_END 
148; R600-NEXT: PAD 149; R600-NEXT: ALU clause starting at 4: 150; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 151; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 152; R600-NEXT: BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x, 153; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) 154; 155; GFX10-LABEL: fshr_i32_imm: 156; GFX10: ; %bb.0: ; %entry 157; GFX10-NEXT: s_clause 0x1 158; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 159; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 160; GFX10-NEXT: v_mov_b32_e32 v0, 0 161; GFX10-NEXT: s_waitcnt lgkmcnt(0) 162; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 7 163; GFX10-NEXT: global_store_dword v0, v1, s[4:5] 164; GFX10-NEXT: s_endpgm 165; 166; GFX11-LABEL: fshr_i32_imm: 167; GFX11: ; %bb.0: ; %entry 168; GFX11-NEXT: s_clause 0x1 169; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c 170; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 171; GFX11-NEXT: v_mov_b32_e32 v0, 0 172; GFX11-NEXT: s_waitcnt lgkmcnt(0) 173; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 7 174; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 175; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 176; GFX11-NEXT: s_endpgm 177entry: 178 %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7) 179 store i32 %0, i32 addrspace(1)* %in 180 ret void 181} 182 183define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) { 184; SI-LABEL: fshr_v2i32: 185; SI: ; %bb.0: ; %entry 186; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 187; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf 188; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 189; SI-NEXT: s_mov_b32 s3, 0xf000 190; SI-NEXT: s_mov_b32 s2, -1 191; SI-NEXT: s_waitcnt lgkmcnt(0) 192; SI-NEXT: v_mov_b32_e32 v0, s7 193; SI-NEXT: v_mov_b32_e32 v1, s9 194; SI-NEXT: v_alignbit_b32 v1, s5, v0, v1 195; SI-NEXT: v_mov_b32_e32 v0, s6 196; SI-NEXT: v_mov_b32_e32 v2, s8 197; SI-NEXT: v_alignbit_b32 v0, s4, v0, v2 198; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 199; SI-NEXT: s_endpgm 200; 201; VI-LABEL: 
fshr_v2i32: 202; VI: ; %bb.0: ; %entry 203; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 204; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c 205; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 206; VI-NEXT: s_waitcnt lgkmcnt(0) 207; VI-NEXT: v_mov_b32_e32 v0, s7 208; VI-NEXT: v_mov_b32_e32 v1, s3 209; VI-NEXT: v_mov_b32_e32 v2, s6 210; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1 211; VI-NEXT: v_mov_b32_e32 v0, s2 212; VI-NEXT: v_alignbit_b32 v0, s4, v2, v0 213; VI-NEXT: v_mov_b32_e32 v3, s1 214; VI-NEXT: v_mov_b32_e32 v2, s0 215; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 216; VI-NEXT: s_endpgm 217; 218; GFX9-LABEL: fshr_v2i32: 219; GFX9: ; %bb.0: ; %entry 220; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 221; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c 222; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 223; GFX9-NEXT: v_mov_b32_e32 v2, 0 224; GFX9-NEXT: s_waitcnt lgkmcnt(0) 225; GFX9-NEXT: v_mov_b32_e32 v0, s7 226; GFX9-NEXT: v_mov_b32_e32 v1, s3 227; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 228; GFX9-NEXT: v_mov_b32_e32 v0, s6 229; GFX9-NEXT: v_mov_b32_e32 v3, s2 230; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v3 231; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] 232; GFX9-NEXT: s_endpgm 233; 234; R600-LABEL: fshr_v2i32: 235; R600: ; %bb.0: ; %entry 236; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 237; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 238; R600-NEXT: CF_END 239; R600-NEXT: PAD 240; R600-NEXT: ALU clause starting at 4: 241; R600-NEXT: MOV * T0.W, KC0[4].X, 242; R600-NEXT: BIT_ALIGN_INT T0.Y, KC0[3].X, KC0[3].Z, PV.W, 243; R600-NEXT: MOV * T0.W, KC0[3].W, 244; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, PV.W, 245; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 246; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 247; 248; GFX10-LABEL: fshr_v2i32: 249; GFX10: ; %bb.0: ; %entry 250; GFX10-NEXT: s_clause 0x2 251; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c 252; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 253; GFX10-NEXT: s_load_dwordx2 
s[8:9], s[0:1], 0x24 254; GFX10-NEXT: v_mov_b32_e32 v3, 0 255; GFX10-NEXT: s_waitcnt lgkmcnt(0) 256; GFX10-NEXT: v_mov_b32_e32 v0, s3 257; GFX10-NEXT: v_mov_b32_e32 v2, s2 258; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, v0 259; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, v2 260; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[8:9] 261; GFX10-NEXT: s_endpgm 262; 263; GFX11-LABEL: fshr_v2i32: 264; GFX11: ; %bb.0: ; %entry 265; GFX11-NEXT: s_clause 0x2 266; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c 267; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c 268; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 269; GFX11-NEXT: s_waitcnt lgkmcnt(0) 270; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 271; GFX11-NEXT: v_mov_b32_e32 v2, s2 272; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 273; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, v0 274; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, v2 275; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] 276; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 277; GFX11-NEXT: s_endpgm 278entry: 279 %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z) 280 store <2 x i32> %0, <2 x i32> addrspace(1)* %in 281 ret void 282} 283 284define amdgpu_kernel void @fshr_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { 285; SI-LABEL: fshr_v2i32_imm: 286; SI: ; %bb.0: ; %entry 287; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 288; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 289; SI-NEXT: s_mov_b32 s3, 0xf000 290; SI-NEXT: s_mov_b32 s2, -1 291; SI-NEXT: s_waitcnt lgkmcnt(0) 292; SI-NEXT: v_mov_b32_e32 v0, s7 293; SI-NEXT: v_mov_b32_e32 v2, s6 294; SI-NEXT: v_alignbit_b32 v1, s5, v0, 9 295; SI-NEXT: v_alignbit_b32 v0, s4, v2, 7 296; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 297; SI-NEXT: s_endpgm 298; 299; VI-LABEL: fshr_v2i32_imm: 300; VI: ; %bb.0: ; %entry 301; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 302; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 303; VI-NEXT: s_waitcnt 
lgkmcnt(0) 304; VI-NEXT: v_mov_b32_e32 v0, s7 305; VI-NEXT: v_mov_b32_e32 v2, s6 306; VI-NEXT: v_alignbit_b32 v1, s5, v0, 9 307; VI-NEXT: v_alignbit_b32 v0, s4, v2, 7 308; VI-NEXT: v_mov_b32_e32 v3, s1 309; VI-NEXT: v_mov_b32_e32 v2, s0 310; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 311; VI-NEXT: s_endpgm 312; 313; GFX9-LABEL: fshr_v2i32_imm: 314; GFX9: ; %bb.0: ; %entry 315; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 316; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 317; GFX9-NEXT: v_mov_b32_e32 v2, 0 318; GFX9-NEXT: s_waitcnt lgkmcnt(0) 319; GFX9-NEXT: v_mov_b32_e32 v0, s7 320; GFX9-NEXT: v_mov_b32_e32 v3, s6 321; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 9 322; GFX9-NEXT: v_alignbit_b32 v0, s4, v3, 7 323; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 324; GFX9-NEXT: s_endpgm 325; 326; R600-LABEL: fshr_v2i32_imm: 327; R600: ; %bb.0: ; %entry 328; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 329; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 330; R600-NEXT: CF_END 331; R600-NEXT: PAD 332; R600-NEXT: ALU clause starting at 4: 333; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x, 334; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00) 335; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x, 336; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) 337; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 338; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 339; 340; GFX10-LABEL: fshr_v2i32_imm: 341; GFX10: ; %bb.0: ; %entry 342; GFX10-NEXT: s_clause 0x1 343; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 344; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 345; GFX10-NEXT: v_mov_b32_e32 v2, 0 346; GFX10-NEXT: s_waitcnt lgkmcnt(0) 347; GFX10-NEXT: v_alignbit_b32 v1, s5, s7, 9 348; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, 7 349; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 350; GFX10-NEXT: s_endpgm 351; 352; GFX11-LABEL: fshr_v2i32_imm: 353; GFX11: ; %bb.0: ; %entry 354; GFX11-NEXT: s_clause 0x1 355; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c 356; 
GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 357; GFX11-NEXT: v_mov_b32_e32 v2, 0 358; GFX11-NEXT: s_waitcnt lgkmcnt(0) 359; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 9 360; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, 7 361; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 362; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 363; GFX11-NEXT: s_endpgm 364entry: 365 %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>) 366 store <2 x i32> %0, <2 x i32> addrspace(1)* %in 367 ret void 368} 369 370define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { 371; SI-LABEL: fshr_v4i32: 372; SI: ; %bb.0: ; %entry 373; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 374; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x15 375; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 376; SI-NEXT: s_mov_b32 s3, 0xf000 377; SI-NEXT: s_mov_b32 s2, -1 378; SI-NEXT: s_waitcnt lgkmcnt(0) 379; SI-NEXT: v_mov_b32_e32 v0, s11 380; SI-NEXT: v_mov_b32_e32 v1, s15 381; SI-NEXT: v_alignbit_b32 v3, s7, v0, v1 382; SI-NEXT: v_mov_b32_e32 v0, s10 383; SI-NEXT: v_mov_b32_e32 v1, s14 384; SI-NEXT: v_alignbit_b32 v2, s6, v0, v1 385; SI-NEXT: v_mov_b32_e32 v0, s9 386; SI-NEXT: v_mov_b32_e32 v1, s13 387; SI-NEXT: v_alignbit_b32 v1, s5, v0, v1 388; SI-NEXT: v_mov_b32_e32 v0, s8 389; SI-NEXT: v_mov_b32_e32 v4, s12 390; SI-NEXT: v_alignbit_b32 v0, s4, v0, v4 391; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 392; SI-NEXT: s_endpgm 393; 394; VI-LABEL: fshr_v4i32: 395; VI: ; %bb.0: ; %entry 396; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 397; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 398; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 399; VI-NEXT: s_waitcnt lgkmcnt(0) 400; VI-NEXT: v_mov_b32_e32 v0, s11 401; VI-NEXT: v_mov_b32_e32 v1, s15 402; VI-NEXT: v_mov_b32_e32 v2, s10 403; VI-NEXT: v_alignbit_b32 v3, s7, v0, v1 404; VI-NEXT: v_mov_b32_e32 v0, s14 405; VI-NEXT: v_alignbit_b32 v2, s6, v2, v0 406; VI-NEXT: v_mov_b32_e32 v0, s9 
407; VI-NEXT: v_mov_b32_e32 v1, s13 408; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1 409; VI-NEXT: v_mov_b32_e32 v0, s8 410; VI-NEXT: v_mov_b32_e32 v4, s12 411; VI-NEXT: v_alignbit_b32 v0, s4, v0, v4 412; VI-NEXT: v_mov_b32_e32 v5, s1 413; VI-NEXT: v_mov_b32_e32 v4, s0 414; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 415; VI-NEXT: s_endpgm 416; 417; GFX9-LABEL: fshr_v4i32: 418; GFX9: ; %bb.0: ; %entry 419; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 420; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 421; GFX9-NEXT: v_mov_b32_e32 v4, 0 422; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 423; GFX9-NEXT: s_waitcnt lgkmcnt(0) 424; GFX9-NEXT: v_mov_b32_e32 v0, s11 425; GFX9-NEXT: v_mov_b32_e32 v1, s15 426; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, v1 427; GFX9-NEXT: v_mov_b32_e32 v0, s10 428; GFX9-NEXT: v_mov_b32_e32 v1, s14 429; GFX9-NEXT: v_alignbit_b32 v2, s6, v0, v1 430; GFX9-NEXT: v_mov_b32_e32 v0, s9 431; GFX9-NEXT: v_mov_b32_e32 v1, s13 432; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, v1 433; GFX9-NEXT: v_mov_b32_e32 v0, s8 434; GFX9-NEXT: v_mov_b32_e32 v5, s12 435; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v5 436; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 437; GFX9-NEXT: s_endpgm 438; 439; R600-LABEL: fshr_v4i32: 440; R600: ; %bb.0: ; %entry 441; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[] 442; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 443; R600-NEXT: CF_END 444; R600-NEXT: PAD 445; R600-NEXT: ALU clause starting at 4: 446; R600-NEXT: MOV * T0.W, KC0[6].X, 447; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, PV.W, 448; R600-NEXT: MOV * T1.W, KC0[5].W, 449; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, PV.W, 450; R600-NEXT: MOV * T1.W, KC0[5].Z, 451; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, PV.W, 452; R600-NEXT: MOV * T1.W, KC0[5].Y, 453; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, PV.W, 454; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 455; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 456; 457; GFX10-LABEL: fshr_v4i32: 
458; GFX10: ; %bb.0: ; %entry 459; GFX10-NEXT: s_clause 0x2 460; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54 461; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 462; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 463; GFX10-NEXT: v_mov_b32_e32 v6, 0 464; GFX10-NEXT: s_waitcnt lgkmcnt(0) 465; GFX10-NEXT: v_mov_b32_e32 v0, s15 466; GFX10-NEXT: v_mov_b32_e32 v1, s14 467; GFX10-NEXT: v_mov_b32_e32 v4, s13 468; GFX10-NEXT: v_mov_b32_e32 v5, s12 469; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, v0 470; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, v1 471; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, v4 472; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, v5 473; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] 474; GFX10-NEXT: s_endpgm 475; 476; GFX11-LABEL: fshr_v4i32: 477; GFX11: ; %bb.0: ; %entry 478; GFX11-NEXT: s_clause 0x2 479; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x54 480; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 481; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 482; GFX11-NEXT: v_mov_b32_e32 v6, 0 483; GFX11-NEXT: s_waitcnt lgkmcnt(0) 484; GFX11-NEXT: v_dual_mov_b32 v0, s15 :: v_dual_mov_b32 v1, s14 485; GFX11-NEXT: v_dual_mov_b32 v4, s13 :: v_dual_mov_b32 v5, s12 486; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) 487; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, v0 488; GFX11-NEXT: v_alignbit_b32 v2, s6, s10, v1 489; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) 490; GFX11-NEXT: v_alignbit_b32 v1, s5, s9, v4 491; GFX11-NEXT: v_alignbit_b32 v0, s4, s8, v5 492; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] 493; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 494; GFX11-NEXT: s_endpgm 495entry: 496 %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) 497 store <4 x i32> %0, <4 x i32> addrspace(1)* %in 498 ret void 499} 500 501define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { 502; SI-LABEL: fshr_v4i32_imm: 503; SI: ; %bb.0: 
; %entry 504; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 505; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 506; SI-NEXT: s_mov_b32 s3, 0xf000 507; SI-NEXT: s_mov_b32 s2, -1 508; SI-NEXT: s_waitcnt lgkmcnt(0) 509; SI-NEXT: v_mov_b32_e32 v0, s11 510; SI-NEXT: v_mov_b32_e32 v1, s10 511; SI-NEXT: v_alignbit_b32 v3, s7, v0, 1 512; SI-NEXT: v_mov_b32_e32 v0, s9 513; SI-NEXT: v_alignbit_b32 v2, s6, v1, 9 514; SI-NEXT: v_alignbit_b32 v1, s5, v0, 7 515; SI-NEXT: v_mov_b32_e32 v0, s8 516; SI-NEXT: v_alignbit_b32 v0, s4, v0, 1 517; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 518; SI-NEXT: s_endpgm 519; 520; VI-LABEL: fshr_v4i32_imm: 521; VI: ; %bb.0: ; %entry 522; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 523; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 524; VI-NEXT: s_waitcnt lgkmcnt(0) 525; VI-NEXT: v_mov_b32_e32 v0, s11 526; VI-NEXT: v_mov_b32_e32 v1, s10 527; VI-NEXT: v_mov_b32_e32 v4, s9 528; VI-NEXT: v_alignbit_b32 v3, s7, v0, 1 529; VI-NEXT: v_alignbit_b32 v2, s6, v1, 9 530; VI-NEXT: v_alignbit_b32 v1, s5, v4, 7 531; VI-NEXT: v_mov_b32_e32 v0, s8 532; VI-NEXT: v_mov_b32_e32 v5, s1 533; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1 534; VI-NEXT: v_mov_b32_e32 v4, s0 535; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 536; VI-NEXT: s_endpgm 537; 538; GFX9-LABEL: fshr_v4i32_imm: 539; GFX9: ; %bb.0: ; %entry 540; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 541; GFX9-NEXT: v_mov_b32_e32 v4, 0 542; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 543; GFX9-NEXT: s_waitcnt lgkmcnt(0) 544; GFX9-NEXT: v_mov_b32_e32 v0, s11 545; GFX9-NEXT: v_mov_b32_e32 v1, s10 546; GFX9-NEXT: v_alignbit_b32 v3, s7, v0, 1 547; GFX9-NEXT: v_mov_b32_e32 v0, s9 548; GFX9-NEXT: v_alignbit_b32 v2, s6, v1, 9 549; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 7 550; GFX9-NEXT: v_mov_b32_e32 v0, s8 551; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 552; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 553; GFX9-NEXT: s_endpgm 554; 555; R600-LABEL: fshr_v4i32_imm: 556; R600: ; %bb.0: ; %entry 557; R600-NEXT: 
ALU 7, @4, KC0[CB0:0-32], KC1[] 558; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 559; R600-NEXT: CF_END 560; R600-NEXT: PAD 561; R600-NEXT: ALU clause starting at 4: 562; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1, 563; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x, 564; R600-NEXT: 9(1.261169e-44), 0(0.000000e+00) 565; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x, 566; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00) 567; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, 1, 568; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 569; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 570; 571; GFX10-LABEL: fshr_v4i32_imm: 572; GFX10: ; %bb.0: ; %entry 573; GFX10-NEXT: s_clause 0x1 574; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 575; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 576; GFX10-NEXT: v_mov_b32_e32 v4, 0 577; GFX10-NEXT: s_waitcnt lgkmcnt(0) 578; GFX10-NEXT: v_alignbit_b32 v3, s7, s11, 1 579; GFX10-NEXT: v_alignbit_b32 v2, s6, s10, 9 580; GFX10-NEXT: v_alignbit_b32 v1, s5, s9, 7 581; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, 1 582; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 583; GFX10-NEXT: s_endpgm 584; 585; GFX11-LABEL: fshr_v4i32_imm: 586; GFX11: ; %bb.0: ; %entry 587; GFX11-NEXT: s_clause 0x1 588; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 589; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 590; GFX11-NEXT: v_mov_b32_e32 v4, 0 591; GFX11-NEXT: s_waitcnt lgkmcnt(0) 592; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, 1 593; GFX11-NEXT: v_alignbit_b32 v2, s6, s10, 9 594; GFX11-NEXT: v_alignbit_b32 v1, s5, s9, 7 595; GFX11-NEXT: v_alignbit_b32 v0, s4, s8, 1 596; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] 597; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 598; GFX11-NEXT: s_endpgm 599entry: 600 %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>) 601 store <4 x i32> %0, <4 x i32> addrspace(1)* %in 602 ret void 603} 604 605define i32 
@v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) { 606; GFX89-LABEL: v_fshr_i32: 607; GFX89: ; %bb.0: 608; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 609; GFX89-NEXT: v_alignbit_b32 v0, v0, v1, v2 610; GFX89-NEXT: s_setpc_b64 s[30:31] 611; 612; R600-LABEL: v_fshr_i32: 613; R600: ; %bb.0: 614; R600-NEXT: CF_END 615; R600-NEXT: PAD 616; 617; GFX10-LABEL: v_fshr_i32: 618; GFX10: ; %bb.0: 619; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 620; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 621; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, v2 622; GFX10-NEXT: s_setpc_b64 s[30:31] 623; 624; GFX11-LABEL: v_fshr_i32: 625; GFX11: ; %bb.0: 626; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 627; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 628; GFX11-NEXT: v_alignbit_b32 v0, v0, v1, v2 629; GFX11-NEXT: s_setpc_b64 s[30:31] 630 %ret = call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2) 631 ret i32 %ret 632} 633 634define <2 x i32> @v_fshr_v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) { 635; GFX89-LABEL: v_fshr_v2i32: 636; GFX89: ; %bb.0: 637; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 638; GFX89-NEXT: v_alignbit_b32 v0, v0, v2, v4 639; GFX89-NEXT: v_alignbit_b32 v1, v1, v3, v5 640; GFX89-NEXT: s_setpc_b64 s[30:31] 641; 642; R600-LABEL: v_fshr_v2i32: 643; R600: ; %bb.0: 644; R600-NEXT: CF_END 645; R600-NEXT: PAD 646; 647; GFX10-LABEL: v_fshr_v2i32: 648; GFX10: ; %bb.0: 649; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 650; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 651; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4 652; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5 653; GFX10-NEXT: s_setpc_b64 s[30:31] 654; 655; GFX11-LABEL: v_fshr_v2i32: 656; GFX11: ; %bb.0: 657; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 658; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 659; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4 660; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5 661; GFX11-NEXT: s_setpc_b64 s[30:31] 662 %ret = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %src0, <2 x i32> %src1, 
<2 x i32> %src2) 663 ret <2 x i32> %ret 664} 665 666define <3 x i32> @v_fshr_v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) { 667; GFX89-LABEL: v_fshr_v3i32: 668; GFX89: ; %bb.0: 669; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 670; GFX89-NEXT: v_alignbit_b32 v0, v0, v3, v6 671; GFX89-NEXT: v_alignbit_b32 v1, v1, v4, v7 672; GFX89-NEXT: v_alignbit_b32 v2, v2, v5, v8 673; GFX89-NEXT: s_setpc_b64 s[30:31] 674; 675; R600-LABEL: v_fshr_v3i32: 676; R600: ; %bb.0: 677; R600-NEXT: CF_END 678; R600-NEXT: PAD 679; 680; GFX10-LABEL: v_fshr_v3i32: 681; GFX10: ; %bb.0: 682; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 683; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 684; GFX10-NEXT: v_alignbit_b32 v0, v0, v3, v6 685; GFX10-NEXT: v_alignbit_b32 v1, v1, v4, v7 686; GFX10-NEXT: v_alignbit_b32 v2, v2, v5, v8 687; GFX10-NEXT: s_setpc_b64 s[30:31] 688; 689; GFX11-LABEL: v_fshr_v3i32: 690; GFX11: ; %bb.0: 691; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 692; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 693; GFX11-NEXT: v_alignbit_b32 v0, v0, v3, v6 694; GFX11-NEXT: v_alignbit_b32 v1, v1, v4, v7 695; GFX11-NEXT: v_alignbit_b32 v2, v2, v5, v8 696; GFX11-NEXT: s_setpc_b64 s[30:31] 697 %ret = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) 698 ret <3 x i32> %ret 699} 700 701define <4 x i32> @v_fshr_v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) { 702; GFX89-LABEL: v_fshr_v4i32: 703; GFX89: ; %bb.0: 704; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 705; GFX89-NEXT: v_alignbit_b32 v0, v0, v4, v8 706; GFX89-NEXT: v_alignbit_b32 v1, v1, v5, v9 707; GFX89-NEXT: v_alignbit_b32 v2, v2, v6, v10 708; GFX89-NEXT: v_alignbit_b32 v3, v3, v7, v11 709; GFX89-NEXT: s_setpc_b64 s[30:31] 710; 711; R600-LABEL: v_fshr_v4i32: 712; R600: ; %bb.0: 713; R600-NEXT: CF_END 714; R600-NEXT: PAD 715; 716; GFX10-LABEL: v_fshr_v4i32: 717; GFX10: ; %bb.0: 718; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 719; GFX10-NEXT: s_waitcnt_vscnt 
null, 0x0 720; GFX10-NEXT: v_alignbit_b32 v0, v0, v4, v8 721; GFX10-NEXT: v_alignbit_b32 v1, v1, v5, v9 722; GFX10-NEXT: v_alignbit_b32 v2, v2, v6, v10 723; GFX10-NEXT: v_alignbit_b32 v3, v3, v7, v11 724; GFX10-NEXT: s_setpc_b64 s[30:31] 725; 726; GFX11-LABEL: v_fshr_v4i32: 727; GFX11: ; %bb.0: 728; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 729; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 730; GFX11-NEXT: v_alignbit_b32 v0, v0, v4, v8 731; GFX11-NEXT: v_alignbit_b32 v1, v1, v5, v9 732; GFX11-NEXT: v_alignbit_b32 v2, v2, v6, v10 733; GFX11-NEXT: v_alignbit_b32 v3, v3, v7, v11 734; GFX11-NEXT: s_setpc_b64 s[30:31] 735 %ret = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) 736 ret <4 x i32> %ret 737} 738 739define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) { 740; SI-LABEL: v_fshr_i16: 741; SI: ; %bb.0: 742; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 743; SI-NEXT: v_or_b32_e32 v2, 16, v2 744; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 745; SI-NEXT: v_alignbit_b32 v0, v0, v1, v2 746; SI-NEXT: s_setpc_b64 s[30:31] 747; 748; VI-LABEL: v_fshr_i16: 749; VI: ; %bb.0: 750; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 751; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0 752; VI-NEXT: v_xor_b32_e32 v3, -1, v2 753; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0 754; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1 755; VI-NEXT: v_or_b32_e32 v0, v0, v1 756; VI-NEXT: s_setpc_b64 s[30:31] 757; 758; GFX9-LABEL: v_fshr_i16: 759; GFX9: ; %bb.0: 760; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 761; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 762; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2 763; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 764; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 765; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 766; GFX9-NEXT: s_setpc_b64 s[30:31] 767; 768; R600-LABEL: v_fshr_i16: 769; R600: ; %bb.0: 770; R600-NEXT: CF_END 771; R600-NEXT: PAD 772; 773; GFX10-LABEL: v_fshr_i16: 774; GFX10: ; %bb.0: 775; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 776; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1
; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fshr_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0
; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2
; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b16 v0, v3, v0
; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %ret = call i16 @llvm.fshr.i16(i16 %src0, i16 %src1, i16 %src2)
  ret i16 %ret
}

; v_fshr_v2i16 returns llvm.fshr.v2i16 of its three <2 x i16> arguments from a
; function that is not marked amdgpu_kernel. The assertions in the body are
; autogenerated (see the NOTE on the first line of the file), so regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.
; Recorded output uses one v_alignbit_b32 per element on tahiti, 16-bit SDWA
; shifts on tonga, and packed v_pk_* shifts with both shift amounts masked by
; 0xf000f on gfx900, gfx1010 and gfx1100. The redwood (r600) checks contain
; only CF_END and PAD.
define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) {
; SI-LABEL: v_fshr_v2i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_or_b32_e32 v5, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_alignbit_b32 v1, v1, v3, v5
; SI-NEXT: v_or_b32_e32 v3, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_alignbit_b32 v0, v0, v2, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; VI-NEXT: v_mov_b32_e32 v5, 1
; VI-NEXT: v_lshrrev_b16_sdwa v4, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_xor_b32_e32 v3, -1, v3
; VI-NEXT: v_lshlrev_b16_e32 v3, v3, v5
; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
; VI-NEXT: v_xor_b32_e32 v4, -1, v2
; VI-NEXT: v_lshlrev_b16_e32 v0, v4, v0
; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2
; GFX9-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
; GFX9-NEXT: v_and_b32_e32 v3, 0xf000f, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xf000f, v2
; GFX9-NEXT: v_pk_lshlrev_b16 v0, v3, v0
; GFX9-NEXT: v_pk_lshrrev_b16 v1, v2, v1
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v2i16:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
;
; GFX10-LABEL: v_fshr_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_and_b32_e32 v2, 0xf000f, v2
; GFX10-NEXT: v_and_b32_e32 v3, 0xf000f, v3
; GFX10-NEXT: v_pk_lshrrev_b16 v1, v2, v1
; GFX10-NEXT: v_pk_lshlrev_b16 v0, v3, v0
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fshr_v2i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2
; GFX11-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
; GFX11-NEXT: v_and_b32_e32 v2, 0xf000f, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v3, 0xf000f, v3
; GFX11-NEXT: v_pk_lshrrev_b16 v1, v2, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_lshlrev_b16 v0, v3, v0
; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %ret = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2)
  ret <2 x i16> %ret
}

; v_fshr_v3i16 returns llvm.fshr.v3i16 of its three <3 x i16> arguments from a
; function that is not marked amdgpu_kernel. The assertions in the body are
; autogenerated, so regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; Unlike the v2i16 case, the recorded gfx900, gfx1010 and gfx1100 output has no
; packed v_pk_* shifts. Each 16-bit element is shifted on its own and two of
; the results are merged into v0 with v_lshl_or_b32. The redwood (r600) checks
; contain only CF_END and PAD.
define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) {
; SI-LABEL: v_fshr_v3i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_or_b32_e32 v7, 16, v7
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_alignbit_b32 v1, v1, v4, v7
; SI-NEXT: v_or_b32_e32 v4, 16, v6
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_alignbit_b32 v0, v0, v3, v4
; SI-NEXT: v_or_b32_e32 v3, 16, v8
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_alignbit_b32 v3, v2, v4, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3
; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v3i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
; VI-NEXT: v_mov_b32_e32 v8, 1
; VI-NEXT: v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_xor_b32_e32 v6, -1, v6
; VI-NEXT: v_lshlrev_b16_e32 v6, v6, v8
; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1
; VI-NEXT: v_xor_b32_e32 v7, -1, v5
; VI-NEXT: v_lshlrev_b16_e32 v1, v7, v1
; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3
; VI-NEXT: v_or_b32_e32 v1, v1, v3
; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
; VI-NEXT: v_xor_b32_e32 v3, -1, v4
; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0
; VI-NEXT: v_lshrrev_b16_e32 v2, v4, v2
; VI-NEXT: v_or_b32_e32 v0, v0, v2
; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_v3i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4
; GFX9-NEXT: v_mov_b32_e32 v8, 1
; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6
; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v8
; GFX9-NEXT: v_or_b32_e32 v6, v6, v7
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1
; GFX9-NEXT: v_xor_b32_e32 v7, -1, v5
; GFX9-NEXT: v_lshlrev_b16_e32 v1, v7, v1
; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3
; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4
; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
; GFX9-NEXT: v_lshrrev_b16_e32 v2, v4, v2
; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v3i16:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
;
; GFX10-LABEL: v_fshr_v3i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v0
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v2
; GFX10-NEXT: v_xor_b32_e32 v10, -1, v6
; GFX10-NEXT: v_lshlrev_b16 v7, 1, v7
; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2
; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0
; GFX10-NEXT: v_lshrrev_b16 v4, v6, v9
; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1
; GFX10-NEXT: v_lshlrev_b16 v6, v10, v7
; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-NEXT: v_xor_b32_e32 v2, -1, v5
; GFX10-NEXT: v_or_b32_e32 v4, v6, v4
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_lshlrev_b16 v1, v2, v1
; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0
; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fshr_v3i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v4
; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v0
; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0
; GFX11-NEXT: v_xor_b32_e32 v8, -1, v4
; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v2
; GFX11-NEXT: v_xor_b32_e32 v10, -1, v6
; GFX11-NEXT: v_lshlrev_b16 v7, 1, v7
; GFX11-NEXT: v_lshrrev_b16 v2, v4, v2
; GFX11-NEXT: v_lshlrev_b16 v0, v8, v0
; GFX11-NEXT: v_lshrrev_b16 v4, v6, v9
; GFX11-NEXT: v_lshlrev_b16 v1, 1, v1
; GFX11-NEXT: v_lshlrev_b16 v6, v10, v7
; GFX11-NEXT: v_lshrrev_b16 v3, v5, v3
; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
; GFX11-NEXT: v_xor_b32_e32 v2, -1, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_or_b32_e32 v4, v6, v4
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshlrev_b16 v1, v2, v1
; GFX11-NEXT: v_lshl_or_b32 v0, v4, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2)
  ret <3 x i16> %ret
}
; v_fshr_v4i16 returns llvm.fshr.v4i16 of its three <4 x i16> arguments from a
; function that is not marked amdgpu_kernel. The assertions in the body are
; autogenerated (see the NOTE on the first line of the file), so regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.
; Recorded output uses one v_alignbit_b32 per element on tahiti. On the other
; amdgcn targets each 16-bit element is shifted on its own and the results are
; merged pairwise into v0 and v1. The redwood (r600) checks contain only
; CF_END and PAD.
define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) {
; SI-LABEL: v_fshr_v4i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_or_b32_e32 v9, 16, v9
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_alignbit_b32 v1, v1, v5, v9
; SI-NEXT: v_or_b32_e32 v5, 16, v8
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_alignbit_b32 v0, v0, v4, v5
; SI-NEXT: v_or_b32_e32 v4, 16, v11
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; SI-NEXT: v_alignbit_b32 v3, v3, v5, v4
; SI-NEXT: v_or_b32_e32 v4, 16, v10
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; SI-NEXT: v_alignbit_b32 v2, v2, v5, v4
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v5
; VI-NEXT: v_mov_b32_e32 v8, 1
; VI-NEXT: v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_xor_b32_e32 v6, -1, v6
; VI-NEXT: v_lshlrev_b16_e32 v6, v6, v9
; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v4
; VI-NEXT: v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_xor_b32_e32 v7, -1, v7
; VI-NEXT: v_lshlrev_b16_e32 v7, v7, v8
; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1
; VI-NEXT: v_xor_b32_e32 v8, -1, v5
; VI-NEXT: v_lshlrev_b16_e32 v1, v8, v1
; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3
; VI-NEXT: v_or_b32_e32 v1, v1, v3
; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
; VI-NEXT: v_xor_b32_e32 v3, -1, v4
; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0
; VI-NEXT: v_lshrrev_b16_e32 v2, v4, v2
; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v2
; VI-NEXT: v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v5
; GFX9-NEXT: v_mov_b32_e32 v8, 1
; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6
; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v9
; GFX9-NEXT: v_or_b32_e32 v6, v6, v7
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4
; GFX9-NEXT: v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_xor_b32_e32 v7, -1, v7
; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v8
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1
; GFX9-NEXT: v_xor_b32_e32 v8, -1, v5
; GFX9-NEXT: v_lshlrev_b16_e32 v1, v8, v1
; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3
; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4
; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
; GFX9-NEXT: v_lshrrev_b16_e32 v2, v4, v2
; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
; GFX9-NEXT: v_or_b32_e32 v7, v7, v9
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v7, 16, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v4i16:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
;
; GFX10-LABEL: v_fshr_v4i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0
; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1
; GFX10-NEXT: v_xor_b32_e32 v11, -1, v5
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
; GFX10-NEXT: v_xor_b32_e32 v12, -1, v4
; GFX10-NEXT: v_lshrrev_b16 v6, v7, v6
; GFX10-NEXT: v_lshlrev_b16 v8, 1, v8
; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7
; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v2
; GFX10-NEXT: v_lshlrev_b16 v10, 1, v10
; GFX10-NEXT: v_xor_b32_e32 v14, -1, v9
; GFX10-NEXT: v_lshlrev_b16 v1, v11, v1
; GFX10-NEXT: v_lshlrev_b16 v0, v12, v0
; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2
; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3
; GFX10-NEXT: v_lshlrev_b16 v4, v7, v8
; GFX10-NEXT: v_lshrrev_b16 v5, v9, v13
; GFX10-NEXT: v_lshlrev_b16 v7, v14, v10
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
; GFX10-NEXT: v_or_b32_e32 v2, v4, v6
; GFX10-NEXT: v_or_b32_e32 v3, v7, v5
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fshr_v4i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3
; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v5
; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1
; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v4
; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v0
; GFX11-NEXT: v_lshlrev_b16 v1, 1, v1
; GFX11-NEXT: v_xor_b32_e32 v11, -1, v5
; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0
; GFX11-NEXT: v_xor_b32_e32 v12, -1, v4
; GFX11-NEXT: v_lshrrev_b16 v6, v7, v6
; GFX11-NEXT: v_lshlrev_b16 v8, 1, v8
; GFX11-NEXT: v_xor_b32_e32 v7, -1, v7
; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v2
; GFX11-NEXT: v_lshlrev_b16 v10, 1, v10
; GFX11-NEXT: v_xor_b32_e32 v14, -1, v9
; GFX11-NEXT: v_lshlrev_b16 v1, v11, v1
; GFX11-NEXT: v_lshlrev_b16 v0, v12, v0
; GFX11-NEXT: v_lshrrev_b16 v2, v4, v2
; GFX11-NEXT: v_lshrrev_b16 v3, v5, v3
; GFX11-NEXT: v_lshlrev_b16 v4, v7, v8
; GFX11-NEXT: v_lshrrev_b16 v5, v9, v13
; GFX11-NEXT: v_lshlrev_b16 v7, v14, v10
; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
; GFX11-NEXT: v_or_b32_e32 v2, v4, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v3, v7, v5
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2)
  ret <4 x i16> %ret
}

; v_fshr_i64 returns llvm.fshr.i64 of its three i64 arguments from a function
; that is not marked amdgpu_kernel. The assertions in the body are
; autogenerated, so regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; On every amdgcn target the recorded output shifts src0 left by 1 and then by
; the bitwise NOT of the amount, shifts src1 right by the amount, and ORs the
; two 64-bit results one half at a time. The redwood (r600) checks contain
; only CF_END and PAD.
define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
; SI-LABEL: v_fshr_i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v4
; SI-NEXT: v_not_b32_e32 v4, v4
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_i64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; VI-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
; VI-NEXT: v_not_b32_e32 v4, v4
; VI-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
; VI-NEXT: v_or_b32_e32 v1, v1, v3
; VI-NEXT: v_or_b32_e32 v0, v0, v2
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
; GFX9-NEXT: v_not_b32_e32 v4, v4
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_i64:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
;
; GFX10-LABEL: v_fshr_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; GFX10-NEXT: v_not_b32_e32 v5, v4
; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fshr_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; GFX11-NEXT: v_not_b32_e32 v5, v4
; GFX11-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1]
; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %ret = call i64 @llvm.fshr.i64(i64 %src0, i64 %src1, i64 %src2)
  ret i64 %ret
}

; v_fshr_v2i64 returns llvm.fshr.v2i64 of its three <2 x i64> arguments from a
; function that is not marked amdgpu_kernel. The assertions in the body are
; autogenerated, so regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; The recorded output repeats the scalar i64 sequence (shift left by 1, shift
; left by NOT amount, shift right by amount, OR) once per element. The redwood
; (r600) checks contain only CF_END and PAD.
define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) {
; SI-LABEL: v_fshr_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
; SI-NEXT: v_lshr_b64 v[4:5], v[4:5], v8
; SI-NEXT: v_not_b32_e32 v8, v8
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
; SI-NEXT: v_or_b32_e32 v1, v1, v5
; SI-NEXT: v_lshr_b64 v[5:6], v[6:7], v10
; SI-NEXT: v_not_b32_e32 v7, v10
; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v7
; SI-NEXT: v_or_b32_e32 v0, v0, v4
; SI-NEXT: v_or_b32_e32 v3, v3, v6
; SI-NEXT: v_or_b32_e32 v2, v2, v5
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; VI-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; VI-NEXT: v_not_b32_e32 v8, v8
; VI-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
; VI-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
; VI-NEXT: v_or_b32_e32 v1, v1, v5
; VI-NEXT: v_lshrrev_b64 v[5:6], v10, v[6:7]
; VI-NEXT: v_not_b32_e32 v7, v10
; VI-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
; VI-NEXT: v_or_b32_e32 v0, v0, v4
; VI-NEXT: v_or_b32_e32 v3, v3, v6
; VI-NEXT: v_or_b32_e32 v2, v2, v5
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX9-NEXT: v_not_b32_e32 v8, v8
; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
; GFX9-NEXT: v_or_b32_e32 v1, v1, v5
; GFX9-NEXT: v_lshrrev_b64 v[5:6], v10, v[6:7]
; GFX9-NEXT: v_not_b32_e32 v7, v10
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v7, v[2:3]
; GFX9-NEXT: v_or_b32_e32 v0, v0, v4
; GFX9-NEXT: v_or_b32_e32 v3, v3, v6
; GFX9-NEXT: v_or_b32_e32 v2, v2, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v2i64:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
;
; GFX10-LABEL: v_fshr_v2i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
; GFX10-NEXT: v_not_b32_e32 v9, v8
; GFX10-NEXT: v_not_b32_e32 v11, v10
; GFX10-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX10-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7]
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
; GFX10-NEXT: v_or_b32_e32 v0, v0, v4
; GFX10-NEXT: v_or_b32_e32 v1, v1, v5
; GFX10-NEXT: v_or_b32_e32 v2, v2, v6
; GFX10-NEXT: v_or_b32_e32 v3, v3, v7
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fshr_v2i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
; GFX11-NEXT: v_not_b32_e32 v9, v8
; GFX11-NEXT: v_not_b32_e32 v11, v10
; GFX11-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5]
; GFX11-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_or_b32_e32 v0, v0, v4
; GFX11-NEXT: v_or_b32_e32 v1, v1, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v2, v2, v6
; GFX11-NEXT: v_or_b32_e32 v3, v3, v7
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %ret = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2)
  ret <2 x i64> %ret
}

; v_fshr_i24 returns llvm.fshr.i24 of its three i24 arguments from a function
; that is not marked amdgpu_kernel. The assertions in the body are
; autogenerated, so regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; On every amdgcn target the recorded output masks the amount to 24 bits, then
; applies a multiply-high by 0xaaaaaaab, a right shift by 4, a multiply by 24
; and a subtract from the original amount. It then adds 8 and feeds the sum to
; v_alignbit_b32 together with src1 shifted left by 8. The redwood (r600)
; checks contain only CF_END and PAD.
define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
; SI-LABEL: v_fshr_i24:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0xffffff, v2
; SI-NEXT: s_mov_b32 s4, 0xaaaaaaab
; SI-NEXT: v_mul_hi_u32 v3, v3, s4
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT: v_lshrrev_b32_e32 v3, 4, v3
; SI-NEXT: v_mul_u32_u24_e32 v3, 24, v3
; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v2
; SI-NEXT: v_alignbit_b32 v0, v0, v1, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_i24:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v3, 0xffffff, v2
; VI-NEXT: s_mov_b32 s4, 0xaaaaaaab
; VI-NEXT: v_mul_hi_u32 v3, v3, s4
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; VI-NEXT: v_lshrrev_b32_e32 v3, 4, v3
; VI-NEXT: v_mul_u32_u24_e32 v3, 24, v3
; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v3
; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v2
; VI-NEXT: v_alignbit_b32 v0, v0, v1, v2
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_i24:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v3, 0xffffff, v2
; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaaab
; GFX9-NEXT: v_mul_hi_u32 v3, v3, s4
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 4, v3
; GFX9-NEXT: v_mul_u32_u24_e32 v3, 24, v3
; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3
; GFX9-NEXT: v_add_u32_e32 v2, 8, v2
; GFX9-NEXT: v_alignbit_b32 v0, v0, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_i24:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
;
; GFX10-LABEL: v_fshr_i24:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX10-NEXT: v_mul_hi_u32 v3, 0xaaaaaaab, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 4, v3
; GFX10-NEXT: v_mul_u32_u24_e32 v3, 24, v3
; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3
; GFX10-NEXT: v_add_nc_u32_e32 v2, 8, v2
; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fshr_i24:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_hi_u32 v3, 0xaaaaaaab, v3
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_u32_u24_e32 v3, 24, v3
; GFX11-NEXT: v_sub_nc_u32_e32 v2, v2, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v2, 8, v2
; GFX11-NEXT: v_alignbit_b32 v0, v0, v1, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %ret = call i24 @llvm.fshr.i24(i24 %src0, i24 %src1, i24 %src2)
  ret i24 %ret
}

; v_fshr_v2i24 returns llvm.fshr.v2i24 of its three <2 x i24> arguments from a
; function that is not marked amdgpu_kernel. The assertions in the body are
; autogenerated, so regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; The recorded output repeats the scalar i24 sequence (mask, multiply-high by
; 0xaaaaaaab, shift right by 4, multiply by 24, subtract, add 8,
; v_alignbit_b32) once per element. The redwood (r600) checks contain only
; CF_END and PAD.
define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) {
; SI-LABEL: v_fshr_v2i24:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v6, 0xffffff, v4
; SI-NEXT: s_mov_b32 s4, 0xaaaaaaab
; SI-NEXT: v_mul_hi_u32 v6, v6, s4
; SI-NEXT: v_and_b32_e32 v7, 0xffffff, v5
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT: v_lshrrev_b32_e32 v6, 4, v6
; SI-NEXT: v_mul_u32_u24_e32 v6, 24, v6
; SI-NEXT: v_sub_i32_e32 v4, vcc, v4, v6
; SI-NEXT: v_mul_hi_u32 v6, v7, s4
; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v4
; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
; SI-NEXT: v_lshrrev_b32_e32 v3, 4, v6
; SI-NEXT: v_mul_u32_u24_e32 v3, 24, v3
; SI-NEXT: v_sub_i32_e32 v3, vcc, v5, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v3
; SI-NEXT: v_alignbit_b32 v1, v1, v2, v3
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v2i24:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v6, 0xffffff, v4
; VI-NEXT: s_mov_b32 s4, 0xaaaaaaab
; VI-NEXT: v_mul_hi_u32 v6, v6, s4
; VI-NEXT: v_and_b32_e32 v7, 0xffffff, v5
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: v_lshrrev_b32_e32 v6, 4, v6
; VI-NEXT: v_mul_u32_u24_e32 v6, 24, v6
; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v6
; VI-NEXT: v_mul_hi_u32 v6, v7, s4
; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v4
; VI-NEXT: v_alignbit_b32 v0, v0, v2, v4
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
; VI-NEXT: v_lshrrev_b32_e32 v3, 4, v6
; VI-NEXT: v_mul_u32_u24_e32 v3, 24, v3
; VI-NEXT: v_sub_u32_e32 v3, vcc, v5, v3
; VI-NEXT: v_add_u32_e32 v3, vcc, 8, v3
; VI-NEXT: v_alignbit_b32 v1, v1, v2, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_v2i24:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v6, 0xffffff, v4
; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaaab
; GFX9-NEXT: v_mul_hi_u32 v6, v6, s4
; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v6
; GFX9-NEXT: v_mul_u32_u24_e32 v6, 24, v6
; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6
; GFX9-NEXT: v_mul_hi_u32 v6, v7, s4
; GFX9-NEXT: v_add_u32_e32 v4, 8, v4
; GFX9-NEXT: v_alignbit_b32 v0, v0, v2, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v3
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 4, v6
; GFX9-NEXT: v_mul_u32_u24_e32 v3, 24, v3
; GFX9-NEXT: v_sub_u32_e32 v3, v5, v3
; GFX9-NEXT: v_add_u32_e32 v3, 8, v3
; GFX9-NEXT: v_alignbit_b32 v1, v1, v2, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v2i24:
; R600: ; %bb.0:
; R600-NEXT: CF_END
; R600-NEXT: PAD
;
; GFX10-LABEL: v_fshr_v2i24:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_and_b32_e32 v6, 0xffffff, v4
; GFX10-NEXT: v_and_b32_e32 v7, 0xffffff, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX10-NEXT: v_mul_hi_u32 v6, 0xaaaaaaab, v6
; GFX10-NEXT: v_mul_hi_u32 v7, 0xaaaaaaab, v7
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 4, v6
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 4, v7
; GFX10-NEXT: v_mul_u32_u24_e32 v6, 24, v6
; GFX10-NEXT: v_mul_u32_u24_e32 v7, 24, v7
; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v6
; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v7
; GFX10-NEXT: v_add_nc_u32_e32 v4, 8, v4
; GFX10-NEXT: v_add_nc_u32_e32 v5, 8, v5
; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4
; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fshr_v2i24:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_and_b32_e32 v6, 0xffffff, v4
; GFX11-NEXT: v_and_b32_e32 v7, 0xffffff, v5
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_mul_hi_u32 v6, 0xaaaaaaab, v6
; GFX11-NEXT: v_mul_hi_u32 v7, 0xaaaaaaab, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshrrev_b32_e32 v6, 4, v6
; GFX11-NEXT: v_lshrrev_b32_e32 v7, 4, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mul_u32_u24_e32 v6, 24, v6
; GFX11-NEXT: v_mul_u32_u24_e32 v7, 24, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_sub_nc_u32_e32 v4, v4, v6
; GFX11-NEXT: v_sub_nc_u32_e32 v5, v5, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_nc_u32_e32 v4, 8, v4
; GFX11-NEXT: v_add_nc_u32_e32 v5, 8, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4
; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2)
  ret <2 x i24> %ret
}