; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefix=SI
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=VI
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefix=GFX9
; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s -check-prefix=R600

; Codegen test for the llvm.fshr (funnel shift right) intrinsic on i32,
; <2 x i32> and <4 x i32>, with both variable and constant shift amounts.
;
; Four llc configurations are exercised (tahiti, tonga, gfx900 and redwood).
; Each configuration is matched with exactly one check prefix, and every
; prefix named on a command line above has assertions in this file, so
; FileCheck does not need to be told to tolerate unused prefixes.
;
; The assertion blocks inside each function are generated; regenerate them
; with utils/update_llc_test_checks.py instead of editing them by hand.

declare i32 @llvm.fshr.i32(i32, i32, i32) nounwind readnone
declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone

; Scalar i32 funnel shift with a variable amount %z. The assertions expect a
; single v_alignbit_b32 on the amdgcn targets and a single BIT_ALIGN_INT on
; r600, with the amount operand passed through without any masking.
define amdgpu_kernel void @fshr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) {
; SI-LABEL: fshr_i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_mov_b32_e32 v1, s2
; SI-NEXT:    v_alignbit_b32 v0, s0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshr_i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_mov_b32_e32 v1, s2
; VI-NEXT:    v_alignbit_b32 v2, s0, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshr_i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    v_alignbit_b32 v2, s0, v0, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshr_i32:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; R600-NEXT:    BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, KC0[3].X,
entry:
  %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
  store i32 %0, i32 addrspace(1)* %in
  ret void
}

; Scalar i32 funnel shift with the constant amount 7. The assertions expect
; the constant to appear directly as the last operand of the align
; instruction (an inline 7 on amdgcn, a literal 7 on r600).
define amdgpu_kernel void @fshr_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
; SI-LABEL: fshr_i32_imm:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_alignbit_b32 v0, s0, v0, 7
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshr_i32_imm:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_alignbit_b32 v2, s0, v0, 7
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshr_i32_imm:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    v_alignbit_b32 v2, s0, v0, 7
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshr_i32_imm:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; R600-NEXT:    BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
entry:
  %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7)
  store i32 %0, i32 addrspace(1)* %in
  ret void
}

; <2 x i32> funnel shift with variable per-lane amounts. Unlike the scalar
; case, the assertions expect each lane to mask its amount with 31, perform
; the align, compare the masked amount against zero and then select between
; the align result and the corresponding %y lane.
define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
; SI-LABEL: fshr_v2i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xf
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s9
; SI-NEXT:    s_and_b32 s1, s1, 31
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    s_and_b32 s0, s0, 31
; SI-NEXT:    v_alignbit_b32 v1, s3, v0, v1
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 0
; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_mov_b32_e32 v2, s0
; SI-NEXT:    v_alignbit_b32 v2, s2, v0, v2
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshr_v2i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s7
; VI-NEXT:    s_and_b32 s1, s1, 31
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_and_b32 s0, s0, 31
; VI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 0
; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_alignbit_b32 v2, s4, v0, v2
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 0
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshr_v2i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s7
; GFX9-NEXT:    s_and_b32 s1, s1, 31
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    s_and_b32 s0, s0, 31
; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, v1
; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 0
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    v_mov_b32_e32 v2, s0
; GFX9-NEXT:    v_alignbit_b32 v2, s4, v0, v2
; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 0
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshr_v2i32:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 11, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    AND_INT * T0.W, KC0[4].X, literal.x,
; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT:    BIT_ALIGN_INT T1.W, KC0[3].X, KC0[3].Z, PV.W,
; R600-NEXT:    SETE_INT * T0.W, PV.W, 0.0,
; R600-NEXT:    CNDE_INT T0.Y, PS, PV.W, KC0[3].Z,
; R600-NEXT:    AND_INT * T0.W, KC0[3].W, literal.x,
; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT:    BIT_ALIGN_INT T1.W, KC0[2].W, KC0[3].Y, PV.W,
; R600-NEXT:    SETE_INT * T0.W, PV.W, 0.0,
; R600-NEXT:    CNDE_INT T0.X, PS, PV.W, KC0[3].Y,
; R600-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
  ret void
}

; <2 x i32> funnel shift with the constant amounts <7, 9>. The assertions
; expect one align instruction per lane with the constant as its last
; operand, and no mask/compare/select sequence.
define amdgpu_kernel void @fshr_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
; SI-LABEL: fshr_v2i32_imm:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_alignbit_b32 v1, s3, v0, 9
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_alignbit_b32 v0, s2, v0, 7
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshr_v2i32_imm:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 9
; VI-NEXT:    v_alignbit_b32 v0, s4, v2, 7
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshr_v2i32_imm:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s0
; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 9
; GFX9-NEXT:    v_alignbit_b32 v0, s4, v2, 7
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshr_v2i32_imm:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
; R600-NEXT:    9(1.261169e-44), 0(0.000000e+00)
; R600-NEXT:    BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
; R600-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
  ret void
}

; <4 x i32> funnel shift with variable per-lane amounts. The expected code
; repeats the per-lane pattern of the <2 x i32> variable case four times:
; mask the amount with 31, align, compare the masked amount against zero,
; then select between the align result and the %y lane.
define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
; SI-LABEL: fshr_v4i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
; SI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x11
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x15
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s15
; SI-NEXT:    s_and_b32 s3, s3, 31
; SI-NEXT:    v_mov_b32_e32 v1, s3
; SI-NEXT:    v_alignbit_b32 v1, s11, v0, v1
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
; SI-NEXT:    s_and_b32 s2, s2, 31
; SI-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s14
; SI-NEXT:    v_mov_b32_e32 v1, s2
; SI-NEXT:    v_alignbit_b32 v1, s10, v0, v1
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
; SI-NEXT:    s_and_b32 s1, s1, 31
; SI-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s13
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    s_and_b32 s0, s0, 31
; SI-NEXT:    v_alignbit_b32 v1, s9, v0, v1
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 0
; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s12
; SI-NEXT:    v_mov_b32_e32 v4, s0
; SI-NEXT:    v_alignbit_b32 v4, s8, v0, v4
; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 0
; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshr_v4i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x54
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s11
; VI-NEXT:    s_and_b32 s3, s3, 31
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_alignbit_b32 v1, s7, v0, v1
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
; VI-NEXT:    s_and_b32 s2, s2, 31
; VI-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    v_mov_b32_e32 v1, s2
; VI-NEXT:    v_alignbit_b32 v1, s6, v0, v1
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
; VI-NEXT:    s_and_b32 s1, s1, 31
; VI-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s9
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_and_b32 s0, s0, 31
; VI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 0
; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    v_alignbit_b32 v4, s4, v0, v4
; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 0
; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v4, s12
; VI-NEXT:    v_mov_b32_e32 v5, s13
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshr_v4i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x54
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s11
; GFX9-NEXT:    s_and_b32 s3, s3, 31
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_alignbit_b32 v1, s7, v0, v1
; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
; GFX9-NEXT:    s_and_b32 s2, s2, 31
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
; GFX9-NEXT:    v_mov_b32_e32 v0, s10
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    v_alignbit_b32 v1, s6, v0, v1
; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
; GFX9-NEXT:    s_and_b32 s1, s1, 31
; GFX9-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
; GFX9-NEXT:    v_mov_b32_e32 v0, s9
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    s_and_b32 s0, s0, 31
; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, v1
; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 0
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
; GFX9-NEXT:    v_mov_b32_e32 v0, s8
; GFX9-NEXT:    v_mov_b32_e32 v4, s0
; GFX9-NEXT:    v_alignbit_b32 v4, s4, v0, v4
; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 0
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
; GFX9-NEXT:    v_mov_b32_e32 v4, s12
; GFX9-NEXT:    v_mov_b32_e32 v5, s13
; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshr_v4i32:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 20, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    AND_INT T0.W, KC0[5].Z, literal.x,
; R600-NEXT:    AND_INT * T1.W, KC0[6].X, literal.x,
; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT:    SETE_INT T0.Z, PS, 0.0,
; R600-NEXT:    BIT_ALIGN_INT * T1.W, KC0[4].X, KC0[5].X, PS,
; R600-NEXT:    AND_INT * T2.W, KC0[5].W, literal.x,
; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT:    SETE_INT T1.Z, PV.W, 0.0,
; R600-NEXT:    BIT_ALIGN_INT * T2.W, KC0[3].W, KC0[4].W, PV.W,
; R600-NEXT:    CNDE_INT * T1.W, T0.Z, T1.W, KC0[5].X,
; R600-NEXT:    CNDE_INT T1.Z, T1.Z, T2.W, KC0[4].W,
; R600-NEXT:    BIT_ALIGN_INT T2.W, KC0[3].Z, KC0[4].Z, T0.W,
; R600-NEXT:    SETE_INT * T0.W, T0.W, 0.0,
; R600-NEXT:    CNDE_INT T1.Y, PS, PV.W, KC0[4].Z,
; R600-NEXT:    AND_INT * T0.W, KC0[5].Y, literal.x,
; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; R600-NEXT:    BIT_ALIGN_INT T2.W, KC0[3].Y, KC0[4].Y, PV.W,
; R600-NEXT:    SETE_INT * T0.W, PV.W, 0.0,
; R600-NEXT:    CNDE_INT T1.X, PS, PV.W, KC0[4].Y,
; R600-NEXT:    LSHR * T0.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
  ret void
}

; <4 x i32> funnel shift with the constant amounts <1, 7, 9, 33>. The last
; amount exceeds the 32-bit element width; the assertions expect it to be
; emitted as 1 (the same immediate as lane 0), so the four align
; instructions use the immediates 1, 7, 9 and 1.
define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
; SI-LABEL: fshr_v4i32_imm:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    v_alignbit_b32 v3, s11, v0, 1
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_alignbit_b32 v2, s10, v0, 9
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_alignbit_b32 v1, s9, v0, 7
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_alignbit_b32 v0, s8, v0, 1
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fshr_v4i32_imm:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x44
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v4, s8
; VI-NEXT:    v_mov_b32_e32 v5, s9
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_mov_b32_e32 v1, s2
; VI-NEXT:    v_alignbit_b32 v3, s7, v0, 1
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_alignbit_b32 v2, s6, v1, 9
; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 7
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fshr_v4i32_imm:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x44
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v4, s8
; GFX9-NEXT:    v_mov_b32_e32 v5, s9
; GFX9-NEXT:    v_mov_b32_e32 v0, s3
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, 1
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    v_alignbit_b32 v2, s6, v1, 9
; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 7
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT:    s_endpgm
;
; R600-LABEL: fshr_v4i32_imm:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1,
; R600-NEXT:    BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
; R600-NEXT:    9(1.261169e-44), 0(0.000000e+00)
; R600-NEXT:    BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
; R600-NEXT:    BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, 1,
; R600-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
  ret void
}