; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -check-prefixes=SI
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=VI
; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s -check-prefixes=EG

; Codegen checks for the IR 'ashr' instruction on three targets (verde, tonga
; and redwood). Global-load scalarization is switched off on every llc
; invocation above.

declare i32 @llvm.amdgcn.workitem.id.x() #0

; <2 x i32> ashr: the value is loaded from %in, the shift amounts from the
; <2 x i32> element that immediately follows it; the result is stored to %out.
define amdgpu_kernel void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
; SI-LABEL: ashr_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i32_e32 v1, v1, v3
; SI-NEXT:    v_ashr_i32_e32 v0, v0, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v1, v3, v1
; VI-NEXT:    v_ashrrev_i32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v2i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:    ASHR * T0.Y, T0.Y, T0.W,
; EG-NEXT:    ASHR T0.X, T0.X, T0.Z,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
  %result = ashr <2 x i32> %a, %b
  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
  ret void
}

; <4 x i32> ashr: same shape as the <2 x i32> case, with the shift amounts
; taken from the <4 x i32> element that follows %in.
define amdgpu_kernel void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
; SI-LABEL: ashr_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i32_e32 v3, v3, v7
; SI-NEXT:    v_ashr_i32_e32 v2, v2, v6
; SI-NEXT:    v_ashr_i32_e32 v1, v1, v5
; SI-NEXT:    v_ashr_i32_e32 v0, v0, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v3, v7, v3
; VI-NEXT:    v_ashrrev_i32_e32 v2, v6, v2
; VI-NEXT:    v_ashrrev_i32_e32 v1, v5, v1
; VI-NEXT:    v_ashrrev_i32_e32 v0, v4, v0
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v4i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T1.XYZW, T0.X, 16, #1
; EG-NEXT:    VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    ASHR * T0.W, T0.W, T1.W,
; EG-NEXT:    ASHR * T0.Z, T0.Z, T1.Z,
; EG-NEXT:    ASHR * T0.Y, T0.Y, T1.Y,
; EG-NEXT:    ASHR T0.X, T0.X, T1.X,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
  %result = ashr <4 x i32> %a, %b
  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
  ret void
}

; FIXME: The ashr operation is uniform, but because its operands come from a
; global load we end up with the vector instructions rather than scalar.
; <2 x i16> ashr: the value is loaded from %in, the shift amounts from the
; <2 x i16> element that immediately follows it; the result is stored to %out.
define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
; SI-LABEL: ashr_v2i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfe_i32 v2, v0, 0, 16
; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
; SI-NEXT:    v_ashrrev_i32_e32 v0, v3, v0
; SI-NEXT:    v_ashrrev_i32_e32 v1, v1, v2
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_sdwa v2, sext(v1), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; VI-NEXT:    v_ashrrev_i32_sdwa v0, sext(v1), sext(v0) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v2i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_64 T6.XY, T6.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    MOV * T6.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:    LSHR * T0.W, T6.X, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    BFE_INT T0.Y, PV.W, 0.0, literal.x,
; EG-NEXT:    LSHR T0.Z, T6.Y, literal.x,
; EG-NEXT:    BFE_INT T0.W, T6.X, 0.0, literal.x,
; EG-NEXT:    AND_INT * T1.W, T6.Y, literal.y,
; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT:    ASHR T0.W, PV.W, PS,
; EG-NEXT:    ASHR * T1.W, PV.Y, PV.Z,
; EG-NEXT:    LSHL T1.W, PS, literal.x,
; EG-NEXT:    AND_INT * T0.W, PV.W, literal.y,
; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT:    OR_INT T6.X, PS, PV.W,
; EG-NEXT:    LSHR * T7.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
  %result = ashr <2 x i16> %a, %b
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
  ret void
}

; FIXME: The ashr operation is uniform, but because its operands come from a
; global load we end up with the vector instructions rather than scalar.
; <4 x i16> ashr: the value is loaded from %in, the shift amounts from the
; <4 x i16> element that immediately follows it; the result is stored to %out.
define amdgpu_kernel void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
; SI-LABEL: ashr_v4i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s2, 0xffff
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfe_i32 v4, v0, 0, 16
; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
; SI-NEXT:    v_bfe_i32 v5, v1, 0, 16
; SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; SI-NEXT:    v_ashr_i32_e32 v1, v1, v7
; SI-NEXT:    v_ashr_i32_e32 v3, v5, v3
; SI-NEXT:    v_ashr_i32_e32 v0, v0, v6
; SI-NEXT:    v_ashr_i32_e32 v2, v4, v2
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_and_b32_e32 v3, s2, v3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v2, s2, v2
; SI-NEXT:    v_or_b32_e32 v1, v3, v1
; SI-NEXT:    v_or_b32_e32 v0, v2, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v4i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_sdwa v4, sext(v2), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; VI-NEXT:    v_ashrrev_i32_sdwa v0, sext(v2), sext(v0) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_ashrrev_i32_sdwa v2, sext(v3), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; VI-NEXT:    v_ashrrev_i32_sdwa v1, sext(v3), sext(v1) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v4i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 58, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XY, T9.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T9.XYZW, T9.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    MOV * T9.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:    MOV T4.X, T9.X,
; EG-NEXT:    MOV * T5.X, T9.Y,
; EG-NEXT:    MOV T0.Y, PV.X,
; EG-NEXT:    MOV * T0.Z, PS,
; EG-NEXT:    MOV T2.X, T9.Z,
; EG-NEXT:    MOV * T3.X, T9.W,
; EG-NEXT:    MOV * T0.W, T6.X,
; EG-NEXT:    MOV T1.Y, T2.X,
; EG-NEXT:    BFE_INT * T1.W, T0.Y, 0.0, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    AND_INT * T2.W, PV.Y, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    ASHR * T1.W, T1.W, PV.W,
; EG-NEXT:    AND_INT T1.W, PV.W, literal.x,
; EG-NEXT:    AND_INT * T0.W, T0.W, literal.y,
; EG-NEXT:    65535(9.183409e-41), -65536(nan)
; EG-NEXT:    OR_INT * T0.W, PS, PV.W,
; EG-NEXT:    MOV * T1.Z, T3.X,
; EG-NEXT:    MOV * T6.X, T0.W,
; EG-NEXT:    MOV T0.W, PV.X,
; EG-NEXT:    LSHR * T1.W, T0.Y, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    BFE_INT T1.W, PS, 0.0, literal.x,
; EG-NEXT:    LSHR * T2.W, T1.Y, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    ASHR T1.W, PV.W, PS,
; EG-NEXT:    AND_INT * T0.W, T0.W, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    LSHL * T1.W, PV.W, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    OR_INT * T0.W, T0.W, PV.W,
; EG-NEXT:    MOV T6.X, PV.W,
; EG-NEXT:    MOV T0.Y, T7.X,
; EG-NEXT:    BFE_INT T0.W, T0.Z, 0.0, literal.x,
; EG-NEXT:    AND_INT * T1.W, T1.Z, literal.y,
; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT:    ASHR T0.W, PV.W, PS,
; EG-NEXT:    AND_INT * T1.W, PV.Y, literal.x,
; EG-NEXT:    -65536(nan), 0(0.000000e+00)
; EG-NEXT:    AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT:    MOV * T7.X, PV.W,
; EG-NEXT:    MOV T0.Y, PV.X,
; EG-NEXT:    LSHR * T0.W, T0.Z, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT:    LSHR * T1.W, T1.Z, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    ASHR T0.W, PV.W, PS,
; EG-NEXT:    AND_INT * T1.W, T0.Y, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    LSHL * T0.W, PV.W, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    LSHR T9.X, KC0[2].Y, literal.x,
; EG-NEXT:    OR_INT * T10.Y, T1.W, PV.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    MOV T7.X, PV.Y,
; EG-NEXT:    MOV * T10.X, T6.X,
  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %in
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
  %result = ashr <4 x i16> %a, %b
  store <4 x i16> %result, <4 x i16> addrspace(1)* %out
  ret void
}

; i64 ashr by the constant 8, applied to the i32 kernel argument %in after it
; has been sign-extended to i64.
define amdgpu_kernel void @s_ashr_i64(i64 addrspace(1)* %out, i32 %in) {
; SI-LABEL: s_ashr_i64:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s5, s4, 31
; SI-NEXT:    s_ashr_i64 s[4:5], s[4:5], 8
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ashr_i64:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s5, s4, 31
; VI-NEXT:    s_ashr_i64 s[4:5], s[4:5], 8
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ashr_i64:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:    ASHR * T0.Y, KC0[2].Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    BIT_ALIGN_INT T0.X, PV.Y, KC0[2].Z, literal.x,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
entry:
  %in.ext = sext i32 %in to i64
  %ashr = ashr i64 %in.ext, 8
  store i64 %ashr, i64 addrspace(1)* %out
  ret void
}

; i64 ashr with a variable shift amount: the value is loaded from %in and the
; amount from the i64 that immediately follows it.
define amdgpu_kernel void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; SI-LABEL: ashr_i64_2:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_i64_2:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i64 v[0:1], v2, v[0:1]
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_i64_2:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 10, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:    AND_INT * T0.W, T0.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    ASHR T1.Z, T0.Y, PV.W,
; EG-NEXT:    BIT_ALIGN_INT T0.W, T0.Y, T0.X, T0.Z,
; EG-NEXT:    AND_INT * T1.W, T0.Z, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.X, PS, PV.W, PV.Z,
; EG-NEXT:    ASHR T0.W, T0.Y, literal.x,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    31(4.344025e-44), 2(2.802597e-45)
; EG-NEXT:    CNDE_INT * T0.Y, T1.W, T1.Z, PV.W,
entry:
  %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
  %a = load i64, i64 addrspace(1)* %in
  %b = load i64, i64 addrspace(1)* %b_ptr
  %result = ashr i64 %a, %b
  store i64 %result, i64 addrspace(1)* %out
  ret void
}

; <2 x i64> ashr: the value is loaded from %in, the shift amounts from the
; <2 x i64> element that immediately follows it.
define amdgpu_kernel void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
; SI-LABEL: ashr_v2i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[2:3], v[2:3], v6
; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i64 v[2:3], v6, v[2:3]
; VI-NEXT:    v_ashrrev_i64 v[0:1], v4, v[0:1]
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v2i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 19, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T1.XYZW, T0.X, 16, #1
; EG-NEXT:    VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    AND_INT * T1.W, T1.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    ASHR T1.Y, T0.W, PV.W,
; EG-NEXT:    AND_INT T2.Z, T1.Z, literal.x,
; EG-NEXT:    BIT_ALIGN_INT T1.W, T0.W, T0.Z, T1.Z,
; EG-NEXT:    AND_INT * T2.W, T1.X, literal.y,
; EG-NEXT:    32(4.484155e-44), 31(4.344025e-44)
; EG-NEXT:    ASHR T2.Y, T0.Y, PS,
; EG-NEXT:    CNDE_INT T0.Z, PV.Z, PV.W, PV.Y,
; EG-NEXT:    BIT_ALIGN_INT T1.W, T0.Y, T0.X, T1.X,
; EG-NEXT:    AND_INT * T2.W, T1.X, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.X, PS, PV.W, PV.Y,
; EG-NEXT:    ASHR T0.W, T0.W, literal.x,
; EG-NEXT:    ASHR * T1.W, T0.Y, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT * T0.W, T2.Z, T1.Y, PV.W,
; EG-NEXT:    LSHR T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    CNDE_INT * T0.Y, T2.W, T2.Y, T1.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
  %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
  %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
  %result = ashr <2 x i64> %a, %b
  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
  ret void
}

; FIXME: Broken on r600
; <4 x i64> ashr: the value is loaded from %in, the shift amounts from the
; <4 x i64> element that immediately follows it.
define amdgpu_kernel void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
; SI-LABEL: ashr_v4i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; SI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
; SI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_ashr_i64 v[2:3], v[2:3], v6
; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v4
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[9:10], v[9:10], v13
; SI-NEXT:    v_ashr_i64 v[7:8], v[7:8], v11
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v4i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s6
; VI-NEXT:    s_mov_b32 s9, s7
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; VI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
; VI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_ashrrev_i64 v[2:3], v6, v[2:3]
; VI-NEXT:    v_ashrrev_i64 v[0:1], v4, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i64 v[9:10], v13, v[9:10]
; VI-NEXT:    v_ashrrev_i64 v[7:8], v11, v[7:8]
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v4i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 3 @6
; EG-NEXT:    ALU 39, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T1.XYZW, T0.X, 32, #1
; EG-NEXT:    VTX_READ_128 T2.XYZW, T0.X, 48, #1
; EG-NEXT:    VTX_READ_128 T3.XYZW, T0.X, 0, #1
; EG-NEXT:    VTX_READ_128 T0.XYZW, T0.X, 16, #1
; EG-NEXT:    ALU clause starting at 14:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 15:
; EG-NEXT:    AND_INT * T1.W, T1.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    ASHR T1.Y, T0.W, literal.x,
; EG-NEXT:    ASHR T4.Z, T3.W, PV.W, BS:VEC_120/SCL_212
; EG-NEXT:    AND_INT T1.W, T1.Z, literal.y,
; EG-NEXT:    AND_INT * T2.W, T2.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 32(4.484155e-44)
; EG-NEXT:    BIT_ALIGN_INT T4.X, T3.W, T3.Z, T1.Z,
; EG-NEXT:    ASHR T2.Y, T0.W, PS, BS:VEC_120/SCL_212
; EG-NEXT:    AND_INT * T1.Z, T2.Z, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    BIT_ALIGN_INT T0.W, T0.W, T0.Z, T2.Z,
; EG-NEXT:    AND_INT * T2.W, T2.X, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    AND_INT T5.X, T1.X, literal.x,
; EG-NEXT:    ASHR T4.Y, T0.Y, PS,
; EG-NEXT:    CNDE_INT T0.Z, T1.Z, PV.W, T2.Y,
; EG-NEXT:    BIT_ALIGN_INT T0.W, T0.Y, T0.X, T2.X,
; EG-NEXT:    AND_INT * T2.W, T2.X, literal.y,
; EG-NEXT:    31(4.344025e-44), 32(4.484155e-44)
; EG-NEXT:    CNDE_INT T0.X, PS, PV.W, PV.Y,
; EG-NEXT:    ASHR T5.Y, T3.Y, PV.X,
; EG-NEXT:    CNDE_INT T2.Z, T1.W, T4.X, T4.Z,
; EG-NEXT:    BIT_ALIGN_INT T0.W, T3.Y, T3.X, T1.X, BS:VEC_102/SCL_221
; EG-NEXT:    AND_INT * T4.W, T1.X, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T2.X, PS, PV.W, PV.Y,
; EG-NEXT:    ASHR T6.Y, T3.W, literal.x,
; EG-NEXT:    ASHR T3.Z, T0.Y, literal.x, BS:VEC_201
; EG-NEXT:    ADD_INT T3.W, KC0[2].Y, literal.y,
; EG-NEXT:    CNDE_INT * T0.W, T1.Z, T2.Y, T1.Y,
; EG-NEXT:    31(4.344025e-44), 16(2.242078e-44)
; EG-NEXT:    LSHR T1.X, PV.W, literal.x,
; EG-NEXT:    CNDE_INT T0.Y, T2.W, T4.Y, PV.Z,
; EG-NEXT:    ASHR T3.W, T3.Y, literal.y,
; EG-NEXT:    CNDE_INT * T2.W, T1.W, T4.Z, PV.Y,
; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
; EG-NEXT:    LSHR T3.X, KC0[2].Y, literal.x,
; EG-NEXT:    CNDE_INT * T2.Y, T4.W, T5.Y, PV.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
  %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
  %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
  %result = ashr <4 x i64> %a, %b
  store <4 x i64> %result, <4 x i64> addrspace(1)* %out
  ret void
}

; i64 ashr by the constant 32 on the kernel argument %a; the shifted value is
; then added to the kernel argument %b before being stored.
define amdgpu_kernel void @s_ashr_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
; SI-LABEL: s_ashr_32_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[0:1], 0x14
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x1d
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s7, s6, 31
; SI-NEXT:    s_add_u32 s4, s6, s4
; SI-NEXT:    s_addc_u32 s5, s7, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ashr_32_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s6, s[0:1], 0x50
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s7, s6, 31
; VI-NEXT:    s_add_u32 s4, s6, s4
; VI-NEXT:    s_addc_u32 s5, s7, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ashr_32_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:    ASHR * T0.W, KC0[5].X, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.W, PV.W, KC0[7].Z,
; EG-NEXT:    ADDC_UINT * T1.W, KC0[5].X, KC0[7].Y,
; EG-NEXT:    ADD_INT * T0.Y, T0.W, PV.W,
; EG-NEXT:    ADD_INT * T0.X, KC0[5].X, KC0[7].Y,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %result = ashr i64 %a, 32
  %add = add i64 %result, %b
  store i64 %add, i64 addrspace(1)* %out
  ret void
}

; i64 ashr by the constant 32 on a value loaded from %in at an index given by
; the workitem id; the result is stored to %out at the same index.
define amdgpu_kernel void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; SI-LABEL: v_ashr_32_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ashr_32_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v0, vcc
; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ashr_32_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_32 T0.X, T0.X, 4, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT:    LSHR T1.X, PV.W, literal.x,
; EG-NEXT:    ASHR * T0.Y, T0.X, literal.y,
; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %gep.in
  %result = ashr i64 %a, 32
  store i64 %result, i64 addrspace(1)* %gep.out
  ret void
}

; i64 ashr by the constant 63 on the kernel argument %a; the shifted value is
; then added to the kernel argument %b before being stored.
define amdgpu_kernel void @s_ashr_63_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
; SI-LABEL: s_ashr_63_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[0:1], 0x14
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x1d
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s6, s6, 31
; SI-NEXT:    s_add_u32 s4, s6, s4
; SI-NEXT:    s_addc_u32 s5, s6, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ashr_63_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s6, s[0:1], 0x50
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s6, s6, 31
; VI-NEXT:    s_add_u32 s4, s6, s4
; VI-NEXT:    s_addc_u32 s5, s6, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ashr_63_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:    ASHR * T0.W, KC0[5].X, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    ADD_INT T1.W, PV.W, KC0[7].Z,
; EG-NEXT:    ADDC_UINT * T2.W, PV.W, KC0[7].Y,
; EG-NEXT:    ADD_INT * T0.Y, PV.W, PS,
; EG-NEXT:    ADD_INT T0.X, T0.W, KC0[7].Y,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %result = ashr i64 %a, 63
  %add = add i64 %result, %b
  store i64 %add, i64 addrspace(1)* %out
  ret void
}

; i64 ashr by the constant 63 on a value loaded from %in at an index given by
; the workitem id; the result is stored to %out at the same index.
define amdgpu_kernel void @v_ashr_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; SI-LABEL: v_ashr_63_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
; SI-NEXT:    v_mov_b32_e32 v3, v2
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ashr_63_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v0, vcc
; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v2, 31, v3
; VI-NEXT:    v_mov_b32_e32 v3, v2
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ashr_63_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_32 T0.X, T0.X, 4, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    ASHR T0.X, T0.X, literal.x,
; EG-NEXT:    ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    LSHR T1.X, PV.W, literal.x,
; EG-NEXT:    MOV * T0.Y, PV.X,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %gep.in
  %result = ashr i64 %a, 63
  store i64 %result, i64 addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind readnone }