; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -check-prefixes=SI
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=VI
; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s -check-prefixes=EG

; Code generation tests for the IR 'ashr' (arithmetic shift right) instruction.
; Each RUN line above compiles this file for one CPU (verde, tonga, redwood)
; and verifies the output under its own FileCheck prefix.

declare i32 @llvm.amdgcn.workitem.id.x() #0

; Element-wise ashr of <2 x i32>: the value is loaded from %in[0], the shift
; amounts from %in[1], and the result is stored to %out.
define amdgpu_kernel void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
; SI-LABEL: ashr_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i32_e32 v1, v1, v3
; SI-NEXT:    v_ashr_i32_e32 v0, v0, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v1, v3, v1
; VI-NEXT:    v_ashrrev_i32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v2i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_64 T1.XY, T0.X, 8, #1
; EG-NEXT:    VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    ASHR * T0.Y, T0.Y, T1.Y,
; EG-NEXT:    ASHR T0.X, T0.X, T1.X,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
  %result = ashr <2 x i32> %a, %b
  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
  ret void
}

; Element-wise ashr of <4 x i32>: the value is loaded from %in[0], the shift
; amounts from %in[1], and the result is stored to %out.
define amdgpu_kernel void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
; SI-LABEL: ashr_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i32_e32 v3, v3, v7
; SI-NEXT:    v_ashr_i32_e32 v2, v2, v6
; SI-NEXT:    v_ashr_i32_e32 v1, v1, v5
; SI-NEXT:    v_ashr_i32_e32 v0, v0, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v3, v7, v3
; VI-NEXT:    v_ashrrev_i32_e32 v2, v6, v2
; VI-NEXT:    v_ashrrev_i32_e32 v1, v5, v1
; VI-NEXT:    v_ashrrev_i32_e32 v0, v4, v0
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v4i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T1.XYZW, T0.X, 16, #1
; EG-NEXT:    VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    ASHR * T0.W, T0.W, T1.W,
; EG-NEXT:    ASHR * T0.Z, T0.Z, T1.Z,
; EG-NEXT:    ASHR * T0.Y, T0.Y, T1.Y,
; EG-NEXT:    ASHR T0.X, T0.X, T1.X,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
  %result = ashr <4 x i32> %a, %b
  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
  ret void
}

; FIXME: The ashr operation is uniform, but because its operands come from a
; global load we end up with the vector instructions rather than scalar.
; Element-wise ashr of <2 x i16>: the value is loaded from %in[0], the shift
; amounts from %in[1], and the result is stored to %out.
define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
; SI-LABEL: ashr_v2i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfe_i32 v2, v0, 0, 16
; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
; SI-NEXT:    v_ashrrev_i32_e32 v0, v3, v0
; SI-NEXT:    v_ashrrev_i32_e32 v1, v1, v2
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_sdwa v2, sext(v1), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; VI-NEXT:    v_ashrrev_i32_sdwa v0, sext(v1), sext(v0) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v2i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 14, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_32 T7.X, T7.X, 0, #1
; EG-NEXT:    VTX_READ_32 T6.X, T6.X, 4, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:    MOV * T6.X, KC0[2].Z,
; EG-NEXT:    MOV * T7.X, PV.X,
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:    LSHR * T0.W, T7.X, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    BFE_INT T0.Y, PV.W, 0.0, literal.x,
; EG-NEXT:    LSHR T0.Z, T6.X, literal.x,
; EG-NEXT:    BFE_INT T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT:    AND_INT * T1.W, T6.X, literal.y,
; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT:    ASHR T0.W, PV.W, PS,
; EG-NEXT:    ASHR * T1.W, PV.Y, PV.Z,
; EG-NEXT:    LSHL T1.W, PS, literal.x,
; EG-NEXT:    AND_INT * T0.W, PV.W, literal.y,
; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT:    OR_INT T6.X, PS, PV.W,
; EG-NEXT:    LSHR * T7.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
  %result = ashr <2 x i16> %a, %b
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
  ret void
}

; FIXME: The ashr operation is uniform, but because its operands come from a
; global load we end up with the vector instructions rather than scalar.
; Element-wise ashr of <4 x i16>: the value is loaded from %in[0], the shift
; amounts from %in[1], and the result is stored to %out.
define amdgpu_kernel void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
; SI-LABEL: ashr_v4i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s2, 0xffff
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfe_i32 v4, v0, 0, 16
; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
; SI-NEXT:    v_bfe_i32 v5, v1, 0, 16
; SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; SI-NEXT:    v_ashrrev_i32_e32 v1, v7, v1
; SI-NEXT:    v_ashrrev_i32_e32 v3, v3, v5
; SI-NEXT:    v_ashrrev_i32_e32 v0, v6, v0
; SI-NEXT:    v_ashrrev_i32_e32 v2, v2, v4
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_and_b32_e32 v3, s2, v3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v2, s2, v2
; SI-NEXT:    v_or_b32_e32 v1, v3, v1
; SI-NEXT:    v_or_b32_e32 v0, v2, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v4i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_sdwa v4, sext(v2), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; VI-NEXT:    v_ashrrev_i32_sdwa v0, sext(v2), sext(v0) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_ashrrev_i32_sdwa v2, sext(v3), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; VI-NEXT:    v_ashrrev_i32_sdwa v1, sext(v3), sext(v1) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v4i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @8
; EG-NEXT:    ALU 3, @13, KC0[], KC1[]
; EG-NEXT:    TEX 0 @10
; EG-NEXT:    ALU 54, @17, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XY, T9.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 8:
; EG-NEXT:    VTX_READ_64 T10.XY, T9.X, 0, #1
; EG-NEXT:    Fetch clause starting at 10:
; EG-NEXT:    VTX_READ_64 T9.XY, T9.X, 8, #1
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:    MOV * T9.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 13:
; EG-NEXT:    MOV T4.X, T10.X,
; EG-NEXT:    MOV * T5.X, T10.Y,
; EG-NEXT:    MOV T0.Y, PV.X,
; EG-NEXT:    MOV * T0.Z, PS,
; EG-NEXT:    ALU clause starting at 17:
; EG-NEXT:    MOV T2.X, T9.X,
; EG-NEXT:    MOV * T3.X, T9.Y,
; EG-NEXT:    MOV * T0.W, T6.X,
; EG-NEXT:    MOV T1.Y, T2.X,
; EG-NEXT:    BFE_INT * T1.W, T0.Y, 0.0, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    AND_INT * T2.W, PV.Y, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    ASHR * T1.W, T1.W, PV.W,
; EG-NEXT:    AND_INT T1.W, PV.W, literal.x,
; EG-NEXT:    AND_INT * T0.W, T0.W, literal.y,
; EG-NEXT:    65535(9.183409e-41), -65536(nan)
; EG-NEXT:    OR_INT * T0.W, PS, PV.W,
; EG-NEXT:    MOV * T1.Z, T3.X,
; EG-NEXT:    MOV * T6.X, T0.W,
; EG-NEXT:    MOV T0.W, PV.X,
; EG-NEXT:    LSHR * T1.W, T0.Y, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    BFE_INT T1.W, PS, 0.0, literal.x,
; EG-NEXT:    LSHR * T2.W, T1.Y, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    ASHR T1.W, PV.W, PS,
; EG-NEXT:    AND_INT * T0.W, T0.W, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    LSHL * T1.W, PV.W, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    OR_INT * T0.W, T0.W, PV.W,
; EG-NEXT:    MOV T6.X, PV.W,
; EG-NEXT:    MOV T0.Y, T7.X,
; EG-NEXT:    BFE_INT T0.W, T0.Z, 0.0, literal.x,
; EG-NEXT:    AND_INT * T1.W, T1.Z, literal.y,
; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT:    ASHR T0.W, PV.W, PS,
; EG-NEXT:    AND_INT * T1.W, PV.Y, literal.x,
; EG-NEXT:    -65536(nan), 0(0.000000e+00)
; EG-NEXT:    AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT:    MOV * T7.X, PV.W,
; EG-NEXT:    MOV T0.Y, PV.X,
; EG-NEXT:    LSHR * T0.W, T0.Z, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT:    LSHR * T1.W, T1.Z, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    ASHR T0.W, PV.W, PS,
; EG-NEXT:    AND_INT * T1.W, T0.Y, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    LSHL * T0.W, PV.W, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    LSHR T9.X, KC0[2].Y, literal.x,
; EG-NEXT:    OR_INT * T10.Y, T1.W, PV.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    MOV T7.X, PV.Y,
; EG-NEXT:    MOV * T10.X, T6.X,
  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %in
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
  %result = ashr <4 x i16> %a, %b
  store <4 x i16> %result, <4 x i16> addrspace(1)* %out
  ret void
}

; Sign-extends the i32 kernel argument %in to i64 and shifts it right
; arithmetically by the constant 8; the result is stored to %out.
define amdgpu_kernel void @s_ashr_i64(i64 addrspace(1)* %out, i32 %in) {
; SI-LABEL: s_ashr_i64:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s5, s4, 31
; SI-NEXT:    s_ashr_i64 s[4:5], s[4:5], 8
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ashr_i64:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s5, s4, 31
; VI-NEXT:    s_ashr_i64 s[4:5], s[4:5], 8
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ashr_i64:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:    ASHR * T0.Y, KC0[2].Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    BIT_ALIGN_INT T0.X, PV.Y, KC0[2].Z, literal.x,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
entry:
  %in.ext = sext i32 %in to i64
  %ashr = ashr i64 %in.ext, 8
  store i64 %ashr, i64 addrspace(1)* %out
  ret void
}

; ashr of an i64 by a variable amount: the value is loaded from %in[0], the
; shift amount from %in[1], and the result is stored to %out.
define amdgpu_kernel void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; SI-LABEL: ashr_i64_2:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_i64_2:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i64 v[0:1], v2, v[0:1]
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_i64_2:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 10, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:    AND_INT * T0.W, T0.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    ASHR T1.Z, T0.Y, PV.W,
; EG-NEXT:    BIT_ALIGN_INT T0.W, T0.Y, T0.X, T0.Z,
; EG-NEXT:    AND_INT * T1.W, T0.Z, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.X, PS, PV.W, PV.Z,
; EG-NEXT:    ASHR T0.W, T0.Y, literal.x,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    31(4.344025e-44), 2(2.802597e-45)
; EG-NEXT:    CNDE_INT * T0.Y, T1.W, T1.Z, PV.W,
entry:
  %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
  %a = load i64, i64 addrspace(1)* %in
  %b = load i64, i64 addrspace(1)* %b_ptr
  %result = ashr i64 %a, %b
  store i64 %result, i64 addrspace(1)* %out
  ret void
}

; Element-wise ashr of <2 x i64>: the value is loaded from %in[0], the shift
; amounts from %in[1], and the result is stored to %out.
define amdgpu_kernel void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
; SI-LABEL: ashr_v2i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[2:3], v[2:3], v6
; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i64 v[2:3], v6, v[2:3]
; VI-NEXT:    v_ashrrev_i64 v[0:1], v4, v[0:1]
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v2i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 19, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T1.XYZW, T0.X, 16, #1
; EG-NEXT:    VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    AND_INT * T1.W, T1.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    ASHR T1.Y, T0.W, PV.W,
; EG-NEXT:    AND_INT T2.Z, T1.Z, literal.x,
; EG-NEXT:    BIT_ALIGN_INT T1.W, T0.W, T0.Z, T1.Z,
; EG-NEXT:    AND_INT * T2.W, T1.X, literal.y,
; EG-NEXT:    32(4.484155e-44), 31(4.344025e-44)
; EG-NEXT:    ASHR T2.Y, T0.Y, PS,
; EG-NEXT:    CNDE_INT T0.Z, PV.Z, PV.W, PV.Y,
; EG-NEXT:    BIT_ALIGN_INT T1.W, T0.Y, T0.X, T1.X,
; EG-NEXT:    AND_INT * T2.W, T1.X, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.X, PS, PV.W, PV.Y,
; EG-NEXT:    ASHR T0.W, T0.W, literal.x,
; EG-NEXT:    ASHR * T1.W, T0.Y, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT * T0.W, T2.Z, T1.Y, PV.W,
; EG-NEXT:    LSHR T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    CNDE_INT * T0.Y, T2.W, T2.Y, T1.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
  %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
  %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
  %result = ashr <2 x i64> %a, %b
  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
  ret void
}

; Element-wise ashr of <4 x i64>: the value is loaded from %in[0], the shift
; amounts from %in[1], and the result is stored to %out.
; FIXME: Broken on r600
define amdgpu_kernel void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
; SI-LABEL: ashr_v4i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; SI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
; SI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_ashr_i64 v[2:3], v[2:3], v6
; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v4
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[9:10], v[9:10], v13
; SI-NEXT:    v_ashr_i64 v[7:8], v[7:8], v11
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v4i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s6
; VI-NEXT:    s_mov_b32 s9, s7
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; VI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
; VI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_ashrrev_i64 v[2:3], v6, v[2:3]
; VI-NEXT:    v_ashrrev_i64 v[0:1], v4, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i64 v[9:10], v13, v[9:10]
; VI-NEXT:    v_ashrrev_i64 v[7:8], v11, v[7:8]
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v4i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 3 @6
; EG-NEXT:    ALU 39, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T1.XYZW, T0.X, 32, #1
; EG-NEXT:    VTX_READ_128 T2.XYZW, T0.X, 48, #1
; EG-NEXT:    VTX_READ_128 T3.XYZW, T0.X, 0, #1
; EG-NEXT:    VTX_READ_128 T0.XYZW, T0.X, 16, #1
; EG-NEXT:    ALU clause starting at 14:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 15:
; EG-NEXT:    AND_INT * T1.W, T1.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    ASHR T1.Y, T0.W, literal.x,
; EG-NEXT:    ASHR T4.Z, T3.W, PV.W, BS:VEC_120/SCL_212
; EG-NEXT:    AND_INT T1.W, T1.Z, literal.y,
; EG-NEXT:    AND_INT * T2.W, T2.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 32(4.484155e-44)
; EG-NEXT:    BIT_ALIGN_INT T4.X, T3.W, T3.Z, T1.Z,
; EG-NEXT:    ASHR T2.Y, T0.W, PS, BS:VEC_120/SCL_212
; EG-NEXT:    AND_INT * T1.Z, T2.Z, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    BIT_ALIGN_INT T0.W, T0.W, T0.Z, T2.Z,
; EG-NEXT:    AND_INT * T2.W, T2.X, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    AND_INT T5.X, T1.X, literal.x,
; EG-NEXT:    ASHR T4.Y, T0.Y, PS,
; EG-NEXT:    CNDE_INT T0.Z, T1.Z, PV.W, T2.Y,
; EG-NEXT:    BIT_ALIGN_INT T0.W, T0.Y, T0.X, T2.X,
; EG-NEXT:    AND_INT * T2.W, T2.X, literal.y,
; EG-NEXT:    31(4.344025e-44), 32(4.484155e-44)
; EG-NEXT:    CNDE_INT T0.X, PS, PV.W, PV.Y,
; EG-NEXT:    ASHR T5.Y, T3.Y, PV.X,
; EG-NEXT:    CNDE_INT T2.Z, T1.W, T4.X, T4.Z,
; EG-NEXT:    BIT_ALIGN_INT T0.W, T3.Y, T3.X, T1.X, BS:VEC_102/SCL_221
; EG-NEXT:    AND_INT * T4.W, T1.X, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T2.X, PS, PV.W, PV.Y,
; EG-NEXT:    ASHR T6.Y, T3.W, literal.x,
; EG-NEXT:    ASHR T3.Z, T0.Y, literal.x, BS:VEC_201
; EG-NEXT:    ADD_INT T3.W, KC0[2].Y, literal.y,
; EG-NEXT:    CNDE_INT * T0.W, T1.Z, T2.Y, T1.Y,
; EG-NEXT:    31(4.344025e-44), 16(2.242078e-44)
; EG-NEXT:    LSHR T1.X, PV.W, literal.x,
; EG-NEXT:    CNDE_INT T0.Y, T2.W, T4.Y, PV.Z,
; EG-NEXT:    ASHR T3.W, T3.Y, literal.y,
; EG-NEXT:    CNDE_INT * T2.W, T1.W, T4.Z, PV.Y,
; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
; EG-NEXT:    LSHR T3.X, KC0[2].Y, literal.x,
; EG-NEXT:    CNDE_INT * T2.Y, T4.W, T5.Y, PV.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
  %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
  %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
  %result = ashr <4 x i64> %a, %b
  store <4 x i64> %result, <4 x i64> addrspace(1)* %out
  ret void
}

; ashr of the i64 kernel argument %a by the constant 32, followed by an add of
; %b so the shifted value has a 64-bit use; the sum is stored to %out.
define amdgpu_kernel void @s_ashr_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
; SI-LABEL: s_ashr_32_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[0:1], 0x14
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x1d
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s7, s6, 31
; SI-NEXT:    s_add_u32 s4, s6, s4
; SI-NEXT:    s_addc_u32 s5, s7, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ashr_32_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s6, s[0:1], 0x50
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s7, s6, 31
; VI-NEXT:    s_add_u32 s4, s6, s4
; VI-NEXT:    s_addc_u32 s5, s7, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ashr_32_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:    ASHR * T0.W, KC0[5].X, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.W, PV.W, KC0[7].Z,
; EG-NEXT:    ADDC_UINT * T1.W, KC0[5].X, KC0[7].Y,
; EG-NEXT:    ADD_INT * T0.Y, T0.W, PV.W,
; EG-NEXT:    ADD_INT * T0.X, KC0[5].X, KC0[7].Y,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %result = ashr i64 %a, 32
  %add = add i64 %result, %b
  store i64 %add, i64 addrspace(1)* %out
  ret void
}

; Per-workitem ashr of an i64 by the constant 32: the element of %in indexed
; by workitem.id.x is shifted and stored to the same index of %out.
define amdgpu_kernel void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; SI-LABEL: v_ashr_32_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ashr_32_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v0, vcc
; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ashr_32_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_32 T0.X, T0.X, 4, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT:    LSHR T1.X, PV.W, literal.x,
; EG-NEXT:    ASHR * T0.Y, T0.X, literal.y,
; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %gep.in
  %result = ashr i64 %a, 32
  store i64 %result, i64 addrspace(1)* %gep.out
  ret void
}

; ashr of the i64 kernel argument %a by the constant 63, followed by an add of
; %b so the shifted value has a 64-bit use; the sum is stored to %out.
define amdgpu_kernel void @s_ashr_63_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
; SI-LABEL: s_ashr_63_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[0:1], 0x14
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x1d
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s6, s6, 31
; SI-NEXT:    s_add_u32 s4, s6, s4
; SI-NEXT:    s_addc_u32 s5, s6, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ashr_63_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s6, s[0:1], 0x50
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s6, s6, 31
; VI-NEXT:    s_add_u32 s4, s6, s4
; VI-NEXT:    s_addc_u32 s5, s6, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ashr_63_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:    ASHR * T0.W, KC0[5].X, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    ADD_INT T1.W, PV.W, KC0[7].Z,
; EG-NEXT:    ADDC_UINT * T2.W, PV.W, KC0[7].Y,
; EG-NEXT:    ADD_INT * T0.Y, PV.W, PS,
; EG-NEXT:    ADD_INT T0.X, T0.W, KC0[7].Y,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %result = ashr i64 %a, 63
  %add = add i64 %result, %b
  store i64 %add, i64 addrspace(1)* %out
  ret void
}

; Per-workitem ashr of an i64 by the constant 63: the element of %in indexed
; by workitem.id.x is shifted and stored to the same index of %out.
define amdgpu_kernel void @v_ashr_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; SI-LABEL: v_ashr_63_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
; SI-NEXT:    v_mov_b32_e32 v3, v2
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ashr_63_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v0, vcc
; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v2, 31, v3
; VI-NEXT:    v_mov_b32_e32 v3, v2
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ashr_63_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_32 T0.X, T0.X, 4, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    ASHR T0.X, T0.X, literal.x,
; EG-NEXT:    ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    LSHR T1.X, PV.W, literal.x,
; EG-NEXT:    MOV * T0.Y, PV.X,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %gep.in
  %result = ashr i64 %a, 63
  store i64 %result, i64 addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind readnone }