; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -check-prefixes=SI
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=VI
; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s -check-prefixes=EG

; Work-item id intrinsic; called by the per-lane (v_*) kernels later in this
; file to index their input and output pointers.
declare i32 @llvm.amdgcn.workitem.id.x() #0

; Lane-wise arithmetic shift right on <2 x i32>.
;   %in  - global pointer; element 0 is the value vector, element 1 holds the
;          per-lane shift amounts.
;   %out - global pointer receiving the shifted vector.
; The verde checks expect v_ashr_i32 (value, amount); the tonga checks expect
; the operand-reversed v_ashrrev_i32 (amount, value).
define amdgpu_kernel void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
; SI-LABEL: ashr_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i32_e32 v1, v1, v3
; SI-NEXT:    v_ashr_i32_e32 v0, v0, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v1, v3, v1
; VI-NEXT:    v_ashrrev_i32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v2i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     ASHR * T0.Y, T0.Y, T0.W,
; EG-NEXT:     ASHR T0.X, T0.X, T0.Z,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
  %result = ashr <2 x i32> %a, %b
  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
  ret void
}

; Lane-wise arithmetic shift right on <4 x i32>.
;   %in  - global pointer; element 0 is the value vector, element 1 holds the
;          per-lane shift amounts (loaded at byte offset 16 in the checks).
;   %out - global pointer receiving the shifted vector.
; All four lanes use the same instruction form per target: v_ashr_i32
; (value, amount) for verde, v_ashrrev_i32 (amount, value) for tonga.
define amdgpu_kernel void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
; SI-LABEL: ashr_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i32_e32 v3, v3, v7
; SI-NEXT:    v_ashr_i32_e32 v2, v2, v6
; SI-NEXT:    v_ashr_i32_e32 v1, v1, v5
; SI-NEXT:    v_ashr_i32_e32 v0, v0, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v3, v7, v3
; VI-NEXT:    v_ashrrev_i32_e32 v2, v6, v2
; VI-NEXT:    v_ashrrev_i32_e32 v1, v5, v1
; VI-NEXT:    v_ashrrev_i32_e32 v0, v4, v0
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v4i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     ASHR * T0.W, T0.W, T1.W,
; EG-NEXT:     ASHR * T0.Z, T0.Z, T1.Z,
; EG-NEXT:     ASHR * T0.Y, T0.Y, T1.Y,
; EG-NEXT:     ASHR T0.X, T0.X, T1.X,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
  %result = ashr <4 x i32> %a, %b
  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
  ret void
}

; FIXME: The ashr operation is uniform, but because its operands come from a
; global load we end up with the vector instructions rather than scalar.
; Lane-wise arithmetic shift right on <2 x i16>.
;   %in  - global pointer; element 0 is the value vector, element 1 holds the
;          per-lane shift amounts.
;   %out - global pointer receiving the shifted vector.
; In the verde and tonga checks both dwords are moved to SGPRs with
; v_readfirstlane, each 16-bit half is shifted with a 32-bit s_ashr_i32, and
; the two halves are repacked with shift/and/or before a single dword store.
define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
; SI-LABEL: ashr_v2i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_readfirstlane_b32 s0, v0
; SI-NEXT:    v_readfirstlane_b32 s1, v1
; SI-NEXT:    s_sext_i32_i16 s2, s0
; SI-NEXT:    s_ashr_i32 s0, s0, 16
; SI-NEXT:    s_lshr_b32 s3, s1, 16
; SI-NEXT:    s_ashr_i32 s0, s0, s3
; SI-NEXT:    s_ashr_i32 s1, s2, s1
; SI-NEXT:    s_lshl_b32 s0, s0, 16
; SI-NEXT:    s_and_b32 s1, s1, 0xffff
; SI-NEXT:    s_or_b32 s0, s1, s0
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_readfirstlane_b32 s0, v0
; VI-NEXT:    v_readfirstlane_b32 s1, v1
; VI-NEXT:    s_ashr_i32 s2, s0, 16
; VI-NEXT:    s_sext_i32_i16 s0, s0
; VI-NEXT:    s_ashr_i32 s3, s1, 16
; VI-NEXT:    s_sext_i32_i16 s1, s1
; VI-NEXT:    s_ashr_i32 s0, s0, s1
; VI-NEXT:    s_ashr_i32 s1, s2, s3
; VI-NEXT:    s_lshl_b32 s1, s1, 16
; VI-NEXT:    s_and_b32 s0, s0, 0xffff
; VI-NEXT:    s_or_b32 s0, s0, s1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v2i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_64 T6.XY, T6.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T6.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     LSHR * T0.W, T6.X, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     BFE_INT T0.Y, PV.W, 0.0, literal.x,
; EG-NEXT:     LSHR T0.Z, T6.Y, literal.x,
; EG-NEXT:     BFE_INT T0.W, T6.X, 0.0, literal.x,
; EG-NEXT:     AND_INT * T1.W, T6.Y, literal.y,
; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT:     ASHR T0.W, PV.W, PS,
; EG-NEXT:     ASHR * T1.W, PV.Y, PV.Z,
; EG-NEXT:     LSHL T1.W, PS, literal.x,
; EG-NEXT:     AND_INT * T0.W, PV.W, literal.y,
; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT:     OR_INT T6.X, PS, PV.W,
; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
  %result = ashr <2 x i16> %a, %b
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
  ret void
}

; FIXME: The ashr operation is uniform, but because its operands come from a
; global load we end up with the vector instructions rather than scalar.
; Lane-wise arithmetic shift right on <4 x i16>.
;   %in  - global pointer; element 0 is the value vector, element 1 holds the
;          per-lane shift amounts.
;   %out - global pointer receiving the shifted vector.
; In the verde and tonga checks the four loaded dwords are moved to SGPRs,
; every 16-bit half is shifted with a 32-bit s_ashr_i32, and the halves are
; repacked into two dwords for the store.
define amdgpu_kernel void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
; SI-LABEL: ashr_v4i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_readfirstlane_b32 s0, v3
; SI-NEXT:    v_readfirstlane_b32 s1, v2
; SI-NEXT:    v_readfirstlane_b32 s2, v1
; SI-NEXT:    v_readfirstlane_b32 s3, v0
; SI-NEXT:    s_sext_i32_i16 s8, s3
; SI-NEXT:    s_ashr_i32 s3, s3, 16
; SI-NEXT:    s_sext_i32_i16 s9, s2
; SI-NEXT:    s_ashr_i32 s2, s2, 16
; SI-NEXT:    s_lshr_b32 s10, s1, 16
; SI-NEXT:    s_lshr_b32 s11, s0, 16
; SI-NEXT:    s_ashr_i32 s2, s2, s11
; SI-NEXT:    s_ashr_i32 s0, s9, s0
; SI-NEXT:    s_ashr_i32 s3, s3, s10
; SI-NEXT:    s_ashr_i32 s1, s8, s1
; SI-NEXT:    s_lshl_b32 s2, s2, 16
; SI-NEXT:    s_and_b32 s0, s0, 0xffff
; SI-NEXT:    s_lshl_b32 s3, s3, 16
; SI-NEXT:    s_and_b32 s1, s1, 0xffff
; SI-NEXT:    s_or_b32 s0, s0, s2
; SI-NEXT:    s_or_b32 s1, s1, s3
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v4i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_readfirstlane_b32 s0, v2
; VI-NEXT:    v_readfirstlane_b32 s1, v3
; VI-NEXT:    v_readfirstlane_b32 s2, v0
; VI-NEXT:    v_readfirstlane_b32 s3, v1
; VI-NEXT:    s_ashr_i32 s8, s3, 16
; VI-NEXT:    s_sext_i32_i16 s3, s3
; VI-NEXT:    s_ashr_i32 s9, s2, 16
; VI-NEXT:    s_sext_i32_i16 s2, s2
; VI-NEXT:    s_ashr_i32 s10, s1, 16
; VI-NEXT:    s_sext_i32_i16 s1, s1
; VI-NEXT:    s_ashr_i32 s11, s0, 16
; VI-NEXT:    s_sext_i32_i16 s0, s0
; VI-NEXT:    s_ashr_i32 s0, s2, s0
; VI-NEXT:    s_ashr_i32 s2, s9, s11
; VI-NEXT:    s_ashr_i32 s1, s3, s1
; VI-NEXT:    s_ashr_i32 s3, s8, s10
; VI-NEXT:    s_lshl_b32 s3, s3, 16
; VI-NEXT:    s_and_b32 s1, s1, 0xffff
; VI-NEXT:    s_lshl_b32 s2, s2, 16
; VI-NEXT:    s_and_b32 s0, s0, 0xffff
; VI-NEXT:    s_or_b32 s1, s1, s3
; VI-NEXT:    s_or_b32 s0, s0, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v4i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 58, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XY, T9.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T9.XYZW, T9.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T9.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     MOV T4.X, T9.X,
; EG-NEXT:     MOV * T5.X, T9.Y,
; EG-NEXT:     MOV T0.Y, PV.X,
; EG-NEXT:     MOV * T0.Z, PS,
; EG-NEXT:     MOV T2.X, T9.Z,
; EG-NEXT:     MOV * T3.X, T9.W,
; EG-NEXT:     MOV * T0.W, T6.X,
; EG-NEXT:     MOV T1.Y, T2.X,
; EG-NEXT:     BFE_INT * T1.W, T0.Y, 0.0, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     AND_INT * T2.W, PV.Y, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:     ASHR * T1.W, T1.W, PV.W,
; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
; EG-NEXT:     AND_INT * T0.W, T0.W, literal.y,
; EG-NEXT:    65535(9.183409e-41), -65536(nan)
; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
; EG-NEXT:     MOV * T1.Z, T3.X,
; EG-NEXT:     MOV * T6.X, T0.W,
; EG-NEXT:     MOV T0.W, PV.X,
; EG-NEXT:     LSHR * T1.W, T0.Y, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     BFE_INT T1.W, PS, 0.0, literal.x,
; EG-NEXT:     LSHR * T2.W, T1.Y, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     ASHR T1.W, PV.W, PS,
; EG-NEXT:     AND_INT * T0.W, T0.W, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     OR_INT * T0.W, T0.W, PV.W,
; EG-NEXT:     MOV T6.X, PV.W,
; EG-NEXT:     MOV T0.Y, T7.X,
; EG-NEXT:     BFE_INT T0.W, T0.Z, 0.0, literal.x,
; EG-NEXT:     AND_INT * T1.W, T1.Z, literal.y,
; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT:     ASHR T0.W, PV.W, PS,
; EG-NEXT:     AND_INT * T1.W, PV.Y, literal.x,
; EG-NEXT:    -65536(nan), 0(0.000000e+00)
; EG-NEXT:     AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT:     MOV * T7.X, PV.W,
; EG-NEXT:     MOV T0.Y, PV.X,
; EG-NEXT:     LSHR * T0.W, T0.Z, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT:     LSHR * T1.W, T1.Z, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     ASHR T0.W, PV.W, PS,
; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     LSHR T9.X, KC0[2].Y, literal.x,
; EG-NEXT:     OR_INT * T10.Y, T1.W, PV.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     MOV T7.X, PV.Y,
; EG-NEXT:     MOV * T10.X, T6.X,
  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %in
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
  %result = ashr <4 x i16> %a, %b
  store <4 x i16> %result, <4 x i16> addrspace(1)* %out
  ret void
}

; Scalar i64 arithmetic shift right by the constant 8.
;   %in  - i32 kernel argument, sign-extended to i64 before the shift.
;   %out - global pointer receiving the 64-bit result.
; The verde and tonga checks build the high dword with s_ashr_i32 ..., 31 and
; then use a single s_ashr_i64.
define amdgpu_kernel void @s_ashr_i64(i64 addrspace(1)* %out, i32 %in) {
; SI-LABEL: s_ashr_i64:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s5, s4, 31
; SI-NEXT:    s_ashr_i64 s[4:5], s[4:5], 8
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ashr_i64:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s5, s4, 31
; VI-NEXT:    s_ashr_i64 s[4:5], s[4:5], 8
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ashr_i64:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     ASHR * T0.Y, KC0[2].Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     BIT_ALIGN_INT T0.X, PV.Y, KC0[2].Z, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
entry:
  %in.ext = sext i32 %in to i64
  %ashr = ashr i64 %in.ext, 8
  store i64 %ashr, i64 addrspace(1)* %out
  ret void
}

; i64 arithmetic shift right with a variable shift amount.
;   %in  - global pointer; element 0 is the value, element 1 is the amount.
;   %out - global pointer receiving the 64-bit result.
; The verde checks expect v_ashr_i64 (value, amount); the tonga checks expect
; v_ashrrev_i64 (amount, value). Only the low dword of the amount is used as
; the shift operand in both.
define amdgpu_kernel void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; SI-LABEL: ashr_i64_2:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_i64_2:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i64 v[0:1], v2, v[0:1]
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_i64_2:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 10, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     AND_INT * T0.W, T0.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     ASHR T1.Z, T0.Y, PV.W,
; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.Y, T0.X, T0.Z,
; EG-NEXT:     AND_INT * T1.W, T0.Z, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, PS, PV.W, PV.Z,
; EG-NEXT:     ASHR T0.W, T0.Y, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    31(4.344025e-44), 2(2.802597e-45)
; EG-NEXT:     CNDE_INT * T0.Y, T1.W, T1.Z, PV.W,
entry:
  %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
  %a = load i64, i64 addrspace(1)* %in
  %b = load i64, i64 addrspace(1)* %b_ptr
  %result = ashr i64 %a, %b
  store i64 %result, i64 addrspace(1)* %out
  ret void
}

; Lane-wise arithmetic shift right on <2 x i64>.
;   %in  - global pointer; element 0 is the value vector, element 1 holds the
;          per-lane shift amounts.
;   %out - global pointer receiving the shifted vector.
define amdgpu_kernel void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
; SI-LABEL: ashr_v2i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[2:3], v[2:3], v6
; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i64 v[2:3], v6, v[2:3]
; VI-NEXT:    v_ashrrev_i64 v[0:1], v4, v[0:1]
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v2i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 19, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     AND_INT * T1.W, T1.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     ASHR T1.Y, T0.W, PV.W,
; EG-NEXT:     AND_INT T2.Z, T1.Z, literal.x,
; EG-NEXT:     BIT_ALIGN_INT T1.W, T0.W, T0.Z, T1.Z,
; EG-NEXT:     AND_INT * T2.W, T1.X, literal.y,
; EG-NEXT:    32(4.484155e-44), 31(4.344025e-44)
; EG-NEXT:     ASHR T2.Y, T0.Y, PS,
; EG-NEXT:     CNDE_INT T0.Z, PV.Z, PV.W, PV.Y,
; EG-NEXT:     BIT_ALIGN_INT T1.W, T0.Y, T0.X, T1.X,
; EG-NEXT:     AND_INT * T2.W, T1.X, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, PS, PV.W, PV.Y,
; EG-NEXT:     ASHR T0.W, T0.W, literal.x,
; EG-NEXT:     ASHR * T1.W, T0.Y, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT * T0.W, T2.Z, T1.Y, PV.W,
; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
; EG-NEXT:     CNDE_INT * T0.Y, T2.W, T2.Y, T1.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
  %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
  %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
  %result = ashr <2 x i64> %a, %b
  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
  ret void
}

; FIXME: Broken on r600
; NOTE(review): r600 assertions are present for this function below, so the
; FIXME above may be stale - confirm before removing it.
;
; Lane-wise arithmetic shift right on <4 x i64>.
;   %in  - global pointer; element 0 is the value vector, element 1 holds the
;          per-lane shift amounts.
;   %out - global pointer receiving the shifted vector (two 16-byte stores in
;          the checks).
define amdgpu_kernel void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
; SI-LABEL: ashr_v4i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; SI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
; SI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_ashr_i64 v[2:3], v[2:3], v6
; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v4
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[9:10], v[9:10], v13
; SI-NEXT:    v_ashr_i64 v[7:8], v[7:8], v11
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v4i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s6
; VI-NEXT:    s_mov_b32 s9, s7
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; VI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
; VI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_ashrrev_i64 v[2:3], v6, v[2:3]
; VI-NEXT:    v_ashrrev_i64 v[0:1], v4, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i64 v[9:10], v13, v[9:10]
; VI-NEXT:    v_ashrrev_i64 v[7:8], v11, v[7:8]
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v4i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 3 @6
; EG-NEXT:    ALU 39, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 32, #1
; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 48, #1
; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 0, #1
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 16, #1
; EG-NEXT:    ALU clause starting at 14:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 15:
; EG-NEXT:     AND_INT * T1.W, T1.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     ASHR T1.Y, T0.W, literal.x,
; EG-NEXT:     ASHR T4.Z, T3.W, PV.W, BS:VEC_120/SCL_212
; EG-NEXT:     AND_INT T1.W, T1.Z, literal.y,
; EG-NEXT:     AND_INT * T2.W, T2.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 32(4.484155e-44)
; EG-NEXT:     BIT_ALIGN_INT T4.X, T3.W, T3.Z, T1.Z,
; EG-NEXT:     ASHR T2.Y, T0.W, PS, BS:VEC_120/SCL_212
; EG-NEXT:     AND_INT * T1.Z, T2.Z, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.W, T0.Z, T2.Z,
; EG-NEXT:     AND_INT * T2.W, T2.X, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     AND_INT T5.X, T1.X, literal.x,
; EG-NEXT:     ASHR T4.Y, T0.Y, PS,
; EG-NEXT:     CNDE_INT T0.Z, T1.Z, PV.W, T2.Y,
; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.Y, T0.X, T2.X,
; EG-NEXT:     AND_INT * T2.W, T2.X, literal.y,
; EG-NEXT:    31(4.344025e-44), 32(4.484155e-44)
; EG-NEXT:     CNDE_INT T0.X, PS, PV.W, PV.Y,
; EG-NEXT:     ASHR T5.Y, T3.Y, PV.X,
; EG-NEXT:     CNDE_INT T2.Z, T1.W, T4.X, T4.Z,
; EG-NEXT:     BIT_ALIGN_INT T0.W, T3.Y, T3.X, T1.X, BS:VEC_102/SCL_221
; EG-NEXT:     AND_INT * T4.W, T1.X, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T2.X, PS, PV.W, PV.Y,
; EG-NEXT:     ASHR T6.Y, T3.W, literal.x,
; EG-NEXT:     ASHR T3.Z, T0.Y, literal.x, BS:VEC_201
; EG-NEXT:     ADD_INT T3.W, KC0[2].Y, literal.y,
; EG-NEXT:     CNDE_INT * T0.W, T1.Z, T2.Y, T1.Y,
; EG-NEXT:    31(4.344025e-44), 16(2.242078e-44)
; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
; EG-NEXT:     CNDE_INT T0.Y, T2.W, T4.Y, PV.Z,
; EG-NEXT:     ASHR T3.W, T3.Y, literal.y,
; EG-NEXT:     CNDE_INT * T2.W, T1.W, T4.Z, PV.Y,
; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
; EG-NEXT:     LSHR T3.X, KC0[2].Y, literal.x,
; EG-NEXT:     CNDE_INT * T2.Y, T4.W, T5.Y, PV.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
  %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
  %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
  %result = ashr <4 x i64> %a, %b
  store <4 x i64> %result, <4 x i64> addrspace(1)* %out
  ret void
}

; Scalar i64 arithmetic shift right by exactly 32, followed by an add of %b.
;   %a, %b - i64 kernel arguments, each preceded by an unnamed [8 x i32]
;            argument (presumably padding to separate the argument loads -
;            TODO confirm).
;   %out   - global pointer receiving the 64-bit sum.
; The checks show no 64-bit shift: a single dword of %a is loaded, used
; directly as the low half, and shifted right by 31 to form the high half.
define amdgpu_kernel void @s_ashr_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
; SI-LABEL: s_ashr_32_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[0:1], 0x14
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x1d
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s7, s6, 31
; SI-NEXT:    s_add_u32 s4, s6, s4
; SI-NEXT:    s_addc_u32 s5, s7, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ashr_32_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s6, s[0:1], 0x50
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s7, s6, 31
; VI-NEXT:    s_add_u32 s4, s6, s4
; VI-NEXT:    s_addc_u32 s5, s7, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ashr_32_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     ASHR * T0.W, KC0[5].X, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.W, PV.W, KC0[7].Z,
; EG-NEXT:     ADDC_UINT * T1.W, KC0[5].X, KC0[7].Y,
; EG-NEXT:     ADD_INT * T0.Y, T0.W, PV.W,
; EG-NEXT:     ADD_INT * T0.X, KC0[5].X, KC0[7].Y,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %result = ashr i64 %a, 32
  %add = add i64 %result, %b
  store i64 %add, i64 addrspace(1)* %out
  ret void
}

; Per-work-item i64 arithmetic shift right by exactly 32.
; Each work item indexes %in and %out by its X work-item id, loads one i64,
; shifts it and stores the result.
; The checks show only one dword being loaded (byte offset 4 from the
; element); it becomes the low half of the result and its ashr-by-31 becomes
; the high half.
define amdgpu_kernel void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; SI-LABEL: v_ashr_32_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ashr_32_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v0, vcc
; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ashr_32_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 4, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
; EG-NEXT:     ASHR * T0.Y, T0.X, literal.y,
; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %gep.in
  %result = ashr i64 %a, 32
  store i64 %result, i64 addrspace(1)* %gep.out
  ret void
}

; Scalar i64 arithmetic shift right by 63, followed by an add of %b.
;   %a, %b - i64 kernel arguments, each preceded by an unnamed [8 x i32]
;            argument (presumably padding to separate the argument loads -
;            TODO confirm).
;   %out   - global pointer receiving the 64-bit sum.
; The checks show one dword of %a shifted right by 31 and that single value
; used for both halves of the add.
define amdgpu_kernel void @s_ashr_63_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
; SI-LABEL: s_ashr_63_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[0:1], 0x14
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x1d
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s6, s6, 31
; SI-NEXT:    s_add_u32 s4, s6, s4
; SI-NEXT:    s_addc_u32 s5, s6, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ashr_63_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s6, s[0:1], 0x50
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s6, s6, 31
; VI-NEXT:    s_add_u32 s4, s6, s4
; VI-NEXT:    s_addc_u32 s5, s6, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ashr_63_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     ASHR * T0.W, KC0[5].X, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     ADD_INT T1.W, PV.W, KC0[7].Z,
; EG-NEXT:     ADDC_UINT * T2.W, PV.W, KC0[7].Y,
; EG-NEXT:     ADD_INT * T0.Y, PV.W, PS,
; EG-NEXT:     ADD_INT T0.X, T0.W, KC0[7].Y,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %result = ashr i64 %a, 63
  %add = add i64 %result, %b
  store i64 %add, i64 addrspace(1)* %out
  ret void
}

; Per-work-item i64 arithmetic shift right by 63.
; Each work item indexes %in and %out by its X work-item id, loads one i64,
; shifts it and stores the result.
; The checks show only one dword being loaded (byte offset 4 from the
; element); it is shifted right by 31 and the result is copied so that both
; halves of the stored value are identical.
define amdgpu_kernel void @v_ashr_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; SI-LABEL: v_ashr_63_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
; SI-NEXT:    v_mov_b32_e32 v3, v2
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ashr_63_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v0, vcc
; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v2, 31, v3
; VI-NEXT:    v_mov_b32_e32 v3, v2
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ashr_63_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 4, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     ASHR T0.X, T0.X, literal.x,
; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
; EG-NEXT:     MOV * T0.Y, PV.X,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %gep.in
  %result = ashr i64 %a, 63
  store i64 %result, i64 addrspace(1)* %gep.out
  ret void
}

; Attribute group applied to the work-item id intrinsic and its call sites.
attributes #0 = { nounwind readnone }