; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -check-prefixes=SI
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=VI
; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefixes=EG

declare i32 @llvm.amdgcn.workitem.id.x() #0

; Scalar i32 lshr: both the value and the shift amount are loaded from
; consecutive dwords at %in; the result is stored to %out.
define amdgpu_kernel void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
; SI-LABEL: lshr_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: lshr_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s4, s4, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: lshr_i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     LSHR T0.X, T0.X, T0.Y,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %a = load i32, i32 addrspace(1)* %in
  %b = load i32, i32 addrspace(1)* %b_ptr
  %result = lshr i32 %a, %b
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

; Vector <2 x i32> lshr: values at %in, per-lane shift amounts at %in + 1.
define amdgpu_kernel void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
; SI-LABEL: lshr_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshr_b32_e32 v1, v1, v3
; SI-NEXT:    v_lshr_b32_e32 v0, v0, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: lshr_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; VI-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x8
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s5, s5, s7
; VI-NEXT:    s_lshr_b32 s4, s4, s6
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: lshr_v2i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_64 T1.XY, T0.X, 8, #1
; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     LSHR * T0.Y, T0.Y, T1.Y,
; EG-NEXT:     LSHR T0.X, T0.X, T1.X,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
  %result = lshr <2 x i32> %a, %b
  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
  ret void
}

; Vector <4 x i32> lshr: values at %in, per-lane shift amounts at %in + 1.
define amdgpu_kernel void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
; SI-LABEL: lshr_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshr_b32_e32 v3, v3, v7
; SI-NEXT:    v_lshr_b32_e32 v2, v2, v6
; SI-NEXT:    v_lshr_b32_e32 v1, v1, v5
; SI-NEXT:    v_lshr_b32_e32 v0, v0, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: lshr_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
; VI-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x10
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s7, s7, s11
; VI-NEXT:    s_lshr_b32 s6, s6, s10
; VI-NEXT:    s_lshr_b32 s5, s5, s9
; VI-NEXT:    s_lshr_b32 s4, s4, s8
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: lshr_v4i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     LSHR * T0.W, T0.W, T1.W,
; EG-NEXT:     LSHR * T0.Z, T0.Z, T1.Z,
; EG-NEXT:     LSHR * T0.Y, T0.Y, T1.Y,
; EG-NEXT:     LSHR T0.X, T0.X, T1.X,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
  %result = lshr <4 x i32> %a, %b
  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
  ret void
}

; Scalar i64 lshr with a variable i64 shift amount loaded from %in + 1.
; On r600 (EG) this expands to BIT_ALIGN_INT/CNDE_INT since there is no
; native 64-bit shift.
define amdgpu_kernel void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; SI-LABEL: lshr_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshr_b64 v[0:1], v[0:1], v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: lshr_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b64 s[4:5], s[4:5], s6
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: lshr_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 9, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     AND_INT * T0.W, T0.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     LSHR T1.Z, T0.Y, PV.W,
; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.Y, T0.X, T0.Z,
; EG-NEXT:     AND_INT * T1.W, T0.Z, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, PS, PV.W, PV.Z,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT * T0.Y, T1.W, T1.Z, 0.0,
  %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
  %a = load i64, i64 addrspace(1)* %in
  %b = load i64, i64 addrspace(1)* %b_ptr
  %result = lshr i64 %a, %b
  store i64 %result, i64 addrspace(1)* %out
  ret void
}

; Vector <4 x i64> lshr: values at %in, per-lane i64 shift amounts at %in + 1.
define amdgpu_kernel void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
; SI-LABEL: lshr_v4i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; SI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
; SI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], v6
; SI-NEXT:    v_lshr_b64 v[0:1], v[0:1], v4
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshr_b64 v[9:10], v[9:10], v13
; SI-NEXT:    v_lshr_b64 v[7:8], v[7:8], v11
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: lshr_v4i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[16:19], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx8 s[0:7], s[18:19], 0x0
; VI-NEXT:    s_load_dwordx8 s[8:15], s[18:19], 0x20
; VI-NEXT:    s_mov_b32 s19, 0xf000
; VI-NEXT:    s_mov_b32 s18, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b64 s[6:7], s[6:7], s14
; VI-NEXT:    s_lshr_b64 s[4:5], s[4:5], s12
; VI-NEXT:    s_lshr_b64 s[2:3], s[2:3], s10
; VI-NEXT:    s_lshr_b64 s[0:1], s[0:1], s8
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
; VI-NEXT:    s_nop 0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: lshr_v4i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 3 @6
; EG-NEXT:    ALU 34, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 32, #1
; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 16, #1
; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 48, #1
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 14:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 15:
; EG-NEXT:     AND_INT * T1.W, T1.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     LSHR T4.Z, T0.W, PV.W,
; EG-NEXT:     AND_INT T1.W, T1.Z, literal.x,
; EG-NEXT:     AND_INT * T3.W, T3.Z, literal.y,
; EG-NEXT:    32(4.484155e-44), 31(4.344025e-44)
; EG-NEXT:     BIT_ALIGN_INT T4.X, T0.W, T0.Z, T1.Z,
; EG-NEXT:     LSHR T1.Y, T2.W, PS, BS:VEC_120/SCL_212
; EG-NEXT:     AND_INT * T0.Z, T3.Z, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     BIT_ALIGN_INT T0.W, T2.W, T2.Z, T3.Z,
; EG-NEXT:     AND_INT * T2.W, T3.X, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     AND_INT T5.X, T1.X, literal.x,
; EG-NEXT:     LSHR T3.Y, T2.Y, PS,
; EG-NEXT:     CNDE_INT T2.Z, T0.Z, PV.W, T1.Y,
; EG-NEXT:     BIT_ALIGN_INT T0.W, T2.Y, T2.X, T3.X,
; EG-NEXT:     AND_INT * T3.W, T3.X, literal.y,
; EG-NEXT:    31(4.344025e-44), 32(4.484155e-44)
; EG-NEXT:     CNDE_INT T2.X, PS, PV.W, PV.Y,
; EG-NEXT:     LSHR T4.Y, T0.Y, PV.X,
; EG-NEXT:     CNDE_INT T1.Z, T1.W, T4.X, T4.Z,
; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.Y, T0.X, T1.X, BS:VEC_102/SCL_221
; EG-NEXT:     AND_INT * T4.W, T1.X, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T1.X, PS, PV.W, PV.Y,
; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.x,
; EG-NEXT:     CNDE_INT * T2.W, T0.Z, T1.Y, 0.0,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     LSHR T0.X, PV.W, literal.x,
; EG-NEXT:     CNDE_INT T2.Y, T3.W, T3.Y, 0.0,
; EG-NEXT:     CNDE_INT T1.W, T1.W, T4.Z, 0.0, BS:VEC_120/SCL_212
; EG-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT * T1.Y, T4.W, T4.Y, 0.0,
  %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
  %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
  %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
  %result = lshr <4 x i64> %a, %b
  store <4 x i64> %result, <4 x i64> addrspace(1)* %out
  ret void
}

; Make sure load width gets reduced to i32 load.
; (lshr i64 by exactly 32 only needs the high dword of %a; the low result
; dword comes straight from it and the high result dword is constant 0.)
define amdgpu_kernel void @s_lshr_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
; SI-LABEL: s_lshr_32_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s4, s[0:1], 0x14
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_lshr_32_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s4, s[0:1], 0x50
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_lshr_32_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     MOV T0.X, KC0[5].X,
; EG-NEXT:     MOV T0.Y, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %result = lshr i64 %a, 32
  store i64 %result, i64 addrspace(1)* %out
  ret void
}

; Same shift-by-32 narrowing, but with a per-workitem (VGPR-addressed) load:
; only the high dword at gep.in + 4 is loaded; the high result dword is 0.
define amdgpu_kernel void @v_lshr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; SI-LABEL: v_lshr_32_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
; SI-NEXT:    v_mov_b32_e32 v3, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_lshr_32_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v0, vcc
; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_lshr_32_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 4, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     MOV T0.Y, 0.0,
; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %gep.in
  %result = lshr i64 %a, 32
  store i64 %result, i64 addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind readnone }