; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefixes=SI
; RUN: llc < %s -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=VI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefixes=EG

declare i32 @llvm.amdgcn.workitem.id.x() #0

declare i32 @llvm.amdgcn.workgroup.id.x() #0

define amdgpu_kernel void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
; SI-LABEL: shl_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshl_b32_e32 v1, v1, v3
; SI-NEXT: v_lshl_b32_e32 v0, v0, v2
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: shl_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s5, s5, s7
; VI-NEXT: s_lshl_b32 s4, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: shl_v2i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHL * T0.Y, T0.Y, T0.W,
; EG-NEXT: LSHL T0.X, T0.X, T0.Z,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
  %result = shl <2 x i32> %a, %b
  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
; SI-LABEL: shl_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v3, v7, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, v6, v2
; SI-NEXT: v_lshlrev_b32_e32 v1, v5, v1
; SI-NEXT: v_lshl_b32_e32 v0, v0, v4
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: shl_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s3, s3, s7
; VI-NEXT: s_lshl_b32 s2, s2, s6
; VI-NEXT: s_lshl_b32 s1, s1, s5
; VI-NEXT: s_lshl_b32 s0, s0, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: shl_v4i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: LSHL * T0.W, T0.W, T1.W,
; EG-NEXT: LSHL * T0.Z, T0.Z, T1.Z,
; EG-NEXT: LSHL * T0.Y, T0.Y, T1.Y,
; EG-NEXT: LSHL T0.X, T0.X, T1.X,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
  %result = shl <4 x i32> %a, %b
  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
; SI-LABEL: shl_i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:2
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: shl_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s2
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:2
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: shl_i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
; EG-NEXT: LSHL * T1.W, T0.X, T1.X,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PS, literal.x,
; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
  %a = load i16, i16 addrspace(1)* %in
  %b = load i16, i16 addrspace(1)* %b_ptr
  %result = shl i16 %a, %b
  store i16 %result, i16 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
; SI-LABEL: shl_i16_v_s:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dword s12, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s10, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, s12, v0
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: shl_i16_v_s:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dword s12, s[0:1], 0x34
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s6
; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, s12, v0
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: shl_i16_v_s:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 12, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV T0.X, 0.0,
; EG-NEXT: MOV * T1.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
; EG-NEXT: LSHL * T1.W, T1.X, T0.X,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PS, literal.x,
; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %a = load i16, i16 addrspace(1)* %in
  %result = shl i16 %a, %b
  store i16 %result, i16 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @shl_i16_v_compute_s(i16
addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) { 288; SI-LABEL: shl_i16_v_compute_s: 289; SI: ; %bb.0: 290; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 291; SI-NEXT: s_load_dword s12, s[0:1], 0xd 292; SI-NEXT: s_mov_b32 s3, 0xf000 293; SI-NEXT: s_mov_b32 s2, -1 294; SI-NEXT: s_mov_b32 s10, s2 295; SI-NEXT: s_waitcnt lgkmcnt(0) 296; SI-NEXT: s_mov_b32 s8, s6 297; SI-NEXT: s_mov_b32 s9, s7 298; SI-NEXT: s_mov_b32 s11, s3 299; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 300; SI-NEXT: s_add_i32 s12, s12, 3 301; SI-NEXT: s_mov_b32 s0, s4 302; SI-NEXT: s_mov_b32 s1, s5 303; SI-NEXT: s_waitcnt vmcnt(0) 304; SI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 305; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 306; SI-NEXT: s_endpgm 307; 308; VI-LABEL: shl_i16_v_compute_s: 309; VI: ; %bb.0: 310; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 311; VI-NEXT: s_load_dword s12, s[0:1], 0x34 312; VI-NEXT: s_mov_b32 s3, 0xf000 313; VI-NEXT: s_mov_b32 s2, -1 314; VI-NEXT: s_mov_b32 s10, s2 315; VI-NEXT: s_waitcnt lgkmcnt(0) 316; VI-NEXT: s_mov_b32 s8, s6 317; VI-NEXT: s_mov_b32 s9, s7 318; VI-NEXT: s_mov_b32 s11, s3 319; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 320; VI-NEXT: s_add_i32 s12, s12, 3 321; VI-NEXT: s_mov_b32 s0, s4 322; VI-NEXT: s_mov_b32 s1, s5 323; VI-NEXT: s_waitcnt vmcnt(0) 324; VI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 325; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 326; VI-NEXT: s_endpgm 327; 328; EG-LABEL: shl_i16_v_compute_s: 329; EG: ; %bb.0: 330; EG-NEXT: ALU 0, @12, KC0[], KC1[] 331; EG-NEXT: TEX 0 @8 332; EG-NEXT: ALU 0, @13, KC0[CB0:0-32], KC1[] 333; EG-NEXT: TEX 0 @10 334; EG-NEXT: ALU 15, @14, KC0[CB0:0-32], KC1[] 335; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 336; EG-NEXT: CF_END 337; EG-NEXT: PAD 338; EG-NEXT: Fetch clause starting at 8: 339; EG-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3 340; EG-NEXT: Fetch clause starting at 10: 341; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1 342; EG-NEXT: ALU clause starting at 12: 343; EG-NEXT: MOV * T0.X, 0.0, 344; EG-NEXT: ALU clause starting at 13: 345; EG-NEXT: MOV * T1.X, KC0[2].Z, 346; EG-NEXT: ALU clause starting at 14: 347; EG-NEXT: ADD_INT * T0.W, T0.X, literal.x, 348; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 349; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 350; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y, 351; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 352; EG-NEXT: LSHL * T0.W, T1.X, PV.W, 353; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 354; EG-NEXT: LSHL * T1.W, T1.W, literal.y, 355; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 356; EG-NEXT: LSHL T0.X, PV.W, PS, 357; EG-NEXT: LSHL * T0.W, literal.x, PS, 358; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 359; EG-NEXT: MOV T0.Y, 0.0, 360; EG-NEXT: MOV * T0.Z, 0.0, 361; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 362; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 363 %a = load i16, i16 addrspace(1)* %in 364 %b.add = add i16 %b, 3 365 %result = shl i16 %a, %b.add 366 store i16 %result, i16 addrspace(1)* %out 367 ret void 368} 369 370define amdgpu_kernel void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { 371; SI-LABEL: shl_i16_computed_amount: 372; SI: ; %bb.0: 373; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 374; SI-NEXT: s_mov_b32 s7, 0xf000 375; SI-NEXT: s_mov_b32 s6, -1 376; SI-NEXT: s_mov_b32 s10, s6 377; SI-NEXT: s_mov_b32 s11, s7 378; SI-NEXT: s_waitcnt lgkmcnt(0) 379; SI-NEXT: s_mov_b32 s8, s2 380; SI-NEXT: s_mov_b32 s9, s3 381; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 382; SI-NEXT: v_mov_b32_e32 v1, 0 383; SI-NEXT: s_mov_b32 s14, 0 384; SI-NEXT: s_mov_b32 s15, s7 385; 
SI-NEXT: s_mov_b64 s[12:13], s[2:3] 386; SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc 387; SI-NEXT: s_waitcnt vmcnt(0) 388; SI-NEXT: buffer_load_ushort v0, v[0:1], s[12:15], 0 addr64 offset:2 glc 389; SI-NEXT: s_waitcnt vmcnt(0) 390; SI-NEXT: s_mov_b32 s4, s0 391; SI-NEXT: s_mov_b32 s5, s1 392; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 393; SI-NEXT: v_lshlrev_b32_e32 v0, v0, v2 394; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 395; SI-NEXT: s_endpgm 396; 397; VI-LABEL: shl_i16_computed_amount: 398; VI: ; %bb.0: 399; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 400; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 401; VI-NEXT: s_mov_b32 s7, 0xf000 402; VI-NEXT: s_mov_b32 s6, -1 403; VI-NEXT: s_mov_b32 s10, s6 404; VI-NEXT: s_waitcnt lgkmcnt(0) 405; VI-NEXT: v_mov_b32_e32 v1, s3 406; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 407; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 408; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 409; VI-NEXT: s_mov_b32 s8, s2 410; VI-NEXT: s_mov_b32 s9, s3 411; VI-NEXT: s_mov_b32 s11, s7 412; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 413; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc 414; VI-NEXT: s_waitcnt vmcnt(0) 415; VI-NEXT: flat_load_ushort v0, v[0:1] glc 416; VI-NEXT: s_waitcnt vmcnt(0) 417; VI-NEXT: s_mov_b32 s4, s0 418; VI-NEXT: s_mov_b32 s5, s1 419; VI-NEXT: v_add_u16_e32 v0, 3, v0 420; VI-NEXT: v_lshlrev_b16_e32 v0, v0, v2 421; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 422; VI-NEXT: s_endpgm 423; 424; EG-LABEL: shl_i16_computed_amount: 425; EG: ; %bb.0: 426; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[] 427; EG-NEXT: TEX 0 @8 428; EG-NEXT: ALU 1, @13, KC0[CB0:0-32], KC1[] 429; EG-NEXT: TEX 0 @10 430; EG-NEXT: ALU 15, @15, KC0[CB0:0-32], KC1[] 431; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 432; EG-NEXT: CF_END 433; EG-NEXT: PAD 434; EG-NEXT: Fetch clause starting at 8: 435; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1 436; EG-NEXT: Fetch clause starting at 10: 437; EG-NEXT: VTX_READ_16 T0.X, T0.X, 2, #1 438; EG-NEXT: ALU clause starting at 12: 439; EG-NEXT: MOV * T1.X, KC0[2].Z, 440; EG-NEXT: ALU clause starting at 13: 441; EG-NEXT: LSHL * T0.W, T0.X, 1, 442; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 443; EG-NEXT: ALU clause starting at 15: 444; EG-NEXT: ADD_INT * T0.W, T0.X, literal.x, 445; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 446; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 447; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y, 448; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 449; EG-NEXT: LSHL * T0.W, T1.X, PV.W, 450; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 451; EG-NEXT: LSHL * T1.W, T1.W, literal.y, 452; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 453; EG-NEXT: LSHL T0.X, PV.W, PS, 454; EG-NEXT: LSHL * T0.W, literal.x, PS, 455; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 456; EG-NEXT: MOV T0.Y, 0.0, 457; EG-NEXT: MOV * T0.Z, 0.0, 458; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 459; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 460 %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 461 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i32 %tid 462 %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid 463 %b_ptr = getelementptr i16, i16 addrspace(1)* %gep, i16 1 464 %a = load volatile i16, i16 addrspace(1)* %in 465 %b = load volatile i16, i16 addrspace(1)* %b_ptr 466 %b.add = add i16 %b, 3 467 %result = shl i16 %a, %b.add 468 store i16 %result, i16 addrspace(1)* %out 469 ret void 470} 471 472define amdgpu_kernel void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) { 473; SI-LABEL: shl_i16_i_s: 474; SI: ; %bb.0: 475; SI-NEXT: s_load_dword s4, 
s[0:1], 0xb 476; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 477; SI-NEXT: s_mov_b32 s3, 0xf000 478; SI-NEXT: s_mov_b32 s2, -1 479; SI-NEXT: s_waitcnt lgkmcnt(0) 480; SI-NEXT: s_lshl_b32 s4, s4, 12 481; SI-NEXT: v_mov_b32_e32 v0, s4 482; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 483; SI-NEXT: s_endpgm 484; 485; VI-LABEL: shl_i16_i_s: 486; VI: ; %bb.0: 487; VI-NEXT: s_load_dword s4, s[0:1], 0x2c 488; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 489; VI-NEXT: s_mov_b32 s3, 0xf000 490; VI-NEXT: s_mov_b32 s2, -1 491; VI-NEXT: s_waitcnt lgkmcnt(0) 492; VI-NEXT: s_lshl_b32 s4, s4, 12 493; VI-NEXT: v_mov_b32_e32 v0, s4 494; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 495; VI-NEXT: s_endpgm 496; 497; EG-LABEL: shl_i16_i_s: 498; EG: ; %bb.0: 499; EG-NEXT: ALU 0, @8, KC0[], KC1[] 500; EG-NEXT: TEX 0 @6 501; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[] 502; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 503; EG-NEXT: CF_END 504; EG-NEXT: PAD 505; EG-NEXT: Fetch clause starting at 6: 506; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 507; EG-NEXT: ALU clause starting at 8: 508; EG-NEXT: MOV * T0.X, 0.0, 509; EG-NEXT: ALU clause starting at 9: 510; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, literal.x, 511; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y, 512; EG-NEXT: 16(2.242078e-44), 3(4.203895e-45) 513; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 514; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) 515; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 516; EG-NEXT: LSHL * T1.W, T1.W, literal.y, 517; EG-NEXT: 61440(8.609578e-41), 3(4.203895e-45) 518; EG-NEXT: LSHL T0.X, PV.W, PS, 519; EG-NEXT: LSHL * T0.W, literal.x, PS, 520; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 521; EG-NEXT: MOV T0.Y, 0.0, 522; EG-NEXT: MOV * T0.Z, 0.0, 523; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 524; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 525 %result = shl i16 %a, 12 526 store i16 %result, i16 addrspace(1)* %out 527 ret void 528} 529 530define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { 531; SI-LABEL: shl_v2i16: 532; SI: ; %bb.0: 533; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 534; SI-NEXT: s_mov_b32 s7, 0xf000 535; SI-NEXT: s_mov_b32 s6, -1 536; SI-NEXT: s_mov_b32 s10, s6 537; SI-NEXT: s_mov_b32 s11, s7 538; SI-NEXT: s_waitcnt lgkmcnt(0) 539; SI-NEXT: s_mov_b32 s8, s2 540; SI-NEXT: s_mov_b32 s9, s3 541; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 542; SI-NEXT: v_mov_b32_e32 v1, 0 543; SI-NEXT: s_mov_b32 s14, 0 544; SI-NEXT: s_mov_b32 s15, s7 545; SI-NEXT: s_mov_b64 s[12:13], s[2:3] 546; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 547; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 offset:4 548; SI-NEXT: s_mov_b32 s4, s0 549; SI-NEXT: s_mov_b32 s5, s1 550; SI-NEXT: s_waitcnt vmcnt(1) 551; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 552; SI-NEXT: s_waitcnt vmcnt(0) 553; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 554; SI-NEXT: v_lshlrev_b32_e32 v0, v0, v2 555; SI-NEXT: v_lshlrev_b32_e32 v1, v3, v1 556; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 557; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 558; SI-NEXT: v_or_b32_e32 v0, v0, v1 559; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 560; SI-NEXT: s_endpgm 561; 562; VI-LABEL: shl_v2i16: 563; VI: ; %bb.0: 564; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 565; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 566; VI-NEXT: s_waitcnt lgkmcnt(0) 567; VI-NEXT: v_mov_b32_e32 v1, s3 568; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 569; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 570; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0 571; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 572; VI-NEXT: flat_load_dword v0, 
v[0:1] 573; VI-NEXT: s_load_dword s4, s[2:3], 0x0 574; VI-NEXT: s_mov_b32 s3, 0xf000 575; VI-NEXT: s_mov_b32 s2, -1 576; VI-NEXT: s_waitcnt lgkmcnt(0) 577; VI-NEXT: s_lshr_b32 s5, s4, 16 578; VI-NEXT: v_mov_b32_e32 v1, s5 579; VI-NEXT: s_waitcnt vmcnt(0) 580; VI-NEXT: v_lshlrev_b16_e64 v2, v0, s4 581; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 582; VI-NEXT: v_or_b32_e32 v0, v2, v0 583; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 584; VI-NEXT: s_endpgm 585; 586; EG-LABEL: shl_v2i16: 587; EG: ; %bb.0: 588; EG-NEXT: ALU 2, @12, KC0[CB0:0-32], KC1[] 589; EG-NEXT: TEX 0 @8 590; EG-NEXT: ALU 0, @15, KC0[CB0:0-32], KC1[] 591; EG-NEXT: TEX 0 @10 592; EG-NEXT: ALU 11, @16, KC0[CB0:0-32], KC1[] 593; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1 594; EG-NEXT: CF_END 595; EG-NEXT: PAD 596; EG-NEXT: Fetch clause starting at 8: 597; EG-NEXT: VTX_READ_32 T0.X, T0.X, 4, #1 598; EG-NEXT: Fetch clause starting at 10: 599; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1 600; EG-NEXT: ALU clause starting at 12: 601; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 602; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 603; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 604; EG-NEXT: ALU clause starting at 15: 605; EG-NEXT: MOV * T7.X, KC0[2].Z, 606; EG-NEXT: ALU clause starting at 16: 607; EG-NEXT: AND_INT T0.Z, T0.X, literal.x, 608; EG-NEXT: LSHR T0.W, T0.X, literal.y, 609; EG-NEXT: LSHR * T1.W, T7.X, literal.y, 610; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 611; EG-NEXT: LSHL T0.W, PS, PV.W, 612; EG-NEXT: LSHL * T1.W, T7.X, PV.Z, 613; EG-NEXT: AND_INT T1.W, PS, literal.x, 614; EG-NEXT: LSHL * T0.W, PV.W, literal.y, 615; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 616; EG-NEXT: OR_INT T0.X, PV.W, PS, 617; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 618; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 619 %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 620 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid 621 %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid 622 %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %gep, i16 1 623 %a = load <2 x i16>, <2 x i16> addrspace(1)* %in 624 %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr 625 %result = shl <2 x i16> %a, %b 626 store <2 x i16> %result, <2 x i16> addrspace(1)* %out 627 ret void 628} 629 630define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { 631; SI-LABEL: shl_v4i16: 632; SI: ; %bb.0: 633; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 634; SI-NEXT: s_mov_b32 s7, 0xf000 635; SI-NEXT: s_mov_b32 s6, 0 636; SI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 637; SI-NEXT: v_mov_b32_e32 v5, 0 638; SI-NEXT: s_waitcnt lgkmcnt(0) 639; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 640; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 641; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 642; SI-NEXT: s_waitcnt vmcnt(0) 643; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 644; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 645; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 646; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 647; SI-NEXT: v_lshlrev_b32_e32 v1, v3, v1 648; SI-NEXT: v_lshlrev_b32_e32 v0, v2, v0 649; SI-NEXT: v_lshlrev_b32_e32 v2, v9, v7 650; SI-NEXT: v_lshlrev_b32_e32 v3, v8, v6 651; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 652; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 653; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 654; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 655; SI-NEXT: v_or_b32_e32 v1, v1, v2 656; SI-NEXT: v_or_b32_e32 v0, v0, v3 657; SI-NEXT: buffer_store_dwordx2 
v[0:1], v[4:5], s[0:3], 0 addr64 658; SI-NEXT: s_endpgm 659; 660; VI-LABEL: shl_v4i16: 661; VI: ; %bb.0: 662; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 663; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 664; VI-NEXT: s_waitcnt lgkmcnt(0) 665; VI-NEXT: v_mov_b32_e32 v1, s3 666; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 667; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 668; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 669; VI-NEXT: v_mov_b32_e32 v5, s1 670; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 671; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 672; VI-NEXT: s_waitcnt vmcnt(0) 673; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1 674; VI-NEXT: v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 675; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v0 676; VI-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 677; VI-NEXT: v_or_b32_e32 v1, v6, v1 678; VI-NEXT: v_or_b32_e32 v0, v3, v0 679; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] 680; VI-NEXT: s_endpgm 681; 682; EG-LABEL: shl_v4i16: 683; EG: ; %bb.0: 684; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 685; EG-NEXT: TEX 0 @6 686; EG-NEXT: ALU 51, @11, KC0[CB0:0-32], KC1[] 687; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XY, T0.X, 1 688; EG-NEXT: CF_END 689; EG-NEXT: PAD 690; EG-NEXT: Fetch clause starting at 6: 691; EG-NEXT: VTX_READ_128 T10.XYZW, T0.X, 0, #1 692; EG-NEXT: ALU clause starting at 8: 693; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 694; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 695; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 696; EG-NEXT: ALU clause starting at 11: 697; EG-NEXT: MOV T4.X, T10.X, 698; EG-NEXT: MOV * T5.X, T10.Y, 699; EG-NEXT: MOV T0.X, PV.X, 700; EG-NEXT: MOV T0.Y, PS, 701; EG-NEXT: MOV * T2.X, T10.Z, 702; EG-NEXT: MOV T3.X, T10.W, 703; EG-NEXT: MOV * T0.Z, T6.X, 704; EG-NEXT: MOV * T1.Y, T2.X, 705; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, 706; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 707; EG-NEXT: LSHL * T1.W, T0.X, PV.W, 708; EG-NEXT: AND_INT T1.W, PV.W, literal.x, 709; EG-NEXT: AND_INT * T2.W, T0.Z, literal.y, 710; EG-NEXT: 65535(9.183409e-41), -65536(nan) 711; EG-NEXT: OR_INT * T1.W, PS, PV.W, 712; EG-NEXT: MOV * T0.Z, T3.X, 713; EG-NEXT: MOV * T6.X, T1.W, 714; EG-NEXT: MOV T1.Z, PV.X, 715; EG-NEXT: LSHR T1.W, T1.Y, literal.x, 716; EG-NEXT: LSHR * T2.W, T0.X, literal.x, 717; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 718; EG-NEXT: LSHL T1.W, PS, PV.W, 719; EG-NEXT: AND_INT * T2.W, PV.Z, literal.x, 720; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 721; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 722; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 723; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, 724; EG-NEXT: MOV T6.X, PV.W, 725; EG-NEXT: MOV * T0.X, T7.X, 726; EG-NEXT: AND_INT * T1.W, T0.Z, literal.x, 727; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 728; EG-NEXT: LSHL T1.W, T0.Y, PV.W, 729; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, 730; EG-NEXT: -65536(nan), 0(0.000000e+00) 731; EG-NEXT: AND_INT * T1.W, PV.W, literal.x, 732; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 733; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, 734; EG-NEXT: MOV * T7.X, PV.W, 735; EG-NEXT: MOV T0.X, PV.X, 736; EG-NEXT: LSHR T1.W, T0.Z, literal.x, 737; EG-NEXT: LSHR * T2.W, T0.Y, literal.x, 738; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 739; EG-NEXT: LSHL * T1.W, PS, PV.W, 740; EG-NEXT: AND_INT T0.Z, T0.X, literal.x, 741; EG-NEXT: LSHL T1.W, PV.W, literal.y, 742; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, 743; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 744; EG-NEXT: LSHR T0.X, PS, literal.x, 745; 
EG-NEXT: OR_INT * T10.Y, PV.Z, PV.W, 746; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 747; EG-NEXT: MOV T7.X, PV.Y, 748; EG-NEXT: MOV * T10.X, T6.X, 749 %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 750 %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid 751 %gep.out = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i32 %tid 752 %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %gep, i16 1 753 %a = load <4 x i16>, <4 x i16> addrspace(1)* %gep 754 %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr 755 %result = shl <4 x i16> %a, %b 756 store <4 x i16> %result, <4 x i16> addrspace(1)* %gep.out 757 ret void 758} 759 760define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { 761; SI-LABEL: shl_i64: 762; SI: ; %bb.0: 763; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 764; SI-NEXT: s_mov_b32 s7, 0xf000 765; SI-NEXT: s_mov_b32 s6, -1 766; SI-NEXT: s_mov_b32 s10, s6 767; SI-NEXT: s_mov_b32 s11, s7 768; SI-NEXT: s_waitcnt lgkmcnt(0) 769; SI-NEXT: s_mov_b32 s8, s2 770; SI-NEXT: s_mov_b32 s9, s3 771; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 772; SI-NEXT: s_mov_b32 s4, s0 773; SI-NEXT: s_mov_b32 s5, s1 774; SI-NEXT: s_waitcnt vmcnt(0) 775; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v2 776; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 777; SI-NEXT: s_endpgm 778; 779; VI-LABEL: shl_i64: 780; VI: ; %bb.0: 781; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 782; VI-NEXT: s_waitcnt lgkmcnt(0) 783; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 784; VI-NEXT: s_mov_b32 s3, 0xf000 785; VI-NEXT: s_mov_b32 s2, -1 786; VI-NEXT: s_waitcnt lgkmcnt(0) 787; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 788; VI-NEXT: v_mov_b32_e32 v0, s4 789; VI-NEXT: v_mov_b32_e32 v1, s5 790; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 791; VI-NEXT: s_endpgm 792; 793; EG-LABEL: shl_i64: 794; EG: ; %bb.0: 795; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 796; EG-NEXT: TEX 0 @6 797; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] 798; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 799; EG-NEXT: CF_END 800; EG-NEXT: PAD 801; EG-NEXT: Fetch clause starting at 6: 802; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 803; EG-NEXT: ALU clause starting at 8: 804; EG-NEXT: MOV * T0.X, KC0[2].Z, 805; EG-NEXT: ALU clause starting at 9: 806; EG-NEXT: AND_INT T1.Y, T0.Z, literal.x, 807; EG-NEXT: LSHR T1.Z, T0.Y, 1, 808; EG-NEXT: BIT_ALIGN_INT T0.W, T0.Y, T0.X, 1, 809; EG-NEXT: NOT_INT * T1.W, T0.Z, 810; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 811; EG-NEXT: BIT_ALIGN_INT T1.Z, PV.Z, PV.W, PS, 812; EG-NEXT: LSHL T0.W, T0.X, PV.Y, 813; EG-NEXT: AND_INT * T1.W, T0.Z, literal.x, 814; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 815; EG-NEXT: CNDE_INT * T0.Y, PS, PV.Z, PV.W, 816; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0, 817; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 818; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 819 %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 820 %a = load i64, i64 addrspace(1)* %in 821 %b = load i64, i64 addrspace(1)* %b_ptr 822 %result = shl i64 %a, %b 823 store i64 %result, i64 addrspace(1)* %out 824 ret void 825} 826 827define amdgpu_kernel void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { 828; SI-LABEL: shl_v2i64: 829; SI: ; %bb.0: 830; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 831; SI-NEXT: s_mov_b32 s7, 0xf000 832; SI-NEXT: s_mov_b32 s6, -1 833; SI-NEXT: s_mov_b32 s10, s6 834; SI-NEXT: s_mov_b32 s11, s7 835; SI-NEXT: s_waitcnt lgkmcnt(0) 836; SI-NEXT: s_mov_b32 s8, s2 837; SI-NEXT: s_mov_b32 s9, s3 838; 
SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 839; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 840; SI-NEXT: s_mov_b32 s4, s0 841; SI-NEXT: s_mov_b32 s5, s1 842; SI-NEXT: s_waitcnt vmcnt(0) 843; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v6 844; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 845; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 846; SI-NEXT: s_endpgm 847; 848; VI-LABEL: shl_v2i64: 849; VI: ; %bb.0: 850; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 851; VI-NEXT: s_waitcnt lgkmcnt(0) 852; VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 853; VI-NEXT: s_mov_b32 s11, 0xf000 854; VI-NEXT: s_mov_b32 s10, -1 855; VI-NEXT: s_waitcnt lgkmcnt(0) 856; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 857; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 858; VI-NEXT: v_mov_b32_e32 v0, s0 859; VI-NEXT: v_mov_b32_e32 v1, s1 860; VI-NEXT: v_mov_b32_e32 v2, s2 861; VI-NEXT: v_mov_b32_e32 v3, s3 862; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 863; VI-NEXT: s_endpgm 864; 865; EG-LABEL: shl_v2i64: 866; EG: ; %bb.0: 867; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 868; EG-NEXT: TEX 1 @6 869; EG-NEXT: ALU 22, @11, KC0[CB0:0-32], KC1[] 870; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1 871; EG-NEXT: CF_END 872; EG-NEXT: PAD 873; EG-NEXT: Fetch clause starting at 6: 874; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 875; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 876; EG-NEXT: ALU clause starting at 10: 877; EG-NEXT: MOV * T0.X, KC0[2].Z, 878; EG-NEXT: ALU clause starting at 11: 879; EG-NEXT: AND_INT T1.Y, T1.Z, literal.x, 880; EG-NEXT: LSHR T2.Z, T0.W, 1, 881; EG-NEXT: BIT_ALIGN_INT T0.W, T0.W, T0.Z, 1, 882; EG-NEXT: NOT_INT * T1.W, T1.Z, 883; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 884; EG-NEXT: BIT_ALIGN_INT T0.W, PV.Z, PV.W, PS, 885; EG-NEXT: LSHL * T1.W, T0.Z, PV.Y, 886; EG-NEXT: AND_INT T2.X, T1.Z, literal.x, 887; EG-NEXT: AND_INT T1.Y, T1.X, literal.y, 888; EG-NEXT: LSHR T0.Z, T0.Y, 1, 889; EG-NEXT: BIT_ALIGN_INT T2.W, T0.Y, T0.X, 1, 890; EG-NEXT: NOT_INT * T3.W, T1.X, 891; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44) 892; EG-NEXT: BIT_ALIGN_INT T0.Y, PV.Z, PV.W, PS, 893; EG-NEXT: LSHL T0.Z, T0.X, PV.Y, 894; EG-NEXT: AND_INT T2.W, T1.X, literal.x, BS:VEC_120/SCL_212 895; EG-NEXT: CNDE_INT * T3.W, PV.X, T0.W, T1.W, 896; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 897; EG-NEXT: CNDE_INT T3.Y, PV.W, PV.Y, PV.Z, 898; EG-NEXT: CNDE_INT * T3.Z, T2.X, T1.W, 0.0, 899; EG-NEXT: CNDE_INT T3.X, T2.W, T0.Z, 0.0, 900; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 901; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 902 %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 903 %a = load <2 x i64>, <2 x i64> addrspace(1)* %in 904 %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr 905 %result = shl <2 x i64> %a, %b 906 store <2 x i64> %result, <2 x i64> addrspace(1)* %out 907 ret void 908} 909 910define amdgpu_kernel void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { 911; SI-LABEL: shl_v4i64: 912; SI: ; %bb.0: 913; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 914; SI-NEXT: s_mov_b32 s3, 0xf000 915; SI-NEXT: s_mov_b32 s2, -1 916; SI-NEXT: s_mov_b32 s10, s2 917; SI-NEXT: s_mov_b32 s11, s3 918; SI-NEXT: s_waitcnt lgkmcnt(0) 919; SI-NEXT: s_mov_b32 s8, s6 920; SI-NEXT: s_mov_b32 s9, s7 921; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 922; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 923; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 924; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 925; SI-NEXT: 
s_mov_b32 s0, s4 926; SI-NEXT: s_mov_b32 s1, s5 927; SI-NEXT: s_waitcnt vmcnt(2) 928; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v6 929; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 930; SI-NEXT: s_waitcnt vmcnt(0) 931; SI-NEXT: v_lshl_b64 v[9:10], v[9:10], v13 932; SI-NEXT: v_lshl_b64 v[7:8], v[7:8], v11 933; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 934; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 935; SI-NEXT: s_endpgm 936; 937; VI-LABEL: shl_v4i64: 938; VI: ; %bb.0: 939; VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 940; VI-NEXT: s_waitcnt lgkmcnt(0) 941; VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 942; VI-NEXT: s_mov_b32 s19, 0xf000 943; VI-NEXT: s_mov_b32 s18, -1 944; VI-NEXT: s_waitcnt lgkmcnt(0) 945; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], s14 946; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s12 947; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], s10 948; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 949; VI-NEXT: v_mov_b32_e32 v0, s4 950; VI-NEXT: v_mov_b32_e32 v1, s5 951; VI-NEXT: v_mov_b32_e32 v2, s6 952; VI-NEXT: v_mov_b32_e32 v3, s7 953; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 954; VI-NEXT: s_nop 0 955; VI-NEXT: v_mov_b32_e32 v0, s0 956; VI-NEXT: v_mov_b32_e32 v1, s1 957; VI-NEXT: v_mov_b32_e32 v2, s2 958; VI-NEXT: v_mov_b32_e32 v3, s3 959; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 960; VI-NEXT: s_endpgm 961; 962; EG-LABEL: shl_v4i64: 963; EG: ; %bb.0: 964; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] 965; EG-NEXT: TEX 3 @6 966; EG-NEXT: ALU 47, @15, KC0[CB0:0-32], KC1[] 967; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0 968; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 1 969; EG-NEXT: CF_END 970; EG-NEXT: Fetch clause starting at 6: 971; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1 972; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 0, #1 973; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 32, #1 974; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 16, #1 975; EG-NEXT: ALU clause starting at 14: 976; EG-NEXT: MOV * T0.X, KC0[2].Z, 977; EG-NEXT: ALU clause starting at 15: 978; EG-NEXT: AND_INT T4.Z, T1.Z, literal.x, 979; EG-NEXT: LSHR T1.W, T0.W, 1, 980; EG-NEXT: NOT_INT * T3.W, T1.Z, 981; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 982; EG-NEXT: BIT_ALIGN_INT T4.X, T0.W, T0.Z, 1, 983; EG-NEXT: AND_INT T1.Y, T3.Z, literal.x, BS:VEC_201 984; EG-NEXT: LSHR T5.Z, T2.W, 1, BS:VEC_120/SCL_212 985; EG-NEXT: BIT_ALIGN_INT T0.W, T2.W, T2.Z, 1, BS:VEC_102/SCL_221 986; EG-NEXT: NOT_INT * T2.W, T3.Z, 987; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 988; EG-NEXT: BIT_ALIGN_INT T3.Y, PV.Z, PV.W, PS, 989; EG-NEXT: LSHL T2.Z, T2.Z, PV.Y, 990; EG-NEXT: BIT_ALIGN_INT T0.W, T1.W, PV.X, T3.W, 991; EG-NEXT: LSHL * T1.W, T0.Z, T4.Z, 992; EG-NEXT: AND_INT T4.X, T1.Z, literal.x, 993; EG-NEXT: AND_INT T1.Y, T1.X, literal.y, 994; EG-NEXT: LSHR T0.Z, T0.Y, 1, 995; EG-NEXT: BIT_ALIGN_INT T2.W, T0.Y, T0.X, 1, 996; EG-NEXT: NOT_INT * T3.W, T1.X, 997; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44) 998; EG-NEXT: AND_INT T5.X, T3.Z, literal.x, 999; EG-NEXT: BIT_ALIGN_INT T0.Y, PV.Z, PV.W, PS, 1000; EG-NEXT: LSHL T0.Z, T0.X, PV.Y, 1001; EG-NEXT: AND_INT T2.W, T1.X, literal.x, BS:VEC_120/SCL_212 1002; EG-NEXT: CNDE_INT * T4.W, PV.X, T0.W, T1.W, 1003; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1004; EG-NEXT: AND_INT T0.X, T3.X, literal.x, 1005; EG-NEXT: CNDE_INT T4.Y, PV.W, PV.Y, PV.Z, 1006; EG-NEXT: LSHR T1.Z, T2.Y, 1, 1007; EG-NEXT: BIT_ALIGN_INT T0.W, T2.Y, T2.X, 1, 1008; EG-NEXT: NOT_INT * T3.W, T3.X, 1009; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 1010; EG-NEXT: BIT_ALIGN_INT T1.X, PV.Z, PV.W, PS, 
1011; EG-NEXT: LSHL T0.Y, T2.X, PV.X, 1012; EG-NEXT: CNDE_INT T4.Z, T4.X, T1.W, 0.0, BS:VEC_120/SCL_212 1013; EG-NEXT: AND_INT * T0.W, T3.X, literal.x, BS:VEC_201 1014; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1015; EG-NEXT: CNDE_INT * T1.W, T5.X, T3.Y, T2.Z, 1016; EG-NEXT: CNDE_INT T4.X, T2.W, T0.Z, 0.0, 1017; EG-NEXT: CNDE_INT T1.Y, T0.W, T1.X, T0.Y, BS:VEC_120/SCL_212 1018; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 1019; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 1020; EG-NEXT: LSHR T0.X, PV.W, literal.x, 1021; EG-NEXT: CNDE_INT T1.Z, T5.X, T2.Z, 0.0, 1022; EG-NEXT: CNDE_INT * T1.X, T0.W, T0.Y, 0.0, 1023; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1024; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x, 1025; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1026 %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 1027 %a = load <4 x i64>, <4 x i64> addrspace(1)* %in 1028 %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr 1029 %result = shl <4 x i64> %a, %b 1030 store <4 x i64> %result, <4 x i64> addrspace(1)* %out 1031 ret void 1032} 1033 1034; Make sure load width gets reduced to i32 load. 1035define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) { 1036; SI-LABEL: s_shl_32_i64: 1037; SI: ; %bb.0: 1038; SI-NEXT: s_load_dword s4, s[0:1], 0x13 1039; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1040; SI-NEXT: s_mov_b32 s3, 0xf000 1041; SI-NEXT: s_mov_b32 s2, -1 1042; SI-NEXT: v_mov_b32_e32 v0, 0 1043; SI-NEXT: s_waitcnt lgkmcnt(0) 1044; SI-NEXT: v_mov_b32_e32 v1, s4 1045; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1046; SI-NEXT: s_endpgm 1047; 1048; VI-LABEL: s_shl_32_i64: 1049; VI: ; %bb.0: 1050; VI-NEXT: s_load_dword s4, s[0:1], 0x4c 1051; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1052; VI-NEXT: s_mov_b32 s3, 0xf000 1053; VI-NEXT: s_mov_b32 s2, -1 1054; VI-NEXT: v_mov_b32_e32 v0, 0 1055; VI-NEXT: s_waitcnt lgkmcnt(0) 1056; VI-NEXT: v_mov_b32_e32 v1, s4 1057; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1058; VI-NEXT: s_endpgm 1059; 1060; EG-LABEL: s_shl_32_i64: 1061; EG: ; %bb.0: 1062; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 1063; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1064; EG-NEXT: CF_END 1065; EG-NEXT: PAD 1066; EG-NEXT: ALU clause starting at 4: 1067; EG-NEXT: MOV * T0.Y, KC0[4].W, 1068; EG-NEXT: MOV T0.X, 0.0, 1069; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1070; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1071 %result = shl i64 %a, 32 1072 store i64 %result, i64 addrspace(1)* %out 1073 ret void 1074} 1075 1076define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { 1077; SI-LABEL: v_shl_32_i64: 1078; SI: ; %bb.0: 1079; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1080; SI-NEXT: s_ashr_i32 s3, s2, 31 1081; SI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 1082; SI-NEXT: v_mov_b32_e32 v0, s0 1083; SI-NEXT: s_mov_b32 s11, 0xf000 1084; SI-NEXT: s_mov_b32 s10, 0 1085; SI-NEXT: s_waitcnt lgkmcnt(0) 1086; SI-NEXT: s_mov_b64 s[8:9], s[6:7] 1087; SI-NEXT: v_mov_b32_e32 v1, s1 1088; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 1089; SI-NEXT: s_mov_b64 s[6:7], s[10:11] 1090; SI-NEXT: v_mov_b32_e32 v2, 0 1091; SI-NEXT: s_waitcnt vmcnt(0) 1092; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 1093; SI-NEXT: s_endpgm 1094; 1095; VI-LABEL: v_shl_32_i64: 1096; VI: ; %bb.0: 1097; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1098; VI-NEXT: s_ashr_i32 s3, s2, 31 1099; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 1100; VI-NEXT: v_mov_b32_e32 v0, 0 1101; VI-NEXT: s_waitcnt lgkmcnt(0) 1102; 
VI-NEXT: s_add_u32 s2, s6, s0 1103; VI-NEXT: s_addc_u32 s3, s7, s1 1104; VI-NEXT: s_load_dword s2, s[2:3], 0x0 1105; VI-NEXT: s_add_u32 s0, s4, s0 1106; VI-NEXT: s_addc_u32 s1, s5, s1 1107; VI-NEXT: v_mov_b32_e32 v3, s1 1108; VI-NEXT: v_mov_b32_e32 v2, s0 1109; VI-NEXT: s_waitcnt lgkmcnt(0) 1110; VI-NEXT: v_mov_b32_e32 v1, s2 1111; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1112; VI-NEXT: s_endpgm 1113; 1114; EG-LABEL: v_shl_32_i64: 1115; EG: ; %bb.0: 1116; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 1117; EG-NEXT: TEX 0 @6 1118; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[] 1119; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T2.X, 1 1120; EG-NEXT: CF_END 1121; EG-NEXT: PAD 1122; EG-NEXT: Fetch clause starting at 6: 1123; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1124; EG-NEXT: ALU clause starting at 8: 1125; EG-NEXT: LSHL * T0.W, T1.X, literal.x, 1126; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1127; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1128; EG-NEXT: ALU clause starting at 11: 1129; EG-NEXT: MOV T1.X, 0.0, 1130; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, 1131; EG-NEXT: LSHR T2.X, PV.W, literal.x, 1132; EG-NEXT: MOV * T1.Y, T0.X, 1133; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1134 %tid = call i32 @llvm.amdgcn.workgroup.id.x() #0 1135 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid 1136 %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid 1137 %a = load i64, i64 addrspace(1)* %gep.in 1138 %result = shl i64 %a, 32 1139 store i64 %result, i64 addrspace(1)* %gep.out 1140 ret void 1141} 1142 1143define amdgpu_kernel void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) { 1144; SI-LABEL: s_shl_constant_i64: 1145; SI: ; %bb.0: 1146; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1147; SI-NEXT: s_mov_b32 s6, -1 1148; SI-NEXT: s_mov_b32 s9, 0xffff 1149; SI-NEXT: s_mov_b32 s8, s6 1150; SI-NEXT: s_mov_b32 s7, 0xf000 1151; SI-NEXT: s_waitcnt lgkmcnt(0) 1152; SI-NEXT: s_mov_b32 s4, s0 1153; SI-NEXT: s_mov_b32 s5, s1 1154; SI-NEXT: s_lshl_b64 s[0:1], s[8:9], s2 1155; SI-NEXT: v_mov_b32_e32 v0, s0 1156; SI-NEXT: v_mov_b32_e32 v1, s1 1157; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1158; SI-NEXT: s_endpgm 1159; 1160; VI-LABEL: s_shl_constant_i64: 1161; VI: ; %bb.0: 1162; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1163; VI-NEXT: s_mov_b32 s6, -1 1164; VI-NEXT: s_mov_b32 s9, 0xffff 1165; VI-NEXT: s_mov_b32 s8, s6 1166; VI-NEXT: s_mov_b32 s7, 0xf000 1167; VI-NEXT: s_waitcnt lgkmcnt(0) 1168; VI-NEXT: s_mov_b32 s4, s0 1169; VI-NEXT: s_mov_b32 s5, s1 1170; VI-NEXT: s_lshl_b64 s[0:1], s[8:9], s2 1171; VI-NEXT: v_mov_b32_e32 v0, s0 1172; VI-NEXT: v_mov_b32_e32 v1, s1 1173; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1174; VI-NEXT: s_endpgm 1175; 1176; EG-LABEL: s_shl_constant_i64: 1177; EG: ; %bb.0: 1178; EG-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[] 1179; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1180; EG-NEXT: CF_END 1181; EG-NEXT: PAD 1182; EG-NEXT: ALU clause starting at 4: 1183; EG-NEXT: AND_INT T0.Z, KC0[2].W, literal.x, 1184; EG-NEXT: MOV T0.W, literal.y, 1185; EG-NEXT: NOT_INT * T1.W, KC0[2].W, 1186; EG-NEXT: 31(4.344025e-44), -1(nan) 1187; EG-NEXT: BIT_ALIGN_INT T1.Z, literal.x, PV.W, PS, 1188; EG-NEXT: LSHL T0.W, literal.y, PV.Z, 1189; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.z, 1190; EG-NEXT: 32767(4.591635e-41), -1(nan) 1191; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1192; EG-NEXT: CNDE_INT * T0.Y, PS, PV.Z, PV.W, 1193; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0, 1194; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1195; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 
1196 %shl = shl i64 281474976710655, %a 1197 store i64 %shl, i64 addrspace(1)* %out, align 8 1198 ret void 1199} 1200 1201define amdgpu_kernel void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { 1202; SI-LABEL: v_shl_constant_i64: 1203; SI: ; %bb.0: 1204; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1205; SI-NEXT: s_mov_b32 s7, 0xf000 1206; SI-NEXT: s_mov_b32 s6, -1 1207; SI-NEXT: s_mov_b32 s10, s6 1208; SI-NEXT: s_mov_b32 s11, s7 1209; SI-NEXT: s_waitcnt lgkmcnt(0) 1210; SI-NEXT: s_mov_b32 s8, s2 1211; SI-NEXT: s_mov_b32 s9, s3 1212; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 1213; SI-NEXT: s_mov_b32 s2, 0xab19b207 1214; SI-NEXT: s_movk_i32 s3, 0x11e 1215; SI-NEXT: s_mov_b32 s4, s0 1216; SI-NEXT: s_mov_b32 s5, s1 1217; SI-NEXT: s_waitcnt vmcnt(0) 1218; SI-NEXT: v_lshl_b64 v[0:1], s[2:3], v0 1219; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1220; SI-NEXT: s_endpgm 1221; 1222; VI-LABEL: v_shl_constant_i64: 1223; VI: ; %bb.0: 1224; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1225; VI-NEXT: s_mov_b32 s7, 0xf000 1226; VI-NEXT: s_mov_b32 s6, -1 1227; VI-NEXT: s_waitcnt lgkmcnt(0) 1228; VI-NEXT: s_load_dword s2, s[2:3], 0x0 1229; VI-NEXT: s_mov_b32 s4, s0 1230; VI-NEXT: s_mov_b32 s5, s1 1231; VI-NEXT: s_mov_b32 s0, 0xab19b207 1232; VI-NEXT: s_movk_i32 s1, 0x11e 1233; VI-NEXT: s_waitcnt lgkmcnt(0) 1234; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 1235; VI-NEXT: v_mov_b32_e32 v0, s0 1236; VI-NEXT: v_mov_b32_e32 v1, s1 1237; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1238; VI-NEXT: s_endpgm 1239; 1240; EG-LABEL: v_shl_constant_i64: 1241; EG: ; %bb.0: 1242; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1243; EG-NEXT: TEX 0 @6 1244; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] 1245; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1246; EG-NEXT: CF_END 1247; EG-NEXT: PAD 1248; EG-NEXT: Fetch clause starting at 6: 1249; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1250; EG-NEXT: ALU clause starting at 8: 1251; EG-NEXT: MOV * T0.X, KC0[2].Z, 1252; EG-NEXT: ALU clause starting at 9: 1253; EG-NEXT: NOT_INT T0.Z, T0.X, 1254; EG-NEXT: MOV T0.W, literal.x, 1255; EG-NEXT: AND_INT * T1.W, T0.X, literal.y, 1256; EG-NEXT: 1435293955(1.935796e+13), 31(4.344025e-44) 1257; EG-NEXT: LSHL T1.Z, literal.x, PS, 1258; EG-NEXT: BIT_ALIGN_INT T0.W, literal.y, PV.W, PV.Z, 1259; EG-NEXT: AND_INT * T1.W, T0.X, literal.z, 1260; EG-NEXT: -1424379385(-5.460358e-13), 143(2.003857e-43) 1261; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1262; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z, 1263; EG-NEXT: CNDE_INT T0.X, T1.W, T1.Z, 0.0, 1264; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1265; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1266 %a = load i64, i64 addrspace(1)* %aptr, align 8 1267 %shl = shl i64 1231231234567, %a 1268 store i64 %shl, i64 addrspace(1)* %out, align 8 1269 ret void 1270} 1271 1272define amdgpu_kernel void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { 1273; SI-LABEL: v_shl_i64_32_bit_constant: 1274; SI: ; %bb.0: 1275; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1276; SI-NEXT: s_mov_b32 s7, 0xf000 1277; SI-NEXT: s_mov_b32 s6, -1 1278; SI-NEXT: s_mov_b32 s10, s6 1279; SI-NEXT: s_mov_b32 s11, s7 1280; SI-NEXT: s_waitcnt lgkmcnt(0) 1281; SI-NEXT: s_mov_b32 s8, s2 1282; SI-NEXT: s_mov_b32 s9, s3 1283; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 1284; SI-NEXT: s_mov_b64 s[2:3], 0x12d687 1285; SI-NEXT: s_mov_b32 s4, s0 1286; SI-NEXT: s_mov_b32 s5, s1 1287; SI-NEXT: s_waitcnt vmcnt(0) 1288; SI-NEXT: v_lshl_b64 v[0:1], s[2:3], v0 1289; SI-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1290; SI-NEXT: s_endpgm 1291; 1292; VI-LABEL: v_shl_i64_32_bit_constant: 1293; VI: ; %bb.0: 1294; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1295; VI-NEXT: s_mov_b32 s7, 0xf000 1296; VI-NEXT: s_mov_b32 s6, -1 1297; VI-NEXT: s_waitcnt lgkmcnt(0) 1298; VI-NEXT: s_load_dword s2, s[2:3], 0x0 1299; VI-NEXT: s_mov_b32 s4, s0 1300; VI-NEXT: s_mov_b32 s5, s1 1301; VI-NEXT: s_mov_b64 s[0:1], 0x12d687 1302; VI-NEXT: s_waitcnt lgkmcnt(0) 1303; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 1304; VI-NEXT: v_mov_b32_e32 v0, s0 1305; VI-NEXT: v_mov_b32_e32 v1, s1 1306; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1307; VI-NEXT: s_endpgm 1308; 1309; EG-LABEL: v_shl_i64_32_bit_constant: 1310; EG: ; %bb.0: 1311; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1312; EG-NEXT: TEX 0 @6 1313; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[] 1314; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1315; EG-NEXT: CF_END 1316; EG-NEXT: PAD 1317; EG-NEXT: Fetch clause starting at 6: 1318; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1319; EG-NEXT: ALU clause starting at 8: 1320; EG-NEXT: MOV * T0.X, KC0[2].Z, 1321; EG-NEXT: ALU clause starting at 9: 1322; EG-NEXT: AND_INT T0.W, T0.X, literal.x, 1323; EG-NEXT: NOT_INT * T1.W, T0.X, 1324; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 1325; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS, 1326; EG-NEXT: LSHL T0.W, literal.y, PV.W, 1327; EG-NEXT: AND_INT * T1.W, T0.X, literal.z, 1328; EG-NEXT: 617283(8.649977e-40), 1234567(1.729997e-39) 1329; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1330; EG-NEXT: CNDE_INT * T0.Y, PS, PV.Z, PV.W, 1331; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0, 1332; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1333; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1334 %a = load i64, i64 addrspace(1)* %aptr, align 8 1335 %shl = shl i64 1234567, %a 1336 store i64 %shl, i64 addrspace(1)* %out, align 8 1337 ret void 1338} 1339 1340define amdgpu_kernel void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { 1341; SI-LABEL: v_shl_inline_imm_64_i64: 1342; SI: ; %bb.0: 1343; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1344; SI-NEXT: s_mov_b32 s7, 0xf000 1345; SI-NEXT: s_mov_b32 s6, -1 1346; SI-NEXT: s_mov_b32 s10, s6 1347; SI-NEXT: s_mov_b32 s11, s7 1348; SI-NEXT: s_waitcnt lgkmcnt(0) 1349; SI-NEXT: s_mov_b32 s8, s2 1350; SI-NEXT: s_mov_b32 s9, s3 1351; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 1352; SI-NEXT: s_mov_b32 s4, s0 1353; SI-NEXT: s_mov_b32 s5, s1 1354; SI-NEXT: s_waitcnt vmcnt(0) 1355; SI-NEXT: v_lshl_b64 v[0:1], 64, v0 1356; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1357; SI-NEXT: s_endpgm 1358; 1359; VI-LABEL: v_shl_inline_imm_64_i64: 1360; VI: ; %bb.0: 1361; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1362; VI-NEXT: s_waitcnt lgkmcnt(0) 1363; VI-NEXT: s_load_dword s4, s[2:3], 0x0 1364; VI-NEXT: s_mov_b32 s3, 0xf000 1365; VI-NEXT: s_mov_b32 s2, -1 1366; VI-NEXT: s_waitcnt lgkmcnt(0) 1367; VI-NEXT: s_lshl_b64 s[4:5], 64, s4 1368; VI-NEXT: v_mov_b32_e32 v0, s4 1369; VI-NEXT: v_mov_b32_e32 v1, s5 1370; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1371; VI-NEXT: s_endpgm 1372; 1373; EG-LABEL: v_shl_inline_imm_64_i64: 1374; EG: ; %bb.0: 1375; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1376; EG-NEXT: TEX 0 @6 1377; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[] 1378; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1379; EG-NEXT: CF_END 1380; EG-NEXT: PAD 1381; EG-NEXT: Fetch clause starting at 6: 1382; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1383; EG-NEXT: ALU clause starting at 8: 1384; 
EG-NEXT: MOV * T0.X, KC0[2].Z, 1385; EG-NEXT: ALU clause starting at 9: 1386; EG-NEXT: AND_INT T0.W, T0.X, literal.x, 1387; EG-NEXT: NOT_INT * T1.W, T0.X, 1388; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 1389; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS, 1390; EG-NEXT: LSHL T0.W, literal.y, PV.W, 1391; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, 1392; EG-NEXT: 32(4.484155e-44), 64(8.968310e-44) 1393; EG-NEXT: CNDE_INT * T0.Y, PS, PV.Z, PV.W, 1394; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0, 1395; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1396; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1397 %a = load i64, i64 addrspace(1)* %aptr, align 8 1398 %shl = shl i64 64, %a 1399 store i64 %shl, i64 addrspace(1)* %out, align 8 1400 ret void 1401} 1402 1403define amdgpu_kernel void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { 1404; SI-LABEL: s_shl_inline_imm_64_i64: 1405; SI: ; %bb.0: 1406; SI-NEXT: s_load_dword s4, s[0:1], 0xd 1407; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1408; SI-NEXT: s_mov_b32 s3, 0xf000 1409; SI-NEXT: s_mov_b32 s2, -1 1410; SI-NEXT: s_waitcnt lgkmcnt(0) 1411; SI-NEXT: s_lshl_b64 s[4:5], 64, s4 1412; SI-NEXT: v_mov_b32_e32 v0, s4 1413; SI-NEXT: v_mov_b32_e32 v1, s5 1414; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1415; SI-NEXT: s_endpgm 1416; 1417; VI-LABEL: s_shl_inline_imm_64_i64: 1418; VI: ; %bb.0: 1419; VI-NEXT: s_load_dword s4, s[0:1], 0x34 1420; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1421; VI-NEXT: s_mov_b32 s3, 0xf000 1422; VI-NEXT: s_mov_b32 s2, -1 1423; VI-NEXT: s_waitcnt lgkmcnt(0) 1424; VI-NEXT: s_lshl_b64 s[4:5], 64, s4 1425; VI-NEXT: v_mov_b32_e32 v0, s4 1426; VI-NEXT: v_mov_b32_e32 v1, s5 1427; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1428; VI-NEXT: s_endpgm 1429; 1430; EG-LABEL: s_shl_inline_imm_64_i64: 1431; EG: ; %bb.0: 1432; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[] 1433; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1434; EG-NEXT: CF_END 1435; EG-NEXT: PAD 1436; EG-NEXT: ALU clause starting at 4: 1437; EG-NEXT: NOT_INT T0.W, KC0[2].W, 1438; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x, 1439; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 1440; EG-NEXT: LSHL T0.Z, literal.x, PS, 1441; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, literal.y, PV.W, 1442; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, 1443; EG-NEXT: 64(8.968310e-44), 32(4.484155e-44) 1444; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z, 1445; EG-NEXT: CNDE_INT T0.X, T1.W, T0.Z, 0.0, 1446; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1447; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1448 %shl = shl i64 64, %a 1449 store i64 %shl, i64 addrspace(1)* %out, align 8 1450 ret void 1451} 1452 1453define amdgpu_kernel void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { 1454; SI-LABEL: s_shl_inline_imm_1_i64: 1455; SI: ; %bb.0: 1456; SI-NEXT: s_load_dword s4, s[0:1], 0xd 1457; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1458; SI-NEXT: s_mov_b32 s3, 0xf000 1459; SI-NEXT: s_mov_b32 s2, -1 1460; SI-NEXT: s_waitcnt lgkmcnt(0) 1461; SI-NEXT: s_lshl_b64 s[4:5], 1, s4 1462; SI-NEXT: v_mov_b32_e32 v0, s4 1463; SI-NEXT: v_mov_b32_e32 v1, s5 1464; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1465; SI-NEXT: s_endpgm 1466; 1467; VI-LABEL: s_shl_inline_imm_1_i64: 1468; VI: ; %bb.0: 1469; VI-NEXT: s_load_dword s4, s[0:1], 0x34 1470; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1471; VI-NEXT: s_mov_b32 s3, 0xf000 1472; VI-NEXT: s_mov_b32 s2, -1 1473; VI-NEXT: s_waitcnt lgkmcnt(0) 1474; VI-NEXT: s_lshl_b64 s[4:5], 1, s4 1475; 
VI-NEXT: v_mov_b32_e32 v0, s4 1476; VI-NEXT: v_mov_b32_e32 v1, s5 1477; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1478; VI-NEXT: s_endpgm 1479; 1480; EG-LABEL: s_shl_inline_imm_1_i64: 1481; EG: ; %bb.0: 1482; EG-NEXT: ALU 11, @4, KC0[CB0:0-32], KC1[] 1483; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1484; EG-NEXT: CF_END 1485; EG-NEXT: PAD 1486; EG-NEXT: ALU clause starting at 4: 1487; EG-NEXT: AND_INT T0.W, KC0[2].W, literal.x, 1488; EG-NEXT: LSHL * T1.W, KC0[2].W, literal.y, 1489; EG-NEXT: 31(4.344025e-44), 26(3.643376e-44) 1490; EG-NEXT: ASHR T1.W, PS, literal.x, 1491; EG-NEXT: LSHL * T0.W, 1, PV.W, 1492; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 1493; EG-NEXT: AND_INT T0.Y, PV.W, PS, 1494; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x, 1495; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1496; EG-NEXT: CNDE_INT T0.X, PV.W, T0.W, 0.0, 1497; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1498; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1499 %shl = shl i64 1, %a 1500 store i64 %shl, i64 addrspace(1)* %out, align 8 1501 ret void 1502} 1503 1504define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { 1505; SI-LABEL: s_shl_inline_imm_1_0_i64: 1506; SI: ; %bb.0: 1507; SI-NEXT: s_load_dword s4, s[0:1], 0xd 1508; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1509; SI-NEXT: s_mov_b32 s3, 0xf000 1510; SI-NEXT: s_mov_b32 s2, -1 1511; SI-NEXT: s_waitcnt lgkmcnt(0) 1512; SI-NEXT: s_lshl_b64 s[4:5], 1.0, s4 1513; SI-NEXT: v_mov_b32_e32 v0, s4 1514; SI-NEXT: v_mov_b32_e32 v1, s5 1515; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1516; SI-NEXT: s_endpgm 1517; 1518; VI-LABEL: s_shl_inline_imm_1_0_i64: 1519; VI: ; %bb.0: 1520; VI-NEXT: s_load_dword s4, s[0:1], 0x34 1521; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1522; VI-NEXT: s_mov_b32 s3, 0xf000 1523; VI-NEXT: s_mov_b32 s2, -1 1524; VI-NEXT: s_waitcnt lgkmcnt(0) 1525; VI-NEXT: s_lshl_b64 s[4:5], 1.0, s4 1526; VI-NEXT: v_mov_b32_e32 v0, s4 1527; VI-NEXT: v_mov_b32_e32 v1, s5 1528; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1529; VI-NEXT: s_endpgm 1530; 1531; EG-LABEL: s_shl_inline_imm_1_0_i64: 1532; EG: ; %bb.0: 1533; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] 1534; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1535; EG-NEXT: CF_END 1536; EG-NEXT: PAD 1537; EG-NEXT: ALU clause starting at 4: 1538; EG-NEXT: NOT_INT * T0.W, KC0[2].W, 1539; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W, 1540; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, 1541; EG-NEXT: 536346624(1.050321e-19), 32(4.484155e-44) 1542; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0, 1543; EG-NEXT: MOV T0.X, 0.0, 1544; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1545; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1546 %shl = shl i64 4607182418800017408, %a 1547 store i64 %shl, i64 addrspace(1)* %out, align 8 1548 ret void 1549} 1550 1551define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { 1552; SI-LABEL: s_shl_inline_imm_neg_1_0_i64: 1553; SI: ; %bb.0: 1554; SI-NEXT: s_load_dword s4, s[0:1], 0xd 1555; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1556; SI-NEXT: s_mov_b32 s3, 0xf000 1557; SI-NEXT: s_mov_b32 s2, -1 1558; SI-NEXT: s_waitcnt lgkmcnt(0) 1559; SI-NEXT: s_lshl_b64 s[4:5], -1.0, s4 1560; SI-NEXT: v_mov_b32_e32 v0, s4 1561; SI-NEXT: v_mov_b32_e32 v1, s5 1562; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1563; SI-NEXT: s_endpgm 1564; 1565; VI-LABEL: s_shl_inline_imm_neg_1_0_i64: 1566; VI: ; %bb.0: 1567; VI-NEXT: s_load_dword s4, s[0:1], 0x34 
define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; SI-LABEL: s_shl_inline_imm_neg_1_0_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[0:1], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b64 s[4:5], -1.0, s4
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_shl_inline_imm_neg_1_0_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b64 s[4:5], -1.0, s4
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_neg_1_0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: NOT_INT * T0.W, KC0[2].W,
; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
; EG-NEXT: 1610088448(3.574057e+19), 32(4.484155e-44)
; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0,
; EG-NEXT: MOV T0.X, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 13830554455654793216, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; SI-LABEL: s_shl_inline_imm_0_5_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[0:1], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b64 s[4:5], 0.5, s4
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_shl_inline_imm_0_5_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b64 s[4:5], 0.5, s4
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_0_5_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: NOT_INT * T0.W, KC0[2].W,
; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
; EG-NEXT: 535822336(1.016440e-19), 32(4.484155e-44)
; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0,
; EG-NEXT: MOV T0.X, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 4602678819172646912, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; SI-LABEL: s_shl_inline_imm_neg_0_5_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[0:1], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b64 s[4:5], -0.5, s4
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_shl_inline_imm_neg_0_5_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b64 s[4:5], -0.5, s4
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_neg_0_5_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: NOT_INT * T0.W, KC0[2].W,
; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
; EG-NEXT: 1609564160(3.458765e+19), 32(4.484155e-44)
; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0,
; EG-NEXT: MOV T0.X, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 13826050856027422720, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; SI-LABEL: s_shl_inline_imm_2_0_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[0:1], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b64 s[4:5], 2.0, s4
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_shl_inline_imm_2_0_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b64 s[4:5], 2.0, s4
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_2_0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: NOT_INT * T0.W, KC0[2].W,
; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
; EG-NEXT: 536870912(1.084202e-19), 32(4.484155e-44)
; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0,
; EG-NEXT: MOV T0.X, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 4611686018427387904, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; SI-LABEL: s_shl_inline_imm_neg_2_0_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[0:1], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b64 s[4:5], -2.0, s4
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_shl_inline_imm_neg_2_0_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b64 s[4:5], -2.0, s4
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_neg_2_0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: NOT_INT * T0.W, KC0[2].W,
; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
; EG-NEXT: 1610612736(3.689349e+19), 32(4.484155e-44)
; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0,
; EG-NEXT: MOV T0.X, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 13835058055282163712, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; SI-LABEL: s_shl_inline_imm_4_0_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[0:1], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b64 s[4:5], 4.0, s4
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_shl_inline_imm_4_0_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b64 s[4:5], 4.0, s4
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_4_0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: NOT_INT * T0.W, KC0[2].W,
; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
; EG-NEXT: 537395200(1.151965e-19), 32(4.484155e-44)
; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0,
; EG-NEXT: MOV T0.X, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 4616189618054758400, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; SI-LABEL: s_shl_inline_imm_neg_4_0_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[0:1], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b64 s[4:5], -4.0, s4
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_shl_inline_imm_neg_4_0_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b64 s[4:5], -4.0, s4
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_neg_4_0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: NOT_INT * T0.W, KC0[2].W,
; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
; EG-NEXT: 1611137024(3.919933e+19), 32(4.484155e-44)
; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0,
; EG-NEXT: MOV T0.X, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 13839561654909534208, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}


; Test with the 64-bit integer bitpattern for a 32-bit float in the
; low 32-bits, which is not a valid 64-bit inline immediate.
define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; SI-LABEL: s_shl_inline_imm_f32_4_0_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0xd
; SI-NEXT: s_mov_b64 s[0:1], 0x40800000
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_shl_inline_imm_f32_4_0_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x34
; VI-NEXT: s_mov_b64 s[0:1], 0x40800000
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_f32_4_0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 11, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: NOT_INT T0.W, KC0[2].W,
; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: LSHL T0.Z, literal.x, PS,
; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, literal.y, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.z,
; EG-NEXT: 1082130432(4.000000e+00), 541065216(1.626303e-19)
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z,
; EG-NEXT: CNDE_INT T0.X, T1.W, T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 1082130432, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

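; For reference: 1082130432 above is 0x40800000, the bit pattern of the 32-bit
; float 4.0 sitting in the low dword of the i64, so it is not a 64-bit inline
; immediate and SI/VI materialize it with s_mov_b64 before the shift. The test
; below uses -1065353216, which sign-extends to 0xffffffffc0800000 (float -4.0
; in the low dword, all ones in the high dword); that all-ones high half is
; presumably the -1 register copy referred to by the FIXME below.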
; FIXME: Copy of -1 register
define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; SI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s0, -4.0
; SI-NEXT: s_mov_b32 s1, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x34
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s0, -4.0
; VI-NEXT: s_mov_b32 s1, s6
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
; EG-NEXT: MOV T0.W, literal.y,
; EG-NEXT: NOT_INT * T1.W, KC0[2].W,
; EG-NEXT: 31(4.344025e-44), -532676608(-5.534023e+19)
; EG-NEXT: BIT_ALIGN_INT T1.Z, literal.x, PV.W, PS,
; EG-NEXT: LSHL T0.W, literal.y, PV.Z,
; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.z,
; EG-NEXT: 2147483647(nan), -1065353216(-4.000000e+00)
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT * T0.Y, PS, PV.Z, PV.W,
; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 -1065353216, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

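; The two tests below place the f32 4.0 / -4.0 bit patterns in the high 32 bits
; instead (4647714815446351872 is 0x4080000000000000), so the expected lowering
; is a zero low dword with 4.0 or -4.0 moved into the high register.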
define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; SI-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_mov_b32 s1, 4.0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x34
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_mov_b32 s1, 4.0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: NOT_INT * T0.W, KC0[2].W,
; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
; EG-NEXT: 541065216(1.626303e-19), 32(4.484155e-44)
; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0,
; EG-NEXT: MOV T0.X, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 4647714815446351872, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
; SI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_mov_b32 s1, -4.0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s2, s[0:1], 0x34
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_mov_b32 s1, -4.0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: NOT_INT * T0.W, KC0[2].W,
; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
; EG-NEXT: 1614807040(5.534023e+19), 32(4.484155e-44)
; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0,
; EG-NEXT: MOV T0.X, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %shl = shl i64 13871086852301127680, %a
  store i64 %shl, i64 addrspace(1)* %out, align 8
  ret void
}

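; test_mul2 checks that a multiply by 2 is expected to be lowered to a left
; shift by 1 (s_lshl_b32 ..., 1 on SI/VI, LSHL ..., 1 on EG).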
define amdgpu_kernel void @test_mul2(i32 %p) {
; SI-LABEL: test_mul2:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s0, s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b32 s0, s0, 1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_mul2:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s0, s0, 1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
; EG-LABEL: test_mul2:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV T0.X, literal.x,
; EG-NEXT: LSHL * T1.X, KC0[2].Y, 1,
; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
  %i = mul i32 %p, 2
  store volatile i32 %i, i32 addrspace(1)* undef
  ret void
}

define void @shl_or_k(i32 addrspace(1)* %out, i32 %in) {
; SI-LABEL: shl_or_k:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v2
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
; SI-NEXT: v_or_b32_e32 v2, 4, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: shl_or_k:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v2
; VI-NEXT: v_or_b32_e32 v2, 4, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: shl_or_k:
; EG: ; %bb.0:
; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL * T0.W, KC0[2].Z, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: OR_INT T0.X, PV.W, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 4(5.605194e-45), 2(2.802597e-45)
  %tmp0 = or i32 %in, 1
  %tmp2 = shl i32 %tmp0, 2
  store i32 %tmp2, i32 addrspace(1)* %out
  ret void
}

define void @shl_or_k_two_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %in) {
; SI-LABEL: shl_or_k_two_uses:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_or_b32_e32 v4, 1, v4
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
; SI-NEXT: v_lshlrev_b32_e32 v5, 2, v4
; SI-NEXT: buffer_store_dword v5, v[0:1], s[4:7], 0 addr64
; SI-NEXT: buffer_store_dword v4, v[2:3], s[4:7], 0 addr64
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: shl_or_k_two_uses:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_or_b32_e32 v4, 1, v4
; VI-NEXT: v_lshlrev_b32_e32 v5, 2, v4
; VI-NEXT: flat_store_dword v[0:1], v5
; VI-NEXT: flat_store_dword v[2:3], v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: shl_or_k_two_uses:
; EG: ; %bb.0:
; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Z, literal.x,
; EG-NEXT: OR_INT * T1.X, KC0[2].W, 1,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T2.X, PS, literal.x,
; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %tmp0 = or i32 %in, 1
  %tmp2 = shl i32 %tmp0, 2
  store i32 %tmp2, i32 addrspace(1)* %out0
  store i32 %tmp0, i32 addrspace(1)* %out1
  ret void
}

attributes #0 = { nounwind readnone }