; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s

; Instruction-selection checks for 16-bit integer multiplies on three
; subtargets (tahiti, fiji, gfx900), each driven by one of the llc
; invocations above.

; Non-kernel function multiplying its two i16 arguments and returning the
; product. The tahiti run expects both operands to be masked with 0xffff and
; then multiplied with the 24-bit unsigned multiply; the fiji and gfx900 runs
; expect a single 16-bit multiply of v0 and v1.
; GCN-LABEL: {{^}}v_mul_i16:
; SI: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
; SI: v_and_b32_e32 v{{[0-9]+}}, [[K]]
; SI: v_and_b32_e32 v{{[0-9]+}}, [[K]]
; SI: v_mul_u32_u24

; GFX89: v_mul_lo_u16_e32 v0, v0, v1
define i16 @v_mul_i16(i16 %a, i16 %b) {
  %r.val = mul i16 %a, %b
  ret i16 %r.val
}

; Kernel multiplying its two i16 arguments and storing the product with a
; volatile store to a null global pointer.
; The check names the 32-bit scalar multiply because the scalar ALU has no
; 16-bit multiply. A pattern equal to the function name would also match the
; symbol's own directives in the output and so could never fail.
; GCN-LABEL: {{^}}s_mul_i16:
; GCN: s_mul_i32
define amdgpu_kernel void @s_mul_i16(i16 %a, i16 %b) {
  %r.val = mul i16 %a, %b
  store volatile i16 %r.val, i16 addrspace(1)* null
  ret void
}

; FIXME: Should emit u16 mul here.
; GCN-LABEL: {{^}}v_mul_i16_uniform_load:
; GCN: v_mul_lo_u32
; Kernel that loads one i16 through each of %a and %b, multiplies the two
; values and stores the product through %r. Every run line expects the
; 32-bit low multiply here.
define amdgpu_kernel void @v_mul_i16_uniform_load(i16 addrspace(1)* %r, i16 addrspace(1)* %a, i16 addrspace(1)* %b) {
entry:
  %lhs = load i16, i16 addrspace(1)* %a
  %rhs = load i16, i16 addrspace(1)* %b
  %product = mul i16 %lhs, %rhs
  store i16 %product, i16 addrspace(1)* %r
  ret void
}

; Two-element vector multiply on function arguments.
; The tahiti run expects two 24-bit multiplies. The fiji run expects one SDWA
; and one plain 16-bit multiply whose results are combined with an OR. The
; gfx900 run expects exactly one packed multiply between the wait and the
; return.
; GCN-LABEL: {{^}}v_mul_v2i16:
; SI: v_mul_u32_u24
; SI: v_mul_u32_u24

; VI: v_mul_lo_u16_sdwa
; VI: v_mul_lo_u16_e32
; VI: v_or_b32_e32

; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1
; GFX9-NEXT: s_setpc_b64
define <2 x i16> @v_mul_v2i16(<2 x i16> %a, <2 x i16> %b) {
  %product = mul <2 x i16> %a, %b
  ret <2 x i16> %product
}

; Three-element vector multiply on function arguments.
; Three multiplies are expected on tahiti and on fiji; the gfx900 run expects
; two packed multiplies directly between the wait and the return.
; FIXME: Unpack garbage on gfx9
; GCN-LABEL: {{^}}v_mul_v3i16:
; SI: v_mul_u32_u24
; SI: v_mul_u32_u24
; SI: v_mul_u32_u24

; VI: v_mul_lo_u16
; VI: v_mul_lo_u16
; VI: v_mul_lo_u16

; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_mul_lo_u16
; GFX9-NEXT: v_pk_mul_lo_u16
; GFX9-NEXT: s_setpc_b64
define <3 x i16> @v_mul_v3i16(<3 x i16> %a, <3 x i16> %b) {
  %product = mul <3 x i16> %a, %b
  ret <3 x i16> %product
}

; Four-element vector multiply on function arguments.
; The tahiti run expects four 24-bit multiplies. The fiji run expects two SDWA
; and two plain 16-bit multiplies followed by two ORs. The gfx900 run expects
; one packed multiply per register pair (v0 with v2, v1 with v3).
; GCN-LABEL: {{^}}v_mul_v4i16:
; SI: v_mul_u32_u24
; SI: v_mul_u32_u24
; SI: v_mul_u32_u24
; SI: v_mul_u32_u24

; VI: v_mul_lo_u16_sdwa
; VI: v_mul_lo_u16_sdwa
; VI: v_mul_lo_u16_e32
; VI: v_mul_lo_u16_e32
; VI: v_or_b32_e32
; VI: v_or_b32_e32

; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v3
; GFX9-NEXT: s_setpc_b64
define <4 x i16> @v_mul_v4i16(<4 x i16> %a, <4 x i16> %b) {
  %product = mul <4 x i16> %a, %b
  ret <4 x i16> %product
}