; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s

; Instruction-selection checks for 16-bit integer multiplies, compiled for
; tahiti, fiji and gfx900 (one llc invocation per target above).

; i16 multiply of two function arguments.
; The tahiti checks expect both operands to be masked to their low 16 bits
; and then multiplied with the 24-bit unsigned multiply; fiji and gfx900 are
; expected to use the 16-bit multiply directly on v0 and v1.
; GCN-LABEL: {{^}}v_mul_i16:
; SI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
; SI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
; SI: v_mul_u32_u24

; GFX89: v_mul_lo_u16_e32 v0, v0, v1
define i16 @v_mul_i16(i16 %a, i16 %b) {
  %r.val = mul i16 %a, %b
  ret i16 %r.val
}

; i16 multiply of two kernel arguments; the volatile store consumes the
; product so the multiply cannot be dropped as dead.
; The expected instruction is the 32-bit scalar multiply. The previous
; pattern spelled the mnemonic as "s_mul_i16", which is not an instruction
; and could only match the kernel's own symbol name further down in the
; output, so it verified nothing about the selected multiply.
; GCN-LABEL: {{^}}s_mul_i16:
; GCN: s_mul_i32
define amdgpu_kernel void @s_mul_i16(i16 %a, i16 %b) {
  %r.val = mul i16 %a, %b
  store volatile i16 %r.val, i16 addrspace(1)* null
  ret void
}

; FIXME: Should emit u16 mul here.
; Both operands are loaded through kernel-argument pointers and the product
; is stored back through a third one. Every target is currently expected to
; use a full 32-bit vector multiply for this case.
; GCN-LABEL: {{^}}v_mul_i16_uniform_load:
; GCN: v_mul_lo_u32
define amdgpu_kernel void @v_mul_i16_uniform_load(i16 addrspace(1)* %r, i16 addrspace(1)* %a, i16 addrspace(1)* %b) {
  %lhs = load i16, i16 addrspace(1)* %a
  %rhs = load i16, i16 addrspace(1)* %b
  %prod = mul i16 %lhs, %rhs
  store i16 %prod, i16 addrspace(1)* %r
  ret void
}

; Two-element vector multiply.
; tahiti expects one 24-bit multiply per element. fiji expects an SDWA
; multiply plus a plain 16-bit multiply whose results are merged with an OR.
; gfx900 expects a single packed multiply as the only instruction between
; the initial wait and the return.
; GCN-LABEL: {{^}}v_mul_v2i16:
; SI: v_mul_u32_u24
; SI: v_mul_u32_u24

; VI: v_mul_lo_u16_sdwa
; VI: v_mul_lo_u16_e32
; VI: v_or_b32_e32


; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1
; GFX9-NEXT: s_setpc_b64
define <2 x i16> @v_mul_v2i16(<2 x i16> %a, <2 x i16> %b) {
  %prod = mul <2 x i16> %a, %b
  ret <2 x i16> %prod
}

; Three-element vector multiply.
; tahiti and fiji expect three multiplies, one per element; gfx900 expects
; two packed multiplies back to back.
; FIXME: Unpack garbage on gfx9
; GCN-LABEL: {{^}}v_mul_v3i16:
; SI: v_mul_u32_u24
; SI: v_mul_u32_u24
; SI: v_mul_u32_u24

; VI: v_mul_lo_u16
; VI: v_mul_lo_u16
; VI: v_mul_lo_u16

; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_mul_lo_u16
; GFX9-NEXT: v_pk_mul_lo_u16
; GFX9-NEXT: s_setpc_b64
define <3 x i16> @v_mul_v3i16(<3 x i16> %a, <3 x i16> %b) {
  %prod = mul <3 x i16> %a, %b
  ret <3 x i16> %prod
}

; Four-element vector multiply.
; tahiti expects four 24-bit multiplies. fiji expects two SDWA multiplies and
; two plain 16-bit multiplies, merged by two ORs. gfx900 expects exactly two
; packed multiplies, pairing v0 with v2 and v1 with v3.
; GCN-LABEL: {{^}}v_mul_v4i16:
; SI: v_mul_u32_u24
; SI: v_mul_u32_u24
; SI: v_mul_u32_u24
; SI: v_mul_u32_u24

; VI: v_mul_lo_u16_sdwa
; VI: v_mul_lo_u16_sdwa
; VI: v_mul_lo_u16_e32
; VI: v_mul_lo_u16_e32
; VI: v_or_b32_e32
; VI: v_or_b32_e32

; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v3
; GFX9-NEXT: s_setpc_b64
define <4 x i16> @v_mul_v4i16(<4 x i16> %a, <4 x i16> %b) {
  %prod = mul <4 x i16> %a, %b
  ret <4 x i16> %prod
}