; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx906 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX906 %s

; Inspects the machine IR emitted right after the amdgpu-isel pass for
; two-element 16-bit vectors assembled with insertelement and then bitcast
; to a 32-bit value. Three llc invocations are used: the default amdgcn
; target, gfx900 and gfx906, each with its own check prefix.
;
; Tests come in uniform_* / divergent_* pairs: the uniform_* ones are
; amdgpu_kernel entry points and their patterns name scalar (S_*) opcodes,
; the divergent_* ones are ordinary functions and their patterns name
; vector (V_*) opcodes.
;
; NOTE(review): attribute group #0 is referenced by several tests below but
; no definition of it is visible in this file; confirm that is intended.

; ---------------------------------------------------------------------------
; Element 0 is the constant zero, element 1 is %a.
; ---------------------------------------------------------------------------

; Kernel variant: a scalar left shift is expected.
; GCN-LABEL: name: uniform_vec_0_i16
; GCN: S_LSHL_B32
define amdgpu_kernel void @uniform_vec_0_i16(i32 addrspace(1)* %out, i16 %a) {
  %tmp = insertelement <2 x i16> undef, i16 0, i32 0
  %vec = insertelement <2 x i16> %tmp, i16 %a, i32 1
  %val = bitcast <2 x i16> %vec to i32
  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}

; Function variant: the vector (reversed-operand) left shift is expected.
; GCN-LABEL: name: divergent_vec_0_i16
; GCN: V_LSHLREV_B32_e64
define i32 @divergent_vec_0_i16(i16 %a) {
  %tmp = insertelement <2 x i16> undef, i16 0, i32 0
  %vec = insertelement <2 x i16> %tmp, i16 %a, i32 1
  %val = bitcast <2 x i16> %vec to i32
  ret i32 %val
}

; ---------------------------------------------------------------------------
; Element 0 is %a, element 1 is the constant zero.
; ---------------------------------------------------------------------------

; Kernel variant, i16 elements: a scalar AND is expected.
; GCN-LABEL: name: uniform_vec_i16_0
; GCN: S_AND_B32
define amdgpu_kernel void @uniform_vec_i16_0(i32 addrspace(1)* %out, i16 %a) {
  %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
  %vec = insertelement <2 x i16> %tmp, i16 0, i32 1
  %val = bitcast <2 x i16> %vec to i32
  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}

; Function variant, i16 elements: a vector AND is expected.
; GCN-LABEL: name: divergent_vec_i16_0
; GCN: V_AND_B32_e64
define i32 @divergent_vec_i16_0(i16 %a) {
  %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
  %vec = insertelement <2 x i16> %tmp, i16 0, i32 1
  %val = bitcast <2 x i16> %vec to i32
  ret i32 %val
}

; Kernel variant, half elements bitcast to float: a scalar AND is expected.
; GCN-LABEL: name: uniform_vec_f16_0
; GCN: S_AND_B32
define amdgpu_kernel void @uniform_vec_f16_0(float addrspace(1)* %out, half %a) {
  %tmp = insertelement <2 x half> undef, half %a, i32 0
  %vec = insertelement <2 x half> %tmp, half 0.0, i32 1
  %val = bitcast <2 x half> %vec to float
  store float %val, float addrspace(1)* %out, align 4
  ret void
}

; Function variant, half elements. The default target is expected to show a
; float-to-half conversion of %0 followed by a COPY of %1; gfx900 is expected
; to show a vector AND instead.
; GCN-LABEL: name: divergent_vec_f16_0
; GCN: V_CVT_F16_F32_e64 0, %0
; GCN: COPY %1

; GFX9-LABEL: name: divergent_vec_f16_0
; GFX9: V_AND_B32_e64
define float @divergent_vec_f16_0(half %a) {
  %tmp = insertelement <2 x half> undef, half %a, i32 0
  %vec = insertelement <2 x half> %tmp, half 0.0, i32 1
  %val = bitcast <2 x half> %vec to float
  ret float %val
}

; ---------------------------------------------------------------------------
; Both elements are the truncated low halves of 32-bit values (i16).
; ---------------------------------------------------------------------------

; Kernel variant: the inputs are volatile loads and the packed result is fed
; to inline asm through an "s" constraint. Default target: mask with 65535,
; shift left by 16, then OR the two pieces. gfx900: one S_PACK_LL_B32_B16.
; GCN-LABEL: name: uniform_vec_i16_LL
; GCN: %[[IMM:[0-9]+]]:sreg_32 = S_MOV_B32 65535
; GCN: %[[AND:[0-9]+]]:sreg_32 = S_AND_B32 killed %{{[0-9]+}}, killed %[[IMM]]
; GCN: %[[SHIFT:[0-9]+]]:sreg_32 = S_MOV_B32 16
; GCN: %[[SHL:[0-9]+]]:sreg_32 = S_LSHL_B32 killed %{{[0-9]+}}, killed %[[SHIFT]]
; GCN: S_OR_B32 killed %[[AND]], killed %[[SHL]]

; GFX9-LABEL: name: uniform_vec_i16_LL
; GFX9: S_PACK_LL_B32_B16
define amdgpu_kernel void @uniform_vec_i16_LL(i32 addrspace(4)* %in0, i32 addrspace(4)* %in1) {
  %val0 = load volatile i32, i32 addrspace(4)* %in0
  %val1 = load volatile i32, i32 addrspace(4)* %in1
  %lo = trunc i32 %val0 to i16
  %hi = trunc i32 %val1 to i16
  %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
  %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1
  %vec.i32 = bitcast <2 x i16> %vec.1 to i32
  call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
  ret void
}

; Function variant. Default target: vector shift of %1, vector AND of %0 with
; 65535, then a vector OR. gfx900: vector AND followed by V_LSHL_OR_B32_e64
; with an inline shift amount of 16.
; GCN-LABEL: name: divergent_vec_i16_LL
; GCN: %[[SHIFT:[0-9]+]]:sreg_32 = S_MOV_B32 16
; GCN: %[[SHL:[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed %[[SHIFT]], %1, implicit $exec
; GCN: %[[IMM:[0-9]+]]:sreg_32 = S_MOV_B32 65535
; GCN: %[[AND:[0-9]+]]:vgpr_32 = V_AND_B32_e64 %0, killed %[[IMM]], implicit $exec
; GCN: V_OR_B32_e64 killed %[[AND]], killed %[[SHL]], implicit $exec

; GFX9-LABEL: name: divergent_vec_i16_LL
; GFX9: %[[IMM:[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535
; GFX9: %[[AND:[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed %[[IMM]]
; GFX9: V_LSHL_OR_B32_e64 %{{[0-9]+}}, 16, killed %[[AND]]
define i32 @divergent_vec_i16_LL(i16 %a, i16 %b) {
  %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
  %vec = insertelement <2 x i16> %tmp, i16 %b, i32 1
  %val = bitcast <2 x i16> %vec to i32
  ret i32 %val
}

; ---------------------------------------------------------------------------
; Element 0 is %a, element 1 is the upper 16 bits of %b (lshr 16 + trunc).
; ---------------------------------------------------------------------------

; Kernel variant. Default target: two scalar ANDs (masks 65535 and -65536)
; whose results are ORed; the four defining instructions may appear in any
; order relative to each other, the OR must follow them. gfx900: one
; S_PACK_LH_B32_B16.
; GCN-LABEL: name: uniform_vec_i16_LH
; GCN-DAG: %[[IMM:[0-9]+]]:sreg_32 = S_MOV_B32 65535
; GCN-DAG: %[[AND:[0-9]+]]:sreg_32 = S_AND_B32 killed %{{[0-9]+}}, killed %[[IMM]]
; GCN-DAG: %[[NEG:[0-9]+]]:sreg_32 = S_MOV_B32 -65536
; GCN-DAG: %[[ANDN:[0-9]+]]:sreg_32 = S_AND_B32 killed %{{[0-9]+}}, killed %[[NEG]]
; GCN: S_OR_B32 killed %[[AND]], killed %[[ANDN]]

; GFX9-LABEL: name: uniform_vec_i16_LH
; GFX9: S_PACK_LH_B32_B16
define amdgpu_kernel void @uniform_vec_i16_LH(i32 addrspace(1)* %out, i16 %a, i32 %b) {
  %shift = lshr i32 %b, 16
  %tr = trunc i32 %shift to i16
  %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
  %vec = insertelement <2 x i16> %tmp, i16 %tr, i32 1
  %val = bitcast <2 x i16> %vec to i32
  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}

; Function variant: a bitfield insert driven by the 65535 mask is expected.
; Only the default-target run has patterns for this test.
; GCN-LABEL: name: divergent_vec_i16_LH
; GCN: %[[IMM:[0-9]+]]:sreg_32 = S_MOV_B32 65535
; GCN: V_BFI_B32_e64 killed %[[IMM]]
define i32 @divergent_vec_i16_LH(i16 %a, i32 %b) {
  %shift = lshr i32 %b, 16
  %tr = trunc i32 %shift to i16
  %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
  %vec = insertelement <2 x i16> %tmp, i16 %tr, i32 1
  %val = bitcast <2 x i16> %vec to i32
  ret i32 %val
}

; ---------------------------------------------------------------------------
; Both elements are the upper 16 bits of 32-bit values (lshr 16 + trunc).
; ---------------------------------------------------------------------------

; Kernel variant. Default target: scalar right shift by 16, scalar AND with
; -65536, then a scalar OR. gfx900: one S_PACK_HH_B32_B16.
; GCN-LABEL: name: uniform_vec_i16_HH
; GCN: %[[SHIFT:[0-9]+]]:sreg_32 = S_MOV_B32 16
; GCN: %[[SHR:[0-9]+]]:sreg_32 = S_LSHR_B32 killed %{{[0-9]+}}, killed %[[SHIFT]]
; GCN: %[[IMM:[0-9]+]]:sreg_32 = S_MOV_B32 -65536
; GCN: %[[AND:[0-9]+]]:sreg_32 = S_AND_B32 killed %{{[0-9]+}}, killed %[[IMM]]
; GCN: S_OR_B32 killed %[[SHR]], killed %[[AND]]

; GFX9-LABEL: name: uniform_vec_i16_HH
; GFX9: S_PACK_HH_B32_B16
define amdgpu_kernel void @uniform_vec_i16_HH(i32 addrspace(1)* %out, i32 %a, i32 %b) {
  %shift_a = lshr i32 %a, 16
  %tr_a = trunc i32 %shift_a to i16
  %shift_b = lshr i32 %b, 16
  %tr_b = trunc i32 %shift_b to i16
  %tmp = insertelement <2 x i16> undef, i16 %tr_a, i32 0
  %vec = insertelement <2 x i16> %tmp, i16 %tr_b, i32 1
  %val = bitcast <2 x i16> %vec to i32
  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}

; Function variant. Default target: vector right shift of %0, vector AND of
; %1 with -65536, then a vector OR. gfx900: vector right shift by an inline
; 16, then a fused V_AND_OR_B32_e64.
; GCN-LABEL: name: divergent_vec_i16_HH
; GCN: %[[SHR:[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 killed %{{[0-9]+}}, %0, implicit $exec
; GCN: %[[IMM:[0-9]+]]:sreg_32 = S_MOV_B32 -65536
; GCN: %[[AND:[0-9]+]]:vgpr_32 = V_AND_B32_e64 %1, killed %[[IMM]], implicit $exec
; GCN: V_OR_B32_e64 killed %[[SHR]], killed %[[AND]], implicit $exec

; GFX9-LABEL: name: divergent_vec_i16_HH
; GFX9: %[[SHR:[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, %0
; GFX9: %[[IMM:[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -65536, implicit $exec
; GFX9: V_AND_OR_B32_e64 %1, killed %[[IMM]], killed %[[SHR]]
define i32 @divergent_vec_i16_HH(i32 %a, i32 %b) {
  %shift_a = lshr i32 %a, 16
  %tr_a = trunc i32 %shift_a to i16
  %shift_b = lshr i32 %b, 16
  %tr_b = trunc i32 %shift_b to i16
  %tmp = insertelement <2 x i16> undef, i16 %tr_a, i32 0
  %vec = insertelement <2 x i16> %tmp, i16 %tr_b, i32 1
  %val = bitcast <2 x i16> %vec to i32
  ret i32 %val
}

; ---------------------------------------------------------------------------
; Both elements are low halves, reinterpreted as half.
; ---------------------------------------------------------------------------

; Kernel variant: same shape as uniform_vec_i16_LL, except that each
; truncated i16 is bitcast to half before being inserted. The expected
; instruction sequences are the same as for the i16 test.
; GCN-LABEL: name: uniform_vec_f16_LL
; GCN: %[[IMM:[0-9]+]]:sreg_32 = S_MOV_B32 65535
; GCN: %[[AND:[0-9]+]]:sreg_32 = S_AND_B32 killed %{{[0-9]+}}, killed %[[IMM]]
; GCN: %[[SHIFT:[0-9]+]]:sreg_32 = S_MOV_B32 16
; GCN: %[[SHL:[0-9]+]]:sreg_32 = S_LSHL_B32 killed %{{[0-9]+}}, killed %[[SHIFT]]
; GCN: S_OR_B32 killed %[[AND]], killed %[[SHL]]

; GFX9-LABEL: name: uniform_vec_f16_LL
; GFX9: S_PACK_LL_B32_B16
define amdgpu_kernel void @uniform_vec_f16_LL(i32 addrspace(4)* %in0, i32 addrspace(4)* %in1) {
  %val0 = load volatile i32, i32 addrspace(4)* %in0
  %val1 = load volatile i32, i32 addrspace(4)* %in1
  %lo.i = trunc i32 %val0 to i16
  %hi.i = trunc i32 %val1 to i16
  %lo = bitcast i16 %lo.i to half
  %hi = bitcast i16 %hi.i to half
  %vec.0 = insertelement <2 x half> undef, half %lo, i32 0
  %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
  %vec.i32 = bitcast <2 x half> %vec.1 to i32

  call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
  ret void
}

; Function variant taking two half arguments. Default target: vector shift
; left by 16 followed by a vector OR. gfx900: vector AND with 65535 followed
; by V_LSHL_OR_B32_e64.
; GCN-LABEL: name: divergent_vec_f16_LL
; GCN: %[[SHIFT:[0-9]+]]:sreg_32 = S_MOV_B32 16
; GCN: %[[SHL:[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed %[[SHIFT]]
; GCN: V_OR_B32_e64 killed %{{[0-9]+}}, killed %[[SHL]], implicit $exec

; GFX9-LABEL: name: divergent_vec_f16_LL
; GFX9: %[[IMM:[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535
; GFX9: %[[AND:[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed %[[IMM]]
; GFX9: V_LSHL_OR_B32_e64 %{{[0-9]+}}, 16, killed %[[AND]]
define float @divergent_vec_f16_LL(half %a, half %b) {
  %tmp = insertelement <2 x half> undef, half %a, i32 0
  %vec = insertelement <2 x half> %tmp, half %b, i32 1
  %val = bitcast <2 x half> %vec to float
  ret float %val
}

; ---------------------------------------------------------------------------
; Only element 0 is inserted; element 1 stays undef. These two tests carry
; patterns for the gfx906 run only.
; ---------------------------------------------------------------------------

; Function variant: the 16-bit addrspace(3) load is expected to be selected
; as DS_READ_U16 and its result copied into another vgpr_32.
; GFX906-LABEL: name: build_vec_v2i16_undeflo_divergent
; GFX906: %[[LOAD:[0-9]+]]:vgpr_32 = DS_READ_U16
; GFX906: %{{[0-9]+}}:vgpr_32 = COPY %[[LOAD]]
define <2 x i16> @build_vec_v2i16_undeflo_divergent(i16 addrspace(3)* %in) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build = insertelement <2 x i16> undef, i16 %load, i32 0
  ret <2 x i16> %build
}

; Kernel variant: the same DS_READ_U16 is expected, but its result is copied
; into an sreg_32.
; GFX906-LABEL: name: build_vec_v2i16_undeflo_uniform
; GFX906: %[[LOAD:[0-9]+]]:vgpr_32 = DS_READ_U16
; GFX906: %{{[0-9]+}}:sreg_32 = COPY %[[LOAD]]
define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(i16 addrspace(3)* %in, i32 addrspace(1)* %out) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build = insertelement <2 x i16> undef, i16 %load, i32 0
  %result = bitcast <2 x i16> %build to i32
  store i32 %result, i32 addrspace(1)* %out
  ret void
}