1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright(c) 2010-2014 Intel Corporation 3 */ 4 5 #include "acl_run_sse.h" 6 7 static const rte_ymm_t ymm_match_mask = { 8 .u32 = { 9 RTE_ACL_NODE_MATCH, 10 RTE_ACL_NODE_MATCH, 11 RTE_ACL_NODE_MATCH, 12 RTE_ACL_NODE_MATCH, 13 RTE_ACL_NODE_MATCH, 14 RTE_ACL_NODE_MATCH, 15 RTE_ACL_NODE_MATCH, 16 RTE_ACL_NODE_MATCH, 17 }, 18 }; 19 20 static const rte_ymm_t ymm_index_mask = { 21 .u32 = { 22 RTE_ACL_NODE_INDEX, 23 RTE_ACL_NODE_INDEX, 24 RTE_ACL_NODE_INDEX, 25 RTE_ACL_NODE_INDEX, 26 RTE_ACL_NODE_INDEX, 27 RTE_ACL_NODE_INDEX, 28 RTE_ACL_NODE_INDEX, 29 RTE_ACL_NODE_INDEX, 30 }, 31 }; 32 33 static const rte_ymm_t ymm_shuffle_input = { 34 .u32 = { 35 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c, 36 0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c, 37 }, 38 }; 39 40 static const rte_ymm_t ymm_ones_16 = { 41 .u16 = { 42 1, 1, 1, 1, 1, 1, 1, 1, 43 1, 1, 1, 1, 1, 1, 1, 1, 44 }, 45 }; 46 47 static const rte_ymm_t ymm_range_base = { 48 .u32 = { 49 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c, 50 0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c, 51 }, 52 }; 53 54 /* 55 * Process 8 transitions in parallel. 56 * tr_lo contains low 32 bits for 8 transition. 57 * tr_hi contains high 32 bits for 8 transition. 58 * next_input contains up to 4 input bytes for 8 flows. 59 */ 60 static __rte_always_inline ymm_t 61 transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi) 62 { 63 const int32_t *tr; 64 ymm_t addr; 65 66 tr = (const int32_t *)(uintptr_t)trans; 67 68 /* Calculate the address (array index) for all 8 transitions. */ 69 ACL_TR_CALC_ADDR(mm256, 256, addr, ymm_index_mask.y, next_input, 70 ymm_shuffle_input.y, ymm_ones_16.y, ymm_range_base.y, 71 *tr_lo, *tr_hi); 72 73 /* load lower 32 bits of 8 transactions at once. */ 74 *tr_lo = _mm256_i32gather_epi32(tr, addr, sizeof(trans[0])); 75 76 next_input = _mm256_srli_epi32(next_input, CHAR_BIT); 77 78 /* load high 32 bits of 8 transactions at once. */ 79 *tr_hi = _mm256_i32gather_epi32(tr + 1, addr, sizeof(trans[0])); 80 81 return next_input; 82 } 83 84 /* 85 * Process matches for 8 flows. 86 * tr_lo contains low 32 bits for 8 transition. 87 * tr_hi contains high 32 bits for 8 transition. 88 */ 89 static inline void 90 acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx, 91 struct parms *parms, struct acl_flow_data *flows, uint32_t slot, 92 ymm_t matches, ymm_t *tr_lo, ymm_t *tr_hi) 93 { 94 ymm_t t0, t1; 95 ymm_t lo, hi; 96 xmm_t l0, l1; 97 uint32_t i; 98 uint64_t tr[MAX_SEARCHES_SSE8]; 99 100 l1 = _mm256_extracti128_si256(*tr_lo, 1); 101 l0 = _mm256_castsi256_si128(*tr_lo); 102 103 for (i = 0; i != RTE_DIM(tr) / 2; i++) { 104 105 /* 106 * Extract low 32bits of each transition. 107 * That's enough to process the match. 108 */ 109 tr[i] = (uint32_t)_mm_cvtsi128_si32(l0); 110 tr[i + 4] = (uint32_t)_mm_cvtsi128_si32(l1); 111 112 l0 = _mm_srli_si128(l0, sizeof(uint32_t)); 113 l1 = _mm_srli_si128(l1, sizeof(uint32_t)); 114 115 tr[i] = acl_match_check(tr[i], slot + i, 116 ctx, parms, flows, resolve_priority_sse); 117 tr[i + 4] = acl_match_check(tr[i + 4], slot + i + 4, 118 ctx, parms, flows, resolve_priority_sse); 119 } 120 121 /* Collect new transitions into 2 YMM registers. */ 122 t0 = _mm256_set_epi64x(tr[5], tr[4], tr[1], tr[0]); 123 t1 = _mm256_set_epi64x(tr[7], tr[6], tr[3], tr[2]); 124 125 /* For each transition: put low 32 into tr_lo and high 32 into tr_hi */ 126 ACL_TR_HILO(mm256, __m256, t0, t1, lo, hi); 127 128 /* Keep transitions with NOMATCH intact. */ 129 *tr_lo = _mm256_blendv_epi8(*tr_lo, lo, matches); 130 *tr_hi = _mm256_blendv_epi8(*tr_hi, hi, matches); 131 } 132 133 static inline void 134 acl_match_check_avx2x8(const struct rte_acl_ctx *ctx, struct parms *parms, 135 struct acl_flow_data *flows, uint32_t slot, 136 ymm_t *tr_lo, ymm_t *tr_hi, ymm_t match_mask) 137 { 138 uint32_t msk; 139 ymm_t matches, temp; 140 141 /* test for match node */ 142 temp = _mm256_and_si256(match_mask, *tr_lo); 143 matches = _mm256_cmpeq_epi32(temp, match_mask); 144 msk = _mm256_movemask_epi8(matches); 145 146 while (msk != 0) { 147 148 acl_process_matches_avx2x8(ctx, parms, flows, slot, 149 matches, tr_lo, tr_hi); 150 temp = _mm256_and_si256(match_mask, *tr_lo); 151 matches = _mm256_cmpeq_epi32(temp, match_mask); 152 msk = _mm256_movemask_epi8(matches); 153 } 154 } 155 156 /* 157 * Execute trie traversal for up to 16 flows in parallel. 158 */ 159 static inline int 160 search_avx2x16(const struct rte_acl_ctx *ctx, const uint8_t **data, 161 uint32_t *results, uint32_t total_packets, uint32_t categories) 162 { 163 uint32_t n; 164 struct acl_flow_data flows; 165 uint64_t index_array[MAX_SEARCHES_AVX16]; 166 struct completion cmplt[MAX_SEARCHES_AVX16]; 167 struct parms parms[MAX_SEARCHES_AVX16]; 168 ymm_t input[2], tr_lo[2], tr_hi[2]; 169 ymm_t t0, t1; 170 171 acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results, 172 total_packets, categories, ctx->trans_table); 173 174 for (n = 0; n < RTE_DIM(cmplt); n++) { 175 cmplt[n].count = 0; 176 index_array[n] = acl_start_next_trie(&flows, parms, n, ctx); 177 } 178 179 t0 = _mm256_set_epi64x(index_array[5], index_array[4], 180 index_array[1], index_array[0]); 181 t1 = _mm256_set_epi64x(index_array[7], index_array[6], 182 index_array[3], index_array[2]); 183 184 ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[0], tr_hi[0]); 185 186 t0 = _mm256_set_epi64x(index_array[13], index_array[12], 187 index_array[9], index_array[8]); 188 t1 = _mm256_set_epi64x(index_array[15], index_array[14], 189 index_array[11], index_array[10]); 190 191 ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[1], tr_hi[1]); 192 193 /* Check for any matches. */ 194 acl_match_check_avx2x8(ctx, parms, &flows, 0, &tr_lo[0], &tr_hi[0], 195 ymm_match_mask.y); 196 acl_match_check_avx2x8(ctx, parms, &flows, 8, &tr_lo[1], &tr_hi[1], 197 ymm_match_mask.y); 198 199 while (flows.started > 0) { 200 201 uint32_t in[MAX_SEARCHES_SSE8]; 202 203 /* Gather 4 bytes of input data for first 8 flows. */ 204 in[0] = GET_NEXT_4BYTES(parms, 0); 205 in[4] = GET_NEXT_4BYTES(parms, 4); 206 in[1] = GET_NEXT_4BYTES(parms, 1); 207 in[5] = GET_NEXT_4BYTES(parms, 5); 208 in[2] = GET_NEXT_4BYTES(parms, 2); 209 in[6] = GET_NEXT_4BYTES(parms, 6); 210 in[3] = GET_NEXT_4BYTES(parms, 3); 211 in[7] = GET_NEXT_4BYTES(parms, 7); 212 input[0] = _mm256_set_epi32(in[7], in[6], in[5], in[4], 213 in[3], in[2], in[1], in[0]); 214 215 /* Gather 4 bytes of input data for last 8 flows. */ 216 in[0] = GET_NEXT_4BYTES(parms, 8); 217 in[4] = GET_NEXT_4BYTES(parms, 12); 218 in[1] = GET_NEXT_4BYTES(parms, 9); 219 in[5] = GET_NEXT_4BYTES(parms, 13); 220 in[2] = GET_NEXT_4BYTES(parms, 10); 221 in[6] = GET_NEXT_4BYTES(parms, 14); 222 in[3] = GET_NEXT_4BYTES(parms, 11); 223 in[7] = GET_NEXT_4BYTES(parms, 15); 224 input[1] = _mm256_set_epi32(in[7], in[6], in[5], in[4], 225 in[3], in[2], in[1], in[0]); 226 227 input[0] = transition8(input[0], flows.trans, 228 &tr_lo[0], &tr_hi[0]); 229 input[1] = transition8(input[1], flows.trans, 230 &tr_lo[1], &tr_hi[1]); 231 232 input[0] = transition8(input[0], flows.trans, 233 &tr_lo[0], &tr_hi[0]); 234 input[1] = transition8(input[1], flows.trans, 235 &tr_lo[1], &tr_hi[1]); 236 237 input[0] = transition8(input[0], flows.trans, 238 &tr_lo[0], &tr_hi[0]); 239 input[1] = transition8(input[1], flows.trans, 240 &tr_lo[1], &tr_hi[1]); 241 242 input[0] = transition8(input[0], flows.trans, 243 &tr_lo[0], &tr_hi[0]); 244 input[1] = transition8(input[1], flows.trans, 245 &tr_lo[1], &tr_hi[1]); 246 247 /* Check for any matches. */ 248 acl_match_check_avx2x8(ctx, parms, &flows, 0, 249 &tr_lo[0], &tr_hi[0], ymm_match_mask.y); 250 acl_match_check_avx2x8(ctx, parms, &flows, 8, 251 &tr_lo[1], &tr_hi[1], ymm_match_mask.y); 252 } 253 254 return 0; 255 } 256