/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017 Cavium, Inc
 */

#include <string.h>

#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_net_crc.h>
#include <rte_vect.h>
#include <rte_cpuflags.h>

#include "net_crc.h"

/** PMULL CRC computation context structure */
struct crc_pmull_ctx {
	uint64x2_t rk1_rk2;
	uint64x2_t rk5_rk6;
	uint64x2_t rk7_rk8;
};

struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
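
/*
 * Each context holds three pairs of folding constants: rk1/rk2 drive the
 * 16-byte folding rounds, rk5/rk6 reduce the folded value from 128 to
 * 64 bits, and rk7/rk8 are used for the final Barrett reduction from
 * 64 to 32 bits (going by the values loaded in rte_net_crc_neon_init(),
 * rk8 appears to be the bit-reflected CRC polynomial itself).
 */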

/**
 * @brief Performs one folding round
 *
 * Logically, the function operates as follows:
 *	DATA = READ_NEXT_16BYTES();
 *	F1 = LSB8(FOLD)
 *	F2 = MSB8(FOLD)
 *	T1 = CLMUL(F1, RK1)
 *	T2 = CLMUL(F2, RK2)
 *	FOLD = XOR(T1, T2, DATA)
 *
 * @param data_block 16 byte data block
 * @param precomp precomputed rk1 and rk2 constants
 * @param fold running 16 byte folded data
 *
 * @return New 16 byte folded data
 */
static inline uint64x2_t
crcr32_folding_round(uint64x2_t data_block, uint64x2_t precomp,
	uint64x2_t fold)
{
	uint64x2_t tmp0 = vreinterpretq_u64_p128(vmull_p64(
			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 1),
			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));

	uint64x2_t tmp1 = vreinterpretq_u64_p128(vmull_p64(
			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 0),
			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));

	return veorq_u64(tmp1, veorq_u64(data_block, tmp0));
}

/**
 * Performs reduction from 128 bits to 64 bits
 *
 * @param data128 128-bit data to be reduced
 * @param precomp rk5 and rk6 precomputed constants
 *
 * @return data reduced to 64 bits
 */
static inline uint64x2_t
crcr32_reduce_128_to_64(uint64x2_t data128,
	uint64x2_t precomp)
{
	uint64x2_t tmp0, tmp1, tmp2;

	/* 64b fold */
	tmp0 = vreinterpretq_u64_p128(vmull_p64(
			vgetq_lane_p64(vreinterpretq_p64_u64(data128), 0),
			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
	tmp1 = vshift_bytes_right(data128, 8);
	tmp0 = veorq_u64(tmp0, tmp1);

	/* 32b fold */
	tmp2 = vshift_bytes_left(tmp0, 4);
	tmp1 = vreinterpretq_u64_p128(vmull_p64(
			vgetq_lane_p64(vreinterpretq_p64_u64(tmp2), 0),
			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));

	return veorq_u64(tmp1, tmp0);
}

/**
 * Performs Barrett's reduction from 64 bits to 32 bits
 *
 * @param data64 64-bit data to be reduced
 * @param precomp rk7 and rk8 precomputed constants
 *
 * @return data reduced to 32 bits
 */
static inline uint32_t
crcr32_reduce_64_to_32(uint64x2_t data64,
	uint64x2_t precomp)
{
	static uint32_t mask1[4] __rte_aligned(16) = {
		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
	};
	static uint32_t mask2[4] __rte_aligned(16) = {
		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
	};
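	/*
	 * mask1 keeps only the low 64 bits of a vector and mask2 clears only
	 * the lowest 32-bit word; they are applied around the two carry-less
	 * multiplications by rk7 and rk8 that implement the Barrett step.
	 */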
	uint64x2_t tmp0, tmp1, tmp2;

	tmp0 = vandq_u64(data64, vld1q_u64((uint64_t *)mask2));

	tmp1 = vreinterpretq_u64_p128(vmull_p64(
			vgetq_lane_p64(vreinterpretq_p64_u64(tmp0), 0),
			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
	tmp1 = veorq_u64(tmp1, tmp0);
	tmp1 = vandq_u64(tmp1, vld1q_u64((uint64_t *)mask1));

	tmp2 = vreinterpretq_u64_p128(vmull_p64(
			vgetq_lane_p64(vreinterpretq_p64_u64(tmp1), 0),
			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
	tmp2 = veorq_u64(tmp2, tmp1);
	tmp2 = veorq_u64(tmp2, tmp0);

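	/* after the final XORs, the 32-bit CRC result sits in lane 2 of tmp2 */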
	return vgetq_lane_u32(vreinterpretq_u32_u64(tmp2), 2);
}

static inline uint32_t
crc32_eth_calc_pmull(
	const uint8_t *data,
	uint32_t data_len,
	uint32_t crc,
	const struct crc_pmull_ctx *params)
{
	uint64x2_t temp, fold, k;
	uint32_t n;

	/* Get CRC init value */
	temp = vreinterpretq_u64_u32(vsetq_lane_u32(crc, vmovq_n_u32(0), 0));

	/**
	 * Fold all data into a single 16-byte data block
	 * Assumes: fold holds the first 16 bytes of data
	 */
	if (unlikely(data_len < 32)) {
		if (unlikely(data_len == 16)) {
			/* 16 bytes */
			fold = vld1q_u64((const uint64_t *)data);
			fold = veorq_u64(fold, temp);
			goto reduction_128_64;
		}

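		/*
		 * Inputs shorter than 16 bytes are copied into a zero-padded
		 * stack buffer; the shifts below line the valid bytes up with
		 * the layout the reductions expect, and inputs shorter than
		 * 4 bytes skip the 128 -> 64 bit fold entirely.
		 */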
		if (unlikely(data_len < 16)) {
			/* 0 to 15 bytes */
			uint8_t buffer[16] __rte_aligned(16);

			memset(buffer, 0, sizeof(buffer));
			memcpy(buffer, data, data_len);

			fold = vld1q_u64((uint64_t *)buffer);
			fold = veorq_u64(fold, temp);
			if (unlikely(data_len < 4)) {
				fold = vshift_bytes_left(fold, 8 - data_len);
				goto barret_reduction;
			}
			fold = vshift_bytes_left(fold, 16 - data_len);
			goto reduction_128_64;
		}
		/* 17 to 31 bytes */
		fold = vld1q_u64((const uint64_t *)data);
		fold = veorq_u64(fold, temp);
		n = 16;
		k = params->rk1_rk2;
		goto partial_bytes;
	}

	/** At least 32 bytes in the buffer */
	/** Apply CRC initial value */
	fold = vld1q_u64((const uint64_t *)data);
	fold = veorq_u64(fold, temp);

	/** Main folding loop - the last 16 bytes are processed separately */
	k = params->rk1_rk2;
	for (n = 16; (n + 16) <= data_len; n += 16) {
		temp = vld1q_u64((const uint64_t *)&data[n]);
		fold = crcr32_folding_round(temp, k, fold);
	}

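	/*
	 * Handle the 1 to 15 trailing bytes: the last 16 bytes of the buffer
	 * are reloaded (overlapping data already folded), merged with the
	 * shifted fold register under a byte mask, and folded once more
	 * before the final reductions.
	 */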
partial_bytes:
	if (likely(n < data_len)) {
		uint64x2_t last16, a, b, mask;
		uint32_t rem = data_len & 15;

		last16 = vld1q_u64((const uint64_t *)&data[data_len - 16]);
		a = vshift_bytes_left(fold, 16 - rem);
		b = vshift_bytes_right(fold, rem);
		mask = vshift_bytes_left(vdupq_n_u64(-1), 16 - rem);
		b = vorrq_u64(b, vandq_u64(mask, last16));

		/* k holds rk1 and rk2 */
		temp = vreinterpretq_u64_p128(vmull_p64(
				vgetq_lane_p64(vreinterpretq_p64_u64(a), 1),
				vgetq_lane_p64(vreinterpretq_p64_u64(k), 0)));
		fold = vreinterpretq_u64_p128(vmull_p64(
				vgetq_lane_p64(vreinterpretq_p64_u64(a), 0),
				vgetq_lane_p64(vreinterpretq_p64_u64(k), 1)));
		fold = veorq_u64(fold, temp);
		fold = veorq_u64(fold, b);
	}

	/** Reduction 128 -> 32. Assumes: fold holds 128-bit folded data */
reduction_128_64:
	k = params->rk5_rk6;
	fold = crcr32_reduce_128_to_64(fold, k);

barret_reduction:
	k = params->rk7_rk8;
	n = crcr32_reduce_64_to_32(fold, k);

	return n;
}

void
rte_net_crc_neon_init(void)
{
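	/*
	 * Folding constants for the PMULL carry-less multiplication scheme
	 * (the same approach as the x86 PCLMULQDQ CRC implementation):
	 * k1/k2 and k5/k6 are precomputed remainders of powers of x modulo
	 * the bit-reflected polynomial, while k8 is the reflected polynomial
	 * itself and k7 the matching Barrett constant.
	 */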
	/* Initialize CRC16 data */
	uint64_t ccitt_k1_k2[2] = {0x189aeLLU, 0x8e10LLU};
	uint64_t ccitt_k5_k6[2] = {0x189aeLLU, 0x114aaLLU};
	uint64_t ccitt_k7_k8[2] = {0x11c581910LLU, 0x10811LLU};

	/* Initialize CRC32 data */
	uint64_t eth_k1_k2[2] = {0xccaa009eLLU, 0x1751997d0LLU};
	uint64_t eth_k5_k6[2] = {0xccaa009eLLU, 0x163cd6124LLU};
	uint64_t eth_k7_k8[2] = {0x1f7011640LLU, 0x1db710641LLU};

	/** Save the CRC16 params in the context structure */
	crc16_ccitt_pmull.rk1_rk2 = vld1q_u64(ccitt_k1_k2);
	crc16_ccitt_pmull.rk5_rk6 = vld1q_u64(ccitt_k5_k6);
	crc16_ccitt_pmull.rk7_rk8 = vld1q_u64(ccitt_k7_k8);

	/** Save the CRC32 params in the context structure */
	crc32_eth_pmull.rk1_rk2 = vld1q_u64(eth_k1_k2);
	crc32_eth_pmull.rk5_rk6 = vld1q_u64(eth_k5_k6);
	crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8);
}

uint32_t
rte_crc16_ccitt_neon_handler(const uint8_t *data, uint32_t data_len)
{
	return (uint16_t)~crc32_eth_calc_pmull(data,
		data_len,
		0xffff,
		&crc16_ccitt_pmull);
}

uint32_t
rte_crc32_eth_neon_handler(const uint8_t *data, uint32_t data_len)
{
	return ~crc32_eth_calc_pmull(data,
		data_len,
		0xffffffffUL,
		&crc32_eth_pmull);
}

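/*
 * Usage sketch: applications normally reach these handlers indirectly,
 * through the generic rte_net_crc API, rather than by calling them here.
 * Assuming the classic rte_net_crc.h interface (the API has changed across
 * DPDK releases, so check the header of your version):
 *
 *	rte_net_crc_set_alg(RTE_NET_CRC_NEON);
 *	uint32_t crc = rte_net_crc_calc(buf, buf_len, RTE_NET_CRC32_ETH);
 */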