/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2017-2020 Intel Corporation
 */

#include <string.h>

#include <rte_common.h>
#include <rte_branch_prediction.h>
#include <rte_cpuflags.h>

#include "net_crc.h"

#include <x86intrin.h>

/** PCLMULQDQ CRC computation context structure */
struct crc_pclmulqdq_ctx {
	__m128i rk1_rk2;
	__m128i rk5_rk6;
	__m128i rk7_rk8;
};

static struct crc_pclmulqdq_ctx crc32_eth_pclmulqdq __rte_aligned(16);
static struct crc_pclmulqdq_ctx crc16_ccitt_pclmulqdq __rte_aligned(16);
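/*
 * The rk* fields hold the precomputed multipliers used by the PCLMULQDQ
 * CRC method (cf. Intel's white paper "Fast CRC Computation for Generic
 * Polynomials Using PCLMULQDQ Instruction"): rk1/rk2 fold the input 16
 * bytes at a time, rk5/rk6 reduce the final 128-bit remainder to 64 bits,
 * and rk7/rk8 carry the Barrett constant and the bit-reflected polynomial.
 */
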
/**
 * @brief Performs one folding round
 *
 * Logically, the function operates as follows:
 *     DATA = READ_NEXT_16BYTES();
 *     F1 = LSB8(FOLD)
 *     F2 = MSB8(FOLD)
 *     T1 = CLMUL(F1, RK1)
 *     T2 = CLMUL(F2, RK2)
 *     FOLD = XOR(T1, T2, DATA)
 *
 * @param data_block
 *   16 byte data block
 * @param precomp
 *   Precomputed rk1 and rk2 constants
 * @param fold
 *   Current 16 byte folded data
 *
 * @return
 *   New 16 byte folded data
 */
static __rte_always_inline __m128i
crcr32_folding_round(__m128i data_block,
		__m128i precomp,
		__m128i fold)
{
	__m128i tmp0 = _mm_clmulepi64_si128(fold, precomp, 0x01);
	__m128i tmp1 = _mm_clmulepi64_si128(fold, precomp, 0x10);

	return _mm_xor_si128(tmp1, _mm_xor_si128(data_block, tmp0));
}

/**
 * Performs reduction from 128 bits to 64 bits
 *
 * @param data128
 *   128-bit data to be reduced
 * @param precomp
 *   precomputed constants rk5, rk6
 *
 * @return
 *   64-bit reduced data
 */

static __rte_always_inline __m128i
crcr32_reduce_128_to_64(__m128i data128, __m128i precomp)
{
	__m128i tmp0, tmp1, tmp2;

	/* 64b fold */
	tmp0 = _mm_clmulepi64_si128(data128, precomp, 0x00);
	tmp1 = _mm_srli_si128(data128, 8);
	tmp0 = _mm_xor_si128(tmp0, tmp1);

	/* 32b fold */
	tmp2 = _mm_slli_si128(tmp0, 4);
	tmp1 = _mm_clmulepi64_si128(tmp2, precomp, 0x10);

	return _mm_xor_si128(tmp1, tmp0);
}

/**
 * Performs Barrett's reduction from 64 bits to 32 bits
 *
 * @param data64
 *   64-bit data to be reduced
 * @param precomp
 *   rk7, rk8 precomputed constants
 *
 * @return
 *   reduced 32-bit data
 */

static __rte_always_inline uint32_t
crcr32_reduce_64_to_32(__m128i data64, __m128i precomp)
{
	static const uint32_t mask1[4] __rte_aligned(16) = {
		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
	};

	static const uint32_t mask2[4] __rte_aligned(16) = {
		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
	};
	__m128i tmp0, tmp1, tmp2;

	tmp0 = _mm_and_si128(data64, _mm_load_si128((const __m128i *)mask2));

	tmp1 = _mm_clmulepi64_si128(tmp0, precomp, 0x00);
	tmp1 = _mm_xor_si128(tmp1, tmp0);
	tmp1 = _mm_and_si128(tmp1, _mm_load_si128((const __m128i *)mask1));

	tmp2 = _mm_clmulepi64_si128(tmp1, precomp, 0x10);
	tmp2 = _mm_xor_si128(tmp2, tmp1);
	tmp2 = _mm_xor_si128(tmp2, tmp0);

	return _mm_extract_epi32(tmp2, 2);
}

static const uint8_t crc_xmm_shift_tab[48] __rte_aligned(16) = {
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
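
/*
 * Note on the table above: with PSHUFB (_mm_shuffle_epi8), any control byte
 * that has its most significant bit set produces a zero result byte, so
 * loading a 16-byte window from this table at a varying offset yields a
 * shuffle mask that shifts the register while shifting in zeros.
 */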

/**
 * Shifts left 128-bit register by specified number of bytes
 *
 * @param reg
 *   128-bit value
 * @param num
 *   number of bytes to shift reg left by (0-16)
 *
 * @return
 *   reg << (num * 8)
 */

static __rte_always_inline __m128i
xmm_shift_left(__m128i reg, const unsigned int num)
{
	const __m128i *p = (const __m128i *)(crc_xmm_shift_tab + 16 - num);

	return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
}

static __rte_always_inline uint32_t
crc32_eth_calc_pclmulqdq(
	const uint8_t *data,
	uint32_t data_len,
	uint32_t crc,
	const struct crc_pclmulqdq_ctx *params)
{
	__m128i temp, fold, k;
	uint32_t n;

	/* Get CRC init value */
	temp = _mm_insert_epi32(_mm_setzero_si128(), crc, 0);

	/**
	 * Fold all data into a single 16 byte data block
	 * Assumes: fold holds first 16 bytes of data
	 */

	if (unlikely(data_len < 32)) {
		if (unlikely(data_len == 16)) {
			/* 16 bytes */
			fold = _mm_loadu_si128((const __m128i *)data);
			fold = _mm_xor_si128(fold, temp);
			goto reduction_128_64;
		}

		if (unlikely(data_len < 16)) {
			/* 0 to 15 bytes */
			uint8_t buffer[16] __rte_aligned(16);

			memset(buffer, 0, sizeof(buffer));
			memcpy(buffer, data, data_len);

			fold = _mm_load_si128((const __m128i *)buffer);
			fold = _mm_xor_si128(fold, temp);
			if (unlikely(data_len < 4)) {
				fold = xmm_shift_left(fold, 8 - data_len);
				goto barret_reduction;
			}
			fold = xmm_shift_left(fold, 16 - data_len);
			goto reduction_128_64;
		}
		/* 17 to 31 bytes */
		fold = _mm_loadu_si128((const __m128i *)data);
		fold = _mm_xor_si128(fold, temp);
		n = 16;
		k = params->rk1_rk2;
		goto partial_bytes;
	}

	/** At least 32 bytes in the buffer */
	/** Apply CRC initial value */
	fold = _mm_loadu_si128((const __m128i *)data);
	fold = _mm_xor_si128(fold, temp);

	/** Main folding loop - the last 16 bytes are processed separately */
	k = params->rk1_rk2;
	for (n = 16; (n + 16) <= data_len; n += 16) {
		temp = _mm_loadu_si128((const __m128i *)&data[n]);
		fold = crcr32_folding_round(temp, k, fold);
	}

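	/*
	 * Handle the 1 to 15 bytes that remain after the 16-byte folding
	 * loop: load the (overlapping) last 16 bytes of the buffer, realign
	 * the current fold value with shf_table/PSHUFB, merge in the tail
	 * bytes with PBLENDVB and perform one final folding round.
	 */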
partial_bytes:
	if (likely(n < data_len)) {

		const uint32_t mask3[4] __rte_aligned(16) = {
			0x80808080, 0x80808080, 0x80808080, 0x80808080
		};

		const uint8_t shf_table[32] __rte_aligned(16) = {
			0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
			0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
			0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
			0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
		};

		__m128i last16, a, b;

		last16 = _mm_loadu_si128((const __m128i *)&data[data_len - 16]);

		temp = _mm_loadu_si128((const __m128i *)
			&shf_table[data_len & 15]);
		a = _mm_shuffle_epi8(fold, temp);

		temp = _mm_xor_si128(temp,
			_mm_load_si128((const __m128i *)mask3));
		b = _mm_shuffle_epi8(fold, temp);
		b = _mm_blendv_epi8(b, last16, temp);

		/* k = rk1 & rk2 */
		temp = _mm_clmulepi64_si128(a, k, 0x01);
		fold = _mm_clmulepi64_si128(a, k, 0x10);

		fold = _mm_xor_si128(fold, temp);
		fold = _mm_xor_si128(fold, b);
	}

	/** Reduction 128 -> 32: assumes fold holds 128-bit folded data */
reduction_128_64:
	k = params->rk5_rk6;
	fold = crcr32_reduce_128_to_64(fold, k);

barret_reduction:
	k = params->rk7_rk8;
	n = crcr32_reduce_64_to_32(fold, k);

	return n;
}

void
rte_net_crc_sse42_init(void)
{
	uint64_t k1, k2, k5, k6;
	uint64_t p = 0, q = 0;

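	/*
	 * In the values below, p is the bit-reflected CRC polynomial
	 * (including the implicit top term), q the matching Barrett
	 * reduction constant, and k1/k2, k5/k6 the folding multipliers
	 * consumed by the routines above.
	 */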
	/** Initialize CRC16 data */
	k1 = 0x189aeLLU;
	k2 = 0x8e10LLU;
	k5 = 0x189aeLLU;
	k6 = 0x114aaLLU;
	q = 0x11c581910LLU;
	p = 0x10811LLU;

	/** Save the params in context structure */
	crc16_ccitt_pclmulqdq.rk1_rk2 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
	crc16_ccitt_pclmulqdq.rk5_rk6 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
	crc16_ccitt_pclmulqdq.rk7_rk8 =
		_mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));

	/** Initialize CRC32 data */
	k1 = 0xccaa009eLLU;
	k2 = 0x1751997d0LLU;
	k5 = 0xccaa009eLLU;
	k6 = 0x163cd6124LLU;
	q = 0x1f7011640LLU;
	p = 0x1db710641LLU;

	/** Save the params in context structure */
	crc32_eth_pclmulqdq.rk1_rk2 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k1), _mm_cvtsi64_m64(k2));
	crc32_eth_pclmulqdq.rk5_rk6 =
		_mm_setr_epi64(_mm_cvtsi64_m64(k5), _mm_cvtsi64_m64(k6));
	crc32_eth_pclmulqdq.rk7_rk8 =
		_mm_setr_epi64(_mm_cvtsi64_m64(q), _mm_cvtsi64_m64(p));

	/**
	 * Reset the MMX/x87 state: _mm_cvtsi64_m64() above uses MMX
	 * registers, and following calculations may use other data types
	 * such as float, double, etc.
	 */
	_mm_empty();
}

uint32_t
rte_crc16_ccitt_sse42_handler(const uint8_t *data, uint32_t data_len)
{
	/** return 16-bit CRC value */
	return (uint16_t)~crc32_eth_calc_pclmulqdq(data,
		data_len,
		0xffff,
		&crc16_ccitt_pclmulqdq);
}

uint32_t
rte_crc32_eth_sse42_handler(const uint8_t *data, uint32_t data_len)
{
	return ~crc32_eth_calc_pclmulqdq(data,
		data_len,
		0xffffffffUL,
		&crc32_eth_pclmulqdq);
}
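
/*
 * Minimal usage sketch (illustrative only, assuming direct access to these
 * handlers; applications normally go through the public rte_net_crc API
 * rather than calling the SSE4.2 handlers directly):
 *
 *	rte_net_crc_sse42_init();
 *	uint32_t crc = rte_crc32_eth_sse42_handler(pkt_data, pkt_len);
 */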