1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2020 Intel Corporation
3  */
4 
5 #include "iavf_rxtx_vec_common.h"
6 
7 #include <x86intrin.h>
8 
9 #ifndef __INTEL_COMPILER
10 #pragma GCC diagnostic ignored "-Wcast-qual"
11 #endif
12 
13 #define IAVF_DESCS_PER_LOOP_AVX 8
14 #define PKTLEN_SHIFT 10
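/* IAVF_DESCS_PER_LOOP_AVX is the number of descriptors handled per iteration
 * of the AVX512 RX loop below; PKTLEN_SHIFT is the left shift that moves the
 * descriptor's packet length field into the top 16-bit word of its 32-bit
 * lane before the length blend in the RX loop.
 */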
15 
16 static inline void
17 iavf_rxq_rearm(struct iavf_rx_queue *rxq)
18 {
19 	int i;
20 	uint16_t rx_id;
21 	volatile union iavf_rx_desc *rxdp;
22 	struct rte_mempool_cache *cache =
23 		rte_mempool_default_cache(rxq->mp, rte_lcore_id());
24 	struct rte_mbuf **rxp = &rxq->sw_ring[rxq->rxrearm_start];
25 
26 	rxdp = rxq->rx_ring + rxq->rxrearm_start;
27 
28 	/* We need to pull 'n' more MBUFs into the software ring from the mempool.
29 	 * We inline the mempool function here, so we can vectorize the copy
30 	 * from the cache into the shadow ring.
31 	 */
32 
33 	/* Can this be satisfied from the cache? */
34 	if (cache->len < IAVF_RXQ_REARM_THRESH) {
35 		/* No. Backfill the cache first, and then fill from it */
36 		uint32_t req = IAVF_RXQ_REARM_THRESH + (cache->size -
37 							cache->len);
38 
39 		/* How many do we require, i.e. enough to refill the cache plus the request */
40 		int ret = rte_mempool_ops_dequeue_bulk
41 				(rxq->mp, &cache->objs[cache->len], req);
42 		if (ret == 0) {
43 			cache->len += req;
44 		} else {
45 			if (rxq->rxrearm_nb + IAVF_RXQ_REARM_THRESH >=
46 			    rxq->nb_rx_desc) {
47 				__m128i dma_addr0;
48 
49 				dma_addr0 = _mm_setzero_si128();
50 				for (i = 0; i < IAVF_VPMD_DESCS_PER_LOOP; i++) {
51 					rxp[i] = &rxq->fake_mbuf;
52 					_mm_storeu_si128((__m128i *)&rxdp[i].read,
53 							 dma_addr0);
54 				}
55 			}
56 			rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
57 					IAVF_RXQ_REARM_THRESH;
58 			return;
59 		}
60 	}
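
	/* At this point the mempool cache holds at least IAVF_RXQ_REARM_THRESH
	 * mbuf pointers: either it already did, or it was refilled above;
	 * otherwise we have already returned.
	 */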
61 
62 	const __m512i iova_offsets =  _mm512_set1_epi64(offsetof
63 							(struct rte_mbuf, buf_iova));
64 	const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
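
	/* Each descriptor gets mbuf->buf_iova + RTE_PKTMBUF_HEADROOM as its
	 * packet buffer address; buf_iova is gathered directly from each mbuf
	 * structure using the offset constant above.
	 */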
65 
66 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
67 	/* to shuffle the addresses to correct slots. Values 4-7 will contain
68 	 * zeros, so use 7 for a zero-value.
69 	 */
70 	const __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0);
71 #else
72 	const __m512i permute_idx = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0);
73 #endif
74 
75 	/* Initialize the mbufs in vector, process 8 mbufs in one loop, taking
76 	 * from mempool cache and populating both shadow and HW rings
77 	 */
78 	for (i = 0; i < IAVF_RXQ_REARM_THRESH / IAVF_DESCS_PER_LOOP_AVX; i++) {
79 		const __m512i mbuf_ptrs = _mm512_loadu_si512
80 			(&cache->objs[cache->len - IAVF_DESCS_PER_LOOP_AVX]);
81 		_mm512_storeu_si512(rxp, mbuf_ptrs);
82 
83 		const __m512i iova_base_addrs = _mm512_i64gather_epi64
84 				(_mm512_add_epi64(mbuf_ptrs, iova_offsets),
85 				 0, /* base */
86 				 1  /* scale */);
87 		const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
88 				headroom);
89 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
90 		const __m512i iovas0 = _mm512_castsi256_si512
91 				(_mm512_extracti64x4_epi64(iova_addrs, 0));
92 		const __m512i iovas1 = _mm512_castsi256_si512
93 				(_mm512_extracti64x4_epi64(iova_addrs, 1));
94 
95 		/* permute leaves desc 2-3 addresses in header address slots 0-1
96 		 * but these are ignored by driver since header split not
97 		 * enabled. Similarly for desc 6 & 7.
98 		 */
99 		const __m512i desc0_1 = _mm512_permutexvar_epi64
100 				(permute_idx,
101 				 iovas0);
102 		const __m512i desc2_3 = _mm512_bsrli_epi128(desc0_1, 8);
103 
104 		const __m512i desc4_5 = _mm512_permutexvar_epi64
105 				(permute_idx,
106 				 iovas1);
107 		const __m512i desc6_7 = _mm512_bsrli_epi128(desc4_5, 8);
108 
109 		_mm512_storeu_si512((void *)rxdp, desc0_1);
110 		_mm512_storeu_si512((void *)(rxdp + 2), desc2_3);
111 		_mm512_storeu_si512((void *)(rxdp + 4), desc4_5);
112 		_mm512_storeu_si512((void *)(rxdp + 6), desc6_7);
113 #else
114 		/* permute leaves desc 4-7 addresses in header address slots 0-3
115 		 * but these are ignored by driver since header split not
116 		 * enabled.
117 		 */
118 		const __m512i desc0_3 = _mm512_permutexvar_epi64(permute_idx,
119 								 iova_addrs);
120 		const __m512i desc4_7 = _mm512_bsrli_epi128(desc0_3, 8);
121 
122 		_mm512_storeu_si512((void *)rxdp, desc0_3);
123 		_mm512_storeu_si512((void *)(rxdp + 4), desc4_7);
124 #endif
125 		rxp += IAVF_DESCS_PER_LOOP_AVX;
126 		rxdp += IAVF_DESCS_PER_LOOP_AVX;
127 		cache->len -= IAVF_DESCS_PER_LOOP_AVX;
128 	}
129 
130 	rxq->rxrearm_start += IAVF_RXQ_REARM_THRESH;
131 	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
132 		rxq->rxrearm_start = 0;
133 
134 	rxq->rxrearm_nb -= IAVF_RXQ_REARM_THRESH;
135 
136 	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
137 			   (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
138 
139 	/* Update the tail pointer on the NIC */
140 	IAVF_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
141 }
142 
143 #define IAVF_RX_LEN_MASK 0x80808080
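/* Blend mask for _mm512_mask_blend_epi16: bit i selects 16-bit element i,
 * so 0x80808080 picks the top 16-bit word of each 128-bit descriptor lane,
 * which holds the packet length after the PKTLEN_SHIFT left shift.
 */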
144 static inline uint16_t
145 _iavf_recv_raw_pkts_vec_avx512(struct iavf_rx_queue *rxq,
146 			       struct rte_mbuf **rx_pkts,
147 			       uint16_t nb_pkts, uint8_t *split_packet)
148 {
149 	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
150 
151 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
152 						    rxq->mbuf_initializer);
153 	struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
154 	volatile union iavf_rx_desc *rxdp = rxq->rx_ring + rxq->rx_tail;
155 
156 	rte_prefetch0(rxdp);
157 
158 	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
159 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX);
160 
161 	/* See if we need to rearm the RX queue - gives the prefetch a bit
162 	 * of time to act
163 	 */
164 	if (rxq->rxrearm_nb > IAVF_RXQ_REARM_THRESH)
165 		iavf_rxq_rearm(rxq);
166 
167 	/* Before we start moving massive data around, check to see if
168 	 * there is actually a packet available
169 	 */
170 	if (!(rxdp->wb.qword1.status_error_len &
171 	      rte_cpu_to_le_32(1 << IAVF_RX_DESC_STATUS_DD_SHIFT)))
172 		return 0;
173 
174 	/* constants used in processing loop */
175 	const __m512i crc_adjust =
176 		_mm512_set_epi32
177 			(/* 1st descriptor */
178 			 0,             /* ignore non-length fields */
179 			 -rxq->crc_len, /* sub crc on data_len */
180 			 -rxq->crc_len, /* sub crc on pkt_len */
181 			 0,             /* ignore pkt_type field */
182 			 /* 2nd descriptor */
183 			 0,             /* ignore non-length fields */
184 			 -rxq->crc_len, /* sub crc on data_len */
185 			 -rxq->crc_len, /* sub crc on pkt_len */
186 			 0,             /* ignore pkt_type field */
187 			 /* 3rd descriptor */
188 			 0,             /* ignore non-length fields */
189 			 -rxq->crc_len, /* sub crc on data_len */
190 			 -rxq->crc_len, /* sub crc on pkt_len */
191 			 0,             /* ignore pkt_type field */
192 			 /* 4th descriptor */
193 			 0,             /* ignore non-length fields */
194 			 -rxq->crc_len, /* sub crc on data_len */
195 			 -rxq->crc_len, /* sub crc on pkt_len */
196 			 0              /* ignore pkt_type field */
197 			);
198 
199 	/* 8 packets DD mask, LSB in each 32-bit value */
200 	const __m256i dd_check = _mm256_set1_epi32(1);
201 
202 	/* 8 packets EOP mask, second-LSB in each 32-bit value */
203 	const __m256i eop_check = _mm256_slli_epi32(dd_check,
204 			IAVF_RX_DESC_STATUS_EOF_SHIFT);
205 
206 	/* mask to shuffle from desc. to mbuf (4 descriptors) */
207 	const __m512i shuf_msk =
208 		_mm512_set_epi32
209 			(/* 1st descriptor */
210 			 0x07060504,    /* octet 4~7, 32bits rss */
211 			 0x03020F0E,    /* octet 2~3, low 16 bits vlan_macip */
212 					/* octet 15~14, 16 bits data_len */
213 			 0xFFFF0F0E,    /* skip high 16 bits pkt_len, zero out */
214 					/* octet 15~14, low 16 bits pkt_len */
215 			 0xFFFFFFFF,    /* pkt_type set as unknown */
216 			 /* 2nd descriptor */
217 			 0x07060504,    /* octet 4~7, 32bits rss */
218 			 0x03020F0E,    /* octet 2~3, low 16 bits vlan_macip */
219 					/* octet 15~14, 16 bits data_len */
220 			 0xFFFF0F0E,    /* skip high 16 bits pkt_len, zero out */
221 					/* octet 15~14, low 16 bits pkt_len */
222 			 0xFFFFFFFF,    /* pkt_type set as unknown */
223 			 /* 3rd descriptor */
224 			 0x07060504,    /* octet 4~7, 32bits rss */
225 			 0x03020F0E,    /* octet 2~3, low 16 bits vlan_macip */
226 					/* octet 15~14, 16 bits data_len */
227 			 0xFFFF0F0E,    /* skip high 16 bits pkt_len, zero out */
228 					/* octet 15~14, low 16 bits pkt_len */
229 			 0xFFFFFFFF,    /* pkt_type set as unknown */
230 			 /* 4th descriptor */
231 			 0x07060504,    /* octet 4~7, 32bits rss */
232 			 0x03020F0E,    /* octet 2~3, low 16 bits vlan_macip */
233 					/* octet 15~14, 16 bits data_len */
234 			 0xFFFF0F0E,    /* skip high 16 bits pkt_len, zero out */
235 					/* octet 15~14, low 16 bits pkt_len */
236 			 0xFFFFFFFF     /* pkt_type set as unknown */
237 			);
238 	/**
239 	 * compile-time check the above crc and shuffle layout is correct.
240 	 * NOTE: the first field (lowest address) is given last in set_epi
241 	 * calls above.
242 	 */
243 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
244 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
245 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
246 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
247 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
248 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
249 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
250 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
251 
252 	/* Status/Error flag masks */
253 	/**
254 	 * mask everything except RSS, flow director and VLAN flags
255 	 * bit2 is for VLAN tag, bit11 for flow director indication
256 	 * bit13:12 for RSS indication. Bits 3-5 of error
257 	 * field (bits 22-24) are for IP/L4 checksum errors
258 	 */
259 	const __m256i flags_mask =
260 		_mm256_set1_epi32((1 << 2) | (1 << 11) |
261 				  (3 << 12) | (7 << 22));
262 	/**
263 	 * data to be shuffled by result of flag mask. If VLAN bit is set,
264 	 * (bit 2), then position 4 in this array will be used in the
265 	 * destination
266 	 */
267 	const __m256i vlan_flags_shuf =
268 		_mm256_set_epi32(0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
269 				 0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0);
270 	/**
271 	 * data to be shuffled by result of flag mask, shifted down 11.
272 	 * If RSS/FDIR bits are set, shuffle moves appropriate flags in
273 	 * place.
274 	 */
275 	const __m256i rss_flags_shuf =
276 		_mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
277 				PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
278 				0, 0, 0, 0, PKT_RX_FDIR, 0,/* end up 128-bits */
279 				0, 0, 0, 0, 0, 0, 0, 0,
280 				PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
281 				0, 0, 0, 0, PKT_RX_FDIR, 0);
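
	/* Note: _mm256_shuffle_epi8 later in the loop uses these constants as
	 * small per-lane lookup tables: the low 4 bits of each byte of the
	 * masked/shifted status bits index into them to produce ol_flags values.
	 */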
282 
283 	/**
284 	 * data to be shuffled by the result of the flags mask shifted by 22
285 	 * bits.  This gives us the l3_l4 flags.
286 	 */
287 	const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
288 			/* shift right 1 bit to make sure it does not exceed 255 */
289 			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
290 			 PKT_RX_IP_CKSUM_BAD) >> 1,
291 			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD |
292 			 PKT_RX_L4_CKSUM_BAD) >> 1,
293 			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
294 			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1,
295 			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
296 			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
297 			PKT_RX_IP_CKSUM_BAD >> 1,
298 			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
299 			/* second 128-bits */
300 			0, 0, 0, 0, 0, 0, 0, 0,
301 			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
302 			 PKT_RX_IP_CKSUM_BAD) >> 1,
303 			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD |
304 			 PKT_RX_L4_CKSUM_BAD) >> 1,
305 			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
306 			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1,
307 			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
308 			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
309 			PKT_RX_IP_CKSUM_BAD >> 1,
310 			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1);
311 
312 	const __m256i cksum_mask =
313 		_mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
314 				  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
315 				  PKT_RX_EIP_CKSUM_BAD);
316 
317 	uint16_t i, received;
318 
319 	for (i = 0, received = 0; i < nb_pkts;
320 	     i += IAVF_DESCS_PER_LOOP_AVX,
321 	     rxdp += IAVF_DESCS_PER_LOOP_AVX) {
322 		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
323 		_mm256_storeu_si256((void *)&rx_pkts[i],
324 				    _mm256_loadu_si256((void *)&sw_ring[i]));
325 #ifdef RTE_ARCH_X86_64
326 		_mm256_storeu_si256
327 			((void *)&rx_pkts[i + 4],
328 			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
329 #endif
330 
331 		__m512i raw_desc0_3, raw_desc4_7;
332 		const __m128i raw_desc7 =
333 			_mm_load_si128((void *)(rxdp + 7));
334 		rte_compiler_barrier();
335 		const __m128i raw_desc6 =
336 			_mm_load_si128((void *)(rxdp + 6));
337 		rte_compiler_barrier();
338 		const __m128i raw_desc5 =
339 			_mm_load_si128((void *)(rxdp + 5));
340 		rte_compiler_barrier();
341 		const __m128i raw_desc4 =
342 			_mm_load_si128((void *)(rxdp + 4));
343 		rte_compiler_barrier();
344 		const __m128i raw_desc3 =
345 			_mm_load_si128((void *)(rxdp + 3));
346 		rte_compiler_barrier();
347 		const __m128i raw_desc2 =
348 			_mm_load_si128((void *)(rxdp + 2));
349 		rte_compiler_barrier();
350 		const __m128i raw_desc1 =
351 			_mm_load_si128((void *)(rxdp + 1));
352 		rte_compiler_barrier();
353 		const __m128i raw_desc0 =
354 			_mm_load_si128((void *)(rxdp + 0));
355 
356 		raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4);
357 		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1);
358 		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc6, 2);
359 		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3);
360 		raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0);
361 		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1);
362 		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2);
363 		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3);
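
		/* The eight descriptors above are loaded individually, from the
		 * highest index down to the lowest, with compiler barriers in
		 * between to keep that order; this avoids pairing a DD bit seen
		 * set on a later descriptor with stale data in an earlier one.
		 */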
364 
365 		if (split_packet) {
366 			int j;
367 
368 			for (j = 0; j < IAVF_DESCS_PER_LOOP_AVX; j++)
369 				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
370 		}
371 
372 		/**
373 		 * convert descriptors 4-7 into mbufs, adjusting length and
374 		 * re-arranging fields. Then write into the mbuf
375 		 */
376 		const __m512i len4_7 = _mm512_slli_epi32(raw_desc4_7,
377 							 PKTLEN_SHIFT);
378 		const __m512i desc4_7 = _mm512_mask_blend_epi16(IAVF_RX_LEN_MASK,
379 								raw_desc4_7,
380 								len4_7);
381 		__m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk);
382 
383 		mb4_7 = _mm512_add_epi16(mb4_7, crc_adjust);
384 		/**
385 		 * to get packet types, shift 64-bit values down 30 bits
386 		 * and so ptype is in lower 8-bits in each
387 		 */
388 		const __m512i ptypes4_7 = _mm512_srli_epi64(desc4_7, 30);
389 		const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1);
390 		const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0);
391 		const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 24);
392 		const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 8);
393 		const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 24);
394 		const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 8);
395 
396 		const __m512i ptype4_7 = _mm512_set_epi32
397 			(0, 0, 0, type_table[ptype7],
398 			 0, 0, 0, type_table[ptype6],
399 			 0, 0, 0, type_table[ptype5],
400 			 0, 0, 0, type_table[ptype4]);
401 		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
402 
403 		/**
404 		 * convert descriptors 0-3 into mbufs, adjusting length and
405 		 * re-arranging fields. Then write into the mbuf
406 		 */
407 		const __m512i len0_3 = _mm512_slli_epi32(raw_desc0_3,
408 							 PKTLEN_SHIFT);
409 		const __m512i desc0_3 = _mm512_mask_blend_epi16(IAVF_RX_LEN_MASK,
410 								raw_desc0_3,
411 								len0_3);
412 		__m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk);
413 
414 		mb0_3 = _mm512_add_epi16(mb0_3, crc_adjust);
415 		/* get the packet types */
416 		const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 30);
417 		const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
418 		const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0);
419 		const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 24);
420 		const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 8);
421 		const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 24);
422 		const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 8);
423 
424 		const __m512i ptype0_3 = _mm512_set_epi32
425 			(0, 0, 0, type_table[ptype3],
426 			 0, 0, 0, type_table[ptype2],
427 			 0, 0, 0, type_table[ptype1],
428 			 0, 0, 0, type_table[ptype0]);
429 		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
430 
431 		/**
432 		 * use permute/extract to get status content
433 		 * After the operations, the packets status flags are in the
434 		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
435 		 */
436 		/* merge the status bits into one register */
437 		const __m512i status_permute_msk = _mm512_set_epi32
438 			(0, 0, 0, 0,
439 			 0, 0, 0, 0,
440 			 22, 30, 6, 14,
441 			 18, 26, 2, 10);
442 		const __m512i raw_status0_7 = _mm512_permutex2var_epi32
443 			(raw_desc4_7, status_permute_msk, raw_desc0_3);
444 		__m256i status0_7 = _mm512_extracti64x4_epi64
445 			(raw_status0_7, 0);
446 
447 		/* now do flag manipulation */
448 
449 		/* get only flag/error bits we want */
450 		const __m256i flag_bits =
451 			_mm256_and_si256(status0_7, flags_mask);
452 		/* set vlan and rss flags */
453 		const __m256i vlan_flags =
454 			_mm256_shuffle_epi8(vlan_flags_shuf, flag_bits);
455 		const __m256i rss_flags =
456 			_mm256_shuffle_epi8(rss_flags_shuf,
457 					    _mm256_srli_epi32(flag_bits, 11));
458 		/**
459 		 * l3_l4 error flags: shuffle, then shift left to undo the >>1
460 		 * adjustment applied in l3_l4_flags_shuf, and finally mask out extra bits
461 		 */
462 		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
463 						_mm256_srli_epi32(flag_bits, 22));
464 		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
465 		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
466 
467 		/* merge flags */
468 		const __m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
469 				_mm256_or_si256(rss_flags, vlan_flags));
470 		/**
471 		 * At this point, we have the 8 sets of flags in the low 16-bits
472 		 * of each 32-bit value in mbuf_flags.
473 		 * We want to extract these, and merge them with the mbuf init
474 		 * data so we can do a single write to the mbuf to set the flags
475 		 * and all the other initialization fields. Extracting the
476 		 * appropriate flags means that we have to do a shift and blend
477 		 * for each mbuf before we do the write. However, we can also
478 		 * add in the previously computed rx_descriptor fields to
479 		 * make a single 256-bit write per mbuf
480 		 */
481 		/* check the structure matches expectations */
482 		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
483 				 offsetof(struct rte_mbuf, rearm_data) + 8);
484 		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
485 				 RTE_ALIGN(offsetof(struct rte_mbuf,
486 						    rearm_data),
487 					   16));
488 		/* build up data and do writes */
489 		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
490 			rearm6, rearm7;
491 		const __m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
492 		const __m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
493 		const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
494 		const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
495 
496 		rearm6 = _mm256_blend_epi32(mbuf_init,
497 					    _mm256_slli_si256(mbuf_flags, 8),
498 					    0x04);
499 		rearm4 = _mm256_blend_epi32(mbuf_init,
500 					    _mm256_slli_si256(mbuf_flags, 4),
501 					    0x04);
502 		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
503 		rearm0 = _mm256_blend_epi32(mbuf_init,
504 					    _mm256_srli_si256(mbuf_flags, 4),
505 					    0x04);
506 		/* permute to add in the rx_descriptor e.g. rss fields */
507 		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
508 		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
509 		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
510 		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
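		/* The blend with mask 0x04 drops each packet's 32-bit flag value
		 * into the ol_flags slot of mbuf_init, and the permute appends
		 * the low 128 bits of the matching mbN register (the shuffled
		 * descriptor fields), so a single 256-bit store initializes the
		 * whole rearm area of the mbuf.
		 */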
511 		/* write to mbuf */
512 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
513 				    rearm6);
514 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
515 				    rearm4);
516 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
517 				    rearm2);
518 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
519 				    rearm0);
520 
521 		/* repeat for the odd mbufs */
522 		const __m256i odd_flags =
523 			_mm256_castsi128_si256
524 				(_mm256_extracti128_si256(mbuf_flags, 1));
525 		rearm7 = _mm256_blend_epi32(mbuf_init,
526 					    _mm256_slli_si256(odd_flags, 8),
527 					    0x04);
528 		rearm5 = _mm256_blend_epi32(mbuf_init,
529 					    _mm256_slli_si256(odd_flags, 4),
530 					    0x04);
531 		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
532 		rearm1 = _mm256_blend_epi32(mbuf_init,
533 					    _mm256_srli_si256(odd_flags, 4),
534 					    0x04);
535 		/* since odd mbufs are already in hi 128-bits use blend */
536 		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
537 		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
538 		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
539 		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
540 		/* again write to mbufs */
541 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
542 				    rearm7);
543 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
544 				    rearm5);
545 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
546 				    rearm3);
547 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
548 				    rearm1);
549 
550 		/* extract and record EOP bit */
551 		if (split_packet) {
552 			const __m128i eop_mask =
553 				_mm_set1_epi16(1 << IAVF_RX_DESC_STATUS_EOF_SHIFT);
554 			const __m256i eop_bits256 = _mm256_and_si256(status0_7,
555 								     eop_check);
556 			/* pack status bits into a single 128-bit register */
557 			const __m128i eop_bits =
558 				_mm_packus_epi32
559 					(_mm256_castsi256_si128(eop_bits256),
560 					 _mm256_extractf128_si256(eop_bits256,
561 								  1));
562 			/**
563 			 * flip bits, and mask out the EOP bit, which is now
564 			 * a split-packet bit, i.e. !EOP rather than EOP.
565 			 */
566 			__m128i split_bits = _mm_andnot_si128(eop_bits,
567 							      eop_mask);
568 			/**
569 			 * eop bits are out of order, so we need to shuffle them
570 			 * back into order again. In doing so, only use low 8
571 			 * bits, which acts like another pack instruction
572 			 * The original order is (hi->lo): 1,3,5,7,0,2,4,6
573 			 * [Since we use epi8, the 16-bit positions are
574 			 * multiplied by 2 in the eop_shuffle value.]
575 			 */
576 			__m128i eop_shuffle =
577 				_mm_set_epi8(/* zero hi 64b */
578 					     0xFF, 0xFF, 0xFF, 0xFF,
579 					     0xFF, 0xFF, 0xFF, 0xFF,
580 					     /* move values to lo 64b */
581 					     8, 0, 10, 2,
582 					     12, 4, 14, 6);
583 			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
584 			*(uint64_t *)split_packet =
585 				_mm_cvtsi128_si64(split_bits);
586 			split_packet += IAVF_DESCS_PER_LOOP_AVX;
587 		}
588 
589 		/* perform dd_check */
590 		status0_7 = _mm256_and_si256(status0_7, dd_check);
591 		status0_7 = _mm256_packs_epi32(status0_7,
592 					       _mm256_setzero_si256());
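
		/* Count the DD bits that are set: after the AND and pack, each
		 * 16-bit element is 1 for a completed descriptor, so the
		 * popcount of each 64-bit half gives the number completed.
		 */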
593 
594 		uint64_t burst = __builtin_popcountll
595 					(_mm_cvtsi128_si64
596 						(_mm256_extracti128_si256
597 							(status0_7, 1)));
598 		burst += __builtin_popcountll
599 				(_mm_cvtsi128_si64
600 					(_mm256_castsi256_si128(status0_7)));
601 		received += burst;
602 		if (burst != IAVF_DESCS_PER_LOOP_AVX)
603 			break;
604 	}
605 
606 	/* update tail pointers */
607 	rxq->rx_tail += received;
608 	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
609 	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep aligned */
610 		rxq->rx_tail--;
611 		received--;
612 	}
613 	rxq->rxrearm_nb += received;
614 	return received;
615 }
616 
617 static inline __m256i
618 flex_rxd_to_fdir_flags_vec_avx512(const __m256i fdir_id0_7)
619 {
620 #define FDID_MIS_MAGIC 0xFFFFFFFF
621 	RTE_BUILD_BUG_ON(PKT_RX_FDIR != (1 << 2));
622 	RTE_BUILD_BUG_ON(PKT_RX_FDIR_ID != (1 << 13));
623 	const __m256i pkt_fdir_bit = _mm256_set1_epi32(PKT_RX_FDIR |
624 						       PKT_RX_FDIR_ID);
625 	/* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */
626 	const __m256i fdir_mis_mask = _mm256_set1_epi32(FDID_MIS_MAGIC);
627 	__m256i fdir_mask = _mm256_cmpeq_epi32(fdir_id0_7,
628 					       fdir_mis_mask);
629 	/* XOR with all-ones inverts the bits of fdir_mask */
630 	fdir_mask = _mm256_xor_si256(fdir_mask, fdir_mis_mask);
631 	const __m256i fdir_flags = _mm256_and_si256(fdir_mask, pkt_fdir_bit);
632 
633 	return fdir_flags;
634 }
635 
636 static inline uint16_t
637 _iavf_recv_raw_pkts_vec_avx512_flex_rxd(struct iavf_rx_queue *rxq,
638 					struct rte_mbuf **rx_pkts,
639 					uint16_t nb_pkts, uint8_t *split_packet)
640 {
641 	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
642 
643 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
644 						    rxq->mbuf_initializer);
645 	struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
646 	volatile union iavf_rx_flex_desc *rxdp =
647 		(union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;
648 
649 	rte_prefetch0(rxdp);
650 
651 	/* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
652 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX);
653 
654 	/* See if we need to rearm the RX queue - gives the prefetch a bit
655 	 * of time to act
656 	 */
657 	if (rxq->rxrearm_nb > IAVF_RXQ_REARM_THRESH)
658 		iavf_rxq_rearm(rxq);
659 
660 	/* Before we start moving massive data around, check to see if
661 	 * there is actually a packet available
662 	 */
663 	if (!(rxdp->wb.status_error0 &
664 	      rte_cpu_to_le_32(1 << IAVF_RX_FLEX_DESC_STATUS0_DD_S)))
665 		return 0;
666 
667 	/* constants used in processing loop */
668 	const __m512i crc_adjust =
669 		_mm512_set_epi32
670 			(/* 1st descriptor */
671 			 0,             /* ignore non-length fields */
672 			 -rxq->crc_len, /* sub crc on data_len */
673 			 -rxq->crc_len, /* sub crc on pkt_len */
674 			 0,             /* ignore pkt_type field */
675 			 /* 2nd descriptor */
676 			 0,             /* ignore non-length fields */
677 			 -rxq->crc_len, /* sub crc on data_len */
678 			 -rxq->crc_len, /* sub crc on pkt_len */
679 			 0,             /* ignore pkt_type field */
680 			 /* 3rd descriptor */
681 			 0,             /* ignore non-length fields */
682 			 -rxq->crc_len, /* sub crc on data_len */
683 			 -rxq->crc_len, /* sub crc on pkt_len */
684 			 0,             /* ignore pkt_type field */
685 			 /* 4th descriptor */
686 			 0,             /* ignore non-length fields */
687 			 -rxq->crc_len, /* sub crc on data_len */
688 			 -rxq->crc_len, /* sub crc on pkt_len */
689 			 0              /* ignore pkt_type field */
690 			);
691 
692 	/* 8 packets DD mask, LSB in each 32-bit value */
693 	const __m256i dd_check = _mm256_set1_epi32(1);
694 
695 	/* 8 packets EOP mask, second-LSB in each 32-bit value */
696 	const __m256i eop_check = _mm256_slli_epi32(dd_check,
697 			IAVF_RX_FLEX_DESC_STATUS0_EOF_S);
698 
699 	/* mask to shuffle from desc. to mbuf (4 descriptors) */
700 	const __m512i shuf_msk =
701 		_mm512_set_epi32
702 			(/* 1st descriptor */
703 			 0xFFFFFFFF,    /* rss hash parsed separately */
704 			 0x0B0A0504,    /* octet 10~11, 16 bits vlan_macip */
705 					/* octet 4~5, 16 bits data_len */
706 			 0xFFFF0504,    /* skip hi 16 bits pkt_len, zero out */
707 					/* octet 4~5, 16 bits pkt_len */
708 			 0xFFFFFFFF,    /* pkt_type set as unknown */
709 			 /* 2nd descriptor */
710 			 0xFFFFFFFF,    /* rss hash parsed separately */
711 			 0x0B0A0504,    /* octet 10~11, 16 bits vlan_macip */
712 					/* octet 4~5, 16 bits data_len */
713 			 0xFFFF0504,    /* skip hi 16 bits pkt_len, zero out */
714 					/* octet 4~5, 16 bits pkt_len */
715 			 0xFFFFFFFF,    /* pkt_type set as unknown */
716 			 /* 3rd descriptor */
717 			 0xFFFFFFFF,    /* rss hash parsed separately */
718 			 0x0B0A0504,    /* octet 10~11, 16 bits vlan_macip */
719 					/* octet 4~5, 16 bits data_len */
720 			 0xFFFF0504,    /* skip hi 16 bits pkt_len, zero out */
721 					/* octet 4~5, 16 bits pkt_len */
722 			 0xFFFFFFFF,    /* pkt_type set as unknown */
723 			 /* 4th descriptor */
724 			 0xFFFFFFFF,    /* rss hash parsed separately */
725 			 0x0B0A0504,    /* octet 10~11, 16 bits vlan_macip */
726 					/* octet 4~5, 16 bits data_len */
727 			 0xFFFF0504,    /* skip hi 16 bits pkt_len, zero out */
728 					/* octet 4~5, 16 bits pkt_len */
729 			 0xFFFFFFFF     /* pkt_type set as unknown */
730 			);
731 	/**
732 	 * compile-time check the above crc and shuffle layout is correct.
733 	 * NOTE: the first field (lowest address) is given last in set_epi
734 	 * calls above.
735 	 */
736 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
737 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
738 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
739 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
740 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
741 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
742 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
743 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
744 
745 	/* Status/Error flag masks */
746 	/**
747 	 * mask everything except Checksum Reports, RSS indication
748 	 * and VLAN indication.
749 	 * bit6:4 for IP/L4 checksum errors.
750 	 * bit12 is for RSS indication.
751 	 * bit13 is for VLAN indication.
752 	 */
753 	const __m256i flags_mask =
754 		_mm256_set1_epi32((7 << 4) | (1 << 12) | (1 << 13));
755 	/**
756 	 * data to be shuffled by the result of the flags mask shifted by 4
757 	 * bits.  This gives us the l3_l4 flags.
758 	 */
759 	const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
760 			/* shift right 1 bit to make sure it does not exceed 255 */
761 			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
762 			 PKT_RX_IP_CKSUM_BAD) >> 1,
763 			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
764 			 PKT_RX_IP_CKSUM_GOOD) >> 1,
765 			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
766 			 PKT_RX_IP_CKSUM_BAD) >> 1,
767 			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
768 			 PKT_RX_IP_CKSUM_GOOD) >> 1,
769 			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
770 			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
771 			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
772 			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1,
773 			/* second 128-bits */
774 			0, 0, 0, 0, 0, 0, 0, 0,
775 			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
776 			 PKT_RX_IP_CKSUM_BAD) >> 1,
777 			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
778 			 PKT_RX_IP_CKSUM_GOOD) >> 1,
779 			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
780 			 PKT_RX_IP_CKSUM_BAD) >> 1,
781 			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
782 			 PKT_RX_IP_CKSUM_GOOD) >> 1,
783 			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
784 			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
785 			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
786 			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1);
787 	const __m256i cksum_mask =
788 		_mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
789 				  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
790 				  PKT_RX_EIP_CKSUM_BAD);
791 	/**
792 	 * data to be shuffled by result of flag mask, shifted down 12.
793 	 * If RSS(bit12)/VLAN(bit13) are set,
794 	 * shuffle moves appropriate flags in place.
795 	 */
796 	const __m256i rss_vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
797 			0, 0, 0, 0,
798 			0, 0, 0, 0,
799 			PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
800 			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
801 			PKT_RX_RSS_HASH, 0,
802 			/* end up 128-bits */
803 			0, 0, 0, 0,
804 			0, 0, 0, 0,
805 			0, 0, 0, 0,
806 			PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
807 			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
808 			PKT_RX_RSS_HASH, 0);
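
	/* As in the legacy descriptor path, the shuffles in the loop below use
	 * _mm256_shuffle_epi8 as a small lookup table indexed by the masked and
	 * shifted status bits to translate them into ol_flags values.
	 */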
809 
810 	uint16_t i, received;
811 
812 	for (i = 0, received = 0; i < nb_pkts;
813 	     i += IAVF_DESCS_PER_LOOP_AVX,
814 	     rxdp += IAVF_DESCS_PER_LOOP_AVX) {
815 		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
816 		_mm256_storeu_si256((void *)&rx_pkts[i],
817 				    _mm256_loadu_si256((void *)&sw_ring[i]));
818 #ifdef RTE_ARCH_X86_64
819 		_mm256_storeu_si256
820 			((void *)&rx_pkts[i + 4],
821 			 _mm256_loadu_si256((void *)&sw_ring[i + 4]));
822 #endif
823 
824 		__m512i raw_desc0_3, raw_desc4_7;
825 
826 		const __m128i raw_desc7 =
827 			_mm_load_si128((void *)(rxdp + 7));
828 		rte_compiler_barrier();
829 		const __m128i raw_desc6 =
830 			_mm_load_si128((void *)(rxdp + 6));
831 		rte_compiler_barrier();
832 		const __m128i raw_desc5 =
833 			_mm_load_si128((void *)(rxdp + 5));
834 		rte_compiler_barrier();
835 		const __m128i raw_desc4 =
836 			_mm_load_si128((void *)(rxdp + 4));
837 		rte_compiler_barrier();
838 		const __m128i raw_desc3 =
839 			_mm_load_si128((void *)(rxdp + 3));
840 		rte_compiler_barrier();
841 		const __m128i raw_desc2 =
842 			_mm_load_si128((void *)(rxdp + 2));
843 		rte_compiler_barrier();
844 		const __m128i raw_desc1 =
845 			_mm_load_si128((void *)(rxdp + 1));
846 		rte_compiler_barrier();
847 		const __m128i raw_desc0 =
848 			_mm_load_si128((void *)(rxdp + 0));
849 
850 		raw_desc4_7 = _mm512_broadcast_i32x4(raw_desc4);
851 		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc5, 1);
852 		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc6, 2);
853 		raw_desc4_7 = _mm512_inserti32x4(raw_desc4_7, raw_desc7, 3);
854 		raw_desc0_3 = _mm512_broadcast_i32x4(raw_desc0);
855 		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc1, 1);
856 		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc2, 2);
857 		raw_desc0_3 = _mm512_inserti32x4(raw_desc0_3, raw_desc3, 3);
858 
859 		if (split_packet) {
860 			int j;
861 
862 			for (j = 0; j < IAVF_DESCS_PER_LOOP_AVX; j++)
863 				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
864 		}
865 
866 		/**
867 		 * convert descriptors 4-7 into mbufs, re-arrange fields.
868 		 * Then write into the mbuf.
869 		 */
870 		__m512i mb4_7 = _mm512_shuffle_epi8(raw_desc4_7, shuf_msk);
871 
872 		mb4_7 = _mm512_add_epi16(mb4_7, crc_adjust);
873 		/**
874 		 * to get packet types, ptype is located in bit16-25
875 		 * of each 128bits
876 		 */
877 		const __m512i ptype_mask =
878 			_mm512_set1_epi16(IAVF_RX_FLEX_DESC_PTYPE_M);
879 		const __m512i ptypes4_7 =
880 			_mm512_and_si512(raw_desc4_7, ptype_mask);
881 		const __m256i ptypes6_7 = _mm512_extracti64x4_epi64(ptypes4_7, 1);
882 		const __m256i ptypes4_5 = _mm512_extracti64x4_epi64(ptypes4_7, 0);
883 		const uint16_t ptype7 = _mm256_extract_epi16(ptypes6_7, 9);
884 		const uint16_t ptype6 = _mm256_extract_epi16(ptypes6_7, 1);
885 		const uint16_t ptype5 = _mm256_extract_epi16(ptypes4_5, 9);
886 		const uint16_t ptype4 = _mm256_extract_epi16(ptypes4_5, 1);
887 
888 		const __m512i ptype4_7 = _mm512_set_epi32
889 			(0, 0, 0, type_table[ptype7],
890 			 0, 0, 0, type_table[ptype6],
891 			 0, 0, 0, type_table[ptype5],
892 			 0, 0, 0, type_table[ptype4]);
893 		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
894 
895 		/**
896 		 * convert descriptors 0-3 into mbufs, re-arrange fields.
897 		 * Then write into the mbuf.
898 		 */
899 		__m512i mb0_3 = _mm512_shuffle_epi8(raw_desc0_3, shuf_msk);
900 
901 		mb0_3 = _mm512_add_epi16(mb0_3, crc_adjust);
902 		/**
903 		 * to get packet types, ptype is located in bit16-25
904 		 * of each 128bits
905 		 */
906 		const __m512i ptypes0_3 =
907 			_mm512_and_si512(raw_desc0_3, ptype_mask);
908 		const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
909 		const __m256i ptypes0_1 = _mm512_extracti64x4_epi64(ptypes0_3, 0);
910 		const uint16_t ptype3 = _mm256_extract_epi16(ptypes2_3, 9);
911 		const uint16_t ptype2 = _mm256_extract_epi16(ptypes2_3, 1);
912 		const uint16_t ptype1 = _mm256_extract_epi16(ptypes0_1, 9);
913 		const uint16_t ptype0 = _mm256_extract_epi16(ptypes0_1, 1);
914 
915 		const __m512i ptype0_3 = _mm512_set_epi32
916 			(0, 0, 0, type_table[ptype3],
917 			 0, 0, 0, type_table[ptype2],
918 			 0, 0, 0, type_table[ptype1],
919 			 0, 0, 0, type_table[ptype0]);
920 		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
921 
922 		/**
923 		 * use permute/extract to get status content
924 		 * After the operations, the packets status flags are in the
925 		 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
926 		 */
927 		/* merge the status bits into one register */
928 		const __m512i status_permute_msk = _mm512_set_epi32
929 			(0, 0, 0, 0,
930 			 0, 0, 0, 0,
931 			 22, 30, 6, 14,
932 			 18, 26, 2, 10);
933 		const __m512i raw_status0_7 = _mm512_permutex2var_epi32
934 			(raw_desc4_7, status_permute_msk, raw_desc0_3);
935 		__m256i status0_7 = _mm512_extracti64x4_epi64
936 			(raw_status0_7, 0);
937 
938 		/* now do flag manipulation */
939 
940 		/* get only flag/error bits we want */
941 		const __m256i flag_bits =
942 			_mm256_and_si256(status0_7, flags_mask);
943 		/**
944 		 * l3_l4 error flags: shuffle, then shift left to undo the >>1
945 		 * adjustment applied in l3_l4_flags_shuf, and finally mask out extra bits
946 		 */
947 		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
948 				_mm256_srli_epi32(flag_bits, 4));
949 		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
950 		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
951 		/* set rss and vlan flags */
952 		const __m256i rss_vlan_flag_bits =
953 			_mm256_srli_epi32(flag_bits, 12);
954 		const __m256i rss_vlan_flags =
955 			_mm256_shuffle_epi8(rss_vlan_flags_shuf,
956 					    rss_vlan_flag_bits);
957 
958 		/* merge flags */
959 		__m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
960 						     rss_vlan_flags);
961 
962 		if (rxq->fdir_enabled) {
963 			const __m512i fdir_permute_mask = _mm512_set_epi32
964 				(0, 0, 0, 0,
965 				 0, 0, 0, 0,
966 				 7, 15, 23, 31,
967 				 3, 11, 19, 27);
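			/* Gather the 32-bit FDIR/flow ID field from each of the
			 * eight descriptors into one register; like the status
			 * bits, the resulting element order is (hi->lo):
			 * [1, 3, 5, 7, 0, 2, 4, 6], which is why the scalar
			 * extracts below use that index pattern.
			 */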
968 			__m512i fdir_tmp = _mm512_permutex2var_epi32
969 				(raw_desc0_3, fdir_permute_mask, raw_desc4_7);
970 			const __m256i fdir_id0_7 = _mm512_extracti64x4_epi64
971 				(fdir_tmp, 0);
972 			const __m256i fdir_flags =
973 				flex_rxd_to_fdir_flags_vec_avx512(fdir_id0_7);
974 
975 			/* merge with fdir_flags */
976 			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_flags);
977 
978 			/* write to mbuf: have to use scalar store here */
979 			rx_pkts[i + 0]->hash.fdir.hi =
980 				_mm256_extract_epi32(fdir_id0_7, 3);
981 
982 			rx_pkts[i + 1]->hash.fdir.hi =
983 				_mm256_extract_epi32(fdir_id0_7, 7);
984 
985 			rx_pkts[i + 2]->hash.fdir.hi =
986 				_mm256_extract_epi32(fdir_id0_7, 2);
987 
988 			rx_pkts[i + 3]->hash.fdir.hi =
989 				_mm256_extract_epi32(fdir_id0_7, 6);
990 
991 			rx_pkts[i + 4]->hash.fdir.hi =
992 				_mm256_extract_epi32(fdir_id0_7, 1);
993 
994 			rx_pkts[i + 5]->hash.fdir.hi =
995 				_mm256_extract_epi32(fdir_id0_7, 5);
996 
997 			rx_pkts[i + 6]->hash.fdir.hi =
998 				_mm256_extract_epi32(fdir_id0_7, 0);
999 
1000 			rx_pkts[i + 7]->hash.fdir.hi =
1001 				_mm256_extract_epi32(fdir_id0_7, 4);
1002 		} /* if() on fdir_enabled */
1003 
1004 		__m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
1005 		__m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
1006 		__m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
1007 		__m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
1008 
1009 #ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
1010 		/**
1011 		 * need to load the 2nd 16B of each desc for RSS hash parsing;
1012 		 * taking this branch costs some performance.
1013 		 */
1014 		if (rxq->vsi->adapter->eth_dev->data->dev_conf.rxmode.offloads &
1015 		    DEV_RX_OFFLOAD_RSS_HASH) {
1016 			/* load bottom half of every 32B desc */
1017 			const __m128i raw_desc_bh7 =
1018 				_mm_load_si128
1019 					((void *)(&rxdp[7].wb.status_error1));
1020 			rte_compiler_barrier();
1021 			const __m128i raw_desc_bh6 =
1022 				_mm_load_si128
1023 					((void *)(&rxdp[6].wb.status_error1));
1024 			rte_compiler_barrier();
1025 			const __m128i raw_desc_bh5 =
1026 				_mm_load_si128
1027 					((void *)(&rxdp[5].wb.status_error1));
1028 			rte_compiler_barrier();
1029 			const __m128i raw_desc_bh4 =
1030 				_mm_load_si128
1031 					((void *)(&rxdp[4].wb.status_error1));
1032 			rte_compiler_barrier();
1033 			const __m128i raw_desc_bh3 =
1034 				_mm_load_si128
1035 					((void *)(&rxdp[3].wb.status_error1));
1036 			rte_compiler_barrier();
1037 			const __m128i raw_desc_bh2 =
1038 				_mm_load_si128
1039 					((void *)(&rxdp[2].wb.status_error1));
1040 			rte_compiler_barrier();
1041 			const __m128i raw_desc_bh1 =
1042 				_mm_load_si128
1043 					((void *)(&rxdp[1].wb.status_error1));
1044 			rte_compiler_barrier();
1045 			const __m128i raw_desc_bh0 =
1046 				_mm_load_si128
1047 					((void *)(&rxdp[0].wb.status_error1));
1048 
1049 			__m256i raw_desc_bh6_7 =
1050 				_mm256_inserti128_si256
1051 					(_mm256_castsi128_si256(raw_desc_bh6),
1052 					 raw_desc_bh7, 1);
1053 			__m256i raw_desc_bh4_5 =
1054 				_mm256_inserti128_si256
1055 					(_mm256_castsi128_si256(raw_desc_bh4),
1056 					 raw_desc_bh5, 1);
1057 			__m256i raw_desc_bh2_3 =
1058 				_mm256_inserti128_si256
1059 					(_mm256_castsi128_si256(raw_desc_bh2),
1060 					 raw_desc_bh3, 1);
1061 			__m256i raw_desc_bh0_1 =
1062 				_mm256_inserti128_si256
1063 					(_mm256_castsi128_si256(raw_desc_bh0),
1064 					 raw_desc_bh1, 1);
1065 
1066 			/**
1067 			 * to shift the 32b RSS hash value to the
1068 			 * highest 32b of each 128b before mask
1069 			 */
1070 			__m256i rss_hash6_7 =
1071 				_mm256_slli_epi64(raw_desc_bh6_7, 32);
1072 			__m256i rss_hash4_5 =
1073 				_mm256_slli_epi64(raw_desc_bh4_5, 32);
1074 			__m256i rss_hash2_3 =
1075 				_mm256_slli_epi64(raw_desc_bh2_3, 32);
1076 			__m256i rss_hash0_1 =
1077 				_mm256_slli_epi64(raw_desc_bh0_1, 32);
1078 
1079 			__m256i rss_hash_msk =
1080 				_mm256_set_epi32(0xFFFFFFFF, 0, 0, 0,
1081 						 0xFFFFFFFF, 0, 0, 0);
1082 
1083 			rss_hash6_7 = _mm256_and_si256
1084 					(rss_hash6_7, rss_hash_msk);
1085 			rss_hash4_5 = _mm256_and_si256
1086 					(rss_hash4_5, rss_hash_msk);
1087 			rss_hash2_3 = _mm256_and_si256
1088 					(rss_hash2_3, rss_hash_msk);
1089 			rss_hash0_1 = _mm256_and_si256
1090 					(rss_hash0_1, rss_hash_msk);
1091 
1092 			mb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7);
1093 			mb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5);
1094 			mb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3);
1095 			mb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1);
1096 		} /* if() on RSS hash parsing */
1097 #endif
1098 
1099 		/**
1100 		 * At this point, we have the 8 sets of flags in the low 16-bits
1101 		 * of each 32-bit value in mbuf_flags.
1102 		 * We want to extract these, and merge them with the mbuf init
1103 		 * data so we can do a single write to the mbuf to set the flags
1104 		 * and all the other initialization fields. Extracting the
1105 		 * appropriate flags means that we have to do a shift and blend
1106 		 * for each mbuf before we do the write. However, we can also
1107 		 * add in the previously computed rx_descriptor fields to
1108 		 * make a single 256-bit write per mbuf
1109 		 */
1110 		/* check the structure matches expectations */
1111 		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
1112 				 offsetof(struct rte_mbuf, rearm_data) + 8);
1113 		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
1114 				 RTE_ALIGN(offsetof(struct rte_mbuf,
1115 						    rearm_data),
1116 						    16));
1117 		/* build up data and do writes */
1118 		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
1119 			rearm6, rearm7;
1120 		rearm6 = _mm256_blend_epi32(mbuf_init,
1121 					    _mm256_slli_si256(mbuf_flags, 8),
1122 					    0x04);
1123 		rearm4 = _mm256_blend_epi32(mbuf_init,
1124 					    _mm256_slli_si256(mbuf_flags, 4),
1125 					    0x04);
1126 		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
1127 		rearm0 = _mm256_blend_epi32(mbuf_init,
1128 					    _mm256_srli_si256(mbuf_flags, 4),
1129 					    0x04);
1130 		/* permute to add in the rx_descriptor e.g. rss fields */
1131 		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
1132 		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
1133 		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
1134 		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
1135 		/* write to mbuf */
1136 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
1137 				    rearm6);
1138 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
1139 				    rearm4);
1140 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
1141 				    rearm2);
1142 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
1143 				    rearm0);
1144 
1145 		/* repeat for the odd mbufs */
1146 		const __m256i odd_flags =
1147 			_mm256_castsi128_si256
1148 				(_mm256_extracti128_si256(mbuf_flags, 1));
1149 		rearm7 = _mm256_blend_epi32(mbuf_init,
1150 					    _mm256_slli_si256(odd_flags, 8),
1151 					    0x04);
1152 		rearm5 = _mm256_blend_epi32(mbuf_init,
1153 					    _mm256_slli_si256(odd_flags, 4),
1154 					    0x04);
1155 		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
1156 		rearm1 = _mm256_blend_epi32(mbuf_init,
1157 					    _mm256_srli_si256(odd_flags, 4),
1158 					    0x04);
1159 		/* since odd mbufs are already in hi 128-bits use blend */
1160 		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
1161 		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
1162 		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
1163 		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
1164 		/* again write to mbufs */
1165 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
1166 				    rearm7);
1167 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
1168 				    rearm5);
1169 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
1170 				    rearm3);
1171 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
1172 				    rearm1);
1173 
1174 		/* extract and record EOP bit */
1175 		if (split_packet) {
1176 			const __m128i eop_mask =
1177 				_mm_set1_epi16(1 <<
1178 					       IAVF_RX_FLEX_DESC_STATUS0_EOF_S);
1179 			const __m256i eop_bits256 = _mm256_and_si256(status0_7,
1180 								     eop_check);
1181 			/* pack status bits into a single 128-bit register */
1182 			const __m128i eop_bits =
1183 				_mm_packus_epi32
1184 					(_mm256_castsi256_si128(eop_bits256),
1185 					 _mm256_extractf128_si256(eop_bits256,
1186 								  1));
1187 			/**
1188 			 * flip bits, and mask out the EOP bit, which is now
1189 			 * a split-packet bit, i.e. !EOP rather than EOP.
1190 			 */
1191 			__m128i split_bits = _mm_andnot_si128(eop_bits,
1192 							      eop_mask);
1193 			/**
1194 			 * eop bits are out of order, so we need to shuffle them
1195 			 * back into order again. In doing so, only use low 8
1196 			 * bits, which acts like another pack instruction
1197 			 * The original order is (hi->lo): 1,3,5,7,0,2,4,6
1198 			 * [Since we use epi8, the 16-bit positions are
1199 			 * multiplied by 2 in the eop_shuffle value.]
1200 			 */
1201 			__m128i eop_shuffle =
1202 				_mm_set_epi8(/* zero hi 64b */
1203 					     0xFF, 0xFF, 0xFF, 0xFF,
1204 					     0xFF, 0xFF, 0xFF, 0xFF,
1205 					     /* move values to lo 64b */
1206 					     8, 0, 10, 2,
1207 					     12, 4, 14, 6);
1208 			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
1209 			*(uint64_t *)split_packet =
1210 				_mm_cvtsi128_si64(split_bits);
1211 			split_packet += IAVF_DESCS_PER_LOOP_AVX;
1212 		}
1213 
1214 		/* perform dd_check */
1215 		status0_7 = _mm256_and_si256(status0_7, dd_check);
1216 		status0_7 = _mm256_packs_epi32(status0_7,
1217 					       _mm256_setzero_si256());
1218 
1219 		uint64_t burst = __builtin_popcountll
1220 					(_mm_cvtsi128_si64
1221 						(_mm256_extracti128_si256
1222 							(status0_7, 1)));
1223 		burst += __builtin_popcountll
1224 				(_mm_cvtsi128_si64
1225 					(_mm256_castsi256_si128(status0_7)));
1226 		received += burst;
1227 		if (burst != IAVF_DESCS_PER_LOOP_AVX)
1228 			break;
1229 	}
1230 
1231 	/* update tail pointers */
1232 	rxq->rx_tail += received;
1233 	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
1234 	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep aligned */
1235 		rxq->rx_tail--;
1236 		received--;
1237 	}
1238 	rxq->rxrearm_nb += received;
1239 	return received;
1240 }
1241 
1242 /**
1243  * Notice:
1244  * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
1245  */
1246 uint16_t
1247 iavf_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
1248 			  uint16_t nb_pkts)
1249 {
1250 	return _iavf_recv_raw_pkts_vec_avx512(rx_queue, rx_pkts, nb_pkts, NULL);
1251 }
1252 
1253 /**
1254  * Notice:
1255  * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
1256  */
1257 uint16_t
1258 iavf_recv_pkts_vec_avx512_flex_rxd(void *rx_queue, struct rte_mbuf **rx_pkts,
1259 				   uint16_t nb_pkts)
1260 {
1261 	return _iavf_recv_raw_pkts_vec_avx512_flex_rxd(rx_queue, rx_pkts,
1262 						       nb_pkts, NULL);
1263 }
1264 
1265 /**
1266  * vPMD receive routine that reassembles a single burst of 32 scattered packets
1267  * Notice:
1268  * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
1269  */
1270 static uint16_t
1271 iavf_recv_scattered_burst_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
1272 				     uint16_t nb_pkts)
1273 {
1274 	struct iavf_rx_queue *rxq = rx_queue;
1275 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
1276 
1277 	/* get some new buffers */
1278 	uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx512(rxq, rx_pkts, nb_pkts,
1279 							  split_flags);
1280 	if (nb_bufs == 0)
1281 		return 0;
1282 
1283 	/* happy day case, full burst + no packets to be joined */
1284 	const uint64_t *split_fl64 = (uint64_t *)split_flags;
1285 
1286 	if (!rxq->pkt_first_seg &&
1287 	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
1288 	    split_fl64[2] == 0 && split_fl64[3] == 0)
1289 		return nb_bufs;
1290 
1291 	/* reassemble any packets that need reassembly */
1292 	unsigned int i = 0;
1293 
1294 	if (!rxq->pkt_first_seg) {
1295 		/* find the first split flag, and only reassemble from there */
1296 		while (i < nb_bufs && !split_flags[i])
1297 			i++;
1298 		if (i == nb_bufs)
1299 			return nb_bufs;
1300 		rxq->pkt_first_seg = rx_pkts[i];
1301 	}
1302 	return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
1303 				      &split_flags[i]);
1304 }
1305 
1306 /**
1307  * vPMD receive routine that reassembles scattered packets.
1308  * Main receive routine that can handle arbitrary burst sizes
1309  * Notice:
1310  * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
1311  */
1312 uint16_t
1313 iavf_recv_scattered_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
1314 				    uint16_t nb_pkts)
1315 {
1316 	uint16_t retval = 0;
1317 
1318 	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
1319 		uint16_t burst = iavf_recv_scattered_burst_vec_avx512(rx_queue,
1320 				rx_pkts + retval, IAVF_VPMD_RX_MAX_BURST);
1321 		retval += burst;
1322 		nb_pkts -= burst;
1323 		if (burst < IAVF_VPMD_RX_MAX_BURST)
1324 			return retval;
1325 	}
1326 	return retval + iavf_recv_scattered_burst_vec_avx512(rx_queue,
1327 				rx_pkts + retval, nb_pkts);
1328 }
1329 
1330 /**
1331  * vPMD receive routine that reassembles a single burst of
1332  * 32 scattered packets for flex RxD
1333  * Notice:
1334  * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
1335  */
1336 static uint16_t
1337 iavf_recv_scattered_burst_vec_avx512_flex_rxd(void *rx_queue,
1338 					      struct rte_mbuf **rx_pkts,
1339 					      uint16_t nb_pkts)
1340 {
1341 	struct iavf_rx_queue *rxq = rx_queue;
1342 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
1343 
1344 	/* get some new buffers */
1345 	uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx512_flex_rxd(rxq,
1346 					rx_pkts, nb_pkts, split_flags);
1347 	if (nb_bufs == 0)
1348 		return 0;
1349 
1350 	/* happy day case, full burst + no packets to be joined */
1351 	const uint64_t *split_fl64 = (uint64_t *)split_flags;
1352 
1353 	if (!rxq->pkt_first_seg &&
1354 	    split_fl64[0] == 0 && split_fl64[1] == 0 &&
1355 	    split_fl64[2] == 0 && split_fl64[3] == 0)
1356 		return nb_bufs;
1357 
1358 	/* reassemble any packets that need reassembly */
1359 	unsigned int i = 0;
1360 
1361 	if (!rxq->pkt_first_seg) {
1362 		/* find the first split flag, and only reassemble from there */
1363 		while (i < nb_bufs && !split_flags[i])
1364 			i++;
1365 		if (i == nb_bufs)
1366 			return nb_bufs;
1367 		rxq->pkt_first_seg = rx_pkts[i];
1368 	}
1369 	return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
1370 				      &split_flags[i]);
1371 }
1372 
1373 /**
1374  * vPMD receive routine that reassembles scattered packets for flex RxD.
1375  * Main receive routine that can handle arbitrary burst sizes
1376  * Notice:
1377  * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
1378  */
1379 uint16_t
1380 iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
1381 					     struct rte_mbuf **rx_pkts,
1382 					     uint16_t nb_pkts)
1383 {
1384 	uint16_t retval = 0;
1385 
1386 	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
1387 		uint16_t burst =
1388 			iavf_recv_scattered_burst_vec_avx512_flex_rxd
1389 				(rx_queue, rx_pkts + retval,
1390 				 IAVF_VPMD_RX_MAX_BURST);
1391 		retval += burst;
1392 		nb_pkts -= burst;
1393 		if (burst < IAVF_VPMD_RX_MAX_BURST)
1394 			return retval;
1395 	}
1396 	return retval + iavf_recv_scattered_burst_vec_avx512_flex_rxd(rx_queue,
1397 				rx_pkts + retval, nb_pkts);
1398 }
1399 
1400 static __rte_always_inline int
1401 iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
1402 {
1403 	struct iavf_tx_vec_entry *txep;
1404 	uint32_t n;
1405 	uint32_t i;
1406 	int nb_free = 0;
1407 	struct rte_mbuf *m, *free[IAVF_VPMD_TX_MAX_FREE_BUF];
1408 
1409 	/* check DD bits on threshold descriptor */
1410 	if ((txq->tx_ring[txq->next_dd].cmd_type_offset_bsz &
1411 	     rte_cpu_to_le_64(IAVF_TXD_QW1_DTYPE_MASK)) !=
1412 	    rte_cpu_to_le_64(IAVF_TX_DESC_DTYPE_DESC_DONE))
1413 		return 0;
1414 
1415 	n = txq->rs_thresh;
1416 
1417 	 /* first buffer to free from S/W ring is at index
1418 	  * tx_next_dd - (tx_rs_thresh-1)
1419 	  */
1420 	txep = (void *)txq->sw_ring;
1421 	txep += txq->next_dd - (n - 1);
1422 
1423 	if (txq->offloads & DEV_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) {
1424 		struct rte_mempool *mp = txep[0].mbuf->pool;
1425 		struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
1426 								rte_lcore_id());
1427 		void **cache_objs;
1428 
1429 		if (!cache || cache->len == 0)
1430 			goto normal;
1431 
1432 		cache_objs = &cache->objs[cache->len];
1433 
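		/* If the burst cannot fit in the cache at all, bypass it and
		 * enqueue the mbufs straight back to the mempool ring.
		 */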
1434 		if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
1435 			rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
1436 			goto done;
1437 		}
1438 
1439 		/* The cache follows this algorithm:
1440 		 *   1. Add the objects to the cache.
1441 		 *   2. If the cache length crosses the flush threshold, flush
1442 		 *   the entries above the cache size back to the ring.
1443 		 */
1444 		/* Add elements back into the cache */
1445 		uint32_t copied = 0;
1446 		/* n is multiple of 32 */
		while (copied < n) {
			const __m512i a = _mm512_loadu_si512(&txep[copied]);
			const __m512i b = _mm512_loadu_si512(&txep[copied + 8]);
			const __m512i c = _mm512_loadu_si512(&txep[copied + 16]);
			const __m512i d = _mm512_loadu_si512(&txep[copied + 24]);

			_mm512_storeu_si512(&cache_objs[copied], a);
			_mm512_storeu_si512(&cache_objs[copied + 8], b);
			_mm512_storeu_si512(&cache_objs[copied + 16], c);
			_mm512_storeu_si512(&cache_objs[copied + 24], d);
			copied += 32;
		}
		cache->len += n;

		if (cache->len >= cache->flushthresh) {
			rte_mempool_ops_enqueue_bulk(mp,
						     &cache->objs[cache->size],
						     cache->len - cache->size);
			cache->len = cache->size;
		}
		goto done;
	}

normal:
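	/* Slow path: run rte_pktmbuf_prefree_seg() on every mbuf and batch the
	 * frees per mempool, flushing the batch whenever an mbuf from a
	 * different pool shows up.
	 */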
	m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
	if (likely(m)) {
		free[0] = m;
		nb_free = 1;
		for (i = 1; i < n; i++) {
			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
			if (likely(m)) {
				if (likely(m->pool == free[0]->pool)) {
					free[nb_free++] = m;
				} else {
					rte_mempool_put_bulk(free[0]->pool,
							     (void *)free,
							     nb_free);
					free[0] = m;
					nb_free = 1;
				}
			}
		}
		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
	} else {
		for (i = 1; i < n; i++) {
			m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
			if (m)
				rte_mempool_put(m->pool, m);
		}
	}

done:
	/* buffers were freed, update counters */
	txq->nb_free = (uint16_t)(txq->nb_free + txq->rs_thresh);
	txq->next_dd = (uint16_t)(txq->next_dd + txq->rs_thresh);
	if (txq->next_dd >= txq->nb_tx_desc)
		txq->next_dd = (uint16_t)(txq->rs_thresh - 1);

	return txq->rs_thresh;
}

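/* Record the transmitted mbufs in the vector software ring so that
 * iavf_tx_free_bufs_avx512() can hand them back to their mempool once the
 * hardware reports the corresponding descriptors as done.
 */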
static __rte_always_inline void
tx_backlog_entry_avx512(struct iavf_tx_vec_entry *txep,
			struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
	int i;

	for (i = 0; i < (int)nb_pkts; ++i)
		txep[i].mbuf = tx_pkts[i];
}

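/* Write a single 16-byte data descriptor: the low quadword carries the buffer
 * DMA address, the high quadword carries the descriptor type, command flags
 * and buffer length.
 */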
static inline void
iavf_vtx1(volatile struct iavf_tx_desc *txdp,
	  struct rte_mbuf *pkt, uint64_t flags)
{
	uint64_t high_qw =
		(IAVF_TX_DESC_DTYPE_DATA |
		 ((uint64_t)flags << IAVF_TXD_QW1_CMD_SHIFT) |
		 ((uint64_t)pkt->data_len << IAVF_TXD_QW1_TX_BUF_SZ_SHIFT));

	__m128i descriptor = _mm_set_epi64x(high_qw,
					    pkt->buf_iova + pkt->data_off);
	_mm_storeu_si128((__m128i *)txdp, descriptor);
}

#define IAVF_TX_LEN_MASK 0xAA
#define IAVF_TX_OFF_MASK 0x55
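/* 8-bit k-masks over the eight 64-bit lanes of a __m512i holding four
 * descriptors: 0xAA selects the odd lanes (the QW1 words, seeded with
 * data_len below), 0x55 selects the even lanes (the buffer addresses, which
 * still need data_off added).
 */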
static inline void
iavf_vtx(volatile struct iavf_tx_desc *txdp,
	 struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags)
{
	const uint64_t hi_qw_tmpl = (IAVF_TX_DESC_DTYPE_DATA |
			((uint64_t)flags << IAVF_TXD_QW1_CMD_SHIFT));

	/* if unaligned on a 32-byte boundary, do one descriptor to align */
	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
		iavf_vtx1(txdp, *pkt, flags);
		nb_pkts--, txdp++, pkt++;
	}

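	/* Each 512-bit store below writes four descriptors at once; lanes
	 * alternate buffer address / QW1, a masked shift-and-or builds QW1
	 * from data_len and the command template, and a masked add applies
	 * data_off to the addresses.
	 */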
	/* do 4 at a time while possible, in bursts */
	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
		__m512i desc4 =
			_mm512_set_epi64
				((uint64_t)pkt[3]->data_len,
				 pkt[3]->buf_iova,
				 (uint64_t)pkt[2]->data_len,
				 pkt[2]->buf_iova,
				 (uint64_t)pkt[1]->data_len,
				 pkt[1]->buf_iova,
				 (uint64_t)pkt[0]->data_len,
				 pkt[0]->buf_iova);
		__m512i hi_qw_tmpl_4 = _mm512_set1_epi64(hi_qw_tmpl);
		__m512i data_off_4 =
			_mm512_set_epi64
				(0,
				 pkt[3]->data_off,
				 0,
				 pkt[2]->data_off,
				 0,
				 pkt[1]->data_off,
				 0,
				 pkt[0]->data_off);

		desc4 = _mm512_mask_slli_epi64(desc4, IAVF_TX_LEN_MASK, desc4,
					       IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
		desc4 = _mm512_mask_or_epi64(desc4, IAVF_TX_LEN_MASK, desc4,
					     hi_qw_tmpl_4);
		desc4 = _mm512_mask_add_epi64(desc4, IAVF_TX_OFF_MASK, desc4,
					      data_off_4);
		_mm512_storeu_si512((void *)txdp, desc4);
	}

	/* do any last ones */
	while (nb_pkts) {
		iavf_vtx1(txdp, *pkt, flags);
		txdp++, pkt++, nb_pkts--;
	}
}

static inline uint16_t
iavf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
				 uint16_t nb_pkts)
{
	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
	volatile struct iavf_tx_desc *txdp;
	struct iavf_tx_vec_entry *txep;
	uint16_t n, nb_commit, tx_id;
	/* bit 2 is reserved and must be set to 1 according to the spec */
	uint64_t flags = IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_ICRC;
	uint64_t rs = IAVF_TX_DESC_CMD_RS | flags;

	/* crossing the rs_thresh boundary is not allowed */
	nb_pkts = RTE_MIN(nb_pkts, txq->rs_thresh);

	if (txq->nb_free < txq->free_thresh)
		iavf_tx_free_bufs_avx512(txq);

	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_free, nb_pkts);
	if (unlikely(nb_pkts == 0))
		return 0;

	tx_id = txq->tx_tail;
	txdp = &txq->tx_ring[tx_id];
	txep = (void *)txq->sw_ring;
	txep += tx_id;

	txq->nb_free = (uint16_t)(txq->nb_free - nb_pkts);

	n = (uint16_t)(txq->nb_tx_desc - tx_id);
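	/* n is the number of descriptor slots left before the ring wraps; if
	 * the burst does not fit, fill the slots up to the end of the ring,
	 * set RS on the last one, and continue from slot 0.
	 */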
	if (nb_commit >= n) {
		tx_backlog_entry_avx512(txep, tx_pkts, n);

		iavf_vtx(txdp, tx_pkts, n - 1, flags);
		tx_pkts += (n - 1);
		txdp += (n - 1);

		iavf_vtx1(txdp, *tx_pkts++, rs);

		nb_commit = (uint16_t)(nb_commit - n);

		tx_id = 0;
		txq->next_rs = (uint16_t)(txq->rs_thresh - 1);

		/* avoid reaching the end of the ring */
		txdp = &txq->tx_ring[tx_id];
		txep = (void *)txq->sw_ring;
		txep += tx_id;
	}

	tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);

	iavf_vtx(txdp, tx_pkts, nb_commit, flags);

	tx_id = (uint16_t)(tx_id + nb_commit);
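	/* Once the new tail passes next_rs, set the RS bit on that descriptor
	 * so the hardware reports completion and iavf_tx_free_bufs_avx512()
	 * can reclaim the slots; next_rs then advances by another rs_thresh.
	 */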
	if (tx_id > txq->next_rs) {
		txq->tx_ring[txq->next_rs].cmd_type_offset_bsz |=
			rte_cpu_to_le_64(((uint64_t)IAVF_TX_DESC_CMD_RS) <<
					 IAVF_TXD_QW1_CMD_SHIFT);
		txq->next_rs =
			(uint16_t)(txq->next_rs + txq->rs_thresh);
	}

	txq->tx_tail = tx_id;

	IAVF_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);

	return nb_pkts;
}

uint16_t
iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
			  uint16_t nb_pkts)
{
	uint16_t nb_tx = 0;
	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;

	while (nb_pkts) {
		uint16_t ret, num;

		num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh);
		ret = iavf_xmit_fixed_burst_vec_avx512(tx_queue, &tx_pkts[nb_tx],
						       num);
		nb_tx += ret;
		nb_pkts -= ret;
		if (ret < num)
			break;
	}

	return nb_tx;
}
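
/*
 * Usage sketch (illustration only, not part of the driver): with this routine
 * installed as a queue's tx_pkt_burst callback by the Tx path selection code,
 * an application transmits through the generic API and the loop above splits
 * the request into rs_thresh-sized chunks, e.g.:
 *
 *	uint16_t sent = rte_eth_tx_burst(port_id, queue_id, pkts, nb_pkts);
 */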

static inline void
iavf_tx_queue_release_mbufs_avx512(struct iavf_tx_queue *txq)
{
	unsigned int i;
	const uint16_t max_desc = (uint16_t)(txq->nb_tx_desc - 1);
	struct iavf_tx_vec_entry *swr = (void *)txq->sw_ring;

	if (!txq->sw_ring || txq->nb_free == max_desc)
		return;

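	/* Start at the first sw_ring entry not yet reclaimed by
	 * iavf_tx_free_bufs_avx512(); only the region from there to the end of
	 * the ring is walked here.
	 */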
	i = txq->next_dd - txq->rs_thresh + 1;
	if (txq->tx_tail < i) {
		for (; i < txq->nb_tx_desc; i++) {
			rte_pktmbuf_free_seg(swr[i].mbuf);
			swr[i].mbuf = NULL;
		}
		i = 0;
	}
}

static const struct iavf_txq_ops avx512_vec_txq_ops = {
	.release_mbufs = iavf_tx_queue_release_mbufs_avx512,
};

int __rte_cold
iavf_txq_vec_setup_avx512(struct iavf_tx_queue *txq)
{
	txq->ops = &avx512_vec_txq_ops;
	return 0;
}