xref: /f-stack/dpdk/drivers/net/octeontx2/otx2_tx.c (revision 2d9fd380)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(C) 2019 Marvell International Ltd.
3  */
4 
5 #include <rte_vect.h>
6 
7 #include "otx2_ethdev.h"
8 
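/*
 * NIX_XMIT_FC_OR_RETURN: Tx flow-control check.
 * (nb_sqb_bufs_adj - *fc_mem) is the number of SQBs still available to
 * this SQ; shifting by sqes_per_sqb_log2 converts SQBs into SQEs, i.e.
 * packets, which is the unit cached in fc_cache_pkts. The caller returns
 * 0 when there is no room for the requested burst.
 */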
9 #define NIX_XMIT_FC_OR_RETURN(txq, pkts) do {				\
10 	/* Cached value is low, update the fc_cache_pkts */		\
11 	if (unlikely((txq)->fc_cache_pkts < (pkts))) {			\
12 		/* Multiply with sqes_per_sqb to express in pkts */	\
13 		(txq)->fc_cache_pkts =					\
14 			((txq)->nb_sqb_bufs_adj - *(txq)->fc_mem) <<    \
15 				(txq)->sqes_per_sqb_log2;		\
16 		/* Check again for room */				\
17 		if (unlikely((txq)->fc_cache_pkts < (pkts)))		\
18 			return 0;					\
19 	}								\
20 } while (0)
21 
22 
23 static __rte_always_inline uint16_t
24 nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
25 	      uint16_t pkts, uint64_t *cmd, const uint16_t flags)
26 {
27 	struct otx2_eth_txq *txq = tx_queue; uint16_t i;
28 	const rte_iova_t io_addr = txq->io_addr;
29 	void *lmt_addr = txq->lmt_addr;
30 
31 	NIX_XMIT_FC_OR_RETURN(txq, pkts);
32 
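	/* Copy the queue's fixed send-command words (plus any extension
	 * sub-descriptors implied by the offload flags) from txq->cmd[]
	 * into the local cmd[] scratch buffer.
	 */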
33 	otx2_lmt_mov(cmd, &txq->cmd[0], otx2_nix_tx_ext_subs(flags));
34 
35 	/* Perform header writes before barrier for TSO */
36 	if (flags & NIX_TX_OFFLOAD_TSO_F) {
37 		for (i = 0; i < pkts; i++)
38 			otx2_nix_xmit_prepare_tso(tx_pkts[i], flags);
39 	}
40 
41 	/* Commit any changes to the packet here, as no further changes
42 	 * will be made to it unless the no-fast-free offload is enabled.
43 	 */
44 	if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
45 		rte_io_wmb();
46 
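	/* For each packet: build the send descriptor in cmd[], add the
	 * timestamp sub-descriptor when enabled, then submit the command
	 * to the NIX SQ via otx2_nix_xmit_one().
	 */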
47 	for (i = 0; i < pkts; i++) {
48 		otx2_nix_xmit_prepare(tx_pkts[i], cmd, flags);
49 		/* Passing number of segdw as 4: HDR + EXT + SG + SMEM */
50 		otx2_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
51 					     tx_pkts[i]->ol_flags, 4, flags);
52 		otx2_nix_xmit_one(cmd, lmt_addr, io_addr, flags);
53 	}
54 
55 	/* Reduce the cached count */
56 	txq->fc_cache_pkts -= pkts;
57 
58 	return pkts;
59 }
60 
61 static __rte_always_inline uint16_t
62 nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
63 		   uint16_t pkts, uint64_t *cmd, const uint16_t flags)
64 {
65 	struct otx2_eth_txq *txq = tx_queue; uint64_t i;
66 	const rte_iova_t io_addr = txq->io_addr;
67 	void *lmt_addr = txq->lmt_addr;
68 	uint16_t segdw;
69 
70 	NIX_XMIT_FC_OR_RETURN(txq, pkts);
71 
72 	otx2_lmt_mov(cmd, &txq->cmd[0], otx2_nix_tx_ext_subs(flags));
73 
74 	/* Perform header writes before barrier for TSO */
75 	if (flags & NIX_TX_OFFLOAD_TSO_F) {
76 		for (i = 0; i < pkts; i++)
77 			otx2_nix_xmit_prepare_tso(tx_pkts[i], flags);
78 	}
79 
80 	/* Commit any changes to the packet here, as no further changes
81 	 * will be made to it unless the no-fast-free offload is enabled.
82 	 */
83 	if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
84 		rte_io_wmb();
85 
86 	for (i = 0; i < pkts; i++) {
87 		otx2_nix_xmit_prepare(tx_pkts[i], cmd, flags);
88 		segdw = otx2_nix_prepare_mseg(tx_pkts[i], cmd, flags);
89 		otx2_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
90 					     tx_pkts[i]->ol_flags, segdw,
91 					     flags);
92 		otx2_nix_xmit_mseg_one(cmd, lmt_addr, io_addr, segdw);
93 	}
94 
95 	/* Reduce the cached count */
96 	txq->fc_cache_pkts -= pkts;
97 
98 	return pkts;
99 }
100 
101 #if defined(RTE_ARCH_ARM64)
102 
103 #define NIX_DESCS_PER_LOOP	4
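/*
 * The vector path processes NIX_DESCS_PER_LOOP (4) packets per iteration
 * using NEON; any remainder that does not fill a group of four is handed
 * to the scalar nix_xmit_pkts() at the end of the burst.
 */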
104 static __rte_always_inline uint16_t
105 nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
106 		     uint16_t pkts, uint64_t *cmd, const uint16_t flags)
107 {
108 	uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
109 	uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
110 	uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3;
111 	uint64x2_t senddesc01_w0, senddesc23_w0;
112 	uint64x2_t senddesc01_w1, senddesc23_w1;
113 	uint64x2_t sgdesc01_w0, sgdesc23_w0;
114 	uint64x2_t sgdesc01_w1, sgdesc23_w1;
115 	struct otx2_eth_txq *txq = tx_queue;
116 	uint64_t *lmt_addr = txq->lmt_addr;
117 	rte_iova_t io_addr = txq->io_addr;
118 	uint64x2_t ltypes01, ltypes23;
119 	uint64x2_t xtmp128, ytmp128;
120 	uint64x2_t xmask01, xmask23;
121 	uint64x2_t cmd00, cmd01;
122 	uint64x2_t cmd10, cmd11;
123 	uint64x2_t cmd20, cmd21;
124 	uint64x2_t cmd30, cmd31;
125 	uint64_t lmt_status, i;
126 	uint16_t pkts_left;
127 
128 	NIX_XMIT_FC_OR_RETURN(txq, pkts);
129 
130 	pkts_left = pkts & (NIX_DESCS_PER_LOOP - 1);
131 	pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
132 
133 	/* Reduce the cached count */
134 	txq->fc_cache_pkts -= pkts;
135 
136 	/* Commit any changes to the packet here, as no further changes
137 	 * will be made to it unless the no-fast-free offload is enabled.
138 	 */
139 	if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
140 		rte_io_wmb();
141 
142 	senddesc01_w0 = vld1q_dup_u64(&txq->cmd[0]);
143 	senddesc23_w0 = senddesc01_w0;
144 	senddesc01_w1 = vdupq_n_u64(0);
145 	senddesc23_w1 = senddesc01_w1;
146 	sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[2]);
147 	sgdesc23_w0 = sgdesc01_w0;
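	/* senddesc*_w0 are seeded from the SEND_HDR_W0 template in
	 * txq->cmd[0] and sgdesc*_w0 from the SEND_SG_W0 template in
	 * txq->cmd[2]; the W1 words are rebuilt per packet inside the loop.
	 */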
148 
149 	for (i = 0; i < pkts; i += NIX_DESCS_PER_LOOP) {
150 		/* Clear lower 32bit of SEND_HDR_W0 and SEND_SG_W0 */
151 		senddesc01_w0 = vbicq_u64(senddesc01_w0,
152 					  vdupq_n_u64(0xFFFFFFFF));
153 		sgdesc01_w0 = vbicq_u64(sgdesc01_w0,
154 					vdupq_n_u64(0xFFFFFFFF));
155 
156 		senddesc23_w0 = senddesc01_w0;
157 		sgdesc23_w0 = sgdesc01_w0;
158 
159 		/* Point mbuf0-3 at each mbuf's buf_iova field */
160 		mbuf0 = (uint64_t *)tx_pkts[0];
161 		mbuf1 = (uint64_t *)tx_pkts[1];
162 		mbuf2 = (uint64_t *)tx_pkts[2];
163 		mbuf3 = (uint64_t *)tx_pkts[3];
164 
165 		mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
166 				     offsetof(struct rte_mbuf, buf_iova));
167 		mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
168 				     offsetof(struct rte_mbuf, buf_iova));
169 		mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
170 				     offsetof(struct rte_mbuf, buf_iova));
171 		mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
172 				     offsetof(struct rte_mbuf, buf_iova));
173 		/*
174 		 * Get each mbuf's ol_flags, iova, pktlen, dataoff:
175 		 * dataoff_iovaX.D[0] = iova,
176 		 * dataoff_iovaX.D[1](15:0) = mbuf->dataoff
177 		 * len_olflagsX.D[0] = ol_flags,
178 		 * len_olflagsX.D[1](63:32) = mbuf->pkt_len
179 		 */
180 		dataoff_iova0  = vld1q_u64(mbuf0);
181 		len_olflags0 = vld1q_u64(mbuf0 + 2);
182 		dataoff_iova1  = vld1q_u64(mbuf1);
183 		len_olflags1 = vld1q_u64(mbuf1 + 2);
184 		dataoff_iova2  = vld1q_u64(mbuf2);
185 		len_olflags2 = vld1q_u64(mbuf2 + 2);
186 		dataoff_iova3  = vld1q_u64(mbuf3);
187 		len_olflags3 = vld1q_u64(mbuf3 + 2);
188 
189 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
190 			struct rte_mbuf *mbuf;
191 			/* Set don't free bit if reference count > 1 */
192 			xmask01 = vdupq_n_u64(0);
193 			xmask23 = xmask01;
194 
195 			mbuf = (struct rte_mbuf *)((uintptr_t)mbuf0 -
196 				offsetof(struct rte_mbuf, buf_iova));
197 
198 			if (otx2_nix_prefree_seg(mbuf))
199 				vsetq_lane_u64(0x80000, xmask01, 0);
200 			else
201 				__mempool_check_cookies(mbuf->pool,
202 							(void **)&mbuf,
203 							1, 0);
204 
205 			mbuf = (struct rte_mbuf *)((uintptr_t)mbuf1 -
206 				offsetof(struct rte_mbuf, buf_iova));
207 			if (otx2_nix_prefree_seg(mbuf))
208 				vsetq_lane_u64(0x80000, xmask01, 1);
209 			else
210 				__mempool_check_cookies(mbuf->pool,
211 							(void **)&mbuf,
212 							1, 0);
213 
214 			mbuf = (struct rte_mbuf *)((uintptr_t)mbuf2 -
215 				offsetof(struct rte_mbuf, buf_iova));
216 			if (otx2_nix_prefree_seg(mbuf))
217 				vsetq_lane_u64(0x80000, xmask23, 0);
218 			else
219 				__mempool_check_cookies(mbuf->pool,
220 							(void **)&mbuf,
221 							1, 0);
222 
223 			mbuf = (struct rte_mbuf *)((uintptr_t)mbuf3 -
224 				offsetof(struct rte_mbuf, buf_iova));
225 			if (otx2_nix_prefree_seg(mbuf))
226 				vsetq_lane_u64(0x80000, xmask23, 1);
227 			else
228 				__mempool_check_cookies(mbuf->pool,
229 							(void **)&mbuf,
230 							1, 0);
231 			senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
232 			senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
233 			/* Ensure that mbuf fields updated in
234 			 * otx2_nix_prefree_seg are written before the LMTST.
235 			 */
236 			rte_io_wmb();
237 		} else {
238 			struct rte_mbuf *mbuf;
239 			/* Mark mempool object as "put" since
240 			 * it is freed by NIX
241 			 */
242 			mbuf = (struct rte_mbuf *)((uintptr_t)mbuf0 -
243 				offsetof(struct rte_mbuf, buf_iova));
244 			__mempool_check_cookies(mbuf->pool, (void **)&mbuf,
245 						1, 0);
246 
247 			mbuf = (struct rte_mbuf *)((uintptr_t)mbuf1 -
248 				offsetof(struct rte_mbuf, buf_iova));
249 			__mempool_check_cookies(mbuf->pool, (void **)&mbuf,
250 						1, 0);
251 
252 			mbuf = (struct rte_mbuf *)((uintptr_t)mbuf2 -
253 				offsetof(struct rte_mbuf, buf_iova));
254 			__mempool_check_cookies(mbuf->pool, (void **)&mbuf,
255 						1, 0);
256 
257 			mbuf = (struct rte_mbuf *)((uintptr_t)mbuf3 -
258 				offsetof(struct rte_mbuf, buf_iova));
259 			__mempool_check_cookies(mbuf->pool, (void **)&mbuf,
260 						1, 0);
261 			RTE_SET_USED(mbuf);
262 		}
263 
264 		/* Move mbuf pointers to point at the pool field */
265 		mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
266 			 offsetof(struct rte_mbuf, pool) -
267 			 offsetof(struct rte_mbuf, buf_iova));
268 		mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
269 			 offsetof(struct rte_mbuf, pool) -
270 			 offsetof(struct rte_mbuf, buf_iova));
271 		mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
272 			 offsetof(struct rte_mbuf, pool) -
273 			 offsetof(struct rte_mbuf, buf_iova));
274 		mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
275 			 offsetof(struct rte_mbuf, pool) -
276 			 offsetof(struct rte_mbuf, buf_iova));
277 
278 		if (flags &
279 		    (NIX_TX_OFFLOAD_OL3_OL4_CSUM_F |
280 		     NIX_TX_OFFLOAD_L3_L4_CSUM_F)) {
281 			/* Get tx_offload for ol2, ol3, l2, l3 lengths */
282 			/*
283 			 * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
284 			 * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
285 			 */
286 
287 			asm volatile ("LD1 {%[a].D}[0],[%[in]]\n\t" :
288 				      [a]"+w"(senddesc01_w1) :
289 				      [in]"r"(mbuf0 + 2) : "memory");
290 
291 			asm volatile ("LD1 {%[a].D}[1],[%[in]]\n\t" :
292 				      [a]"+w"(senddesc01_w1) :
293 				      [in]"r"(mbuf1 + 2) : "memory");
294 
295 			asm volatile ("LD1 {%[b].D}[0],[%[in]]\n\t" :
296 				      [b]"+w"(senddesc23_w1) :
297 				      [in]"r"(mbuf2 + 2) : "memory");
298 
299 			asm volatile ("LD1 {%[b].D}[1],[%[in]]\n\t" :
300 				      [b]"+w"(senddesc23_w1) :
301 				      [in]"r"(mbuf3 + 2) : "memory");
302 
303 			/* Get pool pointer alone */
304 			mbuf0 = (uint64_t *)*mbuf0;
305 			mbuf1 = (uint64_t *)*mbuf1;
306 			mbuf2 = (uint64_t *)*mbuf2;
307 			mbuf3 = (uint64_t *)*mbuf3;
308 		} else {
309 			/* Get pool pointer alone */
310 			mbuf0 = (uint64_t *)*mbuf0;
311 			mbuf1 = (uint64_t *)*mbuf1;
312 			mbuf2 = (uint64_t *)*mbuf2;
313 			mbuf3 = (uint64_t *)*mbuf3;
314 		}
315 
316 		const uint8x16_t shuf_mask2 = {
317 			0x4, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
318 			0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
319 		};
320 		xtmp128 = vzip2q_u64(len_olflags0, len_olflags1);
321 		ytmp128 = vzip2q_u64(len_olflags2, len_olflags3);
322 
323 		/* Clear dataoff_iovaX.D[1] bits other than dataoff(15:0) */
324 		const uint64x2_t and_mask0 = {
325 			0xFFFFFFFFFFFFFFFF,
326 			0x000000000000FFFF,
327 		};
328 
329 		dataoff_iova0 = vandq_u64(dataoff_iova0, and_mask0);
330 		dataoff_iova1 = vandq_u64(dataoff_iova1, and_mask0);
331 		dataoff_iova2 = vandq_u64(dataoff_iova2, and_mask0);
332 		dataoff_iova3 = vandq_u64(dataoff_iova3, and_mask0);
333 
334 		/*
335 		 * Pick only 16 bits of pktlen present at bits 63:32
336 		 * and place them at bits 15:0.
337 		 */
338 		xtmp128 = vqtbl1q_u8(xtmp128, shuf_mask2);
339 		ytmp128 = vqtbl1q_u8(ytmp128, shuf_mask2);
340 
341 		/* Add pairwise to get dataoff + iova in sgdesc_w1 */
342 		sgdesc01_w1 = vpaddq_u64(dataoff_iova0, dataoff_iova1);
343 		sgdesc23_w1 = vpaddq_u64(dataoff_iova2, dataoff_iova3);
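		/* Each dataoff_iovaX register holds {buf_iova, data_off}, so
		 * the pairwise adds above yield the packet data address
		 * (buf_iova + data_off) expected in SEND_SG_W1.
		 */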
344 
345 		/* Orr both sgdesc_w0 and senddesc_w0 with 16 bits of
346 		 * pktlen at 15:0 position.
347 		 */
348 		sgdesc01_w0 = vorrq_u64(sgdesc01_w0, xtmp128);
349 		sgdesc23_w0 = vorrq_u64(sgdesc23_w0, ytmp128);
350 		senddesc01_w0 = vorrq_u64(senddesc01_w0, xtmp128);
351 		senddesc23_w0 = vorrq_u64(senddesc23_w0, ytmp128);
352 
353 		if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
354 		    !(flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
355 			/*
356 			 * Lookup table to translate ol_flags to
357 			 * il3/il4 types. But we still use ol3/ol4 types in
358 			 * senddesc_w1 as only one header processing is enabled.
359 			 */
360 			const uint8x16_t tbl = {
361 				/* [0-15] = il4type:il3type */
362 				0x04, /* none (IPv6 assumed) */
363 				0x14, /* PKT_TX_TCP_CKSUM (IPv6 assumed) */
364 				0x24, /* PKT_TX_SCTP_CKSUM (IPv6 assumed) */
365 				0x34, /* PKT_TX_UDP_CKSUM (IPv6 assumed) */
366 				0x03, /* PKT_TX_IP_CKSUM */
367 				0x13, /* PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM */
368 				0x23, /* PKT_TX_IP_CKSUM | PKT_TX_SCTP_CKSUM */
369 				0x33, /* PKT_TX_IP_CKSUM | PKT_TX_UDP_CKSUM */
370 				0x02, /* PKT_TX_IPV4  */
371 				0x12, /* PKT_TX_IPV4 | PKT_TX_TCP_CKSUM */
372 				0x22, /* PKT_TX_IPV4 | PKT_TX_SCTP_CKSUM */
373 				0x32, /* PKT_TX_IPV4 | PKT_TX_UDP_CKSUM */
374 				0x03, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM */
375 				0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
376 				       * PKT_TX_TCP_CKSUM
377 				       */
378 				0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
379 				       * PKT_TX_SCTP_CKSUM
380 				       */
381 				0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
382 				       * PKT_TX_UDP_CKSUM
383 				       */
384 			};
385 
386 			/* Extract olflags to translate to iltypes */
387 			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
388 			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
389 
390 			/*
391 			 * E(47):L3_LEN(9):L2_LEN(7+z)
392 			 * E(47):L3_LEN(9):L2_LEN(7+z)
393 			 */
394 			senddesc01_w1 = vshlq_n_u64(senddesc01_w1, 1);
395 			senddesc23_w1 = vshlq_n_u64(senddesc23_w1, 1);
396 
397 			/* Move OLFLAGS bits 55:52 to 51:48
398 			 * with zeros prepended on the byte and rest
399 			 * don't care
400 			 */
401 			xtmp128 = vshrq_n_u8(xtmp128, 4);
402 			ytmp128 = vshrq_n_u8(ytmp128, 4);
403 			/*
404 			 * E(48):L3_LEN(8):L2_LEN(z+7)
405 			 * E(48):L3_LEN(8):L2_LEN(z+7)
406 			 */
407 			const int8x16_t tshft3 = {
408 				-1, 0, 8, 8, 8,	8, 8, 8,
409 				-1, 0, 8, 8, 8,	8, 8, 8,
410 			};
411 
412 			senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
413 			senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
414 
415 			/* Do the lookup */
416 			ltypes01 = vqtbl1q_u8(tbl, xtmp128);
417 			ltypes23 = vqtbl1q_u8(tbl, ytmp128);
418 
419 			/* Just use ld1q to retrieve aura
420 			 * when we don't need tx_offload
421 			 */
422 			mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
423 					offsetof(struct rte_mempool, pool_id));
424 			mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
425 					offsetof(struct rte_mempool, pool_id));
426 			mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
427 					offsetof(struct rte_mempool, pool_id));
428 			mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
429 					offsetof(struct rte_mempool, pool_id));
430 
431 			/* Pick only relevant fields i.e Bit 48:55 of iltype
432 			 * and place it in ol3/ol4type of senddesc_w1
433 			 */
434 			const uint8x16_t shuf_mask0 = {
435 				0xFF, 0xFF, 0xFF, 0xFF,	0x6, 0xFF, 0xFF, 0xFF,
436 				0xFF, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF,
437 			};
438 
439 			ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
440 			ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
441 
442 			/* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
443 			 * a [E(32):E(16):OL3(8):OL2(8)]
444 			 * a = a + (a << 8)
445 			 * a [E(32):E(16):(OL3+OL2):OL2]
446 			 * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
447 			 */
448 			senddesc01_w1 = vaddq_u8(senddesc01_w1,
449 						 vshlq_n_u16(senddesc01_w1, 8));
450 			senddesc23_w1 = vaddq_u8(senddesc23_w1,
451 						 vshlq_n_u16(senddesc23_w1, 8));
452 
453 			/* Create second half of 4W cmd for 4 mbufs (sgdesc) */
454 			cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
455 			cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
456 			cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
457 			cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
458 
459 			xmask01 = vdupq_n_u64(0);
460 			xmask23 = xmask01;
461 			asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
462 				[a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");
463 
464 			asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
465 				 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");
466 
467 			asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
468 				 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");
469 
470 			asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
471 				 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
472 			xmask01 = vshlq_n_u64(xmask01, 20);
473 			xmask23 = vshlq_n_u64(xmask23, 20);
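			/* The LD1 halfword loads above fetch each mempool's
			 * pool_id (the NPA aura handle); shifting left by 20
			 * moves it into the aura field position of
			 * SEND_HDR_W0 before the OR below.
			 */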
474 
475 			senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
476 			senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
477 			/* Move ltypes to senddesc*_w1 */
478 			senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
479 			senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
480 
481 			/* Create first half of 4W cmd for 4 mbufs (sendhdr) */
482 			cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
483 			cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
484 			cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
485 			cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
486 
487 		} else if (!(flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
488 			   (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
489 			/*
490 			 * Lookup table to translate ol_flags to
491 			 * ol3/ol4 types.
492 			 */
493 
494 			const uint8x16_t tbl = {
495 				/* [0-15] = ol4type:ol3type */
496 				0x00, /* none */
497 				0x03, /* OUTER_IP_CKSUM */
498 				0x02, /* OUTER_IPV4 */
499 				0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
500 				0x04, /* OUTER_IPV6 */
501 				0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
502 				0x00, /* OUTER_IPV6 | OUTER_IPV4 */
503 				0x00, /* OUTER_IPV6 | OUTER_IPV4 |
504 				       * OUTER_IP_CKSUM
505 				       */
506 				0x00, /* OUTER_UDP_CKSUM */
507 				0x33, /* OUTER_UDP_CKSUM | OUTER_IP_CKSUM */
508 				0x32, /* OUTER_UDP_CKSUM | OUTER_IPV4 */
509 				0x33, /* OUTER_UDP_CKSUM | OUTER_IPV4 |
510 				       * OUTER_IP_CKSUM
511 				       */
512 				0x34, /* OUTER_UDP_CKSUM | OUTER_IPV6 */
513 				0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
514 				       * OUTER_IP_CKSUM
515 				       */
516 				0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
517 				       * OUTER_IPV4
518 				       */
519 				0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
520 				       * OUTER_IPV4 | OUTER_IP_CKSUM
521 				       */
522 			};
523 
524 			/* Extract olflags to translate to iltypes */
525 			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
526 			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
527 
528 			/*
529 			 * E(47):OL3_LEN(9):OL2_LEN(7+z)
530 			 * E(47):OL3_LEN(9):OL2_LEN(7+z)
531 			 */
532 			const uint8x16_t shuf_mask5 = {
533 				0x6, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
534 				0xE, 0xD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
535 			};
536 			senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
537 			senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);
538 
539 			/* Extract outer ol flags only */
540 			const uint64x2_t o_cksum_mask = {
541 				0x1C00020000000000,
542 				0x1C00020000000000,
543 			};
544 
545 			xtmp128 = vandq_u64(xtmp128, o_cksum_mask);
546 			ytmp128 = vandq_u64(ytmp128, o_cksum_mask);
547 
548 			/* Extract OUTER_UDP_CKSUM bit 41 and
549 			 * move it to bit 61
550 			 */
551 
552 			xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
553 			ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);
554 
555 			/* Shift oltype by 2 to start nibble from BIT(56)
556 			 * instead of BIT(58)
557 			 */
558 			xtmp128 = vshrq_n_u8(xtmp128, 2);
559 			ytmp128 = vshrq_n_u8(ytmp128, 2);
560 			/*
561 			 * E(48):L3_LEN(8):L2_LEN(z+7)
562 			 * E(48):L3_LEN(8):L2_LEN(z+7)
563 			 */
564 			const int8x16_t tshft3 = {
565 				-1, 0, 8, 8, 8, 8, 8, 8,
566 				-1, 0, 8, 8, 8, 8, 8, 8,
567 			};
568 
569 			senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
570 			senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
571 
572 			/* Do the lookup */
573 			ltypes01 = vqtbl1q_u8(tbl, xtmp128);
574 			ltypes23 = vqtbl1q_u8(tbl, ytmp128);
575 
576 			/* Just use ld1q to retrieve aura
577 			 * when we don't need tx_offload
578 			 */
579 			mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
580 					offsetof(struct rte_mempool, pool_id));
581 			mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
582 					offsetof(struct rte_mempool, pool_id));
583 			mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
584 					offsetof(struct rte_mempool, pool_id));
585 			mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
586 					offsetof(struct rte_mempool, pool_id));
587 
588 			/* Pick only relevant fields i.e Bit 56:63 of oltype
589 			 * and place it in ol3/ol4type of senddesc_w1
590 			 */
591 			const uint8x16_t shuf_mask0 = {
592 				0xFF, 0xFF, 0xFF, 0xFF,	0x7, 0xFF, 0xFF, 0xFF,
593 				0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF,
594 			};
595 
596 			ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
597 			ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
598 
599 			/* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
600 			 * a [E(32):E(16):OL3(8):OL2(8)]
601 			 * a = a + (a << 8)
602 			 * a [E(32):E(16):(OL3+OL2):OL2]
603 			 * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
604 			 */
605 			senddesc01_w1 = vaddq_u8(senddesc01_w1,
606 						 vshlq_n_u16(senddesc01_w1, 8));
607 			senddesc23_w1 = vaddq_u8(senddesc23_w1,
608 						 vshlq_n_u16(senddesc23_w1, 8));
609 
610 			/* Create second half of 4W cmd for 4 mbufs (sgdesc) */
611 			cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
612 			cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
613 			cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
614 			cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
615 
616 			xmask01 = vdupq_n_u64(0);
617 			xmask23 = xmask01;
618 			asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
619 				 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");
620 
621 			asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
622 				 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");
623 
624 			asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
625 				 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");
626 
627 			asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
628 				 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
629 			xmask01 = vshlq_n_u64(xmask01, 20);
630 			xmask23 = vshlq_n_u64(xmask23, 20);
631 
632 			senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
633 			senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
634 			/* Move ltypes to senddesc*_w1 */
635 			senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
636 			senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
637 
638 			/* Create first half of 4W cmd for 4 mbufs (sendhdr) */
639 			cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
640 			cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
641 			cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
642 			cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
643 
644 		} else if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
645 			   (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
646 			/* Lookup table to translate ol_flags to
647 			 * ol4type, ol3type, il4type, il3type of senddesc_w1
648 			 */
649 			const uint8x16x2_t tbl = {
650 			{
651 				{
652 					/* [0-15] = il4type:il3type */
653 					0x04, /* none (IPv6) */
654 					0x14, /* PKT_TX_TCP_CKSUM (IPv6) */
655 					0x24, /* PKT_TX_SCTP_CKSUM (IPv6) */
656 					0x34, /* PKT_TX_UDP_CKSUM (IPv6) */
657 					0x03, /* PKT_TX_IP_CKSUM */
658 					0x13, /* PKT_TX_IP_CKSUM |
659 					       * PKT_TX_TCP_CKSUM
660 					       */
661 					0x23, /* PKT_TX_IP_CKSUM |
662 					       * PKT_TX_SCTP_CKSUM
663 					       */
664 					0x33, /* PKT_TX_IP_CKSUM |
665 					       * PKT_TX_UDP_CKSUM
666 					       */
667 					0x02, /* PKT_TX_IPV4 */
668 					0x12, /* PKT_TX_IPV4 |
669 					       * PKT_TX_TCP_CKSUM
670 					       */
671 					0x22, /* PKT_TX_IPV4 |
672 					       * PKT_TX_SCTP_CKSUM
673 					       */
674 					0x32, /* PKT_TX_IPV4 |
675 					       * PKT_TX_UDP_CKSUM
676 					       */
677 					0x03, /* PKT_TX_IPV4 |
678 					       * PKT_TX_IP_CKSUM
679 					       */
680 					0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
681 					       * PKT_TX_TCP_CKSUM
682 					       */
683 					0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
684 					       * PKT_TX_SCTP_CKSUM
685 					       */
686 					0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
687 					       * PKT_TX_UDP_CKSUM
688 					       */
689 				},
690 
691 				{
692 					/* [16-31] = ol4type:ol3type */
693 					0x00, /* none */
694 					0x03, /* OUTER_IP_CKSUM */
695 					0x02, /* OUTER_IPV4 */
696 					0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
697 					0x04, /* OUTER_IPV6 */
698 					0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
699 					0x00, /* OUTER_IPV6 | OUTER_IPV4 */
700 					0x00, /* OUTER_IPV6 | OUTER_IPV4 |
701 					       * OUTER_IP_CKSUM
702 					       */
703 					0x00, /* OUTER_UDP_CKSUM */
704 					0x33, /* OUTER_UDP_CKSUM |
705 					       * OUTER_IP_CKSUM
706 					       */
707 					0x32, /* OUTER_UDP_CKSUM |
708 					       * OUTER_IPV4
709 					       */
710 					0x33, /* OUTER_UDP_CKSUM |
711 					       * OUTER_IPV4 | OUTER_IP_CKSUM
712 					       */
713 					0x34, /* OUTER_UDP_CKSUM |
714 					       * OUTER_IPV6
715 					       */
716 					0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
717 					       * OUTER_IP_CKSUM
718 					       */
719 					0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
720 					       * OUTER_IPV4
721 					       */
722 					0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
723 					       * OUTER_IPV4 | OUTER_IP_CKSUM
724 					       */
725 				},
726 			}
727 			};
728 
729 			/* Extract olflags to translate to oltype & iltype */
730 			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
731 			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
732 
733 			/*
734 			 * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
735 			 * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
736 			 */
737 			const uint32x4_t tshft_4 = {
738 				1, 0,
739 				1, 0,
740 			};
741 			senddesc01_w1 = vshlq_u32(senddesc01_w1, tshft_4);
742 			senddesc23_w1 = vshlq_u32(senddesc23_w1, tshft_4);
743 
744 			/*
745 			 * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
746 			 * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
747 			 */
748 			const uint8x16_t shuf_mask5 = {
749 				0x6, 0x5, 0x0, 0x1, 0xFF, 0xFF, 0xFF, 0xFF,
750 				0xE, 0xD, 0x8, 0x9, 0xFF, 0xFF,	0xFF, 0xFF,
751 			};
752 			senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
753 			senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);
754 
755 			/* Extract outer and inner header ol_flags */
756 			const uint64x2_t oi_cksum_mask = {
757 				0x1CF0020000000000,
758 				0x1CF0020000000000,
759 			};
760 
761 			xtmp128 = vandq_u64(xtmp128, oi_cksum_mask);
762 			ytmp128 = vandq_u64(ytmp128, oi_cksum_mask);
763 
764 			/* Extract OUTER_UDP_CKSUM bit 41 and
765 			 * move it to bit 61
766 			 */
767 
768 			xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
769 			ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);
770 
771 			/* Shift right oltype by 2 and iltype by 4
772 			 * to start oltype nibble from BIT(56)
773 			 * instead of BIT(58) and iltype nibble from BIT(48)
774 			 * instead of BIT(52).
775 			 */
776 			const int8x16_t tshft5 = {
777 				8, 8, 8, 8, 8, 8, -4, -2,
778 				8, 8, 8, 8, 8, 8, -4, -2,
779 			};
780 
781 			xtmp128 = vshlq_u8(xtmp128, tshft5);
782 			ytmp128 = vshlq_u8(ytmp128, tshft5);
783 			/*
784 			 * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
785 			 * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
786 			 */
787 			const int8x16_t tshft3 = {
788 				-1, 0, -1, 0, 0, 0, 0, 0,
789 				-1, 0, -1, 0, 0, 0, 0, 0,
790 			};
791 
792 			senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
793 			senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
794 
795 			/* Mark Bit(4) of oltype */
796 			const uint64x2_t oi_cksum_mask2 = {
797 				0x1000000000000000,
798 				0x1000000000000000,
799 			};
800 
801 			xtmp128 = vorrq_u64(xtmp128, oi_cksum_mask2);
802 			ytmp128 = vorrq_u64(ytmp128, oi_cksum_mask2);
803 
804 			/* Do the lookup */
805 			ltypes01 = vqtbl2q_u8(tbl, xtmp128);
806 			ltypes23 = vqtbl2q_u8(tbl, ytmp128);
807 
808 			/* Just use ld1q to retrieve aura
809 			 * when we don't need tx_offload
810 			 */
811 			mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
812 					offsetof(struct rte_mempool, pool_id));
813 			mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
814 					offsetof(struct rte_mempool, pool_id));
815 			mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
816 					offsetof(struct rte_mempool, pool_id));
817 			mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
818 					offsetof(struct rte_mempool, pool_id));
819 
820 			/* Pick only relevant fields i.e Bit 48:55 of iltype and
821 			 * Bit 56:63 of oltype and place it in corresponding
822 			 * place in senddesc_w1.
823 			 */
824 			const uint8x16_t shuf_mask0 = {
825 				0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0x6, 0xFF, 0xFF,
826 				0xFF, 0xFF, 0xFF, 0xFF,	0xF, 0xE, 0xFF, 0xFF,
827 			};
828 
829 			ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
830 			ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
831 
832 			/* Prepare l4ptr, l3ptr, ol4ptr, ol3ptr from
833 			 * l3len, l2len, ol3len, ol2len.
834 			 * a [E(32):L3(8):L2(8):OL3(8):OL2(8)]
835 			 * a = a + (a << 8)
836 			 * a [E:(L3+L2):(L2+OL3):(OL3+OL2):OL2]
837 			 * a = a + (a << 16)
838 			 * a [E:(L3+L2+OL3+OL2):(L2+OL3+OL2):(OL3+OL2):OL2]
839 			 * => E(32):IL4PTR(8):IL3PTR(8):OL4PTR(8):OL3PTR(8)
840 			 */
841 			senddesc01_w1 = vaddq_u8(senddesc01_w1,
842 						 vshlq_n_u32(senddesc01_w1, 8));
843 			senddesc23_w1 = vaddq_u8(senddesc23_w1,
844 						 vshlq_n_u32(senddesc23_w1, 8));
845 
846 			/* Create second half of 4W cmd for 4 mbufs (sgdesc) */
847 			cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
848 			cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
849 			cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
850 			cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
851 
852 			/* Continue preparing l4ptr, l3ptr, ol4ptr, ol3ptr */
853 			senddesc01_w1 = vaddq_u8(senddesc01_w1,
854 						vshlq_n_u32(senddesc01_w1, 16));
855 			senddesc23_w1 = vaddq_u8(senddesc23_w1,
856 						vshlq_n_u32(senddesc23_w1, 16));
857 
858 			xmask01 = vdupq_n_u64(0);
859 			xmask23 = xmask01;
860 			asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
861 				 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");
862 
863 			asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
864 				 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");
865 
866 			asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
867 				 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");
868 
869 			asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
870 				 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
871 			xmask01 = vshlq_n_u64(xmask01, 20);
872 			xmask23 = vshlq_n_u64(xmask23, 20);
873 
874 			senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
875 			senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
876 			/* Move ltypes to senddesc*_w1 */
877 			senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
878 			senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
879 
880 			/* Create first half of 4W cmd for 4 mbufs (sendhdr) */
881 			cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
882 			cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
883 			cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
884 			cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
885 		} else {
886 			/* Just use ld1q to retrieve aura
887 			 * when we don't need tx_offload
888 			 */
889 			mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
890 					offsetof(struct rte_mempool, pool_id));
891 			mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
892 					offsetof(struct rte_mempool, pool_id));
893 			mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
894 					offsetof(struct rte_mempool, pool_id));
895 			mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
896 					offsetof(struct rte_mempool, pool_id));
897 			xmask01 = vdupq_n_u64(0);
898 			xmask23 = xmask01;
899 			asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
900 				 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");
901 
902 			asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
903 				 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");
904 
905 			asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
906 				 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");
907 
908 			asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
909 				 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
910 			xmask01 = vshlq_n_u64(xmask01, 20);
911 			xmask23 = vshlq_n_u64(xmask23, 20);
912 
913 			senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
914 			senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
915 
916 			/* Create 4W cmd for 4 mbufs (sendhdr, sgdesc) */
917 			cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
918 			cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
919 			cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
920 			cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
921 			cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
922 			cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
923 			cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
924 			cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
925 		}
926 
927 		do {
928 			vst1q_u64(lmt_addr, cmd00);
929 			vst1q_u64(lmt_addr + 2, cmd01);
930 			vst1q_u64(lmt_addr + 4, cmd10);
931 			vst1q_u64(lmt_addr + 6, cmd11);
932 			vst1q_u64(lmt_addr + 8, cmd20);
933 			vst1q_u64(lmt_addr + 10, cmd21);
934 			vst1q_u64(lmt_addr + 12, cmd30);
935 			vst1q_u64(lmt_addr + 14, cmd31);
936 			lmt_status = otx2_lmt_submit(io_addr);
937 
938 		} while (lmt_status == 0);
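		/* The eight 16B stores above fill the LMT line with the four
		 * 32B send commands; otx2_lmt_submit() issues the LMTST and
		 * returns zero when it did not go through, in which case the
		 * stores are redone and the command resubmitted.
		 */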
939 		tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
940 	}
941 
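	/* Send any remaining (< NIX_DESCS_PER_LOOP) packets via the scalar path */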
942 	if (unlikely(pkts_left))
943 		pkts += nix_xmit_pkts(tx_queue, tx_pkts, pkts_left, cmd, flags);
944 
945 	return pkts;
946 }
947 
948 #else
949 static __rte_always_inline uint16_t
950 nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
951 		     uint16_t pkts, uint64_t *cmd, const uint16_t flags)
952 {
953 	RTE_SET_USED(tx_queue);
954 	RTE_SET_USED(tx_pkts);
955 	RTE_SET_USED(pkts);
956 	RTE_SET_USED(cmd);
957 	RTE_SET_USED(flags);
958 	return 0;
959 }
960 #endif
961 
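/*
 * NIX_TX_FASTPATH_MODES expands the T() macro once per supported
 * combination of Tx offload flags, generating a dedicated burst function
 * (scalar, multi-seg and vector variants) for each combination.
 */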
962 #define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)			\
963 static uint16_t __rte_noinline	__rte_hot					\
964 otx2_nix_xmit_pkts_ ## name(void *tx_queue,				\
965 			struct rte_mbuf **tx_pkts, uint16_t pkts)	\
966 {									\
967 	uint64_t cmd[sz];						\
968 									\
969 	/* For TSO inner checksum is a must */				\
970 	if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&				\
971 	    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))			\
972 		return 0;						\
973 	return nix_xmit_pkts(tx_queue, tx_pkts, pkts, cmd, flags);	\
974 }
975 
976 NIX_TX_FASTPATH_MODES
977 #undef T
978 
979 #define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)			\
980 static uint16_t __rte_noinline	__rte_hot					\
981 otx2_nix_xmit_pkts_mseg_ ## name(void *tx_queue,			\
982 			struct rte_mbuf **tx_pkts, uint16_t pkts)	\
983 {									\
984 	uint64_t cmd[(sz) + NIX_TX_MSEG_SG_DWORDS - 2];			\
985 									\
986 	/* For TSO inner checksum is a must */				\
987 	if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&				\
988 	    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))			\
989 		return 0;						\
990 	return nix_xmit_pkts_mseg(tx_queue, tx_pkts, pkts, cmd,		\
991 				  (flags) | NIX_TX_MULTI_SEG_F);	\
992 }
993 
994 NIX_TX_FASTPATH_MODES
995 #undef T
996 
997 #define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)			\
998 static uint16_t __rte_noinline	__rte_hot					\
999 otx2_nix_xmit_pkts_vec_ ## name(void *tx_queue,				\
1000 			struct rte_mbuf **tx_pkts, uint16_t pkts)	\
1001 {									\
1002 	uint64_t cmd[sz];						\
1003 									\
1004 	/* VLAN, TSTMP, TSO are not supported by vec */			\
1005 	if ((flags) & NIX_TX_OFFLOAD_VLAN_QINQ_F ||			\
1006 	    (flags) & NIX_TX_OFFLOAD_TSTAMP_F ||			\
1007 	    (flags) & NIX_TX_OFFLOAD_TSO_F)				\
1008 		return 0;						\
1009 	return nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd, (flags)); \
1010 }
1011 
1012 NIX_TX_FASTPATH_MODES
1013 #undef T
1014 
1015 static inline void
1016 pick_tx_func(struct rte_eth_dev *eth_dev,
1017 	     const eth_tx_burst_t tx_burst[2][2][2][2][2][2][2])
1018 {
1019 	struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev);
1020 
1021 	/* [SEC] [TSO] [TSTMP] [NOFF] [VLAN] [OL3_OL4_CSUM] [IL3_IL4_CSUM] */
1022 	eth_dev->tx_pkt_burst = tx_burst
1023 		[!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_SECURITY_F)]
1024 		[!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_TSO_F)]
1025 		[!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_TSTAMP_F)]
1026 		[!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)]
1027 		[!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_VLAN_QINQ_F)]
1028 		[!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
1029 		[!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
1030 }
1031 
1032 void
1033 otx2_eth_set_tx_function(struct rte_eth_dev *eth_dev)
1034 {
1035 	struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev);
1036 
1037 	const eth_tx_burst_t nix_eth_tx_burst[2][2][2][2][2][2][2] = {
1038 #define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)			\
1039 	[f6][f5][f4][f3][f2][f1][f0] =  otx2_nix_xmit_pkts_ ## name,
1040 
1041 NIX_TX_FASTPATH_MODES
1042 #undef T
1043 	};
1044 
1045 	const eth_tx_burst_t nix_eth_tx_burst_mseg[2][2][2][2][2][2][2] = {
1046 #define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)			\
1047 	[f6][f5][f4][f3][f2][f1][f0] =  otx2_nix_xmit_pkts_mseg_ ## name,
1048 
1049 NIX_TX_FASTPATH_MODES
1050 #undef T
1051 	};
1052 
1053 	const eth_tx_burst_t nix_eth_tx_vec_burst[2][2][2][2][2][2][2] = {
1054 #define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)			\
1055 	[f6][f5][f4][f3][f2][f1][f0] =  otx2_nix_xmit_pkts_vec_ ## name,
1056 
1057 NIX_TX_FASTPATH_MODES
1058 #undef T
1059 	};
1060 
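	/* Use the scalar path when explicitly requested or when an offload
	 * the vector path cannot handle (VLAN/QinQ, timestamp, TSO) is
	 * enabled; multi-seg Tx overrides either choice below.
	 */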
1061 	if (dev->scalar_ena ||
1062 	    (dev->tx_offload_flags &
1063 	     (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSTAMP_F |
1064 	      NIX_TX_OFFLOAD_TSO_F)))
1065 		pick_tx_func(eth_dev, nix_eth_tx_burst);
1066 	else
1067 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
1068 
1069 	if (dev->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
1070 		pick_tx_func(eth_dev, nix_eth_tx_burst_mseg);
1071 
1072 	rte_mb();
1073 }
1074