/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(C) 2019 Marvell International Ltd.
 */

#include <rte_mempool.h>
#include <rte_vect.h>

#include "otx2_mempool.h"

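/* Mempool enqueue handler: free each object back to the NPA aura by
 * storing an (object pointer, aura) pair to the aura's FREE0 op address.
 */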
static int __rte_hot
otx2_npa_enq(struct rte_mempool *mp, void * const *obj_table, unsigned int n)
{
	unsigned int index;
	const uint64_t aura_handle = mp->pool_id;
	const uint64_t reg = npa_lf_aura_handle_to_aura(aura_handle);
	const uint64_t addr = npa_lf_aura_handle_to_base(aura_handle) +
				 NPA_LF_AURA_OP_FREE0;

	/* Ensure mbuf init changes are written before the free pointers
	 * are enqueued to the stack.
	 */
	rte_io_wmb();
	for (index = 0; index < n; index++)
		otx2_store_pair((uint64_t)obj_table[index], reg, addr);

	return 0;
}

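/* Allocate a single object from the aura. The atomic add issued to the
 * ALLOC op address returns the buffer pointer, or 0 when the aura is
 * empty; retry a few times before reporting -ENOENT.
 */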
static __rte_noinline int
npa_lf_aura_op_alloc_one(const int64_t wdata, int64_t * const addr,
			 void **obj_table, uint8_t i)
{
	uint8_t retry = 4;

	do {
		obj_table[i] = (void *)otx2_atomic64_add_nosync(wdata, addr);
		if (obj_table[i] != NULL)
			return 0;

	} while (retry--);

	return -ENOENT;
}

#if defined(RTE_ARCH_ARM64)
static __rte_noinline int
npa_lf_aura_op_search_alloc(const int64_t wdata, int64_t * const addr,
		void **obj_table, unsigned int n)
{
	uint8_t i;

	for (i = 0; i < n; i++) {
		if (obj_table[i] != NULL)
			continue;
		if (npa_lf_aura_op_alloc_one(wdata, addr, obj_table, i))
			return -ENOENT;
	}

	return 0;
}

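/* Bulk allocation using LSE CASP instructions. Each 128-bit CASP issued
 * to the ALLOC op address returns two buffer pointers; wdata is pinned
 * to x26/x27 so the swap operands form the consecutive register pair
 * CASP requires. The returned pointers are accumulated with vector ANDs
 * into 'failed', and a single lane test at the end conservatively
 * detects whether any allocation may have returned 0; in that case the
 * caller falls back to a per-slot scan that retries only NULL entries.
 */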
static __rte_noinline int
npa_lf_aura_op_alloc_bulk(const int64_t wdata, int64_t * const addr,
			  unsigned int n, void **obj_table)
{
	register const uint64_t wdata64 __asm("x26") = wdata;
	register const uint64_t wdata128 __asm("x27") = wdata;
	uint64x2_t failed = vdupq_n_u64(~0);

	switch (n) {
	case 32:
	{
		asm volatile (
		".cpu  generic+lse\n"
		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x8, x9, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x10, x11, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x12, x13, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x14, x15, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x16, x17, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x18, x19, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x20, x21, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x22, x23, %[wdata64], %[wdata128], [%[loc]]\n"
		"fmov d16, x0\n"
		"fmov v16.D[1], x1\n"
		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
		"fmov d17, x2\n"
		"fmov v17.D[1], x3\n"
		"casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
		"fmov d18, x4\n"
		"fmov v18.D[1], x5\n"
		"casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
		"fmov d19, x6\n"
		"fmov v19.D[1], x7\n"
		"casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
		"and %[failed].16B, %[failed].16B, v16.16B\n"
		"and %[failed].16B, %[failed].16B, v17.16B\n"
		"and %[failed].16B, %[failed].16B, v18.16B\n"
		"and %[failed].16B, %[failed].16B, v19.16B\n"
		"fmov d20, x8\n"
		"fmov v20.D[1], x9\n"
		"fmov d21, x10\n"
		"fmov v21.D[1], x11\n"
		"fmov d22, x12\n"
		"fmov v22.D[1], x13\n"
		"fmov d23, x14\n"
		"fmov v23.D[1], x15\n"
		"and %[failed].16B, %[failed].16B, v20.16B\n"
		"and %[failed].16B, %[failed].16B, v21.16B\n"
		"and %[failed].16B, %[failed].16B, v22.16B\n"
		"and %[failed].16B, %[failed].16B, v23.16B\n"
		"st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
		"st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n"
		"fmov d16, x16\n"
		"fmov v16.D[1], x17\n"
		"fmov d17, x18\n"
		"fmov v17.D[1], x19\n"
		"fmov d18, x20\n"
		"fmov v18.D[1], x21\n"
		"fmov d19, x22\n"
		"fmov v19.D[1], x23\n"
		"and %[failed].16B, %[failed].16B, v16.16B\n"
		"and %[failed].16B, %[failed].16B, v17.16B\n"
		"and %[failed].16B, %[failed].16B, v18.16B\n"
		"and %[failed].16B, %[failed].16B, v19.16B\n"
		"fmov d20, x0\n"
		"fmov v20.D[1], x1\n"
		"fmov d21, x2\n"
		"fmov v21.D[1], x3\n"
		"fmov d22, x4\n"
		"fmov v22.D[1], x5\n"
		"fmov d23, x6\n"
		"fmov v23.D[1], x7\n"
		"and %[failed].16B, %[failed].16B, v20.16B\n"
		"and %[failed].16B, %[failed].16B, v21.16B\n"
		"and %[failed].16B, %[failed].16B, v22.16B\n"
		"and %[failed].16B, %[failed].16B, v23.16B\n"
		"st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
		"st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n"
		: "+Q" (*addr), [failed] "+&w" (failed),
		[dst] "+&r" (obj_table)
		: [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
		[loc] "r" (addr)
		: "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
		"x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16",
		"x17", "x18", "x19", "x20", "x21", "x22", "x23", "v16", "v17",
		"v18", "v19", "v20", "v21", "v22", "v23"
		);
		break;
	}
	case 16:
	{
		asm volatile (
		".cpu  generic+lse\n"
		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x8, x9, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x10, x11, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x12, x13, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x14, x15, %[wdata64], %[wdata128], [%[loc]]\n"
		"fmov d16, x0\n"
		"fmov v16.D[1], x1\n"
		"fmov d17, x2\n"
		"fmov v17.D[1], x3\n"
		"fmov d18, x4\n"
		"fmov v18.D[1], x5\n"
		"fmov d19, x6\n"
		"fmov v19.D[1], x7\n"
		"and %[failed].16B, %[failed].16B, v16.16B\n"
		"and %[failed].16B, %[failed].16B, v17.16B\n"
		"and %[failed].16B, %[failed].16B, v18.16B\n"
		"and %[failed].16B, %[failed].16B, v19.16B\n"
		"fmov d20, x8\n"
		"fmov v20.D[1], x9\n"
		"fmov d21, x10\n"
		"fmov v21.D[1], x11\n"
		"fmov d22, x12\n"
		"fmov v22.D[1], x13\n"
		"fmov d23, x14\n"
		"fmov v23.D[1], x15\n"
		"and %[failed].16B, %[failed].16B, v20.16B\n"
		"and %[failed].16B, %[failed].16B, v21.16B\n"
		"and %[failed].16B, %[failed].16B, v22.16B\n"
		"and %[failed].16B, %[failed].16B, v23.16B\n"
		"st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
		"st1 { v20.2d, v21.2d, v22.2d, v23.2d}, [%[dst]], 64\n"
		: "+Q" (*addr), [failed] "+&w" (failed),
		[dst] "+&r" (obj_table)
		: [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
		[loc] "r" (addr)
		: "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
		"x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "v16",
		"v17", "v18", "v19", "v20", "v21", "v22", "v23"
		);
		break;
	}
	case 8:
	{
		asm volatile (
		".cpu  generic+lse\n"
		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x4, x5, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x6, x7, %[wdata64], %[wdata128], [%[loc]]\n"
		"fmov d16, x0\n"
		"fmov v16.D[1], x1\n"
		"fmov d17, x2\n"
		"fmov v17.D[1], x3\n"
		"fmov d18, x4\n"
		"fmov v18.D[1], x5\n"
		"fmov d19, x6\n"
		"fmov v19.D[1], x7\n"
		"and %[failed].16B, %[failed].16B, v16.16B\n"
		"and %[failed].16B, %[failed].16B, v17.16B\n"
		"and %[failed].16B, %[failed].16B, v18.16B\n"
		"and %[failed].16B, %[failed].16B, v19.16B\n"
		"st1 { v16.2d, v17.2d, v18.2d, v19.2d}, [%[dst]], 64\n"
		: "+Q" (*addr), [failed] "+&w" (failed),
		[dst] "+&r" (obj_table)
		: [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
		[loc] "r" (addr)
		: "memory", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
		"v16", "v17", "v18", "v19"
		);
		break;
	}
	case 4:
	{
		asm volatile (
		".cpu  generic+lse\n"
		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
		"casp x2, x3, %[wdata64], %[wdata128], [%[loc]]\n"
		"fmov d16, x0\n"
		"fmov v16.D[1], x1\n"
		"fmov d17, x2\n"
		"fmov v17.D[1], x3\n"
		"and %[failed].16B, %[failed].16B, v16.16B\n"
		"and %[failed].16B, %[failed].16B, v17.16B\n"
		"st1 { v16.2d, v17.2d}, [%[dst]], 32\n"
		: "+Q" (*addr), [failed] "+&w" (failed),
		[dst] "+&r" (obj_table)
		: [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
		[loc] "r" (addr)
		: "memory", "x0", "x1", "x2", "x3", "v16", "v17"
		);
		break;
	}
	case 2:
	{
		asm volatile (
		".cpu  generic+lse\n"
		"casp x0, x1, %[wdata64], %[wdata128], [%[loc]]\n"
		"fmov d16, x0\n"
		"fmov v16.D[1], x1\n"
		"and %[failed].16B, %[failed].16B, v16.16B\n"
		"st1 { v16.2d}, [%[dst]], 16\n"
		: "+Q" (*addr), [failed] "+&w" (failed),
		[dst] "+&r" (obj_table)
		: [wdata64] "r" (wdata64), [wdata128] "r" (wdata128),
		[loc] "r" (addr)
		: "memory", "x0", "x1", "v16"
		);
		break;
	}
	case 1:
		return npa_lf_aura_op_alloc_one(wdata, addr, obj_table, 0);
	}

	if (unlikely(!(vgetq_lane_u64(failed, 0) & vgetq_lane_u64(failed, 1))))
		return npa_lf_aura_op_search_alloc(wdata, addr, (void **)
			((char *)obj_table - (sizeof(uint64_t) * n)), n);

	return 0;
}

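/* Return the objects allocated before a bulk dequeue failure back to the
 * aura and clear their table slots.
 */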
static __rte_noinline void
otx2_npa_clear_alloc(struct rte_mempool *mp, void **obj_table, unsigned int n)
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (obj_table[i] != NULL) {
			otx2_npa_enq(mp, &obj_table[i], 1);
			obj_table[i] = NULL;
		}
	}
}

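/* arm64 dequeue handler: split the request into power-of-two chunks of
 * at most 32 objects serviced by the CASP bulk allocator; on failure,
 * everything allocated so far is freed back before returning -ENOENT.
 */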
static __rte_noinline int __rte_hot
otx2_npa_deq_arm64(struct rte_mempool *mp, void **obj_table, unsigned int n)
{
	const int64_t wdata = npa_lf_aura_handle_to_aura(mp->pool_id);
	void **obj_table_bak = obj_table;
	const unsigned int nfree = n;
	unsigned int parts;

	int64_t * const addr = (int64_t * const)
			(npa_lf_aura_handle_to_base(mp->pool_id) +
				NPA_LF_AURA_OP_ALLOCX(0));
	while (n) {
		parts = n > 31 ? 32 : rte_align32prevpow2(n);
		n -= parts;
		if (unlikely(npa_lf_aura_op_alloc_bulk(wdata, addr,
				parts, obj_table))) {
			otx2_npa_clear_alloc(mp, obj_table_bak, nfree - n);
			return -ENOENT;
		}
		obj_table += parts;
	}

	return 0;
}

#else

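/* Generic dequeue handler: allocate one object at a time and roll back
 * the already dequeued objects on failure.
 */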
static inline int __rte_hot
otx2_npa_deq(struct rte_mempool *mp, void **obj_table, unsigned int n)
{
	const int64_t wdata = npa_lf_aura_handle_to_aura(mp->pool_id);
	unsigned int index;

	int64_t * const addr = (int64_t *)
			(npa_lf_aura_handle_to_base(mp->pool_id) +
				NPA_LF_AURA_OP_ALLOCX(0));
	for (index = 0; index < n; index++, obj_table++) {
		/* npa_lf_aura_op_alloc_one() stores the allocated pointer in
		 * *obj_table and returns 0 on success, -ENOENT on failure.
		 */
		if (npa_lf_aura_op_alloc_one(wdata, addr, obj_table, 0)) {
			/* Free back the objects dequeued so far */
			for (; index > 0; index--) {
				obj_table--;
				otx2_npa_enq(mp, obj_table, 1);
			}
			return -ENOENT;
		}
	}

	return 0;
}

#endif

static unsigned int
otx2_npa_get_count(const struct rte_mempool *mp)
{
	return (unsigned int)npa_lf_aura_op_available(mp->pool_id);
}

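/* Initialize the AURA and POOL contexts with a single batched AF mailbox
 * request and, when this aura is selected by idev->npa_lock_mask, lock
 * both contexts into the NDC.
 */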
static int
npa_lf_aura_pool_init(struct otx2_mbox *mbox, uint32_t aura_id,
		      struct npa_aura_s *aura, struct npa_pool_s *pool)
{
	struct npa_aq_enq_req *aura_init_req, *pool_init_req;
	struct npa_aq_enq_rsp *aura_init_rsp, *pool_init_rsp;
	struct otx2_mbox_dev *mdev = &mbox->dev[0];
	struct otx2_idev_cfg *idev;
	int rc, off;

	idev = otx2_intra_dev_get_cfg();
	if (idev == NULL)
		return -ENOMEM;

	aura_init_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);

	aura_init_req->aura_id = aura_id;
	aura_init_req->ctype = NPA_AQ_CTYPE_AURA;
	aura_init_req->op = NPA_AQ_INSTOP_INIT;
	otx2_mbox_memcpy(&aura_init_req->aura, aura, sizeof(*aura));

	pool_init_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);

	pool_init_req->aura_id = aura_id;
	pool_init_req->ctype = NPA_AQ_CTYPE_POOL;
	pool_init_req->op = NPA_AQ_INSTOP_INIT;
	otx2_mbox_memcpy(&pool_init_req->pool, pool, sizeof(*pool));

	otx2_mbox_msg_send(mbox, 0);
	rc = otx2_mbox_wait_for_rsp(mbox, 0);
	if (rc < 0)
		return rc;

	off = mbox->rx_start +
			RTE_ALIGN(sizeof(struct mbox_hdr), MBOX_MSG_ALIGN);
	aura_init_rsp = (struct npa_aq_enq_rsp *)((uintptr_t)mdev->mbase + off);
	off = mbox->rx_start + aura_init_rsp->hdr.next_msgoff;
	pool_init_rsp = (struct npa_aq_enq_rsp *)((uintptr_t)mdev->mbase + off);

	if (rc == 2 && aura_init_rsp->hdr.rc == 0 && pool_init_rsp->hdr.rc == 0)
		rc = 0;
	else
		rc = NPA_LF_ERR_AURA_POOL_INIT;

	if (rc)
		return rc;

	if (!(idev->npa_lock_mask & BIT_ULL(aura_id)))
		return 0;

	aura_init_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);
	aura_init_req->aura_id = aura_id;
	aura_init_req->ctype = NPA_AQ_CTYPE_AURA;
	aura_init_req->op = NPA_AQ_INSTOP_LOCK;

	pool_init_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);
	if (!pool_init_req) {
		/* The shared memory buffer can be full.
		 * Flush it and retry
		 */
		otx2_mbox_msg_send(mbox, 0);
		rc = otx2_mbox_wait_for_rsp(mbox, 0);
		if (rc < 0) {
			otx2_err("Failed to LOCK AURA context");
			return -ENOMEM;
		}

		pool_init_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);
		if (!pool_init_req) {
			otx2_err("Failed to LOCK POOL context");
			return -ENOMEM;
		}
	}
	pool_init_req->aura_id = aura_id;
	pool_init_req->ctype = NPA_AQ_CTYPE_POOL;
	pool_init_req->op = NPA_AQ_INSTOP_LOCK;

	rc = otx2_mbox_process(mbox);
	if (rc < 0) {
		otx2_err("Failed to lock POOL ctx to NDC");
		return -ENOMEM;
	}

	return 0;
}

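/* Teardown counterpart of npa_lf_aura_pool_init(): disable the POOL and
 * AURA contexts, sync the NPA NDC for this LF and release any NDC locks
 * taken at init time.
 */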
static int
npa_lf_aura_pool_fini(struct otx2_mbox *mbox,
		      uint32_t aura_id,
		      uint64_t aura_handle)
{
	struct npa_aq_enq_req *aura_req, *pool_req;
	struct npa_aq_enq_rsp *aura_rsp, *pool_rsp;
	struct otx2_mbox_dev *mdev = &mbox->dev[0];
	struct ndc_sync_op *ndc_req;
	struct otx2_idev_cfg *idev;
	int rc, off;

	idev = otx2_intra_dev_get_cfg();
	if (idev == NULL)
		return -EINVAL;

	/* Procedure for disabling an aura/pool */
	rte_delay_us(10);
	npa_lf_aura_op_alloc(aura_handle, 0);

	pool_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);
	pool_req->aura_id = aura_id;
	pool_req->ctype = NPA_AQ_CTYPE_POOL;
	pool_req->op = NPA_AQ_INSTOP_WRITE;
	pool_req->pool.ena = 0;
	pool_req->pool_mask.ena = ~pool_req->pool_mask.ena;

	aura_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);
	aura_req->aura_id = aura_id;
	aura_req->ctype = NPA_AQ_CTYPE_AURA;
	aura_req->op = NPA_AQ_INSTOP_WRITE;
	aura_req->aura.ena = 0;
	aura_req->aura_mask.ena = ~aura_req->aura_mask.ena;

	otx2_mbox_msg_send(mbox, 0);
	rc = otx2_mbox_wait_for_rsp(mbox, 0);
	if (rc < 0)
		return rc;

	off = mbox->rx_start +
			RTE_ALIGN(sizeof(struct mbox_hdr), MBOX_MSG_ALIGN);
	pool_rsp = (struct npa_aq_enq_rsp *)((uintptr_t)mdev->mbase + off);

	off = mbox->rx_start + pool_rsp->hdr.next_msgoff;
	aura_rsp = (struct npa_aq_enq_rsp *)((uintptr_t)mdev->mbase + off);

	if (rc != 2 || aura_rsp->hdr.rc != 0 || pool_rsp->hdr.rc != 0)
		return NPA_LF_ERR_AURA_POOL_FINI;

	/* Sync NDC-NPA for LF */
	ndc_req = otx2_mbox_alloc_msg_ndc_sync_op(mbox);
	ndc_req->npa_lf_sync = 1;

	rc = otx2_mbox_process(mbox);
	if (rc) {
		otx2_err("Error on NDC-NPA LF sync, rc %d", rc);
		return NPA_LF_ERR_AURA_POOL_FINI;
	}

	if (!(idev->npa_lock_mask & BIT_ULL(aura_id)))
		return 0;

	aura_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);
	aura_req->aura_id = aura_id;
	aura_req->ctype = NPA_AQ_CTYPE_AURA;
	aura_req->op = NPA_AQ_INSTOP_UNLOCK;

	rc = otx2_mbox_process(mbox);
	if (rc < 0) {
		otx2_err("Failed to unlock AURA ctx to NDC");
		return -EINVAL;
	}

	pool_req = otx2_mbox_alloc_msg_npa_aq_enq(mbox);
	pool_req->aura_id = aura_id;
	pool_req->ctype = NPA_AQ_CTYPE_POOL;
	pool_req->op = NPA_AQ_INSTOP_UNLOCK;

	rc = otx2_mbox_process(mbox);
	if (rc < 0) {
		otx2_err("Failed to unlock POOL ctx to NDC");
		return -EINVAL;
	}

	return 0;
}

static inline char*
npa_lf_stack_memzone_name(struct otx2_npa_lf *lf, int pool_id, char *name)
{
	snprintf(name, RTE_MEMZONE_NAMESIZE, "otx2_npa_stack_%x_%d",
			lf->pf_func, pool_id);

	return name;
}

static inline const struct rte_memzone *
npa_lf_stack_dma_alloc(struct otx2_npa_lf *lf, char *name,
		       int pool_id, size_t size)
{
	return rte_memzone_reserve_aligned(
		npa_lf_stack_memzone_name(lf, pool_id, name), size, 0,
			RTE_MEMZONE_IOVA_CONTIG, OTX2_ALIGN);
}

static inline int
npa_lf_stack_dma_free(struct otx2_npa_lf *lf, char *name, int pool_id)
{
	const struct rte_memzone *mz;

	mz = rte_memzone_lookup(npa_lf_stack_memzone_name(lf, pool_id, name));
	if (mz == NULL)
		return -EINVAL;

	return rte_memzone_free(mz);
}

static inline int
bitmap_ctzll(uint64_t slab)
{
	if (slab == 0)
		return 0;

	return __builtin_ctzll(slab);
}

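/* Reserve an aura-pool pair: pick a free aura id from the resource
 * bitmap, allocate the pool stack memory, fill in the AURA and POOL
 * contexts, issue the INIT ops and prime the aura object count.
 */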
static int
npa_lf_aura_pool_pair_alloc(struct otx2_npa_lf *lf, const uint32_t block_size,
			    const uint32_t block_count, struct npa_aura_s *aura,
			    struct npa_pool_s *pool, uint64_t *aura_handle)
{
	int rc, aura_id, pool_id, stack_size, alloc_size;
	char name[RTE_MEMZONE_NAMESIZE];
	const struct rte_memzone *mz;
	uint64_t slab;
	uint32_t pos;

	/* Sanity check */
	if (!lf || !block_size || !block_count ||
	    !pool || !aura || !aura_handle)
		return NPA_LF_ERR_PARAM;

	/* Block size should be cache line aligned and in range of 128B-128KB */
	if (block_size % OTX2_ALIGN || block_size < 128 ||
	    block_size > 128 * 1024)
		return NPA_LF_ERR_INVALID_BLOCK_SZ;

	pos = slab = 0;
	/* Scan from the beginning */
	__rte_bitmap_scan_init(lf->npa_bmp);
	/* Scan bitmap to get the free pool */
	rc = rte_bitmap_scan(lf->npa_bmp, &pos, &slab);
	/* Empty bitmap */
	if (rc == 0) {
		otx2_err("Mempools exhausted, 'max_pools' devargs to increase");
		return -ERANGE;
	}

	/* Get aura_id from resource bitmap */
	aura_id = pos + bitmap_ctzll(slab);
	/* Mark pool as reserved */
	rte_bitmap_clear(lf->npa_bmp, aura_id);

	/* Each aura is paired with its own dedicated pool (aura-pool pair) */
	pool_id = aura_id;
	rc = (aura_id < 0 || pool_id >= (int)lf->nr_pools || aura_id >=
	      (int)BIT_ULL(6 + lf->aura_sz)) ? NPA_LF_ERR_AURA_ID_ALLOC : 0;
	if (rc)
		goto exit;

	/* Allocate stack memory */
	stack_size = (block_count + lf->stack_pg_ptrs - 1) / lf->stack_pg_ptrs;
	alloc_size = stack_size * lf->stack_pg_bytes;

	mz = npa_lf_stack_dma_alloc(lf, name, pool_id, alloc_size);
	if (mz == NULL) {
		rc = -ENOMEM;
		goto aura_res_put;
	}

	/* Update aura fields */
	aura->pool_addr = pool_id; /* AF will translate to associated poolctx */
	aura->ena = 1;
	aura->shift = __builtin_clz(block_count) - 8;
	aura->limit = block_count;
	aura->pool_caching = 1;
	aura->err_int_ena = BIT(NPA_AURA_ERR_INT_AURA_ADD_OVER);
	aura->err_int_ena |= BIT(NPA_AURA_ERR_INT_AURA_ADD_UNDER);
	aura->err_int_ena |= BIT(NPA_AURA_ERR_INT_AURA_FREE_UNDER);
	aura->err_int_ena |= BIT(NPA_AURA_ERR_INT_POOL_DIS);
	/* Many to one reduction */
	aura->err_qint_idx = aura_id % lf->qints;

	/* Update pool fields */
	pool->stack_base = mz->iova;
	pool->ena = 1;
	pool->buf_size = block_size / OTX2_ALIGN;
	pool->stack_max_pages = stack_size;
	pool->shift = __builtin_clz(block_count) - 8;
	pool->ptr_start = 0;
	pool->ptr_end = ~0;
	pool->stack_caching = 1;
	pool->err_int_ena = BIT(NPA_POOL_ERR_INT_OVFLS);
	pool->err_int_ena |= BIT(NPA_POOL_ERR_INT_RANGE);
	pool->err_int_ena |= BIT(NPA_POOL_ERR_INT_PERR);

	/* Many to one reduction */
	pool->err_qint_idx = pool_id % lf->qints;

	/* Issue AURA_INIT and POOL_INIT op */
	rc = npa_lf_aura_pool_init(lf->mbox, aura_id, aura, pool);
	if (rc)
		goto stack_mem_free;

	*aura_handle = npa_lf_aura_handle_gen(aura_id, lf->base);

	/* Update aura count */
	npa_lf_aura_op_cnt_set(*aura_handle, 0, block_count);
	/* Read it back to make sure aura count is updated */
	npa_lf_aura_op_cnt_get(*aura_handle);

	return 0;

stack_mem_free:
	rte_memzone_free(mz);
aura_res_put:
	rte_bitmap_set(lf->npa_bmp, aura_id);
exit:
	return rc;
}

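/* Release an aura-pool pair: finalize the hardware contexts, free the
 * stack memzone and return the aura id to the resource bitmap.
 */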
static int
npa_lf_aura_pool_pair_free(struct otx2_npa_lf *lf, uint64_t aura_handle)
{
	char name[RTE_MEMZONE_NAMESIZE];
	int aura_id, pool_id, rc;

	if (!lf || !aura_handle)
		return NPA_LF_ERR_PARAM;

	aura_id = pool_id = npa_lf_aura_handle_to_aura(aura_handle);
	rc = npa_lf_aura_pool_fini(lf->mbox, aura_id, aura_handle);
	rc |= npa_lf_stack_dma_free(lf, name, pool_id);

	rte_bitmap_set(lf->npa_bmp, aura_id);

	return rc;
}

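/* Read the POOL context back from the AF and verify that the pointer
 * range programmed via npa_lf_aura_op_range_set() has taken effect.
 */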
static int
npa_lf_aura_range_update_check(uint64_t aura_handle)
{
	uint64_t aura_id = npa_lf_aura_handle_to_aura(aura_handle);
	struct otx2_npa_lf *lf = otx2_npa_lf_obj_get();
	struct npa_aura_lim *lim = lf->aura_lim;
	__otx2_io struct npa_pool_s *pool;
	struct npa_aq_enq_req *req;
	struct npa_aq_enq_rsp *rsp;
	int rc;

	req = otx2_mbox_alloc_msg_npa_aq_enq(lf->mbox);

	req->aura_id = aura_id;
	req->ctype = NPA_AQ_CTYPE_POOL;
	req->op = NPA_AQ_INSTOP_READ;

	rc = otx2_mbox_process_msg(lf->mbox, (void *)&rsp);
	if (rc) {
		otx2_err("Failed to get pool(0x%"PRIx64") context", aura_id);
		return rc;
	}

	pool = &rsp->pool;

	if (lim[aura_id].ptr_start != pool->ptr_start ||
		lim[aura_id].ptr_end != pool->ptr_end) {
		otx2_err("Range update failed on pool(0x%"PRIx64")", aura_id);
		return -ERANGE;
	}

	return 0;
}

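/* Mempool ops alloc handler: pad the element size so elements spread
 * evenly across L1D cache sets, validate the block geometry and create
 * the backing aura-pool pair.
 */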
static int
otx2_npa_alloc(struct rte_mempool *mp)
{
	uint32_t block_size, block_count;
	uint64_t aura_handle = 0;
	struct otx2_npa_lf *lf;
	struct npa_aura_s aura;
	struct npa_pool_s pool;
	size_t padding;
	int rc;

	lf = otx2_npa_lf_obj_get();
	if (lf == NULL) {
		rc = -EINVAL;
		goto error;
	}

	block_size = mp->elt_size + mp->header_size + mp->trailer_size;
	/*
	 * OCTEON TX2 has 8 sets, 41 ways L1D cache, VA<9:7> bits dictate
	 * the set selection.
	 * Add additional padding to ensure that the element size always
	 * occupies odd number of cachelines to ensure even distribution
	 * of elements among L1D cache sets.
	 */
	padding = ((block_size / RTE_CACHE_LINE_SIZE) % 2) ? 0 :
				RTE_CACHE_LINE_SIZE;
	mp->trailer_size += padding;
	block_size += padding;

	block_count = mp->size;

	if (block_size % OTX2_ALIGN != 0) {
		otx2_err("Block size should be multiple of 128B");
		rc = -ERANGE;
		goto error;
	}

	memset(&aura, 0, sizeof(struct npa_aura_s));
	memset(&pool, 0, sizeof(struct npa_pool_s));
	pool.nat_align = 1;
	pool.buf_offset = 1;

	if ((uint32_t)pool.buf_offset * OTX2_ALIGN != mp->header_size) {
		otx2_err("Unsupported mp->header_size=%d", mp->header_size);
		rc = -EINVAL;
		goto error;
	}

	/* Use driver specific mp->pool_config to override aura config */
	if (mp->pool_config != NULL)
		memcpy(&aura, mp->pool_config, sizeof(struct npa_aura_s));

	rc = npa_lf_aura_pool_pair_alloc(lf, block_size, block_count,
			 &aura, &pool, &aura_handle);
	if (rc) {
		otx2_err("Failed to alloc pool or aura rc=%d", rc);
		goto error;
	}

	/* Store aura_handle for future queue operations */
	mp->pool_id = aura_handle;
	otx2_npa_dbg("lf=%p block_sz=%d block_count=%d aura_handle=0x%"PRIx64,
		     lf, block_size, block_count, aura_handle);

	/* Just hold the reference of the object */
	otx2_npa_lf_obj_ref();
	return 0;
error:
	return rc;
}

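/* Mempool ops free handler: destroy the aura-pool pair and drop the
 * NPA LF reference taken in otx2_npa_alloc().
 */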
static void
otx2_npa_free(struct rte_mempool *mp)
{
	struct otx2_npa_lf *lf = otx2_npa_lf_obj_get();
	int rc = 0;

	otx2_npa_dbg("lf=%p aura_handle=0x%"PRIx64, lf, mp->pool_id);
	if (lf != NULL)
		rc = npa_lf_aura_pool_pair_free(lf, mp->pool_id);

	if (rc)
		otx2_err("Failed to free pool or aura rc=%d", rc);

	/* Release the reference of npalf */
	otx2_npa_lf_fini();
}

static ssize_t
otx2_npa_calc_mem_size(const struct rte_mempool *mp, uint32_t obj_num,
		       uint32_t pg_shift, size_t *min_chunk_size, size_t *align)
{
	size_t total_elt_sz;

	/* Need space for one more obj on each chunk to fulfill
	 * alignment requirements.
	 */
	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
	return rte_mempool_op_calc_mem_size_helper(mp, obj_num, pg_shift,
						total_elt_sz, min_chunk_size,
						align);
}

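/* L1D cache set index of an IOVA: the three bits just above the cache
 * line offset, i.e. VA<9:7> with the 128B cache lines used here.
 */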
static uint8_t
otx2_npa_l1d_way_set_get(uint64_t iova)
{
	return (iova >> rte_log2_u32(RTE_CACHE_LINE_SIZE)) & 0x7;
}

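/* Mempool ops populate handler: align the first object to the element
 * size, log the resulting L1D set distribution, program the aura pointer
 * range and hand off to the generic populate helper.
 */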
static int
otx2_npa_populate(struct rte_mempool *mp, unsigned int max_objs, void *vaddr,
		  rte_iova_t iova, size_t len,
		  rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
{
#define OTX2_L1D_NB_SETS	8
	uint64_t distribution[OTX2_L1D_NB_SETS];
	rte_iova_t start_iova;
	size_t total_elt_sz;
	uint8_t set;
	size_t off;
	int i;

	if (iova == RTE_BAD_IOVA)
		return -EINVAL;

	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;

	/* Align object start address to a multiple of total_elt_sz */
	off = total_elt_sz - ((((uintptr_t)vaddr - 1) % total_elt_sz) + 1);

	if (len < off)
		return -EINVAL;

	vaddr = (char *)vaddr + off;
	iova += off;
	len -= off;

	memset(distribution, 0, sizeof(uint64_t) * OTX2_L1D_NB_SETS);
	start_iova = iova;
	while (start_iova < iova + len) {
		set = otx2_npa_l1d_way_set_get(start_iova + mp->header_size);
		distribution[set]++;
		start_iova += total_elt_sz;
	}

	otx2_npa_dbg("iova %"PRIx64", aligned iova %"PRIx64"", iova - off,
		     iova);
	otx2_npa_dbg("length %"PRIu64", aligned length %"PRIu64"",
		     (uint64_t)(len + off), (uint64_t)len);
	otx2_npa_dbg("element size %"PRIu64"", (uint64_t)total_elt_sz);
	otx2_npa_dbg("requested objects %"PRIu64", possible objects %"PRIu64"",
		     (uint64_t)max_objs, (uint64_t)(len / total_elt_sz));
	otx2_npa_dbg("L1D set distribution :");
	for (i = 0; i < OTX2_L1D_NB_SETS; i++)
		otx2_npa_dbg("set[%d] : objects : %"PRIu64"", i,
			     distribution[i]);

	npa_lf_aura_op_range_set(mp->pool_id, iova, iova + len);

	if (npa_lf_aura_range_update_check(mp->pool_id) < 0)
		return -EBUSY;

	return rte_mempool_op_populate_helper(mp,
					RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ,
					max_objs, vaddr, iova, len,
					obj_cb, obj_cb_arg);
}

static struct rte_mempool_ops otx2_npa_ops = {
	.name = "octeontx2_npa",
	.alloc = otx2_npa_alloc,
	.free = otx2_npa_free,
	.enqueue = otx2_npa_enq,
	.get_count = otx2_npa_get_count,
	.calc_mem_size = otx2_npa_calc_mem_size,
	.populate = otx2_npa_populate,
#if defined(RTE_ARCH_ARM64)
	.dequeue = otx2_npa_deq_arm64,
#else
	.dequeue = otx2_npa_deq,
#endif
};

MEMPOOL_REGISTER_OPS(otx2_npa_ops);