1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2017 Intel Corporation
3  */
4 
5 #ifndef _DIST_PRIV_H_
6 #define _DIST_PRIV_H_
7 
8 /**
9  * @file
10  * RTE distributor
11  *
12  * The distributor is a component which is designed to pass packets
13  * one-at-a-time to workers, with dynamic load balancing.
14  */
15 
16 #ifdef __cplusplus
17 extern "C" {
18 #endif
19 
20 #define NO_FLAGS 0
21 #define RTE_DISTRIB_PREFIX "DT_"
22 
23 /*
24  * We will use the bottom four bits of pointer for flags, shifting out
25  * the top four bits to make room (since a 64-bit pointer actually only uses
26  * 48 bits). An arithmetic-right-shift will then appropriately restore the
27  * original pointer value with proper sign extension into the top bits.
28  */
29 #define RTE_DISTRIB_FLAG_BITS 4
30 #define RTE_DISTRIB_FLAGS_MASK (0x0F)
31 #define RTE_DISTRIB_NO_BUF 0       /**< empty flags: no buffer requested */
32 #define RTE_DISTRIB_GET_BUF (1)    /**< worker requests a buffer, returns old */
33 #define RTE_DISTRIB_RETURN_BUF (2) /**< worker returns a buffer, no request */
34 #define RTE_DISTRIB_VALID_BUF (4)  /**< set if bufptr contains ptr */
35 
36 #define RTE_DISTRIB_BACKLOG_SIZE 8
37 #define RTE_DISTRIB_BACKLOG_MASK (RTE_DISTRIB_BACKLOG_SIZE - 1)
38 
39 #define RTE_DISTRIB_MAX_RETURNS 128
40 #define RTE_DISTRIB_RETURNS_MASK (RTE_DISTRIB_MAX_RETURNS - 1)
41 
42 /**
43  * Maximum number of workers allowed.
44  * Be aware of increasing the limit, because it is limited by how we track
45  * in-flight tags. See in_flight_bitmask and rte_distributor_process
46  */
47 #define RTE_DISTRIB_MAX_WORKERS 64
48 
49 #define RTE_DISTRIBUTOR_NAMESIZE 32 /**< Length of name for instance */
50 
51 /**
52  * Buffer structure used to pass the pointer data between cores. This is cache
53  * line aligned, but to improve performance and prevent adjacent cache-line
54  * prefetches of buffers for other workers, e.g. when worker 1's buffer is on
55  * the next cache line to worker 0, we pad this out to three cache lines.
56  * Only 64-bits of the memory is actually used though.
57  */
58 union rte_distributor_buffer_single {
59 	volatile int64_t bufptr64;
60 	char pad[RTE_CACHE_LINE_SIZE*3];
61 } __rte_cache_aligned;
62 
63 /*
64  * Transfer up to 8 mbufs at a time to/from workers, and
65  * flow matching algorithm optimized for 8 flow IDs at a time
66  */
67 #define RTE_DIST_BURST_SIZE 8
68 
/** Per-worker backlog of packets waiting to be handed to that worker. */
struct rte_distributor_backlog {
	unsigned int start; /**< index of oldest entry in pkts[]; presumably
			     *   wrapped with RTE_DISTRIB_BACKLOG_MASK — verify
			     *   against rte_distributor_process() */
	unsigned int count; /**< number of valid entries in pkts[] */
	int64_t pkts[RTE_DIST_BURST_SIZE] __rte_cache_aligned; /**< queued
			     *   packets, stored in the same int64 encoding as
			     *   the worker buffers */
	uint16_t *tags; /* will point to second cacheline of inflights */
} __rte_cache_aligned;
75 
76 
/** FIFO of mbufs handed back by workers, awaiting collection by the caller. */
struct rte_distributor_returned_pkts {
	unsigned int start; /**< index of oldest entry in mbufs[]; presumably
			     *   wrapped with RTE_DISTRIB_RETURNS_MASK — verify
			     *   against the return-handling code */
	unsigned int count; /**< number of valid entries in mbufs[] */
	struct rte_mbuf *mbufs[RTE_DISTRIB_MAX_RETURNS]; /**< returned mbufs */
};
82 
/**
 * Internal state for the single-packet (legacy) distributor instance.
 * One entry per instance, linked into a global tailq by name.
 */
struct rte_distributor_single {
	TAILQ_ENTRY(rte_distributor_single) next;    /**< Next in list. */

	char name[RTE_DISTRIBUTOR_NAMESIZE];  /**< Name of the ring. */
	unsigned int num_workers;             /**< Number of workers polling */

	uint32_t in_flight_tags[RTE_DISTRIB_MAX_WORKERS];
		/**< Tracks the tag being processed per core */
	uint64_t in_flight_bitmask;
		/**< on/off bits for in-flight tags.
		 * Note that if RTE_DISTRIB_MAX_WORKERS is larger than 64 then
		 * the bitmask has to expand.
		 */

	/** Per-worker queue of packets not yet passed to that worker. */
	struct rte_distributor_backlog backlog[RTE_DISTRIB_MAX_WORKERS];

	/** Per-worker exchange slot (one 3-cache-line padded union each). */
	union rte_distributor_buffer_single bufs[RTE_DISTRIB_MAX_WORKERS];

	/** mbufs returned by workers, pending pickup by the caller. */
	struct rte_distributor_returned_pkts returns;
};
103 
/** Selector for the flow-tag (signature) matching implementation. */
enum rte_distributor_match_function {
	RTE_DIST_MATCH_SCALAR = 0, /**< portable scalar implementation */
	RTE_DIST_MATCH_VECTOR,     /**< SIMD implementation (find_match_vec) */
	RTE_DIST_NUM_MATCH_FNS     /**< number of entries; not a valid choice */
};
110 
111 /**
112  * Buffer structure used to pass the pointer data between cores. This is cache
113  * line aligned, but to improve performance and prevent adjacent cache-line
114  * prefetches of buffers for other workers, e.g. when worker 1's buffer is on
115  * the next cache line to worker 0, we pad this out to two cache lines.
116  * We can pass up to 8 mbufs at a time in one cacheline.
117  * There is a separate cacheline for returns in the burst API.
118  */
119 struct rte_distributor_buffer {
120 	volatile int64_t bufptr64[RTE_DIST_BURST_SIZE]
121 		__rte_cache_aligned; /* <= outgoing to worker */
122 
123 	int64_t pad1 __rte_cache_aligned;    /* <= one cache line  */
124 
125 	volatile int64_t retptr64[RTE_DIST_BURST_SIZE]
126 		__rte_cache_aligned; /* <= incoming from worker */
127 
128 	int64_t pad2 __rte_cache_aligned;    /* <= one cache line  */
129 
130 	int count __rte_cache_aligned;       /* <= number of current mbufs */
131 };
132 
/**
 * Internal state for a burst-mode distributor instance.
 * One entry per instance, linked into a global tailq by name.
 */
struct rte_distributor {
	TAILQ_ENTRY(rte_distributor) next;    /**< Next in list. */

	char name[RTE_DISTRIBUTOR_NAMESIZE];  /**< Name of the ring. */
	unsigned int num_workers;             /**< Number of workers polling */
	unsigned int alg_type;                /**< Number of alg types */

	/**
	 * First cache line in this array are the tags inflight
	 * on the worker core. Second cache line are the backlog
	 * that are going to go to the worker core.
	 */
	uint16_t in_flight_tags[RTE_DISTRIB_MAX_WORKERS][RTE_DIST_BURST_SIZE*2]
			__rte_cache_aligned;

	/** Per-worker queue of packets not yet passed to that worker. */
	struct rte_distributor_backlog backlog[RTE_DISTRIB_MAX_WORKERS]
			__rte_cache_aligned;

	/** Per-worker burst exchange buffers (outgoing/incoming arrays). */
	struct rte_distributor_buffer bufs[RTE_DISTRIB_MAX_WORKERS];

	/** mbufs returned by workers, pending pickup by the caller. */
	struct rte_distributor_returned_pkts returns;

	/** Which tag-matching implementation this instance uses. */
	enum rte_distributor_match_function dist_match_fn;

	/** Fallback single-packet distributor — presumably used when
	 * alg_type selects the legacy algorithm; verify against create().
	 */
	struct rte_distributor_single *d_single;

	uint8_t active[RTE_DISTRIB_MAX_WORKERS]; /**< per-worker active flag */
	uint8_t activesum;                       /**< count of active workers */
};
162 
/**
 * Scalar implementation of flow-tag matching (RTE_DIST_MATCH_SCALAR).
 *
 * @param d          distributor instance whose in-flight/backlog tags are
 *                   searched
 * @param data_ptr   input flow tags for the incoming burst
 * @param output_ptr receives the per-packet match results
 *
 * NOTE(review): exact encoding of data_ptr/output_ptr is defined by the
 * implementation in the .c file — confirm there before relying on it.
 */
void
find_match_scalar(struct rte_distributor *d,
			uint16_t *data_ptr,
			uint16_t *output_ptr);
167 
/**
 * Vector (SIMD) implementation of flow-tag matching (RTE_DIST_MATCH_VECTOR).
 * Same contract as find_match_scalar().
 *
 * @param d          distributor instance whose in-flight/backlog tags are
 *                   searched
 * @param data_ptr   input flow tags for the incoming burst
 * @param output_ptr receives the per-packet match results
 */
void
find_match_vec(struct rte_distributor *d,
			uint16_t *data_ptr,
			uint16_t *output_ptr);
172 
173 #ifdef __cplusplus
174 }
175 #endif
176 
177 #endif /* _DIST_PRIV_H_ */
178