xref: /f-stack/dpdk/lib/librte_fib/trie_avx512.c (revision 2d9fd380)
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

#include <rte_vect.h>
#include <rte_fib6.h>

#include "trie.h"
#include "trie_avx512.h"

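/*
 * Transpose 16 IPv6 addresses from the row-major ips[16][16] layout to
 * struct-of-arrays form: *first receives dword 0 of every address,
 * *second dword 1, *third dword 2 and *fourth dword 3, so each lookup
 * step below can process the same 4-byte chunk of all 16 addresses at
 * once.
 */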
static __rte_always_inline void
transpose_x16(uint8_t ips[16][RTE_FIB6_IPV6_ADDR_SIZE],
	__m512i *first, __m512i *second, __m512i *third, __m512i *fourth)
{
	__m512i tmp1, tmp2, tmp3, tmp4;
	__m512i tmp5, tmp6, tmp7, tmp8;
	const __rte_x86_zmm_t perm_idxes = {
		.u32 = { 0, 4, 8, 12, 2, 6, 10, 14,
			1, 5, 9, 13, 3, 7, 11, 15
		},
	};

	/* load all ip addresses */
	tmp1 = _mm512_loadu_si512(&ips[0][0]);
	tmp2 = _mm512_loadu_si512(&ips[4][0]);
	tmp3 = _mm512_loadu_si512(&ips[8][0]);
	tmp4 = _mm512_loadu_si512(&ips[12][0]);

	/* transpose 4 byte chunks of 16 ips */
	tmp5 = _mm512_unpacklo_epi32(tmp1, tmp2);
	tmp7 = _mm512_unpackhi_epi32(tmp1, tmp2);
	tmp6 = _mm512_unpacklo_epi32(tmp3, tmp4);
	tmp8 = _mm512_unpackhi_epi32(tmp3, tmp4);

	tmp1 = _mm512_unpacklo_epi32(tmp5, tmp6);
	tmp3 = _mm512_unpackhi_epi32(tmp5, tmp6);
	tmp2 = _mm512_unpacklo_epi32(tmp7, tmp8);
	tmp4 = _mm512_unpackhi_epi32(tmp7, tmp8);

	/* first 4-byte chunks of ips[] */
	*first = _mm512_permutexvar_epi32(perm_idxes.z, tmp1);
	/* second 4-byte chunks of ips[] */
	*second = _mm512_permutexvar_epi32(perm_idxes.z, tmp3);
	/* third 4-byte chunks of ips[] */
	*third = _mm512_permutexvar_epi32(perm_idxes.z, tmp2);
	/* fourth 4-byte chunks of ips[] */
	*fourth = _mm512_permutexvar_epi32(perm_idxes.z, tmp4);
}

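/*
 * Same idea for the 8-address path, but in 64-bit lanes: *first
 * receives the low 8 bytes of every address, *second the high 8 bytes.
 */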
static __rte_always_inline void
transpose_x8(uint8_t ips[8][RTE_FIB6_IPV6_ADDR_SIZE],
	__m512i *first, __m512i *second)
{
	__m512i tmp1, tmp2, tmp3, tmp4;
	const __rte_x86_zmm_t perm_idxes = {
		.u64 = { 0, 2, 4, 6, 1, 3, 5, 7
		},
	};

	tmp1 = _mm512_loadu_si512(&ips[0][0]);
	tmp2 = _mm512_loadu_si512(&ips[4][0]);

	tmp3 = _mm512_unpacklo_epi64(tmp1, tmp2);
	*first = _mm512_permutexvar_epi64(perm_idxes.z, tmp3);
	tmp4 = _mm512_unpackhi_epi64(tmp1, tmp2);
	*second = _mm512_permutexvar_epi64(perm_idxes.z, tmp4);
}

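/*
 * Vector lookup of 16 IPv6 addresses in a trie with 2- or 4-byte table
 * entries (size selects the gather scale). Entries hold the next hop
 * shifted left by one; the LSB flags an "extended" entry whose payload
 * is the index of a tbl8 group to walk with one more address byte.
 */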
static __rte_always_inline void
trie_vec_lookup_x16(void *p, uint8_t ips[16][RTE_FIB6_IPV6_ADDR_SIZE],
	uint64_t *next_hops, int size)
{
	struct rte_trie_tbl *dp = (struct rte_trie_tbl *)p;
	const __m512i zero = _mm512_set1_epi32(0);
	const __m512i lsb = _mm512_set1_epi32(1);
	const __m512i two_lsb = _mm512_set1_epi32(3);
	__m512i first, second, third, fourth; /*< IPv6 four byte chunks */
	__m512i idxes, res, shuf_idxes;
	__m512i tmp, tmp2, bytes, byte_chunk, base_idxes;
	/* used to mask gather values if size is 2 (16 bit next hops) */
	const __m512i res_msk = _mm512_set1_epi32(UINT16_MAX);
	const __rte_x86_zmm_t bswap = {
		.u8 = { 2, 1, 0, 255, 6, 5, 4, 255,
			10, 9, 8, 255, 14, 13, 12, 255,
			2, 1, 0, 255, 6, 5, 4, 255,
			10, 9, 8, 255, 14, 13, 12, 255,
			2, 1, 0, 255, 6, 5, 4, 255,
			10, 9, 8, 255, 14, 13, 12, 255,
			2, 1, 0, 255, 6, 5, 4, 255,
			10, 9, 8, 255, 14, 13, 12, 255
			},
	};
	const __mmask64 k = 0x1111111111111111;
	int i = 3;
	__mmask16 msk_ext, new_msk;
	__mmask16 exp_msk = 0x5555;
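	/*
	 * k selects byte 0 of every 32-bit lane for the byte shuffles
	 * below; exp_msk spreads eight dwords into even positions so the
	 * results zero-extend to 64 bits; i = 3 is the index of the next
	 * address byte to consume (tbl24 covers bytes 0-2).
	 */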

	transpose_x16(ips, &first, &second, &third, &fourth);

	/* get_tbl24_idx() for every 4 byte chunk */
	idxes = _mm512_shuffle_epi8(first, bswap.z);

	/**
	 * lookup in tbl24
	 * Put it inside a branch to make the compiler happy with -O0
	 */
	if (size == sizeof(uint16_t)) {
		res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 2);
		res = _mm512_and_epi32(res, res_msk);
	} else
		res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 4);

	/* get extended entries indexes */
	msk_ext = _mm512_test_epi32_mask(res, lsb);

	tmp = _mm512_srli_epi32(res, 1);

	/* idxes to retrieve bytes */
	shuf_idxes = _mm512_setr_epi32(3, 7, 11, 15,
				19, 23, 27, 31,
				35, 39, 43, 47,
				51, 55, 59, 63);

	base_idxes = _mm512_setr_epi32(0, 4, 8, 12,
				16, 20, 24, 28,
				32, 36, 40, 44,
				48, 52, 56, 60);

	/* traverse down the trie */
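	/*
	 * Each iteration forms idxes = (tbl8 group << 8) + next address
	 * byte, gathers the tbl8 entries and keeps walking only lanes
	 * whose new entry still has the LSB set. shuf_idxes rotates
	 * through the four byte positions of the current dword chunk,
	 * while i selects which chunk (first..fourth) holds that byte.
	 */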
	while (msk_ext) {
		idxes = _mm512_maskz_slli_epi32(msk_ext, tmp, 8);
		byte_chunk = (i < 8) ?
			((i >= 4) ? second : first) :
			((i >= 12) ? fourth : third);
		bytes = _mm512_maskz_shuffle_epi8(k, byte_chunk, shuf_idxes);
		idxes = _mm512_maskz_add_epi32(msk_ext, idxes, bytes);
		if (size == sizeof(uint16_t)) {
			tmp = _mm512_mask_i32gather_epi32(zero, msk_ext,
				idxes, (const int *)dp->tbl8, 2);
			tmp = _mm512_and_epi32(tmp, res_msk);
		} else
			tmp = _mm512_mask_i32gather_epi32(zero, msk_ext,
				idxes, (const int *)dp->tbl8, 4);
		new_msk = _mm512_test_epi32_mask(tmp, lsb);
		res = _mm512_mask_blend_epi32(msk_ext ^ new_msk, res, tmp);
		tmp = _mm512_srli_epi32(tmp, 1);
		msk_ext = new_msk;

		shuf_idxes = _mm512_maskz_add_epi8(k, shuf_idxes, lsb);
		shuf_idxes = _mm512_and_epi32(shuf_idxes, two_lsb);
		shuf_idxes = _mm512_maskz_add_epi8(k, shuf_idxes, base_idxes);
		i++;
	}

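	/*
	 * Drop the extended-entry flag bit, then widen the sixteen 32-bit
	 * results into the sixteen 64-bit next_hops slots: expand the low
	 * eight dwords in place and the high eight via a 256-bit extract.
	 */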
	res = _mm512_srli_epi32(res, 1);
	tmp = _mm512_maskz_expand_epi32(exp_msk, res);
	__m256i tmp256;
	tmp256 = _mm512_extracti32x8_epi32(res, 1);
	tmp2 = _mm512_maskz_expand_epi32(exp_msk,
		_mm512_castsi256_si512(tmp256));
	_mm512_storeu_si512(next_hops, tmp);
	_mm512_storeu_si512(next_hops + 8, tmp2);
}

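/*
 * Vector lookup of 8 IPv6 addresses in a trie with 8-byte table
 * entries. Same walk as above, but everything stays in 64-bit lanes,
 * so no result masking or widening is needed before the final store.
 */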
static void
trie_vec_lookup_x8_8b(void *p, uint8_t ips[8][RTE_FIB6_IPV6_ADDR_SIZE],
	uint64_t *next_hops)
{
	struct rte_trie_tbl *dp = (struct rte_trie_tbl *)p;
	const __m512i zero = _mm512_set1_epi32(0);
	const __m512i lsb = _mm512_set1_epi32(1);
	const __m512i three_lsb = _mm512_set1_epi32(7);
	__m512i first, second; /*< IPv6 eight byte chunks */
	__m512i idxes, res, shuf_idxes;
	__m512i tmp, bytes, byte_chunk, base_idxes;
	const __rte_x86_zmm_t bswap = {
		.u8 = { 2, 1, 0, 255, 255, 255, 255, 255,
			10, 9, 8, 255, 255, 255, 255, 255,
			2, 1, 0, 255, 255, 255, 255, 255,
			10, 9, 8, 255, 255, 255, 255, 255,
			2, 1, 0, 255, 255, 255, 255, 255,
			10, 9, 8, 255, 255, 255, 255, 255,
			2, 1, 0, 255, 255, 255, 255, 255,
			10, 9, 8, 255, 255, 255, 255, 255
			},
	};
	const __mmask64 k = 0x101010101010101;
	int i = 3;
	__mmask8 msk_ext, new_msk;

	transpose_x8(ips, &first, &second);

	/* get_tbl24_idx() for every 8 byte chunk */
	idxes = _mm512_shuffle_epi8(first, bswap.z);

	/* lookup in tbl24 */
	res = _mm512_i64gather_epi64(idxes, (const void *)dp->tbl24, 8);
	/* get extended entries indexes */
	msk_ext = _mm512_test_epi64_mask(res, lsb);

	tmp = _mm512_srli_epi64(res, 1);

	/* idxes to retrieve bytes */
	shuf_idxes = _mm512_setr_epi64(3, 11, 19, 27, 35, 43, 51, 59);

	base_idxes = _mm512_setr_epi64(0, 8, 16, 24, 32, 40, 48, 56);

	/* traverse down the trie */
	while (msk_ext) {
		idxes = _mm512_maskz_slli_epi64(msk_ext, tmp, 8);
		byte_chunk = (i < 8) ? first : second;
		bytes = _mm512_maskz_shuffle_epi8(k, byte_chunk, shuf_idxes);
		idxes = _mm512_maskz_add_epi64(msk_ext, idxes, bytes);
		tmp = _mm512_mask_i64gather_epi64(zero, msk_ext,
				idxes, (const void *)dp->tbl8, 8);
		new_msk = _mm512_test_epi64_mask(tmp, lsb);
		res = _mm512_mask_blend_epi64(msk_ext ^ new_msk, res, tmp);
		tmp = _mm512_srli_epi64(tmp, 1);
		msk_ext = new_msk;

		shuf_idxes = _mm512_maskz_add_epi8(k, shuf_idxes, lsb);
		shuf_idxes = _mm512_and_epi64(shuf_idxes, three_lsb);
		shuf_idxes = _mm512_maskz_add_epi8(k, shuf_idxes, base_idxes);
		i++;
	}

	res = _mm512_srli_epi64(res, 1);
	_mm512_storeu_si512(next_hops, res);
}

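/*
 * Public bulk entry points: process as many addresses as possible in
 * AVX512 batches (16 at a time for 2- and 4-byte entries, 8 at a time
 * for 8-byte entries) and hand the remainder to the scalar lookup.
 */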
void
rte_trie_vec_lookup_bulk_2b(void *p, uint8_t ips[][RTE_FIB6_IPV6_ADDR_SIZE],
	uint64_t *next_hops, const unsigned int n)
{
	uint32_t i;
	for (i = 0; i < (n / 16); i++) {
		trie_vec_lookup_x16(p, (uint8_t (*)[16])&ips[i * 16][0],
				next_hops + i * 16, sizeof(uint16_t));
	}
	rte_trie_lookup_bulk_2b(p, (uint8_t (*)[16])&ips[i * 16][0],
			next_hops + i * 16, n - i * 16);
}

void
rte_trie_vec_lookup_bulk_4b(void *p, uint8_t ips[][RTE_FIB6_IPV6_ADDR_SIZE],
	uint64_t *next_hops, const unsigned int n)
{
	uint32_t i;
	for (i = 0; i < (n / 16); i++) {
		trie_vec_lookup_x16(p, (uint8_t (*)[16])&ips[i * 16][0],
				next_hops + i * 16, sizeof(uint32_t));
	}
	rte_trie_lookup_bulk_4b(p, (uint8_t (*)[16])&ips[i * 16][0],
			next_hops + i * 16, n - i * 16);
}

void
rte_trie_vec_lookup_bulk_8b(void *p, uint8_t ips[][RTE_FIB6_IPV6_ADDR_SIZE],
	uint64_t *next_hops, const unsigned int n)
{
	uint32_t i;
	for (i = 0; i < (n / 8); i++) {
		trie_vec_lookup_x8_8b(p, (uint8_t (*)[16])&ips[i * 8][0],
				next_hops + i * 8);
	}
	rte_trie_lookup_bulk_8b(p, (uint8_t (*)[16])&ips[i * 8][0],
			next_hops + i * 8, n - i * 8);
}
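
/*
 * Usage sketch (illustrative only, not part of this file). These
 * functions are not called directly: they implement the AVX512 lookup
 * path of an rte_fib6 of type RTE_FIB6_TRIE, reached through
 * rte_fib6_lookup_bulk(). API names below are from rte_fib6.h; check
 * them against your DPDK version.
 *
 *	struct rte_fib6_conf conf = {
 *		.type = RTE_FIB6_TRIE,
 *		.default_nh = 0,
 *		.max_routes = 1 << 16,
 *		.trie = { .nh_sz = RTE_FIB6_TRIE_4B, .num_tbl8 = 1 << 15 },
 *	};
 *	struct rte_fib6 *fib = rte_fib6_create("example", SOCKET_ID_ANY, &conf);
 *	uint8_t ips[16][RTE_FIB6_IPV6_ADDR_SIZE];
 *	uint64_t hops[16];
 *
 *	// fill ips[], add routes with rte_fib6_add(), then:
 *	rte_fib6_lookup_bulk(fib, ips, hops, 16);
 */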