1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2020 Alexander V. Chernikov
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_route.h"
33
34 #include <sys/param.h>
35 #include <sys/jail.h>
36 #include <sys/systm.h>
37 #include <sys/malloc.h>
38 #include <sys/mbuf.h>
39 #include <sys/socket.h>
40 #include <sys/sysctl.h>
41 #include <sys/syslog.h>
42 #include <sys/sysproto.h>
43 #include <sys/proc.h>
44 #include <sys/domain.h>
45 #include <sys/kernel.h>
46 #include <sys/lock.h>
47 #include <sys/rmlock.h>
48
49 #include <net/if.h>
50 #include <net/if_var.h>
51 #include <net/if_dl.h>
52 #include <net/route.h>
53 #include <net/route/route_ctl.h>
54 #include <net/route/route_var.h>
55 #include <net/route/nhop_utils.h>
56 #include <net/route/nhop.h>
57 #include <net/route/nhop_var.h>
58 #ifdef INET
59 #include <netinet/in_fib.h>
60 #endif
61 #ifdef INET6
62 #include <netinet6/in6_fib.h>
63 #endif
64 #include <net/vnet.h>
65
66 /*
67 * RIB helper functions.
68 */
69
70 void
rib_walk_ext_locked(struct rib_head * rnh,rib_walktree_f_t * wa_f,rib_walk_hook_f_t * hook_f,void * arg)71 rib_walk_ext_locked(struct rib_head *rnh, rib_walktree_f_t *wa_f,
72 rib_walk_hook_f_t *hook_f, void *arg)
73 {
74 if (hook_f != NULL)
75 hook_f(rnh, RIB_WALK_HOOK_PRE, arg);
76 rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f, arg);
77 if (hook_f != NULL)
78 hook_f(rnh, RIB_WALK_HOOK_POST, arg);
79 }
80
81 /*
82 * Calls @wa_f with @arg for each entry in the table specified by
83 * @af and @fibnum.
84 *
85 * @ss_t callback is called before and after the tree traversal
86 * while holding table lock.
87 *
88 * Table is traversed under read lock unless @wlock is set.
89 */
90 void
rib_walk_ext_internal(struct rib_head * rnh,bool wlock,rib_walktree_f_t * wa_f,rib_walk_hook_f_t * hook_f,void * arg)91 rib_walk_ext_internal(struct rib_head *rnh, bool wlock, rib_walktree_f_t *wa_f,
92 rib_walk_hook_f_t *hook_f, void *arg)
93 {
94 RIB_RLOCK_TRACKER;
95
96 if (wlock)
97 RIB_WLOCK(rnh);
98 else
99 RIB_RLOCK(rnh);
100 rib_walk_ext_locked(rnh, wa_f, hook_f, arg);
101 if (wlock)
102 RIB_WUNLOCK(rnh);
103 else
104 RIB_RUNLOCK(rnh);
105 }
106
107 void
rib_walk_ext(uint32_t fibnum,int family,bool wlock,rib_walktree_f_t * wa_f,rib_walk_hook_f_t * hook_f,void * arg)108 rib_walk_ext(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f,
109 rib_walk_hook_f_t *hook_f, void *arg)
110 {
111 struct rib_head *rnh;
112
113 if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL)
114 rib_walk_ext_internal(rnh, wlock, wa_f, hook_f, arg);
115 }
116
117 /*
118 * Calls @wa_f with @arg for each entry in the table specified by
119 * @af and @fibnum.
120 *
121 * Table is traversed under read lock unless @wlock is set.
122 */
123 void
rib_walk(uint32_t fibnum,int family,bool wlock,rib_walktree_f_t * wa_f,void * arg)124 rib_walk(uint32_t fibnum, int family, bool wlock, rib_walktree_f_t *wa_f,
125 void *arg)
126 {
127
128 rib_walk_ext(fibnum, family, wlock, wa_f, NULL, arg);
129 }
130
131 /*
132 * Iterates over all existing fibs in system calling
133 * @hook_f function before/after traversing each fib.
134 * Calls @wa_f function for each element in current fib.
135 * If af is not AF_UNSPEC, iterates over fibs in particular
136 * address family.
137 */
138 void
rib_foreach_table_walk(int family,bool wlock,rib_walktree_f_t * wa_f,rib_walk_hook_f_t * hook_f,void * arg)139 rib_foreach_table_walk(int family, bool wlock, rib_walktree_f_t *wa_f,
140 rib_walk_hook_f_t *hook_f, void *arg)
141 {
142
143 for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
144 /* Do we want some specific family? */
145 if (family != AF_UNSPEC) {
146 rib_walk_ext(fibnum, family, wlock, wa_f, hook_f, arg);
147 continue;
148 }
149
150 for (int i = 1; i <= AF_MAX; i++)
151 rib_walk_ext(fibnum, i, wlock, wa_f, hook_f, arg);
152 }
153 }
154
155 /*
156 * Iterates over all existing fibs in system and deletes each element
157 * for which @filter_f function returns non-zero value.
158 * If @family is not AF_UNSPEC, iterates over fibs in particular
159 * address family.
160 */
161 void
rib_foreach_table_walk_del(int family,rib_filter_f_t * filter_f,void * arg)162 rib_foreach_table_walk_del(int family, rib_filter_f_t *filter_f, void *arg)
163 {
164
165 for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
166 /* Do we want some specific family? */
167 if (family != AF_UNSPEC) {
168 rib_walk_del(fibnum, family, filter_f, arg, 0);
169 continue;
170 }
171
172 for (int i = 1; i <= AF_MAX; i++)
173 rib_walk_del(fibnum, i, filter_f, arg, 0);
174 }
175 }
176
177
178 /*
179 * Wrapper for the control plane functions for performing af-agnostic
180 * lookups.
181 * @fibnum: fib to perform the lookup.
182 * @dst: sockaddr with family and addr filled in. IPv6 addresses needs to be in
183 * deembedded from.
184 * @flags: fib(9) flags.
185 * @flowid: flow id for path selection in multipath use case.
186 *
187 * Returns nhop_object or NULL.
188 *
189 * Requires NET_EPOCH.
190 *
191 */
192 struct nhop_object *
rib_lookup(uint32_t fibnum,const struct sockaddr * dst,uint32_t flags,uint32_t flowid)193 rib_lookup(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags,
194 uint32_t flowid)
195 {
196 struct nhop_object *nh;
197
198 nh = NULL;
199
200 switch (dst->sa_family) {
201 #ifdef INET
202 case AF_INET:
203 {
204 const struct sockaddr_in *a = (const struct sockaddr_in *)dst;
205 nh = fib4_lookup(fibnum, a->sin_addr, 0, flags, flowid);
206 break;
207 }
208 #endif
209 #ifdef INET6
210 case AF_INET6:
211 {
212 const struct sockaddr_in6 *a = (const struct sockaddr_in6*)dst;
213 nh = fib6_lookup(fibnum, &a->sin6_addr, a->sin6_scope_id,
214 flags, flowid);
215 break;
216 }
217 #endif
218 }
219
220 return (nh);
221 }
222
223 #ifdef ROUTE_MPATH
224 static void
decompose_change_notification(struct rib_cmd_info * rc,route_notification_t * cb,void * cbdata)225 decompose_change_notification(struct rib_cmd_info *rc, route_notification_t *cb,
226 void *cbdata)
227 {
228 uint32_t num_old, num_new;
229 uint32_t nh_idx_old, nh_idx_new;
230 struct weightened_nhop *wn_old, *wn_new;
231 struct weightened_nhop tmp = { NULL, 0 };
232 uint32_t idx_old = 0, idx_new = 0;
233
234 struct rib_cmd_info rc_del = { .rc_cmd = RTM_DELETE, .rc_rt = rc->rc_rt };
235 struct rib_cmd_info rc_add = { .rc_cmd = RTM_ADD, .rc_rt = rc->rc_rt };
236
237 if (NH_IS_NHGRP(rc->rc_nh_old)) {
238 wn_old = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_old);
239 } else {
240 tmp.nh = rc->rc_nh_old;
241 tmp.weight = rc->rc_nh_weight;
242 wn_old = &tmp;
243 num_old = 1;
244 }
245 if (NH_IS_NHGRP(rc->rc_nh_new)) {
246 wn_new = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_new);
247 } else {
248 tmp.nh = rc->rc_nh_new;
249 tmp.weight = rc->rc_nh_weight;
250 wn_new = &tmp;
251 num_new = 1;
252 }
253
254 /* Use the fact that each @wn array is sorted */
255 /*
256 * Want to convert into set of add and delete operations
257 * [1] -> [1, 2] = A{2}
258 * [2] -> [1, 2] = A{1}
259 * [1, 2, 4]->[1, 3, 4] = A{2}, D{3}
260 * [1, 2, 4]->[1, 4] = D{2}
261 * [1, 2, 4] -> [3, 4] = D{1}, C{2,3} OR C{1,3}, D{2} OR D{1},D{2},A{3}
262 * [1, 2] -> [3, 4] =
263 *
264 */
265 idx_old = 0;
266 while ((idx_old < num_old) && (idx_new < num_new)) {
267 nh_idx_old = wn_old[idx_old].nh->nh_priv->nh_idx;
268 nh_idx_new = wn_new[idx_new].nh->nh_priv->nh_idx;
269
270 if (nh_idx_old == nh_idx_new) {
271 if (wn_old[idx_old].weight != wn_new[idx_new].weight) {
272 /* Update weight by providing del/add notifications */
273 rc_del.rc_nh_old = wn_old[idx_old].nh;
274 rc_del.rc_nh_weight = wn_old[idx_old].weight;
275 cb(&rc_del, cbdata);
276
277 rc_add.rc_nh_new = wn_new[idx_new].nh;
278 rc_add.rc_nh_weight = wn_new[idx_new].weight;
279 cb(&rc_add, cbdata);
280 }
281 idx_old++;
282 idx_new++;
283 } else if (nh_idx_old < nh_idx_new) {
284 /*
285 * [1, ~2~, 4], [1, ~3~, 4]
286 * [1, ~2~, 5], [1, ~3~, 4]
287 * [1, ~2~], [1, ~3~, 4]
288 */
289 if ((idx_old + 1 >= num_old) ||
290 (wn_old[idx_old + 1].nh->nh_priv->nh_idx > nh_idx_new)) {
291 /* Add new unless the next old item is still <= new */
292 rc_add.rc_nh_new = wn_new[idx_new].nh;
293 rc_add.rc_nh_weight = wn_new[idx_new].weight;
294 cb(&rc_add, cbdata);
295 idx_new++;
296 }
297 /* In any case, delete current old */
298 rc_del.rc_nh_old = wn_old[idx_old].nh;
299 rc_del.rc_nh_weight = wn_old[idx_old].weight;
300 cb(&rc_del, cbdata);
301 idx_old++;
302 } else {
303 /*
304 * nh_idx_old > nh_idx_new
305 *
306 * [1, ~3~, 4], [1, ~2~, 4]
307 * [1, ~3~, 5], [1, ~2~, 4]
308 * [1, ~3~, 4], [1, ~2~]
309 */
310 if ((idx_new + 1 >= num_new) ||
311 (wn_new[idx_new + 1].nh->nh_priv->nh_idx > nh_idx_old)) {
312 /* No next item or next item is > current one */
313 rc_add.rc_nh_new = wn_new[idx_new].nh;
314 rc_add.rc_nh_weight = wn_new[idx_new].weight;
315 cb(&rc_add, cbdata);
316 idx_new++;
317 }
318 /* In any case, delete current old */
319 rc_del.rc_nh_old = wn_old[idx_old].nh;
320 rc_del.rc_nh_weight = wn_old[idx_old].weight;
321 cb(&rc_del, cbdata);
322 idx_old++;
323 }
324 }
325
326 while (idx_old < num_old) {
327 rc_del.rc_nh_old = wn_old[idx_old].nh;
328 rc_del.rc_nh_weight = wn_old[idx_old].weight;
329 cb(&rc_del, cbdata);
330 idx_old++;
331 }
332
333 while (idx_new < num_new) {
334 rc_add.rc_nh_new = wn_new[idx_new].nh;
335 rc_add.rc_nh_weight = wn_new[idx_new].weight;
336 cb(&rc_add, cbdata);
337 idx_new++;
338 }
339 }
340
341 /*
342 * Decompose multipath cmd info @rc into a list of add/del/change
343 * single-path operations, calling @cb callback for each operation.
344 * Assumes at least one of the nexthops in @rc is multipath.
345 */
346 void
rib_decompose_notification(struct rib_cmd_info * rc,route_notification_t * cb,void * cbdata)347 rib_decompose_notification(struct rib_cmd_info *rc, route_notification_t *cb,
348 void *cbdata)
349 {
350 struct weightened_nhop *wn;
351 uint32_t num_nhops;
352 struct rib_cmd_info rc_new;
353
354 rc_new = *rc;
355 DPRINTF("cb=%p cmd=%d nh_old=%p nh_new=%p",
356 cb, rc->cmd, rc->nh_old, rc->nh_new);
357 switch (rc->rc_cmd) {
358 case RTM_ADD:
359 if (!NH_IS_NHGRP(rc->rc_nh_new))
360 return;
361 wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_new, &num_nhops);
362 for (uint32_t i = 0; i < num_nhops; i++) {
363 rc_new.rc_nh_new = wn[i].nh;
364 rc_new.rc_nh_weight = wn[i].weight;
365 cb(&rc_new, cbdata);
366 }
367 break;
368 case RTM_DELETE:
369 if (!NH_IS_NHGRP(rc->rc_nh_old))
370 return;
371 wn = nhgrp_get_nhops((struct nhgrp_object *)rc->rc_nh_old, &num_nhops);
372 for (uint32_t i = 0; i < num_nhops; i++) {
373 rc_new.rc_nh_old = wn[i].nh;
374 rc_new.rc_nh_weight = wn[i].weight;
375 cb(&rc_new, cbdata);
376 }
377 break;
378 case RTM_CHANGE:
379 if (!NH_IS_NHGRP(rc->rc_nh_old) && !NH_IS_NHGRP(rc->rc_nh_new))
380 return;
381 decompose_change_notification(rc, cb, cbdata);
382 break;
383 }
384 }
385 #endif
386