/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2016-2017 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/poll.h>
#include <sys/queue.h>
#include <sys/refcount.h>
#include <sys/mutex.h>
#include <sys/selinfo.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/uio.h>
#include <machine/atomic.h>
#include <sys/counter.h>

#include <dev/tcp_log/tcp_log_dev.h>

#ifdef TCPLOG_DEBUG_COUNTERS
extern counter_u64_t tcp_log_que_read;
extern counter_u64_t tcp_log_que_freed;
#endif

static struct cdev *tcp_log_dev;
static struct selinfo tcp_log_sel;

static struct log_queueh tcp_log_dev_queue_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_queue_head);
static struct log_infoh tcp_log_dev_reader_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_reader_head);

MALLOC_DEFINE(M_TCPLOGDEV, "tcp_log_dev", "TCP log device data structures");

static int tcp_log_dev_listeners = 0;

static struct mtx tcp_log_dev_queue_lock;

#define	TCP_LOG_DEV_QUEUE_LOCK()	mtx_lock(&tcp_log_dev_queue_lock)
#define	TCP_LOG_DEV_QUEUE_UNLOCK()	mtx_unlock(&tcp_log_dev_queue_lock)
#define	TCP_LOG_DEV_QUEUE_LOCK_ASSERT()	mtx_assert(&tcp_log_dev_queue_lock, MA_OWNED)
#define	TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT() mtx_assert(&tcp_log_dev_queue_lock, MA_NOTOWNED)
#define	TCP_LOG_DEV_QUEUE_REF(tldq)	refcount_acquire(&((tldq)->tldq_refcnt))
#define	TCP_LOG_DEV_QUEUE_UNREF(tldq)	refcount_release(&((tldq)->tldq_refcnt))
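
/*
 * Note: TCP_LOG_DEV_QUEUE_UNREF() evaluates to true when it releases the
 * last reference (the refcount_release(9) convention), which is how
 * callers know an entry is ready to be torn down.
 */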

static void	tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry);
static void	tcp_log_dev_clear_cdevpriv(void *data);
static int	tcp_log_dev_open(struct cdev *dev __unused, int flags,
    int devtype __unused, struct thread *td __unused);
static int	tcp_log_dev_write(struct cdev *dev __unused,
    struct uio *uio __unused, int flags __unused);
static int	tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio,
    int flags __unused);
static int	tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd,
    caddr_t data, int fflag __unused, struct thread *td __unused);
static int	tcp_log_dev_poll(struct cdev *dev __unused, int events,
    struct thread *td);

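/*
 * The read path may drop and re-take the queue lock (e.g., around
 * uiomove(), which can fault), so it threads one of these states through
 * its helpers to record whether tcp_log_dev_queue_lock is currently held.
 */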
enum tcp_log_dev_queue_lock_state {
	QUEUE_UNLOCKED = 0,
	QUEUE_LOCKED,
};

static struct cdevsw tcp_log_cdevsw = {
	.d_version = D_VERSION,
	.d_read = tcp_log_dev_read,
	.d_open = tcp_log_dev_open,
	.d_write = tcp_log_dev_write,
	.d_poll = tcp_log_dev_poll,
	.d_ioctl = tcp_log_dev_ioctl,
#ifdef NOTYET
	.d_mmap = tcp_log_dev_mmap,
#endif
	.d_name = "tcp_log",
};
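
/*
 * Userspace consumes the log by opening the device read-only and polling
 * for readability. A minimal consumer sketch (illustrative only; it
 * assumes the default devfs mount, elides error handling and record
 * parsing, and requires root given the 0400 mode used below):
 *
 *	int fd = open("/dev/tcp_log", O_RDONLY);
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	char buf[65536];
 *	ssize_t n;
 *
 *	while (poll(&pfd, 1, -1) > 0) {
 *		n = read(fd, buf, sizeof(buf));
 *		if (n <= 0)
 *			break;
 *		... parse tcp_log_common_header-framed records ...
 *	}
 */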

static __inline void
tcp_log_dev_queue_validate_lock(int lockstate)
{

#ifdef INVARIANTS
	switch (lockstate) {
	case QUEUE_LOCKED:
		TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
		break;
	case QUEUE_UNLOCKED:
		TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT();
		break;
	default:
		kassert_panic("%s:%d: unknown queue lock state", __func__,
		    __LINE__);
	}
#endif
}

/*
 * Clear the refcount. If appropriate, it will remove the entry from the
 * queue and call the destructor.
 *
 * This must be called with the queue lock held.
 */
static void
tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry)
{

	KASSERT(entry != NULL, ("%s: called with NULL entry", __func__));

	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();

	if (TCP_LOG_DEV_QUEUE_UNREF(entry)) {
#ifdef TCPLOG_DEBUG_COUNTERS
		counter_u64_add(tcp_log_que_freed, 1);
#endif
		/* Remove the entry from the queue and call the destructor. */
		STAILQ_REMOVE(&tcp_log_dev_queue_head, entry, tcp_log_dev_queue,
		    tldq_queue);
		(*entry->tldq_dtor)(entry);
	}
}

static void
tcp_log_dev_clear_cdevpriv(void *data)
{
	struct tcp_log_dev_info *priv;
	struct tcp_log_dev_queue *entry, *entry_tmp;

	priv = (struct tcp_log_dev_info *)data;
	if (priv == NULL)
		return;

	/*
	 * Lock the queue and drop our references. We hold references to all
	 * the entries starting with tldi_head (or, if tldi_head == NULL, all
	 * entries in the queue).
	 *
	 * Because we don't want anyone adding additional things to the queue
	 * while we are doing this, we lock the queue.
	 */
	TCP_LOG_DEV_QUEUE_LOCK();
	if (priv->tldi_head != NULL) {
		entry = priv->tldi_head;
		STAILQ_FOREACH_FROM_SAFE(entry, &tcp_log_dev_queue_head,
		    tldq_queue, entry_tmp) {
			tcp_log_dev_clear_refcount(entry);
		}
	}
	tcp_log_dev_listeners--;
	KASSERT(tcp_log_dev_listeners >= 0,
	    ("%s: tcp_log_dev_listeners is unexpectedly negative", __func__));
	STAILQ_REMOVE(&tcp_log_dev_reader_head, priv, tcp_log_dev_info,
	    tldi_list);
	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	TCP_LOG_DEV_QUEUE_UNLOCK();
	free(priv, M_TCPLOGDEV);
}

static int
tcp_log_dev_open(struct cdev *dev __unused, int flags, int devtype __unused,
    struct thread *td __unused)
{
	struct tcp_log_dev_info *priv;
	struct tcp_log_dev_queue *entry;
	int rv;

	/*
	 * Ideally, we shouldn't see these because of file system
	 * permissions.
	 */
	if (flags & (FWRITE | FEXEC | FAPPEND | O_TRUNC))
		return (ENODEV);

	/* Allocate space to hold information about where we are. */
	priv = malloc(sizeof(struct tcp_log_dev_info), M_TCPLOGDEV,
	    M_ZERO | M_WAITOK);

	/* Stash the private data away. */
	rv = devfs_set_cdevpriv((void *)priv, tcp_log_dev_clear_cdevpriv);
	if (!rv) {
		/*
		 * Increase the listener count, add this reader to the list, and
		 * take references on all current queues.
		 */
		TCP_LOG_DEV_QUEUE_LOCK();
		tcp_log_dev_listeners++;
		STAILQ_INSERT_HEAD(&tcp_log_dev_reader_head, priv, tldi_list);
		priv->tldi_head = STAILQ_FIRST(&tcp_log_dev_queue_head);
		if (priv->tldi_head != NULL)
			priv->tldi_cur = priv->tldi_head->tldq_buf;
		STAILQ_FOREACH(entry, &tcp_log_dev_queue_head, tldq_queue)
			TCP_LOG_DEV_QUEUE_REF(entry);
		TCP_LOG_DEV_QUEUE_UNLOCK();
	} else {
		/* Free the entry. */
		free(priv, M_TCPLOGDEV);
	}
	return (rv);
}

static int
tcp_log_dev_write(struct cdev *dev __unused, struct uio *uio __unused,
    int flags __unused)
{

	return (ENODEV);
}

static __inline void
tcp_log_dev_rotate_bufs(struct tcp_log_dev_info *priv, int *lockstate)
{
	struct tcp_log_dev_queue *entry;

	KASSERT(priv->tldi_head != NULL,
	    ("%s:%d: priv->tldi_head unexpectedly NULL",
	    __func__, __LINE__));
	KASSERT(priv->tldi_head->tldq_buf == priv->tldi_cur,
	    ("%s:%d: buffer mismatch (%p vs %p)",
	    __func__, __LINE__, priv->tldi_head->tldq_buf,
	    priv->tldi_cur));
	tcp_log_dev_queue_validate_lock(*lockstate);

	if (*lockstate == QUEUE_UNLOCKED) {
		TCP_LOG_DEV_QUEUE_LOCK();
		*lockstate = QUEUE_LOCKED;
	}
	entry = priv->tldi_head;
	priv->tldi_head = STAILQ_NEXT(entry, tldq_queue);
	tcp_log_dev_clear_refcount(entry);
	priv->tldi_cur = NULL;
}

static int
tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio, int flags)
{
	struct tcp_log_common_header *buf;
	struct tcp_log_dev_info *priv;
	struct tcp_log_dev_queue *entry;
	ssize_t len;
	int lockstate, rv;

	/* Get our private info. */
	rv = devfs_get_cdevpriv((void **)&priv);
	if (rv)
		return (rv);

	lockstate = QUEUE_UNLOCKED;

	/* Do we need to get a new buffer? */
	while (priv->tldi_cur == NULL ||
	    priv->tldi_cur->tlch_length <= priv->tldi_off) {
		/* Did we somehow forget to rotate? */
		KASSERT(priv->tldi_cur == NULL,
		    ("%s:%d: tldi_cur is unexpectedly non-NULL", __func__,
		    __LINE__));
		if (priv->tldi_cur != NULL)
			tcp_log_dev_rotate_bufs(priv, &lockstate);

		/*
		 * Before we start looking at tldi_head, we need a lock on the
		 * queue to make sure tldi_head stays stable.
		 */
		if (lockstate == QUEUE_UNLOCKED) {
			TCP_LOG_DEV_QUEUE_LOCK();
			lockstate = QUEUE_LOCKED;
		}

		/* We need the next buffer. Do we have one? */
		if (priv->tldi_head == NULL && (flags & FNONBLOCK)) {
			rv = EAGAIN;
			goto done;
		}
		if (priv->tldi_head == NULL) {
			/* Sleep and wait for more things we can read. */
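			/*
			 * tcp_log_dev_add_log() does a wakeup() on this
			 * channel (&tcp_log_dev_listeners) once it has
			 * queued a new entry.
			 */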
			rv = mtx_sleep(&tcp_log_dev_listeners,
			    &tcp_log_dev_queue_lock, PCATCH, "tcplogdev", 0);
			if (rv)
				goto done;
			if (priv->tldi_head == NULL)
				continue;
		}

		/*
		 * We have an entry to read. We want to try to create a
		 * buffer, if one doesn't already exist.
		 */
		entry = priv->tldi_head;
		if (entry->tldq_buf == NULL) {
			TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
			buf = (*entry->tldq_xform)(entry);
			if (buf == NULL) {
				rv = EBUSY;
				goto done;
			}
			entry->tldq_buf = buf;
		}

		priv->tldi_cur = entry->tldq_buf;
		priv->tldi_off = 0;
	}

	/* Copy what we can from this buffer to the output buffer. */
	if (uio->uio_resid > 0) {
		/* Drop locks so we can take page faults. */
		if (lockstate == QUEUE_LOCKED)
			TCP_LOG_DEV_QUEUE_UNLOCK();
		lockstate = QUEUE_UNLOCKED;

		KASSERT(priv->tldi_cur != NULL,
		    ("%s: priv->tldi_cur is unexpectedly NULL", __func__));

		/* Copy as much as we can to this uio. */
		len = priv->tldi_cur->tlch_length - priv->tldi_off;
		if (len > uio->uio_resid)
			len = uio->uio_resid;
		rv = uiomove(((uint8_t *)priv->tldi_cur) + priv->tldi_off,
		    len, uio);
		if (rv != 0)
			goto done;
		priv->tldi_off += len;
#ifdef TCPLOG_DEBUG_COUNTERS
		counter_u64_add(tcp_log_que_read, len);
#endif
	}
	/* Are we done with this buffer? If so, find the next one. */
	if (priv->tldi_off >= priv->tldi_cur->tlch_length) {
		KASSERT(priv->tldi_off == priv->tldi_cur->tlch_length,
		    ("%s: offset (%ju) exceeds length (%ju)", __func__,
		    (uintmax_t)priv->tldi_off,
		    (uintmax_t)priv->tldi_cur->tlch_length));
		tcp_log_dev_rotate_bufs(priv, &lockstate);
	}
done:
	tcp_log_dev_queue_validate_lock(lockstate);
	if (lockstate == QUEUE_LOCKED)
		TCP_LOG_DEV_QUEUE_UNLOCK();
	return (rv);
}

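/*
 * Handle ioctls. FIONBIO is accepted but requires no driver state: the
 * read path detects non-blocking requests via the FNONBLOCK flag passed
 * to it on each call. FIOASYNC may only be used to disable async I/O,
 * as this driver never raises SIGIO.
 */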
static int
tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data,
    int fflag __unused, struct thread *td __unused)
{
	struct tcp_log_dev_info *priv;
	int rv;

	/* Get our private info. */
	rv = devfs_get_cdevpriv((void **)&priv);
	if (rv)
		return (rv);

	/*
	 * Set things. Here, we are most concerned about the non-blocking I/O
	 * flag.
	 */
	rv = 0;
	switch (cmd) {
	case FIONBIO:
		break;
	case FIOASYNC:
		if (*(int *)data != 0)
			rv = EINVAL;
		break;
	default:
		rv = ENOIOCTL;
	}
	return (rv);
}

static int
tcp_log_dev_poll(struct cdev *dev __unused, int events, struct thread *td)
{
	struct tcp_log_dev_info *priv;
	int revents;

	/*
	 * Get our private info. If this fails, claim that all events are
	 * ready. That should prod the user to do something that will
	 * make the error evident to them.
	 */
	if (devfs_get_cdevpriv((void **)&priv))
		return (events);

	revents = 0;
	if (events & (POLLIN | POLLRDNORM)) {
		/*
		 * We can (probably) read right now if we are partway through
		 * a buffer or if we are just about to start a buffer.
		 * Because we are going to read tldi_head, we should acquire
		 * a read lock on the queue.
		 */
		TCP_LOG_DEV_QUEUE_LOCK();
		if ((priv->tldi_head != NULL && priv->tldi_cur == NULL) ||
		    (priv->tldi_cur != NULL &&
		    priv->tldi_off < priv->tldi_cur->tlch_length))
			revents = events & (POLLIN | POLLRDNORM);
		else
			selrecord(td, &tcp_log_sel);
		TCP_LOG_DEV_QUEUE_UNLOCK();
	} else {
		/*
		 * It only makes sense to poll for reading. So, again, prod the
		 * user to do something that will make the error of their ways
		 * apparent.
		 */
		revents = events;
	}
	return (revents);
}

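/*
 * Queue a new log entry for all current readers, taking one reference per
 * listener. The entry must supply a destructor (tldq_dtor) and either a
 * ready buffer (tldq_buf) or a transform callback (tldq_xform) that the
 * read path will use to build one. A producer-side sketch (illustrative
 * only; my_xform and my_dtor are hypothetical caller-supplied routines,
 * and real callers allocate with their own malloc type):
 *
 *	struct tcp_log_dev_queue *q;
 *
 *	q = malloc(sizeof(*q), M_TCPLOGDEV, M_WAITOK | M_ZERO);
 *	q->tldq_xform = my_xform;	(builds the buffer on first read)
 *	q->tldq_dtor = my_dtor;		(frees q and its data)
 *	if (tcp_log_dev_add_log(q) != 0)
 *		(*q->tldq_dtor)(q);	(no listeners; clean up ourselves)
 */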
int
tcp_log_dev_add_log(struct tcp_log_dev_queue *entry)
{
	struct tcp_log_dev_info *priv;
	int rv;
	bool wakeup_needed;

	KASSERT(entry->tldq_buf != NULL || entry->tldq_xform != NULL,
	    ("%s: Called with both tldq_buf and tldq_xform set to NULL",
	    __func__));
	KASSERT(entry->tldq_dtor != NULL,
	    ("%s: Called with tldq_dtor set to NULL", __func__));

	/* Get a lock on the queue. */
	TCP_LOG_DEV_QUEUE_LOCK();

	/* If no one is listening, tell the caller to free the resources. */
	if (tcp_log_dev_listeners == 0) {
		rv = ENXIO;
		goto done;
	}

	/* Add this to the end of the tailq. */
	STAILQ_INSERT_TAIL(&tcp_log_dev_queue_head, entry, tldq_queue);

	/* Add references for all current listeners. */
	refcount_init(&entry->tldq_refcnt, tcp_log_dev_listeners);

	/*
	 * If any listener is currently stuck on NULL, that means they are
	 * waiting. Point their head to this new entry.
	 */
	wakeup_needed = false;
	STAILQ_FOREACH(priv, &tcp_log_dev_reader_head, tldi_list)
		if (priv->tldi_head == NULL) {
			priv->tldi_head = entry;
			wakeup_needed = true;
		}

	if (wakeup_needed) {
		selwakeup(&tcp_log_sel);
		wakeup(&tcp_log_dev_listeners);
	}

	rv = 0;

done:
	TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	TCP_LOG_DEV_QUEUE_UNLOCK();
	return (rv);
}

static int
tcp_log_dev_modevent(module_t mod __unused, int type, void *data __unused)
{

	/* TODO: Support intelligent unloading. */
	switch (type) {
	case MOD_LOAD:
		if (bootverbose)
			printf("tcp_log: tcp_log device\n");
		memset(&tcp_log_sel, 0, sizeof(tcp_log_sel));
		memset(&tcp_log_dev_queue_lock, 0, sizeof(struct mtx));
		mtx_init(&tcp_log_dev_queue_lock, "tcp_log dev",
		    "tcp_log device queues", MTX_DEF);
		tcp_log_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
		    &tcp_log_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0400,
		    "tcp_log");
		break;
	default:
		return (EOPNOTSUPP);
	}

	return (0);
}

DEV_MODULE(tcp_log_dev, tcp_log_dev_modevent, NULL);
MODULE_VERSION(tcp_log_dev, 1);