1 /*- 2 * Copyright (c) 2010 Isilon Systems, Inc. 3 * Copyright (c) 2010 iX Systems, Inc. 4 * Copyright (c) 2010 Panasas, Inc. 5 * Copyright (c) 2013-2021 Mellanox Technologies, Ltd. 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice unmodified, this list of conditions, and the following 13 * disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 19 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 20 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 21 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 23 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 27 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 __FBSDID("$FreeBSD$"); 32 33 #include "opt_stack.h" 34 35 #include <sys/param.h> 36 #include <sys/systm.h> 37 #include <sys/malloc.h> 38 #include <sys/kernel.h> 39 #include <sys/sysctl.h> 40 #include <sys/proc.h> 41 #include <sys/sglist.h> 42 #include <sys/sleepqueue.h> 43 #include <sys/refcount.h> 44 #include <sys/lock.h> 45 #include <sys/mutex.h> 46 #include <sys/bus.h> 47 #include <sys/eventhandler.h> 48 #include <sys/fcntl.h> 49 #include <sys/file.h> 50 #include <sys/filio.h> 51 #include <sys/rwlock.h> 52 #include <sys/mman.h> 53 #include <sys/stack.h> 54 #include <sys/time.h> 55 #include <sys/user.h> 56 57 #include <vm/vm.h> 58 #include <vm/pmap.h> 59 #include <vm/vm_object.h> 60 #include <vm/vm_page.h> 61 #include <vm/vm_pager.h> 62 63 #include <machine/stdarg.h> 64 65 #if defined(__i386__) || defined(__amd64__) 66 #include <machine/md_var.h> 67 #endif 68 69 #include <linux/kobject.h> 70 #include <linux/cpu.h> 71 #include <linux/device.h> 72 #include <linux/slab.h> 73 #include <linux/module.h> 74 #include <linux/moduleparam.h> 75 #include <linux/cdev.h> 76 #include <linux/file.h> 77 #include <linux/sysfs.h> 78 #include <linux/mm.h> 79 #include <linux/io.h> 80 #include <linux/vmalloc.h> 81 #include <linux/netdevice.h> 82 #include <linux/timer.h> 83 #include <linux/interrupt.h> 84 #include <linux/uaccess.h> 85 #include <linux/list.h> 86 #include <linux/kthread.h> 87 #include <linux/kernel.h> 88 #include <linux/compat.h> 89 #include <linux/poll.h> 90 #include <linux/smp.h> 91 #include <linux/wait_bit.h> 92 #include <linux/rcupdate.h> 93 #include <linux/interval_tree.h> 94 #include <linux/interval_tree_generic.h> 95 96 #if defined(__i386__) || defined(__amd64__) 97 #include <asm/smp.h> 98 #endif 99 100 SYSCTL_NODE(_compat, OID_AUTO, linuxkpi, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 101 "LinuxKPI parameters"); 102 103 int linuxkpi_debug; 104 SYSCTL_INT(_compat_linuxkpi, OID_AUTO, debug, CTLFLAG_RWTUN, 105 &linuxkpi_debug, 0, "Set to enable pr_debug() prints. Clear to disable."); 106 107 int linuxkpi_warn_dump_stack = 0; 108 SYSCTL_INT(_compat_linuxkpi, OID_AUTO, warn_dump_stack, CTLFLAG_RWTUN, 109 &linuxkpi_warn_dump_stack, 0, 110 "Set to enable stack traces from WARN_ON(). Clear to disable."); 111 112 static struct timeval lkpi_net_lastlog; 113 static int lkpi_net_curpps; 114 static int lkpi_net_maxpps = 99; 115 SYSCTL_INT(_compat_linuxkpi, OID_AUTO, net_ratelimit, CTLFLAG_RWTUN, 116 &lkpi_net_maxpps, 0, "Limit number of LinuxKPI net messages per second."); 117 118 MALLOC_DEFINE(M_KMALLOC, "linux", "Linux kmalloc compat"); 119 120 #include <linux/rbtree.h> 121 /* Undo Linux compat changes. */ 122 #undef RB_ROOT 123 #undef file 124 #undef cdev 125 #define RB_ROOT(head) (head)->rbh_root 126 127 static void linux_destroy_dev(struct linux_cdev *); 128 static void linux_cdev_deref(struct linux_cdev *ldev); 129 static struct vm_area_struct *linux_cdev_handle_find(void *handle); 130 131 cpumask_t cpu_online_mask; 132 struct kobject linux_class_root; 133 struct device linux_root_device; 134 struct class linux_class_misc; 135 struct list_head pci_drivers; 136 struct list_head pci_devices; 137 spinlock_t pci_lock; 138 139 unsigned long linux_timer_hz_mask; 140 141 wait_queue_head_t linux_bit_waitq; 142 wait_queue_head_t linux_var_waitq; 143 144 int 145 panic_cmp(struct rb_node *one, struct rb_node *two) 146 { 147 panic("no cmp"); 148 } 149 150 RB_GENERATE(linux_root, rb_node, __entry, panic_cmp); 151 152 #define START(node) ((node)->start) 153 #define LAST(node) ((node)->last) 154 155 INTERVAL_TREE_DEFINE(struct interval_tree_node, rb, unsigned long,, START, 156 LAST,, lkpi_interval_tree) 157 158 int 159 kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list args) 160 { 161 va_list tmp_va; 162 int len; 163 char *old; 164 char *name; 165 char dummy; 166 167 old = kobj->name; 168 169 if (old && fmt == NULL) 170 return (0); 171 172 /* compute length of string */ 173 va_copy(tmp_va, args); 174 len = vsnprintf(&dummy, 0, fmt, tmp_va); 175 va_end(tmp_va); 176 177 /* account for zero termination */ 178 len++; 179 180 /* check for error */ 181 if (len < 1) 182 return (-EINVAL); 183 184 /* allocate memory for string */ 185 name = kzalloc(len, GFP_KERNEL); 186 if (name == NULL) 187 return (-ENOMEM); 188 vsnprintf(name, len, fmt, args); 189 kobj->name = name; 190 191 /* free old string */ 192 kfree(old); 193 194 /* filter new string */ 195 for (; *name != '\0'; name++) 196 if (*name == '/') 197 *name = '!'; 198 return (0); 199 } 200 201 int 202 kobject_set_name(struct kobject *kobj, const char *fmt, ...) 203 { 204 va_list args; 205 int error; 206 207 va_start(args, fmt); 208 error = kobject_set_name_vargs(kobj, fmt, args); 209 va_end(args); 210 211 return (error); 212 } 213 214 static int 215 kobject_add_complete(struct kobject *kobj, struct kobject *parent) 216 { 217 const struct kobj_type *t; 218 int error; 219 220 kobj->parent = parent; 221 error = sysfs_create_dir(kobj); 222 if (error == 0 && kobj->ktype && kobj->ktype->default_attrs) { 223 struct attribute **attr; 224 t = kobj->ktype; 225 226 for (attr = t->default_attrs; *attr != NULL; attr++) { 227 error = sysfs_create_file(kobj, *attr); 228 if (error) 229 break; 230 } 231 if (error) 232 sysfs_remove_dir(kobj); 233 } 234 return (error); 235 } 236 237 int 238 kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...) 239 { 240 va_list args; 241 int error; 242 243 va_start(args, fmt); 244 error = kobject_set_name_vargs(kobj, fmt, args); 245 va_end(args); 246 if (error) 247 return (error); 248 249 return kobject_add_complete(kobj, parent); 250 } 251 252 void 253 linux_kobject_release(struct kref *kref) 254 { 255 struct kobject *kobj; 256 char *name; 257 258 kobj = container_of(kref, struct kobject, kref); 259 sysfs_remove_dir(kobj); 260 name = kobj->name; 261 if (kobj->ktype && kobj->ktype->release) 262 kobj->ktype->release(kobj); 263 kfree(name); 264 } 265 266 static void 267 linux_kobject_kfree(struct kobject *kobj) 268 { 269 kfree(kobj); 270 } 271 272 static void 273 linux_kobject_kfree_name(struct kobject *kobj) 274 { 275 if (kobj) { 276 kfree(kobj->name); 277 } 278 } 279 280 const struct kobj_type linux_kfree_type = { 281 .release = linux_kobject_kfree 282 }; 283 284 static ssize_t 285 lkpi_kobj_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) 286 { 287 struct kobj_attribute *ka = 288 container_of(attr, struct kobj_attribute, attr); 289 290 if (ka->show == NULL) 291 return (-EIO); 292 293 return (ka->show(kobj, ka, buf)); 294 } 295 296 static ssize_t 297 lkpi_kobj_attr_store(struct kobject *kobj, struct attribute *attr, 298 const char *buf, size_t count) 299 { 300 struct kobj_attribute *ka = 301 container_of(attr, struct kobj_attribute, attr); 302 303 if (ka->store == NULL) 304 return (-EIO); 305 306 return (ka->store(kobj, ka, buf, count)); 307 } 308 309 const struct sysfs_ops kobj_sysfs_ops = { 310 .show = lkpi_kobj_attr_show, 311 .store = lkpi_kobj_attr_store, 312 }; 313 314 static void 315 linux_device_release(struct device *dev) 316 { 317 pr_debug("linux_device_release: %s\n", dev_name(dev)); 318 kfree(dev); 319 } 320 321 static ssize_t 322 linux_class_show(struct kobject *kobj, struct attribute *attr, char *buf) 323 { 324 struct class_attribute *dattr; 325 ssize_t error; 326 327 dattr = container_of(attr, struct class_attribute, attr); 328 error = -EIO; 329 if (dattr->show) 330 error = dattr->show(container_of(kobj, struct class, kobj), 331 dattr, buf); 332 return (error); 333 } 334 335 static ssize_t 336 linux_class_store(struct kobject *kobj, struct attribute *attr, const char *buf, 337 size_t count) 338 { 339 struct class_attribute *dattr; 340 ssize_t error; 341 342 dattr = container_of(attr, struct class_attribute, attr); 343 error = -EIO; 344 if (dattr->store) 345 error = dattr->store(container_of(kobj, struct class, kobj), 346 dattr, buf, count); 347 return (error); 348 } 349 350 static void 351 linux_class_release(struct kobject *kobj) 352 { 353 struct class *class; 354 355 class = container_of(kobj, struct class, kobj); 356 if (class->class_release) 357 class->class_release(class); 358 } 359 360 static const struct sysfs_ops linux_class_sysfs = { 361 .show = linux_class_show, 362 .store = linux_class_store, 363 }; 364 365 const struct kobj_type linux_class_ktype = { 366 .release = linux_class_release, 367 .sysfs_ops = &linux_class_sysfs 368 }; 369 370 static void 371 linux_dev_release(struct kobject *kobj) 372 { 373 struct device *dev; 374 375 dev = container_of(kobj, struct device, kobj); 376 /* This is the precedence defined by linux. */ 377 if (dev->release) 378 dev->release(dev); 379 else if (dev->class && dev->class->dev_release) 380 dev->class->dev_release(dev); 381 } 382 383 static ssize_t 384 linux_dev_show(struct kobject *kobj, struct attribute *attr, char *buf) 385 { 386 struct device_attribute *dattr; 387 ssize_t error; 388 389 dattr = container_of(attr, struct device_attribute, attr); 390 error = -EIO; 391 if (dattr->show) 392 error = dattr->show(container_of(kobj, struct device, kobj), 393 dattr, buf); 394 return (error); 395 } 396 397 static ssize_t 398 linux_dev_store(struct kobject *kobj, struct attribute *attr, const char *buf, 399 size_t count) 400 { 401 struct device_attribute *dattr; 402 ssize_t error; 403 404 dattr = container_of(attr, struct device_attribute, attr); 405 error = -EIO; 406 if (dattr->store) 407 error = dattr->store(container_of(kobj, struct device, kobj), 408 dattr, buf, count); 409 return (error); 410 } 411 412 static const struct sysfs_ops linux_dev_sysfs = { 413 .show = linux_dev_show, 414 .store = linux_dev_store, 415 }; 416 417 const struct kobj_type linux_dev_ktype = { 418 .release = linux_dev_release, 419 .sysfs_ops = &linux_dev_sysfs 420 }; 421 422 struct device * 423 device_create(struct class *class, struct device *parent, dev_t devt, 424 void *drvdata, const char *fmt, ...) 425 { 426 struct device *dev; 427 va_list args; 428 429 dev = kzalloc(sizeof(*dev), M_WAITOK); 430 dev->parent = parent; 431 dev->class = class; 432 dev->devt = devt; 433 dev->driver_data = drvdata; 434 dev->release = linux_device_release; 435 va_start(args, fmt); 436 kobject_set_name_vargs(&dev->kobj, fmt, args); 437 va_end(args); 438 device_register(dev); 439 440 return (dev); 441 } 442 443 int 444 kobject_init_and_add(struct kobject *kobj, const struct kobj_type *ktype, 445 struct kobject *parent, const char *fmt, ...) 446 { 447 va_list args; 448 int error; 449 450 kobject_init(kobj, ktype); 451 kobj->ktype = ktype; 452 kobj->parent = parent; 453 kobj->name = NULL; 454 455 va_start(args, fmt); 456 error = kobject_set_name_vargs(kobj, fmt, args); 457 va_end(args); 458 if (error) 459 return (error); 460 return kobject_add_complete(kobj, parent); 461 } 462 463 static void 464 linux_kq_lock(void *arg) 465 { 466 spinlock_t *s = arg; 467 468 spin_lock(s); 469 } 470 static void 471 linux_kq_unlock(void *arg) 472 { 473 spinlock_t *s = arg; 474 475 spin_unlock(s); 476 } 477 478 static void 479 linux_kq_assert_lock(void *arg, int what) 480 { 481 #ifdef INVARIANTS 482 spinlock_t *s = arg; 483 484 if (what == LA_LOCKED) 485 mtx_assert(&s->m, MA_OWNED); 486 else 487 mtx_assert(&s->m, MA_NOTOWNED); 488 #endif 489 } 490 491 static void 492 linux_file_kqfilter_poll(struct linux_file *, int); 493 494 struct linux_file * 495 linux_file_alloc(void) 496 { 497 struct linux_file *filp; 498 499 filp = kzalloc(sizeof(*filp), GFP_KERNEL); 500 501 /* set initial refcount */ 502 filp->f_count = 1; 503 504 /* setup fields needed by kqueue support */ 505 spin_lock_init(&filp->f_kqlock); 506 knlist_init(&filp->f_selinfo.si_note, &filp->f_kqlock, 507 linux_kq_lock, linux_kq_unlock, linux_kq_assert_lock); 508 509 return (filp); 510 } 511 512 void 513 linux_file_free(struct linux_file *filp) 514 { 515 if (filp->_file == NULL) { 516 if (filp->f_op != NULL && filp->f_op->release != NULL) 517 filp->f_op->release(filp->f_vnode, filp); 518 if (filp->f_shmem != NULL) 519 vm_object_deallocate(filp->f_shmem); 520 kfree_rcu(filp, rcu); 521 } else { 522 /* 523 * The close method of the character device or file 524 * will free the linux_file structure: 525 */ 526 _fdrop(filp->_file, curthread); 527 } 528 } 529 530 static int 531 linux_cdev_pager_fault(vm_object_t vm_obj, vm_ooffset_t offset, int prot, 532 vm_page_t *mres) 533 { 534 struct vm_area_struct *vmap; 535 536 vmap = linux_cdev_handle_find(vm_obj->handle); 537 538 MPASS(vmap != NULL); 539 MPASS(vmap->vm_private_data == vm_obj->handle); 540 541 if (likely(vmap->vm_ops != NULL && offset < vmap->vm_len)) { 542 vm_paddr_t paddr = IDX_TO_OFF(vmap->vm_pfn) + offset; 543 vm_page_t page; 544 545 if (((*mres)->flags & PG_FICTITIOUS) != 0) { 546 /* 547 * If the passed in result page is a fake 548 * page, update it with the new physical 549 * address. 550 */ 551 page = *mres; 552 vm_page_updatefake(page, paddr, vm_obj->memattr); 553 } else { 554 /* 555 * Replace the passed in "mres" page with our 556 * own fake page and free up the all of the 557 * original pages. 558 */ 559 VM_OBJECT_WUNLOCK(vm_obj); 560 page = vm_page_getfake(paddr, vm_obj->memattr); 561 VM_OBJECT_WLOCK(vm_obj); 562 563 vm_page_replace(page, vm_obj, (*mres)->pindex, *mres); 564 *mres = page; 565 } 566 vm_page_valid(page); 567 return (VM_PAGER_OK); 568 } 569 return (VM_PAGER_FAIL); 570 } 571 572 static int 573 linux_cdev_pager_populate(vm_object_t vm_obj, vm_pindex_t pidx, int fault_type, 574 vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last) 575 { 576 struct vm_area_struct *vmap; 577 int err; 578 579 /* get VM area structure */ 580 vmap = linux_cdev_handle_find(vm_obj->handle); 581 MPASS(vmap != NULL); 582 MPASS(vmap->vm_private_data == vm_obj->handle); 583 584 VM_OBJECT_WUNLOCK(vm_obj); 585 586 linux_set_current(curthread); 587 588 down_write(&vmap->vm_mm->mmap_sem); 589 if (unlikely(vmap->vm_ops == NULL)) { 590 err = VM_FAULT_SIGBUS; 591 } else { 592 struct vm_fault vmf; 593 594 /* fill out VM fault structure */ 595 vmf.virtual_address = (void *)(uintptr_t)IDX_TO_OFF(pidx); 596 vmf.flags = (fault_type & VM_PROT_WRITE) ? FAULT_FLAG_WRITE : 0; 597 vmf.pgoff = 0; 598 vmf.page = NULL; 599 vmf.vma = vmap; 600 601 vmap->vm_pfn_count = 0; 602 vmap->vm_pfn_pcount = &vmap->vm_pfn_count; 603 vmap->vm_obj = vm_obj; 604 605 err = vmap->vm_ops->fault(vmap, &vmf); 606 607 while (vmap->vm_pfn_count == 0 && err == VM_FAULT_NOPAGE) { 608 kern_yield(PRI_USER); 609 err = vmap->vm_ops->fault(vmap, &vmf); 610 } 611 } 612 613 /* translate return code */ 614 switch (err) { 615 case VM_FAULT_OOM: 616 err = VM_PAGER_AGAIN; 617 break; 618 case VM_FAULT_SIGBUS: 619 err = VM_PAGER_BAD; 620 break; 621 case VM_FAULT_NOPAGE: 622 /* 623 * By contract the fault handler will return having 624 * busied all the pages itself. If pidx is already 625 * found in the object, it will simply xbusy the first 626 * page and return with vm_pfn_count set to 1. 627 */ 628 *first = vmap->vm_pfn_first; 629 *last = *first + vmap->vm_pfn_count - 1; 630 err = VM_PAGER_OK; 631 break; 632 default: 633 err = VM_PAGER_ERROR; 634 break; 635 } 636 up_write(&vmap->vm_mm->mmap_sem); 637 VM_OBJECT_WLOCK(vm_obj); 638 return (err); 639 } 640 641 static struct rwlock linux_vma_lock; 642 static TAILQ_HEAD(, vm_area_struct) linux_vma_head = 643 TAILQ_HEAD_INITIALIZER(linux_vma_head); 644 645 static void 646 linux_cdev_handle_free(struct vm_area_struct *vmap) 647 { 648 /* Drop reference on vm_file */ 649 if (vmap->vm_file != NULL) 650 fput(vmap->vm_file); 651 652 /* Drop reference on mm_struct */ 653 mmput(vmap->vm_mm); 654 655 kfree(vmap); 656 } 657 658 static void 659 linux_cdev_handle_remove(struct vm_area_struct *vmap) 660 { 661 rw_wlock(&linux_vma_lock); 662 TAILQ_REMOVE(&linux_vma_head, vmap, vm_entry); 663 rw_wunlock(&linux_vma_lock); 664 } 665 666 static struct vm_area_struct * 667 linux_cdev_handle_find(void *handle) 668 { 669 struct vm_area_struct *vmap; 670 671 rw_rlock(&linux_vma_lock); 672 TAILQ_FOREACH(vmap, &linux_vma_head, vm_entry) { 673 if (vmap->vm_private_data == handle) 674 break; 675 } 676 rw_runlock(&linux_vma_lock); 677 return (vmap); 678 } 679 680 static int 681 linux_cdev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, 682 vm_ooffset_t foff, struct ucred *cred, u_short *color) 683 { 684 685 MPASS(linux_cdev_handle_find(handle) != NULL); 686 *color = 0; 687 return (0); 688 } 689 690 static void 691 linux_cdev_pager_dtor(void *handle) 692 { 693 const struct vm_operations_struct *vm_ops; 694 struct vm_area_struct *vmap; 695 696 vmap = linux_cdev_handle_find(handle); 697 MPASS(vmap != NULL); 698 699 /* 700 * Remove handle before calling close operation to prevent 701 * other threads from reusing the handle pointer. 702 */ 703 linux_cdev_handle_remove(vmap); 704 705 down_write(&vmap->vm_mm->mmap_sem); 706 vm_ops = vmap->vm_ops; 707 if (likely(vm_ops != NULL)) 708 vm_ops->close(vmap); 709 up_write(&vmap->vm_mm->mmap_sem); 710 711 linux_cdev_handle_free(vmap); 712 } 713 714 static struct cdev_pager_ops linux_cdev_pager_ops[2] = { 715 { 716 /* OBJT_MGTDEVICE */ 717 .cdev_pg_populate = linux_cdev_pager_populate, 718 .cdev_pg_ctor = linux_cdev_pager_ctor, 719 .cdev_pg_dtor = linux_cdev_pager_dtor 720 }, 721 { 722 /* OBJT_DEVICE */ 723 .cdev_pg_fault = linux_cdev_pager_fault, 724 .cdev_pg_ctor = linux_cdev_pager_ctor, 725 .cdev_pg_dtor = linux_cdev_pager_dtor 726 }, 727 }; 728 729 int 730 zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, 731 unsigned long size) 732 { 733 vm_object_t obj; 734 vm_page_t m; 735 736 obj = vma->vm_obj; 737 if (obj == NULL || (obj->flags & OBJ_UNMANAGED) != 0) 738 return (-ENOTSUP); 739 VM_OBJECT_RLOCK(obj); 740 for (m = vm_page_find_least(obj, OFF_TO_IDX(address)); 741 m != NULL && m->pindex < OFF_TO_IDX(address + size); 742 m = TAILQ_NEXT(m, listq)) 743 pmap_remove_all(m); 744 VM_OBJECT_RUNLOCK(obj); 745 return (0); 746 } 747 748 static struct file_operations dummy_ldev_ops = { 749 /* XXXKIB */ 750 }; 751 752 static struct linux_cdev dummy_ldev = { 753 .ops = &dummy_ldev_ops, 754 }; 755 756 #define LDEV_SI_DTR 0x0001 757 #define LDEV_SI_REF 0x0002 758 759 static void 760 linux_get_fop(struct linux_file *filp, const struct file_operations **fop, 761 struct linux_cdev **dev) 762 { 763 struct linux_cdev *ldev; 764 u_int siref; 765 766 ldev = filp->f_cdev; 767 *fop = filp->f_op; 768 if (ldev != NULL) { 769 if (ldev->kobj.ktype == &linux_cdev_static_ktype) { 770 refcount_acquire(&ldev->refs); 771 } else { 772 for (siref = ldev->siref;;) { 773 if ((siref & LDEV_SI_DTR) != 0) { 774 ldev = &dummy_ldev; 775 *fop = ldev->ops; 776 siref = ldev->siref; 777 MPASS((ldev->siref & LDEV_SI_DTR) == 0); 778 } else if (atomic_fcmpset_int(&ldev->siref, 779 &siref, siref + LDEV_SI_REF)) { 780 break; 781 } 782 } 783 } 784 } 785 *dev = ldev; 786 } 787 788 static void 789 linux_drop_fop(struct linux_cdev *ldev) 790 { 791 792 if (ldev == NULL) 793 return; 794 if (ldev->kobj.ktype == &linux_cdev_static_ktype) { 795 linux_cdev_deref(ldev); 796 } else { 797 MPASS(ldev->kobj.ktype == &linux_cdev_ktype); 798 MPASS((ldev->siref & ~LDEV_SI_DTR) != 0); 799 atomic_subtract_int(&ldev->siref, LDEV_SI_REF); 800 } 801 } 802 803 #define OPW(fp,td,code) ({ \ 804 struct file *__fpop; \ 805 __typeof(code) __retval; \ 806 \ 807 __fpop = (td)->td_fpop; \ 808 (td)->td_fpop = (fp); \ 809 __retval = (code); \ 810 (td)->td_fpop = __fpop; \ 811 __retval; \ 812 }) 813 814 static int 815 linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td, 816 struct file *file) 817 { 818 struct linux_cdev *ldev; 819 struct linux_file *filp; 820 const struct file_operations *fop; 821 int error; 822 823 ldev = dev->si_drv1; 824 825 filp = linux_file_alloc(); 826 filp->f_dentry = &filp->f_dentry_store; 827 filp->f_op = ldev->ops; 828 filp->f_mode = file->f_flag; 829 filp->f_flags = file->f_flag; 830 filp->f_vnode = file->f_vnode; 831 filp->_file = file; 832 refcount_acquire(&ldev->refs); 833 filp->f_cdev = ldev; 834 835 linux_set_current(td); 836 linux_get_fop(filp, &fop, &ldev); 837 838 if (fop->open != NULL) { 839 error = -fop->open(file->f_vnode, filp); 840 if (error != 0) { 841 linux_drop_fop(ldev); 842 linux_cdev_deref(filp->f_cdev); 843 kfree(filp); 844 return (error); 845 } 846 } 847 848 /* hold on to the vnode - used for fstat() */ 849 vhold(filp->f_vnode); 850 851 /* release the file from devfs */ 852 finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops); 853 linux_drop_fop(ldev); 854 return (ENXIO); 855 } 856 857 #define LINUX_IOCTL_MIN_PTR 0x10000UL 858 #define LINUX_IOCTL_MAX_PTR (LINUX_IOCTL_MIN_PTR + IOCPARM_MAX) 859 860 static inline int 861 linux_remap_address(void **uaddr, size_t len) 862 { 863 uintptr_t uaddr_val = (uintptr_t)(*uaddr); 864 865 if (unlikely(uaddr_val >= LINUX_IOCTL_MIN_PTR && 866 uaddr_val < LINUX_IOCTL_MAX_PTR)) { 867 struct task_struct *pts = current; 868 if (pts == NULL) { 869 *uaddr = NULL; 870 return (1); 871 } 872 873 /* compute data offset */ 874 uaddr_val -= LINUX_IOCTL_MIN_PTR; 875 876 /* check that length is within bounds */ 877 if ((len > IOCPARM_MAX) || 878 (uaddr_val + len) > pts->bsd_ioctl_len) { 879 *uaddr = NULL; 880 return (1); 881 } 882 883 /* re-add kernel buffer address */ 884 uaddr_val += (uintptr_t)pts->bsd_ioctl_data; 885 886 /* update address location */ 887 *uaddr = (void *)uaddr_val; 888 return (1); 889 } 890 return (0); 891 } 892 893 int 894 linux_copyin(const void *uaddr, void *kaddr, size_t len) 895 { 896 if (linux_remap_address(__DECONST(void **, &uaddr), len)) { 897 if (uaddr == NULL) 898 return (-EFAULT); 899 memcpy(kaddr, uaddr, len); 900 return (0); 901 } 902 return (-copyin(uaddr, kaddr, len)); 903 } 904 905 int 906 linux_copyout(const void *kaddr, void *uaddr, size_t len) 907 { 908 if (linux_remap_address(&uaddr, len)) { 909 if (uaddr == NULL) 910 return (-EFAULT); 911 memcpy(uaddr, kaddr, len); 912 return (0); 913 } 914 return (-copyout(kaddr, uaddr, len)); 915 } 916 917 size_t 918 linux_clear_user(void *_uaddr, size_t _len) 919 { 920 uint8_t *uaddr = _uaddr; 921 size_t len = _len; 922 923 /* make sure uaddr is aligned before going into the fast loop */ 924 while (((uintptr_t)uaddr & 7) != 0 && len > 7) { 925 if (subyte(uaddr, 0)) 926 return (_len); 927 uaddr++; 928 len--; 929 } 930 931 /* zero 8 bytes at a time */ 932 while (len > 7) { 933 #ifdef __LP64__ 934 if (suword64(uaddr, 0)) 935 return (_len); 936 #else 937 if (suword32(uaddr, 0)) 938 return (_len); 939 if (suword32(uaddr + 4, 0)) 940 return (_len); 941 #endif 942 uaddr += 8; 943 len -= 8; 944 } 945 946 /* zero fill end, if any */ 947 while (len > 0) { 948 if (subyte(uaddr, 0)) 949 return (_len); 950 uaddr++; 951 len--; 952 } 953 return (0); 954 } 955 956 int 957 linux_access_ok(const void *uaddr, size_t len) 958 { 959 uintptr_t saddr; 960 uintptr_t eaddr; 961 962 /* get start and end address */ 963 saddr = (uintptr_t)uaddr; 964 eaddr = (uintptr_t)uaddr + len; 965 966 /* verify addresses are valid for userspace */ 967 return ((saddr == eaddr) || 968 (eaddr > saddr && eaddr <= VM_MAXUSER_ADDRESS)); 969 } 970 971 /* 972 * This function should return either EINTR or ERESTART depending on 973 * the signal type sent to this thread: 974 */ 975 static int 976 linux_get_error(struct task_struct *task, int error) 977 { 978 /* check for signal type interrupt code */ 979 if (error == EINTR || error == ERESTARTSYS || error == ERESTART) { 980 error = -linux_schedule_get_interrupt_value(task); 981 if (error == 0) 982 error = EINTR; 983 } 984 return (error); 985 } 986 987 static int 988 linux_file_ioctl_sub(struct file *fp, struct linux_file *filp, 989 const struct file_operations *fop, u_long cmd, caddr_t data, 990 struct thread *td) 991 { 992 struct task_struct *task = current; 993 unsigned size; 994 int error; 995 996 size = IOCPARM_LEN(cmd); 997 /* refer to logic in sys_ioctl() */ 998 if (size > 0) { 999 /* 1000 * Setup hint for linux_copyin() and linux_copyout(). 1001 * 1002 * Background: Linux code expects a user-space address 1003 * while FreeBSD supplies a kernel-space address. 1004 */ 1005 task->bsd_ioctl_data = data; 1006 task->bsd_ioctl_len = size; 1007 data = (void *)LINUX_IOCTL_MIN_PTR; 1008 } else { 1009 /* fetch user-space pointer */ 1010 data = *(void **)data; 1011 } 1012 #if defined(__amd64__) 1013 if (td->td_proc->p_elf_machine == EM_386) { 1014 /* try the compat IOCTL handler first */ 1015 if (fop->compat_ioctl != NULL) { 1016 error = -OPW(fp, td, fop->compat_ioctl(filp, 1017 cmd, (u_long)data)); 1018 } else { 1019 error = ENOTTY; 1020 } 1021 1022 /* fallback to the regular IOCTL handler, if any */ 1023 if (error == ENOTTY && fop->unlocked_ioctl != NULL) { 1024 error = -OPW(fp, td, fop->unlocked_ioctl(filp, 1025 cmd, (u_long)data)); 1026 } 1027 } else 1028 #endif 1029 { 1030 if (fop->unlocked_ioctl != NULL) { 1031 error = -OPW(fp, td, fop->unlocked_ioctl(filp, 1032 cmd, (u_long)data)); 1033 } else { 1034 error = ENOTTY; 1035 } 1036 } 1037 if (size > 0) { 1038 task->bsd_ioctl_data = NULL; 1039 task->bsd_ioctl_len = 0; 1040 } 1041 1042 if (error == EWOULDBLOCK) { 1043 /* update kqfilter status, if any */ 1044 linux_file_kqfilter_poll(filp, 1045 LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE); 1046 } else { 1047 error = linux_get_error(task, error); 1048 } 1049 return (error); 1050 } 1051 1052 #define LINUX_POLL_TABLE_NORMAL ((poll_table *)1) 1053 1054 /* 1055 * This function atomically updates the poll wakeup state and returns 1056 * the previous state at the time of update. 1057 */ 1058 static uint8_t 1059 linux_poll_wakeup_state(atomic_t *v, const uint8_t *pstate) 1060 { 1061 int c, old; 1062 1063 c = v->counter; 1064 1065 while ((old = atomic_cmpxchg(v, c, pstate[c])) != c) 1066 c = old; 1067 1068 return (c); 1069 } 1070 1071 static int 1072 linux_poll_wakeup_callback(wait_queue_t *wq, unsigned int wq_state, int flags, void *key) 1073 { 1074 static const uint8_t state[LINUX_FWQ_STATE_MAX] = { 1075 [LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT, /* NOP */ 1076 [LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */ 1077 [LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_READY, 1078 [LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_READY, /* NOP */ 1079 }; 1080 struct linux_file *filp = container_of(wq, struct linux_file, f_wait_queue.wq); 1081 1082 switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) { 1083 case LINUX_FWQ_STATE_QUEUED: 1084 linux_poll_wakeup(filp); 1085 return (1); 1086 default: 1087 return (0); 1088 } 1089 } 1090 1091 void 1092 linux_poll_wait(struct linux_file *filp, wait_queue_head_t *wqh, poll_table *p) 1093 { 1094 static const uint8_t state[LINUX_FWQ_STATE_MAX] = { 1095 [LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_NOT_READY, 1096 [LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */ 1097 [LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_QUEUED, /* NOP */ 1098 [LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_QUEUED, 1099 }; 1100 1101 /* check if we are called inside the select system call */ 1102 if (p == LINUX_POLL_TABLE_NORMAL) 1103 selrecord(curthread, &filp->f_selinfo); 1104 1105 switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) { 1106 case LINUX_FWQ_STATE_INIT: 1107 /* NOTE: file handles can only belong to one wait-queue */ 1108 filp->f_wait_queue.wqh = wqh; 1109 filp->f_wait_queue.wq.func = &linux_poll_wakeup_callback; 1110 add_wait_queue(wqh, &filp->f_wait_queue.wq); 1111 atomic_set(&filp->f_wait_queue.state, LINUX_FWQ_STATE_QUEUED); 1112 break; 1113 default: 1114 break; 1115 } 1116 } 1117 1118 static void 1119 linux_poll_wait_dequeue(struct linux_file *filp) 1120 { 1121 static const uint8_t state[LINUX_FWQ_STATE_MAX] = { 1122 [LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT, /* NOP */ 1123 [LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_INIT, 1124 [LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_INIT, 1125 [LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_INIT, 1126 }; 1127 1128 seldrain(&filp->f_selinfo); 1129 1130 switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) { 1131 case LINUX_FWQ_STATE_NOT_READY: 1132 case LINUX_FWQ_STATE_QUEUED: 1133 case LINUX_FWQ_STATE_READY: 1134 remove_wait_queue(filp->f_wait_queue.wqh, &filp->f_wait_queue.wq); 1135 break; 1136 default: 1137 break; 1138 } 1139 } 1140 1141 void 1142 linux_poll_wakeup(struct linux_file *filp) 1143 { 1144 /* this function should be NULL-safe */ 1145 if (filp == NULL) 1146 return; 1147 1148 selwakeup(&filp->f_selinfo); 1149 1150 spin_lock(&filp->f_kqlock); 1151 filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ | 1152 LINUX_KQ_FLAG_NEED_WRITE; 1153 1154 /* make sure the "knote" gets woken up */ 1155 KNOTE_LOCKED(&filp->f_selinfo.si_note, 1); 1156 spin_unlock(&filp->f_kqlock); 1157 } 1158 1159 static void 1160 linux_file_kqfilter_detach(struct knote *kn) 1161 { 1162 struct linux_file *filp = kn->kn_hook; 1163 1164 spin_lock(&filp->f_kqlock); 1165 knlist_remove(&filp->f_selinfo.si_note, kn, 1); 1166 spin_unlock(&filp->f_kqlock); 1167 } 1168 1169 static int 1170 linux_file_kqfilter_read_event(struct knote *kn, long hint) 1171 { 1172 struct linux_file *filp = kn->kn_hook; 1173 1174 mtx_assert(&filp->f_kqlock.m, MA_OWNED); 1175 1176 return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_READ) ? 1 : 0); 1177 } 1178 1179 static int 1180 linux_file_kqfilter_write_event(struct knote *kn, long hint) 1181 { 1182 struct linux_file *filp = kn->kn_hook; 1183 1184 mtx_assert(&filp->f_kqlock.m, MA_OWNED); 1185 1186 return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_WRITE) ? 1 : 0); 1187 } 1188 1189 static struct filterops linux_dev_kqfiltops_read = { 1190 .f_isfd = 1, 1191 .f_detach = linux_file_kqfilter_detach, 1192 .f_event = linux_file_kqfilter_read_event, 1193 }; 1194 1195 static struct filterops linux_dev_kqfiltops_write = { 1196 .f_isfd = 1, 1197 .f_detach = linux_file_kqfilter_detach, 1198 .f_event = linux_file_kqfilter_write_event, 1199 }; 1200 1201 static void 1202 linux_file_kqfilter_poll(struct linux_file *filp, int kqflags) 1203 { 1204 struct thread *td; 1205 const struct file_operations *fop; 1206 struct linux_cdev *ldev; 1207 int temp; 1208 1209 if ((filp->f_kqflags & kqflags) == 0) 1210 return; 1211 1212 td = curthread; 1213 1214 linux_get_fop(filp, &fop, &ldev); 1215 /* get the latest polling state */ 1216 temp = OPW(filp->_file, td, fop->poll(filp, NULL)); 1217 linux_drop_fop(ldev); 1218 1219 spin_lock(&filp->f_kqlock); 1220 /* clear kqflags */ 1221 filp->f_kqflags &= ~(LINUX_KQ_FLAG_NEED_READ | 1222 LINUX_KQ_FLAG_NEED_WRITE); 1223 /* update kqflags */ 1224 if ((temp & (POLLIN | POLLOUT)) != 0) { 1225 if ((temp & POLLIN) != 0) 1226 filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ; 1227 if ((temp & POLLOUT) != 0) 1228 filp->f_kqflags |= LINUX_KQ_FLAG_NEED_WRITE; 1229 1230 /* make sure the "knote" gets woken up */ 1231 KNOTE_LOCKED(&filp->f_selinfo.si_note, 0); 1232 } 1233 spin_unlock(&filp->f_kqlock); 1234 } 1235 1236 static int 1237 linux_file_kqfilter(struct file *file, struct knote *kn) 1238 { 1239 struct linux_file *filp; 1240 struct thread *td; 1241 int error; 1242 1243 td = curthread; 1244 filp = (struct linux_file *)file->f_data; 1245 filp->f_flags = file->f_flag; 1246 if (filp->f_op->poll == NULL) 1247 return (EINVAL); 1248 1249 spin_lock(&filp->f_kqlock); 1250 switch (kn->kn_filter) { 1251 case EVFILT_READ: 1252 filp->f_kqflags |= LINUX_KQ_FLAG_HAS_READ; 1253 kn->kn_fop = &linux_dev_kqfiltops_read; 1254 kn->kn_hook = filp; 1255 knlist_add(&filp->f_selinfo.si_note, kn, 1); 1256 error = 0; 1257 break; 1258 case EVFILT_WRITE: 1259 filp->f_kqflags |= LINUX_KQ_FLAG_HAS_WRITE; 1260 kn->kn_fop = &linux_dev_kqfiltops_write; 1261 kn->kn_hook = filp; 1262 knlist_add(&filp->f_selinfo.si_note, kn, 1); 1263 error = 0; 1264 break; 1265 default: 1266 error = EINVAL; 1267 break; 1268 } 1269 spin_unlock(&filp->f_kqlock); 1270 1271 if (error == 0) { 1272 linux_set_current(td); 1273 1274 /* update kqfilter status, if any */ 1275 linux_file_kqfilter_poll(filp, 1276 LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE); 1277 } 1278 return (error); 1279 } 1280 1281 static int 1282 linux_file_mmap_single(struct file *fp, const struct file_operations *fop, 1283 vm_ooffset_t *offset, vm_size_t size, struct vm_object **object, 1284 int nprot, bool is_shared, struct thread *td) 1285 { 1286 struct task_struct *task; 1287 struct vm_area_struct *vmap; 1288 struct mm_struct *mm; 1289 struct linux_file *filp; 1290 vm_memattr_t attr; 1291 int error; 1292 1293 filp = (struct linux_file *)fp->f_data; 1294 filp->f_flags = fp->f_flag; 1295 1296 if (fop->mmap == NULL) 1297 return (EOPNOTSUPP); 1298 1299 linux_set_current(td); 1300 1301 /* 1302 * The same VM object might be shared by multiple processes 1303 * and the mm_struct is usually freed when a process exits. 1304 * 1305 * The atomic reference below makes sure the mm_struct is 1306 * available as long as the vmap is in the linux_vma_head. 1307 */ 1308 task = current; 1309 mm = task->mm; 1310 if (atomic_inc_not_zero(&mm->mm_users) == 0) 1311 return (EINVAL); 1312 1313 vmap = kzalloc(sizeof(*vmap), GFP_KERNEL); 1314 vmap->vm_start = 0; 1315 vmap->vm_end = size; 1316 vmap->vm_pgoff = *offset / PAGE_SIZE; 1317 vmap->vm_pfn = 0; 1318 vmap->vm_flags = vmap->vm_page_prot = (nprot & VM_PROT_ALL); 1319 if (is_shared) 1320 vmap->vm_flags |= VM_SHARED; 1321 vmap->vm_ops = NULL; 1322 vmap->vm_file = get_file(filp); 1323 vmap->vm_mm = mm; 1324 1325 if (unlikely(down_write_killable(&vmap->vm_mm->mmap_sem))) { 1326 error = linux_get_error(task, EINTR); 1327 } else { 1328 error = -OPW(fp, td, fop->mmap(filp, vmap)); 1329 error = linux_get_error(task, error); 1330 up_write(&vmap->vm_mm->mmap_sem); 1331 } 1332 1333 if (error != 0) { 1334 linux_cdev_handle_free(vmap); 1335 return (error); 1336 } 1337 1338 attr = pgprot2cachemode(vmap->vm_page_prot); 1339 1340 if (vmap->vm_ops != NULL) { 1341 struct vm_area_struct *ptr; 1342 void *vm_private_data; 1343 bool vm_no_fault; 1344 1345 if (vmap->vm_ops->open == NULL || 1346 vmap->vm_ops->close == NULL || 1347 vmap->vm_private_data == NULL) { 1348 /* free allocated VM area struct */ 1349 linux_cdev_handle_free(vmap); 1350 return (EINVAL); 1351 } 1352 1353 vm_private_data = vmap->vm_private_data; 1354 1355 rw_wlock(&linux_vma_lock); 1356 TAILQ_FOREACH(ptr, &linux_vma_head, vm_entry) { 1357 if (ptr->vm_private_data == vm_private_data) 1358 break; 1359 } 1360 /* check if there is an existing VM area struct */ 1361 if (ptr != NULL) { 1362 /* check if the VM area structure is invalid */ 1363 if (ptr->vm_ops == NULL || 1364 ptr->vm_ops->open == NULL || 1365 ptr->vm_ops->close == NULL) { 1366 error = ESTALE; 1367 vm_no_fault = 1; 1368 } else { 1369 error = EEXIST; 1370 vm_no_fault = (ptr->vm_ops->fault == NULL); 1371 } 1372 } else { 1373 /* insert VM area structure into list */ 1374 TAILQ_INSERT_TAIL(&linux_vma_head, vmap, vm_entry); 1375 error = 0; 1376 vm_no_fault = (vmap->vm_ops->fault == NULL); 1377 } 1378 rw_wunlock(&linux_vma_lock); 1379 1380 if (error != 0) { 1381 /* free allocated VM area struct */ 1382 linux_cdev_handle_free(vmap); 1383 /* check for stale VM area struct */ 1384 if (error != EEXIST) 1385 return (error); 1386 } 1387 1388 /* check if there is no fault handler */ 1389 if (vm_no_fault) { 1390 *object = cdev_pager_allocate(vm_private_data, OBJT_DEVICE, 1391 &linux_cdev_pager_ops[1], size, nprot, *offset, 1392 td->td_ucred); 1393 } else { 1394 *object = cdev_pager_allocate(vm_private_data, OBJT_MGTDEVICE, 1395 &linux_cdev_pager_ops[0], size, nprot, *offset, 1396 td->td_ucred); 1397 } 1398 1399 /* check if allocating the VM object failed */ 1400 if (*object == NULL) { 1401 if (error == 0) { 1402 /* remove VM area struct from list */ 1403 linux_cdev_handle_remove(vmap); 1404 /* free allocated VM area struct */ 1405 linux_cdev_handle_free(vmap); 1406 } 1407 return (EINVAL); 1408 } 1409 } else { 1410 struct sglist *sg; 1411 1412 sg = sglist_alloc(1, M_WAITOK); 1413 sglist_append_phys(sg, 1414 (vm_paddr_t)vmap->vm_pfn << PAGE_SHIFT, vmap->vm_len); 1415 1416 *object = vm_pager_allocate(OBJT_SG, sg, vmap->vm_len, 1417 nprot, 0, td->td_ucred); 1418 1419 linux_cdev_handle_free(vmap); 1420 1421 if (*object == NULL) { 1422 sglist_free(sg); 1423 return (EINVAL); 1424 } 1425 } 1426 1427 if (attr != VM_MEMATTR_DEFAULT) { 1428 VM_OBJECT_WLOCK(*object); 1429 vm_object_set_memattr(*object, attr); 1430 VM_OBJECT_WUNLOCK(*object); 1431 } 1432 *offset = 0; 1433 return (0); 1434 } 1435 1436 struct cdevsw linuxcdevsw = { 1437 .d_version = D_VERSION, 1438 .d_fdopen = linux_dev_fdopen, 1439 .d_name = "lkpidev", 1440 }; 1441 1442 static int 1443 linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred, 1444 int flags, struct thread *td) 1445 { 1446 struct linux_file *filp; 1447 const struct file_operations *fop; 1448 struct linux_cdev *ldev; 1449 ssize_t bytes; 1450 int error; 1451 1452 error = 0; 1453 filp = (struct linux_file *)file->f_data; 1454 filp->f_flags = file->f_flag; 1455 /* XXX no support for I/O vectors currently */ 1456 if (uio->uio_iovcnt != 1) 1457 return (EOPNOTSUPP); 1458 if (uio->uio_resid > DEVFS_IOSIZE_MAX) 1459 return (EINVAL); 1460 linux_set_current(td); 1461 linux_get_fop(filp, &fop, &ldev); 1462 if (fop->read != NULL) { 1463 bytes = OPW(file, td, fop->read(filp, 1464 uio->uio_iov->iov_base, 1465 uio->uio_iov->iov_len, &uio->uio_offset)); 1466 if (bytes >= 0) { 1467 uio->uio_iov->iov_base = 1468 ((uint8_t *)uio->uio_iov->iov_base) + bytes; 1469 uio->uio_iov->iov_len -= bytes; 1470 uio->uio_resid -= bytes; 1471 } else { 1472 error = linux_get_error(current, -bytes); 1473 } 1474 } else 1475 error = ENXIO; 1476 1477 /* update kqfilter status, if any */ 1478 linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ); 1479 linux_drop_fop(ldev); 1480 1481 return (error); 1482 } 1483 1484 static int 1485 linux_file_write(struct file *file, struct uio *uio, struct ucred *active_cred, 1486 int flags, struct thread *td) 1487 { 1488 struct linux_file *filp; 1489 const struct file_operations *fop; 1490 struct linux_cdev *ldev; 1491 ssize_t bytes; 1492 int error; 1493 1494 filp = (struct linux_file *)file->f_data; 1495 filp->f_flags = file->f_flag; 1496 /* XXX no support for I/O vectors currently */ 1497 if (uio->uio_iovcnt != 1) 1498 return (EOPNOTSUPP); 1499 if (uio->uio_resid > DEVFS_IOSIZE_MAX) 1500 return (EINVAL); 1501 linux_set_current(td); 1502 linux_get_fop(filp, &fop, &ldev); 1503 if (fop->write != NULL) { 1504 bytes = OPW(file, td, fop->write(filp, 1505 uio->uio_iov->iov_base, 1506 uio->uio_iov->iov_len, &uio->uio_offset)); 1507 if (bytes >= 0) { 1508 uio->uio_iov->iov_base = 1509 ((uint8_t *)uio->uio_iov->iov_base) + bytes; 1510 uio->uio_iov->iov_len -= bytes; 1511 uio->uio_resid -= bytes; 1512 error = 0; 1513 } else { 1514 error = linux_get_error(current, -bytes); 1515 } 1516 } else 1517 error = ENXIO; 1518 1519 /* update kqfilter status, if any */ 1520 linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_WRITE); 1521 1522 linux_drop_fop(ldev); 1523 1524 return (error); 1525 } 1526 1527 static int 1528 linux_file_poll(struct file *file, int events, struct ucred *active_cred, 1529 struct thread *td) 1530 { 1531 struct linux_file *filp; 1532 const struct file_operations *fop; 1533 struct linux_cdev *ldev; 1534 int revents; 1535 1536 filp = (struct linux_file *)file->f_data; 1537 filp->f_flags = file->f_flag; 1538 linux_set_current(td); 1539 linux_get_fop(filp, &fop, &ldev); 1540 if (fop->poll != NULL) { 1541 revents = OPW(file, td, fop->poll(filp, 1542 LINUX_POLL_TABLE_NORMAL)) & events; 1543 } else { 1544 revents = 0; 1545 } 1546 linux_drop_fop(ldev); 1547 return (revents); 1548 } 1549 1550 static int 1551 linux_file_close(struct file *file, struct thread *td) 1552 { 1553 struct linux_file *filp; 1554 int (*release)(struct inode *, struct linux_file *); 1555 const struct file_operations *fop; 1556 struct linux_cdev *ldev; 1557 int error; 1558 1559 filp = (struct linux_file *)file->f_data; 1560 1561 KASSERT(file_count(filp) == 0, 1562 ("File refcount(%d) is not zero", file_count(filp))); 1563 1564 if (td == NULL) 1565 td = curthread; 1566 1567 error = 0; 1568 filp->f_flags = file->f_flag; 1569 linux_set_current(td); 1570 linux_poll_wait_dequeue(filp); 1571 linux_get_fop(filp, &fop, &ldev); 1572 /* 1573 * Always use the real release function, if any, to avoid 1574 * leaking device resources: 1575 */ 1576 release = filp->f_op->release; 1577 if (release != NULL) 1578 error = -OPW(file, td, release(filp->f_vnode, filp)); 1579 funsetown(&filp->f_sigio); 1580 if (filp->f_vnode != NULL) 1581 vdrop(filp->f_vnode); 1582 linux_drop_fop(ldev); 1583 ldev = filp->f_cdev; 1584 if (ldev != NULL) 1585 linux_cdev_deref(ldev); 1586 linux_synchronize_rcu(RCU_TYPE_REGULAR); 1587 kfree(filp); 1588 1589 return (error); 1590 } 1591 1592 static int 1593 linux_file_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *cred, 1594 struct thread *td) 1595 { 1596 struct linux_file *filp; 1597 const struct file_operations *fop; 1598 struct linux_cdev *ldev; 1599 struct fiodgname_arg *fgn; 1600 const char *p; 1601 int error, i; 1602 1603 error = 0; 1604 filp = (struct linux_file *)fp->f_data; 1605 filp->f_flags = fp->f_flag; 1606 linux_get_fop(filp, &fop, &ldev); 1607 1608 linux_set_current(td); 1609 switch (cmd) { 1610 case FIONBIO: 1611 break; 1612 case FIOASYNC: 1613 if (fop->fasync == NULL) 1614 break; 1615 error = -OPW(fp, td, fop->fasync(0, filp, fp->f_flag & FASYNC)); 1616 break; 1617 case FIOSETOWN: 1618 error = fsetown(*(int *)data, &filp->f_sigio); 1619 if (error == 0) { 1620 if (fop->fasync == NULL) 1621 break; 1622 error = -OPW(fp, td, fop->fasync(0, filp, 1623 fp->f_flag & FASYNC)); 1624 } 1625 break; 1626 case FIOGETOWN: 1627 *(int *)data = fgetown(&filp->f_sigio); 1628 break; 1629 case FIODGNAME: 1630 #ifdef COMPAT_FREEBSD32 1631 case FIODGNAME_32: 1632 #endif 1633 if (filp->f_cdev == NULL || filp->f_cdev->cdev == NULL) { 1634 error = ENXIO; 1635 break; 1636 } 1637 fgn = data; 1638 p = devtoname(filp->f_cdev->cdev); 1639 i = strlen(p) + 1; 1640 if (i > fgn->len) { 1641 error = EINVAL; 1642 break; 1643 } 1644 error = copyout(p, fiodgname_buf_get_ptr(fgn, cmd), i); 1645 break; 1646 default: 1647 error = linux_file_ioctl_sub(fp, filp, fop, cmd, data, td); 1648 break; 1649 } 1650 linux_drop_fop(ldev); 1651 return (error); 1652 } 1653 1654 static int 1655 linux_file_mmap_sub(struct thread *td, vm_size_t objsize, vm_prot_t prot, 1656 vm_prot_t maxprot, int flags, struct file *fp, 1657 vm_ooffset_t *foff, const struct file_operations *fop, vm_object_t *objp) 1658 { 1659 /* 1660 * Character devices do not provide private mappings 1661 * of any kind: 1662 */ 1663 if ((maxprot & VM_PROT_WRITE) == 0 && 1664 (prot & VM_PROT_WRITE) != 0) 1665 return (EACCES); 1666 if ((flags & (MAP_PRIVATE | MAP_COPY)) != 0) 1667 return (EINVAL); 1668 1669 return (linux_file_mmap_single(fp, fop, foff, objsize, objp, 1670 (int)prot, (flags & MAP_SHARED) ? true : false, td)); 1671 } 1672 1673 static int 1674 linux_file_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, 1675 vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, 1676 struct thread *td) 1677 { 1678 struct linux_file *filp; 1679 const struct file_operations *fop; 1680 struct linux_cdev *ldev; 1681 struct mount *mp; 1682 struct vnode *vp; 1683 vm_object_t object; 1684 vm_prot_t maxprot; 1685 int error; 1686 1687 filp = (struct linux_file *)fp->f_data; 1688 1689 vp = filp->f_vnode; 1690 if (vp == NULL) 1691 return (EOPNOTSUPP); 1692 1693 /* 1694 * Ensure that file and memory protections are 1695 * compatible. 1696 */ 1697 mp = vp->v_mount; 1698 if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) { 1699 maxprot = VM_PROT_NONE; 1700 if ((prot & VM_PROT_EXECUTE) != 0) 1701 return (EACCES); 1702 } else 1703 maxprot = VM_PROT_EXECUTE; 1704 if ((fp->f_flag & FREAD) != 0) 1705 maxprot |= VM_PROT_READ; 1706 else if ((prot & VM_PROT_READ) != 0) 1707 return (EACCES); 1708 1709 /* 1710 * If we are sharing potential changes via MAP_SHARED and we 1711 * are trying to get write permission although we opened it 1712 * without asking for it, bail out. 1713 * 1714 * Note that most character devices always share mappings. 1715 * 1716 * Rely on linux_file_mmap_sub() to fail invalid MAP_PRIVATE 1717 * requests rather than doing it here. 1718 */ 1719 if ((flags & MAP_SHARED) != 0) { 1720 if ((fp->f_flag & FWRITE) != 0) 1721 maxprot |= VM_PROT_WRITE; 1722 else if ((prot & VM_PROT_WRITE) != 0) 1723 return (EACCES); 1724 } 1725 maxprot &= cap_maxprot; 1726 1727 linux_get_fop(filp, &fop, &ldev); 1728 error = linux_file_mmap_sub(td, size, prot, maxprot, flags, fp, 1729 &foff, fop, &object); 1730 if (error != 0) 1731 goto out; 1732 1733 error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, 1734 foff, FALSE, td); 1735 if (error != 0) 1736 vm_object_deallocate(object); 1737 out: 1738 linux_drop_fop(ldev); 1739 return (error); 1740 } 1741 1742 static int 1743 linux_file_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, 1744 struct thread *td) 1745 { 1746 struct linux_file *filp; 1747 struct vnode *vp; 1748 int error; 1749 1750 filp = (struct linux_file *)fp->f_data; 1751 if (filp->f_vnode == NULL) 1752 return (EOPNOTSUPP); 1753 1754 vp = filp->f_vnode; 1755 1756 vn_lock(vp, LK_SHARED | LK_RETRY); 1757 error = VOP_STAT(vp, sb, td->td_ucred, NOCRED, td); 1758 VOP_UNLOCK(vp); 1759 1760 return (error); 1761 } 1762 1763 static int 1764 linux_file_fill_kinfo(struct file *fp, struct kinfo_file *kif, 1765 struct filedesc *fdp) 1766 { 1767 struct linux_file *filp; 1768 struct vnode *vp; 1769 int error; 1770 1771 filp = fp->f_data; 1772 vp = filp->f_vnode; 1773 if (vp == NULL) { 1774 error = 0; 1775 kif->kf_type = KF_TYPE_DEV; 1776 } else { 1777 vref(vp); 1778 FILEDESC_SUNLOCK(fdp); 1779 error = vn_fill_kinfo_vnode(vp, kif); 1780 vrele(vp); 1781 kif->kf_type = KF_TYPE_VNODE; 1782 FILEDESC_SLOCK(fdp); 1783 } 1784 return (error); 1785 } 1786 1787 unsigned int 1788 linux_iminor(struct inode *inode) 1789 { 1790 struct linux_cdev *ldev; 1791 1792 if (inode == NULL || inode->v_rdev == NULL || 1793 inode->v_rdev->si_devsw != &linuxcdevsw) 1794 return (-1U); 1795 ldev = inode->v_rdev->si_drv1; 1796 if (ldev == NULL) 1797 return (-1U); 1798 1799 return (minor(ldev->dev)); 1800 } 1801 1802 struct fileops linuxfileops = { 1803 .fo_read = linux_file_read, 1804 .fo_write = linux_file_write, 1805 .fo_truncate = invfo_truncate, 1806 .fo_kqfilter = linux_file_kqfilter, 1807 .fo_stat = linux_file_stat, 1808 .fo_fill_kinfo = linux_file_fill_kinfo, 1809 .fo_poll = linux_file_poll, 1810 .fo_close = linux_file_close, 1811 .fo_ioctl = linux_file_ioctl, 1812 .fo_mmap = linux_file_mmap, 1813 .fo_chmod = invfo_chmod, 1814 .fo_chown = invfo_chown, 1815 .fo_sendfile = invfo_sendfile, 1816 .fo_flags = DFLAG_PASSABLE, 1817 }; 1818 1819 /* 1820 * Hash of vmmap addresses. This is infrequently accessed and does not 1821 * need to be particularly large. This is done because we must store the 1822 * caller's idea of the map size to properly unmap. 1823 */ 1824 struct vmmap { 1825 LIST_ENTRY(vmmap) vm_next; 1826 void *vm_addr; 1827 unsigned long vm_size; 1828 }; 1829 1830 struct vmmaphd { 1831 struct vmmap *lh_first; 1832 }; 1833 #define VMMAP_HASH_SIZE 64 1834 #define VMMAP_HASH_MASK (VMMAP_HASH_SIZE - 1) 1835 #define VM_HASH(addr) ((uintptr_t)(addr) >> PAGE_SHIFT) & VMMAP_HASH_MASK 1836 static struct vmmaphd vmmaphead[VMMAP_HASH_SIZE]; 1837 static struct mtx vmmaplock; 1838 1839 static void 1840 vmmap_add(void *addr, unsigned long size) 1841 { 1842 struct vmmap *vmmap; 1843 1844 vmmap = kmalloc(sizeof(*vmmap), GFP_KERNEL); 1845 mtx_lock(&vmmaplock); 1846 vmmap->vm_size = size; 1847 vmmap->vm_addr = addr; 1848 LIST_INSERT_HEAD(&vmmaphead[VM_HASH(addr)], vmmap, vm_next); 1849 mtx_unlock(&vmmaplock); 1850 } 1851 1852 static struct vmmap * 1853 vmmap_remove(void *addr) 1854 { 1855 struct vmmap *vmmap; 1856 1857 mtx_lock(&vmmaplock); 1858 LIST_FOREACH(vmmap, &vmmaphead[VM_HASH(addr)], vm_next) 1859 if (vmmap->vm_addr == addr) 1860 break; 1861 if (vmmap) 1862 LIST_REMOVE(vmmap, vm_next); 1863 mtx_unlock(&vmmaplock); 1864 1865 return (vmmap); 1866 } 1867 1868 #if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__) || defined(__riscv) 1869 void * 1870 _ioremap_attr(vm_paddr_t phys_addr, unsigned long size, int attr) 1871 { 1872 void *addr; 1873 1874 addr = pmap_mapdev_attr(phys_addr, size, attr); 1875 if (addr == NULL) 1876 return (NULL); 1877 vmmap_add(addr, size); 1878 1879 return (addr); 1880 } 1881 #endif 1882 1883 void 1884 iounmap(void *addr) 1885 { 1886 struct vmmap *vmmap; 1887 1888 vmmap = vmmap_remove(addr); 1889 if (vmmap == NULL) 1890 return; 1891 #if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__) || defined(__riscv) 1892 pmap_unmapdev((vm_offset_t)addr, vmmap->vm_size); 1893 #endif 1894 kfree(vmmap); 1895 } 1896 1897 void * 1898 vmap(struct page **pages, unsigned int count, unsigned long flags, int prot) 1899 { 1900 vm_offset_t off; 1901 size_t size; 1902 1903 size = count * PAGE_SIZE; 1904 off = kva_alloc(size); 1905 if (off == 0) 1906 return (NULL); 1907 vmmap_add((void *)off, size); 1908 pmap_qenter(off, pages, count); 1909 1910 return ((void *)off); 1911 } 1912 1913 void 1914 vunmap(void *addr) 1915 { 1916 struct vmmap *vmmap; 1917 1918 vmmap = vmmap_remove(addr); 1919 if (vmmap == NULL) 1920 return; 1921 pmap_qremove((vm_offset_t)addr, vmmap->vm_size / PAGE_SIZE); 1922 kva_free((vm_offset_t)addr, vmmap->vm_size); 1923 kfree(vmmap); 1924 } 1925 1926 static char * 1927 devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt, va_list ap) 1928 { 1929 unsigned int len; 1930 char *p; 1931 va_list aq; 1932 1933 va_copy(aq, ap); 1934 len = vsnprintf(NULL, 0, fmt, aq); 1935 va_end(aq); 1936 1937 if (dev != NULL) 1938 p = devm_kmalloc(dev, len + 1, gfp); 1939 else 1940 p = kmalloc(len + 1, gfp); 1941 if (p != NULL) 1942 vsnprintf(p, len + 1, fmt, ap); 1943 1944 return (p); 1945 } 1946 1947 char * 1948 kvasprintf(gfp_t gfp, const char *fmt, va_list ap) 1949 { 1950 1951 return (devm_kvasprintf(NULL, gfp, fmt, ap)); 1952 } 1953 1954 char * 1955 lkpi_devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...) 1956 { 1957 va_list ap; 1958 char *p; 1959 1960 va_start(ap, fmt); 1961 p = devm_kvasprintf(dev, gfp, fmt, ap); 1962 va_end(ap); 1963 1964 return (p); 1965 } 1966 1967 char * 1968 kasprintf(gfp_t gfp, const char *fmt, ...) 1969 { 1970 va_list ap; 1971 char *p; 1972 1973 va_start(ap, fmt); 1974 p = kvasprintf(gfp, fmt, ap); 1975 va_end(ap); 1976 1977 return (p); 1978 } 1979 1980 static void 1981 linux_timer_callback_wrapper(void *context) 1982 { 1983 struct timer_list *timer; 1984 1985 timer = context; 1986 1987 if (linux_set_current_flags(curthread, M_NOWAIT)) { 1988 /* try again later */ 1989 callout_reset(&timer->callout, 1, 1990 &linux_timer_callback_wrapper, timer); 1991 return; 1992 } 1993 1994 timer->function(timer->data); 1995 } 1996 1997 int 1998 mod_timer(struct timer_list *timer, int expires) 1999 { 2000 int ret; 2001 2002 timer->expires = expires; 2003 ret = callout_reset(&timer->callout, 2004 linux_timer_jiffies_until(expires), 2005 &linux_timer_callback_wrapper, timer); 2006 2007 MPASS(ret == 0 || ret == 1); 2008 2009 return (ret == 1); 2010 } 2011 2012 void 2013 add_timer(struct timer_list *timer) 2014 { 2015 2016 callout_reset(&timer->callout, 2017 linux_timer_jiffies_until(timer->expires), 2018 &linux_timer_callback_wrapper, timer); 2019 } 2020 2021 void 2022 add_timer_on(struct timer_list *timer, int cpu) 2023 { 2024 2025 callout_reset_on(&timer->callout, 2026 linux_timer_jiffies_until(timer->expires), 2027 &linux_timer_callback_wrapper, timer, cpu); 2028 } 2029 2030 int 2031 del_timer(struct timer_list *timer) 2032 { 2033 2034 if (callout_stop(&(timer)->callout) == -1) 2035 return (0); 2036 return (1); 2037 } 2038 2039 int 2040 del_timer_sync(struct timer_list *timer) 2041 { 2042 2043 if (callout_drain(&(timer)->callout) == -1) 2044 return (0); 2045 return (1); 2046 } 2047 2048 /* greatest common divisor, Euclid equation */ 2049 static uint64_t 2050 lkpi_gcd_64(uint64_t a, uint64_t b) 2051 { 2052 uint64_t an; 2053 uint64_t bn; 2054 2055 while (b != 0) { 2056 an = b; 2057 bn = a % b; 2058 a = an; 2059 b = bn; 2060 } 2061 return (a); 2062 } 2063 2064 uint64_t lkpi_nsec2hz_rem; 2065 uint64_t lkpi_nsec2hz_div = 1000000000ULL; 2066 uint64_t lkpi_nsec2hz_max; 2067 2068 uint64_t lkpi_usec2hz_rem; 2069 uint64_t lkpi_usec2hz_div = 1000000ULL; 2070 uint64_t lkpi_usec2hz_max; 2071 2072 uint64_t lkpi_msec2hz_rem; 2073 uint64_t lkpi_msec2hz_div = 1000ULL; 2074 uint64_t lkpi_msec2hz_max; 2075 2076 static void 2077 linux_timer_init(void *arg) 2078 { 2079 uint64_t gcd; 2080 2081 /* 2082 * Compute an internal HZ value which can divide 2**32 to 2083 * avoid timer rounding problems when the tick value wraps 2084 * around 2**32: 2085 */ 2086 linux_timer_hz_mask = 1; 2087 while (linux_timer_hz_mask < (unsigned long)hz) 2088 linux_timer_hz_mask *= 2; 2089 linux_timer_hz_mask--; 2090 2091 /* compute some internal constants */ 2092 2093 lkpi_nsec2hz_rem = hz; 2094 lkpi_usec2hz_rem = hz; 2095 lkpi_msec2hz_rem = hz; 2096 2097 gcd = lkpi_gcd_64(lkpi_nsec2hz_rem, lkpi_nsec2hz_div); 2098 lkpi_nsec2hz_rem /= gcd; 2099 lkpi_nsec2hz_div /= gcd; 2100 lkpi_nsec2hz_max = -1ULL / lkpi_nsec2hz_rem; 2101 2102 gcd = lkpi_gcd_64(lkpi_usec2hz_rem, lkpi_usec2hz_div); 2103 lkpi_usec2hz_rem /= gcd; 2104 lkpi_usec2hz_div /= gcd; 2105 lkpi_usec2hz_max = -1ULL / lkpi_usec2hz_rem; 2106 2107 gcd = lkpi_gcd_64(lkpi_msec2hz_rem, lkpi_msec2hz_div); 2108 lkpi_msec2hz_rem /= gcd; 2109 lkpi_msec2hz_div /= gcd; 2110 lkpi_msec2hz_max = -1ULL / lkpi_msec2hz_rem; 2111 } 2112 SYSINIT(linux_timer, SI_SUB_DRIVERS, SI_ORDER_FIRST, linux_timer_init, NULL); 2113 2114 void 2115 linux_complete_common(struct completion *c, int all) 2116 { 2117 int wakeup_swapper; 2118 2119 sleepq_lock(c); 2120 if (all) { 2121 c->done = UINT_MAX; 2122 wakeup_swapper = sleepq_broadcast(c, SLEEPQ_SLEEP, 0, 0); 2123 } else { 2124 if (c->done != UINT_MAX) 2125 c->done++; 2126 wakeup_swapper = sleepq_signal(c, SLEEPQ_SLEEP, 0, 0); 2127 } 2128 sleepq_release(c); 2129 if (wakeup_swapper) 2130 kick_proc0(); 2131 } 2132 2133 /* 2134 * Indefinite wait for done != 0 with or without signals. 2135 */ 2136 int 2137 linux_wait_for_common(struct completion *c, int flags) 2138 { 2139 struct task_struct *task; 2140 int error; 2141 2142 if (SCHEDULER_STOPPED()) 2143 return (0); 2144 2145 task = current; 2146 2147 if (flags != 0) 2148 flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP; 2149 else 2150 flags = SLEEPQ_SLEEP; 2151 error = 0; 2152 for (;;) { 2153 sleepq_lock(c); 2154 if (c->done) 2155 break; 2156 sleepq_add(c, NULL, "completion", flags, 0); 2157 if (flags & SLEEPQ_INTERRUPTIBLE) { 2158 DROP_GIANT(); 2159 error = -sleepq_wait_sig(c, 0); 2160 PICKUP_GIANT(); 2161 if (error != 0) { 2162 linux_schedule_save_interrupt_value(task, error); 2163 error = -ERESTARTSYS; 2164 goto intr; 2165 } 2166 } else { 2167 DROP_GIANT(); 2168 sleepq_wait(c, 0); 2169 PICKUP_GIANT(); 2170 } 2171 } 2172 if (c->done != UINT_MAX) 2173 c->done--; 2174 sleepq_release(c); 2175 2176 intr: 2177 return (error); 2178 } 2179 2180 /* 2181 * Time limited wait for done != 0 with or without signals. 2182 */ 2183 int 2184 linux_wait_for_timeout_common(struct completion *c, int timeout, int flags) 2185 { 2186 struct task_struct *task; 2187 int end = jiffies + timeout; 2188 int error; 2189 2190 if (SCHEDULER_STOPPED()) 2191 return (0); 2192 2193 task = current; 2194 2195 if (flags != 0) 2196 flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP; 2197 else 2198 flags = SLEEPQ_SLEEP; 2199 2200 for (;;) { 2201 sleepq_lock(c); 2202 if (c->done) 2203 break; 2204 sleepq_add(c, NULL, "completion", flags, 0); 2205 sleepq_set_timeout(c, linux_timer_jiffies_until(end)); 2206 2207 DROP_GIANT(); 2208 if (flags & SLEEPQ_INTERRUPTIBLE) 2209 error = -sleepq_timedwait_sig(c, 0); 2210 else 2211 error = -sleepq_timedwait(c, 0); 2212 PICKUP_GIANT(); 2213 2214 if (error != 0) { 2215 /* check for timeout */ 2216 if (error == -EWOULDBLOCK) { 2217 error = 0; /* timeout */ 2218 } else { 2219 /* signal happened */ 2220 linux_schedule_save_interrupt_value(task, error); 2221 error = -ERESTARTSYS; 2222 } 2223 goto done; 2224 } 2225 } 2226 if (c->done != UINT_MAX) 2227 c->done--; 2228 sleepq_release(c); 2229 2230 /* return how many jiffies are left */ 2231 error = linux_timer_jiffies_until(end); 2232 done: 2233 return (error); 2234 } 2235 2236 int 2237 linux_try_wait_for_completion(struct completion *c) 2238 { 2239 int isdone; 2240 2241 sleepq_lock(c); 2242 isdone = (c->done != 0); 2243 if (c->done != 0 && c->done != UINT_MAX) 2244 c->done--; 2245 sleepq_release(c); 2246 return (isdone); 2247 } 2248 2249 int 2250 linux_completion_done(struct completion *c) 2251 { 2252 int isdone; 2253 2254 sleepq_lock(c); 2255 isdone = (c->done != 0); 2256 sleepq_release(c); 2257 return (isdone); 2258 } 2259 2260 static void 2261 linux_cdev_deref(struct linux_cdev *ldev) 2262 { 2263 if (refcount_release(&ldev->refs) && 2264 ldev->kobj.ktype == &linux_cdev_ktype) 2265 kfree(ldev); 2266 } 2267 2268 static void 2269 linux_cdev_release(struct kobject *kobj) 2270 { 2271 struct linux_cdev *cdev; 2272 struct kobject *parent; 2273 2274 cdev = container_of(kobj, struct linux_cdev, kobj); 2275 parent = kobj->parent; 2276 linux_destroy_dev(cdev); 2277 linux_cdev_deref(cdev); 2278 kobject_put(parent); 2279 } 2280 2281 static void 2282 linux_cdev_static_release(struct kobject *kobj) 2283 { 2284 struct cdev *cdev; 2285 struct linux_cdev *ldev; 2286 2287 ldev = container_of(kobj, struct linux_cdev, kobj); 2288 cdev = ldev->cdev; 2289 if (cdev != NULL) { 2290 destroy_dev(cdev); 2291 ldev->cdev = NULL; 2292 } 2293 kobject_put(kobj->parent); 2294 } 2295 2296 int 2297 linux_cdev_device_add(struct linux_cdev *ldev, struct device *dev) 2298 { 2299 int ret; 2300 2301 if (dev->devt != 0) { 2302 /* Set parent kernel object. */ 2303 ldev->kobj.parent = &dev->kobj; 2304 2305 /* 2306 * Unlike Linux we require the kobject of the 2307 * character device structure to have a valid name 2308 * before calling this function: 2309 */ 2310 if (ldev->kobj.name == NULL) 2311 return (-EINVAL); 2312 2313 ret = cdev_add(ldev, dev->devt, 1); 2314 if (ret) 2315 return (ret); 2316 } 2317 ret = device_add(dev); 2318 if (ret != 0 && dev->devt != 0) 2319 cdev_del(ldev); 2320 return (ret); 2321 } 2322 2323 void 2324 linux_cdev_device_del(struct linux_cdev *ldev, struct device *dev) 2325 { 2326 device_del(dev); 2327 2328 if (dev->devt != 0) 2329 cdev_del(ldev); 2330 } 2331 2332 static void 2333 linux_destroy_dev(struct linux_cdev *ldev) 2334 { 2335 2336 if (ldev->cdev == NULL) 2337 return; 2338 2339 MPASS((ldev->siref & LDEV_SI_DTR) == 0); 2340 MPASS(ldev->kobj.ktype == &linux_cdev_ktype); 2341 2342 atomic_set_int(&ldev->siref, LDEV_SI_DTR); 2343 while ((atomic_load_int(&ldev->siref) & ~LDEV_SI_DTR) != 0) 2344 pause("ldevdtr", hz / 4); 2345 2346 destroy_dev(ldev->cdev); 2347 ldev->cdev = NULL; 2348 } 2349 2350 const struct kobj_type linux_cdev_ktype = { 2351 .release = linux_cdev_release, 2352 }; 2353 2354 const struct kobj_type linux_cdev_static_ktype = { 2355 .release = linux_cdev_static_release, 2356 }; 2357 2358 static void 2359 linux_handle_ifnet_link_event(void *arg, struct ifnet *ifp, int linkstate) 2360 { 2361 struct notifier_block *nb; 2362 struct netdev_notifier_info ni; 2363 2364 nb = arg; 2365 ni.ifp = ifp; 2366 ni.dev = (struct net_device *)ifp; 2367 if (linkstate == LINK_STATE_UP) 2368 nb->notifier_call(nb, NETDEV_UP, &ni); 2369 else 2370 nb->notifier_call(nb, NETDEV_DOWN, &ni); 2371 } 2372 2373 static void 2374 linux_handle_ifnet_arrival_event(void *arg, struct ifnet *ifp) 2375 { 2376 struct notifier_block *nb; 2377 struct netdev_notifier_info ni; 2378 2379 nb = arg; 2380 ni.ifp = ifp; 2381 ni.dev = (struct net_device *)ifp; 2382 nb->notifier_call(nb, NETDEV_REGISTER, &ni); 2383 } 2384 2385 static void 2386 linux_handle_ifnet_departure_event(void *arg, struct ifnet *ifp) 2387 { 2388 struct notifier_block *nb; 2389 struct netdev_notifier_info ni; 2390 2391 nb = arg; 2392 ni.ifp = ifp; 2393 ni.dev = (struct net_device *)ifp; 2394 nb->notifier_call(nb, NETDEV_UNREGISTER, &ni); 2395 } 2396 2397 static void 2398 linux_handle_iflladdr_event(void *arg, struct ifnet *ifp) 2399 { 2400 struct notifier_block *nb; 2401 struct netdev_notifier_info ni; 2402 2403 nb = arg; 2404 ni.ifp = ifp; 2405 ni.dev = (struct net_device *)ifp; 2406 nb->notifier_call(nb, NETDEV_CHANGEADDR, &ni); 2407 } 2408 2409 static void 2410 linux_handle_ifaddr_event(void *arg, struct ifnet *ifp) 2411 { 2412 struct notifier_block *nb; 2413 struct netdev_notifier_info ni; 2414 2415 nb = arg; 2416 ni.ifp = ifp; 2417 ni.dev = (struct net_device *)ifp; 2418 nb->notifier_call(nb, NETDEV_CHANGEIFADDR, &ni); 2419 } 2420 2421 int 2422 register_netdevice_notifier(struct notifier_block *nb) 2423 { 2424 2425 nb->tags[NETDEV_UP] = EVENTHANDLER_REGISTER( 2426 ifnet_link_event, linux_handle_ifnet_link_event, nb, 0); 2427 nb->tags[NETDEV_REGISTER] = EVENTHANDLER_REGISTER( 2428 ifnet_arrival_event, linux_handle_ifnet_arrival_event, nb, 0); 2429 nb->tags[NETDEV_UNREGISTER] = EVENTHANDLER_REGISTER( 2430 ifnet_departure_event, linux_handle_ifnet_departure_event, nb, 0); 2431 nb->tags[NETDEV_CHANGEADDR] = EVENTHANDLER_REGISTER( 2432 iflladdr_event, linux_handle_iflladdr_event, nb, 0); 2433 2434 return (0); 2435 } 2436 2437 int 2438 register_inetaddr_notifier(struct notifier_block *nb) 2439 { 2440 2441 nb->tags[NETDEV_CHANGEIFADDR] = EVENTHANDLER_REGISTER( 2442 ifaddr_event, linux_handle_ifaddr_event, nb, 0); 2443 return (0); 2444 } 2445 2446 int 2447 unregister_netdevice_notifier(struct notifier_block *nb) 2448 { 2449 2450 EVENTHANDLER_DEREGISTER(ifnet_link_event, 2451 nb->tags[NETDEV_UP]); 2452 EVENTHANDLER_DEREGISTER(ifnet_arrival_event, 2453 nb->tags[NETDEV_REGISTER]); 2454 EVENTHANDLER_DEREGISTER(ifnet_departure_event, 2455 nb->tags[NETDEV_UNREGISTER]); 2456 EVENTHANDLER_DEREGISTER(iflladdr_event, 2457 nb->tags[NETDEV_CHANGEADDR]); 2458 2459 return (0); 2460 } 2461 2462 int 2463 unregister_inetaddr_notifier(struct notifier_block *nb) 2464 { 2465 2466 EVENTHANDLER_DEREGISTER(ifaddr_event, 2467 nb->tags[NETDEV_CHANGEIFADDR]); 2468 2469 return (0); 2470 } 2471 2472 struct list_sort_thunk { 2473 int (*cmp)(void *, struct list_head *, struct list_head *); 2474 void *priv; 2475 }; 2476 2477 static inline int 2478 linux_le_cmp(void *priv, const void *d1, const void *d2) 2479 { 2480 struct list_head *le1, *le2; 2481 struct list_sort_thunk *thunk; 2482 2483 thunk = priv; 2484 le1 = *(__DECONST(struct list_head **, d1)); 2485 le2 = *(__DECONST(struct list_head **, d2)); 2486 return ((thunk->cmp)(thunk->priv, le1, le2)); 2487 } 2488 2489 void 2490 list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv, 2491 struct list_head *a, struct list_head *b)) 2492 { 2493 struct list_sort_thunk thunk; 2494 struct list_head **ar, *le; 2495 size_t count, i; 2496 2497 count = 0; 2498 list_for_each(le, head) 2499 count++; 2500 ar = malloc(sizeof(struct list_head *) * count, M_KMALLOC, M_WAITOK); 2501 i = 0; 2502 list_for_each(le, head) 2503 ar[i++] = le; 2504 thunk.cmp = cmp; 2505 thunk.priv = priv; 2506 qsort_r(ar, count, sizeof(struct list_head *), &thunk, linux_le_cmp); 2507 INIT_LIST_HEAD(head); 2508 for (i = 0; i < count; i++) 2509 list_add_tail(ar[i], head); 2510 free(ar, M_KMALLOC); 2511 } 2512 2513 #if defined(__i386__) || defined(__amd64__) 2514 int 2515 linux_wbinvd_on_all_cpus(void) 2516 { 2517 2518 pmap_invalidate_cache(); 2519 return (0); 2520 } 2521 #endif 2522 2523 int 2524 linux_on_each_cpu(void callback(void *), void *data) 2525 { 2526 2527 smp_rendezvous(smp_no_rendezvous_barrier, callback, 2528 smp_no_rendezvous_barrier, data); 2529 return (0); 2530 } 2531 2532 int 2533 linux_in_atomic(void) 2534 { 2535 2536 return ((curthread->td_pflags & TDP_NOFAULTING) != 0); 2537 } 2538 2539 struct linux_cdev * 2540 linux_find_cdev(const char *name, unsigned major, unsigned minor) 2541 { 2542 dev_t dev = MKDEV(major, minor); 2543 struct cdev *cdev; 2544 2545 dev_lock(); 2546 LIST_FOREACH(cdev, &linuxcdevsw.d_devs, si_list) { 2547 struct linux_cdev *ldev = cdev->si_drv1; 2548 if (ldev->dev == dev && 2549 strcmp(kobject_name(&ldev->kobj), name) == 0) { 2550 break; 2551 } 2552 } 2553 dev_unlock(); 2554 2555 return (cdev != NULL ? cdev->si_drv1 : NULL); 2556 } 2557 2558 int 2559 __register_chrdev(unsigned int major, unsigned int baseminor, 2560 unsigned int count, const char *name, 2561 const struct file_operations *fops) 2562 { 2563 struct linux_cdev *cdev; 2564 int ret = 0; 2565 int i; 2566 2567 for (i = baseminor; i < baseminor + count; i++) { 2568 cdev = cdev_alloc(); 2569 cdev->ops = fops; 2570 kobject_set_name(&cdev->kobj, name); 2571 2572 ret = cdev_add(cdev, makedev(major, i), 1); 2573 if (ret != 0) 2574 break; 2575 } 2576 return (ret); 2577 } 2578 2579 int 2580 __register_chrdev_p(unsigned int major, unsigned int baseminor, 2581 unsigned int count, const char *name, 2582 const struct file_operations *fops, uid_t uid, 2583 gid_t gid, int mode) 2584 { 2585 struct linux_cdev *cdev; 2586 int ret = 0; 2587 int i; 2588 2589 for (i = baseminor; i < baseminor + count; i++) { 2590 cdev = cdev_alloc(); 2591 cdev->ops = fops; 2592 kobject_set_name(&cdev->kobj, name); 2593 2594 ret = cdev_add_ext(cdev, makedev(major, i), uid, gid, mode); 2595 if (ret != 0) 2596 break; 2597 } 2598 return (ret); 2599 } 2600 2601 void 2602 __unregister_chrdev(unsigned int major, unsigned int baseminor, 2603 unsigned int count, const char *name) 2604 { 2605 struct linux_cdev *cdevp; 2606 int i; 2607 2608 for (i = baseminor; i < baseminor + count; i++) { 2609 cdevp = linux_find_cdev(name, major, i); 2610 if (cdevp != NULL) 2611 cdev_del(cdevp); 2612 } 2613 } 2614 2615 void 2616 linux_dump_stack(void) 2617 { 2618 #ifdef STACK 2619 struct stack st; 2620 2621 stack_zero(&st); 2622 stack_save(&st); 2623 stack_print(&st); 2624 #endif 2625 } 2626 2627 int 2628 linuxkpi_net_ratelimit(void) 2629 { 2630 2631 return (ppsratecheck(&lkpi_net_lastlog, &lkpi_net_curpps, 2632 lkpi_net_maxpps)); 2633 } 2634 2635 #if defined(__i386__) || defined(__amd64__) 2636 bool linux_cpu_has_clflush; 2637 #endif 2638 2639 static void 2640 linux_compat_init(void *arg) 2641 { 2642 struct sysctl_oid *rootoid; 2643 int i; 2644 2645 #if defined(__i386__) || defined(__amd64__) 2646 linux_cpu_has_clflush = (cpu_feature & CPUID_CLFSH); 2647 #endif 2648 rw_init(&linux_vma_lock, "lkpi-vma-lock"); 2649 2650 rootoid = SYSCTL_ADD_ROOT_NODE(NULL, 2651 OID_AUTO, "sys", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "sys"); 2652 kobject_init(&linux_class_root, &linux_class_ktype); 2653 kobject_set_name(&linux_class_root, "class"); 2654 linux_class_root.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid), 2655 OID_AUTO, "class", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "class"); 2656 kobject_init(&linux_root_device.kobj, &linux_dev_ktype); 2657 kobject_set_name(&linux_root_device.kobj, "device"); 2658 linux_root_device.kobj.oidp = SYSCTL_ADD_NODE(NULL, 2659 SYSCTL_CHILDREN(rootoid), OID_AUTO, "device", 2660 CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "device"); 2661 linux_root_device.bsddev = root_bus; 2662 linux_class_misc.name = "misc"; 2663 class_register(&linux_class_misc); 2664 INIT_LIST_HEAD(&pci_drivers); 2665 INIT_LIST_HEAD(&pci_devices); 2666 spin_lock_init(&pci_lock); 2667 mtx_init(&vmmaplock, "IO Map lock", NULL, MTX_DEF); 2668 for (i = 0; i < VMMAP_HASH_SIZE; i++) 2669 LIST_INIT(&vmmaphead[i]); 2670 init_waitqueue_head(&linux_bit_waitq); 2671 init_waitqueue_head(&linux_var_waitq); 2672 2673 CPU_COPY(&all_cpus, &cpu_online_mask); 2674 } 2675 SYSINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_init, NULL); 2676 2677 static void 2678 linux_compat_uninit(void *arg) 2679 { 2680 linux_kobject_kfree_name(&linux_class_root); 2681 linux_kobject_kfree_name(&linux_root_device.kobj); 2682 linux_kobject_kfree_name(&linux_class_misc.kobj); 2683 2684 mtx_destroy(&vmmaplock); 2685 spin_lock_destroy(&pci_lock); 2686 rw_destroy(&linux_vma_lock); 2687 } 2688 SYSUNINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_uninit, NULL); 2689 2690 /* 2691 * NOTE: Linux frequently uses "unsigned long" for pointer to integer 2692 * conversion and vice versa, where in FreeBSD "uintptr_t" would be 2693 * used. Assert these types have the same size, else some parts of the 2694 * LinuxKPI may not work like expected: 2695 */ 2696 CTASSERT(sizeof(unsigned long) == sizeof(uintptr_t)); 2697