1 /*- 2 * Copyright (c) 2016 Matthew Macy ([email protected]) 3 * Copyright (c) 2017-2021 Hans Petter Selasky ([email protected]) 4 * All rights reserved. 5 * Copyright (c) 2024 The FreeBSD Foundation 6 * 7 * Portions of this software were developed by Björn Zeeb 8 * under sponsorship from the FreeBSD Foundation. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice unmodified, this list of conditions, and the following 15 * disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 21 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 22 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 23 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 24 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 25 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 29 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 #include <sys/types.h> 34 #include <sys/systm.h> 35 #include <sys/malloc.h> 36 #include <sys/kernel.h> 37 #include <sys/lock.h> 38 #include <sys/mutex.h> 39 #include <sys/proc.h> 40 #include <sys/sched.h> 41 #include <sys/smp.h> 42 #include <sys/queue.h> 43 #include <sys/taskqueue.h> 44 #include <sys/kdb.h> 45 46 #include <ck_epoch.h> 47 48 #include <linux/rcupdate.h> 49 #include <linux/sched.h> 50 #include <linux/srcu.h> 51 #include <linux/slab.h> 52 #include <linux/kernel.h> 53 #include <linux/compat.h> 54 #include <linux/llist.h> 55 #include <linux/irq_work.h> 56 57 /* 58 * By defining CONFIG_NO_RCU_SKIP LinuxKPI RCU locks and asserts will 59 * not be skipped during panic(). 60 */ 61 #ifdef CONFIG_NO_RCU_SKIP 62 #define RCU_SKIP(void) 0 63 #else 64 #define RCU_SKIP(void) unlikely(SCHEDULER_STOPPED() || kdb_active) 65 #endif 66 67 struct callback_head { 68 union { 69 STAILQ_ENTRY(callback_head) entry; 70 struct llist_node node; 71 }; 72 rcu_callback_t func; 73 }; 74 75 struct linux_epoch_head { 76 struct llist_head cb_head; 77 struct task task; 78 } __aligned(CACHE_LINE_SIZE); 79 80 struct linux_epoch_record { 81 ck_epoch_record_t epoch_record; 82 TAILQ_HEAD(, task_struct) ts_head; 83 int cpuid; 84 int type; 85 } __aligned(CACHE_LINE_SIZE); 86 87 /* 88 * Verify that "struct rcu_head" is big enough to hold "struct 89 * callback_head". This has been done to avoid having to add special 90 * compile flags for including ck_epoch.h to all clients of the 91 * LinuxKPI. 92 */ 93 CTASSERT(sizeof(struct rcu_head) == sizeof(struct callback_head)); 94 95 /* 96 * Verify that "rcu_section[0]" has the same size as 97 * "ck_epoch_section_t". This has been done to avoid having to add 98 * special compile flags for including ck_epoch.h to all clients of 99 * the LinuxKPI. 100 */ 101 CTASSERT(sizeof(((struct task_struct *)0)->rcu_section[0] == 102 sizeof(ck_epoch_section_t))); 103 104 /* 105 * Verify that "epoch_record" is at beginning of "struct 106 * linux_epoch_record": 107 */ 108 CTASSERT(offsetof(struct linux_epoch_record, epoch_record) == 0); 109 110 CTASSERT(TS_RCU_TYPE_MAX == RCU_TYPE_MAX); 111 112 static ck_epoch_t linux_epoch[RCU_TYPE_MAX]; 113 static struct linux_epoch_head linux_epoch_head[RCU_TYPE_MAX]; 114 DPCPU_DEFINE_STATIC(struct linux_epoch_record, linux_epoch_record[RCU_TYPE_MAX]); 115 116 static void linux_rcu_cleaner_func(void *, int); 117 118 static void 119 linux_rcu_runtime_init(void *arg __unused) 120 { 121 struct linux_epoch_head *head; 122 int i; 123 int j; 124 125 for (j = 0; j != RCU_TYPE_MAX; j++) { 126 ck_epoch_init(&linux_epoch[j]); 127 128 head = &linux_epoch_head[j]; 129 130 TASK_INIT(&head->task, 0, linux_rcu_cleaner_func, head); 131 init_llist_head(&head->cb_head); 132 133 CPU_FOREACH(i) { 134 struct linux_epoch_record *record; 135 136 record = &DPCPU_ID_GET(i, linux_epoch_record[j]); 137 138 record->cpuid = i; 139 record->type = j; 140 ck_epoch_register(&linux_epoch[j], 141 &record->epoch_record, NULL); 142 TAILQ_INIT(&record->ts_head); 143 } 144 } 145 } 146 SYSINIT(linux_rcu_runtime, SI_SUB_CPU, SI_ORDER_ANY, linux_rcu_runtime_init, NULL); 147 148 static void 149 linux_rcu_cleaner_func(void *context, int pending __unused) 150 { 151 struct linux_epoch_head *head = context; 152 struct callback_head *rcu; 153 STAILQ_HEAD(, callback_head) tmp_head; 154 struct llist_node *node, *next; 155 uintptr_t offset; 156 157 /* move current callbacks into own queue */ 158 STAILQ_INIT(&tmp_head); 159 llist_for_each_safe(node, next, llist_del_all(&head->cb_head)) { 160 rcu = container_of(node, struct callback_head, node); 161 /* re-reverse list to restore chronological order */ 162 STAILQ_INSERT_HEAD(&tmp_head, rcu, entry); 163 } 164 165 /* synchronize */ 166 linux_synchronize_rcu(head - linux_epoch_head); 167 168 /* dispatch all callbacks, if any */ 169 while ((rcu = STAILQ_FIRST(&tmp_head)) != NULL) { 170 STAILQ_REMOVE_HEAD(&tmp_head, entry); 171 172 offset = (uintptr_t)rcu->func; 173 174 if (offset < LINUX_KFREE_RCU_OFFSET_MAX) 175 kfree((char *)rcu - offset); 176 else 177 rcu->func((struct rcu_head *)rcu); 178 } 179 } 180 181 void 182 linux_rcu_read_lock(unsigned type) 183 { 184 struct linux_epoch_record *record; 185 struct task_struct *ts; 186 187 MPASS(type < RCU_TYPE_MAX); 188 189 if (RCU_SKIP()) 190 return; 191 192 ts = current; 193 194 /* assert valid refcount */ 195 MPASS(ts->rcu_recurse[type] != INT_MAX); 196 197 if (++(ts->rcu_recurse[type]) != 1) 198 return; 199 200 /* 201 * Pin thread to current CPU so that the unlock code gets the 202 * same per-CPU epoch record: 203 */ 204 sched_pin(); 205 206 record = &DPCPU_GET(linux_epoch_record[type]); 207 208 /* 209 * Use a critical section to prevent recursion inside 210 * ck_epoch_begin(). Else this function supports recursion. 211 */ 212 critical_enter(); 213 ck_epoch_begin(&record->epoch_record, 214 (ck_epoch_section_t *)&ts->rcu_section[type]); 215 TAILQ_INSERT_TAIL(&record->ts_head, ts, rcu_entry[type]); 216 critical_exit(); 217 } 218 219 void 220 linux_rcu_read_unlock(unsigned type) 221 { 222 struct linux_epoch_record *record; 223 struct task_struct *ts; 224 225 MPASS(type < RCU_TYPE_MAX); 226 227 if (RCU_SKIP()) 228 return; 229 230 ts = current; 231 232 /* assert valid refcount */ 233 MPASS(ts->rcu_recurse[type] > 0); 234 235 if (--(ts->rcu_recurse[type]) != 0) 236 return; 237 238 record = &DPCPU_GET(linux_epoch_record[type]); 239 240 /* 241 * Use a critical section to prevent recursion inside 242 * ck_epoch_end(). Else this function supports recursion. 243 */ 244 critical_enter(); 245 ck_epoch_end(&record->epoch_record, 246 (ck_epoch_section_t *)&ts->rcu_section[type]); 247 TAILQ_REMOVE(&record->ts_head, ts, rcu_entry[type]); 248 critical_exit(); 249 250 sched_unpin(); 251 } 252 253 bool 254 linux_rcu_read_lock_held(unsigned type) 255 { 256 #ifdef INVARINATS 257 struct linux_epoch_record *record __diagused; 258 struct task_struct *ts; 259 260 MPASS(type < RCU_TYPE_MAX); 261 262 if (RCU_SKIP()) 263 return (false); 264 265 if (__current_unallocated(curthread)) 266 return (false); 267 268 ts = current; 269 if (ts->rcu_recurse[type] == 0) 270 return (false); 271 272 MPASS(curthread->td_pinned != 0); 273 MPASS((record = &DPCPU_GET(linux_epoch_record[type])) && 274 record->epoch_record.active != 0); 275 #endif 276 277 return (true); 278 } 279 280 static void 281 linux_synchronize_rcu_cb(ck_epoch_t *epoch __unused, ck_epoch_record_t *epoch_record, void *arg __unused) 282 { 283 struct linux_epoch_record *record = 284 container_of(epoch_record, struct linux_epoch_record, epoch_record); 285 struct thread *td = curthread; 286 struct task_struct *ts; 287 288 /* check if blocked on the current CPU */ 289 if (record->cpuid == PCPU_GET(cpuid)) { 290 bool is_sleeping = 0; 291 u_char prio = 0; 292 293 /* 294 * Find the lowest priority or sleeping thread which 295 * is blocking synchronization on this CPU core. All 296 * the threads in the queue are CPU-pinned and cannot 297 * go anywhere while the current thread is locked. 298 */ 299 TAILQ_FOREACH(ts, &record->ts_head, rcu_entry[record->type]) { 300 if (ts->task_thread->td_priority > prio) 301 prio = ts->task_thread->td_priority; 302 is_sleeping |= (ts->task_thread->td_inhibitors != 0); 303 } 304 305 if (is_sleeping) { 306 thread_unlock(td); 307 pause("W", 1); 308 thread_lock(td); 309 } else { 310 /* set new thread priority */ 311 sched_prio(td, prio); 312 /* task switch */ 313 mi_switch(SW_VOL | SWT_RELINQUISH); 314 /* 315 * It is important the thread lock is dropped 316 * while yielding to allow other threads to 317 * acquire the lock pointed to by 318 * TDQ_LOCKPTR(td). Currently mi_switch() will 319 * unlock the thread lock before 320 * returning. Else a deadlock like situation 321 * might happen. 322 */ 323 thread_lock(td); 324 } 325 } else { 326 /* 327 * To avoid spinning move execution to the other CPU 328 * which is blocking synchronization. Set highest 329 * thread priority so that code gets run. The thread 330 * priority will be restored later. 331 */ 332 sched_prio(td, 0); 333 sched_bind(td, record->cpuid); 334 } 335 } 336 337 void 338 linux_synchronize_rcu(unsigned type) 339 { 340 struct thread *td; 341 int was_bound; 342 int old_cpu; 343 int old_pinned; 344 u_char old_prio; 345 346 MPASS(type < RCU_TYPE_MAX); 347 348 if (RCU_SKIP()) 349 return; 350 351 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, 352 "linux_synchronize_rcu() can sleep"); 353 354 td = curthread; 355 DROP_GIANT(); 356 357 /* 358 * Synchronizing RCU might change the CPU core this function 359 * is running on. Save current values: 360 */ 361 thread_lock(td); 362 363 old_cpu = PCPU_GET(cpuid); 364 old_pinned = td->td_pinned; 365 old_prio = td->td_priority; 366 was_bound = sched_is_bound(td); 367 sched_unbind(td); 368 td->td_pinned = 0; 369 sched_bind(td, old_cpu); 370 371 ck_epoch_synchronize_wait(&linux_epoch[type], 372 &linux_synchronize_rcu_cb, NULL); 373 374 /* restore CPU binding, if any */ 375 if (was_bound != 0) { 376 sched_bind(td, old_cpu); 377 } else { 378 /* get thread back to initial CPU, if any */ 379 if (old_pinned != 0) 380 sched_bind(td, old_cpu); 381 sched_unbind(td); 382 } 383 /* restore pinned after bind */ 384 td->td_pinned = old_pinned; 385 386 /* restore thread priority */ 387 sched_prio(td, old_prio); 388 thread_unlock(td); 389 390 PICKUP_GIANT(); 391 } 392 393 void 394 linux_rcu_barrier(unsigned type) 395 { 396 struct linux_epoch_head *head; 397 398 MPASS(type < RCU_TYPE_MAX); 399 400 /* 401 * This function is not obligated to wait for a grace period. 402 * It only waits for RCU callbacks that have already been posted. 403 * If there are no RCU callbacks posted, rcu_barrier() can return 404 * immediately. 405 */ 406 head = &linux_epoch_head[type]; 407 408 /* wait for callbacks to complete */ 409 taskqueue_drain(linux_irq_work_tq, &head->task); 410 } 411 412 void 413 linux_call_rcu(unsigned type, struct rcu_head *context, rcu_callback_t func) 414 { 415 struct callback_head *rcu; 416 struct linux_epoch_head *head; 417 418 MPASS(type < RCU_TYPE_MAX); 419 420 rcu = (struct callback_head *)context; 421 head = &linux_epoch_head[type]; 422 423 rcu->func = func; 424 llist_add(&rcu->node, &head->cb_head); 425 taskqueue_enqueue(linux_irq_work_tq, &head->task); 426 } 427 428 int 429 init_srcu_struct(struct srcu_struct *srcu) 430 { 431 return (0); 432 } 433 434 void 435 cleanup_srcu_struct(struct srcu_struct *srcu) 436 { 437 } 438 439 int 440 srcu_read_lock(struct srcu_struct *srcu) 441 { 442 linux_rcu_read_lock(RCU_TYPE_SLEEPABLE); 443 return (0); 444 } 445 446 void 447 srcu_read_unlock(struct srcu_struct *srcu, int key __unused) 448 { 449 linux_rcu_read_unlock(RCU_TYPE_SLEEPABLE); 450 } 451 452 void 453 synchronize_srcu(struct srcu_struct *srcu) 454 { 455 linux_synchronize_rcu(RCU_TYPE_SLEEPABLE); 456 } 457 458 void 459 srcu_barrier(struct srcu_struct *srcu) 460 { 461 linux_rcu_barrier(RCU_TYPE_SLEEPABLE); 462 } 463