1 /*- 2 * Copyright (c) 2016 Matthew Macy ([email protected]) 3 * Copyright (c) 2017-2021 Hans Petter Selasky ([email protected]) 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice unmodified, this list of conditions, and the following 11 * disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #include <sys/cdefs.h> 29 #include <sys/types.h> 30 #include <sys/systm.h> 31 #include <sys/malloc.h> 32 #include <sys/kernel.h> 33 #include <sys/lock.h> 34 #include <sys/mutex.h> 35 #include <sys/proc.h> 36 #include <sys/sched.h> 37 #include <sys/smp.h> 38 #include <sys/queue.h> 39 #include <sys/taskqueue.h> 40 #include <sys/kdb.h> 41 42 #include <ck_epoch.h> 43 44 #include <linux/rcupdate.h> 45 #include <linux/sched.h> 46 #include <linux/srcu.h> 47 #include <linux/slab.h> 48 #include <linux/kernel.h> 49 #include <linux/compat.h> 50 #include <linux/llist.h> 51 #include <linux/irq_work.h> 52 53 /* 54 * By defining CONFIG_NO_RCU_SKIP LinuxKPI RCU locks and asserts will 55 * not be skipped during panic(). 56 */ 57 #ifdef CONFIG_NO_RCU_SKIP 58 #define RCU_SKIP(void) 0 59 #else 60 #define RCU_SKIP(void) unlikely(SCHEDULER_STOPPED() || kdb_active) 61 #endif 62 63 struct callback_head { 64 union { 65 STAILQ_ENTRY(callback_head) entry; 66 struct llist_node node; 67 }; 68 rcu_callback_t func; 69 }; 70 71 struct linux_epoch_head { 72 struct llist_head cb_head; 73 struct task task; 74 } __aligned(CACHE_LINE_SIZE); 75 76 struct linux_epoch_record { 77 ck_epoch_record_t epoch_record; 78 TAILQ_HEAD(, task_struct) ts_head; 79 int cpuid; 80 int type; 81 } __aligned(CACHE_LINE_SIZE); 82 83 /* 84 * Verify that "struct rcu_head" is big enough to hold "struct 85 * callback_head". This has been done to avoid having to add special 86 * compile flags for including ck_epoch.h to all clients of the 87 * LinuxKPI. 88 */ 89 CTASSERT(sizeof(struct rcu_head) == sizeof(struct callback_head)); 90 91 /* 92 * Verify that "rcu_section[0]" has the same size as 93 * "ck_epoch_section_t". This has been done to avoid having to add 94 * special compile flags for including ck_epoch.h to all clients of 95 * the LinuxKPI. 96 */ 97 CTASSERT(sizeof(((struct task_struct *)0)->rcu_section[0] == 98 sizeof(ck_epoch_section_t))); 99 100 /* 101 * Verify that "epoch_record" is at beginning of "struct 102 * linux_epoch_record": 103 */ 104 CTASSERT(offsetof(struct linux_epoch_record, epoch_record) == 0); 105 106 CTASSERT(TS_RCU_TYPE_MAX == RCU_TYPE_MAX); 107 108 static ck_epoch_t linux_epoch[RCU_TYPE_MAX]; 109 static struct linux_epoch_head linux_epoch_head[RCU_TYPE_MAX]; 110 DPCPU_DEFINE_STATIC(struct linux_epoch_record, linux_epoch_record[RCU_TYPE_MAX]); 111 112 static void linux_rcu_cleaner_func(void *, int); 113 114 static void 115 linux_rcu_runtime_init(void *arg __unused) 116 { 117 struct linux_epoch_head *head; 118 int i; 119 int j; 120 121 for (j = 0; j != RCU_TYPE_MAX; j++) { 122 ck_epoch_init(&linux_epoch[j]); 123 124 head = &linux_epoch_head[j]; 125 126 TASK_INIT(&head->task, 0, linux_rcu_cleaner_func, head); 127 init_llist_head(&head->cb_head); 128 129 CPU_FOREACH(i) { 130 struct linux_epoch_record *record; 131 132 record = &DPCPU_ID_GET(i, linux_epoch_record[j]); 133 134 record->cpuid = i; 135 record->type = j; 136 ck_epoch_register(&linux_epoch[j], 137 &record->epoch_record, NULL); 138 TAILQ_INIT(&record->ts_head); 139 } 140 } 141 } 142 SYSINIT(linux_rcu_runtime, SI_SUB_CPU, SI_ORDER_ANY, linux_rcu_runtime_init, NULL); 143 144 static void 145 linux_rcu_cleaner_func(void *context, int pending __unused) 146 { 147 struct linux_epoch_head *head = context; 148 struct callback_head *rcu; 149 STAILQ_HEAD(, callback_head) tmp_head; 150 struct llist_node *node, *next; 151 uintptr_t offset; 152 153 /* move current callbacks into own queue */ 154 STAILQ_INIT(&tmp_head); 155 llist_for_each_safe(node, next, llist_del_all(&head->cb_head)) { 156 rcu = container_of(node, struct callback_head, node); 157 /* re-reverse list to restore chronological order */ 158 STAILQ_INSERT_HEAD(&tmp_head, rcu, entry); 159 } 160 161 /* synchronize */ 162 linux_synchronize_rcu(head - linux_epoch_head); 163 164 /* dispatch all callbacks, if any */ 165 while ((rcu = STAILQ_FIRST(&tmp_head)) != NULL) { 166 STAILQ_REMOVE_HEAD(&tmp_head, entry); 167 168 offset = (uintptr_t)rcu->func; 169 170 if (offset < LINUX_KFREE_RCU_OFFSET_MAX) 171 kfree((char *)rcu - offset); 172 else 173 rcu->func((struct rcu_head *)rcu); 174 } 175 } 176 177 void 178 linux_rcu_read_lock(unsigned type) 179 { 180 struct linux_epoch_record *record; 181 struct task_struct *ts; 182 183 MPASS(type < RCU_TYPE_MAX); 184 185 if (RCU_SKIP()) 186 return; 187 188 ts = current; 189 190 /* assert valid refcount */ 191 MPASS(ts->rcu_recurse[type] != INT_MAX); 192 193 if (++(ts->rcu_recurse[type]) != 1) 194 return; 195 196 /* 197 * Pin thread to current CPU so that the unlock code gets the 198 * same per-CPU epoch record: 199 */ 200 sched_pin(); 201 202 record = &DPCPU_GET(linux_epoch_record[type]); 203 204 /* 205 * Use a critical section to prevent recursion inside 206 * ck_epoch_begin(). Else this function supports recursion. 207 */ 208 critical_enter(); 209 ck_epoch_begin(&record->epoch_record, 210 (ck_epoch_section_t *)&ts->rcu_section[type]); 211 TAILQ_INSERT_TAIL(&record->ts_head, ts, rcu_entry[type]); 212 critical_exit(); 213 } 214 215 void 216 linux_rcu_read_unlock(unsigned type) 217 { 218 struct linux_epoch_record *record; 219 struct task_struct *ts; 220 221 MPASS(type < RCU_TYPE_MAX); 222 223 if (RCU_SKIP()) 224 return; 225 226 ts = current; 227 228 /* assert valid refcount */ 229 MPASS(ts->rcu_recurse[type] > 0); 230 231 if (--(ts->rcu_recurse[type]) != 0) 232 return; 233 234 record = &DPCPU_GET(linux_epoch_record[type]); 235 236 /* 237 * Use a critical section to prevent recursion inside 238 * ck_epoch_end(). Else this function supports recursion. 239 */ 240 critical_enter(); 241 ck_epoch_end(&record->epoch_record, 242 (ck_epoch_section_t *)&ts->rcu_section[type]); 243 TAILQ_REMOVE(&record->ts_head, ts, rcu_entry[type]); 244 critical_exit(); 245 246 sched_unpin(); 247 } 248 249 static void 250 linux_synchronize_rcu_cb(ck_epoch_t *epoch __unused, ck_epoch_record_t *epoch_record, void *arg __unused) 251 { 252 struct linux_epoch_record *record = 253 container_of(epoch_record, struct linux_epoch_record, epoch_record); 254 struct thread *td = curthread; 255 struct task_struct *ts; 256 257 /* check if blocked on the current CPU */ 258 if (record->cpuid == PCPU_GET(cpuid)) { 259 bool is_sleeping = 0; 260 u_char prio = 0; 261 262 /* 263 * Find the lowest priority or sleeping thread which 264 * is blocking synchronization on this CPU core. All 265 * the threads in the queue are CPU-pinned and cannot 266 * go anywhere while the current thread is locked. 267 */ 268 TAILQ_FOREACH(ts, &record->ts_head, rcu_entry[record->type]) { 269 if (ts->task_thread->td_priority > prio) 270 prio = ts->task_thread->td_priority; 271 is_sleeping |= (ts->task_thread->td_inhibitors != 0); 272 } 273 274 if (is_sleeping) { 275 thread_unlock(td); 276 pause("W", 1); 277 thread_lock(td); 278 } else { 279 /* set new thread priority */ 280 sched_prio(td, prio); 281 /* task switch */ 282 mi_switch(SW_VOL | SWT_RELINQUISH); 283 /* 284 * It is important the thread lock is dropped 285 * while yielding to allow other threads to 286 * acquire the lock pointed to by 287 * TDQ_LOCKPTR(td). Currently mi_switch() will 288 * unlock the thread lock before 289 * returning. Else a deadlock like situation 290 * might happen. 291 */ 292 thread_lock(td); 293 } 294 } else { 295 /* 296 * To avoid spinning move execution to the other CPU 297 * which is blocking synchronization. Set highest 298 * thread priority so that code gets run. The thread 299 * priority will be restored later. 300 */ 301 sched_prio(td, 0); 302 sched_bind(td, record->cpuid); 303 } 304 } 305 306 void 307 linux_synchronize_rcu(unsigned type) 308 { 309 struct thread *td; 310 int was_bound; 311 int old_cpu; 312 int old_pinned; 313 u_char old_prio; 314 315 MPASS(type < RCU_TYPE_MAX); 316 317 if (RCU_SKIP()) 318 return; 319 320 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, 321 "linux_synchronize_rcu() can sleep"); 322 323 td = curthread; 324 DROP_GIANT(); 325 326 /* 327 * Synchronizing RCU might change the CPU core this function 328 * is running on. Save current values: 329 */ 330 thread_lock(td); 331 332 old_cpu = PCPU_GET(cpuid); 333 old_pinned = td->td_pinned; 334 old_prio = td->td_priority; 335 was_bound = sched_is_bound(td); 336 sched_unbind(td); 337 td->td_pinned = 0; 338 sched_bind(td, old_cpu); 339 340 ck_epoch_synchronize_wait(&linux_epoch[type], 341 &linux_synchronize_rcu_cb, NULL); 342 343 /* restore CPU binding, if any */ 344 if (was_bound != 0) { 345 sched_bind(td, old_cpu); 346 } else { 347 /* get thread back to initial CPU, if any */ 348 if (old_pinned != 0) 349 sched_bind(td, old_cpu); 350 sched_unbind(td); 351 } 352 /* restore pinned after bind */ 353 td->td_pinned = old_pinned; 354 355 /* restore thread priority */ 356 sched_prio(td, old_prio); 357 thread_unlock(td); 358 359 PICKUP_GIANT(); 360 } 361 362 void 363 linux_rcu_barrier(unsigned type) 364 { 365 struct linux_epoch_head *head; 366 367 MPASS(type < RCU_TYPE_MAX); 368 369 /* 370 * This function is not obligated to wait for a grace period. 371 * It only waits for RCU callbacks that have already been posted. 372 * If there are no RCU callbacks posted, rcu_barrier() can return 373 * immediately. 374 */ 375 head = &linux_epoch_head[type]; 376 377 /* wait for callbacks to complete */ 378 taskqueue_drain(linux_irq_work_tq, &head->task); 379 } 380 381 void 382 linux_call_rcu(unsigned type, struct rcu_head *context, rcu_callback_t func) 383 { 384 struct callback_head *rcu; 385 struct linux_epoch_head *head; 386 387 MPASS(type < RCU_TYPE_MAX); 388 389 rcu = (struct callback_head *)context; 390 head = &linux_epoch_head[type]; 391 392 rcu->func = func; 393 llist_add(&rcu->node, &head->cb_head); 394 taskqueue_enqueue(linux_irq_work_tq, &head->task); 395 } 396 397 int 398 init_srcu_struct(struct srcu_struct *srcu) 399 { 400 return (0); 401 } 402 403 void 404 cleanup_srcu_struct(struct srcu_struct *srcu) 405 { 406 } 407 408 int 409 srcu_read_lock(struct srcu_struct *srcu) 410 { 411 linux_rcu_read_lock(RCU_TYPE_SLEEPABLE); 412 return (0); 413 } 414 415 void 416 srcu_read_unlock(struct srcu_struct *srcu, int key __unused) 417 { 418 linux_rcu_read_unlock(RCU_TYPE_SLEEPABLE); 419 } 420 421 void 422 synchronize_srcu(struct srcu_struct *srcu) 423 { 424 linux_synchronize_rcu(RCU_TYPE_SLEEPABLE); 425 } 426 427 void 428 srcu_barrier(struct srcu_struct *srcu) 429 { 430 linux_rcu_barrier(RCU_TYPE_SLEEPABLE); 431 } 432