/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_MQ_H
#define BLK_MQ_H

#include <linux/blkdev.h>
#include <linux/sbitmap.h>
#include <linux/lockdep.h>
#include <linux/scatterlist.h>
#include <linux/prefetch.h>
#include <linux/srcu.h>
#include <linux/rw_hint.h>

struct blk_mq_tags;
struct blk_flush_queue;

#define BLKDEV_MIN_RQ		4
#define BLKDEV_DEFAULT_RQ	128

enum rq_end_io_ret {
	RQ_END_IO_NONE,
	RQ_END_IO_FREE,
};

typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t);

/*
 * request flags
 */
typedef __u32 __bitwise req_flags_t;

/* drive already may have started this one */
#define RQF_STARTED		((__force req_flags_t)(1 << 1))
/* request for flush sequence */
#define RQF_FLUSH_SEQ		((__force req_flags_t)(1 << 4))
/* merge of different types, fail separately */
#define RQF_MIXED_MERGE		((__force req_flags_t)(1 << 5))
/* don't call prep for this one */
#define RQF_DONTPREP		((__force req_flags_t)(1 << 7))
/* use hctx->sched_tags */
#define RQF_SCHED_TAGS		((__force req_flags_t)(1 << 8))
/* use an I/O scheduler for this request */
#define RQF_USE_SCHED		((__force req_flags_t)(1 << 9))
/* vaguely specified driver internal error.  Ignored by the block layer */
#define RQF_FAILED		((__force req_flags_t)(1 << 10))
/* don't warn about errors */
#define RQF_QUIET		((__force req_flags_t)(1 << 11))
/* account into disk and partition IO statistics */
#define RQF_IO_STAT		((__force req_flags_t)(1 << 13))
/* runtime pm request */
#define RQF_PM			((__force req_flags_t)(1 << 15))
/* on IO scheduler merge hash */
#define RQF_HASHED		((__force req_flags_t)(1 << 16))
/* track IO completion time */
#define RQF_STATS		((__force req_flags_t)(1 << 17))
/* Look at ->special_vec for the actual data payload instead of the
   bio chain. */
#define RQF_SPECIAL_PAYLOAD	((__force req_flags_t)(1 << 18))
/* The per-zone write lock is held for this request */
#define RQF_ZONE_WRITE_LOCKED	((__force req_flags_t)(1 << 19))
/* ->timeout has been called, don't expire again */
#define RQF_TIMED_OUT		((__force req_flags_t)(1 << 21))
#define RQF_RESV		((__force req_flags_t)(1 << 23))

/* flags that prevent us from merging requests: */
#define RQF_NOMERGE_FLAGS \
	(RQF_STARTED | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)

enum mq_rq_state {
	MQ_RQ_IDLE		= 0,
	MQ_RQ_IN_FLIGHT		= 1,
	MQ_RQ_COMPLETE		= 2,
};

/*
 * Try to put the fields that are referenced together in the same cacheline.
 *
 * If you modify this structure, make sure to update blk_rq_init() and
 * especially blk_mq_rq_ctx_init() to take care of the added fields.
 */
struct request {
	struct request_queue *q;
	struct blk_mq_ctx *mq_ctx;
	struct blk_mq_hw_ctx *mq_hctx;

	blk_opf_t cmd_flags;		/* op and common flags */
	req_flags_t rq_flags;

	int tag;
	int internal_tag;

	unsigned int timeout;

	/* the following two fields are internal, NEVER access directly */
	unsigned int __data_len;	/* total data len */
	sector_t __sector;		/* sector cursor */

	struct bio *bio;
	struct bio *biotail;

	union {
		struct list_head queuelist;
		struct request *rq_next;
	};

	struct block_device *part;
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
	/* Time that the first bio started allocating this request. */
	u64 alloc_time_ns;
#endif
	/* Time that this request was allocated for this IO. */
	u64 start_time_ns;
	/* Time that I/O was submitted to the device. */
	u64 io_start_time_ns;

#ifdef CONFIG_BLK_WBT
	unsigned short wbt_flags;
#endif
	/*
	 * rq sectors used for blk stats. It has the same value as
	 * blk_rq_sectors(rq), except that it is never zeroed by
	 * completion.
	 */
	unsigned short stats_sectors;

	/*
	 * Number of scatter-gather DMA addr+len pairs after
	 * physical address coalescing is performed.
	 */
	unsigned short nr_phys_segments;

#ifdef CONFIG_BLK_DEV_INTEGRITY
	unsigned short nr_integrity_segments;
#endif

#ifdef CONFIG_BLK_INLINE_ENCRYPTION
	struct bio_crypt_ctx *crypt_ctx;
	struct blk_crypto_keyslot *crypt_keyslot;
#endif

	enum rw_hint write_hint;
	unsigned short ioprio;

	enum mq_rq_state state;
	atomic_t ref;

	unsigned long deadline;

	/*
	 * The hash is used inside the scheduler, and killed once the
	 * request reaches the dispatch list. The ipi_list is only used
	 * to queue the request for softirq completion, which is long
	 * after the request has been unhashed (and even removed from
	 * the dispatch list).
	 */
	union {
		struct hlist_node hash;		/* merge hash */
		struct llist_node ipi_list;
	};

	/*
	 * The rb_node is only used inside the io scheduler, requests
	 * are pruned when moved to the dispatch queue. special_vec must
	 * only be used if RQF_SPECIAL_PAYLOAD is set, and those requests
	 * cannot be inserted into an IO scheduler.
	 */
	union {
		struct rb_node rb_node;		/* sort/lookup */
		struct bio_vec special_vec;
	};

	/*
	 * Three pointers are available for the IO schedulers, if they need
	 * more they have to dynamically allocate it.
	 */
	struct {
		struct io_cq *icq;
		void *priv[2];
	} elv;

	struct {
		unsigned int seq;
		rq_end_io_fn *saved_end_io;
	} flush;

	u64 fifo_time;

	/*
	 * completion callback.
	 */
	rq_end_io_fn *end_io;
	void *end_io_data;
};

static inline enum req_op req_op(const struct request *req)
{
	return req->cmd_flags & REQ_OP_MASK;
}

static inline bool blk_rq_is_passthrough(struct request *rq)
{
	return blk_op_is_passthrough(rq->cmd_flags);
}

static inline unsigned short req_get_ioprio(struct request *req)
{
	return req->ioprio;
}

#define rq_data_dir(rq)		(op_is_write(req_op(rq)) ? WRITE : READ)

#define rq_dma_dir(rq) \
	(op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)

#define rq_list_add(listptr, rq)	do {	\
	(rq)->rq_next = *(listptr);		\
	*(listptr) = rq;			\
} while (0)

#define rq_list_add_tail(lastpptr, rq)	do {	\
	(rq)->rq_next = NULL;			\
	**(lastpptr) = rq;			\
	*(lastpptr) = &rq->rq_next;		\
} while (0)

#define rq_list_pop(listptr)			\
({						\
	struct request *__req = NULL;		\
	if ((listptr) && *(listptr))	{	\
		__req = *(listptr);		\
		*(listptr) = __req->rq_next;	\
	}					\
	__req;					\
})

#define rq_list_peek(listptr)			\
({						\
	struct request *__req = NULL;		\
	if ((listptr) && *(listptr))		\
		__req = *(listptr);		\
	__req;					\
})

#define rq_list_for_each(listptr, pos)		\
	for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos))

#define rq_list_for_each_safe(listptr, pos, nxt)			\
	for (pos = rq_list_peek((listptr)), nxt = rq_list_next(pos);	\
		pos; pos = nxt, nxt = pos ? rq_list_next(pos) : NULL)

#define rq_list_next(rq)	(rq)->rq_next
#define rq_list_empty(list)	((list) == (struct request *) NULL)

/**
 * rq_list_move() - move a struct request from one list to another
 * @src: The source list @rq is currently in
 * @dst: The destination list that @rq will be appended to
 * @rq: The request to move
 * @prev: The request preceding @rq in @src (NULL if @rq is the head)
 */
static inline void rq_list_move(struct request **src, struct request **dst,
				struct request *rq, struct request *prev)
{
	if (prev)
		prev->rq_next = rq->rq_next;
	else
		*src = rq->rq_next;
	rq_list_add(dst, rq);
}
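
/*
 * Illustrative sketch (not part of this header): the rq_list helpers above
 * manage a singly linked list threaded through rq->rq_next, as used for
 * plugged requests and ->queue_rqs() batches. A hypothetical consumer that
 * drains such a list could look like this; mydrv_queue_one() is a made-up
 * driver function.
 *
 *	static void mydrv_queue_rqs(struct request **rqlist)
 *	{
 *		struct request *rq;
 *
 *		while ((rq = rq_list_pop(rqlist)))
 *			mydrv_queue_one(rq);
 *	}
 *
 * Requests that could not be issued can be collected on a local list with
 * rq_list_add() and handed back by reassigning *rqlist.
 */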

/**
 * enum blk_eh_timer_return - How the timeout handler should proceed
 * @BLK_EH_DONE: The block driver completed the command or will complete it at
 *	a later time.
 * @BLK_EH_RESET_TIMER: Reset the request timer and continue waiting for the
 *	request to complete.
 */
enum blk_eh_timer_return {
	BLK_EH_DONE,
	BLK_EH_RESET_TIMER,
};

#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */

/**
 * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
 * block device
 */
struct blk_mq_hw_ctx {
	struct {
		/** @lock: Protects the dispatch list. */
		spinlock_t lock;
		/**
		 * @dispatch: Used for requests that are ready to be
		 * dispatched to the hardware but for some reason (e.g. lack of
		 * resources) could not be sent to the hardware. As soon as the
		 * driver can send new requests, requests in this list will
		 * be sent first for a fairer dispatch.
		 */
		struct list_head dispatch;
		/**
		 * @state: BLK_MQ_S_* flags. Defines the state of the hw
		 * queue (active, scheduled to restart, stopped).
		 */
		unsigned long state;
	} ____cacheline_aligned_in_smp;

	/**
	 * @run_work: Used for scheduling a hardware queue run at a later time.
	 */
	struct delayed_work run_work;
	/** @cpumask: Map of available CPUs where this hctx can run. */
	cpumask_var_t cpumask;
	/**
	 * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU
	 * selection from @cpumask.
	 */
	int next_cpu;
	/**
	 * @next_cpu_batch: Counter of how many queue runs are left in the
	 * batch before changing to the next CPU.
	 */
	int next_cpu_batch;

	/** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */
	unsigned long flags;

	/**
	 * @sched_data: Pointer owned by the IO scheduler attached to a request
	 * queue. It's up to the IO scheduler how to use this pointer.
	 */
	void *sched_data;
	/**
	 * @queue: Pointer to the request queue that owns this hardware context.
	 */
	struct request_queue *queue;
	/** @fq: Queue of requests that need to perform a flush operation. */
	struct blk_flush_queue *fq;

	/**
	 * @driver_data: Pointer to data owned by the block driver that created
	 * this hctx.
	 */
	void *driver_data;

	/**
	 * @ctx_map: Bitmap for each software queue. If bit is on, there is a
	 * pending request in that software queue.
	 */
	struct sbitmap ctx_map;

	/**
	 * @dispatch_from: Software queue to be used when no scheduler was
	 * selected.
	 */
	struct blk_mq_ctx *dispatch_from;
	/**
	 * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to
	 * decide if the hw queue is busy using the Exponential Weighted
	 * Moving Average algorithm.
	 */
	unsigned int dispatch_busy;

	/** @type: HCTX_TYPE_* flags. Type of hardware queue. */
	unsigned short type;
	/** @nr_ctx: Number of software queues. */
	unsigned short nr_ctx;
	/** @ctxs: Array of software queues. */
	struct blk_mq_ctx **ctxs;

	/** @dispatch_wait_lock: Lock for dispatch_wait queue. */
	spinlock_t dispatch_wait_lock;
	/**
	 * @dispatch_wait: Waitqueue to put requests when there is no tag
	 * available at the moment, to wait for another try in the future.
	 */
	wait_queue_entry_t dispatch_wait;

	/**
	 * @wait_index: Index of next available dispatch_wait queue to insert
	 * requests.
	 */
	atomic_t wait_index;

	/**
	 * @tags: Tags owned by the block driver. A tag in this set is only
	 * assigned when a request is dispatched from a hardware queue.
	 */
	struct blk_mq_tags *tags;
	/**
	 * @sched_tags: Tags owned by the I/O scheduler. If there is an I/O
	 * scheduler associated with a request queue, a tag is assigned when
	 * that request is allocated. Else, this member is not used.
	 */
	struct blk_mq_tags *sched_tags;

	/** @numa_node: NUMA node the storage adapter has been connected to. */
	unsigned int numa_node;
	/** @queue_num: Index of this hardware queue. */
	unsigned int queue_num;

	/**
	 * @nr_active: Number of active requests. Only used when a tag set is
	 * shared across request queues.
	 */
	atomic_t nr_active;

	/** @cpuhp_online: List to store requests if a CPU is going to die. */
	struct hlist_node cpuhp_online;
	/** @cpuhp_dead: List to store requests if some CPU dies. */
	struct hlist_node cpuhp_dead;
	/** @kobj: Kernel object for sysfs. */
	struct kobject kobj;

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @debugfs_dir: debugfs directory for this hardware queue. Named
	 * as cpu<cpu_number>.
	 */
	struct dentry *debugfs_dir;
	/** @sched_debugfs_dir: debugfs directory for the scheduler. */
	struct dentry *sched_debugfs_dir;
#endif

	/**
	 * @hctx_list: If this hctx is not in use, this is an entry in
	 * q->unused_hctx_list.
	 */
	struct list_head hctx_list;
};

/**
 * struct blk_mq_queue_map - Map software queues to hardware queues
 * @mq_map: CPU ID to hardware queue index map. This is an array
 *	with nr_cpu_ids elements. Each element has a value in the range
 *	[@queue_offset, @queue_offset + @nr_queues).
 * @nr_queues: Number of hardware queues to map CPU IDs onto.
 * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe
 *	driver to map each hardware queue type (enum hctx_type) onto a distinct
 *	set of hardware queues.
 */
struct blk_mq_queue_map {
	unsigned int *mq_map;
	unsigned int nr_queues;
	unsigned int queue_offset;
};

/**
 * enum hctx_type - Type of hardware queue
 * @HCTX_TYPE_DEFAULT:	All I/O not otherwise accounted for.
 * @HCTX_TYPE_READ:	Just for READ I/O.
 * @HCTX_TYPE_POLL:	Polled I/O of any kind.
 * @HCTX_MAX_TYPES:	Number of types of hctx.
 */
enum hctx_type {
	HCTX_TYPE_DEFAULT,
	HCTX_TYPE_READ,
	HCTX_TYPE_POLL,

	HCTX_MAX_TYPES,
};
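
/*
 * Illustrative sketch (not part of this header): a driver that only provides
 * default queues can let the core spread CPUs over its hardware queues by
 * calling blk_mq_map_queues(), declared further down, from its ->map_queues()
 * callback. mydrv_map_queues() is a made-up example name.
 *
 *	static void mydrv_map_queues(struct blk_mq_tag_set *set)
 *	{
 *		blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
 *	}
 *
 * Drivers that leave ->map_queues unset get this default spreading anyway;
 * the callback matters once several hctx_type maps or driver-specific CPU
 * affinity come into play.
 */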

/**
 * struct blk_mq_tag_set - tag set that can be shared between request queues
 * @ops:	   Pointers to functions that implement block driver behavior.
 * @map:	   One or more ctx -> hctx mappings. One map exists for each
 *		   hardware queue type (enum hctx_type) that the driver wishes
 *		   to support. There are no restrictions on maps being of the
 *		   same size, and it's perfectly legal to share maps between
 *		   types.
 * @nr_maps:	   Number of elements in the @map array. A number in the range
 *		   [1, HCTX_MAX_TYPES].
 * @nr_hw_queues:  Number of hardware queues supported by the block driver that
 *		   owns this data structure.
 * @queue_depth:   Number of tags per hardware queue, reserved tags included.
 * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag
 *		   allocations.
 * @cmd_size:	   Number of additional bytes to allocate per request. The block
 *		   driver owns these additional bytes.
 * @numa_node:	   NUMA node the storage adapter has been connected to.
 * @timeout:	   Request processing timeout in jiffies.
 * @flags:	   Zero or more BLK_MQ_F_* flags.
 * @driver_data:   Pointer to data owned by the block driver that created this
 *		   tag set.
 * @tags:	   Tag sets. One tag set per hardware queue. Has @nr_hw_queues
 *		   elements.
 * @shared_tags:
 *		   Shared set of tags. Has @nr_hw_queues elements. If set,
 *		   shared by all @tags.
 * @tag_list_lock: Serializes tag_list accesses.
 * @tag_list:	   List of the request queues that use this tag set. See also
 *		   request_queue.tag_set_list.
 * @srcu:	   Used as lock when the type of the request queue is blocking
 *		   (BLK_MQ_F_BLOCKING).
 */
struct blk_mq_tag_set {
	const struct blk_mq_ops *ops;
	struct blk_mq_queue_map map[HCTX_MAX_TYPES];
	unsigned int nr_maps;
	unsigned int nr_hw_queues;
	unsigned int queue_depth;
	unsigned int reserved_tags;
	unsigned int cmd_size;
	int numa_node;
	unsigned int timeout;
	unsigned int flags;
	void *driver_data;

	struct blk_mq_tags **tags;

	struct blk_mq_tags *shared_tags;

	struct mutex tag_list_lock;
	struct list_head tag_list;
	struct srcu_struct *srcu;
};

/**
 * struct blk_mq_queue_data - Data about a request inserted in a queue
 *
 * @rq:   Request pointer.
 * @last: If it is the last request in the queue.
 */
struct blk_mq_queue_data {
	struct request *rq;
	bool last;
};

typedef bool (busy_tag_iter_fn)(struct request *, void *);

/**
 * struct blk_mq_ops - Callback functions that implement block driver
 * behaviour.
 */
struct blk_mq_ops {
	/**
	 * @queue_rq: Queue a new request from block IO.
	 */
	blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *,
				 const struct blk_mq_queue_data *);

	/**
	 * @commit_rqs: If a driver uses bd->last to judge when to submit
	 * requests to hardware, it must define this function. In case of errors
	 * that make us stop issuing further requests, this hook serves the
	 * purpose of kicking the hardware (which the last request otherwise
	 * would have done).
	 */
	void (*commit_rqs)(struct blk_mq_hw_ctx *);

	/**
	 * @queue_rqs: Queue a list of new requests. The driver is guaranteed
	 * that each request belongs to the same queue. If the driver doesn't
	 * empty the @rqlist completely, then the rest will be queued
	 * individually by the block layer upon return.
	 */
	void (*queue_rqs)(struct request **rqlist);

	/**
	 * @get_budget: Reserve a budget before queueing a request. Once
	 * .queue_rq is run, it is the driver's responsibility to release the
	 * reserved budget. The failure case of .get_budget also has to be
	 * handled to avoid I/O deadlock.
	 */
	int (*get_budget)(struct request_queue *);

	/**
	 * @put_budget: Release the reserved budget.
	 */
	void (*put_budget)(struct request_queue *, int);

	/**
	 * @set_rq_budget_token: Store the request's budget token.
	 */
	void (*set_rq_budget_token)(struct request *, int);
	/**
	 * @get_rq_budget_token: Retrieve the request's budget token.
	 */
	int (*get_rq_budget_token)(struct request *);

	/**
	 * @timeout: Called on request timeout.
	 */
	enum blk_eh_timer_return (*timeout)(struct request *);

	/**
	 * @poll: Called to poll for completion of a specific tag.
	 */
	int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *);

	/**
	 * @complete: Mark the request as complete.
	 */
	void (*complete)(struct request *);

	/**
	 * @init_hctx: Called when the block layer side of a hardware queue has
	 * been set up, allowing the driver to allocate/init matching
	 * structures.
	 */
	int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int);
	/**
	 * @exit_hctx: Ditto for exit/teardown.
	 */
	void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);

	/**
	 * @init_request: Called for every command allocated by the block layer
	 * to allow the driver to set up driver specific data.
	 *
	 * Tags greater than or equal to queue_depth are used for setting up
	 * flush requests.
	 */
	int (*init_request)(struct blk_mq_tag_set *set, struct request *,
			    unsigned int, unsigned int);
	/**
	 * @exit_request: Ditto for exit/teardown.
	 */
	void (*exit_request)(struct blk_mq_tag_set *set, struct request *,
			     unsigned int);

	/**
	 * @cleanup_rq: Called before freeing one request which isn't completed
	 * yet, and usually for freeing the driver private data.
	 */
	void (*cleanup_rq)(struct request *);

	/**
	 * @busy: If set, returns whether or not this queue currently is busy.
	 */
	bool (*busy)(struct request_queue *);

	/**
	 * @map_queues: This allows drivers to specify their own queue mapping
	 * by overriding the setup-time function that builds the mq_map.
	 */
	void (*map_queues)(struct blk_mq_tag_set *set);

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @show_rq: Used by the debugfs implementation to show driver-specific
	 * information about a request.
	 */
	void (*show_rq)(struct seq_file *m, struct request *rq);
#endif
};
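
/*
 * Illustrative sketch (not part of this header): the smallest useful
 * blk_mq_ops only needs ->queue_rq(). The names below (mydrv_queue_rq,
 * mydrv_submit) are made up; a real driver also reports busy/resource
 * conditions by returning BLK_STS_RESOURCE or BLK_STS_DEV_RESOURCE.
 *
 *	static blk_status_t mydrv_queue_rq(struct blk_mq_hw_ctx *hctx,
 *					   const struct blk_mq_queue_data *bd)
 *	{
 *		struct request *rq = bd->rq;
 *
 *		blk_mq_start_request(rq);
 *		if (!mydrv_submit(hctx->driver_data, rq))
 *			return BLK_STS_RESOURCE;
 *		return BLK_STS_OK;
 *	}
 *
 *	static const struct blk_mq_ops mydrv_mq_ops = {
 *		.queue_rq	= mydrv_queue_rq,
 *	};
 */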

enum {
	BLK_MQ_F_SHOULD_MERGE	= 1 << 0,
	BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1,
	/*
	 * Set when this device requires underlying blk-mq device for
	 * completing IO:
	 */
	BLK_MQ_F_STACKING	= 1 << 2,
	BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
	BLK_MQ_F_BLOCKING	= 1 << 5,
	/* Do not allow an I/O scheduler to be configured. */
	BLK_MQ_F_NO_SCHED	= 1 << 6,
	/*
	 * Select 'none' during queue registration in case of a single hwq
	 * or shared hwqs instead of 'mq-deadline'.
	 */
	BLK_MQ_F_NO_SCHED_BY_DEFAULT	= 1 << 7,
	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
	BLK_MQ_F_ALLOC_POLICY_BITS = 1,

	BLK_MQ_S_STOPPED	= 0,
	BLK_MQ_S_TAG_ACTIVE	= 1,
	BLK_MQ_S_SCHED_RESTART	= 2,

	/* hw queue is inactive after all its CPUs become offline */
	BLK_MQ_S_INACTIVE	= 3,

	BLK_MQ_MAX_DEPTH	= 10240,

	BLK_MQ_CPU_WORK_BATCH	= 8,
};
#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \
	((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \
		((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1))
#define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \
	((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
		<< BLK_MQ_F_ALLOC_POLICY_START_BIT)

#define BLK_MQ_NO_HCTX_IDX	(-1U)

struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
		struct queue_limits *lim, void *queuedata,
		struct lock_class_key *lkclass);
#define blk_mq_alloc_disk(set, lim, queuedata)			\
({								\
	static struct lock_class_key __key;			\
								\
	__blk_mq_alloc_disk(set, lim, queuedata, &__key);	\
})
struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
		struct lock_class_key *lkclass);
struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
		struct queue_limits *lim, void *queuedata);
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
		struct request_queue *q);
void blk_mq_destroy_queue(struct request_queue *);

int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
		const struct blk_mq_ops *ops, unsigned int queue_depth,
		unsigned int set_flags);
void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
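
/*
 * Illustrative sketch (not part of this header): typical bring-up order for a
 * blk-mq driver. Field values and the mydrv_* names are made up; error
 * handling is trimmed for brevity.
 *
 *	struct blk_mq_tag_set *set = &mydrv->tag_set;
 *	struct gendisk *disk;
 *
 *	set->ops = &mydrv_mq_ops;
 *	set->nr_hw_queues = 1;
 *	set->queue_depth = 64;
 *	set->numa_node = NUMA_NO_NODE;
 *	set->cmd_size = sizeof(struct mydrv_cmd);
 *	if (blk_mq_alloc_tag_set(set))
 *		goto err;
 *
 *	disk = blk_mq_alloc_disk(set, NULL, mydrv);
 *	if (IS_ERR(disk))
 *		goto err_free_tag_set;
 *
 * Teardown goes the other way: del_gendisk()/put_disk() followed by
 * blk_mq_free_tag_set().
 */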

void blk_mq_free_request(struct request *rq);
int blk_rq_poll(struct request *rq, struct io_comp_batch *iob,
		unsigned int poll_flags);

bool blk_mq_queue_inflight(struct request_queue *q);

enum {
	/* return when out of requests */
	BLK_MQ_REQ_NOWAIT	= (__force blk_mq_req_flags_t)(1 << 0),
	/* allocate from reserved pool */
	BLK_MQ_REQ_RESERVED	= (__force blk_mq_req_flags_t)(1 << 1),
	/* set RQF_PM */
	BLK_MQ_REQ_PM		= (__force blk_mq_req_flags_t)(1 << 2),
};

struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
		blk_mq_req_flags_t flags);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
		blk_opf_t opf, blk_mq_req_flags_t flags,
		unsigned int hctx_idx);
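
/*
 * Illustrative sketch (not part of this header): allocating a passthrough
 * request, running it synchronously and freeing it again. The opf value and
 * the way the command payload is filled in (here via the request PDU, see
 * blk_mq_rq_to_pdu() below) are driver specific and made up.
 *
 *	struct request *rq;
 *	blk_status_t status;
 *
 *	rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
 *	if (IS_ERR(rq))
 *		return PTR_ERR(rq);
 *	mydrv_fill_cmd(blk_mq_rq_to_pdu(rq));
 *	status = blk_execute_rq(rq, false);
 *	blk_mq_free_request(rq);
 *
 * blk_execute_rq() and blk_execute_rq_nowait() are declared further down in
 * this header.
 */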

/*
 * Tag address space map.
 */
struct blk_mq_tags {
	unsigned int nr_tags;
	unsigned int nr_reserved_tags;
	unsigned int active_queues;

	struct sbitmap_queue bitmap_tags;
	struct sbitmap_queue breserved_tags;

	struct request **rqs;
	struct request **static_rqs;
	struct list_head page_list;

	/*
	 * used to clear request reference in rqs[] before freeing one
	 * request pool
	 */
	spinlock_t lock;
};

static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags,
					       unsigned int tag)
{
	if (tag < tags->nr_tags) {
		prefetch(tags->rqs[tag]);
		return tags->rqs[tag];
	}

	return NULL;
}

enum {
	BLK_MQ_UNIQUE_TAG_BITS = 16,
	BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1,
};

u32 blk_mq_unique_tag(struct request *rq);

static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag)
{
	return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS;
}

static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
{
	return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
}

/**
 * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
 * @rq: target request.
 */
static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
{
	return READ_ONCE(rq->state);
}

static inline int blk_mq_request_started(struct request *rq)
{
	return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
}

static inline int blk_mq_request_completed(struct request *rq)
{
	return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
}

/*
 * Set the state to complete when completing a request from inside ->queue_rq.
 * This is used by drivers that want to ensure special complete actions that
 * need access to the request are called on failure, e.g. by nvme for
 * multipathing.
 */
static inline void blk_mq_set_request_complete(struct request *rq)
{
	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
}

/*
 * Complete the request directly instead of deferring it to softirq or
 * completing it on another CPU. Useful in preemptible context instead of
 * interrupt context.
 */
static inline void blk_mq_complete_request_direct(struct request *rq,
		void (*complete)(struct request *rq))
{
	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
	complete(rq);
}

void blk_mq_start_request(struct request *rq);
void blk_mq_end_request(struct request *rq, blk_status_t error);
void __blk_mq_end_request(struct request *rq, blk_status_t error);
void blk_mq_end_request_batch(struct io_comp_batch *ib);

/*
 * Only need start/end time stamping if we have iostat or
 * blk stats enabled, or using an IO scheduler.
 */
static inline bool blk_mq_need_time_stamp(struct request *rq)
{
	/*
	 * passthrough io doesn't use iostat accounting, cgroup stats
	 * and io scheduler functionalities.
	 */
	if (blk_rq_is_passthrough(rq))
		return false;
	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_USE_SCHED));
}

static inline bool blk_mq_is_reserved_rq(struct request *rq)
{
	return rq->rq_flags & RQF_RESV;
}

/*
 * Batched completions only work when there is no I/O error and no special
 * ->end_io handler.
 */
static inline bool blk_mq_add_to_batch(struct request *req,
				       struct io_comp_batch *iob, int ioerror,
				       void (*complete)(struct io_comp_batch *))
{
	/*
	 * blk_mq_end_request_batch() can't end requests allocated from
	 * sched tags.
	 */
	if (!iob || (req->rq_flags & RQF_SCHED_TAGS) || ioerror ||
			(req->end_io && !blk_rq_is_passthrough(req)))
		return false;

	if (!iob->complete)
		iob->complete = complete;
	else if (iob->complete != complete)
		return false;
	iob->need_ts |= blk_mq_need_time_stamp(req);
	rq_list_add(&iob->req_list, req);
	return true;
}
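
/*
 * Illustrative sketch (not part of this header): a completion path (for
 * example an interrupt handler or a ->poll() implementation) can batch
 * completions through a struct io_comp_batch instead of ending each request
 * individually. The mydrv_* names are made up.
 *
 *	DEFINE_IO_COMP_BATCH(iob);
 *	struct request *rq;
 *
 *	while ((rq = mydrv_next_completed(hctx)))
 *		if (!blk_mq_add_to_batch(rq, &iob, 0,
 *					 blk_mq_end_request_batch))
 *			blk_mq_end_request(rq, BLK_STS_OK);
 *
 *	if (!rq_list_empty(iob.req_list))
 *		blk_mq_end_request_batch(&iob);
 */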

void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
void blk_mq_complete_request(struct request *rq);
bool blk_mq_complete_request_remote(struct request *rq);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_stop_hw_queues(struct request_queue *q);
void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_quiesce_queue(struct request_queue *q);
void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set);
void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set);
void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set);
void blk_mq_unquiesce_queue(struct request_queue *q);
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs);
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
		busy_tag_iter_fn *fn, void *priv);
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset);
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_unfreeze_queue(struct request_queue *q);
void blk_freeze_queue_start(struct request_queue *q);
void blk_mq_freeze_queue_wait(struct request_queue *q);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
		unsigned long timeout);

void blk_mq_map_queues(struct blk_mq_queue_map *qmap);
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);

void blk_mq_quiesce_queue_nowait(struct request_queue *q);

unsigned int blk_mq_rq_cpu(struct request *rq);

bool __blk_should_fake_timeout(struct request_queue *q);
static inline bool blk_should_fake_timeout(struct request_queue *q)
{
	if (IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT) &&
	    test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
		return __blk_should_fake_timeout(q);
	return false;
}

/**
 * blk_mq_rq_from_pdu - cast a PDU to a request
 * @pdu: the PDU (Protocol Data Unit) to be cast
 *
 * Return: request
 *
 * Driver command data is immediately after the request. So subtract request
 * size to get back to the original request.
 */
static inline struct request *blk_mq_rq_from_pdu(void *pdu)
{
	return pdu - sizeof(struct request);
}

/**
 * blk_mq_rq_to_pdu - cast a request to a PDU
 * @rq: the request to be cast
 *
 * Return: pointer to the PDU
 *
 * Driver command data is immediately after the request. So add request to get
 * the PDU.
 */
static inline void *blk_mq_rq_to_pdu(struct request *rq)
{
	return rq + 1;
}
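
/*
 * Illustrative sketch (not part of this header): with @cmd_size set in the
 * tag set, every request is followed by that many bytes of per-command driver
 * data, and the two helpers above convert between the two views. struct
 * mydrv_cmd is a made-up example.
 *
 *	struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);
 *
 *	cmd->status = 0;
 *
 * and later, for example in the completion handler, back to the request:
 *
 *	struct request *rq = blk_mq_rq_from_pdu(cmd);
 */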

#define queue_for_each_hw_ctx(q, hctx, i)			\
	xa_for_each(&(q)->hctx_table, (i), (hctx))

#define hctx_for_each_ctx(hctx, ctx, i)				\
	for ((i) = 0; (i) < (hctx)->nr_ctx &&			\
	     ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)

static inline void blk_mq_cleanup_rq(struct request *rq)
{
	if (rq->q->mq_ops->cleanup_rq)
		rq->q->mq_ops->cleanup_rq(rq);
}

static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
		unsigned int nr_segs)
{
	rq->nr_phys_segments = nr_segs;
	rq->__data_len = bio->bi_iter.bi_size;
	rq->bio = rq->biotail = bio;
	rq->ioprio = bio_prio(bio);
}

void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
		struct lock_class_key *key);

static inline bool rq_is_sync(struct request *rq)
{
	return op_is_sync(rq->cmd_flags);
}

void blk_rq_init(struct request_queue *q, struct request *rq);
int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
		struct bio_set *bs, gfp_t gfp_mask,
		int (*bio_ctr)(struct bio *, struct bio *, void *), void *data);
void blk_rq_unprep_clone(struct request *rq);
blk_status_t blk_insert_cloned_request(struct request *rq);

struct rq_map_data {
	struct page **pages;
	unsigned long offset;
	unsigned short page_order;
	unsigned short nr_entries;
	bool null_mapped;
	bool from_user;
};

int blk_rq_map_user(struct request_queue *, struct request *,
		struct rq_map_data *, void __user *, unsigned long, gfp_t);
int blk_rq_map_user_io(struct request *, struct rq_map_data *,
		void __user *, unsigned long, gfp_t, bool, int, bool, int);
int blk_rq_map_user_iov(struct request_queue *, struct request *,
		struct rq_map_data *, const struct iov_iter *, gfp_t);
int blk_rq_unmap_user(struct bio *);
int blk_rq_map_kern(struct request_queue *, struct request *, void *,
		unsigned int, gfp_t);
int blk_rq_append_bio(struct request *rq, struct bio *bio);
void blk_execute_rq_nowait(struct request *rq, bool at_head);
blk_status_t blk_execute_rq(struct request *rq, bool at_head);
bool blk_rq_is_poll(struct request *rq);

struct req_iterator {
	struct bvec_iter iter;
	struct bio *bio;
};

#define __rq_for_each_bio(_bio, rq)	\
	if ((rq->bio))			\
		for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)

#define rq_for_each_segment(bvl, _rq, _iter)			\
	__rq_for_each_bio(_iter.bio, _rq)			\
		bio_for_each_segment(bvl, _iter.bio, _iter.iter)

#define rq_for_each_bvec(bvl, _rq, _iter)			\
	__rq_for_each_bio(_iter.bio, _rq)			\
		bio_for_each_bvec(bvl, _iter.bio, _iter.iter)

#define rq_iter_last(bvec, _iter)				\
		(_iter.bio->bi_next == NULL &&			\
		 bio_iter_last(bvec, _iter.iter))
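
/*
 * Illustrative sketch (not part of this header): walking all data segments of
 * a request with the iterators above, for example in a simple PIO-style
 * driver. mydrv_copy_bvec() is a made-up helper.
 *
 *	struct req_iterator iter;
 *	struct bio_vec bvec;
 *
 *	rq_for_each_segment(bvec, rq, iter)
 *		mydrv_copy_bvec(mydrv, &bvec);
 *
 * rq_for_each_bvec() walks multi-page bvecs instead of single-page segments,
 * and rq_iter_last() tells whether the current segment is the final one.
 */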

/*
 * blk_rq_pos()			: the current sector
 * blk_rq_bytes()		: bytes left in the entire request
 * blk_rq_cur_bytes()		: bytes left in the current segment
 * blk_rq_sectors()		: sectors left in the entire request
 * blk_rq_cur_sectors()		: sectors left in the current segment
 * blk_rq_stats_sectors()	: sectors of the entire request used for stats
 */
static inline sector_t blk_rq_pos(const struct request *rq)
{
	return rq->__sector;
}

static inline unsigned int blk_rq_bytes(const struct request *rq)
{
	return rq->__data_len;
}

static inline int blk_rq_cur_bytes(const struct request *rq)
{
	if (!rq->bio)
		return 0;
	if (!bio_has_data(rq->bio))	/* dataless requests such as discard */
		return rq->bio->bi_iter.bi_size;
	return bio_iovec(rq->bio).bv_len;
}

static inline unsigned int blk_rq_sectors(const struct request *rq)
{
	return blk_rq_bytes(rq) >> SECTOR_SHIFT;
}

static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
{
	return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
}

static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
{
	return rq->stats_sectors;
}

/*
 * Some commands like WRITE SAME have a payload or data transfer size which
 * is different from the size of the request. Any driver that supports such
 * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to
 * calculate the data transfer size.
 */
static inline unsigned int blk_rq_payload_bytes(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return rq->special_vec.bv_len;
	return blk_rq_bytes(rq);
}

/*
 * Return the first full biovec in the request. The caller needs to check that
 * there are any bvecs before calling this helper.
 */
static inline struct bio_vec req_bvec(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return rq->special_vec;
	return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);
}

static inline unsigned int blk_rq_count_bios(struct request *rq)
{
	unsigned int nr_bios = 0;
	struct bio *bio;

	__rq_for_each_bio(bio, rq)
		nr_bios++;

	return nr_bios;
}

void blk_steal_bios(struct bio_list *list, struct request *rq);

/*
 * Request completion related functions.
 *
 * blk_update_request() completes given number of bytes and updates
 * the request without completing it.
 */
bool blk_update_request(struct request *rq, blk_status_t error,
		unsigned int nr_bytes);
void blk_abort_request(struct request *);

/*
 * Number of physical segments as sent to the device.
 *
 * Normally this is the number of discontiguous data segments sent by the
 * submitter. But for data-less commands like discard we might have no
 * actual data segments submitted, but the driver might have to add its
 * own special payload. In that case we still return 1 here so that this
 * special payload will be mapped.
 */
static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return 1;
	return rq->nr_phys_segments;
}

/*
 * Number of discard segments (or ranges) the driver needs to fill in.
 * Each discard bio merged into a request is counted as one segment.
 */
static inline unsigned short blk_rq_nr_discard_segments(struct request *rq)
{
	return max_t(unsigned short, rq->nr_phys_segments, 1);
}

int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
		struct scatterlist *sglist, struct scatterlist **last_sg);
static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
		struct scatterlist *sglist)
{
	struct scatterlist *last_sg = NULL;

	return __blk_rq_map_sg(q, rq, sglist, &last_sg);
}
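
/*
 * Illustrative sketch (not part of this header): a DMA-capable driver usually
 * maps the request to a scatterlist in ->queue_rq() and then hands it to the
 * DMA API. The sg table location and device pointer are made up; real drivers
 * typically size the table with blk_rq_nr_phys_segments().
 *
 *	struct scatterlist *sgl = cmd->sgl;
 *	int nents;
 *
 *	sg_init_table(sgl, blk_rq_nr_phys_segments(rq));
 *	nents = blk_rq_map_sg(rq->q, rq, sgl);
 *	if (nents)
 *		nents = dma_map_sg(dev, sgl, nents, rq_dma_dir(rq));
 */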

void blk_dump_rq_flags(struct request *, char *);

#ifdef CONFIG_BLK_DEV_ZONED
static inline unsigned int blk_rq_zone_no(struct request *rq)
{
	return disk_zone_no(rq->q->disk, blk_rq_pos(rq));
}

static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
{
	return disk_zone_is_seq(rq->q->disk, blk_rq_pos(rq));
}

/**
 * blk_rq_is_seq_zoned_write() - Check if @rq requires write serialization.
 * @rq: Request to examine.
 *
 * Note: REQ_OP_ZONE_APPEND requests do not require serialization.
 */
static inline bool blk_rq_is_seq_zoned_write(struct request *rq)
{
	return op_needs_zoned_write_locking(req_op(rq)) &&
		blk_rq_zone_is_seq(rq);
}

bool blk_req_needs_zone_write_lock(struct request *rq);
bool blk_req_zone_write_trylock(struct request *rq);
void __blk_req_zone_write_lock(struct request *rq);
void __blk_req_zone_write_unlock(struct request *rq);

static inline void blk_req_zone_write_lock(struct request *rq)
{
	if (blk_req_needs_zone_write_lock(rq))
		__blk_req_zone_write_lock(rq);
}

static inline void blk_req_zone_write_unlock(struct request *rq)
{
	if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
		__blk_req_zone_write_unlock(rq);
}

static inline bool blk_req_zone_is_write_locked(struct request *rq)
{
	return rq->q->disk->seq_zones_wlock &&
		test_bit(blk_rq_zone_no(rq), rq->q->disk->seq_zones_wlock);
}

static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
{
	if (!blk_req_needs_zone_write_lock(rq))
		return true;
	return !blk_req_zone_is_write_locked(rq);
}
#else /* CONFIG_BLK_DEV_ZONED */
static inline bool blk_rq_is_seq_zoned_write(struct request *rq)
{
	return false;
}

static inline bool blk_req_needs_zone_write_lock(struct request *rq)
{
	return false;
}

static inline void blk_req_zone_write_lock(struct request *rq)
{
}

static inline void blk_req_zone_write_unlock(struct request *rq)
{
}

static inline bool blk_req_zone_is_write_locked(struct request *rq)
{
	return false;
}

static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
{
	return true;
}
#endif /* CONFIG_BLK_DEV_ZONED */

#endif /* BLK_MQ_H */