/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_MQ_H
#define BLK_MQ_H

#include <linux/blkdev.h>
#include <linux/sbitmap.h>
#include <linux/lockdep.h>
#include <linux/scatterlist.h>
#include <linux/prefetch.h>

struct blk_mq_tags;
struct blk_flush_queue;

#define BLKDEV_MIN_RQ		4
#define BLKDEV_DEFAULT_RQ	128

typedef void (rq_end_io_fn)(struct request *, blk_status_t);

/*
 * request flags
 */
typedef __u32 __bitwise req_flags_t;

/* drive already may have started this one */
#define RQF_STARTED		((__force req_flags_t)(1 << 1))
/* may not be passed by ioscheduler */
#define RQF_SOFTBARRIER		((__force req_flags_t)(1 << 3))
/* request for flush sequence */
#define RQF_FLUSH_SEQ		((__force req_flags_t)(1 << 4))
/* merge of different types, fail separately */
#define RQF_MIXED_MERGE		((__force req_flags_t)(1 << 5))
/* track inflight for MQ */
#define RQF_MQ_INFLIGHT		((__force req_flags_t)(1 << 6))
/* don't call prep for this one */
#define RQF_DONTPREP		((__force req_flags_t)(1 << 7))
/* vaguely specified driver internal error. Ignored by the block layer */
#define RQF_FAILED		((__force req_flags_t)(1 << 10))
/* don't warn about errors */
#define RQF_QUIET		((__force req_flags_t)(1 << 11))
/* elevator private data attached */
#define RQF_ELVPRIV		((__force req_flags_t)(1 << 12))
/* account into disk and partition IO statistics */
#define RQF_IO_STAT		((__force req_flags_t)(1 << 13))
/* runtime pm request */
#define RQF_PM			((__force req_flags_t)(1 << 15))
/* on IO scheduler merge hash */
#define RQF_HASHED		((__force req_flags_t)(1 << 16))
/* track IO completion time */
#define RQF_STATS		((__force req_flags_t)(1 << 17))
/*
 * Look at ->special_vec for the actual data payload instead of the
 * bio chain.
 */
#define RQF_SPECIAL_PAYLOAD	((__force req_flags_t)(1 << 18))
/* The per-zone write lock is held for this request */
#define RQF_ZONE_WRITE_LOCKED	((__force req_flags_t)(1 << 19))
/* already slept for hybrid poll */
#define RQF_MQ_POLL_SLEPT	((__force req_flags_t)(1 << 20))
/* ->timeout has been called, don't expire again */
#define RQF_TIMED_OUT		((__force req_flags_t)(1 << 21))
/* queue has elevator attached */
#define RQF_ELV			((__force req_flags_t)(1 << 22))
#define RQF_RESV		((__force req_flags_t)(1 << 23))

/* flags that prevent us from merging requests: */
#define RQF_NOMERGE_FLAGS \
	(RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)

enum mq_rq_state {
	MQ_RQ_IDLE		= 0,
	MQ_RQ_IN_FLIGHT		= 1,
	MQ_RQ_COMPLETE		= 2,
};
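/*
 * Illustrative sketch (not part of this interface): merge decisions in the
 * block layer reject a request that carries any of the flags above, along
 * the lines of:
 *
 *	if (rq->rq_flags & RQF_NOMERGE_FLAGS)
 *		return false;
 */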
/*
 * Try to put the fields that are referenced together in the same cacheline.
 *
 * If you modify this structure, make sure to update blk_rq_init() and
 * especially blk_mq_rq_ctx_init() to take care of the added fields.
 */
struct request {
	struct request_queue *q;
	struct blk_mq_ctx *mq_ctx;
	struct blk_mq_hw_ctx *mq_hctx;

	blk_opf_t cmd_flags;		/* op and common flags */
	req_flags_t rq_flags;

	int tag;
	int internal_tag;

	unsigned int timeout;

	/* the following two fields are internal, NEVER access directly */
	unsigned int __data_len;	/* total data len */
	sector_t __sector;		/* sector cursor */

	struct bio *bio;
	struct bio *biotail;

	union {
		struct list_head queuelist;
		struct request *rq_next;
	};

	struct block_device *part;
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
	/* Time that the first bio started allocating this request. */
	u64 alloc_time_ns;
#endif
	/* Time that this request was allocated for this IO. */
	u64 start_time_ns;
	/* Time that I/O was submitted to the device. */
	u64 io_start_time_ns;

#ifdef CONFIG_BLK_WBT
	unsigned short wbt_flags;
#endif
	/*
	 * rq sectors used for blk stats. It has the same value as
	 * blk_rq_sectors(rq), except that it is never zeroed by completion.
	 */
	unsigned short stats_sectors;

	/*
	 * Number of scatter-gather DMA addr+len pairs after
	 * physical address coalescing is performed.
	 */
	unsigned short nr_phys_segments;

#ifdef CONFIG_BLK_DEV_INTEGRITY
	unsigned short nr_integrity_segments;
#endif

#ifdef CONFIG_BLK_INLINE_ENCRYPTION
	struct bio_crypt_ctx *crypt_ctx;
	struct blk_crypto_keyslot *crypt_keyslot;
#endif

	unsigned short write_hint;
	unsigned short ioprio;

	enum mq_rq_state state;
	atomic_t ref;

	unsigned long deadline;

	/*
	 * The hash is used inside the scheduler, and killed once the
	 * request reaches the dispatch list. The ipi_list is only used
	 * to queue the request for softirq completion, which is long
	 * after the request has been unhashed (and even removed from
	 * the dispatch list).
	 */
	union {
		struct hlist_node hash;	/* merge hash */
		struct llist_node ipi_list;
	};

	/*
	 * The rb_node is only used inside the io scheduler, requests
	 * are pruned when moved to the dispatch queue. So let the
	 * completion_data share space with the rb_node.
	 */
	union {
		struct rb_node rb_node;	/* sort/lookup */
		struct bio_vec special_vec;
		void *completion_data;
	};

	/*
	 * Three pointers are available for the IO schedulers, if they need
	 * more they have to dynamically allocate it. Flush requests are
	 * never put on the IO scheduler. So let the flush fields share
	 * space with the elevator data.
	 */
	union {
		struct {
			struct io_cq *icq;
			void *priv[2];
		} elv;

		struct {
			unsigned int seq;
			struct list_head list;
			rq_end_io_fn *saved_end_io;
		} flush;
	};

	union {
		struct __call_single_data csd;
		u64 fifo_time;
	};

	/*
	 * completion callback.
	 */
	rq_end_io_fn *end_io;
	void *end_io_data;
};

static inline enum req_op req_op(const struct request *req)
{
	return req->cmd_flags & REQ_OP_MASK;
}

static inline bool blk_rq_is_passthrough(struct request *rq)
{
	return blk_op_is_passthrough(req_op(rq));
}

static inline unsigned short req_get_ioprio(struct request *req)
{
	return req->ioprio;
}

#define rq_data_dir(rq)		(op_is_write(req_op(rq)) ? WRITE : READ)

#define rq_dma_dir(rq) \
	(op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)
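/*
 * Illustrative sketch (not part of this interface), assuming a hypothetical
 * driver prep path: the accessors above are the intended way to look at a
 * request's operation and data direction, e.g.
 *
 *	switch (req_op(rq)) {
 *	case REQ_OP_READ:
 *	case REQ_OP_WRITE:
 *		dir = rq_dma_dir(rq);	// DMA_TO_DEVICE for writes
 *		break;
 *	case REQ_OP_FLUSH:
 *		// no data transfer
 *		break;
 *	default:
 *		return BLK_STS_NOTSUPP;
 *	}
 */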
#define rq_list_add(listptr, rq)	do {		\
	(rq)->rq_next = *(listptr);			\
	*(listptr) = rq;				\
} while (0)

#define rq_list_pop(listptr)				\
({							\
	struct request *__req = NULL;			\
	if ((listptr) && *(listptr)) {			\
		__req = *(listptr);			\
		*(listptr) = __req->rq_next;		\
	}						\
	__req;						\
})

#define rq_list_peek(listptr)				\
({							\
	struct request *__req = NULL;			\
	if ((listptr) && *(listptr))			\
		__req = *(listptr);			\
	__req;						\
})

#define rq_list_for_each(listptr, pos)			\
	for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos))

#define rq_list_for_each_safe(listptr, pos, nxt)			\
	for (pos = rq_list_peek((listptr)), nxt = rq_list_next(pos);	\
		pos; pos = nxt, nxt = pos ? rq_list_next(pos) : NULL)

#define rq_list_next(rq)	(rq)->rq_next
#define rq_list_empty(list)	((list) == (struct request *) NULL)

/**
 * rq_list_move() - move a struct request from one list to another
 * @src: The source list @rq is currently in
 * @dst: The destination list that @rq will be appended to
 * @rq: The request to move
 * @prev: The request preceding @rq in @src (NULL if @rq is the head)
 */
static inline void rq_list_move(struct request **src, struct request **dst,
				struct request *rq, struct request *prev)
{
	if (prev)
		prev->rq_next = rq->rq_next;
	else
		*src = rq->rq_next;
	rq_list_add(dst, rq);
}

enum blk_eh_timer_return {
	BLK_EH_DONE,		/* driver has completed the command */
	BLK_EH_RESET_TIMER,	/* reset timer and try again */
};

#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */

/**
 * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
 * block device
 */
struct blk_mq_hw_ctx {
	struct {
		/** @lock: Protects the dispatch list. */
		spinlock_t lock;
		/**
		 * @dispatch: Used for requests that are ready to be
		 * dispatched to the hardware but for some reason (e.g. lack of
		 * resources) could not be sent to the hardware. As soon as the
		 * driver can send new requests, requests on this list will
		 * be sent first for a fairer dispatch.
		 */
		struct list_head dispatch;
		/**
		 * @state: BLK_MQ_S_* flags. Defines the state of the hw
		 * queue (active, scheduled to restart, stopped).
		 */
		unsigned long state;
	} ____cacheline_aligned_in_smp;

	/**
	 * @run_work: Used for scheduling a hardware queue run at a later time.
	 */
	struct delayed_work run_work;
	/** @cpumask: Map of available CPUs where this hctx can run. */
	cpumask_var_t cpumask;
	/**
	 * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU
	 * selection from @cpumask.
	 */
	int next_cpu;
	/**
	 * @next_cpu_batch: Counter of how many queue runs are left in the
	 * batch before switching to the next CPU.
	 */
	int next_cpu_batch;

	/** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */
	unsigned long flags;

	/**
	 * @sched_data: Pointer owned by the IO scheduler attached to a request
	 * queue. It's up to the IO scheduler how to use this pointer.
	 */
	void *sched_data;
	/**
	 * @queue: Pointer to the request queue that owns this hardware context.
	 */
	struct request_queue *queue;
	/** @fq: Queue of requests that need to perform a flush operation. */
	struct blk_flush_queue *fq;

	/**
	 * @driver_data: Pointer to data owned by the block driver that created
	 * this hctx.
	 */
	void *driver_data;

	/**
	 * @ctx_map: Bitmap for each software queue. If bit is on, there is a
	 * pending request in that software queue.
	 */
	struct sbitmap ctx_map;

	/**
	 * @dispatch_from: Software queue to be used when no scheduler was
	 * selected.
	 */
	struct blk_mq_ctx *dispatch_from;
	/**
	 * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to
	 * decide if the hw_queue is busy using Exponential Weighted Moving
	 * Average algorithm.
	 */
	unsigned int dispatch_busy;

	/** @type: HCTX_TYPE_* flags. Type of hardware queue. */
	unsigned short type;
	/** @nr_ctx: Number of software queues. */
	unsigned short nr_ctx;
	/** @ctxs: Array of software queues. */
	struct blk_mq_ctx **ctxs;

	/** @dispatch_wait_lock: Lock for dispatch_wait queue. */
	spinlock_t dispatch_wait_lock;
	/**
	 * @dispatch_wait: Waitqueue to put requests when there is no tag
	 * available at the moment, to wait for another try in the future.
	 */
	wait_queue_entry_t dispatch_wait;

	/**
	 * @wait_index: Index of next available dispatch_wait queue to insert
	 * requests.
	 */
	atomic_t wait_index;

	/**
	 * @tags: Tags owned by the block driver. A tag in this set is only
	 * assigned when a request is dispatched from a hardware queue.
	 */
	struct blk_mq_tags *tags;
	/**
	 * @sched_tags: Tags owned by I/O scheduler. If there is an I/O
	 * scheduler associated with a request queue, a tag is assigned when
	 * that request is allocated. Else, this member is not used.
	 */
	struct blk_mq_tags *sched_tags;

	/** @queued: Number of queued requests. */
	unsigned long queued;
	/** @run: Number of dispatched requests. */
	unsigned long run;

	/** @numa_node: NUMA node the storage adapter has been connected to. */
	unsigned int numa_node;
	/** @queue_num: Index of this hardware queue. */
	unsigned int queue_num;

	/**
	 * @nr_active: Number of active requests. Only used when a tag set is
	 * shared across request queues.
	 */
	atomic_t nr_active;

	/** @cpuhp_online: List to store requests when a CPU is going offline. */
	struct hlist_node cpuhp_online;
	/** @cpuhp_dead: List to store requests when a CPU dies. */
	struct hlist_node cpuhp_dead;
	/** @kobj: Kernel object for sysfs. */
	struct kobject kobj;

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @debugfs_dir: debugfs directory for this hardware queue. Named
	 * as cpu<cpu_number>.
	 */
	struct dentry *debugfs_dir;
	/** @sched_debugfs_dir: debugfs directory for the scheduler. */
	struct dentry *sched_debugfs_dir;
#endif

	/**
	 * @hctx_list: if this hctx is not in use, this is an entry in
	 * q->unused_hctx_list.
	 */
	struct list_head hctx_list;
};

/**
 * struct blk_mq_queue_map - Map software queues to hardware queues
 * @mq_map: CPU ID to hardware queue index map. This is an array
 *	with nr_cpu_ids elements. Each element has a value in the range
 *	[@queue_offset, @queue_offset + @nr_queues).
 * @nr_queues: Number of hardware queues to map CPU IDs onto.
 * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe
 *	driver to map each hardware queue type (enum hctx_type) onto a distinct
 *	set of hardware queues.
 */
struct blk_mq_queue_map {
	unsigned int *mq_map;
	unsigned int nr_queues;
	unsigned int queue_offset;
};

/**
 * enum hctx_type - Type of hardware queue
 * @HCTX_TYPE_DEFAULT:	All I/O not otherwise accounted for.
 * @HCTX_TYPE_READ:	Just for READ I/O.
 * @HCTX_TYPE_POLL:	Polled I/O of any kind.
 * @HCTX_MAX_TYPES:	Number of types of hctx.
 */
enum hctx_type {
	HCTX_TYPE_DEFAULT,
	HCTX_TYPE_READ,
	HCTX_TYPE_POLL,

	HCTX_MAX_TYPES,
};

/**
 * struct blk_mq_tag_set - tag set that can be shared between request queues
 * @map:	   One or more ctx -> hctx mappings. One map exists for each
 *		   hardware queue type (enum hctx_type) that the driver wishes
 *		   to support. There are no restrictions on maps being of the
 *		   same size, and it's perfectly legal to share maps between
 *		   types.
 * @nr_maps:	   Number of elements in the @map array. A number in the range
 *		   [1, HCTX_MAX_TYPES].
 * @ops:	   Pointers to functions that implement block driver behavior.
 * @nr_hw_queues:  Number of hardware queues supported by the block driver that
 *		   owns this data structure.
 * @queue_depth:   Number of tags per hardware queue, reserved tags included.
 * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag
 *		   allocations.
 * @cmd_size:	   Number of additional bytes to allocate per request. The block
 *		   driver owns these additional bytes.
 * @numa_node:	   NUMA node the storage adapter has been connected to.
 * @timeout:	   Request processing timeout in jiffies.
 * @flags:	   Zero or more BLK_MQ_F_* flags.
 * @driver_data:   Pointer to data owned by the block driver that created this
 *		   tag set.
 * @tags:	   Tag sets. One tag set per hardware queue. Has @nr_hw_queues
 *		   elements.
 * @shared_tags:   Shared set of tags. Has @nr_hw_queues elements. If set,
 *		   shared by all @tags.
 * @tag_list_lock: Serializes tag_list accesses.
 * @tag_list:	   List of the request queues that use this tag set. See also
 *		   request_queue.tag_set_list.
 */
struct blk_mq_tag_set {
	struct blk_mq_queue_map map[HCTX_MAX_TYPES];
	unsigned int nr_maps;
	const struct blk_mq_ops *ops;
	unsigned int nr_hw_queues;
	unsigned int queue_depth;
	unsigned int reserved_tags;
	unsigned int cmd_size;
	int numa_node;
	unsigned int timeout;
	unsigned int flags;
	void *driver_data;

	struct blk_mq_tags **tags;

	struct blk_mq_tags *shared_tags;

	struct mutex tag_list_lock;
	struct list_head tag_list;
};
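/*
 * Illustrative sketch (not part of this interface), with hypothetical my_*
 * names: a simple single-queue driver typically fills in a tag set and hands
 * it to blk_mq_alloc_tag_set() / blk_mq_alloc_disk() (declared below):
 *
 *	set->ops = &my_mq_ops;
 *	set->nr_hw_queues = 1;
 *	set->nr_maps = 1;
 *	set->queue_depth = 64;
 *	set->numa_node = NUMA_NO_NODE;
 *	set->cmd_size = sizeof(struct my_cmd);
 *	set->flags = BLK_MQ_F_SHOULD_MERGE;
 *	if (blk_mq_alloc_tag_set(set))
 *		goto err;
 *	disk = blk_mq_alloc_disk(set, my_dev);
 */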
/**
 * struct blk_mq_queue_data - Data about a request inserted in a queue
 *
 * @rq:   Request pointer.
 * @last: If it is the last request in the queue.
 */
struct blk_mq_queue_data {
	struct request *rq;
	bool last;
};

typedef bool (busy_tag_iter_fn)(struct request *, void *);

/**
 * struct blk_mq_ops - Callback functions that implement block driver
 * behaviour.
 */
struct blk_mq_ops {
	/**
	 * @queue_rq: Queue a new request from block IO.
	 */
	blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *,
				 const struct blk_mq_queue_data *);

	/**
	 * @commit_rqs: If a driver uses bd->last to judge when to submit
	 * requests to hardware, it must define this function. In case of errors
	 * that make us stop issuing further requests, this hook serves the
	 * purpose of kicking the hardware (which the last request otherwise
	 * would have done).
	 */
	void (*commit_rqs)(struct blk_mq_hw_ctx *);

	/**
	 * @queue_rqs: Queue a list of new requests. The driver is guaranteed
	 * that each request belongs to the same queue. If the driver doesn't
	 * empty the @rqlist completely, then the rest will be queued
	 * individually by the block layer upon return.
	 */
	void (*queue_rqs)(struct request **rqlist);

	/**
	 * @get_budget: Reserve a budget before queueing a request. Once
	 * .queue_rq is run, it is the driver's responsibility to release the
	 * reserved budget. The failure case of .get_budget must also be
	 * handled to avoid I/O deadlocks.
	 */
	int (*get_budget)(struct request_queue *);

	/**
	 * @put_budget: Release the reserved budget.
	 */
	void (*put_budget)(struct request_queue *, int);

	/**
	 * @set_rq_budget_token: store rq's budget token
	 */
	void (*set_rq_budget_token)(struct request *, int);
	/**
	 * @get_rq_budget_token: retrieve rq's budget token
	 */
	int (*get_rq_budget_token)(struct request *);

	/**
	 * @timeout: Called on request timeout.
	 */
	enum blk_eh_timer_return (*timeout)(struct request *);

	/**
	 * @poll: Called to poll for completion of a specific tag.
	 */
	int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *);

	/**
	 * @complete: Mark the request as complete.
	 */
	void (*complete)(struct request *);

	/**
	 * @init_hctx: Called when the block layer side of a hardware queue has
	 * been set up, allowing the driver to allocate/init matching
	 * structures.
	 */
	int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int);
	/**
	 * @exit_hctx: Ditto for exit/teardown.
	 */
	void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);

	/**
	 * @init_request: Called for every command allocated by the block layer
	 * to allow the driver to set up driver specific data.
	 *
	 * A tag greater than or equal to queue_depth is used for setting up
	 * the flush request.
	 */
	int (*init_request)(struct blk_mq_tag_set *set, struct request *,
			    unsigned int, unsigned int);
	/**
	 * @exit_request: Ditto for exit/teardown.
	 */
	void (*exit_request)(struct blk_mq_tag_set *set, struct request *,
			     unsigned int);

	/**
	 * @cleanup_rq: Called before freeing a request that has not completed
	 * yet, usually to free driver-private data.
	 */
	void (*cleanup_rq)(struct request *);

	/**
	 * @busy: If set, returns whether or not this queue is currently busy.
	 */
	bool (*busy)(struct request_queue *);

	/**
	 * @map_queues: This allows drivers to specify their own queue mapping
	 * by overriding the setup-time function that builds the mq_map.
	 */
	int (*map_queues)(struct blk_mq_tag_set *set);

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @show_rq: Used by the debugfs implementation to show driver-specific
	 * information about a request.
	 */
	void (*show_rq)(struct seq_file *m, struct request *rq);
#endif
};
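/*
 * Illustrative sketch (not part of this interface), with hypothetical my_*
 * names: the minimal driver-side counterpart is a ->queue_rq() implementation
 * wired into a blk_mq_ops instance:
 *
 *	static blk_status_t my_queue_rq(struct blk_mq_hw_ctx *hctx,
 *					const struct blk_mq_queue_data *bd)
 *	{
 *		struct request *rq = bd->rq;
 *
 *		blk_mq_start_request(rq);
 *		// hand rq to the hardware, honouring bd->last if batching
 *		return BLK_STS_OK;
 *	}
 *
 *	static const struct blk_mq_ops my_mq_ops = {
 *		.queue_rq = my_queue_rq,
 *	};
 */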
enum {
	BLK_MQ_F_SHOULD_MERGE	= 1 << 0,
	BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1,
	/*
	 * Set when this device requires an underlying blk-mq device for
	 * completing IO.
	 */
	BLK_MQ_F_STACKING	= 1 << 2,
	BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
	BLK_MQ_F_BLOCKING	= 1 << 5,
	/* Do not allow an I/O scheduler to be configured. */
	BLK_MQ_F_NO_SCHED	= 1 << 6,
	/*
	 * Select 'none' during queue registration in case of a single hwq
	 * or shared hwqs instead of 'mq-deadline'.
	 */
	BLK_MQ_F_NO_SCHED_BY_DEFAULT	= 1 << 7,
	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
	BLK_MQ_F_ALLOC_POLICY_BITS = 1,

	BLK_MQ_S_STOPPED	= 0,
	BLK_MQ_S_TAG_ACTIVE	= 1,
	BLK_MQ_S_SCHED_RESTART	= 2,

	/* hw queue is inactive after all its CPUs become offline */
	BLK_MQ_S_INACTIVE	= 3,

	BLK_MQ_MAX_DEPTH	= 10240,

	BLK_MQ_CPU_WORK_BATCH	= 8,
};

#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \
	((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \
		((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1))
#define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \
	((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
		<< BLK_MQ_F_ALLOC_POLICY_START_BIT)

#define BLK_MQ_NO_HCTX_IDX	(-1U)

struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
		struct lock_class_key *lkclass);
#define blk_mq_alloc_disk(set, queuedata)			\
({								\
	static struct lock_class_key __key;			\
								\
	__blk_mq_alloc_disk(set, queuedata, &__key);		\
})
struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
		struct lock_class_key *lkclass);
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
		struct request_queue *q);
void blk_mq_destroy_queue(struct request_queue *);

int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
		const struct blk_mq_ops *ops, unsigned int queue_depth,
		unsigned int set_flags);
void blk_mq_free_tag_set(struct blk_mq_tag_set *set);

void blk_mq_free_request(struct request *rq);

bool blk_mq_queue_inflight(struct request_queue *q);

enum {
	/* return when out of requests */
	BLK_MQ_REQ_NOWAIT	= (__force blk_mq_req_flags_t)(1 << 0),
	/* allocate from reserved pool */
	BLK_MQ_REQ_RESERVED	= (__force blk_mq_req_flags_t)(1 << 1),
	/* set RQF_PM */
	BLK_MQ_REQ_PM		= (__force blk_mq_req_flags_t)(1 << 2),
};

struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
		blk_mq_req_flags_t flags);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
		blk_opf_t opf, blk_mq_req_flags_t flags,
		unsigned int hctx_idx);
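/*
 * Illustrative sketch (not part of this interface): a typical passthrough
 * user allocates a request, attaches data with helpers declared further down,
 * executes it synchronously and frees it again:
 *
 *	rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, BLK_MQ_REQ_NOWAIT);
 *	if (IS_ERR(rq))
 *		return PTR_ERR(rq);
 *	// attach a data buffer, e.g. with blk_rq_map_kern()
 *	status = blk_execute_rq(rq, false);
 *	blk_mq_free_request(rq);
 */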
/*
 * Tag address space map.
 */
struct blk_mq_tags {
	unsigned int nr_tags;
	unsigned int nr_reserved_tags;

	atomic_t active_queues;

	struct sbitmap_queue bitmap_tags;
	struct sbitmap_queue breserved_tags;

	struct request **rqs;
	struct request **static_rqs;
	struct list_head page_list;

	/*
	 * used to clear request references in rqs[] before freeing the
	 * request pool
	 */
	spinlock_t lock;
};

static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags,
					       unsigned int tag)
{
	if (tag < tags->nr_tags) {
		prefetch(tags->rqs[tag]);
		return tags->rqs[tag];
	}

	return NULL;
}

enum {
	BLK_MQ_UNIQUE_TAG_BITS = 16,
	BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1,
};

u32 blk_mq_unique_tag(struct request *rq);

static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag)
{
	return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS;
}

static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
{
	return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
}

/**
 * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
 * @rq: target request.
 */
static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
{
	return READ_ONCE(rq->state);
}

static inline int blk_mq_request_started(struct request *rq)
{
	return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
}

static inline int blk_mq_request_completed(struct request *rq)
{
	return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
}

/*
 * Set the state to complete when completing a request from inside ->queue_rq.
 * This is used by drivers that want to ensure special complete actions that
 * need access to the request are called on failure, e.g. by nvme for
 * multipathing.
 */
static inline void blk_mq_set_request_complete(struct request *rq)
{
	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
}

/*
 * Complete the request directly instead of deferring it to softirq or
 * completing it on another CPU. Useful when running in preemptible context
 * rather than from an interrupt.
 */
static inline void blk_mq_complete_request_direct(struct request *rq,
		   void (*complete)(struct request *rq))
{
	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
	complete(rq);
}

void blk_mq_start_request(struct request *rq);
void blk_mq_end_request(struct request *rq, blk_status_t error);
void __blk_mq_end_request(struct request *rq, blk_status_t error);
void blk_mq_end_request_batch(struct io_comp_batch *ib);

/*
 * We only need start/end time stamping if iostat or blk stats are enabled,
 * or an IO scheduler is in use.
 */
static inline bool blk_mq_need_time_stamp(struct request *rq)
{
	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_ELV));
}

static inline bool blk_mq_is_reserved_rq(struct request *rq)
{
	return rq->rq_flags & RQF_RESV;
}
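/*
 * Illustrative sketch (not part of this interface): the usual lifetime
 * pairing is blk_mq_start_request() from ->queue_rq() and a completion from
 * the driver's IRQ (or polling) path:
 *
 *	// in ->queue_rq():
 *	blk_mq_start_request(rq);
 *
 *	// later, from the driver's completion path:
 *	blk_mq_end_request(rq, BLK_STS_OK);
 */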
/*
 * Batched completions only work when there is no I/O error and no special
 * ->end_io handler.
 */
static inline bool blk_mq_add_to_batch(struct request *req,
				       struct io_comp_batch *iob, int ioerror,
				       void (*complete)(struct io_comp_batch *))
{
	if (!iob || (req->rq_flags & RQF_ELV) || req->end_io || ioerror)
		return false;
	if (!iob->complete)
		iob->complete = complete;
	else if (iob->complete != complete)
		return false;
	iob->need_ts |= blk_mq_need_time_stamp(req);
	rq_list_add(&iob->req_list, req);
	return true;
}

void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
void blk_mq_complete_request(struct request *rq);
bool blk_mq_complete_request_remote(struct request *rq);
bool blk_mq_queue_stopped(struct request_queue *q);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_stop_hw_queues(struct request_queue *q);
void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_quiesce_queue(struct request_queue *q);
void blk_mq_wait_quiesce_done(struct request_queue *q);
void blk_mq_unquiesce_queue(struct request_queue *q);
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs);
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
		busy_tag_iter_fn *fn, void *priv);
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset);
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_unfreeze_queue(struct request_queue *q);
void blk_freeze_queue_start(struct request_queue *q);
void blk_mq_freeze_queue_wait(struct request_queue *q);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
				     unsigned long timeout);

int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);

void blk_mq_quiesce_queue_nowait(struct request_queue *q);

unsigned int blk_mq_rq_cpu(struct request *rq);

bool __blk_should_fake_timeout(struct request_queue *q);
static inline bool blk_should_fake_timeout(struct request_queue *q)
{
	if (IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT) &&
	    test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
		return __blk_should_fake_timeout(q);
	return false;
}

/**
 * blk_mq_rq_from_pdu - cast a PDU to a request
 * @pdu: the PDU (Protocol Data Unit) to be cast
 *
 * Return: request
 *
 * Driver command data is immediately after the request. So subtract request
 * size to get back to the original request.
 */
static inline struct request *blk_mq_rq_from_pdu(void *pdu)
{
	return pdu - sizeof(struct request);
}

/**
 * blk_mq_rq_to_pdu - cast a request to a PDU
 * @rq: the request to be cast
 *
 * Return: pointer to the PDU
 *
 * Driver command data is immediately after the request. So add request to get
 * the PDU.
 */
static inline void *blk_mq_rq_to_pdu(struct request *rq)
{
	return rq + 1;
}
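/*
 * Illustrative sketch (not part of this interface), with a hypothetical
 * struct my_cmd as the per-request PDU sized via blk_mq_tag_set.cmd_size:
 *
 *	set->cmd_size = sizeof(struct my_cmd);
 *	...
 *	struct my_cmd *cmd = blk_mq_rq_to_pdu(rq);
 *	struct request *orig = blk_mq_rq_from_pdu(cmd);	// orig == rq
 */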
#define queue_for_each_hw_ctx(q, hctx, i)				\
	xa_for_each(&(q)->hctx_table, (i), (hctx))

#define hctx_for_each_ctx(hctx, ctx, i)					\
	for ((i) = 0; (i) < (hctx)->nr_ctx &&				\
	     ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)

static inline void blk_mq_cleanup_rq(struct request *rq)
{
	if (rq->q->mq_ops->cleanup_rq)
		rq->q->mq_ops->cleanup_rq(rq);
}

static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
		unsigned int nr_segs)
{
	rq->nr_phys_segments = nr_segs;
	rq->__data_len = bio->bi_iter.bi_size;
	rq->bio = rq->biotail = bio;
	rq->ioprio = bio_prio(bio);
}

void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
		struct lock_class_key *key);

static inline bool rq_is_sync(struct request *rq)
{
	return op_is_sync(rq->cmd_flags);
}

void blk_rq_init(struct request_queue *q, struct request *rq);
int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
		struct bio_set *bs, gfp_t gfp_mask,
		int (*bio_ctr)(struct bio *, struct bio *, void *), void *data);
void blk_rq_unprep_clone(struct request *rq);
blk_status_t blk_insert_cloned_request(struct request *rq);

struct rq_map_data {
	struct page **pages;
	int page_order;
	int nr_entries;
	unsigned long offset;
	int null_mapped;
	int from_user;
};

int blk_rq_map_user(struct request_queue *, struct request *,
		struct rq_map_data *, void __user *, unsigned long, gfp_t);
int blk_rq_map_user_iov(struct request_queue *, struct request *,
		struct rq_map_data *, const struct iov_iter *, gfp_t);
int blk_rq_unmap_user(struct bio *);
int blk_rq_map_kern(struct request_queue *, struct request *, void *,
		unsigned int, gfp_t);
int blk_rq_append_bio(struct request *rq, struct bio *bio);
void blk_execute_rq_nowait(struct request *rq, bool at_head);
blk_status_t blk_execute_rq(struct request *rq, bool at_head);

struct req_iterator {
	struct bvec_iter iter;
	struct bio *bio;
};

#define __rq_for_each_bio(_bio, rq)	\
	if ((rq->bio))			\
		for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)

#define rq_for_each_segment(bvl, _rq, _iter)			\
	__rq_for_each_bio(_iter.bio, _rq)			\
		bio_for_each_segment(bvl, _iter.bio, _iter.iter)

#define rq_for_each_bvec(bvl, _rq, _iter)			\
	__rq_for_each_bio(_iter.bio, _rq)			\
		bio_for_each_bvec(bvl, _iter.bio, _iter.iter)

#define rq_iter_last(bvec, _iter)				\
		(_iter.bio->bi_next == NULL &&			\
		 bio_iter_last(bvec, _iter.iter))

/*
 * blk_rq_pos()			: the current sector
 * blk_rq_bytes()		: bytes left in the entire request
 * blk_rq_cur_bytes()		: bytes left in the current segment
 * blk_rq_sectors()		: sectors left in the entire request
 * blk_rq_cur_sectors()		: sectors left in the current segment
 * blk_rq_stats_sectors()	: sectors of the entire request used for stats
 */
static inline sector_t blk_rq_pos(const struct request *rq)
{
	return rq->__sector;
}

static inline unsigned int blk_rq_bytes(const struct request *rq)
{
	return rq->__data_len;
}

static inline int blk_rq_cur_bytes(const struct request *rq)
{
	if (!rq->bio)
		return 0;
	if (!bio_has_data(rq->bio))	/* dataless requests such as discard */
		return rq->bio->bi_iter.bi_size;
	return bio_iovec(rq->bio).bv_len;
}
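/*
 * Illustrative sketch (not part of this interface): for a regular read/write
 * request the iterators above walk every segment, and the segment lengths
 * add up to blk_rq_bytes():
 *
 *	struct req_iterator iter;
 *	struct bio_vec bvec;
 *	unsigned int bytes = 0;
 *
 *	rq_for_each_segment(bvec, rq, iter)
 *		bytes += bvec.bv_len;
 *	// bytes == blk_rq_bytes(rq)
 */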
static inline unsigned int blk_rq_sectors(const struct request *rq)
{
	return blk_rq_bytes(rq) >> SECTOR_SHIFT;
}

static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
{
	return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
}

static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
{
	return rq->stats_sectors;
}

/*
 * Some commands like WRITE SAME have a payload or data transfer size which
 * is different from the size of the request. Any driver that supports such
 * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to
 * calculate the data transfer size.
 */
static inline unsigned int blk_rq_payload_bytes(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return rq->special_vec.bv_len;
	return blk_rq_bytes(rq);
}

/*
 * Return the first full biovec in the request. The caller needs to check
 * that there are bvecs before calling this helper.
 */
static inline struct bio_vec req_bvec(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return rq->special_vec;
	return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);
}

static inline unsigned int blk_rq_count_bios(struct request *rq)
{
	unsigned int nr_bios = 0;
	struct bio *bio;

	__rq_for_each_bio(bio, rq)
		nr_bios++;

	return nr_bios;
}

void blk_steal_bios(struct bio_list *list, struct request *rq);

/*
 * Request completion related functions.
 *
 * blk_update_request() completes the given number of bytes and updates
 * the request without completing it.
 */
bool blk_update_request(struct request *rq, blk_status_t error,
			unsigned int nr_bytes);
void blk_abort_request(struct request *);

/*
 * Number of physical segments as sent to the device.
 *
 * Normally this is the number of discontiguous data segments sent by the
 * submitter. But for data-less commands like discard we might have no
 * actual data segments submitted, but the driver might have to add its
 * own special payload. In that case we still return 1 here so that this
 * special payload will be mapped.
 */
static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return 1;
	return rq->nr_phys_segments;
}

/*
 * Number of discard segments (or ranges) the driver needs to fill in.
 * Each discard bio merged into a request is counted as one segment.
 */
static inline unsigned short blk_rq_nr_discard_segments(struct request *rq)
{
	return max_t(unsigned short, rq->nr_phys_segments, 1);
}

int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
		struct scatterlist *sglist, struct scatterlist **last_sg);
static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
		struct scatterlist *sglist)
{
	struct scatterlist *last_sg = NULL;

	return __blk_rq_map_sg(q, rq, sglist, &last_sg);
}
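/*
 * Illustrative sketch (not part of this interface), assuming a hypothetical
 * device "dev": blk_rq_nr_phys_segments() bounds the scatterlist that
 * blk_rq_map_sg() fills in, and rq_dma_dir() supplies the mapping direction:
 *
 *	sgl = kmalloc_array(blk_rq_nr_phys_segments(rq), sizeof(*sgl),
 *			    GFP_ATOMIC);
 *	sg_init_table(sgl, blk_rq_nr_phys_segments(rq));
 *	nents = blk_rq_map_sg(rq->q, rq, sgl);
 *	nents = dma_map_sg(dev, sgl, nents, rq_dma_dir(rq));
 */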
void blk_dump_rq_flags(struct request *, char *);

#ifdef CONFIG_BLK_DEV_ZONED
static inline unsigned int blk_rq_zone_no(struct request *rq)
{
	return disk_zone_no(rq->q->disk, blk_rq_pos(rq));
}

static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
{
	return disk_zone_is_seq(rq->q->disk, blk_rq_pos(rq));
}

bool blk_req_needs_zone_write_lock(struct request *rq);
bool blk_req_zone_write_trylock(struct request *rq);
void __blk_req_zone_write_lock(struct request *rq);
void __blk_req_zone_write_unlock(struct request *rq);

static inline void blk_req_zone_write_lock(struct request *rq)
{
	if (blk_req_needs_zone_write_lock(rq))
		__blk_req_zone_write_lock(rq);
}

static inline void blk_req_zone_write_unlock(struct request *rq)
{
	if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
		__blk_req_zone_write_unlock(rq);
}

static inline bool blk_req_zone_is_write_locked(struct request *rq)
{
	return rq->q->disk->seq_zones_wlock &&
		test_bit(blk_rq_zone_no(rq), rq->q->disk->seq_zones_wlock);
}

static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
{
	if (!blk_req_needs_zone_write_lock(rq))
		return true;
	return !blk_req_zone_is_write_locked(rq);
}
#else /* CONFIG_BLK_DEV_ZONED */
static inline bool blk_req_needs_zone_write_lock(struct request *rq)
{
	return false;
}

static inline void blk_req_zone_write_lock(struct request *rq)
{
}

static inline void blk_req_zone_write_unlock(struct request *rq)
{
}

static inline bool blk_req_zone_is_write_locked(struct request *rq)
{
	return false;
}

static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
{
	return true;
}
#endif /* CONFIG_BLK_DEV_ZONED */

#endif /* BLK_MQ_H */