/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_MQ_H
#define BLK_MQ_H

#include <linux/blkdev.h>
#include <linux/sbitmap.h>
#include <linux/srcu.h>
#include <linux/lockdep.h>
#include <linux/scatterlist.h>
#include <linux/prefetch.h>

struct blk_mq_tags;
struct blk_flush_queue;

#define BLKDEV_MIN_RQ		4
#define BLKDEV_DEFAULT_RQ	128

typedef void (rq_end_io_fn)(struct request *, blk_status_t);

/*
 * request flags
 */
typedef __u32 __bitwise req_flags_t;

/* drive already may have started this one */
#define RQF_STARTED		((__force req_flags_t)(1 << 1))
/* may not be passed by ioscheduler */
#define RQF_SOFTBARRIER		((__force req_flags_t)(1 << 3))
/* request for flush sequence */
#define RQF_FLUSH_SEQ		((__force req_flags_t)(1 << 4))
/* merge of different types, fail separately */
#define RQF_MIXED_MERGE		((__force req_flags_t)(1 << 5))
/* track inflight for MQ */
#define RQF_MQ_INFLIGHT		((__force req_flags_t)(1 << 6))
/* don't call prep for this one */
#define RQF_DONTPREP		((__force req_flags_t)(1 << 7))
/* vaguely specified driver internal error. Ignored by the block layer */
#define RQF_FAILED		((__force req_flags_t)(1 << 10))
/* don't warn about errors */
#define RQF_QUIET		((__force req_flags_t)(1 << 11))
/* elevator private data attached */
#define RQF_ELVPRIV		((__force req_flags_t)(1 << 12))
/* account into disk and partition IO statistics */
#define RQF_IO_STAT		((__force req_flags_t)(1 << 13))
/* runtime pm request */
#define RQF_PM			((__force req_flags_t)(1 << 15))
/* on IO scheduler merge hash */
#define RQF_HASHED		((__force req_flags_t)(1 << 16))
/* track IO completion time */
#define RQF_STATS		((__force req_flags_t)(1 << 17))
/* Look at ->special_vec for the actual data payload instead of the
   bio chain. */
#define RQF_SPECIAL_PAYLOAD	((__force req_flags_t)(1 << 18))
/* The per-zone write lock is held for this request */
#define RQF_ZONE_WRITE_LOCKED	((__force req_flags_t)(1 << 19))
/* already slept for hybrid poll */
#define RQF_MQ_POLL_SLEPT	((__force req_flags_t)(1 << 20))
/* ->timeout has been called, don't expire again */
#define RQF_TIMED_OUT		((__force req_flags_t)(1 << 21))
/* queue has elevator attached */
#define RQF_ELV			((__force req_flags_t)(1 << 22))

/* flags that prevent us from merging requests: */
#define RQF_NOMERGE_FLAGS \
	(RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)

enum mq_rq_state {
	MQ_RQ_IDLE		= 0,
	MQ_RQ_IN_FLIGHT		= 1,
	MQ_RQ_COMPLETE		= 2,
};

/*
 * Try to put the fields that are referenced together in the same cacheline.
 *
 * If you modify this structure, make sure to update blk_rq_init() and
 * especially blk_mq_rq_ctx_init() to take care of the added fields.
 */
struct request {
	struct request_queue *q;
	struct blk_mq_ctx *mq_ctx;
	struct blk_mq_hw_ctx *mq_hctx;

	unsigned int cmd_flags;		/* op and common flags */
	req_flags_t rq_flags;

	int tag;
	int internal_tag;

	unsigned int timeout;

	/* the following two fields are internal, NEVER access directly */
	unsigned int __data_len;	/* total data len */
	sector_t __sector;		/* sector cursor */

	struct bio *bio;
	struct bio *biotail;

	union {
		struct list_head queuelist;
		struct request *rq_next;
	};

	struct gendisk *rq_disk;
	struct block_device *part;
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
	/* Time that the first bio started allocating this request. */
	u64 alloc_time_ns;
#endif
	/* Time that this request was allocated for this IO. */
	u64 start_time_ns;
	/* Time that I/O was submitted to the device. */
	u64 io_start_time_ns;

#ifdef CONFIG_BLK_WBT
	unsigned short wbt_flags;
#endif
	/*
	 * rq sectors used for blk stats. It has the same value as
	 * blk_rq_sectors(rq), except that it is never zeroed by completion.
	 */
	unsigned short stats_sectors;

	/*
	 * Number of scatter-gather DMA addr+len pairs after
	 * physical address coalescing is performed.
	 */
	unsigned short nr_phys_segments;

#ifdef CONFIG_BLK_DEV_INTEGRITY
	unsigned short nr_integrity_segments;
#endif

#ifdef CONFIG_BLK_INLINE_ENCRYPTION
	struct bio_crypt_ctx *crypt_ctx;
	struct blk_crypto_keyslot *crypt_keyslot;
#endif

	unsigned short write_hint;
	unsigned short ioprio;

	enum mq_rq_state state;
	refcount_t ref;

	unsigned long deadline;

	/*
	 * The hash is used inside the scheduler, and killed once the
	 * request reaches the dispatch list. The ipi_list is only used
	 * to queue the request for softirq completion, which is long
	 * after the request has been unhashed (and even removed from
	 * the dispatch list).
	 */
	union {
		struct hlist_node hash;	/* merge hash */
		struct llist_node ipi_list;
	};

	/*
	 * The rb_node is only used inside the io scheduler, requests
	 * are pruned when moved to the dispatch queue. So let the
	 * completion_data share space with the rb_node.
	 */
	union {
		struct rb_node rb_node;	/* sort/lookup */
		struct bio_vec special_vec;
		void *completion_data;
		int error_count;	/* for legacy drivers, don't use */
	};

	/*
	 * Three pointers are available for the IO schedulers, if they need
	 * more they have to dynamically allocate it. Flush requests are
	 * never put on the IO scheduler. So let the flush fields share
	 * space with the elevator data.
	 */
	union {
		struct {
			struct io_cq		*icq;
			void			*priv[2];
		} elv;

		struct {
			unsigned int		seq;
			struct list_head	list;
			rq_end_io_fn		*saved_end_io;
		} flush;
	};

	union {
		struct __call_single_data csd;
		u64 fifo_time;
	};

	/*
	 * completion callback.
	 */
	rq_end_io_fn *end_io;
	void *end_io_data;
};

#define req_op(req) \
	((req)->cmd_flags & REQ_OP_MASK)

static inline bool blk_rq_is_passthrough(struct request *rq)
{
	return blk_op_is_passthrough(req_op(rq));
}

static inline unsigned short req_get_ioprio(struct request *req)
{
	return req->ioprio;
}

#define rq_data_dir(rq)		(op_is_write(req_op(rq)) ? WRITE : READ)

#define rq_dma_dir(rq) \
	(op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)
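
/*
 * Illustrative sketch (editor's example, not part of the kernel API): a
 * hypothetical driver's ->queue_rq() often branches on req_op() and uses
 * rq_data_dir()/rq_dma_dir() to program the transfer direction. All "foo_*"
 * names below are made up.
 *
 *	switch (req_op(rq)) {
 *	case REQ_OP_READ:
 *	case REQ_OP_WRITE:
 *		foo_setup_dma(dev, rq, rq_dma_dir(rq));
 *		break;
 *	case REQ_OP_FLUSH:
 *		foo_issue_flush(dev);
 *		break;
 *	default:
 *		return BLK_STS_NOTSUPP;
 *	}
 */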

enum blk_eh_timer_return {
	BLK_EH_DONE,		/* driver has completed the command */
	BLK_EH_RESET_TIMER,	/* reset timer and try again */
};

#define BLK_TAG_ALLOC_FIFO	0 /* allocate starting from 0 */
#define BLK_TAG_ALLOC_RR	1 /* allocate starting from last allocated tag */

/**
 * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
 * block device
 */
struct blk_mq_hw_ctx {
	struct {
		/** @lock: Protects the dispatch list. */
		spinlock_t		lock;
		/**
		 * @dispatch: Used for requests that are ready to be
		 * dispatched to the hardware but for some reason (e.g. lack of
		 * resources) could not be sent to the hardware. As soon as the
		 * driver can send new requests, requests on this list will
		 * be sent first for a fairer dispatch.
		 */
		struct list_head	dispatch;
		/**
		 * @state: BLK_MQ_S_* flags. Defines the state of the hw
		 * queue (active, scheduled to restart, stopped).
		 */
		unsigned long		state;
	} ____cacheline_aligned_in_smp;

	/**
	 * @run_work: Used for scheduling a hardware queue run at a later time.
	 */
	struct delayed_work	run_work;
	/** @cpumask: Map of available CPUs where this hctx can run. */
	cpumask_var_t		cpumask;
	/**
	 * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU
	 * selection from @cpumask.
	 */
	int			next_cpu;
	/**
	 * @next_cpu_batch: Counter of how many works left in the batch before
	 * changing to the next CPU.
	 */
	int			next_cpu_batch;

	/** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */
	unsigned long		flags;

	/**
	 * @sched_data: Pointer owned by the IO scheduler attached to a request
	 * queue. It's up to the IO scheduler how to use this pointer.
	 */
	void			*sched_data;
	/**
	 * @queue: Pointer to the request queue that owns this hardware
	 * context.
	 */
	struct request_queue	*queue;
	/** @fq: Queue of requests that need to perform a flush operation. */
	struct blk_flush_queue	*fq;

	/**
	 * @driver_data: Pointer to data owned by the block driver that created
	 * this hctx.
	 */
	void			*driver_data;

	/**
	 * @ctx_map: Bitmap for each software queue. If bit is on, there is a
	 * pending request in that software queue.
	 */
	struct sbitmap		ctx_map;

	/**
	 * @dispatch_from: Software queue to be used when no scheduler was
	 * selected.
	 */
	struct blk_mq_ctx	*dispatch_from;
	/**
	 * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to
	 * decide if the hw_queue is busy using Exponential Weighted Moving
	 * Average algorithm.
	 */
	unsigned int		dispatch_busy;

	/** @type: HCTX_TYPE_* flags. Type of hardware queue. */
	unsigned short		type;
	/** @nr_ctx: Number of software queues. */
	unsigned short		nr_ctx;
	/** @ctxs: Array of software queues. */
	struct blk_mq_ctx	**ctxs;

	/** @dispatch_wait_lock: Lock for dispatch_wait queue. */
	spinlock_t		dispatch_wait_lock;
	/**
	 * @dispatch_wait: Waitqueue to put requests when there is no tag
	 * available at the moment, to wait for another try in the future.
	 */
	wait_queue_entry_t	dispatch_wait;

	/**
	 * @wait_index: Index of next available dispatch_wait queue to insert
	 * requests.
	 */
	atomic_t		wait_index;

	/**
	 * @tags: Tags owned by the block driver. A tag in this set is only
	 * assigned when a request is dispatched from a hardware queue.
	 */
	struct blk_mq_tags	*tags;
	/**
	 * @sched_tags: Tags owned by I/O scheduler. If there is an I/O
	 * scheduler associated with a request queue, a tag is assigned when
	 * that request is allocated. Else, this member is not used.
	 */
	struct blk_mq_tags	*sched_tags;

	/** @queued: Number of queued requests. */
	unsigned long		queued;
	/** @run: Number of dispatched requests. */
	unsigned long		run;

	/** @numa_node: NUMA node the storage adapter has been connected to. */
	unsigned int		numa_node;
	/** @queue_num: Index of this hardware queue. */
	unsigned int		queue_num;

	/**
	 * @nr_active: Number of active requests. Only used when a tag set is
	 * shared across request queues.
	 */
	atomic_t		nr_active;

	/** @cpuhp_online: List to store request if CPU is going to die. */
	struct hlist_node	cpuhp_online;
	/** @cpuhp_dead: List to store request if some CPU dies. */
	struct hlist_node	cpuhp_dead;
	/** @kobj: Kernel object for sysfs. */
	struct kobject		kobj;

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @debugfs_dir: debugfs directory for this hardware queue. Named
	 * as cpu<cpu_number>.
	 */
	struct dentry		*debugfs_dir;
	/** @sched_debugfs_dir: debugfs directory for the scheduler. */
	struct dentry		*sched_debugfs_dir;
#endif

	/**
	 * @hctx_list: If this hctx is not in use, this is an entry in
	 * q->unused_hctx_list.
	 */
	struct list_head	hctx_list;

	/**
	 * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is
	 * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
	 * blk_mq_hw_ctx_size().
	 */
	struct srcu_struct	srcu[];
};

/**
 * struct blk_mq_queue_map - Map software queues to hardware queues
 * @mq_map:       CPU ID to hardware queue index map. This is an array
 *	with nr_cpu_ids elements. Each element has a value in the range
 *	[@queue_offset, @queue_offset + @nr_queues).
 * @nr_queues:    Number of hardware queues to map CPU IDs onto.
 * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe
 *	driver to map each hardware queue type (enum hctx_type) onto a distinct
 *	set of hardware queues.
 */
struct blk_mq_queue_map {
	unsigned int *mq_map;
	unsigned int nr_queues;
	unsigned int queue_offset;
};

/**
 * enum hctx_type - Type of hardware queue
 * @HCTX_TYPE_DEFAULT:	All I/O not otherwise accounted for.
 * @HCTX_TYPE_READ:	Just for READ I/O.
 * @HCTX_TYPE_POLL:	Polled I/O of any kind.
 * @HCTX_MAX_TYPES:	Number of types of hctx.
 */
enum hctx_type {
	HCTX_TYPE_DEFAULT,
	HCTX_TYPE_READ,
	HCTX_TYPE_POLL,

	HCTX_MAX_TYPES,
};
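
/*
 * Illustrative sketch (editor's example, not part of this header): a driver
 * that only uses HCTX_TYPE_DEFAULT queues can let the block layer build the
 * CPU-to-queue map in its ->map_queues() callback. "foo_map_queues" is a
 * made-up name.
 *
 *	static int foo_map_queues(struct blk_mq_tag_set *set)
 *	{
 *		return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
 *	}
 */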

/**
 * struct blk_mq_tag_set - tag set that can be shared between request queues
 * @map:	   One or more ctx -> hctx mappings. One map exists for each
 *		   hardware queue type (enum hctx_type) that the driver wishes
 *		   to support. There are no restrictions on maps being of the
 *		   same size, and it's perfectly legal to share maps between
 *		   types.
 * @nr_maps:	   Number of elements in the @map array. A number in the range
 *		   [1, HCTX_MAX_TYPES].
 * @ops:	   Pointers to functions that implement block driver behavior.
 * @nr_hw_queues:  Number of hardware queues supported by the block driver that
 *		   owns this data structure.
 * @queue_depth:   Number of tags per hardware queue, reserved tags included.
 * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag
 *		   allocations.
 * @cmd_size:	   Number of additional bytes to allocate per request. The block
 *		   driver owns these additional bytes.
 * @numa_node:	   NUMA node the storage adapter has been connected to.
 * @timeout:	   Request processing timeout in jiffies.
 * @flags:	   Zero or more BLK_MQ_F_* flags.
 * @driver_data:   Pointer to data owned by the block driver that created this
 *		   tag set.
 * @tags:	   Tag sets. One tag set per hardware queue. Has @nr_hw_queues
 *		   elements.
 * @shared_tags:   Shared set of tags. Has @nr_hw_queues elements. If set,
 *		   shared by all @tags.
 * @tag_list_lock: Serializes tag_list accesses.
 * @tag_list:	   List of the request queues that use this tag set. See also
 *		   request_queue.tag_set_list.
 */
struct blk_mq_tag_set {
	struct blk_mq_queue_map	map[HCTX_MAX_TYPES];
	unsigned int		nr_maps;
	const struct blk_mq_ops	*ops;
	unsigned int		nr_hw_queues;
	unsigned int		queue_depth;
	unsigned int		reserved_tags;
	unsigned int		cmd_size;
	int			numa_node;
	unsigned int		timeout;
	unsigned int		flags;
	void			*driver_data;

	struct blk_mq_tags	**tags;

	struct blk_mq_tags	*shared_tags;

	struct mutex		tag_list_lock;
	struct list_head	tag_list;
};

/**
 * struct blk_mq_queue_data - Data about a request inserted in a queue
 *
 * @rq:   Request pointer.
 * @last: If it is the last request in the queue.
 */
struct blk_mq_queue_data {
	struct request *rq;
	bool last;
};

typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
		bool);
typedef bool (busy_tag_iter_fn)(struct request *, void *, bool);

/**
 * struct blk_mq_ops - Callback functions that implement block driver
 * behaviour.
 */
struct blk_mq_ops {
	/**
	 * @queue_rq: Queue a new request from block IO.
	 */
	blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *,
				 const struct blk_mq_queue_data *);

	/**
	 * @commit_rqs: If a driver uses bd->last to judge when to submit
	 * requests to hardware, it must define this function. In case of errors
	 * that make us stop issuing further requests, this hook serves the
	 * purpose of kicking the hardware (which the last request otherwise
	 * would have done).
	 */
	void (*commit_rqs)(struct blk_mq_hw_ctx *);

	/**
	 * @get_budget: Reserve a budget before queueing a request. Once
	 * .queue_rq is run, it is the driver's responsibility to release the
	 * reserved budget. The failure case of .get_budget also has to be
	 * handled to avoid I/O deadlocks.
	 */
	int (*get_budget)(struct request_queue *);

	/**
	 * @put_budget: Release the reserved budget.
	 */
	void (*put_budget)(struct request_queue *, int);

	/**
	 * @set_rq_budget_token: Store rq's budget token.
	 */
	void (*set_rq_budget_token)(struct request *, int);
	/**
	 * @get_rq_budget_token: Retrieve rq's budget token.
	 */
	int (*get_rq_budget_token)(struct request *);

	/**
	 * @timeout: Called on request timeout.
	 */
	enum blk_eh_timer_return (*timeout)(struct request *, bool);

	/**
	 * @poll: Called to poll for completion of a specific tag.
	 */
	int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *);

	/**
	 * @complete: Mark the request as complete.
	 */
	void (*complete)(struct request *);

	/**
	 * @init_hctx: Called when the block layer side of a hardware queue has
	 * been set up, allowing the driver to allocate/init matching
	 * structures.
	 */
	int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int);
	/**
	 * @exit_hctx: Ditto for exit/teardown.
	 */
	void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);

	/**
	 * @init_request: Called for every command allocated by the block layer
	 * to allow the driver to set up driver specific data.
	 *
	 * Tags greater than or equal to queue_depth are used for setting up
	 * flush requests.
	 */
	int (*init_request)(struct blk_mq_tag_set *set, struct request *,
			    unsigned int, unsigned int);
	/**
	 * @exit_request: Ditto for exit/teardown.
	 */
	void (*exit_request)(struct blk_mq_tag_set *set, struct request *,
			     unsigned int);

	/**
	 * @cleanup_rq: Called before freeing one request which isn't completed
	 * yet, and usually for freeing the driver private data.
	 */
	void (*cleanup_rq)(struct request *);

	/**
	 * @busy: If set, returns whether or not this queue currently is busy.
	 */
	bool (*busy)(struct request_queue *);

	/**
	 * @map_queues: This allows drivers to specify their own queue mapping
	 * by overriding the setup-time function that builds the mq_map.
	 */
	int (*map_queues)(struct blk_mq_tag_set *set);

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @show_rq: Used by the debugfs implementation to show driver-specific
	 * information about a request.
	 */
	void (*show_rq)(struct seq_file *m, struct request *rq);
#endif
};
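
/*
 * Illustrative sketch (editor's example, not part of this header): the
 * smallest useful blk_mq_ops only needs ->queue_rq(). The "foo_*" names are
 * made up; a real driver would program its hardware and complete the request
 * later from its completion path.
 *
 *	static blk_status_t foo_queue_rq(struct blk_mq_hw_ctx *hctx,
 *					 const struct blk_mq_queue_data *bd)
 *	{
 *		struct request *rq = bd->rq;
 *
 *		blk_mq_start_request(rq);
 *		foo_issue(hctx->driver_data, rq);
 *		return BLK_STS_OK;
 *	}
 *
 *	static const struct blk_mq_ops foo_mq_ops = {
 *		.queue_rq	= foo_queue_rq,
 *	};
 */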

enum {
	BLK_MQ_F_SHOULD_MERGE	= 1 << 0,
	BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1,
	/*
	 * Set when this device requires underlying blk-mq device for
	 * completing IO:
	 */
	BLK_MQ_F_STACKING	= 1 << 2,
	BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
	BLK_MQ_F_BLOCKING	= 1 << 5,
	/* Do not allow an I/O scheduler to be configured. */
	BLK_MQ_F_NO_SCHED	= 1 << 6,
	/*
	 * Select 'none' during queue registration in case of a single hwq
	 * or shared hwqs instead of 'mq-deadline'.
	 */
	BLK_MQ_F_NO_SCHED_BY_DEFAULT	= 1 << 7,
	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
	BLK_MQ_F_ALLOC_POLICY_BITS = 1,

	BLK_MQ_S_STOPPED	= 0,
	BLK_MQ_S_TAG_ACTIVE	= 1,
	BLK_MQ_S_SCHED_RESTART	= 2,

	/* hw queue is inactive after all its CPUs become offline */
	BLK_MQ_S_INACTIVE	= 3,

	BLK_MQ_MAX_DEPTH	= 10240,

	BLK_MQ_CPU_WORK_BATCH	= 8,
};
#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \
	((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \
		((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1))
#define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \
	((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
		<< BLK_MQ_F_ALLOC_POLICY_START_BIT)

#define BLK_MQ_NO_HCTX_IDX	(-1U)

struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
		struct lock_class_key *lkclass);
#define blk_mq_alloc_disk(set, queuedata)			\
({								\
	static struct lock_class_key __key;			\
								\
	__blk_mq_alloc_disk(set, queuedata, &__key);		\
})
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
		struct request_queue *q);
void blk_mq_unregister_dev(struct device *, struct request_queue *);

int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
		const struct blk_mq_ops *ops, unsigned int queue_depth,
		unsigned int set_flags);
void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
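
/*
 * Illustrative sketch (editor's example, not part of this header): typical
 * probe-time setup of a tag set followed by disk allocation. The "foo_*"
 * names are made up and most error handling is omitted.
 *
 *	memset(&foo->tag_set, 0, sizeof(foo->tag_set));
 *	foo->tag_set.ops = &foo_mq_ops;
 *	foo->tag_set.nr_hw_queues = 1;
 *	foo->tag_set.queue_depth = 64;
 *	foo->tag_set.numa_node = NUMA_NO_NODE;
 *	foo->tag_set.cmd_size = sizeof(struct foo_cmd);
 *	foo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
 *
 *	ret = blk_mq_alloc_tag_set(&foo->tag_set);
 *	if (ret)
 *		return ret;
 *
 *	disk = blk_mq_alloc_disk(&foo->tag_set, foo);
 *	if (IS_ERR(disk)) {
 *		blk_mq_free_tag_set(&foo->tag_set);
 *		return PTR_ERR(disk);
 *	}
 */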

void blk_mq_free_request(struct request *rq);

bool blk_mq_queue_inflight(struct request_queue *q);

enum {
	/* return when out of requests */
	BLK_MQ_REQ_NOWAIT	= (__force blk_mq_req_flags_t)(1 << 0),
	/* allocate from reserved pool */
	BLK_MQ_REQ_RESERVED	= (__force blk_mq_req_flags_t)(1 << 1),
	/* set RQF_PM */
	BLK_MQ_REQ_PM		= (__force blk_mq_req_flags_t)(1 << 2),
};

struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
		blk_mq_req_flags_t flags);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
		unsigned int op, blk_mq_req_flags_t flags,
		unsigned int hctx_idx);

/*
 * Tag address space map.
 */
struct blk_mq_tags {
	unsigned int nr_tags;
	unsigned int nr_reserved_tags;

	atomic_t active_queues;

	struct sbitmap_queue bitmap_tags;
	struct sbitmap_queue breserved_tags;

	struct request **rqs;
	struct request **static_rqs;
	struct list_head page_list;

	/*
	 * used to clear request reference in rqs[] before freeing one
	 * request pool
	 */
	spinlock_t lock;
};

static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags,
					       unsigned int tag)
{
	if (tag < tags->nr_tags) {
		prefetch(tags->rqs[tag]);
		return tags->rqs[tag];
	}

	return NULL;
}

enum {
	BLK_MQ_UNIQUE_TAG_BITS = 16,
	BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1,
};

u32 blk_mq_unique_tag(struct request *rq);

static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag)
{
	return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS;
}

static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
{
	return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
}

/**
 * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
 * @rq: target request.
 */
static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
{
	return READ_ONCE(rq->state);
}

static inline int blk_mq_request_started(struct request *rq)
{
	return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
}

static inline int blk_mq_request_completed(struct request *rq)
{
	return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
}

/*
 * Set the state to complete when completing a request from inside ->queue_rq.
 * This is used by drivers that want to ensure special complete actions that
 * need access to the request are called on failure, e.g. by nvme for
 * multipathing.
 */
static inline void blk_mq_set_request_complete(struct request *rq)
{
	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
}

void blk_mq_start_request(struct request *rq);
void blk_mq_end_request(struct request *rq, blk_status_t error);
void __blk_mq_end_request(struct request *rq, blk_status_t error);
void blk_mq_end_request_batch(struct io_comp_batch *ib);

/*
 * Only need start/end time stamping if we have iostat or
 * blk stats enabled, or using an IO scheduler.
 */
static inline bool blk_mq_need_time_stamp(struct request *rq)
{
	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_ELV));
}

/*
 * Batched completions only work when there is no I/O error and no special
 * ->end_io handler.
 */
static inline bool blk_mq_add_to_batch(struct request *req,
				       struct io_comp_batch *iob, int ioerror,
				       void (*complete)(struct io_comp_batch *))
{
	if (!iob || (req->rq_flags & RQF_ELV) || req->end_io || ioerror)
		return false;
	if (!iob->complete)
		iob->complete = complete;
	else if (iob->complete != complete)
		return false;
	iob->need_ts |= blk_mq_need_time_stamp(req);
	rq_list_add(&iob->req_list, req);
	return true;
}
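
/*
 * Illustrative sketch (editor's example, not part of this header): a driver's
 * ->poll() callback can batch completions instead of completing each request
 * individually. "foo_*" names are made up; "err" is the per-command error.
 *
 *	while (foo_next_completion(dev, &rq, &err)) {
 *		if (!blk_mq_add_to_batch(rq, iob, err, foo_complete_batch))
 *			blk_mq_complete_request(rq);
 *		found++;
 *	}
 *	return found;
 *
 * The block layer later invokes foo_complete_batch(), which would typically
 * call blk_mq_end_request_batch() on the io_comp_batch.
 */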

void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
void blk_mq_complete_request(struct request *rq);
bool blk_mq_complete_request_remote(struct request *rq);
bool blk_mq_queue_stopped(struct request_queue *q);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_stop_hw_queues(struct request_queue *q);
void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_quiesce_queue(struct request_queue *q);
void blk_mq_wait_quiesce_done(struct request_queue *q);
void blk_mq_unquiesce_queue(struct request_queue *q);
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs);
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
		busy_tag_iter_fn *fn, void *priv);
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset);
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_unfreeze_queue(struct request_queue *q);
void blk_freeze_queue_start(struct request_queue *q);
void blk_mq_freeze_queue_wait(struct request_queue *q);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
				     unsigned long timeout);

int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);

void blk_mq_quiesce_queue_nowait(struct request_queue *q);

unsigned int blk_mq_rq_cpu(struct request *rq);

bool __blk_should_fake_timeout(struct request_queue *q);
static inline bool blk_should_fake_timeout(struct request_queue *q)
{
	if (IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT) &&
	    test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
		return __blk_should_fake_timeout(q);
	return false;
}

/**
 * blk_mq_rq_from_pdu - cast a PDU to a request
 * @pdu: the PDU (Protocol Data Unit) to be cast
 *
 * Return: request
 *
 * Driver command data is immediately after the request. So subtract request
 * size to get back to the original request.
 */
static inline struct request *blk_mq_rq_from_pdu(void *pdu)
{
	return pdu - sizeof(struct request);
}

/**
 * blk_mq_rq_to_pdu - cast a request to a PDU
 * @rq: the request to be cast
 *
 * Return: pointer to the PDU
 *
 * Driver command data is immediately after the request. So add request to get
 * the PDU.
 */
static inline void *blk_mq_rq_to_pdu(struct request *rq)
{
	return rq + 1;
}
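
/*
 * Illustrative sketch (editor's example, not part of this header): a driver
 * that sets blk_mq_tag_set.cmd_size to sizeof(struct foo_cmd) gets its
 * per-command data placed right after each request, and converts from the
 * request to its data ("foo_cmd" is a made-up type) with:
 *
 *	struct foo_cmd *cmd = blk_mq_rq_to_pdu(rq);
 *
 * and back again, with blk_mq_rq_from_pdu(cmd) returning the original rq.
 */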

#define queue_for_each_hw_ctx(q, hctx, i)				\
	for ((i) = 0; (i) < (q)->nr_hw_queues &&			\
	     ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)

#define hctx_for_each_ctx(hctx, ctx, i)					\
	for ((i) = 0; (i) < (hctx)->nr_ctx &&				\
	     ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)

static inline void blk_mq_cleanup_rq(struct request *rq)
{
	if (rq->q->mq_ops->cleanup_rq)
		rq->q->mq_ops->cleanup_rq(rq);
}

static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
		unsigned int nr_segs)
{
	rq->nr_phys_segments = nr_segs;
	rq->__data_len = bio->bi_iter.bi_size;
	rq->bio = rq->biotail = bio;
	rq->ioprio = bio_prio(bio);

	if (bio->bi_bdev)
		rq->rq_disk = bio->bi_bdev->bd_disk;
}

void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
		struct lock_class_key *key);

static inline bool rq_is_sync(struct request *rq)
{
	return op_is_sync(rq->cmd_flags);
}

void blk_rq_init(struct request_queue *q, struct request *rq);
int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
		struct bio_set *bs, gfp_t gfp_mask,
		int (*bio_ctr)(struct bio *, struct bio *, void *), void *data);
void blk_rq_unprep_clone(struct request *rq);
blk_status_t blk_insert_cloned_request(struct request_queue *q,
		struct request *rq);

struct rq_map_data {
	struct page **pages;
	int page_order;
	int nr_entries;
	unsigned long offset;
	int null_mapped;
	int from_user;
};

int blk_rq_map_user(struct request_queue *, struct request *,
		struct rq_map_data *, void __user *, unsigned long, gfp_t);
int blk_rq_map_user_iov(struct request_queue *, struct request *,
		struct rq_map_data *, const struct iov_iter *, gfp_t);
int blk_rq_unmap_user(struct bio *);
int blk_rq_map_kern(struct request_queue *, struct request *, void *,
		unsigned int, gfp_t);
int blk_rq_append_bio(struct request *rq, struct bio *bio);
void blk_execute_rq_nowait(struct gendisk *, struct request *, int,
		rq_end_io_fn *);
blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq,
		int at_head);

struct req_iterator {
	struct bvec_iter iter;
	struct bio *bio;
};

#define __rq_for_each_bio(_bio, rq)	\
	if ((rq->bio))			\
		for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)

#define rq_for_each_segment(bvl, _rq, _iter)			\
	__rq_for_each_bio(_iter.bio, _rq)			\
		bio_for_each_segment(bvl, _iter.bio, _iter.iter)

#define rq_for_each_bvec(bvl, _rq, _iter)			\
	__rq_for_each_bio(_iter.bio, _rq)			\
		bio_for_each_bvec(bvl, _iter.bio, _iter.iter)

#define rq_iter_last(bvec, _iter)				\
		(_iter.bio->bi_next == NULL &&			\
		 bio_iter_last(bvec, _iter.iter))
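
/*
 * Illustrative sketch (editor's example, not part of this header): walking
 * all data segments of a request, here just summing their lengths:
 *
 *	struct req_iterator iter;
 *	struct bio_vec bvec;
 *	unsigned int bytes = 0;
 *
 *	rq_for_each_segment(bvec, rq, iter)
 *		bytes += bvec.bv_len;
 */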

/*
 * blk_rq_pos()			: the current sector
 * blk_rq_bytes()		: bytes left in the entire request
 * blk_rq_cur_bytes()		: bytes left in the current segment
 * blk_rq_err_bytes()		: bytes left till the next error boundary
 * blk_rq_sectors()		: sectors left in the entire request
 * blk_rq_cur_sectors()		: sectors left in the current segment
 * blk_rq_stats_sectors()	: sectors of the entire request used for stats
 */
static inline sector_t blk_rq_pos(const struct request *rq)
{
	return rq->__sector;
}

static inline unsigned int blk_rq_bytes(const struct request *rq)
{
	return rq->__data_len;
}

static inline int blk_rq_cur_bytes(const struct request *rq)
{
	if (!rq->bio)
		return 0;
	if (!bio_has_data(rq->bio))	/* dataless requests such as discard */
		return rq->bio->bi_iter.bi_size;
	return bio_iovec(rq->bio).bv_len;
}

unsigned int blk_rq_err_bytes(const struct request *rq);

static inline unsigned int blk_rq_sectors(const struct request *rq)
{
	return blk_rq_bytes(rq) >> SECTOR_SHIFT;
}

static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
{
	return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
}

static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
{
	return rq->stats_sectors;
}

/*
 * Some commands like WRITE SAME have a payload or data transfer size which
 * is different from the size of the request. Any driver that supports such
 * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to
 * calculate the data transfer size.
 */
static inline unsigned int blk_rq_payload_bytes(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return rq->special_vec.bv_len;
	return blk_rq_bytes(rq);
}

/*
 * Return the first full biovec in the request. The caller needs to check
 * that there are any bvecs before calling this helper.
 */
static inline struct bio_vec req_bvec(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return rq->special_vec;
	return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);
}

static inline unsigned int blk_rq_count_bios(struct request *rq)
{
	unsigned int nr_bios = 0;
	struct bio *bio;

	__rq_for_each_bio(bio, rq)
		nr_bios++;

	return nr_bios;
}

void blk_steal_bios(struct bio_list *list, struct request *rq);

/*
 * Request completion related functions.
 *
 * blk_update_request() completes given number of bytes and updates
 * the request without completing it.
 */
bool blk_update_request(struct request *rq, blk_status_t error,
		unsigned int nr_bytes);
void blk_abort_request(struct request *);

/*
 * Number of physical segments as sent to the device.
 *
 * Normally this is the number of discontiguous data segments sent by the
 * submitter. But for data-less commands like discard we might have no
 * actual data segments submitted, but the driver might have to add its
 * own special payload. In that case we still return 1 here so that this
 * special payload will be mapped.
 */
static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return 1;
	return rq->nr_phys_segments;
}

/*
 * Number of discard segments (or ranges) the driver needs to fill in.
 * Each discard bio merged into a request is counted as one segment.
 */
static inline unsigned short blk_rq_nr_discard_segments(struct request *rq)
{
	return max_t(unsigned short, rq->nr_phys_segments, 1);
}

int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
		struct scatterlist *sglist, struct scatterlist **last_sg);
static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
		struct scatterlist *sglist)
{
	struct scatterlist *last_sg = NULL;

	return __blk_rq_map_sg(q, rq, sglist, &last_sg);
}
void blk_dump_rq_flags(struct request *, char *);
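
/*
 * Illustrative sketch (editor's example, not part of this header): mapping a
 * request onto a driver-provided scatterlist and then onto the device, using
 * rq_dma_dir() for the DMA direction. "foo" names are made up and error
 * handling is omitted.
 *
 *	int nr_segs = blk_rq_map_sg(rq->q, rq, foo->sglist);
 *	int nr_mapped = dma_map_sg(foo->dma_dev, foo->sglist, nr_segs,
 *				   rq_dma_dir(rq));
 */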

#ifdef CONFIG_BLK_DEV_ZONED
static inline unsigned int blk_rq_zone_no(struct request *rq)
{
	return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
}

static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
{
	return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
}

bool blk_req_needs_zone_write_lock(struct request *rq);
bool blk_req_zone_write_trylock(struct request *rq);
void __blk_req_zone_write_lock(struct request *rq);
void __blk_req_zone_write_unlock(struct request *rq);

static inline void blk_req_zone_write_lock(struct request *rq)
{
	if (blk_req_needs_zone_write_lock(rq))
		__blk_req_zone_write_lock(rq);
}

static inline void blk_req_zone_write_unlock(struct request *rq)
{
	if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
		__blk_req_zone_write_unlock(rq);
}

static inline bool blk_req_zone_is_write_locked(struct request *rq)
{
	return rq->q->seq_zones_wlock &&
		test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock);
}

static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
{
	if (!blk_req_needs_zone_write_lock(rq))
		return true;
	return !blk_req_zone_is_write_locked(rq);
}
#else /* CONFIG_BLK_DEV_ZONED */
static inline bool blk_req_needs_zone_write_lock(struct request *rq)
{
	return false;
}

static inline void blk_req_zone_write_lock(struct request *rq)
{
}

static inline void blk_req_zone_write_unlock(struct request *rq)
{
}

static inline bool blk_req_zone_is_write_locked(struct request *rq)
{
	return false;
}

static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
{
	return true;
}
#endif /* CONFIG_BLK_DEV_ZONED */

#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
# error	"You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
#endif
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
void rq_flush_dcache_pages(struct request *rq);
#else
static inline void rq_flush_dcache_pages(struct request *rq)
{
}
#endif /* ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE */
#endif /* BLK_MQ_H */