/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_MQ_H
#define BLK_MQ_H

#include <linux/blkdev.h>
#include <linux/sbitmap.h>
#include <linux/srcu.h>
#include <linux/lockdep.h>

struct blk_mq_tags;
struct blk_flush_queue;

/**
 * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
 * block device
 */
struct blk_mq_hw_ctx {
	struct {
		/** @lock: Protects the dispatch list. */
		spinlock_t		lock;
		/**
		 * @dispatch: Used for requests that are ready to be
		 * dispatched to the hardware but for some reason (e.g. lack of
		 * resources) could not be sent to the hardware. As soon as the
		 * driver can send new requests, requests on this list are
		 * sent first, for fairer dispatch.
		 */
		struct list_head	dispatch;
		/**
		 * @state: BLK_MQ_S_* flags. Defines the state of the hw
		 * queue (active, scheduled to restart, stopped).
		 */
		unsigned long		state;
	} ____cacheline_aligned_in_smp;

	/**
	 * @run_work: Used for scheduling a hardware queue run at a later time.
	 */
	struct delayed_work	run_work;
	/** @cpumask: Map of available CPUs where this hctx can run. */
	cpumask_var_t		cpumask;
	/**
	 * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU
	 * selection from @cpumask.
	 */
	int			next_cpu;
	/**
	 * @next_cpu_batch: Counter of how many work items are left in the
	 * batch before switching to the next CPU.
	 */
	int			next_cpu_batch;

	/** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */
	unsigned long		flags;

	/**
	 * @sched_data: Pointer owned by the IO scheduler attached to a request
	 * queue. It's up to the IO scheduler how to use this pointer.
	 */
	void			*sched_data;
	/**
	 * @queue: Pointer to the request queue that owns this hardware context.
	 */
	struct request_queue	*queue;
	/** @fq: Queue of requests that need to perform a flush operation. */
	struct blk_flush_queue	*fq;

	/**
	 * @driver_data: Pointer to data owned by the block driver that created
	 * this hctx.
	 */
	void			*driver_data;

	/**
	 * @ctx_map: Bitmap for each software queue. If bit is on, there is a
	 * pending request in that software queue.
	 */
	struct sbitmap		ctx_map;

	/**
	 * @dispatch_from: Software queue to be used when no scheduler was
	 * selected.
	 */
	struct blk_mq_ctx	*dispatch_from;
	/**
	 * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to
	 * decide if the hw_queue is busy using Exponential Weighted Moving
	 * Average algorithm.
	 */
	unsigned int		dispatch_busy;

	/** @type: HCTX_TYPE_* flags. Type of hardware queue. */
	unsigned short		type;
	/** @nr_ctx: Number of software queues. */
	unsigned short		nr_ctx;
	/** @ctxs: Array of software queues. */
	struct blk_mq_ctx	**ctxs;

	/** @dispatch_wait_lock: Lock for dispatch_wait queue. */
	spinlock_t		dispatch_wait_lock;
	/**
	 * @dispatch_wait: Waitqueue entry added when no tag is available at
	 * the moment, so that dispatch can be retried in the future.
	 */
	wait_queue_entry_t	dispatch_wait;

	/**
	 * @wait_index: Index of next available dispatch_wait queue to insert
	 * requests.
	 */
	atomic_t		wait_index;

	/**
	 * @tags: Tags owned by the block driver. A tag in this set is only
	 * assigned when a request is dispatched from a hardware queue.
	 */
	struct blk_mq_tags	*tags;
	/**
	 * @sched_tags: Tags owned by the I/O scheduler. If there is an I/O
	 * scheduler associated with the request queue, a tag is assigned when
	 * the request is allocated. Otherwise this member is not used.
	 */
	struct blk_mq_tags	*sched_tags;

	/** @queued: Number of queued requests. */
	unsigned long		queued;
	/** @run: Number of dispatched requests. */
	unsigned long		run;
#define BLK_MQ_MAX_DISPATCH_ORDER	7
	/** @dispatched: Number of dispatch requests by queue. */
	unsigned long		dispatched[BLK_MQ_MAX_DISPATCH_ORDER];

	/** @numa_node: NUMA node the storage adapter has been connected to. */
	unsigned int		numa_node;
	/** @queue_num: Index of this hardware queue. */
	unsigned int		queue_num;

	/**
	 * @nr_active: Number of active requests. Only used when a tag set is
	 * shared across request queues.
	 */
	atomic_t		nr_active;
	/**
	 * @elevator_queued: Number of queued requests on this hctx.
	 */
	atomic_t		elevator_queued;

	/** @cpuhp_online: List entry used by CPU hotplug when a CPU is going offline. */
	struct hlist_node	cpuhp_online;
	/** @cpuhp_dead: List entry used by CPU hotplug when a CPU dies. */
	struct hlist_node	cpuhp_dead;
	/** @kobj: Kernel object for sysfs. */
	struct kobject		kobj;

	/** @poll_considered: Number of times blk_poll() was called. */
	unsigned long		poll_considered;
	/** @poll_invoked: Number of requests blk_poll() polled for. */
	unsigned long		poll_invoked;
	/** @poll_success: Number of polled requests that had completed. */
	unsigned long		poll_success;

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @debugfs_dir: debugfs directory for this hardware queue. Named
	 * as cpu<cpu_number>.
	 */
	struct dentry		*debugfs_dir;
	/** @sched_debugfs_dir: debugfs directory for the scheduler. */
	struct dentry		*sched_debugfs_dir;
#endif

	/**
	 * @hctx_list: If this hctx is not in use, this is an entry in
	 * q->unused_hctx_list.
	 */
	struct list_head	hctx_list;

	/**
	 * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is
	 * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
	 * blk_mq_hw_ctx_size().
	 */
	struct srcu_struct	srcu[];
};
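
/*
 * Illustrative sketch (not part of this header): a driver commonly attaches
 * its own per-hardware-queue state to @driver_data from its ->init_hctx()
 * callback (see struct blk_mq_ops below), allocating it on @numa_node so the
 * state sits close to the hardware queue. The "my_hctx_data" type and
 * "my_init_hctx" name are made up for this example.
 *
 *	static int my_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
 *				unsigned int hctx_idx)
 *	{
 *		struct my_hctx_data *data;
 *
 *		data = kzalloc_node(sizeof(*data), GFP_KERNEL, hctx->numa_node);
 *		if (!data)
 *			return -ENOMEM;
 *		data->queue_num = hctx->queue_num;
 *		hctx->driver_data = data;
 *		return 0;
 *	}
 */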

/**
 * struct blk_mq_queue_map - Map software queues to hardware queues
 * @mq_map:       CPU ID to hardware queue index map. This is an array
 *	with nr_cpu_ids elements. Each element has a value in the range
 *	[@queue_offset, @queue_offset + @nr_queues).
 * @nr_queues:    Number of hardware queues to map CPU IDs onto.
 * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe
 *	driver to map each hardware queue type (enum hctx_type) onto a distinct
 *	set of hardware queues.
 */
struct blk_mq_queue_map {
	unsigned int *mq_map;
	unsigned int nr_queues;
	unsigned int queue_offset;
};

/**
 * enum hctx_type - Type of hardware queue
 * @HCTX_TYPE_DEFAULT:	All I/O not otherwise accounted for.
 * @HCTX_TYPE_READ:	Just for READ I/O.
 * @HCTX_TYPE_POLL:	Polled I/O of any kind.
 * @HCTX_MAX_TYPES:	Number of types of hctx.
 */
enum hctx_type {
	HCTX_TYPE_DEFAULT,
	HCTX_TYPE_READ,
	HCTX_TYPE_POLL,

	HCTX_MAX_TYPES,
};

/**
 * struct blk_mq_tag_set - tag set that can be shared between request queues
 * @map:	   One or more ctx -> hctx mappings. One map exists for each
 *		   hardware queue type (enum hctx_type) that the driver wishes
 *		   to support. There are no restrictions on maps being of the
 *		   same size, and it's perfectly legal to share maps between
 *		   types.
 * @nr_maps:	   Number of elements in the @map array. A number in the range
 *		   [1, HCTX_MAX_TYPES].
 * @ops:	   Pointers to functions that implement block driver behavior.
 * @nr_hw_queues:  Number of hardware queues supported by the block driver that
 *		   owns this data structure.
 * @queue_depth:   Number of tags per hardware queue, reserved tags included.
 * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag
 *		   allocations.
 * @cmd_size:	   Number of additional bytes to allocate per request. The block
 *		   driver owns these additional bytes.
 * @numa_node:	   NUMA node the storage adapter has been connected to.
 * @timeout:	   Request processing timeout in jiffies.
 * @flags:	   Zero or more BLK_MQ_F_* flags.
 * @driver_data:   Pointer to data owned by the block driver that created this
 *		   tag set.
 * @active_queues_shared_sbitmap:
 *		   number of active request queues per tag set.
 * @__bitmap_tags: A shared tags sbitmap, used over all hctx's
 * @__breserved_tags:
 *		   A shared reserved tags sbitmap, used over all hctx's
 * @tags:	   Tag sets. One tag set per hardware queue. Has @nr_hw_queues
 *		   elements.
 * @tag_list_lock: Serializes tag_list accesses.
 * @tag_list:	   List of the request queues that use this tag set. See also
 *		   request_queue.tag_set_list.
 */
struct blk_mq_tag_set {
	struct blk_mq_queue_map	map[HCTX_MAX_TYPES];
	unsigned int		nr_maps;
	const struct blk_mq_ops	*ops;
	unsigned int		nr_hw_queues;
	unsigned int		queue_depth;
	unsigned int		reserved_tags;
	unsigned int		cmd_size;
	int			numa_node;
	unsigned int		timeout;
	unsigned int		flags;
	void			*driver_data;
	atomic_t		active_queues_shared_sbitmap;

	struct sbitmap_queue	__bitmap_tags;
	struct sbitmap_queue	__breserved_tags;
	struct blk_mq_tags	**tags;

	struct mutex		tag_list_lock;
	struct list_head	tag_list;
};
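
/*
 * Illustrative sketch (not part of this header): a simple single-queue driver
 * might fill in a tag set and create its request queue roughly as follows.
 * The "my_driver", "my_mq_ops" and "my_pdu" names are made up and error
 * handling is trimmed; see blk_mq_alloc_tag_set() and blk_mq_init_queue()
 * declared below.
 *
 *	struct my_driver {
 *		struct blk_mq_tag_set	tag_set;
 *		struct request_queue	*queue;
 *	};
 *
 *	static int my_create_queue(struct my_driver *drv)
 *	{
 *		struct blk_mq_tag_set *set = &drv->tag_set;
 *		int ret;
 *
 *		memset(set, 0, sizeof(*set));
 *		set->ops = &my_mq_ops;
 *		set->nr_hw_queues = 1;
 *		set->nr_maps = 1;
 *		set->queue_depth = 64;
 *		set->cmd_size = sizeof(struct my_pdu);	// per-request driver data
 *		set->numa_node = NUMA_NO_NODE;
 *		set->flags = BLK_MQ_F_SHOULD_MERGE;
 *
 *		ret = blk_mq_alloc_tag_set(set);
 *		if (ret)
 *			return ret;
 *
 *		drv->queue = blk_mq_init_queue(set);
 *		if (IS_ERR(drv->queue)) {
 *			blk_mq_free_tag_set(set);
 *			return PTR_ERR(drv->queue);
 *		}
 *		return 0;
 *	}
 */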

/**
 * struct blk_mq_queue_data - Data about a request inserted in a queue
 *
 * @rq:   Request pointer.
 * @last: True if it is the last request in the queue.
 */
struct blk_mq_queue_data {
	struct request *rq;
	bool last;
};

typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
		bool);
typedef bool (busy_tag_iter_fn)(struct request *, void *, bool);

/**
 * struct blk_mq_ops - Callback functions that implement block driver
 * behaviour.
 */
struct blk_mq_ops {
	/**
	 * @queue_rq: Queue a new request from block IO.
	 */
	blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *,
				 const struct blk_mq_queue_data *);

	/**
	 * @commit_rqs: If a driver uses bd->last to decide when to submit
	 * requests to hardware, it must define this function. When an error
	 * stops further requests from being issued, this hook kicks the
	 * hardware (which the last request would otherwise have done).
	 */
	void (*commit_rqs)(struct blk_mq_hw_ctx *);

	/**
	 * @get_budget: Reserve a budget before queueing a request. Once
	 * .queue_rq has run, it is the driver's responsibility to release the
	 * reserved budget. The failure case of .get_budget must also be
	 * handled to avoid I/O deadlocks.
	 */
	bool (*get_budget)(struct request_queue *);

	/**
	 * @put_budget: Release the reserved budget.
	 */
	void (*put_budget)(struct request_queue *);

	/**
	 * @timeout: Called on request timeout.
	 */
	enum blk_eh_timer_return (*timeout)(struct request *, bool);

	/**
	 * @poll: Called to poll for completion of a specific tag.
	 */
	int (*poll)(struct blk_mq_hw_ctx *);

	/**
	 * @complete: Mark the request as complete.
	 */
	void (*complete)(struct request *);

	/**
	 * @init_hctx: Called when the block layer side of a hardware queue has
	 * been set up, allowing the driver to allocate/init matching
	 * structures.
	 */
	int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int);
	/**
	 * @exit_hctx: Ditto for exit/teardown.
	 */
	void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);

	/**
	 * @init_request: Called for every command allocated by the block layer
	 * to allow the driver to set up driver specific data.
	 *
	 * A tag greater than or equal to queue_depth is for setting up the
	 * flush request.
	 */
	int (*init_request)(struct blk_mq_tag_set *set, struct request *,
			    unsigned int, unsigned int);
	/**
	 * @exit_request: Ditto for exit/teardown.
	 */
	void (*exit_request)(struct blk_mq_tag_set *set, struct request *,
			     unsigned int);

	/**
	 * @initialize_rq_fn: Called from inside blk_get_request().
	 */
	void (*initialize_rq_fn)(struct request *rq);

	/**
	 * @cleanup_rq: Called before freeing a request that has not completed
	 * yet, usually to free driver private data.
	 */
	void (*cleanup_rq)(struct request *);

	/**
	 * @busy: If set, returns whether or not this queue is currently busy.
	 */
	bool (*busy)(struct request_queue *);

	/**
	 * @map_queues: Allows drivers to specify their own queue mapping by
	 * overriding the setup-time function that builds the mq_map.
	 */
	int (*map_queues)(struct blk_mq_tag_set *set);

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @show_rq: Used by the debugfs implementation to show driver-specific
	 * information about a request.
	 */
	void (*show_rq)(struct seq_file *m, struct request *rq);
#endif
};
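
/*
 * Illustrative sketch (not part of this header): a minimal ->queue_rq()
 * implementation starts the request, hands it to the hardware and reports
 * whether the submission was accepted. "my_queue_rq", "my_hctx_data" and
 * "my_submit_to_hw" are made-up names standing in for driver internals.
 *
 *	static blk_status_t my_queue_rq(struct blk_mq_hw_ctx *hctx,
 *					const struct blk_mq_queue_data *bd)
 *	{
 *		struct request *rq = bd->rq;
 *		struct my_hctx_data *data = hctx->driver_data;
 *
 *		blk_mq_start_request(rq);
 *
 *		if (!my_submit_to_hw(data, rq, bd->last))
 *			return BLK_STS_RESOURCE;	// blk-mq will retry later
 *
 *		return BLK_STS_OK;
 *	}
 *
 * Completion is later reported from the driver's interrupt (or polling) path
 * via blk_mq_complete_request() or blk_mq_end_request(), declared below.
 */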

enum {
	BLK_MQ_F_SHOULD_MERGE	= 1 << 0,
	BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1,
	/*
	 * Set when this device requires underlying blk-mq device for
	 * completing IO:
	 */
	BLK_MQ_F_STACKING	= 1 << 2,
	BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
	BLK_MQ_F_BLOCKING	= 1 << 5,
	BLK_MQ_F_NO_SCHED	= 1 << 6,
	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
	BLK_MQ_F_ALLOC_POLICY_BITS = 1,

	BLK_MQ_S_STOPPED	= 0,
	BLK_MQ_S_TAG_ACTIVE	= 1,
	BLK_MQ_S_SCHED_RESTART	= 2,

	/* hw queue is inactive after all its CPUs become offline */
	BLK_MQ_S_INACTIVE	= 3,

	BLK_MQ_MAX_DEPTH	= 10240,

	BLK_MQ_CPU_WORK_BATCH	= 8,
};
#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \
	((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \
		((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1))
#define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \
	((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
		<< BLK_MQ_F_ALLOC_POLICY_START_BIT)

struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
		void *queuedata);
struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
						  struct request_queue *q,
						  bool elevator_init);
struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
						const struct blk_mq_ops *ops,
						unsigned int queue_depth,
						unsigned int set_flags);
void blk_mq_unregister_dev(struct device *, struct request_queue *);

int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
void blk_mq_free_tag_set(struct blk_mq_tag_set *set);

void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);

void blk_mq_free_request(struct request *rq);

bool blk_mq_queue_inflight(struct request_queue *q);

enum {
	/* return when out of requests */
	BLK_MQ_REQ_NOWAIT	= (__force blk_mq_req_flags_t)(1 << 0),
	/* allocate from reserved pool */
	BLK_MQ_REQ_RESERVED	= (__force blk_mq_req_flags_t)(1 << 1),
	/* set RQF_PM */
	BLK_MQ_REQ_PM		= (__force blk_mq_req_flags_t)(1 << 2),
};

struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
		blk_mq_req_flags_t flags);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
		unsigned int op, blk_mq_req_flags_t flags,
		unsigned int hctx_idx);
struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);

enum {
	BLK_MQ_UNIQUE_TAG_BITS = 16,
	BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1,
};

u32 blk_mq_unique_tag(struct request *rq);

static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag)
{
	return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS;
}

static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
{
	return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
}
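
/*
 * Illustrative sketch (not part of this header): a driver that completes
 * requests from a single completion context (SCSI-style) can store the value
 * of blk_mq_unique_tag() in its hardware command and split it again on
 * completion to recover the hardware queue index and the per-queue tag.
 * "my_cmd" and "my_complete_one" are made-up names.
 *
 *	my_cmd->utag = blk_mq_unique_tag(rq);		// at submission time
 *
 *	// ... later, in the completion path:
 *	u16 hwq = blk_mq_unique_tag_to_hwq(my_cmd->utag);
 *	u16 tag = blk_mq_unique_tag_to_tag(my_cmd->utag);
 *	my_complete_one(hwq, tag);
 */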

/**
 * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
 * @rq: target request.
 */
static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
{
	return READ_ONCE(rq->state);
}

static inline int blk_mq_request_started(struct request *rq)
{
	return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
}

static inline int blk_mq_request_completed(struct request *rq)
{
	return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
}

void blk_mq_start_request(struct request *rq);
void blk_mq_end_request(struct request *rq, blk_status_t error);
void __blk_mq_end_request(struct request *rq, blk_status_t error);

void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
void blk_mq_complete_request(struct request *rq);
bool blk_mq_complete_request_remote(struct request *rq);
bool blk_mq_queue_stopped(struct request_queue *q);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_stop_hw_queues(struct request_queue *q);
void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_quiesce_queue(struct request_queue *q);
void blk_mq_unquiesce_queue(struct request_queue *q);
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs);
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
		busy_tag_iter_fn *fn, void *priv);
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset);
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_unfreeze_queue(struct request_queue *q);
void blk_freeze_queue_start(struct request_queue *q);
void blk_mq_freeze_queue_wait(struct request_queue *q);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
				     unsigned long timeout);

int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);

void blk_mq_quiesce_queue_nowait(struct request_queue *q);

unsigned int blk_mq_rq_cpu(struct request *rq);

bool __blk_should_fake_timeout(struct request_queue *q);
static inline bool blk_should_fake_timeout(struct request_queue *q)
{
	if (IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT) &&
	    test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
		return __blk_should_fake_timeout(q);
	return false;
}
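
/*
 * Illustrative sketch (not part of this header): blk_mq_freeze_queue() above
 * blocks new requests from entering the queue and waits until no requests are
 * in flight, which makes it a common way to fence off a queue while
 * driver-visible state is changed. "my_apply_new_settings" is a made-up name.
 *
 *	blk_mq_freeze_queue(q);
 *	my_apply_new_settings(drv);	// no requests are in flight here
 *	blk_mq_unfreeze_queue(q);
 *
 * blk_mq_quiesce_queue()/blk_mq_unquiesce_queue() are the lighter-weight
 * variant: they only guarantee that no ->queue_rq() call is in progress or
 * will be started while the queue is quiesced, without waiting for requests
 * already handed to the driver to complete.
 */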

/**
 * blk_mq_rq_from_pdu - cast a PDU to a request
 * @pdu: the PDU (Protocol Data Unit) to be cast
 *
 * Return: request
 *
 * Driver command data is immediately after the request, so subtract the
 * request size to get back to the original request.
 */
static inline struct request *blk_mq_rq_from_pdu(void *pdu)
{
	return pdu - sizeof(struct request);
}

/**
 * blk_mq_rq_to_pdu - cast a request to a PDU
 * @rq: the request to be cast
 *
 * Return: pointer to the PDU
 *
 * Driver command data is immediately after the request, so skip past one
 * request to get to the PDU.
 */
static inline void *blk_mq_rq_to_pdu(struct request *rq)
{
	return rq + 1;
}
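
/*
 * Illustrative sketch (not part of this header): the PDU is the @cmd_size
 * bytes requested in struct blk_mq_tag_set, laid out directly behind each
 * struct request, so the two helpers above are plain pointer arithmetic.
 * "struct my_pdu" and "my_irq_done" are made-up names.
 *
 *	struct my_pdu {
 *		int	status;
 *	};
 *
 *	// set->cmd_size = sizeof(struct my_pdu) when allocating the tag set
 *
 *	static void my_irq_done(struct my_pdu *pdu)
 *	{
 *		struct request *rq = blk_mq_rq_from_pdu(pdu);
 *
 *		blk_mq_end_request(rq, pdu->status ? BLK_STS_IOERR : BLK_STS_OK);
 *	}
 *
 * In the submission path the driver goes the other way:
 *
 *	struct my_pdu *pdu = blk_mq_rq_to_pdu(rq);
 */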

#define queue_for_each_hw_ctx(q, hctx, i)				\
	for ((i) = 0; (i) < (q)->nr_hw_queues &&			\
	     ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)

#define hctx_for_each_ctx(hctx, ctx, i)					\
	for ((i) = 0; (i) < (hctx)->nr_ctx &&				\
	     ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)

static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx,
		struct request *rq)
{
	if (rq->tag != -1)
		return rq->tag | (hctx->queue_num << BLK_QC_T_SHIFT);

	return rq->internal_tag | (hctx->queue_num << BLK_QC_T_SHIFT) |
			BLK_QC_T_INTERNAL;
}

static inline void blk_mq_cleanup_rq(struct request *rq)
{
	if (rq->q->mq_ops->cleanup_rq)
		rq->q->mq_ops->cleanup_rq(rq);
}

static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
		unsigned int nr_segs)
{
	rq->nr_phys_segments = nr_segs;
	rq->__data_len = bio->bi_iter.bi_size;
	rq->bio = rq->biotail = bio;
	rq->ioprio = bio_prio(bio);

	if (bio->bi_disk)
		rq->rq_disk = bio->bi_disk;
}

blk_qc_t blk_mq_submit_bio(struct bio *bio);
void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
		struct lock_class_key *key);

#endif