/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_MQ_H
#define BLK_MQ_H

#include <linux/blkdev.h>
#include <linux/sbitmap.h>
#include <linux/srcu.h>

struct blk_mq_tags;
struct blk_flush_queue;

/**
 * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
 * block device
 */
struct blk_mq_hw_ctx {
	struct {
		/** @lock: Protects the dispatch list. */
		spinlock_t		lock;
		/**
		 * @dispatch: Used for requests that are ready to be
		 * dispatched to the hardware but for some reason (e.g. lack of
		 * resources) could not be sent to the hardware. As soon as the
		 * driver can send new requests, requests in this list will
		 * be sent first for fairer dispatch.
		 */
		struct list_head	dispatch;
		/**
		 * @state: BLK_MQ_S_* flags. Defines the state of the hw
		 * queue (active, scheduled to restart, stopped).
		 */
		unsigned long		state;
	} ____cacheline_aligned_in_smp;

	/**
	 * @run_work: Used for scheduling a hardware queue run at a later time.
	 */
	struct delayed_work	run_work;
	/** @cpumask: Map of available CPUs where this hctx can run. */
	cpumask_var_t		cpumask;
	/**
	 * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU
	 * selection from @cpumask.
	 */
	int			next_cpu;
	/**
	 * @next_cpu_batch: Counter of how many work items are left in the
	 * batch before switching to the next CPU.
	 */
	int			next_cpu_batch;

	/** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */
	unsigned long		flags;

	/**
	 * @sched_data: Pointer owned by the IO scheduler attached to a request
	 * queue. It's up to the IO scheduler how to use this pointer.
	 */
	void			*sched_data;
	/**
	 * @queue: Pointer to the request queue that owns this hardware context.
	 */
	struct request_queue	*queue;
	/** @fq: Queue of requests that need to perform a flush operation. */
	struct blk_flush_queue	*fq;

	/**
	 * @driver_data: Pointer to data owned by the block driver that created
	 * this hctx.
	 */
	void			*driver_data;

	/**
	 * @ctx_map: Bitmap for each software queue. If a bit is set, there is a
	 * pending request in that software queue.
	 */
	struct sbitmap		ctx_map;

	/**
	 * @dispatch_from: Software queue to be used when no scheduler was
	 * selected.
	 */
	struct blk_mq_ctx	*dispatch_from;
	/**
	 * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to
	 * decide if the hw_queue is busy, using an Exponential Weighted Moving
	 * Average algorithm.
	 */
	unsigned int		dispatch_busy;

	/** @type: HCTX_TYPE_* flags. Type of hardware queue. */
	unsigned short		type;
	/** @nr_ctx: Number of software queues. */
	unsigned short		nr_ctx;
	/** @ctxs: Array of software queues. */
	struct blk_mq_ctx	**ctxs;

	/** @dispatch_wait_lock: Lock for dispatch_wait queue. */
	spinlock_t		dispatch_wait_lock;
	/**
	 * @dispatch_wait: Waitqueue to put requests when there is no tag
	 * available at the moment, to wait for another try in the future.
	 */
	wait_queue_entry_t	dispatch_wait;

	/**
	 * @wait_index: Index of next available dispatch_wait queue to insert
	 * requests.
	 */
	atomic_t		wait_index;

	/**
	 * @tags: Tags owned by the block driver. A tag in this set is only
	 * assigned when a request is dispatched from a hardware queue.
	 */
	struct blk_mq_tags	*tags;
	/**
	 * @sched_tags: Tags owned by the I/O scheduler. If there is an I/O
	 * scheduler associated with a request queue, a tag is assigned when
	 * that request is allocated. Otherwise, this member is not used.
	 */
	struct blk_mq_tags	*sched_tags;

	/** @queued: Number of queued requests. */
	unsigned long		queued;
	/** @run: Number of dispatched requests. */
	unsigned long		run;
#define BLK_MQ_MAX_DISPATCH_ORDER	7
	/** @dispatched: Number of dispatched requests, bucketed by dispatch batch size. */
	unsigned long		dispatched[BLK_MQ_MAX_DISPATCH_ORDER];

	/** @numa_node: NUMA node the storage adapter has been connected to. */
	unsigned int		numa_node;
	/** @queue_num: Index of this hardware queue. */
	unsigned int		queue_num;

	/**
	 * @nr_active: Number of active requests. Only used when a tag set is
	 * shared across request queues.
	 */
	atomic_t		nr_active;

	/** @cpuhp_dead: CPU-hotplug list entry, used to requeue requests if a CPU dies. */
	struct hlist_node	cpuhp_dead;
	/** @kobj: Kernel object for sysfs. */
	struct kobject		kobj;

	/** @poll_considered: Count of how many times blk_poll() was called. */
	unsigned long		poll_considered;
	/** @poll_invoked: Count how many requests blk_poll() polled. */
	unsigned long		poll_invoked;
	/** @poll_success: Count how many polled requests were completed. */
	unsigned long		poll_success;

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @debugfs_dir: debugfs directory for this hardware queue. Named
	 * hctx<queue_num>.
	 */
	struct dentry		*debugfs_dir;
	/** @sched_debugfs_dir: debugfs directory for the scheduler. */
	struct dentry		*sched_debugfs_dir;
#endif

	/**
	 * @hctx_list: If this hctx is not in use, this is an entry in
	 * q->unused_hctx_list.
	 */
	struct list_head	hctx_list;

	/**
	 * @srcu: Sleepable RCU. Used as a lock when the hardware queue type is
	 * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
	 * blk_mq_hw_ctx_size().
	 */
	struct srcu_struct	srcu[];
};

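/*
 * Example (illustrative sketch, not part of this header): a driver's
 * ->init_hctx() callback commonly stashes per-queue context in
 * @driver_data so that later ->queue_rq() calls can retrieve it. The
 * identifiers mydrv and mydrv_init_hctx are hypothetical.
 *
 *	static int mydrv_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 *				   unsigned int hctx_idx)
 *	{
 *		struct mydrv *drv = data;	// typically the tag set's driver_data
 *
 *		hctx->driver_data = &drv->hw_queues[hctx_idx];
 *		return 0;
 *	}
 */
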
/**
 * struct blk_mq_queue_map - Map software queues to hardware queues
 * @mq_map:       CPU ID to hardware queue index map. This is an array
 *	with nr_cpu_ids elements. Each element has a value in the range
 *	[@queue_offset, @queue_offset + @nr_queues).
 * @nr_queues:    Number of hardware queues to map CPU IDs onto.
 * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe
 *	driver to map each hardware queue type (enum hctx_type) onto a distinct
 *	set of hardware queues.
 */
struct blk_mq_queue_map {
	unsigned int *mq_map;
	unsigned int nr_queues;
	unsigned int queue_offset;
};

/**
 * enum hctx_type - Type of hardware queue
 * @HCTX_TYPE_DEFAULT: All I/O not otherwise accounted for.
 * @HCTX_TYPE_READ:    Just for READ I/O.
 * @HCTX_TYPE_POLL:    Polled I/O of any kind.
 * @HCTX_MAX_TYPES:    Number of types of hctx.
 */
enum hctx_type {
	HCTX_TYPE_DEFAULT,
	HCTX_TYPE_READ,
	HCTX_TYPE_POLL,

	HCTX_MAX_TYPES,
};

/**
 * struct blk_mq_tag_set - tag set that can be shared between request queues
 * @map:	   One or more ctx -> hctx mappings. One map exists for each
 *		   hardware queue type (enum hctx_type) that the driver wishes
 *		   to support. There are no restrictions on maps being of the
 *		   same size, and it's perfectly legal to share maps between
 *		   types.
 * @nr_maps:	   Number of elements in the @map array. A number in the range
 *		   [1, HCTX_MAX_TYPES].
 * @ops:	   Pointers to functions that implement block driver behavior.
 * @nr_hw_queues:  Number of hardware queues supported by the block driver that
 *		   owns this data structure.
 * @queue_depth:   Number of tags per hardware queue, reserved tags included.
 * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag
 *		   allocations.
 * @cmd_size:	   Number of additional bytes to allocate per request. The block
 *		   driver owns these additional bytes.
 * @numa_node:	   NUMA node the storage adapter has been connected to.
 * @timeout:	   Request processing timeout in jiffies.
 * @flags:	   Zero or more BLK_MQ_F_* flags.
 * @driver_data:   Pointer to data owned by the block driver that created this
 *		   tag set.
 * @tags:	   Tag sets. One tag set per hardware queue. Has @nr_hw_queues
 *		   elements.
 * @tag_list_lock: Serializes tag_list accesses.
 * @tag_list:	   List of the request queues that use this tag set. See also
 *		   request_queue.tag_set_list.
 */
struct blk_mq_tag_set {
	struct blk_mq_queue_map	map[HCTX_MAX_TYPES];
	unsigned int		nr_maps;
	const struct blk_mq_ops	*ops;
	unsigned int		nr_hw_queues;
	unsigned int		queue_depth;
	unsigned int		reserved_tags;
	unsigned int		cmd_size;
	int			numa_node;
	unsigned int		timeout;
	unsigned int		flags;
	void			*driver_data;

	struct blk_mq_tags	**tags;

	struct mutex		tag_list_lock;
	struct list_head	tag_list;
};

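/*
 * Example (illustrative sketch): minimal tag set setup with a single
 * default map, roughly as a driver would do it at probe time. The names
 * mydrv_mq_ops and struct mydrv_cmd are hypothetical.
 *
 *	struct blk_mq_tag_set *set = &drv->tag_set;
 *	struct request_queue *q;
 *	int ret;
 *
 *	memset(set, 0, sizeof(*set));
 *	set->ops		= &mydrv_mq_ops;
 *	set->nr_hw_queues	= 1;
 *	set->nr_maps		= 1;
 *	set->queue_depth	= 64;
 *	set->numa_node		= NUMA_NO_NODE;
 *	set->cmd_size		= sizeof(struct mydrv_cmd);
 *	set->flags		= BLK_MQ_F_SHOULD_MERGE;
 *
 *	ret = blk_mq_alloc_tag_set(set);
 *	if (ret)
 *		return ret;
 *
 *	q = blk_mq_init_queue(set);
 *	if (IS_ERR(q)) {
 *		blk_mq_free_tag_set(set);
 *		return PTR_ERR(q);
 *	}
 */
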
/**
 * struct blk_mq_queue_data - Data about a request inserted in a queue
 *
 * @rq:   Request pointer.
 * @last: True if this is the last request in the queue.
 */
struct blk_mq_queue_data {
	struct request *rq;
	bool last;
};

typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *,
		const struct blk_mq_queue_data *);
typedef void (commit_rqs_fn)(struct blk_mq_hw_ctx *);
typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *);
typedef void (put_budget_fn)(struct blk_mq_hw_ctx *);
typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
typedef int (init_request_fn)(struct blk_mq_tag_set *set, struct request *,
		unsigned int, unsigned int);
typedef void (exit_request_fn)(struct blk_mq_tag_set *set, struct request *,
		unsigned int);

typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
		bool);
typedef bool (busy_tag_iter_fn)(struct request *, void *, bool);
typedef int (poll_fn)(struct blk_mq_hw_ctx *);
typedef int (map_queues_fn)(struct blk_mq_tag_set *set);
typedef bool (busy_fn)(struct request_queue *);
typedef void (complete_fn)(struct request *);
typedef void (cleanup_rq_fn)(struct request *);

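/*
 * Example (illustrative sketch): the general shape of a ->queue_rq()
 * implementation built on struct blk_mq_queue_data. The mydrv_*()
 * helpers are hypothetical; the essential pattern is calling
 * blk_mq_start_request() before issuing, returning BLK_STS_RESOURCE on
 * temporary failure, and using bd->last to batch doorbell writes.
 *
 *	static blk_status_t mydrv_queue_rq(struct blk_mq_hw_ctx *hctx,
 *					   const struct blk_mq_queue_data *bd)
 *	{
 *		struct request *rq = bd->rq;
 *		struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);
 *
 *		blk_mq_start_request(rq);
 *		if (!mydrv_submit(hctx->driver_data, cmd))
 *			return BLK_STS_RESOURCE;
 *		if (bd->last)
 *			mydrv_ring_doorbell(hctx->driver_data);
 *		return BLK_STS_OK;
 *	}
 */
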
/**
 * struct blk_mq_ops - Callback functions that implement block driver
 * behaviour.
 */
struct blk_mq_ops {
	/**
	 * @queue_rq: Queue a new request from block IO.
	 */
	queue_rq_fn		*queue_rq;

	/**
	 * @commit_rqs: If a driver uses bd->last to judge when to submit
	 * requests to hardware, it must define this function. In case of errors
	 * that prevent issuing further requests, this hook kicks the hardware
	 * (which the last request otherwise would have done).
	 */
	commit_rqs_fn		*commit_rqs;

	/**
	 * @get_budget: Reserve a budget before queuing a request. Once
	 * .queue_rq has run, it is the driver's responsibility to release the
	 * reserved budget. The failure case of .get_budget must also be
	 * handled to avoid I/O deadlocks.
	 */
	get_budget_fn		*get_budget;
	/**
	 * @put_budget: Release the reserved budget.
	 */
	put_budget_fn		*put_budget;

	/**
	 * @timeout: Called on request timeout.
	 */
	timeout_fn		*timeout;

	/**
	 * @poll: Called to poll for completion of a specific tag.
	 */
	poll_fn			*poll;

	/**
	 * @complete: Mark the request as complete.
	 */
	complete_fn		*complete;

	/**
	 * @init_hctx: Called when the block layer side of a hardware queue has
	 * been set up, allowing the driver to allocate/init matching
	 * structures.
	 */
	init_hctx_fn		*init_hctx;
	/**
	 * @exit_hctx: Ditto for exit/teardown.
	 */
	exit_hctx_fn		*exit_hctx;

	/**
	 * @init_request: Called for every command allocated by the block layer
	 * to allow the driver to set up driver specific data.
	 *
	 * Tags greater than or equal to queue_depth are used for setting up
	 * flush requests.
	 */
	init_request_fn		*init_request;
	/**
	 * @exit_request: Ditto for exit/teardown.
	 */
	exit_request_fn		*exit_request;

	/**
	 * @initialize_rq_fn: Called from inside blk_get_request().
	 */
	void (*initialize_rq_fn)(struct request *rq);

	/**
	 * @cleanup_rq: Called before freeing a request that has not completed
	 * yet, usually to free driver private data.
	 */
	cleanup_rq_fn		*cleanup_rq;

	/**
	 * @busy: If set, returns whether or not this queue is currently busy.
	 */
	busy_fn			*busy;

	/**
	 * @map_queues: This allows drivers to specify their own queue mapping
	 * by overriding the setup-time function that builds the mq_map.
	 */
	map_queues_fn		*map_queues;

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @show_rq: Used by the debugfs implementation to show driver-specific
	 * information about a request.
	 */
	void (*show_rq)(struct seq_file *m, struct request *rq);
#endif
};

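/*
 * Example (illustrative sketch): a driver collects its callbacks in a
 * const ops table and points blk_mq_tag_set.ops at it. ->queue_rq is
 * mandatory; the other hooks shown here are optional. All mydrv_*
 * functions are hypothetical.
 *
 *	static const struct blk_mq_ops mydrv_mq_ops = {
 *		.queue_rq	= mydrv_queue_rq,
 *		.complete	= mydrv_complete_rq,
 *		.init_hctx	= mydrv_init_hctx,
 *		.init_request	= mydrv_init_request,
 *		.timeout	= mydrv_timeout,
 *	};
 */
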
enum {
	BLK_MQ_F_SHOULD_MERGE	= 1 << 0,
	BLK_MQ_F_TAG_SHARED	= 1 << 1,
	BLK_MQ_F_BLOCKING	= 1 << 5,
	BLK_MQ_F_NO_SCHED	= 1 << 6,
	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
	BLK_MQ_F_ALLOC_POLICY_BITS = 1,

	BLK_MQ_S_STOPPED	= 0,
	BLK_MQ_S_TAG_ACTIVE	= 1,
	BLK_MQ_S_SCHED_RESTART	= 2,

	BLK_MQ_MAX_DEPTH	= 10240,

	BLK_MQ_CPU_WORK_BATCH	= 8,
};
#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \
	((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \
		((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1))
#define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \
	((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
		<< BLK_MQ_F_ALLOC_POLICY_START_BIT)

struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
		void *queuedata);
struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
						  struct request_queue *q,
						  bool elevator_init);
struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
						const struct blk_mq_ops *ops,
						unsigned int queue_depth,
						unsigned int set_flags);
void blk_mq_unregister_dev(struct device *, struct request_queue *);

int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
void blk_mq_free_tag_set(struct blk_mq_tag_set *set);

void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);

void blk_mq_free_request(struct request *rq);

bool blk_mq_queue_inflight(struct request_queue *q);

enum {
	/* return when out of requests */
	BLK_MQ_REQ_NOWAIT	= (__force blk_mq_req_flags_t)(1 << 0),
	/* allocate from reserved pool */
	BLK_MQ_REQ_RESERVED	= (__force blk_mq_req_flags_t)(1 << 1),
	/* allocate internal/sched tag */
	BLK_MQ_REQ_INTERNAL	= (__force blk_mq_req_flags_t)(1 << 2),
	/* set RQF_PREEMPT */
	BLK_MQ_REQ_PREEMPT	= (__force blk_mq_req_flags_t)(1 << 3),
};

struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
		blk_mq_req_flags_t flags);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
		unsigned int op, blk_mq_req_flags_t flags,
		unsigned int hctx_idx);
struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);

enum {
	BLK_MQ_UNIQUE_TAG_BITS = 16,
	BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1,
};

u32 blk_mq_unique_tag(struct request *rq);

static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag)
{
	return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS;
}

static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
{
	return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
}

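/*
 * Example (illustrative sketch): a driver that needs a single 32-bit
 * cookie per request (e.g. to post to hardware) can combine the
 * hardware queue index and the per-queue tag, and split them again at
 * completion time:
 *
 *	u32 unique = blk_mq_unique_tag(rq);
 *	u16 hwq = blk_mq_unique_tag_to_hwq(unique);
 *	u16 tag = blk_mq_unique_tag_to_tag(unique);
 */
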
/**
 * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
 * @rq: target request.
 */
static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
{
	return READ_ONCE(rq->state);
}

static inline int blk_mq_request_started(struct request *rq)
{
	return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
}

static inline int blk_mq_request_completed(struct request *rq)
{
	return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
}

void blk_mq_start_request(struct request *rq);
void blk_mq_end_request(struct request *rq, blk_status_t error);
void __blk_mq_end_request(struct request *rq, blk_status_t error);

void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
bool blk_mq_complete_request(struct request *rq);
bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
			   struct bio *bio, unsigned int nr_segs);
bool blk_mq_queue_stopped(struct request_queue *q);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_stop_hw_queues(struct request_queue *q);
void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_quiesce_queue(struct request_queue *q);
void blk_mq_unquiesce_queue(struct request_queue *q);
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
		busy_tag_iter_fn *fn, void *priv);
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset);
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_unfreeze_queue(struct request_queue *q);
void blk_freeze_queue_start(struct request_queue *q);
void blk_mq_freeze_queue_wait(struct request_queue *q);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
				     unsigned long timeout);

int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);

void blk_mq_quiesce_queue_nowait(struct request_queue *q);

unsigned int blk_mq_rq_cpu(struct request *rq);

/**
 * blk_mq_rq_from_pdu - cast a PDU to a request
 * @pdu: the PDU (Protocol Data Unit) to be cast
 *
 * Return: request
 *
 * Driver command data is immediately after the request. So subtract the
 * request size to get back to the original request.
 */
static inline struct request *blk_mq_rq_from_pdu(void *pdu)
{
	return pdu - sizeof(struct request);
}

/**
 * blk_mq_rq_to_pdu - cast a request to a PDU
 * @rq: the request to be cast
 *
 * Return: pointer to the PDU
 *
 * Driver command data is immediately after the request. So skip past the
 * request to get the PDU.
 */
static inline void *blk_mq_rq_to_pdu(struct request *rq)
{
	return rq + 1;
}

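/*
 * Example (illustrative sketch): with blk_mq_tag_set.cmd_size set to
 * sizeof(struct mydrv_cmd) (a hypothetical driver PDU), the two helpers
 * above convert between a request and the per-request driver data that
 * the block layer allocated directly behind it:
 *
 *	struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);
 *	struct request *same_rq = blk_mq_rq_from_pdu(cmd);	// same_rq == rq
 */
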
#define queue_for_each_hw_ctx(q, hctx, i)				\
	for ((i) = 0; (i) < (q)->nr_hw_queues &&			\
	     ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)

#define hctx_for_each_ctx(hctx, ctx, i)					\
	for ((i) = 0; (i) < (hctx)->nr_ctx &&				\
	     ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)

static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx,
		struct request *rq)
{
	if (rq->tag != -1)
		return rq->tag | (hctx->queue_num << BLK_QC_T_SHIFT);

	return rq->internal_tag | (hctx->queue_num << BLK_QC_T_SHIFT) |
			BLK_QC_T_INTERNAL;
}

static inline void blk_mq_cleanup_rq(struct request *rq)
{
	if (rq->q->mq_ops->cleanup_rq)
		rq->q->mq_ops->cleanup_rq(rq);
}

#endif