1 /* SPDX-License-Identifier: GPL-2.0 */ 2 #ifndef _FS_CEPH_OSD_CLIENT_H 3 #define _FS_CEPH_OSD_CLIENT_H 4 5 #include <linux/bitrev.h> 6 #include <linux/completion.h> 7 #include <linux/kref.h> 8 #include <linux/mempool.h> 9 #include <linux/rbtree.h> 10 #include <linux/refcount.h> 11 #include <linux/ktime.h> 12 13 #include <linux/ceph/types.h> 14 #include <linux/ceph/osdmap.h> 15 #include <linux/ceph/messenger.h> 16 #include <linux/ceph/msgpool.h> 17 #include <linux/ceph/auth.h> 18 #include <linux/ceph/pagelist.h> 19 20 struct ceph_msg; 21 struct ceph_snap_context; 22 struct ceph_osd_request; 23 struct ceph_osd_client; 24 25 /* 26 * completion callback for async writepages 27 */ 28 typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *); 29 30 #define CEPH_HOMELESS_OSD -1 31 32 /* 33 * A single extent in a SPARSE_READ reply. 34 * 35 * Note that these come from the OSD as little-endian values. On BE arches, 36 * we convert them in-place after receipt. 37 */ 38 struct ceph_sparse_extent { 39 u64 off; 40 u64 len; 41 } __packed; 42 43 /* Sparse read state machine state values */ 44 enum ceph_sparse_read_state { 45 CEPH_SPARSE_READ_HDR = 0, 46 CEPH_SPARSE_READ_EXTENTS, 47 CEPH_SPARSE_READ_DATA_LEN, 48 CEPH_SPARSE_READ_DATA, 49 }; 50 51 /* 52 * A SPARSE_READ reply is a 32-bit count of extents, followed by an array of 53 * 64-bit offset/length pairs, and then all of the actual file data 54 * concatenated after it (sans holes). 55 * 56 * Unfortunately, we don't know how long the extent array is until we've 57 * started reading the data section of the reply. The caller should send down 58 * a destination buffer for the array, but we'll alloc one if it's too small 59 * or if the caller doesn't. 60 */ 61 struct ceph_sparse_read { 62 enum ceph_sparse_read_state sr_state; /* state machine state */ 63 u64 sr_req_off; /* orig request offset */ 64 u64 sr_req_len; /* orig request length */ 65 u64 sr_pos; /* current pos in buffer */ 66 int sr_index; /* current extent index */ 67 __le32 sr_datalen; /* length of actual data */ 68 u32 sr_count; /* extent count in reply */ 69 int sr_ext_len; /* length of extent array */ 70 struct ceph_sparse_extent *sr_extent; /* extent array */ 71 }; 72 73 /* 74 * A given osd we're communicating with. 75 * 76 * Note that the o_requests tree can be searched while holding the "lock" mutex 77 * or the "o_requests_lock" spinlock. Insertion or removal requires both! 78 */ 79 struct ceph_osd { 80 refcount_t o_ref; 81 int o_sparse_op_idx; 82 struct ceph_osd_client *o_osdc; 83 int o_osd; 84 int o_incarnation; 85 struct rb_node o_node; 86 struct ceph_connection o_con; 87 spinlock_t o_requests_lock; 88 struct rb_root o_requests; 89 struct rb_root o_linger_requests; 90 struct rb_root o_backoff_mappings; 91 struct rb_root o_backoffs_by_id; 92 struct list_head o_osd_lru; 93 struct ceph_auth_handshake o_auth; 94 unsigned long lru_ttl; 95 struct list_head o_keepalive_item; 96 struct mutex lock; 97 struct ceph_sparse_read o_sparse_read; 98 }; 99 100 #define CEPH_OSD_SLAB_OPS 2 101 #define CEPH_OSD_MAX_OPS 16 102 103 enum ceph_osd_data_type { 104 CEPH_OSD_DATA_TYPE_NONE = 0, 105 CEPH_OSD_DATA_TYPE_PAGES, 106 CEPH_OSD_DATA_TYPE_PAGELIST, 107 #ifdef CONFIG_BLOCK 108 CEPH_OSD_DATA_TYPE_BIO, 109 #endif /* CONFIG_BLOCK */ 110 CEPH_OSD_DATA_TYPE_BVECS, 111 CEPH_OSD_DATA_TYPE_ITER, 112 }; 113 114 struct ceph_osd_data { 115 enum ceph_osd_data_type type; 116 union { 117 struct { 118 struct page **pages; 119 u64 length; 120 u32 alignment; 121 bool pages_from_pool; 122 bool own_pages; 123 }; 124 struct ceph_pagelist *pagelist; 125 #ifdef CONFIG_BLOCK 126 struct { 127 struct ceph_bio_iter bio_pos; 128 u32 bio_length; 129 }; 130 #endif /* CONFIG_BLOCK */ 131 struct { 132 struct ceph_bvec_iter bvec_pos; 133 u32 num_bvecs; 134 }; 135 struct iov_iter iter; 136 }; 137 }; 138 139 struct ceph_osd_req_op { 140 u16 op; /* CEPH_OSD_OP_* */ 141 u32 flags; /* CEPH_OSD_OP_FLAG_* */ 142 u32 indata_len; /* request */ 143 u32 outdata_len; /* reply */ 144 s32 rval; 145 146 union { 147 struct ceph_osd_data raw_data_in; 148 struct { 149 u64 offset, length; 150 u64 truncate_size; 151 u32 truncate_seq; 152 int sparse_ext_cnt; 153 struct ceph_sparse_extent *sparse_ext; 154 struct ceph_osd_data osd_data; 155 } extent; 156 struct { 157 u32 name_len; 158 u32 value_len; 159 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ 160 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ 161 struct ceph_osd_data osd_data; 162 } xattr; 163 struct { 164 const char *class_name; 165 const char *method_name; 166 struct ceph_osd_data request_info; 167 struct ceph_osd_data request_data; 168 struct ceph_osd_data response_data; 169 __u8 class_len; 170 __u8 method_len; 171 u32 indata_len; 172 } cls; 173 struct { 174 u64 cookie; 175 __u8 op; /* CEPH_OSD_WATCH_OP_ */ 176 u32 gen; 177 } watch; 178 struct { 179 struct ceph_osd_data request_data; 180 } notify_ack; 181 struct { 182 u64 cookie; 183 struct ceph_osd_data request_data; 184 struct ceph_osd_data response_data; 185 } notify; 186 struct { 187 struct ceph_osd_data response_data; 188 } list_watchers; 189 struct { 190 u64 expected_object_size; 191 u64 expected_write_size; 192 u32 flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */ 193 } alloc_hint; 194 struct { 195 u64 snapid; 196 u64 src_version; 197 u8 flags; 198 u32 src_fadvise_flags; 199 struct ceph_osd_data osd_data; 200 } copy_from; 201 struct { 202 u64 ver; 203 } assert_ver; 204 }; 205 }; 206 207 struct ceph_osd_request_target { 208 struct ceph_object_id base_oid; 209 struct ceph_object_locator base_oloc; 210 struct ceph_object_id target_oid; 211 struct ceph_object_locator target_oloc; 212 213 struct ceph_pg pgid; /* last raw pg we mapped to */ 214 struct ceph_spg spgid; /* last actual spg we mapped to */ 215 u32 pg_num; 216 u32 pg_num_mask; 217 struct ceph_osds acting; 218 struct ceph_osds up; 219 int size; 220 int min_size; 221 bool sort_bitwise; 222 bool recovery_deletes; 223 224 unsigned int flags; /* CEPH_OSD_FLAG_* */ 225 bool used_replica; 226 bool paused; 227 228 u32 epoch; 229 u32 last_force_resend; 230 231 int osd; 232 }; 233 234 /* an in-flight request */ 235 struct ceph_osd_request { 236 u64 r_tid; /* unique for this client */ 237 struct rb_node r_node; 238 struct rb_node r_mc_node; /* map check */ 239 struct work_struct r_complete_work; 240 struct ceph_osd *r_osd; 241 242 struct ceph_osd_request_target r_t; 243 #define r_base_oid r_t.base_oid 244 #define r_base_oloc r_t.base_oloc 245 #define r_flags r_t.flags 246 247 struct ceph_msg *r_request, *r_reply; 248 u32 r_sent; /* >0 if r_request is sending/sent */ 249 250 /* request osd ops array */ 251 unsigned int r_num_ops; 252 253 int r_result; 254 255 struct ceph_osd_client *r_osdc; 256 struct kref r_kref; 257 bool r_mempool; 258 bool r_linger; /* don't resend on failure */ 259 struct completion r_completion; /* private to osd_client.c */ 260 ceph_osdc_callback_t r_callback; 261 262 struct inode *r_inode; /* for use by callbacks */ 263 struct list_head r_private_item; /* ditto */ 264 void *r_priv; /* ditto */ 265 266 /* set by submitter */ 267 u64 r_snapid; /* for reads, CEPH_NOSNAP o/w */ 268 struct ceph_snap_context *r_snapc; /* for writes */ 269 struct timespec64 r_mtime; /* ditto */ 270 u64 r_data_offset; /* ditto */ 271 272 /* internal */ 273 u64 r_version; /* data version sent in reply */ 274 unsigned long r_stamp; /* jiffies, send or check time */ 275 unsigned long r_start_stamp; /* jiffies */ 276 ktime_t r_start_latency; /* ktime_t */ 277 ktime_t r_end_latency; /* ktime_t */ 278 int r_attempts; 279 u32 r_map_dne_bound; 280 281 struct ceph_osd_req_op r_ops[]; 282 }; 283 284 struct ceph_request_redirect { 285 struct ceph_object_locator oloc; 286 }; 287 288 /* 289 * osd request identifier 290 * 291 * caller name + incarnation# + tid to unique identify this request 292 */ 293 struct ceph_osd_reqid { 294 struct ceph_entity_name name; 295 __le64 tid; 296 __le32 inc; 297 } __packed; 298 299 struct ceph_blkin_trace_info { 300 __le64 trace_id; 301 __le64 span_id; 302 __le64 parent_span_id; 303 } __packed; 304 305 typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie, 306 u64 notifier_id, void *data, size_t data_len); 307 typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err); 308 309 struct ceph_osd_linger_request { 310 struct ceph_osd_client *osdc; 311 u64 linger_id; 312 bool committed; 313 bool is_watch; /* watch or notify */ 314 315 struct ceph_osd *osd; 316 struct ceph_osd_request *reg_req; 317 struct ceph_osd_request *ping_req; 318 unsigned long ping_sent; 319 unsigned long watch_valid_thru; 320 struct list_head pending_lworks; 321 322 struct ceph_osd_request_target t; 323 u32 map_dne_bound; 324 325 struct timespec64 mtime; 326 327 struct kref kref; 328 struct mutex lock; 329 struct rb_node node; /* osd */ 330 struct rb_node osdc_node; /* osdc */ 331 struct rb_node mc_node; /* map check */ 332 struct list_head scan_item; 333 334 struct completion reg_commit_wait; 335 struct completion notify_finish_wait; 336 int reg_commit_error; 337 int notify_finish_error; 338 int last_error; 339 340 u32 register_gen; 341 u64 notify_id; 342 343 rados_watchcb2_t wcb; 344 rados_watcherrcb_t errcb; 345 void *data; 346 347 struct ceph_pagelist *request_pl; 348 struct page **notify_id_pages; 349 350 struct page ***preply_pages; 351 size_t *preply_len; 352 }; 353 354 struct ceph_watch_item { 355 struct ceph_entity_name name; 356 u64 cookie; 357 struct ceph_entity_addr addr; 358 }; 359 360 struct ceph_spg_mapping { 361 struct rb_node node; 362 struct ceph_spg spgid; 363 364 struct rb_root backoffs; 365 }; 366 367 struct ceph_hobject_id { 368 void *key; 369 size_t key_len; 370 void *oid; 371 size_t oid_len; 372 u64 snapid; 373 u32 hash; 374 u8 is_max; 375 void *nspace; 376 size_t nspace_len; 377 s64 pool; 378 379 /* cache */ 380 u32 hash_reverse_bits; 381 }; 382 383 static inline void ceph_hoid_build_hash_cache(struct ceph_hobject_id *hoid) 384 { 385 hoid->hash_reverse_bits = bitrev32(hoid->hash); 386 } 387 388 /* 389 * PG-wide backoff: [begin, end) 390 * per-object backoff: begin == end 391 */ 392 struct ceph_osd_backoff { 393 struct rb_node spg_node; 394 struct rb_node id_node; 395 396 struct ceph_spg spgid; 397 u64 id; 398 struct ceph_hobject_id *begin; 399 struct ceph_hobject_id *end; 400 }; 401 402 #define CEPH_LINGER_ID_START 0xffff000000000000ULL 403 404 struct ceph_osd_client { 405 struct ceph_client *client; 406 407 struct ceph_osdmap *osdmap; /* current map */ 408 struct rw_semaphore lock; 409 410 struct rb_root osds; /* osds */ 411 struct list_head osd_lru; /* idle osds */ 412 spinlock_t osd_lru_lock; 413 u32 epoch_barrier; 414 struct ceph_osd homeless_osd; 415 atomic64_t last_tid; /* tid of last request */ 416 u64 last_linger_id; 417 struct rb_root linger_requests; /* lingering requests */ 418 struct rb_root map_checks; 419 struct rb_root linger_map_checks; 420 atomic_t num_requests; 421 atomic_t num_homeless; 422 int abort_err; 423 struct delayed_work timeout_work; 424 struct delayed_work osds_timeout_work; 425 #ifdef CONFIG_DEBUG_FS 426 struct dentry *debugfs_file; 427 #endif 428 429 mempool_t *req_mempool; 430 431 struct ceph_msgpool msgpool_op; 432 struct ceph_msgpool msgpool_op_reply; 433 434 struct workqueue_struct *notify_wq; 435 struct workqueue_struct *completion_wq; 436 }; 437 438 static inline bool ceph_osdmap_flag(struct ceph_osd_client *osdc, int flag) 439 { 440 return osdc->osdmap->flags & flag; 441 } 442 443 extern int ceph_osdc_setup(void); 444 extern void ceph_osdc_cleanup(void); 445 446 extern int ceph_osdc_init(struct ceph_osd_client *osdc, 447 struct ceph_client *client); 448 extern void ceph_osdc_stop(struct ceph_osd_client *osdc); 449 extern void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc); 450 451 extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, 452 struct ceph_msg *msg); 453 extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, 454 struct ceph_msg *msg); 455 void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); 456 void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err); 457 void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc); 458 459 #define osd_req_op_data(oreq, whch, typ, fld) \ 460 ({ \ 461 struct ceph_osd_request *__oreq = (oreq); \ 462 unsigned int __whch = (whch); \ 463 BUG_ON(__whch >= __oreq->r_num_ops); \ 464 &__oreq->r_ops[__whch].typ.fld; \ 465 }) 466 467 struct ceph_osd_req_op *osd_req_op_init(struct ceph_osd_request *osd_req, 468 unsigned int which, u16 opcode, u32 flags); 469 470 extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *, 471 unsigned int which, 472 struct page **pages, u64 length, 473 u32 alignment, bool pages_from_pool, 474 bool own_pages); 475 476 extern void osd_req_op_extent_init(struct ceph_osd_request *osd_req, 477 unsigned int which, u16 opcode, 478 u64 offset, u64 length, 479 u64 truncate_size, u32 truncate_seq); 480 extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req, 481 unsigned int which, u64 length); 482 extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, 483 unsigned int which, u64 offset_inc); 484 485 extern struct ceph_osd_data *osd_req_op_extent_osd_data( 486 struct ceph_osd_request *osd_req, 487 unsigned int which); 488 489 extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *, 490 unsigned int which, 491 struct page **pages, u64 length, 492 u32 alignment, bool pages_from_pool, 493 bool own_pages); 494 extern void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *, 495 unsigned int which, 496 struct ceph_pagelist *pagelist); 497 #ifdef CONFIG_BLOCK 498 void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, 499 unsigned int which, 500 struct ceph_bio_iter *bio_pos, 501 u32 bio_length); 502 #endif /* CONFIG_BLOCK */ 503 void osd_req_op_extent_osd_data_bvecs(struct ceph_osd_request *osd_req, 504 unsigned int which, 505 struct bio_vec *bvecs, u32 num_bvecs, 506 u32 bytes); 507 void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req, 508 unsigned int which, 509 struct ceph_bvec_iter *bvec_pos); 510 void osd_req_op_extent_osd_iter(struct ceph_osd_request *osd_req, 511 unsigned int which, struct iov_iter *iter); 512 513 extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *, 514 unsigned int which, 515 struct ceph_pagelist *pagelist); 516 extern void osd_req_op_cls_request_data_pages(struct ceph_osd_request *, 517 unsigned int which, 518 struct page **pages, u64 length, 519 u32 alignment, bool pages_from_pool, 520 bool own_pages); 521 void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req, 522 unsigned int which, 523 struct bio_vec *bvecs, u32 num_bvecs, 524 u32 bytes); 525 extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *, 526 unsigned int which, 527 struct page **pages, u64 length, 528 u32 alignment, bool pages_from_pool, 529 bool own_pages); 530 int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, 531 const char *class, const char *method); 532 extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, 533 u16 opcode, const char *name, const void *value, 534 size_t size, u8 cmp_op, u8 cmp_mode); 535 extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, 536 unsigned int which, 537 u64 expected_object_size, 538 u64 expected_write_size, 539 u32 flags); 540 extern int osd_req_op_copy_from_init(struct ceph_osd_request *req, 541 u64 src_snapid, u64 src_version, 542 struct ceph_object_id *src_oid, 543 struct ceph_object_locator *src_oloc, 544 u32 src_fadvise_flags, 545 u32 dst_fadvise_flags, 546 u32 truncate_seq, u64 truncate_size, 547 u8 copy_from_flags); 548 549 extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, 550 struct ceph_snap_context *snapc, 551 unsigned int num_ops, 552 bool use_mempool, 553 gfp_t gfp_flags); 554 int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp); 555 556 extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, 557 struct ceph_file_layout *layout, 558 struct ceph_vino vino, 559 u64 offset, u64 *len, 560 unsigned int which, int num_ops, 561 int opcode, int flags, 562 struct ceph_snap_context *snapc, 563 u32 truncate_seq, u64 truncate_size, 564 bool use_mempool); 565 566 int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt); 567 568 /* 569 * How big an extent array should we preallocate for a sparse read? This is 570 * just a starting value. If we get more than this back from the OSD, the 571 * receiver will reallocate. 572 */ 573 #define CEPH_SPARSE_EXT_ARRAY_INITIAL 16 574 575 static inline int ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op) 576 { 577 return __ceph_alloc_sparse_ext_map(op, CEPH_SPARSE_EXT_ARRAY_INITIAL); 578 } 579 580 extern void ceph_osdc_get_request(struct ceph_osd_request *req); 581 extern void ceph_osdc_put_request(struct ceph_osd_request *req); 582 583 void ceph_osdc_start_request(struct ceph_osd_client *osdc, 584 struct ceph_osd_request *req); 585 extern void ceph_osdc_cancel_request(struct ceph_osd_request *req); 586 extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc, 587 struct ceph_osd_request *req); 588 extern void ceph_osdc_sync(struct ceph_osd_client *osdc); 589 590 extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc); 591 void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc); 592 593 int ceph_osdc_call(struct ceph_osd_client *osdc, 594 struct ceph_object_id *oid, 595 struct ceph_object_locator *oloc, 596 const char *class, const char *method, 597 unsigned int flags, 598 struct page *req_page, size_t req_len, 599 struct page **resp_pages, size_t *resp_len); 600 601 /* watch/notify */ 602 struct ceph_osd_linger_request * 603 ceph_osdc_watch(struct ceph_osd_client *osdc, 604 struct ceph_object_id *oid, 605 struct ceph_object_locator *oloc, 606 rados_watchcb2_t wcb, 607 rados_watcherrcb_t errcb, 608 void *data); 609 int ceph_osdc_unwatch(struct ceph_osd_client *osdc, 610 struct ceph_osd_linger_request *lreq); 611 612 int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, 613 struct ceph_object_id *oid, 614 struct ceph_object_locator *oloc, 615 u64 notify_id, 616 u64 cookie, 617 void *payload, 618 u32 payload_len); 619 int ceph_osdc_notify(struct ceph_osd_client *osdc, 620 struct ceph_object_id *oid, 621 struct ceph_object_locator *oloc, 622 void *payload, 623 u32 payload_len, 624 u32 timeout, 625 struct page ***preply_pages, 626 size_t *preply_len); 627 int ceph_osdc_watch_check(struct ceph_osd_client *osdc, 628 struct ceph_osd_linger_request *lreq); 629 int ceph_osdc_list_watchers(struct ceph_osd_client *osdc, 630 struct ceph_object_id *oid, 631 struct ceph_object_locator *oloc, 632 struct ceph_watch_item **watchers, 633 u32 *num_watchers); 634 635 /* Find offset into the buffer of the end of the extent map */ 636 static inline u64 ceph_sparse_ext_map_end(struct ceph_osd_req_op *op) 637 { 638 struct ceph_sparse_extent *ext; 639 640 /* No extents? No data */ 641 if (op->extent.sparse_ext_cnt == 0) 642 return 0; 643 644 ext = &op->extent.sparse_ext[op->extent.sparse_ext_cnt - 1]; 645 646 return ext->off + ext->len - op->extent.offset; 647 } 648 649 #endif 650