1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2013, Delphix. All rights reserved. 24 * Copyright (c) 2013, Saso Kiselkov. All rights reserved. 25 * Copyright (c) 2013, Nexenta Systems, Inc. All rights reserved. 26 * Copyright (c) 2020, George Amanakis. All rights reserved. 27 */ 28 29 #ifndef _SYS_ARC_IMPL_H 30 #define _SYS_ARC_IMPL_H 31 32 #include <sys/arc.h> 33 #include <sys/zio_crypt.h> 34 #include <sys/zthr.h> 35 #include <sys/aggsum.h> 36 #include <sys/wmsum.h> 37 38 #ifdef __cplusplus 39 extern "C" { 40 #endif 41 42 /* 43 * Note that buffers can be in one of 6 states: 44 * ARC_anon - anonymous (discussed below) 45 * ARC_mru - recently used, currently cached 46 * ARC_mru_ghost - recently used, no longer in cache 47 * ARC_mfu - frequently used, currently cached 48 * ARC_mfu_ghost - frequently used, no longer in cache 49 * ARC_l2c_only - exists in L2ARC but not other states 50 * When there are no active references to the buffer, they are 51 * are linked onto a list in one of these arc states. These are 52 * the only buffers that can be evicted or deleted. Within each 53 * state there are multiple lists, one for meta-data and one for 54 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, 55 * etc.) is tracked separately so that it can be managed more 56 * explicitly: favored over data, limited explicitly. 57 * 58 * Anonymous buffers are buffers that are not associated with 59 * a DVA. These are buffers that hold dirty block copies 60 * before they are written to stable storage. By definition, 61 * they are "ref'd" and are considered part of arc_mru 62 * that cannot be freed. Generally, they will acquire a DVA 63 * as they are written and migrate onto the arc_mru list. 64 * 65 * The ARC_l2c_only state is for buffers that are in the second 66 * level ARC but no longer in any of the ARC_m* lists. The second 67 * level ARC itself may also contain buffers that are in any of 68 * the ARC_m* states - meaning that a buffer can exist in two 69 * places. The reason for the ARC_l2c_only state is to keep the 70 * buffer header in the hash table, so that reads that hit the 71 * second level ARC benefit from these fast lookups. 72 */ 73 74 typedef struct arc_state { 75 /* 76 * list of evictable buffers 77 */ 78 multilist_t arcs_list[ARC_BUFC_NUMTYPES]; 79 /* 80 * supports the "dbufs" kstat 81 */ 82 arc_state_type_t arcs_state; 83 /* 84 * total amount of evictable data in this state 85 */ 86 zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES] ____cacheline_aligned; 87 /* 88 * total amount of data in this state; this includes: evictable, 89 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. 90 */ 91 zfs_refcount_t arcs_size; 92 } arc_state_t; 93 94 typedef struct arc_callback arc_callback_t; 95 96 struct arc_callback { 97 void *acb_private; 98 arc_read_done_func_t *acb_done; 99 arc_buf_t *acb_buf; 100 boolean_t acb_encrypted; 101 boolean_t acb_compressed; 102 boolean_t acb_noauth; 103 boolean_t acb_nobuf; 104 zbookmark_phys_t acb_zb; 105 zio_t *acb_zio_dummy; 106 zio_t *acb_zio_head; 107 arc_callback_t *acb_next; 108 }; 109 110 typedef struct arc_write_callback arc_write_callback_t; 111 112 struct arc_write_callback { 113 void *awcb_private; 114 arc_write_done_func_t *awcb_ready; 115 arc_write_done_func_t *awcb_children_ready; 116 arc_write_done_func_t *awcb_physdone; 117 arc_write_done_func_t *awcb_done; 118 arc_buf_t *awcb_buf; 119 }; 120 121 /* 122 * ARC buffers are separated into multiple structs as a memory saving measure: 123 * - Common fields struct, always defined, and embedded within it: 124 * - L2-only fields, always allocated but undefined when not in L2ARC 125 * - L1-only fields, only allocated when in L1ARC 126 * 127 * Buffer in L1 Buffer only in L2 128 * +------------------------+ +------------------------+ 129 * | arc_buf_hdr_t | | arc_buf_hdr_t | 130 * | | | | 131 * | | | | 132 * | | | | 133 * +------------------------+ +------------------------+ 134 * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | 135 * | (undefined if L1-only) | | | 136 * +------------------------+ +------------------------+ 137 * | l1arc_buf_hdr_t | 138 * | | 139 * | | 140 * | | 141 * | | 142 * +------------------------+ 143 * 144 * Because it's possible for the L2ARC to become extremely large, we can wind 145 * up eating a lot of memory in L2ARC buffer headers, so the size of a header 146 * is minimized by only allocating the fields necessary for an L1-cached buffer 147 * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and 148 * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple 149 * words in pointers. arc_hdr_realloc() is used to switch a header between 150 * these two allocation states. 151 */ 152 typedef struct l1arc_buf_hdr { 153 kmutex_t b_freeze_lock; 154 zio_cksum_t *b_freeze_cksum; 155 156 /* for waiting on reads to complete */ 157 kcondvar_t b_cv; 158 uint8_t b_byteswap; 159 160 /* protected by arc state mutex */ 161 arc_state_t *b_state; 162 multilist_node_t b_arc_node; 163 164 /* protected by hash lock */ 165 clock_t b_arc_access; 166 uint32_t b_mru_hits; 167 uint32_t b_mru_ghost_hits; 168 uint32_t b_mfu_hits; 169 uint32_t b_mfu_ghost_hits; 170 uint32_t b_bufcnt; 171 arc_buf_t *b_buf; 172 173 /* self protecting */ 174 zfs_refcount_t b_refcnt; 175 176 arc_callback_t *b_acb; 177 abd_t *b_pabd; 178 } l1arc_buf_hdr_t; 179 180 typedef enum l2arc_dev_hdr_flags_t { 181 L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */ 182 } l2arc_dev_hdr_flags_t; 183 184 /* 185 * Pointer used in persistent L2ARC (for pointing to log blocks). 186 */ 187 typedef struct l2arc_log_blkptr { 188 /* 189 * Offset of log block within the device, in bytes 190 */ 191 uint64_t lbp_daddr; 192 /* 193 * Aligned payload size (in bytes) of the log block 194 */ 195 uint64_t lbp_payload_asize; 196 /* 197 * Offset in bytes of the first buffer in the payload 198 */ 199 uint64_t lbp_payload_start; 200 /* 201 * lbp_prop has the following format: 202 * * logical size (in bytes) 203 * * aligned (after compression) size (in bytes) 204 * * compression algorithm (we always LZ4-compress l2arc logs) 205 * * checksum algorithm (used for lbp_cksum) 206 */ 207 uint64_t lbp_prop; 208 zio_cksum_t lbp_cksum; /* checksum of log */ 209 } l2arc_log_blkptr_t; 210 211 /* 212 * The persistent L2ARC device header. 213 * Byte order of magic determines whether 64-bit bswap of fields is necessary. 214 */ 215 typedef struct l2arc_dev_hdr_phys { 216 uint64_t dh_magic; /* L2ARC_DEV_HDR_MAGIC */ 217 uint64_t dh_version; /* Persistent L2ARC version */ 218 219 /* 220 * Global L2ARC device state and metadata. 221 */ 222 uint64_t dh_spa_guid; 223 uint64_t dh_vdev_guid; 224 uint64_t dh_log_entries; /* mirror of l2ad_log_entries */ 225 uint64_t dh_evict; /* evicted offset in bytes */ 226 uint64_t dh_flags; /* l2arc_dev_hdr_flags_t */ 227 /* 228 * Used in zdb.c for determining if a log block is valid, in the same 229 * way that l2arc_rebuild() does. 230 */ 231 uint64_t dh_start; /* mirror of l2ad_start */ 232 uint64_t dh_end; /* mirror of l2ad_end */ 233 /* 234 * Start of log block chain. [0] -> newest log, [1] -> one older (used 235 * for initiating prefetch). 236 */ 237 l2arc_log_blkptr_t dh_start_lbps[2]; 238 /* 239 * Aligned size of all log blocks as accounted by vdev_space_update(). 240 */ 241 uint64_t dh_lb_asize; /* mirror of l2ad_lb_asize */ 242 uint64_t dh_lb_count; /* mirror of l2ad_lb_count */ 243 /* 244 * Mirrors of vdev_trim_action_time and vdev_trim_state, used to 245 * display when the cache device was fully trimmed for the last 246 * time. 247 */ 248 uint64_t dh_trim_action_time; 249 uint64_t dh_trim_state; 250 const uint64_t dh_pad[30]; /* pad to 512 bytes */ 251 zio_eck_t dh_tail; 252 } l2arc_dev_hdr_phys_t; 253 CTASSERT_GLOBAL(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE); 254 255 /* 256 * A single ARC buffer header entry in a l2arc_log_blk_phys_t. 257 */ 258 typedef struct l2arc_log_ent_phys { 259 dva_t le_dva; /* dva of buffer */ 260 uint64_t le_birth; /* birth txg of buffer */ 261 /* 262 * le_prop has the following format: 263 * * logical size (in bytes) 264 * * physical (compressed) size (in bytes) 265 * * compression algorithm 266 * * object type (used to restore arc_buf_contents_t) 267 * * protected status (used for encryption) 268 * * prefetch status (used in l2arc_read_done()) 269 */ 270 uint64_t le_prop; 271 uint64_t le_daddr; /* buf location on l2dev */ 272 uint64_t le_complevel; 273 /* 274 * We pad the size of each entry to a power of 2 so that the size of 275 * l2arc_log_blk_phys_t is power-of-2 aligned with SPA_MINBLOCKSHIFT, 276 * because of the L2ARC_SET_*SIZE macros. 277 */ 278 const uint64_t le_pad[2]; /* pad to 64 bytes */ 279 } l2arc_log_ent_phys_t; 280 281 #define L2ARC_LOG_BLK_MAX_ENTRIES (1022) 282 283 /* 284 * A log block of up to 1022 ARC buffer log entries, chained into the 285 * persistent L2ARC metadata linked list. Byte order of magic determines 286 * whether 64-bit bswap of fields is necessary. 287 */ 288 typedef struct l2arc_log_blk_phys { 289 uint64_t lb_magic; /* L2ARC_LOG_BLK_MAGIC */ 290 /* 291 * There are 2 chains (headed by dh_start_lbps[2]), and this field 292 * points back to the previous block in this chain. We alternate 293 * which chain we append to, so they are time-wise and offset-wise 294 * interleaved, but that is an optimization rather than for 295 * correctness. 296 */ 297 l2arc_log_blkptr_t lb_prev_lbp; /* pointer to prev log block */ 298 /* 299 * Pad header section to 128 bytes 300 */ 301 uint64_t lb_pad[7]; 302 /* Payload */ 303 l2arc_log_ent_phys_t lb_entries[L2ARC_LOG_BLK_MAX_ENTRIES]; 304 } l2arc_log_blk_phys_t; /* 64K total */ 305 306 /* 307 * The size of l2arc_log_blk_phys_t has to be power-of-2 aligned with 308 * SPA_MINBLOCKSHIFT because of L2BLK_SET_*SIZE macros. 309 */ 310 CTASSERT_GLOBAL(IS_P2ALIGNED(sizeof (l2arc_log_blk_phys_t), 311 1ULL << SPA_MINBLOCKSHIFT)); 312 CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) >= SPA_MINBLOCKSIZE); 313 CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) <= SPA_MAXBLOCKSIZE); 314 315 /* 316 * These structures hold in-flight abd buffers for log blocks as they're being 317 * written to the L2ARC device. 318 */ 319 typedef struct l2arc_lb_abd_buf { 320 abd_t *abd; 321 list_node_t node; 322 } l2arc_lb_abd_buf_t; 323 324 /* 325 * These structures hold pointers to log blocks present on the L2ARC device. 326 */ 327 typedef struct l2arc_lb_ptr_buf { 328 l2arc_log_blkptr_t *lb_ptr; 329 list_node_t node; 330 } l2arc_lb_ptr_buf_t; 331 332 /* Macros for setting fields in le_prop and lbp_prop */ 333 #define L2BLK_GET_LSIZE(field) \ 334 BF64_GET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1) 335 #define L2BLK_SET_LSIZE(field, x) \ 336 BF64_SET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) 337 #define L2BLK_GET_PSIZE(field) \ 338 BF64_GET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1) 339 #define L2BLK_SET_PSIZE(field, x) \ 340 BF64_SET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) 341 #define L2BLK_GET_COMPRESS(field) \ 342 BF64_GET((field), 32, SPA_COMPRESSBITS) 343 #define L2BLK_SET_COMPRESS(field, x) \ 344 BF64_SET((field), 32, SPA_COMPRESSBITS, x) 345 #define L2BLK_GET_PREFETCH(field) BF64_GET((field), 39, 1) 346 #define L2BLK_SET_PREFETCH(field, x) BF64_SET((field), 39, 1, x) 347 #define L2BLK_GET_CHECKSUM(field) BF64_GET((field), 40, 8) 348 #define L2BLK_SET_CHECKSUM(field, x) BF64_SET((field), 40, 8, x) 349 #define L2BLK_GET_TYPE(field) BF64_GET((field), 48, 8) 350 #define L2BLK_SET_TYPE(field, x) BF64_SET((field), 48, 8, x) 351 #define L2BLK_GET_PROTECTED(field) BF64_GET((field), 56, 1) 352 #define L2BLK_SET_PROTECTED(field, x) BF64_SET((field), 56, 1, x) 353 #define L2BLK_GET_STATE(field) BF64_GET((field), 57, 4) 354 #define L2BLK_SET_STATE(field, x) BF64_SET((field), 57, 4, x) 355 356 #define PTR_SWAP(x, y) \ 357 do { \ 358 void *tmp = (x);\ 359 x = y; \ 360 y = tmp; \ 361 _NOTE(CONSTCOND)\ 362 } while (0) 363 364 #define L2ARC_DEV_HDR_MAGIC 0x5a46534341434845LLU /* ASCII: "ZFSCACHE" */ 365 #define L2ARC_LOG_BLK_MAGIC 0x4c4f47424c4b4844LLU /* ASCII: "LOGBLKHD" */ 366 367 /* 368 * L2ARC Internals 369 */ 370 typedef struct l2arc_dev { 371 vdev_t *l2ad_vdev; /* vdev */ 372 spa_t *l2ad_spa; /* spa */ 373 uint64_t l2ad_hand; /* next write location */ 374 uint64_t l2ad_start; /* first addr on device */ 375 uint64_t l2ad_end; /* last addr on device */ 376 boolean_t l2ad_first; /* first sweep through */ 377 boolean_t l2ad_writing; /* currently writing */ 378 kmutex_t l2ad_mtx; /* lock for buffer list */ 379 list_t l2ad_buflist; /* buffer list */ 380 list_node_t l2ad_node; /* device list node */ 381 zfs_refcount_t l2ad_alloc; /* allocated bytes */ 382 /* 383 * Persistence-related stuff 384 */ 385 l2arc_dev_hdr_phys_t *l2ad_dev_hdr; /* persistent device header */ 386 uint64_t l2ad_dev_hdr_asize; /* aligned hdr size */ 387 l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */ 388 int l2ad_log_ent_idx; /* index into cur log blk */ 389 /* Number of bytes in current log block's payload */ 390 uint64_t l2ad_log_blk_payload_asize; 391 /* 392 * Offset (in bytes) of the first buffer in current log block's 393 * payload. 394 */ 395 uint64_t l2ad_log_blk_payload_start; 396 /* Flag indicating whether a rebuild is scheduled or is going on */ 397 boolean_t l2ad_rebuild; 398 boolean_t l2ad_rebuild_cancel; 399 boolean_t l2ad_rebuild_began; 400 uint64_t l2ad_log_entries; /* entries per log blk */ 401 uint64_t l2ad_evict; /* evicted offset in bytes */ 402 /* List of pointers to log blocks present in the L2ARC device */ 403 list_t l2ad_lbptr_list; 404 /* 405 * Aligned size of all log blocks as accounted by vdev_space_update(). 406 */ 407 zfs_refcount_t l2ad_lb_asize; 408 /* 409 * Number of log blocks present on the device. 410 */ 411 zfs_refcount_t l2ad_lb_count; 412 boolean_t l2ad_trim_all; /* TRIM whole device */ 413 } l2arc_dev_t; 414 415 /* 416 * Encrypted blocks will need to be stored encrypted on the L2ARC 417 * disk as they appear in the main pool. In order for this to work we 418 * need to pass around the encryption parameters so they can be used 419 * to write data to the L2ARC. This struct is only defined in the 420 * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED 421 * flag set. 422 */ 423 typedef struct arc_buf_hdr_crypt { 424 abd_t *b_rabd; /* raw encrypted data */ 425 dmu_object_type_t b_ot; /* object type */ 426 uint32_t b_ebufcnt; /* count of encrypted buffers */ 427 428 /* dsobj for looking up encryption key for l2arc encryption */ 429 uint64_t b_dsobj; 430 431 /* encryption parameters */ 432 uint8_t b_salt[ZIO_DATA_SALT_LEN]; 433 uint8_t b_iv[ZIO_DATA_IV_LEN]; 434 435 /* 436 * Technically this could be removed since we will always be able to 437 * get the mac from the bp when we need it. However, it is inconvenient 438 * for callers of arc code to have to pass a bp in all the time. This 439 * also allows us to assert that L2ARC data is properly encrypted to 440 * match the data in the main storage pool. 441 */ 442 uint8_t b_mac[ZIO_DATA_MAC_LEN]; 443 } arc_buf_hdr_crypt_t; 444 445 typedef struct l2arc_buf_hdr { 446 /* protected by arc_buf_hdr mutex */ 447 l2arc_dev_t *b_dev; /* L2ARC device */ 448 uint64_t b_daddr; /* disk address, offset byte */ 449 uint32_t b_hits; 450 arc_state_type_t b_arcs_state; 451 list_node_t b_l2node; 452 } l2arc_buf_hdr_t; 453 454 typedef struct l2arc_write_callback { 455 l2arc_dev_t *l2wcb_dev; /* device info */ 456 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 457 /* in-flight list of log blocks */ 458 list_t l2wcb_abd_list; 459 } l2arc_write_callback_t; 460 461 struct arc_buf_hdr { 462 /* protected by hash lock */ 463 dva_t b_dva; 464 uint64_t b_birth; 465 466 arc_buf_contents_t b_type; 467 uint8_t b_complevel; 468 uint8_t b_reserved1; /* used for 4 byte alignment */ 469 uint16_t b_reserved2; /* used for 4 byte alignment */ 470 arc_buf_hdr_t *b_hash_next; 471 arc_flags_t b_flags; 472 473 /* 474 * This field stores the size of the data buffer after 475 * compression, and is set in the arc's zio completion handlers. 476 * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). 477 * 478 * While the block pointers can store up to 32MB in their psize 479 * field, we can only store up to 32MB minus 512B. This is due 480 * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. 481 * a field of zeros represents 512B in the bp). We can't use a 482 * bias of 1 since we need to reserve a psize of zero, here, to 483 * represent holes and embedded blocks. 484 * 485 * This isn't a problem in practice, since the maximum size of a 486 * buffer is limited to 16MB, so we never need to store 32MB in 487 * this field. Even in the upstream illumos code base, the 488 * maximum size of a buffer is limited to 16MB. 489 */ 490 uint16_t b_psize; 491 492 /* 493 * This field stores the size of the data buffer before 494 * compression, and cannot change once set. It is in units 495 * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) 496 */ 497 uint16_t b_lsize; /* immutable */ 498 uint64_t b_spa; /* immutable */ 499 500 /* L2ARC fields. Undefined when not in L2ARC. */ 501 l2arc_buf_hdr_t b_l2hdr; 502 /* L1ARC fields. Undefined when in l2arc_only state */ 503 l1arc_buf_hdr_t b_l1hdr; 504 /* 505 * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED 506 * is set and the L1 header exists. 507 */ 508 arc_buf_hdr_crypt_t b_crypt_hdr; 509 }; 510 511 typedef struct arc_stats { 512 kstat_named_t arcstat_hits; 513 kstat_named_t arcstat_misses; 514 kstat_named_t arcstat_demand_data_hits; 515 kstat_named_t arcstat_demand_data_misses; 516 kstat_named_t arcstat_demand_metadata_hits; 517 kstat_named_t arcstat_demand_metadata_misses; 518 kstat_named_t arcstat_prefetch_data_hits; 519 kstat_named_t arcstat_prefetch_data_misses; 520 kstat_named_t arcstat_prefetch_metadata_hits; 521 kstat_named_t arcstat_prefetch_metadata_misses; 522 kstat_named_t arcstat_mru_hits; 523 kstat_named_t arcstat_mru_ghost_hits; 524 kstat_named_t arcstat_mfu_hits; 525 kstat_named_t arcstat_mfu_ghost_hits; 526 kstat_named_t arcstat_deleted; 527 /* 528 * Number of buffers that could not be evicted because the hash lock 529 * was held by another thread. The lock may not necessarily be held 530 * by something using the same buffer, since hash locks are shared 531 * by multiple buffers. 532 */ 533 kstat_named_t arcstat_mutex_miss; 534 /* 535 * Number of buffers skipped when updating the access state due to the 536 * header having already been released after acquiring the hash lock. 537 */ 538 kstat_named_t arcstat_access_skip; 539 /* 540 * Number of buffers skipped because they have I/O in progress, are 541 * indirect prefetch buffers that have not lived long enough, or are 542 * not from the spa we're trying to evict from. 543 */ 544 kstat_named_t arcstat_evict_skip; 545 /* 546 * Number of times arc_evict_state() was unable to evict enough 547 * buffers to reach its target amount. 548 */ 549 kstat_named_t arcstat_evict_not_enough; 550 kstat_named_t arcstat_evict_l2_cached; 551 kstat_named_t arcstat_evict_l2_eligible; 552 kstat_named_t arcstat_evict_l2_eligible_mfu; 553 kstat_named_t arcstat_evict_l2_eligible_mru; 554 kstat_named_t arcstat_evict_l2_ineligible; 555 kstat_named_t arcstat_evict_l2_skip; 556 kstat_named_t arcstat_hash_elements; 557 kstat_named_t arcstat_hash_elements_max; 558 kstat_named_t arcstat_hash_collisions; 559 kstat_named_t arcstat_hash_chains; 560 kstat_named_t arcstat_hash_chain_max; 561 kstat_named_t arcstat_p; 562 kstat_named_t arcstat_c; 563 kstat_named_t arcstat_c_min; 564 kstat_named_t arcstat_c_max; 565 kstat_named_t arcstat_size; 566 /* 567 * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. 568 * Note that the compressed bytes may match the uncompressed bytes 569 * if the block is either not compressed or compressed arc is disabled. 570 */ 571 kstat_named_t arcstat_compressed_size; 572 /* 573 * Uncompressed size of the data stored in b_pabd. If compressed 574 * arc is disabled then this value will be identical to the stat 575 * above. 576 */ 577 kstat_named_t arcstat_uncompressed_size; 578 /* 579 * Number of bytes stored in all the arc_buf_t's. This is classified 580 * as "overhead" since this data is typically short-lived and will 581 * be evicted from the arc when it becomes unreferenced unless the 582 * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level 583 * values have been set (see comment in dbuf.c for more information). 584 */ 585 kstat_named_t arcstat_overhead_size; 586 /* 587 * Number of bytes consumed by internal ARC structures necessary 588 * for tracking purposes; these structures are not actually 589 * backed by ARC buffers. This includes arc_buf_hdr_t structures 590 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only 591 * caches), and arc_buf_t structures (allocated via arc_buf_t 592 * cache). 593 */ 594 kstat_named_t arcstat_hdr_size; 595 /* 596 * Number of bytes consumed by ARC buffers of type equal to 597 * ARC_BUFC_DATA. This is generally consumed by buffers backing 598 * on disk user data (e.g. plain file contents). 599 */ 600 kstat_named_t arcstat_data_size; 601 /* 602 * Number of bytes consumed by ARC buffers of type equal to 603 * ARC_BUFC_METADATA. This is generally consumed by buffers 604 * backing on disk data that is used for internal ZFS 605 * structures (e.g. ZAP, dnode, indirect blocks, etc). 606 */ 607 kstat_named_t arcstat_metadata_size; 608 /* 609 * Number of bytes consumed by dmu_buf_impl_t objects. 610 */ 611 kstat_named_t arcstat_dbuf_size; 612 /* 613 * Number of bytes consumed by dnode_t objects. 614 */ 615 kstat_named_t arcstat_dnode_size; 616 /* 617 * Number of bytes consumed by bonus buffers. 618 */ 619 kstat_named_t arcstat_bonus_size; 620 #if defined(COMPAT_FREEBSD11) 621 /* 622 * Sum of the previous three counters, provided for compatibility. 623 */ 624 kstat_named_t arcstat_other_size; 625 #endif 626 627 /* 628 * Total number of bytes consumed by ARC buffers residing in the 629 * arc_anon state. This includes *all* buffers in the arc_anon 630 * state; e.g. data, metadata, evictable, and unevictable buffers 631 * are all included in this value. 632 */ 633 kstat_named_t arcstat_anon_size; 634 /* 635 * Number of bytes consumed by ARC buffers that meet the 636 * following criteria: backing buffers of type ARC_BUFC_DATA, 637 * residing in the arc_anon state, and are eligible for eviction 638 * (e.g. have no outstanding holds on the buffer). 639 */ 640 kstat_named_t arcstat_anon_evictable_data; 641 /* 642 * Number of bytes consumed by ARC buffers that meet the 643 * following criteria: backing buffers of type ARC_BUFC_METADATA, 644 * residing in the arc_anon state, and are eligible for eviction 645 * (e.g. have no outstanding holds on the buffer). 646 */ 647 kstat_named_t arcstat_anon_evictable_metadata; 648 /* 649 * Total number of bytes consumed by ARC buffers residing in the 650 * arc_mru state. This includes *all* buffers in the arc_mru 651 * state; e.g. data, metadata, evictable, and unevictable buffers 652 * are all included in this value. 653 */ 654 kstat_named_t arcstat_mru_size; 655 /* 656 * Number of bytes consumed by ARC buffers that meet the 657 * following criteria: backing buffers of type ARC_BUFC_DATA, 658 * residing in the arc_mru state, and are eligible for eviction 659 * (e.g. have no outstanding holds on the buffer). 660 */ 661 kstat_named_t arcstat_mru_evictable_data; 662 /* 663 * Number of bytes consumed by ARC buffers that meet the 664 * following criteria: backing buffers of type ARC_BUFC_METADATA, 665 * residing in the arc_mru state, and are eligible for eviction 666 * (e.g. have no outstanding holds on the buffer). 667 */ 668 kstat_named_t arcstat_mru_evictable_metadata; 669 /* 670 * Total number of bytes that *would have been* consumed by ARC 671 * buffers in the arc_mru_ghost state. The key thing to note 672 * here, is the fact that this size doesn't actually indicate 673 * RAM consumption. The ghost lists only consist of headers and 674 * don't actually have ARC buffers linked off of these headers. 675 * Thus, *if* the headers had associated ARC buffers, these 676 * buffers *would have* consumed this number of bytes. 677 */ 678 kstat_named_t arcstat_mru_ghost_size; 679 /* 680 * Number of bytes that *would have been* consumed by ARC 681 * buffers that are eligible for eviction, of type 682 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. 683 */ 684 kstat_named_t arcstat_mru_ghost_evictable_data; 685 /* 686 * Number of bytes that *would have been* consumed by ARC 687 * buffers that are eligible for eviction, of type 688 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 689 */ 690 kstat_named_t arcstat_mru_ghost_evictable_metadata; 691 /* 692 * Total number of bytes consumed by ARC buffers residing in the 693 * arc_mfu state. This includes *all* buffers in the arc_mfu 694 * state; e.g. data, metadata, evictable, and unevictable buffers 695 * are all included in this value. 696 */ 697 kstat_named_t arcstat_mfu_size; 698 /* 699 * Number of bytes consumed by ARC buffers that are eligible for 700 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu 701 * state. 702 */ 703 kstat_named_t arcstat_mfu_evictable_data; 704 /* 705 * Number of bytes consumed by ARC buffers that are eligible for 706 * eviction, of type ARC_BUFC_METADATA, and reside in the 707 * arc_mfu state. 708 */ 709 kstat_named_t arcstat_mfu_evictable_metadata; 710 /* 711 * Total number of bytes that *would have been* consumed by ARC 712 * buffers in the arc_mfu_ghost state. See the comment above 713 * arcstat_mru_ghost_size for more details. 714 */ 715 kstat_named_t arcstat_mfu_ghost_size; 716 /* 717 * Number of bytes that *would have been* consumed by ARC 718 * buffers that are eligible for eviction, of type 719 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. 720 */ 721 kstat_named_t arcstat_mfu_ghost_evictable_data; 722 /* 723 * Number of bytes that *would have been* consumed by ARC 724 * buffers that are eligible for eviction, of type 725 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 726 */ 727 kstat_named_t arcstat_mfu_ghost_evictable_metadata; 728 kstat_named_t arcstat_l2_hits; 729 kstat_named_t arcstat_l2_misses; 730 /* 731 * Allocated size (in bytes) of L2ARC cached buffers by ARC state. 732 */ 733 kstat_named_t arcstat_l2_prefetch_asize; 734 kstat_named_t arcstat_l2_mru_asize; 735 kstat_named_t arcstat_l2_mfu_asize; 736 /* 737 * Allocated size (in bytes) of L2ARC cached buffers by buffer content 738 * type. 739 */ 740 kstat_named_t arcstat_l2_bufc_data_asize; 741 kstat_named_t arcstat_l2_bufc_metadata_asize; 742 kstat_named_t arcstat_l2_feeds; 743 kstat_named_t arcstat_l2_rw_clash; 744 kstat_named_t arcstat_l2_read_bytes; 745 kstat_named_t arcstat_l2_write_bytes; 746 kstat_named_t arcstat_l2_writes_sent; 747 kstat_named_t arcstat_l2_writes_done; 748 kstat_named_t arcstat_l2_writes_error; 749 kstat_named_t arcstat_l2_writes_lock_retry; 750 kstat_named_t arcstat_l2_evict_lock_retry; 751 kstat_named_t arcstat_l2_evict_reading; 752 kstat_named_t arcstat_l2_evict_l1cached; 753 kstat_named_t arcstat_l2_free_on_write; 754 kstat_named_t arcstat_l2_abort_lowmem; 755 kstat_named_t arcstat_l2_cksum_bad; 756 kstat_named_t arcstat_l2_io_error; 757 kstat_named_t arcstat_l2_lsize; 758 kstat_named_t arcstat_l2_psize; 759 kstat_named_t arcstat_l2_hdr_size; 760 /* 761 * Number of L2ARC log blocks written. These are used for restoring the 762 * L2ARC. Updated during writing of L2ARC log blocks. 763 */ 764 kstat_named_t arcstat_l2_log_blk_writes; 765 /* 766 * Moving average of the aligned size of the L2ARC log blocks, in 767 * bytes. Updated during L2ARC rebuild and during writing of L2ARC 768 * log blocks. 769 */ 770 kstat_named_t arcstat_l2_log_blk_avg_asize; 771 /* Aligned size of L2ARC log blocks on L2ARC devices. */ 772 kstat_named_t arcstat_l2_log_blk_asize; 773 /* Number of L2ARC log blocks present on L2ARC devices. */ 774 kstat_named_t arcstat_l2_log_blk_count; 775 /* 776 * Moving average of the aligned size of L2ARC restored data, in bytes, 777 * to the aligned size of their metadata in L2ARC, in bytes. 778 * Updated during L2ARC rebuild and during writing of L2ARC log blocks. 779 */ 780 kstat_named_t arcstat_l2_data_to_meta_ratio; 781 /* 782 * Number of times the L2ARC rebuild was successful for an L2ARC device. 783 */ 784 kstat_named_t arcstat_l2_rebuild_success; 785 /* 786 * Number of times the L2ARC rebuild failed because the device header 787 * was in an unsupported format or corrupted. 788 */ 789 kstat_named_t arcstat_l2_rebuild_abort_unsupported; 790 /* 791 * Number of times the L2ARC rebuild failed because of IO errors 792 * while reading a log block. 793 */ 794 kstat_named_t arcstat_l2_rebuild_abort_io_errors; 795 /* 796 * Number of times the L2ARC rebuild failed because of IO errors when 797 * reading the device header. 798 */ 799 kstat_named_t arcstat_l2_rebuild_abort_dh_errors; 800 /* 801 * Number of L2ARC log blocks which failed to be restored due to 802 * checksum errors. 803 */ 804 kstat_named_t arcstat_l2_rebuild_abort_cksum_lb_errors; 805 /* 806 * Number of times the L2ARC rebuild was aborted due to low system 807 * memory. 808 */ 809 kstat_named_t arcstat_l2_rebuild_abort_lowmem; 810 /* Logical size of L2ARC restored data, in bytes. */ 811 kstat_named_t arcstat_l2_rebuild_size; 812 /* Aligned size of L2ARC restored data, in bytes. */ 813 kstat_named_t arcstat_l2_rebuild_asize; 814 /* 815 * Number of L2ARC log entries (buffers) that were successfully 816 * restored in ARC. 817 */ 818 kstat_named_t arcstat_l2_rebuild_bufs; 819 /* 820 * Number of L2ARC log entries (buffers) already cached in ARC. These 821 * were not restored again. 822 */ 823 kstat_named_t arcstat_l2_rebuild_bufs_precached; 824 /* 825 * Number of L2ARC log blocks that were restored successfully. Each 826 * log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers. 827 */ 828 kstat_named_t arcstat_l2_rebuild_log_blks; 829 kstat_named_t arcstat_memory_throttle_count; 830 kstat_named_t arcstat_memory_direct_count; 831 kstat_named_t arcstat_memory_indirect_count; 832 kstat_named_t arcstat_memory_all_bytes; 833 kstat_named_t arcstat_memory_free_bytes; 834 kstat_named_t arcstat_memory_available_bytes; 835 kstat_named_t arcstat_no_grow; 836 kstat_named_t arcstat_tempreserve; 837 kstat_named_t arcstat_loaned_bytes; 838 kstat_named_t arcstat_prune; 839 kstat_named_t arcstat_meta_used; 840 kstat_named_t arcstat_meta_limit; 841 kstat_named_t arcstat_dnode_limit; 842 kstat_named_t arcstat_meta_max; 843 kstat_named_t arcstat_meta_min; 844 kstat_named_t arcstat_async_upgrade_sync; 845 kstat_named_t arcstat_demand_hit_predictive_prefetch; 846 kstat_named_t arcstat_demand_hit_prescient_prefetch; 847 kstat_named_t arcstat_need_free; 848 kstat_named_t arcstat_sys_free; 849 kstat_named_t arcstat_raw_size; 850 kstat_named_t arcstat_cached_only_in_progress; 851 kstat_named_t arcstat_abd_chunk_waste_size; 852 } arc_stats_t; 853 854 typedef struct arc_sums { 855 wmsum_t arcstat_hits; 856 wmsum_t arcstat_misses; 857 wmsum_t arcstat_demand_data_hits; 858 wmsum_t arcstat_demand_data_misses; 859 wmsum_t arcstat_demand_metadata_hits; 860 wmsum_t arcstat_demand_metadata_misses; 861 wmsum_t arcstat_prefetch_data_hits; 862 wmsum_t arcstat_prefetch_data_misses; 863 wmsum_t arcstat_prefetch_metadata_hits; 864 wmsum_t arcstat_prefetch_metadata_misses; 865 wmsum_t arcstat_mru_hits; 866 wmsum_t arcstat_mru_ghost_hits; 867 wmsum_t arcstat_mfu_hits; 868 wmsum_t arcstat_mfu_ghost_hits; 869 wmsum_t arcstat_deleted; 870 wmsum_t arcstat_mutex_miss; 871 wmsum_t arcstat_access_skip; 872 wmsum_t arcstat_evict_skip; 873 wmsum_t arcstat_evict_not_enough; 874 wmsum_t arcstat_evict_l2_cached; 875 wmsum_t arcstat_evict_l2_eligible; 876 wmsum_t arcstat_evict_l2_eligible_mfu; 877 wmsum_t arcstat_evict_l2_eligible_mru; 878 wmsum_t arcstat_evict_l2_ineligible; 879 wmsum_t arcstat_evict_l2_skip; 880 wmsum_t arcstat_hash_collisions; 881 wmsum_t arcstat_hash_chains; 882 aggsum_t arcstat_size; 883 wmsum_t arcstat_compressed_size; 884 wmsum_t arcstat_uncompressed_size; 885 wmsum_t arcstat_overhead_size; 886 wmsum_t arcstat_hdr_size; 887 wmsum_t arcstat_data_size; 888 wmsum_t arcstat_metadata_size; 889 wmsum_t arcstat_dbuf_size; 890 aggsum_t arcstat_dnode_size; 891 wmsum_t arcstat_bonus_size; 892 wmsum_t arcstat_l2_hits; 893 wmsum_t arcstat_l2_misses; 894 wmsum_t arcstat_l2_prefetch_asize; 895 wmsum_t arcstat_l2_mru_asize; 896 wmsum_t arcstat_l2_mfu_asize; 897 wmsum_t arcstat_l2_bufc_data_asize; 898 wmsum_t arcstat_l2_bufc_metadata_asize; 899 wmsum_t arcstat_l2_feeds; 900 wmsum_t arcstat_l2_rw_clash; 901 wmsum_t arcstat_l2_read_bytes; 902 wmsum_t arcstat_l2_write_bytes; 903 wmsum_t arcstat_l2_writes_sent; 904 wmsum_t arcstat_l2_writes_done; 905 wmsum_t arcstat_l2_writes_error; 906 wmsum_t arcstat_l2_writes_lock_retry; 907 wmsum_t arcstat_l2_evict_lock_retry; 908 wmsum_t arcstat_l2_evict_reading; 909 wmsum_t arcstat_l2_evict_l1cached; 910 wmsum_t arcstat_l2_free_on_write; 911 wmsum_t arcstat_l2_abort_lowmem; 912 wmsum_t arcstat_l2_cksum_bad; 913 wmsum_t arcstat_l2_io_error; 914 wmsum_t arcstat_l2_lsize; 915 wmsum_t arcstat_l2_psize; 916 aggsum_t arcstat_l2_hdr_size; 917 wmsum_t arcstat_l2_log_blk_writes; 918 wmsum_t arcstat_l2_log_blk_asize; 919 wmsum_t arcstat_l2_log_blk_count; 920 wmsum_t arcstat_l2_rebuild_success; 921 wmsum_t arcstat_l2_rebuild_abort_unsupported; 922 wmsum_t arcstat_l2_rebuild_abort_io_errors; 923 wmsum_t arcstat_l2_rebuild_abort_dh_errors; 924 wmsum_t arcstat_l2_rebuild_abort_cksum_lb_errors; 925 wmsum_t arcstat_l2_rebuild_abort_lowmem; 926 wmsum_t arcstat_l2_rebuild_size; 927 wmsum_t arcstat_l2_rebuild_asize; 928 wmsum_t arcstat_l2_rebuild_bufs; 929 wmsum_t arcstat_l2_rebuild_bufs_precached; 930 wmsum_t arcstat_l2_rebuild_log_blks; 931 wmsum_t arcstat_memory_throttle_count; 932 wmsum_t arcstat_memory_direct_count; 933 wmsum_t arcstat_memory_indirect_count; 934 wmsum_t arcstat_prune; 935 aggsum_t arcstat_meta_used; 936 wmsum_t arcstat_async_upgrade_sync; 937 wmsum_t arcstat_demand_hit_predictive_prefetch; 938 wmsum_t arcstat_demand_hit_prescient_prefetch; 939 wmsum_t arcstat_raw_size; 940 wmsum_t arcstat_cached_only_in_progress; 941 wmsum_t arcstat_abd_chunk_waste_size; 942 } arc_sums_t; 943 944 typedef struct arc_evict_waiter { 945 list_node_t aew_node; 946 kcondvar_t aew_cv; 947 uint64_t aew_count; 948 } arc_evict_waiter_t; 949 950 #define ARCSTAT(stat) (arc_stats.stat.value.ui64) 951 952 #define ARCSTAT_INCR(stat, val) \ 953 wmsum_add(&arc_sums.stat, (val)) 954 955 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 956 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 957 958 #define arc_no_grow ARCSTAT(arcstat_no_grow) /* do not grow cache size */ 959 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 960 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 961 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 962 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 963 #define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */ 964 965 #define arc_anon (&ARC_anon) 966 #define arc_mru (&ARC_mru) 967 #define arc_mru_ghost (&ARC_mru_ghost) 968 #define arc_mfu (&ARC_mfu) 969 #define arc_mfu_ghost (&ARC_mfu_ghost) 970 #define arc_l2c_only (&ARC_l2c_only) 971 972 extern taskq_t *arc_prune_taskq; 973 extern arc_stats_t arc_stats; 974 extern arc_sums_t arc_sums; 975 extern hrtime_t arc_growtime; 976 extern boolean_t arc_warm; 977 extern int arc_grow_retry; 978 extern int arc_no_grow_shift; 979 extern int arc_shrink_shift; 980 extern kmutex_t arc_prune_mtx; 981 extern list_t arc_prune_list; 982 extern arc_state_t ARC_mfu; 983 extern arc_state_t ARC_mru; 984 extern uint_t zfs_arc_pc_percent; 985 extern int arc_lotsfree_percent; 986 extern unsigned long zfs_arc_min; 987 extern unsigned long zfs_arc_max; 988 989 extern void arc_reduce_target_size(int64_t to_free); 990 extern boolean_t arc_reclaim_needed(void); 991 extern void arc_kmem_reap_soon(void); 992 extern void arc_wait_for_eviction(uint64_t, boolean_t); 993 994 extern void arc_lowmem_init(void); 995 extern void arc_lowmem_fini(void); 996 extern void arc_prune_async(int64_t); 997 extern int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg); 998 extern uint64_t arc_free_memory(void); 999 extern int64_t arc_available_memory(void); 1000 extern void arc_tuning_update(boolean_t); 1001 extern void arc_register_hotplug(void); 1002 extern void arc_unregister_hotplug(void); 1003 1004 extern int param_set_arc_long(ZFS_MODULE_PARAM_ARGS); 1005 extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS); 1006 extern int param_set_arc_min(ZFS_MODULE_PARAM_ARGS); 1007 extern int param_set_arc_max(ZFS_MODULE_PARAM_ARGS); 1008 1009 /* used in zdb.c */ 1010 boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, 1011 const l2arc_log_blkptr_t *lbp); 1012 1013 /* used in vdev_trim.c */ 1014 void l2arc_dev_hdr_update(l2arc_dev_t *dev); 1015 l2arc_dev_t *l2arc_vdev_get(vdev_t *vd); 1016 1017 #ifdef __cplusplus 1018 } 1019 #endif 1020 1021 #endif /* _SYS_ARC_IMPL_H */ 1022