/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, Delphix. All rights reserved.
 * Copyright (c) 2013, Saso Kiselkov. All rights reserved.
 * Copyright (c) 2013, Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2020, George Amanakis. All rights reserved.
 */

#ifndef _SYS_ARC_IMPL_H
#define	_SYS_ARC_IMPL_H

#include <sys/arc.h>
#include <sys/zio_crypt.h>
#include <sys/zthr.h>
#include <sys/aggsum.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to the buffer, it is
 * linked onto a list in one of these arc states.  These are
 * the only buffers that can be evicted or deleted.  Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists.  The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places.  The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
 */
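
/*
 * Illustrative sketch only (not used by the ARC code): the ghost states
 * described above track recently evicted headers that carry no data.
 * The state names map onto the arc_state_type_t enum from <sys/arc.h>.
 */
static inline boolean_t
arc_state_is_ghost_example(arc_state_type_t type)
{
	/* ARC_mru_ghost and ARC_mfu_ghost are the two ghost states. */
	return (type == ARC_STATE_MRU_GHOST || type == ARC_STATE_MFU_GHOST);
}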

typedef struct arc_state {
	/*
	 * list of evictable buffers
	 */
	multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
	/*
	 * total amount of evictable data in this state
	 */
	zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
	/*
	 * total amount of data in this state; this includes: evictable,
	 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
	 */
	zfs_refcount_t arcs_size;
	/*
	 * supports the "dbufs" kstat
	 */
	arc_state_type_t arcs_state;
} arc_state_t;

typedef struct arc_callback arc_callback_t;

struct arc_callback {
	void			*acb_private;
	arc_read_done_func_t	*acb_done;
	arc_buf_t		*acb_buf;
	boolean_t		acb_encrypted;
	boolean_t		acb_compressed;
	boolean_t		acb_noauth;
	boolean_t		acb_nobuf;
	zbookmark_phys_t	acb_zb;
	zio_t			*acb_zio_dummy;
	zio_t			*acb_zio_head;
	arc_callback_t		*acb_next;
};

typedef struct arc_write_callback arc_write_callback_t;

struct arc_write_callback {
	void			*awcb_private;
	arc_write_done_func_t	*awcb_ready;
	arc_write_done_func_t	*awcb_children_ready;
	arc_write_done_func_t	*awcb_physdone;
	arc_write_done_func_t	*awcb_done;
	arc_buf_t		*awcb_buf;
};

/*
 * ARC buffers are separated into multiple structs as a memory saving measure:
 *   - Common fields struct, always defined, and embedded within it:
 *       - L2-only fields, always allocated but undefined when not in L2ARC
 *       - L1-only fields, only allocated when in L1ARC
 *
 *           Buffer in L1                     Buffer only in L2
 *    +------------------------+          +------------------------+
 *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    +------------------------+          +------------------------+
 *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
 *    | (undefined if L1-only) |          |                        |
 *    +------------------------+          +------------------------+
 *    | l1arc_buf_hdr_t        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    +------------------------+
 *
 * Because it's possible for the L2ARC to become extremely large, we can wind
 * up eating a lot of memory in L2ARC buffer headers, so the size of a header
 * is minimized by only allocating the fields necessary for an L1-cached buffer
 * when a header is actually in the L1 cache.  The sub-headers (l1arc_buf_hdr
 * and l2arc_buf_hdr) are embedded rather than allocated separately, saving a
 * couple of words of pointer overhead.  arc_hdr_realloc() is used to switch a
 * header between these two allocation states.
 */
typedef struct l1arc_buf_hdr {
	kmutex_t		b_freeze_lock;
	zio_cksum_t		*b_freeze_cksum;

	arc_buf_t		*b_buf;
	uint32_t		b_bufcnt;
	/* for waiting on writes to complete */
	kcondvar_t		b_cv;
	uint8_t			b_byteswap;


	/* protected by arc state mutex */
	arc_state_t		*b_state;
	multilist_node_t	b_arc_node;

	/* updated atomically */
	clock_t			b_arc_access;
	uint32_t		b_mru_hits;
	uint32_t		b_mru_ghost_hits;
	uint32_t		b_mfu_hits;
	uint32_t		b_mfu_ghost_hits;
	uint32_t		b_l2_hits;

	/* self protecting */
	zfs_refcount_t		b_refcnt;

	arc_callback_t		*b_acb;
	abd_t			*b_pabd;
} l1arc_buf_hdr_t;

typedef enum l2arc_dev_hdr_flags_t {
	L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0)	/* mirror of l2ad_first */
} l2arc_dev_hdr_flags_t;

/*
 * Pointer used in persistent L2ARC (for pointing to log blocks).
 */
typedef struct l2arc_log_blkptr {
	/*
	 * Offset of log block within the device, in bytes
	 */
	uint64_t	lbp_daddr;
	/*
	 * Aligned payload size (in bytes) of the log block
	 */
	uint64_t	lbp_payload_asize;
	/*
	 * Offset in bytes of the first buffer in the payload
	 */
	uint64_t	lbp_payload_start;
	/*
	 * lbp_prop has the following format:
	 *	* logical size (in bytes)
	 *	* aligned (after compression) size (in bytes)
	 *	* compression algorithm (we always LZ4-compress l2arc logs)
	 *	* checksum algorithm (used for lbp_cksum)
	 */
	uint64_t	lbp_prop;
	zio_cksum_t	lbp_cksum;	/* checksum of log */
} l2arc_log_blkptr_t;

/*
 * The persistent L2ARC device header.
 * Byte order of magic determines whether 64-bit bswap of fields is necessary.
 */
typedef struct l2arc_dev_hdr_phys {
	uint64_t	dh_magic;	/* L2ARC_DEV_HDR_MAGIC */
	uint64_t	dh_version;	/* Persistent L2ARC version */

	/*
	 * Global L2ARC device state and metadata.
	 */
	uint64_t	dh_spa_guid;
	uint64_t	dh_vdev_guid;
	uint64_t	dh_log_entries;		/* mirror of l2ad_log_entries */
	uint64_t	dh_evict;		/* evicted offset in bytes */
	uint64_t	dh_flags;		/* l2arc_dev_hdr_flags_t */
	/*
	 * Used in zdb.c for determining if a log block is valid, in the same
	 * way that l2arc_rebuild() does.
	 */
	uint64_t	dh_start;		/* mirror of l2ad_start */
	uint64_t	dh_end;			/* mirror of l2ad_end */
	/*
	 * Start of log block chain. [0] -> newest log, [1] -> one older (used
	 * for initiating prefetch).
	 */
	l2arc_log_blkptr_t	dh_start_lbps[2];
	/*
	 * Aligned size of all log blocks as accounted by vdev_space_update().
	 */
	uint64_t	dh_lb_asize;		/* mirror of l2ad_lb_asize */
	uint64_t	dh_lb_count;		/* mirror of l2ad_lb_count */
	/*
	 * Mirrors of vdev_trim_action_time and vdev_trim_state, used to
	 * display when the cache device was fully trimmed for the last
	 * time.
	 */
	uint64_t	dh_trim_action_time;
	uint64_t	dh_trim_state;
	const uint64_t	dh_pad[30];		/* pad to 512 bytes */
	zio_eck_t	dh_tail;
} l2arc_dev_hdr_phys_t;
CTASSERT_GLOBAL(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);

/*
 * A single ARC buffer header entry in a l2arc_log_blk_phys_t.
 */
typedef struct l2arc_log_ent_phys {
	dva_t		le_dva;		/* dva of buffer */
	uint64_t	le_birth;	/* birth txg of buffer */
	/*
	 * le_prop has the following format:
	 *	* logical size (in bytes)
	 *	* physical (compressed) size (in bytes)
	 *	* compression algorithm
	 *	* object type (used to restore arc_buf_contents_t)
	 *	* protected status (used for encryption)
	 *	* prefetch status (used in l2arc_read_done())
	 */
	uint64_t	le_prop;
	uint64_t	le_daddr;	/* buf location on l2dev */
	uint64_t	le_complevel;
	/*
	 * We pad the size of each entry to a power of 2 so that the size of
	 * l2arc_log_blk_phys_t is power-of-2 aligned with SPA_MINBLOCKSHIFT,
	 * because of the L2BLK_SET_*SIZE macros.
	 */
	const uint64_t	le_pad[2];	/* pad to 64 bytes */
} l2arc_log_ent_phys_t;

#define	L2ARC_LOG_BLK_MAX_ENTRIES	(1022)

/*
 * A log block of up to 1022 ARC buffer log entries, chained into the
 * persistent L2ARC metadata linked list.  Byte order of magic determines
 * whether 64-bit bswap of fields is necessary.
 */
typedef struct l2arc_log_blk_phys {
	uint64_t	lb_magic;	/* L2ARC_LOG_BLK_MAGIC */
	/*
	 * There are 2 chains (headed by dh_start_lbps[2]), and this field
	 * points back to the previous block in this chain.  We alternate
	 * which chain we append to, so they are time-wise and offset-wise
	 * interleaved, but that is an optimization rather than for
	 * correctness.
	 */
	l2arc_log_blkptr_t	lb_prev_lbp;	/* pointer to prev log block */
	/*
	 * Pad header section to 128 bytes
	 */
	uint64_t	lb_pad[7];
	/* Payload */
	l2arc_log_ent_phys_t	lb_entries[L2ARC_LOG_BLK_MAX_ENTRIES];
} l2arc_log_blk_phys_t;			/* 64K total */

/*
 * The size of l2arc_log_blk_phys_t has to be power-of-2 aligned with
 * SPA_MINBLOCKSHIFT because of L2BLK_SET_*SIZE macros.
 */
CTASSERT_GLOBAL(IS_P2ALIGNED(sizeof (l2arc_log_blk_phys_t),
    1ULL << SPA_MINBLOCKSHIFT));
CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) >= SPA_MINBLOCKSIZE);
CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) <= SPA_MAXBLOCKSIZE);
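
/*
 * Illustrative sanity checks, a sketch of the arithmetic behind the 64K
 * figure above: the block consists of a 128-byte header (8-byte magic +
 * 64-byte lb_prev_lbp + 56 bytes of padding) followed by 1022 entries of
 * 64 bytes each, i.e. 128 + 1022 * 64 = 65536 bytes.
 */
CTASSERT_GLOBAL(sizeof (l2arc_log_ent_phys_t) == 64);
CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) ==
    128 + L2ARC_LOG_BLK_MAX_ENTRIES * sizeof (l2arc_log_ent_phys_t));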

/*
 * These structures hold in-flight abd buffers for log blocks as they're being
 * written to the L2ARC device.
 */
typedef struct l2arc_lb_abd_buf {
	abd_t		*abd;
	list_node_t	node;
} l2arc_lb_abd_buf_t;

/*
 * These structures hold pointers to log blocks present on the L2ARC device.
 */
typedef struct l2arc_lb_ptr_buf {
	l2arc_log_blkptr_t	*lb_ptr;
	list_node_t		node;
} l2arc_lb_ptr_buf_t;

/* Macros for setting fields in le_prop and lbp_prop */
#define	L2BLK_GET_LSIZE(field)	\
	BF64_GET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)
#define	L2BLK_SET_LSIZE(field, x)	\
	BF64_SET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
#define	L2BLK_GET_PSIZE(field)	\
	BF64_GET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)
#define	L2BLK_SET_PSIZE(field, x)	\
	BF64_SET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
#define	L2BLK_GET_COMPRESS(field)	\
	BF64_GET((field), 32, SPA_COMPRESSBITS)
#define	L2BLK_SET_COMPRESS(field, x)	\
	BF64_SET((field), 32, SPA_COMPRESSBITS, x)
#define	L2BLK_GET_PREFETCH(field)	BF64_GET((field), 39, 1)
#define	L2BLK_SET_PREFETCH(field, x)	BF64_SET((field), 39, 1, x)
#define	L2BLK_GET_CHECKSUM(field)	BF64_GET((field), 40, 8)
#define	L2BLK_SET_CHECKSUM(field, x)	BF64_SET((field), 40, 8, x)
#define	L2BLK_GET_TYPE(field)		BF64_GET((field), 48, 8)
#define	L2BLK_SET_TYPE(field, x)	BF64_SET((field), 48, 8, x)
#define	L2BLK_GET_PROTECTED(field)	BF64_GET((field), 56, 1)
#define	L2BLK_SET_PROTECTED(field, x)	BF64_SET((field), 56, 1, x)
#define	L2BLK_GET_STATE(field)		BF64_GET((field), 57, 4)
#define	L2BLK_SET_STATE(field, x)	BF64_SET((field), 57, 4, x)
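
/*
 * Illustrative sketch only: how the L2BLK_* accessors above pack a log
 * entry's le_prop word.  Sizes are passed in bytes and must be nonzero
 * multiples of SPA_MINBLOCKSIZE; the *_SB variants store them in
 * SPA_MINBLOCKSIZE units with a bias of 1.  This helper is an example
 * and is not used by the ARC code.
 */
static inline uint64_t
l2blk_prop_pack_example(uint64_t lsize, uint64_t psize,
    enum zio_compress compress, arc_buf_contents_t type, boolean_t prefetch)
{
	uint64_t prop = 0;

	L2BLK_SET_LSIZE(prop, lsize);
	L2BLK_SET_PSIZE(prop, psize);
	L2BLK_SET_COMPRESS(prop, compress);
	L2BLK_SET_TYPE(prop, type);
	L2BLK_SET_PREFETCH(prop, prefetch ? 1 : 0);

	return (prop);
}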

#define	PTR_SWAP(x, y)		\
	do {			\
		void *tmp = (x);\
		x = y;		\
		y = tmp;	\
		_NOTE(CONSTCOND)\
	} while (0)

#define	L2ARC_DEV_HDR_MAGIC	0x5a46534341434845LLU	/* ASCII: "ZFSCACHE" */
#define	L2ARC_LOG_BLK_MAGIC	0x4c4f47424c4b4844LLU	/* ASCII: "LOGBLKHD" */

/*
 * L2ARC Internals
 */
typedef struct l2arc_dev {
	vdev_t			*l2ad_vdev;	/* vdev */
	spa_t			*l2ad_spa;	/* spa */
	uint64_t		l2ad_hand;	/* next write location */
	uint64_t		l2ad_start;	/* first addr on device */
	uint64_t		l2ad_end;	/* last addr on device */
	boolean_t		l2ad_first;	/* first sweep through */
	boolean_t		l2ad_writing;	/* currently writing */
	kmutex_t		l2ad_mtx;	/* lock for buffer list */
	list_t			l2ad_buflist;	/* buffer list */
	list_node_t		l2ad_node;	/* device list node */
	zfs_refcount_t		l2ad_alloc;	/* allocated bytes */
	/*
	 * Persistence-related stuff
	 */
	l2arc_dev_hdr_phys_t	*l2ad_dev_hdr;	/* persistent device header */
	uint64_t		l2ad_dev_hdr_asize; /* aligned hdr size */
	l2arc_log_blk_phys_t	l2ad_log_blk;	/* currently open log block */
	int			l2ad_log_ent_idx; /* index into cur log blk */
	/* Number of bytes in current log block's payload */
	uint64_t		l2ad_log_blk_payload_asize;
	/*
	 * Offset (in bytes) of the first buffer in current log block's
	 * payload.
	 */
	uint64_t		l2ad_log_blk_payload_start;
	/* Flag indicating whether a rebuild is scheduled or is going on */
	boolean_t		l2ad_rebuild;
	boolean_t		l2ad_rebuild_cancel;
	boolean_t		l2ad_rebuild_began;
	uint64_t		l2ad_log_entries;   /* entries per log blk */
	uint64_t		l2ad_evict;	/* evicted offset in bytes */
	/* List of pointers to log blocks present in the L2ARC device */
	list_t			l2ad_lbptr_list;
	/*
	 * Aligned size of all log blocks as accounted by vdev_space_update().
	 */
	zfs_refcount_t		l2ad_lb_asize;
	/*
	 * Number of log blocks present on the device.
	 */
	zfs_refcount_t		l2ad_lb_count;
	boolean_t		l2ad_trim_all;	/* TRIM whole device */
} l2arc_dev_t;

/*
 * Encrypted blocks will need to be stored encrypted on the L2ARC
 * disk as they appear in the main pool.  In order for this to work we
 * need to pass around the encryption parameters so they can be used
 * to write data to the L2ARC.  This struct is only defined in the
 * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED
 * flag set.
 */
typedef struct arc_buf_hdr_crypt {
	abd_t			*b_rabd;	/* raw encrypted data */
	dmu_object_type_t	b_ot;		/* object type */
	uint32_t		b_ebufcnt;	/* count of encrypted buffers */

	/* dsobj for looking up encryption key for l2arc encryption */
	uint64_t		b_dsobj;

	/* encryption parameters */
	uint8_t			b_salt[ZIO_DATA_SALT_LEN];
	uint8_t			b_iv[ZIO_DATA_IV_LEN];

	/*
	 * Technically this could be removed since we will always be able to
	 * get the mac from the bp when we need it.  However, it is inconvenient
	 * for callers of arc code to have to pass a bp in all the time.  This
	 * also allows us to assert that L2ARC data is properly encrypted to
	 * match the data in the main storage pool.
	 */
	uint8_t			b_mac[ZIO_DATA_MAC_LEN];
} arc_buf_hdr_crypt_t;

typedef struct l2arc_buf_hdr {
	/* protected by arc_buf_hdr mutex */
	l2arc_dev_t		*b_dev;		/* L2ARC device */
	uint64_t		b_daddr;	/* disk address, offset byte */
	uint32_t		b_hits;
	arc_state_type_t	b_arcs_state;
	list_node_t		b_l2node;
} l2arc_buf_hdr_t;

typedef struct l2arc_write_callback {
	l2arc_dev_t	*l2wcb_dev;	/* device info */
	arc_buf_hdr_t	*l2wcb_head;	/* head of write buflist */
	/* in-flight list of log blocks */
	list_t		l2wcb_abd_list;
} l2arc_write_callback_t;

struct arc_buf_hdr {
	/* protected by hash lock */
	dva_t			b_dva;
	uint64_t		b_birth;

	arc_buf_contents_t	b_type;
	uint8_t			b_complevel;
	uint8_t			b_reserved1;	/* used for 4 byte alignment */
	uint16_t		b_reserved2;	/* used for 4 byte alignment */
	arc_buf_hdr_t		*b_hash_next;
	arc_flags_t		b_flags;

	/*
	 * This field stores the size of the data buffer after
	 * compression, and is set in the arc's zio completion handlers.
	 * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes); see
	 * the sketch following this struct.
	 *
	 * While the block pointers can store up to 32MB in their psize
	 * field, we can only store up to 32MB minus 512B.  This is due
	 * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
	 * a field of zeros represents 512B in the bp).  We can't use a
	 * bias of 1 since we need to reserve a psize of zero, here, to
	 * represent holes and embedded blocks.
	 *
	 * This isn't a problem in practice, since the maximum size of a
	 * buffer is limited to 16MB, so we never need to store 32MB in
	 * this field.  Even in the upstream illumos code base, the
	 * maximum size of a buffer is limited to 16MB.
	 */
	uint16_t		b_psize;

	/*
	 * This field stores the size of the data buffer before
	 * compression, and cannot change once set.  It is in units
	 * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes).
	 */
	uint16_t		b_lsize;	/* immutable */
	uint64_t		b_spa;		/* immutable */

	/* L2ARC fields. Undefined when not in L2ARC. */
	l2arc_buf_hdr_t		b_l2hdr;
	/* L1ARC fields. Undefined when in l2arc_only state */
	l1arc_buf_hdr_t		b_l1hdr;
	/*
	 * Encryption parameters.  Defined only when ARC_FLAG_ENCRYPTED
	 * is set and the L1 header exists.
	 */
	arc_buf_hdr_crypt_t	b_crypt_hdr;
};
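
/*
 * Illustrative sketch only: b_psize and b_lsize hold sizes in
 * SPA_MINBLOCKSIZE units with a bias of 0, so a stored value of 8 means
 * 8 << SPA_MINBLOCKSHIFT == 4096 bytes, and 0 is reserved for holes and
 * embedded blocks.  Whether the embedded b_l1hdr/b_l2hdr described in the
 * layout comment earlier are valid is tracked by flags in b_flags
 * (ARC_FLAG_HAS_L1HDR/ARC_FLAG_HAS_L2HDR from <sys/arc.h>).  These helpers
 * are examples only and are not used by the ARC code.
 */
static inline uint64_t
arc_hdr_psize_bytes_example(const arc_buf_hdr_t *hdr)
{
	/* decode the SPA_MINBLOCKSIZE units back into bytes */
	return ((uint64_t)hdr->b_psize << SPA_MINBLOCKSHIFT);
}

static inline boolean_t
arc_hdr_has_l1hdr_example(const arc_buf_hdr_t *hdr)
{
	return ((hdr->b_flags & ARC_FLAG_HAS_L1HDR) != 0);
}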

typedef struct arc_stats {
	kstat_named_t arcstat_hits;
	kstat_named_t arcstat_misses;
	kstat_named_t arcstat_demand_data_hits;
	kstat_named_t arcstat_demand_data_misses;
	kstat_named_t arcstat_demand_metadata_hits;
	kstat_named_t arcstat_demand_metadata_misses;
	kstat_named_t arcstat_prefetch_data_hits;
	kstat_named_t arcstat_prefetch_data_misses;
	kstat_named_t arcstat_prefetch_metadata_hits;
	kstat_named_t arcstat_prefetch_metadata_misses;
	kstat_named_t arcstat_mru_hits;
	kstat_named_t arcstat_mru_ghost_hits;
	kstat_named_t arcstat_mfu_hits;
	kstat_named_t arcstat_mfu_ghost_hits;
	kstat_named_t arcstat_deleted;
	/*
	 * Number of buffers that could not be evicted because the hash lock
	 * was held by another thread.  The lock may not necessarily be held
	 * by something using the same buffer, since hash locks are shared
	 * by multiple buffers.
	 */
	kstat_named_t arcstat_mutex_miss;
	/*
	 * Number of buffers skipped when updating the access state due to the
	 * header having already been released after acquiring the hash lock.
	 */
	kstat_named_t arcstat_access_skip;
	/*
	 * Number of buffers skipped because they have I/O in progress, are
	 * indirect prefetch buffers that have not lived long enough, or are
	 * not from the spa we're trying to evict from.
	 */
	kstat_named_t arcstat_evict_skip;
	/*
	 * Number of times arc_evict_state() was unable to evict enough
	 * buffers to reach its target amount.
	 */
	kstat_named_t arcstat_evict_not_enough;
	kstat_named_t arcstat_evict_l2_cached;
	kstat_named_t arcstat_evict_l2_eligible;
	kstat_named_t arcstat_evict_l2_eligible_mfu;
	kstat_named_t arcstat_evict_l2_eligible_mru;
	kstat_named_t arcstat_evict_l2_ineligible;
	kstat_named_t arcstat_evict_l2_skip;
	kstat_named_t arcstat_hash_elements;
	kstat_named_t arcstat_hash_elements_max;
	kstat_named_t arcstat_hash_collisions;
	kstat_named_t arcstat_hash_chains;
	kstat_named_t arcstat_hash_chain_max;
	kstat_named_t arcstat_p;
	kstat_named_t arcstat_c;
	kstat_named_t arcstat_c_min;
	kstat_named_t arcstat_c_max;
	/* Not updated directly; only synced in arc_kstat_update. */
	kstat_named_t arcstat_size;
	/*
	 * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
	 * Note that the compressed bytes may match the uncompressed bytes
	 * if the block is either not compressed or compressed arc is disabled.
	 */
	kstat_named_t arcstat_compressed_size;
	/*
	 * Uncompressed size of the data stored in b_pabd.  If compressed
	 * arc is disabled then this value will be identical to the stat
	 * above.
	 */
	kstat_named_t arcstat_uncompressed_size;
	/*
	 * Number of bytes stored in all the arc_buf_t's.  This is classified
	 * as "overhead" since this data is typically short-lived and will
	 * be evicted from the arc when it becomes unreferenced unless the
	 * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
	 * values have been set (see comment in dbuf.c for more information).
	 */
	kstat_named_t arcstat_overhead_size;
	/*
	 * Number of bytes consumed by internal ARC structures necessary
	 * for tracking purposes; these structures are not actually
	 * backed by ARC buffers.  This includes arc_buf_hdr_t structures
	 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
	 * caches), and arc_buf_t structures (allocated via arc_buf_t
	 * cache).
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_hdr_size;
	/*
	 * Number of bytes consumed by ARC buffers of type equal to
	 * ARC_BUFC_DATA.  This is generally consumed by buffers backing
	 * on disk user data (e.g. plain file contents).
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_data_size;
	/*
	 * Number of bytes consumed by ARC buffers of type equal to
	 * ARC_BUFC_METADATA.  This is generally consumed by buffers
	 * backing on disk data that is used for internal ZFS
	 * structures (e.g. ZAP, dnode, indirect blocks, etc).
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_metadata_size;
	/*
	 * Number of bytes consumed by dmu_buf_impl_t objects.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_dbuf_size;
	/*
	 * Number of bytes consumed by dnode_t objects.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_dnode_size;
	/*
	 * Number of bytes consumed by bonus buffers.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_bonus_size;
#if defined(COMPAT_FREEBSD11)
	/*
	 * Sum of the previous three counters, provided for compatibility.
	 */
	kstat_named_t arcstat_other_size;
#endif

	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_anon state.  This includes *all* buffers in the arc_anon
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_anon_size;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_DATA,
	 * residing in the arc_anon state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_anon_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
	 * residing in the arc_anon state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_anon_evictable_metadata;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_mru state.  This includes *all* buffers in the arc_mru
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mru_size;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_DATA,
	 * residing in the arc_mru state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mru_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
	 * residing in the arc_mru state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mru_evictable_metadata;
	/*
	 * Total number of bytes that *would have been* consumed by ARC
	 * buffers in the arc_mru_ghost state.  The key thing to note
	 * here is that this size doesn't actually indicate RAM
	 * consumption.  The ghost lists only consist of headers and
	 * don't actually have ARC buffers linked off of these headers.
	 * Thus, *if* the headers had associated ARC buffers, these
	 * buffers *would have* consumed this number of bytes.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mru_ghost_size;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mru_ghost_evictable_data;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mru_ghost_evictable_metadata;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_mfu state.  This includes *all* buffers in the arc_mfu
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mfu_size;
	/*
	 * Number of bytes consumed by ARC buffers that are eligible for
	 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
	 * state.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mfu_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that are eligible for
	 * eviction, of type ARC_BUFC_METADATA, and reside in the
	 * arc_mfu state.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mfu_evictable_metadata;
	/*
	 * Total number of bytes that *would have been* consumed by ARC
	 * buffers in the arc_mfu_ghost state.  See the comment above
	 * arcstat_mru_ghost_size for more details.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mfu_ghost_size;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mfu_ghost_evictable_data;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mfu_ghost_evictable_metadata;
	kstat_named_t arcstat_l2_hits;
	kstat_named_t arcstat_l2_misses;
	/*
	 * Allocated size (in bytes) of L2ARC cached buffers by ARC state.
	 */
	kstat_named_t arcstat_l2_prefetch_asize;
	kstat_named_t arcstat_l2_mru_asize;
	kstat_named_t arcstat_l2_mfu_asize;
	/*
	 * Allocated size (in bytes) of L2ARC cached buffers by buffer content
	 * type.
	 */
	kstat_named_t arcstat_l2_bufc_data_asize;
	kstat_named_t arcstat_l2_bufc_metadata_asize;
	kstat_named_t arcstat_l2_feeds;
	kstat_named_t arcstat_l2_rw_clash;
	kstat_named_t arcstat_l2_read_bytes;
	kstat_named_t arcstat_l2_write_bytes;
	kstat_named_t arcstat_l2_writes_sent;
	kstat_named_t arcstat_l2_writes_done;
	kstat_named_t arcstat_l2_writes_error;
	kstat_named_t arcstat_l2_writes_lock_retry;
	kstat_named_t arcstat_l2_evict_lock_retry;
	kstat_named_t arcstat_l2_evict_reading;
	kstat_named_t arcstat_l2_evict_l1cached;
	kstat_named_t arcstat_l2_free_on_write;
	kstat_named_t arcstat_l2_abort_lowmem;
	kstat_named_t arcstat_l2_cksum_bad;
	kstat_named_t arcstat_l2_io_error;
	kstat_named_t arcstat_l2_lsize;
	kstat_named_t arcstat_l2_psize;
	/* Not updated directly; only synced in arc_kstat_update. */
	kstat_named_t arcstat_l2_hdr_size;
	/*
	 * Number of L2ARC log blocks written.  These are used for restoring
	 * the L2ARC.  Updated during writing of L2ARC log blocks.
	 */
	kstat_named_t arcstat_l2_log_blk_writes;
	/*
	 * Moving average of the aligned size of the L2ARC log blocks, in
	 * bytes.  Updated during L2ARC rebuild and during writing of L2ARC
	 * log blocks.
	 */
	kstat_named_t arcstat_l2_log_blk_avg_asize;
	/* Aligned size of L2ARC log blocks on L2ARC devices. */
	kstat_named_t arcstat_l2_log_blk_asize;
	/* Number of L2ARC log blocks present on L2ARC devices. */
	kstat_named_t arcstat_l2_log_blk_count;
	/*
	 * Moving average of the ratio of the aligned size of L2ARC restored
	 * data (in bytes) to the aligned size of its metadata in L2ARC (in
	 * bytes).  Updated during L2ARC rebuild and during writing of L2ARC
	 * log blocks.
	 */
	kstat_named_t arcstat_l2_data_to_meta_ratio;
	/*
	 * Number of times the L2ARC rebuild was successful for an L2ARC
	 * device.
	 */
	kstat_named_t arcstat_l2_rebuild_success;
	/*
	 * Number of times the L2ARC rebuild failed because the device header
	 * was in an unsupported format or corrupted.
	 */
	kstat_named_t arcstat_l2_rebuild_abort_unsupported;
	/*
	 * Number of times the L2ARC rebuild failed because of IO errors
	 * while reading a log block.
	 */
	kstat_named_t arcstat_l2_rebuild_abort_io_errors;
	/*
	 * Number of times the L2ARC rebuild failed because of IO errors when
	 * reading the device header.
	 */
	kstat_named_t arcstat_l2_rebuild_abort_dh_errors;
	/*
	 * Number of L2ARC log blocks which failed to be restored due to
	 * checksum errors.
	 */
	kstat_named_t arcstat_l2_rebuild_abort_cksum_lb_errors;
	/*
	 * Number of times the L2ARC rebuild was aborted due to low system
	 * memory.
	 */
	kstat_named_t arcstat_l2_rebuild_abort_lowmem;
	/* Logical size of L2ARC restored data, in bytes. */
	kstat_named_t arcstat_l2_rebuild_size;
	/* Aligned size of L2ARC restored data, in bytes. */
	kstat_named_t arcstat_l2_rebuild_asize;
	/*
	 * Number of L2ARC log entries (buffers) that were successfully
	 * restored in ARC.
	 */
	kstat_named_t arcstat_l2_rebuild_bufs;
	/*
	 * Number of L2ARC log entries (buffers) already cached in ARC.  These
	 * were not restored again.
	 */
	kstat_named_t arcstat_l2_rebuild_bufs_precached;
	/*
	 * Number of L2ARC log blocks that were restored successfully.  Each
	 * log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers.
	 */
	kstat_named_t arcstat_l2_rebuild_log_blks;
	kstat_named_t arcstat_memory_throttle_count;
	kstat_named_t arcstat_memory_direct_count;
	kstat_named_t arcstat_memory_indirect_count;
	kstat_named_t arcstat_memory_all_bytes;
	kstat_named_t arcstat_memory_free_bytes;
	kstat_named_t arcstat_memory_available_bytes;
	kstat_named_t arcstat_no_grow;
	kstat_named_t arcstat_tempreserve;
	kstat_named_t arcstat_loaned_bytes;
	kstat_named_t arcstat_prune;
	/* Not updated directly; only synced in arc_kstat_update. */
	kstat_named_t arcstat_meta_used;
	kstat_named_t arcstat_meta_limit;
	kstat_named_t arcstat_dnode_limit;
	kstat_named_t arcstat_meta_max;
	kstat_named_t arcstat_meta_min;
	kstat_named_t arcstat_async_upgrade_sync;
	kstat_named_t arcstat_demand_hit_predictive_prefetch;
	kstat_named_t arcstat_demand_hit_prescient_prefetch;
	kstat_named_t arcstat_need_free;
	kstat_named_t arcstat_sys_free;
	kstat_named_t arcstat_raw_size;
	kstat_named_t arcstat_cached_only_in_progress;
	kstat_named_t arcstat_abd_chunk_waste_size;
} arc_stats_t;

typedef struct arc_evict_waiter {
	list_node_t aew_node;
	kcondvar_t aew_cv;
	uint64_t aew_count;
} arc_evict_waiter_t;

#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)

#define	ARCSTAT_INCR(stat, val) \
	atomic_add_64(&arc_stats.stat.value.ui64, (val))

#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)

#define	arc_no_grow	ARCSTAT(arcstat_no_grow) /* do not grow cache size */
#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
#define	arc_sys_free	ARCSTAT(arcstat_sys_free) /* target system free bytes */

extern taskq_t *arc_prune_taskq;
extern arc_stats_t arc_stats;
extern hrtime_t arc_growtime;
extern boolean_t arc_warm;
extern int arc_grow_retry;
extern int arc_no_grow_shift;
extern int arc_shrink_shift;
extern kmutex_t arc_prune_mtx;
extern list_t arc_prune_list;
extern aggsum_t arc_size;
extern arc_state_t *arc_mfu;
extern arc_state_t *arc_mru;
extern uint_t zfs_arc_pc_percent;
extern int arc_lotsfree_percent;
extern unsigned long zfs_arc_min;
extern unsigned long zfs_arc_max;
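
/*
 * Illustrative sketch only: the ARCSTAT_* macros above operate on the
 * global arc_stats kstat structure, e.g. to account a demand-data hit or
 * miss.  This helper is an example and is not used by the ARC code.
 */
static inline void
arc_account_demand_data_example(boolean_t hit)
{
	if (hit) {
		ARCSTAT_BUMP(arcstat_hits);
		ARCSTAT_BUMP(arcstat_demand_data_hits);
	} else {
		ARCSTAT_BUMP(arcstat_misses);
		ARCSTAT_BUMP(arcstat_demand_data_misses);
	}
}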

extern void arc_reduce_target_size(int64_t to_free);
extern boolean_t arc_reclaim_needed(void);
extern void arc_kmem_reap_soon(void);
extern boolean_t arc_is_overflowing(void);
extern void arc_wait_for_eviction(uint64_t);

extern void arc_lowmem_init(void);
extern void arc_lowmem_fini(void);
extern void arc_prune_async(int64_t);
extern int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg);
extern uint64_t arc_free_memory(void);
extern int64_t arc_available_memory(void);
extern void arc_tuning_update(boolean_t);
extern void arc_register_hotplug(void);
extern void arc_unregister_hotplug(void);

extern int param_set_arc_long(ZFS_MODULE_PARAM_ARGS);
extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS);

/* used in zdb.c */
boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
    const l2arc_log_blkptr_t *lbp);

/* used in vdev_trim.c */
void l2arc_dev_hdr_update(l2arc_dev_t *dev);
l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);

#ifdef __cplusplus
}
#endif

#endif /* _SYS_ARC_IMPL_H */