1 /*
2 * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * 1. Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 *
10 * 2. Redistributions in binary form must reproduce the above copyright notice,
11 * this list of conditions and the following disclaimer in the documentation
12 * and/or other materials provided with the distribution.
13 *
14 * 3. Neither the name of the copyright holder nor the names of its
15 * contributors may be used to endorse or promote products derived from this
16 * software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 /*
32 * Copyright (c) 2016-2018, Klara Inc.
33 * Copyright (c) 2016-2018, Allan Jude
34 * Copyright (c) 2018-2020, Sebastian Gottschall
35 * Copyright (c) 2019-2020, Michael Niewöhner
36 * Copyright (c) 2020, The FreeBSD Foundation [1]
37 *
38 * [1] Portions of this software were developed by Allan Jude
39 * under sponsorship from the FreeBSD Foundation.
40 */
41
42 #include <sys/param.h>
43 #include <sys/sysmacros.h>
44 #include <sys/zfs_context.h>
45 #include <sys/zio_compress.h>
46 #include <sys/spa.h>
47 #include <sys/zstd/zstd.h>
48
49 #define ZSTD_STATIC_LINKING_ONLY
50 #include "lib/zstd.h"
51 #include "lib/zstd_errors.h"
52
/* Handle of the "zstd" kstat: created by zstd_init, deleted by zstd_fini */
kstat_t *zstd_ksp = NULL;
54
/*
 * Named kstat counters maintained by this file. The member order must match
 * the initializer of zstd_stats.
 */
typedef struct zstd_stats {
	/* Pool allocator returned no memory (zstd_alloc, zstd_dctx_alloc) */
	kstat_named_t	zstd_stat_alloc_fail;
	/* Decompression had to use the reserved fallback memory */
	kstat_named_t	zstd_stat_alloc_fallback;
	/* Creating the compression context failed */
	kstat_named_t	zstd_stat_com_alloc_fail;
	/* Creating the decompression context failed */
	kstat_named_t	zstd_stat_dec_alloc_fail;
	/* Compression was requested with an unknown level */
	kstat_named_t	zstd_stat_com_inval;
	/* Block header carried an unknown level on decompression */
	kstat_named_t	zstd_stat_dec_inval;
	/* Compressed length stored in the block header exceeds the input */
	kstat_named_t	zstd_stat_dec_header_inval;
	/* Compression failed for a reason other than "destination too small" */
	kstat_named_t	zstd_stat_com_fail;
	/* Decompression routine reported an error */
	kstat_named_t	zstd_stat_dec_fail;
	/* Number of buffers currently cached in the pools */
	kstat_named_t	zstd_stat_buffers;
	/* Total size in bytes of the buffers currently cached in the pools */
	kstat_named_t	zstd_stat_size;
} zstd_stats_t;
68
69 static zstd_stats_t zstd_stats = {
70 { "alloc_fail", KSTAT_DATA_UINT64 },
71 { "alloc_fallback", KSTAT_DATA_UINT64 },
72 { "compress_alloc_fail", KSTAT_DATA_UINT64 },
73 { "decompress_alloc_fail", KSTAT_DATA_UINT64 },
74 { "compress_level_invalid", KSTAT_DATA_UINT64 },
75 { "decompress_level_invalid", KSTAT_DATA_UINT64 },
76 { "decompress_header_invalid", KSTAT_DATA_UINT64 },
77 { "compress_failed", KSTAT_DATA_UINT64 },
78 { "decompress_failed", KSTAT_DATA_UINT64 },
79 { "buffers", KSTAT_DATA_UINT64 },
80 { "size", KSTAT_DATA_UINT64 },
81 };
82
/*
 * Allocation type recorded in the kmem_type field of struct zstd_kmem.
 * zstd_free dispatches on it to release the memory the right way.
 */
enum zstd_kmem_type {
	/* Not a valid allocation type; zstd_free asserts against it */
	ZSTD_KMEM_UNKNOWN = 0,
	/* Standalone buffer from vmem_alloc, released with vmem_free */
	ZSTD_KMEM_DEFAULT,
	/* Buffer cached in a pool slot, released by dropping the slot barrier */
	ZSTD_KMEM_POOL,
	/* Reserved fallback memory for decompression only */
	ZSTD_KMEM_DCTX,
	/* Number of enumerators, used as upper bound in sanity checks */
	ZSTD_KMEM_COUNT,
};
94
/* One slot of a memory pool, caching at most one buffer */
struct zstd_pool {
	/* Cached buffer, or NULL while the slot is empty */
	void *mem;
	/* Size of the cached buffer in bytes (0 while the slot is empty) */
	size_t size;
	/* Held for as long as the cached buffer is handed out to a user */
	kmutex_t barrier;
	/* Time in seconds after which an unused buffer may be reaped */
	hrtime_t timeout;
};
102
/*
 * Header placed at the start of every buffer handed to zstd. The allocators
 * return the address right behind it and zstd_free steps back to it.
 */
struct zstd_kmem {
	/* How the buffer was obtained and therefore how it must be released */
	enum zstd_kmem_type kmem_type;
	/* Size of the whole allocation in bytes, this header included */
	size_t kmem_size;
	/* Owning pool slot, NULL for buffers allocated outside the pool */
	struct zstd_pool *pool;
};
109
/* Fallback memory structure used for decompression only if memory runs out */
struct zstd_fallback_mem {
	/* Size of the reserved buffer in bytes */
	size_t mem_size;
	/* The reserved buffer itself */
	void *mem;
	/* Serializes use of the buffer: taken on hand-out, dropped on free */
	kmutex_t barrier;
};
116
/* One entry of the table translating ZFS level enums to zstd levels */
struct zstd_levelmap {
	/* Level value handed to the zstd library (negative for fast levels) */
	int16_t zstd_level;
	/* Corresponding ZFS internal level enum */
	enum zio_zstd_levels level;
};
121
/*
 * ZSTD memory handlers
 *
 * For decompression we use a different handler which also provides fallback
 * memory allocation in case memory runs out.
 *
 * The ZSTD handlers were split up for the most simplified implementation.
 *
 * They are declared here because the ZSTD_customMem tables below refer to
 * them before their definitions.
 */
/* Allocation handler used for compression contexts */
static void *zstd_alloc(void *opaque, size_t size);
/* Allocation handler used for decompression contexts */
static void *zstd_dctx_alloc(void *opaque, size_t size);
/* Free handler shared by both */
static void zstd_free(void *opaque, void *ptr);
133
134 /* Compression memory handler */
135 static const ZSTD_customMem zstd_malloc = {
136 zstd_alloc,
137 zstd_free,
138 NULL,
139 };
140
141 /* Decompression memory handler */
142 static const ZSTD_customMem zstd_dctx_malloc = {
143 zstd_dctx_alloc,
144 zstd_free,
145 NULL,
146 };
147
148 /* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
149 static struct zstd_levelmap zstd_levels[] = {
150 {ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
151 {ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
152 {ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
153 {ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
154 {ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
155 {ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
156 {ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
157 {ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
158 {ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
159 {ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
160 {ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
161 {ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
162 {ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
163 {ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
164 {ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
165 {ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
166 {ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
167 {ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
168 {ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
169 {-1, ZIO_ZSTD_LEVEL_FAST_1},
170 {-2, ZIO_ZSTD_LEVEL_FAST_2},
171 {-3, ZIO_ZSTD_LEVEL_FAST_3},
172 {-4, ZIO_ZSTD_LEVEL_FAST_4},
173 {-5, ZIO_ZSTD_LEVEL_FAST_5},
174 {-6, ZIO_ZSTD_LEVEL_FAST_6},
175 {-7, ZIO_ZSTD_LEVEL_FAST_7},
176 {-8, ZIO_ZSTD_LEVEL_FAST_8},
177 {-9, ZIO_ZSTD_LEVEL_FAST_9},
178 {-10, ZIO_ZSTD_LEVEL_FAST_10},
179 {-20, ZIO_ZSTD_LEVEL_FAST_20},
180 {-30, ZIO_ZSTD_LEVEL_FAST_30},
181 {-40, ZIO_ZSTD_LEVEL_FAST_40},
182 {-50, ZIO_ZSTD_LEVEL_FAST_50},
183 {-60, ZIO_ZSTD_LEVEL_FAST_60},
184 {-70, ZIO_ZSTD_LEVEL_FAST_70},
185 {-80, ZIO_ZSTD_LEVEL_FAST_80},
186 {-90, ZIO_ZSTD_LEVEL_FAST_90},
187 {-100, ZIO_ZSTD_LEVEL_FAST_100},
188 {-500, ZIO_ZSTD_LEVEL_FAST_500},
189 {-1000, ZIO_ZSTD_LEVEL_FAST_1000},
190 };
191
/*
 * Number of slots in each memory pool. The value below is only the initial
 * one: zstd_init sets it to the CPU count * 4 before the pools are created.
 */
static int pool_count = 16;

/* Number of slots per pool */
#define	ZSTD_POOL_MAX		pool_count
/*
 * Seconds an unused pooled buffer is kept before it may be reaped (2 minutes).
 * Parenthesized so that the macro expands safely inside any expression.
 */
#define	ZSTD_POOL_TIMEOUT	(60 * 2)
200
/* Reserved decompression memory, set up by zstd_meminit */
static struct zstd_fallback_mem zstd_dctx_fallback;
/* Slot array (ZSTD_POOL_MAX entries) serving compression contexts */
static struct zstd_pool *zstd_mempool_cctx;
/* Slot array (ZSTD_POOL_MAX entries) serving decompression contexts */
static struct zstd_pool *zstd_mempool_dctx;
204
/*
 * The library zstd code expects the two functions below to exist whenever
 * ADDRESS_SANITIZER is defined. ASAN provides them; KASAN causes
 * ADDRESS_SANITIZER to be defined but does not provide them. So to avoid
 * changing the external code, we supply empty versions for kernel builds.
 */
#if defined(__has_feature)
#if __has_feature(address_sanitizer)
#define	ADDRESS_SANITIZER 1
#endif
#elif defined(__SANITIZE_ADDRESS__)
#define	ADDRESS_SANITIZER 1
#endif
#if defined(_KERNEL) && defined(ADDRESS_SANITIZER)
void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
void __asan_poison_memory_region(void const volatile *addr, size_t size);
/*
 * Intentionally empty. No ';' after the bodies: an extra semicolon at file
 * scope is an empty declaration, which ISO C does not allow.
 */
void __asan_unpoison_memory_region(void const volatile *addr, size_t size) {}
void __asan_poison_memory_region(void const volatile *addr, size_t size) {}
#endif
223
224
225 static void
zstd_mempool_reap(struct zstd_pool * zstd_mempool)226 zstd_mempool_reap(struct zstd_pool *zstd_mempool)
227 {
228 struct zstd_pool *pool;
229
230 if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) {
231 return;
232 }
233
234 /* free obsolete slots */
235 for (int i = 0; i < ZSTD_POOL_MAX; i++) {
236 pool = &zstd_mempool[i];
237 if (pool->mem && mutex_tryenter(&pool->barrier)) {
238 /* Free memory if unused object older than 2 minutes */
239 if (pool->mem && gethrestime_sec() > pool->timeout) {
240 vmem_free(pool->mem, pool->size);
241 ZSTDSTAT_SUB(zstd_stat_buffers, 1);
242 ZSTDSTAT_SUB(zstd_stat_size, pool->size);
243 pool->mem = NULL;
244 pool->size = 0;
245 pool->timeout = 0;
246 }
247 mutex_exit(&pool->barrier);
248 }
249 }
250 }
251
/*
 * Hand out a buffer of at least `size` bytes, preferring the cached buffers
 * of the given pool.
 *
 * The concept is that high frequency memory allocations of bigger objects are
 * expensive. So if a lot of work is going on, allocations will be kept for a
 * while and can be reused in that time frame.
 *
 * Three attempts are made, in this order:
 *  1. reuse the cached buffer of an idle slot if it is big enough,
 *  2. allocate a new buffer into an idle slot that is empty,
 *  3. allocate a standalone, non-pooled buffer without sleeping.
 *
 * A pooled buffer is returned with the barrier of its slot still held; the
 * barrier marks the slot as in use and is dropped by zstd_mempool_free.
 * Every time a slot is handed out, its release time is moved
 * ZSTD_POOL_TIMEOUT seconds into the future. Expired buffers are not freed
 * here; that is the job of zstd_mempool_reap.
 *
 * The returned pointer addresses the struct zstd_kmem header at the start of
 * the buffer. NULL is returned if zstd_mempool is NULL or no memory could be
 * obtained.
 */

static void *
zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
{
	struct zstd_pool *pool;
	struct zstd_kmem *mem = NULL;

	if (!zstd_mempool) {
		return (NULL);
	}

	/* First pass: look for an idle slot whose buffer is big enough */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		/*
		 * This lock is simply a marker for a pool object being in use.
		 * If it is already held, the slot is skipped.
		 *
		 * We need to take it before looking at the slot to avoid race
		 * conditions caused by running in a threaded context.
		 *
		 * The lock is later released by zstd_mempool_free.
		 */
		if (mutex_tryenter(&pool->barrier)) {
			/*
			 * Check if the object fits the size, if so we take it
			 * and update the timestamp. The barrier stays held.
			 */
			if (pool->mem && size <= pool->size) {
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;
				mem = pool->mem;
				return (mem);
			}
			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If no preallocated slot was found, try to fill in a new one.
	 *
	 * We run a similar algorithm twice here to avoid pool fragmentation.
	 * The first one may generate holes in the list if objects get released.
	 * We always make sure that these holes get filled instead of adding new
	 * allocations constantly at the end.
	 */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (mutex_tryenter(&pool->barrier)) {
			/* Object is free, try to allocate new one */
			if (!pool->mem) {
				mem = vmem_alloc(size, KM_SLEEP);
				if (mem) {
					ZSTDSTAT_ADD(zstd_stat_buffers, 1);
					ZSTDSTAT_ADD(zstd_stat_size, size);
					pool->mem = mem;
					pool->size = size;
					/* Keep track for later release */
					mem->pool = pool;
					mem->kmem_type = ZSTD_KMEM_POOL;
					mem->kmem_size = size;
				}
			}

			/*
			 * Take the slot if it now holds a buffer that is big
			 * enough; the barrier stays held in that case.
			 */
			if (size <= pool->size) {
				/* Update timestamp */
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;

				return (pool->mem);
			}

			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If the pool is full or the allocation failed, try lazy allocation
	 * instead. This buffer is not tracked by any slot (pool == NULL) and
	 * is marked ZSTD_KMEM_DEFAULT so that zstd_free releases it with
	 * vmem_free.
	 */
	if (!mem) {
		mem = vmem_alloc(size, KM_NOSLEEP);
		if (mem) {
			mem->pool = NULL;
			mem->kmem_type = ZSTD_KMEM_DEFAULT;
			mem->kmem_size = size;
		}
	}

	return (mem);
}
356
357 /* Mark object as released by releasing the barrier mutex */
358 static void
zstd_mempool_free(struct zstd_kmem * z)359 zstd_mempool_free(struct zstd_kmem *z)
360 {
361 mutex_exit(&z->pool->barrier);
362 }
363
364 /* Convert ZFS internal enum to ZSTD level */
365 static int
zstd_enum_to_level(enum zio_zstd_levels level,int16_t * zstd_level)366 zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
367 {
368 if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
369 *zstd_level = zstd_levels[level - 1].zstd_level;
370 return (0);
371 }
372 if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
373 level <= ZIO_ZSTD_LEVEL_FAST_1000) {
374 *zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
375 + ZIO_ZSTD_LEVEL_19].zstd_level;
376 return (0);
377 }
378
379 /* Invalid/unknown zfs compression enum - this should never happen. */
380 return (1);
381 }
382
383
384 /* Compress block using zstd */
385 size_t
zfs_zstd_compress(void * s_start,void * d_start,size_t s_len,size_t d_len,int level)386 zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
387 int level)
388 {
389 size_t c_len;
390 int16_t zstd_level;
391 zfs_zstdhdr_t *hdr;
392 ZSTD_CCtx *cctx;
393
394 hdr = (zfs_zstdhdr_t *)d_start;
395
396 /* Skip compression if the specified level is invalid */
397 if (zstd_enum_to_level(level, &zstd_level)) {
398 ZSTDSTAT_BUMP(zstd_stat_com_inval);
399 return (s_len);
400 }
401
402 ASSERT3U(d_len, >=, sizeof (*hdr));
403 ASSERT3U(d_len, <=, s_len);
404 ASSERT3U(zstd_level, !=, 0);
405
406 cctx = ZSTD_createCCtx_advanced(zstd_malloc);
407
408 /*
409 * Out of kernel memory, gently fall through - this will disable
410 * compression in zio_compress_data
411 */
412 if (!cctx) {
413 ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
414 return (s_len);
415 }
416
417 /* Set the compression level */
418 ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);
419
420 /* Use the "magicless" zstd header which saves us 4 header bytes */
421 ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);
422
423 /*
424 * Disable redundant checksum calculation and content size storage since
425 * this is already done by ZFS itself.
426 */
427 ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
428 ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);
429
430 c_len = ZSTD_compress2(cctx,
431 hdr->data,
432 d_len - sizeof (*hdr),
433 s_start, s_len);
434
435 ZSTD_freeCCtx(cctx);
436
437 /* Error in the compression routine, disable compression. */
438 if (ZSTD_isError(c_len)) {
439 /*
440 * If we are aborting the compression because the saves are
441 * too small, that is not a failure. Everything else is a
442 * failure, so increment the compression failure counter.
443 */
444 if (ZSTD_getErrorCode(c_len) != ZSTD_error_dstSize_tooSmall) {
445 ZSTDSTAT_BUMP(zstd_stat_com_fail);
446 }
447 return (s_len);
448 }
449
450 /*
451 * Encode the compressed buffer size at the start. We'll need this in
452 * decompression to counter the effects of padding which might be added
453 * to the compressed buffer and which, if unhandled, would confuse the
454 * hell out of our decompression function.
455 */
456 hdr->c_len = BE_32(c_len);
457
458 /*
459 * Check version for overflow.
460 * The limit of 24 bits must not be exceeded. This allows a maximum
461 * version 1677.72.15 which we don't expect to be ever reached.
462 */
463 ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);
464
465 /*
466 * Encode the compression level as well. We may need to know the
467 * original compression level if compressed_arc is disabled, to match
468 * the compression settings to write this block to the L2ARC.
469 *
470 * Encode the actual level, so if the enum changes in the future, we
471 * will be compatible.
472 *
473 * The upper 24 bits store the ZSTD version to be able to provide
474 * future compatibility, since new versions might enhance the
475 * compression algorithm in a way, where the compressed data will
476 * change.
477 *
478 * As soon as such incompatibility occurs, handling code needs to be
479 * added, differentiating between the versions.
480 */
481 zfs_set_hdrversion(hdr, ZSTD_VERSION_NUMBER);
482 zfs_set_hdrlevel(hdr, level);
483 hdr->raw_version_level = BE_32(hdr->raw_version_level);
484
485 return (c_len + sizeof (*hdr));
486 }
487
488 /* Decompress block using zstd and return its stored level */
489 int
zfs_zstd_decompress_level(void * s_start,void * d_start,size_t s_len,size_t d_len,uint8_t * level)490 zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
491 size_t d_len, uint8_t *level)
492 {
493 ZSTD_DCtx *dctx;
494 size_t result;
495 int16_t zstd_level;
496 uint32_t c_len;
497 const zfs_zstdhdr_t *hdr;
498 zfs_zstdhdr_t hdr_copy;
499
500 hdr = (const zfs_zstdhdr_t *)s_start;
501 c_len = BE_32(hdr->c_len);
502
503 /*
504 * Make a copy instead of directly converting the header, since we must
505 * not modify the original data that may be used again later.
506 */
507 hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
508 uint8_t curlevel = zfs_get_hdrlevel(&hdr_copy);
509
510 /*
511 * NOTE: We ignore the ZSTD version for now. As soon as any
512 * incompatibility occurs, it has to be handled accordingly.
513 * The version can be accessed via `hdr_copy.version`.
514 */
515
516 /*
517 * Convert and check the level
518 * An invalid level is a strong indicator for data corruption! In such
519 * case return an error so the upper layers can try to fix it.
520 */
521 if (zstd_enum_to_level(curlevel, &zstd_level)) {
522 ZSTDSTAT_BUMP(zstd_stat_dec_inval);
523 return (1);
524 }
525
526 ASSERT3U(d_len, >=, s_len);
527 ASSERT3U(curlevel, !=, ZIO_COMPLEVEL_INHERIT);
528
529 /* Invalid compressed buffer size encoded at start */
530 if (c_len + sizeof (*hdr) > s_len) {
531 ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
532 return (1);
533 }
534
535 dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
536 if (!dctx) {
537 ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
538 return (1);
539 }
540
541 /* Set header type to "magicless" */
542 ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);
543
544 /* Decompress the data and release the context */
545 result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
546 ZSTD_freeDCtx(dctx);
547
548 /*
549 * Returns 0 on success (decompression function returned non-negative)
550 * and non-zero on failure (decompression function returned negative.
551 */
552 if (ZSTD_isError(result)) {
553 ZSTDSTAT_BUMP(zstd_stat_dec_fail);
554 return (1);
555 }
556
557 if (level) {
558 *level = curlevel;
559 }
560
561 return (0);
562 }
563
564 /* Decompress datablock using zstd */
565 int
zfs_zstd_decompress(void * s_start,void * d_start,size_t s_len,size_t d_len,int level __maybe_unused)566 zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len,
567 int level __maybe_unused)
568 {
569
570 return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len,
571 NULL));
572 }
573
574 /* Allocator for zstd compression context using mempool_allocator */
575 static void *
zstd_alloc(void * opaque __maybe_unused,size_t size)576 zstd_alloc(void *opaque __maybe_unused, size_t size)
577 {
578 size_t nbytes = sizeof (struct zstd_kmem) + size;
579 struct zstd_kmem *z = NULL;
580
581 z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);
582
583 if (!z) {
584 ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
585 return (NULL);
586 }
587
588 return ((void*)z + (sizeof (struct zstd_kmem)));
589 }
590
591 /*
592 * Allocator for zstd decompression context using mempool_allocator with
593 * fallback to reserved memory if allocation fails
594 */
595 static void *
zstd_dctx_alloc(void * opaque __maybe_unused,size_t size)596 zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
597 {
598 size_t nbytes = sizeof (struct zstd_kmem) + size;
599 struct zstd_kmem *z = NULL;
600 enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;
601
602 z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
603 if (!z) {
604 /* Try harder, decompression shall not fail */
605 z = vmem_alloc(nbytes, KM_SLEEP);
606 if (z) {
607 z->pool = NULL;
608 }
609 ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
610 } else {
611 return ((void*)z + (sizeof (struct zstd_kmem)));
612 }
613
614 /* Fallback if everything fails */
615 if (!z) {
616 /*
617 * Barrier since we only can handle it in a single thread. All
618 * other following threads need to wait here until decompression
619 * is completed. zstd_free will release this barrier later.
620 */
621 mutex_enter(&zstd_dctx_fallback.barrier);
622
623 z = zstd_dctx_fallback.mem;
624 type = ZSTD_KMEM_DCTX;
625 ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
626 }
627
628 /* Allocation should always be successful */
629 if (!z) {
630 return (NULL);
631 }
632
633 z->kmem_type = type;
634 z->kmem_size = nbytes;
635
636 return ((void*)z + (sizeof (struct zstd_kmem)));
637 }
638
639 /* Free allocated memory by its specific type */
640 static void
zstd_free(void * opaque __maybe_unused,void * ptr)641 zstd_free(void *opaque __maybe_unused, void *ptr)
642 {
643 struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem));
644 enum zstd_kmem_type type;
645
646 ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
647 ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);
648
649 type = z->kmem_type;
650 switch (type) {
651 case ZSTD_KMEM_DEFAULT:
652 vmem_free(z, z->kmem_size);
653 break;
654 case ZSTD_KMEM_POOL:
655 zstd_mempool_free(z);
656 break;
657 case ZSTD_KMEM_DCTX:
658 mutex_exit(&zstd_dctx_fallback.barrier);
659 break;
660 default:
661 break;
662 }
663 }
664
665 /* Allocate fallback memory to ensure safe decompression */
666 static void __init
create_fallback_mem(struct zstd_fallback_mem * mem,size_t size)667 create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
668 {
669 mem->mem_size = size;
670 mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
671 mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
672 }
673
674 /* Initialize memory pool barrier mutexes */
675 static void __init
zstd_mempool_init(void)676 zstd_mempool_init(void)
677 {
678 zstd_mempool_cctx = (struct zstd_pool *)
679 kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
680 zstd_mempool_dctx = (struct zstd_pool *)
681 kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
682
683 for (int i = 0; i < ZSTD_POOL_MAX; i++) {
684 mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
685 MUTEX_DEFAULT, NULL);
686 mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
687 MUTEX_DEFAULT, NULL);
688 }
689 }
690
691 /* Initialize zstd-related memory handling */
692 static int __init
zstd_meminit(void)693 zstd_meminit(void)
694 {
695 zstd_mempool_init();
696
697 /*
698 * Estimate the size of the fallback decompression context.
699 * The expected size on x64 with current ZSTD should be about 160 KB.
700 */
701 create_fallback_mem(&zstd_dctx_fallback,
702 P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
703 PAGESIZE));
704
705 return (0);
706 }
707
708 /* Release object from pool and free memory */
709 static void __exit
release_pool(struct zstd_pool * pool)710 release_pool(struct zstd_pool *pool)
711 {
712 mutex_destroy(&pool->barrier);
713 vmem_free(pool->mem, pool->size);
714 pool->mem = NULL;
715 pool->size = 0;
716 }
717
718 /* Release memory pool objects */
719 static void __exit
zstd_mempool_deinit(void)720 zstd_mempool_deinit(void)
721 {
722 for (int i = 0; i < ZSTD_POOL_MAX; i++) {
723 release_pool(&zstd_mempool_cctx[i]);
724 release_pool(&zstd_mempool_dctx[i]);
725 }
726
727 kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
728 kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
729 zstd_mempool_dctx = NULL;
730 zstd_mempool_cctx = NULL;
731 }
732
733 /* release unused memory from pool */
734
735 void
zfs_zstd_cache_reap_now(void)736 zfs_zstd_cache_reap_now(void)
737 {
738
739 /*
740 * Short-circuit if there are no buffers to begin with.
741 */
742 if (ZSTDSTAT(zstd_stat_buffers) == 0)
743 return;
744
745 /*
746 * calling alloc with zero size seeks
747 * and releases old unused objects
748 */
749 zstd_mempool_reap(zstd_mempool_cctx);
750 zstd_mempool_reap(zstd_mempool_dctx);
751 }
752
753 extern int __init
zstd_init(void)754 zstd_init(void)
755 {
756 /* Set pool size by using maximum sane thread count * 4 */
757 pool_count = (boot_ncpus * 4);
758 zstd_meminit();
759
760 /* Initialize kstat */
761 zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
762 KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
763 KSTAT_FLAG_VIRTUAL);
764 if (zstd_ksp != NULL) {
765 zstd_ksp->ks_data = &zstd_stats;
766 kstat_install(zstd_ksp);
767 }
768
769 return (0);
770 }
771
772 extern void __exit
zstd_fini(void)773 zstd_fini(void)
774 {
775 /* Deinitialize kstat */
776 if (zstd_ksp != NULL) {
777 kstat_delete(zstd_ksp);
778 zstd_ksp = NULL;
779 }
780
781 /* Release fallback memory */
782 vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
783 mutex_destroy(&zstd_dctx_fallback.barrier);
784
785 /* Deinit memory pool */
786 zstd_mempool_deinit();
787 }
788
/* Kernel module glue, only compiled when building for the kernel */
#if defined(_KERNEL)
/* Register zstd_init and zstd_fini as the module entry and exit points */
module_init(zstd_init);
module_exit(zstd_fini);

/* Module metadata; the version is the zstd version string followed by "a" */
ZFS_MODULE_DESCRIPTION("ZSTD Compression for ZFS");
ZFS_MODULE_LICENSE("Dual BSD/GPL");
ZFS_MODULE_VERSION(ZSTD_VERSION_STRING "a");

/* Functions made available to other kernel modules */
EXPORT_SYMBOL(zfs_zstd_compress);
EXPORT_SYMBOL(zfs_zstd_decompress_level);
EXPORT_SYMBOL(zfs_zstd_decompress);
EXPORT_SYMBOL(zfs_zstd_cache_reap_now);
#endif
802