@@ -, +, @@ (kgdb) where #0 sched_switch (td=0xfffff800055bf9a0, newtd=0xfffff80002341000, flags=) at /usr/src/sys/kern/sched_ule.c:1969 #1 0xffffffff80962635 in mi_switch (flags=, newtd=0x0) at /usr/src/sys/kern/kern_synch.c:455 #2 0xffffffff809aaa3a in sleepq_wait (wchan=0x0, pri=0) at /usr/src/sys/kern/subr_sleepqueue.c:637 #3 0xffffffff80962038 in _sleep (ident=, lock=, priority=, wmesg=0xffffffff80e826ee "vmwait", sbt=0, pr=, flags=) at /usr/src/sys/kern/kern_synch.c:229 #4 0xffffffff80c1ac6b in vm_wait () at /usr/src/sys/vm/vm_page.c:2705 #5 0xffffffff80c06a9f in kmem_back (object=0xffffffff8144d6f0, addr=18446741874805047296, size=69632, flags=) at /usr/src/sys/vm/vm_kern.c:356 #6 0xffffffff80c068d2 in kmem_malloc (vmem=0xffffffff813aa500, size=69632, flags=2) at /usr/src/sys/vm/vm_kern.c:316 #7 0xffffffff80bfd7d6 in uma_large_malloc (size=69632, wait=2) at /usr/src/sys/vm/uma_core.c:1106 #8 0xffffffff8092f614 in malloc (size=, mtp=0xffffffff81b4d520, flags=0) at /usr/src/sys/kern/kern_malloc.c:513 #9 0xffffffff81b4ab99 in g_eli_crypto_run (wr=0xfffff80002560040, bp=0xfffff80008a86d90) at /usr/src/sys/modules/geom/geom_eli/../../../geom/eli/g_eli_privacy.c:262 #10 0xffffffff81b3e860 in g_eli_worker (arg=0xfffff80002560040) at /usr/src/sys/modules/geom/geom_eli/../../../geom/eli/g_eli.c:565 #11 0xffffffff80910f5c in fork_exit (callout=0xffffffff81b3e0b0 , arg=0xfffff80002560040, frame=0xfffffe005005ec00) at /usr/src/sys/kern/kern_fork.c:1034 #12 0xffffffff80c33f0e in fork_trampoline () at /usr/src/sys/amd64/amd64/exception.S:611 #13 0x0000000000000000 in ?? 
() (kgdb) p vm_cnt $16 = {v_swtch = 0, v_trap = 0, v_syscall = 0, v_intr = 0, v_soft = 0, v_vm_faults = 0, v_io_faults = 0, v_cow_faults = 0, v_cow_optim = 0, v_zfod = 0, v_ozfod = 0, v_swapin = 0, v_swapout = 0, v_swappgsin = 0, v_swappgsout = 0, v_vnodein = 0, v_vnodeout = 0, v_vnodepgsin = 0, v_vnodepgsout = 0, v_intrans = 0, v_reactivated = 0, v_pdwakeups = 22197, v_pdpages = 0, v_tcached = 0, v_dfree = 0, v_pfree = 0, v_tfree = 0, v_page_size = 4096, v_page_count = 247688, v_free_reserved = 372, v_free_target = 5320, v_free_min = 1609, v_free_count = 2, v_wire_count = 140735, v_active_count = 96194, v_inactive_target = 7980, v_inactive_count = 10756, v_cache_count = 0, v_pageout_free_min = 34, v_interrupt_free_min = 2, v_free_severe = 990, v_forks = 0, v_vforks = 0, v_rforks = 0, v_kthreads = 0, v_forkpages = 0, v_vforkpages = 0, v_rforkpages = 0, v_kthreadpages = 0, v_spare = 0xffffffff8144d5ac} -j4 and 1 GB of RAM: gpt/swap-ada1.eli BIO_WRITE value ------------- Distribution ------------- count < 4096 | 0 4096 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 4965848 8192 |@@@@@ 943980 12288 |@@ 362668 16384 |@ 161485 20480 |@ 120939 24576 | 87827 28672 | 57402 32768 | 40470 36864 | 42243 40960 | 28543 45056 | 20347 49152 | 15235 53248 | 13450 57344 | 9535 61440 | 9952 65536 |@ 179360 69632 | 0 gpt/swap-ada1.eli BIO_READ value ------------- Distribution ------------- count < 4096 | 0 4096 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 4645114 8192 | 0 12288 | 0 16384 | 3446 20480 | 0 --- sys/geom/eli/g_eli.c | 26 ++++++++++++++++++++++++-- sys/geom/eli/g_eli.h | 18 ++++++++++++++++++ sys/geom/eli/g_eli_privacy.c | 35 +++++++++++++++++++++++++++++++++-- 3 files changed, 75 insertions(+), 4 deletions(-) --- b/sys/geom/eli/g_eli.c +++ b/sys/geom/eli/g_eli.c @@ -82,6 +82,17 @@ u_int g_eli_batch = 0; SYSCTL_UINT(_kern_geom_eli, OID_AUTO, batch, CTLFLAG_RWTUN, &g_eli_batch, 0, "Use crypto operations batching"); +uma_zone_t g_eli_zone; +static u_int g_eli_uma_reserve = 1; 
+SYSCTL_UINT(_kern_geom_eli, OID_AUTO, uma_reserve, CTLFLAG_RDTUN, + &g_eli_uma_reserve, 0, "Items to pre-allocate in dedicated uma zone " + "and reserve for writes to onetime disks"); + +u_int g_eli_all_writes_use_uma = 0; +SYSCTL_UINT(_kern_geom_eli, OID_AUTO, use_uma_for_all_writes, CTLFLAG_RDTUN, + &g_eli_all_writes_use_uma, 0, "Use the dedicated uma zone for all writes. " + "May reduce write latency but also inflates memory use a bit"); + /* * Passphrase cached during boot, in order to be more user-friendly if * there are multiple providers using the same passphrase. @@ -246,7 +257,12 @@ g_eli_write_done(struct bio *bp) pbp->bio_inbed++; if (pbp->bio_inbed < pbp->bio_children) return; - free(pbp->bio_driver2, M_ELI); + sc = pbp->bio_to->geom->softc; + if (g_eli_all_writes_use_uma || + (sc->sc_flags & G_ELI_FLAG_ONETIME) != 0) + uma_zfree(g_eli_zone, pbp->bio_driver2); + else + free(pbp->bio_driver2, M_ELI); pbp->bio_driver2 = NULL; if (pbp->bio_error != 0) { G_ELI_LOGREQ(0, pbp, "%s() failed (error=%d)", __func__, @@ -258,7 +274,6 @@ g_eli_write_done(struct bio *bp) /* * Write is finished, send it up. */ - sc = pbp->bio_to->geom->softc; g_io_deliver(pbp, pbp->bio_error); if (sc != NULL) atomic_subtract_int(&sc->sc_inflight, 1); @@ -1254,6 +1269,12 @@ static void g_eli_init(struct g_class *mp) { + g_eli_zone = uma_zcreate("g_eli", ELI_ZONE_ITEM_SIZE, NULL, NULL, + NULL, NULL, 0, UMA_ZONE_NOFREE); + /* Increase the chances that items are available when needed. 
*/ + uma_prealloc(g_eli_zone, g_eli_uma_reserve); + uma_zone_reserve(g_eli_zone, g_eli_uma_reserve); + g_eli_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync, g_eli_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST); if (g_eli_pre_sync == NULL) @@ -1264,6 +1285,7 @@ static void g_eli_fini(struct g_class *mp) { + uma_zdestroy(g_eli_zone); if (g_eli_pre_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_eli_pre_sync); } --- b/sys/geom/eli/g_eli.h +++ b/sys/geom/eli/g_eli.h @@ -139,6 +139,24 @@ #define G_ELI_CRYPTO_SW 2 #ifdef _KERNEL +/* + * Items in the dedicated uma zone have a fixed size and need + * to be big enough for all write lengths. + * + * MAXPHYS is the largest amount of data geli can receive in a row, + * additionally we have to account for the encryption overhead, which + * depends on the number of sectors. + * + * 512 bytes is the smallest sector size supported and results in the + * largest overhead. If larger sectors are being used, we'll just waste + * a bit more memory. + * + * Given that the zone does not need a lot of items, the generous + * item size seems tolerable for now. 
+ */ +#define ELI_ZONE_ITEM_SIZE (MAXPHYS + (MAXPHYS / 512) * \ + (sizeof(struct cryptop) + sizeof(struct cryptodesc))) + extern int g_eli_debug; extern u_int g_eli_overwrites; extern u_int g_eli_batch; --- b/sys/geom/eli/g_eli_privacy.c +++ b/sys/geom/eli/g_eli_privacy.c @@ -49,6 +49,9 @@ __FBSDID("$FreeBSD$"); #include #include +extern u_int g_eli_all_writes_use_uma; +extern uma_zone_t g_eli_zone; + /* * Code paths: * BIO_READ: @@ -153,7 +156,11 @@ g_eli_crypto_write_done(struct cryptop *crp) if (bp->bio_error != 0) { G_ELI_LOGREQ(0, bp, "Crypto WRITE request failed (error=%d).", bp->bio_error); - free(bp->bio_driver2, M_ELI); + if (g_eli_all_writes_use_uma || + (sc->sc_flags & G_ELI_FLAG_ONETIME) != 0) + uma_zfree(g_eli_zone, bp->bio_driver2); + else + free(bp->bio_driver2, M_ELI); bp->bio_driver2 = NULL; g_destroy_bio(cbp); g_io_deliver(bp, bp->bio_error); @@ -259,8 +266,32 @@ g_eli_crypto_run(struct g_eli_worker *wr, struct bio *bp) */ if (bp->bio_cmd == BIO_WRITE) size += bp->bio_length; - p = malloc(size, M_ELI, M_WAITOK); + if (bp->bio_cmd == BIO_WRITE && + (((sc->sc_flags & G_ELI_FLAG_ONETIME) != 0) || + g_eli_all_writes_use_uma)) { + int uma_flags; + + KASSERT(size <= ELI_ZONE_ITEM_SIZE, + ("Insufficient ELI_ZONE_ITEM_SIZE %u < %u", + (unsigned)ELI_ZONE_ITEM_SIZE, (unsigned)size)); + /* + * Writes to onetime providers are likely to originate + * from the page daemon, therefore we try to get the + * memory a bit harder for them to prevent vm deadlocks. + */ + if ((sc->sc_flags & G_ELI_FLAG_ONETIME) != 0) + uma_flags = M_NOWAIT|M_USE_RESERVE; + else + uma_flags = M_WAITOK; + + while (NULL == (p = uma_zalloc(g_eli_zone, uma_flags))) { + /* Onetime only (M_NOWAIT): retry after ~1 ms, >= 1 tick. */ + pause("g_eli:uma", max(hz/1000, 1)); + } + } else { + p = malloc(size, M_ELI, M_WAITOK); + } bp->bio_inbed = 0; bp->bio_children = nsec; bp->bio_driver2 = p; --