*** /usr/src/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c Fri Mar 14 15:36:17 2014 --- arc.c.original Thu Mar 13 09:18:48 2014 *************** *** 18,85 **** * * CDDL HEADER END */ - - /* Karl Denninger (karl@denninger.net), 3/13/2014, FreeBSD-specific - * - * If "NEWRECLAIM" is defined, change the "low memory" warning that causes - * the ARC cache to be pared down. The reason for the change is that the - * apparent attempted algorithm is to start evicting ARC cache when free - * pages fall below 25% of installed RAM. This maps reasonably well to how - * Solaris is documented to behave; when "lotsfree" is invaded ZFS is told - * to pare down. - * - * The problem is that on FreeBSD machines the system doesn't appear to be - * getting what the authors of the original code thought they were looking at - * with its test and as a result that test never triggers. That leaves the - * only reclaim trigger as the "paging needed" status flag, and by the time - * that trips the system is already in low-memory trouble. This can lead to - * severe pathological behavior under the following scenario: - * - The system starts to page and ARC is evicted. - * - The system stops paging as ARC's eviction drops wired RAM a bit. - * - ARC starts increasing its allocation again, and wired memory grows. - * - A new image is activated, and the system once again attempts to page. - * - ARC starts to be evicted again. - * - Back to the second step (the system stops paging), and the cycle repeats. - * - * Note that ZFS's ARC default (unless you override it in /boot/loader.conf) - * is to allow the ARC cache to grab nearly all of free RAM, provided nobody - * else needs it. That would be ok if we evicted cache when required. - * - * Unfortunately the system can get into a state where it never - * manages to page anything material back in, because if there is active - * I/O the ARC will start grabbing space once again as soon as the memory - * contention state drops. 
For this reason the "paging is occurring" flag - * should be the **last resort** condition for ARC eviction; you want to - * (as Solaris does) start when there is material free RAM left in the hope - * of never getting into the condition where you're potentially paging off - * executables in favor of leaving disk cache allocated. That's a recipe - * for terrible overall system performance. - * - * To fix this we instead read five page counts from the vm.stats.vm sysctl - * OIDs -- wired, active, inactive, cache (vnodes?) and free pages -- sum - * them to get the total page count, and express the free page count as a - * percentage of that total, giving us a percentage of pages free. This - * is checked against a new tunable "vfs.zfs.arc_freepage_percent_target" - * and if less, we declare the system low on memory. - * - * Note that this sysctl variable is runtime tunable if you have reason - * to change it (e.g. you want more or less RAM free to be the "clean up" - * threshold.) - * - * If we're using this check for low memory we are replacing the previous - * ones, including the oddball "random" reclaim that appears to fire far - * more often than it should. We still trigger if the system pages. - * - * If you turn on NEWRECLAIM_DEBUG then the kernel will print on the console - * status messages when the reclaim status trips on and off, along with the - * page count aggregate that triggered it (and the free space) for each - * event. - */ - - #define NEWRECLAIM - #undef NEWRECLAIM_DEBUG - - /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. 
--- 18,23 ---- *************** *** 201,212 **** #include - #ifdef NEWRECLAIM - #ifdef __FreeBSD__ - #include - #endif - #endif /* NEWRECLAIM */ - #ifdef illumos #ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ --- 139,144 ---- *************** *** 271,303 **** int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; int zfs_disable_dup_eviction = 0; - #ifdef NEWRECLAIM - #ifdef __FreeBSD__ - static int percent_target = 25; - #endif - #endif TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max); TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min); TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); - #ifdef NEWRECLAIM - #ifdef __FreeBSD__ - TUNABLE_INT("vfs.zfs.arc_freepage_percent_target", &percent_target); - #endif - #endif - SYSCTL_DECL(_vfs_zfs); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, "Maximum ARC size"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, "Minimum ARC size"); - #ifdef NEWRECLAIM - #ifdef __FreeBSD__ - SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_freepage_percent_target, CTLFLAG_RWTUN, &percent_target, 0, "ARC Free RAM Target percentage"); - #endif - #endif - /* * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) --- 203,218 ---- *************** *** 2523,2544 **** { #ifdef _KERNEL - #ifdef NEWRECLAIM - #ifdef __FreeBSD__ - u_int vmwire = 0; - u_int vmactive = 0; - u_int vminactive = 0; - u_int vmcache = 0; - u_int vmfree = 0; - u_int vmtotal = 0; - int percent = 25; - size_t vmsize; - #ifdef NEWRECLAIM_DEBUG - static int xval = -1; - static int oldpercent = 0; - #endif /* NEWRECLAIM_DEBUG */ - #endif /* NEWRECLAIM */ - #endif if (needfree) return (1); --- 2438,2443 ---- *************** *** 2577,2583 **** return (1); #if defined(__i386) - /* * If we're on an i386 platform, it's possible that we'll exhaust the * kernel heap space before we ever run out of available physical --- 2476,2481 ---- *************** *** 2594,2659 **** 
return (1); #endif #else /* !sun */ - - #ifdef NEWRECLAIM - #ifdef __FreeBSD__ - /* - * Implement the new tunable free RAM algorithm. We check the various page - * VM stats and add them up, then check the free count percentage against - * the specified target. If we're under the target we are memory constrained - * and ask for ARC cache shrinkage. If this is defined on a FreeBSD system - * the older checks are not performed. - */ - vmsize = sizeof(vmwire); - kernel_sysctlbyname(curthread, "vm.stats.vm.v_wire_count", &vmwire, &vmsize, NULL, 0, NULL, 0); - vmsize = sizeof(vmactive); - kernel_sysctlbyname(curthread, "vm.stats.vm.v_active_count", &vmactive, &vmsize, NULL, 0, NULL, 0); - vmsize = sizeof(vminactive); - kernel_sysctlbyname(curthread, "vm.stats.vm.v_inactive_count", &vminactive, &vmsize, NULL, 0, NULL, 0); - vmsize = sizeof(vmcache); - kernel_sysctlbyname(curthread, "vm.stats.vm.v_cache_count", &vmcache, &vmsize, NULL, 0, NULL, 0); - vmsize = sizeof(vmfree); - kernel_sysctlbyname(curthread, "vm.stats.vm.v_free_count", &vmfree, &vmsize, NULL, 0, NULL, 0); - vmsize = sizeof(percent); - kernel_sysctlbyname(curthread, "vfs.zfs.arc_freepage_percent_target", &percent, &vmsize, NULL, 0, NULL, 0); - vmtotal = vmwire + vmactive + vminactive + vmcache + vmfree; - #ifdef NEWRECLAIM_DEBUG - if (percent != oldpercent) { - printf("ZFS ARC: Reservation change to [%d], [%d] pages, [%d] free\n", percent, vmtotal, vmfree); - oldpercent = percent; - } - #endif - - if (!vmtotal) { - vmtotal = 1; /* Protect against divide by zero */ - /* (should be impossible, but...) 
*/ - } - - if (((vmfree * 100) / vmtotal) < percent) { - #ifdef NEWRECLAIM_DEBUG - if (xval != 1) { - printf("ZFS ARC: RECLAIM total %u, free %u, free pct (%u), target pct (%u)\n", vmtotal, vmfree, ((vmfree * 100) / vmtotal), percent); - xval = 1; - } - #endif /* NEWRECLAIM_DEBUG */ - return(1); - } else { - #ifdef NEWRECLAIM_DEBUG - if (xval != 0) { - printf("ZFS ARC: NORMAL total %u, free %u, free pct (%u), target pct (%u)\n", vmtotal, vmfree, ((vmfree * 100) / vmtotal), percent); - xval = 0; - } - #endif - return(0); - } - - #endif /* __FreeBSD__ */ - #endif /* NEWRECLAIM */ - if (kmem_used() > (kmem_size() * 3) / 4) return (1); #endif /* sun */ if (spa_get_random(100) == 0) return (1); #endif --- 2492,2502 ---- return (1); #endif #else /* !sun */ if (kmem_used() > (kmem_size() * 3) / 4) return (1); #endif /* sun */ + #else if (spa_get_random(100) == 0) return (1); #endif