xref: /dragonfly/sys/vm/vm_page2.h (revision 2198d48d13b680ca7a6ab9e1b5ca226967bcc1e5)
1 /*-
2  * Copyright (c) 1982, 1986, 1993
3  *        The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *        @(#)vmmeter.h       8.2 (Berkeley) 7/10/94
30  * $FreeBSD: src/sys/sys/vmmeter.h,v 1.21.2.2 2002/10/10 19:28:21 dillon Exp $
31  */
32 
33 #ifndef _VM_VM_PAGE2_H_
34 #define _VM_VM_PAGE2_H_
35 
36 #ifdef _KERNEL
37 
38 #ifndef _SYS_VMMETER_H_
39 #include <sys/vmmeter.h>
40 #endif
41 #ifndef _SYS_QUEUE_H_
42 #include <sys/queue.h>
43 #endif
44 #ifndef _VM_VM_PAGE_H_
45 #include <vm/vm_page.h>
46 #endif
47 #ifndef _SYS_SPINLOCK_H_
48 #include <sys/spinlock.h>
49 #endif
50 #ifndef _SYS_SPINLOCK2_H_
51 #include <sys/spinlock2.h>
52 #endif
53 
54 /*
55  * SMP NOTE
56  *
57  * VM fault rates are highly dependent on SMP locking conflicts and, on
58  * multi-socket systems, cache mastership changes for globals due to atomic
59  * ops (even simple atomic_add_*() calls).  Cache mastership changes can
60  * limit the aggregate fault rate.
61  *
62  * For this reason we go through some hoops to access VM statistics for
63  * low-memory handling, pageout, and other triggers.  Each cpu collects
64  * adjustments in gd->gd_vmstats_adj.  These get rolled up into the global
65  * vmstats structure.  The global vmstats structure is then pulled into
66  * gd->gd_vmstats by each cpu when it needs it.  Critical path checks always
67  * use the pcpu gd->gd_vmstats structure.
68  */
69 /*
70  * Return TRUE if we are under our severe low-free-pages threshold
71  *
72  * This causes user processes to stall to avoid exhausting memory that
73  * the kernel might need.
74  *
75  * reserved < severe < minimum < wait < start < target1 < target2
76  */
77 static __inline
78 int
vm_paging_severe(void)79 vm_paging_severe(void)
80 {
81           globaldata_t gd = mycpu;
82 
83           if (__predict_false(gd->gd_vmstats.v_free_severe >
84                                   gd->gd_vmstats.v_free_count +
85                                   gd->gd_vmstats.v_cache_count))
86           {
87                     return 1;
88           }
89           if (__predict_false(gd->gd_vmstats.v_free_reserved >
90                                   gd->gd_vmstats.v_free_count))
91           {
92                     return 1;
93           }
94           return 0;
95 }
96 
97 /*
98  * Return TRUE if we are under our minimum low-free-pages threshold.  We
99  * will not count (donotcount) free pages as being free (used mainly for
100  * hystersis tests).
101  *
102  * This will cause most normal page faults to block and activate the
103  * pageout daemon.
104  *
105  * The pageout daemon should already be active due to vm_paging_start(n)
106  * and will typically continue running until it hits target2
107  *
108  * reserved < severe < minimum < wait < start < target1 < target2
109  */
110 static __inline
111 int
vm_paging_min_dnc(long donotcount)112 vm_paging_min_dnc(long donotcount)
113 {
114           globaldata_t gd = mycpu;
115 
116           if (__predict_false(gd->gd_vmstats.v_free_min + donotcount >
117                                   (gd->gd_vmstats.v_free_count +
118                                    gd->gd_vmstats.v_cache_count)))
119           {
120                     return 1;
121           }
122           if (__predict_false(gd->gd_vmstats.v_free_reserved >
123                                   gd->gd_vmstats.v_free_count))
124           {
125                     return 1;
126           }
127           return 0;
128 }
129 
130 /*
131  * Returns TRUE if the number of FREE+CACHE pages falls below vm_paging_wait,
132  * based on the nice value the trip point can be between vm_paging_min and
133  * vm_paging_wait.
134  *
135  * Used by vm_fault (see vm_wait_pfault()) to block a process on low-memory
136  * based on the process 'nice' value (-20 to +20).
137  */
138 static __inline
139 int
vm_paging_min_nice(int nice)140 vm_paging_min_nice(int nice)
141 {
142           long count;
143           long delta;
144 
145           count = 0;
146           if (nice) {
147                     delta = vmstats.v_paging_wait - vmstats.v_free_min - 1;
148                     delta = delta >> 1;
149                     if (delta > 0) {
150                               /* range 0-40, 0 is high priority, 40 is low */
151                               count = (nice + 20) * delta / 40;
152                     }
153           }
154           return vm_paging_min_dnc(count);
155 }
156 
/*
 * Return TRUE if we are under our minimum low-free-pages threshold,
 * counting every free page as free (vm_paging_min_dnc() with a
 * donotcount of 0).
 */
static __inline
int
vm_paging_min(void)
{
	const long donotcount = 0;

	return vm_paging_min_dnc(donotcount);
}
163 
164 /*
165  * Return TRUE if nominal userland / VM-system allocations should slow
166  * down (but not stop) due to low free pages in the system.  This is
167  * typically 1/2 way between min and start.
168  *
169  * reserved < severe < minimum < wait < start < target1 < target2
170  */
171 static __inline
172 int
vm_paging_wait(void)173 vm_paging_wait(void)
174 {
175           globaldata_t gd = mycpu;
176 
177           if (__predict_false(gd->gd_vmstats.v_paging_wait >
178                                   (gd->gd_vmstats.v_free_count +
179                                    gd->gd_vmstats.v_cache_count)))
180         {
181                     return 1;
182           }
183           if (__predict_false(gd->gd_vmstats.v_free_reserved >
184                                   gd->gd_vmstats.v_free_count))
185           {
186                     return 1;
187           }
188           return 0;
189 }
190 
191 /*
192  * Return TRUE if the pageout daemon should be started up or continue
193  * running.  Available pages have dropped to a level where we need to
194  * think about freeing some up.
195  *
196  * Also handles edge cases for required 'actually-free' pages.
197  *
198  * reserved < severe < minimum < wait < start < target1 < target2
199  */
200 static __inline
201 int
vm_paging_start(int adj)202 vm_paging_start(int adj)
203 {
204           globaldata_t gd = mycpu;
205 
206           if (__predict_false(gd->gd_vmstats.v_paging_start >
207                                   (gd->gd_vmstats.v_free_count +
208                                    gd->gd_vmstats.v_cache_count + adj)))
209           {
210                     return 1;
211           }
212           if (__predict_false(gd->gd_vmstats.v_free_min >
213                                   gd->gd_vmstats.v_free_count + adj))
214           {
215                     return 1;
216           }
217           if (__predict_false(gd->gd_vmstats.v_free_reserved >
218                                   gd->gd_vmstats.v_free_count))
219           {
220                     return 1;
221           }
222           return 0;
223 }
224 
225 /*
226  * Return TRUE if the pageout daemon has not yet reached its initial target.
227  * The pageout daemon works hard to reach target1.
228  *
229  * reserved < severe < minimum < wait < start < target1 < target2
230  */
231 static __inline
232 int
vm_paging_target1(void)233 vm_paging_target1(void)
234 {
235           globaldata_t gd = mycpu;
236 
237           if (__predict_false(gd->gd_vmstats.v_paging_target1 >
238                                   (gd->gd_vmstats.v_free_count +
239                                    gd->gd_vmstats.v_cache_count)))
240           {
241                     return 1;
242           }
243           if (__predict_false(gd->gd_vmstats.v_free_reserved >
244                                   gd->gd_vmstats.v_free_count))
245           {
246                     return 1;
247           }
248           return 0;
249 }
250 
251 static __inline
252 long
vm_paging_target1_count(void)253 vm_paging_target1_count(void)
254 {
255           globaldata_t gd = mycpu;
256           long delta;
257 
258           delta = gd->gd_vmstats.v_paging_target1 -
259                     (gd->gd_vmstats.v_free_count + gd->gd_vmstats.v_cache_count);
260           return delta;
261 }
262 
263 /*
264  * Return TRUE if the pageout daemon has not yet reached its final target.
265  * The pageout daemon takes it easy on its way between target1 and target2.
266  *
267  * reserved < severe < minimum < wait < start < target1 < target2
268  */
269 static __inline
270 int
vm_paging_target2(void)271 vm_paging_target2(void)
272 {
273           globaldata_t gd = mycpu;
274 
275           if (__predict_false(gd->gd_vmstats.v_paging_target2 >
276                                   (gd->gd_vmstats.v_free_count +
277                                    gd->gd_vmstats.v_cache_count)))
278           {
279                     return 1;
280           }
281           if (__predict_false(gd->gd_vmstats.v_free_reserved >
282                                   gd->gd_vmstats.v_free_count))
283           {
284                     return 1;
285           }
286           return 0;
287 }
288 
289 static __inline
290 long
vm_paging_target2_count(void)291 vm_paging_target2_count(void)
292 {
293           globaldata_t gd = mycpu;
294           long delta;
295 
296           delta = gd->gd_vmstats.v_paging_target2 -
297                     (gd->gd_vmstats.v_free_count + gd->gd_vmstats.v_cache_count);
298           return delta;
299 }
300 
301 /*
302  * Returns TRUE if additional pages must be deactivated, either during a
303  * pageout operation or during the page stats scan.
304  *
305  * Inactive tests are used in two places.  During heavy paging the
306  * inactive_target is used to refill the inactive queue in staged.
307  * Those pages are then ultimately flushed and moved to the cache or free
308  * queues.
309  *
310  * The inactive queue is also used to manage scans to update page stats
311  * (m->act_count).  The page stats scan occurs lazily in small batches to
312  * update m->act_count for pages in the active queue and to move pages
313  * (limited by inactive_target) to the inactive queue.  Page stats scanning
314  * and active deactivations only run while the inactive queue is below target.
315  * After this, additional page stats scanning just to update m->act_count
316  * (but not do further deactivations) continues to run for a limited period
317  * of time after any pageout daemon activity.
318  */
319 static __inline
320 int
vm_paging_inactive(void)321 vm_paging_inactive(void)
322 {
323           globaldata_t gd = mycpu;
324 
325           if (__predict_false((gd->gd_vmstats.v_free_count +
326                                    gd->gd_vmstats.v_cache_count +
327                                    gd->gd_vmstats.v_inactive_count) <
328                                   (gd->gd_vmstats.v_free_min +
329                                    gd->gd_vmstats.v_inactive_target)))
330           {
331                     return 1;
332           }
333           return 0;
334 }
335 
336 /*
337  * Return number of pages that need to be deactivated to achieve the inactive
338  * target as a positive number.  A negative number indicates that there are
339  * already a sufficient number of inactive pages.
340  */
341 static __inline
342 long
vm_paging_inactive_count(void)343 vm_paging_inactive_count(void)
344 {
345           globaldata_t gd = mycpu;
346           long delta;
347 
348           delta = (gd->gd_vmstats.v_free_min + gd->gd_vmstats.v_inactive_target) -
349                     (gd->gd_vmstats.v_free_count + gd->gd_vmstats.v_cache_count +
350                      gd->gd_vmstats.v_inactive_count);
351 
352           return delta;
353 }
354 
355 /*
356  * Clear dirty bits in the VM page but truncate the
357  * end to a DEV_BSIZE'd boundary.
358  *
359  * Used when reading data in, typically via getpages.
360  * The partial device block at the end of the truncation
361  * range should not lose its dirty bit.
362  *
363  * NOTE: This function does not clear the pmap modified bit.
364  */
365 static __inline
366 void
vm_page_clear_dirty_end_nonincl(vm_page_t m,int base,int size)367 vm_page_clear_dirty_end_nonincl(vm_page_t m, int base, int size)
368 {
369     size = (base + size) & ~DEV_BMASK;
370     if (base < size)
371           vm_page_clear_dirty(m, base, size - base);
372 }
373 
374 /*
375  * Clear dirty bits in the VM page but truncate the
376  * beginning to a DEV_BSIZE'd boundary.
377  *
378  * Used when truncating a buffer.  The partial device
379  * block at the beginning of the truncation range
380  * should not lose its dirty bit.
381  *
382  * NOTE: This function does not clear the pmap modified bit.
383  */
384 static __inline
385 void
vm_page_clear_dirty_beg_nonincl(vm_page_t m,int base,int size)386 vm_page_clear_dirty_beg_nonincl(vm_page_t m, int base, int size)
387 {
388     size += base;
389     base = (base + DEV_BMASK) & ~DEV_BMASK;
390     if (base < size)
391           vm_page_clear_dirty(m, base, size - base);
392 }
393 
394 static __inline
395 void
vm_page_spin_lock(vm_page_t m)396 vm_page_spin_lock(vm_page_t m)
397 {
398     spin_lock(&m->spin);
399 }
400 
401 static __inline
402 void
vm_page_spin_unlock(vm_page_t m)403 vm_page_spin_unlock(vm_page_t m)
404 {
405     spin_unlock(&m->spin);
406 }
407 
408 /*
409  * Wire a vm_page that is already wired.  Does not require a busied
410  * page.
411  */
412 static __inline
413 void
vm_page_wire_quick(vm_page_t m)414 vm_page_wire_quick(vm_page_t m)
415 {
416     if (atomic_fetchadd_int(&m->wire_count, 1) == 0)
417           panic("vm_page_wire_quick: wire_count was 0");
418 }
419 
420 /*
421  * Unwire a vm_page quickly, does not require a busied page.
422  *
423  * This routine refuses to drop the wire_count to 0 and will return
424  * TRUE if it would have had to (instead of decrementing it to 0).
425  * The caller can then busy the page and deal with it.
426  */
427 static __inline
428 int
vm_page_unwire_quick(vm_page_t m)429 vm_page_unwire_quick(vm_page_t m)
430 {
431     KKASSERT(m->wire_count > 0);
432     for (;;) {
433           u_int wire_count = m->wire_count;
434 
435           cpu_ccfence();
436           if (wire_count == 1)
437                     return TRUE;
438           if (atomic_cmpset_int(&m->wire_count, wire_count, wire_count - 1))
439                     return FALSE;
440     }
441 }
442 
/*
 *	Functions implemented as inlines
 */
446 
447 static __inline void
vm_page_flag_set(vm_page_t m,unsigned int bits)448 vm_page_flag_set(vm_page_t m, unsigned int bits)
449 {
450           atomic_set_int(&(m)->flags, bits);
451 }
452 
453 static __inline void
vm_page_flag_clear(vm_page_t m,unsigned int bits)454 vm_page_flag_clear(vm_page_t m, unsigned int bits)
455 {
456           atomic_clear_int(&(m)->flags, bits);
457 }
458 
459 /*
460  * Wakeup anyone waiting for the page after potentially unbusying
461  * (hard or soft) or doing other work on a page that might make a
462  * waiter ready.  The setting of PBUSY_WANTED is integrated into the
463  * related flags and it can't be set once the flags are already
464  * clear, so there should be no races here.
465  */
466 static __inline void
vm_page_flash(vm_page_t m)467 vm_page_flash(vm_page_t m)
468 {
469           if (m->busy_count & PBUSY_WANTED) {
470                     atomic_clear_int(&m->busy_count, PBUSY_WANTED);
471                     wakeup(m);
472           }
473 }
474 
475 /*
476  * Adjust the soft-busy count on a page.  The drop code will issue an
477  * integrated wakeup if busy_count becomes 0.
478  */
479 static __inline void
vm_page_sbusy_hold(vm_page_t m)480 vm_page_sbusy_hold(vm_page_t m)
481 {
482           atomic_add_int(&m->busy_count, 1);
483 }
484 
485 static __inline void
vm_page_sbusy_drop(vm_page_t m)486 vm_page_sbusy_drop(vm_page_t m)
487 {
488           uint32_t ocount;
489 
490           ocount = atomic_fetchadd_int(&m->busy_count, -1);
491           if (ocount - 1 == PBUSY_WANTED) {
492                     /* WANTED and no longer BUSY or SBUSY */
493                     atomic_clear_int(&m->busy_count, PBUSY_WANTED);
494                     wakeup(m);
495           }
496 }
497 
498 /*
499  * Reduce the protection of a page.  This routine never raises the
500  * protection and therefore can be safely called if the page is already
501  * at VM_PROT_NONE (it will be a NOP effectively ).
502  *
503  * VM_PROT_NONE will remove all user mappings of a page.  This is often
504  * necessary when a page changes state (for example, turns into a copy-on-write
505  * page or needs to be frozen for write I/O) in order to force a fault, or
506  * to force a page's dirty bits to be synchronized and avoid hardware
507  * (modified/accessed) bit update races with pmap changes.
508  *
509  * Since 'prot' is usually a constant, this inline usually winds up optimizing
510  * out the primary conditional.
511  *
512  * Must be called with (m) hard-busied.
513  *
514  * WARNING: VM_PROT_NONE can block, but will loop until all mappings have
515  *            been cleared.  Callers should be aware that other page related
516  *            elements might have changed, however.
517  */
518 static __inline void
vm_page_protect(vm_page_t m,int prot)519 vm_page_protect(vm_page_t m, int prot)
520 {
521           KKASSERT(m->busy_count & PBUSY_LOCKED);
522           if (prot == VM_PROT_NONE) {
523                     if (pmap_mapped_sync(m) & (PG_MAPPED | PG_WRITEABLE)) {
524                               pmap_page_protect(m, VM_PROT_NONE);
525                               /* PG_WRITEABLE & PG_MAPPED cleared by call */
526                     }
527           } else if ((prot == VM_PROT_READ) &&
528                        (m->flags & PG_WRITEABLE) &&
529                        (pmap_mapped_sync(m) & PG_WRITEABLE)) {
530                     pmap_page_protect(m, VM_PROT_READ);
531                     /* PG_WRITEABLE cleared by call */
532           }
533 }
534 
535 /*
536  * Zero-fill the specified page.  The entire contents of the page will be
537  * zero'd out.
538  */
539 static __inline boolean_t
vm_page_zero_fill(vm_page_t m)540 vm_page_zero_fill(vm_page_t m)
541 {
542           pmap_zero_page(VM_PAGE_TO_PHYS(m));
543           return (TRUE);
544 }
545 
546 /*
547  * Copy the contents of src_m to dest_m.  The pages must be stable but spl
548  * and other protections depend on context.
549  */
550 static __inline void
vm_page_copy(vm_page_t src_m,vm_page_t dest_m)551 vm_page_copy(vm_page_t src_m, vm_page_t dest_m)
552 {
553           pmap_copy_page(VM_PAGE_TO_PHYS(src_m), VM_PAGE_TO_PHYS(dest_m));
554           dest_m->valid = VM_PAGE_BITS_ALL;
555           dest_m->dirty = VM_PAGE_BITS_ALL;
556 }
557 
558 /*
559  * Free a page.  The page must be marked BUSY.
560  */
561 static __inline void
vm_page_free(vm_page_t m)562 vm_page_free(vm_page_t m)
563 {
564           vm_page_free_toq(m);
565 }
566 
567 /*
568  * Free a page to the zerod-pages queue.  The caller must ensure that the
569  * page has been zerod.
570  */
571 static __inline void
vm_page_free_zero(vm_page_t m)572 vm_page_free_zero(vm_page_t m)
573 {
574 #ifdef PMAP_DEBUG
575 #ifdef PHYS_TO_DMAP
576           char *p = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
577           int i;
578 
579           for (i = 0; i < PAGE_SIZE; i++) {
580                     if (p[i] != 0) {
581                               panic("non-zero page in vm_page_free_zero()");
582                     }
583           }
584 #endif
585 #endif
586           vm_page_free_toq(m);
587 }
588 
589 /*
590  * Set page to not be dirty.  Note: does not clear pmap modify bits .
591  */
592 static __inline void
vm_page_undirty(vm_page_t m)593 vm_page_undirty(vm_page_t m)
594 {
595           m->dirty = 0;
596 }
597 
598 #endif    /* _KERNEL */
599 #endif    /* _VM_VM_PAGE2_H_ */
600 
601