1 /*        $NetBSD: uvm_pdpolicy_clock.c,v 1.40 2022/04/12 20:27:56 andvar Exp $ */
2 /*        NetBSD: uvm_pdaemon.c,v 1.72 2006/01/05 10:47:33 yamt Exp $ */
3 
4 /*-
5  * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by Andrew Doran.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * Copyright (c) 1997 Charles D. Cranor and Washington University.
35  * Copyright (c) 1991, 1993, The Regents of the University of California.
36  *
37  * All rights reserved.
38  *
39  * This code is derived from software contributed to Berkeley by
40  * The Mach Operating System project at Carnegie-Mellon University.
41  *
42  * Redistribution and use in source and binary forms, with or without
43  * modification, are permitted provided that the following conditions
44  * are met:
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice, this list of conditions and the following disclaimer.
47  * 2. Redistributions in binary form must reproduce the above copyright
48  *    notice, this list of conditions and the following disclaimer in the
49  *    documentation and/or other materials provided with the distribution.
50  * 3. Neither the name of the University nor the names of its contributors
51  *    may be used to endorse or promote products derived from this software
52  *    without specific prior written permission.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64  * SUCH DAMAGE.
65  *
66  *        @(#)vm_pageout.c        8.5 (Berkeley) 2/14/94
67  * from: Id: uvm_pdaemon.c,v 1.1.2.32 1998/02/06 05:26:30 chs Exp
68  *
69  *
70  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
71  * All rights reserved.
72  *
73  * Permission to use, copy, modify and distribute this software and
74  * its documentation is hereby granted, provided that both the copyright
75  * notice and this permission notice appear in all copies of the
76  * software, derivative works or modified versions, and any portions
77  * thereof, and that both notices appear in supporting documentation.
78  *
79  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
80  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
81  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
82  *
83  * Carnegie Mellon requests users of this software to return to
84  *
85  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
86  *  School of Computer Science
87  *  Carnegie Mellon University
88  *  Pittsburgh PA 15213-3890
89  *
90  * any improvements or extensions that they make and grant Carnegie the
91  * rights to redistribute these changes.
92  */
93 
94 #if defined(PDSIM)
95 
96 #include "pdsim.h"
97 
98 #else /* defined(PDSIM) */
99 
100 #include <sys/cdefs.h>
101 __KERNEL_RCSID(0, "$NetBSD: uvm_pdpolicy_clock.c,v 1.40 2022/04/12 20:27:56 andvar Exp $");
102 
103 #include <sys/param.h>
104 #include <sys/proc.h>
105 #include <sys/systm.h>
106 #include <sys/kernel.h>
107 #include <sys/kmem.h>
108 #include <sys/atomic.h>
109 
110 #include <uvm/uvm.h>
111 #include <uvm/uvm_pdpolicy.h>
112 #include <uvm/uvm_pdpolicy_impl.h>
113 #include <uvm/uvm_stat.h>
114 
115 #endif /* defined(PDSIM) */
116 
/*
 * Capacity, in entries, of each per-CPU queue of pending page status
 * changes.  The default of 128 entries gives a 1kB queue on _LP64, which
 * has been found to keep lock contention events and wait times low without
 * using too much memory or letting global state fall too far behind.
 */
#ifndef CLOCK_PDQ_SIZE
#define	CLOCK_PDQ_SIZE		128
#endif /* CLOCK_PDQ_SIZE */

/* queue membership bits, tested and set in pg->pqflags by this file */
#define	PQ_INACTIVE		0x00000010	/* page is in inactive list */
#define	PQ_ACTIVE		0x00000020	/* page is in active list */

/* initial value handed to uvm_pctparam_init() for s_inactivepct */
#ifndef CLOCK_INACTIVEPCT
#define	CLOCK_INACTIVEPCT	33
#endif /* CLOCK_INACTIVEPCT */
133 
134 struct uvmpdpol_globalstate {
135           kmutex_t lock;                          /* lock on state */
136                                                   /* <= compiler pads here */
137           struct pglist s_activeq                 /* allocated pages, in use */
138               __aligned(COHERENCY_UNIT);
139           struct pglist s_inactiveq;    /* pages between the clock hands */
140           int s_active;
141           int s_inactive;
142           int s_inactarg;
143           struct uvm_pctparam s_anonmin;
144           struct uvm_pctparam s_filemin;
145           struct uvm_pctparam s_execmin;
146           struct uvm_pctparam s_anonmax;
147           struct uvm_pctparam s_filemax;
148           struct uvm_pctparam s_execmax;
149           struct uvm_pctparam s_inactivepct;
150 };
151 
152 struct uvmpdpol_scanstate {
153           bool ss_anonreact, ss_filereact, ss_execreact;
154           struct vm_page ss_marker;
155 };
156 
157 static void         uvmpdpol_pageactivate_locked(struct vm_page *);
158 static void         uvmpdpol_pagedeactivate_locked(struct vm_page *);
159 static void         uvmpdpol_pagedequeue_locked(struct vm_page *);
160 static bool         uvmpdpol_pagerealize_locked(struct vm_page *);
161 static struct uvm_cpu *uvmpdpol_flush(void);
162 
163 static struct uvmpdpol_globalstate pdpol_state __cacheline_aligned;
164 static struct uvmpdpol_scanstate pdpol_scanstate;
165 
166 PDPOL_EVCNT_DEFINE(reactexec)
PDPOL_EVCNT_DEFINE(reactfile)167 PDPOL_EVCNT_DEFINE(reactfile)
168 PDPOL_EVCNT_DEFINE(reactanon)
169 
170 static void
171 clock_tune(void)
172 {
173           struct uvmpdpol_globalstate *s = &pdpol_state;
174 
175           s->s_inactarg = UVM_PCTPARAM_APPLY(&s->s_inactivepct,
176               s->s_active + s->s_inactive);
177           if (s->s_inactarg <= uvmexp.freetarg) {
178                     s->s_inactarg = uvmexp.freetarg + 1;
179           }
180 }
181 
182 void
uvmpdpol_scaninit(void)183 uvmpdpol_scaninit(void)
184 {
185           struct uvmpdpol_globalstate *s = &pdpol_state;
186           struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
187           int t;
188           bool anonunder, fileunder, execunder;
189           bool anonover, fileover, execover;
190           bool anonreact, filereact, execreact;
191           int64_t freepg, anonpg, filepg, execpg;
192 
193           /*
194            * decide which types of pages we want to reactivate instead of freeing
195            * to keep usage within the minimum and maximum usage limits.
196            * uvm_availmem() will sync the counters.
197            */
198 
199           freepg = uvm_availmem(false);
200           anonpg = cpu_count_get(CPU_COUNT_ANONCLEAN) +
201               cpu_count_get(CPU_COUNT_ANONDIRTY) +
202               cpu_count_get(CPU_COUNT_ANONUNKNOWN);
203           execpg = cpu_count_get(CPU_COUNT_EXECPAGES);
204           filepg = cpu_count_get(CPU_COUNT_FILECLEAN) +
205               cpu_count_get(CPU_COUNT_FILEDIRTY) +
206               cpu_count_get(CPU_COUNT_FILEUNKNOWN) -
207               execpg;
208 
209           mutex_enter(&s->lock);
210           t = s->s_active + s->s_inactive + freepg;
211           anonunder = anonpg <= UVM_PCTPARAM_APPLY(&s->s_anonmin, t);
212           fileunder = filepg <= UVM_PCTPARAM_APPLY(&s->s_filemin, t);
213           execunder = execpg <= UVM_PCTPARAM_APPLY(&s->s_execmin, t);
214           anonover = anonpg > UVM_PCTPARAM_APPLY(&s->s_anonmax, t);
215           fileover = filepg > UVM_PCTPARAM_APPLY(&s->s_filemax, t);
216           execover = execpg > UVM_PCTPARAM_APPLY(&s->s_execmax, t);
217           anonreact = anonunder || (!anonover && (fileover || execover));
218           filereact = fileunder || (!fileover && (anonover || execover));
219           execreact = execunder || (!execover && (anonover || fileover));
220           if (filereact && execreact && (anonreact || uvm_swapisfull())) {
221                     anonreact = filereact = execreact = false;
222           }
223           ss->ss_anonreact = anonreact;
224           ss->ss_filereact = filereact;
225           ss->ss_execreact = execreact;
226           memset(&ss->ss_marker, 0, sizeof(ss->ss_marker));
227           ss->ss_marker.flags = PG_MARKER;
228           TAILQ_INSERT_HEAD(&pdpol_state.s_inactiveq, &ss->ss_marker, pdqueue);
229           mutex_exit(&s->lock);
230 }
231 
232 void
uvmpdpol_scanfini(void)233 uvmpdpol_scanfini(void)
234 {
235           struct uvmpdpol_globalstate *s = &pdpol_state;
236           struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
237 
238           mutex_enter(&s->lock);
239           TAILQ_REMOVE(&pdpol_state.s_inactiveq, &ss->ss_marker, pdqueue);
240           mutex_exit(&s->lock);
241 }
242 
243 struct vm_page *
uvmpdpol_selectvictim(krwlock_t ** plock)244 uvmpdpol_selectvictim(krwlock_t **plock)
245 {
246           struct uvmpdpol_globalstate *s = &pdpol_state;
247           struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
248           struct vm_page *pg;
249           krwlock_t *lock;
250 
251           mutex_enter(&s->lock);
252           while (/* CONSTCOND */ 1) {
253                     struct vm_anon *anon;
254                     struct uvm_object *uobj;
255 
256                     pg = TAILQ_NEXT(&ss->ss_marker, pdqueue);
257                     if (pg == NULL) {
258                               break;
259                     }
260                     KASSERT((pg->flags & PG_MARKER) == 0);
261                     uvmexp.pdscans++;
262 
263                     /*
264                      * acquire interlock to stabilize page identity.
265                      * if we have caught the page in a state of flux
266                      * deal with it and retry.
267                      */
268                     mutex_enter(&pg->interlock);
269                     if (uvmpdpol_pagerealize_locked(pg)) {
270                               mutex_exit(&pg->interlock);
271                               continue;
272                     }
273 
274                     /*
275                      * now prepare to move on to the next page.
276                      */
277                     TAILQ_REMOVE(&pdpol_state.s_inactiveq, &ss->ss_marker,
278                         pdqueue);
279                     TAILQ_INSERT_AFTER(&pdpol_state.s_inactiveq, pg,
280                         &ss->ss_marker, pdqueue);
281 
282                     /*
283                      * enforce the minimum thresholds on different
284                      * types of memory usage.  if reusing the current
285                      * page would reduce that type of usage below its
286                      * minimum, reactivate the page instead and move
287                      * on to the next page.
288                      */
289                     anon = pg->uanon;
290                     uobj = pg->uobject;
291                     if (uobj && UVM_OBJ_IS_VTEXT(uobj) && ss->ss_execreact) {
292                               uvmpdpol_pageactivate_locked(pg);
293                               mutex_exit(&pg->interlock);
294                               PDPOL_EVCNT_INCR(reactexec);
295                               continue;
296                     }
297                     if (uobj && UVM_OBJ_IS_VNODE(uobj) &&
298                         !UVM_OBJ_IS_VTEXT(uobj) && ss->ss_filereact) {
299                               uvmpdpol_pageactivate_locked(pg);
300                               mutex_exit(&pg->interlock);
301                               PDPOL_EVCNT_INCR(reactfile);
302                               continue;
303                     }
304                     if ((anon || UVM_OBJ_IS_AOBJ(uobj)) && ss->ss_anonreact) {
305                               uvmpdpol_pageactivate_locked(pg);
306                               mutex_exit(&pg->interlock);
307                               PDPOL_EVCNT_INCR(reactanon);
308                               continue;
309                     }
310 
311                     /*
312                      * try to lock the object that owns the page.
313                      *
314                      * with the page interlock held, we can drop s->lock, which
315                      * could otherwise serve as a barrier to us getting the
316                      * object locked, because the owner of the object's lock may
317                      * be blocked on s->lock (i.e. a deadlock).
318                      *
319                      * whatever happens, uvmpd_trylockowner() will release the
320                      * interlock.  with the interlock dropped we can then
321                      * re-acquire our own lock.  the order is:
322                      *
323                      *        object -> pdpol -> interlock.
324                    */
325                   mutex_exit(&s->lock);
326           lock = uvmpd_trylockowner(pg);
327           /* pg->interlock now released */
328           mutex_enter(&s->lock);
329                     if (lock == NULL) {
330                               /* didn't get it - try the next page. */
331                               continue;
332                     }
333 
334                     /*
335                      * move referenced pages back to active queue and skip to
336                      * next page.
337                      */
338                     if (pmap_is_referenced(pg)) {
339                               mutex_enter(&pg->interlock);
340                               uvmpdpol_pageactivate_locked(pg);
341                               mutex_exit(&pg->interlock);
342                               uvmexp.pdreact++;
343                               rw_exit(lock);
344                               continue;
345                     }
346 
347                     /* we have a potential victim. */
348                     *plock = lock;
349                     break;
350           }
351           mutex_exit(&s->lock);
352           return pg;
353 }
354 
355 void
uvmpdpol_balancequeue(int swap_shortage)356 uvmpdpol_balancequeue(int swap_shortage)
357 {
358           struct uvmpdpol_globalstate *s = &pdpol_state;
359           int inactive_shortage;
360           struct vm_page *p, marker;
361           krwlock_t *lock;
362 
363           /*
364            * we have done the scan to get free pages.   now we work on meeting
365            * our inactive target.
366            */
367 
368           memset(&marker, 0, sizeof(marker));
369           marker.flags = PG_MARKER;
370 
371           mutex_enter(&s->lock);
372           TAILQ_INSERT_HEAD(&pdpol_state.s_activeq, &marker, pdqueue);
373           for (;;) {
374                     inactive_shortage =
375                         pdpol_state.s_inactarg - pdpol_state.s_inactive;
376                     if (inactive_shortage <= 0 && swap_shortage <= 0) {
377                               break;
378                     }
379                     p = TAILQ_NEXT(&marker, pdqueue);
380                     if (p == NULL) {
381                               break;
382                     }
383                     KASSERT((p->flags & PG_MARKER) == 0);
384 
385                     /*
386                      * acquire interlock to stabilize page identity.
387                      * if we have caught the page in a state of flux
388                      * deal with it and retry.
389                      */
390                     mutex_enter(&p->interlock);
391                     if (uvmpdpol_pagerealize_locked(p)) {
392                               mutex_exit(&p->interlock);
393                               continue;
394                     }
395 
396                     /*
397                      * now prepare to move on to the next page.
398                      */
399                     TAILQ_REMOVE(&pdpol_state.s_activeq, &marker, pdqueue);
400                     TAILQ_INSERT_AFTER(&pdpol_state.s_activeq, p, &marker,
401                         pdqueue);
402 
403                     /*
404                      * try to lock the object that owns the page.  see comments
405                      * in uvmpdol_selectvictim().
406                    */
407                   mutex_exit(&s->lock);
408           lock = uvmpd_trylockowner(p);
409           /* p->interlock now released */
410           mutex_enter(&s->lock);
411                     if (lock == NULL) {
412                               /* didn't get it - try the next page. */
413                               continue;
414                     }
415 
416                     /*
417                      * if there's a shortage of swap slots, try to free it.
418                      */
419                     if (swap_shortage > 0 && (p->flags & PG_SWAPBACKED) != 0 &&
420                         (p->flags & PG_BUSY) == 0) {
421                               if (uvmpd_dropswap(p)) {
422                                         swap_shortage--;
423                               }
424                     }
425 
426                     /*
427                      * if there's a shortage of inactive pages, deactivate.
428                      */
429                     if (inactive_shortage > 0) {
430                               pmap_clear_reference(p);
431                               mutex_enter(&p->interlock);
432                               uvmpdpol_pagedeactivate_locked(p);
433                               mutex_exit(&p->interlock);
434                               uvmexp.pddeact++;
435                               inactive_shortage--;
436                     }
437                     rw_exit(lock);
438           }
439           TAILQ_REMOVE(&pdpol_state.s_activeq, &marker, pdqueue);
440           mutex_exit(&s->lock);
441 }
442 
443 static void
uvmpdpol_pagedeactivate_locked(struct vm_page * pg)444 uvmpdpol_pagedeactivate_locked(struct vm_page *pg)
445 {
446           struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
447 
448           KASSERT(mutex_owned(&s->lock));
449           KASSERT(mutex_owned(&pg->interlock));
450           KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) !=
451               (PQ_INTENT_D | PQ_INTENT_SET));
452 
453           if (pg->pqflags & PQ_ACTIVE) {
454                     TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue);
455                     KASSERT(pdpol_state.s_active > 0);
456                     pdpol_state.s_active--;
457           }
458           if ((pg->pqflags & PQ_INACTIVE) == 0) {
459                     KASSERT(pg->wire_count == 0);
460                     TAILQ_INSERT_TAIL(&pdpol_state.s_inactiveq, pg, pdqueue);
461                     pdpol_state.s_inactive++;
462           }
463           pg->pqflags &= ~(PQ_ACTIVE | PQ_INTENT_SET);
464           pg->pqflags |= PQ_INACTIVE;
465 }
466 
467 void
uvmpdpol_pagedeactivate(struct vm_page * pg)468 uvmpdpol_pagedeactivate(struct vm_page *pg)
469 {
470 
471           KASSERT(uvm_page_owner_locked_p(pg, false));
472           KASSERT(mutex_owned(&pg->interlock));
473 
474           /*
475            * we have to clear the reference bit now, as when it comes time to
476            * realize the intent we won't have the object locked any more.
477            */
478           pmap_clear_reference(pg);
479           uvmpdpol_set_intent(pg, PQ_INTENT_I);
480 }
481 
482 static void
uvmpdpol_pageactivate_locked(struct vm_page * pg)483 uvmpdpol_pageactivate_locked(struct vm_page *pg)
484 {
485           struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
486 
487           KASSERT(mutex_owned(&s->lock));
488           KASSERT(mutex_owned(&pg->interlock));
489           KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) !=
490               (PQ_INTENT_D | PQ_INTENT_SET));
491 
492           uvmpdpol_pagedequeue_locked(pg);
493           TAILQ_INSERT_TAIL(&pdpol_state.s_activeq, pg, pdqueue);
494           pdpol_state.s_active++;
495           pg->pqflags &= ~(PQ_INACTIVE | PQ_INTENT_SET);
496           pg->pqflags |= PQ_ACTIVE;
497 }
498 
499 void
uvmpdpol_pageactivate(struct vm_page * pg)500 uvmpdpol_pageactivate(struct vm_page *pg)
501 {
502 
503           KASSERT(uvm_page_owner_locked_p(pg, false));
504           KASSERT(mutex_owned(&pg->interlock));
505 
506           uvmpdpol_set_intent(pg, PQ_INTENT_A);
507 }
508 
509 static void
uvmpdpol_pagedequeue_locked(struct vm_page * pg)510 uvmpdpol_pagedequeue_locked(struct vm_page *pg)
511 {
512           struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
513 
514           KASSERT(mutex_owned(&s->lock));
515           KASSERT(mutex_owned(&pg->interlock));
516 
517           if (pg->pqflags & PQ_ACTIVE) {
518                     TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue);
519                     KASSERT((pg->pqflags & PQ_INACTIVE) == 0);
520                     KASSERT(pdpol_state.s_active > 0);
521                     pdpol_state.s_active--;
522           } else if (pg->pqflags & PQ_INACTIVE) {
523                     TAILQ_REMOVE(&pdpol_state.s_inactiveq, pg, pdqueue);
524                     KASSERT(pdpol_state.s_inactive > 0);
525                     pdpol_state.s_inactive--;
526           }
527           pg->pqflags &= ~(PQ_ACTIVE | PQ_INACTIVE | PQ_INTENT_SET);
528 }
529 
530 void
uvmpdpol_pagedequeue(struct vm_page * pg)531 uvmpdpol_pagedequeue(struct vm_page *pg)
532 {
533 
534           KASSERT(uvm_page_owner_locked_p(pg, true));
535           KASSERT(mutex_owned(&pg->interlock));
536 
537           uvmpdpol_set_intent(pg, PQ_INTENT_D);
538 }
539 
540 void
uvmpdpol_pageenqueue(struct vm_page * pg)541 uvmpdpol_pageenqueue(struct vm_page *pg)
542 {
543 
544           KASSERT(uvm_page_owner_locked_p(pg, false));
545           KASSERT(mutex_owned(&pg->interlock));
546 
547           uvmpdpol_set_intent(pg, PQ_INTENT_E);
548 }
549 
/*
 * uvmpdpol_anfree: hook taking an anon.  does nothing in this policy.
 */
void
uvmpdpol_anfree(struct vm_anon *an)
{
	/* nothing */
}
554 
555 bool
uvmpdpol_pageisqueued_p(struct vm_page * pg)556 uvmpdpol_pageisqueued_p(struct vm_page *pg)
557 {
558           uint32_t pqflags;
559 
560           /*
561            * if there's an intent set, we have to consider it.  otherwise,
562            * return the actual state.  we may be called unlocked for the
563            * purpose of assertions, which is safe due to the page lifecycle.
564            */
565           pqflags = atomic_load_relaxed(&pg->pqflags);
566           if ((pqflags & PQ_INTENT_SET) != 0) {
567                     return (pqflags & PQ_INTENT_MASK) != PQ_INTENT_D;
568           } else {
569                     return (pqflags & (PQ_ACTIVE | PQ_INACTIVE)) != 0;
570           }
571 }
572 
573 bool
uvmpdpol_pageactivate_p(struct vm_page * pg)574 uvmpdpol_pageactivate_p(struct vm_page *pg)
575 {
576           uint32_t pqflags;
577 
578           /* consider intent in preference to actual state. */
579           pqflags = atomic_load_relaxed(&pg->pqflags);
580           if ((pqflags & PQ_INTENT_SET) != 0) {
581                     pqflags &= PQ_INTENT_MASK;
582                     return pqflags != PQ_INTENT_A && pqflags != PQ_INTENT_E;
583           } else {
584                     /*
585                      * TODO: Enabling this may be too much of a big hammer,
586                      * since we do get useful information from activations.
587                      * Think about it more and maybe come up with a heuristic
588                      * or something.
589                      *
590                      * return (pqflags & PQ_ACTIVE) == 0;
591                      */
592                     return true;
593           }
594 }
595 
596 void
uvmpdpol_estimatepageable(int * active,int * inactive)597 uvmpdpol_estimatepageable(int *active, int *inactive)
598 {
599           struct uvmpdpol_globalstate *s = &pdpol_state;
600 
601           /*
602            * Don't take any locks here.  This can be called from DDB, and in
603            * any case the numbers are stale the instant the lock is dropped,
604            * so it just doesn't matter.
605            */
606           if (active) {
607                     *active = s->s_active;
608           }
609           if (inactive) {
610                     *inactive = s->s_inactive;
611           }
612 }
613 
614 #if !defined(PDSIM)
615 static int
min_check(struct uvm_pctparam * pct,int t)616 min_check(struct uvm_pctparam *pct, int t)
617 {
618           struct uvmpdpol_globalstate *s = &pdpol_state;
619           int total = t;
620 
621           if (pct != &s->s_anonmin) {
622                     total += uvm_pctparam_get(&s->s_anonmin);
623           }
624           if (pct != &s->s_filemin) {
625                     total += uvm_pctparam_get(&s->s_filemin);
626           }
627           if (pct != &s->s_execmin) {
628                     total += uvm_pctparam_get(&s->s_execmin);
629           }
630           if (total > 95) {
631                     return EINVAL;
632           }
633           return 0;
634 }
635 #endif /* !defined(PDSIM) */
636 
637 void
uvmpdpol_init(void)638 uvmpdpol_init(void)
639 {
640           struct uvmpdpol_globalstate *s = &pdpol_state;
641 
642           mutex_init(&s->lock, MUTEX_DEFAULT, IPL_NONE);
643           TAILQ_INIT(&s->s_activeq);
644           TAILQ_INIT(&s->s_inactiveq);
645           uvm_pctparam_init(&s->s_inactivepct, CLOCK_INACTIVEPCT, NULL);
646           uvm_pctparam_init(&s->s_anonmin, 10, min_check);
647           uvm_pctparam_init(&s->s_filemin, 10, min_check);
648           uvm_pctparam_init(&s->s_execmin,  5, min_check);
649           uvm_pctparam_init(&s->s_anonmax, 80, NULL);
650           uvm_pctparam_init(&s->s_filemax, 50, NULL);
651           uvm_pctparam_init(&s->s_execmax, 30, NULL);
652 }
653 
654 void
uvmpdpol_init_cpu(struct uvm_cpu * ucpu)655 uvmpdpol_init_cpu(struct uvm_cpu *ucpu)
656 {
657 
658           ucpu->pdq =
659               kmem_alloc(CLOCK_PDQ_SIZE * sizeof(struct vm_page *), KM_SLEEP);
660           ucpu->pdqhead = CLOCK_PDQ_SIZE;
661           ucpu->pdqtail = CLOCK_PDQ_SIZE;
662 }
663 
/*
 * uvmpdpol_reinit: does nothing in this policy.
 */
void
uvmpdpol_reinit(void)
{
	/* nothing */
}
668 
669 bool
uvmpdpol_needsscan_p(void)670 uvmpdpol_needsscan_p(void)
671 {
672 
673           /*
674            * this must be an unlocked check: can be called from interrupt.
675            */
676           return pdpol_state.s_inactive < pdpol_state.s_inactarg;
677 }
678 
679 void
uvmpdpol_tune(void)680 uvmpdpol_tune(void)
681 {
682           struct uvmpdpol_globalstate *s = &pdpol_state;
683 
684           mutex_enter(&s->lock);
685           clock_tune();
686           mutex_exit(&s->lock);
687 }
688 
689 /*
690  * uvmpdpol_pagerealize_locked: take the intended state set on a page and
691  * make it real.  return true if any work was done.
692  */
693 static bool
uvmpdpol_pagerealize_locked(struct vm_page * pg)694 uvmpdpol_pagerealize_locked(struct vm_page *pg)
695 {
696           struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
697 
698           KASSERT(mutex_owned(&s->lock));
699           KASSERT(mutex_owned(&pg->interlock));
700 
701           switch (pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) {
702           case PQ_INTENT_A | PQ_INTENT_SET:
703           case PQ_INTENT_E | PQ_INTENT_SET:
704                     uvmpdpol_pageactivate_locked(pg);
705                     return true;
706           case PQ_INTENT_I | PQ_INTENT_SET:
707                     uvmpdpol_pagedeactivate_locked(pg);
708                     return true;
709           case PQ_INTENT_D | PQ_INTENT_SET:
710                     uvmpdpol_pagedequeue_locked(pg);
711                     return true;
712           default:
713                     return false;
714           }
715 }
716 
717 /*
718  * uvmpdpol_flush: return the current uvm_cpu with all of its pending
719  * updates flushed to the global queues.  this routine may block, and
720  * so can switch cpu.  the idea is to empty to queue on whatever cpu
721  * we finally end up on.
722  */
723 static struct uvm_cpu *
uvmpdpol_flush(void)724 uvmpdpol_flush(void)
725 {
726           struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
727           struct uvm_cpu *ucpu;
728           struct vm_page *pg;
729 
730           KASSERT(kpreempt_disabled());
731 
732           mutex_enter(&s->lock);
733           for (;;) {
734                     /*
735                      * prefer scanning forwards (even though mutex_enter() is
736                      * serializing) so as to not defeat any prefetch logic in
737                      * the CPU.  that means elsewhere enqueuing backwards, like
738                      * a stack, but not so important there as pages are being
739                      * added singularly.
740                      *
741                      * prefetch the next "struct vm_page" while working on the
742                      * current one.  this has a measurable and very positive
743                      * effect in reducing the amount of time spent here under
744                      * the global lock.
745                      */
746                     ucpu = curcpu()->ci_data.cpu_uvm;
747                     KASSERT(ucpu->pdqhead <= ucpu->pdqtail);
748                     if (__predict_false(ucpu->pdqhead == ucpu->pdqtail)) {
749                               break;
750                     }
751                     pg = ucpu->pdq[ucpu->pdqhead++];
752                     if (__predict_true(ucpu->pdqhead != ucpu->pdqtail)) {
753                               __builtin_prefetch(ucpu->pdq[ucpu->pdqhead]);
754                     }
755                     mutex_enter(&pg->interlock);
756                     pg->pqflags &= ~PQ_INTENT_QUEUED;
757                     (void)uvmpdpol_pagerealize_locked(pg);
758                     mutex_exit(&pg->interlock);
759           }
760           mutex_exit(&s->lock);
761           return ucpu;
762 }
763 
764 /*
765  * uvmpdpol_pagerealize: realize any intent set on the page.  in this
766  * implementation, that means putting the page on a per-CPU queue to be
767  * dealt with later.
768  */
769 void
uvmpdpol_pagerealize(struct vm_page * pg)770 uvmpdpol_pagerealize(struct vm_page *pg)
771 {
772           struct uvm_cpu *ucpu;
773 
774           /*
775            * drain the per per-CPU queue if full, then enter the page.
776            */
777           kpreempt_disable();
778           ucpu = curcpu()->ci_data.cpu_uvm;
779           if (__predict_false(ucpu->pdqhead == 0)) {
780                     ucpu = uvmpdpol_flush();
781           }
782           ucpu->pdq[--(ucpu->pdqhead)] = pg;
783           kpreempt_enable();
784 }
785 
786 /*
787  * uvmpdpol_idle: called from the system idle loop.  periodically purge any
788  * pending updates back to the global queues.
789  */
790 void
uvmpdpol_idle(struct uvm_cpu * ucpu)791 uvmpdpol_idle(struct uvm_cpu *ucpu)
792 {
793           struct uvmpdpol_globalstate *s = &pdpol_state;
794           struct vm_page *pg;
795 
796           KASSERT(kpreempt_disabled());
797 
798           /*
799            * if no pages in the queue, we have nothing to do.
800            */
801           if (ucpu->pdqhead == ucpu->pdqtail) {
802                     ucpu->pdqtime = getticks();
803                     return;
804           }
805 
806           /*
807            * don't do this more than ~8 times a second as it would needlessly
808            * exert pressure.
809            */
810           if (getticks() - ucpu->pdqtime < (hz >> 3)) {
811                     return;
812           }
813 
814           /*
815            * the idle LWP can't block, so we have to try for the lock.  if we
816            * get it, purge the per-CPU pending update queue.  continually
817            * check for a pending resched: in that case exit immediately.
818            */
819           if (mutex_tryenter(&s->lock)) {
820                     while (ucpu->pdqhead != ucpu->pdqtail) {
821                               pg = ucpu->pdq[ucpu->pdqhead];
822                               if (!mutex_tryenter(&pg->interlock)) {
823                                         break;
824                               }
825                               ucpu->pdqhead++;
826                               pg->pqflags &= ~PQ_INTENT_QUEUED;
827                               (void)uvmpdpol_pagerealize_locked(pg);
828                               mutex_exit(&pg->interlock);
829                               if (curcpu()->ci_want_resched) {
830                                         break;
831                               }
832                     }
833                     if (ucpu->pdqhead == ucpu->pdqtail) {
834                               ucpu->pdqtime = getticks();
835                     }
836                     mutex_exit(&s->lock);
837           }
838 }
839 
840 #if !defined(PDSIM)
841 
842 #include <sys/sysctl.h>       /* XXX SYSCTL_DESCR */
843 
844 void
uvmpdpol_sysctlsetup(void)845 uvmpdpol_sysctlsetup(void)
846 {
847           struct uvmpdpol_globalstate *s = &pdpol_state;
848 
849           uvm_pctparam_createsysctlnode(&s->s_anonmin, "anonmin",
850               SYSCTL_DESCR("Percentage of physical memory reserved "
851               "for anonymous application data"));
852           uvm_pctparam_createsysctlnode(&s->s_filemin, "filemin",
853               SYSCTL_DESCR("Percentage of physical memory reserved "
854               "for cached file data"));
855           uvm_pctparam_createsysctlnode(&s->s_execmin, "execmin",
856               SYSCTL_DESCR("Percentage of physical memory reserved "
857               "for cached executable data"));
858 
859           uvm_pctparam_createsysctlnode(&s->s_anonmax, "anonmax",
860               SYSCTL_DESCR("Percentage of physical memory which will "
861               "be reclaimed from other usage for "
862               "anonymous application data"));
863           uvm_pctparam_createsysctlnode(&s->s_filemax, "filemax",
864               SYSCTL_DESCR("Percentage of physical memory which will "
865               "be reclaimed from other usage for cached "
866               "file data"));
867           uvm_pctparam_createsysctlnode(&s->s_execmax, "execmax",
868               SYSCTL_DESCR("Percentage of physical memory which will "
869               "be reclaimed from other usage for cached "
870               "executable data"));
871 
872           uvm_pctparam_createsysctlnode(&s->s_inactivepct, "inactivepct",
873               SYSCTL_DESCR("Percentage of inactive queue of "
874               "the entire (active + inactive) queue"));
875 }
876 
877 #endif /* !defined(PDSIM) */
878 
879 #if defined(PDSIM)
/*
 * pdsim_dump: dump hook for PDSIM builds.  currently a stub: nothing is
 * emitted and "id" goes unused.
 */
void
pdsim_dump(const char *id)
{

	(void)id;
#ifdef DEBUG
	/* XXX nothing is dumped yet. */
#endif /* DEBUG */
}
887 #endif /* defined(PDSIM) */
888