/* kern_clock.c - OpenGrok cross reference for /netbsd/src/sys/kern/kern_clock.c */

/*        $NetBSD: kern_clock.c,v 1.151 2023/09/02 17:44:59 riastradh Exp $     */

/*-
 * Copyright (c) 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 * This code is derived from software contributed to The NetBSD Foundation
 * by Charles M. Hannum.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1991, 1993
 *        The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *        @(#)kern_clock.c    8.5 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_clock.c,v 1.151 2023/09/02 17:44:59 riastradh Exp $");

#ifdef _KERNEL_OPT
#include "opt_dtrace.h"
#include "opt_gprof.h"
#include "opt_multiprocessor.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/timex.h>
#include <sys/sched.h>
#include <sys/time.h>
#include <sys/timetc.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/rndsource.h>
#include <sys/heartbeat.h>

#ifdef GPROF
#include <sys/gmon.h>
#endif

#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
#include <sys/cpu.h>

/*
 * Per-CPU hook table, indexed by cpu_index().  statclock() calls the
 * entry for the current CPU, when it is non-NULL, after it has charged
 * a statistics tick.
 */
cyclic_clock_func_t cyclic_clock_func[MAXCPUS];
#endif

static int sysctl_kern_clockrate(SYSCTLFN_PROTO);

/*
 * Clock handling routines.
 *
 * This code is written to operate with two timers that run independently of
 * each other.  The main clock, running hz times per second, is used to keep
 * track of real time.  The second timer handles kernel and user profiling,
 * and does resource use estimation.  If the second timer is programmable,
 * it is randomized to avoid aliasing between the two clocks.  For example,
 * the randomization prevents an adversary from always giving up the CPU
 * just before its quantum expires.  Otherwise, it would never accumulate
 * CPU ticks.  The mean frequency of the second timer is stathz.
 *
 * If no second timer exists, stathz will be zero; in this case we drive
 * profiling and statistics off the main clock.  This WILL NOT be accurate;
 * do not do it unless absolutely necessary.
 *
 * The statistics clock may (or may not) be run at a higher rate while
 * profiling.  This profile clock runs at profhz.  We require that profhz
 * be an integral multiple of stathz.
 *
 * If the statistics clock is running fast, it must be divided by the ratio
 * profhz/stathz for statistics.  (For profiling, every tick counts.)
 */

int       stathz;                       /* mean statistics clock rate; 0 if there is no second timer */
int       profhz;                       /* profile clock rate; initclocks() defaults it to stathz (or hz) */
int       profsrc;                      /* profiling source; statclock() samples only for PROFSRC_CLOCK */
int       schedhz;                      /* if 0, hardclock() calls schedclock() itself */
int       profprocs;                    /* number of processes with PST_PROFIL set */
static int hardclock_ticks;             /* ticks counted by the primary CPU; read via getticks() */
static int hardscheddiv; /* hard => sched divider (used if schedhz == 0) */
static int psdiv;                       /* prof => stat divider */
int       psratio;                      /* ratio: prof / stat */

/*
 * State for using a clock interrupt as an entropy source.
 */
struct clockrnd {
          /* Source handle registered with rnd_attach_source(). */
          struct krndsource source;
          /*
           * Number of samples still wanted.  Set by clockrnd_get(),
           * counted down by clockrnd_sample(); zero means sampling
           * is off.
           */
          unsigned needed;
};

/*
 * One sampling state per clock: hardclock() samples hardclockrnd and
 * statclock() samples statclockrnd (the latter only when stathz != 0).
 */
static struct clockrnd hardclockrnd __aligned(COHERENCY_UNIT);
static struct clockrnd statclockrnd __aligned(COHERENCY_UNIT);

/*
 * Entropy request callback installed with rndsource_setcb().
 *
 * needed is the amount requested by the caller; cookie is the
 * struct clockrnd that was registered alongside the callback.
 * Arms sampling by recording how many clock samples to collect.
 */
static void
clockrnd_get(size_t needed, void *cookie)
{
          struct clockrnd *const rnd = cookie;
          const size_t samples = 2*NBBY*needed;

          /* A nonzero count is what makes clockrnd_sample() start working. */
          atomic_store_relaxed(&rnd->needed, samples);
}

static void
clockrnd_sample(struct clockrnd *C)
{
          struct cpu_info *ci = curcpu();

          /* If there's nothing needed right now, stop here.  */
          if (__predict_true(atomic_load_relaxed(&C->needed) == 0))
                    return;

          /*
           * If we're not the primary core of a package, we're probably
           * driven by the same clock as the primary core, so don't
           * bother.
           */
          if (ci != ci->ci_package1st)
                    return;

          /* Take a sample and enter it into the pool.  */
          rnd_add_uint32(&C->source, 0);

          /*
           * On the primary CPU, count down.  Using an atomic decrement
           * here isn't really necessary -- on every platform we care
           * about, stores to unsigned int are atomic, and the only other
           * memory operation that could happen here is for another CPU
           * to store a higher value for needed.  But using an atomic
           * decrement avoids giving the impression of data races, and is
           * unlikely to hurt because only one CPU will ever be writing
           * to the location.
           */
          if (CPU_IS_PRIMARY(curcpu())) {
                    unsigned needed __diagused;

                    needed = atomic_dec_uint_nv(&C->needed);
                    KASSERT(needed != UINT_MAX);
          }
}

static u_int get_intr_timecount(struct timecounter *);

/*
 * Fallback timecounter backed by the hardclock tick count.
 * initclocks() fills in tc_frequency with hz and registers it
 * with tc_init().
 */
static struct timecounter intr_timecounter = {
          .tc_get_timecount   = get_intr_timecount,
          .tc_poll_pps                  = NULL,
          .tc_counter_mask    = ~0u,
          /* placeholder; set to hz by initclocks() */
          .tc_frequency                 = 0,
          .tc_name            = "clockinterrupt",
          /* quality - minimum implementation level for a clock */
          .tc_quality                   = 0,
          .tc_priv            = NULL,
};

static u_int
get_intr_timecount(struct timecounter *tc)
{

          return (u_int)getticks();
}

/*
 * Return the current value of the hardclock tick counter, read with a
 * relaxed atomic load.
 */
int
getticks(void)
{
          const int snapshot = atomic_load_relaxed(&hardclock_ticks);

          return snapshot;
}

/*
 * Initialize clock frequencies and start both clocks running.
 */
void
initclocks(void)
{
          static struct sysctllog *clog;
          int i;

          /*
           * Set divisors to 1 (normal case) and let the machine-specific
           * code do its bit.
           */
          psdiv = 1;

          /*
           * Call cpu_initclocks() before registering the default
           * timecounter, in case it needs to adjust hz.
           */
          const int old_hz = hz;
          cpu_initclocks();
          if (old_hz != hz) {
                    tick = 1000000 / hz;
                    tickadj = (240000 / (60 * hz)) ? (240000 / (60 * hz)) : 1;
          }

          /*
           * provide minimum default time counter
           * will only run at interrupt resolution
           */
          intr_timecounter.tc_frequency = hz;
          tc_init(&intr_timecounter);

          /*
           * Compute profhz and stathz, fix profhz if needed.
           */
          i = stathz ? stathz : hz;
          if (profhz == 0)
                    profhz = i;
          psratio = profhz / i;
          if (schedhz == 0) {
                    /* 16Hz is best */
                    hardscheddiv = hz / 16;
                    if (hardscheddiv <= 0)
                              panic("hardscheddiv");
          }

          sysctl_createv(&clog, 0, NULL, NULL,
                           CTLFLAG_PERMANENT,
                           CTLTYPE_STRUCT, "clockrate",
                           SYSCTL_DESCR("Kernel clock rates"),
                           sysctl_kern_clockrate, 0, NULL,
                           sizeof(struct clockinfo),
                           CTL_KERN, KERN_CLOCKRATE, CTL_EOL);
          sysctl_createv(&clog, 0, NULL, NULL,
                           CTLFLAG_PERMANENT,
                           CTLTYPE_INT, "hardclock_ticks",
                           SYSCTL_DESCR("Number of hardclock ticks"),
                           NULL, 0, &hardclock_ticks, sizeof(hardclock_ticks),
                           CTL_KERN, KERN_HARDCLOCK_TICKS, CTL_EOL);

          rndsource_setcb(&hardclockrnd.source, clockrnd_get, &hardclockrnd);
          rnd_attach_source(&hardclockrnd.source, "hardclock", RND_TYPE_SKEW,
              RND_FLAG_COLLECT_TIME|RND_FLAG_ESTIMATE_TIME|RND_FLAG_HASCB);
          if (stathz) {
                    rndsource_setcb(&statclockrnd.source, clockrnd_get,
                        &statclockrnd);
                    rnd_attach_source(&statclockrnd.source, "statclock",
                        RND_TYPE_SKEW,
                        (RND_FLAG_COLLECT_TIME|RND_FLAG_ESTIMATE_TIME|
                              RND_FLAG_HASCB));
          }
}

/*
 * The real-time timer, interrupting hz times per second.
 */
void
hardclock(struct clockframe *frame)
{
          struct lwp *l;
          struct cpu_info *ci;

          clockrnd_sample(&hardclockrnd);

          ci = curcpu();
          l = ci->ci_onproc;

          ptimer_tick(l, CLKF_USERMODE(frame));

          /*
           * If no separate statistics clock is available, run it from here.
           */
          if (stathz == 0)
                    statclock(frame);
          /*
           * If no separate schedclock is provided, call it here
           * at about 16 Hz.
           */
          if (schedhz == 0) {
                    if ((int)(--ci->ci_schedstate.spc_schedticks) <= 0) {
                              schedclock(l);
                              ci->ci_schedstate.spc_schedticks = hardscheddiv;
                    }
          }
          if ((--ci->ci_schedstate.spc_ticks) <= 0)
                    sched_tick(ci);

          if (CPU_IS_PRIMARY(ci)) {
                    atomic_store_relaxed(&hardclock_ticks,
                        atomic_load_relaxed(&hardclock_ticks) + 1);
                    tc_ticktock();
          }

          /*
           * Make sure the CPUs and timecounter are making progress.
           */
          heartbeat();

          /*
           * Update real-time timeout queue.
           */
          callout_hardclock();
}

/*
 * Start profiling on a process.
 *
 * The caller must hold p->p_stmutex.  Does nothing if p is already
 * marked PST_PROFIL.
 *
 * Kernel profiling passes proc0 which never exits and hence
 * keeps the profile clock running constantly.
 */
void
startprofclock(struct proc *p)
{

          KASSERT(mutex_owned(&p->p_stmutex));

          if ((p->p_stflag & PST_PROFIL) != 0)
                    return;

          p->p_stflag |= PST_PROFIL;
          profprocs++;

          /*
           * The first profiled process switches the divider to the
           * prof/stat ratio.  This only matters when the clock is the
           * profiling source.
           */
          if (profprocs == 1 && stathz != 0)
                    psdiv = psratio;
}

/*
 * Stop profiling on a process.
 *
 * The caller must hold p->p_stmutex.  Does nothing if p is not
 * marked PST_PROFIL.
 */
void
stopprofclock(struct proc *p)
{

          KASSERT(mutex_owned(&p->p_stmutex));

          if ((p->p_stflag & PST_PROFIL) == 0)
                    return;

          p->p_stflag &= ~PST_PROFIL;
          profprocs--;

          /*
           * When the last profiled process stops, put the divider back
           * to 1.  This only matters when the clock is the profiling
           * source.
           */
          if (profprocs == 0 && stathz != 0)
                    psdiv = 1;
}

void
schedclock(struct lwp *l)
{
          if ((l->l_flag & LW_IDLE) != 0)
                    return;

          sched_schedclock(l);
}

/*
 * Statistics clock.  Grab profile sample, and if divider reaches 0,
 * do process and kernel statistics.
 *
 * frame describes the interrupted context; it is only inspected through
 * the CLKF_USERMODE(), CLKF_PC() and CLKF_INTR() macros.
 *
 * hardclock() calls this directly when stathz == 0.  The per-CPU
 * countdown spc_pscnt is decremented on every call; while it is still
 * positive the function returns right after the profiling sample,
 * without charging any time.  Once it runs out, the tick is charged to
 * the current process (if any) and to one of this CPU's spc_cp_time[]
 * buckets, and the countdown is reloaded from psdiv.
 *
 * For a non-idle lwp, p_stmutex of its process is held from the lookup
 * below until the matching mutex_spin_exit() on each return path.
 */
void
statclock(struct clockframe *frame)
{
#ifdef GPROF
          struct gmonparam *g;
          intptr_t i;
#endif
          struct cpu_info *ci = curcpu();
          struct schedstate_percpu *spc = &ci->ci_schedstate;
          struct proc *p;
          struct lwp *l;

          /*
           * statclockrnd is only attached by initclocks() when a separate
           * statistics clock exists, so only sample it in that case.
           */
          if (stathz)
                    clockrnd_sample(&statclockrnd);

          /*
           * Notice changes in divisor frequency, and adjust clock
           * frequency accordingly.
           */
          if (spc->spc_psdiv != psdiv) {
                    /* Remember the new divisor and restart the countdown. */
                    spc->spc_psdiv = psdiv;
                    spc->spc_pscnt = psdiv;
                    if (psdiv == 1) {
                              setstatclockrate(stathz);
                    } else {
                              setstatclockrate(profhz);
                    }
          }
          l = ci->ci_onproc;
          if ((l->l_flag & LW_IDLE) != 0) {
                    /*
                     * don't account idle lwps as swapper.
                     */
                    p = NULL;
          } else {
                    p = l->l_proc;
                    /* Released on every return path below. */
                    mutex_spin_enter(&p->p_stmutex);
          }

          if (CLKF_USERMODE(frame)) {
                    /* The idle lwp is not expected to be in user mode. */
                    KASSERT(p != NULL);
                    /* Profiling sample: taken on every tick, before the divider. */
                    if ((p->p_stflag & PST_PROFIL) && profsrc == PROFSRC_CLOCK)
                              addupc_intr(l, CLKF_PC(frame));
                    /* Not yet a statistics tick: drop the lock and stop. */
                    if (--spc->spc_pscnt > 0) {
                              mutex_spin_exit(&p->p_stmutex);
                              return;
                    }

                    /*
                     * Came from user mode; CPU was in user state.
                     * If this process is being profiled record the tick.
                     */
                    p->p_uticks++;
                    /* Processes with p_nice above NZERO count as CP_NICE. */
                    if (p->p_nice > NZERO)
                              spc->spc_cp_time[CP_NICE]++;
                    else
                              spc->spc_cp_time[CP_USER]++;
          } else {
#ifdef GPROF
                    /*
                     * Kernel statistics are just like addupc_intr, only easier.
                     */
#if defined(MULTIPROCESSOR) && !defined(_RUMPKERNEL)
                    /* Per-CPU profiling state; may be absent (NULL). */
                    g = curcpu()->ci_gmon;
                    if (g != NULL &&
                        profsrc == PROFSRC_CLOCK && g->state == GMON_PROF_ON) {
#else
                    g = &_gmonparam;
                    if (profsrc == PROFSRC_CLOCK && g->state == GMON_PROF_ON) {
#endif
                              /*
                               * Offset of the interrupted PC from lowpc; if it
                               * is below textsize, bump the matching kcount[]
                               * histogram bucket.
                               */
                              i = CLKF_PC(frame) - g->lowpc;
                              if (i < g->textsize) {
                                        i /= HISTFRACTION * sizeof(*g->kcount);
                                        g->kcount[i]++;
                              }
                    }
#endif
#ifdef LWP_PC
                    /*
                     * Interrupted in the kernel while a profiled process was
                     * running: sample at LWP_PC(l) instead of the frame PC.
                     * NOTE(review): LWP_PC() is presumably the lwp's saved
                     * user-mode PC -- confirm against the port's definition.
                     */
                    if (p != NULL && profsrc == PROFSRC_CLOCK &&
                        (p->p_stflag & PST_PROFIL)) {
                              addupc_intr(l, LWP_PC(l));
                    }
#endif
                    /* Not yet a statistics tick: drop the lock (if held) and stop. */
                    if (--spc->spc_pscnt > 0) {
                              if (p != NULL)
                                        mutex_spin_exit(&p->p_stmutex);
                              return;
                    }
                    /*
                     * Came from kernel mode, so we were:
                     * - handling an interrupt,
                     * - doing syscall or trap work on behalf of the current
                     *   user process, or
                     * - spinning in the idle loop.
                     * Whichever it is, charge the time as appropriate.
                     * Note that we charge interrupts to the current process,
                     * regardless of whether they are ``for'' that process,
                     * so that we know how much of its real time was spent
                     * in ``non-process'' (i.e., interrupt) work.
                     */
                    if (CLKF_INTR(frame) || (curlwp->l_pflag & LP_INTR) != 0) {
                              if (p != NULL) {
                                        p->p_iticks++;
                              }
                              spc->spc_cp_time[CP_INTR]++;
                    } else if (p != NULL) {
                              p->p_sticks++;
                              spc->spc_cp_time[CP_SYS]++;
                    } else {
                              spc->spc_cp_time[CP_IDLE]++;
                    }
          }
          /* A statistics tick was charged: reload the divider countdown. */
          spc->spc_pscnt = psdiv;

          if (p != NULL) {
                    /* Count the tick against the lwp, then release the lock. */
                    atomic_inc_uint(&l->l_cpticks);
                    mutex_spin_exit(&p->p_stmutex);
          }

#ifdef KDTRACE_HOOKS
          /*
           * Call this CPU's cyclic hook, if one is registered.  The early
           * returns above skip it, so it runs only on ticks that were
           * charged.
           */
          cyclic_clock_func_t func = cyclic_clock_func[cpu_index(ci)];
          if (func) {
                    (*func)((struct clockframe *)frame);
          }
#endif
}

/*
 * sysctl helper routine for kern.clockrate. Assembles a struct on
 * the fly to be returned to the caller.
 */
static int
sysctl_kern_clockrate(SYSCTLFN_ARGS)
{
          struct clockinfo clkinfo;
          struct sysctlnode node;

          clkinfo.tick = tick;
          clkinfo.tickadj = tickadj;
          clkinfo.hz = hz;
          clkinfo.profhz = profhz;
          clkinfo.stathz = stathz ? stathz : hz;

          node = *rnode;
          node.sysctl_data = &clkinfo;
          return (sysctl_lookup(SYSCTLFN_CALL(&node)));
}