xref: /NextBSD/sys/dev/hwpmc/hwpmc_piv.c (revision 287e3b14e9552995def1802ec9c5034f4adf28ec)
1 /*-
2  * Copyright (c) 2003-2007 Joseph Koshy
3  * Copyright (c) 2007 The FreeBSD Foundation
4  * All rights reserved.
5  *
6  * Portions of this software were developed by A. Joseph Koshy under
7  * sponsorship from the FreeBSD Foundation and Google, Inc.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include <sys/param.h>
35 #include <sys/bus.h>
36 #include <sys/lock.h>
37 #include <sys/mutex.h>
38 #include <sys/pmc.h>
39 #include <sys/pmckern.h>
40 #include <sys/smp.h>
41 #include <sys/systm.h>
42 #include <machine/intr_machdep.h>
43 #if (__FreeBSD_version >= 1100000)
44 #include <x86/apicvar.h>
45 #else
46 #include <machine/apicvar.h>
47 #endif
48 #include <machine/cpu.h>
49 #include <machine/cpufunc.h>
50 #include <machine/cputypes.h>
51 #include <machine/md_var.h>
52 #include <machine/specialreg.h>
53 
54 /*
55  * PENTIUM 4 SUPPORT
56  *
57  * The P4 has 18 PMCs, divided into 4 groups with 4,4,4 and 6 PMCs
58  * respectively.  Each PMC consists of two model-specific registers:
59  * a counter configuration control register (CCCR) and a counter
60  * register that holds the actual event counts.
61  *
62  * Configuring an event requires the use of one of 45 event selection
63  * control registers (ESCR).  Events are associated with specific
64  * ESCRs.  Each PMC group has a set of ESCRs it can use.
65  *
66  * - The BPU counter group (4 PMCs) can use the 16 ESCRs:
67  *   BPU_ESCR{0,1}, IS_ESCR{0,1}, MOB_ESCR{0,1}, ITLB_ESCR{0,1},
68  *   PMH_ESCR{0,1}, IX_ESCR{0,1}, FSB_ESCR{0,1}, BSU_ESCR{0,1}.
69  *
70  * - The MS counter group (4 PMCs) can use the 6 ESCRs: MS_ESCR{0,1},
71  *   TC_ESCR{0,1}, TBPU_ESCR{0,1}.
72  *
73  * - The FLAME counter group (4 PMCs) can use the 10 ESCRs:
74  *   FLAME_ESCR{0,1}, FIRM_ESCR{0,1}, SAAT_ESCR{0,1}, U2L_ESCR{0,1},
75  *   DAC_ESCR{0,1}.
76  *
77  * - The IQ counter group (6 PMCs) can use the 13 ESCRs: IQ_ESCR{0,1},
78  *   ALF_ESCR{0,1}, RAT_ESCR{0,1}, SSU_ESCR0, CRU_ESCR{0,1,2,3,4,5}.
79  *
80  * Even-numbered ESCRs can be used with counters 0, 1 and 4 (if
81  * present) of a counter group.  Odd-numbered ESCRs can be used with
82  * counters 2, 3 and 5 (if present) of a counter group.  The
83  * 'p4_escrs[]' table describes these restrictions in a form that
84  * function 'p4_allocate()' uses for making allocation decisions.
85  *
86  * SYSTEM-MODE AND THREAD-MODE ALLOCATION
87  *
88  * In addition to remembering the state of PMC rows
89  * ('FREE','STANDALONE', or 'THREAD'), we similarly need to track the
90  * state of ESCR rows.  If an ESCR is allocated to a system-mode PMC
91  * on a CPU we cannot allocate this to a thread-mode PMC.  On a
92  * multi-cpu (multiple physical CPUs) system, ESCR allocation on each
93  * CPU is tracked by the pc_escrs[] array.
94  *
95  * Each system-mode PMC that is using an ESCR records its row-index in
96  * the appropriate entry and system-mode allocation attempts check
97  * that an ESCR is available using this array.  Process-mode PMCs do
98  * not use the pc_escrs[] array, since ESCR row itself would have been
99  * marked as in 'THREAD' mode.
100  *
101  * HYPERTHREADING SUPPORT
102  *
103  * When HTT is enabled, the FreeBSD kernel treats the two 'logical'
104  * cpus as independent CPUs and can schedule kernel threads on them
105  * independently.  However, the two logical CPUs share the same set of
106  * PMC resources.  We need to ensure that:
107  * - PMCs that use the PMC_F_DESCENDANTS semantics are handled correctly,
108  *   and,
109  * - Threads of multi-threaded processes that get scheduled on the same
110  *   physical CPU are handled correctly.
111  *
112  * HTT Detection
113  *
114  * Not all HTT capable systems will have HTT enabled.  We detect the
115  * presence of HTT by detecting if 'p4_init()' was called for a secondary
116  * CPU in a HTT pair.
117  *
118  * Note that hwpmc(4) cannot currently deal with a change in HTT status once
119  * loaded.
120  *
121  * Handling HTT READ / WRITE / START / STOP
122  *
123  * PMC resources are shared across the CPUs in an HTT pair.  We
124  * designate the lower numbered CPU in a HTT pair as the 'primary'
125  * CPU.  In each primary CPU's state we keep track of a 'runcount'
126  * which reflects the number of PMC-using processes that have been
127  * scheduled on its secondary CPU.  Process-mode PMC operations will
128  * actually 'start' or 'stop' hardware only if these are the first or
129  * last processes respectively to use the hardware.  PMC values
130  * written by a 'write' operation are saved and are transferred to
131  * hardware at PMC 'start' time if the runcount is 0.  If the runcount
132  * is greater than 0 at the time of a 'start' operation, we keep track
133  * of the actual hardware value at the time of the 'start' operation
134  * and use this to adjust the final readings at PMC 'stop' or 'read'
135  * time.
136  *
137  * Execution sequences:
138  *
139  * Case 1:   CPUx   +...-		(no overlap)
140  *	     CPUy         +...-
141  *           RC   0 1   0 1   0
142  *
143  * Case 2:   CPUx   +........-		(partial overlap)
144  * 	     CPUy       +........-
145  *           RC   0 1   2    1   0
146  *
147  * Case 3:   CPUx   +..............-	(fully overlapped)
148  *	     CPUy       +.....-
149  *	     RC   0 1   2     1    0
150  *
151  *     Key:
152  *     'CPU[xy]' : one of the two logical processors on a HTT CPU.
153  *     'RC'      : run count (#threads per physical core).
154  *     '+'       : point in time when a thread is put on a CPU.
155  *     '-'       : point in time where a thread is taken off a CPU.
156  *
157  * Handling HTT CONFIG
158  *
159  * Different processes attached to the same PMC may get scheduled on
160  * the two logical processors in the package.  We keep track of config
161  * and de-config operations using the CFGFLAGS fields of the per-physical
162  * cpu state.
163  */
164 
/*
 * X-macro list of the P4 counters.  Each entry is wrapped in P4_PMC(),
 * so the list can be expanded differently by redefining P4_PMC before
 * use.  The order of entries fixes the enumerator values below; NONE
 * is the trailing sentinel.
 */
#define	P4_PMCS()							\
	P4_PMC(BPU_COUNTER0)	P4_PMC(BPU_COUNTER1)			\
	P4_PMC(BPU_COUNTER2)	P4_PMC(BPU_COUNTER3)			\
	P4_PMC(MS_COUNTER0)	P4_PMC(MS_COUNTER1)			\
	P4_PMC(MS_COUNTER2)	P4_PMC(MS_COUNTER3)			\
	P4_PMC(FLAME_COUNTER0)	P4_PMC(FLAME_COUNTER1)			\
	P4_PMC(FLAME_COUNTER2)	P4_PMC(FLAME_COUNTER3)			\
	P4_PMC(IQ_COUNTER0)	P4_PMC(IQ_COUNTER1)			\
	P4_PMC(IQ_COUNTER2)	P4_PMC(IQ_COUNTER3)			\
	P4_PMC(IQ_COUNTER4)	P4_PMC(IQ_COUNTER5)			\
	P4_PMC(NONE)

/* Expand every list entry into a 'P4_PMC_<name>' enumerator. */
#undef	P4_PMC
#define	P4_PMC(NAME)	P4_PMC_##NAME ,

enum pmc_p4pmc {
	P4_PMCS()
};
191 
192 /*
193  * P4 ESCR descriptors
194  */
195 
/*
 * X-macro list of the P4 ESCRs:
 *
 *	P4_ESCR(name, MSR address, counter, counter, counter)
 *
 * The three counter names are the PMCs the ESCR may be paired with
 * (NONE where fewer than three apply).  Even-numbered ESCRs pair with
 * counters 0, 1 (and 4 in the IQ group); odd-numbered ESCRs pair with
 * counters 2, 3 (and 5 in the IQ group).  IQ_ESCR1 and SSU_ESCR0 used
 * to list IQ_COUNTER1 and IQ_COUNTER2 respectively, contradicting that
 * rule and every other IQ-group row; they now follow it.
 */
#define	P4_ESCRS()							\
    P4_ESCR(BSU_ESCR0,	0x3A0, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
    P4_ESCR(BSU_ESCR1,	0x3A1, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
    P4_ESCR(FSB_ESCR0,	0x3A2, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
    P4_ESCR(FSB_ESCR1,	0x3A3, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
    P4_ESCR(FIRM_ESCR0,	0x3A4, FLAME_COUNTER0, FLAME_COUNTER1, NONE)	\
    P4_ESCR(FIRM_ESCR1,	0x3A5, FLAME_COUNTER2, FLAME_COUNTER3, NONE)	\
    P4_ESCR(FLAME_ESCR0, 0x3A6, FLAME_COUNTER0, FLAME_COUNTER1, NONE)	\
    P4_ESCR(FLAME_ESCR1, 0x3A7, FLAME_COUNTER2, FLAME_COUNTER3, NONE)	\
    P4_ESCR(DAC_ESCR0,	0x3A8, FLAME_COUNTER0, FLAME_COUNTER1, NONE)	\
    P4_ESCR(DAC_ESCR1,	0x3A9, FLAME_COUNTER2, FLAME_COUNTER3, NONE)	\
    P4_ESCR(MOB_ESCR0,	0x3AA, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
    P4_ESCR(MOB_ESCR1,	0x3AB, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
    P4_ESCR(PMH_ESCR0,	0x3AC, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
    P4_ESCR(PMH_ESCR1,	0x3AD, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
    P4_ESCR(SAAT_ESCR0,	0x3AE, FLAME_COUNTER0, FLAME_COUNTER1, NONE)	\
    P4_ESCR(SAAT_ESCR1,	0x3AF, FLAME_COUNTER2, FLAME_COUNTER3, NONE)	\
    P4_ESCR(U2L_ESCR0,	0x3B0, FLAME_COUNTER0, FLAME_COUNTER1, NONE)	\
    P4_ESCR(U2L_ESCR1,	0x3B1, FLAME_COUNTER2, FLAME_COUNTER3, NONE)	\
    P4_ESCR(BPU_ESCR0,	0x3B2, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
    P4_ESCR(BPU_ESCR1,	0x3B3, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
    P4_ESCR(IS_ESCR0,	0x3B4, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
    P4_ESCR(IS_ESCR1,	0x3B5, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
    P4_ESCR(ITLB_ESCR0,	0x3B6, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
    P4_ESCR(ITLB_ESCR1,	0x3B7, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
    P4_ESCR(CRU_ESCR0,	0x3B8, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4)	\
    P4_ESCR(CRU_ESCR1,	0x3B9, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5)	\
    P4_ESCR(IQ_ESCR0,	0x3BA, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4)	\
    P4_ESCR(IQ_ESCR1,	0x3BB, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5)	\
    P4_ESCR(RAT_ESCR0,	0x3BC, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4)	\
    P4_ESCR(RAT_ESCR1,	0x3BD, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5)	\
    P4_ESCR(SSU_ESCR0,	0x3BE, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4)	\
    P4_ESCR(MS_ESCR0,	0x3C0, MS_COUNTER0, MS_COUNTER1, NONE)		\
    P4_ESCR(MS_ESCR1,	0x3C1, MS_COUNTER2, MS_COUNTER3, NONE)		\
    P4_ESCR(TBPU_ESCR0,	0x3C2, MS_COUNTER0, MS_COUNTER1, NONE)		\
    P4_ESCR(TBPU_ESCR1,	0x3C3, MS_COUNTER2, MS_COUNTER3, NONE)		\
    P4_ESCR(TC_ESCR0,	0x3C4, MS_COUNTER0, MS_COUNTER1, NONE)		\
    P4_ESCR(TC_ESCR1,	0x3C5, MS_COUNTER2, MS_COUNTER3, NONE)		\
    P4_ESCR(IX_ESCR0,	0x3C8, BPU_COUNTER0, BPU_COUNTER1, NONE)	\
    P4_ESCR(IX_ESCR1,	0x3C9, BPU_COUNTER2, BPU_COUNTER3, NONE)	\
    P4_ESCR(ALF_ESCR0,	0x3CA, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4)	\
    P4_ESCR(ALF_ESCR1,	0x3CB, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5)	\
    P4_ESCR(CRU_ESCR2,	0x3CC, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4)	\
    P4_ESCR(CRU_ESCR3,	0x3CD, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5)	\
    P4_ESCR(CRU_ESCR4,	0x3E0, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4)	\
    P4_ESCR(CRU_ESCR5,	0x3E1, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5)	\
    P4_ESCR(NONE,		~0,    NONE, NONE, NONE)

/*
 * ESCR indices: one 'P4_ESCR_<name>' enumerator per list entry, in list
 * order, ending with the P4_ESCR_NONE sentinel.
 */
enum pmc_p4escr {
#define	P4_ESCR(N, MSR, P1, P2, P3)	P4_ESCR_##N ,
	P4_ESCRS()
#undef	P4_ESCR
};
249 
250 struct pmc_p4escr_descr {
251 	const char	pm_escrname[PMC_NAME_MAX];
252 	u_short		pm_escr_msr;
253 	const enum pmc_p4pmc pm_pmcs[P4_MAX_PMC_PER_ESCR];
254 };
255 
256 static struct pmc_p4escr_descr p4_escrs[] =
257 {
258 #define	P4_ESCR(N, MSR, P1, P2, P3)		\
259 	{					\
260 		.pm_escrname = #N,		\
261 		.pm_escr_msr = (MSR),		\
262 		.pm_pmcs =			\
263 		{				\
264 			P4_PMC_##P1,		\
265 			P4_PMC_##P2,		\
266 			P4_PMC_##P3		\
267 		}				\
268 	} ,
269 
270 	P4_ESCRS()
271 
272 #undef	P4_ESCR
273 };
274 
275 /*
276  * P4 Event descriptor
277  */
278 
279 struct p4_event_descr {
280 	const enum pmc_event pm_event;
281 	const uint32_t	pm_escr_eventselect;
282 	const uint32_t	pm_cccr_select;
283 	const char	pm_is_ti_event;
284 	enum pmc_p4escr	pm_escrs[P4_MAX_ESCR_PER_EVENT];
285 };
286 
287 static struct p4_event_descr p4_events[] = {
288 
289 #define	P4_EVDESCR(NAME, ESCREVENTSEL, CCCRSEL, TI_EVENT, ESCR0, ESCR1)	\
290 	{								\
291 		.pm_event            = PMC_EV_P4_##NAME,		\
292 		.pm_escr_eventselect = (ESCREVENTSEL),			\
293 		.pm_cccr_select      = (CCCRSEL),			\
294 		.pm_is_ti_event	     = (TI_EVENT),			\
295 		.pm_escrs            =					\
296 		{							\
297 			P4_ESCR_##ESCR0,				\
298 			P4_ESCR_##ESCR1					\
299 		}							\
300 	}
301 
302 P4_EVDESCR(TC_DELIVER_MODE,	0x01, 0x01, TRUE,  TC_ESCR0,	TC_ESCR1),
303 P4_EVDESCR(BPU_FETCH_REQUEST,	0x03, 0x00, FALSE, BPU_ESCR0,	BPU_ESCR1),
304 P4_EVDESCR(ITLB_REFERENCE,	0x18, 0x03, FALSE, ITLB_ESCR0,	ITLB_ESCR1),
305 P4_EVDESCR(MEMORY_CANCEL,	0x02, 0x05, FALSE, DAC_ESCR0,	DAC_ESCR1),
306 P4_EVDESCR(MEMORY_COMPLETE,	0x08, 0x02, FALSE, SAAT_ESCR0,	SAAT_ESCR1),
307 P4_EVDESCR(LOAD_PORT_REPLAY,	0x04, 0x02, FALSE, SAAT_ESCR0,	SAAT_ESCR1),
308 P4_EVDESCR(STORE_PORT_REPLAY,	0x05, 0x02, FALSE, SAAT_ESCR0,	SAAT_ESCR1),
309 P4_EVDESCR(MOB_LOAD_REPLAY,	0x03, 0x02, FALSE, MOB_ESCR0,	MOB_ESCR1),
310 P4_EVDESCR(PAGE_WALK_TYPE,	0x01, 0x04, TRUE,  PMH_ESCR0,	PMH_ESCR1),
311 P4_EVDESCR(BSQ_CACHE_REFERENCE,	0x0C, 0x07, FALSE, BSU_ESCR0,	BSU_ESCR1),
312 P4_EVDESCR(IOQ_ALLOCATION,	0x03, 0x06, FALSE, FSB_ESCR0,	FSB_ESCR1),
313 P4_EVDESCR(IOQ_ACTIVE_ENTRIES,	0x1A, 0x06, FALSE, FSB_ESCR1,	NONE),
314 P4_EVDESCR(FSB_DATA_ACTIVITY,	0x17, 0x06, TRUE,  FSB_ESCR0,	FSB_ESCR1),
315 P4_EVDESCR(BSQ_ALLOCATION,	0x05, 0x07, FALSE, BSU_ESCR0,	NONE),
316 P4_EVDESCR(BSQ_ACTIVE_ENTRIES,	0x06, 0x07, FALSE, BSU_ESCR1,	NONE),
317 	/* BSQ_ACTIVE_ENTRIES inherits CPU specificity from BSQ_ALLOCATION */
318 P4_EVDESCR(SSE_INPUT_ASSIST,	0x34, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
319 P4_EVDESCR(PACKED_SP_UOP,	0x08, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
320 P4_EVDESCR(PACKED_DP_UOP,	0x0C, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
321 P4_EVDESCR(SCALAR_SP_UOP,	0x0A, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
322 P4_EVDESCR(SCALAR_DP_UOP,	0x0E, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
323 P4_EVDESCR(64BIT_MMX_UOP,	0x02, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
324 P4_EVDESCR(128BIT_MMX_UOP,	0x1A, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
325 P4_EVDESCR(X87_FP_UOP,		0x04, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
326 P4_EVDESCR(X87_SIMD_MOVES_UOP,	0x2E, 0x01, TRUE,  FIRM_ESCR0,	FIRM_ESCR1),
327 P4_EVDESCR(GLOBAL_POWER_EVENTS,	0x13, 0x06, FALSE, FSB_ESCR0,	FSB_ESCR1),
328 P4_EVDESCR(TC_MS_XFER,		0x05, 0x00, FALSE, MS_ESCR0,	MS_ESCR1),
329 P4_EVDESCR(UOP_QUEUE_WRITES,	0x09, 0x00, FALSE, MS_ESCR0,	MS_ESCR1),
330 P4_EVDESCR(RETIRED_MISPRED_BRANCH_TYPE,
331     				0x05, 0x02, FALSE, TBPU_ESCR0,	TBPU_ESCR1),
332 P4_EVDESCR(RETIRED_BRANCH_TYPE,	0x04, 0x02, FALSE, TBPU_ESCR0,	TBPU_ESCR1),
333 P4_EVDESCR(RESOURCE_STALL,	0x01, 0x01, FALSE, ALF_ESCR0,	ALF_ESCR1),
334 P4_EVDESCR(WC_BUFFER,		0x05, 0x05, TRUE,  DAC_ESCR0,	DAC_ESCR1),
335 P4_EVDESCR(B2B_CYCLES,		0x16, 0x03, TRUE,  FSB_ESCR0,	FSB_ESCR1),
336 P4_EVDESCR(BNR,			0x08, 0x03, TRUE,  FSB_ESCR0,	FSB_ESCR1),
337 P4_EVDESCR(SNOOP,		0x06, 0x03, TRUE,  FSB_ESCR0,	FSB_ESCR1),
338 P4_EVDESCR(RESPONSE,		0x04, 0x03, TRUE,  FSB_ESCR0,	FSB_ESCR1),
339 P4_EVDESCR(FRONT_END_EVENT,	0x08, 0x05, FALSE, CRU_ESCR2,	CRU_ESCR3),
340 P4_EVDESCR(EXECUTION_EVENT,	0x0C, 0x05, FALSE, CRU_ESCR2,	CRU_ESCR3),
341 P4_EVDESCR(REPLAY_EVENT, 	0x09, 0x05, FALSE, CRU_ESCR2,	CRU_ESCR3),
342 P4_EVDESCR(INSTR_RETIRED,	0x02, 0x04, FALSE, CRU_ESCR0,	CRU_ESCR1),
343 P4_EVDESCR(UOPS_RETIRED,	0x01, 0x04, FALSE, CRU_ESCR0,	CRU_ESCR1),
344 P4_EVDESCR(UOP_TYPE,		0x02, 0x02, FALSE, RAT_ESCR0,	RAT_ESCR1),
345 P4_EVDESCR(BRANCH_RETIRED,	0x06, 0x05, FALSE, CRU_ESCR2,	CRU_ESCR3),
346 P4_EVDESCR(MISPRED_BRANCH_RETIRED, 0x03, 0x04, FALSE, CRU_ESCR0, CRU_ESCR1),
347 P4_EVDESCR(X87_ASSIST,		0x03, 0x05, FALSE, CRU_ESCR2,	CRU_ESCR3),
348 P4_EVDESCR(MACHINE_CLEAR,	0x02, 0x05, FALSE, CRU_ESCR2,	CRU_ESCR3)
349 
350 #undef	P4_EVDESCR
351 };
352 
353 #define	P4_EVENT_IS_TI(E) ((E)->pm_is_ti_event == TRUE)
354 
355 #define	P4_NEVENTS	(PMC_EV_P4_LAST - PMC_EV_P4_FIRST + 1)
356 
357 /*
358  * P4 PMC descriptors
359  */
360 
361 struct p4pmc_descr {
362 	struct pmc_descr pm_descr; 	/* common information */
363 	enum pmc_p4pmc	pm_pmcnum;	/* PMC number */
364 	uint32_t	pm_pmc_msr; 	/* PERFCTR MSR address */
365 	uint32_t	pm_cccr_msr;  	/* CCCR MSR address */
366 };
367 
368 static struct p4pmc_descr p4_pmcdesc[P4_NPMCS] = {
369 #define	P4_PMC_CAPS (PMC_CAP_INTERRUPT | PMC_CAP_USER | PMC_CAP_SYSTEM |  \
370 	PMC_CAP_EDGE | PMC_CAP_THRESHOLD | PMC_CAP_READ | PMC_CAP_WRITE | \
371 	PMC_CAP_INVERT | PMC_CAP_QUALIFIER | PMC_CAP_PRECISE |            \
372 	PMC_CAP_TAGGING | PMC_CAP_CASCADE)
373 
374 #define	P4_PMCDESCR(N, PMC, CCCR)			\
375 	{						\
376 		.pm_descr =				\
377 		{					\
378 			.pd_name = #N,			\
379 			.pd_class = PMC_CLASS_P4,	\
380 			.pd_caps = P4_PMC_CAPS,		\
381 			.pd_width = 40			\
382 		},					\
383 		.pm_pmcnum      = P4_PMC_##N,		\
384 		.pm_cccr_msr 	= (CCCR),		\
385 		.pm_pmc_msr	= (PMC)			\
386 	}
387 
388 	P4_PMCDESCR(BPU_COUNTER0,	0x300,	0x360),
389 	P4_PMCDESCR(BPU_COUNTER1,	0x301,	0x361),
390 	P4_PMCDESCR(BPU_COUNTER2,	0x302,	0x362),
391 	P4_PMCDESCR(BPU_COUNTER3,	0x303,	0x363),
392 	P4_PMCDESCR(MS_COUNTER0,	0x304,	0x364),
393 	P4_PMCDESCR(MS_COUNTER1,	0x305,	0x365),
394 	P4_PMCDESCR(MS_COUNTER2,	0x306,	0x366),
395 	P4_PMCDESCR(MS_COUNTER3,	0x307,	0x367),
396 	P4_PMCDESCR(FLAME_COUNTER0,	0x308,	0x368),
397 	P4_PMCDESCR(FLAME_COUNTER1,	0x309,	0x369),
398 	P4_PMCDESCR(FLAME_COUNTER2,	0x30A,	0x36A),
399 	P4_PMCDESCR(FLAME_COUNTER3,	0x30B,	0x36B),
400 	P4_PMCDESCR(IQ_COUNTER0,	0x30C,	0x36C),
401 	P4_PMCDESCR(IQ_COUNTER1,	0x30D,	0x36D),
402 	P4_PMCDESCR(IQ_COUNTER2,	0x30E,	0x36E),
403 	P4_PMCDESCR(IQ_COUNTER3,	0x30F,	0x36F),
404 	P4_PMCDESCR(IQ_COUNTER4,	0x310,	0x370),
405 	P4_PMCDESCR(IQ_COUNTER5,	0x311,	0x371),
406 
407 #undef	P4_PMCDESCR
408 };
409 
410 /* HTT support */
411 #define	P4_NHTT					2 /* logical processors/chip */
412 
413 static int p4_system_has_htt;
414 
415 /*
416  * Per-CPU data structure for P4 class CPUs
417  *
418  * [19 struct pmc_hw structures]
419  * [45 ESCRs status bytes]
420  * [per-cpu spin mutex]
421  * [19 flag fields for holding config flags and a runcount]
422  * [19*2 hw value fields]	(Thread mode PMC support)
423  *    or
424  * [19*2 EIP values]		(Sampling mode PMCs)
425  * [19*2 pmc value fields]	(Thread mode PMC support))
426  */
427 
428 struct p4_cpu {
429 	struct pmc_hw	pc_p4pmcs[P4_NPMCS];
430 	char		pc_escrs[P4_NESCR];
431 	struct mtx	pc_mtx;		/* spin lock */
432 	uint32_t	pc_intrflag;	/* NMI handler flags */
433 	unsigned int	pc_intrlock;	/* NMI handler spin lock */
434 	unsigned char	pc_flags[P4_NPMCS]; /* 4 bits each: {cfg,run}count */
435 	union {
436 		pmc_value_t pc_hw[P4_NPMCS * P4_NHTT];
437 		uintptr_t   pc_ip[P4_NPMCS * P4_NHTT];
438 	}		pc_si;
439 	pmc_value_t	pc_pmc_values[P4_NPMCS * P4_NHTT];
440 };
441 
442 static struct p4_cpu **p4_pcpu;
443 
/*
 * Accessors for the per-(row, logical CPU) save areas of struct p4_cpu.
 *
 * Each array holds P4_NPMCS * P4_NHTT entries, so a row index 'RI' owns
 * P4_NHTT consecutive slots, selected by the low bit of the CPU number.
 * The previous index, (RI) * ((CPU) & 1), mapped every row to slot 0 on
 * even-numbered CPUs, so different PMCs overwrote each other's saved
 * values, and it never used the second half of the arrays.
 *
 * All three macros expand to lvalues.
 */
#define	P4_PCPU_SLOT(RI,CPU)		((RI) * P4_NHTT + ((CPU) & 1))

#define	P4_PCPU_PMC_VALUE(PC,RI,CPU) 	\
	(PC)->pc_pmc_values[P4_PCPU_SLOT(RI,CPU)]
#define	P4_PCPU_HW_VALUE(PC,RI,CPU)	\
	(PC)->pc_si.pc_hw[P4_PCPU_SLOT(RI,CPU)]
#define	P4_PCPU_SAVED_IP(PC,RI,CPU)	\
	(PC)->pc_si.pc_ip[P4_PCPU_SLOT(RI,CPU)]
447 
/*
 * Accessors for the per-row flag byte 'pc_flags[RI]'.  The low nibble
 * holds the run count and the high nibble holds the config flags.
 *
 * The scratch variable is 'unsigned char' to match the type of
 * pc_flags[], so the high-nibble update does not go through a
 * possibly signed 'char'.
 */
#define	P4_PCPU_GET_FLAGS(PC,RI,MASK)	((PC)->pc_flags[(RI)] & (MASK))
#define	P4_PCPU_SET_FLAGS(PC,RI,MASK,VAL)	do {	\
	unsigned char p4_flags_;			\
	p4_flags_ = (PC)->pc_flags[(RI)];		\
	p4_flags_ &= ~(MASK);				\
	p4_flags_ |= (VAL) & (MASK);			\
	(PC)->pc_flags[(RI)] = p4_flags_;		\
} while (0)

#define	P4_PCPU_GET_RUNCOUNT(PC,RI)	P4_PCPU_GET_FLAGS(PC,RI,0x0F)
#define	P4_PCPU_SET_RUNCOUNT(PC,RI,V)	P4_PCPU_SET_FLAGS(PC,RI,0x0F,V)

#define	P4_PCPU_GET_CFGFLAGS(PC,RI)	(P4_PCPU_GET_FLAGS(PC,RI,0xF0) >> 4)
#define	P4_PCPU_SET_CFGFLAGS(PC,RI,C)	P4_PCPU_SET_FLAGS(PC,RI,0xF0,((C) <<4))

/*
 * Config-flag bit for logical CPU 'C': 0x2 for the secondary CPU of an
 * HTT pair, 0x1 otherwise.  This uses the macro argument; the previous
 * definition ignored it and read a variable named 'cpu' from the
 * caller's scope.
 */
#define	P4_CPU_TO_FLAG(C)		(P4_CPU_IS_HTT_SECONDARY(C) ? 0x2 : 0x1)
464 
/*
 * Per-PMC NMI handler flag: bit 'I' of pc_intrflag.  GET yields a
 * non-zero value if the bit is set; SET sets the bit when 'V' is
 * non-zero and clears it otherwise.
 */
#define	P4_PCPU_GET_INTRFLAG(PC,I)	((PC)->pc_intrflag & (1 << (I)))
#define	P4_PCPU_SET_INTRFLAG(PC,I,V)	do {		\
		uint32_t p4_intrmask_;			\
		p4_intrmask_ = 1 << (I);		\
		if ((V))				\
			(PC)->pc_intrflag |= p4_intrmask_;	\
		else					\
			(PC)->pc_intrflag &= ~p4_intrmask_;	\
	} while (0)

/*
 * A minimal spin lock implementation for use inside the NMI handler.
 *
 * We don't want to use a regular spin lock here, because curthread
 * may not be consistent at the time the handler is invoked.
 *
 * Both macros operate on the 'pc_intrlock' word of their argument.
 * They previously ignored the argument and required the caller to
 * have a variable named 'pc'.  The release macro keeps its historical
 * trailing semicolon so that existing call sites are unaffected.
 */
#define	P4_PCPU_ACQ_INTR_SPINLOCK(PC) do {				\
		while (!atomic_cmpset_acq_int(&(PC)->pc_intrlock, 0, 1)) \
			ia32_pause();					\
	} while (0)
#define	P4_PCPU_REL_INTR_SPINLOCK(PC) 					\
	atomic_store_rel_int(&(PC)->pc_intrlock, 0);
487 
488 /* ESCR row disposition */
489 static int p4_escrdisp[P4_NESCR];
490 
491 #define	P4_ESCR_ROW_DISP_IS_THREAD(E)		(p4_escrdisp[(E)] > 0)
492 #define	P4_ESCR_ROW_DISP_IS_STANDALONE(E)	(p4_escrdisp[(E)] < 0)
493 #define	P4_ESCR_ROW_DISP_IS_FREE(E)		(p4_escrdisp[(E)] == 0)
494 
495 #define	P4_ESCR_MARK_ROW_STANDALONE(E) do {				\
496 	KASSERT(p4_escrdisp[(E)] <= 0, ("[p4,%d] row disposition error",\
497 		    __LINE__));						\
498 	atomic_add_int(&p4_escrdisp[(E)], -1);				\
499 	KASSERT(p4_escrdisp[(E)] >= (-pmc_cpu_max_active()), 		\
500 		("[p4,%d] row disposition error", __LINE__));		\
501 } while (0)
502 
503 #define	P4_ESCR_UNMARK_ROW_STANDALONE(E) do {				\
504 	atomic_add_int(&p4_escrdisp[(E)], 1);				\
505 	KASSERT(p4_escrdisp[(E)] <= 0, ("[p4,%d] row disposition error",\
506 		    __LINE__));						\
507 } while (0)
508 
509 #define	P4_ESCR_MARK_ROW_THREAD(E) do {					 \
510 	KASSERT(p4_escrdisp[(E)] >= 0, ("[p4,%d] row disposition error", \
511 		    __LINE__));						 \
512 	atomic_add_int(&p4_escrdisp[(E)], 1);				 \
513 } while (0)
514 
515 #define	P4_ESCR_UNMARK_ROW_THREAD(E) do {				 \
516 	atomic_add_int(&p4_escrdisp[(E)], -1);				 \
517 	KASSERT(p4_escrdisp[(E)] >= 0, ("[p4,%d] row disposition error", \
518 		    __LINE__));						 \
519 } while (0)
520 
/* True if the ENABLE bit of the given CCCR MSR is clear. */
#define	P4_PMC_IS_STOPPED(cccr)	(!(rdmsr(cccr) & P4_CCCR_ENABLE))

/*
 * HTT helpers.  When HTT has been detected, the odd-numbered CPU of a
 * pair is the secondary and the even-numbered one is the primary.
 * Without HTT no CPU is a secondary and every CPU is its own primary.
 */
#define	P4_CPU_IS_HTT_SECONDARY(cpu)					\
	(p4_system_has_htt != 0 && ((cpu) & 1) != 0)
#define	P4_TO_HTT_PRIMARY(cpu) 						\
	((cpu) & ~(p4_system_has_htt ? 1 : 0))

/* CCCR bits other than the per-thread PMI bits, ENABLE and OVF. */
#define	P4_CCCR_Tx_MASK							\
	(~(P4_CCCR_OVF_PMI_T0 | P4_CCCR_OVF_PMI_T1 |			\
	    P4_CCCR_ENABLE | P4_CCCR_OVF))
/* ESCR bits other than the per-thread OS/USR bits. */
#define	P4_ESCR_Tx_MASK							\
	(~(P4_ESCR_T0_OS | P4_ESCR_T0_USR |				\
	    P4_ESCR_T1_OS | P4_ESCR_T1_USR))
532 
533 /*
534  * support routines
535  */
536 
537 static struct p4_event_descr *
p4_find_event(enum pmc_event ev)538 p4_find_event(enum pmc_event ev)
539 {
540 	int n;
541 
542 	for (n = 0; n < P4_NEVENTS; n++)
543 		if (p4_events[n].pm_event == ev)
544 			break;
545 	if (n == P4_NEVENTS)
546 		return (NULL);
547 	return (&p4_events[n]);
548 }
549 
550 /*
551  * Initialize per-cpu state
552  */
553 
554 static int
p4_pcpu_init(struct pmc_mdep * md,int cpu)555 p4_pcpu_init(struct pmc_mdep *md, int cpu)
556 {
557 	char *pescr;
558 	int n, first_ri, phycpu;
559 	struct pmc_hw *phw;
560 	struct p4_cpu *p4c;
561 	struct pmc_cpu *pc, *plc;
562 
563 	KASSERT(cpu >= 0 && cpu < pmc_cpu_max(),
564 	    ("[p4,%d] insane cpu number %d", __LINE__, cpu));
565 
566 	PMCDBG2(MDP,INI,0, "p4-init cpu=%d is-primary=%d", cpu,
567 	    pmc_cpu_is_primary(cpu) != 0);
568 
569 	first_ri = md->pmd_classdep[PMC_MDEP_CLASS_INDEX_P4].pcd_ri;
570 
571 	/*
572 	 * The two CPUs in an HT pair share their per-cpu state.
573 	 *
574 	 * For HT capable CPUs, we assume that the two logical
575 	 * processors in the HT pair get two consecutive CPU ids
576 	 * starting with an even id #.
577 	 *
578 	 * The primary CPU (the even numbered CPU of the pair) would
579 	 * have been initialized prior to the initialization for the
580 	 * secondary.
581 	 */
582 
583 	if (!pmc_cpu_is_primary(cpu) && (cpu & 1)) {
584 
585 		p4_system_has_htt = 1;
586 
587 		phycpu = P4_TO_HTT_PRIMARY(cpu);
588 		pc = pmc_pcpu[phycpu];
589 		plc = pmc_pcpu[cpu];
590 
591 		KASSERT(plc != pc, ("[p4,%d] per-cpu config error", __LINE__));
592 
593 		PMCDBG3(MDP,INI,1, "p4-init cpu=%d phycpu=%d pc=%p", cpu,
594 		    phycpu, pc);
595 		KASSERT(pc, ("[p4,%d] Null Per-Cpu state cpu=%d phycpu=%d",
596 		    __LINE__, cpu, phycpu));
597 
598 		/* PMCs are shared with the physical CPU. */
599 		for (n = 0; n < P4_NPMCS; n++)
600 			plc->pc_hwpmcs[n + first_ri] =
601 			    pc->pc_hwpmcs[n + first_ri];
602 
603 		return (0);
604 	}
605 
606 	p4c = malloc(sizeof(struct p4_cpu), M_PMC, M_WAITOK|M_ZERO);
607 
608 	if (p4c == NULL)
609 		return (ENOMEM);
610 
611 	pc = pmc_pcpu[cpu];
612 
613 	KASSERT(pc != NULL, ("[p4,%d] cpu %d null per-cpu", __LINE__, cpu));
614 
615 	p4_pcpu[cpu] = p4c;
616 	phw = p4c->pc_p4pmcs;
617 
618 	for (n = 0; n < P4_NPMCS; n++, phw++) {
619 		phw->phw_state   = PMC_PHW_FLAG_IS_ENABLED |
620 		    PMC_PHW_CPU_TO_STATE(cpu) | PMC_PHW_INDEX_TO_STATE(n);
621 		phw->phw_pmc     = NULL;
622 		pc->pc_hwpmcs[n + first_ri] = phw;
623 	}
624 
625 	pescr = p4c->pc_escrs;
626 	for (n = 0; n < P4_NESCR; n++)
627 		*pescr++ = P4_INVALID_PMC_INDEX;
628 
629 	mtx_init(&p4c->pc_mtx, "p4-pcpu", "pmc-leaf", MTX_SPIN);
630 
631 	return (0);
632 }
633 
634 /*
635  * Destroy per-cpu state.
636  */
637 
638 static int
p4_pcpu_fini(struct pmc_mdep * md,int cpu)639 p4_pcpu_fini(struct pmc_mdep *md, int cpu)
640 {
641 	int first_ri, i;
642 	struct p4_cpu *p4c;
643 	struct pmc_cpu *pc;
644 
645 	PMCDBG1(MDP,INI,0, "p4-cleanup cpu=%d", cpu);
646 
647 	pc = pmc_pcpu[cpu];
648 	first_ri = md->pmd_classdep[PMC_MDEP_CLASS_INDEX_P4].pcd_ri;
649 
650 	for (i = 0; i < P4_NPMCS; i++)
651 		pc->pc_hwpmcs[i + first_ri] = NULL;
652 
653 	if (!pmc_cpu_is_primary(cpu) && (cpu & 1))
654 		return (0);
655 
656 	p4c = p4_pcpu[cpu];
657 
658 	KASSERT(p4c != NULL, ("[p4,%d] NULL pcpu", __LINE__));
659 
660 	/* Turn off all PMCs on this CPU */
661 	for (i = 0; i < P4_NPMCS - 1; i++)
662 		wrmsr(P4_CCCR_MSR_FIRST + i,
663 		    rdmsr(P4_CCCR_MSR_FIRST + i) & ~P4_CCCR_ENABLE);
664 
665 	mtx_destroy(&p4c->pc_mtx);
666 
667 	free(p4c, M_PMC);
668 
669 	p4_pcpu[cpu] = NULL;
670 
671 	return (0);
672 }
673 
674 /*
675  * Read a PMC
676  */
677 
678 static int
p4_read_pmc(int cpu,int ri,pmc_value_t * v)679 p4_read_pmc(int cpu, int ri, pmc_value_t *v)
680 {
681 	struct pmc *pm;
682 	pmc_value_t tmp;
683 	struct p4_cpu *pc;
684 	enum pmc_mode mode;
685 	struct p4pmc_descr *pd;
686 
687 	KASSERT(cpu >= 0 && cpu < pmc_cpu_max(),
688 	    ("[p4,%d] illegal CPU value %d", __LINE__, cpu));
689 	KASSERT(ri >= 0 && ri < P4_NPMCS,
690 	    ("[p4,%d] illegal row-index %d", __LINE__, ri));
691 
692 	pc = p4_pcpu[P4_TO_HTT_PRIMARY(cpu)];
693 	pm = pc->pc_p4pmcs[ri].phw_pmc;
694 	pd = &p4_pmcdesc[ri];
695 
696 	KASSERT(pm != NULL,
697 	    ("[p4,%d] No owner for HWPMC [cpu%d,pmc%d]", __LINE__, cpu, ri));
698 
699 	KASSERT(pd->pm_descr.pd_class == PMC_TO_CLASS(pm),
700 	    ("[p4,%d] class mismatch pd %d != id class %d", __LINE__,
701 	    pd->pm_descr.pd_class, PMC_TO_CLASS(pm)));
702 
703 	mode = PMC_TO_MODE(pm);
704 
705 	PMCDBG3(MDP,REA,1, "p4-read cpu=%d ri=%d mode=%d", cpu, ri, mode);
706 
707 	KASSERT(pd->pm_descr.pd_class == PMC_CLASS_P4,
708 	    ("[p4,%d] unknown PMC class %d", __LINE__, pd->pm_descr.pd_class));
709 
710 	tmp = rdmsr(p4_pmcdesc[ri].pm_pmc_msr);
711 
712 	if (PMC_IS_VIRTUAL_MODE(mode)) {
713 		if (tmp < P4_PCPU_HW_VALUE(pc,ri,cpu)) /* 40 bit overflow */
714 			tmp += (P4_PERFCTR_MASK + 1) -
715 			    P4_PCPU_HW_VALUE(pc,ri,cpu);
716 		else
717 			tmp -= P4_PCPU_HW_VALUE(pc,ri,cpu);
718 		tmp += P4_PCPU_PMC_VALUE(pc,ri,cpu);
719 	}
720 
721 	if (PMC_IS_SAMPLING_MODE(mode)) /* undo transformation */
722 		*v = P4_PERFCTR_VALUE_TO_RELOAD_COUNT(tmp);
723 	else
724 		*v = tmp;
725 
726 	PMCDBG1(MDP,REA,2, "p4-read -> %jx", *v);
727 
728 	return (0);
729 }
730 
731 /*
732  * Write a PMC
733  */
734 
735 static int
p4_write_pmc(int cpu,int ri,pmc_value_t v)736 p4_write_pmc(int cpu, int ri, pmc_value_t v)
737 {
738 	enum pmc_mode mode;
739 	struct pmc *pm;
740 	struct p4_cpu *pc;
741 	const struct pmc_hw *phw;
742 	const struct p4pmc_descr *pd;
743 
744 	KASSERT(cpu >= 0 && cpu < pmc_cpu_max(),
745 	    ("[amd,%d] illegal CPU value %d", __LINE__, cpu));
746 	KASSERT(ri >= 0 && ri < P4_NPMCS,
747 	    ("[amd,%d] illegal row-index %d", __LINE__, ri));
748 
749 	pc  = p4_pcpu[P4_TO_HTT_PRIMARY(cpu)];
750 	phw = &pc->pc_p4pmcs[ri];
751 	pm  = phw->phw_pmc;
752 	pd  = &p4_pmcdesc[ri];
753 
754 	KASSERT(pm != NULL,
755 	    ("[p4,%d] No owner for HWPMC [cpu%d,pmc%d]", __LINE__,
756 		cpu, ri));
757 
758 	mode = PMC_TO_MODE(pm);
759 
760 	PMCDBG4(MDP,WRI,1, "p4-write cpu=%d ri=%d mode=%d v=%jx", cpu, ri,
761 	    mode, v);
762 
763 	/*
764 	 * write the PMC value to the register/saved value: for
765 	 * sampling mode PMCs, the value to be programmed into the PMC
766 	 * counter is -(C+1) where 'C' is the requested sample rate.
767 	 */
768 	if (PMC_IS_SAMPLING_MODE(mode))
769 		v = P4_RELOAD_COUNT_TO_PERFCTR_VALUE(v);
770 
771 	if (PMC_IS_SYSTEM_MODE(mode))
772 		wrmsr(pd->pm_pmc_msr, v);
773 	else
774 		P4_PCPU_PMC_VALUE(pc,ri,cpu) = v;
775 
776 	return (0);
777 }
778 
779 /*
780  * Configure a PMC 'pm' on the given CPU and row-index.
781  *
782  * 'pm' may be NULL to indicate de-configuration.
783  *
784  * On HTT systems, a PMC may get configured twice, once for each
785  * "logical" CPU.  We track this using the CFGFLAGS field of the
786  * per-cpu state; this field is a bit mask with one bit each for
787  * logical CPUs 0 & 1.
788  */
789 
790 static int
p4_config_pmc(int cpu,int ri,struct pmc * pm)791 p4_config_pmc(int cpu, int ri, struct pmc *pm)
792 {
793 	struct pmc_hw *phw;
794 	struct p4_cpu *pc;
795 	int cfgflags, cpuflag;
796 
797 	KASSERT(cpu >= 0 && cpu < pmc_cpu_max(),
798 	    ("[p4,%d] illegal CPU %d", __LINE__, cpu));
799 
800 	KASSERT(ri >= 0 && ri < P4_NPMCS,
801 	    ("[p4,%d] illegal row-index %d", __LINE__, ri));
802 
803 	PMCDBG3(MDP,CFG,1, "cpu=%d ri=%d pm=%p", cpu, ri, pm);
804 
805 	pc  = p4_pcpu[P4_TO_HTT_PRIMARY(cpu)];
806 	phw = &pc->pc_p4pmcs[ri];
807 
808 	KASSERT(pm == NULL || phw->phw_pmc == NULL ||
809 	    (p4_system_has_htt && phw->phw_pmc == pm),
810 	    ("[p4,%d] hwpmc not unconfigured before re-config", __LINE__));
811 
812 	mtx_lock_spin(&pc->pc_mtx);
813 	cfgflags = P4_PCPU_GET_CFGFLAGS(pc,ri);
814 
815 	KASSERT(cfgflags >= 0 || cfgflags <= 3,
816 	    ("[p4,%d] illegal cfgflags cfg=%d on cpu=%d ri=%d", __LINE__,
817 		cfgflags, cpu, ri));
818 
819 	KASSERT(cfgflags == 0 || phw->phw_pmc,
820 	    ("[p4,%d] cpu=%d ri=%d pmc configured with zero cfg count",
821 		__LINE__, cpu, ri));
822 
823 	cpuflag = P4_CPU_TO_FLAG(cpu);
824 
825 	if (pm) {		/* config */
826 		if (cfgflags == 0)
827 			phw->phw_pmc = pm;
828 
829 		KASSERT(phw->phw_pmc == pm,
830 		    ("[p4,%d] cpu=%d ri=%d config %p != hw %p",
831 			__LINE__, cpu, ri, pm, phw->phw_pmc));
832 
833 		cfgflags |= cpuflag;
834 	} else {		/* unconfig */
835 		cfgflags &= ~cpuflag;
836 
837 		if (cfgflags == 0)
838 			phw->phw_pmc = NULL;
839 	}
840 
841 	KASSERT(cfgflags >= 0 || cfgflags <= 3,
842 	    ("[p4,%d] illegal runcount cfg=%d on cpu=%d ri=%d", __LINE__,
843 		cfgflags, cpu, ri));
844 
845 	P4_PCPU_SET_CFGFLAGS(pc,ri,cfgflags);
846 
847 	mtx_unlock_spin(&pc->pc_mtx);
848 
849 	return (0);
850 }
851 
852 /*
853  * Retrieve a configured PMC pointer from hardware state.
854  */
855 
856 static int
p4_get_config(int cpu,int ri,struct pmc ** ppm)857 p4_get_config(int cpu, int ri, struct pmc **ppm)
858 {
859 	int cfgflags;
860 	struct p4_cpu *pc;
861 
862 	KASSERT(cpu >= 0 && cpu < pmc_cpu_max(),
863 	    ("[p4,%d] illegal CPU %d", __LINE__, cpu));
864 	KASSERT(ri >= 0 && ri < P4_NPMCS,
865 	    ("[p4,%d] illegal row-index %d", __LINE__, ri));
866 
867 	pc = p4_pcpu[P4_TO_HTT_PRIMARY(cpu)];
868 
869 	mtx_lock_spin(&pc->pc_mtx);
870 	cfgflags = P4_PCPU_GET_CFGFLAGS(pc,ri);
871 	mtx_unlock_spin(&pc->pc_mtx);
872 
873 	if (cfgflags & P4_CPU_TO_FLAG(cpu))
874 		*ppm = pc->pc_p4pmcs[ri].phw_pmc; /* PMC config'ed on this CPU */
875 	else
876 		*ppm = NULL;
877 
878 	return 0;
879 }
880 
881 /*
882  * Allocate a PMC.
883  *
884  * The allocation strategy differs between HTT and non-HTT systems.
885  *
886  * The non-HTT case:
887  *   - Given the desired event and the PMC row-index, lookup the
888  *   list of valid ESCRs for the event.
889  *   - For each valid ESCR:
890  *     - Check if the ESCR is free and the ESCR row is in a compatible
891  *       mode (i.e., system or process))
892  *     - Check if the ESCR is usable with a P4 PMC at the desired row-index.
893  *   If everything matches, we determine the appropriate bit values for the
894  *   ESCR and CCCR registers.
895  *
896  * The HTT case:
897  *
898  * - Process mode PMCs require special care.  The FreeBSD scheduler could
899  *   schedule any two processes on the same physical CPU.  We need to ensure
900  *   that a given PMC row-index is never allocated to two different
901  *   PMCs owned by different user-processes.
902  *   This is ensured by always allocating a PMC from a 'FREE' PMC row
903  *   if the system has HTT active.
904  * - A similar check needs to be done for ESCRs; we do not want two PMCs
905  *   using the same ESCR to be scheduled at the same time.  Thus ESCR
906  *   allocation is also restricted to FREE rows if the system has HTT
907  *   enabled.
908  * - Thirdly, some events are 'thread-independent' terminology, i.e.,
909  *   the PMC hardware cannot distinguish between events caused by
910  *   different logical CPUs.  This makes it impossible to assign events
911  *   to a given thread of execution.  If the system has HTT enabled,
912  *   these events are not allowed for process-mode PMCs.
913  */
914 
915 static int
p4_allocate_pmc(int cpu,int ri,struct pmc * pm,const struct pmc_op_pmcallocate * a)916 p4_allocate_pmc(int cpu, int ri, struct pmc *pm,
917     const struct pmc_op_pmcallocate *a)
918 {
919 	int found, n, m;
920 	uint32_t caps, cccrvalue, escrvalue, tflags;
921 	enum pmc_p4escr escr;
922 	struct p4_cpu *pc;
923 	struct p4_event_descr *pevent;
924 	const struct p4pmc_descr *pd;
925 
926 	KASSERT(cpu >= 0 && cpu < pmc_cpu_max(),
927 	    ("[p4,%d] illegal CPU %d", __LINE__, cpu));
928 	KASSERT(ri >= 0 && ri < P4_NPMCS,
929 	    ("[p4,%d] illegal row-index value %d", __LINE__, ri));
930 
931 	pd = &p4_pmcdesc[ri];
932 
933 	PMCDBG4(MDP,ALL,1, "p4-allocate ri=%d class=%d pmccaps=0x%x "
934 	    "reqcaps=0x%x", ri, pd->pm_descr.pd_class, pd->pm_descr.pd_caps,
935 	    pm->pm_caps);
936 
937 	/* check class */
938 	if (pd->pm_descr.pd_class != a->pm_class)
939 		return (EINVAL);
940 
941 	/* check requested capabilities */
942 	caps = a->pm_caps;
943 	if ((pd->pm_descr.pd_caps & caps) != caps)
944 		return (EPERM);
945 
946 	/*
947 	 * If the system has HTT enabled, and the desired allocation
948 	 * mode is process-private, and the PMC row disposition is not
949 	 * FREE (0), decline the allocation.
950 	 */
951 
952 	if (p4_system_has_htt &&
953 	    PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)) &&
954 	    pmc_getrowdisp(ri) != 0)
955 		return (EBUSY);
956 
957 	KASSERT(pd->pm_descr.pd_class == PMC_CLASS_P4,
958 	    ("[p4,%d] unknown PMC class %d", __LINE__,
959 		pd->pm_descr.pd_class));
960 
961 	if (pm->pm_event < PMC_EV_P4_FIRST ||
962 	    pm->pm_event > PMC_EV_P4_LAST)
963 		return (EINVAL);
964 
965 	if ((pevent = p4_find_event(pm->pm_event)) == NULL)
966 		return (ESRCH);
967 
968 	PMCDBG4(MDP,ALL,2, "pevent={ev=%d,escrsel=0x%x,cccrsel=0x%x,isti=%d}",
969 	    pevent->pm_event, pevent->pm_escr_eventselect,
970 	    pevent->pm_cccr_select, pevent->pm_is_ti_event);
971 
972 	/*
973 	 * Some PMC events are 'thread independent'and therefore
974 	 * cannot be used for process-private modes if HTT is being
975 	 * used.
976 	 */
977 
978 	if (P4_EVENT_IS_TI(pevent) &&
979 	    PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)) &&
980 	    p4_system_has_htt)
981 		return (EINVAL);
982 
983 	pc = p4_pcpu[P4_TO_HTT_PRIMARY(cpu)];
984 
985 	found   = 0;
986 
987 	/* look for a suitable ESCR for this event */
988 	for (n = 0; n < P4_MAX_ESCR_PER_EVENT && !found; n++) {
989 		if ((escr = pevent->pm_escrs[n]) == P4_ESCR_NONE)
990 			break;	/* out of ESCRs */
991 		/*
992 		 * Check ESCR row disposition.
993 		 *
994 		 * If the request is for a system-mode PMC, then the
995 		 * ESCR row should not be in process-virtual mode, and
996 		 * should also be free on the current CPU.
997 		 */
998 
999 		if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
1000 		    if (P4_ESCR_ROW_DISP_IS_THREAD(escr) ||
1001 			pc->pc_escrs[escr] != P4_INVALID_PMC_INDEX)
1002 			    continue;
1003 		}
1004 
1005 		/*
1006 		 * If the request is for a process-virtual PMC, and if
1007 		 * HTT is not enabled, we can use an ESCR row that is
1008 		 * either FREE or already in process mode.
1009 		 *
1010 		 * If HTT is enabled, then we need to ensure that a
1011 		 * given ESCR is never allocated to two PMCS that
1012 		 * could run simultaneously on the two logical CPUs of
1013 		 * a CPU package.  We ensure this be only allocating
1014 		 * ESCRs from rows marked as 'FREE'.
1015 		 */
1016 
1017 		if (PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm))) {
1018 			if (p4_system_has_htt) {
1019 				if (!P4_ESCR_ROW_DISP_IS_FREE(escr))
1020 					continue;
1021 			} else
1022 				if (P4_ESCR_ROW_DISP_IS_STANDALONE(escr))
1023 					continue;
1024 		}
1025 
1026 		/*
1027 		 * We found a suitable ESCR for this event.  Now check if
1028 		 * this escr can work with the PMC at row-index 'ri'.
1029 		 */
1030 
1031 		for (m = 0; m < P4_MAX_PMC_PER_ESCR; m++)
1032 			if (p4_escrs[escr].pm_pmcs[m] == pd->pm_pmcnum) {
1033 				found = 1;
1034 				break;
1035 			}
1036 	}
1037 
1038 	if (found == 0)
1039 		return (ESRCH);
1040 
1041 	KASSERT((int) escr >= 0 && escr < P4_NESCR,
1042 	    ("[p4,%d] illegal ESCR value %d", __LINE__, escr));
1043 
1044 	/* mark ESCR row mode */
1045 	if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
1046 		pc->pc_escrs[escr] = ri; /* mark ESCR as in use on this cpu */
1047 		P4_ESCR_MARK_ROW_STANDALONE(escr);
1048 	} else {
1049 		KASSERT(pc->pc_escrs[escr] == P4_INVALID_PMC_INDEX,
1050 		    ("[p4,%d] escr[%d] already in use", __LINE__, escr));
1051 		P4_ESCR_MARK_ROW_THREAD(escr);
1052 	}
1053 
1054 	pm->pm_md.pm_p4.pm_p4_escrmsr   = p4_escrs[escr].pm_escr_msr;
1055 	pm->pm_md.pm_p4.pm_p4_escr      = escr;
1056 
1057 	cccrvalue = P4_CCCR_TO_ESCR_SELECT(pevent->pm_cccr_select);
1058 	escrvalue = P4_ESCR_TO_EVENT_SELECT(pevent->pm_escr_eventselect);
1059 
1060 	/* CCCR fields */
1061 	if (caps & PMC_CAP_THRESHOLD)
1062 		cccrvalue |= (a->pm_md.pm_p4.pm_p4_cccrconfig &
1063 		    P4_CCCR_THRESHOLD_MASK) | P4_CCCR_COMPARE;
1064 
1065 	if (caps & PMC_CAP_EDGE)
1066 		cccrvalue |= P4_CCCR_EDGE;
1067 
1068 	if (caps & PMC_CAP_INVERT)
1069 		cccrvalue |= P4_CCCR_COMPLEMENT;
1070 
1071 	if (p4_system_has_htt)
1072 		cccrvalue |= a->pm_md.pm_p4.pm_p4_cccrconfig &
1073 		    P4_CCCR_ACTIVE_THREAD_MASK;
1074 	else			/* no HTT; thread field should be '11b' */
1075 		cccrvalue |= P4_CCCR_TO_ACTIVE_THREAD(0x3);
1076 
1077 	if (caps & PMC_CAP_CASCADE)
1078 		cccrvalue |= P4_CCCR_CASCADE;
1079 
1080 	/* On HTT systems the PMI T0 field may get moved to T1 at pmc start */
1081 	if (caps & PMC_CAP_INTERRUPT)
1082 		cccrvalue |= P4_CCCR_OVF_PMI_T0;
1083 
1084 	/* ESCR fields */
1085 	if (caps & PMC_CAP_QUALIFIER)
1086 		escrvalue |= a->pm_md.pm_p4.pm_p4_escrconfig &
1087 		    P4_ESCR_EVENT_MASK_MASK;
1088 	if (caps & PMC_CAP_TAGGING)
1089 		escrvalue |= (a->pm_md.pm_p4.pm_p4_escrconfig &
1090 		    P4_ESCR_TAG_VALUE_MASK) | P4_ESCR_TAG_ENABLE;
1091 	if (caps & PMC_CAP_QUALIFIER)
1092 		escrvalue |= (a->pm_md.pm_p4.pm_p4_escrconfig &
1093 		    P4_ESCR_EVENT_MASK_MASK);
1094 
1095 	/* HTT: T0_{OS,USR} bits may get moved to T1 at pmc start */
1096 	tflags = 0;
1097 	if (caps & PMC_CAP_SYSTEM)
1098 		tflags |= P4_ESCR_T0_OS;
1099 	if (caps & PMC_CAP_USER)
1100 		tflags |= P4_ESCR_T0_USR;
1101 	if (tflags == 0)
1102 		tflags = (P4_ESCR_T0_OS|P4_ESCR_T0_USR);
1103 	escrvalue |= tflags;
1104 
1105 	pm->pm_md.pm_p4.pm_p4_cccrvalue = cccrvalue;
1106 	pm->pm_md.pm_p4.pm_p4_escrvalue = escrvalue;
1107 
1108 	PMCDBG5(MDP,ALL,2, "p4-allocate cccrsel=0x%x cccrval=0x%x "
1109 	    "escr=%d escrmsr=0x%x escrval=0x%x", pevent->pm_cccr_select,
1110 	    cccrvalue, escr, pm->pm_md.pm_p4.pm_p4_escrmsr, escrvalue);
1111 
1112 	return (0);
1113 }
1114 
1115 /*
1116  * release a PMC.
1117  */
1118 
1119 static int
p4_release_pmc(int cpu,int ri,struct pmc * pm)1120 p4_release_pmc(int cpu, int ri, struct pmc *pm)
1121 {
1122 	enum pmc_p4escr escr;
1123 	struct p4_cpu *pc;
1124 
1125 	KASSERT(ri >= 0 && ri < P4_NPMCS,
1126 	    ("[p4,%d] illegal row-index %d", __LINE__, ri));
1127 
1128 	escr = pm->pm_md.pm_p4.pm_p4_escr;
1129 
1130 	PMCDBG3(MDP,REL,1, "p4-release cpu=%d ri=%d escr=%d", cpu, ri, escr);
1131 
1132 	if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
1133 		pc  = p4_pcpu[P4_TO_HTT_PRIMARY(cpu)];
1134 
1135 		KASSERT(pc->pc_p4pmcs[ri].phw_pmc == NULL,
1136 		    ("[p4,%d] releasing configured PMC ri=%d", __LINE__, ri));
1137 
1138 		P4_ESCR_UNMARK_ROW_STANDALONE(escr);
1139 		KASSERT(pc->pc_escrs[escr] == ri,
1140 		    ("[p4,%d] escr[%d] not allocated to ri %d", __LINE__,
1141 			escr, ri));
1142 	        pc->pc_escrs[escr] = P4_INVALID_PMC_INDEX; /* mark as free */
1143 	} else
1144 		P4_ESCR_UNMARK_ROW_THREAD(escr);
1145 
1146 	return (0);
1147 }
1148 
1149 /*
1150  * Start a PMC
1151  */
1152 
1153 static int
p4_start_pmc(int cpu,int ri)1154 p4_start_pmc(int cpu, int ri)
1155 {
1156 	int rc;
1157 	struct pmc *pm;
1158 	struct p4_cpu *pc;
1159 	struct p4pmc_descr *pd;
1160 	uint32_t cccrvalue, cccrtbits, escrvalue, escrmsr, escrtbits;
1161 
1162 	KASSERT(cpu >= 0 && cpu < pmc_cpu_max(),
1163 	    ("[p4,%d] illegal CPU value %d", __LINE__, cpu));
1164 	KASSERT(ri >= 0 && ri < P4_NPMCS,
1165 	    ("[p4,%d] illegal row-index %d", __LINE__, ri));
1166 
1167 	pc = p4_pcpu[P4_TO_HTT_PRIMARY(cpu)];
1168 	pm = pc->pc_p4pmcs[ri].phw_pmc;
1169 	pd = &p4_pmcdesc[ri];
1170 
1171 	KASSERT(pm != NULL,
1172 	    ("[p4,%d] starting cpu%d,pmc%d with null pmc", __LINE__, cpu, ri));
1173 
1174 	PMCDBG2(MDP,STA,1, "p4-start cpu=%d ri=%d", cpu, ri);
1175 
1176 	KASSERT(pd->pm_descr.pd_class == PMC_CLASS_P4,
1177 	    ("[p4,%d] wrong PMC class %d", __LINE__,
1178 		pd->pm_descr.pd_class));
1179 
1180 	/* retrieve the desired CCCR/ESCR values from the PMC */
1181 	cccrvalue = pm->pm_md.pm_p4.pm_p4_cccrvalue;
1182 	escrvalue = pm->pm_md.pm_p4.pm_p4_escrvalue;
1183 	escrmsr   = pm->pm_md.pm_p4.pm_p4_escrmsr;
1184 
1185 	/* extract and zero the logical processor selection bits */
1186 	cccrtbits = cccrvalue & P4_CCCR_OVF_PMI_T0;
1187 	escrtbits = escrvalue & (P4_ESCR_T0_OS|P4_ESCR_T0_USR);
1188 	cccrvalue &= ~P4_CCCR_OVF_PMI_T0;
1189 	escrvalue &= ~(P4_ESCR_T0_OS|P4_ESCR_T0_USR);
1190 
1191 	if (P4_CPU_IS_HTT_SECONDARY(cpu)) { /* shift T0 bits to T1 position */
1192 		cccrtbits <<= 1;
1193 		escrtbits >>= 2;
1194 	}
1195 
1196 	/* start system mode PMCs directly */
1197 	if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
1198 		wrmsr(escrmsr, escrvalue | escrtbits);
1199 		wrmsr(pd->pm_cccr_msr, cccrvalue | cccrtbits | P4_CCCR_ENABLE);
1200 		return 0;
1201 	}
1202 
1203 	/*
1204 	 * Thread mode PMCs
1205 	 *
1206 	 * On HTT machines, the same PMC could be scheduled on the
1207 	 * same physical CPU twice (once for each logical CPU), for
1208 	 * example, if two threads of a multi-threaded process get
1209 	 * scheduled on the same CPU.
1210 	 *
1211 	 */
1212 
1213 	mtx_lock_spin(&pc->pc_mtx);
1214 
1215 	rc = P4_PCPU_GET_RUNCOUNT(pc,ri);
1216 	KASSERT(rc == 0 || rc == 1,
1217 	    ("[p4,%d] illegal runcount cpu=%d ri=%d rc=%d", __LINE__, cpu, ri,
1218 		rc));
1219 
1220 	if (rc == 0) {		/* 1st CPU and the non-HTT case */
1221 
1222 		KASSERT(P4_PMC_IS_STOPPED(pd->pm_cccr_msr),
1223 		    ("[p4,%d] cpu=%d ri=%d cccr=0x%x not stopped", __LINE__,
1224 			cpu, ri, pd->pm_cccr_msr));
1225 
1226 		/* write out the low 40 bits of the saved value to hardware */
1227 		wrmsr(pd->pm_pmc_msr,
1228 		    P4_PCPU_PMC_VALUE(pc,ri,cpu) & P4_PERFCTR_MASK);
1229 
1230 	} else if (rc == 1) {		/* 2nd CPU */
1231 
1232 		/*
1233 		 * Stop the PMC and retrieve the CCCR and ESCR values
1234 		 * from their MSRs, and turn on the additional T[0/1]
1235 		 * bits for the 2nd CPU.
1236 		 */
1237 
1238 		cccrvalue = rdmsr(pd->pm_cccr_msr);
1239 		wrmsr(pd->pm_cccr_msr, cccrvalue & ~P4_CCCR_ENABLE);
1240 
1241 		/* check that the configuration bits read back match the PMC */
1242 		KASSERT((cccrvalue & P4_CCCR_Tx_MASK) ==
1243 		    (pm->pm_md.pm_p4.pm_p4_cccrvalue & P4_CCCR_Tx_MASK),
1244 		    ("[p4,%d] Extra CCCR bits cpu=%d rc=%d ri=%d "
1245 			"cccr=0x%x PMC=0x%x", __LINE__, cpu, rc, ri,
1246 			cccrvalue & P4_CCCR_Tx_MASK,
1247 			pm->pm_md.pm_p4.pm_p4_cccrvalue & P4_CCCR_Tx_MASK));
1248 		KASSERT(cccrvalue & P4_CCCR_ENABLE,
1249 		    ("[p4,%d] 2nd cpu rc=%d cpu=%d ri=%d not running",
1250 			__LINE__, rc, cpu, ri));
1251 		KASSERT((cccrvalue & cccrtbits) == 0,
1252 		    ("[p4,%d] CCCR T0/T1 mismatch rc=%d cpu=%d ri=%d"
1253 		     "cccrvalue=0x%x tbits=0x%x", __LINE__, rc, cpu, ri,
1254 			cccrvalue, cccrtbits));
1255 
1256 		escrvalue = rdmsr(escrmsr);
1257 
1258 		KASSERT((escrvalue & P4_ESCR_Tx_MASK) ==
1259 		    (pm->pm_md.pm_p4.pm_p4_escrvalue & P4_ESCR_Tx_MASK),
1260 		    ("[p4,%d] Extra ESCR bits cpu=%d rc=%d ri=%d "
1261 			"escr=0x%x pm=0x%x", __LINE__, cpu, rc, ri,
1262 			escrvalue & P4_ESCR_Tx_MASK,
1263 			pm->pm_md.pm_p4.pm_p4_escrvalue & P4_ESCR_Tx_MASK));
1264 		KASSERT((escrvalue & escrtbits) == 0,
1265 		    ("[p4,%d] ESCR T0/T1 mismatch rc=%d cpu=%d ri=%d "
1266 		     "escrmsr=0x%x escrvalue=0x%x tbits=0x%x", __LINE__,
1267 			rc, cpu, ri, escrmsr, escrvalue, escrtbits));
1268 	}
1269 
1270 	/* Enable the correct bits for this CPU. */
1271 	escrvalue |= escrtbits;
1272 	cccrvalue |= cccrtbits | P4_CCCR_ENABLE;
1273 
1274 	/* Save HW value at the time of starting hardware */
1275 	P4_PCPU_HW_VALUE(pc,ri,cpu) = rdmsr(pd->pm_pmc_msr);
1276 
1277 	/* Program the ESCR and CCCR and start the PMC */
1278 	wrmsr(escrmsr, escrvalue);
1279 	wrmsr(pd->pm_cccr_msr, cccrvalue);
1280 
1281 	++rc;
1282 	P4_PCPU_SET_RUNCOUNT(pc,ri,rc);
1283 
1284 	mtx_unlock_spin(&pc->pc_mtx);
1285 
1286 	PMCDBG6(MDP,STA,2,"p4-start cpu=%d rc=%d ri=%d escr=%d "
1287 	    "escrmsr=0x%x escrvalue=0x%x", cpu, rc,
1288 	    ri, pm->pm_md.pm_p4.pm_p4_escr, escrmsr, escrvalue);
1289 	PMCDBG2(MDP,STA,2,"cccr_config=0x%x v=%jx",
1290 	    cccrvalue, P4_PCPU_HW_VALUE(pc,ri,cpu));
1291 
1292 	return (0);
1293 }
1294 
1295 /*
1296  * Stop a PMC.
1297  */
1298 
1299 static int
p4_stop_pmc(int cpu,int ri)1300 p4_stop_pmc(int cpu, int ri)
1301 {
1302 	int rc;
1303 	uint32_t cccrvalue, cccrtbits, escrvalue, escrmsr, escrtbits;
1304 	struct pmc *pm;
1305 	struct p4_cpu *pc;
1306 	struct p4pmc_descr *pd;
1307 	pmc_value_t tmp;
1308 
1309 	KASSERT(cpu >= 0 && cpu < pmc_cpu_max(),
1310 	    ("[p4,%d] illegal CPU value %d", __LINE__, cpu));
1311 	KASSERT(ri >= 0 && ri < P4_NPMCS,
1312 	    ("[p4,%d] illegal row index %d", __LINE__, ri));
1313 
1314 	pd = &p4_pmcdesc[ri];
1315 	pc = p4_pcpu[P4_TO_HTT_PRIMARY(cpu)];
1316 	pm = pc->pc_p4pmcs[ri].phw_pmc;
1317 
1318 	KASSERT(pm != NULL,
1319 	    ("[p4,%d] null pmc for cpu%d, ri%d", __LINE__, cpu, ri));
1320 
1321 	PMCDBG2(MDP,STO,1, "p4-stop cpu=%d ri=%d", cpu, ri);
1322 
1323 	if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
1324 		wrmsr(pd->pm_cccr_msr,
1325 		    pm->pm_md.pm_p4.pm_p4_cccrvalue & ~P4_CCCR_ENABLE);
1326 		return (0);
1327 	}
1328 
1329 	/*
1330 	 * Thread mode PMCs.
1331 	 *
1332 	 * On HTT machines, this PMC may be in use by two threads
1333 	 * running on two logical CPUS.  Thus we look at the
1334 	 * 'runcount' field and only turn off the appropriate TO/T1
1335 	 * bits (and keep the PMC running) if two logical CPUs were
1336 	 * using the PMC.
1337 	 *
1338 	 */
1339 
1340 	/* bits to mask */
1341 	cccrtbits = P4_CCCR_OVF_PMI_T0;
1342 	escrtbits = P4_ESCR_T0_OS | P4_ESCR_T0_USR;
1343 	if (P4_CPU_IS_HTT_SECONDARY(cpu)) {
1344 		cccrtbits <<= 1;
1345 		escrtbits >>= 2;
1346 	}
1347 
1348 	mtx_lock_spin(&pc->pc_mtx);
1349 
1350 	rc = P4_PCPU_GET_RUNCOUNT(pc,ri);
1351 
1352 	KASSERT(rc == 2 || rc == 1,
1353 	    ("[p4,%d] illegal runcount cpu=%d ri=%d rc=%d", __LINE__, cpu, ri,
1354 		rc));
1355 
1356 	--rc;
1357 
1358 	P4_PCPU_SET_RUNCOUNT(pc,ri,rc);
1359 
1360 	/* Stop this PMC */
1361 	cccrvalue = rdmsr(pd->pm_cccr_msr);
1362 	wrmsr(pd->pm_cccr_msr, cccrvalue & ~P4_CCCR_ENABLE);
1363 
1364 	escrmsr   = pm->pm_md.pm_p4.pm_p4_escrmsr;
1365 	escrvalue = rdmsr(escrmsr);
1366 
1367 	/* The current CPU should be running on this PMC */
1368 	KASSERT(escrvalue & escrtbits,
1369 	    ("[p4,%d] ESCR T0/T1 mismatch cpu=%d rc=%d ri=%d escrmsr=0x%x "
1370 		"escrvalue=0x%x tbits=0x%x", __LINE__, cpu, rc, ri, escrmsr,
1371 		escrvalue, escrtbits));
1372 	KASSERT(PMC_IS_COUNTING_MODE(PMC_TO_MODE(pm)) ||
1373 	    (cccrvalue & cccrtbits),
1374 	    ("[p4,%d] CCCR T0/T1 mismatch cpu=%d ri=%d cccrvalue=0x%x "
1375 		"tbits=0x%x", __LINE__, cpu, ri, cccrvalue, cccrtbits));
1376 
1377 	/* get the current hardware reading */
1378 	tmp = rdmsr(pd->pm_pmc_msr);
1379 
1380 	if (rc == 1) {		/* need to keep the PMC running */
1381 		escrvalue &= ~escrtbits;
1382 		cccrvalue &= ~cccrtbits;
1383 		wrmsr(escrmsr, escrvalue);
1384 		wrmsr(pd->pm_cccr_msr, cccrvalue);
1385 	}
1386 
1387 	mtx_unlock_spin(&pc->pc_mtx);
1388 
1389 	PMCDBG5(MDP,STO,2, "p4-stop cpu=%d rc=%d ri=%d escrmsr=0x%x "
1390 	    "escrval=0x%x", cpu, rc, ri, escrmsr, escrvalue);
1391 	PMCDBG2(MDP,STO,2, "cccrval=0x%x v=%jx", cccrvalue, tmp);
1392 
1393 	if (tmp < P4_PCPU_HW_VALUE(pc,ri,cpu)) /* 40 bit counter overflow */
1394 		tmp += (P4_PERFCTR_MASK + 1) - P4_PCPU_HW_VALUE(pc,ri,cpu);
1395 	else
1396 		tmp -= P4_PCPU_HW_VALUE(pc,ri,cpu);
1397 
1398 	P4_PCPU_PMC_VALUE(pc,ri,cpu) += tmp;
1399 
1400 	return 0;
1401 }
1402 
1403 /*
1404  * Handle an interrupt.
1405  *
1406  * The hardware sets the CCCR_OVF whenever a counter overflow occurs,
1407  * so the handler examines all the 18 CCCR registers, processing the
1408  * counters that have overflowed.
1409  *
1410  * On HTT machines, the CCCR register is shared and will interrupt
1411  * both logical processors if so configured.  Thus multiple logical
1412  * CPUs could enter the NMI service routine at the same time.  These
1413  * will get serialized using a per-cpu spinlock dedicated for use in
1414  * the NMI handler.
1415  */
1416 
1417 static int
p4_intr(int cpu,struct trapframe * tf)1418 p4_intr(int cpu, struct trapframe *tf)
1419 {
1420 	uint32_t cccrval, ovf_mask, ovf_partner;
1421 	int did_interrupt, error, ri;
1422 	struct p4_cpu *pc;
1423 	struct pmc *pm;
1424 	pmc_value_t v;
1425 
1426 	PMCDBG3(MDP,INT, 1, "cpu=%d tf=0x%p um=%d", cpu, (void *) tf,
1427 	    TRAPF_USERMODE(tf));
1428 
1429 	pc = p4_pcpu[P4_TO_HTT_PRIMARY(cpu)];
1430 
1431 	ovf_mask = P4_CPU_IS_HTT_SECONDARY(cpu) ?
1432 	    P4_CCCR_OVF_PMI_T1 : P4_CCCR_OVF_PMI_T0;
1433 	ovf_mask |= P4_CCCR_OVF;
1434 	if (p4_system_has_htt)
1435 		ovf_partner = P4_CPU_IS_HTT_SECONDARY(cpu) ?
1436 		    P4_CCCR_OVF_PMI_T0 : P4_CCCR_OVF_PMI_T1;
1437 	else
1438 		ovf_partner = 0;
1439 	did_interrupt = 0;
1440 
1441 	if (p4_system_has_htt)
1442 		P4_PCPU_ACQ_INTR_SPINLOCK(pc);
1443 
1444 	/*
1445 	 * Loop through all CCCRs, looking for ones that have
1446 	 * interrupted this CPU.
1447 	 */
1448 	for (ri = 0; ri < P4_NPMCS; ri++) {
1449 
1450 		/*
1451 		 * Check if our partner logical CPU has already marked
1452 		 * this PMC has having interrupted it.  If so, reset
1453 		 * the flag and process the interrupt, but leave the
1454 		 * hardware alone.
1455 		 */
1456 		if (p4_system_has_htt && P4_PCPU_GET_INTRFLAG(pc,ri)) {
1457 			P4_PCPU_SET_INTRFLAG(pc,ri,0);
1458 			did_interrupt = 1;
1459 
1460 			/*
1461 			 * Ignore de-configured or stopped PMCs.
1462 			 * Ignore PMCs not in sampling mode.
1463 			 */
1464 			pm = pc->pc_p4pmcs[ri].phw_pmc;
1465 			if (pm == NULL ||
1466 			    pm->pm_state != PMC_STATE_RUNNING ||
1467 			    !PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) {
1468 				continue;
1469 			}
1470 			(void) pmc_process_interrupt(cpu, PMC_HR, pm, tf,
1471 			    TRAPF_USERMODE(tf));
1472 			continue;
1473 		}
1474 
1475 		/*
1476 		 * Fresh interrupt.  Look for the CCCR_OVF bit
1477 		 * and the OVF_Tx bit for this logical
1478 		 * processor being set.
1479 		 */
1480 		cccrval = rdmsr(P4_CCCR_MSR_FIRST + ri);
1481 
1482 		if ((cccrval & ovf_mask) != ovf_mask)
1483 			continue;
1484 
1485 		/*
1486 		 * If the other logical CPU would also have been
1487 		 * interrupted due to the PMC being shared, record
1488 		 * this fact in the per-cpu saved interrupt flag
1489 		 * bitmask.
1490 		 */
1491 		if (p4_system_has_htt && (cccrval & ovf_partner))
1492 			P4_PCPU_SET_INTRFLAG(pc, ri, 1);
1493 
1494 		v = rdmsr(P4_PERFCTR_MSR_FIRST + ri);
1495 
1496 		PMCDBG2(MDP,INT, 2, "ri=%d v=%jx", ri, v);
1497 
1498 		/* Stop the counter, and reset the overflow  bit */
1499 		cccrval &= ~(P4_CCCR_OVF | P4_CCCR_ENABLE);
1500 		wrmsr(P4_CCCR_MSR_FIRST + ri, cccrval);
1501 
1502 		did_interrupt = 1;
1503 
1504 		/*
1505 		 * Ignore de-configured or stopped PMCs.  Ignore PMCs
1506 		 * not in sampling mode.
1507 		 */
1508 		pm = pc->pc_p4pmcs[ri].phw_pmc;
1509 
1510 		if (pm == NULL ||
1511 		    pm->pm_state != PMC_STATE_RUNNING ||
1512 		    !PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) {
1513 			continue;
1514 		}
1515 
1516 		/*
1517 		 * Process the interrupt.  Re-enable the PMC if
1518 		 * processing was successful.
1519 		 */
1520 		error = pmc_process_interrupt(cpu, PMC_HR, pm, tf,
1521 		    TRAPF_USERMODE(tf));
1522 
1523 		/*
1524 		 * Only the first processor executing the NMI handler
1525 		 * in a HTT pair will restart a PMC, and that too
1526 		 * only if there were no errors.
1527 		 */
1528 		v = P4_RELOAD_COUNT_TO_PERFCTR_VALUE(
1529 			pm->pm_sc.pm_reloadcount);
1530 		wrmsr(P4_PERFCTR_MSR_FIRST + ri, v);
1531 		if (error == 0)
1532 			wrmsr(P4_CCCR_MSR_FIRST + ri,
1533 			    cccrval | P4_CCCR_ENABLE);
1534 	}
1535 
1536 	/* allow the other CPU to proceed */
1537 	if (p4_system_has_htt)
1538 		P4_PCPU_REL_INTR_SPINLOCK(pc);
1539 
1540 	/*
1541 	 * On Intel P4 CPUs, the PMC 'pcint' entry in the LAPIC gets
1542 	 * masked when a PMC interrupts the CPU.  We need to unmask
1543 	 * the interrupt source explicitly.
1544 	 */
1545 
1546 	if (did_interrupt)
1547 		lapic_reenable_pmc();
1548 
1549 	atomic_add_int(did_interrupt ? &pmc_stats.pm_intr_processed :
1550 	    &pmc_stats.pm_intr_ignored, 1);
1551 
1552 	return (did_interrupt);
1553 }
1554 
1555 /*
1556  * Describe a CPU's PMC state.
1557  */
1558 
1559 static int
p4_describe(int cpu,int ri,struct pmc_info * pi,struct pmc ** ppmc)1560 p4_describe(int cpu, int ri, struct pmc_info *pi,
1561     struct pmc **ppmc)
1562 {
1563 	int error;
1564 	size_t copied;
1565 	const struct p4pmc_descr *pd;
1566 
1567 	KASSERT(cpu >= 0 && cpu < pmc_cpu_max(),
1568 	    ("[p4,%d] illegal CPU %d", __LINE__, cpu));
1569 	KASSERT(ri >= 0 && ri < P4_NPMCS,
1570 	    ("[p4,%d] row-index %d out of range", __LINE__, ri));
1571 
1572 	PMCDBG2(MDP,OPS,1,"p4-describe cpu=%d ri=%d", cpu, ri);
1573 
1574 	if (P4_CPU_IS_HTT_SECONDARY(cpu))
1575 		return (EINVAL);
1576 
1577 	pd  = &p4_pmcdesc[ri];
1578 
1579 	if ((error = copystr(pd->pm_descr.pd_name, pi->pm_name,
1580 	    PMC_NAME_MAX, &copied)) != 0)
1581 		return (error);
1582 
1583 	pi->pm_class = pd->pm_descr.pd_class;
1584 
1585 	if (p4_pcpu[cpu]->pc_p4pmcs[ri].phw_state & PMC_PHW_FLAG_IS_ENABLED) {
1586 		pi->pm_enabled = TRUE;
1587 		*ppmc          = p4_pcpu[cpu]->pc_p4pmcs[ri].phw_pmc;
1588 	} else {
1589 		pi->pm_enabled = FALSE;
1590 		*ppmc          = NULL;
1591 	}
1592 
1593 	return (0);
1594 }
1595 
1596 /*
1597  * Get MSR# for use with RDPMC.
1598  */
1599 
1600 static int
p4_get_msr(int ri,uint32_t * msr)1601 p4_get_msr(int ri, uint32_t *msr)
1602 {
1603 	KASSERT(ri >= 0 && ri < P4_NPMCS,
1604 	    ("[p4,%d] ri %d out of range", __LINE__, ri));
1605 
1606 	*msr = p4_pmcdesc[ri].pm_pmc_msr - P4_PERFCTR_MSR_FIRST;
1607 
1608 	PMCDBG2(MDP,OPS, 1, "ri=%d getmsr=0x%x", ri, *msr);
1609 
1610 	return 0;
1611 }
1612 
1613 
1614 int
pmc_p4_initialize(struct pmc_mdep * md,int ncpus)1615 pmc_p4_initialize(struct pmc_mdep *md, int ncpus)
1616 {
1617 	struct pmc_classdep *pcd;
1618 	struct p4_event_descr *pe;
1619 
1620 	KASSERT(md != NULL, ("[p4,%d] md is NULL", __LINE__));
1621 	KASSERT(cpu_vendor_id == CPU_VENDOR_INTEL,
1622 	    ("[p4,%d] Initializing non-intel processor", __LINE__));
1623 
1624 	PMCDBG0(MDP,INI,1, "p4-initialize");
1625 
1626 	/* Allocate space for pointers to per-cpu descriptors. */
1627 	p4_pcpu = malloc(sizeof(*p4_pcpu) * ncpus, M_PMC, M_ZERO | M_WAITOK);
1628 
1629 	/* Fill in the class dependent descriptor. */
1630 	pcd = &md->pmd_classdep[PMC_MDEP_CLASS_INDEX_P4];
1631 
1632 	switch (md->pmd_cputype) {
1633 	case PMC_CPU_INTEL_PIV:
1634 
1635 		pcd->pcd_caps		= P4_PMC_CAPS;
1636 		pcd->pcd_class		= PMC_CLASS_P4;
1637 		pcd->pcd_num		= P4_NPMCS;
1638 		pcd->pcd_ri		= md->pmd_npmc;
1639 		pcd->pcd_width		= 40;
1640 
1641 		pcd->pcd_allocate_pmc	= p4_allocate_pmc;
1642 		pcd->pcd_config_pmc	= p4_config_pmc;
1643 		pcd->pcd_describe	= p4_describe;
1644 		pcd->pcd_get_config	= p4_get_config;
1645 		pcd->pcd_get_msr	= p4_get_msr;
1646 		pcd->pcd_pcpu_fini 	= p4_pcpu_fini;
1647 		pcd->pcd_pcpu_init    	= p4_pcpu_init;
1648 		pcd->pcd_read_pmc	= p4_read_pmc;
1649 		pcd->pcd_release_pmc	= p4_release_pmc;
1650 		pcd->pcd_start_pmc	= p4_start_pmc;
1651 		pcd->pcd_stop_pmc	= p4_stop_pmc;
1652 		pcd->pcd_write_pmc	= p4_write_pmc;
1653 
1654 		md->pmd_pcpu_fini	= NULL;
1655 		md->pmd_pcpu_init	= NULL;
1656 		md->pmd_intr	    	= p4_intr;
1657 		md->pmd_npmc	       += P4_NPMCS;
1658 
1659 		/* model specific configuration */
1660 		if ((cpu_id & 0xFFF) < 0xF27) {
1661 
1662 			/*
1663 			 * On P4 and Xeon with CPUID < (Family 15,
1664 			 * Model 2, Stepping 7), only one ESCR is
1665 			 * available for the IOQ_ALLOCATION event.
1666 			 */
1667 
1668 			pe = p4_find_event(PMC_EV_P4_IOQ_ALLOCATION);
1669 			pe->pm_escrs[1] = P4_ESCR_NONE;
1670 		}
1671 
1672 		break;
1673 
1674 	default:
1675 		KASSERT(0,("[p4,%d] Unknown CPU type", __LINE__));
1676 		return ENOSYS;
1677 	}
1678 
1679 	return (0);
1680 }
1681 
1682 void
pmc_p4_finalize(struct pmc_mdep * md)1683 pmc_p4_finalize(struct pmc_mdep *md)
1684 {
1685 #if	defined(INVARIANTS)
1686 	int i, ncpus;
1687 #endif
1688 
1689 	KASSERT(p4_pcpu != NULL,
1690 	    ("[p4,%d] NULL p4_pcpu", __LINE__));
1691 
1692 #if	defined(INVARIANTS)
1693 	ncpus = pmc_cpu_max();
1694 	for (i = 0; i < ncpus; i++)
1695 		KASSERT(p4_pcpu[i] == NULL, ("[p4,%d] non-null pcpu %d",
1696 		    __LINE__, i));
1697 #endif
1698 
1699 	free(p4_pcpu, M_PMC);
1700 	p4_pcpu = NULL;
1701 }
1702