1 /*        $NetBSD: hypervisor_machdep.c,v 1.46 2023/03/01 08:13:44 riastradh Exp $        */
2 
3 /*
4  *
5  * Copyright (c) 2004 Christian Limpach.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /******************************************************************************
30  * hypervisor.c
31  *
32  * Communication to/from hypervisor.
33  *
34  * Copyright (c) 2002-2004, K A Fraser
35  *
36  * Permission is hereby granted, free of charge, to any person obtaining a copy
37  * of this software and associated documentation files (the "Software"), to
38  * deal in the Software without restriction, including without limitation the
39  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
40  * sell copies of the Software, and to permit persons to whom the Software is
41  * furnished to do so, subject to the following conditions:
42  *
43  * The above copyright notice and this permission notice shall be included in
44  * all copies or substantial portions of the Software.
45  *
46  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
47  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
48  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
49  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
50  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
51  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
52  * DEALINGS IN THE SOFTWARE.
53  */
54 
55 
56 #include <sys/cdefs.h>
57 __KERNEL_RCSID(0, "$NetBSD: hypervisor_machdep.c,v 1.46 2023/03/01 08:13:44 riastradh Exp $");
58 
59 #include <sys/param.h>
60 #include <sys/systm.h>
61 #include <sys/kmem.h>
62 #include <sys/cpu.h>
63 #include <sys/ksyms.h>
64 
65 #include <uvm/uvm_extern.h>
66 
67 #include <machine/vmparam.h>
68 #include <machine/pmap.h>
69 #include <machine/pmap_private.h>
70 
71 #include <x86/machdep.h>
72 #include <x86/cpuvar.h>
73 
74 #include <xen/xen.h>
75 #include <xen/intr.h>
76 #include <xen/hypervisor.h>
77 #include <xen/evtchn.h>
78 #include <xen/xenpmap.h>
79 
80 #include "opt_xen.h"
81 #include "opt_modular.h"
82 #include "opt_ddb.h"
83 #include "isa.h"
84 #include "pci.h"
85 #include "ksyms.h"
86 
87 #ifdef DDB
88 #include <machine/db_machdep.h>
89 #include <ddb/db_extern.h>
90 #include <ddb/db_output.h>
91 #include <ddb/db_interface.h>
92 #endif
93 
#ifdef XENPV
/*
 * Arch-dependent p2m frame-list pages (L3 and L2 levels), used by Xen
 * for save/restore mappings.  Both are allocated once in
 * build_p2m_frame_list_list() and (re)filled by
 * update_p2m_frame_list_list().
 */
static unsigned long *l3_p2m_page;
static unsigned long *l2_p2m_page;
static int l2_p2m_page_size;	/* size of L2 page, in pages */

static void build_p2m_frame_list_list(void);
static void update_p2m_frame_list_list(void);
#endif /* XENPV */
107 
108 // #define PORT_DEBUG 4
109 // #define EARLY_DEBUG_EVENT
110 
/*
 * Callback type used by evt_iterate_bits().  It is invoked once per
 * selected event with, in order: the event port, the first-level (word)
 * index, the second-level (bit) index, and the caller's opaque argument.
 */
typedef void (*iterate_func_t)(unsigned int port, unsigned int l1i,
    unsigned int l2i, void *args);
114 
115 static inline void
evt_iterate_bits(volatile unsigned long * pendingl1,volatile unsigned long * pendingl2,volatile unsigned long * mask,iterate_func_t iterate_pending,void * iterate_args)116 evt_iterate_bits(volatile unsigned long *pendingl1,
117                      volatile unsigned long *pendingl2,
118                      volatile unsigned long *mask,
119                      iterate_func_t iterate_pending, void *iterate_args)
120 {
121 
122           KASSERT(pendingl1 != NULL);
123           KASSERT(pendingl2 != NULL);
124 
125           unsigned long l1, l2;
126           unsigned int l1i, l2i, port;
127 
128           l1 = xen_atomic_xchg(pendingl1, 0);
129           while ((l1i = xen_ffs(l1)) != 0) {
130                     l1i--;
131                     l1 &= ~(1UL << l1i);
132 
133                     l2 = pendingl2[l1i] & (mask != NULL ? ~mask[l1i] : -1UL);
134                     l2 &= curcpu()->ci_evtmask[l1i];
135 
136                     if (mask != NULL) xen_atomic_setbits_l(&mask[l1i], l2);
137                     xen_atomic_clearbits_l(&pendingl2[l1i], l2);
138 
139                     while ((l2i = xen_ffs(l2)) != 0) {
140                               l2i--;
141                               l2 &= ~(1UL << l2i);
142 
143                               port = (l1i << LONG_SHIFT) + l2i;
144 
145                               iterate_pending(port, l1i, l2i, iterate_args);
146                     }
147           }
148 }
149 
150 /*
151  * Set per-cpu "pending" information for outstanding events that
152  * cannot be processed now.
153  */
154 
155 static inline void
evt_set_pending(unsigned int port,unsigned int l1i,unsigned int l2i,void * args)156 evt_set_pending(unsigned int port, unsigned int l1i,
157                     unsigned int l2i, void *args)
158 {
159 
160           KASSERT(args != NULL);
161 
162           int *ret = args;
163           struct intrhand *ih;
164 
165           if (evtsource[port]) {
166                     hypervisor_set_ipending(evtsource[port]->ev_imask, l1i, l2i);
167                     evtsource[port]->ev_evcnt.ev_count++;
168                     ih = evtsource[port]->ev_handlers;
169                     while (ih != NULL) {
170                               ih->ih_pending++;
171                               ih = ih->ih_evt_next;
172                     }
173 
174                     if (*ret == 0 && curcpu()->ci_ilevel <
175                         evtsource[port]->ev_maxlevel)
176                               *ret = 1;
177           }
178 #ifdef DOM0OPS
179           else  {
180                     /* set pending event */
181                     xenevt_setipending(l1i, l2i);
182           }
183 #endif
184 }
185 
186 int stipending(void);
187 int
stipending(void)188 stipending(void)
189 {
190           volatile shared_info_t *s = HYPERVISOR_shared_info;
191           struct cpu_info *ci;
192           volatile struct vcpu_info *vci;
193           int ret;
194 
195           kpreempt_disable();
196 
197           ret = 0;
198           ci = curcpu();
199           vci = ci->ci_vcpu;
200 
201 #if 0
202           if (HYPERVISOR_shared_info->events)
203                     printf("stipending events %08lx mask %08lx ilevel %d\n",
204                         HYPERVISOR_shared_info->events,
205                         HYPERVISOR_shared_info->events_mask, ci->ci_ilevel);
206 #endif
207 
208 #ifdef EARLY_DEBUG_EVENT
209           if (xen_atomic_test_bit(&s->evtchn_pending[0], debug_port)) {
210                     xen_debug_handler(NULL);
211                     xen_atomic_clear_bit(&s->evtchn_pending[0], debug_port);
212           }
213 #endif
214 
215           /*
216            * we're only called after STIC, so we know that we'll have to
217            * STI at the end
218            */
219 
220           while (vci->evtchn_upcall_pending) {
221                     x86_disable_intr();
222 
223                     vci->evtchn_upcall_pending = 0;
224 
225                     evt_iterate_bits(&vci->evtchn_pending_sel,
226                         s->evtchn_pending, s->evtchn_mask,
227                         evt_set_pending, &ret);
228 
229                     x86_enable_intr();
230           }
231 
232           kpreempt_enable();
233 
234           return (ret);
235 }
236 
237 /* Iterate through pending events and call the event handler */
238 
239 static inline void
evt_do_hypervisor_callback(unsigned int port,unsigned int l1i,unsigned int l2i,void * args)240 evt_do_hypervisor_callback(unsigned int port, unsigned int l1i,
241                                  unsigned int l2i, void *args)
242 {
243           KASSERT(args != NULL);
244 
245 #ifdef DOM0OPS
246           struct cpu_info *ci = curcpu();
247 #endif
248           struct intrframe *regs = args;
249 
250 #ifdef PORT_DEBUG
251           if (port == PORT_DEBUG)
252                     printf("do_hypervisor_callback event %d\n", port);
253 #endif
254           if (evtsource[port]) {
255                     KASSERT(cpu_intr_p());
256                     evtchn_do_event(port, regs);
257           }
258 #ifdef DOM0OPS
259           else  {
260                     if (ci->ci_ilevel < IPL_HIGH) {
261                               /* fast path */
262                               int oipl = ci->ci_ilevel;
263                               ci->ci_ilevel = IPL_HIGH;
264                               KASSERT(cpu_intr_p());
265                               xenevt_event(port);
266                               ci->ci_ilevel = oipl;
267                     } else {
268                               /* set pending event */
269                               xenevt_setipending(l1i, l2i);
270                     }
271           }
272 #endif
273 }
274 
275 void
do_hypervisor_callback(struct intrframe * regs)276 do_hypervisor_callback(struct intrframe *regs)
277 {
278           volatile shared_info_t *s = HYPERVISOR_shared_info;
279           struct cpu_info *ci;
280           volatile struct vcpu_info *vci;
281           uint64_t level __diagused;
282 
283           ci = curcpu();
284           vci = ci->ci_vcpu;
285           level = ci->ci_ilevel;
286 
287           /* Save trapframe for clock handler */
288           KASSERT(regs != NULL);
289           ci->ci_xen_clockf_usermode = USERMODE(regs->_INTRFRAME_CS);
290           ci->ci_xen_clockf_pc = regs->_INTRFRAME_IP;
291 
292           // DDD printf("do_hypervisor_callback\n");
293 
294 #ifdef EARLY_DEBUG_EVENT
295           if (xen_atomic_test_bit(&s->evtchn_pending[0], debug_port)) {
296                     xen_debug_handler(NULL);
297                     xen_atomic_clear_bit(&s->evtchn_pending[0], debug_port);
298           }
299 #endif
300 
301           while (vci->evtchn_upcall_pending) {
302                     vci->evtchn_upcall_pending = 0;
303 
304                     evt_iterate_bits(&vci->evtchn_pending_sel,
305                         s->evtchn_pending, s->evtchn_mask,
306                         evt_do_hypervisor_callback, regs);
307           }
308 
309 #ifdef DIAGNOSTIC
310           if (level != ci->ci_ilevel)
311                     printf("hypervisor done %08x level %" PRIu64 "/%" PRIu64 " ipending %0" PRIx64 "\n",
312                         (uint)vci->evtchn_pending_sel,
313                         level, (uint64_t)ci->ci_ilevel, (uint64_t)ci->ci_ipending);
314 #endif
315 }
316 
#if 0
/*
 * hypervisor_send_event()
 *
 *	Currently compiled out.  Marks event ev pending and unmasked in
 *	the shared info page, then either forces a callback (when ci is
 *	the current CPU) or sends XEN_IPI_HVCB to ci, panicking if the
 *	IPI cannot be sent.
 */
void
hypervisor_send_event(struct cpu_info *ci, unsigned int ev)
{
	volatile shared_info_t *s = HYPERVISOR_shared_info;
	volatile struct vcpu_info *vci;

	KASSERT(ci != NULL);
	vci = ci->ci_vcpu;

#ifdef PORT_DEBUG
	if (ev == PORT_DEBUG)
		printf("hypervisor_send_event %d\n", ev);
#endif

	xen_atomic_set_bit(&s->evtchn_pending[0], ev);

	if (__predict_false(ci == curcpu())) {
		xen_atomic_set_bit(&vci->evtchn_pending_sel,
		    ev >> LONG_SHIFT);
		xen_atomic_set_bit(&vci->evtchn_upcall_pending, 0);
	}

	xen_atomic_clear_bit(&s->evtchn_mask[0], ev);

	if (__predict_true(ci == curcpu())) {
		hypervisor_force_callback();
		return;
	}

	if (__predict_false(xen_send_ipi(ci, XEN_IPI_HVCB))) {
		panic("xen_send_ipi(cpu%d id %d, XEN_IPI_HVCB) failed\n",
		    (int) ci->ci_cpuid, ci->ci_vcpuid);
	}
}
#endif
351 
352 void
hypervisor_unmask_event(unsigned int ev)353 hypervisor_unmask_event(unsigned int ev)
354 {
355 
356           KASSERT(ev > 0 && ev < NR_EVENT_CHANNELS);
357 
358 #ifdef PORT_DEBUG
359           if (ev == PORT_DEBUG)
360                     printf("hypervisor_unmask_event %d\n", ev);
361 #endif
362 
363           /* Xen unmasks the evtchn_mask[0]:ev bit for us. */
364           evtchn_op_t op;
365           op.cmd = EVTCHNOP_unmask;
366           op.u.unmask.port = ev;
367           if (HYPERVISOR_event_channel_op(&op) != 0)
368                     panic("Failed to unmask event %d\n", ev);
369 
370           return;
371 }
372 
373 void
hypervisor_mask_event(unsigned int ev)374 hypervisor_mask_event(unsigned int ev)
375 {
376           volatile shared_info_t *s = HYPERVISOR_shared_info;
377 #ifdef PORT_DEBUG
378           if (ev == PORT_DEBUG)
379                     printf("hypervisor_mask_event %d\n", ev);
380 #endif
381 
382           xen_atomic_set_bit(&s->evtchn_mask[0], ev);
383 }
384 
385 void
hypervisor_clear_event(unsigned int ev)386 hypervisor_clear_event(unsigned int ev)
387 {
388           volatile shared_info_t *s = HYPERVISOR_shared_info;
389 #ifdef PORT_DEBUG
390           if (ev == PORT_DEBUG)
391                     printf("hypervisor_clear_event %d\n", ev);
392 #endif
393 
394           xen_atomic_clear_bit(&s->evtchn_pending[0], ev);
395 }
396 
397 static inline void
evt_enable_event(unsigned int port,unsigned int l1i,unsigned int l2i,void * args)398 evt_enable_event(unsigned int port, unsigned int l1i,
399                      unsigned int l2i, void *args)
400 {
401           KASSERT(args == NULL);
402           hypervisor_unmask_event(port);
403 #if defined(XENPV) && (NPCI > 0 || NISA > 0)
404           hypervisor_ack_pirq_event(port);
405 #endif /* NPCI > 0 || NISA > 0 */
406 }
407 
408 void
hypervisor_enable_sir(unsigned int sir)409 hypervisor_enable_sir(unsigned int sir)
410 {
411           struct cpu_info *ci = curcpu();
412 
413           /*
414            * enable all events for ipl. As we only set an event in ipl_evt_mask
415            * for its lowest IPL, and pending IPLs are processed high to low,
416            * we know that all callback for this event have been processed.
417            */
418 
419           evt_iterate_bits(&ci->ci_isources[sir]->ipl_evt_mask1,
420               ci->ci_isources[sir]->ipl_evt_mask2, NULL,
421               evt_enable_event, NULL);
422 
423 }
424 
425 void
hypervisor_set_ipending(uint64_t imask,int l1,int l2)426 hypervisor_set_ipending(uint64_t imask, int l1, int l2)
427 {
428 
429           /* This function is not re-entrant */
430           KASSERT(x86_read_psl() != 0);
431 
432           int sir;
433           struct cpu_info *ci = curcpu();
434 
435           /* set pending bit for the appropriate IPLs */
436           ci->ci_ipending |= imask;
437 
438           /*
439            * And set event pending bit for the lowest IPL. As IPL are handled
440            * from high to low, this ensure that all callbacks will have been
441            * called when we ack the event
442            */
443           sir = ffs(imask);
444           KASSERT(sir > SIR_XENIPL_VM);
445           sir--;
446           KASSERT(sir <= SIR_XENIPL_HIGH);
447           KASSERT(ci->ci_isources[sir] != NULL);
448           ci->ci_isources[sir]->ipl_evt_mask1 |= 1UL << l1;
449           ci->ci_isources[sir]->ipl_evt_mask2[l1] |= 1UL << l2;
450           KASSERT(ci == curcpu());
451 #if 0
452           if (__predict_false(ci != curcpu())) {
453                     if (xen_send_ipi(ci, XEN_IPI_HVCB)) {
454                               panic("hypervisor_set_ipending: "
455                                   "xen_send_ipi(cpu%d id %d, XEN_IPI_HVCB) failed\n",
456                                   (int) ci->ci_cpuid, ci->ci_vcpuid);
457                     }
458           }
459 #endif
460 }
461 
/*
 * hypervisor_machdep_attach()
 *
 *	Machine-dependent attach hook.  On XENPV kernels running as a
 *	domain other than dom0, build the p2m frame list and set up the
 *	suspend sysctl; otherwise do nothing.
 */
void
hypervisor_machdep_attach(void)
{
#ifdef XENPV
	/* dom0 does not require the arch-dependent P2M translation table */
	if (xendomain_is_dom0())
		return;

	build_p2m_frame_list_list();
	sysctl_xen_suspend_setup();
#endif
}
473 
/*
 * hypervisor_machdep_resume()
 *
 *	Machine-dependent resume hook.  On XENPV kernels running as a
 *	domain other than dom0, refresh the p2m frame list; otherwise do
 *	nothing.
 */
void
hypervisor_machdep_resume(void)
{
#ifdef XENPV
	/* dom0 does not require the arch-dependent P2M translation table */
	if (xendomain_is_dom0())
		return;

	update_p2m_frame_list_list();
#endif
}
483 
484 /*
485  * idle_block()
486  *
487  *        Called from the idle loop when we have nothing to do but wait
488  *        for an interrupt.
489  */
490 static void
idle_block(void)491 idle_block(void)
492 {
493           KASSERT(curcpu()->ci_ipending == 0);
494           HYPERVISOR_block();
495           KASSERT(curcpu()->ci_ipending == 0);
496 }
497 
498 void
x86_cpu_idle_xen(void)499 x86_cpu_idle_xen(void)
500 {
501           struct cpu_info *ci = curcpu();
502 
503           KASSERT(ci->ci_ilevel == IPL_NONE);
504 
505           x86_disable_intr();
506           if (__predict_false(!ci->ci_want_resched)) {
507                     idle_block();
508           } else {
509                     x86_enable_intr();
510           }
511 }
512 
513 #ifdef XENPV
514 /*
515  * Generate the p2m_frame_list_list table,
516  * needed for guest save/restore
517  */
518 static void
build_p2m_frame_list_list(void)519 build_p2m_frame_list_list(void)
520 {
521         int fpp; /* number of page (frame) pointer per page */
522         unsigned long max_pfn;
523         /*
524          * The p2m list is composed of three levels of indirection,
525          * each layer containing MFNs pointing to lower level pages
526          * The indirection is used to convert a given PFN to its MFN
527          * Each N level page can point to @fpp (N-1) level pages
528          * For example, for x86 32bit, we have:
529          * - PAGE_SIZE: 4096 bytes
530          * - fpp: 1024 (one L3 page can address 1024 L2 pages)
531          * A L1 page contains the list of MFN we are looking for
532          */
533         max_pfn = xen_start_info.nr_pages;
534         fpp = PAGE_SIZE / sizeof(xen_pfn_t);
535 
536         /* we only need one L3 page */
537         l3_p2m_page = (vaddr_t *)uvm_km_alloc(kernel_map, PAGE_SIZE,
538               PAGE_SIZE, UVM_KMF_WIRED | UVM_KMF_NOWAIT);
539         if (l3_p2m_page == NULL)
540                 panic("could not allocate memory for l3_p2m_page");
541 
542         /*
543          * Determine how many L2 pages we need for the mapping
544          * Each L2 can map a total of @fpp L1 pages
545          */
546         l2_p2m_page_size = howmany(max_pfn, fpp);
547 
548         l2_p2m_page = (vaddr_t *)uvm_km_alloc(kernel_map,
549               l2_p2m_page_size * PAGE_SIZE,
550               PAGE_SIZE, UVM_KMF_WIRED | UVM_KMF_NOWAIT);
551         if (l2_p2m_page == NULL)
552                 panic("could not allocate memory for l2_p2m_page");
553 
554         /* We now have L3 and L2 pages ready, update L1 mapping */
555         update_p2m_frame_list_list();
556 
557 }
558 
559 /*
560  * Update the L1 p2m_frame_list_list mapping (during guest boot or resume)
561  */
562 static void
update_p2m_frame_list_list(void)563 update_p2m_frame_list_list(void)
564 {
565         int i;
566         int fpp; /* number of page (frame) pointer per page */
567         unsigned long max_pfn;
568 
569         max_pfn = xen_start_info.nr_pages;
570         fpp = PAGE_SIZE / sizeof(xen_pfn_t);
571 
572         for (i = 0; i < l2_p2m_page_size; i++) {
573                 /*
574                  * Each time we start a new L2 page,
575                  * store its MFN in the L3 page
576                  */
577                 if ((i % fpp) == 0) {
578                         l3_p2m_page[i/fpp] = vtomfn(
579                                 (vaddr_t)&l2_p2m_page[i]);
580                 }
581                 /*
582                  * we use a shortcut
583                  * since @xpmap_phys_to_machine_mapping array
584                  * already contains PFN to MFN mapping, we just
585                  * set the l2_p2m_page MFN pointer to the MFN of the
586                  * according frame of @xpmap_phys_to_machine_mapping
587                  */
588                 l2_p2m_page[i] = vtomfn((vaddr_t)
589                         &xpmap_phys_to_machine_mapping[i*fpp]);
590         }
591 
592         HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
593                                         vtomfn((vaddr_t)l3_p2m_page);
594         HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
595 
596 }
597 #endif /* XENPV */
598 
/*
 * xen_init_ksyms()
 *
 *	Register the kernel symbol table with ksyms.  Does nothing
 *	unless the kernel is built with ksyms, DDB or MODULAR.
 *
 *	The int stored at `end' and the address just after it are passed
 *	to ksyms_addsyms_elf() along with esym (presumably the symbol
 *	table size, start and end -- confirm against ksyms_addsyms_elf()).
 */
void
xen_init_ksyms(void)
{
#if NKSYMS || defined(DDB) || defined(MODULAR)
	extern int end;
	extern int *esym;
	int *symhdr = (int *)(void *)&end;

#ifdef DDB
	db_machine_init();
#endif

#ifdef XENPV
	/* Symbols end where the first module, or else the MFN list, starts. */
	if (xen_start_info.mod_start)
		esym = (void *)xen_start_info.mod_start;
	else
		esym = (void *)xen_start_info.mfn_list;
#endif /* XENPV */
	/* for PVH, esym is set in locore.S */
	ksyms_addsyms_elf(*symhdr, symhdr + 1, esym);
#endif
}
619