ia64/xen-unstable

view xen/arch/ia64/xen/domain.c @ 19107:696351cde9a4

Allow memflags to be specified to alloc_xenheap_pages().

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Jan 28 16:58:41 2009 +0000 (2009-01-28)
parents 7df072566b8c
children 0858f961c77a
line source
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 *
12 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
13 * VA Linux Systems Japan K.K.
14 * dom0 vp model support
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/errno.h>
21 #include <xen/sched.h>
22 #include <xen/smp.h>
23 #include <xen/delay.h>
24 #include <xen/softirq.h>
25 #include <xen/mm.h>
26 #include <xen/iocap.h>
27 #include <asm/asm-xsi-offsets.h>
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <asm/processor.h>
31 #include <xen/event.h>
32 #include <xen/console.h>
33 #include <xen/version.h>
34 #include <xen/libelf.h>
35 #include <asm/pgalloc.h>
36 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
37 #include <asm/vcpu.h> /* for function declarations */
38 #include <public/xen.h>
39 #include <xen/domain.h>
40 #include <asm/vmx.h>
41 #include <asm/vmx_vcpu.h>
42 #include <asm/vmx_vpd.h>
43 #include <asm/vmx_phy_mode.h>
44 #include <asm/vmx_vcpu_save.h>
45 #include <asm/vhpt.h>
46 #include <asm/vcpu.h>
47 #include <asm/tlbflush.h>
48 #include <asm/regionreg.h>
49 #include <asm/dom_fw.h>
50 #include <asm/shadow.h>
51 #include <xen/guest_access.h>
52 #include <asm/tlb_track.h>
53 #include <asm/perfmon.h>
54 #include <asm/sal.h>
55 #include <public/vcpu.h>
56 #include <linux/cpu.h>
57 #include <linux/notifier.h>
58 #include <asm/debugger.h>
60 /* dom0_size: default memory allocation for dom0 (~4GB) */
61 static unsigned long __initdata dom0_size = 4096UL*1024UL*1024UL;
63 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
64 static unsigned int __initdata dom0_max_vcpus = 4;
65 integer_param("dom0_max_vcpus", dom0_max_vcpus);
67 extern char dom0_command_line[];
69 /* forward declaration */
70 static void init_switch_stack(struct vcpu *v);
72 /* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
73 This is a Xen virtual address. */
74 DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
75 DEFINE_PER_CPU(int *, current_psr_ic_addr);
77 DEFINE_PER_CPU(struct vcpu *, fp_owner);
79 #include <xen/sched-if.h>
81 static void flush_vtlb_for_context_switch(struct vcpu* prev, struct vcpu* next)
82 {
83 int cpu = smp_processor_id();
84 int last_vcpu_id, last_processor;
86 if (!is_idle_domain(prev->domain))
87 tlbflush_update_time
88 (&prev->domain->arch.last_vcpu[cpu].tlbflush_timestamp,
89 tlbflush_current_time());
91 if (is_idle_domain(next->domain))
92 return;
94 last_vcpu_id = next->domain->arch.last_vcpu[cpu].vcpu_id;
95 last_processor = next->arch.last_processor;
97 next->domain->arch.last_vcpu[cpu].vcpu_id = next->vcpu_id;
98 next->arch.last_processor = cpu;
100 if ((last_vcpu_id != next->vcpu_id &&
101 last_vcpu_id != INVALID_VCPU_ID) ||
102 (last_vcpu_id == next->vcpu_id &&
103 last_processor != cpu &&
104 last_processor != INVALID_PROCESSOR)) {
105 #ifdef CONFIG_XEN_IA64_TLBFLUSH_CLOCK
106 u32 last_tlbflush_timestamp =
107 next->domain->arch.last_vcpu[cpu].tlbflush_timestamp;
108 #endif
109 int vhpt_is_flushed = 0;
111 // If the vTLB implementation is changed,
112 // the following must be updated as well.
113 if (VMX_DOMAIN(next)) {
114 // The vTLB for a VT-i domain is currently per-vcpu,
115 // so no flushing is needed.
116 } else if (HAS_PERVCPU_VHPT(next->domain)) {
117 // nothing to do
118 } else {
119 if (NEED_FLUSH(__get_cpu_var(vhpt_tlbflush_timestamp),
120 last_tlbflush_timestamp)) {
121 local_vhpt_flush();
122 vhpt_is_flushed = 1;
123 }
124 }
125 if (vhpt_is_flushed || NEED_FLUSH(__get_cpu_var(tlbflush_time),
126 last_tlbflush_timestamp)) {
127 local_flush_tlb_all();
128 perfc_incr(tlbflush_clock_cswitch_purge);
129 } else {
130 perfc_incr(tlbflush_clock_cswitch_skip);
131 }
132 perfc_incr(flush_vtlb_for_context_switch);
133 }
134 }
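/*
 * Editorial sketch (not part of the original file) of the tlbflush-clock
 * idea used above: each pcpu keeps a clock that advances whenever its TLB
 * and VHPT are purged, and every potentially-stale object records the
 * clock value at which its entries were created.  NEED_FLUSH() then only
 * requests a purge when the recorded stamp is not older than the pcpu's
 * last purge.  The names below are invented for the sketch and ignore the
 * clock-wraparound handling of the real implementation.
 *
 *     static int sketch_need_flush(u32 last_purge_stamp, u32 obj_stamp)
 *     {
 *         return obj_stamp >= last_purge_stamp;
 *     }
 */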
136 static void flush_cache_for_context_switch(struct vcpu *next)
137 {
138 extern cpumask_t cpu_cache_coherent_map;
139 int cpu = smp_processor_id();
141 if (is_idle_vcpu(next) ||
142 __test_and_clear_bit(cpu, &next->arch.cache_coherent_map)) {
143 if (cpu_test_and_clear(cpu, cpu_cache_coherent_map)) {
144 unsigned long flags;
145 u64 progress = 0;
146 s64 status;
148 local_irq_save(flags);
149 status = ia64_pal_cache_flush(4, 0, &progress, NULL);
150 local_irq_restore(flags);
151 if (status != 0)
152 panic_domain(NULL, "PAL_CACHE_FLUSH ERROR, "
153 "cache_type=4 status %lx", status);
154 }
155 }
156 }
158 static void set_current_psr_i_addr(struct vcpu* v)
159 {
160 __ia64_per_cpu_var(current_psr_i_addr) =
161 (uint8_t*)(v->domain->arch.shared_info_va +
162 INT_ENABLE_OFFSET(v));
163 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
164 (v->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
165 }
167 static void clear_current_psr_i_addr(void)
168 {
169 __ia64_per_cpu_var(current_psr_i_addr) = NULL;
170 __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
171 }
173 static void lazy_fp_switch(struct vcpu *prev, struct vcpu *next)
174 {
175 /*
176 * Implement eager save, lazy restore
177 */
178 if (!is_idle_vcpu(prev)) {
179 if (VMX_DOMAIN(prev)) {
180 if (FP_PSR(prev) & IA64_PSR_MFH) {
181 __ia64_save_fpu(prev->arch._thread.fph);
182 __ia64_per_cpu_var(fp_owner) = prev;
183 }
184 } else {
185 if (PSCB(prev, hpsr_mfh)) {
186 __ia64_save_fpu(prev->arch._thread.fph);
187 __ia64_per_cpu_var(fp_owner) = prev;
188 }
189 }
190 }
192 if (!is_idle_vcpu(next)) {
193 if (VMX_DOMAIN(next)) {
194 FP_PSR(next) = IA64_PSR_DFH;
195 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
196 } else {
197 PSCB(next, hpsr_dfh) = 1;
198 PSCB(next, hpsr_mfh) = 0;
199 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
200 }
201 }
202 }
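/*
 * Editorial note (not part of the original file): lazy_fp_switch()
 * implements "eager save, lazy restore" for the high FP partition
 * (f32-f127): on switch-out the registers are saved only if the previous
 * vcpu actually modified them (psr.mfh set), and on switch-in psr.dfh is
 * set so that the next vcpu's first fph access raises a Disabled
 * FP-Register fault; the fault path (elsewhere in the tree) reloads fph,
 * consulting the per-cpu fp_owner recorded above to avoid a redundant
 * reload when the registers are still live on this pcpu.
 */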
204 static void load_state(struct vcpu *v)
205 {
206 load_region_regs(v);
207 ia64_set_pta(vcpu_pta(v));
208 vcpu_load_kernel_regs(v);
209 if (vcpu_pkr_in_use(v))
210 vcpu_pkr_load_regs(v);
211 set_current_psr_i_addr(v);
212 }
214 void schedule_tail(struct vcpu *prev)
215 {
216 extern char ia64_ivt;
218 context_saved(prev);
220 if (VMX_DOMAIN(current))
221 vmx_do_resume(current);
222 else {
223 if (VMX_DOMAIN(prev))
224 ia64_set_iva(&ia64_ivt);
225 load_state(current);
226 migrate_timer(&current->arch.hlt_timer, current->processor);
227 }
228 flush_vtlb_for_context_switch(prev, current);
229 }
231 void context_switch(struct vcpu *prev, struct vcpu *next)
232 {
233 uint64_t spsr;
235 local_irq_save(spsr);
237 if (VMX_DOMAIN(prev)) {
238 vmx_save_state(prev);
239 if (!VMX_DOMAIN(next)) {
240 /* VMX domains can change the physical cr.dcr.
241 * Restore default to prevent leakage. */
242 uint64_t dcr = ia64_getreg(_IA64_REG_CR_DCR);
243 /* xenoprof:
244 * don't change psr.pp.
245 * It is manipulated by xenoprof.
246 */
247 dcr = (IA64_DEFAULT_DCR_BITS & ~IA64_DCR_PP) | (dcr & IA64_DCR_PP);
248 ia64_setreg(_IA64_REG_CR_DCR, dcr);
249 }
250 }
252 lazy_fp_switch(prev, current);
254 if (prev->arch.dbg_used || next->arch.dbg_used) {
255 /*
256 * Load the debug registers either because they are valid or to clear
257 * the previous ones.
258 */
259 ia64_load_debug_regs(next->arch.dbr);
260 }
262 /*
263 * Disable the VHPT walker.
264 * ia64_switch_to() might cause a VHPT fault because it flushes
265 * dtr[IA64_TR_VHPT] and reinserts the mapping with dtr[IA64_TR_STACK].
266 * (VHPT_SIZE_LOG2 << 2) is just for avoiding a
267 * Reserved Register/Field fault.
268 */
269 ia64_set_pta(VHPT_SIZE_LOG2 << 2);
270 prev = ia64_switch_to(next);
272 /* Note: ia64_switch_to does not return here at vcpu initialization. */
274 if (VMX_DOMAIN(current)) {
275 vmx_load_state(current);
276 } else {
277 extern char ia64_ivt;
279 if (VMX_DOMAIN(prev))
280 ia64_set_iva(&ia64_ivt);
282 if (!is_idle_vcpu(current)) {
283 load_state(current);
284 vcpu_set_next_timer(current);
285 if (vcpu_timer_expired(current))
286 vcpu_pend_timer(current);
287 /* steal time accounting */
288 if (!guest_handle_is_null(runstate_guest(current)))
289 __copy_to_guest(runstate_guest(current), &current->runstate, 1);
290 } else {
291 /* When switching to the idle domain, we only need to disable the
292 * vhpt walker. All accesses that happen within the idle context will
293 * then be handled by the TR mapping and the identity mapping.
294 */
295 clear_current_psr_i_addr();
296 }
297 }
298 local_irq_restore(spsr);
300 /* lazy fp */
301 if (current->processor != current->arch.last_processor) {
302 unsigned long *addr;
303 addr = (unsigned long *)per_cpu_addr(fp_owner,
304 current->arch.last_processor);
305 ia64_cmpxchg(acq, addr, current, 0, 8);
306 }
308 flush_vtlb_for_context_switch(prev, current);
309 flush_cache_for_context_switch(current);
310 context_saved(prev);
311 }
313 void continue_running(struct vcpu *same)
314 {
315 /* nothing to do */
316 }
318 #ifdef CONFIG_PERFMON
319 static int pal_halt = 1;
320 static int can_do_pal_halt = 1;
322 static int __init nohalt_setup(char * str)
323 {
324 pal_halt = can_do_pal_halt = 0;
325 return 1;
326 }
327 __setup("nohalt", nohalt_setup);
329 void
330 update_pal_halt_status(int status)
331 {
332 can_do_pal_halt = pal_halt && status;
333 }
334 #else
335 #define can_do_pal_halt (1)
336 #endif
338 static void default_idle(void)
339 {
340 local_irq_disable();
341 if ( !softirq_pending(smp_processor_id()) ) {
342 if (can_do_pal_halt)
343 safe_halt();
344 else
345 cpu_relax();
346 }
347 local_irq_enable();
348 }
350 extern void play_dead(void);
352 static void continue_cpu_idle_loop(void)
353 {
354 int cpu = smp_processor_id();
356 for ( ; ; )
357 {
358 #ifdef IA64
359 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
360 #else
361 irq_stat[cpu].idle_timestamp = jiffies;
362 #endif
363 page_scrub_schedule_work();
364 while ( !softirq_pending(cpu) )
365 default_idle();
366 raise_softirq(SCHEDULE_SOFTIRQ);
367 do_softirq();
368 if (!cpu_online(cpu))
369 play_dead();
370 }
371 }
373 void startup_cpu_idle_loop(void)
374 {
375 /* Just some sanity to ensure that the scheduler is set up okay. */
376 ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
377 raise_softirq(SCHEDULE_SOFTIRQ);
379 continue_cpu_idle_loop();
380 }
382 /* Compile-time test for get_order(sizeof(mapped_regs_t)) !=
383 * get_order_from_shift(XMAPPEDREGS_SHIFT).
384 */
385 #if !(((1 << (XMAPPEDREGS_SHIFT - 1)) < MAPPED_REGS_T_SIZE) && \
386 (MAPPED_REGS_T_SIZE < (1 << (XMAPPEDREGS_SHIFT + 1))))
387 # error "XMAPPEDREGS_SHIFT doesn't match sizeof(mapped_regs_t)."
388 #endif
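/*
 * Editorial worked example (not part of the original file): the check
 * above requires 2^(XMAPPEDREGS_SHIFT-1) < sizeof(mapped_regs_t) <
 * 2^(XMAPPEDREGS_SHIFT+1), i.e. that XMAPPEDREGS_SHIFT really describes
 * the allocation size used for the privregs area.  For instance, if
 * XMAPPEDREGS_SHIFT were 12, sizeof(mapped_regs_t) would have to lie
 * strictly between 2048 and 8192 bytes.
 */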
390 void hlt_timer_fn(void *data)
391 {
392 struct vcpu *v = data;
393 vcpu_unblock(v);
394 }
396 void relinquish_vcpu_resources(struct vcpu *v)
397 {
398 if (HAS_PERVCPU_VHPT(v->domain))
399 pervcpu_vhpt_free(v);
400 if (v->arch.privregs != NULL) {
401 free_xenheap_pages(v->arch.privregs,
402 get_order_from_shift(XMAPPEDREGS_SHIFT));
403 v->arch.privregs = NULL;
404 }
405 kill_timer(&v->arch.hlt_timer);
406 }
408 struct vcpu *alloc_vcpu_struct(void)
409 {
410 struct page_info *page;
411 struct vcpu *v;
412 struct thread_info *ti;
413 static int first_allocation = 1;
415 if (first_allocation) {
416 first_allocation = 0;
417 /* Keep idle vcpu0 statically allocated at compile time, because
418 * some code inherited from Linux still requires it in the early phase.
419 */
420 return idle_vcpu[0];
421 }
423 page = alloc_domheap_pages(NULL, KERNEL_STACK_SIZE_ORDER, 0);
424 if (page == NULL)
425 return NULL;
426 v = page_to_virt(page);
427 memset(v, 0, sizeof(*v));
429 ti = alloc_thread_info(v);
430 /* Clear thread_info to reset some important fields, such as
431 * preempt_count.
432 */
433 memset(ti, 0, sizeof(struct thread_info));
434 init_switch_stack(v);
436 return v;
437 }
439 void free_vcpu_struct(struct vcpu *v)
440 {
441 free_domheap_pages(virt_to_page(v), KERNEL_STACK_SIZE_ORDER);
442 }
444 int vcpu_initialise(struct vcpu *v)
445 {
446 struct domain *d = v->domain;
448 if (!is_idle_domain(d)) {
449 v->arch.metaphysical_rid_dt = d->arch.metaphysical_rid_dt;
450 v->arch.metaphysical_rid_d = d->arch.metaphysical_rid_d;
451 /* Set default values to saved_rr. */
452 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rid_dt;
453 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rid_dt;
455 /* Is this correct?
456 It depends on the domain's rid usage.
458 A domain may share rids among its processors (e.g. having a
459 global VHPT). In this case, we should also share rids
460 among vcpus and the rid ranges should be the same.
462 However, a domain may have per-cpu rid allocation. In
463 that case we don't want to share rids among vcpus, but we may
464 do it if two vcpus are on the same cpu... */
466 v->arch.starting_rid = d->arch.starting_rid;
467 v->arch.ending_rid = d->arch.ending_rid;
468 v->arch.rid_bits = d->arch.rid_bits;
469 v->arch.breakimm = d->arch.breakimm;
470 v->arch.last_processor = INVALID_PROCESSOR;
471 v->arch.vhpt_pg_shift = PAGE_SHIFT;
472 }
474 if (!VMX_DOMAIN(v))
475 init_timer(&v->arch.hlt_timer, hlt_timer_fn, v,
476 first_cpu(cpu_online_map));
478 return 0;
479 }
481 static void vcpu_share_privregs_with_guest(struct vcpu *v)
482 {
483 struct domain *d = v->domain;
484 int i, order = get_order_from_shift(XMAPPEDREGS_SHIFT);
486 for (i = 0; i < (1 << order); i++)
487 share_xen_page_with_guest(virt_to_page(v->arch.privregs) + i,
488 d, XENSHARE_writable);
489 /*
490 * XXX IA64_XMAPPEDREGS_PADDR
491 * Assign these pages into the guest pseudo-physical address
492 * space so that dom0 can map them by gmfn.
493 * This is necessary for domain save, restore and dump-core.
494 */
495 for (i = 0; i < XMAPPEDREGS_SIZE; i += PAGE_SIZE)
496 assign_domain_page(d, IA64_XMAPPEDREGS_PADDR(v->vcpu_id) + i,
497 virt_to_maddr(v->arch.privregs + i));
498 }
500 int vcpu_late_initialise(struct vcpu *v)
501 {
502 int rc, order;
504 if (HAS_PERVCPU_VHPT(v->domain)) {
505 rc = pervcpu_vhpt_alloc(v);
506 if (rc != 0)
507 return rc;
508 }
510 /* Create privregs page. */
511 order = get_order_from_shift(XMAPPEDREGS_SHIFT);
512 v->arch.privregs = alloc_xenheap_pages(order, 0);
513 if (v->arch.privregs == NULL)
514 return -ENOMEM;
515 BUG_ON(v->arch.privregs == NULL);
516 memset(v->arch.privregs, 0, 1 << XMAPPEDREGS_SHIFT);
517 vcpu_share_privregs_with_guest(v);
519 return 0;
520 }
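/*
 * Editorial note (not part of the original file): this changeset is the
 * one that adds a memflags argument to alloc_xenheap_pages(); passing 0,
 * as above and in arch_domain_create(), keeps the previous behaviour.  A
 * hypothetical caller with an address-width constraint might instead do
 * something like:
 *
 *     void *p = alloc_xenheap_pages(order, MEMF_bits(32));
 *     if (p == NULL)
 *         return -ENOMEM;
 *     ...
 *     free_xenheap_pages(p, order);
 *
 * (MEMF_bits() is defined in xen/include/xen/mm.h; whether the ia64
 * xenheap honours it at this changeset is not shown here.)
 */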
522 void vcpu_destroy(struct vcpu *v)
523 {
524 if (is_hvm_vcpu(v))
525 vmx_relinquish_vcpu_resources(v);
526 else
527 relinquish_vcpu_resources(v);
528 }
530 static unsigned long*
531 vcpu_to_rbs_bottom(struct vcpu *v)
532 {
533 return (unsigned long*)((char *)v + IA64_RBS_OFFSET);
534 }
536 static void init_switch_stack(struct vcpu *v)
537 {
538 struct pt_regs *regs = vcpu_regs (v);
539 struct switch_stack *sw = (struct switch_stack *) regs - 1;
540 extern void ia64_ret_from_clone;
542 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
543 sw->ar_bspstore = (unsigned long)vcpu_to_rbs_bottom(v);
544 sw->b0 = (unsigned long) &ia64_ret_from_clone;
545 sw->ar_fpsr = FPSR_DEFAULT;
546 v->arch._thread.ksp = (unsigned long) sw - 16;
547 // Stay on the kernel stack because we may get interrupts!
548 // ia64_ret_from_clone switches to the user stack.
549 v->arch._thread.on_ustack = 0;
550 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
551 }
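/*
 * Editorial note (not part of the original file): the vcpu structure, its
 * kernel memory stack, the RSE backing store and the initial
 * pt_regs/switch_stack frames all live in the single allocation of order
 * KERNEL_STACK_SIZE_ORDER made by alloc_vcpu_struct().  Roughly (the
 * layout follows the Linux/ia64 convention, shown here as an aid only):
 *
 *   (char *)v                    struct vcpu / thread_info
 *   (char *)v + IA64_RBS_OFFSET  RSE backing store, grows upward
 *         ...                    memory stack, grows downward from
 *   sw  = regs - 1               struct switch_stack
 *   regs = vcpu_regs(v)          struct pt_regs, at the top of the stack
 *
 * which is why init_switch_stack() sets ar_bspstore to the RBS bottom and
 * ksp to just below the switch_stack frame.
 */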
553 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
554 static int opt_pervcpu_vhpt = 1;
555 integer_param("pervcpu_vhpt", opt_pervcpu_vhpt);
556 #endif
558 int arch_domain_create(struct domain *d, unsigned int domcr_flags)
559 {
560 int i;
562 // the following will eventually need to be negotiated dynamically
563 d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
564 d->arch.breakimm = __IA64_XEN_HYPERCALL_DEFAULT;
565 for (i = 0; i < NR_CPUS; i++) {
566 d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
567 }
569 if (is_idle_domain(d))
570 return 0;
572 INIT_LIST_HEAD(&d->arch.pdev_list);
573 foreign_p2m_init(d);
574 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
575 d->arch.has_pervcpu_vhpt = opt_pervcpu_vhpt;
576 dprintk(XENLOG_INFO, "%s:%d domain %d pervcpu_vhpt %d\n",
577 __func__, __LINE__, d->domain_id, d->arch.has_pervcpu_vhpt);
578 #endif
579 if (tlb_track_create(d) < 0)
580 goto fail_nomem1;
581 d->shared_info = alloc_xenheap_pages(
582 get_order_from_shift(XSI_SHIFT), 0);
583 if (d->shared_info == NULL)
584 goto fail_nomem;
585 BUG_ON(d->shared_info == NULL);
586 memset(d->shared_info, 0, XSI_SIZE);
587 for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
588 share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
589 d, XENSHARE_writable);
591 /* We may also need an emulation rid for region4, though it's unlikely
592 * that a guest will issue uncacheable accesses in metaphysical mode.
593 * But keeping such info here may be saner.
594 */
595 if (!allocate_rid_range(d,0))
596 goto fail_nomem;
598 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
599 d->arch.relres = RELRES_not_started;
600 d->arch.mm_teardown_offset = 0;
601 INIT_LIST_HEAD(&d->arch.relmem_list);
603 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
604 goto fail_nomem;
606 if(iommu_domain_init(d) != 0)
607 goto fail_iommu;
609 /*
610 * grant_table_create() can't fully initialize the grant table for the
611 * domain because it is called before arch_domain_create().
612 * Here we complete the initialization that requires the p2m table.
613 */
614 spin_lock(&d->grant_table->lock);
615 for (i = 0; i < nr_grant_frames(d->grant_table); i++)
616 ia64_gnttab_create_shared_page(d, d->grant_table, i);
617 spin_unlock(&d->grant_table->lock);
619 d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
620 RANGESETF_prettyprint_hex);
622 dprintk(XENLOG_DEBUG, "arch_domain_create: domain=%p\n", d);
623 return 0;
625 fail_iommu:
626 iommu_domain_destroy(d);
627 fail_nomem:
628 tlb_track_destroy(d);
629 fail_nomem1:
630 if (d->arch.mm.pgd != NULL)
631 pgd_free(d->arch.mm.pgd);
632 if (d->shared_info != NULL)
633 free_xenheap_pages(d->shared_info,
634 get_order_from_shift(XSI_SHIFT));
635 return -ENOMEM;
636 }
638 void arch_domain_destroy(struct domain *d)
639 {
640 mm_final_teardown(d);
642 if (d->shared_info != NULL)
643 free_xenheap_pages(d->shared_info,
644 get_order_from_shift(XSI_SHIFT));
646 if ( iommu_enabled && (is_hvm_domain(d) || need_iommu(d)) ) {
647 pci_release_devices(d);
648 iommu_domain_destroy(d);
649 }
651 tlb_track_destroy(d);
653 /* Clear vTLB for the next domain. */
654 domain_flush_tlb_vhpt(d);
656 deallocate_rid_range(d);
657 }
659 void arch_vcpu_reset(struct vcpu *v)
660 {
661 /* FIXME: Stub for now */
662 }
664 /* Here it is assumed that all of the CPUs have the same RSE.N_STACKED_PHYS */
665 static unsigned long num_phys_stacked;
666 static int __init
667 init_num_phys_stacked(void)
668 {
669 switch (ia64_pal_rse_info(&num_phys_stacked, NULL)) {
670 case 0L:
671 printk("the number of physical stacked general registers"
672 "(RSE.N_STACKED_PHYS) = %ld\n", num_phys_stacked);
673 return 0;
674 case -2L:
675 case -3L:
676 default:
677 break;
678 }
679 printk("WARNING: PAL_RSE_INFO call failed. "
680 "domain save/restore may NOT work!\n");
681 return -EINVAL;
682 }
683 __initcall(init_num_phys_stacked);
685 #define COPY_FPREG(dst, src) memcpy(dst, src, sizeof(struct ia64_fpreg))
687 #define AR_PFS_PEC_SHIFT 51
688 #define AR_PFS_REC_SIZE 6
689 #define AR_PFS_PEC_MASK (((1UL << AR_PFS_REC_SIZE) - 1) << AR_PFS_PEC_SHIFT)
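/*
 * Editorial worked example (not part of the original file): with the
 * definitions above, the epilogue count is moved between ar.pfs and the
 * guest-context image as
 *
 *     ec  = (ar_pfs & AR_PFS_PEC_MASK) >> AR_PFS_PEC_SHIFT;
 *     pfs = (ar_pfs & ~AR_PFS_PEC_MASK) |
 *           ((ec << AR_PFS_PEC_SHIFT) & AR_PFS_PEC_MASK);
 *
 * exactly as done in arch_get_info_guest() and arch_set_info_guest()
 * below for vcpus that have not run yet.
 */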
691 /*
692 * See init_switch_stack() and ptrace.h
693 */
694 static struct switch_stack*
695 vcpu_to_switch_stack(struct vcpu* v)
696 {
697 return (struct switch_stack *)(v->arch._thread.ksp + 16);
698 }
700 static int
701 vcpu_has_not_run(struct vcpu* v)
702 {
703 extern void ia64_ret_from_clone;
704 struct switch_stack *sw = vcpu_to_switch_stack(v);
706 return (sw == (struct switch_stack *)(vcpu_regs(v)) - 1) &&
707 (sw->b0 == (unsigned long)&ia64_ret_from_clone);
708 }
710 static void
711 nats_update(unsigned int* nats, unsigned int reg, char nat)
712 {
713 BUG_ON(reg > 31);
715 if (nat)
716 *nats |= (1UL << reg);
717 else
718 *nats &= ~(1UL << reg);
719 }
721 static unsigned long
722 __vcpu_get_itc(struct vcpu *v)
723 {
724 unsigned long itc_last;
725 unsigned long itc_offset;
726 unsigned long itc;
728 if (unlikely(v->arch.privregs == NULL))
729 return ia64_get_itc();
731 itc_last = v->arch.privregs->itc_last;
732 itc_offset = v->arch.privregs->itc_offset;
733 itc = ia64_get_itc();
734 itc += itc_offset;
735 if (itc_last >= itc)
736 itc = itc_last;
737 return itc;
738 }
740 static void
741 __vcpu_set_itc(struct vcpu *v, u64 val)
742 {
743 unsigned long itc;
744 unsigned long itc_offset;
745 unsigned long itc_last;
747 BUG_ON(v->arch.privregs == NULL);
749 if (v != current)
750 vcpu_pause(v);
752 itc = ia64_get_itc();
753 itc_offset = val - itc;
754 itc_last = val;
756 v->arch.privregs->itc_offset = itc_offset;
757 v->arch.privregs->itc_last = itc_last;
759 if (v != current)
760 vcpu_unpause(v);
761 }
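/*
 * Editorial note (not part of the original file): the two helpers above
 * virtualize ar.itc with a per-vcpu offset plus a monotonicity clamp:
 *
 *     guest_itc = max(host_itc + itc_offset, itc_last)
 *
 * __vcpu_set_itc() recomputes itc_offset = val - host_itc and records
 * itc_last = val, so the VGCF_SET_AR_ITC path of arch_set_info_guest()
 * below makes a restored guest resume from the saved tick count without
 * the counter ever appearing to go backwards.
 */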
763 void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
764 {
765 int i;
766 struct vcpu_tr_regs *tr = &c.nat->regs.tr;
767 struct cpu_user_regs *uregs = vcpu_regs(v);
768 struct switch_stack *sw = vcpu_to_switch_stack(v);
769 struct unw_frame_info info;
770 int is_hvm = VMX_DOMAIN(v);
771 unsigned int rbs_size;
772 unsigned long *const rbs_bottom = vcpu_to_rbs_bottom(v);
773 unsigned long *rbs_top;
774 unsigned long *rbs_rnat_addr;
775 unsigned int top_slot;
776 unsigned int num_regs;
778 memset(c.nat, 0, sizeof(*c.nat));
779 c.nat->regs.b[6] = uregs->b6;
780 c.nat->regs.b[7] = uregs->b7;
782 memset(&info, 0, sizeof(info));
783 unw_init_from_blocked_task(&info, v);
784 if (vcpu_has_not_run(v)) {
785 c.nat->regs.ar.lc = sw->ar_lc;
786 c.nat->regs.ar.ec =
787 (sw->ar_pfs & AR_PFS_PEC_MASK) >> AR_PFS_PEC_SHIFT;
788 } else if (unw_unwind_to_user(&info) < 0) {
789 /* warn: should panic? */
790 gdprintk(XENLOG_ERR, "vcpu=%d unw_unwind_to_user() failed.\n",
791 v->vcpu_id);
792 show_stack(v, NULL);
794 /* can't return error */
795 c.nat->regs.ar.lc = 0;
796 c.nat->regs.ar.ec = 0;
797 } else {
798 unw_get_ar(&info, UNW_AR_LC, &c.nat->regs.ar.lc);
799 unw_get_ar(&info, UNW_AR_EC, &c.nat->regs.ar.ec);
800 }
802 if (!is_hvm)
803 c.nat->regs.ar.itc = __vcpu_get_itc(v);
805 c.nat->regs.ar.csd = uregs->ar_csd;
806 c.nat->regs.ar.ssd = uregs->ar_ssd;
808 c.nat->regs.r[8] = uregs->r8;
809 c.nat->regs.r[9] = uregs->r9;
810 c.nat->regs.r[10] = uregs->r10;
811 c.nat->regs.r[11] = uregs->r11;
813 if (is_hvm)
814 c.nat->regs.psr = vmx_vcpu_get_psr(v);
815 else
816 c.nat->regs.psr = vcpu_get_psr(v);
818 c.nat->regs.ip = uregs->cr_iip;
819 c.nat->regs.cfm = uregs->cr_ifs;
821 c.nat->regs.ar.unat = uregs->ar_unat;
822 c.nat->regs.ar.pfs = uregs->ar_pfs;
823 c.nat->regs.ar.rsc = uregs->ar_rsc;
824 c.nat->regs.ar.rnat = uregs->ar_rnat;
825 c.nat->regs.ar.bspstore = uregs->ar_bspstore;
827 c.nat->regs.pr = uregs->pr;
828 c.nat->regs.b[0] = uregs->b0;
829 rbs_size = uregs->loadrs >> 16;
830 num_regs = ia64_rse_num_regs(rbs_bottom,
831 (unsigned long*)((char*)rbs_bottom + rbs_size));
832 c.nat->regs.ar.bsp = (unsigned long)ia64_rse_skip_regs(
833 (unsigned long*)c.nat->regs.ar.bspstore, num_regs);
834 BUG_ON(num_regs > num_phys_stacked);
836 c.nat->regs.r[1] = uregs->r1;
837 c.nat->regs.r[12] = uregs->r12;
838 c.nat->regs.r[13] = uregs->r13;
839 c.nat->regs.ar.fpsr = uregs->ar_fpsr;
840 c.nat->regs.r[15] = uregs->r15;
842 c.nat->regs.r[14] = uregs->r14;
843 c.nat->regs.r[2] = uregs->r2;
844 c.nat->regs.r[3] = uregs->r3;
845 c.nat->regs.r[16] = uregs->r16;
846 c.nat->regs.r[17] = uregs->r17;
847 c.nat->regs.r[18] = uregs->r18;
848 c.nat->regs.r[19] = uregs->r19;
849 c.nat->regs.r[20] = uregs->r20;
850 c.nat->regs.r[21] = uregs->r21;
851 c.nat->regs.r[22] = uregs->r22;
852 c.nat->regs.r[23] = uregs->r23;
853 c.nat->regs.r[24] = uregs->r24;
854 c.nat->regs.r[25] = uregs->r25;
855 c.nat->regs.r[26] = uregs->r26;
856 c.nat->regs.r[27] = uregs->r27;
857 c.nat->regs.r[28] = uregs->r28;
858 c.nat->regs.r[29] = uregs->r29;
859 c.nat->regs.r[30] = uregs->r30;
860 c.nat->regs.r[31] = uregs->r31;
862 c.nat->regs.ar.ccv = uregs->ar_ccv;
864 COPY_FPREG(&c.nat->regs.f[2], &sw->f2);
865 COPY_FPREG(&c.nat->regs.f[3], &sw->f3);
866 COPY_FPREG(&c.nat->regs.f[4], &sw->f4);
867 COPY_FPREG(&c.nat->regs.f[5], &sw->f5);
869 COPY_FPREG(&c.nat->regs.f[6], &uregs->f6);
870 COPY_FPREG(&c.nat->regs.f[7], &uregs->f7);
871 COPY_FPREG(&c.nat->regs.f[8], &uregs->f8);
872 COPY_FPREG(&c.nat->regs.f[9], &uregs->f9);
873 COPY_FPREG(&c.nat->regs.f[10], &uregs->f10);
874 COPY_FPREG(&c.nat->regs.f[11], &uregs->f11);
876 COPY_FPREG(&c.nat->regs.f[12], &sw->f12);
877 COPY_FPREG(&c.nat->regs.f[13], &sw->f13);
878 COPY_FPREG(&c.nat->regs.f[14], &sw->f14);
879 COPY_FPREG(&c.nat->regs.f[15], &sw->f15);
880 COPY_FPREG(&c.nat->regs.f[16], &sw->f16);
881 COPY_FPREG(&c.nat->regs.f[17], &sw->f17);
882 COPY_FPREG(&c.nat->regs.f[18], &sw->f18);
883 COPY_FPREG(&c.nat->regs.f[19], &sw->f19);
884 COPY_FPREG(&c.nat->regs.f[20], &sw->f20);
885 COPY_FPREG(&c.nat->regs.f[21], &sw->f21);
886 COPY_FPREG(&c.nat->regs.f[22], &sw->f22);
887 COPY_FPREG(&c.nat->regs.f[23], &sw->f23);
888 COPY_FPREG(&c.nat->regs.f[24], &sw->f24);
889 COPY_FPREG(&c.nat->regs.f[25], &sw->f25);
890 COPY_FPREG(&c.nat->regs.f[26], &sw->f26);
891 COPY_FPREG(&c.nat->regs.f[27], &sw->f27);
892 COPY_FPREG(&c.nat->regs.f[28], &sw->f28);
893 COPY_FPREG(&c.nat->regs.f[29], &sw->f29);
894 COPY_FPREG(&c.nat->regs.f[30], &sw->f30);
895 COPY_FPREG(&c.nat->regs.f[31], &sw->f31);
897 // f32 - f127
898 memcpy(&c.nat->regs.f[32], &v->arch._thread.fph[0],
899 sizeof(v->arch._thread.fph));
901 #define NATS_UPDATE(reg) \
902 nats_update(&c.nat->regs.nats, (reg), \
903 !!(uregs->eml_unat & \
904 (1UL << ia64_unat_pos(&uregs->r ## reg))))
906 // corresponding bit in ar.unat is determined by
907 // (&uregs->rN){8:3}.
908 // r8: the lowest gr member of struct cpu_user_regs.
909 // r7: the highest gr member of struct cpu_user_regs.
910 BUILD_BUG_ON(offsetof(struct cpu_user_regs, r7) -
911 offsetof(struct cpu_user_regs, r8) >
912 64 * sizeof(unsigned long));
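/*
 * Editorial note (not part of the original file): when a register is
 * spilled with st8.spill to address A, its NaT bit lands in ar.unat at
 * bit position (A >> 3) & 63, i.e. address bits {8:3} as the comment
 * above says.  The BUILD_BUG_ON therefore checks that the r8..r7 members
 * of struct cpu_user_regs span at most 64 eight-byte slots, so each
 * general register saved there maps to a distinct bit of eml_unat and
 * NATS_UPDATE() can recover the NaT bits unambiguously.
 */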
914 NATS_UPDATE(1);
915 NATS_UPDATE(2);
916 NATS_UPDATE(3);
918 NATS_UPDATE(8);
919 NATS_UPDATE(9);
920 NATS_UPDATE(10);
921 NATS_UPDATE(11);
922 NATS_UPDATE(12);
923 NATS_UPDATE(13);
924 NATS_UPDATE(14);
925 NATS_UPDATE(15);
926 NATS_UPDATE(16);
927 NATS_UPDATE(17);
928 NATS_UPDATE(18);
929 NATS_UPDATE(19);
930 NATS_UPDATE(20);
931 NATS_UPDATE(21);
932 NATS_UPDATE(22);
933 NATS_UPDATE(23);
934 NATS_UPDATE(24);
935 NATS_UPDATE(25);
936 NATS_UPDATE(26);
937 NATS_UPDATE(27);
938 NATS_UPDATE(28);
939 NATS_UPDATE(29);
940 NATS_UPDATE(30);
941 NATS_UPDATE(31);
943 if (!is_hvm) {
944 c.nat->regs.r[4] = uregs->r4;
945 c.nat->regs.r[5] = uregs->r5;
946 c.nat->regs.r[6] = uregs->r6;
947 c.nat->regs.r[7] = uregs->r7;
949 NATS_UPDATE(4);
950 NATS_UPDATE(5);
951 NATS_UPDATE(6);
952 NATS_UPDATE(7);
953 #undef NATS_UPDATE
954 } else {
955 /*
956 * For a VT-i domain, r[4-7] are sometimes saved both in
957 * uregs->r[4-7] and on the memory stack, or only on the memory stack.
958 * So it is OK to get them from the memory stack.
959 */
960 if (vcpu_has_not_run(v)) {
961 c.nat->regs.r[4] = sw->r4;
962 c.nat->regs.r[5] = sw->r5;
963 c.nat->regs.r[6] = sw->r6;
964 c.nat->regs.r[7] = sw->r7;
966 nats_update(&c.nat->regs.nats, 4,
967 !!(sw->ar_unat &
968 (1UL << ia64_unat_pos(&sw->r4))));
969 nats_update(&c.nat->regs.nats, 5,
970 !!(sw->ar_unat &
971 (1UL << ia64_unat_pos(&sw->r5))));
972 nats_update(&c.nat->regs.nats, 6,
973 !!(sw->ar_unat &
974 (1UL << ia64_unat_pos(&sw->r6))));
975 nats_update(&c.nat->regs.nats, 7,
976 !!(sw->ar_unat &
977 (1UL << ia64_unat_pos(&sw->r7))));
978 } else {
979 char nat;
981 unw_get_gr(&info, 4, &c.nat->regs.r[4], &nat);
982 nats_update(&c.nat->regs.nats, 4, nat);
983 unw_get_gr(&info, 5, &c.nat->regs.r[5], &nat);
984 nats_update(&c.nat->regs.nats, 5, nat);
985 unw_get_gr(&info, 6, &c.nat->regs.r[6], &nat);
986 nats_update(&c.nat->regs.nats, 6, nat);
987 unw_get_gr(&info, 7, &c.nat->regs.r[7], &nat);
988 nats_update(&c.nat->regs.nats, 7, nat);
989 }
990 }
992 c.nat->regs.rbs_voff = (IA64_RBS_OFFSET / 8) % 64;
993 if (unlikely(rbs_size > sizeof(c.nat->regs.rbs)))
994 gdprintk(XENLOG_INFO,
995 "rbs_size is too large 0x%x > 0x%lx\n",
996 rbs_size, sizeof(c.nat->regs.rbs));
997 else
998 memcpy(c.nat->regs.rbs, rbs_bottom, rbs_size);
1000 rbs_top = (unsigned long*)((char *)rbs_bottom + rbs_size) - 1;
1001 rbs_rnat_addr = ia64_rse_rnat_addr(rbs_top);
1002 if ((unsigned long)rbs_rnat_addr >= sw->ar_bspstore)
1003 rbs_rnat_addr = &sw->ar_rnat;
1005 top_slot = ia64_rse_slot_num(rbs_top);
1007 c.nat->regs.rbs_rnat = (*rbs_rnat_addr) & ((1UL << top_slot) - 1);
1008 if (ia64_rse_rnat_addr(rbs_bottom) == ia64_rse_rnat_addr(rbs_top)) {
1009 unsigned int bottom_slot = ia64_rse_slot_num(rbs_bottom);
1010 c.nat->regs.rbs_rnat &= ~((1UL << bottom_slot) - 1);
1013 c.nat->regs.num_phys_stacked = num_phys_stacked;
1015 if (VMX_DOMAIN(v))
1016 c.nat->privregs_pfn = VGC_PRIVREGS_HVM;
1017 else
1018 c.nat->privregs_pfn = get_gpfn_from_mfn(
1019 virt_to_maddr(v->arch.privregs) >> PAGE_SHIFT);
1021 for (i = 0; i < IA64_NUM_DBG_REGS; i++) {
1022 if (VMX_DOMAIN(v)) {
1023 vmx_vcpu_get_dbr(v, i, &c.nat->regs.dbr[i]);
1024 vmx_vcpu_get_ibr(v, i, &c.nat->regs.ibr[i]);
1025 } else {
1026 vcpu_get_dbr(v, i, &c.nat->regs.dbr[i]);
1027 vcpu_get_ibr(v, i, &c.nat->regs.ibr[i]);
1031 for (i = 0; i < 8; i++)
1032 vcpu_get_rr(v, (unsigned long)i << 61, &c.nat->regs.rr[i]);
1034 /* Fill extra regs. */
1035 for (i = 0;
1036 (i < sizeof(tr->itrs) / sizeof(tr->itrs[0])) && i < NITRS;
1037 i++) {
1038 tr->itrs[i].pte = v->arch.itrs[i].pte.val;
1039 tr->itrs[i].itir = v->arch.itrs[i].itir;
1040 tr->itrs[i].vadr = v->arch.itrs[i].vadr;
1041 tr->itrs[i].rid = v->arch.itrs[i].rid;
1043 for (i = 0;
1044 (i < sizeof(tr->dtrs) / sizeof(tr->dtrs[0])) && i < NDTRS;
1045 i++) {
1046 tr->dtrs[i].pte = v->arch.dtrs[i].pte.val;
1047 tr->dtrs[i].itir = v->arch.dtrs[i].itir;
1048 tr->dtrs[i].vadr = v->arch.dtrs[i].vadr;
1049 tr->dtrs[i].rid = v->arch.dtrs[i].rid;
1051 c.nat->event_callback_ip = v->arch.event_callback_ip;
1053 /* If PV and privregs is not set, we can't read mapped registers. */
1054 if (!is_hvm_vcpu(v) && v->arch.privregs == NULL)
1055 return;
1057 vcpu_get_dcr(v, &c.nat->regs.cr.dcr);
1059 c.nat->regs.cr.itm = is_hvm_vcpu(v) ?
1060 vmx_vcpu_get_itm(v) : PSCBX(v, domain_itm);
1061 vcpu_get_iva(v, &c.nat->regs.cr.iva);
1062 vcpu_get_pta(v, &c.nat->regs.cr.pta);
1064 vcpu_get_ipsr(v, &c.nat->regs.cr.ipsr);
1065 vcpu_get_isr(v, &c.nat->regs.cr.isr);
1066 vcpu_get_iip(v, &c.nat->regs.cr.iip);
1067 vcpu_get_ifa(v, &c.nat->regs.cr.ifa);
1068 vcpu_get_itir(v, &c.nat->regs.cr.itir);
1069 vcpu_get_iha(v, &c.nat->regs.cr.iha);
1071 //XXX change irr[] and arch.insvc[]
1072 if (is_hvm_vcpu(v))
1073 /* c.nat->regs.cr.ivr = vmx_vcpu_get_ivr(v)*/;//XXXnot SMP-safe
1074 else
1075 vcpu_get_ivr (v, &c.nat->regs.cr.ivr);
1076 vcpu_get_iim(v, &c.nat->regs.cr.iim);
1078 vcpu_get_tpr(v, &c.nat->regs.cr.tpr);
1079 vcpu_get_irr0(v, &c.nat->regs.cr.irr[0]);
1080 vcpu_get_irr1(v, &c.nat->regs.cr.irr[1]);
1081 vcpu_get_irr2(v, &c.nat->regs.cr.irr[2]);
1082 vcpu_get_irr3(v, &c.nat->regs.cr.irr[3]);
1083 vcpu_get_itv(v, &c.nat->regs.cr.itv);//XXX vlsapic
1084 vcpu_get_pmv(v, &c.nat->regs.cr.pmv);
1085 vcpu_get_cmcv(v, &c.nat->regs.cr.cmcv);
1087 if (is_hvm)
1088 vmx_arch_get_info_guest(v, c);
1091 #if 0
1092 // for debug
1093 static void
1094 __rbs_print(const char* func, int line, const char* name,
1095 const unsigned long* rbs, unsigned int rbs_size)
1097 unsigned int i;
1098 printk("%s:%d %s rbs %p\n", func, line, name, rbs);
1099 printk(" rbs_size 0x%016x no 0x%lx\n",
1100 rbs_size, rbs_size / sizeof(unsigned long));
1102 for (i = 0; i < rbs_size / sizeof(unsigned long); i++) {
1103 const char* zero_or_n = "0x";
1104 if (ia64_rse_is_rnat_slot((unsigned long*)&rbs[i]))
1105 zero_or_n = "Nx";
1107 if ((i % 3) == 0)
1108 printk("0x%02x:", i);
1109 printk(" %s%016lx", zero_or_n, rbs[i]);
1110 if ((i % 3) == 2)
1111 printk("\n");
1113 printk("\n");
1116 #define rbs_print(rbs, rbs_size) \
1117 __rbs_print(__func__, __LINE__, (#rbs), (rbs), (rbs_size))
1118 #endif
1120 static int
1121 copy_rbs(struct vcpu* v, unsigned long* dst_rbs_size,
1122 const unsigned long* rbs, unsigned long rbs_size,
1123 unsigned long src_rnat, unsigned long rbs_voff)
1125 int rc = -EINVAL;
1126 struct page_info* page;
1127 unsigned char* vaddr;
1128 unsigned long* src_bsp;
1129 unsigned long* src_bspstore;
1131 struct switch_stack* sw = vcpu_to_switch_stack(v);
1132 unsigned long num_regs;
1133 unsigned long* dst_bsp;
1134 unsigned long* dst_bspstore;
1135 unsigned long* dst_rnat;
1136 unsigned long dst_rnat_tmp;
1137 unsigned long dst_rnat_mask;
1138 unsigned long flags;
1139 extern void ia64_copy_rbs(unsigned long* dst_bspstore,
1140 unsigned long* dst_rbs_size,
1141 unsigned long* dst_rnat_p,
1142 unsigned long* src_bsp,
1143 unsigned long src_rbs_size,
1144 unsigned long src_rnat);
1146 dst_bspstore = vcpu_to_rbs_bottom(v);
1147 *dst_rbs_size = rbs_size;
1148 if (rbs_size == 0)
1149 return 0;
1151 // The rbs offset depends on sizeof(struct vcpu), so it is
1152 // too unstable to be part of the hypercall ABI.
1153 // We need to take the rbs offset into account.
1154 //memcpy(dst_bspstore, c.nat->regs.rbs, rbs_size);
1156 // It is assumed that rbs_size is small enough compared
1157 // to KERNEL_STACK_SIZE.
1158 page = alloc_domheap_pages(NULL, KERNEL_STACK_SIZE_ORDER, 0);
1159 if (page == NULL)
1160 return -ENOMEM;
1161 vaddr = page_to_virt(page);
1163 src_bspstore = (unsigned long*)(vaddr + rbs_voff * 8);
1164 src_bsp = (unsigned long*)((unsigned char*)src_bspstore + rbs_size);
1165 if ((unsigned long)src_bsp >= (unsigned long)vaddr + PAGE_SIZE)
1166 goto out;
1167 memcpy(src_bspstore, rbs, rbs_size);
1169 num_regs = ia64_rse_num_regs(src_bspstore, src_bsp);
1170 dst_bsp = ia64_rse_skip_regs(dst_bspstore, num_regs);
1171 *dst_rbs_size = (unsigned long)dst_bsp - (unsigned long)dst_bspstore;
1173 // rough check.
1174 if (((unsigned long)dst_bsp & ~PAGE_MASK) > KERNEL_STACK_SIZE / 2)
1175 goto out;
1177 // ia64_copy_rbs() uses the real cpu's stacked registers.
1178 // So it may raise an Illegal Operation fault, resulting
1179 // in a panic, if rbs_size is too large to load compared to
1180 // the number of physical stacked registers, RSE.N_STACKED_PHYS,
1181 // which is cpu-implementation specific.
1182 // See SDM vol. 2, Register Stack Engine 6, especially 6.5.5.
1183 //
1184 // For safe operation and cpu model independence,
1185 // we would need to copy them by hand without loadrs and flushrs.
1186 // However, even if we implemented that, a similar issue still occurs
1187 // when running the guest. The CPU context restore routine issues loadrs,
1188 // resulting in an Illegal Operation fault. And what if the vRSE is in
1189 // enforced lazy mode? We can't store any dirty stacked registers
1190 // into the RBS without cover or br.call.
1191 if (num_regs > num_phys_stacked) {
1192 rc = -ENOSYS;
1193 gdprintk(XENLOG_WARNING,
1194 "%s:%d domain %d: can't load stacked registres\n"
1195 "requested size 0x%lx => 0x%lx, num regs %ld"
1196 "RSE.N_STACKED_PHYS %ld\n",
1197 __func__, __LINE__, v->domain->domain_id,
1198 rbs_size, *dst_rbs_size, num_regs,
1199 num_phys_stacked);
1200 goto out;
1203 // we mask interrupts to avoid using register backing store.
1204 local_irq_save(flags);
1205 ia64_copy_rbs(dst_bspstore, dst_rbs_size, &dst_rnat_tmp,
1206 src_bsp, rbs_size, src_rnat);
1207 local_irq_restore(flags);
1209 dst_rnat_mask = (1UL << ia64_rse_slot_num(dst_bsp)) - 1;
1210 dst_rnat = ia64_rse_rnat_addr(dst_bsp);
1211 if ((unsigned long)dst_rnat > sw->ar_bspstore)
1212 dst_rnat = &sw->ar_rnat;
1213 // if ia64_rse_rnat_addr(dst_bsp) ==
1214 // ia64_rse_rnat_addr(vcpu_to_rbs_bottom(v)), the lsb of rnat
1215 // is just ignored, so we don't have to mask it out.
1216 *dst_rnat =
1217 (*dst_rnat & ~dst_rnat_mask) | (dst_rnat_tmp & dst_rnat_mask);
1219 rc = 0;
1220 out:
1221 free_domheap_pages(page, KERNEL_STACK_SIZE_ORDER);
1222 return rc;
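/*
 * Editorial note (not part of the original file) on the RSE arithmetic in
 * copy_rbs(): the backing store is a sequence of 8-byte slots, and every
 * slot whose address has bits {8:3} all set holds a NaT collection rather
 * than a register.  Converting between "number of stacked registers" and
 * "bytes of backing store" is therefore not a plain multiply by eight;
 * ia64_rse_num_regs()/ia64_rse_skip_regs() do that conversion and
 * ia64_rse_rnat_addr() locates the collection slot holding a given
 * register's NaT bit.  This is also why dst_rbs_size is recomputed rather
 * than reusing the source size: the source and destination backing stores
 * can start at different offsets within a 64-slot window (rbs_voff), so
 * the same registers may straddle a different number of NaT collections.
 */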
1225 static void
1226 unat_update(unsigned long *unat_eml, unsigned long *spill_addr, char nat)
1228 unsigned int pos = ia64_unat_pos(spill_addr);
1229 if (nat)
1230 *unat_eml |= (1UL << pos);
1231 else
1232 *unat_eml &= ~(1UL << pos);
1235 int arch_set_info_guest(struct vcpu *v, vcpu_guest_context_u c)
1237 struct cpu_user_regs *uregs = vcpu_regs(v);
1238 struct domain *d = v->domain;
1239 struct switch_stack *sw = vcpu_to_switch_stack(v);
1240 int was_initialised = v->is_initialised;
1241 struct unw_frame_info info;
1242 unsigned int rbs_size;
1243 unsigned int num_regs;
1244 unsigned long * const rbs_bottom = vcpu_to_rbs_bottom(v);
1245 int rc = 0;
1246 int i;
1248 /* Finish vcpu initialization. */
1249 if (!was_initialised) {
1250 if (is_hvm_domain(d))
1251 rc = vmx_final_setup_guest(v);
1252 else
1253 rc = vcpu_late_initialise(v);
1254 if (rc != 0)
1255 return rc;
1257 vcpu_init_regs(v);
1259 v->is_initialised = 1;
1260 /* Auto-online VCPU0 when it is initialised. */
1261 if (v->vcpu_id == 0 || (c.nat != NULL &&
1262 c.nat->flags & VGCF_online))
1263 clear_bit(_VPF_down, &v->pause_flags);
1266 if (c.nat == NULL)
1267 return 0;
1269 uregs->b6 = c.nat->regs.b[6];
1270 uregs->b7 = c.nat->regs.b[7];
1272 memset(&info, 0, sizeof(info));
1273 unw_init_from_blocked_task(&info, v);
1274 if (vcpu_has_not_run(v)) {
1275 sw->ar_lc = c.nat->regs.ar.lc;
1276 sw->ar_pfs =
1277 (sw->ar_pfs & ~AR_PFS_PEC_MASK) |
1278 ((c.nat->regs.ar.ec << AR_PFS_PEC_SHIFT) &
1279 AR_PFS_PEC_MASK);
1280 } else if (unw_unwind_to_user(&info) < 0) {
1281 /* warn: should panic? */
1282 gdprintk(XENLOG_ERR,
1283 "vcpu=%d unw_unwind_to_user() failed.\n",
1284 v->vcpu_id);
1285 show_stack(v, NULL);
1287 //return -ENOSYS;
1288 } else {
1289 unw_set_ar(&info, UNW_AR_LC, c.nat->regs.ar.lc);
1290 unw_set_ar(&info, UNW_AR_EC, c.nat->regs.ar.ec);
1293 if (!is_hvm_domain(d) && (c.nat->flags & VGCF_SET_AR_ITC))
1294 __vcpu_set_itc(v, c.nat->regs.ar.itc);
1296 uregs->ar_csd = c.nat->regs.ar.csd;
1297 uregs->ar_ssd = c.nat->regs.ar.ssd;
1299 uregs->r8 = c.nat->regs.r[8];
1300 uregs->r9 = c.nat->regs.r[9];
1301 uregs->r10 = c.nat->regs.r[10];
1302 uregs->r11 = c.nat->regs.r[11];
1304 if (!is_hvm_domain(d))
1305 vcpu_set_psr(v, c.nat->regs.psr);
1306 else
1307 vmx_vcpu_set_psr(v, c.nat->regs.psr);
1308 uregs->cr_iip = c.nat->regs.ip;
1309 uregs->cr_ifs = c.nat->regs.cfm;
1311 uregs->ar_unat = c.nat->regs.ar.unat;
1312 uregs->ar_pfs = c.nat->regs.ar.pfs;
1313 uregs->ar_rsc = c.nat->regs.ar.rsc;
1314 uregs->ar_rnat = c.nat->regs.ar.rnat;
1315 uregs->ar_bspstore = c.nat->regs.ar.bspstore;
1317 uregs->pr = c.nat->regs.pr;
1318 uregs->b0 = c.nat->regs.b[0];
1319 num_regs = ia64_rse_num_regs((unsigned long*)c.nat->regs.ar.bspstore,
1320 (unsigned long*)c.nat->regs.ar.bsp);
1321 rbs_size = (unsigned long)ia64_rse_skip_regs(rbs_bottom, num_regs) -
1322 (unsigned long)rbs_bottom;
1323 if (rbs_size > sizeof (c.nat->regs.rbs)) {
1324 gdprintk(XENLOG_INFO,
1325 "rbs size is too large %x > %lx\n",
1326 rbs_size, sizeof (c.nat->regs.rbs));
1327 return -EINVAL;
1329 if (rbs_size > 0 &&
1330 ((IA64_RBS_OFFSET / 8) % 64) != c.nat->regs.rbs_voff)
1331 gdprintk(XENLOG_INFO,
1332 "rbs stack offset is different! xen 0x%x given 0x%x",
1333 (IA64_RBS_OFFSET / 8) % 64, c.nat->regs.rbs_voff);
1335 /* Protection against crazy user code. */
1336 if (!was_initialised)
1337 uregs->loadrs = (rbs_size << 16);
1338 if (rbs_size == (uregs->loadrs >> 16)) {
1339 unsigned long dst_rbs_size = 0;
1340 if (vcpu_has_not_run(v))
1341 sw->ar_bspstore = (unsigned long)rbs_bottom;
1343 rc = copy_rbs(v, &dst_rbs_size,
1344 c.nat->regs.rbs, rbs_size,
1345 c.nat->regs.rbs_rnat,
1346 c.nat->regs.rbs_voff);
1347 if (rc < 0)
1348 return rc;
1350 /* In the case of a newly created vcpu, ar_bspstore points to
1351 * the bottom of the register stack. Move it up.
1352 * See also init_switch_stack().
1353 */
1354 if (vcpu_has_not_run(v)) {
1355 uregs->loadrs = (dst_rbs_size << 16);
1356 sw->ar_bspstore = (unsigned long)((char*)rbs_bottom +
1357 dst_rbs_size);
1361 // Inhibit save/restore between cpus with different RSE.N_STACKED_PHYS,
1362 // to avoid nasty issues.
1363 //
1364 // The number of physical stacked general registers (RSE.N_STACKED_PHYS)
1365 // isn't virtualized. A guest OS obtains it via the PAL_RSE_INFO call and
1366 // the value might be exported to user processes.
1367 // (Linux does so via /proc/cpuinfo.)
1368 // The SDM says only that the number is cpu-implementation specific.
1369 //
1370 // If the restoring cpu's number differs from the saving cpu's,
1371 // the following, or something worse, might happen.
1372 // - Xen VMM itself may panic with an illegal operation fault when
1373 // issuing loadrs to run the guest
1374 // When RSE.N_STACKED_PHYS of the saving CPU > RSE.N_STACKED_PHYS of
1375 // the restoring CPU.
1376 // This case is detected, and the restore refused, by copy_rbs().
1377 // - The guest kernel may panic with an illegal operation fault
1378 // When RSE.N_STACKED_PHYS of the saving CPU > RSE.N_STACKED_PHYS of
1379 // the restoring CPU.
1380 // - Information leak from the guest kernel to user processes
1381 // When RSE.N_STACKED_PHYS of the saving CPU < RSE.N_STACKED_PHYS of
1382 // the restoring CPU.
1383 // Before returning to a user process, the kernel should zero-clear all
1384 // physical stacked registers to prevent kernel bits from leaking.
1385 // It would do so based on RSE.N_STACKED_PHYS (Linux does).
1386 // In the restored environment the kernel clears only a part
1387 // of the physical stacked registers.
1388 // - User processes or human operators could be confused.
1389 // RSE.N_STACKED_PHYS might be exported to user processes or human
1390 // operators. On Linux it is in fact exported via /proc/cpuinfo, and
1391 // user processes might use it.
1392 // I don't know any concrete example, but it's possible in theory:
1393 // e.g. a thread library might size RBS areas based on the value.
1394 // (Fortunately glibc nptl doesn't.)
1395 if (c.nat->regs.num_phys_stacked != 0 && /* COMPAT */
1396 c.nat->regs.num_phys_stacked != num_phys_stacked) {
1397 gdprintk(XENLOG_WARNING,
1398 "num phys stacked is different! "
1399 "xen 0x%lx given 0x%lx",
1400 num_phys_stacked, c.nat->regs.num_phys_stacked);
1401 return -EINVAL;
1404 uregs->r1 = c.nat->regs.r[1];
1405 uregs->r12 = c.nat->regs.r[12];
1406 uregs->r13 = c.nat->regs.r[13];
1407 uregs->ar_fpsr = c.nat->regs.ar.fpsr;
1408 uregs->r15 = c.nat->regs.r[15];
1410 uregs->r14 = c.nat->regs.r[14];
1411 uregs->r2 = c.nat->regs.r[2];
1412 uregs->r3 = c.nat->regs.r[3];
1413 uregs->r16 = c.nat->regs.r[16];
1414 uregs->r17 = c.nat->regs.r[17];
1415 uregs->r18 = c.nat->regs.r[18];
1416 uregs->r19 = c.nat->regs.r[19];
1417 uregs->r20 = c.nat->regs.r[20];
1418 uregs->r21 = c.nat->regs.r[21];
1419 uregs->r22 = c.nat->regs.r[22];
1420 uregs->r23 = c.nat->regs.r[23];
1421 uregs->r24 = c.nat->regs.r[24];
1422 uregs->r25 = c.nat->regs.r[25];
1423 uregs->r26 = c.nat->regs.r[26];
1424 uregs->r27 = c.nat->regs.r[27];
1425 uregs->r28 = c.nat->regs.r[28];
1426 uregs->r29 = c.nat->regs.r[29];
1427 uregs->r30 = c.nat->regs.r[30];
1428 uregs->r31 = c.nat->regs.r[31];
1430 uregs->ar_ccv = c.nat->regs.ar.ccv;
1432 COPY_FPREG(&sw->f2, &c.nat->regs.f[2]);
1433 COPY_FPREG(&sw->f3, &c.nat->regs.f[3]);
1434 COPY_FPREG(&sw->f4, &c.nat->regs.f[4]);
1435 COPY_FPREG(&sw->f5, &c.nat->regs.f[5]);
1437 COPY_FPREG(&uregs->f6, &c.nat->regs.f[6]);
1438 COPY_FPREG(&uregs->f7, &c.nat->regs.f[7]);
1439 COPY_FPREG(&uregs->f8, &c.nat->regs.f[8]);
1440 COPY_FPREG(&uregs->f9, &c.nat->regs.f[9]);
1441 COPY_FPREG(&uregs->f10, &c.nat->regs.f[10]);
1442 COPY_FPREG(&uregs->f11, &c.nat->regs.f[11]);
1444 COPY_FPREG(&sw->f12, &c.nat->regs.f[12]);
1445 COPY_FPREG(&sw->f13, &c.nat->regs.f[13]);
1446 COPY_FPREG(&sw->f14, &c.nat->regs.f[14]);
1447 COPY_FPREG(&sw->f15, &c.nat->regs.f[15]);
1448 COPY_FPREG(&sw->f16, &c.nat->regs.f[16]);
1449 COPY_FPREG(&sw->f17, &c.nat->regs.f[17]);
1450 COPY_FPREG(&sw->f18, &c.nat->regs.f[18]);
1451 COPY_FPREG(&sw->f19, &c.nat->regs.f[19]);
1452 COPY_FPREG(&sw->f20, &c.nat->regs.f[20]);
1453 COPY_FPREG(&sw->f21, &c.nat->regs.f[21]);
1454 COPY_FPREG(&sw->f22, &c.nat->regs.f[22]);
1455 COPY_FPREG(&sw->f23, &c.nat->regs.f[23]);
1456 COPY_FPREG(&sw->f24, &c.nat->regs.f[24]);
1457 COPY_FPREG(&sw->f25, &c.nat->regs.f[25]);
1458 COPY_FPREG(&sw->f26, &c.nat->regs.f[26]);
1459 COPY_FPREG(&sw->f27, &c.nat->regs.f[27]);
1460 COPY_FPREG(&sw->f28, &c.nat->regs.f[28]);
1461 COPY_FPREG(&sw->f29, &c.nat->regs.f[29]);
1462 COPY_FPREG(&sw->f30, &c.nat->regs.f[30]);
1463 COPY_FPREG(&sw->f31, &c.nat->regs.f[31]);
1465 // f32 - f127
1466 memcpy(&v->arch._thread.fph[0], &c.nat->regs.f[32],
1467 sizeof(v->arch._thread.fph));
1469 #define UNAT_UPDATE(reg) \
1470 unat_update(&uregs->eml_unat, &uregs->r ## reg, \
1471 !!(c.nat->regs.nats & (1UL << (reg))));
1473 uregs->eml_unat = 0;
1474 UNAT_UPDATE(1);
1475 UNAT_UPDATE(2);
1476 UNAT_UPDATE(3);
1478 UNAT_UPDATE(8);
1479 UNAT_UPDATE(9);
1480 UNAT_UPDATE(10);
1481 UNAT_UPDATE(11);
1482 UNAT_UPDATE(12);
1483 UNAT_UPDATE(13);
1484 UNAT_UPDATE(14);
1485 UNAT_UPDATE(15);
1486 UNAT_UPDATE(16);
1487 UNAT_UPDATE(17);
1488 UNAT_UPDATE(18);
1489 UNAT_UPDATE(19);
1490 UNAT_UPDATE(20);
1491 UNAT_UPDATE(21);
1492 UNAT_UPDATE(22);
1493 UNAT_UPDATE(23);
1494 UNAT_UPDATE(24);
1495 UNAT_UPDATE(25);
1496 UNAT_UPDATE(26);
1497 UNAT_UPDATE(27);
1498 UNAT_UPDATE(28);
1499 UNAT_UPDATE(29);
1500 UNAT_UPDATE(30);
1501 UNAT_UPDATE(31);
1503 /*
1504 * r4-r7 are sometimes saved both in pt_regs->r[4-7] and on the memory
1505 * stack, or only on the memory stack.
1506 * In both cases, both the memory stack and pt_regs->r[4-7] are updated.
1507 */
1508 uregs->r4 = c.nat->regs.r[4];
1509 uregs->r5 = c.nat->regs.r[5];
1510 uregs->r6 = c.nat->regs.r[6];
1511 uregs->r7 = c.nat->regs.r[7];
1513 UNAT_UPDATE(4);
1514 UNAT_UPDATE(5);
1515 UNAT_UPDATE(6);
1516 UNAT_UPDATE(7);
1517 #undef UNAT_UPDATE
1518 if (vcpu_has_not_run(v)) {
1519 sw->r4 = c.nat->regs.r[4];
1520 sw->r5 = c.nat->regs.r[5];
1521 sw->r6 = c.nat->regs.r[6];
1522 sw->r7 = c.nat->regs.r[7];
1524 unat_update(&sw->ar_unat, &sw->r4,
1525 !!(c.nat->regs.nats & (1UL << 4)));
1526 unat_update(&sw->ar_unat, &sw->r5,
1527 !!(c.nat->regs.nats & (1UL << 5)));
1528 unat_update(&sw->ar_unat, &sw->r6,
1529 !!(c.nat->regs.nats & (1UL << 6)));
1530 unat_update(&sw->ar_unat, &sw->r7,
1531 !!(c.nat->regs.nats & (1UL << 7)));
1532 } else {
1533 unw_set_gr(&info, 4, c.nat->regs.r[4],
1534 !!(c.nat->regs.nats & (1UL << 4)));
1535 unw_set_gr(&info, 5, c.nat->regs.r[5],
1536 !!(c.nat->regs.nats & (1UL << 5)));
1537 unw_set_gr(&info, 6, c.nat->regs.r[6],
1538 !!(c.nat->regs.nats & (1UL << 6)));
1539 unw_set_gr(&info, 7, c.nat->regs.r[7],
1540 !!(c.nat->regs.nats & (1UL << 7)));
1543 if (!is_hvm_domain(d)) {
1544 /* domain runs at PL2/3 */
1545 uregs->cr_ipsr = vcpu_pl_adjust(uregs->cr_ipsr,
1546 IA64_PSR_CPL0_BIT);
1547 uregs->ar_rsc = vcpu_pl_adjust(uregs->ar_rsc, 2);
1550 for (i = 0; i < IA64_NUM_DBG_REGS; i++) {
1551 if (is_hvm_domain(d)) {
1552 vmx_vcpu_set_dbr(v, i, c.nat->regs.dbr[i]);
1553 vmx_vcpu_set_ibr(v, i, c.nat->regs.ibr[i]);
1554 } else {
1555 vcpu_set_dbr(v, i, c.nat->regs.dbr[i]);
1556 vcpu_set_ibr(v, i, c.nat->regs.ibr[i]);
1560 /* rr[] must be set before setting itrs[] dtrs[] */
1561 for (i = 0; i < 8; i++) {
1562 unsigned long rrval = c.nat->regs.rr[i];
1563 unsigned long reg = (unsigned long)i << 61;
1564 IA64FAULT fault = IA64_NO_FAULT;
1566 if (rrval == 0)
1567 continue;
1568 if (is_hvm_domain(d)) {
1569 // Without the VGCF_EXTRA_REGS check,
1570 // a VT-i domain doesn't boot.
1571 if (c.nat->flags & VGCF_EXTRA_REGS)
1572 fault = vmx_vcpu_set_rr(v, reg, rrval);
1573 } else
1574 fault = vcpu_set_rr(v, reg, rrval);
1575 if (fault != IA64_NO_FAULT)
1576 return -EINVAL;
1579 if (c.nat->flags & VGCF_EXTRA_REGS) {
1580 struct vcpu_tr_regs *tr = &c.nat->regs.tr;
1582 for (i = 0;
1583 (i < sizeof(tr->itrs) / sizeof(tr->itrs[0])) && i < NITRS;
1584 i++) {
1585 if (is_hvm_domain(d))
1586 vmx_vcpu_itr_i(v, i, tr->itrs[i].pte,
1587 tr->itrs[i].itir,
1588 tr->itrs[i].vadr);
1589 else
1590 vcpu_set_itr(v, i, tr->itrs[i].pte,
1591 tr->itrs[i].itir,
1592 tr->itrs[i].vadr,
1593 tr->itrs[i].rid);
1595 for (i = 0;
1596 (i < sizeof(tr->dtrs) / sizeof(tr->dtrs[0])) && i < NDTRS;
1597 i++) {
1598 if (is_hvm_domain(d))
1599 vmx_vcpu_itr_d(v, i, tr->dtrs[i].pte,
1600 tr->dtrs[i].itir,
1601 tr->dtrs[i].vadr);
1602 else
1603 vcpu_set_dtr(v, i,
1604 tr->dtrs[i].pte,
1605 tr->dtrs[i].itir,
1606 tr->dtrs[i].vadr,
1607 tr->dtrs[i].rid);
1609 v->arch.event_callback_ip = c.nat->event_callback_ip;
1610 vcpu_set_iva(v, c.nat->regs.cr.iva);
1613 if (is_hvm_domain(d))
1614 rc = vmx_arch_set_info_guest(v, c);
1616 return rc;
1619 static int relinquish_memory(struct domain *d, struct list_head *list)
1621 struct list_head *ent;
1622 struct page_info *page;
1623 #ifndef __ia64__
1624 unsigned long x, y;
1625 #endif
1626 int ret = 0;
1628 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1629 spin_lock_recursive(&d->page_alloc_lock);
1630 ent = list->next;
1631 while ( ent != list )
1633 page = list_entry(ent, struct page_info, list);
1634 /* Grab a reference to the page so it won't disappear from under us. */
1635 if ( unlikely(!get_page(page, d)) )
1637 /* Couldn't get a reference -- someone is freeing this page. */
1638 ent = ent->next;
1639 list_move_tail(&page->list, &d->arch.relmem_list);
1640 continue;
1643 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1644 put_page_and_type(page);
1646 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1647 put_page(page);
1649 #ifndef __ia64__
1650 /*
1651 * Forcibly invalidate base page tables at this point to break circular
1652 * 'linear page table' references. This is okay because MMU structures
1653 * are not shared across domains and this domain is now dead. Thus base
1654 * tables are not in use so a non-zero count means circular reference.
1655 */
1656 y = page->u.inuse.type_info;
1657 for ( ; ; )
1659 x = y;
1660 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
1661 (PGT_base_page_table|PGT_validated)) )
1662 break;
1664 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
1665 if ( likely(y == x) )
1667 free_page_type(page, PGT_base_page_table);
1668 break;
1671 #endif
1673 /* Follow the list chain and /then/ potentially free the page. */
1674 ent = ent->next;
1675 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
1676 list_move_tail(&page->list, &d->arch.relmem_list);
1677 put_page(page);
1679 if (hypercall_preempt_check()) {
1680 ret = -EAGAIN;
1681 goto out;
1685 list_splice_init(&d->arch.relmem_list, list);
1687 out:
1688 spin_unlock_recursive(&d->page_alloc_lock);
1689 return ret;
1692 int domain_relinquish_resources(struct domain *d)
1694 int ret = 0;
1696 switch (d->arch.relres) {
1697 case RELRES_not_started:
1698 /* Relinquish guest resources for VT-i domain. */
1699 if (is_hvm_domain(d))
1700 vmx_relinquish_guest_resources(d);
1701 d->arch.relres = RELRES_mm_teardown;
1702 /*fallthrough*/
1704 case RELRES_mm_teardown:
1705 if (d->arch.pirq_eoi_map != NULL) {
1706 put_page(virt_to_page(d->arch.pirq_eoi_map));
1707 d->arch.pirq_eoi_map = NULL;
1710 /* Tear down shadow mode stuff. */
1711 ret = mm_teardown(d);
1712 if (ret != 0)
1713 return ret;
1714 d->arch.relres = RELRES_xen;
1715 /* fallthrough */
1717 case RELRES_xen:
1718 /* Relinquish every xen page of memory. */
1719 ret = relinquish_memory(d, &d->xenpage_list);
1720 if (ret != 0)
1721 return ret;
1722 d->arch.relres = RELRES_dom;
1723 /* fallthrough */
1725 case RELRES_dom:
1726 /* Relinquish every domain page of memory. */
1727 ret = relinquish_memory(d, &d->page_list);
1728 if (ret != 0)
1729 return ret;
1730 d->arch.relres = RELRES_done;
1731 /* fallthrough */
1733 case RELRES_done:
1734 break;
1736 default:
1737 BUG();
1740 if (is_hvm_domain(d) && d->arch.sal_data)
1741 xfree(d->arch.sal_data);
1743 return 0;
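/*
 * Editorial note (not part of the original file): relinquishing a
 * domain's memory can far exceed the time a single hypercall may run, so
 * domain_relinquish_resources() is written as a resumable state machine.
 * d->arch.relres records how far teardown has progressed, each stage
 * returns non-zero (e.g. -EAGAIN from relinquish_memory() when
 * hypercall_preempt_check() fires) so that the common code retries, and
 * the deliberate fallthrough between the switch cases lets a later call
 * pick up at the recorded stage and continue through the remaining ones
 * until RELRES_done.
 */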
1746 unsigned long
1747 domain_set_shared_info_va (unsigned long va)
1749 struct vcpu *v = current;
1750 struct domain *d = v->domain;
1751 int rc;
1753 /* Check virtual address:
1754 must belong to region 7,
1755 must be 64Kb aligned,
1756 must not be within Xen virtual space. */
1757 if ((va >> 61) != 7
1758 || (va & 0xffffUL) != 0
1759 || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
1760 panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);
1762 /* Note: this doesn't work well if other cpus are already running.
1763 However this is part of the spec :-) */
1764 gdprintk(XENLOG_DEBUG, "Domain set shared_info_va to 0x%016lx\n", va);
1765 d->arch.shared_info_va = va;
1767 VCPU(v, interrupt_mask_addr) = (unsigned char *)va +
1768 INT_ENABLE_OFFSET(v);
1769 set_current_psr_i_addr(v);
1771 /* Remap the shared pages. */
1772 BUG_ON(VMX_DOMAIN(v));
1773 rc = !set_one_rr(7UL << 61, PSCB(v,rrs[7]));
1774 BUG_ON(rc);
1776 return rc;
1779 /* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
1780 #define SHADOW_COPY_CHUNK 1024
1782 int shadow_mode_control(struct domain *d, xen_domctl_shadow_op_t *sc)
1784 unsigned int op = sc->op;
1785 int rc = 0;
1786 int i;
1787 //struct vcpu *v;
1789 if (unlikely(d == current->domain)) {
1790 gdprintk(XENLOG_INFO,
1791 "Don't try to do a shadow op on yourself!\n");
1792 return -EINVAL;
1795 domain_pause(d);
1797 switch (op)
1799 case XEN_DOMCTL_SHADOW_OP_OFF:
1800 if (shadow_mode_enabled (d)) {
1801 u64 *bm = d->arch.shadow_bitmap;
1802 struct vcpu *v;
1804 for_each_vcpu(d, v)
1805 v->arch.shadow_bitmap = NULL;
1807 /* Flush vhpt and tlb to restore dirty bit usage. */
1808 flush_tlb_for_log_dirty(d);
1810 /* Free bitmap. */
1811 d->arch.shadow_bitmap_size = 0;
1812 d->arch.shadow_bitmap = NULL;
1813 xfree(bm);
1815 break;
1817 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
1818 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
1819 rc = -EINVAL;
1820 break;
1822 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
1823 if (shadow_mode_enabled(d)) {
1824 rc = -EINVAL;
1825 break;
1828 atomic64_set(&d->arch.shadow_fault_count, 0);
1829 atomic64_set(&d->arch.shadow_dirty_count, 0);
1831 d->arch.shadow_bitmap_size =
1832 (domain_get_maximum_gpfn(d) + BITS_PER_LONG) &
1833 ~(BITS_PER_LONG - 1);
1834 d->arch.shadow_bitmap = xmalloc_array(unsigned long,
1835 d->arch.shadow_bitmap_size / BITS_PER_LONG);
1836 if (d->arch.shadow_bitmap == NULL) {
1837 d->arch.shadow_bitmap_size = 0;
1838 rc = -ENOMEM;
1840 else {
1841 struct vcpu *v;
1842 memset(d->arch.shadow_bitmap, 0,
1843 d->arch.shadow_bitmap_size / 8);
1845 for_each_vcpu(d, v)
1846 v->arch.shadow_bitmap = d->arch.shadow_bitmap;
1847 /* Flush vhpt and tlb to enable dirty bit
1848 virtualization. */
1849 flush_tlb_for_log_dirty(d);
1851 break;
1853 case XEN_DOMCTL_SHADOW_OP_CLEAN:
1855 int nbr_bytes;
1857 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
1858 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
1860 atomic64_set(&d->arch.shadow_fault_count, 0);
1861 atomic64_set(&d->arch.shadow_dirty_count, 0);
1863 if (guest_handle_is_null(sc->dirty_bitmap) ||
1864 (d->arch.shadow_bitmap == NULL)) {
1865 rc = -EINVAL;
1866 break;
1869 if (sc->pages > d->arch.shadow_bitmap_size)
1870 sc->pages = d->arch.shadow_bitmap_size;
1872 nbr_bytes = (sc->pages + 7) / 8;
1874 for (i = 0; i < nbr_bytes; i += SHADOW_COPY_CHUNK) {
1875 int size = (nbr_bytes - i) > SHADOW_COPY_CHUNK ?
1876 SHADOW_COPY_CHUNK : nbr_bytes - i;
1878 if (copy_to_guest_offset(
1879 sc->dirty_bitmap, i,
1880 (uint8_t *)d->arch.shadow_bitmap + i,
1881 size)) {
1882 rc = -EFAULT;
1883 break;
1886 memset((uint8_t *)d->arch.shadow_bitmap + i, 0, size);
1888 flush_tlb_for_log_dirty(d);
1890 break;
1893 case XEN_DOMCTL_SHADOW_OP_PEEK:
1895 unsigned long size;
1897 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
1898 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
1900 if (guest_handle_is_null(sc->dirty_bitmap) ||
1901 (d->arch.shadow_bitmap == NULL)) {
1902 rc = -EINVAL;
1903 break;
1906 if (sc->pages > d->arch.shadow_bitmap_size)
1907 sc->pages = d->arch.shadow_bitmap_size;
1909 size = (sc->pages + 7) / 8;
1910 if (copy_to_guest(sc->dirty_bitmap,
1911 (uint8_t *)d->arch.shadow_bitmap, size)) {
1912 rc = -EFAULT;
1913 break;
1915 break;
1917 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
1918 sc->mb = 0;
1919 break;
1920 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
1921 if (sc->mb > 0) {
1922 BUG();
1923 rc = -ENOMEM;
1925 break;
1926 default:
1927 rc = -EINVAL;
1928 break;
1931 domain_unpause(d);
1933 return rc;
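/*
 * Editorial worked example (not part of the original file): the log-dirty
 * bitmap allocated by XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY above holds
 * one bit per guest page frame and is sized as
 * (domain_get_maximum_gpfn(d) + BITS_PER_LONG) rounded down to a multiple
 * of BITS_PER_LONG.  For a hypothetical guest whose highest gpfn is 65535
 * that is (65535 + 64) & ~63 = 65536 bits, i.e. 8KB of bitmap, which the
 * CLEAN op then copies out and clears in SHADOW_COPY_CHUNK (1KB) pieces.
 */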
1936 // remove following line if not privifying in memory
1937 //#define HAVE_PRIVIFY_MEMORY
1938 #ifndef HAVE_PRIVIFY_MEMORY
1939 #define privify_memory(x,y) do {} while(0)
1940 #endif
1942 static void __init loaddomainelfimage(struct domain *d, struct elf_binary *elf,
1943 unsigned long phys_load_offset)
1945 const elf_phdr *phdr;
1946 int phnum, h, filesz, memsz;
1947 unsigned long elfaddr, dom_mpaddr, dom_imva;
1948 struct page_info *p;
1950 phnum = elf_uval(elf, elf->ehdr, e_phnum);
1951 for (h = 0; h < phnum; h++) {
1952 phdr = elf_phdr_by_index(elf, h);
1953 if (!elf_phdr_is_loadable(elf, phdr))
1954 continue;
1956 filesz = elf_uval(elf, phdr, p_filesz);
1957 memsz = elf_uval(elf, phdr, p_memsz);
1958 elfaddr = (unsigned long) elf->image + elf_uval(elf, phdr, p_offset);
1959 dom_mpaddr = elf_uval(elf, phdr, p_paddr);
1960 dom_mpaddr += phys_load_offset;
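/* Sketch of the copy loop below (descriptive only): each loadable segment
 * is copied into freshly assigned domain pages one page at a time; a
 * partial final page of file data is zero-padded, and pages beyond
 * p_filesz (e.g. BSS) are simply cleared. Executable pages are privified
 * (a no-op unless HAVE_PRIVIFY_MEMORY is defined) and their icache flushed. */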
1962 while (memsz > 0) {
1963 p = assign_new_domain_page(d,dom_mpaddr);
1964 BUG_ON (unlikely(p == NULL));
1965 dom_imva = __va_ul(page_to_maddr(p));
1966 if (filesz > 0) {
1967 if (filesz >= PAGE_SIZE)
1968 copy_page((void *) dom_imva,
1969 (void *) elfaddr);
1970 else {
1971 // copy partial page
1972 memcpy((void *) dom_imva,
1973 (void *) elfaddr, filesz);
1974 // zero the rest of page
1975 memset((void *) dom_imva+filesz, 0,
1976 PAGE_SIZE-filesz);
1977 }
1978 //FIXME: This test for code seems to find a lot more than objdump -x does
1979 if (elf_uval(elf, phdr, p_flags) & PF_X) {
1980 privify_memory(dom_imva,PAGE_SIZE);
1981 flush_icache_range(dom_imva,
1982 dom_imva+PAGE_SIZE);
1983 }
1984 }
1985 else if (memsz > 0) {
1986 /* always zero out entire page */
1987 clear_page((void *) dom_imva);
1988 }
1989 memsz -= PAGE_SIZE;
1990 filesz -= PAGE_SIZE;
1991 elfaddr += PAGE_SIZE;
1992 dom_mpaddr += PAGE_SIZE;
1993 }
1994 }
1995 }
1997 static void __init calc_dom0_size(void)
1998 {
1999 unsigned long domheap_pages;
2000 unsigned long p2m_pages;
2001 unsigned long spare_hv_pages;
2002 unsigned long max_dom0_size;
2004 /* Estimate maximum memory we can safely allocate for dom0
2005 * by subtracting the p2m table allocation and a chunk of memory
2006 * for DMA and PCI mapping from the available domheap pages. The
2007 * chunk for DMA, PCI, etc., is a guesstimate, as Xen doesn't seem
2008 * to have a good idea of what those requirements might be ahead
2009 * of time, calculated at 128MB + 1MB per 4GB of system memory */
2010 domheap_pages = avail_domheap_pages();
2011 p2m_pages = domheap_pages / PTRS_PER_PTE;
2012 spare_hv_pages = 8192 + (domheap_pages / 4096);
2013 max_dom0_size = (domheap_pages - (p2m_pages + spare_hv_pages))
2014 * PAGE_SIZE;
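/* Worked example (illustrative, assuming the usual 16KB ia64 page size,
 * so PTRS_PER_PTE == 2048): with 8GB of domheap (512K pages), the p2m
 * estimate is 256 pages (4MB) and spare_hv_pages is 8192 + 128 pages
 * (about 130MB), leaving roughly 7.8GB as the permitted dom0 maximum. */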
2015 printk("Maximum permitted dom0 size: %luMB\n",
2016 max_dom0_size / (1024*1024));
2018 /* validate proposed dom0_size, fix up as needed */
2019 if (dom0_size > max_dom0_size) {
2020 printk("Reducing dom0 memory allocation from %luK to %luK "
2021 "to fit available memory\n",
2022 dom0_size / 1024, max_dom0_size / 1024);
2023 dom0_size = max_dom0_size;
2024 }
2026 /* dom0_mem=0 can be passed in to give all available mem to dom0 */
2027 if (dom0_size == 0) {
2028 printk("Allocating all available memory to dom0\n");
2029 dom0_size = max_dom0_size;
2030 }
2032 /* Check dom0 size. */
2033 if (dom0_size < 4 * 1024 * 1024) {
2034 panic("dom0_mem is too small, boot aborted"
2035 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
2038 if (running_on_sim) {
2039 dom0_size = 128*1024*1024; //FIXME: Should be configurable
2042 /* no need to allocate pages for now
2043 * pages are allocated by map_new_domain_page() via loaddomainelfimage()
2044 */
2045 }
2048 /*
2049 * Domain 0 has direct access to all devices. The major point of this
2050 * stub, however, is to allow alloc_dom_mem to be handled for order > 0
2051 * requests; dom0 requires that in order to allocate memory for other
2052 * domains.
2053 */
2054 static void __init physdev_init_dom0(struct domain *d)
2055 {
2056 if (iomem_permit_access(d, 0UL, ~0UL))
2057 BUG();
2058 if (irqs_permit_access(d, 0, NR_IRQS-1))
2059 BUG();
2060 if (ioports_permit_access(d, 0, 0, 0xffff))
2061 BUG();
2062 }
2064 int __init construct_dom0(struct domain *d,
2065 unsigned long image_start, unsigned long image_len,
2066 unsigned long initrd_start, unsigned long initrd_len,
2067 char *cmdline)
2068 {
2069 int i, rc;
2070 start_info_t *si;
2071 dom0_vga_console_info_t *ci;
2072 struct vcpu *v = d->vcpu[0];
2073 unsigned long max_pages;
2075 struct elf_binary elf;
2076 struct elf_dom_parms parms;
2077 unsigned long p_start;
2078 unsigned long pkern_start;
2079 unsigned long pkern_entry;
2080 unsigned long pkern_end;
2081 unsigned long pinitrd_start = 0;
2082 unsigned long pstart_info;
2083 unsigned long phys_load_offset;
2084 struct page_info *start_info_page;
2085 unsigned long bp_mpa;
2086 struct ia64_boot_param *bp;
2088 //printk("construct_dom0: starting\n");
2090 /* Sanity! */
2091 BUG_ON(d != dom0);
2092 BUG_ON(d->vcpu[0] == NULL);
2093 BUG_ON(v->is_initialised);
2095 printk("*** LOADING DOMAIN 0 ***\n");
2097 calc_dom0_size();
2099 max_pages = dom0_size / PAGE_SIZE;
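/* Illustrative: with the default dom0_size of 4GB and 16KB pages (the
 * usual ia64 PAGE_SIZE), max_pages works out to 256K pages. */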
2100 d->max_pages = max_pages;
2101 d->tot_pages = 0;
2103 rc = elf_init(&elf, (void*)image_start, image_len);
2104 if ( rc != 0 )
2105 return rc;
2106 #ifdef VERBOSE
2107 elf_set_verbose(&elf);
2108 #endif
2109 elf_parse_binary(&elf);
2110 if (0 != (rc = elf_xen_parse(&elf, &parms)))
2111 return rc;
2113 /*
2114 * We cannot rely on the load address in the ELF headers to
2115 * determine the metaphysical address at which the image
2116 * is loaded. Patch the address to match the real one, based
2117 * on xen_pstart.
2118 */
2119 phys_load_offset = xen_pstart - elf.pstart;
2120 elf.pstart += phys_load_offset;
2121 elf.pend += phys_load_offset;
2122 parms.virt_kstart += phys_load_offset;
2123 parms.virt_kend += phys_load_offset;
2124 parms.virt_entry += phys_load_offset;
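/* Illustrative example (values invented): if the ELF headers say the
 * kernel loads at physical 0x4000000 but xen_pstart is 0x8000000, then
 * phys_load_offset is 0x4000000 and every address taken from the headers
 * is shifted up by that amount. */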
2126 printk(" Dom0 kernel: %s, %s, paddr 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
2127 elf_64bit(&elf) ? "64-bit" : "32-bit",
2128 elf_msb(&elf) ? "msb" : "lsb",
2129 elf.pstart, elf.pend);
2130 if (!elf_64bit(&elf) ||
2131 elf_uval(&elf, elf.ehdr, e_machine) != EM_IA_64) {
2132 printk("Incompatible kernel binary\n");
2133 return -1;
2134 }
2136 p_start = parms.virt_base;
2137 pkern_start = parms.virt_kstart;
2138 pkern_end = parms.virt_kend;
2139 pkern_entry = parms.virt_entry;
2141 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
2143 if ( (p_start & (PAGE_SIZE-1)) != 0 )
2144 {
2145 printk("Initial guest OS must load to a page boundary.\n");
2146 return -EINVAL;
2147 }
2149 pstart_info = PAGE_ALIGN(pkern_end);
2150 if(initrd_start && initrd_len){
2151 unsigned long offset;
2153 /* The next page-aligned boundary after the start info.
2154 Note: EFI_PAGE_SHIFT = 12 <= PAGE_SHIFT */
2155 pinitrd_start = pstart_info + PAGE_SIZE;
2157 if ((pinitrd_start + initrd_len - phys_load_offset) >= dom0_size)
2158 panic("%s: not enough memory assigned to dom0", __func__);
2160 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
2161 struct page_info *p;
2162 p = assign_new_domain_page(d, pinitrd_start + offset);
2163 if (p == NULL)
2164 panic("%s: can't allocate page for initrd image", __func__);
2165 if (initrd_len < offset + PAGE_SIZE)
2166 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
2167 initrd_len - offset);
2168 else
2169 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
2170 }
2171 }
2173 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
2174 " Kernel image: %lx->%lx\n"
2175 " Entry address: %lx\n"
2176 " Init. ramdisk: %lx len %lx\n"
2177 " Start info.: %lx->%lx\n",
2178 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
2179 pstart_info, pstart_info + PAGE_SIZE);
2181 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
2182 {
2183 printk("Initial guest OS requires too much space\n"
2184 "(%luMB is greater than %luMB limit)\n",
2185 (pkern_end-pkern_start)>>20,
2186 (max_pages <<PAGE_SHIFT)>>20);
2187 return -ENOMEM;
2188 }
2190 // if high 3 bits of pkern start are non-zero, error
2192 // if pkern end is after end of metaphysical memory, error
2193 // (we should be able to deal with this... later)
2195 /* Mask all upcalls... */
2196 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
2197 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
2199 if (dom0_max_vcpus == 0)
2200 dom0_max_vcpus = MAX_VIRT_CPUS;
2201 if (dom0_max_vcpus > num_online_cpus())
2202 dom0_max_vcpus = num_online_cpus();
2203 if (dom0_max_vcpus > MAX_VIRT_CPUS)
2204 dom0_max_vcpus = MAX_VIRT_CPUS;
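/* Illustrative: dom0_max_vcpus=8 on a 4-CPU machine ends up as 4, and
 * dom0_max_vcpus=0 ends up as the number of online CPUs (capped at
 * MAX_VIRT_CPUS). */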
2206 printk ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
2207 for ( i = 1; i < dom0_max_vcpus; i++ )
2208 if (alloc_vcpu(d, i, i) == NULL)
2209 panic("Cannot allocate dom0 vcpu %d\n", i);
2211 /* Copy the OS image. */
2212 loaddomainelfimage(d, &elf, phys_load_offset);
2214 BUILD_BUG_ON(sizeof(start_info_t) + sizeof(dom0_vga_console_info_t) +
2215 sizeof(struct ia64_boot_param) > PAGE_SIZE);
2217 /* Set up start info area. */
2218 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
2219 start_info_page = assign_new_domain_page(d, pstart_info);
2220 if (start_info_page == NULL)
2221 panic("can't allocate start info page");
2222 si = page_to_virt(start_info_page);
2223 clear_page(si);
2224 snprintf(si->magic, sizeof(si->magic), "xen-3.0-ia64");
2225 si->nr_pages = max_pages;
2226 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
2227 si->flags |= (xen_processor_pmbits << 8) & SIF_PM_MASK;
2229 printk("Dom0: 0x%lx\n", (u64)dom0);
2231 v->is_initialised = 1;
2232 clear_bit(_VPF_down, &v->pause_flags);
2234 /* Build firmware.
2235 Note: the Linux kernel reserves the memory used by start_info, so there is
2236 no need to remove it from the MDT. */
2237 bp_mpa = pstart_info + sizeof(struct start_info);
2238 rc = dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
2239 if (rc != 0)
2240 return rc;
2242 /* Fill boot param. */
2243 strlcpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
2245 bp = (struct ia64_boot_param *)((unsigned char *)si +
2246 sizeof(start_info_t));
2247 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
2249 /* We assume console has reached the last line! */
2250 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
2251 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
2252 bp->console_info.orig_x = 0;
2253 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
2254 0 : bp->console_info.num_rows - 1;
2256 bp->initrd_start = pinitrd_start;
2257 bp->initrd_size = ia64_boot_param->initrd_size;
2259 ci = (dom0_vga_console_info_t *)((unsigned char *)si +
2260 sizeof(start_info_t) +
2261 sizeof(struct ia64_boot_param));
2263 if (fill_console_start_info(ci)) {
2264 si->console.dom0.info_off = sizeof(start_info_t) +
2265 sizeof(struct ia64_boot_param);
2266 si->console.dom0.info_size = sizeof(dom0_vga_console_info_t);
2267 }
2269 vcpu_init_regs (v);
2271 vcpu_regs(v)->r28 = bp_mpa;
2273 vcpu_regs (v)->cr_iip = pkern_entry;
2275 physdev_init_dom0(d);
2277 return 0;
2278 }
2280 void machine_restart(unsigned int delay_millisecs)
2281 {
2282 mdelay(delay_millisecs);
2283 console_start_sync();
2284 if (running_on_sim)
2285 printk ("machine_restart called. spinning...\n");
2286 else
2287 (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
2288 while(1);
2289 }
2291 extern void cpu_halt(void);
2293 void machine_halt(void)
2294 {
2295 console_start_sync();
2297 #ifdef CONFIG_SMP
2298 smp_send_stop();
2299 #endif
2301 printk ("machine_halt called. spinning...\n");
2302 while(1);
2303 }
2305 void sync_vcpu_execstate(struct vcpu *v)
2306 {
2307 // __ia64_save_fpu(v->arch._thread.fph);
2308 // FIXME SMP: Anything else needed here for SMP?
2309 }
2311 /* This function is taken from xen/arch/x86/domain.c */
2312 long
2313 arch_do_vcpu_op(int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
2314 {
2315 long rc = 0;
2317 switch (cmd) {
2318 case VCPUOP_register_runstate_memory_area:
2319 {
2320 struct vcpu_register_runstate_memory_area area;
2321 struct vcpu_runstate_info runstate;
2323 rc = -EFAULT;
2324 if (copy_from_guest(&area, arg, 1))
2325 break;
2327 if (!guest_handle_okay(area.addr.h, 1))
2328 break;
2330 rc = 0;
2331 runstate_guest(v) = area.addr.h;
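/* Descriptive note: the newly registered area is filled in right away,
 * directly for the current vcpu or via a snapshot for a remote one,
 * presumably so the guest sees valid runstate data before the next
 * context switch updates it. */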
2333 if (v == current) {
2334 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
2335 } else {
2336 vcpu_runstate_get(v, &runstate);
2337 __copy_to_guest(runstate_guest(v), &runstate, 1);
2338 }
2340 break;
2341 }
2342 default:
2343 rc = -ENOSYS;
2344 break;
2345 }
2347 return rc;
2348 }
2350 static void __init parse_dom0_mem(char *s)
2352 dom0_size = parse_size_and_unit(s, NULL);
2353 }
2354 custom_param("dom0_mem", parse_dom0_mem);
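/* Usage sketch (illustrative): "dom0_mem=512M" or "dom0_mem=65536K" on the
 * Xen command line sets dom0_size, and "dom0_mem=0" hands dom0 all
 * available memory (see calc_dom0_size() above). */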
2356 /*
2357 * Helper for the identity-mapping optimization feature: translate an
2358 * XEN_IA64_OPTF_IDENT_MAP_* command into its corresponding flag.
2359 */
2360 static inline unsigned long
2361 optf_identity_mapping_cmd_to_flg(unsigned long cmd)
2362 {
2363 switch(cmd) {
2364 case XEN_IA64_OPTF_IDENT_MAP_REG7:
2365 return XEN_IA64_OPTF_IDENT_MAP_REG7_FLG;
2366 case XEN_IA64_OPTF_IDENT_MAP_REG4:
2367 return XEN_IA64_OPTF_IDENT_MAP_REG4_FLG;
2368 case XEN_IA64_OPTF_IDENT_MAP_REG5:
2369 return XEN_IA64_OPTF_IDENT_MAP_REG5_FLG;
2370 default:
2371 BUG();
2372 return 0;
2373 }
2375 /* NOTREACHED */
2376 }
2378 static inline void
2379 optf_set_identity_mapping(unsigned long* mask, struct identity_mapping* im,
2380 struct xen_ia64_opt_feature* f)
2381 {
2382 unsigned long flag = optf_identity_mapping_cmd_to_flg(f->cmd);
2384 if (f->on) {
2385 *mask |= flag;
2386 im->pgprot = f->pgprot;
2387 im->key = f->key;
2388 } else {
2389 *mask &= ~flag;
2390 im->pgprot = 0;
2391 im->key = 0;
2392 }
2393 }
2395 /*
2396 * Switch an optimization feature on/off.
2397 * The vcpu must be paused to avoid racy access to opt_feature.
2398 */
2399 int
2400 domain_opt_feature(struct domain *d, struct xen_ia64_opt_feature* f)
2401 {
2402 struct opt_feature* optf = &d->arch.opt_feature;
2403 struct vcpu *v;
2404 long rc = 0;
2406 for_each_vcpu(d, v) {
2407 if (v != current)
2408 vcpu_pause(v);
2409 }
2411 switch (f->cmd) {
2412 case XEN_IA64_OPTF_IDENT_MAP_REG4:
2413 optf_set_identity_mapping(&optf->mask, &optf->im_reg4, f);
2414 break;
2415 case XEN_IA64_OPTF_IDENT_MAP_REG5:
2416 optf_set_identity_mapping(&optf->mask, &optf->im_reg5, f);
2417 break;
2418 case XEN_IA64_OPTF_IDENT_MAP_REG7:
2419 optf_set_identity_mapping(&optf->mask, &optf->im_reg7, f);
2420 break;
2421 default:
2422 printk("%s: unknown opt_feature: %ld\n", __func__, f->cmd);
2423 rc = -ENOSYS;
2424 break;
2425 }
2427 for_each_vcpu(d, v) {
2428 if (v != current)
2429 vcpu_unpause(v);
2430 }
2432 return rc;