ia64/xen-unstable

view xen/arch/ia64/xen/domain.c @ 18633:f27787b9f8d7

[IA64] Change the ioports_permit_access() interface.

When VT-d is used to assign a device, the guest port may not be equal
to the host port. Change the ioports_permit_access() interface to also
take the guest pseudo-physical address.

Signed-off-by: Anthony Xu <anthony.xu@intel.com>
author Isaku Yamahata <yamahata@valinux.co.jp>
date Fri Oct 17 17:40:15 2008 +0900 (2008-10-17)
parents 903a901ab372
children 02c8733e2d91
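
The interface change is visible at the dom0 setup call site later in this
file (physdev_init_dom0), which grants the whole host I/O port range. Below
is a minimal before/after sketch of that call, inferred from the call site;
the extra leading argument is the guest (pseudo-physical) port, and the
argument names in the comments are assumptions, not the actual prototype:

    /* old interface (sketch): host port range only */
    ioports_permit_access(d, 0, 0xffff);

    /* new interface (sketch): guest port plus host port range */
    ioports_permit_access(d, 0 /* guest */, 0 /* host start */, 0xffff /* host end */);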
line source
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 *
12 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
13 * VA Linux Systems Japan K.K.
14 * dom0 vp model support
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/errno.h>
21 #include <xen/sched.h>
22 #include <xen/smp.h>
23 #include <xen/delay.h>
24 #include <xen/softirq.h>
25 #include <xen/mm.h>
26 #include <xen/iocap.h>
27 #include <asm/asm-xsi-offsets.h>
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <asm/processor.h>
31 #include <xen/event.h>
32 #include <xen/console.h>
33 #include <xen/version.h>
34 #include <public/libelf.h>
35 #include <asm/pgalloc.h>
36 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
37 #include <asm/vcpu.h> /* for function declarations */
38 #include <public/xen.h>
39 #include <xen/domain.h>
40 #include <asm/vmx.h>
41 #include <asm/vmx_vcpu.h>
42 #include <asm/vmx_vpd.h>
43 #include <asm/vmx_phy_mode.h>
44 #include <asm/vmx_vcpu_save.h>
45 #include <asm/vhpt.h>
46 #include <asm/vcpu.h>
47 #include <asm/tlbflush.h>
48 #include <asm/regionreg.h>
49 #include <asm/dom_fw.h>
50 #include <asm/shadow.h>
51 #include <xen/guest_access.h>
52 #include <asm/tlb_track.h>
53 #include <asm/perfmon.h>
54 #include <asm/sal.h>
55 #include <public/vcpu.h>
56 #include <linux/cpu.h>
57 #include <linux/notifier.h>
58 #include <asm/debugger.h>
60 /* dom0_size: default memory allocation for dom0 (~4GB) */
61 static unsigned long __initdata dom0_size = 4096UL*1024UL*1024UL;
63 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
64 static unsigned int __initdata dom0_max_vcpus = 4;
65 integer_param("dom0_max_vcpus", dom0_max_vcpus);
67 extern char dom0_command_line[];
69 /* forward declaration */
70 static void init_switch_stack(struct vcpu *v);
72 /* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
73 This is a Xen virtual address. */
74 DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
75 DEFINE_PER_CPU(int *, current_psr_ic_addr);
77 DEFINE_PER_CPU(struct vcpu *, fp_owner);
79 #include <xen/sched-if.h>
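/* Decide whether the per-cpu VHPT and TLB need to be flushed when this
 * physical cpu switches from prev to next, based on which vcpu of the
 * next domain last ran here and on the tlbflush clock. */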
81 static void flush_vtlb_for_context_switch(struct vcpu* prev, struct vcpu* next)
82 {
83 int cpu = smp_processor_id();
84 int last_vcpu_id, last_processor;
86 if (!is_idle_domain(prev->domain))
87 tlbflush_update_time
88 (&prev->domain->arch.last_vcpu[cpu].tlbflush_timestamp,
89 tlbflush_current_time());
91 if (is_idle_domain(next->domain))
92 return;
94 last_vcpu_id = next->domain->arch.last_vcpu[cpu].vcpu_id;
95 last_processor = next->arch.last_processor;
97 next->domain->arch.last_vcpu[cpu].vcpu_id = next->vcpu_id;
98 next->arch.last_processor = cpu;
100 if ((last_vcpu_id != next->vcpu_id &&
101 last_vcpu_id != INVALID_VCPU_ID) ||
102 (last_vcpu_id == next->vcpu_id &&
103 last_processor != cpu &&
104 last_processor != INVALID_PROCESSOR)) {
105 #ifdef CONFIG_XEN_IA64_TLBFLUSH_CLOCK
106 u32 last_tlbflush_timestamp =
107 next->domain->arch.last_vcpu[cpu].tlbflush_timestamp;
108 #endif
109 int vhpt_is_flushed = 0;
111 // If the vTLB implementation is changed,
112 // the following must be updated as well.
113 if (VMX_DOMAIN(next)) {
114 // Currently the vTLB for a VT-i domain is per vcpu,
115 // so no flushing is needed.
116 } else if (HAS_PERVCPU_VHPT(next->domain)) {
117 // nothing to do
118 } else {
119 if (NEED_FLUSH(__get_cpu_var(vhpt_tlbflush_timestamp),
120 last_tlbflush_timestamp)) {
121 local_vhpt_flush();
122 vhpt_is_flushed = 1;
123 }
124 }
125 if (vhpt_is_flushed || NEED_FLUSH(__get_cpu_var(tlbflush_time),
126 last_tlbflush_timestamp)) {
127 local_flush_tlb_all();
128 perfc_incr(tlbflush_clock_cswitch_purge);
129 } else {
130 perfc_incr(tlbflush_clock_cswitch_skip);
131 }
132 perfc_incr(flush_vtlb_for_context_switch);
133 }
134 }
136 static void flush_cache_for_context_switch(struct vcpu *next)
137 {
138 extern cpumask_t cpu_cache_coherent_map;
139 int cpu = smp_processor_id();
141 if (is_idle_vcpu(next) ||
142 __test_and_clear_bit(cpu, &next->arch.cache_coherent_map)) {
143 if (cpu_test_and_clear(cpu, cpu_cache_coherent_map)) {
144 unsigned long flags;
145 u64 progress = 0;
146 s64 status;
148 local_irq_save(flags);
149 status = ia64_pal_cache_flush(4, 0, &progress, NULL);
150 local_irq_restore(flags);
151 if (status != 0)
152 panic_domain(NULL, "PAL_CACHE_FLUSH ERROR, "
153 "cache_type=4 status %lx", status);
154 }
155 }
156 }
158 static void set_current_psr_i_addr(struct vcpu* v)
159 {
160 __ia64_per_cpu_var(current_psr_i_addr) =
161 (uint8_t*)(v->domain->arch.shared_info_va +
162 INT_ENABLE_OFFSET(v));
163 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
164 (v->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
165 }
167 static void clear_current_psr_i_addr(void)
168 {
169 __ia64_per_cpu_var(current_psr_i_addr) = NULL;
170 __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
171 }
173 static void lazy_fp_switch(struct vcpu *prev, struct vcpu *next)
174 {
175 /*
176 * Implement eager save, lazy restore
177 */
178 if (!is_idle_vcpu(prev)) {
179 if (VMX_DOMAIN(prev)) {
180 if (FP_PSR(prev) & IA64_PSR_MFH) {
181 __ia64_save_fpu(prev->arch._thread.fph);
182 __ia64_per_cpu_var(fp_owner) = prev;
183 }
184 } else {
185 if (PSCB(prev, hpsr_mfh)) {
186 __ia64_save_fpu(prev->arch._thread.fph);
187 __ia64_per_cpu_var(fp_owner) = prev;
188 }
189 }
190 }
192 if (!is_idle_vcpu(next)) {
193 if (VMX_DOMAIN(next)) {
194 FP_PSR(next) = IA64_PSR_DFH;
195 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
196 } else {
197 PSCB(next, hpsr_dfh) = 1;
198 PSCB(next, hpsr_mfh) = 0;
199 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
200 }
201 }
202 }
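/* Load per-vcpu translation state onto the current physical cpu:
 * region registers, PTA, kernel registers, protection key registers
 * (if in use), and the per-cpu psr.i/psr.ic shortcut addresses. */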
204 static void load_state(struct vcpu *v)
205 {
206 load_region_regs(v);
207 ia64_set_pta(vcpu_pta(v));
208 vcpu_load_kernel_regs(v);
209 if (vcpu_pkr_in_use(v))
210 vcpu_pkr_load_regs(v);
211 set_current_psr_i_addr(v);
212 }
214 void schedule_tail(struct vcpu *prev)
215 {
216 extern char ia64_ivt;
218 context_saved(prev);
220 if (VMX_DOMAIN(current))
221 vmx_do_resume(current);
222 else {
223 if (VMX_DOMAIN(prev))
224 ia64_set_iva(&ia64_ivt);
225 load_state(current);
226 migrate_timer(&current->arch.hlt_timer, current->processor);
227 }
228 flush_vtlb_for_context_switch(prev, current);
229 }
231 void context_switch(struct vcpu *prev, struct vcpu *next)
232 {
233 uint64_t spsr;
235 local_irq_save(spsr);
237 if (VMX_DOMAIN(prev)) {
238 vmx_save_state(prev);
239 if (!VMX_DOMAIN(next)) {
240 /* VMX domains can change the physical cr.dcr.
241 * Restore default to prevent leakage. */
242 uint64_t dcr = ia64_getreg(_IA64_REG_CR_DCR);
243 /* xenoprof:
244 * don't change psr.pp.
245 * It is manipulated by xenoprof.
246 */
247 dcr = (IA64_DEFAULT_DCR_BITS & ~IA64_DCR_PP) | (dcr & IA64_DCR_PP);
248 ia64_setreg(_IA64_REG_CR_DCR, dcr);
249 }
250 }
252 lazy_fp_switch(prev, current);
254 if (prev->arch.dbg_used || next->arch.dbg_used) {
255 /*
256 * Load debug registers either because they are valid or to clear
257 * the previous one.
258 */
259 ia64_load_debug_regs(next->arch.dbr);
260 }
262 /*
263 * disable VHPT walker.
264 * ia64_switch_to() might cause a VHPT fault because it flushes
265 * dtr[IA64_TR_VHPT] and reinserts the mapping with dtr[IA64_TR_STACK].
266 * (VHPT_SIZE_LOG2 << 2) is just for avoiding
267 * Reserved Register/Field fault.
268 */
269 ia64_set_pta(VHPT_SIZE_LOG2 << 2);
270 prev = ia64_switch_to(next);
272 /* Note: ia64_switch_to does not return here at vcpu initialization. */
274 if (VMX_DOMAIN(current)) {
275 vmx_load_state(current);
276 } else {
277 extern char ia64_ivt;
279 if (VMX_DOMAIN(prev))
280 ia64_set_iva(&ia64_ivt);
282 if (!is_idle_vcpu(current)) {
283 load_state(current);
284 vcpu_set_next_timer(current);
285 if (vcpu_timer_expired(current))
286 vcpu_pend_timer(current);
287 /* steal time accounting */
288 if (!guest_handle_is_null(runstate_guest(current)))
289 __copy_to_guest(runstate_guest(current), &current->runstate, 1);
290 } else {
291 /* When switching to the idle domain, we only need to disable the
292 * vhpt walker. All accesses that happen within the idle context
293 * will then be handled by TR mappings and the identity mapping.
294 */
295 clear_current_psr_i_addr();
296 }
297 }
298 local_irq_restore(spsr);
300 /* lazy fp */
301 if (current->processor != current->arch.last_processor) {
302 unsigned long *addr;
303 addr = (unsigned long *)per_cpu_addr(fp_owner,
304 current->arch.last_processor);
305 ia64_cmpxchg(acq, addr, current, 0, 8);
306 }
308 flush_vtlb_for_context_switch(prev, current);
309 flush_cache_for_context_switch(current);
310 context_saved(prev);
311 }
313 void continue_running(struct vcpu *same)
314 {
315 /* nothing to do */
316 }
318 #ifdef CONFIG_PERFMON
319 static int pal_halt = 1;
320 static int can_do_pal_halt = 1;
322 static int __init nohalt_setup(char * str)
323 {
324 pal_halt = can_do_pal_halt = 0;
325 return 1;
326 }
327 __setup("nohalt", nohalt_setup);
329 void
330 update_pal_halt_status(int status)
331 {
332 can_do_pal_halt = pal_halt && status;
333 }
334 #else
335 #define can_do_pal_halt (1)
336 #endif
338 static void default_idle(void)
339 {
340 local_irq_disable();
341 if ( !softirq_pending(smp_processor_id()) ) {
342 if (can_do_pal_halt)
343 safe_halt();
344 else
345 cpu_relax();
346 }
347 local_irq_enable();
348 }
350 extern void play_dead(void);
352 static void continue_cpu_idle_loop(void)
353 {
354 int cpu = smp_processor_id();
356 for ( ; ; )
357 {
358 #ifdef IA64
359 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
360 #else
361 irq_stat[cpu].idle_timestamp = jiffies;
362 #endif
363 page_scrub_schedule_work();
364 while ( !softirq_pending(cpu) )
365 default_idle();
366 raise_softirq(SCHEDULE_SOFTIRQ);
367 do_softirq();
368 if (!cpu_online(cpu))
369 play_dead();
370 }
371 }
373 void startup_cpu_idle_loop(void)
374 {
375 /* Just some sanity to ensure that the scheduler is set up okay. */
376 ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
377 raise_softirq(SCHEDULE_SOFTIRQ);
379 continue_cpu_idle_loop();
380 }
382 /* compile time test for get_order(sizeof(mapped_regs_t)) !=
383 * get_order_from_shift(XMAPPEDREGS_SHIFT)
384 */
385 #if !(((1 << (XMAPPEDREGS_SHIFT - 1)) < MAPPED_REGS_T_SIZE) && \
386 (MAPPED_REGS_T_SIZE < (1 << (XMAPPEDREGS_SHIFT + 1))))
387 # error "XMAPPEDREGS_SHIFT doesn't match sizeof(mapped_regs_t)."
388 #endif
390 void hlt_timer_fn(void *data)
391 {
392 struct vcpu *v = data;
393 vcpu_unblock(v);
394 }
396 void relinquish_vcpu_resources(struct vcpu *v)
397 {
398 if (HAS_PERVCPU_VHPT(v->domain))
399 pervcpu_vhpt_free(v);
400 if (v->arch.privregs != NULL) {
401 free_xenheap_pages(v->arch.privregs,
402 get_order_from_shift(XMAPPEDREGS_SHIFT));
403 v->arch.privregs = NULL;
404 }
405 kill_timer(&v->arch.hlt_timer);
406 }
408 struct vcpu *alloc_vcpu_struct(void)
409 {
410 struct page_info *page;
411 struct vcpu *v;
412 struct thread_info *ti;
413 static int first_allocation = 1;
415 if (first_allocation) {
416 first_allocation = 0;
417 /* Keep idle vcpu0 statically allocated at compile time, because
418 * some code inherited from Linux still requires it in the early phase.
419 */
420 return idle_vcpu[0];
421 }
423 page = alloc_domheap_pages(NULL, KERNEL_STACK_SIZE_ORDER, 0);
424 if (page == NULL)
425 return NULL;
426 v = page_to_virt(page);
427 memset(v, 0, sizeof(*v));
429 ti = alloc_thread_info(v);
430 /* Clear thread_info to clear some important fields, like
431 * preempt_count
432 */
433 memset(ti, 0, sizeof(struct thread_info));
434 init_switch_stack(v);
436 return v;
437 }
439 void free_vcpu_struct(struct vcpu *v)
440 {
441 free_domheap_pages(virt_to_page(v), KERNEL_STACK_SIZE_ORDER);
442 }
444 int vcpu_initialise(struct vcpu *v)
445 {
446 struct domain *d = v->domain;
448 if (!is_idle_domain(d)) {
449 v->arch.metaphysical_rid_dt = d->arch.metaphysical_rid_dt;
450 v->arch.metaphysical_rid_d = d->arch.metaphysical_rid_d;
451 /* Set default values to saved_rr. */
452 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rid_dt;
453 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rid_dt;
455 /* Is this correct?
456 It depends on the domain's RID usage.
458 A domain may share RIDs among its processors (e.g. when it has a
459 global VHPT). In this case we should also share RIDs
460 among vcpus, and the RID range should be the same.
462 However a domain may have per-cpu RID allocation. In
463 that case we don't want to share RIDs among vcpus, but we may
464 do so if two vcpus are on the same cpu... */
466 v->arch.starting_rid = d->arch.starting_rid;
467 v->arch.ending_rid = d->arch.ending_rid;
468 v->arch.rid_bits = d->arch.rid_bits;
469 v->arch.breakimm = d->arch.breakimm;
470 v->arch.last_processor = INVALID_PROCESSOR;
471 v->arch.vhpt_pg_shift = PAGE_SHIFT;
472 }
474 if (!VMX_DOMAIN(v))
475 init_timer(&v->arch.hlt_timer, hlt_timer_fn, v,
476 first_cpu(cpu_online_map));
478 return 0;
479 }
481 static void vcpu_share_privregs_with_guest(struct vcpu *v)
482 {
483 struct domain *d = v->domain;
484 int i, order = get_order_from_shift(XMAPPEDREGS_SHIFT);
486 for (i = 0; i < (1 << order); i++)
487 share_xen_page_with_guest(virt_to_page(v->arch.privregs) + i,
488 d, XENSHARE_writable);
489 /*
490 * XXX IA64_XMAPPEDREGS_PADDR
491 * Assign these pages into the guest pseudo-physical address
492 * space so that dom0 can map them by gmfn.
493 * This is necessary for domain save, restore and dump-core.
494 */
495 for (i = 0; i < XMAPPEDREGS_SIZE; i += PAGE_SIZE)
496 assign_domain_page(d, IA64_XMAPPEDREGS_PADDR(v->vcpu_id) + i,
497 virt_to_maddr(v->arch.privregs + i));
498 }
500 int vcpu_late_initialise(struct vcpu *v)
501 {
502 int rc, order;
504 if (HAS_PERVCPU_VHPT(v->domain)) {
505 rc = pervcpu_vhpt_alloc(v);
506 if (rc != 0)
507 return rc;
508 }
510 /* Create privregs page. */
511 order = get_order_from_shift(XMAPPEDREGS_SHIFT);
512 v->arch.privregs = alloc_xenheap_pages(order);
513 if (v->arch.privregs == NULL)
514 return -ENOMEM;
515 BUG_ON(v->arch.privregs == NULL);
516 memset(v->arch.privregs, 0, 1 << XMAPPEDREGS_SHIFT);
517 vcpu_share_privregs_with_guest(v);
519 return 0;
520 }
522 void vcpu_destroy(struct vcpu *v)
523 {
524 if (is_hvm_vcpu(v))
525 vmx_relinquish_vcpu_resources(v);
526 else
527 relinquish_vcpu_resources(v);
528 }
530 static unsigned long*
531 vcpu_to_rbs_bottom(struct vcpu *v)
532 {
533 return (unsigned long*)((char *)v + IA64_RBS_OFFSET);
534 }
536 static void init_switch_stack(struct vcpu *v)
537 {
538 struct pt_regs *regs = vcpu_regs (v);
539 struct switch_stack *sw = (struct switch_stack *) regs - 1;
540 extern void ia64_ret_from_clone;
542 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
543 sw->ar_bspstore = (unsigned long)vcpu_to_rbs_bottom(v);
544 sw->b0 = (unsigned long) &ia64_ret_from_clone;
545 sw->ar_fpsr = FPSR_DEFAULT;
546 v->arch._thread.ksp = (unsigned long) sw - 16;
547 // Stay on the kernel stack because we may get interrupts!
548 // ia64_ret_from_clone switches to user stack
549 v->arch._thread.on_ustack = 0;
550 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
551 }
553 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
554 static int opt_pervcpu_vhpt = 1;
555 integer_param("pervcpu_vhpt", opt_pervcpu_vhpt);
556 #endif
558 int arch_domain_create(struct domain *d, unsigned int domcr_flags)
559 {
560 int i;
562 // the following will eventually need to be negotiated dynamically
563 d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
564 d->arch.breakimm = 0x1000;
565 for (i = 0; i < NR_CPUS; i++) {
566 d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
567 }
569 if (is_idle_domain(d))
570 return 0;
572 foreign_p2m_init(d);
573 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
574 d->arch.has_pervcpu_vhpt = opt_pervcpu_vhpt;
575 dprintk(XENLOG_INFO, "%s:%d domain %d pervcpu_vhpt %d\n",
576 __func__, __LINE__, d->domain_id, d->arch.has_pervcpu_vhpt);
577 #endif
578 if (tlb_track_create(d) < 0)
579 goto fail_nomem1;
580 d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
581 if (d->shared_info == NULL)
582 goto fail_nomem;
583 BUG_ON(d->shared_info == NULL);
584 memset(d->shared_info, 0, XSI_SIZE);
585 for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
586 share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
587 d, XENSHARE_writable);
589 /* We may also need an emulation RID for region 4, though it's unlikely
590 * that a guest will issue uncacheable accesses in metaphysical mode.
591 * But keeping such info here may be saner.
592 */
593 if (!allocate_rid_range(d,0))
594 goto fail_nomem;
596 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
597 d->arch.relres = RELRES_not_started;
598 d->arch.mm_teardown_offset = 0;
599 INIT_LIST_HEAD(&d->arch.relmem_list);
601 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
602 goto fail_nomem;
604 /*
605 * grant_table_create() can't fully initialize the grant table for a
606 * domain because it is called before arch_domain_create().
607 * Here we complete the initialization, which requires the p2m table.
608 */
609 spin_lock(&d->grant_table->lock);
610 for (i = 0; i < nr_grant_frames(d->grant_table); i++)
611 ia64_gnttab_create_shared_page(d, d->grant_table, i);
612 spin_unlock(&d->grant_table->lock);
614 d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
615 RANGESETF_prettyprint_hex);
617 dprintk(XENLOG_DEBUG, "arch_domain_create: domain=%p\n", d);
618 return 0;
620 fail_nomem:
621 tlb_track_destroy(d);
622 fail_nomem1:
623 if (d->arch.mm.pgd != NULL)
624 pgd_free(d->arch.mm.pgd);
625 if (d->shared_info != NULL)
626 free_xenheap_pages(d->shared_info,
627 get_order_from_shift(XSI_SHIFT));
628 return -ENOMEM;
629 }
631 void arch_domain_destroy(struct domain *d)
632 {
633 mm_final_teardown(d);
635 if (d->shared_info != NULL)
636 free_xenheap_pages(d->shared_info,
637 get_order_from_shift(XSI_SHIFT));
639 tlb_track_destroy(d);
641 /* Clear vTLB for the next domain. */
642 domain_flush_tlb_vhpt(d);
644 deallocate_rid_range(d);
645 }
647 void arch_vcpu_reset(struct vcpu *v)
648 {
649 /* FIXME: Stub for now */
650 }
652 /* Here it is assumed that all of the CPUs have the same RSE.N_STACKED_PHYS */
653 static unsigned long num_phys_stacked;
654 static int __init
655 init_num_phys_stacked(void)
656 {
657 switch (ia64_pal_rse_info(&num_phys_stacked, NULL)) {
658 case 0L:
659 printk("the number of physical stacked general registers"
660 "(RSE.N_STACKED_PHYS) = %ld\n", num_phys_stacked);
661 return 0;
662 case -2L:
663 case -3L:
664 default:
665 break;
666 }
667 printk("WARNING: PAL_RSE_INFO call failed. "
668 "domain save/restore may NOT work!\n");
669 return -EINVAL;
670 }
671 __initcall(init_num_phys_stacked);
673 #define COPY_FPREG(dst, src) memcpy(dst, src, sizeof(struct ia64_fpreg))
675 #define AR_PFS_PEC_SHIFT 51
676 #define AR_PFS_REC_SIZE 6
677 #define AR_PFS_PEC_MASK (((1UL << 6) - 1) << 51)
679 /*
680 * See init_switch_stack() and ptrace.h
681 */
682 static struct switch_stack*
683 vcpu_to_switch_stack(struct vcpu* v)
684 {
685 return (struct switch_stack *)(v->arch._thread.ksp + 16);
686 }
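/* A vcpu that has never run still has the switch stack laid out by
 * init_switch_stack(): placed just below its pt_regs, with b0 pointing
 * at ia64_ret_from_clone. */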
688 static int
689 vcpu_has_not_run(struct vcpu* v)
690 {
691 extern void ia64_ret_from_clone;
692 struct switch_stack *sw = vcpu_to_switch_stack(v);
694 return (sw == (struct switch_stack *)(vcpu_regs(v)) - 1) &&
695 (sw->b0 == (unsigned long)&ia64_ret_from_clone);
696 }
698 static void
699 nats_update(unsigned int* nats, unsigned int reg, char nat)
700 {
701 BUG_ON(reg > 31);
703 if (nat)
704 *nats |= (1UL << reg);
705 else
706 *nats &= ~(1UL << reg);
707 }
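/* Fill the vcpu_guest_context from the vcpu's saved state (pt_regs,
 * switch stack, unwind info and, for HVM, the VMX state); typically
 * reached via the getvcpucontext path, e.g. for domain save and
 * dump-core. */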
709 void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
710 {
711 int i;
712 struct vcpu_tr_regs *tr = &c.nat->regs.tr;
713 struct cpu_user_regs *uregs = vcpu_regs(v);
714 struct switch_stack *sw = vcpu_to_switch_stack(v);
715 struct unw_frame_info info;
716 int is_hvm = VMX_DOMAIN(v);
717 unsigned int rbs_size;
718 unsigned long *const rbs_bottom = vcpu_to_rbs_bottom(v);
719 unsigned long *rbs_top;
720 unsigned long *rbs_rnat_addr;
721 unsigned int top_slot;
722 unsigned int num_regs;
724 memset(c.nat, 0, sizeof(*c.nat));
725 c.nat->regs.b[6] = uregs->b6;
726 c.nat->regs.b[7] = uregs->b7;
728 memset(&info, 0, sizeof(info));
729 unw_init_from_blocked_task(&info, v);
730 if (vcpu_has_not_run(v)) {
731 c.nat->regs.ar.lc = sw->ar_lc;
732 c.nat->regs.ar.ec =
733 (sw->ar_pfs & AR_PFS_PEC_MASK) >> AR_PFS_PEC_SHIFT;
734 } else if (unw_unwind_to_user(&info) < 0) {
735 /* warn: should panic? */
736 gdprintk(XENLOG_ERR, "vcpu=%d unw_unwind_to_user() failed.\n",
737 v->vcpu_id);
738 show_stack(v, NULL);
740 /* can't return error */
741 c.nat->regs.ar.lc = 0;
742 c.nat->regs.ar.ec = 0;
743 } else {
744 unw_get_ar(&info, UNW_AR_LC, &c.nat->regs.ar.lc);
745 unw_get_ar(&info, UNW_AR_EC, &c.nat->regs.ar.ec);
746 }
747 c.nat->regs.ar.csd = uregs->ar_csd;
748 c.nat->regs.ar.ssd = uregs->ar_ssd;
750 c.nat->regs.r[8] = uregs->r8;
751 c.nat->regs.r[9] = uregs->r9;
752 c.nat->regs.r[10] = uregs->r10;
753 c.nat->regs.r[11] = uregs->r11;
755 if (is_hvm)
756 c.nat->regs.psr = vmx_vcpu_get_psr(v);
757 else
758 c.nat->regs.psr = vcpu_get_psr(v);
760 c.nat->regs.ip = uregs->cr_iip;
761 c.nat->regs.cfm = uregs->cr_ifs;
763 c.nat->regs.ar.unat = uregs->ar_unat;
764 c.nat->regs.ar.pfs = uregs->ar_pfs;
765 c.nat->regs.ar.rsc = uregs->ar_rsc;
766 c.nat->regs.ar.rnat = uregs->ar_rnat;
767 c.nat->regs.ar.bspstore = uregs->ar_bspstore;
769 c.nat->regs.pr = uregs->pr;
770 c.nat->regs.b[0] = uregs->b0;
771 rbs_size = uregs->loadrs >> 16;
772 num_regs = ia64_rse_num_regs(rbs_bottom,
773 (unsigned long*)((char*)rbs_bottom + rbs_size));
774 c.nat->regs.ar.bsp = (unsigned long)ia64_rse_skip_regs(
775 (unsigned long*)c.nat->regs.ar.bspstore, num_regs);
776 BUG_ON(num_regs > num_phys_stacked);
778 c.nat->regs.r[1] = uregs->r1;
779 c.nat->regs.r[12] = uregs->r12;
780 c.nat->regs.r[13] = uregs->r13;
781 c.nat->regs.ar.fpsr = uregs->ar_fpsr;
782 c.nat->regs.r[15] = uregs->r15;
784 c.nat->regs.r[14] = uregs->r14;
785 c.nat->regs.r[2] = uregs->r2;
786 c.nat->regs.r[3] = uregs->r3;
787 c.nat->regs.r[16] = uregs->r16;
788 c.nat->regs.r[17] = uregs->r17;
789 c.nat->regs.r[18] = uregs->r18;
790 c.nat->regs.r[19] = uregs->r19;
791 c.nat->regs.r[20] = uregs->r20;
792 c.nat->regs.r[21] = uregs->r21;
793 c.nat->regs.r[22] = uregs->r22;
794 c.nat->regs.r[23] = uregs->r23;
795 c.nat->regs.r[24] = uregs->r24;
796 c.nat->regs.r[25] = uregs->r25;
797 c.nat->regs.r[26] = uregs->r26;
798 c.nat->regs.r[27] = uregs->r27;
799 c.nat->regs.r[28] = uregs->r28;
800 c.nat->regs.r[29] = uregs->r29;
801 c.nat->regs.r[30] = uregs->r30;
802 c.nat->regs.r[31] = uregs->r31;
804 c.nat->regs.ar.ccv = uregs->ar_ccv;
806 COPY_FPREG(&c.nat->regs.f[2], &sw->f2);
807 COPY_FPREG(&c.nat->regs.f[3], &sw->f3);
808 COPY_FPREG(&c.nat->regs.f[4], &sw->f4);
809 COPY_FPREG(&c.nat->regs.f[5], &sw->f5);
811 COPY_FPREG(&c.nat->regs.f[6], &uregs->f6);
812 COPY_FPREG(&c.nat->regs.f[7], &uregs->f7);
813 COPY_FPREG(&c.nat->regs.f[8], &uregs->f8);
814 COPY_FPREG(&c.nat->regs.f[9], &uregs->f9);
815 COPY_FPREG(&c.nat->regs.f[10], &uregs->f10);
816 COPY_FPREG(&c.nat->regs.f[11], &uregs->f11);
818 COPY_FPREG(&c.nat->regs.f[12], &sw->f12);
819 COPY_FPREG(&c.nat->regs.f[13], &sw->f13);
820 COPY_FPREG(&c.nat->regs.f[14], &sw->f14);
821 COPY_FPREG(&c.nat->regs.f[15], &sw->f15);
822 COPY_FPREG(&c.nat->regs.f[16], &sw->f16);
823 COPY_FPREG(&c.nat->regs.f[17], &sw->f17);
824 COPY_FPREG(&c.nat->regs.f[18], &sw->f18);
825 COPY_FPREG(&c.nat->regs.f[19], &sw->f19);
826 COPY_FPREG(&c.nat->regs.f[20], &sw->f20);
827 COPY_FPREG(&c.nat->regs.f[21], &sw->f21);
828 COPY_FPREG(&c.nat->regs.f[22], &sw->f22);
829 COPY_FPREG(&c.nat->regs.f[23], &sw->f23);
830 COPY_FPREG(&c.nat->regs.f[24], &sw->f24);
831 COPY_FPREG(&c.nat->regs.f[25], &sw->f25);
832 COPY_FPREG(&c.nat->regs.f[26], &sw->f26);
833 COPY_FPREG(&c.nat->regs.f[27], &sw->f27);
834 COPY_FPREG(&c.nat->regs.f[28], &sw->f28);
835 COPY_FPREG(&c.nat->regs.f[29], &sw->f29);
836 COPY_FPREG(&c.nat->regs.f[30], &sw->f30);
837 COPY_FPREG(&c.nat->regs.f[31], &sw->f31);
839 // f32 - f127
840 memcpy(&c.nat->regs.f[32], &v->arch._thread.fph[0],
841 sizeof(v->arch._thread.fph));
843 #define NATS_UPDATE(reg) \
844 nats_update(&c.nat->regs.nats, (reg), \
845 !!(uregs->eml_unat & \
846 (1UL << ia64_unat_pos(&uregs->r ## reg))))
848 // corresponding bit in ar.unat is determined by
849 // (&uregs->rN){8:3}.
850 // r8: the lowest gr member of struct cpu_user_regs.
851 // r7: the highest gr member of struct cpu_user_regs.
852 BUILD_BUG_ON(offsetof(struct cpu_user_regs, r7) -
853 offsetof(struct cpu_user_regs, r8) >
854 64 * sizeof(unsigned long));
856 NATS_UPDATE(1);
857 NATS_UPDATE(2);
858 NATS_UPDATE(3);
860 NATS_UPDATE(8);
861 NATS_UPDATE(9);
862 NATS_UPDATE(10);
863 NATS_UPDATE(11);
864 NATS_UPDATE(12);
865 NATS_UPDATE(13);
866 NATS_UPDATE(14);
867 NATS_UPDATE(15);
868 NATS_UPDATE(16);
869 NATS_UPDATE(17);
870 NATS_UPDATE(18);
871 NATS_UPDATE(19);
872 NATS_UPDATE(20);
873 NATS_UPDATE(21);
874 NATS_UPDATE(22);
875 NATS_UPDATE(23);
876 NATS_UPDATE(24);
877 NATS_UPDATE(25);
878 NATS_UPDATE(26);
879 NATS_UPDATE(27);
880 NATS_UPDATE(28);
881 NATS_UPDATE(29);
882 NATS_UPDATE(30);
883 NATS_UPDATE(31);
885 if (!is_hvm) {
886 c.nat->regs.r[4] = uregs->r4;
887 c.nat->regs.r[5] = uregs->r5;
888 c.nat->regs.r[6] = uregs->r6;
889 c.nat->regs.r[7] = uregs->r7;
891 NATS_UPDATE(4);
892 NATS_UPDATE(5);
893 NATS_UPDATE(6);
894 NATS_UPDATE(7);
895 #undef NATS_UPDATE
896 } else {
897 /*
898 * For a VT-i domain, r4-r7 are sometimes saved both in
899 * uregs->r[4-7] and on the memory stack, or only on the memory stack.
900 * So it is OK to get them from the memory stack.
901 */
902 if (vcpu_has_not_run(v)) {
903 c.nat->regs.r[4] = sw->r4;
904 c.nat->regs.r[5] = sw->r5;
905 c.nat->regs.r[6] = sw->r6;
906 c.nat->regs.r[7] = sw->r7;
908 nats_update(&c.nat->regs.nats, 4,
909 !!(sw->ar_unat &
910 (1UL << ia64_unat_pos(&sw->r4))));
911 nats_update(&c.nat->regs.nats, 5,
912 !!(sw->ar_unat &
913 (1UL << ia64_unat_pos(&sw->r5))));
914 nats_update(&c.nat->regs.nats, 6,
915 !!(sw->ar_unat &
916 (1UL << ia64_unat_pos(&sw->r6))));
917 nats_update(&c.nat->regs.nats, 7,
918 !!(sw->ar_unat &
919 (1UL << ia64_unat_pos(&sw->r7))));
920 } else {
921 char nat;
923 unw_get_gr(&info, 4, &c.nat->regs.r[4], &nat);
924 nats_update(&c.nat->regs.nats, 4, nat);
925 unw_get_gr(&info, 5, &c.nat->regs.r[5], &nat);
926 nats_update(&c.nat->regs.nats, 5, nat);
927 unw_get_gr(&info, 6, &c.nat->regs.r[6], &nat);
928 nats_update(&c.nat->regs.nats, 6, nat);
929 unw_get_gr(&info, 7, &c.nat->regs.r[7], &nat);
930 nats_update(&c.nat->regs.nats, 7, nat);
931 }
932 }
934 c.nat->regs.rbs_voff = (IA64_RBS_OFFSET / 8) % 64;
935 if (unlikely(rbs_size > sizeof(c.nat->regs.rbs)))
936 gdprintk(XENLOG_INFO,
937 "rbs_size is too large 0x%x > 0x%lx\n",
938 rbs_size, sizeof(c.nat->regs.rbs));
939 else
940 memcpy(c.nat->regs.rbs, rbs_bottom, rbs_size);
942 rbs_top = (unsigned long*)((char *)rbs_bottom + rbs_size) - 1;
943 rbs_rnat_addr = ia64_rse_rnat_addr(rbs_top);
944 if ((unsigned long)rbs_rnat_addr >= sw->ar_bspstore)
945 rbs_rnat_addr = &sw->ar_rnat;
947 top_slot = ia64_rse_slot_num(rbs_top);
949 c.nat->regs.rbs_rnat = (*rbs_rnat_addr) & ((1UL << top_slot) - 1);
950 if (ia64_rse_rnat_addr(rbs_bottom) == ia64_rse_rnat_addr(rbs_top)) {
951 unsigned int bottom_slot = ia64_rse_slot_num(rbs_bottom);
952 c.nat->regs.rbs_rnat &= ~((1UL << bottom_slot) - 1);
953 }
955 c.nat->regs.num_phys_stacked = num_phys_stacked;
957 if (VMX_DOMAIN(v))
958 c.nat->privregs_pfn = VGC_PRIVREGS_HVM;
959 else
960 c.nat->privregs_pfn = get_gpfn_from_mfn(
961 virt_to_maddr(v->arch.privregs) >> PAGE_SHIFT);
963 for (i = 0; i < IA64_NUM_DBG_REGS; i++) {
964 if (VMX_DOMAIN(v)) {
965 vmx_vcpu_get_dbr(v, i, &c.nat->regs.dbr[i]);
966 vmx_vcpu_get_ibr(v, i, &c.nat->regs.ibr[i]);
967 } else {
968 vcpu_get_dbr(v, i, &c.nat->regs.dbr[i]);
969 vcpu_get_ibr(v, i, &c.nat->regs.ibr[i]);
970 }
971 }
973 for (i = 0; i < 8; i++)
974 vcpu_get_rr(v, (unsigned long)i << 61, &c.nat->regs.rr[i]);
976 /* Fill extra regs. */
977 for (i = 0;
978 (i < sizeof(tr->itrs) / sizeof(tr->itrs[0])) && i < NITRS;
979 i++) {
980 tr->itrs[i].pte = v->arch.itrs[i].pte.val;
981 tr->itrs[i].itir = v->arch.itrs[i].itir;
982 tr->itrs[i].vadr = v->arch.itrs[i].vadr;
983 tr->itrs[i].rid = v->arch.itrs[i].rid;
984 }
985 for (i = 0;
986 (i < sizeof(tr->dtrs) / sizeof(tr->dtrs[0])) && i < NDTRS;
987 i++) {
988 tr->dtrs[i].pte = v->arch.dtrs[i].pte.val;
989 tr->dtrs[i].itir = v->arch.dtrs[i].itir;
990 tr->dtrs[i].vadr = v->arch.dtrs[i].vadr;
991 tr->dtrs[i].rid = v->arch.dtrs[i].rid;
992 }
993 c.nat->event_callback_ip = v->arch.event_callback_ip;
995 /* If PV and privregs is not set, we can't read mapped registers. */
996 if (!is_hvm_vcpu(v) && v->arch.privregs == NULL)
997 return;
999 vcpu_get_dcr(v, &c.nat->regs.cr.dcr);
1001 c.nat->regs.cr.itm = is_hvm_vcpu(v) ?
1002 vmx_vcpu_get_itm(v) : PSCBX(v, domain_itm);
1003 vcpu_get_iva(v, &c.nat->regs.cr.iva);
1004 vcpu_get_pta(v, &c.nat->regs.cr.pta);
1006 vcpu_get_ipsr(v, &c.nat->regs.cr.ipsr);
1007 vcpu_get_isr(v, &c.nat->regs.cr.isr);
1008 vcpu_get_iip(v, &c.nat->regs.cr.iip);
1009 vcpu_get_ifa(v, &c.nat->regs.cr.ifa);
1010 vcpu_get_itir(v, &c.nat->regs.cr.itir);
1011 vcpu_get_iha(v, &c.nat->regs.cr.iha);
1013 //XXX change irr[] and arch.insvc[]
1014 if (is_hvm_vcpu(v))
1015 /* c.nat->regs.cr.ivr = vmx_vcpu_get_ivr(v)*/; //XXX not SMP-safe
1016 else
1017 vcpu_get_ivr (v, &c.nat->regs.cr.ivr);
1018 vcpu_get_iim(v, &c.nat->regs.cr.iim);
1020 vcpu_get_tpr(v, &c.nat->regs.cr.tpr);
1021 vcpu_get_irr0(v, &c.nat->regs.cr.irr[0]);
1022 vcpu_get_irr1(v, &c.nat->regs.cr.irr[1]);
1023 vcpu_get_irr2(v, &c.nat->regs.cr.irr[2]);
1024 vcpu_get_irr3(v, &c.nat->regs.cr.irr[3]);
1025 vcpu_get_itv(v, &c.nat->regs.cr.itv);//XXX vlsapic
1026 vcpu_get_pmv(v, &c.nat->regs.cr.pmv);
1027 vcpu_get_cmcv(v, &c.nat->regs.cr.cmcv);
1029 if (is_hvm)
1030 vmx_arch_get_info_guest(v, c);
1033 #if 0
1034 // for debug
1035 static void
1036 __rbs_print(const char* func, int line, const char* name,
1037 const unsigned long* rbs, unsigned int rbs_size)
1039 unsigned int i;
1040 printk("%s:%d %s rbs %p\n", func, line, name, rbs);
1041 printk(" rbs_size 0x%016x no 0x%lx\n",
1042 rbs_size, rbs_size / sizeof(unsigned long));
1044 for (i = 0; i < rbs_size / sizeof(unsigned long); i++) {
1045 const char* zero_or_n = "0x";
1046 if (ia64_rse_is_rnat_slot((unsigned long*)&rbs[i]))
1047 zero_or_n = "Nx";
1049 if ((i % 3) == 0)
1050 printk("0x%02x:", i);
1051 printk(" %s%016lx", zero_or_n, rbs[i]);
1052 if ((i % 3) == 2)
1053 printk("\n");
1055 printk("\n");
1058 #define rbs_print(rbs, rbs_size) \
1059 __rbs_print(__func__, __LINE__, (#rbs), (rbs), (rbs_size))
1060 #endif
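/* Copy a caller-supplied RBS image onto this vcpu's kernel register
 * backing store, compensating for the different RBS base offset
 * (rbs_voff) and rebuilding the RNAT collection bits for the copied
 * frames. */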
1062 static int
1063 copy_rbs(struct vcpu* v, unsigned long* dst_rbs_size,
1064 const unsigned long* rbs, unsigned long rbs_size,
1065 unsigned long src_rnat, unsigned long rbs_voff)
1067 int rc = -EINVAL;
1068 struct page_info* page;
1069 unsigned char* vaddr;
1070 unsigned long* src_bsp;
1071 unsigned long* src_bspstore;
1073 struct switch_stack* sw = vcpu_to_switch_stack(v);
1074 unsigned long num_regs;
1075 unsigned long* dst_bsp;
1076 unsigned long* dst_bspstore;
1077 unsigned long* dst_rnat;
1078 unsigned long dst_rnat_tmp;
1079 unsigned long dst_rnat_mask;
1080 unsigned long flags;
1081 extern void ia64_copy_rbs(unsigned long* dst_bspstore,
1082 unsigned long* dst_rbs_size,
1083 unsigned long* dst_rnat_p,
1084 unsigned long* src_bsp,
1085 unsigned long src_rbs_size,
1086 unsigned long src_rnat);
1088 dst_bspstore = vcpu_to_rbs_bottom(v);
1089 *dst_rbs_size = rbs_size;
1090 if (rbs_size == 0)
1091 return 0;
1093 // The rbs offset depends on sizeof(struct vcpu), so it is
1094 // too unstable for the hypercall ABI.
1095 // We need to take the rbs offset into account.
1096 //memcpy(dst_bspstore, c.nat->regs.rbs, rbs_size);
1098 // It is assumed that rbs_size is small enough compared
1099 // to KERNEL_STACK_SIZE.
1100 page = alloc_domheap_pages(NULL, KERNEL_STACK_SIZE_ORDER, 0);
1101 if (page == NULL)
1102 return -ENOMEM;
1103 vaddr = page_to_virt(page);
1105 src_bspstore = (unsigned long*)(vaddr + rbs_voff * 8);
1106 src_bsp = (unsigned long*)((unsigned char*)src_bspstore + rbs_size);
1107 if ((unsigned long)src_bsp >= (unsigned long)vaddr + PAGE_SIZE)
1108 goto out;
1109 memcpy(src_bspstore, rbs, rbs_size);
1111 num_regs = ia64_rse_num_regs(src_bspstore, src_bsp);
1112 dst_bsp = ia64_rse_skip_regs(dst_bspstore, num_regs);
1113 *dst_rbs_size = (unsigned long)dst_bsp - (unsigned long)dst_bspstore;
1115 // rough check.
1116 if (((unsigned long)dst_bsp & ~PAGE_MASK) > KERNEL_STACK_SIZE / 2)
1117 goto out;
1119 // ia64_copy_rbs() uses the real cpu's register stack.
1120 // So it may take an Illegal Operation fault, resulting
1121 // in a panic, if rbs_size is too large to load compared to
1122 // the number of physical stacked registers, RSE.N_STACKED_PHYS,
1123 // which is cpu implementation specific.
1124 // See SDM vol. 2, Register Stack Engine 6, especially 6.5.5.
1125 //
1126 // For safe operation and cpu model independence,
1127 // we would need to copy them by hand without loadrs and flushrs.
1128 // However, even if we implemented that, a similar issue still occurs
1129 // when running the guest: the CPU context restore routine issues loadrs,
1130 // resulting in an Illegal Operation fault. And what if the vRSE is in
1131 // enforced lazy mode? We can't store any dirty stacked registers
1132 // into the RBS without cover or br.call.
1133 if (num_regs > num_phys_stacked) {
1134 rc = -ENOSYS;
1135 gdprintk(XENLOG_WARNING,
1136 "%s:%d domain %d: can't load stacked registres\n"
1137 "requested size 0x%lx => 0x%lx, num regs %ld"
1138 "RSE.N_STACKED_PHYS %ld\n",
1139 __func__, __LINE__, v->domain->domain_id,
1140 rbs_size, *dst_rbs_size, num_regs,
1141 num_phys_stacked);
1142 goto out;
1145 // we mask interrupts to avoid using register backing store.
1146 local_irq_save(flags);
1147 ia64_copy_rbs(dst_bspstore, dst_rbs_size, &dst_rnat_tmp,
1148 src_bsp, rbs_size, src_rnat);
1149 local_irq_restore(flags);
1151 dst_rnat_mask = (1UL << ia64_rse_slot_num(dst_bsp)) - 1;
1152 dst_rnat = ia64_rse_rnat_addr(dst_bsp);
1153 if ((unsigned long)dst_rnat > sw->ar_bspstore)
1154 dst_rnat = &sw->ar_rnat;
1155 // if ia64_rse_rnat_addr(dst_bsp) ==
1156 // ia64_rse_rnat_addr(vcpu_to_rbs_bottom(v)), the lsb bit of rnat
1157 // is just ignored. so we don't have to mask it out.
1158 *dst_rnat =
1159 (*dst_rnat & ~dst_rnat_mask) | (dst_rnat_tmp & dst_rnat_mask);
1161 rc = 0;
1162 out:
1163 free_domheap_pages(page, KERNEL_STACK_SIZE_ORDER);
1164 return rc;
1167 static void
1168 unat_update(unsigned long *unat_eml, unsigned long *spill_addr, char nat)
1170 unsigned int pos = ia64_unat_pos(spill_addr);
1171 if (nat)
1172 *unat_eml |= (1UL << pos);
1173 else
1174 *unat_eml &= ~(1UL << pos);
1177 int arch_set_info_guest(struct vcpu *v, vcpu_guest_context_u c)
1179 struct cpu_user_regs *uregs = vcpu_regs(v);
1180 struct domain *d = v->domain;
1181 struct switch_stack *sw = vcpu_to_switch_stack(v);
1182 int was_initialised = v->is_initialised;
1183 struct unw_frame_info info;
1184 unsigned int rbs_size;
1185 unsigned int num_regs;
1186 unsigned long * const rbs_bottom = vcpu_to_rbs_bottom(v);
1187 int rc = 0;
1188 int i;
1190 /* Finish vcpu initialization. */
1191 if (!was_initialised) {
1192 if (is_hvm_domain(d))
1193 rc = vmx_final_setup_guest(v);
1194 else
1195 rc = vcpu_late_initialise(v);
1196 if (rc != 0)
1197 return rc;
1199 vcpu_init_regs(v);
1201 v->is_initialised = 1;
1202 /* Auto-online VCPU0 when it is initialised. */
1203 if (v->vcpu_id == 0 || (c.nat != NULL &&
1204 c.nat->flags & VGCF_online))
1205 clear_bit(_VPF_down, &v->pause_flags);
1208 if (c.nat == NULL)
1209 return 0;
1211 uregs->b6 = c.nat->regs.b[6];
1212 uregs->b7 = c.nat->regs.b[7];
1214 memset(&info, 0, sizeof(info));
1215 unw_init_from_blocked_task(&info, v);
1216 if (vcpu_has_not_run(v)) {
1217 sw->ar_lc = c.nat->regs.ar.lc;
1218 sw->ar_pfs =
1219 (sw->ar_pfs & ~AR_PFS_PEC_MASK) |
1220 ((c.nat->regs.ar.ec << AR_PFS_PEC_SHIFT) &
1221 AR_PFS_PEC_MASK);
1222 } else if (unw_unwind_to_user(&info) < 0) {
1223 /* warn: should panic? */
1224 gdprintk(XENLOG_ERR,
1225 "vcpu=%d unw_unwind_to_user() failed.\n",
1226 v->vcpu_id);
1227 show_stack(v, NULL);
1229 //return -ENOSYS;
1230 } else {
1231 unw_set_ar(&info, UNW_AR_LC, c.nat->regs.ar.lc);
1232 unw_set_ar(&info, UNW_AR_EC, c.nat->regs.ar.ec);
1234 uregs->ar_csd = c.nat->regs.ar.csd;
1235 uregs->ar_ssd = c.nat->regs.ar.ssd;
1237 uregs->r8 = c.nat->regs.r[8];
1238 uregs->r9 = c.nat->regs.r[9];
1239 uregs->r10 = c.nat->regs.r[10];
1240 uregs->r11 = c.nat->regs.r[11];
1242 if (!is_hvm_domain(d))
1243 vcpu_set_psr(v, c.nat->regs.psr);
1244 else
1245 vmx_vcpu_set_psr(v, c.nat->regs.psr);
1246 uregs->cr_iip = c.nat->regs.ip;
1247 uregs->cr_ifs = c.nat->regs.cfm;
1249 uregs->ar_unat = c.nat->regs.ar.unat;
1250 uregs->ar_pfs = c.nat->regs.ar.pfs;
1251 uregs->ar_rsc = c.nat->regs.ar.rsc;
1252 uregs->ar_rnat = c.nat->regs.ar.rnat;
1253 uregs->ar_bspstore = c.nat->regs.ar.bspstore;
1255 uregs->pr = c.nat->regs.pr;
1256 uregs->b0 = c.nat->regs.b[0];
1257 num_regs = ia64_rse_num_regs((unsigned long*)c.nat->regs.ar.bspstore,
1258 (unsigned long*)c.nat->regs.ar.bsp);
1259 rbs_size = (unsigned long)ia64_rse_skip_regs(rbs_bottom, num_regs) -
1260 (unsigned long)rbs_bottom;
1261 if (rbs_size > sizeof (c.nat->regs.rbs)) {
1262 gdprintk(XENLOG_INFO,
1263 "rbs size is too large %x > %lx\n",
1264 rbs_size, sizeof (c.nat->regs.rbs));
1265 return -EINVAL;
1267 if (rbs_size > 0 &&
1268 ((IA64_RBS_OFFSET / 8) % 64) != c.nat->regs.rbs_voff)
1269 gdprintk(XENLOG_INFO,
1270 "rbs stack offset is different! xen 0x%x given 0x%x",
1271 (IA64_RBS_OFFSET / 8) % 64, c.nat->regs.rbs_voff);
1273 /* Protection against crazy user code. */
1274 if (!was_initialised)
1275 uregs->loadrs = (rbs_size << 16);
1276 if (rbs_size == (uregs->loadrs >> 16)) {
1277 unsigned long dst_rbs_size = 0;
1278 if (vcpu_has_not_run(v))
1279 sw->ar_bspstore = (unsigned long)rbs_bottom;
1281 rc = copy_rbs(v, &dst_rbs_size,
1282 c.nat->regs.rbs, rbs_size,
1283 c.nat->regs.rbs_rnat,
1284 c.nat->regs.rbs_voff);
1285 if (rc < 0)
1286 return rc;
1288 /* In the case of a newly created vcpu, ar_bspstore points to
1289 * the bottom of the register stack. Move it up.
1290 * See also init_switch_stack().
1291 */
1292 if (vcpu_has_not_run(v)) {
1293 uregs->loadrs = (dst_rbs_size << 16);
1294 sw->ar_bspstore = (unsigned long)((char*)rbs_bottom +
1295 dst_rbs_size);
1299 // Inhibit save/restore between cpus with different RSE.N_STACKED_PHYS
1300 // to avoid nasty issues.
1301 //
1302 // The number of physical stacked general registers (RSE.N_STACKED_PHYS)
1303 // isn't virtualized. The guest OS obtains it via the PAL_RSE_INFO call
1304 // and the value might be exported to user processes
1305 // (Linux does so via /proc/cpuinfo).
1306 // The SDM says only that the number is cpu implementation specific.
1307 //
1308 // If the number on the restoring cpu differs from that of the saving cpu,
1309 // the following, or something worse, might happen.
1310 // - The Xen VMM itself may panic with an illegal operation fault when
1311 // issuing loadrs to run the guest,
1312 // when RSE.N_STACKED_PHYS of the saving CPU > RSE.N_STACKED_PHYS of the
1313 // restoring CPU.
1314 // This case is detected by copy_rbs(), which refuses the restore.
1315 // - The guest kernel may panic with an illegal operation fault,
1316 // when RSE.N_STACKED_PHYS of the saving CPU > RSE.N_STACKED_PHYS of the
1317 // restoring CPU.
1318 // - Information may leak from the guest kernel to user processes,
1319 // when RSE.N_STACKED_PHYS of the saving CPU < RSE.N_STACKED_PHYS of the
1320 // restoring CPU.
1321 // Before returning to a user process, the kernel should zero-clear all
1322 // physical stacked registers to prevent kernel bits from leaking.
1323 // It would do so based on RSE.N_STACKED_PHYS (Linux does).
1324 // In the restored environment the kernel clears only a part
1325 // of the physical stacked registers.
1326 // - User processes or human operators may be confused.
1327 // RSE.N_STACKED_PHYS might be exported to user processes or human
1328 // operators; on Linux it is exported via /proc/cpuinfo, and
1329 // user processes might use it.
1330 // There is no known concrete example, but it's possible in theory;
1331 // e.g. a thread library may allocate the RBS area based on the value
1332 // (fortunately glibc NPTL doesn't).
1333 if (c.nat->regs.num_phys_stacked != 0 && /* COMPAT */
1334 c.nat->regs.num_phys_stacked != num_phys_stacked) {
1335 gdprintk(XENLOG_WARNING,
1336 "num phys stacked is different! "
1337 "xen 0x%lx given 0x%lx",
1338 num_phys_stacked, c.nat->regs.num_phys_stacked);
1339 return -EINVAL;
1342 uregs->r1 = c.nat->regs.r[1];
1343 uregs->r12 = c.nat->regs.r[12];
1344 uregs->r13 = c.nat->regs.r[13];
1345 uregs->ar_fpsr = c.nat->regs.ar.fpsr;
1346 uregs->r15 = c.nat->regs.r[15];
1348 uregs->r14 = c.nat->regs.r[14];
1349 uregs->r2 = c.nat->regs.r[2];
1350 uregs->r3 = c.nat->regs.r[3];
1351 uregs->r16 = c.nat->regs.r[16];
1352 uregs->r17 = c.nat->regs.r[17];
1353 uregs->r18 = c.nat->regs.r[18];
1354 uregs->r19 = c.nat->regs.r[19];
1355 uregs->r20 = c.nat->regs.r[20];
1356 uregs->r21 = c.nat->regs.r[21];
1357 uregs->r22 = c.nat->regs.r[22];
1358 uregs->r23 = c.nat->regs.r[23];
1359 uregs->r24 = c.nat->regs.r[24];
1360 uregs->r25 = c.nat->regs.r[25];
1361 uregs->r26 = c.nat->regs.r[26];
1362 uregs->r27 = c.nat->regs.r[27];
1363 uregs->r28 = c.nat->regs.r[28];
1364 uregs->r29 = c.nat->regs.r[29];
1365 uregs->r30 = c.nat->regs.r[30];
1366 uregs->r31 = c.nat->regs.r[31];
1368 uregs->ar_ccv = c.nat->regs.ar.ccv;
1370 COPY_FPREG(&sw->f2, &c.nat->regs.f[2]);
1371 COPY_FPREG(&sw->f3, &c.nat->regs.f[3]);
1372 COPY_FPREG(&sw->f4, &c.nat->regs.f[4]);
1373 COPY_FPREG(&sw->f5, &c.nat->regs.f[5]);
1375 COPY_FPREG(&uregs->f6, &c.nat->regs.f[6]);
1376 COPY_FPREG(&uregs->f7, &c.nat->regs.f[7]);
1377 COPY_FPREG(&uregs->f8, &c.nat->regs.f[8]);
1378 COPY_FPREG(&uregs->f9, &c.nat->regs.f[9]);
1379 COPY_FPREG(&uregs->f10, &c.nat->regs.f[10]);
1380 COPY_FPREG(&uregs->f11, &c.nat->regs.f[11]);
1382 COPY_FPREG(&sw->f12, &c.nat->regs.f[12]);
1383 COPY_FPREG(&sw->f13, &c.nat->regs.f[13]);
1384 COPY_FPREG(&sw->f14, &c.nat->regs.f[14]);
1385 COPY_FPREG(&sw->f15, &c.nat->regs.f[15]);
1386 COPY_FPREG(&sw->f16, &c.nat->regs.f[16]);
1387 COPY_FPREG(&sw->f17, &c.nat->regs.f[17]);
1388 COPY_FPREG(&sw->f18, &c.nat->regs.f[18]);
1389 COPY_FPREG(&sw->f19, &c.nat->regs.f[19]);
1390 COPY_FPREG(&sw->f20, &c.nat->regs.f[20]);
1391 COPY_FPREG(&sw->f21, &c.nat->regs.f[21]);
1392 COPY_FPREG(&sw->f22, &c.nat->regs.f[22]);
1393 COPY_FPREG(&sw->f23, &c.nat->regs.f[23]);
1394 COPY_FPREG(&sw->f24, &c.nat->regs.f[24]);
1395 COPY_FPREG(&sw->f25, &c.nat->regs.f[25]);
1396 COPY_FPREG(&sw->f26, &c.nat->regs.f[26]);
1397 COPY_FPREG(&sw->f27, &c.nat->regs.f[27]);
1398 COPY_FPREG(&sw->f28, &c.nat->regs.f[28]);
1399 COPY_FPREG(&sw->f29, &c.nat->regs.f[29]);
1400 COPY_FPREG(&sw->f30, &c.nat->regs.f[30]);
1401 COPY_FPREG(&sw->f31, &c.nat->regs.f[31]);
1403 // f32 - f127
1404 memcpy(&v->arch._thread.fph[0], &c.nat->regs.f[32],
1405 sizeof(v->arch._thread.fph));
1407 #define UNAT_UPDATE(reg) \
1408 unat_update(&uregs->eml_unat, &uregs->r ## reg, \
1409 !!(c.nat->regs.nats & (1UL << (reg))));
1411 uregs->eml_unat = 0;
1412 UNAT_UPDATE(1);
1413 UNAT_UPDATE(2);
1414 UNAT_UPDATE(3);
1416 UNAT_UPDATE(8);
1417 UNAT_UPDATE(9);
1418 UNAT_UPDATE(10);
1419 UNAT_UPDATE(11);
1420 UNAT_UPDATE(12);
1421 UNAT_UPDATE(13);
1422 UNAT_UPDATE(14);
1423 UNAT_UPDATE(15);
1424 UNAT_UPDATE(16);
1425 UNAT_UPDATE(17);
1426 UNAT_UPDATE(18);
1427 UNAT_UPDATE(19);
1428 UNAT_UPDATE(20);
1429 UNAT_UPDATE(21);
1430 UNAT_UPDATE(22);
1431 UNAT_UPDATE(23);
1432 UNAT_UPDATE(24);
1433 UNAT_UPDATE(25);
1434 UNAT_UPDATE(26);
1435 UNAT_UPDATE(27);
1436 UNAT_UPDATE(28);
1437 UNAT_UPDATE(29);
1438 UNAT_UPDATE(30);
1439 UNAT_UPDATE(31);
1441 /*
1442 * r4-r7 are sometimes saved both in pt_regs->r[4-7] and on the memory
1443 * stack, or only on the memory stack.
1444 * In both cases, both the memory stack and pt_regs->r[4-7] are updated.
1445 */
1446 uregs->r4 = c.nat->regs.r[4];
1447 uregs->r5 = c.nat->regs.r[5];
1448 uregs->r6 = c.nat->regs.r[6];
1449 uregs->r7 = c.nat->regs.r[7];
1451 UNAT_UPDATE(4);
1452 UNAT_UPDATE(5);
1453 UNAT_UPDATE(6);
1454 UNAT_UPDATE(7);
1455 #undef UNAT_UPDATE
1456 if (vcpu_has_not_run(v)) {
1457 sw->r4 = c.nat->regs.r[4];
1458 sw->r5 = c.nat->regs.r[5];
1459 sw->r6 = c.nat->regs.r[6];
1460 sw->r7 = c.nat->regs.r[7];
1462 unat_update(&sw->ar_unat, &sw->r4,
1463 !!(c.nat->regs.nats & (1UL << 4)));
1464 unat_update(&sw->ar_unat, &sw->r5,
1465 !!(c.nat->regs.nats & (1UL << 5)));
1466 unat_update(&sw->ar_unat, &sw->r6,
1467 !!(c.nat->regs.nats & (1UL << 6)));
1468 unat_update(&sw->ar_unat, &sw->r7,
1469 !!(c.nat->regs.nats & (1UL << 7)));
1470 } else {
1471 unw_set_gr(&info, 4, c.nat->regs.r[4],
1472 !!(c.nat->regs.nats & (1UL << 4)));
1473 unw_set_gr(&info, 5, c.nat->regs.r[5],
1474 !!(c.nat->regs.nats & (1UL << 5)));
1475 unw_set_gr(&info, 6, c.nat->regs.r[6],
1476 !!(c.nat->regs.nats & (1UL << 6)));
1477 unw_set_gr(&info, 7, c.nat->regs.r[7],
1478 !!(c.nat->regs.nats & (1UL << 7)));
1481 if (!is_hvm_domain(d)) {
1482 /* domain runs at PL2/3 */
1483 uregs->cr_ipsr = vcpu_pl_adjust(uregs->cr_ipsr,
1484 IA64_PSR_CPL0_BIT);
1485 uregs->ar_rsc = vcpu_pl_adjust(uregs->ar_rsc, 2);
1488 for (i = 0; i < IA64_NUM_DBG_REGS; i++) {
1489 if (is_hvm_domain(d)) {
1490 vmx_vcpu_set_dbr(v, i, c.nat->regs.dbr[i]);
1491 vmx_vcpu_set_ibr(v, i, c.nat->regs.ibr[i]);
1492 } else {
1493 vcpu_set_dbr(v, i, c.nat->regs.dbr[i]);
1494 vcpu_set_ibr(v, i, c.nat->regs.ibr[i]);
1498 /* rr[] must be set before setting itrs[] and dtrs[] */
1499 for (i = 0; i < 8; i++) {
1500 unsigned long rrval = c.nat->regs.rr[i];
1501 unsigned long reg = (unsigned long)i << 61;
1502 IA64FAULT fault = IA64_NO_FAULT;
1504 if (rrval == 0)
1505 continue;
1506 if (is_hvm_domain(d)) {
1507 // Without the VGCF_EXTRA_REGS check,
1508 // a VT-i domain doesn't boot.
1509 if (c.nat->flags & VGCF_EXTRA_REGS)
1510 fault = vmx_vcpu_set_rr(v, reg, rrval);
1511 } else
1512 fault = vcpu_set_rr(v, reg, rrval);
1513 if (fault != IA64_NO_FAULT)
1514 return -EINVAL;
1517 if (c.nat->flags & VGCF_EXTRA_REGS) {
1518 struct vcpu_tr_regs *tr = &c.nat->regs.tr;
1520 for (i = 0;
1521 (i < sizeof(tr->itrs) / sizeof(tr->itrs[0])) && i < NITRS;
1522 i++) {
1523 if (is_hvm_domain(d))
1524 vmx_vcpu_itr_i(v, i, tr->itrs[i].pte,
1525 tr->itrs[i].itir,
1526 tr->itrs[i].vadr);
1527 else
1528 vcpu_set_itr(v, i, tr->itrs[i].pte,
1529 tr->itrs[i].itir,
1530 tr->itrs[i].vadr,
1531 tr->itrs[i].rid);
1533 for (i = 0;
1534 (i < sizeof(tr->dtrs) / sizeof(tr->dtrs[0])) && i < NDTRS;
1535 i++) {
1536 if (is_hvm_domain(d))
1537 vmx_vcpu_itr_d(v, i, tr->dtrs[i].pte,
1538 tr->dtrs[i].itir,
1539 tr->dtrs[i].vadr);
1540 else
1541 vcpu_set_dtr(v, i,
1542 tr->dtrs[i].pte,
1543 tr->dtrs[i].itir,
1544 tr->dtrs[i].vadr,
1545 tr->dtrs[i].rid);
1547 v->arch.event_callback_ip = c.nat->event_callback_ip;
1548 vcpu_set_iva(v, c.nat->regs.cr.iva);
1551 if (is_hvm_domain(d))
1552 rc = vmx_arch_set_info_guest(v, c);
1554 return rc;
1557 static int relinquish_memory(struct domain *d, struct list_head *list)
1559 struct list_head *ent;
1560 struct page_info *page;
1561 #ifndef __ia64__
1562 unsigned long x, y;
1563 #endif
1564 int ret = 0;
1566 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1567 spin_lock_recursive(&d->page_alloc_lock);
1568 ent = list->next;
1569 while ( ent != list )
1571 page = list_entry(ent, struct page_info, list);
1572 /* Grab a reference to the page so it won't disappear from under us. */
1573 if ( unlikely(!get_page(page, d)) )
1575 /* Couldn't get a reference -- someone is freeing this page. */
1576 ent = ent->next;
1577 list_move_tail(&page->list, &d->arch.relmem_list);
1578 continue;
1581 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1582 put_page_and_type(page);
1584 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1585 put_page(page);
1587 #ifndef __ia64__
1588 /*
1589 * Forcibly invalidate base page tables at this point to break circular
1590 * 'linear page table' references. This is okay because MMU structures
1591 * are not shared across domains and this domain is now dead. Thus base
1592 * tables are not in use so a non-zero count means circular reference.
1593 */
1594 y = page->u.inuse.type_info;
1595 for ( ; ; )
1597 x = y;
1598 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
1599 (PGT_base_page_table|PGT_validated)) )
1600 break;
1602 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
1603 if ( likely(y == x) )
1605 free_page_type(page, PGT_base_page_table);
1606 break;
1609 #endif
1611 /* Follow the list chain and /then/ potentially free the page. */
1612 ent = ent->next;
1613 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
1614 list_move_tail(&page->list, &d->arch.relmem_list);
1615 put_page(page);
1617 if (hypercall_preempt_check()) {
1618 ret = -EAGAIN;
1619 goto out;
1623 list_splice_init(&d->arch.relmem_list, list);
1625 out:
1626 spin_unlock_recursive(&d->page_alloc_lock);
1627 return ret;
1630 int domain_relinquish_resources(struct domain *d)
1632 int ret = 0;
1634 switch (d->arch.relres) {
1635 case RELRES_not_started:
1636 /* Relinquish guest resources for VT-i domain. */
1637 if (is_hvm_domain(d))
1638 vmx_relinquish_guest_resources(d);
1639 d->arch.relres = RELRES_mm_teardown;
1640 /*fallthrough*/
1642 case RELRES_mm_teardown:
1643 /* Tear down shadow mode stuff. */
1644 ret = mm_teardown(d);
1645 if (ret != 0)
1646 return ret;
1647 d->arch.relres = RELRES_xen;
1648 /* fallthrough */
1650 case RELRES_xen:
1651 /* Relinquish every xen page of memory. */
1652 ret = relinquish_memory(d, &d->xenpage_list);
1653 if (ret != 0)
1654 return ret;
1655 d->arch.relres = RELRES_dom;
1656 /* fallthrough */
1658 case RELRES_dom:
1659 /* Relinquish every domain page of memory. */
1660 ret = relinquish_memory(d, &d->page_list);
1661 if (ret != 0)
1662 return ret;
1663 d->arch.relres = RELRES_done;
1664 /* fallthrough */
1666 case RELRES_done:
1667 break;
1669 default:
1670 BUG();
1673 if (is_hvm_domain(d) && d->arch.sal_data)
1674 xfree(d->arch.sal_data);
1676 /* Free page used by xen oprofile buffer */
1677 free_xenoprof_pages(d);
1679 return 0;
1682 unsigned long
1683 domain_set_shared_info_va (unsigned long va)
1685 struct vcpu *v = current;
1686 struct domain *d = v->domain;
1687 int rc;
1689 /* Check virtual address:
1690 must belong to region 7,
1691 must be 64Kb aligned,
1692 must not be within Xen virtual space. */
1693 if ((va >> 61) != 7
1694 || (va & 0xffffUL) != 0
1695 || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
1696 panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);
1698 /* Note: this doesn't work well if other cpus are already running.
1699 However this is part of the spec :-) */
1700 gdprintk(XENLOG_DEBUG, "Domain set shared_info_va to 0x%016lx\n", va);
1701 d->arch.shared_info_va = va;
1703 VCPU(v, interrupt_mask_addr) = (unsigned char *)va +
1704 INT_ENABLE_OFFSET(v);
1705 set_current_psr_i_addr(v);
1707 /* Remap the shared pages. */
1708 BUG_ON(VMX_DOMAIN(v));
1709 rc = !set_one_rr(7UL << 61, PSCB(v,rrs[7]));
1710 BUG_ON(rc);
1712 return rc;
1715 /* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
1716 #define SHADOW_COPY_CHUNK 1024
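/* Handler for the XEN_DOMCTL_SHADOW_OP_* operations: turns log-dirty
 * mode on or off and transfers/clears the dirty bitmap for the caller
 * (used, for example, by the save/migration tools). */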
1718 int shadow_mode_control(struct domain *d, xen_domctl_shadow_op_t *sc)
1720 unsigned int op = sc->op;
1721 int rc = 0;
1722 int i;
1723 //struct vcpu *v;
1725 if (unlikely(d == current->domain)) {
1726 gdprintk(XENLOG_INFO,
1727 "Don't try to do a shadow op on yourself!\n");
1728 return -EINVAL;
1731 domain_pause(d);
1733 switch (op)
1735 case XEN_DOMCTL_SHADOW_OP_OFF:
1736 if (shadow_mode_enabled (d)) {
1737 u64 *bm = d->arch.shadow_bitmap;
1738 struct vcpu *v;
1740 for_each_vcpu(d, v)
1741 v->arch.shadow_bitmap = NULL;
1743 /* Flush vhpt and tlb to restore dirty bit usage. */
1744 flush_tlb_for_log_dirty(d);
1746 /* Free bitmap. */
1747 d->arch.shadow_bitmap_size = 0;
1748 d->arch.shadow_bitmap = NULL;
1749 xfree(bm);
1751 break;
1753 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
1754 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
1755 rc = -EINVAL;
1756 break;
1758 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
1759 if (shadow_mode_enabled(d)) {
1760 rc = -EINVAL;
1761 break;
1764 atomic64_set(&d->arch.shadow_fault_count, 0);
1765 atomic64_set(&d->arch.shadow_dirty_count, 0);
1767 d->arch.shadow_bitmap_size =
1768 (domain_get_maximum_gpfn(d) + BITS_PER_LONG) &
1769 ~(BITS_PER_LONG - 1);
1770 d->arch.shadow_bitmap = xmalloc_array(unsigned long,
1771 d->arch.shadow_bitmap_size / BITS_PER_LONG);
1772 if (d->arch.shadow_bitmap == NULL) {
1773 d->arch.shadow_bitmap_size = 0;
1774 rc = -ENOMEM;
1776 else {
1777 struct vcpu *v;
1778 memset(d->arch.shadow_bitmap, 0,
1779 d->arch.shadow_bitmap_size / 8);
1781 for_each_vcpu(d, v)
1782 v->arch.shadow_bitmap = d->arch.shadow_bitmap;
1783 /* Flush vhpt and tlb to enable dirty bit
1784 virtualization. */
1785 flush_tlb_for_log_dirty(d);
1787 break;
1789 case XEN_DOMCTL_SHADOW_OP_CLEAN:
1791 int nbr_bytes;
1793 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
1794 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
1796 atomic64_set(&d->arch.shadow_fault_count, 0);
1797 atomic64_set(&d->arch.shadow_dirty_count, 0);
1799 if (guest_handle_is_null(sc->dirty_bitmap) ||
1800 (d->arch.shadow_bitmap == NULL)) {
1801 rc = -EINVAL;
1802 break;
1805 if (sc->pages > d->arch.shadow_bitmap_size)
1806 sc->pages = d->arch.shadow_bitmap_size;
1808 nbr_bytes = (sc->pages + 7) / 8;
1810 for (i = 0; i < nbr_bytes; i += SHADOW_COPY_CHUNK) {
1811 int size = (nbr_bytes - i) > SHADOW_COPY_CHUNK ?
1812 SHADOW_COPY_CHUNK : nbr_bytes - i;
1814 if (copy_to_guest_offset(
1815 sc->dirty_bitmap, i,
1816 (uint8_t *)d->arch.shadow_bitmap + i,
1817 size)) {
1818 rc = -EFAULT;
1819 break;
1822 memset((uint8_t *)d->arch.shadow_bitmap + i, 0, size);
1824 flush_tlb_for_log_dirty(d);
1826 break;
1829 case XEN_DOMCTL_SHADOW_OP_PEEK:
1831 unsigned long size;
1833 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
1834 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
1836 if (guest_handle_is_null(sc->dirty_bitmap) ||
1837 (d->arch.shadow_bitmap == NULL)) {
1838 rc = -EINVAL;
1839 break;
1842 if (sc->pages > d->arch.shadow_bitmap_size)
1843 sc->pages = d->arch.shadow_bitmap_size;
1845 size = (sc->pages + 7) / 8;
1846 if (copy_to_guest(sc->dirty_bitmap,
1847 (uint8_t *)d->arch.shadow_bitmap, size)) {
1848 rc = -EFAULT;
1849 break;
1851 break;
1853 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
1854 sc->mb = 0;
1855 break;
1856 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
1857 if (sc->mb > 0) {
1858 BUG();
1859 rc = -ENOMEM;
1861 break;
1862 default:
1863 rc = -EINVAL;
1864 break;
1867 domain_unpause(d);
1869 return rc;
1872 // remove following line if not privifying in memory
1873 //#define HAVE_PRIVIFY_MEMORY
1874 #ifndef HAVE_PRIVIFY_MEMORY
1875 #define privify_memory(x,y) do {} while(0)
1876 #endif
1878 static void __init loaddomainelfimage(struct domain *d, struct elf_binary *elf,
1879 unsigned long phys_load_offset)
1881 const elf_phdr *phdr;
1882 int phnum, h, filesz, memsz;
1883 unsigned long elfaddr, dom_mpaddr, dom_imva;
1884 struct page_info *p;
1886 phnum = elf_uval(elf, elf->ehdr, e_phnum);
1887 for (h = 0; h < phnum; h++) {
1888 phdr = elf_phdr_by_index(elf, h);
1889 if (!elf_phdr_is_loadable(elf, phdr))
1890 continue;
1892 filesz = elf_uval(elf, phdr, p_filesz);
1893 memsz = elf_uval(elf, phdr, p_memsz);
1894 elfaddr = (unsigned long) elf->image + elf_uval(elf, phdr, p_offset);
1895 dom_mpaddr = elf_uval(elf, phdr, p_paddr);
1896 dom_mpaddr += phys_load_offset;
1898 while (memsz > 0) {
1899 p = assign_new_domain_page(d,dom_mpaddr);
1900 BUG_ON (unlikely(p == NULL));
1901 dom_imva = __va_ul(page_to_maddr(p));
1902 if (filesz > 0) {
1903 if (filesz >= PAGE_SIZE)
1904 copy_page((void *) dom_imva,
1905 (void *) elfaddr);
1906 else {
1907 // copy partial page
1908 memcpy((void *) dom_imva,
1909 (void *) elfaddr, filesz);
1910 // zero the rest of page
1911 memset((void *) dom_imva+filesz, 0,
1912 PAGE_SIZE-filesz);
1914 //FIXME: This test for code seems to find a lot more than objdump -x does
1915 if (elf_uval(elf, phdr, p_flags) & PF_X) {
1916 privify_memory(dom_imva,PAGE_SIZE);
1917 flush_icache_range(dom_imva,
1918 dom_imva+PAGE_SIZE);
1921 else if (memsz > 0) {
1922 /* always zero out entire page */
1923 clear_page((void *) dom_imva);
1925 memsz -= PAGE_SIZE;
1926 filesz -= PAGE_SIZE;
1927 elfaddr += PAGE_SIZE;
1928 dom_mpaddr += PAGE_SIZE;
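/*
 * loaddomainelfimage() above fills each destination page of a loadable
 * segment in one of three ways: a full page of file data is copied with
 * copy_page(), a trailing partial page is copied and then zero-padded,
 * and pages past p_filesz (BSS) are simply cleared.  The helper below is
 * a hedged, standalone restatement of that per-page decision; the name
 * fill_one_page() and the fixed page size are illustrative assumptions.
 */
#include <string.h>

#define EXAMPLE_PAGE_SIZE 16384L   /* assumption; stands in for PAGE_SIZE */

static void fill_one_page(void *dst, const void *src, long filesz, long memsz)
{
    if (filesz >= EXAMPLE_PAGE_SIZE) {
        memcpy(dst, src, EXAMPLE_PAGE_SIZE);           /* full page of file data */
    } else if (filesz > 0) {
        memcpy(dst, src, filesz);                      /* partial page...        */
        memset((char *)dst + filesz, 0,
               EXAMPLE_PAGE_SIZE - filesz);            /* ...zero the remainder  */
    } else if (memsz > 0) {
        memset(dst, 0, EXAMPLE_PAGE_SIZE);             /* BSS-only page          */
    }
}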
1933 static void __init calc_dom0_size(void)
1935 unsigned long domheap_pages;
1936 unsigned long p2m_pages;
1937 unsigned long spare_hv_pages;
1938 unsigned long max_dom0_size;
1940 /* Estimate maximum memory we can safely allocate for dom0
1941 * by subtracting the p2m table allocation and a chunk of memory
1942 * for DMA and PCI mapping from the available domheap pages. The
1943 * chunk for DMA, PCI, etc., is a guesstimate, as Xen doesn't seem
1944 * to have a good idea of what those requirements might be ahead
1945 * of time, calculated at 128MB + 1MB per 4GB of system memory */
1946 domheap_pages = avail_domheap_pages();
1947 p2m_pages = domheap_pages / PTRS_PER_PTE;
1948 spare_hv_pages = 8192 + (domheap_pages / 4096);
1949 max_dom0_size = (domheap_pages - (p2m_pages + spare_hv_pages))
1950 * PAGE_SIZE;
1951 printk("Maximum permitted dom0 size: %luMB\n",
1952 max_dom0_size / (1024*1024));
1954 /* validate proposed dom0_size, fix up as needed */
1955 if (dom0_size > max_dom0_size) {
1956 printk("Reducing dom0 memory allocation from %luK to %luK "
1957 "to fit available memory\n",
1958 dom0_size / 1024, max_dom0_size / 1024);
1959 dom0_size = max_dom0_size;
1962 /* dom0_mem=0 can be passed in to give all available mem to dom0 */
1963 if (dom0_size == 0) {
1964 printk("Allocating all available memory to dom0\n");
1965 dom0_size = max_dom0_size;
1968 /* Check dom0 size. */
1969 if (dom0_size < 4 * 1024 * 1024) {
1970 panic("dom0_mem is too small, boot aborted"
1971 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
1974 if (running_on_sim) {
1975 dom0_size = 128*1024*1024; //FIXME: Should be configurable
1978 /* no need to allocate pages for now
1979 * pages are allocated by assign_new_domain_page() via loaddomainelfimage()
1980 */
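/*
 * A rough worked example of the estimate made in calc_dom0_size(), for a
 * hypothetical machine; the 8GB domheap, 16KB page size and 8-byte PTE
 * (PTRS_PER_PTE = 2048) are illustrative assumptions.  The spare_hv_pages
 * term matches the "128MB + 1MB per 4GB" note above: 8192 pages of 16KB
 * is 128MB, and one page per 4096 pages is 1MB per 4GB.
 */
#include <stdio.h>

int main(void)
{
    unsigned long page_size      = 16384;                        /* assumed 16KB pages     */
    unsigned long ptrs_per_pte   = page_size / 8;                /* 2048 8-byte PTEs/page  */
    unsigned long domheap_pages  = 524288;                       /* assumed 8GB of domheap */
    unsigned long p2m_pages      = domheap_pages / ptrs_per_pte; /* 256                    */
    unsigned long spare_hv_pages = 8192 + domheap_pages / 4096;  /* 8320                   */
    unsigned long long max_dom0  =
        (unsigned long long)(domheap_pages - (p2m_pages + spare_hv_pages)) * page_size;

    printf("max dom0: %llu MB\n", max_dom0 / (1024 * 1024));     /* ~8058 MB */
    return 0;
}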
1984 /*
1985 * Domain 0 has direct access to all devices. The major point of
1986 * this stub, however, is to allow alloc_dom_mem to handle
1987 * requests with order > 0. Dom0 needs that capability to
1988 * allocate memory for other domains.
1989 */
1990 static void __init physdev_init_dom0(struct domain *d)
1992 if (iomem_permit_access(d, 0UL, ~0UL))
1993 BUG();
1994 if (irqs_permit_access(d, 0, NR_IRQS-1))
1995 BUG();
1996 if (ioports_permit_access(d, 0, 0, 0xffff))
1997 BUG();
2000 int __init construct_dom0(struct domain *d,
2001 unsigned long image_start, unsigned long image_len,
2002 unsigned long initrd_start, unsigned long initrd_len,
2003 char *cmdline)
2005 int i, rc;
2006 start_info_t *si;
2007 dom0_vga_console_info_t *ci;
2008 struct vcpu *v = d->vcpu[0];
2009 unsigned long max_pages;
2011 struct elf_binary elf;
2012 struct elf_dom_parms parms;
2013 unsigned long p_start;
2014 unsigned long pkern_start;
2015 unsigned long pkern_entry;
2016 unsigned long pkern_end;
2017 unsigned long pinitrd_start = 0;
2018 unsigned long pstart_info;
2019 unsigned long phys_load_offset;
2020 struct page_info *start_info_page;
2021 unsigned long bp_mpa;
2022 struct ia64_boot_param *bp;
2024 //printk("construct_dom0: starting\n");
2026 /* Sanity! */
2027 BUG_ON(d != dom0);
2028 BUG_ON(d->vcpu[0] == NULL);
2029 BUG_ON(v->is_initialised);
2031 printk("*** LOADING DOMAIN 0 ***\n");
2033 calc_dom0_size();
2035 max_pages = dom0_size / PAGE_SIZE;
2036 d->max_pages = max_pages;
2037 d->tot_pages = 0;
2039 rc = elf_init(&elf, (void*)image_start, image_len);
2040 if ( rc != 0 )
2041 return rc;
2042 #ifdef VERBOSE
2043 elf_set_verbose(&elf);
2044 #endif
2045 elf_parse_binary(&elf);
2046 if (0 != (elf_xen_parse(&elf, &parms)))
2047 return rc;
2049 /*
2050 * We cannot rely on the load address in the ELF headers to
2051 * determine the metaphysical address at which the image
2052 * is loaded. Patch the address to match the real one, based
2053 * on xen_pstart
2054 */
2055 phys_load_offset = xen_pstart - elf.pstart;
2056 elf.pstart += phys_load_offset;
2057 elf.pend += phys_load_offset;
2058 parms.virt_kstart += phys_load_offset;
2059 parms.virt_kend += phys_load_offset;
2060 parms.virt_entry += phys_load_offset;
2062 printk(" Dom0 kernel: %s, %s, paddr 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
2063 elf_64bit(&elf) ? "64-bit" : "32-bit",
2064 elf_msb(&elf) ? "msb" : "lsb",
2065 elf.pstart, elf.pend);
2066 if (!elf_64bit(&elf) ||
2067 elf_uval(&elf, elf.ehdr, e_machine) != EM_IA_64) {
2068 printk("Incompatible kernel binary\n");
2069 return -1;
2072 p_start = parms.virt_base;
2073 pkern_start = parms.virt_kstart;
2074 pkern_end = parms.virt_kend;
2075 pkern_entry = parms.virt_entry;
2077 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
2079 if ( (p_start & (PAGE_SIZE-1)) != 0 )
2081 printk("Initial guest OS must load to a page boundary.\n");
2082 return -EINVAL;
2085 pstart_info = PAGE_ALIGN(pkern_end);
2086 if(initrd_start && initrd_len){
2087 unsigned long offset;
2089 /* The next page aligned boundary after the start info.
2090 Note: EFI_PAGE_SHIFT = 12 <= PAGE_SHIFT */
2091 pinitrd_start = pstart_info + PAGE_SIZE;
2093 if ((pinitrd_start + initrd_len - phys_load_offset) >= dom0_size)
2094 panic("%s: not enough memory assigned to dom0", __func__);
2096 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
2097 struct page_info *p;
2098 p = assign_new_domain_page(d, pinitrd_start + offset);
2099 if (p == NULL)
2100 panic("%s: can't allocate page for initrd image", __func__);
2101 if (initrd_len < offset + PAGE_SIZE)
2102 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
2103 initrd_len - offset);
2104 else
2105 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
2109 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
2110 " Kernel image: %lx->%lx\n"
2111 " Entry address: %lx\n"
2112 " Init. ramdisk: %lx len %lx\n"
2113 " Start info.: %lx->%lx\n",
2114 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
2115 pstart_info, pstart_info + PAGE_SIZE);
2117 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
2119 printk("Initial guest OS requires too much space\n"
2120 "(%luMB is greater than %luMB limit)\n",
2121 (pkern_end-pkern_start)>>20,
2122 (max_pages <<PAGE_SHIFT)>>20);
2123 return -ENOMEM;
2126 // if high 3 bits of pkern start are non-zero, error
2128 // if pkern end is after end of metaphysical memory, error
2129 // (we should be able to deal with this... later)
2131 /* Mask all upcalls... */
2132 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
2133 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
2135 if (dom0_max_vcpus == 0)
2136 dom0_max_vcpus = MAX_VIRT_CPUS;
2137 if (dom0_max_vcpus > num_online_cpus())
2138 dom0_max_vcpus = num_online_cpus();
2139 if (dom0_max_vcpus > MAX_VIRT_CPUS)
2140 dom0_max_vcpus = MAX_VIRT_CPUS;
2142 printk ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
2143 for ( i = 1; i < dom0_max_vcpus; i++ )
2144 if (alloc_vcpu(d, i, i) == NULL)
2145 panic("Cannot allocate dom0 vcpu %d\n", i);
2147 /* Copy the OS image. */
2148 loaddomainelfimage(d, &elf, phys_load_offset);
2150 BUILD_BUG_ON(sizeof(start_info_t) + sizeof(dom0_vga_console_info_t) +
2151 sizeof(struct ia64_boot_param) > PAGE_SIZE);
2153 /* Set up start info area. */
2154 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
2155 start_info_page = assign_new_domain_page(d, pstart_info);
2156 if (start_info_page == NULL)
2157 panic("can't allocate start info page");
2158 si = page_to_virt(start_info_page);
2159 clear_page(si);
2160 snprintf(si->magic, sizeof(si->magic), "xen-3.0-ia64");
2161 si->nr_pages = max_pages;
2162 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
2163 si->flags |= (xen_processor_pmbits << 8) & SIF_PM_MASK;
2165 printk("Dom0: 0x%lx\n", (u64)dom0);
2167 v->is_initialised = 1;
2168 clear_bit(_VPF_down, &v->pause_flags);
2170 /* Build firmware.
2171 Note: the Linux kernel reserves the memory used by start_info, so there is
2172 no need to remove it from MDT. */
2173 bp_mpa = pstart_info + sizeof(struct start_info);
2174 rc = dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
2175 if (rc != 0)
2176 return rc;
2178 /* Fill boot param. */
2179 strlcpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
2181 bp = (struct ia64_boot_param *)((unsigned char *)si +
2182 sizeof(start_info_t));
2183 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
2185 /* We assume console has reached the last line! */
2186 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
2187 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
2188 bp->console_info.orig_x = 0;
2189 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
2190 0 : bp->console_info.num_rows - 1;
2192 bp->initrd_start = pinitrd_start;
2193 bp->initrd_size = ia64_boot_param->initrd_size;
2195 ci = (dom0_vga_console_info_t *)((unsigned char *)si +
2196 sizeof(start_info_t) +
2197 sizeof(struct ia64_boot_param));
2199 if (fill_console_start_info(ci)) {
2200 si->console.dom0.info_off = sizeof(start_info_t) +
2201 sizeof(struct ia64_boot_param);
2202 si->console.dom0.info_size = sizeof(dom0_vga_console_info_t);
2205 vcpu_init_regs (v);
2207 vcpu_regs(v)->r28 = bp_mpa;
2209 vcpu_regs (v)->cr_iip = pkern_entry;
2211 physdev_init_dom0(d);
2213 return 0;
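/*
 * construct_dom0() packs three structures into the single start-info
 * page: start_info_t at offset 0, struct ia64_boot_param directly after
 * it (its metaphysical address, bp_mpa, is handed to the kernel in r28),
 * and dom0_vga_console_info_t after that; the BUILD_BUG_ON above ensures
 * all three fit in one page.  The arithmetic below is illustrative only;
 * the structure sizes and the pstart_info value are made-up assumptions.
 */
#include <stdio.h>

int main(void)
{
    unsigned long pstart_info = 0x4000000;  /* assumed metaphysical address      */
    unsigned long si_size     = 1024;       /* assumed sizeof(start_info_t)      */
    unsigned long bp_size     = 256;        /* assumed sizeof(ia64_boot_param)   */

    printf("start_info   at 0x%lx\n", pstart_info);
    printf("boot_param   at 0x%lx (bp_mpa, passed in r28)\n",
           pstart_info + si_size);
    printf("console_info at 0x%lx\n", pstart_info + si_size + bp_size);
    return 0;
}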
2216 void machine_restart(unsigned int delay_millisecs)
2218 mdelay(delay_millisecs);
2219 console_start_sync();
2220 if (running_on_sim)
2221 printk ("machine_restart called. spinning...\n");
2222 else
2223 (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
2224 while(1);
2227 extern void cpu_halt(void);
2229 void machine_halt(void)
2231 console_start_sync();
2233 #ifdef CONFIG_SMP
2234 smp_send_stop();
2235 #endif
2237 printk ("machine_halt called. spinning...\n");
2238 while(1);
2241 void sync_vcpu_execstate(struct vcpu *v)
2243 // __ia64_save_fpu(v->arch._thread.fph);
2244 // FIXME SMP: Anything else needed here for SMP?
2247 /* This function is taken from xen/arch/x86/domain.c */
2248 long
2249 arch_do_vcpu_op(int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
2251 long rc = 0;
2253 switch (cmd) {
2254 case VCPUOP_register_runstate_memory_area:
2256 struct vcpu_register_runstate_memory_area area;
2257 struct vcpu_runstate_info runstate;
2259 rc = -EFAULT;
2260 if (copy_from_guest(&area, arg, 1))
2261 break;
2263 if (!guest_handle_okay(area.addr.h, 1))
2264 break;
2266 rc = 0;
2267 runstate_guest(v) = area.addr.h;
2269 if (v == current) {
2270 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
2271 } else {
2272 vcpu_runstate_get(v, &runstate);
2273 __copy_to_guest(runstate_guest(v), &runstate, 1);
2276 break;
2278 default:
2279 rc = -ENOSYS;
2280 break;
2283 return rc;
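/*
 * For context, a paravirtualized guest registers its per-vcpu runstate
 * area by passing an address through the hypercall handled above; the
 * hypervisor then keeps the structure updated across context switches.
 * The guest-side sketch below is hedged: the header path and the
 * HYPERVISOR_vcpu_op() wrapper are guest-specific assumptions, while the
 * structure and command names come from the Xen public interface.
 */
#include <xen/interface/vcpu.h>       /* assumed guest-side header location */

static struct vcpu_runstate_info runstate_area;

static int register_runstate_area(void)
{
    struct vcpu_register_runstate_memory_area area;

    area.addr.v = &runstate_area;     /* virtual-address member of the union */

    /* HYPERVISOR_vcpu_op() is the guest's hypercall wrapper (assumed). */
    return HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
                              0 /* vcpu id */, &area);
}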
2286 static void __init parse_dom0_mem(char *s)
2288 dom0_size = parse_size_and_unit(s, NULL);
2290 custom_param("dom0_mem", parse_dom0_mem);
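/*
 * dom0_mem is parsed by parse_size_and_unit(), so it accepts a number
 * with an optional unit suffix; the panic message in calc_dom0_size()
 * shows the M and K spellings.  A hedged example of a hypervisor
 * command-line fragment using it (bootloader syntax varies):
 *
 *     ... dom0_mem=512M ...
 */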
2292 /*
2293 * Helper for the identity-mapping optimization feature: translate an
2294 * opt_feature command into its corresponding flag bit.
2295 */
2296 static inline unsigned long
2297 optf_identity_mapping_cmd_to_flg(unsigned long cmd)
2299 switch(cmd) {
2300 case XEN_IA64_OPTF_IDENT_MAP_REG7:
2301 return XEN_IA64_OPTF_IDENT_MAP_REG7_FLG;
2302 case XEN_IA64_OPTF_IDENT_MAP_REG4:
2303 return XEN_IA64_OPTF_IDENT_MAP_REG4_FLG;
2304 case XEN_IA64_OPTF_IDENT_MAP_REG5:
2305 return XEN_IA64_OPTF_IDENT_MAP_REG5_FLG;
2306 default:
2307 BUG();
2308 return 0;
2311 /* NOTREACHED */
2314 static inline void
2315 optf_set_identity_mapping(unsigned long* mask, struct identity_mapping* im,
2316 struct xen_ia64_opt_feature* f)
2318 unsigned long flag = optf_identity_mapping_cmd_to_flg(f->cmd);
2320 if (f->on) {
2321 *mask |= flag;
2322 im->pgprot = f->pgprot;
2323 im->key = f->key;
2324 } else {
2325 *mask &= ~flag;
2326 im->pgprot = 0;
2327 im->key = 0;
2331 /*
2332 * Switch an optimization feature on/off.
2333 * All vcpus other than the caller are paused to avoid racy access to opt_feature.
2334 */
2335 int
2336 domain_opt_feature(struct domain *d, struct xen_ia64_opt_feature* f)
2338 struct opt_feature* optf = &d->arch.opt_feature;
2339 struct vcpu *v;
2340 long rc = 0;
2342 for_each_vcpu(d, v) {
2343 if (v != current)
2344 vcpu_pause(v);
2347 switch (f->cmd) {
2348 case XEN_IA64_OPTF_IDENT_MAP_REG4:
2349 optf_set_identity_mapping(&optf->mask, &optf->im_reg4, f);
2350 break;
2351 case XEN_IA64_OPTF_IDENT_MAP_REG5:
2352 optf_set_identity_mapping(&optf->mask, &optf->im_reg5, f);
2353 break;
2354 case XEN_IA64_OPTF_IDENT_MAP_REG7:
2355 optf_set_identity_mapping(&optf->mask, &optf->im_reg7, f);
2356 break;
2357 default:
2358 printk("%s: unknown opt_feature: %ld\n", __func__, f->cmd);
2359 rc = -ENOSYS;
2360 break;
2363 for_each_vcpu(d, v) {
2364 if (v != current)
2365 vcpu_unpause(v);
2368 return rc;
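/*
 * Callers hand domain_opt_feature() a filled-in xen_ia64_opt_feature
 * describing one identity-mapping region; the fields used above (cmd,
 * on, pgprot, key) are the whole interface.  The sketch below shows a
 * hypothetical caller enabling the region-7 identity map; the function
 * name and the zero pgprot/key values are illustrative placeholders,
 * not taken from this file.
 */
static int example_enable_reg7_ident_map(struct domain *d)
{
    struct xen_ia64_opt_feature f;

    f.cmd    = XEN_IA64_OPTF_IDENT_MAP_REG7;
    f.on     = 1;
    f.pgprot = 0;    /* placeholder page protection bits */
    f.key    = 0;    /* placeholder protection key       */

    return domain_opt_feature(d, &f);
}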