ia64/xen-unstable

view xen/arch/ia64/xen/domain.c @ 16101:e71c7789c2f5

[IA64] Minor clean up of sync_vcpu_execstate()

vmx_save_state() is called by context_switch()

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
author Alex Williamson <alex.williamson@hp.com>
date Fri Oct 12 13:52:30 2007 -0600 (2007-10-12)
parents 9fbbba4c23fb
children 9c52742f7734
line source
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 *
12 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
13 * VA Linux Systems Japan K.K.
14 * dom0 vp model support
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/errno.h>
21 #include <xen/sched.h>
22 #include <xen/smp.h>
23 #include <xen/delay.h>
24 #include <xen/softirq.h>
25 #include <xen/mm.h>
26 #include <xen/iocap.h>
27 #include <asm/asm-xsi-offsets.h>
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <asm/processor.h>
31 #include <xen/event.h>
32 #include <xen/console.h>
33 #include <xen/version.h>
34 #include <public/libelf.h>
35 #include <asm/pgalloc.h>
36 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
37 #include <asm/vcpu.h> /* for function declarations */
38 #include <public/xen.h>
39 #include <xen/domain.h>
40 #include <asm/vmx.h>
41 #include <asm/vmx_vcpu.h>
42 #include <asm/vmx_vpd.h>
43 #include <asm/vmx_phy_mode.h>
44 #include <asm/vhpt.h>
45 #include <asm/vcpu.h>
46 #include <asm/tlbflush.h>
47 #include <asm/regionreg.h>
48 #include <asm/dom_fw.h>
49 #include <asm/shadow.h>
50 #include <xen/guest_access.h>
51 #include <asm/tlb_track.h>
52 #include <asm/perfmon.h>
53 #include <asm/sal.h>
54 #include <public/vcpu.h>
55 #include <linux/cpu.h>
56 #include <linux/notifier.h>
58 /* dom0_size: default memory allocation for dom0 (~4GB) */
59 static unsigned long __initdata dom0_size = 4096UL*1024UL*1024UL;
61 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
62 static unsigned int __initdata dom0_max_vcpus = 4;
63 integer_param("dom0_max_vcpus", dom0_max_vcpus);
65 extern char dom0_command_line[];
67 /* forward declaration */
68 static void init_switch_stack(struct vcpu *v);
70 /* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
71 This is a Xen virtual address. */
72 DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
73 DEFINE_PER_CPU(int *, current_psr_ic_addr);
75 DEFINE_PER_CPU(struct vcpu *, fp_owner);
77 #include <xen/sched-if.h>
79 static void
80 ia64_disable_vhpt_walker(void)
81 {
82 // Disable the VHPT. Without this, ia64_new_rr7() might cause a VHPT
83 // fault because it flushes dtr[IA64_TR_VHPT].
84 // (VHPT_SIZE_LOG2 << 2) is there just to avoid a
85 // Reserved Register/Field fault.
86 ia64_set_pta(VHPT_SIZE_LOG2 << 2);
87 }
89 static void flush_vtlb_for_context_switch(struct vcpu* prev, struct vcpu* next)
90 {
91 int cpu = smp_processor_id();
92 int last_vcpu_id, last_processor;
94 if (!is_idle_domain(prev->domain))
95 tlbflush_update_time
96 (&prev->domain->arch.last_vcpu[cpu].tlbflush_timestamp,
97 tlbflush_current_time());
99 if (is_idle_domain(next->domain))
100 return;
102 last_vcpu_id = next->domain->arch.last_vcpu[cpu].vcpu_id;
103 last_processor = next->arch.last_processor;
105 next->domain->arch.last_vcpu[cpu].vcpu_id = next->vcpu_id;
106 next->arch.last_processor = cpu;
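/*
 * A flush may be needed in two cases: a different vcpu of this domain
 * ran on this physical cpu last (its translations share the domain's
 * RIDs and may still be cached here), or this vcpu last ran on a
 * different physical cpu (entries cached here may be out of date).
 */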
108 if ((last_vcpu_id != next->vcpu_id &&
109 last_vcpu_id != INVALID_VCPU_ID) ||
110 (last_vcpu_id == next->vcpu_id &&
111 last_processor != cpu &&
112 last_processor != INVALID_PROCESSOR)) {
113 #ifdef CONFIG_XEN_IA64_TLBFLUSH_CLOCK
114 u32 last_tlbflush_timestamp =
115 next->domain->arch.last_vcpu[cpu].tlbflush_timestamp;
116 #endif
117 int vhpt_is_flushed = 0;
119 // if the vTLB implementation is changed,
120 // the following must be updated as well.
121 if (VMX_DOMAIN(next)) {
122 // currently the vTLB for a VT-i domain is per vcpu,
123 // so no flushing is needed.
124 } else if (HAS_PERVCPU_VHPT(next->domain)) {
125 // nothing to do
126 } else {
127 if (NEED_FLUSH(__get_cpu_var(vhpt_tlbflush_timestamp),
128 last_tlbflush_timestamp)) {
129 local_vhpt_flush();
130 vhpt_is_flushed = 1;
131 }
132 }
133 if (vhpt_is_flushed || NEED_FLUSH(__get_cpu_var(tlbflush_time),
134 last_tlbflush_timestamp)) {
135 local_flush_tlb_all();
136 perfc_incr(tlbflush_clock_cswitch_purge);
137 } else {
138 perfc_incr(tlbflush_clock_cswitch_skip);
139 }
140 perfc_incr(flush_vtlb_for_context_switch);
141 }
142 }
144 static void flush_cache_for_context_switch(struct vcpu *next)
145 {
146 extern cpumask_t cpu_cache_coherent_map;
147 int cpu = smp_processor_id();
149 if (is_idle_vcpu(next) ||
150 __test_and_clear_bit(cpu, &next->arch.cache_coherent_map)) {
151 if (cpu_test_and_clear(cpu, cpu_cache_coherent_map)) {
152 unsigned long flags;
153 u64 progress = 0;
154 s64 status;
156 local_irq_save(flags);
157 status = ia64_pal_cache_flush(4, 0, &progress, NULL);
158 local_irq_restore(flags);
159 if (status != 0)
160 panic_domain(NULL, "PAL_CACHE_FLUSH ERROR, "
161 "cache_type=4 status %lx", status);
162 }
163 }
164 }
166 static void lazy_fp_switch(struct vcpu *prev, struct vcpu *next)
167 {
168 /*
169 * Implement eager save, lazy restore
170 */
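/*
 * prev's high floating-point registers (f32-f127) are saved right away
 * if it modified them (mfh set), and fp_owner remembers whose values are
 * left in this cpu's registers.  next only gets psr.dfh set, so its
 * first high-FP access faults and the registers are restored on demand.
 */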
171 if (!is_idle_vcpu(prev)) {
172 if (VMX_DOMAIN(prev)) {
173 if (FP_PSR(prev) & IA64_PSR_MFH) {
174 __ia64_save_fpu(prev->arch._thread.fph);
175 __ia64_per_cpu_var(fp_owner) = prev;
176 }
177 } else {
178 if (PSCB(prev, hpsr_mfh)) {
179 __ia64_save_fpu(prev->arch._thread.fph);
180 __ia64_per_cpu_var(fp_owner) = prev;
181 }
182 }
183 }
185 if (!is_idle_vcpu(next)) {
186 if (VMX_DOMAIN(next)) {
187 FP_PSR(next) = IA64_PSR_DFH;
188 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
189 } else {
190 PSCB(next, hpsr_dfh) = 1;
191 PSCB(next, hpsr_mfh) = 0;
192 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
193 }
194 }
195 }
197 void schedule_tail(struct vcpu *prev)
198 {
199 extern char ia64_ivt;
201 context_saved(prev);
202 ia64_disable_vhpt_walker();
204 if (VMX_DOMAIN(current)) {
205 vmx_do_launch(current);
206 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
207 current->processor);
208 } else {
209 ia64_set_iva(&ia64_ivt);
210 load_region_regs(current);
211 ia64_set_pta(vcpu_pta(current));
212 vcpu_load_kernel_regs(current);
213 __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
214 shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
215 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
216 (current->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
217 migrate_timer(&current->arch.hlt_timer, current->processor);
218 }
219 flush_vtlb_for_context_switch(prev, current);
220 }
222 void context_switch(struct vcpu *prev, struct vcpu *next)
223 {
224 uint64_t spsr;
226 local_irq_save(spsr);
228 if (VMX_DOMAIN(prev)) {
229 vmx_save_state(prev);
230 if (!VMX_DOMAIN(next)) {
231 /* VMX domains can change the physical cr.dcr.
232 * Restore default to prevent leakage. */
233 ia64_setreg(_IA64_REG_CR_DCR, IA64_DEFAULT_DCR_BITS);
234 }
235 }
236 if (VMX_DOMAIN(next))
237 vmx_load_state(next);
239 ia64_disable_vhpt_walker();
240 lazy_fp_switch(prev, current);
242 if (prev->arch.dbg_used || next->arch.dbg_used) {
243 /*
244 * Load debug registers either because they are valid or to clear
245 * the previous ones.
246 */
247 ia64_load_debug_regs(next->arch.dbr);
248 }
250 prev = ia64_switch_to(next);
252 /* Note: ia64_switch_to does not return here at vcpu initialization. */
254 if (VMX_DOMAIN(current)) {
255 vmx_load_all_rr(current);
256 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
257 current->processor);
258 } else {
259 struct domain *nd;
260 extern char ia64_ivt;
262 ia64_set_iva(&ia64_ivt);
264 nd = current->domain;
265 if (!is_idle_domain(nd)) {
266 load_region_regs(current);
267 ia64_set_pta(vcpu_pta(current));
268 vcpu_load_kernel_regs(current);
269 if (vcpu_pkr_in_use(current))
270 vcpu_pkr_load_regs(current);
271 vcpu_set_next_timer(current);
272 if (vcpu_timer_expired(current))
273 vcpu_pend_timer(current);
274 __ia64_per_cpu_var(current_psr_i_addr) = &nd->shared_info->
275 vcpu_info[current->vcpu_id].evtchn_upcall_mask;
276 __ia64_per_cpu_var(current_psr_ic_addr) =
277 (int *)(nd->arch.shared_info_va + XSI_PSR_IC_OFS);
278 /* steal time accounting */
279 if (!guest_handle_is_null(runstate_guest(current)))
280 __copy_to_guest(runstate_guest(current), &current->runstate, 1);
281 } else {
282 /* When switching to the idle domain, we only need to disable the
283 * vhpt walker. All accesses that happen within the idle context
284 * will then be handled by the TR mapping and the identity mapping.
285 */
286 __ia64_per_cpu_var(current_psr_i_addr) = NULL;
287 __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
288 }
289 }
290 local_irq_restore(spsr);
292 /* lazy fp */
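/*
 * If this vcpu has moved to a different physical cpu, clear the
 * fp_owner pointer on the cpu it last ran on (only if it still points
 * at this vcpu), so the register contents left there are no longer
 * treated as this vcpu's live high-FP state.
 */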
293 if (current->processor != current->arch.last_processor) {
294 unsigned long *addr;
295 addr = (unsigned long *)per_cpu_addr(fp_owner,
296 current->arch.last_processor);
297 ia64_cmpxchg(acq, addr, current, 0, 8);
298 }
300 flush_vtlb_for_context_switch(prev, current);
301 flush_cache_for_context_switch(current);
302 context_saved(prev);
303 }
305 void continue_running(struct vcpu *same)
306 {
307 /* nothing to do */
308 }
310 #ifdef CONFIG_PERFMON
311 static int pal_halt = 1;
312 static int can_do_pal_halt = 1;
314 static int __init nohalt_setup(char * str)
315 {
316 pal_halt = can_do_pal_halt = 0;
317 return 1;
318 }
319 __setup("nohalt", nohalt_setup);
321 void
322 update_pal_halt_status(int status)
323 {
324 can_do_pal_halt = pal_halt && status;
325 }
326 #else
327 #define can_do_pal_halt (1)
328 #endif
330 static void default_idle(void)
331 {
332 local_irq_disable();
333 if ( !softirq_pending(smp_processor_id()) ) {
334 if (can_do_pal_halt)
335 safe_halt();
336 else
337 cpu_relax();
338 }
339 local_irq_enable();
340 }
342 extern void play_dead(void);
344 static void continue_cpu_idle_loop(void)
345 {
346 int cpu = smp_processor_id();
348 for ( ; ; )
349 {
350 #ifdef IA64
351 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
352 #else
353 irq_stat[cpu].idle_timestamp = jiffies;
354 #endif
355 page_scrub_schedule_work();
356 while ( !softirq_pending(cpu) )
357 default_idle();
358 raise_softirq(SCHEDULE_SOFTIRQ);
359 do_softirq();
360 if (!cpu_online(cpu))
361 play_dead();
362 }
363 }
365 void startup_cpu_idle_loop(void)
366 {
367 /* Just some sanity to ensure that the scheduler is set up okay. */
368 ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
369 raise_softirq(SCHEDULE_SOFTIRQ);
371 continue_cpu_idle_loop();
372 }
374 /* Compile-time check that get_order(sizeof(mapped_regs_t)) ==
375 * get_order_from_shift(XMAPPEDREGS_SHIFT).
376 */
377 #if !(((1 << (XMAPPEDREGS_SHIFT - 1)) < MAPPED_REGS_T_SIZE) && \
378 (MAPPED_REGS_T_SIZE < (1 << (XMAPPEDREGS_SHIFT + 1))))
379 # error "XMAPPEDREGS_SHIFT doesn't match sizeof(mapped_regs_t)."
380 #endif
382 void hlt_timer_fn(void *data)
383 {
384 struct vcpu *v = data;
385 vcpu_unblock(v);
386 }
388 void relinquish_vcpu_resources(struct vcpu *v)
389 {
390 if (HAS_PERVCPU_VHPT(v->domain))
391 pervcpu_vhpt_free(v);
392 if (v->arch.privregs != NULL) {
393 free_xenheap_pages(v->arch.privregs,
394 get_order_from_shift(XMAPPEDREGS_SHIFT));
395 v->arch.privregs = NULL;
396 }
397 kill_timer(&v->arch.hlt_timer);
398 }
400 struct vcpu *alloc_vcpu_struct(void)
401 {
402 struct vcpu *v;
403 struct thread_info *ti;
404 static int first_allocation = 1;
406 if (first_allocation) {
407 first_allocation = 0;
408 /* Keep idle vcpu0 statically allocated at compile time, because
409 * some code inherited from Linux still requires it in the early phase.
410 */
411 return idle_vcpu[0];
412 }
414 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
415 return NULL;
416 memset(v, 0, sizeof(*v));
418 ti = alloc_thread_info(v);
419 /* Clear thread_info to clear some important fields, like
420 * preempt_count
421 */
422 memset(ti, 0, sizeof(struct thread_info));
423 init_switch_stack(v);
425 return v;
426 }
428 void free_vcpu_struct(struct vcpu *v)
429 {
430 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
431 }
433 int vcpu_initialise(struct vcpu *v)
434 {
435 struct domain *d = v->domain;
437 if (!is_idle_domain(d)) {
438 v->arch.metaphysical_rid_dt = d->arch.metaphysical_rid_dt;
439 v->arch.metaphysical_rid_d = d->arch.metaphysical_rid_d;
440 /* Set default values to saved_rr. */
441 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rid_dt;
442 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rid_dt;
444 /* Is this correct?
445 It depends on how the domain uses RIDs.
447 A domain may share RIDs among its processors (eg when it has a
448 global VHPT). In this case we should also share RIDs
449 among vcpus, and the RID range should be the same.
451 However, a domain may use per-cpu RID allocation. In
452 that case we don't want to share RIDs among vcpus, though we may
453 do so if two vcpus are on the same cpu... */
455 v->arch.starting_rid = d->arch.starting_rid;
456 v->arch.ending_rid = d->arch.ending_rid;
457 v->arch.breakimm = d->arch.breakimm;
458 v->arch.last_processor = INVALID_PROCESSOR;
459 v->arch.vhpt_pg_shift = PAGE_SHIFT;
460 }
462 if (!VMX_DOMAIN(v))
463 init_timer(&v->arch.hlt_timer, hlt_timer_fn, v,
464 first_cpu(cpu_online_map));
466 return 0;
467 }
469 void vcpu_share_privregs_with_guest(struct vcpu *v)
470 {
471 struct domain *d = v->domain;
472 int i, order = get_order_from_shift(XMAPPEDREGS_SHIFT);
474 for (i = 0; i < (1 << order); i++)
475 share_xen_page_with_guest(virt_to_page(v->arch.privregs) + i,
476 d, XENSHARE_writable);
477 /*
478 * XXX IA64_XMAPPEDREGS_PADDR
479 * Assign these pages to the guest pseudo-physical address space
480 * so that dom0 can map them by gmfn.
481 * This is necessary for domain save, restore and dump-core.
482 */
483 for (i = 0; i < XMAPPEDREGS_SIZE; i += PAGE_SIZE)
484 assign_domain_page(d, IA64_XMAPPEDREGS_PADDR(v->vcpu_id) + i,
485 virt_to_maddr(v->arch.privregs + i));
486 }
488 int vcpu_late_initialise(struct vcpu *v)
489 {
490 struct domain *d = v->domain;
491 int rc, order;
493 if (HAS_PERVCPU_VHPT(d)) {
494 rc = pervcpu_vhpt_alloc(v);
495 if (rc != 0)
496 return rc;
497 }
499 /* Create privregs page. */
500 order = get_order_from_shift(XMAPPEDREGS_SHIFT);
501 v->arch.privregs = alloc_xenheap_pages(order);
502 BUG_ON(v->arch.privregs == NULL);
503 memset(v->arch.privregs, 0, 1 << XMAPPEDREGS_SHIFT);
504 vcpu_share_privregs_with_guest(v);
506 return 0;
507 }
509 void vcpu_destroy(struct vcpu *v)
510 {
511 if (v->domain->arch.is_vti)
512 vmx_relinquish_vcpu_resources(v);
513 else
514 relinquish_vcpu_resources(v);
515 }
517 static void init_switch_stack(struct vcpu *v)
518 {
519 struct pt_regs *regs = vcpu_regs (v);
520 struct switch_stack *sw = (struct switch_stack *) regs - 1;
521 extern void ia64_ret_from_clone;
523 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
524 sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
525 sw->b0 = (unsigned long) &ia64_ret_from_clone;
526 sw->ar_fpsr = FPSR_DEFAULT;
527 v->arch._thread.ksp = (unsigned long) sw - 16;
528 // Stay on the kernel stack because we may get interrupts!
529 // ia64_ret_from_clone switches to the user stack.
530 v->arch._thread.on_ustack = 0;
531 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
532 }
534 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
535 static int opt_pervcpu_vhpt = 1;
536 integer_param("pervcpu_vhpt", opt_pervcpu_vhpt);
537 #endif
539 int arch_domain_create(struct domain *d)
540 {
541 int i;
543 // the following will eventually need to be negotiated dynamically
544 d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
545 d->arch.breakimm = 0x1000;
546 for (i = 0; i < NR_CPUS; i++) {
547 d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
548 }
550 if (is_idle_domain(d))
551 return 0;
553 foreign_p2m_init(d);
554 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
555 d->arch.has_pervcpu_vhpt = opt_pervcpu_vhpt;
556 dprintk(XENLOG_INFO, "%s:%d domain %d pervcpu_vhpt %d\n",
557 __func__, __LINE__, d->domain_id, d->arch.has_pervcpu_vhpt);
558 #endif
559 if (tlb_track_create(d) < 0)
560 goto fail_nomem1;
561 d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
562 if (d->shared_info == NULL)
563 goto fail_nomem;
564 memset(d->shared_info, 0, XSI_SIZE);
565 for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
566 share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
567 d, XENSHARE_writable);
569 /* We may also need an emulation RID for region4, though a guest is
570 * unlikely to issue uncacheable accesses in metaphysical mode. But
571 * keeping such info here may be saner.
572 */
573 if (!allocate_rid_range(d,0))
574 goto fail_nomem;
576 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
577 d->arch.mm_teardown_offset = 0;
579 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
580 goto fail_nomem;
582 /*
583 * grant_table_create() can't fully initialize the grant table for the
584 * domain because it is called before arch_domain_create().
585 * Here we complete the initialization, which requires the p2m table.
586 */
587 spin_lock(&d->grant_table->lock);
588 for (i = 0; i < nr_grant_frames(d->grant_table); i++)
589 ia64_gnttab_create_shared_page(d, d->grant_table, i);
590 spin_unlock(&d->grant_table->lock);
592 d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
593 RANGESETF_prettyprint_hex);
595 dprintk(XENLOG_DEBUG, "arch_domain_create: domain=%p\n", d);
596 return 0;
598 fail_nomem:
599 tlb_track_destroy(d);
600 fail_nomem1:
601 if (d->arch.mm.pgd != NULL)
602 pgd_free(d->arch.mm.pgd);
603 if (d->shared_info != NULL)
604 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
605 return -ENOMEM;
606 }
608 void arch_domain_destroy(struct domain *d)
609 {
610 mm_final_teardown(d);
612 if (d->shared_info != NULL)
613 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
615 tlb_track_destroy(d);
617 /* Clear vTLB for the next domain. */
618 domain_flush_tlb_vhpt(d);
620 deallocate_rid_range(d);
621 }
623 int arch_vcpu_reset(struct vcpu *v)
624 {
625 /* FIXME: Stub for now */
626 return 0;
627 }
629 #define COPY_FPREG(dst, src) memcpy(dst, src, sizeof(struct ia64_fpreg))
631 void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
632 {
633 int i;
634 struct vcpu_tr_regs *tr = &c.nat->regs.tr;
635 struct cpu_user_regs *uregs = vcpu_regs(v);
636 int is_hvm = VMX_DOMAIN(v);
637 unsigned int rbs_size;
639 c.nat->regs.b[6] = uregs->b6;
640 c.nat->regs.b[7] = uregs->b7;
642 c.nat->regs.ar.csd = uregs->ar_csd;
643 c.nat->regs.ar.ssd = uregs->ar_ssd;
645 c.nat->regs.r[8] = uregs->r8;
646 c.nat->regs.r[9] = uregs->r9;
647 c.nat->regs.r[10] = uregs->r10;
648 c.nat->regs.r[11] = uregs->r11;
650 if (is_hvm)
651 c.nat->regs.psr = vmx_vcpu_get_psr(v);
652 else
653 c.nat->regs.psr = vcpu_get_psr(v);
655 c.nat->regs.ip = uregs->cr_iip;
656 c.nat->regs.cfm = uregs->cr_ifs;
658 c.nat->regs.ar.unat = uregs->ar_unat;
659 c.nat->regs.ar.pfs = uregs->ar_pfs;
660 c.nat->regs.ar.rsc = uregs->ar_rsc;
661 c.nat->regs.ar.rnat = uregs->ar_rnat;
662 c.nat->regs.ar.bspstore = uregs->ar_bspstore;
664 c.nat->regs.pr = uregs->pr;
665 c.nat->regs.b[0] = uregs->b0;
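/*
 * uregs->loadrs holds the size of the dirty register-stack partition
 * in bytes, shifted left by 16 (the ar.rsc.loadrs format); ar.bsp is
 * reconstructed as ar.bspstore plus that byte count.
 */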
666 rbs_size = uregs->loadrs >> 16;
667 c.nat->regs.ar.bsp = uregs->ar_bspstore + rbs_size;
669 c.nat->regs.r[1] = uregs->r1;
670 c.nat->regs.r[12] = uregs->r12;
671 c.nat->regs.r[13] = uregs->r13;
672 c.nat->regs.ar.fpsr = uregs->ar_fpsr;
673 c.nat->regs.r[15] = uregs->r15;
675 c.nat->regs.r[14] = uregs->r14;
676 c.nat->regs.r[2] = uregs->r2;
677 c.nat->regs.r[3] = uregs->r3;
678 c.nat->regs.r[16] = uregs->r16;
679 c.nat->regs.r[17] = uregs->r17;
680 c.nat->regs.r[18] = uregs->r18;
681 c.nat->regs.r[19] = uregs->r19;
682 c.nat->regs.r[20] = uregs->r20;
683 c.nat->regs.r[21] = uregs->r21;
684 c.nat->regs.r[22] = uregs->r22;
685 c.nat->regs.r[23] = uregs->r23;
686 c.nat->regs.r[24] = uregs->r24;
687 c.nat->regs.r[25] = uregs->r25;
688 c.nat->regs.r[26] = uregs->r26;
689 c.nat->regs.r[27] = uregs->r27;
690 c.nat->regs.r[28] = uregs->r28;
691 c.nat->regs.r[29] = uregs->r29;
692 c.nat->regs.r[30] = uregs->r30;
693 c.nat->regs.r[31] = uregs->r31;
695 c.nat->regs.ar.ccv = uregs->ar_ccv;
697 COPY_FPREG(&c.nat->regs.f[6], &uregs->f6);
698 COPY_FPREG(&c.nat->regs.f[7], &uregs->f7);
699 COPY_FPREG(&c.nat->regs.f[8], &uregs->f8);
700 COPY_FPREG(&c.nat->regs.f[9], &uregs->f9);
701 COPY_FPREG(&c.nat->regs.f[10], &uregs->f10);
702 COPY_FPREG(&c.nat->regs.f[11], &uregs->f11);
704 c.nat->regs.r[4] = uregs->r4;
705 c.nat->regs.r[5] = uregs->r5;
706 c.nat->regs.r[6] = uregs->r6;
707 c.nat->regs.r[7] = uregs->r7;
709 /* FIXME: to be reordered. */
710 c.nat->regs.nats = uregs->eml_unat;
712 c.nat->regs.rbs_voff = (IA64_RBS_OFFSET / 8) % 64;
713 if (rbs_size < sizeof (c.nat->regs.rbs))
714 memcpy(c.nat->regs.rbs, (char *)v + IA64_RBS_OFFSET, rbs_size);
716 c.nat->privregs_pfn = get_gpfn_from_mfn
717 (virt_to_maddr(v->arch.privregs) >> PAGE_SHIFT);
719 for (i = 0; i < IA64_NUM_DBG_REGS; i++) {
720 vcpu_get_dbr(v, i, &c.nat->regs.dbr[i]);
721 vcpu_get_ibr(v, i, &c.nat->regs.ibr[i]);
722 }
724 for (i = 0; i < 8; i++)
725 vcpu_get_rr(v, (unsigned long)i << 61, &c.nat->regs.rr[i]);
727 /* Fill extra regs. */
728 for (i = 0;
729 (i < sizeof(tr->itrs) / sizeof(tr->itrs[0])) && i < NITRS;
730 i++) {
731 tr->itrs[i].pte = v->arch.itrs[i].pte.val;
732 tr->itrs[i].itir = v->arch.itrs[i].itir;
733 tr->itrs[i].vadr = v->arch.itrs[i].vadr;
734 tr->itrs[i].rid = v->arch.itrs[i].rid;
735 }
736 for (i = 0;
737 (i < sizeof(tr->dtrs) / sizeof(tr->dtrs[0])) && i < NDTRS;
738 i++) {
739 tr->dtrs[i].pte = v->arch.dtrs[i].pte.val;
740 tr->dtrs[i].itir = v->arch.dtrs[i].itir;
741 tr->dtrs[i].vadr = v->arch.dtrs[i].vadr;
742 tr->dtrs[i].rid = v->arch.dtrs[i].rid;
743 }
744 c.nat->event_callback_ip = v->arch.event_callback_ip;
746 /* If PV and privregs is not set, we can't read mapped registers. */
747 if (!v->domain->arch.is_vti && v->arch.privregs == NULL)
748 return;
750 vcpu_get_dcr(v, &c.nat->regs.cr.dcr);
752 c.nat->regs.cr.itm = v->domain->arch.is_vti ?
753 vmx_vcpu_get_itm(v) : PSCBX(v, domain_itm);
754 vcpu_get_iva(v, &c.nat->regs.cr.iva);
755 vcpu_get_pta(v, &c.nat->regs.cr.pta);
757 vcpu_get_ipsr(v, &c.nat->regs.cr.ipsr);
758 vcpu_get_isr(v, &c.nat->regs.cr.isr);
759 vcpu_get_iip(v, &c.nat->regs.cr.iip);
760 vcpu_get_ifa(v, &c.nat->regs.cr.ifa);
761 vcpu_get_itir(v, &c.nat->regs.cr.itir);
762 vcpu_get_iha(v, &c.nat->regs.cr.iha);
763 vcpu_get_ivr(v, &c.nat->regs.cr.ivr);
764 vcpu_get_iim(v, &c.nat->regs.cr.iim);
766 vcpu_get_tpr(v, &c.nat->regs.cr.tpr);
767 vcpu_get_irr0(v, &c.nat->regs.cr.irr[0]);
768 vcpu_get_irr1(v, &c.nat->regs.cr.irr[1]);
769 vcpu_get_irr2(v, &c.nat->regs.cr.irr[2]);
770 vcpu_get_irr3(v, &c.nat->regs.cr.irr[3]);
771 vcpu_get_itv(v, &c.nat->regs.cr.itv);
772 vcpu_get_pmv(v, &c.nat->regs.cr.pmv);
773 vcpu_get_cmcv(v, &c.nat->regs.cr.cmcv);
774 }
776 int arch_set_info_guest(struct vcpu *v, vcpu_guest_context_u c)
777 {
778 struct cpu_user_regs *uregs = vcpu_regs(v);
779 struct domain *d = v->domain;
780 int was_initialised = v->is_initialised;
781 unsigned int rbs_size;
782 int rc, i;
784 /* Finish vcpu initialization. */
785 if (!was_initialised) {
786 if (d->arch.is_vti)
787 rc = vmx_final_setup_guest(v);
788 else
789 rc = vcpu_late_initialise(v);
790 if (rc != 0)
791 return rc;
793 vcpu_init_regs(v);
795 v->is_initialised = 1;
796 /* Auto-online VCPU0 when it is initialised. */
797 if (v->vcpu_id == 0)
798 clear_bit(_VPF_down, &v->pause_flags);
799 }
801 if (c.nat == NULL)
802 return 0;
804 uregs->b6 = c.nat->regs.b[6];
805 uregs->b7 = c.nat->regs.b[7];
807 uregs->ar_csd = c.nat->regs.ar.csd;
808 uregs->ar_ssd = c.nat->regs.ar.ssd;
810 uregs->r8 = c.nat->regs.r[8];
811 uregs->r9 = c.nat->regs.r[9];
812 uregs->r10 = c.nat->regs.r[10];
813 uregs->r11 = c.nat->regs.r[11];
815 if (!d->arch.is_vti)
816 vcpu_set_psr(v, c.nat->regs.psr);
817 else
818 vmx_vcpu_set_psr(v, c.nat->regs.psr);
819 uregs->cr_iip = c.nat->regs.ip;
820 uregs->cr_ifs = c.nat->regs.cfm;
822 uregs->ar_unat = c.nat->regs.ar.unat;
823 uregs->ar_pfs = c.nat->regs.ar.pfs;
824 uregs->ar_rsc = c.nat->regs.ar.rsc;
825 uregs->ar_rnat = c.nat->regs.ar.rnat;
826 uregs->ar_bspstore = c.nat->regs.ar.bspstore;
828 uregs->pr = c.nat->regs.pr;
829 uregs->b0 = c.nat->regs.b[0];
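/*
 * rbs_size is the register-backing-store size the guest claims; the
 * copy below is only done when it is consistent with the loadrs value
 * that will be used to reload it (see the protection check below).
 */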
830 rbs_size = c.nat->regs.ar.bsp - c.nat->regs.ar.bspstore;
831 /* Protection against crazy user code. */
832 if (!was_initialised)
833 uregs->loadrs = (rbs_size) << 16;
834 if (rbs_size == (uregs->loadrs >> 16))
835 memcpy((char *)v + IA64_RBS_OFFSET, c.nat->regs.rbs, rbs_size);
837 uregs->r1 = c.nat->regs.r[1];
838 uregs->r12 = c.nat->regs.r[12];
839 uregs->r13 = c.nat->regs.r[13];
840 uregs->ar_fpsr = c.nat->regs.ar.fpsr;
841 uregs->r15 = c.nat->regs.r[15];
843 uregs->r14 = c.nat->regs.r[14];
844 uregs->r2 = c.nat->regs.r[2];
845 uregs->r3 = c.nat->regs.r[3];
846 uregs->r16 = c.nat->regs.r[16];
847 uregs->r17 = c.nat->regs.r[17];
848 uregs->r18 = c.nat->regs.r[18];
849 uregs->r19 = c.nat->regs.r[19];
850 uregs->r20 = c.nat->regs.r[20];
851 uregs->r21 = c.nat->regs.r[21];
852 uregs->r22 = c.nat->regs.r[22];
853 uregs->r23 = c.nat->regs.r[23];
854 uregs->r24 = c.nat->regs.r[24];
855 uregs->r25 = c.nat->regs.r[25];
856 uregs->r26 = c.nat->regs.r[26];
857 uregs->r27 = c.nat->regs.r[27];
858 uregs->r28 = c.nat->regs.r[28];
859 uregs->r29 = c.nat->regs.r[29];
860 uregs->r30 = c.nat->regs.r[30];
861 uregs->r31 = c.nat->regs.r[31];
863 uregs->ar_ccv = c.nat->regs.ar.ccv;
865 COPY_FPREG(&uregs->f6, &c.nat->regs.f[6]);
866 COPY_FPREG(&uregs->f7, &c.nat->regs.f[7]);
867 COPY_FPREG(&uregs->f8, &c.nat->regs.f[8]);
868 COPY_FPREG(&uregs->f9, &c.nat->regs.f[9]);
869 COPY_FPREG(&uregs->f10, &c.nat->regs.f[10]);
870 COPY_FPREG(&uregs->f11, &c.nat->regs.f[11]);
872 uregs->r4 = c.nat->regs.r[4];
873 uregs->r5 = c.nat->regs.r[5];
874 uregs->r6 = c.nat->regs.r[6];
875 uregs->r7 = c.nat->regs.r[7];
877 /* FIXME: to be reordered and restored. */
878 /* uregs->eml_unat = c.nat->regs.nat; */
879 uregs->eml_unat = 0;
881 if (!d->arch.is_vti) {
882 /* domain runs at PL2/3 */
883 uregs->cr_ipsr = vcpu_pl_adjust(uregs->cr_ipsr,
884 IA64_PSR_CPL0_BIT);
885 uregs->ar_rsc = vcpu_pl_adjust(uregs->ar_rsc, 2);
886 }
888 for (i = 0; i < IA64_NUM_DBG_REGS; i++) {
889 vcpu_set_dbr(v, i, c.nat->regs.dbr[i]);
890 vcpu_set_ibr(v, i, c.nat->regs.ibr[i]);
891 }
893 if (c.nat->flags & VGCF_EXTRA_REGS) {
894 struct vcpu_tr_regs *tr = &c.nat->regs.tr;
896 for (i = 0;
897 (i < sizeof(tr->itrs) / sizeof(tr->itrs[0])) && i < NITRS;
898 i++) {
899 vcpu_set_itr(v, i, tr->itrs[i].pte,
900 tr->itrs[i].itir,
901 tr->itrs[i].vadr,
902 tr->itrs[i].rid);
903 }
904 for (i = 0;
905 (i < sizeof(tr->dtrs) / sizeof(tr->dtrs[0])) && i < NDTRS;
906 i++) {
907 vcpu_set_dtr(v, i,
908 tr->dtrs[i].pte,
909 tr->dtrs[i].itir,
910 tr->dtrs[i].vadr,
911 tr->dtrs[i].rid);
912 }
913 v->arch.event_callback_ip = c.nat->event_callback_ip;
914 v->arch.iva = c.nat->regs.cr.iva;
915 }
917 return 0;
918 }
920 static void relinquish_memory(struct domain *d, struct list_head *list)
921 {
922 struct list_head *ent;
923 struct page_info *page;
924 #ifndef __ia64__
925 unsigned long x, y;
926 #endif
928 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
929 spin_lock_recursive(&d->page_alloc_lock);
930 ent = list->next;
931 while ( ent != list )
932 {
933 page = list_entry(ent, struct page_info, list);
934 /* Grab a reference to the page so it won't disappear from under us. */
935 if ( unlikely(!get_page(page, d)) )
936 {
937 /* Couldn't get a reference -- someone is freeing this page. */
938 ent = ent->next;
939 continue;
940 }
942 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
943 put_page_and_type(page);
945 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
946 put_page(page);
948 #ifndef __ia64__
949 /*
950 * Forcibly invalidate base page tables at this point to break circular
951 * 'linear page table' references. This is okay because MMU structures
952 * are not shared across domains and this domain is now dead. Thus base
953 * tables are not in use so a non-zero count means circular reference.
954 */
955 y = page->u.inuse.type_info;
956 for ( ; ; )
957 {
958 x = y;
959 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
960 (PGT_base_page_table|PGT_validated)) )
961 break;
963 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
964 if ( likely(y == x) )
965 {
966 free_page_type(page, PGT_base_page_table);
967 break;
968 }
969 }
970 #endif
972 /* Follow the list chain and /then/ potentially free the page. */
973 ent = ent->next;
974 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
975 put_page(page);
976 }
978 spin_unlock_recursive(&d->page_alloc_lock);
979 }
981 int domain_relinquish_resources(struct domain *d)
982 {
983 int ret;
984 /* Relinquish guest resources for VT-i domain. */
985 if (d->arch.is_vti)
986 vmx_relinquish_guest_resources(d);
988 /* Tear down shadow mode stuff. */
989 ret = mm_teardown(d);
990 if (ret != 0)
991 return ret;
993 /* Relinquish every page of memory. */
994 relinquish_memory(d, &d->xenpage_list);
995 relinquish_memory(d, &d->page_list);
997 if (d->arch.is_vti && d->arch.sal_data)
998 xfree(d->arch.sal_data);
1000 /* Free page used by xen oprofile buffer */
1001 free_xenoprof_pages(d);
1003 return 0;
1004 }
1006 unsigned long
1007 domain_set_shared_info_va (unsigned long va)
1008 {
1009 struct vcpu *v = current;
1010 struct domain *d = v->domain;
1012 /* Check virtual address:
1013 must belong to region 7,
1014 must be 64Kb aligned,
1015 must not be within Xen virtual space. */
1016 if ((va >> 61) != 7
1017 || (va & 0xffffUL) != 0
1018 || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
1019 panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);
1021 /* Note: this doesn't work well if other cpus are already running.
1022 However this is part of the spec :-) */
1023 gdprintk(XENLOG_DEBUG, "Domain set shared_info_va to 0x%016lx\n", va);
1024 d->arch.shared_info_va = va;
1026 VCPU(v, interrupt_mask_addr) = (unsigned char *)va +
1027 INT_ENABLE_OFFSET(v);
1029 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)(va + XSI_PSR_IC_OFS);
1031 /* Remap the shared pages. */
1032 set_one_rr (7UL << 61, PSCB(v,rrs[7]));
1034 return 0;
1035 }
1037 /* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
1038 #define SHADOW_COPY_CHUNK 1024
1040 int shadow_mode_control(struct domain *d, xen_domctl_shadow_op_t *sc)
1041 {
1042 unsigned int op = sc->op;
1043 int rc = 0;
1044 int i;
1045 //struct vcpu *v;
1047 if (unlikely(d == current->domain)) {
1048 gdprintk(XENLOG_INFO,
1049 "Don't try to do a shadow op on yourself!\n");
1050 return -EINVAL;
1051 }
1053 domain_pause(d);
1055 switch (op)
1056 {
1057 case XEN_DOMCTL_SHADOW_OP_OFF:
1058 if (shadow_mode_enabled (d)) {
1059 u64 *bm = d->arch.shadow_bitmap;
1061 /* Flush vhpt and tlb to restore dirty bit usage. */
1062 domain_flush_tlb_vhpt(d);
1064 /* Free bitmap. */
1065 d->arch.shadow_bitmap_size = 0;
1066 d->arch.shadow_bitmap = NULL;
1067 xfree(bm);
1068 }
1069 break;
1071 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
1072 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
1073 rc = -EINVAL;
1074 break;
1076 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
1077 if (shadow_mode_enabled(d)) {
1078 rc = -EINVAL;
1079 break;
1080 }
1082 atomic64_set(&d->arch.shadow_fault_count, 0);
1083 atomic64_set(&d->arch.shadow_dirty_count, 0);
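/*
 * The dirty bitmap holds one bit per guest page frame up to
 * convmem_end; its size (in bits) is rounded up to a multiple of
 * BITS_PER_LONG so it can be allocated as unsigned long words.
 */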
1085 d->arch.shadow_bitmap_size =
1086 ((d->arch.convmem_end >> PAGE_SHIFT) +
1087 BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
1088 d->arch.shadow_bitmap = xmalloc_array(unsigned long,
1089 d->arch.shadow_bitmap_size / BITS_PER_LONG);
1090 if (d->arch.shadow_bitmap == NULL) {
1091 d->arch.shadow_bitmap_size = 0;
1092 rc = -ENOMEM;
1093 }
1094 else {
1095 memset(d->arch.shadow_bitmap, 0,
1096 d->arch.shadow_bitmap_size / 8);
1098 /* Flush vhpt and tlb to enable dirty bit
1099 virtualization. */
1100 domain_flush_tlb_vhpt(d);
1101 }
1102 break;
1104 case XEN_DOMCTL_SHADOW_OP_CLEAN:
1105 {
1106 int nbr_bytes;
1108 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
1109 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
1111 atomic64_set(&d->arch.shadow_fault_count, 0);
1112 atomic64_set(&d->arch.shadow_dirty_count, 0);
1114 if (guest_handle_is_null(sc->dirty_bitmap) ||
1115 (d->arch.shadow_bitmap == NULL)) {
1116 rc = -EINVAL;
1117 break;
1118 }
1120 if (sc->pages > d->arch.shadow_bitmap_size)
1121 sc->pages = d->arch.shadow_bitmap_size;
1123 nbr_bytes = (sc->pages + 7) / 8;
1125 for (i = 0; i < nbr_bytes; i += SHADOW_COPY_CHUNK) {
1126 int size = (nbr_bytes - i) > SHADOW_COPY_CHUNK ?
1127 SHADOW_COPY_CHUNK : nbr_bytes - i;
1129 if (copy_to_guest_offset(
1130 sc->dirty_bitmap, i,
1131 (uint8_t *)d->arch.shadow_bitmap + i,
1132 size)) {
1133 rc = -EFAULT;
1134 break;
1135 }
1137 memset((uint8_t *)d->arch.shadow_bitmap + i, 0, size);
1138 }
1140 break;
1141 }
1143 case XEN_DOMCTL_SHADOW_OP_PEEK:
1144 {
1145 unsigned long size;
1147 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
1148 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
1150 if (guest_handle_is_null(sc->dirty_bitmap) ||
1151 (d->arch.shadow_bitmap == NULL)) {
1152 rc = -EINVAL;
1153 break;
1154 }
1156 if (sc->pages > d->arch.shadow_bitmap_size)
1157 sc->pages = d->arch.shadow_bitmap_size;
1159 size = (sc->pages + 7) / 8;
1160 if (copy_to_guest(sc->dirty_bitmap,
1161 (uint8_t *)d->arch.shadow_bitmap, size)) {
1162 rc = -EFAULT;
1163 break;
1164 }
1165 break;
1166 }
1167 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
1168 sc->mb = 0;
1169 break;
1170 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
1171 if (sc->mb > 0) {
1172 BUG();
1173 rc = -ENOMEM;
1174 }
1175 break;
1176 default:
1177 rc = -EINVAL;
1178 break;
1179 }
1181 domain_unpause(d);
1183 return rc;
1184 }
1186 // remove following line if not privifying in memory
1187 //#define HAVE_PRIVIFY_MEMORY
1188 #ifndef HAVE_PRIVIFY_MEMORY
1189 #define privify_memory(x,y) do {} while(0)
1190 #endif
1192 static void __init loaddomainelfimage(struct domain *d, struct elf_binary *elf,
1193 unsigned long phys_load_offset)
1194 {
1195 const elf_phdr *phdr;
1196 int phnum, h, filesz, memsz;
1197 unsigned long elfaddr, dom_mpaddr, dom_imva;
1198 struct page_info *p;
1200 phnum = elf_uval(elf, elf->ehdr, e_phnum);
1201 for (h = 0; h < phnum; h++) {
1202 phdr = elf_phdr_by_index(elf, h);
1203 if (!elf_phdr_is_loadable(elf, phdr))
1204 continue;
1206 filesz = elf_uval(elf, phdr, p_filesz);
1207 memsz = elf_uval(elf, phdr, p_memsz);
1208 elfaddr = (unsigned long) elf->image + elf_uval(elf, phdr, p_offset);
1209 dom_mpaddr = elf_uval(elf, phdr, p_paddr);
1210 dom_mpaddr += phys_load_offset;
1212 while (memsz > 0) {
1213 p = assign_new_domain_page(d,dom_mpaddr);
1214 BUG_ON (unlikely(p == NULL));
1215 dom_imva = __va_ul(page_to_maddr(p));
1216 if (filesz > 0) {
1217 if (filesz >= PAGE_SIZE)
1218 copy_page((void *) dom_imva,
1219 (void *) elfaddr);
1220 else {
1221 // copy partial page
1222 memcpy((void *) dom_imva,
1223 (void *) elfaddr, filesz);
1224 // zero the rest of page
1225 memset((void *) dom_imva+filesz, 0,
1226 PAGE_SIZE-filesz);
1227 }
1228 //FIXME: This test for code seems to find a lot more than objdump -x does
1229 if (elf_uval(elf, phdr, p_flags) & PF_X) {
1230 privify_memory(dom_imva,PAGE_SIZE);
1231 flush_icache_range(dom_imva,
1232 dom_imva+PAGE_SIZE);
1233 }
1234 }
1235 else if (memsz > 0) {
1236 /* always zero out entire page */
1237 clear_page((void *) dom_imva);
1238 }
1239 memsz -= PAGE_SIZE;
1240 filesz -= PAGE_SIZE;
1241 elfaddr += PAGE_SIZE;
1242 dom_mpaddr += PAGE_SIZE;
1243 }
1244 }
1245 }
1247 static void __init calc_dom0_size(void)
1248 {
1249 unsigned long domheap_pages;
1250 unsigned long p2m_pages;
1251 unsigned long spare_hv_pages;
1252 unsigned long max_dom0_size;
1254 /* Estimate maximum memory we can safely allocate for dom0
1255 * by subtracting the p2m table allocation and a chunk of memory
1256 * for DMA and PCI mapping from the available domheap pages. The
1257 * chunk for DMA, PCI, etc., is a guesstimate, as xen doesn't seem
1258 * to have a good idea of what those requirements might be ahead
1259 * of time, calculated at 1MB per 4GB of system memory */
1260 domheap_pages = avail_domheap_pages();
1261 p2m_pages = domheap_pages / PTRS_PER_PTE;
1262 spare_hv_pages = domheap_pages / 4096;
1263 max_dom0_size = (domheap_pages - (p2m_pages + spare_hv_pages))
1264 * PAGE_SIZE;
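/*
 * For example, with 16KB pages this sets aside roughly 1/2048 of the
 * domheap for the p2m table (about 8MB per 16GB) and one page in 4096
 * (about 1MB per 4GB) for the DMA/PCI chunk.
 */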
1265 printk("Maximum permitted dom0 size: %luMB\n",
1266 max_dom0_size / (1024*1024));
1268 /* validate proposed dom0_size, fix up as needed */
1269 if (dom0_size > max_dom0_size) {
1270 printk("Reducing dom0 memory allocation from %luK to %luK "
1271 "to fit available memory\n",
1272 dom0_size / 1024, max_dom0_size / 1024);
1273 dom0_size = max_dom0_size;
1274 }
1276 /* dom0_mem=0 can be passed in to give all available mem to dom0 */
1277 if (dom0_size == 0) {
1278 printk("Allocating all available memory to dom0\n");
1279 dom0_size = max_dom0_size;
1280 }
1282 /* Check dom0 size. */
1283 if (dom0_size < 4 * 1024 * 1024) {
1284 panic("dom0_mem is too small, boot aborted"
1285 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
1286 }
1288 if (running_on_sim) {
1289 dom0_size = 128*1024*1024; //FIXME: Should be configurable
1290 }
1292 /* no need to allocate pages for now
1293 * pages are allocated by map_new_domain_page() via loaddomainelfimage()
1294 */
1295 }
1298 /*
1299 * Domain 0 has direct access to all devices absolutely. However
1300 * the major point of this stub is to allow alloc_dom_mem to be
1301 * handled with order > 0 requests. Dom0 requires that bit set to
1302 * allocate memory for other domains.
1303 */
1304 static void __init physdev_init_dom0(struct domain *d)
1305 {
1306 if (iomem_permit_access(d, 0UL, ~0UL))
1307 BUG();
1308 if (irqs_permit_access(d, 0, NR_IRQS-1))
1309 BUG();
1310 if (ioports_permit_access(d, 0, 0xffff))
1311 BUG();
1312 }
1314 int __init construct_dom0(struct domain *d,
1315 unsigned long image_start, unsigned long image_len,
1316 unsigned long initrd_start, unsigned long initrd_len,
1317 char *cmdline)
1318 {
1319 int i, rc;
1320 start_info_t *si;
1321 dom0_vga_console_info_t *ci;
1322 struct vcpu *v = d->vcpu[0];
1323 unsigned long max_pages;
1325 struct elf_binary elf;
1326 struct elf_dom_parms parms;
1327 unsigned long p_start;
1328 unsigned long pkern_start;
1329 unsigned long pkern_entry;
1330 unsigned long pkern_end;
1331 unsigned long pinitrd_start = 0;
1332 unsigned long pstart_info;
1333 unsigned long phys_load_offset;
1334 struct page_info *start_info_page;
1335 unsigned long bp_mpa;
1336 struct ia64_boot_param *bp;
1338 //printk("construct_dom0: starting\n");
1340 /* Sanity! */
1341 BUG_ON(d != dom0);
1342 BUG_ON(d->vcpu[0] == NULL);
1343 BUG_ON(v->is_initialised);
1345 printk("*** LOADING DOMAIN 0 ***\n");
1347 calc_dom0_size();
1349 max_pages = dom0_size / PAGE_SIZE;
1350 d->max_pages = max_pages;
1351 d->tot_pages = 0;
1353 rc = elf_init(&elf, (void*)image_start, image_len);
1354 if ( rc != 0 )
1355 return rc;
1356 #ifdef VERBOSE
1357 elf_set_verbose(&elf);
1358 #endif
1359 elf_parse_binary(&elf);
1360 if (0 != (elf_xen_parse(&elf, &parms)))
1361 return rc;
1363 /*
1364 * We cannot rely on the load address in the ELF headers to
1365 * determine the meta physical address at which the image
1366 * is loaded. Patch the address to match the real one, based
1367 * on xen_pstart
1368 */
1369 phys_load_offset = xen_pstart - elf.pstart;
1370 elf.pstart += phys_load_offset;
1371 elf.pend += phys_load_offset;
1372 parms.virt_kstart += phys_load_offset;
1373 parms.virt_kend += phys_load_offset;
1374 parms.virt_entry += phys_load_offset;
1376 printk(" Dom0 kernel: %s, %s, paddr 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
1377 elf_64bit(&elf) ? "64-bit" : "32-bit",
1378 elf_msb(&elf) ? "msb" : "lsb",
1379 elf.pstart, elf.pend);
1380 if (!elf_64bit(&elf) ||
1381 elf_uval(&elf, elf.ehdr, e_machine) != EM_IA_64) {
1382 printk("Incompatible kernel binary\n");
1383 return -1;
1384 }
1386 p_start = parms.virt_base;
1387 pkern_start = parms.virt_kstart;
1388 pkern_end = parms.virt_kend;
1389 pkern_entry = parms.virt_entry;
1391 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
1393 if ( (p_start & (PAGE_SIZE-1)) != 0 )
1394 {
1395 printk("Initial guest OS must load to a page boundary.\n");
1396 return -EINVAL;
1397 }
1399 pstart_info = PAGE_ALIGN(pkern_end);
1400 if(initrd_start && initrd_len){
1401 unsigned long offset;
1403 /* The next page aligned boundary after the start info.
1404 Note: EFI_PAGE_SHIFT = 12 <= PAGE_SHIFT */
1405 pinitrd_start = pstart_info + PAGE_SIZE;
1407 if ((pinitrd_start + initrd_len - phys_load_offset) >= dom0_size)
1408 panic("%s: not enough memory assigned to dom0", __func__);
1410 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
1411 struct page_info *p;
1412 p = assign_new_domain_page(d, pinitrd_start + offset);
1413 if (p == NULL)
1414 panic("%s: can't allocate page for initrd image", __func__);
1415 if (initrd_len < offset + PAGE_SIZE)
1416 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
1417 initrd_len - offset);
1418 else
1419 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
1420 }
1421 }
1423 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
1424 " Kernel image: %lx->%lx\n"
1425 " Entry address: %lx\n"
1426 " Init. ramdisk: %lx len %lx\n"
1427 " Start info.: %lx->%lx\n",
1428 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
1429 pstart_info, pstart_info + PAGE_SIZE);
1431 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
1432 {
1433 printk("Initial guest OS requires too much space\n"
1434 "(%luMB is greater than %luMB limit)\n",
1435 (pkern_end-pkern_start)>>20,
1436 (max_pages <<PAGE_SHIFT)>>20);
1437 return -ENOMEM;
1438 }
1440 // if high 3 bits of pkern start are non-zero, error
1442 // if pkern end is after end of metaphysical memory, error
1443 // (we should be able to deal with this... later)
1445 /* Mask all upcalls... */
1446 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
1447 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
1449 if (dom0_max_vcpus == 0)
1450 dom0_max_vcpus = MAX_VIRT_CPUS;
1451 if (dom0_max_vcpus > num_online_cpus())
1452 dom0_max_vcpus = num_online_cpus();
1453 if (dom0_max_vcpus > MAX_VIRT_CPUS)
1454 dom0_max_vcpus = MAX_VIRT_CPUS;
1456 printk ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
1457 for ( i = 1; i < dom0_max_vcpus; i++ )
1458 if (alloc_vcpu(d, i, i) == NULL)
1459 panic("Cannot allocate dom0 vcpu %d\n", i);
1461 /* Copy the OS image. */
1462 loaddomainelfimage(d, &elf, phys_load_offset);
1464 BUILD_BUG_ON(sizeof(start_info_t) + sizeof(dom0_vga_console_info_t) +
1465 sizeof(struct ia64_boot_param) > PAGE_SIZE);
1467 /* Set up start info area. */
1468 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
1469 start_info_page = assign_new_domain_page(d, pstart_info);
1470 if (start_info_page == NULL)
1471 panic("can't allocate start info page");
1472 si = page_to_virt(start_info_page);
1473 clear_page(si);
1474 snprintf(si->magic, sizeof(si->magic), "xen-%i.%i-ia64",
1475 xen_major_version(), xen_minor_version());
1476 si->nr_pages = max_pages;
1477 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
1479 printk("Dom0: 0x%lx\n", (u64)dom0);
1481 v->is_initialised = 1;
1482 clear_bit(_VPF_down, &v->pause_flags);
1484 /* Build firmware.
1485 Note: the Linux kernel reserves the memory used by start_info, so there
1486 is no need to remove it from the MDT. */
1487 bp_mpa = pstart_info + sizeof(struct start_info);
1488 rc = dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
1489 if (rc != 0)
1490 return rc;
1492 /* Fill boot param. */
1493 strlcpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
1495 bp = (struct ia64_boot_param *)((unsigned char *)si +
1496 sizeof(start_info_t));
1497 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
1499 /* We assume console has reached the last line! */
1500 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
1501 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
1502 bp->console_info.orig_x = 0;
1503 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
1504 0 : bp->console_info.num_rows - 1;
1506 bp->initrd_start = pinitrd_start;
1507 bp->initrd_size = ia64_boot_param->initrd_size;
1509 ci = (dom0_vga_console_info_t *)((unsigned char *)si +
1510 sizeof(start_info_t) +
1511 sizeof(struct ia64_boot_param));
1513 if (fill_console_start_info(ci)) {
1514 si->console.dom0.info_off = sizeof(start_info_t) +
1515 sizeof(struct ia64_boot_param);
1516 si->console.dom0.info_size = sizeof(dom0_vga_console_info_t);
1517 }
1519 vcpu_init_regs (v);
1521 vcpu_regs(v)->r28 = bp_mpa;
1523 vcpu_regs (v)->cr_iip = pkern_entry;
1525 physdev_init_dom0(d);
1527 return 0;
1528 }
1530 void machine_restart(void)
1531 {
1532 console_start_sync();
1533 if (running_on_sim)
1534 printk ("machine_restart called. spinning...\n");
1535 else
1536 (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
1537 while(1);
1538 }
1540 extern void cpu_halt(void);
1542 void machine_halt(void)
1543 {
1544 console_start_sync();
1545 if (running_on_sim)
1546 printk ("machine_halt called. spinning...\n");
1547 else
1548 cpu_halt();
1549 while(1);
1550 }
1552 void sync_vcpu_execstate(struct vcpu *v)
1553 {
1554 // __ia64_save_fpu(v->arch._thread.fph);
1555 // FIXME SMP: Anything else needed here for SMP?
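// Nothing to do here: vmx_save_state() is already called from
// context_switch() (see the changeset description above), and the
// fph state is saved by lazy_fp_switch() during the switch.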
1556 }
1558 /* This function is taken from xen/arch/x86/domain.c */
1559 long
1560 arch_do_vcpu_op(int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
1561 {
1562 long rc = 0;
1564 switch (cmd) {
1565 case VCPUOP_register_runstate_memory_area:
1566 {
1567 struct vcpu_register_runstate_memory_area area;
1568 struct vcpu_runstate_info runstate;
1570 rc = -EFAULT;
1571 if (copy_from_guest(&area, arg, 1))
1572 break;
1574 if (!guest_handle_okay(area.addr.h, 1))
1575 break;
1577 rc = 0;
1578 runstate_guest(v) = area.addr.h;
1580 if (v == current) {
1581 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
1582 } else {
1583 vcpu_runstate_get(v, &runstate);
1584 __copy_to_guest(runstate_guest(v), &runstate, 1);
1585 }
1587 break;
1588 }
1589 default:
1590 rc = -ENOSYS;
1591 break;
1592 }
1594 return rc;
1595 }
1597 static void __init parse_dom0_mem(char *s)
1598 {
1599 dom0_size = parse_size_and_unit(s, NULL);
1600 }
1601 custom_param("dom0_mem", parse_dom0_mem);
1603 /*
1604 * Helper function for the optimization stuff handling the identity mapping
1605 * feature.
1606 */
1607 static inline void
1608 optf_set_identity_mapping(unsigned long* mask, struct identity_mapping* im,
1609 struct xen_ia64_opt_feature* f)
1610 {
1611 if (f->on) {
1612 *mask |= f->cmd;
1613 im->pgprot = f->pgprot;
1614 im->key = f->key;
1615 } else {
1616 *mask &= ~(f->cmd);
1617 im->pgprot = 0;
1618 im->key = 0;
1619 }
1620 }
1622 /* Switch an optimization feature on/off. */
1623 int
1624 domain_opt_feature(struct xen_ia64_opt_feature* f)
1625 {
1626 struct opt_feature* optf = &(current->domain->arch.opt_feature);
1627 long rc = 0;
1629 switch (f->cmd) {
1630 case XEN_IA64_OPTF_IDENT_MAP_REG4:
1631 optf_set_identity_mapping(&optf->mask, &optf->im_reg4, f);
1632 break;
1633 case XEN_IA64_OPTF_IDENT_MAP_REG5:
1634 optf_set_identity_mapping(&optf->mask, &optf->im_reg5, f);
1635 break;
1636 case XEN_IA64_OPTF_IDENT_MAP_REG7:
1637 optf_set_identity_mapping(&optf->mask, &optf->im_reg7, f);
1638 break;
1639 default:
1640 printk("%s: unknown opt_feature: %ld\n", __func__, f->cmd);
1641 rc = -ENOSYS;
1642 break;
1643 }
1644 return rc;
1645 }