ia64/xen-unstable

view xen/arch/ia64/xen/domain.c @ 15826:7e79e7f01f3d

Implement ia64 continuable domain destroy.
Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
author kfraser@localhost.localdomain
date Fri Aug 31 15:46:37 2007 +0100 (2007-08-31)
parents bd59dd48e208
children 4ffca478e2f7 1c85fe14169f
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 *
12 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
13 * VA Linux Systems Japan K.K.
14 * dom0 vp model support
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/errno.h>
21 #include <xen/sched.h>
22 #include <xen/smp.h>
23 #include <xen/delay.h>
24 #include <xen/softirq.h>
25 #include <xen/mm.h>
26 #include <xen/iocap.h>
27 #include <asm/asm-xsi-offsets.h>
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <asm/processor.h>
31 #include <xen/event.h>
32 #include <xen/console.h>
33 #include <xen/version.h>
34 #include <public/libelf.h>
35 #include <asm/pgalloc.h>
36 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
37 #include <asm/vcpu.h> /* for function declarations */
38 #include <public/xen.h>
39 #include <xen/domain.h>
40 #include <asm/vmx.h>
41 #include <asm/vmx_vcpu.h>
42 #include <asm/vmx_vpd.h>
43 #include <asm/vmx_phy_mode.h>
44 #include <asm/vhpt.h>
45 #include <asm/vcpu.h>
46 #include <asm/tlbflush.h>
47 #include <asm/regionreg.h>
48 #include <asm/dom_fw.h>
49 #include <asm/shadow.h>
50 #include <xen/guest_access.h>
51 #include <asm/tlb_track.h>
52 #include <asm/perfmon.h>
53 #include <public/vcpu.h>
55 /* dom0_size: default memory allocation for dom0 (~4GB) */
56 static unsigned long __initdata dom0_size = 4096UL*1024UL*1024UL;
58 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
59 static unsigned int __initdata dom0_max_vcpus = 4;
60 integer_param("dom0_max_vcpus", dom0_max_vcpus);
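/*
 * Example (illustrative values only) of setting these knobs on the Xen
 * boot line:
 *   dom0_mem=512M dom0_max_vcpus=2
 * dom0_max_vcpus is parsed by the integer_param() above; dom0_mem is
 * parsed by parse_dom0_mem()/custom_param() near the end of this file,
 * and the result is validated and clamped by calc_dom0_size().
 */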
62 extern char dom0_command_line[];
64 /* forward declaration */
65 static void init_switch_stack(struct vcpu *v);
67 /* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
68 This is a Xen virtual address. */
69 DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
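/* Address of vpsr.ic of the current vcpu, i.e. the word at
   d->arch.shared_info_va + XSI_PSR_IC_OFS.  Like current_psr_i_addr,
   this is a Xen virtual address; both are set up in schedule_tail()
   and context_switch() below. */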
70 DEFINE_PER_CPU(int *, current_psr_ic_addr);
72 DEFINE_PER_CPU(struct vcpu *, fp_owner);
74 #include <xen/sched-if.h>
76 static void
77 ia64_disable_vhpt_walker(void)
78 {
79 // Disable the VHPT walker: ia64_new_rr7() might otherwise cause a
80 // VHPT fault because it flushes dtr[IA64_TR_VHPT].
81 // (VHPT_SIZE_LOG2 << 2) sets a legal pta.size while leaving pta.ve
82 // clear, just to avoid a Reserved Register/Field fault.
83 ia64_set_pta(VHPT_SIZE_LOG2 << 2);
84 }
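/*
 * flush_vtlb_for_context_switch(): decide whether stale translations for
 * 'next' may still be present on this physical cpu.  A flush is considered
 * when either another vcpu of the same domain ran here last, or this vcpu
 * has run on a different physical cpu since it last ran here.  The
 * tlbflush clock (NEED_FLUSH) then lets us skip the purge if the local
 * VHPT/TLB have already been flushed since that last run.  VT-i domains
 * keep a per-vcpu vTLB and per-vcpu-VHPT domains keep private VHPTs, so
 * for them only the machine TLB may need purging.
 */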
86 static void flush_vtlb_for_context_switch(struct vcpu* prev, struct vcpu* next)
87 {
88 int cpu = smp_processor_id();
89 int last_vcpu_id, last_processor;
91 if (!is_idle_domain(prev->domain))
92 tlbflush_update_time
93 (&prev->domain->arch.last_vcpu[cpu].tlbflush_timestamp,
94 tlbflush_current_time());
96 if (is_idle_domain(next->domain))
97 return;
99 last_vcpu_id = next->domain->arch.last_vcpu[cpu].vcpu_id;
100 last_processor = next->arch.last_processor;
102 next->domain->arch.last_vcpu[cpu].vcpu_id = next->vcpu_id;
103 next->arch.last_processor = cpu;
105 if ((last_vcpu_id != next->vcpu_id &&
106 last_vcpu_id != INVALID_VCPU_ID) ||
107 (last_vcpu_id == next->vcpu_id &&
108 last_processor != cpu &&
109 last_processor != INVALID_PROCESSOR)) {
110 #ifdef CONFIG_XEN_IA64_TLBFLUSH_CLOCK
111 u32 last_tlbflush_timestamp =
112 next->domain->arch.last_vcpu[cpu].tlbflush_timestamp;
113 #endif
114 int vhpt_is_flushed = 0;
116 // If the vTLB implementation is changed,
117 // the following must be updated as well.
118 if (VMX_DOMAIN(next)) {
119 // Currently the vTLB for a VT-i domain is per vcpu,
120 // so no flushing is needed here.
121 } else if (HAS_PERVCPU_VHPT(next->domain)) {
122 // nothing to do
123 } else {
124 if (NEED_FLUSH(__get_cpu_var(vhpt_tlbflush_timestamp),
125 last_tlbflush_timestamp)) {
126 local_vhpt_flush();
127 vhpt_is_flushed = 1;
128 }
129 }
130 if (vhpt_is_flushed || NEED_FLUSH(__get_cpu_var(tlbflush_time),
131 last_tlbflush_timestamp)) {
132 local_flush_tlb_all();
133 perfc_incr(tlbflush_clock_cswitch_purge);
134 } else {
135 perfc_incr(tlbflush_clock_cswitch_skip);
136 }
137 perfc_incr(flush_vtlb_for_context_switch);
138 }
139 }
141 static void flush_cache_for_context_switch(struct vcpu *next)
142 {
143 extern cpumask_t cpu_cache_coherent_map;
144 int cpu = smp_processor_id();
146 if (is_idle_vcpu(next) ||
147 __test_and_clear_bit(cpu, &next->arch.cache_coherent_map)) {
148 if (cpu_test_and_clear(cpu, cpu_cache_coherent_map)) {
149 unsigned long flags;
150 u64 progress = 0;
151 s64 status;
153 local_irq_save(flags);
154 status = ia64_pal_cache_flush(4, 0, &progress, NULL);
155 local_irq_restore(flags);
156 if (status != 0)
157 panic_domain(NULL, "PAL_CACHE_FLUSH ERROR, "
158 "cache_type=4 status %lx", status);
159 }
160 }
161 }
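/*
 * lazy_fp_switch(): eager save, lazy restore of the high floating-point
 * partition (f32-f127).  On the way out, if 'prev' actually modified fph
 * (psr.mfh set), save it and record this cpu in fp_owner.  On the way in,
 * 'next' only gets psr.dfh set, so its first fph access traps and the
 * registers can be restored lazily at that point.  The cmpxchg on
 * fp_owner in context_switch() below drops stale ownership when a vcpu
 * resumes on a different physical cpu.
 */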
163 static void lazy_fp_switch(struct vcpu *prev, struct vcpu *next)
164 {
165 /*
166 * Implement eager save, lazy restore
167 */
168 if (!is_idle_vcpu(prev)) {
169 if (VMX_DOMAIN(prev)) {
170 if (FP_PSR(prev) & IA64_PSR_MFH) {
171 __ia64_save_fpu(prev->arch._thread.fph);
172 __ia64_per_cpu_var(fp_owner) = prev;
173 }
174 } else {
175 if (PSCB(prev, hpsr_mfh)) {
176 __ia64_save_fpu(prev->arch._thread.fph);
177 __ia64_per_cpu_var(fp_owner) = prev;
178 }
179 }
180 }
182 if (!is_idle_vcpu(next)) {
183 if (VMX_DOMAIN(next)) {
184 FP_PSR(next) = IA64_PSR_DFH;
185 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
186 } else {
187 PSCB(next, hpsr_dfh) = 1;
188 PSCB(next, hpsr_mfh) = 0;
189 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
190 }
191 }
192 }
194 void schedule_tail(struct vcpu *prev)
195 {
196 extern char ia64_ivt;
198 context_saved(prev);
199 ia64_disable_vhpt_walker();
201 if (VMX_DOMAIN(current)) {
202 vmx_do_launch(current);
203 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
204 current->processor);
205 } else {
206 ia64_set_iva(&ia64_ivt);
207 load_region_regs(current);
208 ia64_set_pta(vcpu_pta(current));
209 vcpu_load_kernel_regs(current);
210 __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
211 shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
212 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
213 (current->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
214 migrate_timer(&current->arch.hlt_timer, current->processor);
215 }
216 flush_vtlb_for_context_switch(prev, current);
217 }
219 void context_switch(struct vcpu *prev, struct vcpu *next)
220 {
221 uint64_t spsr;
223 local_irq_save(spsr);
225 if (VMX_DOMAIN(prev)) {
226 vmx_save_state(prev);
227 if (!VMX_DOMAIN(next)) {
228 /* VMX domains can change the physical cr.dcr.
229 * Restore default to prevent leakage. */
230 ia64_setreg(_IA64_REG_CR_DCR, IA64_DEFAULT_DCR_BITS);
231 }
232 }
233 if (VMX_DOMAIN(next))
234 vmx_load_state(next);
236 ia64_disable_vhpt_walker();
237 lazy_fp_switch(prev, current);
239 if (prev->arch.dbg_used || next->arch.dbg_used) {
240 /*
241 * Load debug registers either because they are valid or to clear
242 * the previous one.
243 */
244 ia64_load_debug_regs(next->arch.dbr);
245 }
247 prev = ia64_switch_to(next);
249 /* Note: ia64_switch_to does not return here at vcpu initialization. */
251 if (VMX_DOMAIN(current)) {
252 vmx_load_all_rr(current);
253 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
254 current->processor);
255 } else {
256 struct domain *nd;
257 extern char ia64_ivt;
259 ia64_set_iva(&ia64_ivt);
261 nd = current->domain;
262 if (!is_idle_domain(nd)) {
263 load_region_regs(current);
264 ia64_set_pta(vcpu_pta(current));
265 vcpu_load_kernel_regs(current);
266 if (vcpu_pkr_in_use(current))
267 vcpu_pkr_load_regs(current);
268 vcpu_set_next_timer(current);
269 if (vcpu_timer_expired(current))
270 vcpu_pend_timer(current);
271 __ia64_per_cpu_var(current_psr_i_addr) = &nd->shared_info->
272 vcpu_info[current->vcpu_id].evtchn_upcall_mask;
273 __ia64_per_cpu_var(current_psr_ic_addr) =
274 (int *)(nd->arch.shared_info_va + XSI_PSR_IC_OFS);
275 /* steal time accounting */
276 if (!guest_handle_is_null(runstate_guest(current)))
277 __copy_to_guest(runstate_guest(current), &current->runstate, 1);
278 } else {
279 /* When switching to the idle domain we only need to disable the
280 * VHPT walker. All accesses made within the idle context will
281 * then be handled by TR mappings and the identity mapping.
282 */
283 __ia64_per_cpu_var(current_psr_i_addr) = NULL;
284 __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
285 }
286 }
287 local_irq_restore(spsr);
289 /* lazy fp */
290 if (current->processor != current->arch.last_processor) {
291 unsigned long *addr;
292 addr = (unsigned long *)per_cpu_addr(fp_owner,
293 current->arch.last_processor);
294 ia64_cmpxchg(acq, addr, current, 0, 8);
295 }
297 flush_vtlb_for_context_switch(prev, current);
298 flush_cache_for_context_switch(current);
299 context_saved(prev);
300 }
302 void continue_running(struct vcpu *same)
303 {
304 /* nothing to do */
305 }
307 #ifdef CONFIG_PERFMON
308 static int pal_halt = 1;
309 static int can_do_pal_halt = 1;
311 static int __init nohalt_setup(char * str)
312 {
313 pal_halt = can_do_pal_halt = 0;
314 return 1;
315 }
316 __setup("nohalt", nohalt_setup);
318 void
319 update_pal_halt_status(int status)
320 {
321 can_do_pal_halt = pal_halt && status;
322 }
323 #else
324 #define can_do_pal_halt (1)
325 #endif
327 static void default_idle(void)
328 {
329 local_irq_disable();
330 if ( !softirq_pending(smp_processor_id()) ) {
331 if (can_do_pal_halt)
332 safe_halt();
333 else
334 cpu_relax();
335 }
336 local_irq_enable();
337 }
339 static void continue_cpu_idle_loop(void)
340 {
341 for ( ; ; )
342 {
343 #ifdef IA64
344 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
345 #else
346 irq_stat[cpu].idle_timestamp = jiffies;
347 #endif
348 page_scrub_schedule_work();
349 while ( !softirq_pending(smp_processor_id()) )
350 default_idle();
351 raise_softirq(SCHEDULE_SOFTIRQ);
352 do_softirq();
353 }
354 }
356 void startup_cpu_idle_loop(void)
357 {
358 /* Just some sanity to ensure that the scheduler is set up okay. */
359 ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
360 raise_softirq(SCHEDULE_SOFTIRQ);
362 continue_cpu_idle_loop();
363 }
365 /* compile time test for get_order(sizeof(mapped_regs_t)) !=
366 * get_order_from_shift(XMAPPEDREGS_SHIFT))
367 */
368 #if !(((1 << (XMAPPEDREGS_SHIFT - 1)) < MAPPED_REGS_T_SIZE) && \
369 (MAPPED_REGS_T_SIZE < (1 << (XMAPPEDREGS_SHIFT + 1))))
370 # error "XMAPPEDREGS_SHIFT doesn't match sizeof(mapped_regs_t)."
371 #endif
373 void hlt_timer_fn(void *data)
374 {
375 struct vcpu *v = data;
376 vcpu_unblock(v);
377 }
379 void relinquish_vcpu_resources(struct vcpu *v)
380 {
381 if (HAS_PERVCPU_VHPT(v->domain))
382 pervcpu_vhpt_free(v);
383 if (v->arch.privregs != NULL) {
384 free_xenheap_pages(v->arch.privregs,
385 get_order_from_shift(XMAPPEDREGS_SHIFT));
386 v->arch.privregs = NULL;
387 }
388 kill_timer(&v->arch.hlt_timer);
389 }
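/*
 * A vcpu, its thread_info and its kernel stack / register backing store
 * all live in one xenheap allocation of KERNEL_STACK_SIZE_ORDER pages,
 * mirroring the Linux/ia64 task layout (see the IA64_RBS_OFFSET use in
 * init_switch_stack() below).  alloc_vcpu_struct() hands out that block;
 * idle vcpu0 stays statically allocated.
 */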
391 struct vcpu *alloc_vcpu_struct(void)
392 {
393 struct vcpu *v;
394 struct thread_info *ti;
395 static int first_allocation = 1;
397 if (first_allocation) {
398 first_allocation = 0;
399 /* Keep idle vcpu0 statically allocated at compile time, because
400 * some code inherited from Linux still requires it early on.
401 */
402 return idle_vcpu[0];
403 }
405 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
406 return NULL;
407 memset(v, 0, sizeof(*v));
409 ti = alloc_thread_info(v);
410 /* Clear thread_info to clear some important fields, like
411 * preempt_count
412 */
413 memset(ti, 0, sizeof(struct thread_info));
414 init_switch_stack(v);
416 return v;
417 }
419 void free_vcpu_struct(struct vcpu *v)
420 {
421 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
422 }
424 int vcpu_initialise(struct vcpu *v)
425 {
426 struct domain *d = v->domain;
428 if (!is_idle_domain(d)) {
429 v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
430 v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
431 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
432 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;
434 /* Is this correct?
435 It depends on how the domain uses rids.
437 A domain may share rids among its processors (e.g. when using a
438 global VHPT). In this case we should also share rids among
439 vcpus, and the rid ranges should be the same.
441 However, a domain may have per-cpu rid allocation. In that
442 case we do not want to share rids among vcpus, but we may
443 do it if two vcpus are on the same cpu... */
445 v->arch.starting_rid = d->arch.starting_rid;
446 v->arch.ending_rid = d->arch.ending_rid;
447 v->arch.breakimm = d->arch.breakimm;
448 v->arch.last_processor = INVALID_PROCESSOR;
449 v->arch.vhpt_pg_shift = PAGE_SHIFT;
450 }
452 if (!VMX_DOMAIN(v))
453 init_timer(&v->arch.hlt_timer, hlt_timer_fn, v,
454 first_cpu(cpu_online_map));
456 return 0;
457 }
459 void vcpu_share_privregs_with_guest(struct vcpu *v)
460 {
461 struct domain *d = v->domain;
462 int i, order = get_order_from_shift(XMAPPEDREGS_SHIFT);
464 for (i = 0; i < (1 << order); i++)
465 share_xen_page_with_guest(virt_to_page(v->arch.privregs) + i,
466 d, XENSHARE_writable);
467 /*
468 * XXX IA64_XMAPPEDREGS_PADDR
469 * Assign these pages into the guest pseudo-physical address
470 * space so that dom0 can map them by gmfn.
471 * This is necessary for domain save, restore and dump-core.
472 */
473 for (i = 0; i < XMAPPEDREGS_SIZE; i += PAGE_SIZE)
474 assign_domain_page(d, IA64_XMAPPEDREGS_PADDR(v->vcpu_id) + i,
475 virt_to_maddr(v->arch.privregs + i));
476 }
478 int vcpu_late_initialise(struct vcpu *v)
479 {
480 struct domain *d = v->domain;
481 int rc, order;
483 if (HAS_PERVCPU_VHPT(d)) {
484 rc = pervcpu_vhpt_alloc(v);
485 if (rc != 0)
486 return rc;
487 }
489 /* Create privregs page. */
490 order = get_order_from_shift(XMAPPEDREGS_SHIFT);
491 v->arch.privregs = alloc_xenheap_pages(order);
492 BUG_ON(v->arch.privregs == NULL);
493 memset(v->arch.privregs, 0, 1 << XMAPPEDREGS_SHIFT);
494 vcpu_share_privregs_with_guest(v);
496 return 0;
497 }
499 void vcpu_destroy(struct vcpu *v)
500 {
501 if (v->domain->arch.is_vti)
502 vmx_relinquish_vcpu_resources(v);
503 else
504 relinquish_vcpu_resources(v);
505 }
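/*
 * init_switch_stack() fabricates the switch_stack/pt_regs frames at the
 * top of the vcpu's kernel stack so that the first ia64_switch_to() to
 * this vcpu "returns" into ia64_ret_from_clone (sw->b0), with an empty
 * register backing store starting at v + IA64_RBS_OFFSET.
 */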
507 static void init_switch_stack(struct vcpu *v)
508 {
509 struct pt_regs *regs = vcpu_regs (v);
510 struct switch_stack *sw = (struct switch_stack *) regs - 1;
511 extern void ia64_ret_from_clone;
513 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
514 sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
515 sw->b0 = (unsigned long) &ia64_ret_from_clone;
516 sw->ar_fpsr = FPSR_DEFAULT;
517 v->arch._thread.ksp = (unsigned long) sw - 16;
518 // Stay on the kernel stack, because we may get interrupts!
519 // ia64_ret_from_clone switches to the user stack.
520 v->arch._thread.on_ustack = 0;
521 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
522 }
524 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
525 static int opt_pervcpu_vhpt = 1;
526 integer_param("pervcpu_vhpt", opt_pervcpu_vhpt);
527 #endif
529 int arch_domain_create(struct domain *d)
530 {
531 int i;
533 // the following will eventually need to be negotiated dynamically
534 d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
535 d->arch.breakimm = 0x1000;
536 for (i = 0; i < NR_CPUS; i++) {
537 d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
538 }
540 if (is_idle_domain(d))
541 return 0;
543 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
544 d->arch.has_pervcpu_vhpt = opt_pervcpu_vhpt;
545 dprintk(XENLOG_INFO, "%s:%d domain %d pervcpu_vhpt %d\n",
546 __func__, __LINE__, d->domain_id, d->arch.has_pervcpu_vhpt);
547 #endif
548 if (tlb_track_create(d) < 0)
549 goto fail_nomem1;
550 d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
551 if (d->shared_info == NULL)
552 goto fail_nomem;
553 memset(d->shared_info, 0, XSI_SIZE);
554 for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
555 share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
556 d, XENSHARE_writable);
558 /* We may also need an emulation rid for region4, though it is
559 * unlikely that a guest will issue uncacheable accesses in
560 * metaphysical mode. But keeping that info here seems saner.
561 */
562 if (!allocate_rid_range(d,0))
563 goto fail_nomem;
565 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
566 d->arch.mm_teardown_offset = 0;
568 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
569 goto fail_nomem;
571 /*
572 * grant_table_create() can't fully initialize the grant table for this
573 * domain because it is called before arch_domain_create().
574 * Here we complete the part of the initialization that needs the p2m table.
575 */
576 spin_lock(&d->grant_table->lock);
577 for (i = 0; i < nr_grant_frames(d->grant_table); i++)
578 ia64_gnttab_create_shared_page(d, d->grant_table, i);
579 spin_unlock(&d->grant_table->lock);
581 d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
582 RANGESETF_prettyprint_hex);
584 dprintk(XENLOG_DEBUG, "arch_domain_create: domain=%p\n", d);
585 return 0;
587 fail_nomem:
588 tlb_track_destroy(d);
589 fail_nomem1:
590 if (d->arch.mm.pgd != NULL)
591 pgd_free(d->arch.mm.pgd);
592 if (d->shared_info != NULL)
593 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
594 return -ENOMEM;
595 }
597 void arch_domain_destroy(struct domain *d)
598 {
599 mm_final_teardown(d);
601 if (d->shared_info != NULL)
602 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
604 tlb_track_destroy(d);
606 /* Clear vTLB for the next domain. */
607 domain_flush_tlb_vhpt(d);
609 deallocate_rid_range(d);
610 }
612 int arch_vcpu_reset(struct vcpu *v)
613 {
614 /* FIXME: Stub for now */
615 return 0;
616 }
618 #define COPY_FPREG(dst, src) memcpy(dst, src, sizeof(struct ia64_fpreg))
620 void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
621 {
622 int i;
623 struct vcpu_tr_regs *tr = &c.nat->regs.tr;
624 struct cpu_user_regs *uregs = vcpu_regs(v);
625 int is_hvm = VMX_DOMAIN(v);
626 unsigned int rbs_size;
628 c.nat->regs.b[6] = uregs->b6;
629 c.nat->regs.b[7] = uregs->b7;
631 c.nat->regs.ar.csd = uregs->ar_csd;
632 c.nat->regs.ar.ssd = uregs->ar_ssd;
634 c.nat->regs.r[8] = uregs->r8;
635 c.nat->regs.r[9] = uregs->r9;
636 c.nat->regs.r[10] = uregs->r10;
637 c.nat->regs.r[11] = uregs->r11;
639 if (is_hvm)
640 c.nat->regs.psr = vmx_vcpu_get_psr(v);
641 else
642 c.nat->regs.psr = vcpu_get_psr(v);
644 c.nat->regs.ip = uregs->cr_iip;
645 c.nat->regs.cfm = uregs->cr_ifs;
647 c.nat->regs.ar.unat = uregs->ar_unat;
648 c.nat->regs.ar.pfs = uregs->ar_pfs;
649 c.nat->regs.ar.rsc = uregs->ar_rsc;
650 c.nat->regs.ar.rnat = uregs->ar_rnat;
651 c.nat->regs.ar.bspstore = uregs->ar_bspstore;
653 c.nat->regs.pr = uregs->pr;
654 c.nat->regs.b[0] = uregs->b0;
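/* pt_regs->loadrs holds the ar.rsc.loadrs-formatted size of the guest's
   dirty register-backing-store partition, i.e. the byte count shifted
   left by 16, so the guest-visible ar.bsp is ar.bspstore + (loadrs >> 16).
   The dirty rbs itself is copied out further below from
   (char *)v + IA64_RBS_OFFSET. */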
655 rbs_size = uregs->loadrs >> 16;
656 c.nat->regs.ar.bsp = uregs->ar_bspstore + rbs_size;
658 c.nat->regs.r[1] = uregs->r1;
659 c.nat->regs.r[12] = uregs->r12;
660 c.nat->regs.r[13] = uregs->r13;
661 c.nat->regs.ar.fpsr = uregs->ar_fpsr;
662 c.nat->regs.r[15] = uregs->r15;
664 c.nat->regs.r[14] = uregs->r14;
665 c.nat->regs.r[2] = uregs->r2;
666 c.nat->regs.r[3] = uregs->r3;
667 c.nat->regs.r[16] = uregs->r16;
668 c.nat->regs.r[17] = uregs->r17;
669 c.nat->regs.r[18] = uregs->r18;
670 c.nat->regs.r[19] = uregs->r19;
671 c.nat->regs.r[20] = uregs->r20;
672 c.nat->regs.r[21] = uregs->r21;
673 c.nat->regs.r[22] = uregs->r22;
674 c.nat->regs.r[23] = uregs->r23;
675 c.nat->regs.r[24] = uregs->r24;
676 c.nat->regs.r[25] = uregs->r25;
677 c.nat->regs.r[26] = uregs->r26;
678 c.nat->regs.r[27] = uregs->r27;
679 c.nat->regs.r[28] = uregs->r28;
680 c.nat->regs.r[29] = uregs->r29;
681 c.nat->regs.r[30] = uregs->r30;
682 c.nat->regs.r[31] = uregs->r31;
684 c.nat->regs.ar.ccv = uregs->ar_ccv;
686 COPY_FPREG(&c.nat->regs.f[6], &uregs->f6);
687 COPY_FPREG(&c.nat->regs.f[7], &uregs->f7);
688 COPY_FPREG(&c.nat->regs.f[8], &uregs->f8);
689 COPY_FPREG(&c.nat->regs.f[9], &uregs->f9);
690 COPY_FPREG(&c.nat->regs.f[10], &uregs->f10);
691 COPY_FPREG(&c.nat->regs.f[11], &uregs->f11);
693 c.nat->regs.r[4] = uregs->r4;
694 c.nat->regs.r[5] = uregs->r5;
695 c.nat->regs.r[6] = uregs->r6;
696 c.nat->regs.r[7] = uregs->r7;
698 /* FIXME: to be reordered. */
699 c.nat->regs.nats = uregs->eml_unat;
701 c.nat->regs.rbs_voff = (IA64_RBS_OFFSET / 8) % 64;
702 if (rbs_size < sizeof (c.nat->regs.rbs))
703 memcpy(c.nat->regs.rbs, (char *)v + IA64_RBS_OFFSET, rbs_size);
705 c.nat->privregs_pfn = get_gpfn_from_mfn
706 (virt_to_maddr(v->arch.privregs) >> PAGE_SHIFT);
708 for (i = 0; i < IA64_NUM_DBG_REGS; i++) {
709 vcpu_get_dbr(v, i, &c.nat->regs.dbr[i]);
710 vcpu_get_ibr(v, i, &c.nat->regs.ibr[i]);
711 }
713 for (i = 0; i < 7; i++)
714 vcpu_get_rr(v, (unsigned long)i << 61, &c.nat->regs.rr[i]);
716 /* Fill extra regs. */
717 for (i = 0; i < 8; i++) {
718 tr->itrs[i].pte = v->arch.itrs[i].pte.val;
719 tr->itrs[i].itir = v->arch.itrs[i].itir;
720 tr->itrs[i].vadr = v->arch.itrs[i].vadr;
721 tr->itrs[i].rid = v->arch.itrs[i].rid;
722 }
723 for (i = 0; i < 8; i++) {
724 tr->dtrs[i].pte = v->arch.dtrs[i].pte.val;
725 tr->dtrs[i].itir = v->arch.dtrs[i].itir;
726 tr->dtrs[i].vadr = v->arch.dtrs[i].vadr;
727 tr->dtrs[i].rid = v->arch.dtrs[i].rid;
728 }
729 c.nat->event_callback_ip = v->arch.event_callback_ip;
731 /* If PV and privregs is not set, we can't read mapped registers. */
732 if (!v->domain->arch.is_vti && v->arch.privregs == NULL)
733 return;
735 vcpu_get_dcr (v, &c.nat->regs.cr.dcr);
736 vcpu_get_iva (v, &c.nat->regs.cr.iva);
737 }
739 int arch_set_info_guest(struct vcpu *v, vcpu_guest_context_u c)
740 {
741 struct cpu_user_regs *uregs = vcpu_regs(v);
742 struct domain *d = v->domain;
743 int was_initialised = v->is_initialised;
744 unsigned int rbs_size;
745 int rc, i;
747 /* Finish vcpu initialization. */
748 if (!was_initialised) {
749 if (d->arch.is_vti)
750 rc = vmx_final_setup_guest(v);
751 else
752 rc = vcpu_late_initialise(v);
753 if (rc != 0)
754 return rc;
756 vcpu_init_regs(v);
758 v->is_initialised = 1;
759 /* Auto-online VCPU0 when it is initialised. */
760 if (v->vcpu_id == 0)
761 clear_bit(_VPF_down, &v->pause_flags);
762 }
764 if (c.nat == NULL)
765 return 0;
767 uregs->b6 = c.nat->regs.b[6];
768 uregs->b7 = c.nat->regs.b[7];
770 uregs->ar_csd = c.nat->regs.ar.csd;
771 uregs->ar_ssd = c.nat->regs.ar.ssd;
773 uregs->r8 = c.nat->regs.r[8];
774 uregs->r9 = c.nat->regs.r[9];
775 uregs->r10 = c.nat->regs.r[10];
776 uregs->r11 = c.nat->regs.r[11];
778 if (!d->arch.is_vti)
779 vcpu_set_psr(v, c.nat->regs.psr);
780 else
781 vmx_vcpu_set_psr(v, c.nat->regs.psr);
782 uregs->cr_iip = c.nat->regs.ip;
783 uregs->cr_ifs = c.nat->regs.cfm;
785 uregs->ar_unat = c.nat->regs.ar.unat;
786 uregs->ar_pfs = c.nat->regs.ar.pfs;
787 uregs->ar_rsc = c.nat->regs.ar.rsc;
788 uregs->ar_rnat = c.nat->regs.ar.rnat;
789 uregs->ar_bspstore = c.nat->regs.ar.bspstore;
791 uregs->pr = c.nat->regs.pr;
792 uregs->b0 = c.nat->regs.b[0];
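/* Only accept the register-backing-store image if its size
   (ar.bsp - ar.bspstore) matches the loadrs value in effect; an
   inconsistent size supplied by the caller is silently ignored (see the
   "crazy user code" note below). */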
793 rbs_size = c.nat->regs.ar.bsp - c.nat->regs.ar.bspstore;
794 /* Protection against crazy user code. */
795 if (!was_initialised)
796 uregs->loadrs = (rbs_size) << 16;
797 if (rbs_size == (uregs->loadrs >> 16))
798 memcpy((char *)v + IA64_RBS_OFFSET, c.nat->regs.rbs, rbs_size);
800 uregs->r1 = c.nat->regs.r[1];
801 uregs->r12 = c.nat->regs.r[12];
802 uregs->r13 = c.nat->regs.r[13];
803 uregs->ar_fpsr = c.nat->regs.ar.fpsr;
804 uregs->r15 = c.nat->regs.r[15];
806 uregs->r14 = c.nat->regs.r[14];
807 uregs->r2 = c.nat->regs.r[2];
808 uregs->r3 = c.nat->regs.r[3];
809 uregs->r16 = c.nat->regs.r[16];
810 uregs->r17 = c.nat->regs.r[17];
811 uregs->r18 = c.nat->regs.r[18];
812 uregs->r19 = c.nat->regs.r[19];
813 uregs->r20 = c.nat->regs.r[20];
814 uregs->r21 = c.nat->regs.r[21];
815 uregs->r22 = c.nat->regs.r[22];
816 uregs->r23 = c.nat->regs.r[23];
817 uregs->r24 = c.nat->regs.r[24];
818 uregs->r25 = c.nat->regs.r[25];
819 uregs->r26 = c.nat->regs.r[26];
820 uregs->r27 = c.nat->regs.r[27];
821 uregs->r28 = c.nat->regs.r[28];
822 uregs->r29 = c.nat->regs.r[29];
823 uregs->r30 = c.nat->regs.r[30];
824 uregs->r31 = c.nat->regs.r[31];
826 uregs->ar_ccv = c.nat->regs.ar.ccv;
828 COPY_FPREG(&uregs->f6, &c.nat->regs.f[6]);
829 COPY_FPREG(&uregs->f7, &c.nat->regs.f[7]);
830 COPY_FPREG(&uregs->f8, &c.nat->regs.f[8]);
831 COPY_FPREG(&uregs->f9, &c.nat->regs.f[9]);
832 COPY_FPREG(&uregs->f10, &c.nat->regs.f[10]);
833 COPY_FPREG(&uregs->f11, &c.nat->regs.f[11]);
835 uregs->r4 = c.nat->regs.r[4];
836 uregs->r5 = c.nat->regs.r[5];
837 uregs->r6 = c.nat->regs.r[6];
838 uregs->r7 = c.nat->regs.r[7];
840 /* FIXME: to be reordered and restored. */
841 /* uregs->eml_unat = c.nat->regs.nat; */
842 uregs->eml_unat = 0;
844 if (!d->arch.is_vti) {
845 /* domain runs at PL2/3 */
846 uregs->cr_ipsr = vcpu_pl_adjust(uregs->cr_ipsr,
847 IA64_PSR_CPL0_BIT);
848 uregs->ar_rsc = vcpu_pl_adjust(uregs->ar_rsc, 2);
849 }
851 for (i = 0; i < IA64_NUM_DBG_REGS; i++) {
852 vcpu_set_dbr(v, i, c.nat->regs.dbr[i]);
853 vcpu_set_ibr(v, i, c.nat->regs.ibr[i]);
854 }
856 if (c.nat->flags & VGCF_EXTRA_REGS) {
857 struct vcpu_tr_regs *tr = &c.nat->regs.tr;
859 for (i = 0; i < 8; i++) {
860 vcpu_set_itr(v, i, tr->itrs[i].pte,
861 tr->itrs[i].itir,
862 tr->itrs[i].vadr,
863 tr->itrs[i].rid);
864 }
865 for (i = 0; i < 8; i++) {
866 vcpu_set_dtr(v, i,
867 tr->dtrs[i].pte,
868 tr->dtrs[i].itir,
869 tr->dtrs[i].vadr,
870 tr->dtrs[i].rid);
871 }
872 v->arch.event_callback_ip = c.nat->event_callback_ip;
873 v->arch.iva = c.nat->regs.cr.iva;
874 }
876 return 0;
877 }
879 static void relinquish_memory(struct domain *d, struct list_head *list)
880 {
881 struct list_head *ent;
882 struct page_info *page;
883 #ifndef __ia64__
884 unsigned long x, y;
885 #endif
887 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
888 spin_lock_recursive(&d->page_alloc_lock);
889 ent = list->next;
890 while ( ent != list )
891 {
892 page = list_entry(ent, struct page_info, list);
893 /* Grab a reference to the page so it won't disappear from under us. */
894 if ( unlikely(!get_page(page, d)) )
895 {
896 /* Couldn't get a reference -- someone is freeing this page. */
897 ent = ent->next;
898 continue;
899 }
901 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
902 put_page_and_type(page);
904 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
905 put_page(page);
907 #ifndef __ia64__
908 /*
909 * Forcibly invalidate base page tables at this point to break circular
910 * 'linear page table' references. This is okay because MMU structures
911 * are not shared across domains and this domain is now dead. Thus base
912 * tables are not in use so a non-zero count means circular reference.
913 */
914 y = page->u.inuse.type_info;
915 for ( ; ; )
916 {
917 x = y;
918 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
919 (PGT_base_page_table|PGT_validated)) )
920 break;
922 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
923 if ( likely(y == x) )
924 {
925 free_page_type(page, PGT_base_page_table);
926 break;
927 }
928 }
929 #endif
931 /* Follow the list chain and /then/ potentially free the page. */
932 ent = ent->next;
933 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
934 put_page(page);
935 }
937 spin_unlock_recursive(&d->page_alloc_lock);
938 }
940 int domain_relinquish_resources(struct domain *d)
941 {
942 int ret;
943 /* Relinquish guest resources for VT-i domain. */
944 if (d->vcpu[0] && VMX_DOMAIN(d->vcpu[0]))
945 vmx_relinquish_guest_resources(d);
947 /* Tear down shadow mode stuff. */
948 ret = mm_teardown(d);
949 if (ret != 0)
950 return ret;
952 /* Relinquish every page of memory. */
953 relinquish_memory(d, &d->xenpage_list);
954 relinquish_memory(d, &d->page_list);
956 if (d->arch.is_vti && d->arch.sal_data)
957 xfree(d->arch.sal_data);
959 /* Free page used by xen oprofile buffer */
960 free_xenoprof_pages(d);
962 return 0;
963 }
965 unsigned long
966 domain_set_shared_info_va (unsigned long va)
967 {
968 struct vcpu *v = current;
969 struct domain *d = v->domain;
971 /* Check virtual address:
972 must belong to region 7,
973 must be 64Kb aligned,
974 must not be within Xen virtual space. */
975 if ((va >> 61) != 7
976 || (va & 0xffffUL) != 0
977 || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
978 panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);
980 /* Note: this doesn't work well if other cpus are already running.
981 However this is part of the spec :-) */
982 gdprintk(XENLOG_DEBUG, "Domain set shared_info_va to 0x%016lx\n", va);
983 d->arch.shared_info_va = va;
985 VCPU(v, interrupt_mask_addr) = (unsigned char *)va +
986 INT_ENABLE_OFFSET(v);
988 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)(va + XSI_PSR_IC_OFS);
990 /* Remap the shared pages. */
991 set_one_rr (7UL << 61, PSCB(v,rrs[7]));
993 return 0;
994 }
996 /* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
997 #define SHADOW_COPY_CHUNK 1024
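/*
 * Log-dirty ("shadow") support: d->arch.shadow_bitmap holds one bit per
 * guest page frame, indexed by gpfn, covering memory up to convmem_end.
 * ENABLE_LOGDIRTY allocates and zeroes the bitmap and flushes the vTLB so
 * that dirty-bit tracking takes effect; CLEAN copies the bitmap out to
 * the caller in SHADOW_COPY_CHUNK-byte pieces and clears it; PEEK copies
 * it out without clearing; OFF frees the bitmap again.
 */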
999 int shadow_mode_control(struct domain *d, xen_domctl_shadow_op_t *sc)
1000 {
1001 unsigned int op = sc->op;
1002 int rc = 0;
1003 int i;
1004 //struct vcpu *v;
1006 if (unlikely(d == current->domain)) {
1007 gdprintk(XENLOG_INFO,
1008 "Don't try to do a shadow op on yourself!\n");
1009 return -EINVAL;
1010 }
1012 domain_pause(d);
1014 switch (op)
1015 {
1016 case XEN_DOMCTL_SHADOW_OP_OFF:
1017 if (shadow_mode_enabled (d)) {
1018 u64 *bm = d->arch.shadow_bitmap;
1020 /* Flush vhpt and tlb to restore dirty bit usage. */
1021 domain_flush_tlb_vhpt(d);
1023 /* Free bitmap. */
1024 d->arch.shadow_bitmap_size = 0;
1025 d->arch.shadow_bitmap = NULL;
1026 xfree(bm);
1027 }
1028 break;
1030 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
1031 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
1032 rc = -EINVAL;
1033 break;
1035 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
1036 if (shadow_mode_enabled(d)) {
1037 rc = -EINVAL;
1038 break;
1039 }
1041 atomic64_set(&d->arch.shadow_fault_count, 0);
1042 atomic64_set(&d->arch.shadow_dirty_count, 0);
1044 d->arch.shadow_bitmap_size =
1045 ((d->arch.convmem_end >> PAGE_SHIFT) +
1046 BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
1047 d->arch.shadow_bitmap = xmalloc_array(unsigned long,
1048 d->arch.shadow_bitmap_size / BITS_PER_LONG);
1049 if (d->arch.shadow_bitmap == NULL) {
1050 d->arch.shadow_bitmap_size = 0;
1051 rc = -ENOMEM;
1052 }
1053 else {
1054 memset(d->arch.shadow_bitmap, 0,
1055 d->arch.shadow_bitmap_size / 8);
1057 /* Flush vhpt and tlb to enable dirty bit
1058 virtualization. */
1059 domain_flush_tlb_vhpt(d);
1060 }
1061 break;
1063 case XEN_DOMCTL_SHADOW_OP_CLEAN:
1064 {
1065 int nbr_bytes;
1067 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
1068 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
1070 atomic64_set(&d->arch.shadow_fault_count, 0);
1071 atomic64_set(&d->arch.shadow_dirty_count, 0);
1073 if (guest_handle_is_null(sc->dirty_bitmap) ||
1074 (d->arch.shadow_bitmap == NULL)) {
1075 rc = -EINVAL;
1076 break;
1077 }
1079 if (sc->pages > d->arch.shadow_bitmap_size)
1080 sc->pages = d->arch.shadow_bitmap_size;
1082 nbr_bytes = (sc->pages + 7) / 8;
1084 for (i = 0; i < nbr_bytes; i += SHADOW_COPY_CHUNK) {
1085 int size = (nbr_bytes - i) > SHADOW_COPY_CHUNK ?
1086 SHADOW_COPY_CHUNK : nbr_bytes - i;
1088 if (copy_to_guest_offset(
1089 sc->dirty_bitmap, i,
1090 (uint8_t *)d->arch.shadow_bitmap + i,
1091 size)) {
1092 rc = -EFAULT;
1093 break;
1094 }
1096 memset((uint8_t *)d->arch.shadow_bitmap + i, 0, size);
1097 }
1099 break;
1100 }
1102 case XEN_DOMCTL_SHADOW_OP_PEEK:
1103 {
1104 unsigned long size;
1106 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
1107 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
1109 if (guest_handle_is_null(sc->dirty_bitmap) ||
1110 (d->arch.shadow_bitmap == NULL)) {
1111 rc = -EINVAL;
1112 break;
1113 }
1115 if (sc->pages > d->arch.shadow_bitmap_size)
1116 sc->pages = d->arch.shadow_bitmap_size;
1118 size = (sc->pages + 7) / 8;
1119 if (copy_to_guest(sc->dirty_bitmap,
1120 (uint8_t *)d->arch.shadow_bitmap, size)) {
1121 rc = -EFAULT;
1122 break;
1123 }
1124 break;
1125 }
1126 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
1127 sc->mb = 0;
1128 break;
1129 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
1130 if (sc->mb > 0) {
1131 BUG();
1132 rc = -ENOMEM;
1133 }
1134 break;
1135 default:
1136 rc = -EINVAL;
1137 break;
1138 }
1140 domain_unpause(d);
1142 return rc;
1143 }
1145 // remove following line if not privifying in memory
1146 //#define HAVE_PRIVIFY_MEMORY
1147 #ifndef HAVE_PRIVIFY_MEMORY
1148 #define privify_memory(x,y) do {} while(0)
1149 #endif
1151 static void __init loaddomainelfimage(struct domain *d, struct elf_binary *elf,
1152 unsigned long phys_load_offset)
1153 {
1154 const elf_phdr *phdr;
1155 int phnum, h, filesz, memsz;
1156 unsigned long elfaddr, dom_mpaddr, dom_imva;
1157 struct page_info *p;
1159 phnum = elf_uval(elf, elf->ehdr, e_phnum);
1160 for (h = 0; h < phnum; h++) {
1161 phdr = elf_phdr_by_index(elf, h);
1162 if (!elf_phdr_is_loadable(elf, phdr))
1163 continue;
1165 filesz = elf_uval(elf, phdr, p_filesz);
1166 memsz = elf_uval(elf, phdr, p_memsz);
1167 elfaddr = (unsigned long) elf->image + elf_uval(elf, phdr, p_offset);
1168 dom_mpaddr = elf_uval(elf, phdr, p_paddr);
1169 dom_mpaddr += phys_load_offset;
1171 while (memsz > 0) {
1172 p = assign_new_domain_page(d,dom_mpaddr);
1173 BUG_ON (unlikely(p == NULL));
1174 dom_imva = __va_ul(page_to_maddr(p));
1175 if (filesz > 0) {
1176 if (filesz >= PAGE_SIZE)
1177 copy_page((void *) dom_imva,
1178 (void *) elfaddr);
1179 else {
1180 // copy partial page
1181 memcpy((void *) dom_imva,
1182 (void *) elfaddr, filesz);
1183 // zero the rest of page
1184 memset((void *) dom_imva+filesz, 0,
1185 PAGE_SIZE-filesz);
1186 }
1187 //FIXME: This test for code seems to find a lot more than objdump -x does
1188 if (elf_uval(elf, phdr, p_flags) & PF_X) {
1189 privify_memory(dom_imva,PAGE_SIZE);
1190 flush_icache_range(dom_imva,
1191 dom_imva+PAGE_SIZE);
1192 }
1193 }
1194 else if (memsz > 0) {
1195 /* always zero out entire page */
1196 clear_page((void *) dom_imva);
1197 }
1198 memsz -= PAGE_SIZE;
1199 filesz -= PAGE_SIZE;
1200 elfaddr += PAGE_SIZE;
1201 dom_mpaddr += PAGE_SIZE;
1202 }
1203 }
1204 }
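/*
 * Rough arithmetic behind calc_dom0_size() below: the p2m estimate is
 * domheap_pages / PTRS_PER_PTE (one PTE per guest page, PTRS_PER_PTE
 * entries per PTE page), and spare_hv_pages reserves one page per 4096
 * domheap pages, which with the usual 16KB Xen/ia64 page size comes to
 * roughly 1MB per 4GB, matching the "1MB per 4GB" estimate mentioned in
 * the comment there.
 */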
1206 static void __init calc_dom0_size(void)
1207 {
1208 unsigned long domheap_pages;
1209 unsigned long p2m_pages;
1210 unsigned long spare_hv_pages;
1211 unsigned long max_dom0_size;
1213 /* Estimate maximum memory we can safely allocate for dom0
1214 * by subtracting the p2m table allocation and a chunk of memory
1215 * for DMA and PCI mapping from the available domheap pages. The
1216 * chunk for DMA, PCI, etc., is a guestimate, as xen doesn't seem
1217 * to have a good idea of what those requirements might be ahead
1218 * of time, calculated at 1MB per 4GB of system memory */
1219 domheap_pages = avail_domheap_pages();
1220 p2m_pages = domheap_pages / PTRS_PER_PTE;
1221 spare_hv_pages = domheap_pages / 4096;
1222 max_dom0_size = (domheap_pages - (p2m_pages + spare_hv_pages))
1223 * PAGE_SIZE;
1224 printk("Maximum permitted dom0 size: %luMB\n",
1225 max_dom0_size / (1024*1024));
1227 /* validate proposed dom0_size, fix up as needed */
1228 if (dom0_size > max_dom0_size) {
1229 printk("Reducing dom0 memory allocation from %luK to %luK "
1230 "to fit available memory\n",
1231 dom0_size / 1024, max_dom0_size / 1024);
1232 dom0_size = max_dom0_size;
1233 }
1235 /* dom0_mem=0 can be passed in to give all available mem to dom0 */
1236 if (dom0_size == 0) {
1237 printk("Allocating all available memory to dom0\n");
1238 dom0_size = max_dom0_size;
1239 }
1241 /* Check dom0 size. */
1242 if (dom0_size < 4 * 1024 * 1024) {
1243 panic("dom0_mem is too small, boot aborted"
1244 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
1247 if (running_on_sim) {
1248 dom0_size = 128*1024*1024; //FIXME: Should be configurable
1249 }
1251 /* no need to allocate pages for now
1252 * pages are allocated by map_new_domain_page() via loaddomainelfimage()
1253 */
1254 }
1257 /*
1258 * Domain 0 has direct access to all devices. The main point of
1259 * this stub, however, is to allow alloc_dom_mem to handle
1260 * requests with order > 0. Dom0 requires that bit set in order
1261 * to allocate memory for other domains.
1262 */
1263 static void __init physdev_init_dom0(struct domain *d)
1264 {
1265 if (iomem_permit_access(d, 0UL, ~0UL))
1266 BUG();
1267 if (irqs_permit_access(d, 0, NR_IRQS-1))
1268 BUG();
1269 if (ioports_permit_access(d, 0, 0xffff))
1270 BUG();
1271 }
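/*
 * construct_dom0() lays dom0 out in metaphysical space as follows: the
 * kernel image is loaded at its ELF paddr plus phys_load_offset
 * (= xen_pstart - elf.pstart), the start_info page sits at
 * PAGE_ALIGN(pkern_end), and the initrd (if any) starts one page after
 * that.  start_info_t, the ia64_boot_param and the optional VGA console
 * info are packed into the single start_info page; vcpu0 then gets r28
 * pointed at the boot param and cr_iip at the kernel entry point.
 */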
1273 int __init construct_dom0(struct domain *d,
1274 unsigned long image_start, unsigned long image_len,
1275 unsigned long initrd_start, unsigned long initrd_len,
1276 char *cmdline)
1277 {
1278 int i, rc;
1279 start_info_t *si;
1280 dom0_vga_console_info_t *ci;
1281 struct vcpu *v = d->vcpu[0];
1282 unsigned long max_pages;
1284 struct elf_binary elf;
1285 struct elf_dom_parms parms;
1286 unsigned long p_start;
1287 unsigned long pkern_start;
1288 unsigned long pkern_entry;
1289 unsigned long pkern_end;
1290 unsigned long pinitrd_start = 0;
1291 unsigned long pstart_info;
1292 unsigned long phys_load_offset;
1293 struct page_info *start_info_page;
1294 unsigned long bp_mpa;
1295 struct ia64_boot_param *bp;
1297 //printk("construct_dom0: starting\n");
1299 /* Sanity! */
1300 BUG_ON(d != dom0);
1301 BUG_ON(d->vcpu[0] == NULL);
1302 BUG_ON(v->is_initialised);
1304 printk("*** LOADING DOMAIN 0 ***\n");
1306 calc_dom0_size();
1308 max_pages = dom0_size / PAGE_SIZE;
1309 d->max_pages = max_pages;
1310 d->tot_pages = 0;
1312 rc = elf_init(&elf, (void*)image_start, image_len);
1313 if ( rc != 0 )
1314 return rc;
1315 #ifdef VERBOSE
1316 elf_set_verbose(&elf);
1317 #endif
1318 elf_parse_binary(&elf);
1319 if (0 != (elf_xen_parse(&elf, &parms)))
1320 return rc;
1322 /*
1323 * We cannot rely on the load address in the ELF headers to
1324 * determine the meta physical address at which the image
1325 * is loaded. Patch the address to match the real one, based
1326 * on xen_pstart
1327 */
1328 phys_load_offset = xen_pstart - elf.pstart;
1329 elf.pstart += phys_load_offset;
1330 elf.pend += phys_load_offset;
1331 parms.virt_kstart += phys_load_offset;
1332 parms.virt_kend += phys_load_offset;
1333 parms.virt_entry += phys_load_offset;
1335 printk(" Dom0 kernel: %s, %s, paddr 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
1336 elf_64bit(&elf) ? "64-bit" : "32-bit",
1337 elf_msb(&elf) ? "msb" : "lsb",
1338 elf.pstart, elf.pend);
1339 if (!elf_64bit(&elf) ||
1340 elf_uval(&elf, elf.ehdr, e_machine) != EM_IA_64) {
1341 printk("Incompatible kernel binary\n");
1342 return -1;
1343 }
1345 p_start = parms.virt_base;
1346 pkern_start = parms.virt_kstart;
1347 pkern_end = parms.virt_kend;
1348 pkern_entry = parms.virt_entry;
1350 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
1352 if ( (p_start & (PAGE_SIZE-1)) != 0 )
1353 {
1354 printk("Initial guest OS must load to a page boundary.\n");
1355 return -EINVAL;
1356 }
1358 pstart_info = PAGE_ALIGN(pkern_end);
1359 if(initrd_start && initrd_len){
1360 unsigned long offset;
1362 /* The next page aligned boundary after the start info.
1363 Note: EFI_PAGE_SHIFT = 12 <= PAGE_SHIFT */
1364 pinitrd_start = pstart_info + PAGE_SIZE;
1366 if ((pinitrd_start + initrd_len - phys_load_offset) >= dom0_size)
1367 panic("%s: not enough memory assigned to dom0", __func__);
1369 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
1370 struct page_info *p;
1371 p = assign_new_domain_page(d, pinitrd_start + offset);
1372 if (p == NULL)
1373 panic("%s: can't allocate page for initrd image", __func__);
1374 if (initrd_len < offset + PAGE_SIZE)
1375 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
1376 initrd_len - offset);
1377 else
1378 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
1379 }
1380 }
1382 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
1383 " Kernel image: %lx->%lx\n"
1384 " Entry address: %lx\n"
1385 " Init. ramdisk: %lx len %lx\n"
1386 " Start info.: %lx->%lx\n",
1387 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
1388 pstart_info, pstart_info + PAGE_SIZE);
1390 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
1391 {
1392 printk("Initial guest OS requires too much space\n"
1393 "(%luMB is greater than %luMB limit)\n",
1394 (pkern_end-pkern_start)>>20,
1395 (max_pages <<PAGE_SHIFT)>>20);
1396 return -ENOMEM;
1397 }
1399 // if high 3 bits of pkern start are non-zero, error
1401 // if pkern end is after end of metaphysical memory, error
1402 // (we should be able to deal with this... later)
1404 /* Mask all upcalls... */
1405 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
1406 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
1408 if (dom0_max_vcpus == 0)
1409 dom0_max_vcpus = MAX_VIRT_CPUS;
1410 if (dom0_max_vcpus > num_online_cpus())
1411 dom0_max_vcpus = num_online_cpus();
1412 if (dom0_max_vcpus > MAX_VIRT_CPUS)
1413 dom0_max_vcpus = MAX_VIRT_CPUS;
1415 printk ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
1416 for ( i = 1; i < dom0_max_vcpus; i++ )
1417 if (alloc_vcpu(d, i, i) == NULL)
1418 panic("Cannot allocate dom0 vcpu %d\n", i);
1420 /* Copy the OS image. */
1421 loaddomainelfimage(d, &elf, phys_load_offset);
1423 BUILD_BUG_ON(sizeof(start_info_t) + sizeof(dom0_vga_console_info_t) +
1424 sizeof(struct ia64_boot_param) > PAGE_SIZE);
1426 /* Set up start info area. */
1427 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
1428 start_info_page = assign_new_domain_page(d, pstart_info);
1429 if (start_info_page == NULL)
1430 panic("can't allocate start info page");
1431 si = page_to_virt(start_info_page);
1432 clear_page(si);
1433 snprintf(si->magic, sizeof(si->magic), "xen-%i.%i-ia64",
1434 xen_major_version(), xen_minor_version());
1435 si->nr_pages = max_pages;
1436 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
1438 printk("Dom0: 0x%lx\n", (u64)dom0);
1440 v->is_initialised = 1;
1441 clear_bit(_VPF_down, &v->pause_flags);
1443 /* Build firmware.
1444 Note: Linux kernel reserve memory used by start_info, so there is
1445 no need to remove it from MDT. */
1446 bp_mpa = pstart_info + sizeof(struct start_info);
1447 rc = dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
1448 if (rc != 0)
1449 return rc;
1451 /* Fill boot param. */
1452 strlcpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
1454 bp = (struct ia64_boot_param *)((unsigned char *)si +
1455 sizeof(start_info_t));
1456 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
1458 /* We assume console has reached the last line! */
1459 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
1460 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
1461 bp->console_info.orig_x = 0;
1462 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
1463 0 : bp->console_info.num_rows - 1;
1465 bp->initrd_start = pinitrd_start;
1466 bp->initrd_size = ia64_boot_param->initrd_size;
1468 ci = (dom0_vga_console_info_t *)((unsigned char *)si +
1469 sizeof(start_info_t) +
1470 sizeof(struct ia64_boot_param));
1472 if (fill_console_start_info(ci)) {
1473 si->console.dom0.info_off = sizeof(start_info_t) +
1474 sizeof(struct ia64_boot_param);
1475 si->console.dom0.info_size = sizeof(dom0_vga_console_info_t);
1476 }
1478 vcpu_init_regs (v);
1480 vcpu_regs(v)->r28 = bp_mpa;
1482 vcpu_regs (v)->cr_iip = pkern_entry;
1484 physdev_init_dom0(d);
1486 return 0;
1487 }
1489 void machine_restart(char * __unused)
1490 {
1491 console_start_sync();
1492 if (running_on_sim)
1493 printk ("machine_restart called. spinning...\n");
1494 else
1495 (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
1496 while(1);
1497 }
1499 extern void cpu_halt(void);
1501 void machine_halt(void)
1502 {
1503 console_start_sync();
1504 if (running_on_sim)
1505 printk ("machine_halt called. spinning...\n");
1506 else
1507 cpu_halt();
1508 while(1);
1509 }
1511 void sync_vcpu_execstate(struct vcpu *v)
1512 {
1513 // __ia64_save_fpu(v->arch._thread.fph);
1514 // if (VMX_DOMAIN(v))
1515 // vmx_save_state(v);
1516 // FIXME SMP: Anything else needed here for SMP?
1517 }
1519 /* This function is taken from xen/arch/x86/domain.c */
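/*
 * VCPUOP_register_runstate_memory_area: the guest registers a virtual
 * address at which it wants its vcpu_runstate_info kept.  The handle is
 * stored in runstate_guest(v) and refreshed once right away; after that,
 * context_switch() copies the current runstate out on every switch (the
 * "steal time accounting" above).
 */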
1520 long
1521 arch_do_vcpu_op(int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
1522 {
1523 long rc = 0;
1525 switch (cmd) {
1526 case VCPUOP_register_runstate_memory_area:
1527 {
1528 struct vcpu_register_runstate_memory_area area;
1529 struct vcpu_runstate_info runstate;
1531 rc = -EFAULT;
1532 if (copy_from_guest(&area, arg, 1))
1533 break;
1535 if (!guest_handle_okay(area.addr.h, 1))
1536 break;
1538 rc = 0;
1539 runstate_guest(v) = area.addr.h;
1541 if (v == current) {
1542 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
1543 } else {
1544 vcpu_runstate_get(v, &runstate);
1545 __copy_to_guest(runstate_guest(v), &runstate, 1);
1546 }
1548 break;
1549 }
1550 default:
1551 rc = -ENOSYS;
1552 break;
1553 }
1555 return rc;
1556 }
1558 static void __init parse_dom0_mem(char *s)
1559 {
1560 dom0_size = parse_size_and_unit(s, NULL);
1561 }
1562 custom_param("dom0_mem", parse_dom0_mem);
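/*
 * Optional identity-mapping features (XEN_IA64_OPTF_IDENT_MAP_REG4/5/7):
 * when the guest switches one of them on, the requested pgprot and
 * protection key are recorded in the domain's opt_feature state and the
 * matching bit is set in optf->mask; switching it off clears them again.
 * The recorded values are consumed by the fault-handling paths elsewhere.
 */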
1564 /*
1565 * Helper function for the optimization stuff handling the identity mapping
1566 * feature.
1567 */
1568 static inline void
1569 optf_set_identity_mapping(unsigned long* mask, struct identity_mapping* im,
1570 struct xen_ia64_opt_feature* f)
1571 {
1572 if (f->on) {
1573 *mask |= f->cmd;
1574 im->pgprot = f->pgprot;
1575 im->key = f->key;
1576 } else {
1577 *mask &= ~(f->cmd);
1578 im->pgprot = 0;
1579 im->key = 0;
1580 }
1581 }
1583 /* Switch an optimization feature on/off. */
1584 int
1585 domain_opt_feature(struct xen_ia64_opt_feature* f)
1586 {
1587 struct opt_feature* optf = &(current->domain->arch.opt_feature);
1588 long rc = 0;
1590 switch (f->cmd) {
1591 case XEN_IA64_OPTF_IDENT_MAP_REG4:
1592 optf_set_identity_mapping(&optf->mask, &optf->im_reg4, f);
1593 break;
1594 case XEN_IA64_OPTF_IDENT_MAP_REG5:
1595 optf_set_identity_mapping(&optf->mask, &optf->im_reg5, f);
1596 break;
1597 case XEN_IA64_OPTF_IDENT_MAP_REG7:
1598 optf_set_identity_mapping(&optf->mask, &optf->im_reg7, f);
1599 break;
1600 default:
1601 printk("%s: unknown opt_feature: %ld\n", __func__, f->cmd);
1602 rc = -ENOSYS;
1603 break;
1604 }
1605 return rc;
1606 }