ia64/xen-unstable

view xen/arch/ia64/xen/domain.c @ 15761:0cc2e0a1b2fc

[IA64] Kill PM timer

The domain resource release path forgot to stop (kill) the PM timer
before freeing the domain structure.
When VHPT allocation for VCPU#0 failed, the VMX flag of VCPU#0 was
never set. For this reason, domain_relinquish_resources() did not
call vmx_relinquish_guest_resources(), but the domain structure
was freed anyway. As a result, timer_softirq_action() was left with
a dangling callback pointer for the PM timer.

Signed-off-by: Masaki Kanno <kanno.masaki@jp.fujitsu.com>
author Alex Williamson <alex.williamson@hp.com>
date Fri Aug 24 15:06:49 2007 -0600 (2007-08-24)
parents b5dbf184df6c
children 4ffca478e2f7
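
The ordering rule behind this fix can be shown with a small stand-alone C sketch. It is not the actual patch: fake_domain, fake_timer, pm_timer_fn and kill_fake_timer below are hypothetical stand-ins for Xen's struct domain, struct timer, the PM timer handler and kill_timer(). The point it illustrates is that a timer whose callback dereferences its owning structure must be killed before that structure is freed; otherwise the timer softirq path is left calling into freed memory, which is the situation the message above describes.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins for Xen's struct domain and struct timer. */
struct fake_domain {
    int id;
};

struct fake_timer {
    void (*fn)(void *data);  /* callback, like a Xen timer handler  */
    void *data;              /* points back at the owning structure */
    int killed;
};

static void pm_timer_fn(void *data)
{
    struct fake_domain *d = data;
    printf("PM timer fired for domain %d\n", d->id);
}

/* Analogue of kill_timer(): after this returns, the softirq path must
 * never invoke t->fn again. */
static void kill_fake_timer(struct fake_timer *t)
{
    t->killed = 1;
    t->fn = NULL;
    t->data = NULL;
}

int main(void)
{
    struct fake_domain *d = malloc(sizeof(*d));
    struct fake_timer pm_timer;

    if (d == NULL)
        return 1;
    d->id = 0;

    pm_timer.fn = pm_timer_fn;
    pm_timer.data = d;
    pm_timer.killed = 0;

    /* Fire once while everything is still alive. */
    pm_timer.fn(pm_timer.data);

    /* Correct teardown order: kill the timer first, then free the
     * structure its callback references. Freeing d first would leave
     * pm_timer.data dangling -- the bug described in this changeset. */
    kill_fake_timer(&pm_timer);
    free(d);

    return 0;
}
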
line source
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 *
12 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
13 * VA Linux Systems Japan K.K.
14 * dom0 vp model support
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/errno.h>
21 #include <xen/sched.h>
22 #include <xen/smp.h>
23 #include <xen/delay.h>
24 #include <xen/softirq.h>
25 #include <xen/mm.h>
26 #include <xen/iocap.h>
27 #include <asm/asm-xsi-offsets.h>
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <asm/processor.h>
31 #include <xen/event.h>
32 #include <xen/console.h>
33 #include <xen/version.h>
34 #include <public/libelf.h>
35 #include <asm/pgalloc.h>
36 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
37 #include <asm/vcpu.h> /* for function declarations */
38 #include <public/xen.h>
39 #include <xen/domain.h>
40 #include <asm/vmx.h>
41 #include <asm/vmx_vcpu.h>
42 #include <asm/vmx_vpd.h>
43 #include <asm/vmx_phy_mode.h>
44 #include <asm/vhpt.h>
45 #include <asm/vcpu.h>
46 #include <asm/tlbflush.h>
47 #include <asm/regionreg.h>
48 #include <asm/dom_fw.h>
49 #include <asm/shadow.h>
50 #include <xen/guest_access.h>
51 #include <asm/tlb_track.h>
52 #include <asm/perfmon.h>
53 #include <public/vcpu.h>
55 /* dom0_size: default memory allocation for dom0 (~4GB) */
56 static unsigned long __initdata dom0_size = 4096UL*1024UL*1024UL;
58 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
59 static unsigned int __initdata dom0_max_vcpus = 4;
60 integer_param("dom0_max_vcpus", dom0_max_vcpus);
62 extern char dom0_command_line[];
64 /* forward declaration */
65 static void init_switch_stack(struct vcpu *v);
67 /* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
68 This is a Xen virtual address. */
69 DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
70 DEFINE_PER_CPU(int *, current_psr_ic_addr);
72 DEFINE_PER_CPU(struct vcpu *, fp_owner);
74 #include <xen/sched-if.h>
76 static void
77 ia64_disable_vhpt_walker(void)
78 {
79 // Disable the VHPT. Without this, ia64_new_rr7() might cause a
80 // VHPT fault because it flushes dtr[IA64_TR_VHPT].
81 // (VHPT_SIZE_LOG2 << 2) is just to avoid a
82 // Reserved Register/Field fault.
83 ia64_set_pta(VHPT_SIZE_LOG2 << 2);
84 }
86 static void flush_vtlb_for_context_switch(struct vcpu* prev, struct vcpu* next)
87 {
88 int cpu = smp_processor_id();
89 int last_vcpu_id, last_processor;
91 if (!is_idle_domain(prev->domain))
92 tlbflush_update_time
93 (&prev->domain->arch.last_vcpu[cpu].tlbflush_timestamp,
94 tlbflush_current_time());
96 if (is_idle_domain(next->domain))
97 return;
99 last_vcpu_id = next->domain->arch.last_vcpu[cpu].vcpu_id;
100 last_processor = next->arch.last_processor;
102 next->domain->arch.last_vcpu[cpu].vcpu_id = next->vcpu_id;
103 next->arch.last_processor = cpu;
105 if ((last_vcpu_id != next->vcpu_id &&
106 last_vcpu_id != INVALID_VCPU_ID) ||
107 (last_vcpu_id == next->vcpu_id &&
108 last_processor != cpu &&
109 last_processor != INVALID_PROCESSOR)) {
110 #ifdef CONFIG_XEN_IA64_TLBFLUSH_CLOCK
111 u32 last_tlbflush_timestamp =
112 next->domain->arch.last_vcpu[cpu].tlbflush_timestamp;
113 #endif
114 int vhpt_is_flushed = 0;
116 // If the vTLB implementation is changed,
117 // the following must be updated as well.
118 if (VMX_DOMAIN(next)) {
119 // The vTLB for a VT-i domain is currently per vcpu,
120 // so no flushing is needed.
121 } else if (HAS_PERVCPU_VHPT(next->domain)) {
122 // nothing to do
123 } else {
124 if (NEED_FLUSH(__get_cpu_var(vhpt_tlbflush_timestamp),
125 last_tlbflush_timestamp)) {
126 local_vhpt_flush();
127 vhpt_is_flushed = 1;
128 }
129 }
130 if (vhpt_is_flushed || NEED_FLUSH(__get_cpu_var(tlbflush_time),
131 last_tlbflush_timestamp)) {
132 local_flush_tlb_all();
133 perfc_incr(tlbflush_clock_cswitch_purge);
134 } else {
135 perfc_incr(tlbflush_clock_cswitch_skip);
136 }
137 perfc_incr(flush_vtlb_for_context_switch);
138 }
139 }
141 static void flush_cache_for_context_switch(struct vcpu *next)
142 {
143 extern cpumask_t cpu_cache_coherent_map;
144 int cpu = smp_processor_id();
146 if (is_idle_vcpu(next) ||
147 __test_and_clear_bit(cpu, &next->arch.cache_coherent_map)) {
148 if (cpu_test_and_clear(cpu, cpu_cache_coherent_map)) {
149 unsigned long flags;
150 u64 progress = 0;
151 s64 status;
153 local_irq_save(flags);
154 status = ia64_pal_cache_flush(4, 0, &progress, NULL);
155 local_irq_restore(flags);
156 if (status != 0)
157 panic_domain(NULL, "PAL_CACHE_FLUSH ERROR, "
158 "cache_type=4 status %lx", status);
159 }
160 }
161 }
163 static void lazy_fp_switch(struct vcpu *prev, struct vcpu *next)
164 {
165 /*
166 * Implement eager save, lazy restore
167 */
168 if (!is_idle_vcpu(prev)) {
169 if (VMX_DOMAIN(prev)) {
170 if (FP_PSR(prev) & IA64_PSR_MFH) {
171 __ia64_save_fpu(prev->arch._thread.fph);
172 __ia64_per_cpu_var(fp_owner) = prev;
173 }
174 } else {
175 if (PSCB(prev, hpsr_mfh)) {
176 __ia64_save_fpu(prev->arch._thread.fph);
177 __ia64_per_cpu_var(fp_owner) = prev;
178 }
179 }
180 }
182 if (!is_idle_vcpu(next)) {
183 if (VMX_DOMAIN(next)) {
184 FP_PSR(next) = IA64_PSR_DFH;
185 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
186 } else {
187 PSCB(next, hpsr_dfh) = 1;
188 PSCB(next, hpsr_mfh) = 0;
189 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
190 }
191 }
192 }
194 void schedule_tail(struct vcpu *prev)
195 {
196 extern char ia64_ivt;
198 context_saved(prev);
199 ia64_disable_vhpt_walker();
201 if (VMX_DOMAIN(current)) {
202 vmx_do_launch(current);
203 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
204 current->processor);
205 } else {
206 ia64_set_iva(&ia64_ivt);
207 load_region_regs(current);
208 ia64_set_pta(vcpu_pta(current));
209 vcpu_load_kernel_regs(current);
210 __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
211 shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
212 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
213 (current->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
214 migrate_timer(&current->arch.hlt_timer, current->processor);
215 }
216 flush_vtlb_for_context_switch(prev, current);
217 }
219 void context_switch(struct vcpu *prev, struct vcpu *next)
220 {
221 uint64_t spsr;
223 local_irq_save(spsr);
225 if (VMX_DOMAIN(prev)) {
226 vmx_save_state(prev);
227 if (!VMX_DOMAIN(next)) {
228 /* VMX domains can change the physical cr.dcr.
229 * Restore default to prevent leakage. */
230 ia64_setreg(_IA64_REG_CR_DCR, IA64_DEFAULT_DCR_BITS);
231 }
232 }
233 if (VMX_DOMAIN(next))
234 vmx_load_state(next);
236 ia64_disable_vhpt_walker();
237 lazy_fp_switch(prev, current);
239 if (prev->arch.dbg_used || next->arch.dbg_used) {
240 /*
241 * Load debug registers either because they are valid or to clear
242 * the previous one.
243 */
244 ia64_load_debug_regs(next->arch.dbr);
245 }
247 prev = ia64_switch_to(next);
249 /* Note: ia64_switch_to does not return here at vcpu initialization. */
251 if (VMX_DOMAIN(current)) {
252 vmx_load_all_rr(current);
253 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
254 current->processor);
255 } else {
256 struct domain *nd;
257 extern char ia64_ivt;
259 ia64_set_iva(&ia64_ivt);
261 nd = current->domain;
262 if (!is_idle_domain(nd)) {
263 load_region_regs(current);
264 ia64_set_pta(vcpu_pta(current));
265 vcpu_load_kernel_regs(current);
266 if (vcpu_pkr_in_use(current))
267 vcpu_pkr_load_regs(current);
268 vcpu_set_next_timer(current);
269 if (vcpu_timer_expired(current))
270 vcpu_pend_timer(current);
271 __ia64_per_cpu_var(current_psr_i_addr) = &nd->shared_info->
272 vcpu_info[current->vcpu_id].evtchn_upcall_mask;
273 __ia64_per_cpu_var(current_psr_ic_addr) =
274 (int *)(nd->arch.shared_info_va + XSI_PSR_IC_OFS);
275 /* steal time accounting */
276 if (!guest_handle_is_null(runstate_guest(current)))
277 __copy_to_guest(runstate_guest(current), &current->runstate, 1);
278 } else {
279 /* When switching to the idle domain, we only need to disable the vhpt
280 * walker. All accesses that happen within the idle context are then
281 * handled by TR mappings and the identity mapping.
282 */
283 __ia64_per_cpu_var(current_psr_i_addr) = NULL;
284 __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
285 }
286 }
287 local_irq_restore(spsr);
289 /* lazy fp */
290 if (current->processor != current->arch.last_processor) {
291 unsigned long *addr;
292 addr = (unsigned long *)per_cpu_addr(fp_owner,
293 current->arch.last_processor);
294 ia64_cmpxchg(acq, addr, current, 0, 8);
295 }
297 flush_vtlb_for_context_switch(prev, current);
298 flush_cache_for_context_switch(current);
299 context_saved(prev);
300 }
302 void continue_running(struct vcpu *same)
303 {
304 /* nothing to do */
305 }
307 #ifdef CONFIG_PERFMON
308 static int pal_halt = 1;
309 static int can_do_pal_halt = 1;
311 static int __init nohalt_setup(char * str)
312 {
313 pal_halt = can_do_pal_halt = 0;
314 return 1;
315 }
316 __setup("nohalt", nohalt_setup);
318 void
319 update_pal_halt_status(int status)
320 {
321 can_do_pal_halt = pal_halt && status;
322 }
323 #else
324 #define can_do_pal_halt (1)
325 #endif
327 static void default_idle(void)
328 {
329 local_irq_disable();
330 if ( !softirq_pending(smp_processor_id()) ) {
331 if (can_do_pal_halt)
332 safe_halt();
333 else
334 cpu_relax();
335 }
336 local_irq_enable();
337 }
339 static void continue_cpu_idle_loop(void)
340 {
341 for ( ; ; )
342 {
343 #ifdef IA64
344 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
345 #else
346 irq_stat[cpu].idle_timestamp = jiffies;
347 #endif
348 page_scrub_schedule_work();
349 while ( !softirq_pending(smp_processor_id()) )
350 default_idle();
351 raise_softirq(SCHEDULE_SOFTIRQ);
352 do_softirq();
353 }
354 }
356 void startup_cpu_idle_loop(void)
357 {
358 /* Just some sanity to ensure that the scheduler is set up okay. */
359 ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
360 raise_softirq(SCHEDULE_SOFTIRQ);
362 continue_cpu_idle_loop();
363 }
365 /* compile time test for get_order(sizeof(mapped_regs_t)) !=
366 * get_order_from_shift(XMAPPEDREGS_SHIFT))
367 */
368 #if !(((1 << (XMAPPEDREGS_SHIFT - 1)) < MAPPED_REGS_T_SIZE) && \
369 (MAPPED_REGS_T_SIZE < (1 << (XMAPPEDREGS_SHIFT + 1))))
370 # error "XMAPPEDREGS_SHIFT doesn't match sizeof(mapped_regs_t)."
371 #endif
373 void hlt_timer_fn(void *data)
374 {
375 struct vcpu *v = data;
376 vcpu_unblock(v);
377 }
379 void relinquish_vcpu_resources(struct vcpu *v)
380 {
381 if (HAS_PERVCPU_VHPT(v->domain))
382 pervcpu_vhpt_free(v);
383 if (v->arch.privregs != NULL) {
384 free_xenheap_pages(v->arch.privregs,
385 get_order_from_shift(XMAPPEDREGS_SHIFT));
386 v->arch.privregs = NULL;
387 }
388 kill_timer(&v->arch.hlt_timer);
389 }
391 struct vcpu *alloc_vcpu_struct(void)
392 {
393 struct vcpu *v;
394 struct thread_info *ti;
395 static int first_allocation = 1;
397 if (first_allocation) {
398 first_allocation = 0;
399 /* Keep idle vcpu0 statically allocated at compile time, because
400 * some code inherited from Linux still requires it in the early phase.
401 */
402 return idle_vcpu[0];
403 }
405 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
406 return NULL;
407 memset(v, 0, sizeof(*v));
409 ti = alloc_thread_info(v);
410 /* Clear thread_info to clear some important fields, like
411 * preempt_count
412 */
413 memset(ti, 0, sizeof(struct thread_info));
414 init_switch_stack(v);
416 return v;
417 }
419 void free_vcpu_struct(struct vcpu *v)
420 {
421 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
422 }
424 int vcpu_initialise(struct vcpu *v)
425 {
426 struct domain *d = v->domain;
428 if (!is_idle_domain(d)) {
429 v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
430 v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
431 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
432 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;
434 /* Is this correct?
435 It depends on the domain's rid usage.
437 A domain may share rids among its processors (e.g. when using a
438 global VHPT). In this case we should also share rids
439 among vcpus, and the rid range should be the same.
441 However, a domain may have per-cpu rid allocation. In
442 that case we don't want to share rids among vcpus, though we may
443 do it if two vcpus are on the same cpu... */
445 v->arch.starting_rid = d->arch.starting_rid;
446 v->arch.ending_rid = d->arch.ending_rid;
447 v->arch.breakimm = d->arch.breakimm;
448 v->arch.last_processor = INVALID_PROCESSOR;
449 v->arch.vhpt_pg_shift = PAGE_SHIFT;
450 }
452 if (!VMX_DOMAIN(v))
453 init_timer(&v->arch.hlt_timer, hlt_timer_fn, v,
454 first_cpu(cpu_online_map));
456 return 0;
457 }
459 void vcpu_share_privregs_with_guest(struct vcpu *v)
460 {
461 struct domain *d = v->domain;
462 int i, order = get_order_from_shift(XMAPPEDREGS_SHIFT);
464 for (i = 0; i < (1 << order); i++)
465 share_xen_page_with_guest(virt_to_page(v->arch.privregs) + i,
466 d, XENSHARE_writable);
467 /*
468 * XXX IA64_XMAPPEDREGS_PADDR
469 * Assign these pages into the guest pseudo-physical address
470 * space so that dom0 can map them by gmfn.
471 * This is necessary for domain save, restore and dump-core.
472 */
473 for (i = 0; i < XMAPPEDREGS_SIZE; i += PAGE_SIZE)
474 assign_domain_page(d, IA64_XMAPPEDREGS_PADDR(v->vcpu_id) + i,
475 virt_to_maddr(v->arch.privregs + i));
476 }
478 int vcpu_late_initialise(struct vcpu *v)
479 {
480 struct domain *d = v->domain;
481 int rc, order;
483 if (HAS_PERVCPU_VHPT(d)) {
484 rc = pervcpu_vhpt_alloc(v);
485 if (rc != 0)
486 return rc;
487 }
489 /* Create privregs page. */
490 order = get_order_from_shift(XMAPPEDREGS_SHIFT);
491 v->arch.privregs = alloc_xenheap_pages(order);
492 BUG_ON(v->arch.privregs == NULL);
493 memset(v->arch.privregs, 0, 1 << XMAPPEDREGS_SHIFT);
494 vcpu_share_privregs_with_guest(v);
496 return 0;
497 }
499 void vcpu_destroy(struct vcpu *v)
500 {
501 if (v->domain->arch.is_vti)
502 vmx_relinquish_vcpu_resources(v);
503 else
504 relinquish_vcpu_resources(v);
505 }
507 static void init_switch_stack(struct vcpu *v)
508 {
509 struct pt_regs *regs = vcpu_regs (v);
510 struct switch_stack *sw = (struct switch_stack *) regs - 1;
511 extern void ia64_ret_from_clone;
513 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
514 sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
515 sw->b0 = (unsigned long) &ia64_ret_from_clone;
516 sw->ar_fpsr = FPSR_DEFAULT;
517 v->arch._thread.ksp = (unsigned long) sw - 16;
518 // Stay on the kernel stack because we may get interrupts!
519 // ia64_ret_from_clone switches to the user stack.
520 v->arch._thread.on_ustack = 0;
521 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
522 }
524 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
525 static int opt_pervcpu_vhpt = 1;
526 integer_param("pervcpu_vhpt", opt_pervcpu_vhpt);
527 #endif
529 int arch_domain_create(struct domain *d)
530 {
531 int i;
533 // the following will eventually need to be negotiated dynamically
534 d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
535 d->arch.breakimm = 0x1000;
536 for (i = 0; i < NR_CPUS; i++) {
537 d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
538 }
540 if (is_idle_domain(d))
541 return 0;
543 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
544 d->arch.has_pervcpu_vhpt = opt_pervcpu_vhpt;
545 dprintk(XENLOG_INFO, "%s:%d domain %d pervcpu_vhpt %d\n",
546 __func__, __LINE__, d->domain_id, d->arch.has_pervcpu_vhpt);
547 #endif
548 if (tlb_track_create(d) < 0)
549 goto fail_nomem1;
550 d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
551 if (d->shared_info == NULL)
552 goto fail_nomem;
553 memset(d->shared_info, 0, XSI_SIZE);
554 for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
555 share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
556 d, XENSHARE_writable);
558 /* We may also need an emulation rid for region 4, though it is unlikely
559 * that a guest will issue uncacheable accesses in metaphysical mode. But
560 * keeping such info here may be saner.
561 */
562 if (!allocate_rid_range(d,0))
563 goto fail_nomem;
565 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
567 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
568 goto fail_nomem;
570 /*
571 * grant_table_create() can't fully initialize grant table for domain
572 * because it is called before arch_domain_create().
573 * Here we complete the initialization which requires p2m table.
574 */
575 spin_lock(&d->grant_table->lock);
576 for (i = 0; i < nr_grant_frames(d->grant_table); i++)
577 ia64_gnttab_create_shared_page(d, d->grant_table, i);
578 spin_unlock(&d->grant_table->lock);
580 d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
581 RANGESETF_prettyprint_hex);
583 dprintk(XENLOG_DEBUG, "arch_domain_create: domain=%p\n", d);
584 return 0;
586 fail_nomem:
587 tlb_track_destroy(d);
588 fail_nomem1:
589 if (d->arch.mm.pgd != NULL)
590 pgd_free(d->arch.mm.pgd);
591 if (d->shared_info != NULL)
592 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
593 return -ENOMEM;
594 }
596 void arch_domain_destroy(struct domain *d)
597 {
598 mm_final_teardown(d);
600 if (d->shared_info != NULL)
601 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
603 tlb_track_destroy(d);
605 /* Clear vTLB for the next domain. */
606 domain_flush_tlb_vhpt(d);
608 deallocate_rid_range(d);
609 }
611 int arch_vcpu_reset(struct vcpu *v)
612 {
613 /* FIXME: Stub for now */
614 return 0;
615 }
617 #define COPY_FPREG(dst, src) memcpy(dst, src, sizeof(struct ia64_fpreg))
619 void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
620 {
621 int i;
622 struct vcpu_tr_regs *tr = &c.nat->regs.tr;
623 struct cpu_user_regs *uregs = vcpu_regs(v);
624 int is_hvm = VMX_DOMAIN(v);
625 unsigned int rbs_size;
627 c.nat->regs.b[6] = uregs->b6;
628 c.nat->regs.b[7] = uregs->b7;
630 c.nat->regs.ar.csd = uregs->ar_csd;
631 c.nat->regs.ar.ssd = uregs->ar_ssd;
633 c.nat->regs.r[8] = uregs->r8;
634 c.nat->regs.r[9] = uregs->r9;
635 c.nat->regs.r[10] = uregs->r10;
636 c.nat->regs.r[11] = uregs->r11;
638 if (is_hvm)
639 c.nat->regs.psr = vmx_vcpu_get_psr(v);
640 else
641 c.nat->regs.psr = vcpu_get_psr(v);
643 c.nat->regs.ip = uregs->cr_iip;
644 c.nat->regs.cfm = uregs->cr_ifs;
646 c.nat->regs.ar.unat = uregs->ar_unat;
647 c.nat->regs.ar.pfs = uregs->ar_pfs;
648 c.nat->regs.ar.rsc = uregs->ar_rsc;
649 c.nat->regs.ar.rnat = uregs->ar_rnat;
650 c.nat->regs.ar.bspstore = uregs->ar_bspstore;
652 c.nat->regs.pr = uregs->pr;
653 c.nat->regs.b[0] = uregs->b0;
654 rbs_size = uregs->loadrs >> 16;
655 c.nat->regs.ar.bsp = uregs->ar_bspstore + rbs_size;
657 c.nat->regs.r[1] = uregs->r1;
658 c.nat->regs.r[12] = uregs->r12;
659 c.nat->regs.r[13] = uregs->r13;
660 c.nat->regs.ar.fpsr = uregs->ar_fpsr;
661 c.nat->regs.r[15] = uregs->r15;
663 c.nat->regs.r[14] = uregs->r14;
664 c.nat->regs.r[2] = uregs->r2;
665 c.nat->regs.r[3] = uregs->r3;
666 c.nat->regs.r[16] = uregs->r16;
667 c.nat->regs.r[17] = uregs->r17;
668 c.nat->regs.r[18] = uregs->r18;
669 c.nat->regs.r[19] = uregs->r19;
670 c.nat->regs.r[20] = uregs->r20;
671 c.nat->regs.r[21] = uregs->r21;
672 c.nat->regs.r[22] = uregs->r22;
673 c.nat->regs.r[23] = uregs->r23;
674 c.nat->regs.r[24] = uregs->r24;
675 c.nat->regs.r[25] = uregs->r25;
676 c.nat->regs.r[26] = uregs->r26;
677 c.nat->regs.r[27] = uregs->r27;
678 c.nat->regs.r[28] = uregs->r28;
679 c.nat->regs.r[29] = uregs->r29;
680 c.nat->regs.r[30] = uregs->r30;
681 c.nat->regs.r[31] = uregs->r31;
683 c.nat->regs.ar.ccv = uregs->ar_ccv;
685 COPY_FPREG(&c.nat->regs.f[6], &uregs->f6);
686 COPY_FPREG(&c.nat->regs.f[7], &uregs->f7);
687 COPY_FPREG(&c.nat->regs.f[8], &uregs->f8);
688 COPY_FPREG(&c.nat->regs.f[9], &uregs->f9);
689 COPY_FPREG(&c.nat->regs.f[10], &uregs->f10);
690 COPY_FPREG(&c.nat->regs.f[11], &uregs->f11);
692 c.nat->regs.r[4] = uregs->r4;
693 c.nat->regs.r[5] = uregs->r5;
694 c.nat->regs.r[6] = uregs->r6;
695 c.nat->regs.r[7] = uregs->r7;
697 /* FIXME: to be reordered. */
698 c.nat->regs.nats = uregs->eml_unat;
700 c.nat->regs.rbs_voff = (IA64_RBS_OFFSET / 8) % 64;
701 if (rbs_size < sizeof (c.nat->regs.rbs))
702 memcpy(c.nat->regs.rbs, (char *)v + IA64_RBS_OFFSET, rbs_size);
704 c.nat->privregs_pfn = get_gpfn_from_mfn
705 (virt_to_maddr(v->arch.privregs) >> PAGE_SHIFT);
707 for (i = 0; i < IA64_NUM_DBG_REGS; i++) {
708 vcpu_get_dbr(v, i, &c.nat->regs.dbr[i]);
709 vcpu_get_ibr(v, i, &c.nat->regs.ibr[i]);
710 }
712 for (i = 0; i < 7; i++)
713 vcpu_get_rr(v, (unsigned long)i << 61, &c.nat->regs.rr[i]);
715 /* Fill extra regs. */
716 for (i = 0; i < 8; i++) {
717 tr->itrs[i].pte = v->arch.itrs[i].pte.val;
718 tr->itrs[i].itir = v->arch.itrs[i].itir;
719 tr->itrs[i].vadr = v->arch.itrs[i].vadr;
720 tr->itrs[i].rid = v->arch.itrs[i].rid;
721 }
722 for (i = 0; i < 8; i++) {
723 tr->dtrs[i].pte = v->arch.dtrs[i].pte.val;
724 tr->dtrs[i].itir = v->arch.dtrs[i].itir;
725 tr->dtrs[i].vadr = v->arch.dtrs[i].vadr;
726 tr->dtrs[i].rid = v->arch.dtrs[i].rid;
727 }
728 c.nat->event_callback_ip = v->arch.event_callback_ip;
730 /* If PV and privregs is not set, we can't read mapped registers. */
731 if (!v->domain->arch.is_vti && v->arch.privregs == NULL)
732 return;
734 vcpu_get_dcr (v, &c.nat->regs.cr.dcr);
735 vcpu_get_iva (v, &c.nat->regs.cr.iva);
736 }
738 int arch_set_info_guest(struct vcpu *v, vcpu_guest_context_u c)
739 {
740 struct cpu_user_regs *uregs = vcpu_regs(v);
741 struct domain *d = v->domain;
742 int was_initialised = v->is_initialised;
743 unsigned int rbs_size;
744 int rc, i;
746 /* Finish vcpu initialization. */
747 if (!was_initialised) {
748 if (d->arch.is_vti)
749 rc = vmx_final_setup_guest(v);
750 else
751 rc = vcpu_late_initialise(v);
752 if (rc != 0)
753 return rc;
755 vcpu_init_regs(v);
757 v->is_initialised = 1;
758 /* Auto-online VCPU0 when it is initialised. */
759 if (v->vcpu_id == 0)
760 clear_bit(_VPF_down, &v->pause_flags);
761 }
763 if (c.nat == NULL)
764 return 0;
766 uregs->b6 = c.nat->regs.b[6];
767 uregs->b7 = c.nat->regs.b[7];
769 uregs->ar_csd = c.nat->regs.ar.csd;
770 uregs->ar_ssd = c.nat->regs.ar.ssd;
772 uregs->r8 = c.nat->regs.r[8];
773 uregs->r9 = c.nat->regs.r[9];
774 uregs->r10 = c.nat->regs.r[10];
775 uregs->r11 = c.nat->regs.r[11];
777 if (!d->arch.is_vti)
778 vcpu_set_psr(v, c.nat->regs.psr);
779 else
780 vmx_vcpu_set_psr(v, c.nat->regs.psr);
781 uregs->cr_iip = c.nat->regs.ip;
782 uregs->cr_ifs = c.nat->regs.cfm;
784 uregs->ar_unat = c.nat->regs.ar.unat;
785 uregs->ar_pfs = c.nat->regs.ar.pfs;
786 uregs->ar_rsc = c.nat->regs.ar.rsc;
787 uregs->ar_rnat = c.nat->regs.ar.rnat;
788 uregs->ar_bspstore = c.nat->regs.ar.bspstore;
790 uregs->pr = c.nat->regs.pr;
791 uregs->b0 = c.nat->regs.b[0];
792 rbs_size = c.nat->regs.ar.bsp - c.nat->regs.ar.bspstore;
793 /* Protection against crazy user code. */
794 if (!was_initialised)
795 uregs->loadrs = (rbs_size) << 16;
796 if (rbs_size == (uregs->loadrs >> 16))
797 memcpy((char *)v + IA64_RBS_OFFSET, c.nat->regs.rbs, rbs_size);
799 uregs->r1 = c.nat->regs.r[1];
800 uregs->r12 = c.nat->regs.r[12];
801 uregs->r13 = c.nat->regs.r[13];
802 uregs->ar_fpsr = c.nat->regs.ar.fpsr;
803 uregs->r15 = c.nat->regs.r[15];
805 uregs->r14 = c.nat->regs.r[14];
806 uregs->r2 = c.nat->regs.r[2];
807 uregs->r3 = c.nat->regs.r[3];
808 uregs->r16 = c.nat->regs.r[16];
809 uregs->r17 = c.nat->regs.r[17];
810 uregs->r18 = c.nat->regs.r[18];
811 uregs->r19 = c.nat->regs.r[19];
812 uregs->r20 = c.nat->regs.r[20];
813 uregs->r21 = c.nat->regs.r[21];
814 uregs->r22 = c.nat->regs.r[22];
815 uregs->r23 = c.nat->regs.r[23];
816 uregs->r24 = c.nat->regs.r[24];
817 uregs->r25 = c.nat->regs.r[25];
818 uregs->r26 = c.nat->regs.r[26];
819 uregs->r27 = c.nat->regs.r[27];
820 uregs->r28 = c.nat->regs.r[28];
821 uregs->r29 = c.nat->regs.r[29];
822 uregs->r30 = c.nat->regs.r[30];
823 uregs->r31 = c.nat->regs.r[31];
825 uregs->ar_ccv = c.nat->regs.ar.ccv;
827 COPY_FPREG(&uregs->f6, &c.nat->regs.f[6]);
828 COPY_FPREG(&uregs->f7, &c.nat->regs.f[7]);
829 COPY_FPREG(&uregs->f8, &c.nat->regs.f[8]);
830 COPY_FPREG(&uregs->f9, &c.nat->regs.f[9]);
831 COPY_FPREG(&uregs->f10, &c.nat->regs.f[10]);
832 COPY_FPREG(&uregs->f11, &c.nat->regs.f[11]);
834 uregs->r4 = c.nat->regs.r[4];
835 uregs->r5 = c.nat->regs.r[5];
836 uregs->r6 = c.nat->regs.r[6];
837 uregs->r7 = c.nat->regs.r[7];
839 /* FIXME: to be reordered and restored. */
840 /* uregs->eml_unat = c.nat->regs.nat; */
841 uregs->eml_unat = 0;
843 if (!d->arch.is_vti) {
844 /* domain runs at PL2/3 */
845 uregs->cr_ipsr = vcpu_pl_adjust(uregs->cr_ipsr,
846 IA64_PSR_CPL0_BIT);
847 uregs->ar_rsc = vcpu_pl_adjust(uregs->ar_rsc, 2);
848 }
850 for (i = 0; i < IA64_NUM_DBG_REGS; i++) {
851 vcpu_set_dbr(v, i, c.nat->regs.dbr[i]);
852 vcpu_set_ibr(v, i, c.nat->regs.ibr[i]);
853 }
855 if (c.nat->flags & VGCF_EXTRA_REGS) {
856 struct vcpu_tr_regs *tr = &c.nat->regs.tr;
858 for (i = 0; i < 8; i++) {
859 vcpu_set_itr(v, i, tr->itrs[i].pte,
860 tr->itrs[i].itir,
861 tr->itrs[i].vadr,
862 tr->itrs[i].rid);
863 }
864 for (i = 0; i < 8; i++) {
865 vcpu_set_dtr(v, i,
866 tr->dtrs[i].pte,
867 tr->dtrs[i].itir,
868 tr->dtrs[i].vadr,
869 tr->dtrs[i].rid);
870 }
871 v->arch.event_callback_ip = c.nat->event_callback_ip;
872 v->arch.iva = c.nat->regs.cr.iva;
873 }
875 return 0;
876 }
878 static void relinquish_memory(struct domain *d, struct list_head *list)
879 {
880 struct list_head *ent;
881 struct page_info *page;
882 #ifndef __ia64__
883 unsigned long x, y;
884 #endif
886 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
887 spin_lock_recursive(&d->page_alloc_lock);
888 ent = list->next;
889 while ( ent != list )
890 {
891 page = list_entry(ent, struct page_info, list);
892 /* Grab a reference to the page so it won't disappear from under us. */
893 if ( unlikely(!get_page(page, d)) )
894 {
895 /* Couldn't get a reference -- someone is freeing this page. */
896 ent = ent->next;
897 continue;
898 }
900 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
901 put_page_and_type(page);
903 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
904 put_page(page);
906 #ifndef __ia64__
907 /*
908 * Forcibly invalidate base page tables at this point to break circular
909 * 'linear page table' references. This is okay because MMU structures
910 * are not shared across domains and this domain is now dead. Thus base
911 * tables are not in use so a non-zero count means circular reference.
912 */
913 y = page->u.inuse.type_info;
914 for ( ; ; )
915 {
916 x = y;
917 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
918 (PGT_base_page_table|PGT_validated)) )
919 break;
921 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
922 if ( likely(y == x) )
923 {
924 free_page_type(page, PGT_base_page_table);
925 break;
926 }
927 }
928 #endif
930 /* Follow the list chain and /then/ potentially free the page. */
931 ent = ent->next;
932 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
933 put_page(page);
934 }
936 spin_unlock_recursive(&d->page_alloc_lock);
937 }
939 void domain_relinquish_resources(struct domain *d)
940 {
941 /* Relinquish guest resources for VT-i domain. */
942 if (d->arch.is_vti)
943 vmx_relinquish_guest_resources(d);
945 /* Tear down shadow mode stuff. */
946 mm_teardown(d);
948 /* Relinquish every page of memory. */
949 relinquish_memory(d, &d->xenpage_list);
950 relinquish_memory(d, &d->page_list);
952 if (d->arch.is_vti && d->arch.sal_data)
953 xfree(d->arch.sal_data);
955 /* Free page used by xen oprofile buffer */
956 free_xenoprof_pages(d);
957 }
959 unsigned long
960 domain_set_shared_info_va (unsigned long va)
961 {
962 struct vcpu *v = current;
963 struct domain *d = v->domain;
965 /* Check virtual address:
966 must belong to region 7,
967 must be 64Kb aligned,
968 must not be within Xen virtual space. */
969 if ((va >> 61) != 7
970 || (va & 0xffffUL) != 0
971 || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
972 panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);
974 /* Note: this doesn't work well if other cpus are already running.
975 However this is part of the spec :-) */
976 gdprintk(XENLOG_DEBUG, "Domain set shared_info_va to 0x%016lx\n", va);
977 d->arch.shared_info_va = va;
979 VCPU(v, interrupt_mask_addr) = (unsigned char *)va +
980 INT_ENABLE_OFFSET(v);
982 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)(va + XSI_PSR_IC_OFS);
984 /* Remap the shared pages. */
985 set_one_rr (7UL << 61, PSCB(v,rrs[7]));
987 return 0;
988 }
990 /* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
991 #define SHADOW_COPY_CHUNK 1024
993 int shadow_mode_control(struct domain *d, xen_domctl_shadow_op_t *sc)
994 {
995 unsigned int op = sc->op;
995 unsigned int op = sc->op;
996 int rc = 0;
997 int i;
998 //struct vcpu *v;
1000 if (unlikely(d == current->domain)) {
1001 gdprintk(XENLOG_INFO,
1002 "Don't try to do a shadow op on yourself!\n");
1003 return -EINVAL;
1004 }
1006 domain_pause(d);
1008 switch (op)
1009 {
1010 case XEN_DOMCTL_SHADOW_OP_OFF:
1011 if (shadow_mode_enabled (d)) {
1012 u64 *bm = d->arch.shadow_bitmap;
1014 /* Flush vhpt and tlb to restore dirty bit usage. */
1015 domain_flush_tlb_vhpt(d);
1017 /* Free bitmap. */
1018 d->arch.shadow_bitmap_size = 0;
1019 d->arch.shadow_bitmap = NULL;
1020 xfree(bm);
1021 }
1022 break;
1024 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
1025 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
1026 rc = -EINVAL;
1027 break;
1029 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
1030 if (shadow_mode_enabled(d)) {
1031 rc = -EINVAL;
1032 break;
1033 }
1035 atomic64_set(&d->arch.shadow_fault_count, 0);
1036 atomic64_set(&d->arch.shadow_dirty_count, 0);
1038 d->arch.shadow_bitmap_size =
1039 ((d->arch.convmem_end >> PAGE_SHIFT) +
1040 BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
1041 d->arch.shadow_bitmap = xmalloc_array(unsigned long,
1042 d->arch.shadow_bitmap_size / BITS_PER_LONG);
1043 if (d->arch.shadow_bitmap == NULL) {
1044 d->arch.shadow_bitmap_size = 0;
1045 rc = -ENOMEM;
1046 }
1047 else {
1048 memset(d->arch.shadow_bitmap, 0,
1049 d->arch.shadow_bitmap_size / 8);
1051 /* Flush vhpt and tlb to enable dirty bit
1052 virtualization. */
1053 domain_flush_tlb_vhpt(d);
1054 }
1055 break;
1057 case XEN_DOMCTL_SHADOW_OP_CLEAN:
1058 {
1059 int nbr_bytes;
1061 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
1062 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
1064 atomic64_set(&d->arch.shadow_fault_count, 0);
1065 atomic64_set(&d->arch.shadow_dirty_count, 0);
1067 if (guest_handle_is_null(sc->dirty_bitmap) ||
1068 (d->arch.shadow_bitmap == NULL)) {
1069 rc = -EINVAL;
1070 break;
1071 }
1073 if (sc->pages > d->arch.shadow_bitmap_size)
1074 sc->pages = d->arch.shadow_bitmap_size;
1076 nbr_bytes = (sc->pages + 7) / 8;
1078 for (i = 0; i < nbr_bytes; i += SHADOW_COPY_CHUNK) {
1079 int size = (nbr_bytes - i) > SHADOW_COPY_CHUNK ?
1080 SHADOW_COPY_CHUNK : nbr_bytes - i;
1082 if (copy_to_guest_offset(
1083 sc->dirty_bitmap, i,
1084 (uint8_t *)d->arch.shadow_bitmap + i,
1085 size)) {
1086 rc = -EFAULT;
1087 break;
1088 }
1090 memset((uint8_t *)d->arch.shadow_bitmap + i, 0, size);
1091 }
1093 break;
1094 }
1096 case XEN_DOMCTL_SHADOW_OP_PEEK:
1097 {
1098 unsigned long size;
1100 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
1101 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
1103 if (guest_handle_is_null(sc->dirty_bitmap) ||
1104 (d->arch.shadow_bitmap == NULL)) {
1105 rc = -EINVAL;
1106 break;
1107 }
1109 if (sc->pages > d->arch.shadow_bitmap_size)
1110 sc->pages = d->arch.shadow_bitmap_size;
1112 size = (sc->pages + 7) / 8;
1113 if (copy_to_guest(sc->dirty_bitmap,
1114 (uint8_t *)d->arch.shadow_bitmap, size)) {
1115 rc = -EFAULT;
1116 break;
1117 }
1118 break;
1119 }
1120 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
1121 sc->mb = 0;
1122 break;
1123 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
1124 if (sc->mb > 0) {
1125 BUG();
1126 rc = -ENOMEM;
1127 }
1128 break;
1129 default:
1130 rc = -EINVAL;
1131 break;
1132 }
1134 domain_unpause(d);
1136 return rc;
1137 }
1139 // remove following line if not privifying in memory
1140 //#define HAVE_PRIVIFY_MEMORY
1141 #ifndef HAVE_PRIVIFY_MEMORY
1142 #define privify_memory(x,y) do {} while(0)
1143 #endif
1145 static void __init loaddomainelfimage(struct domain *d, struct elf_binary *elf,
1146 unsigned long phys_load_offset)
1147 {
1148 const elf_phdr *phdr;
1149 int phnum, h, filesz, memsz;
1150 unsigned long elfaddr, dom_mpaddr, dom_imva;
1151 struct page_info *p;
1153 phnum = elf_uval(elf, elf->ehdr, e_phnum);
1154 for (h = 0; h < phnum; h++) {
1155 phdr = elf_phdr_by_index(elf, h);
1156 if (!elf_phdr_is_loadable(elf, phdr))
1157 continue;
1159 filesz = elf_uval(elf, phdr, p_filesz);
1160 memsz = elf_uval(elf, phdr, p_memsz);
1161 elfaddr = (unsigned long) elf->image + elf_uval(elf, phdr, p_offset);
1162 dom_mpaddr = elf_uval(elf, phdr, p_paddr);
1163 dom_mpaddr += phys_load_offset;
1165 while (memsz > 0) {
1166 p = assign_new_domain_page(d,dom_mpaddr);
1167 BUG_ON (unlikely(p == NULL));
1168 dom_imva = __va_ul(page_to_maddr(p));
1169 if (filesz > 0) {
1170 if (filesz >= PAGE_SIZE)
1171 copy_page((void *) dom_imva,
1172 (void *) elfaddr);
1173 else {
1174 // copy partial page
1175 memcpy((void *) dom_imva,
1176 (void *) elfaddr, filesz);
1177 // zero the rest of page
1178 memset((void *) dom_imva+filesz, 0,
1179 PAGE_SIZE-filesz);
1180 }
1181 //FIXME: This test for code seems to find a lot more than objdump -x does
1182 if (elf_uval(elf, phdr, p_flags) & PF_X) {
1183 privify_memory(dom_imva,PAGE_SIZE);
1184 flush_icache_range(dom_imva,
1185 dom_imva+PAGE_SIZE);
1186 }
1187 }
1188 else if (memsz > 0) {
1189 /* always zero out entire page */
1190 clear_page((void *) dom_imva);
1191 }
1192 memsz -= PAGE_SIZE;
1193 filesz -= PAGE_SIZE;
1194 elfaddr += PAGE_SIZE;
1195 dom_mpaddr += PAGE_SIZE;
1196 }
1197 }
1198 }
1200 static void __init calc_dom0_size(void)
1201 {
1202 unsigned long domheap_pages;
1203 unsigned long p2m_pages;
1204 unsigned long spare_hv_pages;
1205 unsigned long max_dom0_size;
1207 /* Estimate maximum memory we can safely allocate for dom0
1208 * by subtracting the p2m table allocation and a chunk of memory
1209 * for DMA and PCI mapping from the available domheap pages. The
1210 * chunk for DMA, PCI, etc., is a guestimate, as xen doesn't seem
1211 * to have a good idea of what those requirements might be ahead
1212 * of time, calculated at 1MB per 4GB of system memory */
1213 domheap_pages = avail_domheap_pages();
1214 p2m_pages = domheap_pages / PTRS_PER_PTE;
1215 spare_hv_pages = domheap_pages / 4096;
1216 max_dom0_size = (domheap_pages - (p2m_pages + spare_hv_pages))
1217 * PAGE_SIZE;
1218 printk("Maximum permitted dom0 size: %luMB\n",
1219 max_dom0_size / (1024*1024));
1221 /* validate proposed dom0_size, fix up as needed */
1222 if (dom0_size > max_dom0_size) {
1223 printk("Reducing dom0 memory allocation from %luK to %luK "
1224 "to fit available memory\n",
1225 dom0_size / 1024, max_dom0_size / 1024);
1226 dom0_size = max_dom0_size;
1227 }
1229 /* dom0_mem=0 can be passed in to give all available mem to dom0 */
1230 if (dom0_size == 0) {
1231 printk("Allocating all available memory to dom0\n");
1232 dom0_size = max_dom0_size;
1233 }
1235 /* Check dom0 size. */
1236 if (dom0_size < 4 * 1024 * 1024) {
1237 panic("dom0_mem is too small, boot aborted"
1238 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
1239 }
1241 if (running_on_sim) {
1242 dom0_size = 128*1024*1024; //FIXME: Should be configurable
1243 }
1245 /* no need to allocate pages for now
1246 * pages are allocated by map_new_domain_page() via loaddomainelfimage()
1247 */
1248 }
1251 /*
1252 * Domain 0 has unrestricted direct access to all devices. However,
1253 * the major point of this stub is to allow alloc_dom_mem to handle
1254 * requests with order > 0. Dom0 requires that bit set in order to
1255 * allocate memory for other domains.
1256 */
1257 static void __init physdev_init_dom0(struct domain *d)
1258 {
1259 if (iomem_permit_access(d, 0UL, ~0UL))
1260 BUG();
1261 if (irqs_permit_access(d, 0, NR_IRQS-1))
1262 BUG();
1263 if (ioports_permit_access(d, 0, 0xffff))
1264 BUG();
1265 }
1267 int __init construct_dom0(struct domain *d,
1268 unsigned long image_start, unsigned long image_len,
1269 unsigned long initrd_start, unsigned long initrd_len,
1270 char *cmdline)
1271 {
1272 int i, rc;
1273 start_info_t *si;
1274 dom0_vga_console_info_t *ci;
1275 struct vcpu *v = d->vcpu[0];
1276 unsigned long max_pages;
1278 struct elf_binary elf;
1279 struct elf_dom_parms parms;
1280 unsigned long p_start;
1281 unsigned long pkern_start;
1282 unsigned long pkern_entry;
1283 unsigned long pkern_end;
1284 unsigned long pinitrd_start = 0;
1285 unsigned long pstart_info;
1286 unsigned long phys_load_offset;
1287 struct page_info *start_info_page;
1288 unsigned long bp_mpa;
1289 struct ia64_boot_param *bp;
1291 //printk("construct_dom0: starting\n");
1293 /* Sanity! */
1294 BUG_ON(d != dom0);
1295 BUG_ON(d->vcpu[0] == NULL);
1296 BUG_ON(v->is_initialised);
1298 printk("*** LOADING DOMAIN 0 ***\n");
1300 calc_dom0_size();
1302 max_pages = dom0_size / PAGE_SIZE;
1303 d->max_pages = max_pages;
1304 d->tot_pages = 0;
1306 rc = elf_init(&elf, (void*)image_start, image_len);
1307 if ( rc != 0 )
1308 return rc;
1309 #ifdef VERBOSE
1310 elf_set_verbose(&elf);
1311 #endif
1312 elf_parse_binary(&elf);
1313 if (0 != (elf_xen_parse(&elf, &parms)))
1314 return rc;
1316 /*
1317 * We cannot rely on the load address in the ELF headers to
1318 * determine the meta physical address at which the image
1319 * is loaded. Patch the address to match the real one, based
1320 * on xen_pstart
1321 */
1322 phys_load_offset = xen_pstart - elf.pstart;
1323 elf.pstart += phys_load_offset;
1324 elf.pend += phys_load_offset;
1325 parms.virt_kstart += phys_load_offset;
1326 parms.virt_kend += phys_load_offset;
1327 parms.virt_entry += phys_load_offset;
1329 printk(" Dom0 kernel: %s, %s, paddr 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
1330 elf_64bit(&elf) ? "64-bit" : "32-bit",
1331 elf_msb(&elf) ? "msb" : "lsb",
1332 elf.pstart, elf.pend);
1333 if (!elf_64bit(&elf) ||
1334 elf_uval(&elf, elf.ehdr, e_machine) != EM_IA_64) {
1335 printk("Incompatible kernel binary\n");
1336 return -1;
1337 }
1339 p_start = parms.virt_base;
1340 pkern_start = parms.virt_kstart;
1341 pkern_end = parms.virt_kend;
1342 pkern_entry = parms.virt_entry;
1344 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
1346 if ( (p_start & (PAGE_SIZE-1)) != 0 )
1347 {
1348 printk("Initial guest OS must load to a page boundary.\n");
1349 return -EINVAL;
1350 }
1352 pstart_info = PAGE_ALIGN(pkern_end);
1353 if(initrd_start && initrd_len){
1354 unsigned long offset;
1356 /* The next page aligned boundary after the start info.
1357 Note: EFI_PAGE_SHIFT = 12 <= PAGE_SHIFT */
1358 pinitrd_start = pstart_info + PAGE_SIZE;
1360 if ((pinitrd_start + initrd_len - phys_load_offset) >= dom0_size)
1361 panic("%s: not enough memory assigned to dom0", __func__);
1363 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
1364 struct page_info *p;
1365 p = assign_new_domain_page(d, pinitrd_start + offset);
1366 if (p == NULL)
1367 panic("%s: can't allocate page for initrd image", __func__);
1368 if (initrd_len < offset + PAGE_SIZE)
1369 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
1370 initrd_len - offset);
1371 else
1372 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
1373 }
1374 }
1376 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
1377 " Kernel image: %lx->%lx\n"
1378 " Entry address: %lx\n"
1379 " Init. ramdisk: %lx len %lx\n"
1380 " Start info.: %lx->%lx\n",
1381 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
1382 pstart_info, pstart_info + PAGE_SIZE);
1384 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
1385 {
1386 printk("Initial guest OS requires too much space\n"
1387 "(%luMB is greater than %luMB limit)\n",
1388 (pkern_end-pkern_start)>>20,
1389 (max_pages <<PAGE_SHIFT)>>20);
1390 return -ENOMEM;
1391 }
1393 // if high 3 bits of pkern start are non-zero, error
1395 // if pkern end is after end of metaphysical memory, error
1396 // (we should be able to deal with this... later)
1398 /* Mask all upcalls... */
1399 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
1400 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
1402 if (dom0_max_vcpus == 0)
1403 dom0_max_vcpus = MAX_VIRT_CPUS;
1404 if (dom0_max_vcpus > num_online_cpus())
1405 dom0_max_vcpus = num_online_cpus();
1406 if (dom0_max_vcpus > MAX_VIRT_CPUS)
1407 dom0_max_vcpus = MAX_VIRT_CPUS;
1409 printk ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
1410 for ( i = 1; i < dom0_max_vcpus; i++ )
1411 if (alloc_vcpu(d, i, i) == NULL)
1412 panic("Cannot allocate dom0 vcpu %d\n", i);
1414 /* Copy the OS image. */
1415 loaddomainelfimage(d, &elf, phys_load_offset);
1417 BUILD_BUG_ON(sizeof(start_info_t) + sizeof(dom0_vga_console_info_t) +
1418 sizeof(struct ia64_boot_param) > PAGE_SIZE);
1420 /* Set up start info area. */
1421 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
1422 start_info_page = assign_new_domain_page(d, pstart_info);
1423 if (start_info_page == NULL)
1424 panic("can't allocate start info page");
1425 si = page_to_virt(start_info_page);
1426 clear_page(si);
1427 snprintf(si->magic, sizeof(si->magic), "xen-%i.%i-ia64",
1428 xen_major_version(), xen_minor_version());
1429 si->nr_pages = max_pages;
1430 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
1432 printk("Dom0: 0x%lx\n", (u64)dom0);
1434 v->is_initialised = 1;
1435 clear_bit(_VPF_down, &v->pause_flags);
1437 /* Build firmware.
1438 Note: the Linux kernel reserves the memory used by start_info, so there is
1439 no need to remove it from MDT. */
1440 bp_mpa = pstart_info + sizeof(struct start_info);
1441 rc = dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
1442 if (rc != 0)
1443 return rc;
1445 /* Fill boot param. */
1446 strlcpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
1448 bp = (struct ia64_boot_param *)((unsigned char *)si +
1449 sizeof(start_info_t));
1450 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
1452 /* We assume console has reached the last line! */
1453 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
1454 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
1455 bp->console_info.orig_x = 0;
1456 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
1457 0 : bp->console_info.num_rows - 1;
1459 bp->initrd_start = pinitrd_start;
1460 bp->initrd_size = ia64_boot_param->initrd_size;
1462 ci = (dom0_vga_console_info_t *)((unsigned char *)si +
1463 sizeof(start_info_t) +
1464 sizeof(struct ia64_boot_param));
1466 if (fill_console_start_info(ci)) {
1467 si->console.dom0.info_off = sizeof(start_info_t) +
1468 sizeof(struct ia64_boot_param);
1469 si->console.dom0.info_size = sizeof(dom0_vga_console_info_t);
1470 }
1472 vcpu_init_regs (v);
1474 vcpu_regs(v)->r28 = bp_mpa;
1476 vcpu_regs (v)->cr_iip = pkern_entry;
1478 physdev_init_dom0(d);
1480 return 0;
1481 }
1483 void machine_restart(char * __unused)
1484 {
1485 console_start_sync();
1486 if (running_on_sim)
1487 printk ("machine_restart called. spinning...\n");
1488 else
1489 (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
1490 while(1);
1491 }
1493 extern void cpu_halt(void);
1495 void machine_halt(void)
1496 {
1497 console_start_sync();
1498 if (running_on_sim)
1499 printk ("machine_halt called. spinning...\n");
1500 else
1501 cpu_halt();
1502 while(1);
1503 }
1505 void sync_vcpu_execstate(struct vcpu *v)
1506 {
1507 // __ia64_save_fpu(v->arch._thread.fph);
1508 // if (VMX_DOMAIN(v))
1509 // vmx_save_state(v);
1510 // FIXME SMP: Anything else needed here for SMP?
1511 }
1513 /* This function is taken from xen/arch/x86/domain.c */
1514 long
1515 arch_do_vcpu_op(int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
1516 {
1517 long rc = 0;
1519 switch (cmd) {
1520 case VCPUOP_register_runstate_memory_area:
1521 {
1522 struct vcpu_register_runstate_memory_area area;
1523 struct vcpu_runstate_info runstate;
1525 rc = -EFAULT;
1526 if (copy_from_guest(&area, arg, 1))
1527 break;
1529 if (!guest_handle_okay(area.addr.h, 1))
1530 break;
1532 rc = 0;
1533 runstate_guest(v) = area.addr.h;
1535 if (v == current) {
1536 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
1537 } else {
1538 vcpu_runstate_get(v, &runstate);
1539 __copy_to_guest(runstate_guest(v), &runstate, 1);
1540 }
1542 break;
1543 }
1544 default:
1545 rc = -ENOSYS;
1546 break;
1547 }
1549 return rc;
1550 }
1552 static void __init parse_dom0_mem(char *s)
1553 {
1554 dom0_size = parse_size_and_unit(s, NULL);
1555 }
1556 custom_param("dom0_mem", parse_dom0_mem);
1558 /*
1559 * Helper function for the optimization stuff handling the identity mapping
1560 * feature.
1561 */
1562 static inline void
1563 optf_set_identity_mapping(unsigned long* mask, struct identity_mapping* im,
1564 struct xen_ia64_opt_feature* f)
1565 {
1566 if (f->on) {
1567 *mask |= f->cmd;
1568 im->pgprot = f->pgprot;
1569 im->key = f->key;
1570 } else {
1571 *mask &= ~(f->cmd);
1572 im->pgprot = 0;
1573 im->key = 0;
1574 }
1575 }
1577 /* Switch an optimization feature on/off. */
1578 int
1579 domain_opt_feature(struct xen_ia64_opt_feature* f)
1580 {
1581 struct opt_feature* optf = &(current->domain->arch.opt_feature);
1582 long rc = 0;
1584 switch (f->cmd) {
1585 case XEN_IA64_OPTF_IDENT_MAP_REG4:
1586 optf_set_identity_mapping(&optf->mask, &optf->im_reg4, f);
1587 break;
1588 case XEN_IA64_OPTF_IDENT_MAP_REG5:
1589 optf_set_identity_mapping(&optf->mask, &optf->im_reg5, f);
1590 break;
1591 case XEN_IA64_OPTF_IDENT_MAP_REG7:
1592 optf_set_identity_mapping(&optf->mask, &optf->im_reg7, f);
1593 break;
1594 default:
1595 printk("%s: unknown opt_feature: %ld\n", __func__, f->cmd);
1596 rc = -ENOSYS;
1597 break;
1598 }
1599 return rc;
1600 }