ia64/xen-unstable

view xen/arch/ia64/xen/domain.c @ 15693:87a72ba32301

[IA64] Saner dom0 memory and cpu defaults

Some ia64 xen dom0 tweaks:
* Increase default memory allocation from 512M to 4G
* Increase default vcpu allocation from 1 to 4
* Implement a rough calculation of the maximum memory that can
safely be allocated to dom0 (see the sketch below)
* If need be, scale down requested memory allocation to fit
available memory, rather than simply panicking
* If dom0_mem=0 is specified, allocate all available mem

Signed-off-by: Jarod Wilson <jwilson@redhat.com>
author Alex Williamson <alex.williamson@hp.com>
date Wed Aug 08 20:48:11 2007 -0600 (2007-08-08)
parents 57f519c41534
children b5dbf184df6c
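As a rough illustration of the sizing heuristic summarized above (and implemented in calc_dom0_size() further down in this file), the standalone sketch below mirrors the same arithmetic. It is not part of the changeset; the 16KB page size and 2048 PTEs-per-page constants are illustrative assumptions rather than values taken from the Xen ia64 headers.

/*
 * Standalone sketch of the dom0 sizing heuristic. The constants are
 * assumptions for illustration only, not values from the Xen headers.
 */
#include <stdio.h>

#define EXAMPLE_PAGE_SIZE    (16UL * 1024)  /* assumed ia64 page size */
#define EXAMPLE_PTRS_PER_PTE 2048UL         /* assumed PAGE_SIZE / sizeof(pte) */

static unsigned long example_max_dom0_bytes(unsigned long domheap_pages)
{
    /* One p2m page holds EXAMPLE_PTRS_PER_PTE entries, one per guest page. */
    unsigned long p2m_pages = domheap_pages / EXAMPLE_PTRS_PER_PTE;
    /* ~1MB per 4GB reserved for DMA/PCI mappings, i.e. 1/4096 of the pages. */
    unsigned long spare_hv_pages = domheap_pages / 4096;

    return (domheap_pages - (p2m_pages + spare_hv_pages)) * EXAMPLE_PAGE_SIZE;
}

int main(void)
{
    unsigned long pages = 524288UL;  /* e.g. 8GB of domheap at 16KB per page */

    printf("max dom0 size: %luMB\n",
           example_max_dom0_bytes(pages) / (1024 * 1024));
    return 0;
}

With these assumed values, an 8GB domheap yields a ceiling of about 8186MB: roughly 4MB is set aside for the p2m table and 2MB as hypervisor spare, and dom0_size is clamped to that ceiling (or raised to it when dom0_mem=0 is given).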
line source
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 *
12 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
13 * VA Linux Systems Japan K.K.
14 * dom0 vp model support
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/errno.h>
21 #include <xen/sched.h>
22 #include <xen/smp.h>
23 #include <xen/delay.h>
24 #include <xen/softirq.h>
25 #include <xen/mm.h>
26 #include <xen/iocap.h>
27 #include <asm/asm-xsi-offsets.h>
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <asm/processor.h>
31 #include <xen/event.h>
32 #include <xen/console.h>
33 #include <xen/version.h>
34 #include <public/libelf.h>
35 #include <asm/pgalloc.h>
36 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
37 #include <asm/vcpu.h> /* for function declarations */
38 #include <public/xen.h>
39 #include <xen/domain.h>
40 #include <asm/vmx.h>
41 #include <asm/vmx_vcpu.h>
42 #include <asm/vmx_vpd.h>
43 #include <asm/vmx_phy_mode.h>
44 #include <asm/vhpt.h>
45 #include <asm/vcpu.h>
46 #include <asm/tlbflush.h>
47 #include <asm/regionreg.h>
48 #include <asm/dom_fw.h>
49 #include <asm/shadow.h>
50 #include <xen/guest_access.h>
51 #include <asm/tlb_track.h>
52 #include <asm/perfmon.h>
53 #include <public/vcpu.h>
55 /* dom0_size: default memory allocation for dom0 (~4GB) */
56 static unsigned long __initdata dom0_size = 4096UL*1024UL*1024UL;
58 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
59 static unsigned int __initdata dom0_max_vcpus = 4;
60 integer_param("dom0_max_vcpus", dom0_max_vcpus);
62 extern char dom0_command_line[];
64 /* forward declaration */
65 static void init_switch_stack(struct vcpu *v);
67 /* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
68 This is a Xen virtual address. */
69 DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
70 DEFINE_PER_CPU(int *, current_psr_ic_addr);
72 DEFINE_PER_CPU(struct vcpu *, fp_owner);
74 #include <xen/sched-if.h>
76 static void
77 ia64_disable_vhpt_walker(void)
78 {
79 // Disable the VHPT. Without this, ia64_new_rr7() might cause a VHPT
80 // fault because it flushes dtr[IA64_TR_VHPT].
81 // (VHPT_SIZE_LOG2 << 2) is just to avoid a
82 // Reserved Register/Field fault.
83 ia64_set_pta(VHPT_SIZE_LOG2 << 2);
84 }
86 static void flush_vtlb_for_context_switch(struct vcpu* prev, struct vcpu* next)
87 {
88 int cpu = smp_processor_id();
89 int last_vcpu_id, last_processor;
91 if (!is_idle_domain(prev->domain))
92 tlbflush_update_time
93 (&prev->domain->arch.last_vcpu[cpu].tlbflush_timestamp,
94 tlbflush_current_time());
96 if (is_idle_domain(next->domain))
97 return;
99 last_vcpu_id = next->domain->arch.last_vcpu[cpu].vcpu_id;
100 last_processor = next->arch.last_processor;
102 next->domain->arch.last_vcpu[cpu].vcpu_id = next->vcpu_id;
103 next->arch.last_processor = cpu;
105 if ((last_vcpu_id != next->vcpu_id &&
106 last_vcpu_id != INVALID_VCPU_ID) ||
107 (last_vcpu_id == next->vcpu_id &&
108 last_processor != cpu &&
109 last_processor != INVALID_PROCESSOR)) {
110 #ifdef CONFIG_XEN_IA64_TLBFLUSH_CLOCK
111 u32 last_tlbflush_timestamp =
112 next->domain->arch.last_vcpu[cpu].tlbflush_timestamp;
113 #endif
114 int vhpt_is_flushed = 0;
116 // If the vTLB implementation is changed,
117 // the following must be updated as well.
118 if (VMX_DOMAIN(next)) {
119 // currently the vTLB for a VT-i domain is per vcpu,
120 // so no flushing is needed.
121 } else if (HAS_PERVCPU_VHPT(next->domain)) {
122 // nothing to do
123 } else {
124 if (NEED_FLUSH(__get_cpu_var(vhpt_tlbflush_timestamp),
125 last_tlbflush_timestamp)) {
126 local_vhpt_flush();
127 vhpt_is_flushed = 1;
128 }
129 }
130 if (vhpt_is_flushed || NEED_FLUSH(__get_cpu_var(tlbflush_time),
131 last_tlbflush_timestamp)) {
132 local_flush_tlb_all();
133 perfc_incr(tlbflush_clock_cswitch_purge);
134 } else {
135 perfc_incr(tlbflush_clock_cswitch_skip);
136 }
137 perfc_incr(flush_vtlb_for_context_switch);
138 }
139 }
141 static void flush_cache_for_context_switch(struct vcpu *next)
142 {
143 extern cpumask_t cpu_cache_coherent_map;
144 int cpu = smp_processor_id();
146 if (is_idle_vcpu(next) ||
147 __test_and_clear_bit(cpu, &next->arch.cache_coherent_map)) {
148 if (cpu_test_and_clear(cpu, cpu_cache_coherent_map)) {
149 unsigned long flags;
150 u64 progress = 0;
151 s64 status;
153 local_irq_save(flags);
154 status = ia64_pal_cache_flush(4, 0, &progress, NULL);
155 local_irq_restore(flags);
156 if (status != 0)
157 panic_domain(NULL, "PAL_CACHE_FLUSH ERROR, "
158 "cache_type=4 status %lx", status);
159 }
160 }
161 }
163 static void lazy_fp_switch(struct vcpu *prev, struct vcpu *next)
164 {
165 /*
166 * Implement eager save, lazy restore
167 */
168 if (!is_idle_vcpu(prev)) {
169 if (VMX_DOMAIN(prev)) {
170 if (FP_PSR(prev) & IA64_PSR_MFH) {
171 __ia64_save_fpu(prev->arch._thread.fph);
172 __ia64_per_cpu_var(fp_owner) = prev;
173 }
174 } else {
175 if (PSCB(prev, hpsr_mfh)) {
176 __ia64_save_fpu(prev->arch._thread.fph);
177 __ia64_per_cpu_var(fp_owner) = prev;
178 }
179 }
180 }
182 if (!is_idle_vcpu(next)) {
183 if (VMX_DOMAIN(next)) {
184 FP_PSR(next) = IA64_PSR_DFH;
185 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
186 } else {
187 PSCB(next, hpsr_dfh) = 1;
188 PSCB(next, hpsr_mfh) = 0;
189 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
190 }
191 }
192 }
194 void schedule_tail(struct vcpu *prev)
195 {
196 extern char ia64_ivt;
198 context_saved(prev);
199 ia64_disable_vhpt_walker();
201 if (VMX_DOMAIN(current)) {
202 vmx_do_launch(current);
203 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
204 current->processor);
205 } else {
206 ia64_set_iva(&ia64_ivt);
207 load_region_regs(current);
208 ia64_set_pta(vcpu_pta(current));
209 vcpu_load_kernel_regs(current);
210 __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
211 shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
212 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
213 (current->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
214 migrate_timer(&current->arch.hlt_timer, current->processor);
215 }
216 flush_vtlb_for_context_switch(prev, current);
217 }
219 void context_switch(struct vcpu *prev, struct vcpu *next)
220 {
221 uint64_t spsr;
223 local_irq_save(spsr);
225 if (VMX_DOMAIN(prev)) {
226 vmx_save_state(prev);
227 if (!VMX_DOMAIN(next)) {
228 /* VMX domains can change the physical cr.dcr.
229 * Restore default to prevent leakage. */
230 ia64_setreg(_IA64_REG_CR_DCR, IA64_DEFAULT_DCR_BITS);
231 }
232 }
233 if (VMX_DOMAIN(next))
234 vmx_load_state(next);
236 ia64_disable_vhpt_walker();
237 lazy_fp_switch(prev, current);
239 if (prev->arch.dbg_used || next->arch.dbg_used) {
240 /*
241 * Load debug registers either because they are valid or to clear
242 * the previous one.
243 */
244 ia64_load_debug_regs(next->arch.dbr);
245 }
247 prev = ia64_switch_to(next);
249 /* Note: ia64_switch_to does not return here at vcpu initialization. */
251 if (VMX_DOMAIN(current)) {
252 vmx_load_all_rr(current);
253 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
254 current->processor);
255 } else {
256 struct domain *nd;
257 extern char ia64_ivt;
259 ia64_set_iva(&ia64_ivt);
261 nd = current->domain;
262 if (!is_idle_domain(nd)) {
263 load_region_regs(current);
264 ia64_set_pta(vcpu_pta(current));
265 vcpu_load_kernel_regs(current);
266 if (vcpu_pkr_in_use(current))
267 vcpu_pkr_load_regs(current);
268 vcpu_set_next_timer(current);
269 if (vcpu_timer_expired(current))
270 vcpu_pend_timer(current);
271 __ia64_per_cpu_var(current_psr_i_addr) = &nd->shared_info->
272 vcpu_info[current->vcpu_id].evtchn_upcall_mask;
273 __ia64_per_cpu_var(current_psr_ic_addr) =
274 (int *)(nd->arch.shared_info_va + XSI_PSR_IC_OFS);
275 /* steal time accounting */
276 if (!guest_handle_is_null(runstate_guest(current)))
277 __copy_to_guest(runstate_guest(current), &current->runstate, 1);
278 } else {
279 /* When switching to the idle domain, we only need to disable the vhpt
280 * walker. All accesses that happen within the idle context will then
281 * be handled by TR mappings and the identity mapping.
282 */
283 __ia64_per_cpu_var(current_psr_i_addr) = NULL;
284 __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
285 }
286 }
287 local_irq_restore(spsr);
289 /* lazy fp */
290 if (current->processor != current->arch.last_processor) {
291 unsigned long *addr;
292 addr = (unsigned long *)per_cpu_addr(fp_owner,
293 current->arch.last_processor);
294 ia64_cmpxchg(acq, addr, current, 0, 8);
295 }
297 flush_vtlb_for_context_switch(prev, current);
298 flush_cache_for_context_switch(current);
299 context_saved(prev);
300 }
302 void continue_running(struct vcpu *same)
303 {
304 /* nothing to do */
305 }
307 #ifdef CONFIG_PERFMON
308 static int pal_halt = 1;
309 static int can_do_pal_halt = 1;
311 static int __init nohalt_setup(char * str)
312 {
313 pal_halt = can_do_pal_halt = 0;
314 return 1;
315 }
316 __setup("nohalt", nohalt_setup);
318 void
319 update_pal_halt_status(int status)
320 {
321 can_do_pal_halt = pal_halt && status;
322 }
323 #else
324 #define can_do_pal_halt (1)
325 #endif
327 static void default_idle(void)
328 {
329 local_irq_disable();
330 if ( !softirq_pending(smp_processor_id()) ) {
331 if (can_do_pal_halt)
332 safe_halt();
333 else
334 cpu_relax();
335 }
336 local_irq_enable();
337 }
339 static void continue_cpu_idle_loop(void)
340 {
341 for ( ; ; )
342 {
343 #ifdef IA64
344 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
345 #else
346 irq_stat[cpu].idle_timestamp = jiffies;
347 #endif
348 page_scrub_schedule_work();
349 while ( !softirq_pending(smp_processor_id()) )
350 default_idle();
351 raise_softirq(SCHEDULE_SOFTIRQ);
352 do_softirq();
353 }
354 }
356 void startup_cpu_idle_loop(void)
357 {
358 /* Just some sanity to ensure that the scheduler is set up okay. */
359 ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
360 raise_softirq(SCHEDULE_SOFTIRQ);
362 continue_cpu_idle_loop();
363 }
365 /* compile time test for get_order(sizeof(mapped_regs_t)) !=
366 * get_order_from_shift(XMAPPEDREGS_SHIFT)
367 */
368 #if !(((1 << (XMAPPEDREGS_SHIFT - 1)) < MAPPED_REGS_T_SIZE) && \
369 (MAPPED_REGS_T_SIZE < (1 << (XMAPPEDREGS_SHIFT + 1))))
370 # error "XMAPPEDREGS_SHIFT doesn't match sizeof(mapped_regs_t)."
371 #endif
373 void hlt_timer_fn(void *data)
374 {
375 struct vcpu *v = data;
376 vcpu_unblock(v);
377 }
379 void relinquish_vcpu_resources(struct vcpu *v)
380 {
381 if (HAS_PERVCPU_VHPT(v->domain))
382 pervcpu_vhpt_free(v);
383 if (v->arch.privregs != NULL) {
384 free_xenheap_pages(v->arch.privregs,
385 get_order_from_shift(XMAPPEDREGS_SHIFT));
386 v->arch.privregs = NULL;
387 }
388 kill_timer(&v->arch.hlt_timer);
389 }
391 struct vcpu *alloc_vcpu_struct(void)
392 {
393 struct vcpu *v;
394 struct thread_info *ti;
395 static int first_allocation = 1;
397 if (first_allocation) {
398 first_allocation = 0;
399 /* Still keep idle vcpu0 statically allocated at compile time, because
400 * some code inherited from Linux still requires it in the early phase.
401 */
402 return idle_vcpu[0];
403 }
405 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
406 return NULL;
407 memset(v, 0, sizeof(*v));
409 ti = alloc_thread_info(v);
410 /* Clear thread_info to clear some important fields, like
411 * preempt_count
412 */
413 memset(ti, 0, sizeof(struct thread_info));
414 init_switch_stack(v);
416 return v;
417 }
419 void free_vcpu_struct(struct vcpu *v)
420 {
421 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
422 }
424 int vcpu_initialise(struct vcpu *v)
425 {
426 struct domain *d = v->domain;
428 if (!is_idle_domain(d)) {
429 v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
430 v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
431 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
432 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;
434 /* Is this correct?
435 It depends on the domain's rid usage.
437 A domain may share rids among its processors (e.g. when using a
438 global VHPT). In this case, we should also share rids
439 among vcpus and the rid range should be the same.
441 However, a domain may have per-cpu rid allocation. In
442 this case we don't want to share rids among vcpus, but we may
443 do so if two vcpus are on the same cpu... */
445 v->arch.starting_rid = d->arch.starting_rid;
446 v->arch.ending_rid = d->arch.ending_rid;
447 v->arch.breakimm = d->arch.breakimm;
448 v->arch.last_processor = INVALID_PROCESSOR;
449 }
451 if (!VMX_DOMAIN(v))
452 init_timer(&v->arch.hlt_timer, hlt_timer_fn, v,
453 first_cpu(cpu_online_map));
455 return 0;
456 }
458 void vcpu_share_privregs_with_guest(struct vcpu *v)
459 {
460 struct domain *d = v->domain;
461 int i, order = get_order_from_shift(XMAPPEDREGS_SHIFT);
463 for (i = 0; i < (1 << order); i++)
464 share_xen_page_with_guest(virt_to_page(v->arch.privregs) + i,
465 d, XENSHARE_writable);
466 /*
467 * XXX IA64_XMAPPEDREGS_PADDR
468 * Assign these pages into the guest pseudo-physical address
469 * space so that dom0 can map them by gmfn.
470 * This is necessary for domain save, restore and dump-core.
471 */
472 for (i = 0; i < XMAPPEDREGS_SIZE; i += PAGE_SIZE)
473 assign_domain_page(d, IA64_XMAPPEDREGS_PADDR(v->vcpu_id) + i,
474 virt_to_maddr(v->arch.privregs + i));
475 }
477 int vcpu_late_initialise(struct vcpu *v)
478 {
479 struct domain *d = v->domain;
480 int rc, order;
482 if (HAS_PERVCPU_VHPT(d)) {
483 rc = pervcpu_vhpt_alloc(v);
484 if (rc != 0)
485 return rc;
486 }
488 /* Create privregs page. */
489 order = get_order_from_shift(XMAPPEDREGS_SHIFT);
490 v->arch.privregs = alloc_xenheap_pages(order);
491 BUG_ON(v->arch.privregs == NULL);
492 memset(v->arch.privregs, 0, 1 << XMAPPEDREGS_SHIFT);
493 vcpu_share_privregs_with_guest(v);
495 return 0;
496 }
498 void vcpu_destroy(struct vcpu *v)
499 {
500 if (v->domain->arch.is_vti)
501 vmx_relinquish_vcpu_resources(v);
502 else
503 relinquish_vcpu_resources(v);
504 }
506 static void init_switch_stack(struct vcpu *v)
507 {
508 struct pt_regs *regs = vcpu_regs (v);
509 struct switch_stack *sw = (struct switch_stack *) regs - 1;
510 extern void ia64_ret_from_clone;
512 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
513 sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
514 sw->b0 = (unsigned long) &ia64_ret_from_clone;
515 sw->ar_fpsr = FPSR_DEFAULT;
516 v->arch._thread.ksp = (unsigned long) sw - 16;
517 // Stay on the kernel stack because we may get interrupts!
518 // ia64_ret_from_clone switches to the user stack.
519 v->arch._thread.on_ustack = 0;
520 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
521 }
523 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
524 static int opt_pervcpu_vhpt = 1;
525 integer_param("pervcpu_vhpt", opt_pervcpu_vhpt);
526 #endif
528 int arch_domain_create(struct domain *d)
529 {
530 int i;
532 // the following will eventually need to be negotiated dynamically
533 d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
534 d->arch.breakimm = 0x1000;
535 for (i = 0; i < NR_CPUS; i++) {
536 d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
537 }
539 if (is_idle_domain(d))
540 return 0;
542 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
543 d->arch.has_pervcpu_vhpt = opt_pervcpu_vhpt;
544 dprintk(XENLOG_INFO, "%s:%d domain %d pervcpu_vhpt %d\n",
545 __func__, __LINE__, d->domain_id, d->arch.has_pervcpu_vhpt);
546 #endif
547 if (tlb_track_create(d) < 0)
548 goto fail_nomem1;
549 d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
550 if (d->shared_info == NULL)
551 goto fail_nomem;
552 memset(d->shared_info, 0, XSI_SIZE);
553 for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
554 share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
555 d, XENSHARE_writable);
557 /* We may also need an emulation rid for region4, though it's unlikely
558 * that a guest would issue uncacheable accesses in metaphysical mode.
559 * But keeping such info here may be saner.
560 */
561 if (!allocate_rid_range(d,0))
562 goto fail_nomem;
564 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
566 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
567 goto fail_nomem;
569 /*
570 * grant_table_create() can't fully initialize grant table for domain
571 * because it is called before arch_domain_create().
572 * Here we complete the initialization which requires p2m table.
573 */
574 spin_lock(&d->grant_table->lock);
575 for (i = 0; i < nr_grant_frames(d->grant_table); i++)
576 ia64_gnttab_create_shared_page(d, d->grant_table, i);
577 spin_unlock(&d->grant_table->lock);
579 d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
580 RANGESETF_prettyprint_hex);
582 dprintk(XENLOG_DEBUG, "arch_domain_create: domain=%p\n", d);
583 return 0;
585 fail_nomem:
586 tlb_track_destroy(d);
587 fail_nomem1:
588 if (d->arch.mm.pgd != NULL)
589 pgd_free(d->arch.mm.pgd);
590 if (d->shared_info != NULL)
591 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
592 return -ENOMEM;
593 }
595 void arch_domain_destroy(struct domain *d)
596 {
597 mm_final_teardown(d);
599 if (d->shared_info != NULL)
600 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
602 tlb_track_destroy(d);
604 /* Clear vTLB for the next domain. */
605 domain_flush_tlb_vhpt(d);
607 deallocate_rid_range(d);
608 }
610 int arch_vcpu_reset(struct vcpu *v)
611 {
612 /* FIXME: Stub for now */
613 return 0;
614 }
616 #define COPY_FPREG(dst, src) memcpy(dst, src, sizeof(struct ia64_fpreg))
618 void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
619 {
620 int i;
621 struct vcpu_tr_regs *tr = &c.nat->regs.tr;
622 struct cpu_user_regs *uregs = vcpu_regs(v);
623 int is_hvm = VMX_DOMAIN(v);
624 unsigned int rbs_size;
626 c.nat->regs.b[6] = uregs->b6;
627 c.nat->regs.b[7] = uregs->b7;
629 c.nat->regs.ar.csd = uregs->ar_csd;
630 c.nat->regs.ar.ssd = uregs->ar_ssd;
632 c.nat->regs.r[8] = uregs->r8;
633 c.nat->regs.r[9] = uregs->r9;
634 c.nat->regs.r[10] = uregs->r10;
635 c.nat->regs.r[11] = uregs->r11;
637 if (is_hvm)
638 c.nat->regs.psr = vmx_vcpu_get_psr(v);
639 else
640 c.nat->regs.psr = vcpu_get_psr(v);
642 c.nat->regs.ip = uregs->cr_iip;
643 c.nat->regs.cfm = uregs->cr_ifs;
645 c.nat->regs.ar.unat = uregs->ar_unat;
646 c.nat->regs.ar.pfs = uregs->ar_pfs;
647 c.nat->regs.ar.rsc = uregs->ar_rsc;
648 c.nat->regs.ar.rnat = uregs->ar_rnat;
649 c.nat->regs.ar.bspstore = uregs->ar_bspstore;
651 c.nat->regs.pr = uregs->pr;
652 c.nat->regs.b[0] = uregs->b0;
653 rbs_size = uregs->loadrs >> 16;
654 c.nat->regs.ar.bsp = uregs->ar_bspstore + rbs_size;
656 c.nat->regs.r[1] = uregs->r1;
657 c.nat->regs.r[12] = uregs->r12;
658 c.nat->regs.r[13] = uregs->r13;
659 c.nat->regs.ar.fpsr = uregs->ar_fpsr;
660 c.nat->regs.r[15] = uregs->r15;
662 c.nat->regs.r[14] = uregs->r14;
663 c.nat->regs.r[2] = uregs->r2;
664 c.nat->regs.r[3] = uregs->r3;
665 c.nat->regs.r[16] = uregs->r16;
666 c.nat->regs.r[17] = uregs->r17;
667 c.nat->regs.r[18] = uregs->r18;
668 c.nat->regs.r[19] = uregs->r19;
669 c.nat->regs.r[20] = uregs->r20;
670 c.nat->regs.r[21] = uregs->r21;
671 c.nat->regs.r[22] = uregs->r22;
672 c.nat->regs.r[23] = uregs->r23;
673 c.nat->regs.r[24] = uregs->r24;
674 c.nat->regs.r[25] = uregs->r25;
675 c.nat->regs.r[26] = uregs->r26;
676 c.nat->regs.r[27] = uregs->r27;
677 c.nat->regs.r[28] = uregs->r28;
678 c.nat->regs.r[29] = uregs->r29;
679 c.nat->regs.r[30] = uregs->r30;
680 c.nat->regs.r[31] = uregs->r31;
682 c.nat->regs.ar.ccv = uregs->ar_ccv;
684 COPY_FPREG(&c.nat->regs.f[6], &uregs->f6);
685 COPY_FPREG(&c.nat->regs.f[7], &uregs->f7);
686 COPY_FPREG(&c.nat->regs.f[8], &uregs->f8);
687 COPY_FPREG(&c.nat->regs.f[9], &uregs->f9);
688 COPY_FPREG(&c.nat->regs.f[10], &uregs->f10);
689 COPY_FPREG(&c.nat->regs.f[11], &uregs->f11);
691 c.nat->regs.r[4] = uregs->r4;
692 c.nat->regs.r[5] = uregs->r5;
693 c.nat->regs.r[6] = uregs->r6;
694 c.nat->regs.r[7] = uregs->r7;
696 /* FIXME: to be reordered. */
697 c.nat->regs.nats = uregs->eml_unat;
699 c.nat->regs.rbs_voff = (IA64_RBS_OFFSET / 8) % 64;
700 if (rbs_size < sizeof (c.nat->regs.rbs))
701 memcpy(c.nat->regs.rbs, (char *)v + IA64_RBS_OFFSET, rbs_size);
703 c.nat->privregs_pfn = get_gpfn_from_mfn
704 (virt_to_maddr(v->arch.privregs) >> PAGE_SHIFT);
706 for (i = 0; i < IA64_NUM_DBG_REGS; i++) {
707 vcpu_get_dbr(v, i, &c.nat->regs.dbr[i]);
708 vcpu_get_ibr(v, i, &c.nat->regs.ibr[i]);
709 }
711 for (i = 0; i < 7; i++)
712 vcpu_get_rr(v, (unsigned long)i << 61, &c.nat->regs.rr[i]);
714 /* Fill extra regs. */
715 for (i = 0; i < 8; i++) {
716 tr->itrs[i].pte = v->arch.itrs[i].pte.val;
717 tr->itrs[i].itir = v->arch.itrs[i].itir;
718 tr->itrs[i].vadr = v->arch.itrs[i].vadr;
719 tr->itrs[i].rid = v->arch.itrs[i].rid;
720 }
721 for (i = 0; i < 8; i++) {
722 tr->dtrs[i].pte = v->arch.dtrs[i].pte.val;
723 tr->dtrs[i].itir = v->arch.dtrs[i].itir;
724 tr->dtrs[i].vadr = v->arch.dtrs[i].vadr;
725 tr->dtrs[i].rid = v->arch.dtrs[i].rid;
726 }
727 c.nat->event_callback_ip = v->arch.event_callback_ip;
729 /* If PV and privregs is not set, we can't read mapped registers. */
730 if (!v->domain->arch.is_vti && v->arch.privregs == NULL)
731 return;
733 vcpu_get_dcr (v, &c.nat->regs.cr.dcr);
734 vcpu_get_iva (v, &c.nat->regs.cr.iva);
735 }
737 int arch_set_info_guest(struct vcpu *v, vcpu_guest_context_u c)
738 {
739 struct cpu_user_regs *uregs = vcpu_regs(v);
740 struct domain *d = v->domain;
741 int was_initialised = v->is_initialised;
742 unsigned int rbs_size;
743 int rc, i;
745 /* Finish vcpu initialization. */
746 if (!was_initialised) {
747 if (d->arch.is_vti)
748 rc = vmx_final_setup_guest(v);
749 else
750 rc = vcpu_late_initialise(v);
751 if (rc != 0)
752 return rc;
754 vcpu_init_regs(v);
756 v->is_initialised = 1;
757 /* Auto-online VCPU0 when it is initialised. */
758 if (v->vcpu_id == 0)
759 clear_bit(_VPF_down, &v->pause_flags);
760 }
762 if (c.nat == NULL)
763 return 0;
765 uregs->b6 = c.nat->regs.b[6];
766 uregs->b7 = c.nat->regs.b[7];
768 uregs->ar_csd = c.nat->regs.ar.csd;
769 uregs->ar_ssd = c.nat->regs.ar.ssd;
771 uregs->r8 = c.nat->regs.r[8];
772 uregs->r9 = c.nat->regs.r[9];
773 uregs->r10 = c.nat->regs.r[10];
774 uregs->r11 = c.nat->regs.r[11];
776 if (!d->arch.is_vti)
777 vcpu_set_psr(v, c.nat->regs.psr);
778 else
779 vmx_vcpu_set_psr(v, c.nat->regs.psr);
780 uregs->cr_iip = c.nat->regs.ip;
781 uregs->cr_ifs = c.nat->regs.cfm;
783 uregs->ar_unat = c.nat->regs.ar.unat;
784 uregs->ar_pfs = c.nat->regs.ar.pfs;
785 uregs->ar_rsc = c.nat->regs.ar.rsc;
786 uregs->ar_rnat = c.nat->regs.ar.rnat;
787 uregs->ar_bspstore = c.nat->regs.ar.bspstore;
789 uregs->pr = c.nat->regs.pr;
790 uregs->b0 = c.nat->regs.b[0];
791 rbs_size = c.nat->regs.ar.bsp - c.nat->regs.ar.bspstore;
792 /* Protection against crazy user code. */
793 if (!was_initialised)
794 uregs->loadrs = (rbs_size) << 16;
795 if (rbs_size == (uregs->loadrs >> 16))
796 memcpy((char *)v + IA64_RBS_OFFSET, c.nat->regs.rbs, rbs_size);
798 uregs->r1 = c.nat->regs.r[1];
799 uregs->r12 = c.nat->regs.r[12];
800 uregs->r13 = c.nat->regs.r[13];
801 uregs->ar_fpsr = c.nat->regs.ar.fpsr;
802 uregs->r15 = c.nat->regs.r[15];
804 uregs->r14 = c.nat->regs.r[14];
805 uregs->r2 = c.nat->regs.r[2];
806 uregs->r3 = c.nat->regs.r[3];
807 uregs->r16 = c.nat->regs.r[16];
808 uregs->r17 = c.nat->regs.r[17];
809 uregs->r18 = c.nat->regs.r[18];
810 uregs->r19 = c.nat->regs.r[19];
811 uregs->r20 = c.nat->regs.r[20];
812 uregs->r21 = c.nat->regs.r[21];
813 uregs->r22 = c.nat->regs.r[22];
814 uregs->r23 = c.nat->regs.r[23];
815 uregs->r24 = c.nat->regs.r[24];
816 uregs->r25 = c.nat->regs.r[25];
817 uregs->r26 = c.nat->regs.r[26];
818 uregs->r27 = c.nat->regs.r[27];
819 uregs->r28 = c.nat->regs.r[28];
820 uregs->r29 = c.nat->regs.r[29];
821 uregs->r30 = c.nat->regs.r[30];
822 uregs->r31 = c.nat->regs.r[31];
824 uregs->ar_ccv = c.nat->regs.ar.ccv;
826 COPY_FPREG(&uregs->f6, &c.nat->regs.f[6]);
827 COPY_FPREG(&uregs->f7, &c.nat->regs.f[7]);
828 COPY_FPREG(&uregs->f8, &c.nat->regs.f[8]);
829 COPY_FPREG(&uregs->f9, &c.nat->regs.f[9]);
830 COPY_FPREG(&uregs->f10, &c.nat->regs.f[10]);
831 COPY_FPREG(&uregs->f11, &c.nat->regs.f[11]);
833 uregs->r4 = c.nat->regs.r[4];
834 uregs->r5 = c.nat->regs.r[5];
835 uregs->r6 = c.nat->regs.r[6];
836 uregs->r7 = c.nat->regs.r[7];
838 /* FIXME: to be reordered and restored. */
839 /* uregs->eml_unat = c.nat->regs.nat; */
840 uregs->eml_unat = 0;
842 if (!d->arch.is_vti) {
843 /* domain runs at PL2/3 */
844 uregs->cr_ipsr = vcpu_pl_adjust(uregs->cr_ipsr,
845 IA64_PSR_CPL0_BIT);
846 uregs->ar_rsc = vcpu_pl_adjust(uregs->ar_rsc, 2);
847 }
849 for (i = 0; i < IA64_NUM_DBG_REGS; i++) {
850 vcpu_set_dbr(v, i, c.nat->regs.dbr[i]);
851 vcpu_set_ibr(v, i, c.nat->regs.ibr[i]);
852 }
854 if (c.nat->flags & VGCF_EXTRA_REGS) {
855 struct vcpu_tr_regs *tr = &c.nat->regs.tr;
857 for (i = 0; i < 8; i++) {
858 vcpu_set_itr(v, i, tr->itrs[i].pte,
859 tr->itrs[i].itir,
860 tr->itrs[i].vadr,
861 tr->itrs[i].rid);
862 }
863 for (i = 0; i < 8; i++) {
864 vcpu_set_dtr(v, i,
865 tr->dtrs[i].pte,
866 tr->dtrs[i].itir,
867 tr->dtrs[i].vadr,
868 tr->dtrs[i].rid);
869 }
870 v->arch.event_callback_ip = c.nat->event_callback_ip;
871 v->arch.iva = c.nat->regs.cr.iva;
872 }
874 return 0;
875 }
877 static void relinquish_memory(struct domain *d, struct list_head *list)
878 {
879 struct list_head *ent;
880 struct page_info *page;
881 #ifndef __ia64__
882 unsigned long x, y;
883 #endif
885 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
886 spin_lock_recursive(&d->page_alloc_lock);
887 ent = list->next;
888 while ( ent != list )
889 {
890 page = list_entry(ent, struct page_info, list);
891 /* Grab a reference to the page so it won't disappear from under us. */
892 if ( unlikely(!get_page(page, d)) )
893 {
894 /* Couldn't get a reference -- someone is freeing this page. */
895 ent = ent->next;
896 continue;
897 }
899 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
900 put_page_and_type(page);
902 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
903 put_page(page);
905 #ifndef __ia64__
906 /*
907 * Forcibly invalidate base page tables at this point to break circular
908 * 'linear page table' references. This is okay because MMU structures
909 * are not shared across domains and this domain is now dead. Thus base
910 * tables are not in use so a non-zero count means circular reference.
911 */
912 y = page->u.inuse.type_info;
913 for ( ; ; )
914 {
915 x = y;
916 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
917 (PGT_base_page_table|PGT_validated)) )
918 break;
920 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
921 if ( likely(y == x) )
922 {
923 free_page_type(page, PGT_base_page_table);
924 break;
925 }
926 }
927 #endif
929 /* Follow the list chain and /then/ potentially free the page. */
930 ent = ent->next;
931 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
932 put_page(page);
933 }
935 spin_unlock_recursive(&d->page_alloc_lock);
936 }
938 void domain_relinquish_resources(struct domain *d)
939 {
940 /* Relinquish guest resources for VT-i domain. */
941 if (d->vcpu[0] && VMX_DOMAIN(d->vcpu[0]))
942 vmx_relinquish_guest_resources(d);
944 /* Tear down shadow mode stuff. */
945 mm_teardown(d);
947 /* Relinquish every page of memory. */
948 relinquish_memory(d, &d->xenpage_list);
949 relinquish_memory(d, &d->page_list);
951 if (d->arch.is_vti && d->arch.sal_data)
952 xfree(d->arch.sal_data);
954 /* Free page used by xen oprofile buffer */
955 free_xenoprof_pages(d);
956 }
958 unsigned long
959 domain_set_shared_info_va (unsigned long va)
960 {
960 {
961 struct vcpu *v = current;
962 struct domain *d = v->domain;
964 /* Check virtual address:
965 must belong to region 7,
966 must be 64Kb aligned,
967 must not be within Xen virtual space. */
968 if ((va >> 61) != 7
969 || (va & 0xffffUL) != 0
970 || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
971 panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);
973 /* Note: this doesn't work well if other cpus are already running.
974 However this is part of the spec :-) */
975 gdprintk(XENLOG_DEBUG, "Domain set shared_info_va to 0x%016lx\n", va);
976 d->arch.shared_info_va = va;
978 VCPU(v, interrupt_mask_addr) = (unsigned char *)va +
979 INT_ENABLE_OFFSET(v);
981 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)(va + XSI_PSR_IC_OFS);
983 /* Remap the shared pages. */
984 set_one_rr (7UL << 61, PSCB(v,rrs[7]));
986 return 0;
987 }
989 /* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
990 #define SHADOW_COPY_CHUNK 1024
992 int shadow_mode_control(struct domain *d, xen_domctl_shadow_op_t *sc)
993 {
993 {
994 unsigned int op = sc->op;
995 int rc = 0;
996 int i;
997 //struct vcpu *v;
999 if (unlikely(d == current->domain)) {
1000 gdprintk(XENLOG_INFO,
1001 "Don't try to do a shadow op on yourself!\n");
1002 return -EINVAL;
1003 }
1005 domain_pause(d);
1007 switch (op)
1008 {
1009 case XEN_DOMCTL_SHADOW_OP_OFF:
1010 if (shadow_mode_enabled (d)) {
1011 u64 *bm = d->arch.shadow_bitmap;
1013 /* Flush vhpt and tlb to restore dirty bit usage. */
1014 domain_flush_tlb_vhpt(d);
1016 /* Free bitmap. */
1017 d->arch.shadow_bitmap_size = 0;
1018 d->arch.shadow_bitmap = NULL;
1019 xfree(bm);
1020 }
1021 break;
1023 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
1024 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
1025 rc = -EINVAL;
1026 break;
1028 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
1029 if (shadow_mode_enabled(d)) {
1030 rc = -EINVAL;
1031 break;
1032 }
1034 atomic64_set(&d->arch.shadow_fault_count, 0);
1035 atomic64_set(&d->arch.shadow_dirty_count, 0);
1037 d->arch.shadow_bitmap_size =
1038 ((d->arch.convmem_end >> PAGE_SHIFT) +
1039 BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
1040 d->arch.shadow_bitmap = xmalloc_array(unsigned long,
1041 d->arch.shadow_bitmap_size / BITS_PER_LONG);
1042 if (d->arch.shadow_bitmap == NULL) {
1043 d->arch.shadow_bitmap_size = 0;
1044 rc = -ENOMEM;
1045 }
1046 else {
1047 memset(d->arch.shadow_bitmap, 0,
1048 d->arch.shadow_bitmap_size / 8);
1050 /* Flush vhpt and tlb to enable dirty bit
1051 virtualization. */
1052 domain_flush_tlb_vhpt(d);
1053 }
1054 break;
1056 case XEN_DOMCTL_SHADOW_OP_CLEAN:
1057 {
1058 int nbr_bytes;
1060 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
1061 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
1063 atomic64_set(&d->arch.shadow_fault_count, 0);
1064 atomic64_set(&d->arch.shadow_dirty_count, 0);
1066 if (guest_handle_is_null(sc->dirty_bitmap) ||
1067 (d->arch.shadow_bitmap == NULL)) {
1068 rc = -EINVAL;
1069 break;
1070 }
1072 if (sc->pages > d->arch.shadow_bitmap_size)
1073 sc->pages = d->arch.shadow_bitmap_size;
1075 nbr_bytes = (sc->pages + 7) / 8;
1077 for (i = 0; i < nbr_bytes; i += SHADOW_COPY_CHUNK) {
1078 int size = (nbr_bytes - i) > SHADOW_COPY_CHUNK ?
1079 SHADOW_COPY_CHUNK : nbr_bytes - i;
1081 if (copy_to_guest_offset(
1082 sc->dirty_bitmap, i,
1083 (uint8_t *)d->arch.shadow_bitmap + i,
1084 size)) {
1085 rc = -EFAULT;
1086 break;
1087 }
1089 memset((uint8_t *)d->arch.shadow_bitmap + i, 0, size);
1090 }
1092 break;
1093 }
1095 case XEN_DOMCTL_SHADOW_OP_PEEK:
1096 {
1097 unsigned long size;
1099 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
1100 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
1102 if (guest_handle_is_null(sc->dirty_bitmap) ||
1103 (d->arch.shadow_bitmap == NULL)) {
1104 rc = -EINVAL;
1105 break;
1106 }
1108 if (sc->pages > d->arch.shadow_bitmap_size)
1109 sc->pages = d->arch.shadow_bitmap_size;
1111 size = (sc->pages + 7) / 8;
1112 if (copy_to_guest(sc->dirty_bitmap,
1113 (uint8_t *)d->arch.shadow_bitmap, size)) {
1114 rc = -EFAULT;
1115 break;
1116 }
1117 break;
1118 }
1119 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
1120 sc->mb = 0;
1121 break;
1122 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
1123 if (sc->mb > 0) {
1124 BUG();
1125 rc = -ENOMEM;
1126 }
1127 break;
1128 default:
1129 rc = -EINVAL;
1130 break;
1131 }
1133 domain_unpause(d);
1135 return rc;
1136 }
1138 // remove following line if not privifying in memory
1139 //#define HAVE_PRIVIFY_MEMORY
1140 #ifndef HAVE_PRIVIFY_MEMORY
1141 #define privify_memory(x,y) do {} while(0)
1142 #endif
1144 static void __init loaddomainelfimage(struct domain *d, struct elf_binary *elf,
1145 unsigned long phys_load_offset)
1146 {
1147 const elf_phdr *phdr;
1148 int phnum, h, filesz, memsz;
1149 unsigned long elfaddr, dom_mpaddr, dom_imva;
1150 struct page_info *p;
1152 phnum = elf_uval(elf, elf->ehdr, e_phnum);
1153 for (h = 0; h < phnum; h++) {
1154 phdr = elf_phdr_by_index(elf, h);
1155 if (!elf_phdr_is_loadable(elf, phdr))
1156 continue;
1158 filesz = elf_uval(elf, phdr, p_filesz);
1159 memsz = elf_uval(elf, phdr, p_memsz);
1160 elfaddr = (unsigned long) elf->image + elf_uval(elf, phdr, p_offset);
1161 dom_mpaddr = elf_uval(elf, phdr, p_paddr);
1162 dom_mpaddr += phys_load_offset;
1164 while (memsz > 0) {
1165 p = assign_new_domain_page(d,dom_mpaddr);
1166 BUG_ON (unlikely(p == NULL));
1167 dom_imva = __va_ul(page_to_maddr(p));
1168 if (filesz > 0) {
1169 if (filesz >= PAGE_SIZE)
1170 copy_page((void *) dom_imva,
1171 (void *) elfaddr);
1172 else {
1173 // copy partial page
1174 memcpy((void *) dom_imva,
1175 (void *) elfaddr, filesz);
1176 // zero the rest of page
1177 memset((void *) dom_imva+filesz, 0,
1178 PAGE_SIZE-filesz);
1179 }
1180 //FIXME: This test for code seems to find a lot more than objdump -x does
1181 if (elf_uval(elf, phdr, p_flags) & PF_X) {
1182 privify_memory(dom_imva,PAGE_SIZE);
1183 flush_icache_range(dom_imva,
1184 dom_imva+PAGE_SIZE);
1185 }
1186 }
1187 else if (memsz > 0) {
1188 /* always zero out entire page */
1189 clear_page((void *) dom_imva);
1190 }
1191 memsz -= PAGE_SIZE;
1192 filesz -= PAGE_SIZE;
1193 elfaddr += PAGE_SIZE;
1194 dom_mpaddr += PAGE_SIZE;
1195 }
1196 }
1197 }
1199 static void __init calc_dom0_size(void)
1200 {
1201 unsigned long domheap_pages;
1202 unsigned long p2m_pages;
1203 unsigned long spare_hv_pages;
1204 unsigned long max_dom0_size;
1206 /* Estimate the maximum memory we can safely allocate for dom0
1207 * by subtracting the p2m table allocation and a chunk of memory
1208 * for DMA and PCI mappings from the available domheap pages. The
1209 * chunk for DMA, PCI, etc. is a guesstimate, as Xen doesn't seem
1210 * to have a good idea of what those requirements might be ahead
1211 * of time; it is calculated at 1MB per 4GB of system memory. */
1212 domheap_pages = avail_domheap_pages();
1213 p2m_pages = domheap_pages / PTRS_PER_PTE;
1214 spare_hv_pages = domheap_pages / 4096;
1215 max_dom0_size = (domheap_pages - (p2m_pages + spare_hv_pages))
1216 * PAGE_SIZE;
1217 printk("Maximum permitted dom0 size: %luMB\n",
1218 max_dom0_size / (1024*1024));
1220 /* validate proposed dom0_size, fix up as needed */
1221 if (dom0_size > max_dom0_size) {
1222 printk("Reducing dom0 memory allocation from %luK to %luK "
1223 "to fit available memory\n",
1224 dom0_size / 1024, max_dom0_size / 1024);
1225 dom0_size = max_dom0_size;
1226 }
1228 /* dom0_mem=0 can be passed in to give all available mem to dom0 */
1229 if (dom0_size == 0) {
1230 printk("Allocating all available memory to dom0\n");
1231 dom0_size = max_dom0_size;
1232 }
1234 /* Check dom0 size. */
1235 if (dom0_size < 4 * 1024 * 1024) {
1236 panic("dom0_mem is too small, boot aborted"
1237 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
1240 if (running_on_sim) {
1241 dom0_size = 128*1024*1024; //FIXME: Should be configurable
1242 }
1244 /* no need to allocate pages for now
1245 * pages are allocated by map_new_domain_page() via loaddomainelfimage()
1246 */
1247 }
1250 /*
1251 * Domain 0 has absolute, direct access to all devices. However,
1252 * the major point of this stub is to allow alloc_dom_mem to handle
1253 * requests with order > 0. Dom0 requires that bit set to
1254 * allocate memory for other domains.
1255 */
1256 static void __init physdev_init_dom0(struct domain *d)
1257 {
1258 if (iomem_permit_access(d, 0UL, ~0UL))
1259 BUG();
1260 if (irqs_permit_access(d, 0, NR_IRQS-1))
1261 BUG();
1262 if (ioports_permit_access(d, 0, 0xffff))
1263 BUG();
1264 }
1266 int __init construct_dom0(struct domain *d,
1267 unsigned long image_start, unsigned long image_len,
1268 unsigned long initrd_start, unsigned long initrd_len,
1269 char *cmdline)
1270 {
1271 int i, rc;
1272 start_info_t *si;
1273 dom0_vga_console_info_t *ci;
1274 struct vcpu *v = d->vcpu[0];
1275 unsigned long max_pages;
1277 struct elf_binary elf;
1278 struct elf_dom_parms parms;
1279 unsigned long p_start;
1280 unsigned long pkern_start;
1281 unsigned long pkern_entry;
1282 unsigned long pkern_end;
1283 unsigned long pinitrd_start = 0;
1284 unsigned long pstart_info;
1285 unsigned long phys_load_offset;
1286 struct page_info *start_info_page;
1287 unsigned long bp_mpa;
1288 struct ia64_boot_param *bp;
1290 //printk("construct_dom0: starting\n");
1292 /* Sanity! */
1293 BUG_ON(d != dom0);
1294 BUG_ON(d->vcpu[0] == NULL);
1295 BUG_ON(v->is_initialised);
1297 printk("*** LOADING DOMAIN 0 ***\n");
1299 calc_dom0_size();
1301 max_pages = dom0_size / PAGE_SIZE;
1302 d->max_pages = max_pages;
1303 d->tot_pages = 0;
1305 rc = elf_init(&elf, (void*)image_start, image_len);
1306 if ( rc != 0 )
1307 return rc;
1308 #ifdef VERBOSE
1309 elf_set_verbose(&elf);
1310 #endif
1311 elf_parse_binary(&elf);
1312 if (0 != (elf_xen_parse(&elf, &parms)))
1313 return rc;
1315 /*
1316 * We cannot rely on the load address in the ELF headers to
1317 * determine the meta physical address at which the image
1318 * is loaded. Patch the address to match the real one, based
1319 * on xen_pstart
1320 */
1321 phys_load_offset = xen_pstart - elf.pstart;
1322 elf.pstart += phys_load_offset;
1323 elf.pend += phys_load_offset;
1324 parms.virt_kstart += phys_load_offset;
1325 parms.virt_kend += phys_load_offset;
1326 parms.virt_entry += phys_load_offset;
1328 printk(" Dom0 kernel: %s, %s, paddr 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
1329 elf_64bit(&elf) ? "64-bit" : "32-bit",
1330 elf_msb(&elf) ? "msb" : "lsb",
1331 elf.pstart, elf.pend);
1332 if (!elf_64bit(&elf) ||
1333 elf_uval(&elf, elf.ehdr, e_machine) != EM_IA_64) {
1334 printk("Incompatible kernel binary\n");
1335 return -1;
1336 }
1338 p_start = parms.virt_base;
1339 pkern_start = parms.virt_kstart;
1340 pkern_end = parms.virt_kend;
1341 pkern_entry = parms.virt_entry;
1343 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
1345 if ( (p_start & (PAGE_SIZE-1)) != 0 )
1346 {
1347 printk("Initial guest OS must load to a page boundary.\n");
1348 return -EINVAL;
1349 }
1351 pstart_info = PAGE_ALIGN(pkern_end);
1352 if(initrd_start && initrd_len){
1353 unsigned long offset;
1355 /* The next page aligned boundary after the start info.
1356 Note: EFI_PAGE_SHIFT = 12 <= PAGE_SHIFT */
1357 pinitrd_start = pstart_info + PAGE_SIZE;
1359 if ((pinitrd_start + initrd_len - phys_load_offset) >= dom0_size)
1360 panic("%s: not enough memory assigned to dom0", __func__);
1362 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
1363 struct page_info *p;
1364 p = assign_new_domain_page(d, pinitrd_start + offset);
1365 if (p == NULL)
1366 panic("%s: can't allocate page for initrd image", __func__);
1367 if (initrd_len < offset + PAGE_SIZE)
1368 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
1369 initrd_len - offset);
1370 else
1371 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
1372 }
1373 }
1375 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
1376 " Kernel image: %lx->%lx\n"
1377 " Entry address: %lx\n"
1378 " Init. ramdisk: %lx len %lx\n"
1379 " Start info.: %lx->%lx\n",
1380 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
1381 pstart_info, pstart_info + PAGE_SIZE);
1383 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
1384 {
1385 printk("Initial guest OS requires too much space\n"
1386 "(%luMB is greater than %luMB limit)\n",
1387 (pkern_end-pkern_start)>>20,
1388 (max_pages <<PAGE_SHIFT)>>20);
1389 return -ENOMEM;
1390 }
1392 // if high 3 bits of pkern start are non-zero, error
1394 // if pkern end is after end of metaphysical memory, error
1395 // (we should be able to deal with this... later)
1397 /* Mask all upcalls... */
1398 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
1399 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
1401 if (dom0_max_vcpus == 0)
1402 dom0_max_vcpus = MAX_VIRT_CPUS;
1403 if (dom0_max_vcpus > num_online_cpus())
1404 dom0_max_vcpus = num_online_cpus();
1405 if (dom0_max_vcpus > MAX_VIRT_CPUS)
1406 dom0_max_vcpus = MAX_VIRT_CPUS;
1408 printk ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
1409 for ( i = 1; i < dom0_max_vcpus; i++ )
1410 if (alloc_vcpu(d, i, i) == NULL)
1411 panic("Cannot allocate dom0 vcpu %d\n", i);
1413 /* Copy the OS image. */
1414 loaddomainelfimage(d, &elf, phys_load_offset);
1416 BUILD_BUG_ON(sizeof(start_info_t) + sizeof(dom0_vga_console_info_t) +
1417 sizeof(struct ia64_boot_param) > PAGE_SIZE);
1419 /* Set up start info area. */
1420 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
1421 start_info_page = assign_new_domain_page(d, pstart_info);
1422 if (start_info_page == NULL)
1423 panic("can't allocate start info page");
1424 si = page_to_virt(start_info_page);
1425 clear_page(si);
1426 snprintf(si->magic, sizeof(si->magic), "xen-%i.%i-ia64",
1427 xen_major_version(), xen_minor_version());
1428 si->nr_pages = max_pages;
1429 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
1431 printk("Dom0: 0x%lx\n", (u64)dom0);
1433 v->is_initialised = 1;
1434 clear_bit(_VPF_down, &v->pause_flags);
1436 /* Build firmware.
1437 Note: the Linux kernel reserves the memory used by start_info, so there
1438 is no need to remove it from the MDT. */
1439 bp_mpa = pstart_info + sizeof(struct start_info);
1440 rc = dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
1441 if (rc != 0)
1442 return rc;
1444 /* Fill boot param. */
1445 strlcpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
1447 bp = (struct ia64_boot_param *)((unsigned char *)si +
1448 sizeof(start_info_t));
1449 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
1451 /* We assume console has reached the last line! */
1452 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
1453 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
1454 bp->console_info.orig_x = 0;
1455 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
1456 0 : bp->console_info.num_rows - 1;
1458 bp->initrd_start = pinitrd_start;
1459 bp->initrd_size = ia64_boot_param->initrd_size;
1461 ci = (dom0_vga_console_info_t *)((unsigned char *)si +
1462 sizeof(start_info_t) +
1463 sizeof(struct ia64_boot_param));
1465 if (fill_console_start_info(ci)) {
1466 si->console.dom0.info_off = sizeof(start_info_t) +
1467 sizeof(struct ia64_boot_param);
1468 si->console.dom0.info_size = sizeof(dom0_vga_console_info_t);
1469 }
1471 vcpu_init_regs (v);
1473 vcpu_regs(v)->r28 = bp_mpa;
1475 vcpu_regs (v)->cr_iip = pkern_entry;
1477 physdev_init_dom0(d);
1479 return 0;
1480 }
1482 void machine_restart(char * __unused)
1483 {
1484 console_start_sync();
1485 if (running_on_sim)
1486 printk ("machine_restart called. spinning...\n");
1487 else
1488 (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
1489 while(1);
1490 }
1492 extern void cpu_halt(void);
1494 void machine_halt(void)
1495 {
1496 console_start_sync();
1497 if (running_on_sim)
1498 printk ("machine_halt called. spinning...\n");
1499 else
1500 cpu_halt();
1501 while(1);
1502 }
1504 void sync_vcpu_execstate(struct vcpu *v)
1505 {
1506 // __ia64_save_fpu(v->arch._thread.fph);
1507 // if (VMX_DOMAIN(v))
1508 // vmx_save_state(v);
1509 // FIXME SMP: Anything else needed here for SMP?
1510 }
1512 /* This function is taken from xen/arch/x86/domain.c */
1513 long
1514 arch_do_vcpu_op(int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
1515 {
1516 long rc = 0;
1518 switch (cmd) {
1519 case VCPUOP_register_runstate_memory_area:
1520 {
1521 struct vcpu_register_runstate_memory_area area;
1522 struct vcpu_runstate_info runstate;
1524 rc = -EFAULT;
1525 if (copy_from_guest(&area, arg, 1))
1526 break;
1528 if (!guest_handle_okay(area.addr.h, 1))
1529 break;
1531 rc = 0;
1532 runstate_guest(v) = area.addr.h;
1534 if (v == current) {
1535 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
1536 } else {
1537 vcpu_runstate_get(v, &runstate);
1538 __copy_to_guest(runstate_guest(v), &runstate, 1);
1539 }
1541 break;
1542 }
1543 default:
1544 rc = -ENOSYS;
1545 break;
1546 }
1548 return rc;
1549 }
1551 static void __init parse_dom0_mem(char *s)
1552 {
1553 dom0_size = parse_size_and_unit(s, NULL);
1554 }
1555 custom_param("dom0_mem", parse_dom0_mem);
1557 /*
1558 * Helper function for the optimization stuff handling the identity mapping
1559 * feature.
1560 */
1561 static inline void
1562 optf_set_identity_mapping(unsigned long* mask, struct identity_mapping* im,
1563 struct xen_ia64_opt_feature* f)
1564 {
1565 if (f->on) {
1566 *mask |= f->cmd;
1567 im->pgprot = f->pgprot;
1568 im->key = f->key;
1569 } else {
1570 *mask &= ~(f->cmd);
1571 im->pgprot = 0;
1572 im->key = 0;
1573 }
1574 }
1576 /* Switch an optimization feature on/off. */
1577 int
1578 domain_opt_feature(struct xen_ia64_opt_feature* f)
1579 {
1580 struct opt_feature* optf = &(current->domain->arch.opt_feature);
1581 long rc = 0;
1583 switch (f->cmd) {
1584 case XEN_IA64_OPTF_IDENT_MAP_REG4:
1585 optf_set_identity_mapping(&optf->mask, &optf->im_reg4, f);
1586 break;
1587 case XEN_IA64_OPTF_IDENT_MAP_REG5:
1588 optf_set_identity_mapping(&optf->mask, &optf->im_reg5, f);
1589 break;
1590 case XEN_IA64_OPTF_IDENT_MAP_REG7:
1591 optf_set_identity_mapping(&optf->mask, &optf->im_reg7, f);
1592 break;
1593 default:
1594 printk("%s: unknown opt_feature: %ld\n", __func__, f->cmd);
1595 rc = -ENOSYS;
1596 break;
1597 }
1598 return rc;
1599 }