ia64/xen-unstable

view xen/arch/ia64/xen/domain.c @ 16102:9c52742f7734

[IA64] Avoid set cr.ivt when context switch if possible

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
author Alex Williamson <alex.williamson@hp.com>
date Fri Oct 12 13:56:42 2007 -0600 (2007-10-12)
parents e71c7789c2f5
children ff1f49f62204
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 *
12 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
13 * VA Linux Systems Japan K.K.
14 * dom0 vp model support
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/errno.h>
21 #include <xen/sched.h>
22 #include <xen/smp.h>
23 #include <xen/delay.h>
24 #include <xen/softirq.h>
25 #include <xen/mm.h>
26 #include <xen/iocap.h>
27 #include <asm/asm-xsi-offsets.h>
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <asm/processor.h>
31 #include <xen/event.h>
32 #include <xen/console.h>
33 #include <xen/version.h>
34 #include <public/libelf.h>
35 #include <asm/pgalloc.h>
36 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
37 #include <asm/vcpu.h> /* for function declarations */
38 #include <public/xen.h>
39 #include <xen/domain.h>
40 #include <asm/vmx.h>
41 #include <asm/vmx_vcpu.h>
42 #include <asm/vmx_vpd.h>
43 #include <asm/vmx_phy_mode.h>
44 #include <asm/vhpt.h>
45 #include <asm/vcpu.h>
46 #include <asm/tlbflush.h>
47 #include <asm/regionreg.h>
48 #include <asm/dom_fw.h>
49 #include <asm/shadow.h>
50 #include <xen/guest_access.h>
51 #include <asm/tlb_track.h>
52 #include <asm/perfmon.h>
53 #include <asm/sal.h>
54 #include <public/vcpu.h>
55 #include <linux/cpu.h>
56 #include <linux/notifier.h>
58 /* dom0_size: default memory allocation for dom0 (~4GB) */
59 static unsigned long __initdata dom0_size = 4096UL*1024UL*1024UL;
61 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
62 static unsigned int __initdata dom0_max_vcpus = 4;
63 integer_param("dom0_max_vcpus", dom0_max_vcpus);
65 extern char dom0_command_line[];
67 /* forward declaration */
68 static void init_switch_stack(struct vcpu *v);
70 /* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
71 This is a Xen virtual address. */
72 DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
73 DEFINE_PER_CPU(int *, current_psr_ic_addr);
75 DEFINE_PER_CPU(struct vcpu *, fp_owner);
77 #include <xen/sched-if.h>
79 static void
80 ia64_disable_vhpt_walker(void)
81 {
82 // Disable the VHPT. Without this, ia64_new_rr7() might cause a
83 // VHPT fault because it flushes dtr[IA64_TR_VHPT].
84 // (VHPT_SIZE_LOG2 << 2) is just to avoid a
85 // Reserved Register/Field fault.
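// (Note: in cr.pta, ve is bit 0 and the size field occupies bits 7:2,
// so VHPT_SIZE_LOG2 << 2 programs a legal size while leaving the
// hardware walker disabled.)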
86 ia64_set_pta(VHPT_SIZE_LOG2 << 2);
87 }
89 static void flush_vtlb_for_context_switch(struct vcpu* prev, struct vcpu* next)
90 {
91 int cpu = smp_processor_id();
92 int last_vcpu_id, last_processor;
94 if (!is_idle_domain(prev->domain))
95 tlbflush_update_time
96 (&prev->domain->arch.last_vcpu[cpu].tlbflush_timestamp,
97 tlbflush_current_time());
99 if (is_idle_domain(next->domain))
100 return;
102 last_vcpu_id = next->domain->arch.last_vcpu[cpu].vcpu_id;
103 last_processor = next->arch.last_processor;
105 next->domain->arch.last_vcpu[cpu].vcpu_id = next->vcpu_id;
106 next->arch.last_processor = cpu;
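// Stale translations can only be left behind if a different vcpu of
// this domain ran on this cpu last, or if this vcpu last ran on a
// different physical cpu; only then is any flushing considered.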
108 if ((last_vcpu_id != next->vcpu_id &&
109 last_vcpu_id != INVALID_VCPU_ID) ||
110 (last_vcpu_id == next->vcpu_id &&
111 last_processor != cpu &&
112 last_processor != INVALID_PROCESSOR)) {
113 #ifdef CONFIG_XEN_IA64_TLBFLUSH_CLOCK
114 u32 last_tlbflush_timestamp =
115 next->domain->arch.last_vcpu[cpu].tlbflush_timestamp;
116 #endif
117 int vhpt_is_flushed = 0;
119 // If the vTLB implementation is changed,
120 // the following must be updated as well.
121 if (VMX_DOMAIN(next)) {
122 // Currently the vTLB for a VT-i domain is per vcpu,
123 // so no flushing is needed.
124 } else if (HAS_PERVCPU_VHPT(next->domain)) {
125 // nothing to do
126 } else {
127 if (NEED_FLUSH(__get_cpu_var(vhpt_tlbflush_timestamp),
128 last_tlbflush_timestamp)) {
129 local_vhpt_flush();
130 vhpt_is_flushed = 1;
131 }
132 }
133 if (vhpt_is_flushed || NEED_FLUSH(__get_cpu_var(tlbflush_time),
134 last_tlbflush_timestamp)) {
135 local_flush_tlb_all();
136 perfc_incr(tlbflush_clock_cswitch_purge);
137 } else {
138 perfc_incr(tlbflush_clock_cswitch_skip);
139 }
140 perfc_incr(flush_vtlb_for_context_switch);
141 }
142 }
144 static void flush_cache_for_context_switch(struct vcpu *next)
145 {
146 extern cpumask_t cpu_cache_coherent_map;
147 int cpu = smp_processor_id();
149 if (is_idle_vcpu(next) ||
150 __test_and_clear_bit(cpu, &next->arch.cache_coherent_map)) {
151 if (cpu_test_and_clear(cpu, cpu_cache_coherent_map)) {
152 unsigned long flags;
153 u64 progress = 0;
154 s64 status;
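// PAL_CACHE_FLUSH with cache_type 4 makes the local instruction
// cache coherent with the data cache (see the PAL specification).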
156 local_irq_save(flags);
157 status = ia64_pal_cache_flush(4, 0, &progress, NULL);
158 local_irq_restore(flags);
159 if (status != 0)
160 panic_domain(NULL, "PAL_CACHE_FLUSH ERROR, "
161 "cache_type=4 status %lx", status);
162 }
163 }
164 }
166 static void lazy_fp_switch(struct vcpu *prev, struct vcpu *next)
167 {
168 /*
169 * Implement eager save, lazy restore
170 */
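// The modified-fph bit (mfh) tells whether prev actually touched the
// high FP partition; if so, its fph is saved now and this cpu is
// recorded as fp_owner. next is marked dfh so that its first FP
// access faults and triggers the lazy restore.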
171 if (!is_idle_vcpu(prev)) {
172 if (VMX_DOMAIN(prev)) {
173 if (FP_PSR(prev) & IA64_PSR_MFH) {
174 __ia64_save_fpu(prev->arch._thread.fph);
175 __ia64_per_cpu_var(fp_owner) = prev;
176 }
177 } else {
178 if (PSCB(prev, hpsr_mfh)) {
179 __ia64_save_fpu(prev->arch._thread.fph);
180 __ia64_per_cpu_var(fp_owner) = prev;
181 }
182 }
183 }
185 if (!is_idle_vcpu(next)) {
186 if (VMX_DOMAIN(next)) {
187 FP_PSR(next) = IA64_PSR_DFH;
188 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
189 } else {
190 PSCB(next, hpsr_dfh) = 1;
191 PSCB(next, hpsr_mfh) = 0;
192 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
193 }
194 }
195 }
197 void schedule_tail(struct vcpu *prev)
198 {
199 extern char ia64_ivt;
201 context_saved(prev);
202 ia64_disable_vhpt_walker();
204 if (VMX_DOMAIN(current)) {
205 vmx_do_launch(current);
206 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
207 current->processor);
208 } else {
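// cr.iva only has to be pointed back at Xen's ivt if the previous
// vcpu was a VT-i domain running with its own vector table; this is
// the "avoid setting cr.ivt" optimization of this changeset.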
209 if (VMX_DOMAIN(prev))
210 ia64_set_iva(&ia64_ivt);
211 load_region_regs(current);
212 ia64_set_pta(vcpu_pta(current));
213 vcpu_load_kernel_regs(current);
214 __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
215 shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
216 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
217 (current->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
218 migrate_timer(&current->arch.hlt_timer, current->processor);
219 }
220 flush_vtlb_for_context_switch(prev, current);
221 }
223 void context_switch(struct vcpu *prev, struct vcpu *next)
224 {
225 uint64_t spsr;
227 local_irq_save(spsr);
229 if (VMX_DOMAIN(prev)) {
230 vmx_save_state(prev);
231 if (!VMX_DOMAIN(next)) {
232 /* VMX domains can change the physical cr.dcr.
233 * Restore default to prevent leakage. */
234 ia64_setreg(_IA64_REG_CR_DCR, IA64_DEFAULT_DCR_BITS);
235 }
236 }
237 if (VMX_DOMAIN(next))
238 vmx_load_state(next);
240 ia64_disable_vhpt_walker();
241 lazy_fp_switch(prev, current);
243 if (prev->arch.dbg_used || next->arch.dbg_used) {
244 /*
245 * Load debug registers either because they are valid or to clear
246 * the previous one.
247 */
248 ia64_load_debug_regs(next->arch.dbr);
249 }
251 prev = ia64_switch_to(next);
253 /* Note: ia64_switch_to does not return here at vcpu initialization. */
255 if (VMX_DOMAIN(current)) {
256 vmx_load_all_rr(current);
257 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
258 current->processor);
259 } else {
260 struct domain *nd;
261 extern char ia64_ivt;
263 if (VMX_DOMAIN(prev))
264 ia64_set_iva(&ia64_ivt);
266 nd = current->domain;
267 if (!is_idle_domain(nd)) {
268 load_region_regs(current);
269 ia64_set_pta(vcpu_pta(current));
270 vcpu_load_kernel_regs(current);
271 if (vcpu_pkr_in_use(current))
272 vcpu_pkr_load_regs(current);
273 vcpu_set_next_timer(current);
274 if (vcpu_timer_expired(current))
275 vcpu_pend_timer(current);
276 __ia64_per_cpu_var(current_psr_i_addr) = &nd->shared_info->
277 vcpu_info[current->vcpu_id].evtchn_upcall_mask;
278 __ia64_per_cpu_var(current_psr_ic_addr) =
279 (int *)(nd->arch.shared_info_va + XSI_PSR_IC_OFS);
280 /* steal time accounting */
281 if (!guest_handle_is_null(runstate_guest(current)))
282 __copy_to_guest(runstate_guest(current), &current->runstate, 1);
283 } else {
284 /* When switching to the idle domain, we only need to disable the
285 * vhpt walker. All accesses made within the idle context are then
286 * handled by TR mappings and the identity mapping.
287 */
288 __ia64_per_cpu_var(current_psr_i_addr) = NULL;
289 __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
290 }
291 }
292 local_irq_restore(spsr);
294 /* lazy fp */
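// The vcpu has migrated: if it is still recorded as fp_owner on its
// previous cpu, clear that ownership so the stale register copy
// there is never treated as current.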
295 if (current->processor != current->arch.last_processor) {
296 unsigned long *addr;
297 addr = (unsigned long *)per_cpu_addr(fp_owner,
298 current->arch.last_processor);
299 ia64_cmpxchg(acq, addr, current, 0, 8);
300 }
302 flush_vtlb_for_context_switch(prev, current);
303 flush_cache_for_context_switch(current);
304 context_saved(prev);
305 }
307 void continue_running(struct vcpu *same)
308 {
309 /* nothing to do */
310 }
312 #ifdef CONFIG_PERFMON
313 static int pal_halt = 1;
314 static int can_do_pal_halt = 1;
316 static int __init nohalt_setup(char * str)
317 {
318 pal_halt = can_do_pal_halt = 0;
319 return 1;
320 }
321 __setup("nohalt", nohalt_setup);
323 void
324 update_pal_halt_status(int status)
325 {
326 can_do_pal_halt = pal_halt && status;
327 }
328 #else
329 #define can_do_pal_halt (1)
330 #endif
332 static void default_idle(void)
333 {
334 local_irq_disable();
335 if ( !softirq_pending(smp_processor_id()) ) {
336 if (can_do_pal_halt)
337 safe_halt();
338 else
339 cpu_relax();
340 }
341 local_irq_enable();
342 }
344 extern void play_dead(void);
346 static void continue_cpu_idle_loop(void)
347 {
348 int cpu = smp_processor_id();
350 for ( ; ; )
351 {
352 #ifdef IA64
353 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
354 #else
355 irq_stat[cpu].idle_timestamp = jiffies;
356 #endif
357 page_scrub_schedule_work();
358 while ( !softirq_pending(cpu) )
359 default_idle();
360 raise_softirq(SCHEDULE_SOFTIRQ);
361 do_softirq();
362 if (!cpu_online(cpu))
363 play_dead();
364 }
365 }
367 void startup_cpu_idle_loop(void)
368 {
369 /* Just some sanity to ensure that the scheduler is set up okay. */
370 ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
371 raise_softirq(SCHEDULE_SOFTIRQ);
373 continue_cpu_idle_loop();
374 }
376 /* Compile-time check that get_order(sizeof(mapped_regs_t)) ==
377 * get_order_from_shift(XMAPPEDREGS_SHIFT).
378 */
379 #if !(((1 << (XMAPPEDREGS_SHIFT - 1)) < MAPPED_REGS_T_SIZE) && \
380 (MAPPED_REGS_T_SIZE < (1 << (XMAPPEDREGS_SHIFT + 1))))
381 # error "XMAPPEDREGS_SHIFT doesn't match sizeof(mapped_regs_t)."
382 #endif
384 void hlt_timer_fn(void *data)
385 {
386 struct vcpu *v = data;
387 vcpu_unblock(v);
388 }
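// Firing the hlt timer simply unblocks the vcpu; the timer itself is
// armed by the guest halt emulation and killed again in
// relinquish_vcpu_resources() below.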
390 void relinquish_vcpu_resources(struct vcpu *v)
391 {
392 if (HAS_PERVCPU_VHPT(v->domain))
393 pervcpu_vhpt_free(v);
394 if (v->arch.privregs != NULL) {
395 free_xenheap_pages(v->arch.privregs,
396 get_order_from_shift(XMAPPEDREGS_SHIFT));
397 v->arch.privregs = NULL;
398 }
399 kill_timer(&v->arch.hlt_timer);
400 }
402 struct vcpu *alloc_vcpu_struct(void)
403 {
404 struct vcpu *v;
405 struct thread_info *ti;
406 static int first_allocation = 1;
408 if (first_allocation) {
409 first_allocation = 0;
410 /* Keep idle vcpu0 statically allocated at compile time, because
411 * some code inherited from Linux still requires it early in boot.
412 */
413 return idle_vcpu[0];
414 }
416 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
417 return NULL;
418 memset(v, 0, sizeof(*v));
420 ti = alloc_thread_info(v);
421 /* Clear thread_info to reset some important fields, such as
422 * preempt_count.
423 */
424 memset(ti, 0, sizeof(struct thread_info));
425 init_switch_stack(v);
427 return v;
428 }
430 void free_vcpu_struct(struct vcpu *v)
431 {
432 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
433 }
435 int vcpu_initialise(struct vcpu *v)
436 {
437 struct domain *d = v->domain;
439 if (!is_idle_domain(d)) {
440 v->arch.metaphysical_rid_dt = d->arch.metaphysical_rid_dt;
441 v->arch.metaphysical_rid_d = d->arch.metaphysical_rid_d;
442 /* Set default values to saved_rr. */
443 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rid_dt;
444 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rid_dt;
446 /* Is this correct?
447 It depends on how the domain uses rids.
449 A domain may share rids among its processors (e.g. when it has a
450 global VHPT). In this case we should also share rids among
451 vcpus, and the rid ranges should be the same.
453 However, a domain may use per-cpu rid allocation. In that case
454 we don't want to share rids among vcpus, but we may
455 do so if two vcpus are on the same cpu... */
457 v->arch.starting_rid = d->arch.starting_rid;
458 v->arch.ending_rid = d->arch.ending_rid;
459 v->arch.breakimm = d->arch.breakimm;
460 v->arch.last_processor = INVALID_PROCESSOR;
461 v->arch.vhpt_pg_shift = PAGE_SHIFT;
462 }
464 if (!VMX_DOMAIN(v))
465 init_timer(&v->arch.hlt_timer, hlt_timer_fn, v,
466 first_cpu(cpu_online_map));
468 return 0;
469 }
471 void vcpu_share_privregs_with_guest(struct vcpu *v)
472 {
473 struct domain *d = v->domain;
474 int i, order = get_order_from_shift(XMAPPEDREGS_SHIFT);
476 for (i = 0; i < (1 << order); i++)
477 share_xen_page_with_guest(virt_to_page(v->arch.privregs) + i,
478 d, XENSHARE_writable);
479 /*
480 * XXX IA64_XMAPPEDREGS_PADDR
481 * Assign these pages into the guest pseudo-physical address
482 * space so that dom0 can map them by gmfn; this is necessary
483 * for domain save, restore and dump-core.
484 */
485 for (i = 0; i < XMAPPEDREGS_SIZE; i += PAGE_SIZE)
486 assign_domain_page(d, IA64_XMAPPEDREGS_PADDR(v->vcpu_id) + i,
487 virt_to_maddr(v->arch.privregs + i));
488 }
490 int vcpu_late_initialise(struct vcpu *v)
491 {
492 struct domain *d = v->domain;
493 int rc, order;
495 if (HAS_PERVCPU_VHPT(d)) {
496 rc = pervcpu_vhpt_alloc(v);
497 if (rc != 0)
498 return rc;
499 }
501 /* Create privregs page. */
502 order = get_order_from_shift(XMAPPEDREGS_SHIFT);
503 v->arch.privregs = alloc_xenheap_pages(order);
504 BUG_ON(v->arch.privregs == NULL);
505 memset(v->arch.privregs, 0, 1 << XMAPPEDREGS_SHIFT);
506 vcpu_share_privregs_with_guest(v);
508 return 0;
509 }
511 void vcpu_destroy(struct vcpu *v)
512 {
513 if (v->domain->arch.is_vti)
514 vmx_relinquish_vcpu_resources(v);
515 else
516 relinquish_vcpu_resources(v);
517 }
519 static void init_switch_stack(struct vcpu *v)
520 {
521 struct pt_regs *regs = vcpu_regs (v);
522 struct switch_stack *sw = (struct switch_stack *) regs - 1;
523 extern void ia64_ret_from_clone;
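// The vcpu structure sits at the base of its own stack allocation
// (see alloc_vcpu_struct), so the RSE backing store can live at a
// fixed offset, IA64_RBS_OFFSET, above it.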
525 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
526 sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
527 sw->b0 = (unsigned long) &ia64_ret_from_clone;
528 sw->ar_fpsr = FPSR_DEFAULT;
529 v->arch._thread.ksp = (unsigned long) sw - 16;
530 // Stay on the kernel stack because we may get interrupts!
531 // ia64_ret_from_clone switches to the user stack.
532 v->arch._thread.on_ustack = 0;
533 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
534 }
536 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
537 static int opt_pervcpu_vhpt = 1;
538 integer_param("pervcpu_vhpt", opt_pervcpu_vhpt);
539 #endif
541 int arch_domain_create(struct domain *d)
542 {
543 int i;
545 // the following will eventually need to be negotiated dynamically
546 d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
547 d->arch.breakimm = 0x1000;
548 for (i = 0; i < NR_CPUS; i++) {
549 d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
550 }
552 if (is_idle_domain(d))
553 return 0;
555 foreign_p2m_init(d);
556 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
557 d->arch.has_pervcpu_vhpt = opt_pervcpu_vhpt;
558 dprintk(XENLOG_INFO, "%s:%d domain %d pervcpu_vhpt %d\n",
559 __func__, __LINE__, d->domain_id, d->arch.has_pervcpu_vhpt);
560 #endif
561 if (tlb_track_create(d) < 0)
562 goto fail_nomem1;
563 d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
564 if (d->shared_info == NULL)
565 goto fail_nomem;
566 memset(d->shared_info, 0, XSI_SIZE);
567 for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
568 share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
569 d, XENSHARE_writable);
571 /* We may also need an emulation rid for region 4, though it is
572 * unlikely that a guest issues uncacheable accesses in metaphysical
573 * mode. But keeping such info here may be saner.
574 */
575 if (!allocate_rid_range(d,0))
576 goto fail_nomem;
578 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
579 d->arch.mm_teardown_offset = 0;
581 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
582 goto fail_nomem;
584 /*
585 * grant_table_create() can't fully initialize the grant table for a
586 * domain because it is called before arch_domain_create().
587 * Here we complete the initialization that requires the p2m table.
588 */
589 spin_lock(&d->grant_table->lock);
590 for (i = 0; i < nr_grant_frames(d->grant_table); i++)
591 ia64_gnttab_create_shared_page(d, d->grant_table, i);
592 spin_unlock(&d->grant_table->lock);
594 d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
595 RANGESETF_prettyprint_hex);
597 dprintk(XENLOG_DEBUG, "arch_domain_create: domain=%p\n", d);
598 return 0;
600 fail_nomem:
601 tlb_track_destroy(d);
602 fail_nomem1:
603 if (d->arch.mm.pgd != NULL)
604 pgd_free(d->arch.mm.pgd);
605 if (d->shared_info != NULL)
606 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
607 return -ENOMEM;
608 }
610 void arch_domain_destroy(struct domain *d)
611 {
612 mm_final_teardown(d);
614 if (d->shared_info != NULL)
615 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
617 tlb_track_destroy(d);
619 /* Clear vTLB for the next domain. */
620 domain_flush_tlb_vhpt(d);
622 deallocate_rid_range(d);
623 }
625 int arch_vcpu_reset(struct vcpu *v)
626 {
627 /* FIXME: Stub for now */
628 return 0;
629 }
631 #define COPY_FPREG(dst, src) memcpy(dst, src, sizeof(struct ia64_fpreg))
633 void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
634 {
635 int i;
636 struct vcpu_tr_regs *tr = &c.nat->regs.tr;
637 struct cpu_user_regs *uregs = vcpu_regs(v);
638 int is_hvm = VMX_DOMAIN(v);
639 unsigned int rbs_size;
641 c.nat->regs.b[6] = uregs->b6;
642 c.nat->regs.b[7] = uregs->b7;
644 c.nat->regs.ar.csd = uregs->ar_csd;
645 c.nat->regs.ar.ssd = uregs->ar_ssd;
647 c.nat->regs.r[8] = uregs->r8;
648 c.nat->regs.r[9] = uregs->r9;
649 c.nat->regs.r[10] = uregs->r10;
650 c.nat->regs.r[11] = uregs->r11;
652 if (is_hvm)
653 c.nat->regs.psr = vmx_vcpu_get_psr(v);
654 else
655 c.nat->regs.psr = vcpu_get_psr(v);
657 c.nat->regs.ip = uregs->cr_iip;
658 c.nat->regs.cfm = uregs->cr_ifs;
660 c.nat->regs.ar.unat = uregs->ar_unat;
661 c.nat->regs.ar.pfs = uregs->ar_pfs;
662 c.nat->regs.ar.rsc = uregs->ar_rsc;
663 c.nat->regs.ar.rnat = uregs->ar_rnat;
664 c.nat->regs.ar.bspstore = uregs->ar_bspstore;
666 c.nat->regs.pr = uregs->pr;
667 c.nat->regs.b[0] = uregs->b0;
668 rbs_size = uregs->loadrs >> 16;
669 c.nat->regs.ar.bsp = uregs->ar_bspstore + rbs_size;
671 c.nat->regs.r[1] = uregs->r1;
672 c.nat->regs.r[12] = uregs->r12;
673 c.nat->regs.r[13] = uregs->r13;
674 c.nat->regs.ar.fpsr = uregs->ar_fpsr;
675 c.nat->regs.r[15] = uregs->r15;
677 c.nat->regs.r[14] = uregs->r14;
678 c.nat->regs.r[2] = uregs->r2;
679 c.nat->regs.r[3] = uregs->r3;
680 c.nat->regs.r[16] = uregs->r16;
681 c.nat->regs.r[17] = uregs->r17;
682 c.nat->regs.r[18] = uregs->r18;
683 c.nat->regs.r[19] = uregs->r19;
684 c.nat->regs.r[20] = uregs->r20;
685 c.nat->regs.r[21] = uregs->r21;
686 c.nat->regs.r[22] = uregs->r22;
687 c.nat->regs.r[23] = uregs->r23;
688 c.nat->regs.r[24] = uregs->r24;
689 c.nat->regs.r[25] = uregs->r25;
690 c.nat->regs.r[26] = uregs->r26;
691 c.nat->regs.r[27] = uregs->r27;
692 c.nat->regs.r[28] = uregs->r28;
693 c.nat->regs.r[29] = uregs->r29;
694 c.nat->regs.r[30] = uregs->r30;
695 c.nat->regs.r[31] = uregs->r31;
697 c.nat->regs.ar.ccv = uregs->ar_ccv;
699 COPY_FPREG(&c.nat->regs.f[6], &uregs->f6);
700 COPY_FPREG(&c.nat->regs.f[7], &uregs->f7);
701 COPY_FPREG(&c.nat->regs.f[8], &uregs->f8);
702 COPY_FPREG(&c.nat->regs.f[9], &uregs->f9);
703 COPY_FPREG(&c.nat->regs.f[10], &uregs->f10);
704 COPY_FPREG(&c.nat->regs.f[11], &uregs->f11);
706 c.nat->regs.r[4] = uregs->r4;
707 c.nat->regs.r[5] = uregs->r5;
708 c.nat->regs.r[6] = uregs->r6;
709 c.nat->regs.r[7] = uregs->r7;
711 /* FIXME: to be reordered. */
712 c.nat->regs.nats = uregs->eml_unat;
714 c.nat->regs.rbs_voff = (IA64_RBS_OFFSET / 8) % 64;
715 if (rbs_size < sizeof (c.nat->regs.rbs))
716 memcpy(c.nat->regs.rbs, (char *)v + IA64_RBS_OFFSET, rbs_size);
718 c.nat->privregs_pfn = get_gpfn_from_mfn
719 (virt_to_maddr(v->arch.privregs) >> PAGE_SHIFT);
721 for (i = 0; i < IA64_NUM_DBG_REGS; i++) {
722 vcpu_get_dbr(v, i, &c.nat->regs.dbr[i]);
723 vcpu_get_ibr(v, i, &c.nat->regs.ibr[i]);
724 }
726 for (i = 0; i < 8; i++)
727 vcpu_get_rr(v, (unsigned long)i << 61, &c.nat->regs.rr[i]);
729 /* Fill extra regs. */
730 for (i = 0;
731 (i < sizeof(tr->itrs) / sizeof(tr->itrs[0])) && i < NITRS;
732 i++) {
733 tr->itrs[i].pte = v->arch.itrs[i].pte.val;
734 tr->itrs[i].itir = v->arch.itrs[i].itir;
735 tr->itrs[i].vadr = v->arch.itrs[i].vadr;
736 tr->itrs[i].rid = v->arch.itrs[i].rid;
737 }
738 for (i = 0;
739 (i < sizeof(tr->dtrs) / sizeof(tr->dtrs[0])) && i < NDTRS;
740 i++) {
741 tr->dtrs[i].pte = v->arch.dtrs[i].pte.val;
742 tr->dtrs[i].itir = v->arch.dtrs[i].itir;
743 tr->dtrs[i].vadr = v->arch.dtrs[i].vadr;
744 tr->dtrs[i].rid = v->arch.dtrs[i].rid;
745 }
746 c.nat->event_callback_ip = v->arch.event_callback_ip;
748 /* If PV and privregs is not set, we can't read mapped registers. */
749 if (!v->domain->arch.is_vti && v->arch.privregs == NULL)
750 return;
752 vcpu_get_dcr(v, &c.nat->regs.cr.dcr);
754 c.nat->regs.cr.itm = v->domain->arch.is_vti ?
755 vmx_vcpu_get_itm(v) : PSCBX(v, domain_itm);
756 vcpu_get_iva(v, &c.nat->regs.cr.iva);
757 vcpu_get_pta(v, &c.nat->regs.cr.pta);
759 vcpu_get_ipsr(v, &c.nat->regs.cr.ipsr);
760 vcpu_get_isr(v, &c.nat->regs.cr.isr);
761 vcpu_get_iip(v, &c.nat->regs.cr.iip);
762 vcpu_get_ifa(v, &c.nat->regs.cr.ifa);
763 vcpu_get_itir(v, &c.nat->regs.cr.itir);
764 vcpu_get_iha(v, &c.nat->regs.cr.iha);
765 vcpu_get_ivr(v, &c.nat->regs.cr.ivr);
766 vcpu_get_iim(v, &c.nat->regs.cr.iim);
768 vcpu_get_tpr(v, &c.nat->regs.cr.tpr);
769 vcpu_get_irr0(v, &c.nat->regs.cr.irr[0]);
770 vcpu_get_irr1(v, &c.nat->regs.cr.irr[1]);
771 vcpu_get_irr2(v, &c.nat->regs.cr.irr[2]);
772 vcpu_get_irr3(v, &c.nat->regs.cr.irr[3]);
773 vcpu_get_itv(v, &c.nat->regs.cr.itv);
774 vcpu_get_pmv(v, &c.nat->regs.cr.pmv);
775 vcpu_get_cmcv(v, &c.nat->regs.cr.cmcv);
776 }
778 int arch_set_info_guest(struct vcpu *v, vcpu_guest_context_u c)
779 {
780 struct cpu_user_regs *uregs = vcpu_regs(v);
781 struct domain *d = v->domain;
782 int was_initialised = v->is_initialised;
783 unsigned int rbs_size;
784 int rc, i;
786 /* Finish vcpu initialization. */
787 if (!was_initialised) {
788 if (d->arch.is_vti)
789 rc = vmx_final_setup_guest(v);
790 else
791 rc = vcpu_late_initialise(v);
792 if (rc != 0)
793 return rc;
795 vcpu_init_regs(v);
797 v->is_initialised = 1;
798 /* Auto-online VCPU0 when it is initialised. */
799 if (v->vcpu_id == 0)
800 clear_bit(_VPF_down, &v->pause_flags);
801 }
803 if (c.nat == NULL)
804 return 0;
806 uregs->b6 = c.nat->regs.b[6];
807 uregs->b7 = c.nat->regs.b[7];
809 uregs->ar_csd = c.nat->regs.ar.csd;
810 uregs->ar_ssd = c.nat->regs.ar.ssd;
812 uregs->r8 = c.nat->regs.r[8];
813 uregs->r9 = c.nat->regs.r[9];
814 uregs->r10 = c.nat->regs.r[10];
815 uregs->r11 = c.nat->regs.r[11];
817 if (!d->arch.is_vti)
818 vcpu_set_psr(v, c.nat->regs.psr);
819 else
820 vmx_vcpu_set_psr(v, c.nat->regs.psr);
821 uregs->cr_iip = c.nat->regs.ip;
822 uregs->cr_ifs = c.nat->regs.cfm;
824 uregs->ar_unat = c.nat->regs.ar.unat;
825 uregs->ar_pfs = c.nat->regs.ar.pfs;
826 uregs->ar_rsc = c.nat->regs.ar.rsc;
827 uregs->ar_rnat = c.nat->regs.ar.rnat;
828 uregs->ar_bspstore = c.nat->regs.ar.bspstore;
830 uregs->pr = c.nat->regs.pr;
831 uregs->b0 = c.nat->regs.b[0];
832 rbs_size = c.nat->regs.ar.bsp - c.nat->regs.ar.bspstore;
833 /* Protection against crazy user code. */
834 if (!was_initialised)
835 uregs->loadrs = (rbs_size) << 16;
836 if (rbs_size == (uregs->loadrs >> 16))
837 memcpy((char *)v + IA64_RBS_OFFSET, c.nat->regs.rbs, rbs_size);
839 uregs->r1 = c.nat->regs.r[1];
840 uregs->r12 = c.nat->regs.r[12];
841 uregs->r13 = c.nat->regs.r[13];
842 uregs->ar_fpsr = c.nat->regs.ar.fpsr;
843 uregs->r15 = c.nat->regs.r[15];
845 uregs->r14 = c.nat->regs.r[14];
846 uregs->r2 = c.nat->regs.r[2];
847 uregs->r3 = c.nat->regs.r[3];
848 uregs->r16 = c.nat->regs.r[16];
849 uregs->r17 = c.nat->regs.r[17];
850 uregs->r18 = c.nat->regs.r[18];
851 uregs->r19 = c.nat->regs.r[19];
852 uregs->r20 = c.nat->regs.r[20];
853 uregs->r21 = c.nat->regs.r[21];
854 uregs->r22 = c.nat->regs.r[22];
855 uregs->r23 = c.nat->regs.r[23];
856 uregs->r24 = c.nat->regs.r[24];
857 uregs->r25 = c.nat->regs.r[25];
858 uregs->r26 = c.nat->regs.r[26];
859 uregs->r27 = c.nat->regs.r[27];
860 uregs->r28 = c.nat->regs.r[28];
861 uregs->r29 = c.nat->regs.r[29];
862 uregs->r30 = c.nat->regs.r[30];
863 uregs->r31 = c.nat->regs.r[31];
865 uregs->ar_ccv = c.nat->regs.ar.ccv;
867 COPY_FPREG(&uregs->f6, &c.nat->regs.f[6]);
868 COPY_FPREG(&uregs->f7, &c.nat->regs.f[7]);
869 COPY_FPREG(&uregs->f8, &c.nat->regs.f[8]);
870 COPY_FPREG(&uregs->f9, &c.nat->regs.f[9]);
871 COPY_FPREG(&uregs->f10, &c.nat->regs.f[10]);
872 COPY_FPREG(&uregs->f11, &c.nat->regs.f[11]);
874 uregs->r4 = c.nat->regs.r[4];
875 uregs->r5 = c.nat->regs.r[5];
876 uregs->r6 = c.nat->regs.r[6];
877 uregs->r7 = c.nat->regs.r[7];
879 /* FIXME: to be reordered and restored. */
880 /* uregs->eml_unat = c.nat->regs.nat; */
881 uregs->eml_unat = 0;
883 if (!d->arch.is_vti) {
884 /* domain runs at PL2/3 */
885 uregs->cr_ipsr = vcpu_pl_adjust(uregs->cr_ipsr,
886 IA64_PSR_CPL0_BIT);
887 uregs->ar_rsc = vcpu_pl_adjust(uregs->ar_rsc, 2);
888 }
890 for (i = 0; i < IA64_NUM_DBG_REGS; i++) {
891 vcpu_set_dbr(v, i, c.nat->regs.dbr[i]);
892 vcpu_set_ibr(v, i, c.nat->regs.ibr[i]);
893 }
895 if (c.nat->flags & VGCF_EXTRA_REGS) {
896 struct vcpu_tr_regs *tr = &c.nat->regs.tr;
898 for (i = 0;
899 (i < sizeof(tr->itrs) / sizeof(tr->itrs[0])) && i < NITRS;
900 i++) {
901 vcpu_set_itr(v, i, tr->itrs[i].pte,
902 tr->itrs[i].itir,
903 tr->itrs[i].vadr,
904 tr->itrs[i].rid);
905 }
906 for (i = 0;
907 (i < sizeof(tr->dtrs) / sizeof(tr->dtrs[0])) && i < NDTRS;
908 i++) {
909 vcpu_set_dtr(v, i,
910 tr->dtrs[i].pte,
911 tr->dtrs[i].itir,
912 tr->dtrs[i].vadr,
913 tr->dtrs[i].rid);
914 }
915 v->arch.event_callback_ip = c.nat->event_callback_ip;
916 vcpu_set_iva(v, c.nat->regs.cr.iva);
917 }
919 return 0;
920 }
922 static void relinquish_memory(struct domain *d, struct list_head *list)
923 {
924 struct list_head *ent;
925 struct page_info *page;
926 #ifndef __ia64__
927 unsigned long x, y;
928 #endif
930 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
931 spin_lock_recursive(&d->page_alloc_lock);
932 ent = list->next;
933 while ( ent != list )
934 {
935 page = list_entry(ent, struct page_info, list);
936 /* Grab a reference to the page so it won't disappear from under us. */
937 if ( unlikely(!get_page(page, d)) )
938 {
939 /* Couldn't get a reference -- someone is freeing this page. */
940 ent = ent->next;
941 continue;
942 }
944 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
945 put_page_and_type(page);
947 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
948 put_page(page);
950 #ifndef __ia64__
951 /*
952 * Forcibly invalidate base page tables at this point to break circular
953 * 'linear page table' references. This is okay because MMU structures
954 * are not shared across domains and this domain is now dead. Thus base
955 * tables are not in use so a non-zero count means circular reference.
956 */
957 y = page->u.inuse.type_info;
958 for ( ; ; )
959 {
960 x = y;
961 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
962 (PGT_base_page_table|PGT_validated)) )
963 break;
965 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
966 if ( likely(y == x) )
967 {
968 free_page_type(page, PGT_base_page_table);
969 break;
970 }
971 }
972 #endif
974 /* Follow the list chain and /then/ potentially free the page. */
975 ent = ent->next;
976 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
977 put_page(page);
978 }
980 spin_unlock_recursive(&d->page_alloc_lock);
981 }
983 int domain_relinquish_resources(struct domain *d)
984 {
985 int ret;
986 /* Relinquish guest resources for VT-i domain. */
987 if (d->arch.is_vti)
988 vmx_relinquish_guest_resources(d);
990 /* Tear down shadow mode stuff. */
991 ret = mm_teardown(d);
992 if (ret != 0)
993 return ret;
995 /* Relinquish every page of memory. */
996 relinquish_memory(d, &d->xenpage_list);
997 relinquish_memory(d, &d->page_list);
999 if (d->arch.is_vti && d->arch.sal_data)
1000 xfree(d->arch.sal_data);
1002 /* Free page used by xen oprofile buffer */
1003 free_xenoprof_pages(d);
1005 return 0;
1006 }
1008 unsigned long
1009 domain_set_shared_info_va (unsigned long va)
1010 {
1011 struct vcpu *v = current;
1012 struct domain *d = v->domain;
1014 /* Check virtual address:
1015 must belong to region 7,
1016 must be 64Kb aligned,
1017 must not be within Xen virtual space. */
1018 if ((va >> 61) != 7
1019 || (va & 0xffffUL) != 0
1020 || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
1021 panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);
1023 /* Note: this doesn't work well if other cpus are already running.
1024 However this is part of the spec :-) */
1025 gdprintk(XENLOG_DEBUG, "Domain set shared_info_va to 0x%016lx\n", va);
1026 d->arch.shared_info_va = va;
1028 VCPU(v, interrupt_mask_addr) = (unsigned char *)va +
1029 INT_ENABLE_OFFSET(v);
1031 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)(va + XSI_PSR_IC_OFS);
1033 /* Remap the shared pages. */
1034 set_one_rr (7UL << 61, PSCB(v,rrs[7]));
1036 return 0;
1037 }
1039 /* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
1040 #define SHADOW_COPY_CHUNK 1024
1042 int shadow_mode_control(struct domain *d, xen_domctl_shadow_op_t *sc)
1043 {
1044 unsigned int op = sc->op;
1045 int rc = 0;
1046 int i;
1047 //struct vcpu *v;
1049 if (unlikely(d == current->domain)) {
1050 gdprintk(XENLOG_INFO,
1051 "Don't try to do a shadow op on yourself!\n");
1052 return -EINVAL;
1053 }
1055 domain_pause(d);
1057 switch (op)
1058 {
1059 case XEN_DOMCTL_SHADOW_OP_OFF:
1060 if (shadow_mode_enabled (d)) {
1061 u64 *bm = d->arch.shadow_bitmap;
1063 /* Flush vhpt and tlb to restore dirty bit usage. */
1064 domain_flush_tlb_vhpt(d);
1066 /* Free bitmap. */
1067 d->arch.shadow_bitmap_size = 0;
1068 d->arch.shadow_bitmap = NULL;
1069 xfree(bm);
1070 }
1071 break;
1073 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
1074 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
1075 rc = -EINVAL;
1076 break;
1078 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
1079 if (shadow_mode_enabled(d)) {
1080 rc = -EINVAL;
1081 break;
1082 }
1084 atomic64_set(&d->arch.shadow_fault_count, 0);
1085 atomic64_set(&d->arch.shadow_dirty_count, 0);
1087 d->arch.shadow_bitmap_size =
1088 ((d->arch.convmem_end >> PAGE_SHIFT) +
1089 BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
1090 d->arch.shadow_bitmap = xmalloc_array(unsigned long,
1091 d->arch.shadow_bitmap_size / BITS_PER_LONG);
1092 if (d->arch.shadow_bitmap == NULL) {
1093 d->arch.shadow_bitmap_size = 0;
1094 rc = -ENOMEM;
1095 }
1096 else {
1097 memset(d->arch.shadow_bitmap, 0,
1098 d->arch.shadow_bitmap_size / 8);
1100 /* Flush vhpt and tlb to enable dirty bit
1101 virtualization. */
1102 domain_flush_tlb_vhpt(d);
1103 }
1104 break;
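// The CLEAN op below copies the dirty bitmap out to the caller and
// then clears it; the PEEK op further down copies without clearing.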
1106 case XEN_DOMCTL_SHADOW_OP_CLEAN:
1107 {
1108 int nbr_bytes;
1110 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
1111 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
1113 atomic64_set(&d->arch.shadow_fault_count, 0);
1114 atomic64_set(&d->arch.shadow_dirty_count, 0);
1116 if (guest_handle_is_null(sc->dirty_bitmap) ||
1117 (d->arch.shadow_bitmap == NULL)) {
1118 rc = -EINVAL;
1119 break;
1120 }
1122 if (sc->pages > d->arch.shadow_bitmap_size)
1123 sc->pages = d->arch.shadow_bitmap_size;
1125 nbr_bytes = (sc->pages + 7) / 8;
1127 for (i = 0; i < nbr_bytes; i += SHADOW_COPY_CHUNK) {
1128 int size = (nbr_bytes - i) > SHADOW_COPY_CHUNK ?
1129 SHADOW_COPY_CHUNK : nbr_bytes - i;
1131 if (copy_to_guest_offset(
1132 sc->dirty_bitmap, i,
1133 (uint8_t *)d->arch.shadow_bitmap + i,
1134 size)) {
1135 rc = -EFAULT;
1136 break;
1137 }
1139 memset((uint8_t *)d->arch.shadow_bitmap + i, 0, size);
1140 }
1142 break;
1143 }
1145 case XEN_DOMCTL_SHADOW_OP_PEEK:
1146 {
1147 unsigned long size;
1149 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
1150 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
1152 if (guest_handle_is_null(sc->dirty_bitmap) ||
1153 (d->arch.shadow_bitmap == NULL)) {
1154 rc = -EINVAL;
1155 break;
1156 }
1158 if (sc->pages > d->arch.shadow_bitmap_size)
1159 sc->pages = d->arch.shadow_bitmap_size;
1161 size = (sc->pages + 7) / 8;
1162 if (copy_to_guest(sc->dirty_bitmap,
1163 (uint8_t *)d->arch.shadow_bitmap, size)) {
1164 rc = -EFAULT;
1165 break;
1166 }
1167 break;
1168 }
1169 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
1170 sc->mb = 0;
1171 break;
1172 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
1173 if (sc->mb > 0) {
1174 BUG();
1175 rc = -ENOMEM;
1176 }
1177 break;
1178 default:
1179 rc = -EINVAL;
1180 break;
1181 }
1183 domain_unpause(d);
1185 return rc;
1186 }
1188 // remove following line if not privifying in memory
1189 //#define HAVE_PRIVIFY_MEMORY
1190 #ifndef HAVE_PRIVIFY_MEMORY
1191 #define privify_memory(x,y) do {} while(0)
1192 #endif
1194 static void __init loaddomainelfimage(struct domain *d, struct elf_binary *elf,
1195 unsigned long phys_load_offset)
1196 {
1197 const elf_phdr *phdr;
1198 int phnum, h, filesz, memsz;
1199 unsigned long elfaddr, dom_mpaddr, dom_imva;
1200 struct page_info *p;
1202 phnum = elf_uval(elf, elf->ehdr, e_phnum);
1203 for (h = 0; h < phnum; h++) {
1204 phdr = elf_phdr_by_index(elf, h);
1205 if (!elf_phdr_is_loadable(elf, phdr))
1206 continue;
1208 filesz = elf_uval(elf, phdr, p_filesz);
1209 memsz = elf_uval(elf, phdr, p_memsz);
1210 elfaddr = (unsigned long) elf->image + elf_uval(elf, phdr, p_offset);
1211 dom_mpaddr = elf_uval(elf, phdr, p_paddr);
1212 dom_mpaddr += phys_load_offset;
1214 while (memsz > 0) {
1215 p = assign_new_domain_page(d,dom_mpaddr);
1216 BUG_ON (unlikely(p == NULL));
1217 dom_imva = __va_ul(page_to_maddr(p));
1218 if (filesz > 0) {
1219 if (filesz >= PAGE_SIZE)
1220 copy_page((void *) dom_imva,
1221 (void *) elfaddr);
1222 else {
1223 // copy partial page
1224 memcpy((void *) dom_imva,
1225 (void *) elfaddr, filesz);
1226 // zero the rest of page
1227 memset((void *) dom_imva+filesz, 0,
1228 PAGE_SIZE-filesz);
1229 }
1230 //FIXME: This test for code seems to find a lot more than objdump -x does
1231 if (elf_uval(elf, phdr, p_flags) & PF_X) {
1232 privify_memory(dom_imva,PAGE_SIZE);
1233 flush_icache_range(dom_imva,
1234 dom_imva+PAGE_SIZE);
1235 }
1236 }
1237 else if (memsz > 0) {
1238 /* always zero out entire page */
1239 clear_page((void *) dom_imva);
1240 }
1241 memsz -= PAGE_SIZE;
1242 filesz -= PAGE_SIZE;
1243 elfaddr += PAGE_SIZE;
1244 dom_mpaddr += PAGE_SIZE;
1245 }
1246 }
1247 }
1249 static void __init calc_dom0_size(void)
1250 {
1251 unsigned long domheap_pages;
1252 unsigned long p2m_pages;
1253 unsigned long spare_hv_pages;
1254 unsigned long max_dom0_size;
1256 /* Estimate maximum memory we can safely allocate for dom0
1257 * by subtracting the p2m table allocation and a chunk of memory
1258 * for DMA and PCI mapping from the available domheap pages. The
1259 * chunk for DMA, PCI, etc., is a guesstimate, as xen doesn't seem
1260 * to have a good idea of what those requirements might be ahead
1261 * of time, calculated at 1MB per 4GB of system memory */
1262 domheap_pages = avail_domheap_pages();
1263 p2m_pages = domheap_pages / PTRS_PER_PTE;
1264 spare_hv_pages = domheap_pages / 4096;
1265 max_dom0_size = (domheap_pages - (p2m_pages + spare_hv_pages))
1266 * PAGE_SIZE;
1267 printk("Maximum permitted dom0 size: %luMB\n",
1268 max_dom0_size / (1024*1024));
1270 /* validate proposed dom0_size, fix up as needed */
1271 if (dom0_size > max_dom0_size) {
1272 printk("Reducing dom0 memory allocation from %luK to %luK "
1273 "to fit available memory\n",
1274 dom0_size / 1024, max_dom0_size / 1024);
1275 dom0_size = max_dom0_size;
1276 }
1278 /* dom0_mem=0 can be passed in to give all available mem to dom0 */
1279 if (dom0_size == 0) {
1280 printk("Allocating all available memory to dom0\n");
1281 dom0_size = max_dom0_size;
1282 }
1284 /* Check dom0 size. */
1285 if (dom0_size < 4 * 1024 * 1024) {
1286 panic("dom0_mem is too small, boot aborted"
1287 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
1288 }
1290 if (running_on_sim) {
1291 dom0_size = 128*1024*1024; //FIXME: Should be configurable
1292 }
1294 /* no need to allocate pages for now
1295 * pages are allocated by map_new_domain_page() via loaddomainelfimage()
1296 */
1297 }
1300 /*
1301 * Domain 0 has unrestricted direct access to all devices. However,
1302 * the main point of this stub is to allow alloc_dom_mem to handle
1303 * requests with order > 0; dom0 requires that privilege in order to
1304 * allocate memory for other domains.
1305 */
1306 static void __init physdev_init_dom0(struct domain *d)
1307 {
1308 if (iomem_permit_access(d, 0UL, ~0UL))
1309 BUG();
1310 if (irqs_permit_access(d, 0, NR_IRQS-1))
1311 BUG();
1312 if (ioports_permit_access(d, 0, 0xffff))
1313 BUG();
1314 }
1316 int __init construct_dom0(struct domain *d,
1317 unsigned long image_start, unsigned long image_len,
1318 unsigned long initrd_start, unsigned long initrd_len,
1319 char *cmdline)
1320 {
1321 int i, rc;
1322 start_info_t *si;
1323 dom0_vga_console_info_t *ci;
1324 struct vcpu *v = d->vcpu[0];
1325 unsigned long max_pages;
1327 struct elf_binary elf;
1328 struct elf_dom_parms parms;
1329 unsigned long p_start;
1330 unsigned long pkern_start;
1331 unsigned long pkern_entry;
1332 unsigned long pkern_end;
1333 unsigned long pinitrd_start = 0;
1334 unsigned long pstart_info;
1335 unsigned long phys_load_offset;
1336 struct page_info *start_info_page;
1337 unsigned long bp_mpa;
1338 struct ia64_boot_param *bp;
1340 //printk("construct_dom0: starting\n");
1342 /* Sanity! */
1343 BUG_ON(d != dom0);
1344 BUG_ON(d->vcpu[0] == NULL);
1345 BUG_ON(v->is_initialised);
1347 printk("*** LOADING DOMAIN 0 ***\n");
1349 calc_dom0_size();
1351 max_pages = dom0_size / PAGE_SIZE;
1352 d->max_pages = max_pages;
1353 d->tot_pages = 0;
1355 rc = elf_init(&elf, (void*)image_start, image_len);
1356 if ( rc != 0 )
1357 return rc;
1358 #ifdef VERBOSE
1359 elf_set_verbose(&elf);
1360 #endif
1361 elf_parse_binary(&elf);
1362 if (0 != (elf_xen_parse(&elf, &parms)))
1363 return rc;
1365 /*
1366 * We cannot rely on the load address in the ELF headers to
1367 * determine the metaphysical address at which the image
1368 * is loaded. Patch the address to match the real one, based
1369 * on xen_pstart.
1370 */
1371 phys_load_offset = xen_pstart - elf.pstart;
1372 elf.pstart += phys_load_offset;
1373 elf.pend += phys_load_offset;
1374 parms.virt_kstart += phys_load_offset;
1375 parms.virt_kend += phys_load_offset;
1376 parms.virt_entry += phys_load_offset;
1378 printk(" Dom0 kernel: %s, %s, paddr 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
1379 elf_64bit(&elf) ? "64-bit" : "32-bit",
1380 elf_msb(&elf) ? "msb" : "lsb",
1381 elf.pstart, elf.pend);
1382 if (!elf_64bit(&elf) ||
1383 elf_uval(&elf, elf.ehdr, e_machine) != EM_IA_64) {
1384 printk("Incompatible kernel binary\n");
1385 return -1;
1386 }
1388 p_start = parms.virt_base;
1389 pkern_start = parms.virt_kstart;
1390 pkern_end = parms.virt_kend;
1391 pkern_entry = parms.virt_entry;
1393 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
1395 if ( (p_start & (PAGE_SIZE-1)) != 0 )
1396 {
1397 printk("Initial guest OS must load to a page boundary.\n");
1398 return -EINVAL;
1399 }
1401 pstart_info = PAGE_ALIGN(pkern_end);
1402 if(initrd_start && initrd_len){
1403 unsigned long offset;
1405 /* The next page aligned boundary after the start info.
1406 Note: EFI_PAGE_SHIFT = 12 <= PAGE_SHIFT */
1407 pinitrd_start = pstart_info + PAGE_SIZE;
1409 if ((pinitrd_start + initrd_len - phys_load_offset) >= dom0_size)
1410 panic("%s: not enough memory assigned to dom0", __func__);
1412 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
1413 struct page_info *p;
1414 p = assign_new_domain_page(d, pinitrd_start + offset);
1415 if (p == NULL)
1416 panic("%s: can't allocate page for initrd image", __func__);
1417 if (initrd_len < offset + PAGE_SIZE)
1418 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
1419 initrd_len - offset);
1420 else
1421 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
1422 }
1423 }
1425 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
1426 " Kernel image: %lx->%lx\n"
1427 " Entry address: %lx\n"
1428 " Init. ramdisk: %lx len %lx\n"
1429 " Start info.: %lx->%lx\n",
1430 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
1431 pstart_info, pstart_info + PAGE_SIZE);
1433 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
1434 {
1435 printk("Initial guest OS requires too much space\n"
1436 "(%luMB is greater than %luMB limit)\n",
1437 (pkern_end-pkern_start)>>20,
1438 (max_pages <<PAGE_SHIFT)>>20);
1439 return -ENOMEM;
1440 }
1442 // if high 3 bits of pkern start are non-zero, error
1444 // if pkern end is after end of metaphysical memory, error
1445 // (we should be able to deal with this... later)
1447 /* Mask all upcalls... */
1448 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
1449 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
1451 if (dom0_max_vcpus == 0)
1452 dom0_max_vcpus = MAX_VIRT_CPUS;
1453 if (dom0_max_vcpus > num_online_cpus())
1454 dom0_max_vcpus = num_online_cpus();
1455 if (dom0_max_vcpus > MAX_VIRT_CPUS)
1456 dom0_max_vcpus = MAX_VIRT_CPUS;
1458 printk ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
1459 for ( i = 1; i < dom0_max_vcpus; i++ )
1460 if (alloc_vcpu(d, i, i) == NULL)
1461 panic("Cannot allocate dom0 vcpu %d\n", i);
1463 /* Copy the OS image. */
1464 loaddomainelfimage(d, &elf, phys_load_offset);
1466 BUILD_BUG_ON(sizeof(start_info_t) + sizeof(dom0_vga_console_info_t) +
1467 sizeof(struct ia64_boot_param) > PAGE_SIZE);
1469 /* Set up start info area. */
1470 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
1471 start_info_page = assign_new_domain_page(d, pstart_info);
1472 if (start_info_page == NULL)
1473 panic("can't allocate start info page");
1474 si = page_to_virt(start_info_page);
1475 clear_page(si);
1476 snprintf(si->magic, sizeof(si->magic), "xen-%i.%i-ia64",
1477 xen_major_version(), xen_minor_version());
1478 si->nr_pages = max_pages;
1479 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
1481 printk("Dom0: 0x%lx\n", (u64)dom0);
1483 v->is_initialised = 1;
1484 clear_bit(_VPF_down, &v->pause_flags);
1486 /* Build firmware.
1487 Note: the Linux kernel reserves the memory used by start_info, so
1488 there is no need to remove it from the MDT. */
1489 bp_mpa = pstart_info + sizeof(struct start_info);
1490 rc = dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
1491 if (rc != 0)
1492 return rc;
1494 /* Fill boot param. */
1495 strlcpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
1497 bp = (struct ia64_boot_param *)((unsigned char *)si +
1498 sizeof(start_info_t));
1499 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
1501 /* We assume console has reached the last line! */
1502 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
1503 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
1504 bp->console_info.orig_x = 0;
1505 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
1506 0 : bp->console_info.num_rows - 1;
1508 bp->initrd_start = pinitrd_start;
1509 bp->initrd_size = ia64_boot_param->initrd_size;
1511 ci = (dom0_vga_console_info_t *)((unsigned char *)si +
1512 sizeof(start_info_t) +
1513 sizeof(struct ia64_boot_param));
1515 if (fill_console_start_info(ci)) {
1516 si->console.dom0.info_off = sizeof(start_info_t) +
1517 sizeof(struct ia64_boot_param);
1518 si->console.dom0.info_size = sizeof(dom0_vga_console_info_t);
1519 }
1521 vcpu_init_regs (v);
1523 vcpu_regs(v)->r28 = bp_mpa;
1525 vcpu_regs (v)->cr_iip = pkern_entry;
1527 physdev_init_dom0(d);
1529 return 0;
1530 }
1532 void machine_restart(void)
1533 {
1534 console_start_sync();
1535 if (running_on_sim)
1536 printk ("machine_restart called. spinning...\n");
1537 else
1538 (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
1539 while(1);
1540 }
1542 extern void cpu_halt(void);
1544 void machine_halt(void)
1545 {
1546 console_start_sync();
1547 if (running_on_sim)
1548 printk ("machine_halt called. spinning...\n");
1549 else
1550 cpu_halt();
1551 while(1);
1552 }
1554 void sync_vcpu_execstate(struct vcpu *v)
1555 {
1556 // __ia64_save_fpu(v->arch._thread.fph);
1557 // FIXME SMP: Anything else needed here for SMP?
1558 }
1560 /* This function is taken from xen/arch/x86/domain.c */
1561 long
1562 arch_do_vcpu_op(int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
1563 {
1564 long rc = 0;
1566 switch (cmd) {
1567 case VCPUOP_register_runstate_memory_area:
1568 {
1569 struct vcpu_register_runstate_memory_area area;
1570 struct vcpu_runstate_info runstate;
1572 rc = -EFAULT;
1573 if (copy_from_guest(&area, arg, 1))
1574 break;
1576 if (!guest_handle_okay(area.addr.h, 1))
1577 break;
1579 rc = 0;
1580 runstate_guest(v) = area.addr.h;
1582 if (v == current) {
1583 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
1584 } else {
1585 vcpu_runstate_get(v, &runstate);
1586 __copy_to_guest(runstate_guest(v), &runstate, 1);
1587 }
1589 break;
1590 }
1591 default:
1592 rc = -ENOSYS;
1593 break;
1594 }
1596 return rc;
1597 }
1599 static void __init parse_dom0_mem(char *s)
1600 {
1601 dom0_size = parse_size_and_unit(s, NULL);
1602 }
1603 custom_param("dom0_mem", parse_dom0_mem);
1605 /*
1606 * Helper function for the optimization code that handles the identity
1607 * mapping feature.
1608 */
1609 static inline void
1610 optf_set_identity_mapping(unsigned long* mask, struct identity_mapping* im,
1611 struct xen_ia64_opt_feature* f)
1612 {
1613 if (f->on) {
1614 *mask |= f->cmd;
1615 im->pgprot = f->pgprot;
1616 im->key = f->key;
1617 } else {
1618 *mask &= ~(f->cmd);
1619 im->pgprot = 0;
1620 im->key = 0;
1621 }
1622 }
1624 /* Switch an optimization feature on/off. */
1625 int
1626 domain_opt_feature(struct xen_ia64_opt_feature* f)
1627 {
1628 struct opt_feature* optf = &(current->domain->arch.opt_feature);
1629 long rc = 0;
1631 switch (f->cmd) {
1632 case XEN_IA64_OPTF_IDENT_MAP_REG4:
1633 optf_set_identity_mapping(&optf->mask, &optf->im_reg4, f);
1634 break;
1635 case XEN_IA64_OPTF_IDENT_MAP_REG5:
1636 optf_set_identity_mapping(&optf->mask, &optf->im_reg5, f);
1637 break;
1638 case XEN_IA64_OPTF_IDENT_MAP_REG7:
1639 optf_set_identity_mapping(&optf->mask, &optf->im_reg7, f);
1640 break;
1641 default:
1642 printk("%s: unknown opt_feature: %ld\n", __func__, f->cmd);
1643 rc = -ENOSYS;
1644 break;
1645 }
1646 return rc;
1647 }