ia64/xen-unstable

xen/arch/ia64/xen/domain.c @ 15423:cbf749e9961f

[IA64] Cleanup: Move is_platform_hp_ski() from xenmisc.c to xensetup.c

- only caller is start_kernel
- change to static __init
- also move running_on_sim to xensetup.c, and change it from unsigned
long to int, since it's just a boolean
- declare running_on_sim in config.h near some other externs

Tested by building, booting, starting a PV guest on rx2620.

Signed-off-by: Aron Griffis <aron@hp.com>
author Alex Williamson <alex.williamson@hp.com>
date Mon Jul 02 10:25:29 2007 -0600 (2007-07-02)
parents c7e16caf4e63
children 87b0b6a08dbd
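
For orientation, here is a minimal sketch of the shape of the cleanup described above, under the assumptions stated in the commit message (only caller is start_kernel, helper becomes static __init, running_on_sim becomes an int declared in config.h). The CPUID-based simulator check inside is_platform_hp_ski() is elided, and the start_kernel() signature shown here is illustrative rather than copied from the tree:

/* xen/include/asm-ia64/config.h: declared near the other externs. */
extern int running_on_sim;

/* xen/arch/ia64/xen/xensetup.c */
int running_on_sim;                     /* now an int; was unsigned long */

/* Only caller is start_kernel(), so the helper can be static __init. */
static int __init is_platform_hp_ski(void)
{
    /* Check for the HP Ski simulator; detection body unchanged, omitted here. */
    return 0;
}

void __init start_kernel(void)          /* illustrative signature */
{
    running_on_sim = is_platform_hp_ski();
    /* ... remainder of boot ... */
}

After the move, running_on_sim remains usable from files such as domain.c below (see alloc_dom0() and machine_restart()) through the extern declaration in config.h.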
line source
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 *
12 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
13 * VA Linux Systems Japan K.K.
14 * dom0 vp model support
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/errno.h>
21 #include <xen/sched.h>
22 #include <xen/smp.h>
23 #include <xen/delay.h>
24 #include <xen/softirq.h>
25 #include <xen/mm.h>
26 #include <xen/iocap.h>
27 #include <asm/asm-xsi-offsets.h>
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <asm/processor.h>
31 #include <xen/event.h>
32 #include <xen/console.h>
33 #include <xen/version.h>
34 #include <public/libelf.h>
35 #include <asm/pgalloc.h>
36 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
37 #include <asm/vcpu.h> /* for function declarations */
38 #include <public/xen.h>
39 #include <xen/domain.h>
40 #include <asm/vmx.h>
41 #include <asm/vmx_vcpu.h>
42 #include <asm/vmx_vpd.h>
43 #include <asm/vmx_phy_mode.h>
44 #include <asm/vhpt.h>
45 #include <asm/vcpu.h>
46 #include <asm/tlbflush.h>
47 #include <asm/regionreg.h>
48 #include <asm/dom_fw.h>
49 #include <asm/shadow.h>
50 #include <xen/guest_access.h>
51 #include <asm/tlb_track.h>
52 #include <asm/perfmon.h>
53 #include <public/vcpu.h>
55 static unsigned long __initdata dom0_size = 512*1024*1024;
57 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
58 static unsigned int __initdata dom0_max_vcpus = 1;
59 integer_param("dom0_max_vcpus", dom0_max_vcpus);
61 extern char dom0_command_line[];
63 /* forward declaration */
64 static void init_switch_stack(struct vcpu *v);
66 /* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
67 This is a Xen virtual address. */
68 DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
69 DEFINE_PER_CPU(int *, current_psr_ic_addr);
71 DEFINE_PER_CPU(struct vcpu *, fp_owner);
73 #include <xen/sched-if.h>
75 static void
76 ia64_disable_vhpt_walker(void)
77 {
78 // disable VHPT. ia64_new_rr7() might cause VHPT
79 // fault without this because it flushes dtr[IA64_TR_VHPT]
80 // (VHPT_SIZE_LOG2 << 2) is just to avoid a
81 // Reserved Register/Field fault.
82 ia64_set_pta(VHPT_SIZE_LOG2 << 2);
83 }
85 static void flush_vtlb_for_context_switch(struct vcpu* prev, struct vcpu* next)
86 {
87 int cpu = smp_processor_id();
88 int last_vcpu_id, last_processor;
90 if (!is_idle_domain(prev->domain))
91 tlbflush_update_time
92 (&prev->domain->arch.last_vcpu[cpu].tlbflush_timestamp,
93 tlbflush_current_time());
95 if (is_idle_domain(next->domain))
96 return;
98 last_vcpu_id = next->domain->arch.last_vcpu[cpu].vcpu_id;
99 last_processor = next->arch.last_processor;
101 next->domain->arch.last_vcpu[cpu].vcpu_id = next->vcpu_id;
102 next->arch.last_processor = cpu;
104 if ((last_vcpu_id != next->vcpu_id &&
105 last_vcpu_id != INVALID_VCPU_ID) ||
106 (last_vcpu_id == next->vcpu_id &&
107 last_processor != cpu &&
108 last_processor != INVALID_PROCESSOR)) {
109 #ifdef CONFIG_XEN_IA64_TLBFLUSH_CLOCK
110 u32 last_tlbflush_timestamp =
111 next->domain->arch.last_vcpu[cpu].tlbflush_timestamp;
112 #endif
113 int vhpt_is_flushed = 0;
115 // if the vTLB implementation was changed,
116 // the following must be updated as well.
117 if (VMX_DOMAIN(next)) {
118 // currently the vTLB for a VT-i domain is per vcpu,
119 // so no flushing is needed.
120 } else if (HAS_PERVCPU_VHPT(next->domain)) {
121 // nothing to do
122 } else {
123 if (NEED_FLUSH(__get_cpu_var(vhpt_tlbflush_timestamp),
124 last_tlbflush_timestamp)) {
125 local_vhpt_flush();
126 vhpt_is_flushed = 1;
127 }
128 }
129 if (vhpt_is_flushed || NEED_FLUSH(__get_cpu_var(tlbflush_time),
130 last_tlbflush_timestamp)) {
131 local_flush_tlb_all();
132 perfc_incr(tlbflush_clock_cswitch_purge);
133 } else {
134 perfc_incr(tlbflush_clock_cswitch_skip);
135 }
136 perfc_incr(flush_vtlb_for_context_switch);
137 }
138 }
140 static void flush_cache_for_context_switch(struct vcpu *next)
141 {
142 extern cpumask_t cpu_cache_coherent_map;
143 int cpu = smp_processor_id();
145 if (is_idle_vcpu(next) ||
146 __test_and_clear_bit(cpu, &next->arch.cache_coherent_map)) {
147 if (cpu_test_and_clear(cpu, cpu_cache_coherent_map)) {
148 unsigned long flags;
149 u64 progress = 0;
150 s64 status;
152 local_irq_save(flags);
153 status = ia64_pal_cache_flush(4, 0, &progress, NULL);
154 local_irq_restore(flags);
155 if (status != 0)
156 panic_domain(NULL, "PAL_CACHE_FLUSH ERROR, "
157 "cache_type=4 status %lx", status);
158 }
159 }
160 }
162 static void lazy_fp_switch(struct vcpu *prev, struct vcpu *next)
163 {
164 /*
165 * Implement eager save, lazy restore
166 */
167 if (!is_idle_vcpu(prev)) {
168 if (VMX_DOMAIN(prev)) {
169 if (FP_PSR(prev) & IA64_PSR_MFH) {
170 __ia64_save_fpu(prev->arch._thread.fph);
171 __ia64_per_cpu_var(fp_owner) = prev;
172 }
173 } else {
174 if (PSCB(prev, hpsr_mfh)) {
175 __ia64_save_fpu(prev->arch._thread.fph);
176 __ia64_per_cpu_var(fp_owner) = prev;
177 }
178 }
179 }
181 if (!is_idle_vcpu(next)) {
182 if (VMX_DOMAIN(next)) {
183 FP_PSR(next) = IA64_PSR_DFH;
184 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
185 } else {
186 PSCB(next, hpsr_dfh) = 1;
187 PSCB(next, hpsr_mfh) = 0;
188 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
189 }
190 }
191 }
193 void schedule_tail(struct vcpu *prev)
194 {
195 extern char ia64_ivt;
197 context_saved(prev);
198 ia64_disable_vhpt_walker();
200 if (VMX_DOMAIN(current)) {
201 vmx_do_launch(current);
202 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
203 current->processor);
204 } else {
205 ia64_set_iva(&ia64_ivt);
206 load_region_regs(current);
207 ia64_set_pta(vcpu_pta(current));
208 vcpu_load_kernel_regs(current);
209 __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
210 shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
211 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
212 (current->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
213 migrate_timer(&current->arch.hlt_timer, current->processor);
214 }
215 flush_vtlb_for_context_switch(prev, current);
216 }
218 void context_switch(struct vcpu *prev, struct vcpu *next)
219 {
220 uint64_t spsr;
222 local_irq_save(spsr);
224 if (VMX_DOMAIN(prev)) {
225 vmx_save_state(prev);
226 if (!VMX_DOMAIN(next)) {
227 /* VMX domains can change the physical cr.dcr.
228 * Restore default to prevent leakage. */
229 ia64_setreg(_IA64_REG_CR_DCR, IA64_DEFAULT_DCR_BITS);
230 }
231 }
232 if (VMX_DOMAIN(next))
233 vmx_load_state(next);
235 ia64_disable_vhpt_walker();
236 lazy_fp_switch(prev, current);
238 if (prev->arch.dbg_used || next->arch.dbg_used) {
239 /*
240 * Load debug registers either because they are valid or to clear
241 * the previous one.
242 */
243 ia64_load_debug_regs(next->arch.dbr);
244 }
246 prev = ia64_switch_to(next);
248 /* Note: ia64_switch_to does not return here at vcpu initialization. */
250 if (VMX_DOMAIN(current)) {
251 vmx_load_all_rr(current);
252 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
253 current->processor);
254 } else {
255 struct domain *nd;
256 extern char ia64_ivt;
258 ia64_set_iva(&ia64_ivt);
260 nd = current->domain;
261 if (!is_idle_domain(nd)) {
262 load_region_regs(current);
263 ia64_set_pta(vcpu_pta(current));
264 vcpu_load_kernel_regs(current);
265 vcpu_set_next_timer(current);
266 if (vcpu_timer_expired(current))
267 vcpu_pend_timer(current);
268 __ia64_per_cpu_var(current_psr_i_addr) = &nd->shared_info->
269 vcpu_info[current->vcpu_id].evtchn_upcall_mask;
270 __ia64_per_cpu_var(current_psr_ic_addr) =
271 (int *)(nd->arch.shared_info_va + XSI_PSR_IC_OFS);
272 /* steal time accounting */
273 if (!guest_handle_is_null(runstate_guest(current)))
274 __copy_to_guest(runstate_guest(current), &current->runstate, 1);
275 } else {
276 /* When switching to the idle domain, we only need to disable the vhpt
277 * walker. All accesses that happen within the idle context will then
278 * be handled by TR mappings and the identity mapping.
279 */
280 __ia64_per_cpu_var(current_psr_i_addr) = NULL;
281 __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
282 }
283 }
284 local_irq_restore(spsr);
286 /* lazy fp */
287 if (current->processor != current->arch.last_processor) {
288 unsigned long *addr;
289 addr = (unsigned long *)per_cpu_addr(fp_owner,
290 current->arch.last_processor);
291 ia64_cmpxchg(acq, addr, current, 0, 8);
292 }
294 flush_vtlb_for_context_switch(prev, current);
295 flush_cache_for_context_switch(current);
296 context_saved(prev);
297 }
299 void continue_running(struct vcpu *same)
300 {
301 /* nothing to do */
302 }
304 #ifdef CONFIG_PERFMON
305 static int pal_halt = 1;
306 static int can_do_pal_halt = 1;
308 static int __init nohalt_setup(char * str)
309 {
310 pal_halt = can_do_pal_halt = 0;
311 return 1;
312 }
313 __setup("nohalt", nohalt_setup);
315 void
316 update_pal_halt_status(int status)
317 {
318 can_do_pal_halt = pal_halt && status;
319 }
320 #else
321 #define can_do_pal_halt (1)
322 #endif
324 static void default_idle(void)
325 {
326 local_irq_disable();
327 if ( !softirq_pending(smp_processor_id()) ) {
328 if (can_do_pal_halt)
329 safe_halt();
330 else
331 cpu_relax();
332 }
333 local_irq_enable();
334 }
336 static void continue_cpu_idle_loop(void)
337 {
338 for ( ; ; )
339 {
340 #ifdef IA64
341 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
342 #else
343 irq_stat[cpu].idle_timestamp = jiffies;
344 #endif
345 page_scrub_schedule_work();
346 while ( !softirq_pending(smp_processor_id()) )
347 default_idle();
348 raise_softirq(SCHEDULE_SOFTIRQ);
349 do_softirq();
350 }
351 }
353 void startup_cpu_idle_loop(void)
354 {
355 /* Just some sanity to ensure that the scheduler is set up okay. */
356 ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
357 raise_softirq(SCHEDULE_SOFTIRQ);
359 continue_cpu_idle_loop();
360 }
362 /* compile time test for get_order(sizeof(mapped_regs_t)) !=
363 * get_order_from_shift(XMAPPEDREGS_SHIFT))
364 */
365 #if !(((1 << (XMAPPEDREGS_SHIFT - 1)) < MAPPED_REGS_T_SIZE) && \
366 (MAPPED_REGS_T_SIZE < (1 << (XMAPPEDREGS_SHIFT + 1))))
367 # error "XMAPPEDREGS_SHIFT doesn't match sizeof(mapped_regs_t)."
368 #endif
370 void hlt_timer_fn(void *data)
371 {
372 struct vcpu *v = data;
373 vcpu_unblock(v);
374 }
376 void relinquish_vcpu_resources(struct vcpu *v)
377 {
378 if (HAS_PERVCPU_VHPT(v->domain))
379 pervcpu_vhpt_free(v);
380 if (v->arch.privregs != NULL) {
381 free_xenheap_pages(v->arch.privregs,
382 get_order_from_shift(XMAPPEDREGS_SHIFT));
383 v->arch.privregs = NULL;
384 }
385 kill_timer(&v->arch.hlt_timer);
386 }
388 struct vcpu *alloc_vcpu_struct(void)
389 {
390 struct vcpu *v;
391 struct thread_info *ti;
392 static int first_allocation = 1;
394 if (first_allocation) {
395 first_allocation = 0;
396 /* Still keep idle vcpu0 statically allocated at compile time, because
397 * some code inherited from Linux requires it in the early phase.
398 */
399 return idle_vcpu[0];
400 }
402 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
403 return NULL;
404 memset(v, 0, sizeof(*v));
406 ti = alloc_thread_info(v);
407 /* Clear thread_info to clear some important fields, like
408 * preempt_count
409 */
410 memset(ti, 0, sizeof(struct thread_info));
411 init_switch_stack(v);
413 return v;
414 }
416 void free_vcpu_struct(struct vcpu *v)
417 {
418 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
419 }
421 int vcpu_initialise(struct vcpu *v)
422 {
423 struct domain *d = v->domain;
425 if (!is_idle_domain(d)) {
426 v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
427 v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
428 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
429 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;
431 /* Is this correct?
432 It depends on the domain's rid usage.
434 A domain may share rids among its processors (e.g. having a
435 global VHPT). In this case, we should also share rids
436 among vcpus and the rid range should be the same.
438 However, a domain may have per-cpu rid allocation. In
439 this case we don't want to share rids among vcpus, but we may
440 do it if two vcpus are on the same cpu... */
442 v->arch.starting_rid = d->arch.starting_rid;
443 v->arch.ending_rid = d->arch.ending_rid;
444 v->arch.breakimm = d->arch.breakimm;
445 v->arch.last_processor = INVALID_PROCESSOR;
446 }
448 if (!VMX_DOMAIN(v))
449 init_timer(&v->arch.hlt_timer, hlt_timer_fn, v,
450 first_cpu(cpu_online_map));
452 return 0;
453 }
455 void vcpu_share_privregs_with_guest(struct vcpu *v)
456 {
457 struct domain *d = v->domain;
458 int i, order = get_order_from_shift(XMAPPEDREGS_SHIFT);
460 for (i = 0; i < (1 << order); i++)
461 share_xen_page_with_guest(virt_to_page(v->arch.privregs) + i,
462 d, XENSHARE_writable);
463 /*
464 * XXX IA64_XMAPPEDREGS_PADDR
465 * Assign these pages into the guest pseudo-physical address
466 * space so that dom0 can map them by gmfn.
467 * This is necessary for domain save, restore and dump-core.
468 */
469 for (i = 0; i < XMAPPEDREGS_SIZE; i += PAGE_SIZE)
470 assign_domain_page(d, IA64_XMAPPEDREGS_PADDR(v->vcpu_id) + i,
471 virt_to_maddr(v->arch.privregs + i));
472 }
474 int vcpu_late_initialise(struct vcpu *v)
475 {
476 struct domain *d = v->domain;
477 int rc, order;
479 if (HAS_PERVCPU_VHPT(d)) {
480 rc = pervcpu_vhpt_alloc(v);
481 if (rc != 0)
482 return rc;
483 }
485 /* Create privregs page. */
486 order = get_order_from_shift(XMAPPEDREGS_SHIFT);
487 v->arch.privregs = alloc_xenheap_pages(order);
488 BUG_ON(v->arch.privregs == NULL);
489 memset(v->arch.privregs, 0, 1 << XMAPPEDREGS_SHIFT);
490 vcpu_share_privregs_with_guest(v);
492 return 0;
493 }
495 void vcpu_destroy(struct vcpu *v)
496 {
497 if (v->domain->arch.is_vti)
498 vmx_relinquish_vcpu_resources(v);
499 else
500 relinquish_vcpu_resources(v);
501 }
503 static void init_switch_stack(struct vcpu *v)
504 {
505 struct pt_regs *regs = vcpu_regs (v);
506 struct switch_stack *sw = (struct switch_stack *) regs - 1;
507 extern void ia64_ret_from_clone;
509 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
510 sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
511 sw->b0 = (unsigned long) &ia64_ret_from_clone;
512 sw->ar_fpsr = FPSR_DEFAULT;
513 v->arch._thread.ksp = (unsigned long) sw - 16;
514 // stay on the kernel stack because we may get interrupts!
515 // ia64_ret_from_clone switches to user stack
516 v->arch._thread.on_ustack = 0;
517 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
518 }
520 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
521 static int opt_pervcpu_vhpt = 1;
522 integer_param("pervcpu_vhpt", opt_pervcpu_vhpt);
523 #endif
525 int arch_domain_create(struct domain *d)
526 {
527 int i;
529 // the following will eventually need to be negotiated dynamically
530 d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
531 d->arch.breakimm = 0x1000;
532 for (i = 0; i < NR_CPUS; i++) {
533 d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
534 }
536 if (is_idle_domain(d))
537 return 0;
539 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
540 d->arch.has_pervcpu_vhpt = opt_pervcpu_vhpt;
541 dprintk(XENLOG_INFO, "%s:%d domain %d pervcpu_vhpt %d\n",
542 __func__, __LINE__, d->domain_id, d->arch.has_pervcpu_vhpt);
543 #endif
544 if (tlb_track_create(d) < 0)
545 goto fail_nomem1;
546 d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
547 if (d->shared_info == NULL)
548 goto fail_nomem;
549 memset(d->shared_info, 0, XSI_SIZE);
550 for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
551 share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
552 d, XENSHARE_writable);
554 /* We may also need an emulation rid for region 4, though it's unlikely
555 * that a guest will issue uncacheable accesses in metaphysical mode. But
556 * keeping such info here may be more sane.
557 */
558 if (!allocate_rid_range(d,0))
559 goto fail_nomem;
561 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
563 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
564 goto fail_nomem;
566 /*
567 * grant_table_create() can't fully initialize grant table for domain
568 * because it is called before arch_domain_create().
569 * Here we complete the initialization which requires p2m table.
570 */
571 spin_lock(&d->grant_table->lock);
572 for (i = 0; i < nr_grant_frames(d->grant_table); i++)
573 ia64_gnttab_create_shared_page(d, d->grant_table, i);
574 spin_unlock(&d->grant_table->lock);
576 d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
577 RANGESETF_prettyprint_hex);
579 dprintk(XENLOG_DEBUG, "arch_domain_create: domain=%p\n", d);
580 return 0;
582 fail_nomem:
583 tlb_track_destroy(d);
584 fail_nomem1:
585 if (d->arch.mm.pgd != NULL)
586 pgd_free(d->arch.mm.pgd);
587 if (d->shared_info != NULL)
588 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
589 return -ENOMEM;
590 }
592 void arch_domain_destroy(struct domain *d)
593 {
594 mm_final_teardown(d);
596 if (d->shared_info != NULL)
597 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
599 tlb_track_destroy(d);
601 /* Clear vTLB for the next domain. */
602 domain_flush_tlb_vhpt(d);
604 deallocate_rid_range(d);
605 }
607 int arch_vcpu_reset(struct vcpu *v)
608 {
609 /* FIXME: Stub for now */
610 return 0;
611 }
613 #define COPY_FPREG(dst, src) memcpy(dst, src, sizeof(struct ia64_fpreg))
615 void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
616 {
617 int i;
618 struct vcpu_tr_regs *tr = &c.nat->regs.tr;
619 struct cpu_user_regs *uregs = vcpu_regs(v);
620 int is_hvm = VMX_DOMAIN(v);
621 unsigned int rbs_size;
623 c.nat->regs.b[6] = uregs->b6;
624 c.nat->regs.b[7] = uregs->b7;
626 c.nat->regs.ar.csd = uregs->ar_csd;
627 c.nat->regs.ar.ssd = uregs->ar_ssd;
629 c.nat->regs.r[8] = uregs->r8;
630 c.nat->regs.r[9] = uregs->r9;
631 c.nat->regs.r[10] = uregs->r10;
632 c.nat->regs.r[11] = uregs->r11;
634 if (is_hvm)
635 c.nat->regs.psr = vmx_vcpu_get_psr(v);
636 else
637 c.nat->regs.psr = vcpu_get_psr(v);
639 c.nat->regs.ip = uregs->cr_iip;
640 c.nat->regs.cfm = uregs->cr_ifs;
642 c.nat->regs.ar.unat = uregs->ar_unat;
643 c.nat->regs.ar.pfs = uregs->ar_pfs;
644 c.nat->regs.ar.rsc = uregs->ar_rsc;
645 c.nat->regs.ar.rnat = uregs->ar_rnat;
646 c.nat->regs.ar.bspstore = uregs->ar_bspstore;
648 c.nat->regs.pr = uregs->pr;
649 c.nat->regs.b[0] = uregs->b0;
650 rbs_size = uregs->loadrs >> 16;
651 c.nat->regs.ar.bsp = uregs->ar_bspstore + rbs_size;
653 c.nat->regs.r[1] = uregs->r1;
654 c.nat->regs.r[12] = uregs->r12;
655 c.nat->regs.r[13] = uregs->r13;
656 c.nat->regs.ar.fpsr = uregs->ar_fpsr;
657 c.nat->regs.r[15] = uregs->r15;
659 c.nat->regs.r[14] = uregs->r14;
660 c.nat->regs.r[2] = uregs->r2;
661 c.nat->regs.r[3] = uregs->r3;
662 c.nat->regs.r[16] = uregs->r16;
663 c.nat->regs.r[17] = uregs->r17;
664 c.nat->regs.r[18] = uregs->r18;
665 c.nat->regs.r[19] = uregs->r19;
666 c.nat->regs.r[20] = uregs->r20;
667 c.nat->regs.r[21] = uregs->r21;
668 c.nat->regs.r[22] = uregs->r22;
669 c.nat->regs.r[23] = uregs->r23;
670 c.nat->regs.r[24] = uregs->r24;
671 c.nat->regs.r[25] = uregs->r25;
672 c.nat->regs.r[26] = uregs->r26;
673 c.nat->regs.r[27] = uregs->r27;
674 c.nat->regs.r[28] = uregs->r28;
675 c.nat->regs.r[29] = uregs->r29;
676 c.nat->regs.r[30] = uregs->r30;
677 c.nat->regs.r[31] = uregs->r31;
679 c.nat->regs.ar.ccv = uregs->ar_ccv;
681 COPY_FPREG(&c.nat->regs.f[6], &uregs->f6);
682 COPY_FPREG(&c.nat->regs.f[7], &uregs->f7);
683 COPY_FPREG(&c.nat->regs.f[8], &uregs->f8);
684 COPY_FPREG(&c.nat->regs.f[9], &uregs->f9);
685 COPY_FPREG(&c.nat->regs.f[10], &uregs->f10);
686 COPY_FPREG(&c.nat->regs.f[11], &uregs->f11);
688 c.nat->regs.r[4] = uregs->r4;
689 c.nat->regs.r[5] = uregs->r5;
690 c.nat->regs.r[6] = uregs->r6;
691 c.nat->regs.r[7] = uregs->r7;
693 /* FIXME: to be reordered. */
694 c.nat->regs.nats = uregs->eml_unat;
696 c.nat->regs.rbs_voff = (IA64_RBS_OFFSET / 8) % 64;
697 if (rbs_size < sizeof (c.nat->regs.rbs))
698 memcpy(c.nat->regs.rbs, (char *)v + IA64_RBS_OFFSET, rbs_size);
700 c.nat->privregs_pfn = get_gpfn_from_mfn
701 (virt_to_maddr(v->arch.privregs) >> PAGE_SHIFT);
703 for (i = 0; i < IA64_NUM_DBG_REGS; i++) {
704 vcpu_get_dbr(v, i, &c.nat->regs.dbr[i]);
705 vcpu_get_ibr(v, i, &c.nat->regs.ibr[i]);
706 }
708 for (i = 0; i < 7; i++)
709 vcpu_get_rr(v, (unsigned long)i << 61, &c.nat->regs.rr[i]);
711 /* Fill extra regs. */
712 for (i = 0; i < 8; i++) {
713 tr->itrs[i].pte = v->arch.itrs[i].pte.val;
714 tr->itrs[i].itir = v->arch.itrs[i].itir;
715 tr->itrs[i].vadr = v->arch.itrs[i].vadr;
716 tr->itrs[i].rid = v->arch.itrs[i].rid;
717 }
718 for (i = 0; i < 8; i++) {
719 tr->dtrs[i].pte = v->arch.dtrs[i].pte.val;
720 tr->dtrs[i].itir = v->arch.dtrs[i].itir;
721 tr->dtrs[i].vadr = v->arch.dtrs[i].vadr;
722 tr->dtrs[i].rid = v->arch.dtrs[i].rid;
723 }
724 c.nat->event_callback_ip = v->arch.event_callback_ip;
726 /* If PV and privregs is not set, we can't read mapped registers. */
727 if (!v->domain->arch.is_vti && v->arch.privregs == NULL)
728 return;
730 vcpu_get_dcr (v, &c.nat->regs.cr.dcr);
731 vcpu_get_iva (v, &c.nat->regs.cr.iva);
732 }
734 int arch_set_info_guest(struct vcpu *v, vcpu_guest_context_u c)
735 {
736 struct cpu_user_regs *uregs = vcpu_regs(v);
737 struct domain *d = v->domain;
738 int was_initialised = v->is_initialised;
739 unsigned int rbs_size;
740 int rc, i;
742 /* Finish vcpu initialization. */
743 if (!was_initialised) {
744 if (d->arch.is_vti)
745 rc = vmx_final_setup_guest(v);
746 else
747 rc = vcpu_late_initialise(v);
748 if (rc != 0)
749 return rc;
751 vcpu_init_regs(v);
753 v->is_initialised = 1;
754 /* Auto-online VCPU0 when it is initialised. */
755 if (v->vcpu_id == 0)
756 clear_bit(_VPF_down, &v->pause_flags);
757 }
759 if (c.nat == NULL)
760 return 0;
762 uregs->b6 = c.nat->regs.b[6];
763 uregs->b7 = c.nat->regs.b[7];
765 uregs->ar_csd = c.nat->regs.ar.csd;
766 uregs->ar_ssd = c.nat->regs.ar.ssd;
768 uregs->r8 = c.nat->regs.r[8];
769 uregs->r9 = c.nat->regs.r[9];
770 uregs->r10 = c.nat->regs.r[10];
771 uregs->r11 = c.nat->regs.r[11];
773 if (!d->arch.is_vti)
774 vcpu_set_psr(v, c.nat->regs.psr);
775 else
776 vmx_vcpu_set_psr(v, c.nat->regs.psr);
777 uregs->cr_iip = c.nat->regs.ip;
778 uregs->cr_ifs = c.nat->regs.cfm;
780 uregs->ar_unat = c.nat->regs.ar.unat;
781 uregs->ar_pfs = c.nat->regs.ar.pfs;
782 uregs->ar_rsc = c.nat->regs.ar.rsc;
783 uregs->ar_rnat = c.nat->regs.ar.rnat;
784 uregs->ar_bspstore = c.nat->regs.ar.bspstore;
786 uregs->pr = c.nat->regs.pr;
787 uregs->b0 = c.nat->regs.b[0];
788 rbs_size = c.nat->regs.ar.bsp - c.nat->regs.ar.bspstore;
789 /* Protection against crazy user code. */
790 if (!was_initialised)
791 uregs->loadrs = (rbs_size) << 16;
792 if (rbs_size == (uregs->loadrs >> 16))
793 memcpy((char *)v + IA64_RBS_OFFSET, c.nat->regs.rbs, rbs_size);
795 uregs->r1 = c.nat->regs.r[1];
796 uregs->r12 = c.nat->regs.r[12];
797 uregs->r13 = c.nat->regs.r[13];
798 uregs->ar_fpsr = c.nat->regs.ar.fpsr;
799 uregs->r15 = c.nat->regs.r[15];
801 uregs->r14 = c.nat->regs.r[14];
802 uregs->r2 = c.nat->regs.r[2];
803 uregs->r3 = c.nat->regs.r[3];
804 uregs->r16 = c.nat->regs.r[16];
805 uregs->r17 = c.nat->regs.r[17];
806 uregs->r18 = c.nat->regs.r[18];
807 uregs->r19 = c.nat->regs.r[19];
808 uregs->r20 = c.nat->regs.r[20];
809 uregs->r21 = c.nat->regs.r[21];
810 uregs->r22 = c.nat->regs.r[22];
811 uregs->r23 = c.nat->regs.r[23];
812 uregs->r24 = c.nat->regs.r[24];
813 uregs->r25 = c.nat->regs.r[25];
814 uregs->r26 = c.nat->regs.r[26];
815 uregs->r27 = c.nat->regs.r[27];
816 uregs->r28 = c.nat->regs.r[28];
817 uregs->r29 = c.nat->regs.r[29];
818 uregs->r30 = c.nat->regs.r[30];
819 uregs->r31 = c.nat->regs.r[31];
821 uregs->ar_ccv = c.nat->regs.ar.ccv;
823 COPY_FPREG(&uregs->f6, &c.nat->regs.f[6]);
824 COPY_FPREG(&uregs->f7, &c.nat->regs.f[7]);
825 COPY_FPREG(&uregs->f8, &c.nat->regs.f[8]);
826 COPY_FPREG(&uregs->f9, &c.nat->regs.f[9]);
827 COPY_FPREG(&uregs->f10, &c.nat->regs.f[10]);
828 COPY_FPREG(&uregs->f11, &c.nat->regs.f[11]);
830 uregs->r4 = c.nat->regs.r[4];
831 uregs->r5 = c.nat->regs.r[5];
832 uregs->r6 = c.nat->regs.r[6];
833 uregs->r7 = c.nat->regs.r[7];
835 /* FIXME: to be reordered and restored. */
836 /* uregs->eml_unat = c.nat->regs.nat; */
837 uregs->eml_unat = 0;
839 if (!d->arch.is_vti) {
840 /* domain runs at PL2/3 */
841 uregs->cr_ipsr = vcpu_pl_adjust(uregs->cr_ipsr,
842 IA64_PSR_CPL0_BIT);
843 uregs->ar_rsc = vcpu_pl_adjust(uregs->ar_rsc, 2);
844 }
846 for (i = 0; i < IA64_NUM_DBG_REGS; i++) {
847 vcpu_set_dbr(v, i, c.nat->regs.dbr[i]);
848 vcpu_set_ibr(v, i, c.nat->regs.ibr[i]);
849 }
851 if (c.nat->flags & VGCF_EXTRA_REGS) {
852 struct vcpu_tr_regs *tr = &c.nat->regs.tr;
854 for (i = 0; i < 8; i++) {
855 vcpu_set_itr(v, i, tr->itrs[i].pte,
856 tr->itrs[i].itir,
857 tr->itrs[i].vadr,
858 tr->itrs[i].rid);
859 }
860 for (i = 0; i < 8; i++) {
861 vcpu_set_dtr(v, i,
862 tr->dtrs[i].pte,
863 tr->dtrs[i].itir,
864 tr->dtrs[i].vadr,
865 tr->dtrs[i].rid);
866 }
867 v->arch.event_callback_ip = c.nat->event_callback_ip;
868 v->arch.iva = c.nat->regs.cr.iva;
869 }
871 return 0;
872 }
874 static void relinquish_memory(struct domain *d, struct list_head *list)
875 {
876 struct list_head *ent;
877 struct page_info *page;
878 #ifndef __ia64__
879 unsigned long x, y;
880 #endif
882 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
883 spin_lock_recursive(&d->page_alloc_lock);
884 ent = list->next;
885 while ( ent != list )
886 {
887 page = list_entry(ent, struct page_info, list);
888 /* Grab a reference to the page so it won't disappear from under us. */
889 if ( unlikely(!get_page(page, d)) )
890 {
891 /* Couldn't get a reference -- someone is freeing this page. */
892 ent = ent->next;
893 continue;
894 }
896 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
897 put_page_and_type(page);
899 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
900 put_page(page);
902 #ifndef __ia64__
903 /*
904 * Forcibly invalidate base page tables at this point to break circular
905 * 'linear page table' references. This is okay because MMU structures
906 * are not shared across domains and this domain is now dead. Thus base
907 * tables are not in use so a non-zero count means circular reference.
908 */
909 y = page->u.inuse.type_info;
910 for ( ; ; )
911 {
912 x = y;
913 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
914 (PGT_base_page_table|PGT_validated)) )
915 break;
917 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
918 if ( likely(y == x) )
919 {
920 free_page_type(page, PGT_base_page_table);
921 break;
922 }
923 }
924 #endif
926 /* Follow the list chain and /then/ potentially free the page. */
927 ent = ent->next;
928 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
929 put_page(page);
930 }
932 spin_unlock_recursive(&d->page_alloc_lock);
933 }
935 void domain_relinquish_resources(struct domain *d)
936 {
937 /* Relinquish guest resources for VT-i domain. */
938 if (d->vcpu[0] && VMX_DOMAIN(d->vcpu[0]))
939 vmx_relinquish_guest_resources(d);
941 /* Tear down shadow mode stuff. */
942 mm_teardown(d);
944 /* Relinquish every page of memory. */
945 relinquish_memory(d, &d->xenpage_list);
946 relinquish_memory(d, &d->page_list);
948 if (d->arch.is_vti && d->arch.sal_data)
949 xfree(d->arch.sal_data);
951 /* Free page used by xen oprofile buffer */
952 free_xenoprof_pages(d);
953 }
955 unsigned long
956 domain_set_shared_info_va (unsigned long va)
957 {
958 struct vcpu *v = current;
959 struct domain *d = v->domain;
961 /* Check virtual address:
962 must belong to region 7,
963 must be 64Kb aligned,
964 must not be within Xen virtual space. */
965 if ((va >> 61) != 7
966 || (va & 0xffffUL) != 0
967 || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
968 panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);
970 /* Note: this doesn't work well if other cpus are already running.
971 However this is part of the spec :-) */
972 gdprintk(XENLOG_DEBUG, "Domain set shared_info_va to 0x%016lx\n", va);
973 d->arch.shared_info_va = va;
975 VCPU(v, interrupt_mask_addr) = (unsigned char *)va +
976 INT_ENABLE_OFFSET(v);
978 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)(va + XSI_PSR_IC_OFS);
980 /* Remap the shared pages. */
981 set_one_rr (7UL << 61, PSCB(v,rrs[7]));
983 return 0;
984 }
986 /* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
987 #define SHADOW_COPY_CHUNK 1024
989 int shadow_mode_control(struct domain *d, xen_domctl_shadow_op_t *sc)
990 {
990 {
991 unsigned int op = sc->op;
992 int rc = 0;
993 int i;
994 //struct vcpu *v;
996 if (unlikely(d == current->domain)) {
997 gdprintk(XENLOG_INFO,
998 "Don't try to do a shadow op on yourself!\n");
999 return -EINVAL;
1000 }
1002 domain_pause(d);
1004 switch (op)
1005 {
1006 case XEN_DOMCTL_SHADOW_OP_OFF:
1007 if (shadow_mode_enabled (d)) {
1008 u64 *bm = d->arch.shadow_bitmap;
1010 /* Flush vhpt and tlb to restore dirty bit usage. */
1011 domain_flush_tlb_vhpt(d);
1013 /* Free bitmap. */
1014 d->arch.shadow_bitmap_size = 0;
1015 d->arch.shadow_bitmap = NULL;
1016 xfree(bm);
1017 }
1018 break;
1020 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
1021 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
1022 rc = -EINVAL;
1023 break;
1025 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
1026 if (shadow_mode_enabled(d)) {
1027 rc = -EINVAL;
1028 break;
1029 }
1031 atomic64_set(&d->arch.shadow_fault_count, 0);
1032 atomic64_set(&d->arch.shadow_dirty_count, 0);
1034 d->arch.shadow_bitmap_size =
1035 ((d->arch.convmem_end >> PAGE_SHIFT) +
1036 BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
1037 d->arch.shadow_bitmap = xmalloc_array(unsigned long,
1038 d->arch.shadow_bitmap_size / BITS_PER_LONG);
1039 if (d->arch.shadow_bitmap == NULL) {
1040 d->arch.shadow_bitmap_size = 0;
1041 rc = -ENOMEM;
1042 }
1043 else {
1044 memset(d->arch.shadow_bitmap, 0,
1045 d->arch.shadow_bitmap_size / 8);
1047 /* Flush vhpt and tlb to enable dirty bit
1048 virtualization. */
1049 domain_flush_tlb_vhpt(d);
1050 }
1051 break;
1053 case XEN_DOMCTL_SHADOW_OP_CLEAN:
1054 {
1055 int nbr_bytes;
1057 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
1058 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
1060 atomic64_set(&d->arch.shadow_fault_count, 0);
1061 atomic64_set(&d->arch.shadow_dirty_count, 0);
1063 if (guest_handle_is_null(sc->dirty_bitmap) ||
1064 (d->arch.shadow_bitmap == NULL)) {
1065 rc = -EINVAL;
1066 break;
1067 }
1069 if (sc->pages > d->arch.shadow_bitmap_size)
1070 sc->pages = d->arch.shadow_bitmap_size;
1072 nbr_bytes = (sc->pages + 7) / 8;
1074 for (i = 0; i < nbr_bytes; i += SHADOW_COPY_CHUNK) {
1075 int size = (nbr_bytes - i) > SHADOW_COPY_CHUNK ?
1076 SHADOW_COPY_CHUNK : nbr_bytes - i;
1078 if (copy_to_guest_offset(
1079 sc->dirty_bitmap, i,
1080 (uint8_t *)d->arch.shadow_bitmap + i,
1081 size)) {
1082 rc = -EFAULT;
1083 break;
1084 }
1086 memset((uint8_t *)d->arch.shadow_bitmap + i, 0, size);
1087 }
1089 break;
1090 }
1092 case XEN_DOMCTL_SHADOW_OP_PEEK:
1093 {
1094 unsigned long size;
1096 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
1097 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
1099 if (guest_handle_is_null(sc->dirty_bitmap) ||
1100 (d->arch.shadow_bitmap == NULL)) {
1101 rc = -EINVAL;
1102 break;
1103 }
1105 if (sc->pages > d->arch.shadow_bitmap_size)
1106 sc->pages = d->arch.shadow_bitmap_size;
1108 size = (sc->pages + 7) / 8;
1109 if (copy_to_guest(sc->dirty_bitmap,
1110 (uint8_t *)d->arch.shadow_bitmap, size)) {
1111 rc = -EFAULT;
1112 break;
1113 }
1114 break;
1115 }
1116 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
1117 sc->mb = 0;
1118 break;
1119 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
1120 if (sc->mb > 0) {
1121 BUG();
1122 rc = -ENOMEM;
1123 }
1124 break;
1125 default:
1126 rc = -EINVAL;
1127 break;
1128 }
1130 domain_unpause(d);
1132 return rc;
1133 }
1135 // remove following line if not privifying in memory
1136 //#define HAVE_PRIVIFY_MEMORY
1137 #ifndef HAVE_PRIVIFY_MEMORY
1138 #define privify_memory(x,y) do {} while(0)
1139 #endif
1141 static void __init loaddomainelfimage(struct domain *d, struct elf_binary *elf,
1142 unsigned long phys_load_offset)
1143 {
1144 const elf_phdr *phdr;
1145 int phnum, h, filesz, memsz;
1146 unsigned long elfaddr, dom_mpaddr, dom_imva;
1147 struct page_info *p;
1149 phnum = elf_uval(elf, elf->ehdr, e_phnum);
1150 for (h = 0; h < phnum; h++) {
1151 phdr = elf_phdr_by_index(elf, h);
1152 if (!elf_phdr_is_loadable(elf, phdr))
1153 continue;
1155 filesz = elf_uval(elf, phdr, p_filesz);
1156 memsz = elf_uval(elf, phdr, p_memsz);
1157 elfaddr = (unsigned long) elf->image + elf_uval(elf, phdr, p_offset);
1158 dom_mpaddr = elf_uval(elf, phdr, p_paddr);
1159 dom_mpaddr += phys_load_offset;
1161 while (memsz > 0) {
1162 p = assign_new_domain_page(d,dom_mpaddr);
1163 BUG_ON (unlikely(p == NULL));
1164 dom_imva = __va_ul(page_to_maddr(p));
1165 if (filesz > 0) {
1166 if (filesz >= PAGE_SIZE)
1167 copy_page((void *) dom_imva,
1168 (void *) elfaddr);
1169 else {
1170 // copy partial page
1171 memcpy((void *) dom_imva,
1172 (void *) elfaddr, filesz);
1173 // zero the rest of page
1174 memset((void *) dom_imva+filesz, 0,
1175 PAGE_SIZE-filesz);
1176 }
1177 //FIXME: This test for code seems to find a lot more than objdump -x does
1178 if (elf_uval(elf, phdr, p_flags) & PF_X) {
1179 privify_memory(dom_imva,PAGE_SIZE);
1180 flush_icache_range(dom_imva,
1181 dom_imva+PAGE_SIZE);
1182 }
1183 }
1184 else if (memsz > 0) {
1185 /* always zero out entire page */
1186 clear_page((void *) dom_imva);
1187 }
1188 memsz -= PAGE_SIZE;
1189 filesz -= PAGE_SIZE;
1190 elfaddr += PAGE_SIZE;
1191 dom_mpaddr += PAGE_SIZE;
1192 }
1193 }
1194 }
1196 void __init alloc_dom0(void)
1197 {
1198 /* Check dom0 size. */
1199 if (dom0_size < 4 * 1024 * 1024) {
1200 panic("dom0_mem is too small, boot aborted"
1201 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
1204 if (running_on_sim) {
1205 dom0_size = 128*1024*1024; //FIXME: Should be configurable
1206 }
1208 /* no need to allocate pages for now
1209 * pages are allocated by map_new_domain_page() via loaddomainelfimage()
1210 */
1211 }
1214 /*
1215 * Domain 0 has direct access to all devices. However, the major
1216 * point of this stub is to allow alloc_dom_mem to handle
1217 * order > 0 requests. Dom0 requires that bit set to
1218 * allocate memory for other domains.
1219 */
1220 static void __init physdev_init_dom0(struct domain *d)
1221 {
1222 if (iomem_permit_access(d, 0UL, ~0UL))
1223 BUG();
1224 if (irqs_permit_access(d, 0, NR_IRQS-1))
1225 BUG();
1226 if (ioports_permit_access(d, 0, 0xffff))
1227 BUG();
1228 }
1230 int __init construct_dom0(struct domain *d,
1231 unsigned long image_start, unsigned long image_len,
1232 unsigned long initrd_start, unsigned long initrd_len,
1233 char *cmdline)
1234 {
1235 int i, rc;
1236 start_info_t *si;
1237 dom0_vga_console_info_t *ci;
1238 struct vcpu *v = d->vcpu[0];
1239 unsigned long max_pages;
1241 struct elf_binary elf;
1242 struct elf_dom_parms parms;
1243 unsigned long p_start;
1244 unsigned long pkern_start;
1245 unsigned long pkern_entry;
1246 unsigned long pkern_end;
1247 unsigned long pinitrd_start = 0;
1248 unsigned long pstart_info;
1249 unsigned long phys_load_offset;
1250 struct page_info *start_info_page;
1251 unsigned long bp_mpa;
1252 struct ia64_boot_param *bp;
1254 //printk("construct_dom0: starting\n");
1256 /* Sanity! */
1257 BUG_ON(d != dom0);
1258 BUG_ON(d->vcpu[0] == NULL);
1259 BUG_ON(v->is_initialised);
1261 printk("*** LOADING DOMAIN 0 ***\n");
1263 max_pages = dom0_size / PAGE_SIZE;
1264 d->max_pages = max_pages;
1265 d->tot_pages = 0;
1267 rc = elf_init(&elf, (void*)image_start, image_len);
1268 if ( rc != 0 )
1269 return rc;
1270 #ifdef VERBOSE
1271 elf_set_verbose(&elf);
1272 #endif
1273 elf_parse_binary(&elf);
1274 if (0 != (elf_xen_parse(&elf, &parms)))
1275 return rc;
1277 /*
1278 * We cannot rely on the load address in the ELF headers to
1279 * determine the meta physical address at which the image
1280 * is loaded. Patch the address to match the real one, based
1281 * on xen_pstart
1282 */
1283 phys_load_offset = xen_pstart - elf.pstart;
1284 elf.pstart += phys_load_offset;
1285 elf.pend += phys_load_offset;
1286 parms.virt_kstart += phys_load_offset;
1287 parms.virt_kend += phys_load_offset;
1288 parms.virt_entry += phys_load_offset;
1290 printk(" Dom0 kernel: %s, %s, paddr 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
1291 elf_64bit(&elf) ? "64-bit" : "32-bit",
1292 elf_msb(&elf) ? "msb" : "lsb",
1293 elf.pstart, elf.pend);
1294 if (!elf_64bit(&elf) ||
1295 elf_uval(&elf, elf.ehdr, e_machine) != EM_IA_64) {
1296 printk("Incompatible kernel binary\n");
1297 return -1;
1298 }
1300 p_start = parms.virt_base;
1301 pkern_start = parms.virt_kstart;
1302 pkern_end = parms.virt_kend;
1303 pkern_entry = parms.virt_entry;
1305 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
1307 if ( (p_start & (PAGE_SIZE-1)) != 0 )
1308 {
1309 printk("Initial guest OS must load to a page boundary.\n");
1310 return -EINVAL;
1311 }
1313 pstart_info = PAGE_ALIGN(pkern_end);
1314 if(initrd_start && initrd_len){
1315 unsigned long offset;
1317 /* The next page aligned boundary after the start info.
1318 Note: EFI_PAGE_SHIFT = 12 <= PAGE_SHIFT */
1319 pinitrd_start = pstart_info + PAGE_SIZE;
1321 if ((pinitrd_start + initrd_len - phys_load_offset) >= dom0_size)
1322 panic("%s: not enough memory assigned to dom0", __func__);
1324 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
1325 struct page_info *p;
1326 p = assign_new_domain_page(d, pinitrd_start + offset);
1327 if (p == NULL)
1328 panic("%s: can't allocate page for initrd image", __func__);
1329 if (initrd_len < offset + PAGE_SIZE)
1330 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
1331 initrd_len - offset);
1332 else
1333 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
1334 }
1335 }
1337 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
1338 " Kernel image: %lx->%lx\n"
1339 " Entry address: %lx\n"
1340 " Init. ramdisk: %lx len %lx\n"
1341 " Start info.: %lx->%lx\n",
1342 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
1343 pstart_info, pstart_info + PAGE_SIZE);
1345 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
1346 {
1347 printk("Initial guest OS requires too much space\n"
1348 "(%luMB is greater than %luMB limit)\n",
1349 (pkern_end-pkern_start)>>20,
1350 (max_pages <<PAGE_SHIFT)>>20);
1351 return -ENOMEM;
1352 }
1354 // if high 3 bits of pkern start are non-zero, error
1356 // if pkern end is after end of metaphysical memory, error
1357 // (we should be able to deal with this... later)
1359 /* Mask all upcalls... */
1360 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
1361 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
1363 if (dom0_max_vcpus == 0)
1364 dom0_max_vcpus = MAX_VIRT_CPUS;
1365 if (dom0_max_vcpus > num_online_cpus())
1366 dom0_max_vcpus = num_online_cpus();
1367 if (dom0_max_vcpus > MAX_VIRT_CPUS)
1368 dom0_max_vcpus = MAX_VIRT_CPUS;
1370 printk ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
1371 for ( i = 1; i < dom0_max_vcpus; i++ )
1372 if (alloc_vcpu(d, i, i) == NULL)
1373 panic("Cannot allocate dom0 vcpu %d\n", i);
1375 /* Copy the OS image. */
1376 loaddomainelfimage(d, &elf, phys_load_offset);
1378 BUILD_BUG_ON(sizeof(start_info_t) + sizeof(dom0_vga_console_info_t) +
1379 sizeof(struct ia64_boot_param) > PAGE_SIZE);
1381 /* Set up start info area. */
1382 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
1383 start_info_page = assign_new_domain_page(d, pstart_info);
1384 if (start_info_page == NULL)
1385 panic("can't allocate start info page");
1386 si = page_to_virt(start_info_page);
1387 clear_page(si);
1388 snprintf(si->magic, sizeof(si->magic), "xen-%i.%i-ia64",
1389 xen_major_version(), xen_minor_version());
1390 si->nr_pages = max_pages;
1391 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
1393 printk("Dom0: 0x%lx\n", (u64)dom0);
1395 v->is_initialised = 1;
1396 clear_bit(_VPF_down, &v->pause_flags);
1398 /* Build firmware.
1399 Note: the Linux kernel reserves the memory used by start_info, so there is
1400 no need to remove it from MDT. */
1401 bp_mpa = pstart_info + sizeof(struct start_info);
1402 rc = dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
1403 if (rc != 0)
1404 return rc;
1406 /* Fill boot param. */
1407 strlcpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
1409 bp = (struct ia64_boot_param *)((unsigned char *)si +
1410 sizeof(start_info_t));
1411 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
1413 /* We assume console has reached the last line! */
1414 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
1415 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
1416 bp->console_info.orig_x = 0;
1417 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
1418 0 : bp->console_info.num_rows - 1;
1420 bp->initrd_start = pinitrd_start;
1421 bp->initrd_size = ia64_boot_param->initrd_size;
1423 ci = (dom0_vga_console_info_t *)((unsigned char *)si +
1424 sizeof(start_info_t) +
1425 sizeof(struct ia64_boot_param));
1427 if (fill_console_start_info(ci)) {
1428 si->console.dom0.info_off = sizeof(start_info_t) +
1429 sizeof(struct ia64_boot_param);
1430 si->console.dom0.info_size = sizeof(dom0_vga_console_info_t);
1431 }
1433 vcpu_init_regs (v);
1435 vcpu_regs(v)->r28 = bp_mpa;
1437 vcpu_regs (v)->cr_iip = pkern_entry;
1439 physdev_init_dom0(d);
1441 return 0;
1442 }
1444 void machine_restart(char * __unused)
1445 {
1446 console_start_sync();
1447 if (running_on_sim)
1448 printk ("machine_restart called. spinning...\n");
1449 else
1450 (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
1451 while(1);
1452 }
1454 extern void cpu_halt(void);
1456 void machine_halt(void)
1457 {
1458 console_start_sync();
1459 if (running_on_sim)
1460 printk ("machine_halt called. spinning...\n");
1461 else
1462 cpu_halt();
1463 while(1);
1464 }
1466 void sync_vcpu_execstate(struct vcpu *v)
1467 {
1468 // __ia64_save_fpu(v->arch._thread.fph);
1469 // if (VMX_DOMAIN(v))
1470 // vmx_save_state(v);
1471 // FIXME SMP: Anything else needed here for SMP?
1472 }
1474 /* This function is taken from xen/arch/x86/domain.c */
1475 long
1476 arch_do_vcpu_op(int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
1477 {
1478 long rc = 0;
1480 switch (cmd) {
1481 case VCPUOP_register_runstate_memory_area:
1482 {
1483 struct vcpu_register_runstate_memory_area area;
1484 struct vcpu_runstate_info runstate;
1486 rc = -EFAULT;
1487 if (copy_from_guest(&area, arg, 1))
1488 break;
1490 if (!guest_handle_okay(area.addr.h, 1))
1491 break;
1493 rc = 0;
1494 runstate_guest(v) = area.addr.h;
1496 if (v == current) {
1497 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
1498 } else {
1499 vcpu_runstate_get(v, &runstate);
1500 __copy_to_guest(runstate_guest(v), &runstate, 1);
1501 }
1503 break;
1504 }
1505 default:
1506 rc = -ENOSYS;
1507 break;
1508 }
1510 return rc;
1511 }
1513 static void __init parse_dom0_mem(char *s)
1514 {
1515 dom0_size = parse_size_and_unit(s, NULL);
1516 }
1517 custom_param("dom0_mem", parse_dom0_mem);