ia64/xen-unstable

xen/arch/ia64/xen/domain.c @ 15664:57f519c41534

[IA64] Switch on PKR

First implementation of protection key handling for domUs. Currently
only 15 protection key registers are usable by a domU; pkr[15] is
reserved for the hypervisor. The hypervisor does not handle entries
that share the same key.

Signed-off-by: Dietmar Hahn <dietmar.hahn@fujitsu-siemens.com>
author Alex Williamson <alex.williamson@hp.com>
date Mon Jul 30 16:51:52 2007 -0600 (2007-07-30)
parents 87b0b6a08dbd
children 87a72ba32301
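The changeset confines a domU to protection key registers pkr[0]..pkr[14] and keeps pkr[15] for Xen; in the source below, context_switch() reloads the guest's keys via vcpu_pkr_load_regs() whenever vcpu_pkr_in_use() reports they are in use. As a rough illustration of the reservation idea only (the real accessors live elsewhere in the tree; every name in this sketch is invented), a guest-visible pkr write could be range-checked like this:

/* Illustrative sketch, not part of domain.c: confine a guest to
 * pkr[0]..pkr[14] and keep pkr[15] for the hypervisor. TOY_NPKRS,
 * struct toy_vcpu and toy_vcpu_set_pkr are hypothetical names. */
#define TOY_NPKRS 15                    /* pkr[15] stays with the hypervisor */

struct toy_vcpu {
        unsigned long pkrs[TOY_NPKRS];  /* shadow of the guest's pkr[0..14] */
};

static int toy_vcpu_set_pkr(struct toy_vcpu *v, unsigned long reg,
                            unsigned long val)
{
        if (reg >= TOY_NPKRS)   /* the guest may never touch pkr[15] */
                return -1;      /* the real code would raise a fault here */
        v->pkrs[reg] = val;     /* latched here, reloaded on context switch */
        return 0;
}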
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 *
12 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
13 * VA Linux Systems Japan K.K.
14 * dom0 vp model support
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/errno.h>
21 #include <xen/sched.h>
22 #include <xen/smp.h>
23 #include <xen/delay.h>
24 #include <xen/softirq.h>
25 #include <xen/mm.h>
26 #include <xen/iocap.h>
27 #include <asm/asm-xsi-offsets.h>
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <asm/processor.h>
31 #include <xen/event.h>
32 #include <xen/console.h>
33 #include <xen/version.h>
34 #include <public/libelf.h>
35 #include <asm/pgalloc.h>
36 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
37 #include <asm/vcpu.h> /* for function declarations */
38 #include <public/xen.h>
39 #include <xen/domain.h>
40 #include <asm/vmx.h>
41 #include <asm/vmx_vcpu.h>
42 #include <asm/vmx_vpd.h>
43 #include <asm/vmx_phy_mode.h>
44 #include <asm/vhpt.h>
45 #include <asm/vcpu.h>
46 #include <asm/tlbflush.h>
47 #include <asm/regionreg.h>
48 #include <asm/dom_fw.h>
49 #include <asm/shadow.h>
50 #include <xen/guest_access.h>
51 #include <asm/tlb_track.h>
52 #include <asm/perfmon.h>
53 #include <public/vcpu.h>
55 static unsigned long __initdata dom0_size = 512*1024*1024;
57 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
58 static unsigned int __initdata dom0_max_vcpus = 1;
59 integer_param("dom0_max_vcpus", dom0_max_vcpus);
61 extern char dom0_command_line[];
63 /* forward declaration */
64 static void init_switch_stack(struct vcpu *v);
66 /* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
67 This is a Xen virtual address. */
68 DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
69 DEFINE_PER_CPU(int *, current_psr_ic_addr);
71 DEFINE_PER_CPU(struct vcpu *, fp_owner);
73 #include <xen/sched-if.h>
75 static void
76 ia64_disable_vhpt_walker(void)
77 {
78 // Disable the VHPT. Without this, ia64_new_rr7() might cause a VHPT
79 // fault, because it flushes dtr[IA64_TR_VHPT].
80 // (VHPT_SIZE_LOG2 << 2) is just to avoid a
81 // Reserved Register/Field fault.
82 ia64_set_pta(VHPT_SIZE_LOG2 << 2);
83 }
85 static void flush_vtlb_for_context_switch(struct vcpu* prev, struct vcpu* next)
86 {
87 int cpu = smp_processor_id();
88 int last_vcpu_id, last_processor;
90 if (!is_idle_domain(prev->domain))
91 tlbflush_update_time
92 (&prev->domain->arch.last_vcpu[cpu].tlbflush_timestamp,
93 tlbflush_current_time());
95 if (is_idle_domain(next->domain))
96 return;
98 last_vcpu_id = next->domain->arch.last_vcpu[cpu].vcpu_id;
99 last_processor = next->arch.last_processor;
101 next->domain->arch.last_vcpu[cpu].vcpu_id = next->vcpu_id;
102 next->arch.last_processor = cpu;
104 if ((last_vcpu_id != next->vcpu_id &&
105 last_vcpu_id != INVALID_VCPU_ID) ||
106 (last_vcpu_id == next->vcpu_id &&
107 last_processor != cpu &&
108 last_processor != INVALID_PROCESSOR)) {
109 #ifdef CONFIG_XEN_IA64_TLBFLUSH_CLOCK
110 u32 last_tlbflush_timestamp =
111 next->domain->arch.last_vcpu[cpu].tlbflush_timestamp;
112 #endif
113 int vhpt_is_flushed = 0;
115 // If the vTLB implementation is changed,
116 // the following must be updated as well.
117 if (VMX_DOMAIN(next)) {
118 // Currently the vTLB for a VT-i domain is per vcpu,
119 // so no flushing is needed.
120 } else if (HAS_PERVCPU_VHPT(next->domain)) {
121 // nothing to do
122 } else {
123 if (NEED_FLUSH(__get_cpu_var(vhpt_tlbflush_timestamp),
124 last_tlbflush_timestamp)) {
125 local_vhpt_flush();
126 vhpt_is_flushed = 1;
127 }
128 }
129 if (vhpt_is_flushed || NEED_FLUSH(__get_cpu_var(tlbflush_time),
130 last_tlbflush_timestamp)) {
131 local_flush_tlb_all();
132 perfc_incr(tlbflush_clock_cswitch_purge);
133 } else {
134 perfc_incr(tlbflush_clock_cswitch_skip);
135 }
136 perfc_incr(flush_vtlb_for_context_switch);
137 }
138 }
140 static void flush_cache_for_context_switch(struct vcpu *next)
141 {
142 extern cpumask_t cpu_cache_coherent_map;
143 int cpu = smp_processor_id();
145 if (is_idle_vcpu(next) ||
146 __test_and_clear_bit(cpu, &next->arch.cache_coherent_map)) {
147 if (cpu_test_and_clear(cpu, cpu_cache_coherent_map)) {
148 unsigned long flags;
149 u64 progress = 0;
150 s64 status;
152 local_irq_save(flags);
153 status = ia64_pal_cache_flush(4, 0, &progress, NULL);
154 local_irq_restore(flags);
155 if (status != 0)
156 panic_domain(NULL, "PAL_CACHE_FLUSH ERROR, "
157 "cache_type=4 status %lx", status);
158 }
159 }
160 }
162 static void lazy_fp_switch(struct vcpu *prev, struct vcpu *next)
163 {
164 /*
165 * Implement eager save, lazy restore
166 */
167 if (!is_idle_vcpu(prev)) {
168 if (VMX_DOMAIN(prev)) {
169 if (FP_PSR(prev) & IA64_PSR_MFH) {
170 __ia64_save_fpu(prev->arch._thread.fph);
171 __ia64_per_cpu_var(fp_owner) = prev;
172 }
173 } else {
174 if (PSCB(prev, hpsr_mfh)) {
175 __ia64_save_fpu(prev->arch._thread.fph);
176 __ia64_per_cpu_var(fp_owner) = prev;
177 }
178 }
179 }
181 if (!is_idle_vcpu(next)) {
182 if (VMX_DOMAIN(next)) {
183 FP_PSR(next) = IA64_PSR_DFH;
184 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
185 } else {
186 PSCB(next, hpsr_dfh) = 1;
187 PSCB(next, hpsr_mfh) = 0;
188 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
189 }
190 }
191 }
193 void schedule_tail(struct vcpu *prev)
194 {
195 extern char ia64_ivt;
197 context_saved(prev);
198 ia64_disable_vhpt_walker();
200 if (VMX_DOMAIN(current)) {
201 vmx_do_launch(current);
202 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
203 current->processor);
204 } else {
205 ia64_set_iva(&ia64_ivt);
206 load_region_regs(current);
207 ia64_set_pta(vcpu_pta(current));
208 vcpu_load_kernel_regs(current);
209 __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
210 shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
211 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
212 (current->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
213 migrate_timer(&current->arch.hlt_timer, current->processor);
214 }
215 flush_vtlb_for_context_switch(prev, current);
216 }
218 void context_switch(struct vcpu *prev, struct vcpu *next)
219 {
220 uint64_t spsr;
222 local_irq_save(spsr);
224 if (VMX_DOMAIN(prev)) {
225 vmx_save_state(prev);
226 if (!VMX_DOMAIN(next)) {
227 /* VMX domains can change the physical cr.dcr.
228 * Restore default to prevent leakage. */
229 ia64_setreg(_IA64_REG_CR_DCR, IA64_DEFAULT_DCR_BITS);
230 }
231 }
232 if (VMX_DOMAIN(next))
233 vmx_load_state(next);
235 ia64_disable_vhpt_walker();
236 lazy_fp_switch(prev, current);
238 if (prev->arch.dbg_used || next->arch.dbg_used) {
239 /*
240 * Load debug registers either because they are valid or to clear
241 * the previous one.
242 */
243 ia64_load_debug_regs(next->arch.dbr);
244 }
246 prev = ia64_switch_to(next);
248 /* Note: ia64_switch_to does not return here at vcpu initialization. */
250 if (VMX_DOMAIN(current)) {
251 vmx_load_all_rr(current);
252 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
253 current->processor);
254 } else {
255 struct domain *nd;
256 extern char ia64_ivt;
258 ia64_set_iva(&ia64_ivt);
260 nd = current->domain;
261 if (!is_idle_domain(nd)) {
262 load_region_regs(current);
263 ia64_set_pta(vcpu_pta(current));
264 vcpu_load_kernel_regs(current);
265 if (vcpu_pkr_in_use(current))
266 vcpu_pkr_load_regs(current);
267 vcpu_set_next_timer(current);
268 if (vcpu_timer_expired(current))
269 vcpu_pend_timer(current);
270 __ia64_per_cpu_var(current_psr_i_addr) = &nd->shared_info->
271 vcpu_info[current->vcpu_id].evtchn_upcall_mask;
272 __ia64_per_cpu_var(current_psr_ic_addr) =
273 (int *)(nd->arch.shared_info_va + XSI_PSR_IC_OFS);
274 /* steal time accounting */
275 if (!guest_handle_is_null(runstate_guest(current)))
276 __copy_to_guest(runstate_guest(current), &current->runstate, 1);
277 } else {
278 /* When switching to the idle domain, we only need to disable the vhpt
279 * walker. All accesses that happen within the idle context will then
280 * be handled by the TR mappings and the identity mapping.
281 */
282 __ia64_per_cpu_var(current_psr_i_addr) = NULL;
283 __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
284 }
285 }
286 local_irq_restore(spsr);
288 /* lazy fp */
289 if (current->processor != current->arch.last_processor) {
290 unsigned long *addr;
291 addr = (unsigned long *)per_cpu_addr(fp_owner,
292 current->arch.last_processor);
293 ia64_cmpxchg(acq, addr, current, 0, 8);
294 }
296 flush_vtlb_for_context_switch(prev, current);
297 flush_cache_for_context_switch(current);
298 context_saved(prev);
299 }
301 void continue_running(struct vcpu *same)
302 {
303 /* nothing to do */
304 }
306 #ifdef CONFIG_PERFMON
307 static int pal_halt = 1;
308 static int can_do_pal_halt = 1;
310 static int __init nohalt_setup(char * str)
311 {
312 pal_halt = can_do_pal_halt = 0;
313 return 1;
314 }
315 __setup("nohalt", nohalt_setup);
317 void
318 update_pal_halt_status(int status)
319 {
320 can_do_pal_halt = pal_halt && status;
321 }
322 #else
323 #define can_do_pal_halt (1)
324 #endif
326 static void default_idle(void)
327 {
328 local_irq_disable();
329 if ( !softirq_pending(smp_processor_id()) ) {
330 if (can_do_pal_halt)
331 safe_halt();
332 else
333 cpu_relax();
334 }
335 local_irq_enable();
336 }
338 static void continue_cpu_idle_loop(void)
339 {
340 for ( ; ; )
341 {
342 #ifdef IA64
343 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
344 #else
345 irq_stat[cpu].idle_timestamp = jiffies;
346 #endif
347 page_scrub_schedule_work();
348 while ( !softirq_pending(smp_processor_id()) )
349 default_idle();
350 raise_softirq(SCHEDULE_SOFTIRQ);
351 do_softirq();
352 }
353 }
355 void startup_cpu_idle_loop(void)
356 {
357 /* Just some sanity to ensure that the scheduler is set up okay. */
358 ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
359 raise_softirq(SCHEDULE_SOFTIRQ);
361 continue_cpu_idle_loop();
362 }
364 /* compile time test for get_order(sizeof(mapped_regs_t)) !=
365 * get_order_from_shift(XMAPPEDREGS_SHIFT))
366 */
367 #if !(((1 << (XMAPPEDREGS_SHIFT - 1)) < MAPPED_REGS_T_SIZE) && \
368 (MAPPED_REGS_T_SIZE < (1 << (XMAPPEDREGS_SHIFT + 1))))
369 # error "XMAPPEDREGS_SHIFT doesn't match sizeof(mapped_regs_t)."
370 #endif
372 void hlt_timer_fn(void *data)
373 {
374 struct vcpu *v = data;
375 vcpu_unblock(v);
376 }
378 void relinquish_vcpu_resources(struct vcpu *v)
379 {
380 if (HAS_PERVCPU_VHPT(v->domain))
381 pervcpu_vhpt_free(v);
382 if (v->arch.privregs != NULL) {
383 free_xenheap_pages(v->arch.privregs,
384 get_order_from_shift(XMAPPEDREGS_SHIFT));
385 v->arch.privregs = NULL;
386 }
387 kill_timer(&v->arch.hlt_timer);
388 }
390 struct vcpu *alloc_vcpu_struct(void)
391 {
392 struct vcpu *v;
393 struct thread_info *ti;
394 static int first_allocation = 1;
396 if (first_allocation) {
397 first_allocation = 0;
398 /* Keep idle vcpu0 statically allocated at compile time, because some
399 * code inherited from Linux still requires it in the early boot phase.
400 */
401 return idle_vcpu[0];
402 }
404 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
405 return NULL;
406 memset(v, 0, sizeof(*v));
408 ti = alloc_thread_info(v);
409 /* Clear thread_info to clear some important fields, like
410 * preempt_count
411 */
412 memset(ti, 0, sizeof(struct thread_info));
413 init_switch_stack(v);
415 return v;
416 }
418 void free_vcpu_struct(struct vcpu *v)
419 {
420 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
421 }
423 int vcpu_initialise(struct vcpu *v)
424 {
425 struct domain *d = v->domain;
427 if (!is_idle_domain(d)) {
428 v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
429 v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
430 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
431 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;
433 /* Is this correct?
434 It depends on the domain's rid usage.
436 A domain may share rids among its processors (e.g. when using a
437 global VHPT). In this case we should also share rids
438 among vcpus, and the rid range should be the same.
440 However, a domain may have per-cpu rid allocation. In
441 that case we don't want to share rids among vcpus, though we may
442 do so if two vcpus run on the same cpu... */
444 v->arch.starting_rid = d->arch.starting_rid;
445 v->arch.ending_rid = d->arch.ending_rid;
446 v->arch.breakimm = d->arch.breakimm;
447 v->arch.last_processor = INVALID_PROCESSOR;
448 }
450 if (!VMX_DOMAIN(v))
451 init_timer(&v->arch.hlt_timer, hlt_timer_fn, v,
452 first_cpu(cpu_online_map));
454 return 0;
455 }
457 void vcpu_share_privregs_with_guest(struct vcpu *v)
458 {
459 struct domain *d = v->domain;
460 int i, order = get_order_from_shift(XMAPPEDREGS_SHIFT);
462 for (i = 0; i < (1 << order); i++)
463 share_xen_page_with_guest(virt_to_page(v->arch.privregs) + i,
464 d, XENSHARE_writable);
465 /*
466 * XXX IA64_XMAPPEDREGS_PADDR
467 * Assign these pages into the guest pseudo-physical address
468 * space so that dom0 can map them by gmfn.
469 * This is necessary for domain save, restore and dump-core.
470 */
471 for (i = 0; i < XMAPPEDREGS_SIZE; i += PAGE_SIZE)
472 assign_domain_page(d, IA64_XMAPPEDREGS_PADDR(v->vcpu_id) + i,
473 virt_to_maddr(v->arch.privregs + i));
474 }
476 int vcpu_late_initialise(struct vcpu *v)
477 {
478 struct domain *d = v->domain;
479 int rc, order;
481 if (HAS_PERVCPU_VHPT(d)) {
482 rc = pervcpu_vhpt_alloc(v);
483 if (rc != 0)
484 return rc;
485 }
487 /* Create privregs page. */
488 order = get_order_from_shift(XMAPPEDREGS_SHIFT);
489 v->arch.privregs = alloc_xenheap_pages(order);
490 BUG_ON(v->arch.privregs == NULL);
491 memset(v->arch.privregs, 0, 1 << XMAPPEDREGS_SHIFT);
492 vcpu_share_privregs_with_guest(v);
494 return 0;
495 }
497 void vcpu_destroy(struct vcpu *v)
498 {
499 if (v->domain->arch.is_vti)
500 vmx_relinquish_vcpu_resources(v);
501 else
502 relinquish_vcpu_resources(v);
503 }
505 static void init_switch_stack(struct vcpu *v)
506 {
507 struct pt_regs *regs = vcpu_regs (v);
508 struct switch_stack *sw = (struct switch_stack *) regs - 1;
509 extern void ia64_ret_from_clone;
511 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
512 sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
513 sw->b0 = (unsigned long) &ia64_ret_from_clone;
514 sw->ar_fpsr = FPSR_DEFAULT;
515 v->arch._thread.ksp = (unsigned long) sw - 16;
516 // Stay on the kernel stack, because we may get interrupts!
517 // ia64_ret_from_clone switches to the user stack.
518 v->arch._thread.on_ustack = 0;
519 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
520 }
522 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
523 static int opt_pervcpu_vhpt = 1;
524 integer_param("pervcpu_vhpt", opt_pervcpu_vhpt);
525 #endif
527 int arch_domain_create(struct domain *d)
528 {
529 int i;
531 // the following will eventually need to be negotiated dynamically
532 d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
533 d->arch.breakimm = 0x1000;
534 for (i = 0; i < NR_CPUS; i++) {
535 d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
536 }
538 if (is_idle_domain(d))
539 return 0;
541 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
542 d->arch.has_pervcpu_vhpt = opt_pervcpu_vhpt;
543 dprintk(XENLOG_INFO, "%s:%d domain %d pervcpu_vhpt %d\n",
544 __func__, __LINE__, d->domain_id, d->arch.has_pervcpu_vhpt);
545 #endif
546 if (tlb_track_create(d) < 0)
547 goto fail_nomem1;
548 d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
549 if (d->shared_info == NULL)
550 goto fail_nomem;
551 memset(d->shared_info, 0, XSI_SIZE);
552 for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
553 share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
554 d, XENSHARE_writable);
556 /* We may also need an emulation rid for region4, though it's unlikely
557 * that a guest will issue uncacheable accesses in metaphysical mode. But
558 * keeping such info here may be saner.
559 */
560 if (!allocate_rid_range(d,0))
561 goto fail_nomem;
563 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
565 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
566 goto fail_nomem;
568 /*
569 * grant_table_create() can't fully initialize the grant table for the
570 * domain because it is called before arch_domain_create().
571 * Here we complete the initialization, which requires the p2m table.
572 */
573 spin_lock(&d->grant_table->lock);
574 for (i = 0; i < nr_grant_frames(d->grant_table); i++)
575 ia64_gnttab_create_shared_page(d, d->grant_table, i);
576 spin_unlock(&d->grant_table->lock);
578 d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
579 RANGESETF_prettyprint_hex);
581 dprintk(XENLOG_DEBUG, "arch_domain_create: domain=%p\n", d);
582 return 0;
584 fail_nomem:
585 tlb_track_destroy(d);
586 fail_nomem1:
587 if (d->arch.mm.pgd != NULL)
588 pgd_free(d->arch.mm.pgd);
589 if (d->shared_info != NULL)
590 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
591 return -ENOMEM;
592 }
594 void arch_domain_destroy(struct domain *d)
595 {
596 mm_final_teardown(d);
598 if (d->shared_info != NULL)
599 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
601 tlb_track_destroy(d);
603 /* Clear vTLB for the next domain. */
604 domain_flush_tlb_vhpt(d);
606 deallocate_rid_range(d);
607 }
609 int arch_vcpu_reset(struct vcpu *v)
610 {
611 /* FIXME: Stub for now */
612 return 0;
613 }
615 #define COPY_FPREG(dst, src) memcpy(dst, src, sizeof(struct ia64_fpreg))
617 void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
618 {
619 int i;
620 struct vcpu_tr_regs *tr = &c.nat->regs.tr;
621 struct cpu_user_regs *uregs = vcpu_regs(v);
622 int is_hvm = VMX_DOMAIN(v);
623 unsigned int rbs_size;
625 c.nat->regs.b[6] = uregs->b6;
626 c.nat->regs.b[7] = uregs->b7;
628 c.nat->regs.ar.csd = uregs->ar_csd;
629 c.nat->regs.ar.ssd = uregs->ar_ssd;
631 c.nat->regs.r[8] = uregs->r8;
632 c.nat->regs.r[9] = uregs->r9;
633 c.nat->regs.r[10] = uregs->r10;
634 c.nat->regs.r[11] = uregs->r11;
636 if (is_hvm)
637 c.nat->regs.psr = vmx_vcpu_get_psr(v);
638 else
639 c.nat->regs.psr = vcpu_get_psr(v);
641 c.nat->regs.ip = uregs->cr_iip;
642 c.nat->regs.cfm = uregs->cr_ifs;
644 c.nat->regs.ar.unat = uregs->ar_unat;
645 c.nat->regs.ar.pfs = uregs->ar_pfs;
646 c.nat->regs.ar.rsc = uregs->ar_rsc;
647 c.nat->regs.ar.rnat = uregs->ar_rnat;
648 c.nat->regs.ar.bspstore = uregs->ar_bspstore;
650 c.nat->regs.pr = uregs->pr;
651 c.nat->regs.b[0] = uregs->b0;
652 rbs_size = uregs->loadrs >> 16;
653 c.nat->regs.ar.bsp = uregs->ar_bspstore + rbs_size;
655 c.nat->regs.r[1] = uregs->r1;
656 c.nat->regs.r[12] = uregs->r12;
657 c.nat->regs.r[13] = uregs->r13;
658 c.nat->regs.ar.fpsr = uregs->ar_fpsr;
659 c.nat->regs.r[15] = uregs->r15;
661 c.nat->regs.r[14] = uregs->r14;
662 c.nat->regs.r[2] = uregs->r2;
663 c.nat->regs.r[3] = uregs->r3;
664 c.nat->regs.r[16] = uregs->r16;
665 c.nat->regs.r[17] = uregs->r17;
666 c.nat->regs.r[18] = uregs->r18;
667 c.nat->regs.r[19] = uregs->r19;
668 c.nat->regs.r[20] = uregs->r20;
669 c.nat->regs.r[21] = uregs->r21;
670 c.nat->regs.r[22] = uregs->r22;
671 c.nat->regs.r[23] = uregs->r23;
672 c.nat->regs.r[24] = uregs->r24;
673 c.nat->regs.r[25] = uregs->r25;
674 c.nat->regs.r[26] = uregs->r26;
675 c.nat->regs.r[27] = uregs->r27;
676 c.nat->regs.r[28] = uregs->r28;
677 c.nat->regs.r[29] = uregs->r29;
678 c.nat->regs.r[30] = uregs->r30;
679 c.nat->regs.r[31] = uregs->r31;
681 c.nat->regs.ar.ccv = uregs->ar_ccv;
683 COPY_FPREG(&c.nat->regs.f[6], &uregs->f6);
684 COPY_FPREG(&c.nat->regs.f[7], &uregs->f7);
685 COPY_FPREG(&c.nat->regs.f[8], &uregs->f8);
686 COPY_FPREG(&c.nat->regs.f[9], &uregs->f9);
687 COPY_FPREG(&c.nat->regs.f[10], &uregs->f10);
688 COPY_FPREG(&c.nat->regs.f[11], &uregs->f11);
690 c.nat->regs.r[4] = uregs->r4;
691 c.nat->regs.r[5] = uregs->r5;
692 c.nat->regs.r[6] = uregs->r6;
693 c.nat->regs.r[7] = uregs->r7;
695 /* FIXME: to be reordered. */
696 c.nat->regs.nats = uregs->eml_unat;
698 c.nat->regs.rbs_voff = (IA64_RBS_OFFSET / 8) % 64;
699 if (rbs_size < sizeof (c.nat->regs.rbs))
700 memcpy(c.nat->regs.rbs, (char *)v + IA64_RBS_OFFSET, rbs_size);
702 c.nat->privregs_pfn = get_gpfn_from_mfn
703 (virt_to_maddr(v->arch.privregs) >> PAGE_SHIFT);
705 for (i = 0; i < IA64_NUM_DBG_REGS; i++) {
706 vcpu_get_dbr(v, i, &c.nat->regs.dbr[i]);
707 vcpu_get_ibr(v, i, &c.nat->regs.ibr[i]);
708 }
710 for (i = 0; i < 7; i++)
711 vcpu_get_rr(v, (unsigned long)i << 61, &c.nat->regs.rr[i]);
713 /* Fill extra regs. */
714 for (i = 0; i < 8; i++) {
715 tr->itrs[i].pte = v->arch.itrs[i].pte.val;
716 tr->itrs[i].itir = v->arch.itrs[i].itir;
717 tr->itrs[i].vadr = v->arch.itrs[i].vadr;
718 tr->itrs[i].rid = v->arch.itrs[i].rid;
719 }
720 for (i = 0; i < 8; i++) {
721 tr->dtrs[i].pte = v->arch.dtrs[i].pte.val;
722 tr->dtrs[i].itir = v->arch.dtrs[i].itir;
723 tr->dtrs[i].vadr = v->arch.dtrs[i].vadr;
724 tr->dtrs[i].rid = v->arch.dtrs[i].rid;
725 }
726 c.nat->event_callback_ip = v->arch.event_callback_ip;
728 /* If PV and privregs is not set, we can't read mapped registers. */
729 if (!v->domain->arch.is_vti && v->arch.privregs == NULL)
730 return;
732 vcpu_get_dcr (v, &c.nat->regs.cr.dcr);
733 vcpu_get_iva (v, &c.nat->regs.cr.iva);
734 }
736 int arch_set_info_guest(struct vcpu *v, vcpu_guest_context_u c)
737 {
738 struct cpu_user_regs *uregs = vcpu_regs(v);
739 struct domain *d = v->domain;
740 int was_initialised = v->is_initialised;
741 unsigned int rbs_size;
742 int rc, i;
744 /* Finish vcpu initialization. */
745 if (!was_initialised) {
746 if (d->arch.is_vti)
747 rc = vmx_final_setup_guest(v);
748 else
749 rc = vcpu_late_initialise(v);
750 if (rc != 0)
751 return rc;
753 vcpu_init_regs(v);
755 v->is_initialised = 1;
756 /* Auto-online VCPU0 when it is initialised. */
757 if (v->vcpu_id == 0)
758 clear_bit(_VPF_down, &v->pause_flags);
759 }
761 if (c.nat == NULL)
762 return 0;
764 uregs->b6 = c.nat->regs.b[6];
765 uregs->b7 = c.nat->regs.b[7];
767 uregs->ar_csd = c.nat->regs.ar.csd;
768 uregs->ar_ssd = c.nat->regs.ar.ssd;
770 uregs->r8 = c.nat->regs.r[8];
771 uregs->r9 = c.nat->regs.r[9];
772 uregs->r10 = c.nat->regs.r[10];
773 uregs->r11 = c.nat->regs.r[11];
775 if (!d->arch.is_vti)
776 vcpu_set_psr(v, c.nat->regs.psr);
777 else
778 vmx_vcpu_set_psr(v, c.nat->regs.psr);
779 uregs->cr_iip = c.nat->regs.ip;
780 uregs->cr_ifs = c.nat->regs.cfm;
782 uregs->ar_unat = c.nat->regs.ar.unat;
783 uregs->ar_pfs = c.nat->regs.ar.pfs;
784 uregs->ar_rsc = c.nat->regs.ar.rsc;
785 uregs->ar_rnat = c.nat->regs.ar.rnat;
786 uregs->ar_bspstore = c.nat->regs.ar.bspstore;
788 uregs->pr = c.nat->regs.pr;
789 uregs->b0 = c.nat->regs.b[0];
790 rbs_size = c.nat->regs.ar.bsp - c.nat->regs.ar.bspstore;
791 /* Protection against crazy user code. */
792 if (!was_initialised)
793 uregs->loadrs = (rbs_size) << 16;
794 if (rbs_size == (uregs->loadrs >> 16))
795 memcpy((char *)v + IA64_RBS_OFFSET, c.nat->regs.rbs, rbs_size);
797 uregs->r1 = c.nat->regs.r[1];
798 uregs->r12 = c.nat->regs.r[12];
799 uregs->r13 = c.nat->regs.r[13];
800 uregs->ar_fpsr = c.nat->regs.ar.fpsr;
801 uregs->r15 = c.nat->regs.r[15];
803 uregs->r14 = c.nat->regs.r[14];
804 uregs->r2 = c.nat->regs.r[2];
805 uregs->r3 = c.nat->regs.r[3];
806 uregs->r16 = c.nat->regs.r[16];
807 uregs->r17 = c.nat->regs.r[17];
808 uregs->r18 = c.nat->regs.r[18];
809 uregs->r19 = c.nat->regs.r[19];
810 uregs->r20 = c.nat->regs.r[20];
811 uregs->r21 = c.nat->regs.r[21];
812 uregs->r22 = c.nat->regs.r[22];
813 uregs->r23 = c.nat->regs.r[23];
814 uregs->r24 = c.nat->regs.r[24];
815 uregs->r25 = c.nat->regs.r[25];
816 uregs->r26 = c.nat->regs.r[26];
817 uregs->r27 = c.nat->regs.r[27];
818 uregs->r28 = c.nat->regs.r[28];
819 uregs->r29 = c.nat->regs.r[29];
820 uregs->r30 = c.nat->regs.r[30];
821 uregs->r31 = c.nat->regs.r[31];
823 uregs->ar_ccv = c.nat->regs.ar.ccv;
825 COPY_FPREG(&uregs->f6, &c.nat->regs.f[6]);
826 COPY_FPREG(&uregs->f7, &c.nat->regs.f[7]);
827 COPY_FPREG(&uregs->f8, &c.nat->regs.f[8]);
828 COPY_FPREG(&uregs->f9, &c.nat->regs.f[9]);
829 COPY_FPREG(&uregs->f10, &c.nat->regs.f[10]);
830 COPY_FPREG(&uregs->f11, &c.nat->regs.f[11]);
832 uregs->r4 = c.nat->regs.r[4];
833 uregs->r5 = c.nat->regs.r[5];
834 uregs->r6 = c.nat->regs.r[6];
835 uregs->r7 = c.nat->regs.r[7];
837 /* FIXME: to be reordered and restored. */
838 /* uregs->eml_unat = c.nat->regs.nat; */
839 uregs->eml_unat = 0;
841 if (!d->arch.is_vti) {
842 /* domain runs at PL2/3 */
843 uregs->cr_ipsr = vcpu_pl_adjust(uregs->cr_ipsr,
844 IA64_PSR_CPL0_BIT);
845 uregs->ar_rsc = vcpu_pl_adjust(uregs->ar_rsc, 2);
846 }
848 for (i = 0; i < IA64_NUM_DBG_REGS; i++) {
849 vcpu_set_dbr(v, i, c.nat->regs.dbr[i]);
850 vcpu_set_ibr(v, i, c.nat->regs.ibr[i]);
851 }
853 if (c.nat->flags & VGCF_EXTRA_REGS) {
854 struct vcpu_tr_regs *tr = &c.nat->regs.tr;
856 for (i = 0; i < 8; i++) {
857 vcpu_set_itr(v, i, tr->itrs[i].pte,
858 tr->itrs[i].itir,
859 tr->itrs[i].vadr,
860 tr->itrs[i].rid);
861 }
862 for (i = 0; i < 8; i++) {
863 vcpu_set_dtr(v, i,
864 tr->dtrs[i].pte,
865 tr->dtrs[i].itir,
866 tr->dtrs[i].vadr,
867 tr->dtrs[i].rid);
868 }
869 v->arch.event_callback_ip = c.nat->event_callback_ip;
870 v->arch.iva = c.nat->regs.cr.iva;
871 }
873 return 0;
874 }
876 static void relinquish_memory(struct domain *d, struct list_head *list)
877 {
878 struct list_head *ent;
879 struct page_info *page;
880 #ifndef __ia64__
881 unsigned long x, y;
882 #endif
884 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
885 spin_lock_recursive(&d->page_alloc_lock);
886 ent = list->next;
887 while ( ent != list )
888 {
889 page = list_entry(ent, struct page_info, list);
890 /* Grab a reference to the page so it won't disappear from under us. */
891 if ( unlikely(!get_page(page, d)) )
892 {
893 /* Couldn't get a reference -- someone is freeing this page. */
894 ent = ent->next;
895 continue;
896 }
898 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
899 put_page_and_type(page);
901 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
902 put_page(page);
904 #ifndef __ia64__
905 /*
906 * Forcibly invalidate base page tables at this point to break circular
907 * 'linear page table' references. This is okay because MMU structures
908 * are not shared across domains and this domain is now dead. Thus base
909 * tables are not in use so a non-zero count means circular reference.
910 */
911 y = page->u.inuse.type_info;
912 for ( ; ; )
913 {
914 x = y;
915 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
916 (PGT_base_page_table|PGT_validated)) )
917 break;
919 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
920 if ( likely(y == x) )
921 {
922 free_page_type(page, PGT_base_page_table);
923 break;
924 }
925 }
926 #endif
928 /* Follow the list chain and /then/ potentially free the page. */
929 ent = ent->next;
930 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
931 put_page(page);
932 }
934 spin_unlock_recursive(&d->page_alloc_lock);
935 }
937 void domain_relinquish_resources(struct domain *d)
938 {
939 /* Relinquish guest resources for VT-i domain. */
940 if (d->vcpu[0] && VMX_DOMAIN(d->vcpu[0]))
941 vmx_relinquish_guest_resources(d);
943 /* Tear down shadow mode stuff. */
944 mm_teardown(d);
946 /* Relinquish every page of memory. */
947 relinquish_memory(d, &d->xenpage_list);
948 relinquish_memory(d, &d->page_list);
950 if (d->arch.is_vti && d->arch.sal_data)
951 xfree(d->arch.sal_data);
953 /* Free page used by xen oprofile buffer */
954 free_xenoprof_pages(d);
955 }
957 unsigned long
958 domain_set_shared_info_va (unsigned long va)
959 {
959 {
960 struct vcpu *v = current;
961 struct domain *d = v->domain;
963 /* Check virtual address:
964 must belong to region 7,
965 must be 64Kb aligned,
966 must not be within Xen virtual space. */
967 if ((va >> 61) != 7
968 || (va & 0xffffUL) != 0
969 || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
970 panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);
972 /* Note: this doesn't work well if other cpus are already running.
973 However this is part of the spec :-) */
974 gdprintk(XENLOG_DEBUG, "Domain set shared_info_va to 0x%016lx\n", va);
975 d->arch.shared_info_va = va;
977 VCPU(v, interrupt_mask_addr) = (unsigned char *)va +
978 INT_ENABLE_OFFSET(v);
980 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)(va + XSI_PSR_IC_OFS);
982 /* Remap the shared pages. */
983 set_one_rr (7UL << 61, PSCB(v,rrs[7]));
985 return 0;
986 }
988 /* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
989 #define SHADOW_COPY_CHUNK 1024
991 int shadow_mode_control(struct domain *d, xen_domctl_shadow_op_t *sc)
992 {
992 {
993 unsigned int op = sc->op;
994 int rc = 0;
995 int i;
996 //struct vcpu *v;
998 if (unlikely(d == current->domain)) {
999 gdprintk(XENLOG_INFO,
1000 "Don't try to do a shadow op on yourself!\n");
1001 return -EINVAL;
1002 }
1004 domain_pause(d);
1006 switch (op)
1007 {
1008 case XEN_DOMCTL_SHADOW_OP_OFF:
1009 if (shadow_mode_enabled (d)) {
1010 u64 *bm = d->arch.shadow_bitmap;
1012 /* Flush vhpt and tlb to restore dirty bit usage. */
1013 domain_flush_tlb_vhpt(d);
1015 /* Free bitmap. */
1016 d->arch.shadow_bitmap_size = 0;
1017 d->arch.shadow_bitmap = NULL;
1018 xfree(bm);
1019 }
1020 break;
1022 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
1023 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
1024 rc = -EINVAL;
1025 break;
1027 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
1028 if (shadow_mode_enabled(d)) {
1029 rc = -EINVAL;
1030 break;
1031 }
1033 atomic64_set(&d->arch.shadow_fault_count, 0);
1034 atomic64_set(&d->arch.shadow_dirty_count, 0);
1036 d->arch.shadow_bitmap_size =
1037 ((d->arch.convmem_end >> PAGE_SHIFT) +
1038 BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
1039 d->arch.shadow_bitmap = xmalloc_array(unsigned long,
1040 d->arch.shadow_bitmap_size / BITS_PER_LONG);
1041 if (d->arch.shadow_bitmap == NULL) {
1042 d->arch.shadow_bitmap_size = 0;
1043 rc = -ENOMEM;
1044 }
1045 else {
1046 memset(d->arch.shadow_bitmap, 0,
1047 d->arch.shadow_bitmap_size / 8);
1049 /* Flush vhpt and tlb to enable dirty bit
1050 virtualization. */
1051 domain_flush_tlb_vhpt(d);
1052 }
1053 break;
1055 case XEN_DOMCTL_SHADOW_OP_CLEAN:
1056 {
1057 int nbr_bytes;
1059 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
1060 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
1062 atomic64_set(&d->arch.shadow_fault_count, 0);
1063 atomic64_set(&d->arch.shadow_dirty_count, 0);
1065 if (guest_handle_is_null(sc->dirty_bitmap) ||
1066 (d->arch.shadow_bitmap == NULL)) {
1067 rc = -EINVAL;
1068 break;
1069 }
1071 if (sc->pages > d->arch.shadow_bitmap_size)
1072 sc->pages = d->arch.shadow_bitmap_size;
1074 nbr_bytes = (sc->pages + 7) / 8;
1076 for (i = 0; i < nbr_bytes; i += SHADOW_COPY_CHUNK) {
1077 int size = (nbr_bytes - i) > SHADOW_COPY_CHUNK ?
1078 SHADOW_COPY_CHUNK : nbr_bytes - i;
1080 if (copy_to_guest_offset(
1081 sc->dirty_bitmap, i,
1082 (uint8_t *)d->arch.shadow_bitmap + i,
1083 size)) {
1084 rc = -EFAULT;
1085 break;
1086 }
1088 memset((uint8_t *)d->arch.shadow_bitmap + i, 0, size);
1089 }
1091 break;
1092 }
1094 case XEN_DOMCTL_SHADOW_OP_PEEK:
1095 {
1096 unsigned long size;
1098 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
1099 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
1101 if (guest_handle_is_null(sc->dirty_bitmap) ||
1102 (d->arch.shadow_bitmap == NULL)) {
1103 rc = -EINVAL;
1104 break;
1105 }
1107 if (sc->pages > d->arch.shadow_bitmap_size)
1108 sc->pages = d->arch.shadow_bitmap_size;
1110 size = (sc->pages + 7) / 8;
1111 if (copy_to_guest(sc->dirty_bitmap,
1112 (uint8_t *)d->arch.shadow_bitmap, size)) {
1113 rc = -EFAULT;
1114 break;
1115 }
1116 break;
1117 }
1118 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
1119 sc->mb = 0;
1120 break;
1121 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
1122 if (sc->mb > 0) {
1123 BUG();
1124 rc = -ENOMEM;
1125 }
1126 break;
1127 default:
1128 rc = -EINVAL;
1129 break;
1130 }
1132 domain_unpause(d);
1134 return rc;
1135 }
1137 // remove following line if not privifying in memory
1138 //#define HAVE_PRIVIFY_MEMORY
1139 #ifndef HAVE_PRIVIFY_MEMORY
1140 #define privify_memory(x,y) do {} while(0)
1141 #endif
1143 static void __init loaddomainelfimage(struct domain *d, struct elf_binary *elf,
1144 unsigned long phys_load_offset)
1145 {
1146 const elf_phdr *phdr;
1147 int phnum, h, filesz, memsz;
1148 unsigned long elfaddr, dom_mpaddr, dom_imva;
1149 struct page_info *p;
1151 phnum = elf_uval(elf, elf->ehdr, e_phnum);
1152 for (h = 0; h < phnum; h++) {
1153 phdr = elf_phdr_by_index(elf, h);
1154 if (!elf_phdr_is_loadable(elf, phdr))
1155 continue;
1157 filesz = elf_uval(elf, phdr, p_filesz);
1158 memsz = elf_uval(elf, phdr, p_memsz);
1159 elfaddr = (unsigned long) elf->image + elf_uval(elf, phdr, p_offset);
1160 dom_mpaddr = elf_uval(elf, phdr, p_paddr);
1161 dom_mpaddr += phys_load_offset;
1163 while (memsz > 0) {
1164 p = assign_new_domain_page(d,dom_mpaddr);
1165 BUG_ON (unlikely(p == NULL));
1166 dom_imva = __va_ul(page_to_maddr(p));
1167 if (filesz > 0) {
1168 if (filesz >= PAGE_SIZE)
1169 copy_page((void *) dom_imva,
1170 (void *) elfaddr);
1171 else {
1172 // copy partial page
1173 memcpy((void *) dom_imva,
1174 (void *) elfaddr, filesz);
1175 // zero the rest of page
1176 memset((void *) dom_imva+filesz, 0,
1177 PAGE_SIZE-filesz);
1178 }
1179 //FIXME: This test for code seems to find a lot more than objdump -x does
1180 if (elf_uval(elf, phdr, p_flags) & PF_X) {
1181 privify_memory(dom_imva,PAGE_SIZE);
1182 flush_icache_range(dom_imva,
1183 dom_imva+PAGE_SIZE);
1184 }
1185 }
1186 else if (memsz > 0) {
1187 /* always zero out entire page */
1188 clear_page((void *) dom_imva);
1189 }
1190 memsz -= PAGE_SIZE;
1191 filesz -= PAGE_SIZE;
1192 elfaddr += PAGE_SIZE;
1193 dom_mpaddr += PAGE_SIZE;
1194 }
1195 }
1196 }
1198 void __init alloc_dom0(void)
1199 {
1200 /* Check dom0 size. */
1201 if (dom0_size < 4 * 1024 * 1024) {
1202 panic("dom0_mem is too small, boot aborted"
1203 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
1206 if (running_on_sim) {
1207 dom0_size = 128*1024*1024; //FIXME: Should be configurable
1208 }
1210 /* no need to allocate pages for now
1211 * pages are allocated by map_new_domain_page() via loaddomainelfimage()
1212 */
1213 }
1216 /*
1217 * Domain 0 has direct access to all devices. However, the major
1218 * point of this stub is to allow alloc_dom_mem to handle
1219 * requests with order > 0. Dom0 requires that bit set to
1220 * allocate memory for other domains.
1221 */
1222 static void __init physdev_init_dom0(struct domain *d)
1223 {
1224 if (iomem_permit_access(d, 0UL, ~0UL))
1225 BUG();
1226 if (irqs_permit_access(d, 0, NR_IRQS-1))
1227 BUG();
1228 if (ioports_permit_access(d, 0, 0xffff))
1229 BUG();
1230 }
1232 int __init construct_dom0(struct domain *d,
1233 unsigned long image_start, unsigned long image_len,
1234 unsigned long initrd_start, unsigned long initrd_len,
1235 char *cmdline)
1236 {
1237 int i, rc;
1238 start_info_t *si;
1239 dom0_vga_console_info_t *ci;
1240 struct vcpu *v = d->vcpu[0];
1241 unsigned long max_pages;
1243 struct elf_binary elf;
1244 struct elf_dom_parms parms;
1245 unsigned long p_start;
1246 unsigned long pkern_start;
1247 unsigned long pkern_entry;
1248 unsigned long pkern_end;
1249 unsigned long pinitrd_start = 0;
1250 unsigned long pstart_info;
1251 unsigned long phys_load_offset;
1252 struct page_info *start_info_page;
1253 unsigned long bp_mpa;
1254 struct ia64_boot_param *bp;
1256 //printk("construct_dom0: starting\n");
1258 /* Sanity! */
1259 BUG_ON(d != dom0);
1260 BUG_ON(d->vcpu[0] == NULL);
1261 BUG_ON(v->is_initialised);
1263 printk("*** LOADING DOMAIN 0 ***\n");
1265 max_pages = dom0_size / PAGE_SIZE;
1266 d->max_pages = max_pages;
1267 d->tot_pages = 0;
1269 rc = elf_init(&elf, (void*)image_start, image_len);
1270 if ( rc != 0 )
1271 return rc;
1272 #ifdef VERBOSE
1273 elf_set_verbose(&elf);
1274 #endif
1275 elf_parse_binary(&elf);
1276 if (0 != (elf_xen_parse(&elf, &parms)))
1277 return rc;
1279 /*
1280 * We cannot rely on the load address in the ELF headers to
1281 * determine the metaphysical address at which the image
1282 * is loaded. Patch the address to match the real one, based
1283 * on xen_pstart.
1284 */
1285 phys_load_offset = xen_pstart - elf.pstart;
1286 elf.pstart += phys_load_offset;
1287 elf.pend += phys_load_offset;
1288 parms.virt_kstart += phys_load_offset;
1289 parms.virt_kend += phys_load_offset;
1290 parms.virt_entry += phys_load_offset;
1292 printk(" Dom0 kernel: %s, %s, paddr 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
1293 elf_64bit(&elf) ? "64-bit" : "32-bit",
1294 elf_msb(&elf) ? "msb" : "lsb",
1295 elf.pstart, elf.pend);
1296 if (!elf_64bit(&elf) ||
1297 elf_uval(&elf, elf.ehdr, e_machine) != EM_IA_64) {
1298 printk("Incompatible kernel binary\n");
1299 return -1;
1300 }
1302 p_start = parms.virt_base;
1303 pkern_start = parms.virt_kstart;
1304 pkern_end = parms.virt_kend;
1305 pkern_entry = parms.virt_entry;
1307 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
1309 if ( (p_start & (PAGE_SIZE-1)) != 0 )
1310 {
1311 printk("Initial guest OS must load to a page boundary.\n");
1312 return -EINVAL;
1313 }
1315 pstart_info = PAGE_ALIGN(pkern_end);
1316 if(initrd_start && initrd_len){
1317 unsigned long offset;
1319 /* The next page aligned boundary after the start info.
1320 Note: EFI_PAGE_SHIFT = 12 <= PAGE_SHIFT */
1321 pinitrd_start = pstart_info + PAGE_SIZE;
1323 if ((pinitrd_start + initrd_len - phys_load_offset) >= dom0_size)
1324 panic("%s: not enough memory assigned to dom0", __func__);
1326 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
1327 struct page_info *p;
1328 p = assign_new_domain_page(d, pinitrd_start + offset);
1329 if (p == NULL)
1330 panic("%s: can't allocate page for initrd image", __func__);
1331 if (initrd_len < offset + PAGE_SIZE)
1332 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
1333 initrd_len - offset);
1334 else
1335 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
1336 }
1337 }
1339 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
1340 " Kernel image: %lx->%lx\n"
1341 " Entry address: %lx\n"
1342 " Init. ramdisk: %lx len %lx\n"
1343 " Start info.: %lx->%lx\n",
1344 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
1345 pstart_info, pstart_info + PAGE_SIZE);
1347 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
1348 {
1349 printk("Initial guest OS requires too much space\n"
1350 "(%luMB is greater than %luMB limit)\n",
1351 (pkern_end-pkern_start)>>20,
1352 (max_pages <<PAGE_SHIFT)>>20);
1353 return -ENOMEM;
1354 }
1356 // if high 3 bits of pkern start are non-zero, error
1358 // if pkern end is after end of metaphysical memory, error
1359 // (we should be able to deal with this... later)
1361 /* Mask all upcalls... */
1362 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
1363 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
1365 if (dom0_max_vcpus == 0)
1366 dom0_max_vcpus = MAX_VIRT_CPUS;
1367 if (dom0_max_vcpus > num_online_cpus())
1368 dom0_max_vcpus = num_online_cpus();
1369 if (dom0_max_vcpus > MAX_VIRT_CPUS)
1370 dom0_max_vcpus = MAX_VIRT_CPUS;
1372 printk ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
1373 for ( i = 1; i < dom0_max_vcpus; i++ )
1374 if (alloc_vcpu(d, i, i) == NULL)
1375 panic("Cannot allocate dom0 vcpu %d\n", i);
1377 /* Copy the OS image. */
1378 loaddomainelfimage(d, &elf, phys_load_offset);
1380 BUILD_BUG_ON(sizeof(start_info_t) + sizeof(dom0_vga_console_info_t) +
1381 sizeof(struct ia64_boot_param) > PAGE_SIZE);
1383 /* Set up start info area. */
1384 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
1385 start_info_page = assign_new_domain_page(d, pstart_info);
1386 if (start_info_page == NULL)
1387 panic("can't allocate start info page");
1388 si = page_to_virt(start_info_page);
1389 clear_page(si);
1390 snprintf(si->magic, sizeof(si->magic), "xen-%i.%i-ia64",
1391 xen_major_version(), xen_minor_version());
1392 si->nr_pages = max_pages;
1393 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
1395 printk("Dom0: 0x%lx\n", (u64)dom0);
1397 v->is_initialised = 1;
1398 clear_bit(_VPF_down, &v->pause_flags);
1400 /* Build firmware.
1401 Note: the Linux kernel reserves the memory used by start_info, so there is
1402 no need to remove it from MDT. */
1403 bp_mpa = pstart_info + sizeof(struct start_info);
1404 rc = dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
1405 if (rc != 0)
1406 return rc;
1408 /* Fill boot param. */
1409 strlcpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
1411 bp = (struct ia64_boot_param *)((unsigned char *)si +
1412 sizeof(start_info_t));
1413 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
1415 /* We assume console has reached the last line! */
1416 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
1417 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
1418 bp->console_info.orig_x = 0;
1419 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
1420 0 : bp->console_info.num_rows - 1;
1422 bp->initrd_start = pinitrd_start;
1423 bp->initrd_size = ia64_boot_param->initrd_size;
1425 ci = (dom0_vga_console_info_t *)((unsigned char *)si +
1426 sizeof(start_info_t) +
1427 sizeof(struct ia64_boot_param));
1429 if (fill_console_start_info(ci)) {
1430 si->console.dom0.info_off = sizeof(start_info_t) +
1431 sizeof(struct ia64_boot_param);
1432 si->console.dom0.info_size = sizeof(dom0_vga_console_info_t);
1433 }
1435 vcpu_init_regs (v);
1437 vcpu_regs(v)->r28 = bp_mpa;
1439 vcpu_regs (v)->cr_iip = pkern_entry;
1441 physdev_init_dom0(d);
1443 return 0;
1444 }
1446 void machine_restart(char * __unused)
1447 {
1448 console_start_sync();
1449 if (running_on_sim)
1450 printk ("machine_restart called. spinning...\n");
1451 else
1452 (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
1453 while(1);
1454 }
1456 extern void cpu_halt(void);
1458 void machine_halt(void)
1459 {
1460 console_start_sync();
1461 if (running_on_sim)
1462 printk ("machine_halt called. spinning...\n");
1463 else
1464 cpu_halt();
1465 while(1);
1466 }
1468 void sync_vcpu_execstate(struct vcpu *v)
1469 {
1470 // __ia64_save_fpu(v->arch._thread.fph);
1471 // if (VMX_DOMAIN(v))
1472 // vmx_save_state(v);
1473 // FIXME SMP: Anything else needed here for SMP?
1474 }
1476 /* This function is taken from xen/arch/x86/domain.c */
1477 long
1478 arch_do_vcpu_op(int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
1479 {
1480 long rc = 0;
1482 switch (cmd) {
1483 case VCPUOP_register_runstate_memory_area:
1484 {
1485 struct vcpu_register_runstate_memory_area area;
1486 struct vcpu_runstate_info runstate;
1488 rc = -EFAULT;
1489 if (copy_from_guest(&area, arg, 1))
1490 break;
1492 if (!guest_handle_okay(area.addr.h, 1))
1493 break;
1495 rc = 0;
1496 runstate_guest(v) = area.addr.h;
1498 if (v == current) {
1499 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
1500 } else {
1501 vcpu_runstate_get(v, &runstate);
1502 __copy_to_guest(runstate_guest(v), &runstate, 1);
1503 }
1505 break;
1506 }
1507 default:
1508 rc = -ENOSYS;
1509 break;
1510 }
1512 return rc;
1513 }
1515 static void __init parse_dom0_mem(char *s)
1516 {
1517 dom0_size = parse_size_and_unit(s, NULL);
1518 }
1519 custom_param("dom0_mem", parse_dom0_mem);
1521 /*
1522 * Helper function for the optimization code handling the identity
1523 * mapping feature.
1524 */
1525 static inline void
1526 optf_set_identity_mapping(unsigned long* mask, struct identity_mapping* im,
1527 struct xen_ia64_opt_feature* f)
1528 {
1529 if (f->on) {
1530 *mask |= f->cmd;
1531 im->pgprot = f->pgprot;
1532 im->key = f->key;
1533 } else {
1534 *mask &= ~(f->cmd);
1535 im->pgprot = 0;
1536 im->key = 0;
1537 }
1538 }
1540 /* Switch an optimization feature on or off. */
1541 int
1542 domain_opt_feature(struct xen_ia64_opt_feature* f)
1543 {
1544 struct opt_feature* optf = &(current->domain->arch.opt_feature);
1545 long rc = 0;
1547 switch (f->cmd) {
1548 case XEN_IA64_OPTF_IDENT_MAP_REG4:
1549 optf_set_identity_mapping(&optf->mask, &optf->im_reg4, f);
1550 break;
1551 case XEN_IA64_OPTF_IDENT_MAP_REG5:
1552 optf_set_identity_mapping(&optf->mask, &optf->im_reg5, f);
1553 break;
1554 case XEN_IA64_OPTF_IDENT_MAP_REG7:
1555 optf_set_identity_mapping(&optf->mask, &optf->im_reg7, f);
1556 break;
1557 default:
1558 printk("%s: unknown opt_feature: %ld\n", __func__, f->cmd);
1559 rc = -ENOSYS;
1560 break;
1561 }
1562 return rc;
1563 }