ia64/xen-unstable

view xen/arch/ia64/xen/domain.c @ 13608:30af6cfdb05c

Make domctl/sysctl interfaces 32-/64-bit invariant.
This kills off a fair amount of unpleasant CONFIG_COMPAT shimming and
avoids needing to keep the compat paths in sync as these interfaces
continue to develop.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Wed Jan 24 16:33:19 2007 +0000 (2007-01-24)
parents 7d8670a30445
children 3c9926aadec5
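
The interface change described in the commit message rests on giving every domctl/sysctl field the same size and alignment no matter whether the caller is a 32- or 64-bit guest, so a single handler can serve both and no compat translation layer is needed. A minimal sketch of that idea follows; the type and struct names are illustrative only, not the actual Xen definitions.

/* Illustrative sketch only -- not the real Xen interface definitions.
 * A structure is 32-/64-bit invariant when no member's size or alignment
 * depends on the ABI: fixed-width integers replace longs, and 64-bit
 * members are explicitly 8-byte aligned, because 32-bit compilers would
 * otherwise align uint64_t to only 4 bytes and change the layout. */
#include <stdint.h>

typedef uint64_t example_aligned_u64 __attribute__((aligned(8)));

struct example_op {
    uint32_t            cmd;     /* same size on both ABIs */
    uint32_t            domain;
    example_aligned_u64 gpa;     /* was "unsigned long": 4 vs 8 bytes */
    example_aligned_u64 buffer;  /* guest pointer passed as a 64-bit value
                                    instead of a raw pointer */
};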
line source
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 *
12 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
13 * VA Linux Systems Japan K.K.
14 * dom0 vp model support
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/errno.h>
21 #include <xen/sched.h>
22 #include <xen/smp.h>
23 #include <xen/delay.h>
24 #include <xen/softirq.h>
25 #include <xen/mm.h>
26 #include <xen/iocap.h>
27 #include <asm/asm-xsi-offsets.h>
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <asm/processor.h>
31 #include <xen/event.h>
32 #include <xen/console.h>
33 #include <xen/version.h>
34 #include <xen/elf.h>
35 #include <asm/pgalloc.h>
36 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
37 #include <asm/vcpu.h> /* for function declarations */
38 #include <public/xen.h>
39 #include <xen/domain.h>
40 #include <asm/vmx.h>
41 #include <asm/vmx_vcpu.h>
42 #include <asm/vmx_vpd.h>
43 #include <asm/vmx_phy_mode.h>
44 #include <asm/vhpt.h>
45 #include <asm/tlbflush.h>
46 #include <asm/regionreg.h>
47 #include <asm/dom_fw.h>
48 #include <asm/shadow.h>
49 #include <xen/guest_access.h>
50 #include <asm/tlb_track.h>
51 #include <asm/perfmon.h>
53 unsigned long dom0_size = 512*1024*1024;
55 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
56 static unsigned int dom0_max_vcpus = 1;
57 integer_param("dom0_max_vcpus", dom0_max_vcpus);
59 extern unsigned long running_on_sim;
61 extern char dom0_command_line[];
63 /* forward declaration */
64 static void init_switch_stack(struct vcpu *v);
66 /* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
67 This is a Xen virtual address. */
68 DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
69 DEFINE_PER_CPU(int *, current_psr_ic_addr);
71 #include <xen/sched-if.h>
73 static void
74 ia64_disable_vhpt_walker(void)
75 {
76 // Disable the VHPT walker. Without this, ia64_new_rr7() might cause a VHPT
77 // fault because it flushes dtr[IA64_TR_VHPT].
78 // (VHPT_SIZE_LOG2 << 2) is used only to avoid a
79 // Reserved Register/Field fault.
80 ia64_set_pta(VHPT_SIZE_LOG2 << 2);
81 }
83 static void flush_vtlb_for_context_switch(struct vcpu* prev, struct vcpu* next)
84 {
85 int cpu = smp_processor_id();
86 int last_vcpu_id, last_processor;
88 if (!is_idle_domain(prev->domain))
89 tlbflush_update_time
90 (&prev->domain->arch.last_vcpu[cpu].tlbflush_timestamp,
91 tlbflush_current_time());
93 if (is_idle_domain(next->domain))
94 return;
96 last_vcpu_id = next->domain->arch.last_vcpu[cpu].vcpu_id;
97 last_processor = next->arch.last_processor;
99 next->domain->arch.last_vcpu[cpu].vcpu_id = next->vcpu_id;
100 next->arch.last_processor = cpu;
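// The check below catches two cases: a different vcpu of this domain ran on
// this physical cpu since this vcpu last did, or this same vcpu last ran on
// a different physical cpu. In either case stale translations may remain
// here, and the code below decides what (if anything) must be purged.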
102 if ((last_vcpu_id != next->vcpu_id &&
103 last_vcpu_id != INVALID_VCPU_ID) ||
104 (last_vcpu_id == next->vcpu_id &&
105 last_processor != cpu &&
106 last_processor != INVALID_PROCESSOR)) {
107 #ifdef CONFIG_XEN_IA64_TLBFLUSH_CLOCK
108 u32 last_tlbflush_timestamp =
109 next->domain->arch.last_vcpu[cpu].tlbflush_timestamp;
110 #endif
111 int vhpt_is_flushed = 0;
113 // If the vTLB implementation is changed,
114 // the following must be updated as well.
115 if (VMX_DOMAIN(next)) {
116 // The vTLB for a VT-i domain is currently per vcpu,
117 // so no flushing is needed.
118 } else if (HAS_PERVCPU_VHPT(next->domain)) {
119 // nothing to do
120 } else {
121 if (NEED_FLUSH(__get_cpu_var(vhpt_tlbflush_timestamp),
122 last_tlbflush_timestamp)) {
123 local_vhpt_flush();
124 vhpt_is_flushed = 1;
125 }
126 }
127 if (vhpt_is_flushed || NEED_FLUSH(__get_cpu_var(tlbflush_time),
128 last_tlbflush_timestamp)) {
129 local_flush_tlb_all();
130 perfc_incrc(tlbflush_clock_cswitch_purge);
131 } else {
132 perfc_incrc(tlbflush_clock_cswitch_skip);
133 }
134 perfc_incrc(flush_vtlb_for_context_switch);
135 }
136 }
138 void schedule_tail(struct vcpu *prev)
139 {
140 extern char ia64_ivt;
141 context_saved(prev);
143 ia64_disable_vhpt_walker();
144 if (VMX_DOMAIN(current)) {
145 vmx_do_launch(current);
146 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
147 current->processor);
148 } else {
149 ia64_set_iva(&ia64_ivt);
150 load_region_regs(current);
151 ia64_set_pta(vcpu_pta(current));
152 vcpu_load_kernel_regs(current);
153 __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
154 shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
155 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
156 (current->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
157 migrate_timer(&current->arch.hlt_timer, current->processor);
158 }
159 flush_vtlb_for_context_switch(prev, current);
160 }
162 void context_switch(struct vcpu *prev, struct vcpu *next)
163 {
164 uint64_t spsr;
166 local_irq_save(spsr);
168 if (!is_idle_domain(prev->domain))
169 __ia64_save_fpu(prev->arch._thread.fph);
170 if (!is_idle_domain(next->domain))
171 __ia64_load_fpu(next->arch._thread.fph);
173 if (VMX_DOMAIN(prev)) {
174 vmx_save_state(prev);
175 if (!VMX_DOMAIN(next)) {
176 /* VMX domains can change the physical cr.dcr.
177 * Restore default to prevent leakage. */
178 ia64_setreg(_IA64_REG_CR_DCR, (IA64_DCR_DP | IA64_DCR_DK
179 | IA64_DCR_DX | IA64_DCR_DR | IA64_DCR_PP
180 | IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC));
181 }
182 }
183 if (VMX_DOMAIN(next))
184 vmx_load_state(next);
186 ia64_disable_vhpt_walker();
187 /*ia64_psr(ia64_task_regs(next))->dfh = !ia64_is_local_fpu_owner(next);*/
188 prev = ia64_switch_to(next);
190 /* Note: ia64_switch_to does not return here at vcpu initialization. */
192 //cpu_set(smp_processor_id(), current->domain->domain_dirty_cpumask);
194 if (VMX_DOMAIN(current)){
195 vmx_load_all_rr(current);
196 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
197 current->processor);
198 } else {
199 struct domain *nd;
200 extern char ia64_ivt;
202 ia64_set_iva(&ia64_ivt);
204 nd = current->domain;
205 if (!is_idle_domain(nd)) {
206 load_region_regs(current);
207 ia64_set_pta(vcpu_pta(current));
208 vcpu_load_kernel_regs(current);
209 vcpu_set_next_timer(current);
210 if (vcpu_timer_expired(current))
211 vcpu_pend_timer(current);
212 __ia64_per_cpu_var(current_psr_i_addr) = &nd->shared_info->
213 vcpu_info[current->vcpu_id].evtchn_upcall_mask;
214 __ia64_per_cpu_var(current_psr_ic_addr) =
215 (int *)(nd->arch.shared_info_va + XSI_PSR_IC_OFS);
216 } else {
217 /* When switching to the idle domain, we only need to disable the vhpt
218 * walker. All accesses that happen within the idle context will then
219 * be handled by TR mappings and the identity mapping.
220 */
221 __ia64_per_cpu_var(current_psr_i_addr) = NULL;
222 __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
223 }
224 }
225 local_irq_restore(spsr);
226 flush_vtlb_for_context_switch(prev, current);
227 context_saved(prev);
228 }
230 void continue_running(struct vcpu *same)
231 {
232 /* nothing to do */
233 }
235 #ifdef CONFIG_PERFMON
236 static int pal_halt = 1;
237 static int can_do_pal_halt = 1;
239 static int __init nohalt_setup(char * str)
240 {
241 pal_halt = can_do_pal_halt = 0;
242 return 1;
243 }
244 __setup("nohalt", nohalt_setup);
246 void
247 update_pal_halt_status(int status)
248 {
249 can_do_pal_halt = pal_halt && status;
250 }
251 #else
252 #define can_do_pal_halt (1)
253 #endif
255 static void default_idle(void)
256 {
257 local_irq_disable();
258 if ( !softirq_pending(smp_processor_id()) ) {
259 if (can_do_pal_halt)
260 safe_halt();
261 else
262 cpu_relax();
263 }
264 local_irq_enable();
265 }
267 static void continue_cpu_idle_loop(void)
268 {
269 for ( ; ; )
270 {
271 #ifdef IA64
272 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
273 #else
274 irq_stat[cpu].idle_timestamp = jiffies;
275 #endif
276 while ( !softirq_pending(smp_processor_id()) )
277 default_idle();
278 raise_softirq(SCHEDULE_SOFTIRQ);
279 do_softirq();
280 }
281 }
283 void startup_cpu_idle_loop(void)
284 {
285 /* Just some sanity to ensure that the scheduler is set up okay. */
286 ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
287 raise_softirq(SCHEDULE_SOFTIRQ);
289 continue_cpu_idle_loop();
290 }
292 /* Compile-time test: fails to build if get_order(sizeof(mapped_regs_t)) !=
293 * get_order_from_shift(XMAPPEDREGS_SHIFT).
294 */
295 #if !(((1 << (XMAPPEDREGS_SHIFT - 1)) < MAPPED_REGS_T_SIZE) && \
296 (MAPPED_REGS_T_SIZE < (1 << (XMAPPEDREGS_SHIFT + 1))))
297 # error "XMAPPEDREGS_SHIFT doesn't match sizeof(mapped_regs_t)."
298 #endif
300 void hlt_timer_fn(void *data)
301 {
302 struct vcpu *v = data;
303 vcpu_unblock(v);
304 }
306 void relinquish_vcpu_resources(struct vcpu *v)
307 {
308 if (HAS_PERVCPU_VHPT(v->domain))
309 pervcpu_vhpt_free(v);
310 if (v->arch.privregs != NULL) {
311 free_xenheap_pages(v->arch.privregs,
312 get_order_from_shift(XMAPPEDREGS_SHIFT));
313 v->arch.privregs = NULL;
314 }
315 kill_timer(&v->arch.hlt_timer);
316 }
318 struct vcpu *alloc_vcpu_struct(void)
319 {
320 struct vcpu *v;
321 struct thread_info *ti;
322 static int first_allocation = 1;
324 if (first_allocation) {
325 first_allocation = 0;
326 /* Keep idle vcpu0 statically allocated at compile time, because
327 * some code inherited from Linux still requires it in the early phase.
328 */
329 return idle_vcpu[0];
330 }
332 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
333 return NULL;
334 memset(v, 0, sizeof(*v));
336 ti = alloc_thread_info(v);
337 /* Zero thread_info to reset some important fields, such as
338 * preempt_count.
339 */
340 memset(ti, 0, sizeof(struct thread_info));
341 init_switch_stack(v);
343 return v;
344 }
346 void free_vcpu_struct(struct vcpu *v)
347 {
348 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
349 }
351 int vcpu_initialise(struct vcpu *v)
352 {
353 struct domain *d = v->domain;
355 if (!is_idle_domain(d)) {
356 v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
357 v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
358 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
359 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;
361 /* Is this correct?
362 It depends on how the domain uses RIDs.
364 A domain may share RIDs among its processors (e.g. when it has a
365 global VHPT). In that case we should also share RIDs
366 among vcpus, and the RID range should be the same.
368 However, a domain may use per-cpu RID allocation. In
369 that case we don't want to share RIDs among vcpus, but we may
370 do so if two vcpus are on the same cpu... */
372 v->arch.starting_rid = d->arch.starting_rid;
373 v->arch.ending_rid = d->arch.ending_rid;
374 v->arch.breakimm = d->arch.breakimm;
375 v->arch.last_processor = INVALID_PROCESSOR;
376 }
378 if (!VMX_DOMAIN(v))
379 init_timer(&v->arch.hlt_timer, hlt_timer_fn, v,
380 first_cpu(cpu_online_map));
382 return 0;
383 }
385 int vcpu_late_initialise(struct vcpu *v)
386 {
387 struct domain *d = v->domain;
388 int rc, order, i;
390 if (HAS_PERVCPU_VHPT(d)) {
391 rc = pervcpu_vhpt_alloc(v);
392 if (rc != 0)
393 return rc;
394 }
396 /* Create privregs page. */
397 order = get_order_from_shift(XMAPPEDREGS_SHIFT);
398 v->arch.privregs = alloc_xenheap_pages(order);
399 BUG_ON(v->arch.privregs == NULL);
400 memset(v->arch.privregs, 0, 1 << XMAPPEDREGS_SHIFT);
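// Share the privregs pages with the guest (writable) so it can access its
// own mapped-registers area directly.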
401 for (i = 0; i < (1 << order); i++)
402 share_xen_page_with_guest(virt_to_page(v->arch.privregs) + i,
403 d, XENSHARE_writable);
404 /*
405 * XXX IA64_XMAPPEDREGS_PADDR
406 * Assign these pages to the guest pseudo-physical address
407 * space so that dom0 can map them by gmfn.
408 * This is necessary for domain save, restore and dump-core.
409 */
410 for (i = 0; i < XMAPPEDREGS_SIZE; i += PAGE_SIZE)
411 assign_domain_page(d, IA64_XMAPPEDREGS_PADDR(v->vcpu_id) + i,
412 virt_to_maddr(v->arch.privregs + i));
414 tlbflush_update_time(&v->arch.tlbflush_timestamp,
415 tlbflush_current_time());
417 return 0;
418 }
420 void vcpu_destroy(struct vcpu *v)
421 {
422 if (v->domain->arch.is_vti)
423 vmx_relinquish_vcpu_resources(v);
424 else
425 relinquish_vcpu_resources(v);
426 }
428 static void init_switch_stack(struct vcpu *v)
429 {
430 struct pt_regs *regs = vcpu_regs (v);
431 struct switch_stack *sw = (struct switch_stack *) regs - 1;
432 extern void ia64_ret_from_clone;
434 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
435 sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
436 sw->b0 = (unsigned long) &ia64_ret_from_clone;
437 sw->ar_fpsr = FPSR_DEFAULT;
438 v->arch._thread.ksp = (unsigned long) sw - 16;
439 // Stay on the kernel stack, because interrupts may still arrive!
440 // ia64_ret_from_clone switches to the user stack.
441 v->arch._thread.on_ustack = 0;
442 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
443 }
445 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
446 static int opt_pervcpu_vhpt = 1;
447 integer_param("pervcpu_vhpt", opt_pervcpu_vhpt);
448 #endif
450 int arch_domain_create(struct domain *d)
451 {
452 int i;
454 // the following will eventually need to be negotiated dynamically
455 d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
456 d->arch.breakimm = 0x1000;
457 for (i = 0; i < NR_CPUS; i++) {
458 d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
459 }
461 if (is_idle_domain(d))
462 return 0;
464 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
465 d->arch.has_pervcpu_vhpt = opt_pervcpu_vhpt;
466 dprintk(XENLOG_WARNING, "%s:%d domain %d pervcpu_vhpt %d\n",
467 __func__, __LINE__, d->domain_id, d->arch.has_pervcpu_vhpt);
468 #endif
469 if (tlb_track_create(d) < 0)
470 goto fail_nomem1;
471 d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
472 if (d->shared_info == NULL)
473 goto fail_nomem;
474 memset(d->shared_info, 0, XSI_SIZE);
475 for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
476 share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
477 d, XENSHARE_writable);
479 /* We may also need an emulation RID for region 4, though it's unlikely
480 * that a guest will issue uncacheable accesses in metaphysical mode. But
481 * keeping that information here may be saner.
482 */
483 if (!allocate_rid_range(d,0))
484 goto fail_nomem;
486 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
488 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
489 goto fail_nomem;
491 d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
492 RANGESETF_prettyprint_hex);
494 printk ("arch_domain_create: domain=%p\n", d);
495 return 0;
497 fail_nomem:
498 tlb_track_destroy(d);
499 fail_nomem1:
500 if (d->arch.mm.pgd != NULL)
501 pgd_free(d->arch.mm.pgd);
502 if (d->shared_info != NULL)
503 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
504 return -ENOMEM;
505 }
507 void arch_domain_destroy(struct domain *d)
508 {
509 mm_final_teardown(d);
511 if (d->shared_info != NULL)
512 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
514 tlb_track_destroy(d);
516 /* Clear vTLB for the next domain. */
517 domain_flush_tlb_vhpt(d);
519 deallocate_rid_range(d);
520 }
522 void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
523 {
524 int i;
525 struct vcpu_extra_regs *er = &c.nat->extra_regs;
527 c.nat->user_regs = *vcpu_regs(v);
528 c.nat->privregs_pfn = get_gpfn_from_mfn(virt_to_maddr(v->arch.privregs) >>
529 PAGE_SHIFT);
531 /* Fill extra regs. */
532 for (i = 0; i < 8; i++) {
533 er->itrs[i].pte = v->arch.itrs[i].pte.val;
534 er->itrs[i].itir = v->arch.itrs[i].itir;
535 er->itrs[i].vadr = v->arch.itrs[i].vadr;
536 er->itrs[i].rid = v->arch.itrs[i].rid;
537 }
538 for (i = 0; i < 8; i++) {
539 er->dtrs[i].pte = v->arch.dtrs[i].pte.val;
540 er->dtrs[i].itir = v->arch.dtrs[i].itir;
541 er->dtrs[i].vadr = v->arch.dtrs[i].vadr;
542 er->dtrs[i].rid = v->arch.dtrs[i].rid;
543 }
544 er->event_callback_ip = v->arch.event_callback_ip;
545 er->dcr = v->arch.dcr;
546 er->iva = v->arch.iva;
547 }
549 int arch_set_info_guest(struct vcpu *v, vcpu_guest_context_u c)
550 {
551 struct pt_regs *regs = vcpu_regs (v);
552 struct domain *d = v->domain;
553 int rc;
555 *regs = c.nat->user_regs;
557 if (!d->arch.is_vti) {
558 /* domain runs at PL2/3 */
559 regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT;
560 regs->ar_rsc |= (2 << 2); /* force PL2/3 */
561 }
563 if (c.nat->flags & VGCF_EXTRA_REGS) {
564 int i;
565 struct vcpu_extra_regs *er = &c.nat->extra_regs;
567 for (i = 0; i < 8; i++) {
568 vcpu_set_itr(v, i, er->itrs[i].pte,
569 er->itrs[i].itir,
570 er->itrs[i].vadr,
571 er->itrs[i].rid);
572 }
573 for (i = 0; i < 8; i++) {
574 vcpu_set_dtr(v, i,
575 er->dtrs[i].pte,
576 er->dtrs[i].itir,
577 er->dtrs[i].vadr,
578 er->dtrs[i].rid);
579 }
580 v->arch.event_callback_ip = er->event_callback_ip;
581 v->arch.dcr = er->dcr;
582 v->arch.iva = er->iva;
583 }
585 if (test_bit(_VCPUF_initialised, &v->vcpu_flags))
586 return 0;
588 if (d->arch.is_vti)
589 vmx_final_setup_guest(v);
590 else {
591 rc = vcpu_late_initialise(v);
592 if (rc != 0)
593 return rc;
594 VCPU(v, interrupt_mask_addr) =
595 (unsigned char *) d->arch.shared_info_va +
596 INT_ENABLE_OFFSET(v);
597 }
599 /* This overrides some registers. */
600 vcpu_init_regs(v);
602 /* Don't redo final setup */
603 set_bit(_VCPUF_initialised, &v->vcpu_flags);
604 return 0;
605 }
607 static void relinquish_memory(struct domain *d, struct list_head *list)
608 {
609 struct list_head *ent;
610 struct page_info *page;
611 #ifndef __ia64__
612 unsigned long x, y;
613 #endif
615 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
616 spin_lock_recursive(&d->page_alloc_lock);
617 ent = list->next;
618 while ( ent != list )
619 {
620 page = list_entry(ent, struct page_info, list);
621 /* Grab a reference to the page so it won't disappear from under us. */
622 if ( unlikely(!get_page(page, d)) )
623 {
624 /* Couldn't get a reference -- someone is freeing this page. */
625 ent = ent->next;
626 continue;
627 }
629 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
630 put_page_and_type(page);
632 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
633 put_page(page);
635 #ifndef __ia64__
636 /*
637 * Forcibly invalidate base page tables at this point to break circular
638 * 'linear page table' references. This is okay because MMU structures
639 * are not shared across domains and this domain is now dead. Thus base
640 * tables are not in use so a non-zero count means circular reference.
641 */
642 y = page->u.inuse.type_info;
643 for ( ; ; )
644 {
645 x = y;
646 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
647 (PGT_base_page_table|PGT_validated)) )
648 break;
650 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
651 if ( likely(y == x) )
652 {
653 free_page_type(page, PGT_base_page_table);
654 break;
655 }
656 }
657 #endif
659 /* Follow the list chain and /then/ potentially free the page. */
660 ent = ent->next;
661 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
662 put_page(page);
663 }
665 spin_unlock_recursive(&d->page_alloc_lock);
666 }
668 void domain_relinquish_resources(struct domain *d)
669 {
670 /* Relinquish guest resources for VT-i domain. */
671 if (d->vcpu[0] && VMX_DOMAIN(d->vcpu[0]))
672 vmx_relinquish_guest_resources(d);
674 /* Tear down shadow mode stuff. */
675 mm_teardown(d);
677 /* Relinquish every page of memory. */
678 relinquish_memory(d, &d->xenpage_list);
679 relinquish_memory(d, &d->page_list);
681 if (d->arch.is_vti && d->arch.sal_data)
682 xfree(d->arch.sal_data);
684 /* Free page used by xen oprofile buffer */
685 free_xenoprof_pages(d);
686 }
688 unsigned long
689 domain_set_shared_info_va (unsigned long va)
690 {
691 struct vcpu *v = current;
692 struct domain *d = v->domain;
694 /* Check virtual address:
695 must belong to region 7,
696 must be 64Kb aligned,
697 must not be within Xen virtual space. */
698 if ((va >> 61) != 7
699 || (va & 0xffffUL) != 0
700 || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
701 panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);
703 /* Note: this doesn't work well if other cpus are already running.
704 However this is part of the spec :-) */
705 printk ("Domain set shared_info_va to 0x%016lx\n", va);
706 d->arch.shared_info_va = va;
708 VCPU(v, interrupt_mask_addr) = (unsigned char *)va +
709 INT_ENABLE_OFFSET(v);
711 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)(va + XSI_PSR_IC_OFS);
713 /* Remap the shared pages. */
714 set_one_rr (7UL << 61, PSCB(v,rrs[7]));
716 return 0;
717 }
719 /* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
720 #define SHADOW_COPY_CHUNK 1024
722 int shadow_mode_control(struct domain *d, xen_domctl_shadow_op_t *sc)
723 {
724 unsigned int op = sc->op;
725 int rc = 0;
726 int i;
727 //struct vcpu *v;
729 if (unlikely(d == current->domain)) {
730 gdprintk(XENLOG_INFO,
731 "Don't try to do a shadow op on yourself!\n");
732 return -EINVAL;
733 }
735 domain_pause(d);
737 switch (op)
738 {
739 case XEN_DOMCTL_SHADOW_OP_OFF:
740 if (shadow_mode_enabled (d)) {
741 u64 *bm = d->arch.shadow_bitmap;
743 /* Flush vhpt and tlb to restore dirty bit usage. */
744 domain_flush_tlb_vhpt(d);
746 /* Free bitmap. */
747 d->arch.shadow_bitmap_size = 0;
748 d->arch.shadow_bitmap = NULL;
749 xfree(bm);
750 }
751 break;
753 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
754 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
755 rc = -EINVAL;
756 break;
758 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
759 if (shadow_mode_enabled(d)) {
760 rc = -EINVAL;
761 break;
762 }
764 atomic64_set(&d->arch.shadow_fault_count, 0);
765 atomic64_set(&d->arch.shadow_dirty_count, 0);
767 d->arch.shadow_bitmap_size =
768 ((d->arch.convmem_end >> PAGE_SHIFT) +
769 BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
770 d->arch.shadow_bitmap = xmalloc_array(unsigned long,
771 d->arch.shadow_bitmap_size / BITS_PER_LONG);
772 if (d->arch.shadow_bitmap == NULL) {
773 d->arch.shadow_bitmap_size = 0;
774 rc = -ENOMEM;
775 }
776 else {
777 memset(d->arch.shadow_bitmap, 0,
778 d->arch.shadow_bitmap_size / 8);
780 /* Flush vhpt and tlb to enable dirty bit
781 virtualization. */
782 domain_flush_tlb_vhpt(d);
783 }
784 break;
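/* CLEAN (below) copies the dirty bitmap to the caller and then clears it;
PEEK (further down) copies it without clearing. */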
786 case XEN_DOMCTL_SHADOW_OP_CLEAN:
787 {
788 int nbr_bytes;
790 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
791 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
793 atomic64_set(&d->arch.shadow_fault_count, 0);
794 atomic64_set(&d->arch.shadow_dirty_count, 0);
796 if (guest_handle_is_null(sc->dirty_bitmap) ||
797 (d->arch.shadow_bitmap == NULL)) {
798 rc = -EINVAL;
799 break;
800 }
802 if (sc->pages > d->arch.shadow_bitmap_size)
803 sc->pages = d->arch.shadow_bitmap_size;
805 nbr_bytes = (sc->pages + 7) / 8;
807 for (i = 0; i < nbr_bytes; i += SHADOW_COPY_CHUNK) {
808 int size = (nbr_bytes - i) > SHADOW_COPY_CHUNK ?
809 SHADOW_COPY_CHUNK : nbr_bytes - i;
811 if (copy_to_guest_offset(
812 sc->dirty_bitmap, i,
813 (uint8_t *)d->arch.shadow_bitmap + i,
814 size)) {
815 rc = -EFAULT;
816 break;
817 }
819 memset((uint8_t *)d->arch.shadow_bitmap + i, 0, size);
820 }
822 break;
823 }
825 case XEN_DOMCTL_SHADOW_OP_PEEK:
826 {
827 unsigned long size;
829 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
830 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
832 if (guest_handle_is_null(sc->dirty_bitmap) ||
833 (d->arch.shadow_bitmap == NULL)) {
834 rc = -EINVAL;
835 break;
836 }
838 if (sc->pages > d->arch.shadow_bitmap_size)
839 sc->pages = d->arch.shadow_bitmap_size;
841 size = (sc->pages + 7) / 8;
842 if (copy_to_guest(sc->dirty_bitmap,
843 (uint8_t *)d->arch.shadow_bitmap, size)) {
844 rc = -EFAULT;
845 break;
846 }
847 break;
848 }
849 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
850 sc->mb = 0;
851 break;
852 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
853 if (sc->mb > 0) {
854 BUG();
855 rc = -ENOMEM;
856 }
857 break;
858 default:
859 rc = -EINVAL;
860 break;
861 }
863 domain_unpause(d);
865 return rc;
866 }
868 // remove following line if not privifying in memory
869 //#define HAVE_PRIVIFY_MEMORY
870 #ifndef HAVE_PRIVIFY_MEMORY
871 #define privify_memory(x,y) do {} while(0)
872 #endif
874 // see arch/x86/xxx/domain_build.c
875 int elf_sanity_check(const Elf_Ehdr *ehdr)
876 {
877 if (!(IS_ELF(*ehdr)))
878 {
879 printk("DOM0 image is not a Xen-compatible Elf image.\n");
880 return 0;
881 }
882 return 1;
883 }
885 static void loaddomainelfimage(struct domain *d, unsigned long image_start)
886 {
887 char *elfbase = (char *) image_start;
888 Elf_Ehdr ehdr;
889 Elf_Phdr phdr;
890 int h, filesz, memsz;
891 unsigned long elfaddr, dom_mpaddr, dom_imva;
892 struct page_info *p;
894 memcpy(&ehdr, (void *) image_start, sizeof(Elf_Ehdr));
895 for ( h = 0; h < ehdr.e_phnum; h++ ) {
896 memcpy(&phdr,
897 elfbase + ehdr.e_phoff + (h*ehdr.e_phentsize),
898 sizeof(Elf_Phdr));
899 if ((phdr.p_type != PT_LOAD))
900 continue;
902 filesz = phdr.p_filesz;
903 memsz = phdr.p_memsz;
904 elfaddr = (unsigned long) elfbase + phdr.p_offset;
905 dom_mpaddr = phdr.p_paddr;
907 while (memsz > 0) {
908 p = assign_new_domain_page(d,dom_mpaddr);
909 BUG_ON (unlikely(p == NULL));
910 dom_imva = __va_ul(page_to_maddr(p));
911 if (filesz > 0) {
912 if (filesz >= PAGE_SIZE)
913 memcpy((void *) dom_imva,
914 (void *) elfaddr,
915 PAGE_SIZE);
916 else {
917 // copy partial page
918 memcpy((void *) dom_imva,
919 (void *) elfaddr, filesz);
920 // zero the rest of page
921 memset((void *) dom_imva+filesz, 0,
922 PAGE_SIZE-filesz);
923 }
924 //FIXME: This test for code seems to find a lot more than objdump -x does
925 if (phdr.p_flags & PF_X) {
926 privify_memory(dom_imva,PAGE_SIZE);
927 flush_icache_range(dom_imva,
928 dom_imva+PAGE_SIZE);
929 }
930 }
931 else if (memsz > 0) {
932 /* always zero out entire page */
933 memset((void *) dom_imva, 0, PAGE_SIZE);
934 }
935 memsz -= PAGE_SIZE;
936 filesz -= PAGE_SIZE;
937 elfaddr += PAGE_SIZE;
938 dom_mpaddr += PAGE_SIZE;
939 }
940 }
941 }
943 void alloc_dom0(void)
944 {
945 /* Check dom0 size. */
946 if (dom0_size < 4 * 1024 * 1024) {
947 panic("dom0_mem is too small, boot aborted"
948 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
949 }
951 if (running_on_sim) {
952 dom0_size = 128*1024*1024; //FIXME: Should be configurable
953 }
955 /* no need to allocate pages for now
956 * pages are allocated by map_new_domain_page() via loaddomainelfimage()
957 */
958 }
961 /*
962 * Domain 0 has unrestricted direct access to all devices. However,
963 * the main point of this stub is to allow alloc_dom_mem to handle
964 * order > 0 requests; dom0 requires that capability in order to
965 * allocate memory for other domains.
966 */
967 static void physdev_init_dom0(struct domain *d)
968 {
969 if (iomem_permit_access(d, 0UL, ~0UL))
970 BUG();
971 if (irqs_permit_access(d, 0, NR_IRQS-1))
972 BUG();
973 if (ioports_permit_access(d, 0, 0xffff))
974 BUG();
975 }
977 int construct_dom0(struct domain *d,
978 unsigned long image_start, unsigned long image_len,
979 unsigned long initrd_start, unsigned long initrd_len,
980 char *cmdline)
981 {
982 int i, rc;
983 start_info_t *si;
984 dom0_vga_console_info_t *ci;
985 struct vcpu *v = d->vcpu[0];
986 unsigned long max_pages;
988 struct domain_setup_info dsi;
989 unsigned long p_start;
990 unsigned long pkern_start;
991 unsigned long pkern_entry;
992 unsigned long pkern_end;
993 unsigned long pinitrd_start = 0;
994 unsigned long pstart_info;
995 struct page_info *start_info_page;
996 unsigned long bp_mpa;
997 struct ia64_boot_param *bp;
999 #ifdef VALIDATE_VT
1000 unsigned int vmx_dom0 = 0;
1001 unsigned long mfn;
1002 struct page_info *page = NULL;
1003 #endif
1005 //printk("construct_dom0: starting\n");
1007 /* Sanity! */
1008 BUG_ON(d != dom0);
1009 BUG_ON(d->vcpu[0] == NULL);
1010 BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
1012 memset(&dsi, 0, sizeof(struct domain_setup_info));
1014 printk("*** LOADING DOMAIN 0 ***\n");
1016 max_pages = dom0_size / PAGE_SIZE;
1017 d->max_pages = max_pages;
1018 d->tot_pages = 0;
1019 dsi.image_addr = (unsigned long)image_start;
1020 dsi.image_len = image_len;
1021 rc = parseelfimage(&dsi);
1022 if ( rc != 0 )
1023 return rc;
1025 #ifdef VALIDATE_VT
1026 /* Temp workaround */
1027 if (running_on_sim)
1028 dsi.xen_section_string = (char *)1;
1030 /* Check whether dom0 is vti domain */
1031 if ((!vmx_enabled) && !dsi.xen_section_string) {
1032 printk("Lack of hardware support for unmodified vmx dom0\n");
1033 panic("");
1034 }
1036 if (vmx_enabled && !dsi.xen_section_string) {
1037 printk("Dom0 is vmx domain!\n");
1038 vmx_dom0 = 1;
1039 }
1040 #endif
1042 p_start = dsi.v_start;
1043 pkern_start = dsi.v_kernstart;
1044 pkern_end = dsi.v_kernend;
1045 pkern_entry = dsi.v_kernentry;
1047 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
1049 if ( (p_start & (PAGE_SIZE-1)) != 0 )
1050 {
1051 printk("Initial guest OS must load to a page boundary.\n");
1052 return -EINVAL;
1053 }
1055 pstart_info = PAGE_ALIGN(pkern_end);
1056 if(initrd_start && initrd_len){
1057 unsigned long offset;
1059 /* The next page aligned boundary after the start info.
1060 Note: EFI_PAGE_SHIFT = 12 <= PAGE_SHIFT */
1061 pinitrd_start = pstart_info + PAGE_SIZE;
1062 if (pinitrd_start + initrd_len >= dom0_size)
1063 panic("%s: not enough memory assigned to dom0", __func__);
1064 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
1065 struct page_info *p;
1066 p = assign_new_domain_page(d, pinitrd_start + offset);
1067 if (p == NULL)
1068 panic("%s: can't allocate page for initrd image", __func__);
1069 if (initrd_len < offset + PAGE_SIZE)
1070 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
1071 initrd_len - offset);
1072 else
1073 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
1074 }
1075 }
1077 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
1078 " Kernel image: %lx->%lx\n"
1079 " Entry address: %lx\n"
1080 " Init. ramdisk: %lx len %lx\n"
1081 " Start info.: %lx->%lx\n",
1082 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
1083 pstart_info, pstart_info + PAGE_SIZE);
1085 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
1086 {
1087 printk("Initial guest OS requires too much space\n"
1088 "(%luMB is greater than %luMB limit)\n",
1089 (pkern_end-pkern_start)>>20,
1090 (max_pages <<PAGE_SHIFT)>>20);
1091 return -ENOMEM;
1092 }
1094 // if high 3 bits of pkern start are non-zero, error
1096 // if pkern end is after end of metaphysical memory, error
1097 // (we should be able to deal with this... later)
1099 /* Mask all upcalls... */
1100 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
1101 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
1103 if (dom0_max_vcpus == 0)
1104 dom0_max_vcpus = MAX_VIRT_CPUS;
1105 if (dom0_max_vcpus > num_online_cpus())
1106 dom0_max_vcpus = num_online_cpus();
1107 if (dom0_max_vcpus > MAX_VIRT_CPUS)
1108 dom0_max_vcpus = MAX_VIRT_CPUS;
1110 printk ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
1111 for ( i = 1; i < dom0_max_vcpus; i++ )
1112 if (alloc_vcpu(d, i, i) == NULL)
1113 panic("Cannot allocate dom0 vcpu %d\n", i);
1115 /* Copy the OS image. */
1116 loaddomainelfimage(d,image_start);
1118 BUILD_BUG_ON(sizeof(start_info_t) + sizeof(dom0_vga_console_info_t) +
1119 sizeof(struct ia64_boot_param) > PAGE_SIZE);
1121 /* Set up start info area. */
1122 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
1123 start_info_page = assign_new_domain_page(d, pstart_info);
1124 if (start_info_page == NULL)
1125 panic("can't allocate start info page");
1126 si = page_to_virt(start_info_page);
1127 memset(si, 0, PAGE_SIZE);
1128 sprintf(si->magic, "xen-%i.%i-ia64",
1129 xen_major_version(), xen_minor_version());
1130 si->nr_pages = max_pages;
1131 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
1133 printk("Dom0: 0x%lx\n", (u64)dom0);
1135 #ifdef VALIDATE_VT
1136 /* VMX specific construction for Dom0, if hardware supports VMX
1137 * and Dom0 is unmodified image
1138 */
1139 if (vmx_dom0)
1140 vmx_final_setup_guest(v);
1141 #endif
1143 set_bit(_VCPUF_initialised, &v->vcpu_flags);
1145 /* Build firmware.
1146 Note: the Linux kernel reserves the memory used by start_info, so there
1147 is no need to remove it from the MDT. */
1148 bp_mpa = pstart_info + sizeof(struct start_info);
1149 dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
1151 /* Fill boot param. */
1152 strncpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
1153 si->cmd_line[sizeof(si->cmd_line)-1] = 0;
1155 bp = (struct ia64_boot_param *)((unsigned char *)si +
1156 sizeof(start_info_t));
1157 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
1159 /* We assume console has reached the last line! */
1160 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
1161 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
1162 bp->console_info.orig_x = 0;
1163 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
1164 0 : bp->console_info.num_rows - 1;
1166 bp->initrd_start = pinitrd_start;
1167 bp->initrd_size = ia64_boot_param->initrd_size;
1169 ci = (dom0_vga_console_info_t *)((unsigned char *)si +
1170 sizeof(start_info_t) +
1171 sizeof(struct ia64_boot_param));
1173 if (fill_console_start_info(ci)) {
1174 si->console.dom0.info_off = sizeof(start_info_t) +
1175 sizeof(struct ia64_boot_param);
1176 si->console.dom0.info_size = sizeof(dom0_vga_console_info_t);
1177 }
1179 vcpu_init_regs (v);
1181 vcpu_regs(v)->r28 = bp_mpa;
1183 vcpu_regs (v)->cr_iip = pkern_entry;
1185 physdev_init_dom0(d);
1187 return 0;
1188 }
1190 void machine_restart(char * __unused)
1191 {
1192 console_start_sync();
1193 if (running_on_sim)
1194 printk ("machine_restart called. spinning...\n");
1195 else
1196 (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
1197 while(1);
1198 }
1200 extern void cpu_halt(void);
1202 void machine_halt(void)
1203 {
1204 console_start_sync();
1205 if (running_on_sim)
1206 printk ("machine_halt called. spinning...\n");
1207 else
1208 cpu_halt();
1209 while(1);
1210 }
1212 void sync_vcpu_execstate(struct vcpu *v)
1213 {
1214 // __ia64_save_fpu(v->arch._thread.fph);
1215 // if (VMX_DOMAIN(v))
1216 // vmx_save_state(v);
1217 // FIXME SMP: Anything else needed here for SMP?
1218 }
1220 static void parse_dom0_mem(char *s)
1221 {
1222 dom0_size = parse_size_and_unit(s, NULL);
1223 }
1224 custom_param("dom0_mem", parse_dom0_mem);