ia64/xen-unstable

view xen/arch/ia64/xen/domain.c @ 13112:613ccf4ca46b

[IA64] Change dom0's initrd image load area so that it follows start info.

It no longer makes sense to use dom0_size if dom0 memory is assigned sparsely.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
author awilliam@xenbuild2.aw
date Thu Jan 04 14:50:26 2007 -0700 (2007-01-04)
parents 178e46776c71
children d9b2dd57fdc4
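
For context, a minimal illustrative sketch (not Xen code) of the placement this changeset establishes in construct_dom0() below: the start-info page sits on the first page boundary after the kernel image, and the initrd load area now follows the start-info page instead of being placed relative to dom0_size. PAGE_SIZE, PAGE_ALIGN and the addresses below are stand-in assumptions, not the hypervisor's definitions.

#include <stdio.h>

/* Stand-ins for the hypervisor's own definitions (assumed 16KB pages). */
#define PAGE_SIZE 16384UL
#define PAGE_ALIGN(addr) (((addr) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
    unsigned long pkern_end = 0x100c350UL;  /* hypothetical end of the dom0 kernel image */
    unsigned long initrd_len = 0x200000UL;  /* hypothetical initrd size */

    /* The start info goes on the first page boundary after the kernel image... */
    unsigned long pstart_info = PAGE_ALIGN(pkern_end);
    /* ...and the initrd now starts on the next page after the start info. */
    unsigned long pinitrd_start = pstart_info + PAGE_SIZE;

    printf("start info: 0x%lx->0x%lx\n", pstart_info, pstart_info + PAGE_SIZE);
    printf("initrd:     0x%lx->0x%lx\n", pinitrd_start, pinitrd_start + initrd_len);
    return 0;
}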
line source
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 *
12 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
13 * VA Linux Systems Japan K.K.
14 * dom0 vp model support
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/errno.h>
21 #include <xen/sched.h>
22 #include <xen/smp.h>
23 #include <xen/delay.h>
24 #include <xen/softirq.h>
25 #include <xen/mm.h>
26 #include <xen/iocap.h>
27 #include <asm/asm-xsi-offsets.h>
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <asm/processor.h>
31 #include <xen/event.h>
32 #include <xen/console.h>
33 #include <xen/version.h>
34 #include <xen/elf.h>
35 #include <asm/pgalloc.h>
36 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
37 #include <asm/vcpu.h> /* for function declarations */
38 #include <public/xen.h>
39 #include <xen/domain.h>
40 #include <asm/vmx.h>
41 #include <asm/vmx_vcpu.h>
42 #include <asm/vmx_vpd.h>
43 #include <asm/vmx_phy_mode.h>
44 #include <asm/vhpt.h>
45 #include <asm/tlbflush.h>
46 #include <asm/regionreg.h>
47 #include <asm/dom_fw.h>
48 #include <asm/shadow.h>
49 #include <xen/guest_access.h>
50 #include <asm/tlb_track.h>
51 #include <asm/perfmon.h>
53 unsigned long dom0_size = 512*1024*1024;
55 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
56 static unsigned int dom0_max_vcpus = 1;
57 integer_param("dom0_max_vcpus", dom0_max_vcpus);
59 extern unsigned long running_on_sim;
61 extern char dom0_command_line[];
63 /* forward declaration */
64 static void init_switch_stack(struct vcpu *v);
66 /* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
67 This is a Xen virtual address. */
68 DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
69 DEFINE_PER_CPU(int *, current_psr_ic_addr);
71 #include <xen/sched-if.h>
73 static void
74 ia64_disable_vhpt_walker(void)
75 {
76 // Disable the VHPT walker. Without this, ia64_new_rr7() might cause a
77 // VHPT fault because it flushes dtr[IA64_TR_VHPT].
78 // (VHPT_SIZE_LOG2 << 2) is just to avoid a
79 // Reserved Register/Field fault.
80 ia64_set_pta(VHPT_SIZE_LOG2 << 2);
81 }
83 static void flush_vtlb_for_context_switch(struct vcpu* prev, struct vcpu* next)
84 {
85 int cpu = smp_processor_id();
86 int last_vcpu_id, last_processor;
88 if (!is_idle_domain(prev->domain))
89 tlbflush_update_time
90 (&prev->domain->arch.last_vcpu[cpu].tlbflush_timestamp,
91 tlbflush_current_time());
93 if (is_idle_domain(next->domain))
94 return;
96 last_vcpu_id = next->domain->arch.last_vcpu[cpu].vcpu_id;
97 last_processor = next->arch.last_processor;
99 next->domain->arch.last_vcpu[cpu].vcpu_id = next->vcpu_id;
100 next->arch.last_processor = cpu;
102 if ((last_vcpu_id != next->vcpu_id &&
103 last_vcpu_id != INVALID_VCPU_ID) ||
104 (last_vcpu_id == next->vcpu_id &&
105 last_processor != cpu &&
106 last_processor != INVALID_PROCESSOR)) {
107 #ifdef CONFIG_XEN_IA64_TLBFLUSH_CLOCK
108 u32 last_tlbflush_timestamp =
109 next->domain->arch.last_vcpu[cpu].tlbflush_timestamp;
110 #endif
111 int vhpt_is_flushed = 0;
113 // If the vTLB implementation is changed,
114 // the following must be updated as well.
115 if (VMX_DOMAIN(next)) {
116 // Currently the vTLB for a VT-i domain is per vcpu,
117 // so no flushing is needed.
118 } else if (HAS_PERVCPU_VHPT(next->domain)) {
119 // nothing to do
120 } else {
121 if (NEED_FLUSH(__get_cpu_var(vhpt_tlbflush_timestamp),
122 last_tlbflush_timestamp)) {
123 local_vhpt_flush();
124 vhpt_is_flushed = 1;
125 }
126 }
127 if (vhpt_is_flushed || NEED_FLUSH(__get_cpu_var(tlbflush_time),
128 last_tlbflush_timestamp)) {
129 local_flush_tlb_all();
130 perfc_incrc(tlbflush_clock_cswitch_purge);
131 } else {
132 perfc_incrc(tlbflush_clock_cswitch_skip);
133 }
134 perfc_incrc(flush_vtlb_for_context_switch);
135 }
136 }
138 void schedule_tail(struct vcpu *prev)
139 {
140 extern char ia64_ivt;
141 context_saved(prev);
143 ia64_disable_vhpt_walker();
144 if (VMX_DOMAIN(current)) {
145 vmx_do_launch(current);
146 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
147 current->processor);
148 } else {
149 ia64_set_iva(&ia64_ivt);
150 load_region_regs(current);
151 ia64_set_pta(vcpu_pta(current));
152 vcpu_load_kernel_regs(current);
153 __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
154 shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
155 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
156 (current->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
157 migrate_timer(&current->arch.hlt_timer, current->processor);
158 }
159 flush_vtlb_for_context_switch(prev, current);
160 }
162 void context_switch(struct vcpu *prev, struct vcpu *next)
163 {
164 uint64_t spsr;
166 local_irq_save(spsr);
168 if (!is_idle_domain(prev->domain))
169 __ia64_save_fpu(prev->arch._thread.fph);
170 if (!is_idle_domain(next->domain))
171 __ia64_load_fpu(next->arch._thread.fph);
173 if (VMX_DOMAIN(prev)) {
174 vmx_save_state(prev);
175 if (!VMX_DOMAIN(next)) {
176 /* VMX domains can change the physical cr.dcr.
177 * Restore default to prevent leakage. */
178 ia64_setreg(_IA64_REG_CR_DCR, (IA64_DCR_DP | IA64_DCR_DK
179 | IA64_DCR_DX | IA64_DCR_DR | IA64_DCR_PP
180 | IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC));
181 }
182 }
183 if (VMX_DOMAIN(next))
184 vmx_load_state(next);
186 ia64_disable_vhpt_walker();
187 /*ia64_psr(ia64_task_regs(next))->dfh = !ia64_is_local_fpu_owner(next);*/
188 prev = ia64_switch_to(next);
190 /* Note: ia64_switch_to does not return here at vcpu initialization. */
192 //cpu_set(smp_processor_id(), current->domain->domain_dirty_cpumask);
194 if (VMX_DOMAIN(current)){
195 vmx_load_all_rr(current);
196 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
197 current->processor);
198 } else {
199 struct domain *nd;
200 extern char ia64_ivt;
202 ia64_set_iva(&ia64_ivt);
204 nd = current->domain;
205 if (!is_idle_domain(nd)) {
206 load_region_regs(current);
207 ia64_set_pta(vcpu_pta(current));
208 vcpu_load_kernel_regs(current);
209 vcpu_set_next_timer(current);
210 if (vcpu_timer_expired(current))
211 vcpu_pend_timer(current);
212 __ia64_per_cpu_var(current_psr_i_addr) = &nd->shared_info->
213 vcpu_info[current->vcpu_id].evtchn_upcall_mask;
214 __ia64_per_cpu_var(current_psr_ic_addr) =
215 (int *)(nd->arch.shared_info_va + XSI_PSR_IC_OFS);
216 } else {
217 /* When switching to the idle domain, we only need to disable the vhpt
218 * walker. All accesses that happen within the idle context will then
219 * be handled by the TR mappings and the identity mapping.
220 */
221 __ia64_per_cpu_var(current_psr_i_addr) = NULL;
222 __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
223 }
224 }
225 local_irq_restore(spsr);
226 flush_vtlb_for_context_switch(prev, current);
227 context_saved(prev);
228 }
230 void continue_running(struct vcpu *same)
231 {
232 /* nothing to do */
233 }
235 #ifdef CONFIG_PERFMON
236 static int pal_halt = 1;
237 static int can_do_pal_halt = 1;
239 static int __init nohalt_setup(char * str)
240 {
241 pal_halt = can_do_pal_halt = 0;
242 return 1;
243 }
244 __setup("nohalt", nohalt_setup);
246 void
247 update_pal_halt_status(int status)
248 {
249 can_do_pal_halt = pal_halt && status;
250 }
251 #else
252 #define can_do_pal_halt (1)
253 #endif
255 static void default_idle(void)
256 {
257 local_irq_disable();
258 if ( !softirq_pending(smp_processor_id()) ) {
259 if (can_do_pal_halt)
260 safe_halt();
261 else
262 cpu_relax();
263 }
264 local_irq_enable();
265 }
267 static void continue_cpu_idle_loop(void)
268 {
269 for ( ; ; )
270 {
271 #ifdef IA64
272 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
273 #else
274 irq_stat[cpu].idle_timestamp = jiffies;
275 #endif
276 while ( !softirq_pending(smp_processor_id()) )
277 default_idle();
278 raise_softirq(SCHEDULE_SOFTIRQ);
279 do_softirq();
280 }
281 }
283 void startup_cpu_idle_loop(void)
284 {
285 /* Just some sanity to ensure that the scheduler is set up okay. */
286 ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
287 raise_softirq(SCHEDULE_SOFTIRQ);
289 continue_cpu_idle_loop();
290 }
292 /* Compile-time test for get_order(sizeof(mapped_regs_t)) !=
293 * get_order_from_shift(XMAPPEDREGS_SHIFT)
294 */
295 #if !(((1 << (XMAPPEDREGS_SHIFT - 1)) < MAPPED_REGS_T_SIZE) && \
296 (MAPPED_REGS_T_SIZE < (1 << (XMAPPEDREGS_SHIFT + 1))))
297 # error "XMAPPEDREGS_SHIFT doesn't match sizeof(mapped_regs_t)."
298 #endif
300 void hlt_timer_fn(void *data)
301 {
302 struct vcpu *v = data;
303 vcpu_unblock(v);
304 }
306 void relinquish_vcpu_resources(struct vcpu *v)
307 {
308 if (HAS_PERVCPU_VHPT(v->domain))
309 pervcpu_vhpt_free(v);
310 if (v->arch.privregs != NULL) {
311 // this might be called by arch_do_domctl() with XEN_DOMCTL_arch_setup()
312 // for domVTi.
313 if (!(atomic_read(&v->domain->refcnt) & DOMAIN_DESTROYED)) {
314 unsigned long i;
315 for (i = 0; i < XMAPPEDREGS_SIZE; i += PAGE_SIZE)
316 guest_physmap_remove_page(v->domain,
317 IA64_XMAPPEDREGS_PADDR(v->vcpu_id) + i,
318 virt_to_maddr(v->arch.privregs + i));
319 }
321 free_xenheap_pages(v->arch.privregs,
322 get_order_from_shift(XMAPPEDREGS_SHIFT));
323 v->arch.privregs = NULL;
324 }
325 kill_timer(&v->arch.hlt_timer);
326 }
328 struct vcpu *alloc_vcpu_struct(void)
329 {
330 struct vcpu *v;
331 struct thread_info *ti;
332 static int first_allocation = 1;
334 if (first_allocation) {
335 first_allocation = 0;
336 /* Keep idle vcpu0 statically allocated at compile time, because
337 * some code inherited from Linux still requires it in the early phase.
338 */
339 return idle_vcpu[0];
340 }
342 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
343 return NULL;
344 memset(v, 0, sizeof(*v));
346 ti = alloc_thread_info(v);
347 /* Clear thread_info to clear some important fields, like
348 * preempt_count
349 */
350 memset(ti, 0, sizeof(struct thread_info));
351 init_switch_stack(v);
353 return v;
354 }
356 void free_vcpu_struct(struct vcpu *v)
357 {
358 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
359 }
361 int vcpu_initialise(struct vcpu *v)
362 {
363 struct domain *d = v->domain;
364 int rc, order, i;
366 if (!is_idle_domain(d)) {
367 if (!d->arch.is_vti) {
368 if (HAS_PERVCPU_VHPT(d))
369 if ((rc = pervcpu_vhpt_alloc(v)) != 0)
370 return rc;
372 /* Create privregs page only if not VTi. */
373 order = get_order_from_shift(XMAPPEDREGS_SHIFT);
374 v->arch.privregs = alloc_xenheap_pages(order);
375 BUG_ON(v->arch.privregs == NULL);
376 memset(v->arch.privregs, 0, 1 << XMAPPEDREGS_SHIFT);
377 for (i = 0; i < (1 << order); i++)
378 share_xen_page_with_guest(virt_to_page(v->arch.privregs) +
379 i, d, XENSHARE_writable);
380 /*
381 * XXX IA64_XMAPPEDREGS_PADDR
382 * Assign these pages into the guest pseudo-physical address
383 * space so that dom0 can map them by gmfn.
384 * This is necessary for domain save, restore and dump-core.
385 */
386 for (i = 0; i < XMAPPEDREGS_SIZE; i += PAGE_SIZE)
387 assign_domain_page(d, IA64_XMAPPEDREGS_PADDR(v->vcpu_id) + i,
388 virt_to_maddr(v->arch.privregs + i));
390 tlbflush_update_time(&v->arch.tlbflush_timestamp,
391 tlbflush_current_time());
392 }
394 v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
395 v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
396 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
397 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;
399 /* Is this correct?
400 It depends on the domain's RID usage.
402 A domain may share RIDs among its processors (e.g. when it has a
403 global VHPT). In that case we should also share RIDs
404 among vcpus, and the RID ranges should be the same.
406 However, a domain may use per-cpu RID allocation. In that
407 case we don't want to share RIDs among vcpus, though we may
408 do it if two vcpus run on the same cpu... */
410 v->arch.starting_rid = d->arch.starting_rid;
411 v->arch.ending_rid = d->arch.ending_rid;
412 v->arch.breakimm = d->arch.breakimm;
413 v->arch.last_processor = INVALID_PROCESSOR;
414 }
416 if (!VMX_DOMAIN(v))
417 init_timer(&v->arch.hlt_timer, hlt_timer_fn, v,
418 first_cpu(cpu_online_map));
420 return 0;
421 }
423 void vcpu_destroy(struct vcpu *v)
424 {
425 if (v->domain->arch.is_vti)
426 vmx_relinquish_vcpu_resources(v);
427 else
428 relinquish_vcpu_resources(v);
429 }
431 static void init_switch_stack(struct vcpu *v)
432 {
433 struct pt_regs *regs = vcpu_regs (v);
434 struct switch_stack *sw = (struct switch_stack *) regs - 1;
435 extern void ia64_ret_from_clone;
437 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
438 sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
439 sw->b0 = (unsigned long) &ia64_ret_from_clone;
440 sw->ar_fpsr = FPSR_DEFAULT;
441 v->arch._thread.ksp = (unsigned long) sw - 16;
442 // Stay on the kernel stack because we may get interrupts!
443 // ia64_ret_from_clone switches to the user stack.
444 v->arch._thread.on_ustack = 0;
445 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
446 }
448 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
449 static int opt_pervcpu_vhpt = 1;
450 integer_param("pervcpu_vhpt", opt_pervcpu_vhpt);
451 #endif
453 int arch_domain_create(struct domain *d)
454 {
455 int i;
457 // the following will eventually need to be negotiated dynamically
458 d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
459 d->arch.breakimm = 0x1000;
460 for (i = 0; i < NR_CPUS; i++) {
461 d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
462 }
464 if (is_idle_domain(d))
465 return 0;
467 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
468 d->arch.has_pervcpu_vhpt = opt_pervcpu_vhpt;
469 dprintk(XENLOG_WARNING, "%s:%d domain %d pervcpu_vhpt %d\n",
470 __func__, __LINE__, d->domain_id, d->arch.has_pervcpu_vhpt);
471 #endif
472 if (tlb_track_create(d) < 0)
473 goto fail_nomem1;
474 d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
475 if (d->shared_info == NULL)
476 goto fail_nomem;
477 memset(d->shared_info, 0, XSI_SIZE);
478 for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
479 share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
480 d, XENSHARE_writable);
482 /* We may also need an emulation RID for region 4, though it's unlikely
483 * that a guest will issue uncacheable accesses in metaphysical mode.
484 * But keeping such info here may be saner.
485 */
486 if (!allocate_rid_range(d,0))
487 goto fail_nomem;
489 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
491 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
492 goto fail_nomem;
494 d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
495 RANGESETF_prettyprint_hex);
497 printk ("arch_domain_create: domain=%p\n", d);
498 return 0;
500 fail_nomem:
501 tlb_track_destroy(d);
502 fail_nomem1:
503 if (d->arch.mm.pgd != NULL)
504 pgd_free(d->arch.mm.pgd);
505 if (d->shared_info != NULL)
506 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
507 return -ENOMEM;
508 }
510 void arch_domain_destroy(struct domain *d)
511 {
512 mm_final_teardown(d);
514 if (d->shared_info != NULL)
515 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
517 tlb_track_destroy(d);
519 /* Clear vTLB for the next domain. */
520 domain_flush_tlb_vhpt(d);
522 deallocate_rid_range(d);
523 }
525 void arch_getdomaininfo_ctxt(struct vcpu *v, struct vcpu_guest_context *c)
526 {
527 int i;
528 struct vcpu_extra_regs *er = &c->extra_regs;
530 c->user_regs = *vcpu_regs (v);
531 c->privregs_pfn = get_gpfn_from_mfn(virt_to_maddr(v->arch.privregs) >>
532 PAGE_SHIFT);
534 /* Fill extra regs. */
535 for (i = 0; i < 8; i++) {
536 er->itrs[i].pte = v->arch.itrs[i].pte.val;
537 er->itrs[i].itir = v->arch.itrs[i].itir;
538 er->itrs[i].vadr = v->arch.itrs[i].vadr;
539 er->itrs[i].rid = v->arch.itrs[i].rid;
540 }
541 for (i = 0; i < 8; i++) {
542 er->dtrs[i].pte = v->arch.dtrs[i].pte.val;
543 er->dtrs[i].itir = v->arch.dtrs[i].itir;
544 er->dtrs[i].vadr = v->arch.dtrs[i].vadr;
545 er->dtrs[i].rid = v->arch.dtrs[i].rid;
546 }
547 er->event_callback_ip = v->arch.event_callback_ip;
548 er->dcr = v->arch.dcr;
549 er->iva = v->arch.iva;
550 }
552 int arch_set_info_guest(struct vcpu *v, struct vcpu_guest_context *c)
553 {
554 struct pt_regs *regs = vcpu_regs (v);
555 struct domain *d = v->domain;
557 *regs = c->user_regs;
559 if (!d->arch.is_vti) {
560 /* domain runs at PL2/3 */
561 regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT;
562 regs->ar_rsc |= (2 << 2); /* force PL2/3 */
563 }
565 if (c->flags & VGCF_EXTRA_REGS) {
566 int i;
567 struct vcpu_extra_regs *er = &c->extra_regs;
569 for (i = 0; i < 8; i++) {
570 vcpu_set_itr(v, i, er->itrs[i].pte,
571 er->itrs[i].itir,
572 er->itrs[i].vadr,
573 er->itrs[i].rid);
574 }
575 for (i = 0; i < 8; i++) {
576 vcpu_set_dtr(v, i,
577 er->dtrs[i].pte,
578 er->dtrs[i].itir,
579 er->dtrs[i].vadr,
580 er->dtrs[i].rid);
581 }
582 v->arch.event_callback_ip = er->event_callback_ip;
583 v->arch.dcr = er->dcr;
584 v->arch.iva = er->iva;
585 }
587 if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
588 return 0;
589 if (d->arch.is_vti)
590 vmx_final_setup_guest(v);
592 /* This overrides some registers. */
593 vcpu_init_regs(v);
595 /* Don't redo final setup */
596 set_bit(_VCPUF_initialised, &v->vcpu_flags);
597 return 0;
598 }
600 static void relinquish_memory(struct domain *d, struct list_head *list)
601 {
602 struct list_head *ent;
603 struct page_info *page;
604 #ifndef __ia64__
605 unsigned long x, y;
606 #endif
608 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
609 spin_lock_recursive(&d->page_alloc_lock);
610 ent = list->next;
611 while ( ent != list )
612 {
613 page = list_entry(ent, struct page_info, list);
614 /* Grab a reference to the page so it won't disappear from under us. */
615 if ( unlikely(!get_page(page, d)) )
616 {
617 /* Couldn't get a reference -- someone is freeing this page. */
618 ent = ent->next;
619 continue;
620 }
622 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
623 put_page_and_type(page);
625 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
626 put_page(page);
628 #ifndef __ia64__
629 /*
630 * Forcibly invalidate base page tables at this point to break circular
631 * 'linear page table' references. This is okay because MMU structures
632 * are not shared across domains and this domain is now dead. Thus base
633 * tables are not in use so a non-zero count means circular reference.
634 */
635 y = page->u.inuse.type_info;
636 for ( ; ; )
637 {
638 x = y;
639 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
640 (PGT_base_page_table|PGT_validated)) )
641 break;
643 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
644 if ( likely(y == x) )
645 {
646 free_page_type(page, PGT_base_page_table);
647 break;
648 }
649 }
650 #endif
652 /* Follow the list chain and /then/ potentially free the page. */
653 ent = ent->next;
654 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
655 put_page(page);
656 }
658 spin_unlock_recursive(&d->page_alloc_lock);
659 }
661 void domain_relinquish_resources(struct domain *d)
662 {
663 /* Relinquish guest resources for VT-i domain. */
664 if (d->vcpu[0] && VMX_DOMAIN(d->vcpu[0]))
665 vmx_relinquish_guest_resources(d);
667 /* Tear down shadow mode stuff. */
668 mm_teardown(d);
670 /* Relinquish every page of memory. */
671 relinquish_memory(d, &d->xenpage_list);
672 relinquish_memory(d, &d->page_list);
674 if (d->arch.is_vti && d->arch.sal_data)
675 xfree(d->arch.sal_data);
677 /* Free page used by xen oprofile buffer */
678 free_xenoprof_pages(d);
679 }
681 unsigned long
682 domain_set_shared_info_va (unsigned long va)
683 {
684 struct vcpu *v = current;
685 struct domain *d = v->domain;
686 struct vcpu *v1;
688 /* Check virtual address:
689 must belong to region 7,
690 must be 64Kb aligned,
691 must not be within Xen virtual space. */
692 if ((va >> 61) != 7
693 || (va & 0xffffUL) != 0
694 || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
695 panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);
697 /* Note: this doesn't work well if other cpus are already running.
698 However this is part of the spec :-) */
699 printk ("Domain set shared_info_va to 0x%016lx\n", va);
700 d->arch.shared_info_va = va;
702 for_each_vcpu (d, v1) {
703 VCPU(v1, interrupt_mask_addr) =
704 (unsigned char *)va + INT_ENABLE_OFFSET(v1);
705 }
707 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)(va + XSI_PSR_IC_OFS);
709 /* Remap the shared pages. */
710 set_one_rr (7UL << 61, PSCB(v,rrs[7]));
712 return 0;
713 }
715 /* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
716 #define SHADOW_COPY_CHUNK (1024 / sizeof (unsigned long))
718 int shadow_mode_control(struct domain *d, xen_domctl_shadow_op_t *sc)
719 {
720 unsigned int op = sc->op;
721 int rc = 0;
722 int i;
723 //struct vcpu *v;
725 if (unlikely(d == current->domain)) {
726 gdprintk(XENLOG_INFO,
727 "Don't try to do a shadow op on yourself!\n");
728 return -EINVAL;
729 }
731 domain_pause(d);
733 switch (op)
734 {
735 case XEN_DOMCTL_SHADOW_OP_OFF:
736 if (shadow_mode_enabled (d)) {
737 u64 *bm = d->arch.shadow_bitmap;
739 /* Flush vhpt and tlb to restore dirty bit usage. */
740 domain_flush_tlb_vhpt(d);
742 /* Free bitmap. */
743 d->arch.shadow_bitmap_size = 0;
744 d->arch.shadow_bitmap = NULL;
745 xfree(bm);
746 }
747 break;
749 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
750 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
751 rc = -EINVAL;
752 break;
754 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
755 if (shadow_mode_enabled(d)) {
756 rc = -EINVAL;
757 break;
758 }
760 atomic64_set(&d->arch.shadow_fault_count, 0);
761 atomic64_set(&d->arch.shadow_dirty_count, 0);
763 d->arch.shadow_bitmap_size = (d->max_pages + BITS_PER_LONG-1) &
764 ~(BITS_PER_LONG-1);
765 d->arch.shadow_bitmap = xmalloc_array(unsigned long,
766 d->arch.shadow_bitmap_size / BITS_PER_LONG);
767 if (d->arch.shadow_bitmap == NULL) {
768 d->arch.shadow_bitmap_size = 0;
769 rc = -ENOMEM;
770 }
771 else {
772 memset(d->arch.shadow_bitmap, 0,
773 d->arch.shadow_bitmap_size / 8);
775 /* Flush vhpt and tlb to enable dirty bit
776 virtualization. */
777 domain_flush_tlb_vhpt(d);
778 }
779 break;
781 case XEN_DOMCTL_SHADOW_OP_CLEAN:
782 {
783 int nbr_longs;
785 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
786 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
788 atomic64_set(&d->arch.shadow_fault_count, 0);
789 atomic64_set(&d->arch.shadow_dirty_count, 0);
791 if (guest_handle_is_null(sc->dirty_bitmap) ||
792 (d->arch.shadow_bitmap == NULL)) {
793 rc = -EINVAL;
794 break;
795 }
797 if (sc->pages > d->arch.shadow_bitmap_size)
798 sc->pages = d->arch.shadow_bitmap_size;
800 nbr_longs = (sc->pages + BITS_PER_LONG - 1) / BITS_PER_LONG;
802 for (i = 0; i < nbr_longs; i += SHADOW_COPY_CHUNK) {
803 int size = (nbr_longs - i) > SHADOW_COPY_CHUNK ?
804 SHADOW_COPY_CHUNK : nbr_longs - i;
806 if (copy_to_guest_offset(sc->dirty_bitmap, i,
807 d->arch.shadow_bitmap + i,
808 size)) {
809 rc = -EFAULT;
810 break;
811 }
813 memset(d->arch.shadow_bitmap + i,
814 0, size * sizeof(unsigned long));
815 }
817 break;
818 }
820 case XEN_DOMCTL_SHADOW_OP_PEEK:
821 {
822 unsigned long size;
824 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
825 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
827 if (guest_handle_is_null(sc->dirty_bitmap) ||
828 (d->arch.shadow_bitmap == NULL)) {
829 rc = -EINVAL;
830 break;
831 }
833 if (sc->pages > d->arch.shadow_bitmap_size)
834 sc->pages = d->arch.shadow_bitmap_size;
836 size = (sc->pages + BITS_PER_LONG - 1) / BITS_PER_LONG;
837 if (copy_to_guest(sc->dirty_bitmap,
838 d->arch.shadow_bitmap, size)) {
839 rc = -EFAULT;
840 break;
841 }
842 break;
843 }
844 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
845 sc->mb = 0;
846 break;
847 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
848 if (sc->mb > 0) {
849 BUG();
850 rc = -ENOMEM;
851 }
852 break;
853 default:
854 rc = -EINVAL;
855 break;
856 }
858 domain_unpause(d);
860 return rc;
861 }
863 // remove following line if not privifying in memory
864 //#define HAVE_PRIVIFY_MEMORY
865 #ifndef HAVE_PRIVIFY_MEMORY
866 #define privify_memory(x,y) do {} while(0)
867 #endif
869 // see arch/x86/xxx/domain_build.c
870 int elf_sanity_check(Elf_Ehdr *ehdr)
871 {
872 if (!(IS_ELF(*ehdr)))
873 {
874 printk("DOM0 image is not a Xen-compatible Elf image.\n");
875 return 0;
876 }
877 return 1;
878 }
880 static void loaddomainelfimage(struct domain *d, unsigned long image_start)
881 {
882 char *elfbase = (char *) image_start;
883 Elf_Ehdr ehdr;
884 Elf_Phdr phdr;
885 int h, filesz, memsz;
886 unsigned long elfaddr, dom_mpaddr, dom_imva;
887 struct page_info *p;
889 memcpy(&ehdr, (void *) image_start, sizeof(Elf_Ehdr));
890 for ( h = 0; h < ehdr.e_phnum; h++ ) {
891 memcpy(&phdr,
892 elfbase + ehdr.e_phoff + (h*ehdr.e_phentsize),
893 sizeof(Elf_Phdr));
894 if ((phdr.p_type != PT_LOAD))
895 continue;
897 filesz = phdr.p_filesz;
898 memsz = phdr.p_memsz;
899 elfaddr = (unsigned long) elfbase + phdr.p_offset;
900 dom_mpaddr = phdr.p_paddr;
902 while (memsz > 0) {
903 p = assign_new_domain_page(d,dom_mpaddr);
904 BUG_ON (unlikely(p == NULL));
905 dom_imva = __va_ul(page_to_maddr(p));
906 if (filesz > 0) {
907 if (filesz >= PAGE_SIZE)
908 memcpy((void *) dom_imva,
909 (void *) elfaddr,
910 PAGE_SIZE);
911 else {
912 // copy partial page
913 memcpy((void *) dom_imva,
914 (void *) elfaddr, filesz);
915 // zero the rest of page
916 memset((void *) dom_imva+filesz, 0,
917 PAGE_SIZE-filesz);
918 }
919 //FIXME: This test for code seems to find a lot more than objdump -x does
920 if (phdr.p_flags & PF_X) {
921 privify_memory(dom_imva,PAGE_SIZE);
922 flush_icache_range(dom_imva,
923 dom_imva+PAGE_SIZE);
924 }
925 }
926 else if (memsz > 0) {
927 /* always zero out entire page */
928 memset((void *) dom_imva, 0, PAGE_SIZE);
929 }
930 memsz -= PAGE_SIZE;
931 filesz -= PAGE_SIZE;
932 elfaddr += PAGE_SIZE;
933 dom_mpaddr += PAGE_SIZE;
934 }
935 }
936 }
938 void alloc_dom0(void)
939 {
940 /* Check dom0 size. */
941 if (dom0_size < 4 * 1024 * 1024) {
942 panic("dom0_mem is too small, boot aborted"
943 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
944 }
946 if (running_on_sim) {
947 dom0_size = 128*1024*1024; //FIXME: Should be configurable
948 }
950 /* no need to allocate pages for now
951 * pages are allocated by map_new_domain_page() via loaddomainelfimage()
952 */
953 }
956 /*
957 * Domain 0 has unrestricted direct access to all devices. However,
958 * the main point of this stub is to allow alloc_dom_mem to handle
959 * order > 0 requests. Dom0 requires that capability in order to
960 * allocate memory for other domains.
961 */
962 static void physdev_init_dom0(struct domain *d)
963 {
964 if (iomem_permit_access(d, 0UL, ~0UL))
965 BUG();
966 if (irqs_permit_access(d, 0, NR_IRQS-1))
967 BUG();
968 if (ioports_permit_access(d, 0, 0xffff))
969 BUG();
970 }
972 int construct_dom0(struct domain *d,
973 unsigned long image_start, unsigned long image_len,
974 unsigned long initrd_start, unsigned long initrd_len,
975 char *cmdline)
976 {
977 int i, rc;
978 start_info_t *si;
979 dom0_vga_console_info_t *ci;
980 struct vcpu *v = d->vcpu[0];
981 unsigned long max_pages;
983 struct domain_setup_info dsi;
984 unsigned long p_start;
985 unsigned long pkern_start;
986 unsigned long pkern_entry;
987 unsigned long pkern_end;
988 unsigned long pinitrd_start = 0;
989 unsigned long pstart_info;
990 struct page_info *start_info_page;
991 unsigned long bp_mpa;
992 struct ia64_boot_param *bp;
994 #ifdef VALIDATE_VT
995 unsigned int vmx_dom0 = 0;
996 unsigned long mfn;
997 struct page_info *page = NULL;
998 #endif
1000 //printk("construct_dom0: starting\n");
1002 /* Sanity! */
1003 BUG_ON(d != dom0);
1004 BUG_ON(d->vcpu[0] == NULL);
1005 BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
1007 memset(&dsi, 0, sizeof(struct domain_setup_info));
1009 printk("*** LOADING DOMAIN 0 ***\n");
1011 max_pages = dom0_size / PAGE_SIZE;
1012 d->max_pages = max_pages;
1013 d->tot_pages = 0;
1014 dsi.image_addr = (unsigned long)image_start;
1015 dsi.image_len = image_len;
1016 rc = parseelfimage(&dsi);
1017 if ( rc != 0 )
1018 return rc;
1020 #ifdef VALIDATE_VT
1021 /* Temp workaround */
1022 if (running_on_sim)
1023 dsi.xen_section_string = (char *)1;
1025 /* Check whether dom0 is vti domain */
1026 if ((!vmx_enabled) && !dsi.xen_section_string) {
1027 printk("Lack of hardware support for unmodified vmx dom0\n");
1028 panic("");
1031 if (vmx_enabled && !dsi.xen_section_string) {
1032 printk("Dom0 is vmx domain!\n");
1033 vmx_dom0 = 1;
1034 }
1035 #endif
1037 p_start = dsi.v_start;
1038 pkern_start = dsi.v_kernstart;
1039 pkern_end = dsi.v_kernend;
1040 pkern_entry = dsi.v_kernentry;
1042 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
1044 if ( (p_start & (PAGE_SIZE-1)) != 0 )
1045 {
1046 printk("Initial guest OS must load to a page boundary.\n");
1047 return -EINVAL;
1048 }
1050 pstart_info = PAGE_ALIGN(pkern_end);
1051 if(initrd_start && initrd_len){
1052 unsigned long offset;
1054 /* The next page aligned boundary after the start info.
1055 Note: EFI_PAGE_SHIFT = 12 <= PAGE_SHIFT */
1056 pinitrd_start = pstart_info + PAGE_SIZE;
1057 if (pinitrd_start + initrd_len >= dom0_size)
1058 panic("%s: not enough memory assigned to dom0", __func__);
1059 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
1060 struct page_info *p;
1061 p = assign_new_domain_page(d, pinitrd_start + offset);
1062 if (p == NULL)
1063 panic("%s: can't allocate page for initrd image", __func__);
1064 if (initrd_len < offset + PAGE_SIZE)
1065 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
1066 initrd_len - offset);
1067 else
1068 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
1069 }
1070 }
1072 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
1073 " Kernel image: %lx->%lx\n"
1074 " Entry address: %lx\n"
1075 " Init. ramdisk: %lx len %lx\n"
1076 " Start info.: %lx->%lx\n",
1077 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
1078 pstart_info, pstart_info + PAGE_SIZE);
1080 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
1081 {
1082 printk("Initial guest OS requires too much space\n"
1083 "(%luMB is greater than %luMB limit)\n",
1084 (pkern_end-pkern_start)>>20,
1085 (max_pages <<PAGE_SHIFT)>>20);
1086 return -ENOMEM;
1087 }
1089 // if high 3 bits of pkern start are non-zero, error
1091 // if pkern end is after end of metaphysical memory, error
1092 // (we should be able to deal with this... later)
1094 /* Mask all upcalls... */
1095 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
1096 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
1098 if (dom0_max_vcpus == 0)
1099 dom0_max_vcpus = MAX_VIRT_CPUS;
1100 if (dom0_max_vcpus > num_online_cpus())
1101 dom0_max_vcpus = num_online_cpus();
1102 if (dom0_max_vcpus > MAX_VIRT_CPUS)
1103 dom0_max_vcpus = MAX_VIRT_CPUS;
1105 printk ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
1106 for ( i = 1; i < dom0_max_vcpus; i++ )
1107 if (alloc_vcpu(d, i, i) == NULL)
1108 printk ("Cannot allocate dom0 vcpu %d\n", i);
1110 /* Copy the OS image. */
1111 loaddomainelfimage(d,image_start);
1113 BUILD_BUG_ON(sizeof(start_info_t) + sizeof(dom0_vga_console_info_t) +
1114 sizeof(struct ia64_boot_param) > PAGE_SIZE);
1116 /* Set up start info area. */
1117 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
1118 start_info_page = assign_new_domain_page(d, pstart_info);
1119 if (start_info_page == NULL)
1120 panic("can't allocate start info page");
1121 si = page_to_virt(start_info_page);
1122 memset(si, 0, PAGE_SIZE);
1123 sprintf(si->magic, "xen-%i.%i-ia64",
1124 xen_major_version(), xen_minor_version());
1125 si->nr_pages = max_pages;
1126 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
1128 printk("Dom0: 0x%lx\n", (u64)dom0);
1130 #ifdef VALIDATE_VT
1131 /* VMX specific construction for Dom0, if hardware supports VMX
1132 * and Dom0 is an unmodified image
1133 */
1134 if (vmx_dom0)
1135 vmx_final_setup_guest(v);
1136 #endif
1138 set_bit(_VCPUF_initialised, &v->vcpu_flags);
1140 /* Build firmware.
1141 Note: the Linux kernel reserves the memory used by start_info, so there
1142 is no need to remove it from the MDT. */
1143 bp_mpa = pstart_info + sizeof(struct start_info);
1144 dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
1146 /* Fill boot param. */
1147 strncpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
1148 si->cmd_line[sizeof(si->cmd_line)-1] = 0;
1150 bp = (struct ia64_boot_param *)((unsigned char *)si +
1151 sizeof(start_info_t));
1152 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
1154 /* We assume console has reached the last line! */
1155 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
1156 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
1157 bp->console_info.orig_x = 0;
1158 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
1159 0 : bp->console_info.num_rows - 1;
1161 bp->initrd_start = pinitrd_start;
1162 bp->initrd_size = ia64_boot_param->initrd_size;
1164 ci = (dom0_vga_console_info_t *)((unsigned char *)si +
1165 sizeof(start_info_t) +
1166 sizeof(struct ia64_boot_param));
1168 if (fill_console_start_info(ci)) {
1169 si->console.dom0.info_off = sizeof(start_info_t) +
1170 sizeof(struct ia64_boot_param);
1171 si->console.dom0.info_size = sizeof(dom0_vga_console_info_t);
1172 }
1174 vcpu_init_regs (v);
1176 vcpu_regs(v)->r28 = bp_mpa;
1178 vcpu_regs (v)->cr_iip = pkern_entry;
1180 physdev_init_dom0(d);
1182 return 0;
1183 }
1185 void machine_restart(char * __unused)
1186 {
1187 console_start_sync();
1188 if (running_on_sim)
1189 printk ("machine_restart called. spinning...\n");
1190 else
1191 (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
1192 while(1);
1193 }
1195 extern void cpu_halt(void);
1197 void machine_halt(void)
1198 {
1199 console_start_sync();
1200 if (running_on_sim)
1201 printk ("machine_halt called. spinning...\n");
1202 else
1203 cpu_halt();
1204 while(1);
1205 }
1207 void sync_vcpu_execstate(struct vcpu *v)
1208 {
1209 // __ia64_save_fpu(v->arch._thread.fph);
1210 // if (VMX_DOMAIN(v))
1211 // vmx_save_state(v);
1212 // FIXME SMP: Anything else needed here for SMP?
1213 }
1215 static void parse_dom0_mem(char *s)
1216 {
1217 dom0_size = parse_size_and_unit(s, NULL);
1218 }
1219 custom_param("dom0_mem", parse_dom0_mem);