ia64/xen-unstable

view xen/arch/ia64/xen/domain.c @ 12525:7e7846ea4ab3

[IA64] Idle domain doesn't need to save and restore fph

Signed-off-by: Anthony Xu <anthony.xu@intel.com>
author awilliam@xenbuild.aw
date Mon Nov 27 10:10:57 2006 -0700 (2006-11-27)
parents b39844e292f6
children 8ab9b43ad557
line source
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 *
12 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
13 * VA Linux Systems Japan K.K.
14 * dom0 vp model support
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/errno.h>
21 #include <xen/sched.h>
22 #include <xen/smp.h>
23 #include <xen/delay.h>
24 #include <xen/softirq.h>
25 #include <xen/mm.h>
26 #include <xen/iocap.h>
27 #include <asm/asm-xsi-offsets.h>
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <asm/processor.h>
31 #include <xen/event.h>
32 #include <xen/console.h>
33 #include <xen/version.h>
34 #include <xen/elf.h>
35 #include <asm/pgalloc.h>
36 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
37 #include <asm/vcpu.h> /* for function declarations */
38 #include <public/xen.h>
39 #include <xen/domain.h>
40 #include <asm/vmx.h>
41 #include <asm/vmx_vcpu.h>
42 #include <asm/vmx_vpd.h>
43 #include <asm/vmx_phy_mode.h>
44 #include <asm/vhpt.h>
45 #include <asm/tlbflush.h>
46 #include <asm/regionreg.h>
47 #include <asm/dom_fw.h>
48 #include <asm/shadow.h>
49 #include <xen/guest_access.h>
50 #include <asm/tlb_track.h>
52 unsigned long dom0_size = 512*1024*1024;
54 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
55 static unsigned int dom0_max_vcpus = 1;
56 integer_param("dom0_max_vcpus", dom0_max_vcpus);
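/* For example, booting with "dom0_max_vcpus=4" requests 4 VCPUs for dom0;
 * construct_dom0() below clamps the value to num_online_cpus() and
 * MAX_VIRT_CPUS, and treats 0 as "use MAX_VIRT_CPUS". */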
58 extern unsigned long running_on_sim;
60 extern char dom0_command_line[];
62 /* forward declaration */
63 static void init_switch_stack(struct vcpu *v);
65 /* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
66 This is a Xen virtual address. */
67 DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
68 DEFINE_PER_CPU(int *, current_psr_ic_addr);
70 #include <xen/sched-if.h>
72 static void
73 ia64_disable_vhpt_walker(void)
74 {
75 // Disable the VHPT walker. Without this, ia64_new_rr7() might cause
76 // a VHPT fault because it flushes dtr[IA64_TR_VHPT].
77 // (VHPT_SIZE_LOG2 << 2) is just to avoid a
78 // Reserved Register/Field fault.
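// Note: on ia64, cr.pta bit 0 is the walker enable bit (pta.ve) and bits
// 7:2 hold the VHPT size (pta.size), so writing (VHPT_SIZE_LOG2 << 2)
// keeps the size field at a legal value while leaving the walker disabled.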
79 ia64_set_pta(VHPT_SIZE_LOG2 << 2);
80 }
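/* Sketch of the flushing policy implemented below: each domain records, per
 * physical cpu, which of its vcpus ran there last, and each vcpu records the
 * processor it last ran on. A flush is only considered when a different vcpu
 * of the incoming domain last ran on this cpu, or when the same vcpu last ran
 * on a different cpu. VT-i domains (per-vcpu vTLB) and per-vcpu VHPT domains
 * skip the VHPT flush; the tlbflush clock then decides whether a full local
 * TLB flush is still required. */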
82 static void flush_vtlb_for_context_switch(struct vcpu* prev, struct vcpu* next)
83 {
84 int cpu = smp_processor_id();
85 int last_vcpu_id, last_processor;
87 if (!is_idle_domain(prev->domain))
88 tlbflush_update_time
89 (&prev->domain->arch.last_vcpu[cpu].tlbflush_timestamp,
90 tlbflush_current_time());
92 if (is_idle_domain(next->domain))
93 return;
95 last_vcpu_id = next->domain->arch.last_vcpu[cpu].vcpu_id;
96 last_processor = next->arch.last_processor;
98 next->domain->arch.last_vcpu[cpu].vcpu_id = next->vcpu_id;
99 next->arch.last_processor = cpu;
101 if ((last_vcpu_id != next->vcpu_id &&
102 last_vcpu_id != INVALID_VCPU_ID) ||
103 (last_vcpu_id == next->vcpu_id &&
104 last_processor != cpu &&
105 last_processor != INVALID_PROCESSOR)) {
106 #ifdef CONFIG_XEN_IA64_TLBFLUSH_CLOCK
107 u32 last_tlbflush_timestamp =
108 next->domain->arch.last_vcpu[cpu].tlbflush_timestamp;
109 #endif
110 int vhpt_is_flushed = 0;
112 // If the vTLB implementation is changed,
113 // the following must be updated as well.
114 if (VMX_DOMAIN(next)) {
115 // Currently the vTLB for a VT-i domain is per vcpu,
116 // so no flushing is needed.
117 } else if (HAS_PERVCPU_VHPT(next->domain)) {
118 // nothing to do
119 } else {
120 if (NEED_FLUSH(__get_cpu_var(vhpt_tlbflush_timestamp),
121 last_tlbflush_timestamp)) {
122 local_vhpt_flush();
123 vhpt_is_flushed = 1;
124 }
125 }
126 if (vhpt_is_flushed || NEED_FLUSH(__get_cpu_var(tlbflush_time),
127 last_tlbflush_timestamp)) {
128 local_flush_tlb_all();
129 perfc_incrc(tlbflush_clock_cswitch_purge);
130 } else {
131 perfc_incrc(tlbflush_clock_cswitch_skip);
132 }
133 perfc_incrc(flush_vtlb_for_context_switch);
134 }
135 }
137 void schedule_tail(struct vcpu *prev)
138 {
139 extern char ia64_ivt;
140 context_saved(prev);
142 ia64_disable_vhpt_walker();
143 if (VMX_DOMAIN(current)) {
144 vmx_do_launch(current);
145 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
146 current->processor);
147 } else {
148 ia64_set_iva(&ia64_ivt);
149 load_region_regs(current);
150 ia64_set_pta(vcpu_pta(current));
151 vcpu_load_kernel_regs(current);
152 __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
153 shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
154 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
155 (current->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
156 migrate_timer(&current->arch.hlt_timer, current->processor);
157 }
158 flush_vtlb_for_context_switch(prev, current);
159 }
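/* Rough overview of context_switch(): fph is saved/restored only for
 * non-idle domains; VMX state is saved for a departing VT-i vcpu, and cr.dcr
 * is reset to its default when switching from a VMX domain to a non-VMX one.
 * After ia64_switch_to(), a non-idle target gets its region registers, PTA
 * and kernel registers reloaded, while the idle domain merely has the
 * per-cpu psr.i/psr.ic pointers cleared. */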
161 void context_switch(struct vcpu *prev, struct vcpu *next)
162 {
163 uint64_t spsr;
165 local_irq_save(spsr);
167 if (!is_idle_domain(prev->domain))
168 __ia64_save_fpu(prev->arch._thread.fph);
169 if (!is_idle_domain(next->domain))
170 __ia64_load_fpu(next->arch._thread.fph);
172 if (VMX_DOMAIN(prev)) {
173 vmx_save_state(prev);
174 if (!VMX_DOMAIN(next)) {
175 /* VMX domains can change the physical cr.dcr.
176 * Restore default to prevent leakage. */
177 ia64_setreg(_IA64_REG_CR_DCR, (IA64_DCR_DP | IA64_DCR_DK
178 | IA64_DCR_DX | IA64_DCR_DR | IA64_DCR_PP
179 | IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC));
180 }
181 }
182 if (VMX_DOMAIN(next))
183 vmx_load_state(next);
185 ia64_disable_vhpt_walker();
186 /*ia64_psr(ia64_task_regs(next))->dfh = !ia64_is_local_fpu_owner(next);*/
187 prev = ia64_switch_to(next);
189 /* Note: ia64_switch_to does not return here at vcpu initialization. */
191 //cpu_set(smp_processor_id(), current->domain->domain_dirty_cpumask);
193 if (VMX_DOMAIN(current)){
194 vmx_load_all_rr(current);
195 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
196 current->processor);
197 } else {
198 struct domain *nd;
199 extern char ia64_ivt;
201 ia64_set_iva(&ia64_ivt);
203 nd = current->domain;
204 if (!is_idle_domain(nd)) {
205 load_region_regs(current);
206 ia64_set_pta(vcpu_pta(current));
207 vcpu_load_kernel_regs(current);
208 vcpu_set_next_timer(current);
209 if (vcpu_timer_expired(current))
210 vcpu_pend_timer(current);
211 __ia64_per_cpu_var(current_psr_i_addr) = &nd->shared_info->
212 vcpu_info[current->vcpu_id].evtchn_upcall_mask;
213 __ia64_per_cpu_var(current_psr_ic_addr) =
214 (int *)(nd->arch.shared_info_va + XSI_PSR_IC_OFS);
215 } else {
216 /* When switching to the idle domain, we only need to disable the
217 * vhpt walker. All accesses that happen within the idle context
218 * will then be handled by the TR mappings and the identity mapping.
219 */
220 __ia64_per_cpu_var(current_psr_i_addr) = NULL;
221 __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
222 }
223 }
224 local_irq_restore(spsr);
225 flush_vtlb_for_context_switch(prev, current);
226 context_saved(prev);
227 }
229 void continue_running(struct vcpu *same)
230 {
231 /* nothing to do */
232 }
234 static void default_idle(void)
235 {
236 local_irq_disable();
237 if ( !softirq_pending(smp_processor_id()) )
238 safe_halt();
239 local_irq_enable();
240 }
242 static void continue_cpu_idle_loop(void)
243 {
244 for ( ; ; )
245 {
246 #ifdef IA64
247 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
248 #else
249 irq_stat[cpu].idle_timestamp = jiffies;
250 #endif
251 while ( !softirq_pending(smp_processor_id()) )
252 default_idle();
253 raise_softirq(SCHEDULE_SOFTIRQ);
254 do_softirq();
255 }
256 }
258 void startup_cpu_idle_loop(void)
259 {
260 /* Just some sanity to ensure that the scheduler is set up okay. */
261 ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
262 raise_softirq(SCHEDULE_SOFTIRQ);
264 continue_cpu_idle_loop();
265 }
267 /* compile time test for get_order(sizeof(mapped_regs_t)) !=
268 * get_order_from_shift(XMAPPEDREGS_SHIFT)
269 */
270 #if !(((1 << (XMAPPEDREGS_SHIFT - 1)) < MAPPED_REGS_T_SIZE) && \
271 (MAPPED_REGS_T_SIZE < (1 << (XMAPPEDREGS_SHIFT + 1))))
272 # error "XMAPPEDREGS_SHIFT doesn't match sizeof(mapped_regs_t)."
273 #endif
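/* For example, if XMAPPEDREGS_SHIFT were 12, the check above would require
 * sizeof(mapped_regs_t) to lie strictly between 2048 and 8192 bytes, i.e.
 * within a factor of two of the 4096-byte area implied by the shift. */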
275 void hlt_timer_fn(void *data)
276 {
277 struct vcpu *v = data;
278 vcpu_unblock(v);
279 }
281 void relinquish_vcpu_resources(struct vcpu *v)
282 {
283 if (HAS_PERVCPU_VHPT(v->domain))
284 pervcpu_vhpt_free(v);
285 if (v->arch.privregs != NULL) {
286 free_xenheap_pages(v->arch.privregs,
287 get_order_from_shift(XMAPPEDREGS_SHIFT));
288 v->arch.privregs = NULL;
289 }
290 kill_timer(&v->arch.hlt_timer);
291 }
293 struct vcpu *alloc_vcpu_struct(void)
294 {
295 struct vcpu *v;
296 struct thread_info *ti;
297 static int first_allocation = 1;
299 if (first_allocation) {
300 first_allocation = 0;
301 /* Keep idle vcpu0 statically allocated at compile time, because some
302 * code inherited from Linux still requires it in the early boot phase.
303 */
304 return idle_vcpu[0];
305 }
307 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
308 return NULL;
309 memset(v, 0, sizeof(*v));
311 ti = alloc_thread_info(v);
312 /* Clear thread_info to reset some important fields, such as
313 * preempt_count.
314 */
315 memset(ti, 0, sizeof(struct thread_info));
316 init_switch_stack(v);
318 return v;
319 }
321 void free_vcpu_struct(struct vcpu *v)
322 {
323 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
324 }
326 int vcpu_initialise(struct vcpu *v)
327 {
328 struct domain *d = v->domain;
329 int rc, order, i;
331 if (!is_idle_domain(d)) {
332 if (!d->arch.is_vti) {
333 if (HAS_PERVCPU_VHPT(d))
334 if ((rc = pervcpu_vhpt_alloc(v)) != 0)
335 return rc;
337 /* Create privregs page only if not VTi. */
338 order = get_order_from_shift(XMAPPEDREGS_SHIFT);
339 v->arch.privregs = alloc_xenheap_pages(order);
340 BUG_ON(v->arch.privregs == NULL);
341 memset(v->arch.privregs, 0, 1 << XMAPPEDREGS_SHIFT);
342 for (i = 0; i < (1 << order); i++)
343 share_xen_page_with_guest(virt_to_page(v->arch.privregs) +
344 i, d, XENSHARE_writable);
346 tlbflush_update_time(&v->arch.tlbflush_timestamp,
347 tlbflush_current_time());
348 }
350 v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
351 v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
352 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
353 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;
355 /* Is this correct?
356 It depends on how the domain uses rids.
358 A domain may share rids among its processors (e.g. having a
359 global VHPT). In this case, we should also share rids
360 among vcpus and the rid ranges should be the same.
362 However, a domain may have per-cpu rid allocation. In
363 this case we don't want to share rids among vcpus, but we may
364 do it if two vcpus are on the same cpu... */
366 v->arch.starting_rid = d->arch.starting_rid;
367 v->arch.ending_rid = d->arch.ending_rid;
368 v->arch.breakimm = d->arch.breakimm;
369 v->arch.last_processor = INVALID_PROCESSOR;
370 }
372 if (!VMX_DOMAIN(v))
373 init_timer(&v->arch.hlt_timer, hlt_timer_fn, v,
374 first_cpu(cpu_online_map));
376 return 0;
377 }
379 void vcpu_destroy(struct vcpu *v)
380 {
381 if (v->domain->arch.is_vti)
382 vmx_relinquish_vcpu_resources(v);
383 else
384 relinquish_vcpu_resources(v);
385 }
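/* init_switch_stack() lays out an initial switch_stack frame immediately
 * below the vcpu's pt_regs at the top of its kernel stack, so that the
 * first ia64_switch_to() to this vcpu "returns" into ia64_ret_from_clone.
 * The RSE backing store starts at IA64_RBS_OFFSET from the stack base, and
 * ksp is left 16 bytes below the frame (presumably the usual ia64 scratch
 * area). */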
387 static void init_switch_stack(struct vcpu *v)
388 {
389 struct pt_regs *regs = vcpu_regs (v);
390 struct switch_stack *sw = (struct switch_stack *) regs - 1;
391 extern void ia64_ret_from_clone;
393 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
394 sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
395 sw->b0 = (unsigned long) &ia64_ret_from_clone;
396 sw->ar_fpsr = FPSR_DEFAULT;
397 v->arch._thread.ksp = (unsigned long) sw - 16;
398 // Stay on the kernel stack because we may get interrupts!
399 // ia64_ret_from_clone switches to the user stack.
400 v->arch._thread.on_ustack = 0;
401 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
402 }
404 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
405 static int opt_pervcpu_vhpt = 1;
406 integer_param("pervcpu_vhpt", opt_pervcpu_vhpt);
407 #endif
409 int arch_domain_create(struct domain *d)
410 {
411 int i;
413 // the following will eventually need to be negotiated dynamically
414 d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
415 d->arch.breakimm = 0x1000;
416 for (i = 0; i < NR_CPUS; i++) {
417 d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
418 }
420 if (is_idle_domain(d))
421 return 0;
423 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
424 d->arch.has_pervcpu_vhpt = opt_pervcpu_vhpt;
425 dprintk(XENLOG_WARNING, "%s:%d domain %d pervcpu_vhpt %d\n",
426 __func__, __LINE__, d->domain_id, d->arch.has_pervcpu_vhpt);
427 #endif
428 if (tlb_track_create(d) < 0)
429 goto fail_nomem1;
430 d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
431 if (d->shared_info == NULL)
432 goto fail_nomem;
433 memset(d->shared_info, 0, XSI_SIZE);
434 for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
435 share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
436 d, XENSHARE_writable);
438 /* We may also need an emulation rid for region 4, though it's
439 * unlikely that a guest will issue uncacheable accesses in
440 * metaphysical mode. But keeping such info here may be saner.
441 */
442 if (!allocate_rid_range(d,0))
443 goto fail_nomem;
445 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
447 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
448 goto fail_nomem;
450 d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
451 RANGESETF_prettyprint_hex);
453 printk ("arch_domain_create: domain=%p\n", d);
454 return 0;
456 fail_nomem:
457 tlb_track_destroy(d);
458 fail_nomem1:
459 if (d->arch.mm.pgd != NULL)
460 pgd_free(d->arch.mm.pgd);
461 if (d->shared_info != NULL)
462 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
463 return -ENOMEM;
464 }
466 void arch_domain_destroy(struct domain *d)
467 {
468 mm_final_teardown(d);
470 if (d->shared_info != NULL)
471 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
473 tlb_track_destroy(d);
475 /* Clear vTLB for the next domain. */
476 domain_flush_tlb_vhpt(d);
478 deallocate_rid_range(d);
479 }
481 void arch_getdomaininfo_ctxt(struct vcpu *v, struct vcpu_guest_context *c)
482 {
483 int i;
484 struct vcpu_extra_regs *er = &c->extra_regs;
486 c->user_regs = *vcpu_regs (v);
487 c->privregs_pfn = virt_to_maddr(v->arch.privregs) >> PAGE_SHIFT;
489 /* Fill extra regs. */
490 for (i = 0; i < 8; i++) {
491 er->itrs[i].pte = v->arch.itrs[i].pte.val;
492 er->itrs[i].itir = v->arch.itrs[i].itir;
493 er->itrs[i].vadr = v->arch.itrs[i].vadr;
494 er->itrs[i].rid = v->arch.itrs[i].rid;
495 }
496 for (i = 0; i < 8; i++) {
497 er->dtrs[i].pte = v->arch.dtrs[i].pte.val;
498 er->dtrs[i].itir = v->arch.dtrs[i].itir;
499 er->dtrs[i].vadr = v->arch.dtrs[i].vadr;
500 er->dtrs[i].rid = v->arch.dtrs[i].rid;
501 }
502 er->event_callback_ip = v->arch.event_callback_ip;
503 er->dcr = v->arch.dcr;
504 er->iva = v->arch.iva;
505 }
507 int arch_set_info_guest(struct vcpu *v, struct vcpu_guest_context *c)
508 {
509 struct pt_regs *regs = vcpu_regs (v);
510 struct domain *d = v->domain;
512 *regs = c->user_regs;
514 if (!d->arch.is_vti) {
515 /* domain runs at PL2/3 */
516 regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT;
517 regs->ar_rsc |= (2 << 2); /* force PL2/3 */
518 }
520 if (c->flags & VGCF_EXTRA_REGS) {
521 int i;
522 struct vcpu_extra_regs *er = &c->extra_regs;
524 for (i = 0; i < 8; i++) {
525 vcpu_set_itr(v, i, er->itrs[i].pte,
526 er->itrs[i].itir,
527 er->itrs[i].vadr,
528 er->itrs[i].rid);
529 }
530 for (i = 0; i < 8; i++) {
531 vcpu_set_dtr(v, i,
532 er->dtrs[i].pte,
533 er->dtrs[i].itir,
534 er->dtrs[i].vadr,
535 er->dtrs[i].rid);
536 }
537 v->arch.event_callback_ip = er->event_callback_ip;
538 v->arch.dcr = er->dcr;
539 v->arch.iva = er->iva;
540 }
542 if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
543 return 0;
544 if (d->arch.is_vti)
545 vmx_final_setup_guest(v);
547 /* This overrides some registers. */
548 vcpu_init_regs(v);
550 /* Don't redo final setup */
551 set_bit(_VCPUF_initialised, &v->vcpu_flags);
552 return 0;
553 }
555 static void relinquish_memory(struct domain *d, struct list_head *list)
556 {
557 struct list_head *ent;
558 struct page_info *page;
559 #ifndef __ia64__
560 unsigned long x, y;
561 #endif
563 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
564 spin_lock_recursive(&d->page_alloc_lock);
565 ent = list->next;
566 while ( ent != list )
567 {
568 page = list_entry(ent, struct page_info, list);
569 /* Grab a reference to the page so it won't disappear from under us. */
570 if ( unlikely(!get_page(page, d)) )
571 {
572 /* Couldn't get a reference -- someone is freeing this page. */
573 ent = ent->next;
574 continue;
575 }
577 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
578 put_page_and_type(page);
580 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
581 put_page(page);
583 #ifndef __ia64__
584 /*
585 * Forcibly invalidate base page tables at this point to break circular
586 * 'linear page table' references. This is okay because MMU structures
587 * are not shared across domains and this domain is now dead. Thus base
588 * tables are not in use so a non-zero count means circular reference.
589 */
590 y = page->u.inuse.type_info;
591 for ( ; ; )
592 {
593 x = y;
594 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
595 (PGT_base_page_table|PGT_validated)) )
596 break;
598 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
599 if ( likely(y == x) )
600 {
601 free_page_type(page, PGT_base_page_table);
602 break;
603 }
604 }
605 #endif
607 /* Follow the list chain and /then/ potentially free the page. */
608 ent = ent->next;
609 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
610 put_page(page);
611 }
613 spin_unlock_recursive(&d->page_alloc_lock);
614 }
616 void domain_relinquish_resources(struct domain *d)
617 {
618 /* Relinquish guest resources for VT-i domain. */
619 if (d->vcpu[0] && VMX_DOMAIN(d->vcpu[0]))
620 vmx_relinquish_guest_resources(d);
622 /* Tear down shadow mode stuff. */
623 mm_teardown(d);
625 /* Relinquish every page of memory. */
626 relinquish_memory(d, &d->xenpage_list);
627 relinquish_memory(d, &d->page_list);
629 if (d->arch.is_vti && d->arch.sal_data)
630 xfree(d->arch.sal_data);
631 }
633 unsigned long
634 domain_set_shared_info_va (unsigned long va)
635 {
636 struct vcpu *v = current;
637 struct domain *d = v->domain;
638 struct vcpu *v1;
640 /* Check virtual address:
641 must belong to region 7,
642 must be 64Kb aligned,
643 must not be within Xen virtual space. */
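/* For example, DEFAULT_SHAREDINFO_ADDR, assigned in arch_domain_create(),
 is presumably chosen to satisfy these checks: a region-7 address has its
 top three bits set (va >> 61 == 7), and 64Kb alignment means the low
 16 bits are zero. */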
644 if ((va >> 61) != 7
645 || (va & 0xffffUL) != 0
646 || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
647 panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);
649 /* Note: this doesn't work well if other cpus are already running.
650 However this is part of the spec :-) */
651 printk ("Domain set shared_info_va to 0x%016lx\n", va);
652 d->arch.shared_info_va = va;
654 for_each_vcpu (d, v1) {
655 VCPU(v1, interrupt_mask_addr) =
656 (unsigned char *)va + INT_ENABLE_OFFSET(v1);
657 }
659 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)(va + XSI_PSR_IC_OFS);
661 /* Remap the shared pages. */
662 set_one_rr (7UL << 61, PSCB(v,rrs[7]));
664 return 0;
665 }
667 /* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
668 #define SHADOW_COPY_CHUNK (1024 / sizeof (unsigned long))
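/* With 8-byte longs this is 128 longs per chunk, i.e. 1024 bytes of bitmap
 * (dirty bits for 8192 guest pages) copied and cleared per iteration of the
 * loop in the CLEAN op below. */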
670 int shadow_mode_control(struct domain *d, xen_domctl_shadow_op_t *sc)
671 {
672 unsigned int op = sc->op;
673 int rc = 0;
674 int i;
675 //struct vcpu *v;
677 if (unlikely(d == current->domain)) {
678 gdprintk(XENLOG_INFO,
679 "Don't try to do a shadow op on yourself!\n");
680 return -EINVAL;
681 }
683 domain_pause(d);
685 switch (op)
686 {
687 case XEN_DOMCTL_SHADOW_OP_OFF:
688 if (shadow_mode_enabled (d)) {
689 u64 *bm = d->arch.shadow_bitmap;
691 /* Flush vhpt and tlb to restore dirty bit usage. */
692 domain_flush_tlb_vhpt(d);
694 /* Free bitmap. */
695 d->arch.shadow_bitmap_size = 0;
696 d->arch.shadow_bitmap = NULL;
697 xfree(bm);
698 }
699 break;
701 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
702 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
703 rc = -EINVAL;
704 break;
706 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
707 if (shadow_mode_enabled(d)) {
708 rc = -EINVAL;
709 break;
710 }
712 atomic64_set(&d->arch.shadow_fault_count, 0);
713 atomic64_set(&d->arch.shadow_dirty_count, 0);
715 d->arch.shadow_bitmap_size = (d->max_pages + BITS_PER_LONG-1) &
716 ~(BITS_PER_LONG-1);
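/* Worked example (assuming the usual 16Kb page size here): a domain with
 max_pages = 262144, i.e. 4Gb of memory, needs 262144 dirty bits, which
 is 4096 unsigned longs or 32Kb of bitmap. */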
717 d->arch.shadow_bitmap = xmalloc_array(unsigned long,
718 d->arch.shadow_bitmap_size / BITS_PER_LONG);
719 if (d->arch.shadow_bitmap == NULL) {
720 d->arch.shadow_bitmap_size = 0;
721 rc = -ENOMEM;
722 }
723 else {
724 memset(d->arch.shadow_bitmap, 0,
725 d->arch.shadow_bitmap_size / 8);
727 /* Flush vhpt and tlb to enable dirty bit
728 virtualization. */
729 domain_flush_tlb_vhpt(d);
730 }
731 break;
733 case XEN_DOMCTL_SHADOW_OP_CLEAN:
734 {
735 int nbr_longs;
737 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
738 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
740 atomic64_set(&d->arch.shadow_fault_count, 0);
741 atomic64_set(&d->arch.shadow_dirty_count, 0);
743 if (guest_handle_is_null(sc->dirty_bitmap) ||
744 (d->arch.shadow_bitmap == NULL)) {
745 rc = -EINVAL;
746 break;
747 }
749 if (sc->pages > d->arch.shadow_bitmap_size)
750 sc->pages = d->arch.shadow_bitmap_size;
752 nbr_longs = (sc->pages + BITS_PER_LONG - 1) / BITS_PER_LONG;
754 for (i = 0; i < nbr_longs; i += SHADOW_COPY_CHUNK) {
755 int size = (nbr_longs - i) > SHADOW_COPY_CHUNK ?
756 SHADOW_COPY_CHUNK : nbr_longs - i;
758 if (copy_to_guest_offset(sc->dirty_bitmap, i,
759 d->arch.shadow_bitmap + i,
760 size)) {
761 rc = -EFAULT;
762 break;
763 }
765 memset(d->arch.shadow_bitmap + i,
766 0, size * sizeof(unsigned long));
767 }
769 break;
770 }
772 case XEN_DOMCTL_SHADOW_OP_PEEK:
773 {
774 unsigned long size;
776 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
777 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
779 if (guest_handle_is_null(sc->dirty_bitmap) ||
780 (d->arch.shadow_bitmap == NULL)) {
781 rc = -EINVAL;
782 break;
783 }
785 if (sc->pages > d->arch.shadow_bitmap_size)
786 sc->pages = d->arch.shadow_bitmap_size;
788 size = (sc->pages + BITS_PER_LONG - 1) / BITS_PER_LONG;
789 if (copy_to_guest(sc->dirty_bitmap,
790 d->arch.shadow_bitmap, size)) {
791 rc = -EFAULT;
792 break;
793 }
794 break;
795 }
796 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
797 sc->mb = 0;
798 break;
799 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
800 if (sc->mb > 0) {
801 BUG();
802 rc = -ENOMEM;
803 }
804 break;
805 default:
806 rc = -EINVAL;
807 break;
808 }
810 domain_unpause(d);
812 return rc;
813 }
815 // remove following line if not privifying in memory
816 //#define HAVE_PRIVIFY_MEMORY
817 #ifndef HAVE_PRIVIFY_MEMORY
818 #define privify_memory(x,y) do {} while(0)
819 #endif
821 // see arch/x86/xxx/domain_build.c
822 int elf_sanity_check(Elf_Ehdr *ehdr)
823 {
824 if (!(IS_ELF(*ehdr)))
825 {
826 printk("DOM0 image is not a Xen-compatible Elf image.\n");
827 return 0;
828 }
829 return 1;
830 }
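/* loaddomainelfimage() walks the program headers of the dom0 ELF image and,
 * for each PT_LOAD segment, allocates the domain's metaphysical pages one at
 * a time with assign_new_domain_page(), copying at most PAGE_SIZE bytes of
 * file data into each page and zero-filling whatever lies beyond p_filesz
 * (the bss part of the segment). */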
832 static void loaddomainelfimage(struct domain *d, unsigned long image_start)
833 {
834 char *elfbase = (char *) image_start;
835 Elf_Ehdr ehdr;
836 Elf_Phdr phdr;
837 int h, filesz, memsz;
838 unsigned long elfaddr, dom_mpaddr, dom_imva;
839 struct page_info *p;
841 memcpy(&ehdr, (void *) image_start, sizeof(Elf_Ehdr));
842 for ( h = 0; h < ehdr.e_phnum; h++ ) {
843 memcpy(&phdr,
844 elfbase + ehdr.e_phoff + (h*ehdr.e_phentsize),
845 sizeof(Elf_Phdr));
846 if ((phdr.p_type != PT_LOAD))
847 continue;
849 filesz = phdr.p_filesz;
850 memsz = phdr.p_memsz;
851 elfaddr = (unsigned long) elfbase + phdr.p_offset;
852 dom_mpaddr = phdr.p_paddr;
854 while (memsz > 0) {
855 p = assign_new_domain_page(d,dom_mpaddr);
856 BUG_ON (unlikely(p == NULL));
857 dom_imva = __va_ul(page_to_maddr(p));
858 if (filesz > 0) {
859 if (filesz >= PAGE_SIZE)
860 memcpy((void *) dom_imva,
861 (void *) elfaddr,
862 PAGE_SIZE);
863 else {
864 // copy partial page
865 memcpy((void *) dom_imva,
866 (void *) elfaddr, filesz);
867 // zero the rest of page
868 memset((void *) dom_imva+filesz, 0,
869 PAGE_SIZE-filesz);
870 }
871 //FIXME: This test for code seems to find a lot more than objdump -x does
872 if (phdr.p_flags & PF_X) {
873 privify_memory(dom_imva,PAGE_SIZE);
874 flush_icache_range(dom_imva,
875 dom_imva+PAGE_SIZE);
876 }
877 }
878 else if (memsz > 0) {
879 /* always zero out entire page */
880 memset((void *) dom_imva, 0, PAGE_SIZE);
881 }
882 memsz -= PAGE_SIZE;
883 filesz -= PAGE_SIZE;
884 elfaddr += PAGE_SIZE;
885 dom_mpaddr += PAGE_SIZE;
886 }
887 }
888 }
890 void alloc_dom0(void)
891 {
892 /* Check dom0 size. */
893 if (dom0_size < 4 * 1024 * 1024) {
894 panic("dom0_mem is too small, boot aborted"
895 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
896 }
898 if (running_on_sim) {
899 dom0_size = 128*1024*1024; //FIXME: Should be configurable
900 }
902 /* no need to allocate pages for now
903 * pages are allocated by assign_new_domain_page() via loaddomainelfimage()
904 */
905 }
908 /*
909 * Domain 0 has direct access to absolutely all devices. However,
910 * the major point of this stub is to allow alloc_dom_mem to handle
911 * order > 0 requests. Dom0 requires that bit set in order to
912 * allocate memory for other domains.
913 */
914 static void physdev_init_dom0(struct domain *d)
915 {
916 if (iomem_permit_access(d, 0UL, ~0UL))
917 BUG();
918 if (irqs_permit_access(d, 0, NR_IRQS-1))
919 BUG();
920 if (ioports_permit_access(d, 0, 0xffff))
921 BUG();
922 }
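/* construct_dom0() roughly proceeds as follows: parse the dom0 ELF image,
 * place the initrd near the top of dom0's memory, reserve a start_info page
 * just after the kernel, bring up the secondary dom0 vcpus, load the ELF
 * segments, build the firmware tables and the ia64_boot_param block, and
 * finally point the boot vcpu's cr.iip at the kernel entry point. */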
924 int construct_dom0(struct domain *d,
925 unsigned long image_start, unsigned long image_len,
926 unsigned long initrd_start, unsigned long initrd_len,
927 char *cmdline)
928 {
929 int i, rc;
930 start_info_t *si;
931 dom0_vga_console_info_t *ci;
932 struct vcpu *v = d->vcpu[0];
933 unsigned long max_pages;
935 struct domain_setup_info dsi;
936 unsigned long p_start;
937 unsigned long pkern_start;
938 unsigned long pkern_entry;
939 unsigned long pkern_end;
940 unsigned long pinitrd_start = 0;
941 unsigned long pstart_info;
942 struct page_info *start_info_page;
943 unsigned long bp_mpa;
944 struct ia64_boot_param *bp;
946 #ifdef VALIDATE_VT
947 unsigned int vmx_dom0 = 0;
948 unsigned long mfn;
949 struct page_info *page = NULL;
950 #endif
952 //printk("construct_dom0: starting\n");
954 /* Sanity! */
955 BUG_ON(d != dom0);
956 BUG_ON(d->vcpu[0] == NULL);
957 BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
959 memset(&dsi, 0, sizeof(struct domain_setup_info));
961 printk("*** LOADING DOMAIN 0 ***\n");
963 max_pages = dom0_size / PAGE_SIZE;
964 d->max_pages = max_pages;
965 d->tot_pages = 0;
966 dsi.image_addr = (unsigned long)image_start;
967 dsi.image_len = image_len;
968 rc = parseelfimage(&dsi);
969 if ( rc != 0 )
970 return rc;
972 #ifdef VALIDATE_VT
973 /* Temp workaround */
974 if (running_on_sim)
975 dsi.xen_section_string = (char *)1;
977 /* Check whether dom0 is vti domain */
978 if ((!vmx_enabled) && !dsi.xen_section_string) {
979 printk("Lack of hardware support for unmodified vmx dom0\n");
980 panic("");
981 }
983 if (vmx_enabled && !dsi.xen_section_string) {
984 printk("Dom0 is vmx domain!\n");
985 vmx_dom0 = 1;
986 }
987 #endif
989 p_start = dsi.v_start;
990 pkern_start = dsi.v_kernstart;
991 pkern_end = dsi.v_kernend;
992 pkern_entry = dsi.v_kernentry;
994 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
996 if ( (p_start & (PAGE_SIZE-1)) != 0 )
997 {
998 printk("Initial guest OS must load to a page boundary.\n");
999 return -EINVAL;
1000 }
1002 pstart_info = PAGE_ALIGN(pkern_end);
1003 if(initrd_start && initrd_len){
1004 unsigned long offset;
1006 pinitrd_start= dom0_size - (PAGE_ALIGN(initrd_len) + 4*1024*1024);
1007 if (pinitrd_start <= pstart_info)
1008 panic("%s: not enough memory is assigned to dom0", __func__);
1010 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
1011 struct page_info *p;
1012 p = assign_new_domain_page(d, pinitrd_start + offset);
1013 if (p == NULL)
1014 panic("%s: can't allocate page for initrd image", __func__);
1015 if (initrd_len < offset + PAGE_SIZE)
1016 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
1017 initrd_len - offset);
1018 else
1019 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
1020 }
1021 }
1023 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
1024 " Kernel image: %lx->%lx\n"
1025 " Entry address: %lx\n"
1026 " Init. ramdisk: %lx len %lx\n"
1027 " Start info.: %lx->%lx\n",
1028 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
1029 pstart_info, pstart_info + PAGE_SIZE);
1031 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
1032 {
1033 printk("Initial guest OS requires too much space\n"
1034 "(%luMB is greater than %luMB limit)\n",
1035 (pkern_end-pkern_start)>>20,
1036 (max_pages <<PAGE_SHIFT)>>20);
1037 return -ENOMEM;
1038 }
1040 // if high 3 bits of pkern start are non-zero, error
1042 // if pkern end is after end of metaphysical memory, error
1043 // (we should be able to deal with this... later)
1045 /* Mask all upcalls... */
1046 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
1047 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
1049 if (dom0_max_vcpus == 0)
1050 dom0_max_vcpus = MAX_VIRT_CPUS;
1051 if (dom0_max_vcpus > num_online_cpus())
1052 dom0_max_vcpus = num_online_cpus();
1053 if (dom0_max_vcpus > MAX_VIRT_CPUS)
1054 dom0_max_vcpus = MAX_VIRT_CPUS;
1056 printk ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
1057 for ( i = 1; i < dom0_max_vcpus; i++ )
1058 if (alloc_vcpu(d, i, i) == NULL)
1059 printk ("Cannot allocate dom0 vcpu %d\n", i);
1061 /* Copy the OS image. */
1062 loaddomainelfimage(d,image_start);
1064 /* Copy the initial ramdisk. */
1065 //if ( initrd_len != 0 )
1066 // memcpy((void *)vinitrd_start, initrd_start, initrd_len);
1068 BUILD_BUG_ON(sizeof(start_info_t) + sizeof(dom0_vga_console_info_t) +
1069 sizeof(struct ia64_boot_param) > PAGE_SIZE);
1071 /* Set up start info area. */
1072 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
1073 start_info_page = assign_new_domain_page(d, pstart_info);
1074 if (start_info_page == NULL)
1075 panic("can't allocate start info page");
1076 si = page_to_virt(start_info_page);
1077 memset(si, 0, PAGE_SIZE);
1078 sprintf(si->magic, "xen-%i.%i-ia64",
1079 xen_major_version(), xen_minor_version());
1080 si->nr_pages = max_pages;
1081 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
1083 printk("Dom0: 0x%lx\n", (u64)dom0);
1085 #ifdef VALIDATE_VT
1086 /* VMX specific construction for Dom0, if the hardware supports VMX
1087 * and Dom0 is an unmodified image.
1088 */
1089 if (vmx_dom0)
1090 vmx_final_setup_guest(v);
1091 #endif
1093 set_bit(_VCPUF_initialised, &v->vcpu_flags);
1095 /* Build firmware.
1096 Note: the Linux kernel reserves the memory used by start_info, so
1097 there is no need to remove it from the MDT. */
1098 bp_mpa = pstart_info + sizeof(struct start_info);
1099 dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
1101 /* Fill boot param. */
1102 strncpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
1103 si->cmd_line[sizeof(si->cmd_line)-1] = 0;
1105 bp = (struct ia64_boot_param *)((unsigned char *)si +
1106 sizeof(start_info_t));
1107 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
1109 /* We assume console has reached the last line! */
1110 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
1111 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
1112 bp->console_info.orig_x = 0;
1113 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
1114 0 : bp->console_info.num_rows - 1;
1116 bp->initrd_start = dom0_size -
1117 (PAGE_ALIGN(ia64_boot_param->initrd_size) + 4*1024*1024);
1118 bp->initrd_size = ia64_boot_param->initrd_size;
1120 ci = (dom0_vga_console_info_t *)((unsigned char *)si +
1121 sizeof(start_info_t) +
1122 sizeof(struct ia64_boot_param));
1124 if (fill_console_start_info(ci)) {
1125 si->console.dom0.info_off = sizeof(start_info_t) +
1126 sizeof(struct ia64_boot_param);
1127 si->console.dom0.info_size = sizeof(dom0_vga_console_info_t);
1128 }
1130 vcpu_init_regs (v);
1132 vcpu_regs(v)->r28 = bp_mpa;
1134 vcpu_regs (v)->cr_iip = pkern_entry;
1136 physdev_init_dom0(d);
1138 return 0;
1139 }
1141 void machine_restart(char * __unused)
1142 {
1143 console_start_sync();
1144 if (running_on_sim)
1145 printk ("machine_restart called. spinning...\n");
1146 else
1147 (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
1148 while(1);
1149 }
1151 extern void cpu_halt(void);
1153 void machine_halt(void)
1154 {
1155 console_start_sync();
1156 if (running_on_sim)
1157 printk ("machine_halt called. spinning...\n");
1158 else
1159 cpu_halt();
1160 while(1);
1161 }
1163 void sync_vcpu_execstate(struct vcpu *v)
1164 {
1165 // __ia64_save_fpu(v->arch._thread.fph);
1166 // if (VMX_DOMAIN(v))
1167 // vmx_save_state(v);
1168 // FIXME SMP: Anything else needed here for SMP?
1169 }
1171 static void parse_dom0_mem(char *s)
1172 {
1173 dom0_size = parse_size_and_unit(s, NULL);
1174 }
1175 custom_param("dom0_mem", parse_dom0_mem);
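/* For example, booting with "dom0_mem=512M" sets dom0_size to 512Mb via
 * parse_size_and_unit(); alloc_dom0() above panics on anything below 4Mb
 * and suggests e.g. dom0_mem=256M instead. */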