direct-io.hg

view xen/arch/ia64/xen/domain.c @ 12425:5c5af79e7272

[IA64] IA64 counterpart of the change 12204:e6fdb32b786c of xen-unstable.hg

Remove xc_ia64_get_pfn_list() from setup_guest() in xc_linux_build.c,
use xc_domain_populate_physmap() and xc_domain_translate_gpfn_list().

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
author awilliam@xenbuild.aw
date Fri Nov 10 11:14:32 2006 -0700 (2006-11-10)
parents ebed72718263
children b39844e292f6
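
As a hedged, illustrative sketch of the libxc-side flow described above (this is not code from the changeset; the prototypes of xc_domain_populate_physmap() and xc_domain_translate_gpfn_list() are assumed from libxc conventions of the era and may not match this tree exactly), setup_guest() would do roughly the following: ask Xen to populate the guest physmap, then translate only the gpfns it still needs, instead of pulling the whole pfn list with xc_ia64_get_pfn_list().

    /* Hedged sketch only -- not from xc_linux_build.c; prototypes assumed. */
    #include <stdint.h>
    #include <xenctrl.h>   /* assumed to declare xen_pfn_t and the two calls below */

    static int populate_and_translate(int xc_handle, uint32_t domid,
                                      unsigned long nr_pages,
                                      xen_pfn_t *gpfns, xen_pfn_t *mfns)
    {
        unsigned long i;

        /* Ask Xen to back gpfns 0..nr_pages-1 with machine pages (order-0
         * extents) instead of reading a pfn list back from the hypervisor. */
        for (i = 0; i < nr_pages; i++)
            gpfns[i] = i;
        if (xc_domain_populate_physmap(xc_handle, domid, nr_pages,
                                       0 /* extent_order */,
                                       0 /* address_bits */, gpfns) != 0)
            return -1;

        /* Then query the resulting gpfn -> mfn mapping for the pages the
         * builder still needs to touch by machine frame number. */
        if (xc_domain_translate_gpfn_list(xc_handle, domid, nr_pages,
                                          gpfns, mfns) != 0)
            return -1;

        return 0;
    }
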
line source
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 *
12 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
13 * VA Linux Systems Japan K.K.
14 * dom0 vp model support
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/errno.h>
21 #include <xen/sched.h>
22 #include <xen/smp.h>
23 #include <xen/delay.h>
24 #include <xen/softirq.h>
25 #include <xen/mm.h>
26 #include <xen/iocap.h>
27 #include <asm/asm-xsi-offsets.h>
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <asm/processor.h>
31 #include <xen/event.h>
32 #include <xen/console.h>
33 #include <xen/version.h>
34 #include <xen/elf.h>
35 #include <asm/pgalloc.h>
36 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
37 #include <asm/vcpu.h> /* for function declarations */
38 #include <public/xen.h>
39 #include <xen/domain.h>
40 #include <asm/vmx.h>
41 #include <asm/vmx_vcpu.h>
42 #include <asm/vmx_vpd.h>
43 #include <asm/vmx_phy_mode.h>
44 #include <asm/vhpt.h>
45 #include <asm/tlbflush.h>
46 #include <asm/regionreg.h>
47 #include <asm/dom_fw.h>
48 #include <asm/shadow.h>
49 #include <xen/guest_access.h>
50 #include <asm/tlb_track.h>
52 unsigned long dom0_size = 512*1024*1024;
54 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
55 static unsigned int dom0_max_vcpus = 1;
56 integer_param("dom0_max_vcpus", dom0_max_vcpus);
58 extern unsigned long running_on_sim;
60 extern char dom0_command_line[];
62 /* forward declaration */
63 static void init_switch_stack(struct vcpu *v);
65 /* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
66 This is a Xen virtual address. */
67 DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
68 DEFINE_PER_CPU(int *, current_psr_ic_addr);
70 #include <xen/sched-if.h>
72 static void
73 ia64_disable_vhpt_walker(void)
74 {
75 // Disable the VHPT. ia64_new_rr7() might cause a VHPT
76 // fault without this because it flushes dtr[IA64_TR_VHPT].
77 // (VHPT_SIZE_LOG2 << 2) is just to avoid a
78 // Reserved Register/Field fault.
79 ia64_set_pta(VHPT_SIZE_LOG2 << 2);
80 }
82 static void flush_vtlb_for_context_switch(struct vcpu* prev, struct vcpu* next)
83 {
84 int cpu = smp_processor_id();
85 int last_vcpu_id, last_processor;
87 if (!is_idle_domain(prev->domain))
88 tlbflush_update_time
89 (&prev->domain->arch.last_vcpu[cpu].tlbflush_timestamp,
90 tlbflush_current_time());
92 if (is_idle_domain(next->domain))
93 return;
95 last_vcpu_id = next->domain->arch.last_vcpu[cpu].vcpu_id;
96 last_processor = next->arch.last_processor;
98 next->domain->arch.last_vcpu[cpu].vcpu_id = next->vcpu_id;
99 next->arch.last_processor = cpu;
101 if ((last_vcpu_id != next->vcpu_id &&
102 last_vcpu_id != INVALID_VCPU_ID) ||
103 (last_vcpu_id == next->vcpu_id &&
104 last_processor != cpu &&
105 last_processor != INVALID_PROCESSOR)) {
106 #ifdef CONFIG_XEN_IA64_TLBFLUSH_CLOCK
107 u32 last_tlbflush_timestamp =
108 next->domain->arch.last_vcpu[cpu].tlbflush_timestamp;
109 #endif
110 int vhpt_is_flushed = 0;
112 // if the vTLB implementation is changed,
113 // the following must be updated as well.
114 if (VMX_DOMAIN(next)) {
115 // currently the vTLB for a VT-i domain is per vcpu,
116 // so no flushing is needed.
117 } else if (HAS_PERVCPU_VHPT(next->domain)) {
118 // nothing to do
119 } else {
120 if (NEED_FLUSH(__get_cpu_var(vhpt_tlbflush_timestamp),
121 last_tlbflush_timestamp)) {
122 local_vhpt_flush();
123 vhpt_is_flushed = 1;
124 }
125 }
126 if (vhpt_is_flushed || NEED_FLUSH(__get_cpu_var(tlbflush_time),
127 last_tlbflush_timestamp)) {
128 local_flush_tlb_all();
129 perfc_incrc(tlbflush_clock_cswitch_purge);
130 } else {
131 perfc_incrc(tlbflush_clock_cswitch_skip);
132 }
133 perfc_incrc(flush_vtlb_for_context_switch);
134 }
135 }
137 void schedule_tail(struct vcpu *prev)
138 {
139 extern char ia64_ivt;
140 context_saved(prev);
142 ia64_disable_vhpt_walker();
143 if (VMX_DOMAIN(current)) {
144 vmx_do_launch(current);
145 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
146 current->processor);
147 } else {
148 ia64_set_iva(&ia64_ivt);
149 load_region_regs(current);
150 ia64_set_pta(vcpu_pta(current));
151 vcpu_load_kernel_regs(current);
152 __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
153 shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
154 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
155 (current->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
156 migrate_timer(&current->arch.hlt_timer, current->processor);
157 }
158 flush_vtlb_for_context_switch(prev, current);
159 }
161 void context_switch(struct vcpu *prev, struct vcpu *next)
162 {
163 uint64_t spsr;
165 local_irq_save(spsr);
167 __ia64_save_fpu(prev->arch._thread.fph);
168 __ia64_load_fpu(next->arch._thread.fph);
169 if (VMX_DOMAIN(prev)) {
170 vmx_save_state(prev);
171 if (!VMX_DOMAIN(next)) {
172 /* VMX domains can change the physical cr.dcr.
173 * Restore default to prevent leakage. */
174 ia64_setreg(_IA64_REG_CR_DCR, (IA64_DCR_DP | IA64_DCR_DK
175 | IA64_DCR_DX | IA64_DCR_DR | IA64_DCR_PP
176 | IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC));
177 }
178 }
179 if (VMX_DOMAIN(next))
180 vmx_load_state(next);
182 ia64_disable_vhpt_walker();
183 /*ia64_psr(ia64_task_regs(next))->dfh = !ia64_is_local_fpu_owner(next);*/
184 prev = ia64_switch_to(next);
186 /* Note: ia64_switch_to does not return here at vcpu initialization. */
188 //cpu_set(smp_processor_id(), current->domain->domain_dirty_cpumask);
190 if (VMX_DOMAIN(current)){
191 vmx_load_all_rr(current);
192 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
193 current->processor);
194 } else {
195 struct domain *nd;
196 extern char ia64_ivt;
198 ia64_set_iva(&ia64_ivt);
200 nd = current->domain;
201 if (!is_idle_domain(nd)) {
202 load_region_regs(current);
203 ia64_set_pta(vcpu_pta(current));
204 vcpu_load_kernel_regs(current);
205 vcpu_set_next_timer(current);
206 if (vcpu_timer_expired(current))
207 vcpu_pend_timer(current);
208 __ia64_per_cpu_var(current_psr_i_addr) = &nd->shared_info->
209 vcpu_info[current->vcpu_id].evtchn_upcall_mask;
210 __ia64_per_cpu_var(current_psr_ic_addr) =
211 (int *)(nd->arch.shared_info_va + XSI_PSR_IC_OFS);
212 } else {
213 /* When switching to the idle domain, we only need to disable the
214 * vhpt walker. All accesses that happen within the idle context
215 * will then be handled by TR mappings and the identity mapping.
216 */
217 __ia64_per_cpu_var(current_psr_i_addr) = NULL;
218 __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
219 }
220 }
221 local_irq_restore(spsr);
222 flush_vtlb_for_context_switch(prev, current);
223 context_saved(prev);
224 }
226 void continue_running(struct vcpu *same)
227 {
228 /* nothing to do */
229 }
231 static void default_idle(void)
232 {
233 local_irq_disable();
234 if ( !softirq_pending(smp_processor_id()) )
235 safe_halt();
236 local_irq_enable();
237 }
239 static void continue_cpu_idle_loop(void)
240 {
241 for ( ; ; )
242 {
243 #ifdef IA64
244 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
245 #else
246 irq_stat[cpu].idle_timestamp = jiffies;
247 #endif
248 while ( !softirq_pending(smp_processor_id()) )
249 default_idle();
250 raise_softirq(SCHEDULE_SOFTIRQ);
251 do_softirq();
252 }
253 }
255 void startup_cpu_idle_loop(void)
256 {
257 /* Just some sanity to ensure that the scheduler is set up okay. */
258 ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
259 raise_softirq(SCHEDULE_SOFTIRQ);
261 continue_cpu_idle_loop();
262 }
264 /* compile-time test for get_order(sizeof(mapped_regs_t)) !=
265 * get_order_from_shift(XMAPPEDREGS_SHIFT)
266 */
267 #if !(((1 << (XMAPPEDREGS_SHIFT - 1)) < MAPPED_REGS_T_SIZE) && \
268 (MAPPED_REGS_T_SIZE < (1 << (XMAPPEDREGS_SHIFT + 1))))
269 # error "XMAPPEDREGS_SHIFT doesn't match sizeof(mapped_regs_t)."
270 #endif
272 void hlt_timer_fn(void *data)
273 {
274 struct vcpu *v = data;
275 vcpu_unblock(v);
276 }
278 void relinquish_vcpu_resources(struct vcpu *v)
279 {
280 if (HAS_PERVCPU_VHPT(v->domain))
281 pervcpu_vhpt_free(v);
282 if (v->arch.privregs != NULL) {
283 free_xenheap_pages(v->arch.privregs,
284 get_order_from_shift(XMAPPEDREGS_SHIFT));
285 v->arch.privregs = NULL;
286 }
287 kill_timer(&v->arch.hlt_timer);
288 }
290 struct vcpu *alloc_vcpu_struct(void)
291 {
292 struct vcpu *v;
293 struct thread_info *ti;
294 static int first_allocation = 1;
296 if (first_allocation) {
297 first_allocation = 0;
298 /* Keep idle vcpu0 statically allocated at compile time, because
299 * some code inherited from Linux still requires it in the early phase.
300 */
301 return idle_vcpu[0];
302 }
304 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
305 return NULL;
306 memset(v, 0, sizeof(*v));
308 ti = alloc_thread_info(v);
309 /* Clear thread_info to clear some important fields, like
310 * preempt_count
311 */
312 memset(ti, 0, sizeof(struct thread_info));
313 init_switch_stack(v);
315 return v;
316 }
318 void free_vcpu_struct(struct vcpu *v)
319 {
320 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
321 }
323 int vcpu_initialise(struct vcpu *v)
324 {
325 struct domain *d = v->domain;
326 int rc, order, i;
328 if (!is_idle_domain(d)) {
329 if (!d->arch.is_vti) {
330 if (HAS_PERVCPU_VHPT(d))
331 if ((rc = pervcpu_vhpt_alloc(v)) != 0)
332 return rc;
334 /* Create privregs page only if not VTi. */
335 order = get_order_from_shift(XMAPPEDREGS_SHIFT);
336 v->arch.privregs = alloc_xenheap_pages(order);
337 BUG_ON(v->arch.privregs == NULL);
338 memset(v->arch.privregs, 0, 1 << XMAPPEDREGS_SHIFT);
339 for (i = 0; i < (1 << order); i++)
340 share_xen_page_with_guest(virt_to_page(v->arch.privregs) +
341 i, d, XENSHARE_writable);
343 tlbflush_update_time(&v->arch.tlbflush_timestamp,
344 tlbflush_current_time());
345 }
347 v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
348 v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
349 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
350 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;
352 /* Is this correct?
353 It depends on the domain's rid usage.
355 A domain may share rids among its processors (e.g. when it has a
356 global VHPT). In this case, we should also share rids among
357 vcpus and the rid ranges should be the same.
359 However, a domain may have per-cpu rid allocation. In that case
360 we don't want to share rids among vcpus, though we may do so
361 if two vcpus run on the same cpu... */
363 v->arch.starting_rid = d->arch.starting_rid;
364 v->arch.ending_rid = d->arch.ending_rid;
365 v->arch.breakimm = d->arch.breakimm;
366 v->arch.last_processor = INVALID_PROCESSOR;
367 }
369 if (!VMX_DOMAIN(v))
370 init_timer(&v->arch.hlt_timer, hlt_timer_fn, v,
371 first_cpu(cpu_online_map));
373 return 0;
374 }
376 void vcpu_destroy(struct vcpu *v)
377 {
378 if (v->domain->arch.is_vti)
379 vmx_relinquish_vcpu_resources(v);
380 else
381 relinquish_vcpu_resources(v);
382 }
384 static void init_switch_stack(struct vcpu *v)
385 {
386 struct pt_regs *regs = vcpu_regs (v);
387 struct switch_stack *sw = (struct switch_stack *) regs - 1;
388 extern void ia64_ret_from_clone;
390 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
391 sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
392 sw->b0 = (unsigned long) &ia64_ret_from_clone;
393 sw->ar_fpsr = FPSR_DEFAULT;
394 v->arch._thread.ksp = (unsigned long) sw - 16;
395 // stay on the kernel stack because we may get interrupts!
396 // ia64_ret_from_clone switches to the user stack
397 v->arch._thread.on_ustack = 0;
398 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
399 }
401 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
402 static int opt_pervcpu_vhpt = 1;
403 integer_param("pervcpu_vhpt", opt_pervcpu_vhpt);
404 #endif
406 int arch_domain_create(struct domain *d)
407 {
408 int i;
410 // the following will eventually need to be negotiated dynamically
411 d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
412 d->arch.breakimm = 0x1000;
413 for (i = 0; i < NR_CPUS; i++) {
414 d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
415 }
417 if (is_idle_domain(d))
418 return 0;
420 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
421 d->arch.has_pervcpu_vhpt = opt_pervcpu_vhpt;
422 dprintk(XENLOG_WARNING, "%s:%d domain %d pervcpu_vhpt %d\n",
423 __func__, __LINE__, d->domain_id, d->arch.has_pervcpu_vhpt);
424 #endif
425 if (tlb_track_create(d) < 0)
426 goto fail_nomem1;
427 d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
428 if (d->shared_info == NULL)
429 goto fail_nomem;
430 memset(d->shared_info, 0, XSI_SIZE);
431 for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
432 share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
433 d, XENSHARE_writable);
435 /* We may also need an emulation rid for region 4, though it's
436 * unlikely that a guest will issue uncacheable accesses in
437 * metaphysical mode. But keeping such info here may be saner.
438 */
439 if (!allocate_rid_range(d,0))
440 goto fail_nomem;
442 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
444 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
445 goto fail_nomem;
447 d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
448 RANGESETF_prettyprint_hex);
450 printk ("arch_domain_create: domain=%p\n", d);
451 return 0;
453 fail_nomem:
454 tlb_track_destroy(d);
455 fail_nomem1:
456 if (d->arch.mm.pgd != NULL)
457 pgd_free(d->arch.mm.pgd);
458 if (d->shared_info != NULL)
459 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
460 return -ENOMEM;
461 }
463 void arch_domain_destroy(struct domain *d)
464 {
465 BUG_ON(d->arch.mm.pgd != NULL);
466 if (d->shared_info != NULL)
467 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
468 if (d->arch.shadow_bitmap != NULL)
469 xfree(d->arch.shadow_bitmap);
471 tlb_track_destroy(d);
473 /* Clear vTLB for the next domain. */
474 domain_flush_tlb_vhpt(d);
476 deallocate_rid_range(d);
477 }
479 void arch_getdomaininfo_ctxt(struct vcpu *v, struct vcpu_guest_context *c)
480 {
481 int i;
482 struct vcpu_extra_regs *er = &c->extra_regs;
484 c->user_regs = *vcpu_regs (v);
485 c->privregs_pfn = virt_to_maddr(v->arch.privregs) >> PAGE_SHIFT;
487 /* Fill extra regs. */
488 for (i = 0; i < 8; i++) {
489 er->itrs[i].pte = v->arch.itrs[i].pte.val;
490 er->itrs[i].itir = v->arch.itrs[i].itir;
491 er->itrs[i].vadr = v->arch.itrs[i].vadr;
492 er->itrs[i].rid = v->arch.itrs[i].rid;
493 }
494 for (i = 0; i < 8; i++) {
495 er->dtrs[i].pte = v->arch.dtrs[i].pte.val;
496 er->dtrs[i].itir = v->arch.dtrs[i].itir;
497 er->dtrs[i].vadr = v->arch.dtrs[i].vadr;
498 er->dtrs[i].rid = v->arch.dtrs[i].rid;
499 }
500 er->event_callback_ip = v->arch.event_callback_ip;
501 er->dcr = v->arch.dcr;
502 er->iva = v->arch.iva;
503 }
505 int arch_set_info_guest(struct vcpu *v, struct vcpu_guest_context *c)
506 {
507 struct pt_regs *regs = vcpu_regs (v);
508 struct domain *d = v->domain;
510 *regs = c->user_regs;
512 if (!d->arch.is_vti) {
513 /* domain runs at PL2/3 */
514 regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT;
515 regs->ar_rsc |= (2 << 2); /* force PL2/3 */
516 }
518 if (c->flags & VGCF_EXTRA_REGS) {
519 int i;
520 struct vcpu_extra_regs *er = &c->extra_regs;
522 for (i = 0; i < 8; i++) {
523 vcpu_set_itr(v, i, er->itrs[i].pte,
524 er->itrs[i].itir,
525 er->itrs[i].vadr,
526 er->itrs[i].rid);
527 }
528 for (i = 0; i < 8; i++) {
529 vcpu_set_dtr(v, i,
530 er->dtrs[i].pte,
531 er->dtrs[i].itir,
532 er->dtrs[i].vadr,
533 er->dtrs[i].rid);
534 }
535 v->arch.event_callback_ip = er->event_callback_ip;
536 v->arch.dcr = er->dcr;
537 v->arch.iva = er->iva;
538 }
540 if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
541 return 0;
542 if (d->arch.is_vti)
543 vmx_final_setup_guest(v);
545 /* This overrides some registers. */
546 vcpu_init_regs(v);
548 /* Don't redo final setup */
549 set_bit(_VCPUF_initialised, &v->vcpu_flags);
550 return 0;
551 }
553 static void relinquish_memory(struct domain *d, struct list_head *list)
554 {
555 struct list_head *ent;
556 struct page_info *page;
557 #ifndef __ia64__
558 unsigned long x, y;
559 #endif
561 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
562 spin_lock_recursive(&d->page_alloc_lock);
563 ent = list->next;
564 while ( ent != list )
565 {
566 page = list_entry(ent, struct page_info, list);
567 /* Grab a reference to the page so it won't disappear from under us. */
568 if ( unlikely(!get_page(page, d)) )
569 {
570 /* Couldn't get a reference -- someone is freeing this page. */
571 ent = ent->next;
572 continue;
573 }
575 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
576 put_page_and_type(page);
578 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
579 put_page(page);
581 #ifndef __ia64__
582 /*
583 * Forcibly invalidate base page tables at this point to break circular
584 * 'linear page table' references. This is okay because MMU structures
585 * are not shared across domains and this domain is now dead. Thus base
586 * tables are not in use so a non-zero count means circular reference.
587 */
588 y = page->u.inuse.type_info;
589 for ( ; ; )
590 {
591 x = y;
592 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
593 (PGT_base_page_table|PGT_validated)) )
594 break;
596 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
597 if ( likely(y == x) )
598 {
599 free_page_type(page, PGT_base_page_table);
600 break;
601 }
602 }
603 #endif
605 /* Follow the list chain and /then/ potentially free the page. */
606 ent = ent->next;
607 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
608 put_page(page);
609 }
611 spin_unlock_recursive(&d->page_alloc_lock);
612 }
614 void domain_relinquish_resources(struct domain *d)
615 {
616 /* Relinquish every page of memory. */
618 // release pages by traversing d->arch.mm.
619 relinquish_mm(d);
621 if (d->vcpu[0] && VMX_DOMAIN(d->vcpu[0]))
622 vmx_relinquish_guest_resources(d);
624 relinquish_memory(d, &d->xenpage_list);
625 relinquish_memory(d, &d->page_list);
627 if (d->arch.is_vti && d->arch.sal_data)
628 xfree(d->arch.sal_data);
629 }
631 unsigned long
632 domain_set_shared_info_va (unsigned long va)
633 {
634 struct vcpu *v = current;
635 struct domain *d = v->domain;
636 struct vcpu *v1;
638 /* Check virtual address:
639 must belong to region 7,
640 must be 64Kb aligned,
641 must not be within Xen virtual space. */
642 if ((va >> 61) != 7
643 || (va & 0xffffUL) != 0
644 || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
645 panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);
647 /* Note: this doesn't work well if other cpus are already running.
648 However this is part of the spec :-) */
649 printk ("Domain set shared_info_va to 0x%016lx\n", va);
650 d->arch.shared_info_va = va;
652 for_each_vcpu (d, v1) {
653 VCPU(v1, interrupt_mask_addr) =
654 (unsigned char *)va + INT_ENABLE_OFFSET(v1);
655 }
657 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)(va + XSI_PSR_IC_OFS);
659 /* Remap the shared pages. */
660 set_one_rr (7UL << 61, PSCB(v,rrs[7]));
662 return 0;
663 }
665 /* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
666 #define SHADOW_COPY_CHUNK (1024 / sizeof (unsigned long))
668 int shadow_mode_control(struct domain *d, xen_domctl_shadow_op_t *sc)
669 {
670 unsigned int op = sc->op;
671 int rc = 0;
672 int i;
673 //struct vcpu *v;
675 if (unlikely(d == current->domain)) {
676 gdprintk(XENLOG_INFO,
677 "Don't try to do a shadow op on yourself!\n");
678 return -EINVAL;
679 }
681 domain_pause(d);
683 switch (op)
684 {
685 case XEN_DOMCTL_SHADOW_OP_OFF:
686 if (shadow_mode_enabled (d)) {
687 u64 *bm = d->arch.shadow_bitmap;
689 /* Flush vhpt and tlb to restore dirty bit usage. */
690 domain_flush_tlb_vhpt(d);
692 /* Free bitmap. */
693 d->arch.shadow_bitmap_size = 0;
694 d->arch.shadow_bitmap = NULL;
695 xfree(bm);
696 }
697 break;
699 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
700 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
701 rc = -EINVAL;
702 break;
704 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
705 if (shadow_mode_enabled(d)) {
706 rc = -EINVAL;
707 break;
708 }
710 atomic64_set(&d->arch.shadow_fault_count, 0);
711 atomic64_set(&d->arch.shadow_dirty_count, 0);
713 d->arch.shadow_bitmap_size = (d->max_pages + BITS_PER_LONG-1) &
714 ~(BITS_PER_LONG-1);
715 d->arch.shadow_bitmap = xmalloc_array(unsigned long,
716 d->arch.shadow_bitmap_size / BITS_PER_LONG);
717 if (d->arch.shadow_bitmap == NULL) {
718 d->arch.shadow_bitmap_size = 0;
719 rc = -ENOMEM;
720 }
721 else {
722 memset(d->arch.shadow_bitmap, 0,
723 d->arch.shadow_bitmap_size / 8);
725 /* Flush vhpt and tlb to enable dirty bit
726 virtualization. */
727 domain_flush_tlb_vhpt(d);
728 }
729 break;
731 case XEN_DOMCTL_SHADOW_OP_CLEAN:
732 {
733 int nbr_longs;
735 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
736 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
738 atomic64_set(&d->arch.shadow_fault_count, 0);
739 atomic64_set(&d->arch.shadow_dirty_count, 0);
741 if (guest_handle_is_null(sc->dirty_bitmap) ||
742 (d->arch.shadow_bitmap == NULL)) {
743 rc = -EINVAL;
744 break;
745 }
747 if (sc->pages > d->arch.shadow_bitmap_size)
748 sc->pages = d->arch.shadow_bitmap_size;
750 nbr_longs = (sc->pages + BITS_PER_LONG - 1) / BITS_PER_LONG;
752 for (i = 0; i < nbr_longs; i += SHADOW_COPY_CHUNK) {
753 int size = (nbr_longs - i) > SHADOW_COPY_CHUNK ?
754 SHADOW_COPY_CHUNK : nbr_longs - i;
756 if (copy_to_guest_offset(sc->dirty_bitmap, i,
757 d->arch.shadow_bitmap + i,
758 size)) {
759 rc = -EFAULT;
760 break;
761 }
763 memset(d->arch.shadow_bitmap + i,
764 0, size * sizeof(unsigned long));
765 }
767 break;
768 }
770 case XEN_DOMCTL_SHADOW_OP_PEEK:
771 {
772 unsigned long size;
774 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
775 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
777 if (guest_handle_is_null(sc->dirty_bitmap) ||
778 (d->arch.shadow_bitmap == NULL)) {
779 rc = -EINVAL;
780 break;
781 }
783 if (sc->pages > d->arch.shadow_bitmap_size)
784 sc->pages = d->arch.shadow_bitmap_size;
786 size = (sc->pages + BITS_PER_LONG - 1) / BITS_PER_LONG;
787 if (copy_to_guest(sc->dirty_bitmap,
788 d->arch.shadow_bitmap, size)) {
789 rc = -EFAULT;
790 break;
791 }
792 break;
793 }
794 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
795 sc->mb = 0;
796 break;
797 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
798 if (sc->mb > 0) {
799 BUG();
800 rc = -ENOMEM;
801 }
802 break;
803 default:
804 rc = -EINVAL;
805 break;
806 }
808 domain_unpause(d);
810 return rc;
811 }
813 // remove following line if not privifying in memory
814 //#define HAVE_PRIVIFY_MEMORY
815 #ifndef HAVE_PRIVIFY_MEMORY
816 #define privify_memory(x,y) do {} while(0)
817 #endif
819 // see arch/x86/xxx/domain_build.c
820 int elf_sanity_check(Elf_Ehdr *ehdr)
821 {
822 if (!(IS_ELF(*ehdr)))
823 {
824 printk("DOM0 image is not a Xen-compatible Elf image.\n");
825 return 0;
826 }
827 return 1;
828 }
830 static void loaddomainelfimage(struct domain *d, unsigned long image_start)
831 {
832 char *elfbase = (char *) image_start;
833 Elf_Ehdr ehdr;
834 Elf_Phdr phdr;
835 int h, filesz, memsz;
836 unsigned long elfaddr, dom_mpaddr, dom_imva;
837 struct page_info *p;
839 memcpy(&ehdr, (void *) image_start, sizeof(Elf_Ehdr));
840 for ( h = 0; h < ehdr.e_phnum; h++ ) {
841 memcpy(&phdr,
842 elfbase + ehdr.e_phoff + (h*ehdr.e_phentsize),
843 sizeof(Elf_Phdr));
844 if ((phdr.p_type != PT_LOAD))
845 continue;
847 filesz = phdr.p_filesz;
848 memsz = phdr.p_memsz;
849 elfaddr = (unsigned long) elfbase + phdr.p_offset;
850 dom_mpaddr = phdr.p_paddr;
852 while (memsz > 0) {
853 p = assign_new_domain_page(d,dom_mpaddr);
854 BUG_ON (unlikely(p == NULL));
855 dom_imva = __va_ul(page_to_maddr(p));
856 if (filesz > 0) {
857 if (filesz >= PAGE_SIZE)
858 memcpy((void *) dom_imva,
859 (void *) elfaddr,
860 PAGE_SIZE);
861 else {
862 // copy partial page
863 memcpy((void *) dom_imva,
864 (void *) elfaddr, filesz);
865 // zero the rest of page
866 memset((void *) dom_imva+filesz, 0,
867 PAGE_SIZE-filesz);
868 }
869 //FIXME: This test for code seems to find a lot more than objdump -x does
870 if (phdr.p_flags & PF_X) {
871 privify_memory(dom_imva,PAGE_SIZE);
872 flush_icache_range(dom_imva,
873 dom_imva+PAGE_SIZE);
874 }
875 }
876 else if (memsz > 0) {
877 /* always zero out entire page */
878 memset((void *) dom_imva, 0, PAGE_SIZE);
879 }
880 memsz -= PAGE_SIZE;
881 filesz -= PAGE_SIZE;
882 elfaddr += PAGE_SIZE;
883 dom_mpaddr += PAGE_SIZE;
884 }
885 }
886 }
888 void alloc_dom0(void)
889 {
890 /* Check dom0 size. */
891 if (dom0_size < 4 * 1024 * 1024) {
892 panic("dom0_mem is too small, boot aborted"
893 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
894 }
896 if (running_on_sim) {
897 dom0_size = 128*1024*1024; //FIXME: Should be configurable
898 }
900 /* no need to allocate pages for now
901 * pages are allocated by map_new_domain_page() via loaddomainelfimage()
902 */
903 }
906 /*
907 * Domain 0 has direct access to all devices. However, the major
908 * point of this stub is to allow alloc_dom_mem to handle
909 * order > 0 requests. Dom0 requires that bit set to
910 * allocate memory for other domains.
911 */
912 static void physdev_init_dom0(struct domain *d)
913 {
914 if (iomem_permit_access(d, 0UL, ~0UL))
915 BUG();
916 if (irqs_permit_access(d, 0, NR_IRQS-1))
917 BUG();
918 if (ioports_permit_access(d, 0, 0xffff))
919 BUG();
920 }
922 int construct_dom0(struct domain *d,
923 unsigned long image_start, unsigned long image_len,
924 unsigned long initrd_start, unsigned long initrd_len,
925 char *cmdline)
926 {
927 int i, rc;
928 start_info_t *si;
929 dom0_vga_console_info_t *ci;
930 struct vcpu *v = d->vcpu[0];
931 unsigned long max_pages;
933 struct domain_setup_info dsi;
934 unsigned long p_start;
935 unsigned long pkern_start;
936 unsigned long pkern_entry;
937 unsigned long pkern_end;
938 unsigned long pinitrd_start = 0;
939 unsigned long pstart_info;
940 struct page_info *start_info_page;
941 unsigned long bp_mpa;
942 struct ia64_boot_param *bp;
944 #ifdef VALIDATE_VT
945 unsigned int vmx_dom0 = 0;
946 unsigned long mfn;
947 struct page_info *page = NULL;
948 #endif
950 //printk("construct_dom0: starting\n");
952 /* Sanity! */
953 BUG_ON(d != dom0);
954 BUG_ON(d->vcpu[0] == NULL);
955 BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
957 memset(&dsi, 0, sizeof(struct domain_setup_info));
959 printk("*** LOADING DOMAIN 0 ***\n");
961 max_pages = dom0_size / PAGE_SIZE;
962 d->max_pages = max_pages;
963 d->tot_pages = 0;
964 dsi.image_addr = (unsigned long)image_start;
965 dsi.image_len = image_len;
966 rc = parseelfimage(&dsi);
967 if ( rc != 0 )
968 return rc;
970 #ifdef VALIDATE_VT
971 /* Temp workaround */
972 if (running_on_sim)
973 dsi.xen_section_string = (char *)1;
975 /* Check whether dom0 is vti domain */
976 if ((!vmx_enabled) && !dsi.xen_section_string) {
977 printk("Lack of hardware support for unmodified vmx dom0\n");
978 panic("");
979 }
981 if (vmx_enabled && !dsi.xen_section_string) {
982 printk("Dom0 is vmx domain!\n");
983 vmx_dom0 = 1;
984 }
985 #endif
987 p_start = dsi.v_start;
988 pkern_start = dsi.v_kernstart;
989 pkern_end = dsi.v_kernend;
990 pkern_entry = dsi.v_kernentry;
992 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
994 if ( (p_start & (PAGE_SIZE-1)) != 0 )
995 {
996 printk("Initial guest OS must load to a page boundary.\n");
997 return -EINVAL;
998 }
1000 pstart_info = PAGE_ALIGN(pkern_end);
1001 if(initrd_start && initrd_len){
1002 unsigned long offset;
1004 pinitrd_start= dom0_size - (PAGE_ALIGN(initrd_len) + 4*1024*1024);
1005 if (pinitrd_start <= pstart_info)
1006 panic("%s:enough memory is not assigned to dom0", __func__);
1008 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
1009 struct page_info *p;
1010 p = assign_new_domain_page(d, pinitrd_start + offset);
1011 if (p == NULL)
1012 panic("%s: can't allocate page for initrd image", __func__);
1013 if (initrd_len < offset + PAGE_SIZE)
1014 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
1015 initrd_len - offset);
1016 else
1017 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
1018 }
1019 }
1021 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
1022 " Kernel image: %lx->%lx\n"
1023 " Entry address: %lx\n"
1024 " Init. ramdisk: %lx len %lx\n"
1025 " Start info.: %lx->%lx\n",
1026 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
1027 pstart_info, pstart_info + PAGE_SIZE);
1029 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
1030 {
1031 printk("Initial guest OS requires too much space\n"
1032 "(%luMB is greater than %luMB limit)\n",
1033 (pkern_end-pkern_start)>>20,
1034 (max_pages <<PAGE_SHIFT)>>20);
1035 return -ENOMEM;
1036 }
1038 // if high 3 bits of pkern start are non-zero, error
1040 // if pkern end is after end of metaphysical memory, error
1041 // (we should be able to deal with this... later)
1043 /* Mask all upcalls... */
1044 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
1045 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
1047 if (dom0_max_vcpus == 0)
1048 dom0_max_vcpus = MAX_VIRT_CPUS;
1049 if (dom0_max_vcpus > num_online_cpus())
1050 dom0_max_vcpus = num_online_cpus();
1051 if (dom0_max_vcpus > MAX_VIRT_CPUS)
1052 dom0_max_vcpus = MAX_VIRT_CPUS;
1054 printk ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
1055 for ( i = 1; i < dom0_max_vcpus; i++ )
1056 if (alloc_vcpu(d, i, i) == NULL)
1057 printk ("Cannot allocate dom0 vcpu %d\n", i);
1059 /* Copy the OS image. */
1060 loaddomainelfimage(d,image_start);
1062 /* Copy the initial ramdisk. */
1063 //if ( initrd_len != 0 )
1064 // memcpy((void *)vinitrd_start, initrd_start, initrd_len);
1066 BUILD_BUG_ON(sizeof(start_info_t) + sizeof(dom0_vga_console_info_t) +
1067 sizeof(struct ia64_boot_param) > PAGE_SIZE);
1069 /* Set up start info area. */
1070 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
1071 start_info_page = assign_new_domain_page(d, pstart_info);
1072 if (start_info_page == NULL)
1073 panic("can't allocate start info page");
1074 si = page_to_virt(start_info_page);
1075 memset(si, 0, PAGE_SIZE);
1076 sprintf(si->magic, "xen-%i.%i-ia64",
1077 xen_major_version(), xen_minor_version());
1078 si->nr_pages = max_pages;
1079 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
1081 printk("Dom0: 0x%lx\n", (u64)dom0);
1083 #ifdef VALIDATE_VT
1084 /* VMX specific construction for Dom0, if hardware supports VMX
1085 * and Dom0 is unmodified image
1086 */
1087 if (vmx_dom0)
1088 vmx_final_setup_guest(v);
1089 #endif
1091 set_bit(_VCPUF_initialised, &v->vcpu_flags);
1093 /* Build firmware.
1094 Note: the Linux kernel reserves the memory used by start_info,
1095 so there is no need to remove it from the MDT. */
1096 bp_mpa = pstart_info + sizeof(struct start_info);
1097 dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
1099 /* Fill boot param. */
1100 strncpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
1101 si->cmd_line[sizeof(si->cmd_line)-1] = 0;
1103 bp = (struct ia64_boot_param *)((unsigned char *)si +
1104 sizeof(start_info_t));
1105 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
1107 /* We assume console has reached the last line! */
1108 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
1109 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
1110 bp->console_info.orig_x = 0;
1111 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
1112 0 : bp->console_info.num_rows - 1;
1114 bp->initrd_start = dom0_size -
1115 (PAGE_ALIGN(ia64_boot_param->initrd_size) + 4*1024*1024);
1116 bp->initrd_size = ia64_boot_param->initrd_size;
1118 ci = (dom0_vga_console_info_t *)((unsigned char *)si +
1119 sizeof(start_info_t) +
1120 sizeof(struct ia64_boot_param));
1122 if (fill_console_start_info(ci)) {
1123 si->console.dom0.info_off = sizeof(start_info_t) +
1124 sizeof(struct ia64_boot_param);
1125 si->console.dom0.info_size = sizeof(dom0_vga_console_info_t);
1126 }
1128 vcpu_init_regs (v);
1130 vcpu_regs(v)->r28 = bp_mpa;
1132 vcpu_regs (v)->cr_iip = pkern_entry;
1134 physdev_init_dom0(d);
1136 return 0;
1137 }
1139 void machine_restart(char * __unused)
1140 {
1141 console_start_sync();
1142 if (running_on_sim)
1143 printk ("machine_restart called. spinning...\n");
1144 else
1145 (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
1146 while(1);
1147 }
1149 extern void cpu_halt(void);
1151 void machine_halt(void)
1152 {
1153 console_start_sync();
1154 if (running_on_sim)
1155 printk ("machine_halt called. spinning...\n");
1156 else
1157 cpu_halt();
1158 while(1);
1159 }
1161 void sync_vcpu_execstate(struct vcpu *v)
1162 {
1163 // __ia64_save_fpu(v->arch._thread.fph);
1164 // if (VMX_DOMAIN(v))
1165 // vmx_save_state(v);
1166 // FIXME SMP: Anything else needed here for SMP?
1167 }
1169 static void parse_dom0_mem(char *s)
1170 {
1171 dom0_size = parse_size_and_unit(s, NULL);
1172 }
1173 custom_param("dom0_mem", parse_dom0_mem);
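
For context (not part of the file listing above): dom0_size, dom0_max_vcpus and, when CONFIG_XEN_IA64_PERVCPU_VHPT is enabled, pervcpu_vhpt are all taken from the Xen boot command line through the custom_param()/integer_param() hooks registered in this file. Purely as an illustration, assuming an elilo-style append string (the bootloader syntax is not part of this changeset), a boot line could pass:

    append="... dom0_mem=512M dom0_max_vcpus=2 pervcpu_vhpt=1"

The dom0_mem value goes through parse_size_and_unit(), so suffixed forms such as 512M or 65536K, as suggested by the panic message in alloc_dom0(), are accepted.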