direct-io.hg

view xen/arch/ia64/xen/domain.c @ 11354:586c5fe8cf3e

[IA64] revert xen-ia64-unstable.hg cset 11301:a19dbbe4cff5

revert arch_domain_destroy() for old PV-on-HVM on IPF

Signed-off-by: Tsunehisa Doi <Doi.Tsunehisa@jp.fujitsu.com>
Signed-off-by: Tomonari Horikoshi <t.horikoshi@jp.fujitsu.com>
author: awilliam@xenbuild.aw
date: Tue Aug 29 09:08:29 2006 -0600 (2006-08-29)
parents: e317ad162eba
children: 019b7c756ddb
line source
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 *
12 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
13 * VA Linux Systems Japan K.K.
14 * dom0 vp model support
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/errno.h>
21 #include <xen/sched.h>
22 #include <xen/smp.h>
23 #include <xen/delay.h>
24 #include <xen/softirq.h>
25 #include <xen/mm.h>
26 #include <xen/iocap.h>
27 #include <asm/asm-xsi-offsets.h>
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <asm/processor.h>
31 #include <xen/event.h>
32 #include <xen/console.h>
33 #include <xen/version.h>
34 #include <xen/elf.h>
35 #include <asm/pgalloc.h>
36 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
37 #include <asm/vcpu.h> /* for function declarations */
38 #include <public/xen.h>
39 #include <xen/domain.h>
40 #include <asm/vmx.h>
41 #include <asm/vmx_vcpu.h>
42 #include <asm/vmx_vpd.h>
43 #include <asm/vmx_phy_mode.h>
44 #include <asm/vhpt.h>
45 #include <asm/tlbflush.h>
46 #include <asm/regionreg.h>
47 #include <asm/dom_fw.h>
48 #include <asm/shadow.h>
50 unsigned long dom0_size = 512*1024*1024;
51 unsigned long dom0_align = 64*1024*1024;
53 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
54 static unsigned int dom0_max_vcpus = 1;
55 integer_param("dom0_max_vcpus", dom0_max_vcpus);
57 extern unsigned long running_on_sim;
59 extern char dom0_command_line[];
61 /* FIXME: where should these declarations live? */
62 extern void serial_input_init(void);
63 static void init_switch_stack(struct vcpu *v);
64 extern void vmx_do_launch(struct vcpu *);
66 /* this belongs in include/asm, but there doesn't seem to be a suitable place */
67 extern struct vcpu *ia64_switch_to (struct vcpu *next_task);
69 /* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
70 This is a Xen virtual address. */
71 DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
72 DEFINE_PER_CPU(int *, current_psr_ic_addr);
74 #include <xen/sched-if.h>
76 static void flush_vtlb_for_context_switch(struct vcpu* vcpu)
77 {
78 int cpu = smp_processor_id();
79 int last_vcpu_id = vcpu->domain->arch.last_vcpu[cpu].vcpu_id;
80 int last_processor = vcpu->arch.last_processor;
82 if (is_idle_domain(vcpu->domain))
83 return;
85 vcpu->domain->arch.last_vcpu[cpu].vcpu_id = vcpu->vcpu_id;
86 vcpu->arch.last_processor = cpu;
88 if ((last_vcpu_id != vcpu->vcpu_id &&
89 last_vcpu_id != INVALID_VCPU_ID) ||
90 (last_vcpu_id == vcpu->vcpu_id &&
91 last_processor != cpu &&
92 last_processor != INVALID_PROCESSOR)) {
94 // if the vTLB implementation is changed,
95 // the following must be updated as well.
96 if (VMX_DOMAIN(vcpu)) {
97 // currently the vTLB for a VT-i domain is per vcpu,
98 // so no flushing is needed.
99 } else {
100 vhpt_flush();
101 }
102 local_flush_tlb_all();
103 }
104 }
106 void schedule_tail(struct vcpu *prev)
107 {
108 extern char ia64_ivt;
109 context_saved(prev);
111 if (VMX_DOMAIN(current)) {
112 vmx_do_launch(current);
113 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
114 current->processor);
115 } else {
116 ia64_set_iva(&ia64_ivt);
117 ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
118 VHPT_ENABLED);
119 load_region_regs(current);
120 vcpu_load_kernel_regs(current);
121 __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
122 shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
123 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
124 (current->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
125 migrate_timer(&current->arch.hlt_timer, current->processor);
126 }
127 flush_vtlb_for_context_switch(current);
128 }
130 void context_switch(struct vcpu *prev, struct vcpu *next)
131 {
132 uint64_t spsr;
133 uint64_t pta;
135 local_irq_save(spsr);
137 __ia64_save_fpu(prev->arch._thread.fph);
138 __ia64_load_fpu(next->arch._thread.fph);
139 if (VMX_DOMAIN(prev)) {
140 vmx_save_state(prev);
141 if (!VMX_DOMAIN(next)) {
142 /* VMX domains can change the physical cr.dcr.
143 * Restore default to prevent leakage. */
144 ia64_setreg(_IA64_REG_CR_DCR, (IA64_DCR_DP | IA64_DCR_DK
145 | IA64_DCR_DX | IA64_DCR_DR | IA64_DCR_PP
146 | IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC));
147 }
148 }
149 if (VMX_DOMAIN(next))
150 vmx_load_state(next);
151 /*ia64_psr(ia64_task_regs(next))->dfh = !ia64_is_local_fpu_owner(next);*/
152 prev = ia64_switch_to(next);
154 /* Note: ia64_switch_to does not return here at vcpu initialization. */
156 //cpu_set(smp_processor_id(), current->domain->domain_dirty_cpumask);
158 if (VMX_DOMAIN(current)){
159 vmx_load_all_rr(current);
160 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
161 current->processor);
162 } else {
163 struct domain *nd;
164 extern char ia64_ivt;
166 ia64_set_iva(&ia64_ivt);
168 nd = current->domain;
169 if (!is_idle_domain(nd)) {
170 ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
171 VHPT_ENABLED);
172 load_region_regs(current);
173 vcpu_load_kernel_regs(current);
174 vcpu_set_next_timer(current);
175 if (vcpu_timer_expired(current))
176 vcpu_pend_timer(current);
177 __ia64_per_cpu_var(current_psr_i_addr) = &nd->shared_info->
178 vcpu_info[current->vcpu_id].evtchn_upcall_mask;
179 __ia64_per_cpu_var(current_psr_ic_addr) =
180 (int *)(nd->arch.shared_info_va + XSI_PSR_IC_OFS);
181 } else {
182 /* When switching to the idle domain, we only need to disable the vhpt
183 * walker. All accesses that happen within the idle context will then
184 * be handled by TR mappings and the identity mapping.
185 */
186 pta = ia64_get_pta();
187 ia64_set_pta(pta & ~VHPT_ENABLED);
188 __ia64_per_cpu_var(current_psr_i_addr) = NULL;
189 __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
190 }
191 }
192 flush_vtlb_for_context_switch(current);
193 local_irq_restore(spsr);
194 context_saved(prev);
195 }
197 void continue_running(struct vcpu *same)
198 {
199 /* nothing to do */
200 }
202 static void default_idle(void)
203 {
204 local_irq_disable();
205 if ( !softirq_pending(smp_processor_id()) )
206 safe_halt();
207 local_irq_enable();
208 }
210 static void continue_cpu_idle_loop(void)
211 {
212 for ( ; ; )
213 {
214 #ifdef IA64
215 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
216 #else
217 irq_stat[cpu].idle_timestamp = jiffies;
218 #endif
219 while ( !softirq_pending(smp_processor_id()) )
220 default_idle();
221 raise_softirq(SCHEDULE_SOFTIRQ);
222 do_softirq();
223 }
224 }
226 void startup_cpu_idle_loop(void)
227 {
228 /* Just some sanity to ensure that the scheduler is set up okay. */
229 ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
230 raise_softirq(SCHEDULE_SOFTIRQ);
232 continue_cpu_idle_loop();
233 }
235 /* compile-time test for get_order(sizeof(mapped_regs_t)) !=
236 * get_order_from_shift(XMAPPEDREGS_SHIFT)
237 */
238 #if !(((1 << (XMAPPEDREGS_SHIFT - 1)) < MAPPED_REGS_T_SIZE) && \
239 (MAPPED_REGS_T_SIZE < (1 << (XMAPPEDREGS_SHIFT + 1))))
240 # error "XMAPPEDREGS_SHIFT doesn't match sizeof(mapped_regs_t)."
241 #endif
243 void hlt_timer_fn(void *data)
244 {
245 struct vcpu *v = data;
246 vcpu_unblock(v);
247 }
249 struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id)
250 {
251 struct vcpu *v;
252 struct thread_info *ti;
254 /* Keep idle vcpu0 statically allocated at compile time, because some
255 * code inherited from Linux still requires it in the early boot phase.
256 */
257 if (is_idle_domain(d) && !vcpu_id)
258 v = idle_vcpu[0];
259 else {
260 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
261 return NULL;
262 memset(v, 0, sizeof(*v));
264 ti = alloc_thread_info(v);
265 /* Clear thread_info to clear some important fields, like
266 * preempt_count
267 */
268 memset(ti, 0, sizeof(struct thread_info));
269 init_switch_stack(v);
270 }
272 if (!is_idle_domain(d)) {
273 if (!d->arch.is_vti) {
274 int order;
275 int i;
277 /* Create privregs page only if not VTi. */
278 order = get_order_from_shift(XMAPPEDREGS_SHIFT);
279 v->arch.privregs = alloc_xenheap_pages(order);
280 BUG_ON(v->arch.privregs == NULL);
281 memset(v->arch.privregs, 0, 1 << XMAPPEDREGS_SHIFT);
282 for (i = 0; i < (1 << order); i++)
283 share_xen_page_with_guest(virt_to_page(v->arch.privregs) +
284 i, d, XENSHARE_writable);
285 }
287 v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0;
288 v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4;
289 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0;
290 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4;
292 /* Is this correct?
293 It depends on the domain's rid usage.
295 A domain may share rids among its processors (e.g. when using a
296 global VHPT). In this case, we should also share rids
297 among vcpus and the rid range should be the same.
299 However, a domain may use per-cpu rid allocation. In
300 this case we don't want to share rids among vcpus, but we may
301 do so if two vcpus are on the same cpu... */
303 v->arch.starting_rid = d->arch.starting_rid;
304 v->arch.ending_rid = d->arch.ending_rid;
305 v->arch.breakimm = d->arch.breakimm;
306 v->arch.last_processor = INVALID_PROCESSOR;
307 }
308 if (!VMX_DOMAIN(v)){
309 init_timer(&v->arch.hlt_timer, hlt_timer_fn, v,
310 first_cpu(cpu_online_map));
311 }
313 return v;
314 }
316 void relinquish_vcpu_resources(struct vcpu *v)
317 {
318 if (v->arch.privregs != NULL) {
319 free_xenheap_pages(v->arch.privregs,
320 get_order_from_shift(XMAPPEDREGS_SHIFT));
321 v->arch.privregs = NULL;
322 }
323 kill_timer(&v->arch.hlt_timer);
324 }
326 void free_vcpu_struct(struct vcpu *v)
327 {
328 if (VMX_DOMAIN(v))
329 vmx_relinquish_vcpu_resources(v);
330 else
331 relinquish_vcpu_resources(v);
333 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
334 }
336 static void init_switch_stack(struct vcpu *v)
337 {
338 struct pt_regs *regs = vcpu_regs (v);
339 struct switch_stack *sw = (struct switch_stack *) regs - 1;
340 extern void ia64_ret_from_clone;
342 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
343 sw->ar_bspstore = (unsigned long)v + IA64_RBS_OFFSET;
344 sw->b0 = (unsigned long) &ia64_ret_from_clone;
345 sw->ar_fpsr = FPSR_DEFAULT;
346 v->arch._thread.ksp = (unsigned long) sw - 16;
347 // stay on kernel stack because may get interrupts!
348 // ia64_ret_from_clone switches to user stack
349 v->arch._thread.on_ustack = 0;
350 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
351 }
353 int arch_domain_create(struct domain *d)
354 {
355 int i;
357 // the following will eventually need to be negotiated dynamically
358 d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
359 d->arch.breakimm = 0x1000;
360 for (i = 0; i < NR_CPUS; i++) {
361 d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
362 }
364 if (is_idle_domain(d))
365 return 0;
367 d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
368 if (d->shared_info == NULL)
369 goto fail_nomem;
370 memset(d->shared_info, 0, XSI_SIZE);
371 for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
372 share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
373 d, XENSHARE_writable);
375 d->max_pages = (128UL*1024*1024)/PAGE_SIZE; // 128MB default // FIXME
376 /* We may also need an emulation rid for region4, though it's unlikely
377 * that a guest will issue uncacheable accesses in metaphysical mode.
378 * But keeping such info here may be saner.
379 */
380 if (!allocate_rid_range(d,0))
381 goto fail_nomem;
383 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
385 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
386 goto fail_nomem;
388 d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
389 RANGESETF_prettyprint_hex);
391 printf ("arch_domain_create: domain=%p\n", d);
392 return 0;
394 fail_nomem:
395 if (d->arch.mm.pgd != NULL)
396 pgd_free(d->arch.mm.pgd);
397 if (d->shared_info != NULL)
398 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
399 return -ENOMEM;
400 }
402 void arch_domain_destroy(struct domain *d)
403 {
404 BUG_ON(d->arch.mm.pgd != NULL);
405 if (d->shared_info != NULL)
406 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
407 if (d->arch.shadow_bitmap != NULL)
408 xfree(d->arch.shadow_bitmap);
410 /* Clear vTLB for the next domain. */
411 domain_flush_tlb_vhpt(d);
413 deallocate_rid_range(d);
414 }
416 void arch_getdomaininfo_ctxt(struct vcpu *v, struct vcpu_guest_context *c)
417 {
418 int i;
419 struct vcpu_extra_regs *er = &c->extra_regs;
421 c->user_regs = *vcpu_regs (v);
422 c->privregs_pfn = virt_to_maddr(v->arch.privregs) >> PAGE_SHIFT;
424 /* Fill extra regs. */
425 for (i = 0; i < 8; i++) {
426 er->itrs[i].pte = v->arch.itrs[i].pte.val;
427 er->itrs[i].itir = v->arch.itrs[i].itir;
428 er->itrs[i].vadr = v->arch.itrs[i].vadr;
429 er->itrs[i].rid = v->arch.itrs[i].rid;
430 }
431 for (i = 0; i < 8; i++) {
432 er->dtrs[i].pte = v->arch.dtrs[i].pte.val;
433 er->dtrs[i].itir = v->arch.dtrs[i].itir;
434 er->dtrs[i].vadr = v->arch.dtrs[i].vadr;
435 er->dtrs[i].rid = v->arch.dtrs[i].rid;
436 }
437 er->event_callback_ip = v->arch.event_callback_ip;
438 er->dcr = v->arch.dcr;
439 er->iva = v->arch.iva;
440 }
442 int arch_set_info_guest(struct vcpu *v, struct vcpu_guest_context *c)
443 {
444 struct pt_regs *regs = vcpu_regs (v);
445 struct domain *d = v->domain;
447 *regs = c->user_regs;
449 if (!d->arch.is_vti) {
450 /* domain runs at PL2/3 */
451 regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT;
452 regs->ar_rsc |= (2 << 2); /* force PL2/3 */
453 }
455 if (c->flags & VGCF_EXTRA_REGS) {
456 int i;
457 struct vcpu_extra_regs *er = &c->extra_regs;
459 for (i = 0; i < 8; i++) {
460 vcpu_set_itr(v, i, er->itrs[i].pte,
461 er->itrs[i].itir,
462 er->itrs[i].vadr,
463 er->itrs[i].rid);
464 }
465 for (i = 0; i < 8; i++) {
466 vcpu_set_dtr(v, i,
467 er->dtrs[i].pte,
468 er->dtrs[i].itir,
469 er->dtrs[i].vadr,
470 er->dtrs[i].rid);
471 }
472 v->arch.event_callback_ip = er->event_callback_ip;
473 v->arch.dcr = er->dcr;
474 v->arch.iva = er->iva;
475 }
477 if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) )
478 return 0;
479 if (d->arch.is_vti)
480 vmx_final_setup_guest(v);
482 /* This overrides some registers. */
483 vcpu_init_regs(v);
485 /* Don't redo final setup */
486 set_bit(_VCPUF_initialised, &v->vcpu_flags);
487 return 0;
488 }
490 static void relinquish_memory(struct domain *d, struct list_head *list)
491 {
492 struct list_head *ent;
493 struct page_info *page;
494 #ifndef __ia64__
495 unsigned long x, y;
496 #endif
498 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
499 spin_lock_recursive(&d->page_alloc_lock);
500 ent = list->next;
501 while ( ent != list )
502 {
503 page = list_entry(ent, struct page_info, list);
504 /* Grab a reference to the page so it won't disappear from under us. */
505 if ( unlikely(!get_page(page, d)) )
506 {
507 /* Couldn't get a reference -- someone is freeing this page. */
508 ent = ent->next;
509 continue;
510 }
512 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
513 put_page_and_type(page);
515 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
516 put_page(page);
518 #ifndef __ia64__
519 /*
520 * Forcibly invalidate base page tables at this point to break circular
521 * 'linear page table' references. This is okay because MMU structures
522 * are not shared across domains and this domain is now dead. Thus base
523 * tables are not in use so a non-zero count means circular reference.
524 */
525 y = page->u.inuse.type_info;
526 for ( ; ; )
527 {
528 x = y;
529 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
530 (PGT_base_page_table|PGT_validated)) )
531 break;
533 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
534 if ( likely(y == x) )
535 {
536 free_page_type(page, PGT_base_page_table);
537 break;
538 }
539 }
540 #endif
542 /* Follow the list chain and /then/ potentially free the page. */
543 ent = ent->next;
544 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
545 put_page(page);
546 }
548 spin_unlock_recursive(&d->page_alloc_lock);
549 }
551 void domain_relinquish_resources(struct domain *d)
552 {
553 /* Relinquish every page of memory. */
555 // release pages by traversing d->arch.mm.
556 relinquish_mm(d);
558 if (d->vcpu[0] && VMX_DOMAIN(d->vcpu[0]))
559 vmx_relinquish_guest_resources(d);
561 relinquish_memory(d, &d->xenpage_list);
562 relinquish_memory(d, &d->page_list);
564 if (d->arch.is_vti && d->arch.sal_data)
565 xfree(d->arch.sal_data);
566 }
568 void build_physmap_table(struct domain *d)
569 {
570 struct list_head *list_ent = d->page_list.next;
571 unsigned long mfn, i = 0;
573 while(list_ent != &d->page_list) {
574 mfn = page_to_mfn(list_entry(
575 list_ent, struct page_info, list));
576 assign_domain_page(d, i << PAGE_SHIFT, mfn << PAGE_SHIFT);
578 i++;
579 list_ent = mfn_to_page(mfn)->list.next;
580 }
581 }
583 unsigned long
584 domain_set_shared_info_va (unsigned long va)
585 {
586 struct vcpu *v = current;
587 struct domain *d = v->domain;
588 struct vcpu *v1;
590 /* Check virtual address:
591 must belong to region 7,
592 must be 64Kb aligned,
593 must not be within Xen virtual space. */
594 if ((va >> 61) != 7
595 || (va & 0xffffUL) != 0
596 || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
597 panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);
599 /* Note: this doesn't work well if other cpus are already running.
600 However this is part of the spec :-) */
601 printf ("Domain set shared_info_va to 0x%016lx\n", va);
602 d->arch.shared_info_va = va;
604 for_each_vcpu (d, v1) {
605 VCPU(v1, interrupt_mask_addr) =
606 (unsigned char *)va + INT_ENABLE_OFFSET(v1);
607 }
609 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)(va + XSI_PSR_IC_OFS);
611 /* Remap the shared pages. */
612 set_one_rr (7UL << 61, PSCB(v,rrs[7]));
614 return 0;
615 }
617 /* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
618 #define SHADOW_COPY_CHUNK (1024 / sizeof (unsigned long))
620 int shadow_mode_control(struct domain *d, xen_domctl_shadow_op_t *sc)
621 {
622 unsigned int op = sc->op;
623 int rc = 0;
624 int i;
625 //struct vcpu *v;
627 if (unlikely(d == current->domain)) {
628 DPRINTK("Don't try to do a shadow op on yourself!\n");
629 return -EINVAL;
630 }
632 domain_pause(d);
634 switch (op)
635 {
636 case XEN_DOMCTL_SHADOW_OP_OFF:
637 if (shadow_mode_enabled (d)) {
638 u64 *bm = d->arch.shadow_bitmap;
640 /* Flush vhpt and tlb to restore dirty bit usage. */
641 domain_flush_tlb_vhpt(d);
643 /* Free bitmap. */
644 d->arch.shadow_bitmap_size = 0;
645 d->arch.shadow_bitmap = NULL;
646 xfree(bm);
647 }
648 break;
650 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
651 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
652 rc = -EINVAL;
653 break;
655 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
656 if (shadow_mode_enabled(d)) {
657 rc = -EINVAL;
658 break;
659 }
661 atomic64_set(&d->arch.shadow_fault_count, 0);
662 atomic64_set(&d->arch.shadow_dirty_count, 0);
664 d->arch.shadow_bitmap_size = (d->max_pages + BITS_PER_LONG-1) &
665 ~(BITS_PER_LONG-1);
666 d->arch.shadow_bitmap = xmalloc_array(unsigned long,
667 d->arch.shadow_bitmap_size / BITS_PER_LONG);
668 if (d->arch.shadow_bitmap == NULL) {
669 d->arch.shadow_bitmap_size = 0;
670 rc = -ENOMEM;
671 }
672 else {
673 memset(d->arch.shadow_bitmap, 0,
674 d->arch.shadow_bitmap_size / 8);
676 /* Flush vhpt and tlb to enable dirty bit
677 virtualization. */
678 domain_flush_tlb_vhpt(d);
679 }
680 break;
682 case XEN_DOMCTL_SHADOW_OP_CLEAN:
683 {
684 int nbr_longs;
686 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
687 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
689 atomic64_set(&d->arch.shadow_fault_count, 0);
690 atomic64_set(&d->arch.shadow_dirty_count, 0);
692 if (guest_handle_is_null(sc->dirty_bitmap) ||
693 (d->arch.shadow_bitmap == NULL)) {
694 rc = -EINVAL;
695 break;
696 }
698 if (sc->pages > d->arch.shadow_bitmap_size)
699 sc->pages = d->arch.shadow_bitmap_size;
701 nbr_longs = (sc->pages + BITS_PER_LONG - 1) / BITS_PER_LONG;
703 for (i = 0; i < nbr_longs; i += SHADOW_COPY_CHUNK) {
704 int size = (nbr_longs - i) > SHADOW_COPY_CHUNK ?
705 SHADOW_COPY_CHUNK : nbr_longs - i;
707 if (copy_to_guest_offset(sc->dirty_bitmap, i,
708 d->arch.shadow_bitmap + i,
709 size)) {
710 rc = -EFAULT;
711 break;
712 }
714 memset(d->arch.shadow_bitmap + i,
715 0, size * sizeof(unsigned long));
716 }
718 break;
719 }
721 case XEN_DOMCTL_SHADOW_OP_PEEK:
722 {
723 unsigned long size;
725 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
726 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
728 if (guest_handle_is_null(sc->dirty_bitmap) ||
729 (d->arch.shadow_bitmap == NULL)) {
730 rc = -EINVAL;
731 break;
732 }
734 if (sc->pages > d->arch.shadow_bitmap_size)
735 sc->pages = d->arch.shadow_bitmap_size;
737 size = (sc->pages + BITS_PER_LONG - 1) / BITS_PER_LONG;
738 if (copy_to_guest(sc->dirty_bitmap,
739 d->arch.shadow_bitmap, size)) {
740 rc = -EFAULT;
741 break;
742 }
743 break;
744 }
745 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
746 sc->mb = 0;
747 break;
748 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
749 if (sc->mb > 0) {
750 BUG();
751 rc = -ENOMEM;
752 }
753 break;
754 default:
755 rc = -EINVAL;
756 break;
757 }
759 domain_unpause(d);
761 return rc;
762 }
764 // remove following line if not privifying in memory
765 //#define HAVE_PRIVIFY_MEMORY
766 #ifndef HAVE_PRIVIFY_MEMORY
767 #define privify_memory(x,y) do {} while(0)
768 #endif
770 // see arch/x86/xxx/domain_build.c
771 int elf_sanity_check(Elf_Ehdr *ehdr)
772 {
773 if (!(IS_ELF(*ehdr)))
774 {
775 printk("DOM0 image is not a Xen-compatible Elf image.\n");
776 return 0;
777 }
778 return 1;
779 }
781 static void loaddomainelfimage(struct domain *d, unsigned long image_start)
782 {
783 char *elfbase = (char *) image_start;
784 Elf_Ehdr ehdr;
785 Elf_Phdr phdr;
786 int h, filesz, memsz;
787 unsigned long elfaddr, dom_mpaddr, dom_imva;
788 struct page_info *p;
790 memcpy(&ehdr, (void *) image_start, sizeof(Elf_Ehdr));
791 for ( h = 0; h < ehdr.e_phnum; h++ ) {
792 memcpy(&phdr,
793 elfbase + ehdr.e_phoff + (h*ehdr.e_phentsize),
794 sizeof(Elf_Phdr));
795 if ((phdr.p_type != PT_LOAD))
796 continue;
798 filesz = phdr.p_filesz;
799 memsz = phdr.p_memsz;
800 elfaddr = (unsigned long) elfbase + phdr.p_offset;
801 dom_mpaddr = phdr.p_paddr;
803 while (memsz > 0) {
804 p = assign_new_domain_page(d,dom_mpaddr);
805 BUG_ON (unlikely(p == NULL));
806 dom_imva = __va_ul(page_to_maddr(p));
807 if (filesz > 0) {
808 if (filesz >= PAGE_SIZE)
809 memcpy((void *) dom_imva,
810 (void *) elfaddr,
811 PAGE_SIZE);
812 else {
813 // copy partial page
814 memcpy((void *) dom_imva,
815 (void *) elfaddr, filesz);
816 // zero the rest of page
817 memset((void *) dom_imva+filesz, 0,
818 PAGE_SIZE-filesz);
819 }
820 //FIXME: This test for code seems to find a lot more than objdump -x does
821 if (phdr.p_flags & PF_X) {
822 privify_memory(dom_imva,PAGE_SIZE);
823 flush_icache_range(dom_imva,
824 dom_imva+PAGE_SIZE);
825 }
826 }
827 else if (memsz > 0) {
828 /* always zero out entire page */
829 memset((void *) dom_imva, 0, PAGE_SIZE);
830 }
831 memsz -= PAGE_SIZE;
832 filesz -= PAGE_SIZE;
833 elfaddr += PAGE_SIZE;
834 dom_mpaddr += PAGE_SIZE;
835 }
836 }
837 }
839 void alloc_dom0(void)
840 {
841 /* Check dom0 size. */
842 if (dom0_size < 4 * 1024 * 1024) {
843 panic("dom0_mem is too small, boot aborted"
844 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
845 }
847 /* Check dom0 align. */
848 if ((dom0_align - 1) & dom0_align) { /* not a power of two */
849 panic("dom0_align (%lx) must be power of two, boot aborted"
850 " (try e.g. dom0_align=256M or dom0_align=65536K)\n",
851 dom0_align);
852 }
853 if (dom0_align < PAGE_SIZE) {
854 panic("dom0_align must be >= %ld, boot aborted"
855 " (try e.g. dom0_align=256M or dom0_align=65536K)\n",
856 PAGE_SIZE);
857 }
858 if (dom0_size % dom0_align) {
859 dom0_size = (dom0_size / dom0_align + 1) * dom0_align;
860 printf("dom0_size rounded up to %ld, due to dom0_align=%lx\n",
861 dom0_size,dom0_align);
862 }
864 if (running_on_sim) {
865 dom0_size = 128*1024*1024; //FIXME: Should be configurable
866 }
868 /* no need to allocate pages for now
869 * pages are allocated by map_new_domain_page() via loaddomainelfimage()
870 */
871 }
874 /*
875 * Domain 0 has unrestricted direct access to all devices. However,
876 * the major point of this stub is to allow alloc_dom_mem to handle
877 * order > 0 requests. Dom0 requires that bit set to
878 * allocate memory for other domains.
879 */
880 static void physdev_init_dom0(struct domain *d)
881 {
882 if (iomem_permit_access(d, 0UL, ~0UL))
883 BUG();
884 if (irqs_permit_access(d, 0, NR_IRQS-1))
885 BUG();
886 if (ioports_permit_access(d, 0, 0xffff))
887 BUG();
888 }
890 int construct_dom0(struct domain *d,
891 unsigned long image_start, unsigned long image_len,
892 unsigned long initrd_start, unsigned long initrd_len,
893 char *cmdline)
894 {
895 int i, rc;
896 start_info_t *si;
897 dom0_vga_console_info_t *ci;
898 struct vcpu *v = d->vcpu[0];
899 unsigned long max_pages;
901 struct domain_setup_info dsi;
902 unsigned long p_start;
903 unsigned long pkern_start;
904 unsigned long pkern_entry;
905 unsigned long pkern_end;
906 unsigned long pinitrd_start = 0;
907 unsigned long pstart_info;
908 struct page_info *start_info_page;
909 unsigned long bp_mpa;
910 struct ia64_boot_param *bp;
912 #ifdef VALIDATE_VT
913 unsigned int vmx_dom0 = 0;
914 unsigned long mfn;
915 struct page_info *page = NULL;
916 #endif
918 //printf("construct_dom0: starting\n");
920 /* Sanity! */
921 BUG_ON(d != dom0);
922 BUG_ON(d->vcpu[0] == NULL);
923 BUG_ON(test_bit(_VCPUF_initialised, &v->vcpu_flags));
925 memset(&dsi, 0, sizeof(struct domain_setup_info));
927 printk("*** LOADING DOMAIN 0 ***\n");
929 max_pages = dom0_size / PAGE_SIZE;
930 d->max_pages = max_pages;
931 d->tot_pages = 0;
932 dsi.image_addr = (unsigned long)image_start;
933 dsi.image_len = image_len;
934 rc = parseelfimage(&dsi);
935 if ( rc != 0 )
936 return rc;
938 #ifdef VALIDATE_VT
939 /* Temp workaround */
940 if (running_on_sim)
941 dsi.xen_section_string = (char *)1;
943 /* Check whether dom0 is vti domain */
944 if ((!vmx_enabled) && !dsi.xen_section_string) {
945 printk("Lack of hardware support for unmodified vmx dom0\n");
946 panic("");
947 }
949 if (vmx_enabled && !dsi.xen_section_string) {
950 printk("Dom0 is vmx domain!\n");
951 vmx_dom0 = 1;
952 }
953 #endif
955 p_start = dsi.v_start;
956 pkern_start = dsi.v_kernstart;
957 pkern_end = dsi.v_kernend;
958 pkern_entry = dsi.v_kernentry;
960 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
962 if ( (p_start & (PAGE_SIZE-1)) != 0 )
963 {
964 printk("Initial guest OS must load to a page boundary.\n");
965 return -EINVAL;
966 }
968 pstart_info = PAGE_ALIGN(pkern_end);
969 if(initrd_start && initrd_len){
970 unsigned long offset;
972 pinitrd_start= dom0_size - (PAGE_ALIGN(initrd_len) + 4*1024*1024);
973 if (pinitrd_start <= pstart_info)
974 panic("%s:enough memory is not assigned to dom0", __func__);
976 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
977 struct page_info *p;
978 p = assign_new_domain_page(d, pinitrd_start + offset);
979 if (p == NULL)
980 panic("%s: can't allocate page for initrd image", __func__);
981 if (initrd_len < offset + PAGE_SIZE)
982 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
983 initrd_len - offset);
984 else
985 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
986 }
987 }
989 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
990 " Kernel image: %lx->%lx\n"
991 " Entry address: %lx\n"
992 " Init. ramdisk: %lx len %lx\n"
993 " Start info.: %lx->%lx\n",
994 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
995 pstart_info, pstart_info + PAGE_SIZE);
997 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
998 {
999 printk("Initial guest OS requires too much space\n"
1000 "(%luMB is greater than %luMB limit)\n",
1001 (pkern_end-pkern_start)>>20,
1002 (max_pages <<PAGE_SHIFT)>>20);
1003 return -ENOMEM;
1004 }
1006 // if high 3 bits of pkern start are non-zero, error
1008 // if pkern end is after end of metaphysical memory, error
1009 // (we should be able to deal with this... later)
1011 /* Mask all upcalls... */
1012 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
1013 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
1015 if (dom0_max_vcpus == 0)
1016 dom0_max_vcpus = MAX_VIRT_CPUS;
1017 if (dom0_max_vcpus > num_online_cpus())
1018 dom0_max_vcpus = num_online_cpus();
1019 if (dom0_max_vcpus > MAX_VIRT_CPUS)
1020 dom0_max_vcpus = MAX_VIRT_CPUS;
1022 printf ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
1023 for ( i = 1; i < dom0_max_vcpus; i++ )
1024 if (alloc_vcpu(d, i, i) == NULL)
1025 printf ("Cannot allocate dom0 vcpu %d\n", i);
1027 /* Copy the OS image. */
1028 loaddomainelfimage(d,image_start);
1030 /* Copy the initial ramdisk. */
1031 //if ( initrd_len != 0 )
1032 // memcpy((void *)vinitrd_start, initrd_start, initrd_len);
1034 BUILD_BUG_ON(sizeof(start_info_t) + sizeof(dom0_vga_console_info_t) +
1035 sizeof(struct ia64_boot_param) > PAGE_SIZE);
1037 /* Set up start info area. */
1038 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
1039 start_info_page = assign_new_domain_page(d, pstart_info);
1040 if (start_info_page == NULL)
1041 panic("can't allocate start info page");
1042 si = page_to_virt(start_info_page);
1043 memset(si, 0, PAGE_SIZE);
1044 sprintf(si->magic, "xen-%i.%i-ia64",
1045 xen_major_version(), xen_minor_version());
1046 si->nr_pages = max_pages;
1047 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
1049 printk("Dom0: 0x%lx\n", (u64)dom0);
1051 #ifdef VALIDATE_VT
1052 /* VMX specific construction for Dom0, if hardware supports VMX
1053 * and Dom0 is unmodified image
1054 */
1055 if (vmx_dom0)
1056 vmx_final_setup_guest(v);
1057 #endif
1059 set_bit(_VCPUF_initialised, &v->vcpu_flags);
1061 /* Build firmware.
1062 Note: the Linux kernel reserves the memory used by start_info, so
1063 there is no need to remove it from the MDT. */
1064 bp_mpa = pstart_info + sizeof(struct start_info);
1065 dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
1067 /* Fill boot param. */
1068 strncpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
1069 si->cmd_line[sizeof(si->cmd_line)-1] = 0;
1071 bp = (struct ia64_boot_param *)((unsigned char *)si +
1072 sizeof(start_info_t));
1073 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
1075 /* We assume console has reached the last line! */
1076 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
1077 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
1078 bp->console_info.orig_x = 0;
1079 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
1080 0 : bp->console_info.num_rows - 1;
1082 bp->initrd_start = dom0_size -
1083 (PAGE_ALIGN(ia64_boot_param->initrd_size) + 4*1024*1024);
1084 bp->initrd_size = ia64_boot_param->initrd_size;
1086 ci = (dom0_vga_console_info_t *)((unsigned char *)si +
1087 sizeof(start_info_t) +
1088 sizeof(struct ia64_boot_param));
1090 if (fill_console_start_info(ci)) {
1091 si->console.dom0.info_off = sizeof(start_info_t) +
1092 sizeof(struct ia64_boot_param);
1093 si->console.dom0.info_size = sizeof(dom0_vga_console_info_t);
1094 }
1096 vcpu_init_regs (v);
1098 vcpu_regs(v)->r28 = bp_mpa;
1100 vcpu_regs (v)->cr_iip = pkern_entry;
1102 physdev_init_dom0(d);
1104 // FIXME: Hack for keyboard input
1105 //serial_input_init();
1107 return 0;
1108 }
1110 void machine_restart(char * __unused)
1111 {
1112 console_start_sync();
1113 if (running_on_sim)
1114 printf ("machine_restart called. spinning...\n");
1115 else
1116 (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
1117 while(1);
1118 }
1120 extern void cpu_halt(void);
1122 void machine_halt(void)
1123 {
1124 console_start_sync();
1125 if (running_on_sim)
1126 printf ("machine_halt called. spinning...\n");
1127 else
1128 cpu_halt();
1129 while(1);
1130 }
1132 void sync_vcpu_execstate(struct vcpu *v)
1133 {
1134 // __ia64_save_fpu(v->arch._thread.fph);
1135 // if (VMX_DOMAIN(v))
1136 // vmx_save_state(v);
1137 // FIXME SMP: Anything else needed here for SMP?
1138 }
1140 static void parse_dom0_mem(char *s)
1141 {
1142 dom0_size = parse_size_and_unit(s);
1143 }
1144 custom_param("dom0_mem", parse_dom0_mem);
1147 static void parse_dom0_align(char *s)
1148 {
1149 dom0_align = parse_size_and_unit(s);
1150 }
1151 custom_param("dom0_align", parse_dom0_align);