ia64/xen-unstable

xen/arch/ia64/xen/domain.c @ 16785:af3550f53874

[IA64] domheap: Don't pin xenheap down. Now it's unnecessary.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
author Alex Williamson <alex.williamson@hp.com>
date Thu Jan 17 12:05:43 2008 -0700 (2008-01-17)
parents 6a7fa7dbde56
children bba0419a05f1
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 *
12 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
13 * VA Linux Systems Japan K.K.
14 * dom0 vp model support
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/errno.h>
21 #include <xen/sched.h>
22 #include <xen/smp.h>
23 #include <xen/delay.h>
24 #include <xen/softirq.h>
25 #include <xen/mm.h>
26 #include <xen/iocap.h>
27 #include <asm/asm-xsi-offsets.h>
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <asm/processor.h>
31 #include <xen/event.h>
32 #include <xen/console.h>
33 #include <xen/version.h>
34 #include <public/libelf.h>
35 #include <asm/pgalloc.h>
36 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
37 #include <asm/vcpu.h> /* for function declarations */
38 #include <public/xen.h>
39 #include <xen/domain.h>
40 #include <asm/vmx.h>
41 #include <asm/vmx_vcpu.h>
42 #include <asm/vmx_vpd.h>
43 #include <asm/vmx_phy_mode.h>
44 #include <asm/vmx_vcpu_save.h>
45 #include <asm/vhpt.h>
46 #include <asm/vcpu.h>
47 #include <asm/tlbflush.h>
48 #include <asm/regionreg.h>
49 #include <asm/dom_fw.h>
50 #include <asm/shadow.h>
51 #include <xen/guest_access.h>
52 #include <asm/tlb_track.h>
53 #include <asm/perfmon.h>
54 #include <asm/sal.h>
55 #include <public/vcpu.h>
56 #include <linux/cpu.h>
57 #include <linux/notifier.h>
58 #include <asm/debugger.h>
60 /* dom0_size: default memory allocation for dom0 (~4GB) */
61 static unsigned long __initdata dom0_size = 4096UL*1024UL*1024UL;
63 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
64 static unsigned int __initdata dom0_max_vcpus = 4;
65 integer_param("dom0_max_vcpus", dom0_max_vcpus);
67 extern char dom0_command_line[];
69 /* forward declaration */
70 static void init_switch_stack(struct vcpu *v);
72 /* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
73 This is a Xen virtual address. */
74 DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
75 DEFINE_PER_CPU(int *, current_psr_ic_addr);
77 DEFINE_PER_CPU(struct vcpu *, fp_owner);
79 #include <xen/sched-if.h>
81 static void
82 ia64_disable_vhpt_walker(void)
83 {
84 // disable the VHPT. Without this, ia64_new_rr7() might cause a VHPT
85 // fault because it flushes dtr[IA64_TR_VHPT].
86 // (VHPT_SIZE_LOG2 << 2) is just to avoid a
87 // Reserved Register/Field fault.
88 ia64_set_pta(VHPT_SIZE_LOG2 << 2);
89 }
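/*
 * Illustrative sketch (not built, not part of the original file): how a
 * cr.pta value is composed.  The write above deliberately leaves pta.ve
 * (bit 0) clear, so the walker is off, while still putting a legal value
 * into the size field (bits 7:2) so the move-to-PTA itself cannot raise a
 * Reserved Register/Field fault.  Field positions follow the Itanium
 * architecture; make_pta() is a hypothetical helper for illustration only.
 */
#if 0
static unsigned long make_pta(unsigned long vhpt_base,
                              unsigned long size_log2, int enable)
{
	unsigned long pta = 0;

	pta |= (unsigned long)(enable != 0);      /* bit 0:     pta.ve          */
	pta |= (size_log2 & 0x3fUL) << 2;         /* bits 7:2:  pta.size        */
	                                          /* bit 8:     pta.vf, left 0  */
	pta |= vhpt_base & ~((1UL << 15) - 1);    /* bits 63:15: pta.base       */
	return pta;
}
/* The disabled-walker value used above is simply make_pta(0, VHPT_SIZE_LOG2, 0). */
#endif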
91 static void flush_vtlb_for_context_switch(struct vcpu* prev, struct vcpu* next)
92 {
93 int cpu = smp_processor_id();
94 int last_vcpu_id, last_processor;
96 if (!is_idle_domain(prev->domain))
97 tlbflush_update_time
98 (&prev->domain->arch.last_vcpu[cpu].tlbflush_timestamp,
99 tlbflush_current_time());
101 if (is_idle_domain(next->domain))
102 return;
104 last_vcpu_id = next->domain->arch.last_vcpu[cpu].vcpu_id;
105 last_processor = next->arch.last_processor;
107 next->domain->arch.last_vcpu[cpu].vcpu_id = next->vcpu_id;
108 next->arch.last_processor = cpu;
110 if ((last_vcpu_id != next->vcpu_id &&
111 last_vcpu_id != INVALID_VCPU_ID) ||
112 (last_vcpu_id == next->vcpu_id &&
113 last_processor != cpu &&
114 last_processor != INVALID_PROCESSOR)) {
115 #ifdef CONFIG_XEN_IA64_TLBFLUSH_CLOCK
116 u32 last_tlbflush_timestamp =
117 next->domain->arch.last_vcpu[cpu].tlbflush_timestamp;
118 #endif
119 int vhpt_is_flushed = 0;
121 // if the vTLB implementation is changed,
122 // the following must be updated as well.
123 if (VMX_DOMAIN(next)) {
124 // currently the vTLB for a VT-i domain is per vcpu,
125 // so no flushing is needed.
126 } else if (HAS_PERVCPU_VHPT(next->domain)) {
127 // nothing to do
128 } else {
129 if (NEED_FLUSH(__get_cpu_var(vhpt_tlbflush_timestamp),
130 last_tlbflush_timestamp)) {
131 local_vhpt_flush();
132 vhpt_is_flushed = 1;
133 }
134 }
135 if (vhpt_is_flushed || NEED_FLUSH(__get_cpu_var(tlbflush_time),
136 last_tlbflush_timestamp)) {
137 local_flush_tlb_all();
138 perfc_incr(tlbflush_clock_cswitch_purge);
139 } else {
140 perfc_incr(tlbflush_clock_cswitch_skip);
141 }
142 perfc_incr(flush_vtlb_for_context_switch);
143 }
144 }
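/*
 * A deliberately simplified model (not built) of the "tlbflush clock"
 * trick used above: a global clock advances every time some cpu flushes
 * its TLB/VHPT, each cpu remembers the clock value of its last flush, and
 * each domain/vcpu remembers when its mappings were last live on a cpu.
 * A flush is needed only if the last use is not older than the cpu's last
 * flush.  The real NEED_FLUSH()/tlbflush_current_time() in Xen also cope
 * with clock wrap-around, which is omitted here.
 */
#if 0
static uint32_t last_flush_time[NR_CPUS];   /* clock value at cpu's last flush */

static int sketch_need_flush(unsigned int cpu, uint32_t lastuse_stamp)
{
	/* Stale entries can only exist if the mappings were live on this
	 * cpu at, or after, the moment it last flushed. */
	return lastuse_stamp >= last_flush_time[cpu];
}
#endif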
146 static void flush_cache_for_context_switch(struct vcpu *next)
147 {
148 extern cpumask_t cpu_cache_coherent_map;
149 int cpu = smp_processor_id();
151 if (is_idle_vcpu(next) ||
152 __test_and_clear_bit(cpu, &next->arch.cache_coherent_map)) {
153 if (cpu_test_and_clear(cpu, cpu_cache_coherent_map)) {
154 unsigned long flags;
155 u64 progress = 0;
156 s64 status;
158 local_irq_save(flags);
159 status = ia64_pal_cache_flush(4, 0, &progress, NULL);
160 local_irq_restore(flags);
161 if (status != 0)
162 panic_domain(NULL, "PAL_CACHE_FLUSH ERROR, "
163 "cache_type=4 status %lx", status);
164 }
165 }
166 }
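/*
 * Sketch (not built) of the other half of the protocol above: a cpu only
 * issues the expensive PAL_CACHE_FLUSH the first time it switches to a
 * vcpu after somebody asked for cache coherence.  The hypothetical helper
 * below shows how a requester would arm that, by setting the vcpu's
 * per-cpu bit and the global cpu_cache_coherent_map bit for every online
 * cpu; the real requesting sites live elsewhere in the tree.
 */
#if 0
static void request_cache_flush_for(struct vcpu *v)
{
	extern cpumask_t cpu_cache_coherent_map;
	unsigned int cpu;

	for_each_online_cpu ( cpu ) {
		set_bit(cpu, &v->arch.cache_coherent_map);
		cpu_set(cpu, cpu_cache_coherent_map);
	}
}
#endif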
168 static void lazy_fp_switch(struct vcpu *prev, struct vcpu *next)
169 {
170 /*
171 * Implement eager save, lazy restore
172 */
173 if (!is_idle_vcpu(prev)) {
174 if (VMX_DOMAIN(prev)) {
175 if (FP_PSR(prev) & IA64_PSR_MFH) {
176 __ia64_save_fpu(prev->arch._thread.fph);
177 __ia64_per_cpu_var(fp_owner) = prev;
178 }
179 } else {
180 if (PSCB(prev, hpsr_mfh)) {
181 __ia64_save_fpu(prev->arch._thread.fph);
182 __ia64_per_cpu_var(fp_owner) = prev;
183 }
184 }
185 }
187 if (!is_idle_vcpu(next)) {
188 if (VMX_DOMAIN(next)) {
189 FP_PSR(next) = IA64_PSR_DFH;
190 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
191 } else {
192 PSCB(next, hpsr_dfh) = 1;
193 PSCB(next, hpsr_mfh) = 0;
194 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
195 }
196 }
197 }
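/*
 * Very rough sketch (not built) of the "lazy restore" half of the policy
 * above.  lazy_fp_switch() eagerly saves f32-f127 of the outgoing vcpu if
 * it modified them (psr.mfh) and sets psr.dfh for the incoming one, so the
 * first touch of the high FP partition traps; only then are the registers
 * reloaded.  The real handlers live in the Disabled FP-Register fault
 * paths and differ between PV and VT-i domains; this only shows the idea.
 */
#if 0
static void lazy_fph_restore(struct vcpu *v)
{
	if (__ia64_per_cpu_var(fp_owner) != v) {
		/* Someone else's f32-f127 are live: load ours from memory. */
		__ia64_load_fpu(v->arch._thread.fph);
		__ia64_per_cpu_var(fp_owner) = v;
	}
	/* Re-enable the high partition for the guest. */
	vcpu_regs(v)->cr_ipsr &= ~IA64_PSR_DFH;
}
#endif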
199 void schedule_tail(struct vcpu *prev)
200 {
201 extern char ia64_ivt;
203 context_saved(prev);
204 ia64_disable_vhpt_walker();
206 if (VMX_DOMAIN(current))
207 vmx_do_resume(current);
208 else {
209 if (VMX_DOMAIN(prev))
210 ia64_set_iva(&ia64_ivt);
211 load_region_regs(current);
212 ia64_set_pta(vcpu_pta(current));
213 vcpu_load_kernel_regs(current);
214 __ia64_per_cpu_var(current_psr_i_addr) =
215 (uint8_t*)(current->domain->arch.shared_info_va +
216 INT_ENABLE_OFFSET(current));
217 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
218 (current->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
219 migrate_timer(&current->arch.hlt_timer, current->processor);
220 }
221 flush_vtlb_for_context_switch(prev, current);
222 }
224 void context_switch(struct vcpu *prev, struct vcpu *next)
225 {
226 uint64_t spsr;
228 local_irq_save(spsr);
230 if (VMX_DOMAIN(prev)) {
231 vmx_save_state(prev);
232 if (!VMX_DOMAIN(next)) {
233 /* VMX domains can change the physical cr.dcr.
234 * Restore default to prevent leakage. */
235 uint64_t dcr = ia64_getreg(_IA64_REG_CR_DCR);
236 /* xenoprof:
237 * don't change psr.pp.
238 * It is manipulated by xenoprof.
239 */
240 dcr = (IA64_DEFAULT_DCR_BITS & ~IA64_DCR_PP) | (dcr & IA64_DCR_PP);
241 ia64_setreg(_IA64_REG_CR_DCR, dcr);
242 }
243 }
245 ia64_disable_vhpt_walker();
246 lazy_fp_switch(prev, current);
248 if (prev->arch.dbg_used || next->arch.dbg_used) {
249 /*
250 * Load debug registers either because they are valid or to clear
251 * the previous one.
252 */
253 ia64_load_debug_regs(next->arch.dbr);
254 }
256 prev = ia64_switch_to(next);
258 /* Note: ia64_switch_to does not return here at vcpu initialization. */
260 if (VMX_DOMAIN(current)) {
261 vmx_load_all_rr(current);
262 vmx_load_state(current);
263 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
264 current->processor);
265 } else {
266 struct domain *nd;
267 extern char ia64_ivt;
269 if (VMX_DOMAIN(prev))
270 ia64_set_iva(&ia64_ivt);
272 nd = current->domain;
273 if (!is_idle_domain(nd)) {
274 load_region_regs(current);
275 ia64_set_pta(vcpu_pta(current));
276 vcpu_load_kernel_regs(current);
277 if (vcpu_pkr_in_use(current))
278 vcpu_pkr_load_regs(current);
279 vcpu_set_next_timer(current);
280 if (vcpu_timer_expired(current))
281 vcpu_pend_timer(current);
282 __ia64_per_cpu_var(current_psr_i_addr) =
283 (uint8_t*)(nd->arch.shared_info_va +
284 INT_ENABLE_OFFSET(current));
285 __ia64_per_cpu_var(current_psr_ic_addr) =
286 (int *)(nd->arch.shared_info_va + XSI_PSR_IC_OFS);
287 /* steal time accounting */
288 if (!guest_handle_is_null(runstate_guest(current)))
289 __copy_to_guest(runstate_guest(current), &current->runstate, 1);
290 } else {
291 /* When switching to the idle domain, we only need to disable the vhpt
292 * walker. All accesses that happen within the idle context will then
293 * be handled by TR mappings and the identity mapping.
294 */
295 __ia64_per_cpu_var(current_psr_i_addr) = NULL;
296 __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
297 }
298 }
299 local_irq_restore(spsr);
301 /* lazy fp */
302 if (current->processor != current->arch.last_processor) {
303 unsigned long *addr;
304 addr = (unsigned long *)per_cpu_addr(fp_owner,
305 current->arch.last_processor);
306 ia64_cmpxchg(acq, addr, current, 0, 8);
307 }
309 flush_vtlb_for_context_switch(prev, current);
310 flush_cache_for_context_switch(current);
311 context_saved(prev);
312 }
314 void continue_running(struct vcpu *same)
315 {
316 /* nothing to do */
317 }
319 #ifdef CONFIG_PERFMON
320 static int pal_halt = 1;
321 static int can_do_pal_halt = 1;
323 static int __init nohalt_setup(char * str)
324 {
325 pal_halt = can_do_pal_halt = 0;
326 return 1;
327 }
328 __setup("nohalt", nohalt_setup);
330 void
331 update_pal_halt_status(int status)
332 {
333 can_do_pal_halt = pal_halt && status;
334 }
335 #else
336 #define can_do_pal_halt (1)
337 #endif
339 static void default_idle(void)
340 {
341 local_irq_disable();
342 if ( !softirq_pending(smp_processor_id()) ) {
343 if (can_do_pal_halt)
344 safe_halt();
345 else
346 cpu_relax();
347 }
348 local_irq_enable();
349 }
351 extern void play_dead(void);
353 static void continue_cpu_idle_loop(void)
354 {
355 int cpu = smp_processor_id();
357 for ( ; ; )
358 {
359 #ifdef IA64
360 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
361 #else
362 irq_stat[cpu].idle_timestamp = jiffies;
363 #endif
364 page_scrub_schedule_work();
365 while ( !softirq_pending(cpu) )
366 default_idle();
367 raise_softirq(SCHEDULE_SOFTIRQ);
368 do_softirq();
369 if (!cpu_online(cpu))
370 play_dead();
371 }
372 }
374 void startup_cpu_idle_loop(void)
375 {
376 /* Just some sanity to ensure that the scheduler is set up okay. */
377 ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
378 raise_softirq(SCHEDULE_SOFTIRQ);
380 continue_cpu_idle_loop();
381 }
383 /* compile time test for get_order(sizeof(mapped_regs_t)) !=
384 * get_order_from_shift(XMAPPEDREGS_SHIFT)
385 */
386 #if !(((1 << (XMAPPEDREGS_SHIFT - 1)) < MAPPED_REGS_T_SIZE) && \
387 (MAPPED_REGS_T_SIZE < (1 << (XMAPPEDREGS_SHIFT + 1))))
388 # error "XMAPPEDREGS_SHIFT doesn't match sizeof(mapped_regs_t)."
389 #endif
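/*
 * The preprocessor check above has to use the MAPPED_REGS_T_SIZE constant
 * from asm-offsets because sizeof() cannot appear in an #if expression.
 * The same invariant could be written with the BUILD_BUG_ON() already used
 * elsewhere in this file, roughly as below (sketch, not built).
 */
#if 0
static inline void xmappedregs_shift_check(void)
{
	BUILD_BUG_ON(!(((1UL << (XMAPPEDREGS_SHIFT - 1)) < sizeof(mapped_regs_t)) &&
	               (sizeof(mapped_regs_t) < (1UL << (XMAPPEDREGS_SHIFT + 1)))));
}
#endif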
391 void hlt_timer_fn(void *data)
392 {
393 struct vcpu *v = data;
394 vcpu_unblock(v);
395 }
397 void relinquish_vcpu_resources(struct vcpu *v)
398 {
399 if (HAS_PERVCPU_VHPT(v->domain))
400 pervcpu_vhpt_free(v);
401 if (v->arch.privregs != NULL) {
402 free_domheap_pages(virt_to_page(v->arch.privregs),
403 get_order_from_shift(XMAPPEDREGS_SHIFT));
404 v->arch.privregs = NULL;
405 }
406 kill_timer(&v->arch.hlt_timer);
407 }
409 struct vcpu *alloc_vcpu_struct(void)
410 {
411 struct page_info *page;
412 struct vcpu *v;
413 struct thread_info *ti;
414 static int first_allocation = 1;
416 if (first_allocation) {
417 first_allocation = 0;
418 /* Keep idle vcpu0 statically allocated at compile time, because
419 * some code inherited from Linux still requires it in the early phase.
420 */
421 return idle_vcpu[0];
422 }
424 page = alloc_domheap_pages(NULL, KERNEL_STACK_SIZE_ORDER, 0);
425 if (page == NULL)
426 return NULL;
427 v = page_to_virt(page);
428 memset(v, 0, sizeof(*v));
430 ti = alloc_thread_info(v);
431 /* Clear thread_info to clear some important fields, like
432 * preempt_count
433 */
434 memset(ti, 0, sizeof(struct thread_info));
435 init_switch_stack(v);
437 return v;
438 }
440 void free_vcpu_struct(struct vcpu *v)
441 {
442 free_domheap_pages(virt_to_page(v), KERNEL_STACK_SIZE_ORDER);
443 }
445 int vcpu_initialise(struct vcpu *v)
446 {
447 struct domain *d = v->domain;
449 if (!is_idle_domain(d)) {
450 v->arch.metaphysical_rid_dt = d->arch.metaphysical_rid_dt;
451 v->arch.metaphysical_rid_d = d->arch.metaphysical_rid_d;
452 /* Set default values to saved_rr. */
453 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rid_dt;
454 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rid_dt;
456 /* Is it correct ?
457 It depends on the domain rid usage.
459 A domain may share rids among its processors (e.g. when having a
460 global VHPT). In this case, we should also share rids
461 among vcpus and the rid range should be the same.
463 However a domain may have per cpu rid allocation. In
464 this case we don't want to share rid among vcpus, but we may
465 do it if two vcpus are on the same cpu... */
467 v->arch.starting_rid = d->arch.starting_rid;
468 v->arch.ending_rid = d->arch.ending_rid;
469 v->arch.rid_bits = d->arch.rid_bits;
470 v->arch.breakimm = d->arch.breakimm;
471 v->arch.last_processor = INVALID_PROCESSOR;
472 v->arch.vhpt_pg_shift = PAGE_SHIFT;
473 }
475 if (!VMX_DOMAIN(v))
476 init_timer(&v->arch.hlt_timer, hlt_timer_fn, v,
477 first_cpu(cpu_online_map));
479 return 0;
480 }
482 static void vcpu_share_privregs_with_guest(struct vcpu *v)
483 {
484 struct domain *d = v->domain;
485 int i, order = get_order_from_shift(XMAPPEDREGS_SHIFT);
487 for (i = 0; i < (1 << order); i++)
488 share_xen_page_with_guest(virt_to_page(v->arch.privregs) + i,
489 d, XENSHARE_writable);
490 /*
491 * XXX IA64_XMAPPEDREGS_PADDR
492 * assign these pages into guest pseudo physical address
493 * space for dom0 to map this page by gmfn.
494 * this is necessary for domain save, restore and dump-core.
495 */
496 for (i = 0; i < XMAPPEDREGS_SIZE; i += PAGE_SIZE)
497 assign_domain_page(d, IA64_XMAPPEDREGS_PADDR(v->vcpu_id) + i,
498 virt_to_maddr(v->arch.privregs + i));
499 }
501 int vcpu_late_initialise(struct vcpu *v)
502 {
503 struct domain *d = v->domain;
504 struct page_info *page;
505 int rc, order;
507 if (HAS_PERVCPU_VHPT(d)) {
508 rc = pervcpu_vhpt_alloc(v);
509 if (rc != 0)
510 return rc;
511 }
513 /* Create privregs page. */
514 order = get_order_from_shift(XMAPPEDREGS_SHIFT);
515 page = alloc_domheap_pages(NULL, order, 0);
516 if (page == NULL)
517 return -ENOMEM;
519 v->arch.privregs = page_to_virt(page);
520 BUG_ON(v->arch.privregs == NULL);
521 memset(v->arch.privregs, 0, 1 << XMAPPEDREGS_SHIFT);
522 vcpu_share_privregs_with_guest(v);
524 return 0;
525 }
527 void vcpu_destroy(struct vcpu *v)
528 {
529 if (is_hvm_vcpu(v))
530 vmx_relinquish_vcpu_resources(v);
531 else
532 relinquish_vcpu_resources(v);
533 }
535 static unsigned long*
536 vcpu_to_rbs_bottom(struct vcpu *v)
537 {
538 return (unsigned long*)((char *)v + IA64_RBS_OFFSET);
539 }
541 static void init_switch_stack(struct vcpu *v)
542 {
543 struct pt_regs *regs = vcpu_regs (v);
544 struct switch_stack *sw = (struct switch_stack *) regs - 1;
545 extern void ia64_ret_from_clone;
547 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
548 sw->ar_bspstore = (unsigned long)vcpu_to_rbs_bottom(v);
549 sw->b0 = (unsigned long) &ia64_ret_from_clone;
550 sw->ar_fpsr = FPSR_DEFAULT;
551 v->arch._thread.ksp = (unsigned long) sw - 16;
552 // stay on the kernel stack because we may get interrupts!
553 // ia64_ret_from_clone switches to user stack
554 v->arch._thread.on_ustack = 0;
555 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
556 }
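/*
 * For reference: the layout that init_switch_stack() and the pointer
 * arithmetic above assume inside each per-vcpu block of order
 * KERNEL_STACK_SIZE_ORDER ('v' is the lowest address of the block):
 *
 *   v                        struct vcpu (and thread_info)
 *   v + IA64_RBS_OFFSET      bottom of the RSE backing store, grows up
 *   ...                      kernel memory stack, grows down from the top
 *   regs - 1                 struct switch_stack
 *   regs = vcpu_regs(v)      struct pt_regs, at the top of the block
 *
 * The sketch below (not built) only prints those addresses; it uses just
 * the helpers that already appear in this file.
 */
#if 0
static void show_vcpu_stack_layout(struct vcpu *v)
{
	struct pt_regs *regs = vcpu_regs(v);
	struct switch_stack *sw = (struct switch_stack *)regs - 1;

	printk("vcpu %p  rbs bottom %p  switch_stack %p  pt_regs %p  ksp 0x%lx\n",
	       v, vcpu_to_rbs_bottom(v), sw, regs, v->arch._thread.ksp);
}
#endif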
558 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
559 static int opt_pervcpu_vhpt = 1;
560 integer_param("pervcpu_vhpt", opt_pervcpu_vhpt);
561 #endif
563 int arch_domain_create(struct domain *d)
564 {
565 int i;
566 struct page_info *page = NULL;
568 // the following will eventually need to be negotiated dynamically
569 d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
570 d->arch.breakimm = 0x1000;
571 for (i = 0; i < NR_CPUS; i++) {
572 d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
573 }
575 if (is_idle_domain(d))
576 return 0;
578 foreign_p2m_init(d);
579 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
580 d->arch.has_pervcpu_vhpt = opt_pervcpu_vhpt;
581 dprintk(XENLOG_INFO, "%s:%d domain %d pervcpu_vhpt %d\n",
582 __func__, __LINE__, d->domain_id, d->arch.has_pervcpu_vhpt);
583 #endif
584 if (tlb_track_create(d) < 0)
585 goto fail_nomem1;
586 page = alloc_domheap_pages(NULL, get_order_from_shift(XSI_SHIFT), 0);
587 if (page == NULL)
588 goto fail_nomem;
589 d->shared_info = page_to_virt(page);
590 BUG_ON(d->shared_info == NULL);
591 memset(d->shared_info, 0, XSI_SIZE);
592 for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
593 share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
594 d, XENSHARE_writable);
596 /* We may also need an emulation rid for region 4, though it's unlikely
597 * to see a guest issue uncacheable accesses in metaphysical mode. But
598 * keeping such info here may be more sane.
599 */
600 if (!allocate_rid_range(d,0))
601 goto fail_nomem;
603 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
604 d->arch.relres = RELRES_not_started;
605 d->arch.mm_teardown_offset = 0;
606 INIT_LIST_HEAD(&d->arch.relmem_list);
608 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
609 goto fail_nomem;
611 /*
612 * grant_table_create() can't fully initialize grant table for domain
613 * because it is called before arch_domain_create().
614 * Here we complete the initialization which requires p2m table.
615 */
616 spin_lock(&d->grant_table->lock);
617 for (i = 0; i < nr_grant_frames(d->grant_table); i++)
618 ia64_gnttab_create_shared_page(d, d->grant_table, i);
619 spin_unlock(&d->grant_table->lock);
621 d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
622 RANGESETF_prettyprint_hex);
624 dprintk(XENLOG_DEBUG, "arch_domain_create: domain=%p\n", d);
625 return 0;
627 fail_nomem:
628 tlb_track_destroy(d);
629 fail_nomem1:
630 if (d->arch.mm.pgd != NULL)
631 pgd_free(d->arch.mm.pgd);
632 if (page != NULL)
633 free_domheap_pages(page, get_order_from_shift(XSI_SHIFT));
634 return -ENOMEM;
635 }
637 void arch_domain_destroy(struct domain *d)
638 {
639 mm_final_teardown(d);
641 if (d->shared_info != NULL)
642 free_domheap_pages(virt_to_page(d->shared_info),
643 get_order_from_shift(XSI_SHIFT));
645 tlb_track_destroy(d);
647 /* Clear vTLB for the next domain. */
648 domain_flush_tlb_vhpt(d);
650 deallocate_rid_range(d);
651 }
653 int arch_vcpu_reset(struct vcpu *v)
654 {
655 /* FIXME: Stub for now */
656 return 0;
657 }
659 /* Here it is assumed that all of the CPUs have the same RSE.N_STACKED_PHYS */
660 static unsigned long num_phys_stacked;
661 static int __init
662 init_num_phys_stacked(void)
663 {
664 switch (ia64_pal_rse_info(&num_phys_stacked, NULL)) {
665 case 0L:
666 printk("the number of physical stacked general registers "
667 "(RSE.N_STACKED_PHYS) = %ld\n", num_phys_stacked);
668 return 0;
669 case -2L:
670 case -3L:
671 default:
672 break;
673 }
674 printk("WARNING: PAL_RSE_INFO call failed. "
675 "domain save/restore may NOT work!\n");
676 return -EINVAL;
677 }
678 __initcall(init_num_phys_stacked);
680 #define COPY_FPREG(dst, src) memcpy(dst, src, sizeof(struct ia64_fpreg))
682 #define AR_PFS_PEC_SHIFT 51
683 #define AR_PFS_REC_SIZE 6
684 #define AR_PFS_PEC_MASK (((1UL << 6) - 1) << 51)
686 /*
687 * See init_switch_stack() and ptrace.h
688 */
689 static struct switch_stack*
690 vcpu_to_switch_stack(struct vcpu* v)
691 {
692 return (struct switch_stack *)(v->arch._thread.ksp + 16);
693 }
695 static int
696 vcpu_has_not_run(struct vcpu* v)
697 {
698 extern void ia64_ret_from_clone;
699 struct switch_stack *sw = vcpu_to_switch_stack(v);
701 return (sw == (struct switch_stack *)(vcpu_regs(v)) - 1) &&
702 (sw->b0 == (unsigned long)&ia64_ret_from_clone);
703 }
705 static void
706 nats_update(unsigned int* nats, unsigned int reg, char nat)
707 {
708 BUG_ON(reg > 31);
710 if (nat)
711 *nats |= (1UL << reg);
712 else
713 *nats &= ~(1UL << reg);
714 }
716 void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
717 {
718 int i;
719 struct vcpu_tr_regs *tr = &c.nat->regs.tr;
720 struct cpu_user_regs *uregs = vcpu_regs(v);
721 struct switch_stack *sw = vcpu_to_switch_stack(v);
722 struct unw_frame_info info;
723 int is_hvm = VMX_DOMAIN(v);
724 unsigned int rbs_size;
725 unsigned long *const rbs_bottom = vcpu_to_rbs_bottom(v);
726 unsigned long *rbs_top;
727 unsigned long *rbs_rnat_addr;
728 unsigned int top_slot;
729 unsigned int num_regs;
731 memset(c.nat, 0, sizeof(*c.nat));
732 c.nat->regs.b[6] = uregs->b6;
733 c.nat->regs.b[7] = uregs->b7;
735 memset(&info, 0, sizeof(info));
736 unw_init_from_blocked_task(&info, v);
737 if (vcpu_has_not_run(v)) {
738 c.nat->regs.ar.lc = sw->ar_lc;
739 c.nat->regs.ar.ec =
740 (sw->ar_pfs & AR_PFS_PEC_MASK) >> AR_PFS_PEC_SHIFT;
741 } else if (unw_unwind_to_user(&info) < 0) {
742 /* warn: should panic? */
743 gdprintk(XENLOG_ERR, "vcpu=%d unw_unwind_to_user() failed.\n",
744 v->vcpu_id);
745 show_stack(v, NULL);
747 /* can't return error */
748 c.nat->regs.ar.lc = 0;
749 c.nat->regs.ar.ec = 0;
750 } else {
751 unw_get_ar(&info, UNW_AR_LC, &c.nat->regs.ar.lc);
752 unw_get_ar(&info, UNW_AR_EC, &c.nat->regs.ar.ec);
753 }
754 c.nat->regs.ar.csd = uregs->ar_csd;
755 c.nat->regs.ar.ssd = uregs->ar_ssd;
757 c.nat->regs.r[8] = uregs->r8;
758 c.nat->regs.r[9] = uregs->r9;
759 c.nat->regs.r[10] = uregs->r10;
760 c.nat->regs.r[11] = uregs->r11;
762 if (is_hvm)
763 c.nat->regs.psr = vmx_vcpu_get_psr(v);
764 else
765 c.nat->regs.psr = vcpu_get_psr(v);
767 c.nat->regs.ip = uregs->cr_iip;
768 c.nat->regs.cfm = uregs->cr_ifs;
770 c.nat->regs.ar.unat = uregs->ar_unat;
771 c.nat->regs.ar.pfs = uregs->ar_pfs;
772 c.nat->regs.ar.rsc = uregs->ar_rsc;
773 c.nat->regs.ar.rnat = uregs->ar_rnat;
774 c.nat->regs.ar.bspstore = uregs->ar_bspstore;
776 c.nat->regs.pr = uregs->pr;
777 c.nat->regs.b[0] = uregs->b0;
778 rbs_size = uregs->loadrs >> 16;
779 num_regs = ia64_rse_num_regs(rbs_bottom,
780 (unsigned long*)((char*)rbs_bottom + rbs_size));
781 c.nat->regs.ar.bsp = (unsigned long)ia64_rse_skip_regs(
782 (unsigned long*)c.nat->regs.ar.bspstore, num_regs);
783 BUG_ON(num_regs > num_phys_stacked);
785 c.nat->regs.r[1] = uregs->r1;
786 c.nat->regs.r[12] = uregs->r12;
787 c.nat->regs.r[13] = uregs->r13;
788 c.nat->regs.ar.fpsr = uregs->ar_fpsr;
789 c.nat->regs.r[15] = uregs->r15;
791 c.nat->regs.r[14] = uregs->r14;
792 c.nat->regs.r[2] = uregs->r2;
793 c.nat->regs.r[3] = uregs->r3;
794 c.nat->regs.r[16] = uregs->r16;
795 c.nat->regs.r[17] = uregs->r17;
796 c.nat->regs.r[18] = uregs->r18;
797 c.nat->regs.r[19] = uregs->r19;
798 c.nat->regs.r[20] = uregs->r20;
799 c.nat->regs.r[21] = uregs->r21;
800 c.nat->regs.r[22] = uregs->r22;
801 c.nat->regs.r[23] = uregs->r23;
802 c.nat->regs.r[24] = uregs->r24;
803 c.nat->regs.r[25] = uregs->r25;
804 c.nat->regs.r[26] = uregs->r26;
805 c.nat->regs.r[27] = uregs->r27;
806 c.nat->regs.r[28] = uregs->r28;
807 c.nat->regs.r[29] = uregs->r29;
808 c.nat->regs.r[30] = uregs->r30;
809 c.nat->regs.r[31] = uregs->r31;
811 c.nat->regs.ar.ccv = uregs->ar_ccv;
813 COPY_FPREG(&c.nat->regs.f[2], &sw->f2);
814 COPY_FPREG(&c.nat->regs.f[3], &sw->f3);
815 COPY_FPREG(&c.nat->regs.f[4], &sw->f4);
816 COPY_FPREG(&c.nat->regs.f[5], &sw->f5);
818 COPY_FPREG(&c.nat->regs.f[6], &uregs->f6);
819 COPY_FPREG(&c.nat->regs.f[7], &uregs->f7);
820 COPY_FPREG(&c.nat->regs.f[8], &uregs->f8);
821 COPY_FPREG(&c.nat->regs.f[9], &uregs->f9);
822 COPY_FPREG(&c.nat->regs.f[10], &uregs->f10);
823 COPY_FPREG(&c.nat->regs.f[11], &uregs->f11);
825 COPY_FPREG(&c.nat->regs.f[12], &sw->f12);
826 COPY_FPREG(&c.nat->regs.f[13], &sw->f13);
827 COPY_FPREG(&c.nat->regs.f[14], &sw->f14);
828 COPY_FPREG(&c.nat->regs.f[15], &sw->f15);
829 COPY_FPREG(&c.nat->regs.f[16], &sw->f16);
830 COPY_FPREG(&c.nat->regs.f[17], &sw->f17);
831 COPY_FPREG(&c.nat->regs.f[18], &sw->f18);
832 COPY_FPREG(&c.nat->regs.f[19], &sw->f19);
833 COPY_FPREG(&c.nat->regs.f[20], &sw->f20);
834 COPY_FPREG(&c.nat->regs.f[21], &sw->f21);
835 COPY_FPREG(&c.nat->regs.f[22], &sw->f22);
836 COPY_FPREG(&c.nat->regs.f[23], &sw->f23);
837 COPY_FPREG(&c.nat->regs.f[24], &sw->f24);
838 COPY_FPREG(&c.nat->regs.f[25], &sw->f25);
839 COPY_FPREG(&c.nat->regs.f[26], &sw->f26);
840 COPY_FPREG(&c.nat->regs.f[27], &sw->f27);
841 COPY_FPREG(&c.nat->regs.f[28], &sw->f28);
842 COPY_FPREG(&c.nat->regs.f[29], &sw->f29);
843 COPY_FPREG(&c.nat->regs.f[30], &sw->f30);
844 COPY_FPREG(&c.nat->regs.f[31], &sw->f31);
846 // f32 - f127
847 memcpy(&c.nat->regs.f[32], &v->arch._thread.fph[0],
848 sizeof(v->arch._thread.fph));
850 #define NATS_UPDATE(reg) \
851 nats_update(&c.nat->regs.nats, (reg), \
852 !!(uregs->eml_unat & \
853 (1UL << ia64_unat_pos(&uregs->r ## reg))))
855 // corresponding bit in ar.unat is determined by
856 // (&uregs->rN){8:3}.
857 // r8: the lowest gr member of struct cpu_user_regs.
858 // r7: the highest gr member of struct cpu_user_regs.
859 BUILD_BUG_ON(offsetof(struct cpu_user_regs, r7) -
860 offsetof(struct cpu_user_regs, r8) >
861 64 * sizeof(unsigned long));
863 NATS_UPDATE(1);
864 NATS_UPDATE(2);
865 NATS_UPDATE(3);
867 NATS_UPDATE(8);
868 NATS_UPDATE(9);
869 NATS_UPDATE(10);
870 NATS_UPDATE(11);
871 NATS_UPDATE(12);
872 NATS_UPDATE(13);
873 NATS_UPDATE(14);
874 NATS_UPDATE(15);
875 NATS_UPDATE(16);
876 NATS_UPDATE(17);
877 NATS_UPDATE(18);
878 NATS_UPDATE(19);
879 NATS_UPDATE(20);
880 NATS_UPDATE(21);
881 NATS_UPDATE(22);
882 NATS_UPDATE(23);
883 NATS_UPDATE(24);
884 NATS_UPDATE(25);
885 NATS_UPDATE(26);
886 NATS_UPDATE(27);
887 NATS_UPDATE(28);
888 NATS_UPDATE(29);
889 NATS_UPDATE(30);
890 NATS_UPDATE(31);
892 if (!is_hvm) {
893 c.nat->regs.r[4] = uregs->r4;
894 c.nat->regs.r[5] = uregs->r5;
895 c.nat->regs.r[6] = uregs->r6;
896 c.nat->regs.r[7] = uregs->r7;
898 NATS_UPDATE(4);
899 NATS_UPDATE(5);
900 NATS_UPDATE(6);
901 NATS_UPDATE(7);
902 #undef NATS_UPDATE
903 } else {
904 /*
905 * For a VT-i domain, r4-r7 are sometimes saved both in
906 * uregs->r[4-7] and on the memory stack, or only on the memory stack.
907 * So it is OK to get them from the memory stack.
908 */
909 c.nat->regs.nats = uregs->eml_unat;
911 if (vcpu_has_not_run(v)) {
912 c.nat->regs.r[4] = sw->r4;
913 c.nat->regs.r[5] = sw->r5;
914 c.nat->regs.r[6] = sw->r6;
915 c.nat->regs.r[7] = sw->r7;
917 nats_update(&c.nat->regs.nats, 4,
918 !!(sw->ar_unat &
919 (1UL << ia64_unat_pos(&sw->r4))));
920 nats_update(&c.nat->regs.nats, 5,
921 !!(sw->ar_unat &
922 (1UL << ia64_unat_pos(&sw->r5))));
923 nats_update(&c.nat->regs.nats, 6,
924 !!(sw->ar_unat &
925 (1UL << ia64_unat_pos(&sw->r6))));
926 nats_update(&c.nat->regs.nats, 7,
927 !!(sw->ar_unat &
928 (1UL << ia64_unat_pos(&sw->r7))));
929 } else {
930 char nat;
932 unw_get_gr(&info, 4, &c.nat->regs.r[4], &nat);
933 nats_update(&c.nat->regs.nats, 4, nat);
934 unw_get_gr(&info, 5, &c.nat->regs.r[5], &nat);
935 nats_update(&c.nat->regs.nats, 5, nat);
936 unw_get_gr(&info, 6, &c.nat->regs.r[6], &nat);
937 nats_update(&c.nat->regs.nats, 6, nat);
938 unw_get_gr(&info, 7, &c.nat->regs.r[7], &nat);
939 nats_update(&c.nat->regs.nats, 7, nat);
940 }
941 }
943 c.nat->regs.rbs_voff = (IA64_RBS_OFFSET / 8) % 64;
944 if (unlikely(rbs_size > sizeof(c.nat->regs.rbs)))
945 gdprintk(XENLOG_INFO,
946 "rbs_size is too large 0x%x > 0x%lx\n",
947 rbs_size, sizeof(c.nat->regs.rbs));
948 else
949 memcpy(c.nat->regs.rbs, rbs_bottom, rbs_size);
951 rbs_top = (unsigned long*)((char *)rbs_bottom + rbs_size) - 1;
952 rbs_rnat_addr = ia64_rse_rnat_addr(rbs_top);
953 if ((unsigned long)rbs_rnat_addr >= sw->ar_bspstore)
954 rbs_rnat_addr = &sw->ar_rnat;
956 top_slot = ia64_rse_slot_num(rbs_top);
958 c.nat->regs.rbs_rnat = (*rbs_rnat_addr) & ((1UL << top_slot) - 1);
959 if (ia64_rse_rnat_addr(rbs_bottom) == ia64_rse_rnat_addr(rbs_top)) {
960 unsigned int bottom_slot = ia64_rse_slot_num(rbs_bottom);
961 c.nat->regs.rbs_rnat &= ~((1UL << bottom_slot) - 1);
962 }
964 c.nat->regs.num_phys_stacked = num_phys_stacked;
966 if (VMX_DOMAIN(v))
967 c.nat->privregs_pfn = VGC_PRIVREGS_HVM;
968 else
969 c.nat->privregs_pfn = get_gpfn_from_mfn(
970 virt_to_maddr(v->arch.privregs) >> PAGE_SHIFT);
972 for (i = 0; i < IA64_NUM_DBG_REGS; i++) {
973 if (VMX_DOMAIN(v)) {
974 vmx_vcpu_get_dbr(v, i, &c.nat->regs.dbr[i]);
975 vmx_vcpu_get_ibr(v, i, &c.nat->regs.ibr[i]);
976 } else {
977 vcpu_get_dbr(v, i, &c.nat->regs.dbr[i]);
978 vcpu_get_ibr(v, i, &c.nat->regs.ibr[i]);
979 }
980 }
982 for (i = 0; i < 8; i++)
983 vcpu_get_rr(v, (unsigned long)i << 61, &c.nat->regs.rr[i]);
985 /* Fill extra regs. */
986 for (i = 0;
987 (i < sizeof(tr->itrs) / sizeof(tr->itrs[0])) && i < NITRS;
988 i++) {
989 tr->itrs[i].pte = v->arch.itrs[i].pte.val;
990 tr->itrs[i].itir = v->arch.itrs[i].itir;
991 tr->itrs[i].vadr = v->arch.itrs[i].vadr;
992 tr->itrs[i].rid = v->arch.itrs[i].rid;
993 }
994 for (i = 0;
995 (i < sizeof(tr->dtrs) / sizeof(tr->dtrs[0])) && i < NDTRS;
996 i++) {
997 tr->dtrs[i].pte = v->arch.dtrs[i].pte.val;
998 tr->dtrs[i].itir = v->arch.dtrs[i].itir;
999 tr->dtrs[i].vadr = v->arch.dtrs[i].vadr;
1000 tr->dtrs[i].rid = v->arch.dtrs[i].rid;
1002 c.nat->event_callback_ip = v->arch.event_callback_ip;
1004 /* If PV and privregs is not set, we can't read mapped registers. */
1005 if (!is_hvm_vcpu(v) && v->arch.privregs == NULL)
1006 return;
1008 vcpu_get_dcr(v, &c.nat->regs.cr.dcr);
1010 c.nat->regs.cr.itm = is_hvm_vcpu(v) ?
1011 vmx_vcpu_get_itm(v) : PSCBX(v, domain_itm);
1012 vcpu_get_iva(v, &c.nat->regs.cr.iva);
1013 vcpu_get_pta(v, &c.nat->regs.cr.pta);
1015 vcpu_get_ipsr(v, &c.nat->regs.cr.ipsr);
1016 vcpu_get_isr(v, &c.nat->regs.cr.isr);
1017 vcpu_get_iip(v, &c.nat->regs.cr.iip);
1018 vcpu_get_ifa(v, &c.nat->regs.cr.ifa);
1019 vcpu_get_itir(v, &c.nat->regs.cr.itir);
1020 vcpu_get_iha(v, &c.nat->regs.cr.iha);
1022 //XXX change irr[] and arch.insvc[]
1023 if (is_hvm_vcpu(v))
1024 /* c.nat->regs.cr.ivr = vmx_vcpu_get_ivr(v)*/;//XXXnot SMP-safe
1025 else
1026 vcpu_get_ivr (v, &c.nat->regs.cr.ivr);
1027 vcpu_get_iim(v, &c.nat->regs.cr.iim);
1029 vcpu_get_tpr(v, &c.nat->regs.cr.tpr);
1030 vcpu_get_irr0(v, &c.nat->regs.cr.irr[0]);
1031 vcpu_get_irr1(v, &c.nat->regs.cr.irr[1]);
1032 vcpu_get_irr2(v, &c.nat->regs.cr.irr[2]);
1033 vcpu_get_irr3(v, &c.nat->regs.cr.irr[3]);
1034 vcpu_get_itv(v, &c.nat->regs.cr.itv);//XXX vlsapic
1035 vcpu_get_pmv(v, &c.nat->regs.cr.pmv);
1036 vcpu_get_cmcv(v, &c.nat->regs.cr.cmcv);
1038 if (is_hvm)
1039 vmx_arch_get_info_guest(v, c);
1040 }
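/*
 * The rbs_size/num_regs/bsp arithmetic above relies on the standard
 * Linux/ia64 RSE helpers.  The register backing store interleaves an RNAT
 * collection slot every 64 slots (whenever bits 8:3 of the address are all
 * ones), so converting between a byte span and a register count is not a
 * simple division by 8.  Roughly (simplified sketch, not built; see
 * asm/rse.h for the authoritative versions, including ia64_rse_skip_regs()
 * which walks the other way):
 */
#if 0
static unsigned long rse_slot_num(unsigned long *addr)
{
	return ((unsigned long)addr >> 3) & 0x3f;
}

static int rse_is_rnat_slot(unsigned long *addr)
{
	return rse_slot_num(addr) == 0x3f;   /* every 64th slot holds NaT bits */
}

/* Number of registers between bspstore and bsp, excluding RNAT slots. */
static unsigned long rse_num_regs(unsigned long *bspstore, unsigned long *bsp)
{
	unsigned long slots = bsp - bspstore;

	return slots - (rse_slot_num(bspstore) + slots) / 0x40;
}
#endif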
1042 #if 0
1043 // for debug
1044 static void
1045 __rbs_print(const char* func, int line, const char* name,
1046 const unsigned long* rbs, unsigned int rbs_size)
1048 unsigned int i;
1049 printk("%s:%d %s rbs %p\n", func, line, name, rbs);
1050 printk(" rbs_size 0x%016x no 0x%lx\n",
1051 rbs_size, rbs_size / sizeof(unsigned long));
1053 for (i = 0; i < rbs_size / sizeof(unsigned long); i++) {
1054 const char* zero_or_n = "0x";
1055 if (ia64_rse_is_rnat_slot((unsigned long*)&rbs[i]))
1056 zero_or_n = "Nx";
1058 if ((i % 3) == 0)
1059 printk("0x%02x:", i);
1060 printk(" %s%016lx", zero_or_n, rbs[i]);
1061 if ((i % 3) == 2)
1062 printk("\n");
1064 printk("\n");
1067 #define rbs_print(rbs, rbs_size) \
1068 __rbs_print(__func__, __LINE__, (#rbs), (rbs), (rbs_size))
1069 #endif
1071 static int
1072 copy_rbs(struct vcpu* v, unsigned long* dst_rbs_size,
1073 const unsigned long* rbs, unsigned long rbs_size,
1074 unsigned long src_rnat, unsigned long rbs_voff)
1076 int rc = -EINVAL;
1077 struct page_info* page;
1078 unsigned char* vaddr;
1079 unsigned long* src_bsp;
1080 unsigned long* src_bspstore;
1082 struct switch_stack* sw = vcpu_to_switch_stack(v);
1083 unsigned long num_regs;
1084 unsigned long* dst_bsp;
1085 unsigned long* dst_bspstore;
1086 unsigned long* dst_rnat;
1087 unsigned long dst_rnat_tmp;
1088 unsigned long dst_rnat_mask;
1089 unsigned long flags;
1090 extern void ia64_copy_rbs(unsigned long* dst_bspstore,
1091 unsigned long* dst_rbs_size,
1092 unsigned long* dst_rnat_p,
1093 unsigned long* src_bsp,
1094 unsigned long src_rbs_size,
1095 unsigned long src_rnat);
1097 dst_bspstore = vcpu_to_rbs_bottom(v);
1098 *dst_rbs_size = rbs_size;
1099 if (rbs_size == 0)
1100 return 0;
1102 // the rbs offset depends on sizeof(struct vcpu), so
1103 // it's too unstable for the hypercall ABI.
1104 // we need to take the rbs offset into account.
1105 //memcpy(dst_bspstore, c.nat->regs.rbs, rbs_size);
1107 // It is assumed that rbs_size is small enough compared
1108 // to KERNEL_STACK_SIZE.
1109 page = alloc_domheap_pages(NULL, KERNEL_STACK_SIZE_ORDER, 0);
1110 if (page == NULL)
1111 return -ENOMEM;
1112 vaddr = page_to_virt(page);
1114 src_bspstore = (unsigned long*)(vaddr + rbs_voff * 8);
1115 src_bsp = (unsigned long*)((unsigned char*)src_bspstore + rbs_size);
1116 if ((unsigned long)src_bsp >= (unsigned long)vaddr + PAGE_SIZE)
1117 goto out;
1118 memcpy(src_bspstore, rbs, rbs_size);
1120 num_regs = ia64_rse_num_regs(src_bspstore, src_bsp);
1121 dst_bsp = ia64_rse_skip_regs(dst_bspstore, num_regs);
1122 *dst_rbs_size = (unsigned long)dst_bsp - (unsigned long)dst_bspstore;
1124 // rough check.
1125 if (((unsigned long)dst_bsp & ~PAGE_MASK) > KERNEL_STACK_SIZE / 2)
1126 goto out;
1128 // ia64_copy_rbs() uses the real cpu's stacked registers.
1129 // So it may raise an Illegal Operation fault, resulting
1130 // in a panic, if rbs_size is too large to load compared to
1131 // the number of physical stacked registers, RSE.N_STACKED_PHYS,
1132 // which is cpu implementation specific.
1133 // See SDM vol. 2, Register Stack Engine 6, especially 6.5.5.
1134 //
1135 // For safe operation and cpu model independence,
1136 // we would need to copy them by hand without loadrs and flushrs.
1137 // However, even if we implemented that, a similar issue still occurs
1138 // when running the guest: the CPU context restore routine issues loadrs,
1139 // resulting in an Illegal Operation fault. And what if the vRSE is in
1140 // enforced lazy mode? We can't store any dirty stacked registers
1141 // into the RBS without cover or br.call.
1142 if (num_regs > num_phys_stacked) {
1143 rc = -ENOSYS;
1144 gdprintk(XENLOG_WARNING,
1145 "%s:%d domain %d: can't load stacked registres\n"
1146 "requested size 0x%lx => 0x%lx, num regs %ld"
1147 "RSE.N_STACKED_PHYS %ld\n",
1148 __func__, __LINE__, v->domain->domain_id,
1149 rbs_size, *dst_rbs_size, num_regs,
1150 num_phys_stacked);
1151 goto out;
1154 // we mask interrupts to avoid using register backing store.
1155 local_irq_save(flags);
1156 ia64_copy_rbs(dst_bspstore, dst_rbs_size, &dst_rnat_tmp,
1157 src_bsp, rbs_size, src_rnat);
1158 local_irq_restore(flags);
1160 dst_rnat_mask = (1UL << ia64_rse_slot_num(dst_bsp)) - 1;
1161 dst_rnat = ia64_rse_rnat_addr(dst_bsp);
1162 if ((unsigned long)dst_rnat > sw->ar_bspstore)
1163 dst_rnat = &sw->ar_rnat;
1164 // if ia64_rse_rnat_addr(dst_bsp) ==
1165 // ia64_rse_rnat_addr(vcpu_to_rbs_bottom(v)), the low bits of rnat
1166 // are just ignored, so we don't have to mask them out.
1167 *dst_rnat =
1168 (*dst_rnat & ~dst_rnat_mask) | (dst_rnat_tmp & dst_rnat_mask);
1170 rc = 0;
1171 out:
1172 free_domheap_pages(page, KERNEL_STACK_SIZE_ORDER);
1173 return rc;
1176 static void
1177 unat_update(unsigned long *unat_eml, unsigned long *spill_addr, char nat)
1179 unsigned int pos = ia64_unat_pos(spill_addr);
1180 if (nat)
1181 *unat_eml |= (1UL << pos);
1182 else
1183 *unat_eml &= ~(1UL << pos);
1186 int arch_set_info_guest(struct vcpu *v, vcpu_guest_context_u c)
1188 struct cpu_user_regs *uregs = vcpu_regs(v);
1189 struct domain *d = v->domain;
1190 struct switch_stack *sw = vcpu_to_switch_stack(v);
1191 int was_initialised = v->is_initialised;
1192 struct unw_frame_info info;
1193 unsigned int rbs_size;
1194 unsigned int num_regs;
1195 unsigned long * const rbs_bottom = vcpu_to_rbs_bottom(v);
1196 int rc = 0;
1197 int i;
1199 /* Finish vcpu initialization. */
1200 if (!was_initialised) {
1201 if (is_hvm_domain(d))
1202 rc = vmx_final_setup_guest(v);
1203 else
1204 rc = vcpu_late_initialise(v);
1205 if (rc != 0)
1206 return rc;
1208 vcpu_init_regs(v);
1210 v->is_initialised = 1;
1211 /* Auto-online VCPU0 when it is initialised. */
1212 if (v->vcpu_id == 0)
1213 clear_bit(_VPF_down, &v->pause_flags);
1216 if (c.nat == NULL)
1217 return 0;
1219 uregs->b6 = c.nat->regs.b[6];
1220 uregs->b7 = c.nat->regs.b[7];
1222 memset(&info, 0, sizeof(info));
1223 unw_init_from_blocked_task(&info, v);
1224 if (vcpu_has_not_run(v)) {
1225 sw->ar_lc = c.nat->regs.ar.lc;
1226 sw->ar_pfs =
1227 (sw->ar_pfs & ~AR_PFS_PEC_MASK) |
1228 ((c.nat->regs.ar.ec << AR_PFS_PEC_SHIFT) &
1229 AR_PFS_PEC_MASK);
1230 } else if (unw_unwind_to_user(&info) < 0) {
1231 /* warn: should panic? */
1232 gdprintk(XENLOG_ERR,
1233 "vcpu=%d unw_unwind_to_user() failed.\n",
1234 v->vcpu_id);
1235 show_stack(v, NULL);
1237 //return -ENOSYS;
1238 } else {
1239 unw_set_ar(&info, UNW_AR_LC, c.nat->regs.ar.lc);
1240 unw_set_ar(&info, UNW_AR_EC, c.nat->regs.ar.ec);
1242 uregs->ar_csd = c.nat->regs.ar.csd;
1243 uregs->ar_ssd = c.nat->regs.ar.ssd;
1245 uregs->r8 = c.nat->regs.r[8];
1246 uregs->r9 = c.nat->regs.r[9];
1247 uregs->r10 = c.nat->regs.r[10];
1248 uregs->r11 = c.nat->regs.r[11];
1250 if (!is_hvm_domain(d))
1251 vcpu_set_psr(v, c.nat->regs.psr);
1252 else
1253 vmx_vcpu_set_psr(v, c.nat->regs.psr);
1254 uregs->cr_iip = c.nat->regs.ip;
1255 uregs->cr_ifs = c.nat->regs.cfm;
1257 uregs->ar_unat = c.nat->regs.ar.unat;
1258 uregs->ar_pfs = c.nat->regs.ar.pfs;
1259 uregs->ar_rsc = c.nat->regs.ar.rsc;
1260 uregs->ar_rnat = c.nat->regs.ar.rnat;
1261 uregs->ar_bspstore = c.nat->regs.ar.bspstore;
1263 uregs->pr = c.nat->regs.pr;
1264 uregs->b0 = c.nat->regs.b[0];
1265 num_regs = ia64_rse_num_regs((unsigned long*)c.nat->regs.ar.bspstore,
1266 (unsigned long*)c.nat->regs.ar.bsp);
1267 rbs_size = (unsigned long)ia64_rse_skip_regs(rbs_bottom, num_regs) -
1268 (unsigned long)rbs_bottom;
1269 if (rbs_size > sizeof (c.nat->regs.rbs)) {
1270 gdprintk(XENLOG_INFO,
1271 "rbs size is too large %x > %lx\n",
1272 rbs_size, sizeof (c.nat->regs.rbs));
1273 return -EINVAL;
1275 if (rbs_size > 0 &&
1276 ((IA64_RBS_OFFSET / 8) % 64) != c.nat->regs.rbs_voff)
1277 gdprintk(XENLOG_INFO,
1278 "rbs stack offset is different! xen 0x%x given 0x%x",
1279 (IA64_RBS_OFFSET / 8) % 64, c.nat->regs.rbs_voff);
1281 /* Protection against crazy user code. */
1282 if (!was_initialised)
1283 uregs->loadrs = (rbs_size << 16);
1284 if (rbs_size == (uregs->loadrs >> 16)) {
1285 unsigned long dst_rbs_size = 0;
1286 if (vcpu_has_not_run(v))
1287 sw->ar_bspstore = (unsigned long)rbs_bottom;
1289 rc = copy_rbs(v, &dst_rbs_size,
1290 c.nat->regs.rbs, rbs_size,
1291 c.nat->regs.rbs_rnat,
1292 c.nat->regs.rbs_voff);
1293 if (rc < 0)
1294 return rc;
1296 /* In case of newly created vcpu, ar_bspstore points to
1297 * the bottom of register stack. Move it up.
1298 * See also init_switch_stack().
1299 */
1300 if (vcpu_has_not_run(v)) {
1301 uregs->loadrs = (dst_rbs_size << 16);
1302 sw->ar_bspstore = (unsigned long)((char*)rbs_bottom +
1303 dst_rbs_size);
1307 // inhibit save/restore between cpus with different RSE.N_STACKED_PHYS
1308 // to avoid nasty issues.
1309 //
1310 // The number of physical stacked general registers (RSE.N_STACKED_PHYS)
1311 // isn't virtualized. The guest OS obtains it via the PAL_RSE_INFO call and
1312 // the value might be exported to user processes.
1313 // (Linux does so via /proc/cpuinfo.)
1314 // The SDM says only that the number is cpu implementation specific.
1315 //
1316 // If the number on the restoring cpu differs from that of the saving cpu,
1317 // the following, or something worse, might happen.
1318 // - Xen VMM itself may panic with an illegal operation fault when
1319 // issuing loadrs to run the guest
1320 // When RSE.N_STACKED_PHYS of saving CPU > RSE.N_STACKED_PHYS of
1321 // restoring CPU
1322 // This case is detected and the restore refused by copy_rbs()
1323 // - guest kernel may panic with an illegal operation fault
1324 // When RSE.N_STACKED_PHYS of saving CPU > RSE.N_STACKED_PHYS of
1325 // restoring CPU
1326 // - information leak from guest kernel to user process
1327 // When RSE.N_STACKED_PHYS of saving CPU < RSE.N_STACKED_PHYS of
1328 // restoring CPU
1329 // Before returning to a user process, the kernel should zero-clear all
1330 // physical stacked registers to prevent kernel bits from leaking.
1331 // It would do so based on RSE.N_STACKED_PHYS (Linux does).
1332 // In the restored environment the kernel would clear only a part
1333 // of the physical stacked registers.
1334 // - user processes or human operators would be confused.
1335 // RSE.N_STACKED_PHYS might be exported to user processes or human
1336 // operators. Actually, on Linux it is exported via /proc/cpuinfo, and
1337 // user processes might use it.
1338 // I don't know any concrete example, but it's possible in theory.
1339 // e.g. a thread library may allocate its RBS area based on the value.
1340 // (Fortunately glibc nptl doesn't.)
1341 if (c.nat->regs.num_phys_stacked != 0 && /* COMPAT */
1342 c.nat->regs.num_phys_stacked != num_phys_stacked) {
1343 gdprintk(XENLOG_WARNING,
1344 "num phys stacked is different! "
1345 "xen 0x%lx given 0x%lx",
1346 num_phys_stacked, c.nat->regs.num_phys_stacked);
1347 return -EINVAL;
1350 uregs->r1 = c.nat->regs.r[1];
1351 uregs->r12 = c.nat->regs.r[12];
1352 uregs->r13 = c.nat->regs.r[13];
1353 uregs->ar_fpsr = c.nat->regs.ar.fpsr;
1354 uregs->r15 = c.nat->regs.r[15];
1356 uregs->r14 = c.nat->regs.r[14];
1357 uregs->r2 = c.nat->regs.r[2];
1358 uregs->r3 = c.nat->regs.r[3];
1359 uregs->r16 = c.nat->regs.r[16];
1360 uregs->r17 = c.nat->regs.r[17];
1361 uregs->r18 = c.nat->regs.r[18];
1362 uregs->r19 = c.nat->regs.r[19];
1363 uregs->r20 = c.nat->regs.r[20];
1364 uregs->r21 = c.nat->regs.r[21];
1365 uregs->r22 = c.nat->regs.r[22];
1366 uregs->r23 = c.nat->regs.r[23];
1367 uregs->r24 = c.nat->regs.r[24];
1368 uregs->r25 = c.nat->regs.r[25];
1369 uregs->r26 = c.nat->regs.r[26];
1370 uregs->r27 = c.nat->regs.r[27];
1371 uregs->r28 = c.nat->regs.r[28];
1372 uregs->r29 = c.nat->regs.r[29];
1373 uregs->r30 = c.nat->regs.r[30];
1374 uregs->r31 = c.nat->regs.r[31];
1376 uregs->ar_ccv = c.nat->regs.ar.ccv;
1378 COPY_FPREG(&sw->f2, &c.nat->regs.f[2]);
1379 COPY_FPREG(&sw->f3, &c.nat->regs.f[3]);
1380 COPY_FPREG(&sw->f4, &c.nat->regs.f[4]);
1381 COPY_FPREG(&sw->f5, &c.nat->regs.f[5]);
1383 COPY_FPREG(&uregs->f6, &c.nat->regs.f[6]);
1384 COPY_FPREG(&uregs->f7, &c.nat->regs.f[7]);
1385 COPY_FPREG(&uregs->f8, &c.nat->regs.f[8]);
1386 COPY_FPREG(&uregs->f9, &c.nat->regs.f[9]);
1387 COPY_FPREG(&uregs->f10, &c.nat->regs.f[10]);
1388 COPY_FPREG(&uregs->f11, &c.nat->regs.f[11]);
1390 COPY_FPREG(&sw->f12, &c.nat->regs.f[12]);
1391 COPY_FPREG(&sw->f13, &c.nat->regs.f[13]);
1392 COPY_FPREG(&sw->f14, &c.nat->regs.f[14]);
1393 COPY_FPREG(&sw->f15, &c.nat->regs.f[15]);
1394 COPY_FPREG(&sw->f16, &c.nat->regs.f[16]);
1395 COPY_FPREG(&sw->f17, &c.nat->regs.f[17]);
1396 COPY_FPREG(&sw->f18, &c.nat->regs.f[18]);
1397 COPY_FPREG(&sw->f19, &c.nat->regs.f[19]);
1398 COPY_FPREG(&sw->f20, &c.nat->regs.f[20]);
1399 COPY_FPREG(&sw->f21, &c.nat->regs.f[21]);
1400 COPY_FPREG(&sw->f22, &c.nat->regs.f[22]);
1401 COPY_FPREG(&sw->f23, &c.nat->regs.f[23]);
1402 COPY_FPREG(&sw->f24, &c.nat->regs.f[24]);
1403 COPY_FPREG(&sw->f25, &c.nat->regs.f[25]);
1404 COPY_FPREG(&sw->f26, &c.nat->regs.f[26]);
1405 COPY_FPREG(&sw->f27, &c.nat->regs.f[27]);
1406 COPY_FPREG(&sw->f28, &c.nat->regs.f[28]);
1407 COPY_FPREG(&sw->f29, &c.nat->regs.f[29]);
1408 COPY_FPREG(&sw->f30, &c.nat->regs.f[30]);
1409 COPY_FPREG(&sw->f31, &c.nat->regs.f[31]);
1411 // f32 - f127
1412 memcpy(&v->arch._thread.fph[0], &c.nat->regs.f[32],
1413 sizeof(v->arch._thread.fph));
1415 #define UNAT_UPDATE(reg) \
1416 unat_update(&uregs->eml_unat, &uregs->r ## reg, \
1417 !!(c.nat->regs.nats & (1UL << (reg))));
1419 uregs->eml_unat = 0;
1420 UNAT_UPDATE(1);
1421 UNAT_UPDATE(2);
1422 UNAT_UPDATE(3);
1424 UNAT_UPDATE(8);
1425 UNAT_UPDATE(9);
1426 UNAT_UPDATE(10);
1427 UNAT_UPDATE(11);
1428 UNAT_UPDATE(12);
1429 UNAT_UPDATE(13);
1430 UNAT_UPDATE(14);
1431 UNAT_UPDATE(15);
1432 UNAT_UPDATE(16);
1433 UNAT_UPDATE(17);
1434 UNAT_UPDATE(18);
1435 UNAT_UPDATE(19);
1436 UNAT_UPDATE(20);
1437 UNAT_UPDATE(21);
1438 UNAT_UPDATE(22);
1439 UNAT_UPDATE(23);
1440 UNAT_UPDATE(24);
1441 UNAT_UPDATE(25);
1442 UNAT_UPDATE(26);
1443 UNAT_UPDATE(27);
1444 UNAT_UPDATE(28);
1445 UNAT_UPDATE(29);
1446 UNAT_UPDATE(30);
1447 UNAT_UPDATE(31);
1449 /*
1450 * r4-r7 are sometimes saved both in pt_regs->r[4-7] and on the memory
1451 * stack, or only on the memory stack.
1452 * In both cases, both the memory stack and pt_regs->r[4-7] are updated.
1453 */
1454 uregs->r4 = c.nat->regs.r[4];
1455 uregs->r5 = c.nat->regs.r[5];
1456 uregs->r6 = c.nat->regs.r[6];
1457 uregs->r7 = c.nat->regs.r[7];
1459 UNAT_UPDATE(4);
1460 UNAT_UPDATE(5);
1461 UNAT_UPDATE(6);
1462 UNAT_UPDATE(7);
1463 #undef UNAT_UPDATE
1464 if (vcpu_has_not_run(v)) {
1465 sw->r4 = c.nat->regs.r[4];
1466 sw->r5 = c.nat->regs.r[5];
1467 sw->r6 = c.nat->regs.r[6];
1468 sw->r7 = c.nat->regs.r[7];
1470 unat_update(&sw->ar_unat, &sw->r4,
1471 !!(c.nat->regs.nats & (1UL << 4)));
1472 unat_update(&sw->ar_unat, &sw->r5,
1473 !!(c.nat->regs.nats & (1UL << 5)));
1474 unat_update(&sw->ar_unat, &sw->r6,
1475 !!(c.nat->regs.nats & (1UL << 6)));
1476 unat_update(&sw->ar_unat, &sw->r7,
1477 !!(c.nat->regs.nats & (1UL << 7)));
1478 } else {
1479 unw_set_gr(&info, 4, c.nat->regs.r[4],
1480 !!(c.nat->regs.nats & (1UL << 4)));
1481 unw_set_gr(&info, 5, c.nat->regs.r[5],
1482 !!(c.nat->regs.nats & (1UL << 5)));
1483 unw_set_gr(&info, 6, c.nat->regs.r[6],
1484 !!(c.nat->regs.nats & (1UL << 6)));
1485 unw_set_gr(&info, 7, c.nat->regs.r[7],
1486 !!(c.nat->regs.nats & (1UL << 7)));
1489 if (!is_hvm_domain(d)) {
1490 /* domain runs at PL2/3 */
1491 uregs->cr_ipsr = vcpu_pl_adjust(uregs->cr_ipsr,
1492 IA64_PSR_CPL0_BIT);
1493 uregs->ar_rsc = vcpu_pl_adjust(uregs->ar_rsc, 2);
1496 for (i = 0; i < IA64_NUM_DBG_REGS; i++) {
1497 if (is_hvm_domain(d)) {
1498 vmx_vcpu_set_dbr(v, i, c.nat->regs.dbr[i]);
1499 vmx_vcpu_set_ibr(v, i, c.nat->regs.ibr[i]);
1500 } else {
1501 vcpu_set_dbr(v, i, c.nat->regs.dbr[i]);
1502 vcpu_set_ibr(v, i, c.nat->regs.ibr[i]);
1506 /* rr[] must be set before setting itrs[] dtrs[] */
1507 for (i = 0; i < 8; i++) {
1508 unsigned long rrval = c.nat->regs.rr[i];
1509 unsigned long reg = (unsigned long)i << 61;
1510 IA64FAULT fault = IA64_NO_FAULT;
1512 if (rrval == 0)
1513 continue;
1514 if (is_hvm_domain(d)) {
1515 //without VGCF_EXTRA_REGS check,
1516 //VTi domain doesn't boot.
1517 if (c.nat->flags & VGCF_EXTRA_REGS)
1518 fault = vmx_vcpu_set_rr(v, reg, rrval);
1519 } else
1520 fault = vcpu_set_rr(v, reg, rrval);
1521 if (fault != IA64_NO_FAULT)
1522 return -EINVAL;
1525 if (c.nat->flags & VGCF_EXTRA_REGS) {
1526 struct vcpu_tr_regs *tr = &c.nat->regs.tr;
1528 for (i = 0;
1529 (i < sizeof(tr->itrs) / sizeof(tr->itrs[0])) && i < NITRS;
1530 i++) {
1531 if (is_hvm_domain(d))
1532 vmx_vcpu_itr_i(v, i, tr->itrs[i].pte,
1533 tr->itrs[i].itir,
1534 tr->itrs[i].vadr);
1535 else
1536 vcpu_set_itr(v, i, tr->itrs[i].pte,
1537 tr->itrs[i].itir,
1538 tr->itrs[i].vadr,
1539 tr->itrs[i].rid);
1541 for (i = 0;
1542 (i < sizeof(tr->dtrs) / sizeof(tr->dtrs[0])) && i < NDTRS;
1543 i++) {
1544 if (is_hvm_domain(d))
1545 vmx_vcpu_itr_d(v, i, tr->dtrs[i].pte,
1546 tr->dtrs[i].itir,
1547 tr->dtrs[i].vadr);
1548 else
1549 vcpu_set_dtr(v, i,
1550 tr->dtrs[i].pte,
1551 tr->dtrs[i].itir,
1552 tr->dtrs[i].vadr,
1553 tr->dtrs[i].rid);
1555 v->arch.event_callback_ip = c.nat->event_callback_ip;
1556 vcpu_set_iva(v, c.nat->regs.cr.iva);
1559 if (is_hvm_domain(d))
1560 rc = vmx_arch_set_info_guest(v, c);
1562 return rc;
1565 static int relinquish_memory(struct domain *d, struct list_head *list)
1567 struct list_head *ent;
1568 struct page_info *page;
1569 #ifndef __ia64__
1570 unsigned long x, y;
1571 #endif
1572 int ret = 0;
1574 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1575 spin_lock_recursive(&d->page_alloc_lock);
1576 ent = list->next;
1577 while ( ent != list )
1579 page = list_entry(ent, struct page_info, list);
1580 /* Grab a reference to the page so it won't disappear from under us. */
1581 if ( unlikely(!get_page(page, d)) )
1583 /* Couldn't get a reference -- someone is freeing this page. */
1584 ent = ent->next;
1585 list_move_tail(&page->list, &d->arch.relmem_list);
1586 continue;
1589 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1590 put_page_and_type(page);
1592 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1593 put_page(page);
1595 #ifndef __ia64__
1596 /*
1597 * Forcibly invalidate base page tables at this point to break circular
1598 * 'linear page table' references. This is okay because MMU structures
1599 * are not shared across domains and this domain is now dead. Thus base
1600 * tables are not in use so a non-zero count means circular reference.
1601 */
1602 y = page->u.inuse.type_info;
1603 for ( ; ; )
1605 x = y;
1606 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
1607 (PGT_base_page_table|PGT_validated)) )
1608 break;
1610 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
1611 if ( likely(y == x) )
1613 free_page_type(page, PGT_base_page_table);
1614 break;
1617 #endif
1619 /* Follow the list chain and /then/ potentially free the page. */
1620 ent = ent->next;
1621 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
1622 list_move_tail(&page->list, &d->arch.relmem_list);
1623 put_page(page);
1625 if (hypercall_preempt_check()) {
1626 ret = -EAGAIN;
1627 goto out;
1631 list_splice_init(&d->arch.relmem_list, list);
1633 out:
1634 spin_unlock_recursive(&d->page_alloc_lock);
1635 return ret;
1638 int domain_relinquish_resources(struct domain *d)
1639 {
1640 int ret = 0;
1642 switch (d->arch.relres) {
1643 case RELRES_not_started:
1644 /* Relinquish guest resources for VT-i domain. */
1645 if (is_hvm_domain(d))
1646 vmx_relinquish_guest_resources(d);
1647 d->arch.relres = RELRES_mm_teardown;
1648 /*fallthrough*/
1650 case RELRES_mm_teardown:
1651 /* Tear down shadow mode stuff. */
1652 ret = mm_teardown(d);
1653 if (ret != 0)
1654 return ret;
1655 d->arch.relres = RELRES_xen;
1656 /* fallthrough */
1658 case RELRES_xen:
1659 /* Relinquish every xen page of memory. */
1660 ret = relinquish_memory(d, &d->xenpage_list);
1661 if (ret != 0)
1662 return ret;
1663 d->arch.relres = RELRES_dom;
1664 /* fallthrough */
1666 case RELRES_dom:
1667 /* Relinquish every domain page of memory. */
1668 ret = relinquish_memory(d, &d->page_list);
1669 if (ret != 0)
1670 return ret;
1671 d->arch.relres = RELRES_done;
1672 /* fallthrough */
1674 case RELRES_done:
1675 break;
1677 default:
1678 BUG();
1681 if (is_hvm_domain(d) && d->arch.sal_data)
1682 xfree(d->arch.sal_data);
1684 /* Free page used by xen oprofile buffer */
1685 free_xenoprof_pages(d);
1687 return 0;
1688 }
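/*
 * domain_relinquish_resources() is written as a resumable state machine:
 * d->arch.relres records how far teardown has progressed, and each stage
 * bails out early (via hypercall_preempt_check() in relinquish_memory()
 * and mm_teardown()) so that huge domains don't hold the cpu for too long.
 * Hypothetical caller-side view (sketch, not built); in reality the retry
 * is driven by the domain-destroy hypercall path, not a simple loop.
 */
#if 0
static void teardown_until_done(struct domain *d)
{
	int ret;

	do {
		ret = domain_relinquish_resources(d);
		/* -EAGAIN means "call me again"; d->arch.relres remembers
		 * where to resume. */
	} while (ret == -EAGAIN);

	BUG_ON(ret != 0);
}
#endif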
1690 unsigned long
1691 domain_set_shared_info_va (unsigned long va)
1693 struct vcpu *v = current;
1694 struct domain *d = v->domain;
1695 int rc;
1697 /* Check virtual address:
1698 must belong to region 7,
1699 must be 64Kb aligned,
1700 must not be within Xen virtual space. */
1701 if ((va >> 61) != 7
1702 || (va & 0xffffUL) != 0
1703 || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
1704 panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);
1706 /* Note: this doesn't work well if other cpus are already running.
1707 However this is part of the spec :-) */
1708 gdprintk(XENLOG_DEBUG, "Domain set shared_info_va to 0x%016lx\n", va);
1709 d->arch.shared_info_va = va;
1711 VCPU(v, interrupt_mask_addr) = (unsigned char *)va +
1712 INT_ENABLE_OFFSET(v);
1714 __ia64_per_cpu_var(current_psr_i_addr) =
1715 (uint8_t*)(va + INT_ENABLE_OFFSET(current));
1716 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)(va + XSI_PSR_IC_OFS);
1718 /* Remap the shared pages. */
1719 rc = !set_one_rr(7UL << 61, PSCB(v,rrs[7]));
1720 BUG_ON(rc);
1722 return rc;
1725 /* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
1726 #define SHADOW_COPY_CHUNK 1024
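/*
 * Stand-alone sketch (not built) of the copy-and-clear pattern the CLEAN
 * op below uses: working in SHADOW_COPY_CHUNK-byte pieces means the chunk
 * just copied out is still hot in the L1 cache when it is zeroed again.
 * Plain memcpy() stands in for copy_to_guest_offset() here.
 */
#if 0
static void copy_and_clear(uint8_t *dst, uint8_t *src, unsigned long nbytes)
{
	unsigned long i;

	for (i = 0; i < nbytes; i += SHADOW_COPY_CHUNK) {
		unsigned long chunk = (nbytes - i) > SHADOW_COPY_CHUNK ?
			SHADOW_COPY_CHUNK : (nbytes - i);

		memcpy(dst + i, src + i, chunk);   /* report the dirty bits */
		memset(src + i, 0, chunk);         /* then reset them       */
	}
}
#endif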
1728 int shadow_mode_control(struct domain *d, xen_domctl_shadow_op_t *sc)
1730 unsigned int op = sc->op;
1731 int rc = 0;
1732 int i;
1733 //struct vcpu *v;
1735 if (unlikely(d == current->domain)) {
1736 gdprintk(XENLOG_INFO,
1737 "Don't try to do a shadow op on yourself!\n");
1738 return -EINVAL;
1741 domain_pause(d);
1743 switch (op)
1745 case XEN_DOMCTL_SHADOW_OP_OFF:
1746 if (shadow_mode_enabled (d)) {
1747 u64 *bm = d->arch.shadow_bitmap;
1748 struct vcpu *v;
1750 for_each_vcpu(d, v)
1751 v->arch.shadow_bitmap = NULL;
1753 /* Flush vhpt and tlb to restore dirty bit usage. */
1754 domain_flush_tlb_vhpt(d);
1756 /* Free bitmap. */
1757 d->arch.shadow_bitmap_size = 0;
1758 d->arch.shadow_bitmap = NULL;
1759 xfree(bm);
1761 break;
1763 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
1764 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
1765 rc = -EINVAL;
1766 break;
1768 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
1769 if (shadow_mode_enabled(d)) {
1770 rc = -EINVAL;
1771 break;
1774 atomic64_set(&d->arch.shadow_fault_count, 0);
1775 atomic64_set(&d->arch.shadow_dirty_count, 0);
1777 d->arch.shadow_bitmap_size =
1778 ((d->arch.convmem_end >> PAGE_SHIFT) +
1779 BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
1780 d->arch.shadow_bitmap = xmalloc_array(unsigned long,
1781 d->arch.shadow_bitmap_size / BITS_PER_LONG);
1782 if (d->arch.shadow_bitmap == NULL) {
1783 d->arch.shadow_bitmap_size = 0;
1784 rc = -ENOMEM;
1786 else {
1787 struct vcpu *v;
1788 memset(d->arch.shadow_bitmap, 0,
1789 d->arch.shadow_bitmap_size / 8);
1791 for_each_vcpu(d, v)
1792 v->arch.shadow_bitmap = d->arch.shadow_bitmap;
1793 /* Flush vhpt and tlb to enable dirty bit
1794 virtualization. */
1795 domain_flush_tlb_vhpt(d);
1797 break;
1799 case XEN_DOMCTL_SHADOW_OP_CLEAN:
1801 int nbr_bytes;
1803 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
1804 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
1806 atomic64_set(&d->arch.shadow_fault_count, 0);
1807 atomic64_set(&d->arch.shadow_dirty_count, 0);
1809 if (guest_handle_is_null(sc->dirty_bitmap) ||
1810 (d->arch.shadow_bitmap == NULL)) {
1811 rc = -EINVAL;
1812 break;
1815 if (sc->pages > d->arch.shadow_bitmap_size)
1816 sc->pages = d->arch.shadow_bitmap_size;
1818 nbr_bytes = (sc->pages + 7) / 8;
1820 for (i = 0; i < nbr_bytes; i += SHADOW_COPY_CHUNK) {
1821 int size = (nbr_bytes - i) > SHADOW_COPY_CHUNK ?
1822 SHADOW_COPY_CHUNK : nbr_bytes - i;
1824 if (copy_to_guest_offset(
1825 sc->dirty_bitmap, i,
1826 (uint8_t *)d->arch.shadow_bitmap + i,
1827 size)) {
1828 rc = -EFAULT;
1829 break;
1832 memset((uint8_t *)d->arch.shadow_bitmap + i, 0, size);
1835 break;
1838 case XEN_DOMCTL_SHADOW_OP_PEEK:
1840 unsigned long size;
1842 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
1843 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
1845 if (guest_handle_is_null(sc->dirty_bitmap) ||
1846 (d->arch.shadow_bitmap == NULL)) {
1847 rc = -EINVAL;
1848 break;
1851 if (sc->pages > d->arch.shadow_bitmap_size)
1852 sc->pages = d->arch.shadow_bitmap_size;
1854 size = (sc->pages + 7) / 8;
1855 if (copy_to_guest(sc->dirty_bitmap,
1856 (uint8_t *)d->arch.shadow_bitmap, size)) {
1857 rc = -EFAULT;
1858 break;
1860 break;
1862 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
1863 sc->mb = 0;
1864 break;
1865 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
1866 if (sc->mb > 0) {
1867 BUG();
1868 rc = -ENOMEM;
1870 break;
1871 default:
1872 rc = -EINVAL;
1873 break;
1876 domain_unpause(d);
1878 return rc;
1881 // remove following line if not privifying in memory
1882 //#define HAVE_PRIVIFY_MEMORY
1883 #ifndef HAVE_PRIVIFY_MEMORY
1884 #define privify_memory(x,y) do {} while(0)
1885 #endif
1887 static void __init loaddomainelfimage(struct domain *d, struct elf_binary *elf,
1888 unsigned long phys_load_offset)
1890 const elf_phdr *phdr;
1891 int phnum, h, filesz, memsz;
1892 unsigned long elfaddr, dom_mpaddr, dom_imva;
1893 struct page_info *p;
1895 phnum = elf_uval(elf, elf->ehdr, e_phnum);
1896 for (h = 0; h < phnum; h++) {
1897 phdr = elf_phdr_by_index(elf, h);
1898 if (!elf_phdr_is_loadable(elf, phdr))
1899 continue;
1901 filesz = elf_uval(elf, phdr, p_filesz);
1902 memsz = elf_uval(elf, phdr, p_memsz);
1903 elfaddr = (unsigned long) elf->image + elf_uval(elf, phdr, p_offset);
1904 dom_mpaddr = elf_uval(elf, phdr, p_paddr);
1905 dom_mpaddr += phys_load_offset;
1907 while (memsz > 0) {
1908 p = assign_new_domain_page(d,dom_mpaddr);
1909 BUG_ON (unlikely(p == NULL));
1910 dom_imva = __va_ul(page_to_maddr(p));
1911 if (filesz > 0) {
1912 if (filesz >= PAGE_SIZE)
1913 copy_page((void *) dom_imva,
1914 (void *) elfaddr);
1915 else {
1916 // copy partial page
1917 memcpy((void *) dom_imva,
1918 (void *) elfaddr, filesz);
1919 // zero the rest of page
1920 memset((void *) dom_imva+filesz, 0,
1921 PAGE_SIZE-filesz);
1922 }
1923 //FIXME: This test for code seems to find a lot more than objdump -x does
1924 if (elf_uval(elf, phdr, p_flags) & PF_X) {
1925 privify_memory(dom_imva,PAGE_SIZE);
1926 flush_icache_range(dom_imva,
1927 dom_imva+PAGE_SIZE);
1928 }
1929 }
1930 else if (memsz > 0) {
1931 /* always zero out entire page */
1932 clear_page((void *) dom_imva);
1933 }
1934 memsz -= PAGE_SIZE;
1935 filesz -= PAGE_SIZE;
1936 elfaddr += PAGE_SIZE;
1937 dom_mpaddr += PAGE_SIZE;
1938 }
1939 }
1940 }
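/*
 * Summary of the loop above, per loadable segment: each guest page is
 * allocated with assign_new_domain_page() and then either copied whole
 * (filesz >= PAGE_SIZE), copied partially with the rest zeroed
 * (0 < filesz < PAGE_SIZE), or simply cleared (BSS: filesz exhausted,
 * memsz not).  For example, a hypothetical segment with
 * p_filesz == 1.5 * PAGE_SIZE and p_memsz == 3 * PAGE_SIZE produces one
 * full copy, one partial copy with a zeroed tail, and one cleared page.
 * Copied pages of executable segments also get their icache flushed.
 */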
1942 static void __init calc_dom0_size(void)
1943 {
1944 unsigned long domheap_pages;
1945 unsigned long p2m_pages;
1946 unsigned long spare_hv_pages;
1947 unsigned long max_dom0_size;
1949 /* Estimate the maximum memory we can safely allocate for dom0
1950 * by subtracting the p2m table allocation and a chunk of memory
1951 * for DMA and PCI mappings from the available domheap pages. The
1952 * chunk for DMA, PCI, etc. is a guesstimate, as Xen doesn't have
1953 * a good way to know those requirements ahead of time; it is
1954 * calculated at 1MB per 4GB of system memory. */
1955 domheap_pages = avail_domheap_pages();
1956 p2m_pages = domheap_pages / PTRS_PER_PTE;
1957 spare_hv_pages = domheap_pages / 4096;
1958 max_dom0_size = (domheap_pages - (p2m_pages + spare_hv_pages))
1959 * PAGE_SIZE;
1960 printk("Maximum permitted dom0 size: %luMB\n",
1961 max_dom0_size / (1024*1024));
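/*
 * Worked example of the estimate above (illustrative numbers, assuming
 * 16KB pages, so PTRS_PER_PTE == 2048, and 8GB of domheap):
 *
 *   domheap_pages  = 8GB / 16KB    = 524288
 *   p2m_pages      = 524288 / 2048 = 256  (4MB: 8 bytes per page)
 *   spare_hv_pages = 524288 / 4096 = 128  (2MB, i.e. 1MB per 4GB)
 *   max_dom0_size  = (524288 - 384) * 16KB, roughly 8GB - 6MB
 */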
1963 /* validate proposed dom0_size, fix up as needed */
1964 if (dom0_size > max_dom0_size) {
1965 printk("Reducing dom0 memory allocation from %luK to %luK "
1966 "to fit available memory\n",
1967 dom0_size / 1024, max_dom0_size / 1024);
1968 dom0_size = max_dom0_size;
1969 }
1971 /* dom0_mem=0 can be passed in to give all available mem to dom0 */
1972 if (dom0_size == 0) {
1973 printk("Allocating all available memory to dom0\n");
1974 dom0_size = max_dom0_size;
1975 }
1977 /* Check dom0 size. */
1978 if (dom0_size < 4 * 1024 * 1024) {
1979 panic("dom0_mem is too small, boot aborted"
1980 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
1983 if (running_on_sim) {
1984 dom0_size = 128*1024*1024; //FIXME: Should be configurable
1985 }
1987 /* No need to allocate pages for now;
1988 * pages are allocated by map_new_domain_page() via loaddomainelfimage().
1989 */
1990 }
1993 /*
1994 * Domain 0 has direct access to all devices. The main point of
1995 * this stub, however, is to allow alloc_dom_mem to handle
1996 * order > 0 requests: dom0 requires that bit set in order to
1997 * allocate memory for other domains.
1998 */
1999 static void __init physdev_init_dom0(struct domain *d)
2000 {
2001 if (iomem_permit_access(d, 0UL, ~0UL))
2002 BUG();
2003 if (irqs_permit_access(d, 0, NR_IRQS-1))
2004 BUG();
2005 if (ioports_permit_access(d, 0, 0xffff))
2006 BUG();
2007 }
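/*
 * The three calls above hand dom0 the whole machine: every I/O memory
 * frame (0 .. ~0UL), every interrupt line (0 .. NR_IRQS-1) and the
 * entire 0x0000-0xffff I/O-port range.  A driver domain would normally
 * be granted only the ranges it needs through the same interfaces,
 * e.g. (illustrative call; start_mfn/end_mfn are placeholders):
 *
 *   iomem_permit_access(d, start_mfn, end_mfn);
 */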
2009 int __init construct_dom0(struct domain *d,
2010 unsigned long image_start, unsigned long image_len,
2011 unsigned long initrd_start, unsigned long initrd_len,
2012 char *cmdline)
2013 {
2014 int i, rc;
2015 start_info_t *si;
2016 dom0_vga_console_info_t *ci;
2017 struct vcpu *v = d->vcpu[0];
2018 unsigned long max_pages;
2020 struct elf_binary elf;
2021 struct elf_dom_parms parms;
2022 unsigned long p_start;
2023 unsigned long pkern_start;
2024 unsigned long pkern_entry;
2025 unsigned long pkern_end;
2026 unsigned long pinitrd_start = 0;
2027 unsigned long pstart_info;
2028 unsigned long phys_load_offset;
2029 struct page_info *start_info_page;
2030 unsigned long bp_mpa;
2031 struct ia64_boot_param *bp;
2033 //printk("construct_dom0: starting\n");
2035 /* Sanity! */
2036 BUG_ON(d != dom0);
2037 BUG_ON(d->vcpu[0] == NULL);
2038 BUG_ON(v->is_initialised);
2040 printk("*** LOADING DOMAIN 0 ***\n");
2042 calc_dom0_size();
2044 max_pages = dom0_size / PAGE_SIZE;
2045 d->max_pages = max_pages;
2046 d->tot_pages = 0;
2048 rc = elf_init(&elf, (void*)image_start, image_len);
2049 if ( rc != 0 )
2050 return rc;
2051 #ifdef VERBOSE
2052 elf_set_verbose(&elf);
2053 #endif
2054 elf_parse_binary(&elf);
2055 if (0 != (rc = elf_xen_parse(&elf, &parms)))
2056 return rc;
2058 /*
2059 * We cannot rely on the load address in the ELF headers to
2060 * determine the meta physical address at which the image
2061 * is loaded. Patch the address to match the real one, based
2062 * on xen_pstart
2063 */
2064 phys_load_offset = xen_pstart - elf.pstart;
2065 elf.pstart += phys_load_offset;
2066 elf.pend += phys_load_offset;
2067 parms.virt_kstart += phys_load_offset;
2068 parms.virt_kend += phys_load_offset;
2069 parms.virt_entry += phys_load_offset;
2071 printk(" Dom0 kernel: %s, %s, paddr 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
2072 elf_64bit(&elf) ? "64-bit" : "32-bit",
2073 elf_msb(&elf) ? "msb" : "lsb",
2074 elf.pstart, elf.pend);
2075 if (!elf_64bit(&elf) ||
2076 elf_uval(&elf, elf.ehdr, e_machine) != EM_IA_64) {
2077 printk("Incompatible kernel binary\n");
2078 return -1;
2079 }
2081 p_start = parms.virt_base;
2082 pkern_start = parms.virt_kstart;
2083 pkern_end = parms.virt_kend;
2084 pkern_entry = parms.virt_entry;
2086 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
2088 if ( (p_start & (PAGE_SIZE-1)) != 0 )
2089 {
2090 printk("Initial guest OS must load to a page boundary.\n");
2091 return -EINVAL;
2092 }
2094 pstart_info = PAGE_ALIGN(pkern_end);
2095 if(initrd_start && initrd_len){
2096 unsigned long offset;
2098 /* The next page-aligned boundary after the start info.
2099 Note: EFI_PAGE_SHIFT = 12 <= PAGE_SHIFT. */
2100 pinitrd_start = pstart_info + PAGE_SIZE;
2102 if ((pinitrd_start + initrd_len - phys_load_offset) >= dom0_size)
2103 panic("%s: not enough memory assigned to dom0", __func__);
2105 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
2106 struct page_info *p;
2107 p = assign_new_domain_page(d, pinitrd_start + offset);
2108 if (p == NULL)
2109 panic("%s: can't allocate page for initrd image", __func__);
2110 if (initrd_len < offset + PAGE_SIZE)
2111 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
2112 initrd_len - offset);
2113 else
2114 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
2115 }
2116 }
2118 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
2119 " Kernel image: %lx->%lx\n"
2120 " Entry address: %lx\n"
2121 " Init. ramdisk: %lx len %lx\n"
2122 " Start info.: %lx->%lx\n",
2123 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
2124 pstart_info, pstart_info + PAGE_SIZE);
2126 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
2127 {
2128 printk("Initial guest OS requires too much space\n"
2129 "(%luMB is greater than %luMB limit)\n",
2130 (pkern_end-pkern_start)>>20,
2131 (max_pages <<PAGE_SHIFT)>>20);
2132 return -ENOMEM;
2133 }
2135 // if high 3 bits of pkern start are non-zero, error
2137 // if pkern end is after end of metaphysical memory, error
2138 // (we should be able to deal with this... later)
2140 /* Mask all upcalls... */
2141 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
2142 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
2144 if (dom0_max_vcpus == 0)
2145 dom0_max_vcpus = MAX_VIRT_CPUS;
2146 if (dom0_max_vcpus > num_online_cpus())
2147 dom0_max_vcpus = num_online_cpus();
2148 if (dom0_max_vcpus > MAX_VIRT_CPUS)
2149 dom0_max_vcpus = MAX_VIRT_CPUS;
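/*
 * Net effect of the clamping above: dom0 gets
 * min(requested, num_online_cpus(), MAX_VIRT_CPUS) vcpus, and a
 * request of 0 means "as many as possible".  For example, asking for
 * 4 vcpus on a 2-way machine yields 2.
 */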
2151 printk ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
2152 for ( i = 1; i < dom0_max_vcpus; i++ )
2153 if (alloc_vcpu(d, i, i) == NULL)
2154 panic("Cannot allocate dom0 vcpu %d\n", i);
2156 /* Copy the OS image. */
2157 loaddomainelfimage(d, &elf, phys_load_offset);
2159 BUILD_BUG_ON(sizeof(start_info_t) + sizeof(dom0_vga_console_info_t) +
2160 sizeof(struct ia64_boot_param) > PAGE_SIZE);
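/*
 * Layout of the single start-info page set up below; the BUILD_BUG_ON
 * above guarantees that all three structures fit in one page:
 *
 *   pstart_info + 0                        start_info_t           (si)
 *   + sizeof(start_info_t)                 struct ia64_boot_param (bp)
 *   + sizeof(struct ia64_boot_param)       dom0_vga_console_info  (ci)
 */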
2162 /* Set up start info area. */
2163 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
2164 start_info_page = assign_new_domain_page(d, pstart_info);
2165 if (start_info_page == NULL)
2166 panic("can't allocate start info page");
2167 si = page_to_virt(start_info_page);
2168 clear_page(si);
2169 snprintf(si->magic, sizeof(si->magic), "xen-3.0-ia64");
2170 si->nr_pages = max_pages;
2171 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
2173 printk("Dom0: 0x%lx\n", (u64)dom0);
2175 v->is_initialised = 1;
2176 clear_bit(_VPF_down, &v->pause_flags);
2178 /* Build firmware.
2179 Note: the Linux kernel reserves the memory used by start_info, so
2180 there is no need to remove it from the MDT. */
2181 bp_mpa = pstart_info + sizeof(struct start_info);
2182 rc = dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
2183 if (rc != 0)
2184 return rc;
2186 /* Fill boot param. */
2187 strlcpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
2189 bp = (struct ia64_boot_param *)((unsigned char *)si +
2190 sizeof(start_info_t));
2191 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
2193 /* We assume the console has reached the last line! */
2194 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
2195 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
2196 bp->console_info.orig_x = 0;
2197 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
2198 0 : bp->console_info.num_rows - 1;
2200 bp->initrd_start = pinitrd_start;
2201 bp->initrd_size = ia64_boot_param->initrd_size;
2203 ci = (dom0_vga_console_info_t *)((unsigned char *)si +
2204 sizeof(start_info_t) +
2205 sizeof(struct ia64_boot_param));
2207 if (fill_console_start_info(ci)) {
2208 si->console.dom0.info_off = sizeof(start_info_t) +
2209 sizeof(struct ia64_boot_param);
2210 si->console.dom0.info_size = sizeof(dom0_vga_console_info_t);
2211 }
2213 vcpu_init_regs (v);
2215 vcpu_regs(v)->r28 = bp_mpa;
2217 vcpu_regs (v)->cr_iip = pkern_entry;
2219 physdev_init_dom0(d);
2221 return 0;
2222 }
2224 void machine_restart(void)
2225 {
2226 console_start_sync();
2227 if (running_on_sim)
2228 printk ("machine_restart called. spinning...\n");
2229 else
2230 (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
2231 while(1);
2232 }
2234 extern void cpu_halt(void);
2236 void machine_halt(void)
2237 {
2238 console_start_sync();
2240 #ifdef CONFIG_SMP
2241 smp_send_stop();
2242 #endif
2244 printk ("machine_halt called. spinning...\n");
2245 while(1);
2246 }
2248 void sync_vcpu_execstate(struct vcpu *v)
2249 {
2250 // __ia64_save_fpu(v->arch._thread.fph);
2251 // FIXME SMP: Anything else needed here for SMP?
2252 }
2254 /* This function is taken from xen/arch/x86/domain.c */
2255 long
2256 arch_do_vcpu_op(int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
2257 {
2258 long rc = 0;
2260 switch (cmd) {
2261 case VCPUOP_register_runstate_memory_area:
2262 {
2263 struct vcpu_register_runstate_memory_area area;
2264 struct vcpu_runstate_info runstate;
2266 rc = -EFAULT;
2267 if (copy_from_guest(&area, arg, 1))
2268 break;
2270 if (!guest_handle_okay(area.addr.h, 1))
2271 break;
2273 rc = 0;
2274 runstate_guest(v) = area.addr.h;
2276 if (v == current) {
2277 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
2278 } else {
2279 vcpu_runstate_get(v, &runstate);
2280 __copy_to_guest(runstate_guest(v), &runstate, 1);
2281 }
2283 break;
2284 }
2285 default:
2286 rc = -ENOSYS;
2287 break;
2288 }
2290 return rc;
2291 }
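/*
 * Guest-side view of VCPUOP_register_runstate_memory_area (a sketch of
 * how a paravirtualized kernel might register the area; the hypercall
 * wrapper and the per-cpu "runstate" variable follow the usual
 * Linux-on-Xen conventions and are not defined in this file):
 *
 *   struct vcpu_register_runstate_memory_area area;
 *   area.addr.v = &per_cpu(runstate, cpu);   // struct vcpu_runstate_info
 *   HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
 *
 * The immediate __copy_to_guest above ensures the area holds valid data
 * as soon as it is registered, even for the currently running vcpu.
 */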
2293 static void __init parse_dom0_mem(char *s)
2294 {
2295 dom0_size = parse_size_and_unit(s, NULL);
2296 }
2297 custom_param("dom0_mem", parse_dom0_mem);
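/*
 * Boot-line usage (values are examples): "dom0_mem=256M",
 * "dom0_mem=65536K" or "dom0_mem=4G"; parse_size_and_unit() accepts
 * the usual K/M/G suffixes.  "dom0_mem=0" gives dom0 all available
 * memory, and anything below 4MB makes calc_dom0_size() (above) panic.
 */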
2299 /*
2300 * Helper function for the optimization stuff handling the identity mapping
2301 * feature.
2302 */
2303 static inline unsigned long
2304 optf_identity_mapping_cmd_to_flg(unsigned long cmd)
2305 {
2306 switch(cmd) {
2307 case XEN_IA64_OPTF_IDENT_MAP_REG7:
2308 return XEN_IA64_OPTF_IDENT_MAP_REG7_FLG;
2309 case XEN_IA64_OPTF_IDENT_MAP_REG4:
2310 return XEN_IA64_OPTF_IDENT_MAP_REG4_FLG;
2311 case XEN_IA64_OPTF_IDENT_MAP_REG5:
2312 return XEN_IA64_OPTF_IDENT_MAP_REG5_FLG;
2313 default:
2314 BUG();
2315 return 0;
2316 }
2318 /* NOTREACHED */
2319 }
2321 static inline void
2322 optf_set_identity_mapping(unsigned long* mask, struct identity_mapping* im,
2323 struct xen_ia64_opt_feature* f)
2324 {
2325 unsigned long flag = optf_identity_mapping_cmd_to_flg(f->cmd);
2327 if (f->on) {
2328 *mask |= flag;
2329 im->pgprot = f->pgprot;
2330 im->key = f->key;
2331 } else {
2332 *mask &= ~flag;
2333 im->pgprot = 0;
2334 im->key = 0;
2335 }
2336 }
2338 /*
2339 * Switch an optimization feature on/off.
2340 * The vcpu must be paused to avoid racy access to opt_feature.
2341 */
2342 int
2343 domain_opt_feature(struct domain *d, struct xen_ia64_opt_feature* f)
2344 {
2345 struct opt_feature* optf = &d->arch.opt_feature;
2346 struct vcpu *v;
2347 long rc = 0;
2349 for_each_vcpu(d, v) {
2350 if (v != current)
2351 vcpu_pause(v);
2352 }
2354 switch (f->cmd) {
2355 case XEN_IA64_OPTF_IDENT_MAP_REG4:
2356 optf_set_identity_mapping(&optf->mask, &optf->im_reg4, f);
2357 break;
2358 case XEN_IA64_OPTF_IDENT_MAP_REG5:
2359 optf_set_identity_mapping(&optf->mask, &optf->im_reg5, f);
2360 break;
2361 case XEN_IA64_OPTF_IDENT_MAP_REG7:
2362 optf_set_identity_mapping(&optf->mask, &optf->im_reg7, f);
2363 break;
2364 default:
2365 printk("%s: unknown opt_feature: %ld\n", __func__, f->cmd);
2366 rc = -ENOSYS;
2367 break;
2368 }
2370 for_each_vcpu(d, v) {
2371 if (v != current)
2372 vcpu_unpause(v);
2373 }
2375 return rc;
2376 }
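/*
 * Guest-side view of the identity-mapping optimization (a sketch; the
 * hypercall wrapper name follows the usual convention and is not
 * defined in this file).  To map region 7 1:1, a paravirtualized
 * kernel fills in a struct xen_ia64_opt_feature and asks for the
 * feature to be switched on; domain_opt_feature() above then records
 * the pgprot/key for region 7 and sets the corresponding bit in
 * d->arch.opt_feature.mask:
 *
 *   struct xen_ia64_opt_feature optf = {
 *       .cmd    = XEN_IA64_OPTF_IDENT_MAP_REG7,
 *       .on     = 1,                          // switch the feature on
 *       .pgprot = <kernel page protection bits>,
 *       .key    = <protection key, usually 0>,
 *   };
 *   HYPERVISOR_opt_feature(&optf);
 */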