ia64/xen-unstable

xen/arch/ia64/xen/domain.c @ 16682:7515dc56c124

[IA64] Sort out the XEN_IA64_OPTF_IDENT_MAP_REG[457] constants confusion

Currently the constants are used for two different purposes:
one is the OPTF hypercall sub-command; the other is a bit flag for
struct opt_feature::mask. These are separate namespaces, so split
them apart.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
author Alex Williamson <alex.williamson@hp.com>
date Sun Dec 30 13:02:16 2007 -0700 (2007-12-30)
parents 2900e4dacaa7
children 09cd682ac68e
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * Copyright (C) 2005 Intel Co
8 * Kun Tian (Kevin Tian) <kevin.tian@intel.com>
9 *
10 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
11 *
12 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
13 * VA Linux Systems Japan K.K.
14 * dom0 vp model support
15 */
17 #include <xen/config.h>
18 #include <xen/init.h>
19 #include <xen/lib.h>
20 #include <xen/errno.h>
21 #include <xen/sched.h>
22 #include <xen/smp.h>
23 #include <xen/delay.h>
24 #include <xen/softirq.h>
25 #include <xen/mm.h>
26 #include <xen/iocap.h>
27 #include <asm/asm-xsi-offsets.h>
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <asm/processor.h>
31 #include <xen/event.h>
32 #include <xen/console.h>
33 #include <xen/version.h>
34 #include <public/libelf.h>
35 #include <asm/pgalloc.h>
36 #include <asm/offsets.h> /* for IA64_THREAD_INFO_SIZE */
37 #include <asm/vcpu.h> /* for function declarations */
38 #include <public/xen.h>
39 #include <xen/domain.h>
40 #include <asm/vmx.h>
41 #include <asm/vmx_vcpu.h>
42 #include <asm/vmx_vpd.h>
43 #include <asm/vmx_phy_mode.h>
44 #include <asm/vmx_vcpu_save.h>
45 #include <asm/vhpt.h>
46 #include <asm/vcpu.h>
47 #include <asm/tlbflush.h>
48 #include <asm/regionreg.h>
49 #include <asm/dom_fw.h>
50 #include <asm/shadow.h>
51 #include <xen/guest_access.h>
52 #include <asm/tlb_track.h>
53 #include <asm/perfmon.h>
54 #include <asm/sal.h>
55 #include <public/vcpu.h>
56 #include <linux/cpu.h>
57 #include <linux/notifier.h>
58 #include <asm/debugger.h>
60 /* dom0_size: default memory allocation for dom0 (~4GB) */
61 static unsigned long __initdata dom0_size = 4096UL*1024UL*1024UL;
63 /* dom0_max_vcpus: maximum number of VCPUs to create for dom0. */
64 static unsigned int __initdata dom0_max_vcpus = 4;
65 integer_param("dom0_max_vcpus", dom0_max_vcpus);
67 extern char dom0_command_line[];
69 /* forward declaration */
70 static void init_switch_stack(struct vcpu *v);
72 /* Address of vpsr.i (in fact evtchn_upcall_mask) of current vcpu.
73 This is a Xen virtual address. */
74 DEFINE_PER_CPU(uint8_t *, current_psr_i_addr);
75 DEFINE_PER_CPU(int *, current_psr_ic_addr);
77 DEFINE_PER_CPU(struct vcpu *, fp_owner);
79 #include <xen/sched-if.h>
81 static void
82 ia64_disable_vhpt_walker(void)
83 {
84 // Disable the VHPT. Without this, ia64_new_rr7() might cause a
85 // VHPT fault because it flushes dtr[IA64_TR_VHPT].
86 // (VHPT_SIZE_LOG2 << 2) is just to avoid a
87 // Reserved Register/Field fault.
88 ia64_set_pta(VHPT_SIZE_LOG2 << 2);
89 }
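#if 0 /* Illustrative sketch, not part of the original file. */
/*
 * How the PTA value above decomposes, per the architected cr.pta layout
 * (ve in bit 0, size in bits 7:2, vf in bit 8, base in bits 63:15).
 * Writing only the size field leaves ve == 0, i.e. the walker disabled,
 * while keeping cr.pta.size a legal value so no Reserved Register/Field
 * fault is raised.
 */
static inline unsigned long pta_pack(unsigned long base,
                                     unsigned long size_log2, int ve)
{
	return (base & ~((1UL << 15) - 1)) | (size_log2 << 2) | (ve ? 1 : 0);
}
/* ia64_set_pta(VHPT_SIZE_LOG2 << 2) == ia64_set_pta(pta_pack(0, VHPT_SIZE_LOG2, 0)) */
#endif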
91 static void flush_vtlb_for_context_switch(struct vcpu* prev, struct vcpu* next)
92 {
93 int cpu = smp_processor_id();
94 int last_vcpu_id, last_processor;
96 if (!is_idle_domain(prev->domain))
97 tlbflush_update_time
98 (&prev->domain->arch.last_vcpu[cpu].tlbflush_timestamp,
99 tlbflush_current_time());
101 if (is_idle_domain(next->domain))
102 return;
104 last_vcpu_id = next->domain->arch.last_vcpu[cpu].vcpu_id;
105 last_processor = next->arch.last_processor;
107 next->domain->arch.last_vcpu[cpu].vcpu_id = next->vcpu_id;
108 next->arch.last_processor = cpu;
110 if ((last_vcpu_id != next->vcpu_id &&
111 last_vcpu_id != INVALID_VCPU_ID) ||
112 (last_vcpu_id == next->vcpu_id &&
113 last_processor != cpu &&
114 last_processor != INVALID_PROCESSOR)) {
115 #ifdef CONFIG_XEN_IA64_TLBFLUSH_CLOCK
116 u32 last_tlbflush_timestamp =
117 next->domain->arch.last_vcpu[cpu].tlbflush_timestamp;
118 #endif
119 int vhpt_is_flushed = 0;
121 // if the vTLB implementation is changed,
122 // the following must be updated as well.
123 if (VMX_DOMAIN(next)) {
124 // currently the vTLB for a VT-i domain is per vcpu,
125 // so no flushing is needed.
126 } else if (HAS_PERVCPU_VHPT(next->domain)) {
127 // nothing to do
128 } else {
129 if (NEED_FLUSH(__get_cpu_var(vhpt_tlbflush_timestamp),
130 last_tlbflush_timestamp)) {
131 local_vhpt_flush();
132 vhpt_is_flushed = 1;
133 }
134 }
135 if (vhpt_is_flushed || NEED_FLUSH(__get_cpu_var(tlbflush_time),
136 last_tlbflush_timestamp)) {
137 local_flush_tlb_all();
138 perfc_incr(tlbflush_clock_cswitch_purge);
139 } else {
140 perfc_incr(tlbflush_clock_cswitch_skip);
141 }
142 perfc_incr(flush_vtlb_for_context_switch);
143 }
144 }
146 static void flush_cache_for_context_switch(struct vcpu *next)
147 {
148 extern cpumask_t cpu_cache_coherent_map;
149 int cpu = smp_processor_id();
151 if (is_idle_vcpu(next) ||
152 __test_and_clear_bit(cpu, &next->arch.cache_coherent_map)) {
153 if (cpu_test_and_clear(cpu, cpu_cache_coherent_map)) {
154 unsigned long flags;
155 u64 progress = 0;
156 s64 status;
158 local_irq_save(flags);
159 status = ia64_pal_cache_flush(4, 0, &progress, NULL);
160 local_irq_restore(flags);
161 if (status != 0)
162 panic_domain(NULL, "PAL_CACHE_FLUSH ERROR, "
163 "cache_type=4 status %lx", status);
164 }
165 }
166 }
168 static void lazy_fp_switch(struct vcpu *prev, struct vcpu *next)
169 {
170 /*
171 * Implement eager save, lazy restore
172 */
173 if (!is_idle_vcpu(prev)) {
174 if (VMX_DOMAIN(prev)) {
175 if (FP_PSR(prev) & IA64_PSR_MFH) {
176 __ia64_save_fpu(prev->arch._thread.fph);
177 __ia64_per_cpu_var(fp_owner) = prev;
178 }
179 } else {
180 if (PSCB(prev, hpsr_mfh)) {
181 __ia64_save_fpu(prev->arch._thread.fph);
182 __ia64_per_cpu_var(fp_owner) = prev;
183 }
184 }
185 }
187 if (!is_idle_vcpu(next)) {
188 if (VMX_DOMAIN(next)) {
189 FP_PSR(next) = IA64_PSR_DFH;
190 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
191 } else {
192 PSCB(next, hpsr_dfh) = 1;
193 PSCB(next, hpsr_mfh) = 0;
194 vcpu_regs(next)->cr_ipsr |= IA64_PSR_DFH;
195 }
196 }
197 }
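/*
 * Explanatory note (not original code): "eager save, lazy restore" here
 * means prev's high FP partition (f32-f127) is written out immediately
 * whenever psr.mfh indicates it was modified, while next merely gets
 * psr.dfh set. Its first access to f32-f127 will then raise a Disabled
 * Floating-Point Register fault, and the fault handler (elsewhere)
 * reloads fph at that point, so vcpus that never touch the high
 * partition never pay for the reload.
 */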
199 void schedule_tail(struct vcpu *prev)
200 {
201 extern char ia64_ivt;
203 context_saved(prev);
204 ia64_disable_vhpt_walker();
206 if (VMX_DOMAIN(current))
207 vmx_do_resume(current);
208 else {
209 if (VMX_DOMAIN(prev))
210 ia64_set_iva(&ia64_ivt);
211 load_region_regs(current);
212 ia64_set_pta(vcpu_pta(current));
213 vcpu_load_kernel_regs(current);
214 __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
215 shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
216 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)
217 (current->domain->arch.shared_info_va + XSI_PSR_IC_OFS);
218 migrate_timer(&current->arch.hlt_timer, current->processor);
219 }
220 flush_vtlb_for_context_switch(prev, current);
221 }
223 void context_switch(struct vcpu *prev, struct vcpu *next)
224 {
225 uint64_t spsr;
227 local_irq_save(spsr);
229 if (VMX_DOMAIN(prev)) {
230 vmx_save_state(prev);
231 if (!VMX_DOMAIN(next)) {
232 /* VMX domains can change the physical cr.dcr.
233 * Restore default to prevent leakage. */
234 uint64_t dcr = ia64_getreg(_IA64_REG_CR_DCR);
235 /* xenoprof:
236 * don't change psr.pp.
237 * It is manipulated by xenoprof.
238 */
239 dcr = (IA64_DEFAULT_DCR_BITS & ~IA64_DCR_PP) | (dcr & IA64_DCR_PP);
240 ia64_setreg(_IA64_REG_CR_DCR, dcr);
241 }
242 }
243 if (VMX_DOMAIN(next))
244 vmx_load_state(next);
246 ia64_disable_vhpt_walker();
247 lazy_fp_switch(prev, current);
249 if (prev->arch.dbg_used || next->arch.dbg_used) {
250 /*
251 * Load debug registers either because they are valid or to clear
252 * the previous one.
253 */
254 ia64_load_debug_regs(next->arch.dbr);
255 }
257 prev = ia64_switch_to(next);
259 /* Note: ia64_switch_to does not return here at vcpu initialization. */
261 if (VMX_DOMAIN(current)) {
262 vmx_load_all_rr(current);
263 migrate_timer(&current->arch.arch_vmx.vtm.vtm_timer,
264 current->processor);
265 } else {
266 struct domain *nd;
267 extern char ia64_ivt;
269 if (VMX_DOMAIN(prev))
270 ia64_set_iva(&ia64_ivt);
272 nd = current->domain;
273 if (!is_idle_domain(nd)) {
274 load_region_regs(current);
275 ia64_set_pta(vcpu_pta(current));
276 vcpu_load_kernel_regs(current);
277 if (vcpu_pkr_in_use(current))
278 vcpu_pkr_load_regs(current);
279 vcpu_set_next_timer(current);
280 if (vcpu_timer_expired(current))
281 vcpu_pend_timer(current);
282 __ia64_per_cpu_var(current_psr_i_addr) = &nd->shared_info->
283 vcpu_info[current->vcpu_id].evtchn_upcall_mask;
284 __ia64_per_cpu_var(current_psr_ic_addr) =
285 (int *)(nd->arch.shared_info_va + XSI_PSR_IC_OFS);
286 /* steal time accounting */
287 if (!guest_handle_is_null(runstate_guest(current)))
288 __copy_to_guest(runstate_guest(current), &current->runstate, 1);
289 } else {
290 /* When switching to the idle domain, we only need to disable the
291 * vhpt walker. All accesses that happen within the idle context
292 * will then be handled by TR mapping and identity mapping.
293 */
294 __ia64_per_cpu_var(current_psr_i_addr) = NULL;
295 __ia64_per_cpu_var(current_psr_ic_addr) = NULL;
296 }
297 }
298 local_irq_restore(spsr);
300 /* lazy fp */
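/*
 * Explanatory note (sketch, not original code): the cmpxchg below is
 * roughly the atomic equivalent of
 *
 *	if (per_cpu(fp_owner, last_processor) == current)
 *		per_cpu(fp_owner, last_processor) = NULL;
 *
 * so a stale fp_owner pointer left behind on the previous CPU cannot
 * later be taken as a live claim on this vcpu's fph state.
 */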
301 if (current->processor != current->arch.last_processor) {
302 unsigned long *addr;
303 addr = (unsigned long *)per_cpu_addr(fp_owner,
304 current->arch.last_processor);
305 ia64_cmpxchg(acq, addr, current, 0, 8);
306 }
308 flush_vtlb_for_context_switch(prev, current);
309 flush_cache_for_context_switch(current);
310 context_saved(prev);
311 }
313 void continue_running(struct vcpu *same)
314 {
315 /* nothing to do */
316 }
318 #ifdef CONFIG_PERFMON
319 static int pal_halt = 1;
320 static int can_do_pal_halt = 1;
322 static int __init nohalt_setup(char * str)
323 {
324 pal_halt = can_do_pal_halt = 0;
325 return 1;
326 }
327 __setup("nohalt", nohalt_setup);
329 void
330 update_pal_halt_status(int status)
331 {
332 can_do_pal_halt = pal_halt && status;
333 }
334 #else
335 #define can_do_pal_halt (1)
336 #endif
338 static void default_idle(void)
339 {
340 local_irq_disable();
341 if ( !softirq_pending(smp_processor_id()) ) {
342 if (can_do_pal_halt)
343 safe_halt();
344 else
345 cpu_relax();
346 }
347 local_irq_enable();
348 }
350 extern void play_dead(void);
352 static void continue_cpu_idle_loop(void)
353 {
354 int cpu = smp_processor_id();
356 for ( ; ; )
357 {
358 #ifdef IA64
359 // __IRQ_STAT(cpu, idle_timestamp) = jiffies
360 #else
361 irq_stat[cpu].idle_timestamp = jiffies;
362 #endif
363 page_scrub_schedule_work();
364 while ( !softirq_pending(cpu) )
365 default_idle();
366 raise_softirq(SCHEDULE_SOFTIRQ);
367 do_softirq();
368 if (!cpu_online(cpu))
369 play_dead();
370 }
371 }
373 void startup_cpu_idle_loop(void)
374 {
375 /* Just some sanity to ensure that the scheduler is set up okay. */
376 ASSERT(current->domain->domain_id == IDLE_DOMAIN_ID);
377 raise_softirq(SCHEDULE_SOFTIRQ);
379 continue_cpu_idle_loop();
380 }
382 /* compile time test for get_order(sizeof(mapped_regs_t)) !=
383 * get_order_from_shift(XMAPPEDREGS_SHIFT)
384 */
385 #if !(((1 << (XMAPPEDREGS_SHIFT - 1)) < MAPPED_REGS_T_SIZE) && \
386 (MAPPED_REGS_T_SIZE < (1 << (XMAPPEDREGS_SHIFT + 1))))
387 # error "XMAPPEDREGS_SHIFT doesn't match sizeof(mapped_regs_t)."
388 #endif
390 void hlt_timer_fn(void *data)
391 {
392 struct vcpu *v = data;
393 vcpu_unblock(v);
394 }
396 void relinquish_vcpu_resources(struct vcpu *v)
397 {
398 if (HAS_PERVCPU_VHPT(v->domain))
399 pervcpu_vhpt_free(v);
400 if (v->arch.privregs != NULL) {
401 free_xenheap_pages(v->arch.privregs,
402 get_order_from_shift(XMAPPEDREGS_SHIFT));
403 v->arch.privregs = NULL;
404 }
405 kill_timer(&v->arch.hlt_timer);
406 }
408 struct vcpu *alloc_vcpu_struct(void)
409 {
410 struct vcpu *v;
411 struct thread_info *ti;
412 static int first_allocation = 1;
414 if (first_allocation) {
415 first_allocation = 0;
416 /* Keep idle vcpu0 statically allocated at compile time, because
417 * some code inherited from Linux still requires it in the early phase.
418 */
419 return idle_vcpu[0];
420 }
422 if ((v = alloc_xenheap_pages(KERNEL_STACK_SIZE_ORDER)) == NULL)
423 return NULL;
424 memset(v, 0, sizeof(*v));
426 ti = alloc_thread_info(v);
427 /* Zero thread_info to reset some important fields, such as
428 * preempt_count.
429 */
430 memset(ti, 0, sizeof(struct thread_info));
431 init_switch_stack(v);
433 return v;
434 }
436 void free_vcpu_struct(struct vcpu *v)
437 {
438 free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
439 }
441 int vcpu_initialise(struct vcpu *v)
442 {
443 struct domain *d = v->domain;
445 if (!is_idle_domain(d)) {
446 v->arch.metaphysical_rid_dt = d->arch.metaphysical_rid_dt;
447 v->arch.metaphysical_rid_d = d->arch.metaphysical_rid_d;
448 /* Set default values to saved_rr. */
449 v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rid_dt;
450 v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rid_dt;
452 /* Is this correct?
453 It depends on the domain rid usage.
455 A domain may share rids among its processors (e.g. having a
456 global VHPT). In this case, we should also share rids
457 among vcpus and the rid range should be the same.
459 However a domain may have per-cpu rid allocation. In
460 this case we don't want to share rids among vcpus, but we may
461 do it if two vcpus are on the same cpu... */
463 v->arch.starting_rid = d->arch.starting_rid;
464 v->arch.ending_rid = d->arch.ending_rid;
465 v->arch.breakimm = d->arch.breakimm;
466 v->arch.last_processor = INVALID_PROCESSOR;
467 v->arch.vhpt_pg_shift = PAGE_SHIFT;
468 }
470 if (!VMX_DOMAIN(v))
471 init_timer(&v->arch.hlt_timer, hlt_timer_fn, v,
472 first_cpu(cpu_online_map));
474 return 0;
475 }
477 static void vcpu_share_privregs_with_guest(struct vcpu *v)
478 {
479 struct domain *d = v->domain;
480 int i, order = get_order_from_shift(XMAPPEDREGS_SHIFT);
482 for (i = 0; i < (1 << order); i++)
483 share_xen_page_with_guest(virt_to_page(v->arch.privregs) + i,
484 d, XENSHARE_writable);
485 /*
486 * XXX IA64_XMAPPEDREGS_PADDR
487 * Assign these pages into the guest pseudo-physical address
488 * space so that dom0 can map them by gmfn.
489 * This is necessary for domain save, restore and dump-core.
490 */
491 for (i = 0; i < XMAPPEDREGS_SIZE; i += PAGE_SIZE)
492 assign_domain_page(d, IA64_XMAPPEDREGS_PADDR(v->vcpu_id) + i,
493 virt_to_maddr(v->arch.privregs + i));
494 }
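/*
 * Explanatory note (the tool-side flow is an assumption, not shown in
 * this file): because each privregs page is shared writable with the
 * guest and also given a fixed pseudo-physical address, a dom0 tool can
 * reach vcpu N's mapped registers through the gmfns starting at
 * IA64_XMAPPEDREGS_PADDR(N) >> PAGE_SHIFT (e.g. with libxc's foreign
 * mapping calls), which is what save, restore and dump-core rely on.
 */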
496 int vcpu_late_initialise(struct vcpu *v)
497 {
498 struct domain *d = v->domain;
499 int rc, order;
501 if (HAS_PERVCPU_VHPT(d)) {
502 rc = pervcpu_vhpt_alloc(v);
503 if (rc != 0)
504 return rc;
505 }
507 /* Create privregs page. */
508 order = get_order_from_shift(XMAPPEDREGS_SHIFT);
509 v->arch.privregs = alloc_xenheap_pages(order);
510 BUG_ON(v->arch.privregs == NULL);
511 memset(v->arch.privregs, 0, 1 << XMAPPEDREGS_SHIFT);
512 vcpu_share_privregs_with_guest(v);
514 return 0;
515 }
517 void vcpu_destroy(struct vcpu *v)
518 {
519 if (v->domain->arch.is_vti)
520 vmx_relinquish_vcpu_resources(v);
521 else
522 relinquish_vcpu_resources(v);
523 }
525 static unsigned long*
526 vcpu_to_rbs_bottom(struct vcpu *v)
527 {
528 return (unsigned long*)((char *)v + IA64_RBS_OFFSET);
529 }
531 static void init_switch_stack(struct vcpu *v)
532 {
533 struct pt_regs *regs = vcpu_regs (v);
534 struct switch_stack *sw = (struct switch_stack *) regs - 1;
535 extern void ia64_ret_from_clone;
537 memset(sw, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs));
538 sw->ar_bspstore = (unsigned long)vcpu_to_rbs_bottom(v);
539 sw->b0 = (unsigned long) &ia64_ret_from_clone;
540 sw->ar_fpsr = FPSR_DEFAULT;
541 v->arch._thread.ksp = (unsigned long) sw - 16;
542 // stay on the kernel stack because we may get interrupts!
543 // ia64_ret_from_clone switches to the user stack
544 v->arch._thread.on_ustack = 0;
545 memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
546 }
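/*
 * Explanatory sketch of the per-vcpu stack layout set up above (assuming
 * the usual ia64 convention that vcpu_regs(v) sits just below the top of
 * the KERNEL_STACK_SIZE area):
 *
 *   low:  v                     struct vcpu + thread_info
 *         v + IA64_RBS_OFFSET   RBS bottom (ar.bspstore), grows upward
 *         ...
 *         _thread.ksp           == (char *)sw - 16, a 16-byte scratch area
 *         sw                    struct switch_stack
 *   high: vcpu_regs(v)          struct pt_regs, just below the stack top
 *
 * The memory stack grows down from pt_regs while the register backing
 * store grows up from IA64_RBS_OFFSET, so the two approach each other.
 */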
548 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
549 static int opt_pervcpu_vhpt = 1;
550 integer_param("pervcpu_vhpt", opt_pervcpu_vhpt);
551 #endif
553 int arch_domain_create(struct domain *d)
554 {
555 int i;
557 // the following will eventually need to be negotiated dynamically
558 d->arch.shared_info_va = DEFAULT_SHAREDINFO_ADDR;
559 d->arch.breakimm = 0x1000;
560 for (i = 0; i < NR_CPUS; i++) {
561 d->arch.last_vcpu[i].vcpu_id = INVALID_VCPU_ID;
562 }
564 if (is_idle_domain(d))
565 return 0;
567 foreign_p2m_init(d);
568 #ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
569 d->arch.has_pervcpu_vhpt = opt_pervcpu_vhpt;
570 dprintk(XENLOG_INFO, "%s:%d domain %d pervcpu_vhpt %d\n",
571 __func__, __LINE__, d->domain_id, d->arch.has_pervcpu_vhpt);
572 #endif
573 if (tlb_track_create(d) < 0)
574 goto fail_nomem1;
575 d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
576 if (d->shared_info == NULL)
577 goto fail_nomem;
578 memset(d->shared_info, 0, XSI_SIZE);
579 for (i = 0; i < XSI_SIZE; i += PAGE_SIZE)
580 share_xen_page_with_guest(virt_to_page((char *)d->shared_info + i),
581 d, XENSHARE_writable);
583 /* We may also need an emulation rid for region4, though it's unlikely
584 * that a guest would issue uncacheable accesses in metaphysical mode.
585 * But keeping such info here may be saner.
586 */
587 if (!allocate_rid_range(d,0))
588 goto fail_nomem;
590 memset(&d->arch.mm, 0, sizeof(d->arch.mm));
591 d->arch.relres = RELRES_not_started;
592 d->arch.mm_teardown_offset = 0;
593 INIT_LIST_HEAD(&d->arch.relmem_list);
595 if ((d->arch.mm.pgd = pgd_alloc(&d->arch.mm)) == NULL)
596 goto fail_nomem;
598 /*
599 * grant_table_create() can't fully initialize grant table for domain
600 * because it is called before arch_domain_create().
601 * Here we complete the initialization which requires p2m table.
602 */
603 spin_lock(&d->grant_table->lock);
604 for (i = 0; i < nr_grant_frames(d->grant_table); i++)
605 ia64_gnttab_create_shared_page(d, d->grant_table, i);
606 spin_unlock(&d->grant_table->lock);
608 d->arch.ioport_caps = rangeset_new(d, "I/O Ports",
609 RANGESETF_prettyprint_hex);
611 dprintk(XENLOG_DEBUG, "arch_domain_create: domain=%p\n", d);
612 return 0;
614 fail_nomem:
615 tlb_track_destroy(d);
616 fail_nomem1:
617 if (d->arch.mm.pgd != NULL)
618 pgd_free(d->arch.mm.pgd);
619 if (d->shared_info != NULL)
620 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
621 return -ENOMEM;
622 }
624 void arch_domain_destroy(struct domain *d)
625 {
626 mm_final_teardown(d);
628 if (d->shared_info != NULL)
629 free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
631 tlb_track_destroy(d);
633 /* Clear vTLB for the next domain. */
634 domain_flush_tlb_vhpt(d);
636 deallocate_rid_range(d);
637 }
639 int arch_vcpu_reset(struct vcpu *v)
640 {
641 /* FIXME: Stub for now */
642 return 0;
643 }
645 /* Here it is assumed that all of the CPUs have the same RSE.N_STACKED_PHYS */
646 static unsigned long num_phys_stacked;
647 static int __init
648 init_num_phys_stacked(void)
649 {
650 switch (ia64_pal_rse_info(&num_phys_stacked, NULL)) {
651 case 0L:
652 printk("the number of physical stacked general registers "
653 "(RSE.N_STACKED_PHYS) = %ld\n", num_phys_stacked);
654 return 0;
655 case -2L:
656 case -3L:
657 default:
658 break;
659 }
660 printk("WARNING: PAL_RSE_INFO call failed. "
661 "domain save/restore may NOT work!\n");
662 return -EINVAL;
663 }
664 __initcall(init_num_phys_stacked);
666 #define COPY_FPREG(dst, src) memcpy(dst, src, sizeof(struct ia64_fpreg))
668 #define AR_PFS_PEC_SHIFT 51
669 #define AR_PFS_REC_SIZE 6
670 #define AR_PFS_PEC_MASK (((1UL << 6) - 1) << 51)
672 /*
673 * See init_switch_stack() and ptrace.h
674 */
675 static struct switch_stack*
676 vcpu_to_switch_stack(struct vcpu* v)
677 {
678 return (struct switch_stack *)(v->arch._thread.ksp + 16);
679 }
681 static int
682 vcpu_has_not_run(struct vcpu* v)
683 {
684 extern void ia64_ret_from_clone;
685 struct switch_stack *sw = vcpu_to_switch_stack(v);
687 return (sw == (struct switch_stack *)(vcpu_regs(v)) - 1) &&
688 (sw->b0 == (unsigned long)&ia64_ret_from_clone);
689 }
691 static void
692 nats_update(unsigned int* nats, unsigned int reg, char nat)
693 {
694 BUG_ON(reg > 31);
696 if (nat)
697 *nats |= (1UL << reg);
698 else
699 *nats &= ~(1UL << reg);
700 }
702 void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c)
703 {
704 int i;
705 struct vcpu_tr_regs *tr = &c.nat->regs.tr;
706 struct cpu_user_regs *uregs = vcpu_regs(v);
707 struct switch_stack *sw = vcpu_to_switch_stack(v);
708 struct unw_frame_info info;
709 int is_hvm = VMX_DOMAIN(v);
710 unsigned int rbs_size;
711 unsigned long *const rbs_bottom = vcpu_to_rbs_bottom(v);
712 unsigned long *rbs_top;
713 unsigned long *rbs_rnat_addr;
714 unsigned int top_slot;
715 unsigned int num_regs;
717 memset(c.nat, 0, sizeof(*c.nat));
718 c.nat->regs.b[6] = uregs->b6;
719 c.nat->regs.b[7] = uregs->b7;
721 memset(&info, 0, sizeof(info));
722 unw_init_from_blocked_task(&info, v);
723 if (vcpu_has_not_run(v)) {
724 c.nat->regs.ar.lc = sw->ar_lc;
725 c.nat->regs.ar.ec =
726 (sw->ar_pfs & AR_PFS_PEC_MASK) >> AR_PFS_PEC_SHIFT;
727 } else if (unw_unwind_to_user(&info) < 0) {
728 /* warn: should panic? */
729 gdprintk(XENLOG_ERR, "vcpu=%d unw_unwind_to_user() failed.\n",
730 v->vcpu_id);
731 show_stack(v, NULL);
733 /* can't return error */
734 c.nat->regs.ar.lc = 0;
735 c.nat->regs.ar.ec = 0;
736 } else {
737 unw_get_ar(&info, UNW_AR_LC, &c.nat->regs.ar.lc);
738 unw_get_ar(&info, UNW_AR_EC, &c.nat->regs.ar.ec);
739 }
740 c.nat->regs.ar.csd = uregs->ar_csd;
741 c.nat->regs.ar.ssd = uregs->ar_ssd;
743 c.nat->regs.r[8] = uregs->r8;
744 c.nat->regs.r[9] = uregs->r9;
745 c.nat->regs.r[10] = uregs->r10;
746 c.nat->regs.r[11] = uregs->r11;
748 if (is_hvm)
749 c.nat->regs.psr = vmx_vcpu_get_psr(v);
750 else
751 c.nat->regs.psr = vcpu_get_psr(v);
753 c.nat->regs.ip = uregs->cr_iip;
754 c.nat->regs.cfm = uregs->cr_ifs;
756 c.nat->regs.ar.unat = uregs->ar_unat;
757 c.nat->regs.ar.pfs = uregs->ar_pfs;
758 c.nat->regs.ar.rsc = uregs->ar_rsc;
759 c.nat->regs.ar.rnat = uregs->ar_rnat;
760 c.nat->regs.ar.bspstore = uregs->ar_bspstore;
762 c.nat->regs.pr = uregs->pr;
763 c.nat->regs.b[0] = uregs->b0;
764 rbs_size = uregs->loadrs >> 16;
765 num_regs = ia64_rse_num_regs(rbs_bottom,
766 (unsigned long*)((char*)rbs_bottom + rbs_size));
767 c.nat->regs.ar.bsp = (unsigned long)ia64_rse_skip_regs(
768 (unsigned long*)c.nat->regs.ar.bspstore, num_regs);
769 BUG_ON(num_regs > num_phys_stacked);
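/*
 * Explanatory note (not original code): pt_regs keeps the dirty-RBS size
 * pre-shifted because the loadrs field occupies bits 29:16 of ar.rsc, so
 * ">> 16" above recovers the size in bytes. ia64_rse_num_regs() turns
 * that byte span into a register count, skipping the NaT-collection word
 * the RSE stores after every 63 registers, and ia64_rse_skip_regs() is
 * its inverse: starting from the guest's own ar.bspstore it re-derives
 * the ar.bsp value the guest would observe.
 */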
771 c.nat->regs.r[1] = uregs->r1;
772 c.nat->regs.r[12] = uregs->r12;
773 c.nat->regs.r[13] = uregs->r13;
774 c.nat->regs.ar.fpsr = uregs->ar_fpsr;
775 c.nat->regs.r[15] = uregs->r15;
777 c.nat->regs.r[14] = uregs->r14;
778 c.nat->regs.r[2] = uregs->r2;
779 c.nat->regs.r[3] = uregs->r3;
780 c.nat->regs.r[16] = uregs->r16;
781 c.nat->regs.r[17] = uregs->r17;
782 c.nat->regs.r[18] = uregs->r18;
783 c.nat->regs.r[19] = uregs->r19;
784 c.nat->regs.r[20] = uregs->r20;
785 c.nat->regs.r[21] = uregs->r21;
786 c.nat->regs.r[22] = uregs->r22;
787 c.nat->regs.r[23] = uregs->r23;
788 c.nat->regs.r[24] = uregs->r24;
789 c.nat->regs.r[25] = uregs->r25;
790 c.nat->regs.r[26] = uregs->r26;
791 c.nat->regs.r[27] = uregs->r27;
792 c.nat->regs.r[28] = uregs->r28;
793 c.nat->regs.r[29] = uregs->r29;
794 c.nat->regs.r[30] = uregs->r30;
795 c.nat->regs.r[31] = uregs->r31;
797 c.nat->regs.ar.ccv = uregs->ar_ccv;
799 COPY_FPREG(&c.nat->regs.f[2], &sw->f2);
800 COPY_FPREG(&c.nat->regs.f[3], &sw->f3);
801 COPY_FPREG(&c.nat->regs.f[4], &sw->f4);
802 COPY_FPREG(&c.nat->regs.f[5], &sw->f5);
804 COPY_FPREG(&c.nat->regs.f[6], &uregs->f6);
805 COPY_FPREG(&c.nat->regs.f[7], &uregs->f7);
806 COPY_FPREG(&c.nat->regs.f[8], &uregs->f8);
807 COPY_FPREG(&c.nat->regs.f[9], &uregs->f9);
808 COPY_FPREG(&c.nat->regs.f[10], &uregs->f10);
809 COPY_FPREG(&c.nat->regs.f[11], &uregs->f11);
811 COPY_FPREG(&c.nat->regs.f[12], &sw->f12);
812 COPY_FPREG(&c.nat->regs.f[13], &sw->f13);
813 COPY_FPREG(&c.nat->regs.f[14], &sw->f14);
814 COPY_FPREG(&c.nat->regs.f[15], &sw->f15);
815 COPY_FPREG(&c.nat->regs.f[16], &sw->f16);
816 COPY_FPREG(&c.nat->regs.f[17], &sw->f17);
817 COPY_FPREG(&c.nat->regs.f[18], &sw->f18);
818 COPY_FPREG(&c.nat->regs.f[19], &sw->f19);
819 COPY_FPREG(&c.nat->regs.f[20], &sw->f20);
820 COPY_FPREG(&c.nat->regs.f[21], &sw->f21);
821 COPY_FPREG(&c.nat->regs.f[22], &sw->f22);
822 COPY_FPREG(&c.nat->regs.f[23], &sw->f23);
823 COPY_FPREG(&c.nat->regs.f[24], &sw->f24);
824 COPY_FPREG(&c.nat->regs.f[25], &sw->f25);
825 COPY_FPREG(&c.nat->regs.f[26], &sw->f26);
826 COPY_FPREG(&c.nat->regs.f[27], &sw->f27);
827 COPY_FPREG(&c.nat->regs.f[28], &sw->f28);
828 COPY_FPREG(&c.nat->regs.f[29], &sw->f29);
829 COPY_FPREG(&c.nat->regs.f[30], &sw->f30);
830 COPY_FPREG(&c.nat->regs.f[31], &sw->f31);
832 // f32 - f127
833 memcpy(&c.nat->regs.f[32], &v->arch._thread.fph[0],
834 sizeof(v->arch._thread.fph));
836 #define NATS_UPDATE(reg) \
837 nats_update(&c.nat->regs.nats, (reg), \
838 !!(uregs->eml_unat & \
839 (1UL << ia64_unat_pos(&uregs->r ## reg))))
841 // corresponding bit in ar.unat is determined by
842 // (&uregs->rN){8:3}.
843 // r8: the lowest gr member of struct cpu_user_regs.
844 // r7: the highest gr member of struct cpu_user_regs.
845 BUILD_BUG_ON(offsetof(struct cpu_user_regs, r7) -
846 offsetof(struct cpu_user_regs, r8) >
847 64 * sizeof(unsigned long));
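/*
 * Worked example (assumption: ia64_unat_pos(addr) extracts bits 8:3 of
 * the spill address, i.e. ((unsigned long)addr >> 3) & 0x3f, the 8-byte
 * slot index within a 512-byte window). That index names the eml_unat
 * bit holding rN's NaT, so NATS_UPDATE(8) copies bit
 * ((unsigned long)&uregs->r8 >> 3 & 0x3f) of uregs->eml_unat into bit 8
 * of c.nat->regs.nats. The BUILD_BUG_ON above checks that the whole
 * r8..r7 range of cpu_user_regs fits within the 64 slots a single unat
 * word can describe.
 */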
849 NATS_UPDATE(1);
850 NATS_UPDATE(2);
851 NATS_UPDATE(3);
853 NATS_UPDATE(8);
854 NATS_UPDATE(9);
855 NATS_UPDATE(10);
856 NATS_UPDATE(11);
857 NATS_UPDATE(12);
858 NATS_UPDATE(13);
859 NATS_UPDATE(14);
860 NATS_UPDATE(15);
861 NATS_UPDATE(16);
862 NATS_UPDATE(17);
863 NATS_UPDATE(18);
864 NATS_UPDATE(19);
865 NATS_UPDATE(20);
866 NATS_UPDATE(21);
867 NATS_UPDATE(22);
868 NATS_UPDATE(23);
869 NATS_UPDATE(24);
870 NATS_UPDATE(25);
871 NATS_UPDATE(26);
872 NATS_UPDATE(27);
873 NATS_UPDATE(28);
874 NATS_UPDATE(29);
875 NATS_UPDATE(30);
876 NATS_UPDATE(31);
878 if (!is_hvm) {
879 c.nat->regs.r[4] = uregs->r4;
880 c.nat->regs.r[5] = uregs->r5;
881 c.nat->regs.r[6] = uregs->r6;
882 c.nat->regs.r[7] = uregs->r7;
884 NATS_UPDATE(4);
885 NATS_UPDATE(5);
886 NATS_UPDATE(6);
887 NATS_UPDATE(7);
888 #undef NATS_UPDATE
889 } else {
890 /*
891 * For a VTi domain, r[4-7] are sometimes saved both in
892 * uregs->r[4-7] and on the memory stack, and sometimes only on
893 * the memory stack. So it is ok to get them from the memory stack.
894 */
895 c.nat->regs.nats = uregs->eml_unat;
897 if (vcpu_has_not_run(v)) {
898 c.nat->regs.r[4] = sw->r4;
899 c.nat->regs.r[5] = sw->r5;
900 c.nat->regs.r[6] = sw->r6;
901 c.nat->regs.r[7] = sw->r7;
903 nats_update(&c.nat->regs.nats, 4,
904 !!(sw->ar_unat &
905 (1UL << ia64_unat_pos(&sw->r4))));
906 nats_update(&c.nat->regs.nats, 5,
907 !!(sw->ar_unat &
908 (1UL << ia64_unat_pos(&sw->r5))));
909 nats_update(&c.nat->regs.nats, 6,
910 !!(sw->ar_unat &
911 (1UL << ia64_unat_pos(&sw->r6))));
912 nats_update(&c.nat->regs.nats, 7,
913 !!(sw->ar_unat &
914 (1UL << ia64_unat_pos(&sw->r7))));
915 } else {
916 char nat;
918 unw_get_gr(&info, 4, &c.nat->regs.r[4], &nat);
919 nats_update(&c.nat->regs.nats, 4, nat);
920 unw_get_gr(&info, 5, &c.nat->regs.r[5], &nat);
921 nats_update(&c.nat->regs.nats, 5, nat);
922 unw_get_gr(&info, 6, &c.nat->regs.r[6], &nat);
923 nats_update(&c.nat->regs.nats, 6, nat);
924 unw_get_gr(&info, 7, &c.nat->regs.r[7], &nat);
925 nats_update(&c.nat->regs.nats, 7, nat);
926 }
927 }
929 c.nat->regs.rbs_voff = (IA64_RBS_OFFSET / 8) % 64;
930 if (unlikely(rbs_size > sizeof(c.nat->regs.rbs)))
931 gdprintk(XENLOG_INFO,
932 "rbs_size is too large 0x%x > 0x%lx\n",
933 rbs_size, sizeof(c.nat->regs.rbs));
934 else
935 memcpy(c.nat->regs.rbs, rbs_bottom, rbs_size);
937 rbs_top = (unsigned long*)((char *)rbs_bottom + rbs_size) - 1;
938 rbs_rnat_addr = ia64_rse_rnat_addr(rbs_top);
939 if ((unsigned long)rbs_rnat_addr >= sw->ar_bspstore)
940 rbs_rnat_addr = &sw->ar_rnat;
942 top_slot = ia64_rse_slot_num(rbs_top);
944 c.nat->regs.rbs_rnat = (*rbs_rnat_addr) & ((1UL << top_slot) - 1);
945 if (ia64_rse_rnat_addr(rbs_bottom) == ia64_rse_rnat_addr(rbs_top)) {
946 unsigned int bottom_slot = ia64_rse_slot_num(rbs_bottom);
947 c.nat->regs.rbs_rnat &= ~((1UL << bottom_slot) - 1);
948 }
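/*
 * Explanatory note (assuming the usual RSE helpers, where
 * ia64_rse_slot_num(addr) is bits 8:3 of the address and
 * ia64_rse_rnat_addr(addr) is the NaT-collection slot of that 64-slot
 * group): the mask (1UL << top_slot) - 1 keeps only the collection bits
 * for slots below rbs_top's slot, and when both ends of the span fall in
 * the same group the bits below rbs_bottom's slot are cleared as well.
 * If the collection word has not been spilled to the backing store yet
 * (it lies at or above sw->ar_bspstore), the live bits are still in
 * ar.rnat, hence the fallback to sw->ar_rnat above.
 */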
950 c.nat->regs.num_phys_stacked = num_phys_stacked;
952 if (VMX_DOMAIN(v))
953 c.nat->privregs_pfn = VGC_PRIVREGS_HVM;
954 else
955 c.nat->privregs_pfn = get_gpfn_from_mfn(
956 virt_to_maddr(v->arch.privregs) >> PAGE_SHIFT);
958 for (i = 0; i < IA64_NUM_DBG_REGS; i++) {
959 if (VMX_DOMAIN(v)) {
960 vmx_vcpu_get_dbr(v, i, &c.nat->regs.dbr[i]);
961 vmx_vcpu_get_ibr(v, i, &c.nat->regs.ibr[i]);
962 } else {
963 vcpu_get_dbr(v, i, &c.nat->regs.dbr[i]);
964 vcpu_get_ibr(v, i, &c.nat->regs.ibr[i]);
965 }
966 }
968 for (i = 0; i < 8; i++)
969 vcpu_get_rr(v, (unsigned long)i << 61, &c.nat->regs.rr[i]);
971 /* Fill extra regs. */
972 for (i = 0;
973 (i < sizeof(tr->itrs) / sizeof(tr->itrs[0])) && i < NITRS;
974 i++) {
975 tr->itrs[i].pte = v->arch.itrs[i].pte.val;
976 tr->itrs[i].itir = v->arch.itrs[i].itir;
977 tr->itrs[i].vadr = v->arch.itrs[i].vadr;
978 tr->itrs[i].rid = v->arch.itrs[i].rid;
979 }
980 for (i = 0;
981 (i < sizeof(tr->dtrs) / sizeof(tr->dtrs[0])) && i < NDTRS;
982 i++) {
983 tr->dtrs[i].pte = v->arch.dtrs[i].pte.val;
984 tr->dtrs[i].itir = v->arch.dtrs[i].itir;
985 tr->dtrs[i].vadr = v->arch.dtrs[i].vadr;
986 tr->dtrs[i].rid = v->arch.dtrs[i].rid;
987 }
988 c.nat->event_callback_ip = v->arch.event_callback_ip;
990 /* If PV and privregs is not set, we can't read mapped registers. */
991 if (!v->domain->arch.is_vti && v->arch.privregs == NULL)
992 return;
994 vcpu_get_dcr(v, &c.nat->regs.cr.dcr);
996 c.nat->regs.cr.itm = v->domain->arch.is_vti ?
997 vmx_vcpu_get_itm(v) : PSCBX(v, domain_itm);
998 vcpu_get_iva(v, &c.nat->regs.cr.iva);
999 vcpu_get_pta(v, &c.nat->regs.cr.pta);
1001 vcpu_get_ipsr(v, &c.nat->regs.cr.ipsr);
1002 vcpu_get_isr(v, &c.nat->regs.cr.isr);
1003 vcpu_get_iip(v, &c.nat->regs.cr.iip);
1004 vcpu_get_ifa(v, &c.nat->regs.cr.ifa);
1005 vcpu_get_itir(v, &c.nat->regs.cr.itir);
1006 vcpu_get_iha(v, &c.nat->regs.cr.iha);
1008 //XXX change irr[] and arch.insvc[]
1009 if (v->domain->arch.is_vti)
1010 /* c.nat->regs.cr.ivr = vmx_vcpu_get_ivr(v)*/;//XXXnot SMP-safe
1011 else
1012 vcpu_get_ivr (v, &c.nat->regs.cr.ivr);
1013 vcpu_get_iim(v, &c.nat->regs.cr.iim);
1015 vcpu_get_tpr(v, &c.nat->regs.cr.tpr);
1016 vcpu_get_irr0(v, &c.nat->regs.cr.irr[0]);
1017 vcpu_get_irr1(v, &c.nat->regs.cr.irr[1]);
1018 vcpu_get_irr2(v, &c.nat->regs.cr.irr[2]);
1019 vcpu_get_irr3(v, &c.nat->regs.cr.irr[3]);
1020 vcpu_get_itv(v, &c.nat->regs.cr.itv);//XXX vlsapic
1021 vcpu_get_pmv(v, &c.nat->regs.cr.pmv);
1022 vcpu_get_cmcv(v, &c.nat->regs.cr.cmcv);
1024 if (is_hvm)
1025 vmx_arch_get_info_guest(v, c);
1028 #if 0
1029 // for debug
1030 static void
1031 __rbs_print(const char* func, int line, const char* name,
1032 const unsigned long* rbs, unsigned int rbs_size)
1034 unsigned int i;
1035 printk("%s:%d %s rbs %p\n", func, line, name, rbs);
1036 printk(" rbs_size 0x%016x no 0x%lx\n",
1037 rbs_size, rbs_size / sizeof(unsigned long));
1039 for (i = 0; i < rbs_size / sizeof(unsigned long); i++) {
1040 const char* zero_or_n = "0x";
1041 if (ia64_rse_is_rnat_slot((unsigned long*)&rbs[i]))
1042 zero_or_n = "Nx";
1044 if ((i % 3) == 0)
1045 printk("0x%02x:", i);
1046 printk(" %s%016lx", zero_or_n, rbs[i]);
1047 if ((i % 3) == 2)
1048 printk("\n");
1050 printk("\n");
1053 #define rbs_print(rbs, rbs_size) \
1054 __rbs_print(__func__, __LINE__, (#rbs), (rbs), (rbs_size))
1055 #endif
1057 static int
1058 copy_rbs(struct vcpu* v, unsigned long* dst_rbs_size,
1059 const unsigned long* rbs, unsigned long rbs_size,
1060 unsigned long src_rnat, unsigned long rbs_voff)
1062 int rc = -EINVAL;
1063 struct page_info* page;
1064 unsigned char* vaddr;
1065 unsigned long* src_bsp;
1066 unsigned long* src_bspstore;
1068 struct switch_stack* sw = vcpu_to_switch_stack(v);
1069 unsigned long num_regs;
1070 unsigned long* dst_bsp;
1071 unsigned long* dst_bspstore;
1072 unsigned long* dst_rnat;
1073 unsigned long dst_rnat_tmp;
1074 unsigned long dst_rnat_mask;
1075 unsigned long flags;
1076 extern void ia64_copy_rbs(unsigned long* dst_bspstore,
1077 unsigned long* dst_rbs_size,
1078 unsigned long* dst_rnat_p,
1079 unsigned long* src_bsp,
1080 unsigned long src_rbs_size,
1081 unsigned long src_rnat);
1083 dst_bspstore = vcpu_to_rbs_bottom(v);
1084 *dst_rbs_size = rbs_size;
1085 if (rbs_size == 0)
1086 return 0;
1088 // The rbs offset depends on sizeof(struct vcpu), so it is too
1089 // unstable to be part of the hypercall ABI.
1090 // We need to take the rbs offset into account.
1091 //memcpy(dst_bspstore, c.nat->regs.rbs, rbs_size);
1093 // It is assumed that rbs_size is small enough compared
1094 // to KERNEL_STACK_SIZE.
1095 page = alloc_domheap_pages(NULL, KERNEL_STACK_SIZE_ORDER, 0);
1096 if (page == NULL)
1097 return -ENOMEM;
1098 vaddr = page_to_virt(page);
1100 src_bspstore = (unsigned long*)(vaddr + rbs_voff * 8);
1101 src_bsp = (unsigned long*)((unsigned char*)src_bspstore + rbs_size);
1102 if ((unsigned long)src_bsp >= (unsigned long)vaddr + PAGE_SIZE)
1103 goto out;
1104 memcpy(src_bspstore, rbs, rbs_size);
1106 num_regs = ia64_rse_num_regs(src_bspstore, src_bsp);
1107 dst_bsp = ia64_rse_skip_regs(dst_bspstore, num_regs);
1108 *dst_rbs_size = (unsigned long)dst_bsp - (unsigned long)dst_bspstore;
1110 // rough check.
1111 if (((unsigned long)dst_bsp & ~PAGE_MASK) > KERNEL_STACK_SIZE / 2)
1112 goto out;
1114 // ia64_copy_rbs() uses the real cpu's register stack.
1115 // So it may fault with an Illegal Operation fault, resulting
1116 // in a panic, if rbs_size is too large to load compared to
1117 // the number of physical stacked registers, RSE.N_STACKED_PHYS,
1118 // which is cpu-implementation specific.
1119 // See SDM vol. 2 Register Stack Engine 6, especially 6.5.5.
1120 //
1121 // For safe operation and cpu model independence,
1122 // we would need to copy them by hand without loadrs and flushrs.
1123 // However, even if we implemented that, a similar issue still occurs
1124 // when running the guest: the CPU context restore routine issues loadrs,
1125 // resulting in an Illegal Operation fault. And what if the vRSE is in
1126 // enforced lazy mode? We can't store any dirty stacked registers
1127 // into the RBS without cover or br.call.
1128 if (num_regs > num_phys_stacked) {
1129 rc = -ENOSYS;
1130 gdprintk(XENLOG_WARNING,
1131 "%s:%d domain %d: can't load stacked registers\n"
1132 "requested size 0x%lx => 0x%lx, num regs %ld "
1133 "RSE.N_STACKED_PHYS %ld\n",
1134 __func__, __LINE__, v->domain->domain_id,
1135 rbs_size, *dst_rbs_size, num_regs,
1136 num_phys_stacked);
1137 goto out;
1140 // we mask interrupts to avoid using register backing store.
1141 local_irq_save(flags);
1142 ia64_copy_rbs(dst_bspstore, dst_rbs_size, &dst_rnat_tmp,
1143 src_bsp, rbs_size, src_rnat);
1144 local_irq_restore(flags);
1146 dst_rnat_mask = (1UL << ia64_rse_slot_num(dst_bsp)) - 1;
1147 dst_rnat = ia64_rse_rnat_addr(dst_bsp);
1148 if ((unsigned long)dst_rnat > sw->ar_bspstore)
1149 dst_rnat = &sw->ar_rnat;
1150 // if ia64_rse_rnat_addr(dst_bsp) ==
1151 // ia64_rse_rnat_addr(vcpu_to_rbs_bottom(v)), the lsb of rnat
1152 // is just ignored, so we don't have to mask it out.
1153 *dst_rnat =
1154 (*dst_rnat & ~dst_rnat_mask) | (dst_rnat_tmp & dst_rnat_mask);
1156 rc = 0;
1157 out:
1158 free_domheap_pages(page, KERNEL_STACK_SIZE_ORDER);
1159 return rc;
1162 static void
1163 unat_update(unsigned long *unat_eml, unsigned long *spill_addr, char nat)
1165 unsigned int pos = ia64_unat_pos(spill_addr);
1166 if (nat)
1167 *unat_eml |= (1UL << pos);
1168 else
1169 *unat_eml &= ~(1UL << pos);
1172 int arch_set_info_guest(struct vcpu *v, vcpu_guest_context_u c)
1174 struct cpu_user_regs *uregs = vcpu_regs(v);
1175 struct domain *d = v->domain;
1176 struct switch_stack *sw = vcpu_to_switch_stack(v);
1177 int was_initialised = v->is_initialised;
1178 struct unw_frame_info info;
1179 unsigned int rbs_size;
1180 unsigned int num_regs;
1181 unsigned long * const rbs_bottom = vcpu_to_rbs_bottom(v);
1182 int rc = 0;
1183 int i;
1185 /* Finish vcpu initialization. */
1186 if (!was_initialised) {
1187 if (d->arch.is_vti)
1188 rc = vmx_final_setup_guest(v);
1189 else
1190 rc = vcpu_late_initialise(v);
1191 if (rc != 0)
1192 return rc;
1194 vcpu_init_regs(v);
1196 v->is_initialised = 1;
1197 /* Auto-online VCPU0 when it is initialised. */
1198 if (v->vcpu_id == 0)
1199 clear_bit(_VPF_down, &v->pause_flags);
1202 if (c.nat == NULL)
1203 return 0;
1205 uregs->b6 = c.nat->regs.b[6];
1206 uregs->b7 = c.nat->regs.b[7];
1208 memset(&info, 0, sizeof(info));
1209 unw_init_from_blocked_task(&info, v);
1210 if (vcpu_has_not_run(v)) {
1211 sw->ar_lc = c.nat->regs.ar.lc;
1212 sw->ar_pfs =
1213 (sw->ar_pfs & ~AR_PFS_PEC_MASK) |
1214 ((c.nat->regs.ar.ec << AR_PFS_PEC_SHIFT) &
1215 AR_PFS_PEC_MASK);
1216 } else if (unw_unwind_to_user(&info) < 0) {
1217 /* warn: should panic? */
1218 gdprintk(XENLOG_ERR,
1219 "vcpu=%d unw_unwind_to_user() failed.\n",
1220 v->vcpu_id);
1221 show_stack(v, NULL);
1223 //return -ENOSYS;
1224 } else {
1225 unw_set_ar(&info, UNW_AR_LC, c.nat->regs.ar.lc);
1226 unw_set_ar(&info, UNW_AR_EC, c.nat->regs.ar.ec);
1228 uregs->ar_csd = c.nat->regs.ar.csd;
1229 uregs->ar_ssd = c.nat->regs.ar.ssd;
1231 uregs->r8 = c.nat->regs.r[8];
1232 uregs->r9 = c.nat->regs.r[9];
1233 uregs->r10 = c.nat->regs.r[10];
1234 uregs->r11 = c.nat->regs.r[11];
1236 if (!d->arch.is_vti)
1237 vcpu_set_psr(v, c.nat->regs.psr);
1238 else
1239 vmx_vcpu_set_psr(v, c.nat->regs.psr);
1240 uregs->cr_iip = c.nat->regs.ip;
1241 uregs->cr_ifs = c.nat->regs.cfm;
1243 uregs->ar_unat = c.nat->regs.ar.unat;
1244 uregs->ar_pfs = c.nat->regs.ar.pfs;
1245 uregs->ar_rsc = c.nat->regs.ar.rsc;
1246 uregs->ar_rnat = c.nat->regs.ar.rnat;
1247 uregs->ar_bspstore = c.nat->regs.ar.bspstore;
1249 uregs->pr = c.nat->regs.pr;
1250 uregs->b0 = c.nat->regs.b[0];
1251 num_regs = ia64_rse_num_regs((unsigned long*)c.nat->regs.ar.bspstore,
1252 (unsigned long*)c.nat->regs.ar.bsp);
1253 rbs_size = (unsigned long)ia64_rse_skip_regs(rbs_bottom, num_regs) -
1254 (unsigned long)rbs_bottom;
1255 if (rbs_size > sizeof (c.nat->regs.rbs)) {
1256 gdprintk(XENLOG_INFO,
1257 "rbs size is too large %x > %lx\n",
1258 rbs_size, sizeof (c.nat->regs.rbs));
1259 return -EINVAL;
1261 if (rbs_size > 0 &&
1262 ((IA64_RBS_OFFSET / 8) % 64) != c.nat->regs.rbs_voff)
1263 gdprintk(XENLOG_INFO,
1264 "rbs stack offset is different! xen 0x%x given 0x%x",
1265 (IA64_RBS_OFFSET / 8) % 64, c.nat->regs.rbs_voff);
1267 /* Protection against crazy user code. */
1268 if (!was_initialised)
1269 uregs->loadrs = (rbs_size << 16);
1270 if (rbs_size == (uregs->loadrs >> 16)) {
1271 unsigned long dst_rbs_size = 0;
1272 if (vcpu_has_not_run(v))
1273 sw->ar_bspstore = (unsigned long)rbs_bottom;
1275 rc = copy_rbs(v, &dst_rbs_size,
1276 c.nat->regs.rbs, rbs_size,
1277 c.nat->regs.rbs_rnat,
1278 c.nat->regs.rbs_voff);
1279 if (rc < 0)
1280 return rc;
1282 /* In case of newly created vcpu, ar_bspstore points to
1283 * the bottom of register stack. Move it up.
1284 * See also init_switch_stack().
1285 */
1286 if (vcpu_has_not_run(v)) {
1287 uregs->loadrs = (dst_rbs_size << 16);
1288 sw->ar_bspstore = (unsigned long)((char*)rbs_bottom +
1289 dst_rbs_size);
1293 // inhibit save/restore between cpus of different RSE.N_STACKED_PHYS
1294 // to avoid nasty issues.
1295 //
1296 // The number of physical stacked general registers (RSE.N_STACKED_PHYS)
1297 // isn't virtualized. The guest OS obtains it via the PAL_RSE_INFO call and
1298 // the value might be exported to user processes.
1299 // (Linux does so via /proc/cpuinfo.)
1300 // The SDM says only that the number is cpu implementation specific.
1301 //
1302 // If the number of restoring cpu is different from one of saving cpu,
1303 // the following, or something worse, might happen.
1304 // - Xen VMM itself may panic with an illegal operation fault when
1305 // issuing loadrs to run the guest
1306 // When RSE.N_STACKED_PHYS of saving CPU > RSE.N_STACKED_PHYS of
1307 // restoring CPU
1308 // This case is detected by copy_rbs(), which refuses the restore
1309 // - guest kernel may panic with illegal operation fault
1310 // When RSE.N_STACKED_PHYS of saving CPU > RSE.N_STACKED_PHYS of
1311 // restoring CPU
1312 // - information leak from guest kernel to user process
1313 // When RSE.N_STACKED_PHYS of saving CPU < RSE.N_STACKED_PHYS of
1314 // restoring CPU
1315 // Before returning to a user process, the kernel should zero-clear all
1316 // physical stacked registers to prevent kernel bits from leaking.
1317 // It would be based on RSE.N_STACKED_PHYS (as Linux does).
1318 // On the restored environment the kernel clears only a part
1319 // of the physical stacked registers.
1320 // - user processes or human operators would be confused.
1321 // RSE.N_STACKED_PHYS might be exported to user processes or human
1322 // operators. Actually on Linux it is exported via /proc/cpuinfo.
1323 // User processes might use it.
1324 // I don't know of any concrete example, but it's possible in theory,
1325 // e.g. a thread library may allocate the RBS area based on the value.
1326 // (Fortunately glibc nptl doesn't)
1327 if (c.nat->regs.num_phys_stacked != 0 && /* COMPAT */
1328 c.nat->regs.num_phys_stacked != num_phys_stacked) {
1329 gdprintk(XENLOG_WARNING,
1330 "num phys stacked is different! "
1331 "xen 0x%lx given 0x%lx",
1332 num_phys_stacked, c.nat->regs.num_phys_stacked);
1333 return -EINVAL;
1336 uregs->r1 = c.nat->regs.r[1];
1337 uregs->r12 = c.nat->regs.r[12];
1338 uregs->r13 = c.nat->regs.r[13];
1339 uregs->ar_fpsr = c.nat->regs.ar.fpsr;
1340 uregs->r15 = c.nat->regs.r[15];
1342 uregs->r14 = c.nat->regs.r[14];
1343 uregs->r2 = c.nat->regs.r[2];
1344 uregs->r3 = c.nat->regs.r[3];
1345 uregs->r16 = c.nat->regs.r[16];
1346 uregs->r17 = c.nat->regs.r[17];
1347 uregs->r18 = c.nat->regs.r[18];
1348 uregs->r19 = c.nat->regs.r[19];
1349 uregs->r20 = c.nat->regs.r[20];
1350 uregs->r21 = c.nat->regs.r[21];
1351 uregs->r22 = c.nat->regs.r[22];
1352 uregs->r23 = c.nat->regs.r[23];
1353 uregs->r24 = c.nat->regs.r[24];
1354 uregs->r25 = c.nat->regs.r[25];
1355 uregs->r26 = c.nat->regs.r[26];
1356 uregs->r27 = c.nat->regs.r[27];
1357 uregs->r28 = c.nat->regs.r[28];
1358 uregs->r29 = c.nat->regs.r[29];
1359 uregs->r30 = c.nat->regs.r[30];
1360 uregs->r31 = c.nat->regs.r[31];
1362 uregs->ar_ccv = c.nat->regs.ar.ccv;
1364 COPY_FPREG(&sw->f2, &c.nat->regs.f[2]);
1365 COPY_FPREG(&sw->f3, &c.nat->regs.f[3]);
1366 COPY_FPREG(&sw->f4, &c.nat->regs.f[4]);
1367 COPY_FPREG(&sw->f5, &c.nat->regs.f[5]);
1369 COPY_FPREG(&uregs->f6, &c.nat->regs.f[6]);
1370 COPY_FPREG(&uregs->f7, &c.nat->regs.f[7]);
1371 COPY_FPREG(&uregs->f8, &c.nat->regs.f[8]);
1372 COPY_FPREG(&uregs->f9, &c.nat->regs.f[9]);
1373 COPY_FPREG(&uregs->f10, &c.nat->regs.f[10]);
1374 COPY_FPREG(&uregs->f11, &c.nat->regs.f[11]);
1376 COPY_FPREG(&sw->f12, &c.nat->regs.f[12]);
1377 COPY_FPREG(&sw->f13, &c.nat->regs.f[13]);
1378 COPY_FPREG(&sw->f14, &c.nat->regs.f[14]);
1379 COPY_FPREG(&sw->f15, &c.nat->regs.f[15]);
1380 COPY_FPREG(&sw->f16, &c.nat->regs.f[16]);
1381 COPY_FPREG(&sw->f17, &c.nat->regs.f[17]);
1382 COPY_FPREG(&sw->f18, &c.nat->regs.f[18]);
1383 COPY_FPREG(&sw->f19, &c.nat->regs.f[19]);
1384 COPY_FPREG(&sw->f20, &c.nat->regs.f[20]);
1385 COPY_FPREG(&sw->f21, &c.nat->regs.f[21]);
1386 COPY_FPREG(&sw->f22, &c.nat->regs.f[22]);
1387 COPY_FPREG(&sw->f23, &c.nat->regs.f[23]);
1388 COPY_FPREG(&sw->f24, &c.nat->regs.f[24]);
1389 COPY_FPREG(&sw->f25, &c.nat->regs.f[25]);
1390 COPY_FPREG(&sw->f26, &c.nat->regs.f[26]);
1391 COPY_FPREG(&sw->f27, &c.nat->regs.f[27]);
1392 COPY_FPREG(&sw->f28, &c.nat->regs.f[28]);
1393 COPY_FPREG(&sw->f29, &c.nat->regs.f[29]);
1394 COPY_FPREG(&sw->f30, &c.nat->regs.f[30]);
1395 COPY_FPREG(&sw->f31, &c.nat->regs.f[31]);
1397 // f32 - f127
1398 memcpy(&v->arch._thread.fph[0], &c.nat->regs.f[32],
1399 sizeof(v->arch._thread.fph));
1401 #define UNAT_UPDATE(reg) \
1402 unat_update(&uregs->eml_unat, &uregs->r ## reg, \
1403 !!(c.nat->regs.nats & (1UL << (reg))));
1405 uregs->eml_unat = 0;
1406 UNAT_UPDATE(1);
1407 UNAT_UPDATE(2);
1408 UNAT_UPDATE(3);
1410 UNAT_UPDATE(8);
1411 UNAT_UPDATE(9);
1412 UNAT_UPDATE(10);
1413 UNAT_UPDATE(11);
1414 UNAT_UPDATE(12);
1415 UNAT_UPDATE(13);
1416 UNAT_UPDATE(14);
1417 UNAT_UPDATE(15);
1418 UNAT_UPDATE(16);
1419 UNAT_UPDATE(17);
1420 UNAT_UPDATE(18);
1421 UNAT_UPDATE(19);
1422 UNAT_UPDATE(20);
1423 UNAT_UPDATE(21);
1424 UNAT_UPDATE(22);
1425 UNAT_UPDATE(23);
1426 UNAT_UPDATE(24);
1427 UNAT_UPDATE(25);
1428 UNAT_UPDATE(26);
1429 UNAT_UPDATE(27);
1430 UNAT_UPDATE(28);
1431 UNAT_UPDATE(29);
1432 UNAT_UPDATE(30);
1433 UNAT_UPDATE(31);
1435 /*
1436 * r4-r7 are sometimes saved both in pt_regs->r[4-7] and on the memory
1437 * stack, and sometimes only on the memory stack.
1438 * In both cases, both the memory stack and pt_regs->r[4-7] are updated.
1439 */
1440 uregs->r4 = c.nat->regs.r[4];
1441 uregs->r5 = c.nat->regs.r[5];
1442 uregs->r6 = c.nat->regs.r[6];
1443 uregs->r7 = c.nat->regs.r[7];
1445 UNAT_UPDATE(4);
1446 UNAT_UPDATE(5);
1447 UNAT_UPDATE(6);
1448 UNAT_UPDATE(7);
1449 #undef UNAT_UPDATE
1450 if (vcpu_has_not_run(v)) {
1451 sw->r4 = c.nat->regs.r[4];
1452 sw->r5 = c.nat->regs.r[5];
1453 sw->r6 = c.nat->regs.r[6];
1454 sw->r7 = c.nat->regs.r[7];
1456 unat_update(&sw->ar_unat, &sw->r4,
1457 !!(c.nat->regs.nats & (1UL << 4)));
1458 unat_update(&sw->ar_unat, &sw->r5,
1459 !!(c.nat->regs.nats & (1UL << 5)));
1460 unat_update(&sw->ar_unat, &sw->r6,
1461 !!(c.nat->regs.nats & (1UL << 6)));
1462 unat_update(&sw->ar_unat, &sw->r7,
1463 !!(c.nat->regs.nats & (1UL << 7)));
1464 } else {
1465 unw_set_gr(&info, 4, c.nat->regs.r[4],
1466 !!(c.nat->regs.nats & (1UL << 4)));
1467 unw_set_gr(&info, 5, c.nat->regs.r[5],
1468 !!(c.nat->regs.nats & (1UL << 5)));
1469 unw_set_gr(&info, 6, c.nat->regs.r[6],
1470 !!(c.nat->regs.nats & (1UL << 6)));
1471 unw_set_gr(&info, 7, c.nat->regs.r[7],
1472 !!(c.nat->regs.nats & (1UL << 7)));
1475 if (!d->arch.is_vti) {
1476 /* domain runs at PL2/3 */
1477 uregs->cr_ipsr = vcpu_pl_adjust(uregs->cr_ipsr,
1478 IA64_PSR_CPL0_BIT);
1479 uregs->ar_rsc = vcpu_pl_adjust(uregs->ar_rsc, 2);
1482 for (i = 0; i < IA64_NUM_DBG_REGS; i++) {
1483 if (d->arch.is_vti) {
1484 vmx_vcpu_set_dbr(v, i, c.nat->regs.dbr[i]);
1485 vmx_vcpu_set_ibr(v, i, c.nat->regs.ibr[i]);
1486 } else {
1487 vcpu_set_dbr(v, i, c.nat->regs.dbr[i]);
1488 vcpu_set_ibr(v, i, c.nat->regs.ibr[i]);
1492 /* rr[] must be set before setting itrs[] dtrs[] */
1493 for (i = 0; i < 8; i++) {
1494 unsigned long rrval = c.nat->regs.rr[i];
1495 unsigned long reg = (unsigned long)i << 61;
1496 IA64FAULT fault = IA64_NO_FAULT;
1498 if (rrval == 0)
1499 continue;
1500 if (d->arch.is_vti) {
1501 //without VGCF_EXTRA_REGS check,
1502 //VTi domain doesn't boot.
1503 if (c.nat->flags & VGCF_EXTRA_REGS)
1504 fault = vmx_vcpu_set_rr(v, reg, rrval);
1505 } else
1506 fault = vcpu_set_rr(v, reg, rrval);
1507 if (fault != IA64_NO_FAULT)
1508 return -EINVAL;
1511 if (c.nat->flags & VGCF_EXTRA_REGS) {
1512 struct vcpu_tr_regs *tr = &c.nat->regs.tr;
1514 for (i = 0;
1515 (i < sizeof(tr->itrs) / sizeof(tr->itrs[0])) && i < NITRS;
1516 i++) {
1517 if (d->arch.is_vti)
1518 vmx_vcpu_itr_i(v, i, tr->itrs[i].pte,
1519 tr->itrs[i].itir,
1520 tr->itrs[i].vadr);
1521 else
1522 vcpu_set_itr(v, i, tr->itrs[i].pte,
1523 tr->itrs[i].itir,
1524 tr->itrs[i].vadr,
1525 tr->itrs[i].rid);
1527 for (i = 0;
1528 (i < sizeof(tr->dtrs) / sizeof(tr->dtrs[0])) && i < NDTRS;
1529 i++) {
1530 if (d->arch.is_vti)
1531 vmx_vcpu_itr_d(v, i, tr->dtrs[i].pte,
1532 tr->dtrs[i].itir,
1533 tr->dtrs[i].vadr);
1534 else
1535 vcpu_set_dtr(v, i,
1536 tr->dtrs[i].pte,
1537 tr->dtrs[i].itir,
1538 tr->dtrs[i].vadr,
1539 tr->dtrs[i].rid);
1541 v->arch.event_callback_ip = c.nat->event_callback_ip;
1542 vcpu_set_iva(v, c.nat->regs.cr.iva);
1545 if (d->arch.is_vti)
1546 rc = vmx_arch_set_info_guest(v, c);
1548 return rc;
1551 static int relinquish_memory(struct domain *d, struct list_head *list)
1553 struct list_head *ent;
1554 struct page_info *page;
1555 #ifndef __ia64__
1556 unsigned long x, y;
1557 #endif
1558 int ret = 0;
1560 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1561 spin_lock_recursive(&d->page_alloc_lock);
1562 ent = list->next;
1563 while ( ent != list )
1565 page = list_entry(ent, struct page_info, list);
1566 /* Grab a reference to the page so it won't disappear from under us. */
1567 if ( unlikely(!get_page(page, d)) )
1569 /* Couldn't get a reference -- someone is freeing this page. */
1570 ent = ent->next;
1571 list_move_tail(&page->list, &d->arch.relmem_list);
1572 continue;
1575 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1576 put_page_and_type(page);
1578 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1579 put_page(page);
1581 #ifndef __ia64__
1582 /*
1583 * Forcibly invalidate base page tables at this point to break circular
1584 * 'linear page table' references. This is okay because MMU structures
1585 * are not shared across domains and this domain is now dead. Thus base
1586 * tables are not in use so a non-zero count means circular reference.
1587 */
1588 y = page->u.inuse.type_info;
1589 for ( ; ; )
1591 x = y;
1592 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
1593 (PGT_base_page_table|PGT_validated)) )
1594 break;
1596 y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
1597 if ( likely(y == x) )
1599 free_page_type(page, PGT_base_page_table);
1600 break;
1603 #endif
1605 /* Follow the list chain and /then/ potentially free the page. */
1606 ent = ent->next;
1607 BUG_ON(get_gpfn_from_mfn(page_to_mfn(page)) != INVALID_M2P_ENTRY);
1608 list_move_tail(&page->list, &d->arch.relmem_list);
1609 put_page(page);
1611 if (hypercall_preempt_check()) {
1612 ret = -EAGAIN;
1613 goto out;
1617 list_splice_init(&d->arch.relmem_list, list);
1619 out:
1620 spin_unlock_recursive(&d->page_alloc_lock);
1621 return ret;
1624 int domain_relinquish_resources(struct domain *d)
1626 int ret = 0;
1628 switch (d->arch.relres) {
1629 case RELRES_not_started:
1630 /* Relinquish guest resources for VT-i domain. */
1631 if (d->arch.is_vti)
1632 vmx_relinquish_guest_resources(d);
1633 d->arch.relres = RELRES_mm_teardown;
1634 /*fallthrough*/
1636 case RELRES_mm_teardown:
1637 /* Tear down shadow mode stuff. */
1638 ret = mm_teardown(d);
1639 if (ret != 0)
1640 return ret;
1641 d->arch.relres = RELRES_xen;
1642 /* fallthrough */
1644 case RELRES_xen:
1645 /* Relinquish every xen page of memory. */
1646 ret = relinquish_memory(d, &d->xenpage_list);
1647 if (ret != 0)
1648 return ret;
1649 d->arch.relres = RELRES_dom;
1650 /* fallthrough */
1652 case RELRES_dom:
1653 /* Relinquish every domain page of memory. */
1654 ret = relinquish_memory(d, &d->page_list);
1655 if (ret != 0)
1656 return ret;
1657 d->arch.relres = RELRES_done;
1658 /* fallthrough */
1660 case RELRES_done:
1661 break;
1663 default:
1664 BUG();
1667 if (d->arch.is_vti && d->arch.sal_data)
1668 xfree(d->arch.sal_data);
1670 /* Free page used by xen oprofile buffer */
1671 free_xenoprof_pages(d);
1673 return 0;
1676 unsigned long
1677 domain_set_shared_info_va (unsigned long va)
1679 struct vcpu *v = current;
1680 struct domain *d = v->domain;
1681 int rc;
1683 /* Check virtual address:
1684 must belong to region 7,
1685 must be 64KB aligned,
1686 must not be within Xen virtual space. */
1687 if ((va >> 61) != 7
1688 || (va & 0xffffUL) != 0
1689 || (va >= HYPERVISOR_VIRT_START && va < HYPERVISOR_VIRT_END))
1690 panic_domain (NULL, "%s: bad va (0x%016lx)\n", __func__, va);
1692 /* Note: this doesn't work well if other cpus are already running.
1693 However this is part of the spec :-) */
1694 gdprintk(XENLOG_DEBUG, "Domain set shared_info_va to 0x%016lx\n", va);
1695 d->arch.shared_info_va = va;
1697 VCPU(v, interrupt_mask_addr) = (unsigned char *)va +
1698 INT_ENABLE_OFFSET(v);
1700 __ia64_per_cpu_var(current_psr_ic_addr) = (int *)(va + XSI_PSR_IC_OFS);
1702 /* Remap the shared pages. */
1703 rc = !set_one_rr(7UL << 61, PSCB(v,rrs[7]));
1704 BUG_ON(rc);
1706 return rc;
1709 /* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
1710 #define SHADOW_COPY_CHUNK 1024
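#if 0 /* Worked sizing example, not part of the original file. */
/*
 * Bitmap sizing as used in the XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY case
 * below, assuming 16KB pages (PAGE_SHIFT == 14): a guest with
 * convmem_end == 4GB covers 4GB >> 14 == 262144 pages, already a
 * multiple of BITS_PER_LONG, so the bitmap is 262144 bits == 4096
 * unsigned longs == 32KB, which is also the shadow_bitmap_size / 8 bytes
 * cleared by the memset in that case.
 */
static unsigned long shadow_bitmap_bytes(unsigned long convmem_end)
{
	unsigned long bits = ((convmem_end >> PAGE_SHIFT) + BITS_PER_LONG - 1)
	                     & ~(BITS_PER_LONG - 1UL);
	return bits / 8;
}
#endif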
1712 int shadow_mode_control(struct domain *d, xen_domctl_shadow_op_t *sc)
1714 unsigned int op = sc->op;
1715 int rc = 0;
1716 int i;
1717 //struct vcpu *v;
1719 if (unlikely(d == current->domain)) {
1720 gdprintk(XENLOG_INFO,
1721 "Don't try to do a shadow op on yourself!\n");
1722 return -EINVAL;
1725 domain_pause(d);
1727 switch (op)
1729 case XEN_DOMCTL_SHADOW_OP_OFF:
1730 if (shadow_mode_enabled (d)) {
1731 u64 *bm = d->arch.shadow_bitmap;
1733 /* Flush vhpt and tlb to restore dirty bit usage. */
1734 domain_flush_tlb_vhpt(d);
1736 /* Free bitmap. */
1737 d->arch.shadow_bitmap_size = 0;
1738 d->arch.shadow_bitmap = NULL;
1739 xfree(bm);
1741 break;
1743 case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST:
1744 case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE:
1745 rc = -EINVAL;
1746 break;
1748 case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
1749 if (shadow_mode_enabled(d)) {
1750 rc = -EINVAL;
1751 break;
1754 atomic64_set(&d->arch.shadow_fault_count, 0);
1755 atomic64_set(&d->arch.shadow_dirty_count, 0);
1757 d->arch.shadow_bitmap_size =
1758 ((d->arch.convmem_end >> PAGE_SHIFT) +
1759 BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
1760 d->arch.shadow_bitmap = xmalloc_array(unsigned long,
1761 d->arch.shadow_bitmap_size / BITS_PER_LONG);
1762 if (d->arch.shadow_bitmap == NULL) {
1763 d->arch.shadow_bitmap_size = 0;
1764 rc = -ENOMEM;
1766 else {
1767 memset(d->arch.shadow_bitmap, 0,
1768 d->arch.shadow_bitmap_size / 8);
1770 /* Flush vhpt and tlb to enable dirty bit
1771 virtualization. */
1772 domain_flush_tlb_vhpt(d);
1774 break;
1776 case XEN_DOMCTL_SHADOW_OP_CLEAN:
1778 int nbr_bytes;
1780 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
1781 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
1783 atomic64_set(&d->arch.shadow_fault_count, 0);
1784 atomic64_set(&d->arch.shadow_dirty_count, 0);
1786 if (guest_handle_is_null(sc->dirty_bitmap) ||
1787 (d->arch.shadow_bitmap == NULL)) {
1788 rc = -EINVAL;
1789 break;
1792 if (sc->pages > d->arch.shadow_bitmap_size)
1793 sc->pages = d->arch.shadow_bitmap_size;
1795 nbr_bytes = (sc->pages + 7) / 8;
1797 for (i = 0; i < nbr_bytes; i += SHADOW_COPY_CHUNK) {
1798 int size = (nbr_bytes - i) > SHADOW_COPY_CHUNK ?
1799 SHADOW_COPY_CHUNK : nbr_bytes - i;
1801 if (copy_to_guest_offset(
1802 sc->dirty_bitmap, i,
1803 (uint8_t *)d->arch.shadow_bitmap + i,
1804 size)) {
1805 rc = -EFAULT;
1806 break;
1809 memset((uint8_t *)d->arch.shadow_bitmap + i, 0, size);
1812 break;
1815 case XEN_DOMCTL_SHADOW_OP_PEEK:
1817 unsigned long size;
1819 sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
1820 sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
1822 if (guest_handle_is_null(sc->dirty_bitmap) ||
1823 (d->arch.shadow_bitmap == NULL)) {
1824 rc = -EINVAL;
1825 break;
1828 if (sc->pages > d->arch.shadow_bitmap_size)
1829 sc->pages = d->arch.shadow_bitmap_size;
1831 size = (sc->pages + 7) / 8;
1832 if (copy_to_guest(sc->dirty_bitmap,
1833 (uint8_t *)d->arch.shadow_bitmap, size)) {
1834 rc = -EFAULT;
1835 break;
1837 break;
1839 case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
1840 sc->mb = 0;
1841 break;
1842 case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
1843 if (sc->mb > 0) {
1844 BUG();
1845 rc = -ENOMEM;
1847 break;
1848 default:
1849 rc = -EINVAL;
1850 break;
1853 domain_unpause(d);
1855 return rc;
1858 // remove following line if not privifying in memory
1859 //#define HAVE_PRIVIFY_MEMORY
1860 #ifndef HAVE_PRIVIFY_MEMORY
1861 #define privify_memory(x,y) do {} while(0)
1862 #endif
1864 static void __init loaddomainelfimage(struct domain *d, struct elf_binary *elf,
1865 unsigned long phys_load_offset)
1867 const elf_phdr *phdr;
1868 int phnum, h, filesz, memsz;
1869 unsigned long elfaddr, dom_mpaddr, dom_imva;
1870 struct page_info *p;
1872 phnum = elf_uval(elf, elf->ehdr, e_phnum);
1873 for (h = 0; h < phnum; h++) {
1874 phdr = elf_phdr_by_index(elf, h);
1875 if (!elf_phdr_is_loadable(elf, phdr))
1876 continue;
1878 filesz = elf_uval(elf, phdr, p_filesz);
1879 memsz = elf_uval(elf, phdr, p_memsz);
1880 elfaddr = (unsigned long) elf->image + elf_uval(elf, phdr, p_offset);
1881 dom_mpaddr = elf_uval(elf, phdr, p_paddr);
1882 dom_mpaddr += phys_load_offset;
1884 while (memsz > 0) {
1885 p = assign_new_domain_page(d,dom_mpaddr);
1886 BUG_ON (unlikely(p == NULL));
1887 dom_imva = __va_ul(page_to_maddr(p));
1888 if (filesz > 0) {
1889 if (filesz >= PAGE_SIZE)
1890 copy_page((void *) dom_imva,
1891 (void *) elfaddr);
1892 else {
1893 // copy partial page
1894 memcpy((void *) dom_imva,
1895 (void *) elfaddr, filesz);
1896 // zero the rest of page
1897 memset((void *) dom_imva+filesz, 0,
1898 PAGE_SIZE-filesz);
1900 //FIXME: This test for code seems to find a lot more than objdump -x does
1901 if (elf_uval(elf, phdr, p_flags) & PF_X) {
1902 privify_memory(dom_imva,PAGE_SIZE);
1903 flush_icache_range(dom_imva,
1904 dom_imva+PAGE_SIZE);
1907 else if (memsz > 0) {
1908 /* always zero out entire page */
1909 clear_page((void *) dom_imva);
1911 memsz -= PAGE_SIZE;
1912 filesz -= PAGE_SIZE;
1913 elfaddr += PAGE_SIZE;
1914 dom_mpaddr += PAGE_SIZE;
1919 static void __init calc_dom0_size(void)
1920 {
1921 unsigned long domheap_pages;
1922 unsigned long p2m_pages;
1923 unsigned long spare_hv_pages;
1924 unsigned long max_dom0_size;
1926 /* Estimate the maximum memory we can safely allocate for dom0
1927 * by subtracting the p2m table allocation and a chunk of memory
1928 * for DMA and PCI mappings from the available domheap pages. The
1929 * chunk for DMA, PCI, etc. is a rough estimate, since Xen has no
1930 * good way to know those requirements ahead of time; it is
1931 * calculated at 1MB per 4GB of system memory. */
1932 domheap_pages = avail_domheap_pages();
1933 p2m_pages = domheap_pages / PTRS_PER_PTE;
1934 spare_hv_pages = domheap_pages / 4096;
1935 max_dom0_size = (domheap_pages - (p2m_pages + spare_hv_pages))
1936 * PAGE_SIZE;
1937 printk("Maximum permitted dom0 size: %luMB\n",
1938 max_dom0_size / (1024*1024));
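/*
 * Illustrative arithmetic for the estimate above (example figures, not
 * from the source): spare_hv_pages = domheap_pages / 4096 sets aside
 * exactly 1/4096 of the domheap, i.e. 1MB per 4GB as the comment says,
 * so an 8GB domheap reserves 2MB for DMA/PCI, plus one p2m page per
 * PTRS_PER_PTE pages of memory (domheap_pages / PTRS_PER_PTE).
 */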
1940 /* validate proposed dom0_size, fix up as needed */
1941 if (dom0_size > max_dom0_size) {
1942 printk("Reducing dom0 memory allocation from %luK to %luK "
1943 "to fit available memory\n",
1944 dom0_size / 1024, max_dom0_size / 1024);
1945 dom0_size = max_dom0_size;
1948 /* dom0_mem=0 can be passed in to give all available mem to dom0 */
1949 if (dom0_size == 0) {
1950 printk("Allocating all available memory to dom0\n");
1951 dom0_size = max_dom0_size;
1954 /* Check dom0 size. */
1955 if (dom0_size < 4 * 1024 * 1024) {
1956 panic("dom0_mem is too small, boot aborted"
1957 " (try e.g. dom0_mem=256M or dom0_mem=65536K)\n");
1960 if (running_on_sim) {
1961 dom0_size = 128*1024*1024; //FIXME: Should be configurable
1964 /* No need to allocate pages for now;
1965 * pages are allocated by assign_new_domain_page() via loaddomainelfimage().
1966 */
1970 /*
1971 * Domain 0 has unrestricted, direct access to all devices. The main
1972 * point of this stub, however, is to allow alloc_dom_mem to handle
1973 * requests with order > 0; dom0 needs that bit set in order to
1974 * allocate memory for other domains.
1975 */
1976 static void __init physdev_init_dom0(struct domain *d)
1977 {
1978 if (iomem_permit_access(d, 0UL, ~0UL))
1979 BUG();
1980 if (irqs_permit_access(d, 0, NR_IRQS-1))
1981 BUG();
1982 if (ioports_permit_access(d, 0, 0xffff))
1983 BUG();
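/*
 * Sketch of how these grants are consumed elsewhere (the call site is
 * hypothetical; the predicate is the standard one from xen/iocap.h):
 *
 *     if (!iomem_access_permitted(d, mfn, mfn))
 *             return -EPERM;
 *
 * Checks like this now succeed for dom0 across the whole machine
 * address space, all IRQs and the full 0x0000-0xffff I/O port range.
 */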
1986 int __init construct_dom0(struct domain *d,
1987 unsigned long image_start, unsigned long image_len,
1988 unsigned long initrd_start, unsigned long initrd_len,
1989 char *cmdline)
1990 {
1991 int i, rc;
1992 start_info_t *si;
1993 dom0_vga_console_info_t *ci;
1994 struct vcpu *v = d->vcpu[0];
1995 unsigned long max_pages;
1997 struct elf_binary elf;
1998 struct elf_dom_parms parms;
1999 unsigned long p_start;
2000 unsigned long pkern_start;
2001 unsigned long pkern_entry;
2002 unsigned long pkern_end;
2003 unsigned long pinitrd_start = 0;
2004 unsigned long pstart_info;
2005 unsigned long phys_load_offset;
2006 struct page_info *start_info_page;
2007 unsigned long bp_mpa;
2008 struct ia64_boot_param *bp;
2010 //printk("construct_dom0: starting\n");
2012 /* Sanity! */
2013 BUG_ON(d != dom0);
2014 BUG_ON(d->vcpu[0] == NULL);
2015 BUG_ON(v->is_initialised);
2017 printk("*** LOADING DOMAIN 0 ***\n");
2019 calc_dom0_size();
2021 max_pages = dom0_size / PAGE_SIZE;
2022 d->max_pages = max_pages;
2023 d->tot_pages = 0;
2025 rc = elf_init(&elf, (void*)image_start, image_len);
2026 if ( rc != 0 )
2027 return rc;
2028 #ifdef VERBOSE
2029 elf_set_verbose(&elf);
2030 #endif
2031 elf_parse_binary(&elf);
2032 if (0 != (elf_xen_parse(&elf, &parms)))
2033 return rc;
2035 /*
2036 * We cannot rely on the load address in the ELF headers to
2037 * determine the metaphysical address at which the image is
2038 * loaded. Patch the addresses to match the real layout, based
2039 * on xen_pstart.
2040 */
2041 phys_load_offset = xen_pstart - elf.pstart;
2042 elf.pstart += phys_load_offset;
2043 elf.pend += phys_load_offset;
2044 parms.virt_kstart += phys_load_offset;
2045 parms.virt_kend += phys_load_offset;
2046 parms.virt_entry += phys_load_offset;
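/*
 * Example with made-up addresses: if xen_pstart were 0x4004000000 and
 * the ELF headers reported elf.pstart = 0x4000000, phys_load_offset
 * would be 0x4000000000, and every address patched above (elf.pstart,
 * elf.pend, parms.virt_kstart, parms.virt_kend, parms.virt_entry) is
 * shifted up by that amount before being used.
 */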
2048 printk(" Dom0 kernel: %s, %s, paddr 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
2049 elf_64bit(&elf) ? "64-bit" : "32-bit",
2050 elf_msb(&elf) ? "msb" : "lsb",
2051 elf.pstart, elf.pend);
2052 if (!elf_64bit(&elf) ||
2053 elf_uval(&elf, elf.ehdr, e_machine) != EM_IA_64) {
2054 printk("Incompatible kernel binary\n");
2055 return -1;
2058 p_start = parms.virt_base;
2059 pkern_start = parms.virt_kstart;
2060 pkern_end = parms.virt_kend;
2061 pkern_entry = parms.virt_entry;
2063 //printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n",p_start,pkern_start,pkern_end,pkern_entry);
2065 if ( (p_start & (PAGE_SIZE-1)) != 0 )
2067 printk("Initial guest OS must load to a page boundary.\n");
2068 return -EINVAL;
2071 pstart_info = PAGE_ALIGN(pkern_end);
2072 if(initrd_start && initrd_len){
2073 unsigned long offset;
2075 /* The next page-aligned boundary after the start info.
2076 Note: EFI_PAGE_SHIFT = 12 <= PAGE_SHIFT */
2077 pinitrd_start = pstart_info + PAGE_SIZE;
2079 if ((pinitrd_start + initrd_len - phys_load_offset) >= dom0_size)
2080 panic("%s: not enough memory assigned to dom0", __func__);
2082 for (offset = 0; offset < initrd_len; offset += PAGE_SIZE) {
2083 struct page_info *p;
2084 p = assign_new_domain_page(d, pinitrd_start + offset);
2085 if (p == NULL)
2086 panic("%s: can't allocate page for initrd image", __func__);
2087 if (initrd_len < offset + PAGE_SIZE)
2088 memcpy(page_to_virt(p), (void*)(initrd_start + offset),
2089 initrd_len - offset);
2090 else
2091 copy_page(page_to_virt(p), (void*)(initrd_start + offset));
2095 printk("METAPHYSICAL MEMORY ARRANGEMENT:\n"
2096 " Kernel image: %lx->%lx\n"
2097 " Entry address: %lx\n"
2098 " Init. ramdisk: %lx len %lx\n"
2099 " Start info.: %lx->%lx\n",
2100 pkern_start, pkern_end, pkern_entry, pinitrd_start, initrd_len,
2101 pstart_info, pstart_info + PAGE_SIZE);
2103 if ( (pkern_end - pkern_start) > (max_pages * PAGE_SIZE) )
2105 printk("Initial guest OS requires too much space\n"
2106 "(%luMB is greater than %luMB limit)\n",
2107 (pkern_end-pkern_start)>>20,
2108 (max_pages <<PAGE_SHIFT)>>20);
2109 return -ENOMEM;
2112 // if high 3 bits of pkern start are non-zero, error
2114 // if pkern end is after end of metaphysical memory, error
2115 // (we should be able to deal with this... later)
2117 /* Mask all upcalls... */
2118 for ( i = 1; i < MAX_VIRT_CPUS; i++ )
2119 d->shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
2121 if (dom0_max_vcpus == 0)
2122 dom0_max_vcpus = MAX_VIRT_CPUS;
2123 if (dom0_max_vcpus > num_online_cpus())
2124 dom0_max_vcpus = num_online_cpus();
2125 if (dom0_max_vcpus > MAX_VIRT_CPUS)
2126 dom0_max_vcpus = MAX_VIRT_CPUS;
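/*
 * Example (hypothetical values): dom0_max_vcpus=8 on a 4-CPU machine
 * is clamped to 4 by the checks above; dom0_max_vcpus=0 first becomes
 * MAX_VIRT_CPUS and is then clamped the same way.
 */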
2128 printk ("Dom0 max_vcpus=%d\n", dom0_max_vcpus);
2129 for ( i = 1; i < dom0_max_vcpus; i++ )
2130 if (alloc_vcpu(d, i, i) == NULL)
2131 panic("Cannot allocate dom0 vcpu %d\n", i);
2133 /* Copy the OS image. */
2134 loaddomainelfimage(d, &elf, phys_load_offset);
2136 BUILD_BUG_ON(sizeof(start_info_t) + sizeof(dom0_vga_console_info_t) +
2137 sizeof(struct ia64_boot_param) > PAGE_SIZE);
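/*
 * Layout of the single start-info page populated below (offsets are
 * relative to si):
 *
 *     [ start_info_t | struct ia64_boot_param | dom0_vga_console_info_t ]
 *
 * bp is placed at si + sizeof(start_info_t) and ci right after bp,
 * which is exactly the sum the BUILD_BUG_ON above proves fits in one
 * page.
 */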
2139 /* Set up start info area. */
2140 d->shared_info->arch.start_info_pfn = pstart_info >> PAGE_SHIFT;
2141 start_info_page = assign_new_domain_page(d, pstart_info);
2142 if (start_info_page == NULL)
2143 panic("can't allocate start info page");
2144 si = page_to_virt(start_info_page);
2145 clear_page(si);
2146 snprintf(si->magic, sizeof(si->magic), "xen-3.0-ia64");
2147 si->nr_pages = max_pages;
2148 si->flags = SIF_INITDOMAIN|SIF_PRIVILEGED;
2150 printk("Dom0: 0x%lx\n", (u64)dom0);
2152 v->is_initialised = 1;
2153 clear_bit(_VPF_down, &v->pause_flags);
2155 /* Build firmware.
2156 Note: the Linux kernel reserves the memory used by start_info, so
2157 there is no need to remove it from the MDT. */
2158 bp_mpa = pstart_info + sizeof(struct start_info);
2159 rc = dom_fw_setup(d, bp_mpa, max_pages * PAGE_SIZE);
2160 if (rc != 0)
2161 return rc;
2163 /* Fill boot param. */
2164 strlcpy((char *)si->cmd_line, dom0_command_line, sizeof(si->cmd_line));
2166 bp = (struct ia64_boot_param *)((unsigned char *)si +
2167 sizeof(start_info_t));
2168 bp->command_line = pstart_info + offsetof (start_info_t, cmd_line);
2170 /* We assume the console has reached the last line. */
2171 bp->console_info.num_cols = ia64_boot_param->console_info.num_cols;
2172 bp->console_info.num_rows = ia64_boot_param->console_info.num_rows;
2173 bp->console_info.orig_x = 0;
2174 bp->console_info.orig_y = bp->console_info.num_rows == 0 ?
2175 0 : bp->console_info.num_rows - 1;
2177 bp->initrd_start = pinitrd_start;
2178 bp->initrd_size = ia64_boot_param->initrd_size;
2180 ci = (dom0_vga_console_info_t *)((unsigned char *)si +
2181 sizeof(start_info_t) +
2182 sizeof(struct ia64_boot_param));
2184 if (fill_console_start_info(ci)) {
2185 si->console.dom0.info_off = sizeof(start_info_t) +
2186 sizeof(struct ia64_boot_param);
2187 si->console.dom0.info_size = sizeof(dom0_vga_console_info_t);
2190 vcpu_init_regs (v);
2192 vcpu_regs(v)->r28 = bp_mpa;
2194 vcpu_regs (v)->cr_iip = pkern_entry;
2196 physdev_init_dom0(d);
2198 return 0;
2201 void machine_restart(void)
2202 {
2203 console_start_sync();
2204 if (running_on_sim)
2205 printk ("machine_restart called. spinning...\n");
2206 else
2207 (*efi.reset_system)(EFI_RESET_WARM,0,0,NULL);
2208 while(1);
2211 extern void cpu_halt(void);
2213 void machine_halt(void)
2214 {
2215 console_start_sync();
2217 #ifdef CONFIG_SMP
2218 smp_send_stop();
2219 #endif
2221 printk ("machine_halt called. spinning...\n");
2222 while(1);
2225 void sync_vcpu_execstate(struct vcpu *v)
2226 {
2227 // __ia64_save_fpu(v->arch._thread.fph);
2228 // FIXME SMP: Anything else needed here for SMP?
2231 /* This function is taken from xen/arch/x86/domain.c */
2232 long
2233 arch_do_vcpu_op(int cmd, struct vcpu *v, XEN_GUEST_HANDLE(void) arg)
2234 {
2235 long rc = 0;
2237 switch (cmd) {
2238 case VCPUOP_register_runstate_memory_area:
2239 {
2240 struct vcpu_register_runstate_memory_area area;
2241 struct vcpu_runstate_info runstate;
2243 rc = -EFAULT;
2244 if (copy_from_guest(&area, arg, 1))
2245 break;
2247 if (!guest_handle_okay(area.addr.h, 1))
2248 break;
2250 rc = 0;
2251 runstate_guest(v) = area.addr.h;
2253 if (v == current) {
2254 __copy_to_guest(runstate_guest(v), &v->runstate, 1);
2255 } else {
2256 vcpu_runstate_get(v, &runstate);
2257 __copy_to_guest(runstate_guest(v), &runstate, 1);
2260 break;
2262 default:
2263 rc = -ENOSYS;
2264 break;
2267 return rc;
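/*
 * Guest-side sketch of the registration handled above (wrapper names
 * and the buffer are illustrative; the exact calls depend on the guest
 * OS):
 *
 *     struct vcpu_register_runstate_memory_area area;
 *     set_xen_guest_handle(area.addr.h, &my_runstate_buffer);
 *     HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
 *                        vcpu_id, &area);
 *
 * after which Xen keeps the buffer updated as in the v == current /
 * else branches above.
 */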
2270 static void __init parse_dom0_mem(char *s)
2271 {
2272 dom0_size = parse_size_and_unit(s, NULL);
2274 custom_param("dom0_mem", parse_dom0_mem);
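/*
 * Usage sketch (values are illustrative): adding dom0_mem=512M to the
 * Xen command line sets dom0_size before calc_dom0_size() clamps it,
 * while dom0_mem=0 asks for all available memory, as handled in
 * calc_dom0_size() above. Size suffixes such as M and K are accepted,
 * e.g. dom0_mem=65536K.
 */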
2276 /*
2277 * Helper for the identity-mapping optimization feature: translate a
2278 * XEN_IA64_OPTF_IDENT_MAP_REG* sub-command into its *_FLG mask bit.
2279 */
2280 static inline unsigned long
2281 optf_identity_mapping_cmd_to_flg(unsigned long cmd)
2282 {
2283 switch(cmd) {
2284 case XEN_IA64_OPTF_IDENT_MAP_REG7:
2285 return XEN_IA64_OPTF_IDENT_MAP_REG7_FLG;
2286 case XEN_IA64_OPTF_IDENT_MAP_REG4:
2287 return XEN_IA64_OPTF_IDENT_MAP_REG4_FLG;
2288 case XEN_IA64_OPTF_IDENT_MAP_REG5:
2289 return XEN_IA64_OPTF_IDENT_MAP_REG5_FLG;
2290 default:
2291 BUG();
2292 return 0;
2295 /* NOTREACHED */
2298 static inline void
2299 optf_set_identity_mapping(unsigned long* mask, struct identity_mapping* im,
2300 struct xen_ia64_opt_feature* f)
2301 {
2302 unsigned long flag = optf_identity_mapping_cmd_to_flg(f->cmd);
2304 if (f->on) {
2305 *mask |= flag;
2306 im->pgprot = f->pgprot;
2307 im->key = f->key;
2308 } else {
2309 *mask &= ~flag;
2310 im->pgprot = 0;
2311 im->key = 0;
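/*
 * The *_FLG bit stored in the mask here is what the rest of the
 * identity-mapping code is expected to test; a consumer would look
 * roughly like this (use_identity_mapping() is a placeholder, not a
 * function in this file):
 *
 *     if (d->arch.opt_feature.mask & XEN_IA64_OPTF_IDENT_MAP_REG7_FLG)
 *             use_identity_mapping(&d->arch.opt_feature.im_reg7);
 */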
2315 /*
2316 * Switch an optimization feature on/off. The other vcpus are paused
2317 * while opt_feature is updated, to avoid racy access.
2318 */
2319 int
2320 domain_opt_feature(struct domain *d, struct xen_ia64_opt_feature* f)
2321 {
2322 struct opt_feature* optf = &d->arch.opt_feature;
2323 struct vcpu *v;
2324 long rc = 0;
2326 for_each_vcpu(d, v) {
2327 if (v != current)
2328 vcpu_pause(v);
2331 switch (f->cmd) {
2332 case XEN_IA64_OPTF_IDENT_MAP_REG4:
2333 optf_set_identity_mapping(&optf->mask, &optf->im_reg4, f);
2334 break;
2335 case XEN_IA64_OPTF_IDENT_MAP_REG5:
2336 optf_set_identity_mapping(&optf->mask, &optf->im_reg5, f);
2337 break;
2338 case XEN_IA64_OPTF_IDENT_MAP_REG7:
2339 optf_set_identity_mapping(&optf->mask, &optf->im_reg7, f);
2340 break;
2341 default:
2342 printk("%s: unknown opt_feature: %ld\n", __func__, f->cmd);
2343 rc = -ENOSYS;
2344 break;
2347 for_each_vcpu(d, v) {
2348 if (v != current)
2349 vcpu_unpause(v);
2352 return rc;
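/*
 * Caller-side sketch (field values are illustrative): the hypercall
 * path that reaches domain_opt_feature() would pass something like
 *
 *     struct xen_ia64_opt_feature f = {
 *             .cmd    = XEN_IA64_OPTF_IDENT_MAP_REG7,
 *             .on     = 1,
 *             .pgprot = pgprot_for_region7,  (assumed value)
 *             .key    = protection_key,      (assumed value)
 *     };
 *     rc = domain_opt_feature(d, &f);
 *
 * and the REG4/REG5/REG7 cases above record the request in
 * d->arch.opt_feature.
 */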