ia64/xen-unstable

view xen/arch/x86/domain.c @ 1776:c2f673cea5e4

bitkeeper revision 1.1072.1.1 (40f4e51fLMgcKX4Sn6FNYePX6EqkGA)

Merge http://xen.bkbits.net:8080/xeno-unstable.bk
into gandalf.hpl.hp.com:/var/bk/xeno-unstable.bk
author xenbk@gandalf.hpl.hp.com
date Wed Jul 14 07:47:43 2004 +0000 (2004-07-14)
parents cd887a8fa08a c4061a2a3309
children e91945007886
line source
1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
14 #include <xen/config.h>
15 #include <xen/lib.h>
16 #include <xen/errno.h>
17 #include <xen/sched.h>
18 #include <xen/smp.h>
19 #include <xen/delay.h>
20 #include <xen/softirq.h>
21 #include <asm/ptrace.h>
22 #include <asm/mc146818rtc.h>
23 #include <asm/system.h>
24 #include <asm/io.h>
25 #include <asm/processor.h>
26 #include <asm/desc.h>
27 #include <asm/i387.h>
28 #include <asm/mpspec.h>
29 #include <asm/ldt.h>
30 #include <xen/irq.h>
31 #include <xen/event.h>
32 #include <asm/shadow.h>
33 #include <xen/console.h>
34 #include <xen/elf.h>
36 #if !defined(CONFIG_X86_64BITMODE)
37 /* No ring-3 access in initial page tables. */
38 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
39 #else
40 /* Allow ring-3 access in long mode as guest cannot use ring 1. */
41 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
42 #endif
43 #define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
44 #define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
45 #define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
47 #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
48 #define round_pgdown(_p) ((_p)&PAGE_MASK)
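/*
 * Illustrative values (assuming the usual 4kB pages, PAGE_SIZE == 0x1000):
 *   round_pgup(0x12345)   == 0x13000
 *   round_pgdown(0x12345) == 0x12000
 */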
50 int hlt_counter;
52 void disable_hlt(void)
53 {
54 hlt_counter++;
55 }
57 void enable_hlt(void)
58 {
59 hlt_counter--;
60 }
62 /*
63 * We use this if we don't have any better
64 * idle routine..
65 */
66 static void default_idle(void)
67 {
68 if ( hlt_counter == 0 )
69 {
70 __cli();
71 if ( !softirq_pending(smp_processor_id()) )
72 safe_halt();
73 else
74 __sti();
75 }
76 }
78 void continue_cpu_idle_loop(void)
79 {
80 int cpu = smp_processor_id();
81 for ( ; ; )
82 {
83 irq_stat[cpu].idle_timestamp = jiffies;
84 while ( !softirq_pending(cpu) )
85 default_idle();
86 do_softirq();
87 }
88 }
90 void startup_cpu_idle_loop(void)
91 {
92 /* Just some sanity to ensure that the scheduler is set up okay. */
93 ASSERT(current->domain == IDLE_DOMAIN_ID);
94 domain_unpause_by_systemcontroller(current);
95 __enter_scheduler();
97 /*
98 * Declare CPU setup done to the boot processor; the memory barrier
99 * below ensures that the new state is visible before we proceed.
100 */
101 smp_mb();
102 init_idle();
104 continue_cpu_idle_loop();
105 }
107 static long no_idt[2];
108 static int reboot_mode;
109 int reboot_thru_bios = 0;
111 #ifdef CONFIG_SMP
112 int reboot_smp = 0;
113 static int reboot_cpu = -1;
114 /* shamelessly grabbed from lib/vsprintf.c for readability */
115 #define is_digit(c) ((c) >= '0' && (c) <= '9')
116 #endif
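/*
 * kb_wait() polls the 8042 keyboard controller: status port 0x64 keeps
 * bit 1 (0x02) set while its input buffer is still full, so we spin
 * until it drains (bounded at 0x10000 polls) before sending a command.
 */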
119 static inline void kb_wait(void)
120 {
121 int i;
123 for (i=0; i<0x10000; i++)
124 if ((inb_p(0x64) & 0x02) == 0)
125 break;
126 }
129 void machine_restart(char * __unused)
130 {
131 extern int opt_noreboot;
132 #ifdef CONFIG_SMP
133 int cpuid;
134 #endif
136 if ( opt_noreboot )
137 {
138 printk("Reboot disabled on cmdline: require manual reset\n");
139 for ( ; ; ) __asm__ __volatile__ ("hlt");
140 }
142 #ifdef CONFIG_SMP
143 cpuid = GET_APIC_ID(apic_read(APIC_ID));
145 /* KAF: Need interrupts enabled for safe IPI. */
146 __sti();
148 if (reboot_smp) {
150 /* Check that reboot_cpu is valid;
151 if it's not, default to the BSP. */
152 if ((reboot_cpu == -1) ||
153 (reboot_cpu > (NR_CPUS -1)) ||
154 !(phys_cpu_present_map & (1<<cpuid)))
155 reboot_cpu = boot_cpu_physical_apicid;
157 reboot_smp = 0; /* use this as a flag to only go through this once */
158 /* Re-run this function on the other CPUs. Each of them
159 will fall through this section, since we have cleared
160 reboot_smp, and will do the reboot if it is the correct
161 CPU; otherwise it halts. */
162 if (reboot_cpu != cpuid)
163 smp_call_function((void *)machine_restart , NULL, 1, 0);
164 }
166 /* If reboot_cpu is still -1, then we want a traditional reboot,
167 and if we are not running on the reboot_cpu, halt. */
168 if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) {
169 for (;;)
170 __asm__ __volatile__ ("hlt");
171 }
172 /*
173 * Stop all CPUs and turn off local APICs and the IO-APIC, so
174 * other OSs see a clean IRQ state.
175 */
176 smp_send_stop();
177 disable_IO_APIC();
178 #endif
180 if(!reboot_thru_bios) {
181 /* rebooting needs to touch the page at absolute addr 0 */
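/* 0x472 is the BIOS 'reset flag' word in the BIOS data area; the value
 * written here (reboot_mode) selects a warm or cold boot after reset. */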
182 *((unsigned short *)__va(0x472)) = reboot_mode;
183 for (;;) {
184 int i;
185 for (i=0; i<100; i++) {
186 kb_wait();
187 udelay(50);
188 outb(0xfe,0x64); /* pulse reset low */
189 udelay(50);
190 }
191 /* That didn't work - force a triple fault.. */
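/* no_idt is an all-zero descriptor, so loading it leaves the CPU with
 * an empty IDT; the INT3 below then cannot be delivered and the fault
 * cascade ends in a triple fault, which resets the processor. */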
192 __asm__ __volatile__("lidt %0": "=m" (no_idt));
193 __asm__ __volatile__("int3");
194 }
195 }
197 panic("Need to reinclude BIOS reboot code\n");
198 }
200 void machine_halt(void)
201 {
202 machine_restart(0);
203 }
205 void machine_power_off(void)
206 {
207 machine_restart(0);
208 }
210 void arch_do_createdomain(struct domain *d)
211 {
212 d->shared_info = (void *)get_free_page();
213 memset(d->shared_info, 0, PAGE_SIZE);
214 SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d);
215 machine_to_phys_mapping[virt_to_phys(d->shared_info) >>
216 PAGE_SHIFT] = 0x80000000UL; /* debug */
218 d->mm.perdomain_pt = (l1_pgentry_t *)get_free_page();
219 memset(d->mm.perdomain_pt, 0, PAGE_SIZE);
220 machine_to_phys_mapping[virt_to_phys(d->mm.perdomain_pt) >>
221 PAGE_SHIFT] = 0x0fffdeadUL; /* debug */
222 }
224 void arch_final_setup_guestos(struct domain *p, full_execution_context_t *c)
225 {
226 unsigned long phys_basetab;
227 int i;
229 clear_bit(DF_DONEFPUINIT, &p->flags);
230 if ( c->flags & ECF_I387_VALID )
231 set_bit(DF_DONEFPUINIT, &p->flags);
232 memcpy(&p->shared_info->execution_context,
233 &c->cpu_ctxt,
234 sizeof(p->shared_info->execution_context));
235 memcpy(&p->thread.i387,
236 &c->fpu_ctxt,
237 sizeof(p->thread.i387));
238 memcpy(p->thread.traps,
239 &c->trap_ctxt,
240 sizeof(p->thread.traps));
241 #ifdef ARCH_HAS_FAST_TRAP
242 SET_DEFAULT_FAST_TRAP(&p->thread);
243 (void)set_fast_trap(p, c->fast_trap_idx);
244 #endif
245 p->mm.ldt_base = c->ldt_base;
246 p->mm.ldt_ents = c->ldt_ents;
247 SET_GDT_ENTRIES(p, DEFAULT_GDT_ENTRIES);
248 SET_GDT_ADDRESS(p, DEFAULT_GDT_ADDRESS);
249 if ( c->gdt_ents != 0 )
250 (void)set_gdt(p,
251 c->gdt_frames,
252 c->gdt_ents);
253 p->thread.guestos_ss = c->guestos_ss;
254 p->thread.guestos_sp = c->guestos_esp;
255 for ( i = 0; i < 8; i++ )
256 (void)set_debugreg(p, i, c->debugreg[i]);
257 p->event_selector = c->event_callback_cs;
258 p->event_address = c->event_callback_eip;
259 p->failsafe_selector = c->failsafe_callback_cs;
260 p->failsafe_address = c->failsafe_callback_eip;
262 phys_basetab = c->pt_base;
263 p->mm.pagetable = mk_pagetable(phys_basetab);
264 get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], p,
265 PGT_base_page_table);
266 }
268 #if defined(__i386__)
270 void new_thread(struct domain *p,
271 unsigned long start_pc,
272 unsigned long start_stack,
273 unsigned long start_info)
274 {
275 execution_context_t *ec = &p->shared_info->execution_context;
277 /*
278 * Initial register values:
279 * DS,ES,FS,GS = FLAT_RING1_DS
280 * CS:EIP = FLAT_RING1_CS:start_pc
281 * SS:ESP = FLAT_RING1_DS:start_stack
282 * ESI = start_info
283 * [EAX,EBX,ECX,EDX,EDI,EBP are zero]
284 */
285 ec->ds = ec->es = ec->fs = ec->gs = ec->ss = FLAT_RING1_DS;
286 ec->cs = FLAT_RING1_CS;
287 ec->eip = start_pc;
288 ec->esp = start_stack;
289 ec->esi = start_info;
291 __save_flags(ec->eflags);
292 ec->eflags |= X86_EFLAGS_IF;
294 /* No fast trap at start of day. */
295 SET_DEFAULT_FAST_TRAP(&p->thread);
296 }
299 /*
300 * This special macro can be used to load a debugging register
301 */
302 #define loaddebug(thread,register) \
303 __asm__("movl %0,%%db" #register \
304 : /* no output */ \
305 :"r" (thread->debugreg[register]))
308 void switch_to(struct domain *prev_p, struct domain *next_p)
309 {
310 struct thread_struct *next = &next_p->thread;
311 struct tss_struct *tss = init_tss + smp_processor_id();
312 execution_context_t *stack_ec = get_execution_context();
313 int i;
315 __cli();
317 /* Switch guest general-register state. */
318 if ( !is_idle_task(prev_p) )
319 {
320 memcpy(&prev_p->shared_info->execution_context,
321 stack_ec,
322 sizeof(*stack_ec));
323 unlazy_fpu(prev_p);
324 CLEAR_FAST_TRAP(&prev_p->thread);
325 }
327 if ( !is_idle_task(next_p) )
328 {
329 memcpy(stack_ec,
330 &next_p->shared_info->execution_context,
331 sizeof(*stack_ec));
333 /*
334 * This is sufficient! If the descriptor DPL differs from CS RPL then
335 * we'll #GP. If DS, ES, FS, GS are DPL 0 then they'll be cleared
336 * automatically. If SS RPL or DPL differs from CS RPL then we'll #GP.
337 */
338 if ( (stack_ec->cs & 3) == 0 )
339 stack_ec->cs = FLAT_RING1_CS;
340 if ( (stack_ec->ss & 3) == 0 )
341 stack_ec->ss = FLAT_RING1_DS;
343 SET_FAST_TRAP(&next_p->thread);
345 /* Switch the guest OS ring-1 stack. */
346 tss->esp1 = next->guestos_sp;
347 tss->ss1 = next->guestos_ss;
349 /* Maybe switch the debug registers. */
350 if ( unlikely(next->debugreg[7]) )
351 {
352 loaddebug(next, 0);
353 loaddebug(next, 1);
354 loaddebug(next, 2);
355 loaddebug(next, 3);
356 /* no 4 and 5 */
357 loaddebug(next, 6);
358 loaddebug(next, 7);
359 }
361 /* Switch page tables. */
362 write_ptbase(&next_p->mm);
363 tlb_clocktick();
364 }
366 if ( unlikely(prev_p->io_bitmap != NULL) ||
367 unlikely(next_p->io_bitmap != NULL) )
368 {
369 if ( next_p->io_bitmap != NULL )
370 {
371 /* Copy in the appropriate parts of the IO bitmap. We use the
372 * selector to copy only the interesting parts of the bitmap. */
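/* Each bit of io_bitmap_sel covers IOBMP_SELBIT_LWORDS longwords of the
 * TSS bitmap; a clear bit marks a region that has been modified (some
 * ports granted) and therefore needs copying or refilling with 0xFF. */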
374 u64 old_sel = ~0ULL; /* IO bitmap selector for previous task. */
376 if ( prev_p->io_bitmap != NULL)
377 {
378 old_sel = prev_p->io_bitmap_sel;
380 /* Replace any areas of the IO bitmap that had bits cleared. */
381 for ( i = 0; i < sizeof(prev_p->io_bitmap_sel) * 8; i++ )
382 if ( !test_bit(i, &prev_p->io_bitmap_sel) )
383 memcpy(&tss->io_bitmap[i * IOBMP_SELBIT_LWORDS],
384 &next_p->io_bitmap[i * IOBMP_SELBIT_LWORDS],
385 IOBMP_SELBIT_LWORDS * sizeof(unsigned long));
386 }
388 /* Copy in any regions of the new task's bitmap that have bits
389 * clear and that we haven't already dealt with above. */
390 for ( i = 0; i < sizeof(prev_p->io_bitmap_sel) * 8; i++ )
391 {
392 if ( test_bit(i, &old_sel)
393 && !test_bit(i, &next_p->io_bitmap_sel) )
394 memcpy(&tss->io_bitmap[i * IOBMP_SELBIT_LWORDS],
395 &next_p->io_bitmap[i * IOBMP_SELBIT_LWORDS],
396 IOBMP_SELBIT_LWORDS * sizeof(unsigned long));
397 }
399 tss->bitmap = IO_BITMAP_OFFSET;
401 }
402 else
403 {
404 /* In this case, we're switching FROM a task with IO port access,
405 * to a task that doesn't use the IO bitmap. We set any TSS bits
406 * that might have been cleared, ready for future use. */
407 for ( i = 0; i < sizeof(prev_p->io_bitmap_sel) * 8; i++ )
408 if ( !test_bit(i, &prev_p->io_bitmap_sel) )
409 memset(&tss->io_bitmap[i * IOBMP_SELBIT_LWORDS],
410 0xFF, IOBMP_SELBIT_LWORDS * sizeof(unsigned long));
412 /*
413 * a bitmap offset pointing outside of the TSS limit
414 * causes a nicely controllable SIGSEGV if a process
415 * tries to use a port IO instruction. The first
416 * sys_ioperm() call sets up the bitmap properly.
417 */
418 tss->bitmap = INVALID_IO_BITMAP_OFFSET;
419 }
420 }
422 set_current(next_p);
424 /* Switch GDT and LDT. */
425 __asm__ __volatile__ ("lgdt %0" : "=m" (*next_p->mm.gdt));
426 load_LDT(next_p);
428 __sti();
429 }
432 /* XXX Currently the 'domain' field is ignored! XXX */
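/* IOPL occupies EFLAGS bits 12-13, hence the 0xffffcfff mask below. */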
433 long do_iopl(domid_t domain, unsigned int new_io_pl)
434 {
435 execution_context_t *ec = get_execution_context();
436 ec->eflags = (ec->eflags & 0xffffcfff) | ((new_io_pl&3) << 12);
437 return 0;
438 }
440 #endif
442 void domain_relinquish_memory(struct domain *d)
443 {
444 struct list_head *ent, *tmp;
445 struct pfn_info *page;
446 unsigned long x, y;
448 /* Ensure that no one is running over the dead domain's page tables. */
449 synchronise_pagetables(~0UL);
451 /* Exit shadow mode before deconstructing final guest page table. */
452 shadow_mode_disable(d);
454 /* Drop the in-use reference to the page-table base. */
455 if ( pagetable_val(d->mm.pagetable) != 0 )
456 put_page_and_type(&frame_table[pagetable_val(d->mm.pagetable) >>
457 PAGE_SHIFT]);
459 /*
460 * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
461 * it automatically gets squashed when the guest's mappings go away.
462 */
463 destroy_gdt(d);
465 /* Relinquish Xen-heap pages. Currently this can only be 'shared_info'. */
466 page = virt_to_page(d->shared_info);
467 if ( test_and_clear_bit(_PGC_allocated, &page->count_and_flags) )
468 put_page(page);
470 /* Relinquish all pages on the domain's allocation list. */
471 spin_lock_recursive(&d->page_alloc_lock); /* may enter free_domain_page */
472 list_for_each_safe ( ent, tmp, &d->page_list )
473 {
474 page = list_entry(ent, struct pfn_info, list);
476 if ( test_and_clear_bit(_PGC_guest_pinned, &page->count_and_flags) )
477 put_page_and_type(page);
479 if ( test_and_clear_bit(_PGC_allocated, &page->count_and_flags) )
480 put_page(page);
482 /*
483 * Forcibly invalidate base page tables at this point to break circular
484 * 'linear page table' references. This is okay because MMU structures
485 * are not shared across domains and this domain is now dead. Thus base
486 * tables are not in use so a non-zero count means circular reference.
487 */
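/* For pages still holding a validated base-table type: snapshot the
 * type word, cmpxchg in the value with PGT_validated cleared, retry if
 * it changed under our feet, and on success let free_page_type() tear
 * the table down, dropping its circular references. */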
488 y = page->type_and_flags;
489 do {
490 x = y;
491 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
492 (PGT_base_page_table|PGT_validated)) )
493 break;
494 y = cmpxchg(&page->type_and_flags, x, x & ~PGT_validated);
495 if ( likely(y == x) )
496 free_page_type(page, PGT_base_page_table);
497 }
498 while ( unlikely(y != x) );
499 }
500 spin_unlock_recursive(&d->page_alloc_lock);
501 }
504 int construct_dom0(struct domain *p,
505 unsigned long alloc_start,
506 unsigned long alloc_end,
507 char *image_start, unsigned long image_len,
508 char *initrd_start, unsigned long initrd_len,
509 char *cmdline)
510 {
511 char *dst;
512 int i, rc;
513 unsigned long pfn, mfn;
514 unsigned long nr_pages = (alloc_end - alloc_start) >> PAGE_SHIFT;
515 unsigned long nr_pt_pages;
516 unsigned long count;
517 l2_pgentry_t *l2tab, *l2start;
518 l1_pgentry_t *l1tab = NULL, *l1start = NULL;
519 struct pfn_info *page = NULL;
520 start_info_t *si;
522 /*
523 * This fully describes the memory layout of the initial domain. All
524 * *_start addresses are page-aligned, except v_start (and v_end) which are
525 * superpage-aligned.
526 */
527 unsigned long v_start;
528 unsigned long vkern_start;
529 unsigned long vkern_entry;
530 unsigned long vkern_end;
531 unsigned long vinitrd_start;
532 unsigned long vinitrd_end;
533 unsigned long vphysmap_start;
534 unsigned long vphysmap_end;
535 unsigned long vstartinfo_start;
536 unsigned long vstartinfo_end;
537 unsigned long vstack_start;
538 unsigned long vstack_end;
539 unsigned long vpt_start;
540 unsigned long vpt_end;
541 unsigned long v_end;
543 /* Machine address of next candidate page-table page. */
544 unsigned long mpt_alloc;
546 extern void physdev_init_dom0(struct domain *);
548 /* Sanity! */
549 if ( p->domain != 0 )
550 BUG();
551 if ( test_bit(DF_CONSTRUCTED, &p->flags) )
552 BUG();
554 printk("*** LOADING DOMAIN 0 ***\n");
556 /*
557 * This is all a bit grim. We've moved the modules to the "safe" physical
558 * memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this
559 * routine we're going to copy them down into the region that's actually
560 * been allocated to domain 0. The regions are highly likely to overlap,
561 * so we use a forward copy.
562 *
563 * MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with
564 * 4GB and lots of network/disk cards that allocate loads of buffers.
565 * We'll have to revisit this if we ever support PAE (64GB).
566 */
568 rc = readelfimage_base_and_size(image_start, image_len,
569 &vkern_start, &vkern_end, &vkern_entry);
570 if ( rc != 0 )
571 return rc;
573 /*
574 * Why do we need this? The number of page-table frames depends on the
575 * size of the bootstrap address space. But the size of the address space
576 * depends on the number of page-table frames (since each one is mapped
577 * read-only). We have a pair of simultaneous equations in two unknowns,
578 * which we solve by exhaustive search.
579 */
580 for ( nr_pt_pages = 2; ; nr_pt_pages++ )
581 {
582 v_start = vkern_start & ~((1<<22)-1);
583 vinitrd_start = round_pgup(vkern_end);
584 vinitrd_end = vinitrd_start + initrd_len;
585 vphysmap_start = round_pgup(vinitrd_end);
586 vphysmap_end = vphysmap_start + (nr_pages * sizeof(unsigned long));
587 vpt_start = round_pgup(vphysmap_end);
588 vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE);
589 vstartinfo_start = vpt_end;
590 vstartinfo_end = vstartinfo_start + PAGE_SIZE;
591 vstack_start = vstartinfo_end;
592 vstack_end = vstack_start + PAGE_SIZE;
593 v_end = (vstack_end + (1<<22)-1) & ~((1<<22)-1);
594 if ( (v_end - vstack_end) < (512 << 10) )
595 v_end += 1 << 22; /* Add extra 4MB to get >= 512kB padding. */
596 if ( (((v_end - v_start) >> L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
597 break;
598 }
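/*
 * Worked example (illustrative figures only): with a 4MB-aligned v_start
 * and roughly 8MB of kernel + ramdisk + physmap, v_end rounds up to
 * v_start + 12MB, i.e. three 4MB superpages. Mapping that needs three
 * L1 frames plus the L2 frame, so the test fails while nr_pt_pages < 4
 * and the loop terminates with nr_pt_pages == 4.
 */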
600 if ( (v_end - v_start) > (nr_pages * PAGE_SIZE) )
601 {
602 printk("Initial guest OS requires too much space\n"
603 "(%luMB is greater than %luMB limit)\n",
604 (v_end-v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
605 return -ENOMEM;
606 }
608 printk("PHYSICAL MEMORY ARRANGEMENT:\n"
609 " Kernel image: %p->%p\n"
610 " Initrd image: %p->%p\n"
611 " Dom0 alloc.: %08lx->%08lx\n",
612 image_start, image_start + image_len,
613 initrd_start, initrd_start + initrd_len,
614 alloc_start, alloc_end);
615 printk("VIRTUAL MEMORY ARRANGEMENT:\n"
616 " Loaded kernel: %08lx->%08lx\n"
617 " Init. ramdisk: %08lx->%08lx\n"
618 " Phys-Mach map: %08lx->%08lx\n"
619 " Page tables: %08lx->%08lx\n"
620 " Start info: %08lx->%08lx\n"
621 " Boot stack: %08lx->%08lx\n"
622 " TOTAL: %08lx->%08lx\n",
623 vkern_start, vkern_end,
624 vinitrd_start, vinitrd_end,
625 vphysmap_start, vphysmap_end,
626 vpt_start, vpt_end,
627 vstartinfo_start, vstartinfo_end,
628 vstack_start, vstack_end,
629 v_start, v_end);
630 printk(" ENTRY ADDRESS: %08lx\n", vkern_entry);
632 /*
633 * Protect the lowest 1GB of memory. We use a temporary mapping there
634 * from which we copy the kernel and ramdisk images.
635 */
636 if ( v_start < (1<<30) )
637 {
638 printk("Initial loading isn't allowed to lowest 1GB of memory.\n");
639 return -EINVAL;
640 }
642 /* Construct a frame-allocation list for the initial domain. */
643 for ( mfn = (alloc_start>>PAGE_SHIFT);
644 mfn < (alloc_end>>PAGE_SHIFT);
645 mfn++ )
646 {
647 page = &frame_table[mfn];
648 page->u.domain = p;
649 page->type_and_flags = 0;
650 page->count_and_flags = PGC_allocated | 1;
651 list_add_tail(&page->list, &p->page_list);
652 p->tot_pages++; p->max_pages++;
653 }
655 mpt_alloc = (vpt_start - v_start) + alloc_start;
657 SET_GDT_ENTRIES(p, DEFAULT_GDT_ENTRIES);
658 SET_GDT_ADDRESS(p, DEFAULT_GDT_ADDRESS);
660 /*
661 * We're basically forcing default RPLs to 1, so that our "what privilege
662 * level are we returning to?" logic works.
663 */
664 p->failsafe_selector = FLAT_GUESTOS_CS;
665 p->event_selector = FLAT_GUESTOS_CS;
666 p->thread.guestos_ss = FLAT_GUESTOS_DS;
667 for ( i = 0; i < 256; i++ )
668 p->thread.traps[i].cs = FLAT_GUESTOS_CS;
670 /* WARNING: The new domain must have its 'processor' field filled in! */
671 l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
672 memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
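/* The LINEAR_PT slot below points this L2 page back at itself; that
 * recursive mapping exposes every page-table page of the new domain
 * through the LINEAR_PT_VIRT_START virtual window. */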
673 l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
674 mk_l2_pgentry((unsigned long)l2start | __PAGE_HYPERVISOR);
675 l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
676 mk_l2_pgentry(__pa(p->mm.perdomain_pt) | __PAGE_HYPERVISOR);
677 p->mm.pagetable = mk_pagetable((unsigned long)l2start);
679 l2tab += l2_table_offset(v_start);
680 mfn = alloc_start >> PAGE_SHIFT;
681 for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
682 {
683 if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
684 {
685 l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
686 mpt_alloc += PAGE_SIZE;
687 *l2tab++ = mk_l2_pgentry((unsigned long)l1start | L2_PROT);
688 clear_page(l1tab);
689 }
690 *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT);
692 page = &frame_table[mfn];
693 set_bit(_PGC_tlb_flush_on_type_change, &page->count_and_flags);
694 if ( !get_page_and_type(page, p, PGT_writeable_page) )
695 BUG();
697 mfn++;
698 }
700 /* Pages that are part of page tables must be read only. */
701 l2tab = l2start + l2_table_offset(vpt_start);
702 l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
703 l1tab += l1_table_offset(vpt_start);
704 l2tab++;
705 for ( count = 0; count < nr_pt_pages; count++ )
706 {
707 *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
708 page = &frame_table[l1_pgentry_to_pagenr(*l1tab)];
709 if ( count == 0 )
710 {
711 page->type_and_flags &= ~PGT_type_mask;
712 page->type_and_flags |= PGT_l2_page_table;
713 get_page(page, p); /* an extra ref because of readable mapping */
714 /* Get another ref to L2 page so that it can be pinned. */
715 if ( !get_page_and_type(page, p, PGT_l2_page_table) )
716 BUG();
717 set_bit(_PGC_guest_pinned, &page->count_and_flags);
718 }
719 else
720 {
721 page->type_and_flags &= ~PGT_type_mask;
722 page->type_and_flags |= PGT_l1_page_table;
723 get_page(page, p); /* an extra ref because of readable mapping */
724 }
725 l1tab++;
726 if( !((unsigned long)l1tab & (PAGE_SIZE - 1)) )
727 l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
728 }
730 /* Set up shared-info area. */
731 update_dom_time(p->shared_info);
732 p->shared_info->domain_time = 0;
733 /* Mask all upcalls... */
734 for ( i = 0; i < MAX_VIRT_CPUS; i++ )
735 p->shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
737 /* Install the new page tables. */
738 __cli();
739 write_ptbase(&p->mm);
741 /* Copy the OS image. */
742 (void)loadelfimage(image_start);
744 /* Copy the initial ramdisk. */
745 if ( initrd_len != 0 )
746 memcpy((void *)vinitrd_start, initrd_start, initrd_len);
748 /* Set up start info area. */
749 si = (start_info_t *)vstartinfo_start;
750 memset(si, 0, PAGE_SIZE);
751 si->nr_pages = p->tot_pages;
752 si->shared_info = virt_to_phys(p->shared_info);
753 si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
754 si->pt_base = vpt_start;
755 si->nr_pt_frames = nr_pt_pages;
756 si->mfn_list = vphysmap_start;
758 /* Write the phys->machine and machine->phys table entries. */
759 for ( mfn = (alloc_start>>PAGE_SHIFT);
760 mfn < (alloc_end>>PAGE_SHIFT);
761 mfn++ )
762 {
763 pfn = mfn - (alloc_start>>PAGE_SHIFT);
764 ((unsigned long *)vphysmap_start)[pfn] = mfn;
765 machine_to_phys_mapping[mfn] = pfn;
766 }
768 if ( initrd_len != 0 )
769 {
770 si->mod_start = vinitrd_start;
771 si->mod_len = initrd_len;
772 printk("Initrd len 0x%lx, start at 0x%08lx\n",
773 si->mod_len, si->mod_start);
774 }
776 dst = si->cmd_line;
777 if ( cmdline != NULL )
778 {
779 for ( i = 0; i < 255; i++ )
780 {
781 if ( cmdline[i] == '\0' )
782 break;
783 *dst++ = cmdline[i];
784 }
785 }
786 *dst = '\0';
788 /* Reinstate the caller's page tables. */
789 write_ptbase(&current->mm);
790 __sti();
792 /* Destroy low mappings - they were only for our convenience. */
793 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
794 if ( l2_pgentry_val(l2start[i]) & _PAGE_PSE )
795 l2start[i] = mk_l2_pgentry(0);
796 zap_low_mappings(); /* Do the same for the idle page tables. */
798 /* Give up the VGA console if DOM0 is configured to grab it. */
799 console_endboot(strstr(cmdline, "tty0") != NULL);
801 /* DOM0 gets access to everything. */
802 physdev_init_dom0(p);
804 set_bit(DF_CONSTRUCTED, &p->flags);
806 #if 0 /* XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) */
807 shadow_mode_enable(&p->mm, SHM_test);
808 #endif
810 new_thread(p, vkern_entry, vstack_end, vstartinfo_start);
812 return 0;
813 }