ia64/xen-unstable

xen/common/domain.c @ 1376:a3dd7bf2fcd6

bitkeeper revision 1.902 (40a34b40gzZ-ypwQ3HUMqdzjtQqqfQ)

domain.c:
init shadow page table spin lock earlier
author iap10@labyrinth.cl.cam.ac.uk
date Thu May 13 10:17:36 2004 +0000 (2004-05-13)
parents 0fab6364d23b
children acc04d188782
line source
1 #include <xen/config.h>
2 #include <xen/init.h>
3 #include <xen/lib.h>
4 #include <xen/errno.h>
5 #include <xen/sched.h>
6 #include <xen/mm.h>
7 #include <xen/skbuff.h>
8 #include <xen/interrupt.h>
9 #include <xen/delay.h>
10 #include <xen/event.h>
11 #include <xen/time.h>
12 #include <xen/shadow.h>
13 #include <hypervisor-ifs/dom0_ops.h>
14 #include <asm/io.h>
15 #include <asm/domain_page.h>
16 #include <asm/flushtlb.h>
17 #include <asm/msr.h>
18 #include <xen/blkdev.h>
19 #include <xen/console.h>
20 #include <xen/vbd.h>
21 #include <asm/i387.h>
22 #include <xen/shadow.h>
24 #ifdef CONFIG_X86_64BITMODE
25 #define ELFSIZE 64
26 #else
27 #define ELFSIZE 32
28 #endif
29 #include <xen/elf.h>
31 #if !defined(CONFIG_X86_64BITMODE)
32 /* No ring-3 access in initial page tables. */
33 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
34 #else
35 /* Allow ring-3 access in long mode as guest cannot use ring 1. */
36 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
37 #endif
38 #define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
39 #define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
40 #define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
42 #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
43 #define round_pgdown(_p) ((_p)&PAGE_MASK)
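/* Illustrative example (assuming 4kB pages, i.e. PAGE_SHIFT == 12):
 * round_pgup(0x1234) == 0x2000 and round_pgdown(0x1234) == 0x1000. */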
45 /* Both these structures are protected by the tasklist_lock. */
46 rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;
47 struct task_struct *task_hash[TASK_HASH_SIZE];
48 struct task_struct *task_list;
50 struct task_struct *do_createdomain(domid_t dom_id, unsigned int cpu)
51 {
52 char buf[100];
53 struct task_struct *p, **pp;
54 unsigned long flags;
56 if ( (p = alloc_task_struct()) == NULL )
57 return NULL;
59 atomic_set(&p->refcnt, 1);
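/* The shadow page-table lock is initialised here, before the IDLE_DOMAIN_ID
 * branch below, so it is valid for every domain including the idle domain
 * (this is the "init shadow page table spin lock earlier" change named in
 * the changeset description above). */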
61 spin_lock_init(&p->mm.shadow_lock);
63 p->domain = dom_id;
64 p->processor = cpu;
65 p->create_time = NOW();
67 memcpy(&p->thread, &idle0_task.thread, sizeof(p->thread));
69 if ( p->domain != IDLE_DOMAIN_ID )
70 {
71 if ( init_event_channels(p) != 0 )
72 {
73 free_task_struct(p);
74 return NULL;
75 }
77 /* We use a large intermediate to avoid overflow in sprintf. */
78 sprintf(buf, "Domain-%llu", dom_id);
79 strncpy(p->name, buf, MAX_DOMAIN_NAME);
80 p->name[MAX_DOMAIN_NAME-1] = '\0';
82 spin_lock_init(&p->blk_ring_lock);
84 p->addr_limit = USER_DS;
86 spin_lock_init(&p->page_list_lock);
87 INIT_LIST_HEAD(&p->page_list);
88 p->max_pages = p->tot_pages = 0;
90 p->shared_info = (void *)get_free_page(GFP_KERNEL);
91 memset(p->shared_info, 0, PAGE_SIZE);
92 SHARE_PFN_WITH_DOMAIN(virt_to_page(p->shared_info), p);
94 machine_to_phys_mapping[virt_to_phys(p->shared_info) >> PAGE_SHIFT] =
95 0x80000000UL; // set m2p table to magic marker (helps debug)
97 p->mm.perdomain_pt = (l1_pgentry_t *)get_free_page(GFP_KERNEL);
98 memset(p->mm.perdomain_pt, 0, PAGE_SIZE);
100 machine_to_phys_mapping[virt_to_phys(p->mm.perdomain_pt) >> PAGE_SHIFT] =
101 0x0fffdeadUL; // set m2p table to magic marker (helps debug)
103 init_blkdev_info(p);
105 /* Per-domain PCI-device list. */
106 spin_lock_init(&p->pcidev_lock);
107 INIT_LIST_HEAD(&p->pcidev_list);
109 write_lock_irqsave(&tasklist_lock, flags);
110 pp = &task_list; /* NB. task_list is maintained in order of dom_id. */
111 for ( pp = &task_list; *pp != NULL; pp = &(*pp)->next_list )
112 if ( (*pp)->domain > p->domain )
113 break;
114 p->next_list = *pp;
115 *pp = p;
116 p->next_hash = task_hash[TASK_HASH(dom_id)];
117 task_hash[TASK_HASH(dom_id)] = p;
118 write_unlock_irqrestore(&tasklist_lock, flags);
119 }
120 else
121 {
122 sprintf(p->name, "Idle-%d", cpu);
123 }
125 sched_add_domain(p);
127 return p;
128 }
131 struct task_struct *find_domain_by_id(domid_t dom)
132 {
133 struct task_struct *p;
134 unsigned long flags;
136 read_lock_irqsave(&tasklist_lock, flags);
137 p = task_hash[TASK_HASH(dom)];
138 while ( p != NULL )
139 {
140 if ( p->domain == dom )
141 {
142 get_task_struct(p);
143 break;
144 }
145 p = p->next_hash;
146 }
147 read_unlock_irqrestore(&tasklist_lock, flags);
149 return p;
150 }
153 /* return the most recent domain created */
154 struct task_struct *find_last_domain(void)
155 {
156 struct task_struct *p, *plast;
157 unsigned long flags;
159 read_lock_irqsave(&tasklist_lock, flags);
160 plast = task_list;
161 p = plast->next_list;
162 while ( p != NULL )
163 {
164 if ( p->create_time > plast->create_time )
165 plast = p;
166 p = p->next_list;
167 }
168 get_task_struct(plast);
169 read_unlock_irqrestore(&tasklist_lock, flags);
171 return plast;
172 }
175 void kill_domain_with_errmsg(const char *err)
176 {
177 printk("DOM%llu FATAL ERROR: %s\n", current->domain, err);
178 kill_domain();
179 }
182 void __kill_domain(struct task_struct *p)
183 {
184 int i;
185 struct task_struct **pp;
186 unsigned long flags;
188 if ( p->domain == 0 )
189 {
190 extern void machine_restart(char *);
191 printk("Domain 0 killed: rebooting machine!\n");
192 machine_restart(0);
193 }
195 /* Only allow the domain to be destroyed once. */
196 if ( !sched_rem_domain(p) )
197 return;
199 DPRINTK("Killing domain %llu\n", p->domain);
201 unlink_blkdev_info(p);
203 for ( i = 0; i < MAX_DOMAIN_VIFS; i++ )
204 unlink_net_vif(p->net_vif_list[i]);
206 destroy_event_channels(p);
208 delete_all_domain_vfr_rules(p);
210 /*
211 * Note this means that find_domain_by_id may fail, even when the caller
212 * holds a reference to the domain being queried. Take care!
213 */
214 write_lock_irqsave(&tasklist_lock, flags);
215 pp = &task_list; /* Delete from task_list. */
216 while ( *pp != p )
217 pp = &(*pp)->next_list;
218 *pp = p->next_list;
219 pp = &task_hash[TASK_HASH(p->domain)]; /* Delete from task_hash. */
220 while ( *pp != p )
221 pp = &(*pp)->next_hash;
222 *pp = p->next_hash;
223 write_unlock_irqrestore(&tasklist_lock, flags);
225 if ( atomic_read(&p->refcnt) > 2 )
226 DPRINTK("Domain refcnt > 2 so kill deferred. Missing put_task? p=%p cur=%p cnt=%d\n", p, current, atomic_read(&p->refcnt));
229 if ( p == current )
230 {
231 __enter_scheduler();
232 BUG(); /* never get here */
233 }
234 else
235 {
236 put_task_struct(p);
237 }
238 }
241 void kill_domain(void)
242 {
243 __kill_domain(current);
244 }
247 long kill_other_domain(domid_t dom, int force)
248 {
249 struct task_struct *p;
251 if ( (p = find_domain_by_id(dom)) == NULL )
252 return -ESRCH;
254 if ( p->state == TASK_STOPPED )
255 __kill_domain(p);
256 else if ( force )
257 send_hyp_event(p, _HYP_EVENT_DIE);
258 else
259 send_guest_virq(p, VIRQ_DIE);
261 put_task_struct(p);
262 return 0;
263 }
265 void stop_domain(void)
266 {
267 memcpy(&current->shared_info->execution_context,
268 get_execution_context(),
269 sizeof(execution_context_t));
270 unlazy_fpu(current);
271 wmb(); /* All CPUs must see saved info in state TASK_STOPPED. */
272 set_current_state(TASK_STOPPED);
273 __enter_scheduler();
274 }
276 long stop_other_domain(domid_t dom)
277 {
278 struct task_struct *p;
280 if ( dom == 0 )
281 return -EINVAL;
283 p = find_domain_by_id(dom);
284 if ( p == NULL ) return -ESRCH;
286 if ( p->state != TASK_STOPPED )
287 send_guest_virq(p, VIRQ_STOP);
289 put_task_struct(p);
290 return 0;
291 }
293 struct pfn_info *alloc_domain_page(struct task_struct *p)
294 {
295 struct pfn_info *page = NULL;
296 unsigned long flags, mask, pfn_stamp, cpu_stamp;
297 int i;
299 spin_lock_irqsave(&free_list_lock, flags);
300 if ( likely(!list_empty(&free_list)) )
301 {
302 page = list_entry(free_list.next, struct pfn_info, list);
303 list_del(&page->list);
304 free_pfns--;
305 }
306 spin_unlock_irqrestore(&free_list_lock, flags);
308 if ( unlikely(page == NULL) )
309 return NULL;
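/* page->u.cpu_mask records CPUs that may still hold stale TLB entries for
 * this frame. A CPU is skipped if it has flushed since the frame's
 * tlbflush_timestamp; any CPUs still in the mask are flushed before the
 * frame is handed out (best-effort only when called from IRQ context). */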
311 if ( (mask = page->u.cpu_mask) != 0 )
312 {
313 pfn_stamp = page->tlbflush_timestamp;
314 for ( i = 0; (mask != 0) && (i < NR_CPUS); i++ )
315 {
316 if ( mask & (1<<i) )
317 {
318 cpu_stamp = tlbflush_time[i];
319 if ( !NEED_FLUSH(cpu_stamp, pfn_stamp) )
320 mask &= ~(1<<i);
321 }
322 }
324 if ( unlikely(mask != 0) )
325 {
326 /* In IRQ ctxt, flushing is best-effort only, to avoid deadlock. */
327 if ( likely(!in_irq()) )
328 flush_tlb_mask(mask);
329 else if ( unlikely(!try_flush_tlb_mask(mask)) )
330 goto free_and_exit;
331 perfc_incrc(need_flush_tlb_flush);
332 }
333 }
335 page->u.domain = p;
336 page->type_and_flags = 0;
337 if ( p != NULL )
338 {
339 if ( unlikely(in_irq()) )
340 BUG();
341 wmb(); /* Domain pointer must be visible before updating refcnt. */
342 spin_lock(&p->page_list_lock);
343 if ( unlikely(p->tot_pages >= p->max_pages) )
344 {
345 DPRINTK("Over-allocation for domain %llu: %u >= %u\n",
346 p->domain, p->tot_pages, p->max_pages);
347 spin_unlock(&p->page_list_lock);
348 goto free_and_exit;
349 }
350 list_add_tail(&page->list, &p->page_list);
351 p->tot_pages++;
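/* count_and_flags packs the PGC_* flags with a reference count in the low
 * bits: PGC_allocated | 1 marks the frame as guest-allocated with one
 * initial reference. */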
352 page->count_and_flags = PGC_allocated | 1;
353 spin_unlock(&p->page_list_lock);
354 }
356 return page;
358 free_and_exit:
359 spin_lock_irqsave(&free_list_lock, flags);
360 list_add(&page->list, &free_list);
361 free_pfns++;
362 spin_unlock_irqrestore(&free_list_lock, flags);
363 return NULL;
364 }
366 void free_domain_page(struct pfn_info *page)
367 {
368 unsigned long flags;
369 struct task_struct *p = page->u.domain;
371 if ( unlikely(in_irq()) )
372 BUG();
374 if ( likely(!IS_XEN_HEAP_FRAME(page)) )
375 {
376 /*
377 * No race with setting of zombie bit. If it wasn't set before the
378 * last reference was dropped, then it can't be set now.
379 */
380 page->u.cpu_mask = 0;
381 if ( !(page->count_and_flags & PGC_zombie) )
382 {
383 page->tlbflush_timestamp = tlbflush_clock;
384 if (p)
385 {
386 page->u.cpu_mask = 1 << p->processor;
387 spin_lock(&p->page_list_lock);
388 list_del(&page->list);
389 p->tot_pages--;
390 spin_unlock(&p->page_list_lock);
391 }
392 }
394 page->count_and_flags = 0;
396 spin_lock_irqsave(&free_list_lock, flags);
397 list_add(&page->list, &free_list);
398 free_pfns++;
399 spin_unlock_irqrestore(&free_list_lock, flags);
400 }
401 else
402 {
403 /*
404 * No need for a TLB flush. Non-domain pages are always co-held by Xen,
405 * and the Xen reference is not dropped until the domain is dead.
406 * DOM0 may hold references, but it's trusted so no need to flush.
407 */
408 page->u.cpu_mask = 0;
409 page->count_and_flags = 0;
410 free_page((unsigned long)page_to_virt(page));
411 }
412 }
415 void free_all_dom_mem(struct task_struct *p)
416 {
417 struct list_head *ent, zombies;
418 struct pfn_info *page;
419 unsigned long x, y;
421 INIT_LIST_HEAD(&zombies);
423 if ( p->mm.shadow_mode ) shadow_mode_disable(p);
425 /* STEP 1. Drop the in-use reference to the page-table base. */
426 put_page_and_type(&frame_table[pagetable_val(p->mm.pagetable) >>
427 PAGE_SHIFT]);
429 /* STEP 2. Zombify all pages on the domain's allocation list. */
430 spin_lock(&p->page_list_lock);
431 while ( (ent = p->page_list.next) != &p->page_list )
432 {
433 page = list_entry(ent, struct pfn_info, list);
435 if ( unlikely(!get_page(page, p)) )
436 {
437 /*
438 * Another CPU has dropped the last reference and is responsible
439 * for removing the page from this list. Wait for them to do so.
440 */
441 spin_unlock(&p->page_list_lock);
442 while ( p->page_list.next == ent )
443 barrier();
444 spin_lock(&p->page_list_lock);
445 continue;
446 }
448 set_bit(_PGC_zombie, &page->count_and_flags);
450 list_del(&page->list);
451 p->tot_pages--;
453 list_add(&page->list, &zombies);
454 }
455 spin_unlock(&p->page_list_lock);
457 /*
458 * STEP 3. With the domain's list lock now released, we examine each zombie
459 * page and drop references for guest-allocated and/or type-pinned pages.
460 */
461 while ( (ent = zombies.next) != &zombies )
462 {
463 page = list_entry(ent, struct pfn_info, list);
465 list_del(&page->list);
467 if ( test_and_clear_bit(_PGC_guest_pinned, &page->count_and_flags) )
468 put_page_and_type(page);
470 if ( test_and_clear_bit(_PGC_allocated, &page->count_and_flags) )
471 put_page(page);
473 /*
474 * Forcibly invalidate base page tables at this point to break circular
475 * 'linear page table' references. This is okay because MMU structures
476 * are not shared across domains and this domain is now dead. Thus base
477 * tables are not in use so a non-zero count means circular reference.
478 */
479 y = page->type_and_flags;
480 do {
481 x = y;
482 if ( likely((x & (PGT_type_mask|PGT_validated)) !=
483 (PGT_base_page_table|PGT_validated)) )
484 break;
485 y = cmpxchg(&page->type_and_flags, x, x & ~PGT_validated);
486 if ( likely(y == x) )
487 free_page_type(page, PGT_base_page_table);
488 }
489 while ( unlikely(y != x) );
491 put_page(page);
492 }
493 }
496 unsigned int alloc_new_dom_mem(struct task_struct *p, unsigned int kbytes)
497 {
498 unsigned int alloc_pfns, nr_pages;
499 struct pfn_info *page;
501 nr_pages = (kbytes + ((PAGE_SIZE-1)>>10)) >> (PAGE_SHIFT - 10);
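/* With 4kB pages this is (kbytes + 3) >> 2, i.e. the request rounded up to
 * a whole number of pages (illustrative; the exact form depends on PAGE_SHIFT). */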
502 p->max_pages = nr_pages; /* this can now be controlled independently */
504 /* grow the allocation if necessary */
505 for ( alloc_pfns = p->tot_pages; alloc_pfns < nr_pages; alloc_pfns++ )
506 {
507 if ( unlikely((page=alloc_domain_page(p)) == NULL) ||
508 unlikely(free_pfns < (SLACK_DOMAIN_MEM_KILOBYTES >>
509 (PAGE_SHIFT-10))) )
510 {
511 free_all_dom_mem(p);
512 return -ENOMEM;
513 }
515 /* Initialise the machine_to_phys_mapping table entry to the likely pfn. */
516 machine_to_phys_mapping[page-frame_table] = alloc_pfns;
517 }
519 p->tot_pages = nr_pages;
521 return 0;
522 }
525 /* Release resources belonging to task @p. */
526 void release_task(struct task_struct *p)
527 {
528 ASSERT(p->state == TASK_DYING);
529 ASSERT(!p->has_cpu);
531 DPRINTK("Releasing task %llu\n", p->domain);
533 /*
534 * This frees up blkdev rings and vbd-access lists. Totally safe since
535 * blkdev ref counting actually uses the task_struct refcnt.
536 */
537 destroy_blkdev_info(p);
539 /* Free all memory associated with this domain. */
540 free_page((unsigned long)p->mm.perdomain_pt);
541 UNSHARE_PFN(virt_to_page(p->shared_info));
542 free_all_dom_mem(p);
544 free_task_struct(p);
545 }
548 /*
549 * final_setup_guestos is used for final setup and launching of domains other
550 * than domain 0. ie. the domains that are being built by the userspace dom0
551 * domain builder.
552 */
553 int final_setup_guestos(struct task_struct *p, dom0_builddomain_t *builddomain)
554 {
555 unsigned long phys_basetab;
556 int i, rc = 0;
557 full_execution_context_t *c;
559 if ( (c = kmalloc(sizeof(*c), GFP_KERNEL)) == NULL )
560 return -ENOMEM;
562 if ( test_bit(PF_CONSTRUCTED, &p->flags) )
563 {
564 rc = -EINVAL;
565 goto out;
566 }
568 if ( copy_from_user(c, builddomain->ctxt, sizeof(*c)) )
569 {
570 rc = -EFAULT;
571 goto out;
572 }
574 clear_bit(PF_DONEFPUINIT, &p->flags);
575 if ( c->flags & ECF_I387_VALID )
576 set_bit(PF_DONEFPUINIT, &p->flags);
577 memcpy(&p->shared_info->execution_context,
578 &c->cpu_ctxt,
579 sizeof(p->shared_info->execution_context));
580 memcpy(&p->thread.i387,
581 &c->fpu_ctxt,
582 sizeof(p->thread.i387));
583 memcpy(p->thread.traps,
584 &c->trap_ctxt,
585 sizeof(p->thread.traps));
586 #ifdef ARCH_HAS_FAST_TRAP
587 SET_DEFAULT_FAST_TRAP(&p->thread);
588 (void)set_fast_trap(p, c->fast_trap_idx);
589 #endif
590 p->mm.ldt_base = c->ldt_base;
591 p->mm.ldt_ents = c->ldt_ents;
592 SET_GDT_ENTRIES(p, DEFAULT_GDT_ENTRIES);
593 SET_GDT_ADDRESS(p, DEFAULT_GDT_ADDRESS);
594 if ( c->gdt_ents != 0 )
595 (void)set_gdt(p,
596 c->gdt_frames,
597 c->gdt_ents);
598 p->thread.guestos_ss = c->guestos_ss;
599 p->thread.guestos_sp = c->guestos_esp;
600 for ( i = 0; i < 8; i++ )
601 (void)set_debugreg(p, i, c->debugreg[i]);
602 p->event_selector = c->event_callback_cs;
603 p->event_address = c->event_callback_eip;
604 p->failsafe_selector = c->failsafe_callback_cs;
605 p->failsafe_address = c->failsafe_callback_eip;
607 phys_basetab = c->pt_base;
608 p->mm.pagetable = mk_pagetable(phys_basetab);
609 get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], p,
610 PGT_base_page_table);
612 /* Set up the shared info structure. */
613 update_dom_time(p->shared_info);
615 /* Add virtual network interfaces and point to them in startinfo. */
616 while ( builddomain->num_vifs-- > 0 )
617 (void)create_net_vif(p->domain);
619 set_bit(PF_CONSTRUCTED, &p->flags);
621 out:
622 if (c) kfree(c);
624 return rc;
625 }
627 static inline int is_loadable_phdr(Elf_Phdr *phdr)
628 {
629 return ((phdr->p_type == PT_LOAD) &&
630 ((phdr->p_flags & (PF_W|PF_X)) != 0));
631 }
633 static int readelfimage_base_and_size(char *elfbase,
634 unsigned long elfsize,
635 unsigned long *pkernstart,
636 unsigned long *pkernend,
637 unsigned long *pkernentry)
638 {
639 Elf_Ehdr *ehdr = (Elf_Ehdr *)elfbase;
640 Elf_Phdr *phdr;
641 Elf_Shdr *shdr;
642 unsigned long kernstart = ~0UL, kernend=0UL;
643 char *shstrtab, *guestinfo;
644 int h;
646 if ( !IS_ELF(*ehdr) )
647 {
648 printk("Kernel image does not have an ELF header.\n");
649 return -EINVAL;
650 }
652 if ( (ehdr->e_phoff + (ehdr->e_phnum * ehdr->e_phentsize)) > elfsize )
653 {
654 printk("ELF program headers extend beyond end of image.\n");
655 return -EINVAL;
656 }
658 if ( (ehdr->e_shoff + (ehdr->e_shnum * ehdr->e_shentsize)) > elfsize )
659 {
660 printk("ELF section headers extend beyond end of image.\n");
661 return -EINVAL;
662 }
664 /* Find the section-header strings table. */
665 if ( ehdr->e_shstrndx == SHN_UNDEF )
666 {
667 printk("ELF image has no section-header strings table (shstrtab).\n");
668 return -EINVAL;
669 }
670 shdr = (Elf_Shdr *)(elfbase + ehdr->e_shoff +
671 (ehdr->e_shstrndx*ehdr->e_shentsize));
672 shstrtab = elfbase + shdr->sh_offset;
674 /* Find the special '__xen_guest' section and check its contents. */
675 for ( h = 0; h < ehdr->e_shnum; h++ )
676 {
677 shdr = (Elf_Shdr *)(elfbase + ehdr->e_shoff + (h*ehdr->e_shentsize));
678 if ( strcmp(&shstrtab[shdr->sh_name], "__xen_guest") != 0 )
679 continue;
680 guestinfo = elfbase + shdr->sh_offset;
681 printk("Xen-ELF header found: '%s'\n", guestinfo);
682 if ( (strstr(guestinfo, "GUEST_OS=linux") == NULL) ||
683 (strstr(guestinfo, "XEN_VER=1.3") == NULL) )
684 {
685 printk("ERROR: Xen will only load Linux built for Xen v1.3\n");
686 return -EINVAL;
687 }
688 break;
689 }
690 if ( h == ehdr->e_shnum )
691 {
692 printk("Not a Xen-ELF image: '__xen_guest' section not found.\n");
693 return -EINVAL;
694 }
696 for ( h = 0; h < ehdr->e_phnum; h++ )
697 {
698 phdr = (Elf_Phdr *)(elfbase + ehdr->e_phoff + (h*ehdr->e_phentsize));
699 if ( !is_loadable_phdr(phdr) )
700 continue;
701 if ( phdr->p_vaddr < kernstart )
702 kernstart = phdr->p_vaddr;
703 if ( (phdr->p_vaddr + phdr->p_memsz) > kernend )
704 kernend = phdr->p_vaddr + phdr->p_memsz;
705 }
707 if ( (kernstart > kernend) ||
708 (ehdr->e_entry < kernstart) ||
709 (ehdr->e_entry > kernend) )
710 {
711 printk("Malformed ELF image.\n");
712 return -EINVAL;
713 }
715 *pkernstart = kernstart;
716 *pkernend = kernend;
717 *pkernentry = ehdr->e_entry;
719 return 0;
720 }
722 static int loadelfimage(char *elfbase)
723 {
724 Elf_Ehdr *ehdr = (Elf_Ehdr *)elfbase;
725 Elf_Phdr *phdr;
726 int h;
728 for ( h = 0; h < ehdr->e_phnum; h++ )
729 {
730 phdr = (Elf_Phdr *)(elfbase + ehdr->e_phoff + (h*ehdr->e_phentsize));
731 if ( !is_loadable_phdr(phdr) )
732 continue;
733 if ( phdr->p_filesz != 0 )
734 memcpy((char *)phdr->p_vaddr, elfbase + phdr->p_offset,
735 phdr->p_filesz);
736 if ( phdr->p_memsz > phdr->p_filesz )
737 memset((char *)phdr->p_vaddr + phdr->p_filesz, 0,
738 phdr->p_memsz - phdr->p_filesz);
739 }
741 return 0;
742 }
744 int construct_dom0(struct task_struct *p,
745 unsigned long alloc_start,
746 unsigned long alloc_end,
747 unsigned int num_vifs,
748 char *image_start, unsigned long image_len,
749 char *initrd_start, unsigned long initrd_len,
750 char *cmdline)
751 {
752 char *dst;
753 int i, rc;
754 unsigned long pfn, mfn;
755 unsigned long nr_pages = (alloc_end - alloc_start) >> PAGE_SHIFT;
756 unsigned long nr_pt_pages;
757 unsigned long count;
758 l2_pgentry_t *l2tab, *l2start;
759 l1_pgentry_t *l1tab = NULL, *l1start = NULL;
760 struct pfn_info *page = NULL;
761 start_info_t *si;
763 /*
764 * This fully describes the memory layout of the initial domain. All
765 * *_start address are page-aligned, except v_start (and v_end) which are
766 * superpage-aligned.
767 */
768 unsigned long v_start;
769 unsigned long vkern_start;
770 unsigned long vkern_entry;
771 unsigned long vkern_end;
772 unsigned long vinitrd_start;
773 unsigned long vinitrd_end;
774 unsigned long vphysmap_start;
775 unsigned long vphysmap_end;
776 unsigned long vstartinfo_start;
777 unsigned long vstartinfo_end;
778 unsigned long vstack_start;
779 unsigned long vstack_end;
780 unsigned long vpt_start;
781 unsigned long vpt_end;
782 unsigned long v_end;
784 /* Machine address of next candidate page-table page. */
785 unsigned long mpt_alloc;
787 extern void physdev_init_dom0(struct task_struct *);
789 #ifndef NO_DEVICES_IN_XEN
790 extern void ide_probe_devices(xen_disk_info_t *);
791 extern void scsi_probe_devices(xen_disk_info_t *);
792 extern void cciss_probe_devices(xen_disk_info_t *);
793 xen_disk_info_t xdi;
794 xen_disk_t *xd;
795 #endif
797 /* Sanity! */
798 if ( p->domain != 0 )
799 BUG();
800 if ( test_bit(PF_CONSTRUCTED, &p->flags) )
801 BUG();
803 printk("*** LOADING DOMAIN 0 ***\n");
805 /*
806 * This is all a bit grim. We've moved the modules to the "safe" physical
807 * memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this
808 * routine we're going to copy it down into the region that's actually
809 * been allocated to domain 0. This is highly likely to be overlapping, so
810 * we use a forward copy.
811 *
812 * MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with
813 * 4GB and lots of network/disk cards that allocate loads of buffers.
814 * We'll have to revisit this if we ever support PAE (64GB).
815 */
817 rc = readelfimage_base_and_size(image_start, image_len,
818 &vkern_start, &vkern_end, &vkern_entry);
819 if ( rc != 0 )
820 return rc;
822 /*
823 * Why do we need this? The number of page-table frames depends on the
824 * size of the bootstrap address space. But the size of the address space
825 * depends on the number of page-table frames (since each one is mapped
826 * read-only). We have a pair of simultaneous equations in two unknowns,
827 * which we solve by exhaustive search.
828 */
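/* The search converges quickly: each extra candidate page-table frame adds
 * only 4kB to the mapped region while (on x86-32 without PAE, where
 * L2_PAGETABLE_SHIFT == 22) each L1 frame maps 4MB; the "+1" in the test
 * below accounts for the L2 frame itself. E.g. an 8MB bootstrap footprint
 * needs 2 L1 frames plus the L2, i.e. nr_pt_pages == 3 (illustrative figures). */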
829 for ( nr_pt_pages = 2; ; nr_pt_pages++ )
830 {
831 v_start = vkern_start & ~((1<<22)-1);
832 vinitrd_start = round_pgup(vkern_end);
833 vinitrd_end = vinitrd_start + initrd_len;
834 vphysmap_start = round_pgup(vinitrd_end);
835 vphysmap_end = vphysmap_start + (nr_pages * sizeof(unsigned long));
836 vpt_start = round_pgup(vphysmap_end);
837 vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE);
838 vstartinfo_start = vpt_end;
839 vstartinfo_end = vstartinfo_start + PAGE_SIZE;
840 vstack_start = vstartinfo_end;
841 vstack_end = vstack_start + PAGE_SIZE;
842 v_end = (vstack_end + (1<<22)-1) & ~((1<<22)-1);
843 if ( (v_end - vstack_end) < (512 << 10) )
844 v_end += 1 << 22; /* Add extra 4MB to get >= 512kB padding. */
845 if ( (((v_end - v_start) >> L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
846 break;
847 }
849 if ( (v_end - v_start) > (nr_pages * PAGE_SIZE) )
850 {
851 printk("Initial guest OS requires too much space\n"
852 "(%luMB is greater than %luMB limit)\n",
853 (v_end-v_start)>>20, (nr_pages<<PAGE_SHIFT)>>20);
854 return -ENOMEM;
855 }
857 printk("PHYSICAL MEMORY ARRANGEMENT:\n"
858 " Kernel image: %p->%p\n"
859 " Initrd image: %p->%p\n"
860 " Dom0 alloc.: %08lx->%08lx\n",
861 image_start, image_start + image_len,
862 initrd_start, initrd_start + initrd_len,
863 alloc_start, alloc_end);
864 printk("VIRTUAL MEMORY ARRANGEMENT:\n"
865 " Loaded kernel: %08lx->%08lx\n"
866 " Init. ramdisk: %08lx->%08lx\n"
867 " Phys-Mach map: %08lx->%08lx\n"
868 " Page tables: %08lx->%08lx\n"
869 " Start info: %08lx->%08lx\n"
870 " Boot stack: %08lx->%08lx\n"
871 " TOTAL: %08lx->%08lx\n",
872 vkern_start, vkern_end,
873 vinitrd_start, vinitrd_end,
874 vphysmap_start, vphysmap_end,
875 vpt_start, vpt_end,
876 vstartinfo_start, vstartinfo_end,
877 vstack_start, vstack_end,
878 v_start, v_end);
879 printk(" ENTRY ADDRESS: %08lx\n", vkern_entry);
881 /*
882 * Protect the lowest 1GB of memory. We use a temporary mapping there
883 * from which we copy the kernel and ramdisk images.
884 */
885 if ( v_start < (1<<30) )
886 {
887 printk("Initial loading isn't allowed to lowest 1GB of memory.\n");
888 return -EINVAL;
889 }
891 /* Construct a frame-allocation list for the initial domain. */
892 for ( pfn = (alloc_start>>PAGE_SHIFT);
893 pfn < (alloc_end>>PAGE_SHIFT);
894 pfn++ )
895 {
896 page = &frame_table[pfn];
897 page->u.domain = p;
898 page->type_and_flags = 0;
899 page->count_and_flags = PGC_allocated | 1;
900 list_add_tail(&page->list, &p->page_list);
901 p->tot_pages++; p->max_pages++;
902 }
904 mpt_alloc = (vpt_start - v_start) + alloc_start;
906 SET_GDT_ENTRIES(p, DEFAULT_GDT_ENTRIES);
907 SET_GDT_ADDRESS(p, DEFAULT_GDT_ADDRESS);
909 /*
910 * We're basically forcing default RPLs to 1, so that our "what privilege
911 * level are we returning to?" logic works.
912 */
913 p->failsafe_selector = FLAT_GUESTOS_CS;
914 p->event_selector = FLAT_GUESTOS_CS;
915 p->thread.guestos_ss = FLAT_GUESTOS_DS;
916 for ( i = 0; i < 256; i++ )
917 p->thread.traps[i].cs = FLAT_GUESTOS_CS;
919 /* WARNING: The new domain must have its 'processor' field filled in! */
920 l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
921 memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
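/* Two special L2 slots are then filled in: the linear-page-table slot maps
 * the L2 back onto itself (so page-table entries are reachable through
 * virtual addresses), and the per-domain slot maps this domain's
 * perdomain_pt. */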
922 l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
923 mk_l2_pgentry((unsigned long)l2start | __PAGE_HYPERVISOR);
924 l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
925 mk_l2_pgentry(__pa(p->mm.perdomain_pt) | __PAGE_HYPERVISOR);
926 p->mm.pagetable = mk_pagetable((unsigned long)l2start);
928 l2tab += l2_table_offset(v_start);
929 mfn = alloc_start >> PAGE_SHIFT;
930 for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
931 {
932 if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
933 {
934 l1start = l1tab = (l1_pgentry_t *)mpt_alloc;
935 mpt_alloc += PAGE_SIZE;
936 *l2tab++ = mk_l2_pgentry((unsigned long)l1start | L2_PROT);
937 clear_page(l1tab);
938 }
939 *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT);
941 page = &frame_table[mfn];
942 set_bit(_PGC_tlb_flush_on_type_change, &page->count_and_flags);
943 if ( !get_page_and_type(page, p, PGT_writeable_page) )
944 BUG();
946 mfn++;
947 }
949 /* Pages that are part of page tables must be read only. */
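/* The first of the nr_pt_pages frames is the L2 table itself: it is typed
 * as an L2 table, pinned, and given an extra general reference for its (now
 * read-only) virtual mapping. The remaining frames are L1 tables, each of
 * which also gets one extra general reference for its read-only mapping. */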
950 l2tab = l2start + l2_table_offset(vpt_start);
951 l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
952 l1tab += l1_table_offset(vpt_start);
953 l2tab++;
954 for ( count = 0; count < nr_pt_pages; count++ )
955 {
956 *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
957 page = &frame_table[l1_pgentry_to_pagenr(*l1tab)];
958 if ( count == 0 )
959 {
960 page->type_and_flags &= ~PGT_type_mask;
961 page->type_and_flags |= PGT_l2_page_table;
962 get_page(page, p); /* an extra ref because of readable mapping */
963 /* Get another ref to L2 page so that it can be pinned. */
964 if ( !get_page_and_type(page, p, PGT_l2_page_table) )
965 BUG();
966 set_bit(_PGC_guest_pinned, &page->count_and_flags);
967 }
968 else
969 {
970 page->type_and_flags &= ~PGT_type_mask;
971 page->type_and_flags |= PGT_l1_page_table;
972 get_page(page, p); /* an extra ref because of readable mapping */
973 }
974 l1tab++;
975 if ( !((unsigned long)l1tab & (PAGE_SIZE - 1)) )
976 l1start = l1tab = (l1_pgentry_t *)l2_pgentry_to_phys(*l2tab);
977 }
979 /* Set up shared-info area. */
980 update_dom_time(p->shared_info);
981 p->shared_info->domain_time = 0;
982 /* Mask all upcalls... */
983 for ( i = 0; i < MAX_VIRT_CPUS; i++ )
984 p->shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
986 /* Install the new page tables. */
987 __cli();
988 write_ptbase(&p->mm);
990 /* Copy the OS image. */
991 (void)loadelfimage(image_start);
993 /* Copy the initial ramdisk. */
994 if ( initrd_len != 0 )
995 memcpy((void *)vinitrd_start, initrd_start, initrd_len);
997 /* Set up start info area. */
998 si = (start_info_t *)vstartinfo_start;
999 memset(si, 0, PAGE_SIZE);
1000 si->nr_pages = p->tot_pages;
1001 si->shared_info = virt_to_phys(p->shared_info);
1002 si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN;
1003 si->pt_base = vpt_start;
1004 si->nr_pt_frames = nr_pt_pages;
1005 si->mfn_list = vphysmap_start;
1007 /* Write the phys->machine and machine->phys table entries. */
1008 for ( pfn = 0; pfn < p->tot_pages; pfn++ )
1009 {
1010 mfn = (alloc_start >> PAGE_SHIFT) + pfn;
1011 ((unsigned long *)vphysmap_start)[pfn] = mfn;
1012 machine_to_phys_mapping[mfn] = pfn;
1013 }
1015 if ( initrd_len != 0 )
1016 {
1017 si->mod_start = vinitrd_start;
1018 si->mod_len = initrd_len;
1019 printk("Initrd len 0x%lx, start at 0x%08lx\n",
1020 si->mod_len, si->mod_start);
1021 }
1023 dst = si->cmd_line;
1024 if ( cmdline != NULL )
1025 {
1026 for ( i = 0; i < 255; i++ )
1027 {
1028 if ( cmdline[i] == '\0' )
1029 break;
1030 *dst++ = cmdline[i];
1031 }
1032 }
1033 *dst = '\0';
1035 /* Reinstate the caller's page tables. */
1036 write_ptbase(&current->mm);
1037 __sti();
1039 /* Destroy low mappings - they were only for our convenience. */
1040 for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
1041 if ( l2_pgentry_val(l2start[i]) & _PAGE_PSE )
1042 l2start[i] = mk_l2_pgentry(0);
1043 zap_low_mappings(); /* Do the same for the idle page tables. */
1045 /* Give up the VGA console if DOM0 is configured to grab it. */
1046 console_endboot(strstr(cmdline, "tty0") != NULL);
1048 /* Add virtual network interfaces. */
1049 while ( num_vifs-- > 0 )
1050 (void)create_net_vif(0);
1052 #ifndef NO_DEVICES_IN_XEN
1053 /* DOM0 gets access to all real block devices. */
1054 #define MAX_REAL_DISKS 256
1055 xd = kmalloc(MAX_REAL_DISKS * sizeof(xen_disk_t), GFP_KERNEL);
1056 xdi.max = MAX_REAL_DISKS;
1057 xdi.count = 0;
1058 xdi.disks = xd;
1059 ide_probe_devices(&xdi);
1060 scsi_probe_devices(&xdi);
1061 cciss_probe_devices(&xdi);
1062 for ( i = 0; i < xdi.count; i++ )
1063 {
1064 xen_extent_t e;
1065 e.device = xd[i].device;
1066 e.start_sector = 0;
1067 e.nr_sectors = xd[i].capacity;
1068 if ( (__vbd_create(p, xd[i].device, VBD_MODE_R|VBD_MODE_W,
1069 xd[i].info) != 0) ||
1070 (__vbd_grow(p, xd[i].device, &e) != 0) )
1071 BUG();
1072 }
1073 kfree(xd);
1074 #endif
1076 /* DOM0 gets access to everything. */
1077 physdev_init_dom0(p);
1079 set_bit(PF_CONSTRUCTED, &p->flags);
1081 #if 0 // XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave)
1082 shadow_mode_enable(&p->mm, SHM_test);
1083 #endif
1085 new_thread(p, vkern_entry, vstack_end, vstartinfo_start);
1087 return 0;
1091 void __init domain_init(void)
1092 {
1093 printk("Initialising domains\n");