ia64/xen-unstable

changeset 16961:17cce0554151

Direct Linux boot: Support booting relocatable Linux kernels.

This patch introduces the basic infrastructure for direct kernel
boot in the ioemu copy of QEMU. The current #ifdef-disabled
code is actually obsolete with respect to upstream QEMU code, so it
is removed entirely. In its place I have imported the latest
upstream QEMU code. The QEMU code assumes that the guest RAM
is directly mapped into the QEMU process, so there were some
changes necessary. Instead of strcpy/memcpy'ing the args
and kernel header into guest RAM, cpu_physical_memory_rw is
used. Instead of fread()ing the initrd and kernel into guest RAM,
a helper function, fread2guest, is used, which reads into a small
buffer and then uses cpu_physical_memory_rw.

NB in reading the following, Documentation/i386/boot.txt is
a useful reference for what's going on.

Next, instead of loading the kernel at 0x100000, this code
loads it at 0x200000. This is far enough away that there's
no risk of it overlapping with the HVM firmware image. If the
Linux kernel boot protocol is 0x205 or later, and the flag
at offset 0x234 in the kernel header is 1, then the guest
kernel was built with CONFIG_RELOCATABLE=y.

In this scenario we merely need to tell the kernel what address
it has been relocated to by writing 0x200000 into the kernel
header at offset 0x214. When switching from real mode into
protected mode the kernel will immediately start executing at
0x200000 and be happy with life. This should work for 2.6.20 or
later on i386, and 2.6.22 or later on x86_64.

This has been verified with Fedora 7 and Fedora 8 bare metal kernels
on i386 and x86_64 from the $TREE/images/pxeboot of the install trees.

Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
author Keir Fraser <keir.fraser@citrix.com>
date Fri Feb 01 11:14:53 2008 +0000 (2008-02-01)
parents 42369d21641d
children 67ca9c37ef02
files tools/ioemu/hw/pc.c
line diff
     1.1 --- a/tools/ioemu/hw/pc.c	Thu Jan 31 16:23:35 2008 +0000
     1.2 +++ b/tools/ioemu/hw/pc.c	Fri Feb 01 11:14:53 2008 +0000
     1.3 @@ -31,10 +31,8 @@
     1.4  #define VGABIOS_CIRRUS_FILENAME "vgabios-cirrus.bin"
     1.5  #define LINUX_BOOT_FILENAME "linux_boot.bin"
     1.6  
     1.7 -#define KERNEL_LOAD_ADDR     0x00100000
     1.8 -#define INITRD_LOAD_ADDR     0x00600000
     1.9 -#define KERNEL_PARAMS_ADDR   0x00090000
    1.10 -#define KERNEL_CMDLINE_ADDR  0x00099000
    1.11 +/* Leave a chunk of memory at the top of RAM for the BIOS ACPI tables.  */
    1.12 +#define ACPI_DATA_SIZE        0x10000
    1.13  
    1.14  static fdctrl_t *floppy_controller;
    1.15  static RTCState *rtc_state;
    1.16 @@ -363,36 +361,271 @@ void bochs_bios_init(void)
    1.17      register_ioport_write(0x503, 1, 1, bochs_bios_write, NULL);
    1.18  }
    1.19  
    1.20 -
    1.21 -int load_kernel(const char *filename, uint8_t *addr, 
    1.22 -                uint8_t *real_addr)
    1.23 +/* Generate an initial boot sector which sets state and jump to
    1.24 +   a specified vector */
    1.25 +static void generate_bootsect(uint32_t gpr[8], uint16_t segs[6], uint16_t ip)
    1.26  {
    1.27 -    int fd, size;
    1.28 -    int setup_sects;
    1.29 +    uint8_t bootsect[512], *p;
    1.30 +    int i;
    1.31 +
    1.32 +    if (bs_table[0] == NULL) {
    1.33 +        fprintf(stderr, "A disk image must be given for 'hda' when booting "
    1.34 +                "a Linux kernel\n");
    1.35 +        exit(1);
    1.36 +    }
    1.37 +
    1.38 +    memset(bootsect, 0, sizeof(bootsect));
    1.39 +
    1.40 +    /* Copy the MSDOS partition table if possible */
    1.41 +    bdrv_read(bs_table[0], 0, bootsect, 1);
    1.42 +
    1.43 +    /* Make sure we have a partition signature */
    1.44 +    bootsect[510] = 0x55;
    1.45 +    bootsect[511] = 0xaa;
    1.46 +
    1.47 +    /* Actual code */
    1.48 +    p = bootsect;
    1.49 +    *p++ = 0xfa;                /* CLI */
    1.50 +    *p++ = 0xfc;                /* CLD */
    1.51 +
    1.52 +    for (i = 0; i < 6; i++) {
    1.53 +        if (i == 1)             /* Skip CS */
    1.54 +            continue;
    1.55 +
    1.56 +        *p++ = 0xb8;            /* MOV AX,imm16 */
    1.57 +        *p++ = segs[i];
    1.58 +        *p++ = segs[i] >> 8;
    1.59 +        *p++ = 0x8e;            /* MOV <seg>,AX */
    1.60 +        *p++ = 0xc0 + (i << 3);
    1.61 +    }
    1.62 +
    1.63 +    for (i = 0; i < 8; i++) {
    1.64 +        *p++ = 0x66;            /* 32-bit operand size */
    1.65 +        *p++ = 0xb8 + i;        /* MOV <reg>,imm32 */
    1.66 +        *p++ = gpr[i];
    1.67 +        *p++ = gpr[i] >> 8;
    1.68 +        *p++ = gpr[i] >> 16;
    1.69 +        *p++ = gpr[i] >> 24;
    1.70 +    }
    1.71 +
    1.72 +    *p++ = 0xea;                /* JMP FAR */
    1.73 +    *p++ = ip;                  /* IP */
    1.74 +    *p++ = ip >> 8;
    1.75 +    *p++ = segs[1];             /* CS */
    1.76 +    *p++ = segs[1] >> 8;
    1.77 +
    1.78 +    bdrv_set_boot_sector(bs_table[0], bootsect, sizeof(bootsect));
    1.79 +}
    1.80 +
    1.81  
    1.82 -    fd = open(filename, O_RDONLY | O_BINARY);
    1.83 -    if (fd < 0)
    1.84 -        return -1;
    1.85 +static long get_file_size(FILE *f)
    1.86 +{
    1.87 +    long where, size;
    1.88 +
    1.89 +    /* XXX: on Unix systems, using fstat() probably makes more sense */
    1.90 +
    1.91 +    where = ftell(f);
    1.92 +    fseek(f, 0, SEEK_END);
    1.93 +    size = ftell(f);
    1.94 +    fseek(f, where, SEEK_SET);
    1.95 +
    1.96 +    return size;
    1.97 +}
    1.98 +
    1.99 +static int fread2guest(target_phys_addr_t dst_addr, size_t nbytes, FILE *f)
   1.100 +{
   1.101 +    size_t offset = 0;
   1.102 +    while (nbytes) {
   1.103 +        uint8_t buf[4096];
   1.104 +	size_t count = nbytes > sizeof(buf) ? sizeof(buf) : nbytes;
   1.105 +	if (fread(buf, 1, count, f) != count)
   1.106 +	    return -1;
   1.107 +
   1.108 +	cpu_physical_memory_rw(dst_addr+offset, buf, count, 1);
   1.109 +	offset += count;
   1.110 +	nbytes -= count;
   1.111 +    }
   1.112 +    return 0;
   1.113 +}
   1.114 +
   1.115 +static void load_linux(const char *kernel_filename,
   1.116 +                       const char *initrd_filename,
   1.117 +                       const char *kernel_cmdline)
   1.118 +{
   1.119 +    uint16_t protocol;
   1.120 +    uint32_t gpr[8];
   1.121 +    uint16_t seg[6];
   1.122 +    uint16_t real_seg;
   1.123 +    int setup_size, kernel_size, initrd_size, cmdline_size;
   1.124 +    uint32_t initrd_max;
   1.125 +    uint8_t header[1024];
   1.126 +    target_phys_addr_t real_addr, reloc_prot_addr, prot_addr, cmdline_addr, initrd_addr;
   1.127 +    size_t ncmdline;
   1.128 +    FILE *f, *fi;
   1.129 +
   1.130 +    /* Align to 16 bytes as a paranoia measure */
   1.131 +    cmdline_size = (strlen(kernel_cmdline)+16) & ~15;
   1.132 +
   1.133 +    /* load the kernel header */
   1.134 +    f = fopen(kernel_filename, "rb");
   1.135 +    if (!f || !(kernel_size = get_file_size(f)) ||
   1.136 +        fread(header, 1, 1024, f) != 1024) {
   1.137 +        fprintf(stderr, "qemu: could not load kernel '%s'\n",
   1.138 +                kernel_filename);
   1.139 +        exit(1);
   1.140 +    }
   1.141  
   1.142 -    /* load 16 bit code */
   1.143 -    if (read(fd, real_addr, 512) != 512)
   1.144 -        goto fail;
   1.145 -    setup_sects = real_addr[0x1F1];
   1.146 -    if (!setup_sects)
   1.147 -        setup_sects = 4;
   1.148 -    if (read(fd, real_addr + 512, setup_sects * 512) != 
   1.149 -        setup_sects * 512)
   1.150 -        goto fail;
   1.151 -    
   1.152 -    /* load 32 bit code */
   1.153 -    size = read(fd, addr, 16 * 1024 * 1024);
   1.154 -    if (size < 0)
   1.155 -        goto fail;
   1.156 -    close(fd);
   1.157 -    return size;
   1.158 - fail:
   1.159 -    close(fd);
   1.160 -    return -1;
   1.161 +    /* kernel protocol version */
   1.162 +    fprintf(stderr, "header magic: %#x\n", ldl_p(header+0x202));
   1.163 +    if (ldl_p(header+0x202) == 0x53726448)
   1.164 +        protocol = lduw_p(header+0x206);
   1.165 +    else
   1.166 +        protocol = 0;
   1.167 +    fprintf(stderr, "header protocol: %x\n", protocol);
   1.168 +    if (protocol < 0x200 || !(header[0x211] & 0x01)) {
   1.169 +        /* Low kernel */
   1.170 +        real_addr    = 0x90000;
   1.171 +        cmdline_addr = 0x9a000 - cmdline_size;
   1.172 +        prot_addr    = 0x10000;
   1.173 +	reloc_prot_addr = prot_addr;
   1.174 +    } else if (protocol < 0x202) {
   1.175 +        /* High but ancient kernel */
   1.176 +        real_addr    = 0x90000;
   1.177 +        cmdline_addr = 0x9a000 - cmdline_size;
   1.178 +        prot_addr    = 0x100000;
   1.179 +	reloc_prot_addr = 0x200000;
   1.180 +    } else {
   1.181 +        /* High and recent kernel */
   1.182 +        real_addr    = 0x10000;
   1.183 +        cmdline_addr = 0x20000;
   1.184 +        prot_addr    = 0x100000;
   1.185 +	reloc_prot_addr = 0x200000;
   1.186 +    }
   1.187 +
   1.188 +    fprintf(stderr,
   1.189 +            "qemu: real_addr     = %#zx\n"
   1.190 +            "qemu: cmdline_addr  = %#zx\n"
   1.191 +            "qemu: prot_addr     = %#zx\n",
   1.192 +            real_addr,
   1.193 +            cmdline_addr,
   1.194 +            prot_addr);
   1.195 +
   1.196 +    /* highest address for loading the initrd */
   1.197 +    if (protocol >= 0x203)
   1.198 +        initrd_max = ldl_p(header+0x22c);
   1.199 +    else
   1.200 +        initrd_max = 0x37ffffff;
   1.201 +
   1.202 +    if (initrd_max >= ram_size-ACPI_DATA_SIZE)
   1.203 +        initrd_max = ram_size-ACPI_DATA_SIZE-1;
   1.204 +
   1.205 +
   1.206 +    /* kernel command line */
   1.207 +    ncmdline = strlen(kernel_cmdline);
   1.208 +    if (ncmdline > 4095) {
   1.209 +        ncmdline = 4095;
   1.210 +	((uint8_t*)kernel_cmdline)[4095] = '\0';
   1.211 +    }
   1.212 +    fprintf(stderr, "qemu: kernel_cmdline: %#zx ('%s')\n", ncmdline, kernel_cmdline);
   1.213 +    cpu_physical_memory_rw(cmdline_addr, (uint8_t*)kernel_cmdline, ncmdline+1, 1);
   1.214 +
   1.215 +    if (protocol >= 0x202) {
   1.216 +        stl_p(header+0x228, cmdline_addr);
   1.217 +    } else {
   1.218 +        stw_p(header+0x20, 0xA33F);
   1.219 +        stw_p(header+0x22, cmdline_addr-real_addr);
   1.220 +    }
   1.221 +
   1.222 +    /* loader type */
   1.223 +    /* High nybble = B reserved for Qemu; low nybble is revision number.
   1.224 +       If this code is substantially changed, you may want to consider
   1.225 +       incrementing the revision. */
   1.226 +    if (protocol >= 0x200)
   1.227 +        header[0x210] = 0xB0;
   1.228 +
   1.229 +    /* heap */
   1.230 +    if (protocol >= 0x201) {
   1.231 +        header[0x211] |= 0x80;  /* CAN_USE_HEAP */
   1.232 +        stw_p(header+0x224, cmdline_addr-real_addr-0x200);
   1.233 +    }
   1.234 +
   1.235 +    /* load initrd */
   1.236 +    if (initrd_filename) {
   1.237 +        if (protocol < 0x200) {
   1.238 +            fprintf(stderr, "qemu: linux kernel too old to load a ram disk\n");
   1.239 +            exit(1);
   1.240 +        }
   1.241 +
   1.242 +        fi = fopen(initrd_filename, "rb");
   1.243 +        if (!fi) {
   1.244 +            fprintf(stderr, "qemu: could not load initial ram disk '%s'\n",
   1.245 +                    initrd_filename);
   1.246 +            exit(1);
   1.247 +        }
   1.248 +
   1.249 +        initrd_size = get_file_size(fi);
   1.250 +        initrd_addr = ((initrd_max-initrd_size) & ~4095);
   1.251 +
   1.252 +        fprintf(stderr, "qemu: loading initrd (%#x bytes) at %#zx\n",
   1.253 +                initrd_size, initrd_addr);
   1.254 +
   1.255 +	if (fread2guest(initrd_addr, initrd_size, fi) < 0) {
   1.256 +	    fprintf(stderr, "qemu: read error on initial ram disk '%s'\n",
   1.257 +		    initrd_filename);
   1.258 +	    exit(1);
   1.259 +	}
   1.260 +        fclose(fi);
   1.261 +
   1.262 +        stl_p(header+0x218, initrd_addr);
   1.263 +        stl_p(header+0x21c, initrd_size);
   1.264 +    }
   1.265 +
   1.266 +
   1.267 +    setup_size = header[0x1f1];
   1.268 +    if (setup_size == 0)
   1.269 +        setup_size = 4;
   1.270 +
   1.271 +    setup_size = (setup_size+1)*512;
   1.272 +    kernel_size -= setup_size;  /* Size of protected-mode code */
   1.273 +
   1.274 +    /* Urgh, Xen's HVM firmware lives at 0x100000, but that's also the
   1.275 +     * address Linux wants to start life at prior to relocatable support
   1.276 +     */
   1.277 +    if (prot_addr != reloc_prot_addr) {
   1.278 +        if (protocol >= 0x205 && (header[0x234] & 1)) {
   1.279 +	    /* Relocatable automatically */
   1.280 +	    stl_p(header+0x214, reloc_prot_addr);
   1.281 +	    fprintf(stderr, "qemu: kernel is relocatable\n");
   1.282 +	} else {
   1.283 +	    fprintf(stderr, "qemu: unable to load non-relocatable kernel\n");
   1.284 +	    exit(1);
   1.285 +	}
   1.286 +    }
   1.287 +
   1.288 +
   1.289 +    fprintf(stderr, "qemu: loading kernel real mode (%#x bytes) at %#zx\n",
   1.290 +	    setup_size-1024, real_addr);
   1.291 +    fprintf(stderr, "qemu: loading kernel protected mode (%#x bytes) at %#zx\n",
   1.292 +	    kernel_size, reloc_prot_addr);
   1.293 +
   1.294 +    /* store the finalized header and load the rest of the kernel */
   1.295 +    cpu_physical_memory_rw(real_addr, header, 1024, 1);
   1.296 +    if (fread2guest(real_addr+1024, setup_size-1024, f) < 0 ||
   1.297 +        fread2guest(reloc_prot_addr, kernel_size, f) < 0) {
   1.298 +	fprintf(stderr, "qemu: loading kernel protected mode (%#x bytes) at %#zx\n",
   1.299 +		kernel_size, reloc_prot_addr);
   1.300 +	exit(1);
   1.301 +    }
   1.302 +    fclose(f);
   1.303 +
   1.304 +    /* generate bootsector to set up the initial register state */
   1.305 +    real_seg = (real_addr) >> 4;
   1.306 +    seg[0] = seg[2] = seg[3] = seg[4] = seg[4] = real_seg;
   1.307 +    seg[1] = real_seg+0x20;     /* CS */
   1.308 +    memset(gpr, 0, sizeof gpr);
   1.309 +    gpr[4] = cmdline_addr-real_addr-16; /* SP (-16 is paranoia) */
   1.310 +
   1.311 +    generate_bootsect(gpr, seg, 0);
   1.312  }
   1.313  
   1.314  static void main_cpu_reset(void *opaque)
   1.315 @@ -577,63 +810,8 @@ static void pc_init1(uint64_t ram_size, 
   1.316      
   1.317      bochs_bios_init();
   1.318  
   1.319 -#ifndef CONFIG_DM
   1.320 -    if (linux_boot) {
   1.321 -        uint8_t bootsect[512];
   1.322 -        uint8_t old_bootsect[512];
   1.323 -
   1.324 -        if (bs_table[0] == NULL) {
   1.325 -            fprintf(stderr, "A disk image must be given for 'hda' when booting a Linux kernel\n");
   1.326 -            exit(1);
   1.327 -        }
   1.328 -        snprintf(buf, sizeof(buf), "%s/%s", bios_dir, LINUX_BOOT_FILENAME);
   1.329 -        ret = load_image(buf, bootsect);
   1.330 -        if (ret != sizeof(bootsect)) {
   1.331 -            fprintf(stderr, "qemu: could not load linux boot sector '%s'\n",
   1.332 -                    buf);
   1.333 -            exit(1);
   1.334 -        }
   1.335 -
   1.336 -        if (bdrv_read(bs_table[0], 0, old_bootsect, 1) >= 0) {
   1.337 -            /* copy the MSDOS partition table */
   1.338 -            memcpy(bootsect + 0x1be, old_bootsect + 0x1be, 0x40);
   1.339 -        }
   1.340 -
   1.341 -        bdrv_set_boot_sector(bs_table[0], bootsect, sizeof(bootsect));
   1.342 -
   1.343 -        /* now we can load the kernel */
   1.344 -        ret = load_kernel(kernel_filename, 
   1.345 -                          phys_ram_base + KERNEL_LOAD_ADDR,
   1.346 -                          phys_ram_base + KERNEL_PARAMS_ADDR);
   1.347 -        if (ret < 0) {
   1.348 -            fprintf(stderr, "qemu: could not load kernel '%s'\n", 
   1.349 -                    kernel_filename);
   1.350 -            exit(1);
   1.351 -        }
   1.352 -        
   1.353 -        /* load initrd */
   1.354 -        initrd_size = 0;
   1.355 -        if (initrd_filename) {
   1.356 -            initrd_size = load_image(initrd_filename, phys_ram_base + INITRD_LOAD_ADDR);
   1.357 -            if (initrd_size < 0) {
   1.358 -                fprintf(stderr, "qemu: could not load initial ram disk '%s'\n", 
   1.359 -                        initrd_filename);
   1.360 -                exit(1);
   1.361 -            }
   1.362 -        }
   1.363 -        if (initrd_size > 0) {
   1.364 -            stl_raw(phys_ram_base + KERNEL_PARAMS_ADDR + 0x218, INITRD_LOAD_ADDR);
   1.365 -            stl_raw(phys_ram_base + KERNEL_PARAMS_ADDR + 0x21c, initrd_size);
   1.366 -        }
   1.367 -        pstrcpy(phys_ram_base + KERNEL_CMDLINE_ADDR, 4096,
   1.368 -                kernel_cmdline);
   1.369 -        stw_raw(phys_ram_base + KERNEL_PARAMS_ADDR + 0x20, 0xA33F);
   1.370 -        stw_raw(phys_ram_base + KERNEL_PARAMS_ADDR + 0x22,
   1.371 -                KERNEL_CMDLINE_ADDR - KERNEL_PARAMS_ADDR);
   1.372 -        /* loader type */
   1.373 -        stw_raw(phys_ram_base + KERNEL_PARAMS_ADDR + 0x210, 0x01);
   1.374 -    }
   1.375 -#endif /* !CONFIG_DM */
   1.376 +    if (linux_boot)
   1.377 +        load_linux(kernel_filename, initrd_filename, kernel_cmdline);
   1.378  
   1.379      if (pci_enabled) {
   1.380          pci_bus = i440fx_init(&i440fx_state);