ia64/xen-unstable
changeset 1312:de457488a115
bitkeeper revision 1.872 (4087cf0eay7XY7T1xObNygn1qSwJ0g)
Partial checkin of new blkdev backend in Xenolinux. Also updates
to the mmu_update interface.
1.1 --- a/.rootkeys Wed Apr 21 10:43:06 2004 +0000 1.2 +++ b/.rootkeys Thu Apr 22 13:56:30 2004 +0000 1.3 @@ -671,12 +671,18 @@ 3e5a4e65gZBRBB6RsSVg1c9iahigAw xenolinux 1.4 3e5a4e65ZxKrbFetVB84JhrTyZ1YuQ xenolinux-2.4.26-sparse/arch/xen/drivers/network/network.c 1.5 4083dc16z0jvZEH4PiVDbDRreaNp6w xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/Makefile 1.6 4083dc16KQus88a4U3uCV6qVCA6_8Q xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/Makefile 1.7 +4087cf0dPeHOvzmZAazvwLslKEF93A xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/common.h 1.8 +4087cf0da2cROOiybf9A-j4R_yHnjg xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/control.c 1.9 +4087cf0dvXL1PKX23t_LvO1wVPb7OA xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/interface.c 1.10 +4087cf0dkVF3I19gpT1cNubeJgQr7g xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/main.c 1.11 +4087cf0dlv1Dw4MAbeRStPPG8IvPPg xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/vbd.c 1.12 4075806dI5kfeMD5RV-DA0PYoThx_w xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/frontend/Makefile 1.13 4075806d3fJqqDC1pYYPTZPc575iKg xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/frontend/block.c 1.14 4075806d4-j7vN0Mn0bklI1cRUX1vQ xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/frontend/block.h 1.15 4075806dibjCcfuXv6CINMhxWTw3jQ xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/frontend/vbd.c 1.16 4083dc16-Kd5y9psK_yk161sme5j5Q xenolinux-2.4.26-sparse/arch/xen/drivers/vnetif/Makefile 1.17 4083dc16UmHXxS9g_UFVnkUpN-oP2Q xenolinux-2.4.26-sparse/arch/xen/drivers/vnetif/backend/Makefile 1.18 +4087cf0d5dudKw_DecIJgOhLlBF_0Q xenolinux-2.4.26-sparse/arch/xen/drivers/vnetif/backend/main.c 1.19 405853f2wg7JXZJNltspMwOZJklxgw xenolinux-2.4.26-sparse/arch/xen/drivers/vnetif/frontend/Makefile 1.20 405853f6nbeazrNyEWNHBuoSg2PiPA xenolinux-2.4.26-sparse/arch/xen/drivers/vnetif/frontend/vnetif.c 1.21 3e5a4e65lWzkiPXsZdzPt2RNnJGG1g xenolinux-2.4.26-sparse/arch/xen/kernel/Makefile
2.1 --- a/tools/xc/lib/xc_linux_build.c Wed Apr 21 10:43:06 2004 +0000 2.2 +++ b/tools/xc/lib/xc_linux_build.c Thu Apr 22 13:56:30 2004 +0000 2.3 @@ -165,7 +165,7 @@ static int setup_guestos(int xc_handle, 2.4 2.5 memset(builddomain, 0, sizeof(*builddomain)); 2.6 2.7 - if ( (pm_handle = init_pfn_mapper()) < 0 ) 2.8 + if ( (pm_handle = init_pfn_mapper((domid_t)dom)) < 0 ) 2.9 goto error_out; 2.10 2.11 if ( (page_array = malloc(nr_pages * sizeof(unsigned long))) == NULL )
3.1 --- a/tools/xc/lib/xc_linux_restore.c Wed Apr 21 10:43:06 2004 +0000 3.2 +++ b/tools/xc/lib/xc_linux_restore.c Thu Apr 22 13:56:30 2004 +0000 3.3 @@ -186,7 +186,7 @@ int xc_linux_restore(int xc_handle, 3.4 } 3.5 shared_info_frame = op.u.getdomaininfo.shared_info_frame; 3.6 3.7 - if ( (pm_handle = init_pfn_mapper()) < 0 ) 3.8 + if ( (pm_handle = init_pfn_mapper((domid_t)dom)) < 0 ) 3.9 goto out; 3.10 3.11 /* Copy saved contents of shared-info page. No checking needed. */
4.1 --- a/tools/xc/lib/xc_linux_save.c Wed Apr 21 10:43:06 2004 +0000 4.2 +++ b/tools/xc/lib/xc_linux_save.c Thu Apr 22 13:56:30 2004 +0000 4.3 @@ -178,7 +178,7 @@ int xc_linux_save(int xc_handle, 4.4 goto out; 4.5 } 4.6 4.7 - if ( (pm_handle = init_pfn_mapper()) < 0 ) 4.8 + if ( (pm_handle = init_pfn_mapper((domid_t)domid)) < 0 ) 4.9 goto out; 4.10 4.11 /* Is the suspend-record MFN actually valid for this domain? */
5.1 --- a/tools/xc/lib/xc_netbsd_build.c Wed Apr 21 10:43:06 2004 +0000 5.2 +++ b/tools/xc/lib/xc_netbsd_build.c Thu Apr 22 13:56:30 2004 +0000 5.3 @@ -80,7 +80,7 @@ static int setup_guestos(int xc_handle, 5.4 5.5 memset(builddomain, 0, sizeof(*builddomain)); 5.6 5.7 - if ( (pm_handle = init_pfn_mapper()) < 0 ) 5.8 + if ( (pm_handle = init_pfn_mapper((domid_t)dom)) < 0 ) 5.9 goto error_out; 5.10 5.11 if ( (page_array = malloc(tot_pages * sizeof(unsigned long))) == NULL )
6.1 --- a/tools/xc/lib/xc_private.c Wed Apr 21 10:43:06 2004 +0000 6.2 +++ b/tools/xc/lib/xc_private.c Thu Apr 22 13:56:30 2004 +0000 6.3 @@ -6,9 +6,15 @@ 6.4 6.5 #include "xc_private.h" 6.6 6.7 -int init_pfn_mapper(void) 6.8 +int init_pfn_mapper(domid_t domid) 6.9 { 6.10 - return open("/dev/mem", O_RDWR); 6.11 + int fd = open("/dev/mem", O_RDWR); 6.12 + if ( fd >= 0 ) 6.13 + { 6.14 + (void)ioctl(fd, _IO('M', 1), (unsigned long)(domid>> 0)); /* low */ 6.15 + (void)ioctl(fd, _IO('M', 2), (unsigned long)(domid>>32)); /* high */ 6.16 + } 6.17 + return fd; 6.18 } 6.19 6.20 int close_pfn_mapper(int pm_handle) 6.21 @@ -49,7 +55,7 @@ static int flush_mmu_updates(int xc_hand 6.22 if ( mmu->idx == FIRST_MMU_UPDATE ) 6.23 return 0; 6.24 6.25 - /* The first two requests set the correct subject domain. */ 6.26 + /* The first two requests set the correct subject domain (PTS and GPS). */ 6.27 mmu->updates[0].val = (unsigned long)(mmu->subject<<16) & ~0xFFFFUL; 6.28 mmu->updates[0].ptr = (unsigned long)(mmu->subject<< 0) & ~0xFFFFUL; 6.29 mmu->updates[1].val = (unsigned long)(mmu->subject>>16) & ~0xFFFFUL; 6.30 @@ -57,7 +63,7 @@ static int flush_mmu_updates(int xc_hand 6.31 mmu->updates[0].ptr |= MMU_EXTENDED_COMMAND; 6.32 mmu->updates[0].val |= MMUEXT_SET_SUBJECTDOM_L; 6.33 mmu->updates[1].ptr |= MMU_EXTENDED_COMMAND; 6.34 - mmu->updates[1].val |= MMUEXT_SET_SUBJECTDOM_H; 6.35 + mmu->updates[1].val |= MMUEXT_SET_SUBJECTDOM_H | SET_PAGETABLE_SUBJECTDOM; 6.36 6.37 hypercall.op = __HYPERVISOR_mmu_update; 6.38 hypercall.arg[0] = (unsigned long)mmu->updates;
7.1 --- a/tools/xc/lib/xc_private.h Wed Apr 21 10:43:06 2004 +0000 7.2 +++ b/tools/xc/lib/xc_private.h Thu Apr 22 13:56:30 2004 +0000 7.3 @@ -154,7 +154,7 @@ static inline int do_block_io_op(int xc_ 7.4 /* 7.5 * PFN mapping. 7.6 */ 7.7 -int init_pfn_mapper(void); 7.8 +int init_pfn_mapper(domid_t domid); 7.9 int close_pfn_mapper(int pm_handle); 7.10 void *map_pfn_writeable(int pm_handle, unsigned long pfn); 7.11 void *map_pfn_readonly(int pm_handle, unsigned long pfn);
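The tool-library changes above (xc_linux_build/restore/save, xc_netbsd_build, xc_private.[ch]) all follow the same pattern: the caller must now name the foreign domain when it opens the PFN mapper, and init_pfn_mapper() forwards the 64-bit domid to the privileged /dev/mem driver as two 32-bit ioctl arguments. A minimal usage sketch follows (not part of the changeset; the unmap helper is assumed to exist alongside the map_pfn_* declarations above):

/* Sketch: map one frame of domain 'dom' read-only via the per-domain
 * PFN mapper.  init_pfn_mapper() issues the _IO('M',1)/_IO('M',2)
 * ioctls internally to select the subject domain (low/high words). */
static int dump_frame(domid_t dom, unsigned long pfn)
{
    int   pm_handle;
    void *vaddr;

    if ( (pm_handle = init_pfn_mapper(dom)) < 0 )
        return -1;                        /* could not open /dev/mem  */

    vaddr = map_pfn_readonly(pm_handle, pfn);
    /* ... inspect the mapped page here ... */
    unmap_pfn(pm_handle, vaddr);          /* assumed unmap helper     */

    return close_pfn_mapper(pm_handle);
}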
8.1 --- a/tools/xend/lib/domain_controller.h Wed Apr 21 10:43:06 2004 +0000 8.2 +++ b/tools/xend/lib/domain_controller.h Thu Apr 22 13:56:30 2004 +0000 8.3 @@ -49,8 +49,116 @@ typedef struct { 8.4 CONTROL_RING_IDX rx_req_prod, rx_resp_prod; 8.5 } control_if_t; 8.6 8.7 -#define CMSG_CONSOLE 0 8.8 -#define CMSG_CONSOLE_DATA 0 8.9 +/* 8.10 + * Top-level command types. 8.11 + */ 8.12 +#define CMSG_CONSOLE 0 /* Console */ 8.13 +#define CMSG_BLKIF_BE 1 /* Block-device backend */ 8.14 +#define CMSG_BLKIF_FE 2 /* Block-device frontend */ 8.15 + 8.16 +/* 8.17 + * Subtypes for console messages. 8.18 + */ 8.19 +#define CMSG_CONSOLE_DATA 0 8.20 + 8.21 +/* 8.22 + * Subtypes for block-device messages. 8.23 + */ 8.24 +#define CMSG_BLKIF_BE_CREATE 0 /* Create a new block-device interface. */ 8.25 +#define CMSG_BLKIF_BE_DESTROY 1 /* Destroy a block-device interface. */ 8.26 +#define CMSG_BLKIF_BE_VBD_CREATE 2 /* Create a new VBD for an interface. */ 8.27 +#define CMSG_BLKIF_BE_VBD_DESTROY 3 /* Delete a VBD from an interface. */ 8.28 +#define CMSG_BLKIF_BE_VBD_GROW 4 /* Append an extent to a given VBD. */ 8.29 +#define CMSG_BLKIF_BE_VBD_SHRINK 5 /* Remove last extent from a given VBD. */ 8.30 + 8.31 +/* 8.32 + * Message request/response defintions for block-device messages. 8.33 + */ 8.34 + 8.35 +typedef u16 blkif_vdev_t; 8.36 +typedef u16 blkif_pdev_t; 8.37 +typedef u64 blkif_sector_t; 8.38 +typedef struct { 8.39 + blkif_pdev_t device; 8.40 + blkif_sector_t sector_start; 8.41 + blkif_sector_t sector_length; 8.42 +} blkif_extent_t; 8.43 + 8.44 +/* Non-specific 'okay' return. */ 8.45 +#define BLKIF_STATUS_OKAY 0 8.46 +/* Non-specific 'error' return. */ 8.47 +#define BLKIF_STATUS_ERROR 1 8.48 +/* The following are specific error returns. */ 8.49 +#define BLKIF_STATUS_INTERFACE_EXISTS 2 8.50 +#define BLKIF_STATUS_INTERFACE_NOT_FOUND 3 8.51 + 8.52 +/* This macro can be used to create an array of descriptive error strings. */ 8.53 +#define BLKIF_STATUS_ERRORS { \ 8.54 + "Okay", \ 8.55 + "Non-specific error", \ 8.56 + "Interface already exists", \ 8.57 + "Interface not found" } 8.58 8.59 +/* CMSG_BLKIF_CREATE */ 8.60 +typedef struct { 8.61 + /* IN */ 8.62 + domid_t domid; /* Domain attached to new interface. */ 8.63 + unsigned int blkif_handle; /* Domain-specific interface handle. */ 8.64 + unsigned int evtchn_port; /* Event channel for notifications. */ 8.65 + unsigned long shmem_frame; /* Page cont. shared comms window. */ 8.66 + /* OUT */ 8.67 + unsigned int status; 8.68 +} blkif_create_t; 8.69 + 8.70 +/* CMSG_BLKIF_DESTROY */ 8.71 +typedef struct { 8.72 + /* IN */ 8.73 + domid_t domid; /* Identify interface to be destroyed. */ 8.74 + unsigned int blkif_handle; /* ...ditto... */ 8.75 + /* OUT */ 8.76 + unsigned int status; 8.77 +} blkif_destroy_t; 8.78 + 8.79 +/* CMSG_BLKIF_VBD_CREATE */ 8.80 +typedef struct { 8.81 + /* IN */ 8.82 + domid_t domid; /* Identify blkdev interface. */ 8.83 + unsigned int blkif_handle; /* ...ditto... */ 8.84 + blkif_vdev_t vdevice; /* Interface-specific id for this VBD. */ 8.85 + int readonly; /* Non-zero -> VBD isn't writeable. */ 8.86 + /* OUT */ 8.87 + unsigned int status; 8.88 +} blkif_vbd_create_t; 8.89 + 8.90 +/* CMSG_BLKIF_VBD_DESTROY */ 8.91 +typedef struct { 8.92 + /* IN */ 8.93 + domid_t domid; /* Identify blkdev interface. */ 8.94 + unsigned int blkif_handle; /* ...ditto... */ 8.95 + blkif_vdev_t vdevice; /* Interface-specific id of the VBD. 
*/ 8.96 + /* OUT */ 8.97 + unsigned int status; 8.98 +} blkif_vbd_destroy_t; 8.99 + 8.100 +/* CMSG_BLKIF_VBD_GROW */ 8.101 +typedef struct { 8.102 + /* IN */ 8.103 + domid_t domid; /* Identify blkdev interface. */ 8.104 + unsigned int blkif_handle; /* ...ditto... */ 8.105 + blkif_vdev_t vdevice; /* Interface-specific id of the VBD. */ 8.106 + blkif_extent_t extent; /* Physical extent to append to VBD. */ 8.107 + /* OUT */ 8.108 + unsigned int status; 8.109 +} blkif_vbd_grow_t; 8.110 + 8.111 +/* CMSG_BLKIF_VBD_SHRINK */ 8.112 +typedef struct { 8.113 + /* IN */ 8.114 + domid_t domid; /* Identify blkdev interface. */ 8.115 + unsigned int blkif_handle; /* ...ditto... */ 8.116 + blkif_vdev_t vdevice; /* Interface-specific id of the VBD. */ 8.117 + /* OUT */ 8.118 + unsigned int status; 8.119 +} blkif_vbd_shrink_t; 8.120 8.121 #endif /* __DOMAIN_CONTROLLER_H__ */
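For orientation, a hedged sketch of how the tool stack might issue one of the new backend requests defined above. The ctrl_msg_t framing fields (type, subtype, length, msg[]) are inferred from the receive path in control.c later in this changeset, and send_ctrl_msg() is a placeholder for whatever transmit routine the control ring provides; the backend fills in 'status' and echoes the message back as its response.

/* Sketch only: ask the block backend to create an interface for
 * domain 'dom', notified via event channel 'evtchn', with the shared
 * ring in machine frame 'shmem_frame'. */
static void request_blkif_create(domid_t dom, unsigned int evtchn,
                                 unsigned long shmem_frame)
{
    ctrl_msg_t      msg;
    blkif_create_t *req = (blkif_create_t *)&msg.msg[0];

    msg.type    = CMSG_BLKIF_BE;            /* top-level command type  */
    msg.subtype = CMSG_BLKIF_BE_CREATE;     /* subtype defined above   */
    msg.length  = sizeof(blkif_create_t);

    req->domid        = dom;                /* domain owning interface */
    req->blkif_handle = 0;                  /* first interface handle  */
    req->evtchn_port  = evtchn;             /* notification channel    */
    req->shmem_frame  = shmem_frame;        /* page with shared ring   */

    send_ctrl_msg(&msg);                    /* placeholder transmit    */
}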
9.1 --- a/tools/xend/lib/utils.c Wed Apr 21 10:43:06 2004 +0000 9.2 +++ b/tools/xend/lib/utils.c Thu Apr 22 13:56:30 2004 +0000 9.3 @@ -674,6 +674,10 @@ static PyObject *xu_port_new(PyObject *s 9.4 goto fail1; 9.5 } 9.6 9.7 + /* Set the General-Purpose Subject whose page frame will be mapped. */ 9.8 + (void)ioctl(xup->mem_fd, _IO('M', 1), (unsigned long)(dom>> 0)); /* low */ 9.9 + (void)ioctl(xup->mem_fd, _IO('M', 2), (unsigned long)(dom>>32)); /* high */ 9.10 + 9.11 if ( (xup->xc_handle = xc_interface_open()) == -1 ) 9.12 { 9.13 PyErr_SetString(port_error, "Could not open Xen control interface");
10.1 --- a/xen/common/memory.c Wed Apr 21 10:43:06 2004 +0000 10.2 +++ b/xen/common/memory.c Thu Apr 22 13:56:30 2004 +0000 10.3 @@ -151,12 +151,10 @@ 10.4 10.5 static int alloc_l2_table(struct pfn_info *page); 10.6 static int alloc_l1_table(struct pfn_info *page); 10.7 -static int get_page_from_pagenr(unsigned long page_nr, int check_level); 10.8 +static int get_page_from_pagenr(unsigned long page_nr, struct task_struct *p); 10.9 static int get_page_and_type_from_pagenr(unsigned long page_nr, 10.10 u32 type, 10.11 - int check_level); 10.12 -#define CHECK_STRICT 0 /* Subject domain must own the page */ 10.13 -#define CHECK_ANYDOM 1 /* Any domain may own the page (if subject is priv.) */ 10.14 + struct task_struct *p); 10.15 10.16 static void free_l2_table(struct pfn_info *page); 10.17 static void free_l1_table(struct pfn_info *page); 10.18 @@ -180,9 +178,14 @@ static struct { 10.19 unsigned long deferred_ops; 10.20 unsigned long cr0; 10.21 domid_t subject_id; 10.22 - struct task_struct *subject_p; 10.23 + /* General-Purpose Subject, Page-Table Subject */ 10.24 + struct task_struct *gps, *pts; 10.25 } percpu_info[NR_CPUS] __cacheline_aligned; 10.26 10.27 +/* Determine the current General-Purpose Subject or Page-Table Subject. */ 10.28 +#define PTS (percpu_info[smp_processor_id()].pts ? : current) 10.29 +#define GPS (percpu_info[smp_processor_id()].gps ? : current) 10.30 + 10.31 10.32 /* 10.33 * init_frametable: 10.34 @@ -295,11 +298,9 @@ int map_ldt_shadow_page(unsigned int off 10.35 } 10.36 10.37 10.38 -static int get_page_from_pagenr(unsigned long page_nr, int check_level) 10.39 +static int get_page_from_pagenr(unsigned long page_nr, struct task_struct *p) 10.40 { 10.41 - struct task_struct *p = current; 10.42 struct pfn_info *page = &frame_table[page_nr]; 10.43 - u32 y, x, nx; 10.44 10.45 if ( unlikely(!pfn_is_ram(page_nr)) ) 10.46 { 10.47 @@ -307,37 +308,10 @@ static int get_page_from_pagenr(unsigned 10.48 return 0; 10.49 } 10.50 10.51 - /* Find the correct subject domain. */ 10.52 - if ( unlikely(percpu_info[p->processor].subject_p != NULL) ) 10.53 - p = percpu_info[p->processor].subject_p; 10.54 - 10.55 - /* Demote ANYDOM to STRICT if subject domain is not privileged. 
*/ 10.56 - if ( check_level == CHECK_ANYDOM && !IS_PRIV(p) ) 10.57 - check_level = CHECK_STRICT; 10.58 - 10.59 - switch ( check_level ) 10.60 + if ( unlikely(!get_page(page, p)) ) 10.61 { 10.62 - case CHECK_STRICT: 10.63 - if ( unlikely(!get_page(page, p)) ) 10.64 - { 10.65 - MEM_LOG("Could not get page ref for pfn %08lx\n", page_nr); 10.66 - return 0; 10.67 - } 10.68 - break; 10.69 - case CHECK_ANYDOM: 10.70 - y = page->count_and_flags; 10.71 - do { 10.72 - x = y; 10.73 - nx = x + 1; 10.74 - if ( unlikely((x & PGC_count_mask) == 0) || 10.75 - unlikely((nx & PGC_count_mask) == 0) ) 10.76 - { 10.77 - MEM_LOG("Could not get page ref for pfn %08lx\n", page_nr); 10.78 - return 0; 10.79 - } 10.80 - } 10.81 - while ( unlikely((y = cmpxchg(&page->count_and_flags, x, nx)) != x) ); 10.82 - break; 10.83 + MEM_LOG("Could not get page ref for pfn %08lx\n", page_nr); 10.84 + return 0; 10.85 } 10.86 10.87 return 1; 10.88 @@ -346,11 +320,11 @@ static int get_page_from_pagenr(unsigned 10.89 10.90 static int get_page_and_type_from_pagenr(unsigned long page_nr, 10.91 u32 type, 10.92 - int check_level) 10.93 + struct task_struct *p) 10.94 { 10.95 struct pfn_info *page = &frame_table[page_nr]; 10.96 10.97 - if ( unlikely(!get_page_from_pagenr(page_nr, check_level)) ) 10.98 + if ( unlikely(!get_page_from_pagenr(page_nr, p)) ) 10.99 return 0; 10.100 10.101 if ( unlikely(!get_page_type(page, type)) ) 10.102 @@ -391,8 +365,7 @@ static int get_linear_pagetable(l2_pgent 10.103 if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn ) 10.104 { 10.105 /* Make sure the mapped frame belongs to the correct domain. */ 10.106 - if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), 10.107 - CHECK_STRICT)) ) 10.108 + if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), PTS)) ) 10.109 return 0; 10.110 10.111 /* 10.112 @@ -443,14 +416,14 @@ static int get_page_from_l1e(l1_pgentry_ 10.113 if ( l1v & _PAGE_RW ) 10.114 { 10.115 if ( unlikely(!get_page_and_type_from_pagenr( 10.116 - pfn, PGT_writeable_page, CHECK_ANYDOM)) ) 10.117 + pfn, PGT_writeable_page, GPS)) ) 10.118 return 0; 10.119 set_bit(_PGC_tlb_flush_on_type_change, 10.120 &frame_table[pfn].count_and_flags); 10.121 return 1; 10.122 } 10.123 10.124 - return get_page_from_pagenr(pfn, CHECK_ANYDOM); 10.125 + return get_page_from_pagenr(pfn, GPS); 10.126 } 10.127 10.128 10.129 @@ -468,7 +441,7 @@ static int get_page_from_l2e(l2_pgentry_ 10.130 } 10.131 10.132 if ( unlikely(!get_page_and_type_from_pagenr( 10.133 - l2_pgentry_to_pagenr(l2e), PGT_l1_page_table, CHECK_STRICT)) ) 10.134 + l2_pgentry_to_pagenr(l2e), PGT_l1_page_table, PTS)) ) 10.135 return get_linear_pagetable(l2e, pfn); 10.136 10.137 return 1; 10.138 @@ -771,12 +744,12 @@ void free_page_type(struct pfn_info *pag 10.139 page-frame_table) & PSH_shadowed) ) 10.140 { 10.141 /* 10.142 - * Using 'current->mm' is safe and correct because page-table pages 10.143 - * are not shared across domains. Updates to such pages' types are 10.144 - * thus only done within the context of the owning domain. The one 10.145 - * exception is when destroying a domain; however, this is not a 10.146 - * problem as the currently-executing domain will not have this 10.147 - * MFN shadowed, and at domain end-of-day we explicitly unshadow 10.148 + * Using 'current->mm' is safe and correct because page-table pages 10.149 + * are not shared across domains. Updates to such pages' types are 10.150 + * thus only done within the context of the owning domain. 
The one 10.151 + * exception is when destroying a domain; however, this is not a 10.152 + * problem as the currently-executing domain will not have this MFN 10.153 + * shadowed, and at domain end-of-day we explicitly unshadow 10.154 * everything so that nothing will get left lying around. 10.155 */ 10.156 unshadow_table( page-frame_table, type ); 10.157 @@ -814,9 +787,9 @@ static int do_extended_command(unsigned 10.158 case MMUEXT_PIN_L1_TABLE: 10.159 case MMUEXT_PIN_L2_TABLE: 10.160 okay = get_page_and_type_from_pagenr( 10.161 - pfn, (cmd == MMUEXT_PIN_L2_TABLE) ? PGT_l2_page_table : 10.162 - PGT_l1_page_table, 10.163 - CHECK_STRICT); 10.164 + pfn, 10.165 + (cmd==MMUEXT_PIN_L2_TABLE) ? PGT_l2_page_table : PGT_l1_page_table, 10.166 + PTS); 10.167 if ( unlikely(!okay) ) 10.168 { 10.169 MEM_LOG("Error while pinning pfn %08lx", pfn); 10.170 @@ -836,7 +809,7 @@ static int do_extended_command(unsigned 10.171 break; 10.172 10.173 case MMUEXT_UNPIN_TABLE: 10.174 - if ( unlikely(!(okay = get_page_from_pagenr(pfn, CHECK_STRICT))) ) 10.175 + if ( unlikely(!(okay = get_page_from_pagenr(pfn, PTS))) ) 10.176 { 10.177 MEM_LOG("Page %08lx bad domain (dom=%p)", 10.178 ptr, page->u.domain); 10.179 @@ -856,8 +829,7 @@ static int do_extended_command(unsigned 10.180 break; 10.181 10.182 case MMUEXT_NEW_BASEPTR: 10.183 - okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, 10.184 - CHECK_STRICT); 10.185 + okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, current); 10.186 if ( likely(okay) ) 10.187 { 10.188 invalidate_shadow_ldt(); 10.189 @@ -890,7 +862,7 @@ static int do_extended_command(unsigned 10.190 break; 10.191 10.192 case MMUEXT_INVLPG: 10.193 - __flush_tlb_one(val & ~MMUEXT_CMD_MASK); 10.194 + __flush_tlb_one(ptr); 10.195 break; 10.196 10.197 case MMUEXT_SET_LDT: 10.198 @@ -932,11 +904,13 @@ static int do_extended_command(unsigned 10.199 } 10.200 else 10.201 { 10.202 - if ( percpu_info[cpu].subject_p != NULL ) 10.203 - put_task_struct(percpu_info[cpu].subject_p); 10.204 - percpu_info[cpu].subject_p = find_domain_by_id( 10.205 + if ( percpu_info[cpu].gps != NULL ) 10.206 + put_task_struct(percpu_info[cpu].gps); 10.207 + percpu_info[cpu].gps = find_domain_by_id( 10.208 percpu_info[cpu].subject_id); 10.209 - if ( percpu_info[cpu].subject_p == NULL ) 10.210 + percpu_info[cpu].pts = (val & SET_PAGETABLE_SUBJECTDOM) ? 10.211 + percpu_info[cpu].gps : NULL; 10.212 + if ( percpu_info[cpu].gps == NULL ) 10.213 { 10.214 MEM_LOG("Unknown domain '%llu'", percpu_info[cpu].subject_id); 10.215 okay = 0; 10.216 @@ -987,7 +961,7 @@ int do_mmu_update(mmu_update_t *ureqs, i 10.217 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table. 
10.218 */ 10.219 case MMU_NORMAL_PT_UPDATE: 10.220 - if ( unlikely(!get_page_from_pagenr(pfn, CHECK_STRICT)) ) 10.221 + if ( unlikely(!get_page_from_pagenr(pfn, PTS)) ) 10.222 { 10.223 MEM_LOG("Could not get page for normal update"); 10.224 break; 10.225 @@ -1059,7 +1033,7 @@ int do_mmu_update(mmu_update_t *ureqs, i 10.226 break; 10.227 10.228 case MMU_MACHPHYS_UPDATE: 10.229 - if ( unlikely(!get_page_from_pagenr(pfn, CHECK_STRICT)) ) 10.230 + if ( unlikely(!get_page_from_pagenr(pfn, GPS)) ) 10.231 { 10.232 MEM_LOG("Could not get page for mach->phys update"); 10.233 break; 10.234 @@ -1108,10 +1082,10 @@ int do_mmu_update(mmu_update_t *ureqs, i 10.235 if ( deferred_ops & DOP_RELOAD_LDT ) 10.236 (void)map_ldt_shadow_page(0); 10.237 10.238 - if ( unlikely(percpu_info[cpu].subject_p != NULL) ) 10.239 + if ( unlikely(percpu_info[cpu].gps != NULL) ) 10.240 { 10.241 - put_task_struct(percpu_info[cpu].subject_p); 10.242 - percpu_info[cpu].subject_p = NULL; 10.243 + put_task_struct(percpu_info[cpu].gps); 10.244 + percpu_info[cpu].gps = percpu_info[cpu].pts = NULL; 10.245 } 10.246 10.247 return rc;
11.1 --- a/xen/include/hypervisor-ifs/hypervisor-if.h Wed Apr 21 10:43:06 2004 +0000 11.2 +++ b/xen/include/hypervisor-ifs/hypervisor-if.h Thu Apr 22 13:56:30 2004 +0000 11.3 @@ -71,22 +71,73 @@ 11.4 #define NR_VIRQS 12 11.5 11.6 /* 11.7 - * MMU_XXX: specified in least 2 bits of 'ptr' field. These bits are masked 11.8 - * off to get the real 'ptr' value. 11.9 - * All requests specify relevent address in 'ptr'. This is either a 11.10 - * machine/physical address (MA), or linear/virtual address (VA). 11.11 - * Normal requests specify update value in 'value'. 11.12 - * Extended requests specify command in least 8 bits of 'value'. These bits 11.13 - * are masked off to get the real 'val' value. Except for MMUEXT_SET_LDT 11.14 - * which shifts the least bits out. 11.15 + * MMU-UPDATE REQUESTS 11.16 + * 11.17 + * HYPERVISOR_mmu_update() accepts a list of (ptr, val) pairs. 11.18 + * ptr[1:0] specifies the appropriate MMU_* command. 11.19 + * 11.20 + * GPS (General-Purpose Subject) 11.21 + * ----------------------------- 11.22 + * This domain that must own all non-page-table pages that are involved in 11.23 + * MMU updates. By default it is the domain that executes mmu_update(). If the 11.24 + * caller has sufficient privilege then it can be changed by executing 11.25 + * MMUEXT_SET_SUBJECTDOM_{L,H}. 11.26 + * 11.27 + * PTS (Page-Table Subject) 11.28 + * ------------------------ 11.29 + * This domain must own all the page-table pages that are subject to MMU 11.30 + * updates. By default it is the domain that executes mmu_update(). If the 11.31 + * caller has sufficient privilege then it can be changed by executing 11.32 + * MMUEXT_SET_SUBJECTDOM_H with val[14] (SET_PAGETABLE_SUBJECTDOM) set. 11.33 + * 11.34 + * ptr[1:0] == MMU_NORMAL_PT_UPDATE: 11.35 + * Updates an entry in a page table. 11.36 + * ptr[:2] -- machine address of the page-table entry to modify [1] 11.37 + * val -- value to write [2] 11.38 + * 11.39 + * ptr[1:0] == MMU_MACHPHYS_UPDATE: 11.40 + * Updates an entry in the machine->pseudo-physical mapping table. 11.41 + * ptr[:2] -- machine address within the frame whose mapping to modify [3] 11.42 + * val -- value to write into the mapping entry 11.43 + * 11.44 + * ptr[1:0] == MMU_EXTENDED_COMMAND: 11.45 + * val[7:0] -- MMUEXT_* command 11.46 + * 11.47 + * val[7:0] == MMUEXT_(UN)PIN_*_TABLE: 11.48 + * ptr[:2] -- machine address of frame to be (un)pinned as a p.t. page [1] 11.49 + * 11.50 + * val[7:0] == MMUEXT_NEW_BASEPTR: 11.51 + * ptr[:2] -- machine address of new page-table base to install in MMU [1] 11.52 + * 11.53 + * val[7:0] == MMUEXT_TLB_FLUSH: 11.54 + * no additional arguments 11.55 + * 11.56 + * val[7:0] == MMUEXT_INVLPG: 11.57 + * ptr[:2] -- linear address to be flushed from the TLB 11.58 + * 11.59 + * val[7:0] == MMUEXT_SET_LDT: 11.60 + * ptr[:2] -- linear address of LDT base (NB. must be page-aligned) 11.61 + * val[:8] -- number of entries in LDT 11.62 + * 11.63 + * val[7:0] == MMUEXT_SET_SUBJECTDOM_L: 11.64 + * (ptr[31:15],val[31:15]) -- dom[31:0] 11.65 + * 11.66 + * val[7:0] == MMUEXT_SET_SUBJECTDOM_H: 11.67 + * val[14] -- if TRUE then sets the PTS in addition to the GPS. 11.68 + * (ptr[31:15],val[31:15]) -- dom[63:32] 11.69 + * NB. This command must be immediately preceded by SET_SUBJECTDOM_L. 11.70 + * 11.71 + * Notes on constraints on the above arguments: 11.72 + * [1] The page frame containing the machine address must belong to the PTS. 
11.73 + * [2] If the PTE is valid (i.e., bit 0 is set) then the specified page frame 11.74 + * must belong to: 11.75 + * (a) the PTS (if the PTE is part of a non-L1 table); or 11.76 + * (b) the GPS (if the PTE is part of an L1 table). 11.77 + * [3] The page frame containing the machine address must belong to the GPS. 11.78 */ 11.79 -/* A normal page-table update request. */ 11.80 #define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */ 11.81 -/* Update an entry in the machine->physical mapping table. */ 11.82 #define MMU_MACHPHYS_UPDATE 2 /* ptr = MA of frame to modify entry for */ 11.83 -/* An extended command. */ 11.84 #define MMU_EXTENDED_COMMAND 3 /* least 8 bits of val demux further */ 11.85 -/* Extended commands: */ 11.86 #define MMUEXT_PIN_L1_TABLE 0 /* ptr = MA of frame to pin */ 11.87 #define MMUEXT_PIN_L2_TABLE 1 /* ptr = MA of frame to pin */ 11.88 #define MMUEXT_PIN_L3_TABLE 2 /* ptr = MA of frame to pin */ 11.89 @@ -94,11 +145,12 @@ 11.90 #define MMUEXT_UNPIN_TABLE 4 /* ptr = MA of frame to unpin */ 11.91 #define MMUEXT_NEW_BASEPTR 5 /* ptr = MA of new pagetable base */ 11.92 #define MMUEXT_TLB_FLUSH 6 /* ptr = NULL */ 11.93 -#define MMUEXT_INVLPG 7 /* ptr = NULL ; val = VA to invalidate */ 11.94 +#define MMUEXT_INVLPG 7 /* ptr = VA to invalidate */ 11.95 #define MMUEXT_SET_LDT 8 /* ptr = VA of table; val = # entries */ 11.96 /* NB. MMUEXT_SET_SUBJECTDOM must consist of *_L followed immediately by *_H */ 11.97 #define MMUEXT_SET_SUBJECTDOM_L 9 /* (ptr[31:15],val[31:15]) = dom[31:0] */ 11.98 #define MMUEXT_SET_SUBJECTDOM_H 10 /* (ptr[31:15],val[31:15]) = dom[63:32] */ 11.99 +#define SET_PAGETABLE_SUBJECTDOM (1<<14) /* OR into 'val' arg of SUBJECTDOM_H*/ 11.100 #define MMUEXT_CMD_MASK 255 11.101 #define MMUEXT_CMD_SHIFT 8 11.102
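To make the new subject-domain encoding concrete, here is a hedged sketch of the two-request prologue a privileged tool would emit before a batch of updates on behalf of domain 'dom'. It mirrors flush_mmu_updates() in xc_private.c above; the SUBJECTDOM_H 'ptr' half is inferred from the spec, and do_mmu_update() stands in for the caller's hypercall wrapper.

/* Sketch: set GPS (and, via SET_PAGETABLE_SUBJECTDOM, the PTS) to
 * 'dom', then apply one machine->physical update mapping machine
 * frame 'mfn' to pseudo-physical frame 'pfn'. */
static int machphys_update_for(int xc_handle, domid_t dom,
                               unsigned long mfn, unsigned long pfn)
{
    mmu_update_t req[3];

    req[0].ptr = ((unsigned long)(dom <<  0) & ~0xFFFFUL) | MMU_EXTENDED_COMMAND;
    req[0].val = ((unsigned long)(dom << 16) & ~0xFFFFUL) | MMUEXT_SET_SUBJECTDOM_L;
    req[1].ptr = ((unsigned long)(dom >> 32) & ~0xFFFFUL) | MMU_EXTENDED_COMMAND;
    req[1].val = ((unsigned long)(dom >> 16) & ~0xFFFFUL) |
                 MMUEXT_SET_SUBJECTDOM_H | SET_PAGETABLE_SUBJECTDOM;

    req[2].ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
    req[2].val = pfn;                        /* new m2p entry value    */

    return do_mmu_update(xc_handle, req, 3); /* hypercall wrapper      */
}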
12.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/Makefile Wed Apr 21 10:43:06 2004 +0000 12.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/Makefile Thu Apr 22 13:56:30 2004 +0000 12.3 @@ -1,3 +1,3 @@ 12.4 O_TARGET := drv.o 12.5 -obj-y := main.o 12.6 +obj-y := main.o control.o interface.o vbd.o 12.7 include $(TOPDIR)/Rules.make
13.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 13.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/common.h Thu Apr 22 13:56:30 2004 +0000 13.3 @@ -0,0 +1,94 @@ 13.4 +/****************************************************************************** 13.5 + * arch/xen/drivers/vblkif/backend/common.h 13.6 + */ 13.7 + 13.8 +#ifndef __VBLKIF__BACKEND__COMMON_H__ 13.9 +#define __VBLKIF__BACKEND__COMMON_H__ 13.10 + 13.11 +#include <linux/config.h> 13.12 +#include <linux/module.h> 13.13 +#include <linux/rbtree.h> 13.14 +#include <linux/interrupt.h> 13.15 +#include <linux/slab.h> 13.16 +#include <asm/ctrl_if.h> 13.17 +#include <asm/io.h> 13.18 + 13.19 +#ifndef NDEBUG 13.20 +#define ASSERT(_p) \ 13.21 + if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \ 13.22 + __LINE__, __FILE__); *(int*)0=0; } 13.23 +#define DPRINTK(_f, _a...) printk("(file=%s, line=%d) " _f, \ 13.24 + __FILE__ , __LINE__ , ## _a ) 13.25 +#else 13.26 +#define ASSERT(_p) ((void)0) 13.27 +#define DPRINTK(_f, _a...) ((void)0) 13.28 +#endif 13.29 + 13.30 +typedef struct { 13.31 + /* Unique identifier for this interface. */ 13.32 + domid_t domid; 13.33 + unsigned int handle; 13.34 + /* Physical parameters of the comms window. */ 13.35 + unsigned long shmem_frame; 13.36 + unsigned int evtchn; 13.37 + int irq; 13.38 + /* Comms information. */ 13.39 + blk_ring_t *blk_ring_base; /* ioremap()'ed ptr to shmem_frame. */ 13.40 + BLK_RING_IDX blk_req_cons; /* Request consumer. */ 13.41 + BLK_RING_IDX blk_resp_prod; /* Private version of response producer. */ 13.42 + /* VBDs attached to this interface. */ 13.43 + rb_root_t vbd_rb; /* Mapping from 16-bit vdevices to VBDs. */ 13.44 + spinlock_t vbd_lock; /* Protects VBD mapping. */ 13.45 + /* Private fields. */ 13.46 + struct list_head blkdev_list; 13.47 + spinlock_t blk_ring_lock; 13.48 +} blkif_t; 13.49 + 13.50 +blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle); 13.51 +void blkif_get(blkif_t *blkif); 13.52 +void blkif_put(blkif_t *blkif); 13.53 + 13.54 +/* An entry in a list of xen_extents. 
*/ 13.55 +typedef struct _blkif_extent_le { 13.56 + blkif_extent_t extent; /* an individual extent */ 13.57 + struct _blkif_extent_le *next; /* and a pointer to the next */ 13.58 +} blkif_extent_le_t; 13.59 + 13.60 +typedef struct _vbd { 13.61 + blkif_vdev_t vdevice; /* what the domain refers to this vbd as */ 13.62 + unsigned char mode; /* VBD_MODE_{R,W} */ 13.63 + unsigned char type; /* XD_TYPE_xxx */ 13.64 + blkif_extent_le_t *extents; /* list of xen_extents making up this vbd */ 13.65 + rb_node_t rb; /* for linking into R-B tree lookup struct */ 13.66 +} vbd_t; 13.67 + 13.68 +long vbd_create(blkif_vbd_create_t *create_params); 13.69 +long vbd_grow(blkif_vbd_grow_t *grow_params); 13.70 +long vbd_shrink(blkif_vbd_shrink_t *shrink_params); 13.71 +long vbd_destroy(blkif_vbd_destroy_t *delete_params); 13.72 + 13.73 +void destroy_all_vbds(struct task_struct *p); 13.74 + 13.75 +typedef struct { 13.76 + blkif_t *blkif; 13.77 + unsigned long id; 13.78 + atomic_t pendcnt; 13.79 + unsigned short operation; 13.80 + unsigned short status; 13.81 +} pending_req_t; 13.82 + 13.83 +/* Describes a [partial] disk extent (part of a block io request) */ 13.84 +typedef struct { 13.85 + unsigned short dev; 13.86 + unsigned short nr_sects; 13.87 + unsigned long buffer; 13.88 + xen_sector_t sector_number; 13.89 +} phys_seg_t; 13.90 + 13.91 +int vbd_translate(phys_seg_t *pseg, blkif_t *blkif, int operation); 13.92 + 13.93 +int vblkif_be_controller_init(void); 13.94 + 13.95 +void vblkif_be_int(int irq, void *dev_id, struct pt_regs *regs); 13.96 + 13.97 +#endif /* __VBLKIF__BACKEND__COMMON_H__ */
14.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 14.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/control.c Thu Apr 22 13:56:30 2004 +0000 14.3 @@ -0,0 +1,60 @@ 14.4 +/****************************************************************************** 14.5 + * arch/xen/drivers/vblkif/backend/control.c 14.6 + * 14.7 + * Routines for interfacing with the control plane. 14.8 + * 14.9 + * Copyright (c) 2004, Keir Fraser 14.10 + */ 14.11 + 14.12 +#include "common.h" 14.13 + 14.14 +static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) 14.15 +{ 14.16 + switch ( msg->subtype ) 14.17 + { 14.18 + case CMSG_BLKIF_BE_CREATE: 14.19 + if ( msg->length != sizeof(blkif_create_t) ) 14.20 + goto parse_error; 14.21 + blkif_create((blkif_create_t *)&msg->msg[0]); 14.22 + break; 14.23 + case CMSG_BLKIF_BE_DESTROY: 14.24 + if ( msg->length != sizeof(blkif_destroy_t) ) 14.25 + goto parse_error; 14.26 + blkif_destroy((blkif_destroy_t *)&msg->msg[0]); 14.27 + break; 14.28 + case CMSG_BLKIF_BE_VBD_CREATE: 14.29 + if ( msg->length != sizeof(blkif_vbd_create_t) ) 14.30 + goto parse_error; 14.31 + vbd_create((blkif_vbd_create_t *)&msg->msg[0]); 14.32 + break; 14.33 + case CMSG_BLKIF_BE_VBD_DESTROY: 14.34 + if ( msg->length != sizeof(blkif_vbd_destroy_t) ) 14.35 + goto parse_error; 14.36 + vbd_destroy((blkif_vbd_destroy_t *)&msg->msg[0]); 14.37 + break; 14.38 + case CMSG_BLKIF_BE_VBD_GROW: 14.39 + if ( msg->length != sizeof(blkif_vbd_grow_t) ) 14.40 + goto parse_error; 14.41 + vbd_grow((blkif_vbd_grow_t *)&msg->msg[0]); 14.42 + break; 14.43 + case CMSG_BLKIF_BE_VBD_SHRINK: 14.44 + if ( msg->length != sizeof(blkif_vbd_shrink_t) ) 14.45 + goto parse_error; 14.46 + vbd_shrink((blkif_vbd_shrink_t *)&msg->msg[0]); 14.47 + break; 14.48 + default: 14.49 + goto parse_error; 14.50 + } 14.51 + 14.52 + ctrl_if_send_response(msg); 14.53 + return; 14.54 + 14.55 + parse_error: 14.56 + msg->length = 0; 14.57 + ctrl_if_send_response(msg); 14.58 +} 14.59 + 14.60 +int blkif_ctrlif_init(void) 14.61 +{ 14.62 + (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx); 14.63 +}
15.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 15.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/interface.c Thu Apr 22 13:56:30 2004 +0000 15.3 @@ -0,0 +1,96 @@ 15.4 +/****************************************************************************** 15.5 + * arch/xen/drivers/vblkif/backend/interface.c 15.6 + * 15.7 + * Block-device interface management. 15.8 + * 15.9 + * Copyright (c) 2004, Keir Fraser 15.10 + */ 15.11 + 15.12 +#include "common.h" 15.13 + 15.14 +#define BLKIF_HASHSZ 1024 15.15 +#define BLKIF_HASH(_d,_h) \ 15.16 + (((int)(_d)^(int)((_d)>>32)^(int)(_h))&(BLKIF_HASHSZ-1)) 15.17 + 15.18 +static blkif_t *blkif_hash[BLKIF_HASHSZ]; 15.19 + 15.20 +blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle) 15.21 +{ 15.22 + blkif_t *blkif = &blkif_hash[BLKIF_HASH(domid, handle)]; 15.23 + while ( (blkif != NULL) && 15.24 + (blkif->domid != domid) && 15.25 + (blkif->handle != handle) ) 15.26 + blkif = blkif->hash_next; 15.27 + return blkif; 15.28 +} 15.29 + 15.30 +static void blkif_create(blkif_create_t *create) 15.31 +{ 15.32 + domid_t domid = create->domid; 15.33 + unsigned int handle = create->blkif_handle; 15.34 + unsigned int evtchn = create->evtchn; 15.35 + unsigned long shmem_frame = create->shmem_frame; 15.36 + blkif_t **pblkif, *blkif; 15.37 + 15.38 + pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; 15.39 + while ( *pblkif == NULL ) 15.40 + { 15.41 + if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) ) 15.42 + goto found_match; 15.43 + pblkif = &(*pblkif)->hash_next; 15.44 + } 15.45 + 15.46 + blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL); 15.47 + memset(blkif, 0, sizeof(*blkif)); 15.48 + blkif->domid = domid; 15.49 + blkif->handle = handle; 15.50 + blkif->evtchn = evtchn; 15.51 + blkif->irq = bind_evtchn_to_irq(evtchn); 15.52 + blkif->shmem_frame = shmem_frame; 15.53 + blkif->shmem_vbase = ioremap(shmem_frame<<PAGE_SHIFT, PAGE_SIZE); 15.54 + spin_lock_init(&blkif->vbd_lock); 15.55 + spin_lock_init(&blkif->blk_ring_lock); 15.56 + 15.57 + request_irq(irq, vblkif_be_int, 0, "vblkif-backend", blkif); 15.58 + 15.59 + blkif->hash_next = *pblkif; 15.60 + *pblkif = blkif; 15.61 + 15.62 + create->status = BLKIF_STATUS_OKAY; 15.63 + return; 15.64 + 15.65 + found_match: 15.66 + create->status = BLKIF_STATUS_INTERFACE_EXISTS; 15.67 + return; 15.68 + 15.69 + evtchn_in_use: 15.70 + unbind_evtchn_from_irq(evtchn); /* drop refcnt */ 15.71 + create->status = BLKIF_STATUS_ERROR; 15.72 + return; 15.73 +} 15.74 + 15.75 +static void blkif_destroy(blkif_destroy_t *destroy) 15.76 +{ 15.77 + domid_t domid = destroy->domid; 15.78 + unsigned int handle = destroy->blkif_handle; 15.79 + blkif_t **pblkif, *blkif; 15.80 + 15.81 + pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; 15.82 + while ( (blkif = *pblkif) == NULL ) 15.83 + { 15.84 + if ( (blkif->domid == domid) && (blkif->handle == handle) ) 15.85 + goto found_match; 15.86 + pblkif = &blkif->hash_next; 15.87 + } 15.88 + 15.89 + destroy->status = BLKIF_STATUS_NO_INTERFACE; 15.90 + return; 15.91 + 15.92 + found_match: 15.93 + free_irq(blkif->irq, NULL); 15.94 + unbind_evtchn_from_irq(blkif->evtchn); 15.95 + *pblkif = blkif->hash_next; 15.96 + kmem_cache_free(blkif_cachep, blkif); 15.97 + destroy->status = BLKIF_STATUS_OKAY; 15.98 +} 15.99 +
16.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 16.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/main.c Thu Apr 22 13:56:30 2004 +0000 16.3 @@ -0,0 +1,508 @@ 16.4 +/****************************************************************************** 16.5 + * arch/xen/drivers/vblkif/backend/main.c 16.6 + * 16.7 + * Back-end of the driver for virtual block devices. This portion of the 16.8 + * driver exports a 'unified' block-device interface that can be accessed 16.9 + * by any operating system that implements a compatible front end. A 16.10 + * reference front-end implementation can be found in: 16.11 + * arch/xen/drivers/vblkif/frontend 16.12 + * 16.13 + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand 16.14 + */ 16.15 + 16.16 +#include "common.h" 16.17 + 16.18 +/* 16.19 + * These are rather arbitrary. They are fairly large because adjacent requests 16.20 + * pulled from a communication ring are quite likely to end up being part of 16.21 + * the same scatter/gather request at the disc. 16.22 + * 16.23 + * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW ** 16.24 + * This will increase the chances of being able to write whole tracks. 16.25 + * 64 should be enough to keep us competitive with Linux. 16.26 + */ 16.27 +#define MAX_PENDING_REQS 64 16.28 +#define BATCH_PER_DOMAIN 16 16.29 + 16.30 +/* 16.31 + * Each outstanding request that we've passed to the lower device layers has a 16.32 + * 'pending_req' allocated to it. Each buffer_head that completes decrements 16.33 + * the pendcnt towards zero. When it hits zero, the specified domain has a 16.34 + * response queued for it, with the saved 'id' passed back. 16.35 + * 16.36 + * We can't allocate pending_req's in order, since they may complete out of 16.37 + * order. We therefore maintain an allocation ring. This ring also indicates 16.38 + * when enough work has been passed down -- at that point the allocation ring 16.39 + * will be empty. 16.40 + */ 16.41 +static pending_req_t pending_reqs[MAX_PENDING_REQS]; 16.42 +static unsigned char pending_ring[MAX_PENDING_REQS]; 16.43 +static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED; 16.44 +/* NB. We use a different index type to differentiate from shared blk rings. 
*/ 16.45 +typedef unsigned int PEND_RING_IDX; 16.46 +#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1)) 16.47 +static PEND_RING_IDX pending_prod, pending_cons; 16.48 +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) 16.49 + 16.50 +static kmem_cache_t *buffer_head_cachep; 16.51 + 16.52 +static struct buffer_head *completed_bhs[NR_CPUS] __cacheline_aligned; 16.53 + 16.54 +static int lock_buffer(blkif_t *blkif, 16.55 + unsigned long buffer, 16.56 + unsigned short size, 16.57 + int writeable_buffer); 16.58 +static void unlock_buffer(unsigned long buffer, 16.59 + unsigned short size, 16.60 + int writeable_buffer); 16.61 + 16.62 +static void io_schedule(unsigned long unused); 16.63 +static int do_block_io_op(blkif_t *blkif, int max_to_do); 16.64 +static void dispatch_rw_block_io(blkif_t *blkif, 16.65 + blk_ring_req_entry_t *req); 16.66 +static void make_response(blkif_t *blkif, unsigned long id, 16.67 + unsigned short op, unsigned long st); 16.68 + 16.69 + 16.70 +/****************************************************************** 16.71 + * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE 16.72 + */ 16.73 + 16.74 +static struct list_head io_schedule_list; 16.75 +static spinlock_t io_schedule_list_lock; 16.76 + 16.77 +static int __on_blkdev_list(blkif_t *blkif) 16.78 +{ 16.79 + return blkif->blkdev_list.next != NULL; 16.80 +} 16.81 + 16.82 +static void remove_from_blkdev_list(blkif_t *blkif) 16.83 +{ 16.84 + unsigned long flags; 16.85 + if ( !__on_blkdev_list(blkif) ) return; 16.86 + spin_lock_irqsave(&io_schedule_list_lock, flags); 16.87 + if ( __on_blkdev_list(blkif) ) 16.88 + { 16.89 + list_del(&blkif->blkdev_list); 16.90 + blkif->blkdev_list.next = NULL; 16.91 + blkif_put(blkif); 16.92 + } 16.93 + spin_unlock_irqrestore(&io_schedule_list_lock, flags); 16.94 +} 16.95 + 16.96 +static void add_to_blkdev_list_tail(blkif_t *blkif) 16.97 +{ 16.98 + unsigned long flags; 16.99 + if ( __on_blkdev_list(blkif) ) return; 16.100 + spin_lock_irqsave(&io_schedule_list_lock, flags); 16.101 + if ( !__on_blkdev_list(blkif) ) 16.102 + { 16.103 + list_add_tail(&blkif->blkdev_list, &io_schedule_list); 16.104 + blkif_get(blkif); 16.105 + } 16.106 + spin_unlock_irqrestore(&io_schedule_list_lock, flags); 16.107 +} 16.108 + 16.109 + 16.110 +/****************************************************************** 16.111 + * SCHEDULER FUNCTIONS 16.112 + */ 16.113 + 16.114 +static DECLARE_TASKLET(io_schedule_tasklet, io_schedule, 0); 16.115 + 16.116 +static void io_schedule(unsigned long unused) 16.117 +{ 16.118 + blkif_t *blkif; 16.119 + struct list_head *ent; 16.120 + 16.121 + /* Queue up a batch of requests. */ 16.122 + while ( (NR_PENDING_REQS < MAX_PENDING_REQS) && 16.123 + !list_empty(&io_schedule_list) ) 16.124 + { 16.125 + ent = io_schedule_list.next; 16.126 + blkif = list_entry(ent, blkif_t, blkdev_list); 16.127 + blkif_get(blkif); 16.128 + remove_from_blkdev_list(blkif); 16.129 + if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) ) 16.130 + add_to_blkdev_list_tail(blkif); 16.131 + blkif_put(blkif); 16.132 + } 16.133 + 16.134 + /* Push the batch through to disc. */ 16.135 + run_task_queue(&tq_disk); 16.136 +} 16.137 + 16.138 +static void maybe_trigger_io_schedule(void) 16.139 +{ 16.140 + /* 16.141 + * Needed so that two processes, who together make the following predicate 16.142 + * true, don't both read stale values and evaluate the predicate 16.143 + * incorrectly. Incredibly unlikely to stall the scheduler on x86, but... 
16.144 + */ 16.145 + smp_mb(); 16.146 + 16.147 + if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && 16.148 + !list_empty(&io_schedule_list) ) 16.149 + tasklet_schedule(&io_schedule_tasklet); 16.150 +} 16.151 + 16.152 + 16.153 + 16.154 +/****************************************************************** 16.155 + * COMPLETION CALLBACK -- Called as bh->b_end_io() 16.156 + */ 16.157 + 16.158 +static void end_block_io_op(struct buffer_head *bh, int uptodate) 16.159 +{ 16.160 + pending_req_t *pending_req = bh->b_private; 16.161 + 16.162 + /* An error fails the entire request. */ 16.163 + if ( !uptodate ) 16.164 + { 16.165 + DPRINTK("Buffer not up-to-date at end of operation\n"); 16.166 + pending_req->status = 2; 16.167 + } 16.168 + 16.169 + unlock_buffer(virt_to_phys(bh->b_data), 16.170 + bh->b_size, 16.171 + (pending_req->operation==READ)); 16.172 + 16.173 + if ( atomic_dec_and_test(&pending_req->pendcnt) ) 16.174 + { 16.175 + make_response(pending_req->blkif, pending_req->id, 16.176 + pending_req->operation, pending_req->status); 16.177 + blkif_put(pending_req->blkif); 16.178 + spin_lock(&pend_prod_lock); 16.179 + pending_ring[MASK_PEND_IDX(pending_prod)] = 16.180 + pending_req - pending_reqs; 16.181 + pending_prod++; 16.182 + spin_unlock(&pend_prod_lock); 16.183 + maybe_trigger_io_schedule(); 16.184 + } 16.185 +} 16.186 + 16.187 + 16.188 + 16.189 +/****************************************************************************** 16.190 + * NOTIFICATION FROM GUEST OS. 16.191 + */ 16.192 + 16.193 +void vblkif_be_int(int irq, void *dev_id, struct pt_regs *regs) 16.194 +{ 16.195 + blkif_t *blkif = dev_id; 16.196 + add_to_blkdev_list_tail(blkif); 16.197 + maybe_trigger_io_schedule(); 16.198 +} 16.199 + 16.200 + 16.201 + 16.202 +/****************************************************************** 16.203 + * DOWNWARD CALLS -- These interface with the block-device layer proper. 16.204 + */ 16.205 + 16.206 +static int lock_buffer(blkif_t *blkif, 16.207 + unsigned long buffer, 16.208 + unsigned short size, 16.209 + int writeable_buffer) 16.210 +{ 16.211 + unsigned long pfn; 16.212 + 16.213 + for ( pfn = buffer >> PAGE_SHIFT; 16.214 + pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT); 16.215 + pfn++ ) 16.216 + { 16.217 + } 16.218 + 16.219 + return 1; 16.220 + 16.221 + fail: 16.222 + while ( pfn-- > (buffer >> PAGE_SHIFT) ) 16.223 + { 16.224 + } 16.225 + return 0; 16.226 +} 16.227 + 16.228 +static void unlock_buffer(unsigned long buffer, 16.229 + unsigned short size, 16.230 + int writeable_buffer) 16.231 +{ 16.232 + unsigned long pfn; 16.233 + 16.234 + for ( pfn = buffer >> PAGE_SHIFT; 16.235 + pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT); 16.236 + pfn++ ) 16.237 + { 16.238 + } 16.239 +} 16.240 + 16.241 +static int do_block_io_op(blkif_t *blkif, int max_to_do) 16.242 +{ 16.243 + blk_ring_t *blk_ring = blkif->blk_ring_base; 16.244 + blk_ring_req_entry_t *req; 16.245 + BLK_RING_IDX i; 16.246 + int more_to_do = 0; 16.247 + 16.248 + /* Take items off the comms ring, taking care not to overflow. 
*/ 16.249 + for ( i = blkif->blk_req_cons; 16.250 + (i != blk_ring->req_prod) && ((i-blkif->blk_resp_prod) != 16.251 + BLK_RING_SIZE); 16.252 + i++ ) 16.253 + { 16.254 + if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) ) 16.255 + { 16.256 + more_to_do = 1; 16.257 + break; 16.258 + } 16.259 + 16.260 + req = &blk_ring->ring[MASK_BLK_IDX(i)].req; 16.261 + switch ( req->operation ) 16.262 + { 16.263 + case BLKIF_OP_READ: 16.264 + case BLKIF_OP_WRITE: 16.265 + dispatch_rw_block_io(blkif, req); 16.266 + break; 16.267 + 16.268 + default: 16.269 + DPRINTK("error: unknown block io operation [%d]\n", 16.270 + blk_ring->ring[i].req.operation); 16.271 + make_response(blkif, blk_ring->ring[i].req.id, 16.272 + blk_ring->ring[i].req.operation, 1); 16.273 + break; 16.274 + } 16.275 + } 16.276 + 16.277 + blkif->blk_req_cons = i; 16.278 + return more_to_do; 16.279 +} 16.280 + 16.281 +static void dispatch_rw_block_io(blkif_t *blkif, 16.282 + blk_ring_req_entry_t *req) 16.283 +{ 16.284 + extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); 16.285 + struct buffer_head *bh; 16.286 + int operation = (req->operation == XEN_BLOCK_WRITE) ? WRITE : READ; 16.287 + unsigned short nr_sects; 16.288 + unsigned long buffer; 16.289 + int i, tot_sects; 16.290 + pending_req_t *pending_req; 16.291 + 16.292 + /* We map virtual scatter/gather segments to physical segments. */ 16.293 + int new_segs, nr_psegs = 0; 16.294 + phys_seg_t phys_seg[MAX_BLK_SEGS * 2]; 16.295 + 16.296 + /* Check that number of segments is sane. */ 16.297 + if ( unlikely(req->nr_segments == 0) || 16.298 + unlikely(req->nr_segments > MAX_BLK_SEGS) ) 16.299 + { 16.300 + DPRINTK("Bad number of segments in request (%d)\n", req->nr_segments); 16.301 + goto bad_descriptor; 16.302 + } 16.303 + 16.304 + /* 16.305 + * Check each address/size pair is sane, and convert into a 16.306 + * physical device and block offset. Note that if the offset and size 16.307 + * crosses a virtual extent boundary, we may end up with more 16.308 + * physical scatter/gather segments than virtual segments. 16.309 + */ 16.310 + for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects ) 16.311 + { 16.312 + buffer = req->buffer_and_sects[i] & ~0x1FF; 16.313 + nr_sects = req->buffer_and_sects[i] & 0x1FF; 16.314 + 16.315 + if ( unlikely(nr_sects == 0) ) 16.316 + { 16.317 + DPRINTK("zero-sized data request\n"); 16.318 + goto bad_descriptor; 16.319 + } 16.320 + 16.321 + phys_seg[nr_psegs].dev = req->device; 16.322 + phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects; 16.323 + phys_seg[nr_psegs].buffer = buffer; 16.324 + phys_seg[nr_psegs].nr_sects = nr_sects; 16.325 + 16.326 + /* Translate the request into the relevant 'physical device' */ 16.327 + new_segs = vbd_translate(&phys_seg[nr_psegs], blkif, operation); 16.328 + if ( new_segs < 0 ) 16.329 + { 16.330 + DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", 16.331 + operation == READ ? 
"read" : "write", 16.332 + req->sector_number + tot_sects, 16.333 + req->sector_number + tot_sects + nr_sects, 16.334 + req->device); 16.335 + goto bad_descriptor; 16.336 + } 16.337 + 16.338 + nr_psegs += new_segs; 16.339 + ASSERT(nr_psegs <= MAX_BLK_SEGS*2); 16.340 + } 16.341 + 16.342 + for ( i = 0; i < nr_psegs; i++ ) 16.343 + { 16.344 + if ( unlikely(!lock_buffer(blkif, phys_seg[i].buffer, 16.345 + phys_seg[i].nr_sects << 9, 16.346 + operation==READ)) ) 16.347 + { 16.348 + DPRINTK("invalid buffer\n"); 16.349 + while ( i-- > 0 ) 16.350 + unlock_buffer(phys_seg[i].buffer, 16.351 + phys_seg[i].nr_sects << 9, 16.352 + operation==READ); 16.353 + goto bad_descriptor; 16.354 + } 16.355 + } 16.356 + 16.357 + pending_req = &pending_reqs[pending_ring[MASK_PEND_IDX(pending_cons++)]]; 16.358 + pending_req->blkif = blkif; 16.359 + pending_req->id = req->id; 16.360 + pending_req->operation = operation; 16.361 + pending_req->status = 0; 16.362 + atomic_set(&pending_req->pendcnt, nr_psegs); 16.363 + 16.364 + blkif_get(blkif); 16.365 + 16.366 + /* Now we pass each segment down to the real blkdev layer. */ 16.367 + for ( i = 0; i < nr_psegs; i++ ) 16.368 + { 16.369 + bh = kmem_cache_alloc(buffer_head_cachep, GFP_KERNEL); 16.370 + if ( unlikely(bh == NULL) ) 16.371 + panic("bh is null\n"); 16.372 + memset(bh, 0, sizeof (struct buffer_head)); 16.373 + 16.374 + bh->b_size = phys_seg[i].nr_sects << 9; 16.375 + bh->b_dev = phys_seg[i].dev; 16.376 + bh->b_rsector = (unsigned long)phys_seg[i].sector_number; 16.377 + 16.378 + /* SMH: we store a 'pseudo-virtual' bogus address in b_data since 16.379 + later code will undo this transformation (i.e. +-PAGE_OFFSET). */ 16.380 + bh->b_data = phys_to_virt(phys_seg[i].buffer); 16.381 + 16.382 + /* SMH: bh_phys() uses the below field as a 'cheap' virt_to_phys */ 16.383 + bh->b_page = &mem_map[phys_seg[i].buffer>>PAGE_SHIFT]; 16.384 + bh->b_end_io = end_block_io_op; 16.385 + bh->b_private = pending_req; 16.386 + 16.387 + bh->b_state = (1 << BH_Mapped) | (1 << BH_Lock); 16.388 + if ( operation == WRITE ) 16.389 + bh->b_state |= (1 << BH_JBD) | (1 << BH_Req) | (1 << BH_Uptodate); 16.390 + 16.391 + atomic_set(&bh->b_count, 1); 16.392 + 16.393 + /* Dispatch a single request. We'll flush it to disc later. */ 16.394 + submit_bh(operation, bh); 16.395 + } 16.396 + 16.397 + return; 16.398 + 16.399 + bad_descriptor: 16.400 + make_response(blkif, req->id, req->operation, 1); 16.401 +} 16.402 + 16.403 + 16.404 + 16.405 +/****************************************************************** 16.406 + * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING 16.407 + */ 16.408 + 16.409 + 16.410 +static void make_response(blkif_t *blkif, unsigned long id, 16.411 + unsigned short op, unsigned long st) 16.412 +{ 16.413 + blk_ring_resp_entry_t *resp; 16.414 + 16.415 + /* Place on the response ring for the relevant domain. */ 16.416 + spin_lock(&blkif->blk_ring_lock); 16.417 + resp = &blkif->blk_ring_base-> 16.418 + ring[MASK_BLK_IDX(blkif->blk_resp_prod)].resp; 16.419 + resp->id = id; 16.420 + resp->operation = op; 16.421 + resp->status = st; 16.422 + wmb(); 16.423 + blkif->blk_ring_base->resp_prod = ++blkif->blk_resp_prod; 16.424 + spin_unlock(&blkif->blk_ring_lock); 16.425 + 16.426 + /* Kick the relevant domain. 
*/ 16.427 + notify_via_evtchn(blkif->evtchn); 16.428 +} 16.429 + 16.430 +static void blkif_debug_int(int irq, void *unused, struct pt_regs *regs) 16.431 +{ 16.432 +#if 0 16.433 + unsigned long flags; 16.434 + struct task_struct *p; 16.435 + blk_ring_t *blk_ring; 16.436 + int i; 16.437 + 16.438 + printk("Dumping block queue stats: nr_pending = %d" 16.439 + " (prod=0x%08x,cons=0x%08x)\n", 16.440 + NR_PENDING_REQS, pending_prod, pending_cons); 16.441 + 16.442 + read_lock_irqsave(&tasklist_lock, flags); 16.443 + for_each_domain ( p ) 16.444 + { 16.445 + printk("Domain: %llu\n", blkif->domain); 16.446 + blk_ring = blkif->blk_ring_base; 16.447 + printk(" req_prod:0x%08x, req_cons:0x%08x resp_prod:0x%08x/" 16.448 + "0x%08x on_list=%d\n", 16.449 + blk_ring->req_prod, blkif->blk_req_cons, 16.450 + blk_ring->resp_prod, blkif->blk_resp_prod, 16.451 + __on_blkdev_list(p)); 16.452 + } 16.453 + read_unlock_irqrestore(&tasklist_lock, flags); 16.454 + 16.455 + for ( i = 0; i < MAX_PENDING_REQS; i++ ) 16.456 + { 16.457 + printk("Pend%d: dom=%p, id=%08lx, cnt=%d, op=%d, status=%d\n", 16.458 + i, pending_reqs[i].domain, pending_reqs[i].id, 16.459 + atomic_read(&pending_reqs[i].pendcnt), 16.460 + pending_reqs[i].operation, pending_reqs[i].status); 16.461 + } 16.462 +#endif 16.463 +} 16.464 + 16.465 +void unlink_blkdev_info(blkif_t *blkif) 16.466 +{ 16.467 + unsigned long flags; 16.468 + 16.469 + spin_lock_irqsave(&io_schedule_list_lock, flags); 16.470 + if ( __on_blkdev_list(blkif) ) 16.471 + { 16.472 + list_del(&blkif->blkdev_list); 16.473 + blkif->blkdev_list.next = (void *)0xdeadbeef; 16.474 + blkif_put(blkif); 16.475 + } 16.476 + spin_unlock_irqrestore(&io_schedule_list_lock, flags); 16.477 +} 16.478 + 16.479 +static int __init init_module(void) 16.480 +{ 16.481 + int i; 16.482 + 16.483 + pending_cons = 0; 16.484 + pending_prod = MAX_PENDING_REQS; 16.485 + memset(pending_reqs, 0, sizeof(pending_reqs)); 16.486 + for ( i = 0; i < MAX_PENDING_REQS; i++ ) 16.487 + pending_ring[i] = i; 16.488 + 16.489 + for ( i = 0; i < NR_CPUS; i++ ) 16.490 + completed_bhs[i] = NULL; 16.491 + 16.492 + spin_lock_init(&io_schedule_list_lock); 16.493 + INIT_LIST_HEAD(&io_schedule_list); 16.494 + 16.495 + if ( request_irq(bind_virq_to_irq(VIRQ_DEBUG), blkif_debug_int, 16.496 + SA_SHIRQ, "vblkif-backend-dbg", &blkif_debug_int) != 0 ) 16.497 + printk(KERN_WARNING "Non-fatal error -- no debug interrupt\n"); 16.498 + 16.499 + buffer_head_cachep = kmem_cache_create( 16.500 + "buffer_head_cache", sizeof(struct buffer_head), 16.501 + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); 16.502 + 16.503 + return 0; 16.504 +} 16.505 + 16.506 +static void cleanup_module(void) 16.507 +{ 16.508 +} 16.509 + 16.510 +module_init(init_module); 16.511 +module_exit(cleanup_module);
17.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 17.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/vblkif/backend/vbd.c Thu Apr 22 13:56:30 2004 +0000 17.3 @@ -0,0 +1,701 @@ 17.4 +/****************************************************************************** 17.5 + * arch/xen/drivers/vblkif/backend/vbd.c 17.6 + * 17.7 + * Routines for managing virtual block devices (VBDs). 17.8 + * 17.9 + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand 17.10 + */ 17.11 + 17.12 +#include "common.h" 17.13 + 17.14 +long __vbd_create(struct task_struct *p, 17.15 + unsigned short vdevice, 17.16 + unsigned char mode, 17.17 + unsigned char type) 17.18 +{ 17.19 + vbd_t *vbd; 17.20 + rb_node_t **rb_p, *rb_parent = NULL; 17.21 + long ret = 0; 17.22 + 17.23 + spin_lock(&p->vbd_lock); 17.24 + 17.25 + rb_p = &p->vbd_rb.rb_node; 17.26 + while ( *rb_p != NULL ) 17.27 + { 17.28 + rb_parent = *rb_p; 17.29 + vbd = rb_entry(rb_parent, vbd_t, rb); 17.30 + if ( vdevice < vbd->vdevice ) 17.31 + { 17.32 + rb_p = &rb_parent->rb_left; 17.33 + } 17.34 + else if ( vdevice > vbd->vdevice ) 17.35 + { 17.36 + rb_p = &rb_parent->rb_right; 17.37 + } 17.38 + else 17.39 + { 17.40 + DPRINTK("vbd_create attempted for already existing vbd\n"); 17.41 + ret = -EINVAL; 17.42 + goto out; 17.43 + } 17.44 + } 17.45 + 17.46 + if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_KERNEL)) == NULL) ) 17.47 + { 17.48 + DPRINTK("vbd_create: out of memory\n"); 17.49 + ret = -ENOMEM; 17.50 + goto out; 17.51 + } 17.52 + 17.53 + vbd->vdevice = vdevice; 17.54 + vbd->mode = mode; 17.55 + vbd->type = type; 17.56 + vbd->extents = NULL; 17.57 + 17.58 + rb_link_node(&vbd->rb, rb_parent, rb_p); 17.59 + rb_insert_color(&vbd->rb, &p->vbd_rb); 17.60 + 17.61 + out: 17.62 + spin_unlock(&p->vbd_lock); 17.63 + return ret; 17.64 +} 17.65 + 17.66 + 17.67 +long vbd_create(vbd_create_t *create) 17.68 +{ 17.69 + struct task_struct *p; 17.70 + long rc; 17.71 + 17.72 + if ( unlikely(!IS_PRIV(current)) ) 17.73 + return -EPERM; 17.74 + 17.75 + if ( unlikely((p = find_domain_by_id(create->domain)) == NULL) ) 17.76 + { 17.77 + DPRINTK("vbd_create attempted for non-existent domain %llu\n", 17.78 + create->domain); 17.79 + return -EINVAL; 17.80 + } 17.81 + 17.82 + rc = __vbd_create(p, create->vdevice, create->mode, 17.83 + XD_TYPE_DISK | XD_FLAG_VIRT); 17.84 + 17.85 + put_task_struct(p); 17.86 + 17.87 + return rc; 17.88 +} 17.89 + 17.90 + 17.91 +long __vbd_grow(struct task_struct *p, 17.92 + unsigned short vdevice, 17.93 + xen_extent_t *extent) 17.94 +{ 17.95 + xen_extent_le_t **px, *x; 17.96 + vbd_t *vbd = NULL; 17.97 + rb_node_t *rb; 17.98 + long ret = 0; 17.99 + 17.100 + spin_lock(&p->vbd_lock); 17.101 + 17.102 + rb = p->vbd_rb.rb_node; 17.103 + while ( rb != NULL ) 17.104 + { 17.105 + vbd = rb_entry(rb, vbd_t, rb); 17.106 + if ( vdevice < vbd->vdevice ) 17.107 + rb = rb->rb_left; 17.108 + else if ( vdevice > vbd->vdevice ) 17.109 + rb = rb->rb_right; 17.110 + else 17.111 + break; 17.112 + } 17.113 + 17.114 + if ( unlikely(vbd == NULL) || unlikely(vbd->vdevice != vdevice) ) 17.115 + { 17.116 + DPRINTK("vbd_grow: attempted to append extent to non-existent VBD.\n"); 17.117 + ret = -EINVAL; 17.118 + goto out; 17.119 + } 17.120 + 17.121 + if ( unlikely((x = kmalloc(sizeof(xen_extent_le_t), GFP_KERNEL)) == NULL) ) 17.122 + { 17.123 + DPRINTK("vbd_grow: out of memory\n"); 17.124 + ret = -ENOMEM; 17.125 + goto out; 17.126 + } 17.127 + 17.128 + x->extent.device = extent->device; 17.129 + x->extent.start_sector = extent->start_sector; 17.130 + x->extent.nr_sectors = extent->nr_sectors; 
17.131 + x->next = (xen_extent_le_t *)NULL; 17.132 + 17.133 + for ( px = &vbd->extents; *px != NULL; px = &(*px)->next ) 17.134 + continue; 17.135 + 17.136 + *px = x; 17.137 + 17.138 + out: 17.139 + spin_unlock(&p->vbd_lock); 17.140 + return ret; 17.141 +} 17.142 + 17.143 + 17.144 +/* Grow a VBD by appending a new extent. Fails if the VBD doesn't exist. */ 17.145 +long vbd_grow(vbd_grow_t *grow) 17.146 +{ 17.147 + struct task_struct *p; 17.148 + long rc; 17.149 + 17.150 + if ( unlikely(!IS_PRIV(current)) ) 17.151 + return -EPERM; 17.152 + 17.153 + if ( unlikely((p = find_domain_by_id(grow->domain)) == NULL) ) 17.154 + { 17.155 + DPRINTK("vbd_grow: attempted for non-existent domain %llu\n", 17.156 + grow->domain); 17.157 + return -EINVAL; 17.158 + } 17.159 + 17.160 + rc = __vbd_grow(p, grow->vdevice, &grow->extent); 17.161 + 17.162 + put_task_struct(p); 17.163 + 17.164 + return rc; 17.165 +} 17.166 + 17.167 + 17.168 +long vbd_shrink(vbd_shrink_t *shrink) 17.169 +{ 17.170 + struct task_struct *p; 17.171 + xen_extent_le_t **px, *x; 17.172 + vbd_t *vbd = NULL; 17.173 + rb_node_t *rb; 17.174 + long ret = 0; 17.175 + 17.176 + if ( !IS_PRIV(current) ) 17.177 + return -EPERM; 17.178 + 17.179 + if ( (p = find_domain_by_id(shrink->domain)) == NULL ) 17.180 + { 17.181 + DPRINTK("vbd_shrink attempted for non-existent domain %llu\n", 17.182 + shrink->domain); 17.183 + return -EINVAL; 17.184 + } 17.185 + 17.186 + spin_lock(&p->vbd_lock); 17.187 + 17.188 + rb = p->vbd_rb.rb_node; 17.189 + while ( rb != NULL ) 17.190 + { 17.191 + vbd = rb_entry(rb, vbd_t, rb); 17.192 + if ( shrink->vdevice < vbd->vdevice ) 17.193 + rb = rb->rb_left; 17.194 + else if ( shrink->vdevice > vbd->vdevice ) 17.195 + rb = rb->rb_right; 17.196 + else 17.197 + break; 17.198 + } 17.199 + 17.200 + if ( unlikely(vbd == NULL) || 17.201 + unlikely(vbd->vdevice != shrink->vdevice) || 17.202 + unlikely(vbd->extents == NULL) ) 17.203 + { 17.204 + DPRINTK("vbd_shrink: attempt to remove non-existent extent.\n"); 17.205 + ret = -EINVAL; 17.206 + goto out; 17.207 + } 17.208 + 17.209 + /* Find the last extent. We now know that there is at least one. 
*/ 17.210 + for ( px = &vbd->extents; (*px)->next != NULL; px = &(*px)->next ) 17.211 + continue; 17.212 + 17.213 + x = *px; 17.214 + *px = x->next; 17.215 + kfree(x); 17.216 + 17.217 + out: 17.218 + spin_unlock(&p->vbd_lock); 17.219 + put_task_struct(p); 17.220 + return ret; 17.221 +} 17.222 + 17.223 + 17.224 +long vbd_setextents(vbd_setextents_t *setextents) 17.225 +{ 17.226 + struct task_struct *p; 17.227 + xen_extent_t e; 17.228 + xen_extent_le_t *new_extents, *x, *t; 17.229 + vbd_t *vbd = NULL; 17.230 + rb_node_t *rb; 17.231 + int i; 17.232 + long ret = 0; 17.233 + 17.234 + if ( !IS_PRIV(current) ) 17.235 + return -EPERM; 17.236 + 17.237 + if ( (p = find_domain_by_id(setextents->domain)) == NULL ) 17.238 + { 17.239 + DPRINTK("vbd_setextents attempted for non-existent domain %llu\n", 17.240 + setextents->domain); 17.241 + return -EINVAL; 17.242 + } 17.243 + 17.244 + spin_lock(&p->vbd_lock); 17.245 + 17.246 + rb = p->vbd_rb.rb_node; 17.247 + while ( rb != NULL ) 17.248 + { 17.249 + vbd = rb_entry(rb, vbd_t, rb); 17.250 + if ( setextents->vdevice < vbd->vdevice ) 17.251 + rb = rb->rb_left; 17.252 + else if ( setextents->vdevice > vbd->vdevice ) 17.253 + rb = rb->rb_right; 17.254 + else 17.255 + break; 17.256 + } 17.257 + 17.258 + if ( unlikely(vbd == NULL) || 17.259 + unlikely(vbd->vdevice != setextents->vdevice) ) 17.260 + { 17.261 + DPRINTK("vbd_setextents: attempt to modify non-existent VBD.\n"); 17.262 + ret = -EINVAL; 17.263 + goto out; 17.264 + } 17.265 + 17.266 + /* Construct the new extent list. */ 17.267 + new_extents = NULL; 17.268 + for ( i = setextents->nr_extents - 1; i >= 0; i-- ) 17.269 + { 17.270 + if ( unlikely(copy_from_user(&e, 17.271 + &setextents->extents[i], 17.272 + sizeof(e)) != 0) ) 17.273 + { 17.274 + DPRINTK("vbd_setextents: copy_from_user failed\n"); 17.275 + ret = -EFAULT; 17.276 + goto free_and_out; 17.277 + } 17.278 + 17.279 + if ( unlikely((x = kmalloc(sizeof(xen_extent_le_t), GFP_KERNEL)) 17.280 + == NULL) ) 17.281 + { 17.282 + DPRINTK("vbd_setextents: out of memory\n"); 17.283 + ret = -ENOMEM; 17.284 + goto free_and_out; 17.285 + } 17.286 + 17.287 + x->extent = e; 17.288 + x->next = new_extents; 17.289 + 17.290 + new_extents = x; 17.291 + } 17.292 + 17.293 + /* Delete the old extent list _after_ successfully creating the new. */ 17.294 + for ( x = vbd->extents; x != NULL; x = t ) 17.295 + { 17.296 + t = x->next; 17.297 + kfree(x); 17.298 + } 17.299 + 17.300 + /* Make the new list visible. */ 17.301 + vbd->extents = new_extents; 17.302 + 17.303 + out: 17.304 + spin_unlock(&p->vbd_lock); 17.305 + put_task_struct(p); 17.306 + return ret; 17.307 + 17.308 + free_and_out: 17.309 + /* Failed part-way through the new list. Delete all that we managed. 
*/ 17.310 + for ( x = new_extents; x != NULL; x = t ) 17.311 + { 17.312 + t = x->next; 17.313 + kfree(x); 17.314 + } 17.315 + goto out; 17.316 +} 17.317 + 17.318 + 17.319 +long vbd_delete(vbd_delete_t *delete) 17.320 +{ 17.321 + struct task_struct *p; 17.322 + vbd_t *vbd; 17.323 + rb_node_t *rb; 17.324 + xen_extent_le_t *x, *t; 17.325 + 17.326 + if( !IS_PRIV(current) ) 17.327 + return -EPERM; 17.328 + 17.329 + if ( (p = find_domain_by_id(delete->domain)) == NULL ) 17.330 + { 17.331 + DPRINTK("vbd_delete attempted for non-existent domain %llu\n", 17.332 + delete->domain); 17.333 + return -EINVAL; 17.334 + } 17.335 + 17.336 + spin_lock(&p->vbd_lock); 17.337 + 17.338 + rb = p->vbd_rb.rb_node; 17.339 + while ( rb != NULL ) 17.340 + { 17.341 + vbd = rb_entry(rb, vbd_t, rb); 17.342 + if ( delete->vdevice < vbd->vdevice ) 17.343 + rb = rb->rb_left; 17.344 + else if ( delete->vdevice > vbd->vdevice ) 17.345 + rb = rb->rb_right; 17.346 + else 17.347 + goto found; 17.348 + } 17.349 + 17.350 + DPRINTK("vbd_delete attempted for non-existing VBD.\n"); 17.351 + 17.352 + spin_unlock(&p->vbd_lock); 17.353 + put_task_struct(p); 17.354 + return -EINVAL; 17.355 + 17.356 + found: 17.357 + rb_erase(rb, &p->vbd_rb); 17.358 + x = vbd->extents; 17.359 + kfree(vbd); 17.360 + 17.361 + while ( x != NULL ) 17.362 + { 17.363 + t = x->next; 17.364 + kfree(x); 17.365 + x = t; 17.366 + } 17.367 + 17.368 + spin_unlock(&p->vbd_lock); 17.369 + put_task_struct(p); 17.370 + return 0; 17.371 +} 17.372 + 17.373 + 17.374 +void destroy_all_vbds(struct task_struct *p) 17.375 +{ 17.376 + vbd_t *vbd; 17.377 + rb_node_t *rb; 17.378 + xen_extent_le_t *x, *t; 17.379 + 17.380 + spin_lock(&p->vbd_lock); 17.381 + 17.382 + while ( (rb = p->vbd_rb.rb_node) != NULL ) 17.383 + { 17.384 + vbd = rb_entry(rb, vbd_t, rb); 17.385 + 17.386 + rb_erase(rb, &p->vbd_rb); 17.387 + x = vbd->extents; 17.388 + kfree(vbd); 17.389 + 17.390 + while ( x != NULL ) 17.391 + { 17.392 + t = x->next; 17.393 + kfree(x); 17.394 + x = t; 17.395 + } 17.396 + } 17.397 + 17.398 + spin_unlock(&p->vbd_lock); 17.399 +} 17.400 + 17.401 + 17.402 +static int vbd_probe_single(xen_disk_info_t *xdi, 17.403 + vbd_t *vbd, 17.404 + struct task_struct *p) 17.405 +{ 17.406 + xen_extent_le_t *x; 17.407 + xen_disk_t cur_disk; 17.408 + 17.409 + if ( xdi->count == xdi->max ) 17.410 + { 17.411 + DPRINTK("vbd_probe_devices: out of space for probe.\n"); 17.412 + return -ENOMEM; 17.413 + } 17.414 + 17.415 + cur_disk.device = vbd->vdevice; 17.416 + cur_disk.info = vbd->type; 17.417 + if ( !VBD_CAN_WRITE(vbd) ) 17.418 + cur_disk.info |= XD_FLAG_RO; 17.419 + cur_disk.capacity = 0ULL; 17.420 + for ( x = vbd->extents; x != NULL; x = x->next ) 17.421 + cur_disk.capacity += x->extent.nr_sectors; 17.422 + cur_disk.domain = p->domain; 17.423 + 17.424 + /* Now copy into relevant part of user-space buffer */ 17.425 + if( copy_to_user(&xdi->disks[xdi->count], 17.426 + &cur_disk, 17.427 + sizeof(xen_disk_t)) ) 17.428 + { 17.429 + DPRINTK("vbd_probe_devices: copy_to_user failed\n"); 17.430 + return -EFAULT; 17.431 + } 17.432 + 17.433 + xdi->count++; 17.434 + 17.435 + return 0; 17.436 +} 17.437 + 17.438 + 17.439 +static int vbd_probe_devices(xen_disk_info_t *xdi, struct task_struct *p) 17.440 +{ 17.441 + int rc = 0; 17.442 + rb_node_t *rb; 17.443 + 17.444 + spin_lock(&p->vbd_lock); 17.445 + 17.446 + if ( (rb = p->vbd_rb.rb_node) == NULL ) 17.447 + goto out; 17.448 + 17.449 + new_subtree: 17.450 + /* STEP 1. Find least node (it'll be left-most). 
*/ 17.451 +    while ( rb->rb_left != NULL ) 17.452 +        rb = rb->rb_left; 17.453 + 17.454 +    for ( ; ; ) 17.455 +    { 17.456 +        /* STEP 2. Dealt with left subtree. Now process current node. */ 17.457 +        if ( (rc = vbd_probe_single(xdi, rb_entry(rb, vbd_t, rb), p)) != 0 ) 17.458 +            goto out; 17.459 + 17.460 +        /* STEP 3. Process right subtree, if any. */ 17.461 +        if ( rb->rb_right != NULL ) 17.462 +        { 17.463 +            rb = rb->rb_right; 17.464 +            goto new_subtree; 17.465 +        } 17.466 + 17.467 +        /* STEP 4. Done both subtrees. Head back through ancestors. */ 17.468 +        for ( ; ; ) 17.469 +        { 17.470 +            /* We're done when we get back to the root node. */ 17.471 +            if ( rb->rb_parent == NULL ) 17.472 +                goto out; 17.473 +            /* If we are left of parent, then parent is next to process. */ 17.474 +            if ( rb->rb_parent->rb_left == rb ) 17.475 +                break; 17.476 +            /* If we are right of parent, then we climb to grandparent. */ 17.477 +            rb = rb->rb_parent; 17.478 +        } 17.479 + 17.480 +        rb = rb->rb_parent; 17.481 +    } 17.482 + 17.483 + out: 17.484 +    spin_unlock(&p->vbd_lock); 17.485 +    return rc; 17.486 +} 17.487 + 17.488 + 17.489 +/* 17.490 + * Return information about the VBDs available for a given domain, or for all 17.491 + * domains; in the general case the 'domain' argument will be 0 which means 17.492 + * "information about the caller"; otherwise the 'domain' argument will 17.493 + * specify either a given domain, or all domains ("VBD_PROBE_ALL") -- both of 17.494 + * these cases require the caller to be privileged. 17.495 + */ 17.496 +long vbd_probe(vbd_probe_t *probe) 17.497 +{ 17.498 +    struct task_struct *p = NULL; 17.499 +    unsigned long flags; 17.500 +    long ret = 0; 17.501 + 17.502 +    if ( probe->domain != 0 ) 17.503 +    { 17.504 +        /* We can only probe for ourselves (unless we're privileged). */ 17.505 +        if( (probe->domain != current->domain) && !IS_PRIV(current) ) 17.506 +            return -EPERM; 17.507 + 17.508 +        if ( (probe->domain != VBD_PROBE_ALL) && 17.509 +             ((p = find_domain_by_id(probe->domain)) == NULL) ) 17.510 +        { 17.511 +            DPRINTK("vbd_probe attempted for non-existent domain %llu\n", 17.512 +                    probe->domain); 17.513 +            return -EINVAL; 17.514 +        } 17.515 +    } 17.516 +    else 17.517 +    { 17.518 +        /* Default is to probe for ourselves. 
*/ 17.519 + p = current; 17.520 + get_task_struct(p); /* to mirror final put_task_struct */ 17.521 + } 17.522 + 17.523 + if ( probe->domain == VBD_PROBE_ALL ) 17.524 + { 17.525 + read_lock_irqsave(&tasklist_lock, flags); 17.526 + for_each_domain ( p ) 17.527 + { 17.528 + if ( (ret = vbd_probe_devices(&probe->xdi, p)) != 0 ) 17.529 + { 17.530 + read_unlock_irqrestore(&tasklist_lock, flags); 17.531 + goto out; 17.532 + } 17.533 + } 17.534 + read_unlock_irqrestore(&tasklist_lock, flags); 17.535 + } 17.536 + else if ( (ret = vbd_probe_devices(&probe->xdi, p)) != 0 ) 17.537 + goto out; 17.538 + 17.539 + out: 17.540 + if ( ret != 0 ) 17.541 + DPRINTK("vbd_probe: err %ld in probing virtual devices\n", ret); 17.542 + if ( p != NULL ) 17.543 + put_task_struct(p); 17.544 + return ret; 17.545 +} 17.546 + 17.547 + 17.548 +long vbd_info(vbd_info_t *info) 17.549 +{ 17.550 + struct task_struct *p; 17.551 + xen_extent_le_t *x; 17.552 + xen_extent_t *extents; 17.553 + vbd_t *vbd = NULL; 17.554 + rb_node_t *rb; 17.555 + long ret = 0; 17.556 + 17.557 + if ( (info->domain != current->domain) && !IS_PRIV(current) ) 17.558 + return -EPERM; 17.559 + 17.560 + if ( (p = find_domain_by_id(info->domain)) == NULL ) 17.561 + { 17.562 + DPRINTK("vbd_info attempted for non-existent domain %llu\n", 17.563 + info->domain); 17.564 + return -EINVAL; 17.565 + } 17.566 + 17.567 + spin_lock(&p->vbd_lock); 17.568 + 17.569 + rb = p->vbd_rb.rb_node; 17.570 + while ( rb != NULL ) 17.571 + { 17.572 + vbd = rb_entry(rb, vbd_t, rb); 17.573 + if ( info->vdevice < vbd->vdevice ) 17.574 + rb = rb->rb_left; 17.575 + else if ( info->vdevice > vbd->vdevice ) 17.576 + rb = rb->rb_right; 17.577 + else 17.578 + break; 17.579 + } 17.580 + 17.581 + if ( unlikely(vbd == NULL) || unlikely(vbd->vdevice != info->vdevice) ) 17.582 + { 17.583 + DPRINTK("vbd_info attempted on non-existent VBD.\n"); 17.584 + ret = -EINVAL; 17.585 + goto out; 17.586 + } 17.587 + 17.588 + info->mode = vbd->mode; 17.589 + info->nextents = 0; 17.590 + 17.591 + extents = info->extents; 17.592 + for ( x = vbd->extents; x != NULL; x = x->next ) 17.593 + { 17.594 + if ( info->nextents == info->maxextents ) 17.595 + break; 17.596 + if ( copy_to_user(extents, &x->extent, sizeof(xen_extent_t)) ) 17.597 + { 17.598 + DPRINTK("vbd_info: copy_to_user failed\n"); 17.599 + ret = -EFAULT; 17.600 + goto out; 17.601 + } 17.602 + extents++; 17.603 + info->nextents++; 17.604 + } 17.605 + 17.606 + out: 17.607 + spin_unlock(&p->vbd_lock); 17.608 + put_task_struct(p); 17.609 + return ret; 17.610 +} 17.611 + 17.612 + 17.613 +int vbd_translate(phys_seg_t *pseg, struct task_struct *p, int operation) 17.614 +{ 17.615 + xen_extent_le_t *x; 17.616 + vbd_t *vbd; 17.617 + rb_node_t *rb; 17.618 + xen_sector_t sec_off; 17.619 + unsigned long nr_secs; 17.620 + 17.621 + spin_lock(&p->vbd_lock); 17.622 + 17.623 + rb = p->vbd_rb.rb_node; 17.624 + while ( rb != NULL ) 17.625 + { 17.626 + vbd = rb_entry(rb, vbd_t, rb); 17.627 + if ( pseg->dev < vbd->vdevice ) 17.628 + rb = rb->rb_left; 17.629 + else if ( pseg->dev > vbd->vdevice ) 17.630 + rb = rb->rb_right; 17.631 + else 17.632 + goto found; 17.633 + } 17.634 + 17.635 + DPRINTK("vbd_translate; domain %llu attempted to access " 17.636 + "non-existent VBD.\n", p->domain); 17.637 + 17.638 + spin_unlock(&p->vbd_lock); 17.639 + return -ENODEV; 17.640 + 17.641 + found: 17.642 + 17.643 + if ( ((operation == READ) && !VBD_CAN_READ(vbd)) || 17.644 + ((operation == WRITE) && !VBD_CAN_WRITE(vbd)) ) 17.645 + { 17.646 + spin_unlock(&p->vbd_lock); 17.647 + return -EACCES; 
17.648 + } 17.649 + 17.650 + /* 17.651 + * Now iterate through the list of xen_extents, working out which should 17.652 + * be used to perform the translation. 17.653 + */ 17.654 + sec_off = pseg->sector_number; 17.655 + nr_secs = pseg->nr_sects; 17.656 + for ( x = vbd->extents; x != NULL; x = x->next ) 17.657 + { 17.658 + if ( sec_off < x->extent.nr_sectors ) 17.659 + { 17.660 + pseg->dev = x->extent.device; 17.661 + pseg->sector_number = x->extent.start_sector + sec_off; 17.662 + if ( unlikely((sec_off + nr_secs) > x->extent.nr_sectors) ) 17.663 + goto overrun; 17.664 + spin_unlock(&p->vbd_lock); 17.665 + return 1; 17.666 + } 17.667 + sec_off -= x->extent.nr_sectors; 17.668 + } 17.669 + 17.670 + DPRINTK("vbd_translate: end of vbd.\n"); 17.671 + spin_unlock(&p->vbd_lock); 17.672 + return -EACCES; 17.673 + 17.674 + /* 17.675 + * Here we deal with overrun onto the following extent. We don't deal with 17.676 + * overrun of more than one boundary since each request is restricted to 17.677 + * 2^9 512-byte sectors, so it should be trivial for control software to 17.678 + * ensure that extents are large enough to prevent excessive overrun. 17.679 + */ 17.680 + overrun: 17.681 + 17.682 + /* Adjust length of first chunk to run to end of first extent. */ 17.683 + pseg[0].nr_sects = x->extent.nr_sectors - sec_off; 17.684 + 17.685 + /* Set second chunk buffer and length to start where first chunk ended. */ 17.686 + pseg[1].buffer = pseg[0].buffer + (pseg[0].nr_sects << 9); 17.687 + pseg[1].nr_sects = nr_secs - pseg[0].nr_sects; 17.688 + 17.689 + /* Now move to the next extent. Check it exists and is long enough! */ 17.690 + if ( unlikely((x = x->next) == NULL) || 17.691 + unlikely(x->extent.nr_sectors < pseg[1].nr_sects) ) 17.692 + { 17.693 + DPRINTK("vbd_translate: multiple overruns or end of vbd.\n"); 17.694 + spin_unlock(&p->vbd_lock); 17.695 + return -EACCES; 17.696 + } 17.697 + 17.698 + /* Store the real device and start sector for the second chunk. */ 17.699 + pseg[1].dev = x->extent.device; 17.700 + pseg[1].sector_number = x->extent.start_sector; 17.701 + 17.702 + spin_unlock(&p->vbd_lock); 17.703 + return 2; 17.704 +}
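The vbd_translate() routine above is the core of the new backend's address translation: a VBD is an ordered list of raw-device extents, and a virtual sector offset is resolved by walking that list, with a request split into two physical segments when it runs off the end of one extent into the next. The stand-alone sketch below models only that arithmetic (plain user-space C with hypothetical stand-in types; the locking, rb-tree lookup and read/write permission checks of the real code are omitted):

#include <stdio.h>
#include <stdint.h>

/* Stand-ins for xen_extent_t / phys_seg_t; field names mirror the patch. */
struct extent { uint16_t device; uint64_t start_sector, nr_sectors; struct extent *next; };
struct seg    { uint16_t dev; uint64_t sector_number, nr_sects; };

/* Returns the number of physical segments produced (1 or 2), or -1. */
static int translate(struct extent *e, uint64_t sector, uint64_t nr, struct seg seg[2])
{
    for ( ; e != NULL; e = e->next )
    {
        if ( sector < e->nr_sectors )
        {
            seg[0].dev           = e->device;
            seg[0].sector_number = e->start_sector + sector;
            if ( (sector + nr) <= e->nr_sectors )
            {
                seg[0].nr_sects = nr;
                return 1;
            }
            /* Overrun: first chunk runs to the end of this extent, second
             * chunk starts at the beginning of the next one (if it fits). */
            seg[0].nr_sects = e->nr_sectors - sector;
            if ( ((e = e->next) == NULL) ||
                 (e->nr_sectors < (nr - seg[0].nr_sects)) )
                return -1;                /* multiple overruns / end of vbd */
            seg[1].dev           = e->device;
            seg[1].sector_number = e->start_sector;
            seg[1].nr_sects      = nr - seg[0].nr_sects;
            return 2;
        }
        sector -= e->nr_sectors;
    }
    return -1;                            /* offset is past the end of the vbd */
}

int main(void)
{
    struct extent e2 = { 0x341, 2048, 1000, NULL };  /* second extent */
    struct extent e1 = { 0x340,    0, 1000, &e2  };  /* first extent  */
    struct seg s[2];
    int i, n = translate(&e1, 995, 10, s);           /* straddles e1 -> e2 */
    for ( i = 0; i < n; i++ )
        printf("seg %d: dev %#x, start %llu, len %llu\n", i, s[i].dev,
               (unsigned long long)s[i].sector_number,
               (unsigned long long)s[i].nr_sects);
    return 0;
}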
18.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 18.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/vnetif/backend/main.c Thu Apr 22 13:56:30 2004 +0000 18.3 @@ -0,0 +1,26 @@ 18.4 +/****************************************************************************** 18.5 + * arch/xen/drivers/vnetif/backend/main.c 18.6 + * 18.7 + * Back-end of the driver for virtual network devices. This portion of the 18.8 + * driver exports a 'unified' network-device interface that can be accessed 18.9 + * by any operating system that implements a compatible front end. A 18.10 + * reference front-end implementation can be found in: 18.11 + * arch/xen/drivers/vnetif/frontend 18.12 + * 18.13 + * Copyright (c) 2004, K A Fraser 18.14 + */ 18.15 + 18.16 +#include <linux/config.h> 18.17 +#include <linux/module.h> 18.18 + 18.19 +static int __init init_module(void) 18.20 +{ 18.21 +    return 0; 18.22 +} 18.23 + 18.24 +static void cleanup_module(void) 18.25 +{ 18.26 +} 18.27 + 18.28 +module_init(init_module); 18.29 +module_exit(cleanup_module);
19.1 --- a/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c Wed Apr 21 10:43:06 2004 +0000 19.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c Thu Apr 22 13:56:30 2004 +0000 19.3 @@ -15,8 +15,7 @@ 19.4 #include <linux/irq.h> 19.5 #include <linux/interrupt.h> 19.6 #include <asm/ctrl_if.h> 19.7 -#include <asm/hypervisor.h> 19.8 -#include <asm/hypervisor-ifs/event_channel.h> 19.9 +#include <asm/evtchn.h> 19.10 19.11 static int ctrl_if_evtchn; 19.12 static int ctrl_if_irq; 19.13 @@ -50,10 +49,7 @@ static DECLARE_TASKLET(ctrl_if_rx_taskle 19.14 19.15 static void ctrl_if_notify_controller(void) 19.16 { 19.17 - evtchn_op_t evtchn_op; 19.18 - evtchn_op.cmd = EVTCHNOP_send; 19.19 - evtchn_op.u.send.local_port = ctrl_if_evtchn; 19.20 - (void)HYPERVISOR_event_channel_op(&evtchn_op); 19.21 + notify_via_evtchn(ctrl_if_evtchn); 19.22 } 19.23 19.24 static void ctrl_if_rxmsg_default_handler(ctrl_msg_t *msg, unsigned long id)
20.1 --- a/xenolinux-2.4.26-sparse/arch/xen/kernel/traps.c Wed Apr 21 10:43:06 2004 +0000 20.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/kernel/traps.c Thu Apr 22 13:56:30 2004 +0000 20.3 @@ -321,7 +321,11 @@ asmlinkage void do_general_protection(st 20.4 u.ptr = MMU_EXTENDED_COMMAND; 20.5 u.ptr |= (unsigned long)&default_ldt[0]; 20.6 u.val = MMUEXT_SET_LDT | (5 << MMUEXT_CMD_SHIFT); 20.7 - HYPERVISOR_mmu_update(&u, 1); 20.8 + if ( unlikely(HYPERVISOR_mmu_update(&u, 1) < 0) ) 20.9 + { 20.10 + show_trace(NULL); 20.11 + panic("Failed to install default LDT"); 20.12 + } 20.13 return; 20.14 } 20.15 }
21.1 --- a/xenolinux-2.4.26-sparse/arch/xen/mm/hypervisor.c Wed Apr 21 10:43:06 2004 +0000 21.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/mm/hypervisor.c Thu Apr 22 13:56:30 2004 +0000 21.3 @@ -116,7 +116,8 @@ static inline void __flush_page_update_q 21.4 #endif 21.5 idx = 0; 21.6 wmb(); /* Make sure index is cleared first to avoid double updates. */ 21.7 - HYPERVISOR_mmu_update(update_queue, _idx); 21.8 + if ( unlikely(HYPERVISOR_mmu_update(update_queue, _idx) < 0) ) 21.9 + panic("Failed to execute MMU updates"); 21.10 } 21.11 21.12 void _flush_page_update_queue(void) 21.13 @@ -182,8 +183,8 @@ void queue_invlpg(unsigned long ptr) 21.14 unsigned long flags; 21.15 spin_lock_irqsave(&update_lock, flags); 21.16 update_queue[idx].ptr = MMU_EXTENDED_COMMAND; 21.17 - update_queue[idx].val = ptr & PAGE_MASK; 21.18 - update_queue[idx].val |= MMUEXT_INVLPG; 21.19 + update_queue[idx].ptr |= ptr & PAGE_MASK; 21.20 + update_queue[idx].val = MMUEXT_INVLPG; 21.21 increment_index(); 21.22 spin_unlock_irqrestore(&update_lock, flags); 21.23 }
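The queue_invlpg() fix above is one instance of the revised mmu_update encoding this changeset adopts throughout (see also the traps.c and ioremap.c hunks): for extended commands the target address now travels in the request's 'ptr' field, OR'd with the MMU_EXTENDED_COMMAND tag, while 'val' carries only the sub-command plus any small argument above MMUEXT_CMD_SHIFT. A sketch of the three request shapes as this patch uses them (not new code, just the convention spelled out; 'pte', 'new_pte_val', 'addr' and 'nr_entries' are placeholders, and the relevant kernel and hypervisor-ifs declarations are assumed to be in scope):

static void mmu_update_examples(pte_t *pte, unsigned long new_pte_val,
                                unsigned long addr, int nr_entries)
{
    mmu_update_t req[3];

    /* 1. Ordinary PTE write: ptr is the machine address of the PTE. */
    req[0].ptr = virt_to_machine(pte);
    req[0].val = new_pte_val;

    /* 2. Extended command carrying an address (cf. queue_invlpg above):
     *    the address shares 'ptr' with the MMU_EXTENDED_COMMAND tag. */
    req[1].ptr = MMU_EXTENDED_COMMAND | (addr & PAGE_MASK);
    req[1].val = MMUEXT_INVLPG;

    /* 3. Extended command with a small argument (cf. the traps.c hunk):
     *    the argument sits in 'val' above MMUEXT_CMD_SHIFT. */
    req[2].ptr = MMU_EXTENDED_COMMAND | (unsigned long)&default_ldt[0];
    req[2].val = MMUEXT_SET_LDT | (nr_entries << MMUEXT_CMD_SHIFT);

    /* The hypercall wrapper no longer panics on failure (see the
     * hypervisor.h hunk below), so callers check the return value. */
    if ( HYPERVISOR_mmu_update(req, 3) < 0 )
        panic("mmu_update failed");
}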
22.1 --- a/xenolinux-2.4.26-sparse/arch/xen/mm/ioremap.c Wed Apr 21 10:43:06 2004 +0000 22.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/mm/ioremap.c Thu Apr 22 13:56:30 2004 +0000 22.3 @@ -31,10 +31,28 @@ static inline void direct_remap_area_pte 22.4 unsigned long address, 22.5 unsigned long size, 22.6 unsigned long machine_addr, 22.7 - pgprot_t prot) 22.8 + pgprot_t prot, 22.9 + domid_t domid) 22.10 { 22.11 unsigned long end; 22.12 22.13 + mmu_update_t *u, *v; 22.14 + u = v = vmalloc(3*PAGE_SIZE); /* plenty */ 22.15 + 22.16 + /* If not I/O mapping then specify General-Purpose Subject Domain (GPS). */ 22.17 + if ( domid != 0 ) 22.18 + { 22.19 + v[0].val = (unsigned long)(domid<<16) & ~0xFFFFUL; 22.20 + v[0].ptr = (unsigned long)(domid<< 0) & ~0xFFFFUL; 22.21 + v[1].val = (unsigned long)(domid>>16) & ~0xFFFFUL; 22.22 + v[1].ptr = (unsigned long)(domid>>32) & ~0xFFFFUL; 22.23 + v[0].ptr |= MMU_EXTENDED_COMMAND; 22.24 + v[0].val |= MMUEXT_SET_SUBJECTDOM_L; 22.25 + v[1].ptr |= MMU_EXTENDED_COMMAND; 22.26 + v[1].val |= MMUEXT_SET_SUBJECTDOM_H; 22.27 + v += 2; 22.28 + } 22.29 + 22.30 address &= ~PMD_MASK; 22.31 end = address + size; 22.32 if (end > PMD_SIZE) 22.33 @@ -46,11 +64,18 @@ static inline void direct_remap_area_pte 22.34 printk("direct_remap_area_pte: page already exists\n"); 22.35 BUG(); 22.36 } 22.37 - set_pte(pte, pte_mkio(direct_mk_pte_phys(machine_addr, prot))); 22.38 + v->ptr = virt_to_machine(pte); 22.39 + v->val = (machine_addr & PAGE_MASK) | pgprot_val(prot) | _PAGE_IO; 22.40 + v++; 22.41 address += PAGE_SIZE; 22.42 machine_addr += PAGE_SIZE; 22.43 pte++; 22.44 } while (address && (address < end)); 22.45 + 22.46 + if ( ((v-u) != 0) && (HYPERVISOR_mmu_update(u, v-u) < 0) ) 22.47 + printk(KERN_WARNING "Failed to ioremap %08lx->%08lx (%08lx)\n", 22.48 + end-size, end, machine_addr-size); 22.49 + vfree(u); 22.50 } 22.51 22.52 static inline int direct_remap_area_pmd(struct mm_struct *mm, 22.53 @@ -58,7 +83,8 @@ static inline int direct_remap_area_pmd( 22.54 unsigned long address, 22.55 unsigned long size, 22.56 unsigned long machine_addr, 22.57 - pgprot_t prot) 22.58 + pgprot_t prot, 22.59 + domid_t domid) 22.60 { 22.61 unsigned long end; 22.62 22.63 @@ -74,7 +100,7 @@ static inline int direct_remap_area_pmd( 22.64 if (!pte) 22.65 return -ENOMEM; 22.66 direct_remap_area_pte(pte, address, end - address, 22.67 - address + machine_addr, prot); 22.68 + address + machine_addr, prot, domid); 22.69 address = (address + PMD_SIZE) & PMD_MASK; 22.70 pmd++; 22.71 } while (address && (address < end)); 22.72 @@ -85,7 +111,8 @@ int direct_remap_area_pages(struct mm_st 22.73 unsigned long address, 22.74 unsigned long machine_addr, 22.75 unsigned long size, 22.76 - pgprot_t prot) 22.77 + pgprot_t prot, 22.78 + domid_t domid) 22.79 { 22.80 int error = 0; 22.81 pgd_t * dir; 22.82 @@ -103,7 +130,7 @@ int direct_remap_area_pages(struct mm_st 22.83 if (!pmd) 22.84 break; 22.85 error = direct_remap_area_pmd(mm, pmd, address, end - address, 22.86 - machine_addr + address, prot); 22.87 + machine_addr + address, prot, domid); 22.88 if (error) 22.89 break; 22.90 address = (address + PGDIR_SIZE) & PGDIR_MASK; 22.91 @@ -158,7 +185,7 @@ void * __ioremap(unsigned long machine_a 22.92 prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | 22.93 _PAGE_ACCESSED | flags); 22.94 if (direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(addr), 22.95 - machine_addr, size, prot)) { 22.96 + machine_addr, size, prot, 0)) { 22.97 vfree(addr); 22.98 return NULL; 22.99 }
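The domid plumbing added to direct_remap_area_pte() deserves a note: a 64-bit domid is handed to the hypervisor via two extended commands, MMUEXT_SET_SUBJECTDOM_L and MMUEXT_SET_SUBJECTDOM_H, with each 32-bit ptr/val word carrying one 16-bit slice of the domid in its upper half so the lower 16 bits stay free for the command encoding. The stand-alone check below models the guest-side packing exactly as the shifts in the patch perform it (assuming a 32-bit unsigned long and a 64-bit domid_t; how the hypervisor reassembles the value is not part of this changeset):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t domid = 0x1122334455667788ULL;     /* arbitrary test value */
    uint32_t ptr0, val0, ptr1, val1;

    val0 = (uint32_t)(domid << 16) & ~0xFFFFU;  /* domid bits 15..0  */
    ptr0 = (uint32_t)(domid <<  0) & ~0xFFFFU;  /* domid bits 31..16 */
    val1 = (uint32_t)(domid >> 16) & ~0xFFFFU;  /* domid bits 47..32 */
    ptr1 = (uint32_t)(domid >> 32) & ~0xFFFFU;  /* domid bits 63..48 */

    /* The low 16 bits of each word are later filled in with
     * MMU_EXTENDED_COMMAND and MMUEXT_SET_SUBJECTDOM_L/_H. */

    /* Reassemble from the upper halves and check we got the domid back. */
    uint64_t back = ((uint64_t)(ptr1 >> 16) << 48) |
                    ((uint64_t)(val1 >> 16) << 32) |
                    ((uint64_t)(ptr0 >> 16) << 16) |
                    ((uint64_t)(val0 >> 16) <<  0);
    assert(back == domid);
    printf("L: ptr=%08x val=%08x  H: ptr=%08x val=%08x\n", ptr0, val0, ptr1, val1);
    return 0;
}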
23.1 --- a/xenolinux-2.4.26-sparse/drivers/char/mem.c Wed Apr 21 10:43:06 2004 +0000 23.2 +++ b/xenolinux-2.4.26-sparse/drivers/char/mem.c Thu Apr 22 13:56:30 2004 +0000 23.3 @@ -197,24 +197,11 @@ static inline int noncached_address(unsi 23.4 #endif 23.5 } 23.6 23.7 +#if !defined(CONFIG_XEN) 23.8 static int mmap_mem(struct file * file, struct vm_area_struct * vma) 23.9 { 23.10 unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; 23.11 23.12 -#if defined(CONFIG_XEN) && defined(CONFIG_XEN_PRIVILEGED_GUEST) 23.13 - if (!(start_info.flags & SIF_PRIVILEGED)) 23.14 - return -ENXIO; 23.15 - 23.16 - /* DONTCOPY is essential for Xen as copy_page_range is broken. */ 23.17 - vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY; 23.18 - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 23.19 - if (direct_remap_area_pages(vma->vm_mm, vma->vm_start, offset, 23.20 - vma->vm_end-vma->vm_start, vma->vm_page_prot)) 23.21 - return -EAGAIN; 23.22 - return 0; 23.23 -#elif defined(CONFIG_XEN) 23.24 - return -ENXIO; 23.25 -#else 23.26 /* 23.27 * Accessing memory above the top the kernel knows about or 23.28 * through a file pointer that was marked O_SYNC will be 23.29 @@ -236,8 +223,50 @@ static int mmap_mem(struct file * file, 23.30 vma->vm_page_prot)) 23.31 return -EAGAIN; 23.32 return 0; 23.33 -#endif 23.34 +} 23.35 +#elif !defined(CONFIG_XEN_PRIVILEGED_GUEST) 23.36 +static int mmap_mem(struct file * file, struct vm_area_struct * vma) 23.37 +{ 23.38 + return -ENXIO; 23.39 } 23.40 +#else 23.41 +static int mmap_mem(struct file * file, struct vm_area_struct * vma) 23.42 +{ 23.43 + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; 23.44 + domid_t domid; 23.45 + 23.46 + if (!(start_info.flags & SIF_PRIVILEGED)) 23.47 + return -ENXIO; 23.48 + 23.49 + domid = file->private_data ? *(domid_t *)file->private_data : 0; 23.50 + 23.51 + /* DONTCOPY is essential for Xen as copy_page_range is broken. */ 23.52 + vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY; 23.53 + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 23.54 + if (direct_remap_area_pages(vma->vm_mm, vma->vm_start, offset, 23.55 + vma->vm_end-vma->vm_start, vma->vm_page_prot, 23.56 + domid)) 23.57 + return -EAGAIN; 23.58 + return 0; 23.59 +} 23.60 +static int ioctl_mem(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg) 23.61 +{ 23.62 + if (file->private_data == NULL) 23.63 + file->private_data = kmalloc(sizeof(domid_t), GFP_KERNEL); 23.64 + switch (cmd) { 23.65 + case _IO('M', 1): ((unsigned long *)file->private_data)[0]=arg; break; 23.66 + case _IO('M', 2): ((unsigned long *)file->private_data)[1]=arg; break; 23.67 + default: return -ENOSYS; 23.68 + } 23.69 + return 0; 23.70 +} 23.71 +static int release_mem(struct inode * inode, struct file * file) 23.72 +{ 23.73 + if (file->private_data != NULL) 23.74 + kfree(file->private_data); 23.75 + return 0; 23.76 +} 23.77 +#endif /* CONFIG_XEN */ 23.78 23.79 /* 23.80 * This function reads the *virtual* memory as seen by the kernel. 
23.81 @@ -426,10 +455,6 @@ static inline size_t read_zero_pagealign 23.82 goto out_up; 23.83 if (vma->vm_flags & VM_SHARED) 23.84 break; 23.85 -#if defined(CONFIG_XEN_PRIVILEGED_GUEST) 23.86 - if (vma->vm_flags & VM_IO) 23.87 - break; 23.88 -#endif 23.89 count = vma->vm_end - addr; 23.90 if (count > size) 23.91 count = size; 23.92 @@ -615,10 +640,6 @@ static int mmap_kmem(struct file * file, 23.93 unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; 23.94 unsigned long size = vma->vm_end - vma->vm_start; 23.95 23.96 -#if defined(CONFIG_XEN) 23.97 - return -ENXIO; 23.98 -#endif 23.99 - 23.100 /* 23.101 * If the user is not attempting to mmap a high memory address then 23.102 * the standard mmap_mem mechanism will work. High memory addresses 23.103 @@ -663,13 +684,19 @@ static struct file_operations mem_fops = 23.104 write: write_mem, 23.105 mmap: mmap_mem, 23.106 open: open_mem, 23.107 +#if defined(CONFIG_XEN_PRIVILEGED_GUEST) 23.108 + release: release_mem, 23.109 + ioctl: ioctl_mem, 23.110 +#endif 23.111 }; 23.112 23.113 static struct file_operations kmem_fops = { 23.114 llseek: memory_lseek, 23.115 read: read_kmem, 23.116 write: write_kmem, 23.117 +#if !defined(CONFIG_XEN) 23.118 mmap: mmap_kmem, 23.119 +#endif 23.120 open: open_kmem, 23.121 }; 23.122 23.123 @@ -715,12 +742,6 @@ static int memory_open(struct inode * in 23.124 break; 23.125 #if defined(CONFIG_ISA) || !defined(__mc68000__) 23.126 case 4: 23.127 -#if defined(CONFIG_XEN) 23.128 -#if defined(CONFIG_XEN_PRIVILEGED_GUEST) 23.129 - if (!(start_info.flags & SIF_PRIVILEGED)) 23.130 -#endif 23.131 - return -ENXIO; 23.132 -#endif 23.133 filp->f_op = &port_fops; 23.134 break; 23.135 #endif
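Taken together, the new ioctl_mem() and mmap_mem() let a privileged guest map another domain's memory through /dev/mem: store the target domid via the two _IO('M',...) ioctls (word 0 then word 1 of the 64-bit value, on a little-endian build), then mmap with the machine address as the file offset. A hypothetical user-space sketch under those assumptions (the ioctl numbers are the ones in the patch; the domid, frame number and word ordering are illustrative guesses, and error checking is trimmed):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    uint64_t      domid = 3;        /* hypothetical target domain */
    unsigned long mfn   = 0x12345;  /* hypothetical machine frame */
    char         *va;

    int fd = open("/dev/mem", O_RDWR);
    if ( fd < 0 ) { perror("open"); return 1; }

    /* Store the target domid in the file's private data: word 0 is the
     * low 32 bits, word 1 the high 32 bits (little-endian assumption). */
    ioctl(fd, _IO('M', 1), (unsigned long)(domid & 0xFFFFFFFFUL));
    ioctl(fd, _IO('M', 2), (unsigned long)(domid >> 32));

    /* Subsequent mmaps of this fd are remapped against that domain; the
     * file offset is interpreted as a machine address. */
    va = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
              fd, (off_t)mfn << 12);
    if ( va == MAP_FAILED ) { perror("mmap"); return 1; }

    printf("frame %#lx of domain %llu mapped at %p\n",
           mfn, (unsigned long long)domid, (void *)va);
    munmap(va, 4096);
    close(fd);
    return 0;
}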
24.1 --- a/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h Wed Apr 21 10:43:06 2004 +0000 24.2 +++ b/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h Thu Apr 22 13:56:30 2004 +0000 24.3 @@ -9,6 +9,7 @@ 24.4 #ifndef __ASM_XEN__CTRL_IF_H__ 24.5 #define __ASM_XEN__CTRL_IF_H__ 24.6 24.7 +#include <linux/tqueue.h> 24.8 #include <asm/hypervisor.h> 24.9 24.10 typedef control_msg_t ctrl_msg_t;
25.1 --- a/xenolinux-2.4.26-sparse/include/asm-xen/evtchn.h Wed Apr 21 10:43:06 2004 +0000 25.2 +++ b/xenolinux-2.4.26-sparse/include/asm-xen/evtchn.h Thu Apr 22 13:56:30 2004 +0000 25.3 @@ -14,6 +14,7 @@ 25.4 #include <asm/hypervisor.h> 25.5 #include <asm/ptrace.h> 25.6 #include <asm/synch_bitops.h> 25.7 +#include <asm/hypervisor-ifs/event_channel.h> 25.8 25.9 /* 25.10 * LOW-LEVEL DEFINITIONS 25.11 @@ -62,6 +63,14 @@ static inline void clear_evtchn_exceptio 25.12 synch_clear_bit(port, &s->evtchn_exception[0]); 25.13 } 25.14 25.15 +static inline void notify_via_evtchn(int port) 25.16 +{ 25.17 + evtchn_op_t op; 25.18 + op.cmd = EVTCHNOP_send; 25.19 + op.u.send.local_port = port; 25.20 + (void)HYPERVISOR_event_channel_op(&op); 25.21 +} 25.22 + 25.23 /* 25.24 * CHARACTER-DEVICE DEFINITIONS 25.25 */
26.1 --- a/xenolinux-2.4.26-sparse/include/asm-xen/hypervisor.h Wed Apr 21 10:43:06 2004 +0000 26.2 +++ b/xenolinux-2.4.26-sparse/include/asm-xen/hypervisor.h Thu Apr 22 13:56:30 2004 +0000 26.3 @@ -161,13 +161,6 @@ static inline int HYPERVISOR_mmu_update( 26.4 : "=a" (ret) : "0" (__HYPERVISOR_mmu_update), 26.5 "b" (req), "c" (count) : "memory" ); 26.6 26.7 - if ( unlikely(ret < 0) ) 26.8 - { 26.9 - extern void show_trace(unsigned long *); 26.10 - show_trace(NULL); 26.11 - panic("Failed mmu update: %p, %d", req, count); 26.12 - } 26.13 - 26.14 return ret; 26.15 } 26.16
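Dropping the built-in panic from HYPERVISOR_mmu_update() moves failure handling out to the call sites, which is what the traps.c and mm/hypervisor.c hunks earlier in this changeset add. The resulting caller pattern is simply the following (sketch only; 'req' and 'count' stand for whatever batch the caller has built, and whether a failure is fatal or merely worth logging is now the caller's choice, as the ioremap.c hunk shows):

if ( unlikely(HYPERVISOR_mmu_update(req, count) < 0) )
{
    /* The wrapper no longer panics on our behalf. */
    panic("Failed to execute MMU updates");
}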
27.1 --- a/xenolinux-2.4.26-sparse/include/asm-xen/pgalloc.h Wed Apr 21 10:43:06 2004 +0000 27.2 +++ b/xenolinux-2.4.26-sparse/include/asm-xen/pgalloc.h Thu Apr 22 13:56:30 2004 +0000 27.3 @@ -265,10 +265,15 @@ static inline void flush_tlb_pgtables(st 27.4 XEN_flush_page_update_queue(); 27.5 } 27.6 27.7 +/* 27.8 + * NB. The 'domid' field should be zero if mapping I/O space (non RAM). 27.9 + * Otherwise it identifies the owner of the memory that is being mapped. 27.10 + */ 27.11 extern int direct_remap_area_pages(struct mm_struct *mm, 27.12 unsigned long address, 27.13 unsigned long machine_addr, 27.14 unsigned long size, 27.15 - pgprot_t prot); 27.16 + pgprot_t prot, 27.17 + domid_t domid); 27.18 27.19 #endif /* _I386_PGALLOC_H */