ia64/xen-unstable

changeset 1412:7843b8fd800f

bitkeeper revision 1.917 (40acee3d26HD4ugLPjY-eN66o0rNIg)

Merge labyrinth.cl.cam.ac.uk:/auto/groups/xeno/BK/xeno.bk
into labyrinth.cl.cam.ac.uk:/auto/groups/xeno/users/iap10/xeno-clone/xeno.bk
author iap10@labyrinth.cl.cam.ac.uk
date Thu May 20 17:43:25 2004 +0000 (2004-05-20)
parents ba2029aefdd6 fa3cfd1bdd96
children 57530115a37e
files tools/examples/xc_dom_control.py tools/xc/lib/xc_domain.c tools/xc/lib/xc_linux_restore.c tools/xc/lib/xc_linux_save.c tools/xc/lib/xc_private.c tools/xc/lib/xc_private.h tools/xc/py/Xc.c tools/xentrace/Makefile tools/xentrace/formats tools/xentrace/xentrace.c tools/xentrace/xentrace_format xen/common/dom0_ops.c xen/common/domain.c xen/common/schedule.c xen/common/shadow.c xen/common/trace.c xen/include/hypervisor-ifs/dom0_ops.h xen/include/hypervisor-ifs/trace.h xen/include/xen/trace.h xenolinux-2.4.26-sparse/arch/xen/drivers/network/network.c
line diff
     1.1 --- a/tools/examples/xc_dom_control.py	Thu May 20 17:18:28 2004 +0000
     1.2 +++ b/tools/examples/xc_dom_control.py	Thu May 20 17:43:25 2004 +0000
     1.3 @@ -136,15 +136,7 @@ elif cmd == 'suspend':
     1.4          pid = int(fd.readline())
     1.5          os.kill(pid, signal.SIGTERM)
     1.6  
     1.7 -    xc.domain_stop( dom=dom )
     1.8 -    
     1.9 -    while not xc.domain_getinfo( first_dom=dom, max_doms=1 )[0]['stopped']:
    1.10 -	print "Sleep..."
    1.11 -	time.sleep(0.001);
    1.12 -
    1.13      rc = xc.linux_save( dom=dom, state_file=file, progress=1)
    1.14 -    if rc == 0 : xc.domain_destroy( dom=dom, force=1 )
    1.15 -    else: xc.domain_start( dom=dom )  # sensible for production use
    1.16  
    1.17  elif cmd == 'cpu_bvtslice':
    1.18      if len(sys.argv) < 3:
     2.1 --- a/tools/xc/lib/xc_domain.c	Thu May 20 17:18:28 2004 +0000
     2.2 +++ b/tools/xc/lib/xc_domain.c	Thu May 20 17:43:25 2004 +0000
     2.3 @@ -46,6 +46,7 @@ int xc_domain_stop(int xc_handle,
     2.4      dom0_op_t op;
     2.5      op.cmd = DOM0_STOPDOMAIN;
     2.6      op.u.stopdomain.domain = (domid_t)domid;
     2.7 +    op.u.stopdomain.sync = 0; // async
     2.8      return do_dom0_op(xc_handle, &op);
     2.9  }    
    2.10  
     3.1 --- a/tools/xc/lib/xc_linux_restore.c	Thu May 20 17:18:28 2004 +0000
     3.2 +++ b/tools/xc/lib/xc_linux_restore.c	Thu May 20 17:43:25 2004 +0000
     3.3 @@ -592,14 +592,11 @@ int xc_linux_restore(int xc_handle,
     3.4  
     3.5  
     3.6   out:
     3.7 -    if ( rc != 0 )  // destroy is something went wrong
     3.8 +    if ( rc != 0 )  // destroy if something went wrong
     3.9      {
    3.10          if ( dom != 0 )
    3.11          {
    3.12 -            op.cmd = DOM0_DESTROYDOMAIN;
    3.13 -            op.u.destroydomain.domain = (domid_t)dom;
    3.14 -            op.u.destroydomain.force  = 1;
    3.15 -            (void)do_dom0_op(xc_handle, &op);
    3.16 +	    xc_domain_destroy( xc_handle, dom, 1 );
    3.17          }
    3.18      }
    3.19  
     4.1 --- a/tools/xc/lib/xc_linux_save.c	Thu May 20 17:18:28 2004 +0000
     4.2 +++ b/tools/xc/lib/xc_linux_save.c	Thu May 20 17:43:25 2004 +0000
     4.3 @@ -82,6 +82,67 @@ inline void set_bit ( int nr, volatile v
     4.4  	(1 << (nr % (sizeof(unsigned long)*8) ) );
     4.5  }
     4.6  
     4.7 +long long tv_to_us( struct timeval *new )  /* timeval -> total microseconds */
     4.8 +{
     4.9 +    return ((long long)new->tv_sec * 1000000) + new->tv_usec; /* widen first: tv_sec may be 32-bit */
    4.10 +}
    4.11 +/* Signed difference new - old, expressed in microseconds. */
    4.12 +long long tvdelta( struct timeval *new, struct timeval *old )
    4.13 +{
    4.14 +    /* widen to long long before scaling so a 32-bit tv_sec cannot overflow */
    4.15 +    return ((long long)new->tv_sec - old->tv_sec) * 1000000 + (new->tv_usec - old->tv_usec);
    4.16 +}
    4.17 +/* Sample dom0 CPU time and, if print is non-zero, report use since the previous call; always returns 0. */
    4.18 +int track_cpu_usage_dom0( int xc_handle, int print )
    4.19 +{
    4.20 +    static struct timeval wall_last;
    4.21 +    static long long      cpu_last;
    4.22 +    /* the statics above hold the previous sample; the first call therefore compares against zeros */
    4.23 +    struct timeval        wall_now;
    4.24 +    long long             cpu_now, wall_delta, cpu_delta;
    4.25 +
    4.26 +    gettimeofday(&wall_now, NULL);
    4.27 +
    4.28 +    cpu_now = xc_domain_get_cpu_usage( xc_handle, 0 )/1000; /* presumably ns -> us; TODO confirm cpu_time units */
    4.29 +
    4.30 +    wall_delta = tvdelta(&wall_now,&wall_last)/1000;
    4.31 +    cpu_delta  = (cpu_now - cpu_last)/1000;
    4.32 +    /* %lld matches the long long ratio; a zero wall_delta (calls less than 1ms apart) reports 0% */
    4.33 +    if(print)
    4.34 +	printf("Dom0  : wall delta %lldms, cpu delta %lldms    : %lld%%\n",
    4.35 +	   wall_delta, cpu_delta, wall_delta ? (cpu_delta*100)/wall_delta : 0);
    4.36 +
    4.37 +    cpu_last  = cpu_now;
    4.38 +    wall_last = wall_now;	
    4.39 +
    4.40 +    return 0;
    4.41 +}
    4.42 +/* Sample domid's CPU time and, if print is non-zero, report use since the previous call; always returns 0. */
    4.43 +int track_cpu_usage_target( int xc_handle, u64 domid, int print )
    4.44 +{
    4.45 +    static struct timeval wall_last;
    4.46 +    static long long      cpu_last;
    4.47 +    /* the statics above hold the previous sample; the first call therefore compares against zeros */
    4.48 +    struct timeval        wall_now;
    4.49 +    long long             cpu_now, wall_delta, cpu_delta;
    4.50 +
    4.51 +    gettimeofday(&wall_now, NULL);
    4.52 +
    4.53 +    cpu_now = xc_domain_get_cpu_usage( xc_handle, domid )/1000; /* presumably ns -> us; TODO confirm cpu_time units */
    4.54 +
    4.55 +    wall_delta = tvdelta(&wall_now,&wall_last)/1000;
    4.56 +    cpu_delta  = (cpu_now - cpu_last)/1000;
    4.57 +    /* %lld matches the long long ratio; a zero wall_delta (calls less than 1ms apart) reports 0% */
    4.58 +    if(print)
    4.59 +	printf("Target: wall delta %lldms, cpu delta %lldms    : %lld%%\n",
    4.60 +	   wall_delta, cpu_delta, wall_delta ? (cpu_delta*100)/wall_delta : 0);
    4.61 +
    4.62 +    cpu_last  = cpu_now;
    4.63 +    wall_last = wall_now;	
    4.64 +
    4.65 +    return 0;
    4.66 +}
    4.67 +
    4.68  
    4.69  int xc_linux_save(int xc_handle,
    4.70                    u64 domid, 
    4.71 @@ -95,10 +156,11 @@ int xc_linux_save(int xc_handle,
    4.72      int verbose = flags & XCFLAGS_VERBOSE;
    4.73      int live = flags & XCFLAGS_LIVE;
    4.74      int debug = flags & XCFLAGS_DEBUG;
    4.75 -    int sent_last_iter, sent_this_iter, max_iters;
    4.76 -
    4.77 -    /* Remember if we stopped the guest, so we can restart it on exit. */
    4.78 -    int we_stopped_it = 0;
    4.79 +    int sent_last_iter, sent_this_iter, skip_this_iter;
    4.80 +    
    4.81 +    /* Important tuning parameters */
    4.82 +    int max_iters  = 29; // limit us to 30 times round loop
    4.83 +    int max_factor = 3;  // never send more than 3x nr_pfns 
    4.84  
    4.85      /* The new domain's shared-info frame number. */
    4.86      unsigned long shared_info_frame;
    4.87 @@ -137,11 +199,15 @@ int xc_linux_save(int xc_handle,
    4.88      /* number of pages we're dealing with */
    4.89      unsigned long nr_pfns;
    4.90  
    4.91 -    /* bitmap of pages left to send */
    4.92 -    unsigned long *to_send, *to_fix;
    4.93 +    /* bitmap of pages:
    4.94 +       - that should be sent this iteration (unless later marked as skip); 
    4.95 +       - to skip this iteration because already dirty;
    4.96 +       - to fixup by sending at the end if not already resent; */
    4.97 +    unsigned long *to_send, *to_skip, *to_fix;
    4.98  
    4.99 -//live=0;
   4.100 -
   4.101 +    int needed_to_fix = 0;
   4.102 +    int total_sent    = 0;
   4.103 +    
   4.104      if ( mlock(&ctxt, sizeof(ctxt) ) )
   4.105      {
   4.106          PERROR("Unable to mlock ctxt");
   4.107 @@ -149,38 +215,15 @@ int xc_linux_save(int xc_handle,
   4.108      }
   4.109  
   4.110      /* Ensure that the domain exists, and that it is stopped. */
   4.111 -    for ( ; ; )
   4.112 -    {
   4.113 -        op.cmd = DOM0_GETDOMAININFO;
   4.114 -        op.u.getdomaininfo.domain = (domid_t)domid;
   4.115 -        op.u.getdomaininfo.ctxt = &ctxt;
   4.116 -        if ( (do_dom0_op(xc_handle, &op) < 0) || 
   4.117 -             ((u64)op.u.getdomaininfo.domain != domid) )
   4.118 -        {
   4.119 -            PERROR("Could not get info on domain");
   4.120 -            goto out;
   4.121 -        }
   4.122 -
   4.123 -        memcpy(name, op.u.getdomaininfo.name, sizeof(name));
   4.124 -        shared_info_frame = op.u.getdomaininfo.shared_info_frame;
   4.125  
   4.126 -        if ( op.u.getdomaininfo.state == DOMSTATE_STOPPED )
   4.127 -            break;
   4.128 -
   4.129 -        we_stopped_it = 1;
   4.130 +    if ( xc_domain_stop_sync( xc_handle, domid, &op, &ctxt ) )
   4.131 +    {
   4.132 +	PERROR("Could not sync stop domain");
   4.133 +	goto out;
   4.134 +    }
   4.135  
   4.136 -        op.cmd = DOM0_STOPDOMAIN;
   4.137 -        op.u.stopdomain.domain = (domid_t)domid;
   4.138 -        if ( do_dom0_op(xc_handle, &op) != 0 )
   4.139 -        {
   4.140 -            we_stopped_it = 0;
   4.141 -            PERROR("Stopping target domain failed");
   4.142 -            goto out;
   4.143 -        }
   4.144 -
   4.145 -        usleep(1000); // 1ms
   4.146 -	printf("Sleep for 1ms\n");
   4.147 -    }
   4.148 +    memcpy(name, op.u.getdomaininfo.name, sizeof(name));
   4.149 +    shared_info_frame = op.u.getdomaininfo.shared_info_frame;
   4.150  
   4.151      /* A cheesy test to see whether the domain contains valid state. */
   4.152      if ( ctxt.pt_base == 0 )
   4.153 @@ -288,7 +331,6 @@ int xc_linux_save(int xc_handle,
   4.154  
   4.155  	last_iter = 0;
   4.156  	sent_last_iter = 1<<20; // 4GB's worth of pages
   4.157 -	max_iters = 9; // limit us to 10 time round loop
   4.158      }
   4.159      else
   4.160  	last_iter = 1;
   4.161 @@ -300,12 +342,14 @@ int xc_linux_save(int xc_handle,
   4.162  	
   4.163  	to_send = malloc( sz );
   4.164  	to_fix  = calloc( 1, sz );
   4.165 +	to_skip = malloc( sz );
   4.166  
   4.167 -	if (!to_send || !to_fix)
   4.168 +	if (!to_send || !to_fix || !to_skip)
   4.169  	{
   4.170  	    ERROR("Couldn't allocate to_send array");
   4.171  	    goto out;
   4.172  	}
   4.173 +
   4.174  	memset( to_send, 0xff, sz );
   4.175  
   4.176  	if ( mlock( to_send, sz ) )
   4.177 @@ -313,6 +357,15 @@ int xc_linux_save(int xc_handle,
   4.178  	    PERROR("Unable to mlock to_send");
   4.179  	    return 1;
   4.180  	}
   4.181 +
   4.182 +	/* (to fix is local only) */
   4.183 +
   4.184 +	if ( mlock( to_skip, sz ) )
   4.185 +	{
   4.186 +	    PERROR("Unable to mlock to_skip");
   4.187 +	    return 1;
   4.188 +	}
   4.189 +
   4.190      }
   4.191  
   4.192  
   4.193 @@ -369,8 +422,11 @@ int xc_linux_save(int xc_handle,
   4.194          goto out;
   4.195      }
   4.196  
   4.197 +    track_cpu_usage_dom0(xc_handle, 0);
   4.198 +    track_cpu_usage_target( xc_handle, domid, 0);
   4.199 +
   4.200      /* Now write out each data page, canonicalising page tables as we go... */
   4.201 -
   4.202 +    
   4.203      while(1)
   4.204      {
   4.205  	unsigned int prev_pc, batch, sent_this_iter;
   4.206 @@ -378,6 +434,7 @@ int xc_linux_save(int xc_handle,
   4.207  	iter++;
   4.208  
   4.209  	sent_this_iter = 0;
   4.210 +	skip_this_iter = 0;
   4.211  	prev_pc = 0;
   4.212  	verbose_printf("Saving memory pages: iter %d   0%%", iter);
   4.213  
   4.214 @@ -391,6 +448,18 @@ int xc_linux_save(int xc_handle,
   4.215  		prev_pc = this_pc;
   4.216  	    }
   4.217  
   4.218 +	    /* slightly wasteful to peek the whole array every time, 
   4.219 +	       but this is fast enough for the moment. */
   4.220 +
   4.221 +	    if ( !last_iter && 
   4.222 +		 xc_shadow_control( xc_handle, domid, 
   4.223 +				    DOM0_SHADOW_CONTROL_OP_PEEK,
   4.224 +				    to_skip, nr_pfns ) != nr_pfns ) 
   4.225 +	    {
   4.226 +		ERROR("Error peeking shadow bitmap");
   4.227 +		goto out;
   4.228 +	    }
   4.229 +	    
   4.230  
   4.231  	    /* load pfn_type[] with the mfn of all the pages we're doing in
   4.232  	       this batch. */
   4.233 @@ -404,15 +473,29 @@ int xc_linux_save(int xc_handle,
   4.234  			    test_bit(n,to_send),
   4.235  			    live_mfn_to_pfn_table[live_pfn_to_mfn_table[n]&0xFFFFF]);
   4.236  
   4.237 +		if (!last_iter && test_bit(n, to_send) && test_bit(n, to_skip))
   4.238 +		    skip_this_iter++; // stats keeping
   4.239  
   4.240 -		if ( !test_bit(n, to_send ) &&
   4.241 -		    !( last_iter && test_bit(n, to_fix ) ) ) continue;
   4.242 +		if (! ( (test_bit(n, to_send) && !test_bit(n, to_skip)) ||
   4.243 +			(test_bit(n, to_send) && last_iter) ||
   4.244 +			(test_bit(n, to_fix)  && last_iter) )   )
   4.245 +		    continue;
   4.246 +
   4.247 +		/* we get here if:
   4.248 +		   1. page is marked to_send & hasn't already been re-dirtied
   4.249 +		   2. (ignore to_skip in last iteration)
   4.250 +		   3. add in pages that still need fixup (net bufs)
   4.251 +		 */
   4.252  		
   4.253  		pfn_batch[batch] = n;
   4.254  		pfn_type[batch] = live_pfn_to_mfn_table[n];
   4.255  
   4.256  		if( pfn_type[batch] == 0x80000004 )
   4.257  		{
   4.258 +		    /* not currently in pseudo-physical map -- set bit
   4.259 +		       in to_fix that we must send this page in last_iter
   4.260 +		       unless it's sent sooner anyhow */
   4.261 +
   4.262  		    set_bit( n, to_fix );
   4.263  		    if( iter>1 )
   4.264  			DDPRINTF("Urk! netbuf race: iter %d, pfn %lx. mfn %lx\n",
   4.265 @@ -422,6 +505,7 @@ int xc_linux_save(int xc_handle,
   4.266  
   4.267  		if ( last_iter && test_bit(n, to_fix ) && !test_bit(n, to_send ))
   4.268  		{
   4.269 +		    needed_to_fix++;
   4.270  		    DPRINTF("Fix! iter %d, pfn %lx. mfn %lx\n",
   4.271  			       iter,n,pfn_type[batch]);
   4.272  		}
   4.273 @@ -567,9 +651,23 @@ int xc_linux_save(int xc_handle,
   4.274  	munmap(region_base, batch*PAGE_SIZE);
   4.275  
   4.276      skip: 
   4.277 +
   4.278 +	total_sent += sent_this_iter;
   4.279 +
   4.280 +	verbose_printf("\b\b\b\b100%% (pages sent= %d, skipped= %d )\n", 
   4.281 +		       sent_this_iter, skip_this_iter );
   4.282 +
   4.283 +	track_cpu_usage_dom0(xc_handle, 1);
   4.284 +	track_cpu_usage_target( xc_handle, domid, 1);
   4.285 +
   4.286  	
   4.287 -	verbose_printf("\b\b\b\b100%% (%d pages)\n", sent_this_iter );
   4.288 -	
   4.289 +	if ( last_iter )
   4.290 +	{
   4.291 +	    verbose_printf("Total pages sent= %d (%.2fx)\n", 
   4.292 +			   total_sent, ((float)total_sent)/nr_pfns );
   4.293 +	    verbose_printf("(of which %d were fixups)\n", needed_to_fix  );
   4.294 +	}       
   4.295 +
   4.296  	if ( debug && last_iter )
   4.297  	{
   4.298  	    int minusone = -1;
   4.299 @@ -592,18 +690,21 @@ int xc_linux_save(int xc_handle,
   4.300  
   4.301  	if ( live )
   4.302  	{
   4.303 -	    if ( ( sent_this_iter > (sent_last_iter * 0.95) ) ||
   4.304 -		 (iter >= max_iters) || (sent_this_iter < 10) )
   4.305 +	    if ( 
   4.306 +		 // ( sent_this_iter > (sent_last_iter * 0.95) ) ||		 
   4.307 +		 (iter >= max_iters) || 
   4.308 +		 (sent_this_iter+skip_this_iter < 10) || 
   4.309 +		 (total_sent > nr_pfns*max_factor) )
   4.310  	    {
   4.311  		DPRINTF("Start last iteration\n");
   4.312  		last_iter = 1;
   4.313  
   4.314 -		xc_domain_stop_sync( xc_handle, domid );
   4.315 +		xc_domain_stop_sync( xc_handle, domid, &op, NULL );
   4.316  
   4.317  	    } 
   4.318  
   4.319  	    if ( xc_shadow_control( xc_handle, domid, 
   4.320 -				    DOM0_SHADOW_CONTROL_OP_CLEAN,
   4.321 +				    DOM0_SHADOW_CONTROL_OP_CLEAN2,
   4.322  				    to_send, nr_pfns ) != nr_pfns ) 
   4.323  	    {
   4.324  		ERROR("Error flushing shadow PT");
   4.325 @@ -674,14 +775,6 @@ int xc_linux_save(int xc_handle,
   4.326      munmap(live_shinfo, PAGE_SIZE);
   4.327  
   4.328  out:
   4.329 -    /* Restart the domain if we had to stop it to save its state. */
   4.330 -    if ( we_stopped_it )
   4.331 -    {
   4.332 -	printf("Restart domain\n");
   4.333 -        op.cmd = DOM0_STARTDOMAIN;
   4.334 -        op.u.startdomain.domain = (domid_t)domid;
   4.335 -        (void)do_dom0_op(xc_handle, &op);
   4.336 -    }
   4.337  
   4.338      if ( pfn_type != NULL )
   4.339          free(pfn_type);
     5.1 --- a/tools/xc/lib/xc_private.c	Thu May 20 17:18:28 2004 +0000
     5.2 +++ b/tools/xc/lib/xc_private.c	Thu May 20 17:43:25 2004 +0000
     5.3 @@ -377,51 +377,67 @@ int finish_mmu_updates(int xc_handle, mm
     5.4  
     5.5  /* this function is a hack until we get proper synchronous domain stop */
     5.6  
     5.7 -int xc_domain_stop_sync( int xc_handle, domid_t domid )
     5.8 +int xc_domain_stop_sync( int xc_handle, domid_t domid,
     5.9 +			 dom0_op_t *op, full_execution_context_t *ctxt)
    5.10  {
    5.11 -    dom0_op_t op;
    5.12      int i;
    5.13 -    
    5.14  
    5.15 -    op.cmd = DOM0_STOPDOMAIN;
    5.16 -    op.u.stopdomain.domain = (domid_t)domid;
    5.17 -    if ( do_dom0_op(xc_handle, &op) != 0 )
    5.18 -    {
    5.19 -	PERROR("Stopping target domain failed");
    5.20 -	goto out;
    5.21 -    }
    5.22 -
    5.23 -    usleep(100); // 100us
    5.24 +    printf("Sleep:");
    5.25  
    5.26      for(i=0;;i++)
    5.27 -    {
    5.28 -	if (i>0)
    5.29 -	    if (i==1) printf("Sleep.");
    5.30 -	    else printf(".");
    5.31 +    {    
    5.32  
    5.33 -        op.cmd = DOM0_GETDOMAININFO;
    5.34 -        op.u.getdomaininfo.domain = (domid_t)domid;
    5.35 -        op.u.getdomaininfo.ctxt = NULL;
    5.36 -        if ( (do_dom0_op(xc_handle, &op) < 0) || 
    5.37 -             ((u64)op.u.getdomaininfo.domain != domid) )
    5.38 +	op->cmd = DOM0_STOPDOMAIN;
    5.39 +	op->u.stopdomain.domain = (domid_t)domid;
    5.40 +	op->u.stopdomain.sync = 1;
    5.41 +	do_dom0_op(xc_handle, op);
    5.42 +	/* can't trust return code due to sync stop hack :-(( */
    5.43 +
    5.44 +       
    5.45 +        op->cmd = DOM0_GETDOMAININFO;
    5.46 +        op->u.getdomaininfo.domain = (domid_t)domid;
    5.47 +        op->u.getdomaininfo.ctxt = ctxt;
    5.48 +        if ( (do_dom0_op(xc_handle, op) < 0) || 
    5.49 +             ((u64)op->u.getdomaininfo.domain != domid) )
    5.50          {
    5.51              PERROR("Could not get info on domain");
    5.52              goto out;
    5.53          }
    5.54  
    5.55 -        if ( op.u.getdomaininfo.state == DOMSTATE_STOPPED )
    5.56 +        if ( op->u.getdomaininfo.state == DOMSTATE_STOPPED )
    5.57  	{
    5.58  	    printf("Domain %lld stopped\n",domid);
    5.59              return 0;
    5.60  	}
    5.61  	
    5.62 -	usleep(1000);
    5.63 +	printf(".");
    5.64 +
    5.65 +	//usleep(1000);
    5.66      }
    5.67  
    5.68 +    printf("\n");
    5.69 +
    5.70  out:
    5.71      return -1;    
    5.72  }
    5.73  
    5.74 +long long  xc_domain_get_cpu_usage( int xc_handle, domid_t domid ) /* cpu_time reported for domid, or -1 on failure */
    5.75 +{
    5.76 +    dom0_op_t op;
    5.77 +    /* Request domain info without an execution context (ctxt = NULL). */
    5.78 +    op.cmd = DOM0_GETDOMAININFO;
    5.79 +    op.u.getdomaininfo.domain = (domid_t)domid;
    5.80 +    op.u.getdomaininfo.ctxt = NULL;
    5.81 +    if ( (do_dom0_op(xc_handle, &op) < 0) || 
    5.82 +	 ((u64)op.u.getdomaininfo.domain != domid) ) /* a reply naming a different domain counts as failure */
    5.83 +    {
    5.84 +	PERROR("Could not get info on domain");
    5.85 +	return -1;
    5.86 +    }
    5.87 +    return op.u.getdomaininfo.cpu_time; /* NOTE(review): units not visible here; callers divide by 1000 twice and print ms */
    5.88 +}
    5.89 +
    5.90 +
    5.91  /**********************************************************************/
    5.92  
    5.93  // this is shared between save and restore, and may be useful.
     6.1 --- a/tools/xc/lib/xc_private.h	Thu May 20 17:18:28 2004 +0000
     6.2 +++ b/tools/xc/lib/xc_private.h	Thu May 20 17:43:25 2004 +0000
     6.3 @@ -248,6 +248,9 @@ void * mfn_mapper_queue_entry(mfn_mapper
     6.4  
     6.5  /*********************/
     6.6  
     6.7 -int xc_domain_stop_sync( int xc_handle, domid_t dom );
     6.8 +int xc_domain_stop_sync( int xc_handle, domid_t dom, 
     6.9 +			 dom0_op_t *op, full_execution_context_t *ctxt );
    6.10 +
    6.11 +long long  xc_domain_get_cpu_usage( int xc_handle, domid_t domid );
    6.12  
    6.13  #endif /* __XC_PRIVATE_H__ */
     7.1 --- a/tools/xc/py/Xc.c	Thu May 20 17:18:28 2004 +0000
     7.2 +++ b/tools/xc/py/Xc.c	Thu May 20 17:43:25 2004 +0000
     7.3 @@ -214,6 +214,7 @@ static PyObject *pyxc_linux_save(PyObjec
     7.4  	struct hostent *h;
     7.5  	struct sockaddr_in s;
     7.6  	int sockbufsize;
     7.7 +	int rc = -1;
     7.8  
     7.9  	int writerfn(void *fd, const void *buf, size_t count)
    7.10  	{
    7.11 @@ -257,12 +258,24 @@ static PyObject *pyxc_linux_save(PyObjec
    7.12  	if ( xc_linux_save(xc->xc_handle, dom, flags, 
    7.13                             writerfn, (void*)sd) == 0 )
    7.14  	{
    7.15 -	    close(sd);
    7.16 -	    Py_INCREF(zero);
    7.17 -	    return zero;
    7.18 +	    if ( read( sd, &rc, sizeof(int) ) != sizeof(int) )
    7.19 +		goto serr;
    7.20 +		
    7.21 +	    if ( rc == 0 )
    7.22 +	    {
    7.23 +		printf("Migration succesful -- destroy local copy\n");
    7.24 +		xc_domain_destroy( xc->xc_handle, dom, 1 );
    7.25 +		close(sd);
    7.26 +		Py_INCREF(zero);
    7.27 +		return zero;
    7.28 +	    }
    7.29 +	    else
    7.30 +		errno = rc;
    7.31  	}
    7.32  
    7.33      serr:
    7.34 +	printf("Migration failed -- restart local copy\n");
    7.35 +	xc_domain_start( xc->xc_handle, dom );
    7.36  	PyErr_SetFromErrno(xc_error);
    7.37  	if ( sd >= 0 ) close(sd);
    7.38  	return NULL;
    7.39 @@ -355,7 +368,7 @@ static PyObject *pyxc_linux_restore(PyOb
    7.40  	struct sockaddr_in s, d, p;
    7.41  	socklen_t dlen, plen;
    7.42  	int sockbufsize;
    7.43 -	int on = 1;
    7.44 +	int on = 1, rc = -1;
    7.45  
    7.46  	int readerfn(void *fd, void *buf, size_t count)
    7.47  	{
    7.48 @@ -413,13 +426,18 @@ static PyObject *pyxc_linux_restore(PyOb
    7.49                          sizeof sockbufsize) < 0 ) 
    7.50  	    goto serr;
    7.51  
    7.52 -	if ( xc_linux_restore(xc->xc_handle, dom, flags, 
    7.53 -                              readerfn, (void*)sd, &dom) == 0 )
    7.54 +	rc = xc_linux_restore(xc->xc_handle, dom, flags, 
    7.55 +                              readerfn, (void*)sd, &dom);
    7.56 +
    7.57 +	write( sd, &rc, sizeof(int) ); 
    7.58 +
    7.59 +	if (rc == 0)
    7.60  	{
    7.61  	    close(sd);
    7.62  	    Py_INCREF(zero);
    7.63  	    return zero;
    7.64  	}
    7.65 +	errno = rc;
    7.66  
    7.67      serr:
    7.68  	PyErr_SetFromErrno(xc_error);
     8.1 --- a/tools/xentrace/Makefile	Thu May 20 17:18:28 2004 +0000
     8.2 +++ b/tools/xentrace/Makefile	Thu May 20 17:43:25 2004 +0000
     8.3 @@ -1,6 +1,6 @@
     8.4  
     8.5  CC       = gcc
     8.6 -CFLAGS   = -Wall -O3 
     8.7 +CFLAGS   = -Wall -O3 -Werror
     8.8  CFLAGS  += -I../../xen/include/hypervisor-ifs
     8.9  CFLAGS  += -I../../xenolinux-sparse/include
    8.10  CFLAGS  += -I../xend/lib
     9.1 --- a/tools/xentrace/formats	Thu May 20 17:18:28 2004 +0000
     9.2 +++ b/tools/xentrace/formats	Thu May 20 17:43:25 2004 +0000
     9.3 @@ -1,14 +1,35 @@
     9.4 -0x00010000	CPU%(cpu)d 0x%(tsc)x sched_add_domain(0x%(3)08x)            [ dom id = 0x%(1)x%(2)08x   ]
     9.5 -0x00010001	CPU%(cpu)d 0x%(tsc)x sched_rem_domain(0x%08(3)x)            [ dom id = 0x%(1)x%(2)08x   ]
     9.6 -0x00010002	CPU%(cpu)d 0x%(tsc)x __wake_up(0x%(3)08x)                   [ dom id = 0x%(1)x%(2)08x   ]
     9.7 -0x00010003	CPU%(cpu)d 0x%(tsc)x do_block()                             [ current = 0x%(2)08x         ]
     9.8 -0x00010004	CPU%(cpu)d 0x%(tsc)x do_yield()		                    [ current = %(2)08x         ]
     9.9 -0x00010005	CPU%(cpu)d 0x%(tsc)x do_set_timer_op(0x%(4)08x, 0x%(5)08x)  [ current = 0x%(3)08x ]
    9.10 -0x00010006	CPU%(cpu)d 0x%(tsc)x sched_ctl(0x%(1)08x)
    9.11 -0x00010007	CPU%(cpu)d 0x%(tsc)x sched_adjdom(params)                   [ dom id = 0x%(1)x%(2)08x   ]
    9.12 -0x00010008	CPU%(cpu)d 0x%(tsc)x __reschedule(0x%(3)08x)                [ dom id = 0x%(1)x%(2)08x   ]
    9.13 -0x00010009	CPU%(cpu)d 0x%(tsc)x switching to task_struct 0x%(1)08x     [ dom id = 0x%(1)x     ]
    9.14 -0x0001000A	CPU%(cpu)d 0x%(tsc)x s_timer_fn(unused)
    9.15 -0x0001000B	CPU%(cpu)d 0x%(tsc)x t_timer_fn(unused)
    9.16 -0x0001000C	CPU%(cpu)d 0x%(tsc)x dom_timer_fn(data)
    9.17 -0x0001000D	CPU%(cpu)d 0x%(tsc)x fallback_timer_fn(unused)
    9.18 +#0x00010000	CPU%(cpu)d %(tsc).6f sched_add_domain(0x%(3)08x)            [ dom id = 0x%(1)x%(2)08x   ]
    9.19 +#0x00010001	CPU%(cpu)d %(tsc).6f sched_rem_domain(0x%08(3)x)            [ dom id = 0x%(1)x%(2)08x   ]
    9.20 +#0x00010002	CPU%(cpu)d %(tsc).6f __wake_up(0x%(3)08x)                   [ dom id = 0x%(1)x%(2)08x   ]
    9.21 +#0x00010003	CPU%(cpu)d %(tsc).6f do_block()                             [ current = 0x%(2)08x         ]
    9.22 +#0x00010004	CPU%(cpu)d %(tsc).6f do_yield()		                    [ current = %(2)08x         ]
    9.23 +#0x00010005	CPU%(cpu)d %(tsc).6f do_set_timer_op(0x%(4)08x, 0x%(5)08x)  [ current = 0x%(3)08x ]
    9.24 +#0x00010006	CPU%(cpu)d %(tsc).6f sched_ctl(0x%(1)08x)
    9.25 +#0x00010007	CPU%(cpu)d %(tsc).6f sched_adjdom(params)                   [ dom id = 0x%(1)x%(2)08x   ]
    9.26 +#0x00010008	CPU%(cpu)d %(tsc).6f __reschedule(0x%(3)08x)                [ dom id = 0x%(1)x%(2)08x   ]
    9.27 +#0x00010009	CPU%(cpu)d %(tsc).6f switching to task_struct 0x%(1)08x     [ dom id = 0x%(1)x     ]
    9.28 +#0x0001000A	CPU%(cpu)d %(tsc).6f s_timer_fn(unused)
    9.29 +#0x0001000B	CPU%(cpu)d %(tsc).6f t_timer_fn(unused)
    9.30 +#0x0001000C	CPU%(cpu)d %(tsc).6f dom_timer_fn(data)
    9.31 +#0x0001000D	CPU%(cpu)d %(tsc).6f fallback_timer_fn(unused)
    9.32 +
    9.33 +
    9.34 +0x00020008	CPU%(cpu)d %(tsc).6f enter: dom0_create_dom ( )
    9.35 +0x00030008	CPU%(cpu)d %(tsc).6f leave: dom0_create_dom ( )
    9.36 +
    9.37 +0x00020009	CPU%(cpu)d %(tsc).6f enter: dom0_destroy_dom ( dom=0x%(2)x )
    9.38 +0x00030009	CPU%(cpu)d %(tsc).6f leave: dom0_destroy_dom ( dom=0x%(2)x ) = %(1)d
    9.39 +
    9.40 +0x0002000A	CPU%(cpu)d %(tsc).6f enter: dom0_start_dom ( dom=0x%(2)x )
    9.41 +0x0003000A	CPU%(cpu)d %(tsc).6f leave: dom0_start_dom ( dom=0x%(2)x ) = %(1)d
    9.42 +0x0002000B	CPU%(cpu)d %(tsc).6f enter: dom0_stop_dom ( dom=0x%(2)x )
    9.43 +0x0003000B	CPU%(cpu)d %(tsc).6f leave: dom0_stop_dom ( dom=0x%(2)x ) = %(1)d
    9.44 +0x0002000C	CPU%(cpu)d %(tsc).6f enter: dom0_getinfo ( dom=0x%(2)x )
    9.45 +0x0003000C	CPU%(cpu)d %(tsc).6f leave: dom0_getinfo ( dom=0x%(2)x ) = %(1)d
    9.46 +0x0002000D	CPU%(cpu)d %(tsc).6f enter: dom0_build ( dom=0x%(2)x )
    9.47 +0x0003000D	CPU%(cpu)d %(tsc).6f leave: dom0_build ( dom=0x%(2)x ) = %(1)d
    9.48 +
    9.49 +0x00020019	CPU%(cpu)d %(tsc).6f enter: dom0_shadow_op ( dom=0x%(2)x, %(3)d )
    9.50 +0x00030019	CPU%(cpu)d %(tsc).6f leave: dom0_shadow_op ( dom=0x%(2)x, %(3)d  ) = %(1)d
    9.51 +
    9.52 +#0x0		CPU%(cpu)d %(tsc).6f %(event)x
    10.1 --- a/tools/xentrace/xentrace.c	Thu May 20 17:18:28 2004 +0000
    10.2 +++ b/tools/xentrace/xentrace.c	Thu May 20 17:43:25 2004 +0000
    10.3 @@ -227,10 +227,10 @@ struct t_rec **init_rec_ptrs(unsigned lo
    10.4   * trace buffer.  Each entry in this table corresponds to the tail index for a
    10.5   * particular trace buffer.
    10.6   */
    10.7 -int *init_tail_idxs(struct t_buf **bufs, unsigned int num)
    10.8 +unsigned long *init_tail_idxs(struct t_buf **bufs, unsigned int num)
    10.9  {
   10.10      int i;
   10.11 -    int *tails = calloc(num, sizeof(unsigned int));
   10.12 +    unsigned long *tails = calloc(num, sizeof(unsigned int));
   10.13   
   10.14      if ( tails == NULL )
   10.15      {
   10.16 @@ -276,16 +276,19 @@ unsigned int get_num_cpus()
   10.17   */
   10.18  int monitor_tbufs(FILE *logfile)
   10.19  {
   10.20 -    int i, j;
   10.21 +    int i;
   10.22 +
   10.23      void *tbufs_mapped;          /* pointer to where the tbufs are mapped    */
   10.24      struct t_buf **meta;         /* pointers to the trace buffer metadata    */
   10.25      struct t_rec **data;         /* pointers to the trace buffer data areas
   10.26                                    * where they are mapped into user space.   */
   10.27 -    int *tails;                  /* store tail indexes for the trace buffers */
   10.28 +    unsigned long *cons;         /* store tail indexes for the trace buffers */
   10.29      unsigned long tbufs_phys;    /* physical address of the tbufs            */
   10.30      unsigned int  num;           /* number of trace buffers / logical CPUS   */
   10.31      unsigned long size;          /* size of a single trace buffer            */
   10.32  
   10.33 +    int size_in_recs;
   10.34 +
   10.35      /* get number of logical CPUs (and therefore number of trace buffers) */
   10.36      num = get_num_cpus();
   10.37  
   10.38 @@ -293,39 +296,32 @@ int monitor_tbufs(FILE *logfile)
   10.39      get_tbufs(&tbufs_phys, &size);
   10.40      tbufs_mapped = map_tbufs(tbufs_phys, num, size);
   10.41  
   10.42 +    size_in_recs = (size / sizeof(struct t_rec) )-1;
   10.43 +
   10.44      /* build arrays of convenience ptrs */
   10.45      meta  = init_bufs_ptrs (tbufs_mapped, num, size);
   10.46      data  = init_rec_ptrs  (tbufs_phys, tbufs_mapped, meta, num);
   10.47 -    tails = init_tail_idxs (meta, num);
   10.48 +    cons  = init_tail_idxs (meta, num);
   10.49  
   10.50      /* now, scan buffers for events */
   10.51      while ( !interrupted )
   10.52      {
   10.53          for ( i = 0; ( i < num ) && !interrupted; i++ )
   10.54 -        {
   10.55 -            signed long newdata = meta[i]->head - tails[i];
   10.56 -            signed long prewrap = newdata;
   10.57 -
   10.58 -	    /* correct newdata and prewrap in case of a pointer wrap */
   10.59 -            if ( newdata < 0 )
   10.60 -            {
   10.61 -                newdata += meta[i]->size;
   10.62 -                prewrap  = meta[i]->size - tails[i];
   10.63 -            }
   10.64 +        {	    
   10.65 +/*	    printf("XX%d: cons=%ld head=%ld  %p\n", i,
   10.66 +		   cons[i], meta[i]->head, data[i] + (cons[i] % size_in_recs) );
   10.67 +		   */
   10.68 +	    while( cons[i] < meta[i]->head )
   10.69 +	    {
   10.70 +/*
   10.71 +		if( (cons[i] % 6  ) == 0 )
   10.72 +		    printf("%d: cons=%ld head=%ld  %p\n", i,
   10.73 +		       cons[i], meta[i]->head, data[i] + (cons[i] % size_in_recs) );
   10.74 +		       */
   10.75 +		write_rec(i, data[i] + (cons[i] % size_in_recs), logfile);
   10.76 +		cons[i]++;
   10.77 +	    }
   10.78  
   10.79 -            if ( newdata >= opts.new_data_thresh )
   10.80 -            {
   10.81 -                /* output pre-wrap data */
   10.82 -                for(j = 0; j < prewrap; j++)
   10.83 -                    write_rec(i, data[i] + tails[i] + j, logfile);
   10.84 -                
   10.85 -                /* output post-wrap data, if any */                    
   10.86 -                for(j = 0; j < (newdata - prewrap); j++)
   10.87 -                    write_rec(i, data[i] + j, logfile);  
   10.88 -                
   10.89 -                tails[i] += newdata;
   10.90 -                if(tails[i] >= meta[i]->size) tails[i] = 0;
   10.91 -            }
   10.92          }
   10.93          nanosleep(&opts.poll_sleep, NULL);
   10.94      }
   10.95 @@ -333,7 +329,7 @@ int monitor_tbufs(FILE *logfile)
   10.96      /* cleanup */
   10.97      free(meta);
   10.98      free(data);
   10.99 -    free(tails);
  10.100 +    free(cons);
  10.101      /* don't need to munmap - cleanup is automatic */
  10.102      fclose(logfile);
  10.103  
    11.1 --- a/tools/xentrace/xentrace_format	Thu May 20 17:18:28 2004 +0000
    11.2 +++ b/tools/xentrace/xentrace_format	Thu May 20 17:43:25 2004 +0000
    11.3 @@ -4,7 +4,7 @@
    11.4  
    11.5  # Program for reformatting trace buffer output according to user-supplied rules
    11.6  
    11.7 -import re, sys, string, signal, struct
    11.8 +import re, sys, string, signal, struct, os, getopt
    11.9  
   11.10  def usage():
   11.11      print >> sys.stderr, \
   11.12 @@ -43,6 +43,9 @@ def read_defs(defs_file):
   11.13          line = fd.readline()
   11.14          if not line:
   11.15              break
   11.16 +	
   11.17 +	if line[0] == '#' or line[0] == '\n':
   11.18 +	    continue
   11.19          
   11.20          m = reg.match(line)
   11.21  
   11.22 @@ -58,29 +61,61 @@ def sighand(x,y):
   11.23  
   11.24  ##### Main code
   11.25  
   11.26 +mhz = 0
   11.27 +
   11.28  if len(sys.argv) < 2:
   11.29      usage()
   11.30  
   11.31 +try:
   11.32 +    opts, arg = getopt.getopt(sys.argv[1:], "c:" )
   11.33 +
   11.34 +    for opt in opts:
   11.35 +	if opt[0] == '-c' : mhz = int(opt[1])
   11.36 +
   11.37 +except getopt.GetoptError:
   11.38 +    usage()
   11.39 +
   11.40 +print mhz
   11.41 +
   11.42  signal.signal(signal.SIGTERM, sighand)
   11.43  signal.signal(signal.SIGHUP,  sighand)
   11.44  signal.signal(signal.SIGINT,  sighand)
   11.45  
   11.46  interrupted = 0
   11.47  
   11.48 -defs = read_defs(sys.argv[1])
   11.49 +defs = read_defs(arg[0])
   11.50 +
   11.51 +print defs
   11.52  
   11.53  # structure of trace record + prepended CPU id (as output by xentrace):
   11.54  # CPU(I) TSC(Q) EVENT(L) D1(L) D2(L) D3(L) D4(L) D5(L)
   11.55  TRCREC = "IQLLLLLL"
   11.56  
   11.57 +last_tsc = [0,0,0,0,0,0,0,0]
   11.58 +
   11.59 +i=0
   11.60 +
   11.61  while not interrupted:
   11.62      try:
   11.63 +	i=i+1
   11.64          line = sys.stdin.read(struct.calcsize(TRCREC))
   11.65          if not line:
   11.66              break
   11.67  
   11.68          (cpu, tsc, event, d1, d2, d3, d4, d5) = struct.unpack(TRCREC, line)
   11.69  
   11.70 +	#tsc = (tscH<<32) | tscL
   11.71 +
   11.72 +	#print i, tsc
   11.73 +
   11.74 +	if tsc < last_tsc[cpu]:
   11.75 +	    print "TSC stepped backward cpu %d !  %d %d" % (cpu,tsc,last_tsc[cpu])
   11.76 +
   11.77 +	last_tsc[cpu] = tsc
   11.78 +
   11.79 +	if mhz:
   11.80 +	    tsc = tsc / (mhz*1000000.0)
   11.81 +
   11.82          args = {'cpu'   : cpu,
   11.83                  'tsc'   : tsc,
   11.84                  'event' : event,
   11.85 @@ -90,8 +125,15 @@ while not interrupted:
   11.86                  '4'     : d4,
   11.87                  '5'     : d5    }
   11.88  
   11.89 -        if defs.has_key(str(event)): print defs[str(event)] % args
   11.90 -        # silently skip lines we don't have a format for - a 'complain' option
   11.91 -        # should be added if needed
   11.92 +	try:
   11.93 +
   11.94 +	    if defs.has_key(str(event)): 
   11.95 +		print defs[str(event)] % args
   11.96 +	    else: 
   11.97 +		if defs.has_key(str(0)): print defs[str(0)] % args
   11.98 +	except TypeError:
   11.99 +	    print defs[str(event)]
  11.100 +	    print args
  11.101 +	    
  11.102  
  11.103      except IOError, struct.error: sys.exit()
    12.1 --- a/xen/common/dom0_ops.c	Thu May 20 17:18:28 2004 +0000
    12.2 +++ b/xen/common/dom0_ops.c	Thu May 20 17:43:25 2004 +0000
    12.3 @@ -22,8 +22,8 @@
    12.4  #include <hypervisor-ifs/sched_ctl.h>
    12.5  
    12.6  
    12.7 -#define TRC_DOM0OP_START_BASE   0x00020000
    12.8 -#define TRC_DOM0OP_FINISH_BASE  0x00030000
    12.9 +#define TRC_DOM0OP_ENTER_BASE  0x00020000
   12.10 +#define TRC_DOM0OP_LEAVE_BASE  0x00030000
   12.11  
   12.12  
   12.13  extern unsigned int alloc_new_dom_mem(struct task_struct *, unsigned int);
   12.14 @@ -64,7 +64,7 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
   12.15          return -EACCES;
   12.16      }
   12.17  
   12.18 -    TRACE_5D( TRC_DOM0OP_START_BASE + op->cmd, 
   12.19 +    TRACE_5D( TRC_DOM0OP_ENTER_BASE + op->cmd, 
   12.20  	 0, op->u.dummy[0], op->u.dummy[1], op->u.dummy[2], op->u.dummy[3] );
   12.21  
   12.22      switch ( op->cmd )
   12.23 @@ -102,6 +102,20 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
   12.24      case DOM0_STOPDOMAIN:
   12.25      {
   12.26          ret = stop_other_domain(op->u.stopdomain.domain);
   12.27 +	
   12.28 +	/* This is grim, but helps for live migrate. It's also unsafe
   12.29 +	   in the strict sense as we're not explicitly setting a
   12.30 +	   timeout, but dom0 is bound to have other timers going off to
   12.31 +	   wake us back up. 
   12.32 +	   We go to sleep so that the other domain can stop quicker, hence
   12.33 +	   we have less total down time in a migrate.
   12.34 +	 */
   12.35 +	if( ret == 0 && op->u.stopdomain.sync == 1 )
   12.36 +	{
   12.37 +	    extern long do_block( void );
   12.38 +	    printk("T\n");
   12.39 +	    do_block(); // Yuk...
   12.40 +	}
   12.41      }
   12.42      break;
   12.43  
   12.44 @@ -668,7 +682,7 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
   12.45  
   12.46      }
   12.47  
   12.48 -    TRACE_5D( TRC_DOM0OP_FINISH_BASE + op->cmd, ret,
   12.49 +    TRACE_5D( TRC_DOM0OP_LEAVE_BASE + op->cmd, ret,
   12.50  	 op->u.dummy[0], op->u.dummy[1], op->u.dummy[2], op->u.dummy[3]  );
   12.51  
   12.52  
    13.1 --- a/xen/common/domain.c	Thu May 20 17:18:28 2004 +0000
    13.2 +++ b/xen/common/domain.c	Thu May 20 17:43:25 2004 +0000
    13.3 @@ -266,6 +266,16 @@ void stop_domain(void)
    13.4      unlazy_fpu(current);
    13.5      wmb(); /* All CPUs must see saved info in state TASK_STOPPED. */
    13.6      set_current_state(TASK_STOPPED);
    13.7 +
    13.8 +    /* OK, this is grim, but helps speed up live migrate. When a domain stops,
    13.9 +       kick Dom0 */
   13.10 +    {
   13.11 +	struct task_struct *p;
   13.12 +	printk("S\n");
   13.13 +	guest_schedule_to_run( p = find_domain_by_id(0ULL) );
   13.14 +	put_task_struct(p);
   13.15 +    }
   13.16 +
   13.17      __enter_scheduler();
   13.18  }
   13.19  
    14.1 --- a/xen/common/schedule.c	Thu May 20 17:18:28 2004 +0000
    14.2 +++ b/xen/common/schedule.c	Thu May 20 17:43:25 2004 +0000
    14.3 @@ -27,6 +27,9 @@
    14.4  #include <xen/perfc.h>
    14.5  #include <xen/sched-if.h>
    14.6  #include <hypervisor-ifs/sched_ctl.h>
    14.7 +
    14.8 +#undef  TRACE_BUFFER
    14.9 +
   14.10  #include <xen/trace.h>
   14.11  
   14.12  /*#define WAKEUP_HISTO*/
   14.13 @@ -216,7 +219,7 @@ void wake_up(struct task_struct *p)
   14.14  /* 
   14.15   * Block the currently-executing domain until a pertinent event occurs.
   14.16   */
   14.17 -static long do_block(void)
   14.18 +long do_block(void)
   14.19  {
   14.20      ASSERT(current->domain != IDLE_DOMAIN_ID);
   14.21      current->shared_info->vcpu_data[0].evtchn_upcall_mask = 0;
    15.1 --- a/xen/common/shadow.c	Thu May 20 17:18:28 2004 +0000
    15.2 +++ b/xen/common/shadow.c	Thu May 20 17:43:25 2004 +0000
    15.3 @@ -109,43 +109,68 @@ static void __free_shadow_table( struct 
    15.4      SH_LOG("Free shadow table. Freed= %d",free);
    15.5  }
    15.6  
    15.7 -static inline int shadow_page_op( struct mm_struct *m, unsigned int op,
    15.8 -                                  struct pfn_info *spfn_info )
    15.9 +
   15.10 +#define TABLE_OP_ZERO_L2 1
   15.11 +#define TABLE_OP_ZERO_L1 2
   15.12 +#define TABLE_OP_FREE_L1 3
   15.13 +
   15.14 +static inline int shadow_page_op( struct mm_struct *m, unsigned int op, 
   15.15 +								  unsigned int gpfn,
   15.16 +                                  struct pfn_info *spfn_info, int *work )
   15.17  {
   15.18 -    int work = 0;
   15.19      unsigned int spfn = spfn_info-frame_table;
   15.20 +	int restart = 0;
   15.21  
   15.22      switch( op )
   15.23      {
   15.24 -    case DOM0_SHADOW_CONTROL_OP_CLEAN:
   15.25 -    {
   15.26 -        int i;
   15.27 -        if ( (spfn_info->type_and_flags & PGT_type_mask) == 
   15.28 +	case TABLE_OP_ZERO_L2:
   15.29 +	{
   15.30 +		if ( (spfn_info->type_and_flags & PGT_type_mask) == 
   15.31 +             PGT_l2_page_table )
   15.32 +		{
   15.33 +			unsigned long * spl1e = map_domain_mem( spfn<<PAGE_SHIFT );
   15.34 +			memset( spl1e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(*spl1e) );
   15.35 +			unmap_domain_mem( spl1e );
   15.36 +		}
   15.37 +    }
   15.38 +	break;
   15.39 +	
   15.40 +	case TABLE_OP_ZERO_L1:
   15.41 +	{
   15.42 +		if ( (spfn_info->type_and_flags & PGT_type_mask) == 
   15.43               PGT_l1_page_table )
   15.44 -        {
   15.45 -            unsigned long * spl1e = map_domain_mem( spfn<<PAGE_SHIFT );
   15.46 -
   15.47 -            for (i=0;i<ENTRIES_PER_L1_PAGETABLE;i++)
   15.48 -            {                    
   15.49 -                if ( (spl1e[i] & _PAGE_PRESENT ) && (spl1e[i] & _PAGE_RW) )
   15.50 -                {
   15.51 -                    work++;
   15.52 -                    spl1e[i] &= ~_PAGE_RW;
   15.53 -                }
   15.54 -            }
   15.55 -            unmap_domain_mem( spl1e );
   15.56 -        }
   15.57 +		{
   15.58 +			unsigned long * spl1e = map_domain_mem( spfn<<PAGE_SHIFT );
   15.59 +			memset( spl1e, 0, ENTRIES_PER_L1_PAGETABLE * sizeof(*spl1e) );
   15.60 +			unmap_domain_mem( spl1e );
   15.61 +		}
   15.62      }
   15.63  	break;
   15.64  
   15.65 +	case TABLE_OP_FREE_L1:
   15.66 +	{
   15.67 +		if ( (spfn_info->type_and_flags & PGT_type_mask) == 
   15.68 +             PGT_l1_page_table )
   15.69 +		{
   15.70 +			// lock is already held
   15.71 +			delete_shadow_status( m, gpfn );
   15.72 +			restart = 1; // we need to go to start of list again
   15.73 +		}
   15.74      }
   15.75 -    return work;
   15.76 +
   15.77 +	break;
   15.78 +	
   15.79 +	default:
   15.80 +		BUG();
   15.81 +
   15.82 +    }
   15.83 +    return restart;
   15.84  }
   15.85  
   15.86  static void __scan_shadow_table( struct mm_struct *m, unsigned int op )
   15.87  {
   15.88      int j, work=0;
   15.89 -    struct shadow_status *a;
   15.90 +    struct shadow_status *a, *next;
   15.91   
   15.92      // the code assumes you're not using the page tables i.e.
   15.93      // the domain is stopped and cr3 is something else!!
   15.94 @@ -156,16 +181,25 @@ static void __scan_shadow_table( struct 
   15.95  
   15.96      for(j=0;j<shadow_ht_buckets;j++)
   15.97      {
   15.98 -        a = &m->shadow_ht[j];        
   15.99 +	retry:
  15.100 +        a = &m->shadow_ht[j];     
  15.101 +		next = a->next;
  15.102          if (a->pfn)
  15.103          {
  15.104 -            work += shadow_page_op( m, op, &frame_table[a->spfn_and_flags & PSH_pfn_mask] );
  15.105 +            if ( shadow_page_op( m, op, a->pfn,								 
  15.106 +								 &frame_table[a->spfn_and_flags & PSH_pfn_mask], 
  15.107 +								 &work ) )
  15.108 +				goto retry;
  15.109          }
  15.110 -        a=a->next;
  15.111 +        a=next;
  15.112          while(a)
  15.113          { 
  15.114 -            work += shadow_page_op( m, op, &frame_table[a->spfn_and_flags & PSH_pfn_mask] );
  15.115 -            a=a->next;
  15.116 +			next = a->next;
  15.117 +            if ( shadow_page_op( m, op, a->pfn,
  15.118 +								 &frame_table[a->spfn_and_flags & PSH_pfn_mask],
  15.119 +								 &work ) )
  15.120 +				goto retry;
  15.121 +            a=next;
  15.122          }
  15.123          shadow_audit(m,0);
  15.124      }
  15.125 @@ -301,16 +335,29 @@ static int shadow_mode_table_op( struct 
  15.126      switch(op)
  15.127      {
  15.128      case DOM0_SHADOW_CONTROL_OP_FLUSH:
  15.129 -        __free_shadow_table( m );
  15.130 +        // XXX THIS IS VERY DANGEROUS : MUST ENSURE THE PTs ARE NOT IN USE ON
  15.131 +		// OTHER CPU -- fix when we get sched sync pause.
  15.132 +        __free_shadow_table( m );  
  15.133          break;
  15.134     
  15.135 -    case DOM0_SHADOW_CONTROL_OP_CLEAN:
   15.136 +    case DOM0_SHADOW_CONTROL_OP_CLEAN:   // zero all non-hypervisor
  15.137 +	{
  15.138 +		__scan_shadow_table( m, TABLE_OP_ZERO_L2 );
  15.139 +		__scan_shadow_table( m, TABLE_OP_ZERO_L1 );
  15.140 +
  15.141 +		goto send_bitmap;
  15.142 +	}
  15.143 +		
  15.144 +
  15.145 +    case DOM0_SHADOW_CONTROL_OP_CLEAN2:  // zero all L2, free L1s
  15.146      {
  15.147  		int i,j,zero=1;
  15.148  		
  15.149 -		__scan_shadow_table( m, op );
  15.150 -		//    __free_shadow_table( m );
  15.151 -	
  15.152 +		__scan_shadow_table( m, TABLE_OP_ZERO_L2 );
  15.153 +		__scan_shadow_table( m, TABLE_OP_FREE_L1 );
  15.154 +		
  15.155 +	send_bitmap:
  15.156 +
  15.157  		if( p->tot_pages > sc->pages || 
  15.158  			!sc->dirty_bitmap || !p->mm.shadow_dirty_bitmap )
  15.159  		{
  15.160 @@ -350,6 +397,38 @@ static int shadow_mode_table_op( struct 
  15.161  
  15.162  		break;
  15.163      }
  15.164 +
  15.165 +    case DOM0_SHADOW_CONTROL_OP_PEEK:
  15.166 +    {
  15.167 +		int i;
  15.168 +	
  15.169 +		if( p->tot_pages > sc->pages || 
  15.170 +			!sc->dirty_bitmap || !p->mm.shadow_dirty_bitmap )
  15.171 +		{
  15.172 +			rc = -EINVAL;
  15.173 +			goto out;
  15.174 +		}
  15.175 +	
  15.176 +		sc->pages = p->tot_pages;
  15.177 +	
  15.178 +#define chunk (8*1024) // do this in 1KB chunks for L1 cache
  15.179 +	
  15.180 +		for(i=0;i<p->tot_pages;i+=chunk)
  15.181 +		{
  15.182 +			int bytes = ((  ((p->tot_pages-i) > (chunk))?
  15.183 +							(chunk):(p->tot_pages-i) ) + 7) / 8;
  15.184 +	    
  15.185 +			copy_to_user( sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
  15.186 +						  p->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
  15.187 +						  bytes );	    
  15.188 +		}
  15.189 +
  15.190 +		break;
  15.191 +    }
  15.192 +
  15.193 +	default:
  15.194 +		BUG();
  15.195 +
  15.196      }
  15.197  
  15.198  
  15.199 @@ -386,7 +465,7 @@ int shadow_mode_control( struct task_str
  15.200          if(p->mm.shadow_mode) shadow_mode_disable(p);
  15.201          shadow_mode_enable(p, SHM_logdirty);
  15.202      } 
  15.203 -    else if ( p->mm.shadow_mode && cmd >= DOM0_SHADOW_CONTROL_OP_FLUSH && cmd<=DOM0_SHADOW_CONTROL_OP_CLEAN )
  15.204 +    else if ( p->mm.shadow_mode && cmd >= DOM0_SHADOW_CONTROL_OP_FLUSH && cmd<=DOM0_SHADOW_CONTROL_OP_CLEAN2 )
  15.205      {
  15.206          rc = shadow_mode_table_op(p, sc);
  15.207      }
    16.1 --- a/xen/common/trace.c	Thu May 20 17:18:28 2004 +0000
    16.2 +++ b/xen/common/trace.c	Thu May 20 17:43:25 2004 +0000
    16.3 @@ -27,7 +27,6 @@
    16.4  #include <xen/sched.h>
    16.5  #include <xen/slab.h>
    16.6  #include <xen/smp.h>
    16.7 -#include <xen/spinlock.h>
    16.8  #include <xen/trace.h>
    16.9  #include <xen/errno.h>
   16.10  #include <asm/atomic.h>
   16.11 @@ -86,7 +85,6 @@ void init_trace_bufs(void)
   16.12          /* For use in Xen. */
   16.13          buf->vdata    = (struct t_rec *)(buf+1);
   16.14          buf->head_ptr = buf->vdata;
   16.15 -        spin_lock_init(&buf->lock);
   16.16          
   16.17          /* For use in user space. */
   16.18          buf->data = (struct t_rec *)__pa(buf->vdata);
    17.1 --- a/xen/include/hypervisor-ifs/dom0_ops.h	Thu May 20 17:18:28 2004 +0000
    17.2 +++ b/xen/include/hypervisor-ifs/dom0_ops.h	Thu May 20 17:43:25 2004 +0000
    17.3 @@ -74,6 +74,9 @@ typedef struct dom0_stopdomain_st
    17.4  {
    17.5      /* IN parameters. */
    17.6      domid_t domain;
    17.7 +    /* hack to indicate that you want to wait for other domain -- replace
     17.8 +       with proper synchronous stop soon! */
    17.9 +    int     sync;  
   17.10  } dom0_stopdomain_t;
   17.11  
   17.12  #define DOM0_GETDOMAININFO    12
   17.13 @@ -236,8 +239,10 @@ typedef struct dom0_sched_id_st
   17.14  #define DOM0_SHADOW_CONTROL_OP_OFF         0
   17.15  #define DOM0_SHADOW_CONTROL_OP_ENABLE_TEST 1
   17.16  #define DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY 2
   17.17 -#define DOM0_SHADOW_CONTROL_OP_FLUSH       10
   17.18 +#define DOM0_SHADOW_CONTROL_OP_FLUSH       10     /* table ops */
   17.19  #define DOM0_SHADOW_CONTROL_OP_CLEAN       11
   17.20 +#define DOM0_SHADOW_CONTROL_OP_PEEK        12
   17.21 +#define DOM0_SHADOW_CONTROL_OP_CLEAN2      13
   17.22  typedef struct dom0_shadow_control_st
   17.23  {
   17.24      /* IN variables. */
    18.1 --- a/xen/include/hypervisor-ifs/trace.h	Thu May 20 17:18:28 2004 +0000
    18.2 +++ b/xen/include/hypervisor-ifs/trace.h	Thu May 20 17:43:25 2004 +0000
    18.3 @@ -20,13 +20,12 @@ struct t_buf {
    18.4      struct t_rec *data;     /* pointer to data area.  physical address
    18.5                               * for convenience in user space code            */
    18.6  
    18.7 -    unsigned int size;      /* size of the data area, in t_recs              */
    18.8 -    unsigned int head;      /* array index of the most recent record         */
    18.9 +    unsigned long size;      /* size of the data area, in t_recs              */
   18.10 +    unsigned long head;      /* array index of the most recent record         */
   18.11  
   18.12  #ifdef __KERNEL__
   18.13      struct t_rec *head_ptr; /* pointer to the head record                    */
   18.14      struct t_rec *vdata;    /* virtual address pointer to data               */
   18.15 -    spinlock_t lock;        /* ensure mutually exlusive access (for inserts) */
   18.16  #endif
   18.17  
   18.18      /* never add anything here - the kernel stuff must be the last elements */
    19.1 --- a/xen/include/xen/trace.h	Thu May 20 17:18:28 2004 +0000
    19.2 +++ b/xen/include/xen/trace.h	Thu May 20 17:43:25 2004 +0000
    19.3 @@ -61,11 +61,13 @@ static inline int trace(u32 event, u32 d
    19.4      if ( !tb_init_done )
    19.5          return -1;
    19.6  
    19.7 +
    19.8      buf = t_bufs[smp_processor_id()];
    19.9 +
   19.10 +    local_irq_save(flags);
   19.11 +
   19.12      rec = buf->head_ptr;
   19.13  
   19.14 -    spin_lock_irqsave(&buf->lock, flags);
   19.15 -
   19.16      rdtscll(rec->cycles);
   19.17      rec->event = event;
   19.18      rec->d1 = d1;
   19.19 @@ -76,18 +78,12 @@ static inline int trace(u32 event, u32 d
   19.20  
   19.21      wmb(); /* above must be visible before reader sees index updated */
   19.22  
   19.23 -    if ( likely(buf->head_ptr < (buf->vdata + buf->size - 1)) )
   19.24 -    {
   19.25 -        buf->head_ptr++;
   19.26 -        buf->head++;
   19.27 -    }
   19.28 -    else
   19.29 -    {
   19.30 -        buf->head = 0;
   19.31 +    buf->head_ptr++;
   19.32 +    buf->head++;
   19.33 +    if ( buf->head_ptr == (buf->vdata + (buf->size-1)) )
   19.34          buf->head_ptr = buf->vdata;
   19.35 -    }
   19.36  
   19.37 -    spin_unlock_irqrestore(&buf->lock, flags);
   19.38 +    local_irq_restore(flags);
   19.39      
   19.40      return 0;
   19.41  }
    20.1 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/network/network.c	Thu May 20 17:18:28 2004 +0000
    20.2 +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/network/network.c	Thu May 20 17:43:25 2004 +0000
    20.3 @@ -381,6 +381,10 @@ xc_linux_save  */
    20.4  	        /* With live migrate, we even get these...  Disable for now. */
    20.5                  // printk(KERN_ALERT "bad buffer on RX ring!(%d)\n", rx->status);
    20.6              }
    20.7 +	    else
    20.8 +		phys_to_machine_mapping[virt_to_phys(skb->head) >> PAGE_SHIFT] =
    20.9 +		    0x80000004; // disown this page -- it was a flush
   20.10 +
   20.11              dev_kfree_skb_any(skb);
   20.12              continue;
   20.13          }