ia64/xen-unstable

changeset 19646:f210a633571c

Transcendent memory ("tmem") for Xen.

Tmem, when called from a tmem-capable (paravirtualized) guest, makes
use of otherwise unutilized ("fallow") memory to create and manage
pools of pages that can be accessed from the guest either as
"ephemeral" pages or as "persistent" pages. In either case, the pages
are not directly addressable by the guest; they can only be copied to
and from guest memory via the tmem interface. Ephemeral pages are a
nice place for a guest to
put recently evicted clean pages that it might need again; these pages
can be reclaimed synchronously by Xen for other guests or other uses.
Persistent pages are a nice place for a guest to put "swap" pages to
avoid sending them to disk. These pages retain data as long as the
guest lives, but they count against the guest's memory allocation.

Tmem pages may optionally be compressed and, in certain cases, can be
shared between guests. Tmem also handles concurrency gracefully and
provides limited quality-of-service (QoS) settings to combat malicious
denial-of-service (DoS) attempts.
Save/restore and live migration support is not yet provided.

Tmem is primarily targeted at a 64-bit x86 hypervisor. On a 32-bit x86
hypervisor it has limited functionality and testing due to limitations
of the Xen heap. Nearly all of tmem is architecture-independent; once
the three remaining routines are ported to ia64, it should work on
that architecture too. It is also structured to be portable to non-Xen
environments.

Tmem defaults to off (for now) and must be enabled with the "tmem" Xen
boot option; even then, it does nothing unless a tmem-capable guest is
running. The "tmem_compress" boot option enables compression, which
takes roughly 10x more CPU but approximately doubles the number of
pages that can be stored.

Tmem can be controlled via several "xm" commands, and many interesting
tmem statistics can be obtained. A README and internal specification
will follow, but lots of useful prose about tmem, as well as Linux
patches, can be found at http://oss.oracle.com/projects/tmem .

Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
author Keir Fraser <keir.fraser@citrix.com>
date Tue May 26 11:05:04 2009 +0100 (2009-05-26)
parents 9749e8f6e3d8
children 1c627434605e
files .hgignore tools/libxc/Makefile tools/libxc/xc_tmem.c tools/libxc/xenctrl.h tools/misc/Makefile tools/misc/xen-tmem-list-parse.c tools/python/xen/lowlevel/xc/xc.c tools/python/xen/xend/XendAPI.py tools/python/xen/xend/XendConstants.py tools/python/xen/xend/XendNode.py tools/python/xen/xend/balloon.py tools/python/xen/xend/server/XMLRPCServer.py tools/python/xen/xm/main.py xen/arch/ia64/xen/mm.c xen/arch/x86/mm.c xen/arch/x86/setup.c xen/common/Makefile xen/common/compat/Makefile xen/common/compat/tmem_xen.c xen/common/domain.c xen/common/lzo.c xen/common/memory.c xen/common/page_alloc.c xen/common/radix-tree.c xen/common/rbtree.c xen/common/spinlock.c xen/common/tmem.c xen/common/tmem_xen.c xen/common/xmalloc_tlsf.c xen/include/Makefile xen/include/asm-ia64/mm.h xen/include/asm-x86/mm.h xen/include/asm-x86/spinlock.h xen/include/public/tmem.h xen/include/public/xen.h xen/include/xen/config.h xen/include/xen/hash.h xen/include/xen/hypercall.h xen/include/xen/lib.h xen/include/xen/lzo.h xen/include/xen/mm.h xen/include/xen/radix-tree.h xen/include/xen/rbtree.h xen/include/xen/sched.h xen/include/xen/spinlock.h xen/include/xen/tmem.h xen/include/xen/tmem_xen.h xen/include/xen/xmalloc.h xen/include/xlat.lst
line diff
     1.1 --- a/.hgignore	Tue May 26 10:14:34 2009 +0100
     1.2 +++ b/.hgignore	Tue May 26 11:05:04 2009 +0100
     1.3 @@ -181,6 +181,7 @@
     1.4  ^tools/misc/xc_shadow$
     1.5  ^tools/misc/xen_cpuperf$
     1.6  ^tools/misc/xen-detect$
     1.7 +^tools/misc/xen-tmem-list-parse$
     1.8  ^tools/misc/xenperf$
     1.9  ^tools/misc/xenpm$
    1.10  ^tools/pygrub/build/.*$
     2.1 --- a/tools/libxc/Makefile	Tue May 26 10:14:34 2009 +0100
     2.2 +++ b/tools/libxc/Makefile	Tue May 26 11:05:04 2009 +0100
     2.3 @@ -21,6 +21,7 @@ CTRL_SRCS-y       += xc_tbuf.c
     2.4  CTRL_SRCS-y       += xc_pm.c
     2.5  CTRL_SRCS-y       += xc_cpu_hotplug.c
     2.6  CTRL_SRCS-y       += xc_resume.c
     2.7 +CTRL_SRCS-y       += xc_tmem.c
     2.8  CTRL_SRCS-$(CONFIG_X86) += xc_pagetab.c
     2.9  CTRL_SRCS-$(CONFIG_Linux) += xc_linux.c
    2.10  CTRL_SRCS-$(CONFIG_SunOS) += xc_solaris.c
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/tools/libxc/xc_tmem.c	Tue May 26 11:05:04 2009 +0100
     3.3 @@ -0,0 +1,83 @@
     3.4 +/******************************************************************************
     3.5 + * xc_tmem.c
     3.6 + *
     3.7 + * Copyright (C) 2008 Oracle Corp.
     3.8 + */
     3.9 +
    3.10 +#include "xc_private.h"
    3.11 +#include <xen/tmem.h>
    3.12 +
    3.13 +static int do_tmem_op(int xc, tmem_op_t *op)
    3.14 +{
    3.15 +    int ret;
    3.16 +    DECLARE_HYPERCALL;
    3.17 +
    3.18 +    hypercall.op = __HYPERVISOR_tmem_op;
    3.19 +    hypercall.arg[0] = (unsigned long)op;
    3.20 +    if (lock_pages(op, sizeof(*op)) != 0)
    3.21 +    {
    3.22 +        PERROR("Could not lock memory for Xen hypercall");
    3.23 +        return -EFAULT;
    3.24 +    }
    3.25 +    if ((ret = do_xen_hypercall(xc, &hypercall)) < 0)
    3.26 +    {
    3.27 +        if ( errno == EACCES )
    3.28 +            DPRINTF("tmem operation failed -- need to"
    3.29 +                    " rebuild the user-space tool set?\n");
    3.30 +    }
    3.31 +    unlock_pages(op, sizeof(*op));
    3.32 +
    3.33 +    return ret;
    3.34 +}
    3.35 +
    3.36 +int xc_tmem_control(int xc,
    3.37 +                    int32_t pool_id,
    3.38 +                    uint32_t subop,
    3.39 +                    uint32_t cli_id,
    3.40 +                    uint32_t arg1,
    3.41 +                    uint32_t arg2,
    3.42 +                    void *buf)
    3.43 +{
    3.44 +    tmem_op_t op;
    3.45 +    int rc;
    3.46 +
    3.47 +    op.cmd = TMEM_CONTROL;
    3.48 +    op.pool_id = pool_id;
    3.49 +    op.subop = subop;
    3.50 +    op.cli_id = cli_id;
    3.51 +    op.arg1 = arg1;
    3.52 +    op.arg2 = arg2;
    3.53 +    op.buf.p = buf;
    3.54 +
    3.55 +    if (subop == TMEMC_LIST) {
    3.56 +        if ((arg1 != 0) && (lock_pages(buf, arg1) != 0))
    3.57 +        {
    3.58 +            PERROR("Could not lock memory for Xen hypercall");
    3.59 +            return -ENOMEM;
    3.60 +        }
    3.61 +    }
    3.62 +
    3.63 +#ifdef VALGRIND
    3.64 +    if (arg1 != 0)
    3.65 +        memset(buf, 0, arg1);
    3.66 +#endif
    3.67 +
    3.68 +    rc = do_tmem_op(xc, &op);
    3.69 +
    3.70 +    if (subop == TMEMC_LIST) {
    3.71 +        if (arg1 != 0)
    3.72 +            unlock_pages(buf, arg1);
    3.73 +    }
    3.74 +
    3.75 +    return rc;
    3.76 +}
    3.77 +
    3.78 +/*
    3.79 + * Local variables:
    3.80 + * mode: C
    3.81 + * c-set-style: "BSD"
    3.82 + * c-basic-offset: 4
    3.83 + * tab-width: 4
    3.84 + * indent-tabs-mode: nil
    3.85 + * End:
    3.86 + */
     4.1 --- a/tools/libxc/xenctrl.h	Tue May 26 10:14:34 2009 +0100
     4.2 +++ b/tools/libxc/xenctrl.h	Tue May 26 11:05:04 2009 +0100
     4.3 @@ -1267,4 +1267,15 @@ int xc_get_vcpu_migration_delay(int xc_h
     4.4  int xc_get_cpuidle_max_cstate(int xc_handle, uint32_t *value);
     4.5  int xc_set_cpuidle_max_cstate(int xc_handle, uint32_t value);
     4.6  
     4.7 +/**
     4.8 + * tmem operations
     4.9 + */
    4.10 +int xc_tmem_control(int xc,
    4.11 +                    int32_t pool_id,
    4.12 +                    uint32_t subop,
    4.13 +                    uint32_t cli_id,
    4.14 +                    uint32_t arg1,
    4.15 +                    uint32_t arg2,
    4.16 +                    void *buf);
    4.17 +
    4.18  #endif /* XENCTRL_H */
     5.1 --- a/tools/misc/Makefile	Tue May 26 10:14:34 2009 +0100
     5.2 +++ b/tools/misc/Makefile	Tue May 26 11:05:04 2009 +0100
     5.3 @@ -10,7 +10,7 @@ CFLAGS   += $(INCLUDES)
     5.4  
     5.5  HDRS     = $(wildcard *.h)
     5.6  
     5.7 -TARGETS-y := xenperf xenpm
     5.8 +TARGETS-y := xenperf xenpm xen-tmem-list-parse
     5.9  TARGETS-$(CONFIG_X86) += xen-detect
    5.10  TARGETS := $(TARGETS-y)
    5.11  
    5.12 @@ -22,7 +22,7 @@ INSTALL_BIN-y := xencons
    5.13  INSTALL_BIN-$(CONFIG_X86) += xen-detect
    5.14  INSTALL_BIN := $(INSTALL_BIN-y)
    5.15  
    5.16 -INSTALL_SBIN-y := xm xen-bugtool xen-python-path xend xenperf xsview xenpm
    5.17 +INSTALL_SBIN-y := xm xen-bugtool xen-python-path xend xenperf xsview xenpm xen-tmem-list-parse
    5.18  INSTALL_SBIN := $(INSTALL_SBIN-y)
    5.19  
    5.20  DEFAULT_PYTHON_PATH := $(shell $(XEN_ROOT)/tools/python/get-path)
     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/tools/misc/xen-tmem-list-parse.c	Tue May 26 11:05:04 2009 +0100
     6.3 @@ -0,0 +1,288 @@
     6.4 +/*
     6.5 + * Parse output from tmem-list and reformat to human-readable
     6.6 + *
     6.7 + * NOTE: NEVER delete a parse call as this file documents backwards
     6.8 + * compatibility for older versions of tmem-list and we don't want to
     6.9 + * accidentally reuse an old tag
    6.10 + *
    6.11 + * Copyright (c) 2009, Dan Magenheimer, Oracle Corp.
    6.12 + */
    6.13 +
    6.14 +#include <stdio.h>
    6.15 +#include <unistd.h>
    6.16 +#include <string.h>
    6.17 +
    6.18 +#define BUFSIZE 4096
    6.19 +#define PAGE_SIZE 4096
    6.20 +
    6.21 +unsigned long long parse(char *s,char *match)
    6.22 +{
    6.23 +    char *s1 = strstr(s,match);
    6.24 +    unsigned long long ret = 0;   /* default if sscanf() finds no number */
    6.25 +
    6.26 +    if ( s1 == NULL )
    6.27 +        return 0LL;
    6.28 +    s1 += 2;
    6.29 +    if ( *s1++ != ':' )
    6.30 +        return 0LL;
    6.31 +    sscanf(s1,"%llu",&ret);
    6.32 +    return ret;
    6.33 +}
    6.34 +
    6.35 +unsigned long long parse2(char *s,char *match1, char *match2)
    6.36 +{
    6.37 +    char match[3];
    6.38 +    match[0] = *match1;
    6.39 +    match[1] = *match2;
    6.40 +    match[2] = '\0';
    6.41 +    return parse(s,match);
    6.42 +}
    6.43 +
    6.44 +void parse_string(char *s,char *match, char *buf, int len)
    6.45 +{
    6.46 +    char *s1 = strstr(s,match);
    6.47 +    int i;
    6.48 +
    6.49 +    if ( s1 == NULL )
    6.50 +        return;
    6.51 +    s1 += 2;
    6.52 +    if ( *s1++ != ':' )
    6.53 +        return;
    6.54 +    for ( i = 0; i < len; i++ )
    6.55 +        *buf++ = *s1++;
    6.56 +}
    6.57 +
    6.58 +void parse_sharers(char *s, char *match, char *buf, int len)
    6.59 +{
    6.60 +    char *s1 = strstr(s,match);
    6.61 +    char *b = buf;
    6.62 +
    6.63 +    if ( s1 == NULL )
    6.64 +        return;
    6.65 +    while ( s1 )
    6.66 +    {
    6.67 +        s1 += 2;
    6.68 +        if (*s1++ != ':')
    6.69 +            return;
    6.70 +        while (*s1 >= '0' && *s1 <= '9')
    6.71 +            *b++ = *s1++;
    6.72 +        *b++ = ',';
    6.73 +        s1 = strstr(s1,match);
    6.74 +    }
    6.75 +    if ( b != buf )
    6.76 +        *--b = '\0';
    6.77 +}
    6.78 +
    6.79 +void parse_global(char *s)
    6.80 +{
    6.81 +    unsigned long long total_ops = parse(s,"Tt");
    6.82 +    unsigned long long errored_ops = parse(s,"Te");
    6.83 +    unsigned long long failed_copies = parse(s,"Cf");
    6.84 +    unsigned long long alloc_failed = parse(s,"Af");
    6.85 +    unsigned long long alloc_page_failed = parse(s,"Pf");
    6.86 +    unsigned long long avail_pages = parse(s,"Ta");
    6.87 +    unsigned long long low_on_memory = parse(s,"Lm");
    6.88 +    unsigned long long evicted_pgs = parse(s,"Et");
    6.89 +    unsigned long long evict_attempts = parse(s,"Ea");
    6.90 +    unsigned long long relinq_pgs = parse(s,"Rt");
    6.91 +    unsigned long long relinq_attempts = parse(s,"Ra");
    6.92 +    unsigned long long max_evicts_per_relinq = parse(s,"Rx");
    6.93 +    unsigned long long total_flush_pool = parse(s,"Fp");
    6.94 +    unsigned long long global_eph_count = parse(s,"Ec");
    6.95 +    unsigned long long global_eph_max = parse(s,"Em");
    6.96 +    unsigned long long obj_count = parse(s,"Oc");
    6.97 +    unsigned long long obj_max = parse(s,"Om");
    6.98 +    unsigned long long rtree_node_count = parse(s,"Nc");
    6.99 +    unsigned long long rtree_node_max = parse(s,"Nm");
   6.100 +    unsigned long long pgp_count = parse(s,"Pc");
   6.101 +    unsigned long long pgp_max = parse(s,"Pm");
   6.102 +
   6.103 +    printf("total tmem ops=%llu (errors=%llu) -- tmem pages avail=%llu\n",
   6.104 +           total_ops, errored_ops, avail_pages);
   6.105 +    printf("datastructs: objs=%llu (max=%llu) pgps=%llu (max=%llu) "
   6.106 +           "nodes=%llu (max=%llu)\n",
   6.107 +           obj_count, obj_max, pgp_count, pgp_max,
   6.108 +           rtree_node_count, rtree_node_max);
   6.109 +    printf("misc: failed_copies=%llu alloc_failed=%llu alloc_page_failed=%llu "
   6.110 +           "low_mem=%llu evicted=%llu/%llu relinq=%llu/%llu, "
   6.111 +           "max_evicts_per_relinq=%llu, flush_pools=%llu, "
   6.112 +           "eph_count=%llu, eph_max=%llu\n",
   6.113 +           failed_copies, alloc_failed, alloc_page_failed, low_on_memory,
   6.114 +           evicted_pgs, evict_attempts, relinq_pgs, relinq_attempts,
   6.115 +           max_evicts_per_relinq, total_flush_pool,
   6.116 +           global_eph_count, global_eph_max);
   6.117 +}
   6.118 +
   6.119 +#define PARSE_CYC_COUNTER(s,x,prefix) unsigned long long \
   6.120 +   x##_count = parse2(s,prefix,"n"), \
   6.121 +   x##_sum_cycles = parse2(s,prefix,"t"), \
   6.122 +   x##_max_cycles = parse2(s,prefix,"x"), \
   6.123 +   x##_min_cycles = parse2(s,prefix,"m")
   6.124 +#define PRINTF_CYC_COUNTER(x,text) \
   6.125 +  if (x##_count) printf(text" avg=%llu, max=%llu, " \
   6.126 +  "min=%llu, samples=%llu\n", \
   6.127 +  x##_sum_cycles ? (x##_sum_cycles/x##_count) : 0, \
   6.128 +  x##_max_cycles, x##_min_cycles, x##_count)
   6.129 +
   6.130 +void parse_time_stats(char *s)
   6.131 +{
   6.132 +    PARSE_CYC_COUNTER(s,succ_get,"G");
   6.133 +    PARSE_CYC_COUNTER(s,succ_put,"P");
   6.134 +    PARSE_CYC_COUNTER(s,non_succ_get,"g");
   6.135 +    PARSE_CYC_COUNTER(s,non_succ_put,"p");
   6.136 +    PARSE_CYC_COUNTER(s,flush,"F");
   6.137 +    PARSE_CYC_COUNTER(s,flush_obj,"O");
   6.138 +    PARSE_CYC_COUNTER(s,pg_copy,"C");
   6.139 +    PARSE_CYC_COUNTER(s,compress,"c");
   6.140 +    PARSE_CYC_COUNTER(s,decompress,"d");
   6.141 +
   6.142 +    PRINTF_CYC_COUNTER(succ_get,"succ get cycles:");
   6.143 +    PRINTF_CYC_COUNTER(succ_put,"succ put cycles:");
   6.144 +    PRINTF_CYC_COUNTER(non_succ_get,"failed get cycles:");
   6.145 +    PRINTF_CYC_COUNTER(non_succ_put,"failed put cycles:");
   6.146 +    PRINTF_CYC_COUNTER(flush,"flush cycles:");
   6.147 +    PRINTF_CYC_COUNTER(flush_obj,"flush_obj cycles:");
   6.148 +    PRINTF_CYC_COUNTER(pg_copy,"page copy cycles:");
   6.149 +    PRINTF_CYC_COUNTER(compress,"compression cycles:");
   6.150 +    PRINTF_CYC_COUNTER(decompress,"decompression cycles:");
   6.151 +}
   6.152 +
   6.153 +void parse_client(char *s)
   6.154 +{
   6.155 +    unsigned long cli_id = parse(s,"CI");
   6.156 +    unsigned long weight = parse(s,"ww");
   6.157 +    unsigned long cap = parse(s,"ca");
   6.158 +    unsigned long compress = parse(s,"co");
   6.159 +    unsigned long frozen = parse(s,"fr");
   6.160 +    unsigned long long eph_count = parse(s,"Ec");
   6.161 +    unsigned long long max_eph_count = parse(s,"Em");
   6.162 +    unsigned long long compressed_pages = parse(s,"cp");
   6.163 +    unsigned long long compressed_sum_size = parse(s,"cb");
   6.164 +    unsigned long long compress_poor = parse(s,"cn");
   6.165 +    unsigned long long compress_nomem = parse(s,"cm");
   6.166 +
   6.167 +    printf("domid%lu: weight=%lu,cap=%lu,compress=%d,frozen=%d,"
   6.168 +           "eph_count=%llu,max_eph=%llu,"
   6.169 +           "compression ratio=%lu%% (samples=%llu,poor=%llu,nomem=%llu)\n",
   6.170 +           cli_id, weight, cap, compress?1:0, frozen?1:0,
   6.171 +           eph_count, max_eph_count,
   6.172 +           compressed_pages ?  (long)((compressed_sum_size*100LL) /
   6.173 +                                      (compressed_pages*PAGE_SIZE)) : 0,
   6.174 +           compressed_pages, compress_poor, compress_nomem);
   6.175 +
   6.176 +}
   6.177 +
   6.178 +void parse_pool(char *s)
   6.179 +{
    6.180 +    char pool_type[3] = "";   /* zero-init: parse_string() does not NUL-terminate */
   6.181 +    unsigned long cli_id = parse(s,"CI");
   6.182 +    unsigned long pool_id = parse(s,"PI");
   6.183 +    unsigned long long pgp_count = parse(s,"Pc");
   6.184 +    unsigned long long max_pgp_count = parse(s,"Pm");
   6.185 +    unsigned long long obj_count = parse(s,"Oc");
   6.186 +    unsigned long long max_obj_count = parse(s,"Om");
   6.187 +    unsigned long long objnode_count = parse(s,"Nc");
   6.188 +    unsigned long long max_objnode_count = parse(s,"Nm");
   6.189 +    unsigned long long good_puts = parse(s,"ps");
   6.190 +    unsigned long long puts = parse(s,"pt");
   6.191 +    unsigned long long no_mem_puts = parse(s,"px");
   6.192 +    unsigned long long dup_puts_flushed = parse(s,"pd");
   6.193 +    unsigned long long dup_puts_replaced = parse(s,"pr");
   6.194 +    unsigned long long found_gets = parse(s,"gs");
   6.195 +    unsigned long long gets = parse(s,"gt");
   6.196 +    unsigned long long flushs_found = parse(s,"fs");
   6.197 +    unsigned long long flushs = parse(s,"ft");
   6.198 +    unsigned long long flush_objs_found = parse(s,"os");
   6.199 +    unsigned long long flush_objs = parse(s,"ot");
   6.200 +
   6.201 +    parse_string(s,"PT",pool_type,2);
   6.202 +    printf("domid%lu,id%lu[%s]:pgp=%llu(max=%llu) obj=%llu(%llu) "
   6.203 +           "objnode=%llu(%llu) puts=%llu/%llu/%llu(dup=%llu/%llu) "
   6.204 +           "gets=%llu/%llu(%llu%%) "
   6.205 +           "flush=%llu/%llu flobj=%llu/%llu\n",
   6.206 +           cli_id, pool_id, pool_type,
   6.207 +           pgp_count, max_pgp_count, obj_count, max_obj_count,
   6.208 +           objnode_count, max_objnode_count,
   6.209 +           good_puts, puts, no_mem_puts, 
   6.210 +           dup_puts_flushed, dup_puts_replaced,
   6.211 +           found_gets, gets,
   6.212 +           gets ? (found_gets*100LL)/gets : 0,
   6.213 +           flushs_found, flushs, flush_objs_found, flush_objs);
   6.214 +
   6.215 +}
   6.216 +
   6.217 +void parse_shared_pool(char *s)
   6.218 +{
    6.219 +    char pool_type[3] = "";   /* zero-init: parse_string() does not NUL-terminate */
    6.220 +    char buf[BUFSIZE] = "";   /* stays empty (and printable) if no "SC" tag is found */
   6.221 +    unsigned long pool_id = parse(s,"PI");
   6.222 +    unsigned long long uid0 = parse(s,"U0");
   6.223 +    unsigned long long uid1 = parse(s,"U1");
   6.224 +    unsigned long long pgp_count = parse(s,"Pc");
   6.225 +    unsigned long long max_pgp_count = parse(s,"Pm");
   6.226 +    unsigned long long obj_count = parse(s,"Oc");
   6.227 +    unsigned long long max_obj_count = parse(s,"Om");
   6.228 +    unsigned long long objnode_count = parse(s,"Nc");
   6.229 +    unsigned long long max_objnode_count = parse(s,"Nm");
   6.230 +    unsigned long long good_puts = parse(s,"ps");
   6.231 +    unsigned long long puts = parse(s,"pt");
   6.232 +    unsigned long long no_mem_puts = parse(s,"px");
   6.233 +    unsigned long long dup_puts_flushed = parse(s,"pd");
   6.234 +    unsigned long long dup_puts_replaced = parse(s,"pr");
   6.235 +    unsigned long long found_gets = parse(s,"gs");
   6.236 +    unsigned long long gets = parse(s,"gt");
   6.237 +    unsigned long long flushs_found = parse(s,"fs");
   6.238 +    unsigned long long flushs = parse(s,"ft");
   6.239 +    unsigned long long flush_objs_found = parse(s,"os");
   6.240 +    unsigned long long flush_objs = parse(s,"ot");
   6.241 +
   6.242 +    parse_string(s,"PT",pool_type,2);
   6.243 +    parse_sharers(s,"SC",buf,BUFSIZE);
   6.244 +    printf("poolid=%lu[%s] uuid=%llu.%llu, shared-by:%s: "
   6.245 +           "pgp=%llu(max=%llu) obj=%llu(%llu) "
   6.246 +           "objnode=%llu(%llu) puts=%llu/%llu/%llu(dup=%llu/%llu) "
   6.247 +           "gets=%llu/%llu(%llu%%) "
   6.248 +           "flush=%llu/%llu flobj=%llu/%llu\n",
   6.249 +           pool_id, pool_type, uid0, uid1, buf,
   6.250 +           pgp_count, max_pgp_count, obj_count, max_obj_count,
   6.251 +           objnode_count, max_objnode_count,
   6.252 +           good_puts, puts, no_mem_puts, 
   6.253 +           dup_puts_flushed, dup_puts_replaced,
   6.254 +           found_gets, gets,
   6.255 +           gets ? (found_gets*100LL)/gets : 0,
   6.256 +           flushs_found, flushs, flush_objs_found, flush_objs);
   6.257 +}
   6.258 +
   6.259 +int main(int ac, char **av)
   6.260 +{
   6.261 +    char *p, c;
   6.262 +    char buf[BUFSIZE];
   6.263 +
   6.264 +    while ( (p = fgets(buf,BUFSIZE,stdin)) != NULL )
   6.265 +    {
   6.266 +        c = *p++;
   6.267 +        if ( *p++ != '=' )
   6.268 +            continue;
   6.269 +        switch ( c )
   6.270 +        {
   6.271 +        case 'G':
   6.272 +            parse_global(p);
   6.273 +            break;
   6.274 +        case 'T':
   6.275 +            parse_time_stats(p);
   6.276 +            break;
   6.277 +        case 'C':
   6.278 +            parse_client(p);
   6.279 +            break;
   6.280 +        case 'P':
   6.281 +            parse_pool(p);
   6.282 +            break;
   6.283 +        case 'S':
   6.284 +            parse_shared_pool(p);
   6.285 +            break;
   6.286 +        default:
   6.287 +            continue;
   6.288 +        }
   6.289 +    }
   6.290 +    return 0;
   6.291 +}
     7.1 --- a/tools/python/xen/lowlevel/xc/xc.c	Tue May 26 10:14:34 2009 +0100
     7.2 +++ b/tools/python/xen/lowlevel/xc/xc.c	Tue May 26 11:05:04 2009 +0100
     7.3 @@ -19,6 +19,7 @@
     7.4  
     7.5  #include "xenctrl.h"
     7.6  #include <xen/elfnote.h>
     7.7 +#include <xen/tmem.h>
     7.8  #include "xc_dom.h"
     7.9  #include <xen/hvm/hvm_info_table.h>
    7.10  #include <xen/hvm/params.h>
    7.11 @@ -1506,6 +1507,50 @@ static PyObject *dom_op(XcObject *self, 
    7.12      return zero;
    7.13  }
    7.14  
    7.15 +static PyObject *pyxc_tmem_control(XcObject *self,
    7.16 +                                   PyObject *args,
    7.17 +                                   PyObject *kwds)
    7.18 +{
    7.19 +    int32_t pool_id;
    7.20 +    uint32_t subop;
    7.21 +    uint32_t cli_id;
    7.22 +    uint32_t arg1;
    7.23 +    uint32_t arg2;
    7.24 +    char *buf;
    7.25 +    char _buffer[32768], *buffer = _buffer;
    7.26 +    int rc;
    7.27 +
    7.28 +    static char *kwd_list[] = { "pool_id", "subop", "cli_id", "arg1", "arg2", "buf", NULL };
    7.29 +
    7.30 +    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiiiis", kwd_list,
    7.31 +                                      &pool_id, &subop, &cli_id, &arg1, &arg2, &buf) )
    7.32 +        return NULL;
    7.33 +
    7.34 +    if ( (subop == TMEMC_LIST) && (arg1 > 32768) )
    7.35 +        arg1 = 32768;
    7.36 +
    7.37 +    if ( (rc = xc_tmem_control(self->xc_handle, pool_id, subop, cli_id, arg1, arg2, buffer)) < 0 )
    7.38 +        return Py_BuildValue("i", rc);
    7.39 +
    7.40 +    switch (subop) {
    7.41 +        case TMEMC_LIST:
    7.42 +            return Py_BuildValue("s", buffer);
    7.43 +        case TMEMC_FLUSH:
    7.44 +            return Py_BuildValue("i", rc);
    7.45 +        case TMEMC_THAW:
    7.46 +        case TMEMC_FREEZE:
    7.47 +        case TMEMC_DESTROY:
    7.48 +        case TMEMC_SET_WEIGHT:
    7.49 +        case TMEMC_SET_CAP:
    7.50 +        case TMEMC_SET_COMPRESS:
    7.51 +        default:
    7.52 +            break;
    7.53 +    }
    7.54 +
    7.55 +    Py_INCREF(zero);
    7.56 +    return zero;
    7.57 +}
    7.58 +
    7.59  static PyMethodDef pyxc_methods[] = {
    7.60      { "handle",
    7.61        (PyCFunction)pyxc_handle,
    7.62 @@ -1965,6 +2010,18 @@ static PyMethodDef pyxc_methods[] = {
    7.63        " dom [int]: Identifier of domain.\n" },
    7.64  #endif
    7.65  
    7.66 +    { "tmem_control",
    7.67 +      (PyCFunction)pyxc_tmem_control,
    7.68 +      METH_VARARGS | METH_KEYWORDS, "\n"
    7.69 +      "Do various control on a tmem pool.\n"
    7.70 +      " pool_id [int]: Identifier of the tmem pool (-1 == all).\n"
    7.71 +      " subop [int]: Supplementary Operation.\n"
    7.72 +      " cli_id [int]: Client identifier (-1 == all).\n"
    7.73 +      " arg1 [int]: Argument.\n"
    7.74 +      " arg2 [int]: Argument.\n"
    7.75 +      " buf [str]: Buffer.\n\n"
    7.76 +      "Returns: [int] 0 or [str] tmem info on success; exception on error.\n" },
    7.77 +
    7.78      { NULL, NULL, 0, NULL }
    7.79  };
    7.80  
     8.1 --- a/tools/python/xen/xend/XendAPI.py	Tue May 26 10:14:34 2009 +0100
     8.2 +++ b/tools/python/xen/xend/XendAPI.py	Tue May 26 11:05:04 2009 +0100
     8.3 @@ -925,7 +925,15 @@ class XendAPI(object):
     8.4                      ('dmesg', 'String'),
     8.5                      ('dmesg_clear', 'String'),
     8.6                      ('get_log', 'String'),
     8.7 -                    ('send_debug_keys', None)]
     8.8 +                    ('send_debug_keys', None),
     8.9 +                    ('tmem_thaw', None),
    8.10 +                    ('tmem_freeze', None),
    8.11 +                    ('tmem_flush', None),
    8.12 +                    ('tmem_destroy', None),
    8.13 +                    ('tmem_list', None),
    8.14 +                    ('tmem_set_weight', None),
    8.15 +                    ('tmem_set_cap', None),
    8.16 +                    ('tmem_set_compress', None)]
    8.17      
    8.18      host_funcs = [('get_by_name_label', None),
    8.19                    ('list_methods', None)]
    8.20 @@ -1061,6 +1069,70 @@ class XendAPI(object):
    8.21                    'PSCSIs': XendPSCSI.get_all()}
    8.22          return xen_api_success(record)
    8.23  
    8.24 +    def host_tmem_thaw(self, _, host_ref, cli_id):
    8.25 +        node = XendNode.instance()
    8.26 +        try:
    8.27 +            node.tmem_thaw(cli_id)
    8.28 +        except Exception, e:
    8.29 +            return xen_api_error(e)
    8.30 +        return xen_api_success_void()
    8.31 +
    8.32 +    def host_tmem_freeze(self, _, host_ref, cli_id):
    8.33 +        node = XendNode.instance()
    8.34 +        try:
    8.35 +            node.tmem_freeze(cli_id)
    8.36 +        except Exception, e:
    8.37 +            return xen_api_error(e)
    8.38 +        return xen_api_success_void()
    8.39 +
    8.40 +    def host_tmem_flush(self, _, host_ref, cli_id, pages):
    8.41 +        node = XendNode.instance()
    8.42 +        try:
    8.43 +            node.tmem_flush(cli_id, pages)
    8.44 +        except Exception, e:
    8.45 +            return xen_api_error(e)
    8.46 +        return xen_api_success_void()
    8.47 +
    8.48 +    def host_tmem_destroy(self, _, host_ref, cli_id):
    8.49 +        node = XendNode.instance()
    8.50 +        try:
    8.51 +            node.tmem_destroy(cli_id)
    8.52 +        except Exception, e:
    8.53 +            return xen_api_error(e)
    8.54 +        return xen_api_success_void()
    8.55 +
    8.56 +    def host_tmem_list(self, _, host_ref, cli_id, use_long):
    8.57 +        node = XendNode.instance()
    8.58 +        try:
    8.59 +            info = node.tmem_list(cli_id, use_long)
    8.60 +        except Exception, e:
    8.61 +            return xen_api_error(e)
    8.62 +        return xen_api_success(info)
    8.63 +
    8.64 +    def host_tmem_set_weight(self, _, host_ref, cli_id, value):
    8.65 +        node = XendNode.instance()
    8.66 +        try:
    8.67 +            node.tmem_set_weight(cli_id, value)
    8.68 +        except Exception, e:
    8.69 +            return xen_api_error(e)
    8.70 +        return xen_api_success_void()
    8.71 +
    8.72 +    def host_tmem_set_cap(self, _, host_ref, cli_id, value):
    8.73 +        node = XendNode.instance()
    8.74 +        try:
    8.75 +            node.tmem_set_cap(cli_id, value)
    8.76 +        except Exception, e:
    8.77 +            return xen_api_error(e)
    8.78 +        return xen_api_success_void()
    8.79 +
    8.80 +    def host_tmem_set_compress(self, _, host_ref, cli_id, value):
    8.81 +        node = XendNode.instance()
    8.82 +        try:
    8.83 +            node.tmem_set_compress(cli_id, value)
    8.84 +        except Exception, e:
    8.85 +            return xen_api_error(e)
    8.86 +        return xen_api_success_void()
    8.87 +
    8.88      # class methods
    8.89      def host_get_all(self, session):
    8.90          return xen_api_success((XendNode.instance().uuid,))
     9.1 --- a/tools/python/xen/xend/XendConstants.py	Tue May 26 10:14:34 2009 +0100
     9.2 +++ b/tools/python/xen/xend/XendConstants.py	Tue May 26 11:05:04 2009 +0100
     9.3 @@ -141,3 +141,29 @@ XS_VMROOT = "/vm/"
     9.4  NR_PCI_DEV = 32
     9.5  AUTO_PHP_SLOT = NR_PCI_DEV
     9.6  AUTO_PHP_SLOT_STR = "%02x" % NR_PCI_DEV
     9.7 +
     9.8 +#
     9.9 +# tmem
    9.10 +#
    9.11 +
    9.12 +TMEM_CONTROL       = 0
    9.13 +TMEM_NEW_POOL      = 1
    9.14 +TMEM_DESTROY_POOL  = 2
    9.15 +TMEM_NEW_PAGE      = 3
    9.16 +TMEM_PUT_PAGE      = 4
    9.17 +TMEM_GET_PAGE      = 5
    9.18 +TMEM_FLUSH_PAGE    = 6
    9.19 +TMEM_FLUSH_OBJECT  = 7
    9.20 +TMEM_READ          = 8
    9.21 +TMEM_WRITE         = 9
    9.22 +TMEM_XCHG          = 10
    9.23 +
    9.24 +TMEMC_THAW         = 0
    9.25 +TMEMC_FREEZE       = 1
    9.26 +TMEMC_FLUSH        = 2
    9.27 +TMEMC_DESTROY      = 3
    9.28 +TMEMC_LIST         = 4
    9.29 +TMEMC_SET_WEIGHT   = 5
    9.30 +TMEMC_SET_CAP      = 6
    9.31 +TMEMC_SET_COMPRESS = 7
    9.32 +
    10.1 --- a/tools/python/xen/xend/XendNode.py	Tue May 26 10:14:34 2009 +0100
    10.2 +++ b/tools/python/xen/xend/XendNode.py	Tue May 26 11:05:04 2009 +0100
    10.3 @@ -26,6 +26,7 @@ from xen.util import pci as PciUtil
    10.4  from xen.util import vscsi_util
    10.5  from xen.xend import XendAPIStore
    10.6  from xen.xend import osdep
    10.7 +from xen.xend.XendConstants import *
    10.8  
    10.9  import uuid, arch
   10.10  from XendPBD import XendPBD
   10.11 @@ -940,6 +941,69 @@ class XendNode:
   10.12      def info_dict(self):
   10.13          return dict(self.info())
   10.14  
   10.15 +    # tmem
   10.16 +    def tmem_list(self, cli_id, use_long):
   10.17 +        pool_id = -1
   10.18 +        subop = TMEMC_LIST
   10.19 +        arg1 = 32768
   10.20 +        arg2 = use_long
   10.21 +        buf = ''
   10.22 +        return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf)
   10.23 +
   10.24 +    def tmem_thaw(self, cli_id):
   10.25 +        pool_id = -1
   10.26 +        subop = TMEMC_THAW
   10.27 +        arg1 = 0
   10.28 +        arg2 = 0
   10.29 +        buf = ''
   10.30 +        return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf)
   10.31 +
   10.32 +    def tmem_freeze(self, cli_id):
   10.33 +        pool_id = -1
   10.34 +        subop = TMEMC_FREEZE
   10.35 +        arg1 = 0
   10.36 +        arg2 = 0
   10.37 +        buf = ''
   10.38 +        return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf)
   10.39 +
   10.40 +    def tmem_flush(self, cli_id, pages):
   10.41 +        pool_id = -1
   10.42 +        subop = TMEMC_FLUSH
   10.43 +        arg1 = pages
   10.44 +        arg2 = 0
   10.45 +        buf = ''
   10.46 +        return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf)
   10.47 +
   10.48 +    def tmem_destroy(self, cli_id):
   10.49 +        pool_id = -1
   10.50 +        subop = TMEMC_DESTROY
   10.51 +        arg1 = 0
   10.52 +        arg2 = 0
   10.53 +        buf = ''
   10.54 +        return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf)
   10.55 +
   10.56 +    def tmem_set_weight(self, cli_id, arg1):
   10.57 +        pool_id = -1
   10.58 +        subop = TMEMC_SET_WEIGHT
   10.59 +        arg2 = 0
   10.60 +        buf = ''
   10.61 +        return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf)
   10.62 +
   10.63 +    def tmem_set_cap(self, cli_id, arg1):
   10.64 +        pool_id = -1
   10.65 +        subop = TMEMC_SET_CAP
   10.66 +        arg2 = 0
   10.67 +        buf = ''
   10.68 +        return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf)
   10.69 +
   10.70 +    def tmem_set_compress(self, cli_id, arg1):
   10.71 +        pool_id = -1
   10.72 +        subop = TMEMC_SET_COMPRESS
   10.73 +        arg2 = 0
   10.74 +        buf = ''
   10.75 +        return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf)
   10.76 +
   10.77 +
   10.78  def instance():
   10.79      global inst
   10.80      try:
    11.1 --- a/tools/python/xen/xend/balloon.py	Tue May 26 10:14:34 2009 +0100
    11.2 +++ b/tools/python/xen/xend/balloon.py	Tue May 26 11:05:04 2009 +0100
    11.3 @@ -26,6 +26,7 @@ import XendOptions
    11.4  from XendLogging import log
    11.5  from XendError import VmError
    11.6  import osdep
    11.7 +from xen.xend.XendConstants import *
    11.8  
    11.9  RETRY_LIMIT = 20
   11.10  RETRY_LIMIT_INCR = 5
   11.11 @@ -109,6 +110,9 @@ def free(need_mem, dominfo):
   11.12          last_free = None
   11.13          rlimit = RETRY_LIMIT
   11.14  
   11.15 +        # stop tmem from absorbing any more memory (must THAW when done!)
   11.16 +        xc.tmem_control(0, TMEMC_FREEZE, -1, 0, 0, "")
   11.17 +
   11.18          # If unreasonable memory size is required, we give up waiting
   11.19          # for ballooning or scrubbing, as if had retried.
   11.20          physinfo = xc.physinfo()
   11.21 @@ -122,6 +126,17 @@ def free(need_mem, dominfo):
   11.22          if need_mem >= max_free_mem:
   11.23              retries = rlimit
   11.24  
   11.25 +        freeable_mem = free_mem + scrub_mem
   11.26 +        if freeable_mem < need_mem and need_mem < max_free_mem:
   11.27 +            # flush memory from tmem to scrub_mem and reobtain physinfo
   11.28 +            need_tmem_kb = need_mem - freeable_mem
   11.29 +            tmem_kb = xc.tmem_control(0, TMEMC_FLUSH, -1, need_tmem_kb, 0, "")
   11.30 +            log.debug("Balloon: tmem relinquished %d KiB of %d KiB requested.",
   11.31 +                      tmem_kb, need_tmem_kb)
   11.32 +            physinfo = xc.physinfo()
   11.33 +            free_mem = physinfo['free_memory']
   11.34 +            scrub_mem = physinfo['scrub_memory']
   11.35 +
   11.36          # Check whether current machine is a numa system and the new 
   11.37          # created hvm has all its vcpus in the same node, if all the 
   11.38          # conditions above are fit. We will wait until all the pages 
   11.39 @@ -216,4 +231,6 @@ def free(need_mem, dominfo):
   11.40                   ' be shrunk any further'))
   11.41  
   11.42      finally:
   11.43 +        # allow tmem to accept pages again
   11.44 +        xc.tmem_control(0, TMEMC_THAW, -1, 0, 0, "")
   11.45          del xc
    12.1 --- a/tools/python/xen/xend/server/XMLRPCServer.py	Tue May 26 10:14:34 2009 +0100
    12.2 +++ b/tools/python/xen/xend/server/XMLRPCServer.py	Tue May 26 11:05:04 2009 +0100
    12.3 @@ -198,7 +198,11 @@ class XMLRPCServer:
    12.4                      self.server.register_function(fn, "xend.domain.%s" % name[7:])
    12.5  
    12.6          # Functions in XendNode and XendDmesg
    12.7 -        for type, lst, n in [(XendNode, ['info', 'pciinfo', 'send_debug_keys'],
    12.8 +        for type, lst, n in [(XendNode,
    12.9 +                              ['info', 'pciinfo', 'send_debug_keys',
   12.10 +                               'tmem_list', 'tmem_freeze', 'tmem_thaw',
   12.11 +                               'tmem_flush', 'tmem_destroy', 'tmem_set_weight',
   12.12 +                               'tmem_set_cap', 'tmem_set_compress'],
   12.13                               'node'),
   12.14                               (XendDmesg, ['info', 'clear'], 'node.dmesg')]:
   12.15              inst = type.instance()
    13.1 --- a/tools/python/xen/xm/main.py	Tue May 26 10:14:34 2009 +0100
    13.2 +++ b/tools/python/xen/xm/main.py	Tue May 26 11:05:04 2009 +0100
    13.3 @@ -199,6 +199,15 @@ SUBCOMMAND_HELP = {
    13.4      'scsi-list'    :  ('<Domain> [--long]',
    13.5                          'List all SCSI devices currently attached.'),
    13.6  
    13.7 +    # tmem
    13.8 +    'tmem-list'     :  ('[-l|--long] [<Domain>|-a|--all]', 'List tmem pools.'),
    13.9 +    'tmem-thaw'     :  ('[<Domain>|-a|--all]', 'Thaw tmem pools.'),
   13.10 +    'tmem-freeze'   :  ('[<Domain>|-a|--all]', 'Freeze tmem pools.'),
   13.11 +    'tmem-destroy'  :  ('[<Domain>|-a|--all]', 'Destroy tmem pools.'),
   13.12 +    'tmem-set'      :  ('[<Domain>|-a|--all] [weight=<weight>] [cap=<cap>] '
   13.13 +                        '[compress=<compress>]',
   13.14 +                        'Change tmem settings.'),
   13.15 +
   13.16      # security
   13.17  
   13.18      'addlabel'      :  ('<label> {dom <ConfigFile>|res <resource>|mgt <managed domain>}\n'
   13.19 @@ -283,6 +292,21 @@ SUBCOMMAND_OPTIONS = {
   13.20      'info': (
   13.21         ('-c', '--config', 'List Xend configuration parameters'),
   13.22      ),
   13.23 +    'tmem-list': (
   13.24 +       ('-l', '--long', 'List tmem stats.'),
   13.25 +    ),
   13.26 +    'tmem-thaw': (
   13.27 +       ('-a', '--all', 'Thaw all tmem.'),
   13.28 +    ),
   13.29 +    'tmem-freeze':  (
   13.30 +       ('-a', '--all', 'Freeze all tmem.'),
   13.31 +    ),
   13.32 +    'tmem-destroy':  (
   13.33 +       ('-a', '--all', 'Destroy all tmem.'),
   13.34 +    ),
   13.35 +    'tmem-set':  (
   13.36 +       ('-a', '--all', 'Operate on all tmem.'),
   13.37 +    ),
   13.38  }
   13.39  
   13.40  common_commands = [
   13.41 @@ -397,9 +421,17 @@ acm_commands = [
   13.42      "getpolicy",
   13.43      ]
   13.44  
   13.45 +tmem_commands = [
   13.46 +    "tmem-list",
   13.47 +    "tmem-thaw",
   13.48 +    "tmem-freeze",
   13.49 +    "tmem-destroy",
   13.50 +    "tmem-set",
   13.51 +    ]
   13.52 +
   13.53  all_commands = (domain_commands + host_commands + scheduler_commands +
   13.54                  device_commands + vnet_commands + acm_commands +
   13.55 -                ['shell', 'event-monitor'])
   13.56 +                tmem_commands + ['shell', 'event-monitor'])
   13.57  
   13.58  
   13.59  ##
   13.60 @@ -2837,7 +2869,188 @@ def xm_network_show(args):
   13.61  
   13.62              print format2 % r
   13.63  
   13.64 -            
   13.65 +def xm_tmem_list(args):
   13.66 +    try:
   13.67 +        (options, params) = getopt.gnu_getopt(args, 'la', ['long','all'])
   13.68 +    except getopt.GetoptError, opterr:
   13.69 +        err(opterr)
   13.70 +        usage('tmem-list')
   13.71 +
   13.72 +    use_long = False
   13.73 +    for (k, v) in options:
   13.74 +        if k in ['-l', '--long']:
   13.75 +            use_long = True
   13.76 +
   13.77 +    all = False
   13.78 +    for (k, v) in options:
   13.79 +        if k in ['-a', '--all']:
   13.80 +            all = True
   13.81 +
   13.82 +    if not all and len(params) == 0:
   13.83 +        err('You must specify -a or --all or a domain id.')
   13.84 +        usage('tmem-list')
   13.85 +
   13.86 +    if all:
   13.87 +        domid = -1
   13.88 +    else:
   13.89 +        try: 
   13.90 +            domid = int(params[0])
   13.91 +            params = params[1:]
   13.92 +        except:
   13.93 +            err('Unrecognized domain id: %s' % params[0])
   13.94 +            usage('tmem-list')
   13.95 +
   13.96 +    if serverType == SERVER_XEN_API:
   13.97 +        print server.xenapi.host.tmem_list(domid,use_long)
   13.98 +    else:
   13.99 +        print  server.xend.node.tmem_list(domid,use_long)
  13.100 +
  13.101 +def parse_tmem_args(args, name):
  13.102 +    try:
  13.103 +        (options, params) = getopt.gnu_getopt(args, 'a', ['all'])
  13.104 +    except getopt.GetoptError, opterr:
  13.105 +        err(opterr)
  13.106 +        usage(name)
  13.107 +
  13.108 +    all = False
  13.109 +    for (k, v) in options:
  13.110 +        if k in ['-a', '--all']:
  13.111 +            all = True
  13.112 +
  13.113 +    if not all and len(params) == 0:
  13.114 +        err('You must specify -a or --all or a domain id.')
  13.115 +        usage(name)
  13.116 +
  13.117 +    if all:
  13.118 +        domid = -1
  13.119 +    else:
  13.120 +        try: 
  13.121 +            domid = int(params[0])
  13.122 +            params = params[1:]
  13.123 +        except:
  13.124 +            err('Unrecognized domain id: %s' % params[0])
  13.125 +            usage(name)
  13.126 +
  13.127 +    return domid, params
  13.128 +
  13.129 +def xm_tmem_destroy(args):
  13.130 +    (domid, _) = parse_tmem_args(args, 'tmem-destroy')
  13.131 +    if serverType == SERVER_XEN_API:
  13.132 +        server.xenapi.host.tmem_destroy(domid)
  13.133 +    else:
  13.134 +        server.xend.node.tmem_destroy(domid)
  13.135 +
  13.136 +def xm_tmem_thaw(args):
  13.137 +    (domid, _) = parse_tmem_args(args, 'tmem-thaw')
  13.138 +    if serverType == SERVER_XEN_API:
  13.139 +        server.xenapi.host.tmem_thaw(domid)
  13.140 +    else:
  13.141 +        server.xend.node.tmem_thaw(domid)
  13.142 +
  13.143 +def xm_tmem_freeze(args):
  13.144 +    (domid, _) = parse_tmem_args(args, 'tmem-freeze')
  13.145 +    if serverType == SERVER_XEN_API:
  13.146 +        server.xenapi.host.tmem_freeze(domid)
  13.147 +    else:
  13.148 +        server.xend.node.tmem_freeze(domid)
  13.149 +
  13.150 +def xm_tmem_flush(args):
  13.151 +    try:
   13.152 +        (options, params) = getopt.gnu_getopt(args, 'ap:', ['all', 'pages='])
  13.153 +    except getopt.GetoptError, opterr:
  13.154 +        err(opterr)
   13.155 +        usage('tmem-flush')
  13.156 +
  13.157 +    all = False
  13.158 +    for (k, v) in options:
  13.159 +        if k in ['-a', '--all']:
  13.160 +            all = True
  13.161 +
  13.162 +    if not all and len(params) == 0:
  13.163 +        err('You must specify -a or --all or a domain id.')
  13.164 +        usage('tmem-flush')
  13.165 +
  13.166 +    if all:
  13.167 +        domid = -1
  13.168 +    else:
  13.169 +        try: 
  13.170 +            domid = int(params[0])
  13.171 +            params = params[1:]
  13.172 +        except:
  13.173 +            err('Unrecognized domain id: %s' % params[0])
  13.174 +            usage('tmem-flush')
  13.175 +
  13.176 +    pages = -1
  13.177 +    for (k, v) in options:
  13.178 +        if k in ['-p', '--pages']:
   13.179 +            pages = int(v)
  13.180 +
  13.181 +    if serverType == SERVER_XEN_API:
  13.182 +        server.xenapi.host.tmem_flush(domid,pages)
  13.183 +    else:
  13.184 +        server.xend.node.tmem_flush(domid,pages)
  13.185 +
  13.186 +def xm_tmem_set(args):
  13.187 +    try:
  13.188 +        (options, params) = getopt.gnu_getopt(args, 'a', ['all'])
  13.189 +    except getopt.GetoptError, opterr:
  13.190 +        err(opterr)
   13.191 +        usage('tmem-set')
  13.192 +
  13.193 +    all = False
  13.194 +    for (k, v) in options:
  13.195 +        if k in ['-a', '--all']:
  13.196 +            all = True
  13.197 +
  13.198 +    if not all and len(params) == 0:
  13.199 +        err('You must specify -a or --all or a domain id.')
  13.200 +        usage('tmem-set')
  13.201 +
  13.202 +    if all:
  13.203 +        domid = -1
  13.204 +    else:
  13.205 +        try: 
  13.206 +            domid = int(params[0])
  13.207 +            params = params[1:]
  13.208 +        except:
  13.209 +            err('Unrecognized domain id: %s' % params[0])
  13.210 +            usage('tmem-set')
  13.211 +
  13.212 +    weight = None
  13.213 +    cap = None
  13.214 +    compress = None
  13.215 +    for item in params:
  13.216 +        if item.startswith('weight='):
  13.217 +            try:
  13.218 +                weight = int(item[7:])
  13.219 +            except:
   13.220 +                err('weight should be an integer')
  13.221 +                usage('tmem-set')
  13.222 +        if item.startswith('cap='):
  13.223 +            cap = int(item[4:])
  13.224 +        if item.startswith('compress='):
  13.225 +            compress = int(item[9:])
  13.226 +
  13.227 +    if weight is None and cap is None and compress is None:
   13.228 +        err('Unrecognized or missing tmem configuration option(s): %s' % ', '.join(params))
  13.229 +        usage('tmem-set')
  13.230 +        
  13.231 +    if serverType == SERVER_XEN_API:
  13.232 +        if weight is not None:
  13.233 +            server.xenapi.host.tmem_set_weight(domid, weight)
  13.234 +        if cap is not None:
  13.235 +            server.xenapi.host.tmem_set_cap(domid, cap)
  13.236 +        if compress is not None:
  13.237 +            server.xenapi.host.tmem_set_compress(domid, compress)
  13.238 +    else:
  13.239 +        if weight is not None:
  13.240 +            server.xend.node.tmem_set_weight(domid, weight)
  13.241 +        if cap is not None:
  13.242 +            server.xend.node.tmem_set_cap(domid, cap)
  13.243 +        if compress is not None:
  13.244 +            server.xend.node.tmem_set_compress(domid, compress)
  13.245 +
  13.246 +
  13.247  commands = {
  13.248      "shell": xm_shell,
  13.249      "event-monitor": xm_event_monitor,
  13.250 @@ -2912,6 +3125,13 @@ commands = {
  13.251      "scsi-attach": xm_scsi_attach,
  13.252      "scsi-detach": xm_scsi_detach,
  13.253      "scsi-list": xm_scsi_list,
  13.254 +    # tmem
  13.255 +    "tmem-thaw": xm_tmem_thaw,
  13.256 +    "tmem-freeze": xm_tmem_freeze,
  13.257 +    "tmem-flush": xm_tmem_flush,
  13.258 +    "tmem-destroy": xm_tmem_destroy,
  13.259 +    "tmem-list": xm_tmem_list,
  13.260 +    "tmem-set": xm_tmem_set,
  13.261      }
  13.262  
  13.263  ## The commands supported by a separate argument parser in xend.xm.
    14.1 --- a/xen/arch/ia64/xen/mm.c	Tue May 26 10:14:34 2009 +0100
    14.2 +++ b/xen/arch/ia64/xen/mm.c	Tue May 26 11:05:04 2009 +0100
    14.3 @@ -2870,6 +2870,13 @@ steal_page(struct domain *d, struct page
    14.4      return -1;
    14.5  }
    14.6  
    14.7 +int
    14.8 +donate_page(struct domain *d, struct page_info *page, unsigned int memflags)
    14.9 +{
   14.10 +    /* needs to be implemented for transcendent memory (tmem) */
   14.11 +    ASSERT(0);
   14.11 +    return -1; /* not reached with assertions enabled; avoids missing-return warning */
   14.12 +}
   14.13 +
   14.14  static void
   14.15  __guest_physmap_add_page(struct domain *d, unsigned long gpfn,
   14.16                           unsigned long mfn)
    15.1 --- a/xen/arch/x86/mm.c	Tue May 26 10:14:34 2009 +0100
    15.2 +++ b/xen/arch/x86/mm.c	Tue May 26 11:05:04 2009 +0100
    15.3 @@ -3539,6 +3539,42 @@ int replace_grant_host_mapping(
    15.4      return rc;
    15.5  }
    15.6  
    15.7 +int donate_page(
    15.8 +    struct domain *d, struct page_info *page, unsigned int memflags)
    15.9 +{
   15.10 +    spin_lock(&d->page_alloc_lock);
   15.11 +
   15.12 +    if ( is_xen_heap_page(page) || (page_get_owner(page) != NULL) )
   15.13 +        goto fail;
   15.14 +
   15.15 +    if ( d->is_dying )
   15.16 +        goto fail;
   15.17 +
   15.18 +    if ( page->count_info & ~(PGC_allocated | 1) )
   15.19 +        goto fail;
   15.20 +
   15.21 +    if ( !(memflags & MEMF_no_refcount) )
   15.22 +    {
   15.23 +        if ( d->tot_pages >= d->max_pages )
   15.24 +            goto fail;
   15.25 +        d->tot_pages++;
   15.26 +    }
   15.27 +
   15.28 +    page->count_info = PGC_allocated | 1;
   15.29 +    page_set_owner(page, d);
   15.30 +    page_list_add_tail(page,&d->page_list);
   15.31 +
   15.32 +    spin_unlock(&d->page_alloc_lock);
   15.33 +    return 0;
   15.34 +
   15.35 + fail:
   15.36 +    spin_unlock(&d->page_alloc_lock);
   15.37 +    MEM_LOG("Bad donate %p: ed=%p(%u), sd=%p, caf=%08lx, taf=%" PRtype_info,
   15.38 +            (void *)page_to_mfn(page), d, d->domain_id,
   15.39 +            page_get_owner(page), page->count_info, page->u.inuse.type_info);
   15.40 +    return -1;
   15.41 +}
   15.42 +
   15.43  int steal_page(
   15.44      struct domain *d, struct page_info *page, unsigned int memflags)
   15.45  {
    16.1 --- a/xen/arch/x86/setup.c	Tue May 26 10:14:34 2009 +0100
    16.2 +++ b/xen/arch/x86/setup.c	Tue May 26 11:05:04 2009 +0100
    16.3 @@ -110,6 +110,7 @@ extern void early_time_init(void);
    16.4  extern void early_cpu_init(void);
    16.5  extern void vesa_init(void);
    16.6  extern void vesa_mtrr_init(void);
    16.7 +extern void init_tmem(void);
    16.8  
    16.9  DEFINE_PER_CPU(struct desc_struct *, gdt_table) = boot_cpu_gdt_table;
   16.10  #ifdef CONFIG_COMPAT
   16.11 @@ -1063,6 +1064,8 @@ void __init __start_xen(unsigned long mb
   16.12  
   16.13      init_trace_bufs();
   16.14  
   16.15 +    init_tmem();
   16.16 +
   16.17      console_endboot();
   16.18  
   16.19      /* Hide UART from DOM0 if we're using it */
    17.1 --- a/xen/common/Makefile	Tue May 26 10:14:34 2009 +0100
    17.2 +++ b/xen/common/Makefile	Tue May 26 11:05:04 2009 +0100
    17.3 @@ -28,6 +28,11 @@ obj-y += version.o
    17.4  obj-y += vsprintf.o
    17.5  obj-y += xmalloc_tlsf.o
    17.6  obj-y += rcupdate.o
    17.7 +obj-y += tmem.o
    17.8 +obj-y += tmem_xen.o
    17.9 +obj-y += radix-tree.o
   17.10 +obj-y += rbtree.o
   17.11 +obj-y += lzo.o
   17.12  
   17.13  obj-$(perfc)       += perfc.o
   17.14  obj-$(crash_debug) += gdbstub.o
    18.1 --- a/xen/common/compat/Makefile	Tue May 26 10:14:34 2009 +0100
    18.2 +++ b/xen/common/compat/Makefile	Tue May 26 11:05:04 2009 +0100
    18.3 @@ -3,3 +3,4 @@ obj-y += kernel.o
    18.4  obj-y += memory.o
    18.5  obj-y += multicall.o
    18.6  obj-y += xlat.o
    18.7 +obj-y += tmem_xen.o
    19.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    19.2 +++ b/xen/common/compat/tmem_xen.c	Tue May 26 11:05:04 2009 +0100
    19.3 @@ -0,0 +1,26 @@
    19.4 +/******************************************************************************
    19.5 + * tmem_xen.c
    19.6 + *
    19.7 + */
    19.8 +
    19.9 +#include <xen/config.h>
   19.10 +#include <xen/lib.h>
   19.11 +#include <xen/sched.h>
   19.12 +#include <xen/domain.h>
   19.13 +#include <xen/guest_access.h>
   19.14 +#include <xen/hypercall.h>
   19.15 +#include <compat/tmem.h>
   19.16 +
   19.17 +#define xen_tmem_op tmem_op
   19.18 +/*CHECK_tmem_op;*/
   19.19 +#undef xen_tmem_op
   19.20 +
   19.21 +/*
   19.22 + * Local variables:
   19.23 + * mode: C
   19.24 + * c-set-style: "BSD"
   19.25 + * c-basic-offset: 4
   19.26 + * tab-width: 4
   19.27 + * indent-tabs-mode: nil
   19.28 + * End:
   19.29 + */
    20.1 --- a/xen/common/domain.c	Tue May 26 10:14:34 2009 +0100
    20.2 +++ b/xen/common/domain.c	Tue May 26 11:05:04 2009 +0100
    20.3 @@ -31,6 +31,7 @@
    20.4  #include <public/vcpu.h>
    20.5  #include <xsm/xsm.h>
    20.6  #include <xen/trace.h>
    20.7 +#include <xen/tmem.h>
    20.8  
    20.9  /* Linux config option: propageted to domain0 */
   20.10  /* xen_processor_pmbits: xen control Cx, Px, ... */
   20.11 @@ -558,6 +559,9 @@ static void complete_domain_destroy(stru
   20.12  
   20.13      grant_table_destroy(d);
   20.14  
   20.15 +    if ( d->tmem != NULL )
   20.16 +        tmem_destroy(d->tmem);
   20.17 +
   20.18      arch_domain_destroy(d);
   20.19  
   20.20      rangeset_domain_destroy(d);
    21.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    21.2 +++ b/xen/common/lzo.c	Tue May 26 11:05:04 2009 +0100
    21.3 @@ -0,0 +1,518 @@
    21.4 +/*
    21.5 + *  lzo.c -- LZO1X Compressor from MiniLZO
    21.6 + *
    21.7 + *  Copyright (C) 1996-2005 Markus F.X.J. Oberhumer <markus@oberhumer.com>
    21.8 + *
    21.9 + *  The full LZO package can be found at:
   21.10 + *  http://www.oberhumer.com/opensource/lzo/
   21.11 + *
   21.12 + *  Adapted for Xen (files combined and syntactic/header changes) by:
   21.13 + *  Dan Magenheimer <dan.magenheimer@oracle.com>
   21.14 + *
   21.15 + */
   21.16 +
   21.17 +/*
   21.18 + *  lzodefs.h -- architecture, OS and compiler specific defines
   21.19 + *
   21.20 + *  Copyright (C) 1996-2005 Markus F.X.J. Oberhumer <markus@oberhumer.com>
   21.21 + *
   21.22 + *  The full LZO package can be found at:
   21.23 + *  http://www.oberhumer.com/opensource/lzo/
   21.24 + *
   21.25 + *  Changed for kernel use by:
   21.26 + *  Nitin Gupta <nitingupta910@gmail.com>
   21.27 + *  Richard Purdie <rpurdie@openedhand.com>
   21.28 + */
   21.29 +
   21.30 +#define LZO_VERSION  0x2020
   21.31 +#define LZO_VERSION_STRING "2.02"
   21.32 +#define LZO_VERSION_DATE "Oct 17 2005"
   21.33 +
   21.34 +#define M1_MAX_OFFSET 0x0400
   21.35 +#define M2_MAX_OFFSET 0x0800
   21.36 +#define M3_MAX_OFFSET 0x4000
   21.37 +#define M4_MAX_OFFSET 0xbfff
   21.38 +
   21.39 +#define M1_MIN_LEN 2
   21.40 +#define M1_MAX_LEN 2
   21.41 +#define M2_MIN_LEN 3
   21.42 +#define M2_MAX_LEN 8
   21.43 +#define M3_MIN_LEN 3
   21.44 +#define M3_MAX_LEN 33
   21.45 +#define M4_MIN_LEN 3
   21.46 +#define M4_MAX_LEN 9
   21.47 +
   21.48 +#define M1_MARKER 0
   21.49 +#define M2_MARKER 64
   21.50 +#define M3_MARKER 32
   21.51 +#define M4_MARKER 16
   21.52 +
   21.53 +#define D_BITS  14
   21.54 +#define D_MASK  ((1u << D_BITS) - 1)
   21.55 +#define D_HIGH  ((D_MASK >> 1) + 1)
   21.56 +
   21.57 +#define DX2(p, s1, s2) (((((size_t)((p)[2]) << (s2)) ^ (p)[1]) \
   21.58 +       << (s1)) ^ (p)[0])
   21.59 +#define DX3(p, s1, s2, s3) ((DX2((p)+1, s2, s3) << (s1)) ^ (p)[0])
   21.60 +
   21.61 +/*
   21.62 + *  LZO1X Compressor from MiniLZO
   21.63 + *
   21.64 + *  Copyright (C) 1996-2005 Markus F.X.J. Oberhumer <markus@oberhumer.com>
   21.65 + *
   21.66 + *  The full LZO package can be found at:
   21.67 + *  http://www.oberhumer.com/opensource/lzo/
   21.68 + *
   21.69 + *  Changed for kernel use by:
   21.70 + *  Nitin Gupta <nitingupta910@gmail.com>
   21.71 + *  Richard Purdie <rpurdie@openedhand.com>
   21.72 + */
   21.73 +
   21.74 +#include <xen/types.h>
   21.75 +#include <xen/lzo.h>
   21.76 +#define get_unaligned(_p) (*(_p))
   21.77 +#define put_unaligned(_val,_p) (*(_p)=_val)
   21.78 +#define get_unaligned_le16(_p) (*(u16 *)(_p))
   21.79 +
   21.80 +static noinline size_t
   21.81 +_lzo1x_1_do_compress(const unsigned char *in, size_t in_len,
   21.82 +                     unsigned char *out, size_t *out_len, void *wrkmem)
   21.83 +{
   21.84 +    const unsigned char * const in_end = in + in_len;
   21.85 +    const unsigned char * const ip_end = in + in_len - M2_MAX_LEN - 5;
   21.86 +    const unsigned char ** const dict = wrkmem;
   21.87 +    const unsigned char *ip = in, *ii = ip;
   21.88 +    const unsigned char *end, *m, *m_pos;
   21.89 +    size_t m_off, m_len, dindex;
   21.90 +    unsigned char *op = out;
   21.91 +
   21.92 +    ip += 4;
   21.93 +
   21.94 +    for (;;) {
   21.95 +        dindex = ((size_t)(0x21 * DX3(ip, 5, 5, 6)) >> 5) & D_MASK;
   21.96 +        m_pos = dict[dindex];
   21.97 +
   21.98 +        if (m_pos < in)
   21.99 +            goto literal;
  21.100 +
  21.101 +        if (ip == m_pos || ((size_t)(ip - m_pos) > M4_MAX_OFFSET))
  21.102 +            goto literal;
  21.103 +
  21.104 +        m_off = ip - m_pos;
  21.105 +        if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
  21.106 +            goto try_match;
  21.107 +
  21.108 +        dindex = (dindex & (D_MASK & 0x7ff)) ^ (D_HIGH | 0x1f);
  21.109 +        m_pos = dict[dindex];
  21.110 +
  21.111 +        if (m_pos < in)
  21.112 +            goto literal;
  21.113 +
  21.114 +        if (ip == m_pos || ((size_t)(ip - m_pos) > M4_MAX_OFFSET))
  21.115 +            goto literal;
  21.116 +
  21.117 +        m_off = ip - m_pos;
  21.118 +        if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
  21.119 +            goto try_match;
  21.120 +
  21.121 +        goto literal;
  21.122 +
  21.123 +    try_match:
  21.124 +        if (get_unaligned((const unsigned short *)m_pos)
  21.125 +            == get_unaligned((const unsigned short *)ip)) {
  21.126 +            if (likely(m_pos[2] == ip[2]))
  21.127 +                goto match;
  21.128 +        }
  21.129 +
  21.130 +    literal:
  21.131 +        dict[dindex] = ip;
  21.132 +        ++ip;
  21.133 +        if (unlikely(ip >= ip_end))
  21.134 +            break;
  21.135 +        continue;
  21.136 +
  21.137 +    match:
  21.138 +        dict[dindex] = ip;
  21.139 +        if (ip != ii) {
  21.140 +            size_t t = ip - ii;
  21.141 +
  21.142 +            if (t <= 3) {
  21.143 +                op[-2] |= t;
  21.144 +            } else if (t <= 18) {
  21.145 +                *op++ = (t - 3);
  21.146 +            } else {
  21.147 +                size_t tt = t - 18;
  21.148 +
  21.149 +                *op++ = 0;
  21.150 +                while (tt > 255) {
  21.151 +                    tt -= 255;
  21.152 +                    *op++ = 0;
  21.153 +                }
  21.154 +                *op++ = tt;
  21.155 +            }
  21.156 +            do {
  21.157 +                *op++ = *ii++;
  21.158 +            } while (--t > 0);
  21.159 +        }
  21.160 +
  21.161 +        ip += 3;
  21.162 +        if (m_pos[3] != *ip++ || m_pos[4] != *ip++
  21.163 +            || m_pos[5] != *ip++ || m_pos[6] != *ip++
  21.164 +            || m_pos[7] != *ip++ || m_pos[8] != *ip++) {
  21.165 +            --ip;
  21.166 +            m_len = ip - ii;
  21.167 +
  21.168 +            if (m_off <= M2_MAX_OFFSET) {
  21.169 +                m_off -= 1;
  21.170 +                *op++ = (((m_len - 1) << 5)
  21.171 +                         | ((m_off & 7) << 2));
  21.172 +                *op++ = (m_off >> 3);
  21.173 +            } else if (m_off <= M3_MAX_OFFSET) {
  21.174 +                m_off -= 1;
  21.175 +                *op++ = (M3_MARKER | (m_len - 2));
  21.176 +                goto m3_m4_offset;
  21.177 +            } else {
  21.178 +                m_off -= 0x4000;
  21.179 +
  21.180 +                *op++ = (M4_MARKER | ((m_off & 0x4000) >> 11)
  21.181 +                         | (m_len - 2));
  21.182 +                goto m3_m4_offset;
  21.183 +            }
  21.184 +        } else {
  21.185 +            end = in_end;
  21.186 +            m = m_pos + M2_MAX_LEN + 1;
  21.187 +
  21.188 +            while (ip < end && *m == *ip) {
  21.189 +                m++;
  21.190 +                ip++;
  21.191 +            }
  21.192 +            m_len = ip - ii;
  21.193 +
  21.194 +            if (m_off <= M3_MAX_OFFSET) {
  21.195 +                m_off -= 1;
  21.196 +                if (m_len <= 33) {
  21.197 +                    *op++ = (M3_MARKER | (m_len - 2));
  21.198 +                } else {
  21.199 +                    m_len -= 33;
  21.200 +                    *op++ = M3_MARKER | 0;
  21.201 +                    goto m3_m4_len;
  21.202 +                }
  21.203 +            } else {
  21.204 +                m_off -= 0x4000;
  21.205 +                if (m_len <= M4_MAX_LEN) {
  21.206 +                    *op++ = (M4_MARKER
  21.207 +                             | ((m_off & 0x4000) >> 11)
  21.208 +                             | (m_len - 2));
  21.209 +                } else {
  21.210 +                    m_len -= M4_MAX_LEN;
  21.211 +                    *op++ = (M4_MARKER
  21.212 +                             | ((m_off & 0x4000) >> 11));
  21.213 +                m3_m4_len:
  21.214 +                    while (m_len > 255) {
  21.215 +                        m_len -= 255;
  21.216 +                        *op++ = 0;
  21.217 +                    }
  21.218 +
  21.219 +                    *op++ = (m_len);
  21.220 +                }
  21.221 +            }
  21.222 +        m3_m4_offset:
  21.223 +            *op++ = ((m_off & 63) << 2);
  21.224 +            *op++ = (m_off >> 6);
  21.225 +        }
  21.226 +
  21.227 +        ii = ip;
  21.228 +        if (unlikely(ip >= ip_end))
  21.229 +            break;
  21.230 +    }
  21.231 +
  21.232 +    *out_len = op - out;
  21.233 +    return in_end - ii;
  21.234 +}
  21.235 +
  21.236 +int lzo1x_1_compress(const unsigned char *in, size_t in_len, unsigned char *out,
  21.237 +                     size_t *out_len, void *wrkmem)
  21.238 +{
  21.239 +    const unsigned char *ii;
  21.240 +    unsigned char *op = out;
  21.241 +    size_t t;
  21.242 +
  21.243 +    if (unlikely(in_len <= M2_MAX_LEN + 5)) {
  21.244 +        t = in_len;
  21.245 +    } else {
  21.246 +        t = _lzo1x_1_do_compress(in, in_len, op, out_len, wrkmem);
  21.247 +        op += *out_len;
  21.248 +    }
  21.249 +
  21.250 +    if (t > 0) {
  21.251 +        ii = in + in_len - t;
  21.252 +
  21.253 +        if (op == out && t <= 238) {
  21.254 +            *op++ = (17 + t);
  21.255 +        } else if (t <= 3) {
  21.256 +            op[-2] |= t;
  21.257 +        } else if (t <= 18) {
  21.258 +            *op++ = (t - 3);
  21.259 +        } else {
  21.260 +            size_t tt = t - 18;
  21.261 +
  21.262 +            *op++ = 0;
  21.263 +            while (tt > 255) {
  21.264 +                tt -= 255;
  21.265 +                *op++ = 0;
  21.266 +            }
  21.267 +
  21.268 +            *op++ = tt;
  21.269 +        }
  21.270 +        do {
  21.271 +            *op++ = *ii++;
  21.272 +        } while (--t > 0);
  21.273 +    }
  21.274 +
  21.275 +    *op++ = M4_MARKER | 1;
  21.276 +    *op++ = 0;
  21.277 +    *op++ = 0;
  21.278 +
  21.279 +    *out_len = op - out;
  21.280 +    return LZO_E_OK;
  21.281 +}
  21.282 +
  21.283 +/*
  21.284 + *  LZO1X Decompressor from MiniLZO
  21.285 + *
  21.286 + *  Copyright (C) 1996-2005 Markus F.X.J. Oberhumer <markus@oberhumer.com>
  21.287 + *
  21.288 + *  The full LZO package can be found at:
  21.289 + *  http://www.oberhumer.com/opensource/lzo/
  21.290 + *
  21.291 + *  Changed for kernel use by:
  21.292 + *  Nitin Gupta <nitingupta910@gmail.com>
  21.293 + *  Richard Purdie <rpurdie@openedhand.com>
  21.294 + */
  21.295 +
  21.296 +#define HAVE_IP(x, ip_end, ip) ((size_t)(ip_end - ip) < (x))
  21.297 +#define HAVE_OP(x, op_end, op) ((size_t)(op_end - op) < (x))
  21.298 +#define HAVE_LB(m_pos, out, op) (m_pos < out || m_pos >= op)
  21.299 +
  21.300 +#define COPY4(dst, src) \
  21.301 +  put_unaligned(get_unaligned((const u32 *)(src)), (u32 *)(dst))
  21.302 +
  21.303 +int lzo1x_decompress_safe(const unsigned char *in, size_t in_len,
  21.304 +                          unsigned char *out, size_t *out_len)
  21.305 +{
  21.306 +    const unsigned char * const ip_end = in + in_len;
  21.307 +    unsigned char * const op_end = out + *out_len;
  21.308 +    const unsigned char *ip = in, *m_pos;
  21.309 +    unsigned char *op = out;
  21.310 +    size_t t;
  21.311 +
  21.312 +    *out_len = 0;
  21.313 +
  21.314 +    if (*ip > 17) {
  21.315 +        t = *ip++ - 17;
  21.316 +        if (t < 4)
  21.317 +            goto match_next;
  21.318 +        if (HAVE_OP(t, op_end, op))
  21.319 +            goto output_overrun;
  21.320 +        if (HAVE_IP(t + 1, ip_end, ip))
  21.321 +            goto input_overrun;
  21.322 +        do {
  21.323 +            *op++ = *ip++;
  21.324 +        } while (--t > 0);
  21.325 +        goto first_literal_run;
  21.326 +    }
  21.327 +
  21.328 +    while ((ip < ip_end)) {
  21.329 +        t = *ip++;
  21.330 +        if (t >= 16)
  21.331 +            goto match;
  21.332 +        if (t == 0) {
  21.333 +            if (HAVE_IP(1, ip_end, ip))
  21.334 +                goto input_overrun;
  21.335 +            while (*ip == 0) {
  21.336 +                t += 255;
  21.337 +                ip++;
  21.338 +                if (HAVE_IP(1, ip_end, ip))
  21.339 +                    goto input_overrun;
  21.340 +            }
  21.341 +            t += 15 + *ip++;
  21.342 +        }
  21.343 +        if (HAVE_OP(t + 3, op_end, op))
  21.344 +            goto output_overrun;
  21.345 +        if (HAVE_IP(t + 4, ip_end, ip))
  21.346 +            goto input_overrun;
  21.347 +
  21.348 +        COPY4(op, ip);
  21.349 +        op += 4;
  21.350 +        ip += 4;
  21.351 +        if (--t > 0) {
  21.352 +            if (t >= 4) {
  21.353 +                do {
  21.354 +                    COPY4(op, ip);
  21.355 +                    op += 4;
  21.356 +                    ip += 4;
  21.357 +                    t -= 4;
  21.358 +                } while (t >= 4);
  21.359 +                if (t > 0) {
  21.360 +                    do {
  21.361 +                        *op++ = *ip++;
  21.362 +                    } while (--t > 0);
  21.363 +                }
  21.364 +            } else {
  21.365 +                do {
  21.366 +                    *op++ = *ip++;
  21.367 +                } while (--t > 0);
  21.368 +            }
  21.369 +        }
  21.370 +
  21.371 +    first_literal_run:
  21.372 +        t = *ip++;
  21.373 +        if (t >= 16)
  21.374 +            goto match;
  21.375 +        m_pos = op - (1 + M2_MAX_OFFSET);
  21.376 +        m_pos -= t >> 2;
  21.377 +        m_pos -= *ip++ << 2;
  21.378 +
  21.379 +        if (HAVE_LB(m_pos, out, op))
  21.380 +            goto lookbehind_overrun;
  21.381 +
  21.382 +        if (HAVE_OP(3, op_end, op))
  21.383 +            goto output_overrun;
  21.384 +        *op++ = *m_pos++;
  21.385 +        *op++ = *m_pos++;
  21.386 +        *op++ = *m_pos;
  21.387 +
  21.388 +        goto match_done;
  21.389 +
  21.390 +        do {
  21.391 +        match:
  21.392 +            if (t >= 64) {
  21.393 +                m_pos = op - 1;
  21.394 +                m_pos -= (t >> 2) & 7;
  21.395 +                m_pos -= *ip++ << 3;
  21.396 +                t = (t >> 5) - 1;
  21.397 +                if (HAVE_LB(m_pos, out, op))
  21.398 +                    goto lookbehind_overrun;
  21.399 +                if (HAVE_OP(t + 3 - 1, op_end, op))
  21.400 +                    goto output_overrun;
  21.401 +                goto copy_match;
  21.402 +            } else if (t >= 32) {
  21.403 +                t &= 31;
  21.404 +                if (t == 0) {
  21.405 +                    if (HAVE_IP(1, ip_end, ip))
  21.406 +                        goto input_overrun;
  21.407 +                    while (*ip == 0) {
  21.408 +                        t += 255;
  21.409 +                        ip++;
  21.410 +                        if (HAVE_IP(1, ip_end, ip))
  21.411 +                            goto input_overrun;
  21.412 +                    }
  21.413 +                    t += 31 + *ip++;
  21.414 +                }
  21.415 +                m_pos = op - 1;
  21.416 +                m_pos -= get_unaligned_le16(ip) >> 2;
  21.417 +                ip += 2;
  21.418 +            } else if (t >= 16) {
  21.419 +                m_pos = op;
  21.420 +                m_pos -= (t & 8) << 11;
  21.421 +
  21.422 +                t &= 7;
  21.423 +                if (t == 0) {
  21.424 +                    if (HAVE_IP(1, ip_end, ip))
  21.425 +                        goto input_overrun;
  21.426 +                    while (*ip == 0) {
  21.427 +                        t += 255;
  21.428 +                        ip++;
  21.429 +                        if (HAVE_IP(1, ip_end, ip))
  21.430 +                            goto input_overrun;
  21.431 +                    }
  21.432 +                    t += 7 + *ip++;
  21.433 +                }
  21.434 +                m_pos -= get_unaligned_le16(ip) >> 2;
  21.435 +                ip += 2;
  21.436 +                if (m_pos == op)
  21.437 +                    goto eof_found;
  21.438 +                m_pos -= 0x4000;
  21.439 +            } else {
  21.440 +                m_pos = op - 1;
  21.441 +                m_pos -= t >> 2;
  21.442 +                m_pos -= *ip++ << 2;
  21.443 +
  21.444 +                if (HAVE_LB(m_pos, out, op))
  21.445 +                    goto lookbehind_overrun;
  21.446 +                if (HAVE_OP(2, op_end, op))
  21.447 +                    goto output_overrun;
  21.448 +
  21.449 +                *op++ = *m_pos++;
  21.450 +                *op++ = *m_pos;
  21.451 +                goto match_done;
  21.452 +            }
  21.453 +
  21.454 +            if (HAVE_LB(m_pos, out, op))
  21.455 +                goto lookbehind_overrun;
  21.456 +            if (HAVE_OP(t + 3 - 1, op_end, op))
  21.457 +                goto output_overrun;
  21.458 +
  21.459 +            if (t >= 2 * 4 - (3 - 1) && (op - m_pos) >= 4) {
  21.460 +                COPY4(op, m_pos);
  21.461 +                op += 4;
  21.462 +                m_pos += 4;
  21.463 +                t -= 4 - (3 - 1);
  21.464 +                do {
  21.465 +                    COPY4(op, m_pos);
  21.466 +                    op += 4;
  21.467 +                    m_pos += 4;
  21.468 +                    t -= 4;
  21.469 +                } while (t >= 4);
  21.470 +                if (t > 0)
  21.471 +                    do {
  21.472 +                        *op++ = *m_pos++;
  21.473 +                    } while (--t > 0);
  21.474 +            } else {
  21.475 +            copy_match:
  21.476 +                *op++ = *m_pos++;
  21.477 +                *op++ = *m_pos++;
  21.478 +                do {
  21.479 +                    *op++ = *m_pos++;
  21.480 +                } while (--t > 0);
  21.481 +            }
  21.482 +        match_done:
  21.483 +            t = ip[-2] & 3;
  21.484 +            if (t == 0)
  21.485 +                break;
  21.486 +        match_next:
  21.487 +            if (HAVE_OP(t, op_end, op))
  21.488 +                goto output_overrun;
  21.489 +            if (HAVE_IP(t + 1, ip_end, ip))
  21.490 +                goto input_overrun;
  21.491 +
  21.492 +            *op++ = *ip++;
  21.493 +            if (t > 1) {
  21.494 +                *op++ = *ip++;
  21.495 +                if (t > 2)
  21.496 +                    *op++ = *ip++;
  21.497 +            }
  21.498 +
  21.499 +            t = *ip++;
  21.500 +        } while (ip < ip_end);
  21.501 +    }
  21.502 +
  21.503 +    *out_len = op - out;
  21.504 +    return LZO_E_EOF_NOT_FOUND;
  21.505 +
  21.506 + eof_found:
  21.507 +    *out_len = op - out;
  21.508 +    return (ip == ip_end ? LZO_E_OK :
  21.509 +            (ip < ip_end ? LZO_E_INPUT_NOT_CONSUMED : LZO_E_INPUT_OVERRUN));
  21.510 + input_overrun:
  21.511 +    *out_len = op - out;
  21.512 +    return LZO_E_INPUT_OVERRUN;
  21.513 +
  21.514 + output_overrun:
  21.515 +    *out_len = op - out;
  21.516 +    return LZO_E_OUTPUT_OVERRUN;
  21.517 +
  21.518 + lookbehind_overrun:
  21.519 +    *out_len = op - out;
  21.520 +    return LZO_E_LOOKBEHIND_OVERRUN;
  21.521 +}
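
For reference, the two entry points above pair up as follows; this is an
illustrative sketch only (not code from this changeset), and it assumes the
LZO1X_1_MEM_COMPRESS and lzo1x_worst_compress() definitions that xen/lzo.h
carries over from the Linux header:

    /* Round-trip one page through LZO1X, as tmem does when compression
     * is enabled. */
    static int lzo_roundtrip_demo(const unsigned char *page /* PAGE_SIZE bytes */)
    {
        /* static keeps the ~20kB of scratch and buffers off the Xen
         * stack; a real caller would serialise or use per-cpu copies */
        static unsigned char wrkmem[LZO1X_1_MEM_COMPRESS];
        static unsigned char dst[lzo1x_worst_compress(PAGE_SIZE)];
        static unsigned char back[PAGE_SIZE];
        size_t dst_len = sizeof(dst), back_len = PAGE_SIZE;
        int ret;

        ret = lzo1x_1_compress(page, PAGE_SIZE, dst, &dst_len, wrkmem);
        if ( ret != LZO_E_OK )
            return ret;
        /* incompressible data can grow; tmem only keeps the compressed
         * copy when dst_len actually came out below PAGE_SIZE */
        ret = lzo1x_decompress_safe(dst, dst_len, back, &back_len);
        if ( ret != LZO_E_OK || back_len != PAGE_SIZE )
            return -1;
        return memcmp(page, back, PAGE_SIZE) ? -1 : 0;
    }

Note that lzo1x_decompress_safe() takes *out_len as the capacity of the
output buffer on entry and returns the number of bytes produced, so a caller
must re-initialise it before every call.
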
    22.1 --- a/xen/common/memory.c	Tue May 26 10:14:34 2009 +0100
    22.2 +++ b/xen/common/memory.c	Tue May 26 11:05:04 2009 +0100
    22.3 @@ -560,17 +560,6 @@ long do_memory_op(unsigned long cmd, XEN
    22.4      return rc;
    22.5  }
    22.6  
    22.7 -/* Temporary placeholder. */
    22.8 -int do_tmem_op(void *tmem_op)
    22.9 -{
   22.10 -    static bool_t warned;
   22.11 -
   22.12 -    if ( !test_and_set_bool(warned) )
   22.13 -        printk("tmem: not implemented\n");
   22.14 -
   22.15 -    return -ENOSYS;
   22.16 -}
   22.17 -
   22.18  /*
   22.19   * Local variables:
   22.20   * mode: C
    23.1 --- a/xen/common/page_alloc.c	Tue May 26 10:14:34 2009 +0100
    23.2 +++ b/xen/common/page_alloc.c	Tue May 26 11:05:04 2009 +0100
    23.3 @@ -35,6 +35,7 @@
    23.4  #include <xen/perfc.h>
    23.5  #include <xen/numa.h>
    23.6  #include <xen/nodemask.h>
    23.7 +#include <xen/tmem.h>
    23.8  #include <public/sysctl.h>
    23.9  #include <asm/page.h>
   23.10  #include <asm/numa.h>
   23.11 @@ -335,9 +336,9 @@ static unsigned long init_node_heap(int 
   23.12  /* Allocate 2^@order contiguous pages. */
   23.13  static struct page_info *alloc_heap_pages(
   23.14      unsigned int zone_lo, unsigned int zone_hi,
   23.15 -    unsigned int node, unsigned int order)
   23.16 +    unsigned int node, unsigned int order, unsigned int memflags)
   23.17  {
   23.18 -    unsigned int i, j, zone;
   23.19 +    unsigned int i, j, zone = 0;
   23.20      unsigned int num_nodes = num_online_nodes();
   23.21      unsigned long request = 1UL << order;
   23.22      cpumask_t extra_cpus_mask, mask;
   23.23 @@ -380,6 +381,14 @@ static struct page_info *alloc_heap_page
   23.24              node = 0;
   23.25      }
   23.26  
   23.27 +    /* Try to free memory from tmem */
   23.28 +    if ( (pg = tmem_relinquish_pages(order, memflags)) != NULL )
   23.29 +    {
   23.30 +        /* reassigning an already allocated anonymous heap page */
   23.31 +        spin_unlock(&heap_lock);
   23.32 +        return pg;
   23.33 +    }
   23.34 +
   23.35      /* No suitable memory blocks. Fail the request. */
   23.36      spin_unlock(&heap_lock);
   23.37      return NULL;
   23.38 @@ -1018,8 +1027,8 @@ void *alloc_xenheap_pages(unsigned int o
   23.39  
   23.40      ASSERT(!in_irq());
   23.41  
   23.42 -    pg = alloc_heap_pages(
   23.43 -        MEMZONE_XEN, MEMZONE_XEN, cpu_to_node(smp_processor_id()), order);
   23.44 +    pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN,
   23.45 +        cpu_to_node(smp_processor_id()), order, memflags);
   23.46      if ( unlikely(pg == NULL) )
   23.47          return NULL;
   23.48  
   23.49 @@ -1172,11 +1181,11 @@ struct page_info *alloc_domheap_pages(
   23.50          return NULL;
   23.51  
   23.52      if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) )
   23.53 -        pg = alloc_heap_pages(dma_zone + 1, zone_hi, node, order);
   23.54 +        pg = alloc_heap_pages(dma_zone + 1, zone_hi, node, order, memflags);
   23.55  
   23.56      if ( (pg == NULL) &&
   23.57           ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi,
   23.58 -                                 node, order)) == NULL) )
   23.59 +                                 node, order, memflags)) == NULL) )
   23.60           return NULL;
   23.61  
   23.62      if ( (d != NULL) && assign_pages(d, pg, order, memflags) )
   23.63 @@ -1373,6 +1382,28 @@ static void page_scrub_softirq(void)
   23.64      spin_unlock(&serialise_lock);
   23.65  }
   23.66  
   23.67 +void scrub_list_splice(struct page_list_head *list)
   23.68 +{
   23.69 +    spin_lock(&page_scrub_lock);
   23.70 +    page_list_splice(list, &page_scrub_list);
   23.71 +    spin_unlock(&page_scrub_lock);
   23.72 +}
   23.73 +
   23.74 +void scrub_list_add(struct page_info *pg)
   23.75 +{
   23.76 +    spin_lock(&page_scrub_lock);
   23.77 +    page_list_add(pg, &page_scrub_list);
   23.78 +    spin_unlock(&page_scrub_lock);
   23.79 +}
   23.80 +
   23.81 +void scrub_one_page(struct page_info *pg)
   23.82 +{
   23.83 +    void *p = map_domain_page(page_to_mfn(pg));
   23.84 +
   23.85 +    scrub_page(p);
   23.86 +    unmap_domain_page(p);
   23.87 +}
   23.88 +
   23.89  static void page_scrub_timer_fn(void *unused)
   23.90  {
   23.91      page_scrub_schedule_work();
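
As an illustrative sketch only (not code from this changeset), the newly
exported scrub helpers give a caller such as tmem two ways to retire a page
whose contents must not leak; free_domheap_page() is the existing heap
interface:

    static void retire_page_demo(struct page_info *pg, bool_t defer)
    {
        if ( defer )
            /* queued pages are scrubbed (and freed) later by the
             * page-scrub softirq, off the caller's critical path */
            scrub_list_add(pg);
        else
        {
            scrub_one_page(pg);     /* zero the contents now... */
            free_domheap_page(pg);  /* ...then return the page to the heap */
        }
    }
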
    24.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    24.2 +++ b/xen/common/radix-tree.c	Tue May 26 11:05:04 2009 +0100
    24.3 @@ -0,0 +1,448 @@
    24.4 +/*
    24.5 + * Copyright (C) 2001 Momchil Velikov
    24.6 + * Portions Copyright (C) 2001 Christoph Hellwig
    24.7 + * Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
    24.8 + *
    24.9 + * This program is free software; you can redistribute it and/or
   24.10 + * modify it under the terms of the GNU General Public License as
   24.11 + * published by the Free Software Foundation; either version 2, or (at
   24.12 + * your option) any later version.
   24.13 + *
   24.14 + * This program is distributed in the hope that it will be useful, but
   24.15 + * WITHOUT ANY WARRANTY; without even the implied warranty of
   24.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   24.17 + * General Public License for more details.
   24.18 + *
   24.19 + * You should have received a copy of the GNU General Public License
   24.20 + * along with this program; if not, write to the Free Software
   24.21 + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
   24.22 + */
   24.23 +
   24.24 +/*
    24.25 + * Copyright (C) 2009 adaptation for Xen tmem by Dan Magenheimer, Oracle Corp.
   24.26 + * Changed:
   24.27 + * o Linux 2.6.18 source used (prior to read-copy-update addition)
   24.28 + * o constants and data structures moved out to radix-tree.h header
   24.29 + * o tagging code removed
   24.30 + * o radix_tree_insert has func parameter for dynamic data struct allocation
   24.31 + * o radix_tree_destroy added (including recursive helper function)
   24.32 + * o __init functions must be called explicitly
   24.33 + * o other include files adapted to Xen
   24.34 + */
   24.35 +
   24.36 +#include <xen/config.h>
   24.37 +#include <xen/lib.h>
   24.38 +#include <xen/types.h>
   24.39 +#include <xen/errno.h>
   24.40 +#include <xen/radix-tree.h>
   24.41 +#include <asm/cache.h>
   24.42 +
   24.43 +static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly;
   24.44 +
   24.45 +/*
    24.46 + * Return the maximum key which can be stored into a
   24.47 + * radix tree with height HEIGHT.
   24.48 + */
   24.49 +static inline unsigned long radix_tree_maxindex(unsigned int height)
   24.50 +{
   24.51 +    return height_to_maxindex[height];
   24.52 +}
   24.53 +
   24.54 +/*
   24.55 + * Extend a radix tree so it can store key @index.
   24.56 + */
   24.57 +static int radix_tree_extend(struct radix_tree_root *root, unsigned long index,
   24.58 +                             struct radix_tree_node *(*node_alloc)(void *), void *arg)
   24.59 +{
   24.60 +    struct radix_tree_node *node;
   24.61 +    unsigned int height;
   24.62 +
   24.63 +    /* Figure out what the height should be.  */
   24.64 +    height = root->height + 1;
    24.66 +    while (index > radix_tree_maxindex(height))
    24.67 +        height++;
   24.68 +
   24.69 +    if (root->rnode == NULL) {
   24.70 +        root->height = height;
   24.71 +        goto out;
   24.72 +    }
   24.73 +
   24.74 +    do {
   24.75 +        if (!(node = node_alloc(arg)))
   24.76 +            return -ENOMEM;
   24.77 +
   24.78 +        /* Increase the height.  */
   24.79 +        node->slots[0] = root->rnode;
   24.80 +
   24.81 +        node->count = 1;
   24.82 +        root->rnode = node;
   24.83 +        root->height++;
   24.84 +    } while (height > root->height);
   24.85 + out:
   24.86 +    return 0;
   24.87 +}
   24.88 +
   24.89 +/**
   24.90 + * radix_tree_insert    -    insert into a radix tree
   24.91 + * @root:  radix tree root
   24.92 + * @index:  index key
   24.93 + * @item:  item to insert
   24.94 + *
   24.95 + * Insert an item into the radix tree at position @index.
   24.96 + */
   24.97 +int radix_tree_insert(struct radix_tree_root *root, unsigned long index,
   24.98 +                      void *item, struct radix_tree_node *(*node_alloc)(void *), void *arg)
   24.99 +{
  24.100 +    struct radix_tree_node *node = NULL, *slot;
  24.101 +    unsigned int height, shift;
  24.102 +    int offset;
  24.103 +    int error;
  24.104 +
  24.105 +    /* Make sure the tree is high enough.  */
  24.106 +    if (index > radix_tree_maxindex(root->height)) {
  24.107 +        error = radix_tree_extend(root, index, node_alloc, arg);
  24.108 +        if (error)
  24.109 +            return error;
  24.110 +    }
  24.111 +
  24.112 +    slot = root->rnode;
  24.113 +    height = root->height;
  24.114 +    shift = (height-1) * RADIX_TREE_MAP_SHIFT;
  24.115 +
  24.116 +    offset = 0;   /* uninitialised var warning */
  24.117 +    while (height > 0) {
  24.118 +        if (slot == NULL) {
  24.119 +            /* Have to add a child node.  */
  24.120 +            if (!(slot = node_alloc(arg)))
  24.121 +                return -ENOMEM;
   24.122 +            if (node) {
   24.123 +                node->slots[offset] = slot;
  24.125 +                node->count++;
  24.126 +            } else
  24.127 +                root->rnode = slot;
  24.128 +        }
  24.129 +
  24.130 +        /* Go a level down */
  24.131 +        offset = (index >> shift) & RADIX_TREE_MAP_MASK;
  24.132 +        node = slot;
  24.133 +        slot = node->slots[offset];
  24.134 +        shift -= RADIX_TREE_MAP_SHIFT;
  24.135 +        height--;
  24.136 +    }
  24.137 +
  24.138 +    if (slot != NULL)
  24.139 +        return -EEXIST;
  24.140 +
  24.141 +    if (node) {
  24.142 +        node->count++;
  24.143 +        node->slots[offset] = item;
  24.144 +    } else {
  24.145 +        root->rnode = item;
  24.146 +    }
  24.147 +
  24.148 +    return 0;
  24.149 +}
  24.150 +EXPORT_SYMBOL(radix_tree_insert);
  24.151 +
  24.152 +static inline void **__lookup_slot(struct radix_tree_root *root,
  24.153 +                                   unsigned long index)
  24.154 +{
  24.155 +    unsigned int height, shift;
  24.156 +    struct radix_tree_node **slot;
  24.157 +
  24.158 +    height = root->height;
  24.159 +
  24.160 +    if (index > radix_tree_maxindex(height))
  24.161 +        return NULL;
  24.162 +
  24.163 +    if (height == 0 && root->rnode)
  24.164 +        return (void **)&root->rnode;
  24.165 +
  24.166 +    shift = (height-1) * RADIX_TREE_MAP_SHIFT;
  24.167 +    slot = &root->rnode;
  24.168 +
  24.169 +    while (height > 0) {
  24.170 +        if (*slot == NULL)
  24.171 +            return NULL;
  24.172 +
  24.173 +        slot = (struct radix_tree_node **)
  24.174 +            ((*slot)->slots +
  24.175 +             ((index >> shift) & RADIX_TREE_MAP_MASK));
  24.176 +        shift -= RADIX_TREE_MAP_SHIFT;
  24.177 +        height--;
  24.178 +    }
  24.179 +
  24.180 +    return (void **)slot;
  24.181 +}
  24.182 +
  24.183 +/**
  24.184 + * radix_tree_lookup_slot    -    lookup a slot in a radix tree
  24.185 + * @root:  radix tree root
  24.186 + * @index:  index key
  24.187 + *
  24.188 + * Lookup the slot corresponding to the position @index in the radix tree
  24.189 + * @root. This is useful for update-if-exists operations.
  24.190 + */
  24.191 +void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
  24.192 +{
  24.193 +    return __lookup_slot(root, index);
  24.194 +}
  24.195 +EXPORT_SYMBOL(radix_tree_lookup_slot);
  24.196 +
  24.197 +/**
  24.198 + * radix_tree_lookup    -    perform lookup operation on a radix tree
  24.199 + * @root:  radix tree root
  24.200 + * @index:  index key
  24.201 + *
  24.202 + * Lookup the item at the position @index in the radix tree @root.
  24.203 + */
  24.204 +void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
  24.205 +{
  24.206 +    void **slot;
  24.207 +
  24.208 +    slot = __lookup_slot(root, index);
  24.209 +    return slot != NULL ? *slot : NULL;
  24.210 +}
  24.211 +EXPORT_SYMBOL(radix_tree_lookup);
  24.212 +
  24.213 +static unsigned int
  24.214 +__lookup(struct radix_tree_root *root, void **results, unsigned long index,
  24.215 +         unsigned int max_items, unsigned long *next_index)
  24.216 +{
  24.217 +    unsigned int nr_found = 0;
  24.218 +    unsigned int shift, height;
  24.219 +    struct radix_tree_node *slot;
  24.220 +    unsigned long i;
  24.221 +
  24.222 +    height = root->height;
   24.223 +    if (height == 0 || index > radix_tree_maxindex(height)) {
   24.224 +        if (root->rnode && index == 0)
   24.225 +            results[nr_found++] = root->rnode;
   24.226 +        goto out;
   24.227 +    }
  24.229 +
  24.230 +    shift = (height-1) * RADIX_TREE_MAP_SHIFT;
  24.231 +    slot = root->rnode;
  24.232 +
  24.233 +    for ( ; height > 1; height--) {
  24.234 +
  24.235 +        for (i = (index >> shift) & RADIX_TREE_MAP_MASK ;
  24.236 +             i < RADIX_TREE_MAP_SIZE; i++) {
  24.237 +            if (slot->slots[i] != NULL)
  24.238 +                break;
  24.239 +            index &= ~((1UL << shift) - 1);
  24.240 +            index += 1UL << shift;
  24.241 +            if (index == 0)
  24.242 +                goto out; /* 32-bit wraparound */
  24.243 +        }
  24.244 +        if (i == RADIX_TREE_MAP_SIZE)
  24.245 +            goto out;
  24.246 +
  24.247 +        shift -= RADIX_TREE_MAP_SHIFT;
  24.248 +        slot = slot->slots[i];
  24.249 +    }
  24.250 +
  24.251 +    /* Bottom level: grab some items */
  24.252 +    for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
  24.253 +        index++;
  24.254 +        if (slot->slots[i]) {
  24.255 +            results[nr_found++] = slot->slots[i];
  24.256 +            if (nr_found == max_items)
  24.257 +                goto out;
  24.258 +        }
  24.259 +    }
  24.260 + out:
  24.261 +    *next_index = index;
  24.262 +    return nr_found;
  24.263 +}
  24.264 +
  24.265 +/**
  24.266 + * radix_tree_gang_lookup - perform multiple lookup on a radix tree
  24.267 + * @root:  radix tree root
  24.268 + * @results: where the results of the lookup are placed
  24.269 + * @first_index: start the lookup from this key
  24.270 + * @max_items: place up to this many items at *results
  24.271 + *
  24.272 + * Performs an index-ascending scan of the tree for present items.  Places
  24.273 + * them at *@results and returns the number of items which were placed at
  24.274 + * *@results.
  24.275 + *
  24.276 + * The implementation is naive.
  24.277 + */
  24.278 +unsigned int
  24.279 +radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
  24.280 +                       unsigned long first_index, unsigned int max_items)
  24.281 +{
  24.282 +    const unsigned long max_index = radix_tree_maxindex(root->height);
  24.283 +    unsigned long cur_index = first_index;
  24.284 +    unsigned int ret = 0;
  24.285 +
  24.286 +    while (ret < max_items) {
  24.287 +        unsigned int nr_found;
  24.288 +        unsigned long next_index; /* Index of next search */
  24.289 +
  24.290 +        if (cur_index > max_index)
  24.291 +            break;
  24.292 +        nr_found = __lookup(root, results + ret, cur_index,
  24.293 +                            max_items - ret, &next_index);
  24.294 +        ret += nr_found;
  24.295 +        if (next_index == 0)
  24.296 +            break;
  24.297 +        cur_index = next_index;
  24.298 +    }
  24.299 +    return ret;
  24.300 +}
  24.301 +EXPORT_SYMBOL(radix_tree_gang_lookup);
  24.302 +
  24.303 +/**
  24.304 + * radix_tree_shrink    -    shrink height of a radix tree to minimal
   24.305 + * @root:  radix tree root
  24.306 + */
  24.307 +static inline void radix_tree_shrink(struct radix_tree_root *root,
  24.308 +                                     void (*node_free)(struct radix_tree_node *))
  24.309 +{
  24.310 +    /* try to shrink tree height */
  24.311 +    while (root->height > 0 &&
  24.312 +           root->rnode->count == 1 &&
  24.313 +           root->rnode->slots[0]) {
  24.314 +        struct radix_tree_node *to_free = root->rnode;
  24.315 +
  24.316 +        root->rnode = to_free->slots[0];
  24.317 +        root->height--;
  24.318 +        to_free->slots[0] = NULL;
  24.319 +        to_free->count = 0;
  24.320 +        node_free(to_free);
  24.321 +    }
  24.322 +}
  24.323 +
  24.324 +/**
  24.325 + * radix_tree_delete    -    delete an item from a radix tree
  24.326 + * @root:  radix tree root
  24.327 + * @index:  index key
  24.328 + *
  24.329 + * Remove the item at @index from the radix tree rooted at @root.
  24.330 + *
  24.331 + * Returns the address of the deleted item, or NULL if it was not present.
  24.332 + */
  24.333 +void *radix_tree_delete(struct radix_tree_root *root, unsigned long index,
  24.334 +                        void(*node_free)(struct radix_tree_node *))
  24.335 +{
  24.336 +    struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path;
  24.337 +    struct radix_tree_node *slot = NULL;
  24.338 +    unsigned int height, shift;
  24.339 +    int offset;
  24.340 +
  24.341 +    height = root->height;
  24.342 +    if (index > radix_tree_maxindex(height))
  24.343 +        goto out;
  24.344 +
  24.345 +    slot = root->rnode;
  24.346 +    if (height == 0 && root->rnode) {
  24.347 +        root->rnode = NULL;
  24.348 +        goto out;
  24.349 +    }
  24.350 +
  24.351 +    shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
  24.352 +    pathp->node = NULL;
  24.353 +
  24.354 +    do {
  24.355 +        if (slot == NULL)
  24.356 +            goto out;
  24.357 +
  24.358 +        pathp++;
  24.359 +        offset = (index >> shift) & RADIX_TREE_MAP_MASK;
  24.360 +        pathp->offset = offset;
  24.361 +        pathp->node = slot;
  24.362 +        slot = slot->slots[offset];
  24.363 +        shift -= RADIX_TREE_MAP_SHIFT;
  24.364 +        height--;
  24.365 +    } while (height > 0);
  24.366 +
  24.367 +    if (slot == NULL)
  24.368 +        goto out;
  24.369 +
  24.370 +    /* Now free the nodes we do not need anymore */
  24.371 +    while (pathp->node) {
  24.372 +        pathp->node->slots[pathp->offset] = NULL;
  24.373 +        pathp->node->count--;
  24.374 +
  24.375 +        if (pathp->node->count) {
  24.376 +            if (pathp->node == root->rnode)
  24.377 +                radix_tree_shrink(root, node_free);
  24.378 +            goto out;
  24.379 +        }
  24.380 +
  24.381 +        /* Node with zero slots in use so free it */
  24.382 +        node_free(pathp->node);
  24.383 +
  24.384 +        pathp--;
  24.385 +    }
  24.386 +    root->height = 0;
  24.387 +    root->rnode = NULL;
  24.388 +
  24.389 + out:
  24.390 +    return slot;
  24.391 +}
  24.392 +EXPORT_SYMBOL(radix_tree_delete);
  24.393 +
  24.394 +static void
  24.395 +radix_tree_node_destroy(struct radix_tree_node *node, unsigned int height,
  24.396 +                        void (*slot_free)(void *), void (*node_free)(struct radix_tree_node *))
  24.397 +{
  24.398 +    int i;
  24.399 +
  24.400 +    if (height == 0)
  24.401 +        return;
  24.402 +    for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
  24.403 +        if (node->slots[i]) {
  24.404 +            if (height == 1) {
  24.405 +                slot_free(node->slots[i]);
  24.406 +                node->slots[i] = NULL;
  24.407 +                continue;
  24.408 +            }
  24.409 +            radix_tree_node_destroy(node->slots[i], height-1,
  24.410 +                                    slot_free, node_free);
  24.411 +            node_free(node->slots[i]);
  24.412 +            node->slots[i] = NULL;
  24.413 +        }
  24.414 +    }
  24.415 +}
  24.416 +
  24.417 +void radix_tree_destroy(struct radix_tree_root *root,
  24.418 +                        void (*slot_free)(void *), void (*node_free)(struct radix_tree_node *))
  24.419 +{
  24.420 +    if (root->rnode == NULL)
  24.421 +        return;
  24.422 +    if (root->height == 0)
  24.423 +        slot_free(root->rnode);
  24.424 +    else {
  24.425 +        radix_tree_node_destroy(root->rnode, root->height,
  24.426 +                                slot_free, node_free);
  24.427 +        node_free(root->rnode);
  24.428 +        root->height = 0;
  24.429 +    }
  24.430 +    root->rnode = NULL;
  24.431 +    /* caller must delete root if desired */
  24.432 +}
  24.433 +EXPORT_SYMBOL(radix_tree_destroy);
  24.434 +
  24.435 +static /*__init*/ unsigned long __maxindex(unsigned int height)
  24.436 +{
  24.437 +    unsigned int tmp = height * RADIX_TREE_MAP_SHIFT;
  24.438 +    unsigned long index = (~0UL >> (RADIX_TREE_INDEX_BITS - tmp - 1)) >> 1;
  24.439 +
  24.440 +    if (tmp >= RADIX_TREE_INDEX_BITS)
  24.441 +        index = ~0UL;
  24.442 +    return index;
  24.443 +}
  24.444 +
  24.445 +/*__init*/ void radix_tree_init(void)
  24.446 +{
  24.447 +    unsigned int i;
  24.448 +
  24.449 +    for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
  24.450 +        height_to_maxindex[i] = __maxindex(i);
  24.451 +}
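
The adapted interface threads all node allocation and freeing through
caller-supplied hooks. As an illustrative sketch only (not code from this
changeset), the tree can be driven with plain xmalloc()-backed hooks; tmem
instead wraps each radix_tree_node inside its own objnode_t (see tmem.c
below). Nodes must be handed over zeroed, hence the memset:

    static struct radix_tree_node *rtn_alloc_demo(void *arg /* unused */)
    {
        struct radix_tree_node *rtn = xmalloc(struct radix_tree_node);

        if ( rtn != NULL )
            memset(rtn, 0, sizeof(*rtn));
        return rtn;
    }

    static void rtn_free_demo(struct radix_tree_node *rtn)
    {
        xfree(rtn);
    }

    static void radix_demo(void)
    {
        struct radix_tree_root root;
        void *item = &root;  /* any non-NULL pointer serves as a payload */

        radix_tree_init();   /* one-time: fills in height_to_maxindex[] */
        memset(&root, 0, sizeof(root));  /* empty tree: height 0, no rnode */
        if ( radix_tree_insert(&root, 42, item, rtn_alloc_demo, NULL) == 0 )
        {
            ASSERT(radix_tree_lookup(&root, 42) == item);
            radix_tree_delete(&root, 42, rtn_free_demo);  /* returns item */
        }
    }
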
    25.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    25.2 +++ b/xen/common/rbtree.c	Tue May 26 11:05:04 2009 +0100
    25.3 @@ -0,0 +1,398 @@
    25.4 +/*
    25.5 +  Red Black Trees
    25.6 +  (C) 1999  Andrea Arcangeli <andrea@suse.de>
    25.7 +  (C) 2002  David Woodhouse <dwmw2@infradead.org>
    25.8 +  
    25.9 +  This program is free software; you can redistribute it and/or modify
   25.10 +  it under the terms of the GNU General Public License as published by
   25.11 +  the Free Software Foundation; either version 2 of the License, or
   25.12 +  (at your option) any later version.
   25.13 +
   25.14 +  This program is distributed in the hope that it will be useful,
   25.15 +  but WITHOUT ANY WARRANTY; without even the implied warranty of
   25.16 +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   25.17 +  GNU General Public License for more details.
   25.18 +
   25.19 +  You should have received a copy of the GNU General Public License
   25.20 +  along with this program; if not, write to the Free Software
   25.21 +  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   25.22 +
   25.23 +  linux/lib/rbtree.c
   25.24 +*/
   25.25 +
   25.26 +#include <xen/config.h>
   25.27 +#include <xen/types.h>
   25.28 +#include <xen/rbtree.h>
   25.29 +
   25.30 +static void __rb_rotate_left(struct rb_node *node, struct rb_root *root)
   25.31 +{
   25.32 +    struct rb_node *right = node->rb_right;
   25.33 +    struct rb_node *parent = rb_parent(node);
   25.34 +
   25.35 +    if ((node->rb_right = right->rb_left))
   25.36 +        rb_set_parent(right->rb_left, node);
   25.37 +    right->rb_left = node;
   25.38 +
   25.39 +    rb_set_parent(right, parent);
   25.40 +
   25.41 +    if (parent)
   25.42 +    {
   25.43 +        if (node == parent->rb_left)
   25.44 +            parent->rb_left = right;
   25.45 +        else
   25.46 +            parent->rb_right = right;
   25.47 +    }
   25.48 +    else
   25.49 +        root->rb_node = right;
   25.50 +    rb_set_parent(node, right);
   25.51 +}
   25.52 +
   25.53 +static void __rb_rotate_right(struct rb_node *node, struct rb_root *root)
   25.54 +{
   25.55 +    struct rb_node *left = node->rb_left;
   25.56 +    struct rb_node *parent = rb_parent(node);
   25.57 +
   25.58 +    if ((node->rb_left = left->rb_right))
   25.59 +        rb_set_parent(left->rb_right, node);
   25.60 +    left->rb_right = node;
   25.61 +
   25.62 +    rb_set_parent(left, parent);
   25.63 +
   25.64 +    if (parent)
   25.65 +    {
   25.66 +        if (node == parent->rb_right)
   25.67 +            parent->rb_right = left;
   25.68 +        else
   25.69 +            parent->rb_left = left;
   25.70 +    }
   25.71 +    else
   25.72 +        root->rb_node = left;
   25.73 +    rb_set_parent(node, left);
   25.74 +}
   25.75 +
   25.76 +void rb_insert_color(struct rb_node *node, struct rb_root *root)
   25.77 +{
   25.78 +    struct rb_node *parent, *gparent;
   25.79 +
   25.80 +    while ((parent = rb_parent(node)) && rb_is_red(parent))
   25.81 +    {
   25.82 +        gparent = rb_parent(parent);
   25.83 +
   25.84 +        if (parent == gparent->rb_left)
   25.85 +        {
   25.86 +            {
   25.87 +                register struct rb_node *uncle = gparent->rb_right;
   25.88 +                if (uncle && rb_is_red(uncle))
   25.89 +                {
   25.90 +                    rb_set_black(uncle);
   25.91 +                    rb_set_black(parent);
   25.92 +                    rb_set_red(gparent);
   25.93 +                    node = gparent;
   25.94 +                    continue;
   25.95 +                }
   25.96 +            }
   25.97 +
   25.98 +            if (parent->rb_right == node)
   25.99 +            {
  25.100 +                register struct rb_node *tmp;
  25.101 +                __rb_rotate_left(parent, root);
  25.102 +                tmp = parent;
  25.103 +                parent = node;
  25.104 +                node = tmp;
  25.105 +            }
  25.106 +
  25.107 +            rb_set_black(parent);
  25.108 +            rb_set_red(gparent);
  25.109 +            __rb_rotate_right(gparent, root);
  25.110 +        } else {
  25.111 +            {
  25.112 +                register struct rb_node *uncle = gparent->rb_left;
  25.113 +                if (uncle && rb_is_red(uncle))
  25.114 +                {
  25.115 +                    rb_set_black(uncle);
  25.116 +                    rb_set_black(parent);
  25.117 +                    rb_set_red(gparent);
  25.118 +                    node = gparent;
  25.119 +                    continue;
  25.120 +                }
  25.121 +            }
  25.122 +
  25.123 +            if (parent->rb_left == node)
  25.124 +            {
  25.125 +                register struct rb_node *tmp;
  25.126 +                __rb_rotate_right(parent, root);
  25.127 +                tmp = parent;
  25.128 +                parent = node;
  25.129 +                node = tmp;
  25.130 +            }
  25.131 +
  25.132 +            rb_set_black(parent);
  25.133 +            rb_set_red(gparent);
  25.134 +            __rb_rotate_left(gparent, root);
  25.135 +        }
  25.136 +    }
  25.137 +
  25.138 +    rb_set_black(root->rb_node);
  25.139 +}
  25.140 +EXPORT_SYMBOL(rb_insert_color);
  25.141 +
  25.142 +static void __rb_erase_color(struct rb_node *node, struct rb_node *parent,
  25.143 +                             struct rb_root *root)
  25.144 +{
  25.145 +    struct rb_node *other;
  25.146 +
  25.147 +    while ((!node || rb_is_black(node)) && node != root->rb_node)
  25.148 +    {
  25.149 +        if (parent->rb_left == node)
  25.150 +        {
  25.151 +            other = parent->rb_right;
  25.152 +            if (rb_is_red(other))
  25.153 +            {
  25.154 +                rb_set_black(other);
  25.155 +                rb_set_red(parent);
  25.156 +                __rb_rotate_left(parent, root);
  25.157 +                other = parent->rb_right;
  25.158 +            }
  25.159 +            if ((!other->rb_left || rb_is_black(other->rb_left)) &&
  25.160 +                (!other->rb_right || rb_is_black(other->rb_right)))
  25.161 +            {
  25.162 +                rb_set_red(other);
  25.163 +                node = parent;
  25.164 +                parent = rb_parent(node);
  25.165 +            }
  25.166 +            else
  25.167 +            {
  25.168 +                if (!other->rb_right || rb_is_black(other->rb_right))
  25.169 +                {
  25.170 +                    struct rb_node *o_left;
  25.171 +                    if ((o_left = other->rb_left))
  25.172 +                        rb_set_black(o_left);
  25.173 +                    rb_set_red(other);
  25.174 +                    __rb_rotate_right(other, root);
  25.175 +                    other = parent->rb_right;
  25.176 +                }
  25.177 +                rb_set_color(other, rb_color(parent));
  25.178 +                rb_set_black(parent);
  25.179 +                if (other->rb_right)
  25.180 +                    rb_set_black(other->rb_right);
  25.181 +                __rb_rotate_left(parent, root);
  25.182 +                node = root->rb_node;
  25.183 +                break;
  25.184 +            }
  25.185 +        }
  25.186 +        else
  25.187 +        {
  25.188 +            other = parent->rb_left;
  25.189 +            if (rb_is_red(other))
  25.190 +            {
  25.191 +                rb_set_black(other);
  25.192 +                rb_set_red(parent);
  25.193 +                __rb_rotate_right(parent, root);
  25.194 +                other = parent->rb_left;
  25.195 +            }
  25.196 +            if ((!other->rb_left || rb_is_black(other->rb_left)) &&
  25.197 +                (!other->rb_right || rb_is_black(other->rb_right)))
  25.198 +            {
  25.199 +                rb_set_red(other);
  25.200 +                node = parent;
  25.201 +                parent = rb_parent(node);
  25.202 +            }
  25.203 +            else
  25.204 +            {
  25.205 +                if (!other->rb_left || rb_is_black(other->rb_left))
  25.206 +                {
  25.207 +                    register struct rb_node *o_right;
  25.208 +                    if ((o_right = other->rb_right))
  25.209 +                        rb_set_black(o_right);
  25.210 +                    rb_set_red(other);
  25.211 +                    __rb_rotate_left(other, root);
  25.212 +                    other = parent->rb_left;
  25.213 +                }
  25.214 +                rb_set_color(other, rb_color(parent));
  25.215 +                rb_set_black(parent);
  25.216 +                if (other->rb_left)
  25.217 +                    rb_set_black(other->rb_left);
  25.218 +                __rb_rotate_right(parent, root);
  25.219 +                node = root->rb_node;
  25.220 +                break;
  25.221 +            }
  25.222 +        }
  25.223 +    }
  25.224 +    if (node)
  25.225 +        rb_set_black(node);
  25.226 +}
  25.227 +
  25.228 +void rb_erase(struct rb_node *node, struct rb_root *root)
  25.229 +{
  25.230 +    struct rb_node *child, *parent;
  25.231 +    int color;
  25.232 +
  25.233 +    if (!node->rb_left)
  25.234 +        child = node->rb_right;
  25.235 +    else if (!node->rb_right)
  25.236 +        child = node->rb_left;
  25.237 +    else
  25.238 +    {
  25.239 +        struct rb_node *old = node, *left;
  25.240 +
  25.241 +        node = node->rb_right;
  25.242 +        while ((left = node->rb_left) != NULL)
  25.243 +            node = left;
  25.244 +        child = node->rb_right;
  25.245 +        parent = rb_parent(node);
  25.246 +        color = rb_color(node);
  25.247 +
  25.248 +        if (child)
  25.249 +            rb_set_parent(child, parent);
  25.250 +        if (parent == old) {
  25.251 +            parent->rb_right = child;
  25.252 +            parent = node;
  25.253 +        } else
  25.254 +            parent->rb_left = child;
  25.255 +
  25.256 +        node->rb_parent_color = old->rb_parent_color;
  25.257 +        node->rb_right = old->rb_right;
  25.258 +        node->rb_left = old->rb_left;
  25.259 +
  25.260 +        if (rb_parent(old))
  25.261 +        {
  25.262 +            if (rb_parent(old)->rb_left == old)
  25.263 +                rb_parent(old)->rb_left = node;
  25.264 +            else
  25.265 +                rb_parent(old)->rb_right = node;
  25.266 +        } else
  25.267 +            root->rb_node = node;
  25.268 +
  25.269 +        rb_set_parent(old->rb_left, node);
  25.270 +        if (old->rb_right)
  25.271 +            rb_set_parent(old->rb_right, node);
  25.272 +        goto color;
  25.273 +    }
  25.274 +
  25.275 +    parent = rb_parent(node);
  25.276 +    color = rb_color(node);
  25.277 +
  25.278 +    if (child)
  25.279 +        rb_set_parent(child, parent);
  25.280 +    if (parent)
  25.281 +    {
  25.282 +        if (parent->rb_left == node)
  25.283 +            parent->rb_left = child;
  25.284 +        else
  25.285 +            parent->rb_right = child;
  25.286 +    }
  25.287 +    else
  25.288 +        root->rb_node = child;
  25.289 +
  25.290 + color:
  25.291 +    if (color == RB_BLACK)
  25.292 +        __rb_erase_color(child, parent, root);
  25.293 +}
  25.294 +EXPORT_SYMBOL(rb_erase);
  25.295 +
  25.296 +/*
  25.297 + * This function returns the first node (in sort order) of the tree.
  25.298 + */
  25.299 +struct rb_node *rb_first(struct rb_root *root)
  25.300 +{
  25.301 +    struct rb_node *n;
  25.302 +
  25.303 +    n = root->rb_node;
  25.304 +    if (!n)
  25.305 +        return NULL;
  25.306 +    while (n->rb_left)
  25.307 +        n = n->rb_left;
  25.308 +    return n;
  25.309 +}
  25.310 +EXPORT_SYMBOL(rb_first);
  25.311 +
  25.312 +struct rb_node *rb_last(struct rb_root *root)
  25.313 +{
  25.314 +    struct rb_node *n;
  25.315 +
  25.316 +    n = root->rb_node;
  25.317 +    if (!n)
  25.318 +        return NULL;
  25.319 +    while (n->rb_right)
  25.320 +        n = n->rb_right;
  25.321 +    return n;
  25.322 +}
  25.323 +EXPORT_SYMBOL(rb_last);
  25.324 +
  25.325 +struct rb_node *rb_next(struct rb_node *node)
  25.326 +{
  25.327 +    struct rb_node *parent;
  25.328 +
  25.329 +    if (rb_parent(node) == node)
  25.330 +        return NULL;
  25.331 +
  25.332 +    /* If we have a right-hand child, go down and then left as far
  25.333 +       as we can. */
  25.334 +    if (node->rb_right) {
  25.335 +        node = node->rb_right; 
  25.336 +        while (node->rb_left)
  25.337 +            node=node->rb_left;
  25.338 +        return node;
  25.339 +    }
  25.340 +
  25.341 +    /* No right-hand children.  Everything down and left is
  25.342 +       smaller than us, so any 'next' node must be in the general
  25.343 +       direction of our parent. Go up the tree; any time the
  25.344 +       ancestor is a right-hand child of its parent, keep going
  25.345 +       up. First time it's a left-hand child of its parent, said
  25.346 +       parent is our 'next' node. */
  25.347 +    while ((parent = rb_parent(node)) && node == parent->rb_right)
  25.348 +        node = parent;
  25.349 +
  25.350 +    return parent;
  25.351 +}
  25.352 +EXPORT_SYMBOL(rb_next);
  25.353 +
  25.354 +struct rb_node *rb_prev(struct rb_node *node)
  25.355 +{
  25.356 +    struct rb_node *parent;
  25.357 +
  25.358 +    if (rb_parent(node) == node)
  25.359 +        return NULL;
  25.360 +
  25.361 +    /* If we have a left-hand child, go down and then right as far
  25.362 +       as we can. */
  25.363 +    if (node->rb_left) {
  25.364 +        node = node->rb_left; 
  25.365 +        while (node->rb_right)
  25.366 +            node=node->rb_right;
  25.367 +        return node;
  25.368 +    }
  25.369 +
  25.370 +    /* No left-hand children. Go up till we find an ancestor which
  25.371 +       is a right-hand child of its parent */
  25.372 +    while ((parent = rb_parent(node)) && node == parent->rb_left)
  25.373 +        node = parent;
  25.374 +
  25.375 +    return parent;
  25.376 +}
  25.377 +EXPORT_SYMBOL(rb_prev);
  25.378 +
  25.379 +void rb_replace_node(struct rb_node *victim, struct rb_node *new,
  25.380 +                     struct rb_root *root)
  25.381 +{
  25.382 +    struct rb_node *parent = rb_parent(victim);
  25.383 +
  25.384 +    /* Set the surrounding nodes to point to the replacement */
  25.385 +    if (parent) {
  25.386 +        if (victim == parent->rb_left)
  25.387 +            parent->rb_left = new;
  25.388 +        else
  25.389 +            parent->rb_right = new;
  25.390 +    } else {
  25.391 +        root->rb_node = new;
  25.392 +    }
  25.393 +    if (victim->rb_left)
  25.394 +        rb_set_parent(victim->rb_left, new);
  25.395 +    if (victim->rb_right)
  25.396 +        rb_set_parent(victim->rb_right, new);
  25.397 +
  25.398 +    /* Copy the pointers/colour from the victim to the replacement */
  25.399 +    *new = *victim;
  25.400 +}
  25.401 +EXPORT_SYMBOL(rb_replace_node);
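
This rbtree is intrusive (the rb_node is embedded in the user's structure)
and, as in Linux, each user supplies its own search and insert walks. As an
illustrative sketch only (not code from this changeset), keyed the way tmem
keys its objects, and assuming the usual rb_entry()/rb_link_node() helpers
from xen/rbtree.h:

    struct demo_node {
        struct rb_node rb;
        uint64_t key;
    };

    static struct demo_node *demo_find(struct rb_root *root, uint64_t key)
    {
        struct rb_node *n = root->rb_node;

        while ( n != NULL )
        {
            struct demo_node *d = rb_entry(n, struct demo_node, rb);

            if ( key < d->key )
                n = n->rb_left;
            else if ( key > d->key )
                n = n->rb_right;
            else
                return d;
        }
        return NULL;
    }

    static int demo_insert(struct rb_root *root, struct demo_node *new)
    {
        struct rb_node **p = &root->rb_node, *parent = NULL;

        while ( *p != NULL )
        {
            struct demo_node *d = rb_entry(*p, struct demo_node, rb);

            parent = *p;
            if ( new->key < d->key )
                p = &(*p)->rb_left;
            else if ( new->key > d->key )
                p = &(*p)->rb_right;
            else
                return -EEXIST;             /* duplicate key */
        }
        rb_link_node(&new->rb, parent, p);  /* link in as a red leaf... */
        rb_insert_color(&new->rb, root);    /* ...then rebalance */
        return 0;
    }
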
    26.1 --- a/xen/common/spinlock.c	Tue May 26 10:14:34 2009 +0100
    26.2 +++ b/xen/common/spinlock.c	Tue May 26 11:05:04 2009 +0100
    26.3 @@ -214,6 +214,12 @@ unsigned long _write_lock_irqsave(rwlock
    26.4      return flags;
    26.5  }
    26.6  
    26.7 +int _write_trylock(rwlock_t *lock)
    26.8 +{
    26.9 +    check_lock(&lock->debug);
   26.10 +    return _raw_write_trylock(&lock->raw);
   26.11 +}
   26.12 +
   26.13  void _write_unlock(rwlock_t *lock)
   26.14  {
   26.15      _raw_write_unlock(&lock->raw);
   26.16 @@ -236,3 +242,9 @@ int _rw_is_locked(rwlock_t *lock)
   26.17      check_lock(&lock->debug);
   26.18      return _raw_rw_is_locked(&lock->raw);
   26.19  }
   26.20 +
   26.21 +int _rw_is_write_locked(rwlock_t *lock)
   26.22 +{
   26.23 +    check_lock(&lock->debug);
   26.24 +    return _raw_rw_is_write_locked(&lock->raw);
   26.25 +}
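
As an illustrative sketch only (not code from this changeset), these
additions support an opportunistic locking pattern: try for the write lock
and simply move on if it is busy, rather than blocking. The
write_trylock()/rw_is_write_locked() wrapper macros are assumed to be added
alongside in xen/include/xen/spinlock.h:

    static int try_work_on_pool_demo(rwlock_t *pool_rwlock)
    {
        if ( !write_trylock(pool_rwlock) )
            return 0;        /* busy: the caller tries another pool */
        ASSERT(rw_is_write_locked(pool_rwlock));
        /* ... operate on the pool under the write lock ... */
        write_unlock(pool_rwlock);
        return 1;
    }
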
    27.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    27.2 +++ b/xen/common/tmem.c	Tue May 26 11:05:04 2009 +0100
    27.3 @@ -0,0 +1,2109 @@
    27.4 +/******************************************************************************
    27.5 + * tmem.c
    27.6 + *
    27.7 + * Transcendent memory
    27.8 + *
    27.9 + * Copyright (c) 2009, Dan Magenheimer, Oracle Corp.
   27.10 + */
   27.11 +
   27.12 +/* TODO list: 090129
   27.13 +   - improve on reclamation policy
   27.14 +   - use different tlsf pools for each client (maybe each pool)
   27.15 +   - implement page accounting and minimal QoS limits
   27.16 +   - test shared access more completely (need pv cluster fs)
   27.17 +   - add feedback-driven compression (not for persistent pools though!)
   27.18 +   - add data-structure total bytes overhead stats
   27.19 + */
   27.20 +
   27.21 +#ifdef __XEN__
    27.22 +#include <xen/tmem_xen.h> /* host-specific (e.g. Xen) code goes here */
   27.23 +#endif
   27.24 +
   27.25 +#include <xen/tmem.h>
   27.26 +#include <xen/rbtree.h>
   27.27 +#include <xen/radix-tree.h>
   27.28 +#include <xen/list.h>
   27.29 +
   27.30 +#define EXPORT /* indicates code other modules are dependent upon */
   27.31 +#define FORWARD
   27.32 +
   27.33 +/************  INTERFACE TO TMEM HOST-DEPENDENT (tmh) CODE ************/
   27.34 +
   27.35 +#define CLI_ID_NULL TMH_CLI_ID_NULL
   27.36 +#define cli_id_str  tmh_cli_id_str
   27.37 +#define client_str  tmh_client_str
   27.38 +
   27.39 +/************ DEBUG and STATISTICS (+ some compression testing) *******/
   27.40 +
   27.41 +#ifndef NDEBUG
   27.42 +#define SENTINELS
   27.43 +#define NOINLINE noinline
   27.44 +#else
   27.45 +#define NOINLINE
   27.46 +#endif
   27.47 +
   27.48 +#ifdef SENTINELS
   27.49 +#define DECL_SENTINEL unsigned long sentinel;
   27.50 +#define SET_SENTINEL(_x,_y) _x->sentinel = _y##_SENTINEL
   27.51 +#define INVERT_SENTINEL(_x,_y) _x->sentinel = ~_y##_SENTINEL
   27.52 +#define ASSERT_SENTINEL(_x,_y) \
    27.53 +    ASSERT(_x->sentinel != ~_y##_SENTINEL); ASSERT(_x->sentinel == _y##_SENTINEL)
   27.54 +#ifdef __i386__
   27.55 +#define POOL_SENTINEL 0x87658765
   27.56 +#define OBJ_SENTINEL 0x12345678
   27.57 +#define OBJNODE_SENTINEL 0xfedcba09
   27.58 +#define PGD_SENTINEL  0x43214321
   27.59 +#else
   27.60 +#define POOL_SENTINEL 0x8765876587658765
   27.61 +#define OBJ_SENTINEL 0x1234567812345678
   27.62 +#define OBJNODE_SENTINEL 0xfedcba0987654321
   27.63 +#define PGD_SENTINEL  0x4321432143214321
   27.64 +#endif
   27.65 +#else
   27.66 +#define DECL_SENTINEL
   27.67 +#define SET_SENTINEL(_x,_y) do { } while (0)
   27.68 +#define ASSERT_SENTINEL(_x,_y) do { } while (0)
   27.69 +#define INVERT_SENTINEL(_x,_y) do { } while (0)
   27.70 +#endif
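
As an illustrative sketch only (not code from this changeset, and using a
hypothetical struct), the sentinel macros bracket an object's lifetime: SET
on allocation, ASSERT on every use, INVERT on free.  A use-after-free then
trips the "!= ~SENTINEL" half of ASSERT_SENTINEL, which distinguishes a
freed object from randomly corrupted memory:

    #define DEMO_SENTINEL 0x5a5a5a5a
    struct demo {
        DECL_SENTINEL            /* adds a "sentinel" field (debug builds) */
        int payload;
    };

    static void demo_lifetime(struct demo *d)
    {
        SET_SENTINEL(d, DEMO);      /* d->sentinel = DEMO_SENTINEL */
        ASSERT_SENTINEL(d, DEMO);   /* cheap validity check on each use */
        d->payload = 0;
        INVERT_SENTINEL(d, DEMO);   /* d->sentinel = ~DEMO_SENTINEL: freed */
    }
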
   27.71 +
   27.72 +/* global statistics (none need to be locked) */
   27.73 +static unsigned long total_tmem_ops = 0;
   27.74 +static unsigned long errored_tmem_ops = 0;
   27.75 +static unsigned long total_flush_pool = 0;
   27.76 +static unsigned long alloc_failed = 0, alloc_page_failed = 0;
   27.77 +static unsigned long evicted_pgs = 0, evict_attempts = 0;
   27.78 +static unsigned long relinq_pgs = 0, relinq_attempts = 0;
   27.79 +static unsigned long max_evicts_per_relinq = 0;
   27.80 +static unsigned long low_on_memory = 0;
   27.81 +static int global_obj_count_max = 0;
   27.82 +static int global_pgp_count_max = 0;
   27.83 +static int global_page_count_max = 0;
   27.84 +static int global_rtree_node_count_max = 0;
   27.85 +static long global_eph_count_max = 0;
   27.86 +static unsigned long failed_copies;
   27.87 +
   27.88 +DECL_CYC_COUNTER(succ_get);
   27.89 +DECL_CYC_COUNTER(succ_put);
   27.90 +DECL_CYC_COUNTER(non_succ_get);
   27.91 +DECL_CYC_COUNTER(non_succ_put);
   27.92 +DECL_CYC_COUNTER(flush);
   27.93 +DECL_CYC_COUNTER(flush_obj);
   27.94 +#ifdef COMPARE_COPY_PAGE_SSE2
   27.95 +EXTERN_CYC_COUNTER(pg_copy1);
   27.96 +EXTERN_CYC_COUNTER(pg_copy2);
   27.97 +EXTERN_CYC_COUNTER(pg_copy3);
   27.98 +EXTERN_CYC_COUNTER(pg_copy4);
   27.99 +#else
  27.100 +EXTERN_CYC_COUNTER(pg_copy);
  27.101 +#endif
  27.102 +DECL_CYC_COUNTER(compress);
  27.103 +DECL_CYC_COUNTER(decompress);
  27.104 +
  27.105 +/************ CORE DATA STRUCTURES ************************************/
  27.106 +
  27.107 +#define MAX_POOLS_PER_DOMAIN 16
  27.108 +#define MAX_GLOBAL_SHARED_POOLS  16
  27.109 +
  27.110 +struct tm_pool;
  27.111 +struct client {
  27.112 +    struct list_head client_list;
  27.113 +    struct tm_pool *pools[MAX_POOLS_PER_DOMAIN];
  27.114 +    tmh_client_t *tmh;
  27.115 +    struct list_head ephemeral_page_list;
  27.116 +    long eph_count, eph_count_max;
  27.117 +    cli_id_t cli_id;
  27.118 +    uint32_t weight;
  27.119 +    uint32_t cap;
  27.120 +    bool_t compress;
  27.121 +    bool_t frozen;
  27.122 +    unsigned long compress_poor, compress_nomem;
  27.123 +    unsigned long compressed_pages;
  27.124 +    uint64_t compressed_sum_size;
  27.125 +};
  27.126 +typedef struct client client_t;
  27.127 +
  27.128 +struct share_list {
  27.129 +    struct list_head share_list;
  27.130 +    client_t *client;
  27.131 +};
  27.132 +typedef struct share_list sharelist_t;
  27.133 +
  27.134 +#define OBJ_HASH_BUCKETS 256 /* must be power of two */
  27.135 +#define OBJ_HASH_BUCKETS_MASK (OBJ_HASH_BUCKETS-1)
  27.136 +#define OBJ_HASH(_oid) (tmh_hash(_oid, BITS_PER_LONG) & OBJ_HASH_BUCKETS_MASK)
  27.137 +
  27.138 +struct tm_pool {
  27.139 +    bool_t shared;
  27.140 +    bool_t persistent;
  27.141 +    struct list_head pool_list; /* FIXME do we need this anymore? */
  27.142 +    client_t *client;
  27.143 +    uint64_t uuid[2]; /* 0 for private, non-zero for shared */
  27.144 +    uint32_t pool_id;
  27.145 +    rwlock_t pool_rwlock;
  27.146 +    struct rb_root obj_rb_root[OBJ_HASH_BUCKETS]; /* protected by pool_rwlock */
  27.147 +    struct list_head share_list; /* valid if shared */
  27.148 +    DECL_SENTINEL
  27.149 +    int shared_count; /* valid if shared */
  27.150 +    atomic_t pgp_count;
  27.151 +    int pgp_count_max;
  27.152 +    long obj_count;  /* atomicity depends on pool_rwlock held for write */
  27.153 +    long obj_count_max;  
  27.154 +    unsigned long objnode_count, objnode_count_max;
  27.155 +    uint64_t sum_life_cycles;
  27.156 +    uint64_t sum_evicted_cycles;
  27.157 +    unsigned long puts, good_puts, no_mem_puts;
  27.158 +    unsigned long dup_puts_flushed, dup_puts_replaced;
  27.159 +    unsigned long gets, found_gets;
  27.160 +    unsigned long flushs, flushs_found;
  27.161 +    unsigned long flush_objs, flush_objs_found;
  27.162 +};
  27.163 +typedef struct tm_pool pool_t;
  27.164 +
  27.165 +#define is_persistent(_p)  (_p->persistent)
  27.166 +#define is_ephemeral(_p)   (!(_p->persistent))
  27.167 +#define is_shared(_p)      (_p->shared)
  27.168 +#define is_private(_p)     (!(_p->shared))
  27.169 +
  27.170 +struct tmem_object_root {
  27.171 +    DECL_SENTINEL
  27.172 +    uint64_t oid;
  27.173 +    struct rb_node rb_tree_node; /* protected by pool->pool_rwlock */
  27.174 +    unsigned long objnode_count; /* atomicity depends on obj_spinlock */
  27.175 +    long pgp_count; /* atomicity depends on obj_spinlock */
  27.176 +    struct radix_tree_root tree_root; /* tree of pages within object */
  27.177 +    pool_t *pool;
  27.178 +    cli_id_t last_client;
  27.179 +    spinlock_t obj_spinlock;
  27.180 +    bool_t no_evict; /* if globally locked, pseudo-locks against eviction */
  27.181 +};
  27.182 +typedef struct tmem_object_root obj_t;
  27.183 +
  27.184 +typedef struct radix_tree_node rtn_t;
  27.185 +struct tmem_object_node {
  27.186 +    obj_t *obj;
  27.187 +    DECL_SENTINEL
  27.188 +    rtn_t rtn;
  27.189 +};
  27.190 +typedef struct tmem_object_node objnode_t;
  27.191 +
  27.192 +struct tmem_page_descriptor {
  27.193 +    struct list_head global_eph_pages;
  27.194 +    struct list_head client_eph_pages;
  27.195 +    obj_t *obj;
  27.196 +    uint32_t index;
  27.197 +    size_t size; /* 0 == PAGE_SIZE (pfp), else compressed data (cdata) */
  27.198 +    union {
  27.199 +        pfp_t *pfp;  /* page frame pointer */
  27.200 +        char *cdata; /* compressed data */
  27.201 +    };
  27.202 +    uint64_t timestamp;
  27.203 +    DECL_SENTINEL
  27.204 +};
  27.205 +typedef struct tmem_page_descriptor pgp_t;
  27.206 +
  27.207 +static LIST_HEAD(global_ephemeral_page_list); /* all pages in ephemeral pools */
  27.208 +
  27.209 +static LIST_HEAD(global_client_list);
  27.210 +static LIST_HEAD(global_pool_list);
  27.211 +
  27.212 +static pool_t *global_shared_pools[MAX_GLOBAL_SHARED_POOLS] = { 0 };
  27.213 +static atomic_t client_weight_total = ATOMIC_INIT(0);
  27.214 +static int tmem_initialized = 0;
  27.215 +
  27.216 +/************ CONCURRENCY  ***********************************************/
  27.217 +
  27.218 +EXPORT DEFINE_SPINLOCK(tmem_spinlock);  /* used iff tmh_lock_all */
  27.219 +EXPORT DEFINE_RWLOCK(tmem_rwlock);      /* used iff !tmh_lock_all */
  27.220 +static DEFINE_SPINLOCK(eph_lists_spinlock); /* protects global AND clients */
  27.221 +
  27.222 +#define tmem_spin_lock(_l)  do {if (!tmh_lock_all) spin_lock(_l);}while(0)
  27.223 +#define tmem_spin_unlock(_l)  do {if (!tmh_lock_all) spin_unlock(_l);}while(0)
  27.224 +#define tmem_read_lock(_l)  do {if (!tmh_lock_all) read_lock(_l);}while(0)
  27.225 +#define tmem_read_unlock(_l)  do {if (!tmh_lock_all) read_unlock(_l);}while(0)
  27.226 +#define tmem_write_lock(_l)  do {if (!tmh_lock_all) write_lock(_l);}while(0)
  27.227 +#define tmem_write_unlock(_l)  do {if (!tmh_lock_all) write_unlock(_l);}while(0)
  27.228 +#define tmem_write_trylock(_l)  ((tmh_lock_all)?1:write_trylock(_l))
  27.229 +#define tmem_spin_trylock(_l)  (tmh_lock_all?1:spin_trylock(_l))
  27.230 +
  27.231 +#define ASSERT_SPINLOCK(_l) ASSERT(tmh_lock_all || spin_is_locked(_l))
  27.232 +#define ASSERT_WRITELOCK(_l) ASSERT(tmh_lock_all || rw_is_write_locked(_l))
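
These wrappers give the code two interchangeable concurrency modes: when
tmh_lock_all is set, callers rely on the single global tmem_spinlock and every
fine-grained lock operation becomes a no-op (trylocks report success), while
the assertions remain valid in either mode. A hypothetical call site, mirroring
the pattern used by obj_find() and pgp_delist() later in this file:

    /* sketch only, not part of the patch */
    static void example_eph_list_walker(void)
    {
        tmem_spin_lock(&eph_lists_spinlock);   /* no-op if tmh_lock_all */
        /* ... walk or modify the global/per-client ephemeral lists ... */
        ASSERT_SPINLOCK(&eph_lists_spinlock);  /* holds in either mode */
        tmem_spin_unlock(&eph_lists_spinlock);
    }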
  27.233 +
  27.234 +/* global counters (should use long_atomic_t access) */
  27.235 +static long global_eph_count = 0; /* atomicity depends on eph_lists_spinlock */
  27.236 +static atomic_t global_obj_count = ATOMIC_INIT(0);
  27.237 +static atomic_t global_pgp_count = ATOMIC_INIT(0);
  27.238 +static atomic_t global_page_count = ATOMIC_INIT(0);
  27.239 +static atomic_t global_rtree_node_count = ATOMIC_INIT(0);
  27.240 +
  27.241 +#define atomic_inc_and_max(_c) do { \
  27.242 +    atomic_inc(&_c); \
  27.243 +    if ( _atomic_read(_c) > _c##_max ) \
  27.244 +        _c##_max = _atomic_read(_c); \
  27.245 +} while (0)
  27.246 +
  27.247 +#define atomic_dec_and_assert(_c) do { \
  27.248 +    atomic_dec(&_c); \
  27.249 +    ASSERT(_atomic_read(_c) >= 0); \
  27.250 +} while (0)
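
Because of the _c##_max token pasting, every counter passed to
atomic_inc_and_max() must have a companion <name>_max variable in scope
(declared with the statistics above). For example,
atomic_inc_and_max(global_page_count) expands to roughly:

    do {
        atomic_inc(&global_page_count);
        if ( _atomic_read(global_page_count) > global_page_count_max )
            global_page_count_max = _atomic_read(global_page_count);
    } while (0);

Note that the read-compare-update of the _max shadow is not itself atomic; as
with the other statistics here, an occasionally lost high-water-mark update is
tolerated.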
  27.251 +
  27.252 +
  27.253 +/************ MEMORY ALLOCATION INTERFACE *****************************/
  27.254 +
  27.255 +#define tmem_malloc(_type,_pool) \
  27.256 +       _tmem_malloc(sizeof(_type), __alignof__(_type), _pool)
  27.257 +
  27.258 +#define tmem_malloc_bytes(_size,_pool) \
  27.259 +       _tmem_malloc(_size, 1, _pool)
  27.260 +
  27.261 +static NOINLINE void *_tmem_malloc(size_t size, size_t align, pool_t *pool)
  27.262 +{
  27.263 +    void *v;
  27.264 +
  27.265 +    if ( (pool != NULL) && is_persistent(pool) )
  27.266 +        v = tmh_alloc_subpage_thispool(pool,size,align);
  27.267 +    else
  27.268 +        v = tmh_alloc_subpage(pool, size, align);
  27.269 +    if ( v == NULL )
  27.270 +        alloc_failed++;
  27.271 +    return v;
  27.272 +}
  27.273 +
  27.274 +static NOINLINE void tmem_free(void *p, size_t size, pool_t *pool)
  27.275 +{
  27.276 +    if ( pool == NULL || !is_persistent(pool) )
  27.277 +        tmh_free_subpage(p,size);
  27.278 +    else
  27.279 +        tmh_free_subpage_thispool(pool,p,size);
  27.280 +}
  27.281 +
  27.282 +static NOINLINE pfp_t *tmem_page_alloc(pool_t *pool)
  27.283 +{
  27.284 +    pfp_t *pfp = NULL;
  27.285 +
  27.286 +    if ( pool != NULL && is_persistent(pool) )
  27.287 +        pfp = tmh_alloc_page_thispool(pool);
  27.288 +    else
  27.289 +        pfp = tmh_alloc_page(pool,0);
  27.290 +    if ( pfp == NULL )
  27.291 +        alloc_page_failed++;
  27.292 +    else
  27.293 +        atomic_inc_and_max(global_page_count);
  27.294 +    return pfp;
  27.295 +}
  27.296 +
  27.297 +static NOINLINE void tmem_page_free(pool_t *pool, pfp_t *pfp)
  27.298 +{
  27.299 +    ASSERT(pfp);
  27.300 +    if ( pool == NULL || !is_persistent(pool) )
  27.301 +        tmh_free_page(pfp);
  27.302 +    else
  27.303 +        tmh_free_page_thispool(pool,pfp);
  27.304 +    atomic_dec_and_assert(global_page_count);
  27.305 +}
  27.306 +
  27.307 +/************ PAGE DESCRIPTOR MANIPULATION ROUTINES *******************/
  27.308 +
  27.309 +/* allocate a pgp_t and associate it with an object */
  27.310 +static NOINLINE pgp_t *pgp_alloc(obj_t *obj)
  27.311 +{
  27.312 +    pgp_t *pgp;
  27.313 +    pool_t *pool;
  27.314 +
  27.315 +    ASSERT(obj != NULL);
  27.316 +    ASSERT(obj->pool != NULL);
  27.317 +    pool = obj->pool;
  27.318 +    if ( (pgp = tmem_malloc(pgp_t, pool)) == NULL )
  27.319 +        return NULL;
  27.320 +    pgp->obj = obj;
  27.321 +    INIT_LIST_HEAD(&pgp->global_eph_pages);
  27.322 +    INIT_LIST_HEAD(&pgp->client_eph_pages);
  27.323 +    pgp->pfp = NULL;
  27.324 +    pgp->size = -1;
  27.325 +    pgp->index = -1;
  27.326 +    pgp->timestamp = get_cycles();
  27.327 +    SET_SENTINEL(pgp,PGD);
  27.328 +    atomic_inc_and_max(global_pgp_count);
  27.329 +    atomic_inc_and_max(pool->pgp_count);
  27.330 +    return pgp;
  27.331 +}
  27.332 +
  27.333 +static pgp_t *pgp_lookup_in_obj(obj_t *obj, uint32_t index)
  27.334 +{
  27.335 +    ASSERT(obj != NULL);
  27.336 +    ASSERT_SPINLOCK(&obj->obj_spinlock);
  27.337 +    ASSERT_SENTINEL(obj,OBJ);
  27.338 +    ASSERT(obj->pool != NULL);
  27.339 +    ASSERT_SENTINEL(obj->pool,POOL);
  27.340 +    return radix_tree_lookup(&obj->tree_root, index);
  27.341 +}
  27.342 +
  27.343 +static NOINLINE void pgp_free_data(pgp_t *pgp, pool_t *pool)
  27.344 +{
  27.345 +    if ( pgp->pfp == NULL )
  27.346 +        return;
  27.347 +    if ( !pgp->size )
  27.348 +        tmem_page_free(pgp->obj->pool,pgp->pfp);
  27.349 +    else
  27.350 +    {
  27.351 +        tmem_free(pgp->cdata,pgp->size,pool);
  27.352 +        if ( pool != NULL )
  27.353 +        {
  27.354 +            pool->client->compressed_pages--;
  27.355 +            pool->client->compressed_sum_size -= pgp->size;
  27.356 +        }
  27.357 +    }
  27.358 +    pgp->pfp = NULL;
  27.359 +    pgp->size = -1;
  27.360 +}
  27.361 +
  27.362 +static NOINLINE void pgp_free(pgp_t *pgp, int from_delete)
  27.363 +{
  27.364 +    pool_t *pool = NULL;
  27.365 +
  27.366 +    ASSERT_SENTINEL(pgp,PGD);
  27.367 +    ASSERT(pgp->obj != NULL);
  27.368 +    ASSERT_SENTINEL(pgp->obj,OBJ);
  27.369 +    ASSERT_SENTINEL(pgp->obj->pool,POOL);
  27.370 +    ASSERT(list_empty(&pgp->global_eph_pages));
  27.371 +    ASSERT(list_empty(&pgp->client_eph_pages));
  27.372 +    if ( from_delete )
  27.373 +        ASSERT(pgp_lookup_in_obj(pgp->obj,pgp->index) == NULL);
  27.374 +    ASSERT(pgp->obj->pool != NULL);
  27.375 +    pool = pgp->obj->pool;
  27.376 +    pgp_free_data(pgp, pool);
  27.377 +    INVERT_SENTINEL(pgp,PGD);
  27.378 +    pgp->obj = NULL;
  27.379 +    pgp->index = -1;
  27.380 +    pgp->size = -1;
  27.381 +    atomic_dec_and_assert(global_pgp_count);
  27.382 +    atomic_dec_and_assert(pool->pgp_count);
  27.383 +    tmem_free(pgp,sizeof(pgp_t),pool);
  27.384 +}
  27.385 +
  27.386 +/* remove the page from appropriate lists but not from parent object */
  27.387 +static void pgp_delist(pgp_t *pgp, bool_t no_eph_lock)
  27.388 +{
  27.389 +    ASSERT(pgp != NULL);
  27.390 +    ASSERT(pgp->obj != NULL);
  27.391 +    ASSERT(pgp->obj->pool != NULL);
  27.392 +    ASSERT(pgp->obj->pool->client != NULL);
  27.393 +    if ( is_ephemeral(pgp->obj->pool) )
  27.394 +    {
  27.395 +        if ( !no_eph_lock )
  27.396 +            tmem_spin_lock(&eph_lists_spinlock);
  27.397 +        if ( !list_empty(&pgp->client_eph_pages) )
  27.398 +            pgp->obj->pool->client->eph_count--;
  27.399 +        ASSERT(pgp->obj->pool->client->eph_count >= 0);
  27.400 +        list_del_init(&pgp->client_eph_pages);
  27.401 +        if ( !list_empty(&pgp->global_eph_pages) )
  27.402 +            global_eph_count--;
  27.403 +        ASSERT(global_eph_count >= 0);
  27.404 +        list_del_init(&pgp->global_eph_pages);
  27.405 +        if ( !no_eph_lock )
  27.406 +            tmem_spin_unlock(&eph_lists_spinlock);
  27.407 +    }
  27.408 +}
  27.409 +
  27.410 +/* remove page from lists (but not from parent object) and free it */
  27.411 +static NOINLINE void pgp_delete(pgp_t *pgp, bool_t no_eph_lock)
  27.412 +{
  27.413 +    uint64_t life;
  27.414 +
  27.415 +    ASSERT(pgp != NULL);
  27.416 +    ASSERT(pgp->obj != NULL);
  27.417 +    ASSERT(pgp->obj->pool != NULL);
  27.418 +    life = get_cycles() - pgp->timestamp;
  27.419 +    pgp->obj->pool->sum_life_cycles += life;
  27.420 +    pgp_delist(pgp, no_eph_lock);
  27.421 +    pgp_free(pgp,1);
  27.422 +}
  27.423 +
  27.424 +/* called only indirectly by radix_tree_destroy */
  27.425 +static NOINLINE void pgp_destroy(void *v)
  27.426 +{
  27.427 +    pgp_t *pgp = (pgp_t *)v;
  27.428 +
  27.429 +    ASSERT_SPINLOCK(&pgp->obj->obj_spinlock);
  27.430 +    pgp_delist(pgp,0);
  27.431 +    ASSERT(pgp->obj != NULL);
  27.432 +    pgp->obj->pgp_count--;
  27.433 +    ASSERT(pgp->obj->pgp_count >= 0);
  27.434 +    pgp_free(pgp,0);
  27.435 +}
  27.436 +
  27.437 +FORWARD static rtn_t *rtn_alloc(void *arg);
  27.438 +FORWARD static void rtn_free(rtn_t *rtn);
  27.439 +
  27.440 +static int pgp_add_to_obj(obj_t *obj, uint32_t index, pgp_t *pgp)
  27.441 +{
  27.442 +    int ret;
  27.443 +
  27.444 +    ASSERT_SPINLOCK(&obj->obj_spinlock);
  27.445 +    ret = radix_tree_insert(&obj->tree_root, index, pgp, rtn_alloc, obj);
  27.446 +    if ( !ret )
  27.447 +        obj->pgp_count++;
  27.448 +    return ret;
  27.449 +}
  27.450 +
  27.451 +static NOINLINE pgp_t *pgp_delete_from_obj(obj_t *obj, uint32_t index)
  27.452 +{
  27.453 +    pgp_t *pgp;
  27.454 +
  27.455 +    ASSERT(obj != NULL);
  27.456 +    ASSERT_SPINLOCK(&obj->obj_spinlock);
  27.457 +    ASSERT_SENTINEL(obj,OBJ);
  27.458 +    ASSERT(obj->pool != NULL);
  27.459 +    ASSERT_SENTINEL(obj->pool,POOL);
  27.460 +    pgp = radix_tree_delete(&obj->tree_root, index, rtn_free);
  27.461 +    if ( pgp != NULL )
  27.462 +        obj->pgp_count--;
  27.463 +    ASSERT(obj->pgp_count >= 0);
  27.464 +
  27.465 +    return pgp;
  27.466 +}
  27.467 +
  27.468 +/************ RADIX TREE NODE MANIPULATION ROUTINES *******************/
  27.469 +
  27.470 +/* called only indirectly from radix_tree_insert */
  27.471 +static NOINLINE rtn_t *rtn_alloc(void *arg)
  27.472 +{
  27.473 +    objnode_t *objnode;
  27.474 +    obj_t *obj = (obj_t *)arg;
  27.475 +
  27.476 +    ASSERT_SENTINEL(obj,OBJ);
  27.477 +    ASSERT(obj->pool != NULL);
  27.478 +    ASSERT_SENTINEL(obj->pool,POOL);
  27.479 +    objnode = tmem_malloc(objnode_t,obj->pool);
  27.480 +    if (objnode == NULL)
  27.481 +        return NULL;
  27.482 +    objnode->obj = obj;
  27.483 +    SET_SENTINEL(objnode,OBJNODE);
  27.484 +    memset(&objnode->rtn, 0, sizeof(rtn_t));
  27.485 +    if (++obj->pool->objnode_count > obj->pool->objnode_count_max)
  27.486 +        obj->pool->objnode_count_max = obj->pool->objnode_count;
  27.487 +    atomic_inc_and_max(global_rtree_node_count);
  27.488 +    obj->objnode_count++;
  27.489 +    return &objnode->rtn;
  27.490 +}
  27.491 +
  27.492 +/* called only indirectly from radix_tree_delete/destroy */
  27.493 +static void rtn_free(rtn_t *rtn)
  27.494 +{
  27.495 +    pool_t *pool;
  27.496 +    objnode_t *objnode;
  27.497 +    int i;
  27.498 +
  27.499 +    ASSERT(rtn != NULL);
  27.500 +    for (i = 0; i < RADIX_TREE_MAP_SIZE; i++)
  27.501 +        ASSERT(rtn->slots[i] == NULL);
  27.502 +    objnode = container_of(rtn,objnode_t,rtn);
  27.503 +    ASSERT_SENTINEL(objnode,OBJNODE);
  27.504 +    INVERT_SENTINEL(objnode,OBJNODE);
  27.505 +    ASSERT(objnode->obj != NULL);
  27.506 +    ASSERT_SPINLOCK(&objnode->obj->obj_spinlock);
  27.507 +    ASSERT_SENTINEL(objnode->obj,OBJ);
  27.508 +    pool = objnode->obj->pool;
  27.509 +    ASSERT(pool != NULL);
  27.510 +    ASSERT_SENTINEL(pool,POOL);
  27.511 +    pool->objnode_count--;
  27.512 +    objnode->obj->objnode_count--;
  27.513 +    objnode->obj = NULL;
  27.514 +    tmem_free(objnode,sizeof(objnode_t),pool);
  27.515 +    atomic_dec_and_assert(global_rtree_node_count);
  27.516 +}
  27.517 +
  27.518 +/************ POOL OBJECT COLLECTION MANIPULATION ROUTINES *******************/
  27.519 +
  27.520 +/* searches for object==oid in pool, returns locked object if found */
  27.521 +static NOINLINE obj_t * obj_find(pool_t *pool, uint64_t oid)
  27.522 +{
  27.523 +    struct rb_node *node;
  27.524 +    obj_t *obj;
  27.525 +
  27.526 +restart_find:
  27.527 +    tmem_read_lock(&pool->pool_rwlock);
  27.528 +    node = pool->obj_rb_root[OBJ_HASH(oid)].rb_node;
  27.529 +    while ( node )
  27.530 +    {
  27.531 +        obj = container_of(node, obj_t, rb_tree_node);
  27.532 +        if ( obj->oid == oid )
  27.533 +        {
  27.534 +            if ( tmh_lock_all )
  27.535 +                obj->no_evict = 1;
  27.536 +            else
  27.537 +            {
  27.538 +                if ( !tmem_spin_trylock(&obj->obj_spinlock) )
  27.539 +                {
  27.540 +                    tmem_read_unlock(&pool->pool_rwlock);
  27.541 +                    goto restart_find;
  27.542 +                }
  27.543 +                tmem_read_unlock(&pool->pool_rwlock);
  27.544 +            }
  27.545 +            return obj;
  27.546 +        }
  27.547 +        else if ( oid < obj->oid )
  27.548 +            node = node->rb_left;
  27.549 +        else
  27.550 +            node = node->rb_right;
  27.551 +    }
  27.552 +    tmem_read_unlock(&pool->pool_rwlock);
  27.553 +    return NULL;
  27.554 +}
  27.555 +
  27.556 +/* free an object that has no more pgps in it */
  27.557 +static NOINLINE void obj_free(obj_t *obj, int no_rebalance)
  27.558 +{
  27.559 +    pool_t *pool;
  27.560 +    uint64_t old_oid;
  27.561 +
  27.562 +    ASSERT_SPINLOCK(&obj->obj_spinlock);
  27.563 +    ASSERT(obj != NULL);
  27.564 +    ASSERT_SENTINEL(obj,OBJ);
  27.565 +    ASSERT(obj->pgp_count == 0);
  27.566 +    pool = obj->pool;
  27.567 +    ASSERT(pool != NULL);
  27.568 +    ASSERT_WRITELOCK(&pool->pool_rwlock);
  27.569 +    if ( obj->tree_root.rnode != NULL ) /* may be a "stump" with no leaves */
  27.570 +        radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free);
  27.571 +    ASSERT((long)obj->objnode_count == 0);
  27.572 +    ASSERT(obj->tree_root.rnode == NULL);
  27.573 +    pool->obj_count--;
  27.574 +    ASSERT(pool->obj_count >= 0);
  27.575 +    INVERT_SENTINEL(obj,OBJ);
  27.576 +    obj->pool = NULL;
  27.577 +    old_oid = obj->oid;
  27.578 +    obj->oid = -1;
  27.579 +    obj->last_client = CLI_ID_NULL;
  27.580 +    atomic_dec_and_assert(global_obj_count);
  27.581 +    /* use no_rebalance only if all objects are being destroyed anyway */
  27.582 +    if ( !no_rebalance )
  27.583 +        rb_erase(&obj->rb_tree_node,&pool->obj_rb_root[OBJ_HASH(old_oid)]);
  27.584 +    tmem_free(obj,sizeof(obj_t),pool);
  27.585 +}
  27.586 +
  27.587 +static NOINLINE void obj_rb_destroy_node(struct rb_node *node)
  27.588 +{
  27.589 +    obj_t * obj;
  27.590 +
  27.591 +    if ( node == NULL )
  27.592 +        return;
  27.593 +    obj_rb_destroy_node(node->rb_left);
  27.594 +    obj_rb_destroy_node(node->rb_right);
  27.595 +    obj = container_of(node, obj_t, rb_tree_node);
  27.596 +    tmem_spin_lock(&obj->obj_spinlock);
  27.597 +    ASSERT(obj->no_evict == 0);
  27.598 +    radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free);
  27.599 +    obj_free(obj,1);
  27.600 +}
  27.601 +
  27.602 +static NOINLINE int obj_rb_insert(struct rb_root *root, obj_t *obj)
  27.603 +{
  27.604 +    struct rb_node **new, *parent = NULL;
  27.605 +    obj_t *this;
  27.606 +
  27.607 +    new = &(root->rb_node);
  27.608 +    while ( *new )
  27.609 +    {
  27.610 +        this = container_of(*new, obj_t, rb_tree_node);
  27.611 +        parent = *new;
  27.612 +        if ( obj->oid < this->oid )
  27.613 +            new = &((*new)->rb_left);
  27.614 +        else if ( obj->oid > this->oid )
  27.615 +            new = &((*new)->rb_right);
  27.616 +        else
  27.617 +            return 0;
  27.618 +    }
  27.619 +    rb_link_node(&obj->rb_tree_node, parent, new);
  27.620 +    rb_insert_color(&obj->rb_tree_node, root);
  27.621 +    return 1;
  27.622 +}
  27.623 +
  27.624 +/*
  27.625 + * allocate, initialize, and insert a tmem_object_root
  27.626 + * (should be called only if find failed)
  27.627 + */
  27.628 +static NOINLINE obj_t * obj_new(pool_t *pool, uint64_t oid)
  27.629 +{
  27.630 +    obj_t *obj;
  27.631 +
  27.632 +    ASSERT(pool != NULL);
  27.633 +    ASSERT_WRITELOCK(&pool->pool_rwlock);
  27.634 +    if ( (obj = tmem_malloc(obj_t,pool)) == NULL )
  27.635 +        return NULL;
  27.636 +    pool->obj_count++;
  27.637 +    if (pool->obj_count > pool->obj_count_max)
  27.638 +        pool->obj_count_max = pool->obj_count;
  27.639 +    atomic_inc_and_max(global_obj_count);
  27.640 +    INIT_RADIX_TREE(&obj->tree_root,0);
  27.641 +    spin_lock_init(&obj->obj_spinlock);
  27.642 +    obj->pool = pool;
  27.643 +    obj->oid = oid;
  27.644 +    obj->objnode_count = 0;
  27.645 +    obj->pgp_count = 0;
  27.646 +    obj->last_client = CLI_ID_NULL;
  27.647 +    SET_SENTINEL(obj,OBJ);
  27.648 +    tmem_spin_lock(&obj->obj_spinlock);
  27.649 +    obj_rb_insert(&pool->obj_rb_root[OBJ_HASH(oid)], obj);
  27.650 +    obj->no_evict = 1;
  27.651 +    ASSERT_SPINLOCK(&obj->obj_spinlock);
  27.652 +    return obj;
  27.653 +}
  27.654 +
  27.655 +/* free an object after destroying any pgps in it */
  27.656 +static NOINLINE void obj_destroy(obj_t *obj)
  27.657 +{
  27.658 +    ASSERT_WRITELOCK(&obj->pool->pool_rwlock);
  27.659 +    radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free);
  27.660 +    obj_free(obj,0);
  27.661 +}
  27.662 +
  27.663 +/* destroy all objects in a pool */
  27.664 +static NOINLINE void obj_rb_destroy_all(pool_t *pool)
  27.665 +{
  27.666 +    int i;
  27.667 +
  27.668 +    tmem_write_lock(&pool->pool_rwlock);
  27.669 +    for (i = 0; i < OBJ_HASH_BUCKETS; i++)
  27.670 +        obj_rb_destroy_node(pool->obj_rb_root[i].rb_node);
  27.671 +    tmem_write_unlock(&pool->pool_rwlock);
  27.672 +}
  27.673 +
  27.674 +/* destroys all objects in a pool that have last_client set to cli_id */
  27.675 +static void obj_free_selective(pool_t *pool, cli_id_t cli_id)
  27.676 +{
  27.677 +    struct rb_node *node;
  27.678 +    obj_t *obj;
  27.679 +    int i;
  27.680 +
  27.681 +    tmem_write_lock(&pool->pool_rwlock);
  27.682 +    for (i = 0; i < OBJ_HASH_BUCKETS; i++)
  27.683 +    {
  27.684 +        node = rb_first(&pool->obj_rb_root[i]);
  27.685 +        while ( node != NULL )
  27.686 +        {
  27.687 +            obj = container_of(node, obj_t, rb_tree_node);
  27.688 +            tmem_spin_lock(&obj->obj_spinlock);
  27.689 +            node = rb_next(node);
  27.690 +            if ( obj->last_client == cli_id )
  27.691 +                obj_destroy(obj);
  27.692 +            else
  27.693 +                tmem_spin_unlock(&obj->obj_spinlock);
  27.694 +        }
  27.695 +    }
  27.696 +    tmem_write_unlock(&pool->pool_rwlock);
  27.697 +}
  27.698 +
  27.699 +
  27.700 +/************ POOL MANIPULATION ROUTINES ******************************/
  27.701 +
  27.702 +static pool_t * pool_alloc(void)
  27.703 +{
  27.704 +    pool_t *pool;
  27.705 +    int i;
  27.706 +
  27.707 +    if ( (pool = tmem_malloc(pool_t,NULL)) == NULL )
  27.708 +        return NULL;
  27.709 +    for (i = 0; i < OBJ_HASH_BUCKETS; i++)
  27.710 +        pool->obj_rb_root[i] = RB_ROOT;
  27.711 +    INIT_LIST_HEAD(&pool->pool_list);
  27.712 +    rwlock_init(&pool->pool_rwlock);
  27.713 +    pool->pgp_count_max = pool->obj_count_max = 0;
  27.714 +    pool->objnode_count = pool->objnode_count_max = 0;
  27.715 +    atomic_set(&pool->pgp_count,0);
  27.716 +    pool->obj_count = 0;
  27.717 +    pool->good_puts = pool->puts = pool->dup_puts_flushed = 0;
  27.718 +    pool->dup_puts_replaced = pool->no_mem_puts = 0;
  27.719 +    pool->found_gets = pool->gets = 0;
  27.720 +    pool->flushs_found = pool->flushs = 0;
  27.721 +    pool->flush_objs_found = pool->flush_objs = 0;
  27.722 +    SET_SENTINEL(pool,POOL);
  27.723 +    return pool;
  27.724 +}
  27.725 +
  27.726 +static NOINLINE void pool_free(pool_t *pool)
  27.727 +{
  27.728 +    ASSERT_SENTINEL(pool,POOL);
  27.729 +    INVERT_SENTINEL(pool,POOL);
  27.730 +    pool->client = NULL;
  27.731 +    list_del(&pool->pool_list);
  27.732 +    tmem_free(pool,sizeof(pool_t),NULL);
  27.733 +}
  27.734 +
  27.735 +/* register new_client as a user of this shared pool and return new
  27.736 +   total number of registered users */
  27.737 +static int shared_pool_join(pool_t *pool, client_t *new_client)
  27.738 +{
  27.739 +    sharelist_t *sl;
  27.740 +
  27.741 +    ASSERT(is_shared(pool));
  27.742 +    if ( (sl = tmem_malloc(sharelist_t,NULL)) == NULL )
  27.743 +        return -1;
  27.744 +    sl->client = new_client;
  27.745 +    list_add_tail(&sl->share_list, &pool->share_list);
  27.746 +    printk("adding new %s %d to shared pool owned by %s %d\n",
  27.747 +        client_str, new_client->cli_id, client_str, pool->client->cli_id);
  27.748 +    return ++pool->shared_count;
  27.749 +}
  27.750 +
  27.751 +/* reassign "ownership" of the pool to another client that shares this pool */
  27.752 +static NOINLINE void shared_pool_reassign(pool_t *pool)
  27.753 +{
  27.754 +    sharelist_t *sl;
  27.755 +    int poolid;
  27.756 +    client_t *old_client = pool->client, *new_client;
  27.757 +
  27.758 +    ASSERT(is_shared(pool));
  27.759 +    if ( list_empty(&pool->share_list) )
  27.760 +    {
  27.761 +        ASSERT(pool->shared_count == 0);
  27.762 +        return;
  27.763 +    }
  27.764 +    old_client->pools[pool->pool_id] = NULL;
  27.765 +    sl = list_entry(pool->share_list.next, sharelist_t, share_list);
  27.766 +    ASSERT(sl->client != old_client);
  27.767 +    pool->client = new_client = sl->client;
  27.768 +    for (poolid = 0; poolid < MAX_POOLS_PER_DOMAIN; poolid++)
  27.769 +        if (new_client->pools[poolid] == pool)
  27.770 +            break;
  27.771 +    ASSERT(poolid != MAX_POOLS_PER_DOMAIN);
  27.772 +    printk("reassigned shared pool from %s=%d to %s=%d pool_id=%d\n",
  27.773 +        cli_id_str, old_client->cli_id, cli_id_str, new_client->cli_id, poolid);
  27.774 +    pool->pool_id = poolid;
  27.775 +}
  27.776 +
  27.777 +/* destroy all objects with last_client same as passed cli_id,
  27.778 +   remove pool's cli_id from list of sharers of this pool */
  27.779 +static NOINLINE int shared_pool_quit(pool_t *pool, cli_id_t cli_id)
  27.780 +{
  27.781 +    sharelist_t *sl;
  27.782 +    int s_poolid;
  27.783 +
  27.784 +    ASSERT(is_shared(pool));
  27.785 +    ASSERT(pool->client != NULL);
  27.786 +    
  27.787 +    obj_free_selective(pool,cli_id);
  27.788 +    list_for_each_entry(sl,&pool->share_list, share_list)
  27.789 +    {
  27.790 +        if (sl->client->cli_id != cli_id)
  27.791 +            continue;
  27.792 +        list_del(&sl->share_list);
  27.793 +        tmem_free(sl,sizeof(sharelist_t),pool);
  27.794 +        --pool->shared_count;
  27.795 +        if (pool->client->cli_id == cli_id)
  27.796 +            shared_pool_reassign(pool);
  27.797 +        if (pool->shared_count)
  27.798 +            return pool->shared_count;
  27.799 +        for (s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++)
  27.800 +            if ( (global_shared_pools[s_poolid]) == pool )
  27.801 +            {
  27.802 +                global_shared_pools[s_poolid] = NULL;
  27.803 +                break;
  27.804 +            }
  27.805 +        return 0;
  27.806 +    }
  27.807 +    printk("tmem: no match unsharing pool, %s=%d\n",
  27.808 +        cli_id_str,pool->client->cli_id);
  27.809 +    return -1;
  27.810 +}
  27.811 +
  27.812 +/* flush all data (owned by cli_id) from a pool and, optionally, free it */
  27.813 +static void pool_flush(pool_t *pool, cli_id_t cli_id, bool_t destroy)
  27.814 +{
  27.815 +    ASSERT(pool != NULL);
  27.816 +    if ( (is_shared(pool)) && (shared_pool_quit(pool,cli_id) > 0) )
  27.817 +    {
  27.818 +        printk("tmem: unshared shared pool %d from %s=%d\n",
  27.819 +           pool->pool_id, cli_id_str,pool->client->cli_id);
  27.820 +        return;
  27.821 +    }
  27.822 +    printk("%s %s-%s tmem pool ",destroy?"destroying":"flushing",
  27.823 +        is_persistent(pool) ? "persistent" : "ephemeral" ,
  27.824 +        is_shared(pool) ? "shared" : "private");
  27.825 +    printk("%s=%d pool_id=%d\n", cli_id_str,pool->client->cli_id,pool->pool_id);
  27.826 +    obj_rb_destroy_all(pool);
  27.827 +    if ( destroy )
  27.828 +    {
  27.829 +        pool->client->pools[pool->pool_id] = NULL;
  27.830 +        pool_free(pool);
  27.831 +    }
  27.832 +}
  27.833 +
  27.834 +/************ CLIENT MANIPULATION OPERATIONS **************************/
  27.835 +
  27.836 +static client_t *client_create(void)
  27.837 +{
  27.838 +    client_t *client = tmem_malloc(client_t,NULL);
  27.839 +    cli_id_t cli_id = tmh_get_cli_id_from_current();
  27.840 +
  27.841 +    printk("tmem: initializing tmem capability for %s=%d...",cli_id_str,cli_id);
  27.842 +    if ( client == NULL )
  27.843 +    {
  27.844 +        printk("failed... out of memory\n");
  27.845 +        return NULL;
  27.846 +    }
  27.847 +    memset(client,0,sizeof(client_t));
  27.848 +    if ( (client->tmh = tmh_client_init()) == NULL )
  27.849 +    {
  27.850 +        printk("failed... can't allocate host-dependent part of client\n");
  27.851 +        if ( client )
  27.852 +            tmem_free(client,sizeof(client_t),NULL);
  27.853 +        return NULL;
  27.854 +    }
  27.855 +    tmh_set_current_client(client);
  27.856 +    client->cli_id = cli_id;
  27.857 +#ifdef __i386__
  27.858 +    client->compress = 0;
  27.859 +#else
  27.860 +    client->compress = tmh_compression_enabled();
  27.861 +#endif
  27.862 +    list_add_tail(&client->client_list, &global_client_list);
  27.863 +    INIT_LIST_HEAD(&client->ephemeral_page_list);
  27.864 +    client->eph_count = client->eph_count_max = 0;
  27.865 +    printk("ok\n");
  27.866 +    return client;
  27.867 +}
  27.868 +
  27.869 +static void client_free(client_t *client)
  27.870 +{
  27.871 +    list_del(&client->client_list);
  27.872 +    tmh_client_destroy(client->tmh);
  27.873 +    tmh_set_current_client(NULL);
  27.874 +    tmem_free(client,sizeof(client_t),NULL);
  27.875 +}
  27.876 +
  27.877 +/* flush all data from a client and, optionally, free it */
  27.878 +static void client_flush(client_t *client, bool_t destroy)
  27.879 +{
  27.880 +    int i;
  27.881 +    pool_t *pool;
  27.882 +
  27.883 +    for  (i = 0; i < MAX_POOLS_PER_DOMAIN; i++)
  27.884 +    {
  27.885 +        if ( (pool = client->pools[i]) == NULL )
  27.886 +            continue;
  27.887 +        pool_flush(pool,client->cli_id,destroy);
  27.888 +        if ( destroy )
  27.889 +            client->pools[i] = NULL;
  27.890 +    }
  27.891 +    if ( destroy )
  27.892 +        client_free(client);
  27.893 +}
  27.894 +
  27.895 +static bool_t client_over_quota(client_t *client)
  27.896 +{
  27.897 +    int total = _atomic_read(client_weight_total);
  27.898 +
  27.899 +    ASSERT(client != NULL);
  27.900 +    if ( (total == 0) || (client->weight == 0) || 
  27.901 +          (client->eph_count == 0) )
  27.902 +        return 0;
  27.903 +    return ( ((global_eph_count*100L) / client->eph_count ) >
  27.904 +             ((total*100L) / client->weight) );
  27.905 +}
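
A worked instance of the comparison (arithmetic only, with made-up numbers; no
claim about the intended policy): with client_weight_total = 200 and a client
of weight 100 holding 400 of 1000 global ephemeral pages, the test is
(1000*100L)/400 = 250 > (200*100L)/100 = 200, so the function returns 1:

    #include <assert.h>

    int main(void)
    {
        long total = 200, weight = 100;     /* client weighted at 1/2 */
        long global_eph = 1000, eph = 400;  /* client holds 2/5 of pages */

        /* the same comparison client_over_quota() performs */
        assert(((global_eph * 100L) / eph) > ((total * 100L) / weight));
        return 0;
    }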
  27.906 +
  27.907 +/************ MEMORY REVOCATION ROUTINES *******************************/
  27.908 +
  27.909 +static int tmem_evict(void)
  27.910 +{
  27.911 +    client_t *client = tmh_client_from_current();
  27.912 +    pgp_t *pgp = NULL, *pgp_del;
  27.913 +    obj_t *obj;
  27.914 +    pool_t *pool;
  27.915 +    int ret = 0;
  27.916 +    bool_t hold_pool_rwlock = 0;
  27.917 +
  27.918 +    evict_attempts++;
  27.919 +    tmem_spin_lock(&eph_lists_spinlock);
  27.920 +    if ( (client != NULL) && client_over_quota(client) &&
  27.921 +         !list_empty(&client->ephemeral_page_list) )
  27.922 +    {
  27.923 +        list_for_each_entry(pgp,&client->ephemeral_page_list,client_eph_pages)
  27.924 +        {
  27.925 +            obj = pgp->obj;
  27.926 +            pool = obj->pool;
  27.927 +            if ( tmh_lock_all && !obj->no_evict )
  27.928 +                goto found;
  27.929 +            if ( tmem_spin_trylock(&obj->obj_spinlock) )
  27.930 +            {
  27.931 +                if ( obj->pgp_count > 1 )
  27.932 +                    goto found;
  27.933 +                if ( tmem_write_trylock(&pool->pool_rwlock) )
  27.934 +                {
  27.935 +                    hold_pool_rwlock = 1;
  27.936 +                    goto found;
  27.937 +                }
  27.938 +                tmem_spin_unlock(&obj->obj_spinlock);
  27.939 +            }
  27.940 +        }
  27.941 +    } else if ( list_empty(&global_ephemeral_page_list) ) {
  27.942 +        goto out;
  27.943 +    } else {
  27.944 +        list_for_each_entry(pgp,&global_ephemeral_page_list,global_eph_pages)
  27.945 +        {
  27.946 +            obj = pgp->obj;
  27.947 +            pool = obj->pool;
  27.948 +            if ( tmh_lock_all && !obj->no_evict )
  27.949 +                goto found;
  27.950 +            if ( tmem_spin_trylock(&obj->obj_spinlock) )
  27.951 +            {
  27.952 +                if ( obj->pgp_count > 1 )
  27.953 +                    goto found;
  27.954 +                if ( tmem_write_trylock(&pool->pool_rwlock) )
  27.955 +                {
  27.956 +                    hold_pool_rwlock = 1;
  27.957 +                    goto found;
  27.958 +                }
  27.959 +                tmem_spin_unlock(&obj->obj_spinlock);
  27.960 +            }
  27.961 +        }
  27.962 +    }
  27.963 +
  27.964 +    ret = 0;
  27.965 +    goto out;
  27.966 +
  27.967 +found:
  27.968 +    ASSERT(pgp != NULL);
  27.969 +    ASSERT_SENTINEL(pgp,PGD);
  27.970 +    obj = pgp->obj;
  27.971 +    ASSERT(obj != NULL);
  27.972 +    ASSERT(obj->no_evict == 0);
  27.973 +    ASSERT(obj->pool != NULL);
  27.974 +    ASSERT_SENTINEL(obj,OBJ);
  27.975 +
  27.976 +    ASSERT_SPINLOCK(&obj->obj_spinlock);
  27.977 +    pgp_del = pgp_delete_from_obj(obj, pgp->index);
  27.978 +    ASSERT(pgp_del == pgp);
  27.979 +    pgp_delete(pgp,1);
  27.980 +    if ( obj->pgp_count == 0 )
  27.981 +    {
  27.982 +        ASSERT_WRITELOCK(&pool->pool_rwlock);
  27.983 +        obj_free(obj,0);
  27.984 +    }
  27.985 +    else
  27.986 +        tmem_spin_unlock(&obj->obj_spinlock);
  27.987 +    if ( hold_pool_rwlock )
  27.988 +        tmem_write_unlock(&pool->pool_rwlock);
  27.989 +    evicted_pgs++;
  27.990 +    ret = 1;
  27.991 +
  27.992 +out:
  27.993 +    tmem_spin_unlock(&eph_lists_spinlock);
  27.994 +    return ret;
  27.995 +}
  27.996 +
  27.997 +static unsigned long tmem_relinquish_npages(unsigned long n)
  27.998 +{
  27.999 +    unsigned long avail_pages = 0;
 27.1000 +
 27.1001 +    while ( (avail_pages = tmh_avail_pages()) < n )
 27.1002 +    {
 27.1003 +        if (  !tmem_evict() )
 27.1004 +            break;
 27.1005 +    }
 27.1006 +    if ( avail_pages )
 27.1007 +        tmh_release_avail_pages_to_host();
 27.1008 +    return avail_pages;
 27.1009 +}
 27.1010 +
 27.1011 +/************ TMEM CORE OPERATIONS ************************************/
 27.1012 +
 27.1013 +static NOINLINE int do_tmem_put_compress(pgp_t *pgp, tmem_cli_mfn_t cmfn)
 27.1014 +{
 27.1015 +    void *dst, *p;
 27.1016 +    size_t size;
 27.1017 +    int ret = 0;
 27.1018 +    DECL_LOCAL_CYC_COUNTER(compress);
 27.1019 +    
 27.1020 +    ASSERT(pgp != NULL);
 27.1021 +    ASSERT(pgp->obj != NULL);
 27.1022 +    ASSERT_SPINLOCK(&pgp->obj->obj_spinlock);
 27.1023 +    ASSERT(pgp->obj->pool != NULL);
 27.1024 +    ASSERT(pgp->obj->pool->client != NULL);
 27.1025 +#ifdef __i386__
 27.1026 +    return -ENOMEM;
 27.1027 +#endif
 27.1028 +    if ( pgp->pfp != NULL )
 27.1029 +        pgp_free_data(pgp, pgp->obj->pool);  /* FIXME... is this right? */
 27.1030 +    START_CYC_COUNTER(compress);
 27.1031 +    ret = tmh_compress_from_client(cmfn, &dst, &size);
 27.1032 +    if ( (ret == -EFAULT) || (ret == 0) )
 27.1033 +        goto out;
 27.1034 +    else if ( (size == 0) || (size >= tmem_subpage_maxsize()) )
 27.1035 +        ret = 0;
 27.1036 +    else if ( (p = tmem_malloc_bytes(size,pgp->obj->pool)) == NULL )
 27.1037 +        ret = -ENOMEM;
 27.1038 +    else
 27.1039 +    {
 27.1040 +        memcpy(p,dst,size);
 27.1041 +        pgp->cdata = p;
 27.1042 +        pgp->size = size;
 27.1043 +        pgp->obj->pool->client->compressed_pages++;
 27.1044 +        pgp->obj->pool->client->compressed_sum_size += size;
 27.1045 +        ret = 1;
 27.1046 +    }
 27.1047 +
 27.1048 +out:
 27.1049 +    END_CYC_COUNTER(compress);
 27.1050 +    return ret;
 27.1051 +}
 27.1052 +
 27.1053 +static NOINLINE int do_tmem_dup_put(pgp_t *pgp, tmem_cli_mfn_t cmfn,
 27.1054 +              uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len)
 27.1055 +{
 27.1056 +    pool_t *pool;
 27.1057 +    obj_t *obj;
 27.1058 +    client_t *client;
 27.1059 +    pgp_t *pgpfound = NULL;
 27.1060 +    int ret;
 27.1061 +
 27.1062 +    /* if we can successfully manipulate pgp to change out the data, do so */
 27.1063 +    ASSERT(pgp != NULL);
 27.1064 +    ASSERT(pgp->pfp != NULL);
 27.1065 +    ASSERT(pgp->size != -1);
 27.1066 +    obj = pgp->obj;
 27.1067 +    ASSERT_SPINLOCK(&obj->obj_spinlock);
 27.1068 +    ASSERT(obj != NULL);
 27.1069 +    pool = obj->pool;
 27.1070 +    ASSERT(pool != NULL);
 27.1071 +    client = pool->client;
 27.1072 +    if ( len != 0 && tmh_compression_enabled() &&
 27.1073 +         client->compress && pgp->size != 0 )
 27.1074 +    {
 27.1075 +        ret = do_tmem_put_compress(pgp,cmfn);
 27.1076 +        if ( ret == 1 )
 27.1077 +            goto done;
 27.1078 +        else if ( ret == 0 )
 27.1079 +            goto copy_uncompressed;
 27.1080 +        else if ( ret == -ENOMEM )
 27.1081 +            goto failed_dup;
 27.1082 +        else if ( ret == -EFAULT )
 27.1083 +            goto bad_copy;
 27.1084 +    }
 27.1085 +
 27.1086 +copy_uncompressed:
 27.1087 +    if ( pgp->pfp )
 27.1088 +        pgp_free_data(pgp, pool);
 27.1089 +    if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
 27.1090 +        goto failed_dup;
 27.1091 +    /* tmh_copy_from_client properly handles len==0 and offsets != 0 */
 27.1092 +    ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len);
 27.1093 +    if ( ret == -EFAULT )
 27.1094 +        goto bad_copy;
 27.1095 +    pgp->size = 0;
 27.1096 +
 27.1097 +done:
 27.1098 +    /* successfully replaced data, clean up and return success */
 27.1099 +    if ( is_shared(pool) )
 27.1100 +        obj->last_client = client->cli_id;
 27.1101 +    obj->no_evict = 0;
 27.1102 +    tmem_spin_unlock(&obj->obj_spinlock);
 27.1103 +    pool->dup_puts_replaced++;
 27.1104 +    pool->good_puts++;
 27.1105 +    return 1;
 27.1106 +
 27.1107 +bad_copy:
 27.1108 +    /* this should only happen if the client passed a bad mfn */
 27.1109 +    failed_copies++;
 27.1110 +    ASSERT(0);
 27.1111 +    return -EFAULT;
 27.1112 +
 27.1113 +failed_dup:
 27.1114 +   /* couldn't change out the data, flush the old data and return
 27.1115 +    * -ENOSPC instead of -ENOMEM to differentiate failed _dup_ put */
 27.1116 +    pgpfound = pgp_delete_from_obj(obj, pgp->index);
 27.1117 +    ASSERT(pgpfound == pgp);
 27.1118 +    pgp_delete(pgpfound,0);
 27.1119 +    if ( obj->pgp_count == 0 )
 27.1120 +    {
 27.1121 +        tmem_write_lock(&pool->pool_rwlock);
 27.1122 +        obj_free(obj,0);
 27.1123 +        tmem_write_unlock(&pool->pool_rwlock);
 27.1124 +    } else {
 27.1125 +        obj->no_evict = 0;
 27.1126 +        tmem_spin_unlock(&obj->obj_spinlock);
 27.1127 +    }
 27.1128 +    pool->dup_puts_flushed++;
 27.1129 +    return -ENOSPC;
 27.1130 +}
 27.1131 +
 27.1132 +
 27.1133 +static NOINLINE int do_tmem_put(pool_t *pool, uint64_t oid, uint32_t index,
 27.1134 +              tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
 27.1135 +              uint32_t pfn_offset, uint32_t len)
 27.1136 +{
 27.1137 +    obj_t *obj = NULL, *objfound = NULL, *objnew = NULL;
 27.1138 +    pgp_t *pgp = NULL, *pgpdel = NULL;
 27.1139 +    client_t *client = pool->client;
 27.1140 +    int ret = client->frozen ? -EFROZEN : -ENOMEM;
 27.1141 +
 27.1142 +    ASSERT(pool != NULL);
 27.1143 +    pool->puts++;
 27.1144 +    /* does page already exist (dup)?  if so, handle specially */
 27.1145 +    if ( (obj = objfound = obj_find(pool,oid)) != NULL )
 27.1146 +    {
 27.1147 +        ASSERT_SPINLOCK(&objfound->obj_spinlock);
 27.1148 +        if ((pgp = pgp_lookup_in_obj(objfound, index)) != NULL)
 27.1149 +            return do_tmem_dup_put(pgp,cmfn,tmem_offset,pfn_offset,len);
 27.1150 +    }
 27.1151 +
 27.1152 +    /* no puts allowed into a frozen pool (except dup puts) */
 27.1153 +    if ( client->frozen )
 27.1154 +        goto free;
 27.1155 +
 27.1156 +    if ( objfound == NULL )
 27.1157 +    {
 27.1158 +        tmem_write_lock(&pool->pool_rwlock);
 27.1159 +        if ( (obj = objnew = obj_new(pool,oid)) == NULL )
 27.1160 +        {
 27.1161 +            tmem_write_unlock(&pool->pool_rwlock);
 27.1162 +            return -ENOMEM;
 27.1163 +        }
 27.1164 +        ASSERT_SPINLOCK(&objnew->obj_spinlock);
 27.1165 +        tmem_write_unlock(&pool->pool_rwlock);
 27.1166 +    }
 27.1167 +
 27.1168 +    ASSERT((obj != NULL)&&((objnew==obj)||(objfound==obj))&&(objnew!=objfound));
 27.1169 +    ASSERT_SPINLOCK(&obj->obj_spinlock);
 27.1170 +    if ( (pgp = pgp_alloc(obj)) == NULL )
 27.1171 +        goto free;
 27.1172 +
 27.1173 +    ret = pgp_add_to_obj(obj, index, pgp);
 27.1174 +    if ( ret == -ENOMEM  )
 27.1175 +        /* warning, may result in partially built radix tree ("stump") */
 27.1176 +        goto free;
 27.1177 +    ASSERT(ret != -EEXIST);
 27.1178 +    pgp->index = index;
 27.1179 +
 27.1180 +    if ( len != 0 && tmh_compression_enabled() && client->compress )
 27.1181 +    {
 27.1182 +        ASSERT(pgp->pfp == NULL);
 27.1183 +        ret = do_tmem_put_compress(pgp,cmfn);
 27.1184 +        if ( ret == 1 )
 27.1185 +            goto insert_page;
 27.1186 +        if ( ret == -ENOMEM )
 27.1187 +        {
 27.1188 +            client->compress_nomem++;
 27.1189 +            goto delete_and_free;
 27.1190 +        }
 27.1191 +        if ( ret == 0 )
 27.1192 +        {
 27.1193 +            client->compress_poor++;
 27.1194 +            goto copy_uncompressed;
 27.1195 +        }
 27.1196 +        if ( ret == -EFAULT )
 27.1197 +            goto bad_copy;
 27.1198 +    }
 27.1199 +
 27.1200 +copy_uncompressed:
 27.1201 +    if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
 27.1202 +    {
 27.1203 +        ret = -ENOMEM;
 27.1204 +        goto delete_and_free;
 27.1205 +    }
 27.1206 +    /* tmh_copy_from_client properly handles len==0 (TMEM_NEW_PAGE) */
 27.1207 +    ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len);
 27.1208 +    if ( ret == -EFAULT )
 27.1209 +        goto bad_copy;
 27.1210 +    pgp->size = 0;
 27.1211 +
 27.1212 +insert_page:
 27.1213 +    if ( is_ephemeral(pool) )
 27.1214 +    {
 27.1215 +        tmem_spin_lock(&eph_lists_spinlock);
 27.1216 +        list_add_tail(&pgp->global_eph_pages,
 27.1217 +            &global_ephemeral_page_list);
 27.1218 +        if (++global_eph_count > global_eph_count_max)
 27.1219 +            global_eph_count_max = global_eph_count;
 27.1220 +        list_add_tail(&pgp->client_eph_pages,
 27.1221 +            &client->ephemeral_page_list);
 27.1222 +        if (++client->eph_count > client->eph_count_max)
 27.1223 +            client->eph_count_max = client->eph_count;
 27.1224 +        tmem_spin_unlock(&eph_lists_spinlock);
 27.1225 +    }
 27.1226 +    ASSERT( ((objnew==obj)||(objfound==obj)) && (objnew!=objfound));
 27.1227 +    if ( is_shared(pool) )
 27.1228 +        obj->last_client = client->cli_id;
 27.1229 +    obj->no_evict = 0;
 27.1230 +    tmem_spin_unlock(&obj->obj_spinlock);
 27.1231 +    pool->good_puts++;
 27.1232 +    return 1;
 27.1233 +
 27.1234 +delete_and_free:
 27.1235 +    ASSERT((obj != NULL) && (pgp != NULL) && (pgp->index != -1));
 27.1236 +    pgpdel = pgp_delete_from_obj(obj, pgp->index);
 27.1237 +    ASSERT(pgp == pgpdel);
 27.1238 +
 27.1239 +free:
 27.1240 +    if ( pgp )
 27.1241 +        pgp_delete(pgp,0);
 27.1242 +    if ( objfound )
 27.1243 +    {
 27.1244 +        objfound->no_evict = 0;
 27.1245 +        tmem_spin_unlock(&objfound->obj_spinlock);
 27.1246 +    }
 27.1247 +    if ( objnew )
 27.1248 +    {
 27.1249 +        tmem_write_lock(&pool->pool_rwlock);
 27.1250 +        obj_free(objnew,0);
 27.1251 +        tmem_write_unlock(&pool->pool_rwlock);
 27.1252 +    }
 27.1253 +    pool->no_mem_puts++;
 27.1254 +    return ret;
 27.1255 +
 27.1256 +bad_copy:
 27.1257 +    /* this should only happen if the client passed a bad mfn */
 27.1258 +    failed_copies++;
 27.1259 +    ASSERT(0);
 27.1260 +    goto free;
 27.1261 +}
 27.1262 +
 27.1263 +static NOINLINE int do_tmem_get(pool_t *pool, uint64_t oid, uint32_t index,
 27.1264 +              tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
 27.1265 +              uint32_t pfn_offset, uint32_t len)
 27.1266 +{
 27.1267 +    obj_t *obj;
 27.1268 +    pgp_t *pgp;
 27.1269 +    client_t *client = pool->client;
 27.1270 +    DECL_LOCAL_CYC_COUNTER(decompress);
 27.1271 +
 27.1272 +    if ( !_atomic_read(pool->pgp_count) )
 27.1273 +        return -EEMPTY;
 27.1274 +
 27.1275 +    pool->gets++;
 27.1276 +    obj = obj_find(pool,oid);
 27.1277 +    if ( obj == NULL )
 27.1278 +        return 0;
 27.1279 +
 27.1280 +    ASSERT_SPINLOCK(&obj->obj_spinlock);
 27.1281 +    if ( is_shared(pool) || is_persistent(pool) )
 27.1282 +        pgp = pgp_lookup_in_obj(obj, index);
 27.1283 +    else
 27.1284 +        pgp = pgp_delete_from_obj(obj, index);
 27.1285 +    if ( pgp == NULL )
 27.1286 +    {
 27.1287 +        obj->no_evict = 0;
 27.1288 +        tmem_spin_unlock(&obj->obj_spinlock);
 27.1289 +        return 0;
 27.1290 +    }
 27.1291 +    ASSERT(pgp->size != -1);
 27.1292 +    if ( pgp->size != 0 )
 27.1293 +    {
 27.1294 +        START_CYC_COUNTER(decompress);
 27.1295 +        if ( tmh_decompress_to_client(cmfn, pgp->cdata, pgp->size) == -EFAULT )
 27.1296 +            goto bad_copy;
 27.1297 +        END_CYC_COUNTER(decompress);
 27.1298 +    }
 27.1299 +    else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset,
 27.1300 +                                 pfn_offset, len) == -EFAULT)
 27.1301 +        goto bad_copy;
 27.1302 +    if ( is_ephemeral(pool) )
 27.1303 +    {
 27.1304 +        if ( is_private(pool) )
 27.1305 +        {
 27.1306 +            pgp_delete(pgp,0);
 27.1307 +            if ( obj->pgp_count == 0 )
 27.1308 +            {
 27.1309 +                tmem_write_lock(&pool->pool_rwlock);
 27.1310 +                obj_free(obj,0);
 27.1311 +                obj = NULL;
 27.1312 +                tmem_write_unlock(&pool->pool_rwlock);
 27.1313 +            }
 27.1314 +        } else {
 27.1315 +            tmem_spin_lock(&eph_lists_spinlock);
 27.1316 +            list_del(&pgp->global_eph_pages);
 27.1317 +            list_add_tail(&pgp->global_eph_pages,&global_ephemeral_page_list);
 27.1318 +            list_del(&pgp->client_eph_pages);
 27.1319 +            list_add_tail(&pgp->client_eph_pages,&client->ephemeral_page_list);
 27.1320 +            tmem_spin_unlock(&eph_lists_spinlock);
 27.1321 +            ASSERT(obj != NULL);
 27.1322 +            obj->last_client = tmh_get_cli_id_from_current();
 27.1323 +        }
 27.1324 +    }
 27.1325 +    if ( obj != NULL )
 27.1326 +    {
 27.1327 +        obj->no_evict = 0;
 27.1328 +        tmem_spin_unlock(&obj->obj_spinlock);
 27.1329 +    }
 27.1330 +    pool->found_gets++;
 27.1331 +    return 1;
 27.1332 +
 27.1333 +bad_copy:
 27.1334 +    /* this should only happen if the client passed a bad mfn */
 27.1335 +    failed_copies++;
 27.1336 +    ASSERT(0);
 27.1337 +    return -EFAULT;
 27.1338 +
 27.1339 +}
 27.1340 +
 27.1341 +static NOINLINE int do_tmem_flush_page(pool_t *pool, uint64_t oid, uint32_t index)
 27.1342 +{
 27.1343 +    obj_t *obj;
 27.1344 +    pgp_t *pgp;
 27.1345 +
 27.1346 +    pool->flushs++;
 27.1347 +    obj = obj_find(pool,oid);
 27.1348 +    if ( obj == NULL )
 27.1349 +        goto out;
 27.1350 +    pgp = pgp_delete_from_obj(obj, index);
 27.1351 +    if ( pgp == NULL )
 27.1352 +    {
 27.1353 +        obj->no_evict = 0;
 27.1354 +        tmem_spin_unlock(&obj->obj_spinlock);
 27.1355 +        goto out;
 27.1356 +    }
 27.1357 +    pgp_delete(pgp,0);
 27.1358 +    if ( obj->pgp_count == 0 )
 27.1359 +    {
 27.1360 +        tmem_write_lock(&pool->pool_rwlock);
 27.1361 +        obj_free(obj,0);
 27.1362 +        tmem_write_unlock(&pool->pool_rwlock);
 27.1363 +    } else {
 27.1364 +        obj->no_evict = 0;
 27.1365 +        tmem_spin_unlock(&obj->obj_spinlock);
 27.1366 +    }
 27.1367 +    pool->flushs_found++;
 27.1368 +
 27.1369 +out:
 27.1370 +    if ( pool->client->frozen )
 27.1371 +        return -EFROZEN;
 27.1372 +    else
 27.1373 +        return 1;
 27.1374 +}
 27.1375 +
 27.1376 +static NOINLINE int do_tmem_flush_object(pool_t *pool, uint64_t oid)
 27.1377 +{
 27.1378 +    obj_t *obj;
 27.1379 +
 27.1380 +    pool->flush_objs++;
 27.1381 +    obj = obj_find(pool,oid);
 27.1382 +    if ( obj == NULL )
 27.1383 +        goto out;
 27.1384 +    tmem_write_lock(&pool->pool_rwlock);
 27.1385 +    obj_destroy(obj);
 27.1386 +    pool->flush_objs_found++;
 27.1387 +    tmem_write_unlock(&pool->pool_rwlock);
 27.1388 +
 27.1389 +out:
 27.1390 +    if ( pool->client->frozen )
 27.1391 +        return -EFROZEN;
 27.1392 +    else
 27.1393 +        return 1;
 27.1394 +}
 27.1395 +
 27.1396 +static NOINLINE int do_tmem_destroy_pool(uint32_t pool_id)
 27.1397 +{
 27.1398 +    client_t *client = tmh_client_from_current();
 27.1399 +    pool_t *pool;
 27.1400 +
 27.1401 +    if ( client->pools == NULL )
 27.1402 +        return 0;
 27.1403 +    if ( (pool = client->pools[pool_id]) == NULL )
 27.1404 +        return 0;
 27.1405 +    client->pools[pool_id] = NULL;
 27.1406 +    pool_flush(pool,client->cli_id,1);
 27.1407 +    return 1;
 27.1408 +}
 27.1409 +
 27.1410 +static NOINLINE int do_tmem_new_pool(uint32_t flags, uint64_t uuid_lo, uint64_t uuid_hi)
 27.1411 +{
 27.1412 +    client_t *client = tmh_client_from_current();
 27.1413 +    cli_id_t cli_id = tmh_get_cli_id_from_current();
 27.1414 +    int persistent = flags & TMEM_POOL_PERSIST;
 27.1415 +    int shared = flags & TMEM_POOL_SHARED;
 27.1416 +    int pagebits = (flags >> TMEM_POOL_PAGESIZE_SHIFT)
 27.1417 +         & TMEM_POOL_PAGESIZE_MASK;
 27.1418 +    int specversion = (flags >> TMEM_POOL_VERSION_SHIFT)
 27.1419 +         & TMEM_POOL_VERSION_MASK;
 27.1420 +    pool_t *pool, *shpool;
 27.1421 +    int s_poolid, d_poolid, first_unused_s_poolid;
 27.1422 +
 27.1423 +    ASSERT(client != NULL);
 27.1424 +    printk("tmem: allocating %s-%s tmem pool for %s=%d...",
 27.1425 +        persistent ? "persistent" : "ephemeral" ,
 27.1426 +        shared ? "shared" : "private", cli_id_str, cli_id);
 27.1427 +    if ( specversion != 0 )
 27.1428 +    {
 27.1429 +        printk("failed... unsupported spec version\n");
 27.1430 +        return -EPERM;
 27.1431 +    }
 27.1432 +    if ( pagebits != (PAGE_SHIFT - 12) )
 27.1433 +    {
 27.1434 +        printk("failed... unsupported pagesize %d\n",1<<(pagebits+12));
 27.1435 +        return -EPERM;
 27.1436 +    }
 27.1437 +    if ( (pool = pool_alloc()) == NULL )
 27.1438 +    {
 27.1439 +        printk("failed... out of memory\n");
 27.1440 +        return -ENOMEM;
 27.1441 +    }
 27.1442 +    for ( d_poolid = 0; d_poolid < MAX_POOLS_PER_DOMAIN; d_poolid++ )
 27.1443 +        if ( client->pools[d_poolid] == NULL )
 27.1444 +            break;
 27.1445 +    if ( d_poolid == MAX_POOLS_PER_DOMAIN )
 27.1446 +    {
 27.1447 +        printk("failed... no more pool slots available for this %s\n",
 27.1448 +            client_str);
 27.1449 +        goto fail;
 27.1450 +    }
 27.1451 +    pool->shared = shared;
 27.1452 +    pool->client = client;
 27.1453 +    if ( shared )
 27.1454 +    {
 27.1455 +        first_unused_s_poolid = MAX_GLOBAL_SHARED_POOLS;
 27.1456 +        for ( s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++ )
 27.1457 +        {
 27.1458 +            if ( (shpool = global_shared_pools[s_poolid]) != NULL )
 27.1459 +            {
 27.1460 +                if ( shpool->uuid[0] == uuid_lo && shpool->uuid[1] == uuid_hi )
 27.1461 +                {
 27.1462 +                    printk("(matches shared pool uuid=%"PRIx64".%"PRIx64") ",
 27.1463 +                        uuid_hi, uuid_lo);
 27.1464 +                    printk("pool_id=%d\n",d_poolid);
 27.1465 +                    client->pools[d_poolid] = global_shared_pools[s_poolid];
 27.1466 +                    shared_pool_join(global_shared_pools[s_poolid], client);
 27.1467 +                    pool_free(pool);
 27.1468 +                    return d_poolid;
 27.1469 +                }
 27.1470 +            }
 27.1471 +            else if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
 27.1472 +                first_unused_s_poolid = s_poolid;
 27.1473 +        }
 27.1474 +        if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
 27.1475 +        {
 27.1476 +            printk("tmem: failed... no global shared pool slots available\n");
 27.1477 +            goto fail;
 27.1478 +        }
 27.1479 +        else
 27.1480 +        {
 27.1481 +            INIT_LIST_HEAD(&pool->share_list);
 27.1482 +            pool->shared_count = 0;
 27.1483 +            global_shared_pools[first_unused_s_poolid] = pool;
 27.1484 +            (void)shared_pool_join(pool,client);
 27.1485 +        }
 27.1486 +    }
 27.1487 +    client->pools[d_poolid] = pool;
 27.1488 +    list_add_tail(&pool->pool_list, &global_pool_list);
 27.1489 +    pool->pool_id = d_poolid;
 27.1490 +    pool->persistent = persistent;
 27.1491 +    pool->uuid[0] = uuid_lo; pool->uuid[1] = uuid_hi;
 27.1492 +    printk("pool_id=%d\n",d_poolid);
 27.1493 +    return d_poolid;
 27.1494 +
 27.1495 +fail:
 27.1496 +    pool_free(pool);
 27.1497 +    return -EPERM;
 27.1498 +}
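
For reference, a guest-side sketch of how the flags word decoded above could
be assembled. The TMEM_POOL_* constants are the ones this function tests; the
version and pagesize encodings are inferred from its checks (spec version must
be 0 and pagebits must equal PAGE_SHIFT - 12), and the helper itself is
hypothetical:

    /* hypothetical helper, not part of this patch */
    static uint32_t make_tmem_pool_flags(int persistent, int shared)
    {
        uint32_t flags = 0;

        if ( persistent )
            flags |= TMEM_POOL_PERSIST;
        if ( shared )
            flags |= TMEM_POOL_SHARED;
        flags |= 0 << TMEM_POOL_VERSION_SHIFT;                   /* spec version 0 */
        flags |= (PAGE_SHIFT - 12) << TMEM_POOL_PAGESIZE_SHIFT;  /* 4K encodes as 0 */
        return flags;
    }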
 27.1499 +
 27.1500 +/************ TMEM CONTROL OPERATIONS ************************************/
 27.1501 +
 27.1502 +/* freeze/thaw all pools belonging to client cli_id (all domains if -1) */
 27.1503 +static int tmemc_freeze_pools(int cli_id, int arg)
 27.1504 +{
 27.1505 +    client_t *client;
 27.1506 +    bool_t freeze = (arg == TMEMC_FREEZE) ? 1 : 0;
 27.1507 +    bool_t destroy = (arg == TMEMC_DESTROY) ? 1 : 0;
 27.1508 +    char *s;
 27.1509 +
 27.1510 +    s = destroy ? "destroyed" : ( freeze ? "frozen" : "thawed" );
 27.1511 +    if ( cli_id == CLI_ID_NULL )
 27.1512 +    {
 27.1513 +        list_for_each_entry(client,&global_client_list,client_list)
 27.1514 +        {
 27.1515 +            client->frozen = freeze;
 27.1516 +            printk("tmem: all pools %s for all %ss\n",s,client_str);
 27.1517 +        }
 27.1518 +    }
 27.1519 +    else
 27.1520 +    {
 27.1521 +        if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
 27.1522 +            return -1;
 27.1523 +        client->frozen = freeze;
 27.1524 +        printk("tmem: all pools %s for %s=%d\n",s,cli_id_str,cli_id);
 27.1525 +    }
 27.1526 +    return 0;
 27.1527 +}
 27.1528 +
 27.1529 +static int tmemc_flush_mem(int cli_id, uint32_t kb)
 27.1530 +{
 27.1531 +    uint32_t npages, flushed_pages, flushed_kb;
 27.1532 +
 27.1533 +    if ( cli_id != CLI_ID_NULL )
 27.1534 +    {
 27.1535 +        printk("tmem: %s-specific flush not supported yet, use --all\n",
 27.1536 +           client_str);
 27.1537 +        return -1;
 27.1538 +    }
 27.1539 +    /* convert kb to pages, rounding up if necessary */
 27.1540 +    npages = (kb + ((1 << (PAGE_SHIFT-10))-1)) >> (PAGE_SHIFT-10);
 27.1541 +    flushed_pages = tmem_relinquish_npages(npages);
 27.1542 +    flushed_kb = flushed_pages << (PAGE_SHIFT-10);
 27.1543 +    return flushed_kb;
 27.1544 +}
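
To make the rounding concrete, a standalone check assuming PAGE_SHIFT == 12
(4kB pages, so the kb-to-pages shift is 2):

    #include <assert.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12  /* assumption for this example */

    int main(void)
    {
        uint32_t kb = 9;
        uint32_t npages = (kb + ((1 << (PAGE_SHIFT-10))-1)) >> (PAGE_SHIFT-10);

        assert(npages == 3);                        /* 9kB rounds up to 3 pages */
        assert((npages << (PAGE_SHIFT-10)) == 12);  /* reported back as 12kB */
        return 0;
    }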
 27.1545 +
 27.1546 +/*
 27.1547 + * These tmemc_list* routines output lots of stats in a format that is
 27.1548 + *  intended to be program-parseable, not human-readable. Further, by
 27.1549 + *  tying each group of stats to a line format indicator (e.g. G= for
 27.1550 + *  global stats) and each individual stat to a two-letter specifier
 27.1551 + *  (e.g. Ec:nnnnn in the G= line says there are nnnnn pages in the
 27.1552 + *  global ephemeral pool), it should allow the stats reported to be
  27.1553 + *  forward- and backward-compatible as tmem evolves.
 27.1554 + */
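
As an illustration, output synthesized from the C=, P= and S= format strings
below (not captured from a real run) would look like:

    C=CI:1,ww:100,ca:0,co:1,fr:0
    P=CI:1,PI:0,PT:EP,U0:0,U1:0
    S=SI:0,PT:ES,U0:abc123,U1:40,SC:1,SC:2

so a parser can key on the tag before '=' and on the two-letter field tags,
ignoring any fields it does not recognize.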
 27.1555 +#define BSIZE 1024
 27.1556 +
 27.1557 +static int tmemc_list_client(client_t *c, tmem_cli_va_t buf, int off, 
 27.1558 +                             uint32_t len, bool_t use_long)
 27.1559 +{
 27.1560 +    char info[BSIZE];
 27.1561 +    int i, n = 0, sum = 0;
 27.1562 +    pool_t *p;
 27.1563 +    bool_t s;
 27.1564 +
 27.1565 +    n = scnprintf(info,BSIZE,"C=CI:%d,ww:%d,ca:%d,co:%d,fr:%d%c",
 27.1566 +        c->cli_id, c->weight, c->cap, c->compress,
 27.1567 +        c->frozen, use_long ? ',' : '\n');
 27.1568 +    if (use_long)
 27.1569 +        n += scnprintf(info+n,BSIZE-n,
 27.1570 +             "Ec:%ld,Em:%ld,cp:%ld,cb:%lld,cn:%ld,cm:%ld\n",
 27.1571 +             c->eph_count, c->eph_count_max,
 27.1572 +             c->compressed_pages, (long long)c->compressed_sum_size,
 27.1573 +             c->compress_poor, c->compress_nomem);
 27.1574 +    if ( sum + n >= len )
 27.1574 +        return sum;
 27.1574 +    tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
 27.1575 +    sum += n;
 27.1576 +    for ( i = 0; i < MAX_POOLS_PER_DOMAIN; i++ )
 27.1577 +    {
 27.1578 +        if ( (p = c->pools[i]) == NULL )
 27.1579 +            continue;
 27.1580 +        s = is_shared(p);
 27.1581 +        n = scnprintf(info,BSIZE,"P=CI:%d,PI:%d,PT:%c%c,U0:%llx,U1:%llx%c",
 27.1582 +             c->cli_id, p->pool_id,
 27.1583 +             is_persistent(p) ? 'P' : 'E', s ? 'S' : 'P',
 27.1584 +             s ? p->uuid[0] : 0LL, s ? p->uuid[1] : 0LL,
 27.1585 +             use_long ? ',' : '\n');
 27.1586 +        if (use_long)
 27.1587 +            n += scnprintf(info+n,BSIZE-n,
 27.1588 +             "Pc:%d,Pm:%d,Oc:%ld,Om:%ld,Nc:%lu,Nm:%lu,"
 27.1589 +             "ps:%lu,pt:%lu,pd:%lu,pr:%lu,px:%lu,gs:%lu,gt:%lu,"
 27.1590 +             "fs:%lu,ft:%lu,os:%lu,ot:%lu\n",
 27.1591 +             _atomic_read(p->pgp_count), p->pgp_count_max,
 27.1592 +             p->obj_count, p->obj_count_max,
 27.1593 +             p->objnode_count, p->objnode_count_max,
 27.1594 +             p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced,
 27.1595 +             p->no_mem_puts, 
 27.1596 +             p->found_gets, p->gets,
 27.1597 +             p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs);
 27.1598 +        if ( sum + n >= len )
 27.1599 +            return sum;
 27.1600 +        tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
 27.1601 +        sum += n;
 27.1602 +    }
 27.1603 +    return sum;
 27.1604 +}
 27.1605 +
 27.1606 +static int tmemc_list_shared(tmem_cli_va_t buf, int off, uint32_t len,
 27.1607 +                              bool_t use_long)
 27.1608 +{
 27.1609 +    char info[BSIZE];
 27.1610 +    int i, n = 0, sum = 0;
 27.1611 +    pool_t *p;
 27.1612 +    sharelist_t *sl;
 27.1613 +
 27.1614 +    for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++ )
 27.1615 +    {
 27.1616 +        if ( (p = global_shared_pools[i]) == NULL )
 27.1617 +            continue;
 27.1618 +        n = scnprintf(info,BSIZE,"S=SI:%d,PT:%c%c,U0:%llx,U1:%llx",
 27.1619 +            i, is_persistent(p) ? 'P' : 'E', is_shared(p) ? 'S' : 'P',
 27.1620 +             (unsigned long long)p->uuid[0], (unsigned long long)p->uuid[1]);
 27.1621 +        list_for_each_entry(sl,&p->share_list, share_list)
 27.1622 +            n += scnprintf(info+n,BSIZE-n,",SC:%d",sl->client->cli_id);
 27.1623 +        n += scnprintf(info+n,BSIZE-n,"%c", use_long ? ',' : '\n');
 27.1624 +        if (use_long)
 27.1625 +            n += scnprintf(info+n,BSIZE-n,
 27.1626 +             "Pc:%d,Pm:%d,Oc:%ld,Om:%ld,Nc:%lu,Nm:%lu,"
 27.1627 +             "ps:%lu,pt:%lu,pd:%lu,pr:%lu,px:%lu,gs:%lu,gt:%lu,"
 27.1628 +             "fs:%lu,ft:%lu,os:%lu,ot:%lu\n",
 27.1629 +             _atomic_read(p->pgp_count), p->pgp_count_max,
 27.1630 +             p->obj_count, p->obj_count_max,
 27.1631 +             p->objnode_count, p->objnode_count_max,
 27.1632 +             p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced,
 27.1633 +             p->no_mem_puts, 
 27.1634 +             p->found_gets, p->gets,
 27.1635 +             p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs);
 27.1636 +        if ( sum + n >= len )
 27.1637 +            return sum;
 27.1638 +        tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
 27.1639 +        sum += n;
 27.1640 +    }
 27.1641 +    return sum;
 27.1642 +}
 27.1643 +
 27.1644 +#ifdef TMEM_PERF
 27.1645 +static int tmemc_list_global_perf(tmem_cli_va_t buf, int off, uint32_t len,
 27.1646 +                              bool_t use_long)
 27.1647 +{
 27.1648 +    char info[BSIZE];
 27.1649 +    int n = 0, sum = 0;
 27.1650 +
 27.1651 +    n = scnprintf(info+n,BSIZE-n,"T=");
 27.1652 +    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,succ_get,"G");
 27.1653 +    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,succ_put,"P");
 27.1654 +    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,non_succ_get,"g");
 27.1655 +    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,non_succ_put,"p");
 27.1656 +    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,flush,"F");
 27.1657 +    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,flush_obj,"O");
 27.1658 +#ifdef COMPARE_COPY_PAGE_SSE2
 27.1659 +    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy1,"1");
 27.1660 +    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy2,"2");
 27.1661 +    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy3,"3");
 27.1662 +    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy4,"4");
 27.1663 +#else
 27.1664 +    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy,"C");
 27.1665 +#endif
 27.1666 +    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,compress,"c");
 27.1667 +    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,decompress,"d");
 27.1668 +    n--; /* overwrite trailing comma */
 27.1669 +    n += scnprintf(info+n,BSIZE-n,"\n");
 27.1670 +    if ( sum + n >= len )
 27.1671 +        return sum;
 27.1672 +    tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
 27.1673 +    sum += n;
 27.1674 +    return sum;
 27.1675 +}
 27.1676 +#else
 27.1677 +#define tmemc_list_global_perf(_buf,_off,_len,_use) (0)
 27.1678 +#endif
 27.1679 +
 27.1680 +static int tmemc_list_global(tmem_cli_va_t buf, int off, uint32_t len,
 27.1681 +                              bool_t use_long)
 27.1682 +{
 27.1683 +    char info[BSIZE];
 27.1684 +    int n = 0, sum = 0;
 27.1685 +
 27.1686 +    n += scnprintf(info,BSIZE,"G="
 27.1687 +      "Tt:%lu,Te:%lu,Cf:%lu,Af:%lu,Pf:%lu,Ta:%lu,"
 27.1688 +      "Lm:%lu,Et:%lu,Ea:%lu,Rt:%lu,Ra:%lu,Rx:%lu,Fp:%lu%c",
 27.1689 +      total_tmem_ops, errored_tmem_ops, failed_copies,
 27.1690 +      alloc_failed, alloc_page_failed, tmh_avail_pages(),
 27.1691 +      low_on_memory, evicted_pgs,
 27.1692 +      evict_attempts, relinq_pgs, relinq_attempts, max_evicts_per_relinq,
 27.1693 +      total_flush_pool, use_long ? ',' : '\n');
 27.1694 +    if (use_long)
 27.1695 +        n += scnprintf(info+n,BSIZE-n,
 27.1696 +          "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d\n",
 27.1697 +          global_eph_count, global_eph_count_max,
 27.1698 +          _atomic_read(global_obj_count), global_obj_count_max,
 27.1699 +          _atomic_read(global_rtree_node_count), global_rtree_node_count_max,
 27.1700 +          _atomic_read(global_pgp_count), global_pgp_count_max);
 27.1701 +    if ( sum + n >= len )
 27.1702 +        return sum;
 27.1703 +    tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
 27.1704 +    sum += n;
 27.1705 +    return sum;
 27.1706 +}
 27.1707 +
 27.1708 +static int tmemc_list(int cli_id, tmem_cli_va_t buf, uint32_t len,
 27.1709 +                               bool_t use_long)
 27.1710 +{
 27.1711 +    client_t *client;
 27.1712 +    int off = 0;
 27.1713 +
 27.1714 +    if ( cli_id == CLI_ID_NULL ) {
 27.1715 +        off = tmemc_list_global(buf,0,len,use_long);
 27.1716 +        off += tmemc_list_shared(buf,off,len-off,use_long);
 27.1717 +        list_for_each_entry(client,&global_client_list,client_list)
 27.1718 +            off += tmemc_list_client(client, buf, off, len-off, use_long);
 27.1719 +        off += tmemc_list_global_perf(buf,off,len-off,use_long);
 27.1720 +    }
 27.1721 +    else if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
 27.1722 +        return -1;
 27.1723 +    else
 27.1724 +        off = tmemc_list_client(client, buf, 0, len, use_long);
 27.1725 +
 27.1727 +    return 0;
 27.1728 +}
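
Because every stat is a two-letter key followed by ':' and a value, a
toolstack-side parser can extract fields generically. A minimal user-space
sketch (hypothetical helper; decimal fields only -- U0/U1 are printed in
hex and would need base 16):

    #include <stdlib.h>
    #include <string.h>

    /* return the value of e.g. the "Ec:" field in one stats line, 0 if absent */
    static unsigned long long parse_field(const char *line, const char *tag)
    {
        const char *p = strstr(line, tag);

        return (p == NULL) ? 0 : strtoull(p + strlen(tag), NULL, 10);
    }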
 27.1729 +
 27.1730 +static int tmemc_set_var_one(client_t *client, uint32_t subop, uint32_t arg1)
 27.1731 +{
 27.1732 +    cli_id_t cli_id = client->cli_id;
 27.1733 +    uint32_t old_weight;
 27.1734 +
 27.1735 +    switch (subop)
 27.1736 +    {
 27.1737 +    case TMEMC_SET_WEIGHT:
 27.1738 +        old_weight = client->weight;
 27.1739 +        client->weight = arg1;
 27.1740 +        printk("tmem: weight set to %d for %s=%d\n",arg1,cli_id_str,cli_id);
 27.1741 +        atomic_sub(old_weight,&client_weight_total);
 27.1742 +        atomic_add(client->weight,&client_weight_total);
 27.1743 +        break;
 27.1744 +    case TMEMC_SET_CAP:
 27.1745 +        client->cap = arg1;
 27.1746 +        printk("tmem: cap set to %d for %s=%d\n",arg1,cli_id_str,cli_id);
 27.1747 +        break;
 27.1748 +    case TMEMC_SET_COMPRESS:
 27.1749 +        client->compress = arg1 ? 1 : 0;
 27.1750 +        printk("tmem: compression %s for %s=%d\n",
 27.1751 +            arg1 ? "enabled" : "disabled",cli_id_str,cli_id);
 27.1752 +        break;
 27.1753 +    default:
 27.1754 +        printk("tmem: unknown subop %d for tmemc_set_var\n",subop);
 27.1755 +        return -1;
 27.1756 +    }
 27.1757 +    return 0;
 27.1758 +}
 27.1759 +
 27.1760 +static int tmemc_set_var(int cli_id, uint32_t subop, uint32_t arg1)
 27.1761 +{
 27.1762 +    client_t *client;
 27.1763 +
 27.1764 +    if ( cli_id == CLI_ID_NULL )
 27.1765 +        list_for_each_entry(client,&global_client_list,client_list)
 27.1766 +            tmemc_set_var_one(client, subop, arg1);
 27.1767 +    else if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
 27.1768 +        return -1;
 27.1769 +    else
 27.1770 +        tmemc_set_var_one(client, subop, arg1);
 27.1771 +    return 0;
 27.1772 +}
 27.1773 +
 27.1774 +static int do_tmem_control(uint32_t subop, uint32_t cli_id32,
 27.1775 +   uint32_t arg1, uint32_t arg2, tmem_cli_va_t buf)
 27.1776 +{
 27.1777 +    int ret;
 27.1778 +    cli_id_t cli_id = (cli_id_t)cli_id32;
 27.1779 +
 27.1780 +    if (!tmh_current_is_privileged())
 27.1781 +    {
 27.1782 +        /* don't fail... mystery: sometimes dom0 fails here */
 27.1783 +        /* return -EPERM; */
 27.1784 +    }
 27.1785 +    switch(subop)
 27.1786 +    {
 27.1787 +    case TMEMC_THAW:
 27.1788 +    case TMEMC_FREEZE:
 27.1789 +    case TMEMC_DESTROY:
 27.1790 +        ret = tmemc_freeze_pools(cli_id,subop);
 27.1791 +        break;
 27.1792 +    case TMEMC_FLUSH:
 27.1793 +        ret = tmemc_flush_mem(cli_id,arg1);
 27.1794 +        break;
 27.1795 +    case TMEMC_LIST:
 27.1796 +        ret = tmemc_list(cli_id,buf,arg1,arg2);
 27.1797 +        break;
 27.1798 +    case TMEMC_SET_WEIGHT:
 27.1799 +    case TMEMC_SET_CAP:
 27.1800 +    case TMEMC_SET_COMPRESS:
 27.1801 +        ret = tmemc_set_var(cli_id,subop,arg1);
 27.1802 +        break;
 27.1803 +    default:
 27.1804 +        ret = -1;
 27.1805 +    }
 27.1806 +    return ret;
 27.1807 +}
 27.1808 +
 27.1809 +/************ EXPORTed FUNCTIONS **************************************/
 27.1810 +
 27.1811 +EXPORT long do_tmem_op(tmem_cli_op_t uops)
 27.1812 +{
 27.1813 +    struct tmem_op op;
 27.1814 +    client_t *client = tmh_client_from_current();
 27.1815 +    pool_t *pool = NULL;
 27.1816 +    int rc = 0;
 27.1817 +    bool_t succ_get = 0, succ_put = 0;
 27.1818 +    bool_t non_succ_get = 0, non_succ_put = 0;
 27.1819 +    bool_t flush = 0, flush_obj = 0;
 27.1820 +    bool_t tmem_write_lock_set = 0, tmem_read_lock_set = 0;
 27.1821 +    static bool_t warned = 0;
 27.1822 +    DECL_LOCAL_CYC_COUNTER(succ_get);
 27.1823 +    DECL_LOCAL_CYC_COUNTER(succ_put);
 27.1824 +    DECL_LOCAL_CYC_COUNTER(non_succ_get);
 27.1825 +    DECL_LOCAL_CYC_COUNTER(non_succ_put);
 27.1826 +    DECL_LOCAL_CYC_COUNTER(flush);
 27.1827 +    DECL_LOCAL_CYC_COUNTER(flush_obj);
 27.1828 +
 27.1829 +    if ( !tmem_initialized )
 27.1830 +    {
 27.1831 +        if ( !warned )
 27.1832 +            printk("tmem: must specify tmem parameter on xen boot line\n");
 27.1833 +        warned = 1;
 27.1834 +        return -ENODEV;
 27.1835 +    }
 27.1836 +
 27.1837 +    total_tmem_ops++;
 27.1838 +
 27.1839 +    if ( tmh_lock_all )
 27.1840 +    {
 27.1841 +        if ( tmh_lock_all > 1 )
 27.1842 +            spin_lock_irq(&tmem_spinlock);
 27.1843 +        else
 27.1844 +            spin_lock(&tmem_spinlock);
 27.1845 +    }
 27.1846 +
 27.1847 +    START_CYC_COUNTER(succ_get);
 27.1848 +    DUP_START_CYC_COUNTER(succ_put,succ_get);
 27.1849 +    DUP_START_CYC_COUNTER(non_succ_get,succ_get);
 27.1850 +    DUP_START_CYC_COUNTER(non_succ_put,succ_get);
 27.1851 +    DUP_START_CYC_COUNTER(flush,succ_get);
 27.1852 +    DUP_START_CYC_COUNTER(flush_obj,succ_get);
 27.1853 +
 27.1854 +    if ( unlikely(tmh_get_tmemop_from_client(&op, uops) != 0) )
 27.1855 +    {
 27.1856 +        printk("tmem: can't get tmem struct from %s\n",client_str);
 27.1857 +        rc = -EFAULT;
 27.1858 +        goto out;
 27.1859 +    }
 27.1860 +
 27.1861 +    if ( op.cmd == TMEM_CONTROL )
 27.1862 +    {
 27.1863 +        tmem_write_lock(&tmem_rwlock);
 27.1864 +        tmem_write_lock_set = 1;
 27.1865 +        rc = do_tmem_control(op.subop, op.cli_id, op.arg1, op.arg2, op.buf);
 27.1866 +        goto out;
 27.1867 +    }
 27.1868 +
 27.1869 +    /* create per-client tmem structure dynamically on first use by client */
 27.1870 +    if ( client == NULL )
 27.1871 +    {
 27.1872 +        tmem_write_lock(&tmem_rwlock);
 27.1873 +        tmem_write_lock_set = 1;
 27.1874 +        if ( (client = client_create()) == NULL )
 27.1875 +        {
 27.1876 +            printk("tmem: can't create tmem structure for %s\n",client_str);
 27.1877 +            rc = -ENOMEM;
 27.1878 +            goto out;
 27.1879 +        }
 27.1880 +    }
 27.1881 +
 27.1882 +    if ( op.cmd == TMEM_NEW_POOL )
 27.1883 +    {
 27.1884 +        if ( !tmem_write_lock_set )
 27.1885 +        {
 27.1886 +            tmem_write_lock(&tmem_rwlock);
 27.1887 +            tmem_write_lock_set = 1;
 27.1888 +        }
 27.1889 +    }
 27.1890 +    else
 27.1891 +    {
 27.1892 +        if ( !tmem_write_lock_set )
 27.1893 +        {
 27.1894 +            tmem_read_lock(&tmem_rwlock);
 27.1895 +            tmem_read_lock_set = 1;
 27.1896 +        }
 27.1897 +        if ( ((uint32_t)op.pool_id >= MAX_POOLS_PER_DOMAIN) ||
 27.1898 +             ((pool = client->pools[op.pool_id]) == NULL) )
 27.1899 +        {
 27.1900 +            rc = -ENODEV;
 27.1901 +            printk("tmem: operation requested on uncreated pool\n");
 27.1902 +            goto out;
 27.1903 +        }
 27.1904 +        ASSERT_SENTINEL(pool,POOL);
 27.1905 +    }
 27.1906 +
 27.1907 +    switch ( op.cmd )
 27.1908 +    {
 27.1909 +    case TMEM_NEW_POOL:
 27.1910 +        rc = do_tmem_new_pool(op.flags,op.uuid[0],op.uuid[1]);
 27.1911 +        break;
 27.1912 +    case TMEM_NEW_PAGE:
 27.1913 +        rc = do_tmem_put(pool, op.object, op.index, op.cmfn, 0, 0, 0);
 27.1914 +        break;
 27.1915 +    case TMEM_PUT_PAGE:
 27.1916 +        rc = do_tmem_put(pool, op.object, op.index, op.cmfn, 0, 0, PAGE_SIZE);
 27.1917 +        if (rc == 1) succ_put = 1;
 27.1918 +        else non_succ_put = 1;
 27.1919 +        break;
 27.1920 +    case TMEM_GET_PAGE:
 27.1921 +        rc = do_tmem_get(pool, op.object, op.index, op.cmfn, 0, 0, PAGE_SIZE);
 27.1922 +        if (rc == 1) succ_get = 1;
 27.1923 +        else non_succ_get = 1;
 27.1924 +        break;
 27.1925 +    case TMEM_FLUSH_PAGE:
 27.1926 +        flush = 1;
 27.1927 +        rc = do_tmem_flush_page(pool, op.object, op.index);
 27.1928 +        break;
 27.1929 +    case TMEM_FLUSH_OBJECT:
 27.1930 +        rc = do_tmem_flush_object(pool, op.object);
 27.1931 +        flush_obj = 1;
 27.1932 +        break;
 27.1933 +    case TMEM_DESTROY_POOL:
 27.1934 +        flush = 1;
 27.1935 +        rc = do_tmem_destroy_pool(op.pool_id);
 27.1936 +        break;
 27.1937 +    case TMEM_READ:
 27.1938 +        rc = do_tmem_get(pool, op.object, op.index, op.cmfn,
 27.1939 +                         op.tmem_offset, op.pfn_offset, op.len);
 27.1940 +        break;
 27.1941 +    case TMEM_WRITE:
 27.1942 +        rc = do_tmem_put(pool, op.object, op.index, op.cmfn,
 27.1943 +                         op.tmem_offset, op.pfn_offset, op.len);
 27.1944 +        break;
 27.1945 +    case TMEM_XCHG:
 27.1946 +        /* need to hold global lock to ensure xchg is atomic */
 27.1947 +        printk("tmem_xchg op not implemented yet\n");
 27.1948 +        rc = 0;
 27.1949 +        break;
 27.1950 +    default:
 27.1951 +        printk("tmem: op %d not implemented\n", op.cmd);
 27.1952 +        rc = 0;
 27.1953 +        break;
 27.1954 +    }
 27.1955 +
 27.1956 +out:
 27.1957 +    if ( rc < 0 )
 27.1958 +        errored_tmem_ops++;
 27.1959 +    if ( succ_get )
 27.1960 +        END_CYC_COUNTER(succ_get);
 27.1961 +    else if ( succ_put )
 27.1962 +        END_CYC_COUNTER(succ_put);
 27.1963 +    else if ( non_succ_get )
 27.1964 +        END_CYC_COUNTER(non_succ_get);
 27.1965 +    else if ( non_succ_put )
 27.1966 +        END_CYC_COUNTER(non_succ_put);
 27.1967 +    else if ( flush )
 27.1968 +        END_CYC_COUNTER(flush);
 27.1969 +    else if ( flush_obj )
 27.1970 +        END_CYC_COUNTER(flush_obj);
 27.1971 +
 27.1972 +    if ( tmh_lock_all )
 27.1973 +    {
 27.1974 +        if ( tmh_lock_all > 1 )
 27.1975 +            spin_unlock_irq(&tmem_spinlock);
 27.1976 +        else
 27.1977 +            spin_unlock(&tmem_spinlock);
 27.1978 +    } else {
 27.1979 +        if ( tmem_write_lock_set )
 27.1980 +            write_unlock(&tmem_rwlock);
 27.1981 +        else if ( tmem_read_lock_set )
 27.1982 +            read_unlock(&tmem_rwlock);
 27.1983 +        else 
 27.1984 +            ASSERT(0);
 27.1985 +    }
 27.1986 +
 27.1987 +    return rc;
 27.1988 +}
 27.1989 +
 27.1990 +/* this should be called when the host is destroying a client */
 27.1991 +EXPORT void tmem_destroy(void *v)
 27.1992 +{
 27.1993 +    client_t *client = (client_t *)v;
 27.1994 +
 27.1995 +    if ( tmh_lock_all )
 27.1996 +        spin_lock(&tmem_spinlock);
 27.1997 +    else
 27.1998 +        write_lock(&tmem_rwlock);
 27.1999 +
 27.2000 +    if ( client == NULL )
 27.2001 +        printk("tmem: can't destroy tmem pools, %s is NULL\n",
 27.2002 +               client_str);
 27.2003 +    else
 27.2004 +    {
 27.2005 +        printk("tmem: flushing tmem pools for %s=%d\n",
 27.2006 +               cli_id_str,client->cli_id);
 27.2007 +        client_flush(client,1);
 27.2008 +    }
 27.2009 +
 27.2010 +    if ( tmh_lock_all )
 27.2011 +        spin_unlock(&tmem_spinlock);
 27.2012 +    else
 27.2013 +        write_unlock(&tmem_rwlock);
 27.2014 +}
 27.2015 +
 27.2016 +/* freezing all pools guarantees that no additional memory will be consumed */
 27.2017 +EXPORT void tmem_freeze_all(unsigned char key)
 27.2018 +{
 27.2019 +    static int freeze = 0;
 27.2020 +
 27.2021 +    if ( tmh_lock_all )
 27.2022 +        spin_lock(&tmem_spinlock);
 27.2023 +    else
 27.2024 +        write_lock(&tmem_rwlock);
 27.2025 +
 27.2026 +    freeze = !freeze;
 27.2027 +    tmemc_freeze_pools(CLI_ID_NULL, freeze ? TMEMC_FREEZE : TMEMC_THAW);
 27.2028 +
 27.2029 +    if ( tmh_lock_all )
 27.2030 +        spin_unlock(&tmem_spinlock);
 27.2031 +    else
 27.2032 +        write_unlock(&tmem_rwlock);
 27.2033 +}
 27.2034 +
 27.2035 +#define MAX_EVICTS 10  /* should be variable or set via TMEMC_ ?? */
 27.2036 +
 27.2037 +EXPORT void *tmem_relinquish_pages(unsigned int order, unsigned int memflags)
 27.2038 +{
 27.2039 +    pfp_t *pfp;
 27.2040 +    unsigned long evicts_per_relinq = 0;
 27.2041 +    int max_evictions = MAX_EVICTS;
 27.2042 +
 27.2043 +    if (!tmh_enabled())
 27.2044 +        return NULL;
 27.2045 +#ifdef __i386__
 27.2046 +    return NULL;
 27.2047 +#endif
 27.2048 +
 27.2049 +    relinq_attempts++;
 27.2050 +    if ( order > 0 )
 27.2051 +    {
 27.2052 +        printk("tmem_relinquish_pages: failing order=%d\n", order);
 27.2053 +        return NULL;
 27.2054 +    }
 27.2055 +
 27.2056 +    if ( tmh_called_from_tmem(memflags) )
 27.2057 +    {
 27.2058 +        if ( tmh_lock_all )
 27.2059 +            spin_lock(&tmem_spinlock);
 27.2060 +        else
 27.2061 +            read_lock(&tmem_rwlock);
 27.2062 +    }
 27.2063 +
 27.2064 +    while ( (pfp = tmh_alloc_page(NULL,1)) == NULL )
 27.2065 +    {
 27.2066 +        if ( (max_evictions-- <= 0) || !tmem_evict())
 27.2067 +            break;
 27.2068 +        evicts_per_relinq++;
 27.2069 +    }
 27.2070 +    if ( evicts_per_relinq > max_evicts_per_relinq )
 27.2071 +        max_evicts_per_relinq = evicts_per_relinq;
 27.2072 +    tmh_scrub_page(pfp, memflags);
 27.2073 +    if ( pfp != NULL )
 27.2074 +        relinq_pgs++;
 27.2075 +
 27.2076 +    if ( tmh_called_from_tmem(memflags) )
 27.2077 +    {
 27.2078 +        if ( tmh_lock_all )
 27.2079 +            spin_unlock(&tmem_spinlock);
 27.2080 +        else
 27.2081 +            read_unlock(&tmem_rwlock);
 27.2082 +    }
 27.2083 +
 27.2084 +    return pfp;
 27.2085 +}
 27.2086 +
 27.2087 +/* called at hypervisor startup */
 27.2088 +EXPORT void init_tmem(void)
 27.2089 +{
 27.2090 +    if ( !tmh_enabled() )
 27.2091 +        return;
 27.2092 +
 27.2093 +    radix_tree_init();
 27.2094 +    if ( tmh_init() )
 27.2095 +    {
 27.2096 +        printk("tmem: initialized comp=%d global-lock=%d\n",
 27.2097 +            tmh_compression_enabled(), tmh_lock_all);
 27.2098 +        tmem_initialized = 1;
 27.2099 +    }
 27.2100 +    else
 27.2101 +        printk("tmem: initialization FAILED\n");
 27.2102 +}
 27.2103 +
 27.2104 +/*
 27.2105 + * Local variables:
 27.2106 + * mode: C
 27.2107 + * c-set-style: "BSD"
 27.2108 + * c-basic-offset: 4
 27.2109 + * tab-width: 4
 27.2110 + * indent-tabs-mode: nil
 27.2111 + * End:
 27.2112 + */
    28.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    28.2 +++ b/xen/common/tmem_xen.c	Tue May 26 11:05:04 2009 +0100
    28.3 @@ -0,0 +1,334 @@
    28.4 +/******************************************************************************
    28.5 + * tmem_xen.c
    28.6 + *
    28.7 + * Xen-specific Transcendent memory
    28.8 + *
    28.9 + * Copyright (c) 2009, Dan Magenheimer, Oracle Corp.
   28.10 + */
   28.11 +
   28.12 +#include <xen/tmem.h>
   28.13 +#include <xen/tmem_xen.h>
   28.14 +#include <xen/lzo.h> /* compression code */
   28.15 +#include <xen/paging.h>
   28.16 +#include <xen/domain_page.h>
   28.17 +
   28.18 +#define EXPORT /* indicates code other modules are dependent upon */
   28.19 +
   28.20 +EXPORT int opt_tmem = 0;
   28.21 +boolean_param("tmem", opt_tmem);
   28.22 +
   28.23 +EXPORT int opt_tmem_compress = 0;
   28.24 +boolean_param("tmem_compress", opt_tmem_compress);
   28.25 +
   28.26 +EXPORT int opt_tmem_lock = 0;
   28.27 +integer_param("tmem_lock", opt_tmem_lock);
   28.28 +
   28.29 +#ifdef COMPARE_COPY_PAGE_SSE2
   28.30 +DECL_CYC_COUNTER(pg_copy1);
   28.31 +DECL_CYC_COUNTER(pg_copy2);
   28.32 +DECL_CYC_COUNTER(pg_copy3);
   28.33 +DECL_CYC_COUNTER(pg_copy4);
   28.34 +#else
   28.35 +DECL_CYC_COUNTER(pg_copy);
   28.36 +#endif
   28.37 +
   28.38 +/* these buffers are per-cpu to avoid a concurrency bottleneck; they are
   28.39 + * allocated for every cpu but could be allocated iff opt_tmem_compress */
   28.40 +#define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS
   28.41 +#define LZO_DSTMEM_PAGES 2
   28.42 +static DEFINE_PER_CPU(unsigned char *, workmem);
   28.43 +static DEFINE_PER_CPU(unsigned char *, dstmem);
   28.44 +
   28.45 +#ifdef COMPARE_COPY_PAGE_SSE2
   28.46 +#include <asm/flushtlb.h>  /* REMOVE ME AFTER TEST */
   28.47 +#include <asm/page.h>  /* REMOVE ME AFTER TEST */
   28.48 +#endif
   28.49 +void tmh_copy_page(char *to, char *from)
   28.50 +{
   28.51 +#ifdef COMPARE_COPY_PAGE_SSE2
   28.52 +    DECL_LOCAL_CYC_COUNTER(pg_copy1);
   28.53 +    DECL_LOCAL_CYC_COUNTER(pg_copy2);
   28.54 +    DECL_LOCAL_CYC_COUNTER(pg_copy3);
   28.55 +    DECL_LOCAL_CYC_COUNTER(pg_copy4);
   28.56 +    *to = *from;  /* don't measure TLB misses */
   28.57 +    flush_area_local(to,FLUSH_CACHE|FLUSH_ORDER(0));
   28.58 +    flush_area_local(from,FLUSH_CACHE|FLUSH_ORDER(0));
   28.59 +    START_CYC_COUNTER(pg_copy1);
   28.60 +    copy_page_sse2(to, from);  /* cold cache */
   28.61 +    END_CYC_COUNTER(pg_copy1);
   28.62 +    START_CYC_COUNTER(pg_copy2);
   28.63 +    copy_page_sse2(to, from);  /* hot cache */
   28.64 +    END_CYC_COUNTER(pg_copy2);
   28.65 +    flush_area_local(to,FLUSH_CACHE|FLUSH_ORDER(0));
   28.66 +    flush_area_local(from,FLUSH_CACHE|FLUSH_ORDER(0));
   28.67 +    START_CYC_COUNTER(pg_copy3);
   28.68 +    memcpy(to, from, PAGE_SIZE);  /* cold cache */
   28.69 +    END_CYC_COUNTER(pg_copy3);
   28.70 +    START_CYC_COUNTER(pg_copy4);
   28.71 +    memcpy(to, from, PAGE_SIZE); /* hot cache */
   28.72 +    END_CYC_COUNTER(pg_copy4);
   28.73 +#else
   28.74 +    DECL_LOCAL_CYC_COUNTER(pg_copy);
   28.75 +    START_CYC_COUNTER(pg_copy);
   28.76 +    memcpy(to, from, PAGE_SIZE);
   28.77 +    END_CYC_COUNTER(pg_copy);
   28.78 +#endif
   28.79 +}
   28.80 +
   28.81 +#ifdef __ia64__
   28.82 +static inline void *cli_mfn_to_va(tmem_cli_mfn_t cmfn, unsigned long *pcli_mfn)
   28.83 +{
   28.84 +    ASSERT(0);
   28.85 +    return NULL;
   28.85 +}
   28.86 +#define paging_mark_dirty(_x,_y) do {} while(0)
   28.87 +#else
   28.88 +static inline void *cli_mfn_to_va(tmem_cli_mfn_t cmfn, unsigned long *pcli_mfn)
   28.89 +{
   28.90 +    unsigned long cli_mfn;
   28.91 +    p2m_type_t t;
   28.92 +
   28.94 +    if (is_pv_32on64_vcpu(current))
   28.95 +        cmfn.p = (void *)((unsigned long)cmfn.p & 0xffffffffUL);
   28.96 +    cli_mfn = mfn_x(gfn_to_mfn(current->domain,(unsigned long)cmfn.p,&t));
   28.97 +    if (t != p2m_ram_rw)
   28.98 +        return NULL;
   28.99 +    if (pcli_mfn != NULL)
  28.100 +        *pcli_mfn = cli_mfn;
  28.101 +    return map_domain_page(cli_mfn);
  28.102 +}
  28.103 +#endif
  28.104 +
  28.105 +EXPORT int tmh_copy_from_client(pfp_t *pfp,
  28.106 +    tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
  28.107 +    uint32_t pfn_offset, uint32_t len)
  28.108 +{
  28.109 +    unsigned long tmem_mfn;
  28.110 +    void *tmem_va, *cli_va = NULL;
  28.111 +
  28.112 +    ASSERT(pfp != NULL);
  28.113 +    if ( tmem_offset || pfn_offset || len )
  28.114 +        if ( (cli_va = cli_mfn_to_va(cmfn,NULL)) == NULL)
  28.115 +            return -EFAULT;
  28.116 +    tmem_mfn = page_to_mfn(pfp);
  28.117 +    tmem_va = map_domain_page(tmem_mfn);
  28.118 +    mb();
  28.119 +    if (!len && !tmem_offset && !pfn_offset)
  28.120 +        memset(tmem_va, 0, PAGE_SIZE);
  28.121 +    else if (len == PAGE_SIZE && !tmem_offset && !pfn_offset)
  28.122 +        tmh_copy_page(tmem_va, cli_va);
  28.123 +    else if ( (tmem_offset+len <= PAGE_SIZE) &&
  28.124 +                (pfn_offset+len <= PAGE_SIZE) ) 
  28.125 +        memcpy((char *)tmem_va+tmem_offset,(char *)cli_va+pfn_offset,len);
  28.126 +    if ( cli_va != NULL )
  28.126 +        unmap_domain_page(cli_va);
  28.127 +    unmap_domain_page(tmem_va);
  28.128 +    return 1;
  28.129 +}
  28.130 +
  28.131 +EXPORT int tmh_compress_from_client(tmem_cli_mfn_t cmfn,
  28.132 +    void **out_va, size_t *out_len)
  28.133 +{
  28.134 +    void *cli_va;
  28.135 +    int ret = 0;
  28.136 +    unsigned char *dmem = this_cpu(dstmem);
  28.137 +    unsigned char *wmem = this_cpu(workmem);
  28.138 +
  28.139 +    if ( dmem == NULL || wmem == NULL )
  28.140 +        return 0;  /* no buffer, so can't compress */
  28.141 +    if ( (cli_va = cli_mfn_to_va(cmfn,NULL)) == NULL)
  28.142 +        return -EFAULT;
  28.143 +    mb();
  28.144 +    ret = lzo1x_1_compress(cli_va, PAGE_SIZE, dmem, out_len, wmem);
  28.145 +    ASSERT(ret == LZO_E_OK);
  28.146 +    *out_va = dmem;
  28.147 +    unmap_domain_page(cli_va);
  28.148 +    return 1;
  28.149 +}
  28.150 +
  28.151 +EXPORT int tmh_copy_to_client(tmem_cli_mfn_t cmfn, pfp_t *pfp,
  28.152 +    uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len)
  28.153 +{
  28.154 +    unsigned long tmem_mfn, cli_mfn;
  28.155 +    void *tmem_va, *cli_va;
  28.156 +
  28.157 +    ASSERT(pfp != NULL);
  28.158 +    if ( (cli_va = cli_mfn_to_va(cmfn,&cli_mfn)) == NULL)
  28.159 +        return -EFAULT;
  28.160 +    tmem_mfn = page_to_mfn(pfp);
  28.161 +    tmem_va = map_domain_page(tmem_mfn);
  28.162 +    if (len == PAGE_SIZE && !tmem_offset && !pfn_offset)
  28.163 +        tmh_copy_page(cli_va, tmem_va);
  28.164 +    else if ( (tmem_offset+len <= PAGE_SIZE) && (pfn_offset+len <= PAGE_SIZE) )
  28.165 +        memcpy((char *)cli_va+pfn_offset,(char *)tmem_va+tmem_offset,len);
  28.166 +    unmap_domain_page(tmem_va);
  28.167 +    unmap_domain_page(cli_va);
  28.168 +    paging_mark_dirty(current->domain,cli_mfn);
  28.169 +    mb();
  28.170 +    return 1;
  28.171 +}
  28.172 +
  28.173 +EXPORT int tmh_decompress_to_client(tmem_cli_mfn_t cmfn, void *tmem_va, size_t size)
  28.174 +{
  28.175 +    unsigned long cli_mfn;
  28.176 +    void *cli_va;
  28.177 +    size_t out_len = PAGE_SIZE;
  28.178 +    int ret;
  28.179 +
  28.180 +    if ( (cli_va = cli_mfn_to_va(cmfn,&cli_mfn)) == NULL)
  28.181 +        return -EFAULT;
  28.182 +    ret = lzo1x_decompress_safe(tmem_va, size, cli_va, &out_len);
  28.183 +    ASSERT(ret == LZO_E_OK);
  28.184 +    ASSERT(out_len == PAGE_SIZE);
  28.185 +    unmap_domain_page(cli_va);
  28.186 +    paging_mark_dirty(current->domain,cli_mfn);
  28.187 +    mb();
  28.188 +    return 1;
  28.189 +}
  28.190 +
  28.191 +/******************  XEN-SPECIFIC MEMORY ALLOCATION ********************/
  28.192 +
  28.193 +EXPORT struct xmem_pool *tmh_mempool = 0;
  28.194 +EXPORT unsigned int tmh_mempool_maxalloc = 0;
  28.195 +
  28.196 +EXPORT DEFINE_SPINLOCK(tmh_page_list_lock);
  28.197 +EXPORT PAGE_LIST_HEAD(tmh_page_list);
  28.198 +EXPORT unsigned long tmh_page_list_pages = 0;
  28.199 +
  28.200 +/* free anything on tmh_page_list to Xen's scrub list */
  28.201 +EXPORT void tmh_release_avail_pages_to_host(void)
  28.202 +{
  28.203 +    spin_lock(&tmh_page_list_lock);
  28.204 +    if ( !page_list_empty(&tmh_page_list) )
  28.205 +    {
  28.206 +        scrub_list_splice(&tmh_page_list);
  28.207 +        INIT_PAGE_LIST_HEAD(&tmh_page_list);
  28.208 +    }
  28.209 +    spin_unlock(&tmh_page_list_lock);
  28.210 +}
  28.211 +
  28.212 +EXPORT void tmh_scrub_page(struct page_info *pi, unsigned int memflags)
  28.213 +{
  28.214 +    if ( pi == NULL )
  28.215 +        return;
  28.216 +    if ( !(memflags & MEMF_tmem) )
  28.217 +        scrub_one_page(pi);
  28.218 +}
  28.219 +
  28.220 +#ifndef __i386__
  28.221 +static noinline void *tmh_mempool_page_get(unsigned long size)
  28.222 +{
  28.223 +    struct page_info *pi;
  28.224 +
  28.225 +    ASSERT(size == PAGE_SIZE);
  28.226 +    if ( (pi = tmh_alloc_page(NULL,0)) == NULL )
  28.227 +        return NULL;
  28.228 +    ASSERT(IS_VALID_PAGE(pi));
  28.229 +    return page_to_virt(pi);
  28.230 +}
  28.231 +
  28.232 +static void tmh_mempool_page_put(void *page_va)
  28.233 +{
  28.234 +    ASSERT(IS_PAGE_ALIGNED(page_va));
  28.235 +    tmh_free_page(virt_to_page(page_va));
  28.236 +}
  28.237 +
  28.238 +static int tmh_mempool_init(void)
  28.239 +{
  28.240 +    tmh_mempool = xmem_pool_create("tmem", tmh_mempool_page_get,
  28.241 +        tmh_mempool_page_put, PAGE_SIZE, 0, PAGE_SIZE);
  28.242 +    if ( tmh_mempool )
  28.243 +        tmh_mempool_maxalloc = xmem_pool_maxalloc(tmh_mempool);
  28.244 +    return tmh_mempool != NULL;
  28.245 +}
  28.246 +
  28.247 +/* persistent pools are per-domain */
  28.248 +
  28.249 +static void *tmh_persistent_pool_page_get(unsigned long size)
  28.250 +{
  28.251 +    struct page_info *pi;
  28.252 +    struct domain *d = current->domain;
  28.253 +
  28.254 +    ASSERT(size == PAGE_SIZE);
  28.255 +    if ( (pi = _tmh_alloc_page_thispool(d)) == NULL )
  28.256 +        return NULL;
  28.257 +    ASSERT(IS_VALID_PAGE(pi));
  28.258 +    return map_domain_page(page_to_mfn(pi));
  28.259 +}
  28.260 +
  28.261 +static void tmh_persistent_pool_page_put(void *page_va)
  28.262 +{
  28.263 +    struct page_info *pi;
  28.264 +
  28.265 +    ASSERT(IS_PAGE_ALIGNED(page_va));
  28.266 +    pi = virt_to_page(page_va);
  28.267 +    ASSERT(IS_VALID_PAGE(pi));
  28.268 +    _tmh_free_page_thispool(pi);
  28.269 +}
  28.270 +#endif
  28.271 +
  28.272 +/******************  XEN-SPECIFIC CLIENT HANDLING ********************/
  28.273 +
  28.274 +EXPORT tmh_client_t *tmh_client_init(void)
  28.275 +{
  28.276 +    tmh_client_t *tmh;
  28.277 +    char name[5];
  28.278 +    domid_t domid = current->domain->domain_id;
  28.279 +    int i, shift;
  28.280 +
  28.281 +    if ( (tmh = xmalloc(tmh_client_t)) == NULL )
  28.282 +        return NULL;
  28.283 +    for (i = 0, shift = 12; i < 4; shift -= 4, i++)
  28.284 +        name[i] = "0123456789abcdef"[((unsigned short)domid >> shift) & 0xf];
  28.285 +    name[4] = '\0';
  28.286 +#ifndef __i386__
  28.287 +    tmh->persistent_pool = xmem_pool_create(name, tmh_persistent_pool_page_get,
  28.288 +        tmh_persistent_pool_page_put, PAGE_SIZE, 0, PAGE_SIZE);
  28.289 +    if ( tmh->persistent_pool == NULL )
  28.290 +    {
  28.291 +        xfree(tmh);
  28.292 +        return NULL;
  28.293 +    }
  28.294 +#endif
  28.295 +    tmh->domain = current->domain;
  28.296 +    return tmh;
  28.297 +}
  28.298 +
  28.299 +EXPORT void tmh_client_destroy(tmh_client_t *tmh)
  28.300 +{
  28.301 +#ifndef __i386__
  28.302 +    xmem_pool_destroy(tmh->persistent_pool);
  28.303 +#endif
  28.304 +    xfree(tmh);
  28.305 +}
  28.306 +
  28.307 +/******************  XEN-SPECIFIC HOST INITIALIZATION ********************/
  28.308 +
  28.309 +EXPORT int tmh_init(void)
  28.310 +{
  28.311 +#ifndef __i386__
  28.312 +    int dstmem_order, workmem_order;
  28.313 +    int bad_alloc = 0;
  28.314 +    struct page_info *pi;
  28.315 +    unsigned char *p1, *p2;
  28.316 +    int cpu;
  28.317 +
  28.318 +    if ( !tmh_mempool_init() )
  28.319 +        return 0;
  28.320 +
  28.321 +    dstmem_order = get_order_from_pages(LZO_DSTMEM_PAGES);
  28.322 +    workmem_order = get_order_from_bytes(LZO1X_1_MEM_COMPRESS);
  28.323 +    for_each_cpu ( cpu )
  28.324 +    {
  28.325 +        pi = alloc_domheap_pages(0,dstmem_order,0);
  28.326 +        per_cpu(dstmem, cpu) = p1 = ((pi == NULL) ? NULL : page_to_virt(pi));
  28.327 +        pi = alloc_domheap_pages(0,workmem_order,0);
  28.328 +        per_cpu(workmem, cpu) = p2 = ((pi == NULL) ? NULL : page_to_virt(pi));
  28.329 +        if ( (p1 == NULL) || (p2 == NULL) )
  28.330 +            bad_alloc++;
  28.331 +    }
  28.332 +    if ( bad_alloc )
  28.333 +        printk("tmem: can't allocate compression buffers for %d cpus\n",
  28.334 +               bad_alloc);
  28.335 +#endif
  28.336 +    return 1;
  28.337 +}
    29.1 --- a/xen/common/xmalloc_tlsf.c	Tue May 26 10:14:34 2009 +0100
    29.2 +++ b/xen/common/xmalloc_tlsf.c	Tue May 26 11:05:04 2009 +0100
    29.3 @@ -292,7 +292,6 @@ struct xmem_pool *xmem_pool_create(
    29.4      unsigned long grow_size)
    29.5  {
    29.6      struct xmem_pool *pool;
    29.7 -    void *region;
    29.8      int pool_bytes, pool_order;
    29.9  
   29.10      BUG_ON(max_size && (max_size < init_size));
   29.11 @@ -319,11 +318,9 @@ struct xmem_pool *xmem_pool_create(
   29.12      pool->get_mem = get_mem;
   29.13      pool->put_mem = put_mem;
   29.14      strlcpy(pool->name, name, sizeof(pool->name));
   29.15 -    region = get_mem(init_size);
   29.16 -    if ( region == NULL )
   29.17 -        goto out_region;
   29.18 -    ADD_REGION(region, init_size, pool);
   29.19 -    pool->init_region = region;
   29.20 +
   29.21 +    /* always obtain init_region lazily now to ensure it is get_mem'd
   29.22 +     * in the same "context" as all other regions */
   29.23  
   29.24      spin_lock_init(&pool->lock);
   29.25  
   29.26 @@ -332,10 +329,6 @@ struct xmem_pool *xmem_pool_create(
   29.27      spin_unlock(&pool_list_lock);
   29.28  
   29.29      return pool;
   29.30 -
   29.31 - out_region:
   29.32 -    free_xenheap_pages(pool, pool_order);
   29.33 -    return NULL;
   29.34  }
   29.35  
   29.36  unsigned long xmem_pool_get_used_size(struct xmem_pool *pool)
   29.37 @@ -354,13 +347,15 @@ unsigned long xmem_pool_get_total_size(s
   29.38  
   29.39  void xmem_pool_destroy(struct xmem_pool *pool) 
   29.40  {
   29.41 +    int pool_bytes, pool_order;
   29.42 +
   29.43      if ( pool == NULL )
   29.44          return;
   29.45  
   29.46      /* User is destroying without ever allocating from this pool */
   29.47      if ( xmem_pool_get_used_size(pool) == BHDR_OVERHEAD )
   29.48      {
   29.49 -        pool->put_mem(pool->init_region);
   29.50 +        ASSERT(!pool->init_region);
   29.51          pool->used_size -= BHDR_OVERHEAD;
   29.52      }
   29.53  
   29.54 @@ -373,7 +368,10 @@ void xmem_pool_destroy(struct xmem_pool 
   29.55      spin_lock(&pool_list_lock);
   29.56      list_del_init(&pool->list);
   29.57      spin_unlock(&pool_list_lock);
   29.58 -    pool->put_mem(pool);
   29.59 +
   29.60 +    pool_bytes = ROUNDUP_SIZE(sizeof(*pool));
   29.61 +    pool_order = get_order_from_bytes(pool_bytes);
   29.62 +    free_xenheap_pages(pool,pool_order);
   29.63  }
   29.64  
   29.65  void *xmem_pool_alloc(unsigned long size, struct xmem_pool *pool)
   29.66 @@ -382,6 +380,14 @@ void *xmem_pool_alloc(unsigned long size
   29.67      int fl, sl;
   29.68      unsigned long tmp_size;
   29.69  
   29.70 +    if ( pool->init_region == NULL )
   29.71 +    {
   29.72 +        if ( (region = pool->get_mem(pool->init_size)) == NULL )
   29.73 +            goto out;
   29.74 +        ADD_REGION(region, pool->init_size, pool);
   29.75 +        pool->init_region = region;
   29.76 +    }
   29.77 +
   29.78      size = (size < MIN_BLOCK_SIZE) ? MIN_BLOCK_SIZE : ROUNDUP_SIZE(size);
   29.79      /* Rounding up the requested size and calculating fl and sl */
   29.80  
   29.81 @@ -496,6 +502,11 @@ void xmem_pool_free(void *ptr, struct xm
   29.82      spin_unlock(&pool->lock);
   29.83  }
   29.84  
   29.85 +int xmem_pool_maxalloc(struct xmem_pool *pool)
   29.86 +{
   29.87 +    return pool->grow_size - (2 * BHDR_OVERHEAD);
   29.88 +}
   29.89 +
   29.90  /*
   29.91   * Glue for xmalloc().
   29.92   */
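
The effect of the lazy-init change above: xmem_pool_create() no longer
calls get_mem() at all, so a pool's first region is fetched by the first
xmem_pool_alloc() and therefore in the allocating caller's context (which
matters for tmem's per-domain persistent pools, whose get_mem callback
charges pages to current->domain). A minimal sketch of driving the API
(callbacks illustrative, not the tmem ones):

    static void *demo_get(unsigned long size) { return alloc_xenheap_page(); }
    static void demo_put(void *p) { free_xenheap_page(p); }

    struct xmem_pool *pool = xmem_pool_create("demo", demo_get, demo_put,
                                              PAGE_SIZE, 0, PAGE_SIZE);
    void *obj = xmem_pool_alloc(64, pool);  /* first alloc pulls in init_region */

    xmem_pool_free(obj, pool);
    xmem_pool_destroy(pool);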
    30.1 --- a/xen/include/Makefile	Tue May 26 10:14:34 2009 +0100
    30.2 +++ b/xen/include/Makefile	Tue May 26 11:05:04 2009 +0100
    30.3 @@ -14,6 +14,7 @@ headers-y := \
    30.4      compat/physdev.h \
    30.5      compat/platform.h \
    30.6      compat/sched.h \
    30.7 +    compat/tmem.h \
    30.8      compat/trace.h \
    30.9      compat/vcpu.h \
   30.10      compat/version.h \
    31.1 --- a/xen/include/asm-ia64/mm.h	Tue May 26 10:14:34 2009 +0100
    31.2 +++ b/xen/include/asm-ia64/mm.h	Tue May 26 11:05:04 2009 +0100
    31.3 @@ -590,6 +590,8 @@ long arch_memory_op(int op, XEN_GUEST_HA
    31.4  
    31.5  int steal_page(
    31.6      struct domain *d, struct page_info *page, unsigned int memflags);
    31.7 +int donate_page(
    31.8 +    struct domain *d, struct page_info *page, unsigned int memflags);
    31.9  
   31.10  #define domain_clamp_alloc_bitsize(d, b) (b)
   31.11  
    32.1 --- a/xen/include/asm-x86/mm.h	Tue May 26 10:14:34 2009 +0100
    32.2 +++ b/xen/include/asm-x86/mm.h	Tue May 26 11:05:04 2009 +0100
    32.3 @@ -487,6 +487,8 @@ int compat_subarch_memory_op(int op, XEN
    32.4  
    32.5  int steal_page(
    32.6      struct domain *d, struct page_info *page, unsigned int memflags);
    32.7 +int donate_page(
    32.8 +    struct domain *d, struct page_info *page, unsigned int memflags);
    32.9  
   32.10  int map_ldt_shadow_page(unsigned int);
   32.11  
    33.1 --- a/xen/include/asm-x86/spinlock.h	Tue May 26 10:14:34 2009 +0100
    33.2 +++ b/xen/include/asm-x86/spinlock.h	Tue May 26 11:05:04 2009 +0100
    33.3 @@ -32,10 +32,10 @@ static always_inline int _raw_spin_trylo
    33.4  }
    33.5  
    33.6  typedef struct {
    33.7 -    volatile unsigned int lock;
    33.8 +    volatile int lock;
    33.9  } raw_rwlock_t;
   33.10  
   33.11 -#define RW_LOCK_BIAS		 0x01000000
   33.12 +#define RW_LOCK_BIAS 0x01000000
   33.13  #define _RAW_RW_LOCK_UNLOCKED /*(raw_rwlock_t)*/ { RW_LOCK_BIAS }
   33.14  
   33.15  static always_inline void _raw_read_lock(raw_rwlock_t *rw)
   33.16 @@ -66,6 +66,22 @@ static always_inline void _raw_write_loc
   33.17          : "=m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory" );
   33.18  }
   33.19  
   33.20 +static always_inline int _raw_write_trylock(raw_rwlock_t *rw)
   33.21 +{
   33.22 +    int rc;
   33.23 +
   33.24 +    asm volatile (
   33.25 +        "    lock; subl %2,%0      \n"
   33.26 +        "    jz 1f                 \n"
   33.27 +        "    lock; addl %2,%0      \n"
   33.28 +        "    dec %1                \n"
   33.29 +        "1:"
   33.30 +        : "=m" (rw->lock), "=r" (rc) : "i" (RW_LOCK_BIAS), "1" (1)
   33.31 +        : "memory" );
   33.32 +
   33.33 +    return rc;
   33.34 +}
   33.35 +
   33.36  static always_inline void _raw_read_unlock(raw_rwlock_t *rw)
   33.37  {
   33.38      asm volatile (
   33.39 @@ -81,5 +97,6 @@ static always_inline void _raw_write_unl
   33.40  }
   33.41  
   33.42  #define _raw_rw_is_locked(x) ((x)->lock < RW_LOCK_BIAS)
   33.43 +#define _raw_rw_is_write_locked(x) ((x)->lock <= 0)
   33.44  
   33.45  #endif /* __ASM_SPINLOCK_H */
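
For reference, the bias encoding these primitives share, with concrete
values:

    lock == RW_LOCK_BIAS (0x01000000)   unlocked
    lock == RW_LOCK_BIAS - N            held by N readers (still > 0)
    lock <= 0                           held by a writer (full bias subtracted)

The new _raw_write_trylock() subtracts the whole bias, succeeds only if the
result is exactly zero, and otherwise adds the bias back and reports
failure; _raw_rw_is_write_locked() reads straight off the same encoding.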
    34.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    34.2 +++ b/xen/include/public/tmem.h	Tue May 26 11:05:04 2009 +0100
    34.3 @@ -0,0 +1,112 @@
    34.4 +/******************************************************************************
    34.5 + * tmem.h
    34.6 + * 
    34.7 + * Guest OS interface to Xen Transcendent Memory.
    34.8 + * 
    34.9 + * Permission is hereby granted, free of charge, to any person obtaining a copy
   34.10 + * of this software and associated documentation files (the "Software"), to
   34.11 + * deal in the Software without restriction, including without limitation the
   34.12 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
   34.13 + * sell copies of the Software, and to permit persons to whom the Software is
   34.14 + * furnished to do so, subject to the following conditions:
   34.15 + *
   34.16 + * The above copyright notice and this permission notice shall be included in
   34.17 + * all copies or substantial portions of the Software.
   34.18 + *
   34.19 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   34.20 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   34.21 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
   34.22 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   34.23 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
   34.24 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
   34.25 + * DEALINGS IN THE SOFTWARE.
   34.26 + *
   34.27 + * Copyright (c) 2004, K A Fraser
   34.28 + */
   34.29 +
   34.30 +#ifndef __XEN_PUBLIC_TMEM_H__
   34.31 +#define __XEN_PUBLIC_TMEM_H__
   34.32 +
   34.33 +#include "xen.h"
   34.34 +
   34.35 +/* Commands to HYPERVISOR_tmem_op() */
   34.36 +#define TMEM_CONTROL               0
   34.37 +#define TMEM_NEW_POOL              1
   34.38 +#define TMEM_DESTROY_POOL          2
   34.39 +#define TMEM_NEW_PAGE              3
   34.40 +#define TMEM_PUT_PAGE              4
   34.41 +#define TMEM_GET_PAGE              5
   34.42 +#define TMEM_FLUSH_PAGE            6
   34.43 +#define TMEM_FLUSH_OBJECT          7
   34.44 +#define TMEM_READ                  8
   34.45 +#define TMEM_WRITE                 9
   34.46 +#define TMEM_XCHG                 10
   34.47 +
   34.48 +/* Subops for HYPERVISOR_tmem_op(TMEM_CONTROL) */
   34.49 +#define TMEMC_THAW                 0
   34.50 +#define TMEMC_FREEZE               1
   34.51 +#define TMEMC_FLUSH                2
   34.52 +#define TMEMC_DESTROY              3
   34.53 +#define TMEMC_LIST                 4
   34.54 +#define TMEMC_SET_WEIGHT           5
   34.55 +#define TMEMC_SET_CAP              6
   34.56 +#define TMEMC_SET_COMPRESS         7
   34.57 +
   34.58 +/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */
   34.59 +#define TMEM_POOL_PERSIST          1
   34.60 +#define TMEM_POOL_SHARED           2
   34.61 +#define TMEM_POOL_PAGESIZE_SHIFT   4
   34.62 +#define TMEM_POOL_PAGESIZE_MASK  0xf
   34.63 +#define TMEM_POOL_VERSION_SHIFT   24
   34.64 +#define TMEM_POOL_VERSION_MASK  0xff
   34.65 +
   34.66 +/* Special errno values */
   34.67 +#define EFROZEN                 1000
   34.68 +#define EEMPTY                  1001
   34.69 +
   34.70 +
   34.71 +#ifndef __ASSEMBLY__
   34.72 +typedef XEN_GUEST_HANDLE(void) tmem_cli_mfn_t;
   34.73 +typedef XEN_GUEST_HANDLE(char) tmem_cli_va_t;
   34.74 +struct tmem_op {
   34.75 +    uint32_t cmd;
   34.76 +    int32_t pool_id; /* private > 0; shared < 0; 0 is invalid */
   34.77 +    union {
   34.78 +        struct {  /* for cmd == TMEM_NEW_POOL */
   34.79 +            uint64_t uuid[2];
   34.80 +            uint32_t flags;
   34.81 +        };
   34.82 +        struct {  /* for cmd == TMEM_CONTROL */
   34.83 +            uint32_t subop;
   34.84 +            uint32_t cli_id;
   34.85 +            uint32_t arg1;
   34.86 +            uint32_t arg2;
   34.87 +            tmem_cli_va_t buf;
   34.88 +        };
   34.89 +        struct {
   34.90 +            uint64_t object;
   34.91 +            uint32_t index;
   34.92 +            uint32_t tmem_offset;
   34.93 +            uint32_t pfn_offset;
   34.94 +            uint32_t len;
   34.95 +            tmem_cli_mfn_t cmfn; /* client machine page frame */
   34.96 +        };
   34.97 +    };
   34.98 +};
   34.99 +typedef struct tmem_op tmem_op_t;
  34.100 +DEFINE_XEN_GUEST_HANDLE(tmem_op_t);
  34.101 +typedef XEN_GUEST_HANDLE_64(tmem_op_t) tmem_cli_op_t;
  34.102 +
  34.103 +#endif
  34.104 +
  34.105 +#endif /* __XEN_PUBLIC_TMEM_H__ */
  34.106 +
  34.107 +/*
  34.108 + * Local variables:
  34.109 + * mode: C
  34.110 + * c-set-style: "BSD"
  34.111 + * c-basic-offset: 4
  34.112 + * tab-width: 4
  34.113 + * indent-tabs-mode: nil
  34.114 + * End:
  34.115 + */
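
A sketch of how a guest might populate struct tmem_op for the two common
cases (values illustrative; HYPERVISOR_tmem_op stands for whatever
hypercall wrapper the guest provides, and the pagesize field is assumed to
encode an order, 0 meaning the default page size):

    struct tmem_op op;
    int32_t pool_id;

    /* create a private, persistent pool */
    op.cmd = TMEM_NEW_POOL;
    op.uuid[0] = 0;
    op.uuid[1] = 0;
    op.flags = TMEM_POOL_PERSIST;        /* no TMEM_POOL_SHARED */
    pool_id = HYPERVISOR_tmem_op(&op);

    /* put a page into that pool */
    op.cmd = TMEM_PUT_PAGE;
    op.pool_id = pool_id;
    op.object = 1;                       /* e.g. an inode number */
    op.index = 0;                        /* page offset within the object */
    /* op.cmfn = <guest frame holding the data>, via set_xen_guest_handle */
    (void)HYPERVISOR_tmem_op(&op);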
    35.1 --- a/xen/include/public/xen.h	Tue May 26 10:14:34 2009 +0100
    35.2 +++ b/xen/include/public/xen.h	Tue May 26 11:05:04 2009 +0100
    35.3 @@ -91,6 +91,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
    35.4  #define __HYPERVISOR_sysctl               35
    35.5  #define __HYPERVISOR_domctl               36
    35.6  #define __HYPERVISOR_kexec_op             37
    35.7 +#define __HYPERVISOR_tmem_op              38
    35.8  
    35.9  /* Architecture-specific hypercall definitions. */
   35.10  #define __HYPERVISOR_arch_0               48
    36.1 --- a/xen/include/xen/config.h	Tue May 26 10:14:34 2009 +0100
    36.2 +++ b/xen/include/xen/config.h	Tue May 26 11:05:04 2009 +0100
    36.3 @@ -11,7 +11,6 @@
    36.4  
    36.5  #define EXPORT_SYMBOL(var)
    36.6  #define EXPORT_SYMBOL_GPL(var)
    36.7 -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]) + __must_be_array(x))
    36.8  
    36.9  /*
   36.10   * The following log levels are as follows:
    37.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    37.2 +++ b/xen/include/xen/hash.h	Tue May 26 11:05:04 2009 +0100
    37.3 @@ -0,0 +1,58 @@
    37.4 +#ifndef _XEN_HASH_H
    37.5 +#define _XEN_HASH_H
    37.6 +/* Fast hashing routine for a long.
    37.7 +   (C) 2002 William Lee Irwin III, IBM */
    37.8 +
    37.9 +/*
   37.10 + * Knuth recommends primes in approximately golden ratio to the maximum
   37.11 + * integer representable by a machine word for multiplicative hashing.
   37.12 + * Chuck Lever verified the effectiveness of this technique:
   37.13 + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
   37.14 + *
   37.15 + * These primes are chosen to be bit-sparse, that is operations on
   37.16 + * them can use shifts and additions instead of multiplications for
   37.17 + * machines where multiplications are slow.
   37.18 + */
   37.19 +#if BITS_PER_LONG == 32
   37.20 +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
   37.21 +#define GOLDEN_RATIO_PRIME 0x9e370001UL
   37.22 +#elif BITS_PER_LONG == 64
   37.23 +/*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
   37.24 +#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
   37.25 +#else
   37.26 +#error Define GOLDEN_RATIO_PRIME for your wordsize.
   37.27 +#endif
   37.28 +
   37.29 +static inline unsigned long hash_long(unsigned long val, unsigned int bits)
   37.30 +{
   37.31 +    unsigned long hash = val;
   37.32 +
   37.33 +#if BITS_PER_LONG == 64
   37.34 +    /*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
   37.35 +    unsigned long n = hash;
   37.36 +    n <<= 18;
   37.37 +    hash -= n;
   37.38 +    n <<= 33;
   37.39 +    hash -= n;
   37.40 +    n <<= 3;
   37.41 +    hash += n;
   37.42 +    n <<= 3;
   37.43 +    hash -= n;
   37.44 +    n <<= 4;
   37.45 +    hash += n;
   37.46 +    n <<= 2;
   37.47 +    hash += n;
   37.48 +#else
   37.49 +    /* On some cpus multiply is faster, on others gcc will do shifts */
   37.50 +    hash *= GOLDEN_RATIO_PRIME;
   37.51 +#endif
   37.52 +
   37.53 +    /* High bits are more random, so use them. */
   37.54 +    return hash >> (BITS_PER_LONG - bits);
   37.55 +}
   37.56 + 
   37.57 +static inline unsigned long hash_ptr(void *ptr, unsigned int bits)
   37.58 +{
   37.59 +    return hash_long((unsigned long)ptr, bits);
   37.60 +}
   37.61 +#endif /* _XEN_HASH_H */
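
A quick usage note: the bits argument selects a power-of-two table size,
so indexing a 256-bucket (2^8) hash table is simply:

    unsigned long bucket = hash_ptr(obj, 8);    /* result in 0 .. 255 */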
    38.1 --- a/xen/include/xen/hypercall.h	Tue May 26 10:14:34 2009 +0100
    38.2 +++ b/xen/include/xen/hypercall.h	Tue May 26 11:05:04 2009 +0100
    38.3 @@ -13,6 +13,7 @@
    38.4  #include <public/sysctl.h>
    38.5  #include <public/platform.h>
    38.6  #include <public/event_channel.h>
    38.7 +#include <public/tmem.h>
    38.8  #include <asm/hypercall.h>
    38.9  #include <xsm/xsm.h>
   38.10  
   38.11 @@ -116,6 +117,10 @@ extern long
   38.12  do_xsm_op(
   38.13      XEN_GUEST_HANDLE(xsm_op_t) u_xsm_op);
   38.14  
   38.15 +extern long
   38.16 +do_tmem_op(
   38.17 +    XEN_GUEST_HANDLE(tmem_op_t) uops);
   38.18 +
   38.19  #ifdef CONFIG_COMPAT
   38.20  
   38.21  extern int
    39.1 --- a/xen/include/xen/lib.h	Tue May 26 10:14:34 2009 +0100
    39.2 +++ b/xen/include/xen/lib.h	Tue May 26 11:05:04 2009 +0100
    39.3 @@ -45,6 +45,8 @@ do {                                    
    39.4  
    39.5  #define DIV_ROUND(x, y) (((x) + (y) / 2) / (y))
    39.6  
    39.7 +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]) + __must_be_array(x))
    39.8 +
    39.9  #define reserve_bootmem(_p,_l) ((void)0)
   39.10  
   39.11  struct domain;
    40.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    40.2 +++ b/xen/include/xen/lzo.h	Tue May 26 11:05:04 2009 +0100
    40.3 @@ -0,0 +1,44 @@
    40.4 +#ifndef __LZO_H__
    40.5 +#define __LZO_H__
    40.6 +/*
    40.7 + *  LZO Public Kernel Interface
    40.8 + *  A mini subset of the LZO real-time data compression library
    40.9 + *
   40.10 + *  Copyright (C) 1996-2005 Markus F.X.J. Oberhumer <markus@oberhumer.com>
   40.11 + *
   40.12 + *  The full LZO package can be found at:
   40.13 + *  http://www.oberhumer.com/opensource/lzo/
   40.14 + *
   40.15 + *  Changed for kernel use by:
   40.16 + *  Nitin Gupta <nitingupta910@gmail.com>
   40.17 + *  Richard Purdie <rpurdie@openedhand.com>
   40.18 + */
   40.19 +
   40.20 +#define LZO1X_MEM_COMPRESS (16384 * sizeof(unsigned char *))
   40.21 +#define LZO1X_1_MEM_COMPRESS LZO1X_MEM_COMPRESS
   40.22 +
   40.23 +#define lzo1x_worst_compress(x) ((x) + ((x) / 16) + 64 + 3)
   40.24 +
   40.25 +/* This requires 'wrkmem' of size LZO1X_1_MEM_COMPRESS */
   40.26 +int lzo1x_1_compress(const unsigned char *src, size_t src_len,
   40.27 +                     unsigned char *dst, size_t *dst_len, void *wrkmem);
   40.28 +
   40.29 +/* safe decompression with overrun testing */
   40.30 +int lzo1x_decompress_safe(const unsigned char *src, size_t src_len,
   40.31 +                          unsigned char *dst, size_t *dst_len);
   40.32 +
   40.33 +/*
   40.34 + * Return values (< 0 = Error)
   40.35 + */
   40.36 +#define LZO_E_OK                  0
   40.37 +#define LZO_E_ERROR               (-1)
   40.38 +#define LZO_E_OUT_OF_MEMORY       (-2)
   40.39 +#define LZO_E_NOT_COMPRESSIBLE    (-3)
   40.40 +#define LZO_E_INPUT_OVERRUN       (-4)
   40.41 +#define LZO_E_OUTPUT_OVERRUN      (-5)
   40.42 +#define LZO_E_LOOKBEHIND_OVERRUN  (-6)
   40.43 +#define LZO_E_EOF_NOT_FOUND       (-7)
   40.44 +#define LZO_E_INPUT_NOT_CONSUMED  (-8)
   40.45 +#define LZO_E_NOT_YET_IMPLEMENTED (-9)
   40.46 +
   40.47 +#endif
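
A minimal sketch of the compress/decompress round trip this interface
supports. Note that LZO1X_1_MEM_COMPRESS is 128KiB on a 64-bit build, far
too large for a hypervisor stack, so real callers preallocate it (tmem
keeps per-cpu buffers); the static buffers here are purely illustrative:

    static unsigned char wrkmem[LZO1X_1_MEM_COMPRESS];
    static unsigned char dst[lzo1x_worst_compress(PAGE_SIZE)];
    static unsigned char out[PAGE_SIZE];

    size_t dst_len = sizeof(dst), out_len = sizeof(out);

    if ( lzo1x_1_compress(src, PAGE_SIZE, dst, &dst_len, wrkmem) == LZO_E_OK &&
         lzo1x_decompress_safe(dst, dst_len, out, &out_len) == LZO_E_OK )
        ASSERT(out_len == PAGE_SIZE);   /* a page-sized input round-trips */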
    41.1 --- a/xen/include/xen/mm.h	Tue May 26 10:14:34 2009 +0100
    41.2 +++ b/xen/include/xen/mm.h	Tue May 26 11:05:04 2009 +0100
    41.3 @@ -77,6 +77,8 @@ int assign_pages(
    41.4  #define  MEMF_no_refcount (1U<<_MEMF_no_refcount)
    41.5  #define _MEMF_populate_on_demand 1
    41.6  #define  MEMF_populate_on_demand (1U<<_MEMF_populate_on_demand)
    41.7 +#define _MEMF_tmem        2
    41.8 +#define  MEMF_tmem        (1U<<_MEMF_tmem)
    41.9  #define _MEMF_node        8
   41.10  #define  MEMF_node(n)     ((((n)+1)&0xff)<<_MEMF_node)
   41.11  #define _MEMF_bits        24
   41.12 @@ -222,6 +224,32 @@ page_list_remove_head(struct page_list_h
   41.13      return page;
   41.14  }
   41.15  
   41.16 +static inline void
   41.17 +page_list_splice(struct page_list_head *list, struct page_list_head *head)
   41.18 +{
   41.19 +    struct page_info *first, *last, *at;
   41.20 +
   41.21 +    if ( page_list_empty(list) )
   41.22 +        return;
   41.23 +
   41.24 +    if ( page_list_empty(head) )
   41.25 +    {
   41.26 +        head->next = list->next;
   41.27 +        head->tail = list->tail;
   41.28 +        return;
   41.29 +    }
   41.30 +
   41.31 +    first = list->next;
   41.32 +    last = list->tail;
   41.33 +    at = head->next;
   41.34 +
   41.35 +    first->list.prev = page_to_mfn(head->next);
   41.36 +    head->next = first;
   41.37 +
   41.38 +    last->list.next = page_to_mfn(at);
   41.39 +    at->list.prev = page_to_mfn(last);
   41.40 +}
   41.41 +
   41.42  #define page_list_for_each(pos, head) \
   41.43      for ( pos = (head)->next; pos; pos = page_list_next(pos, head) )
   41.44  #define page_list_for_each_safe(pos, tmp, head) \
   41.45 @@ -258,6 +286,7 @@ page_list_remove_head(struct page_list_h
   41.46      list_for_each_entry_safe(pos, tmp, head, list)
   41.47  # define page_list_for_each_safe_reverse(pos, tmp, head) \
   41.48      list_for_each_entry_safe_reverse(pos, tmp, head, list)
   41.49 +# define page_list_splice(list, hd)        list_splice(list, hd)
   41.50  #endif
   41.51  
   41.52  /* Automatic page scrubbing for dead domains. */
   41.53 @@ -272,6 +301,9 @@ extern struct page_list_head page_scrub_
   41.54          if ( !page_list_empty(&page_scrub_list) )                       \
   41.55              cpumask_raise_softirq(cpu_online_map, PAGE_SCRUB_SOFTIRQ);  \
   41.56      } while ( 0 )
   41.57 +void scrub_list_splice(struct page_list_head *);
   41.58 +void scrub_list_add(struct page_info *);
   41.59 +void scrub_one_page(struct page_info *);
   41.60  unsigned long avail_scrub_pages(void);
   41.61  
   41.62  int guest_remove_page(struct domain *d, unsigned long gmfn);
    42.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    42.2 +++ b/xen/include/xen/radix-tree.h	Tue May 26 11:05:04 2009 +0100
    42.3 @@ -0,0 +1,78 @@
    42.4 +/*
    42.5 + * Copyright (C) 2001 Momchil Velikov
    42.6 + * Portions Copyright (C) 2001 Christoph Hellwig
    42.7 + * Adapted for Xen by Dan Magenheimer, Oracle Corp.
    42.8 + *
    42.9 + * This program is free software; you can redistribute it and/or
   42.10 + * modify it under the terms of the GNU General Public License as
   42.11 + * published by the Free Software Foundation; either version 2, or (at
   42.12 + * your option) any later version.
   42.13 + * 
   42.14 + * This program is distributed in the hope that it will be useful, but
   42.15 + * WITHOUT ANY WARRANTY; without even the implied warranty of
   42.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   42.17 + * General Public License for more details.
   42.18 + * 
   42.19 + * You should have received a copy of the GNU General Public License
   42.20 + * along with this program; if not, write to the Free Software
   42.21 + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
   42.22 + */
   42.23 +#ifndef _XEN_RADIX_TREE_H
   42.24 +#define _XEN_RADIX_TREE_H
   42.25 +
    42.26 +/* Root of a radix tree (the Linux tag/gfp_mask machinery is not carried over in this port) */
   42.27 +struct radix_tree_root {
   42.28 +    unsigned int height;
   42.29 +    struct radix_tree_node *rnode;
   42.30 +};
   42.31 +
   42.32 +#define RADIX_TREE_MAP_SHIFT 6
   42.33 +
   42.34 +#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT)
   42.35 +#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1)
   42.36 +
   42.37 +#define RADIX_TREE_TAG_LONGS \
   42.38 + ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
   42.39 +
   42.40 +struct radix_tree_node {
   42.41 +    unsigned int count;
   42.42 +    void  *slots[RADIX_TREE_MAP_SIZE];
   42.43 +};
   42.44 +
   42.45 +struct radix_tree_path {
   42.46 +    struct radix_tree_node *node;
   42.47 +    int offset;
   42.48 +};
   42.49 +
   42.50 +#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
   42.51 +#define RADIX_TREE_MAX_PATH (RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2)
   42.52 +
   42.53 +
   42.54 +#define RADIX_TREE_INIT(mask) {     \
   42.55 + .height = 0,       \
   42.56 + .rnode = NULL,       \
   42.57 +}
   42.58 +
   42.59 +#define RADIX_TREE(name, mask) \
   42.60 + struct radix_tree_root name = RADIX_TREE_INIT(mask)
   42.61 +
   42.62 +#define INIT_RADIX_TREE(root, mask)     \
   42.63 +do {         \
   42.64 + (root)->height = 0;      \
   42.65 + (root)->rnode = NULL;      \
   42.66 +} while (0)
   42.67 +
   42.68 +int radix_tree_insert(struct radix_tree_root *root, unsigned long index,
   42.69 +                      void *item, struct radix_tree_node *(*node_alloc)(void *), void *arg);
   42.70 +void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
   42.71 +void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
   42.72 +void radix_tree_destroy(struct radix_tree_root *root,
   42.73 +                        void (*slot_free)(void *), void (*node_free)(struct radix_tree_node *));
   42.74 +void *radix_tree_delete(struct radix_tree_root *root, unsigned long index,
   42.75 +                        void(*node_free)(struct radix_tree_node *));
   42.76 +unsigned int
   42.77 +radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
   42.78 +                       unsigned long first_index, unsigned int max_items);
   42.79 +void radix_tree_init(void);
   42.80 +
   42.81 +#endif /* _XEN_RADIX_TREE_H */
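
A hedged usage sketch for the API above. Only the prototypes are from this patch; the xmalloc-based node callbacks and the example() driver are illustrative assumptions (including the Linux-style convention that radix_tree_insert() returns 0 on success):

    static struct radix_tree_node *my_node_alloc(void *arg)
    {
        struct radix_tree_node *n = xmalloc(struct radix_tree_node);

        if ( n != NULL )
            memset(n, 0, sizeof(*n));   /* count = 0, all slots NULL */
        return n;
    }

    static void my_node_free(struct radix_tree_node *n)
    {
        xfree(n);
    }

    static void example(void *item)
    {
        struct radix_tree_root root;

        INIT_RADIX_TREE(&root, 0);
        if ( radix_tree_insert(&root, 42UL, item, my_node_alloc, NULL) == 0 )
        {
            ASSERT(radix_tree_lookup(&root, 42UL) == item);
            radix_tree_delete(&root, 42UL, my_node_free);
        }
    }
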
    43.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    43.2 +++ b/xen/include/xen/rbtree.h	Tue May 26 11:05:04 2009 +0100
    43.3 @@ -0,0 +1,82 @@
    43.4 +/*
    43.5 +  Red Black Trees
    43.6 +  (C) 1999  Andrea Arcangeli <andrea@suse.de>
    43.7 +  
    43.8 +  This program is free software; you can redistribute it and/or modify
    43.9 +  it under the terms of the GNU General Public License as published by
   43.10 +  the Free Software Foundation; either version 2 of the License, or
   43.11 +  (at your option) any later version.
   43.12 +
   43.13 +  This program is distributed in the hope that it will be useful,
   43.14 +  but WITHOUT ANY WARRANTY; without even the implied warranty of
   43.15 +  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   43.16 +  GNU General Public License for more details.
   43.17 +
   43.18 +  You should have received a copy of the GNU General Public License
   43.19 +  along with this program; if not, write to the Free Software
   43.20 +  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   43.21 +*/
   43.22 +
   43.23 +#ifndef __RBTREE_H__
   43.24 +#define __RBTREE_H__
   43.25 +
   43.26 +struct rb_node
   43.27 +{
   43.28 +    unsigned long  rb_parent_color;
   43.29 +#define RB_RED  0
   43.30 +#define RB_BLACK 1
   43.31 +    struct rb_node *rb_right;
   43.32 +    struct rb_node *rb_left;
   43.33 +};
   43.34 +
   43.35 +struct rb_root
   43.36 +{
   43.37 +    struct rb_node *rb_node;
   43.38 +};
   43.39 +
   43.40 +#define rb_parent(r)   ((struct rb_node *)((r)->rb_parent_color & ~3))
   43.41 +#define rb_color(r)   ((r)->rb_parent_color & 1)
   43.42 +#define rb_is_red(r)   (!rb_color(r))
   43.43 +#define rb_is_black(r) rb_color(r)
   43.44 +#define rb_set_red(r)  do { (r)->rb_parent_color &= ~1; } while (0)
   43.45 +#define rb_set_black(r)  do { (r)->rb_parent_color |= 1; } while (0)
   43.46 +
   43.47 +static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
   43.48 +{
   43.49 +    rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p;
   43.50 +}
   43.51 +static inline void rb_set_color(struct rb_node *rb, int color)
   43.52 +{
   43.53 +    rb->rb_parent_color = (rb->rb_parent_color & ~1) | color;
   43.54 +}
   43.55 +
   43.56 +#define RB_ROOT (struct rb_root) { NULL, }
   43.57 +#define rb_entry(ptr, type, member) container_of(ptr, type, member)
   43.58 +
   43.59 +#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL)
   43.60 +#define RB_EMPTY_NODE(node) (rb_parent(node) == node)
   43.61 +#define RB_CLEAR_NODE(node) (rb_set_parent(node, node))
   43.62 +
   43.63 +extern void rb_insert_color(struct rb_node *, struct rb_root *);
   43.64 +extern void rb_erase(struct rb_node *, struct rb_root *);
   43.65 +
   43.66 +/* Find logical next and previous nodes in a tree */
   43.67 +extern struct rb_node *rb_next(struct rb_node *);
   43.68 +extern struct rb_node *rb_prev(struct rb_node *);
   43.69 +extern struct rb_node *rb_first(struct rb_root *);
   43.70 +extern struct rb_node *rb_last(struct rb_root *);
   43.71 +
   43.72 +/* Fast replacement of a single node without remove/rebalance/add/rebalance */
   43.73 +extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, 
   43.74 +                            struct rb_root *root);
   43.75 +
   43.76 +static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
   43.77 +                                struct rb_node ** rb_link)
   43.78 +{
   43.79 +    node->rb_parent_color = (unsigned long )parent;
   43.80 +    node->rb_left = node->rb_right = NULL;
   43.81 +
   43.82 +    *rb_link = node;
   43.83 +}
   43.84 +
   43.85 +#endif /* __RBTREE_H__ */
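
The usual insert idiom for this rbtree flavour, as a hedged sketch; 'struct thing' and its 'key' field are illustrative. The caller walks to the insertion point, links the node as a leaf, then lets rb_insert_color() rebalance:

    struct thing {
        struct rb_node node;
        unsigned long key;
    };

    static void thing_insert(struct rb_root *root, struct thing *t)
    {
        struct rb_node **link = &root->rb_node, *parent = NULL;

        while ( *link != NULL )
        {
            struct thing *cur = rb_entry(*link, struct thing, node);

            parent = *link;
            link = (t->key < cur->key) ? &(*link)->rb_left
                                       : &(*link)->rb_right;
        }
        rb_link_node(&t->node, parent, link);   /* attach as a leaf... */
        rb_insert_color(&t->node, root);        /* ...then rebalance */
    }
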
    44.1 --- a/xen/include/xen/sched.h	Tue May 26 10:14:34 2009 +0100
    44.2 +++ b/xen/include/xen/sched.h	Tue May 26 11:05:04 2009 +0100
    44.3 @@ -269,6 +269,9 @@ struct domain
    44.4  
    44.5      /* VRAM dirty support. */
    44.6      struct sh_dirty_vram *dirty_vram;
    44.7 +
    44.8 +    /* transcendent memory, auto-allocated on first tmem op by each domain */
    44.9 +    void *tmem;
   44.10  };
   44.11  
   44.12  struct domain_setup_info
    45.1 --- a/xen/include/xen/spinlock.h	Tue May 26 10:14:34 2009 +0100
    45.2 +++ b/xen/include/xen/spinlock.h	Tue May 26 11:05:04 2009 +0100
    45.3 @@ -67,12 +67,14 @@ void _read_unlock_irqrestore(rwlock_t *l
    45.4  void _write_lock(rwlock_t *lock);
    45.5  void _write_lock_irq(rwlock_t *lock);
    45.6  unsigned long _write_lock_irqsave(rwlock_t *lock);
    45.7 +int _write_trylock(rwlock_t *lock);
    45.8  
    45.9  void _write_unlock(rwlock_t *lock);
   45.10  void _write_unlock_irq(rwlock_t *lock);
   45.11  void _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags);
   45.12  
   45.13  int _rw_is_locked(rwlock_t *lock);
   45.14 +int _rw_is_write_locked(rwlock_t *lock);
   45.15  
   45.16  #define spin_lock(l)                  _spin_lock(l)
   45.17  #define spin_lock_irq(l)              _spin_lock_irq(l)
   45.18 @@ -110,11 +112,13 @@ int _rw_is_locked(rwlock_t *lock);
   45.19  #define write_lock(l)                 _write_lock(l)
   45.20  #define write_lock_irq(l)             _write_lock_irq(l)
   45.21  #define write_lock_irqsave(l, f)      ((f) = _write_lock_irqsave(l))
   45.22 +#define write_trylock(l)              _write_trylock(l)
   45.23  
   45.24  #define write_unlock(l)               _write_unlock(l)
   45.25  #define write_unlock_irq(l)           _write_unlock_irq(l)
   45.26  #define write_unlock_irqrestore(l, f) _write_unlock_irqrestore(l, f)
   45.27  
   45.28  #define rw_is_locked(l)               _rw_is_locked(l)
   45.29 +#define rw_is_write_locked(l)         _rw_is_write_locked(l)
   45.30  
   45.31  #endif /* __SPINLOCK_H__ */
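
A hedged sketch of how the two new rwlock helpers combine (lock initialisation elided; 'my_lock', callee() and try_fast_path() are illustrative, and write_trylock() is assumed to return nonzero on success, per the usual convention):

    static rwlock_t my_lock;    /* initialisation elided */

    static void callee(void)
    {
        ASSERT(rw_is_write_locked(&my_lock));   /* new assertion helper */
        /* ... modify state protected by my_lock ... */
    }

    static int try_fast_path(void)
    {
        if ( !write_trylock(&my_lock) )
            return 0;               /* contended: caller takes slow path */
        callee();
        write_unlock(&my_lock);
        return 1;
    }
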
    46.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    46.2 +++ b/xen/include/xen/tmem.h	Tue May 26 11:05:04 2009 +0100
    46.3 @@ -0,0 +1,16 @@
    46.4 +/******************************************************************************
    46.5 + * tmem.h
    46.6 + *
    46.7 + * Transcendent memory
    46.8 + *
    46.9 + * Copyright (c) 2008, Dan Magenheimer, Oracle Corp.
   46.10 + */
   46.11 +
   46.12 +#ifndef __XEN_TMEM_H__
   46.13 +#define __XEN_TMEM_H__
   46.14 +
   46.15 +extern void init_tmem(void);
   46.16 +extern void tmem_destroy(void *);
   46.17 +extern void *tmem_relinquish_pages(unsigned int, unsigned int);
   46.18 +
   46.19 +#endif /* __XEN_TMEM_H__ */
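
The call sites for these hooks live elsewhere in this changeset (common/domain.c, common/page_alloc.c, arch setup); the following is a hedged sketch of their expected shape, not lifted verbatim from those files:

    /* Domain teardown: free the opaque per-domain tmem state (assumed shape). */
    static void teardown_tmem(struct domain *d)
    {
        if ( d->tmem != NULL )
            tmem_destroy(d->tmem);
    }

    /* Heap pressure: ask tmem to relinquish ephemeral pages of the given
     * order before declaring out-of-memory; NULL means nothing reclaimable. */
    static void *try_tmem_fallback(unsigned int order, unsigned int memflags)
    {
        return tmem_relinquish_pages(order, memflags);
    }
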
    47.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    47.2 +++ b/xen/include/xen/tmem_xen.h	Tue May 26 11:05:04 2009 +0100
    47.3 @@ -0,0 +1,356 @@
    47.4 +/******************************************************************************
    47.5 + * tmem_xen.h
    47.6 + *
    47.7 + * Xen-specific Transcendent memory
    47.8 + *
    47.9 + * Copyright (c) 2009, Dan Magenheimer, Oracle Corp.
   47.10 + */
   47.11 +
   47.12 +#ifndef __XEN_TMEM_XEN_H__
   47.13 +#define __XEN_TMEM_XEN_H__
   47.14 +
   47.15 +#include <xen/config.h>
   47.16 +#include <xen/mm.h> /* heap alloc/free */
   47.17 +#include <xen/xmalloc.h> /* xmalloc/xfree */
   47.18 +#include <xen/sched.h>  /* struct domain */
   47.19 +#include <xen/guest_access.h> /* copy_from_guest */
   47.20 +#include <xen/hash.h> /* hash_long */
   47.21 +#include <public/tmem.h>
   47.22 +
   47.23 +struct tmem_host_dependent_client {
   47.24 +    struct domain *domain;
   47.25 +    struct xmem_pool *persistent_pool;
   47.26 +};
   47.27 +typedef struct tmem_host_dependent_client tmh_client_t;
   47.28 +
   47.29 +#define IS_PAGE_ALIGNED(addr) \
    47.30 +  ((void *)((((unsigned long)(addr) + (PAGE_SIZE - 1)) & PAGE_MASK)) == (addr))
   47.31 +#define IS_VALID_PAGE(_pi)  ( mfn_valid(page_to_mfn(_pi)) )
   47.32 +
   47.33 +extern struct xmem_pool *tmh_mempool;
   47.34 +extern unsigned int tmh_mempool_maxalloc;
   47.35 +extern struct page_list_head tmh_page_list;
   47.36 +extern spinlock_t tmh_page_list_lock;
   47.37 +extern unsigned long tmh_page_list_pages;
   47.38 +
   47.39 +extern spinlock_t tmem_lock;
   47.40 +extern spinlock_t tmem_spinlock;
   47.41 +extern rwlock_t tmem_rwlock;
   47.42 +
    47.43 +extern void tmh_copy_page(char *to, char *from);
   47.44 +extern int tmh_init(void);
   47.45 +extern tmh_client_t *tmh_client_init(void);
   47.46 +extern void tmh_client_destroy(tmh_client_t *);
   47.47 +#define tmh_hash hash_long
   47.48 +
   47.49 +extern void tmh_release_avail_pages_to_host(void);
   47.50 +extern void tmh_scrub_page(struct page_info *pi, unsigned int memflags);
   47.51 +
   47.52 +extern int opt_tmem_compress;
   47.53 +static inline int tmh_compression_enabled(void)
   47.54 +{
   47.55 +    return opt_tmem_compress;
   47.56 +}
   47.57 +
   47.58 +extern int opt_tmem;
   47.59 +static inline int tmh_enabled(void)
   47.60 +{
   47.61 +    return opt_tmem;
   47.62 +}
   47.63 +
   47.64 +extern int opt_tmem_lock;
   47.65 +
   47.66 +extern int opt_tmem_flush_dups;
   47.67 +
   47.68 +/*
    47.69 + * Free page list management
   47.70 + */
   47.71 +
   47.72 +static inline struct page_info *tmh_page_list_get(void)
   47.73 +{
   47.74 +    struct page_info *pi;
   47.75 +
   47.76 +    spin_lock(&tmh_page_list_lock);
   47.77 +    if ( (pi = page_list_remove_head(&tmh_page_list)) != NULL )
   47.78 +        tmh_page_list_pages--;
   47.79 +    spin_unlock(&tmh_page_list_lock);
   47.80 +    ASSERT((pi == NULL) || IS_VALID_PAGE(pi));
   47.81 +    return pi;
   47.82 +}
   47.83 +
   47.84 +static inline void tmh_page_list_put(struct page_info *pi)
   47.85 +{
   47.86 +    ASSERT(IS_VALID_PAGE(pi));
   47.87 +    spin_lock(&tmh_page_list_lock);
   47.88 +    page_list_add(pi, &tmh_page_list);
   47.89 +    tmh_page_list_pages++;
   47.90 +    spin_unlock(&tmh_page_list_lock);
   47.91 +}
   47.92 +
   47.93 +static inline unsigned long tmh_avail_pages(void)
   47.94 +{
   47.95 +    return tmh_page_list_pages;
   47.96 +}
   47.97 +
   47.98 +/*
    47.99 + * Memory allocation for persistent data
  47.100 + */
  47.101 +
  47.102 +static inline bool_t domain_fully_allocated(struct domain *d)
  47.103 +{
  47.104 +    return ( d->tot_pages >= d->max_pages );
  47.105 +}
  47.106 +#define tmh_client_memory_fully_allocated(_pool) \
  47.107 + domain_fully_allocated(_pool->client->tmh->domain)
  47.108 +
  47.109 +static inline void *_tmh_alloc_subpage_thispool(struct xmem_pool *cmem_mempool,
  47.110 +                                                 size_t size, size_t align)
  47.111 +{
  47.112 +#if 0
  47.113 +    if ( d->tot_pages >= d->max_pages )
  47.114 +        return NULL;
  47.115 +#endif
  47.116 +#ifdef __i386__
  47.117 +    return _xmalloc(size,align);
  47.118 +#else
  47.119 +    ASSERT( size < tmh_mempool_maxalloc );
  47.120 +    if ( cmem_mempool == NULL )
  47.121 +        return NULL;
  47.122 +    return xmem_pool_alloc(size, cmem_mempool);
  47.123 +#endif
  47.124 +}
  47.125 +#define tmh_alloc_subpage_thispool(_pool, _s, _a) \
   47.126 +            _tmh_alloc_subpage_thispool(_pool->client->tmh->persistent_pool, \
  47.127 +                                         _s, _a)
  47.128 +
  47.129 +static inline void _tmh_free_subpage_thispool(struct xmem_pool *cmem_mempool,
  47.130 +                                               void *ptr, size_t size)
  47.131 +{
  47.132 +#ifdef __i386__
  47.133 +    xfree(ptr);
  47.134 +#else
  47.135 +    ASSERT( size < tmh_mempool_maxalloc );
  47.136 +    ASSERT( cmem_mempool != NULL );
  47.137 +    xmem_pool_free(ptr,cmem_mempool);
  47.138 +#endif
  47.139 +}
  47.140 +#define tmh_free_subpage_thispool(_pool, _p, _s) \
  47.141 + _tmh_free_subpage_thispool(_pool->client->tmh->persistent_pool, _p, _s)
  47.142 +
  47.143 +static inline struct page_info *_tmh_alloc_page_thispool(struct domain *d)
  47.144 +{
  47.145 +    struct page_info *pi;
  47.146 +
   47.147 +    /* Note: this tot_pages check is not protected by d->page_alloc_lock,
   47.148 +     * so it may race and periodically fail in donate_page or
   47.149 +     * alloc_domheap_pages.  That's OK, though chatty if log_lvl is set. */
  47.150 +    if ( d->tot_pages >= d->max_pages )
  47.151 +        return NULL;
  47.152 +
  47.153 +    if ( tmh_page_list_pages )
  47.154 +    {
  47.155 +        if ( (pi = tmh_page_list_get()) != NULL )
  47.156 +        {
  47.157 +            if ( donate_page(d,pi,0) == 0 )
  47.158 +                goto out;
  47.159 +            else
  47.160 +                tmh_page_list_put(pi);
  47.161 +        }
  47.162 +    }
  47.163 +
  47.164 +    pi = alloc_domheap_pages(d,0,MEMF_tmem);
  47.165 +
  47.166 +out:
  47.167 +    ASSERT((pi == NULL) || IS_VALID_PAGE(pi));
  47.168 +    return pi;
  47.169 +}
  47.170 +#define tmh_alloc_page_thispool(_pool) \
  47.171 +    _tmh_alloc_page_thispool(_pool->client->tmh->domain)
  47.172 +
  47.173 +static inline void _tmh_free_page_thispool(struct page_info *pi)
  47.174 +{
  47.175 +    struct domain *d = page_get_owner(pi);
  47.176 +
  47.177 +    ASSERT(IS_VALID_PAGE(pi));
  47.178 +    if ( (d == NULL) || steal_page(d,pi,0) == 0 )
  47.179 +        tmh_page_list_put(pi);
  47.180 +    else
  47.181 +    {
  47.182 +        scrub_one_page(pi);
  47.183 +        ASSERT((pi->count_info & ~(PGC_allocated | 1)) == 0);
  47.184 +        free_domheap_pages(pi,0);
  47.185 +    }
  47.186 +}
  47.187 +#define tmh_free_page_thispool(_pool,_pg) \
  47.188 +    _tmh_free_page_thispool(_pg)
  47.189 +
  47.190 +/*
  47.191 + * Memory allocation for ephemeral (non-persistent) data
  47.192 + */
  47.193 +
  47.194 +static inline void *tmh_alloc_subpage(void *pool, size_t size,
   47.195 +                                      size_t align)
  47.196 +{
  47.197 +#ifdef __i386__
  47.198 +    ASSERT( size < PAGE_SIZE );
  47.199 +    return _xmalloc(size, align);
  47.200 +#else
  47.201 +    ASSERT( size < tmh_mempool_maxalloc );
  47.202 +    ASSERT( tmh_mempool != NULL );
  47.203 +    return xmem_pool_alloc(size, tmh_mempool);
  47.204 +#endif
  47.205 +}
  47.206 +
  47.207 +static inline void tmh_free_subpage(void *ptr, size_t size)
  47.208 +{
  47.209 +#ifdef __i386__
  47.210 +    ASSERT( size < PAGE_SIZE );
  47.211 +    xfree(ptr);
  47.212 +#else
  47.213 +    ASSERT( size < tmh_mempool_maxalloc );
  47.214 +    xmem_pool_free(ptr,tmh_mempool);
  47.215 +#endif
  47.216 +}
  47.217 +
  47.218 +static inline struct page_info *tmh_alloc_page(void *pool, int no_heap)
  47.219 +{
  47.220 +    struct page_info *pi = tmh_page_list_get();
  47.221 +
  47.222 +    if ( pi == NULL && !no_heap )
  47.223 +        pi = alloc_domheap_pages(0,0,MEMF_tmem);
  47.224 +    ASSERT((pi == NULL) || IS_VALID_PAGE(pi));
  47.225 +    return pi;
  47.226 +}
  47.227 +
  47.228 +static inline void tmh_free_page(struct page_info *pi)
  47.229 +{
  47.230 +    ASSERT(IS_VALID_PAGE(pi));
  47.231 +    tmh_page_list_put(pi);
  47.232 +}
  47.233 +
  47.234 +static inline unsigned int tmem_subpage_maxsize(void)
  47.235 +{
  47.236 +    return tmh_mempool_maxalloc;
  47.237 +}
  47.238 +
  47.239 +#define tmh_lock_all  opt_tmem_lock
  47.240 +#define tmh_flush_dups  opt_tmem_flush_dups
   47.241 +#define tmh_called_from_tmem(_memflags) ((_memflags) & MEMF_tmem)
  47.242 +
  47.243 +/*  "Client" (==domain) abstraction */
  47.244 +
  47.245 +struct client;
  47.246 +typedef domid_t cli_id_t;
  47.247 +typedef struct domain tmh_cli_ptr_t;
  47.248 +typedef struct page_info pfp_t;
  47.249 +
  47.250 +/* this appears to be unreliable when a domain is being shut down */
  47.251 +static inline struct client *tmh_client_from_cli_id(cli_id_t cli_id)
  47.252 +{
  47.253 +    struct domain *d = get_domain_by_id(cli_id);
  47.254 +    if (d == NULL)
  47.255 +        return NULL;
  47.256 +    return (struct client *)(d->tmem);
  47.257 +}
  47.258 +
  47.259 +static inline struct client *tmh_client_from_current(void)
  47.260 +{
  47.261 +    return (struct client *)(current->domain->tmem);
  47.262 +}
  47.263 +
  47.264 +static inline cli_id_t tmh_get_cli_id_from_current(void)
  47.265 +{
  47.266 +    return current->domain->domain_id;
  47.267 +}
  47.268 +
  47.269 +static inline tmh_cli_ptr_t *tmh_get_cli_ptr_from_current(void)
  47.270 +{
  47.271 +    return current->domain;
  47.272 +}
  47.273 +
  47.274 +static inline void tmh_set_current_client(struct client *client)
  47.275 +{
  47.276 +    current->domain->tmem = client;
  47.277 +}
  47.278 +
  47.279 +static inline bool_t tmh_current_is_privileged(void)
  47.280 +{
  47.281 +    return IS_PRIV(current->domain);
  47.282 +}
  47.283 +
   47.284 +/* These typedefs are defined in the public/tmem.h interface:
  47.285 +typedef XEN_GUEST_HANDLE(void) cli_mfn_t;
  47.286 +typedef XEN_GUEST_HANDLE(char) cli_va_t;
  47.287 +typedef XEN_GUEST_HANDLE(tmem_op_t) cli_tmemop_t;
  47.288 +*/
  47.289 +
  47.290 +static inline int tmh_get_tmemop_from_client(tmem_op_t *op, tmem_cli_op_t uops)
  47.291 +{
  47.292 +    return __copy_from_guest(op, uops, 1);
  47.293 +}
  47.294 +
  47.295 +static inline void tmh_copy_to_client_buf_offset(tmem_cli_va_t clibuf, int off,
  47.296 +                                           char *tmembuf, int len)
  47.297 +{
  47.298 +    copy_to_guest_offset(clibuf,off,tmembuf,len);
  47.299 +}
  47.300 +
  47.301 +#define TMH_CLI_ID_NULL ((cli_id_t)((domid_t)-1L))
  47.302 +
  47.303 +#define tmh_cli_id_str "domid"
  47.304 +#define tmh_client_str "domain"
  47.305 +
  47.306 +extern int tmh_decompress_to_client(tmem_cli_mfn_t,void*,size_t);
  47.307 +
  47.308 +extern int tmh_compress_from_client(tmem_cli_mfn_t,void**,size_t *);
  47.309 +
  47.310 +extern int tmh_copy_from_client(pfp_t *pfp,
  47.311 +    tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
  47.312 +    uint32_t pfn_offset, uint32_t len);
  47.313 +
  47.314 +extern int tmh_copy_to_client(tmem_cli_mfn_t cmfn, pfp_t *pfp,
  47.315 +    uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len);
  47.316 +
  47.317 +
  47.318 +#define TMEM_PERF
  47.319 +#ifdef TMEM_PERF
  47.320 +#define DECL_CYC_COUNTER(x) \
  47.321 +    uint64_t x##_sum_cycles = 0, x##_count = 0; \
  47.322 +    uint32_t x##_min_cycles = 0x7fffffff, x##_max_cycles = 0;
  47.323 +#define EXTERN_CYC_COUNTER(x) \
  47.324 +    extern uint64_t x##_sum_cycles, x##_count; \
  47.325 +    extern uint32_t x##_min_cycles, x##_max_cycles;
  47.326 +#define DECL_LOCAL_CYC_COUNTER(x) \
  47.327 +    int64_t x##_start = 0
  47.328 +#define START_CYC_COUNTER(x) x##_start = get_cycles()
  47.329 +#define DUP_START_CYC_COUNTER(x,y) x##_start = y##_start
   47.330 +/* The following might race, but since it's advisory only, we don't care. */
  47.331 +#define END_CYC_COUNTER(x) \
  47.332 +    do { \
  47.333 +      x##_start = get_cycles() - x##_start; \
  47.334 +      if (x##_start > 0 && x##_start < 1000000000) { \
  47.335 +       x##_sum_cycles += x##_start; x##_count++; \
  47.336 +       if ((uint32_t)x##_start < x##_min_cycles) x##_min_cycles = x##_start; \
  47.337 +       if ((uint32_t)x##_start > x##_max_cycles) x##_max_cycles = x##_start; \
  47.338 +      } \
  47.339 +    } while (0)
  47.340 +#define RESET_CYC_COUNTER(x) { x##_sum_cycles = 0, x##_count = 0; \
  47.341 +  x##_min_cycles = 0x7fffffff, x##_max_cycles = 0; }
  47.342 +#define SCNPRINTF_CYC_COUNTER(buf,size,x,tag) \
  47.343 +  scnprintf(buf,size, \
  47.344 +  tag"n:%"PRIu64","tag"t:%"PRIu64","tag"x:%"PRId32","tag"m:%"PRId32",", \
  47.345 +  x##_count,x##_sum_cycles,x##_max_cycles,x##_min_cycles)
  47.346 +#else
  47.347 +#define DECL_CYC_COUNTER(x)
  47.348 +#define EXTERN_CYC_COUNTER(x) \
  47.349 +    extern uint64_t x##_sum_cycles, x##_count; \
  47.350 +    extern uint32_t x##_min_cycles, x##_max_cycles;
  47.351 +#define DECL_LOCAL_CYC_COUNTER(x) do { } while (0)
  47.352 +#define START_CYC_COUNTER(x) do { } while (0)
   47.353 +#define DUP_START_CYC_COUNTER(x,y) do { } while (0)
  47.354 +#define END_CYC_COUNTER(x) do { } while (0)
  47.355 +#define SCNPRINTF_CYC_COUNTER(buf,size,x,tag) (0)
  47.356 +#define RESET_CYC_COUNTER(x) do { } while (0)
  47.357 +#endif
  47.358 +
  47.359 +#endif /* __XEN_TMEM_XEN_H__ */
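
A hedged sketch of the TMEM_PERF counter idiom defined above; the counter name 'pg_copy' and the timed body are illustrative:

    DECL_CYC_COUNTER(pg_copy);      /* file-scope sum/count/min/max */

    static void timed_copy(char *to, char *from)
    {
        DECL_LOCAL_CYC_COUNTER(pg_copy);

        START_CYC_COUNTER(pg_copy);
        tmh_copy_page(to, from);    /* the operation being profiled */
        END_CYC_COUNTER(pg_copy);   /* accumulates into the globals */
    }
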
    48.1 --- a/xen/include/xen/xmalloc.h	Tue May 26 10:14:34 2009 +0100
    48.2 +++ b/xen/include/xen/xmalloc.h	Tue May 26 11:05:04 2009 +0100
    48.3 @@ -76,7 +76,13 @@ void xmem_pool_destroy(struct xmem_pool 
    48.4  void *xmem_pool_alloc(unsigned long size, struct xmem_pool *pool);
    48.5  
    48.6  /**
    48.7 - * xmem_pool_free - free memory from given pool
     48.8 + * xmem_pool_maxalloc - allocations greater than this size will fail
     48.9 + * @mem_pool: pool to query
   48.10 + */
   48.11 +int xmem_pool_maxalloc(struct xmem_pool *pool);
   48.12 +
   48.13 +/**
    48.14 + * xmem_pool_free - free memory from given pool
   48.15   * @ptr: address of memory to be freed
   48.16   * @mem_pool: pool to free from
   48.17   */
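
A hedged sketch of how a caller might use the new query, mirroring the ASSERT(size < tmh_mempool_maxalloc) pattern in tmem_xen.h above; pool_alloc_checked() is illustrative:

    static void *pool_alloc_checked(struct xmem_pool *pool, unsigned long size)
    {
        if ( size >= (unsigned long)xmem_pool_maxalloc(pool) )
            return NULL;                    /* would fail anyway */
        return xmem_pool_alloc(size, pool);
    }
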
    49.1 --- a/xen/include/xlat.lst	Tue May 26 10:14:34 2009 +0100
    49.2 +++ b/xen/include/xlat.lst	Tue May 26 11:05:04 2009 +0100
    49.3 @@ -74,3 +74,6 @@
    49.4  ?	processor_px			platform.h
    49.5  !	psd_package			platform.h
    49.6  !	processor_performance		platform.h
    49.7 +# ?	tmem_op_t			tmem.h
    49.8 +# ?	tmem_cli_mfn_t			tmem.h
    49.9 +# ?	tmem_cli_va_t			tmem.h