ia64/xen-unstable

view xen/common/tmem.c @ 19835:edfdeb150f27

Fix buildsystem to detect udev > version 124

udev removed the udevinfo symlink from versions higher than 123 and
xen's build-system could not detect if udev is in place and has the
required version.

Signed-off-by: Marc-A. Dahlhaus <mad@wol.de>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Jun 25 13:02:37 2009 +0100 (2009-06-25)
parents 0ea75c3b7743
/******************************************************************************
 * tmem.c
 *
 * Transcendent memory
 *
 * Copyright (c) 2009, Dan Magenheimer, Oracle Corp.
 */

/* TODO list: 090129
   - improve on reclamation policy
   - use different tlsf pools for each client (maybe each pool)
   - implement page accounting and minimal QoS limits
   - test shared access more completely (need pv cluster fs)
   - add feedback-driven compression (not for persistent pools though!)
   - add data-structure total bytes overhead stats
 */

#ifdef __XEN__
#include <xen/tmem_xen.h> /* host-specific (eg Xen) code goes here */
#endif

#include <xen/tmem.h>
#include <xen/rbtree.h>
#include <xen/radix-tree.h>
#include <xen/list.h>

#define EXPORT /* indicates code other modules are dependent upon */
#define FORWARD

/************ INTERFACE TO TMEM HOST-DEPENDENT (tmh) CODE ************/

#define CLI_ID_NULL TMH_CLI_ID_NULL
#define cli_id_str  tmh_cli_id_str
#define client_str  tmh_client_str
/************ DEBUG and STATISTICS (+ some compression testing) *******/

#ifndef NDEBUG
#define SENTINELS
#define NOINLINE noinline
#else
#define NOINLINE
#endif

#ifdef SENTINELS
#define DECL_SENTINEL unsigned long sentinel;
#define SET_SENTINEL(_x,_y) _x->sentinel = _y##_SENTINEL
#define INVERT_SENTINEL(_x,_y) _x->sentinel = ~_y##_SENTINEL
#define ASSERT_SENTINEL(_x,_y) \
    ASSERT(_x->sentinel != ~_y##_SENTINEL);ASSERT(_x->sentinel == _y##_SENTINEL)
#ifdef __i386__
#define POOL_SENTINEL 0x87658765
#define OBJ_SENTINEL 0x12345678
#define OBJNODE_SENTINEL 0xfedcba09
#define PGD_SENTINEL  0x43214321
#else
#define POOL_SENTINEL 0x8765876587658765
#define OBJ_SENTINEL 0x1234567812345678
#define OBJNODE_SENTINEL 0xfedcba0987654321
#define PGD_SENTINEL  0x4321432143214321
#endif
#else
#define DECL_SENTINEL
#define SET_SENTINEL(_x,_y) do { } while (0)
#define ASSERT_SENTINEL(_x,_y) do { } while (0)
#define INVERT_SENTINEL(_x,_y) do { } while (0)
#endif
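/*
 * How the sentinel macros fit together (summary): in debug builds each
 * structure that embeds DECL_SENTINEL gets a magic value at allocation
 * time, e.g. SET_SENTINEL(pgp,PGD) expands to
 *     pgp->sentinel = PGD_SENTINEL;
 * and INVERT_SENTINEL(pgp,PGD) flips that value just before the memory
 * is freed, so ASSERT_SENTINEL catches both plain corruption (value
 * neither valid nor inverted) and use-after-free (inverted value).
 */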
/* global statistics (none need to be locked) */
static unsigned long total_tmem_ops = 0;
static unsigned long errored_tmem_ops = 0;
static unsigned long total_flush_pool = 0;
static unsigned long alloc_failed = 0, alloc_page_failed = 0;
static unsigned long evicted_pgs = 0, evict_attempts = 0;
static unsigned long relinq_pgs = 0, relinq_attempts = 0;
static unsigned long max_evicts_per_relinq = 0;
static unsigned long low_on_memory = 0;
static int global_obj_count_max = 0;
static int global_pgp_count_max = 0;
static int global_page_count_max = 0;
static int global_rtree_node_count_max = 0;
static long global_eph_count_max = 0;
static unsigned long failed_copies;

DECL_CYC_COUNTER(succ_get);
DECL_CYC_COUNTER(succ_put);
DECL_CYC_COUNTER(non_succ_get);
DECL_CYC_COUNTER(non_succ_put);
DECL_CYC_COUNTER(flush);
DECL_CYC_COUNTER(flush_obj);
#ifdef COMPARE_COPY_PAGE_SSE2
EXTERN_CYC_COUNTER(pg_copy1);
EXTERN_CYC_COUNTER(pg_copy2);
EXTERN_CYC_COUNTER(pg_copy3);
EXTERN_CYC_COUNTER(pg_copy4);
#else
EXTERN_CYC_COUNTER(pg_copy);
#endif
DECL_CYC_COUNTER(compress);
DECL_CYC_COUNTER(decompress);
/************ CORE DATA STRUCTURES ************************************/

#define MAX_POOLS_PER_DOMAIN 16
#define MAX_GLOBAL_SHARED_POOLS  16

struct tm_pool;
struct client {
    struct list_head client_list;
    struct tm_pool *pools[MAX_POOLS_PER_DOMAIN];
    tmh_client_t *tmh;
    struct list_head ephemeral_page_list;
    long eph_count, eph_count_max;
    cli_id_t cli_id;
    uint32_t weight;
    uint32_t cap;
    bool_t compress;
    bool_t frozen;
    unsigned long compress_poor, compress_nomem;
    unsigned long compressed_pages;
    uint64_t compressed_sum_size;
};
typedef struct client client_t;

struct share_list {
    struct list_head share_list;
    client_t *client;
};
typedef struct share_list sharelist_t;

#define OBJ_HASH_BUCKETS 256 /* must be power of two */
#define OBJ_HASH_BUCKETS_MASK (OBJ_HASH_BUCKETS-1)
#define OBJ_HASH(_oid) (tmh_hash(_oid, BITS_PER_LONG) & OBJ_HASH_BUCKETS_MASK)
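/*
 * Example: OBJ_HASH() spreads a 64-bit object id over the pool's 256
 * hash buckets by masking tmh_hash(oid, BITS_PER_LONG) down to its low
 * 8 bits (OBJ_HASH_BUCKETS_MASK == 0xff).  Each bucket roots its own
 * red-black tree (see obj_rb_root[] below), so a lookup walks one
 * small tree instead of a single tree over all of a pool's objects.
 */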
struct tm_pool {
    bool_t shared;
    bool_t persistent;
    struct list_head pool_list; /* FIXME do we need this anymore? */
    client_t *client;
    uint64_t uuid[2]; /* 0 for private, non-zero for shared */
    uint32_t pool_id;
    rwlock_t pool_rwlock;
    struct rb_root obj_rb_root[OBJ_HASH_BUCKETS]; /* protected by pool_rwlock */
    struct list_head share_list; /* valid if shared */
    DECL_SENTINEL
    int shared_count; /* valid if shared */
    atomic_t pgp_count;
    int pgp_count_max;
    long obj_count;  /* atomicity depends on pool_rwlock held for write */
    long obj_count_max;
    unsigned long objnode_count, objnode_count_max;
    uint64_t sum_life_cycles;
    uint64_t sum_evicted_cycles;
    unsigned long puts, good_puts, no_mem_puts;
    unsigned long dup_puts_flushed, dup_puts_replaced;
    unsigned long gets, found_gets;
    unsigned long flushs, flushs_found;
    unsigned long flush_objs, flush_objs_found;
};
typedef struct tm_pool pool_t;

#define is_persistent(_p)  (_p->persistent)
#define is_ephemeral(_p)   (!(_p->persistent))
#define is_shared(_p)      (_p->shared)
#define is_private(_p)     (!(_p->shared))

struct tmem_object_root {
    DECL_SENTINEL
    uint64_t oid;
    struct rb_node rb_tree_node; /* protected by pool->pool_rwlock */
    unsigned long objnode_count; /* atomicity depends on obj_spinlock */
    long pgp_count; /* atomicity depends on obj_spinlock */
    struct radix_tree_root tree_root; /* tree of pages within object */
    pool_t *pool;
    cli_id_t last_client;
    spinlock_t obj_spinlock;
    bool_t no_evict; /* if globally locked, pseudo-locks against eviction */
};
typedef struct tmem_object_root obj_t;

typedef struct radix_tree_node rtn_t;
struct tmem_object_node {
    obj_t *obj;
    DECL_SENTINEL
    rtn_t rtn;
};
typedef struct tmem_object_node objnode_t;

struct tmem_page_descriptor {
    struct list_head global_eph_pages;
    struct list_head client_eph_pages;
    obj_t *obj;
    uint32_t index;
    size_t size; /* 0 == PAGE_SIZE (pfp), else compressed data (cdata) */
    union {
        pfp_t *pfp;  /* page frame pointer */
        char *cdata; /* compressed data */
    };
    uint64_t timestamp;
    DECL_SENTINEL
};
typedef struct tmem_page_descriptor pgp_t;
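/*
 * Note that an ephemeral page descriptor is linked into two LRU lists
 * at once: global_eph_pages orders all ephemeral pages system-wide and
 * client_eph_pages orders the owning client's pages.  tmem_evict()
 * below walks the per-client list when the caller is over quota and
 * the global list otherwise, so the same pgp_t must be delistable from
 * either path (hence list_del_init() in pgp_delist()).
 */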
static LIST_HEAD(global_ephemeral_page_list); /* all pages in ephemeral pools */

static LIST_HEAD(global_client_list);
static LIST_HEAD(global_pool_list);

static pool_t *global_shared_pools[MAX_GLOBAL_SHARED_POOLS] = { 0 };
static atomic_t client_weight_total = ATOMIC_INIT(0);
static int tmem_initialized = 0;
/************ CONCURRENCY ***********************************************/

EXPORT DEFINE_SPINLOCK(tmem_spinlock);  /* used iff tmh_lock_all */
EXPORT DEFINE_RWLOCK(tmem_rwlock);      /* used iff !tmh_lock_all */
static DEFINE_SPINLOCK(eph_lists_spinlock); /* protects global AND clients */

#define tmem_spin_lock(_l)     do {if (!tmh_lock_all) spin_lock(_l);}while(0)
#define tmem_spin_unlock(_l)   do {if (!tmh_lock_all) spin_unlock(_l);}while(0)
#define tmem_read_lock(_l)     do {if (!tmh_lock_all) read_lock(_l);}while(0)
#define tmem_read_unlock(_l)   do {if (!tmh_lock_all) read_unlock(_l);}while(0)
#define tmem_write_lock(_l)    do {if (!tmh_lock_all) write_lock(_l);}while(0)
#define tmem_write_unlock(_l)  do {if (!tmh_lock_all) write_unlock(_l);}while(0)
#define tmem_write_trylock(_l) ((tmh_lock_all)?1:write_trylock(_l))
#define tmem_spin_trylock(_l)  (tmh_lock_all?1:spin_trylock(_l))

#define ASSERT_SPINLOCK(_l) ASSERT(tmh_lock_all || spin_is_locked(_l))
#define ASSERT_WRITELOCK(_l) ASSERT(tmh_lock_all || rw_is_write_locked(_l))
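/*
 * Locking model: when the host-dependent tmh_lock_all flag is set,
 * every tmem operation runs under the single coarse tmem_spinlock
 * (taken in do_tmem_op() below) and the wrappers above compile to
 * no-ops; the trylock wrappers likewise report unconditional success.
 * When tmh_lock_all is clear, the wrappers fall through to the real
 * fine-grained locks (tmem_rwlock, per-pool pool_rwlock, per-object
 * obj_spinlock, and eph_lists_spinlock).
 */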
/* global counters (should use long_atomic_t access) */
static long global_eph_count = 0; /* atomicity depends on eph_lists_spinlock */
static atomic_t global_obj_count = ATOMIC_INIT(0);
static atomic_t global_pgp_count = ATOMIC_INIT(0);
static atomic_t global_page_count = ATOMIC_INIT(0);
static atomic_t global_rtree_node_count = ATOMIC_INIT(0);

#define atomic_inc_and_max(_c) do { \
    atomic_inc(&_c); \
    if ( _atomic_read(_c) > _c##_max ) \
        _c##_max = _atomic_read(_c); \
} while (0)

#define atomic_dec_and_assert(_c) do { \
    atomic_dec(&_c); \
    ASSERT(_atomic_read(_c) >= 0); \
} while (0)
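/*
 * Token pasting gives these macros a high-water mark by naming
 * convention; e.g. atomic_inc_and_max(global_pgp_count) expands
 * (roughly) to:
 *
 *     atomic_inc(&global_pgp_count);
 *     if ( _atomic_read(global_pgp_count) > global_pgp_count_max )
 *         global_pgp_count_max = _atomic_read(global_pgp_count);
 *
 * so every counter _c passed in must have a companion _c##_max
 * variable, as declared in the statistics section above.
 */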
/************ MEMORY ALLOCATION INTERFACE *****************************/

#define tmem_malloc(_type,_pool) \
       _tmem_malloc(sizeof(_type), __alignof__(_type), _pool)

#define tmem_malloc_bytes(_size,_pool) \
       _tmem_malloc(_size, 1, _pool)

static NOINLINE void *_tmem_malloc(size_t size, size_t align, pool_t *pool)
{
    void *v;

    if ( (pool != NULL) && is_persistent(pool) )
        v = tmh_alloc_subpage_thispool(pool,size,align);
    else
        v = tmh_alloc_subpage(pool, size, align);
    if ( v == NULL )
        alloc_failed++;
    return v;
}

static NOINLINE void tmem_free(void *p, size_t size, pool_t *pool)
{
    if ( pool == NULL || !is_persistent(pool) )
        tmh_free_subpage(p,size);
    else
        tmh_free_subpage_thispool(pool,p,size);
}

static NOINLINE pfp_t *tmem_page_alloc(pool_t *pool)
{
    pfp_t *pfp = NULL;

    if ( pool != NULL && is_persistent(pool) )
        pfp = tmh_alloc_page_thispool(pool);
    else
        pfp = tmh_alloc_page(pool,0);
    if ( pfp == NULL )
        alloc_page_failed++;
    else
        atomic_inc_and_max(global_page_count);
    return pfp;
}

static NOINLINE void tmem_page_free(pool_t *pool, pfp_t *pfp)
{
    ASSERT(pfp);
    if ( pool == NULL || !is_persistent(pool) )
        tmh_free_page(pfp);
    else
        tmh_free_page_thispool(pool,pfp);
    atomic_dec_and_assert(global_page_count);
}
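/*
 * Allocation routing, in brief: persistent pools draw data pages and
 * metadata from their own per-pool allocator (the tmh_*_thispool
 * variants), while ephemeral pools and pool-less allocations draw from
 * the shared host allocator.  Free must agree with alloc on the pool
 * argument, which is why callers pass the same pool to
 * tmem_free()/tmem_page_free() that they used at allocation time.
 */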
/************ PAGE DESCRIPTOR MANIPULATION ROUTINES *******************/

/* allocate a pgp_t and associate it with an object */
static NOINLINE pgp_t *pgp_alloc(obj_t *obj)
{
    pgp_t *pgp;
    pool_t *pool;

    ASSERT(obj != NULL);
    ASSERT(obj->pool != NULL);
    pool = obj->pool;
    if ( (pgp = tmem_malloc(pgp_t, pool)) == NULL )
        return NULL;
    pgp->obj = obj;
    INIT_LIST_HEAD(&pgp->global_eph_pages);
    INIT_LIST_HEAD(&pgp->client_eph_pages);
    pgp->pfp = NULL;
    pgp->size = -1;
    pgp->index = -1;
    pgp->timestamp = get_cycles();
    SET_SENTINEL(pgp,PGD);
    atomic_inc_and_max(global_pgp_count);
    atomic_inc_and_max(pool->pgp_count);
    return pgp;
}

static pgp_t *pgp_lookup_in_obj(obj_t *obj, uint32_t index)
{
    ASSERT(obj != NULL);
    ASSERT_SPINLOCK(&obj->obj_spinlock);
    ASSERT_SENTINEL(obj,OBJ);
    ASSERT(obj->pool != NULL);
    ASSERT_SENTINEL(obj->pool,POOL);
    return radix_tree_lookup(&obj->tree_root, index);
}

static NOINLINE void pgp_free_data(pgp_t *pgp, pool_t *pool)
{
    if ( pgp->pfp == NULL )
        return;
    if ( !pgp->size )
        tmem_page_free(pgp->obj->pool,pgp->pfp);
    else
    {
        tmem_free(pgp->cdata,pgp->size,pool);
        if ( pool != NULL )
        {
            pool->client->compressed_pages--;
            pool->client->compressed_sum_size -= pgp->size;
        }
    }
    pgp->pfp = NULL;
    pgp->size = -1;
}

static NOINLINE void pgp_free(pgp_t *pgp, int from_delete)
{
    pool_t *pool = NULL;

    ASSERT_SENTINEL(pgp,PGD);
    ASSERT(pgp->obj != NULL);
    ASSERT_SENTINEL(pgp->obj,OBJ);
    ASSERT_SENTINEL(pgp->obj->pool,POOL);
    ASSERT(list_empty(&pgp->global_eph_pages));
    ASSERT(list_empty(&pgp->client_eph_pages));
    if ( from_delete )
        ASSERT(pgp_lookup_in_obj(pgp->obj,pgp->index) == NULL);
    ASSERT(pgp->obj->pool != NULL);
    pool = pgp->obj->pool;
    pgp_free_data(pgp, pool);
    INVERT_SENTINEL(pgp,PGD);
    pgp->obj = NULL;
    pgp->index = -1;
    pgp->size = -1;
    atomic_dec_and_assert(global_pgp_count);
    atomic_dec_and_assert(pool->pgp_count);
    tmem_free(pgp,sizeof(pgp_t),pool);
}

/* remove the page from appropriate lists but not from parent object */
static void pgp_delist(pgp_t *pgp, bool_t no_eph_lock)
{
    ASSERT(pgp != NULL);
    ASSERT(pgp->obj != NULL);
    ASSERT(pgp->obj->pool != NULL);
    ASSERT(pgp->obj->pool->client != NULL);
    if ( is_ephemeral(pgp->obj->pool) )
    {
        if ( !no_eph_lock )
            tmem_spin_lock(&eph_lists_spinlock);
        if ( !list_empty(&pgp->client_eph_pages) )
            pgp->obj->pool->client->eph_count--;
        ASSERT(pgp->obj->pool->client->eph_count >= 0);
        list_del_init(&pgp->client_eph_pages);
        if ( !list_empty(&pgp->global_eph_pages) )
            global_eph_count--;
        ASSERT(global_eph_count >= 0);
        list_del_init(&pgp->global_eph_pages);
        if ( !no_eph_lock )
            tmem_spin_unlock(&eph_lists_spinlock);
    }
}
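/*
 * The no_eph_lock flag exists because one caller already holds
 * eph_lists_spinlock: tmem_evict() delists a victim page while walking
 * the ephemeral LRU lists under that lock, so it passes no_eph_lock=1
 * (via pgp_delete()); the ordinary get/flush paths pass 0 and let
 * pgp_delist() take and release the lock itself.
 */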
/* remove page from lists (but not from parent object) and free it */
static NOINLINE void pgp_delete(pgp_t *pgp, bool_t no_eph_lock)
{
    uint64_t life;

    ASSERT(pgp != NULL);
    ASSERT(pgp->obj != NULL);
    ASSERT(pgp->obj->pool != NULL);
    life = get_cycles() - pgp->timestamp;
    pgp->obj->pool->sum_life_cycles += life;
    pgp_delist(pgp, no_eph_lock);
    pgp_free(pgp,1);
}

/* called only indirectly by radix_tree_destroy */
static NOINLINE void pgp_destroy(void *v)
{
    pgp_t *pgp = (pgp_t *)v;

    ASSERT_SPINLOCK(&pgp->obj->obj_spinlock);
    pgp_delist(pgp,0);
    ASSERT(pgp->obj != NULL);
    pgp->obj->pgp_count--;
    ASSERT(pgp->obj->pgp_count >= 0);
    pgp_free(pgp,0);
}

FORWARD static rtn_t *rtn_alloc(void *arg);
FORWARD static void rtn_free(rtn_t *rtn);

static int pgp_add_to_obj(obj_t *obj, uint32_t index, pgp_t *pgp)
{
    int ret;

    ASSERT_SPINLOCK(&obj->obj_spinlock);
    ret = radix_tree_insert(&obj->tree_root, index, pgp, rtn_alloc, obj);
    if ( !ret )
        obj->pgp_count++;
    return ret;
}

static NOINLINE pgp_t *pgp_delete_from_obj(obj_t *obj, uint32_t index)
{
    pgp_t *pgp;

    ASSERT(obj != NULL);
    ASSERT_SPINLOCK(&obj->obj_spinlock);
    ASSERT_SENTINEL(obj,OBJ);
    ASSERT(obj->pool != NULL);
    ASSERT_SENTINEL(obj->pool,POOL);
    pgp = radix_tree_delete(&obj->tree_root, index, rtn_free);
    if ( pgp != NULL )
        obj->pgp_count--;
    ASSERT(obj->pgp_count >= 0);

    return pgp;
}
/************ RADIX TREE NODE MANIPULATION ROUTINES *******************/

/* called only indirectly from radix_tree_insert */
static NOINLINE rtn_t *rtn_alloc(void *arg)
{
    objnode_t *objnode;
    obj_t *obj = (obj_t *)arg;

    ASSERT_SENTINEL(obj,OBJ);
    ASSERT(obj->pool != NULL);
    ASSERT_SENTINEL(obj->pool,POOL);
    objnode = tmem_malloc(objnode_t,obj->pool);
    if (objnode == NULL)
        return NULL;
    objnode->obj = obj;
    SET_SENTINEL(objnode,OBJNODE);
    memset(&objnode->rtn, 0, sizeof(rtn_t));
    if (++obj->pool->objnode_count > obj->pool->objnode_count_max)
        obj->pool->objnode_count_max = obj->pool->objnode_count;
    atomic_inc_and_max(global_rtree_node_count);
    obj->objnode_count++;
    return &objnode->rtn;
}

/* called only indirectly from radix_tree_delete/destroy */
static void rtn_free(rtn_t *rtn)
{
    pool_t *pool;
    objnode_t *objnode;
    int i;

    ASSERT(rtn != NULL);
    for (i = 0; i < RADIX_TREE_MAP_SIZE; i++)
        ASSERT(rtn->slots[i] == NULL);
    objnode = container_of(rtn,objnode_t,rtn);
    ASSERT_SENTINEL(objnode,OBJNODE);
    INVERT_SENTINEL(objnode,OBJNODE);
    ASSERT(objnode->obj != NULL);
    ASSERT_SPINLOCK(&objnode->obj->obj_spinlock);
    ASSERT_SENTINEL(objnode->obj,OBJ);
    pool = objnode->obj->pool;
    ASSERT(pool != NULL);
    ASSERT_SENTINEL(pool,POOL);
    pool->objnode_count--;
    objnode->obj->objnode_count--;
    objnode->obj = NULL;
    tmem_free(objnode,sizeof(objnode_t),pool);
    atomic_dec_and_assert(global_rtree_node_count);
}
/************ POOL OBJECT COLLECTION MANIPULATION ROUTINES *******************/

/* searches for object==oid in pool, returns locked object if found */
static NOINLINE obj_t * obj_find(pool_t *pool, uint64_t oid)
{
    struct rb_node *node;
    obj_t *obj;

restart_find:
    tmem_read_lock(&pool->pool_rwlock);
    node = pool->obj_rb_root[OBJ_HASH(oid)].rb_node;
    while ( node )
    {
        obj = container_of(node, obj_t, rb_tree_node);
        if ( obj->oid == oid )
        {
            if ( tmh_lock_all )
                obj->no_evict = 1;
            else
            {
                if ( !tmem_spin_trylock(&obj->obj_spinlock) )
                {
                    tmem_read_unlock(&pool->pool_rwlock);
                    goto restart_find;
                }
                tmem_read_unlock(&pool->pool_rwlock);
            }
            return obj;
        }
        else if ( oid < obj->oid )
            node = node->rb_left;
        else
            node = node->rb_right;
    }
    tmem_read_unlock(&pool->pool_rwlock);
    return NULL;
}
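/*
 * Note that obj_find() only ever *trylocks* the object while holding
 * the pool's read lock; on failure it drops the read lock and restarts
 * the whole search rather than spinning.  This avoids blocking on an
 * object lock whose holder (e.g. a put/flush path about to free the
 * object) may itself be waiting to take pool_rwlock for writing.
 */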
/* free an object that has no more pgps in it */
static NOINLINE void obj_free(obj_t *obj, int no_rebalance)
{
    pool_t *pool;
    uint64_t old_oid;

    ASSERT_SPINLOCK(&obj->obj_spinlock);
    ASSERT(obj != NULL);
    ASSERT_SENTINEL(obj,OBJ);
    ASSERT(obj->pgp_count == 0);
    pool = obj->pool;
    ASSERT(pool != NULL);
    ASSERT_WRITELOCK(&pool->pool_rwlock);
    if ( obj->tree_root.rnode != NULL ) /* may be a "stump" with no leaves */
        radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free);
    ASSERT((long)obj->objnode_count == 0);
    ASSERT(obj->tree_root.rnode == NULL);
    pool->obj_count--;
    ASSERT(pool->obj_count >= 0);
    INVERT_SENTINEL(obj,OBJ);
    obj->pool = NULL;
    old_oid = obj->oid;
    obj->oid = -1;
    obj->last_client = CLI_ID_NULL;
    atomic_dec_and_assert(global_obj_count);
    /* use no_rebalance only if all objects are being destroyed anyway */
    if ( !no_rebalance )
        rb_erase(&obj->rb_tree_node,&pool->obj_rb_root[OBJ_HASH(old_oid)]);
    tmem_free(obj,sizeof(obj_t),pool);
}

static NOINLINE int obj_rb_insert(struct rb_root *root, obj_t *obj)
{
    struct rb_node **new, *parent = NULL;
    obj_t *this;

    new = &(root->rb_node);
    while ( *new )
    {
        this = container_of(*new, obj_t, rb_tree_node);
        parent = *new;
        if ( obj->oid < this->oid )
            new = &((*new)->rb_left);
        else if ( obj->oid > this->oid )
            new = &((*new)->rb_right);
        else
            return 0;
    }
    rb_link_node(&obj->rb_tree_node, parent, new);
    rb_insert_color(&obj->rb_tree_node, root);
    return 1;
}

/*
 * allocate, initialize, and insert a tmem_object_root
 * (should be called only if find failed)
 */
static NOINLINE obj_t * obj_new(pool_t *pool, uint64_t oid)
{
    obj_t *obj;

    ASSERT(pool != NULL);
    ASSERT_WRITELOCK(&pool->pool_rwlock);
    if ( (obj = tmem_malloc(obj_t,pool)) == NULL )
        return NULL;
    pool->obj_count++;
    if (pool->obj_count > pool->obj_count_max)
        pool->obj_count_max = pool->obj_count;
    atomic_inc_and_max(global_obj_count);
    INIT_RADIX_TREE(&obj->tree_root,0);
    spin_lock_init(&obj->obj_spinlock);
    obj->pool = pool;
    obj->oid = oid;
    obj->objnode_count = 0;
    obj->pgp_count = 0;
    obj->last_client = CLI_ID_NULL;
    SET_SENTINEL(obj,OBJ);
    tmem_spin_lock(&obj->obj_spinlock);
    obj_rb_insert(&pool->obj_rb_root[OBJ_HASH(oid)], obj);
    obj->no_evict = 1;
    ASSERT_SPINLOCK(&obj->obj_spinlock);
    return obj;
}

/* free an object after destroying any pgps in it */
static NOINLINE void obj_destroy(obj_t *obj, int no_rebalance)
{
    ASSERT_WRITELOCK(&obj->pool->pool_rwlock);
    radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free);
    obj_free(obj,no_rebalance);
}

/* destroys all objs in a pool, or only if obj->last_client matches cli_id */
static void pool_destroy_objs(pool_t *pool, bool_t selective, cli_id_t cli_id)
{
    struct rb_node *node;
    obj_t *obj;
    int i;

    tmem_write_lock(&pool->pool_rwlock);
    for (i = 0; i < OBJ_HASH_BUCKETS; i++)
    {
        node = rb_first(&pool->obj_rb_root[i]);
        while ( node != NULL )
        {
            obj = container_of(node, obj_t, rb_tree_node);
            tmem_spin_lock(&obj->obj_spinlock);
            node = rb_next(node);
            ASSERT(obj->no_evict == 0);
            if ( !selective )
                obj_destroy(obj,1);
            else if ( obj->last_client == cli_id )
                obj_destroy(obj,0);
            else
                tmem_spin_unlock(&obj->obj_spinlock);
        }
    }
    tmem_write_unlock(&pool->pool_rwlock);
}
/************ POOL MANIPULATION ROUTINES ******************************/

static pool_t * pool_alloc(void)
{
    pool_t *pool;
    int i;

    if ( (pool = tmem_malloc(pool_t,NULL)) == NULL )
        return NULL;
    for (i = 0; i < OBJ_HASH_BUCKETS; i++)
        pool->obj_rb_root[i] = RB_ROOT;
    INIT_LIST_HEAD(&pool->pool_list);
    rwlock_init(&pool->pool_rwlock);
    pool->pgp_count_max = pool->obj_count_max = 0;
    pool->objnode_count = pool->objnode_count_max = 0;
    atomic_set(&pool->pgp_count,0);
    pool->obj_count = 0;
    pool->good_puts = pool->puts = pool->dup_puts_flushed = 0;
    pool->dup_puts_replaced = pool->no_mem_puts = 0;
    pool->found_gets = pool->gets = 0;
    pool->flushs_found = pool->flushs = 0;
    pool->flush_objs_found = pool->flush_objs = 0;
    SET_SENTINEL(pool,POOL);
    return pool;
}

static NOINLINE void pool_free(pool_t *pool)
{
    ASSERT_SENTINEL(pool,POOL);
    INVERT_SENTINEL(pool,POOL);
    pool->client = NULL;
    list_del(&pool->pool_list);
    tmem_free(pool,sizeof(pool_t),NULL);
}

/* register new_client as a user of this shared pool and return new
   total number of registered users */
static int shared_pool_join(pool_t *pool, client_t *new_client)
{
    sharelist_t *sl;

    ASSERT(is_shared(pool));
    if ( (sl = tmem_malloc(sharelist_t,NULL)) == NULL )
        return -1;
    sl->client = new_client;
    list_add_tail(&sl->share_list, &pool->share_list);
    if ( new_client->cli_id != pool->client->cli_id )
        printk("adding new %s %d to shared pool owned by %s %d\n",
            client_str, new_client->cli_id, client_str, pool->client->cli_id);
    return ++pool->shared_count;
}

/* reassign "ownership" of the pool to another client that shares this pool */
static NOINLINE void shared_pool_reassign(pool_t *pool)
{
    sharelist_t *sl;
    int poolid;
    client_t *old_client = pool->client, *new_client;

    ASSERT(is_shared(pool));
    if ( list_empty(&pool->share_list) )
    {
        ASSERT(pool->shared_count == 0);
        return;
    }
    old_client->pools[pool->pool_id] = NULL;
    sl = list_entry(pool->share_list.next, sharelist_t, share_list);
    ASSERT(sl->client != old_client);
    pool->client = new_client = sl->client;
    for (poolid = 0; poolid < MAX_POOLS_PER_DOMAIN; poolid++)
        if (new_client->pools[poolid] == pool)
            break;
    ASSERT(poolid != MAX_POOLS_PER_DOMAIN);
    new_client->eph_count += _atomic_read(pool->pgp_count);
    old_client->eph_count -= _atomic_read(pool->pgp_count);
    list_splice_init(&old_client->ephemeral_page_list,
                     &new_client->ephemeral_page_list);
    printk("reassigned shared pool from %s=%d to %s=%d pool_id=%d\n",
        cli_id_str, old_client->cli_id, cli_id_str, new_client->cli_id, poolid);
    pool->pool_id = poolid;
}

/* destroy all objects with last_client same as passed cli_id,
   remove pool's cli_id from list of sharers of this pool */
static NOINLINE int shared_pool_quit(pool_t *pool, cli_id_t cli_id)
{
    sharelist_t *sl;
    int s_poolid;

    ASSERT(is_shared(pool));
    ASSERT(pool->client != NULL);

    ASSERT_WRITELOCK(&tmem_rwlock);
    pool_destroy_objs(pool,1,cli_id);
    list_for_each_entry(sl,&pool->share_list, share_list)
    {
        if (sl->client->cli_id != cli_id)
            continue;
        list_del(&sl->share_list);
        tmem_free(sl,sizeof(sharelist_t),pool);
        --pool->shared_count;
        if (pool->client->cli_id == cli_id)
            shared_pool_reassign(pool);
        if (pool->shared_count)
            return pool->shared_count;
        for (s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++)
            if ( (global_shared_pools[s_poolid]) == pool )
            {
                global_shared_pools[s_poolid] = NULL;
                break;
            }
        return 0;
    }
    printk("tmem: no match unsharing pool, %s=%d\n",
        cli_id_str,pool->client->cli_id);
    return -1;
}
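/*
 * Return-value convention for shared_pool_quit(): -1 means cli_id was
 * not found on the pool's share list; a positive value is the number
 * of sharers still attached (the pool must survive); 0 means the last
 * sharer has left and the pool's global_shared_pools[] slot has been
 * cleared, so the caller may go on to free the pool.
 */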
/* flush all data (owned by cli_id) from a pool and, optionally, free it */
static void pool_flush(pool_t *pool, cli_id_t cli_id, bool_t destroy)
{
    ASSERT(pool != NULL);
    if ( (is_shared(pool)) && (shared_pool_quit(pool,cli_id) > 0) )
    {
        printk("tmem: %s=%d no longer using shared pool %d owned by %s=%d\n",
           cli_id_str, cli_id, pool->pool_id, cli_id_str,pool->client->cli_id);
        return;
    }
    printk("%s %s-%s tmem pool ",destroy?"destroying":"flushing",
        is_persistent(pool) ? "persistent" : "ephemeral" ,
        is_shared(pool) ? "shared" : "private");
    printk("%s=%d pool_id=%d\n", cli_id_str,pool->client->cli_id,pool->pool_id);
    pool_destroy_objs(pool,0,CLI_ID_NULL);
    if ( destroy )
    {
        pool->client->pools[pool->pool_id] = NULL;
        pool_free(pool);
    }
}
/************ CLIENT MANIPULATION OPERATIONS **************************/

static client_t *client_create(void)
{
    client_t *client = tmem_malloc(client_t,NULL);
    cli_id_t cli_id = tmh_get_cli_id_from_current();

    printk("tmem: initializing tmem capability for %s=%d...",cli_id_str,cli_id);
    if ( client == NULL )
    {
        printk("failed... out of memory\n");
        return NULL;
    }
    memset(client,0,sizeof(client_t));
    if ( (client->tmh = tmh_client_init()) == NULL )
    {
        printk("failed... can't allocate host-dependent part of client\n");
        if ( client )
            tmem_free(client,sizeof(client_t),NULL);
        return NULL;
    }
    tmh_set_current_client(client);
    client->cli_id = cli_id;
#ifdef __i386__
    client->compress = 0;
#else
    client->compress = tmh_compression_enabled();
#endif
    list_add_tail(&client->client_list, &global_client_list);
    INIT_LIST_HEAD(&client->ephemeral_page_list);
    client->eph_count = client->eph_count_max = 0;
    printk("ok\n");
    return client;
}

static void client_free(client_t *client)
{
    list_del(&client->client_list);
    tmh_client_destroy(client->tmh);
    tmem_free(client,sizeof(client_t),NULL);
}

/* flush all data from a client and, optionally, free it */
static void client_flush(client_t *client, bool_t destroy)
{
    int i;
    pool_t *pool;

    for (i = 0; i < MAX_POOLS_PER_DOMAIN; i++)
    {
        if ( (pool = client->pools[i]) == NULL )
            continue;
        pool_flush(pool,client->cli_id,destroy);
        if ( destroy )
            client->pools[i] = NULL;
    }
    if ( destroy )
        client_free(client);
}

static bool_t client_over_quota(client_t *client)
{
    int total = _atomic_read(client_weight_total);

    ASSERT(client != NULL);
    if ( (total == 0) || (client->weight == 0) ||
         (client->eph_count == 0) )
        return 0;
    return ( ((global_eph_count*100L) / client->eph_count ) >
             ((total*100L) / client->weight) );
}
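/*
 * Worked example of the quota test (integer percent arithmetic): with
 * client_weight_total == 4, a client of weight 1 holding 100 of 1000
 * global ephemeral pages gives (1000*100)/100 == 1000 on the left and
 * (4*100)/1 == 400 on the right, so the function returns true.  Up to
 * integer truncation, the test fires exactly when the client's share
 * of ephemeral pages, eph_count/global_eph_count, is below its weight
 * share, weight/total.
 */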
/************ MEMORY REVOCATION ROUTINES *******************************/

static int tmem_evict(void)
{
    client_t *client = tmh_client_from_current();
    pgp_t *pgp = NULL, *pgp_del;
    obj_t *obj;
    pool_t *pool;
    int ret = 0;
    bool_t hold_pool_rwlock = 0;

    evict_attempts++;
    tmem_spin_lock(&eph_lists_spinlock);
    if ( (client != NULL) && client_over_quota(client) &&
         !list_empty(&client->ephemeral_page_list) )
    {
        list_for_each_entry(pgp,&client->ephemeral_page_list,client_eph_pages)
        {
            obj = pgp->obj;
            pool = obj->pool;
            if ( tmh_lock_all && !obj->no_evict )
                goto found;
            if ( tmem_spin_trylock(&obj->obj_spinlock) )
            {
                if ( obj->pgp_count > 1 )
                    goto found;
                if ( tmem_write_trylock(&pool->pool_rwlock) )
                {
                    hold_pool_rwlock = 1;
                    goto found;
                }
                tmem_spin_unlock(&obj->obj_spinlock);
            }
        }
    } else if ( list_empty(&global_ephemeral_page_list) ) {
        goto out;
    } else {
        list_for_each_entry(pgp,&global_ephemeral_page_list,global_eph_pages)
        {
            obj = pgp->obj;
            pool = obj->pool;
            if ( tmh_lock_all && !obj->no_evict )
                goto found;
            if ( tmem_spin_trylock(&obj->obj_spinlock) )
            {
                if ( obj->pgp_count > 1 )
                    goto found;
                if ( tmem_write_trylock(&pool->pool_rwlock) )
                {
                    hold_pool_rwlock = 1;
                    goto found;
                }
                tmem_spin_unlock(&obj->obj_spinlock);
            }
        }
    }

    ret = 0;
    goto out;

found:
    ASSERT(pgp != NULL);
    ASSERT_SENTINEL(pgp,PGD);
    obj = pgp->obj;
    ASSERT(obj != NULL);
    ASSERT(obj->no_evict == 0);
    ASSERT(obj->pool != NULL);
    ASSERT_SENTINEL(obj,OBJ);

    ASSERT_SPINLOCK(&obj->obj_spinlock);
    pgp_del = pgp_delete_from_obj(obj, pgp->index);
    ASSERT(pgp_del == pgp);
    pgp_delete(pgp,1);
    if ( obj->pgp_count == 0 )
    {
        ASSERT_WRITELOCK(&pool->pool_rwlock);
        obj_free(obj,0);
    }
    else
        tmem_spin_unlock(&obj->obj_spinlock);
    if ( hold_pool_rwlock )
        tmem_write_unlock(&pool->pool_rwlock);
    evicted_pgs++;
    ret = 1;

out:
    tmem_spin_unlock(&eph_lists_spinlock);
    return ret;
}
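/*
 * Eviction at a glance: the scan walks one ephemeral LRU list (the
 * caller's own list when client_over_quota() says so, otherwise the
 * global list) under eph_lists_spinlock, and takes only trylocks on
 * the victim's object and, when the object would become empty, on the
 * pool's rwlock.  Any page whose locks cannot all be acquired is
 * skipped, so tmem_evict() can return 0 even while ephemeral pages
 * remain.
 */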
static unsigned long tmem_relinquish_npages(unsigned long n)
{
    unsigned long avail_pages = 0;

    while ( (avail_pages = tmh_avail_pages()) < n )
    {
        if ( !tmem_evict() )
            break;
    }
    if ( avail_pages )
        tmh_release_avail_pages_to_host();
    return avail_pages;
}
/************ TMEM CORE OPERATIONS ************************************/

static NOINLINE int do_tmem_put_compress(pgp_t *pgp, tmem_cli_mfn_t cmfn)
{
    void *dst, *p;
    size_t size;
    int ret = 0;
    DECL_LOCAL_CYC_COUNTER(compress);

    ASSERT(pgp != NULL);
    ASSERT(pgp->obj != NULL);
    ASSERT_SPINLOCK(&pgp->obj->obj_spinlock);
    ASSERT(pgp->obj->pool != NULL);
    ASSERT(pgp->obj->pool->client != NULL);
#ifdef __i386__
    return -ENOMEM;
#endif
    if ( pgp->pfp != NULL )
        pgp_free_data(pgp, pgp->obj->pool);  /* FIXME... is this right? */
    START_CYC_COUNTER(compress);
    ret = tmh_compress_from_client(cmfn, &dst, &size);
    if ( (ret == -EFAULT) || (ret == 0) )
        goto out;
    else if ( (size == 0) || (size >= tmem_subpage_maxsize()) )
        ret = 0;
    else if ( (p = tmem_malloc_bytes(size,pgp->obj->pool)) == NULL )
        ret = -ENOMEM;
    else
    {
        memcpy(p,dst,size);
        pgp->cdata = p;
        pgp->size = size;
        pgp->obj->pool->client->compressed_pages++;
        pgp->obj->pool->client->compressed_sum_size += size;
        ret = 1;
    }

out:
    END_CYC_COUNTER(compress);
    return ret;
}
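/*
 * Return values of do_tmem_put_compress(), as its callers interpret
 * them: 1 == stored compressed; 0 == page incompressible (or too large
 * once compressed), caller should store it uncompressed; -ENOMEM == no
 * space for the compressed copy; -EFAULT == bad client page.
 */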
static NOINLINE int do_tmem_dup_put(pgp_t *pgp, tmem_cli_mfn_t cmfn,
       uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len)
{
    pool_t *pool;
    obj_t *obj;
    client_t *client;
    pgp_t *pgpfound = NULL;
    int ret;

    /* if we can successfully manipulate pgp to change out the data, do so */
    ASSERT(pgp != NULL);
    ASSERT(pgp->pfp != NULL);
    ASSERT(pgp->size != -1);
    obj = pgp->obj;
    ASSERT_SPINLOCK(&obj->obj_spinlock);
    ASSERT(obj != NULL);
    pool = obj->pool;
    ASSERT(pool != NULL);
    client = pool->client;
    if ( len != 0 && tmh_compression_enabled() &&
         client->compress && pgp->size != 0 )
    {
        ret = do_tmem_put_compress(pgp,cmfn);
        if ( ret == 1 )
            goto done;
        else if ( ret == 0 )
            goto copy_uncompressed;
        else if ( ret == -ENOMEM )
            goto failed_dup;
        else if ( ret == -EFAULT )
            goto bad_copy;
    }

copy_uncompressed:
    if ( pgp->pfp )
        pgp_free_data(pgp, pool);
    if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
        goto failed_dup;
    /* tmh_copy_from_client properly handles len==0 and offsets != 0 */
    ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len);
    if ( ret == -EFAULT )
        goto bad_copy;
    pgp->size = 0;

done:
    /* successfully replaced data, clean up and return success */
    if ( is_shared(pool) )
        obj->last_client = client->cli_id;
    obj->no_evict = 0;
    tmem_spin_unlock(&obj->obj_spinlock);
    pool->dup_puts_replaced++;
    pool->good_puts++;
    return 1;

bad_copy:
    /* this should only happen if the client passed a bad mfn */
    failed_copies++;
    ASSERT(0);
    return -EFAULT;

failed_dup:
    /* couldn't change out the data, flush the old data and return
     * -ENOSPC instead of -ENOMEM to differentiate failed _dup_ put */
    pgpfound = pgp_delete_from_obj(obj, pgp->index);
    ASSERT(pgpfound == pgp);
    pgp_delete(pgpfound,0);
    if ( obj->pgp_count == 0 )
    {
        tmem_write_lock(&pool->pool_rwlock);
        obj_free(obj,0);
        tmem_write_unlock(&pool->pool_rwlock);
    } else {
        obj->no_evict = 0;
        tmem_spin_unlock(&obj->obj_spinlock);
    }
    pool->dup_puts_flushed++;
    return -ENOSPC;
}
static NOINLINE int do_tmem_put(pool_t *pool, uint64_t oid, uint32_t index,
              tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
              uint32_t pfn_offset, uint32_t len)
{
    obj_t *obj = NULL, *objfound = NULL, *objnew = NULL;
    pgp_t *pgp = NULL, *pgpdel = NULL;
    client_t *client = pool->client;
    int ret = client->frozen ? -EFROZEN : -ENOMEM;

    ASSERT(pool != NULL);
    pool->puts++;
    /* does page already exist (dup)? if so, handle specially */
    if ( (obj = objfound = obj_find(pool,oid)) != NULL )
    {
        ASSERT_SPINLOCK(&objfound->obj_spinlock);
        if ((pgp = pgp_lookup_in_obj(objfound, index)) != NULL)
            return do_tmem_dup_put(pgp,cmfn,tmem_offset,pfn_offset,len);
    }

    /* no puts allowed into a frozen pool (except dup puts) */
    if ( client->frozen )
        goto free;

    if ( (objfound == NULL) )
    {
        tmem_write_lock(&pool->pool_rwlock);
        if ( (obj = objnew = obj_new(pool,oid)) == NULL )
        {
            tmem_write_unlock(&pool->pool_rwlock);
            return -ENOMEM;
        }
        ASSERT_SPINLOCK(&objnew->obj_spinlock);
        tmem_write_unlock(&pool->pool_rwlock);
    }

    ASSERT((obj != NULL)&&((objnew==obj)||(objfound==obj))&&(objnew!=objfound));
    ASSERT_SPINLOCK(&obj->obj_spinlock);
    if ( (pgp = pgp_alloc(obj)) == NULL )
        goto free;

    ret = pgp_add_to_obj(obj, index, pgp);
    if ( ret == -ENOMEM )
        /* warning, may result in partially built radix tree ("stump") */
        goto free;
    ASSERT(ret != -EEXIST);
    pgp->index = index;

    if ( len != 0 && tmh_compression_enabled() && client->compress )
    {
        ASSERT(pgp->pfp == NULL);
        ret = do_tmem_put_compress(pgp,cmfn);
        if ( ret == 1 )
            goto insert_page;
        if ( ret == -ENOMEM )
        {
            client->compress_nomem++;
            goto delete_and_free;
        }
        if ( ret == 0 )
        {
            client->compress_poor++;
            goto copy_uncompressed;
        }
        if ( ret == -EFAULT )
            goto bad_copy;
    }

copy_uncompressed:
    if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
    {
        ret = -ENOMEM;
        goto delete_and_free;
    }
    /* tmh_copy_from_client properly handles len==0 (TMEM_NEW_PAGE) */
    ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len);
    if ( ret == -EFAULT )
        goto bad_copy;
    pgp->size = 0;

insert_page:
    if ( is_ephemeral(pool) )
    {
        tmem_spin_lock(&eph_lists_spinlock);
        list_add_tail(&pgp->global_eph_pages,
            &global_ephemeral_page_list);
        if (++global_eph_count > global_eph_count_max)
            global_eph_count_max = global_eph_count;
        list_add_tail(&pgp->client_eph_pages,
            &client->ephemeral_page_list);
        if (++client->eph_count > client->eph_count_max)
            client->eph_count_max = client->eph_count;
        tmem_spin_unlock(&eph_lists_spinlock);
    }
    ASSERT( ((objnew==obj)||(objfound==obj)) && (objnew!=objfound));
    if ( is_shared(pool) )
        obj->last_client = client->cli_id;
    obj->no_evict = 0;
    tmem_spin_unlock(&obj->obj_spinlock);
    pool->good_puts++;
    return 1;

delete_and_free:
    ASSERT((obj != NULL) && (pgp != NULL) && (pgp->index != -1));
    pgpdel = pgp_delete_from_obj(obj, pgp->index);
    ASSERT(pgp == pgpdel);

free:
    if ( pgp )
        pgp_delete(pgp,0);
    if ( objfound )
    {
        objfound->no_evict = 0;
        tmem_spin_unlock(&objfound->obj_spinlock);
    }
    if ( objnew )
    {
        tmem_write_lock(&pool->pool_rwlock);
        obj_free(objnew,0);
        tmem_write_unlock(&pool->pool_rwlock);
    }
    pool->no_mem_puts++;
    return ret;

bad_copy:
    /* this should only happen if the client passed a bad mfn */
    failed_copies++;
    ASSERT(0);
    goto free;
}
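/*
 * Label map for the put path above: insert_page == data landed, link
 * the pgp into the ephemeral LRU lists (if appropriate) and succeed;
 * delete_and_free == undo the radix-tree insert, then fall through to
 * free == release the pgp and whichever object (found or newly
 * created) is still held, bump no_mem_puts, and return the error;
 * bad_copy == the client passed an unreadable page (failed_copies).
 */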
static NOINLINE int do_tmem_get(pool_t *pool, uint64_t oid, uint32_t index,
              tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
              uint32_t pfn_offset, uint32_t len)
{
    obj_t *obj;
    pgp_t *pgp;
    client_t *client = pool->client;
    DECL_LOCAL_CYC_COUNTER(decompress);

    if ( !_atomic_read(pool->pgp_count) )
        return -EEMPTY;

    pool->gets++;
    obj = obj_find(pool,oid);
    if ( obj == NULL )
        return 0;

    ASSERT_SPINLOCK(&obj->obj_spinlock);
    if (is_shared(pool) || is_persistent(pool) )
        pgp = pgp_lookup_in_obj(obj, index);
    else
        pgp = pgp_delete_from_obj(obj, index);
    if ( pgp == NULL )
    {
        obj->no_evict = 0;
        tmem_spin_unlock(&obj->obj_spinlock);
        return 0;
    }
    ASSERT(pgp->size != -1);
    if ( pgp->size != 0 )
    {
        START_CYC_COUNTER(decompress);
        if ( tmh_decompress_to_client(cmfn, pgp->cdata, pgp->size) == -EFAULT )
            goto bad_copy;
        END_CYC_COUNTER(decompress);
    }
    else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset,
                                 pfn_offset, len) == -EFAULT)
        goto bad_copy;
    if ( is_ephemeral(pool) )
    {
        if ( is_private(pool) )
        {
            pgp_delete(pgp,0);
            if ( obj->pgp_count == 0 )
            {
                tmem_write_lock(&pool->pool_rwlock);
                obj_free(obj,0);
                obj = NULL;
                tmem_write_unlock(&pool->pool_rwlock);
            }
        } else {
            tmem_spin_lock(&eph_lists_spinlock);
            list_del(&pgp->global_eph_pages);
            list_add_tail(&pgp->global_eph_pages,&global_ephemeral_page_list);
            list_del(&pgp->client_eph_pages);
            list_add_tail(&pgp->client_eph_pages,&client->ephemeral_page_list);
            tmem_spin_unlock(&eph_lists_spinlock);
            ASSERT(obj != NULL);
            obj->last_client = tmh_get_cli_id_from_current();
        }
    }
    if ( obj != NULL )
    {
        obj->no_evict = 0;
        tmem_spin_unlock(&obj->obj_spinlock);
    }
    pool->found_gets++;
    return 1;

bad_copy:
    /* this should only happen if the client passed a bad mfn */
    failed_copies++;
    ASSERT(0);
    return -EFAULT;
}
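/*
 * Get semantics differ by pool type: for a private ephemeral pool the
 * get is exclusive -- the page is deleted from the object as it is
 * copied out, and an emptied object is freed -- whereas for shared or
 * persistent pools the page is only looked up and, if the pool is
 * ephemeral, rotated to the MRU end of both eviction lists.
 */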
static NOINLINE int do_tmem_flush_page(pool_t *pool, uint64_t oid, uint32_t index)
{
    obj_t *obj;
    pgp_t *pgp;

    pool->flushs++;
    obj = obj_find(pool,oid);
    if ( obj == NULL )
        goto out;
    pgp = pgp_delete_from_obj(obj, index);
    if ( pgp == NULL )
    {
        obj->no_evict = 0;
        tmem_spin_unlock(&obj->obj_spinlock);
        goto out;
    }
    pgp_delete(pgp,0);
    if ( obj->pgp_count == 0 )
    {
        tmem_write_lock(&pool->pool_rwlock);
        obj_free(obj,0);
        tmem_write_unlock(&pool->pool_rwlock);
    } else {
        obj->no_evict = 0;
        tmem_spin_unlock(&obj->obj_spinlock);
    }
    pool->flushs_found++;

out:
    if ( pool->client->frozen )
        return -EFROZEN;
    else
        return 1;
}

static NOINLINE int do_tmem_flush_object(pool_t *pool, uint64_t oid)
{
    obj_t *obj;

    pool->flush_objs++;
    obj = obj_find(pool,oid);
    if ( obj == NULL )
        goto out;
    tmem_write_lock(&pool->pool_rwlock);
    obj_destroy(obj,0);
    pool->flush_objs_found++;
    tmem_write_unlock(&pool->pool_rwlock);

out:
    if ( pool->client->frozen )
        return -EFROZEN;
    else
        return 1;
}
static NOINLINE int do_tmem_destroy_pool(uint32_t pool_id)
{
    client_t *client = tmh_client_from_current();
    pool_t *pool;

    if ( client->pools == NULL )
        return 0;
    if ( (pool = client->pools[pool_id]) == NULL )
        return 0;
    client->pools[pool_id] = NULL;
    pool_flush(pool,client->cli_id,1);
    return 1;
}
static NOINLINE int do_tmem_new_pool(uint32_t flags, uint64_t uuid_lo, uint64_t uuid_hi)
{
    client_t *client = tmh_client_from_current();
    cli_id_t cli_id = tmh_get_cli_id_from_current();
    int persistent = flags & TMEM_POOL_PERSIST;
    int shared = flags & TMEM_POOL_SHARED;
    int pagebits = (flags >> TMEM_POOL_PAGESIZE_SHIFT)
         & TMEM_POOL_PAGESIZE_MASK;
    int specversion = (flags >> TMEM_POOL_VERSION_SHIFT)
         & TMEM_POOL_VERSION_MASK;
    pool_t *pool, *shpool;
    int s_poolid, d_poolid, first_unused_s_poolid;

    ASSERT(client != NULL);
    printk("tmem: allocating %s-%s tmem pool for %s=%d...",
        persistent ? "persistent" : "ephemeral" ,
        shared ? "shared" : "private", cli_id_str, cli_id);
    if ( specversion != 0 )
    {
        printk("failed... unsupported spec version\n");
        return -EPERM;
    }
    if ( pagebits != (PAGE_SHIFT - 12) )
    {
        printk("failed... unsupported pagesize %d\n",1<<(pagebits+12));
        return -EPERM;
    }
    if ( (pool = pool_alloc()) == NULL )
    {
        printk("failed... out of memory\n");
        return -ENOMEM;
    }
    for ( d_poolid = 0; d_poolid < MAX_POOLS_PER_DOMAIN; d_poolid++ )
        if ( client->pools[d_poolid] == NULL )
            break;
    if ( d_poolid == MAX_POOLS_PER_DOMAIN )
    {
        printk("failed... no more pool slots available for this %s\n",
           client_str);
        goto fail;
    }
    pool->shared = shared;
    pool->client = client;
    if ( shared )
    {
        first_unused_s_poolid = MAX_GLOBAL_SHARED_POOLS;
        for ( s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++ )
        {
            if ( (shpool = global_shared_pools[s_poolid]) != NULL )
            {
                if ( shpool->uuid[0] == uuid_lo && shpool->uuid[1] == uuid_hi )
                {
                    printk("(matches shared pool uuid=%"PRIx64".%"PRIx64") ",
                        uuid_hi, uuid_lo);
                    printk("pool_id=%d\n",d_poolid);
                    client->pools[d_poolid] = global_shared_pools[s_poolid];
                    shared_pool_join(global_shared_pools[s_poolid], client);
                    pool_free(pool);
                    return d_poolid;
                }
            }
            else if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
                first_unused_s_poolid = s_poolid;
        }
        if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
        {
            printk("tmem: failed... no global shared pool slots available\n");
            goto fail;
        }
        else
        {
            INIT_LIST_HEAD(&pool->share_list);
            pool->shared_count = 0;
            global_shared_pools[first_unused_s_poolid] = pool;
            (void)shared_pool_join(pool,client);
        }
    }
    client->pools[d_poolid] = pool;
    list_add_tail(&pool->pool_list, &global_pool_list);
    pool->pool_id = d_poolid;
    pool->persistent = persistent;
    pool->uuid[0] = uuid_lo; pool->uuid[1] = uuid_hi;
    printk("pool_id=%d\n",d_poolid);
    return d_poolid;

fail:
    pool_free(pool);
    return -EPERM;
}
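/*
 * Illustrative flags decoding, assuming the conventional tmem ABI
 * encoding in the interface header (TMEM_POOL_PERSIST == bit 0,
 * TMEM_POOL_SHARED == bit 1 -- those definitions are not shown in this
 * file): flags == 0x3 would request a persistent shared pool.  The
 * pagesize field must encode the host page size (field value 0 == 4kB
 * on a PAGE_SHIFT==12 host) and only spec version 0 is accepted;
 * anything else fails with -EPERM above.
 */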
/************ TMEM CONTROL OPERATIONS ************************************/

/* freeze/thaw all pools belonging to client cli_id (all domains if -1) */
static int tmemc_freeze_pools(int cli_id, int arg)
{
    client_t *client;
    bool_t freeze = (arg == TMEMC_FREEZE) ? 1 : 0;
    bool_t destroy = (arg == TMEMC_DESTROY) ? 1 : 0;
    char *s;

    s = destroy ? "destroyed" : ( freeze ? "frozen" : "thawed" );
    if ( cli_id == CLI_ID_NULL )
    {
        list_for_each_entry(client,&global_client_list,client_list)
            client->frozen = freeze;
        printk("tmem: all pools %s for all %ss\n",s,client_str);
    }
    else
    {
        if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
            return -1;
        client->frozen = freeze;
        printk("tmem: all pools %s for %s=%d\n",s,cli_id_str,cli_id);
    }
    return 0;
}

static int tmemc_flush_mem(int cli_id, uint32_t kb)
{
    uint32_t npages, flushed_pages, flushed_kb;

    if ( cli_id != CLI_ID_NULL )
    {
        printk("tmem: %s-specific flush not supported yet, use --all\n",
           client_str);
        return -1;
    }
    /* convert kb to pages, rounding up if necessary */
    npages = (kb + ((1 << (PAGE_SHIFT-10))-1)) >> (PAGE_SHIFT-10);
    flushed_pages = tmem_relinquish_npages(npages);
    flushed_kb = flushed_pages << (PAGE_SHIFT-10);
    return flushed_kb;
}
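/*
 * Arithmetic check for the kb-to-pages conversion: with 4kB pages
 * (PAGE_SHIFT == 12), PAGE_SHIFT-10 == 2, so e.g. kb == 10 gives
 * npages = (10 + 3) >> 2 == 3 pages -- the request is rounded up to
 * whole pages, and the value returned is converted back and hence is
 * always a whole multiple of the page size in kB.
 */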
/*
 * These tmemc_list* routines output lots of stats in a format that is
 * intended to be program-parseable, not human-readable. Further, by
 * tying each group of stats to a line format indicator (e.g. G= for
 * global stats) and each individual stat to a two-letter specifier
 * (e.g. Ec:nnnnn in the G= line says there are nnnnn pages in the
 * global ephemeral pool), it should allow the stats reported to be
 * forward and backwards compatible as tmem evolves.
 */
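/*
 * Key to the G= line emitted by tmemc_list_global() below, read off
 * its format string: Tt/Te = total/errored tmem ops, Cf = failed
 * copies, Af/Pf = failed metadata/page allocations, Ta = pages
 * currently available, Lm = low-on-memory events, Et/Ea = evicted
 * pages/eviction attempts, Rt/Ra = relinquished pages/relinquish
 * attempts, Rx = max evictions in one relinquish, Fp = pool flushes.
 */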
#define BSIZE 1024

static int tmemc_list_client(client_t *c, tmem_cli_va_t buf, int off,
                             uint32_t len, bool_t use_long)
{
    char info[BSIZE];
    int i, n = 0, sum = 0;
    pool_t *p;
    bool_t s;

    n = scnprintf(info,BSIZE,"C=CI:%d,ww:%d,ca:%d,co:%d,fr:%d%c",
        c->cli_id, c->weight, c->cap, c->compress,
        c->frozen, use_long ? ',' : '\n');
    if (use_long)
        n += scnprintf(info+n,BSIZE-n,
             "Ec:%ld,Em:%ld,cp:%ld,cb:%"PRId64",cn:%ld,cm:%ld\n",
             c->eph_count, c->eph_count_max,
             c->compressed_pages, c->compressed_sum_size,
             c->compress_poor, c->compress_nomem);
    tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
    sum += n;
    for ( i = 0; i < MAX_POOLS_PER_DOMAIN; i++ )
    {
        if ( (p = c->pools[i]) == NULL )
            continue;
        s = is_shared(p);
        n = scnprintf(info,BSIZE,"P=CI:%d,PI:%d,"
                      "PT:%c%c,U0:%"PRIx64",U1:%"PRIx64"%c",
                      c->cli_id, p->pool_id,
                      is_persistent(p) ? 'P' : 'E', s ? 'S' : 'P',
                      (uint64_t)(s ? p->uuid[0] : 0),
                      (uint64_t)(s ? p->uuid[1] : 0LL),
                      use_long ? ',' : '\n');
        if (use_long)
            n += scnprintf(info+n,BSIZE-n,
             "Pc:%d,Pm:%d,Oc:%ld,Om:%ld,Nc:%lu,Nm:%lu,"
             "ps:%lu,pt:%lu,pd:%lu,pr:%lu,px:%lu,gs:%lu,gt:%lu,"
             "fs:%lu,ft:%lu,os:%lu,ot:%lu\n",
             _atomic_read(p->pgp_count), p->pgp_count_max,
             p->obj_count, p->obj_count_max,
             p->objnode_count, p->objnode_count_max,
             p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced,
             p->no_mem_puts,
             p->found_gets, p->gets,
             p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs);
        if ( sum + n >= len )
            return sum;
        tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
        sum += n;
    }
    return sum;
}

static int tmemc_list_shared(tmem_cli_va_t buf, int off, uint32_t len,
                             bool_t use_long)
{
    char info[BSIZE];
    int i, n = 0, sum = 0;
    pool_t *p;
    sharelist_t *sl;

    for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++ )
    {
        if ( (p = global_shared_pools[i]) == NULL )
            continue;
        n = scnprintf(info+n,BSIZE-n,"S=SI:%d,PT:%c%c,U0:%"PRIx64",U1:%"PRIx64,
                      i, is_persistent(p) ? 'P' : 'E',
                      is_shared(p) ? 'S' : 'P',
                      p->uuid[0], p->uuid[1]);
        list_for_each_entry(sl,&p->share_list, share_list)
            n += scnprintf(info+n,BSIZE-n,",SC:%d",sl->client->cli_id);
        n += scnprintf(info+n,BSIZE-n,"%c", use_long ? ',' : '\n');
        if (use_long)
            n += scnprintf(info+n,BSIZE-n,
             "Pc:%d,Pm:%d,Oc:%ld,Om:%ld,Nc:%lu,Nm:%lu,"
             "ps:%lu,pt:%lu,pd:%lu,pr:%lu,px:%lu,gs:%lu,gt:%lu,"
             "fs:%lu,ft:%lu,os:%lu,ot:%lu\n",
             _atomic_read(p->pgp_count), p->pgp_count_max,
             p->obj_count, p->obj_count_max,
             p->objnode_count, p->objnode_count_max,
             p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced,
             p->no_mem_puts,
             p->found_gets, p->gets,
             p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs);
        if ( sum + n >= len )
            return sum;
        tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
        sum += n;
    }
    return sum;
}

#ifdef TMEM_PERF
static int tmemc_list_global_perf(tmem_cli_va_t buf, int off, uint32_t len,
                                  bool_t use_long)
{
    char info[BSIZE];
    int n = 0, sum = 0;

    n = scnprintf(info+n,BSIZE-n,"T=");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,succ_get,"G");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,succ_put,"P");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,non_succ_get,"g");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,non_succ_put,"p");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,flush,"F");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,flush_obj,"O");
#ifdef COMPARE_COPY_PAGE_SSE2
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy1,"1");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy2,"2");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy3,"3");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy4,"4");
#else
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy,"C");
#endif
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,compress,"c");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,decompress,"d");
    n--; /* overwrite trailing comma */
    n += scnprintf(info+n,BSIZE-n,"\n");
    if ( sum + n >= len )
        return sum;
    tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
    sum += n;
    return sum;
}
#else
#define tmemc_list_global_perf(_buf,_off,_len,_use) (0)
#endif

static int tmemc_list_global(tmem_cli_va_t buf, int off, uint32_t len,
                             bool_t use_long)
{
    char info[BSIZE];
    int n = 0, sum = off;

    n += scnprintf(info,BSIZE,"G="
      "Tt:%lu,Te:%lu,Cf:%lu,Af:%lu,Pf:%lu,Ta:%lu,"
      "Lm:%lu,Et:%lu,Ea:%lu,Rt:%lu,Ra:%lu,Rx:%lu,Fp:%lu%c",
      total_tmem_ops, errored_tmem_ops, failed_copies,
      alloc_failed, alloc_page_failed, tmh_avail_pages(),
      low_on_memory, evicted_pgs,
      evict_attempts, relinq_pgs, relinq_attempts, max_evicts_per_relinq,
      total_flush_pool, use_long ? ',' : '\n');
    if (use_long)
        n += scnprintf(info+n,BSIZE-n,
          "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d\n",
          global_eph_count, global_eph_count_max,
          _atomic_read(global_obj_count), global_obj_count_max,
          _atomic_read(global_rtree_node_count), global_rtree_node_count_max,
          _atomic_read(global_pgp_count), global_pgp_count_max);
    if ( sum + n >= len )
        return sum;
    tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
    sum += n;
    return sum;
}

static int tmemc_list(int cli_id, tmem_cli_va_t buf, uint32_t len,
                      bool_t use_long)
{
    client_t *client;
    int off = 0;

    if ( cli_id == CLI_ID_NULL ) {
        off = tmemc_list_global(buf,0,len,use_long);
        off += tmemc_list_shared(buf,off,len-off,use_long);
        list_for_each_entry(client,&global_client_list,client_list)
            off += tmemc_list_client(client, buf, off, len-off, use_long);
        off += tmemc_list_global_perf(buf,off,len-off,use_long);
    }
    else if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
        return -1;
    else
        off = tmemc_list_client(client, buf, 0, len, use_long);

    return 0;
}

static int tmemc_set_var_one(client_t *client, uint32_t subop, uint32_t arg1)
{
    cli_id_t cli_id = client->cli_id;
    uint32_t old_weight;

    switch (subop)
    {
    case TMEMC_SET_WEIGHT:
        old_weight = client->weight;
        client->weight = arg1;
        printk("tmem: weight set to %d for %s=%d\n",arg1,cli_id_str,cli_id);
        atomic_sub(old_weight,&client_weight_total);
        atomic_add(client->weight,&client_weight_total);
        break;
    case TMEMC_SET_CAP:
        client->cap = arg1;
        printk("tmem: cap set to %d for %s=%d\n",arg1,cli_id_str,cli_id);
        break;
    case TMEMC_SET_COMPRESS:
        client->compress = arg1 ? 1 : 0;
        printk("tmem: compression %s for %s=%d\n",
            arg1 ? "enabled" : "disabled",cli_id_str,cli_id);
        break;
    default:
        printk("tmem: unknown subop %d for tmemc_set_var\n",subop);
        return -1;
    }
    return 0;
}

static int tmemc_set_var(int cli_id, uint32_t subop, uint32_t arg1)
{
    client_t *client;

    if ( cli_id == CLI_ID_NULL )
        list_for_each_entry(client,&global_client_list,client_list)
            tmemc_set_var_one(client, subop, arg1);
    else if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
        return -1;
    else
        tmemc_set_var_one(client, subop, arg1);
    return 0;
}

static int do_tmem_control(uint32_t subop, uint32_t cli_id32,
   uint32_t arg1, uint32_t arg2, tmem_cli_va_t buf)
{
    int ret;
    cli_id_t cli_id = (cli_id_t)cli_id32;

    if (!tmh_current_is_privileged())
    {
        /* don't fail... mystery: sometimes dom0 fails here */
        /* return -EPERM; */
    }
    switch(subop)
    {
    case TMEMC_THAW:
    case TMEMC_FREEZE:
    case TMEMC_DESTROY:
        ret = tmemc_freeze_pools(cli_id,subop);
        break;
    case TMEMC_FLUSH:
        ret = tmemc_flush_mem(cli_id,arg1);
        break;
    case TMEMC_LIST:
        ret = tmemc_list(cli_id,buf,arg1,arg2);
        break;
    case TMEMC_SET_WEIGHT:
    case TMEMC_SET_CAP:
    case TMEMC_SET_COMPRESS:
        ret = tmemc_set_var(cli_id,subop,arg1);
        break;
    default:
        ret = -1;
    }
    return ret;
}
/************ EXPORTed FUNCTIONS **************************************/

EXPORT long do_tmem_op(tmem_cli_op_t uops)
{
    struct tmem_op op;
    client_t *client = tmh_client_from_current();
    pool_t *pool = NULL;
    int rc = 0;
    bool_t succ_get = 0, succ_put = 0;
    bool_t non_succ_get = 0, non_succ_put = 0;
    bool_t flush = 0, flush_obj = 0;
    bool_t tmem_write_lock_set = 0, tmem_read_lock_set = 0;
    static bool_t warned = 0;
    DECL_LOCAL_CYC_COUNTER(succ_get);
    DECL_LOCAL_CYC_COUNTER(succ_put);
    DECL_LOCAL_CYC_COUNTER(non_succ_get);
    DECL_LOCAL_CYC_COUNTER(non_succ_put);
    DECL_LOCAL_CYC_COUNTER(flush);
    DECL_LOCAL_CYC_COUNTER(flush_obj);

    if ( !tmem_initialized )
    {
        if ( !warned )
            printk("tmem: must specify tmem parameter on xen boot line\n");
        warned = 1;
        return -ENODEV;
    }

    total_tmem_ops++;

    if ( tmh_lock_all )
    {
        if ( tmh_lock_all > 1 )
            spin_lock_irq(&tmem_spinlock);
        else
            spin_lock(&tmem_spinlock);
    }

    START_CYC_COUNTER(succ_get);
    DUP_START_CYC_COUNTER(succ_put,succ_get);
    DUP_START_CYC_COUNTER(non_succ_get,succ_get);
    DUP_START_CYC_COUNTER(non_succ_put,succ_get);
    DUP_START_CYC_COUNTER(flush,succ_get);
    DUP_START_CYC_COUNTER(flush_obj,succ_get);

    if ( unlikely(tmh_get_tmemop_from_client(&op, uops) != 0) )
    {
        printk("tmem: can't get tmem struct from %s\n",client_str);
        rc = -EFAULT;
        goto out;
    }

    if ( op.cmd == TMEM_CONTROL )
    {
        tmem_write_lock(&tmem_rwlock);
        tmem_write_lock_set = 1;
        rc = do_tmem_control(op.u.ctrl.subop, op.u.ctrl.cli_id,
                             op.u.ctrl.arg1, op.u.ctrl.arg2, op.u.ctrl.buf);
        goto out;
    }

    /* create per-client tmem structure dynamically on first use by client */
    if ( client == NULL )
    {
        tmem_write_lock(&tmem_rwlock);
        tmem_write_lock_set = 1;
        if ( (client = client_create()) == NULL )
        {
            printk("tmem: can't create tmem structure for %s\n",client_str);
            rc = -ENOMEM;
            goto out;
        }
    }

    if ( op.cmd == TMEM_NEW_POOL || op.cmd == TMEM_DESTROY_POOL )
    {
        if ( !tmem_write_lock_set )
        {
            tmem_write_lock(&tmem_rwlock);
            tmem_write_lock_set = 1;
        }
    }
    else
    {
        if ( !tmem_write_lock_set )
        {
            tmem_read_lock(&tmem_rwlock);
            tmem_read_lock_set = 1;
        }
        if ( ((uint32_t)op.pool_id >= MAX_POOLS_PER_DOMAIN) ||
             ((pool = client->pools[op.pool_id]) == NULL) )
        {
            rc = -ENODEV;
            printk("tmem: operation requested on uncreated pool\n");
            goto out;
        }
        ASSERT_SENTINEL(pool,POOL);
    }

    switch ( op.cmd )
    {
    case TMEM_NEW_POOL:
        rc = do_tmem_new_pool(op.u.new.flags,
                              op.u.new.uuid[0], op.u.new.uuid[1]);
        break;
    case TMEM_NEW_PAGE:
        rc = do_tmem_put(pool, op.u.gen.object, op.u.gen.index, op.u.gen.cmfn,
                         0, 0, 0);
        break;
    case TMEM_PUT_PAGE:
        rc = do_tmem_put(pool, op.u.gen.object, op.u.gen.index, op.u.gen.cmfn,
                         0, 0, PAGE_SIZE);
        if (rc == 1) succ_put = 1;
        else non_succ_put = 1;
        break;
    case TMEM_GET_PAGE:
        rc = do_tmem_get(pool, op.u.gen.object, op.u.gen.index, op.u.gen.cmfn,
                         0, 0, PAGE_SIZE);
        if (rc == 1) succ_get = 1;
        else non_succ_get = 1;
        break;
    case TMEM_FLUSH_PAGE:
        flush = 1;
        rc = do_tmem_flush_page(pool, op.u.gen.object, op.u.gen.index);
        break;
    case TMEM_FLUSH_OBJECT:
        rc = do_tmem_flush_object(pool, op.u.gen.object);
        flush_obj = 1;
        break;
    case TMEM_DESTROY_POOL:
        flush = 1;
        rc = do_tmem_destroy_pool(op.pool_id);
        break;
    case TMEM_READ:
        rc = do_tmem_get(pool, op.u.gen.object, op.u.gen.index, op.u.gen.cmfn,
                         op.u.gen.tmem_offset, op.u.gen.pfn_offset,
                         op.u.gen.len);
        break;
    case TMEM_WRITE:
        rc = do_tmem_put(pool, op.u.gen.object, op.u.gen.index, op.u.gen.cmfn,
                         op.u.gen.tmem_offset, op.u.gen.pfn_offset,
                         op.u.gen.len);
        break;
    case TMEM_XCHG:
        /* need to hold global lock to ensure xchg is atomic */
        printk("tmem_xchg op not implemented yet\n");
        rc = 0;
        break;
    default:
        printk("tmem: op %d not implemented\n", op.cmd);
        rc = 0;
        break;
    }

out:
    if ( rc < 0 )
        errored_tmem_ops++;
    if ( succ_get )
        END_CYC_COUNTER(succ_get);
    else if ( succ_put )
        END_CYC_COUNTER(succ_put);
    else if ( non_succ_get )
        END_CYC_COUNTER(non_succ_get);
    else if ( non_succ_put )
        END_CYC_COUNTER(non_succ_put);
    else if ( flush )
        END_CYC_COUNTER(flush);
    else
        END_CYC_COUNTER(flush_obj);

    if ( tmh_lock_all )
    {
        if ( tmh_lock_all > 1 )
            spin_unlock_irq(&tmem_spinlock);
        else
            spin_unlock(&tmem_spinlock);
    } else {
        if ( tmem_write_lock_set )
            write_unlock(&tmem_rwlock);
        else if ( tmem_read_lock_set )
            read_unlock(&tmem_rwlock);
        else
            ASSERT(0);
    }

    return rc;
}
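/*
 * Locking in do_tmem_op(), summarized: in lock-all mode the whole
 * hypercall runs under tmem_spinlock (irq-safe when tmh_lock_all > 1).
 * Otherwise TMEM_CONTROL, first-use client creation, TMEM_NEW_POOL and
 * TMEM_DESTROY_POOL take tmem_rwlock for writing, while the per-page
 * operations take it only for reading and rely on the pool and object
 * locks taken in the do_tmem_* helpers.
 */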
/* this should be called when the host is destroying a client */
EXPORT void tmem_destroy(void *v)
{
    client_t *client = (client_t *)v;

    if ( client == NULL )
        return;

    if ( tmh_lock_all )
        spin_lock(&tmem_spinlock);
    else
        write_lock(&tmem_rwlock);

    printk("tmem: flushing tmem pools for %s=%d\n",
        cli_id_str, client->cli_id);
    client_flush(client, 1);

    if ( tmh_lock_all )
        spin_unlock(&tmem_spinlock);
    else
        write_unlock(&tmem_rwlock);
}

/* freezing all pools guarantees that no additional memory will be consumed */
EXPORT void tmem_freeze_all(unsigned char key)
{
    static int freeze = 0;

    if ( tmh_lock_all )
        spin_lock(&tmem_spinlock);
    else
        write_lock(&tmem_rwlock);

    freeze = !freeze;
    tmemc_freeze_pools(CLI_ID_NULL,freeze);

    if ( tmh_lock_all )
        spin_unlock(&tmem_spinlock);
    else
        write_unlock(&tmem_rwlock);
}

#define MAX_EVICTS 10  /* should be variable or set via TMEMC_ ?? */

EXPORT void *tmem_relinquish_pages(unsigned int order, unsigned int memflags)
{
    pfp_t *pfp;
    unsigned long evicts_per_relinq = 0;
    int max_evictions = 10;

    if (!tmh_enabled())
        return NULL;
#ifdef __i386__
    return NULL;
#endif

    relinq_attempts++;
    if ( order > 0 )
    {
        printk("tmem_relinquish_page: failing order=%d\n", order);
        return NULL;
    }

    if ( tmh_called_from_tmem(memflags) )
    {
        if ( tmh_lock_all )
            spin_lock(&tmem_spinlock);
        else
            read_lock(&tmem_rwlock);
    }

    while ( (pfp = tmh_alloc_page(NULL,1)) == NULL )
    {
        if ( (max_evictions-- <= 0) || !tmem_evict())
            break;
        evicts_per_relinq++;
    }
    if ( evicts_per_relinq > max_evicts_per_relinq )
        max_evicts_per_relinq = evicts_per_relinq;
    tmh_scrub_page(pfp, memflags);
    if ( pfp != NULL )
        relinq_pgs++;

    if ( tmh_called_from_tmem(memflags) )
    {
        if ( tmh_lock_all )
            spin_unlock(&tmem_spinlock);
        else
            read_unlock(&tmem_rwlock);
    }

    return pfp;
}

/* called at hypervisor startup */
EXPORT void init_tmem(void)
{
    if ( !tmh_enabled() )
        return;

    radix_tree_init();
    if ( tmh_init() )
    {
        printk("tmem: initialized comp=%d global-lock=%d\n",
            tmh_compression_enabled(), tmh_lock_all);
        tmem_initialized = 1;
    }
    else
        printk("tmem: initialization FAILED\n");
}
/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */