ia64/xen-unstable

view xen/common/tmem.c @ 19686:50134a902c66

tmem: fix corner case crash on forcible domain destruction

When a tmem-enabled domain is destroyed, if the domain was
using a persistent pool, the domain destruction process
that scrubs pages races with tmem's attempts to gracefully
dismantle its data structures. Move tmem_destroy earlier in
the domain destruction process.

Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Jun 01 14:07:46 2009 +0100 (2009-06-01)
parents fcc71d023408
children 4294a04b24bc
line source
1 /******************************************************************************
2 * tmem.c
3 *
4 * Transcendent memory
5 *
6 * Copyright (c) 2009, Dan Magenheimer, Oracle Corp.
7 */
9 /* TODO list: 090129
10 - improve on reclamation policy
11 - use different tlsf pools for each client (maybe each pool)
12 - implement page accounting and minimal QoS limits
13 - test shared access more completely (need pv cluster fs)
14 - add feedback-driven compression (not for persistent pools though!)
15 - add data-structure total bytes overhead stats
16 */
18 #ifdef __XEN__
19 #include <xen/tmem_xen.h> /* host-specific (eg Xen) code goes here */
20 #endif
22 #include <xen/tmem.h>
23 #include <xen/rbtree.h>
24 #include <xen/radix-tree.h>
25 #include <xen/list.h>
27 #define EXPORT /* indicates code other modules are dependent upon */
28 #define FORWARD
30 /************ INTERFACE TO TMEM HOST-DEPENDENT (tmh) CODE ************/
/* aliases so the generic tmem code need not spell out the tmh_ names */
32 #define CLI_ID_NULL TMH_CLI_ID_NULL
33 #define cli_id_str tmh_cli_id_str
34 #define client_str tmh_client_str
36 /************ DEBUG and STATISTICS (+ some compression testing) *******/
/* debug builds plant per-structure sentinel words and disable inlining
 * so memory corruption is caught close to where it happens */
38 #ifndef NDEBUG
39 #define SENTINELS
40 #define NOINLINE noinline
41 #else
42 #define NOINLINE
43 #endif
45 #ifdef SENTINELS
46 #define DECL_SENTINEL unsigned long sentinel;
47 #define SET_SENTINEL(_x,_y) _x->sentinel = _y##_SENTINEL
48 #define INVERT_SENTINEL(_x,_y) _x->sentinel = ~_y##_SENTINEL
/* first ASSERT distinguishes "freed" (inverted) from "never initialized" */
49 #define ASSERT_SENTINEL(_x,_y) \
50 ASSERT(_x->sentinel != ~_y##_SENTINEL);ASSERT(_x->sentinel == _y##_SENTINEL)
51 #ifdef __i386__
52 #define POOL_SENTINEL 0x87658765
53 #define OBJ_SENTINEL 0x12345678
54 #define OBJNODE_SENTINEL 0xfedcba09
55 #define PGD_SENTINEL 0x43214321
56 #else
57 #define POOL_SENTINEL 0x8765876587658765
58 #define OBJ_SENTINEL 0x1234567812345678
59 #define OBJNODE_SENTINEL 0xfedcba0987654321
60 #define PGD_SENTINEL 0x4321432143214321
61 #endif
62 #else
63 #define DECL_SENTINEL
64 #define SET_SENTINEL(_x,_y) do { } while (0)
65 #define ASSERT_SENTINEL(_x,_y) do { } while (0)
66 #define INVERT_SENTINEL(_x,_y) do { } while (0)
67 #endif
69 /* global statistics (none need to be locked) */
70 static unsigned long total_tmem_ops = 0;
71 static unsigned long errored_tmem_ops = 0;
72 static unsigned long total_flush_pool = 0;
73 static unsigned long alloc_failed = 0, alloc_page_failed = 0;
74 static unsigned long evicted_pgs = 0, evict_attempts = 0;
75 static unsigned long relinq_pgs = 0, relinq_attempts = 0;
76 static unsigned long max_evicts_per_relinq = 0;
77 static unsigned long low_on_memory = 0;
78 static int global_obj_count_max = 0;
79 static int global_pgp_count_max = 0;
80 static int global_page_count_max = 0;
81 static int global_rtree_node_count_max = 0;
82 static long global_eph_count_max = 0;
83 static unsigned long failed_copies;
/* cycle counters profiling the major tmem operations */
85 DECL_CYC_COUNTER(succ_get);
86 DECL_CYC_COUNTER(succ_put);
87 DECL_CYC_COUNTER(non_succ_get);
88 DECL_CYC_COUNTER(non_succ_put);
89 DECL_CYC_COUNTER(flush);
90 DECL_CYC_COUNTER(flush_obj);
91 #ifdef COMPARE_COPY_PAGE_SSE2
92 EXTERN_CYC_COUNTER(pg_copy1);
93 EXTERN_CYC_COUNTER(pg_copy2);
94 EXTERN_CYC_COUNTER(pg_copy3);
95 EXTERN_CYC_COUNTER(pg_copy4);
96 #else
97 EXTERN_CYC_COUNTER(pg_copy);
98 #endif
99 DECL_CYC_COUNTER(compress);
100 DECL_CYC_COUNTER(decompress);
102 /************ CORE DATA STRUCTURES ************************************/
104 #define MAX_POOLS_PER_DOMAIN 16
105 #define MAX_GLOBAL_SHARED_POOLS 16
107 struct tm_pool;
/* per-domain tmem state: the domain's pools plus ephemeral-page accounting */
108 struct client {
109 struct list_head client_list;
110 struct tm_pool *pools[MAX_POOLS_PER_DOMAIN];
111 tmh_client_t *tmh;
112 struct list_head ephemeral_page_list;
113 long eph_count, eph_count_max;
114 cli_id_t cli_id;
115 uint32_t weight;
116 uint32_t cap;
117 bool_t compress;
118 bool_t frozen;
119 unsigned long compress_poor, compress_nomem;
120 unsigned long compressed_pages;
121 uint64_t compressed_sum_size;
122 };
123 typedef struct client client_t;
/* one entry per client participating in a shared pool */
125 struct share_list {
126 struct list_head share_list;
127 client_t *client;
128 };
129 typedef struct share_list sharelist_t;
131 #define OBJ_HASH_BUCKETS 256 /* must be power of two */
132 #define OBJ_HASH_BUCKETS_MASK (OBJ_HASH_BUCKETS-1)
133 #define OBJ_HASH(_oid) (tmh_hash(_oid, BITS_PER_LONG) & OBJ_HASH_BUCKETS_MASK)
/* a pool: objects hashed into per-bucket rb-trees, plus statistics */
135 struct tm_pool {
136 bool_t shared;
137 bool_t persistent;
138 struct list_head pool_list; /* FIXME do we need this anymore? */
139 client_t *client;
140 uint64_t uuid[2]; /* 0 for private, non-zero for shared */
141 uint32_t pool_id;
142 rwlock_t pool_rwlock;
143 struct rb_root obj_rb_root[OBJ_HASH_BUCKETS]; /* protected by pool_rwlock */
144 struct list_head share_list; /* valid if shared */
145 DECL_SENTINEL
146 int shared_count; /* valid if shared */
147 atomic_t pgp_count;
148 int pgp_count_max;
149 long obj_count; /* atomicity depends on pool_rwlock held for write */
150 long obj_count_max;
151 unsigned long objnode_count, objnode_count_max;
152 uint64_t sum_life_cycles;
153 uint64_t sum_evicted_cycles;
154 unsigned long puts, good_puts, no_mem_puts;
155 unsigned long dup_puts_flushed, dup_puts_replaced;
156 unsigned long gets, found_gets;
157 unsigned long flushs, flushs_found;
158 unsigned long flush_objs, flush_objs_found;
159 };
160 typedef struct tm_pool pool_t;
162 #define is_persistent(_p) (_p->persistent)
163 #define is_ephemeral(_p) (!(_p->persistent))
164 #define is_shared(_p) (_p->shared)
165 #define is_private(_p) (!(_p->shared))
/* an object: a radix tree of page descriptors, keyed by a 64-bit oid */
167 struct tmem_object_root {
168 DECL_SENTINEL
169 uint64_t oid;
170 struct rb_node rb_tree_node; /* protected by pool->pool_rwlock */
171 unsigned long objnode_count; /* atomicity depends on obj_spinlock */
172 long pgp_count; /* atomicity depends on obj_spinlock */
173 struct radix_tree_root tree_root; /* tree of pages within object */
174 pool_t *pool;
175 cli_id_t last_client;
176 spinlock_t obj_spinlock;
177 bool_t no_evict; /* if globally locked, pseudo-locks against eviction */
178 };
179 typedef struct tmem_object_root obj_t;
181 typedef struct radix_tree_node rtn_t;
/* wrapper pairing a radix-tree node with its owning object */
182 struct tmem_object_node {
183 obj_t *obj;
184 DECL_SENTINEL
185 rtn_t rtn;
186 };
187 typedef struct tmem_object_node objnode_t;
/* one stored page: either a raw page frame (pfp) or compressed data (cdata) */
189 struct tmem_page_descriptor {
190 struct list_head global_eph_pages;
191 struct list_head client_eph_pages;
192 obj_t *obj;
193 uint32_t index;
194 size_t size; /* 0 == PAGE_SIZE (pfp), else compressed data (cdata) */
195 union {
196 pfp_t *pfp; /* page frame pointer */
197 char *cdata; /* compressed data */
198 };
199 uint64_t timestamp;
200 DECL_SENTINEL
201 };
202 typedef struct tmem_page_descriptor pgp_t;
204 static LIST_HEAD(global_ephemeral_page_list); /* all pages in ephemeral pools */
206 static LIST_HEAD(global_client_list);
207 static LIST_HEAD(global_pool_list);
209 static pool_t *global_shared_pools[MAX_GLOBAL_SHARED_POOLS] = { 0 };
210 static atomic_t client_weight_total = ATOMIC_INIT(0);
211 static int tmem_initialized = 0;
213 /************ CONCURRENCY ***********************************************/
/* tmh_lock_all selects one big global lock instead of fine-grained
 * locking; the wrappers below become no-ops in that mode */
215 EXPORT DEFINE_SPINLOCK(tmem_spinlock); /* used iff tmh_lock_all */
216 EXPORT DEFINE_RWLOCK(tmem_rwlock); /* used iff !tmh_lock_all */
217 static DEFINE_SPINLOCK(eph_lists_spinlock); /* protects global AND clients */
219 #define tmem_spin_lock(_l) do {if (!tmh_lock_all) spin_lock(_l);}while(0)
220 #define tmem_spin_unlock(_l) do {if (!tmh_lock_all) spin_unlock(_l);}while(0)
221 #define tmem_read_lock(_l) do {if (!tmh_lock_all) read_lock(_l);}while(0)
222 #define tmem_read_unlock(_l) do {if (!tmh_lock_all) read_unlock(_l);}while(0)
223 #define tmem_write_lock(_l) do {if (!tmh_lock_all) write_lock(_l);}while(0)
224 #define tmem_write_unlock(_l) do {if (!tmh_lock_all) write_unlock(_l);}while(0)
225 #define tmem_write_trylock(_l) ((tmh_lock_all)?1:write_trylock(_l))
226 #define tmem_spin_trylock(_l) (tmh_lock_all?1:spin_trylock(_l))
228 #define ASSERT_SPINLOCK(_l) ASSERT(tmh_lock_all || spin_is_locked(_l))
229 #define ASSERT_WRITELOCK(_l) ASSERT(tmh_lock_all || rw_is_write_locked(_l))
231 /* global counters (should use long_atomic_t access) */
232 static long global_eph_count = 0; /* atomicity depends on eph_lists_spinlock */
233 static atomic_t global_obj_count = ATOMIC_INIT(0);
234 static atomic_t global_pgp_count = ATOMIC_INIT(0);
235 static atomic_t global_page_count = ATOMIC_INIT(0);
236 static atomic_t global_rtree_node_count = ATOMIC_INIT(0);
/* bump an atomic counter and track its high-water mark in _c##_max */
238 #define atomic_inc_and_max(_c) do { \
239 atomic_inc(&_c); \
240 if ( _atomic_read(_c) > _c##_max ) \
241 _c##_max = _atomic_read(_c); \
242 } while (0)
/* decrement an atomic counter, asserting it never goes negative */
244 #define atomic_dec_and_assert(_c) do { \
245 atomic_dec(&_c); \
246 ASSERT(_atomic_read(_c) >= 0); \
247 } while (0)
250 /************ MEMORY ALLOCATION INTERFACE *****************************/
/* typed and byte-sized allocation wrappers over _tmem_malloc */
252 #define tmem_malloc(_type,_pool) \
253 _tmem_malloc(sizeof(_type), __alignof__(_type), _pool)
255 #define tmem_malloc_bytes(_size,_pool) \
256 _tmem_malloc(_size, 1, _pool)
258 static NOINLINE void *_tmem_malloc(size_t size, size_t align, pool_t *pool)
259 {
260 void *v;
262 if ( (pool != NULL) && is_persistent(pool) )
263 v = tmh_alloc_subpage_thispool(pool,size,align);
264 else
265 v = tmh_alloc_subpage(pool, size, align);
266 if ( v == NULL )
267 alloc_failed++;
268 return v;
269 }
271 static NOINLINE void tmem_free(void *p, size_t size, pool_t *pool)
272 {
273 if ( pool == NULL || !is_persistent(pool) )
274 tmh_free_subpage(p,size);
275 else
276 tmh_free_subpage_thispool(pool,p,size);
277 }
279 static NOINLINE pfp_t *tmem_page_alloc(pool_t *pool)
280 {
281 pfp_t *pfp = NULL;
283 if ( pool != NULL && is_persistent(pool) )
284 pfp = tmh_alloc_page_thispool(pool);
285 else
286 pfp = tmh_alloc_page(pool,0);
287 if ( pfp == NULL )
288 alloc_page_failed++;
289 else
290 atomic_inc_and_max(global_page_count);
291 return pfp;
292 }
294 static NOINLINE void tmem_page_free(pool_t *pool, pfp_t *pfp)
295 {
296 ASSERT(pfp);
297 if ( pool == NULL || !is_persistent(pool) )
298 tmh_free_page(pfp);
299 else
300 tmh_free_page_thispool(pool,pfp);
301 atomic_dec_and_assert(global_page_count);
302 }
304 /************ PAGE DESCRIPTOR MANIPULATION ROUTINES *******************/
306 /* allocate a pgp_t and associate it with an object */
307 static NOINLINE pgp_t *pgp_alloc(obj_t *obj)
308 {
309 pgp_t *pgp;
310 pool_t *pool;
312 ASSERT(obj != NULL);
313 ASSERT(obj->pool != NULL);
314 pool = obj->pool;
315 if ( (pgp = tmem_malloc(pgp_t, pool)) == NULL )
316 return NULL;
317 pgp->obj = obj;
318 INIT_LIST_HEAD(&pgp->global_eph_pages);
319 INIT_LIST_HEAD(&pgp->client_eph_pages);
320 pgp->pfp = NULL;
321 pgp->size = -1;
322 pgp->index = -1;
323 pgp->timestamp = get_cycles();
324 SET_SENTINEL(pgp,PGD);
325 atomic_inc_and_max(global_pgp_count);
326 atomic_inc_and_max(pool->pgp_count);
327 return pgp;
328 }
/* Look up the page descriptor at index within obj's radix tree; returns
 * NULL if no page is stored at that index.  Caller must hold
 * obj->obj_spinlock (or the global lock under tmh_lock_all). */
330 static pgp_t *pgp_lookup_in_obj(obj_t *obj, uint32_t index)
331 {
332 ASSERT(obj != NULL);
333 ASSERT_SPINLOCK(&obj->obj_spinlock);
334 ASSERT_SENTINEL(obj,OBJ);
335 ASSERT(obj->pool != NULL);
336 ASSERT_SENTINEL(obj->pool,POOL);
337 return radix_tree_lookup(&obj->tree_root, index);
338 }
/* Free the data (raw page or compressed buffer) attached to pgp, leaving
 * the descriptor itself allocated; afterwards pfp == NULL and size == -1.
 * size == 0 means a full raw page; nonzero means compressed data. */
340 static NOINLINE void pgp_free_data(pgp_t *pgp, pool_t *pool)
341 {
342 if ( pgp->pfp == NULL )
343 return;
344 if ( !pgp->size )
/* NOTE(review): the raw-page path frees against pgp->obj->pool while the
 * compressed path uses the pool argument -- presumably these are always
 * the same pool; confirm against callers */
345 tmem_page_free(pgp->obj->pool,pgp->pfp);
346 else
347 {
348 tmem_free(pgp->cdata,pgp->size,pool);
349 if ( pool != NULL )
350 {
/* keep the owning client's compression accounting in sync */
351 pool->client->compressed_pages--;
352 pool->client->compressed_sum_size -= pgp->size;
353 }
354 }
355 pgp->pfp = NULL;
356 pgp->size = -1;
357 }
359 static NOINLINE void pgp_free(pgp_t *pgp, int from_delete)
360 {
361 pool_t *pool = NULL;
363 ASSERT_SENTINEL(pgp,PGD);
364 ASSERT(pgp->obj != NULL);
365 ASSERT_SENTINEL(pgp->obj,OBJ);
366 ASSERT_SENTINEL(pgp->obj->pool,POOL);
367 ASSERT(list_empty(&pgp->global_eph_pages));
368 ASSERT(list_empty(&pgp->client_eph_pages));
369 if ( from_delete )
370 ASSERT(pgp_lookup_in_obj(pgp->obj,pgp->index) == NULL);
371 ASSERT(pgp->obj->pool != NULL);
372 pool = pgp->obj->pool;
373 pgp_free_data(pgp, pool);
374 INVERT_SENTINEL(pgp,PGD);
375 pgp->obj = NULL;
376 pgp->index = -1;
377 pgp->size = -1;
378 atomic_dec_and_assert(global_pgp_count);
379 atomic_dec_and_assert(pool->pgp_count);
380 tmem_free(pgp,sizeof(pgp_t),pool);
381 }
383 /* remove the page from appropriate lists but not from parent object */
384 static void pgp_delist(pgp_t *pgp, bool_t no_eph_lock)
385 {
386 ASSERT(pgp != NULL);
387 ASSERT(pgp->obj != NULL);
388 ASSERT(pgp->obj->pool != NULL);
389 ASSERT(pgp->obj->pool->client != NULL);
390 if ( is_ephemeral(pgp->obj->pool) )
391 {
392 if ( !no_eph_lock )
393 tmem_spin_lock(&eph_lists_spinlock);
394 if ( !list_empty(&pgp->client_eph_pages) )
395 pgp->obj->pool->client->eph_count--;
396 ASSERT(pgp->obj->pool->client->eph_count >= 0);
397 list_del_init(&pgp->client_eph_pages);
398 if ( !list_empty(&pgp->global_eph_pages) )
399 global_eph_count--;
400 ASSERT(global_eph_count >= 0);
401 list_del_init(&pgp->global_eph_pages);
402 if ( !no_eph_lock )
403 tmem_spin_unlock(&eph_lists_spinlock);
404 }
405 }
407 /* remove page from lists (but not from parent object) and free it */
408 static NOINLINE void pgp_delete(pgp_t *pgp, bool_t no_eph_lock)
409 {
410 uint64_t life;
412 ASSERT(pgp != NULL);
413 ASSERT(pgp->obj != NULL);
414 ASSERT(pgp->obj->pool != NULL);
415 life = get_cycles() - pgp->timestamp;
416 pgp->obj->pool->sum_life_cycles += life;
417 pgp_delist(pgp, no_eph_lock);
418 pgp_free(pgp,1);
419 }
421 /* called only indirectly by radix_tree_destroy */
422 static NOINLINE void pgp_destroy(void *v)
423 {
424 pgp_t *pgp = (pgp_t *)v;
426 ASSERT_SPINLOCK(&pgp->obj->obj_spinlock);
427 pgp_delist(pgp,0);
428 ASSERT(pgp->obj != NULL);
429 pgp->obj->pgp_count--;
430 ASSERT(pgp->obj->pgp_count >= 0);
431 pgp_free(pgp,0);
432 }
434 FORWARD static rtn_t *rtn_alloc(void *arg);
435 FORWARD static void rtn_free(rtn_t *rtn);
437 static int pgp_add_to_obj(obj_t *obj, uint32_t index, pgp_t *pgp)
438 {
439 int ret;
441 ASSERT_SPINLOCK(&obj->obj_spinlock);
442 ret = radix_tree_insert(&obj->tree_root, index, pgp, rtn_alloc, obj);
443 if ( !ret )
444 obj->pgp_count++;
445 return ret;
446 }
448 static NOINLINE pgp_t *pgp_delete_from_obj(obj_t *obj, uint32_t index)
449 {
450 pgp_t *pgp;
452 ASSERT(obj != NULL);
453 ASSERT_SPINLOCK(&obj->obj_spinlock);
454 ASSERT_SENTINEL(obj,OBJ);
455 ASSERT(obj->pool != NULL);
456 ASSERT_SENTINEL(obj->pool,POOL);
457 pgp = radix_tree_delete(&obj->tree_root, index, rtn_free);
458 if ( pgp != NULL )
459 obj->pgp_count--;
460 ASSERT(obj->pgp_count >= 0);
462 return pgp;
463 }
465 /************ RADIX TREE NODE MANIPULATION ROUTINES *******************/
467 /* called only indirectly from radix_tree_insert */
468 static NOINLINE rtn_t *rtn_alloc(void *arg)
469 {
470 objnode_t *objnode;
471 obj_t *obj = (obj_t *)arg;
473 ASSERT_SENTINEL(obj,OBJ);
474 ASSERT(obj->pool != NULL);
475 ASSERT_SENTINEL(obj->pool,POOL);
476 objnode = tmem_malloc(objnode_t,obj->pool);
477 if (objnode == NULL)
478 return NULL;
479 objnode->obj = obj;
480 SET_SENTINEL(objnode,OBJNODE);
481 memset(&objnode->rtn, 0, sizeof(rtn_t));
482 if (++obj->pool->objnode_count > obj->pool->objnode_count_max)
483 obj->pool->objnode_count_max = obj->pool->objnode_count;
484 atomic_inc_and_max(global_rtree_node_count);
485 obj->objnode_count++;
486 return &objnode->rtn;
487 }
489 /* called only indirectly from radix_tree_delete/destroy */
490 static void rtn_free(rtn_t *rtn)
491 {
492 pool_t *pool;
493 objnode_t *objnode;
494 int i;
496 ASSERT(rtn != NULL);
497 for (i = 0; i < RADIX_TREE_MAP_SIZE; i++)
498 ASSERT(rtn->slots[i] == NULL);
499 objnode = container_of(rtn,objnode_t,rtn);
500 ASSERT_SENTINEL(objnode,OBJNODE);
501 INVERT_SENTINEL(objnode,OBJNODE);
502 ASSERT(objnode->obj != NULL);
503 ASSERT_SPINLOCK(&objnode->obj->obj_spinlock);
504 ASSERT_SENTINEL(objnode->obj,OBJ);
505 pool = objnode->obj->pool;
506 ASSERT(pool != NULL);
507 ASSERT_SENTINEL(pool,POOL);
508 pool->objnode_count--;
509 objnode->obj->objnode_count--;
510 objnode->obj = NULL;
511 tmem_free(objnode,sizeof(objnode_t),pool);
512 atomic_dec_and_assert(global_rtree_node_count);
513 }
515 /************ POOL OBJECT COLLECTION MANIPULATION ROUTINES *******************/
517 /* searches for object==oid in pool, returns locked object if found */
518 static NOINLINE obj_t * obj_find(pool_t *pool, uint64_t oid)
519 {
520 struct rb_node *node;
521 obj_t *obj;
523 restart_find:
524 tmem_read_lock(&pool->pool_rwlock);
525 node = pool->obj_rb_root[OBJ_HASH(oid)].rb_node;
526 while ( node )
527 {
528 obj = container_of(node, obj_t, rb_tree_node);
529 if ( obj->oid == oid )
530 {
531 if ( tmh_lock_all )
532 obj->no_evict = 1;
533 else
534 {
535 if ( !tmem_spin_trylock(&obj->obj_spinlock) )
536 {
537 tmem_read_unlock(&pool->pool_rwlock);
538 goto restart_find;
539 }
540 tmem_read_unlock(&pool->pool_rwlock);
541 }
542 return obj;
543 }
544 else if ( oid < obj->oid )
545 node = node->rb_left;
546 else
547 node = node->rb_right;
548 }
549 tmem_read_unlock(&pool->pool_rwlock);
550 return NULL;
551 }
553 /* free an object that has no more pgps in it */
554 static NOINLINE void obj_free(obj_t *obj, int no_rebalance)
555 {
556 pool_t *pool;
557 uint64_t old_oid;
559 ASSERT_SPINLOCK(&obj->obj_spinlock);
560 ASSERT(obj != NULL);
561 ASSERT_SENTINEL(obj,OBJ);
562 ASSERT(obj->pgp_count == 0);
563 pool = obj->pool;
564 ASSERT(pool != NULL);
565 ASSERT_WRITELOCK(&pool->pool_rwlock);
566 if ( obj->tree_root.rnode != NULL ) /* may be a "stump" with no leaves */
567 radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free);
568 ASSERT((long)obj->objnode_count == 0);
569 ASSERT(obj->tree_root.rnode == NULL);
570 pool->obj_count--;
571 ASSERT(pool->obj_count >= 0);
572 INVERT_SENTINEL(obj,OBJ);
573 obj->pool = NULL;
574 old_oid = obj->oid;
575 obj->oid = -1;
576 obj->last_client = CLI_ID_NULL;
577 atomic_dec_and_assert(global_obj_count);
578 /* use no_rebalance only if all objects are being destroyed anyway */
579 if ( !no_rebalance )
580 rb_erase(&obj->rb_tree_node,&pool->obj_rb_root[OBJ_HASH(old_oid)]);
581 tmem_free(obj,sizeof(obj_t),pool);
582 }
584 static NOINLINE void obj_rb_destroy_node(struct rb_node *node)
585 {
586 obj_t * obj;
588 if ( node == NULL )
589 return;
590 obj_rb_destroy_node(node->rb_left);
591 obj_rb_destroy_node(node->rb_right);
592 obj = container_of(node, obj_t, rb_tree_node);
593 tmem_spin_lock(&obj->obj_spinlock);
594 ASSERT(obj->no_evict == 0);
595 radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free);
596 obj_free(obj,1);
597 }
599 static NOINLINE int obj_rb_insert(struct rb_root *root, obj_t *obj)
600 {
601 struct rb_node **new, *parent = NULL;
602 obj_t *this;
604 new = &(root->rb_node);
605 while ( *new )
606 {
607 this = container_of(*new, obj_t, rb_tree_node);
608 parent = *new;
609 if ( obj->oid < this->oid )
610 new = &((*new)->rb_left);
611 else if ( obj->oid > this->oid )
612 new = &((*new)->rb_right);
613 else
614 return 0;
615 }
616 rb_link_node(&obj->rb_tree_node, parent, new);
617 rb_insert_color(&obj->rb_tree_node, root);
618 return 1;
619 }
621 /*
622 * allocate, initialize, and insert an tmem_object_root
623 * (should be called only if find failed)
624 */
625 static NOINLINE obj_t * obj_new(pool_t *pool, uint64_t oid)
626 {
627 obj_t *obj;
629 ASSERT(pool != NULL);
630 ASSERT_WRITELOCK(&pool->pool_rwlock);
631 if ( (obj = tmem_malloc(obj_t,pool)) == NULL )
632 return NULL;
633 pool->obj_count++;
634 if (pool->obj_count > pool->obj_count_max)
635 pool->obj_count_max = pool->obj_count;
636 atomic_inc_and_max(global_obj_count);
637 INIT_RADIX_TREE(&obj->tree_root,0);
638 spin_lock_init(&obj->obj_spinlock);
639 obj->pool = pool;
640 obj->oid = oid;
641 obj->objnode_count = 0;
642 obj->pgp_count = 0;
643 obj->last_client = CLI_ID_NULL;
644 SET_SENTINEL(obj,OBJ);
645 tmem_spin_lock(&obj->obj_spinlock);
646 obj_rb_insert(&pool->obj_rb_root[OBJ_HASH(oid)], obj);
647 obj->no_evict = 1;
648 ASSERT_SPINLOCK(&obj->obj_spinlock);
649 return obj;
650 }
652 /* free an object after destroying any pgps in it */
653 static NOINLINE void obj_destroy(obj_t *obj)
654 {
655 ASSERT_WRITELOCK(&obj->pool->pool_rwlock);
656 radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free);
657 obj_free(obj,0);
658 }
660 /* destroy all objects in a pool */
661 static NOINLINE void obj_rb_destroy_all(pool_t *pool)
662 {
663 int i;
665 tmem_write_lock(&pool->pool_rwlock);
666 for (i = 0; i < OBJ_HASH_BUCKETS; i++)
667 obj_rb_destroy_node(pool->obj_rb_root[i].rb_node);
668 tmem_write_unlock(&pool->pool_rwlock);
669 }
671 /* destroys all objects in a pool that have last_client set to cli_id */
672 static void obj_free_selective(pool_t *pool, cli_id_t cli_id)
673 {
674 struct rb_node *node;
675 obj_t *obj;
676 int i;
678 tmem_write_lock(&pool->pool_rwlock);
679 for (i = 0; i < OBJ_HASH_BUCKETS; i++)
680 {
681 node = rb_first(&pool->obj_rb_root[i]);
682 while ( node != NULL )
683 {
684 obj = container_of(node, obj_t, rb_tree_node);
685 tmem_spin_lock(&obj->obj_spinlock);
686 node = rb_next(node);
687 if ( obj->last_client == cli_id )
688 obj_destroy(obj);
689 else
690 tmem_spin_unlock(&obj->obj_spinlock);
691 }
692 }
693 tmem_write_unlock(&pool->pool_rwlock);
694 }
697 /************ POOL MANIPULATION ROUTINES ******************************/
699 static pool_t * pool_alloc(void)
700 {
701 pool_t *pool;
702 int i;
704 if ( (pool = tmem_malloc(pool_t,NULL)) == NULL )
705 return NULL;
706 for (i = 0; i < OBJ_HASH_BUCKETS; i++)
707 pool->obj_rb_root[i] = RB_ROOT;
708 INIT_LIST_HEAD(&pool->pool_list);
709 rwlock_init(&pool->pool_rwlock);
710 pool->pgp_count_max = pool->obj_count_max = 0;
711 pool->objnode_count = pool->objnode_count_max = 0;
712 atomic_set(&pool->pgp_count,0);
713 pool->obj_count = 0;
714 pool->good_puts = pool->puts = pool->dup_puts_flushed = 0;
715 pool->dup_puts_replaced = pool->no_mem_puts = 0;
716 pool->found_gets = pool->gets = 0;
717 pool->flushs_found = pool->flushs = 0;
718 pool->flush_objs_found = pool->flush_objs = 0;
719 SET_SENTINEL(pool,POOL);
720 return pool;
721 }
723 static NOINLINE void pool_free(pool_t *pool)
724 {
725 ASSERT_SENTINEL(pool,POOL);
726 INVERT_SENTINEL(pool,POOL);
727 pool->client = NULL;
728 list_del(&pool->pool_list);
729 tmem_free(pool,sizeof(pool_t),NULL);
730 }
732 /* register new_client as a user of this shared pool and return new
733 total number of registered users */
734 static int shared_pool_join(pool_t *pool, client_t *new_client)
735 {
736 sharelist_t *sl;
738 ASSERT(is_shared(pool));
739 if ( (sl = tmem_malloc(sharelist_t,NULL)) == NULL )
740 return -1;
741 sl->client = new_client;
742 list_add_tail(&sl->share_list, &pool->share_list);
743 printk("adding new %s %d to shared pool owned by %s %d\n",
744 client_str, new_client->cli_id, client_str, pool->client->cli_id);
745 return ++pool->shared_count;
746 }
748 /* reassign "ownership" of the pool to another client that shares this pool */
749 static NOINLINE void shared_pool_reassign(pool_t *pool)
750 {
751 sharelist_t *sl;
752 int poolid;
753 client_t *old_client = pool->client, *new_client;
755 ASSERT(is_shared(pool));
756 if ( list_empty(&pool->share_list) )
757 {
758 ASSERT(pool->shared_count == 0);
759 return;
760 }
761 old_client->pools[pool->pool_id] = NULL;
762 sl = list_entry(pool->share_list.next, sharelist_t, share_list);
763 ASSERT(sl->client != old_client);
764 pool->client = new_client = sl->client;
765 for (poolid = 0; poolid < MAX_POOLS_PER_DOMAIN; poolid++)
766 if (new_client->pools[poolid] == pool)
767 break;
768 ASSERT(poolid != MAX_POOLS_PER_DOMAIN);
769 printk("reassigned shared pool from %s=%d to %s=%d pool_id=%d\n",
770 cli_id_str, old_client->cli_id, cli_id_str, new_client->cli_id, poolid);
771 pool->pool_id = poolid;
772 }
774 /* destroy all objects with last_client same as passed cli_id,
775 remove pool's cli_id from list of sharers of this pool */
776 static NOINLINE int shared_pool_quit(pool_t *pool, cli_id_t cli_id)
777 {
778 sharelist_t *sl;
779 int s_poolid;
781 ASSERT(is_shared(pool));
782 ASSERT(pool->client != NULL);
784 obj_free_selective(pool,cli_id);
785 list_for_each_entry(sl,&pool->share_list, share_list)
786 {
787 if (sl->client->cli_id != cli_id)
788 continue;
789 list_del(&sl->share_list);
790 tmem_free(sl,sizeof(sharelist_t),pool);
791 --pool->shared_count;
792 if (pool->client->cli_id == cli_id)
793 shared_pool_reassign(pool);
794 if (pool->shared_count)
795 return pool->shared_count;
796 for (s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++)
797 if ( (global_shared_pools[s_poolid]) == pool )
798 {
799 global_shared_pools[s_poolid] = NULL;
800 break;
801 }
802 return 0;
803 }
804 printk("tmem: no match unsharing pool, %s=%d\n",
805 cli_id_str,pool->client->cli_id);
806 return -1;
807 }
809 /* flush all data (owned by cli_id) from a pool and, optionally, free it */
810 static void pool_flush(pool_t *pool, cli_id_t cli_id, bool_t destroy)
811 {
812 ASSERT(pool != NULL);
813 if ( (is_shared(pool)) && (shared_pool_quit(pool,cli_id) > 0) )
814 {
815 printk("tmem: unshared shared pool %d from %s=%d\n",
816 pool->pool_id, cli_id_str,pool->client->cli_id);
817 return;
818 }
819 printk("%s %s-%s tmem pool ",destroy?"destroying":"flushing",
820 is_persistent(pool) ? "persistent" : "ephemeral" ,
821 is_shared(pool) ? "shared" : "private");
822 printk("%s=%d pool_id=%d\n", cli_id_str,pool->client->cli_id,pool->pool_id);
823 obj_rb_destroy_all(pool);
824 if ( destroy )
825 {
826 pool->client->pools[pool->pool_id] = NULL;
827 pool_free(pool);
828 }
829 }
831 /************ CLIENT MANIPULATION OPERATIONS **************************/
833 static client_t *client_create(void)
834 {
835 client_t *client = tmem_malloc(client_t,NULL);
836 cli_id_t cli_id = tmh_get_cli_id_from_current();
838 printk("tmem: initializing tmem capability for %s=%d...",cli_id_str,cli_id);
839 if ( client == NULL )
840 {
841 printk("failed... out of memory\n");
842 return NULL;
843 }
844 memset(client,0,sizeof(client_t));
845 if ( (client->tmh = tmh_client_init()) == NULL )
846 {
847 printk("failed... can't allocate host-dependent part of client\n");
848 if ( client )
849 tmem_free(client,sizeof(client_t),NULL);
850 return NULL;
851 }
852 tmh_set_current_client(client);
853 client->cli_id = cli_id;
854 #ifdef __i386__
855 client->compress = 0;
856 #else
857 client->compress = tmh_compression_enabled();
858 #endif
859 list_add_tail(&client->client_list, &global_client_list);
860 INIT_LIST_HEAD(&client->ephemeral_page_list);
861 client->eph_count = client->eph_count_max = 0;
862 printk("ok\n");
863 return client;
864 }
866 static void client_free(client_t *client)
867 {
868 list_del(&client->client_list);
869 tmh_client_destroy(client->tmh);
870 tmem_free(client,sizeof(client_t),NULL);
871 }
873 /* flush all data from a client and, optionally, free it */
874 static void client_flush(client_t *client, bool_t destroy)
875 {
876 int i;
877 pool_t *pool;
879 for (i = 0; i < MAX_POOLS_PER_DOMAIN; i++)
880 {
881 if ( (pool = client->pools[i]) == NULL )
882 continue;
883 pool_flush(pool,client->cli_id,destroy);
884 if ( destroy )
885 client->pools[i] = NULL;
886 }
887 if ( destroy )
888 client_free(client);
889 }
891 static bool_t client_over_quota(client_t *client)
892 {
893 int total = _atomic_read(client_weight_total);
895 ASSERT(client != NULL);
896 if ( (total == 0) || (client->weight == 0) ||
897 (client->eph_count == 0) )
898 return 0;
899 return ( ((global_eph_count*100L) / client->eph_count ) >
900 ((total*100L) / client->weight) );
901 }
903 /************ MEMORY REVOCATION ROUTINES *******************************/
/* Evict one ephemeral page to reclaim memory.  Scans the calling
 * client's own ephemeral list if that client is over quota, otherwise
 * the global ephemeral list, taking the first page whose object (and,
 * if the object would become empty, pool) locks can be acquired without
 * blocking.  Returns 1 if a page was evicted, 0 otherwise. */
905 static int tmem_evict(void)
906 {
907 client_t *client = tmh_client_from_current();
908 pgp_t *pgp = NULL, *pgp_del;
909 obj_t *obj;
910 pool_t *pool;
911 int ret = 0;
912 bool_t hold_pool_rwlock = 0;
914 evict_attempts++;
915 tmem_spin_lock(&eph_lists_spinlock);
/* prefer evicting from an over-quota caller's own pages */
916 if ( (client != NULL) && client_over_quota(client) &&
917 !list_empty(&client->ephemeral_page_list) )
918 {
919 list_for_each_entry(pgp,&client->ephemeral_page_list,client_eph_pages)
920 {
921 obj = pgp->obj;
922 pool = obj->pool;
923 if ( tmh_lock_all && !obj->no_evict )
924 goto found;
925 if ( tmem_spin_trylock(&obj->obj_spinlock) )
926 {
/* object keeps other pages: no pool lock needed to evict this one */
927 if ( obj->pgp_count > 1 )
928 goto found;
/* last page in object: need the pool rwlock to free the object too */
929 if ( tmem_write_trylock(&pool->pool_rwlock) )
930 {
931 hold_pool_rwlock = 1;
932 goto found;
933 }
934 tmem_spin_unlock(&obj->obj_spinlock);
935 }
936 }
937 } else if ( list_empty(&global_ephemeral_page_list) ) {
938 goto out;
939 } else {
/* otherwise scan the global LRU of all ephemeral pages */
940 list_for_each_entry(pgp,&global_ephemeral_page_list,global_eph_pages)
941 {
942 obj = pgp->obj;
943 pool = obj->pool;
944 if ( tmh_lock_all && !obj->no_evict )
945 goto found;
946 if ( tmem_spin_trylock(&obj->obj_spinlock) )
947 {
948 if ( obj->pgp_count > 1 )
949 goto found;
950 if ( tmem_write_trylock(&pool->pool_rwlock) )
951 {
952 hold_pool_rwlock = 1;
953 goto found;
954 }
955 tmem_spin_unlock(&obj->obj_spinlock);
956 }
957 }
958 }
/* no evictable candidate found */
960 ret = 0;
961 goto out;
963 found:
964 ASSERT(pgp != NULL);
965 ASSERT_SENTINEL(pgp,PGD);
966 obj = pgp->obj;
967 ASSERT(obj != NULL);
968 ASSERT(obj->no_evict == 0);
969 ASSERT(obj->pool != NULL);
970 ASSERT_SENTINEL(obj,OBJ);
972 ASSERT_SPINLOCK(&obj->obj_spinlock);
/* remove the chosen page; eph_lists_spinlock is already held (arg 1) */
973 pgp_del = pgp_delete_from_obj(obj, pgp->index);
974 ASSERT(pgp_del == pgp);
975 pgp_delete(pgp,1);
976 if ( obj->pgp_count == 0 )
977 {
978 ASSERT_WRITELOCK(&pool->pool_rwlock);
979 obj_free(obj,0);
980 }
981 else
982 tmem_spin_unlock(&obj->obj_spinlock);
983 if ( hold_pool_rwlock )
984 tmem_write_unlock(&pool->pool_rwlock);
985 evicted_pgs++;
986 ret = 1;
988 out:
989 tmem_spin_unlock(&eph_lists_spinlock);
990 return ret;
991 }
993 static unsigned long tmem_relinquish_npages(unsigned long n)
994 {
995 unsigned long avail_pages = 0;
997 while ( (avail_pages = tmh_avail_pages()) < n )
998 {
999 if ( !tmem_evict() )
1000 break;
1002 if ( avail_pages )
1003 tmh_release_avail_pages_to_host();
1004 return avail_pages;
1007 /************ TMEM CORE OPERATIONS ************************************/
1009 static NOINLINE int do_tmem_put_compress(pgp_t *pgp, tmem_cli_mfn_t cmfn)
1011 void *dst, *p;
1012 size_t size;
1013 int ret = 0;
1014 DECL_LOCAL_CYC_COUNTER(compress);
1016 ASSERT(pgp != NULL);
1017 ASSERT(pgp->obj != NULL);
1018 ASSERT_SPINLOCK(&pgp->obj->obj_spinlock);
1019 ASSERT(pgp->obj->pool != NULL);
1020 ASSERT(pgp->obj->pool->client != NULL);
1021 #ifdef __i386__
1022 return -ENOMEM;
1023 #endif
1024 if ( pgp->pfp != NULL )
1025 pgp_free_data(pgp, pgp->obj->pool); /* FIXME... is this right? */
1026 START_CYC_COUNTER(compress);
1027 ret = tmh_compress_from_client(cmfn, &dst, &size);
1028 if ( (ret == -EFAULT) || (ret == 0) )
1029 goto out;
1030 else if ( (size == 0) || (size >= tmem_subpage_maxsize()) )
1031 ret = 0;
1032 else if ( (p = tmem_malloc_bytes(size,pgp->obj->pool)) == NULL )
1033 ret = -ENOMEM;
1034 else
1036 memcpy(p,dst,size);
1037 pgp->cdata = p;
1038 pgp->size = size;
1039 pgp->obj->pool->client->compressed_pages++;
1040 pgp->obj->pool->client->compressed_sum_size += size;
1041 ret = 1;
1044 out:
1045 END_CYC_COUNTER(compress);
1046 return ret;
1049 static NOINLINE int do_tmem_dup_put(pgp_t *pgp, tmem_cli_mfn_t cmfn,
1050 uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len)
1052 pool_t *pool;
1053 obj_t *obj;
1054 client_t *client;
1055 pgp_t *pgpfound = NULL;
1056 int ret;
1058 /* if we can successfully manipulate pgp to change out the data, do so */
1059 ASSERT(pgp != NULL);
1060 ASSERT(pgp->pfp != NULL);
1061 ASSERT(pgp->size != -1);
1062 obj = pgp->obj;
1063 ASSERT_SPINLOCK(&obj->obj_spinlock);
1064 ASSERT(obj != NULL);
1065 pool = obj->pool;
1066 ASSERT(pool != NULL);
1067 client = pool->client;
1068 if ( len != 0 && tmh_compression_enabled() &&
1069 client->compress && pgp->size != 0 )
1071 ret = do_tmem_put_compress(pgp,cmfn);
1072 if ( ret == 1 )
1073 goto done;
1074 else if ( ret == 0 )
1075 goto copy_uncompressed;
1076 else if ( ret == -ENOMEM )
1077 goto failed_dup;
1078 else if ( ret == -EFAULT )
1079 goto bad_copy;
1082 copy_uncompressed:
1083 if ( pgp->pfp )
1084 pgp_free_data(pgp, pool);
1085 if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
1086 goto failed_dup;
1087 /* tmh_copy_from_client properly handles len==0 and offsets != 0 */
1088 ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len);
1089 if ( ret == -EFAULT )
1090 goto bad_copy;
1091 pgp->size = 0;
1093 done:
1094 /* successfully replaced data, clean up and return success */
1095 if ( is_shared(pool) )
1096 obj->last_client = client->cli_id;
1097 obj->no_evict = 0;
1098 tmem_spin_unlock(&obj->obj_spinlock);
1099 pool->dup_puts_replaced++;
1100 pool->good_puts++;
1101 return 1;
1103 bad_copy:
1104 /* this should only happen if the client passed a bad mfn */
1105 failed_copies++;
1106 ASSERT(0);
1107 return -EFAULT;
1109 failed_dup:
1110 /* couldn't change out the data, flush the old data and return
1111 * -ENOSPC instead of -ENOMEM to differentiate failed _dup_ put */
1112 pgpfound = pgp_delete_from_obj(obj, pgp->index);
1113 ASSERT(pgpfound == pgp);
1114 pgp_delete(pgpfound,0);
1115 if ( obj->pgp_count == 0 )
1117 tmem_write_lock(&pool->pool_rwlock);
1118 obj_free(obj,0);
1119 tmem_write_unlock(&pool->pool_rwlock);
1120 } else {
1121 obj->no_evict = 0;
1122 tmem_spin_unlock(&obj->obj_spinlock);
1124 pool->dup_puts_flushed++;
1125 return -ENOSPC;
1129 static NOINLINE int do_tmem_put(pool_t *pool, uint64_t oid, uint32_t index,
1130 tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
1131 uint32_t pfn_offset, uint32_t len)
1133 obj_t *obj = NULL, *objfound = NULL, *objnew = NULL;
1134 pgp_t *pgp = NULL, *pgpdel = NULL;
1135 client_t *client = pool->client;
1136 int ret = client->frozen ? -EFROZEN : -ENOMEM;
1138 ASSERT(pool != NULL);
1139 pool->puts++;
1140 /* does page already exist (dup)? if so, handle specially */
1141 if ( (obj = objfound = obj_find(pool,oid)) != NULL )
1143 ASSERT_SPINLOCK(&objfound->obj_spinlock);
1144 if ((pgp = pgp_lookup_in_obj(objfound, index)) != NULL)
1145 return do_tmem_dup_put(pgp,cmfn,tmem_offset,pfn_offset,len);
1148 /* no puts allowed into a frozen pool (except dup puts) */
1149 if ( client->frozen )
1150 goto free;
1152 if ( (objfound == NULL) )
1154 tmem_write_lock(&pool->pool_rwlock);
1155 if ( (obj = objnew = obj_new(pool,oid)) == NULL )
1157 tmem_write_unlock(&pool->pool_rwlock);
1158 return -ENOMEM;
1160 ASSERT_SPINLOCK(&objnew->obj_spinlock);
1161 tmem_write_unlock(&pool->pool_rwlock);
1164 ASSERT((obj != NULL)&&((objnew==obj)||(objfound==obj))&&(objnew!=objfound));
1165 ASSERT_SPINLOCK(&obj->obj_spinlock);
1166 if ( (pgp = pgp_alloc(obj)) == NULL )
1167 goto free;
1169 ret = pgp_add_to_obj(obj, index, pgp);
1170 if ( ret == -ENOMEM )
1171 /* warning, may result in partially built radix tree ("stump") */
1172 goto free;
1173 ASSERT(ret != -EEXIST);
1174 pgp->index = index;
1176 if ( len != 0 && tmh_compression_enabled() && client->compress )
1178 ASSERT(pgp->pfp == NULL);
1179 ret = do_tmem_put_compress(pgp,cmfn);
1180 if ( ret == 1 )
1181 goto insert_page;
1182 if ( ret == -ENOMEM )
1184 client->compress_nomem++;
1185 goto delete_and_free;
1187 if ( ret == 0 )
1189 client->compress_poor++;
1190 goto copy_uncompressed;
1192 if ( ret == -EFAULT )
1193 goto bad_copy;
1196 copy_uncompressed:
1197 if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
1199 ret == -ENOMEM;
1200 goto delete_and_free;
1202 /* tmh_copy_from_client properly handles len==0 (TMEM_NEW_PAGE) */
1203 ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len);
1204 if ( ret == -EFAULT )
1205 goto bad_copy;
1206 pgp->size = 0;
1208 insert_page:
1209 if ( is_ephemeral(pool) )
1211 tmem_spin_lock(&eph_lists_spinlock);
1212 list_add_tail(&pgp->global_eph_pages,
1213 &global_ephemeral_page_list);
1214 if (++global_eph_count > global_eph_count_max)
1215 global_eph_count_max = global_eph_count;
1216 list_add_tail(&pgp->client_eph_pages,
1217 &client->ephemeral_page_list);
1218 if (++client->eph_count > client->eph_count_max)
1219 client->eph_count_max = client->eph_count;
1220 tmem_spin_unlock(&eph_lists_spinlock);
1222 ASSERT( ((objnew==obj)||(objfound==obj)) && (objnew!=objfound));
1223 if ( is_shared(pool) )
1224 obj->last_client = client->cli_id;
1225 obj->no_evict = 0;
1226 tmem_spin_unlock(&obj->obj_spinlock);
1227 pool->good_puts++;
1228 return 1;
1230 delete_and_free:
1231 ASSERT((obj != NULL) && (pgp != NULL) && (pgp->index != -1));
1232 pgpdel = pgp_delete_from_obj(obj, pgp->index);
1233 ASSERT(pgp == pgpdel);
1235 free:
1236 if ( pgp )
1237 pgp_delete(pgp,0);
1238 if ( objfound )
1240 objfound->no_evict = 0;
1241 tmem_spin_unlock(&objfound->obj_spinlock);
1243 if ( objnew )
1245 tmem_write_lock(&pool->pool_rwlock);
1246 obj_free(objnew,0);
1247 tmem_write_unlock(&pool->pool_rwlock);
1249 pool->no_mem_puts++;
1250 return ret;
1252 bad_copy:
1253 /* this should only happen if the client passed a bad mfn */
1254 failed_copies++;
1255 ASSERT(0);
1256 goto free;
1259 static NOINLINE int do_tmem_get(pool_t *pool, uint64_t oid, uint32_t index,
1260 tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
1261 uint32_t pfn_offset, uint32_t len)
1263 obj_t *obj;
1264 pgp_t *pgp;
1265 client_t *client = pool->client;
1266 DECL_LOCAL_CYC_COUNTER(decompress);
1268 if ( !_atomic_read(pool->pgp_count) )
1269 return -EEMPTY;
1271 pool->gets++;
1272 obj = obj_find(pool,oid);
1273 if ( obj == NULL )
1274 return 0;
1276 ASSERT_SPINLOCK(&obj->obj_spinlock);
1277 if (is_shared(pool) || is_persistent(pool) )
1278 pgp = pgp_lookup_in_obj(obj, index);
1279 else
1280 pgp = pgp_delete_from_obj(obj, index);
1281 if ( pgp == NULL )
1283 obj->no_evict = 0;
1284 tmem_spin_unlock(&obj->obj_spinlock);
1285 return 0;
1287 ASSERT(pgp->size != -1);
1288 if ( pgp->size != 0 )
1290 START_CYC_COUNTER(decompress);
1291 if ( tmh_decompress_to_client(cmfn, pgp->cdata, pgp->size) == -EFAULT )
1292 goto bad_copy;
1293 END_CYC_COUNTER(decompress);
1295 else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset,
1296 pfn_offset, len) == -EFAULT)
1297 goto bad_copy;
1298 if ( is_ephemeral(pool) )
1300 if ( is_private(pool) )
1302 pgp_delete(pgp,0);
1303 if ( obj->pgp_count == 0 )
1305 tmem_write_lock(&pool->pool_rwlock);
1306 obj_free(obj,0);
1307 obj = NULL;
1308 tmem_write_unlock(&pool->pool_rwlock);
1310 } else {
1311 tmem_spin_lock(&eph_lists_spinlock);
1312 list_del(&pgp->global_eph_pages);
1313 list_add_tail(&pgp->global_eph_pages,&global_ephemeral_page_list);
1314 list_del(&pgp->client_eph_pages);
1315 list_add_tail(&pgp->client_eph_pages,&client->ephemeral_page_list);
1316 tmem_spin_unlock(&eph_lists_spinlock);
1317 ASSERT(obj != NULL);
1318 obj->last_client = tmh_get_cli_id_from_current();
1321 if ( obj != NULL )
1323 obj->no_evict = 0;
1324 tmem_spin_unlock(&obj->obj_spinlock);
1326 pool->found_gets++;
1327 return 1;
1329 bad_copy:
1330 /* this should only happen if the client passed a bad mfn */
1331 failed_copies++;
1332 ASSERT(0);
1333 return -EFAULT;
1337 static NOINLINE int do_tmem_flush_page(pool_t *pool, uint64_t oid, uint32_t index)
1339 obj_t *obj;
1340 pgp_t *pgp;
1342 pool->flushs++;
1343 obj = obj_find(pool,oid);
1344 if ( obj == NULL )
1345 goto out;
1346 pgp = pgp_delete_from_obj(obj, index);
1347 if ( pgp == NULL )
1349 obj->no_evict = 0;
1350 tmem_spin_unlock(&obj->obj_spinlock);
1351 goto out;
1353 pgp_delete(pgp,0);
1354 if ( obj->pgp_count == 0 )
1356 tmem_write_lock(&pool->pool_rwlock);
1357 obj_free(obj,0);
1358 tmem_write_unlock(&pool->pool_rwlock);
1359 } else {
1360 obj->no_evict = 0;
1361 tmem_spin_unlock(&obj->obj_spinlock);
1363 pool->flushs_found++;
1365 out:
1366 if ( pool->client->frozen )
1367 return -EFROZEN;
1368 else
1369 return 1;
1372 static NOINLINE int do_tmem_flush_object(pool_t *pool, uint64_t oid)
1374 obj_t *obj;
1376 pool->flush_objs++;
1377 obj = obj_find(pool,oid);
1378 if ( obj == NULL )
1379 goto out;
1380 tmem_write_lock(&pool->pool_rwlock);
1381 obj_destroy(obj);
1382 pool->flush_objs_found++;
1383 tmem_write_unlock(&pool->pool_rwlock);
1385 out:
1386 if ( pool->client->frozen )
1387 return -EFROZEN;
1388 else
1389 return 1;
1392 static NOINLINE int do_tmem_destroy_pool(uint32_t pool_id)
1394 client_t *client = tmh_client_from_current();
1395 pool_t *pool;
1397 if ( client->pools == NULL )
1398 return 0;
1399 if ( (pool = client->pools[pool_id]) == NULL )
1400 return 0;
1401 client->pools[pool_id] = NULL;
1402 pool_flush(pool,client->cli_id,1);
1403 return 1;
1406 static NOINLINE int do_tmem_new_pool(uint32_t flags, uint64_t uuid_lo, uint64_t uuid_hi)
1408 client_t *client = tmh_client_from_current();
1409 cli_id_t cli_id = tmh_get_cli_id_from_current();
1410 int persistent = flags & TMEM_POOL_PERSIST;
1411 int shared = flags & TMEM_POOL_SHARED;
1412 int pagebits = (flags >> TMEM_POOL_PAGESIZE_SHIFT)
1413 & TMEM_POOL_PAGESIZE_MASK;
1414 int specversion = (flags >> TMEM_POOL_VERSION_SHIFT)
1415 & TMEM_POOL_VERSION_MASK;
1416 pool_t *pool, *shpool;
1417 int s_poolid, d_poolid, first_unused_s_poolid;
1419 ASSERT(client != NULL);
1420 printk("tmem: allocating %s-%s tmem pool for %s=%d...",
1421 persistent ? "persistent" : "ephemeral" ,
1422 shared ? "shared" : "private", cli_id_str, cli_id);
1423 if ( specversion != 0 )
1425 printk("failed... unsupported spec version\n");
1426 return -EPERM;
1428 if ( pagebits != (PAGE_SHIFT - 12) )
1430 printk("failed... unsupported pagesize %d\n",1<<(pagebits+12));
1431 return -EPERM;
1433 if ( (pool = pool_alloc()) == NULL )
1435 printk("failed... out of memory\n");
1436 return -ENOMEM;
1438 for ( d_poolid = 0; d_poolid < MAX_POOLS_PER_DOMAIN; d_poolid++ )
1439 if ( client->pools[d_poolid] == NULL )
1440 break;
1441 if ( d_poolid == MAX_POOLS_PER_DOMAIN )
1443 printk("failed... no more pool slots available for this %s\n",
1444 client_str);
1445 goto fail;
1447 pool->shared = shared;
1448 pool->client = client;
1449 if ( shared )
1451 first_unused_s_poolid = MAX_GLOBAL_SHARED_POOLS;
1452 for ( s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++ )
1454 if ( (shpool = global_shared_pools[s_poolid]) != NULL )
1456 if ( shpool->uuid[0] == uuid_lo && shpool->uuid[1] == uuid_hi )
1458 printk("(matches shared pool uuid=%"PRIx64".%"PRIu64") ",
1459 uuid_hi, uuid_lo);
1460 printk("pool_id=%d\n",d_poolid);
1461 client->pools[d_poolid] = global_shared_pools[s_poolid];
1462 shared_pool_join(global_shared_pools[s_poolid], client);
1463 pool_free(pool);
1464 return d_poolid;
1467 else if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
1468 first_unused_s_poolid = s_poolid;
1470 if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
1472 printk("tmem: failed... no global shared pool slots available\n");
1473 goto fail;
1475 else
1477 INIT_LIST_HEAD(&pool->share_list);
1478 pool->shared_count = 0;
1479 global_shared_pools[first_unused_s_poolid] = pool;
1480 (void)shared_pool_join(pool,client);
1483 client->pools[d_poolid] = pool;
1484 list_add_tail(&pool->pool_list, &global_pool_list);
1485 pool->pool_id = d_poolid;
1486 pool->persistent = persistent;
1487 pool->uuid[0] = uuid_lo; pool->uuid[1] = uuid_hi;
1488 printk("pool_id=%d\n",d_poolid);
1489 return d_poolid;
1491 fail:
1492 pool_free(pool);
1493 return -EPERM;
1496 /************ TMEM CONTROL OPERATIONS ************************************/
1498 /* freeze/thaw all pools belonging to client cli_id (all domains if -1) */
1499 static int tmemc_freeze_pools(int cli_id, int arg)
1501 client_t *client;
1502 bool_t freeze = (arg == TMEMC_FREEZE) ? 1 : 0;
1503 bool_t destroy = (arg == TMEMC_DESTROY) ? 1 : 0;
1504 char *s;
1506 s = destroy ? "destroyed" : ( freeze ? "frozen" : "thawed" );
1507 if ( cli_id == CLI_ID_NULL )
1509 list_for_each_entry(client,&global_client_list,client_list)
1511 client->frozen = freeze;
1512 printk("tmem: all pools %s for all %ss\n",s,client_str);
1515 else
1517 if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
1518 return -1;
1519 client->frozen = freeze;
1520 printk("tmem: all pools %s for %s=%d\n",s,cli_id_str,cli_id);
1522 return 0;
1525 static int tmemc_flush_mem(int cli_id, uint32_t kb)
1527 uint32_t npages, flushed_pages, flushed_kb;
1529 if ( cli_id != CLI_ID_NULL )
1531 printk("tmem: %s-specific flush not supported yet, use --all\n",
1532 client_str);
1533 return -1;
1535 /* convert kb to pages, rounding up if necessary */
1536 npages = (kb + ((1 << (PAGE_SHIFT-10))-1)) >> (PAGE_SHIFT-10);
1537 flushed_pages = tmem_relinquish_npages(npages);
1538 flushed_kb = flushed_pages << (PAGE_SHIFT-10);
1539 return flushed_kb;
1542 /*
1543 * These tmemc_list* routines output lots of stats in a format that is
1544 * intended to be program-parseable, not human-readable. Further, by
1545 * tying each group of stats to a line format indicator (e.g. G= for
1546 * global stats) and each individual stat to a two-letter specifier
1547 * (e.g. Ec:nnnnn in the G= line says there are nnnnn pages in the
1548 * global ephemeral pool), it should allow the stats reported to be
1549 * forward and backwards compatible as tmem evolves.
1550 */
1551 #define BSIZE 1024
1553 static int tmemc_list_client(client_t *c, tmem_cli_va_t buf, int off,
1554 uint32_t len, bool_t use_long)
1556 char info[BSIZE];
1557 int i, n = 0, sum = 0;
1558 pool_t *p;
1559 bool_t s;
1561 n = scnprintf(info,BSIZE,"C=CI:%d,ww:%d,ca:%d,co:%d,fr:%d%c",
1562 c->cli_id, c->weight, c->cap, c->compress,
1563 c->frozen, use_long ? ',' : '\n');
1564 if (use_long)
1565 n += scnprintf(info+n,BSIZE-n,
1566 "Ec:%ld,Em:%ld,cp:%ld,cb:%"PRId64",cn:%ld,cm:%ld\n",
1567 c->eph_count, c->eph_count_max,
1568 c->compressed_pages, c->compressed_sum_size,
1569 c->compress_poor, c->compress_nomem);
1570 tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
1571 sum += n;
1572 for ( i = 0; i < MAX_POOLS_PER_DOMAIN; i++ )
1574 if ( (p = c->pools[i]) == NULL )
1575 continue;
1576 s = is_shared(p);
1577 n = scnprintf(info,BSIZE,"P=CI:%d,PI:%d,"
1578 "PT:%c%c,U0:%"PRIx64",U1:%"PRIx64"%c",
1579 c->cli_id, p->pool_id,
1580 is_persistent(p) ? 'P' : 'E', s ? 'S' : 'P',
1581 (uint64_t)(s ? p->uuid[0] : 0),
1582 (uint64_t)(s ? p->uuid[1] : 0LL),
1583 use_long ? ',' : '\n');
1584 if (use_long)
1585 n += scnprintf(info+n,BSIZE-n,
1586 "Pc:%d,Pm:%d,Oc:%ld,Om:%ld,Nc:%lu,Nm:%lu,"
1587 "ps:%lu,pt:%lu,pd:%lu,pr:%lu,px:%lu,gs:%lu,gt:%lu,"
1588 "fs:%lu,ft:%lu,os:%lu,ot:%lu\n",
1589 _atomic_read(p->pgp_count), p->pgp_count_max,
1590 p->obj_count, p->obj_count_max,
1591 p->objnode_count, p->objnode_count_max,
1592 p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced,
1593 p->no_mem_puts,
1594 p->found_gets, p->gets,
1595 p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs);
1596 if ( sum + n >= len )
1597 return sum;
1598 tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
1599 sum += n;
1601 return sum;
1604 static int tmemc_list_shared(tmem_cli_va_t buf, int off, uint32_t len,
1605 bool_t use_long)
1607 char info[BSIZE];
1608 int i, n = 0, sum = 0;
1609 pool_t *p;
1610 sharelist_t *sl;
1612 for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++ )
1614 if ( (p = global_shared_pools[i]) == NULL )
1615 continue;
1616 n = scnprintf(info+n,BSIZE-n,"S=SI:%d,PT:%c%c,U0:%"PRIx64",U1:%"PRIx64,
1617 i, is_persistent(p) ? 'P' : 'E',
1618 is_shared(p) ? 'S' : 'P',
1619 p->uuid[0], p->uuid[1]);
1620 list_for_each_entry(sl,&p->share_list, share_list)
1621 n += scnprintf(info+n,BSIZE-n,",SC:%d",sl->client->cli_id);
1622 n += scnprintf(info+n,BSIZE-n,"%c", use_long ? ',' : '\n');
1623 if (use_long)
1624 n += scnprintf(info+n,BSIZE-n,
1625 "Pc:%d,Pm:%d,Oc:%ld,Om:%ld,Nc:%lu,Nm:%lu,"
1626 "ps:%lu,pt:%lu,pd:%lu,pr:%lu,px:%lu,gs:%lu,gt:%lu,"
1627 "fs:%lu,ft:%lu,os:%lu,ot:%lu\n",
1628 _atomic_read(p->pgp_count), p->pgp_count_max,
1629 p->obj_count, p->obj_count_max,
1630 p->objnode_count, p->objnode_count_max,
1631 p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced,
1632 p->no_mem_puts,
1633 p->found_gets, p->gets,
1634 p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs);
1635 if ( sum + n >= len )
1636 return sum;
1637 tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
1638 sum += n;
1640 return sum;
#ifdef TMEM_PERF
/* Emit the "T=" cycle-counter performance line into the caller's
 * buffer at offset off, never writing at or past len.  Returns the
 * number of bytes appended (excluding the NUL). */
static int tmemc_list_global_perf(tmem_cli_va_t buf, int off, uint32_t len,
                                  bool_t use_long)
{
    char info[BSIZE];
    int n = 0, sum = 0;

    n = scnprintf(info+n,BSIZE-n,"T=");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,succ_get,"G");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,succ_put,"P");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,non_succ_get,"g");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,non_succ_put,"p");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,flush,"F");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,flush_obj,"O");
#ifdef COMPARE_COPY_PAGE_SSE2
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy1,"1");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy2,"2");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy3,"3");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy4,"4");
#else
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,pg_copy,"C");
#endif
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,compress,"c");
    n += SCNPRINTF_CYC_COUNTER(info+n,BSIZE-n,decompress,"d");
    n--; /* overwrite trailing comma */
    n += scnprintf(info+n,BSIZE-n,"\n");
    if ( sum + n >= len )
        return sum;
    tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
    sum += n;
    return sum;
}
#else
#define tmemc_list_global_perf(_buf,_off,_len,_use) (0)
#endif
1679 static int tmemc_list_global(tmem_cli_va_t buf, int off, uint32_t len,
1680 bool_t use_long)
1682 char info[BSIZE];
1683 int n = 0, sum = off;
1685 n += scnprintf(info,BSIZE,"G="
1686 "Tt:%lu,Te:%lu,Cf:%lu,Af:%lu,Pf:%lu,Ta:%lu,"
1687 "Lm:%lu,Et:%lu,Ea:%lu,Rt:%lu,Ra:%lu,Rx:%lu,Fp:%lu%c",
1688 total_tmem_ops, errored_tmem_ops, failed_copies,
1689 alloc_failed, alloc_page_failed, tmh_avail_pages(),
1690 low_on_memory, evicted_pgs,
1691 evict_attempts, relinq_pgs, relinq_attempts, max_evicts_per_relinq,
1692 total_flush_pool, use_long ? ',' : '\n');
1693 if (use_long)
1694 n += scnprintf(info+n,BSIZE-n,
1695 "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d\n",
1696 global_eph_count, global_eph_count_max,
1697 _atomic_read(global_obj_count), global_obj_count_max,
1698 _atomic_read(global_rtree_node_count), global_rtree_node_count_max,
1699 _atomic_read(global_pgp_count), global_pgp_count_max);
1700 if ( sum + n >= len )
1701 return sum;
1702 tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
1703 sum += n;
1704 return sum;
1707 static int tmemc_list(int cli_id, tmem_cli_va_t buf, uint32_t len,
1708 bool_t use_long)
1710 client_t *client;
1711 int off = 0;
1713 if ( cli_id == CLI_ID_NULL ) {
1714 off = tmemc_list_global(buf,0,len,use_long);
1715 off += tmemc_list_shared(buf,off,len-off,use_long);
1716 list_for_each_entry(client,&global_client_list,client_list)
1717 off += tmemc_list_client(client, buf, off, len-off, use_long);
1718 off += tmemc_list_global_perf(buf,off,len-off,use_long);
1720 else if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
1721 return -1;
1722 else
1723 off = tmemc_list_client(client, buf, 0, len, use_long);
1726 return 0;
1729 static int tmemc_set_var_one(client_t *client, uint32_t subop, uint32_t arg1)
1731 cli_id_t cli_id = client->cli_id;
1732 uint32_t old_weight;
1734 switch (subop)
1736 case TMEMC_SET_WEIGHT:
1737 old_weight = client->weight;
1738 client->weight = arg1;
1739 printk("tmem: weight set to %d for %s=%d\n",arg1,cli_id_str,cli_id);
1740 atomic_sub(old_weight,&client_weight_total);
1741 atomic_add(client->weight,&client_weight_total);
1742 break;
1743 case TMEMC_SET_CAP:
1744 client->cap = arg1;
1745 printk("tmem: cap set to %d for %s=%d\n",arg1,cli_id_str,cli_id);
1746 break;
1747 case TMEMC_SET_COMPRESS:
1748 client->compress = arg1 ? 1 : 0;
1749 printk("tmem: compression %s for %s=%d\n",
1750 arg1 ? "enabled" : "disabled",cli_id_str,cli_id);
1751 break;
1752 default:
1753 printk("tmem: unknown subop %d for tmemc_set_var\n",subop);
1754 return -1;
1756 return 0;
1759 static int tmemc_set_var(int cli_id, uint32_t subop, uint32_t arg1)
1761 client_t *client;
1763 if ( cli_id == CLI_ID_NULL )
1764 list_for_each_entry(client,&global_client_list,client_list)
1765 tmemc_set_var_one(client, subop, arg1);
1766 else if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
1767 return -1;
1768 else
1769 tmemc_set_var_one(client, subop, arg1);
1770 return 0;
1773 static int do_tmem_control(uint32_t subop, uint32_t cli_id32,
1774 uint32_t arg1, uint32_t arg2, tmem_cli_va_t buf)
1776 int ret;
1777 cli_id_t cli_id = (cli_id_t)cli_id32;
1779 if (!tmh_current_is_privileged())
1781 /* don't fail... mystery: sometimes dom0 fails here */
1782 /* return -EPERM; */
1784 switch(subop)
1786 case TMEMC_THAW:
1787 case TMEMC_FREEZE:
1788 case TMEMC_DESTROY:
1789 ret = tmemc_freeze_pools(cli_id,subop);
1790 break;
1791 case TMEMC_FLUSH:
1792 ret = tmemc_flush_mem(cli_id,arg1);
1793 break;
1794 case TMEMC_LIST:
1795 ret = tmemc_list(cli_id,buf,arg1,arg2);
1796 break;
1797 case TMEMC_SET_WEIGHT:
1798 case TMEMC_SET_CAP:
1799 case TMEMC_SET_COMPRESS:
1800 ret = tmemc_set_var(cli_id,subop,arg1);
1801 break;
1802 default:
1803 ret = -1;
1805 return ret;
1808 /************ EXPORTed FUNCTIONS **************************************/
1810 EXPORT long do_tmem_op(tmem_cli_op_t uops)
1812 struct tmem_op op;
1813 client_t *client = tmh_client_from_current();
1814 pool_t *pool = NULL;
1815 int rc = 0;
1816 bool_t succ_get = 0, succ_put = 0;
1817 bool_t non_succ_get = 0, non_succ_put = 0;
1818 bool_t flush = 0, flush_obj = 0;
1819 bool_t tmem_write_lock_set = 0, tmem_read_lock_set = 0;
1820 static bool_t warned = 0;
1821 DECL_LOCAL_CYC_COUNTER(succ_get);
1822 DECL_LOCAL_CYC_COUNTER(succ_put);
1823 DECL_LOCAL_CYC_COUNTER(non_succ_get);
1824 DECL_LOCAL_CYC_COUNTER(non_succ_put);
1825 DECL_LOCAL_CYC_COUNTER(flush);
1826 DECL_LOCAL_CYC_COUNTER(flush_obj);
1828 if ( !tmem_initialized )
1830 if ( !warned )
1831 printk("tmem: must specify tmem parameter on xen boot line\n");
1832 warned = 1;
1833 return -ENODEV;
1836 total_tmem_ops++;
1838 if ( tmh_lock_all )
1840 if ( tmh_lock_all > 1 )
1841 spin_lock_irq(&tmem_spinlock);
1842 else
1843 spin_lock(&tmem_spinlock);
1846 START_CYC_COUNTER(succ_get);
1847 DUP_START_CYC_COUNTER(succ_put,succ_get);
1848 DUP_START_CYC_COUNTER(non_succ_get,succ_get);
1849 DUP_START_CYC_COUNTER(non_succ_put,succ_get);
1850 DUP_START_CYC_COUNTER(flush,succ_get);
1851 DUP_START_CYC_COUNTER(flush_obj,succ_get);
1853 if ( unlikely(tmh_get_tmemop_from_client(&op, uops) != 0) )
1855 printk("tmem: can't get tmem struct from %s\n",client_str);
1856 rc = -EFAULT;
1857 goto out;
1860 if ( op.cmd == TMEM_CONTROL )
1862 tmem_write_lock(&tmem_rwlock);
1863 tmem_write_lock_set = 1;
1864 rc = do_tmem_control(op.subop, op.cli_id, op.arg1, op.arg2, op.buf);
1865 goto out;
1868 /* create per-client tmem structure dynamically on first use by client */
1869 if ( client == NULL )
1871 tmem_write_lock(&tmem_rwlock);
1872 tmem_write_lock_set = 1;
1873 if ( (client = client_create()) == NULL )
1875 printk("tmem: can't create tmem structure for %s\n",client_str);
1876 rc = -ENOMEM;
1877 goto out;
1881 if ( op.cmd == TMEM_NEW_POOL )
1883 if ( !tmem_write_lock_set )
1885 tmem_write_lock(&tmem_rwlock);
1886 tmem_write_lock_set = 1;
1889 else
1891 if ( !tmem_write_lock_set )
1893 tmem_read_lock(&tmem_rwlock);
1894 tmem_read_lock_set = 1;
1896 if ( ((uint32_t)op.pool_id >= MAX_POOLS_PER_DOMAIN) ||
1897 ((pool = client->pools[op.pool_id]) == NULL) )
1899 rc = -ENODEV;
1900 printk("tmem: operation requested on uncreated pool\n");
1901 goto out;
1903 ASSERT_SENTINEL(pool,POOL);
1906 switch ( op.cmd )
1908 case TMEM_NEW_POOL:
1909 rc = do_tmem_new_pool(op.flags,op.uuid[0],op.uuid[1]);
1910 break;
1911 case TMEM_NEW_PAGE:
1912 rc = do_tmem_put(pool, op.object, op.index, op.cmfn, 0, 0, 0);
1913 break;
1914 case TMEM_PUT_PAGE:
1915 rc = do_tmem_put(pool, op.object, op.index, op.cmfn, 0, 0, PAGE_SIZE);
1916 if (rc == 1) succ_put = 1;
1917 else non_succ_put = 1;
1918 break;
1919 case TMEM_GET_PAGE:
1920 rc = do_tmem_get(pool, op.object, op.index, op.cmfn, 0, 0, PAGE_SIZE);
1921 if (rc == 1) succ_get = 1;
1922 else non_succ_get = 1;
1923 break;
1924 case TMEM_FLUSH_PAGE:
1925 flush = 1;
1926 rc = do_tmem_flush_page(pool, op.object, op.index);
1927 break;
1928 case TMEM_FLUSH_OBJECT:
1929 rc = do_tmem_flush_object(pool, op.object);
1930 flush_obj = 1;
1931 break;
1932 case TMEM_DESTROY_POOL:
1933 flush = 1;
1934 rc = do_tmem_destroy_pool(op.pool_id);
1935 break;
1936 case TMEM_READ:
1937 rc = do_tmem_get(pool, op.object, op.index, op.cmfn,
1938 op.tmem_offset, op.pfn_offset, op.len);
1939 break;
1940 case TMEM_WRITE:
1941 rc = do_tmem_put(pool, op.object, op.index, op.cmfn,
1942 op.tmem_offset, op.pfn_offset, op.len);
1943 break;
1944 case TMEM_XCHG:
1945 /* need to hold global lock to ensure xchg is atomic */
1946 printk("tmem_xchg op not implemented yet\n");
1947 rc = 0;
1948 break;
1949 default:
1950 printk("tmem: op %d not implemented\n", op.cmd);
1951 rc = 0;
1952 break;
1955 out:
1956 if ( rc < 0 )
1957 errored_tmem_ops++;
1958 if ( succ_get )
1959 END_CYC_COUNTER(succ_get);
1960 else if ( succ_put )
1961 END_CYC_COUNTER(succ_put);
1962 else if ( non_succ_get )
1963 END_CYC_COUNTER(non_succ_get);
1964 else if ( non_succ_put )
1965 END_CYC_COUNTER(non_succ_put);
1966 else if ( flush )
1967 END_CYC_COUNTER(flush);
1968 else
1969 END_CYC_COUNTER(flush_obj);
1971 if ( tmh_lock_all )
1973 if ( tmh_lock_all > 1 )
1974 spin_unlock_irq(&tmem_spinlock);
1975 else
1976 spin_unlock(&tmem_spinlock);
1977 } else {
1978 if ( tmem_write_lock_set )
1979 write_unlock(&tmem_rwlock);
1980 else if ( tmem_read_lock_set )
1981 read_unlock(&tmem_rwlock);
1982 else
1983 ASSERT(0);
1986 return rc;
1989 /* this should be called when the host is destroying a client */
1990 EXPORT void tmem_destroy(void *v)
1992 client_t *client = (client_t *)v;
1994 if ( client == NULL )
1995 return;
1997 if ( tmh_lock_all )
1998 spin_lock(&tmem_spinlock);
1999 else
2000 write_lock(&tmem_rwlock);
2002 printk("tmem: flushing tmem pools for %s=%d\n",
2003 cli_id_str, client->cli_id);
2004 client_flush(client, 1);
2006 if ( tmh_lock_all )
2007 spin_unlock(&tmem_spinlock);
2008 else
2009 write_unlock(&tmem_rwlock);
2012 /* freezing all pools guarantees that no additional memory will be consumed */
2013 EXPORT void tmem_freeze_all(unsigned char key)
2015 static int freeze = 0;
2017 if ( tmh_lock_all )
2018 spin_lock(&tmem_spinlock);
2019 else
2020 write_lock(&tmem_rwlock);
2022 freeze = !freeze;
2023 tmemc_freeze_pools(CLI_ID_NULL,freeze);
2025 if ( tmh_lock_all )
2026 spin_unlock(&tmem_spinlock);
2027 else
2028 write_unlock(&tmem_rwlock);
2031 #define MAX_EVICTS 10 /* should be variable or set via TMEMC_ ?? */
2033 EXPORT void *tmem_relinquish_pages(unsigned int order, unsigned int memflags)
2035 pfp_t *pfp;
2036 unsigned long evicts_per_relinq = 0;
2037 int max_evictions = 10;
2039 if (!tmh_enabled())
2040 return NULL;
2041 #ifdef __i386__
2042 return NULL;
2043 #endif
2045 relinq_attempts++;
2046 if ( order > 0 )
2048 printk("tmem_relinquish_page: failing order=%d\n", order);
2049 return NULL;
2052 if ( tmh_called_from_tmem(memflags) )
2054 if ( tmh_lock_all )
2055 spin_lock(&tmem_spinlock);
2056 else
2057 read_lock(&tmem_rwlock);
2060 while ( (pfp = tmh_alloc_page(NULL,1)) == NULL )
2062 if ( (max_evictions-- <= 0) || !tmem_evict())
2063 break;
2064 evicts_per_relinq++;
2066 if ( evicts_per_relinq > max_evicts_per_relinq )
2067 max_evicts_per_relinq = evicts_per_relinq;
2068 tmh_scrub_page(pfp, memflags);
2069 if ( pfp != NULL )
2070 relinq_pgs++;
2072 if ( tmh_called_from_tmem(memflags) )
2074 if ( tmh_lock_all )
2075 spin_unlock(&tmem_spinlock);
2076 else
2077 read_unlock(&tmem_rwlock);
2080 return pfp;
2083 /* called at hypervisor startup */
2084 EXPORT void init_tmem(void)
2086 if ( !tmh_enabled() )
2087 return;
2089 radix_tree_init();
2090 if ( tmh_init() )
2092 printk("tmem: initialized comp=%d global-lock=%d\n",
2093 tmh_compression_enabled(), tmh_lock_all);
2094 tmem_initialized = 1;
2096 else
2097 printk("tmem: initialization FAILED\n");
2100 /*
2101 * Local variables:
2102 * mode: C
2103 * c-set-style: "BSD"
2104 * c-basic-offset: 4
2105 * tab-width: 4
2106 * indent-tabs-mode: nil
2107 * End:
2108 */